# HG changeset patch # User mahtabm # Date 1365660113 14400 # Node ID 1f6dce3d34e06d84b04e117b23f3d067423d8343 Uploaded diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Align/AlignI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Align/AlignI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,824 @@ +# $Id: AlignI.pm,v 1.7 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::Align::AlignI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Align::AlignI - An interface for describing sequence alignments. + +=head1 SYNOPSIS + + # get a Bio::Align::AlignI somehow - typically using Bio::AlignIO system + # some descriptors + print $aln->length, "\n"; + print $aln->no_residues, "\n"; + print $aln->is_flush, "\n"; + print $aln->no_sequences, "\n"; + print $aln->percentage_identity, "\n"; + print $aln->consensus_string(50), "\n"; + + # find the position in the alignment for a sequence location + $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6; + + # extract sequences and check values for the alignment column $pos + foreach $seq ($aln->each_seq) { + $res = $seq->subseq($pos, $pos); + $count{$res}++; + } + foreach $res (keys %count) { + printf "Res: %s Count: %2d\n", $res, $count{$res}; + } + +=head1 DESCRIPTION + +This interface describes the basis for alignment objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Ewan Birney, birney@ebi.ac.uk +Heikki Lehvaslaiho, heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Align::AlignI; +use vars qw(@ISA); +use strict; + +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head1 Modifier methods + +These methods modify the MSE by adding, removing or shuffling complete +sequences. + +=head2 add_seq + + Title : add_seq + Usage : $myalign->add_seq($newseq); + Function : Adds another sequence to the alignment. *Does not* align + it - just adds it to the hashes. + Returns : nothing + Argument : a Bio::LocatableSeq object + order (optional) + +See L for more information. + +=cut + +sub add_seq { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 remove_seq + + Title : remove_seq + Usage : $aln->remove_seq($seq); + Function : Removes a single sequence from an alignment + Returns : + Argument : a Bio::LocatableSeq object + +=cut + +sub remove_seq { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 purge + + Title : purge + Usage : $aln->purge(0.7); + Function: + + Removes sequences above whatever %id. + + This function will grind on large alignments. Beware! + (perhaps not ideally implemented) + + Example : + Returns : An array of the removed sequences + Argument: + + +=cut + +sub purge { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 sort_alphabetically + + Title : sort_alphabetically + Usage : $ali->sort_alphabetically + Function : + + Changes the order of the alignemnt to alphabetical on name + followed by numerical by number. + + Returns : + Argument : + +=cut + +sub sort_alphabetically { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Sequence selection methods + +Methods returning one or more sequences objects. + +=head2 each_seq + + Title : each_seq + Usage : foreach $seq ( $align->each_seq() ) + Function : Gets an array of Seq objects from the alignment + Returns : an array + Argument : + +=cut + +sub each_seq { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 each_alphabetically + + Title : each_alphabetically + Usage : foreach $seq ( $ali->each_alphabetically() ) + Function : + + Returns an array of sequence object sorted alphabetically + by name and then by start point. + Does not change the order of the alignment + + Returns : + Argument : + +=cut + +sub each_alphabetically { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 each_seq_with_id + + Title : each_seq_with_id + Usage : foreach $seq ( $align->each_seq_with_id() ) + Function : + + Gets an array of Seq objects from the + alignment, the contents being those sequences + with the given name (there may be more than one) + + Returns : an array + Argument : a seq name + +=cut + +sub each_seq_with_id { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 get_seq_by_pos + + Title : get_seq_by_pos + Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment + Function : + + Gets a sequence based on its position in the alignment. + Numbering starts from 1. Sequence positions larger than + no_sequences() will thow an error. + + Returns : a Bio::LocatableSeq object + Argument : positive integer for the sequence osition + +=cut + +sub get_seq_by_pos { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Create new alignments + +The result of these methods are horizontal or vertical subsets of the +current MSE. + +=head2 select + + Title : select + Usage : $aln2 = $aln->select(1, 3) # three first sequences + Function : + + Creates a new alignment from a continuous subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::SimpleAlign object + Argument : positive integer for the first sequence + positive integer for the last sequence to include (optional) + +=cut + +sub select { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 select_noncont + + Title : select_noncont + Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences + Function : + + Creates a new alignment from a subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::SimpleAlign object + Args : array of integers for the sequences + +=cut + +sub select_noncont { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 slice + + Title : slice + Usage : $aln2 = $aln->slice(20, 30) + Function : + + Creates a slice from the alignment inclusive of start and + end columns. Sequences with no residues in the slice are + excluded from the new alignment and a warning is printed. + Slice beyond the length of the sequence does not do + padding. + + Returns : a Bio::SimpleAlign object + Argument : positive integer for start column + positive integer for end column + +=cut + +sub slice { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Change sequences within the MSE + +These methods affect characters in all sequences without changeing the +alignment. + + +=head2 map_chars + + Title : map_chars + Usage : $ali->map_chars('\.','-') + Function : + + Does a s/$arg1/$arg2/ on the sequences. Useful for gap + characters + + Notice that the from (arg1) is interpretted as a regex, + so be careful about quoting meta characters (eg + $ali->map_chars('.','-') wont do what you want) + + Returns : + Argument : 'from' rexexp + 'to' string + +=cut + +sub map_chars { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 uppercase + + Title : uppercase() + Usage : $ali->uppercase() + Function : Sets all the sequences to uppercase + Returns : + Argument : + +=cut + +sub uppercase { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 match_line + + Title : match_line() + Usage : $align->match_line() + Function : Generates a match line - much like consensus string + except that a line indicating the '*' for a match. + Argument : (optional) Match line characters ('*' by default) + (optional) Strong match char (':' by default) + (optional) Weak match char ('.' by default) + +=cut + +sub match_line { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 match + + Title : match() + Usage : $ali->match() + Function : + + Goes through all columns and changes residues that are + identical to residue in first sequence to match '.' + character. Sets match_char. + + USE WITH CARE: Most MSE formats do not support match + characters in sequences, so this is mostly for output + only. NEXUS format (Bio::AlignIO::nexus) can handle + it. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +=cut + +sub match { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 unmatch + + Title : unmatch() + Usage : $ali->unmatch() + Function : + + Undoes the effect of method match. Unsets match_char. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +=cut + +sub unmatch { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head1 MSE attibutes + +Methods for setting and reading the MSE attributes. + +Note that the methods defining character semantics depend on the user +to set them sensibly. They are needed only by certain input/output +methods. Unset them by setting to an empty string (''). + +=head2 id + + Title : id + Usage : $myalign->id("Ig") + Function : Gets/sets the id field of the alignment + Returns : An id string + Argument : An id string (optional) + +=cut + +sub id { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 missing_char + + Title : missing_char + Usage : $myalign->missing_char("?") + Function : Gets/sets the missing_char attribute of the alignment + It is generally recommended to set it to 'n' or 'N' + for nucleotides and to 'X' for protein. + Returns : An missing_char string, + Argument : An missing_char string (optional) + +=cut + +sub missing_char { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 match_char + + Title : match_char + Usage : $myalign->match_char('.') + Function : Gets/sets the match_char attribute of the alignment + Returns : An match_char string, + Argument : An match_char string (optional) + +=cut + +sub match_char { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 gap_char + + Title : gap_char + Usage : $myalign->gap_char('-') + Function : Gets/sets the gap_char attribute of the alignment + Returns : An gap_char string, defaults to '-' + Argument : An gap_char string (optional) + +=cut + +sub gap_char { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 symbol_chars + + Title : symbol_chars + Usage : my @symbolchars = $aln->symbol_chars; + Function: Returns all the seen symbols (other than gaps) + Returns : array of characters that are the seen symbols + Argument: boolean to include the gap/missing/match characters + +=cut + +sub symbol_chars{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Alignment descriptors + +These read only methods describe the MSE in various ways. + + +=head2 consensus_string + + Title : consensus_string + Usage : $str = $ali->consensus_string($threshold_percent) + Function : Makes a strict consensus + Returns : + Argument : Optional treshold ranging from 0 to 100. + The consensus residue has to appear at least threshold % + of the sequences at a given location, otherwise a '?' + character will be placed at that location. + (Default value = 0%) + +=cut + +sub consensus_string { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 consensus_iupac + + Title : consensus_iupac + Usage : $str = $ali->consensus_iupac() + Function : + + Makes a consensus using IUPAC ambiguity codes from DNA + and RNA. The output is in upper case except when gaps in + a column force output to be in lower case. + + Note that if your alignment sequences contain a lot of + IUPAC ambiquity codes you often have to manually set + alphabet. Bio::PrimarySeq::_guess_type thinks they + indicate a protein sequence. + + Returns : consensus string + Argument : none + Throws : on protein sequences + + +=cut + +sub consensus_iupac { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 is_flush + + Title : is_flush + Usage : if( $ali->is_flush() ) + : + : + Function : Tells you whether the alignment + : is flush, ie all of the same length + : + : + Returns : 1 or 0 + Argument : + +=cut + +sub is_flush { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 length + + Title : length() + Usage : $len = $ali->length() + Function : Returns the maximum length of the alignment. + To be sure the alignment is a block, use is_flush + Returns : + Argument : + +=cut + +sub length { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 maxdisplayname_length + + Title : maxdisplayname_length + Usage : $ali->maxdisplayname_length() + Function : + + Gets the maximum length of the displayname in the + alignment. Used in writing out various MSE formats. + + Returns : integer + Argument : + +=cut + +sub maxname_length { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 no_residues + + Title : no_residues + Usage : $no = $ali->no_residues + Function : number of residues in total in the alignment + Returns : integer + Argument : + +=cut + +sub no_residues { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 no_sequences + + Title : no_sequences + Usage : $depth = $ali->no_sequences + Function : number of sequence in the sequence alignment + Returns : integer + Argument : None + +=cut + +sub no_sequences { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 percentage_identity + + Title : percentage_identity + Usage : $id = $align->percentage_identity + Function: The function calculates the percentage identity of the alignment + Returns : The percentage identity of the alignment (as defined by the + implementation) + Argument: None + +=cut + +sub percentage_identity{ + my ($self) = @_; + $self->throw_not_implemeneted(); +} + +=head2 overall_percentage_identity + + Title : percentage_identity + Usage : $id = $align->percentage_identity + Function: The function calculates the percentage identity of + the conserved columns + Returns : The percentage identity of the conserved columns + Args : None + +=cut + +sub overall_percentage_identity{ + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 average_percentage_identity + + Title : average_percentage_identity + Usage : $id = $align->average_percentage_identity + Function: The function uses a fast method to calculate the average + percentage identity of the alignment + Returns : The average percentage identity of the alignment + Args : None + +=cut + +sub average_percentage_identity{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Alignment positions + +Methods to map a sequence position into an alignment column and back. +column_from_residue_number() does the former. The latter is really a +property of the sequence object and can done using +L: + + # select somehow a sequence from the alignment, e.g. + my $seq = $aln->get_seq_by_pos(1); + #$loc is undef or Bio::LocationI object + my $loc = $seq->location_from_column(5); + + +=head2 column_from_residue_number + + Title : column_from_residue_number + Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber) + Function: + + This function gives the position in the alignment + (i.e. column number) of the given residue number in the + sequence with the given name. For example, for the + alignment + + Seq1/91-97 AC..DEF.GH + Seq2/24-30 ACGG.RTY.. + Seq3/43-51 AC.DDEFGHI + + column_from_residue_number( "Seq1", 94 ) returns 5. + column_from_residue_number( "Seq2", 25 ) returns 2. + column_from_residue_number( "Seq3", 50 ) returns 9. + + An exception is thrown if the residue number would lie + outside the length of the aligment + (e.g. column_from_residue_number( "Seq2", 22 ) + + Note: If the the parent sequence is represented by more than + one alignment sequence and the residue number is present in + them, this method finds only the first one. + + Returns : A column number for the position in the alignment of the + given residue in the given sequence (1 = first column) + Args : A sequence id/name (not a name/start-end) + A residue number in the whole sequence (not just that + segment of it in the alignment) + +=cut + +sub column_from_residue_number { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Sequence names + +Methods to manipulate the display name. The default name based on the +sequence id and subsequence positions can be overridden in various +ways. + +=head2 displayname + + Title : displayname + Usage : $myalign->displayname("Ig", "IgA") + Function : Gets/sets the display name of a sequence in the alignment + : + Returns : A display name string + Argument : name of the sequence + displayname of the sequence (optional) + +=cut + +sub displayname { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 set_displayname_count + + Title : set_displayname_count + Usage : $ali->set_displayname_count + Function : + + Sets the names to be name_# where # is the number of + times this name has been used. + + Returns : None + Argument : None + +=cut + +sub set_displayname_count { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 set_displayname_flat + + Title : set_displayname_flat + Usage : $ali->set_displayname_flat() + Function : Makes all the sequences be displayed as just their name, + not name/start-end + Returns : 1 + Argument : None + +=cut + +sub set_displayname_flat { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 set_displayname_normal + + Title : set_displayname_normal + Usage : $ali->set_displayname_normal() + Function : Makes all the sequences be displayed as name/start-end + Returns : None + Argument : None + +=cut + +sub set_displayname_normal { + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Align/DNAStatistics.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Align/DNAStatistics.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,683 @@ +# $Id: DNAStatistics.pm,v 1.4 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::Align::DNAStatistics +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment + +=head1 SYNOPSIS + + use Bio::Align::DNAStatistics; + use Bio::AlignIO; + + my $stats = new Bio::Align::PairwiseStatistics; + my $alignin = new Bio::AlignIO(-format => 'emboss', + -file => 't/data/insulin.water'); + my $jc = $stats->distance($aln, 'Jukes-Cantor'); + foreach my $r ( @$jc ) { + print "\t"; + foreach my $r ( @$d ) { + print "$r\t"; + } + print "\n"; + } + +=head1 DESCRIPTION + +This object contains routines for calculating various statistics and +distances for DNA alignments. The routines are not well tested and do +contain errors at this point. Work is underway to correct them, but +do not expect this code to give you the right answer currently! Use +dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the +distances. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Align::DNAStatistics; +use vars qw(@ISA %DNAChanges @Nucleotides %NucleotideIndexes + $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods); +use strict; +use Bio::Align::PairwiseStatistics; +use Bio::Root::Root; + +BEGIN { + $GapChars = '(\.|\-)'; + @Nucleotides = qw(A G T C); + $SeqCount = 2; + # these values come from EMBOSS distmat implementation + %NucleotideIndexes = ( 'A' => 0, + 'T' => 1, + 'C' => 2, + 'G' => 3, + + 'AT' => 0, + 'AC' => 1, + 'AG' => 2, + 'CT' => 3, + 'GT' => 4, + 'CG' => 5, + +# these are wrong now +# 'S' => [ 1, 3], +# 'W' => [ 0, 4], +# 'Y' => [ 2, 3], +# 'R' => [ 0, 1], +# 'M' => [ 0, 3], +# 'K' => [ 1, 2], +# 'B' => [ 1, 2, 3], +# 'H' => [ 0, 2, 3], +# 'V' => [ 0, 1, 3], +# 'D' => [ 0, 1, 2], + ); + + $DefaultGapPenalty = 0; + # could put ambiguities here? + %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'], + 'T' => [ 'A', 'G'], + 'C' => [ 'A', 'G'], + 'G' => [ 'C', 'T'], + }, + 'Transitions' => { 'A' => [ 'G' ], + 'G' => [ 'A' ], + 'C' => [ 'T' ], + 'T' => [ 'C' ], + }, + ); + %DistanceMethods = ( 'jc|jukes|jukes-cantor' => 'JukesCantor', + 'f81' => 'F81', + 'k2|k2p|k80|kimura' => 'Kimura', + 't92|tamura|tamura92' => 'Tamura', + 'f84' => 'F84', + 'tajimanei|tajima-nei' => 'TajimaNei' ); +} + +@ISA = qw( Bio::Root::Root Bio::Align::StatisticsI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Align::DNAStatistics(); + Function: Builds a new Bio::Align::DNAStatistics object + Returns : Bio::Align::DNAStatistics + Args : none + + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + $self->pairwise_stats( new Bio::Align::PairwiseStatistics()); + + return $self; +} + + +=head2 distance + + Title : distance + Usage : my $distance_mat = $stats->distance(-align => $aln, + -method => $method); + Function: Calculates a distance matrix for all pairwise distances of + sequences in an alignment. + Returns : Array ref + Args : -align => Bio::Align::AlignI object + -method => String specifying specific distance method + (implementing class may assume a default) + +=cut + +sub distance{ + my ($self,@args) = @_; + my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args); + if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) { + $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance"); + } + $method ||= 'JukesCantor'; + foreach my $m ( keys %DistanceMethods ) { + if(defined $m && $method =~ /$m/i ) { + my $mtd = "D_$DistanceMethods{$m}"; + return $self->$mtd($aln); + } + } + $self->warn("Unrecognized distance method $method must be one of [". + join(',',$self->available_distance_methods())."]"); + return undef; +} + +=head2 available_distance_methods + + Title : available_distance_methods + Usage : my @methods = $stats->available_distance_methods(); + Function: Enumerates the possible distance methods + Returns : Array of strings + Args : none + + +=cut + +sub available_distance_methods{ + my ($self,@args) = @_; + return values %DistanceMethods; +} + +=head2 D - distance methods + +=cut + +=head2 D_JukesCantor + + Title : D_JukesCantor + Usage : my $d = $stat->D_JukesCantor($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Jukes-Cantor 1 parameter model. + Returns : ArrayRef of all pairwise distances of all sequence pairs in the alignment + Args : Bio::Align::AlignI of DNA sequences + double - gap penalty + + +=cut + +sub D_JukesCantor{ + my ($self,$aln,$gappenalty) = @_; + return 0 unless $self->_check_arg($aln); + $gappenalty = $DefaultGapPenalty unless defined $gappenalty; + # ambiguities ignored at this point + + my (@seqs); + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,uc $seq->seq())]; + } + my $seqct = scalar @seqs; + my @DVals; + for(my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1], + $seqs[$j-1]); + # just want diagonals + my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] + + $matrix->[2]->[2] + $matrix->[3]->[3] ); + my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty))); + my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3)); + $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d; + } + } + return \@DVals; +} + +=head2 D_F81 + + Title : D_F81 + Usage : my $d = $stat->D_F81($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Felsenstein 1981 distance model. + Returns : ArrayRef of a 2d array of all pairwise distances in the alignment + Args : Bio::Align::AlignI of DNA sequences + + +=cut + +sub D_F81{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + $self->throw("This isn't implemented yet - sorry"); +} + + +# M Kimura, J. Mol. Evol., 1980, 16, 111. + +=head2 D_Kimura + + Title : D_Kimura + Usage : my $d = $stat->D_Kimura($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Kimura 2 parameter model. + Returns : ArrayRef of pairwise distances between all sequences in alignment + Args : Bio::Align::AlignI of DNA sequences + + +=cut + +sub D_Kimura{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + my $seqct = $aln->no_sequences; + my @KVals; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my $pairwise = $aln->select_noncont($i,$j); + my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); + my $P = $self->transitions($pairwise) / $L; + my $Q = $self->transversions($pairwise) / $L; + + my $a = 1 / ( 1 - (2 * $P) - $Q); + my $b = 1 / ( 1 - 2 * $Q ); + my $K = (1/2) * log ( $a ) + (1/4) * log($b); + $KVals[$i]->[$j] = $K; + $KVals[$j]->[$i] = $K; + } + } + return \@KVals; +} + +# K Tamura, Mol. Biol. Evol. 1992, 9, 678. + +=head2 D_Tamura + + Title : D_Tamura + Usage : + Function: + Returns : + Args : + + +=cut + +sub D_Tamura{ + my ($self,$aln) = @_; + my $seqct = $aln->no_sequences; + my @KVals; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + } + } + my $O = 0.25; + my $t = 0; + my $a = 0; + my $b = 0; + + + my $d = 4 * $O * ( 1 - $O ) * $a * $t + 2 * $b * $t; + return $d; +} + +=head2 D_F84 + + Title : D_F84 + Usage : my $d = $stat->D_F84($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Felsenstein 1984 distance model. + Returns : Distance value + Args : Bio::Align::AlignI of DNA sequences + double - gap penalty + +=cut + +sub D_F84{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); +} + +# Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269. + +=head2 D_TajimaNei + + Title : D_TajimaNei + Usage : my $d = $stat->D_TajimaNei($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the TajimaNei 1984 distance model. + Returns : Distance value + Args : Bio::Align::AlignI of DNA sequences + + +=cut + +sub D_TajimaNei{ + my ($self,$aln) = @_; + $self->warn("The result from this method is not correct right now"); + my (@seqs); + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,uc $seq->seq())]; + } + my $seqct = scalar @seqs; + my @DVals; + for(my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1], + $seqs[$j-1]); + my $fij2; + my $slen = $aln->length - $gaps; + for( my $bs = 0; $bs < 4; $bs++ ) { + my $fi = 0; + map {$fi += $matrix->[$bs]->[$_] } 0..3; + my $fj = 0; + map { $fj += $matrix->[$_]->[$bs] } 0..3; + my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0; + $fij2 += $fij**2; + } + my ($pair,$h) = (0,0); + for( my $bs = 0; $bs < 3; $bs++ ) { + for( my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) { + my $fij = $pfreq->[$pair++] / $slen; + if( $fij ) { + + my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0); + + map { $ci1 += $matrix->[$_]->[$bs] } 0..3; + map { $cj1 += $matrix->[$bs]->[$_] } 0..3; + map { $ci2 += $matrix->[$_]->[$bs1] } 0..3; + map { $cj2 += $matrix->[$bs1]->[$_] } 0..3; + + $h += ( $fij*$fij / 2 ) / + ( ( ( $ci1 + $cj1 ) / 2 * $slen ) * + ( ( $ci2 + $cj2 ) /2 * $slen ) + ); + $self->debug( "h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n"); + } + } + } + # just want diagonals first + + my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] + + $matrix->[2]->[2] + $matrix->[3]->[3] ); + my $D = 1 - ( $m / $slen); + + my $b = (1-$fij2+(($D**2)/$h)) / 2; + $self->debug("h is $h fij2 is $fij2 b is $b\n"); + + my $d = (-1 * $b) * log ( 1 - $D/ $b); + $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d; + } + } + return \@DVals; + + +} + +# HKY -- HASEGAWA, M., H. KISHINO, and T. YANO. 1985 +# Tamura and Nei 1993? +# GTR? + +=head2 K - sequence substitution methods + +=cut + +=head2 K_JukesCantor + + Title : K_JukesCantor + Usage : my $k = $stats->K_JukesCantor($aln) + Function: Calculates K - the number of nucleotide substitutions between + 2 seqs - according to the Jukes-Cantor 1 parameter model + This only involves the number of changes between two sequences. + Returns : double + Args : Bio::Align::AlignI + + +=cut + +sub K_JukesCantor{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + my $seqct = $aln->no_sequences; + my @KVals; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my $pairwise = $aln->select_noncont($i,$j); + my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); + my $N = $self->pairwise_stats->number_of_differences($pairwise); + my $p = $N / $L; + my $K = - ( 3 / 4) * log ( 1 - (( 4 * $p) / 3 )); + $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K; + } + } + return \@KVals; +} + +=head2 K_TajimaNei + + Title : K_TajimaNei + Usage : my $k = $stats->K_TajimaNei($aln) + Function: Calculates K - the number of nucleotide substitutions between + 2 seqs - according to the Kimura 2 parameter model. + This does not assume equal frequencies among all the nucleotides. + Returns : ArrayRef of 2d matrix which contains pairwise K values for + all sequences in the alignment + Args : Bio::Align::AlignI + +=cut + +sub K_TajimaNei { + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + + my @seqs; + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,uc $seq->seq())]; + } + my @KVals; + my $L = $self->pairwise_stats->number_of_comparable_bases($aln); + my $seqct = scalar @seqs; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my (%q,%y); + my ($first,$second) = ($seqs[$i-1],$seqs[$j-1]); + + for (my $k = 0;$k<$aln->length; $k++ ) { + next if( $first->[$k] =~ /^$GapChars$/ || + $second->[$k] =~ /^$GapChars$/); + + $q{$second->[$k]}++; + $q{$first->[$k]}++; + if( $first->[$k] ne $second->[$k] ) { + $y{$first->[$k]}->{$second->[$k]}++; + } + } + + my $q_sum = 0; + foreach my $let ( @Nucleotides ) { + # ct is the number of sequences compared (2) + # L is the length of the alignment without gaps + # $ct * $L = total number of nt compared + my $avg = $q{$let} / ( $SeqCount * $L ); + $q_sum += $avg**2; + } + my $b1 = 1 - $q_sum; + my $h = 0; + for( my $i = 0; $i <= 2; $i++ ) { + for( my $j = $i+1; $j <= 3; $j++) { + $y{$Nucleotides[$i]}->{$Nucleotides[$j]} ||= 0; + $y{$Nucleotides[$j]}->{$Nucleotides[$i]} ||= 0; + my $x = ($y{$Nucleotides[$i]}->{$Nucleotides[$j]} + + $y{$Nucleotides[$j]}->{$Nucleotides[$i]}) / $L; + $h += ($x ** 2) / ( 2 * $q{$Nucleotides[$i]} * + $q{$Nucleotides[$j]} ); + } + } + my $N = $self->pairwise_stats->number_of_differences($aln); + my $p = $N / $L; + my $b = ( $b1 + $p ** 2 / $h ) / 2; + my $K = - $b * log ( 1 - $p / $b ); + $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K; + } + } + return \@KVals; +} + + + +=head2 transversions + + Title : transversions + Usage : my $transversions = $stats->transversion($aln); + Function: Calculates the number of transversions between two sequences in + an alignment + Returns : integer + Args : Bio::Align::AlignI + + +=cut + +sub transversions{ + my ($self,$aln) = @_; + return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'}); +} + +=head2 transitions + + Title : transitions + Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln); + Function: Calculates the number of transitions in a given DNA alignment + Returns : integer representing the number of transitions + Args : Bio::Align::AlignI object + + +=cut + +sub transitions{ + my ($self,$aln) = @_; + return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'}); +} + + +sub _trans_count_helper { + my ($self,$aln,$type) = @_; + return 0 unless( $self->_check_arg($aln) ); + if( ! $aln->is_flush ) { $self->throw("must be flush") } + my (@seqs,@tcount); + foreach my $seq ( $aln->get_seq_by_pos(1), $aln->get_seq_by_pos(2) ) { + push @seqs, [ split(//,$seq->seq())]; + } + my ($first,$second) = @seqs; + + for (my $i = 0;$i<$aln->length; $i++ ) { + next if( $first->[$i] =~ /^$GapChars$/ || + $second->[$i] =~ /^$GapChars$/); + if( $first->[$i] ne $second->[$i] ) { + foreach my $nt ( @{$type->{$first->[$i]}} ) { + if( $nt eq $second->[$i]) { + $tcount[$i]++; + } + } + } + } + my $sum = 0; + map { if( $_) { $sum += $_} } @tcount; + return $sum; +} + +# this will generate a matrix which records across the row, the number +# of DNA subst +# +sub _build_nt_matrix { + my ($self,$seqa,$seqb) = @_; + + + my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match + [ qw(0 0 0 0) ], + [ qw(0 0 0 0) ], + [ qw(0 0 0 0) ] ]; + my $gaps = 0; # number of gaps + my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency + + for( my $i = 0; $i < scalar @$seqa; $i++) { + + my ($ti,$tj) = ($seqa->[$i],$seqb->[$i]); + $ti =~ tr/U/T/; + $tj =~ tr/U/T/; + + if( $ti =~ /^$GapChars$/) { $gaps++; next; } + if( $tj =~ /^$GapChars$/) { $gaps++; next } + + my $ti_index = $NucleotideIndexes{$ti}; + my $tj_index = $NucleotideIndexes{$tj}; + + if( ! defined $ti_index ) { + print "ti_index not defined for $ti\n"; + next; + } + + $basect_matrix->[$ti_index]->[$tj_index]++; + + if( $ti ne $tj ) { + $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++; + } + } + return ($basect_matrix,$pfreq,$gaps); +} + +sub _check_arg { + my($self,$aln ) = @_; + if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics"); + return 0; + } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) { + $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet); + return 0; + } + return 1; +} + +=head2 Data Methods + +=cut + +=head2 pairwise_stats + + Title : pairwise_stats + Usage : $obj->pairwise_stats($newval) + Function: + Returns : value of pairwise_stats + Args : newvalue (optional) + + +=cut + +sub pairwise_stats{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_pairwise_stats'} = $value; + } + return $self->{'_pairwise_stats'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Align/PairwiseStatistics.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Align/PairwiseStatistics.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,175 @@ +# $Id: PairwiseStatistics.pm,v 1.2 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::Align::PairwiseStatistics +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Align::PairwiseStatistics - Base statistic object for Pairwise Alignments + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Align::PairwiseStatistics; +use vars qw(@ISA $GapChars); +use strict; + +use Bio::Align::StatisticsI; +use Bio::Root::Root; + +BEGIN { $GapChars = '(\.|\-)'; } + +@ISA = qw(Bio::Root::Root Bio::Align::StatisticsI ); + +=head2 number_of_comparable_bases + + Title : number_of_comparable_bases + Usage : my $bases = $stat->number_of_comparable_bases($aln); + Function: Returns the count of the number of bases that can be + compared (L) in this alignment ( length - gaps) + Returns : integer + Args : Bio::Align::AlignI + + +=cut + +sub number_of_comparable_bases{ + my ($self,$aln) = @_; + if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::PairwiseStatistics"); + return 0; + } elsif( $aln->no_sequences != 2 ) { + $self->warn("only pairwise calculations currently supported"); + } + my $L = $aln->length - $self->number_of_gaps($aln); + return $L; +} + +=head2 number_of_differences + + Title : number_of_differences + Usage : my $nd = $stat->number_of_distances($aln); + Function: Returns the number of differences between two + Returns : integer + Args : Bio::Align::AlignI + + +=cut + +sub number_of_differences{ + my ($self,$aln) = @_; + if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::PairwiseStatistics"); + return 0; + } elsif( $aln->no_sequences != 2 ) { + $self->warn("only pairwise calculations currently supported"); + } + my (@seqs); + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,$seq->seq())]; + } + my $firstseq = shift @seqs; +# my $secondseq = shift @seqs; + my $diffcount = 0; + for (my $i = 0;$i<$aln->length; $i++ ) { + next if( $firstseq->[$i] =~ /^$GapChars$/); + foreach my $seq ( @seqs ) { + next if( $seq->[$i] =~ /^$GapChars$/); + if( $firstseq->[$i] ne $seq->[$i] ) { + $diffcount++; + } + } + } + return $diffcount; +} + +=head2 number_of_gaps + + Title : number_of_gaps + Usage : my $nd = $stat->number_of_gaps($aln); + Function: Returns the number of differences between two + Example : + Returns : + Args : + + +=cut + +sub number_of_gaps{ + my ($self,$aln) = @_; + if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::PairwiseStatistics"); + return 0; + } elsif( $aln->no_sequences != 2 ) { + $self->warn("only pairwise calculations currently supported"); + } + my (@seqs); + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,$seq->seq())]; + } + my $firstseq = shift @seqs; +# my $secondseq = shift @seqs; + my $gapcount = 0; + for (my $i = 0;$i<$aln->length; $i++ ) { + ($gapcount++) && next if( $firstseq->[$i] =~ /^$GapChars$/); + foreach my $seq ( @seqs ) { + ($gapcount++) && next if( $seq->[$i] =~ /^$GapChars$/); + } + } + return $gapcount; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Align/StatisticsI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Align/StatisticsI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,109 @@ +# $Id: StatisticsI.pm,v 1.4 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::Align::StatisticsI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Align::StatisticsI - Calculate some statistics for an alignment + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the interface here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Align::StatisticsI; +use strict; +use vars qw(@ISA); + +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head2 distance + + Title : distance + Usage : my $distance_mat = $stats->distance(-align => $aln, + -method => $method); + Function: Calculates a distance matrix for all pairwise distances of + sequences in an alignment. + Returns : Array ref + Args : -align => Bio::Align::AlignI object + -method => String specifying specific distance method + (implementing class may assume a default) + +=cut + +sub distance{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 available_distance_methods + + Title : available_distance_methods + Usage : my @methods = $stats->available_distance_methods(); + Function: Enumerates the possible distance methods + Returns : Array of strings + Args : none + + +=cut + +sub available_distance_methods{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Align/Utilities.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Align/Utilities.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,161 @@ +# $Id: Utilities.pm,v 1.8 2002/11/11 18:39:19 jason Exp $ +# +# BioPerl module for Bio::Align::Utilities +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Align::Utilities - A collection of utilities regarding converting and manipulating alignment objects + +=head1 SYNOPSIS + +use Bio::Align::Utilities qw(aa_to_dna_aln); + +my $dna_aln = aa_to_dna_aln($aaaln,\%dnaseqs); + + +=head1 DESCRIPTION + +This module contains utility methods for manipulating sequence +alignments ( L) objects. + +The B utility is essentially the same as the B +program by Bill Pearson available at +ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this +is a pure-perl implementation, but just to mention that if anything +seems odd you can check the alignments generated against Bill's +program. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +#' keep my emacs happy +# Let the code begin... + + +package Bio::Align::Utilities; +use vars qw(@ISA @EXPORT @EXPORT_OK); +use strict; +use Carp; +require Exporter; + +@ISA = qw(Exporter); + +@EXPORT = qw(); +@EXPORT_OK = qw(aa_to_dna_aln); + +use constant CODONSIZE => 3; + +=head2 aa_to_dna_aln + + Title : aa_to_dna_aln + Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs); + Function: Will convert an AA alignment to DNA space given the + corresponding DNA sequences. Note that this method expects + the DNA sequences to be in frame +1 (GFF frame 0) as it will + start to project into coordinates starting at the first base of + the DNA sequence, if this alignment represents a different + frame for the cDNA you will need to edit the DNA sequences + to remove the 1st or 2nd bases (and revcom if things should be). + Returns : Bio::Align::AlignI object + Args : 2 arguments, the alignment and a hashref. + Alignment is a Bio::Align::AlignI of amino acid sequences. + The hash reference should have keys which are + the display_ids for the aa + sequences in the alignment and the values are a + Bio::PrimarySeqI object for the corresponding + spliced cDNA sequence. + +See also: L, L, L + +=cut + +sub aa_to_dna_aln { + my ($aln,$dnaseqs) = @_; + unless( defined $aln && + ref($aln) && + $aln->isa('Bio::Align::AlignI') ) { + croak('Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'); + } + my $alnlen = $aln->length; + #print "HSP length is $alnlen\n"; + my $dnaalign = new Bio::SimpleAlign; + foreach my $seq ( $aln->each_seq ) { + my $newseq; + my $dnaseq = $dnaseqs->{$seq->display_id} || croak("cannot find ". + $seq->display_id); + foreach my $pos ( 1..$alnlen ) { + my $loc = $seq->location_from_column($pos); + my $dna = ''; + if( !defined $loc || $loc->location_type ne 'EXACT' ) { + $dna = '---'; + } else { + # To readjust to codon boundaries + # end needs to be +1 so we can just multiply by CODONSIZE + # to get this + + my ($start,$end) = ((($loc->start - 1)* CODONSIZE) +1, + ($loc->end)* CODONSIZE); + + if( $start <=0 || $end > $dnaseq->length() ) { + print STDERR "start is ", $loc->start, " end is ", $loc->end, " while dnaseq length is ", $dnaseq->length(), " and start/end projected are $start,$end \n"; + warn("codons don't seem to be matching up for $start,$end"); + $dna = '---'; + } else { + $dna = $dnaseq->subseq($start,$end); + } + } + $newseq .= $dna; + } + # funky looking math is to readjust to codon boundaries and deal + # with fact that sequence start with 1 + my $newdna = new Bio::LocatableSeq(-display_id => $seq->id(), + -start => (($seq->start - 1) * + CODONSIZE) + 1, + -end => ($seq->end * CODONSIZE), + -strand => $seq->strand, + -seq => $newseq); + $dnaalign->add_seq($newdna); + } + return $dnaalign; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,505 @@ +# $Id: AlignIO.pm,v 1.28 2002/10/22 07:38:23 lapp Exp $ +# +# BioPerl module for Bio::AlignIO +# +# based on the Bio::SeqIO module +# by Ewan Birney +# and Lincoln Stein +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# +# _history +# October 18, 1999 SeqIO largely rewritten by Lincoln Stein +# September, 2000 AlignIO written by Peter Schattner + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO - Handler for AlignIO Formats + +=head1 SYNOPSIS + + use Bio::AlignIO; + + $inputfilename = "testaln.fasta"; + $in = Bio::AlignIO->new(-file => $inputfilename , + '-format' => 'fasta'); + $out = Bio::AlignIO->new(-file => ">out.aln.pfam" , + '-format' => 'pfam'); + # note: we quote -format to keep older perl's from complaining. + + while ( my $aln = $in->next_aln() ) { + $out->write_aln($aln); + } + +or + + use Bio::AlignIO; + + $inputfilename = "testaln.fasta"; + $in = Bio::AlignIO->newFh(-file => $inputfilename , + '-format' => 'fasta'); + $out = Bio::AlignIO->newFh('-format' => 'pfam'); + + # World's shortest Fasta<->pfam format converter: + print $out $_ while <$in>; + +=head1 DESCRIPTION + +Bio::AlignIO is a handler module for the formats in the AlignIO set +(eg, Bio::AlignIO::fasta). It is the officially sanctioned way of +getting at the alignment objects, which most people should use. The +resulting alignment is a Bio::Align::AlignI compliant object. See +L for more information. + +The idea is that you request a stream object for a particular format. +All the stream objects have a notion of an internal file that is read +from or written to. A particular AlignIO object instance is configured +for either input or output. A specific example of a stream object is +the Bio::AlignIO::fasta object. + +Each stream object has functions + + $stream->next_aln(); + +and + + $stream->write_aln($aln); + +also + + $stream->type() # returns 'INPUT' or 'OUTPUT' + +As an added bonus, you can recover a filehandle that is tied to the +AlignIO object, allowing you to use the standard EE and print +operations to read and write sequence objects: + + use Bio::AlignIO; + + # read from standard input + $stream = Bio::AlignIO->newFh(-format => 'Fasta'); + + while ( $aln = <$stream> ) { + # do something with $aln + } + +and + + print $stream $aln; # when stream is in output mode + +This makes the simplest ever reformatter + + #!/usr/local/bin/perl + + $format1 = shift; + $format2 = shift || + die "Usage: reformat format1 format2 < input > output"; + + use Bio::AlignIO; + + $in = Bio::AlignIO->newFh(-format => $format1 ); + $out = Bio::AlignIO->newFh(-format => $format2 ); + # note: you might want to quote -format to keep + # older perl's from complaining. + + print $out $_ while <$in>; + +AlignIO.pm is patterned on the module SeqIO.pm and shares most the +SeqIO.pm features. One significant difference currently is that +AlignIO.pm usually handles IO for only a single alignment at a time +(SeqIO.pm handles IO for multiple sequences in a single stream.) The +principal reason for this is that whereas simultaneously handling +multiple sequences is a common requirement, simultaneous handling of +multiple alignments is not. The only current exception is format +"bl2seq" which parses results of the Blast bl2seq program and which +may produce several alignment pairs. This set of alignment pairs can +be read using multiple calls to next_aln. + +Capability for IO for more than one multiple alignment - other than +for bl2seq format -(which may be of use for certain applications such +as IO for Pfam libraries) may be included in the future. For this +reason we keep the name "next_aln()" for the alignment input routine, +even though in most cases only one alignment is read (or written) at a +time and the name "read_aln()" might be more appropriate. + +=head1 CONSTRUCTORS + +=head2 Bio::AlignIO-Enew() + + $seqIO = Bio::AlignIO->new(-file => 'filename', -format=>$format); + $seqIO = Bio::AlignIO->new(-fh => \*FILEHANDLE, -format=>$format); + $seqIO = Bio::AlignIO->new(-format => $format); + +The new() class method constructs a new Bio::AlignIO object. The +returned object can be used to retrieve or print BioAlign +objects. new() accepts the following parameters: + +=over 4 + +=item -file + +A file path to be opened for reading or writing. The usual Perl +conventions apply: + + 'file' # open file for reading + '>file' # open file for writing + '>>file' # open file for appending + '+new(-fh => \*STDIN); + +Note that you must pass filehandles as references to globs. + +If neither a filehandle nor a filename is specified, then the module +will read from the @ARGV array or STDIN, using the familiar EE +semantics. + +=item -format + +Specify the format of the file. Supported formats include: + + fasta FASTA format + selex selex (hmmer) format + stockholm stockholm format + prodom prodom (protein domain) format + clustalw clustalw (.aln) format + msf msf (GCG) format + mase mase (seaview) format + bl2seq Bl2seq Blast output + nexus Swofford et al NEXUS format + pfam Pfam sequence alignment format + phylip Felsenstein's PHYLIP format + emboss EMBOSS water and needle format + mega MEGA format + meme MEME format + psi PSI-BLAST format + +Currently only those formats which were implemented in L +have been incorporated in AlignIO.pm. Specifically, mase, stockholm +and prodom have only been implemented for input. See the specific module +(e.g. L) for notes on supported versions. + +If no format is specified and a filename is given, then the module +will attempt to deduce it from the filename suffix. If this is unsuccessful, +Fasta format is assumed. + +The format name is case insensitive. 'FASTA', 'Fasta' and 'fasta' are +all supported. + +=back + +=head2 Bio::AlignIO-EnewFh() + + $fh = Bio::AlignIO->newFh(-fh => \*FILEHANDLE, -format=>$format); + $fh = Bio::AlignIO->newFh(-format => $format); + # etc. + +This constructor behaves like new(), but returns a tied filehandle +rather than a Bio::AlignIO object. You can read sequences from this +object using the familiar EE operator, and write to it using print(). +The usual array and $_ semantics work. For example, you can read all +sequence objects into an array like this: + + @sequences = <$fh>; + +Other operations, such as read(), sysread(), write(), close(), and printf() +are not supported. + +=over 1 + +=item -flush + +By default, all files (or filehandles) opened for writing alignments +will be flushed after each write_aln() (making the file immediately +usable). If you don't need this facility and would like to marginally +improve the efficiency of writing multiple sequences to the same file +(or filehandle), pass the -flush option '0' or any other value that +evaluates as defined but false: + + my $clustal = new Bio::AlignIO -file => " "clustalw"; + my $msf = new Bio::AlignIO -file => ">prot.msf", + -format => "msf", + -flush => 0; # go as fast as we can! + while($seq = $clustal->next_aln) { $msf->write_aln($seq) } + +=back + +=head1 OBJECT METHODS + +See below for more detailed summaries. The main methods are: + +=head2 $alignment = $AlignIO-Enext_aln() + +Fetch an alignment from a formatted file. + +=head2 $AlignIO-Ewrite_aln($aln) + +Write the specified alignment to a file.. + +=head2 TIEHANDLE(), READLINE(), PRINT() + +These provide the tie interface. See L for more details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# 'Let the code begin... + +package Bio::AlignIO; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Seq; +use Bio::LocatableSeq; +use Bio::SimpleAlign; +use Bio::Root::IO; +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +=head2 new + + Title : new + Usage : $stream = Bio::AlignIO->new(-file => $filename, + '-format' => 'Format') + Function: Returns a new seqstream + Returns : A Bio::AlignIO::Handler initialised with + the appropriate format + Args : -file => $filename + -format => format + -fh => filehandle to attach to + +=cut + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::AlignIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ) || + 'fasta'; + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::AlignIO::$format"->new(@args); + } +} + + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format') + $sequence = <$fh>; # read a sequence object + print $fh $sequence; # write a sequence object + Returns : filehandle tied to the Bio::AlignIO::Fh class + Args : + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $sequence = <$fh>; # read a sequence object + print $fh $sequence; # write a sequence object + Returns : filehandle tied to the Bio::AlignIO::Fh class + Args : + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +# _initialize is where the heavy stuff will happen when new is called + +sub _initialize { + my($self,@args) = @_; + + $self->_initialize_io(@args); + 1; +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL AlignIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self,$format) = @_; + my $module = "Bio::AlignIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <next_aln + Function: reads the next $aln object from the stream + Returns : a Bio::Align::AlignI compliant object + Args : + +=cut + +sub next_aln { + my ($self,$aln) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::AlignIO object."); +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln($aln) + Function: writes the $aln object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + +=cut + +sub write_aln { + my ($self,$aln) = @_; + $self->throw("Sorry, you cannot write to a generic Bio::AlignIO object."); +} + +=head2 _guess_format + + Title : _guess_format + Usage : $obj->_guess_format($filename) + Function: + Example : + Returns : guessed format of filename (lower case) + Args : + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'fasta' if /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i; + return 'msf' if /\.(msf|pileup|gcg)$/i; + return 'pfam' if /\.(pfam|pfm)$/i; + return 'selex' if /\.(selex|slx|selx|slex|sx)$/i; + return 'phylip' if /\.(phylip|phlp|phyl|phy|phy|ph)$/i; + return 'nexus' if /\.(nexus|nex)$/i; + return 'mega' if( /\.(meg|mega)$/i ); + return 'clustalw' if( /\.aln$/i ); + return 'meme' if( /\.meme$/i ); + return 'emboss' if( /\.(water|needle)$/i ); + return 'psi' if( /\.psi$/i ); +} + +sub DESTROY { + my $self = shift; + $self->close(); +} + +sub TIEHANDLE { + my $class = shift; + return bless {'alignio' => shift},$class; +} + +sub READLINE { + my $self = shift; + return $self->{'alignio'}->next_aln() unless wantarray; + my (@list,$obj); + push @list,$obj while $obj = $self->{'alignio'}->next_aln(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'alignio'}->write_aln(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/bl2seq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/bl2seq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,190 @@ +# $Id: bl2seq.pm,v 1.13.2.1 2003/06/18 12:19:52 jason Exp $ +# +# BioPerl module for Bio::AlignIO::bl2seq + +# based on the Bio::SeqIO modules +# by Ewan Birney +# and Lincoln Stein +# +# the Bio::Tools::BPlite modules by +# Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +# Lorenz Pollak (lorenz@ist.org, bioperl port) +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::bl2seq - bl2seq sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class, as in: + + use Bio::AlignIO; + + $in = Bio::AlignIO->new(-file => "inputfilename" , '-format' => 'bl2seq'); + $aln = $in->next_aln(); + + +=head1 DESCRIPTION + +This object can create L sequence alignment objects (of +2 sequences) from bl2seq BLAST reports. + +A nice feature of this module is that- in combination with +StandAloneBlast.pm or remote blasting - it can be used to align 2 +sequences and make a SimpleAlign object from them which can then be +manipulated using any SimpleAlign.pm methods, eg: + + #Get 2 sequences + $str = Bio::SeqIO->new(-file=>'t/amino.fa' , '-format' => 'Fasta', ); + my $seq3 = $str->next_seq(); + my $seq4 = $str->next_seq(); + + # Run bl2seq on them + $factory = Bio::Tools::StandAloneBlast->new('program' => 'blastp', + 'outfile' => 'bl2seq.out'); + my $bl2seq_report = $factory->bl2seq($seq3, $seq4); + + # Use AlignIO.pm to create a SimpleAlign object from the bl2seq report + $str = Bio::AlignIO->new(-file=> 'bl2seq.out','-format' => 'bl2seq'); + $aln = $str->next_aln(); + + Pass in -report_type flag when initializing the object to have this + pass through to the Bio::Tools::BPbl2seq object. See that object. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::bl2seq; +use vars qw(@ISA); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::AlignIO; +use Bio::Tools::BPbl2seq; + +@ISA = qw(Bio::AlignIO); + + + +sub _initialize { + my ($self,@args) = @_; + $self->SUPER::_initialize(@args); + ($self->{'report_type'}) = $self->_rearrange([qw(REPORT_TYPE)], + @args); + return 1; +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Returns : L object - returns 0 on end of file + or on error + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my ($start,$end,$name,$seqname,$seq,$seqchar); + my $aln = Bio::SimpleAlign->new(-source => 'bl2seq'); + $self->{'bl2seqobj'} = + $self->{'bl2seqobj'} || Bio::Tools::BPbl2seq->new(-fh => $self->_fh, + -report_type => $self->{'report_type'}); + my $bl2seqobj = $self->{'bl2seqobj'}; + my $hsp = $bl2seqobj->next_feature; + $seqchar = $hsp->querySeq; + $start = $hsp->query->start; + $end = $hsp->query->end; + $seqname = 'Query-sequence'; # Query name not present in bl2seq report + +# unless ($seqchar && $start && $end && $seqname) {return 0} ; + unless ($seqchar && $start && $end ) {return 0} ; + + $seq = new Bio::LocatableSeq('-seq'=>$seqchar, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + + $seqchar = $hsp->sbjctSeq; + $start = $hsp->hit->start; + $end = $hsp->hit->end; + $seqname = $bl2seqobj->sbjctName; + + unless ($seqchar && $start && $end && $seqname) {return 0} ; + + $seq = new Bio::LocatableSeq('-seq'=>$seqchar, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + + return $aln; + +} + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in bl2seq format + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + + $self->throw("Sorry: writing bl2seq output is not available! /n"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/clustalw.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/clustalw.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,280 @@ +# $Id: clustalw.pm,v 1.21 2002/10/22 07:38:25 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::clustalw + +# based on the Bio::SeqIO modules +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::clustalw - clustalw sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::AlignIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Align::AlignI objects to and from clustalw flat +file databases. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::clustalw; +use vars qw(@ISA $LINELENGTH); +use strict; + +use Bio::AlignIO; +use Bio::LocatableSeq; +use Bio::SimpleAlign; # to be Bio::Align::Simple + +$LINELENGTH = 60; + +@ISA = qw(Bio::AlignIO); + +=head2 new + + Title : new + Usage : $alignio = new Bio::AlignIO(-format => 'clustalw', + -file => 'filename'); + Function: returns a new Bio::AlignIO object to handle clustalw files + Returns : Bio::AlignIO::clustalw object + Args : -verbose => verbosity setting (-1,0,1,2) + -file => name of file to read in or with ">" - writeout + -fh => alternative to -file param - provide a filehandle + to read from/write to + -format => type of Alignment Format to process or produce + -percentages => (clustalw only) display a percentage of identity + in each line of the alignment. + + -linelength=> Set the alignment output line length (default 60) + +=cut + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + my ($percentages, + $ll) = $self->_rearrange([qw(PERCENTAGES LINELENGTH)], @args); + defined $percentages && $self->percentages($percentages); + $self->line_length($ll || $LINELENGTH); +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream + Returns : Bio::Align::AlignI object + Args : NONE + +See L for details + +=cut + +sub next_aln { + my ($self) = @_; + + my $first_line; + if( defined ($first_line = $self->_readline ) + && $first_line !~ /CLUSTAL/ ) { + $self->warn("trying to parse a file which does not start with a CLUSTAL header"); + } + my %alignments; + my $aln = Bio::SimpleAlign->new(-source => 'clustalw'); + my $order = 0; + my %order; + $self->{_lastline} = ''; + while( defined ($_ = $self->_readline) ) { + next if ( /^\s+$/ ); + + my ($seqname, $aln_line) = ('', ''); + if( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) { + # clustal 1.4 format + ($seqname,$aln_line) = ("$1:$2-$3",$4); + } elsif( /^(\S+)\s+([A-Z\-]+)\s*$/ ) { + ($seqname,$aln_line) = ($1,$2); + } else { $self->{_lastline} = $_; next } + + if( !exists $order{$seqname} ) { + $order{$seqname} = $order++; + } + + $alignments{$seqname} .= $aln_line; + } + my ($sname,$start,$end); + foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) { + if( $name =~ /(\S+):(\d+)-(\d+)/ ) { + ($sname,$start,$end) = ($1,$2,$3); + } else { + ($sname, $start) = ($name,1); + my $str = $alignments{$name}; + $str =~ s/[^A-Za-z]//g; + $end = length($str); + } + my $seq = new Bio::LocatableSeq('-seq' => $alignments{$name}, + '-id' => $sname, + '-start' => $start, + '-end' => $end); + $aln->add_seq($seq); + } + undef $aln if( !defined $end || $end <= 0); + return $aln; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the clustalw-format object (.aln) into the stream + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my ($count,$length,$seq,@seq,$tempcount,$line_len); + $line_len = $self->line_length || $LINELENGTH; + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + my $matchline = $aln->match_line; + + $self->_print (sprintf("CLUSTAL W(1.81) multiple sequence alignment\n\n\n")) or return; + + $length = $aln->length(); + $count = $tempcount = 0; + @seq = $aln->each_seq(); + my $max = 22; + foreach $seq ( @seq ) { + $max = length ($aln->displayname($seq->get_nse())) + if( length ($aln->displayname($seq->get_nse())) > $max ); + } + while( $count < $length ) { + foreach $seq ( @seq ) { +# +# Following lines are to suppress warnings +# if some sequences in the alignment are much longer than others. + + my ($substring); + my $seqchars = $seq->seq(); + SWITCH: { + if (length($seqchars) >= ($count + $line_len)) { + $substring = substr($seqchars,$count,$line_len); + last SWITCH; + } elsif (length($seqchars) >= $count) { + $substring = substr($seqchars,$count); + last SWITCH; + } + $substring = ""; + } + + $self->_print (sprintf("%-".$max."s %s\n", + $aln->displayname($seq->get_nse()), + $substring)) or return; + } + + my $linesubstr = substr($matchline, $count,$line_len); + my $percentages = ''; + if( $self->percentages ) { + my ($strcpy) = ($linesubstr); + my $count = ($strcpy =~ tr/\*//); + $percentages = sprintf("\t%d%%", 100 * ($count / length($linesubstr))); + } + $self->_print (sprintf("%-".$max."s %s%s\n", '', $linesubstr, + $percentages)); + $self->_print (sprintf("\n\n")) or return; + $count += $line_len; + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 percentages + + Title : percentages + Usage : $obj->percentages($newval) + Function: Set the percentages flag - whether or not to show percentages in + each output line + Returns : value of percentages + Args : newvalue (optional) + + +=cut + +sub percentages { + my ($self,$value) = @_; + if( defined $value) { + $self->{'_percentages'} = $value; + } + return $self->{'_percentages'}; +} + +=head2 line_length + + Title : line_length + Usage : $obj->line_length($newval) + Function: Set the alignment output line length + Returns : value of line_length + Args : newvalue (optional) + + +=cut + +sub line_length { + my ($self,$value) = @_; + if( defined $value) { + $self->{'_line_length'} = $value; + } + return $self->{'_line_length'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/emboss.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/emboss.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,251 @@ +# $Id: emboss.pm,v 1.11 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::emboss +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle) + +=head1 SYNOPSIS + + # do not use the object directly + use Bio::AlignIO; + # read in an alignment from the EMBOSS program water + my $in = new Bio::AlignIO(-format => 'emboss', + -file => 'seq.water'); + while( my $aln = $in->next_aln ) { + # do something with the alignment + } + +=head1 DESCRIPTION + +This object handles parsing and writing pairwise sequence alignments +from the EMBOSS suite. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AlignIO::emboss; +use vars qw(@ISA $EMBOSSTitleLen $EMBOSSLineLen); +use strict; + +use Bio::AlignIO; +use Bio::LocatableSeq; + +@ISA = qw(Bio::AlignIO ); + +BEGIN { + $EMBOSSTitleLen = 13; + $EMBOSSLineLen = 50; +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + $self->{'_type'} = undef; +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Returns : L object - returns 0 on end of file + or on error + Args : NONE + +=cut + +sub next_aln { + my ($self) = @_; + my $seenbegin = 0; + my %data = ( 'seq1' => { + 'start'=> undef, + 'end'=> undef, + 'name' => '', + 'data' => '' }, + 'seq2' => { + 'start'=> undef, + 'end'=> undef, + 'name' => '', + 'data' => '' }, + 'align' => '', + 'type' => $self->{'_type'}, # to restore type from + # previous aln if possible + ); + my %names; + while( defined($_ = $self->_readline) ) { + next if( /^\#?\s+$/ || /^\#+\s*$/ ); + if( /^\#(\=|\-)+\s*$/) { + last if( $seenbegin); + } elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ || + /^\#\s+Program:\s+(\S+)/ ) + { + my ($name1,$name2) = ($2,$3); + if( ! defined $name1 ) { # Handle EMBOSS 2.2.X + $data{'type'} = $1; + $name1 = $name2 = ''; + } else { + $data{'type'} = $1 eq 'Local' ? 'water' : 'needle'; + } + $data{'seq1'}->{'name'} = $name1; + $data{'seq2'}->{'name'} = $name2; + + $self->{'_type'} = $data{'type'}; + + } elsif( /Score:\s+(\S+)/ ) { + $data{'score'} = $1; + } elsif( /^\#\s+(1|2):\s+(\S+)/ && ! $data{"seq$1"}->{'name'} ) { + my $nm = $2; + $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length + if( $names{$nm} ) { + $nm .= "-". $names{$nm}; + } + $names{$nm}++; + $data{"seq$1"}->{'name'} = $nm; + } elsif( $data{'seq1'}->{'name'} && + /^$data{'seq1'}->{'name'}/ ) { + my $count = 0; + $seenbegin = 1; + my @current; + while( defined ($_) ) { + my $align_other = ''; + my $delayed; + if($count == 0 || $count == 2 ) { + my @l = split; + my ($seq,$align,$start,$end); + if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) { + # weird boundary condition + ($start,$align,$end) = @l; + } elsif( @l == 3 ) { + $align = ''; + ($seq,$start,$end) = @l + } else { + ($seq,$start,$align,$end) = @l; + } + + my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2'); + $data{$seqname}->{'data'} .= $align; + $data{$seqname}->{'start'} ||= $start; + $data{$seqname}->{'end'} = $end; + $current[$count] = [ $start,$align || '']; + } else { + s/^\s+//; + s/\s+$//; + $data{'align'} .= $_; + } + + BOTTOM: + last if( $count++ == 2); + $_ = $self->_readline(); + } + + if( $data{'type'} eq 'needle' ) { + # which ever one is shorter we want to bring it up to + # length. Man this stinks. + my ($s1,$s2) = ($data{'seq1'}, $data{'seq2'}); + + my $d = length($current[0]->[1]) - length($current[2]->[1]); + if( $d < 0 ) { # s1 is smaller, need to add some + # compare the starting points for this alignment line + if( $current[0]->[0] <= 1 && $current[2]->[0] > 1) { + $s1->{'data'} = ('-' x abs($d)) . $s1->{'data'}; + $data{'align'} = (' 'x abs($d)).$data{'align'}; + } else { + $s1->{'data'} .= '-' x abs($d); + $data{'align'} .= ' 'x abs($d); + } + } elsif( $d > 0) { # s2 is smaller, need to add some + if( $current[2]->[0] <= 1 && $current[0]->[0] > 1) { + $s2->{'data'} = ('-' x abs($d)) . $s2->{'data'}; + $data{'align'} = (' 'x abs($d)).$data{'align'}; + } else { + $s2->{'data'} .= '-' x abs($d); + $data{'align'} .= ' 'x abs($d); + } + } + } + + } + } + return undef unless $seenbegin; + my $aln = Bio::SimpleAlign->new(-verbose => $self->verbose(), + -source => "EMBOSS-".$data{'type'}); + + foreach my $seqname ( qw(seq1 seq2) ) { + return undef unless ( defined $data{$seqname} ); + $data{$seqname}->{'name'} ||= $seqname; + my $seq = new Bio::LocatableSeq('-seq' => $data{$seqname}->{'data'}, + '-id' => $data{$seqname}->{'name'}, + '-start'=> $data{$seqname}->{'start'}, + '-end' => $data{$seqname}->{'end'}, + ); + $aln->add_seq($seq); + } + return $aln; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in emboss format + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + + $self->throw("Sorry: writing emboss output is not currently available! \n"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,199 @@ +# $Id: fasta.pm,v 1.11 2002/12/14 19:09:05 birney Exp $ +# +# BioPerl module for Bio::AlignIO::fasta + +# based on the Bio::SeqIO::fasta module +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::fasta - FastA MSA Sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from +fasta flat file databases. This is for the fasta sequence format NOT +FastA analysis program. To process the pairwise alignments from a +FastA (FastX, FastN, FastP, tFastA, etc) use the Bio::SearchIO module. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::fasta; +use vars qw(@ISA); +use strict; + +use Bio::AlignIO; +use Bio::SimpleAlign; + +@ISA = qw(Bio::AlignIO); + + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Returns : L object - returns 0 on end of file + or on error + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align); + my $aln = Bio::SimpleAlign->new(); + + while(defined ($entry = $self->_readline)) { + if($entry =~ /^>(\S+)/ ) { + $tempname = $1; + if( defined $name ) { + # put away last name and sequence + + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $end = length($seqchar); #ps 9/6/00 + } +# print STDERR "Going to add with $seqchar $seqname\n"; + $seq = new Bio::LocatableSeq('-seq'=>$seqchar, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + } + $name = $tempname; + $seqchar = ""; + next; + } + $entry =~ s/[^A-Za-z\.\-]//g; + $seqchar .= $entry; + + } +# +# Next two lines are to silence warnings that +# otherwise occur at EOF when using <$fh> + + if (!defined $name) {$name="";} + if (!defined $seqchar) {$seqchar="";} + +# Put away last name and sequence + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $end = length($seqchar); #ps 9/6/00 +# $end = length($align{$name}); + } + + +# If $end <= 0, we have either reached the end of +# file in <> or we have encountered some other error +# + if ($end <= 0) { undef $aln; return $aln;} + +# This logic now also reads empty lines at the +# end of the file. Skip this is seqchar and seqname is null + if( length($seqchar) == 0 && length($seqname) == 0 ) { + # skip + } else { +# print STDERR "end to add with $seqchar $seqname\n"; + $seq = new Bio::LocatableSeq('-seq'=>$seqchar, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + } + + return $aln; + +} + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in fasta format + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my ($seq,$rseq,$name,$count,$length,$seqsub); + + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + foreach $rseq ( $aln->each_seq() ) { + $name = $aln->displayname($rseq->get_nse()); + $seq = $rseq->seq(); + $self->_print (">$name\n") or return ; + $count =0; + $length = length($seq); + while( ($count * 60 ) < $length ) { + $seqsub = substr($seq,$count*60,60); + $self->_print ("$seqsub\n") or return ; + $count++; + } + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/mase.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/mase.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,155 @@ +# $Id: mase.pm,v 1.9 2002/10/22 07:38:25 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::mase + +# based on the Bio::SeqIO::mase module +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::mase - mase sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from mase flat +file databases. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::mase; +use vars qw(@ISA); +use strict; + +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my $name; + my $start; + my $end; + my $seq; + my $add; + my $count = 0; + my $seq_residues; + + my $aln = Bio::SimpleAlign->new(-source => 'mase'); + + + while( $entry = $self->_readline) { + $entry =~ /^;/ && next; + if( $entry =~ /^(\S+)\/(\d+)-(\d+)/ ) { + $name = $1; + $start = $2; + $end = $3; + } else { + $entry =~ s/\s//g; + $name = $entry; + $end = -1; + } + + $seq = ""; + + while( $entry = $self->_readline) { + $entry =~ /^;/ && last; + $entry =~ s/[^A-Za-z\.\-]//g; + $seq .= $entry; + } + if( $end == -1) { + $start = 1; + + $seq_residues = $seq; + $seq_residues =~ s/\W//g; + $end = length($seq_residues); + } + + $add = new Bio::LocatableSeq('-seq'=>$seq, + '-id'=>$name, + '-start'=>$start, + '-end'=>$end, + ); + + + $aln->add_seq($add); + + +# If $end <= 0, we have either reached the end of +# file in <> or we have encountered some other error +# + if ($end <= 0) { undef $aln;} + + } + + return $aln; +} + + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in mase format ###Not yet implemented!### + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + + $self->throw("Sorry: mase-format output, not yet implemented! /n"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/mega.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/mega.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,229 @@ +# $Id: mega.pm,v 1.8 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::mega +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::mega - Parse and Create MEGA format data files + +=head1 SYNOPSIS + + use Bio::AlignIO; + my $alignio = new Bio::AlignIO(-format => 'mega', + -file => 't/data/hemoglobinA.meg'); + + while( my $aln = $alignio->next_aln ) { + # process each alignment or convert to another format like NEXUS + } + +=head1 DESCRIPTION + +This object handles reading and writing data streams in the MEGA +format (Kumar and Nei). + + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AlignIO::mega; +use vars qw(@ISA $MEGANAMELEN %VALID_TYPES $LINELEN $BLOCKLEN); +use strict; + +use Bio::AlignIO; +use Bio::SimpleAlign; +use Bio::LocatableSeq; + +BEGIN { + $MEGANAMELEN = 10; + $LINELEN = 60; + $BLOCKLEN = 10; + %VALID_TYPES = map {$_, 1} qw( dna rna protein standard); +} +@ISA = qw(Bio::AlignIO ); + + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Supports the following MEGA format features: + - The file has to start with '#mega' + - Reads in the name of the alignment from a comment + (anything after '!TITLE: ') . + - Reads in the format parameters datatype + + Returns : L object - returns 0 on end of file + or on error + Args : NONE + + +=cut + +sub next_aln{ + my ($self) = @_; + my $entry; + my ($alphabet,%seqs); + + my $aln = Bio::SimpleAlign->new(-source => 'mega'); + + while( defined($entry = $self->_readline()) && ($entry =~ /^\s+$/) ) {} + + $self->throw("Not a valid MEGA file! [#mega] not starting the file!") + unless $entry =~ /^#mega/i; + + while( defined($entry = $self->_readline() ) ) { + local($_) = $entry; + if(/\!Title:\s*([^\;]+)\s*/i) { $aln->id($1)} + elsif( s/\!Format\s+([^\;]+)\s*/$1/ ) { + my (@fields) = split(/\s+/,$1); + foreach my $f ( @fields ) { + my ($name,$value) = split(/\=/,$f); + if( $name eq 'datatype' ) { + $alphabet = $value; + } elsif( $name eq 'identical' ) { + $aln->match_char($value); + } elsif( $name eq 'indel' ) { + $aln->gap_char($value); + } + } + } elsif( /^\#/ ) { + last; + } + } + my @order; + while( defined($entry) ) { + if( $entry !~ /^\s+$/ ) { + # this is to skip the leading '#' + my $seqname = substr($entry,1,$MEGANAMELEN-1); + $seqname =~ s/(\S+)\s+$/$1/g; + my $line = substr($entry,$MEGANAMELEN); + $line =~ s/\s+//g; + if( ! defined $seqs{$seqname} ) {push @order, $seqname; } + $seqs{$seqname} .= $line; + } + $entry = $self->_readline(); + } + + foreach my $seqname ( @order ) { + my $s = $seqs{$seqname}; + $s =~ s/\-//g; + my $end = length($s); + my $seq = new Bio::LocatableSeq(-alphabet => $alphabet, + -id => $seqname, + -seq => $seqs{$seqname}, + -start => 1, + -end => $end); + + $aln->add_seq($seq); + } + return $aln; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in MEGA format + Returns : 1 for success and 0 for error + Args : L object + +=cut + +sub write_aln{ + my ($self,@aln) = @_; + my $count = 0; + my $wrapped = 0; + my $maxname; + + foreach my $aln ( @aln ) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + return 0; + } elsif( ! $aln->is_flush($self->verbose) ) { + $self->warn("All Sequences in the alignment must be the same length"); + return 0; + } + $aln->match(); + my $len = $aln->length(); + my $format = sprintf('datatype=%s identical=%s indel=%s;', + $aln->get_seq_by_pos(1)->alphabet(), + $aln->match_char, $aln->gap_char); + + $self->_print(sprintf("#mega\n!Title: %s;\n!Format %s\n\n\n", + $aln->id, $format)); + + my ($count, $blockcount,$length) = ( 0,0,$aln->length()); + $aln->set_displayname_flat(); + while( $count < $length ) { + foreach my $seq ( $aln->each_seq ) { + my $seqchars = $seq->seq(); + $blockcount = 0; + my $substring = substr($seqchars, $count, $LINELEN); + my @blocks; + while( $blockcount < length($substring) ) { + push @blocks, substr($substring, $blockcount,$BLOCKLEN); + $blockcount += $BLOCKLEN; + } + $self->_print(sprintf("#%-".($MEGANAMELEN-1)."s%s\n", + substr($aln->displayname($seq->get_nse()), + 0,$MEGANAMELEN-2), + join(' ', @blocks))); + } + $self->_print("\n"); + $count += $LINELEN; + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/meme.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/meme.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,215 @@ +# $id $ +# +# BioPerl module for Bio::AlignIO::meme +# based on the Bio::SeqIO modules +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Benjamin Berman +# +# You may distribute this module under the same terms as perl itself +# _history + +=head1 NAME + +Bio::AlignIO::meme - meme sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::AlignIO class. + +=head1 DESCRIPTION + +This object transforms the "sites sorted by p-value" sections of a meme +(text) output file into a series of Bio::SimpleAlign objects. Each +SimpleAlign object contains Bio::LocatableSeq objects which represent the +individual aligned sites as defined by the central portion of the "site" +field in the meme file. The start and end coordinates are derived from +the "Start" field. See L and L for +more information. + +This module can only parse MEME version 3.0 and greater. Previous versions +have output formats that are more difficult to parse correctly. If the meme +output file is not version 3.0 or greater, we signal an error. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. +Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Benjamin Berman + + (based on the Bio::SeqIO modules by Ewan Birney and others) + Email: benb@fruitfly.berkeley.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with an +underscore. + +=cut + +# Let the code begin... + +package Bio::AlignIO::meme; +use vars qw(@ISA); +use strict; + +use Bio::AlignIO; +use Bio::LocatableSeq; + +@ISA = qw(Bio::AlignIO); + +# Constants +my $MEME_VERS_ERR = "MEME output file must be generated by version 3.0 or higher"; +my $MEME_NO_HEADER_ERR = "MEME output file contains no header line (ex: MEME version 3.0)"; +my $HTML_VERS_ERR = "MEME output file must be generated with the -text option"; + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream + Returns : Bio::SimpleAlign object + Args : NONE + +=cut + +sub next_aln { + my ($self) = @_; + my $aln = Bio::SimpleAlign->new(-source => 'meme'); + my $line; + my $good_align_sec = 0; + my $in_align_sec = 0; + while (!$good_align_sec && defined($line = $self->_readline())) + { + if (!$in_align_sec) + { + # Check for the meme header + if ($line =~ /^\s*[Mm][Ee][Mm][Ee]\s+version\s+((?:\d+)?\.\d+)/) + { + $self->{'meme_vers'} = $1; + $self->throw($MEME_VERS_ERR) unless ($self->{'meme_vers'} >= 3.0); + $self->{'seen_header'} = 1; + } + + # Check if they've output the HTML version + if ($line =~ /\<[Tt][Ii][Tt][Ll][Ee]\>/) + { + $self->throw($HTML_VERS_ERR); + } + + # Check if we're going into an alignment section + if ($line =~ /sites sorted by position/) # meme vers > 3.0 + { + $self->throw($MEME_NO_HEADER_ERR) unless ($self->{'seen_header'}); + $in_align_sec = 1; + } + } + elsif ($line =~ /^\s*(\S+)\s+([+-])\s+(\d+)\s+(\S+)\s+([\.ACTGactg]*) ([ACTGactg]+) ([\.ACTGactg]*)/) + { + # Got a sequence line + my $seq_name = $1; + my $strand = ($2 eq '+') ? 1 : -1; + my $start_pos = $3; + # my $p_val = $4; + # my $left_flank = uc($5); + my $central = uc($6); + # my $right_flank = uc($7); + + # Info about the sequence + my $seq_res = $central; + my $seq_len = length($seq_res); + + # Info about the flanking sequence + # my $left_len = length($left_flank); + # my $right_len = length($right_flank); + # my $start_len = ($strand > 0) ? $left_len : $right_len; + # my $end_len = ($strand > 0) ? $right_len : $left_len; + + # Make the sequence. Meme gives the start coordinate at the left + # hand side of the motif relative to the INPUT sequence. + my $start_coord = $start_pos; + my $end_coord = $start_coord + $seq_len - 1; + my $seq = new Bio::LocatableSeq('-seq'=>$seq_res, + '-id'=>$seq_name, + '-start'=>$start_coord, + '-end'=>$end_coord, + '-strand'=>$strand); + + # Make a seq_feature out of the motif + $aln->add_seq($seq); + } + elsif (($line =~ /^\-/) || ($line =~ /Sequence name/)) + { + # These are acceptable things to be in the site section + } + elsif ($line =~ /^\s*$/) + { + # This ends the site section + $in_align_sec = 0; + $good_align_sec = 1; + } + else + { + $self->warn("Unrecognized format:\n$line"); + return 0; + } + } + + # Signal an error if we didn't find a header section + $self->throw($MEME_NO_HEADER_ERR) unless ($self->{'seen_header'}); + + return (($good_align_sec) ? $aln : 0); +} + + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: Not implemented + Returns : 1 for success and 0 for error + Args : Bio::SimpleAlign object + +=cut + +sub write_aln { + my ($self,@aln) = @_; + + # Don't handle it yet. + $self->throw("AlignIO::meme::write_aln not implemented"); + return 0; +} + + + +# ---------------------------------------- +# - Private methods +# ---------------------------------------- + + + +sub _initialize { + my($self,@args) = @_; + + # Call into our base version + $self->SUPER::_initialize(@args); + + # Then initialize our data variables + $self->{'seen_header'} = 0; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/msf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/msf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,242 @@ +# $Id: msf.pm,v 1.16 2002/11/26 16:34:39 jason Exp $ +# +# BioPerl module for Bio::AlignIO::msf + +# based on the Bio::SeqIO::msf module +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::msf - msf sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from msf flat +file databases. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::msf; +use vars qw(@ISA %valid_type); +use strict; + +use Bio::AlignIO; +use Bio::SeqIO::gcg; # for GCG_checksum() +use Bio::SimpleAlign; + +@ISA = qw(Bio::AlignIO); + +BEGIN { + %valid_type = qw( dna N rna N protein P ); +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. Tries to read *all* MSF + It reads all non whitespace characters in the alignment + area. For MSFs with weird gaps (eg ~~~) map them by using + $al->map_chars('~','-') + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq); + + my $aln = Bio::SimpleAlign->new(-source => 'gcg' ); + + while( $entry = $self->_readline) { + $entry =~ /\/\// && last; # move to alignment section + $entry =~ /Name:\s+(\S+)/ && do { $name = $1; + $hash{$name} = ""; # blank line + push(@names,$name); # we need it ordered! + }; + # otherwise - skip + } + + # alignment section + + while( $entry = $self->_readline) { + next if ( $entry =~ /^\s+(\d+)/ ) ; + $entry =~ /^\s*(\S+)\s+(.*)$/ && do { + $name = $1; + $str = $2; + if( ! exists $hash{$name} ) { + $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!"); + } + $str =~ s/\s//g; + $hash{$name} .= $str; + }; + } + + return 0 if scalar @names < 1; + + # now got this as a name - sequence hash. Lets make some sequences! + + foreach $name ( @names ) { + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $str = $hash{$name}; + $str =~ s/[^A-Za-z]//g; + $end = length($str); + } + + $seq = new Bio::LocatableSeq('-seq'=>$hash{$name}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + + +# If $end <= 0, we have either reached the end of +# file in <> or we have encountered some other error +# +# if ($end <= 0) { undef $aln;} + + + } + + return $aln; +} + + + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in MSF format + Sequence type of the alignment is determined by the first sequence. + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my $msftag; + my $type; + my $count = 0; + my $maxname; + my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index); + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + $date = localtime(time); + $msftag = "MSF"; + $type = $valid_type{$aln->get_seq_by_pos(1)->alphabet}; + $maxname = $aln->maxdisplayname_length(); + $length = $aln->length(); + $name = $aln->id(); + if( !defined $name ) { + $name = "Align"; + } + + + $self->_print (sprintf("\n%s MSF: %d Type: %s %s Check: 00 ..\n\n", + $name, $aln->no_sequences, $type, $date)); + + + foreach $seq ( $aln->each_seq() ) { + + + $name = $aln->displayname($seq->get_nse()); + $miss = $maxname - length ($name); + $miss += 2; + $pad = " " x $miss; + + $self->_print (sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$name,$pad,length $seq->seq(), Bio::SeqIO::gcg->GCG_checksum($seq))); + + $hash{$name} = $seq->seq(); + push(@arr,$name); + } + # ok - heavy handed, but there you go. + # + $self->_print ("\n//\n\n\n"); + + while( $count < $length ) { + # there is another block to go! + foreach $name ( @arr ) { + $self->_print (sprintf("%-20s ",$name)); + + $tempcount = $count; + $index = 0; + while( ($tempcount + 10 < $length) && ($index < 5) ) { + + $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); + + $tempcount += 10; + $index++; + } # + # ok, could be the very last guy ;) + # + if( $index < 5) { + # space to print! + # + $self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); + $tempcount += 10; + } + $self->_print ("\n"); + } + $self->_print ("\n\n"); + $count = $tempcount; + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/nexus.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/nexus.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,343 @@ +# $Id: nexus.pm,v 1.12.2.1 2003/04/07 15:17:17 heikki Exp $ +# +# BioPerl module for Bio::AlignIO::nexus +# +# Copyright Heikki Lehvaslaiho +# + +=head1 NAME + +Bio::AlignIO::nexus - NEXUS format sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from NEXUS +data blocks. See method documentation for supported NEXUS features. + +=head1 ACKNOWLEDGEMENTS + +Will Fisher has written an excellent standalone NEXUS format parser in +perl, readnexus. A number of tricks were adapted from it. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::nexus; +use vars qw(@ISA %valid_type); +use strict; +no strict "refs"; + +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +BEGIN { + %valid_type = map {$_, 1} qw( dna rna protein standard); +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: Returns the next alignment in the stream. + + Supports the following NEXUS format features: + - The file has to start with '#NEXUS' + - Reads in the name of the alignment from a comment + (anything after 'TITLE: ') . + - Sequence names can be given in a taxa block, too. + - If matchchar notation is used, converts + them back to sequence characters. + - Does character conversions specified in the + NEXUS equate command. + - Sequence names of type 'Homo sapiens' and + Homo_sapiens are treated identically. + + Returns : L object + Args : + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($aln_name, $seqcount, $residuecount, %hash, $alphabet, + $match, $gap, $missing, $equate, $interleave, + $name,$str,@names,$seqname,$start,$end,$count,$seq); + + my $aln = Bio::SimpleAlign->new(-source => 'nexus'); + + # file starts with '#NEXUS' but we allow white space only lines before it + $entry = $self->_readline; + $entry = $self->_readline while $entry =~ /^\s+$/; + + return unless $entry; + $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry") + unless $entry =~ /^#NEXUS/i; + + # skip anything before either the taxa or data block + # but read in the optional title in a comment + while (defined($entry = $self->_readline)) { + local ($_) = $entry; + /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1; + last if /^begin +data/i || /^begin +taxa/i; + } + $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name; + + # data and taxa blocks + my $taxlabels; + while ($entry = $self->_readline) { + local ($_) = $entry; + + # read in seq names if in taxa block + $taxlabels = 1 if /taxlabels/i; + if ($taxlabels) { + @names = $self->_read_taxlabels; + $taxlabels = 0; + } + + /ntax ?= ?(\d+)/i and $seqcount = $1; + /nchar ?= ?(\d+)/i and $residuecount = $1; + /matchchar ?= ?(.)/i and $match = $1; + /gap ?= ?(.)/i and $gap = $1; + /missing ?= ?(.)/i and $missing = $1; + /equate ?= ?"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A"; + /datatype ?= ?(\w+)/i and $alphabet = lc $1; + /interleave/i and $interleave = 1 ; + + last if /matrix/i; + } + $self->throw("Not a valid NEXUS sequence file. Datatype not specified") + unless $alphabet; + $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]") + unless $valid_type{$alphabet}; + + $aln->gap_char($gap); + $aln->missing_char($missing); + + # + # if data is not right after the matrix line + # read the empty lines out + # + while ($entry = $self->_readline) { + unless ($entry =~ /^\s+$/) { + $self->_pushback($entry); + last; + } + } + + # + # matrix command + # + # first alignment section + if (@names == 0) { # taxa block did not exist + while ($entry = $self->_readline) { + local ($_) = $entry; + + s/\[[^[]+\]//g; #] remove comments + if ($interleave) { + /^\s+$/ and last; + } else { + /^\s+$/ and next; + } + /^\s*;\s*$/ and last; + if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' + $name = ($2 || $3); + $str = $4; + $name =~ s/ /_/g; + push @names, $name; + + $str =~ s/\s//g; + $count = @names; + $hash{$count} = $str; + }; + $self->throw("Not a valid interleaved NEXUS file! +seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount; + } + } + + # interleaved sections + $count = 0; + while( $entry = $self->_readline) { + local ($_) = $entry; + s/\[[^[]+\]//g; #] remove comments + last if /^\s*;/; + + $count = 0, next if $entry =~ /^\s*$/; + if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' + $str = $4; + $str =~ s/\s//g; + $count++; + $hash{$count} .= $str; + }; + $self->throw("Not a valid interleaved NEXUS file! +seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount; + + } + + return 0 if @names < 1; + + # sequence creation + $count = 0; + foreach $name ( @names ) { + $count++; + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $str = $hash{$count}; + $str =~ s/[^A-Za-z]//g; + $end = length($str); + } + + # consistency test + $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") + unless CORE::length($hash{$count}) == $residuecount; + + $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + 'alphabet'=>$alphabet + ); + $aln->add_seq($seq); + } + + # if matchchar is used + $aln->unmatch($match) if $match; + + # if equate ( e.g. equate="T=C G=A") is used + if ($equate) { + $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g; + } + + while ($entry !~ /endblock/i) { + $entry = $self->_readline; + } + + return $aln; +} + +sub _read_taxlabels { + my ($self) = @_; + my ($name, @names); + while (my $entry = $self->_readline) { + ($name) = $entry =~ /\s*(\S+)\s+/; + $name =~ s/\[[^\[]+\]//g; + $name =~ s/\W/_/g; + push @names, $name; + last if /^\s*;/; + } + return @names; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: Writes the $aln object into the stream in interleaved NEXUS + format. Everything is written into a data block. + SimpleAlign methods match_char, missing_char and gap_char must be set + if you want to see them in the output. + Returns : 1 for success and 0 for error + Args : L object + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my $count = 0; + my $wrapped = 0; + my $maxname; + my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index ); + my ($match, $missing, $gap,$symbols) = ('', '', '',''); + + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + $self->throw("All sequences in the alignment must be the same length") + unless $aln->is_flush($self->verbose); + + $length = $aln->length(); + + $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n", + $aln->id, $aln->no_sequences, $length)); + $match = "match=". $aln->match_char if $aln->match_char; + $missing = "missing=". $aln->missing_char if $aln->missing_char; + $gap = "gap=". $aln->gap_char if $aln->gap_char; + $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $aln->symbol_chars); + $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n", + $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols)); + + my $indent = $aln->maxdisplayname_length; + $aln->set_displayname_flat(); + foreach $seq ( $aln->each_seq() ) { + $name = $aln->displayname($seq->get_nse()); + $name = sprintf("%-${indent}s", $name); + $hash{$name} = $seq->seq(); + push(@arr,$name); + } + + while( $count < $length ) { + # there is another block to go! + foreach $name ( @arr ) { + my $dispname = $name; +# $dispname = '' if $wrapped; + $self->_print (sprintf("%${indent}s ",$dispname)); + $tempcount = $count; + $index = 0; + while( ($tempcount + 10 < $length) && ($index < 5) ) { + $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); + $tempcount += 10; + $index++; + } + # last + if( $index < 5) { + # space to print! + $self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); + $tempcount += 10; + } + $self->_print ("\n"); + } + $self->_print ("\n\n"); + $count = $tempcount; + $wrapped = 1; + } + $self->_print (";\n\nendblock;\n"); + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/pfam.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/pfam.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,154 @@ +# $Id: pfam.pm,v 1.10 2002/10/22 07:38:26 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::pfam + +# based on the Bio::SeqIO:: modules +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::pfam - pfam sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform Bio::SimpleAlign objects to and from pfam flat +file databases. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::pfam; +use vars qw(@ISA); +use strict; + +use Bio::AlignIO; +use Bio::SimpleAlign; +@ISA = qw(Bio::AlignIO); + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my $name; + my $start; + my $end; + my $seq; + my $add; + my $acc; + my %names; + + my $aln = Bio::SimpleAlign->new(-source => 'pfam'); + + while( $entry = $self->_readline) { + chomp $entry; + $entry =~ /^\/\// && last; + if($entry !~ /^(\S+)\/(\d+)-(\d+)\s+(\S+)\s*/ ) { + $self->throw("Found a bad line [$_] in the pfam format alignment"); + next; + } + + $name = $1; + $start = $2; + $end = $3; + $seq = $4; + + + $add = new Bio::LocatableSeq('-seq'=>$seq, + '-id'=>$name, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($add); + + } + +# If $end <= 0, we have either reached the end of +# file in <> or we have encountered some other error +# + if ($end <= 0) { undef $aln;} + + return $aln; +} + + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + if( @aln > 1 ) { $self->warn("Only the 1st pfam alignment will be output since the format does not support multiple alignments in the same file"); } + my $aln = shift @aln; + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + my ($namestr,$seq,$add); + my ($maxn); + $maxn = $aln->maxdisplayname_length(); + + foreach $seq ( $aln->each_seq() ) { + $namestr = $aln->displayname($seq->get_nse()); + $add = $maxn - length($namestr) + 2; + $namestr .= " " x $add; + $self->_print (sprintf("%s %s\n",$namestr,$seq->seq())) or return; + } + $self->flush() if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/phylip.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/phylip.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,400 @@ +# $Id: phylip.pm,v 1.24.2.1 2003/01/26 15:52:30 jason Exp $ +# +# BioPerl module for Bio::AlignIO::phylip +# +# Copyright Heikki Lehvaslaiho +# + +=head1 NAME + +Bio::AlignIO::phylip - PHYLIP format sequence input/output stream + +=head1 SYNOPSIS + +# Do not use this module directly. Use it via the Bio::AlignIO class. + + use Bio::AlignIO; + use Bio::SimpleAlign; + #you can set the name length to something other than the default 10 + #if you use a version of phylip (hacked) that accepts ids > 10 + my $phylipstream = new Bio::AlignIO(-format => 'phylip', + -fh => \*STDOUT, + -idlength=>30); + # convert data from one format to another + my $gcgstream = new Bio::AlignIO(-format => 'msf', + -file => 't/data/cysprot1a.msf'); + + while( my $aln = $gcgstream->next_aln ) { + $phylipstream->write_aln($aln); + } + + # do it again with phylip sequential format format + $phylipstream->interleaved(0); + # can also initialize the object like this + $phylipstream = new Bio::AlignIO(-interleaved => 0, + -format => 'phylip', + -fh => \*STDOUT, + -idlength=>10); + $gcgstream = new Bio::AlignIO(-format => 'msf', + -file => 't/data/cysprot1a.msf'); + + while( my $aln = $gcgstream->next_aln ) { + $phylipstream->write_aln($aln); + } + +=head1 DESCRIPTION + +This object can transform Bio::SimpleAlign objects to and from PHYLIP +interleaved format. It will not work with PHYLIP sequencial format. + +This module will output PHYLIP sequential format. By specifying the +flag -interleaved =E 0 in the initialization the module can output +data in interleaved format. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich + +Email: heikki@ebi.ac.uk +Email: jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::phylip; +use vars qw(@ISA $DEFAULTIDLENGTH $DEFAULTLINELEN); +use strict; + +use Bio::SimpleAlign; +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +BEGIN { + $DEFAULTIDLENGTH = 10; + $DEFAULTLINELEN = 60; +} + +=head2 new + + Title : new + Usage : my $alignio = new Bio::AlignIO(-format => 'phylip' + -file => '>file', + -idlength => 10, + -idlinebreak => 1); + Function: Initialize a new L reader or writer + Returns : L object + Args : [specific for writing of phylip format files] + -idlength => integer - length of the id (will pad w/ + spaces if needed) + -interleaved => boolean - whether or not write as interleaved + or sequential format + -linelength => integer of how long a sequence lines should be + -idlinebreak => insert a line break after the sequence id + so that sequence starts on the next line + +=cut + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + + my ($interleave,$linelen,$idlinebreak, + $idlength) = $self->_rearrange([qw(INTERLEAVED + LINELENGTH + IDLINEBREAK + IDLENGTH)],@args); + $self->interleaved(1) if( $interleave || ! defined $interleave); + $self->idlength($idlength || $DEFAULTIDLENGTH); + $self->id_linebreak(1) if( $idlinebreak ); + $self->line_length($linelen) if defined $linelen && $linelen > 0; + 1; +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Throws an exception if trying to read in PHYLIP + sequential format. + Returns : L object + Args : + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($seqcount, $residuecount, %hash, $name,$str, + @names,$seqname,$start,$end,$count,$seq); + + my $aln = Bio::SimpleAlign->new(-source => 'phylip'); + $entry = $self->_readline and + ($seqcount, $residuecount) = $entry =~ /\s*(\d+)\s+(\d+)/; + return 0 unless $seqcount and $residuecount; + + # first alignment section + my $idlen = $self->idlength; + $count = 0; + my $non_interleaved = ! $self->interleaved ; + while( $entry = $self->_readline) { + last if( $entry =~ /^\s?$/ && ! $non_interleaved ); + + if( $entry =~ /^\s+(.+)$/ ) { + $str = $1; + $non_interleaved = 1; + $str =~ s/\s//g; + $count = scalar @names; + $hash{$count} .= $str; + } elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ) { + $name = $1; + $str = $2; + $name =~ s/[\s\/]/_/g; + $name =~ s/_+$//; # remove any trailing _'s + push @names, $name; + $str =~ s/\s//g; + $count = scalar @names; + $hash{$count} = $str; + } + $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; + } + + unless( $non_interleaved ) { + # interleaved sections + $count = 0; + while( $entry = $self->_readline) { + # finish current entry + if($entry =~/\s*\d+\s+\d+/){ + $self->_pushback($entry); + last; + } + $count = 0, next if $entry =~ /^\s$/; + + $entry =~ /\s*(.*)$/ && do { + $str = $1; + $str =~ s/\s//g; + $count++; + $hash{$count} .= $str; + }; + $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; + } + } + return 0 if scalar @names < 1; + + # sequence creation + $count = 0; + foreach $name ( @names ) { + $count++; + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $str = $hash{$count}; + $str =~ s/[^A-Za-z]//g; + $end = length($str); + } + # consistency test + $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") + unless CORE::length($hash{$count}) == $residuecount; + + $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + + } + return $aln; +} + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in MSF format + Returns : 1 for success and 0 for error + Args : L object + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my $count = 0; + my $wrapped = 0; + my $maxname; + my ($length,$date,$name,$seq,$miss,$pad, + %hash,@arr,$tempcount,$index,$idlength); + + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + $self->throw("All sequences in the alignment must be the same length") + unless $aln->is_flush(1) ; + + $aln->set_displayname_flat(); # plain + $length = $aln->length(); + $self->_print (sprintf(" %s %s\n", $aln->no_sequences, $aln->length)); + + $idlength = $self->idlength(); + foreach $seq ( $aln->each_seq() ) { + $name = $aln->displayname($seq->get_nse); + $name = substr($name, 0, $idlength) if length($name) > $idlength; + $name = sprintf("%-".$idlength."s",$name); + if( $self->interleaved() ) { + $name .= ' ' ; + } elsif( $self->id_linebreak) { + $name .= "\n"; + } + + #phylip needs dashes not dots + my $seq = $seq->seq(); + $seq=~s/\./-/g; + $hash{$name} = $seq; + push(@arr,$name); + } + + if( $self->interleaved() ) { + while( $count < $length ) { + + # there is another block to go! + foreach $name ( @arr ) { + my $dispname = $name; + $dispname = '' if $wrapped; + $self->_print (sprintf("%".($idlength+3)."s",$dispname)); + $tempcount = $count; + $index = 0; + while( ($tempcount + $idlength < $length) && ($index < 5) ) { + $self->_print (sprintf("%s ",substr($hash{$name}, + $tempcount, + $idlength))); + $tempcount += $idlength; + $index++; + } + # last + if( $index < 5) { + # space to print! + $self->_print (sprintf("%s ",substr($hash{$name}, + $tempcount))); + $tempcount += $idlength; + } + $self->_print ("\n"); + } + $self->_print ("\n"); + $count = $tempcount; + $wrapped = 1; + } + } else { + foreach $name ( @arr ) { + my $dispname = $name; + $dispname = '' if $wrapped; + $self->_print (sprintf("%s%s\n",$dispname,$hash{$name})); + } + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 interleaved + + Title : interleaved + Usage : my $interleaved = $obj->interleaved + Function: Get/Set Interleaved status + Returns : boolean + Args : boolean + + +=cut + +sub interleaved{ + my ($self,$value) = @_; + my $previous = $self->{'_interleaved'}; + if( defined $value ) { + $self->{'_interleaved'} = $value; + } + return $previous; +} + +=head2 idlength + + Title : idlength + Usage : my $idlength = $obj->interleaved + Function: Get/Set value of id length + Returns : string + Args : string + + +=cut + +sub idlength { + my($self,$value) = @_; + if (defined $value){ + $self->{'_idlength'} = $value; + } + return $self->{'_idlength'}; +} + +=head2 line_length + + Title : line_length + Usage : $obj->line_length($newval) + Function: + Returns : value of line_length + Args : newvalue (optional) + + +=cut + +sub line_length{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'line_length'} = $value; + } + return $self->{'line_length'} || $DEFAULTLINELEN; + +} + +=head2 id_linebreak + + Title : id_linebreak + Usage : $obj->id_linebreak($newval) + Function: + Returns : value of id_linebreak + Args : newvalue (optional) + + +=cut + +sub id_linebreak{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_id_linebreak'} = $value; + } + return $self->{'_id_linebreak'} || 0; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/prodom.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/prodom.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,137 @@ +# $Id: prodom.pm,v 1.8 2002/10/22 07:38:26 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::prodom + +# based on the Bio::SeqIO::prodom module +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::prodom - prodom sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from prodom flat +file databases. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::prodom; +use vars qw(@ISA); +use strict; + +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($acc, $fake_id, $start, $end, $seq, $add, %names); + + my $aln = Bio::SimpleAlign->new(-source => 'prodom'); + + while( $entry = $self->_readline) { + + if ($entry =~ /^AC\s+(\S+)\s*$/) { #ps 9/12/00 + $aln->id( $1 ); + } + elsif ($entry =~ /^AL\s+(\S+)\|(\S+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s*$/){ #ps 9/12/00 + $acc=$1; + $fake_id=$2; # Accessions have _species appended + $start=$3; + $end=$4; + $seq=$5; + + $names{'fake_id'} = $fake_id; + + $add = new Bio::LocatableSeq('-seq'=>$seq, + '-id'=>$acc, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($add); + } + elsif ($entry =~ /^CO/) { + # the consensus line marks the end of the alignment part of the entry + last; + } + } + +# If $end <= 0, we have either reached the end of +# file in <> or we have encountered some other error +# + if ($end <= 0) { undef $aln;} + + + return $aln; +} + + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in prodom format ###Not yet implemented!### + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + + $self->throw("Sorry: prodom-format output, not yet implemented! /n"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/psi.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/psi.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,166 @@ +# $Id: psi.pm,v 1.6 2002/12/23 19:36:39 jason Exp $ +# +# BioPerl module for Bio::AlignIO::psi +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::psi - Read/Write PSI-BLAST profile alignment files + +=head1 SYNOPSIS + +This module will parse PSI-BLAST output of the format seqid XXXX + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AlignIO::psi; +use vars qw(@ISA $BlockLen $IdLength); +use strict; + +$BlockLen = 100; +$IdLength = 13; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SimpleAlign; +use Bio::AlignIO; +use Bio::LocatableSeq; + +@ISA = qw(Bio::AlignIO); + +=head2 new + + Title : new + Usage : my $obj = new Bio::AlignIO::psi(); + Function: Builds a new Bio::AlignIO::psi object + Returns : Bio::AlignIO::psi + Args : + +=cut + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my ($self) = @_; + my $aln; + my %seqs; + my @order; + while( defined ($_ = $self->_readline ) ) { + next if( /^\s+$/); + if( !defined $aln ) { + $aln = new Bio::SimpleAlign; + } + my ($id,$s) = split; + push @order, $id if( ! defined $seqs{$id}); + $seqs{$id} .= $s; + } + foreach my $id ( @order) { + my $seq = new Bio::LocatableSeq(-seq => $seqs{$id}, + -id => $id, + -start => 1, + -end => length($seqs{$id})); + $aln->add_seq($seq); + } + return $aln; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the NCBI psi-format object (.aln) into the stream + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,$aln) = @_; + unless( defined $aln && ref($aln) && + $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a valid Bio::Align::AlignI to write_aln"); + return 0; + } + my $ct = 0; + my @seqs = $aln->each_seq; + my $len = 1; + my $alnlen = $aln->length; + my $idlen = $IdLength; + my @ids = map { substr($_->display_id,0,$idlen) } @seqs; + while( $len < $alnlen ) { + my $start = $len; + my $end = $len + $BlockLen; + if( $end > $alnlen ) { $end = $alnlen; } + my $c = 0; + foreach my $seq ( @seqs ) { + $self->_print(sprintf("%-".$idlen."s %s\n", + $ids[$c++], + $seq->subseq($start,$end))); + } + $self->_print("\n"); + $len += $BlockLen+1; + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/selex.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/selex.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,169 @@ +# $Id: selex.pm,v 1.10 2002/10/22 07:38:26 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::selex + +# based on the Bio::SeqIO::selex module +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::selex - selex sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from selex flat +file databases. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::selex; +use vars qw(@ISA); +use strict; +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. Tries to read *all* selex + It reads all non whitespace characters in the alignment + area. For selexs with weird gaps (eg ~~~) map them by using + $al->map_chars('~','-') + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($start,$end,%align,$name,$seqname,$seq,$count,%hash,%c2name, %accession, $no); + my $aln = Bio::SimpleAlign->new(-source => 'selex'); + + # in selex format, every non-blank line that does not start + # with '#=' is an alignment segment; the '#=' lines are mark up lines. + # Of particular interest are the '#=GF AC ' + # lines, which give accession numbers for each segment + + while( $entry = $self->_readline) { + $entry =~ /^\#=GS\s+(\S+)\s+AC\s+(\S+)/ && do { + $accession{ $1 } = $2; + next; + }; + $entry !~ /^([^\#]\S+)\s+([A-Za-z\.\-]+)\s*/ && next; + + $name = $1; + $seq = $2; + + if( ! defined $align{$name} ) { + $count++; + $c2name{$count} = $name; + } + $align{$name} .= $seq; + } + + # ok... now we can make the sequences + + $count = 0; + foreach $no ( sort { $a <=> $b } keys %c2name ) { + $name = $c2name{$no}; + + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $end = length($align{$name}); + } + $seq = new Bio::LocatableSeq('-seq'=>$align{$name}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + '-type'=>'aligned', + '-accession_number' => $accession{$name}, + + ); + + $aln->add_seq($seq); + $count++; + } + +# If $end <= 0, we have either reached the end of +# file in <> or we have encountered some other error +# + if ($end <= 0) { undef $aln;} + + return $aln; +} + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in selex format + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my ($namestr,$seq,$add); + my ($maxn); + foreach my $aln (@aln) { + $maxn = $aln->maxdisplayname_length(); + foreach $seq ( $aln->each_seq() ) { + $namestr = $aln->displayname($seq->get_nse()); + $add = $maxn - length($namestr) + 2; + $namestr .= " " x $add; + $self->_print (sprintf("%s %s\n",$namestr,$seq->seq())) or return; + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AlignIO/stockholm.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/stockholm.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,190 @@ +# $Id: stockholm.pm,v 1.10.2.1 2003/03/14 09:14:59 heikki Exp $ +# +# BioPerl module for Bio::AlignIO::stockholm + +# based on the Bio::SeqIO::stockholm module +# by Ewan Birney +# and Lincoln Stein +# +# and the SimpleAlign.pm module of Ewan Birney +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# September 5, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::stockholm - stockholm sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L class. + +=head1 DESCRIPTION + +This object can transform L objects to and from +stockholm flat file databases. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Andreas Kahari, ak@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::stockholm; +use vars qw(@ISA); +use strict; + +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Returns : L object + Args : NONE + +=cut + +sub next_aln { + my $self = shift; + my $entry; + + my ($start,$end,%align,$name,$seqname,$seq,$count, + %hash,%c2name, %accession, $no); + + # in stockholm format, every non-blank line that does not start + # with '#=' is an alignment segment; the '#=' lines are mark up lines. + # Of particular interest are the '#=GF AC ' + # lines, which give accession numbers for each segment + + my $aln = Bio::SimpleAlign->new(-source => 'stockholm'); + + while( defined($entry = $self->_readline) ) { + $entry !~ /\w+/ && next; + + if ($entry =~ /^#\s*STOCKHOLM\s+/) { + last; + } + else { + $self->throw("Not Stockholm format: Expecting \"# STOCKHOLM 1.0\"; Found \"$_\""); + } + } +# +# Next section is same as for selex format +# + while( defined($entry = $self->_readline) ) { + # Double slash (//) signals end of file. The flat Pfam-A data from + # ftp://ftp.sanger.ac.uk/pub/databases/Pfam/Pfam-A.full.gz consists + # of several concatenated Stockholm-formatted files. The following + # line makes it possible to parse it without this module trying to + # read the whole file into memory. Andreas Kähäri 10/3/2003. + last if $entry =~ /^\/\//; + + # Extra bonus: Get the name of the alignment. + # Andreas Kähäri 10/3/2003. + if ($entry =~ /^#=GF\s+AC\s+(\S+)/) { + $aln->id($1); + next; + } + + $entry =~ /^#=GS\s+(\S+)\s+AC\s+(\S+)/ && do { + $accession{ $1 } = $2; + next; + }; + $entry =~ /^([A-Za-z.-]+)$/ && ( $align{$name} .= $1 ) && next; + $entry !~ /^([^#]\S+)\s+([A-Za-z.-]+)\s*/ && next; + + + $name = $1; + $seq = $2; + + if( ! defined $align{$name} ) { + $count++; + $c2name{$count} = $name; + } + $align{$name} .= $seq; + } + + # ok... now we can make the sequences + + foreach $no ( sort { $a <=> $b } keys %c2name ) { + $name = $c2name{$no}; + + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $end = length($align{$name}); + } + $seq = new Bio::LocatableSeq('-seq'=>$align{$name}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + '-type'=>'aligned', + '-accession_number' => $accession{$name}, + + ); + + $aln->add_seq($seq); + + } + +# If $end <= 0, we have either reached the end of +# file in or we have encountered some other error +# + if ($end <= 0) { undef $aln;} + + return $aln; +} + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in stockholm format ###Not yet implemented!### + Returns : 1 for success and 0 for error + Args : L object + + +=cut + +sub write_aln { + my ($self,@aln) = @_; + + $self->throw("Sorry: stockholm-format output, not yet implemented! /n"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AnalysisI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AnalysisI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,827 @@ +# $Id: AnalysisI.pm,v 1.5.2.1 2003/07/04 02:40:29 shawnh Exp $ +# +# BioPerl module for Bio::AnalysisI +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AnalysisI - An interface to any (local or remote) analysis tool + +=head1 SYNOPSIS + +This is an interface module - you do not instantiate it. +Use C module: + + use Bio::Tools::Run::Analysis; + my $tool = new Bio::Tools::Run::Analysis (@args); + +=head1 DESCRIPTION + +This interface contains all public methods for accessing and +controlling local and remote analysis tools. It is meant to be used on +the client side. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2003, Martin Senger and EMBL-EBI. +All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 SEE ALSO + +=over + +=item * + +http://industry.ebi.ac.uk/soaplab/Perl_Client.html + +=back + +=head1 APPENDIX + +This is actually the main documentation... + +If you try to call any of these methods directly on this +C object you will get a I error +message. You need to call them on a C object instead. + +=cut + + +# Let the code begin... + +package Bio::AnalysisI; +use vars qw(@ISA $Revision); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +BEGIN { + $Revision = q$Id: AnalysisI.pm,v 1.5.2.1 2003/07/04 02:40:29 shawnh Exp $; +} + +# ----------------------------------------------------------------------------- + +=head2 analysis_name + + Usage : $tool->analysis_name; + Returns : a name of this analysis + Args : none + +=cut + +sub analysis_name { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 analysis_spec + + Usage : $tool->analysis_spec; + Returns : a hash reference describing this analysis + Args : none + +The returned hash reference uses the following keys (not all of them always +present, perhaps others present as well): C, C, C, +C, C, C. + +Here is an example output: + + Analysis 'edit::seqret': + installation => EMBL-EBI + description => Reads and writes (returns) sequences + supplier => EMBOSS + version => 2.6.0 + type => edit + name => seqret + +=cut + +sub analysis_spec { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 describe + + Usage : $tool->analysis_spec; + Returns : an XML detailed description of this analysis + Args : none + +The returned XML string contains metadata describing this analysis +service. It includes also metadata returned (and easier used) by +method C, C and C. + +The DTD used for returned metadata is based on the adopted standard +(BSA specification for analysis engine): + + + + + + + + + + + + + + + + + + + + + +But the DTD may be extended by provider-specific metadata. For +example, the EBI experimental SOAP-based service on top of EMBOSS uses +DTD explained at C. + +=cut + +sub describe { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 input_spec + + Usage : $tool->input_spec; + Returns : an array reference with hashes as elements + Args : none + +The analysis input data are named, and can be also associated with a +default value, with allowed values and with few other attributes. The +names are important for feeding the service with the input data (the +inputs are given to methods C, C, and/or C +as name/value pairs). + +Here is a (slightly shortened) example of an input specification: + + $input_spec = [ + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'sequence_usa' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'sequence_direct_data' + }, + { + 'mandatory' => 'false', + 'allowed_values' => [ + 'gcg', + 'gcg8', + ... + 'raw' + ], + 'type' => 'String', + 'name' => 'sformat' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'sbegin' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'send' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'sprotein' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'snucleotide' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'sreverse' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'slower' + }, + { + 'mandatory' => 'false', + 'type' => 'String', + 'name' => 'supper' + }, + { + 'mandatory' => 'false', + 'default' => 'false', + 'type' => 'String', + 'name' => 'firstonly' + }, + { + 'mandatory' => 'false', + 'default' => 'fasta', + 'allowed_values' => [ + 'gcg', + 'gcg8', + 'embl', + ... + 'raw' + ], + 'type' => 'String', + 'name' => 'osformat' + } + ]; + +=cut + +sub input_spec { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 result_spec + + Usage : $tool->result_spec; + Returns : a hash reference with result names as keys + and result types as values + Args : none + +The analysis results are named and can be retrieved using their names +by methods C and C. + +Here is an example of the result specification (again for the service +I): + + $result_spec = { + 'outseq' => 'String', + 'report' => 'String', + 'detailed_status' => 'String' + }; + +=cut + +sub result_spec { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 create_job + + Usage : $tool->create_job ( {'sequence'=>'tatat'} ) + Returns : Bio::Tools::Run::Analysis::Job + Args : data and parameters for this execution + (in various formats) + +Create an object representing a single execution of this analysis +tool. + +Call this method if you wish to "stage the scene" - to create a job +with all input data but without actually running it. This method is +called automatically from other methods (C and C) so +usually you do not need to call it directly. + +The input data and prameters for this execution can be specified in +various ways: + +=over + +=item array reference + +The array has scalar elements of the form + + name = [[@]value] + +where C is the name of an input data or input parameter (see +method C for finding what names are recognized by this +analysis) and C is a value for this data/parameter. If C +is missing a 1 is assumed (which is convenient for the boolean +options). If C starts with C<@> it is treated as a local +filename, and its contents is used as the data/parameter value. + +=item hash reference + +The same as with the array reference but now there is no need to use +an equal sign. The hash keys are input names and hash values their +data. The values can again start with a C<@> sign indicating a local +filename. + +=item scalar + +In this case, the parameter represents a job ID obtained in some +previous invocation - such job already exists on the server side, and +we are just re-creating it here using the same job ID. + +I + +=item undef + +Finally, if the parameter is undefined, ask server to create an empty +job. The input data may be added later using C +method(s) - see scripts/papplmaker.PLS for details. + +=back + +=cut + +sub create_job { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 run + + Usage : $tool->run ( ['sequence=@my.seq', 'osformat=embl'] ) + Returns : Bio::Tools::Run::Analysis::Job, + representing started job (an execution) + Args : the same as for create_job + +Create a job and start it, but do not wait for its completion. + +=cut + +sub run { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 wait_for + + Usage : $tool->wait_for ( { 'sequence' => '@my,file' } ) + Returns : Bio::Tools::Run::Analysis::Job, + representing finished job + Args : the same as for create_job + +Create a job, start it and wait for its completion. + +Note that this is a blocking method. It returns only after the +executed job finishes, either normally or by an error. + +Usually, after this call, you ask for results of the finished job: + + $analysis->wait_for (...)->results; + +=cut + +sub wait_for { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- +# +# Bio::AnalysisI::JobI +# +# ----------------------------------------------------------------------------- + +package Bio::AnalysisI::JobI; + +=head1 Module Bio::AnalysisI::JobI + +An interface to the public methods provided by C +objects. + +The C objects represent a created, +running, or finished execution of an analysis tool. + +The factory for these objects is module C +where the following methods return an +C object: + + create_job (returning a prepared job) + run (returning a running job) + wait_for (returning a finished job) + +=cut + +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +# ----------------------------------------------------------------------------- + +=head2 id + + Usage : $job->id; + Returns : this job ID + Args : none + +Each job (an execution) is identifiable by this unique ID which can be +used later to re-create the same job (in other words: to re-connect to +the same job). It is useful in cases when a job takes long time to +finish and your client program does not want to wait for it within the +same session. + +=cut + +sub id { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 run + + Usage : $job->run + Returns : itself + Args : none + +It starts previously created job. The job already must have all input +data filled-in. This differs from the method of the same name of the +C object where the C method creates +also a new job allowing to set input data. + +=cut + +sub run { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 wait_for + + Usage : $job->wait_for + Returns : itself + Args : none + +It waits until a previously started execution of this job finishes. + +=cut + +sub wait_for { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 terminate + + Usage : $job->terminate + Returns : itself + Args : none + +Stop the currently running job (represented by this object). This is a +definitive stop, there is no way to resume it later. + +=cut + +sub terminate { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 last_event + + Usage : $job->last_event + Returns : an XML string + Args : none + +It returns a short XML document showing what happened last with this +job. This is the used DTD: + + + + + + + + + + + + + + + + + + + + + + + + + +Here is an example what is returned after a job was created and +started, but before it finishes (note that the example uses an +analysis 'showdb' which does not need any input data): + + use Bio::Tools::Run::Analysis; + print new Bio::Tools::Run::Analysis (-name => 'display::showdb') + ->run + ->last_event; + +It prints: + + + + Mar 3, 2003 5:14:46 PM (Europe/London) + + + +The same example but now after it finishes: + + use Bio::Tools::Run::Analysis; + print new Bio::Tools::Run::Analysis (-name => 'display::showdb') + ->wait_for + ->last_event; + + + + Mar 3, 2003 5:17:14 PM (Europe/London) + + + +=cut + +sub last_event { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 status + + Usage : $job->status + Returns : string describing the job status + Args : none + +It returns one of the following strings (and perhaps more if a server +implementation extended possible job states): + + CREATED + RUNNING + COMPLETED + TERMINATED_BY_REQUEST + TERMINATED_BY_ERROR + +=cut + +sub status { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 created + + Usage : $job->created (1) + Returns : time when this job was created + Args : optional + +Without any argument it returns a time of creation of this job in +seconds, counting from the beginning of the UNIX epoch +(1.1.1970). With a true argument it returns a formatted time, using +rules described in C. + +=cut + +sub created { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 started + + Usage : $job->started (1) + Returns : time when this job was started + Args : optional + +See C. + +=cut + +sub started { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 ended + + Usage : $job->ended (1) + Returns : time when this job was terminated + Args : optional + +See C. + +=cut + +sub ended { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 elapsed + + Usage : $job->elapsed + Returns : elapsed time of the execution of the given job + (in milliseconds), or 0 of job was not yet started + Args : none + +Note that some server implementations cannot count in millisecond - so +the returned time may be rounded to seconds. + +=cut + +sub elapsed { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 times + + Usage : $job->times ('formatted') + Returns : a hash refrence with all time characteristics + Args : optional + +It is a convenient method returning a hash reference with the folowing +keys: + + created + started + ended + elapsed + +See C for remarks on time formating. + +An example - both for unformatted and formatted times: + + use Data::Dumper; + use Bio::Tools::Run::Analysis; + my $rh = new Bio::Tools::Run::Analysis (-name => 'nucleic_cpg_islands::cpgplot') + ->wait_for ( { 'sequence_usa' => 'embl:hsu52852' } ) + ->times (1); + print Data::Dumper->Dump ( [$rh], ['Times']); + $rh = new Bio::Tools::Run::Analysis (-name => 'nucleic_cpg_islands::cpgplot') + ->wait_for ( { 'sequence_usa' => 'embl:AL499624' } ) + ->times; + print Data::Dumper->Dump ( [$rh], ['Times']); + + $Times = { + 'ended' => 'Mon Mar 3 17:52:06 2003', + 'started' => 'Mon Mar 3 17:52:05 2003', + 'elapsed' => '1000', + 'created' => 'Mon Mar 3 17:52:05 2003' + }; + $Times = { + 'ended' => '1046713961', + 'started' => '1046713926', + 'elapsed' => '35000', + 'created' => '1046713926' + }; + +=cut + +sub times { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 results + + Usage : $job->results (...) + Returns : one or more results created by this job + Args : various, see belou + +This is a complex method trying to make sense for all kinds of +results. Especially it tries to help to put binary results (such as +images) into local files. Generally it deals with fhe following facts: + +=over + +=item * + +Each analysis tool may produce more results. + +=item * + +Some results may contain binary data not suitable for printing into a +terminal window. + +=item * + +Some results may be split into variable number of parts (this is +mainly true for the image results that can consist of more *.png +files). + +=back + +Note also that results have names to distinguish if there are more of +them. The names can be obtained by method C. + +Here are the rules how the method works: + + Retrieving NAMED results: + ------------------------- + results ('name1', ...) => return results as they are, no storing into files + + results ( { 'name1' => 'filename', ... } ) => store into 'filename', return 'filename' + results ( 'name1=filename', ...) => ditto + + results ( { 'name1' => '-', ... } ) => send result to the STDOUT, do not return anything + results ( 'name1=-', ...) => ditto + + results ( { 'name1' => '@', ... } ) => store into file whose name is invented by + this method, perhaps using RESULT_NAME_TEMPLATE env + results ( 'name1=@', ...) => ditto + + results ( { 'name1' => '?', ... } ) => find of what type is this result and then use + {'name1'=>'@' for binary files, and a regular + return for non-binary files + results ( 'name=?', ...) => ditto + + Retrieving ALL results: + ----------------------- + results() => return all results as they are, no storing into files + + results ('@') => return all results, as if each of them given + as {'name' => '@'} (see above) + + results ('?') => return all results, as if each of them given + as {'name' => '?'} (see above) + + Misc: + ----- + * any result can be returned as a scalar value, or as an array reference + (the latter is used for results consisting of more parts, such images); + this applies regardless whether the returned result is the result itself + or a filename created for the result + + * look in the documentation of the C script for examples + (especially how to use various templates for inventing file names) + +=cut + +sub results { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 result + + Usage : $job->result (...) + Returns : the first result + Args : see 'results' + +=cut + +sub result { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 remove + + Usage : $job->remove + Returns : 1 + Args : none + +The job object is not actually removed in this time but it is marked +(setting 1 to C<_destroy_on_exit> attribute) as ready for deletion when +the client program ends (including a request to server to forget the job +mirror object on the server side). + +=cut + +sub remove { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +1; +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AnalysisParserI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AnalysisParserI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,195 @@ +#--------------------------------------------------------------- +# $Id: AnalysisParserI.pm,v 1.7 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module Bio::AnalysisParserI +# +# Cared for by Steve Chervitz +# +# Derived from Bio::SeqAnalysisParserI by Jason Stajich, Hilmar Lapp. +# +# You may distribute this module under the same terms as perl itself +#--------------------------------------------------------------- + +=head1 NAME + +Bio::AnalysisParserI - Generic analysis output parser interface + +=head1 SYNOPSIS + + # get a AnalysisParserI somehow. + # Eventually, there may be an Bio::Factory::AnalysisParserFactory. + # For now a SearchIO object, an implementation of AnalysisParserI, can be created + # directly, as in the following: + my $parser = Bio::SearchIO->new( + '-file' => 'inputfile', + '-format' => 'blast'); + + while( my $result = $parser->next_result() ) { + print "Result: ", $result->analysis_method, + ", Query: ", $result->query_name, "\n"; + + while( my $feature = $result->next_feature() ) { + print "Feature from ", $feature->start, " to ", + $feature->end, "\n"; + } + } + +=head1 DESCRIPTION + +AnalysisParserI is a interface for describing generic analysis +result parsers. This module makes no assumption about the nature of +analysis being parsed, only that zero or more result sets can be +obtained from the input source. + +This module was derived from Bio::SeqAnalysisParserI, the differences being + +=over 4 + +=item 1. next_feature() was replaced with next_result(). + +Instead of flattening a stream containing potentially multiple +analysis results into a single set of features, AnalysisParserI +segments the stream in terms of analysis result sets +(Bio::AnalysisResultI objects). Each AnalysisResultI can then be +queried for its features (if any) as well as other information +about the result + +=item 2. AnalysisParserI is a pure interface. + +It does not inherit from Bio::Root::RootI and does not provide a new() +method. Implementations are free to choose how to implement it. + +=back + +=head2 Rationale (copied from Bio::SeqAnalysisParserI) + +The concept behind this interface is to have a generic interface in sequence +annotation pipelines (as used e.g. in high-throughput automated +sequence annotation). This interface enables plug-and-play for new analysis +methods and their corresponding parsers without the necessity for modifying +the core of the annotation pipeline. In this concept the annotation pipeline +has to rely on only a list of methods for which to process the results, and a +factory from which it can obtain the corresponding parser implementing this +interface. + +=head2 TODO + +Create Bio::Factory::AnalysisParserFactoryI and +Bio::Factory::AnalysisParserFactory for interface and an implementation. +Note that this factory could return Bio::SearchIO-derived objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Steve Chervitz, Jason Stajich, Hilmar Lapp + +Email sac@bioperl.org + +Authors of Bio::SeqAnalysisParserI on which this module is based: +Email jason@bioperl.org +Email hlapp@gmx.net + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::AnalysisParserI; +use strict; +use vars qw(@ISA); + +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head2 next_result + + Title : next_result + Usage : $result = $obj->next_result(); + Function: Returns the next result available from the input, or + undef if there are no more results. + Example : + Returns : A Bio::Search::Result::ResultI implementing object, + or undef if there are no more results. + Args : none + +=cut + +sub next_result { + my ($self); + $self->throw_not_implemented; +} + + + +=head2 result_factory + + Title : result_factory + Usage : $res_fact = $obj->result_factory; (get) + : $obj->result_factory( $factory ); (set) + Function: Sets/Gets a factory object to create result objects for this AnalysisParser. + Returns : Bio::Factory::ResultFactoryI object + Args : Bio::Factory::ResultFactoryI object (when setting) + Comments: A AnalysisParserI implementation should provide a default result factory. + obtainable by the default_result_factory_class() method. + +=cut + +sub result_factory { + my $self = shift; + $self->throw_not_implemented; +} + +=head2 default_result_factory_class + + Title : default_result_factory_class + Usage : $res_factory = $obj->default_result_factory_class()->new( @args ) + Function: Returns the name of the default class to be used for creating + Bio::AnalysisResultI objects. + Example : + Returns : A string containing the name of a class that implements + the Bio::Factory::ResultFactoryI interface. + Args : none + +=cut + +sub default_result_factory_class { + my $self = shift; +# TODO: Uncomment this when Jason's SearchIO code conforms +# $self->throw_not_implemented; +} + +1; +__END__ + +NOTE: My ten-month old son Russell added the following line. +It doesn't look like it will compile so I'm putting it here: +mt6 j7qa diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AnalysisResultI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AnalysisResultI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,229 @@ +#----------------------------------------------------------------- +# $Id: AnalysisResultI.pm,v 1.5 2002/10/22 07:38:24 lapp Exp $ +# +# BioPerl module Bio::AnalysisResultI +# +# Cared for by Steve Chervitz +# +# Derived from Bio::Tools::AnalysisResult by Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AnalysisResultI - Interface for analysis result objects + +=head1 SYNOPSIS + +Bio::AnalysisResultI defines an interface that must be implemented by +a subclass. So you cannot create Bio::AnalysisResultI objects, +only objects that inherit from Bio::AnalysisResultI. + +=head1 DESCRIPTION + +The AnalysisResultI module provides an interface for modules +encapsulating the result of an analysis that was carried out with a +query sequence and an optional subject dataset. + +The notion of an analysis represented by this base class is that of a unary or +binary operator, taking either one query or a query and a subject and producing +a result. The query is e.g. a sequence, and a subject is either a sequence, +too, or a database of sequences. + +This interface defines methods to access analysis result data and does +not impose any contraints on how the analysis result data is acquired. + +Note that this module does not provide support for B an analysis. +Rather, it is positioned in the subsequent parsing step (concerned with +turning raw results into BioPerl objects). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Steve Chervitz, Hilmar Lapp + +Email sac@bioperl.org +Email hlapp@gmx.net (author of Bio::Tools::AnalysisResult on which this module is based) + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AnalysisResultI; +use strict; +use vars qw(@ISA); + +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + + +=head2 analysis_query + + Usage : $query_obj = $result->analysis_query(); + Purpose : Get a Bio::PrimarySeqI-compatible object representing the entity + on which the analysis was performed. Lacks sequence information. + Argument : n/a + Returns : A Bio::PrimarySeqI-compatible object without sequence information. + The sequence will have display_id, description, moltype, and length data. + +=cut + +#--------------------- +sub analysis_query { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 analysis_subject + + Usage : $obj = $result->analyis_subject(); + Purpose : Get the subject of the analysis against which it was + performed. For similarity searches it will probably be a database, + and for sequence feature predictions (exons, promoters, etc) it + may be a collection of models or homologous sequences that were + used, or undefined. + Returns : An object of a type the depends on the implementation + May also return undef for analyses that don\'t involve subjects. + Argument : n/a + Comments : Implementation of this method is optional. + AnalysisResultI provides a default behavior of returning undef. + +=cut + +#--------------- +sub analysis_subject { +#--------------- + my ($self) = @_; + return undef; +} + +=head2 analysis_subject_version + + Usage : $vers = $result->analyis_subject_version(); + Purpose : Get the version string of the subject of the analysis. + Returns : String or undef for analyses that don\'t involve subjects. + Argument : n/a + Comments : Implementation of this method is optional. + AnalysisResultI provides a default behavior of returning undef. + +=cut + +#--------------- +sub analysis_subject_version { +#--------------- + my ($self) = @_; + return undef; +} + + +=head2 analysis_date + + Usage : $date = $result->analysis_date(); + Purpose : Get the date on which the analysis was performed. + Returns : String + Argument : n/a + +=cut + +#--------------------- +sub analysis_date { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 analysis_method + + Usage : $meth = $result->analysis_method(); + Purpose : Get the name of the sequence analysis method that was used + to produce this result (BLASTP, FASTA, etc.). May also be the + actual name of a program. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 analysis_method_version + + Usage : $vers = $result->analysis_method_version(); + Purpose : Get the version string of the analysis program. + : (e.g., 1.4.9MP, 2.0a19MP-WashU). + Returns : String + Argument : n/a + +=cut + +#--------------------- +sub analysis_method_version { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 next_feature + + Title : next_feature + Usage : $seqfeature = $obj->next_feature(); + Function: Returns the next feature available in the analysis result, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI implementing object, or undef if there are no + more features. + Args : none + +=cut + +#--------------------- +sub next_feature { +#--------------------- + my ($self); + $self->throw_not_implemented; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AnnotatableI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AnnotatableI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +# $Id: AnnotatableI.pm,v 1.2 2002/12/31 13:09:06 birney Exp $ +# +# BioPerl module for Bio::AnnotatableI +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AnnotatableI - the base interface an annotatable object must implement + +=head1 SYNOPSIS + + use Bio::SeqIO; + # get an annotatable object somehow: for example, Bio::SeqI objects + # are annotatable + my $seqio = Bio::SeqIO->new(-fh => \*STDIN, -format => 'genbank); + while (my $seq = $seqio->next_seq()) { + # $seq is-a Bio::AnnotatableI, hence: + my $ann_coll = $seq->annotation(); + # $ann_coll is-a Bio::AnnotationCollectionI, hence: + my @all_anns = $ann_coll->get_Annotations(); + # do something with the annotation objects + } + +=head1 DESCRIPTION + +This is the base interface that all annotatable objects must implement. A good +example is Bio::Seq which is an AnnotableI object; if you are a little confused +about what this module does, start a Bio::Seq. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AnnotatableI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 annotation + + Title : annotation + Usage : $obj->annotation($newval) + Function: Get the annotation collection (see L) + for this annotatable object. + Example : + Returns : a Bio::AnnotationCollectionI implementing object, or undef + Args : on set, new value (a Bio::AnnotationCollectionI + implementing object, optional) (an implementation may not + support changing the annotation collection) + + +=cut + +sub annotation{ + shift->throw_not_implemented(); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/AnnotationFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/AnnotationFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,247 @@ +# $Id: AnnotationFactory.pm,v 1.1 2002/10/31 09:45:39 lapp Exp $ +# +# BioPerl module for Bio::Annotation::AnnotationFactory +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::AnnotationFactory - Instantiates a new Bio::AnnotationI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Annotation::AnnotationFactory; + # + my $factory = new Bio::Annotation::AnnotationFactory(-type => 'Bio::Annotation::SimpleValue'); + my $ann = $factory->create_object(-value => 'peroxisome', + -tagname => 'cellular component'); + + +=head1 DESCRIPTION + +This object will build L objects generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + + +=head1 CONTRIBUTORS + +This is mostly copy-and-paste with subsequent adaptation from +Bio::Seq::SeqFactory by Jason Stajich. Most credits should in fact go +to him. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Annotation::AnnotationFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Annotation::AnnotationFactory(); + Function: Builds a new Bio::Annotation::AnnotationFactory object + Returns : Bio::Annotation::AnnotationFactory + Args : -type => string, name of a L derived class. + The default is L. + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($type) = $self->_rearrange([qw(TYPE)], @args); + + $self->{'_loaded_types'} = {}; + $self->type($type) if $type; + + return $self; +} + + +=head2 create_object + + Title : create_object + Usage : my $seq = $factory->create_object(); + Function: Instantiates new Bio::AnnotationI (or one of its child classes) + + This object allows us to genericize the instantiation of + cluster objects. + + Returns : L compliant object + The return type is configurable using new(-type =>"..."). + Args : initialization parameters specific to the type of annotation + object we want. + +=cut + +sub create_object { + my ($self,@args) = @_; + + my $type = $self->type(); + if(! $type) { + # we need to guess this + $type = $self->_guess_type(@args); + if(! $type) { + $self->throw("No annotation type set and unable to guess."); + } + # load dynamically if it hasn't been loaded yet + if(! $self->{'_loaded_types'}->{$type}) { + eval { + $self->_load_module($type); + $self->{'_loaded_types'}->{$type} = 1; + }; + if($@) { + $self->throw("Bio::AnnotationI implementation $type ". + "failed to load: ".$@); + } + } + } + return $type->new(-verbose => $self->verbose, @args); +} + +=head2 type + + Title : type + Usage : $obj->type($newval) + Function: Get/set the type of L object to be created. + + This may be changed at any time during the lifetime of this + factory. + + Returns : value of type + Args : newvalue (optional) + + +=cut + +sub type{ + my $self = shift; + + if(@_) { + my $type = shift; + if($type && (! $self->{'_loaded_types'}->{$type})) { + eval { + $self->_load_module($type); + }; + if( $@ ) { + $self->throw("Annotation class '$type' failed to load: ". + $@); + } + my $a = bless {},$type; + if( ! $a->isa('Bio::AnnotationI') ) { + $self->throw("'$type' does not implement Bio::AnnotationI. ". + "Too bad."); + } + $self->{'_loaded_types'}->{$type} = 1; + } + return $self->{'type'} = $type; + } + return $self->{'type'}; +} + +=head2 _guess_type + + Title : _guess_type + Usage : + Function: Guesses the right type of L implementation + based on initialization parameters for the prospective + object. + Example : + Returns : the type (a string, the module name) + Args : initialization parameters to be passed to the prospective + cluster object + + +=cut + +sub _guess_type{ + my ($self,@args) = @_; + my $type; + + # we can only guess from a certain number of arguments + my ($val,$db,$text,$name,$authors) = + $self->_rearrange([qw(VALUE + DATABASE + TEXT + NAME + AUTHORS + )], @args); + SWITCH: { + $val && do { $type = "SimpleValue"; last SWITCH; }; + $authors && do { $type = "Reference"; last SWITCH; }; + $db && do { $type = "DBLink"; last SWITCH; }; + $text && do { $type = "Comment"; last SWITCH; }; + $name && do { $type = "OntologyTerm"; last SWITCH; }; + # what else could we look for? + } + $type = "Bio::Annotation::".$type; + + return $type; +} + +##################################################################### +# aliases for naming consistency or other reasons # +##################################################################### + +*create = \&create_object; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/Collection.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/Collection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,687 @@ +# $Id: Collection.pm,v 1.16 2002/11/22 22:48:25 birney Exp $ + +# +# BioPerl module for Bio::Annotation::Collection.pm +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::Collection - Default Perl implementation of AnnotationCollectionI + +=head1 SYNOPSIS + + # get an AnnotationCollectionI somehow, eg + + $ac = $seq->annotation(); + + foreach $key ( $ac->get_all_annotation_keys() ) { + @values = $ac->get_Annotations($key); + foreach $value ( @values ) { + # value is an Bio::AnnotationI, and defines a "as_text" method + print "Annotation ",$key," stringified value ",$value->as_text,"\n"; + + # also defined hash_tree method, which allows data orientated + # access into this object + $hash = $value->hash_tree(); + } + } + +=head1 DESCRIPTION + +Bioperl implementation for Bio::AnnotationCollecitonI + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Annotation::Collection; + +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::AnnotationCollectionI; +use Bio::AnnotationI; +use Bio::Root::Root; +use Bio::Annotation::TypeManager; +use Bio::Annotation::SimpleValue; + + +@ISA = qw(Bio::Root::Root Bio::AnnotationCollectionI Bio::AnnotationI); + + +=head2 new + + Title : new + Usage : $coll = Bio::Annotation::Collection->new() + Function: Makes a new Annotation::Collection object. + Returns : Bio::Annotation::Collection + Args : none + +=cut + +sub new{ + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_annotation'} = {}; + $self->_typemap(Bio::Annotation::TypeManager->new()); + + return $self; +} + + +=head1 L implementing methods + +=cut + +=head2 get_all_annotation_keys + + Title : get_all_annotation_keys + Usage : $ac->get_all_annotation_keys() + Function: gives back a list of annotation keys, which are simple text strings + Returns : list of strings + Args : none + +=cut + +sub get_all_annotation_keys{ + my ($self) = @_; + return keys %{$self->{'_annotation'}}; +} + +=head2 get_Annotations + + Title : get_Annotations + Usage : my @annotations = $collection->get_Annotations('key') + Function: Retrieves all the Bio::AnnotationI objects for one or more + specific key(s). + + If no key is given, returns all annotation objects. + + The returned objects will have their tagname() attribute set to + the key under which they were attached, unless the tagname was + already set. + + Returns : list of Bio::AnnotationI - empty if no objects stored for a key + Args : keys (list of strings) for annotations (optional) + +=cut + +sub get_Annotations{ + my ($self,@keys) = @_; + + my @anns = (); + @keys = $self->get_all_annotation_keys() unless @keys; + foreach my $key (@keys) { + if(exists($self->{'_annotation'}->{$key})) { + push(@anns, + map { + $_->tagname($key) if ! $_->tagname(); $_; + } @{$self->{'_annotation'}->{$key}}); + } + } + return @anns; +} + +=head2 get_all_Annotations + + Title : get_all_Annotations + Usage : + Function: Similar to get_Annotations, but traverses and flattens nested + annotation collections. This means that collections in the + tree will be replaced by their components. + + Keys will not be passed on to nested collections. I.e., if the + tag name of a nested collection matches the key, it will be + flattened in its entirety. + + Hence, for un-nested annotation collections this will be identical + to get_Annotations. + Example : + Returns : an array of L compliant objects + Args : keys (list of strings) for annotations (optional) + + +=cut + +sub get_all_Annotations{ + my ($self,@keys) = @_; + + return map { + $_->isa("Bio::AnnotationCollectionI") ? + $_->get_all_Annotations() : $_; + } $self->get_Annotations(@keys); +} + +=head2 get_num_of_annotations + + Title : get_num_of_annotations + Usage : my $count = $collection->get_num_of_annotations() + Function: Returns the count of all annotations stored in this collection + Returns : integer + Args : none + + +=cut + +sub get_num_of_annotations{ + my ($self) = @_; + my $count = 0; + map { $count += scalar @$_ } values %{$self->{'_annotation'}}; + return $count; +} + +=head1 Implementation specific functions - mainly for adding + +=cut + +=head2 add_Annotation + + Title : add_Annotation + Usage : $self->add_Annotation('reference',$object); + $self->add_Annotation($object,'Bio::MyInterface::DiseaseI'); + $self->add_Annotation($object); + $self->add_Annotation('disease',$object,'Bio::MyInterface::DiseaseI'); + Function: Adds an annotation for a specific key. + + If the key is omitted, the object to be added must provide a value + via its tagname(). + + If the archetype is provided, this and future objects added under + that tag have to comply with the archetype and will be rejected + otherwise. + + Returns : none + Args : annotation key ('disease', 'dblink', ...) + object to store (must be Bio::AnnotationI compliant) + [optional] object archetype to map future storage of object + of these types to + +=cut + +sub add_Annotation{ + my ($self,$key,$object,$archetype) = @_; + + # if there's no key we use the tagname() as key + if(ref($key) && $key->isa("Bio::AnnotationI") && + (! ($object && ref($object)))) { + $archetype = $object if $object; + $object = $key; + $key = $object->tagname(); + $key = $key->name() if $key && ref($key); # OntologyTermI + $self->throw("Annotation object must have a tagname if key omitted") + unless $key; + } + + if( !defined $object ) { + $self->throw("Must have at least key and object in add_Annotation"); + } + + if( !ref $object ) { + $self->throw("Must add an object. Use Bio::Annotation::{Comment,SimpleValue,OntologyTerm} for simple text additions"); + } + + if( !$object->isa("Bio::AnnotationI") ) { + $self->throw("object must be AnnotationI compliant, otherwise we wont add it!"); + } + + # ok, now we are ready! If we don't have an archetype, set it + # from the type of the object + + if( !defined $archetype ) { + $archetype = ref $object; + } + + # check typemap, storing if needed. + my $stored_map = $self->_typemap->type_for_key($key); + + if( defined $stored_map ) { + # check validity, irregardless of archetype. A little cheeky + # this means isa stuff is executed correctly + + if( !$self->_typemap()->is_valid($key,$object) ) { + $self->throw("Object $object was not valid with key $key. If you were adding new keys in, perhaps you want to make use of the archetype method to allow registration to a more basic type"); + } + } else { + $self->_typemap->_add_type_map($key,$archetype); + } + + # we are ok to store + + if( !defined $self->{'_annotation'}->{$key} ) { + $self->{'_annotation'}->{$key} = []; + } + + push(@{$self->{'_annotation'}->{$key}},$object); + + return 1; +} + +=head2 remove_Annotations + + Title : remove_Annotations + Usage : + Function: Remove the annotations for the specified key from this collection. + Example : + Returns : an array Bio::AnnotationI compliant objects which were stored + under the given key(s) + Args : the key(s) (tag name(s), one or more strings) for which to + remove annotations (optional; if none given, flushes all + annotations) + + +=cut + +sub remove_Annotations{ + my ($self, @keys) = @_; + + @keys = $self->get_all_annotation_keys() unless @keys; + my @anns = $self->get_Annotations(@keys); + # flush + foreach (@keys) { + delete $self->{'_annotation'}->{$_}; + } + return @anns; +} + +=head2 flatten_Annotations + + Title : flatten_Annotations + Usage : + Function: Flattens part or all of the annotations in this collection. + + This is a convenience method for getting the flattened + annotation for the given keys, removing the annotation for + those keys, and adding back the flattened array. + + This should not change anything for un-nested collections. + Example : + Returns : an array Bio::AnnotationI compliant objects which were stored + under the given key(s) + Args : list of keys (strings) the annotation for which to flatten, + defaults to all keys if not given + + +=cut + +sub flatten_Annotations{ + my ($self,@keys) = @_; + + my @anns = $self->get_all_Annotations(@keys); + my @origanns = $self->remove_Annotations(@keys); + foreach (@anns) { + $self->add_Annotation($_); + } + return @origanns; +} + +=head1 Bio::AnnotationI methods implementations + + This is to allow nested annotation: you can a collection as an + annotation object to an annotation collection. + +=cut + +=head2 as_text + + Title : as_text + Usage : + Function: See L + Example : + Returns : a string + Args : none + + +=cut + +sub as_text{ + my $self = shift; + + my $txt = "Collection consisting of "; + my @texts = (); + foreach my $ann ($self->get_Annotations()) { + push(@texts, $ann->as_text()); + } + if(@texts) { + $txt .= join(", ", map { '['.$_.']'; } @texts); + } else { + $txt .= "no elements"; + } + return $txt; +} + +=head2 hash_tree + + Title : hash_tree + Usage : + Function: See L + Example : + Returns : a hash reference + Args : none + + +=cut + +sub hash_tree{ + my $self = shift; + my $tree = {}; + + foreach my $key ($self->get_all_annotation_keys()) { + # all contained objects will support hash_tree() + # (they are AnnotationIs) + $tree->{$key} = [$self->get_Annotations($key)]; + } + return $tree; +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to + provide a tag to Bio::AnnotationCollectionI when adding + this object. When obtaining an AnnotationI object from the + collection, the collection will set the value to the tag + under which it was stored unless the object has a tag + stored already. + + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my $self = shift; + + return $self->{'tagname'} = shift if @_; + return $self->{'tagname'}; +} + + +=head1 Backward compatible functions + +Functions put in for backward compatibility with old +Bio::Annotation.pm stuff + +=cut + +=head2 description + + Title : description + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub description{ + my ($self,$value) = @_; + + $self->deprecated("Using old style annotation call on new Annotation::Collection object"); + + if( defined $value ) { + my $val = Bio::Annotation::SimpleValue->new(); + $val->value($value); + $self->add_Annotation('description',$val); + } + + my ($desc) = $self->get_Annotations('description'); + + # If no description tag exists, do not attempt to call value on undef: + return $desc ? $desc->value : undef; +} + + +=head2 add_gene_name + + Title : add_gene_name + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub add_gene_name{ + my ($self,$value) = @_; + + $self->deprecated("Old style add_gene_name called on new style Annotation::Collection"); + + my $val = Bio::Annotation::SimpleValue->new(); + $val->value($value); + $self->add_Annotation('gene_name',$val); +} + +=head2 each_gene_name + + Title : each_gene_name + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub each_gene_name{ + my ($self) = @_; + + $self->deprecated("Old style each_gene_name called on new style Annotation::Collection"); + + my @out; + my @gene = $self->get_Annotations('gene_name'); + + foreach my $g ( @gene ) { + push(@out,$g->value); + } + + return @out; +} + +=head2 add_Reference + + Title : add_Reference + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub add_Reference{ + my ($self, @values) = @_; + + $self->deprecated("add_Reference (old style Annotation) on new style Annotation::Collection"); + + # Allow multiple (or no) references to be passed, as per old method + foreach my $value (@values) { + $self->add_Annotation('reference',$value); + } +} + +=head2 each_Reference + + Title : each_Reference + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub each_Reference{ + my ($self) = @_; + + $self->deprecated("each_Reference (old style Annotation) on new style Annotation::Collection"); + + return $self->get_Annotations('reference'); +} + + +=head2 add_Comment + + Title : add_Comment + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub add_Comment{ + my ($self,$value) = @_; + + $self->deprecated("add_Comment (old style Annotation) on new style Annotation::Collection"); + + $self->add_Annotation('comment',$value); + +} + +=head2 each_Comment + + Title : each_Comment + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub each_Comment{ + my ($self) = @_; + + $self->deprecated("each_Comment (old style Annotation) on new style Annotation::Collection"); + + return $self->get_Annotations('comment'); +} + + + +=head2 add_DBLink + + Title : add_DBLink + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub add_DBLink{ + my ($self,$value) = @_; + + $self->deprecated("add_DBLink (old style Annotation) on new style Annotation::Collection"); + + $self->add_Annotation('dblink',$value); + +} + +=head2 each_DBLink + + Title : each_DBLink + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub each_DBLink{ + my ($self) = @_; + + $self->deprecated("each_DBLink (old style Annotation) on new style Annotation::Collection - use get_Annotations('dblink')"); + + return $self->get_Annotations('dblink'); +} + + + +=head1 Implementation management functions + +=cut + +=head2 _typemap + + Title : _typemap + Usage : $obj->_typemap($newval) + Function: + Example : + Returns : value of _typemap + Args : newvalue (optional) + + +=cut + +sub _typemap{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_typemap'} = $value; + } + return $self->{'_typemap'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/Comment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/Comment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,176 @@ +# $Id: Comment.pm,v 1.8 2002/09/25 18:11:33 lapp Exp $ +# +# BioPerl module for Bio::Annotation::Comment +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::Comment - A comment object, holding text + +=head1 SYNOPSIS + + + $comment = Bio::Annotation::Comment->new(); + $comment->text("This is the text of this comment"); + $annotation->add_Annotation('comment', $comment); + + +=head1 DESCRIPTION + +A holder for comments in annotations, just plain text. This is a very simple +object, and justifiably so. + +=head1 CONTACT + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Annotation::Comment; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::AnnotationI; + +@ISA = qw(Bio::Root::Root Bio::AnnotationI); + +=head2 new + + Title : new + Usage : $comment = Bio::Annotation::Comment->new( '-text' => 'some text for this comment'); + Function: This returns a new comment object, optionally with + text filed + Example : + Returns : a Bio::Annotation::Comment object + Args : a hash with -text optionally set + + +=cut + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($text,$tag) = $self->_rearrange([qw(TEXT TAGNAME)], @args); + + defined $text && $self->text($text); + defined $tag && $self->tagname($tag); + + return $self; +} + +=head1 AnnotationI implementing functions + +=cut + +=head2 as_text + + Title : as_text + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub as_text{ + my ($self) = @_; + + return "Comment: ".$self->text; +} + +=head2 hash_tree + + Title : hash_tree + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub hash_tree{ + my ($self) = @_; + + my $h = {}; + $h->{'text'} = $self->text; +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to provide + a tag to Bio::AnnotationCollectionI when adding this object. When + obtaining an AnnotationI object from the collection, the collection + will set the value to the tag under which it was stored unless the + object has a tag stored already. + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'tagname'} = $value; + } + return $self->{'tagname'}; +} + +=head1 Specific accessors for Comments + +=cut + + +=head2 text + + Title : text + Usage : $value = $self->text($newval) + Function: get/set for the text field. A comment object + just holds a single string which is accessible through + this method + Example : + Returns : value of text + Args : newvalue (optional) + + +=cut + +sub text{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'text'} = $value; + } + return $self->{'text'}; + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/DBLink.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/DBLink.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,352 @@ +# $Id: DBLink.pm,v 1.12 2002/10/23 18:07:49 lapp Exp $ +# +# BioPerl module for Bio::Annotation::Link +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::DBLink - DESCRIPTION of Object + +=head1 SYNOPSIS + + $link1 = new Bio::Annotation::DBLink(-database => 'TSC', + -primary_id => 'TSC0000030' + ); + + #or + + $link2 = new Bio::Annotation::DBLink(); + $link2->database('dbSNP'); + $link2->primary_id('2367'); + + # DBLink is-a Bio::AnnotationI object, can be added to annotation + # collections, e.g. the one on features or seqs + $feat->annotation->add_Annotation('dblink', $link2); + + +=head1 DESCRIPTION + +Provides an object which represents a link from one object to something +in another database without prescribing what is in the other database + +=head1 AUTHOR - Ewan Birney + +Ewan Birney - birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Annotation::DBLink; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::AnnotationI; +use Bio::IdentifiableI; + +@ISA = qw(Bio::Root::Root Bio::AnnotationI Bio::IdentifiableI); + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($database, $primary_id, $optional_id, $comment, $tag, $ns, $auth, $v) = + $self->_rearrange([qw(DATABASE + PRIMARY_ID + OPTIONAL_ID + COMMENT + TAGNAME + NAMESPACE + AUTHORITY + VERSION + )], @args); + + $database && $self->database($database); + $primary_id && $self->primary_id($primary_id); + $optional_id && $self->optional_id($optional_id); + $comment && $self->comment($comment); + $tag && $self->tagname($tag); + # Bio::IdentifiableI parameters: + $ns && $self->namespace($ns); # this will override $database + $auth && $self->authority($auth); + defined($v) && $self->version($v); + + return $self; +} + +=head1 AnnotationI implementing functions + +=cut + + +=head2 as_text + + Title : as_text + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub as_text{ + my ($self) = @_; + + return "Direct database link to ".$self->primary_id." in database ".$self->database; +} + +=head2 hash_tree + + Title : hash_tree + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub hash_tree{ + my ($self) = @_; + + my $h = {}; + $h->{'database'} = $self->database; + $h->{'primary_id'} = $self->primary_id; + if( defined $self->optional_id ) { + $h->{'optional_id'} = $self->optional_id; + } + if( defined $self->comment ) { + # we know that comments have hash_tree methods + $h->{'comment'} = $self->comment; + } + + return $h; +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to + provide a tag to Bio::AnnotationCollectionI when adding + this object. When obtaining an AnnotationI object from the + collection, the collection will set the value to the tag + under which it was stored unless the object has a tag + stored already. + + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'tagname'} = $value; + } + return $self->{'tagname'}; +} + +=head1 Specific accessors for DBLinks + +=cut + +=head2 database + + Title : database + Usage : $self->database($newval) + Function: set/get on the database string. Databases are just + a string here which can then be interpretted elsewhere + Example : + Returns : value of database + Args : newvalue (optional) + +=cut + +sub database{ + my ($self,$value) = @_; + + if( defined $value) { + $self->{'database'} = $value; + } + return $self->{'database'}; + +} + +=head2 primary_id + + Title : primary_id + Usage : $self->primary_id($newval) + Function: set/get on the primary id (a string) + The primary id is the main identifier used for this object in + the database. Good examples would be accession numbers. The id + is meant to be the main, stable identifier for this object + Example : + Returns : value of primary_id + Args : newvalue (optional) + +=cut + +sub primary_id{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'primary_id'} = $value; + } + return $self->{'primary_id'}; + +} + +=head2 optional_id + + Title : optional_id + Usage : $self->optional_id($newval) + Function: get/set for the optional_id (a string) + + optional id is a slot for people to use as they wish. The + main issue is that some databases do not have a clean + single string identifier scheme. It is hoped that the + primary_id can behave like a reasonably sane "single string + identifier" of objects, and people can use/abuse optional + ids to their heart's content to provide precise mappings. + + Example : + Returns : value of optional_id + Args : newvalue (optional) + +=cut + +#' + +sub optional_id{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'optional_id'} = $value; + } + return $self->{'optional_id'}; + +} + +=head2 comment + + Title : comment + Usage : $self->comment($newval) + Function: get/set of comments (comment object) + Sets or gets comments of this dblink, which is sometimes relevant + Example : + Returns : value of comment (Bio::Annotation::Comment) + Args : newvalue (optional) + +=cut + +sub comment { + my ($self,$value) = @_; + if( defined $value) { + $self->{'comment'} = $value; + } + return $self->{'comment'}; +} + +=head1 Methods for Bio::IdentifiableI compliance + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + This is aliased to primary_id(). + Returns : A scalar + + +=cut + +sub object_id { + return shift->primary_id(@_); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Returns : A number + +=cut + +sub version{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_version'} = $value; + } + return $self->{'_version'}; +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + +=cut + +sub authority { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'authority'} = $value; + } + return $obj->{'authority'}; +} + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + For DBLink this is the same as database(). + Returns : A scalar + + +=cut + +sub namespace{ + return shift->database(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/OntologyTerm.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/OntologyTerm.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,490 @@ +# $Id: OntologyTerm.pm,v 1.4.2.2 2003/04/04 15:53:20 lapp Exp $ +# +# BioPerl module for Bio::Annotation::OntologyTerm +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::OntologyTerm - An ontology term adapted to AnnotationI + +=head1 SYNOPSIS + + use Bio::Annotation::OntologyTerm; + use Bio::Annotation::Collection; + use Bio::Ontology::Term; + + my $coll = new Bio::Annotation::Collection; + + # this also implements a tag/value pair, where tag _and_ value are treated + # as ontology terms + my $annterm = new Bio::Annotation::OntologyTerm(-label => 'ABC1', + -tagname => 'Gene Name'); + # ontology terms can be added directly - they implicitly have a tag + $coll->add_Annotation($annterm); + + # implementation is by composition - you can get/set the term object + # e.g. + my $term = $annterm->term(); # term is-a Bio::Ontology::TermI + print "ontology term ",$term->name()," (ID ",$term->identifier(), + "), ontology ",$term->ontology()->name(),"\n"; + $term = Bio::Ontology::Term->new(-name => 'ABC2', -ontology => 'Gene Name'); + $annterm->term($term); + +=head1 DESCRIPTION + +Ontology term annotation object + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email bioperl-l@bio.perl.org +Email hlapp at gmx.net + + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Annotation::OntologyTerm; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::AnnotationI; +use Bio::Ontology::TermI; +use Bio::Ontology::Term; +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root Bio::AnnotationI Bio::Ontology::TermI); + +=head2 new + + Title : new + Usage : my $sv = new Bio::Annotation::OntologyTerm; + Function: Instantiate a new OntologyTerm object + Returns : Bio::Annotation::OntologyTerm object + Args : -term => $term to initialize the term data field [optional] + Most named arguments that Bio::Ontology::Term accepts will work + here too. -label is a synonym for -name, -tagname is a synonym for + -ontology. + +=cut + +sub new{ + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($term,$name,$label,$identifier,$definition,$ont,$tag) = + $self->_rearrange([qw(TERM + NAME + LABEL + IDENTIFIER + DEFINITION + ONTOLOGY + TAGNAME)], + @args); + if($term) { + $self->term($term); + } else { + $self->name($name || $label) if $name || $label; + $self->identifier($identifier) if $identifier; + $self->definition($definition) if $definition; + } + $self->ontology($ont || $tag) if $ont || $tag; + + return $self; +} + + +=head1 AnnotationI implementing functions + +=cut + +=head2 as_text + + Title : as_text + Usage : my $text = $obj->as_text + Function: return the string "Name: $v" where $v is the name of the term + Returns : string + Args : none + + +=cut + +sub as_text{ + my ($self) = @_; + + return $self->tagname()."|".$self->name()."|".$self->identifier(); +} + +=head2 hash_tree + + Title : hash_tree + Usage : my $hashtree = $value->hash_tree + Function: For supporting the AnnotationI interface just returns the value + as a hashref with the key 'value' pointing to the value + Returns : hashrf + Args : none + + +=cut + +sub hash_tree{ + my ($self) = @_; + + my $h = {}; + $h->{'name'} = $self->name(); + $h->{'identifier'} = $self->identifier(); + $h->{'definition'} = $self->definition(); + $h->{'synonyms'} = [$self->each_synonym()]; +} + + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to provide + a tag to AnnotationCollection when adding this object. + + This is aliased to ontology() here. + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my $self = shift; + + return $self->ontology(@_) if @_; + # if in get mode we need to get the name from the ontology + my $ont = $self->ontology(); + return ref($ont) ? $ont->name() : $ont; +} + +=head1 Methods for Bio::Ontology::TermI compliance + +=cut + +=head2 term + + Title : term + Usage : $obj->term($newval) + Function: Get/set the Bio::Ontology::TermI implementing object. + + We implement TermI by composition, and this method sets/gets the + object we delegate to. + Example : + Returns : value of term (a Bio::Ontology::TermI compliant object) + Args : new value (a Bio::Ontology::TermI compliant object, optional) + + +=cut + +sub term{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'term'} = $value; + } + if(! exists($self->{'term'})) { + $self->{'term'} = Bio::Ontology::Term->new(); + } + return $self->{'term'}; +} + +=head2 identifier + + Title : identifier + Usage : $term->identifier( "0003947" ); + or + print $term->identifier(); + Function: Set/get for the identifier of this Term. + Returns : The identifier [scalar]. + Args : The identifier [scalar] (optional). + +=cut + +sub identifier { + return shift->term()->identifier(@_); +} # identifier + +=head2 name + + Title : name + Usage : $term->name( "N-acetylgalactosaminyltransferase" ); + or + print $term->name(); + Function: Set/get for the name of this Term. + Returns : The name [scalar]. + Args : The name [scalar] (optional). + +=cut + +sub name { + return shift->term()->name(@_); +} # name + + +=head2 definition + + Title : definition + Usage : $term->definition( "Catalysis of ..." ); + or + print $term->definition(); + Function: Set/get for the definition of this Term. + Returns : The definition [scalar]. + Args : The definition [scalar] (optional). + +=cut + +sub definition { + return shift->term()->definition(@_); +} # definition + +=head2 ontology + + Title : ontology + Usage : $term->ontology( $top ); + or + $top = $term->ontology(); + Function: Set/get for a relationship between this Term and + another Term (e.g. the top level of the ontology). + Returns : The ontology of this Term [TermI]. + Args : The ontology of this Term [TermI or scalar -- which + becomes the name of the catagory term] (optional). + +=cut + +sub ontology { + return shift->term()->ontology(@_); +} + +=head2 is_obsolete + + Title : is_obsolete + Usage : $term->is_obsolete( 1 ); + or + if ( $term->is_obsolete() ) + Function: Set/get for the obsoleteness of this Term. + Returns : the obsoleteness [0 or 1]. + Args : the obsoleteness [0 or 1] (optional). + +=cut + +sub is_obsolete { + return shift->term()->is_obsolete(@_); +} # is_obsolete + +=head2 comment + + Title : comment + Usage : $term->comment( "Consider the term ..." ); + or + print $term->comment(); + Function: Set/get for an arbitrary comment about this Term. + Returns : A comment. + Args : A comment (optional). + +=cut + +sub comment { + return shift->term()->comment(@_); +} # comment + +=head2 get_synonyms + + Title : get_synonyms() + Usage : @aliases = $term->get_synonyms(); + Function: Returns a list of aliases of this Term. + Returns : A list of aliases [array of [scalar]]. + Args : + +=cut + +sub get_synonyms { + return shift->term()->get_synonyms(@_); +} # get_synonyms + +=head2 add_synonym + + Title : add_synonym + Usage : $term->add_synonym( @asynonyms ); + or + $term->add_synonym( $synonym ); + Function: Pushes one or more synonyms into the list of synonyms. + Returns : + Args : One synonym [scalar] or a list of synonyms [array of [scalar]]. + +=cut + +sub add_synonym { + return shift->term()->add_synonym(@_); +} # add_synonym + + +=head2 remove_synonyms + + Title : remove_synonyms() + Usage : $term->remove_synonyms(); + Function: Deletes (and returns) the synonyms of this Term. + Returns : A list of synonyms [array of [scalar]]. + Args : + +=cut + +sub remove_synonyms { + return shift->term()->remove_synonyms(@_); +} # remove_synonyms + +=head2 get_dblinks + + Title : get_dblinks() + Usage : @ds = $term->get_dblinks(); + Function: Returns a list of each dblinks of this GO term. + Returns : A list of dblinks [array of [scalars]]. + Args : + +=cut + +sub get_dblinks { + return shift->term->get_dblinks(@_); +} # get_dblinks + + +=head2 add_dblink + + Title : add_dblink + Usage : $term->add_dblink( @dbls ); + or + $term->add_dblink( $dbl ); + Function: Pushes one or more dblinks + into the list of dblinks. + Returns : + Args : One dblink [scalar] or a list of + dblinks [array of [scalars]]. + +=cut + +sub add_dblink { + return shift->term->add_dblink(@_); +} # add_dblink + + +=head2 remove_dblinks + + Title : remove_dblinks() + Usage : $term->remove_dblinks(); + Function: Deletes (and returns) the definition references of this GO term. + Returns : A list of definition references [array of [scalars]]. + Args : + +=cut + +sub remove_dblinks { + return shift->term->remove_dblinks(@_); +} # remove_dblinks + +=head2 get_secondary_ids + + Title : get_secondary_ids + Usage : @ids = $term->get_secondary_ids(); + Function: Returns a list of secondary identifiers of this Term. + + Secondary identifiers mostly originate from merging terms, + or possibly also from splitting terms. + + Returns : A list of secondary identifiers [array of [scalar]] + Args : + +=cut + +sub get_secondary_ids { + return shift->term->get_secondary_ids(@_); +} # get_secondary_ids + + +=head2 add_secondary_id + + Title : add_secondary_id + Usage : $term->add_secondary_id( @ids ); + or + $term->add_secondary_id( $id ); + Function: Adds one or more secondary identifiers to this term. + Returns : + Args : One or more secondary identifiers [scalars] + +=cut + +sub add_secondary_id { + return shift->term->add_secondary_id(@_); +} # add_secondary_id + + +=head2 remove_secondary_ids + + Title : remove_secondary_ids + Usage : $term->remove_secondary_ids(); + Function: Deletes (and returns) the secondary identifiers of this Term. + Returns : The previous list of secondary identifiers [array of [scalars]] + Args : + +=cut + +sub remove_secondary_ids { + return shift->term->remove_secondary_ids(@_); +} # remove_secondary_ids + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/Reference.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/Reference.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,477 @@ +# $Id: Reference.pm,v 1.18 2002/09/25 18:11:33 lapp Exp $ +# +# BioPerl module for Bio::Annotation::Reference +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::Reference - Specialised DBLink object for Literature References + +=head1 SYNOPSIS + + $reg = Bio::Annotation::Reference->new( -title => 'title line', + -location => 'location line', + -authors => 'author line', + -medline => 998122 ); + +=head1 DESCRIPTION + +Object which presents a literature reference. This is considered to be +a specialised form of database link. The additional methods provided +are all set/get methods to store strings commonly associated with +references, in particular title, location (ie, journal page) and +authors line. + +There is no attempt to do anything more than store these things as +strings for processing elsewhere. This is mainly because parsing these +things suck and generally are specific to the specific format one is +using. To provide an easy route to go format --E object --E format +without losing data, we keep them as strings. Feel free to post the +list for a better solution, but in general this gets very messy very +fast... + +=head1 CONTACT + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Annotation::Reference; +use vars qw(@ISA); +use strict; + +use Bio::Annotation::DBLink; +use Bio::AnnotationI; + +@ISA = qw(Bio::Annotation::DBLink); + +=head2 new + + Title : new + Usage : $ref = Bio::Annotation::Reference->new( -title => 'title line', + -authors => 'author line', + -location => 'location line', + -medline => 9988812); + Function: + Example : + Returns : a new Bio::Annotation::Reference object + Args : a hash with optional title, authors, location, medline, start and end + attributes + + +=cut + +sub new{ + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($start,$end,$authors,$location,$title,$medline,$tag) = + $self->_rearrange([qw(START + END + AUTHORS + LOCATION + TITLE + MEDLINE + TAGNAME + )],@args); + + defined $start && $self->start($start); + defined $end && $self->end($end); + defined $authors && $self->authors($authors); + defined $location && $self->location($location); + defined $title && $self->title($title); + defined $medline && $self->medline($medline); + defined $tag && $self->tagname($tag); + + return $self; +} + + +=head1 AnnotationI implementing functions + +=cut + +=head2 as_text + + Title : as_text + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub as_text{ + my ($self) = @_; + + # this could get out of hand! + return "Reference: ".$self->title; +} + + +=head2 hash_tree + + Title : hash_tree + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub hash_tree{ + my ($self) = @_; + + my $h = {}; + $h->{'title'} = $self->title; + $h->{'authors'} = $self->authors; + $h->{'location'} = $self->location; + if( defined $self->start ) { + $h->{'start'} = $self->start; + } + if( defined $self->end ) { + $h->{'end'} = $self->end; + } + $h->{'medline'} = $self->medline; + + return $h; +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to provide + a tag to Bio::AnnotationCollectionI when adding this object. When + obtaining an AnnotationI object from the collection, the collection + will set the value to the tag under which it was stored unless the + object has a tag stored already. + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'tagname'} = $value; + } + return $self->{'tagname'}; +} + + +=head1 Specific accessors for References + +=cut + + +=head2 start + + Title : start + Usage : $self->start($newval) + Function: Gives the reference start base + Example : + Returns : value of start + Args : newvalue (optional) + + +=cut + +sub start { + my ($self,$value) = @_; + if( defined $value) { + $self->{'start'} = $value; + } + return $self->{'start'}; + +} + +=head2 end + + Title : end + Usage : $self->end($newval) + Function: Gives the reference end base + Example : + Returns : value of end + Args : newvalue (optional) + + +=cut + +sub end { + my ($self,$value) = @_; + if( defined $value) { + $self->{'end'} = $value; + } + return $self->{'end'}; +} + +=head2 rp + + Title : rp + Usage : $self->rp($newval) + Function: Gives the RP line. No attempt is made to parse this line. + Example : + Returns : value of rp + Args : newvalue (optional) + + +=cut + +sub rp{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'rp'} = $value; + } + return $self->{'rp'}; + +} + +=head2 authors + + Title : authors + Usage : $self->authors($newval) + Function: Gives the author line. No attempt is made to parse the author line + Example : + Returns : value of authors + Args : newvalue (optional) + + +=cut + +sub authors{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'authors'} = $value; + } + return $self->{'authors'}; + +} + +=head2 location + + Title : location + Usage : $self->location($newval) + Function: Gives the location line. No attempt is made to parse the location line + Example : + Returns : value of location + Args : newvalue (optional) + + +=cut + +sub location{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'location'} = $value; + } + return $self->{'location'}; + +} + +=head2 title + + Title : title + Usage : $self->title($newval) + Function: Gives the title line (if exists) + Example : + Returns : value of title + Args : newvalue (optional) + + +=cut + +sub title{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'title'} = $value; + } + return $self->{'title'}; + +} + +=head2 medline + + Title : medline + Usage : $self->medline($newval) + Function: Gives the medline number + Example : + Returns : value of medline + Args : newvalue (optional) + + +=cut + +sub medline{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'medline'} = $value; + } + return $self->{'medline'}; +} + +=head2 pubmed + + Title : pubmed + Usage : $refobj->pubmed($newval) + Function: Get/Set the PubMed number, if it is different from the MedLine + number. + Example : + Returns : value of medline + Args : newvalue (optional) + + +=cut + +sub pubmed { + my ($self,$value) = @_; + if( defined $value) { + $self->{'pubmed'} = $value; + } + return $self->{'pubmed'}; +} + +=head2 database + + Title : database + Usage : + Function: Overrides DBLink database to be hard coded to 'MEDLINE', unless + the database has been set explicitely before. + Example : + Returns : + Args : + + +=cut + +sub database{ + my ($self, @args) = @_; + + return $self->SUPER::database(@args) || 'MEDLINE'; +} + +=head2 primary_id + + Title : primary_id + Usage : + Function: Overrides DBLink primary_id to provide medline number + Example : + Returns : + Args : + + +=cut + +sub primary_id{ + my ($self, @args) = @_; + + return $self->medline(@args); +} + +=head2 optional_id + + Title : optional_id + Usage : + Function: Overrides DBLink optional_id to provide the PubMed number. + Example : + Returns : + Args : + + +=cut + +sub optional_id{ + my ($self, @args) = @_; + + return $self->pubmed(@args); +} + +=head2 publisher + + Title : publisher + Usage : $self->publisher($newval) + Function: Gives the publisher line. No attempt is made to parse the publisher line + Example : + Returns : value of publisher + Args : newvalue (optional) + + +=cut + +sub publisher { + my ($self,$value) = @_; + if( defined $value) { + $self->{'publisher'} = $value; + } + return $self->{'publisher'}; +} + + +=head2 editors + + Title : editors + Usage : $self->editors($newval) + Function: Gives the editors line. No attempt is made to parse the editors line + Example : + Returns : value of editors + Args : newvalue (optional) + + +=cut + +sub editors { + my ($self,$value) = @_; + if( defined $value) { + $self->{'editors'} = $value; + } + return $self->{'editors'}; +} + + +=head2 encoded_ref + + Title : encoded_ref + Usage : $self->encoded_ref($newval) + Function: Gives the encoded_ref line. No attempt is made to parse the encoded_ref line + (this is added for reading PDB records (REFN record), where this contains + ISBN/ISSN/ASTM code) + Example : + Returns : value of encoded_ref + Args : newvalue (optional) + + +=cut + +sub encoded_ref { + my ($self,$value) = @_; + if( defined $value) { + $self->{'encoded_ref'} = $value; + } + return $self->{'encoded_ref'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/SimpleValue.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/SimpleValue.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,237 @@ +# $Id: SimpleValue.pm,v 1.9.2.1 2003/03/10 22:04:56 lapp Exp $ +# +# BioPerl module for Bio::Annotation::SimpleValue +# +# Cared for by bioperl +# +# Copyright bioperl +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::SimpleValue - A simple scalar + +=head1 SYNOPSIS + + use Bio::Annotation::SimpleValue; + use Bio::Annotation::Collection; + + my $col = new Bio::Annotation::Collection; + my $sv = new Bio::Annotation::SimpleValue(-value => 'someval'); + $col->add_Annotation('tagname', $sv); + +=head1 DESCRIPTION + +Scalar value annotation object + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - bioperl + +Email bioperl-l@bio.perl.org + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Annotation::SimpleValue; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::AnnotationI; +#use Bio::Ontology::TermI; +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root Bio::AnnotationI); + +=head2 new + + Title : new + Usage : my $sv = new Bio::Annotation::SimpleValue; + Function: Instantiate a new SimpleValue object + Returns : Bio::Annotation::SimpleValue object + Args : -value => $value to initialize the object data field [optional] + -tagname => $tag to initialize the tagname [optional] + -tag_term => ontology term representation of the tag [optional] + +=cut + +sub new{ + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($value,$tag,$term) = + $self->_rearrange([qw(VALUE TAGNAME TAG_TERM)], @args); + + # set the term first + defined $term && $self->tag_term($term); + defined $value && $self->value($value); + defined $tag && $self->tagname($tag); + + return $self; +} + + +=head1 AnnotationI implementing functions + +=cut + +=head2 as_text + + Title : as_text + Usage : my $text = $obj->as_text + Function: return the string "Value: $v" where $v is the value + Returns : string + Args : none + + +=cut + +sub as_text{ + my ($self) = @_; + + return "Value: ".$self->value; +} + +=head2 hash_tree + + Title : hash_tree + Usage : my $hashtree = $value->hash_tree + Function: For supporting the AnnotationI interface just returns the value + as a hashref with the key 'value' pointing to the value + Returns : hashrf + Args : none + + +=cut + +sub hash_tree{ + my ($self) = @_; + + my $h = {}; + $h->{'value'} = $self->value; +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to + provide a tag to AnnotationCollection when adding this + object. + + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my $self = shift; + + # check for presence of an ontology term + if($self->{'_tag_term'}) { + # keep a copy in case the term is removed later + $self->{'tagname'} = $_[0] if @_; + # delegate to the ontology term object + return $self->tag_term->name(@_); + } + return $self->{'tagname'} = shift if @_; + return $self->{'tagname'}; +} + + +=head1 Specific accessors for SimpleValue + +=cut + +=head2 value + + Title : value + Usage : $obj->value($newval) + Function: Get/Set the value for simplevalue + Returns : value of value + Args : newvalue (optional) + + +=cut + +sub value{ + my ($self,$value) = @_; + + if( defined $value) { + $self->{'value'} = $value; + } + return $self->{'value'}; +} + +=head2 tag_term + + Title : tag_term + Usage : $obj->tag_term($newval) + Function: Get/set the L object representing + the tag name. + + This is so you can specifically relate the tag of this + annotation to an entry in an ontology. You may want to do + this to associate an identifier with the tag, or a + particular category, such that you can better match the tag + against a controlled vocabulary. + + This accessor will return undef if it has never been set + before in order to allow this annotation to stay + light-weight if an ontology term representation of the tag + is not needed. Once it is set to a valid value, tagname() + will actually delegate to the name() of this term. + + Example : + Returns : a L compliant object, or undef + Args : on set, new value (a L compliant + object or undef, optional) + + +=cut + +sub tag_term{ + my $self = shift; + + return $self->{'_tag_term'} = shift if @_; + return $self->{'_tag_term'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/StructuredValue.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/StructuredValue.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,352 @@ +# $Id: StructuredValue.pm,v 1.2 2002/10/22 07:38:26 lapp Exp $ +# +# BioPerl module for Bio::Annotation::StructuredValue +# +# Cared for by Hilmar Lapp +# + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::StructuredValue - A scalar with embedded structured + information + +=head1 SYNOPSIS + + use Bio::Annotation::StructuredValue; + use Bio::Annotation::Collection; + + my $col = new Bio::Annotation::Collection; + my $sv = new Bio::Annotation::StructuredValue(-value => 'someval'); + $col->add_Annotation('tagname', $sv); + +=head1 DESCRIPTION + +Scalar value annotation object + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - bioperl + +Email bioperl-l@bio.perl.org + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Annotation::StructuredValue; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::AnnotationI; +use Bio::Annotation::SimpleValue; + +@ISA = qw(Bio::Annotation::SimpleValue); + +=head2 new + + Title : new + Usage : my $sv = new Bio::Annotation::StructuredValue; + Function: Instantiate a new StructuredValue object + Returns : Bio::Annotation::StructuredValue object + Args : -value => $value to initialize the object data field [optional] + -tagname => $tag to initialize the tagname [optional] + +=cut + +sub new{ + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($value,$tag) = $self->_rearrange([qw(VALUE TAGNAME)], @args); + + $self->{'values'} = []; + defined $value && $self->value($value); + defined $tag && $self->tagname($tag); + + return $self; +} + + +=head1 AnnotationI implementing functions + +=cut + +=head2 as_text + + Title : as_text + Usage : my $text = $obj->as_text + Function: return the string "Value: $v" where $v is the value + Returns : string + Args : none + + +=cut + +sub as_text{ + my ($self) = @_; + + return "Value: ".$self->value; +} + +=head2 hash_tree + + Title : hash_tree + Usage : my $hashtree = $value->hash_tree + Function: For supporting the AnnotationI interface just returns the value + as a hashref with the key 'value' pointing to the value + Returns : hashrf + Args : none + + +=cut + +sub hash_tree{ + my ($self) = @_; + + my $h = {}; + $h->{'value'} = $self->value; +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to provide + a tag to AnnotationCollection when adding this object. + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'tagname'} = $value; + } + return $self->{'tagname'}; +} + + +=head1 Specific accessors for StructuredValue + +=cut + +=head2 value + + Title : value + Usage : $obj->value($newval) + Function: Get/set the value for this annotation. + + Set mode is here only to retain compatibility with + SimpleValue. It is equivalent to calling + add_value([0], $newval). + + In get mode, this implementation allows to pass additional + parameters that control how the structured annotation + components will be joined together to form a + string. Recognized are presently + -joins a reference to an array of join strings, the + elements at index i applying to joining + annotations at dimension i. The last element + will be re-used for dimensions higher than i. + Defaults to ['; ']. + -brackets a reference to an array of two strings + denoting the opening and closing brackets for + the elements of one dimension, if there is + more than one element in the dimension. + Defaults to ['(',')']. + + Returns : value of value + Args : newvalue (optional) + + +=cut + +sub value{ + my ($self,$value,@args) = @_; + + # set mode? + return $self->add_value([0], $value) if defined($value) && (@args == 0); + # no, get mode + # determine joins and brackets + unshift(@args, $value); + my ($joins, $brackets) = + $self->_rearrange([qw(JOINS BRACKETS)], @args); + $joins = ['; '] unless $joins; + $brackets = ['(', ')'] unless $brackets; + my $txt = &_to_text($self->{'values'}, $joins, $brackets); + # if there's only brackets at the start and end, remove them + if((@{$self->{'values'}} == 1) && + (length($brackets->[0]) == 1) && (length($brackets->[1]) == 1)) { + my $re = '\\'.$brackets->[0]. + '([^\\'.$brackets->[1].']*)\\'.$brackets->[1]; + $txt =~ s/^$re$/$1/; + } + return $txt; +} + +sub _to_text{ + my ($arr, $joins, $brackets, $rec_n) = @_; + + $rec_n = 0 unless defined($rec_n); + my $i = $rec_n >= @$joins ? @$joins-1 : $rec_n; + my $txt = join($joins->[$i], + map { + ref($_) ? + (ref($_) eq "ARRAY" ? + &_to_text($_, $joins, $brackets, $rec_n+1) : + $_->value()) : + $_; + } @$arr); + if($rec_n && (@$arr > 1)) { + $txt = $brackets->[0] . $txt . $brackets->[1]; + } + return $txt; +} + +=head2 get_values + + Title : get_values + Usage : + Function: Get the top-level array of values. Each of the elements will + recursively be a reference to an array or a scalar, depending + on the depth of this structured value annotation. + Example : + Returns : an array + Args : none + + +=cut + +sub get_values{ + my $self = shift; + + return @{$self->{'values'}}; +} + +=head2 get_all_values + + Title : get_all_values + Usage : + Function: Flattens all values in this structured annotation and + returns them as an array. + Example : + Returns : the (flat) array of values + Args : none + + +=cut + +sub get_all_values{ + my ($self) = @_; + + # we code lazy here and just take advantage of value() + my $txt = $self->value(-joins => ['@!@'], -brackets => ['','']); + return split(/\@!\@/, $txt); +} + +=head2 add_value + + Title : add_value + Usage : + Function: Adds the given value to the structured annotation at the + given index. + + The index is multi-dimensional, with the first dimension + applying to the first level, and so forth. If a particular + dimension or a particular index does not exist yet, it will + be created. If it does exist and adding the value would + mean replacing a scalar with an array reference, we throw + an exception to prevent unintended damage. An index of -1 + at any dimension means append. + + If an array of values is to be added, it will create an + additional dimension at the index specified, unless the + last index value is -1, in which case they will all be + appended to the last dimension. + + Example : + Returns : none + Args : the index at which to add (a reference to an array) + the value(s) to add + + +=cut + +sub add_value{ + my ($self,$index,@values) = @_; + + my $tree = $self->{'values'}; + my $lastidx = pop(@$index); + foreach my $i (@$index) { + if($i < 0) { + my $subtree = []; + push(@$tree, $subtree); + $tree = $subtree; + } elsif((! $tree->[$i]) || (ref($tree->[$i]) eq "ARRAY")) { + $tree->[$i] = [] unless ref($tree->[$i]) eq "ARRAY"; + $tree = $tree->[$i]; + } else { + $self->throw("element $i is a scalar but not in last dimension"); + } + } + if($lastidx < 0) { + push(@$tree, @values); + } elsif(@values < 2) { + $tree->[$lastidx] = shift(@values); + } else { + $tree->[$lastidx] = [@values]; + } + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Annotation/TypeManager.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Annotation/TypeManager.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,175 @@ +# $Id: TypeManager.pm,v 1.4 2002/10/22 07:38:26 lapp Exp $ +# +# BioPerl module for Bio::Annotation::TypeManager +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Annotation::TypeManager - Manages types for annotation collections + +=head1 SYNOPSIS + + # default type manager + + $tm = Bio::Annotation::TypeManager->new(); + + # $key is a string or a Bio::Ontology::TermI compliant object + print "The type for $key is ",$tm->type_for_key($key),"\n"; + + if( !$tm->is_valid($key,$object) ) { + $self->throw("Invalid object for key $key"); + } + +=head1 DESCRIPTION + +Manages types for annotation collections. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Annotation::TypeManager; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; + + +@ISA = qw(Bio::Root::Root); +# new() can be inherited from Bio::Root::Root + +=head2 new + + Title : new + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub new{ + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_type'} = {}; + + $self->_add_type_map('reference',"Bio::Annotation::Reference"); + $self->_add_type_map('comment',"Bio::Annotation::Comment"); + $self->_add_type_map('dblink',"Bio::Annotation::DBLink"); + + return $self; +} + + +=head2 type_for_key + + Title : type_for_key + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub type_for_key{ + my ($self,$key) = @_; + + $key = $key->name() if ref($key) && $key->isa("Bio::Ontology::TermI"); + return $self->{'_type'}->{$key}; +} + + +=head2 is_valid + + Title : is_valid + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub is_valid{ + my ($self,$key,$object) = @_; + + if( !defined $object || !ref $object ) { + $self->throw("Cannot type an object [$object]!"); + } + + if( !$object->isa($self->type_for_key($key)) ) { + return 0; + } else { + return 1; + } +} + + +=head2 _add_type_map + + Title : _add_type_map + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _add_type_map{ + my ($self,$key,$type) = @_; + + $key = $key->name() if ref($key) && $key->isa("Bio::Ontology::TermI"); + $self->{'_type'}->{$key} = $type; +} + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AnnotationCollectionI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AnnotationCollectionI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,177 @@ +# $Id: AnnotationCollectionI.pm,v 1.9 2002/10/22 07:38:24 lapp Exp $ + +# +# BioPerl module for Bio::AnnotationCollectionI +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AnnotationCollectionI - Interface for annotation collections + +=head1 SYNOPSIS + + # get an AnnotationCollectionI somehow, eg + + $ac = $seq->annotation(); + + foreach $key ( $ac->get_all_annotation_keys() ) { + @values = $ac->get_Annotations($key); + foreach $value ( @values ) { + # value is an Bio::AnnotationI, and defines a "as_text" method + print "Annotation ",$key," stringified value ",$value->as_text,"\n"; + + # also defined hash_tree method, which allows data orientated + # access into this object + $hash = $value->hash_tree(); + } + } + +=head1 DESCRIPTION + +Annotation Collections are a way of storing a series of "interesting +facts" about something. We call an "interesting fact" in Bioperl an +Annotation (this differs from a Sequence Feature, which is called +a Sequence Feature and may or may not have an Annotation Collection). + +The trouble about this is we are not that sure what "interesting +facts" someone might want to store: the possibility is endless. + +Bioperl's approach is that the "interesting facts" are represented by +Bio::AnnotationI objects. The interface Bio::AnnotationI guarentees +two methods + + $obj->as_text(); # string formated to display to users + +and + + $obj->hash_tree(); # hash with defined rules for data-orientated discovery + +The hash_tree method is designed to play well with XML output and +other "nested-tag-of-data-values" think BoulderIO and/or Ace stuff. For more +info read Bio::AnnotationI docs + +Annotations are stored in AnnotationCollections, each Annotation under a +different "tag". The tags allow simple discovery of the available annotations, +and in some cases (like the tag "gene_name") indicate how to interpret the +data underneath the tag. The tag is only one tag deep and each tag can have an +array of values. + +In addition, AnnotationCollectionI's are guarentee to maintain a consistent +set object values under each tag - at least that each object complies to one +interface. The "standard" AnnotationCollection insists the following rules +are set up + + Tag Object + --- ------ + reference Bio::Annotation::Reference + comment Bio::Annotation::Comment + dblink Bio::Annotation::DBLink + gene_name Bio::Annotation::SimpleValue + description Bio::Annotation::SimpleValue + +These tags are the implict tags that the SeqIO system needs to round-trip +GenBank/EMBL/Swissprot. + +However, you as a user and us collectively as a community can grow the +"standard" tag mapping over time and specifically for a particular +area. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AnnotationCollectionI; +use vars qw(@ISA); +use strict; + +# Interface preamble - inherits from Bio::Root::RootI + +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + + +=head2 get_all_annotation_keys + + Title : get_all_annotation_keys + Usage : $ac->get_all_annotation_keys() + Function: gives back a list of annotation keys, which are simple text strings + Returns : list of strings + Args : none + +=cut + +sub get_all_annotation_keys{ + shift->throw_not_implemented(); +} + + +=head2 get_Annotations + + Title : get_Annotations + Usage : my @annotations = $collection->get_Annotations('key') + Function: Retrieves all the Bio::AnnotationI objects for a specific key + Returns : list of Bio::AnnotationI - empty if no objects stored for a key + Args : string which is key for annotations + +=cut + +sub get_Annotations{ + shift->throw_not_implemented(); +} + +=head2 get_num_of_annotations + + Title : get_num_of_annotations + Usage : my $count = $collection->get_num_of_annotations() + Function: Returns the count of all annotations stored in this collection + Returns : integer + Args : none + + +=cut + +sub get_num_of_annotations{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/AnnotationI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AnnotationI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,180 @@ +# $Id: AnnotationI.pm,v 1.7 2002/10/22 07:38:24 lapp Exp $ + +# +# BioPerl module for Bio::AnnotationI +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AnnotationI - Annotation interface + +=head1 SYNOPSIS + + # generally you get AnnotationI's from AnnotationCollectionI's + + foreach $key ( $ac->get_all_annotation_keys() ) { + @values = $ac->get_Annotations($key); + foreach $value ( @values ) { + # value is an Bio::AnnotationI, and defines a "as_text" method + print "Annotation ",$key," stringified value ",$value->as_text,"\n"; + # you can also use a generic hash_tree method for getting + # stuff out say into XML format + $hash_tree = $value->hash_tree(); + } + } + + +=head1 DESCRIPTION + +Interface all annotations must support. There are two things that each +annotation has to support. + + $annotation->as_text() + +Annotations have to support an "as_text" method. This should be a +single text string, without newlines representing the annotation, +mainly for human readability. It is not aimed at being able to +store/represent the annotation. + +The second method allows annotations to at least attempt to represent +themselves as pure data for storage/display/whatever. The method +hash_tree + + $hash = $annotation->hash_tree(); + +should return an anonymous hash with "XML-like" formatting. The +formatting is as follows. + + (1) For each key in the hash, if the value is a reference'd array - + + (2) For each element of the array if the value is a object - + Assumme the object has the method "hash_tree"; + (3) else if the value is a referene to a hash + Recurse again from point (1) + (4) else + Assumme the value is a scalar, and handle it directly as text + + (5) else (if not an array) apply rules 2,3 and 4 to value + +The XML path in tags is represented by the keys taken in the +hashes. When arrays are encountered they are all present in the path +level of this tag + +This is a pretty "natural" representation of an object tree in an XML +style, without forcing everything to inheriet off some super-generic +interface for representing things in the hash. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + + +package Bio::AnnotationI; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::RootI; + + +@ISA = qw(Bio::Root::RootI); + + +=head2 as_text + + Title : as_text + Usage : + Function: single text string, without newlines representing the + annotation, mainly for human readability. It is not aimed + at being able to store/represent the annotation. + Example : + Returns : a string + Args : none + + +=cut + +sub as_text{ + shift->throw_not_implemented(); +} + +=head2 hash_tree + + Title : hash_tree + Usage : + Function: should return an anonymous hash with "XML-like" formatting + Example : + Returns : a hash reference + Args : none + + +=cut + +sub hash_tree{ + shift->throw_not_implemented(); +} + +=head2 tagname + + Title : tagname + Usage : $obj->tagname($newval) + Function: Get/set the tagname for this annotation value. + + Setting this is optional. If set, it obviates the need to + provide a tag to Bio::AnnotationCollectionI when adding + this object. When obtaining an AnnotationI object from the + collection, the collection will set the value to the tag + under which it was stored unless the object has a tag + stored already. + + Example : + Returns : value of tagname (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub tagname{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/Contig.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/Contig.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2091 @@ +# $Id: Contig.pm,v 1.1 2002/11/04 11:50:11 heikki Exp $ +# +# BioPerl module for Bio::Assembly::Contig +# Mostly based on Bio::SimpleAlign by Ewan Birney +# +# Cared for by Robson francisco de Souza +# +# Copyright Robson Francisco de Souza +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::Contig - Perl module to hold and manipulate + sequence assembly contigs. + +=head1 SYNOPSYS + + # Module loading + use Bio::Assembly::IO; + + # Assembly loading methods + $aio = new Bio::Assembly::IO(-file=>"test.ace.1", + -format=>'phrap'); + + $assembly = $aio->next_assembly; + foreach $contig ($assembly->all_contigs) { + # do something + } + + # OR, if you want to build the contig yourself, + + use Bio::Assembly::Contig; + $c = Bio::Assembly::Contig->new(-id=>"1"); + + $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T", + -id=>"r1", + -alphabet=>'dna'); + $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T", + -id=>"r2", + -alphabet=>'dna'); + + $ls_coord = Bio::SeqFeature::Generic->new(-start=>3, + -end=>8, + -strand=>1); + $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1, + -end=>8, + -strand=>1); + $c->add_seq($ls); + $c->add_seq($ls2); + $c->set_seq_coord($ls_coord,$ls); + $c->set_seq_coord($ls2_coord,$ls2); + + $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T", + -alphabet=>'dna'); + $c->set_consensus_sequence($con); + + $l = $c->change_coord('unaligned r2','ungapped consensus',6); + print "6 in unaligned r2 => $l in ungapped consensus\n"; + + +=head1 DESCRIPTION + +A contig is as a set of sequences, locally aligned to each other, so +that every sequence has overlapping regions with at least one sequence +in the contig, such that a continuous of overlapping sequences is +formed, allowing the deduction of a consensus sequence which may be +longer than any of the sequences from which it was deduced. + +In this documentation we refer to the overlapping sequences used to +build the contig as "aligned sequences" and to the sequence deduced +from the overlap of aligned sequences as the "consensus". Methods to +deduce the consensus sequence from aligned sequences were not yet +implemented in this module, but its posssible to add a consensus +sequence deduced by other means, e.g, by the assembly program used to +build the alignment. + +All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable +objects and have a unique ID. The unique ID restriction is due to the +nature of the module's internal data structures and is also a request +of some assembly programs. If two sequences with the same ID are added +to a contig, the first sequence added is replaced by the second one. + +=head2 Coordinate_systems + +There are four base coordinate systems in Bio::Assembly::Contig. When +you need to access contig elements or data that exists on a certain +range or location, you may be specifying coordinates in relation to +different sequences, which may be either the contig consensus or one +of the aligned sequences that were used to do the assembly. + + ========================================================= + Name | Referenced sequence + --------------------------------------------------------- + "gapped consensus" | Contig (with gaps) + "ungapped consensus" | Contig (without gaps) + "aligned $seqID" | sequence $seqID (with gaps) + "unaligned $seqID" | sequence $seqID (without gaps) + ========================================================= + +"gapped consensus" refers to positions in the aligned consensus +sequence, which is the consensus sequence including the gaps inserted +to align it agains the aligned sequences that were used to assemble +the contig. So, its limits are [ 1, (consensus length + number of gaps +in consensus) ] + +"ungapped consensus" is a coordinate system based on the consensus +sequence, but excluding consensus gaps. This is just the coordinate +system that you have when considering the consensus sequence alone, +instead of aligned to other sequences. + +"aligned $seqID" refers to locations in the sequence $seqID after +alignment of $seqID against the consensus sequence (reverse +complementing the original sequence, if needed). Coordinate 1 in +"aligned $seqID" is equivalent to the start location (first base) of +$seqID in the consensus sequence, just like if the aligned sequence +$seqID was a feature of the consensus sequence. + +"unaligned $seqID" is equivalent to a location in the isolated +sequence, just like you would have when considering the sequence +alone, out of an alignment. When changing coordinates from "aligned +$seq2" to "unaligned $seq2", if $seq2 was reverse complemented when +included in the alignment, the output coordinates will be reversed to +fit that fact, i.e. 1 will be changed to length($seq2), 2 will be +length($seq)-1 and so on. + +An important note: when you change gap coordinates from a gapped +system ("gapped consensus" or "aligned $seqID") to a system that does +not include gaps ("ungapped consensus" or "unaligned $seqID"), the +position returned will be the first location before all gaps +neighboring the input location. + +=head2 Feature_collection + +Bio::Assembly::Contig stores much information about a contig in a +Bio::Assembly::SeqFeature::Collection object. Relevant information on the +alignment is accessed by selecting features based on their primary +tags (e.g. all features which have a primary tag of the form +'_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are +coordinates for sequences in the contig alignment) and, by using +methods from Bio::Assembly::SeqFeature::Collection, it's possible to select +features by overlap with other features. + +We suggest that you use the primary tags of features as identifiers +for feature classes. By convention, features with primary tags +starting with a '_' are generated by modules that populate the contig +data structure and return the contig object, maybe as part of an +assembly object, e.g. drivers from the Bio::Assembly::IO set. + +Features in the features collection may be associated with particular +aligned sequences. To obtain this, you must attach the sequence to the +feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the +feature to the feature collection. We also suggest to add the sequence +id to the primary tag, so that is easy to select feature for a +particular sequence. + +There is only one feature class that some methods in +Bio::Assembly::Contig expect to find in the feature collection: features +with primary tags of the form '_aligned_coord:$seqID', where $seqID is +the aligned sequence id (like returned by $seq-Eid()). These features +describe the position (in "gapped consensus" coordinates) of aligned +sequences, and the method set_seq_coord() automatically changes a +feature's primary tag to this form whenever the feature is added to +the collection by this method. Only two methods in Bio::Assembly::Contig +will not work unless there are features from this class: +change_coord() and get_seq_coord(). + +Other feature classes will be automatically available only when +Bio::Assembly::Contig objects are created by a specific module. Such +feature classes are (or should be) documented in the documentation of +the module which create them, to which the user should refer. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +package Bio::Assembly::Contig; + +use strict; +use vars qw(@ISA $VERSION); + +use Bio::Root::Root; +use Bio::Align::AlignI; +use Bio::SeqFeature::Collection; +use Bio::Seq::PrimaryQual; + +@ISA = qw(Bio::Root::Root Bio::Align::AlignI); + +=head1 Object creator + +=head2 new + + Title : new + Usage : my $contig = new Bio::Assembly::Contig(); + Function : Creates a new contig object + Returns : Bio::Assembly::Contig + Args : -source => string representing the source + program where this contig came + from + -id => contig unique ID + +=cut + +#----------- +sub new { +#----------- + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($src, $id) = $self->_rearrange([qw(SOURCE ID)], @args); + $src && $self->source($src); + ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name + ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig + # we need to set up internal hashes first! + + # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility) + $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID) + $self->{'_order'} = {}; # store sequence order +# $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids. +# $self->{'_dis_name'} = {}; # Display names for each sequence + $self->{'_symbols'} = {}; # List of symbols + + #Contig specific slots + $self->{'_consensus_sequence'} = undef; + $self->{'_consensus_quality'} = undef; + $self->{'_nof_residues'} = 0; + $self->{'_nof_seqs'} = 0; +# $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now... + $self->{'_sfc'} = Bio::SeqFeature::Collection->new(); + + # Assembly specifcs + $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one. + $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise + $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig + $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig + + return $self; # success - we hope! +} + +=head1 Assembly related methods + +These methods exist to enable adding information about possible +relations among contigs, e.g. when you already have a scaffold for +your assembly, describing the ordering of contigs in the final +assembly, but no sequences covering the gaps between neighboring +contigs. + +=head2 source + + Title : source + Usage : $contig->source($program); + Function : Get/Set program used to build this contig + Returns : string + Argument : [optional] string + +=cut + +sub source { + my $self = shift; + my $source = shift; + + $self->{'_source'} = $source if (defined $source); + return $self->{'_source'}; +} + +=head2 assembly + + Title : assembly + Usage : $contig->assembly($assembly); + Function : Get/Set assembly object for this contig + Returns : a Bio::Assembly::Scaffold object + Argument : a Bio::Assembly::Scaffold object + +=cut + +sub assembly { + my $self = shift; + my $assembly = shift; + + $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly") + if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold")); + + $self->{'_assembly'} = $assembly if (defined $assembly); + return $self->{'_assembly'}; +} + +=head2 strand + + Title : strand + Usage : $contig->strand($num); + Function : Get/Set contig orientation in a scaffold/assembly. + Its equivalent to the strand property of sequence + objects and sets whether the contig consensus should + be reversed and complemented before being added to a + scaffold or assembly. + Returns : integer + Argument : 1 if orientaion is forward, -1 if reverse and + 0 if none + +=cut + +sub strand { + my $self = shift; + my $ori = shift; + + $self->throw("Contig strand must be either 1, -1 or 0") + unless (defined $ori && ($ori == 1 || $ori == 0 || $ori == -1)); + + $self->{'_strand'} = $ori; + return $self->{'_strand'}; +} + +=head2 upstream_neighbor + + Title : upstream_neighbor + Usage : $contig->upstream_neighbor($contig); + Function : Get/Set a contig neighbor for the current contig when + building a scaffold. The upstream neighbor is + located before $contig first base + Returns : nothing + Argument : Bio::Assembly::Contig + +=cut + +sub upstream_neighbor { + my $self = shift; + my $ref = shift; + + $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig") + if (defined $ref && ! $ref->isa("Bio::Assembly::Contig")); + + $self->{'_neighbor_start'} = $ref if (defined $ref); + return $self->{'_neighbor_start'}; +} + +=head2 downstream_neighbor + + Title : downstream_neighbor + Usage : $contig->downstream_neighbor($num); + Function : Get/Set a contig neighbor for the current contig when + building a scaffold. The downstream neighbor is + located after $contig last base + Returns : nothing + Argument : Bio::Assembly::Contig + +=cut + +sub downstream_neighbor { + my $self = shift; + my $ref = shift; + + $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig") + if (defined $ref && ! $ref->isa("Bio::Assembly::Contig")); + $self->{'_neighbor_end'} = $ref if (defined $ref); + return $self->{'_neighbor_end'}; +} + +=head1 Contig feature collection methods + +=head2 add_features + + Title : add_features + Usage : $contig->add_features($feat,$flag) + Function : + + Add an array of features to the contig feature + collection. The consensus sequence may be attached to the + added feature, if $flag is set to 1. If $flag is 0 and + the feature attached to one of the contig aligned + sequences, the feature is registered as an aligned + sequence feature. If $flag is 0 and the feature is not + attched to any sequence in the contig, the feature is + simply added to the feature collection and no attachment + or registration is made. + + Note: You must attach aligned sequences to their features + prior to calling add_features, otherwise you won't be + able to access the feature through get_seq_feat_by_tag() + method. + + Returns : number of features added. + Argument : + $feat : A reference to an array of Bio::SeqFeatureI + $flag : boolean - true if consensus sequence object + should be attached to this feature, false if + no consensus attachment should be made. + Default: false. + +=cut + +sub add_features { + my ($self, $args, $flag) = @_; + + # Adding shortcuts for aligned sequence features + $flag = 0 unless (defined $flag); + if ($flag && defined $self->{'_consensus_sequence'}) { + foreach my $feat (@$args) { + next if (defined $feat->seq); + $feat->attach_seq($self->{'_consensus_sequence'}); + } + } elsif (!$flag) { # Register aligned sequence features + foreach my $feat (@$args) { + if (my $seq = $feat->entire_seq()) { + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + $self->warn("Adding contig feature attached to unknown sequence $seqID!") + unless (exists $self->{'_elem'}{$seqID}); + my $tag = $feat->primary_tag; + $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat; + } + } + } + + # Add feature to feature collection + my $nof_added = $self->{'_sfc'}->add_features($args); + + return $nof_added; +} + +=head2 remove_features + + Title : remove_features + Usage : $contig->remove_features(@feat) + Function : Remove an array of contig features + Returns : number of features removed. + Argument : An array of Bio::SeqFeatureI + +=cut + +sub remove_features { + my ($self, @args) = @_; + + # Removing shortcuts for aligned sequence features + foreach my $feat (@args) { + if (my $seq = $feat->entire_seq()) { + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + my $tag = $feat->primary_tag; + $tag =~ s/:$seqID$/$1/g; + delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} ) + if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} && + $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat); + } + } + + return $self->{'_sfc'}->remove_features(\@args); +} + +=head2 get_features_collection + + Title : get_features_collection + Usage : $contig->get_features_collection() + Function : Get the collection of all contig features + Returns : Bio::SeqFeature::Collection + Argument : none + +=cut + +sub get_features_collection { + my $self = shift; + + return $self->{'_sfc'}; +} + +=head1 Coordinate system's related methods + +See L above. + +=head2 change_coord + + Title : change_coord + Usage : $contig->change_coord($in,$out,$query) + Function : + + Change coordinate system for $query. This method + transforms locations between coordinate systems described + in section "Coordinate Systems" of this document. + + Note: this method will throw an exception when changing + coordinates between "ungapped consensus" and other + systems if consensus sequence was not set. It will also + throw exceptions when changing coordinates among aligned + sequence, either with or without gaps, and other systems + if sequence locations were not set with set_seq_coord(). + + Returns : integer + Argument : + $in : [string] input coordinate system + $out : [string] output coordinate system + $query : [integer] a position in a sequence + +=cut + +sub change_coord { + my $self = shift; + my $type_in = shift; + my $type_out = shift; + my $query = shift; + + # Parsing arguments + # Loading read objects (these calls will throw exceptions whether $read_in or + # $read_out is not found + my ($read_in,$read_out) = (undef,undef); + my $in_ID = ( split(' ',$type_in) )[1]; + my $out_ID = ( split(' ',$type_out) )[1]; + + if ($in_ID ne 'consensus') { + $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) ); + $self->throw("Can't change coordinates without sequence location for $in_ID") + unless (defined $read_in); + } + if ($out_ID ne 'consensus') { + $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) ); + $self->throw("Can't change coordinates without sequence location for $out_ID") + unless (defined $read_out); + } + + # Performing transformation between coordinates + SWITCH1: { + + # Transformations between contig padded and contig unpadded + (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do { + $self->throw("Can't use ungapped consensus coordinates without a consensus sequence") + unless (defined $self->{'_consensus_sequence'}); + $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query); + last SWITCH1; + }; + (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do { + $self->throw("Can't use ungapped consensus coordinates without a consensus sequence") + unless (defined $self->{'_consensus_sequence'}); + $query = &_unpadded_padded($self->{'_consensus_gaps'},$query); + last SWITCH1; + }; + + # Transformations between contig (padded) and read (padded) + (($type_in eq 'gapped consensus') && + ($type_out =~ /^aligned /) && defined($read_out)) && do { + $query = $query - $read_out->start() + 1; + last SWITCH1; + }; + (($type_in =~ /^aligned /) && defined($read_in) && + ($type_out eq 'gapped consensus')) && do { + $query = $query + $read_in->start() - 1; + last SWITCH1; + }; + + # Transformations between contig (unpadded) and read (padded) + (($type_in eq 'ungapped consensus') && + ($type_out =~ /^aligned /) && defined($read_out)) && do { + $query = $self->change_coord('ungapped consensus','gapped consensus',$query); + $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); + last SWITCH1; + }; + (($type_in =~ /^aligned /) && defined($read_in) && + ($type_out eq 'ungapped consensus')) && do { + $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); + $query = $self->change_coord('gapped consensus','ungapped consensus',$query); + last SWITCH1; + }; + + # Transformations between seq $read_in padded and seq $read_out padded + (defined($read_in) && ($type_in =~ /^aligned /) && + defined($read_out) && ($type_out =~ /^aligned /)) && do { + $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); + $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); + last SWITCH1; + }; + + # Transformations between seq $read_in padded and seq $read_out unpadded + (defined($read_in) && ($type_in =~ /^aligned /) && + defined($read_out) && ($type_out =~ /^unaligned /)) && do { + if ($read_in ne $read_out) { + $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); + $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); + } + my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'}; + $query = &_padded_unpadded($list_out,$query); + # Changing read orientation if read was reverse complemented when aligned + if ($read_out->strand == -1) { + my ($length) = $read_out->length(); + $length = $length - &_nof_gaps($list_out,$length); + $query = $length - $query + 1; + } + last SWITCH1; + }; + (defined($read_in) && ($type_in =~ /^unaligned /) && + defined($read_out) && ($type_out =~ /^aligned /)) && do { + my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'}; + # Changing read orientation if read was reverse complemented when aligned + if ($read_in->strand == -1) { + my ($length) = $read_in->length(); + $length = $length - &_nof_gaps($list_in,$length); + $query = $length - $query + 1; + } + $query = &_unpadded_padded($list_in,$query); + if ($read_in ne $read_out) { + $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); + $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); + } + last SWITCH1; + }; + + # Transformations between seq $read_in unpadded and seq $read_out unpadded + (defined($read_in) && ($type_in =~ /^unaligned /) && + defined($read_out) && ($type_out =~ /^unaligned /)) && do { + $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query); + $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query); + last SWITCH1; + }; + + # Transformations between contig (padded) and read (unpadded) + (($type_in eq 'gapped consensus') && + ($type_out =~ /^unaligned /) && defined($read_out)) && do { + $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); + $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query); + last SWITCH1; + }; + (($type_in =~ /^unaligned /) && defined($read_in) && + ($type_out eq 'gapped consensus')) && do { + $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query); + $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); + last SWITCH1; + }; + + # Transformations between contig (unpadded) and read (unpadded) + (($type_in eq 'ungapped consensus') && + ($type_out =~ /^unaligned /) && defined($read_out)) && do { + $query = $self->change_coord('ungapped consensus','gapped consensus',$query); + $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query); + last SWITCH1; + }; + (($type_in =~ /^unaligned /) && defined($read_in) && + ($type_out eq 'ungapped consensus')) && do { + $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query); + $query = $self->change_coord('gapped consensus','ungapped consensus',$query); + last SWITCH1; + }; + + $self->throw("Unknow coordinate system. Args: $type_in, $type_out."); + $query = undef; # If a coordinate systems just requested is unknown + } + + return $query; +} + +=head2 get_seq_coord + + Title : get_seq_coord + Usage : $contig->get_seq_coord($seq); + Function : Get "gapped consensus" location for aligned sequence + Returns : Bio::SeqFeature::Generic for coordinates or undef. + A warning is printed if sequence coordinates were not set. + Argument : Bio::LocatabaleSeq object + +=cut + +sub get_seq_coord { + my ($self,$seq) = @_; + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("$seq is not a Bio::LocatableSeq"); + } + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + + unless (exists( $self->{'_elem'}{$seqID} )) { + $self->warn("No such sequence ($seqID) in contig ".$self->id); + return undef; + } + unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) { + $self->warn("Location not set for sequence ($seqID) in contig ".$self->id); + return undef; + } + + return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"}; +} + +=head2 set_seq_coord + + Title : set_seq_coord + Usage : $contig->set_seq_coord($feat,$seq); + Function : + + Set "gapped consensus" location for an aligned + sequence. If the sequence was previously added using + add_seq, its coordinates are changed/set. Otherwise, + add_seq is called and the sequence is added to the + contig. + + Returns : Bio::SeqFeature::Generic for old coordinates or undef. + Argument : + $feat : a Bio::SeqFeature::Generic object + representing a location for the + aligned sequence, in "gapped + consensus" coordinates. + + Note: the original feature primary tag will + be lost. + + $seq : a Bio::LocatabaleSeq object + +=cut + +sub set_seq_coord { + my ($self,$feat,$seq) = @_; + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [".ref($seq)."]"); + } + + # Complaining about inadequate feature object + $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!") + unless ( $feat->isa("Bio::SeqFeature::Generic") ); + $self->throw("Sequence coordinates must have an end!") + unless (defined $feat->end); + $self->throw("Sequence coordinates must have a start!") + unless (defined $feat->start); + + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + if (exists( $self->{'_elem'}{$seqID} ) && + exists( $self->{'_elem'}{$seqID}{'_seq'} ) && + defined( $self->{'_elem'}{$seqID}{'_seq'} ) && + ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) { + $self->warn("Replacing sequence $seqID\n"); + $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'}); + } + $self->add_seq($seq); + + # Remove previous coordinates, if any + $self->remove_features($feat); + + # Add new Bio::Generic::SeqFeature + $feat->add_tag_value('contig',$self->id) + unless ( $feat->has_tag('contig') ); + $feat->primary_tag("_aligned_coord:$seqID"); + $feat->attach_seq($seq); + $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat; + $self->add_features([ $feat ]); +} + +=head1 Bio::Assembly::Contig consensus methods + +=head2 set_consensus_sequence + + Title : set_consensus_sequence + Usage : $contig->set_consensus_sequence($seq) + Function : Set the consensus sequence object for this contig + Returns : consensus length + Argument : Bio::LocatableSeq + +=cut + +sub set_consensus_sequence { + my $self = shift; + my $seq = shift; + + $self->throw("Consensus sequence must be a Bio::LocatableSeq!") + unless ($seq->isa("Bio::LocatableSeq")); + + my $con_len = $seq->length; + $seq->start(1); $seq->end($con_len); + + $self->{'_consensus_gaps'} = []; # Consensus Gap registry + $self->_register_gaps($seq->seq, + $self->{'_consensus_gaps'}); + $self->{'_consensus_sequence'} = $seq; + + return $con_len; +} + +=head2 set_consensus_quality + + Title : set_consensus_quality + Usage : $contig->set_consensus_quality($qual) + Function : Set the quality object for consensus sequence + Returns : nothing + Argument : Bio::Seq::QualI object + +=cut + +sub set_consensus_quality { + my $self = shift; + my $qual = shift; + + $self->throw("Consensus quality must be a Bio::Seq::QualI object!") + unless ( $qual->isa("Bio::Seq::QualI") ); + + $self->throw("Consensus quality can't be added before you set the consensus sequence!") + unless (defined $self->{'_consensus_sequence'}); + + $self->{'_consensus_quality'} = $qual; +} + +=head2 get_consensus_length + + Title : get_consensus_length + Usage : $contig->get_consensus_length() + Function : Get consensus sequence length + Returns : integer + Argument : none + +=cut + +sub get_consensus_length { + my $self = shift; + + return $self->{'_consensus_sequence'}->length(); +} + +=head2 get_consensus_sequence + + Title : get_consensus_sequence + Usage : $contig->get_consensus_sequence() + Function : Get a reference to the consensus sequence object + for this contig + Returns : Bio::SeqI object + Argument : none + +=cut + +sub get_consensus_sequence { + my ($self, @args) = @_; + + return $self->{'_consensus_sequence'}; +} + +=head2 get_consensus_quality + + Title : get_consensus_quality + Usage : $contig->get_consensus_quality() + Function : Get a reference to the consensus quality object + for this contig. + Returns : A Bio::QualI object + Argument : none + +=cut + +sub get_consensus_quality { + my ($self, @args) = @_; + + return $self->{'_consensus_quality'}; +} + +=head1 Bio::Assembly::Contig aligned sequences methods + +=head2 set_seq_qual + + Title : set_seq_qual + Usage : $contig->set_seq_qual($seq,$qual); + Function : Adds quality to an aligned sequence. + Returns : nothing + Argument : a Bio::LocatableSeq object and + a Bio::Seq::QualI object + +See L for more information. + +=cut + +sub set_seq_qual { + my ($self,$seq,$qual) = @_; + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); + } + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + + $self->throw("Consensus quality must be a Bio::Seq::QualI object!") + unless ( $qual->isa("Bio::Seq::QualI") ); + $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!") + unless (exists $self->{'_elem'}{$seqID}{'_seq'}); + $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) )); + + # Adding gaps to quality object + my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq(); + my $tmp = $qual->qual(); + @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1); + my @quality = (); + my $previous = 0; + my $next = 0; + my $i = 0; my $j = 0; + while ($i<=$#{$tmp}) { + # IF base is a gap, quality is the average for neighbouring sites + if (substr($sequence,$j,1) eq '-') { + $previous = $tmp->[$i-1] unless ($i == 0); + if ($i < $#{$tmp}) { + $next = $tmp->[$i+1]; + } else { + $next = 0; + } + push(@quality,int( ($previous+$next)/2 )); + } else { + push(@quality,$tmp->[$i]); + $i++; + } + $j++; + } + + $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality), + -id=>$seqID); +} + +=head2 get_seq_ids + + Title : get_seq_ids + Usage : $contig->get_seq_ids(-start=>$start, + -end=>$end, + -type=>"gapped A0QR67B08.b"); + Function : Get list of sequence IDs overlapping inteval [$start, $end] + The default interval is [1,$contig->length] + Default coordinate system is "gapped contig" + Returns : An array + Argument : A hash with optional elements: + -start : consensus subsequence start + -end : consensus subsequence end + -type : the coordinate system type for $start and $end arguments + Coordinate system avaliable are: + "gapped consensus" : consensus coordinates with gaps + "ungapped consensus" : consensus coordinates without gaps + "aligned $ReadID" : read $ReadID coordinates with gaps + "unaligned $ReadID" : read $ReadID coordinates without gaps + + +=cut + +sub get_seq_ids { + my ($self, @args) = @_; + + my ($type,$start,$end) = + $self->_rearrange([qw(TYPE START END)], @args); + + if (defined($start) && defined($end)) { + if (defined($type) && ($type ne 'gapped consensus')) { + $start = $self->change_coord($type,'gapped consensus',$start); + $end = $self->change_coord($type,'gapped consensus',$end); + } + + my @list = grep { $_->isa("Bio::SeqFeature::Generic") && + ($_->primary_tag =~ /^_aligned_coord:/) } + $self->{'_sfc'}->features_in_range(-start=>$start, + -end=>$end, + -contain=>0, + -strandmatch=>'ignore'); + @list = map { $_->entire_seq->id } @list; + return @list; + } + + # Entire aligned sequences list + return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} }; +} + +=head2 get_seq_feat_by_tag + + Title : get_seq_feat_by_tag + Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID") + Function : + + Get a sequence feature based on its primary_tag. + When you add + + Returns : a Bio::SeqFeature object + Argument : a Bio::LocatableSeq and a string (feature primary tag) + +=cut + +sub get_seq_feat_by_tag { + my ($self,$seq,$tag) = @_; + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); + } + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + + return $self->{'_elem'}{$seqID}{'_feat'}{$tag}; +} + +=head2 get_seq_by_name + + Title : get_seq_by_name + Usage : $seq = $contig->get_seq_by_name('Seq1') + Function : Gets a sequence based on its id. + Returns : a Bio::LocatableSeq object + undef if name is not found + Argument : string + +=cut + +sub get_seq_by_name { + my $self = shift; + my ($seqID) = @_; + + unless (exists $self->{'_elem'}{$seqID}{'_seq'}) { + $self->throw("Could not find sequence $seqID in contig ".$self->id); + return undef; + } + + return $self->{'_elem'}{$seqID}{'_seq'}; +} + +=head2 get_qual_by_name + + Title : get_qual_by_name + Usage : $seq = $contig->get_qual_by_name('Seq1') + Function : + + Gets Bio::Seq::QualI object for a sequence + through its id ( as given by $qual->id() ). + + Returns : a Bio::Seq::QualI object. + undef if name is not found + Argument : string + +=cut + +sub get_qual_by_name { + my $self = shift; + my ($seqID) = @_; + + unless (exists $self->{'_elem'}{$seqID}{'_qual'}) { + $self->warn("Could not find quality for $seqID in contig!"); + return undef; + } + + return $self->{'_elem'}{$seqID}{'_qual'}; +} + +=head1 Bio::Align::AlignI compatible methods + +=head2 Modifier methods + +These methods modify the MSE by adding, removing or shuffling complete +sequences. + +=head2 add_seq + + Title : add_seq + Usage : $contig->add_seq($newseq); + Function : + + Adds a sequence to the contig. *Does* + *not* align it - just adds it to the + hashes. + + Returns : nothing + Argument : a Bio::LocatableSeq object + +See L for more information. + +=cut + +sub add_seq { + my $self = shift; + my $seq = shift; + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); + } + + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + $self->{'_elem'}{$seqID} = {} unless (exists $self->{'elem'}{$seqID}); + + if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) && + ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) { + $self->warn("Adding sequence $seqID, which has already been added"); + } + + # Our locatable sequences are always considered to be complete sequences + $seq->start(1); $seq->end($seq->length()); + + $self->warn("Adding non-nucleotidic sequence ".$seqID) + if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna'); + + # build the symbol list for this sequence, + # will prune out the gap and missing/match chars + # when actually asked for the symbol list in the + # symbol_chars + if (defined $seq->seq) { + map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); + } else { + $self->{'_symbols'} = {}; + } + + my $seq_no = ++$self->{'_nof_seqs'}; + + if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) { + $self->warn("Replacing one sequence [$seqID]\n"); + } else { + #print STDERR "Assigning $seqID to $order\n"; + $self->{'_order'}->{$seq_no} = $seqID; +# $self->{'_start_end_lists'}->{$id} = [] +# unless(exists $self->{'_start_end_lists'}->{$id}); +# push @{$self->{'_start_end_lists'}->{$id}}, $seq; + } + + $self->{'_elem'}{$seqID}{'_seq'} = $seq; + $self->{'_elem'}{$seqID}{'_feat'} = {}; + $self->{'_elem'}{$seqID}{'_gaps'} = []; + my $dbref = $self->{'_elem'}{$seqID}{'_gaps'}; + my $nofgaps = $self->_register_gaps($seq->seq,$dbref); + + # Updating residue count + $self->{'_nof_residues'} += $seq->length - $nofgaps; + + return 1; +} + +=head2 remove_seq + + Title : remove_seq + Usage : $contig->remove_seq($seq); + Function : Removes a single sequence from an alignment + Returns : 1 on success, 0 otherwise + Argument : a Bio::LocatableSeq object + +=cut + +sub remove_seq { + my ($self,$seq) = @_; + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); + } + + my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; + unless (exists $self->{'_elem'}{$seqID} ) { + $self->warn("No sequence named $seqID [$seq]"); + return 0; + } + + # Updating residue count + $self->{'_nof_residues'} -= $seq->length() + + &_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length ); + + # Remove all references to features of this sequence + my @feats = (); + foreach my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) { + push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag}); + } + $self->{'_sfc'}->remove_features(\@feats); + delete $self->{'_elem'}{$seqID}; + + return 1; +} + +=head2 purge + + Title : purge + Usage : $contig->purge(0.7); + Function: + + Removes sequences above whatever %id. + + This function will grind on large alignments. Beware! + (perhaps not ideally implemented) + + Example : + Returns : An array of the removed sequences + Argument: + + +=cut + +sub purge { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 sort_alphabetically + + Title : sort_alphabetically + Usage : $contig->sort_alphabetically + Function : + + Changes the order of the alignemnt to alphabetical on name + followed by numerical by number. + + Returns : + Argument : + +=cut + +sub sort_alphabetically { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 Sequence selection methods + +Methods returning one or more sequences objects. + +=head2 each_seq + + Title : each_seq + Usage : foreach $seq ( $contig->each_seq() ) + Function : Gets an array of Seq objects from the alignment + Returns : an array + Argument : + +=cut + +sub each_seq { + my ($self) = @_; + + my (@arr,$seqID); + + foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) { + push(@arr,$self->{'_elem'}{$seqID}{'_seq'}); + } + + return @arr; +} + +=head2 each_alphabetically + + Title : each_alphabetically + Usage : foreach $seq ( $contig->each_alphabetically() ) + Function : + + Returns an array of sequence object sorted alphabetically + by name and then by start point. + Does not change the order of the alignment + + Returns : + Argument : + +=cut + +sub each_alphabetically { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 each_seq_with_id + + Title : each_seq_with_id + Usage : foreach $seq ( $contig->each_seq_with_id() ) + Function : + + Gets an array of Seq objects from the + alignment, the contents being those sequences + with the given name (there may be more than one) + + Returns : an array + Argument : a seq name + +=cut + +sub each_seq_with_id { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 get_seq_by_pos + + Title : get_seq_by_pos + Usage : $seq = $contig->get_seq_by_pos(3) + Function : + + Gets a sequence based on its position in the alignment. + Numbering starts from 1. Sequence positions larger than + no_sequences() will thow an error. + + Returns : a Bio::LocatableSeq object + Argument : positive integer for the sequence osition + +=cut + +sub get_seq_by_pos { + my $self = shift; + my ($pos) = @_; + + $self->throw("Sequence position has to be a positive integer, not [$pos]") + unless $pos =~ /^\d+$/ and $pos > 0; + $self->throw("No sequence at position [$pos]") + unless $pos <= $self->no_sequences ; + + my $seqID = $self->{'_order'}->{--$pos}; + return $self->{'_elem'}{$seqID}{'_seq'}; +} + +=head2 Create new alignments + +The result of these methods are horizontal or vertical subsets of the +current MSE. + +=head2 select + + Title : select + Usage : $contig2 = $contig->select(1, 3) # three first sequences + Function : + + Creates a new alignment from a continuous subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::Assembly::Contig object + Argument : positive integer for the first sequence + positive integer for the last sequence to include (optional) + +=cut + +sub select { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 select_noncont + + Title : select_noncont + Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences + Function : + + Creates a new alignment from a subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::Assembly::Contig object + Args : array of integers for the sequences + +=cut + +sub select_noncont { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 slice + + Title : slice + Usage : $contig2 = $contig->slice(20, 30) + Function : + + Creates a slice from the alignment inclusive of start and + end columns. Sequences with no residues in the slice are + excluded from the new alignment and a warning is printed. + Slice beyond the length of the sequence does not do + padding. + + Returns : a Bio::Assembly::Contig object + Argument : positive integer for start column + positive integer for end column + +=cut + +sub slice { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 Change sequences within the MSE + +These methods affect characters in all sequences without changeing the +alignment. + + +=head2 map_chars + + Title : map_chars + Usage : $contig->map_chars('\.','-') + Function : + + Does a s/$arg1/$arg2/ on the sequences. Useful for gap + characters + + Notice that the from (arg1) is interpretted as a regex, + so be careful about quoting meta characters (eg + $contig->map_chars('.','-') wont do what you want) + + Returns : + Argument : 'from' rexexp + 'to' string + +=cut + +sub map_chars { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 uppercase + + Title : uppercase() + Usage : $contig->uppercase() + Function : Sets all the sequences to uppercase + Returns : + Argument : + +=cut + +sub uppercase { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 match_line + + Title : match_line() + Usage : $contig->match_line() + Function : Generates a match line - much like consensus string + except that a line indicating the '*' for a match. + Argument : (optional) Match line characters ('*' by default) + (optional) Strong match char (':' by default) + (optional) Weak match char ('.' by default) + +=cut + +sub match_line { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 match + + Title : match() + Usage : $contig->match() + Function : + + Goes through all columns and changes residues that are + identical to residue in first sequence to match '.' + character. Sets match_char. + + USE WITH CARE: Most MSE formats do not support match + characters in sequences, so this is mostly for output + only. NEXUS format (Bio::AlignIO::nexus) can handle + it. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +=cut + +sub match { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 unmatch + + Title : unmatch() + Usage : $contig->unmatch() + Function : + + Undoes the effect of method match. Unsets match_char. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +=cut + +sub unmatch { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 MSE attibutes + +Methods for setting and reading the MSE attributes. + +Note that the methods defining character semantics depend on the user +to set them sensibly. They are needed only by certain input/output +methods. Unset them by setting to an empty string (''). + +=head2 id + + Title : id + Usage : $contig->id("Ig") + Function : Gets/sets the id field of the alignment + Returns : An id string + Argument : An id string (optional) + +=cut + +sub id { + my ($self, $contig_name) = @_; + + if (defined( $contig_name )) { + $self->{'_id'} = $contig_name; + } + + return $self->{'_id'}; +} + +=head2 missing_char + + Title : missing_char + Usage : $contig->missing_char("?") + Function : Gets/sets the missing_char attribute of the alignment + It is generally recommended to set it to 'n' or 'N' + for nucleotides and to 'X' for protein. + Returns : An missing_char string, + Argument : An missing_char string (optional) + +=cut + +sub missing_char { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 match_char + + Title : match_char + Usage : $contig->match_char('.') + Function : Gets/sets the match_char attribute of the alignment + Returns : An match_char string, + Argument : An match_char string (optional) + +=cut + +sub match_char { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 gap_char + + Title : gap_char + Usage : $contig->gap_char('-') + Function : Gets/sets the gap_char attribute of the alignment + Returns : An gap_char string, defaults to '-' + Argument : An gap_char string (optional) + +=cut + +sub gap_char { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 symbol_chars + + Title : symbol_chars + Usage : my @symbolchars = $contig->symbol_chars; + Function: Returns all the seen symbols (other than gaps) + Returns : array of characters that are the seen symbols + Argument: boolean to include the gap/missing/match characters + +=cut + +sub symbol_chars{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 Alignment descriptors + +These read only methods describe the MSE in various ways. + + +=head2 consensus_string + + Title : consensus_string + Usage : $str = $contig->consensus_string($threshold_percent) + Function : Makes a strict consensus + Returns : + Argument : Optional treshold ranging from 0 to 100. + The consensus residue has to appear at least threshold % + of the sequences at a given location, otherwise a '?' + character will be placed at that location. + (Default value = 0%) + +=cut + +sub consensus_string { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 consensus_iupac + + Title : consensus_iupac + Usage : $str = $contig->consensus_iupac() + Function : + + Makes a consensus using IUPAC ambiguity codes from DNA + and RNA. The output is in upper case except when gaps in + a column force output to be in lower case. + + Note that if your alignment sequences contain a lot of + IUPAC ambiquity codes you often have to manually set + alphabet. Bio::PrimarySeq::_guess_type thinks they + indicate a protein sequence. + + Returns : consensus string + Argument : none + Throws : on protein sequences + + +=cut + +sub consensus_iupac { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 is_flush + + Title : is_flush + Usage : if( $contig->is_flush() ) + : + : + Function : Tells you whether the alignment + : is flush, ie all of the same length + : + : + Returns : 1 or 0 + Argument : + +=cut + +sub is_flush { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 length + + Title : length() + Usage : $len = $contig->length() + Function : Returns the maximum length of the alignment. + To be sure the alignment is a block, use is_flush + Returns : + Argument : + +=cut + +sub length { + my ($self) = @_; + + $self->throw_not_implemented(); +} + +=head2 maxdisplayname_length + + Title : maxdisplayname_length + Usage : $contig->maxdisplayname_length() + Function : + + Gets the maximum length of the displayname in the + alignment. Used in writing out various MSE formats. + + Returns : integer + Argument : + +=cut + +sub maxname_length { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 no_residues + + Title : no_residues + Usage : $no = $contig->no_residues + Function : number of residues in total in the alignment + Returns : integer + Argument : + +=cut + +sub no_residues { + my ($self) = @_; + + return $self->{'_nof_residues'}; +} + +=head2 no_sequences + + Title : no_sequences + Usage : $depth = $contig->no_sequences + Function : number of sequence in the sequence alignment + Returns : integer + Argument : None + +=cut + +sub no_sequences { + my ($self) = @_; + + return scalar( keys %{ $self->{'_elem'} } ); +} + +=head2 percentage_identity + + Title : percentage_identity + Usage : $id = $contig->percentage_identity + Function: The function calculates the percentage identity of the alignment + Returns : The percentage identity of the alignment (as defined by the + implementation) + Argument: None + +=cut + +sub percentage_identity{ + my ($self) = @_; + + $self->throw_not_implemeneted(); +} + +=head2 overall_percentage_identity + + Title : percentage_identity + Usage : $id = $contig->percentage_identity + Function: The function calculates the percentage identity of + the conserved columns + Returns : The percentage identity of the conserved columns + Args : None + +=cut + +sub overall_percentage_identity{ + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 average_percentage_identity + + Title : average_percentage_identity + Usage : $id = $contig->average_percentage_identity + Function: The function uses a fast method to calculate the average + percentage identity of the alignment + Returns : The average percentage identity of the alignment + Args : None + +=cut + +sub average_percentage_identity { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 Alignment positions + +Methods to map a sequence position into an alignment column and back. +column_from_residue_number() does the former. The latter is really a +property of the sequence object and can done using +L: + + # select somehow a sequence from the alignment, e.g. + my $seq = $contig->get_seq_by_pos(1); + #$loc is undef or Bio::LocationI object + my $loc = $seq->location_from_column(5); + + +=head2 column_from_residue_number + + Title : column_from_residue_number + Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber) + Function: + + This function gives the position in the alignment + (i.e. column number) of the given residue number in the + sequence with the given name. For example, for the + alignment + + Seq1/91-97 AC..DEF.GH + Seq2/24-30 ACGG.RTY.. + Seq3/43-51 AC.DDEFGHI + + column_from_residue_number( "Seq1", 94 ) returns 5. + column_from_residue_number( "Seq2", 25 ) returns 2. + column_from_residue_number( "Seq3", 50 ) returns 9. + + An exception is thrown if the residue number would lie + outside the length of the aligment + (e.g. column_from_residue_number( "Seq2", 22 ) + + Note: If the the parent sequence is represented by more than + one alignment sequence and the residue number is present in + them, this method finds only the first one. + + Returns : A column number for the position in the alignment of the + given residue in the given sequence (1 = first column) + Args : A sequence id/name (not a name/start-end) + A residue number in the whole sequence (not just that + segment of it in the alignment) + +=cut + +sub column_from_residue_number { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 Sequence names + +Methods to manipulate the display name. The default name based on the +sequence id and subsequence positions can be overridden in various +ways. + +=head2 displayname + + Title : displayname + Usage : $contig->displayname("Ig", "IgA") + Function : Gets/sets the display name of a sequence in the alignment + : + Returns : A display name string + Argument : name of the sequence + displayname of the sequence (optional) + +=cut + +sub displayname { # Do nothing +} + +=head2 set_displayname_count + + Title : set_displayname_count + Usage : $contig->set_displayname_count + Function : + + Sets the names to be name_# where # is the number of + times this name has been used. + + Returns : None + Argument : None + +=cut + +sub set_displayname_count { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 set_displayname_flat + + Title : set_displayname_flat + Usage : $contig->set_displayname_flat() + Function : Makes all the sequences be displayed as just their name, + not name/start-end + Returns : 1 + Argument : None + +=cut + +sub set_displayname_flat { # Do nothing! +} + +=head2 set_displayname_normal + + Title : set_displayname_normal + Usage : $contig->set_displayname_normal() + Function : Makes all the sequences be displayed as name/start-end + Returns : None + Argument : None + +=cut + +sub set_displayname_normal { # Do nothing! +} + +=head1 Internal Methods + +=head2 _binary_search + + Title : _binary_search + Usage : _binary_search($list,$query) + Function : + + Find a number in a sorted list of numbers. Return values + may be on or two integers. One positive integer or zero + (>=0) is the index of the element that stores the queried + value. Two positive integers (or zero and another + number) are the indexes of elements among which the + queried value should be placed. Negative single values + mean: + + -1: $query is smaller than smallest element in list + -2: $query is greater than greatest element in list + + Returns : array of integers + Argument : + $list : array reference + $query : integer + +=cut + +sub _binary_search { + my $list = shift; + my $query = shift; + # + # If there is only one element in list + if (!$#{$list} && ($query == $list->[0])) { return (0) } + # If there are others... + my $start = 0; + my $end = $#{$list}; + (&_compare($query,$list->[$start]) == 0) && do { return ($start) }; + (&_compare($query,$list->[$end]) == 0) && do { return ($end) }; + (&_compare($query,$list->[$start]) < 0) && do { return (-1) }; + (&_compare($query,$list->[$end]) > 0) && do { return (-2) }; + my $middle = 0; + while ($end - $start > 1) { + $middle = int(($end+$middle)/2); + (&_compare($query,$list->[$middle]) == 0) && return ($middle); + (&_compare($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next }; + $start = $middle; # If &_compare() > 0, move region beggining + } + return ($start,$end); +} + +=head2 _compare + + Title : _compare + Usage : _compare($arg1,$arg2) + Function: Perform numeric or string comparisons + Returns : integer (0, 1 or -1) + Args : values to be compared + +=cut + +sub _compare { + my $arg1 = shift; + my $arg2 = shift; + # + if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 } + else { return $arg1 cmp $arg2 } +} + +=head2 _nof_gaps + + Title : _nof_gaps + Usage : _nof_gaps($array_ref, $query) + Function: number of gaps found before position $query + Returns : integer + Args : + $array_ref : gap registry reference + $query : [integer] a position in a sequence + +=cut + +#' emacs... +sub _nof_gaps { + my $list = shift; + my $query = shift; + # If there are no gaps in this contig + return 0 unless (defined($list) && scalar(@{$list})); + # Locate query index in gap list (if any) + my @index = &_binary_search($list,$query); + # If after all alignments, correct using total number of align + if ($index[0] == -2) { $query = scalar(@{$list}) } + # If before any alignment, return 0 + elsif ($index[0] == -1) { $query = 0 } + elsif ($index[0] >= 0) { + # If query is between alignments, translate coordinates + if ($#index > 0) { $query = $index[0] + 1 } + # If query sits upon an alignment, do another correction + elsif ($#index == 0) { $query = $index[0] } + } + # + return $query; +} + +=head2 _padded_unpadded + + Title : _padded_unpadded + Usage : _padded_unpadded($array_ref, $query) + Function: + + Returns a coordinate corresponding to + position $query after gaps were + removed from a sequence. + + Returns : integer + Args : + $array_ref : reference to this gap registry + $query : [integer] coordionate to change + +=cut + +sub _padded_unpadded { + my $list = shift; + my $query = shift; + + my $align = &_nof_gaps($list,$query); + $query-- if (defined($list->[$align]) && ($list->[$align] == $query)); + $query = $query - $align; + # + return $query; +} + +=head2 _unpadded_padded + + Title : _unpadded_padded + Usage : _unpadded_padded($array_ref, $query) + Function: + + Returns the value corresponding to + ungapped position $query when gaps are + counted as valid sites in a sequence + + Returns : + Args : $array_ref = a reference to this sequence's gap registry + $query = [integer] location to change + +=cut + +#' +sub _unpadded_padded { + my $list = shift; + my $query = shift; + + my $align = &_nof_gaps($list,$query); + $query = $query + $align; + my $new_align = &_nof_gaps($list,$query); + while ($new_align - $align > 0) { + $query = $query + $new_align - $align; + $align = $new_align; + $new_align = &_nof_gaps($list,$query); + } + # If current position is also a align, look for the first upstream base + while (defined($list->[$align]) && ($list->[$align] == $query)) { + $query++; $align++; + } + # + return $query; +} + +=head2 _register_gaps + + Title : _register_gaps + Usage : $self->_register_gaps($seq, $array_ref) + Function: stores gap locations for a sequence + Returns : number of gaps found + Args : + $seq : sequence string + $array_ref : a reference to an array, + where gap locations will + be stored + +=cut + +sub _register_gaps { + my $self = shift; + my $sequence = shift; + my $dbref = shift; + + $self->throw("Not an aligned sequence string to register gaps") + if (ref($sequence)); + + $self->throw("Not an array reference for gap registry") + unless (ref($dbref) eq 'ARRAY'); + + # Registering alignments + @{$dbref} = (); # Cleaning registry + if (defined $sequence) { + my $i = -1; + while(1) { + $i = index($sequence,"-",$i+1); + last if ($i == -1); + push(@{$dbref},$i+1); + } + } else { +# $self->warn("Found undefined sequence while registering gaps"); + return 0; + } + + return scalar(@{$dbref}); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/ContigAnalysis.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/ContigAnalysis.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,500 @@ +# $Id: ContigAnalysis.pm,v 1.2 2002/12/01 00:03:28 jason Exp $ +# +# BioPerl module for Bio::Assembly::ContigAnalysis +# +# Cared for by Robson francisco de Souza +# +# Copyright Robson Francisco de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::ContigAnalysis - + Perform analysis on sequence assembly contigs. + +=head1 SYNOPSIS + + # Module loading + use Bio::Assembly::ContigAnalysis; + + # Assembly loading methods + my $ca = new Bio::Assembly::ContigAnalysis( -contig=>$contigOBJ ); + + my @lcq = $ca->low_consensus_quality; + my @hqd = $ca->high_quality_discrepancies; + my @ss = $ca->single_strand_regions; + +=head1 DESCRIPTION + +A contig is as a set of sequences, locally aligned to each other, when +the sequences in a pair may be aligned. It may also include a +consensus sequence. Bio::Assembly::ContigAnalysis is a module +holding a collection of methods to analyze contig objects. It was +developed around the Bio::Assembly::Contig implementation of contigs and +can not work with another contig interface. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +Email: rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::ContigAnalysis; + +use Bio::Root::Root; +use strict; +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root); + +=head1 Object creator + +=head2 new + + Title : new + Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ); + Function : Creates a new contig analysis object + Returns : Bio::Assembly::ContigAnalysis + Args : + -contig : a Bio::Assembly::Contig object + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args); + unless ($contigOBJ->isa("Bio::Assembly::Contig")) { + $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n"); + } + + $self->{'_objref'} = $contigOBJ; + return $self; +} + +=head1 Analysis methods + +=head2 high_quality_discrepancies + + Title : high_quality_discrepancies + Usage : my $sfc = $ContigAnal->high_quality_discrepancies(); + Function : + + Locates all high quality discrepancies among aligned + sequences and the consensus sequence. + + Note: see Bio::Assembly::Contig POD documentation, + section "Coordinate System", for a definition of + available types. Default coordinate system type is + "gapped consensus", i.e. consensus sequence (with gaps) + coordinates. If limits are not specified, the entire + alignment is analyzed. + + Returns : Bio::SeqFeature::Collection + Args : optional arguments are + -threshold : cutoff value for low quality (minimum high quality) + Default: 40 + -ignore : number of bases that will not be analysed at + both ends of contig aligned elements + Default: 5 + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +sub high_quality_discrepancies { + my ($self,@args) = shift; # Package reference + + my ($threshold,$ignore,$start,$end,$type) = + $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args); + + # Defining default threhold and HQD_ignore + $threshold = 40 unless (defined($threshold)); + $ignore = 5 unless (defined($ignore)); + $type = 'gapped consensus' unless (defined($type)); + + # Changing input coordinates system (if needed) + if (defined $start && $type ne 'gapped consensus') { + $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start); + } elsif (!defined $start) { + $start = 1; + } + if (defined $end && $type ne 'gapped consensus') { + $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end); + } elsif (!defined $end) { + $end = $self->{'_objref'}->get_consensus_length(); + } + + # Scanning each read sequence and the contig sequence and + # adding discrepancies to Bio::SeqFeature::Collection + my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start, + -end=>$end, + -type=>$type); + my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq; + + my @HQD = (); + foreach my $seqID (@seqIDs) { + # Setting aligned read sub-sequence limits and loading data + my $seq = $self->{'_objref'}->get_seq_by_name($seqID); + my $qual = $self->{'_objref'}->get_qual_by_name($seqID); + unless (defined $qual) { + $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID"); + next; + } + my $sequence = $seq->seq; + my @quality = @{ $qual->qual }; + + # Scanning the aligned region of each read + my $seq_ix = 0; + my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID"); + my ($astart,$aend) = ($coord->start,$coord->end); + $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start) + $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end) + + my ($d_start,$d_end,$i); + for ($i=$astart-1; $i<=$aend-1; $i++) { + # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix) + $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1); + next unless (($i >= $start) && ($i <= $end)); + + my $r_base = uc(substr($sequence,$seq_ix-1,1)); + my $c_base = uc(substr($consensus,$i,1)); + + # Discrepant region start: store $d_start and $type + (!defined($d_start) && + ($r_base ne $c_base) && + ($quality[$seq_ix-1] >= $threshold)) && do { + $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1); + #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n"; + next; + }; + + # Quality change or end of discrepant region: store limits and undef $d_start + if (defined($d_start) && + (($quality[$seq_ix-1] < $threshold) || + (uc($r_base) eq uc($c_base)))) { + $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n"; + push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID", + -start=>$d_start, + -end=>$d_end, + -strand=>$seq->strand()) ); + $d_start = undef; + next; + } + } # for ($i=$astart-1; $i<=$aend-1; $i++) + + # Loading discrepancies located at sub-sequence end, if any. + if (defined($d_start)) { + $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID", + -start=>$d_start, + -end=>$d_end, + -strand=>$seq->strand()) ); + } + } # foreach my $seqID (@seqIDs) + + return @HQD; +} + +=head2 low_consensus_quality + + Title : low_consensus_quality + Usage : my $sfc = $ContigAnal->low_consensus_quality(); + Function : Locates all low quality regions in the consensus + Returns : an array of Bio::SeqFeature::Generic objects + Args : optional arguments are + -threshold : cutoff value for low quality (minimum high quality) + Default: 25 + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +sub low_consensus_quality { + my ($self,@args) = shift; # Packege reference + + my ($threshold,$start,$end,$type) = + $self->_rearrange([qw(THRESHOLD START END TYPE)],@args); + + # Setting default value for threshold + $threshold = 25 unless (defined($threshold)); + + # Loading qualities + my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual }; + + # Changing coordinates to gap mode noaln (consed: consensus without alignments) + $start = 1 unless (defined($start)); + if (defined $start && defined $type && ($type ne 'gapped consensus')) { + $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start); + $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); + } + $end = $self->{'_objref'}->get_consensus_length unless (defined $end); + + # Scanning @quality vector and storing intervals limits with base qualities less then + # the threshold value + my ($lcq_start); + my ($i,@LCQ); + for ($i=$start-1; $i<=$end-1; $i++) { +# print $quality[$i],"\t",$i,"\n"; + if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) { + $lcq_start = $i+1; + } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) { + $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start); + my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start, + -end=>$lcq_end, + -primary=>'low_consensus_quality') ); + $lcq_start = undef; + } + } + + if (defined $lcq_start) { + $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start); + my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start, + -end=>$lcq_end, + -primary=>'low_consensus_quality') ); + } + + return @LCQ; +} + +=head2 not_confirmed_on_both_strands + + Title : low_quality_consensus + Usage : my $sfc = $ContigAnal->low_quality_consensus(); + Function : + + Locates all regions whose consensus bases were not + confirmed by bases from sequences aligned in both + orientations, i.e., in such regions, no bases in aligned + sequences of either +1 or -1 strand agree with the + consensus bases. + + Returns : an array of Bio::SeqFeature::Generic objects + Args : optional arguments are + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +sub not_confirmed_on_both_strands { + my ($self,@args) = shift; # Package reference + + my ($start,$end,$type) = + $self->_rearrange([qw(START END TYPE)],@args); + + # Changing coordinates to default system 'align' (contig sequence with alignments) + $start = 1 unless (defined($start)); + if (defined($type) && ($type ne 'gapped consensus')) { + $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start); + $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); + } + $end = $self->{'_objref'}->get_consensus_length unless (defined($end)); + + # Scanning alignment + my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand + my ($i); + my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq; + foreach my $seqID ($self->{'_objref'}->get_seq_ids) { + # Setting aligned read sub-sequence limits and loading data + my $seq = $self->{'_objref'}->get_seq_by_name($seqID); + my $sequence = $seq->seq; + + # Scanning the aligned regions of each read and registering confirmed sites + my $contig_ix = 0; + my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID"); + my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand); + $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart); + $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend); + for ($i=$astart-1; $i<=$aend-1; $i++) { + # $i+1 in 'align' mode is $contig_ix + $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1); + next unless (($contig_ix >= $start) && ($contig_ix <= $end)); + my $r_base = uc(substr($sequence,$i,1)); + my $c_base = uc(substr($consensus,$contig_ix-1,1)); + if ($c_base eq '-') { + $confirmed{$orientation}[$contig_ix] = -1; + } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found + $confirmed{$orientation}[$contig_ix]++; + } + } # for ($i=$astart-1; $i<=$aend-1; $i++) + } # foreach $seqID (@reads) + + # Locating non-confirmed aligned regions for each orientation in $confirmed registry + my ($orientation); + my @NCBS = (); + foreach $orientation (keys %confirmed) { + my ($ncbs_start,$ncbs_end); + + for ($i=$start; $i<=$end; $i++) { + if (!defined($ncbs_start) && + (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) { + $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + } elsif (defined($ncbs_start) && + defined($confirmed{$orientation}[$i]) && + ($confirmed{$orientation}[$i] > 0)) { + $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1); + push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start, + -end=>$ncbs_end, + -strand=>$orientation, + -primary=>"not_confirmed_on_both_strands") ); + $ncbs_start = undef; + } + } + + if (defined($ncbs_start)) { # NCBS at the end of contig + $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end); + push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start, + -end=>$ncbs_end, + -strand=>$orientation, + -primary=>'not_confirmed_on_both_strands') ); + } + } + + return @NCBS; +} + +=head2 single_strand + + Title : single_strand + Usage : my $sfc = $ContigAnal->single_strand(); + Function : + + Locates all regions covered by aligned sequences only in + one of the two strands, i.e., regions for which aligned + sequence's strand() method returns +1 or -1 for all + sequences. + + Returns : an array of Bio::SeqFeature::Generic objects + Args : optional arguments are + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +#' +sub single_strand { + my ($self,@args) = shift; # Package reference + + my ($start,$end,$type) = + $self->_rearrange([qw(START END TYPE)],@args); + + # Changing coordinates to gap mode align (consed: consensus sequence with alignments) + $type = 'gapped consensus' unless(defined($type)); + $start = 1 unless (defined($start)); + if (defined($type) && $type ne 'gapped consensus') { + $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start); + $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); + } + ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end)); + + # Loading complete list of coordinates for aligned sequences + my $sfc = $self->{'_objref'}->get_features_collection(); + my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ } + $sfc->features_in_range(-start=>$start, + -end=>$end, + -contain=>0, + -strand=>1, + -strandmatch=>'strong'); + my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ } + $sfc->features_in_range(-start=>$start, + -end=>$end, + -contain=>0, + -strand=>-1, + -strandmatch=>'strong'); + # Merging overlapping features + @forward = $self->_merge_overlapping_features(@forward); + @reverse = $self->_merge_overlapping_features(@reverse); + + # Finding single stranded regions + my ($length) = $self->{'_objref'}->get_consensus_length; + $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length); + @forward = $self->_complementary_features_list(1,$length,@forward); + @reverse = $self->_complementary_features_list(1,$length,@reverse); + + my @SS = (); + foreach my $feat (@forward, @reverse) { + $feat->primary_tag('single_strand_region'); + push(@SS,$feat); + } + + return @SS; +} + +=head1 Internal Methods + +=head2 _merge_overlapping_features + + Title : _merge_overlapping_features + Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features); + Function : Merge all overlapping features into features + that hold original features as sub-features + Returns : array of Bio::SeqFeature::Generic objects + Args : array of Bio::SeqFeature::Generic objects + +=cut + +sub _merge_overlapping_features { + my ($self,@feat) = @_; + + $self->throw_not_implemented(); +} + +=head2 _complementary_features_list + + Title : _complementary_features_list + Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features); + Function : Build a list of features for regions + not covered by features in @features array + Returns : array of Bio::SeqFeature::Generic objects + Args : + $start : [integer] start of first output feature + $end : [integer] end of last output feature + @features : array of Bio::SeqFeature::Generic objects + +=cut + +sub _complementary_features_list { + my ($self,$start,$end,@feat) = @_; + + $self->throw_not_implemented(); +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/IO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/IO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,229 @@ +# $Id: IO.pm,v 1.1 2002/11/04 11:47:49 heikki Exp $ +# +# BioPerl module for Bio::Assembly::IO +# +# based on the Bio::SeqIO module +# by Ewan Birney +# and Lincoln Stein +# +# Copyright Robson Francisco de Souza +# +# You may distribute this module under the same terms as perl itself +# +# _history + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::IO - Handler for Assembly::IO Formats + +=head1 SYNOPSIS + + use Bio::Assembly::IO; + + $in = Bio::Assembly::IO->new(-file=>"'phrap'); + $out = Bio::Assembly::IO->new(-file=>">outputfilename", + -format=>'phrap'); + + while ( my $seq = $in->next_seq() ) { + $out->write_seq($seq); + } + +=head1 DESCRIPTION + +Bio::Assembly::IO is a handler module for formats in the Assembly::IO set +(e.g. Bio::Assembly::IO::phrap). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Robson Francisco de Souza + +E-mail: rfsouza@citri.iq.usp.br + +=head1 CONTRIBUTORS + +# + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::IO; + +use Bio::Root::Root; +use Bio::Root::IO; + +use strict; +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +=head2 new + + Title : new + Usage : Bio::Assembly::IO->new(-file =>$filename,-format=>'format') + Function: Returns a new assembly stream + Returns : A Bio::Assembly::IO::Handler initialised + with the appropriate format + Args : -file => $filename + -format => format + +=cut + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::Assembly::IO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + + $class->throw("Need at least a file name to proceed!") + unless (defined $param{'-file'} || defined $ARGV[0]); + + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ); + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::Assembly::IO::$format"->new(@args); + } +} + +# _initialize is chained for all SeqIO classes + +sub _initialize { + my($self, @args) = @_; + # initialize the IO part + $self->_initialize_io(@args); +} + +=head2 next_assembly + + Title : next_assembly + Usage : $cluster = $stream->next_assembly() + Function: Reads the next assembly object from the stream and returns it. + Returns : a Bio::Assembly::ScaffoldI compliant object + Args : none + +=cut + +sub next_assembly { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::Assembly::IO object."); +} + + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL Assembly::IO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self,$format) = @_; + my $module = "Bio::Assembly::IO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <_guess_format($filename) + Function: guess format based on file suffix + Example : + Returns : guessed format of filename (lower case) + Args : + Notes : formats that _filehandle() will guess includes + only phrap, by now. + +=cut + +sub _guess_format { + my $class = shift; + my $arg = shift; + + return unless defined($arg); + return 'ace' if ($arg =~ /\.ace\.\d+$/i); + return 'phrap' if ($arg =~ /\.phrap\.out$/i); +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +# I need some direction on these!! The module works so I haven't fiddled with them! +# Me neither! (rfsouza) + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'seqio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'seqio'}->next_seq() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'seqio'}->next_seq(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'seqio'}->write_seq(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/IO/ace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/IO/ace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,383 @@ +# $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ +# +## BioPerl module for Bio::Assembly::IO::ace +# +# Copyright by Robson F. de Souza +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::IO::ace - module to load phrap ACE files. + +=head1 SYNOPSYS + + # Building an input stream + use Bio::Assembly::IO; + + # Assembly loading methods + $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1", + -format=>"ace"); + + $assembly = $io->next_assembly; + +=head1 DESCRIPTION + +This package loads the ACE files from the (phred/phrap/consed) package +by Phill Green. It was written to be used as a driver module for +Bio::Assembly::IO input/output. + +=head2 Implemention + +Assemblies are loaded into Bio::Assembly::Scaffold objects composed by +Bio::Assembly::Contig objects. In addition to default +"_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig +objects loaded by this module will have the following special feature +classes in their feature collection: + +"_align_clipping:$seqID" : location of subsequence in sequence $seqID + which is aligned to the contig + +"_quality_clipping:$seqID" : location of good quality subsequence for + sequence $seqID + +"consensus tags", as they are called in Consed's documentation, is +equivalent to a bioperl sequence feature and, therefore, are added to +the feature collection using their type field (see Consed's README.txt +file) as primary tag. + +"read tags" are also sequence features and are stored as +sub_SeqFeatures of the sequence's coordinate feature (the +corresponding "_aligned_coord:$seqID" feature, easily accessed through +get_seq_coord() method). + +"whole assembly tags" have no start and end, as they are not +associated to any particular sequence in the assembly, and are added +to the assembly's annotation collection using phrap as tag. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +Email rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::IO::ace; + +use strict; +use vars qw(@ISA); + +use Bio::Assembly::IO; +use Bio::Assembly::Scaffold; +use Bio::Assembly::Contig; +use Bio::LocatableSeq; +use Bio::Annotation::SimpleValue; +use Bio::Seq::PrimaryQual; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Assembly::IO); + +=head1 Parser methods + +=head2 next_assembly + + Title : next_assembly + Usage : $unigene = $stream->next_assembly() + Function: returns the next assembly in the stream + Returns : Bio::Assembly::Scaffold object + Args : NONE + +=cut + +sub next_assembly { + my $self = shift; # Object reference + local $/="\n"; + + # Resetting assembly data structure + my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); + + # Looping over all ACE file lines + my ($contigOBJ,$read_name); + my $read_data = {}; # Temporary holder for read data + while ($_ = $self->_readline) { # By now, ACE files hold a single assembly + chomp; + + # Loading assembly information (ASsembly field) +# (/^AS (\d+) (\d+)/) && do { +# $assembly->_set_nof_contigs($1); +# $assembly->_set_nof_sequences_in_contigs($2); +# }; + + # Loading contig sequence (COntig sequence field) + (/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found! + my $contigID = $1; + $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID); +# $contigOBJ->set_nof_bases($2); # Contig length in base pairs +# $contigOBJ->set_nof_reads($3); # Number of reads in this contig +# $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly + my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default) + $contigOBJ->strand($ori); + my $consensus_sequence = undef; + while ($_ = $self->_readline) { # Looping over contig lines + chomp; # Drop (\n) on current line + last if (/^$/); # Stop if empty line (contig end) is found + s/\*/-/g; # Forcing '-' as gap symbol + $consensus_sequence .= $_; + } + + my $consensus_length = length($consensus_sequence); + $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence, + -start=>1, + -end=>$consensus_length, + -id=>$contigID); + $contigOBJ->set_consensus_sequence($consensus_sequence); + $assembly->add_contig($contigOBJ); + }; + + # Loading contig qualities... (Base Quality field) + /^BQ/ && do { + my $consensus = $contigOBJ->get_consensus_sequence()->seq(); + my ($i,$j,@tmp); + my @quality = (); + $j = 0; + while ($_ = $self->_readline) { + chomp; + last if (/^$/); + @tmp = grep { /^\d+$/ } split(/\s+/); + $i = 0; + my $previous = 0; + my $next = 0; + while ($i<=$#tmp) { + # IF base is a gap, quality is the average for neighbouring sites + if (substr($consensus,$j,1) eq '-') { + $previous = $tmp[$i-1] unless ($i == 0); + if ($i < $#tmp) { + $next = $tmp[$i+1]; + } else { + $next = 0; + } + push(@quality,int(($previous+$next)/2)); + } else { + push(@quality,$tmp[$i]); + $i++; + } + $j++; + } + } + + my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality), + -id=>$contigOBJ->id()); + $contigOBJ->set_consensus_quality($qual); + }; + + # Loading read info... (Assembled From field) + /^AF (\S+) (C|U) (-*\d+)/ && do { + $read_name = $1; my $ori = $2; + $read_data->{$read_name}{'padded_start'} = $3; # aligned start + $ori = ( $ori eq 'U' ? 1 : -1); + $read_data->{$read_name}{'strand'} = $ori; + }; + + # Loading base segments definitions (Base Segment field) +# /^BS (\d+) (\d+) (\S+)/ && do { +# if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) { +# $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2; +# } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 } +# }; + + # Loading reads... (ReaD sequence field + /^RD (\S+) (-*\d+) (\d+) (\d+)/ && do { + $read_name = $1; + $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases + $read_data->{$read_name}{'contig'} = $contigOBJ; +# $read_data->{$read_name}{'number_of_read_info_items'} = $3; +# $read_data->{$read_name}{'number_of_tags'} = $4; + my $read_sequence; + while ($_ = $self->_readline) { + chomp; + last if (/^$/); + s/\*/-/g; # Forcing '-' as gap symbol + $read_sequence .= $_; # aligned read sequence + } + my $read = Bio::LocatableSeq->new(-seq=>$read_sequence, + -start=>1, + -end=>$read_data->{$read_name}{'length'}, + -strand=>$read_data->{$read_name}{'strand'}, + -id=>$read_name, + -primary_id=>$read_name, + -alphabet=>'dna'); + + # Adding read location and sequence to contig ("gapped consensus" coordinates) + my $padded_start = $read_data->{$read_name}{'padded_start'}; + my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1; + my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start, + -end=>$padded_end, + -strand=>$read_data->{$read_name}{'strand'}, + -tag => { 'contig' => $contigOBJ->id } + ); + $contigOBJ->set_seq_coord($coord,$read); + }; + + # Loading read trimming and alignment ranges... + /^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do { + my $qual_start = $1; my $qual_end = $2; + my $align_start = $3; my $align_end = $4; + + unless ($align_start == -1 && $align_end == -1) { + $align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start); + $align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end); + my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start, + -end=>$align_end, + -strand=>$read_data->{$read_name}{'strand'}, + -primary=>"_align_clipping:$read_name"); + $align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) ); + $contigOBJ->add_features([ $align_feat ], 0); + } + + unless ($qual_start == -1 && $qual_end == -1) { + $qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start); + $qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end); + my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start, + -end=>$qual_end, + -strand=>$read_data->{$read_name}{'strand'}, + -primary=>"_quality_clipping:$read_name"); + $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) ); + $contigOBJ->add_features([ $qual_feat ], 0); + } + }; + + # Loading read description (DeScription fields) +# /^DS / && do { +# /CHEM: (\S+)/ && do { +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1; +# }; +# /CHROMAT_FILE: (\S+)/ && do { +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1; +# }; +# /DIRECTION: (\w+)/ && do { +# my $ori = $1; +# if ($ori eq 'rev') { $ori = 'C' } +# elsif ($ori eq 'fwd') { $ori = 'U' } +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori; +# }; +# /DYE: (\S+)/ && do { +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1; +# }; +# /PHD_FILE: (\S+)/ && do { +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1; +# }; +# /TEMPLATE: (\S+)/ && do { +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1; +# }; +# /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do { +# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1; +# }; +# }; + + # Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features) + /^CT\s*\{/ && do { + my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline); + $contigID =~ s/^Contig//i; + my $extra_info = undef; + while ($_ = $self->_readline) { + last if (/\}/); + $extra_info .= $_; + } + my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start, + -end=>$end, + -primary=>$type, + -tag=>{ 'source' => $source, + 'creation_date' => $date, + 'extra_info' => $extra_info + }); + $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1); + }; + + # Loading read tags + /^RT\s*\{/ && do { + my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline); + my $extra_info = undef; + while ($_ = $self->_readline) { + last if (/\}/); + $extra_info .= $_; + } + $start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start); + $end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end); + my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start, + -end=>$end, + -primary=>$type, + -tag=>{ 'source' => $source, + 'creation_date' => $date, + 'extra_info' => $extra_info + }); + my $contig = $read_data->{$readID}{'contig'}; + my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) ); + $coord->add_sub_SeqFeature($read_tag); + }; + + # Loading read tags + /^WA\s*\{/ && do { + my ($type,$source,$date) = split(' ',$self->_readline); + my $extra_info = undef; + while ($_ = $self->_readline) { + last if (/\}/); + $extra_info = $_; + } +#? push(@line,\@extra_info); + my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source, + "DATE:",$date,"DATA:",$extra_info); + $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag); + $assembly->annotation->add_Annotation('phrap',$assembly_tag); + }; + + } # while ($_ = $self->_readline) + + $assembly->update_seq_list(); + return $assembly; +} + +=head2 write_assembly + + Title : write_assembly + Usage : $ass_io->write_assembly($assembly) + Function: Write the assembly object in Phrap compatible ACE format + Returns : 1 on success, 0 for error + Args : A Bio::Assembly::Scaffold object + +=cut + +sub write_assemebly { + my $self = shift; + + $self->throw("Writing phrap ACE files is not implemented yet! Sorry..."); +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/IO/phrap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/IO/phrap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,313 @@ +# $Id: phrap.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ +# +# BioPerl driver for phrap.out file +# +# Copyright by Robson F. de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::IO::phrap - driver to load phrap.out files. + +=head1 SYNOPSYS + + # Building an input stream + use Bio::Assembly::IO; + + # Assembly loading methods + $io = new Bio::Assembly::IO(-file=>"SGC0-424.phrap.out", + -format=>"phrap"); + + $assembly = $io->next_assembly; + +=head1 DESCRIPTION + +This package was developed to load the phrap.out files from the +(phred/phrap/consed) package by Phill Green. This files contain just +the messages printed to standard out by phrap when building an +assembly. This output is redirected by phredPhrap perl-script to a +file in the project's directory and hold some bit of information +regarding assembly quality, connections between contigs and clone's +position inside contigs. It should be noted that such files have no +data about the sequence. neither for contig consensus nor for any +aligned sequence. Anyway, such information may be loaded from Fasta +files in the projects directory and added to the assembly object +later. + +Note that, because no sequence is loaded for the contig consensus and +locations for aligned sequences are only given in "ungapped consensus" +coordinates in a phrap.out file, you can't make coordinate changes in +assemblies loaded by pharp.pm, unless you add an aligned +coordinates for each sequence to each contig's features collection +yourself. See L and +L.. + +This driver also loads singlets into the assembly contigs as Bio::Seq +objects, altough without their sequence strings. It also adds a +feature for the entire sequence, thus storing the singlet length in +its end position, and adds a tag '_nof_trimmed_nonX' to the feature, +which stores the number of non-vector bases in the singlet. + +=head2 Implementation + +Assemblies are loaded into Bio::Assembly::Scaffold objects composed by +Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig +"_aligned_coord:$seqID" feature class, therefore you can't make +coordinate changes in contigs loaded by this module. Contig objects +created by this module will have the following special feature +classes, identified by their primary tags, in their features +collection: + +"_main_contig_feature:$ID" : main feature for contig $ID. This + feature is used to store information + about the entire consensus + sequence. This feature always start at + base 1 and its end position is the + consensus sequence length. A tag, + 'trimmed_length' holds the length of the + trimmed good quality region inside the + consensus sequence. + +"_covered_region:$index" : coordinates for valid clones inside the + contig. $index is the covered region + number, starting at 1 for the covered + region closest to the consensus sequence + first base. + +"_unalign_coord:$seqID" : location of a sequence in "ungapped + consensus" coordinates (consensus + sequence without gaps). Primary and + secondary scores, indel and + substitutions statistics are stored as + feature tags. + +"_internal_clones:$cloneID" : clones inside contigs $cloneID should be + used as the unique id for each + clone. These features have six tags: + '_1st_name', which is the id of the + upstream (5') aligned sequence + delimiting the clone; '_1st_strand', the + upstream sequence strand in the + alignment; '_2nd_name', downstream (3') + sequence id; '_2nd_strand', the + downstream sequence strand in the + alignment; '_length', unaligned clone + length; '_rejected', a boolean flag, + which is false if the clone is valid and + true if it was rejected. + +All coordinates for the features above are expressed as "ungapped +consensus" coordinates (See L.. + +=head2 Feature collection + +# + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 AUTHOR - Robson Francisco de Souza + +Email rfsouza@citri.iq.usp.br + +head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::IO::phrap; + +use strict; +use vars qw(@ISA); + +use Bio::Assembly::IO; +use Bio::Assembly::Scaffold; +use Bio::Assembly::Contig; +use Bio::LocatableSeq; +use Bio::Seq; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Assembly::IO); + +=head2 next_assembly + + Title : next_assembly + Usage : $unigene = $stream->next_assembly() + Function: returns the next assembly in the stream + Returns : Bio::Assembly::Scaffold object + Args : NONE + +=cut + +sub next_assembly { + my $self = shift; # Package reference + + # Resetting assembly data structure + my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); + + # Looping over all phrap out file lines + my ($contigOBJ); + while ($_ = $self->_readline) { + chomp; + + # Loading exact dupicated reads list +# /Exact duplicate reads:/ && do { +# my @exact_dupl; +# while () { +# last if (/^\s*$/); +# /(\S+)\s+(\S+)/ && do { +# push(@exact_dupl,[$1,$2]); +# }; +# $self->{'assembly'}{'exact_dupl_reads'} = +# new Data::Table(\@exact_dupl,['included','excluded'],0); +# } +# }; + + # Loading singlets reads data + /^(\d+) isolated singletons/ && do { + while ($_ = $self->_readline) { + chomp; + last if (/^$/); + if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) { + my $seqID = $1; my $length = $2; + my $nof_trimmed_nonX = $3; + my $seq = new Bio::Seq(-strand=>1, + -primary_id=>$seqID); + my $f = Bio::SeqFeature::Generic->new + (-start=>1, -end=>$seq->length(), + -primary=>$seq->primary_id(), + -tag=>{ '_nof_trimmed_nonX' => $nof_trimmed_nonX } + ); + $seq->add_SeqFeature($f); + $Assembly->add_singlet($seq); + } + } + }; + + # Loading contig information + /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do { + my $nof_reads = $2; my $length = $3; my $trimmed_length = $4; + $contigOBJ = Bio::Assembly::Contig->new(-id=>$1, -source=>'phrap'); + my $feat = Bio::SeqFeature::Generic->new(-start=>1, + -end=>$length, + -primary=>"_main_contig_feature:".$contigOBJ->id(), + -tag=>{ '_trimmed_length' => $trimmed_length } + ); + $contigOBJ->add_features([ $feat ],1); + $Assembly->add_contig($contigOBJ); + }; + + # Loading read information + /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do { + my $strand = ($1 eq 'C' ? -1 : 1); + my $readID = $4; my $start = $2; my $end = $3; + my $primary_score = $5; my $secondary_score = $6; + my $substitutions = $7; my $deletions = $8; my $insertions = $9; + my $seq = Bio::LocatableSeq->new(-start=>$start, + -end=>$end, + -strand=>$strand, + -id=>$readID, + -primary_id=>$readID, + -alphabet=>'dna'); + my $unalign_coord = Bio::SeqFeature::Generic->new(-start=>$start, + -end=>$end, + -primary=>"_unalign_coord:$readID", + -tag=>{'_primary_score'=>$primary_score, + '_secondary_score'=>$secondary_score, + '_substitutions'=>$substitutions, + '_insertions'=>,$insertions, + '_deletions'=>$deletions } + ); + $unalign_coord->attach_seq($seq); + $contigOBJ->add_seq($seq); $contigOBJ->add_features([ $unalign_coord ]); + }; + + # Loading INTERNAL clones description + /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do { + my $contigID = $1; + my $contig = $Assembly->get_contig_by_id($contigID); + while ($_ = $self->_readline) { + my (@data,$rejected,$c1_strand,$c2_strand); + + (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do { + if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 } + $c1_strand = ($data[1] eq 'C' ? -1 : 1); + $c2_strand = ($data[3] eq 'C' ? -1 : 1); + (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/; + my $clone = Bio::SeqFeature::Generic->new(-start=>$data[6], + -end=>$data[7], + -strand=>0, + -primary=>"_internal_clone:$clone_name", + -tag=>{'_1st_strand'=>,$c1_strand, + '_2nd_strand'=>,$c2_strand, + '_1st_name'=>$data[2], + '_2nd_name'=>$data[4], + '_length'=>$data[5], + '_rejected'=>$rejected + } + ); + $contig->add_features([ $clone ]); + }; + + /Covered regions:/ && do { + my %coord = /(\d+)/g; my $i = 0; + foreach my $start (sort { $a <=> $b } keys %coord) { + my $cov = Bio::SeqFeature::Generic->new(-start=>$start, + -end=>$coord{$start}, + -primary=>'_covered_region:'.++$i + ); + # 1: attach feature to contig consensus, if any + $contig->add_features([ $cov ],1); + } + last; # exit while loop + }; # /Covered regions:/ + + } # while ($_ = $self->_readline) + }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ + + } # while ($_ = $self->_readline) + return $Assembly; +} + +=head2 write_assembly + + Title : write_assembly + Usage : $ass_io->write_assembly($assembly) + Function: Write the assembly object in Phrap compatible ACE format + Returns : 1 on success, 0 for error + Args : A Bio::Assembly::Scaffold object + +=cut + +sub write_assemebly { + my $self = shift; + + $self->throw("Writing phrap.out files is not implemented yet! Sorry..."); +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/Scaffold.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/Scaffold.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,615 @@ +# $Id: Scaffold.pm,v 1.2 2002/11/11 18:16:30 lapp Exp $ +# +# BioPerl module for Bio::Assembly::Scaffold +# +# Copyright by Robson F. de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly data. + +=head1 SYNOPSYS + + # Module loading + use Bio::Assembly::IO; + + # Assembly loading methods + my $aio = new Bio::Assembly::IO(-file=>"test.ace.1", -format=>'phrap'); + my $assembly = $aio->next_assembly; + + foreach my $contig ($assembly->all_contigs) { + # do something... (see Bio::Assembly::Contig) + } + +=head1 DESCRIPTION + +Bio::Assembly::Scaffold was developed to store and manipulate data +from sequence assembly programs like Phrap. It implements the +ScaffoldI interface and intends to be generic enough to be used by +Bio::Assembly::IO drivers written to programs other than Phrap. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::Scaffold; + +use strict; +use vars qw(@ISA $VERSION); + +use Bio::Root::Root; +use Bio::Assembly::ScaffoldI; +use Bio::Annotation::Collection; + +$VERSION = '0.0.1'; +@ISA = qw(Bio::Root::Root Bio::Assembly::ScaffoldI); + +=head2 new () + + Title : new + Usage : $assembly = new (-source=>'program_name', + -contigs=>\@contigs, + -id=>"assembly 1"); + Function: creates a new assembly object + Returns : + Args : + -source : [string] sequence assembly program + -contigs : reference to array of + Bio::Assembly::Contig objects + -id : [string] assembly name + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($src,$contigs,$id) = $self->_rearrange([qw(SOURCE CONTIGS ID)], @args); + + $self->{'_contigs'} = {}; + $self->{'_singlets'} = {}; + $self->{'_seqs'} = {}; + $self->{'_annotation'} = Bio::Annotation::Collection->new(); + $self->{'_id'} = 'NoName'; + + if (defined $contigs && ref($contigs = 'ARRAY')) { + foreach my $contig (@{$contigs}) { + $self->add_contig($contig); + } + } + + $self->{'_id'} = $id if (defined $id); + + return $self; +} + +=head1 Accessing general assembly data + +=cut + +=head2 id + + Title : id + Usage : $assembly->id() + Function: Get/Set assembly ID + Returns : string or undef + Args : string + +=cut + +sub id { + my $self = shift; + my $id = shift; + + $self->{'_id'} = $id if (defined $id); + + return $self->{'_id'}; +} + +=head2 annotation + + Title : annotation + Usage : $assembly->annotation() + Function: Get/Set assembly annotation object + Returns : Bio::Annotation::Collection + Args : none + +=cut + +sub annotation { + my ($self,$ref) = shift; + + $self->{'_annotation'} = $ref if (defined $ref); + return $self->{'_annotation'}; +} + +=head2 get_nof_contigs + + Title : get_nof_contigs + Usage : $assembly->get_nof_contigs() + Function: Get the number of contigs included in the assembly + Returns : integer + Args : none + +=cut + +sub get_nof_contigs { + my $self = shift; + + return scalar( $self->get_contig_ids() ); +} + +=head2 get_nof_sequences_in_contigs + + Title : get_nof_sequences_in_contigs + Usage : $assembly->get_nof_sequences_in_contigs() + Function: + + Get the number of sequences included in the + assembly. This number refers only to the sequences used + to build contigs in this assembly. It does not includes + contig consensus sequences or singlets. + + Returns : integer + Args : none + +=cut + +sub get_nof_sequences_in_contigs { + my $self = shift; + + my $nof_seqs = 0; + foreach my $contig ($self->all_contigs) { + $nof_seqs += scalar( $contig->get_seq_ids() ); + } + + return $nof_seqs; +} + +=head2 get_nof_singlets + + Title : nof_singlets + Usage : $assembly->nof_singlets() + Function: Get the number of singlets included in the assembly + Returns : integer + Args : none + +=cut + +sub get_nof_singlets { + my $self = shift; + + return scalar( $self->get_singlet_ids() ); +} + +=head2 get_seq_ids + + Title : get_seq_ids + Usage : $assembly->get_seq_ids() + Function: + + Get the ID of sequences from all contigs. This list + refers only to the aligned sequences in contigs. It does + not includes contig consensus sequences or singlets. + + Returns : array of strings + Args : none + +=cut + +sub get_seq_ids { + my $self = shift; + + return keys %{ $self->{'_seqs'} }; +} + +=head2 get_contig_ids + + Title : get_contig_ids + Usage : $assembly->get_contig_ids() + Function: Access list of contig IDs from assembly + Returns : an array, if there are any contigs in the + assembly. An empty array otherwise + Args : none + +=cut + +sub get_contig_ids { + my $self = shift; + + return sort keys %{$self->{'_contigs'}}; +} + +=head2 get_singlet_ids + + Title : get_singlet_ids + Usage : $assembly->get_singlet_ids() + Function: Access list of singlet IDs from assembly + Returns : array of strings if there are any singlets + otherwise an empty array + Args : none + +=cut + +sub get_singlet_ids { + my $self = shift; + + return sort keys %{$self->{'_singlets'}}; +} + +=head2 get_seq_by_id + + Title : get_seq_by_id + Usage : $assembly->get_seq_by_id($id) + Function: + + Get a reference for an aligned sequence + This sequence must be part of a contig + in the assembly. + + Returns : a Bio::LocatableSeq object + undef if sequence $id is not found + in any contig + Args : [string] sequence identifier (id) + +=cut + +sub get_seq_by_id { + my $self = shift; + my $seqID = shift; + + return undef unless (exists $self->{'_seqs'}{$seqID}); + + return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID); +} + +=head2 get_contig_by_id + + Title : get_contig_by_id + Usage : $assembly->get_contig_by_id($id) + Function: Get a reference for a contig + Returns : a Bio::Assembly::Contig object or undef + Args : [string] contig unique identifier (ID) + +=cut + +sub get_contig_by_id { + my $self = shift; + my $contigID = shift; + + return undef unless (exists $self->{'_contigs'}{$contigID}); + + return $self->{'_contigs'}{$contigID}; +} + +=head2 get_singlet_by_id + + Title : get_singlet_by_id + Usage : $assembly->get_singlet_by_id() + Function: Get a reference for a singlet + Returns : Bio::PrimarySeqI object or undef + Args : [string] a singlet ID + +=cut + +sub get_singlet_by_id { + my $self = shift; + + my $singletID = shift; + + return undef unless (exists $self->{'_singlets'}{$singletID}); + + return $self->{'_singlets'}{$singletID}; +} + +=head1 Modifier methods + +=cut + +=head2 add_contig + + Title : add_contig + Usage : $assembly->add_contig($contig) + Function: Add a contig to the assembly + Returns : 1 on success + Args : a Bio::Assembly::Contig object + order (optional) + +=cut + +sub add_contig { + my $self = shift; + my $contig = shift; + + if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) { + $self->throw("Unable to process non Bio::Assembly::Contig object [", ref($contig), "]"); + } + my $contigID = $contig->id(); + if( !defined $contigID ) { + $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1); + $contig->id($contigID); + $self->warn("Attributing ID $contigID to unidentified Bio::Assembly::Contig object."); + } + + $self->warn("Replacing contig $contigID with a new contig object") + if (exists $self->{'_contigs'}{$contigID}); + $self->{'_contigs'}{$contigID} = $contig; + + foreach my $seqID ($contig->get_seq_ids()) { + if (exists $self->{'_seqs'}{$seqID}) { + $self->warn( "Sequence $seqID already assigned to contig ". + $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID") + unless ($self->{'_seqs'}{$seqID} eq $contig); + } + $self->{'_seqs'}{$seqID} = $contig; + } + + return 1; +} + +=head2 add_singlet + + Title : add_singlet + Usage : $assembly->add_singlet($seq) + Function: Add a singlet to the assembly + Returns : 1 on success, 0 otherwise + Args : a Bio::PrimarySeqI object + order (optional) + +=cut + +sub add_singlet { + my $self = shift; + my $singlet = shift; + + if( !ref $singlet || ! $singlet->isa('Bio::PrimarySeqI') ) { + $self->warn("Unable to process non Bio::SeqI object [", ref($singlet), "]"); + return 0; + } + + my $singletID = $singlet->id(); + $self->warn("Replacing singlet $singletID wih a new sequence object") + if (exists $self->{'_contigs'}{$singletID}); + $self->{'_singlets'}{$singletID} = $singlet; + + return 1; +} + +=head2 update_seq_list + + Title : update_seq_list + Usage : $assembly->update_seq_list() + Function: + + Synchronizes the assembly registry for sequences in + contigs and contig actual aligned sequences content. You + probably want to run this after you remove/add a + sequence from/to a contig in the assembly. + + Returns : nothing + Args : none + +=cut + +sub update_seq_list { + my $self = shift; + + $self->{'_seqs'} = {}; + foreach my $contig ($self->all_contigs) { + foreach my $seqID ($contig->get_seq_ids) { + $self->{'_seqs'}{$seqID} = $contig; + } + } + + return 1; +} + +=head2 remove_contigs + + Title : remove_contigs + Usage : $assembly->remove_contigs(1..4) + Function: Remove contig from assembly object + Returns : an array of removed Bio::Assembly::Contig + objects + Args : an array of contig IDs + + See function get_contig_ids() above + +=cut + +#--------------------- +sub remove_contigs { +#--------------------- + my ($self,@args) = @_; + + my @ret = (); + foreach my $contigID (@args) { + foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) { + delete $self->{'_seqs'}{$seqID}; + } + push(@ret,$self->{'_contigs'}{$contigID}); + delete $self->{'_contigs'}{$contigID}; + } + + return @ret; +} + +=head2 remove_singlets + + Title : remove_singlets + Usage : $assembly->remove_singlets(@singlet_ids) + Function: Remove singlet from assembly object + Returns : the Bio::SeqI objects removed + Args : a list of singlet IDs + + See function get_singlet_ids() above + +=cut + +#--------------------- +sub remove_singlets { +#--------------------- + my ($self,@args) = @_; + + my @ret = (); + foreach my $singletID (@args) { + push(@ret,$self->{'_singlets'}{$singletID}); + delete $self->{'_singlets'}{$singletID}; + } + + return @ret; +} + +=head1 Contig and singlet selection methos + +=cut + +=head2 select_contigs + + Title : select_contigs + Usage : $assembly->select_contigs(@list) + Function: Select an array of contigs from the assembly + Returns : an array of Bio::Assembly::Contig objects + Args : an array of contig ids + + See function get_contig_ids() above + +=cut + +#--------------------- +sub select_contigs { +#--------------------- + my ($self,@args) = @_; + + my @contigs = (); + foreach my $contig (@args) { + unless (exists $self->{'_contigs'}{$contig}) { + $self->warn("$contig contig not found. Ignoring..."); + next; + } + push(@contigs, $self->{'_contigs'}{$contig}); + } + + return @contigs; +} + +=head2 select_singlets + + Title : select_singlets + Usage : $assembly->select_singlets(@list) + Function: Selects an array of singlets from the assembly + Returns : an array of Bio::SeqI objects + Args : an array of singlet ids + + See function get_singlet_ids() above + +=cut + +#--------------------- +sub select_singlets { +#--------------------- + my ($self,@args) = @_; + + my @singlets = (); + foreach my $singlet (@args) { + unless (exists $self->{'_singlets'}{$singlet}) { + $self->warn("$singlet singlet not found. Ignoring..."); + next; + } + push(@singlets, $self->{'_singlets'}{$singlet}); + } + + return @singlets; +} + +=head2 all_contigs + + Title : all_contigs + Usage : my @contigs = $assembly->all_contigs + Function: + + Returns a list of all contigs in this assembly. Contigs + are both clusters and alignments of one or more reads, + with an associated consensus sequence. + + Returns : array of Bio::Assembly::Contig (in lexical id order) + Args : none + +=cut + +#--------------------- +sub all_contigs { +#--------------------- + my ($self) = @_; + + my @contigs = (); + foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) { + push(@contigs, $self->{'_contigs'}{$contig}); + } + + return @contigs; +} + +=head2 all_singlets + + Title : all_singlets + Usage : my @singlets = $assembly->all_singlets + Function: + + Returns a list of all singlets in this assembly. + Singlets are isolated reads, without non-vector + matches to any other read in the assembly. + + Returns : array of Bio::SeqI (in lexical order by id) + Args : none + +=cut + +#--------------------- +sub all_singlets { +#--------------------- + my ($self) = @_; + + my @singlets = (); + foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) { + push(@singlets, $self->{'_singlets'}{$singlet}); + } + + return @singlets; +} + +# =head1 Internal Methods + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Assembly/ScaffoldI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/ScaffoldI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,340 @@ +# $Id: ScaffoldI.pm,v 1.2 2002/11/11 18:16:30 lapp Exp $ +# +# BioPerl module for Bio::Assembly::ScaffoldI +# +# Copyright by Robson F. de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::ScaffoldI - Abstract Inteface of Sequence Assemblies + +=head1 SYNOPSYS + + # get a Bio::Assembly::ScaffoldI object somehow + + foreach my $contig ($assembly->all_contigs) { + # do something (see Bio::Assembly::Contig) + } + +=head1 DESCRIPTION + +This interface defines the basic set of methods an object should have +to manipulate assembly data. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +Email: rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# +# Now, let's code! + + +package Bio::Assembly::ScaffoldI; + +use strict; +use vars qw(@ISA); +use Carp; +use Bio::Root::RootI; + +# Inheritance + +@ISA = qw(Bio::Root::RootI); + +# +# Methods + +=head1 Accessing general assembly data + +=cut + +=head2 get_nof_contigs + + Title : get_nof_contigs + Usage : $assembly->get_nof_contigs() + Function: Get the number of contigs included in the assembly + Returns : integer + Args : none + +=cut + +sub get_nof_contigs { + my $self = shift; + + $self->throw_not_implemented(); +} + +=head2 get_nof_singlets + + Title : get_nof_singlets + Usage : $assembly->get_nof_singlets() + Function: Get the number of singlets included in the assembly + Returns : integer + Args : none + +=cut + +sub get_nof_singlets { + my $self = shift; + + $self->throw_not_implemented(); +} + +=head2 get_contig_ids + + Title : get_contig_ids + Usage : $assembly->get_contig_ids() + Function: Access list of contig IDs from assembly + Returns : an array if there are any contigs in the assembly. + undef otherwise + Args : an array of contig IDs + +=cut + +sub get_contig_ids { + my $self = shift; + + $self->throw_not_implemented(); +} + +=head2 get_singlet_ids + + Title : get_singlet_ids + Usage : $assembly->get_singlet_ids() + Function: Access list of singlet IDs from assembly + Returns : an array if there are any singlets in the assembly. + undef otherwise + Args : an array of singlet IDs + +=cut + +sub get_singlet_ids { + my $self = shift; + + $self->throw_not_implemented(); +} + +=head2 get_contig_by_id + + Title : get_contig_by_id + Usage : $assembly->get_contig_by_id($id) + Function: Get a reference for a contig from the assembly + Returns : a Bio::Assembly::Contig object or undef + Args : [string] contig unique identifier (ID) + +=cut + +sub get_contig_by_id { + my $self = shift; + $self->throw_not_implemented(); +} + +=head2 get_singlet_by_id + + Title : get_singlet_by_id + Usage : $assembly->get_singlet_by_id() + Function: Get a reference for a singlet from the assembly + Returns : Bio::PrimarySeqI object or undef + Args : [string] a singlet ID + +=cut + +sub get_singlet_by_id { + my $self = shift; + $self->throw_not_implemented(); +} + +=head1 Modifier methods + +Implementation of these methods is optional in the sense that +read-only implementations may not have these. If an object implements +one of them, it should however implement all. + +=cut + +=head2 add_contig + + Title : add_contig + Usage : $assembly->add_contig($contig) + Function: Add another contig to the Bio::Assembly::ScaffoldI object + Returns : 1 on success, 0 otherwise + Args : a Bio::Assembly:Contig object + + See Bio::Assembly::Contig for more information + +=cut + +#--------------------- +sub add_contig { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 add_singlet + + Title : add_singlet + Usage : $assembly->add_singlet($seq) + Function: Add another singlet to the Bio::Assembly::ScaffoldI object + Returns : 1 on success, 0 otherwise + Args : a Bio::Align::Singlet object + +=cut + +#--------------------- +sub add_singlet { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 remove_contigs + + Title : remove_contigs + Usage : $assembly->remove_contigs(1..4) + Function: Remove contig from assembly object + Returns : a Bio::Assembly::Contig object + Args : a list of contig IDs + + See function get_contig_ids() above + +=cut + +#--------------------- +sub remove_contigs { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 remove_singlets + + Title : remove_singlets + Usage : $assembly->remove_singlets(1..4) + Function: Remove singlet from assembly object + Returns : a Bio::SeqI object + Args : a list of singlet IDs + + See function get_singlet_ids() above + +=cut + +#--------------------- +sub remove_singlets { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Contig and singlet selection methos + +=cut + +=head2 select_contigs + + Title : select_contig + Usage : $assembly->select_contig + Function: Selects an array of contigs from the assembly + Returns : an array of Bio::Assembly::Contig objects + Args : an array of contig ids + + See function get_contig_ids() above + +=cut + +#--------------------- +sub select_contigs { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 select_singlets + + Title : select_singlets + Usage : $assembly->select_singlets(@list) + Function: Selects an array of singlets from the assembly + Returns : an array of Bio::SeqI objects + Args : an array of singlet ids + + See function get_singlet_ids() above + +=cut + +#--------------------- +sub select_singlets { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 all_contigs + + Title : all_contigs + Usage : my @contigs = $assembly->all_contigs + Function: Returns a list of all contigs in this assembly. + Contigs are both clusters and alignments of one + or more reads, with an associated consensus + sequence. + Returns : array of Bio::Assembly::Contig + Args : none + +=cut + +#--------------------- +sub all_contigs { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 all_singlets + + Title : all_singlets + Usage : my @singlets = $assembly->all_singlets + Function: Returns a list of all singlets in this assembly. + Singlets are isolated reads, without non-vector + matches to any other read in the assembly. + Returns : array of Bio::Assembly::Contig + Args : none + +=cut + +#--------------------- +sub all_singlets { +#--------------------- + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,372 @@ +# $Id: Biblio.pm,v 1.7 2002/10/22 07:45:09 lapp Exp $ +# +# BioPerl module Bio::Biblio +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio - A Bibliographic Query Service module + +=head1 SYNOPSIS + + use Bio::Biblio; + my $biblio = new Bio::Biblio; + + print $biblio->find ('perl')->get_count . "\n"; + + my $collection = $biblio->find ('brazma', 'authors'); + while ( $collection->has_next ) { + print $collection->get_next; + } + +Here are some one-liners: + + perl -MBio::Biblio -e 'print new Bio::Biblio->get_by_id ("94033980")' + perl -MBio::Biblio \ + -e 'print join ("\n", @{ new Bio::Biblio->find ("brazma")->get_all_ids })' + perl -MBio::Biblio \ + -e 'print new Bio::Biblio->find ("Java")->find ("perl")->get_count' + +The C method can get parameters, for example: + + my $biblio = Bio::Biblio + (-access => 'soap', + -location => 'http://industry.ebi.ac.uk/soap/openBQS', + -destroy_on_exit => '0'); + +=head1 DESCRIPTION + +This is a class whose instances can access bibliographic +repositories. It allows to query a bibliographic database (such as +MEDLINE) and then to retrieve resulting citations from it. The +citations are returned in an XML format which is native to the +repository but there are also supporting modules for converting them +into Perl objects. + +The detailed descriptions of all query and retrieval methods are in +L (an interface). All those methods should be +called on instances of this (Bio::Biblio) module. + +The module complies (with some simplifications) with the specification +described in the B project. Its home page is at +I. There are also links to +available servers providing access to the bibliographic repositories +(namely to I). + +The module also gives an access to a set of controlled vocabularies +and their values. It allows to introspect bibliographic repositories +and to find what citation resource types (such as journal and book +articles, patents or technical reports) are provided, and what +attributes they have, eventually what attribute values are allowed. + +=head1 OVERVIEW OF CLASSES AND PACKAGES + +=over + +=item B + +This is the main class to be used by the end users. It +loads a real implementation for a particular access protocol according +to the argument I<-access>. At the time of writing this documentation +there is only one available access module implementing all query and +retrieval methods: + + -access => soap + +This module implements all methods defined in the interface +I (see L) by delegating +calls to a loaded low-level module (e.g. see +L). + +Note that there is also another module (and perhaps more) which does +not use SOAP protocol and do not implement all query methods - +nevertheless it has retrieval methods and it can be used in the same +way: + + -access => biofetch + + +=item Bio::DB::BiblioI + +This is an interface defining all methods that can be called on +I instances. + +=item Bio::DB::Biblio::soap + +This is a real implementation of all methods defined in +Bio::DB::BiblioI using SOAP protocol (calling a WebService +based on SOAP). This class should not be instantiated directly (use +I instead). See L for details. + +=item Bio::Biblio::IO + +This module instantiates and uses a converter of the citations read by +any of the access methods mentioned above. See L for +details. + +=item Bio::Biblio::IO::medlinexml and Bio::Biblio::IO::medline2ref + +A converter of MEDLINE citations in XML into Perl objects. + +=item Bio::Biblio::IO::pubmedxml and Bio::Biblio::IO::pubmed2ref + +A converter of PUBMED citations in XML into Perl objects. + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 APPENDIX + +The main documentation details are to be found in +L. + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::Biblio; +use vars qw(@ISA $VERSION $Revision); +use strict; + +use Bio::Root::Root; +use Bio::DB::BiblioI; + +@ISA = qw(Bio::Root::Root Bio::DB::BiblioI); + + +BEGIN { + $VERSION = do { my @r = (q$Revision: 1.7 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: Biblio.pm,v 1.7 2002/10/22 07:45:09 lapp Exp $; +} + +# ----------------------------------------------------------------------------- + +=head2 new + + Usage : my $obj = new Bio::Biblio (@args); + Returns : Bio::Biblio object on success, or undef on failure + Args : This module recognizes and uses: + + -access => 'soap' + It indicates what lower-level module to load. + Default is 'soap'. + + -location => 'http://...' + It says where to find a bibliographic query service. + The format and contents of this argument is dependent + on the '-access' argument. + + For 'soap' access it is a URL of a WebService. + Default is http://industry.ebi.ac.uk/soap/openBQS + + Other arguments can be given here but they are + recognized by the lower-level module + (e.g. see Bio::DB::Biblio::soap). + +It builds, populates and returns a new I object. This is +how it is seen from the outside. But in fact, it builds, populates and +returns a more specific lower-level object, for example +I object - which one it is depends on the +parameter I<-access>. + +The real initialization is done in the method I<_initialize> of the +lower-level object. + +This method can also be used for I an existing object and +changing or adding new attributes to it in the same time. This is, +however, not particulary useful for the casual users of this module, +because the query methods (see L) themselves +already return cloned objects with more refined query +collections. Anyway this is how the cloning can be done: + + use Bio::Biblio; + my $biblio = new Bio::Biblio; + + # this will create a new object which will NOT send a 'destroy' + # message to the remote server when its life ends + my $clone = $biblio->new (-destroy-on-exit => '0'); + +=cut + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # if $caller is an object, or if it is an underlying + # 'real-work-doing' class (e.g. Bio::DB::Biblio::soap) then + # we want to call SUPER to create and bless an object + + if ($class =~ /Bio::DB::Biblio::(\S+)/) { + my ($self) = $class->SUPER::new (@args); + + # now the $self is an empty object - we will populate it from + # the $caller - if $caller is an object + + if (ref ($caller)) { + %{ $self } = %{ $caller }; + } + + # and finally add values from '@args' into the newly created + # object (the values will overwrite the values copied above) + + $self->_initialize (@args); + return $self; + + # this is called only the first time when somebody calls: 'new + # Bio::Biblio (...)', and it actually loads a 'real-work-doing' + # module and call this new() method again (unless the loaded + # module has its own new() method) + + } else { + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + my $access = + $param {'-access'} || + $class->_guess_access ( $param {'-location'} ) || + 'soap'; + $access = "\L$access"; # normalize capitalization to lower case + + # load module with the real implementation - as defined in $access + return undef unless (&_load_access_module ($access)); + + # this will call this same method new() - but rather its the + # upper (object) branche + return "Bio::DB::Biblio::$access"->new (@args); + } +} + +# ----------------------------------------------------------------------------- + +=head2 _load_access_module + + Usage : $class->_load_access_module ($access) + Returns : 1 on success, undef on failure + Args : 'access' should contain the last part of the + name of a module who does the real implementation + +It does (in run-time) a similar thing as + + require Bio::DB::Biblio::$access + +It prints an error on STDERR if it fails to find and load the module +(for example, because of the compilation errors in the module). + +=cut + +sub _load_access_module { + my ($access) = @_; + my ($module, $load, $m); + + $module = "_throw (<_guess_access ($location) + Returns : string with a guessed access protocol (e.g. 'soap') + Args : 'location' defines where to find a bibliographic service + in a protocol-dependent manner (e.g. for SOAP it is + a URL of a bibliographic WebService) + +It makes an expert guess what kind of access/transport protocol should +be used based on the I of the service (e.g. if the +I looks like an IOR then the access protocol is probably +CORBA). + +=cut + +# this is kept here for the future when more access protocols +# (e.g. CORBA) may be available for accessing bibliographic query +# services + +sub _guess_access { +# my ($class, $location) = @_; + return 'soap'; +} + +=head2 VERSION and Revision + + Usage : print $Bio::Biblio::VERSION; + print $Bio::Biblio::Revision; + +=cut + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Article.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Article.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,137 @@ +# $Id: Article.pm,v 1.7 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Article +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Article - Representation of a general article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Article (-identifier => '123abc', + -first_page => 23, + -last_page => 68); + --- OR --- + + $obj = new Bio::Biblio::Article; + $obj->identifier ('123abc'); + $obj->first_page (23); + $obj->last_page (68); + +=head1 DESCRIPTION + +A storage object for a general article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + first_page + last_page + + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Article; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw( Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _first_page => undef, + _last_page => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/BiblioBase.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/BiblioBase.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,188 @@ +# $Id: BiblioBase.pm,v 1.9 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::BiblioBase +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::BiblioBase - An abstract base for other biblio classes + +=head1 SYNOPSIS + + # do not instantiate this class directly + +=head1 DESCRIPTION + +It is a base class where all other biblio data storage classes inherit +from. It does not reflect any real-world object, it exists only for +convenience, in order to have a place for shared code. + +=head2 new() + +The I class method constructs a new biblio storage object. It +accepts list of named arguments - the same names as attribute names +prefixed with a minus sign. Available attribute names are listed in +the documentation of the individual biblio storage objects. + +=head2 Accessors + +All attribute names can be used as method names. When used without any +parameter the method returns current value of the attribute (or +undef), when used with a value the method sets the attribute to this +value and also returns it back. The set method also checks if the type +of the new value is correct. + +=head2 Custom classes + +If there is a need for new attributes, create your own class which +usually inherits from I. For new types of providers +and journals, let your class inherit directly from this +I class. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::BiblioBase; +use strict; +use vars qw(@ISA $AUTOLOAD); + +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + +# these methods should not be called here; +# they should be implemented by a subclass +sub _accessible { shift->throw_not_implemented(); } +sub _attr_type { shift->throw_not_implemented(); } + +# +# deal with 'set_' and 'get_' methods +# +sub AUTOLOAD { + my ($self, $newval) = @_; + + if ($AUTOLOAD =~ /.*::(\w+)/ && $self->_accessible ("_$1")) { + my $attr_name = "_$1"; + my $attr_type = $self->_attr_type ($attr_name); + my $ref_sub = + sub { + my ($this, $new_value) = @_; + return $this->{$attr_name} unless defined $new_value; + + # here we continue with 'set' method + my ($newval_type) = ref ($new_value) || 'string'; + my ($expected_type) = $attr_type || 'string'; +# $this->throw ("In method $AUTOLOAD, trying to set a value of type '$newval_type' but '$expected_type' is expected.") + $this->throw ($this->_wrong_type_msg ($newval_type, $expected_type, $AUTOLOAD)) + unless ($newval_type eq $expected_type) or + UNIVERSAL::isa ($new_value, $expected_type); + + $this->{$attr_name} = $new_value; + return $new_value; + }; + + no strict 'refs'; + *{$AUTOLOAD} = $ref_sub; + use strict 'refs'; + return $ref_sub->($self, $newval); + } + + $self->throw ("No such method: $AUTOLOAD"); +} + +# + +sub new { + my ($caller, @args) = @_; + my $class = ref ($caller) || $caller; + + # create and bless a new instance + my ($self) = $class->SUPER::new (@args); + + # make a hashtable from @args + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + + # set all @args into this object with 'set' values; + # change '-key' into '_key', and making keys lowercase + my $new_key; + foreach my $key (keys %param) { + ($new_key = $key) =~ s/-/_/og; # change it everywhere, why not + my $method = lc (substr ($new_key, 1)); # omitting the first '_' + no strict 'refs'; + $method->($self, $param { $key }); + } + + # done + return $self; +} + +# +# set methods test whether incoming value is of a correct type; +# here we return message explaining it +# +sub _wrong_type_msg { + my ($self, $given_type, $expected_type, $method) = @_; + my $msg = 'In method '; + if (defined $method) { + $msg .= $method; + } else { + $msg .= (caller(1))[3]; + } + return ("$msg: Trying to set a value of type '$given_type' but '$expected_type' is expected."); +} + +# +# probably just for debugging +# TBD: to decide... +# +sub print_me { + my ($self) = @_; + require Data::Dumper; + return Data::Dumper->Dump ( [$self], ['Citation']); +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Book.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Book.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +# $Id: Book.pm,v 1.7 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Book +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Book - Representation of a book + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Book (-identifier => '123abc', + -editor => new Bio::Biblio::Person + (-lastname => 'Loukides'), + -isbn => '0-596-00068-5'); + --- OR --- + + $obj = new Bio::Biblio::Book; + $obj->isbn ('0-596-00068-5'); + +=head1 DESCRIPTION + +A storage object for a book. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + edition + editor type: Bio::Biblio::Provider + isbn + series + title + volume + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Book; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw(Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _edition => undef, + _editor => 'Bio::Biblio::Provider', + _isbn => undef, + _series => undef, + _title => undef, + _volume => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/BookArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/BookArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,132 @@ +# $Id: BookArticle.pm,v 1.6 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::BookArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::BookArticle - Representation of a book article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::BookArticle (-identifier => '123abc', + -book => new Bio::Biblio::Book); + --- OR --- + + $obj = new Bio::Biblio::BookArticle; + $obj->book (new Bio::Biblio::Book); + + +=head1 DESCRIPTION + +A storage object for a book article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + book type: Bio::Biblio::Book + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::BookArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Article; + +@ISA = qw(Bio::Biblio::Article); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _book => 'Bio::Biblio::Book', + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/IO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,373 @@ +# $Id: IO.pm,v 1.8 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::IO +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO - Handling the bibliographic references + +=head1 SYNOPSIS + + use Bio::Biblio::IO; + + # getting citations from a file + $in = Bio::Biblio::IO->new ('-file' => 'myfile.xml' , + '-format' => 'medlinexml'); + --- OR --- + + # getting citations from a string + $in = Bio::Biblio::IO->new ('-data' => '...' , + '-format' => 'medlinexml'); + --- OR --- + + # getting citations from a string if IO::String is installed + use IO::String; + $in = Bio::Biblio::IO->new ('-fh' => IO::String->new ($citation), + '-format' => 'medlinexml'); + + $in = Bio::Biblio::IO->new(-fh => $io_handle , '-format' => 'medlinexml'); + + --- OR --- + + # getting citations from any IO handler + $in = Bio::Biblio::IO->new('-fh' => $io_handle , + '-format' => 'medlinexml'); + + + # now, having $in, we can read all citations + while ( my $citation = $in->next_bibref() ) { + &do_something_with_citation ($citation); + } + + --- OR --- + + # again reading all citation but now a callback defined in your + # code is used (note that the reading starts already when new() + # is called) + $io = new Bio::Biblio::IO ('-format' => 'medlinexml', + '-file' => $testfile, + '-callback' => \&callback); + sub callback { + my $citation = shift; + print $citation->{'_identifier'} . "\n"; + } + +Now, to actually get a citation in an XML format, +use I module which returns an XML string: + + use Bio::Biblio; + my $xml = new Bio::Biblio->get_by_id ('94033980'); + my $reader = Bio::Biblio::IO->new ('-data' => $xml, + '-format' => 'medlinexml'); + + while (my $citation = $reader->next_bibref()) { + ... do something here with $citation + } + +And, finally, the resulting citation can be received in different +output formats: + + $io = new Bio::Biblio::IO ('-format' => 'medlinexml', + '-result' => 'raw'); + --- OR --- + + $io = new Bio::Biblio::IO ('-format' => 'medlinexml', + '-result' => 'medline2ref'); + + --- OR --- + + $io = new Bio::Biblio::IO ('-format' => 'pubmedxml', + '-result' => 'pubmed2ref'); + +=head1 DESCRIPTION + +Bio::Biblio::IO is a handler module for accessing bibliographic +citations. The citations can be in different formats - assuming that +there is a corresponding module knowing that format in Bio::Biblio::IO +directory (e.g. Bio::Biblio::IO::medlinexml). The format (and the +module name) is given by the argument I<-format>. + +Once an instance of C class is available, the +citations can be read by calling repeatedly method I: + + while (my $citation = $reader->next_bibref()) { + ... do something here with $citation + } + +However, this may imply that all citations were already read into the +memory. If you expect a huge amount of citations to be read, you may +choose a I option. Your subroutine is specified in the +C method and is called everytime a new citation is available +(see an example above in SYNOPSIS). + +The citations returned by I or given to your callback +routine can be of different formats depending on the argument +I<-result>. One result type is I and it is represented by a +simple, not blessed hash table: + + $io = new Bio::Biblio::IO ('-result' => 'raw'); + +What other result formats are available depends on the module who +reads the citations in the first place. At the moment, the following +ones are available: + + $io = new Bio::Biblio::IO ('-result' => 'medline2ref'); + +This is a default result format for reading citations by the +I module. The C module is again the default +one. Which means that you can almost omit arguments (you still need to +say where the citations come from): + + $io = new Bio::Biblio::IO ('-file' => 'data/medline_data.xml'); + +Another result format available is for PUBMED citations (which is a +super-set of the MEDLINE citations having few more tags): + + $io = new Bio::Biblio::IO ('-format' => 'pubmedxml', + '-result' => 'pubmed2ref', + '-data' => $citation); + +Or, because C is a default one for PUBMED citations, you can say just: + + $io = new Bio::Biblio::IO ('-format' => 'pubmedxml', + '-data' => $citation); + +Both C and C results are objects defined in +the directory C. + +=head1 SEE ALSO + +=over + +=item * + +An example script I. It has many options and its +own help. The relevant options to this IO module are I<-f> +(specifying what file to read) and I<-O> (specifying what result +format to achieve). + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Biblio::IO; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; +use Symbol(); + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +my $entry = 0; + +sub new { + my ($caller, @args) = @_; + my $class = ref ($caller) || $caller; + + # if $caller is an object, or if it is an underlying + # 'real-work-doing' class (e.g. Bio::Biblio::IO::medlinexml) then + # we want to call SUPER to create and bless an object + if( $class =~ /Bio::Biblio::IO::(\S+)/ ) { + my ($self) = $class->SUPER::new (@args); + $self->_initialize (@args); + return $self; + + # this is called only the first time when somebody calls: 'new + # Bio::Biblio::IO (...)', and it actually loads a 'real-work-doing' + # module and call this new() method again (unless the loaded + # module has its own new() method) + } else { + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ) || + 'medlinexml'; + $format = "\L$format"; # normalize capitalization to lower case + + # load module with the real implementation - as defined in $format + return undef unless (&_load_format_module ($format)); + + # this will call this same method new() - but rather its + # upper (object) branche + return "Bio::Biblio::IO::$format"->new(@args); + } +} + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +# _initialize is chained for all Bio::Biblio::IO classes + +sub _initialize { + my ($self, @args) = @_; + # initialize the IO part + $self->_initialize_io (@args); +} + +=head2 next_bibref + + Usage : $citation = stream->next_bibref + Function: Reads the next citation object from the stream and returns it. + Returns : a Bio::Biblio::Ref citation object, or something else + (depending on the '-result' argument given in the 'new()' + method). + Args : none + +=cut + +sub next_bibref { + my ($self) = shift; + $self->throw ("Sorry, you cannot read from a generic Bio::Biblio::IO object."); +} + +# ----------------------------------------------------------------------------- + +=head2 _load_format_module + + Usage : $class->_load_format_module ($format) + Returns : 1 on success, undef on failure + Args : 'format' should contain the last part of the + name of a module who does the real implementation + +It does (in run-time) a similar thing as + + require Bio::Biblio::IO::$format + +It throws an exception if it fails to find and load the module +(for example, because of the compilation errors in the module). + +=cut + +sub _load_format_module { + my ($format) = @_; + my ($module, $load, $m); + + $module = "_throw (<_guess_format ($filename) + Returns : string with a guessed format of the input data (e.g. 'medlinexml') + Args : a file name whose extension can help to guess its format + +It makes an expert guess what kind of data are in the given file +(but be prepare that $filename may be empty). + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'medlinexml' if (/\.(xml|medlinexml)$/i); + return; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'biblio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'biblio'}->next_bibref() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'biblio'}->next_bibref(); + return @list; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/IO/medline2ref.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO/medline2ref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,542 @@ +# $Id: medline2ref.pm,v 1.10 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module Bio::Biblio::IO::medline2ref.pm +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO::medline2ref - A converter of a raw hash to MEDLINE citations + +=head1 SYNOPSIS + + # to be written + +=head1 DESCRIPTION + + # to be written + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::IO::medline2ref; + +use strict; +use vars qw(@ISA $VERSION $Revision); + +use Bio::Root::Root; +use Bio::Biblio::MedlineJournal; +use Bio::Biblio::MedlineBook; +use Bio::Biblio::Provider; +use Bio::Biblio::Person; +use Bio::Biblio::Organisation; + +@ISA = qw(Bio::Root::Root); + +BEGIN { + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.10 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: medline2ref.pm,v 1.10 2002/10/22 07:45:13 lapp Exp $; +} + +# ----------------------------------------------------------------------------- +sub new { + my ($caller, @args) = @_; + my $class = ref ($caller) || $caller; + + # object creation and blessing + my ($self) = $class->SUPER::new (@args); + + # make a hashtable from @args + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + + # copy all @args into this object (overwriting what may already be + # there) - changing '-key' into '_key', and making keys lowercase + my $new_key; + foreach my $key (keys %param) { + ($new_key = $key) =~ s/^-/_/; + $self->{ lc $new_key } = $param { $key }; + } + + # done + return $self; +} + +# --------------------------------------------------------------------- +# +# Here is the core... +# +# --------------------------------------------------------------------- + +sub _load_instance { + my ($self, $source) = @_; + + # + # MEDLINE has only JournalArticles and BookArticles + # but we may create a general Ref if there is no attribute 'article' + # + my $result; + my $article = $$source{'article'}; + if (defined $article) { + if (defined $$article{'journal'}) { + $result = $self->_new_instance ('Bio::Biblio::MedlineJournalArticle'); + $result->type ('JournalArticle'); + } elsif (defined $$article{'book'}) { + $result = $self->_new_instance ('Bio::Biblio::MedlineBookArticle'); + $result->type ('BookArticle'); + } else { + $result->type ('MedlineArticle'); + } + } + $result = $self->_new_instance ('Bio::Biblio::Ref') unless defined $result; + return $result; +} + +sub convert { + my ($self, $source) = @_; + my $result = $self->_load_instance ($source); + + if (defined $result->type) { + if ($result->type eq 'JournalArticle') { + &_convert_journal_article ($result, $source); + } elsif ($result->type eq 'BookArticle') { + &_convert_book_article ($result, $source); + } elsif ($result->type eq 'Article') { + &_convert_article ($result, $source); + } + } + + # + # now do the attributes which are the same for all resource types + # + + # ...identification is now by MedlineID but the trend is to replace + # it by PMID (I have heard) theefore we keep both also separately + # from the 'identifier' + if (defined $$source{'medlineID'}) { + $result->identifier ($$source{'medlineID'}); + } else { + $result->identifier ($$source{'PMID'}); + } + $result->pmid ($$source{'PMID'}) if defined $$source{'PMID'}; + $result->medline_id ($$source{'medlineID'}) if defined $$source{'medlineID'}; + + # ...few others + $result->citation_owner ($$source{'owner'}) if defined $$source{'owner'}; + $result->status ($$source{'status'}) if defined $$source{'status'}; + $result->number_of_references ($$source{'numberOfReferences'}) if defined $$source{'numberOfReferences'}; + + # ...entry status of the citation in the repository + my $date; + if (defined $$source{'dateRevised'}) { + $result->last_modified_date (&_convert_date ($$source{'dateRevised'})); + $date = &_convert_date ($$source{'dateCreated'}); + $result->date_created ($date) if defined $date; + $date = &_convert_date ($$source{'dateCompleted'}); + $result->date_completed ($date) if defined $date; + } elsif (defined $$source{'dateCompleted'}) { + $result->last_modified_date (&_convert_date ($$source{'dateCompleted'})); + $date = &_convert_date ($$source{'dateCreated'}); + $result->date_created ($date) if defined $date; + } elsif (defined $$source{'dateCreated'}) { + $result->last_modified_date (&_convert_date ($$source{'dateCreated'})); + } + + # ...put citation subsets in a comma-separated string + if (defined $$source{'citationSubsets'}) { + $result->repository_subset (join (',', @{ $$source{'citationSubsets'} })); + } + + # ...MEDLINE's Comments & Corrections will be arrays of hashes + if (defined $$source{'commentsCorrections'}) { + my $corr = $$source{'commentsCorrections'}; + $result->comment_ons ($$corr{'commentOns'}) if defined $$corr{'commentOns'}; + $result->comment_ins ($$corr{'commentIns'}) if defined $$corr{'commentIns'}; + $result->erratum_ins ($$corr{'erratumIns'}) if defined $$corr{'erratumIns'}; + $result->erratum_fors ($$corr{'erratumFors'}) if defined $$corr{'erratumFors'}; + $result->original_report_ins ($$corr{'originalReportIns'}) if defined $$corr{'originalReportIns'}; + $result->republished_froms ($$corr{'republishedFroms'}) if defined $$corr{'republishedFroms'}; + $result->republished_ins ($$corr{'republishedIns'}) if defined $$corr{'republishedIns'}; + $result->retraction_ofs ($$corr{'retractionOfs'}) if defined $$corr{'retractionOfs'}; + $result->retraction_ins ($$corr{'retractionIns'}) if defined $$corr{'retractionIns'}; + $result->summary_for_patients_ins ($$corr{'summaryForPatientsIns'}) if defined $$corr{'summaryForPatientsIns'}; + $result->update_ins ($$corr{'updateIns'}) if defined $$corr{'updateIns'}; + $result->update_ofs ($$corr{'updateOfs'}) if defined $$corr{'updateOfs'}; + } + + # ...MEDLINE's GeneSymbols are put in a comma-separated string + if (defined $$source{'geneSymbols'}) { + $result->gene_symbols (join (',', @{ $$source{'geneSymbols'} })); + } + + # ...MEDLINE's GeneralNotes into an array of hashtables, each one + # having keys for the 'owner' and the 'note' + $result->general_notes ($$source{'generalNotes'}) if defined $$source{'generalNotes'}; + + # ...MEDLINE's PersonalNameSubjects into contributors (TBD: is that correct?) + if (defined $$source{'personalNameSubjects'}) { + my @contributors; + foreach my $person ( @{ $$source{'personalNameSubjects'} } ) { + push (@contributors, &_convert_personal_name ($person)); + } + $result->contributors (\@contributors); + } + + # ...MEDLINE's OtherAbstract into an array of hashtables, each one + # having keys for the 'type', 'AbstractText' and the 'copyright' + $result->other_abstracts ($$source{'otherAbstracts'}) if defined $$source{'otherAbstracts'}; +# if (defined $$source{'otherAbstracts'}) { +# my @other_abstracts = (); +# foreach my $oa ( @{ $$source{'otherAbstracts'} } ) { +# if (defined $$oa{'abstractText'}) { +# my $abstract = $$oa{'abstractText'}; +# delete $$oa{'abstractText'}; +# $$oa{'abstract'} = $$abstract{'abstractText'}; +# $$oa{'rights'} = $$abstract{'copyrightInformation'} if defined $$abstract{'copyrightInformation'}; +# push (@other_abstracts, $oa); +# } +# } +# $result->other_abstracts (\@other_abstracts); +# } + + # ...MEDLINE's OtherIDs into an array of hashtables, each one + # having keys for the 'id', and 'source' + $result->other_ids ($$source{'otherIDs'}) if defined $$source{'otherIDs'}; + + # ...MEDLINE's Chemicals - store them as an array of hashtables + # (each one for each Chemical) + $result->chemicals ($$source{'chemicals'}) if defined $$source{'chemicals'}; + + # MeshHeadings are put on two places: + # - a complete information in a property called "MeshHeadings", and + # - only descriptors in the hashtable "subject_headings", together + # with the word "MeSH" in "subject_headings_source" + if (defined $$source{'meshHeadings'}) { + $result->mesh_headings ($$source{'meshHeadings'}); + my %subject_headings; + foreach my $mesh ( @{ $$source{'meshHeadings'} } ) { + $subject_headings{ $$mesh{'descriptorName'} } = 1 if defined $$mesh{'descriptorName'}; + } + if (%subject_headings) { + $result->subject_headings (\%subject_headings); + $result->subject_headings_source ('Mesh'); + } + } + + # ...MEDLINE's keyword lists are merger all together (this may not + # be good idea - but again the keywords are better accessible + # -TBD?) + if (defined $$source{'keywordLists'}) { + my %keywords; + foreach my $keywords ( @{ $$source{'keywordLists'} } ) { + if ($$keywords{'keywords'}) { + foreach my $keyword ( @{ $$keywords{'keywords'} } ) { + $keywords{$keyword} = 1; + } + } + } + $result->keywords (\%keywords) if %keywords; + } + + # Done! + return $result; +} + +# load a module (given as a real module name, e.g. 'Bio::Biblio::MedlineJournalArticle'), +# call new() method on it, and return the instance returned by the new() method +sub _new_instance { + my ($self, $module) = @_; + my ($filename); + ($filename = $module . '.pm') =~ s|\:\:|/|g; + eval { require $filename; }; + $self->throw ("Loading error when trying '$filename'. $@\n") if $@; + return $module->new; +} + +# +# see OpenBQS specification (http://industry.ebi.ac.uk/openBQS) how +# a date should be coded; +# TBD: this can be improved - checking is missing, timezones, +# converting to UTC... +# Also note that this routine does not convert 'medline_date' - it +# is stored in a separate attribute without ant conversion. +# +sub _convert_date { + my ($date) = @_; + return undef unless + exists $$date{'year'} or + exists $$date{'month'} or + exists $$date{'day'} or + exists $$date{'hour'} or + exists $$date{'minute'} or + exists $$date{'second'}; + + + my $converted = (exists $$date{'year'} ? $$date{'year'} : '0000'); + + if (exists $$date{'month'}) { + $converted .= '-' . $$date{'month'}; + } elsif (exists $$date{'day'}) { + $converted .= '-00'; + } + + if (exists $$date{'day'}) { + $converted .= '-' . $$date{'day'}; + } elsif (exists $$date{'hour'}) { + $converted .= '-00'; + } + + if (exists $$date{'hour'}) { + $converted .= 'T' . $$date{'hour'} . + ':' . (exists $$date{'minute'} ? $$date{'minute'} : '00') . + ':' . (exists $$date{'second'} ? $$date{'second'} : '00') . 'Z'; + } + return $converted; +} + +# $person is a hash with persons attributes - we need to create and +# return a Bio::Biblio::Person object +sub _convert_personal_name { + my ($person) = @_; + foreach my $key (keys %$person) { + $$person{"_$key"} = $$person{$key}; + delete $$person{$key}; + } + new Bio::Biblio::Person (%$person); +} + +# +# takes journal article related attributes from $article and convert +# them into $result and at the end call _convert_article (which is +# shared with book article) +# +sub _convert_journal_article { + my ($result, $source) = @_; + my $article = $$source{'article'}; + + # create and populate both a Journal and a resulting Article objects + my $from_journal = $$article{'journal'}; + my $journal = new Bio::Biblio::MedlineJournal; + $journal->name ($$from_journal{'title'}) if defined $$from_journal{'title'}; + $journal->issn ($$from_journal{'iSSN'}) if defined $$from_journal{'iSSN'}; + $journal->abbreviation ($$from_journal{'iSOAbbreviation'}) if defined $$from_journal{'iSOAbbreviation'}; + $journal->coden ($$from_journal{'coden'}) if defined $$from_journal{'coden'}; + if (defined $$from_journal{'journalIssue'}) { + my $issue = $$from_journal{'journalIssue'}; + $result->volume ($$issue{'volume'}) if defined $$issue{'volume'}; + $result->issue ($$issue{'issue'}) if defined $$issue{'issue'}; + + if (defined $$issue{'pubDate'}) { + my $pub_date = $$issue{'pubDate'}; + my $converted = &_convert_date ($pub_date); + $result->date ($converted) if defined $converted; + + # Some parts of a MEDLINE date are stored just as properties + # because they have almost non-parseable format :-). + $result->medline_date ($$pub_date{'medlineDate'}) if defined $$pub_date{'medlineDate'}; + $result->season ($$pub_date{'season'}) if defined $$pub_date{'season'}; + } + } + + # ...some attributes are in journalInfo (which is outside of the article) + my $journal_info = $$source{'journalInfo'}; + if (defined $journal_info) { + $journal->country ($$journal_info{'country'}) if defined $$journal_info{'country'}; + $journal->medline_ta ($$journal_info{'medlineTA'}) if defined $$journal_info{'medlineTA'}; + $journal->medline_code ($$journal_info{'medlineCode'}) if defined $$journal_info{'medlineCode'}; + $journal->nlm_unique_id ($$journal_info{'nlmUniqueID'}) if defined $$journal_info{'nlmUniqueID'}; + } + + $result->journal ($journal); + &_convert_article ($result, $source); +} + +# +# takes book article related attributes from $article and convert +# them into $result and at the end call _convert_article (which is +# shared with journal article) +# +sub _convert_book_article { + my ($result, $source) = @_; + my $article = $$source{'article'}; + + # create and populate both book and resulting article objects + my $from_book = $$article{'book'}; + my $book = new Bio::Biblio::MedlineBook; + $book->title ($$from_book{'title'}) if defined $$from_book{'title'}; + $book->volume ($$from_book{'volume'}) if defined $$from_book{'volume'}; + $book->series ($$from_book{'collectionTitle'}) if defined $$from_book{'collectionTitle'}; + + if (defined $$from_book{'pubDate'}) { + my $pub_date = $$from_book{'pubDate'}; + my $converted = &_convert_date ($pub_date); + $result->pub_date ($converted) if defined $converted; + + # Some parts of a MEDLINE date are stored just as properties + # because they have almost non-parseable format :-). + $result->medline_date ($$pub_date{'medlineDate'}) if defined $$pub_date{'medlineDate'}; + $result->season ($$pub_date{'season'}) if defined $$pub_date{'season'}; + } + + if (defined $$from_book{'publisher'}) { + my $publisher = new Bio::Biblio::Organisation; + $publisher->name ($$from_book{'publisher'}); + $book->publisher ($publisher); + } + + my @authors = &_convert_providers ($$from_book{'authors'}); + $book->authors (\@authors) if @authors; + + $result->book ($book); + &_convert_article ($result, $source); +} + +# +# takes from $source article related attributes and convert them into +# $article (these attributes are the same both for journal and book +# articles +# +sub _convert_article { + my ($article, $source) = @_; + my $from_article = $$source{'article'}; + + $article->title ($$from_article{'articleTitle'}) if defined $$from_article{'articleTitle'}; + $article->affiliation ($$from_article{'affiliation'}) if defined $$from_article{'affiliation'}; + $article->vernacular_title ($$from_article{'vernacularTitle'}) if defined $$from_article{'vernacularTitle'}; + $article->date_of_electronic_publication + ($$from_article{'dateOfElectronicPublication'}) if defined $$from_article{'dateOfElectronicPublication'}; + + if (defined $$from_article{'pagination'}) { + my $pagination = $$from_article{'pagination'}; + $article->first_page ($$pagination{'startPage'}) if defined $$pagination{'startPage'}; + $article->last_page ($$pagination{'endPage'}) if defined $$pagination{'endPage'}; + $article->medline_page ($$pagination{'medlinePgn'}) if defined $$pagination{'medlinePgn'}; + } + + if (defined $$from_article{'abstract'}) { + my $abstract = $$from_article{'abstract'}; + $article->abstract ($$abstract{'abstractText'}) if defined $$abstract{'abstractText'}; + $article->abstract_type ('text/plain'); + $article->rights ($$abstract{'copyrightInformation'}) if defined $$abstract{'copyrightInformation'}; + } + + if (defined $$from_article{'languages'}) { + my $languages = $$from_article{'languages'}; # ref-array + if ( @{ $languages } > 0) { + $article->language ( $$languages[0] ); + } + if ( @{ $languages } > 1) { + $article->other_languages (join (',', @{ $languages })); + } + } + + my @authors = &_convert_providers ($$from_article{'authors'}); + if (@authors) { + $article->authors (\@authors); + $article->author_list_complete + ($$from_article{'authorListComplete'}) if defined $$from_article{'authorListComplete'}; + } + + # references to database entries are prefixed with database name + # (separated by a slash) + use Bio::Annotation::DBLink; + if (defined $$from_article{'dataBanks'}) { + my $databanks = $$from_article{'dataBanks'}; # a ref-array + my @references; + foreach my $bank ( @{ $databanks } ) { + my $db_name = $$bank{'dataBankName'}; + if (defined $$bank{'accessionNumbers'}) { + foreach my $accn ( @{ $$bank{'accessionNumbers'} } ) { + my $dblink = new Bio::Annotation::DBLink (-primary_id => $accn); + $dblink->database ($db_name); # it does not matter if it is undef + push (@references, $dblink); + } + } + } + if (@references) { + $article->cross_references (\@references); + $article->cross_references_list_complete + ($$from_article{'dataBankListComplete'}) if defined $$from_article{'dataBankListComplete'}; + } + } + + # grants are stored in an array of hashtables (each of the + # hashtables has keys agency, grantID and acronym) + $article->grants ($$from_article{'grants'}) if defined $$from_article{'grants'}; + $article->grant_list_complete + ($$from_article{'grantListComplete'}) if defined $$from_article{'grandListComplete'}; + +} + +# +# takes a ref-array of providers - they can be persons or +# organisations, and returns an array of converted providers +# +sub _convert_providers { + my ($providers) = @_; + return () unless defined $providers; + + my @results; + foreach my $provider ( @{ $providers } ) { + if (defined $$provider{'personalName'}) { + my $converted = &_convert_personal_name ($$provider{'personalName'}); + push (@results, $converted) if defined $converted; + } elsif (defined $$provider{'collectiveName'}) { + push (@results, new Bio::Biblio::Organisation (-name => $$provider{'collectiveName'})); + } else { + new Bio::Biblio::Provider; + } + } + return () unless @results; + return @results; +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/IO/medlinexml.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO/medlinexml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,744 @@ +# $Id: medlinexml.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module Bio::Biblio::IO::medlinexml.pm +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO::medlinexml - A converter of XML files with MEDLINE citations + +=head1 SYNOPSIS + +Do not use this object directly, it is recommended to access it and use +it through the I module: + + use Bio::Biblio::IO; + my $io = new Bio::Biblio::IO (-format => 'medlinexml'); + +=head1 DESCRIPTION + +This object reads bibliographic citations in XML/MEDLINE format and +converts them into I objects. It is an +implementation of methods defined in I. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The main documentation details are to be found in +L. + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::IO::medlinexml; +use vars qw(@ISA $VERSION $Revision); +use vars qw(@Citations $Callback $Convert @ObjectStack @PCDataStack); +use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_OBJ_AND_PEEK_OBJ); +use vars qw(%POP_AND_ADD_ELEMENT %POP_AND_ADD_DATA_ELEMENT); + +use strict; + +use Bio::Biblio::IO; +use XML::Parser; + +@ISA = qw(Bio::Biblio::IO); + +BEGIN { + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: medlinexml.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $; +} + +# ----------------------------------------------------------------------------- + +sub _initialize { + my ($self, @args) = @_; + + # make a hashtable from @args + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + + # copy all @args into this object (overwriting what may already be + # there) - changing '-key' into '_key', and making keys lowercase + my $new_key; + foreach my $key (keys %param) { + ($new_key = $key) =~ s/^-/_/; + $self->{ lc $new_key } = $param { $key }; + } + + # find the format for output - and put it into a global $Convert + # because it will be used by the event handler who knows nothing + # about this object + my $result = $self->{'_result'} || 'medline2ref'; + $result = "\L$result"; # normalize capitalization to lower case + + # a special case is 'raw' when no converting module is loaded + # and citations will be returned as a hashtable (the one which + # is created during parsing XML file/stream) + unless ($result eq 'raw') { + + # load module with output converter - as defined in $result + if (defined &Bio::Biblio::IO::_load_format_module ($result)) { + $Convert = "Bio::Biblio::IO::$result"->new (@args); + } + } + + # create an instance of the XML parser + # (unless it is already there...) + $self->{'_xml_parser'} = new XML::Parser (Handlers => {Init => \&handle_doc_start, + Start => \&handle_start, + End => \&handle_end, + Char => \&handle_char, + Final => \&handle_doc_end}) + unless $self->{'_xml_parser'}; + + # if there is an argument '-callback' then start parsing at once - + # the registered event handlers will use 'callback' to report + # back after each citation + # + # we need to remember this situation also in a global variable + # because the event handler subroutines know nothing about this + # object (unfortunately) + if ($Callback = $self->{'_callback'}) { + $self->_parse; + } +} + +# ----------------------------------------------------------------------------- + +sub _parse { + my ($self) = shift; + + + if (defined $self->{'_file'}) { + $self->{'_xml_parser'}->parsefile ($self->{'_file'}); + } elsif (defined $self->{'_fh'}) { + my $fh = $self->{'_fh'}; + if (ref ($fh) and UNIVERSAL::isa ($fh, 'IO::Handler')) { + $self->{'_xml_parser'}->parse ($fh); + } else { + my $data; + $data .= $_ while <$fh>; + $self->{'_xml_parser'}->parse ($data); + } + } elsif ($self->{'_data'}) { + $self->{'_xml_parser'}->parse ($self->{'_data'}); + } else { + $self->throw ("XML source to be parsed is unknown. Should be given in the new()."); + } + + # when parsing is done all citations have already been delivered + # to the caller using her callbacks - and nothing to be stored + # here, or parser put all citations into global @Cittaions where + # we want to copy there into this instance - so any caller can + # start parsing other XML input without overwriting already read + # citations from the first parser + if (@Citations) { + $self->{'_citations'} = []; + foreach my $cit (@Citations) { + push (@{ $self->{'_citations'} }, $cit); + undef $cit; + } + undef @Citations; + } +} + +# --------------------------------------------------------------------- +# +# Here is an implementation of Bio::Biblio::IO methods +# +# --------------------------------------------------------------------- + +# global variables used by the XML event handlers +# TBD: make them accessible at least ONLY from this module... +@Citations = (); +$Callback = undef; +$Convert = undef; +@ObjectStack = (); # it has Hash-Ref elements +@PCDataStack = (); # it has String elements + +sub next_bibref { + my ($self) = @_; + $self->throw ("Method 'next_bibref' should not be called when a '-callback' argument given.") + if $self->{'_callback'}; + + # parse the whole input into memory (global @Citations) + # and then copy it into this object + $self->_parse unless $self->{'_citations'}; + + # return the next citation (and forget it here) + shift (@{ $self->{'_citations'} }); +} + +# --------------------------------------------------------------------- +# +# Here are the event handlers (they do the real job!) +# +# Note that these methods do not know anything about the object they +# are part of - they are called as subroutines. not as methods. +# It also means that they need to use global variables to store and +# exchnage intermediate results. +# +# --------------------------------------------------------------------- + +# +# This is a list of #PCDATA elements. +# +%PCDATA_NAMES = ( + 'AbstractText' => 1, + 'AccessionNumber' => 1, + 'Acronym' => 1, + 'Affiliation' => 1, + 'Agency' => 1, + 'ArticleTitle' => 1, + 'CASRegistryNumber' => 1, + 'CitationSubset' => 1, + 'Coden' => 1, + 'CollectionTitle' => 1, + 'CollectiveName' => 1, + 'CopyrightInformation' => 1, + 'Country' => 1, + 'DataBankName' => 1, + 'DateOfElectronicPublication' => 1, + 'Day' => 1, + 'Descriptor' => 1, + 'DescriptorName' => 1, + 'EndPage' => 1, + 'FirstName' => 1, + 'ForeName' => 1, + 'GeneralNote' => 1, + 'GeneSymbol' => 1, + 'GrantID' => 1, + 'Hour' => 1, + 'ISOAbbreviation' => 1, + 'ISSN' => 1, + 'Initials' => 1, + 'Issue' => 1, + 'Keyword' => 1, + 'Language' => 1, + 'LastName' => 1, + 'MedlineCode' => 1, + 'MedlineDate' => 1, + 'MedlineID' => 1, + 'MedlinePgn' => 1, + 'MedlineTA' => 1, + 'MiddleName' => 1, + 'Minute' => 1, + 'Month' => 1, + 'NameOfSubstance' => 1, + 'NlmUniqueID' => 1, + 'Note' => 1, + 'NumberOfReferences' => 1, + 'OtherID' => 1, + 'PMID' => 1, + 'PublicationType' => 1, + 'Publisher' => 1, + 'QualifierName' => 1, + 'RefSource' => 1, + 'RegistryNumber' => 1, + 'Season' => 1, + 'Second' => 1, + 'SpaceFlightMission' => 1, + 'StartPage' => 1, + 'SubHeading' => 1, + 'Suffix' => 1, + 'Title' => 1, + 'VernacularTitle' => 1, + 'Volume' => 1, + 'Year' => 1, + ); + +%SIMPLE_TREATMENT = ( + 'MeshHeading' => 1, + 'Author' => 1, + 'Article' => 1, + 'Book' => 1, + 'Investigator' => 1, + 'Chemical' => 1, + 'Pagination' => 1, + 'MedlineJournalInfo' => 1, + 'JournalIssue' => 1, + 'Journal' => 1, + 'DateCreated' => 1, + 'DateCompleted' => 1, + 'DateRevised' => 1, + 'PubDate' => 1, + 'Abstract' => 1, + 'Grant' => 1, + 'CommentsCorrections' => 1, + 'CommentOn' => 1, + 'CommentIn' => 1, + 'ErratumFor' => 1, + 'ErratumIn' => 1, + 'OriginalReportIn' => 1, + 'RepublishedFrom' => 1, + 'RepublishedIn' => 1, + 'RetractionOf' => 1, + 'RetractionIn' => 1, + 'SummaryForPatientsIn' => 1, + 'UpdateIn' => 1, + 'UpdateOf' => 1, + 'DataBank' => 1, + 'KeywordList' => 1, + 'DeleteCitation' => 1, + ); + +%POP_DATA_AND_PEEK_OBJ = ( + 'Descriptor' => 1, + 'DescriptorName' => 1, + 'Year' => 1, + 'Month' => 1, + 'Day' => 1, + 'LastName' => 1, + 'Initials' => 1, + 'FirstName' => 1, + 'ForeName' => 1, + 'NameOfSubstance' => 1, + 'RegistryNumber' => 1, + 'CASRegistryNumber' => 1, + 'MiddleName' => 1, + 'NlmUniqueID' => 1, + 'MedlineTA' => 1, + 'MedlinePgn' => 1, + 'MedlineCode' => 1, + 'Country' => 1, + 'ISSN' => 1, + 'ArticleTitle' => 1, + 'Issue' => 1, + 'AbstractText' => 1, + 'VernacularTitle' => 1, + 'GrantID' => 1, + 'Agency' => 1, + 'Acronym' => 1, + 'MedlineDate' => 1, + 'NumberOfReferences' => 1, + 'RefSource' => 1, + 'DataBankName' => 1, + 'CopyrightInformation' => 1, + 'Suffix' => 1, + 'Note' => 1, + 'CollectiveName' => 1, + 'Hour' => 1, + 'Minute' => 1, + 'Second' => 1, + 'Season' => 1, + 'Coden' => 1, + 'ISOAbbreviation' => 1, + 'Publisher' => 1, + 'CollectionTitle' => 1, + 'DateOfElectronicPublication' => 1, + 'StartPage' => 1, + 'EndPage' => 1, + 'Volume' => 1, + 'Title' => 1, + ); + +%POP_OBJ_AND_PEEK_OBJ = ( + 'Pagination' => 1, + 'JournalIssue' => 1, + 'Journal' => 1, + 'DateCreated' => 1, + 'Article' => 1, + 'DateCompleted' => 1, + 'DateRevised' => 1, + 'CommentsCorrections' => 1, + 'Book' => 1, + 'PubDate' => 1, + 'Abstract' => 1, + ); + +%POP_AND_ADD_DATA_ELEMENT = ( + 'Keyword' => 'keywords', + 'PublicationType' => 'publicationTypes', + 'CitationSubset' => 'citationSubsets', + 'Language' => 'languages', + 'AccessionNumber' => 'accessionNumbers', + 'GeneSymbol' => 'geneSymbols', + 'SpaceFlightMission' => 'spaceFlightMissions', + ); + + +%POP_AND_ADD_ELEMENT = ( + 'OtherAbstract' => 'otherAbstracts', + 'Chemical' => 'chemicals', + 'KeywordList' => 'keywordLists', + 'Grant' => 'grants', + 'UpdateIn' => 'updateIns', + 'CommentOn' => 'commentOns', + 'CommentIn' => 'commentIns', + 'DataBank' => 'dataBanks', + 'PersonalNameSubject' => 'personalNameSubjects', + 'ErratumFor' => 'erratumFors', + 'ErratumIn' => 'erratumIns', + 'RepublishedFrom' => 'republishedFroms', + 'RepublishedIn' => 'republishedIns', + 'RetractionOf' => 'retractionOfs', + 'RetractionIn' => 'retractionIns', + 'UpdateOf' => 'updateOfs', + 'OriginalReportIn' => 'originalReportIns', + 'SummaryForPatientsIn' => 'summaryForPatientsIns', + 'MeshHeading' => 'meshHeadings', + ); + +sub handle_doc_start { + @Citations = (); + @ObjectStack = (); + @PCDataStack = (); +} + +sub handle_doc_end { + undef @ObjectStack; + undef @PCDataStack; +} + +sub handle_char { + my ($expat, $str) = @_; + + # this may happen with whitespaces between tags; + # but because I have not created an entry for data on the stack + # I can also ignore such data, can't I + return if $#PCDataStack < 0; + + $PCDataStack [$#PCDataStack] .= $str; +} + + + + +=head2 VERSION and Revision + + Usage : print $Bio::Biblio::IO::medlinexml::VERSION; + print $Bio::Biblio::IO::medlinexml::Revision; + +=cut + + +sub handle_start { + my ($expat, $e, %attrs) = @_; +# &_debug_object_stack ("START", $e); + + # + # The #PCDATA elements which have an attribute list must + # be first here - because for them I create entries both on + # the @PCDataStack _and_ on @ObjectStack. + # + if ($e eq 'QualifierName' or + $e eq 'SubHeading') { + my %p = (); + $p{'majorTopic'} = "Y" if $attrs{'MajorTopicYN'}; + push (@ObjectStack, \%p); + } + + if ($e eq 'GeneralNote') { + my %p = (); + $p{'owner'} = $attrs{'Owner'} if $attrs{'Owner'}; + push (@ObjectStack, \%p); + } + + if ($e eq 'OtherID') { + my %p = (); + $p{'source'} = $attrs{'Source'}; + push (@ObjectStack, \%p); + } + + # + # A special treatment is for attributes for personal name. + # Because there is no XML element 'PersonalName' I need to + # to put yet another object on @ObjectStack unless there is + # already one. + # + if ($e eq 'LastName' or + $e eq 'FirstName' or + $e eq 'MidleName' or + $e eq 'Initials' or + $e eq 'ForeName' or + $e eq 'Suffix') { + my $peek = $ObjectStack[$#ObjectStack]; + push (@ObjectStack, {'type' => 'PersonalName'}) + unless (ref $peek and &_eq_hash_elem ($peek, 'type', 'PersonalName')); + } + + # + # Then we have #PCDATA elements without an attribute list. + # For them I create an entry on @PCDataStack. + # + if (exists $PCDATA_NAMES{$e}) { + push (@PCDataStack, ''); + + # + # And finally, all non-PCDATA elements go to the objectStack + # + } elsif (exists $SIMPLE_TREATMENT{$e}) { + push (@ObjectStack, {}); + + } elsif ($e eq 'PersonalNameSubject') { + push (@ObjectStack, {'type' => 'PersonalName'}); + + } elsif ($e eq 'DescriptorName' or + $e eq 'Descriptor') { + if (&_eq_hash_elem (\%attrs, 'MajorTopicYN', "Y")) { + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{'descriptorMajorTopic'} = "Y"; + } + + } elsif ($e eq 'MedlineCitation' || + $e eq 'NCBIArticle') { + my %p = ( 'type' => 'MedlineCitation' ); + $p{'owner'} = $attrs{'Owner'} if $attrs{'Owner'}; + $p{'status'} = $attrs{'Status'} if $attrs{'Status'}; + push (@ObjectStack, \%p); + + } elsif ($e eq 'GrantList') { + if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{'grantListComplete'} = "N"; + } + + } elsif ($e eq 'DataBankList') { + if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{'dataBankListComplete'} = "N"; + } + + } elsif ($e eq 'AuthorList') { + if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{'authorListComplete'} = "N"; + } + + } elsif ($e eq 'OtherAbstract') { + my %p = (); + $p{'type'} = $attrs{'Type'} if $attrs{'Type'}; + push (@ObjectStack, \%p); +# push (@ObjectStack, { 'type' => 'Abstract' }); + + } +} + +sub handle_end { + my ($expat, $e) = @_; + # + # First I have to deal with those elements which are both PCDATA + # (and therefore they are on the pcdataStack) and which have an + # attribute list (therefore they are also known as a separate + # p-object on the objectStack. + # + if ($e eq 'QualifierName' or + $e eq 'SubHeading') { + my $p = pop @ObjectStack; # pSubHeading + $$p{'subHeading'} = pop @PCDataStack; + &_add_element ('subHeadings', $p); # adding to pMeshHeadings +# &_debug_object_stack ("END", $e); + return; + + } elsif ($e eq 'GeneralNote') { + my $p = pop @ObjectStack; # pGeneralNote + $$p{'generalNote'} = pop @PCDataStack; + &_add_element ('generalNotes', $p); # adding to pMedlineCitation +# &_debug_object_stack ("END", $e); + return; + + } elsif ($e eq 'OtherID') { + my $p = pop @ObjectStack; # pOtherID + $$p{'otherID'} = pop @PCDataStack; + &_add_element ('otherIDs', $p); # adding to pMedlineCitation +# &_debug_object_stack ("END", $e); + return; + } + + # + # both object and pcdata stacks elements mixed here together + # (the element names appear in the order of frequency in the + # medline data set) + # + + if (exists $POP_DATA_AND_PEEK_OBJ{$e}) { + &_data2obj ("\l$e"); + + } elsif (exists $POP_OBJ_AND_PEEK_OBJ{$e}) { + &_obj2obj ("\l$e"); + + } elsif (exists $POP_AND_ADD_ELEMENT{$e}) { + &_add_element ($POP_AND_ADD_ELEMENT{$e}, pop @ObjectStack); + + } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) { + &_add_element ($POP_AND_ADD_DATA_ELEMENT{$e}); + + } elsif ($e eq 'Author' or + $e eq 'Investigator') { + my $pAuthor; + my $p = pop @ObjectStack; # pPersonalName or pAuthor + if (&_eq_hash_elem ($p, 'type', 'PersonalName')) { + $pAuthor = pop @ObjectStack; + $$pAuthor{'personalName'} = $p; + } else { + $pAuthor = $p; + } + my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pArticle or pBook + if (&_eq_hash_elem ($peek, 'type', 'MedlineCitation')) { + &_add_element ('investigators', $pAuthor); + } else { + &_add_element ('authors', $pAuthor); + } + + } elsif ($e eq 'MedlineJournalInfo') { + &_obj2obj ('journalInfo'); + + } elsif ($e eq 'PMID') { + my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pReference or pDeleteCitation + if (&_eq_hash_elem ($peek, 'type', 'DeleteCitation')) { + &_add_element ('PMIDs'); + } else { + $$peek{'PMID'} = pop @PCDataStack; + } + + } elsif ($e eq 'MedlineID') { + my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pReference or pDeleteCitation + if (&_eq_hash_elem ($peek, 'type', 'DeleteCitation')) { + &_add_element ('MedlineIDs'); + } else { + $$peek{'medlineID'} = pop @PCDataStack; + } + +# } elsif ($e eq 'OtherAbstract') { +# my $pAbstract = pop @ObjectStack; +# my $pOtherAbstract = pop @ObjectStack; +# $$pOtherAbstract{'abstract'} = $pAbstract +# &_add_element ('otherAbstracts', $pOtherAbstract); + + } elsif ($e eq 'Affiliation') { + my $peek = $ObjectStack[$#ObjectStack]; + if (&_eq_hash_elem ($peek, 'type', 'PersonalName')) { + my $peek2 = $ObjectStack[$#ObjectStack - 1]; + $$peek2{'affiliation'} = pop @PCDataStack; + } else { + $$peek{'affiliation'} = pop @PCDataStack; + } + + } elsif ($e eq 'DeleteCitation') { + pop @ObjectStack; +### warn ("'DeleteCitation' tag found. Not known what to do with it."); # silently ignored + + } elsif ($e eq 'MedlineCitation') { + + # + # Here we finally have the whole citation ready. + # + &_process_citation (pop @ObjectStack); + + # + # ERROR: if we are here, there was an unexpected element + # + } elsif (exists $PCDATA_NAMES{$e}) { + pop @PCDataStack; + warn ("An unexpected element found: $e"); + } +# &_debug_object_stack ("END", $e); + +} + +# what to do when we have the whole $citation ready +sub _process_citation { + my ($citation) = @_; + $citation = $Convert->convert ($citation) if defined $Convert; + + if ($Callback) { + &$Callback ($citation); + } else { + push (@Citations, $citation); + } +} + +# add $element into an array named $key to the top object at @ObjectStack; +# if $element is empty, take it from @PCDataStack +sub _add_element { + my ($key, $element) = @_; + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{$key} = [] unless $$peek{$key}; + push (@{ $$peek{$key} }, (defined $element ? $element : pop @PCDataStack)); +} + +# remove top of @PCDataStack and put it into top object at @ObjectStack under name $key +sub _data2obj { + my ($key) = @_; + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{$key} = pop @PCDataStack; +} + +# remove top of @ObjectStack and put it into now-top at @ObjectStack under name $key +sub _obj2obj { + my ($key) = @_; + my $p = pop @ObjectStack; + my $peek = $ObjectStack[$#ObjectStack]; + $$peek{$key} = $p; +} + +# check if a $key exists in a ref-hash $rh and if it is equal to $value +sub _eq_hash_elem { + my ($rh, $key, $value) = @_; + return (defined $$rh{$key} and $$rh{$key} eq $value); +} + +# +# --- only for debugging +# +use vars qw(%DEBUGSTACK); +%DEBUGSTACK = (); +sub _debug_object_stack { + my ($action, $element) = @_; + if ($action =~ /^START/o) { + $DEBUGSTACK{$element} = (@ObjectStack+0); + } else { + return if $element eq 'LastName'; + print "Element $element starts on " . + $DEBUGSTACK{$element} . 'and ends on ' . (@ObjectStack+0) . "\n" + if $DEBUGSTACK{$element} != (@ObjectStack+0); + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/IO/pubmed2ref.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO/pubmed2ref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,144 @@ +# $Id: pubmed2ref.pm,v 1.2 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module Bio::Biblio::IO::pubmed2ref.pm +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO::pubmed2ref - A converter of a raw hash to PUBMED citations + +=head1 SYNOPSIS + + # to be written + +=head1 DESCRIPTION + + # to be written + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::IO::pubmed2ref; + +use strict; +use vars qw(@ISA $VERSION $Revision); + +use Bio::Biblio::IO::medline2ref; +@ISA = qw(Bio::Biblio::IO::medline2ref); + +BEGIN { + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: pubmed2ref.pm,v 1.2 2002/10/22 07:45:13 lapp Exp $; +} + +# --------------------------------------------------------------------- +# +# Here is the core... +# +# --------------------------------------------------------------------- + +sub _load_instance { + my ($self, $source) = @_; + + my $result; + my $article = $$source{'article'}; + if (defined $article) { + if (defined $$article{'journal'}) { + $result = $self->_new_instance ('Bio::Biblio::PubmedJournalArticle'); + $result->type ('JournalArticle'); + } elsif (defined $$article{'book'}) { + $result = $self->_new_instance ('Bio::Biblio::PubmedBookArticle'); + $result->type ('BookArticle'); + } else { + $result->type ('PubmedArticle'); + } + } + $result = $self->_new_instance ('Bio::Biblio::Ref') unless defined $result; + return $result; +} + +sub convert { + my ($self, $source) = @_; + my $result = $self->SUPER::convert ($source->{'Citation'}); + + # here we do PUBMED's specific stuff + my $pubmed_data = $$source{'PubmedData'}; + if (defined $pubmed_data) { + + # ... just take it (perhaps rename it) + $result->pubmed_status ($$pubmed_data{'publicationStatus'}) if defined $$pubmed_data{'publicationStatus'}; + $result->pubmed_provider_id ($$pubmed_data{'providerId'}) if defined $$pubmed_data{'providerId'}; + $result->pubmed_article_id_list ($$pubmed_data{'pubmedArticleIds'}) if defined $$pubmed_data{'pubmedArticleIds'}; + $result->pubmed_url_list ($$pubmed_data{'pubmedURLs'}) if defined $$pubmed_data{'pubmedURLs'}; + + # ... put all dates from all 'histories' into one array + if (defined $$pubmed_data{'histories'}) { + my @history_list; + foreach my $history ( @{ $$pubmed_data{'histories'} } ) { + my $ra_pub_dates = $$history{'pubDates'}; + foreach my $pub_date ( @{ $ra_pub_dates } ) { + my %history = (); + my $converted_date = &Bio::Biblio::IO::medline2ref::_convert_date ($pub_date); + $history{'date'} = $converted_date if defined $converted_date; + $history{'pub_status'} = $$pub_date{'pubStatus'} if defined $$pub_date{'pubStatus'}; + push (@history_list, \%history); + } + } + $result->pubmed_history_list (\@history_list); + } + } + + # Done! + return $result; +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/IO/pubmedxml.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO/pubmedxml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +# $Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module Bio::Biblio::IO::pubmedxml.pm +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations + +=head1 SYNOPSIS + +Do not use this object directly, it is recommended to access it and use +it through the I module: + + use Bio::Biblio::IO; + my $io = new Bio::Biblio::IO (-format => 'pubmedxml'); + +=head1 DESCRIPTION + +This object reads bibliographic citations in XML/MEDLINE format and +converts them into I objects. It is an +implementation of methods defined in I. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The main documentation details are to be found in +L. + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::IO::pubmedxml; +use vars qw(@ISA $VERSION $Revision); +use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT); + +use strict; + +use Bio::Biblio::IO::medlinexml; + +@ISA = qw(Bio::Biblio::IO::medlinexml); + +BEGIN { + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $; +} + +sub _initialize { + my ($self, @args) = @_; + + # make a hashtable from @args + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + + # copy all @args into this object (overwriting what may already be + # there) - changing '-key' into '_key', and making keys lowercase + my $new_key; + foreach my $key (keys %param) { + ($new_key = $key) =~ s/^-/_/; + $self->{ lc $new_key } = $param { $key }; + } + + # find the format for output - and put it into a global $Convert + # because it will be used by the event handler who knows nothing + # about this object + my $result = $self->{'_result'} || 'pubmed2ref'; + $result = "\L$result"; # normalize capitalization to lower case + + # a special case is 'raw' when no converting module is loaded + # and citations will be returned as a hashtable (the one which + # is created during parsing XML file/stream) + unless ($result eq 'raw') { + + # load module with output converter - as defined in $result + if (defined &Bio::Biblio::IO::_load_format_module ($result)) { + $Bio::Biblio::IO::medlinexml::Convert = "Bio::Biblio::IO::$result"->new (@args); + } + } + + # create an instance of the XML parser + # (unless it is already there...) + $self->{'_xml_parser'} = new XML::Parser (Handlers => {Init => \&Bio::Biblio::IO::medlinexml::handle_doc_start, + Start => \&handle_start, + End => \&handle_end, + Char => \&Bio::Biblio::IO::medlinexml::handle_char, + Final => \&Bio::Biblio::IO::medlinexml::handle_doc_end}) + unless $self->{'_xml_parser'}; + + # if there is an argument '-callback' then start parsing at once - + # the registered event handlers will use 'callback' to report + # back after each citation + # + # we need to remember this situation also in a global variable + # because the event handler subroutines know nothing about this + # object (unfortunately) + if ($SUPER::Callback = $self->{'_callback'}) { + $self->_parse; + } +} + +# --------------------------------------------------------------------- +# +# Here are the event handlers (they do the real job!) +# +# Note that these methods do not know anything about the object they +# are part of - they are called as subroutines. not as methods. +# It also means that they need to use global variables to store and +# exchnage intermediate results. +# +# --------------------------------------------------------------------- + +# +# This is a list of #PCDATA elements. +# +%PCDATA_NAMES = + ( + 'PublicationStatus' => 1, + 'ProviderId' => 1, + 'ArticleId' => 1, + 'URL' => 1, + ); + +%SIMPLE_TREATMENT = + ( + 'History' => 1, + 'PubMedArticle' => 1, + 'PubmedArticle' => 1, + 'PubmedData' => 1, + ); + +%POP_DATA_AND_PEEK_OBJ = + ( + 'Year' => 1, + 'Month' => 1, + 'Day' => 1, + 'Hour' => 1, + 'Minute' => 1, + 'Second' => 1, + 'ProviderId' => 1, + 'PublicationStatus' => 1, + ); + +%POP_AND_ADD_DATA_ELEMENT = + ( + 'PubMedPubDate' => 'pubDates', + 'History' => 'histories', + ); + + +=head2 VERSION and Revision + + Usage : print $Bio::Biblio::IO::pubmedxml::VERSION; + print $Bio::Biblio::IO::pubmedxml::Revision; + +=cut + + +sub handle_start { + my ($expat, $e, %attrs) = @_; +# &Bio::Biblio::IO::medlinexml::_debug_object_stack ("START", $e); + + # + # The #PCDATA elements which have an attribute list must + # be first here - because for them I create entries both on + # the @PCDataStack _and_ on @ObjectStack. + # + if ($e eq 'ArticleId') { + my %p = (); + $p{'idType'} = (defined $attrs{'IdType'} ? $attrs{'IdType'} : 'pubmed'); + push (@Bio::Biblio::IO::medlinexml::ObjectStack, \%p); + } + + if ($e eq 'URL') { + my %p = (); + $p{'type'} = $attrs{'type'} if $attrs{'type'}; + $p{'lang'} = $attrs{'lang'} if $attrs{'lang'}; + push (@Bio::Biblio::IO::medlinexml::ObjectStack, \%p); + } + + # + # Then we have #PCDATA elements without an attribute list. + # For them I create an entry on @PCDataStack. + # + if (exists $PCDATA_NAMES{$e}) { + push (@Bio::Biblio::IO::medlinexml::PCDataStack, ''); + + # + # And finally, all non-PCDATA elements go to the objectStack + # + } elsif (exists $SIMPLE_TREATMENT{$e}) { + push (@Bio::Biblio::IO::medlinexml::ObjectStack, {}); + + } elsif ($e eq 'ArticleIdList') { + ; + + } elsif ($e eq 'PubMedPubDate') { + my %p = (); + $p{'pubStatus'} = $attrs{'PubStatus'} if $attrs{'PubStatus'}; + push (@Bio::Biblio::IO::medlinexml::ObjectStack, \%p); + + } else { + &Bio::Biblio::IO::medlinexml::handle_start ($expat, $e, %attrs); + } +} + +sub handle_end { + my ($expat, $e) = @_; + + # + # First I have to deal with those elements which are both PCDATA + # (and therefore they are on the pcdataStack) and which have an + # attribute list (therefore they are also known as a separate + # p-object on the objectStack. + # + if ($e eq 'ArticleId') { + &Bio::Biblio::IO::medlinexml::_data2obj ('id'); + &Bio::Biblio::IO::medlinexml::_add_element ('pubmedArticleIds', pop @Bio::Biblio::IO::medlinexml::ObjectStack); +# &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e); + return; + } + + if ($e eq 'URL') { + &Bio::Biblio::IO::medlinexml::_data2obj ('URL'); + &Bio::Biblio::IO::medlinexml::_add_element ('pubmedURLs', pop @Bio::Biblio::IO::medlinexml::ObjectStack); +# &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e); + return; + } + + + # + # both object and pcdata stacks elements mixed here together + # + + if (exists $POP_DATA_AND_PEEK_OBJ{$e}) { + &Bio::Biblio::IO::medlinexml::_data2obj ("\l$e"); + + } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) { + &Bio::Biblio::IO::medlinexml::_add_element ($POP_AND_ADD_DATA_ELEMENT{$e}, pop @Bio::Biblio::IO::medlinexml::ObjectStack); + + } elsif ($e eq 'MedlineCitation' || + $e eq 'NCBIArticle') { + &Bio::Biblio::IO::medlinexml::_obj2obj ('Citation'); + + } elsif ($e eq 'PubmedData') { + &Bio::Biblio::IO::medlinexml::_obj2obj ('PubmedData'); + + } elsif ($e eq 'PubMedArticle' || + $e eq 'PubmedArticle') { + + # + # Here we finally have the whole citation ready. + # + &Bio::Biblio::IO::medlinexml::_process_citation (pop @Bio::Biblio::IO::medlinexml::ObjectStack); + + } else { + &Bio::Biblio::IO::medlinexml::handle_end ($expat, $e); + } + +# &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e); + +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Journal.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Journal.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,131 @@ +# $Id: Journal.pm,v 1.6 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Journal +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Journal - Representation of a journal + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Journal (-name => 'The Perl Journal', + -issn => '1087-903X'); + --- OR --- + + $obj = new Bio::Biblio::Journal; + $obj->issn ('1087-903X'); + +=head1 DESCRIPTION + +A storage object for a journal. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + abbreviation + issn + name + provider type: Bio::Biblio::Provider + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + +package Bio::Biblio::Journal; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::BiblioBase; + +@ISA = qw(Bio::Biblio::BiblioBase); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _abbreviation => undef, + _issn => undef, + _name => undef, + _provider => 'Bio::Biblio::Provider', + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr}; + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + $_allowed{$attr}; + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/JournalArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/JournalArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,138 @@ +# $Id: JournalArticle.pm,v 1.7 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::JournalArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::JournalArticle - Representation of a journal article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::JournalArticle (-title = 'Come to grief', + -journal => new Bio::Biblio::Journal); + --- OR --- + + $obj = new Bio::Biblio::JournalArticle; + $obj->title ('Come to grief'); + $obj->journal (new Bio::Biblio::Journal (-name => 'English Mysteries')); + +=head1 DESCRIPTION + +A storage object for a journal article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + issue + issue_supplement + journal type: Bio::Biblio::Journal + volume + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::JournalArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Article; + +@ISA = qw(Bio::Biblio::Article); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _issue => undef, + _issue_supplement => undef, + _journal => 'Bio::Biblio::Journal', + _volume => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/MedlineArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/MedlineArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,205 @@ +# $Id: MedlineArticle.pm,v 1.5 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::MedlineArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::MedlineArticle - Representation of a MEDLINE article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::MedlineArticle (-mesh_headings => ...); + + # how are Mesh terms stored: + use Data::Dumper; + print Data::Dumper->Dump ( [$obj->mesh_headings], ['MeshHeadings']); + + It produces (something like) this: + 'MeshHeadings' => [ + { 'descriptorName' => 'Adult' }, + { 'descriptorName' => 'Cardiovascular Diseases', + 'subHeadings' => [ { 'subHeading' => 'etiology' }, + { 'majorTopic' => 'Y', + 'subHeading' => 'mortality' } ] }, + { 'descriptorName' => 'Child Development', + 'subHeadings' => [ { 'majorTopic' => 'Y', + 'subHeading' => 'physiology' } ] }, + { 'descriptorName' => 'Human' }, + ] + +=head1 DESCRIPTION + +A storage object for a MEDLINE article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + affiliation + chemicals type: array ref of hashes + citation_owner + comment_ins type: array ref of hashes + comment_ons type: array ref of hashes + date_of_electronic_publication + erratum_fors type: array ref of hashes + erratum_in type: array ref of hashes + gene_symbols + general_notes type: array ref of hashes + grant_list_complete + grants type: array ref of hashes + medline_date + medline_id + medline_page + mesh_headings type: array ref of hashes + number_of_references + original_report_ins type: array ref of hashes + other_abstracts type: array ref of hashes + other_ids type: array ref of hashes + other_languages + pmid + republished_froms type: array ref of hashes + republished_ins type: array ref of hashes + retraction_ins type: array ref of hashes + retraction_ofs type: array ref of hashes + season + status + summary_for_patients_ins type: array ref of hashes + update_ins type: array ref of hashes + update_ofs type: array ref of hashes + vernacular_title + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::MedlineArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Article; + +@ISA = qw(Bio::Biblio::Article); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _affiliation => undef, + _chemicals => 'ARRAY', + _citation_owner => undef, + _comment_ins => 'ARRAY', + _comment_ons => 'ARRAY', + _date_of_electronic_publication => undef, + _erratum_fors => 'ARRAY', + _erratum_ins => 'ARRAY', + _gene_symbols => undef, + _general_notes => 'ARRAY', + _grant_list_complete => undef, + _grants => 'ARRAY', + _medline_date => undef, + _medline_id => undef, + _medline_page => undef, + _mesh_headings => 'ARRAY', + _number_of_references => undef, + _original_report_ins => 'ARRAY', + _other_abstracts => 'ARRAY', + _other_ids => 'ARRAY', + _other_languages => undef, + _pmid => undef, + _republished_froms => 'ARRAY', + _republished_ins => 'ARRAY', + _retraction_ins => 'ARRAY', + _retraction_ofs => 'ARRAY', + _season => undef, + _status => undef, + _summary_for_patients_ins => 'ARRAY', + _update_ins => 'ARRAY', + _update_ofs => 'ARRAY', + _vernacular_title => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/MedlineBook.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/MedlineBook.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,131 @@ +# $Id: MedlineBook.pm,v 1.3 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::MedlineBook +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::MedlineBook - Representation of a MEDLINE book + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::MedlineBook + (-editor => new Bio::Biblio::Person + (-lastname => 'Loukides'), + -isbn => '0-596-00068-5'); + --- OR --- + + $obj = new Bio::Biblio::MedlineBook; + $obj->isbn ('0-596-00068-5'); + +=head1 DESCRIPTION + +A storage object for a MEDLINE book. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +There are no specific attributes in this class +(however, you can set and get all attributes defined in the parent classes). +The main raison d'etre of this class is to be associated with MEDLINE book articles. + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::MedlineBook; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Book; + +@ISA = qw(Bio::Biblio::Book); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/MedlineBookArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/MedlineBookArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +# $Id: MedlineBookArticle.pm,v 1.3 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::MedlineBookArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::MedlineBookArticle - Representation of a MEDLINE book article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::MedlineBookArticle + (-title => 'Getting started'. + -book => new Bio::Biblio::MedlineBook); + --- OR --- + + $obj = new Bio::Biblio::MedlineBookArticle; + $obj->title ('Getting started'); + +=head1 DESCRIPTION + +A storage object for a MEDLINE book. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + book type: Bio::Biblio::MedlineBook + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::MedlineBookArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::BookArticle; +use Bio::Biblio::MedlineArticle; + +@ISA = qw(Bio::Biblio::BookArticle Bio::Biblio::MedlineArticle); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _book => 'Bio::Biblio::MedlineBook', + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + return 1 if exists $_allowed{$attr}; + foreach my $parent (@ISA) { + return 1 if $parent->_accessible ($attr); + } + } + + # return an expected type of given $attr + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + foreach my $parent (@ISA) { + if ($parent->_accessible ($attr)) { + return $parent->_attr_type ($attr); + } + } + } + return 'unknown'; + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/MedlineJournal.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/MedlineJournal.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,137 @@ +# $Id: MedlineJournal.pm,v 1.3 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::MedlineJournal +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::MedlineJournal - Representation of a MEDLINE journal + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::MedlineJournal + (-medline_ta => 'J Vasc Interv Radiol'); + --- OR --- + + $obj = new Bio::Biblio::MedlineJournal; + $obj->medline_ta ('J Vasc Interv Radiol'); + +=head1 DESCRIPTION + +A storage object for a MEDLINE journal. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + coden + country + medline_code + medline_ta + nlm_unique_id + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + +package Bio::Biblio::MedlineJournal; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Journal; + +@ISA = qw(Bio::Biblio::Journal); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _coden => undef, + _country => undef, + _medline_code => undef, + _medline_ta => undef, + _nlm_unique_id => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/MedlineJournalArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/MedlineJournalArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,144 @@ +# $Id: MedlineJournalArticle.pm,v 1.3 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::MedlineJournalArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::MedlineJournalArticle - Representation of a MEDLINE journal article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::MedlineJournalArticle ( + -title => 'Thermal adaptation analyzed by comparison of protein sequences from mesophilic and extremely thermophilic Methanococcus species.', + -journal => new Bio::Biblio::MedlineJournal (-issn => '0027-8424'), + -volume => 96, + -issue => 7); + --- OR --- + + $obj = new Bio::Biblio::MedlineJournalArticle; + $obj->title ('...'); + $obj->journal (new Bio::Biblio::MedlineJournal (-issn => '0027-8424')); + +=head1 DESCRIPTION + +A storage object for a MEDLINE journal article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + journal type: Bio::Biblio::MedlineJournal + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::MedlineJournalArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::JournalArticle; +use Bio::Biblio::MedlineArticle; + +@ISA = qw(Bio::Biblio::MedlineArticle Bio::Biblio::JournalArticle); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _journal => 'Bio::Biblio::MedlineJournal', + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + return 1 if exists $_allowed{$attr}; + foreach my $parent (@ISA) { + return 1 if $parent->_accessible ($attr); + } + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + foreach my $parent (@ISA) { + if ($parent->_accessible ($attr)) { + return $parent->_attr_type ($attr); + } + } + } + return 'unknown'; + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Organisation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Organisation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,129 @@ +# $Id: Organisation.pm,v 1.7 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Organisation +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Organisation - Representation of an organisation + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Organisation (-name => 'O\'Reilly'); + + --- OR --- + + $obj = new Bio::Biblio::Organisation; + $obj->name ('O\'Reilly'); + +=head1 DESCRIPTION + +A storage object for an organisation related to a bibliographic resource. + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + name + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Organisation; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Provider; + +@ISA = qw(Bio::Biblio::Provider); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _name => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Patent.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Patent.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,135 @@ +# $Id: Patent.pm,v 1.7 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Patent +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Patent - Representation of a patent + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Patent (-doc_number => '1-2-3-4-5'); + + --- OR --- + + $obj = new Bio::Biblio::Patent; + $obj->doc_number ('1-2-3-4-5'); + +=head1 DESCRIPTION + +A storage object for a patent. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + doc_number + doc_office + doc_type + applicants type: array ref of Bio::Biblio::Providers + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Patent; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw(Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = ( + _doc_number => undef, + _doc_office => undef, + _doc_type => undef, + _applicants => 'ARRAY', + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Person.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Person.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,146 @@ +# $Id: Person.pm,v 1.8 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Person +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Person - Representation of a person + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Person (-lastname => 'Capek', + -firstname => 'Karel'); + --- OR --- + + $obj = new Bio::Biblio::Person; + $obj->firstname ('Karel'); + $obj->lastname ('Capek'); + +=head1 DESCRIPTION + +A storage object for a person related to a bibliographic resource. + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + affiliation + email + firstname + forename + initials + lastname + middlename + postal_address + suffix + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk) +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Person; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Provider; + +@ISA = qw( Bio::Biblio::Provider); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _affiliation => undef, + _email => undef, + _firstname => undef, + _forename => undef, + _initials => undef, + _lastname => undef, + _middlename => undef, + _postal_address => undef, + _suffix => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Proceeding.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Proceeding.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,133 @@ +# $Id: Proceeding.pm,v 1.6 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Proceeding +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Proceeding - Representation of a conference proceeding + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Proceeding (-title => 'JavaONE'); + + --- OR --- + + $obj = new Bio::Biblio::Proceeding; + $obj->title ('JavaONE'); + +=head1 DESCRIPTION + +A storage object for a conference proceeding. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +There are no specific attributes in this class +(however, you can set and get all attributes defined in the parent classes). + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 BUGS AND LIMITATIONS + +This class should be probably somewhere else in the class hierarchy +because a proceeding is actrually a collection of resources. Perhaps +this will be changed in the future. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Proceeding; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw( Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = ( + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Provider.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Provider.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,126 @@ +# $Id: Provider.pm,v 1.4 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Provider +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Provider - Representation of a general provider + +=head1 SYNOPSIS + + # usually this class is not instantiated but can be... + $obj = new Bio::Biblio::Provider (-type => 'Department'); + + --- OR --- + + $obj = new Bio::Biblio::Provider; + $obj->type ('Department'); + +=head1 DESCRIPTION + +A storage object for a general bibliographic resource provider +(a rpovider can be a person, a organisation, or even a program). + +=head2 Attributes + +The following attributes are specific to this class, +and they are inherited by all provider types. + + type + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Provider; +use strict; +use vars qw(@ISA $AUTOLOAD); + +use Bio::Biblio::BiblioBase; + +@ISA = qw(Bio::Biblio::BiblioBase); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _type => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr}; + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + $_allowed{$attr}; + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/PubmedArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/PubmedArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +# $Id: PubmedArticle.pm,v 1.3 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::PubmedArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::PubmedArticle - Representation of a PUBMED article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::PubmedArticle + (-pubmed_history_list => + [ { 'pub_status' => 'pubmed', + 'date' => '2001-12-1T10:0:00Z' }, + { 'pub_status' => 'medline', + 'date' => '2002-1-5T10:1:00Z' } ], + -pubmed_status => 'ppublish'); + --- OR --- + + $obj = new Bio::Biblio::PubmedArticle; + $obj->pubmed_status ('ppublish'); + +=head1 DESCRIPTION + +A storage object for a general PUBMED article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + pubmed_status + pubmed_provider_id + pubmed_history_list type: array ref of hashes + pubmed_article_id_list type: array ref of hashes + pubmed_url_list type: array ref of hashes + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::PubmedArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::MedlineArticle; +@ISA = qw(Bio::Biblio::MedlineArticle); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _pubmed_status => undef, + _pubmed_provider_id => undef, + _pubmed_history_list => 'ARRAY', + _pubmed_article_id_list => 'ARRAY', + _pubmed_url_list => 'ARRAY', + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + return 1 if exists $_allowed{$attr}; + foreach my $parent (@ISA) { + return 1 if $parent->_accessible ($attr); + } + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + foreach my $parent (@ISA) { + if ($parent->_accessible ($attr)) { + return $parent->_attr_type ($attr); + } + } + } + return 'unknown'; + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/PubmedBookArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/PubmedBookArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,138 @@ +# $Id: PubmedBookArticle.pm,v 1.3 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::PubmedBookArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::PubmedBookArticle - Representation of a PUBMED book article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::PubmedBookArticle + (-title => 'Still getting started'. + -book => new Bio::Biblio::MedlineBook); + # note that there is no specialised class PubmedBook + + --- OR --- + + $obj = new Bio::Biblio::PubmedBookArticle; + $obj->title ('Still getting started'); + +=head1 DESCRIPTION + +A storage object for a PUBMED book article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +There are no specific attributes in this class +(however, you can set and get all attributes defined in the parent classes). + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::PubmedBookArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::PubmedArticle; +use Bio::Biblio::MedlineBookArticle; +@ISA = qw(Bio::Biblio::PubmedArticle Bio::Biblio::MedlineBookArticle); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + return 1 if exists $_allowed{$attr}; + foreach my $parent (@ISA) { + return 1 if $parent->_accessible ($attr); + } + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + foreach my $parent (@ISA) { + if ($parent->_accessible ($attr)) { + return $parent->_attr_type ($attr); + } + } + } + return 'unknown'; + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/PubmedJournalArticle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/PubmedJournalArticle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,151 @@ +# $Id: PubmedJournalArticle.pm,v 1.4 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::PubmedJournalArticle +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::PubmedJournalArticle - Representation of a PUBMED journal article + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::PubmedJournalArticle ( + + # some attributes from MedlineJournalArticle + -title => 'Thermal adaptation analyzed by comparison of protein sequences from mesophilic and extremely thermophilic Methanococcus species.', + -journal => new Bio::Biblio::MedlineJournal (-issn => '0027-8424'), + -volume => 96, + -issue => 7, + + # and some from PubmedArticle + -pubmed_history_list => + [ { 'pub_status' => 'pubmed', + 'date' => '2001-12-1T10:0:00Z' }, + { 'pub_status' => 'medline', + 'date' => '2002-1-5T10:1:00Z' } ], + -pubmed_status => 'ppublish'); + --- OR --- + + $obj = new Bio::Biblio::PubmedJournalArticle; + $obj->title ('...'); + $obj->journal (new Bio::Biblio::MedlineJournal (-issn => '0027-8424')); + $obj->pubmed_status ('ppublish'); + + +=head1 DESCRIPTION + +A storage object for a PUBMED journal article. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +There are no specific attributes in this class +(however, you can set and get all attributes defined in the parent classes). + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::PubmedJournalArticle; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::PubmedArticle; +use Bio::Biblio::MedlineJournalArticle; +@ISA = qw(Bio::Biblio::PubmedArticle Bio::Biblio::MedlineJournalArticle); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + return 1 if exists $_allowed{$attr}; + foreach my $parent (@ISA) { + return 1 if $parent->_accessible ($attr); + } + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + foreach my $parent (@ISA) { + if ($parent->_accessible ($attr)) { + return $parent->_attr_type ($attr); + } + } + } + return 'unknown'; + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Ref.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Ref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,251 @@ +# $Id: Ref.pm,v 1.5 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Ref +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Ref - Representation of a bibliographic reference + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Ref (-type => 'Letter', + -title => 'Onegin to Tatiana'); + --- OR --- + + $obj = new Bio::Biblio::Ref; + $obj->type ('Letter'); + +=head1 DESCRIPTION + +A storage object for a general bibliographic reference (a citation). +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class, +and they are inherited by all citation types. + + author_list_complete values: 'Y' (default) or 'N' + authors type: array ref of Bio::Biblio::Provider's + cross_references type: array ref of Bio::Annotation::DBLink's + cross_references_list_complete values: 'Y' (default) or 'N' + abstract + abstract_language + abstract_type + codes type: hash ref + contributors type: array ref of Bio::Biblio::Provider's + date + date_completed + date_created + date_revised + format + identifier + keywords + language + last_modified_date + publisher type: Bio::Biblio::Provider + repository_subset + rights + spatial_location + subject_headings type: hash ref + subject_headings_source + temporal_period + title + toc + toc_type + type + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Ref; +use strict; +use vars qw(@ISA $AUTOLOAD); + +use Bio::Biblio::BiblioBase; +use Bio::Annotation::DBLink; + +@ISA = qw(Bio::Biblio::BiblioBase); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = + ( + _author_list_complete => undef, + _authors => 'ARRAY', # of Bio::Biblio::Provider + _cross_references => 'ARRAY', # of Bio::Annotation::DBLink + _cross_references_list_complete => undef, + _abstract => undef, + _abstract_language => undef, + _abstract_type => undef, + _codes => 'HASH', + _contributors => 'ARRAY', # of Bio::Biblio::Provider + _date => undef, + _date_completed => undef, + _date_created => undef, + _date_revised => undef, + _format => undef, + _identifier => undef, + _keywords => 'HASH', + _language => undef, + _last_modified_date => undef, + _publisher => 'Bio::Biblio::Provider', + _repository_subset => undef, + _rights => undef, + _spatial_location => undef, + _subject_headings => 'HASH', + _subject_headings_source => undef, + _temporal_period => undef, + _title => undef, + _toc => undef, + _toc_type => undef, + _type => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr}; + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + $_allowed{$attr}; + } +} + + +=head2 add_cross_reference + + Usage : $self->add_cross_reference + (new Bio::Annotation::DBLink (-database => 'EMBL', + -primary_id => 'V00808'); + Function: adding a link to a database entry + Returns : new value of 'cross_references' + Args : an object of type Bio::Annotation::DBLink + +=cut + +sub add_cross_reference { + my ($self, $value) = @_; + $self->throw ($self->_wrong_type_msg (ref $value, 'Bio::Annotation::DBLink')) + unless (UNIVERSAL::isa ($value, 'Bio::Annotation::DBLink')); + (defined $self->cross_references) ? + push (@{ $self->cross_references }, $value) : + return $self->cross_references ( [$value] ); + return $self->cross_references; +} + + +=head2 add_author + + Usage : $self->add_author (new Bio::Biblio::Person (-lastname => 'Novak'); + Function: adding an author to a list of authors + Returns : new value of 'authors' (a full list) + Args : an object of type Bio::Biblio::Provider + +=cut + + +sub add_author { + my ($self, $value) = @_; + $self->throw ($self->_wrong_type_msg (ref $value, 'Bio::Biblio::Provider')) + unless (UNIVERSAL::isa ($value, 'Bio::Biblio::Provider')); + (defined $self->authors) ? + push (@{ $self->authors }, $value) : + return $self->authors ( [$value] ); + return $self->authors; +} + +=head2 add_contributor + + Usage : $self->add_contributor (new Bio::Biblio::Person (-lastname => 'Novak'); + Function: adding a contributor to a list of contributors + Returns : new value of 'contributors' (a full list) + Args : an object of type Bio::Biblio::Provider + +=cut + +sub add_contributor { + my ($self, $value) = @_; + $self->throw ($self->_wrong_type_msg (ref $value, 'Bio::Biblio::Provider')) + unless (UNIVERSAL::isa ($value, 'Bio::Biblio::Provider')); + (defined $self->contributors) ? + push (@{ $self->contributors }, $value) : + return $self->contributors ( [$value] ); + return $self->contributors; +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Service.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Service.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,128 @@ +# $Id: Service.pm,v 1.6 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Service +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Service - Representation of a provider of type service + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Service (-name => 'Report generator'); + + --- OR --- + + $obj = new Bio::Biblio::Service; + $obj->name ('Report generator'); + +=head1 DESCRIPTION + +A storage object for a service (such a computer service) related to a bibliographic resource. + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + name + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Service; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Provider; + +@ISA = qw( Bio::Biblio::Provider); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = ( + _name => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/TechReport.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/TechReport.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,128 @@ +# $Id: TechReport.pm,v 1.6 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::TechReport +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::TechReport - Representation of a technical report + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::TechReport + (-authors => [ + new Bio::Biblio::Person (-lastname => 'Hasek'), + new Bio::Biblio::Person (-lastname => 'Jagr'), + new Bio::Biblio::Organisation (-name => 'NHL'), + ] + -title => 'Pinned in the corner'); + +=head1 DESCRIPTION + +A storage object for a technical report. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +There are no specific attributes in this class +(however, you can set and get all attributes defined in the parent classes). + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::TechReport; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw( Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = ( + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/Thesis.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/Thesis.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,127 @@ +# $Id: Thesis.pm,v 1.5 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::Thesis +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::Thesis - Representation of thesis + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::Thesis (-title => 'Perl on the edge'); + + --- OR --- + + $obj = new Bio::Biblio::Thesis; + $obj->title ('Perl on the edge'); + +=head1 DESCRIPTION + +A storage object for thesis. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +There are no specific attributes in this class +(however, you can set and get all attributes defined in the parent classes). + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::Thesis; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw( Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = ( + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Biblio/WebResource.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/WebResource.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,135 @@ +# $Id: WebResource.pm,v 1.6 2002/10/22 07:45:11 lapp Exp $ +# +# BioPerl module for Bio::Biblio::WebResource +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::WebResource - Representation of a web resource + +=head1 SYNOPSIS + + $obj = new Bio::Biblio::WebResource + (-url => 'http://resources/best.html', + -estimated_size => 45000); + --- OR --- + + $obj = new Bio::Biblio::WebResource; + $obj->cost ('0.3 EURO'); + +=head1 DESCRIPTION + +A storage object for a citation quoting a web resource. +See its place in the class hierarchy in +http://industry.ebi.ac.uk/openBQS/images/bibobjects_perl.gif + +=head2 Attributes + +The following attributes are specific to this class +(however, you can also set and get all attributes defined in the parent classes): + + url + estimated_size + cost + +=head1 SEE ALSO + +=over + +=item * + +OpenBQS home page: http://industry.ebi.ac.uk/openBQS + +=item * + +Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Heikki Lehvaslaiho (heikki@ebi.ac.uk), +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::WebResource; +use strict; +use vars qw(@ISA); + +use Bio::Biblio::Ref; + +@ISA = qw( Bio::Biblio::Ref); + +# +# a closure with a list of allowed attribute names (these names +# correspond with the allowed 'get' and 'set' methods); each name also +# keep what type the attribute should be (use 'undef' if it is a +# simple scalar) +# +{ + my %_allowed = ( + _url => undef, + _estimated_size => undef, + _cost => undef, + ); + + # return 1 if $attr is allowed to be set/get in this class + sub _accessible { + my ($self, $attr) = @_; + exists $_allowed{$attr} or $self->SUPER::_accessible ($attr); + } + + # return an expected type of given $attr + sub _attr_type { + my ($self, $attr) = @_; + if (exists $_allowed{$attr}) { + return $_allowed{$attr}; + } else { + return $self->SUPER::_attr_type ($attr); + } + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Cluster/ClusterFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Cluster/ClusterFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,248 @@ +# $Id: ClusterFactory.pm,v 1.2 2002/10/31 09:45:39 lapp Exp $ +# +# BioPerl module for Bio::Cluster::ClusterFactory +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Cluster::ClusterFactory - Instantiates a new Bio::ClusterI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Cluster::ClusterFactory; + # if you don't provide a default type, the factory will try + # some guesswork based on display_id and namespace + my $factory = new Bio::Cluster::ClusterFactory(-type => 'Bio::Cluster::UniGene'); + my $clu = $factory->create_object(-description => 'NAT', + -display_id => 'Hs.2'); + + +=head1 DESCRIPTION + +This object will build L objects generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + + +=head1 CONTRIBUTORS + +This is mostly copy-and-paste with subsequent adaptation from +Bio::Seq::SeqFactory by Jason Stajich. Most credits should in fact go +to him. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Cluster::ClusterFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Cluster::ClusterFactory(); + Function: Builds a new Bio::Cluster::ClusterFactory object + Returns : Bio::Cluster::ClusterFactory + Args : -type => string, name of a ClusterI derived class. + If not provided, the factory will have to guess + from ID and namespace, which may or may not be + successful. + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($type) = $self->_rearrange([qw(TYPE)], @args); + + $self->{'_loaded_types'} = {}; + $self->type($type) if $type; + + return $self; +} + + +=head2 create_object + + Title : create_object + Usage : my $seq = $factory->create_object(); + Function: Instantiates new Bio::ClusterI (or one of its child classes) + + This object allows us to genericize the instantiation of + cluster objects. + + Returns : L compliant object + The return type is configurable using new(-type =>"..."). + Args : initialization parameters specific to the type of cluster + object we want. Typically + -display_id => $name + -description => description of the cluster + -members => arrayref, members of the cluster + +=cut + +sub create_object { + my ($self,@args) = @_; + + my $type = $self->type(); + if(! $type) { + # we need to guess this + $type = $self->_guess_type(@args); + if(! $type) { + $self->throw("No cluster type set and unable to guess."); + } + # load dynamically if it hasn't been loaded yet + if(! $self->{'_loaded_types'}->{$type}) { + eval { + $self->_load_module($type); + $self->{'_loaded_types'}->{$type} = 1; + }; + if($@) { + $self->throw("Bio::ClusterI implementation $type ". + "failed to load: ".$@); + } + } + } + return $type->new(-verbose => $self->verbose, @args); +} + +=head2 type + + Title : type + Usage : $obj->type($newval) + Function: Get/set the type of L object to be created. + + This may be changed at any time during the lifetime of this + factory. + + Returns : value of type + Args : newvalue (optional) + + +=cut + +sub type{ + my $self = shift; + + if(@_) { + my $type = shift; + if($type && (! $self->{'_loaded_types'}->{$type})) { + eval { + $self->_load_module($type); + }; + if( $@ ) { + $self->throw("Cluster implementation '$type' failed to load: ". + $@); + } + my $a = bless {},$type; + if( ! $a->isa('Bio::ClusterI') ) { + $self->throw("'$type' does not implement Bio::ClusterI. ". + "Too bad."); + } + $self->{'_loaded_types'}->{$type} = 1; + } + return $self->{'type'} = $type; + } + return $self->{'type'}; +} + +=head2 _guess_type + + Title : _guess_type + Usage : + Function: Guesses the right type of L implementation + based on initialization parameters for the prospective + object. + Example : + Returns : the type (a string, the module name) + Args : initialization parameters to be passed to the prospective + cluster object + + +=cut + +sub _guess_type{ + my ($self,@args) = @_; + my $type; + + # we can only guess from a certain number of arguments + my ($dispid, $ns, $members) = + $self->_rearrange([qw(DISPLAY_ID + NAMESPACE + MEMBERS + )], @args); + # Unigene namespace or ID? + if($ns && (lc($ns) eq "unigene")) { + $type = 'Bio::Cluster::UniGene'; + } elsif($dispid && ($dispid =~ /^Hs\.[0-9]/)) { + $type = 'Bio::Cluster::UniGene'; + } + # what else could we look for? + return $type; +} + +##################################################################### +# aliases for naming consistency or other reasons # +##################################################################### + +*create = \&create_object; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Cluster/FamilyI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Cluster/FamilyI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,183 @@ +# +# BioPerl module for Bio::Cluster::FamilyI +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Cluster::FamilyI - Family Interface + +=head1 SYNOPSIS + +# see the implementations of this interface for details but +# basically + + my $cluster= $cluster->new(-description=>"POLYUBIQUITIN", + -members =>[$seq1,$seq2]); + my @members = $cluster->get_members(); + my @sub_members = $cluster->get_members(-species=>"homo sapiens"); + + + +=head1 DESCRIPTION + +This interface if for a Family object representing a family of +biological objects. A generic implementation for this may be +found a Bio::Cluster::Family. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +package Bio::Cluster::FamilyI; +use vars qw(@ISA); +use strict; + +use Bio::ClusterI; + +@ISA = qw(Bio::ClusterI); + +=head2 new + + We dont mandate but encourage implementors to support at least the + following named parameters upon object initialization. + + Arguments Description + --------- ----------- + -family_id the name of the family + -description the consensus description of the family + -annotation_score the confidence by which the consensus description is + representative of the family + -members the members belonging to the family + -alignment the multiple alignment of the members + +=cut + + +=head2 family_id + + Title : family_id + Usage : Bio::Cluster::FamilyI->family_id("znfp"); + Function: get/set for the family id + Returns : the family id + Args : the family id + +=cut + +sub family_id{ + shift->throw_not_implemented(); +} + +=head2 family_score + + Title : family_score + Usage : Bio::Cluster::FamilyI->family_score(95); + Function: get/set for the score of algorithm used to generate + the family if present + Returns : the score + Args : the score + +=cut + +sub family_score { + shift->throw_not_implemented(); +} + + +=head1 Methods inherited from L + +=cut + +=head2 display_id + + Title : display_id + Usage : + Function: Get the display name or identifier for the cluster + Returns : a string + Args : + +=cut + +=head2 get_members + + Title : get_members + Usage : Bio::Cluster::FamilyI->get_members(); + Function: get the members of the family + Returns : the array of members + Args : the array of members + +=cut + +=head2 description + + Title : description + Usage : Bio::Cluster::FamilyI->description("Zinc Finger Protein"); + Function: get/set for the description of the family + Returns : the description + Args : the description + +=cut + + +=head2 size + + Title : size + Usage : Bio::Cluster::FamilyI->size(); + Function: get/set for the description of the family + Returns : size + Args : + +=cut + +=head2 cluster_score + + Title : cluster_score + Usage : $cluster ->cluster_score(100); + Function: get/set for cluster_score which + represent the score in which the clustering + algorithm assigns to this cluster. + Returns : a number + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Cluster/SequenceFamily.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Cluster/SequenceFamily.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,445 @@ +# $Id: SequenceFamily.pm,v 1.4 2002/12/01 00:34:58 jason Exp $ +# +# BioPerl module for Bio::Cluster::SequenceFamily +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Cluster::SequenceFamily - Sequence Family object + +=head1 SYNOPSIS + + use Bio::Cluster::SequenceFamily + + use Bio::SeqIO; + use Bio::Cluster::SequenceFamily; + + my $file = Bio::Root::IO->catfile('t','data','swiss.dat'); + my $seqio= new Bio::SeqIO('-format' => 'swiss', + '-file' => $file); + my @mem; + while(my $seq = $seqio->next_seq){ + push @mem, $seq; + } + + #create the family + my $family = Bio::Cluster::SequenceFamily->new(-family_id=>"Family_1", + -description=>"Family Description Here", + -annotation_score=>"100", + -members=>\@mem); + + #access the family + + foreach my $mem ($family->get_members){ + print $mem->display_id."\t".$mem->desc."\n"; + } + + #select members if members have a Bio::Species Object + + my @mem = $family->get_members(-binomial=>"Homo sapiens"); + @mem = $family->get_members(-ncbi_taxid => 9606); + @mem = $family->get_members(-common_name=>"Human"); + @mem = $family->get_members(-species=>"sapiens"); + @mem = $family->get_members(-genus=>"Homo"); + + + +=head1 DESCRIPTION + +This is a simple Family object that may hold any group of object. For more +specific families, one should derive from FamilyI. + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + + +=head1 APPENDIX + + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a "_". + +=cut + +# Let the code begin... + + +package Bio::Cluster::SequenceFamily; + +use strict; +use vars qw(@ISA); + + +use Bio::Root::Root; +use Bio::Cluster::FamilyI; + +@ISA = qw(Bio::Root::Root Bio::Cluster::FamilyI); + + +=head2 new + + Title : new + Usage : my $family = Bio::Cluster::SequenceFamily->new(-family_id=>"Family_1", + -description=>"Family Description Here", + -annotation_score=>"100", + -members=>\@mem); + Function: Constructor for SequenceFamily object + Returns : L object + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($id,$description,$version,$annot_score, + $family_score,$members) = $self->_rearrange([qw(FAMILY_ID DESCRIPTION VERSION + ANNOTATION_SCORE + FAMILY_SCORE MEMBERS)],@args); + $self->{'_members'} = []; + $id && $self->family_id($id); + $description && $self->description($description); + $version && $self->version($version); + $annot_score && $self->annotation_score($annot_score); + $family_score && $self->family_score($family_score); + $members && $self->add_members($members); + + return $self; + +} + +=head2 version + + Title : version + Usage : $family->version("1.0"); + Function: get/set for version + Returns : a string version of the family generated. + +=cut + +sub version{ + my ($self,$value) = @_; + if($value){ + $self->{'_version'} =$value; + } + return $self->{'_version'}; +} + +=head2 annotation_score + + Title : annotation_score + Usage : $family->annotation_score(100); + Function: get/set for annotation_score which + represent the confidence in which the + consensus description has been assigned + to the family. + Returns : L + +=cut + +sub annotation_score{ + my ($self,$score) = @_; + if($score){ + $self->{'_annotation_score'} = $score; + } + return $self->{'_annotation_score'}; +} + +=head2 alignment + + Title : alignment + Usage : $family->alignment($align); + Function: get/set for an alignment object representing + the multiple alignment of the members of the family. + Returns : L + +=cut + +sub alignment { + my ($self,$align) = @_; + if($align){ + $self->{'_alignment'} = $align; + } + return $self->{'_alignment'}; +} + +=head2 tree + + Title : tree + Usage : $family->tree($tree); + Function: get/set for an tree object representing + the phylogenetic tree of the family. + Returns : L + +=cut + +sub tree { + my ($self,$tree) = @_; + if($tree) { + $self->{'_tree'} = $tree; + } + return $self->{'_tree'}; +} + +=head1 L methods + +=cut + +=head2 family_score + + Title : family_score + Usage : Bio::Cluster::FamilyI->family_score(95); + Function: get/set for the score of algorithm used to generate + the family if present + + This is aliased to cluster_score(). + + Returns : the score + Args : the score + +=cut + +sub family_score { + return shift->cluster_score(@_); +} + + +=head2 family_id + + Title : family_id + Usage : $family->family_id("Family_1"); + Function: get/set for family id + + This is aliased to display_id(). + + Returns : a string specifying identifier of the family + +=cut + +sub family_id{ + return shift->display_id(@_); +} + +=head1 L methods + +=cut + +=head2 display_id + + Title : display_id + Usage : + Function: Get/set the display name or identifier for the cluster + Returns : a string + Args : optional, on set the display ID ( a string) + +=cut + +sub display_id{ + my ($self,$id) = @_; + if($id){ + $self->{'_cluster_id'} = $id; + } + return $self->{'_cluster_id'}; +} + +=head2 description + + Title : description + Usage : $fam->description("POLYUBIQUITIN") + Function: get/set for the consensus description of the cluster + Returns : the description string + Args : Optional the description string + +=cut + +sub description{ + my ($self,$desc) = @_; + if($desc){ + $self->{'_description'} = $desc; + } + return $self->{'_description'}; +} + +=head2 get_members + + Title : get_members + Usage : Valid criteria: + -common_name + -binomial + -ncbi_taxid + -organelle + -genus + $family->get_members(-common_name =>"human"); + $family->get_members(-species =>"homo sapiens"); + $family->get_members(-ncbi_taxid => 9606); + For now, multiple critieria are ORed. + + Will return all members if no criteria are provided. + + Function: get members using methods from L + the phylogenetic tree of the family. + Returns : an array of objects that are member of this family. + +=cut + +sub get_members { + my $self = shift; + my @ret; + + if(@_) { + my %hash = @_; + foreach my $mem ( @{$self->{'_members'}} ) { + foreach my $key ( keys %hash){ + my $method = $key; + $method=~s/-//g; + if($mem->can('species')){ + my $species = $mem->species; + $species->can($method) || + $self->throw("$method is an invalid criteria"); + if($species->$method() eq $hash{$key} ){ + push @ret, $mem; + } + } + } + } + return @ret; + } + return @{$self->{'_members'}}; +} + +=head2 size + + Title : size + Usage : $fam->size(); + Function: get/set for the size of the family, + calculated from the number of members + Returns : the size of the family + Args : + +=cut + +sub size { + my ($self) = @_; + + return scalar(@{$self->{'_members'}}); + +} + +=head2 cluster_score + + Title : cluster_score + Usage : $fam->cluster_score(100); + Function: get/set for cluster_score which + represent the score in which the clustering + algorithm assigns to this cluster. + Returns : a number + +=cut + +sub cluster_score{ + my ($self,$score) = @_; + if($score){ + $self->{'_cluster_score'} = $score; + } + return $self->{'_cluster_score'}; +} + + +=head1 Implementation specific methods + + These are mostly for adding/removing/changing. + +=cut + +=head2 add_members + + Title : add_members + Usage : $fam->add_member([$seq1,$seq1]); + Function: add members to a family + Returns : + Args : the member(s) to add, as an array or arrayref + +=cut + +sub add_members{ + my ($self,@mems) = @_; + + my $mem = shift(@mems); + if(ref($mem) eq "ARRAY"){ + push @{$self->{'_members'}},@{$mem}; + } else { + push @{$self->{'_members'}},$mem; + } + push @{$self->{'_members'}}, @mems; + + return 1; +} + +=head2 remove_members + + Title : remove_members + Usage : $fam->remove_members(); + Function: remove all members from a family + Returns : the previous array of members + Args : none + +=cut + +sub remove_members{ + my ($self) = @_; + my $mems = $self->{'_members'}; + $self->{'_members'} = []; + return @$mems; +} + +##################################################################### +# aliases for naming consistency or other reasons # +##################################################################### + +*flush_members = \&remove_members; +*add_member = \&add_members; + +sub members{ + my $self = shift; + if(@_) { + # this is in set mode + $self->warn("setting members() in ".ref($self)." is deprecated.\n". + "Use add_members() instead."); + return $self->add_members(@_); + } else { + # get mode + $self->warn("members() in ".ref($self)." is deprecated.\n". + "Use get_members() instead."); + return $self->get_members(); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Cluster/UniGene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Cluster/UniGene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1396 @@ +# $Id: UniGene.pm,v 1.23.2.2 2003/09/15 01:52:03 andrew Exp $ +# +# BioPerl module for Bio::Cluster::UniGene.pm +# +# Cared for by Andrew Macgregor +# +# Copyright Andrew Macgregor, Jo-Ann Stanton, David Green +# Molecular Embryology Group, Anatomy & Structural Biology, University of Otago +# http://meg.otago.ac.nz/ +# +# You may distribute this module under the same terms as perl itself +# +# _history +# April 17, 2002 - Initial implementation by Andrew Macgregor +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Cluster::UniGene - UniGene object + +=head1 SYNOPSIS + + use Bio::Cluster::UniGene; + use Bio::ClusterIO; + + $stream = Bio::ClusterIO->new('-file' => "Hs.data", + '-format' => "unigene"); + # note: we quote -format to keep older perl's from complaining. + + while ( my $in = $stream->next_cluster() ) { + print $in->unigene_id() . "\n"; + while ( my $sequence = $in->next_seq() ) { + print $sequence->accession_number() . "\n"; + } + } + +=head1 DESCRIPTION + +This UniGene object implements the L interface +for the representation if UniGene clusters in Bioperl. It is returned +by the L parser for unigene format and contains all +the data associated with one UniGene record. + +This class implements several interfaces and hence can be used +wherever instances of such interfaces are expected. In particular, the +interfaces are L as the base interface for all cluster +representations, and in addition L and +L. + +The following lists the UniGene specific methods that are available +(see below for details). Be aware next_XXX iterators take a snapshot +of the array property when they are first called, and this snapshot is +not reset until the iterator is exhausted. Hence, once called you need +to exhaust the iterator to see any changes that have been made to the +property in the meantime. You will usually want to use the +non-iterator equivalents and loop over the elements yourself. + +new() - standard new call + +unigene_id() - set/get unigene_id + +title() - set/get title (description) + +gene() - set/get gene + +cytoband() - set/get cytoband + +mgi() - set/get mgi + +locuslink() - set/get locuslink + +gnm_terminus() - set/get gnm_terminus + +scount() - set/get scount + +express() - set/get express, currently takes/returns a reference to an +array of expressed tissues + +next_express() - returns the next tissue expression from the expressed +tissue array + +chromosome() - set/get chromosome, currently takes/returns a reference +to an array of chromosome lines + +next_chromosome() - returns the next chromosome line from the array of +chromosome lines + +sts() - set/get sts, currently takes/returns a reference to an array +of sts lines + +next_sts() - returns the next sts line from the array of sts lines + +txmap() - set/get txmap, currently takes/returns a reference to an +array of txmap lines + +next_txmap() - returns the next txmap line from the array of txmap +lines + +protsim() - set/get protsim, currently takes/returns a reference to an +array of protsim lines + +next_protsim() - returns the next protsim line from the array of +protsim lines + +sequences() - set/get sequence, currently takes/returns a reference to +an array of references to seq info + +next_seq() - returns a Seq object that currently only contains an +accession number + + +=head1 Implemented Interfaces + +This class implementes the following interfaces. + +=over 4 + +=item Bio::Cluster::UniGeneI + +This includes implementing Bio::ClusterI. + +=item Bio::IdentifiableI + +=item Bio::DescribableI + +=item Bio::AnnotatableI + +=item Bio::Factory::SequenceStreamI + +=back + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Andrew Macgregor + +Email andrew@anatomy.otago.ac.nz + +=head1 CONTRIBUTORS + +Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a "_". + +=cut + +# Let the code begin... + + +package Bio::Cluster::UniGene; +use vars qw(@ISA); +use strict; + + +use Bio::Root::Root; +use Bio::IdentifiableI; +use Bio::DescribableI; +use Bio::AnnotatableI; +use Bio::Annotation::Collection; +use Bio::Annotation::DBLink; +use Bio::Annotation::SimpleValue; +use Bio::Species; +use Bio::Factory::SequenceStreamI; +use Bio::Seq::SeqFactory; +use Bio::Cluster::UniGeneI; + +@ISA = qw(Bio::Root::Root Bio::Cluster::UniGeneI + Bio::IdentifiableI Bio::DescribableI Bio::AnnotatableI + Bio::Factory::SequenceStreamI); + +my %species_map = ( + 'Aga' => "Anopheles gambiae", + 'At' => "Arabidopsis thaliana", + 'Bt' => "Bos taurus", + 'Cel' => "Caenorhabditis elegans", + 'Cin' => "Ciona intestinalis", + 'Cre' => "Chlamydomonas reinhardtii", + 'Ddi' => "Dictyostelium discoideum", + 'Dr' => "Danio rerio", + 'Dm' => "Drosophila melanogaster", + 'Gga' => "Gallus gallus", + 'Gma' => "Glycine max", + 'Hs' => "Homo sapiens", + 'Hv' => "Hordeum vulgare", + 'Les' => "Lycopersicon esculentum", + 'Mtr' => "Medicago truncatula", + 'Mm' => "Mus musculus", + 'Os' => "Oryza sativa", + 'Ola' => "Oryzias latipes", + 'Rn' => "Rattus norvegicus", + 'Str' => "Silurana tropicalis", + 'Sbi' => "Sorghum bicolor", + 'Ssc' => "Sus scrofa", + 'Ta' => "Triticum aestivum", + 'Xl' => "Xenopus laevis", + 'Zm' => "Zea mays", + ); + + +=head2 new + + Title : new + Usage : used by ClusterIO + Returns : a new Bio::Cluster::Unigene object + +=cut + +sub new { + # standard new call.. + my($caller,@args) = @_; + my $self = $caller->SUPER::new(@args); + + my ($ugid,$desc,$mems,$size,$species,$dispid,$id,$ns,$auth,$v,$seqfact) = + $self->_rearrange([qw(UNIGENE_ID + DESCRIPTION + MEMBERS + SIZE + SPECIES + DISPLAY_ID + OBJECT_ID + NAMESPACE + AUTHORITY + VERSION + SEQFACTORY + )], @args); + + $self->{'_alphabet'} = 'dna'; + + $self->unigene_id($ugid) if $ugid; + $self->description($desc) if $desc; + $self->sequences($mems) if $mems; + $self->size($size) if defined($size); + $self->display_id($dispid) if $dispid; # overwrites ugid + $self->object_id($id) if $id; # overwrites dispid + $self->namespace($ns || 'UniGene'); + $self->authority($auth || 'NCBI'); + $self->version($v) if defined($v); + if( ! defined $seqfact ) { + $seqfact = new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq'); + } + $self->sequence_factory($seqfact); + if( (! $species) && (defined $self->unigene_id() && + $self->unigene_id() =~ /^([A-Za-z]+)\.[0-9]/)) { + # try set a default one depending on the ID + $species = $species_map{$1}; + } + $self->species($species); + return $self; +} + + +=head1 L methods + +=cut + +=head2 unigene_id + + Title : unigene_id + Usage : unigene_id(); + Function: Returns the unigene_id associated with the object. + Example : $id = $unigene->unigene_id or $unigene->unigene_id($id) + Returns : A string + Args : None or an id + + +=cut + +sub unigene_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'unigene_id'} = $value; + } + return $obj->{'unigene_id'}; +} + + + +=head2 title + + Title : title + Usage : title(); + Function: Returns the title associated with the object. + Example : $title = $unigene->title or $unigene->title($title) + Returns : A string + Args : None or a title + + +=cut + +sub title { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'title'} = $value; + } + return $obj->{'title'}; +} + + +=head2 gene + + Title : gene + Usage : gene(); + Function: Returns the gene associated with the object. + Example : $gene = $unigene->gene or $unigene->gene($gene) + Returns : A string + Args : None or a gene + + +=cut + +sub gene { + my $self = shift; + return $self->_annotation_value('gene_name', @_); +} + + +=head2 cytoband + + Title : cytoband + Usage : cytoband(); + Function: Returns the cytoband associated with the object. + Example : $cytoband = $unigene->cytoband or $unigene->cytoband($cytoband) + Returns : A string + Args : None or a cytoband + + +=cut + +sub cytoband { + my $self = shift; + return $self->_annotation_value('cyto_band', @_); +} + +=head2 mgi + + Title : mgi + Usage : mgi(); + Function: Returns the mgi associated with the object. + Example : $mgi = $unigene->mgi or $unigene->mgi($mgi) + Returns : A string + Args : None or a mgi + + +=cut + +sub mgi { + my $self = shift; + my $acc; + + if(@_) { + # purge first + $self->_remove_dblink('dblink','MGI'); + # then add if a valid value is present + if($acc = shift) { + $self->_annotation_dblink('dblink','MGI',$acc); + } + } else { + ($acc) = $self->_annotation_dblink('dblink','MGI'); + } + return $acc; +} + + +=head2 locuslink + + Title : locuslink + Usage : locuslink(); + Function: Returns or stores a reference to an array containing locuslink data. + Returns : An array reference + Args : None or an array reference + +=cut + +sub locuslink { + my ($self,$ll) = @_; + + if($ll) { + # purge first + $self->_remove_dblink('dblink','LocusLink'); + # then add as many accessions as are present + foreach my $acc (@$ll) { + $self->_annotation_dblink('dblink','LocusLink',$acc); + } + } else { + my @accs = $self->_annotation_dblink('dblink','LocusLink'); + $ll = [@accs]; + } + return $ll; +} + + +=head2 gnm_terminus + + Title : gnm_terminus + Usage : gnm_terminus(); + Function: Returns the gnm_terminus associated with the object. + Example : $gnm_terminus = $unigene->gnm_terminus or + $unigene->gnm_terminus($gnm_terminus) + Returns : A string + Args : None or a gnm_terminus + +=cut + +sub gnm_terminus { + my $self = shift; + return $self->_annotation_value('gnm_terminus', @_); +} + +=head2 scount + + Title : scount + Usage : scount(); + Function: Returns the scount associated with the object. + Example : $scount = $unigene->scount or $unigene->scount($scount) + Returns : A string + Args : None or a scount + +=cut + +sub scount { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'scount'} = $value; + } elsif((! defined($obj->{'scount'})) && defined($obj->sequences())) { + $obj->{'scount'} = $obj->size(); + } + return $obj->{'scount'}; +} + + +=head2 express + + Title : express + Usage : express(); + Function: Returns or stores a reference to an array containing + tissue expression data + Returns : An array reference + Args : None or an array reference + +=cut + +sub express { + my $self = shift; + + return $self->_annotation_value_ary('expressed',@_); +} + + +=head2 chromosome + + Title : chromosome + Usage : chromosome(); + Function: Returns or stores a reference to an array containing + chromosome lines + Returns : An array reference + Args : None or an array reference + +=cut + +sub chromosome { + my $self = shift; + + return $self->_annotation_value_ary('chromosome',@_); + } + + +=head2 sts + + Title : sts + Usage : sts(); + Function: Returns or stores a reference to an array containing sts lines + + Returns : An array reference + Args : None or an array reference + +=cut + +sub sts { + my $self = shift; + + return $self->_annotation_value_ary('sts',@_); +} + + +=head2 txmap + + Title : txmap + Usage : txmap(); + Function: Returns or stores a reference to an array containing txmap lines + + Returns : An array reference + Args : None or an array reference + +=cut + +sub txmap { + my $self = shift; + + return $self->_annotation_value_ary('txmap',@_); +} + + +=head2 protsim + + Title : protsim + Usage : protsim(); + Function: Returns or stores a reference to an array containing protsim lines + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub protsim { + my $self = shift; + + return $self->_annotation_value_ary('protsim',@_); +} + + +=head2 sequences + + Title : sequences + Usage : sequences(); + Function: Returns or stores a reference to an array containing + sequence data. + + This is mostly reserved for ClusterIO parsers. You should + use get_members() for get and add_member()/remove_members() + for set. + + Returns : An array reference, or undef + Args : None or an array reference or undef + +=cut + +sub sequences { + my $self = shift; + + return $self->{'members'} = shift if @_; + return $self->{'members'}; +} + +=head2 species + + Title : species + Usage : $obj->species($newval) + Function: Get/set the species object for this Unigene cluster. + Example : + Returns : value of species (a L object) + Args : on set, new value (a L object or + the binomial name, or undef, optional) + + +=cut + +sub species{ + my $self = shift; + + if(@_) { + my $species = shift; + if($species && (! ref($species))) { + my @class = reverse(split(' ',$species)); + $species = Bio::Species->new(-classification => \@class); + } + return $self->{'species'} = $species; + } + return $self->{'species'}; +} + + +=head1 L methods + +=cut + +=head2 display_id + + Title : display_id + Usage : + Function: Get/set the display name or identifier for the cluster + + This is aliased to unigene_id(). + + Returns : a string + Args : optional, on set the display ID ( a string) + +=cut + +sub display_id{ + return shift->unigene_id(@_); +} + +=head2 description + + Title : description + Usage : Bio::ClusterI->description("POLYUBIQUITIN") + Function: get/set for the consensus description of the cluster + + This is aliased to title(). + + Returns : the description string + Args : Optional the description string + +=cut + +sub description{ + return shift->title(@_); +} + +=head2 size + + Title : size + Usage : Bio::ClusterI->size(); + Function: get for the size of the family, + calculated from the number of members + + This is aliased to scount(). + + Returns : the size of the cluster + Args : + +=cut + +sub size { + my $self = shift; + + # hard-wiring the size is allowed if there are no sequences + return $self->scount(@_) unless defined($self->sequences()); + # but we can't change the number of members through this method + my $n = scalar(@{$self->sequences()}); + if(@_ && ($n != $_[0])) { + $self->throw("Cannot change cluster size using size() from $n to ". + $_[0]); + } + return $n; +} + +=head2 cluster_score + + Title : cluster_score + Usage : $cluster ->cluster_score(100); + Function: get/set for cluster_score which + represent the score in which the clustering + algorithm assigns to this cluster. + + For UniGene clusters, there really is no cluster score that + would come with the data. However, we provide an + implementation here so that you can score UniGene clusters + if you want to. + + Returns : a number + Args : optionally, on set a number + +=cut + +sub cluster_score{ + my $self = shift; + + return $self->{'cluster_score'} = shift if @_; + return $self->{'cluster_score'}; +} + +=head2 get_members + + Title : get_members + Usage : Bio::ClusterI->get_members(($seq1, $seq2)); + Function: retrieve the members of the family by some criteria + + Will return all members if no criteria are provided. + + At this time this implementation does not support + specifying criteria and will always return all members. + + Returns : the array of members + Args : + +=cut + +sub get_members { + my $self = shift; + + my $mems = $self->sequences() || []; + # already objects? + if(@$mems && (ref($mems->[0]) eq "HASH")) { + # nope, we need to build the object list from scratch + my @memlist = (); + while(my $seq = $self->next_seq()) { + push(@memlist, $seq); + } + # we cache this array of objects as the new member list + $mems = \@memlist; + $self->sequences($mems); + } + # done + return @$mems; +} + + +=head1 Annotatable view at the object properties + +=cut + +=head2 annotation + + Title : annotation + Usage : $obj->annotation($newval) + Function: Get/set the L object for + this UniGene cluster. + + Many attributes of this class are actually stored within + the annotation collection object as L + compliant objects, so you can conveniently access them + through the same interface as you would e.g. access + L annotation properties. + + If you call this method in set mode and replace the + annotation collection with another one you should know + exactly what you are doing. + + Example : + Returns : a L compliant object + Args : on set, new value (a L + compliant object or undef, optional) + + +=cut + +sub annotation{ + my $self = shift; + + if(@_) { + return $self->{'annotation'} = shift; + } elsif(! exists($self->{'annotation'})) { + $self->{'annotation'} = Bio::Annotation::Collection->new(); + } + return $self->{'annotation'}; +} + + +=head1 Implementation specific methods + + These are mostly for adding/removing to array properties, and for + methods with special functionality. + +=cut + +=head2 add_member + + Title : add_member + Usage : + Function: Adds a member object to the list of members. + Example : + Returns : TRUE if the new member was successfuly added, and FALSE + otherwise. + Args : The member to add. + + +=cut + +sub add_member{ + my ($self,@mems) = @_; + + my $memlist = $self->{'members'} || []; + # this is an object interface; is the member list already objects? + if(@$memlist && (ref($memlist->[0]) eq "HASH")) { + # nope, convert to objects + $memlist = [$self->get_members()]; + } + # add new member(s) + push(@$memlist, @mems); + # store if we created this array ref ourselves + $self->sequences($memlist); + # done + return 1; +} + +=head2 remove_members + + Title : remove_members + Usage : + Function: Remove the list of members for this cluster such that the + member list is undefined afterwards (as opposed to zero members). + Example : + Returns : the previous list of members + Args : none + + +=cut + +sub remove_members{ + my $self = shift; + + my @mems = $self->get_members(); + $self->sequences(undef); + return @mems; +} + + +=head2 next_locuslink + + Title : next_locuslink + Usage : next_locuslink(); + Function: Returns the next locuslink from an array referred + to using $obj->{'locuslink'} + + If you call this iterator again after it returned undef, it + will re-cycle through the list of elements. Changes in the + underlying array property while you loop over this iterator + will not be reflected until you exhaust the iterator. + + Example : while ( my $locuslink = $in->next_locuslink() ) { + print "$locuslink\n"; + } + Returns : String + Args : None + +=cut + +sub next_locuslink { + my ($obj) = @_; + + return $obj->_next_element("ll","locuslink"); +} + +=head2 next_express + + Title : next_express + Usage : next_express(); + Function: Returns the next tissue from an array referred + to using $obj->{'express'} + + If you call this iterator again after it returned undef, it + will re-cycle through the list of elements. Changes in the + underlying array property while you loop over this iterator + will not be reflected until you exhaust the iterator. + + Example : while ( my $express = $in->next_express() ) { + print "$express\n"; + } + Returns : String + Args : None + +=cut + +sub next_express { + my ($obj) = @_; + + return $obj->_next_element("express","express"); +} + + +=head2 next_chromosome + + Title : next_chromosome + Usage : next_chromosome(); + Function: Returns the next chromosome line from an array referred + to using $obj->{'chromosome'} + + If you call this iterator again after it returned undef, it + will re-cycle through the list of elements. Changes in the + underlying array property while you loop over this iterator + will not be reflected until you exhaust the iterator. + + Example : while ( my $chromosome = $in->next_chromosome() ) { + print "$chromosome\n"; + } + Returns : String + Args : None + +=cut + +sub next_chromosome { + my ($obj) = @_; + + return $obj->_next_element("chr","chromosome"); +} + + +=head2 next_protsim + + Title : next_protsim + Usage : next_protsim(); + Function: Returns the next protsim line from an array referred + to using $obj->{'protsim'} + + If you call this iterator again after it returned undef, it + will re-cycle through the list of elements. Changes in the + underlying array property while you loop over this iterator + will not be reflected until you exhaust the iterator. + + Example : while ( my $protsim = $in->next_protsim() ) { + print "$protsim\n"; + } + Returns : String + Args : None + +=cut + +sub next_protsim { + my ($obj) = @_; + + return $obj->_next_element("protsim","protsim"); +} + + +=head2 next_sts + + Title : next_sts + Usage : next_sts(); + Function: Returns the next sts line from an array referred + to using $obj->{'sts'} + + If you call this iterator again after it returned undef, it + will re-cycle through the list of elements. Changes in the + underlying array property while you loop over this iterator + will not be reflected until you exhaust the iterator. + + Example : while ( my $sts = $in->next_sts() ) { + print "$sts\n"; + } + Returns : String + Args : None + +=cut + +sub next_sts { + my ($obj) = @_; + + return $obj->_next_element("sts","sts"); +} + + +=head2 next_txmap + + Title : next_txmap + Usage : next_txmap(); + Function: Returns the next txmap line from an array + referred to using $obj->{'txmap'} + + If you call this iterator again after it returned undef, it + will re-cycle through the list of elements. Changes in the + underlying array property while you loop over this iterator + will not be reflected until you exhaust the iterator. + + Example : while ( my $tsmap = $in->next_txmap() ) { + print "$txmap\n"; + } + Returns : String + Args : None + +=cut + +sub next_txmap { + my ($obj) = @_; + + return $obj->_next_element("txmap","txmap"); +} + +############################### +# private method +# +# args: prefix name for the queue +# name of the method from which to re-fill +# returns: the next element from that queue, or undef if the queue is empty +############################### +sub _next_element{ + my ($self,$queuename,$meth) = @_; + + $queuename = "_".$queuename."_queue"; + if(! exists($self->{$queuename})) { + # re-initialize from array of sequence data + $self->{$queuename} = [@{$self->$meth() }]; + } + my $queue = $self->{$queuename}; + # is queue exhausted (equivalent to end of stream)? + if(! @$queue) { + # yes, remove queue and signal to the caller + delete $self->{$queuename}; + return undef; + } + return shift(@$queue); +} + +=head1 L methods + +=cut + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + This is aliased to unigene_id(). + + Returns : A scalar + + +=cut + +sub object_id { + return shift->unigene_id(@_); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Unigene clusters usually won''t have a version, so this + will be mostly undefined. + + Returns : A number + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub version { + my $self = shift; + + return $self->{'version'} = shift if @_; + return $self->{'version'}; +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub authority { + my $self = shift; + + return $self->{'authority'} = shift if @_; + return $self->{'authority'}; +} + + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + Returns : A scalar + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub namespace { + my $self = shift; + + return $self->{'namespace'} = shift if @_; + return $self->{'namespace'}; +} + +=head1 L methods + +=cut + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user + the string should have no spaces (ideally, though a cautious + user of this interface would not assumme this) and should be + less than thirty characters (though again, double checking + this is a good idea) + + This is aliased to unigene_id(). + + Returns : A scalar + Status : Virtual + +=cut + +sub display_name { + return shift->unigene_id(@_); +} + + +=head2 description() + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. The string should not be greater than 255 characters + and clients can feel justified at truncating strings at 255 + characters for the purposes of display + + This is already demanded by Bio::ClusterI and hence is + present anyway. + + Returns : A scalar + + +=cut + + +=head1 L methods + +=cut + +=head2 next_seq + + Title : next_seq + Usage : next_seq(); + Function: Returns the next seq as a Seq object as defined by + $seq->sequence_factory(), + at present an empty Bio::Seq::RichSeq object with + just the accession_number() and pid() set + + This iterator will not exhaust the array of member + sequences. If you call next_seq() again after it returned + undef, it will re-cycle through the list of member + sequences. + + Example : while ( my $sequence = $in->next_seq() ) { + print $sequence->accession_number() . "\n"; + } + Returns : Bio::PrimarySeqI object + Args : None + +=cut + +sub next_seq { + my ($obj) = @_; + + if(! exists($obj->{'_seq_queue'})) { + # re-initialize from array of sequence data + $obj->{'_seq_queue'} = [@{$obj->sequences()}]; + } + my $queue = $obj->{'_seq_queue'}; + # is queue exhausted (equivalent to end of stream)? + if(! @$queue) { + # yes, remove queue and signal to the caller + delete $obj->{'_seq_queue'}; + return undef; + } + # no, still data in the queue: get the next one from the queue + my $seq_h = shift(@$queue); + # if this is not a simple hash ref, it's an object already, and we'll + # return just that + return $seq_h if(ref($seq_h) ne 'HASH'); + # nope, we need to assemble this object from scratch + # + # assemble the annotation collection + my $ac = Bio::Annotation::Collection->new(); + foreach my $k (keys %$seq_h) { + next if $k =~ /acc|pid|nid|version/; + my $ann = Bio::Annotation::SimpleValue->new(-tagname => $k, + -value => $seq_h->{$k}); + $ac->add_Annotation($ann); + } + # assemble the initialization parameters and create object + my $seqobj = $obj->sequence_factory->create( + -accession_number => $seq_h->{acc}, + -pid => $seq_h->{pid}, + # why does NCBI prepend a 'g' to its own identifiers?? + -primary_id => $seq_h->{nid} && $seq_h->{nid} =~ /^g\d+$/ ? + substr($seq_h->{nid},1) : $seq_h->{nid}, + -display_id => $seq_h->{acc}, + -seq_version => $seq_h->{version}, + -alphabet => $obj->{'_alphabet'}, + -namespace => $seq_h->{acc} =~ /^NM_/ ? 'RefSeq' : 'GenBank', + -authority => $obj->authority(), # default is NCBI + -species => $obj->species(), + -annotation => $ac + ); + return $seqobj; +} + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($seqfactory) + Function: Get/Set the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : [optional] Bio::Factory::SequenceFactoryI + + +=cut + +sub sequence_factory { + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) { + $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)." sequence_factory()"); + } + $self->{'_seqfactory'} = $obj; + } + $self->{'_seqfactory'}; +} + +=head1 Private methods + +=cut + +=head2 _annotation_value + + Title : _annotation_value + Usage : + Function: Private method. + Example : + Returns : the value (a string) + Args : annotation key (a string) + on set, annotation value (a string) + + +=cut + +sub _annotation_value{ + my $self = shift; + my $key = shift; + + my ($ann, $val); + if(@_) { + $val = shift; + if(! defined($val)) { + ($ann) = $self->annotation->remove_Annotations($key); + return $ann ? $ann->value() : undef; + } + } + ($ann) = $self->annotation->get_Annotations($key); + if($ann && (! $val)) { + # get mode and exists + $val = $ann->value(); + } elsif($val) { + # set mode + if(! $ann) { + $ann = Bio::Annotation::SimpleValue->new(-tagname => $key); + $self->annotation->add_Annotation($ann); + } + $ann->value($val); + } + return $val; +} + + +=head2 _annotation_value_ary + + Title : _annotation_value_ary + Usage : + Function: Private method. + Example : + Returns : reference to the array of values + Args : annotation key (a string) + on set, reference to an array holding the values + + +=cut + +sub _annotation_value_ary{ + my ($self,$key,$arr) = @_; + + my $ac = $self->annotation; + if($arr) { + # purge first + $ac->remove_Annotations($key); + # then add as many values as are present + foreach my $val (@$arr) { + my $ann = Bio::Annotation::SimpleValue->new(-value => $val, + -tagname => $key + ); + $ac->add_Annotation($ann); + } + } else { + my @vals = map { $_->value(); } $ac->get_Annotations($key); + $arr = [@vals]; + } + return $arr; +} + + +=head2 _annotation_dblink + + Title : _annotation_dblink + Usage : + Function: Private method. + Example : + Returns : array of accessions for the given database (namespace) + Args : annotation key (a string) + dbname (a string) (optional on get, mandatory on set) + on set, accession or ID (a string), and version + + +=cut + +sub _annotation_dblink{ + my ($self,$key,$dbname,$acc,$version) = @_; + + if($acc) { + # set mode -- this is adding here + my $ann = Bio::Annotation::DBLink->new(-tagname => $key, + -primary_id => $acc, + -database => $dbname, + -version => $version); + $self->annotation->add_Annotation($ann); + return 1; + } else { + # get mode + my @anns = $self->annotation->get_Annotations($key); + # filter out those that don't match the requested database + if($dbname) { + @anns = grep { $_->database() eq $dbname; } @anns; + } + return map { $_->primary_id(); } @anns; + } +} + +=head2 _remove_dblink + + Title : _remove_dblink + Usage : + Function: Private method. + Example : + Returns : array of accessions for the given database (namespace) + Args : annotation key (a string) + dbname (a string) (optional) + + +=cut + +sub _remove_dblink{ + my ($self,$key,$dbname) = @_; + + my $ac = $self->annotation(); + my @anns = (); + if($dbname) { + foreach my $ann ($ac->remove_Annotations($key)) { + if($ann->database() eq $dbname) { + push(@anns, $ann); + } else { + $ac->add_Annotation($ann); + } + } + } else { + @anns = $ac->remove_Annotations($key); + } + return map { $_->primary_id(); } @anns; +} + + +##################################################################### +# aliases for naming consistency or other reasons # +##################################################################### + +*sequence = \&sequences; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Cluster/UniGeneI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Cluster/UniGeneI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,405 @@ +# $Id: UniGeneI.pm,v 1.9 2002/10/25 22:49:03 lapp Exp $ +# +# BioPerl module for Bio::Cluster::UniGeneI.pm +# +# Cared for by Andrew Macgregor +# +# Copyright Andrew Macgregor, Jo-Ann Stanton, David Green +# Molecular Embryology Group, Anatomy & Structural Biology, University of Otago +# http://anatomy.otago.ac.nz/meg +# +# You may distribute this module under the same terms as perl itself +# +# _history +# April 31, 2002 - Initial implementation by Andrew Macgregor +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Cluster::UniGeneI - abstract interface of UniGene object + +=head1 SYNOPSIS + + # + +=head1 DESCRIPTION + +This is the general interface for a UniGene cluster representation in Bioperl. You cannot use this module directly, use an implementation instead. + +You can create UniGene cluster objects yourself by instantiating +L. If you read UniGene clusters from a +ClusterIO parser, you will get objects implementing this interface, +most likely instances of said UniGene class. + +L inherits from L, so you can +use it wherever a cluster object is expected. + +=head1 FEEDBACK + + # + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Andrew Macgregor + +Email andrew@anatomy.otago.ac.nz + + +=head1 APPENDIX + + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a "_". + +=cut + +# Let the code begin... + + +package Bio::Cluster::UniGeneI; +use vars qw(@ISA $VERSION); +use strict; + +use Bio::ClusterI; + +$VERSION = '1.0'; +@ISA = qw(Bio::ClusterI); + + +=head2 unigene_id + + Title : unigene_id + Usage : unigene_id(); + Function: Returns the unigene_id associated with the object. + Example : $id = $unigene->unigene_id or $unigene->unigene_id($id) + Returns : A string + Args : None or an id + + +=cut + +sub unigene_id { + my ($self) = @_; + $self->throw_not_implemented; +} + + + +=head2 title + + Title : title + Usage : title(); + Function: Returns the title associated with the object. + Example : $title = $unigene->title or $unigene->title($title) + Returns : A string + Args : None or a title + + +=cut + +sub title { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 gene + + Title : gene + Usage : gene(); + Function: Returns the gene associated with the object. + Example : $gene = $unigene->gene or $unigene->gene($gene) + Returns : A string + Args : None or a gene + + +=cut + +sub gene { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 cytoband + + Title : cytoband + Usage : cytoband(); + Function: Returns the cytoband associated with the object. + Example : $cytoband = $unigene->cytoband or $unigene->cytoband($cytoband) + Returns : A string + Args : None or a cytoband + + +=cut + +sub cytoband { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 mgi + + Title : mgi + Usage : mgi(); + Function: Returns the mgi associated with the object. + Example : $mgi = $unigene->mgi or $unigene->mgi($mgi) + Returns : A string + Args : None or a mgi + + +=cut + +sub mgi { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 locuslink + + Title : locuslink + Usage : locuslink(); + Function: Returns or stores a reference to an array containing locuslink data. + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub locuslink { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 gnm_terminus + + Title : gnm_terminus + Usage : gnm_terminus(); + Function: Returns the gnm_terminus associated with the object. + Example : $gnm_terminus = $unigene->gnm_terminus or $unigene->gnm_terminus($gnm_terminus) + Returns : A string + Args : None or a gnm_terminus + +=cut + +sub gnm_terminus { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 scount + + Title : scount + Usage : scount(); + Function: Returns the scount associated with the object. + Example : $scount = $unigene->scount or $unigene->scount($scount) + Returns : A string + Args : None or a scount + +=cut + +sub scount { + my ($self) = @_; + $self->throw_not_implemented; +} + + + +=head2 express + + Title : express + Usage : express(); + Function: Returns or stores a reference to an array containing tissue expression data. + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub express { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 chromosome + + Title : chromosome + Usage : chromosome(); + Function: Returns or stores a reference to an array containing chromosome lines + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub chromosome { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 sts + + Title : sts + Usage : sts(); + Function: Returns or stores a reference to an array containing sts lines + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub sts { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 txmap + + Title : txmap + Usage : txmap(); + Function: Returns or stores a reference to an array containing txmap lines + Returns : An array reference + Args : None or an array reference + +=cut + +sub txmap { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 protsim + + Title : protsim + Usage : protsim(); + Function: Returns or stores a reference to an array containing protsim lines + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub protsim { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 sequence + + Title : sequence + Usage : sequence(); + Function: Returns or stores a reference to an array containing sequence data + This should really only be used by ClusterIO, not directly + Returns : An array reference + Args : None or an array reference + +=cut + +sub sequence { + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 species + + Title : species + Usage : $obj->species($newval) + Function: Get the species object for this Unigene cluster. + Example : + Returns : value of species (a L object) + Args : + + +=cut + +sub species{ + shift->throw_not_implemented(); +} + +=head1 Methods inherited from L + +=cut + +=head2 display_id + + Title : display_id + Usage : + Function: Get/set the display name or identifier for the cluster + Returns : a string + Args : optional, on set the display ID ( a string) + +=cut + +=head2 description + + Title : description + Usage : Bio::ClusterI->description("POLYUBIQUITIN") + Function: get/set for the consensus description of the cluster + Returns : the description string + Args : Optional the description string + +=cut + +=head2 size + + Title : size + Usage : Bio::ClusterI->size(); + Function: get/set for the size of the family, + calculated from the number of members + Returns : the size of the family + Args : + +=cut + +=head2 cluster_score + + Title : cluster_score + Usage : $cluster ->cluster_score(100); + Function: get/set for cluster_score which + represent the score in which the clustering + algorithm assigns to this cluster. + Returns : a number + +=cut + +=head2 get_members + + Title : get_members + Usage : Bio::ClusterI->get_members(($seq1, $seq2)); + Function: retrieve the members of the family by some criteria, for + example : + $cluster->get_members(-species => 'homo sapiens'); + + Will return all members if no criteria are provided. + + Returns : the array of members + Args : + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/ClusterI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/ClusterI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,178 @@ +# $Id: ClusterI.pm,v 1.3 2002/10/25 01:29:37 lapp Exp $ +# +# BioPerl module for Bio::ClusterI +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::ClusterI - Cluster Interface + +=head1 SYNOPSIS + +# see the implementations of this interface for details but +# basically + + my $cluster= $cluster->new(-description=>"POLYUBIQUITIN", + -members =>[$seq1,$seq2]); + my @members = $cluster->get_members(); + my @sub_members = $cluster->get_members(-species=>"homo sapiens"); + + + +=head1 DESCRIPTION + +This interface is the basic structure for a cluster of bioperl objects. +In this case it is up to the implementer to check arguments +and initialize whatever new object the implementing class is designed for. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::ClusterI; +use vars qw(@ISA); +use strict; + +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 new + + We dont mandate but encourage implementors to support at least the + following named parameters upon object initialization. + + Argument Description + -------- ----------- + -display_id the display ID or name for the cluster + -description the consensus description or name of the cluster + -members the array of objects belonging to the family + +=cut + +=head2 display_id + + Title : display_id + Usage : + Function: Get the display name or identifier for the cluster + Returns : a string + Args : + +=cut + +sub display_id{ + shift->throw_not_implemented(); +} + + +=head2 description + + Title : description + Usage : Bio::ClusterI->description("POLYUBIQUITIN") + Function: get/set for the consensus description of the cluster + Returns : the description string + Args : Optional the description string + +=cut + +sub description{ + shift->throw_not_implemented(); +} + +=head2 size + + Title : size + Usage : Bio::ClusterI->size(); + Function: get/set for the size of the family, + calculated from the number of members + Returns : the size of the family + Args : + +=cut + +sub size { + shift->throw_not_implemented(); +} + +=head2 cluster_score + + Title : cluster_score + Usage : $cluster ->cluster_score(100); + Function: get/set for cluster_score which + represent the score in which the clustering + algorithm assigns to this cluster. + Returns : a number + +=cut + +sub cluster_score{ + shift->throw_not_implemented(); +} + +=head2 get_members + + Title : get_members + Usage : Bio::ClusterI->get_members(($seq1, $seq2)); + Function: retrieve the members of the family by some criteria, for + example : + $cluster->get_members(-species => 'homo sapiens'); + + Will return all members if no criteria are provided. + + Returns : the array of members + Args : + +=cut + +sub get_members { + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/ClusterIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/ClusterIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,319 @@ +# $Id: ClusterIO.pm,v 1.11.2.1 2003/01/21 01:11:17 jason Exp $ +# +# BioPerl module for Bio::ClusterIO.pm +# +# Cared for by Andrew Macgregor +# +# Copyright Andrew Macgregor, Jo-Ann Stanton, David Green +# Molecular Embryology Group, Anatomy & Structural Biology, University of Otago +# http://anatomy.otago.ac.nz/meg +# +# You may distribute this module under the same terms as perl itself +# +# _history +# +# May 7, 2002 - changed from UniGene.pm to more generic ClusterIO.pm +# by Andrew Macgregor +# +# April 17, 2002 - Initial implementation by Andrew Macgregor +# POD documentation - main docs before the code + +=head1 NAME + +Bio::ClusterIO - Handler for Cluster Formats + +=head1 SYNOPSIS + + #NB: This example is unigene specific + + use Bio::ClusterIO; + + $stream = Bio::ClusterIO->new('-file' => "Hs.data", + '-format' => "unigene"); + # note: we quote -format to keep older perl's from complaining. + + while ( my $in = $stream->next_cluster() ) { + print $in->unigene_id() . "\n"; + while ( my $sequence = $in->next_seq() ) { + print $sequence->accession_number() . "\n"; + } + } + # Parsing errors are printed to STDERR. + +=head1 DESCRIPTION + +The ClusterIO module works with the ClusterIO format module to read +various cluster formats such as NCBI UniGene. + + +=head1 CONSTRUCTORS + +=head2 Bio::ClusterIO-Enew() + + $str = Bio::ClusterIO->new(-file => 'filename', + -format=>$format); + +The new() class method constructs a new Bio::ClusterIO object. The +returned object can be used to retrieve or print cluster +objects. new() accepts the following parameters: + +=over 4 + +=item -file + +A file path to be opened for reading. + +=item -format + +Specify the format of the file. Supported formats include: + + unigene *.data UniGene build files. + dbsnp *.xml dbSNP XML files + +If no format is specified and a filename is given, then the module +will attempt to deduce it from the filename. If this is unsuccessful, +the main UniGene build format is assumed. + +The format name is case insensitive. 'UNIGENE', 'UniGene' and +'unigene' are all supported, as are dbSNP, dbsnp, and DBSNP + +=back + +=head1 OBJECT METHODS + +See below for more detailed summaries. The main methods are: + +=head2 $cluster = $str-Enext_cluster() + +Fetch the next cluster from the stream. + + +=head2 TIEHANDLE(), READLINE(), PRINT() + +These I've left in here because they were in the SeqIO +module. Feedback appreciated. There they provide the tie interface. +See L for more details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Andrew Macgregor + +Email andrew@anatomy.otago.ac.nz + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::ClusterIO; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + + + +=head2 new + + Title : new + Usage : Bio::ClusterIO->new(-file => $filename, -format => 'format') + Function: Returns a new cluster stream + Returns : A Bio::ClusterIO::Handler initialised with the appropriate format + Args : -file => $filename + -format => format + +=cut + + +my $entry = 0; + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::ClusterIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ); + $format = "\L$format"; # normalize capitalization to lower case + + return undef unless( $class->_load_format_module($format) ); + return "Bio::ClusterIO::$format"->new(@args); + } +} + + +# _initialize is chained for all ClusterIO classes + +sub _initialize { + my($self, @args) = @_; + # initialize the IO part + $self->_initialize_io(@args); +} + +=head2 next_cluster + + Title : next_cluster + Usage : $cluster = $stream->next_cluster() + Function: Reads the next cluster object from the stream and returns it. + Returns : a L compliant object + Args : none + + +=cut + +sub next_cluster { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::ClusterIO object."); +} + +=head2 cluster_factory + + Title : cluster_factory + Usage : $obj->cluster_factory($newval) + Function: Get/set the object factory to use for creating the cluster + objects. + Example : + Returns : a L compliant object + Args : on set, new value (a L + compliant object or undef, optional) + + +=cut + +sub cluster_factory{ + my $self = shift; + + return $self->{'cluster_factory'} = shift if @_; + return $self->{'cluster_factory'}; +} + +=head2 object_factory + + Title : object_factory + Usage : $obj->object_factory($newval) + Function: This is an alias to cluster_factory with a more generic name. + Example : + Returns : a L compliant object + Args : on set, new value (a L + compliant object or undef, optional) + + +=cut + +sub object_factory{ + return shift->cluster_factory(@_); +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL ClusterIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self,$format) = @_; + my $module = "Bio::ClusterIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <_guess_format($filename) + Function: guess format based on file suffix + Example : + Returns : guessed format of filename (lower case) + Args : + Notes : formats that _filehandle() will guess include unigene and dbsnp + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'unigene' if /\.(data)$/i; + return 'dbsnp' if /\.(xml)$/i; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +# I need some direction on these!! The module works so I haven't fiddled with them! + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'seqio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'seqio'}->next_seq() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'seqio'}->next_seq(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'seqio'}->write_seq(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/ClusterIO/dbsnp.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/ClusterIO/dbsnp.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,366 @@ +# $Id: dbsnp.pm,v 1.7.2.1 2003/08/21 21:07:06 allenday Exp $ +# BioPerl module for Bio::ClusterIO::dbsnp +# +# Copyright Allen Day , Stan Nelson +# Human Genetics, UCLA Medical School, University of California, Los Angeles + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::ClusterIO::dbsnp - dbSNP input stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::ClusterIO class. + +=head1 DESCRIPTION + +Parse dbSNP XML files, one refSNP entry at a time. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... +package Bio::ClusterIO::dbsnp; + +use strict; +use Bio::Root::Root; +use Bio::ClusterIO; +use Bio::Variation::SNP; +use XML::Parser::PerlSAX; +use XML::Handler::Subs; +use Data::Dumper; +use IO::File; + +use vars qw(@ISA $DTD $DEBUG %MODEMAP %MAPPING); +$DTD = 'ftp://ftp.ncbi.nih.gov/snp/specs/NSE.dtd'; +@ISA = qw(Bio::ClusterIO); + +BEGIN { + %MAPPING = ( +#the ones commented out i haven't written methods for yet... -Allen + 'NSE-rs_refsnp-id' => 'id', +# 'NSE-rs_taxid' => 'tax_id', +# 'NSE-rs_organism' => 'organism', + 'NSE-rs_snp-type' => {'type' => 'value'}, + 'NSE-rs_observed' => 'observed', + 'NSE-rs_seq-5_E' => 'seq_5', + 'NSE-rs_seq-3_E' => 'seq_3', +# 'NSE-rs_seq-ss-exemplar' => 'exemplar_subsnp', + 'NSE-rs_ncbi-build-id' => 'ncbi_build', + 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits', + 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits', + 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc', +# 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight', + 'NSE-rs_ucsc-build-id' => 'ucsc_build', + 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits', + 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits', +# 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight', + 'NSE-rs_het' => 'heterozygous', + 'NSE-rs_het-SE' => 'heterozygous_SE', + 'NSE-rs_validated' => {'validated' => 'value'}, + 'NSE-rs_genotype' => {'genotype' => 'value'}, + + 'NSE-ss_handle' => 'handle', + 'NSE-ss_batch-id' => 'batch_id', + 'NSE-ss_subsnp-id' => 'id', +# 'NSE-ss_loc-snp-id' => 'loc_id', +# 'NSE-ss_orient' => {'orient' => 'value'}, +# 'NSE-ss_build-id' => 'build', + 'NSE-ss_method-class' => {'method' => 'value'}, +# 'NSE-ss_accession_E' => 'accession', +# 'NSE-ss_comment_E' => 'comment', +# 'NSE-ss_genename' => 'gene_name', +# 'NSE-ss_assay-5_E' => 'seq_5', +# 'NSE-ss_assay-3_E' => 'seq_3', +# 'NSE-ss_observed' => 'observed', + +# 'NSE-ss-popinfo_type' => 'pop_type', +# 'NSE-ss-popinfo_batch-id' => 'pop_batch_id', +# 'NSE-ss-popinfo_pop-name' => 'pop_name', +# 'NSE-ss-popinfo_samplesize' => 'pop_samplesize', +# 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous', +# 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq', + +# 'NSE-ss-alleleinfo_type' => 'allele_type', +# 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id', +# 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id', +# 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp', +# 'NSE-ss-alleleinfo_other-allele' => 'allele_other', +# 'NSE-ss-alleleinfo_freq' => 'allele_freq', +# 'NSE-ss-alleleinfo_count' => 'allele_count', + +# 'NSE-rsContigHit_contig-id' => 'contig_hit', +# 'NSE-rsContigHit_accession' => 'accession_hit', +# 'NSE-rsContigHit_version' => 'version', +# 'NSE-rsContigHit_chromosome' => 'chromosome_hit', + +# 'NSE-rsMaploc_asn-from' => 'asn_from', +# 'NSE-rsMaploc_asn-to' => 'asn_to', +# 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'}, +# 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'}, +# 'NSE-rsMaploc_orient' => {'orient' => 'value'}, +# 'NSE-rsMaploc_physmap-str' => 'phys_from', +# 'NSE-rsMaploc_physmap-int' => 'phys_to', + + 'NSE-FxnSet_locusid' => 'locus_id', + 'NSE-FxnSet_symbol' => 'symbol', + 'NSE-FxnSet_mrna-acc' => 'mrna', + 'NSE-FxnSet_prot-acc' => 'protein', + 'NSE-FxnSet_fxn-class-contig' => {'functional_class' => 'value'}, + + #... + #... + #there are lots more, but i don't need them at the moment... -Allen + ); +} + +sub _initialize{ + my ($self,@args) = @_; + $self->SUPER::_initialize(@args); + my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args); + defined $usetempfile && $self->use_tempfile($usetempfile); + $self->{'_xmlparser'} = new XML::Parser::PerlSAX(); + $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0); +} + +=head2 next_cluster + + Title : next_cluster + Usage : $dbsnp = $stream->next_cluster() + Function: returns the next refSNP in the stream + Returns : Bio::Variation::SNP object representing composite refSNP + and its component subSNP(s). + Args : NONE + +=cut + +### +#Adapted from Jason's blastxml.pm +### +sub next_cluster { + my $self = shift; + my $data = ''; + my($tfh); + + if( $self->use_tempfile ) { + $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!"); + $tfh->autoflush(1); + } + + my $start = 1; + while( defined( $_ = $self->_readline ) ){ + #skip to beginning of refSNP entry + if($_ !~ m!! && $start){ + next; + } elsif($_ =~ m!! && $start){ + $start = 0; + } + + #slurp up the data + if( defined $tfh ) { + print $tfh $_; + } else { + $data .= $_; + } + + #and stop at the end of the refSNP entry + last if $_ =~ m!!; + } + + #if we didn't find a start tag + return undef if $start; + + my %parser_args; + if( defined $tfh ) { + seek($tfh,0,0); + %parser_args = ('Source' => { 'ByteStream' => $tfh }, + 'Handler' => $self); + } else { + %parser_args = ('Source' => { 'String' => $data }, + 'Handler' => $self); + } + + my $starttime; + my $result; + + if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; } + + eval { + $result = $self->{'_xmlparser'}->parse(%parser_args); + }; + + if( $@ ) { + $self->warn("error in parsing a report:\n $@"); + $result = undef; + } + + if( $DEBUG ) { + $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime))); + } + + return $self->refsnp; +} + +=head2 SAX methods + +=cut + +=head2 start_document + + Title : start_document + Usage : $parser->start_document; + Function: SAX method to indicate starting to parse a new document. + Creates a Bio::Variation::SNP + Returns : none + Args : none + +=cut + +sub start_document{ + my ($self) = @_; + $self->{refsnp} = Bio::Variation::SNP->new; +} + +sub refsnp { + return shift->{refsnp}; +} + +=head2 end_document + + Title : end_document + Usage : $parser->end_document; + Function: SAX method to indicate finishing parsing a new document + Returns : none + Args : none + +=cut + +sub end_document{ + my ($self,@args) = @_; +} + +=head2 start_element + + Title : start_element + Usage : $parser->start_element($data) + Function: SAX method to indicate starting a new element + Returns : none + Args : hash ref for data + +=cut + +sub start_element{ + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $at = $data->{'Attributes'}; + + if($nm eq 'NSE-ss'){ + $self->refsnp->add_subsnp; + return; + } + if(my $type = $MAPPING{$nm}){ + if(ref $type eq 'HASH'){ + #okay, this is nasty. what can you do? + $self->{will_handle} = (keys %$type)[0]; + my $valkey = (values %$type)[0]; + $self->{last_data} = $at->{$valkey}; + } else { + $self->{will_handle} = $type; + $self->{last_data} = undef; + } + } else { + undef $self->{will_handle}; + } +} + +=head2 end_element + + Title : end_element + Usage : $parser->end_element($data) + Function: Signals finishing an element + Returns : none + Args : hash ref for data + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $at = $data->{'Attributes'}; + + my $method = $self->{will_handle}; + if($method){ + if($nm =~ /^NSE-rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^NSE-FxnSet/){ + $self->refsnp->$method($self->{last_data}); + } elsif ($nm =~ /^NSE-ss/){ + $self->refsnp->subsnp->$method($self->{last_data}); + } + } +} + +=head2 characters + + Title : characters + Usage : $parser->characters($data) + Function: Signals new characters to be processed + Returns : characters read + Args : hash ref with the key 'Data' + +=cut + +sub characters{ + my ($self,$data) = @_; + $self->{last_data} = $data->{Data} + if $data->{Data} =~ /\S/; #whitespace is meaningless -ad +} + +=head2 use_tempfile + + Title : use_tempfile + Usage : $obj->use_tempfile($newval) + Function: Get/Set boolean flag on whether or not use a tempfile + Example : + Returns : value of use_tempfile + Args : newvalue (optional) + +=cut + +sub use_tempfile{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_use_tempfile'} = $value; + } + return $self->{'_use_tempfile'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/ClusterIO/unigene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/ClusterIO/unigene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,252 @@ +# $Id: unigene.pm,v 1.16.2.2 2003/09/15 01:50:47 andrew Exp $ +# BioPerl module for Bio::ClusterIO::unigene +# +# Cared for by Andrew Macgregor +# +# Copyright Andrew Macgregor, Jo-Ann Stanton, David Green +# Molecular Embryology Group, Anatomy & Structural Biology, University of Otago +# http://meg.otago.ac.nz +# +# You may distribute this module under the same terms as perl itself +# +# _history +# April 17, 2002 - Initial implementation by Andrew Macgregor + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::ClusterIO::unigene - UniGene input stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::ClusterIO class. + +=head1 DESCRIPTION + +This object reads from Unigene *.data files downloaded from ftp://ftp.ncbi.nih.gov/repository/UniGene/. +It doesn't download and decompress the file, you have to do that yourself. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Andrew Macgregor + +Email: andrew@anatomy.otago.ac.nz + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::ClusterIO::unigene; +use vars qw(@ISA); +use strict; + +use Bio::ClusterIO; +use Bio::Cluster::UniGene; +use Bio::Cluster::ClusterFactory; + +@ISA = qw(Bio::ClusterIO); + +my %line_is = ( + ID => q/ID\s+(\w{2,3}\.\d+)/, + TITLE => q/TITLE\s+(\S.*)/, + GENE => q/GENE\s+(\S.*)/, + CYTOBAND => q/CYTOBAND\s+(\S.*)/, + MGI => q/MGI\s+(\S.*)/, + LOCUSLINK => q/LOCUSLINK\s+(\S.*)/, + EXPRESS => q/EXPRESS\s+(\S.*)/, + GNM_TERMINUS => q/GNM_TERMINUS\s+(\S.*)/, + CHROMOSOME => q/CHROMOSOME\s+(\S.*)/, + STS => q/STS\s+(\S.*)/, + TXMAP => q/TXMAP\s+(\S.*)/, + PROTSIM => q/PROTSIM\s+(\S.*)/, + SCOUNT => q/SCOUNT\s+(\S.*)/, + SEQUENCE => q/SEQUENCE\s+(\S.*)/, + ACC => q/ACC=(\w+)\.?(\d*)/, + NID => q/NID=\s*(\S.*)/, + PID => q/PID=\s*(\S.*)/, + CLONE => q/CLONE=\s*(\S.*)/, + END => q/END=\s*(\S.*)/, + LID => q/LID=\s*(\S.*)/, + MGC => q/MGC=\s*(\S.*)/, + SEQTYPE => q/SEQTYPE=\s*(\S.*)/, + TRACE => q/TRACE=\s*(\S.*)/, + DELIMITER => q/^\/\// +); + +# we set the right factory here +sub _initialize { + my($self, @args) = @_; + + $self->SUPER::_initialize(@args); + if(! $self->cluster_factory()) { + $self->cluster_factory(Bio::Cluster::ClusterFactory->new( + -type => 'Bio::Cluster::UniGene')); + } +} + +=head2 next_cluster + + Title : next_cluster + Usage : $unigene = $stream->next_cluster() + Function: returns the next unigene in the stream + Returns : Bio::Cluster::UniGene object + Args : NONE + +=cut + +sub next_cluster { + my( $self) = @_; + local $/ = "//"; + return unless my $entry = $self->_readline; + +# set up the variables we'll need + my (%unigene,@express,@locuslink,@chromosome, + @sts,@txmap,@protsim,@sequence); + my $UGobj; + +# set up the regexes + +# add whitespace parsing and precompile regexes +#foreach (values %line_is) { +# $_ =~ s/\s+/\\s+/g; +# print STDERR "Regex is $_\n"; +# #$_ = qr/$_/x; +#} + +#$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/; + +# run each line in an entry against the regexes + foreach my $line (split /\n/, $entry) { + #print STDERR "Wanting to match $line\n"; + if ($line =~ /$line_is{ID}/gcx) { + $unigene{ID} = $1; + } + elsif ($line =~ /$line_is{TITLE}/gcx ) { + #print STDERR "MATCHED with [$1]\n"; + $unigene{TITLE} = $1; + } + elsif ($line =~ /$line_is{GENE}/gcx) { + $unigene{GENE} = $1; + } + elsif ($line =~ /$line_is{CYTOBAND}/gcx) { + $unigene{CYTOBAND} = $1; + } + elsif ($line =~ /$line_is{MGI}/gcx) { + $unigene{MGI} = $1; + } + elsif ($line =~ /$line_is{LOCUSLINK}/gcx) { + @locuslink = split /;/, $1; + } + elsif ($line =~ /$line_is{EXPRESS}/gcx) { + my $express = $1; + # remove initial semicolon if present + $express =~ s/^;//; + @express = split /\s*;/, $express; + } + elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) { + $unigene{GNM_TERMINUS} = $1; + } + elsif ($line =~ /$line_is{CHROMOSOME}/gcx) { + push @chromosome, $1; + } + elsif ($line =~ /$line_is{TXMAP}/gcx) { + push @txmap, $1; + } + elsif ($line =~ /$line_is{STS}/gcx) { + push @sts, $1; + } + elsif ($line =~ /$line_is{PROTSIM}/gcx) { + push @protsim, $1; + } + elsif ($line =~ /$line_is{SCOUNT}/gcx) { + $unigene{SCOUNT} = $1; + } + elsif ($line =~ /$line_is{SEQUENCE}/gcx) { + # parse into each sequence line + my $seq = {}; + # add unigene id to each seq + #$seq->{unigene_id} = $unigene{ID}; + my @items = split /;/,$1; + foreach (@items) { + if (/$line_is{ACC}/gcx) { + $seq->{acc} = $1; + $seq->{version} = $2 if defined $2; + } + elsif (/$line_is{NID}/gcx) { + $seq->{nid} = $1; + } + elsif (/$line_is{PID}/gcx) { + $seq->{pid} = $1; + } + elsif (/$line_is{CLONE}/gcx) { + $seq->{clone} = $1; + } + elsif (/$line_is{END}/gcx) { + $seq->{end} = $1; + } + elsif (/$line_is{LID}/gcx) { + $seq->{lid} = $1; + } + elsif (/$line_is{MGC}/gcx) { + $seq->{mgc} = $1; + } + elsif (/$line_is{SEQTYPE}/gcx) { + $seq->{seqtype} = $1; + } + elsif (/$line_is{TRACE}/gcx) { + $seq->{trace} = $1; + } + } + push @sequence, $seq; + } + elsif ($line =~ /$line_is{DELIMITER}/gcx) { + # at the end of the record, add data to the object + $UGobj = $self->cluster_factory->create_object( + -display_id => $unigene{ID}, + -description => $unigene{TITLE}, + -size => $unigene{SCOUNT}, + -members => \@sequence); + $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE}); + $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND}); + $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI}); + $UGobj->locuslink(\@locuslink); + $UGobj->express(\@express); + $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS}); + $UGobj->chromosome(\@chromosome); + $UGobj->sts(\@sts); + $UGobj->txmap(\@txmap); + $UGobj->protsim(\@protsim); + } + } + return $UGobj; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Chain.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Chain.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,214 @@ +# $Id: Chain.pm,v 1.1 2002/10/24 17:35:30 heikki Exp $ +# +# bioperl module for Bio::Coordinate::Chain +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::Chain - Mapping locations through a chain of coordinate mappers + +=head1 SYNOPSIS + + # create Bio::Coordinate::Pairs, or any MapperIs, somehow + $pair1; $pair2; + + # add them into a Chain + $collection = Bio::Coordinate::Chain->new; + $collection->add_mapper($pair1); + $collection->add_mapper($pair2); + + # create a position and map it + $pos = Bio::Location::Simple->new (-start => 5, -end => 9 ); + $match = $collection->map($pos); + if ($match) { + sprintf "Matches at %d-%d\n", $match->start, $match->end, + } else { + print "No match\n"; + } + +=head1 DESCRIPTION + +This class assumes that you have built several mappers and want to +link them together so that output from the previous mapper is the next +mappers input. This way you can build arbitrarily complex mappers from +simpler components. + +Note that Chain does not do any sanity checking on its mappers. You +are solely responsible that input and output coordinate systems, +direction of mapping and parameters internal to mappers make sense +when chained together. + +To put it bluntly, the present class is just a glorified foreach loop +over an array of mappers calling the map method. + +It would be neat to an internal function that would generate a new +single step mapper from those included in the chain. It should speed +things up considerably. Any volunteers? + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Ewan Birney, birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::Chain; +use vars qw(@ISA ); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; +use Bio::Coordinate::MapperI; +use Bio::Coordinate::Result; +use Bio::Coordinate::Collection; + +@ISA = qw(Bio::Coordinate::Collection Bio::Coordinate::MapperI); + + +=head2 map + + Title : map + Usage : $newpos = $obj->map($pos); + Function: Map the location through all the mappers in the chain. + Example : + Returns : new Location in the output coordiante system + Args : a Bio::Location::Simple object + +=cut + +sub map { + my ($self,$value) = @_; + + $self->throw("Need to pass me a value.") + unless defined $value; + $self->throw("I need a Bio::Location, not [$value]") + unless $value->isa('Bio::LocationI'); + $self->throw("No coordinate mappers!") + unless $self->each_mapper; + + my $res = new Bio::Coordinate::Result; + + foreach my $mapper ($self->each_mapper) { + + my $res = $mapper->map($value); + return undef unless $res->each_match; + $value = $res->match; + } + + return $value; +} + + +=head2 Inherited methods + +=cut + +=head2 add_mapper + + Title : add_mapper + Usage : $obj->add_mapper($mapper) + Function: Pushes one Bio::Coodinate::MapperI into the list of mappers. + Sets _is_sorted() to false. + Example : + Returns : 1 when succeeds, 0 for failure. + Args : mapper object + +=cut + +=head2 mappers + + Title : mappers + Usage : $obj->mappers(); + Function: Returns or sets a list of mappers. + Example : + Returns : array of mappers + Args : array of mappers + +=cut + +=head2 each_mapper + + Title : each_mapper + Usage : $obj->each_mapper(); + Function: Returns a list of mappers. + Example : + Returns : array of mappers + Args : none + +=cut + +=head2 swap + + Title : swap + Usage : $obj->swap; + Function: Swap the direction of mapping;input <-> output + Example : + Returns : 1 + Args : + +=cut + +=head2 test + + Title : test + Usage : $obj->test; + Function: test that both components of all pairs are of the same length. + Ran automatically. + Example : + Returns : boolean + Args : + +=cut + + + +sub sort{ + my ($self) = @_; + $self->warn("You do not really want to sort your chain, do you!\nDoing nothing."); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Collection.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Collection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,405 @@ +# $Id: Collection.pm,v 1.11.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# bioperl module for Bio::Coordinate::Collection +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::Collection - Noncontinuous match between two coordinate sets + +=head1 SYNOPSIS + + # create Bio::Coordinate::Pairs or other Bio::Coordinate::MapperIs somehow + $pair1; $pair2; + + # add them into a Collection + $collection = Bio::Coordinate::Collection->new; + $collection->add_mapper($pair1); + $collection->add_mapper($pair2); + + # create a position and map it + $pos = Bio::Location::Simple->new (-start => 5, -end => 9 ); + $res = $collection->map($pos); + $res->match->start == 1; + $res->match-> == 5; + + # if mapping is many to one (*>1) or many-to-many (*>*) + # you have to give seq_id not get unrelevant entries + $pos = Bio::Location::Simple->new + (-start => 5, -end => 9 -seq_id=>'clone1'); + +=head1 DESCRIPTION + +Generic, context neutral mapper to provide coordinate transforms +between two B coordinate systems. It brings into Bioperl the +functionality from Ewan Birney's Bio::EnsEMBL::Mapper ported into +current bioperl. + +This class is aimed for representing mapping between whole chromosomes +and contigs, or between contigs and clones, or between sequencing +reads and assembly. The submaps are automatically sorted, so they can +be added in any order. + +To map coordinates to the other direction, you have to swap() the +collection. Keeping track of the direction and ID restrictions +are left to the calling code. + + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Ewan Birney, birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::Collection; +use vars qw(@ISA ); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; +use Bio::Coordinate::MapperI; +use Bio::Coordinate::Result; +use Bio::Coordinate::Result::Gap; + +@ISA = qw(Bio::Root::Root Bio::Coordinate::MapperI); + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + $self->{'_mappers'} = []; + + my($in, $out, $strict, $mappers, $return_match) = + $self->_rearrange([qw(IN + OUT + STRICT + MAPPERS + RETURN_MATCH + )], + @args); + + $in && $self->in($in); + $out && $self->out($out); + $mappers && $self->mappers($mappers); + $return_match && $self->return_match('return_match'); + return $self; # success - we hope! +} + + +=head2 add_mapper + + Title : add_mapper + Usage : $obj->add_mapper($mapper) + Function: Pushes one Bio::Coodinate::MapperI into the list of mappers. + Sets _is_sorted() to false. + Example : + Returns : 1 when succeeds, 0 for failure. + Args : mapper object + +=cut + +sub add_mapper { + my ($self,$value) = @_; + + $self->throw("Is not a Bio::Coordinate::MapperI but a [$self]") + unless defined $value && $value->isa('Bio::Coordinate::MapperI'); + + # test pair range lengths + $self->warn("Coodinates in pair [". $value . ":" . + $value->in->seq_id . "/". $value->in->seq_id . + "] are not right.") + unless $value->test; + + $self->_is_sorted(0); + push(@{$self->{'_mappers'}},$value); +} + +=head2 mappers + + Title : mappers + Usage : $obj->mappers(); + Function: Returns or sets a list of mappers. + Example : + Returns : array of mappers + Args : array of mappers + +=cut + +sub mappers{ + my ($self,@args) = @_; + + if (@args) { + + $self->throw("Is not a Bio::Coordinate::MapperI but a [$self]") + unless defined $args[0] && $args[0]->isa('Bio::Coordinate::MapperI'); + push(@{$self->{'_mappers'}}, @args); + } + + return @{$self->{'_mappers'}}; +} + + +=head2 each_mapper + + Title : each_mapper + Usage : $obj->each_mapper(); + Function: Returns a list of mappers. + Example : + Returns : list of mappers + Args : none + +=cut + +sub each_mapper{ + my ($self) = @_; + return @{$self->{'_mappers'}}; +} + + +=head2 swap + + Title : swap + Usage : $obj->swap; + Function: Swap the direction of mapping;input <-> output + Example : + Returns : 1 + Args : + +=cut + +sub swap { + my ($self) = @_; + use Data::Dumper; + + $self->sort unless $self->_is_sorted; + map {$_->swap;} @{$self->{'_mappers'}}; + ($self->{'_in_ids'}, $self->{'_out_ids'}) = + ($self->{'_out_ids'}, $self->{'_in_ids'}); + 1; +} + +=head2 test + + Title : test + Usage : $obj->test; + Function: test that both components of all pairs are of the same length. + Ran automatically. + Example : + Returns : boolean + Args : + +=cut + +sub test { + my ($self) = @_; + + my $res = 1; + + foreach my $mapper ($self->each_mapper) { + $self->warn("Coodinates in pair [". $mapper . ":" . + $mapper->in->seq_id . "/". $mapper->in->seq_id . + "] are not right.") && ($res = 0) + unless $mapper->test; + } + $res; +} + + +=head2 map + + Title : map + Usage : $newpos = $obj->map($pos); + Function: Map the location from the input coordinate system + to a new value in the output coordinate system. + Example : + Returns : new value in the output coordinate system + Args : integer + +=cut + +sub map { + my ($self,$value) = @_; + + $self->throw("Need to pass me a value.") + unless defined $value; + $self->throw("I need a Bio::Location, not [$value]") + unless $value->isa('Bio::LocationI'); + $self->throw("No coordinate mappers!") + unless $self->each_mapper; + + $self->sort unless $self->_is_sorted; + + + if ($value->isa("Bio::Location::SplitLocationI")) { + + my $result = new Bio::Coordinate::Result; + foreach my $loc ( $value->sub_Location(1) ) { + + my $res = $self->_map($loc); + map { $result->add_sub_Location($_) } $res->each_Location; + + } + return $result; + + } else { + return $self->_map($value); + } + + +} + + +=head2 _map + + Title : _map + Usage : $newpos = $obj->_map($simpleloc); + Function: Internal method that does the actual mapping. Called multiple times + by map() if the location to be mapped is a split location + + Example : + Returns : new location in the output coordinate system or undef + Args : Bio::Location::Simple + +=cut + +sub _map { + my ($self,$value) = @_; + + my $result = Bio::Coordinate::Result->new(-is_remote=>1); + +IDMATCH: { + + # bail out now we if are forcing the use of an ID + # and it is not in this collection + last IDMATCH if defined $value->seq_id && + ! $self->{'_in_ids'}->{$value->seq_id}; + + foreach my $pair ($self->each_mapper) { + + # if we are limiting input to a certain ID + next if defined $value->seq_id && $value->seq_id ne $pair->in->seq_id; + + # if we haven't even reached the start, move on + next if $pair->in->end < $value->start; + # if we have over run, break + last if $pair->in->start > $value->end; + + my $subres = $pair->map($value); + $result->add_result($subres); + } + } + + $result->seq_id($result->match->seq_id) if $result->match; + unless ($result->each_Location) { + #build one gap; + my $gap = Bio::Location::Simple->new(-start => $value->start, + -end => $value->end, + -strand => $value->strand, + -location_type => $value->location_type + ); + $gap->seq_id($value->seq_id) if defined $value->seq_id; + bless $gap, 'Bio::Coordinate::Result::Gap'; + $result->seq_id($value->seq_id) if defined $value->seq_id; + $result->add_sub_Location($gap); + } + return $result; +} + + +=head2 sort + + Title : sort + Usage : $obj->sort; + Function: Sort function so that all mappings are sorted by + input coordinate start + Example : + Returns : 1 + Args : + +=cut + +sub sort{ + my ($self) = @_; + + @{$self->{'_mappers'}} = map { $_->[0] } + sort { $a->[1] <=> $b->[1] } + map { [ $_, $_->in->start] } + @{$self->{'_mappers'}}; + + #create hashes for sequence ids + $self->{'_in_ids'} = (); + $self->{'_out_ids'} = (); + foreach ($self->each_mapper) { + $self->{'_in_ids'}->{$_->in->seq_id} = 1; + $self->{'_out_ids'}->{$_->out->seq_id} = 1; + } + + $self->_is_sorted(1); +} + +=head2 _is_sorted + + Title : _is_sorted + Usage : $newpos = $obj->_is_sorted; + Function: toggle for whether the (internal) coodinate mapper data are sorted + Example : + Returns : boolean + Args : boolean + +=cut + +sub _is_sorted{ + my ($self,$value) = @_; + + $self->{'_is_sorted'} = 1 if defined $value && $value; + return $self->{'_is_sorted'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/ExtrapolatingPair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/ExtrapolatingPair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,240 @@ +# $Id: ExtrapolatingPair.pm,v 1.6.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# bioperl module for Bio::Coordinate::ExtrapolatingPair +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::ExtrapolatingPair - Continuous match between two coordinate sets + +=head1 SYNOPSIS + + + use Bio::Location::Simple; + use Bio::Coordinate::ExtrapolatingPair; + + + $match1 = Bio::Location::Simple->new + (-seq_id => 'propeptide', -start => 21, -end => 40, -strand=>1 ); + $match2 = Bio::Location::Simple->new + (-seq_id => 'peptide', -start => 1, -end => 20, -strand=>1 ); + + $pair = Bio::Coordinate::ExtrapolatingPair-> + new(-in => $match1, + -out => $match2, + -strict => 1 + ); + + $pos = Bio::Location::Simple->new + (-start => 40, -end => 60, -strand=> 1 ); + $res = $pair->map($pos); + $res->start eq 20; + $res->end eq 20; + +=head1 DESCRIPTION + +This class represents a one continuous match between two coordinate +systems represented by Bio::Location::Simple objects. The relationship +is directed and reversible. It implements methods to ensure internal +consistency, and map continuous and split locations from one +coordinate system to another. + +This class is an elaboration of Bio::Coordoinate::Pair. The map +function returns only matches which is the mode needed most of +tehtime. By default the matching regions between coordinate systems +are boundless, so that you can say e.g. that gene starts from here in +the chromosomal coordinate system and extends indefinetely in both +directions. If you want to define the matching regions exactly, you +can do that and set strict() to true. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::ExtrapolatingPair; +use vars qw(@ISA ); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; +use Bio::LocationI; +use Bio::Coordinate::Pair; + +@ISA = qw(Bio::Coordinate::Pair); + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my($strict) = + $self->_rearrange([qw(STRICT + )], + @args); + + $strict && $self->strict($strict); + return $self; +} + + +=head2 strict + + Title : strict + Usage : $obj->strict(1); + Function: Set and read the strictness of the coordinate system. + Example : + Returns : value of input system + Args : boolean + +=cut + +sub strict { + my ($self,$value) = @_; + if( defined $value) { + $self->{'_strict'} = 1 if $value; + } + return $self->{'_strict'}; +} + + +=head2 map + + Title : map + Usage : $newpos = $obj->map($loc); + Function: Map the location from the input coordinate system + to a new value in the output coordinate system. + + In extrapolating coodinate system there is no location zero. + Locations are... + Example : + Returns : new location in the output coordinate system or undef + Args : Bio::Location::Simple + +=cut + +sub map { + my ($self,$value) = @_; + + $self->throw("Need to pass me a value.") + unless defined $value; + $self->throw("I need a Bio::Location, not [$value]") + unless $value->isa('Bio::LocationI'); + $self->throw("Input coordinate system not set") + unless $self->in; + $self->throw("Output coordinate system not set") + unless $self->out; + + my $match; + + if ($value->isa("Bio::Location::SplitLocationI")) { + + my $split = Bio::Coordinate::Result->new(-seq_id=>$self->out->seq_id); + foreach my $loc ( sort { $a->start <=> $b->start } + $value->sub_Location ) { + + $match = $self->_map($loc); + $split->add_sub_Location($match) if $match; + + } + $split->each_Location ? (return $split) : (return undef) ; + + } else { + return $self->_map($value); + } +} + + +=head2 _map + + Title : _map + Usage : $newpos = $obj->_map($simpleloc); + Function: Internal method that does the actual mapping. Called + multiple times by map() if the location to be mapped is a + split location + + Example : + Returns : new location in the output coordinate system or undef + Args : Bio::Location::Simple + +=cut + +sub _map { + my ($self,$value) = @_; + + my ($offset, $start, $end); + + if ($self->strand == -1) { + $offset = $self->in->end + $self->out->start; + $start = $offset - $value->end; + $end = $offset - $value->start ; + } else { # undef, 0 or 1 + $offset = $self->in->start - $self->out->start; + $start = $value->start - $offset; + $end = $value->end - $offset; + } + + # strict prevents matches outside stated range + if ($self->strict) { + return undef if $start < 0 and $end < 0; + return undef if $start > $self->out->end; + $start = 1 if $start < 0; + $end = $self->out->end if $end > $self->out->end; + } + + my $match = Bio::Location::Simple-> + new(-start => $start, + -end => $end, + -strand => $self->strand, + -seq_id => $self->out->seq_id, + -location_type => $value->location_type + ); + $match->strand($match->strand * $value->strand) if $value->strand; + bless $match, 'Bio::Coordinate::Result::Match'; + + return $match; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/GeneMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/GeneMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1317 @@ +# $Id: GeneMapper.pm,v 1.13.2.2 2003/03/13 11:56:30 heikki Exp $ +# +# bioperl module for Bio::Coordinate::GeneMapper +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::GeneMapper - transformations between gene related coordinate systems + +=head1 SYNOPSIS + + use Bio::Coordinate::GeneMapper; + + # get a Bio::RangeI representing the start, end and strand of the CDS + # in chromosomal (or entry) coordinates + my $cds; + + # get a Bio::Location::Split or an array of Bio::LocationI objects + # holding the start, end and strand of all the exons in chromosomal + # (or entry) coordinates + my $exons; + + # create a gene mapper and set it to map from chromosomal to cds coordinates + my $gene = Bio::Coordinate::GeneMapper->new(-in=>'chr', + -out=>'cds', + -cds=>$cds, + -exons=>$exons + ); + + # get a a Bio::Location or sequence feature in input (chr) coordinates + my $loc; + + # map the location into output coordinates and get a new location object + $newloc = $gene->map($loc); + + +=head1 DESCRIPTION + +Bio::Coordinate::GeneMapper is a module for simplifying the mappings +of coodinate locations between various gene related locations in human +genetics. It also adds a special human genetics twist to coordinate +systems by making it possible to disable the use of zero +(0). Locations before position one start from -1. See method +L. + +It understands by name the following coordinate systems and mapping +between them: + + peptide (peptide length) + ^ + | -peptide_offset + | + frame propeptide (propeptide length) + ^ ^ + \ | + translate \ | + \ | + cds (transcript start and end) + ^ + negative_intron | \ + ^ | \ transcribe + \ | \ + intron exon \ + ^ ^ ^ / + splice \ \ / | / + \ \ / | / + \ inex | / + \ ^ | / + \ \ |/ + ----- gene (gene_length) + ^ + | - gene_offset + | + chr (or entry) + + +This structure is kept in the global variable $DAG which is a +representation of a Directed Acyclic Graph. The path calculations +traversing this graph are done in a helper class. See +L. + +Of these, two operations are special cases, translate and splice. +Translating and reverse translating are implemented as internal +methods that do the simple 1E-E3 conversion. Splicing needs +additional information that is provided by method L which takes +in an array of Bio::LocationI objects. + +Most of the coordinate system names should be selfexplanatory to +anyone familiar with genes. Negative intron coordinate system is +starts counting backwards from -1 as the last nucleotide in the +intron. This used when only exon and a few flanking intron nucleotides +are known. + + +This class models coordinates within one transcript of a gene, so to +tackle multiple transcripts you need several instances of the +class. It is therefore valid to argue that the name of the class +should be TranscriptMapper. GeneMapper is a catchier name, so it +stuck. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::GeneMapper; +use vars qw(@ISA %COORDINATE_SYSTEMS %COORDINATE_INTS $TRANSLATION $DAG + $NOZERO_VALUES $NOZERO_KEYS); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Coordinate::Result; +use Bio::Location::Simple; +use Bio::Coordinate::Graph; +use Bio::Coordinate::Collection; +use Bio::Coordinate::Pair; +use Bio::Coordinate::ExtrapolatingPair; + +@ISA = qw(Bio::Root::Root Bio::Coordinate::MapperI); + +# first set internal values for all translation tables + +%COORDINATE_SYSTEMS = ( + peptide => 10, + propeptide => 9, + frame => 8, + cds => 7, + negative_intron => 6, + intron => 5, + exon => 4, + inex => 3, + gene => 2, + chr => 1 + ); + +%COORDINATE_INTS = ( + 10 => 'peptide', + 9 => 'propeptide', + 8 => 'frame', + 7 => 'cds', + 6 => 'negative_intron', + 5 => 'intron', + 4 => 'exon', + 3 => 'inex', + 2 => 'gene', + 1 => 'chr' + ); + +$TRANSLATION = $COORDINATE_SYSTEMS{'cds'}. "-". + $COORDINATE_SYSTEMS{'propeptide'}; + +$DAG = { + 10 => [], + 9 => [10], + 8 => [], + 7 => [8, 9], + 6 => [], + 5 => [6], + 4 => [7], + 3 => [4, 5], + 2 => [3, 4, 5, 7], + 1 => [2] + }; + +$NOZERO_VALUES = {0 => 0, 'in' => 1, 'out' => 2, 'in&out' => 3 }; +$NOZERO_KEYS = { 0 => 0, 1 => 'in', 2 => 'out', 3 => 'in&out' }; + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + # prime the graph + my $graph = new Bio::Coordinate::Graph; + $graph->hash_of_arrays($DAG); + $self->graph($graph); + + my($in, $out, $peptide_offset, $exons, + $cds, $nozero, $strict) = + $self->_rearrange([qw(IN + OUT + PEPTIDE_OFFSET + EXONS + CDS + NOZERO + STRICT + )], + @args); + + # direction of mapping when going chr to protein + $self->{_direction} = 1; + + $in && $self->in($in); + $out && $self->out($out); + $cds && $self->cds($cds); + $exons && ref($exons) =~ /ARRAY/i && $self->exons(@$exons); + $peptide_offset && $self->peptide_offset($peptide_offset); + $nozero && $self->nozero($nozero); + $strict && $self->strict($strict); + + return $self; # success - we hope! +} + +=head2 in + + Title : in + Usage : $obj->in('peptide'); + Function: Set and read the input coordinate system. + Example : + Returns : value of input system + Args : new value (optional) + +=cut + +sub in { + my ($self,$value) = @_; + if( defined $value) { + $self->throw("Not a valid input coordinate system name [$value]\n". + "Valid values are ". join(", ", keys %COORDINATE_SYSTEMS )) + unless defined $COORDINATE_SYSTEMS{$value}; + + $self->{'_in'} = $COORDINATE_SYSTEMS{$value}; + } + return $COORDINATE_INTS{ $self->{'_in'} }; +} + + +=head2 out + + Title : out + Usage : $obj->out('peptide'); + Function: Set and read the output coordinate system. + Example : + Returns : value of output system + Args : new value (optional) + +=cut + +sub out { + my ($self,$value) = @_; + if( defined $value) { + $self->throw("Not a valid input coordinate system name [$value]\n". + "Valid values are ". join(", ", keys %COORDINATE_SYSTEMS )) + unless defined $COORDINATE_SYSTEMS{$value}; + + $self->{'_out'} = $COORDINATE_SYSTEMS{$value}; + } + return $COORDINATE_INTS{ $self->{'_out'} }; +} + +=head2 strict + + Title : strict + Usage : $obj->strict('peptide'); + Function: Set and read weather strict boundaried of coordinate + systems are enforced. + When strict is on, the end of the coordinate range must be defined. + Example : + Returns : boolean + Args : boolean (optional) + +=cut + +sub strict { + my ($self,$value) = @_; + if( defined $value) { + $value ? ( $self->{'_strict'} = 1 ) : ( $self->{'_strict'} = 0 ); + ## update in each mapper !! + } + return $self->{'_strict'} || 0 ; +} + + +=head2 nozero + + Title : nozero + Usage : $obj->nozero(1); + Function: Flag to disable the use of zero in the input, + output or both coordinate systems. Use of coordinate + systems without zero is a peculiarity common in + human genetics community. + Example : + Returns : 0 (default), or 'in', 'out', 'in&out' + Args : 0 (default), or 'in', 'out', 'in&out' + +=cut + +sub nozero { + my ($self,$value) = @_; + + if (defined $value) { + $self->throw("Not a valid value for nozero [$value]\n". + "Valid values are ". join(", ", keys %{$NOZERO_VALUES} )) + unless defined $NOZERO_VALUES->{$value}; + $self->{'_nozero'} = $NOZERO_VALUES->{$value}; + } + + my $res = $self->{'_nozero'} || 0; + return $NOZERO_KEYS->{$res}; +} + +=head2 graph + + Title : graph + Usage : $obj->graph($new_graph); + Function: Set and read the graph object representing relationships + between coordinate systems + Example : + Returns : Bio::Coordinate::Graph object + Args : new Bio::Coordinate::Graph object (optional) + +=cut + +sub graph { + my ($self,$value) = @_; + if( defined $value) { + $self->throw("Not a valid graph [$value]\n") + unless $value->isa('Bio::Coordinate::Graph'); + $self->{'_graph'} = $value; + } + return $self->{'_graph'}; +} + +=head2 peptide + + Title : peptide + Usage : $obj->peptide_offset($peptide_coord); + Function: Read and write the offset of peptide from the start of propeptide + and peptide length + Returns : a Bio::Location::Simple object + Args : a Bio::LocationI object + +=cut + +sub peptide { + my ($self, $value) = @_; + if( defined $value) { + $self->throw("I need a Bio::LocationI, not [". $value. "]") + unless $value->isa('Bio::LocationI'); + + $self->throw("Peptide start not defined") + unless defined $value->start; + $self->{'_peptide_offset'} = $value->start - 1; + + $self->throw("Peptide end not defined") + unless defined $value->end; + $self->{'_peptide_length'} = $value->end - $self->{'_peptide_offset'}; + + + my $a = $self->_create_pair + ('propeptide', 'peptide', $self->strict, + $self->{'_peptide_offset'}, $self->{'_peptide_length'} ); + my $mapper = $COORDINATE_SYSTEMS{'propeptide'}. "-". $COORDINATE_SYSTEMS{'peptide'}; + $self->{'_mappers'}->{$mapper} = $a; + } + return Bio::Location::Simple->new + (-seq_id => 'propeptide', + -start => $self->{'_peptide_offset'} + 1 , + -end => $self->{'_peptide_length'} + $self->{'_peptide_offset'}, + -strand => 1 + ); +} + +=head2 peptide_offset + + Title : peptide_offset + Usage : $obj->peptide_offset(20); + Function: Set and read the offset of peptide from the start of propeptide + Returns : set value or 0 + Args : new value (optional) + +=cut + +sub peptide_offset { + my ($self,$offset, $len) = @_; + if( defined $offset) { + $self->throw("I need an integer, not [$offset]") + unless $offset =~ /^[+-]?\d+$/; + $self->{'_peptide_offset'} = $offset; + + if (defined $len) { + $self->throw("I need an integer, not [$len]") + unless $len =~ /^[+-]?\d+$/; + $self->{'_peptide_length'} = $len; + } + + my $a = $self->_create_pair + ('propeptide', 'peptide', $self->strict, $offset, $self->{'_peptide_length'} ); + my $mapper = $COORDINATE_SYSTEMS{'propeptide'}. "-". $COORDINATE_SYSTEMS{'peptide'}; + $self->{'_mappers'}->{$mapper} = $a; + } + return $self->{'_peptide_offset'} || 0; +} + +=head2 peptide_length + + Title : peptide_length + Usage : $obj->peptide_length(20); + Function: Set and read the offset of peptide from the start of propeptide + Returns : set value or 0 + Args : new value (optional) + +=cut + + +sub peptide_length { + my ($self, $len) = @_; + if( defined $len) { + $self->throw("I need an integer, not [$len]") + if defined $len && $len !~ /^[+-]?\d+$/; + $self->{'_peptide_length'} = $len; + } + return $self->{'_peptide_length'}; +} + + +=head2 exons + + Title : exons + Usage : $obj->exons(@exons); + Function: Set and read the offset of CDS from the start of transcipt + You do not have to sort the exons before calling this method as + they will be sorted automatically. + If you have not defined the CDS, is will be set to span all + exons here. + Returns : array of Bio::LocationI exons in genome coordinates or 0 + Args : array of Bio::LocationI exons in genome (or entry) coordinates + +=cut + +sub exons { + my ($self,@value) = @_; + my $cds_mapper = $COORDINATE_SYSTEMS{'gene'}. "-". $COORDINATE_SYSTEMS{'cds'}; + my $inex_mapper = + $COORDINATE_SYSTEMS{'gene'}. "-". $COORDINATE_SYSTEMS{'inex'}; + my $exon_mapper = + $COORDINATE_SYSTEMS{'gene'}. "-". $COORDINATE_SYSTEMS{'exon'}; + my $intron_mapper = + $COORDINATE_SYSTEMS{'gene'}. "-". $COORDINATE_SYSTEMS{'intron'}; + my $negative_intron_mapper = + $COORDINATE_SYSTEMS{'intron'}. "-". $COORDINATE_SYSTEMS{'negative_intron'}; + my $exon_cds_mapper = $COORDINATE_SYSTEMS{'exon'}. "-". $COORDINATE_SYSTEMS{'cds'}; + + if(@value) { + if (ref($value[0]) && + $value[0]->isa('Bio::SeqFeatureI') and + $value[0]->location->isa('Bio::Location::SplitLocationI')) { + @value = $value[0]->location->each_Location; + } else { + $self->throw("I need an array , not [@value]") + unless ref \@value eq 'ARRAY'; + $self->throw("I need a reference to an array of Bio::LocationIs, not to [". + $value[0]. "]") + unless ref $value[0] and $value[0]->isa('Bio::LocationI'); + } + + # + # sort the input array + # + # and if the used has not defined CDS assume it is the complete exonic range + if (defined $value[0]->strand && $value[0]->strand == - 1) { #reverse strand + @value = map { $_->[0] } + sort { $b->[1] <=> $a->[1] } + map { [ $_, $_->start] } + @value; + + unless ($self->cds) { + $self->cds(new Bio::Location::Simple(-start => $value[-1]->start, + -end => $value[0]->end, + -strand=> $value[0]->strand, + -seq_id=> $value[0]->seq_id, + ) + ); + } + } else { #undef or forward strand + @value = map { $_->[0] } + sort { $a->[1] <=> $b->[1] } + map { [ $_, $_->start] } + @value; + unless ($self->cds) { + $self->cds(new Bio::Location::Simple(-start => $value[0]->start, + -end => $value[-1]->end, + -strand=> $value[0]->strand, + -seq_id=> $value[0]->seq_id, + ) + ); + } + + } + + $self->{'_chr_exons'} = \@value; + + # transform exons from chromosome to gene coordinates + # but only if gene coordinate system has been set + my @exons ; + #my $gene_mapper = $self->$COORDINATE_SYSTEMS{'chr'}. "-". $COORDINATE_SYSTEMS{'gene'}; + my $gene_mapper = "1-2"; + if (defined $self->{'_mappers'}->{$gene_mapper} ) { + + my $tmp_in = $self->{'_in'}; + my $tmp_out = $self->{'_out'}; + my $tmp_verb = $self->verbose; + $self->verbose(0); + + $self->in('chr'); + $self->out('gene'); + + @exons = map {$self->map($_)} @value; + + $self->{'_in'} = ($tmp_in); + $self->{'_out'} = ($tmp_out); + $self->verbose($tmp_verb); + } else { + @exons = @value; + } + + my $cds_map = Bio::Coordinate::Collection->new; + my $inex_map = Bio::Coordinate::Collection->new; + my $exon_map = Bio::Coordinate::Collection->new; + my $exon_cds_map = Bio::Coordinate::Collection->new; + my $intron_map = Bio::Coordinate::Collection->new; + my $negative_intron_map = Bio::Coordinate::Collection->new; + + my $tr_end = 0; + my $coffset; + my $exon_counter; + my $prev_exon_end; + + for my $exon ( @exons ) { + + $exon_counter++; + + # + # gene -> cds + # + + my $match1 = Bio::Location::Simple->new + (-seq_id =>'gene' , + -start => $exon->start, + -end => $exon->end, -strand=>1 ); + my $match2 = Bio::Location::Simple->new + (-seq_id => 'cds', + -start => $tr_end + 1, + -end => $tr_end + $exon->end - $exon->start +1, + -strand=>$exon->strand ); + + $cds_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $match1, + -out => $match2, + ) + ); + + if ($exon->start <= 1 and $exon->end >= 1) { + $coffset = $tr_end - $exon->start + 1; + } + $tr_end = $tr_end + $exon->end - $exon->start + 1; + + # + # gene -> intron + # + + if (defined $prev_exon_end) { + my $match3 = Bio::Location::Simple->new + (-seq_id =>'gene', + -start => $prev_exon_end + 1, + -end => $exon->start -1, -strand=>$exon->strand ); + + my $match4 = Bio::Location::Simple->new + (-seq_id => 'intron'. ($exon_counter -1), + -start => 1, + -end => $exon->start - 1 - $prev_exon_end, + -strand=>$exon->strand ); + + # negative intron coordinates + my $match5 = Bio::Location::Simple->new + (-seq_id => 'intron'. ($exon_counter -1), + -start => -1 * ($exon->start - 2 - $prev_exon_end) -1, + -end => -1, + -strand=>$exon->strand ); + + $inex_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $match3, + -out => $match4 + ) + ); + $intron_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $self->_clone_loc($match3), + -out => $self->_clone_loc($match4) + ) + ); + $negative_intron_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $self->_clone_loc($match4), + -out => $match5 + )); + + } + + # store the value + $prev_exon_end = $exon->end; + + # + # gene -> exon + # + my $match6 = Bio::Location::Simple->new + (-seq_id => 'exon'. $exon_counter, + -start => 1, + -end => $exon->end - $exon->start +1, + -strand=> $exon->strand ); + + my $pair2 = Bio::Coordinate::Pair->new(-in => $self->_clone_loc($match1), + -out => $match6 + ); + my $pair3 = Bio::Coordinate::Pair->new(-in => $self->_clone_loc($match6), + -out => $self->_clone_loc($match2) + ); + $inex_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $self->_clone_loc($match1), + -out => $match6 + ) + ); + $exon_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $self->_clone_loc($match1), + -out => $self->_clone_loc($match6) + ) + ); + $exon_cds_map->add_mapper(Bio::Coordinate::Pair->new + (-in => $self->_clone_loc($match6), + -out => $self->_clone_loc($match2) + ) + ); + + } + + # move coordinate start if exons have negative values + if ($coffset) { + foreach my $m ($cds_map->each_mapper) { + $m->out->start($m->out->start - $coffset); + $m->out->end($m->out->end - $coffset); + } + + } + + $self->{'_mappers'}->{$cds_mapper} = $cds_map; + $self->{'_mappers'}->{$exon_cds_mapper} = $exon_cds_map; + $self->{'_mappers'}->{$inex_mapper} = $inex_map; + $self->{'_mappers'}->{$exon_mapper} = $exon_map; + $self->{'_mappers'}->{$intron_mapper} = $intron_map; + $self->{'_mappers'}->{$negative_intron_mapper} = $negative_intron_map; + } + return @{$self->{'_chr_exons'}} || 0; +} + +=head2 _clone_loc + + Title : _clone_loc + Usage : $copy_of_loc = $obj->_clone_loc($loc); + Function: Make a deep copy of a simple location + Returns : a Bio::Location::Simple object + Args : a Bio::Location::Simple object to be cloned + +=cut + + +sub _clone_loc { # clone a simple location + my ($self,$loc) = @_; + + $self->throw("I need a Bio::Location::Simple , not [". ref $loc. "]") + unless $loc->isa('Bio::Location::Simple'); + + return Bio::Location::Simple->new + (-seq_id => $loc->seq_id, + -start => $loc->start, + -end => $loc->end, + -strand=> $loc->strand, + -location_type => $loc->location_type + ); +} + + +=head2 cds + + Title : cds + Usage : $obj->cds(20); + Function: Set and read the offset of CDS from the start of transcipt + + Simple input can be an integer which gives the start of the + coding region in genomic coordinate. If you want to provide + the end of the coding region or indicate the use of the + opposite strand, you have to pass a Bio::RangeI + (e.g. Bio::Location::Simple or Bio::SegFeature::Generic) + object to this method. + + Returns : set value or 0 + Args : new value (optional) + +=cut + +sub cds { + my ($self,$value) = @_; + if( defined $value) { + if ($value =~ /^[+-]?\d+$/ ) { + my $loc = Bio::Location::Simple->new(-start=>$value); + $self->{'_cds'} = $loc; + } + elsif (ref $value && $value->isa('Bio::RangeI') ) { + $self->{'_cds'} = $value; + } else { + $self->throw("I need an integer or Bio::RangeI, not [$value]") + } + # strand !! + my $len; + + $len = $self->{'_cds'}->end - $self->{'_cds'}->start +1 + if defined $self->{'_cds'}->end; + + my $a = $self->_create_pair + ('chr', 'gene', 0, + $self->{'_cds'}->start-1, + $len, + $self->{'_cds'}->strand); + my $mapper = $COORDINATE_SYSTEMS{'chr'}. "-". $COORDINATE_SYSTEMS{'gene'}; + $self->{'_mappers'}->{$mapper} = $a; + + # recalculate exon-based mappers + if ( defined $self->{'_chr_exons'} ) { + $self->exons(@{$self->{'_chr_exons'}}); + } + + } + return $self->{'_cds'} || 0; +} + + +=head2 map + + Title : map + Usage : $newpos = $obj->map(5); + Function: Map the location from the input coordinate system + to a new value in the output coordinate system. + Example : + Returns : new value in the output coordiante system + Args : a Bio::Location::Simple + +=cut + +sub map { + my ($self,$value) = @_; + my ($res); + + $self->throw("Need to pass me a Bio::Location::Simple or ". + "Bio::Location::Simple or Bio::SeqFeatureI, not [". + ref($value). "]") + unless ref($value) && ($value->isa('Bio::Location::Simple') or + $value->isa('Bio::Location::SplitLocationI') or + $value->isa('Bio::SeqFeatureI')); + $self->throw("Input coordinate system not set") + unless $self->{'_in'}; + $self->throw("Output coordinate system not set") + unless $self->{'_out'}; + $self->throw("Do not be silly. Input and output coordinate ". + "systems are the same!") + unless $self->{'_in'} != $self->{'_out'}; + + $self->_check_direction(); + + $value = $value->location if $value->isa('Bio::SeqFeatureI'); + print STDERR "=== Start location: ". $value->start. ",". + $value->end. " (". $value->strand. ")\n" if $self->verbose > 0; + + + # if nozero coordinate system is used in the input values + if ( defined $self->{'_nozero'} && + ( $self->{'_nozero'} == 1 || $self->{'_nozero'} == 3 ) ) { + $value->start($value->start + 1) + if defined $value->start && $value->start < 1; + $value->end($value->end + 1) + if defined $value->end && $value->end < 1; + } + + my @steps = $self->_get_path(); + print "mapping ", $self->{'_in'}, "->", $self->{'_out'}, + " Mappers: ", join(", ", @steps), "\n" if $self->verbose > 0; + + foreach my $mapper (@steps) { + if ($mapper eq $TRANSLATION) { + if ($self->direction == 1) { + $value = $self->_translate($value); + print STDERR "+ $TRANSLATION cds -> propeptide (translate) \n" + if $self->verbose > 0; + } else { + $value = $self->_reverse_translate($value); + print STDERR "+ $TRANSLATION propeptide -> cds (reverse translate) \n" + if $self->verbose > 0; + } + } + # keep the start and end values, and go on to next iteration + # if this mapper is not set + elsif ( ! defined $self->{'_mappers'}->{$mapper} ) { + # update mapper name + $mapper =~ /\d+-(\d+)/; my ($counter) = $1; + $value->seq_id($COORDINATE_INTS{$counter}); + print STDERR "- $mapper\n" if $self->verbose > 0; + } else { + + # + # the DEFAULT : generic mapping + # + $value = $self->{'_mappers'}->{$mapper}->map($value); + $value->purge_gaps + if ($value && $value->isa('Bio::Location::SplitLocationI') && $value->can('gap')); + print STDERR "+ $mapper (", $self->direction, "): start ", + $value->start, " end ", $value->end, "\n" + if $value && $self->verbose > 0; + } + } + + # if nozero coordinate system is asked to be used in the output values + if ( defined $value && defined $self->{'_nozero'} && + ( $self->{'_nozero'} == 2 || $self->{'_nozero'} == 3 ) ) { + + $value->start($value->start - 1) + if defined $value->start && $value->start < 1; + $value->end($value->end - 1) + if defined $value->end && $value->end < 1; + } + + # handle merging of adjacent split locations! + + if (ref $value eq "Bio::Coordinate::Result" && $value->each_match > 1 ) { + my $prevloc; + my $merging = 0; + my $newvalue; + my @matches; + foreach my $loc ( $value->each_Location(1) ) { + unless ($prevloc) { + $prevloc = $loc; + push @matches, $prevloc; + next; + } + if ($prevloc->end == ($loc->start - 1) && $prevloc->seq_id eq $loc->seq_id) { + $prevloc->end($loc->end); + $merging = 1; + } else { + push @matches, $loc; + $prevloc = $loc; + } + } + if ($merging) { + if (@matches > 1 ) { + $newvalue = Bio::Coordinate::Result->new; + map {$newvalue->add_sub_Location} @matches; + } else { + $newvalue = Bio::Coordinate::Result::Match->new + (-seq_id => $matches[0]->seq_id, + -start => $matches[0]->start, + -end => $matches[0]->end, + -strand=> $matches[0]->strand ); + } + $value = $newvalue; + } + } + elsif (ref $value eq "Bio::Coordinate::Result" && $value->each_match == 1 ){ + $value = $value->match; + } + + + return $value; +} + +=head2 direction + + Title : direction + Usage : $obj->direction('peptide'); + Function: Read-only method for the direction of mapping deduced from + predefined input and output coordinate names. + Example : + Returns : 1 or -1, mapping direction + Args : new value (optional) + +=cut + +sub direction { + my ($self) = @_; + return $self->{'_direction'}; +} + + +=head2 swap + + Title : swap + Usage : $obj->swap; + Function: Swap the direction of transformation + (input <-> output) + Example : + Returns : 1 + Args : + +=cut + +sub swap { + my ($self,$value) = @_; + + ($self->{'_in'}, $self->{'_out'}) = ($self->{'_out'}, $self->{'_in'}); + map { $self->{'_mappers'}->{$_}->swap } keys %{$self->{'_mappers'}}; + + # record the changed direction; + $self->{_direction} *= -1; + + return 1; +} + + +=head2 to_string + + Title : to_string + Usage : $newpos = $obj->to_string(5); + Function: Dump the internal mapper values into a human readable format + Example : + Returns : string + Args : + +=cut + +sub to_string { + my ($self) = shift; + + print "-" x 40, "\n"; + + # chr-gene + my $mapper_str = 'chr-gene'; + my $mapper = $self->_mapper_string2code($mapper_str); + + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + if (defined $self->cds) { + my $end = $self->cds->end -1 if defined $self->cds->end; + printf "%16s%s: %s (%s)\n", ' ', 'gene offset', $self->cds->start-1 , $end || ''; + printf "%16s%s: %s\n", ' ', 'gene strand', $self->cds->strand || 0; + } + + # gene-intron + $mapper_str = 'gene-intron'; + $mapper = $self->_mapper_string2code($mapper_str); + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + + my $i = 1; + foreach my $pair ( $self->{'_mappers'}->{$mapper}->each_mapper ) { + printf "%8s :%8s -> %-12s\n", $i, $pair->in->start, $pair->out->start ; + printf "%8s :%8s -> %-12s\n", '', $pair->in->end, $pair->out->end ; + $i++; + } + + # intron-negative_intron + $mapper_str = 'intron-negative_intron'; + $mapper = $self->_mapper_string2code($mapper_str); + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + + $i = 1; + foreach my $pair ( $self->{'_mappers'}->{$mapper}->each_mapper ) { + printf "%8s :%8s -> %-12s\n", $i, $pair->in->start, $pair->out->start ; + printf "%8s :%8s -> %-12s\n", '', $pair->in->end, $pair->out->end ; + $i++; + } + + + # gene-exon + $mapper_str = 'gene-exon'; + $mapper = $self->_mapper_string2code($mapper_str); + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + + $i = 1; + foreach my $pair ( $self->{'_mappers'}->{$mapper}->each_mapper ) { + printf "%8s :%8s -> %-12s\n", $i, $pair->in->start, $pair->out->start ; + printf "%8s :%8s -> %-12s\n", '', $pair->in->end, $pair->out->end ; + $i++; + } + + + # gene-cds + $mapper_str = 'gene-cds'; + $mapper = $self->_mapper_string2code($mapper_str); + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + + $i = 1; + foreach my $pair ( $self->{'_mappers'}->{$mapper}->each_mapper ) { + printf "%8s :%8s -> %-12s\n", $i, $pair->in->start, $pair->out->start ; + printf "%8s :%8s -> %-12s\n", '', $pair->in->end, $pair->out->end ; + $i++; + } + + # cds-propeptide + $mapper_str = 'cds-propeptide'; + $mapper = $self->_mapper_string2code($mapper_str); + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + printf "%9s%-12s\n", "", '"translate"'; + + + # propeptide-peptide + $mapper_str = 'propeptide-peptide'; + $mapper = $self->_mapper_string2code($mapper_str); + printf "\n %-12s (%s)\n", $mapper_str, $mapper ; + printf "%16s%s: %s\n", ' ', "peptide offset", $self->peptide_offset; + + + + print "\nin : ", $self->in, "\n"; + print "out: ", $self->out, "\n"; + my $dir; + $self->direction ? ($dir='forward') : ($dir='reverse'); + printf "direction: %-8s(%s)\n", $dir, $self->direction; + print "\n", "-" x 40, "\n"; + + 1; +} + +sub _mapper_code2string { + my ($self, $code) = @_; + my ($a, $b) = $code =~ /(\d+)-(\d+)/; + return $COORDINATE_INTS{$a}. '-'. $COORDINATE_INTS{$b}; + +} + +sub _mapper_string2code { + my ($self, $string) =@_; + my ($a, $b) = $string =~ /([^-]+)-(.*)/; + return $COORDINATE_SYSTEMS{$a}. '-'. $COORDINATE_SYSTEMS{$b}; +} + + +=head2 _create_pair + + Title : _create_pair + Usage : $mapper = $obj->_create_pair('chr', 'gene', 0, 2555, 10000, -1); + Function: Internal helper method to create a mapper between + two coordinate systems + Returns : a Bio::Coordinate::Pair object + Args : string, input coordinate system name, + string, output coordinate system name, + boolean, strict mapping + positive integer, offset + positive integer, length + 1 || -1 , strand + +=cut + +sub _create_pair { + my ($self, $in, $out, $strict, $offset, $length, $strand ) = @_; + $strict ||= 0; + $strand ||= 1; + $length ||= 20; + + my $match1 = Bio::Location::Simple->new + (-seq_id => $in, + -start => $offset+1, + -end => $offset+$length, -strand=>1 ); + + my $match2 = Bio::Location::Simple->new + (-seq_id => $out, + -start => 1, + -end => $length, -strand=>$strand ); + + my $pair = Bio::Coordinate::ExtrapolatingPair->new + (-in => $match1, + -out => $match2, + -strict => $strict + ); + + return $pair; + +} + + +=head2 _translate + + Title : _translate + Usage : $newpos = $obj->_translate($loc); + Function: Translate the location from the CDS coordinate system + to a new value in the propeptide coordinate system. + Example : + Returns : new location + Args : a Bio::Location::Simple or Bio::Location::SplitLocationI + +=cut + +sub _translate { + my ($self,$value) = @_; + + $self->throw("Need to pass me a Bio::Location::Simple or ". + "Bio::Location::SplitLocationI, not [". ref($value). "]") + unless defined $value && + ($value->isa('Bio::Location::Simple') || $value->isa('Bio::Location::SplitLocationI')); + + my $seqid = 'propeptide'; + + if ($value->isa("Bio::Location::SplitLocationI")) { + my $split = new Bio::Location::Split(-seq_id=>$seqid); + foreach my $loc ( $value->each_Location(1) ) { + + my $match = new Bio::Location::Simple(-start => int($loc->start / 3 )+1, + -end => int($loc->end / 3 )+1, + -seq_id => $seqid, + -strand => 1 + ); + $split->add_sub_Location($match); + } + return $split; + + } else { + return new Bio::Location::Simple(-start => int($value->start / 3 )+1, + -end => int($value->end / 3 )+1, + -seq_id => $seqid, + -strand => 1 + ); + } +} + +sub _frame { + my ($self,$value) = @_; + + $self->throw("Need to pass me a Bio::Location::Simple or ". + "Bio::Location::SplitLocationI, not [". ref($value). "]") + unless defined $value && + ($value->isa('Bio::Location::Simple') || $value->isa('Bio::Location::SplitLocationI')); + + my $seqid = 'propeptide'; + + if ($value->isa("Bio::Location::SplitLocationI")) { + my $split = new Bio::Location::Split(-seq_id=>$seqid); + foreach my $loc ( $value->each_Location(1) ) { + + my $match = new Bio::Location::Simple(-start => ($value->start-1) % 3 +1, + -end => ($value->end-1) % 3 +1, + -seq_id => 'frame', + -strand => 1 + ); + $split->add_sub_Location($match); + } + return $split; + } else { + return new Bio::Location::Simple(-start => ($value->start-1) % 3 +1, + -end => ($value->end-1) % 3 +1, + -seq_id => 'frame', + -strand => 1 + ); + } +} + + +=head2 _reverse_translate + + Title : _reverse_translate + Usage : $newpos = $obj->_reverse_translate(5); + Function: Reverse translate the location from the propeptide + coordinate system to a new value in the CSD. + Note that a single peptide location expands to cover + the codon triplet + Example : + Returns : new location in the CDS coordinate system + Args : a Bio::Location::Simple or Bio::Location::SplitLocationI + +=cut + +sub _reverse_translate { + my ($self,$value) = @_; + + + $self->throw("Need to pass me a Bio::Location::Simple or ". + "Bio::Location::SplitLocationI, not [". ref($value). "]") + unless defined $value && + ($value->isa('Bio::Location::Simple') || $value->isa('Bio::Location::SplitLocationI')); + + my $seqid = 'cds'; + + if ($value->isa("Bio::Location::SplitLocationI")) { + my $split = new Bio::Location::Split(-seq_id=>$seqid); + foreach my $loc ( $value->each_Location(1) ) { + + my $match = new Bio::Location::Simple(-start => $value->start * 3 - 2, + -end => $value->end * 3, + -seq_id => $seqid, + -strand => 1 + ); + $split->add_sub_Location($match); + } + return $split; + + } else { + return new Bio::Location::Simple(-start => $value->start * 3 - 2, + -end => $value->end * 3, + -seq_id => $seqid, + -strand => 1 + ); + } +} + + +=head2 _check_direction + + Title : _check_direction + Usage : $obj->_check_direction(); + Function: Check and swap when needed the direction the location + mapping Pairs based on input and output values + Example : + Returns : new location + Args : a Bio::Location::Simple + +=cut + +sub _check_direction { + my ($self) = @_; + + my $new_direction = 1; + $new_direction = -1 if $self->{'_in'} > $self->{'_out'}; + + unless ($new_direction == $self->{_direction} ) { + map { $self->{'_mappers'}->{$_}->swap } keys %{$self->{'_mappers'}}; + # record the changed direction; + $self->{_direction} *= -1; + } + 1; +} + + +=head2 _get_path + + Title : _get_path + Usage : $obj->_get_path('peptide'); + Function: internal method for finding that shortest path between + input and output coordinate systems. + Calculations and caching are handled by the graph class. + See L. + Example : + Returns : array of the mappers + Args : none + +=cut + +sub _get_path { + my ($self) = @_; + + my $start = $self->{'_in'} || 0; + my $end = $self->{'_out'} || 0; + + # note the order + # always go from smaller to bigger: it makes caching more efficient + my $reverse; + if ($start > $end) { + ($start, $end) = ($end, $start ); + $reverse++; + } + + my @mappers; + if (exists $self->{'_previous_path'} and + $self->{'_previous_path'} eq "$start$end" ) { + # use cache + @mappers = @{$self->{'_mapper_path'}}; + } else { + my $mapper; + my $prev_node = ''; + @mappers = + map { $mapper = "$prev_node-$_"; $prev_node = $_; $mapper; } + $self->{'_graph'}->shortest_path($start, $end); + shift @mappers; + + $self->{'_previous_path'} = "$start$end"; + $self->{'_mapper_path'} = \@mappers; + } + + $reverse ? return reverse @mappers : return @mappers; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Graph.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Graph.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,403 @@ +# $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $ +# +# bioperl module for Bio::Coordinate::Graph +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::Graph - Finds shortest path between nodes in a graph + +=head1 SYNOPSIS + + # get a hash of hashes representing the graph. E.g.: + my $hash= { + '1' => { + '2' => 1 + }, + '2' => { + '4' => 1, + '3' => 1 + }, + '3' => undef, + '4' => { + '5' => 1 + }, + '5' => undef + }; + + # create the object; + my $graph = Bio::Coordinate::Graph->new(-graph => $hash); + + # find the shortest path between two nodes + my $a = 1; + my $b = 6; + my @path = $graph->shortest_paths($a); + print join (", ", @path), "\n"; + + +=head1 DESCRIPTION + +This class calculates the shortest path between input and output +coordinate systems in a graph that defines the relationships between +them. This class is primarely designed to analyze gene-related +coordinate systems. See L. + +Note that this module can not be used to manage graphs. + +Technically the graph implemented here is known as Directed Acyclic +Graph (DAG). DAG is composed of vertices (nodes) and edges (with +optional weights) linking them. Nodes of the graph are the coordinate +systems in gene mapper. + +The shortest path is found using the Dijkstra's algorithm. This +algorithm is fast and greedy and requires all weights to be +positive. All weights in the gene coordinate system graph are +currently equal (1) making the graph unweighted. That makes the use of +Dijkstra's algorithm an overkill. A impler and faster breadth-first +would be enough. Luckily the difference for small graphs is not +signigicant and the implementation is capable to take weights into +account if needed at some later time. + +=head2 Input format + +The graph needs to be primed using a hash of hashes where there is a +key for each node. The second keys are the names of the downstream +neighboring nodes and values are the weights for reaching them. Here +is part of the gene coordiante system graph:: + + + $hash = { + '6' => undef, + '3' => { + '6' => 1 + }, + '2' => { + '6' => 1, + '4' => 1, + '3' => 1 + }, + '1' => { + '2' => 1 + }, + '4' => { + '5' => 1 + }, + '5' => undef + }; + + +Note that the names need to be positive integrers. Root should be '1' +and directness of the graph is taken advantage of to speed +calculations by assuming that downsream nodes always have larger +number as name. + +An alternative (shorter) way of describing input is to use hash of +arrays. See L. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::Graph; +use vars qw(@ISA ); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my($graph, $hasharray) = + $self->_rearrange([qw( + GRAPH + HASHARRAY + )], + @args); + + $graph && $self->graph($graph); + $hasharray && $self->hasharray($hasharray); + + $self->{'_root'} = undef; + + return $self; # success - we hope! + +} + +=head2 Graph structure input methods + +=cut + +=head2 graph + + Title : graph + Usage : $obj->graph($my_graph) + Function: Read/write method for the graph structure + Example : + Returns : hash of hashes grah structure + Args : reference to a hash of hashes + +=cut + +sub graph { + + my ($self,$value) = @_; + + if ($value) { + $self->throw("Need a hash of hashes") + unless ref($value) eq 'HASH' ; + $self->{'_dag'} = $value; + + # empty the cache + $self->{'_root'} = undef; + + } + + return $self->{'_dag'}; + +} + + +=head2 hash_of_arrays + + Title : hash_of_arrays + Usage : $obj->hash_of_array(%hasharray) + Function: An alternative method to read in the graph structure. + Hash arrays are easier to type. This method converts + arrays into hashes and assigns equal values "1" to + weights. + + Example : Here is an example of simple structure containing a graph. + + my $DAG = { + 6 => [], + 5 => [], + 4 => [5], + 3 => [6], + 2 => [3, 4, 6], + 1 => [2] + }; + + Returns : hash of hashes graph structure + Args : reference to a hash of arrays + +=cut + +sub hash_of_arrays { + + my ($self,$value) = @_; + + # empty the cache + $self->{'_root'} = undef; + + if ($value) { + + $self->throw("Need a hash of hashes") + unless ref($value) eq 'HASH' ; + + #copy the hash of arrays into a hash of hashes; + my %hash; + foreach my $start ( keys %{$value}){ + $hash{$start} = undef; + map { $hash{$start}{$_} = 1 } @{$value->{$start}}; + } + + $self->{'_dag'} = \%hash; + } + + return $self->{'_dag'}; + +} + +=head2 Methods for determining the shortest path in the graph + +=cut + +=head2 shortest_path + + Title : shortest_path + Usage : $obj->shortest_path($a, $b); + Function: Method for retrieving the shortest path between nodes. + If the start node remains the same, the method is sometimes + able to use cached results, otherwise it will recalculate + the paths. + Example : + Returns : array of node names, only the start node name if no path + Args : name of the start node + : name of the end node + +=cut + + +sub shortest_path { + my ($self, $root, $end) = @_; + + $self->throw("Two arguments needed") unless @_ == 3; + $self->throw("No node name [$root]") + unless exists $self->{'_dag'}->{$root}; + $self->throw("No node name [$end]") + unless exists $self->{'_dag'}->{$end}; + + my @res; # results + my $reverse; + + if ($root > $end) { + ($root, $end) = ($end, $root ); + $reverse++; + } + + # try to use cached paths + $self->dijkstra($root) unless + defined $self->{'_root'} and $self->{'_root'} eq $root; + + return @res unless $self->{'_paths'} ; + + # create the list + my $node = $end; + my $prev = $self->{'_paths'}->{$end}{'prev'}; + while ($prev) { + unshift @res, $node; + $node = $self->{'_paths'}->{$node}{'prev'}; + $prev = $self->{'_paths'}->{$node}{'prev'}; + } + unshift @res, $node; + + $reverse ? return reverse @res : return @res; +} + + +=head2 dijkstra + + Title : dijkstra + Usage : $graph->dijkstra(1); + Function: Implements Dijkstra's algorithm. + Returns or sets a list of mappers. The returned path + description is always directed down from the root. + Called from shortest_path(). + Example : + Returns : Reference to a hash of hashes representing a linked list + which contains shortest path down to all nodes from the start + node. E.g.: + + $res = { + '2' => { + 'prev' => '1', + 'dist' => 1 + }, + '1' => { + 'prev' => undef, + 'dist' => 0 + }, + }; + + Args : name of the start node + +=cut + +sub dijkstra { + my ($self,$root) = @_; + + $self->throw("I need the name of the root node input") unless $root; + $self->throw("No node name [$root]") + unless exists $self->{'_dag'}->{$root}; + + my %est = (); # estimate hash + my %res = (); # result hash + my $nodes = keys %{$self->{'_dag'}}; + my $maxdist = 1000000; + + # cache the root value + $self->{'_root'} = $root; + + foreach my $node ( keys %{$self->{'_dag'}} ){ + if ($node eq $root) { + $est{$node}{'prev'} = undef; + $est{$node}{'dist'} = 0; + } else { + $est{$node}{'prev'} = undef; + $est{$node}{'dist'} = $maxdist; + } + } + + # remove nodes from %est until it is empty + while (keys %est) { + + #select the node closest to current one, or root node + my $min_node; + my $min = $maxdist; + foreach my $node (reverse sort keys %est) { + if ( $est{$node}{'dist'} < $min ) { + $min = $est{$node}{'dist'}; + $min_node = $node; + } + } + + # no more links between nodes + last unless ($min_node); + + # move the node from %est into %res; + $res{$min_node} = delete $est{$min_node}; + + # recompute distances to the neighbours + my $dist = $res{$min_node}{'dist'}; + foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){ + next unless $est{$neighbour}; # might not be there any more + $est{$neighbour}{'prev'} = $min_node; + $est{$neighbour}{'dist'} = + $dist + $self->{'_dag'}{$min_node}{$neighbour} + if $est{$neighbour}{'dist'} > $dist + 1 ; + } + } + return $self->{'_paths'} = \%res; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/MapperI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/MapperI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,193 @@ +# $Id: MapperI.pm,v 1.5 2002/11/08 09:28:24 heikki Exp $ +# +# bioperl module for Bio::Coordinate::MapperI +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::MapperI - Interface describing coordinate mappers + +=head1 SYNOPSIS + + # not to be used directly + +=head1 DESCRIPTION + +MapperI defines methods for classes capable for mapping locations +between coordinate systems. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::MapperI; +use vars qw(@ISA ); +use strict; + +# Object preamble - inherits from Bio::Root::RootI +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + + + +=head2 in + + Title : in + Usage : $obj->in('peptide'); + Function: Set and read the input coordinate system. + Example : + Returns : value of input system + Args : new value (optional), Bio::LocationI + +=cut + +sub in { + my ($self,$value) = @_; + + $self->throw_not_implemented(); + +} + + +=head2 out + + Title : out + Usage : $obj->out('peptide'); + Function: Set and read the output coordinate system. + Example : + Returns : value of output system + Args : new value (optional), Bio::LocationI + +=cut + +sub out { + my ($self,$value) = @_; + + $self->throw_not_implemented(); +} + +=head2 swap + + Title : swap + Usage : $obj->swap; + Function: Swap the direction of mapping: input <-> output) + Example : + Returns : 1 + Args : + +=cut + +sub swap { + my ($self) = @_; + + $self->throw_not_implemented(); + +} + +=head2 test + + Title : test + Usage : $obj->test; + Function: test that both components are of same length + Example : + Returns : ( 1 | undef ) + Args : + +=cut + +sub test { + my ($self) = @_; + + $self->throw_not_implemented(); +} + + +=head2 map + + Title : map + Usage : $newpos = $obj->map($loc); + Function: Map the location from the input coordinate system + to a new value in the output coordinate system. + Example : + Returns : new value in the output coordiante system + Args : Bio::LocationI + +=cut + +sub map { + my ($self,$value) = @_; + + $self->throw_not_implemented(); + +} + +=head2 return_match + + Title : return_match + Usage : $obj->return_match(1); + Function: A flag to turn on the simplified mode of + returning only one joined Match object or undef + Example : + Returns : boolean + Args : boolean (optional) + +=cut + +sub return_match { + my ($self,$value) = @_; + if( defined $value) { + $value ? ( $self->{'_return_match'} = 1 ) : + ( $self->{'_return_match'} = 0 ); + } + return $self->{'_return_match'} || 0 ; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Pair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Pair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,448 @@ +# $Id: Pair.pm,v 1.9.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# bioperl module for Bio::Coordinate::Pair +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::Pair - Continuous match between two coordinate sets + +=head1 SYNOPSIS + + use Bio::Location::Simple; + use Bio::Coordinate::Pair; + + my $match1 = Bio::Location::Simple->new + (-seq_id => 'propeptide', -start => 21, -end => 40, -strand=>1 ); + my $match2 = Bio::Location::Simple->new + (-seq_id => 'peptide', -start => 1, -end => 20, -strand=>1 ); + my $pair = Bio::Coordinate::Pair->new(-in => $match1, + -out => $match2 + ); + # location to match + $pos = Bio::Location::Simple->new + (-start => 25, -end => 25, -strand=> -1 ); + + # results are in a Bio::Coordinate::Result + # they can be Matches and Gaps; are Bio::LocationIs + $res = $pair->map($pos); + $res->isa('Bio::Coordinate::Result'); + $res->each_match == 1; + $res->each_gap == 0; + $res->each_Location == 1; + $res->match->start == 5; + $res->match->end == 5; + $res->match->strand == -1; + $res->match->seq_id eq 'peptide'; + + +=head1 DESCRIPTION + +This class represents a one continuous match between two coordinate +systems represented by Bio::Location::Simple objects. The relationship +is directed and reversible. It implements methods to ensure internal +consistency, and map continuous and split locations from one +coordinate system to another. + +The map() method returns Bio::Coordinate::Results with +Bio::Coordinate::Result::Gaps. The calling code have to deal (process +or ignore) them. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::Pair; +use vars qw(@ISA ); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; +use Bio::Coordinate::MapperI; +use Bio::Coordinate::Result; +use Bio::Coordinate::Result::Match; +use Bio::Coordinate::Result::Gap; + +@ISA = qw(Bio::Root::Root Bio::Coordinate::MapperI); + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my($in, $out) = + $self->_rearrange([qw(IN + OUT + )], + @args); + + $in && $self->in($in); + $out && $self->out($out); + return $self; # success - we hope! +} + +=head2 in + + Title : in + Usage : $obj->in('peptide'); + Function: Set and read the input coordinate system. + Example : + Returns : value of input system + Args : new value (optional), Bio::LocationI + +=cut + +sub in { + my ($self,$value) = @_; + if( defined $value) { + $self->throw("Not a valid input Bio::Location [$value] ") + unless $value->isa('Bio::LocationI'); + $self->{'_in'} = $value; + } + return $self->{'_in'}; +} + + +=head2 out + + Title : out + Usage : $obj->out('peptide'); + Function: Set and read the output coordinate system. + Example : + Returns : value of output system + Args : new value (optional), Bio::LocationI + +=cut + +sub out { + my ($self,$value) = @_; + if( defined $value) { + $self->throw("Not a valid output coordinate Bio::Location [$value] ") + unless $value->isa('Bio::LocationI'); + $self->{'_out'} = $value; + } + return $self->{'_out'}; +} + + +=head2 swap + + Title : swap + Usage : $obj->swap; + Function: Swap the direction of mapping; input <-> output + Example : + Returns : 1 + Args : + +=cut + +sub swap { + my ($self) = @_; + ($self->{'_in'}, $self->{'_out'}) = ($self->{'_out'}, $self->{'_in'}); + return 1; +} + +=head2 strand + + Title : strand + Usage : $obj->strand; + Function: Get strand value for the pair + Example : + Returns : ( 1 | 0 | -1 ) + Args : + +=cut + +sub strand { + my ($self) = @_; + $self->warn("Outgoing coordinates are not defined") + unless $self->out; + $self->warn("Incoming coordinates are not defined") + unless $self->in; + + return $self->in->strand * $self->out->strand; +} + +=head2 test + + Title : test + Usage : $obj->test; + Function: test that both components are of the same length + Example : + Returns : ( 1 | undef ) + Args : + +=cut + +sub test { + my ($self) = @_; + $self->warn("Outgoing coordinates are not defined") + unless $self->out; + $self->warn("Incoming coordinates are not defined") + unless $self->in; + + 1 if $self->in->end - $self->in->start == $self->out->end - $self->out->start; +} + + +=head2 map + + Title : map + Usage : $newpos = $obj->map($pos); + Function: Map the location from the input coordinate system + to a new value in the output coordinate system. + Example : + Returns : new Bio::LocationI in the output coordinate system or undef + Args : Bio::LocationI object + +=cut + +sub map { + my ($self,$value) = @_; + + $self->throw("Need to pass me a value.") + unless defined $value; + $self->throw("I need a Bio::Location, not [$value]") + unless $value->isa('Bio::LocationI'); + $self->throw("Input coordinate system not set") + unless $self->in; + $self->throw("Output coordinate system not set") + unless $self->out; + + + if ($value->isa("Bio::Location::SplitLocationI")) { + + my $result = new Bio::Coordinate::Result; + my $split = new Bio::Location::Split(-seq_id=>$self->out->seq_id); + foreach my $loc ( $value->sub_Location(1) ) { + + my $res = $self->_map($loc); + map { $result->add_sub_Location($_) } $res->each_Location; + + } + return $result; + + } else { + return $self->_map($value); + } + +} + + +=head2 _map + + Title : _map + Usage : $newpos = $obj->_map($simpleloc); + Function: Internal method that does the actual mapping. Called + multiple times by map() if the location to be mapped is a + split location + Example : + Returns : new location in the output coordinate system or undef + Args : Bio::Location::Simple + +=cut + +sub _map { + my ($self,$value) = @_; + + my $result = new Bio::Coordinate::Result; + + my $offset = $self->in->start - $self->out->start; + my $start = $value->start - $offset; + my $end = $value->end - $offset; + + my $match = Bio::Location::Simple->new; + $match->location_type($value->location_type); + $match->strand($self->strand); + + #within + # |-------------------------| + # |-| + if ($start >= $self->out->start and $end <= $self->out->end) { + + $match->seq_id($self->out->seq_id); + $result->seq_id($self->out->seq_id); + + if ($self->strand == 1) { + $match->start($start); + $match->end($end); + } else { + $match->start($self->out->end - $end + $self->out->start); + $match->end($self->out->end - $start + $self->out->start); + } + if ($value->strand) { + $match->strand($match->strand * $value->strand); + $result->strand($match->strand); + } + bless $match, 'Bio::Coordinate::Result::Match'; + $result->add_sub_Location($match); + } + #out + # |-------------------------| + # |-| or |-| + elsif ( ($end < $self->out->start or $start > $self->out->end ) or + #insertions just outside the range need special settings + ($value->location_type eq 'IN-BETWEEN' and + ($end = $self->out->start or $start = $self->out->end))) { + + $match->seq_id($self->in->seq_id); + $result->seq_id($self->in->seq_id); + $match->start($value->start); + $match->end($value->end); + $match->strand($value->strand); + + bless $match, 'Bio::Coordinate::Result::Gap'; + $result->add_sub_Location($match); + } + #partial I + # |-------------------------| + # |-----| + elsif ($start < $self->out->start and $end <= $self->out->end ) { + + $result->seq_id($self->out->seq_id); + if ($value->strand) { + $match->strand($match->strand * $value->strand); + $result->strand($match->strand); + } + my $gap = Bio::Location::Simple->new; + $gap->start($value->start); + $gap->end($self->in->start - 1); + $gap->strand($value->strand); + $gap->seq_id($self->in->seq_id); + + bless $gap, 'Bio::Coordinate::Result::Gap'; + $result->add_sub_Location($gap); + + # match + $match->seq_id($self->out->seq_id); + + if ($self->strand == 1) { + $match->start($self->out->start); + $match->end($end); + } else { + $match->start($self->out->end - $end + $self->out->start); + $match->end($self->out->end); + } + bless $match, 'Bio::Coordinate::Result::Match'; + $result->add_sub_Location($match); + } + #partial II + # |-------------------------| + # |------| + elsif ($start >= $self->out->start and $end > $self->out->end ) { + + $match->seq_id($self->out->seq_id); + $result->seq_id($self->out->seq_id); + if ($value->strand) { + $match->strand($match->strand * $value->strand); + $result->strand($match->strand); + } + if ($self->strand == 1) { + $match->start($start); + $match->end($self->out->end); + } else { + $match->start($self->out->start); + $match->end($self->out->end - $start + $self->out->start); + } + bless $match, 'Bio::Coordinate::Result::Match'; + $result->add_sub_Location($match); + + my $gap = Bio::Location::Simple->new; + $gap->start($self->in->end + 1); + $gap->end($value->end); + $gap->strand($value->strand); + $gap->seq_id($self->in->seq_id); + bless $gap, 'Bio::Coordinate::Result::Gap'; + $result->add_sub_Location($gap); + + } + #enveloping + # |-------------------------| + # |---------------------------------| + elsif ($start < $self->out->start and $end > $self->out->end ) { + + $result->seq_id($self->out->seq_id); + if ($value->strand) { + $match->strand($match->strand * $value->strand); + $result->strand($match->strand); + } + # gap1 + my $gap1 = Bio::Location::Simple->new; + $gap1->start($value->start); + $gap1->end($self->in->start - 1); + $gap1->strand($value->strand); + $gap1->seq_id($self->in->seq_id); + bless $gap1, 'Bio::Coordinate::Result::Gap'; + $result->add_sub_Location($gap1); + + # match + $match->seq_id($self->out->seq_id); + + $match->start($self->out->start); + $match->end($self->out->end); + bless $match, 'Bio::Coordinate::Result::Match'; + $result->add_sub_Location($match); + + # gap2 + my $gap2 = Bio::Location::Simple->new; + $gap2->start($self->in->end + 1); + $gap2->end($value->end); + $gap2->strand($value->strand); + $gap2->seq_id($self->in->seq_id); + bless $gap2, 'Bio::Coordinate::Result::Gap'; + $result->add_sub_Location($gap2); + + } else { + $self->throw("Should not be here!"); + } + return $result; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Result.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Result.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,296 @@ +# $Id: Result.pm,v 1.5.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# bioperl module for Bio::Coordinate::Result +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::Result - Results from coordinate transformation + +=head1 SYNOPSIS + + use Bio::Coordinate::Result; + + #get results from a Bio::Coordinate::MapperI + $matched = $result->each_match; + +=head1 DESCRIPTION + +The results from Bio::Coordinate::MapperI are kept in an object which +itself is a split location, See L. The results +are either Matches or Gaps. See L and +L. + +If only one Match is returned, there is a convenience method of +retrieving it or accessing its methods. Same holds true for a Gap. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::Result; +use vars qw(@ISA ); +use strict; + +use Bio::Location::Split; +use Bio::Coordinate::ResultI; + +@ISA = qw(Bio::Location::Split Bio::Coordinate::ResultI); + + +=head2 add_location + + Title : add_sub_Location + Usage : $obj->add_sub_Location($variant) + Function: + + Pushes one Bio::LocationI into the list of variants. + + Example : + Returns : 1 when succeeds + Args : Location object + +=cut + +sub add_sub_Location { + my ($self,$value) = @_; + $self->throw("Is not a Bio::LocationI but [$value]") + unless $value->isa('Bio::LocationI'); + + $self->{'_match'} = $value + if $value->isa('Bio::Coordinate::Result::Match'); + + $self->{'_gap'} = $value + if $value->isa('Bio::Coordinate::Result::Gap'); + + $self->SUPER::add_sub_Location($value); + +} + +=head2 add_result + + Title : add_result + Usage : $obj->add_result($result) + Function: Adds the contents of one Bio::Coordinate::Result + Example : + Returns : 1 when succeeds + Args : Result object + +=cut + +sub add_result { + my ($self,$value) = @_; + + $self->throw("Is not a Bio::Coordinate::Result but [$value]") + unless $value->isa('Bio::Coordinate::Result'); + + map { $self->add_sub_Location($_);} $value->each_Location; + +} + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + + We override this here in order to propagate to all sublocations + which are not remote (provided this root is not remote either) + + Returns : seq_id + Args : [optional] seq_id value to set + + +=cut + +sub seq_id { + my ($self, $seqid) = @_; + + my @ls = $self->each_Location; + if (@ls) { + return $ls[0]->seq_id; + } else { + return undef; + } + +} + + +=head2 Convenience methods + +These methods are shortcuts to Match and Gap locations. + +=cut + +=head2 each_gap + + Title : each_gap + Usage : $obj->each_gap(); + Function: + + Returns a list of Bio::Coordianate::Result::Gap objects. + + Returns : list of gaps + Args : none + +=cut + +sub each_gap{ + my ($self) = @_; + + + my @gaps; + foreach my $gap ($self->each_Location) { + push @gaps, $gap if $gap->isa('Bio::Coordinate::Result::Gap'); + } + return @gaps; + +} + + +=head2 each_match + + Title : each_match + Usage : $obj->each_match(); + Function: + + Returns a list of Bio::Coordinate::Result::Match objects. + + Returns : list of Matchs + Args : none + +=cut + +sub each_match { + my ($self) = @_; + + my @matches; + foreach my $match ($self->each_Location) { + push @matches, $match if $match->isa('Bio::Coordinate::Result::Match'); + } + return @matches; +} + +=head2 match + + Title : match + Usage : $match_object = $obj->match(); #or + $gstart = $obj->gap->start; + Function: Read only method for retrieving or accessing the match object. + Returns : one Bio::Coordinate::Result::Match + Args : + +=cut + +sub match { + my ($self) = @_; + + $self->warn("More than one match in results") + if $self->each_match > 1 and $self->verbose > 0; + unless (defined $self->{'_match'} ) { + my @m = $self->each_match; + $self->{'_match'} = $m[-1]; + } + return $self->{'_match'}; +} + +=head2 gap + + Title : gap + Usage : $gap_object = $obj->gap(); #or + $gstart = $obj->gap->start; + Function: Read only method for retrieving or accessing the gap object. + Returns : one Bio::Coordinate::Result::Gap + Args : + +=cut + +sub gap { + my ($self) = @_; + + $self->warn("More than one gap in results") + if $self->each_gap > 1 and $self->verbose > 0; + unless (defined $self->{'_gap'} ) { + my @m = $self->each_gap; + $self->{'_gap'} = $m[-1]; + } + return $self->{'_gap'}; +} + + +=head2 purge_gaps + + Title : purge_gaps + Usage : $gap_count = $obj->purge_gaps; + Function: remove all gaps from the Result + Returns : count of removed gaps + Args : + +=cut + +sub purge_gaps { + my ($self) = @_; + my @matches; + my $count = 0; + + foreach my $loc ($self->each_Location) { + if ($loc->isa('Bio::Coordinate::Result::Match')) { + push @matches, $loc; + } else { + $count++ + } + } + @{$self->{'_sublocations'}} = (); + delete $self->{'_gap'} ; + push @{$self->{'_sublocations'}}, @matches; + return $count; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Result/Gap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Result/Gap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,73 @@ +# $Id: Gap.pm,v 1.2.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# BioPerl module for Bio::Coordinate::Result::Gap +# +# Cared for by Heikki Lehvaslaiho +# +# Copywright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::Coordinate::Result::Gap - An other name for Bio::Location::Simple + +=head1 SYNOPSIS + + $loc = new Bio::Coordinate::Result::Gap(-start=>10, + -end=>30, + -strand=>1); + +=head1 DESCRIPTION + +This is a location object for coordinate mapping results. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email heikki@ebi.ac.uk + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Coordinate::Result::Gap; +use vars qw(@ISA); +use strict; + +use Bio::Location::Simple; +use Bio::Coordinate::ResultI; + +@ISA = qw(Bio::Location::Simple Bio::Coordinate::ResultI); + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Result/Match.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Result/Match.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,73 @@ +# $Id: Match.pm,v 1.2.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# BioPerl module for Bio::Coordinate::Result::Match +# +# Cared for by Heikki Lehvaslaiho +# +# Copywright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::Coordinate::Result::Match - An other name for Bio::Location::Simple + +=head1 SYNOPSIS + + $loc = new Bio::Coordinate::Result::Match(-start=>10, + -end=>30, + -strand=>+1); + +=head1 DESCRIPTION + +This is a location class for coordinate mapping results. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email heikki@ebi.ac.uk + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Coordinate::Result::Match; +use vars qw(@ISA); +use strict; + +use Bio::Location::Simple; +use Bio::Coordinate::ResultI; + +@ISA = qw(Bio::Location::Simple Bio::Coordinate::ResultI); + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/ResultI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/ResultI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,80 @@ +# $Id: ResultI.pm,v 1.1.2.1 2003/02/21 02:58:47 heikki Exp $ +# +# bioperl module for Bio::Coordinate::ResultI +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::ResultI - Interface to identify coordinate mapper results + +=head1 SYNOPSIS + + # not to be used directly + +=head1 DESCRIPTION + +ResultI identifies Bio::LocationIs returned by +Bio::Coordinate::MapperI implementing classes from other locations. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Coordinate::ResultI; +use vars qw(@ISA ); +use strict; + +# Object preamble +use Bio::LocationI; + +@ISA = qw(Bio::LocationI); + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Coordinate/Utils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Coordinate/Utils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,157 @@ +# $Id: Utils.pm,v 1.1.2.1 2003/02/20 05:11:45 heikki Exp $ +# +# BioPerl module for Bio::Coordinate::Utils +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Coordinate::Utils - Additional methods to create Bio::Coordinate objects + +=head1 SYNOPSIS + + use Bio::Coordinate::Utils; + # get a Bio::Align::AlignI compliant object, $aln, somehow + # it could be a Bio::SimpleAlign + + $mapper = Bio::Coordinate::Utils->from_align($aln, 1); + +=head1 DESCRIPTION + +This class is a holder of methods that work on or create +Bio::Coordinate::MapperI- compliant objects. . These methods are not +part of the Bio::Coordinate::MapperI interface and should in general +not be essential to the primary function of sequence objects. If you +are thinking of adding essential functions, it might be better to +create your own sequence class. See L, +L, and L for more. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Coordinate::Utils; +use vars qw(@ISA); + +use Bio::Location::Simple; +use Bio::Coordinate::Pair; +use Bio::Coordinate::Collection; + +use strict; + +@ISA = qw(Bio::Root::Root); +# new inherited from Root + +=head2 from_align + + Title : from_align + Usage : $mapper = Bio::Coordinate::Utils->from_align($aln, 1); + Function: + Create a mapper out of an alignment. + The mapper will return a value only when both ends of + the input range find a match. + + Note: This implementation works only on pairwise alignments + and is not yet well tested! + + Returns : A Bio::Coordinate::MapperI + Args : Bio::Align::AlignI object + Id for the reference sequence, optional + +=cut + +sub from_align { + my ($self, $aln, $ref ) = @_; + + $aln->isa('Bio::Align::AlignI') || + $self->throw('Not a Bio::Align::AlignI object but ['. ref($self). ']'); + + # default reference sequence to the first sequence + $ref ||= 1; + + my $collection = Bio::Coordinate::Collection->new(-return_match=>1); + + # this works only for pairs, so split the MSA + # take the ref + #foreach remaining seq in aln, do: + + my $cs = $aln->consensus_string(49); + while ( $cs =~ /([^-]+)/g) { + + # alignment coordinates + my $start = pos($cs) - length($1) + 1; + my $end = $start+length($1)-1; + + my $seq1 = $aln->get_seq_by_pos(1); + my $seq2 = $aln->get_seq_by_pos(2); + + my $match1 = Bio::Location::Simple->new + (-seq_id => $seq1->id, + -start => $seq1->location_from_column($start)->start, + -end => $seq1->location_from_column($end)->start, + -strand => $seq1->strand ); + + my $match2 = Bio::Location::Simple->new + (-seq_id => $seq2->id, + -start => $seq2->location_from_column($start)->start, + -end => $seq2->location_from_column($end)->start, + -strand => $seq2->strand ); + + my $pair = Bio::Coordinate::Pair-> + new(-in => $match1, + -out => $match2 + ); + + $collection->add_mapper($pair); + } + + return @{$collection->each_mapper}[0] if $collection->each_mapper == 1; + return $collection; + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Ace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Ace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,209 @@ + +# $Id: Ace.pm,v 1.10 2002/10/22 07:38:29 lapp Exp $ +# +# BioPerl module for Bio::DB::Ace +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Ace - Database object interface to ACeDB servers + +=head1 SYNOPSIS + + $db = Bio::DB::Ace->new( -server => 'myace.server.com', port => '120000'); + + $seq = $db->get_Seq_by_id('MUSIGHBA1'); # Unique ID + + # or ... + + $seq = $db->get_Seq_by_acc('J00522'); # Accession Number + +=head1 DESCRIPTION + +This provides a standard BioPerl database access to Ace, using Lincoln Steins +excellent AcePerl module. You need to download and install the aceperl module from + + http://stein.cshl.org/AcePerl/ + +before this interface will work. + +This interface is designed at the moment to work through a aceclient/aceserver +type mechanism + +=head1 INSTALLING ACEPERL + +Download the latest aceperl tar file, gunzip/untar and cd into the directory. +This is a standard CPAN-style directory, so if you go + + Perl Makefile.PL + make + + make install + +Then you will have installed Aceperl. Use the PREFIX mechanism to install elsewhere. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::Ace; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::DB::RandomAccessI + +use Bio::DB::RandomAccessI; +use Bio::Seq; + +BEGIN { + eval { + require Ace; + }; + if( $@) { + print STDERR "You have not installed Ace.pm.\n Read the docs in Bio::DB::Ace for more information about how to do this.\n It is very easy\n\nError message $@"; + } +} + + +@ISA = qw(Bio::DB::RandomAccessI); + +# new() is inherited from Bio::DB::Abstract + +# _initialize is where the heavy stuff will happen when new is called + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($host,$port) = $self->_rearrange([qw( + HOST + PORT + )], + @args, + ); + + if( !$host || !$port ) { + $self->throw("Must have a host and port for an acedb server to work"); + } + + my $aceobj = Ace->connect(-host => $host, + -port => $port) || + $self->throw("Could not make acedb object to $host:$port"); + + $self->_aceobj($aceobj); + + + return $self; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id($uid); + Function: Gets a Bio::Seq object by its unique identifier/name + Returns : a Bio::Seq object + Args : $id : the id (as a string) of the desired sequence entry + +=cut + +sub get_Seq_by_id { + my $self = shift; + my $id = shift or $self->throw("Must supply an identifier!\n"); + my $ace = $self->_aceobj(); + my ($seq,$dna,$out); + + $seq = $ace->fetch( 'Sequence' , $id); + + # get out the sequence somehow! + + $dna = $seq->asDNA(); + + $dna =~ s/^>.*\n//; + $dna =~ s/\n//g; + + $out = Bio::Seq->new( -id => $id, -type => 'Dna', -seq => $dna, -name => "Sequence from Bio::DB::Ace $id"); + return $out; + +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc($acc); + Function: Gets a Bio::Seq object by its accession number + Returns : a Bio::Seq object + Args : $acc : the accession number of the desired sequence entry + + +=cut + +sub get_Seq_by_acc { + + my $self = shift; + my $acc = shift or $self->throw("Must supply an accesion number!\n"); + + return $self->get_Seq_by_id($acc); +} + +=head2 _aceobj + + Title : _aceobj + Usage : $ace = $db->_aceobj(); + Function: Get/Set on the acedb object + Returns : Ace object + Args : New value of the ace object + +=cut + +sub _aceobj { + my ($self,$arg) = @_; + + if( $arg ) { + $self->{'_aceobj'} = $arg; + } + + return $self->{'_aceobj'}; +} + +1; + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Biblio/biofetch.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Biblio/biofetch.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +# $Id: biofetch.pm,v 1.5 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module Bio::DB::Biblio::biofetch.pm +# +# Cared for by Heikki Lehvaslaiho +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Biblio::biofetch - A BioFetch-based access to a bibliographic + citation retrieval + +=head1 SYNOPSIS + +Do not use this object directly, only access it through the +I module: + + use Bio::Biblio; + my $biblio = new Bio::Biblio (-access => 'biofetch'); + my $ref = $biblio->get_by_id('20063307')); + + my $ids = ['20063307', '98276153']; + my $refio = $biblio->get_all($ids); + while ($ref = $refio->next_bibref) { + print $ref->identifier, "\n"; + } + +=head1 DESCRIPTION + +This class uses BioFetch protocol based service to retrieve Medline +references by their ID. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Heikki Lehvaslaiho (heikki@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 BUGS AND LIMITATIONS + +=over 1 + +=item * + +Only method get_by_id() is supported. + +=back + +=head1 APPENDIX + +The main documentation details are to be found in +L. + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::DB::Biblio::biofetch; +use vars qw(@ISA $VERSION %HOSTS %FORMATMAP $DEFAULTFORMAT + $Revision $DEFAULT_SERVICE $DEFAULT_NAMESPACE); +use strict; + +use Bio::Biblio; +use Bio::DB::DBFetch; +use Bio::Biblio::IO; + +@ISA = qw( Bio::DB::DBFetch Bio::Biblio); + +BEGIN { + + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: biofetch.pm,v 1.5 2002/10/22 07:45:14 lapp Exp $; + + # you can add your own here theoretically. + %HOSTS = ( + 'dbfetch' => { + baseurl => 'http://%s/cgi-bin/dbfetch?db=medline&style=raw', + hosts => { + 'ebi' => 'www.ebi.ac.uk' + } + } + ); + %FORMATMAP = ( 'default' => 'medlinexml' + ); + $DEFAULTFORMAT = 'default'; + + $DEFAULT_SERVICE = 'http://www.ebi.ac.uk/cgi-bin/dbfetch'; + +} + + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + + $self->{ '_hosts' } = {}; + $self->{ '_formatmap' } = {}; + + $self->hosts(\%HOSTS); + $self->formatmap(\%FORMATMAP); + $self->{'_default_format'} = $DEFAULTFORMAT; + + return $self; +} + +=head2 get_by_id + + Title : get_by_id + Usage : $entry = $db->get__by_id('20063307') + Function: Gets a Bio::Biblio::RefI object by its name + Returns : a Bio::Biblio::Medline object + Args : the id (as a string) of the reference + +=cut + +sub get_by_id { + my ($self,$id) = @_; + my $io = $self->get_Stream_by_id([$id]); + $self->throw("id does not exist") if( !defined $io ) ; + return $io->next_bibref(); +} + + +=head2 get_all + + Title : get_all + Usage : $seq = $db->get_all($ref); + Function: Retrieves reference objects from the server 'en masse', + rather than one at a time. For large numbers of sequences, + this is far superior than get_by_id(). + Example : + Returns : a stream of Bio::Biblio::Medline objects + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +=cut + +sub get_all { + my ($self, $ids) = @_; + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); +} + +=head2 get_seq_stream + + Title : get_seq_stream + Usage : my $seqio = $self->get_seq_sream(%qualifiers) + Function: builds a url and queries a web db + Returns : a Bio::SeqIO stream capable of producing sequence + Args : %qualifiers = a hash qualifiers that the implementing class + will process to make a url suitable for web querying + +=cut + +sub get_seq_stream { + my ($self, %qualifiers) = @_; + my ($rformat, $ioformat) = $self->request_format(); + my $seen = 0; + foreach my $key ( keys %qualifiers ) { + if( $key =~ /format/i ) { + $rformat = $qualifiers{$key}; + $seen = 1; + } + } + $qualifiers{'-format'} = $rformat if( !$seen); + ($rformat, $ioformat) = $self->request_format($rformat); + + my $request = $self->get_request(%qualifiers); + my ($stream,$resp); + if( $self->retrieval_type =~ /temp/i ) { + my $dir = $self->io()->tempdir( CLEANUP => 1); + my ( $fh, $tmpfile) = $self->io()->tempfile( DIR => $dir ); + close $fh; + my ($resp) = $self->_request($request, $tmpfile); + if( ! -e $tmpfile || -z $tmpfile || ! $resp->is_success() ) { + $self->throw("WebDBSeqI Error - check query sequences!\n"); + } + $self->postprocess_data('type' => 'file', + 'location' => $tmpfile); + # this may get reset when requesting batch mode + ($rformat,$ioformat) = $self->request_format(); + if( $self->verbose > 0 ) { + open(ERR, "<$tmpfile"); + while() { $self->debug($_);} + } + $stream = new Bio::Biblio::IO('-format' => $ioformat, + '-file' => $tmpfile); + } elsif( $self->retrieval_type =~ /io_string/i ) { + my ($resp) = $self->_request($request); + my $content = $resp->content_ref; + $self->debug( "content is $$content\n"); + if( ! $resp->is_success() || length(${$resp->content_ref()}) == 0 ) { + $self->throw("WebDBSeqI Error - check query sequences!\n"); + } + ($rformat,$ioformat) = $self->request_format(); + $self->postprocess_data('type'=> 'string', + 'location' => $content); + $stream = new Bio::Biblio::IO('-format' => $ioformat, +# '-data' => "". $$content. ""); + '-data' => $$content + ); + } else { + $self->throw("retrieval type " . $self->retrieval_type . + " unsupported\n"); + } + return $stream; +} + + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +# the default method, works for genbank/genpept, other classes should +# override it with their own method. + +sub postprocess_data { + my ($self, %args) = @_; + my $data; + my $type = uc $args{'type'}; + my $location = $args{'location'}; + if( !defined $type || $type eq '' || !defined $location) { + return; + } elsif( $type eq 'STRING' ) { + $data = $$location; + } elsif ( $type eq 'FILE' ) { + open(TMP, $location) or $self->throw("could not open file $location"); + my @in = ; + close TMP; + $data = join("", @in); + } + + $data = "". $data. ""; + + if( $type eq 'FILE' ) { + open(TMP, ">$location") or $self->throw("could overwrite file $location"); + print TMP $data; + close TMP; + } elsif ( $type eq 'STRING' ) { + ${$args{'location'}} = $data; + } + + $self->debug("format is ". $self->request_format(). " data is $data\n"); + +} + + + + +=head2 VERSION and Revision + + Usage : print $Bio::DB::Biblio::biofetch::VERSION; + print $Bio::DB::Biblio::biofetch::Revision; + +=cut + +=head2 Defaults + + Usage : print $Bio::DB::Biblio::biofetch::DEFAULT_SERVICE; + +=cut + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Biblio/soap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Biblio/soap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,558 @@ +# $Id: soap.pm,v 1.5 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module Bio::DB::Biblio::soap.pm +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Biblio::soap - A SOAP-based access to a bibliographic query service + +=head1 SYNOPSIS + +Do not use this object directly, it is recommended to access it and use +it through the I module: + + use Bio::Biblio; + my $biblio = new Bio::Biblio (-access => 'soap'); + +=head1 DESCRIPTION + +This object contains the real implementation of a Bibliographic Query +Service as defined in L - using a SOAP protocol +to access a WebService (a remote server) that represents a +bibliographic repository. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 BUGS AND LIMITATIONS + +=over + +=item * + +Methods returning a boolean value (I, I and +I) can be used only with SOAP::Lite version 0.52 and newer +(probably due to a bug in the older SOAP::Lite). + +=item * + +It does not use WSDL. Coming soon... + +=item * + +There is an open question to discuss: should the service return +citations as type I or rather as type I? What is +faster? What is better for keeping non-ASCII characters untouched? How +the decision would be influenced if the transparent compression +support is introduced? + +=item * + +More testing and debugging needed to ensure that returned citations +are properly transferred even if they contain foreign characters. + +=back + +=head1 APPENDIX + +The main documentation details are to be found in +L. + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::DB::Biblio::soap; +use vars qw(@ISA $VERSION $Revision $DEFAULT_SERVICE $DEFAULT_NAMESPACE); +use strict; + +use Bio::Biblio; # TBD: ?? WHY SHOULD I DO THIS ?? +use SOAP::Lite + on_fault => sub { + my $soap = shift; + my $res = shift; + my $msg = + ref $res ? "--- SOAP FAULT ---\n" . $res->faultcode . " " . $res->faultstring + : "--- TRANSPORT ERROR ---\n" . $soap->transport->status; + Bio::DB::Biblio::soap->throw ( -text => $msg ); + } +; + +@ISA = qw(Bio::Biblio); + +BEGIN { + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: soap.pm,v 1.5 2002/10/22 07:45:14 lapp Exp $; + + # where to go... + $DEFAULT_SERVICE = 'http://industry.ebi.ac.uk/soap/openBQS'; + + # ...and what to find there + $DEFAULT_NAMESPACE = 'http://industry.ebi.ac.uk/openBQS'; +} + +# ----------------------------------------------------------------------------- + +=head2 _initialize + + Usage : my $obj = new Bio::Biblio (-access => 'soap' ...); + (_initialize is internally called from this constructor) + Returns : nothing interesting + Args : This module recognises and uses following arguments: + + -namespace => 'urn' + The namespace used by the WebService that is being + accessed. It is a string which guarantees its world-wide + uniqueness - therefore it often has a style of a URL - + but it does not mean that such pseudo-URL really exists. + + Default is 'http://industry.ebi.ac.uk/openBQS' + (which well corresponds with the default '-location' - + see module Bio::Biblio). + + -destroy_on_exit => '0' + Default value is '1' which means that all Bio::Biblio + objects - when being finalised - will send a request + to the remote WebService to forget the query collections + they represent. + + If you change it to '0' make sure that you know the + query collection identification - otherwise you will + not be able to re-established connection with it. + This can be done by calling method get_collection_id. + + -collection_id => '...' + It defines what query collection will this object work + with. Use this argument when you know a collection ID + of an existing query collection and when you wish to + re-established connection with it. + + By default, the collection IDs are set automatically + by the query methods - they return Bio::Biblio objects + already having a collection ID. + + A missing or undefined collection ID means that the + object represents the whole bibliographic repository + (which again means that some methods, like get_all, + will be probably refused). + + -soap => a SOAP::Lite object + Usually all Bio::Biblio objects share an instance of + the underlying SOAP::Lite module. But you are free + to have more - perhaps with different characteristics. + + See the code for attributes of the default SOAP::Lite + object. + + -httpproxy => 'http://server:port' + In addition to the 'location' parameter, you may need + to specify also a location/URL of a HTTP proxy server + (if your site requires one). + + Additionally, the main module Bio::Biblio recognises + also: + -access => '...' + -location => '...' + +It populates calling object with the given arguments, and then - for +some attributes and only if they are not yet populated - it assigns +some default values. + +This is an actual new() method (except for the real object creation +and its blessing which is done in the parent class Bio::Root::Root in +method _create_object). + +Note that this method is called always as an I method (never as +a I method) - and that the object who calls this method may +already be partly initiated (from Bio::Biblio::new method); so if you +need to do some tricks with the 'class invocation' you need to change +Bio::Biblio::new method, not this one. + +=cut + +sub _initialize { + my ($self, @args) = @_; + + # make a hashtable from @args + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + + # copy all @args into this object (overwriting what may already be + # there) - changing '-key' into '_key' + my $new_key; + foreach my $key (keys %param) { + ($new_key = $key) =~ s/^-/_/; + $self->{ $new_key } = $param { $key }; + } + + # finally add default values for those keys who have default value + # and who are not yet in the object + $self->{'_location'} = $DEFAULT_SERVICE unless $self->{'_location'}; + $self->{'_namespace'} = $DEFAULT_NAMESPACE unless $self->{'_namespace'}; + $self->{'_destroy_on_exit'} = 1 unless defined $self->{'_destroy_on_exit'}; + unless ($self->{'_soap'}) { + if (defined $self->{'_httpproxy'}) { + $self->{'_soap'} = SOAP::Lite + -> uri ($self->{'_namespace'}) + -> proxy ($self->{'_location'}, + proxy => ['http' => $self->{'_httpproxy'}]); + } else { + $self->{'_soap'} = SOAP::Lite + -> uri ($self->{'_namespace'}) + -> proxy ($self->{'_location'}); + } + } +} + +# ----------------------------------------------------------------------------- + +# +# objects representing query collections are being destroyed if they +# have attribute '_destroy_on_exit' set to true - which is a default +# value +# +sub DESTROY { + my $self = shift; + my $soap = $self->{'_soap'}; + my $destroy = $self->{'_destroy_on_exit'}; + return unless $destroy; + my $collection_id = $self->{'_collection_id'}; + return unless $collection_id; + + # ignore all errors here + eval { + $soap->destroy (SOAP::Data->type (string => $collection_id)); + } +} + +# +# some methods must be called with an argument containing a collection +# ID; here we return a proper error message explaining it +# +sub _no_id_msg { + my $self = shift; + my $package = ref $self; + my $method = (caller(1))[3]; + my $strip_method = $method; + $strip_method =~ s/^$package\:\://; + + return <<"END_OF_MSG"; +Method '$method' works only if its object has a query collection ID. +Perhaps you need to use: +\tnew Bio::Biblio (-collection_id => '1234567')->$strip_method; +or to obtain a collection ID indirectly from a query method: +\tnew Bio::Biblio->find ('keyword')->$strip_method; +END_OF_MSG +} + +# +# some methods do not work with older SOAP::Lite version; here we +#return message explaining it +# +sub _old_version_msg { + my $self = shift; + my $method = (caller(1))[3]; + + return <<"END_OF_MSG"; +Method '$method' works only with SOAP::Lite +version 0.52 and newer (the problem is with returning a boolean value from the server). +END_OF_MSG +} + +# +# some controlled vocabulary methods needs two parameters; here we +# return message explaining it +# +sub _two_params_msg { + my $self = shift; + my $method = (caller(1))[3]; + + return <<"END_OF_MSG"; +Method '$method' expects two parameters: vocabulary name and a value. +END_OF_MSG +} + +# +# some controlled vocabulary methods needs a vocabulary name; here we +# return message explaining it +# +sub _missing_name_msg { + my $self = shift; + my $method = (caller(1))[3]; + + return <<"END_OF_MSG"; +Method '$method' expects vocabulary name as parameter. +END_OF_MSG +} + +# +# return a copy of a given array, with all its elements replaced +# with the SOAP-Data objects defining elements type as 'string' +# +sub _as_strings { + my ($ref_input_array) = @_; + my (@result) = map { SOAP::Data->new (type => 'string', value => $_) } @$ref_input_array; + return \@result; +} + +# --------------------------------------------------------------------- +# +# Here are the methods implementing Bio::DB::BiblioI interface +# (documentation is in Bio::DB::BiblioI) +# +# --------------------------------------------------------------------- + +sub get_collection_id { + my ($self) = @_; + $self->{'_collection_id'}; +} + +sub get_count { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + if ($collection_id) { + $soap->getBibRefCount (SOAP::Data->type (string => $collection_id))->result; + } else { + $soap->getBibRefCount->result; + } +} + +# try: 94033980 +sub get_by_id { + my ($self, $citation_id) = @_; + $self->throw ("Citation ID is expected as a parameter of method 'get_by_id'.") + unless $citation_id; + my $soap = $self->{'_soap'}; + $soap->getById (SOAP::Data->type (string => $citation_id))->result; +} + +sub find { + my ($self, $keywords, $attrs) = @_; + my (@keywords, @attrs); + + # $keywords can be a comma-delimited scalar or a reference to an array + if ($keywords) { + my $ref = ref $keywords; + @keywords = split (/,/, $keywords) unless $ref; + @keywords = @$keywords if $ref =~ /ARRAY/; + } + $self->throw ("No keywords given in 'find' method.\n") + unless (@keywords); + + # ...and the same with $attrs + if ($attrs) { + my $ref = ref $attrs; + @attrs = split (/,/, $attrs) unless $ref; + @attrs = @$attrs if $ref =~ /ARRAY/; + } + + my $soap = $self->{'_soap'}; + my $collection_id = $self->{'_collection_id'}; + my $new_id; + if ($collection_id) { + if (@attrs) { + $new_id = $soap->find (SOAP::Data->type (string => $collection_id), + &_as_strings (\@keywords), + &_as_strings (\@attrs))->result; + } else { + $new_id = $soap->find (SOAP::Data->type (string => $collection_id), + &_as_strings (\@keywords))->result; + } + } else { + if (@attrs) { + $new_id = $soap->find (&_as_strings (\@keywords), + &_as_strings (\@attrs))->result; + + + } else { + $new_id = $soap->find (&_as_strings (\@keywords))->result; + } + } + + # clone itself but change the collection ID to a new one + return $self->new (-collection_id => $new_id, + -parent_collection_d => $collection_id); +} + +sub get_all_ids { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + $soap->getAllIDs (SOAP::Data->type (string => $collection_id))->result; +} + +sub get_all { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + $soap->getAllBibRefs (SOAP::Data->type (string => $collection_id))->result; +} + +sub has_next { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + $self->throw ($self->_old_version_msg) if $SOAP::Lite::VERSION < 0.52; + $soap->hasNext (SOAP::Data->type (string => $collection_id))->result; +} + +sub get_next { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + my $ra = $soap->getNext (SOAP::Data->type (string => $collection_id))->result; + $self->{'_collection_id'} = shift @{ $ra }; + shift @{ $ra }; +} + +sub get_more { + my ($self, $how_many) = @_; + my $soap = $self->{'_soap'}; + my $collection_id = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + + unless (defined ($how_many) and $how_many =~ /^\d+$/) { + warn ("Method 'get_more' expects a numeric argument. Changing to 1.\n"); + $how_many = 1; + } + unless ($how_many > 0) { + warn ("Method 'get_more' expects a positive argument. Changing to 1.\n"); + $how_many = 1; + } + + my $ra = $soap->getMore (SOAP::Data->type (string => $collection_id), + SOAP::Data->type (int => $how_many))->result; + $self->{'_collection_id'} = shift @{ $ra }; + $ra; +} + +sub reset_retrieval { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + $self->{'_collection_id'} = $soap->resetRetrieval (SOAP::Data->type (string => $collection_id))->result; +} + +sub exists { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + $self->throw ($self->_old_version_msg) if $SOAP::Lite::VERSION < 0.52; + $soap->exists (SOAP::Data->type (string => $collection_id))->result; +} + +sub destroy { + my ($self) = @_; + my $soap = $self->{'_soap'}; + my ($collection_id) = $self->{'_collection_id'}; + $self->throw ($self->_no_id_msg) unless $collection_id; + $soap->destroy (SOAP::Data->type (string => $collection_id)); +} + +sub get_vocabulary_names { + my ($self) = @_; + my $soap = $self->{'_soap'}; + $soap->getAllVocabularyNames->result; +} + +sub contains { + my ($self, $vocabulary_name, $value) = @_; + my $soap = $self->{'_soap'}; + $self->throw ($self->_old_version_msg) if $SOAP::Lite::VERSION < 0.52; + $self->throw ($self->_two_params_msg) + unless defined $vocabulary_name and defined $value; + $soap->contains (SOAP::Data->type (string => $vocabulary_name), + SOAP::Data->type (string => $value))->result; +} + +sub get_entry_description { + my ($self, $vocabulary_name, $value) = @_; + my $soap = $self->{'_soap'}; + $self->throw ($self->_two_params_msg) + unless defined $vocabulary_name and defined $value; + $soap->getEntryDescription (SOAP::Data->type (string => $vocabulary_name), + SOAP::Data->type (string => $value))->result; +} + +sub get_all_values { + my ($self, $vocabulary_name) = @_; + my $soap = $self->{'_soap'}; + $self->throw ($self->_missing_name_msg) + unless defined $vocabulary_name; + $soap->getAllValues (SOAP::Data->type (string => $vocabulary_name))->result; +} + +sub get_all_entries { + my ($self, $vocabulary_name) = @_; + my $soap = $self->{'_soap'}; + $self->throw ($self->_missing_name_msg) + unless defined $vocabulary_name; + $soap->getAllEntries (SOAP::Data->type (string => $vocabulary_name))->result; +} + +=head2 VERSION and Revision + + Usage : print $Bio::DB::Biblio::soap::VERSION; + print $Bio::DB::Biblio::soap::Revision; + +=cut + +=head2 Defaults + + Usage : print $Bio::DB::Biblio::soap::DEFAULT_SERVICE; + print $Bio::DB::Biblio::soap::DEFAULT_NAMESPACE; + +=cut + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/BiblioI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/BiblioI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,493 @@ +# $Id: BiblioI.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module for Bio::DB::BiblioI +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::BiblioI - An interface to a Bibliographic Query Service + +=head1 SYNOPSIS + +This is an interface module - you do not instantiate it. +Use I module: + + use Bio::Biblio; + my $biblio = new Bio::Biblio (@args); + +=head1 DESCRIPTION + +This interface describes the methods for accessing a bibliographic +repository, for quering it and for retrieving citations from it. The +retrieved citations are in XML format and can be converted to perl +objects using I. + +The interface complies (with some simplifications) with the +specification described in the B project. Its home page is at +http://industry.ebi.ac.uk/openBQS + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +This is actually the main documentation... + +If you try to call any of these methods directly on this +Bio::DB::BiblioI object you will get a I error +message. You need to call them on a Bio::Biblio object. + +=cut + + +# Let the code begin... + +package Bio::DB::BiblioI; +use vars qw(@ISA $VERSION $Revision); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +BEGIN { + $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: BiblioI.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $; +} + +# ----------------------------------------------------------------------------- + +=head2 get_collection_id + + Usage : my $collection_id = $biblio->get_collection_id; + Returns : string identifying a query collection + represented by the $biblio object + Args : none + +Every query collection is uniquely identify-able by its collection +ID. The returned value can be used to populate another $biblio object +and then to access that collection. + +=cut + +sub get_collection_id { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + +# ----------------------------------------------------------------------------- + +=head2 get_count + + Usage : my $count = $biblio->get_count; + Returns : integer + Args : none, or a string identifying a query collection + +It returns a number of citations in the query collection represented +by the calling $biblio object, or in the collection whose ID is given +as an argument. + +=cut + +sub get_count { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 find + + Usage : my $new_biblio = $biblio->find ($keywords, $attrs); + my $new_biblio = $biblio->find ('perl', 'abstract'); + my $new_biblio = $biblio->find ( [ 'perl', 'Java' ] ); + Returns : new Bio::Biblio object representing a new query + collection + Args : $keywords - what to look for (mandatory) + - a comma-delimited list of keywords, or + - an array reference with keywords as elements + $attrs - where to look in (optional) + - a comma-delimited list of attribute names, or + - an array reference with attribute names as elements + +This is the main query method. It looks for the $keywords in a default +set of attributes, or - if $attrs given - only in the given +attributes. + +Because it returns a new Bio::Biblio object which can be again queried +it is possible to chain together several invocations: + + $biblio->find ('Brazma')->find ('Robinson')->get_collection_id; + +=cut + +sub find { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +# TBD: AFAIK this method is not implemented on the server side. +# Let's comment it out for the time being... +#sub query { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 reset_retrieval + + Usage : $biblio->reset_retrieval; + Returns : nothing + Args : none + +It sets an iterator stored in the $biblio object back to its +beginning. After this, the retrieval methods I, I +and I start to iterate the underlying query collection +again from its start. + +It throws an exception if this object does not represent any query +result (e.i. it does not contain a collection ID). Note that a +collection ID is created automatically when this object was returned +by a I method, or it can be assigned in a constructor using +argument I<-collection_id>. + +=cut + +sub reset_retrieval { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_next + + Usage : my $citation = $biblio->get_next; + Returns : a citation in an XML format + Args : none + +It returns the next available citation from the underlying query +collection. It throws an exception if there are no more citations. In +order to avoid this use it together with the I method: + + my $result = $biblio->find ('brazma', 'authors'); + while ( $result->has_next ) { + print $result->get_next; + } + +It also throws an exception if this object does not represent any +query result - see explanation in the I elsewhere in +this document. + +=cut + +sub get_next { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_more + + Usage : my $r_citations = $biblio->get_more (5); + Returns : an array reference - each element has a citation + in an XML format + Args : an integer 'how_many' citations to return; + default is 1 - but it is assigned with warning + +It returns the next I available citations from the +underlying query collection. It does not throw any exception if +'how_many' is more than currently available - it simply returns +less. However, it throws an exception if used again without calling +first I. + +It also throws an exception if this object does not represent any +query result - see explanation in method I elsewhere +in this document. + +=cut + +sub get_more { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 has_next + + Usage : my $is = $biblio->has_next; + Returns : 1 or undef + Args : none + +It returns 1 if there is a next citation available in the underlying +query collection. Otherwise it returns undef. + +It throws an exception if this object does not represent any query +result - see explanation in method I elsewhere in +this document. + +=cut + +sub has_next { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_all_ids + + Usage : my $r_ids = $biblio->get_all_ids; + Returns : an array reference - each element has + a citation identifier + Args : none + +The identifiers of all citations in the underlying query collection +are returned. A usual pattern is to use them then in the I +method: + + my $biblio = $repository->find ('brazma')->find ('robinson'); + foreach my $id ( @{ $biblio->get_all_ids } ) { + print $biblio->get_by_id ($id); + } + +It throws an exception if this object does not represent any query +result - see explanation in method I elsewhere in +this document. + +=cut + +sub get_all_ids { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_by_id + + Usage : my $citation = $biblio->get_by_id ('94033980'); + Returns : a citation in an XML format + Args : a citation identifier + (e.g. for MEDLINE it is a MedlineID + - at least for the time being) + +It returns a citation - disregarding if the citation is or is not in +the underlying query collection (of course, it must be in the +repository). + +=cut + +sub get_by_id { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_all + + Usage : my $all = $biblio->get_all; + Returns : a (big) string with all citations in an XML format + Args : none + +It returns an XML valid string (which means that individual citations +are also surrounded by a "set" XML tag) representing all citations +from the underlying query collection. + +Note that some servers may limit the number of citations which can be +returned by this method. In such case you need either to refine +further your query collection (using I method) or to retrieve +results by iteration (methods I, I, I). + +It throws an exception if this object does not represent any query +result - see explanation in method I elsewhere in +this document. + +=cut + +sub get_all { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 exists + + Usage : my $exists = $biblio->exists; + Returns : 1 or undef + Args : none + +It returns 1 if the underlying query collection represented by the +$biblio object still exists (on the server side). + +If you have a collection ID (e.g. stored or printed in a previous +session) but you do not have anymore a C object representing +it this is how you can check the collection existence: + + use Bio::Biblio; + print + new Bio::Biblio (-collection_id => '1014324148861')->exists; + +It throws an exception if this object does not represent any query +result - see explanation in method I elsewhere in +this document. + +=cut + +sub exists { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 destroy + + Usage : $biblio->destroy; + Returns : nothing + Args : none + +It sends a message to the remote server to forget (or free, or destroy +- whatever server choose to do) the query collection represented by +this object. + +It throws an exception if this object does not represent any query +collection. + +=cut + +sub destroy { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_vocabulary_names + + Usage : print join ("\n", @{ $biblio->get_vocabulary_names }); + Returns : an array reference - each element has a name + of a controlled vocabulary + Args : none + +The controlled vocabularies allow to introspect bibliographic +repositories and to find what citation resource types (such as journal +and book articles, patents or technical reports) are provided by the +repository, what attributes they have, eventually what attribute +values are allowed. + +This method returns names of all available controlled +vocabularies. The names can than be used in other methods dealing with +vocabularies: I, I, +I, and I. + +=cut + +sub get_vocabulary_names { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 contains + + Usage : my $yes = $biblio->contains ($vocabulary_name, $value); + Returns : 1 or undef + Args : $vocabulary_name defines a vocabulary where to look, + and a $value defines what to look for + +It returns 1 if the given controlled vocabulary contains the given +value. + +For example, when you know, that a vocabulary +C contains value C you can +use it in the I method: + + $biblio->find ('United States', 'COUNTRY'); + +=cut + +sub contains { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_entry_description + + Usage : $biblio->get_entry_description ($voc_name, $value); + Returns : a string with a desciption + Args : $voc_name defines a vocabulary where to look, + and a $value defines whose description to return + +Each vocabulary entry has its value (mandatory attribute), and can +have a description (optional attribute). The description may be just a +human readable explanation of an attribute, or it can have more exact +meaning. For example, the server implementation of the bibliographic +query service provided by the EBI puts into attribute descriptions +words I and/or I to distinguish the role of +the attributes. + +It throws an exception if either vocabulary or value do not exist. + +=cut + +sub get_entry_description { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_all_values + + Usage : $biblio->get_all_values ($vocabulary_name); + Returns : an array reference - each element has a value (scalar) + from the given controlled vocabulary + Args : $vocabulary_name defines a vocabulary whose values + are being returned + +It returns all values of the given vocabulary. It throws an exception +if the vocabulary does not exist. + +=cut + +sub get_all_values { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 get_all_entries + + Usage : $biblio->get_all_entries ($vocabulary_name); + Returns : a hash reference - keys are vocabulary values + and values are their descriptions + Args : $vocabulary_name defines a vocabulary whose entries + are being returned + +It returns pairs of values and their descriptions of the whole +vocabulary. It throws an exception if the vocabulary does not exist. + +This is one way how to get it and print it: + + my $name = 'MEDLINE2002/JournalArticle/properties'; + use Data::Dumper; + print Data::Dumper->Dump ( [$biblio->get_all_entries ($name)], + ['All entries']); + +=cut + +sub get_all_entries { shift->throw_not_implemented; } + +# ----------------------------------------------------------------------------- + +=head2 VERSION and Revision + + Usage : print $Bio::DB::BiblioI::VERSION; + print $Bio::DB::BiblioI::Revision; + +=cut + +1; +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/BioFetch.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/BioFetch.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,516 @@ +# $Id: BioFetch.pm,v 1.13.2.1 2003/06/25 13:44:18 heikki Exp $ +# +# BioPerl module for Bio::DB::BioFetch +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +package Bio::DB::BioFetch; +use strict; +use Bio::DB::WebDBSeqI; +use HTTP::Request::Common 'POST'; + +=head1 NAME + +Bio::DB::BioFetch - Database object interface to BioFetch retrieval + +=head1 SYNOPSIS + + use Bio::DB::BioFetch; + + $bf = new Bio::DB::BioFetch; + + $seq = $sp->get_Seq_by_id('BUM'); # EMBL or SWALL ID + + # change formats, storage procedures + $bf = new Bio::DB::BioFetch(-format => 'fasta', + -retrievaltype => 'tempfile', + -db => 'EMBL'); + + $stream = $bf->get_Stream_by_id(['BUM','J00231']); + while (my $s = $stream->next_seq) { + print $s->seq,"\n"; + } + # get a RefSeq entry + $bf->db('refseq'); + eval { + $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION + }; + print "accession is ", $seq->accession_number, "\n" unless $@; + + +=head1 DESCRIPTION + +Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching +method. It goes to the Web-based dbfetch server located at the EBI +(http://www.ebi.ac.uk/cgi-bin/dbfetch) to retrieve sequences in the +EMBL or GenBank sequence repositories. + +This module implements all the Bio::DB::RandomAccessI interface, plus +the get_Stream_by_id() and get_Stream_by_acc() methods that are found +in the Bio::DB::SwissProt interface. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email Lincoln Stein Elstein@cshl.orgE + +Also thanks to Heikki Lehvaslaiho Eheikki@ebi.ac.ukE for the +BioFetch server and interface specification. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... +use vars qw(@ISA $VERSION %FORMATMAP ); +use Bio::Root::Root; +@ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); +$VERSION = '1.0'; + +# warning: names used here must map into Bio::SeqIO::* space +use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/cgi-bin/dbfetch'; + +BEGIN { + + %FORMATMAP = ( + 'embl' => { + default => 'embl', # default BioFetch format/SeqIOmodule pair + embl => 'embl', # alternative BioFetch format/module pair + fasta => 'fasta', # alternative BioFetch format/module pair + namespace => 'embl', + }, + 'swissprot' => { + default => 'swiss', + swissprot => 'swiss', + fasta => 'fasta', + namespace => 'swall', + }, + 'refseq' => { + default => 'genbank', + genbank => 'genbank', + fasta => 'fasta', + namespace => 'RefSeq', + }, + 'swall' => { + default => 'swiss', + swissprot => 'swiss', + fasta => 'fasta', + namespace => 'swall', + }, + 'genbank' => { + default => 'genbank', + genbank => 'genbank', + namespace => 'genbank', + }, + 'genpep' => { + default => 'genbank', + genbank => 'genbank', + namespace => 'genpep', + }, + ); +} + +=head2 new + + Title : new + Usage : $bf = Bio::DB::BioFetch->new(@args) + Function: Construct a new Bio::DB::BioFetch object + Returns : a Bio::DB::BioFetch object + Args : see below + Throws : + +@args are standard -name=Evalue options as listed in the following +table. If you do not provide any options, the module assumes reasonable +defaults. + + Option Value Default + ------ ----- ------- + + -baseaddress location of dbfetch server http://www.ebi.ac.uk/cgi-bin/dbfetch + -retrievaltype "tempfile" or "io_string" io_string + -format "embl", "fasta", "swissprot", embl + or "genbank" + -db "embl", "genbank" or "swissprot" embl + +=cut + +#' +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($db) = $self->_rearrange([qw(DB)],@args); + $db ||= $self->default_db; + $self->db($db); + $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address; + $self; +} + +=head2 new_from_registry + + Title : new_from_registry + Usage : $biofetch = $db->new_from_registry(%config) + Function: Creates a BioFetch object from the registry config hash + Returns : itself + Args : A configuration hash (see Registry.pm) + Throws : + + +=cut + +sub new_from_registry { + my ($class,%config)=@_; + + my $self = $class->SUPER::new( + -BASEADDRESS=>$config{'location'} + ); + $self->db($config{'dbname'}) if $config{dbname}; + return $self; +} + +# from Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + +=cut + +=head2 get_Seq_by_gi + + Title : get_Seq_by_gi + Usage : $seq = $db->get_Seq_by_gi('405830'); + Function: Gets a Bio::Seq object by gi number + Returns : A Bio::Seq object + Args : gi number (as a string) + Throws : "gi does not exist" exception + +=cut + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=cut + +sub get_Seq_by_version { + my ($self,$seqid) = @_; + return $self->get_Seq_by_acc($seqid); +} + + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + +=cut + +=head2 get_Stream_by_gi + + Title : get_Stream_by_gi + Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]); + Function: Gets a series of Seq objects by gi numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of gi numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Get a series of Seq objects by their IDs + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : an array reference containing a list of unique + ids/accession numbers. + +In some of the Bio::DB::* moduels, get_Stream_by_id() is called +get_Stream_by_batch(). Since there seems to be no consensus, this +is provided as an alias. + +=cut + +*get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id; + +=head1 The remainder of these methods are for internal use + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: returns a HTTP::Request object + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + + +sub get_request { + my ($self, @qualifiers) = @_; + my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], + @qualifiers); + my $db = $self->db; + my $namespace = $self->_namespace; + + $self->throw("Must specify a value for UIDs to fetch") + unless defined $uids; + my $tmp; + my $format_string = ''; + + $format ||= $self->default_format; + ($format, $tmp) = $self->request_format($format); + + my $base = $self->url_base_address; + my $uid = join('+', ref $uids ? @$uids : $uids); + $self->debug("\n$base$format_string&id=$uid\n"); + return POST($base, + [ db => $namespace, + id => join('+',ref $uids ? @$uids : $uids), + format => $format, + style => 'raw' + ]); +} + +=head2 default_format + + Title : default_format + Usage : $format = $self->default_format + Function: return the default format + Returns : a string + Args : + +=cut + +sub default_format { + return 'default'; +} + +=head2 default_db + + Title : default_db + Usage : $db = $self->default_db + Function: return the default database + Returns : a string + Args : + +=cut + +sub default_db { 'embl' } + +=head2 db + + Title : db + Usage : $db = $self->db([$db]) + Function: get/set the database + Returns : a string + Args : new database + +=cut + +sub db { + my $self = shift; + + if (@_) { + + my $db = lc shift; + $FORMATMAP{$db} or $self->throw("invalid db [$db], must be one of [". + join(' ',keys %FORMATMAP). "]"); + $self->{_db} = $db; + } + return $self->{_db} || $self->default_db ; +} + +sub _namespace { + my $self = shift; + my $db = $self->db; + return $FORMATMAP{$db}{namespace} or $db; +} + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +sub postprocess_data { + my ($self,%args) = @_; + + # check for errors in the stream + if ($args{'type'} eq 'string') { + my $stringref = $args{'location'}; + if ($$stringref =~ /^ERROR (\d+) (.+)/m) { + $self->throw("BioFetch Error $1: $2"); + } + } + + elsif ($args{'type'} eq 'file') { + open (F,$args{'location'}) or $self->throw("Couldn't open $args{location}: $!"); + # this is dumb, but the error may be anywhere on the first three lines because the + # CGI headers are sometimes printed out by the server... + my @data = (scalar ,scalar ,scalar ); + if (join('',@data) =~ /^ERROR (\d+) (.+)/m) { + $self->throw("BioFetch Error $1: $2"); + } + close F; + } + + else { + $self->throw("Don't know how to postprocess data of type $args{'type'}"); + } +} + + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + if ( defined $value ) { + my $db = $self->db; + my $namespace = $self->_namespace; + my $format = lc $value; + print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n" + if $self->verbose > 0; + $self->throw("Invalid format [$format], must be one of [". + join(' ',keys %{$FORMATMAP{$db}}). "]") + unless $format eq 'default' || $FORMATMAP{$db}->{$format}; + + $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}]; + } + return @{$self->{'_format'}}; +} + + +=head2 Bio::DB::WebDBSeqI methods + +Overriding WebDBSeqI method to help newbies to retrieve sequences. +EMBL database is all too often passed RefSeq accessions. This +redirects those calls. See L. + + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + +=cut + +sub get_Stream_by_acc { + my ($self, $ids ) = @_; + $self->_check_id($ids); + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); +} + + +=head2 _check_id + + Title : _check_id + Usage : + Function: Throw on whole chromosome NCBI sequences not in sequence databases + and redirect RefSeq accession requests sent to EMBL. + Returns : + Args : $id(s), $string + Throws : if accessionn number indicates whole chromosome NCBI sequence + +=cut + +sub _check_id { + my ($self, $id) = @_; + + # NT contigs can not be retrieved + $self->throw("NT_ contigs are whole chromosome files which are not part of regular". + "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") + if $id =~ /NT_/; + + # Asking for a RefSeq from EMBL/GenBank + + if ($id =~ /N._/ && $self->db ne 'refseq') { + $self->warn("[$id] is not a normal sequence database but a RefSeq entry.". + " Redirecting the request.\n") + if $self->verbose >= 0; + $self->db('RefSeq'); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/DBFetch.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/DBFetch.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,370 @@ +# +# $Id: DBFetch.pm,v 1.8 2002/12/22 22:02:13 lstein Exp $ +# +# BioPerl module for Bio::DB::DBFetch +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::DBFetch - Database object for retrieving using the dbfetch script + +=head1 SYNOPSIS + + #do not use this module directly + +=head1 DESCRIPTION + +Allows the dynamic retrieval of entries from databases using the +dbfetch script at EBI: +LEwww.ebi.ac.ukEcgi-binEdbfetch>. + +In order to make changes transparent we have host type (currently only +ebi) and location (defaults to ebi) separated out. This allows later +additions of more servers in different geographical locations. + +This is a superclass which is called by instantiable subclasses with +correct parameters. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email Heikki Lehvaslaiho EHeikki@ebi.ac.ukE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::DBFetch; +use strict; +use vars qw(@ISA $MODVERSION $DEFAULTFORMAT $DEFAULTLOCATION + $DEFAULTSERVERTYPE); + +$MODVERSION = '0.1'; +use HTTP::Request::Common; +use Bio::DB::WebDBSeqI; + +@ISA = qw(Bio::DB::WebDBSeqI); + +# the new way to make modules a little more lightweight + +BEGIN { + # global vars + $DEFAULTSERVERTYPE = 'dbfetch'; + $DEFAULTLOCATION = 'ebi'; +} + + +=head1 Routines from Bio::DB::WebDBSeqI + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: returns a HTTP::Request object + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self, @qualifiers) = @_; + my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], + @qualifiers); + + $self->throw("Must specify a value for UIDs to fetch") + unless defined $uids; + my $tmp; + my $format_string = ''; + $format ||= $self->default_format; + ($format, $tmp) = $self->request_format($format); + $format_string = "&format=$format" if $format ne $self->default_format; + my $url = $self->location_url(); + my $uid; + if( ref($uids) =~ /ARRAY/i ) { + $uid = join (',', @$uids); + $self->warn ('The server will accept maximum of 50 entries in a request. The rest are ignored.') + if scalar @$uids >50; + } else { + $uid = $uids; + } + + return GET $url. $format_string. '&id='. $uid; +} + + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +# remove occasional blank lines at top of web output +sub postprocess_data { + my ($self, %args) = @_; + if ($args{type} eq 'string') { + ${$args{location}} =~ s/^\s+//; # get rid of leading whitespace + } + elsif ($args{type} eq 'file') { + open F,$args{location} or $self->throw("Cannot open $args{location}: $!"); + my @data = ; + for (@data) { + last unless /^\s+$/; + shift @data; + } + open F,">$args{location}" or $self->throw("Cannot write to $args{location}: $!"); + print F @data; + close F; + } +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + my ($self) = @_; + return $self->{'_default_format'}; +} + +=head1 Bio::DB::DBFetch specific routines + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $seq = $db->get_Stream_by_id($ref); + Function: Retrieves Seq objects from the server 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +NOTE: for backward compatibility, this method is also called +get_Stream_by_batch. + +=cut + +sub get_Stream_by_id { + my ($self, $ids) = @_; + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'batch'); +} + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : version number (as a string) + Throws : "version does not exist" exception + +=cut + +sub get_Seq_by_version { + my ($self,$seqid) = @_; + my $seqio = $self->get_Stream_by_acc([$seqid]); + $self->throw("version does not exist") if( !defined $seqio ); + return $seqio->next_seq(); +} + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + if( defined $value ) { + $value = lc $value; + $self->{'_format'} = $value; + return ($value, $value); + } + $value = $self->{'_format'}; + if( $value and defined $self->formatmap->{$value} ) { + return ($value, $self->formatmap->{$value}); + } else { + # Try to fall back to a default. + return ($self->default_format, $self->default_format ); + } +} + + +=head2 servertype + + Title : servertype + Usage : my $servertype = $self->servertype + $self->servertype($servertype); + Function: Get/Set server type + Returns : string + Args : server type string [optional] + +=cut + +sub servertype { + my ($self, $servertype) = @_; + if( defined $servertype && $servertype ne '') { + $self->throw("You gave an invalid server type ($servertype)". + " - available types are ". + keys %{$self->hosts}) unless( $self->hosts->{$servertype} ); + $self->{'_servertype'} = $servertype; + } + $self->{'_servertype'} = $DEFAULTSERVERTYPE unless $self->{'_servertype'}; + return $self->{'_servertype'}; +} + +=head2 hostlocation + + Title : hostlocation + Usage : my $location = $self->hostlocation() + $self->hostlocation($location) + Function: Set/Get Hostlocation + Returns : string representing hostlocation + Args : string specifying hostlocation [optional] + +=cut + +sub hostlocation { + my ($self, $location ) = @_; + $location = lc $location; + my $servertype = $self->servertype; + $self->throw("Must have a valid servertype defined not $servertype") + unless defined $servertype; + my %hosts = %{$self->hosts->{$servertype}->{'hosts'}}; + if( defined $location && $location ne '' ) { + if( ! $hosts{$location} ) { + $self->throw("Must specify a known host, not $location,". + " possible values (". + join(",", sort keys %hosts ). ")"); + } + $self->{'_hostlocation'} = $location; + } + $self->{'_hostlocation'} = $DEFAULTLOCATION unless $self->{'_hostlocation'}; + return $self->{'_hostlocation'}; +} + +=head2 location_url + + Title : location + Usage : my $url = $self->location_url() + Function: Get host url + Returns : string representing url + Args : none + +=cut + +sub location_url { + my ($self) = @_; + my $servertype = $self->servertype(); + my $location = $self->hostlocation(); + if( ! defined $location || !defined $servertype ) { + $self->throw("must have a valid hostlocation and servertype set before calling location_url"); + } + return sprintf($self->hosts->{$servertype}->{'baseurl'}, + $self->hosts->{$servertype}->{'hosts'}->{$location}); +} + +=head1 Bio::DB::DBFetch routines + +These methods allow subclasses to pass parameters. + +=head2 hosts + + Title : hosts + Usage : + Function: get/set for host hash + Returns : + Args : optional hash + +=cut + +sub hosts { + my ($self, $value) = @_; + if (defined $value) { + $self->{'_hosts'} = $value; + } + unless (exists $self->{'_hosts'}) { + return (''); + } else { + return $self->{'_hosts'}; + } +} + +=head2 formatmap + + Title : formatmap + Usage : + Function: get/set for format hash + Returns : + Args : optional hash + +=cut + +sub formatmap { + my ($self, $value) = @_; + if (defined $value) { + $self->{'_formatmap'} = $value; + } + unless (exists $self->{'_formatmap'}) { + return (''); + } else { + return $self->{'_formatmap'}; + } +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/EMBL.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/EMBL.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,209 @@ +# +# $Id: EMBL.pm,v 1.12.2.1 2003/06/25 13:44:18 heikki Exp $ +# +# BioPerl module for Bio::DB::EMBL +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::EMBL - Database object interface for EMBL entry retrieval + +=head1 SYNOPSIS + + use Bio::DB::EMBL; + + $embl = new Bio::DB::EMBL; + + # remember that EMBL_ID does not equal GenBank_ID! + $seq = $embl->get_Seq_by_id('BUM'); # EMBL ID + print "cloneid is ", $seq->id, "\n"; + + # or changeing to accession number and Fasta format ... + $embl->request_format('fasta'); + $seq = $embl->get_Seq_by_acc('J02231'); # EMBL ACC + print "cloneid is ", $seq->id, "\n"; + + # especially when using versions, you better be prepared + # in not getting what what want + eval { + $seq = $embl->get_Seq_by_version('J02231.1'); # EMBL VERSION + }; + print "cloneid is ", $seq->id, "\n" unless $@; + + # or ... best when downloading very large files, prevents + # keeping all of the file in memory + + # also don't want features, just sequence so let's save bandwith + # and request Fasta sequence + $embl = new Bio::DB::EMBL(-retrievaltype => 'tempfile' , + -format => 'fasta'); + my $seqio = $embl->get_Stream_by_batch(['AC013798', 'AC021953'] ); + while( my $clone = $seqio->next_seq ) { + print "cloneid is ", $clone->id, "\n"; + } + +=head1 DESCRIPTION + +Allows the dynamic retrieval of sequence objects L from the +EMBL database using the dbfetch script at EBI: +L. + +In order to make changes transparent we have host type (currently only +ebi) and location (defaults to ebi) separated out. This allows later +additions of more servers in different geographical locations. + +The functionality of this module is inherited from L +which implements L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email Heikki Lehvaslaiho EHeikki@ebi.ac.ukE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::EMBL; +use strict; +use vars qw(@ISA $MODVERSION %HOSTS %FORMATMAP $DEFAULTFORMAT); + +$MODVERSION = '0.2'; +use Bio::DB::DBFetch; +use Bio::DB::RefSeq; + +@ISA = qw(Bio::DB::DBFetch); + +BEGIN { + # you can add your own here theoretically. + %HOSTS = ( + 'dbfetch' => { + baseurl => 'http://%s/cgi-bin/dbfetch?db=embl&style=raw', + hosts => { + 'ebi' => 'www.ebi.ac.uk' + } + } + ); + %FORMATMAP = ( 'embl' => 'embl', + 'fasta' => 'fasta' + ); + $DEFAULTFORMAT = 'embl'; +} + +=head2 new + + Title : new + Usage : $gb = Bio::DB::GenBank->new(@options) + Function: Creates a new genbank handle + Returns : New genbank handle + Args : -delay number of seconds to delay between fetches (3s) + +NOTE: There are other options that are used internally. + +=cut + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + + $self->{ '_hosts' } = {}; + $self->{ '_formatmap' } = {}; + + $self->hosts(\%HOSTS); + $self->formatmap(\%FORMATMAP); + $self->{'_default_format'} = $DEFAULTFORMAT; + + return $self; +} + + +=head2 Bio::DB::WebDBSeqI methods + +Overriding WebDBSeqI method to help newbies to retrieve sequences. +EMBL database is all too often passed RefSeq accessions. This +redirects those calls. See L. + + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +sub get_Stream_by_acc { + my ($self, $ids ) = @_; + my $newdb = $self->_check_id($ids); + if ($newdb && $newdb->isa('Bio::DB::RefSeq')) { + return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } else { + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } +} + + +=head2 _check_id + + Title : _check_id + Usage : + Function: + Returns : A Bio::DB::RefSeq reference or throws + Args : $id(s), $string +=cut + +sub _check_id { + my ($self, $ids) = @_; + + # NT contigs can not be retrieved + $self->throw("NT_ contigs are whole chromosome files which are not part of regular". + "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") + if $ids =~ /NT_/; + + # Asking for a RefSeq from EMBL/GenBank + + if ($ids =~ /N._/) { + $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.". + " Redirecting the request.\n") + if $self->verbose >= 0; + return new Bio::DB::RefSeq(-verbose => $self->verbose); + } +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Failover.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Failover.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,162 @@ +# POD documentation - main docs before the code + +# $Id: Failover.pm,v 1.5.2.1 2003/06/25 13:44:18 heikki Exp $ + + +=head1 NAME + +Bio::DB::Failover - A Bio::DB::RandomAccessI compliant class which wraps a priority list of DBs + +=head1 SYNOPSIS + + $failover = Bio::DB::Failover->new(); + + $failover->add_database($db); + + # fail over Bio::DB::RandomAccessI.pm + + # this will check each database in priority, returning when + # the first one succeeds + + $seq = $failover->get_Seq_by_id($id); + +=head1 DESCRIPTION + +This module provides fail over access to a set of Bio::DB::RandomAccessI objects + + +=head1 CONTACT + +Ewan Birney originally wrote this class. + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::DB::Failover; + +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::DB::RandomAccessI; +@ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI ); + +sub new { + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_database'} = []; + return $self; +} + +=head2 add_database + + Title : add_database + Usage : add_database(%db) + Function: Adds a database to the + Returns : count of number of databases + Args : hash of db resource name to Bio::DB::SeqI object + +=cut + +sub add_database { + my ($self,@db) = @_; + foreach my $db ( @db ) { + if( !ref $db || !$db->isa('Bio::DB::RandomAccessI') ) { + $self->throw("Database objects $db is a not a Bio::DB::RandomAccessI"); + next; + } + + push(@{$self->{'_database'}},$db); + } +} + + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +sub get_Seq_by_id { + my ($self,$id) = @_; + + if( !defined $id ) { + $self->throw("no id is given!"); + } + + foreach my $db ( @{$self->{'_database'}} ) { + my $seq; + + eval { + $seq = $db->get_Seq_by_id($id); + }; + if( defined $seq ) { + return $seq; + } + } + + return undef; +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +sub get_Seq_by_acc { + my ($self,$id) = @_; + + if( !defined $id ) { + $self->throw("no id is given!"); + } + + foreach my $db ( @{$self->{'_database'}} ) { + my $seq; + eval { + $seq = $db->get_Seq_by_acc($id); + }; + if( defined $seq ) { + return $seq; + } + } + return undef; +} + + +## End of Package + +1; + +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1074 @@ +=head1 NAME + +Bio::DB::Fasta -- Fast indexed access to a directory of fasta files + +=head1 SYNOPSIS + + use Bio::DB::Fasta; + + # create database from directory of fasta files + my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); + + # simple access (for those without Bioperl) + my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000); + my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000); + my @ids = $db->ids; + my $length = $db->length('CHROMOSOME_I'); + my $alphabet = $db->alphabet('CHROMOSOME_I'); + my $header = $db->header('CHROMOSOME_I'); + + # Bioperl-style access + my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); + + my $obj = $db->get_Seq_by_id('CHROMOSOME_I'); + my $seq = $obj->seq; + my $subseq = $obj->subseq(4_000_000 => 4_100_000); + my $length = $obj->length; + # (etc) + + # Bio::SeqIO-style access + my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream; + while (my $seq = $stream->next_seq) { + # Bio::PrimarySeqI stuff + } + + my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); + while (my $seq = <$fh>) { + # Bio::PrimarySeqI stuff + } + + # tied hash access + tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files'; + print $sequences{'CHROMOSOME_I:1,20000'}; + +=head1 DESCRIPTION + +Bio::DB::Fasta provides indexed access to one or more Fasta files. It +provides random access to each sequence entry, and to subsequences +within each entry, allowing you to retrieve portions of very large +sequences without bringing the entire sequence into memory. + +When you initialize the module, you point it at a single fasta file or +a directory of multiple such files. The first time it is run, the +module generates an index of the contents of the file or directory +using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File, +NDBM_File, and SDBM_File). Thereafter it uses the index file to find +the file and offset for any requested sequence. If one of the source +fasta files is updated, the module reindexes just that one file. (You +can also force reindexing manually). For improved performance, the +module keeps a cache of open filehandles, closing less-recently used +ones when the cache is full. + +The fasta files may contain any combination of nucleotide and protein +sequences; during indexing the module guesses the molecular type. +Entries may have any line length, and different line lengths are +allowed in the same file. However, within a sequence entry, all lines +must be the same length except for the last. + +The module uses /^E(\S+)/ to extract each sequence's primary ID from +the Fasta header. During indexing, you may pass a callback routine to +modify this primary ID. For example, you may wish to extract a +portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use. +The original header line can be recovered later. + +This module was developed for use with the C. elegans and human +genomes, and has been tested with sequence segments as large as 20 +megabases. Indexing the C. elegans genome (100 megabases of genomic +sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium +laptop. On the same system, average access time for any 200-mer within +the C. elegans genome was E0.02s. + +=head1 DATABASE CREATION AND INDEXING + +The two constructors for this class are new() and newFh(). The former +creates a Bio::DB::Fasta object which is accessed via method calls. +The latter creates a tied filehandle which can be used Bio::SeqIO +style to fetch sequence objects in a stream fashion. There is also a +tied hash interface. + +=over 4 + +=item $db = Bio::DB::Fasta-Enew($fasta_path [,%options]) + +Create a new Bio::DB::Fasta object from the Fasta file or files +indicated by $fasta_path. Indexing will be performed automatically if +needed. If successful, new() will return the database accessor +object. Otherwise it will return undef. + +$fasta_path may be an individual Fasta file, or may refer to a +directory containing one or more of such files. Following the path, +you may pass a series of name=Evalue options or a hash with these +same name=Evalue pairs. Valid options are: + + Option Name Description Default + ----------- ----------- ------- + + -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna} + for searching for Fasta + files in directories. + + -makeid A code subroutine for None + transforming Fasta IDs. + + -maxopen Maximum size of 32 + filehandle cache. + + -debug Turn on status 0 + messages. + + -reindex Force the index to be 0 + rebuilt. + + -dbmargs Additional arguments none + to pass to the DBM + routines when tied + (scalar or array ref). + +-dbmargs can be used to control the format of the index. For example, +you can pass $DB_BTREE to this argument so as to force the IDs to be +sorted and retrieved alphabetically. Note that you must use the same +arguments every time you open the index! + +-reindex can be used to force the index to be recreated from scratch. + +=item $fh = Bio::DB::Fasta-EnewFh($fasta_path [,%options]) + +Create a tied filehandle opened on a Bio::DB::Fasta object. Reading +from this filehandle with EE will return a stream of sequence objects, +Bio::SeqIO style. + +=back + +The -makeid option gives you a chance to modify sequence IDs during +indexing. The option's value should be a code reference that will +take a scalar argument and return a scalar result, like this: + + $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id); + + sub make_my_id { + my $description_line = shift; + # get a new id from the fasta header + return $new_id; + } + +make_my_id() will be called with the full fasta id line (including the +"E" symbol!). For example: + + >A12345.3 Predicted C. elegans protein egl-2 + +By default, this module will use the regular expression /^E(\S+)/ +to extract "A12345.3" for use as the ID. If you pass a -makeid +callback, you can extract any portion of this, such as the "egl-2" +symbol. + +The -makeid option is ignored after the index is constructed. + +=head1 OBJECT METHODS + +The following object methods are provided. + +=over 4 + +=item $raw_seq = $db-Eseq($id [,$start, $stop]) + +Return the raw sequence (a string) given an ID and optionally a start +and stop position in the sequence. In the case of DNA sequence, if +$stop is less than $start, then the reverse complement of the sequence +is returned (this violates Bio::Seq conventions). + +For your convenience, subsequences can be indicated with this compound +ID: + + $db->seq("$id:$start,$stop") + +=item $length = $db-Elength($id) + +Return the length of the indicated sequence. + +=item $header = $db-Eheader($id) + +Return the header line for the ID, including the initial "E". + +=item $type = $db-Ealphabet($id) + +Return the molecular type of the indicated sequence. One of "dna", +"rna" or "protein". + +=item $filename = $db-Efile($id) + +Return the name of the file in which the indicated sequence can be +found. + +=item $offset = $db-Eoffset($id) + +Return the offset of the indicated sequence from the beginning of the +file in which it is located. The offset points to the beginning of +the sequence, not the beginning of the header line. + +=item $header_length = $db-Eheaderlen($id) + +Return the length of the header line for the indicated sequence. + +=item $header_offset = $db-Eheader_offset($id) + +Return the offset of the header line for the indicated sequence from +the beginning of the file in which it is located. + +=item $index_name = $db-Eindex_name + +Return the path to the index file. + +=item $path = $db-Epath + +Return the path to the Fasta file(s). + +=back + +For BioPerl-style access, the following methods are provided: + +=over 4 + +=item $seq = $db-Eget_Seq_by_id($id) + +Return a Bio::PrimarySeq::Fasta object, which obeys the +Bio::PrimarySeqI conventions. For example, to recover the raw DNA or +protein sequence, call $seq-Eseq(). + +Note that get_Seq_by_id() does not bring the entire sequence into +memory until requested. Internally, the returned object uses the +accessor to generate subsequences as needed. + +=item $seq = $db-Eget_Seq_by_acc($id) + +=item $seq = $db-Eget_Seq_by_primary_id($id) + +These methods all do the same thing as get_Seq_by_id(). + +=item $stream = $db-Eget_PrimarySeq_stream() + +Return a Bio::DB::Fasta::Stream object, which supports a single method +next_seq(). Each call to next_seq() returns a new +Bio::PrimarySeq::Fasta object, until no more sequences remain. + +=back + +See L for methods provided by the sequence objects +returned from get_Seq_by_id() and get_PrimarySeq_stream(). + +=head1 TIED INTERFACES + +This module provides two tied interfaces, one which allows you to +treat the sequence database as a hash, and the other which allows you +to treat the database as an I/O stream. + +=head2 Creating a Tied Hash + +The tied hash interface is very straightforward + +=over 4 + +=item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args] + +Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files. +The optional @args list is the same set of named argument/value pairs +used by Bio::DB::Fasta-Enew(). + +If successful, tie() will return the tied object. Otherwise it will +return undef. + +=back + +Once tied, you can use the hash to retrieve an individual sequence by +its ID, like this: + + my $seq = $db{CHROMOSOME_I}; + +You may select a subsequence by appending the comma-separated range to +the sequence ID in the format "$id:$start,$stop". For example, here +is the first 1000 bp of the sequence with the ID "CHROMOSOME_I": + + my $seq = $db{'CHROMOSOME_I:1,1000'}; + +(The regular expression used to parse this format allows sequence IDs +to contain colons.) + +When selecting subsequences, if $start E stop, then the reverse +complement will be returned for DNA sequences. + +The keys() and values() functions will return the sequence IDs and +their sequences, respectively. In addition, each() can be used to +iterate over the entire data set: + + while (my ($id,$sequence) = each %db) { + print "$id => $sequence\n"; + } + +When dealing with very large sequences, you can avoid bringing them +into memory by calling each() in a scalar context. This returns the +key only. You can then use tied(%db) to recover the Bio::DB::Fasta +object and call its methods. + + while (my $id = each %db) { + print "$id => $db{$sequence:1,100}\n"; + print "$id => ",tied(%db)->length($id),"\n"; + } + +You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied +hash methods directly. + +=over 4 + +=item $id = $db-EFIRSTKEY + +Return the first ID in the database. + +=item $id = $db-ENEXTKEY($id) + +Given an ID, return the next ID in sequence. + +=back + +This allows you to write the following iterative loop using just the +object-oriented interface: + + my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); + for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) { + # do something with sequence + } + +=head2 Creating a Tied Filehandle + +The Bio::DB::Fasta-EnewFh() method creates a tied filehandle from +which you can read Bio::PrimarySeq::Fasta sequence objects +sequentially. The following bit of code will iterate sequentially +over all sequences in the database: + + my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); + while (my $seq = <$fh>) { + print $seq->id,' => ',$seq->length,"\n"; + } + +When no more sequences remain to be retrieved, the stream will return +undef. + +=head1 BUGS + +When a sequence is deleted from one of the Fasta files, this deletion +is not detected by the module and removed from the index. As a +result, a "ghost" entry will remain in the index and will return +garbage results if accessed. + +Currently, the only way to accomodate deletions is to rebuild the +entire index, either by deleting it manually, or by passing +-reindex=E1 to new() when initializing the module. + +=head1 SEE ALSO + +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut + +#' +package Bio::DB::Fasta; + +BEGIN { + @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File) +} + +use strict; +use IO::File; +use AnyDBM_File; +use Fcntl; +use File::Basename qw(basename dirname); +use Bio::DB::SeqI; +use Bio::Root::Root; +use vars qw($VERSION @ISA); + +@ISA = qw(Bio::DB::SeqI Bio::Root::Root); + +$VERSION = '1.03'; + +*seq = *sequence = \&subseq; +*ids = \&get_all_ids; +*get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id; + +use constant STRUCT =>'NNnnCa*'; +use constant DNA => 1; +use constant RNA => 2; +use constant PROTEIN => 3; + +# Bio::DB-like object +# providing fast random access to a directory of FASTA files + +=head2 new + + Title : new + Usage : my $db = new Bio::DB::Fasta( $path, @options); + Function: initialize a new Bio::DB::Fasta object + Returns : new Bio::DB::Fasta object + Args : path to dir of fasta files or a single filename + +These are optional arguments to pass in as well. + + -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST} + for searching for Fasta + files in directories. + + -makeid A code subroutine for None + transforming Fasta IDs. + + -maxopen Maximum size of 32 + filehandle cache. + + -debug Turn on status 0 + messages. + + -reindex Force the index to be 0 + rebuilt. + + -dbmargs Additional arguments none + to pass to the DBM + routines when tied + (scalar or array ref). + +=cut + +sub new { + my $class = shift; + my $path = shift; + my %opts = @_; + + my $self = bless { debug => $opts{-debug}, + makeid => $opts{-makeid}, + glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}', + maxopen => $opts{-maxfh} || 32, + dbmargs => $opts{-dbmargs} || undef, + fhcache => {}, + cacheseq => {}, + curopen => 0, + openseq => 1, + dirname => undef, + offsets => undef, + }, $class; + my ($offsets,$dirname); + + if (-d $path) { + $offsets = $self->index_dir($path,$opts{-reindex}); + $dirname = $path; + } elsif (-f _) { + $offsets = $self->index_file($path,$opts{-reindex}); + $dirname = dirname($path); + } else { + $self->throw( "$path: Invalid file or dirname"); + } + @{$self}{qw(dirname offsets)} = ($dirname,$offsets); + + $self; +} + +=head2 newFh + + Title : newFh + Function: gets a new Fh for a file + Example : internal method + Returns : GLOB + Args : + +=cut + +sub newFh { + my $class = shift; + my $self = $class->new(@_); + require Symbol; + my $fh = Symbol::gensym or return; + tie $$fh,'Bio::DB::Fasta::Stream',$self or return; + $fh; +} + +sub _open_index { + my $self = shift; + my ($index,$write) = @_; + my %offsets; + my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY; + my @dbmargs = $self->dbmargs; + tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!"); + return \%offsets; +} + +=head2 index_dir + + Title : index_dir + Usage : $db->index_dir($dir) + Function: set the index dir and load all files in the dir + Returns : hashref of seq offsets in each file + Args : dirname, boolean to force a reload of all files + +=cut + +sub index_dir { + my $self = shift; + my $dir = shift; + my $force_reindex = shift; + + # find all fasta files + my @files = glob("$dir/$self->{glob}"); + $self->throw( "no fasta files in $dir") unless @files; + + # get name of index + my $index = $self->index_name($dir,1); + + # if caller has requested reindexing, then unlink + # the index file. + unlink $index if $force_reindex; + + # get the modification time of the index + my $indextime = (stat($index))[9] || 0; + + # get the most recent modification time of any of the contents + my $modtime = 0; + my %modtime; + foreach (@files) { + my $m = (stat($_))[9]; + $modtime{$_} = $m; + $modtime = $m if $modtime < $m; + } + + my $reindex = $force_reindex || $indextime < $modtime; + my $offsets = $self->_open_index($index,$reindex) or return; + $self->{offsets} = $offsets; + + # no indexing needed + return $offsets unless $reindex; + + # otherwise reindex contents of changed files + $self->{indexing} = $index; + foreach (@files) { + next if( defined $indextime && $modtime{$_} <= $indextime); + $self->calculate_offsets($_,$offsets); + } + delete $self->{indexing}; + return $self->{offsets}; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : my $seq = $db->get_Seq_by_id($id) + Function: Bio::DB::RandomAccessI method implemented + Returns : Bio::PrimarySeqI object + Args : id + +=cut + +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + return Bio::PrimarySeq::Fasta->new($self,$id); +} + +=head2 index_file + + Title : index_file + Usage : $db->index_file($filename) + Function: (re)loads a sequence file and indexes sequences offsets in the file + Returns : seq offsets in the file + Args : filename, + boolean to force reloading a file + +=cut + + +sub index_file { + my $self = shift; + my $file = shift; + my $force_reindex = shift; + + my $index = $self->index_name($file); + # if caller has requested reindexing, then unlink the index + unlink $index if $force_reindex; + + # get the modification time of the index + my $indextime = (stat($index))[9]; + my $modtime = (stat($file))[9]; + + my $reindex = $force_reindex || $indextime < $modtime; + my $offsets = $self->_open_index($index,$reindex) or return; + $self->{offsets} = $offsets; + + return $self->{offsets} unless $reindex; + + $self->{indexing} = $index; + $self->calculate_offsets($file,$offsets); + delete $self->{indexing}; + return $self->{offsets}; +} + +=head2 dbmargs + + Title : dbmargs + Usage : my @args = $db->dbmargs; + Function: gets stored dbm arguments + Returns : array + Args : none + + +=cut + +sub dbmargs { + my $self = shift; + my $args = $self->{dbmargs} or return; + return ref($args) eq 'ARRAY' ? @$args : $args; +} + +=head2 index_name + + Title : index_name + Usage : my $indexname = $db->index_name($path,$isdir); + Function: returns the name of the index for a specific path + Returns : string + Args : path to check, + boolean if it is a dir + +=cut + +sub index_name { + my $self = shift; + my ($path,$isdir) = @_; + unless ($path) { + my $dir = $self->{dirname} or return; + return $self->index_name($dir,-d $dir); + } + return "$path/directory.index" if $isdir; + return "$path.index"; +} + +=head2 calculate_offsets + + Title : calculate_offsets + Usage : $db->calculate_offsets($filename,$offsets); + Function: calculates the sequence offsets in a file based on id + Returns : offset hash for each file + Args : file to process + $offsets - hashref of id to offset storage + +=cut + +sub calculate_offsets { + my $self = shift; + my ($file,$offsets) = @_; + my $base = $self->path2fileno(basename($file)); + + my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!"); + warn "indexing $file\n" if $self->{debug}; + my ($offset,$id,$linelength,$type,$firstline,$count,%offsets); + while (<$fh>) { # don't try this at home + if (/^>(\S+)/) { + print STDERR "indexed $count sequences...\n" + if $self->{debug} && (++$count%1000) == 0; + my $pos = tell($fh); + if ($id) { + my $seqlength = $pos - $offset - length($_) - 1; + $seqlength -= int($seqlength/$linelength); + $offsets->{$id} = $self->_pack($offset,$seqlength, + $linelength,$firstline, + $type,$base); + } + $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1; + ($offset,$firstline,$linelength) = ($pos,length($_),0); + } else { + $linelength ||= length($_); + $type ||= $self->_type($_); + } + } + # deal with last entry + if ($id) { + my $pos = tell($fh); + +# my $seqlength = $pos - $offset - length($_) - 1; + # $_ is always null should not be part of this calculation + my $seqlength = $pos - $offset - 1; + + if ($linelength == 0) { # yet another pesky empty chr_random.fa file + $seqlength = 0; + } else { + $seqlength -= int($seqlength/$linelength); + }; + $offsets->{$id} = $self->_pack($offset,$seqlength, + $linelength,$firstline, + $type,$base); + } + return \%offsets; +} + +=head2 get_all_ids + + Title : get_all_ids + Usage : my @ids = $db->get_all_ids + Function: gets all the stored ids in all indexes + Returns : list of ids + Args : none + +=cut + +sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} } + +sub offset { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[0]; +} + +sub length { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[1]; +} + +sub linelen { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[2]; +} + +sub headerlen { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[3]; +} + +sub alphabet { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + my $type = ($self->_unpack($offset))[4]; + return $type == DNA ? 'dna' + : $type == RNA ? 'rna' + : 'protein'; + +} + +sub path { shift->{dirname} } + +sub header_offset { + my $self = shift; + my $id = shift; + return unless $self->{offsets}{$id}; + return $self->offset($id) - $self->headerlen($id); +} + +sub file { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + $self->fileno2path(($self->_unpack($offset))[5]); +} + +sub fileno2path { + my $self = shift; + my $no = shift; + return $self->{offsets}{"__file_$no"}; +} + +sub path2fileno { + my $self = shift; + my $path = shift; + if ( !defined $self->{offsets}{"__path_$path"} ) { + my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++); + $self->{offsets}{"__file_$fileno"} = $path; + } + return $self->{offsets}{"__path_$path"} +} + +=head2 subseq + + Title : subseq + Usage : $seqdb->subseq($id,$start,$stop); + Function: returns a subseq of a sequence in the db + Returns : subsequence data + Args : id of sequence, starting point, ending point + +=cut + +sub subseq { + my ($self,$id,$start,$stop) = @_; + if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) { + ($id,$start,$stop) = ($1,$2,$3); + $start =~ s/_//g; + $stop =~ s/_//g; + } + $start ||= 1; + $stop ||= $self->length($id); + + my $reversed; + if ($start > $stop) { + ($start,$stop) = ($stop,$start); + $reversed++; + } + + my $data; + + my $fh = $self->fh($id) or return; + my $filestart = $self->caloffset($id,$start); + my $filestop = $self->caloffset($id,$stop); + + seek($fh,$filestart,0); + read($fh,$data,$filestop-$filestart+1); + $data =~ s/\n//g; + if ($reversed) { + $data = reverse $data; + $data =~ tr/gatcGATC/ctagCTAG/; + } + $data; +} + +sub fh { + my $self = shift; + my $id = shift; + my $file = $self->file($id) or return; + $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file"); +} + +sub header { + my $self = shift; + my $id = shift; + my ($offset,$seqlength,$linelength,$firstline,$type,$file) + = $self->_unpack($self->{offsets}{$id}) or return; + $offset -= $firstline; + my $data; + my $fh = $self->fh($id) or return; + seek($fh,$offset,0); + read($fh,$data,$firstline); + chomp $data; + substr($data,0,1) = ''; + $data; +} + +sub caloffset { + my $self = shift; + my $id = shift; + my $a = shift()-1; + my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id}); + $a = 0 if $a < 0; + $a = $seqlength-1 if $a >= $seqlength; + $offset + $linelength * int($a/($linelength-1)) + $a % ($linelength-1); +} + +sub fhcache { + my $self = shift; + my $path = shift; + if (!$self->{fhcache}{$path}) { + if ($self->{curopen} >= $self->{maxopen}) { + my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}}; + splice(@lru, $self->{maxopen} / 3); + $self->{curopen} -= @lru; + for (@lru) { delete $self->{fhcache}{$_} } + } + $self->{fhcache}{$path} = IO::File->new($path) or return; + $self->{curopen}++; + } + $self->{cacheseq}{$path}++; + $self->{fhcache}{$path} +} + +sub _pack { + shift; + pack STRUCT,@_; +} + +sub _unpack { + shift; + unpack STRUCT,shift; +} + +sub _type { + shift; + local $_ = shift; + return /^[gatcnGATCN*-]+$/ ? DNA + : /^[gaucnGAUCN*-]+$/ ? RNA + : PROTEIN; +} + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub get_PrimarySeq_stream { + my $self = shift; + return Bio::DB::Fasta::Stream->new($self); +} + +sub TIEHASH { + my $self = shift; + return $self->new(@_); +} + +sub FETCH { + shift->subseq(@_); +} +sub STORE { + shift->throw("Read-only database"); +} +sub DELETE { + shift->throw("Read-only database"); +} +sub CLEAR { + shift->throw("Read-only database"); +} +sub EXISTS { + defined shift->offset(@_); +} +sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); } +sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); } + +sub DESTROY { + my $self = shift; + if ($self->{indexing}) { # killed prematurely, so index file is no good! + warn "indexing was interrupted, so unlinking $self->{indexing}"; + unlink $self->{indexing}; + } +} + +#------------------------------------------------------------- +# Bio::PrimarySeqI compatibility +# +package Bio::PrimarySeq::Fasta; +use overload '""' => 'display_id'; + +use vars '@ISA'; +eval { + require Bio::PrimarySeqI; + require Bio::Root::Root; +} && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI')); + +sub new { + my $class = shift; + $class = ref($class) if ref $class; + my ($db,$id,$start,$stop) = @_; + return bless { db => $db, + id => $id, + start => $start || 1, + stop => $stop || $db->length($id) + },$class; +} + +sub seq { + my $self = shift; + return $self->{db}->seq($self->{id},$self->{start},$self->{stop}); +} + +sub subseq { + my $self = shift; + my ($start,$stop) = @_; + $self->throw("Stop cannot be smaller than start") unless $start <= $stop; + return $self->{start} <= $self->{stop} ? $self->new($self->{db}, + $self->{id}, + $self->{start}+$start-1, + $self->{start}+$stop-1) + : $self->new($self->{db}, + $self->{id}, + $self->{start}-($start-1), + $self->{start}-($stop-1) + ); + +} + +sub display_id { + my $self = shift; + return $self->{id}; +} + +sub accession_number { + my $self = shift; + return "unknown"; +} + +sub primary_id { + my $self = shift; + return overload::StrVal($self); +} + +sub can_call_new { return 0 } + +sub alphabet { + my $self = shift; + return $self->{db}->alphabet($self->{id}); +} + +sub revcom { + my $self = shift; + return $self->new(@{$self}{'db','id','stop','start'}); +} + +sub length { + my $self = shift; + return $self->{db}->length($self->{id}); +} + +sub desc { + my $self = shift; + return ''; +} + +#------------------------------------------------------------- +# stream-based access to the database +# +package Bio::DB::Fasta::Stream; +use Tie::Handle; +use vars qw(@ISA); +@ISA = qw(Tie::Handle); +eval { + require Bio::DB::SeqI; +} && (push @ISA,'Bio::DB::SeqI'); + + +sub new { + my $class = shift; + my $db = shift; + my $key = $db->FIRSTKEY; + return bless { db=>$db,key=>$key },$class; +} + +sub next_seq { + my $self = shift; + my ($key,$db) = @{$self}{'key','db'}; + my $value = $db->get_Seq_by_id($key); + $self->{key} = $db->NEXTKEY($key); + $value; +} + +sub TIEHANDLE { + my $class = shift; + my $db = shift; + return $class->new($db); +} +sub READLINE { + my $self = shift; + $self->next_seq; +} + +1; + +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/FileCache.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/FileCache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,323 @@ +# POD documentation - main docs before the code +# +# + +=head1 NAME + +Bio::DB::FileCache - In file cache for BioSeq objects + +=head1 SYNOPSIS + + + + $cachedb = Bio::DB::FileCache->new($real_db); + + # + # $real_db is a Bio::DB::RandomAccessI database + # + + $seq = $cachedb->get_Seq_by_id('ROA1_HUMAN'); + + # + # $seq is a Bio::Seq object + # + + # more control provided with named-parameter form + + $cachedb = Bio::DB::FileCache->new( -seqdb => $real_db, + -file => $path, + -keep => $flag, + ); +=head1 DESCRIPTION + +This is a disk cache system which saves the objects returned by +Bio::DB::RandomAccessI on disk. The disk cache grows without limit, +while the process is running, but is automatically unlinked at process +termination unless the -keep flag is set. + +This module requires DB_File and Storable. + +=head1 CONTACT + +Lincoln Stein + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::FileCache; + +use DB_File; +use Storable qw(freeze thaw); +use Fcntl qw(O_CREAT O_RDWR O_RDONLY); +use File::Temp 'tmpnam'; + +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root Bio::DB::SeqI); + +use Bio::DB::SeqI; +use Bio::Seq::RichSeq; +use Bio::Location::Split; +use Bio::Location::Fuzzy; +use Bio::Seq; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Annotation::Collection; + +=head2 new + + Title : new + Usage : $db = Bio::DB::FileCache->new( + -seqdb => $db, # Bio::DB::RandomAccessI database + -file => $path, # path to index file + -keep => $flag, # don't unlink index file + ) + Function: creates a new on-disk cache + Returns : a Bio::DB::RandomAccessI database + Args : as above + Throws : "Must be a randomaccess database" exception + "Could not open primary index file" exception + +If no index file is specified, will create a temporary file in your +system's temporary file directory. The name of this temporary file +can be retrieved using file_name(). + +=cut + +sub new { + my ($class,@args) = @_; + + my $self = Bio::Root::Root->new(); + bless $self,$class; + + my ($seqdb,$file_name,$keep) = $self->_rearrange([qw(SEQDB FILE KEEP)],@args); + + if( !defined $seqdb || !ref $seqdb || !$seqdb->isa('Bio::DB::RandomAccessI') ) { + $self->throw("Must be a randomaccess database not a [$seqdb]"); + } + + $self->seqdb($seqdb); + $file_name ||= tmpnam(); + $self->file_name($file_name); + $self->keep($keep); + + $self->_open_database($file_name); + return $self; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +sub get_Seq_by_id{ + my ($self,$id) = @_; + + # look in the cache first + my $obj = $self->_get('id' => $id); + return $obj if defined $obj; + + # get object from seqdb + $obj = $self->seqdb->get_Seq_by_id($id); + $self->_store('id' => $id, $obj); + + return $obj; +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +sub get_Seq_by_acc{ + my ($self,$acc) = @_; + + # look in the cache first + my $obj = $self->_get('acc' => $acc); + return $obj if defined $obj; + + # get object from seqdb + $obj = $self->seqdb->get_Seq_by_acc($acc); + $self->_store('acc' => $acc, $obj); + + return $obj; +} + +=head2 seqdb + + Title : seqdb + Usage : $seqdb = $db->seqdb([$seqdb]) + Function: gets/sets the Bio::DB::RandomAccessI database + Returns : a Bio::DB::RandomAccessI database + Args : new sequence database (optional) + Throws : nothing + +=cut + +sub seqdb { + my ($self, $seqdb) = @_; + if ($seqdb) { + $self->{'seqdb'} = $seqdb; + } else { + return $self->{'seqdb'}; + } +} + +=head2 file_name + + Title : file_name + Usage : $path = $db->file_name([$file_name]) + Function: gets/sets the name of the cache file + Returns : a path + Args : new cache file name (optional) + Throws : nothing + +It probably isn't useful to set the cache file name after you've +opened it. + +=cut + +#' + +sub file_name { + my $self = shift; + my $d = $self->{file_name}; + $self->{file_name} = shift if @_; + $d; +} + +=head2 keep + + Title : keep + Usage : $keep = $db->keep([$flag]) + Function: gets/sets the value of the "keep" flag + Returns : current value + Args : new value (optional) + Throws : nothing + +The keep flag will cause the index file to be unlinked when the +process exits. Since on some operating systems (Unix, OS/2) the +unlinking occurs during the new() call immediately after opening the +file, it probably isn't safe to change this value. + +=cut + +sub keep { + my $self = shift; + my $d = $self->{keep}; + $self->{keep} = shift if @_; + $d; +} + +=head2 db + + Title : db + Usage : $db->db + Function: returns tied hash to index database + Returns : a Berkeley DB tied hashref + Args : none + Throws : nothing + +=cut + +sub db { shift->{db} } + +=head2 flush + + Title : flush + Usage : $db->flush + Function: flushes the cache + Returns : nothing + Args : none + Throws : nothing + +=cut + +sub flush { + my $db = shift->db or return; + %{$db} = (); +} + +sub _get { + my $self = shift; + my ($type,$id) = @_; + my $serialized = $self->db->{"${type}_${id}"}; + my $obj = thaw($serialized); + $obj; +} + +sub _store { + my $self = shift; + my ($type,$id,$obj) = @_; + my $serialized = freeze($obj); + $self->db->{"${type}_${id}"} = $serialized; +} + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=cut + +sub get_Seq_by_version{ + my ($self,@args) = @_; + $self->throw("Not implemented it"); +} + +sub DESTROY { + my $self = shift; + unlink $self->file_name unless $self->keep; +} + + +sub _open_database { + my $self = shift; + my $file = shift; + my $flags = O_CREAT|O_RDWR; + my %db; + tie(%db,'DB_File',$file,$flags,0666,$DB_BTREE) + or $self->throw("Could not open primary index file"); + $self->{db} = \%db; + unlink $file unless $self->keep; +} + +## End of Package + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Flat.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Flat.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,526 @@ +# +# $Id: Flat.pm,v 1.6 2002/12/22 22:02:13 lstein Exp $ +# +# BioPerl module for Bio::DB::Flat +# +# Cared for by Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Flat - Interface for indexed flat files + +=head1 SYNOPSIS + + $db = Bio::DB::Flat->new(-directory => '/usr/share/embl', + -format => 'embl', + -write_flag => 1); + $db->build_index('/usr/share/embl/primate.embl','/usr/share/embl/protists.embl'); + $seq = $db->get_Seq_by_id('BUM'); + @sequences = $db->get_Seq_by_acc('DIV' => 'primate'); + $raw = $db->fetch_raw('BUM'); + +=head1 DESCRIPTION + +This object provides the basic mechanism to associate positions in +files with primary and secondary name spaces. Unlike +Bio::Index::Abstract (see L), this is specialized +to work with the "flat index" and BerkeleyDB indexed flat file formats +worked out at the 2002 BioHackathon. + +This object is a general front end to the underlying databases. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email - lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal +methods are usually preceded with an "_" (underscore). + +=cut + + +# Let the code begin... +package Bio::DB::Flat; + +use Bio::DB::RandomAccessI; +use Bio::Root::Root; +use Bio::Root::IO; +use vars '@ISA'; + +@ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI); + +use constant CONFIG_FILE_NAME => 'config.dat'; + +=head2 new + + Title : new + Usage : my $db = new Bio::Flat->new( + -directory => $root_directory, + -write_flag => 0, + -index => 'bdb'|'flat', + -verbose => 0, + -out => 'outputfile', + -format => 'genbank'); + Function: create a new Bio::Index::BDB object + Returns : new Bio::Index::BDB object + Args : -directory Root directory containing "config.dat" + -write_flag If true, allows reindexing. + -verbose Verbose messages + -out File to write to when write_seq invoked + Status : Public + +The root -directory indicates where the flat file indexes will be +stored. The build_index() and write_seq() methods will automatically +create a human-readable configuration file named "config.dat" in this +file. + +The -write_flag enables writing new entries into the database as well +as the creation of the indexes. By default the indexes will be opened +read only. + +-index is one of "bdb" or "flat" and indicates the type of index to +generate. "bdb" corresponds to Berkeley DB. You *must* be using +BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB extension +installed (DB_File will *not* work). + +The -out argument species the output file for writing objects created +with write_seq(). + +=cut + +sub new { + my $class = shift; + $class = ref($class) if ref($class); + my $self = $class->SUPER::new(@_); + + # first we initialize ourselves + my ($flat_directory) = @_ == 1 ? shift + : $self->_rearrange([qw(DIRECTORY)],@_); + + # set values from configuration file + $self->directory($flat_directory); + $self->_read_config() if -e $flat_directory; + + # but override with initialization values + $self->_initialize(@_); + + # now we figure out what subclass to instantiate + my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB' + :$self->indexing_scheme eq 'flat/1' ? 'Flat' + :$self->throw("unknown indexing scheme: ".$self->indexing_scheme); + my $format = $self->file_format; + my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format"; + eval "use $child_class"; + $self->throw($@) if $@; + + # rebless & reinitialize with the new class + # (this prevents subclasses from forgetting to call our own initialization) + bless $self,$child_class; + $self->_initialize(@_); + $self->_set_namespaces(@_); + + $self; +} + +sub _initialize { + my $self = shift; + + my ($flat_write_flag,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format) + = $self->_rearrange([qw(WRITE_FLAG INDEX VERBOSE OUT FORMAT)],@_); + + $self->write_flag($flat_write_flag) if defined $flat_write_flag; + + if (defined $flat_indexing) { + # very permissive + $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/; + $flat_indexing = 'flat/1' if $flat_indexing =~ /flat/; + $self->indexing_scheme($flat_indexing); + } + + $self->verbose($flat_verbose) if defined $flat_verbose; + $self->out_file($flat_outfile) if defined $flat_outfile; + $self->file_format($flat_format) if defined $flat_format; +} + +sub _set_namespaces { + my $self = shift; + + $self->primary_namespace($self->default_primary_namespace) + unless defined $self->{flat_primary_namespace}; + + $self->secondary_namespaces($self->default_secondary_namespaces) + unless defined $self->{flat_secondary_namespaces}; + + $self->file_format($self->default_file_format) + unless defined $self->{flat_format}; +} + +# accessors +sub directory { + my $self = shift; + my $d = $self->{flat_directory}; + $self->{flat_directory} = shift if @_; + $d; +} +sub write_flag { + my $self = shift; + my $d = $self->{flat_write_flag}; + $self->{flat_write_flag} = shift if @_; + $d; +} +sub verbose { + my $self = shift; + my $d = $self->{flat_verbose}; + $self->{flat_verbose} = shift if @_; + $d; +} +sub out_file { + my $self = shift; + my $d = $self->{flat_outfile}; + $self->{flat_outfile} = shift if @_; + $d; +} + +sub primary_namespace { + my $self = shift; + my $d = $self->{flat_primary_namespace}; + $self->{flat_primary_namespace} = shift if @_; + $d; +} + +# get/set secondary namespace(s) +# pass an array ref. +# get an array ref in scalar context, list in list context. +sub secondary_namespaces { + my $self = shift; + my $d = $self->{flat_secondary_namespaces}; + $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_; + return unless $d; + $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia + return wantarray ? @$d : $d; +} + +# return the file format +sub file_format { + my $self = shift; + my $d = $self->{flat_format}; + $self->{flat_format} = shift if @_; + $d; +} + +# return the indexing scheme +sub indexing_scheme { + my $self = shift; + my $d = $self->{flat_indexing}; + $self->{flat_indexing} = shift if @_; + $d; +} + +sub add_flat_file { + my $self = shift; + my ($file_path,$file_length,$nf) = @_; + + # check that file_path is absolute + File::Spec->file_name_is_absolute($file_path) + or $self->throw("the flat file path $file_path must be absolute"); + + -r $file_path or $self->throw("flat file $file_path cannot be read: $!"); + + my $current_size = -s _; + if (defined $file_length) { + $current_size == $file_length + or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size"); + } else { + $file_length = $current_size; + } + + unless (defined $nf) { + $self->{flat_file_index} = 0 unless exists $self->{flat_file_index}; + $nf = $self->{flat_file_index}++; + } + $self->{flat_flat_file_path}{$nf} = $file_path; + $self->{flat_flat_file_no}{$file_path} = $nf; + $nf; +} + +sub write_config { + my $self = shift; + $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set"); + my $path = $self->_config_path; + + open (F,">$path") or $self->throw("open error on $path: $!"); + + my $index_type = $self->indexing_scheme; + print F "index\t$index_type\n"; + + my $format = $self->file_format; + print F "format\t$format\n"; + + my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined"); + for my $nf (@filenos) { + my $path = $self->{flat_flat_file_path}{$nf}; + my $size = -s $path; + print F join("\t","fileid_$nf",$path,$size),"\n"; + } + + # write primary namespace + my $primary_ns = $self->primary_namespace + or $self->throw('cannot write config file because no primary namespace defined'); + + print F join("\t",'primary_namespace',$primary_ns),"\n"; + + # write secondary namespaces + my @secondary = $self->secondary_namespaces; + print F join("\t",'secondary_namespaces',@secondary),"\n"; + + close F or $self->throw("close error on $path: $!"); +} + +sub files { + my $self = shift; + return unless $self->{flat_flat_file_no}; + return keys %{$self->{flat_flat_file_no}}; +} + +sub write_seq { + my $self = shift; + my $seq = shift; + + $self->write_flag or $self->throw("cannot write sequences because write_flag is not set"); + + my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()'); + my $seqio = $self->{flat_cached_parsers}{$file} + ||= Bio::SeqIO->new(-Format => $self->file_format, + -file => ">$file") + or $self->throw("couldn't create Bio::SeqIO object"); + + my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object"); + my $offset = tell($fh); + $seqio->write_seq($seq); + my $length = tell($fh)-$offset; + my $ids = $self->seq_to_ids($seq); + $self->_store_index($ids,$file,$offset,$length); + + $self->{flat_outfile_dirty}++; +} + +sub close { + my $self = shift; + return unless $self->{flat_outfile_dirty}; + $self->write_config; + delete $self->{flat_outfile_dirty}; + delete $self->{flat_cached_parsers}{$self->out_file}; +} + + +sub _filenos { + my $self = shift; + return unless $self->{flat_flat_file_path}; + return keys %{$self->{flat_flat_file_path}}; +} + +# read the configuration file +sub _read_config { + my $self = shift; + my $config = shift; + + my $path = defined $config ? Bio::Root::IO->catfile($config,CONFIG_FILE_NAME) + : $self->_config_path; + return unless -e $path; + + open (F,$path) or $self->throw("open error on $path: $!"); + my %config; + while () { + chomp; + my ($tag,@values) = split "\t"; + $config{$tag} = \@values; + } + CORE::close F or $self->throw("close error on $path: $!"); + + $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~ + or $self->throw("invalid configuration file $path: no index line"); + + $self->indexing_scheme($1); + + $self->file_format($config{format}[0]) if $config{format}; + + # set up primary namespace + my $primary_namespace = $config{primary_namespace}[0] + or $self->throw("invalid configuration file $path: no primary namespace defined"); + $self->primary_namespace($primary_namespace); + + # set up secondary namespaces (may be empty) + $self->secondary_namespaces($config{secondary_namespaces}); + + # get file paths and their normalization information + my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config; + for my $nf (@normalized_files) { + my ($file_path,$file_length) = @{$config{"fileid_${nf}"}}; + $self->add_flat_file($file_path,$file_length,$nf); + } + 1; +} + + +sub _config_path { + my $self = shift; + $self->_catfile($self->_config_name); +} + +sub _catfile { + my $self = shift; + my $component = shift; + Bio::Root::IO->catfile($self->directory,$component); +} + +sub _config_name { CONFIG_FILE_NAME } + +sub _path2fileno { + my $self = shift; + my $path = shift; + return $self->add_flat_file($path) + unless exists $self->{flat_flat_file_no}{$path}; + $self->{flat_flat_file_no}{$path}; +} + +sub _fileno2path { + my $self = shift; + my $fileno = shift; + $self->{flat_flat_file_path}{$fileno}; +} + +sub _files { + my $self = shift; + my $paths = $self->{flat_flat_file_no}; + return keys %$paths; +} + +=head2 fetch + + Title : fetch + Usage : $index->fetch( $id ) + Function: Returns a Bio::Seq object from the index + Example : $seq = $index->fetch( 'dJ67B12' ) + Returns : Bio::Seq object + Args : ID + +Deprecated. Use get_Seq_by_id instead. + +=cut + +sub fetch { shift->get_Seq_by_id(@_) } + + +=head2 To Be Implemented in Subclasses + +The following methods MUST be implemented by subclasses. + +=cut + +# create real live Bio::Seq object +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + $self->throw_not_implemented; +} + + +# fetch array of Bio::Seq objects +sub get_Seq_by_acc { + my $self = shift; + return $self->get_Seq_by_id(shift) if @_ == 1; + my ($ns,$key) = @_; + + $self->throw_not_implemented; +} + +sub fetch_raw { + my ($self,$id,$namespace) = @_; + $self->throw_not_implemented; +} + +# This is the method that must be implemented in +# child classes. It is passed a filehandle which should +# point to the next record to be indexed in the file, +# and returns a two element list +# consisting of a key and an adjustment value. +# The key can be a scalar, in which case it is treated +# as the primary ID, or a hashref containing namespace=>[id] pairs, +# one of which MUST correspond to the primary namespace. +# The adjustment value is normally zero, but can be a positive or +# negative integer which will be added to the current file position +# in order to calculate the correct end of the record. +sub parse_one_record { + my $self = shift; + my $fh = shift; + $self->throw_not_implemented; + # here's what you would implement + my (%keys,$offset); + return (\%keys,$offset); +} + +sub default_file_format { + my $self = shift; + $self->throw_not_implemented; +} + +sub _store_index { + my ($ids,$file,$offset,$length) = @_; + $self->throw_not_implemented; +} + +=head2 May Be Overridden in Subclasses + +The following methods MAY be overridden by subclasses. + +=cut + +sub default_primary_namespace { + return "ACC"; +} + +sub default_secondary_namespaces { + return; +} + +sub seq_to_ids { + my $self = shift; + my $seq = shift; + my %ids; + $ids{$self->primary_namespace} = $seq->accession_number; + \%ids; +} + +sub DESTROY { + my $self = shift; + $self->close; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Flat/BDB.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Flat/BDB.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,472 @@ +# +# $Id: BDB.pm,v 1.6.2.1 2003/03/25 18:46:10 jason Exp $ +# +# BioPerl module for Bio::DB::Flat::BDB +# +# Cared for by Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file + +=head1 SYNOPSIS + +You should not be using this module directly. See Bio::DB::Flat. + +=head1 DESCRIPTION + +This object provides the basic mechanism to associate positions in +files with primary and secondary name spaces. Unlike +Bio::Index::Abstract (see L), this is specialized +to work with the BerkeleyDB-indexed "common" flat file format worked +out at the 2002 BioHackathon. + +This object is the guts to the mechanism, which will be used by the +specific objects inheriting from it. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email - lstein@cshl.org + +=head1 SEE ALSO + +L, + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal +methods are usually preceded with an "_" (underscore). + +=cut + + +# Let the code begin... + +package Bio::DB::Flat::BDB; + +use strict; +use DB_File; +use IO::File; +use Fcntl qw(O_CREAT O_RDWR O_RDONLY); +use File::Spec; +use Bio::DB::Flat; +use Bio::SeqIO; +use Bio::DB::RandomAccessI; +use Bio::Root::Root; +use Bio::Root::IO; +use vars '@ISA'; + +@ISA = qw(Bio::DB::Flat); + +sub _initialize { + my $self = shift; + my ($max_open) = $self->_rearrange(['MAXOPEN'],@_); + $self->{bdb_maxopen} = $max_open || 32; +} + +# return a filehandle seeked to the appropriate place +# this only works with the primary namespace +sub _get_stream { + my ($self,$id) = @_; + my ($filepath,$offset,$length) = $self->_lookup_primary($id) + or $self->throw("Unable to find a record for $id in the flat file index"); + my $fh = $self->_fhcache($filepath) + or $self->throw("couldn't open $filepath: $!"); + seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!"); + $fh; +} + +# return records corresponding to the indicated index +# if there are multiple hits will return a list in list context, +# otherwise will throw an exception +sub fetch_raw { + my ($self,$id,$namespace) = @_; + + # secondary lookup + if (defined $namespace && $namespace ne $self->primary_namespace) { + my @hits = $self->_lookup_secondary($namespace,$id); + $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context") + unless wantarray; + return map {$self->_read_record(@$_)} @hits; + } + + # primary lookup + my @args = $self->_lookup_primary($id) + or $self->throw("Unable to find a record for $id in the flat file index"); + return $self->_read_record(@args); +} + +# create real live Bio::Seq object +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + my $fh = eval {$self->_get_stream($id)} or return; + my $seqio = + $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format, + -fh => $fh); + return $seqio->next_seq; +} + +# fetch array of Bio::Seq objects +sub get_Seq_by_acc { + my $self = shift; + unshift @_,'ACC' if @_==1; + my ($ns,$key) = @_; + my @primary_ids = $self->expand_ids($ns => $key); + $self->throw("more than one sequences correspond to this accession") + if @primary_ids > 1 && ! wantarray; + my @rc = map {$self->get_Seq_by_id($_)} @primary_ids; + return wantarray ? @rc : $rc[0]; +} + +# fetch array of Bio::Seq objects +sub get_Seq_by_version { + my $self = shift; + unshift @_,'VERSION' if @_==1; + my ($ns,$key) = @_; + my @primary_ids = $self->expand_ids($ns => $key); + $self->throw("more than one sequences correspond to this accession") + if @primary_ids > 1 && !wantarray; + return map {$self->get_Seq_by_id($_)} @primary_ids; +} + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : $stream = get_PrimarySeq_stream + Function: Makes a Bio::DB::SeqStreamI compliant object + which provides a single method, next_primary_seq + Returns : Bio::DB::SeqStreamI + Args : none + + +=cut + +sub get_PrimarySeq_stream { + my $self = shift; + my @files = $self->files || 0; + my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format , + -files => \@files); + return $out; +} + +sub get_all_primary_ids { + my $self = shift; + my $db = $self->primary_db; + return keys %$db; +} + +=head2 get_all_primary_ids + + Title : get_all_primary_ids + Usage : @ids = $seqdb->get_all_primary_ids() + Function: gives an array of all the primary_ids of the + sequence objects in the database. + Example : + Returns : an array of strings + Args : none + +=cut + +# this will perform an ID lookup on a (possibly secondary) +# id, returning all the corresponding ids +sub expand_ids { + my $self = shift; + my ($ns,$key) = @_; + return $key unless defined $ns; + return $key if $ns eq $self->primary_namespace; + my $db = $self->secondary_db($ns) + or $self->throw("invalid secondary namespace $ns"); + my $record = $db->{$key} or return; # nothing there + return $self->unpack_secondary($record); +} + +# build index from files listed +sub build_index { + my $self = shift; + my @files = @_; + my $count = 0; + for my $file (@files) { + $file = File::Spec->rel2abs($file) + unless File::Spec->file_name_is_absolute($file); + $count += $self->_index_file($file); + } + $self->write_config; + $count; +} + +sub _index_file { + my $self = shift; + my $file = shift; + + my $fileno = $self->_path2fileno($file); + defined $fileno or $self->throw("could not create a file number for $file"); + + my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!"); + my $offset = 0; + my $count = 0; + while (!eof($fh)) { + my ($ids,$adjustment) = $self->parse_one_record($fh) or next; + $adjustment ||= 0; # prevent uninit variable warning + my $pos = tell($fh) + $adjustment; + $self->_store_index($ids,$file,$offset,$pos-$offset); + $offset = $pos; + $count++; + } + $count; +} + +=head2 To Be Implemented in Subclasses + +The following methods MUST be implemented by subclasses. + +=cut + +=head2 May Be Overridden in Subclasses + +The following methods MAY be overridden by subclasses. + +=cut + +sub default_primary_namespace { + return "ACC"; +} + +sub default_secondary_namespaces { + return; +} + +sub _read_record { + my $self = shift; + my ($filepath,$offset,$length) = @_; + my $fh = $self->_fhcache($filepath) + or $self->throw("couldn't open $filepath: $!"); + seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!"); + my $record; + read($fh,$record,$length) or $self->throw("can't read $filepath: $!"); + $record +} + +# return a list in the form ($filepath,$offset,$length) +sub _lookup_primary { + my $self = shift; + my $primary = shift; + my $db = $self->primary_db + or $self->throw("no primary namespace database is open"); + + my $record = $db->{$primary} or return; # nothing here + + my($fileid,$offset,$length) = $self->unpack_primary($record); + my $filepath = $self->_fileno2path($fileid) + or $self->throw("no file path entry for fileid $fileid"); + return ($filepath,$offset,$length); +} + +# return a list of array refs in the form [$filepath,$offset,$length] +sub _lookup_secondary { + my $self = shift; + my ($namespace,$secondary) = @_; + my @primary = $self->expand_ids($namespace=>$secondary); + return map {[$self->_lookup_primary($_)]} @primary; +} + +# store indexing information into a primary & secondary record +# $namespaces is one of: +# 1. a scalar corresponding to the primary name +# 2. a hashref corresponding to namespace=>id identifiers +# it is valid for secondary id to be an arrayref +sub _store_index { + my $self = shift; + my ($keys,$filepath,$offset,$length) = @_; + my ($primary,%secondary); + + if (ref $keys eq 'HASH') { + my %valid_secondary = map {$_=>1} $self->secondary_namespaces; + while (my($ns,$value) = each %$keys) { + if ($ns eq $self->primary_namespace) { + $primary = $value; + } else { + $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns"); + push @{$secondary{$ns}},$value; + } + } + $primary or $self->throw("no primary namespace ID provided"); + } else { + $primary = $keys; + } + + $self->throw("invalid primary ID; must be a scalar") + if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects + + $self->_store_primary($primary,$filepath,$offset,$length); + for my $ns (keys %secondary) { + my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns}; + $self->_store_secondary($ns,$_,$primary) foreach @ids; + } + + 1; +} + +# store primary index +sub _store_primary { + my $self = shift; + my ($id,$filepath,$offset,$length) = @_; + + my $db = $self->primary_db + or $self->throw("no primary namespace database is open"); + my $fileno = $self->_path2fileno($filepath); + defined $fileno or $self->throw("could not create a file number for $filepath"); + + my $record = $self->pack_primary($fileno,$offset,$length); + $db->{$id} = $record or return; # nothing here + 1; +} + +# store a primary index name under a secondary index +sub _store_secondary { + my $self = shift; + my ($secondary_ns,$secondary_id,$primary_id) = @_; + + my $db = $self->secondary_db($secondary_ns) + or $self->throw("invalid secondary namespace $secondary_ns"); + + # first get whatever secondary ids are already stored there + my @primary = $self->unpack_secondary($db->{$secondary_id}); + # uniqueify + my %unique = map {$_=>undef} @primary,$primary_id; + + my $record = $self->pack_secondary(keys %unique); + $db->{$secondary_id} = $record; +} + +# get output file handle +sub _outfh { + my $self = shift; +#### XXXXX FINISH ##### +# my $ +} + +# unpack a primary record into fileid,offset,length +sub unpack_primary { + my $self = shift; + my $index_record = shift; + return split "\t",$index_record; +} + +# unpack a secondary record into a list of primary ids +sub unpack_secondary { + my $self = shift; + my $index_record = shift or return; + return split "\t",$index_record; +} + +# pack a list of fileid,offset,length into a primary id record +sub pack_primary { + my $self = shift; + my ($fileid,$offset,$length) = @_; + return join "\t",($fileid,$offset,$length); +} + +# pack a list of primary ids into a secondary id record +sub pack_secondary { + my $self = shift; + my @secondaries = @_; + return join "\t",@secondaries; +} + +sub primary_db { + my $self = shift; + # lazy opening + $self->_open_bdb unless exists $self->{bdb_primary_db}; + return $self->{bdb_primary_db}; +} + +sub secondary_db { + my $self = shift; + my $secondary_namespace = shift + or $self->throw("usage: secondary_db(\$secondary_namespace)"); + $self->_open_bdb unless exists $self->{bdb_primary_db}; + return $self->{bdb_secondary_db}{$secondary_namespace}; +} + +sub _open_bdb { + my $self = shift; + + my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY; + + my $primary_db = {}; + tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE) + or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)"); + $self->{bdb_primary_db} = $primary_db; + + for my $secondary ($self->secondary_namespaces) { + my $secondary_db = {}; + tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE) + or $self->throw("Could not open primary index file"); + $self->{bdb_secondary_db}{$secondary} = $secondary_db; + } + + 1; +} + +sub _primary_db_name { + my $self = shift; + my $pns = $self->primary_namespace or $self->throw('no primary namespace defined'); + return "key_$pns"; +} + +sub _secondary_db_name { + my $self = shift; + my $sns = shift; + return "id_$sns"; +} + +sub _fhcache { + my $self = shift; + my $path = shift; + my $write = shift; + + if (!$self->{bdb_fhcache}{$path}) { + $self->{bdb_curopen} ||= 0; + if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) { + my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}}; + splice(@lru, $self->{bdb_maxopen} / 3); + $self->{bdb_curopen} -= @lru; + for (@lru) { delete $self->{bdb_fhcache}{$_} } + } + if ($write) { + my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>'; + $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return; + } else { + $self->{bdb_fhcache}{$path} = IO::File->new($path) or return; + } + $self->{bdb_curopen}++; + } + $self->{bdb_cacheseq}{$path}++; + $self->{bdb_fhcache}{$path} +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Flat/BDB/embl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Flat/BDB/embl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,97 @@ +# +# $Id: embl.pm,v 1.4 2002/10/22 07:38:31 lapp Exp $ +# +# BioPerl module for Bio::DB::Flat::BDB +# +# Cared for by Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Flat::BDB::embl - embl adaptor for Open-bio standard BDB-indexed flat file + +=head1 SYNOPSIS + +See Bio::DB::Flat. + +=head1 DESCRIPTION + +This module allows embl files to be stored in Berkeley DB flat files +using the Open-Bio standard BDB-indexed flat file scheme. You should +not be using this directly, but instead use it via Bio::DB::Flat. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email - lstein@cshl.org + +=head1 SEE ALSO + +L, + +=cut + +package Bio::DB::Flat::BDB::embl; + +use strict; +use Bio::DB::Flat::BDB; +use vars '@ISA'; + +@ISA = qw(Bio::DB::Flat::BDB); + +sub parse_one_record { + my $self = shift; + my $fh = shift; + my $parser = + $self->{embl_cached_parsers}{fileno($fh)} ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format); + my $seq = $parser->next_seq; + my $ids = $self->seq_to_ids($seq); + return $ids; +} + +sub seq_to_ids { + my $self = shift; + my $seq = shift; + + my $display_id = $seq->display_id; + my $accession = $seq->accession_number; + my %ids; + $ids{ID} = $display_id; + $ids{ACC} = $accession if defined $accession; + return \%ids; +} + +sub default_primary_namespace { + return "ID"; +} + +sub default_secondary_namespaces { + return qw(ACC); +} + +sub default_file_format { "embl" } + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Flat/BDB/fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Flat/BDB/fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,86 @@ +# +# $Id: fasta.pm,v 1.3 2002/10/22 07:38:31 lapp Exp $ +# +# BioPerl module for Bio::DB::Flat::BDB +# +# Cared for by Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Flat::BDB::fasta - fasta adaptor for Open-bio standard BDB-indexed flat file + +=head1 SYNOPSIS + +See Bio::DB::Flat. + +=head1 DESCRIPTION + +This module allows fasta files to be stored in Berkeley DB flat files +using the Open-Bio standard BDB-indexed flat file scheme. You should +not be using this directly, but instead use it via Bio::DB::Flat. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 SEE ALSO + +L, + +=head1 AUTHOR - Lincoln Stein + +Email - lstein@cshl.org + +=cut + +package Bio::DB::Flat::BDB::fasta; + +use strict; +use Bio::DB::Flat::BDB; +use vars '@ISA'; + +@ISA = qw(Bio::DB::Flat::BDB); + +sub parse_one_record { + my $self = shift; + my $fh = shift; + + undef $self->{fasta_stored_id} if exists $self->{fasta_stored_fh} + && $fh ne $self->{fasta_stored_fh} ; + $self->{fasta_stored_fh} = $fh; + + while (<$fh>) { # don't try this at home + if (/^>(\S+)/) { + my $id = $self->{fasta_stored_id}; + $self->{fasta_stored_id} = $1; + next unless defined $id; + return ($id,-length($_)); + } + } + # we get here at the end of the file + return $self->{fasta_stored_id}; +} + +sub default_file_format { "fasta" } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Flat/OBDAIndex.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Flat/OBDAIndex.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1692 @@ +# $Id: OBDAIndex.pm,v 1.12.2.1 2003/06/28 20:47:16 jason Exp $ +# +# BioPerl module for Bio::DB::Flat::OBDAIndex +# +# Cared for by Michele Clamp > +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Flat::OBDAIndex - Binary search indexing system for sequence files + +=head1 SYNOPSIS + +This module can be used both to index sequence files and also to retrieve +sequences from existing sequence files. + +=head2 Index creation + + my $sequencefile; # Some fasta sequence file + +Patterns have to be entered to define where the keys are to be +indexed and also where the start of each record. E.g. for fasta + + my $start_pattern = "^>"; + my $primary_pattern = "^>(\\S+)"; + + +So the start of a record is a line starting with a E and the primary +key is all characters up to the first space afterf the E + +A string also has to be entered to defined what the primary key +(primary_namespace) is called. + +The index can now be created using + + my $index = new Bio::DB::Flat::OBDAIndex( + -start_pattern => $start_pattern, + -primary_pattern => $primary_pattern, + -primary_namespace => "ACC", + ); + +To actually write it out to disk we need to enter a directory where the +indices will live, a database name and an array of sequence files to index. + + my @files = ("file1","file2","file3"); + + $index->make_index("/Users/michele/indices","mydatabase",@files); + +The index is now ready to use. For large sequence files the perl +way of indexing takes a *long* time and a *huge* amount of memory. +For indexing things like dbEST I recommend using the C indexer. + +=head2 Creating indices with secondary keys + +Sometimes just indexing files with one id per entry is not enough. For +instance you may want to retrieve sequences from swissprot using +their accessions as well as their ids. + +To be able to do this when creating your index you need to pass in +a hash of secondary_patterns which have their namespaces as the keys +to the hash. + +e.g. For Indexing something like + +ID 1433_CAEEL STANDARD; PRT; 248 AA. +AC P41932; +DT 01-NOV-1995 (Rel. 32, Created) +DT 01-NOV-1995 (Rel. 32, Last sequence update) +DT 15-DEC-1998 (Rel. 37, Last annotation update) +DE 14-3-3-LIKE PROTEIN 1. +GN FTT-1 OR M117.2. +OS Caenorhabditis elegans. +OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; +OC Rhabditidae; Peloderinae; Caenorhabditis. +OX NCBI_TaxID=6239; +RN [1] + +where we want to index the accession (P41932) as the primary key and the +id (1433_CAEEL) as the secondary id. The index is created as follows + + my %secondary_patterns; + + my $start_pattern = "^ID (\\S+)"; + my $primary_pattern = "^AC (\\S+)\;"; + + $secondary_patterns{"ID"} = "^ID (\\S+)"; + + my $index = new Bio::DB::Flat::OBDAIndex( + -start_pattern => $start_pattern, + -primary_pattern => $primary_pattern, + -primary_namespace => 'ACC', + -secondary_patterns => \%secondary_patterns); + + $index->make_index("/Users/michele/indices","mydb",($seqfile)); + +Of course having secondary indices makes indexing slower and more +of a memory hog. + + +=head2 Index reading + +To fetch sequences using an existing index first of all create your sequence +object + + my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_directory, + -dbname => 'swissprot'); + +Now you can happily fetch sequences either by the primary key or +by the secondary keys. + + my $entry = $index->get_entry_by_id('HBA_HUMAN'); + +This returns just a string containing the whole entry. This is +useful is you just want to print the sequence to screen or write it to a file. + +Other ways of getting sequences are + + my $fh = $index->get_stream_by_id('HBA_HUMAN'); + +This can then be passed to a seqio object for output or converting +into objects. + + my $seq = new Bio::SeqIO(-fh => $fh, + -format => 'fasta'); + +The last way is to retrieve a sequence directly. This is the +slowest way of extracting as the sequence objects need to be made. + + my $seq = $index->get_Seq_by_id('HBA_HUMAN'); + +To access the secondary indices the secondary namespace needs to be known +(use $index-Esecondary_namespaces) and the following call used + + my $seq = $index->get_Seq_by_secondary('ACC','Q21973'); + my $fh = $index->get_stream_by_secondary('ACC','Q21973'); + my $entry = $index->get_entry_by_secondary('ACC','Q21973'); + +=head1 DESCRIPTION + +This object allows indexing of sequence files both by a primary key +(say accession) and multiple secondary keys (say ids). This is +different from the Bio::Index::Abstract (see L) +which uses DBM files as storage. This module uses a binary search to +retrieve sequences which is more efficient for large datasets. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Michele Clamp + +Email - michele@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal +methods are usually preceded with an "_" (underscore). + +=cut + +package Bio::DB::Flat::OBDAIndex; + +use strict; +use vars qw(@ISA); + +use Fcntl qw(SEEK_END SEEK_CUR); +# rather than using tell which might be buffered +sub systell{ sysseek($_[0], 0, SEEK_CUR) } +sub syseof{ sysseek($_[0], 0, SEEK_END) } + + +use Bio::DB::RandomAccessI; +use Bio::Root::RootI; +use Bio::SeqIO; +use Bio::Seq; + +@ISA = qw(Bio::DB::RandomAccessI); + +use constant CONFIG_FILE_NAME => 'config.dat'; +use constant HEADER_SIZE => 4; + +my @formats = ['FASTA','SWISSPROT','EMBL']; + +=head2 new + + Title : new + Usage : For reading + my $index = new Bio::DB::Flat::OBDAIndex( + -index_dir => '/Users/michele/indices/', + -dbname => 'dbEST', + -format => 'fasta'); + + For writing + + my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"} + my $index = new Bio::DB::Flat::OBDAIndex( + -index_dir => '/Users/michele/indices', + -primary_pattern => "^>(\\S+)", + -secondary_patterns => \%secondary_patterns, + -primary_namespace => "ID"); + + my @files = ('file1','file2','file3'); + + $index->make_index('mydbname',@files); + + + Function: create a new Bio::DB::Flat::OBDAIndex object + Returns : new Bio::DB::Flat::OBDAIndex + Args : -index_dir Directory containing the indices + -primary_pattern Regexp defining the primary id + -secondary_patterns A hash ref containing the secondary + patterns with the namespaces as keys + -primary_namespace A string defining what the primary key + is + + Status : Public + +=cut + +sub new { + my($class, @args) = @_; + + my $self = $class->SUPER::new(@args); + + bless $self, $class; + + my ($index_dir,$dbname,$format,$primary_pattern,$primary_namespace, + $start_pattern,$secondary_patterns) = + $self->_rearrange([qw(INDEX_DIR + DBNAME + FORMAT + PRIMARY_PATTERN + PRIMARY_NAMESPACE + START_PATTERN + SECONDARY_PATTERNS)], @args); + + $self->index_directory($index_dir); + $self->database_name ($dbname); + + if ($self->index_directory && $dbname) { + + $self->read_config_file; + + my $fh = $self->primary_index_filehandle; + my $record_width = $self->read_header($fh); + + $self->record_size($record_width); + } + + + $self->format ($format); + $self->primary_pattern ($primary_pattern); + $self->primary_namespace ($primary_namespace); + $self->start_pattern ($start_pattern); + $self->secondary_patterns($secondary_patterns); + + return $self; +} + +sub new_from_registry { + my ($self,%config) = @_; + + my $dbname = $config{'dbname'}; + my $location = $config{'location'}; + + my $index = new Bio::DB::Flat::OBDAIndex(-dbname => $dbname, + -index_dir => $location, + ); +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $obj->get_Seq_by_id($newval) + Function: + Example : + Returns : value of get_Seq_by_id + Args : newvalue (optional) + +=cut + +sub get_Seq_by_id { + my ($self,$id) = @_; + + my ($fh,$length) = $self->get_stream_by_id($id); + + if (!defined($self->format)) { + $self->throw("Can't create sequence - format is not defined"); + } + + if(!$fh){ + return; + } + if (!defined($self->{_seqio})) { + + $self->{_seqio} = new Bio::SeqIO(-fh => $fh, + -format => $self->format); + } else { + + $self->{_seqio}->fh($fh); + } + + return $self->{_seqio}->next_seq; + +} + +=head2 get_entry_by_id + + Title : get_entry_by_id + Usage : $obj->get_entry_by_id($newval) + Function: + Example : + Returns : + Args : + + +=cut + +sub get_entry_by_id { + my ($self,$id) = @_; + + my ($fh,$length) = $self->get_stream_by_id($id); + + my $entry; + + sysread($fh,$entry,$length); + + return $entry; +} + + +=head2 get_stream_by_id + + Title : get_stream_by_id + Usage : $obj->get_stream_by_id($newval) + Function: + Example : + Returns : value of get_stream_by_id + Args : newvalue (optional) + + +=cut + +sub get_stream_by_id { + my ($self,$id) = @_; + + my $indexfh = $self->primary_index_filehandle; + + syseof ($indexfh); + + my $filesize = systell($indexfh); + + my $end = ($filesize-$self->{_start_pos})/$self->record_size; + + my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size); + + + my ($fileid,$pos,$length) = split(/\t/,$rest); + + #print STDERR "OBDAIndex Found id entry $newid $fileid $pos $length:$rest\n"; + + if (!$newid) { + return; + } + + my $fh = $self->get_filehandle_by_fileid($fileid); + my $file = $self->{_file}{$fileid}; + + open (IN,"<$file"); + $fh = \*IN; + + my $entry; + + sysseek($fh,$pos,0); + + return ($fh,$length); +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $obj->get_Seq_by_acc($newval) + Function: + Example : + Returns : value of get_Seq_by_acc + Args : newvalue (optional) + + +=cut + +sub get_Seq_by_acc { + my ($self,$acc) = @_; + + if ($self->primary_namespace eq "ACC") { + return $self->get_Seq_by_id($acc); + } else { + return $self->get_Seq_by_secondary("ACC",$acc); + } +} + +=head2 get_Seq_by_secondary + + Title : get_Seq_by_secondary + Usage : $obj->get_Seq_by_secondary($newval) + Function: + Example : + Returns : value of get_Seq_by_secondary + Args : newvalue (optional) + + +=cut + +sub get_Seq_by_secondary { + my ($self,$name,$id) = @_; + + my @names = $self->secondary_namespaces; + + my $found = 0; + foreach my $tmpname (@names) { + if ($name eq $tmpname) { + $found = 1; + } + } + + if ($found == 0) { + $self->throw("Secondary index for $name doesn't exist\n"); + } + + my $fh = $self->open_secondary_index($name); + + syseof ($fh); + + my $filesize = systell($fh); + + my $recsize = $self->{_secondary_record_size}{$name}; +# print "Name " . $recsize . "\n"; + + my $end = ($filesize-$self->{_start_pos})/$recsize; + +# print "End $end $filesize\n"; + + my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize); + + sysseek($fh,$pos,0); + +# print "Found new id $newid $primary_id\n"; + # We now need to shuffle up the index file to find the top secondary entry + + my $record = $newid; + + while ($record =~ /^$newid/ && $pos >= 0) { + + $record = $self->read_record($fh,$pos,$recsize); + $pos = $pos - $recsize; +# print "Up record = $record:$newid\n"; + } + + $pos += $recsize; + +# print "Top position is $pos\n"; + + # Now we have to shuffle back down again to read all the secondary entries + + my $current_id = $newid; + my %primary_id; + + $primary_id{$primary_id} = 1; + + while ($current_id eq $newid) { + $record = $self->read_record($fh,$pos,$recsize); + print "Record is :$record:\n"; + my ($secid,$primary_id) = split(/\t/,$record,2); + $current_id = $secid; + + if ($current_id eq $newid) { + $primary_id =~ s/ //g; + # print "Primary $primary_id\n"; + $primary_id{$primary_id} = 1; + + $pos = $pos + $recsize; + # print "Down record = $record\n"; + } + } + + if (!defined($newid)) { + return; + } + + my $entry; + + foreach my $id (keys %primary_id) { + $entry .= $self->get_Seq_by_id($id); + } + return $entry; + +} + +=head2 read_header + + Title : read_header + Usage : $obj->read_header($newval) + Function: + Example : + Returns : value of read_header + Args : newvalue (optional) + + +=cut + +sub read_header { + my ($self,$fh) = @_; + + my $record_width; + + sysread($fh,$record_width,HEADER_SIZE); + + $self->{_start_pos} = HEADER_SIZE; + $record_width =~ s/ //g; + $record_width = $record_width * 1; + + return $record_width; +} + +=head2 read_record + + Title : read_record + Usage : $obj->read_record($newval) + Function: + Example : + Returns : value of read_record + Args : newvalue (optional) + + +=cut + +sub read_record { + my ($self,$fh,$pos,$len) = @_; + + sysseek($fh,$pos,0); + + my $record; + + sysread($fh,$record,$len); + + return $record; + +} + + +=head2 find_entry + + Title : find_entry + Usage : $obj->find_entry($newval) + Function: + Example : + Returns : value of find_entry + Args : newvalue (optional) + + +=cut + +sub find_entry { + my ($self,$fh,$start,$end,$id,$recsize) = @_; + + my $mid = int(($end+1+$start)/2); + my $pos = ($mid-1)*$recsize + $self->{_start_pos}; + + my ($record) = $self->read_record($fh,$pos,$recsize); + my ($entryid,$rest) = split(/\t/,$record,2); + +# print "Mid $recsize $mid $pos:$entryid:$rest:$record\n"; +# print "Entry :$id:$entryid:$rest\n"; + + + my ($first,$second) = sort { $a cmp $b} ($id,$entryid); + + if ($id eq $entryid) { + + return ($id,$rest,$pos-$recsize); + + } elsif ($first eq $id) { + + if ($end-$start <= 1) { + return; + } + my $end = $mid; +# print "Moving up $entryid $id\n"; + $self->find_entry($fh,$start,$end,$id,$recsize); + + } elsif ($second eq $id ) { +# print "Moving down $entryid $id\n"; + if ($end-$start <= 1) { + return; + } + + $start = $mid; + + $self->find_entry($fh,$start,$end,$id,$recsize); + } + + } + + +=head2 make_index + + Title : make_index + Usage : $obj->make_index($newval) + Function: + Example : + Returns : value of make_index + Args : newvalue (optional) + + +=cut + +sub make_index { + my ($self,$dbname,@files) = @_;; + + my $rootdir = $self->index_directory; + + if (!defined($rootdir)) { + $self->throw("No index directory set - can't build indices"); + } + + if (! -d $rootdir) { + $self->throw("Index directory [$rootdir] is not a directory. Cant' build indices"); + } + if (!(@files)) { + $self->throw("Must enter an array of filenames to index"); + } + + if (!defined($dbname)) { + $self->throw("Must enter an index name for your files"); + } + + my $pwd = `pwd`; chomp($pwd); + + foreach my $file (@files) { + if ($file !~ /^\//) { + $file = $pwd . "/$file"; + } + if (! -e $file) { + $self->throw("Can't index file [$file] as it doesn't exist"); + } + } + + $self->database_name($dbname); + $self->make_indexdir($rootdir);; + $self->make_config_file(\@files); + + # Finally lets index + foreach my $file (@files) { + $self->_index_file($file); + } + + # And finally write out the indices + $self->write_primary_index; + $self->write_secondary_indices; +} + +=head2 _index_file + + Title : _index_file + Usage : $obj->_index_file($newval) + Function: + Example : + Returns : value of _index_file + Args : newvalue (optional) + + +=cut + +sub _index_file { + my ($self,$file) = @_; + + open(FILE,"<$file") || $self->throw("Can't open file [$file]"); + + my $recstart = 0; + my $fileid = $self->get_fileid_by_filename($file); + my $found = 0; + my $id; + my $count; + + my $primary = $self->primary_pattern; + my $start_pattern = $self->start_pattern; + + my $pos = 0; + + my $new_primary_entry; + + my $length; + #my $pos = 0; + my $fh = \*FILE; + + my $done = -1; + + my @secondary_names = $self->secondary_namespaces; + my %secondary_id; + + while (<$fh>) { + if ($_ =~ /$start_pattern/) { + if ($done == 0) { + $id = $new_primary_entry; + + my $tmplen = tell($fh) - length($_); + + $length = $tmplen - $pos; + + if (!defined($id)) { + $self->throw("No id defined for sequence"); + } + if (!defined($fileid)) { + $self->throw("No fileid defined for file $file"); + } + if (!defined($pos)) { + $self->throw("No position defined for " . $id . "\n"); + } + if (!defined($length)) { + $self->throw("No length defined for " . $id . "\n"); + } + + $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id); + + $pos = $tmplen; + + if ($count%1000 == 0) { + print STDERR "Indexed $count ids\n"; + } + + $count++; + } else { + $done = 0; + } + } + + if ($_ =~ /$primary/) { + $new_primary_entry = $1; + } + + my $secondary_patterns = $self->secondary_patterns; + + foreach my $sec (@secondary_names) { + my $pattern = $secondary_patterns->{$sec}; + + if ($_ =~ /$pattern/) { + $secondary_id{$sec} = $1; + } + } + + } + + # Remeber to add in the last one + + $id = $new_primary_entry; + + my $tmplen = tell($fh) - length($_); + + $length = $tmplen - $pos; + + if (!defined($id)) { + $self->throw("No id defined for sequence"); + } + if (!defined($fileid)) { + $self->throw("No fileid defined for file $file"); + } + if (!defined($pos)) { + $self->throw("No position defined for " . $id . "\n"); + } + if (!defined($length)) { + $self->throw("No length defined for " . $id . "\n"); + } + + $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id); + + close(FILE); +} + +=head2 write_primary_index + + Title : write_primary_index + Usage : $obj->write_primary_index($newval) + Function: + Example : + Returns : value of write_primary_index + Args : newvalue (optional) + + +=cut + +sub write_primary_index { + my ($self) = @_; + + my @ids = keys %{$self->{_id}}; + + @ids = sort {$a cmp $b} @ids; + + print STDERR "Number of ids = " . scalar(@ids) . "\n"; + + open (INDEX,">" . $self->primary_index_file) || $self->throw("Can't open primary index file [" . $self->primary_index_file . "]"); + + my $recordlength = $self->{_maxidlength} + + $self->{_maxfileidlength} + + $self->{_maxposlength} + + $self->{_maxlengthlength} + 3; + + + print INDEX sprintf("%4d",$recordlength); + + foreach my $id (@ids) { + + if (!defined($self->{_id}{$id}{_fileid})) { + $self->throw("No fileid for $id\n"); + } + if (!defined($self->{_id}{$id}{_pos})) { + $self->throw("No position for $id\n"); + } + if (!defined($self->{_id}{$id}{_length})) { + $self->throw("No length for $id"); + } + + my $record = $id . "\t" . + $self->{_id}{$id}{_fileid} . "\t" . + $self->{_id}{$id}{_pos} . "\t" . + $self->{_id}{$id}{_length}; + + print INDEX sprintf("%-${recordlength}s",$record); + + } + close(INDEX); +} + +=head2 write_secondary_indices + + Title : write_secondary_indices + Usage : $obj->write_secondary_indices($newval) + Function: + Example : + Returns : value of write_secondary_indices + Args : newvalue (optional) + + +=cut + +sub write_secondary_indices { + my ($self) = @_; + + # These are the different + my @names = keys (%{$self->{_secondary_id}}); + + + foreach my $name (@names) { + + my @seconds = keys %{$self->{_secondary_id}{$name}}; + + # First we need to loop over to get the longest record. + my $length = 0; + + foreach my $second (@seconds) { + my $tmplen = length($second) + 1; + my @prims = keys %{$self->{_secondary_id}{$name}{$second}}; + + foreach my $prim (@prims) { + my $recordlen = $tmplen + length($prim); + + if ($recordlen > $length) { + $length = $recordlen; + } + } + } + + # Now we can print the index + + my $fh = $self->new_secondary_filehandle($name); + + print $fh sprintf("%4d",$length); + @seconds = sort @seconds; + + foreach my $second (@seconds) { + + my @prims = keys %{$self->{_secondary_id}{$name}{$second}}; + my $tmp = $second; + + foreach my $prim (@prims) { + my $record = $tmp . "\t" . $prim; + if (length($record) > $length) { + $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n"); + } else { + print $fh sprintf("%-${length}s",$record); + print $fh sprintf("%-${length}s",$record); + } + } + } + + close($fh); + } +} + +=head2 new_secondary_filehandle + + Title : new_secondary_filehandle + Usage : $obj->new_secondary_filehandle($newval) + Function: + Example : + Returns : value of new_secondary_filehandle + Args : newvalue (optional) + + +=cut + +sub new_secondary_filehandle { + my ($self,$name) = @_; + + my $indexdir = $self->index_directory; + + my $secindex = $indexdir . $self->database_name . "/id_$name.index"; + + my $fh = new FileHandle(">$secindex"); + + return $fh; +} + +=head2 open_secondary_index + + Title : open_secondary_index + Usage : $obj->open_secondary_index($newval) + Function: + Example : + Returns : value of open_secondary_index + Args : newvalue (optional) + + +=cut + +sub open_secondary_index { + my ($self,$name) = @_; + + if (!defined($self->{_secondary_filehandle}{$name})) { + + my $indexdir = $self->index_directory; + my $secindex = $indexdir . $self->database_name . "/id_$name.index"; + + if (! -e $secindex) { + $self->throw("Index is not present for namespace [$name]\n"); + } + + my $newfh = new FileHandle("<$secindex"); + my $reclen = $self->read_header($newfh); + + $self->{_secondary_filehandle} {$name} = $newfh; + $self->{_secondary_record_size}{$name} = $reclen; + } + + return $self->{_secondary_filehandle}{$name}; + +} + +=head2 _add_id_position + + Title : _add_id_position + Usage : $obj->_add_id_position($newval) + Function: + Example : + Returns : value of _add_id_position + Args : newvalue (optional) + + +=cut + +sub _add_id_position { + my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_; + + if (!defined($id)) { + $self->throw("No id defined. Can't add id position"); + } + if (!defined($pos)) { +v $self->throw("No position defined. Can't add id position"); + } + if (!defined($fileid)) { + $self->throw("No fileid defined. Can't add id position"); + } + if (!defined($length) || $length <= 0) { + $self->throw("No length defined or <= 0 [$length]. Can't add id position"); + } + + $self->{_id}{$id}{_pos} = $pos; + $self->{_id}{$id}{_length} = $length; + $self->{_id}{$id}{_fileid} = $fileid; + + # Now the secondary ids + + foreach my $sec (keys (%$secondary_id)) { + my $value = $secondary_id->{$sec}; + + $self->{_secondary_id}{$sec}{$value}{$id} = 1; + } + + if (length($id) >= $self->{_maxidlength}) { + $self->{_maxidlength} = length($id); + } + + if (length($fileid) >= $self->{_maxfileidlength}) { + $self->{_maxfileidlength} = length($fileid); + } + + if (length($pos) >= $self->{_maxposlength}) { + $self->{_maxposlength} = length($pos); + } + + if (length($length) >= $self->{_maxlengthlength}) { + $self->{_maxlengthlength} = length($length); + } +} + +=head2 make_indexdir + + Title : make_indexdir + Usage : $obj->make_indexdir($newval) + Function: + Example : + Returns : value of make_indexdir + Args : newvalue (optional) + + +=cut + +sub make_indexdir { + my ($self,$rootdir) = @_; + + if (!defined($rootdir)) { + $self->throw("Must enter an index directory name for make_indexdir"); + } + if (! -e $rootdir) { + $self->throw("Root index directory [$rootdir] doesn't exist"); + } + + if (! -d $rootdir) { + $self->throw("[$rootdir] exists but is not a directory"); + } + + if ($rootdir !~ /\/$/) { + $rootdir .= "/"; + } + + my $indexdir = $rootdir . $self->database_name; + + if (! -e $indexdir) { + mkdir $indexdir,0755; + } else { + $self->throw("Index directory " . $indexdir . " already exists. Exiting\n"); + } + +} + +=head2 make_config_file + + Title : make_config_file + Usage : $obj->make_config_file($newval) + Function: + Example : + Returns : value of make_config_file + Args : newvalue (optional) + +=cut + +sub make_config_file { + my ($self,$files) = @_; + + my @files = @$files; + + my $dir = $self->index_directory; + + my $configfile = $dir . $self->database_name . "/" .CONFIG_FILE_NAME; + + open(CON,">$configfile") || $self->throw("Can't create config file [$configfile]"); + + # First line must be the type of index - in this case flat + + print CON "index\tflat/1\n"; + + # Now the fileids + + my $count = 0; + + foreach my $file (@files) { + + my $size = -s $file; + + print CON "fileid_$count\t$file\t$size\n"; + + my $fh = new FileHandle("<$file"); + $self->{_fileid}{$count} = $fh; + $self->{_file} {$count} = $file; + $self->{_dbfile}{$file} = $count; + $self->{_size}{$count} = $size; + + $count++; + } + + # Now the namespaces + + print CON "primary_namespace\t" .$self->primary_namespace. "\n"; + + # Needs fixing for the secondary stuff + + my $second_patterns = $self->secondary_patterns; + + my @second = keys %$second_patterns; + + if ((@second)) { + print CON "secondary_namespaces"; + + foreach my $second (@second) { + print CON "\t$second"; + } + print CON "\n"; + } + + # Now the config format + + if (!defined($self->format)) { + $self->throw("Format does not exist in module - can't write config file"); + } else { + print CON "format\t" . $self->format . "\n"; + } + + + close(CON); +} + +=head2 read_config_file + + Title : read_config_file + Usage : $obj->read_config_file($newval) + Function: + Example : + Returns : value of read_config_file + Args : newvalue (optional) + + +=cut + +sub read_config_file { + my ($self) = @_; + + my $dir = $self->index_directory . $self->database_name . "/";; + + if (! -d $dir) { + $self->throw("No index directory [" . $dir . "]. Can't read ". CONFIG_FILE_NAME); + } + + my $configfile = $dir . CONFIG_FILE_NAME; + + if (! -e $configfile) { + $self->throw("No config file [$configfile]. Can't read namespace"); + } + + open(CON,"<$configfile") || $self->throw("Can't open configfile [$configfile]"); + + # First line must be type + + my $line = ; chomp($line); + my $version; + + # This is hard coded as we only index flatfiles here + if ($line =~ /index\tflat\/(\d+)/) { + $version = $1; + } else { + $self->throw("First line not compatible with flat file index. Should be something like\n\nindex\tflat/1"); + } + + $self->index_type("flat"); + $self->index_version($version); + + while () { + chomp; + + # Look for fileid lines + if ($_ =~ /^fileid_(\d+)\t(\S+)\t(\d+)/) { + my $fileid = $1; + my $filename = $2; + my $filesize = $3; + + if (! -e $filename) { + $self->throw("File [$filename] does not exist!"); + } + if (-s $filename != $filesize) { + $self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize . "]"); + } + + my $fh = new FileHandle("<$filename"); + + $self->{_fileid}{$fileid} = $fh; + $self->{_file} {$fileid} = $filename; + $self->{_dbfile}{$filename} = $fileid; + $self->{_size} {$fileid} = $filesize; + + } + + # Look for namespace lines + if ($_ =~ /(.*)_namespace.*\t(\S+)/) { + if ($1 eq "primary") { + $self->primary_namespace($2); + } elsif ($1 eq "secondary") { + $self->secondary_namespaces($2); + } else { + $self->throw("Unknown namespace name in config file [$1"); + } + } + + # Look for format lines + + if ($_ =~ /format\t(\S+)/) { + + # Check the format here? + + $self->format($1); + } + } + close(CON); + + # Now check we have all that we need + + my @fileid_keys = keys (%{$self->{_fileid}}); + + if (!(@fileid_keys)) { + $self->throw("No flatfile fileid files in config - check the index has been made correctly"); + } + + if (!defined($self->primary_namespace)) { + $self->throw("No primary namespace exists"); + } + + if (! -e $self->primary_index_file) { + $self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist"); + } +} + +=head2 get_fileid_by_filename + + Title : get_fileid_by_filename + Usage : $obj->get_fileid_by_filename($newval) + Function: + Example : + Returns : value of get_fileid_by_filename + Args : newvalue (optional) + + +=cut + +sub get_fileid_by_filename { + my ($self,$file) = @_; + + if (!defined($self->{_dbfile})) { + $self->throw("No file to fileid mapping present. Has the fileid file been read?"); + } + + + return $self->{_dbfile}{$file}; +} + +=head2 get_filehandle_by_fileid + + Title : get_filehandle_by_fileid + Usage : $obj->get_filehandle_by_fileid($newval) + Function: + Example : + Returns : value of get_filehandle_by_fileid + Args : newvalue (optional) + + +=cut + +sub get_filehandle_by_fileid { + my ($self,$fileid) = @_; + + if (!defined($self->{_fileid}{$fileid})) { + $self->throw("ERROR: undefined fileid in index [$fileid]"); + } + + return $self->{_fileid}{$fileid}; +} + +=head2 primary_index_file + + Title : primary_index_file + Usage : $obj->primary_index_file($newval) + Function: + Example : + Returns : value of primary_index_file + Args : newvalue (optional) + + +=cut + +sub primary_index_file { + my ($self) = @_; + + return $self->index_directory . $self->database_name . "/key_" . $self->primary_namespace . ".key"; +} + +=head2 primary_index_filehandle + + Title : primary_index_filehandle + Usage : $obj->primary_index_filehandle($newval) + Function: + Example : + Returns : value of primary_index_filehandle + Args : newvalue (optional) + + +=cut + +sub primary_index_filehandle { + my ($self) = @_; + + if (!defined ($self->{_primary_index_handle})) { + $self->{_primary_index_handle} = new FileHandle("<" . $self->primary_index_file); + } + return $self->{_primary_index_handle}; +} + +=head2 database_name + + Title : database_name + Usage : $obj->database_name($newval) + Function: + Example : + Returns : value of database_name + Args : newvalue (optional) + + +=cut + + +sub database_name { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_database_name} = $arg; + } + return $self->{_database_name}; + +} + +=head2 format + + Title : format + Usage : $obj->format($newval) + Function: + Example : + Returns : value of format + Args : newvalue (optional) + + +=cut + +sub format{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'format'} = $value; + } + return $obj->{'format'}; + +} + +=head2 index_directory + + Title : index_directory + Usage : $obj->index_directory($newval) + Function: + Example : + Returns : value of index_directory + Args : newvalue (optional) + + +=cut + +sub index_directory { + my ($self,$arg) = @_; + + if (defined($arg)) { + if ($arg !~ /\/$/) { + $arg .= "/"; + } + $self->{_index_directory} = $arg; + } + return $self->{_index_directory}; + +} + +=head2 record_size + + Title : record_size + Usage : $obj->record_size($newval) + Function: + Example : + Returns : value of record_size + Args : newvalue (optional) + + +=cut + +sub record_size { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_record_size} = $arg; + } + return $self->{_record_size}; +} + +=head2 primary_namespace + + Title : primary_namespace + Usage : $obj->primary_namespace($newval) + Function: + Example : + Returns : value of primary_namespace + Args : newvalue (optional) + +=cut + +sub primary_namespace { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_primary_namespace} = $arg; + } + return $self->{_primary_namespace}; +} + +=head2 index_type + + Title : index_type + Usage : $obj->index_type($newval) + Function: + Example : + Returns : value of index_type + Args : newvalue (optional) + + +=cut + +sub index_type { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_index_type} = $arg; + } + return $self->{_index_type}; +} + +=head2 index_version + + Title : index_version + Usage : $obj->index_version($newval) + Function: + Example : + Returns : value of index_version + Args : newvalue (optional) + + +=cut + +sub index_version { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_index_version} = $arg; + } + return $self->{_index_version}; +} + +=head2 primary_pattern + + Title : primary_pattern + Usage : $obj->primary_pattern($newval) + Function: + Example : + Returns : value of primary_pattern + Args : newvalue (optional) + + +=cut + +sub primary_pattern{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'primary_pattern'} = $value; + } + + return $obj->{'primary_pattern'}; + +} +=head2 start_pattern + + Title : start_pattern + Usage : $obj->start_pattern($newval) + Function: + Example : + Returns : value of start_pattern + Args : newvalue (optional) + + +=cut + +sub start_pattern{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'start_pattern'} = $value; + } + return $obj->{'start_pattern'}; + +} + +=head2 secondary_patterns + + Title : secondary_patterns + Usage : $obj->secondary_patterns($newval) + Function: + Example : + Returns : value of secondary_patterns + Args : newvalue (optional) + + +=cut + +sub secondary_patterns{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'secondary_patterns'} = $value; + + my @names = keys %$value; + + foreach my $name (@names) { + $obj->secondary_namespaces($name); + } + } + return $obj->{'secondary_patterns'}; + +} + +=head2 secondary_namespaces + + Title : secondary_namespaces + Usage : $obj->secondary_namespaces($newval) + Function: + Example : + Returns : value of secondary_namespaces + Args : newvalue (optional) + + +=cut + +sub secondary_namespaces{ + my ($obj,$value) = @_; + + if (!defined($obj->{secondary_namespaces})) { + $obj->{secondary_namespaces} = []; + } + if( defined $value) { + push(@{$obj->{'secondary_namespaces'}},$value); + } + return @{$obj->{'secondary_namespaces'}}; + +} + + + +## These are indexing routines to index commonly used format - fasta +## swissprot and embl + +sub new_SWISSPROT_index { + my ($self,$index_dir,$dbname,@files) = @_; + + my %secondary_patterns; + + my $start_pattern = "^ID (\\S+)"; + my $primary_pattern = "^AC (\\S+)\\;"; + + $secondary_patterns{"ID"} = $start_pattern; + + my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir, + -format => 'swiss', + -primary_pattern => $primary_pattern, + -primary_namespace => "ACC", + -start_pattern => $start_pattern, + -secondary_patterns => \%secondary_patterns); + + $index->make_index($dbname,@files); +} + +sub new_EMBL_index { + my ($self,$index_dir,$dbname,@files) = @_; + + my %secondary_patterns; + + my $start_pattern = "^ID (\\S+)"; + my $primary_pattern = "^AC (\\S+)\\;"; + my $primary_namespace = "ACC"; + + $secondary_patterns{"ID"} = $start_pattern; + + my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir, + -format => 'embl', + -primary_pattern => $primary_pattern, + -primary_namespace => "ACC", + -start_pattern => $start_pattern, + -secondary_patterns => \%secondary_patterns); + + $index->make_index($dbname,@files); + + return $index; +} + +sub new_FASTA_index { + my ($self,$index_dir,$dbname,@files) = @_; + + my %secondary_patterns; + + my $start_pattern = "^>"; + my $primary_pattern = "^>(\\S+)"; + my $primary_namespace = "ACC"; + + $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)"; + + my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir, + -format => 'fasta', + -primary_pattern => $primary_pattern, + -primary_namespace => "ACC", + -start_pattern => $start_pattern, + -secondary_patterns => \%secondary_patterns); + + $index->make_index($dbname,@files); + + return $index; + +} + + + +1; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GDB.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GDB.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,315 @@ +# $Id: GDB.pm,v 1.12 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module for Bio::DB::GenBank +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::DB::GDB - Database object interface to GDB HTTP query + +=head1 SYNOPSIS + + $gdb = new Bio::DB::GDB; + + $info = $gdb->get_info(-type=>'marker', + -id=>'D1S243'); # Marker name + + print "genbank id is ", $info->{'gdbid'}, + "\nprimers are (fwd, rev) ", join(",", @{$info->{'primers'}}), + "\nproduct length is ", $info->{'length'}, "\n"; + +=head1 DESCRIPTION + +This class allows connections to the Genome Database (GDB) and queries +to retrieve any database objects. See http://www.gdb.org/ or any +mirror for details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::GDB; +use strict; +use Bio::Root::Root; +use LWP::UserAgent; +use HTTP::Request::Common; +use HTML::Parser; + +use vars qw(@ISA $BASEADDRESS %PARAMSTRING $MODVERSION); + +@ISA = qw(Bio::Root::Root); + +$MODVERSION = '0.01'; +$BASEADDRESS = 'http://www.gdb.org/gdb-bin/genera/genera/hgd/GenomicSegment'; +%PARAMSTRING = ( + gene => { '!action' => 'query' }, + marker => { '!action' => 'query' }, + ); + +# the new way to make modules a little more lightweight +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my $ua = new LWP::UserAgent; + $ua->agent(ref($self) ."/$MODVERSION"); + $self->ua($ua); + + return $self; +} + +=head2 ua + + Title : ua + Usage : my $ua = $self->ua or + $self->ua($ua) + Function: Get/Set a LWP::UserAgent for use + Returns : reference to LWP::UserAgent Object + Args : $ua - must be a LWP::UserAgent + +=cut + +sub ua { + my ($self, $ua) = @_; + if( defined $ua && $ua->isa("LWP::UserAgent") ) { + $self->{_ua} = $ua; + } + return $self->{_ua}; +} + +# helper method to get specific options + +=head2 get_params + + Title : get_params + Usage : my %params = $self->get_params($mode) + Function: Returns key,value pairs to be passed to query + for mode ('marker', 'gene') + Returns : a key,value pair hash + Args : 'marker' or 'gene' mode for retrieval + +=cut + +sub get_params { + my ($self, $mode) = @_; + return %{$PARAMSTRING{$mode}}; +} + +=head2 get_info + + Title : get_info + Usage : my $info = $self->get_info(-type => 'marker', + -id => 'D1S234'); + Function: Returns key,value pairs specific + Returns : a key,value pair hash + Args : -type => 'marker' or 'gene' mode for retrieval + -id => unique id to query for + +=cut + +sub get_info { + my ($self, @args) = @_; + my ( $type, $id) = $self->_rearrange([qw(TYPE ID)], @args); + if( !defined $type ) { + $self->throw("Must specify a type you are querying for"); + } elsif( !defined $id ) { + $self->throw("Must specify a id to query for"); + } + my %params = $self->get_params($type); + + $params{'displayName'} = $id; + + if( $type eq 'marker' ) { + # do more specific stuff? + } elsif( $type eq 'gene' ) { + # do more specific stuff? + } + my $url = $self->get_request(%params); + + my ($resp) = $self->_request($url); + if( ! defined $resp || ! ref($resp) ) { + $self->warn("Did not get any data for url ". $url->uri); + return undef; + } + my $content = $resp->content; + if( $content =~ /ERROR/ || length($resp->content) == 0 ) { + $self->warn("Error getting for url " . $url->uri . "!\n"); + return undef; + } + my (@primers, $length, $markerurl, $realname); + my $state = 0; + my $title = 0; + my $p; + $p = new HTML::Parser( api_version => 3, + start_h => [ sub { + return if( $title == 2 || $state == 3); + my($tag,$attr,$text) = @_; + return if( !defined $tag); + if( $tag eq 'table' ) { + $state = 1; + } elsif( $tag eq 'title' ) { + $title = 1; + } elsif( $state == 2 && + $tag eq 'a' && + $attr->{'href'} ) { + $state = 3; + if( $text =~ m(href="?(http://.+)"?\s*>) ) { + $markerurl = $1; + } + } + }, "tagname, attr, text" ], + end_h => [ sub { + return if ($title == 2 || $state == 3); + my ( $tag ) = @_; + $title = 0 if( $tag eq 'title' ); + }, "tagname" ], + text_h => [ sub { + return if( $title == 2 || $state == 3); + my($text) = @_; + if( $title && $text =~ /Amplimer/ ) { + $markerurl = 'this'; + $title = 2; + } + $state = 2 if( $state == 1 && $text =~ /Amplimer/); + }, "text" ], + marked_sections =>1); + $p->parse($content) or die "Can't open: $!"; + if( ! defined $markerurl ) { + @primers = ('notfound','notfound', '?'); + } elsif( $markerurl eq 'this' ) { + + } + else { + my $resp = $self->_request(GET $markerurl); + return undef if ( !defined $resp ); + $content = $resp->content(); + } + $state = 0; + $realname = 'unknown'; + my $lasttag = ''; + $p = HTML::Parser->new(api_version => 3, + start_h => [ sub { my ($tag) = @_; + $tag = lc $tag; + $lasttag = $tag; + if( $state == 3 && $tag eq 'dd' ) { + $state = 4; + } + } , 'tagname'], + text_h => [ sub { + my($text) = @_; + if( $text =~ /Primer Sequence/ ) { + $state =1; + } elsif( $state == 1 ) { + foreach my $l ( split(/\n+/,$text) ) { + $l =~ s/\s+(\S+)/$1/; + my ($name,$primer) = split(/\s+/,$l); + next if( !defined $name); + push @primers, $primer; + $state = 2; + } + } elsif( $state == 2 && + ($text =~ /Seq Min Len/i || + $text =~ /Seq Max Len/i) ) { + $state = 3; + } elsif ( $state == 4 ) { + my ($len) = ( $text =~ /(\d+\.\d+)/ +); + $length = $len; + $length *= 1000 if( $len < 1 ); + $state = 0; + } elsif( $lasttag eq 'dd' && + $text =~ /(GDB:\d+)/i ) { + $realname = $1; + } + } , "text" ], + marked_sections =>1, + ); + $p->parse($content) || die "Can't open: $!"; + + return { 'gdbid' => $realname, 'length' => $length, 'primers' => \@primers }; +} + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: HTTP::Request + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self, %params) = @_; + if( ! %params ) { + $self->throw("must provide parameters with which to query"); + } + my $url = $BASEADDRESS; + my $querystr = '?' . join("&", map { "$_=$params{$_}" } keys %params); + return GET $url . $querystr; +} + +# private methods +sub _request { + + my ($self, $url,$tmpfile) = @_; + my ($resp); + if( defined $tmpfile && $tmpfile ne '' ) { + $resp = $self->ua->request($url, $tmpfile); + } else { $resp = $self->ua->request($url); } + + if( $resp->is_error ) { + $self->warn($resp->as_string() . "\nError getting for url " . + $url->uri . "!\n"); + return undef; + } + return $resp; +} + +sub _gdb_search_tag_start { + +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,3379 @@ +# $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $ + +=head1 NAME + +Bio::DB::GFF -- Storage and retrieval of sequence annotation data + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt', + -dsn => 'dbi:mysql:elegans', + -fasta => '/usr/local/fasta_files' + ); + + # fetch a 1 megabase segment of sequence starting at landmark "ZK909" + my $segment = $db->segment('ZK909', 1 => 1000000); + + # pull out all transcript features + my @transcripts = $segment->features('transcript'); + + # for each transcript, total the length of the introns + my %totals; + for my $t (@transcripts) { + my @introns = $t->Intron; + $totals{$t->name} += $_->length foreach @introns; + } + + # Sort the exons of the first transcript by position + my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon; + + # Get a region 1000 bp upstream of first exon + my $upstream = $exons[0]->segment(-1000,0); + + # get its DNA + my $dna = $upstream->dna; + + # and get all curated polymorphisms inside it + @polymorphisms = $upstream->contained_features('polymorphism:curated'); + + # get all feature types in the database + my @types = $db->types; + + # count all feature types in the segment + my %type_counts = $segment->types(-enumerate=>1); + + # get an iterator on all curated features of type 'exon' or 'intron' + my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']); + + while (my $s = $iterator->next_seq) { + print $s,"\n"; + } + + # find all transcripts annotated as having function 'kinase' + my $iterator = $db->get_seq_stream(-type=>'transcript', + -attributes=>{Function=>'kinase'}); + while (my $s = $iterator->next_seq) { + print $s,"\n"; + } + +=head1 DESCRIPTION + +Bio::DB::GFF provides fast indexed access to a sequence annotation +database. It supports multiple database types (ACeDB, relational), +and multiple schemas through a system of adaptors and aggregators. + +The following operations are supported by this module: + + - retrieving a segment of sequence based on the ID of a landmark + - retrieving the DNA from that segment + - finding all annotations that overlap with the segment + - finding all annotations that are completely contained within the + segment + - retrieving all annotations of a particular type, either within a + segment, or globally + - conversion from absolute to relative coordinates and back again, + using any arbitrary landmark for the relative coordinates + - using a sequence segment to create new segments based on relative + offsets + +The data model used by Bio::DB::GFF is compatible with the GFF flat +file format (http://www.sanger.ac.uk/software/GFF). The module can +load a set of GFF files into the database, and serves objects that +have methods corresponding to GFF fields. + +The objects returned by Bio::DB::GFF are compatible with the +SeqFeatureI interface, allowing their use by the Bio::Graphics and +Bio::DAS modules. + +=head2 Auxiliary Scripts + +The bioperl distribution includes several scripts that make it easier +to work with Bio::DB::GFF databases. They are located in the scripts +directory under a subdirectory named Bio::DB::GFF: + +=over 4 + +=item bp_load_gff.pl + +This script will load a Bio::DB::GFF database from a flat GFF file of +sequence annotations. Only the relational database version of +Bio::DB::GFF is supported. It can be used to create the database from +scratch, as well as to incrementally load new data. + +This script takes a --fasta argument to load raw DNA into the database +as well. However, GFF databases do not require access to the raw DNA +for most of their functionality. + +load_gff.pl also has a --upgrade option, which will perform a +non-destructive upgrade of older schemas to newer ones. + +=item bp_bulk_load_gff.pl + +This script will populate a Bio::DB::GFF database from a flat GFF file +of sequence annotations. Only the MySQL database version of +Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in +order to accelerate loading considerably; however, it can only be used +for the initial load, and not for updates. + +This script takes a --fasta argument to load raw DNA into the database +as well. However, GFF databases do not require access to the raw DNA +for most of their functionality. + +=item bp_fast_load_gff.pl + +This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe +tricks to allow for incremental updates. It only supports the MySQL +database version of Bio::DB::GFF and is guaranteed not to work on +non-Unix platforms. + +Arguments are the same as bp_load_gff.pl + +=item gadfly_to_gff.pl + +This script will convert the GFF-like format used by the Berkeley +Drosophila Sequencing project into a format suitable for use with this +module. + +=item sgd_to_gff.pl + +This script will convert the tab-delimited feature files used by the +Saccharomyces Genome Database into a format suitable for use with this +module. + +=back + +=head2 GFF Fundamentals + +The GFF format is a flat tab-delimited file, each line of which +corresponds to an annotation, or feature. Each line has nine columns +and looks like this: + + Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7" + +The 9 columns are as follows: + +=over 4 + +=item 1. reference sequence + +This is the ID of the sequence that is used to establish the +coordinate system of the annotation. In the example above, the +reference sequence is "Chr1". + +=item 2. source + +The source of the annotation. This field describes how the annotation +was derived. In the example above, the source is "curated" to +indicate that the feature is the result of human curation. The names +and versions of software programs are often used for the source field, +as in "tRNAScan-SE/1.2". + +=item 3. method + +The annotation method. This field describes the type of the +annotation, such as "CDS". Together the method and source describe +the annotation type. + +=item 4. start position + +The start of the annotation relative to the reference sequence. + +=item 5. stop position + +The stop of the annotation relative to the reference sequence. Start +is always less than or equal to stop. + +=item 6. score + +For annotations that are associated with a numeric score (for example, +a sequence similarity), this field describes the score. The score +units are completely unspecified, but for sequence similarities, it is +typically percent identity. Annotations that don't have a score can +use "." + +=item 7. strand + +For those annotations which are strand-specific, this field is the +strand on which the annotation resides. It is "+" for the forward +strand, "-" for the reverse strand, or "." for annotations that are +not stranded. + +=item 8. phase + +For annotations that are linked to proteins, this field describes the +phase of the annotation on the codons. It is a number from 0 to 2, or +"." for features that have no phase\. + +=item 9. group + +GFF provides a simple way of generating annotation hierarchies ("is +composed of" relationships) by providing a group field. The group +field contains the class and ID of an annotation which is the logical +parent of the current one. In the example given above, the group is +the Transcript named "R119.7". + +The group field is also used to store information about the target of +sequence similarity hits, and miscellaneous notes. See the next +section for a description of how to describe similarity targets. + +The format of the group fields is "Class ID" with a single space (not +a tab) separating the class from the ID. It is VERY IMPORTANT to +follow this format, or grouping will not work properly. + +=back + +The sequences used to establish the coordinate system for annotations +can correspond to sequenced clones, clone fragments, contigs or +super-contigs. Thus, this module can be used throughout the lifecycle +of a sequencing project. + +In addition to a group ID, the GFF format allows annotations to have a +group class. For example, in the ACeDB representation, RNA +interference experiments have a class of "RNAi" and an ID that is +unique among the RNAi experiments. Since not all databases support +this notion, the class is optional in all calls to this module, and +defaults to "Sequence" when not provided. + +Double-quotes are sometimes used in GFF files around components of the +group field. Strictly, this is only necessary if the group name or +class contains whitespace. + +=head2 Making GFF files work with this module + +Some annotations do not need to be individually named. For example, +it is probably not useful to assign a unique name to each ALU repeat +in a vertebrate genome. Others, such as predicted genes, correspond +to named biological objects; you probably want to be able to fetch the +positions of these objects by referring to them by name. + +To accomodate named annotations, the GFF format places the object +class and name in the group field. The name identifies the object, +and the class prevents similarly-named objects, for example clones and +sequences, from collding. + +A named object is shown in the following excerpt from a GFF file: + + Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2 + +This object is a predicted transcript named Y95BA.2. In this case, +the group field is used to identify the class and name of the object, +even though no other annotation belongs to that group. + +It now becomes possible to retrieve the region of the genome covered +by transcript Y95B8A.2 using the segment() method: + + $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2'); + +It is not necessary for the annotation's method to correspond to the +object class, although this is commonly the case. + +As explained above, each annotation in a GFF file refers to a +reference sequence. It is important that each reference sequence also +be identified by a line in the GFF file. This allows the Bio::DB::GFF +module to determine the length and class of the reference sequence, +and makes it possible to do relative arithmetic. + +For example, if "Chr1" is used as a reference sequence, then it should +have an entry in the GFF file similar to this one: + + Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1 + +This indicates that the reference sequence named "Chr1" has length +14972282 bp, method "chromosome" and source "assembly". In addition, +as indicated by the group field, Chr1 has class "Sequence" and name +"Chr1". + +The object class "Sequence" is used by default when the class is not +specified in the segment() call. This allows you to use a shortcut +form of the segment() method: + + $segment = $db->segment('Chr1'); # whole chromosome + $segment = $db->segment('Chr1',1=>1000); # first 1000 bp + +For your convenience, if, during loading a GFF file, Bio::DB::GFF +encounters a line like the following: + + ##sequence-region Chr1 1 14972282 + +It will automatically generate the following entry: + + Chr1 reference Component 1 14972282 . + . Sequence Chr1 + +This is sufficient to use Chr1 as a reference point. +The ##sequence-region line is frequently found in the GFF files +distributed by annotation groups. + +=head2 Sequence alignments + +There are two cases in which an annotation indicates the relationship +between two sequences. The first case is a similarity hit, where the +annotation indicates an alignment. The second case is a map assembly, +in which the annotation indicates that a portion of a larger sequence +is built up from one or more smaller ones. + +Both cases are indicated by using the B tag in the group +field. For example, a typical similarity hit will look like this: + + Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544 + +The group field contains the Target tag, followed by an identifier for +the biological object referred to. The GFF format uses the notation +I:I for the biological object, and even though this is +stylistically inconsistent, that's the way it's done. The object +identifier is followed by two integers indicating the start and stop +of the alignment on the target sequence. + +Unlike the main start and stop columns, it is possible for the target +start to be greater than the target end. The previous example +indicates that the the section of Chr1 from 76,953 to 77,108 aligns to +the protein SW:ABL_DROME starting at position 493 and extending to +position 544. + +A similar notation is used for sequence assembly information as shown +in this example: + + Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826 + LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742 + +This indicates that the region between bases 10922906 and 11177731 of +Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region +of LINK_H0601 between 32386 and 64122 is, in turn, composed of the +bases 5 to 31742 of cosmid F49B2. + +=head2 Attributes + +While not intended to serve as a general-purpose sequence database +(see bioperl-db for that), GFF allows you to tag features with +arbitrary attributes. Attributes appear in the Group field following +the initial class/name pair. For example: + + Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3 + +This line tags the feature named Transcript Y95B8A.2 as being "Gene" +named sma-3 and having the Alias "sma3". Features having these +attributes can be looked up using the fetch_feature_by_attribute() method. + +Two attributes have special meaning: "Note" is for backward +compatibility and is used for unstructured text remarks. "Alias" is +considered as a synonym for the feature name and will be consulted +when looking up a feature by its name. + +=head2 Adaptors and Aggregators + +This module uses a system of adaptors and aggregators in order to make +it adaptable to use with a variety of databases. + +=over 4 + +=item Adaptors + +The core of the module handles the user API, annotation coordinate +arithmetic, and other common issues. The details of fetching +information from databases is handled by an adaptor, which is +specified during Bio::DB::GFF construction. The adaptor encapsulates +database-specific information such as the schema, user authentication +and access methods. + +Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'. +The former is an interface to a simple Mysql schema. The latter is an +optimized version of dbi::mysql which uses a binning scheme to +accelerate range queries and the Bio::DB::Fasta module for rapid +retrieval of sequences. Note the double-colon between the words. + +=item Aggregators + +The GFF format uses a "group" field to indicate aggregation properties +of individual features. For example, a set of exons and introns may +share a common transcript group, and multiple transcripts may share +the same gene group. + +Aggregators are small modules that use the group information to +rebuild the hierarchy. When a Bio::DB::GFF object is created, you +indicate that it use a set of one or more aggregators. Each +aggregator provides a new composite annotation type. Before the +database query is generated each aggregator is called to +"disaggregate" its annotation type into list of component types +contained in the database. After the query is generated, each +aggregator is called again in order to build composite annotations +from the returned components. + +For example, during disaggregation, the standard +"processed_transcript" aggregator generates a list of component +feature types including "UTR", "CDS", and "polyA_site". Later, it +aggregates these features into a set of annotations of type +"processed_transcript". + +During aggregation, the list of aggregators is called in reverse +order. This allows aggregators to collaborate to create multi-level +structures: the transcript aggregator assembles transcripts from +introns and exons; the gene aggregator then assembles genes from sets +of transcripts. + +Three default aggregators are provided: + + transcript assembles transcripts from features of type + exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA + clone assembles clones from Clone_left_end, Clone_right_end + and Sequence features. + alignment assembles gapped alignments from features of type + "similarity". + +In addition, this module provides the optional "wormbase_gene" +aggregator, which accomodates the WormBase representation of genes. +This aggregator aggregates features of method "exon", "CDS", "5'UTR", +"3'UTR", "polyA" and "TSS" into a single object. It also expects to +find a single feature of type "Sequence" that spans the entire gene. + +The existing aggregators are easily customized. + +Note that aggregation will not occur unless you specifically request +the aggregation type. For example, this call: + + @features = $segment->features('alignment'); + +will generate an array of aggregated alignment features. However, +this call: + + @features = $segment->features(); + +will return a list of unaggregated similarity segments. + +For more informnation, see the manual pages for +Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone, +etc. + +=back + +=head1 API + +The following is the API for Bio::DB::GFF. + +=cut + +package Bio::DB::GFF; + +use strict; + +use Bio::DB::GFF::Util::Rearrange; +use Bio::DB::GFF::RelSegment; +use Bio::DB::GFF::Feature; +use Bio::DB::GFF::Aggregator; +use Bio::DasI; +use Bio::Root::Root; + +use vars qw(@ISA $VERSION); +@ISA = qw(Bio::Root::Root Bio::DasI); + +$VERSION = '1.2003'; +my %valid_range_types = (overlaps => 1, + contains => 1, + contained_in => 1); + +=head1 Querying GFF Databases + +=head2 new + + Title : new + Usage : my $db = new Bio::DB::GFF(@args); + Function: create a new Bio::DB::GFF object + Returns : new Bio::DB::GFF object + Args : lists of adaptors and aggregators + Status : Public + +These are the arguments: + + -adaptor Name of the adaptor module to use. If none + provided, defaults to "dbi::mysqlopt". + + -aggregator Array reference to a list of aggregators + to apply to the database. If none provided, + defaults to ['processed_transcript','alignment']. + + Any other named argument pairs are passed to + the adaptor for processing. + +The adaptor argument must correspond to a module contained within the +Bio::DB::GFF::Adaptor namespace. For example, the +Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying +'dbi::mysql'. By Perl convention, the adaptors names are lower case +because they are loaded at run time. + +The aggregator array may contain a list of aggregator names, a list of +initialized aggregator objects, or a string in the form +"aggregator_name{subpart1,subpart2,subpart3/main_method}" (the +/main_method part is optional). For example, if you wish to change +the components aggregated by the transcript aggregator, you could pass +it to the GFF constructor this way: + + my $transcript = + Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr + polyA spliced_leader)]); + + my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment], + -adaptor => 'dbi::mysql', + -dsn => 'dbi:mysql:elegans42'); + +Alternatively, you could create an entirely new transcript aggregator +this way: + + my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}'; + my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment], + -adaptor => 'dbi::mysql', + -dsn => 'dbi:mysql:elegans42'); + +See L for more details. + +The commonly used 'dbi::mysql' adaptor recognizes the following +adaptor-specific arguments: + + Argument Description + -------- ----------- + + -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' + If a partial name is given, such as "ens0040", the + "dbi:mysql:" prefix will be added automatically. + + -user username for authentication + + -pass the password for authentication + + -refclass landmark Class; defaults to "Sequence" + +The commonly used 'dbi::mysqlopt' adaptor also recogizes the following +arguments. + + Argument Description + -------- ----------- + + -fasta path to a directory containing FASTA files for the DNA + contained in this database (e.g. "/usr/local/share/fasta") + + -acedb an acedb URL to use when converting features into ACEDB + objects (e.g. sace://localhost:2005) + +=cut + +#' + +sub new { + my $package = shift; + my ($adaptor,$aggregators,$args,$refclass); + + if (@_ == 1) { # special case, default to dbi::mysqlopt + $adaptor = 'dbi::mysqlopt'; + $args = {DSN => shift}; + } else { + ($adaptor,$aggregators,$refclass,$args) = rearrange([ + [qw(ADAPTOR FACTORY)], + [qw(AGGREGATOR AGGREGATORS)], + 'REFCLASS', + ],@_); + } + + $adaptor ||= 'dbi::mysqlopt'; + my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E"; + eval "require $class" unless $class->can('new'); + $package->throw("Unable to load $adaptor adaptor: $@") if $@; + + my $self = $class->new($args); + $self->default_class($refclass) if defined $refclass; + + # handle the aggregators. + # aggregators are responsible for creating complex multi-part features + # from the GFF "group" field. If none are provided, then we provide a + # list of the two used in WormBase. + # Each aggregator can be a scalar or a ref. In the former case + # it is treated as a class name to call new() on. In the latter + # the aggreator is treated as a ready made object. + $aggregators = $self->default_aggregators unless defined $aggregators; + my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators; + for my $a (@a) { + $self->add_aggregator($a); + } + + # default settings go here..... + $self->automerge(1); # set automerge to true + + $self; +} + + +=head2 types + + Title : types + Usage : $db->types(@args) + Function: return list of feature types in range or database + Returns : a list of Bio::DB::GFF::Typename objects + Args : see below + Status : public + +This routine returns a list of feature types known to the database. +The list can be database-wide or restricted to a region. It is also +possible to find out how many times each feature occurs. + +For range queries, it is usually more convenient to create a +Bio::DB::GFF::Segment object, and then invoke it's types() method. + +Arguments are as follows: + + -ref ID of reference sequence + -class class of reference sequence + -start start of segment + -stop stop of segment + -enumerate if true, count the features + +The returned value will be a list of Bio::DB::GFF::Typename objects, +which if evaluated in a string context will return the feature type in +"method:source" format. This object class also has method() and +source() methods for retrieving the like-named fields. + +If -enumerate is true, then the function returns a hash (not a hash +reference) in which the keys are type names in "method:source" format +and the values are the number of times each feature appears in the +database or segment. + +The argument -end is a synonum for -stop, and -count is a synonym for +-enumerate. + +=cut + +sub types { + my $self = shift; + my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([ + [qw(REF REFSEQ)], + qw(START), + [qw(STOP END)], + [qw(ENUMERATE COUNT)], + [qw(CLASS SEQCLASS)], + [qw(TYPE TYPES)], + ],@_); + $types = $self->parse_types($types) if defined $types; + $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types); +} + +=head2 classes + + Title : classes + Usage : $db->classes + Function: return list of landmark classes in database + Returns : a list of classes + Args : none + Status : public + +This routine returns the list of reference classes known to the +database, or empty if classes are not used by the database. Classes +are distinct from types, being essentially qualifiers on the reference +namespaces. + +=cut + +sub classes { + my $self = shift; + return (); +} + +=head2 segment + + Title : segment + Usage : $db->segment(@args); + Function: create a segment object + Returns : segment object(s) + Args : numerous, see below + Status : public + +This method generates a segment object, which is a Perl object +subclassed from Bio::DB::GFF::Segment. The segment can be used to +find overlapping features and the raw DNA. + +When making the segment() call, you specify the ID of a sequence +landmark (e.g. an accession number, a clone or contig), and a +positional range relative to the landmark. If no range is specified, +then the entire extent of the landmark is used to generate the +segment. + +You may also provide the ID of a "reference" sequence, which will set +the coordinate system and orientation used for all features contained +within the segment. The reference sequence can be changed later. If +no reference sequence is provided, then the coordinate system is based +on the landmark. + +Arguments: + + -name ID of the landmark sequence. + + -class Database object class for the landmark sequence. + "Sequence" assumed if not specified. This is + irrelevant for databases which do not recognize + object classes. + + -start Start of the segment relative to landmark. Positions + follow standard 1-based sequence rules. If not specified, + defaults to the beginning of the landmark. + + -end Stop of the segment relative to the landmark. If not specified, + defaults to the end of the landmark. + + -stop Same as -end. + + -offset For those who prefer 0-based indexing, the offset specifies the + position of the new segment relative to the start of the landmark. + + -length For those who prefer 0-based indexing, the length specifies the + length of the new segment. + + -refseq Specifies the ID of the reference landmark used to establish the + coordinate system for the newly-created segment. + + -refclass Specifies the class of the reference landmark, for those databases + that distinguish different object classes. Defaults to "Sequence". + + -absolute + Return features in absolute coordinates rather than relative to the + parent segment. + + -nocheck Don't check the database for the coordinates and length of this + feature. Construct a segment using the indicated name as the + reference, a start coordinate of 1, an undefined end coordinate, + and a strand of +1. + + -force Same as -nocheck. + + -seq,-sequence,-sourceseq Aliases for -name. + + -begin,-end Aliases for -start and -stop + + -off,-len Aliases for -offset and -length + + -seqclass Alias for -class + +Here's an example to explain how this works: + + my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql'); + +If successful, $db will now hold the database accessor object. We now +try to fetch the fragment of sequence whose ID is A0000182 and class +is "Accession." + + my $segment = $db->segment(-name=>'A0000182',-class=>'Accession'); + +If successful, $segment now holds the entire segment corresponding to +this accession number. By default, the sequence is used as its own +reference sequence, so its first base will be 1 and its last base will +be the length of the accession. + +Assuming that this sequence belongs to a longer stretch of DNA, say a +contig, we can fetch this information like so: + + my $sourceseq = $segment->sourceseq; + +and find the start and stop on the source like this: + + my $start = $segment->abs_start; + my $stop = $segment->abs_stop; + +If we had another segment, say $s2, which is on the same contiguous +piece of DNA, we can pass that to the refseq() method in order to +establish it as the coordinate reference point: + + $segment->refseq($s2); + +Now calling start() will return the start of the segment relative to +the beginning of $s2, accounting for differences in strandedness: + + my $rel_start = $segment->start; + +IMPORTANT NOTE: This method can be used to return the segment spanned +by an arbitrary named annotation. However, if the annotation appears +at multiple locations on the genome, for example an EST that maps to +multiple locations, then, provided that all locations reside on the +same physical segment, the method will return a segment that spans the +minimum and maximum positions. If the reference sequence occupies +ranges on different physical segments, then it returns them all in an +array context, and raises a "multiple segment exception" exception in +a scalar context. + +=cut + +#' + +sub segment { + my $self = shift; + my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self, + $self->setup_segment_args(@_)); + foreach (@segments) { + $_->absolute(1) if $self->absolute; + } + + $self->_multiple_return_args(@segments); +} + +sub _multiple_return_args { + my $self = shift; + my @args = @_; + if (@args == 0) { + return; + } elsif (@args == 1) { + return $args[0]; + } elsif (wantarray) { # more than one reference sequence + return @args; + } else { + $self->error($args[0]->name, + " has more than one reference sequence in database. Please call in a list context to retrieve them all."); + $self->throw('multiple segment exception'); + return; + } + +} + +# backward compatibility -- don't use! +# (deliberately undocumented too) +sub abs_segment { + my $self = shift; + return $self->segment($self->setup_segment_args(@_),-absolute=>1); +} + +sub setup_segment_args { + my $self = shift; + return @_ if defined $_[0] && $_[0] =~ /^-/; + return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3; + return (-class=>$_[0],-name=>$_[1]) if @_ == 2; + return (-name=>$_[0]) if @_ == 1; +} + +=head2 features + + Title : features + Usage : $db->features(@args) + Function: get all features, possibly filtered by type + Returns : a list of Bio::DB::GFF::Feature objects + Args : see below + Status : public + +This routine will retrieve features in the database regardless of +position. It can be used to return all features, or a subset based on +their method and source. + +Arguments are as follows: + + -types List of feature types to return. Argument is an array + reference containing strings of the format "method:source" + + -merge Whether to apply aggregators to the generated features. + + -rare Turn on optimizations suitable for a relatively rare feature type, + where it makes more sense to filter by feature type first, + and then by position. + + -attributes A hash reference containing attributes to match. + + -iterator Whether to return an iterator across the features. + + -binsize A true value will create a set of artificial features whose + start and stop positions indicate bins of the given size, and + whose scores are the number of features in the bin. The + class and method of the feature will be set to "bin", + its source to "method:source", and its group to "bin:method:source". + This is a handy way of generating histograms of feature density. + +If -iterator is true, then the method returns a single scalar value +consisting of a Bio::SeqIO object. You can call next_seq() repeatedly +on this object to fetch each of the features in turn. If iterator is +false or absent, then all the features are returned as a list. + +Currently aggregation is disabled when iterating over a series of +features. + +Types are indicated using the nomenclature "method:source". Either of +these fields can be omitted, in which case a wildcard is used for the +missing field. Type names without the colon (e.g. "exon") are +interpreted as the method name and a source wild card. Regular +expressions are allowed in either field, as in: "similarity:BLAST.*". + +The -attributes argument is a hashref containing one or more attributes +to match against: + + -attributes => { Gene => 'abc-1', + Note => 'confirmed' } + +Attribute matching is simple string matching, and multiple attributes +are ANDed together. + +=cut + +sub features { + my $self = shift; + my ($types,$automerge,$sparse,$iterator,$other); + if (defined $_[0] && + $_[0] =~ /^-/) { + ($types,$automerge,$sparse,$iterator,$other) = rearrange([ + [qw(TYPE TYPES)], + [qw(MERGE AUTOMERGE)], + [qw(RARE SPARSE)], + 'ITERATOR' + ],@_); + } else { + $types = \@_; + } + + # for whole database retrievals, we probably don't want to automerge! + $automerge = $self->automerge unless defined $automerge; + $other ||= {}; + $self->_features({ + rangetype => 'contains', + types => $types, + }, + { sparse => $sparse, + automerge => $automerge, + iterator =>$iterator, + %$other, + } + ); +} + +=head2 get_seq_stream + + Title : get_seq_stream + Usage : my $seqio = $self->get_seq_sream(@args) + Function: Performs a query and returns an iterator over it + Returns : a Bio::SeqIO stream capable of producing sequence + Args : As in features() + Status : public + +This routine takes the same arguments as features(), but returns a +Bio::SeqIO::Stream-compliant object. Use it like this: + + $stream = $db->get_seq_stream('exon'); + while (my $exon = $stream->next_seq) { + print $exon,"\n"; + } + +NOTE: This is also called get_feature_stream(), since that's what it +really does. + +=cut + +sub get_seq_stream { + my $self = shift; + my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1) + : (-types=>\@_,-iterator=>1); + $self->features(@args); +} + +*get_feature_stream = \&get_seq_stream; + +=head2 get_feature_by_name + + Title : get_feature_by_name + Usage : $db->get_feature_by_name($class => $name) + Function: fetch features by their name + Returns : a list of Bio::DB::GFF::Feature objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a named feature from the database. +GFF annotations are named using the group class and name fields, so +for features that belong to a group of size one, this method can be +used to retrieve that group (and is equivalent to the segment() +method). Any Alias attributes are also searched for matching names. + +An alternative syntax allows you to search for features by name within +a circumscribed region: + + @f = $db->get_feature_by_name(-class => $class,-name=>$name, + -ref => $sequence_name, + -start => $start, + -end => $end); + +This method may return zero, one, or several Bio::DB::GFF::Feature +objects. + +Aggregation is performed on features as usual. + +NOTE: At various times, this function was called fetch_group(), +fetch_feature(), fetch_feature_by_name() and segments(). These names +are preserved for backward compatibility. + +=cut + +sub get_feature_by_name { + my $self = shift; + my ($gclass,$gname,$automerge,$ref,$start,$end); + if (@_ == 1) { + $gclass = $self->default_class; + $gname = shift; + } else { + ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE', + ['REF','REFSEQ'], + 'START',['STOP','END'] + ],@_); + $gclass ||= $self->default_class; + } + $automerge = $self->automerge unless defined $automerge; + + # we need to refactor this... It's repeated code (see below)... + my @aggregators; + if ($automerge) { + for my $a ($self->aggregators) { + push @aggregators,$a if $a->disaggregate([],$self); + } + } + + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + my $location = [$ref,$start,$end] if defined $ref; + $self->_feature_by_name($gclass,$gname,$location,$callback); + + warn "aggregating...\n" if $self->debug; + foreach my $a (@aggregators) { # last aggregator gets first shot + $a->aggregate($features,$self) or next; + } + + @$features; +} + +# horrible indecision regarding proper names! +*fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name; +*segments = \&segment; + +=head2 get_feature_by_target + + Title : get_feature_by_target + Usage : $db->get_feature_by_target($class => $name) + Function: fetch features by their similarity target + Returns : a list of Bio::DB::GFF::Feature objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a named feature from the database +based on its similarity hit. + +=cut + +sub get_feature_by_target { + shift->get_feature_by_name(@_); +} + +=head2 get_feature_by_attribute + + Title : get_feature_by_attribute + Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) + Function: fetch segments by combinations of attribute values + Returns : a list of Bio::DB::GFF::Feature objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a set of features from the database. +Attributes are a list of name=Evalue pairs. They will be logically +ANDED together. + +=cut + +sub get_feature_by_attribute { + my $self = shift; + my %attributes = ref($_[0]) ? %{$_[0]} : @_; + + # we need to refactor this... It's repeated code (see above)... + my @aggregators; + if ($self->automerge) { + for my $a ($self->aggregators) { + unshift @aggregators,$a if $a->disaggregate([],$self); + } + } + + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + $self->_feature_by_attribute(\%attributes,$callback); + + warn "aggregating...\n" if $self->debug; + foreach my $a (@aggregators) { # last aggregator gets first shot + $a->aggregate($features,$self) or next; + } + + @$features; +} + +# more indecision... +*fetch_feature_by_attribute = \&get_feature_by_attribute; + +=head2 get_feature_by_id + + Title : get_feature_by_id + Usage : $db->get_feature_by_id($id) + Function: fetch segments by feature ID + Returns : a Bio::DB::GFF::Feature object + Args : the feature ID + Status : public + +This method can be used to fetch a feature from the database using its +ID. Not all GFF databases support IDs, so be careful with this. + +=cut + +sub get_feature_by_id { + my $self = shift; + my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + $self->_feature_by_id($id,'feature',$callback); + return wantarray ? @$features : $features->[0]; +} +*fetch_feature_by_id = \&get_feature_by_id; + +=head2 get_feature_by_gid + + Title : get_feature_by_gid + Usage : $db->get_feature_by_gid($id) + Function: fetch segments by feature ID + Returns : a Bio::DB::GFF::Feature object + Args : the feature ID + Status : public + +This method can be used to fetch a feature from the database using its +group ID. Not all GFF databases support IDs, so be careful with this. + +The group ID is often more interesting than the feature ID, since +groups can be complex objects containing subobjects. + +=cut + +sub get_feature_by_gid { + my $self = shift; + my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + $self->_feature_by_id($id,'group',$callback); + return wantarray ? @$features : $features->[0]; +} +*fetch_feature_by_gid = \&get_feature_by_gid; + +=head2 delete_features + + Title : delete_features + Usage : $db->delete_features(@ids_or_features) + Function: delete one or more features + Returns : count of features deleted + Args : list of features or feature ids + Status : public + +Pass this method a list of numeric feature ids or a set of features. +It will attempt to remove the features from the database and return a +count of the features removed. + +NOTE: This method is also called delete_feature(). Also see +delete_groups(). + +=cut + +*delete_feature = \&delete_features; + +sub delete_features { + my $self = shift; + my @features_or_ids = @_; + my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; + return unless @ids; + $self->_delete_features(@ids); +} + +=head2 delete_groups + + Title : delete_groups + Usage : $db->delete_groups(@ids_or_features) + Function: delete one or more feature groups + Returns : count of features deleted + Args : list of features or feature group ids + Status : public + +Pass this method a list of numeric group ids or a set of features. It +will attempt to recursively remove the features and ALL members of +their group from the database. It returns a count of the number of +features (not groups) returned. + +NOTE: This method is also called delete_group(). Also see +delete_features(). + +=cut + +*delete_group = \&delete_groupss; + +sub delete_groups { + my $self = shift; + my @features_or_ids = @_; + my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids; + return unless @ids; + $self->_delete_groups(@ids); +} + +=head2 delete + + Title : delete + Usage : $db->delete(@args) + Function: delete features + Returns : count of features deleted -- if available + Args : numerous, see below + Status : public + +This method deletes all features that overlap the specified region or +are of a particular type. If no arguments are provided and the -force +argument is true, then deletes ALL features. + +Arguments: + + -name ID of the landmark sequence. + + -ref ID of the landmark sequence (synonym for -name). + + -class Database object class for the landmark sequence. + "Sequence" assumed if not specified. This is + irrelevant for databases which do not recognize + object classes. + + -start Start of the segment relative to landmark. Positions + follow standard 1-based sequence rules. If not specified, + defaults to the beginning of the landmark. + + -end Stop of the segment relative to the landmark. If not specified, + defaults to the end of the landmark. + + -offset Zero-based addressing + + -length Length of region + + -type,-types Either a single scalar type to be deleted, or an + reference to an array of types. + + -force Force operation to be performed even if it would delete + entire feature table. + + -range_type Control the range type of the deletion. One of "overlaps" (default) + "contains" or "contained_in" + +Examples: + + $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats + $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000 + $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3 + +The short form of this call, as described in segment() is also allowed: + + $db->delete("chr3",1=>1000); + $db->delete("chr3"); + +IMPORTANT NOTE: This method only deletes features. It does *NOT* +delete the names of groups that contain the deleted features. Group +IDs will be reused if you later load a feature with the same group +name as one that was previously deleted. + +NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the +result code from the SQL DELETE operation. Some dbd drivers return the +count of rows deleted, while others return 0E0. Caveat emptor. + +=cut + +sub delete { + my $self = shift; + my @args = $self->setup_segment_args(@_); + my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) = + rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET', + 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args); + $offset = 0 unless defined $offset; + $start = $offset+1 unless defined $start; + $end = $start+$length-1 if !defined $end and $length; + $class ||= $self->default_class; + + my $types = $self->parse_types($type); # parse out list of types + + $range_type ||= 'overlaps'; + $self->throw("range type must be one of {". + join(',',keys %valid_range_types). + "}\n") + unless $valid_range_types{lc $range_type}; + + + my @segments; + if (defined $name && $name ne '') { + my @args = (-name=>$name,-class=>$class); + push @args,(-start=>$start) if defined $start; + push @args,(-end =>$end) if defined $end; + @segments = $self->segment(@args); + return unless @segments; + } + $self->_delete({segments => \@segments, + types => $types, + range_type => $range_type, + force => $force} + ); +} + +=head2 absolute + + Title : absolute + Usage : $abs = $db->absolute([$abs]); + Function: gets/sets absolute mode + Returns : current setting of absolute mode boolean + Args : new setting for absolute mode boolean + Status : public + +$db-Eabsolute(1) will turn on absolute mode for the entire database. +All segments retrieved will use absolute coordinates by default, +rather than relative coordinates. You can still set them to use +relative coordinates by calling $segment-Eabsolute(0). + +Note that this is not the same as calling abs_segment(); it continues +to allow you to look up groups that are not used directly as reference +sequences. + +=cut + +sub absolute { + my $self = shift; + my $d = $self->{absolute}; + $self->{absolute} = shift if @_; + $d; +} + +=head2 strict_bounds_checking + + Title : strict_bounds_checking + Usage : $flag = $db->strict_bounds_checking([$flag]) + Function: gets/sets strict bounds checking + Returns : current setting of bounds checking flag + Args : new setting for bounds checking flag + Status : public + +This flag enables extra checks for segment requests that go beyond the +ends of their reference sequences. If bounds checking is enabled, +then retrieved segments will be truncated to their physical length, +and their truncated() methods will return true. + +If the flag is off (the default), then the module will return segments +that appear to extend beyond their physical boundaries. Requests for +features beyond the end of the segment will, however, return empty. + +=cut + +sub strict_bounds_checking { + my $self = shift; + my $d = $self->{strict}; + $self->{strict} = shift if @_; + $d; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +NOTE: Bio::DB::RandomAccessI compliant method + +=cut + +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + my $stream = $self->get_Stream_by_id($id); + return $stream->next_seq; +} + + +=head2 get_Seq_by_accession + + Title : get_Seq_by_accession + Usage : $seq = $db->get_Seq_by_accession('AL12234') + Function: Gets a Bio::Seq object by its accession + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +NOTE: Bio::DB::RandomAccessI compliant method + +=cut + +sub get_Seq_by_accession { + my $self = shift; + my $id = shift; + my $stream = $self->get_Stream_by_accession($id); + return $stream->next_seq; +} + +=head2 get_Stream_by_acc () + +=cut + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + +NOTE: Bio::DB::RandomAccessI compliant method + +=cut + +sub get_Stream_by_name { + my $self = shift; + my @ids = @_; + my $id = ref($ids[0]) ? $ids[0] : \@ids; + Bio::DB::GFF::ID_Iterator->new($self,$id,'name'); +} + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $seq = $db->get_Stream_by_id(@ids); + Function: Retrieves a stream of Seq objects given their ids + Returns : a Bio::SeqIO stream object + Args : an array of unique ids/accession numbers, or + an array reference + +NOTE: This is also called get_Stream_by_batch() + +=cut + +sub get_Stream_by_id { + my $self = shift; + my @ids = @_; + my $id = ref($ids[0]) ? $ids[0] : \@ids; + Bio::DB::GFF::ID_Iterator->new($self,$id,'feature'); +} + +=head2 get_Stream_by_batch () + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch(@ids); + Function: Retrieves a stream of Seq objects given their ids + Returns : a Bio::SeqIO stream object + Args : an array of unique ids/accession numbers, or + an array reference + +NOTE: This is the same as get_Stream_by_id(). + +=cut + +*get_Stream_by_batch = \&get_Stream_by_id; + + +=head2 get_Stream_by_group () + +Bioperl compatibility. + +=cut + +sub get_Stream_by_group { + my $self = shift; + my @ids = @_; + my $id = ref($ids[0]) ? $ids[0] : \@ids; + Bio::DB::GFF::ID_Iterator->new($self,$id,'group'); +} + +=head2 all_seqfeatures + + Title : all_seqfeatures + Usage : @features = $db->all_seqfeatures(@args) + Function: fetch all the features in the database + Returns : an array of features, or an iterator + Args : See below + Status : public + +This is equivalent to calling $db-Efeatures() without any types, and +will return all the features in the database. The -merge and +-iterator arguments are recognized, and behave the same as described +for features(). + +=cut + +sub all_seqfeatures { + my $self = shift; + my ($automerge,$iterator)= rearrange([ + [qw(MERGE AUTOMERGE)], + 'ITERATOR' + ],@_); + my @args; + push @args,(-merge=>$automerge) if defined $automerge; + push @args,(-iterator=>$iterator) if defined $iterator; + $self->features(@args); +} + +=head1 Creating and Loading GFF Databases + +=head2 initialize + + Title : initialize + Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2); + Function: initialize a GFF database + Returns : true if initialization successful + Args : a set of named parameters + Status : Public + +This method can be used to initialize an empty database. It takes the following +named arguments: + + -erase A boolean value. If true the database will be wiped clean if it + already contains data. + +Other named arguments may be recognized by subclasses. They become database +meta values that control various settable options. + +As a shortcut (and for backward compatibility) a single true argument +is the same as initialize(-erase=E1). + +=cut + +sub initialize { + my $self = shift; + #$self->do_initialize(1) if @_ == 1 && $_[0]; + #why was this line (^) here? I can't see that it actually does anything + #one option would be to execute the line and return, but I don't know + #why you would want to do that either. + + my ($erase,$meta) = rearrange(['ERASE'],@_); + $meta ||= {}; + + # initialize (possibly erasing) + return unless $self->do_initialize($erase); + my @default = $self->default_meta_values; + + # this is an awkward way of uppercasing the + # even-numbered values (necessary for case-insensitive SQL databases) + for (my $i=0; $i<@default; $i++) { + $default[$i] = uc $default[$i] if !($i % 2); + } + + my %values = (@default,%$meta); + foreach (keys %values) { + $self->meta($_ => $values{$_}); + } + 1; +} + + +=head2 load_gff + + Title : load_gff + Usage : $db->load_gff($file|$directory|$filehandle); + Function: load GFF data into database + Returns : count of records loaded + Args : a directory, a file, a list of files, + or a filehandle + Status : Public + +This method takes a single overloaded argument, which can be any of: + +=over 4 + +=item 1. a scalar corresponding to a GFF file on the system + +A pathname to a local GFF file. Any files ending with the .gz, .Z, or +.bz2 suffixes will be transparently decompressed with the appropriate +command-line utility. + +=item 2. an array reference containing a list of GFF files on the system + +For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] + +=item 3. directory path + +The indicated directory will be searched for all files ending in the +suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. + +=item 4. filehandle + +An open filehandle from which to read the GFF data. Tied filehandles +now work as well. + +=item 5. a pipe expression + +A pipe expression will also work. For example, a GFF file on a remote +web server can be loaded with an expression like this: + + $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |"); + +=back + +If successful, the method will return the number of GFF lines +successfully loaded. + +NOTE:this method used to be called load(), but has been changed. The +old method name is also recognized. + +=cut + +sub load_gff { + my $self = shift; + my $file_or_directory = shift || '.'; + return $self->do_load_gff($file_or_directory) if ref($file_or_directory) && + tied *$file_or_directory; + + my $tied_stdin = tied(*STDIN); + open SAVEIN,"<&STDIN" unless $tied_stdin; + local @ARGV = $self->setup_argv($file_or_directory,'gff') or return; # to play tricks with reader + my $result = $self->do_load_gff('ARGV'); + open STDIN,"<&SAVEIN" unless $tied_stdin; # restore STDIN + return $result; +} + +*load = \&load_gff; + +=head2 load_fasta + + Title : load_fasta + Usage : $db->load_fasta($file|$directory|$filehandle); + Function: load FASTA data into database + Returns : count of records loaded + Args : a directory, a file, a list of files, + or a filehandle + Status : Public + +This method takes a single overloaded argument, which can be any of: + +=over 4 + +=item 1. scalar corresponding to a FASTA file on the system + +A pathname to a local FASTA file. Any files ending with the .gz, .Z, or +.bz2 suffixes will be transparently decompressed with the appropriate +command-line utility. + +=item 2. array reference containing a list of FASTA files on the +system + +For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz'] + +=item 3. path to a directory + +The indicated directory will be searched for all files ending in the +suffixes .fa, .fa.gz, .fa.Z or .fa.bz2. + +a=item 4. filehandle + +An open filehandle from which to read the FASTA data. + +=item 5. pipe expression + +A pipe expression will also work. For example, a FASTA file on a remote +web server can be loaded with an expression like this: + + $db->load_gff("lynx -dump -source http://stein.cshl.org/fasta_test.fa |"); + +=back + +=cut + +sub load_fasta { + my $self = shift; + my $file_or_directory = shift || '.'; + return $self->load_sequence($file_or_directory) if ref($file_or_directory) && + tied *$file_or_directory; + + my $tied = tied(*STDIN); + open SAVEIN,"<&STDIN" unless $tied; + local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader + my $result = $self->load_sequence('ARGV'); + open STDIN,"<&SAVEIN" unless $tied; # restore STDIN + return $result; +} + +=head2 load_sequence_string + + Title : load_sequence_string + Usage : $db->load_sequence_string($id,$dna) + Function: load a single DNA entry + Returns : true if successfully loaded + Args : a raw sequence string (DNA, RNA, protein) + Status : Public + +=cut + +sub load_sequence_string { + my $self = shift; + my ($acc,$seq) = @_; + my $offset = 0; + $self->insert_sequence_chunk($acc,\$offset,\$seq) or return; + $self->insert_sequence($acc,$offset,$seq) or return; + 1; +} + +sub setup_argv { + my $self = shift; + my $file_or_directory = shift; + my @suffixes = @_; + no strict 'refs'; # so that we can call fileno() on the argument + + my @argv; + + if (-d $file_or_directory) { + @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes; + }elsif (my $fd = fileno($file_or_directory)) { + open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN"); + @argv = '-'; + } elsif (ref $file_or_directory) { + @argv = @$file_or_directory; + } else { + @argv = $file_or_directory; + } + + foreach (@argv) { + if (/\.gz$/) { + $_ = "gunzip -c $_ |"; + } elsif (/\.Z$/) { + $_ = "uncompress -c $_ |"; + } elsif (/\.bz2$/) { + $_ = "bunzip2 -c $_ |"; + } + } + @argv; +} + +=head2 lock_on_load + + Title : lock_on_load + Usage : $lock = $db->lock_on_load([$lock]) + Function: set write locking during load + Returns : current value of lock-on-load flag + Args : new value of lock-on-load-flag + Status : Public + +This method is honored by some of the adaptors. If the value is true, +the tables used by the GFF modules will be locked for writing during +loads and inaccessible to other processes. + +=cut + +sub lock_on_load { + my $self = shift; + my $d = $self->{lock}; + $self->{lock} = shift if @_; + $d; +} + +=head2 meta + + Title : meta + Usage : $value = $db->meta($name [,$newval]) + Function: get or set a meta variable + Returns : a string + Args : meta variable name and optionally value + Status : abstract + +Get or set a named metavalues for the database. Metavalues can be +used for database-specific settings. + +By default, this method does nothing! + +=cut + +sub meta { + my $self = shift; + my ($name,$value) = @_; + return; +} + +=head2 default_meta_values + + Title : default_meta_values + Usage : %values = $db->default_meta_values + Function: empty the database + Returns : a list of tag=>value pairs + Args : none + Status : protected + +This method returns a list of tag=Evalue pairs that contain default +meta information about the database. It is invoked by initialize() to +write out the default meta values. The base class version returns an +empty list. + +For things to work properly, meta value names must be UPPERCASE. + +=cut + +sub default_meta_values { + my $self = shift; + return (); +} + + +=head2 error + + Title : error + Usage : $db->error( [$new error] ); + Function: read or set error message + Returns : error message + Args : an optional argument to set the error message + Status : Public + +This method can be used to retrieve the last error message. Errors +are not reset to empty by successful calls, so contents are only valid +immediately after an error condition has been detected. + +=cut + +sub error { + my $self = shift; + my $g = $self->{error}; + $self->{error} = join '',@_ if @_; + $g; +} + +=head2 debug + + Title : debug + Usage : $db->debug( [$flag] ); + Function: read or set debug flag + Returns : current value of debug flag + Args : new debug flag (optional) + Status : Public + +This method can be used to turn on debug messages. The exact nature +of those messages depends on the adaptor in use. + +=cut + +sub debug { + my $self = shift; + my $g = $self->{debug}; + $self->{debug} = shift if @_; + $g; +} + + +=head2 automerge + + Title : automerge + Usage : $db->automerge( [$new automerge] ); + Function: get or set automerge value + Returns : current value (boolean) + Args : an optional argument to set the automerge value + Status : Public + +By default, this module will use the aggregators to merge groups into +single composite objects. This default can be changed to false by +calling automerge(0). + +=cut + +sub automerge { + my $self = shift; + my $g = $self->{automerge}; + $self->{automerge} = shift if @_; + $g; +} + +=head2 attributes + + Title : attributes + Usage : @attributes = $db->attributes($id,$name) + Function: get the "attributres" on a particular feature + Returns : an array of string + Args : feature ID + Status : public + +Some GFF version 2 files use the groups column to store a series of +attribute/value pairs. In this interpretation of GFF, the first such +pair is treated as the primary group for the feature; subsequent pairs +are treated as attributes. Two attributes have special meaning: +"Note" is for backward compatibility and is used for unstructured text +remarks. "Alias" is considered as a synonym for the feature name. + +If no name is provided, then attributes() returns a flattened hash, of +attribute=Evalue pairs. This lets you do: + + %attributes = $db->attributes($id); + +Normally, attributes() will be called by the feature: + + @notes = $feature->attributes('Note'); + +In a scalar context, attributes() returns the first value of the +attribute if a tag is present, otherwise a hash reference in which the +keys are attribute names and the values are anonymous arrays +containing the values. + +=cut + +sub attributes { + my $self = shift; + my ($id,$tag) = @_; + my @result = $self->do_attributes($id,$tag) or return; + return @result if wantarray; + + # what to do in an array context + return $result[0] if $tag; + my %result; + while (my($key,$value) = splice(@result,0,2)) { + push @{$result{$key}},$value; + } + return \%result; +} + +=head2 fast_queries + + Title : fast_queries + Usage : $flag = $db->fast_queries([$flag]) + Function: turn on and off the "fast queries" option + Returns : a boolean + Args : a boolean flag (optional) + Status : public + +The mysql database driver (and possibly others) support a "fast" query +mode that caches results on the server side. This makes queries come +back faster, particularly when creating iterators. The downside is +that while iterating, new queries will die with a "command synch" +error. This method turns the feature on and off. + +For databases that do not support a fast query, this method has no +effect. + +=cut + +# override this method in order to set the mysql_use_result attribute, which is an obscure +# but extremely powerful optimization for both performance and memory. +sub fast_queries { + my $self = shift; + my $d = $self->{fast_queries}; + $self->{fast_queries} = shift if @_; + $d; +} + +=head2 add_aggregator + + Title : add_aggregator + Usage : $db->add_aggregator($aggregator) + Function: add an aggregator to the list + Returns : nothing + Args : an aggregator + Status : public + +This method will append an aggregator to the end of the list of +registered aggregators. Three different argument types are accepted: + + 1) a Bio::DB::GFF::Aggregator object -- will be added + 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}" + -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method + part is optional). + 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator + subclass, where the token corresponds to the subclass name. + +=cut + +sub add_aggregator { + my $self = shift; + my $aggregator = shift; + my $list = $self->{aggregators} ||= []; + if (ref $aggregator) { # an object + @$list = grep {$_->get_method ne $aggregator->get_method} @$list; + push @$list,$aggregator; + } + + elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) { + my($agg_name,$subparts,$mainpart) = ($1,$2,$3); + my @subparts = split /,\s*/,$subparts; + my @args = (-method => $agg_name, + -sub_parts => \@subparts); + push @args,(-main_method => $mainpart) if $mainpart; + warn "making an aggregator with (@args), subparts = @subparts" if $self->debug; + push @$list,Bio::DB::GFF::Aggregator->new(@args); + } + + else { + my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E"; + eval "require $class"; + $self->throw("Unable to load $aggregator aggregator: $@") if $@; + push @$list,$class->new(); + } +} + +=head2 aggregators + + Title : aggregators + Usage : $db->aggregators([@new_aggregators]); + Function: retrieve list of aggregators + Returns : list of aggregators + Args : a list of aggregators to set (optional) + Status : public + +This method will get or set the list of aggregators assigned to +the database. If 1 or more arguments are passed, the existing +set will be cleared. + +=cut + +sub aggregators { + my $self = shift; + my $d = $self->{aggregators}; + if (@_) { + $self->clear_aggregators; + $self->add_aggregator($_) foreach @_; + } + return unless $d; + return @$d; +} + +=head2 clear_aggregators + + Title : clear_aggregators + Usage : $db->clear_aggregators + Function: clears list of aggregators + Returns : nothing + Args : none + Status : public + +This method will clear the aggregators stored in the database object. +Use aggregators() or add_aggregator() to add some back. + +=cut + +sub clear_aggregators { shift->{aggregators} = [] } + +=head1 Methods for use by Subclasses + +The following methods are chiefly of interest to subclasses and are +not intended for use by end programmers. + +=head2 abscoords + + Title : abscoords + Usage : $db->abscoords($name,$class,$refseq) + Function: finds position of a landmark in reference coordinates + Returns : ($ref,$class,$start,$stop,$strand) + Args : name and class of landmark + Status : public + +This method is called by Bio::DB::GFF::RelSegment to obtain the +absolute coordinates of a sequence landmark. The arguments are the +name and class of the landmark. If successful, abscoords() returns +the ID of the reference sequence, its class, its start and stop +positions, and the orientation of the reference sequence's coordinate +system ("+" for forward strand, "-" for reverse strand). + +If $refseq is present in the argument list, it forces the query to +search for the landmark in a particular reference sequence. + +=cut + +sub abscoords { + my $self = shift; + my ($name,$class,$refseq) = @_; + $class ||= $self->{default_class}; + $self->get_abscoords($name,$class,$refseq); +} + +=head1 Protected API + +The following methods are not intended for public consumption, but are +intended to be overridden/implemented by adaptors. + +=head2 default_aggregators + + Title : default_aggregators + Usage : $db->default_aggregators; + Function: retrieve list of aggregators + Returns : array reference containing list of aggregator names + Args : none + Status : protected + +This method (which is intended to be overridden by adaptors) returns a +list of standard aggregators to be applied when no aggregators are +specified in the constructor. + +=cut + +sub default_aggregators { + my $self = shift; + return ['processed_transcript','alignment']; +} + +=head2 do_load_gff + + Title : do_load_gff + Usage : $db->do_load_gff($handle) + Function: load a GFF input stream + Returns : number of features loaded + Args : A filehandle. + Status : protected + +This method is called to load a GFF data stream. The method will read +GFF features from EE and load them into the database. On exit the +method must return the number of features loaded. + +Note that the method is responsible for parsing the GFF lines. This +is to allow for differences in the interpretation of the "group" +field, which are legion. + +You probably want to use load_gff() instead. It is more flexible +about the arguments it accepts. + +=cut + +# load from <> +sub do_load_gff { + my $self = shift; + my $io_handle = shift; + + local $self->{gff3_flag} = 0; + $self->setup_load(); + + my $fasta_sequence_id; + + while (<$io_handle>) { + chomp; + $self->{gff3_flag}++ if /^\#\#gff-version\s+3/; + if (/^>(\S+)/) { # uh oh, sequence coming + $fasta_sequence_id = $1; + last; + } + if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line + $self->load_gff_line( + { + ref => $1, + class => 'Sequence', + source => 'reference', + method => 'Component', + start => $2, + stop => $3, + score => undef, + strand => undef, + phase => undef, + gclass => 'Sequence', + gname => $1, + tstart => undef, + tstop => undef, + attributes => [], + } + ); + next; + } + + next if /^\#/; + my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; + next unless defined($ref) && defined($method) && defined($start) && defined($stop); + foreach (\$score,\$strand,\$phase) { + undef $$_ if $$_ eq '.'; + } + + my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag}); + + # no standard way in the GFF file to denote the class of the reference sequence -- drat! + # so we invoke the factory to do it + my $class = $self->refclass($ref); + + # call subclass to do the dirty work + if ($start > $stop) { + ($start,$stop) = ($stop,$start); + if ($strand eq '+') { + $strand = '-'; + } elsif ($strand eq '-') { + $strand = '+'; + } + } + $self->load_gff_line({ref => $ref, + class => $class, + source => $source, + method => $method, + start => $start, + stop => $stop, + score => $score, + strand => $strand, + phase => $phase, + gclass => $gclass, + gname => $gname, + tstart => $tstart, + tstop => $tstop, + attributes => $attributes} + ); + } + + my $result = $self->finish_load(); + $result += $self->load_sequence($io_handle,$fasta_sequence_id) + if defined $fasta_sequence_id; + $result; + +} + +=head2 load_sequence + + Title : load_sequence + Usage : $db->load_sequence($handle [,$id]) + Function: load a FASTA data stream + Returns : number of sequences + Args : a filehandle and optionally the ID of + the first sequence in the stream. + Status : protected + +You probably want to use load_fasta() instead. The $id argument is a +hack used to switch from GFF loading to FASTA loading when load_gff() +discovers FASTA data hiding at the bottom of the GFF file (as Artemis +does). + +=cut + +sub load_sequence { + my $self = shift; + my $io_handle = shift; + my $id = shift; # hack for GFF files that contain fasta data + + # read fasta file(s) from ARGV + my ($seq,$offset,$loaded) = (undef,0,0); + while (<$io_handle>) { + chomp; + if (/^>(\S+)/) { + $self->insert_sequence($id,$offset,$seq) if $id; + $id = $1; + $offset = 0; + $seq = ''; + $loaded++; + } else { + $seq .= $_; + $self->insert_sequence_chunk($id,\$offset,\$seq); + } + } + $self->insert_sequence($id,$offset,$seq) if $id; + $loaded+0; +} + +sub insert_sequence_chunk { + my $self = shift; + my ($id,$offsetp,$seqp) = @_; + if (my $cs = $self->dna_chunk_size) { + while (length($$seqp) >= $cs) { + my $chunk = substr($$seqp,0,$cs); + $self->insert_sequence($id,$$offsetp,$chunk); + $$offsetp += length($chunk); + substr($$seqp,0,$cs) = ''; + } + } + return 1; # the calling routine may expect success or failure +} + +# used to store big pieces of DNA in itty bitty pieces +sub dna_chunk_size { + return 0; +} + +sub insert_sequence { + my $self = shift; + my($id,$offset,$seq) = @_; + $self->throw('insert_sequence(): must be defined in subclass'); +} + +# This is the default class for reference points. Defaults to Sequence. +sub default_class { + my $self = shift; + my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence'; + $self->{default_class} = shift if @_; + $d; +} + +# gets name of the reference sequence, and returns its class +# currently just calls default_class +sub refclass { + my $self = shift; + my $name = shift; + return $self->default_class; +} + +=head2 setup_load + + Title : setup_load + Usage : $db->setup_load + Function: called before load_gff_line() + Returns : void + Args : none + Status : abstract + +This abstract method gives subclasses a chance to do any +schema-specific initialization prior to loading a set of GFF records. +It must be implemented by a subclass. + +=cut + +sub setup_load { + # default, do nothing +} + +=head2 finish_load + + Title : finish_load + Usage : $db->finish_load + Function: called after load_gff_line() + Returns : number of records loaded + Args : none + Status :abstract + +This method gives subclasses a chance to do any schema-specific +cleanup after loading a set of GFF records. + +=cut + +sub finish_load { + # default, do nothing +} + +=head2 load_gff_line + + Title : load_gff_line + Usage : $db->load_gff_line(@args) + Function: called to load one parsed line of GFF + Returns : true if successfully inserted + Args : see below + Status : abstract + +This abstract method is called once per line of the GFF and passed a +hashref containing parsed GFF fields. The fields are: + + {ref => $ref, + class => $class, + source => $source, + method => $method, + start => $start, + stop => $stop, + score => $score, + strand => $strand, + phase => $phase, + gclass => $gclass, + gname => $gname, + tstart => $tstart, + tstop => $tstop, + attributes => $attributes} + +=cut + +sub load_gff_line { + shift->throw("load_gff_line(): must be implemented by an adaptor"); +} + + +=head2 do_initialize + + Title : do_initialize + Usage : $db->do_initialize([$erase]) + Function: initialize and possibly erase database + Returns : true if successful + Args : optional erase flag + Status : protected + +This method implements the initialize() method described above, and +takes the same arguments. + +=cut + +sub do_initialize { + shift->throw('do_initialize(): must be implemented by an adaptor'); +} + +=head2 dna + + Title : dna + Usage : $db->dna($id,$start,$stop,$class) + Function: return the raw DNA string for a segment + Returns : a raw DNA string + Args : id of the sequence, its class, start and stop positions + Status : public + +This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA +sequence. + +Arguments: -name sequence name + -start start position + -stop stop position + -class sequence class + +If start and stop are both undef, then the entire DNA is retrieved. +So to fetch the whole dna, call like this: + + $db->dna($name_of_sequence); + +or like this: + + $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence); + +NOTE: you will probably prefer to create a Segment and then invoke its +dna() method. + +=cut + +# call to return the DNA string for the indicated region +# real work is done by get_dna() +sub dna { + my $self = shift; + my ($id,$start,$stop,$class) = rearrange([ + [qw(NAME ID REF REFSEQ)], + qw(START), + [qw(STOP END)], + 'CLASS', + ],@_); +# return unless defined $start && defined $stop; + $self->get_dna($id,$start,$stop,$class); +} + +sub features_in_range { + my $self = shift; + my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) = + rearrange([ + [qw(RANGE_TYPE)], + [qw(REF REFSEQ)], + qw(CLASS), + qw(START), + [qw(STOP END)], + [qw(TYPE TYPES)], + qw(PARENT), + [qw(RARE SPARSE)], + [qw(MERGE AUTOMERGE)], + 'ITERATOR' + ],@_); + $other ||= {}; + $automerge = $types && $self->automerge unless defined $automerge; + $self->throw("range type must be one of {". + join(',',keys %valid_range_types). + "}\n") + unless $valid_range_types{lc $range_type}; + $self->_features({ + rangetype => lc $range_type, + refseq => $refseq, + refclass => $class, + start => $start, + stop => $stop, + types => $types }, + { + sparse => $sparse, + automerge => $automerge, + iterator => $iterator, + %$other, + }, + $parent); +} + +=head2 get_dna + + Title : get_dna + Usage : $db->get_dna($id,$start,$stop,$class) + Function: get DNA for indicated segment + Returns : the dna string + Args : sequence ID, start, stop and class + Status : protected + +If start E stop and the sequence is nucleotide, then this method +should return the reverse complement. The sequence class may be +ignored by those databases that do not recognize different object +types. + +=cut + +sub get_dna { + my $self = shift; + my ($id,$start,$stop,$class,) = @_; + $self->throw("get_dna() must be implemented by an adaptor"); +} + +=head2 get_features + + Title : get_features + Usage : $db->get_features($search,$options,$callback) + Function: get list of features for a region + Returns : count of number of features retrieved + Args : see below + Status : protected + +The first argument is a hash reference containing search criteria for +retrieving features. It contains the following keys: + + rangetype One of "overlaps", "contains" or "contained_in". Indicates + the type of range query requested. + + refseq ID of the landmark that establishes the absolute + coordinate system. + + refclass Class of this landmark. Can be ignored by implementations + that don't recognize such distinctions. + + start Start of the range, inclusive. + + stop Stop of the range, inclusive. + + types Array reference containing the list of annotation types + to fetch from the database. Each annotation type is an + array reference consisting of [source,method]. + +The second argument is a hash reference containing certain options +that affect the way information is retrieved: + + sort_by_group + A flag. If true, means that the returned features should be + sorted by the group that they're in. + + sparse A flag. If true, means that the expected density of the + features is such that it will be more efficient to search + by type rather than by range. If it is taking a long + time to fetch features, give this a try. + + binsize A true value will create a set of artificial features whose + start and stop positions indicate bins of the given size, and + whose scores are the number of features in the bin. The + class of the feature will be set to "bin", and its name to + "method:source". This is a handy way of generating histograms + of feature density. + +The third argument, the $callback, is a code reference to which +retrieved features are passed. It is described in more detail below. + +This routine is responsible for getting arrays of GFF data out of the +database and passing them to the callback subroutine. The callback +does the work of constructing a Bio::DB::GFF::Feature object out of +that data. The callback expects a list of 13 fields: + + $refseq The reference sequence + $start feature start + $stop feature stop + $source feature source + $method feature method + $score feature score + $strand feature strand + $phase feature phase + $groupclass group class (may be undef) + $groupname group ID (may be undef) + $tstart target start for similarity hits (may be undef) + $tstop target stop for similarity hits (may be undef) + $feature_id A unique feature ID (may be undef) + +These fields are in the same order as the raw GFF file, with the +exception that the group column has been parsed into group class and +group name fields. + +The feature ID, if provided, is a unique identifier of the feature +line. The module does not depend on this ID in any way, but it is +available via Bio::DB::GFF-Eid() if wanted. In the dbi::mysql and +dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb +adaptor it is not used. + +=cut + +sub get_features{ + my $self = shift; + my ($search,$options,$callback) = @_; + $self->throw("get_features() must be implemented by an adaptor"); +} + + +=head2 _feature_by_name + + Title : _feature_by_name + Usage : $db->_feature_by_name($class,$name,$location,$callback) + Function: get a list of features by name and class + Returns : count of number of features retrieved + Args : name of feature, class of feature, and a callback + Status : abstract + +This method is used internally. The callback arguments are the same +as those used by make_feature(). This method must be overidden by +subclasses. + +=cut + +sub _feature_by_name { + my $self = shift; + my ($class,$name,$location,$callback) = @_; + $self->throw("_feature_by_name() must be implemented by an adaptor"); +} + +sub _feature_by_attribute { + my $self = shift; + my ($attributes,$callback) = @_; + $self->throw("_feature_by_name() must be implemented by an adaptor"); +} + +=head2 _feature_by_id + + Title : _feature_by_id + Usage : $db->_feature_by_id($ids,$type,$callback) + Function: get a feature based + Returns : count of number of features retrieved + Args : arrayref to feature IDs to fetch + Status : abstract + +This method is used internally to fetch features either by their ID or +their group ID. $ids is a arrayref containing a list of IDs, $type is +one of "feature" or "group", and $callback is a callback. The +callback arguments are the same as those used by make_feature(). This +method must be overidden by subclasses. + +=cut + +sub _feature_by_id { + my $self = shift; + my ($ids,$type,$callback) = @_; + $self->throw("_feature_by_id() must be implemented by an adaptor"); +} + +=head2 overlapping_features + + Title : overlapping_features + Usage : $db->overlapping_features(@args) + Function: get features that overlap the indicated range + Returns : a list of Bio::DB::GFF::Feature objects + Args : see below + Status : public + +This method is invoked by Bio::DB::GFF::Segment-Efeatures() to find +the list of features that overlap a given range. It is generally +preferable to create the Segment first, and then fetch the features. + +This method takes set of named arguments: + + -refseq ID of the reference sequence + -class Class of the reference sequence + -start Start of the desired range in refseq coordinates + -stop Stop of the desired range in refseq coordinates + -types List of feature types to return. Argument is an array + reference containing strings of the format "method:source" + -parent A parent Bio::DB::GFF::Segment object, used to create + relative coordinates in the generated features. + -rare Turn on an optimization suitable for a relatively rare feature type, + where it will be faster to filter by feature type first + and then by position, rather than vice versa. + -merge Whether to apply aggregators to the generated features. + -iterator Whether to return an iterator across the features. + +If -iterator is true, then the method returns a single scalar value +consisting of a Bio::SeqIO object. You can call next_seq() repeatedly +on this object to fetch each of the features in turn. If iterator is +false or absent, then all the features are returned as a list. + +Currently aggregation is disabled when iterating over a series of +features. + +Types are indicated using the nomenclature "method:source". Either of +these fields can be omitted, in which case a wildcard is used for the +missing field. Type names without the colon (e.g. "exon") are +interpreted as the method name and a source wild card. Regular +expressions are allowed in either field, as in: "similarity:BLAST.*". + +=cut + +# call to return the features that overlap the named region +# real work is done by get_features +sub overlapping_features { + my $self = shift; + $self->features_in_range(-range_type=>'overlaps',@_); +} + +=head2 contained_features + + Title : contained_features + Usage : $db->contained_features(@args) + Function: get features that are contained within the indicated range + Returns : a list of Bio::DB::GFF::Feature objects + Args : see overlapping_features() + Status : public + +This call is similar to overlapping_features(), except that it only +retrieves features whose end points are completely contained within +the specified range. + +Generally you will want to fetch a Bio::DB::GFF::Segment object and +call its contained_features() method rather than call this directly. + +=cut + +# The same, except that it only returns features that are completely contained within the +# range (much faster usually) +sub contained_features { + my $self = shift; + $self->features_in_range(-range_type=>'contains',@_); +} + +=head2 contained_in + + Title : contained_in + Usage : @features = $s->contained_in(@args) + Function: get features that contain this segment + Returns : a list of Bio::DB::GFF::Feature objects + Args : see features() + Status : Public + +This is identical in behavior to features() except that it returns +only those features that completely contain the segment. + +=cut + +sub contained_in { + my $self = shift; + $self->features_in_range(-range_type=>'contained_in',@_); +} + +=head2 get_abscoords + + Title : get_abscoords + Usage : $db->get_abscoords($name,$class,$refseq) + Function: get the absolute coordinates of sequence with name & class + Returns : ($absref,$absstart,$absstop,$absstrand) + Args : name and class of the landmark + Status : protected + +Given the name and class of a genomic landmark, this function returns +a four-element array consisting of: + + $absref the ID of the reference sequence that contains this landmark + $absstart the position at which the landmark starts + $absstop the position at which the landmark stops + $absstrand the strand of the landmark, relative to the reference sequence + +If $refseq is provided, the function searches only within the +specified reference sequence. + +=cut + +sub get_abscoords { + my $self = shift; + my ($name,$class,$refseq) = @_; + $self->throw("get_abscoords() must be implemented by an adaptor"); +} + +=head2 get_types + + Title : get_types + Usage : $db->get_types($absref,$class,$start,$stop,$count) + Function: get list of all feature types on the indicated segment + Returns : list or hash of Bio::DB::GFF::Typename objects + Args : see below + Status : protected + +Arguments are: + + $absref the ID of the reference sequence + $class the class of the reference sequence + $start the position to start counting + $stop the position to end counting + $count a boolean indicating whether to count the number + of occurrences of each feature type + +If $count is true, then a hash is returned. The keys of the hash are +feature type names in the format "method:source" and the values are +the number of times a feature of this type overlaps the indicated +segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename +objects. If $start or $stop are undef, then all features on the +indicated segment are enumerated. If $absref is undef, then the call +returns all feature types in the database. + +=cut + +sub get_types { + my $self = shift; + my ($refseq,$class,$start,$stop,$count,$types) = @_; + $self->throw("get_types() must be implemented by an adaptor"); +} + +=head2 make_feature + + Title : make_feature + Usage : $db->make_feature(@args) + Function: Create a Bio::DB::GFF::Feature object from string data + Returns : a Bio::DB::GFF::Feature object + Args : see below + Status : internal + + This takes 14 arguments (really!): + + $parent A Bio::DB::GFF::RelSegment object + $group_hash A hashref containing unique list of GFF groups + $refname The name of the reference sequence for this feature + $refclass The class of the reference sequence for this feature + $start Start of feature + $stop Stop of feature + $source Feature source field + $method Feature method field + $score Feature score field + $strand Feature strand + $phase Feature phase + $group_class Class of feature group + $group_name Name of feature group + $tstart For homologies, start of hit on target + $tstop Stop of hit on target + +The $parent argument, if present, is used to establish relative +coordinates in the resulting Bio::DB::Feature object. This allows one +feature to generate a list of other features that are relative to its +coordinate system (for example, finding the coordinates of the second +exon relative to the coordinates of the first). + +The $group_hash allows the group_class/group_name strings to be turned +into rich database objects via the make_obect() method (see above). +Because these objects may be expensive to create, $group_hash is used +to uniquefy them. The index of this hash is the composite key +{$group_class,$group_name,$tstart,$tstop}. Values are whatever object +is returned by the make_object() method. + +The remainder of the fields are taken from the GFF line, with the +exception that "Target" features, which contain information about the +target of a homology search, are parsed into their components. + +=cut + +# This call is responsible for turning a line of GFF into a +# feature object. +# The $parent argument is a Bio::DB::GFF::Segment object and is used +# to establish the coordinate system for the new feature. +# The $group_hash argument is an hash ref that holds previously- +# generated group objects. +# Other arguments are taken right out of the GFF table. +sub make_feature { + my $self = shift; + my ($parent,$group_hash, # these arguments provided by generic mechanisms + $srcseq, # the rest is provided by adaptor + $start,$stop, + $source,$method, + $score,$strand,$phase, + $group_class,$group_name, + $tstart,$tstop, + $db_id,$group_id) = @_; + + return unless $srcseq; # return undef if called with no arguments. This behavior is used for + # on-the-fly aggregation. + + my $group; # undefined + if (defined $group_class && defined $group_name) { + $tstart ||= ''; + $tstop ||= ''; + if ($group_hash) { + $group = $group_hash->{$group_class,$group_name,$tstart,$tstop} + ||= $self->make_object($group_class,$group_name,$tstart,$tstop); + } else { + $group = $self->make_object($group_class,$group_name,$tstart,$tstop); + } + } + +# fix for some broken GFF files +# unfortunately - has undesired side effects +# if (defined $tstart && defined $tstop && !defined $strand) { +# $strand = $tstart <= $tstop ? '+' : '-'; +# } + + if (ref $parent) { # note that the src sequence is ignored + return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop, + $method,$source, + $score,$strand,$phase, + $group,$db_id,$group_id, + $tstart,$tstop); + } else { + return Bio::DB::GFF::Feature->new($self,$srcseq, + $start,$stop, + $method,$source, + $score,$strand,$phase, + $group,$db_id,$group_id, + $tstart,$tstop); + } +} + +sub make_aggregated_feature { + my $self = shift; + my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3); + my $feature = $self->make_feature($parent,undef,@_); + return [$feature] if $feature && !$feature->group; + + # if we have accumulated features and either: + # (1) make_feature() returned undef, indicated very end or + # (2) the current group is different from the previous one + + local $^W = 0; # irritating uninitialized value warning in next statement + if (@$accumulated_features && + (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) { + foreach my $a (@$aggregators) { # last aggregator gets first shot + $a->aggregate($accumulated_features,$self) or next; + } + my @result = @$accumulated_features; + @$accumulated_features = $feature ? ($feature) : (); + return unless @result; + return \@result ; + } + push @$accumulated_features,$feature; + return; +} + +=head2 parse_types + + Title : parse_types + Usage : $db->parse_types(@args) + Function: parses list of types + Returns : an array ref containing ['method','source'] pairs + Args : a list of types in 'method:source' form + Status : internal + +This method takes an array of type names in the format "method:source" +and returns an array reference of ['method','source'] pairs. It will +also accept a single argument consisting of an array reference with +the list of type names. + +=cut + +# turn feature types in the format "method:source" into a list of [method,source] refs +sub parse_types { + my $self = shift; + return [] if !@_ or !defined($_[0]); + return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0]; + my @types = ref($_[0]) ? @{$_[0]} : @_; + my @type_list = map { [split(':',$_,2)] } @types; + return \@type_list; +} + +=head2 make_match_sub + + Title : make_match_sub + Usage : $db->make_match_sub($types) + Function: creates a subroutine used for filtering features + Returns : a code reference + Args : a list of parsed type names + Status : protected + +This method is used internally to generate a code subroutine that will +accept or reject a feature based on its method and source. It takes +an array of parsed type names in the format returned by parse_types(), +and generates an anonymous subroutine. The subroutine takes a single +Bio::DB::GFF::Feature object and returns true if the feature matches +one of the desired feature types, and false otherwise. + +=cut + +# a subroutine that matches features indicated by list of types +sub make_match_sub { + my $self = shift; + my $types = shift; + + return sub { 1 } unless ref $types && @$types; + + my @expr; + for my $type (@$types) { + my ($method,$source) = @$type; + $method ||= '.*'; + $source = $source ? ":$source" : "(?::.+)?"; + push @expr,"${method}${source}"; + } + my $expr = join '|',@expr; + return $self->{match_subs}{$expr} if $self->{match_subs}{$expr}; + + my $sub =<type =~ /^($expr)\$/i; +} +END + warn "match sub: $sub\n" if $self->debug; + my $compiled_sub = eval $sub; + $self->throw($@) if $@; + return $self->{match_subs}{$expr} = $compiled_sub; +} + +=head2 make_object + + Title : make_object + Usage : $db->make_object($class,$name,$start,$stop) + Function: creates a feature object + Returns : a feature object + Args : see below + Status : protected + +This method is called to make an object from the GFF "group" field. +By default, all Target groups are turned into Bio::DB::GFF::Homol +objects, and everything else becomes a Bio::DB::GFF::Featname. +However, adaptors are free to override this method to generate more +interesting objects, such as true BioPerl objects, or Acedb objects. + +Arguments are: + + $name database ID for object + $class class of object + $start for similarities, start of match inside object + $stop for similarities, stop of match inside object + +=cut + +# abstract call to turn a feature into an object, given its class and name +sub make_object { + my $self = shift; + my ($class,$name,$start,$stop) = @_; + return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop) + if defined $start and length $start; + return Bio::DB::GFF::Featname->new($class,$name); +} + + +=head2 do_attributes + + Title : do_attributes + Usage : $db->do_attributes($id [,$tag]); + Function: internal method to retrieve attributes given an id and tag + Returns : a list of Bio::DB::GFF::Feature objects + Args : a feature id and a attribute tag (optional) + Status : protected + +This method is overridden by subclasses in order to return a list of +attributes. If called with a tag, returns the value of attributes of +that tag type. If called without a tag, returns a flattened array of +(tag=Evalue) pairs. A particular tag can be present multiple times. + +=cut + +sub do_attributes { + my $self = shift; + my ($id,$tag) = @_; + return (); +} + + + +=head1 Internal Methods + +The following methods are internal to Bio::DB::GFF and are not +guaranteed to remain the same. + +=head2 _features + + Title : _features + Usage : $db->_features($search,$options,$parent) + Function: internal method + Returns : a list of Bio::DB::GFF::Feature objects + Args : see below + Status : internal + +This is an internal method that is called by overlapping_features(), +contained_features() and features() to create features based on a +parent segment's coordinate system. It takes three arguments, a +search options hashref, an options hashref, and a parent segment. + +The search hashref contains the following keys: + + rangetype One of "overlaps", "contains" or "contained_in". Indicates + the type of range query requested. + refseq reference sequence ID + refclass reference sequence class + start start of range + stop stop of range + types arrayref containing list of types in "method:source" form + +The options hashref contains zero or more of the following keys: + + sparse turn on optimizations for a rare feature + automerge if true, invoke aggregators to merge features + iterator if true, return an iterator + +The $parent argument is a scalar object containing a +Bio::DB::GFF::RelSegment object or descendent. + +=cut + +#' + +sub _features { + my $self = shift; + my ($search,$options,$parent) = @_; + (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)}) + if defined($search->{start}) && $search->{start} > $search->{stop}; + + my $types = $self->parse_types($search->{types}); # parse out list of types + my @aggregated_types = @$types; # keep a copy + + # allow the aggregators to operate on the original + my @aggregators; + if ($options->{automerge}) { + for my $a ($self->aggregators) { + $a = $a->clone if $options->{iterator}; + unshift @aggregators,$a + if $a->disaggregate(\@aggregated_types,$self); + } + } + + if ($options->{iterator}) { + my @accumulated_features; + my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) } + : sub { [$self->make_feature($parent,undef,@_)] }; + return $self->get_features_iterator({ %$search, + types => \@aggregated_types }, + { %$options, + sort_by_group => $options->{automerge} }, + $callback + ); + } + + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + + my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) }; + $self->get_features({ %$search, + types => \@aggregated_types }, + $options, + $callback); + + if ($options->{automerge}) { + warn "aggregating...\n" if $self->debug; + foreach my $a (@aggregators) { # last aggregator gets first shot + warn "Aggregator $a:\n" if $self->debug; + $a->aggregate($features,$self); + } + } + + @$features; +} + +=head2 get_features_iterator + + Title : get_features_iterator + Usage : $db->get_features_iterator($search,$options,$callback) + Function: get an iterator on a features query + Returns : a Bio::SeqIO object + Args : as per get_features() + Status : Public + +This method takes the same arguments as get_features(), but returns an +iterator that can be used to fetch features sequentially, as per +Bio::SeqIO. + +Internally, this method is simply a front end to range_query(). +The latter method constructs and executes the query, returning a +statement handle. This routine passes the statement handle to the +constructor for the iterator, along with the callback. + +=cut + +sub get_features_iterator { + my $self = shift; + my ($search,$options,$callback) = @_; + $self->throw('feature iteration is not implemented in this adaptor'); +} + +=head2 split_group + + Title : split_group + Usage : $db->split_group($group_field,$gff3_flag) + Function: parse GFF group field + Returns : ($gclass,$gname,$tstart,$tstop,$attributes) + Args : the gff group column and a flag indicating gff3 compatibility + Status : internal + +This is a method that is called by load_gff_line to parse out the +contents of one or more group fields. It returns the class of the +group, its name, the start and stop of the target, if any, and an +array reference containing any attributes that were stuck into the +group field, in [attribute_name,attribute_value] format. + +=cut + +sub split_group { + my $self = shift; + my ($group,$gff3) = @_; + if ($gff3) { + my @groups = split /[;&]/,$group; # so easy! + return $self->_split_gff3_group(@groups); + } else { + # handle group parsing + # protect embedded semicolons in the group; there must be faster/more elegant way + # to do this. + $group =~ s/\\;/$;/g; + while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 } + my @groups = split(/\s*;\s*/,$group); + foreach (@groups) { s/$;/;/g } + return $self->_split_gff2_group(@groups); + } +} + +=head2 _split_gff2_group + +This is an internal method called by split_group(). + +=cut + +sub _split_gff2_group { + my $self = shift; + my @groups = @_; + + my ($gclass,$gname,$tstart,$tstop,@attributes); + + for (@groups) { + + my ($tag,$value) = /^(\S+)(?:\s+(.+))?/; + $value ||= ''; + if ($value =~ /^\"(.+)\"$/) { #remove quotes + $value = $1; + } + $value =~ s/\\t/\t/g; + $value =~ s/\\r/\r/g; + + # Any additional groups become part of the attributes hash + # For historical reasons, the tag "Note" is treated as an + # attribute, even if it is the only group. + $tag ||= ''; + if ($tag eq 'Note' or ($gclass && $gname)) { + push @attributes,[$tag => $value]; + } + + # if the tag eq 'Target' then the class name is embedded in the ID + # (the GFF format is obviously screwed up here) + elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { + ($gclass,$gname) = ($1,$2); + ($tstart,$tstop) = / (\d+) (\d+)/; + } + + elsif (!$value) { + push @attributes,[Note => $tag]; # e.g. "Confirmed_by_EST" + } + + # otherwise, the tag and value correspond to the + # group class and name + else { + ($gclass,$gname) = ($tag,$value); + } + } + + return ($gclass,$gname,$tstart,$tstop,\@attributes); +} + +=head2 _split_gff3_group + +This is called internally from split_group(). + +=cut + +sub _split_gff3_group { + my $self = shift; + my @groups = @_; + my ($gclass,$gname,$tstart,$tstop,@attributes); + + for my $group (@groups) { + my ($tag,$value) = split /=/,$group; + $tag = unescape($tag); + my @values = map {unescape($_)} split /,/,$value; + if ($tag eq 'Parent') { + $gclass = 'Sequence'; + $gname = shift @values; + } + elsif ($tag eq 'ID') { + $gclass = 'Sequence'; + $gname = shift @values; + } + elsif ($tag eq 'Target') { + $gclass = 'Sequence'; + ($gname,$tstart,$tstop) = split /\s+/,shift @values; + } + push @attributes,[$tag=>$_] foreach @values; + } + return ($gclass,$gname,$tstart,$tstop,\@attributes); +} + +=head2 _delete_features(), _delete_groups(),_delete() + + Title : _delete_features(), _delete_groups(),_delete() + Usage : $count = $db->_delete_features(@feature_ids) + $count = $db->_delete_groups(@group_ids) + $count = $db->_delete(\%delete_spec) + Function: low-level feature/group deleter + Returns : count of groups removed + Args : list of feature or group ids removed + Status : for implementation by subclasses + +These methods need to be implemented in adaptors. For +_delete_features and _delete_groups, the arguments are a list of +feature or group IDs to remove. For _delete(), the argument is a +hashref with the three keys 'segments', 'types' and 'force'. The +first contains an arrayref of Bio::DB::GFF::RelSegment objects to +delete (all FEATURES within the segment are deleted). The second +contains an arrayref of [method,source] feature types to delete. The +two are ANDed together. If 'force' has a true value, this forces the +operation to continue even if it would delete all features. + +=cut + +sub _delete_features { + my $self = shift; + my @feature_ids = @_; + $self->throw('_delete_features is not implemented in this adaptor'); +} + +sub _delete_groups { + my $self = shift; + my @group_ids = @_; + $self->throw('_delete_groups is not implemented in this adaptor'); +} + +sub _delete { + my $self = shift; + my $delete_options = shift; + $self->throw('_delete is not implemented in this adaptor'); +} + +sub unescape { + my $v = shift; + $v =~ tr/+/ /; + $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; + return $v; +} + + +package Bio::DB::GFF::ID_Iterator; +use strict; + +use Bio::Root::Root; +use vars '@ISA'; +@ISA = 'Bio::Root::Root'; + +sub new { + my $class = shift; + my ($db,$ids,$type) = @_; + return bless {ids=>$ids,db=>$db,type=>$type},$class; +} + +sub next_seq { + my $self = shift; + my $next = shift @{$self->{ids}}; + return unless $next; + my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next; + my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name) + : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name) + : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name) + : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier"); + $self->throw("id does not exist") unless $segment; + return $segment; +} + +1; + +__END__ + +=head1 BUGS + +Features can only belong to a single group at a time. This must be +addressed soon. + +Start coordinate can be greater than stop coordinate for relative +addressing. This breaks strict BioPerl compatibility and must be +fixed. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/ace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/ace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,66 @@ +package Bio::DB::GFF::Adaptor::ace; + +=head1 NAME + +Bio::DB::GFF::Adaptor::ace -- ace interface (for multiple inheritance) + +=head1 SYNOPSIS + +Pending + +See L and L + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +use strict; +use Ace; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() + +sub dna_db { + my $self = shift; + my $d = $self->{dna_db}; + $self->{dna_db} = shift if @_; + $d; +} +sub acedb { + my $self = shift; + my $d = $self->{acedb}; + $self->{acedb} = shift if @_; + $d; +} + +=head2 freshen_ace + + Title : freshen + Usage : $flag = Bio::DB::GFF->freshen_ace; + Function: Refresh internal acedb handle + Returns : flag if correctly freshened + Args : none + Status : Public + +ACeDB has an annoying way of timing out, leaving dangling database +handles. This method will invoke the ACeDB reopen() method, which +causes dangling handles to be refreshed. It has no effect if you are +not using ACeDB to create ACeDB objects. + +=cut + +sub freshen_ace { + my $acedb = shift->acedb or return; + $acedb->reopen(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/biofetch.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/biofetch.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,276 @@ +package Bio::DB::GFF::Adaptor::biofetch; + +=head1 NAME + +Bio::DB::GFF::Adaptor::biofetch -- Cache BioFetch objects in a Bio::DB::GFF database + +=head1 SYNOPSIS + +Proof of principle. Not for production use. + +=head1 DESCRIPTION + +This adaptor is a proof-of-principle. It is used to fetch BioFetch +sequences into a Bio::DB::GFF database (currently uses a hard-coded +EMBL database) as needed. This allows the Generic Genome Browser to +be used as a Genbank/EMBL browser. + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +use strict; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() +use Bio::DB::GFF::Adaptor::dbi::mysql; +use Bio::DB::BioFetch; +use Bio::SeqIO; + +use vars qw(@ISA %preferred_tags); +@ISA = qw(Bio::DB::GFF::Adaptor::dbi::mysql); + +# priority for choosing names of CDS tags, higher is higher priority +%preferred_tags = ( + strain => 10, + organism => 20, + protein_id => 40, + locus_tag => 50, + locus => 60, + gene => 70, + standard_name => 80, + ); + +=head2 new + + Title : new + Usage : $db = Bio::DB::GFF->new(-adaptor=>'biofetch',@args) + Function: create a new adaptor + Returns : a Bio::DB::GFF object + Args : see below + Status : Public + +This is the constructor for the adaptor. It is called automatically +by Bio::DB::GFF-Enew. In addition to arguments that are common among +all adaptors, the following class-specific arguments are recgonized: + + Argument Description + -------- ----------- + + -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' + + -user username for authentication + + -pass the password for authentication + + -proxy [['http','ftp'],'http://proxy:8080'] + +-dsn,-user and -pass indicate the local database to cache results in, +and as are per Bio::DB::GFF::Adaptor::dbi. The -proxy argument allows +you to set the biofetch web proxy, and uses the same syntax described +for the proxy() method of L, except that the +argument must be passed as an array reference. + +=cut + +sub new { + my $class = shift; + my $self = $class->SUPER::new(@_); + my ($proxy) = rearrange(['PROXY'],@_); + if ($proxy) { + my @args = ref($proxy) ? @$proxy : eval $proxy; + $self->{_proxy} = \@args if @args; + } + $self; +} + +sub segment { + my $self = shift; + my @segments = $self->SUPER::segment(@_); + + if (!@segments) { + my $refclass = $self->refclass; + + my %args = $self->setup_segment_args(@_); + if ($args{-class} && $args{-class} =~ /$refclass/oi) { + return unless $self->load_from_embl('embl'=>$args{-name}); + @segments = $self->SUPER::segment(@_); + } elsif ($args{-class} && $args{-class} =~ /refseq|swall|embl/i) { #hack to get refseq names + return unless $self->load_from_embl(lc($args{-class})=>$args{-name}); + $args{-class} = $self->refclass; + @segments = $self->SUPER::segment(%args); + } + } + + $self->_multiple_return_args(@segments); +} + +# default is to return 'Sequence' as the class of all references +sub refclass { + my $self = shift; + my $refname = shift; + 'Accession'; +} + +sub load_from_embl { + my $self = shift; + my $db = shift; + my $acc = shift or $self->throw('Must provide an accession ID'); + + my $biofetch; + if ($self->{_biofetch}{$db}) { + $biofetch = $self->{_biofetch}{$db}; + } else { + $biofetch = $self->{_biofetch}{$db} = Bio::DB::BioFetch->new(-db=>$db); + $biofetch->retrieval_type('tempfile'); + $biofetch->proxy(@{$self->{_proxy}}) if $self->{_proxy}; + } + + my $seq = eval {$biofetch->get_Seq_by_id($acc)} or return; + $self->_load_embl($acc,$seq); + 1; +} + +sub load_from_file { + my $self = shift; + my $file = shift; + + my $format = $file =~ /\.(gb|genbank|gbk)$/i ? 'genbank' : 'embl'; + + my $seqio = Bio::SeqIO->new( '-format' => $format, -file => $file); + my $seq = $seqio->next_seq; + + $self->_load_embl($seq->accession,$seq); + 1; +} + +sub _load_embl { + my $self = shift; + my $acc = shift; + my $seq = shift; + my $refclass = $self->refclass; + my $locus = $seq->id; + + # begin loading + $self->setup_load(); + + # first synthesize the entry for the top-level feature + my @aliases; + foreach ($seq->accession,$seq->get_secondary_accessions) { + next if lc($_) eq lc($acc); + push @aliases,[Alias => $_]; + } + $self->load_gff_line( + { + ref => $acc, + class => $refclass, + source => 'EMBL', + method => 'origin', + start => 1, + stop => $seq->length, + score => undef, + strand => '.', + phase => '.', + gclass => $self->refclass, + gname => $acc, + tstart => undef, + tstop => undef, + attributes => [[Note => $seq->desc],@aliases], + } + ); + # now load each feature in turn + for my $feat ($seq->all_SeqFeatures) { + my $attributes = $self->get_attributes($feat); + my $name = $self->guess_name($attributes); + + my $location = $feat->location; + my @segments = map {[$_->start,$_->end,$_->seq_id]} + $location->can('sub_Location') ? $location->sub_Location : $location; + + my $type = $feat->primary_tag eq 'CDS' ? 'mRNA' : $feat->primary_tag; + my $parttype = $feat->primary_tag eq 'gene' ? 'exon' : $feat->primary_tag; + + if ($feat->primary_tag =~ /^(gene|CDS)$/) { + $self->load_gff_line( { + ref => $acc, + class => $refclass, + source => 'EMBL', + method => $type, + start => $location->start, + stop => $location->end, + score => $feat->score || undef, + strand => $feat->strand > 0 ? '+' : ($feat->strand < 0 ? '-' : '.'), + phase => $feat->frame || '.', + gclass => $name->[0], + gname => $name->[1], + tstart => undef, + tstop => undef, + attributes => $attributes, + } + ); + @$attributes = (); + } + + for my $segment (@segments) { + + $self->load_gff_line( { + ref => $segment->[2] eq $locus ? $acc : $segment->[2], + class => $refclass, + source => 'EMBL', + method => $parttype, + start => $segment->[0], + stop => $segment->[1], + score => $feat->score || undef, + strand => $feat->strand > 0 ? '+' : ($feat->strand < 0 ? '-' : '.'), + phase => $feat->frame || '.', + gclass => $name->[0], + gname => $name->[1], + tstart => undef, + tstop => undef, + attributes => $attributes, + } + ); + } + + } + + # finish loading + $self->finish_load(); + + # now load the DNA + $self->load_sequence_string($acc,$seq->seq); + + 1; +} + +sub get_attributes { + my $self = shift; + my $seq = shift; + + my @tags = $seq->all_tags or return; + my @result; + foreach my $tag (@tags) { + foreach my $value ($seq->each_tag_value($tag)) { + push @result,[$tag=>$value]; + } + } + \@result; +} + +sub guess_name { + my $self = shift; + my $attributes = shift; + my @ordered_attributes = sort {($preferred_tags{$a->[0]} || 0) <=> ($preferred_tags{$b->[0]} || 0)} @$attributes; + my $best = pop @ordered_attributes; + @$attributes = @ordered_attributes; + return $best; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1956 @@ +# $Id: dbi.pm,v 1.41.2.1 2003/07/05 00:52:30 lstein Exp $ + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi -- Database adaptor for DBI (SQL) databases + +=head1 SYNOPSIS + +See L + +=head1 DESCRIPTION + +This is the base class for DBI-based adaptors. It does everything +except generating the text of the queries to be used. See the section +QUERIES TO IMPLEMENT for the list of methods that must be implemented. + +=cut + +package Bio::DB::GFF::Adaptor::dbi; + +# base class for dbi-based implementations +use strict; + +use DBI; +use Bio::DB::GFF; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() +use Bio::DB::GFF::Util::Binning; +use Bio::DB::GFF::Adaptor::dbi::iterator; +use Bio::DB::GFF::Adaptor::dbi::caching_handle; +use vars qw(@ISA); + +@ISA = qw(Bio::DB::GFF); + +# constants for choosing + +use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get + +# this is the largest that any reference sequence can be (100 megabases) +use constant MAX_BIN => 100_000_000; + +# this is the smallest bin (1 K) +use constant MIN_BIN => 1000; + +# size of range over which it is faster to force the database to use the range for indexing +use constant STRAIGHT_JOIN_LIMIT => 200_000; + +# this is the size to which DNA should be shredded +use constant DNA_CHUNK_SIZE => 2000; + +############################################################################## + + +=head2 new + + Title : new + Usage : $db = Bio::DB::GFF->new(@args) + Function: create a new adaptor + Returns : a Bio::DB::GFF object + Args : see below + Status : Public + +This is the constructor for the adaptor. It is called automatically +by Bio::DB::GFF-Enew. In addition to arguments that are common among +all adaptors, the following class-specific arguments are recgonized: + + Argument Description + -------- ----------- + + -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' + + -user username for authentication + + -pass the password for authentication + +=cut + +# Create a new Bio::DB::GFF::Adaptor::dbi object +sub new { + my $class = shift; + my ($features_db,$username,$auth,$other) = rearrange([ + [qw(FEATUREDB DB DSN)], + [qw(USERNAME USER)], + [qw(PASSWORD PASS)], + ],@_); + + $features_db || $class->throw("new(): Provide a data source or DBI database"); + + if (!ref($features_db)) { + my $dsn = $features_db; + my @args; + push @args,$username if defined $username; + push @args,$auth if defined $auth; + $features_db = Bio::DB::GFF::Adaptor::dbi::caching_handle->new($dsn,@args) + || $class->throw("new(): Failed to connect to $dsn: " + . Bio::DB::GFF::Adaptor::dbi::caching_handle->errstr); + } else { + $features_db->isa('DBI::db') + || $class->throw("new(): $features_db is not a DBI handle"); + } + + # fill in object + return bless { + features_db => $features_db + },$class; +} + +sub debug { + my $self = shift; + $self->features_db->debug(@_); + $self->SUPER::debug(@_); +} + +=head2 features_db + + Title : features_db + Usage : $dbh = $db->features_db + Function: get database handle + Returns : a DBI handle + Args : none + Status : Public + + Note: what is returned is not really a DBI::db handle, but a + subclass of one. This means that you cannot manipulate the + handle's attributes directly. Instead call the attribute + method: + + my $dbh = $db->features_db; + $dbh->attribute(AutoCommit=>0); + +=cut + +sub features_db { shift->{features_db} } +sub dbh { shift->{features_db} } + +=head2 get_dna + + Title : get_dna + Usage : $string = $db->get_dna($name,$start,$stop,$class) + Function: get DNA string + Returns : a string + Args : name, class, start and stop of desired segment + Status : Public + +This method performs the low-level fetch of a DNA substring given its +name, class and the desired range. It is actually a front end to the +abstract method make_dna_query(), which it calls after some argument +consistency checking. + +=cut + +sub get_dna { + my $self = shift; + my ($ref,$start,$stop,$class) = @_; + + my ($offset_start,$offset_stop); + + my $has_start = defined $start; + my $has_stop = defined $stop; + + my $reversed; + if ($has_start && $has_stop && $start > $stop) { + $reversed++; + ($start,$stop) = ($stop,$start); + } + + # turn start and stop into 0-based offsets + my $cs = $self->dna_chunk_size; + $start -= 1; $stop -= 1; + $offset_start = int($start/$cs)*$cs; + $offset_stop = int($stop/$cs)*$cs; + + my $sth; + # special case, get it all + if (!($has_start || $has_stop)) { + $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? order by foffset',$ref); + } + + elsif (!$has_stop) { + $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? and foffset>=? order by foffset', + $ref,$offset_start); + } + + else { # both start and stop defined + $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? and foffset>=? and foffset<=? order by foffset', + $ref,$offset_start,$offset_stop); + } + + my $dna = ''; + while (my($frag,$offset) = $sth->fetchrow_array) { + substr($frag,0,$start-$offset) = '' if $has_start && $start > $offset; + $dna .= $frag; + } + substr($dna,$stop-$start+1) = '' if $has_stop && $stop-$start+1 < length($dna); + if ($reversed) { + $dna = reverse $dna; + $dna =~ tr/gatcGATC/ctagCTAG/; + } + + $sth->finish; + $dna; +} + + +=head2 get_abscoords + + Title : get_abscoords + Usage : ($refseq,$refclass,$start,$stop,$strand) = $db->get_abscoords($name,$class) + Function: get absolute coordinates for landmark + Returns : an array ref -- see below + Args : name and class of desired landmark + Status : Public + +This method performs the low-level resolution of a landmark into a +reference sequence and position. + +The result is an array ref, each element of which is a five-element +list containing reference sequence name, class, start, stop and strand. + +=cut + +# given sequence name, return (reference,start,stop,strand) +sub get_abscoords_b { + my $self = shift; + my ($name,$class,$refseq) = @_; + + my $sth = $self->make_abscoord_query($name,$class,$refseq); + + my @result; + while ( my @row = $sth->fetchrow_array) { + push @result,\@row + } + $sth->finish; + + if (@result == 0) { + $self->error("$name not found in database"); + return; + } else { + return \@result; + } +} + + +sub get_abscoords { + my $self = shift; + my ($name,$class,$refseq) = @_; + + my $sth = $self->make_abscoord_query($name,$class,$refseq); + + my @result; + while (my @row = $sth->fetchrow_array) { + push @result,\@row + } + $sth->finish; + + if (@result == 0) { + #$self->error("$name not found in database"); + my $sth2 = $self->make_aliasabscoord_query($name,$class); + + while (my @row2 = $sth2->fetchrow_array) { + push @result,\@row2 + } + $sth->finish; + + if (@result == 0){ + $self->error("$name not found in database"); + return; + } + } + #} else { + return \@result; + #} +} + + +=head2 get_features + + Title : get_features + Usage : $db->get_features($search,$options,$callback) + Function: retrieve features from the database + Returns : number of features retrieved + Args : see below + Status : Public + +This is the low-level method that is called to retrieve GFF lines from +the database. It is responsible for retrieving features that satisfy +range and feature type criteria, and passing the GFF fields to a +callback subroutine. + +See the manual page for Bio::DB::GFF for the interpretation of the +arguments and how the information retrieved by get_features is passed +to the callback for processing. + +Internally, get_features() is a front end for range_query(). The +latter method constructs the query and executes it. get_features() +calls fetchrow_array() to recover the fields and passes them to the +callback. + +=cut + +# Given sequence name, range, and optional filter, retrieve list of +# all features. Passes features through callback. +sub get_features { + my $self = shift; + my ($search,$options,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + + my $sth = $self->range_query(@{$search}{qw(rangetype + refseq + refclass + start + stop + types) }, + @{$options}{qw( + sparse + sort_by_group + ATTRIBUTES + BINSIZE)}) or return; + + my $count = 0; + while (my @row = $sth->fetchrow_array) { + $callback->(@row); + $count++; + } + $sth->finish; + return $count; +} + +=head2 classes + + Title : classes + Usage : $db->classes + Function: return list of landmark classes in database + Returns : a list of classes + Args : none + Status : public + +This routine returns the list of reference classes known to the +database, or empty if classes are not used by the database. Classes +are distinct from types, being essentially qualifiers on the reference +namespaces. + +NOTE: In the current mysql-based schema, this query takes a while to +run due to the classes not being normalized. + +=cut + +sub classes { + my $self = shift; + my ($query,@args) = $self->make_classes_query or return; + my $sth = $self->dbh->do_query($query,@args); + my @classes; + while (my ($c) = $sth->fetchrow_array) { + push @classes,$c; + } + @classes; +} + +=head2 make_classes_query + + Title : make_classes_query + Usage : ($query,@args) = $db->make_classes_query + Function: return query fragment for generating list of reference classes + Returns : a query and args + Args : none + Status : public + +=cut + +sub make_classes_query { + my $self = shift; + return; +} + +=head2 _feature_by_name + + Title : _feature_by_name + Usage : $db->get_features_by_name($name,$class,$callback) + Function: get a list of features by name and class + Returns : count of number of features retrieved + Args : name of feature, class of feature, and a callback + Status : protected + +This method is used internally. The callback arguments are those used +by make_feature(). Internally, it invokes the following abstract procedures: + + make_features_select_part + make_features_from_part + make_features_by_name_where_part + make_features_join_part + +=cut + +sub _feature_by_name { + my $self = shift; + my ($class,$name,$location,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + + my $select = $self->make_features_select_part; + my $from = $self->make_features_from_part(undef,{sparse_groups=>1}); + my ($where,@args) = $self->make_features_by_name_where_part($class,$name); + my $join = $self->make_features_join_part; + my $range = $self->make_features_by_range_where_part('overlaps', + {refseq=>$location->[0], + class =>'', + start=>$location->[1], + stop =>$location->[2]}) if $location; + my $query = "SELECT $select FROM $from WHERE $where AND $join"; + $query .= " AND $range" if $range; + my $sth = $self->dbh->do_query($query,@args); + + my $count = 0; + while (my @row = $sth->fetchrow_array) { + $callback->(@row); + $count++; + } + $sth->finish; + return $count; +} + +=head2 _feature_by_id + + Title : _feature_by_id + Usage : $db->_feature_by_id($ids,$type,$callback) + Function: get a list of features by ID + Returns : count of number of features retrieved + Args : arrayref containing list of IDs to fetch and a callback + Status : protected + +This method is used internally. The $type selector is one of +"feature" or "group". The callback arguments are those used by +make_feature(). Internally, it invokes the following abstract +procedures: + + make_features_select_part + make_features_from_part + make_features_by_id_where_part + make_features_join_part + +=cut + +sub _feature_by_id { + my $self = shift; + my ($ids,$type,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + + my $select = $self->make_features_select_part; + my $from = $self->make_features_from_part; + my ($where,@args) = $type eq 'feature' ? $self->make_features_by_id_where_part($ids) + : $self->make_features_by_gid_where_part($ids); + my $join = $self->make_features_join_part; + my $query = "SELECT $select FROM $from WHERE $where AND $join"; + my $sth = $self->dbh->do_query($query,@args); + + my $count = 0; + while (my @row = $sth->fetchrow_array) { + $callback->(@row); + $count++; + } + $sth->finish; + return $count; +} + +sub _feature_by_attribute { + my $self = shift; + my ($attributes,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + + my $select = $self->make_features_select_part; + my $from = $self->make_features_from_part(undef,{attributes=>$attributes}); + my ($where,@args) = $self->make_features_by_range_where_part('',{attributes=>$attributes}); + my $join = $self->make_features_join_part({attributes=>$attributes}); + my $query = "SELECT $select FROM $from WHERE $where AND $join"; + my $sth = $self->dbh->do_query($query,@args); + + my $count = 0; + while (my @row = $sth->fetchrow_array) { + $callback->(@row); + $count++; + } + $sth->finish; + return $count; +} + +=head2 get_types + + Title : get_types + Usage : $db->get_types($refseq,$refclass,$start,$stop,$count) + Function: get list of types + Returns : a list of Bio::DB::GFF::Typename objects + Args : see below + Status : Public + +This method is responsible for fetching the list of feature type names +from the database. The query may be limited to a particular range, in +which case the range is indicated by a landmark sequence name and +class and its subrange, if any. These arguments may be undef if it is +desired to retrieve all feature types in the database (which may be a +slow operation in some implementations). + +If the $count flag is false, the method returns a simple list of +vBio::DB::GFF::Typename objects. If $count is true, the method returns +a list of $name=E$count pairs, where $count indicates the number of +times this feature occurs in the range. + +Internally, this method calls upon the following functions to generate +the SQL and its bind variables: + + ($q1,@args) = make_types_select_part(@args); + ($q2,@args) = make_types_from_part(@args); + ($q3,@args) = make_types_where_part(@args); + ($q4,@args) = make_types_join_part(@args); + ($q5,@args) = make_types_group_part(@args); + +The components are then combined as follows: + + $query = "SELECT $q1 FROM $q2 WHERE $q3 AND $q4 GROUP BY $q5"; + +If any of the query fragments contain the ? bind variable, then the +same number of bind arguments must be provided in @args. The +fragment-generating functions are described below. + +=cut + +sub get_types { + my $self = shift; + my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_; + my $straight = $self->do_straight_join($srcseq,$start,$stop,[]) ? 'straight_join' : ''; + my ($select,@args1) = $self->make_types_select_part($srcseq,$start,$stop,$want_count,$typelist); + my ($from,@args2) = $self->make_types_from_part($srcseq,$start,$stop,$want_count,$typelist); + my ($join,@args3) = $self->make_types_join_part($srcseq,$start,$stop,$want_count,$typelist); + my ($where,@args4) = $self->make_types_where_part($srcseq,$start,$stop,$want_count,$typelist); + my ($group,@args5) = $self->make_types_group_part($srcseq,$start,$stop,$want_count,$typelist); + + my $query = "SELECT $straight $select FROM $from WHERE $join AND $where"; + $query .= " GROUP BY $group" if $group; + my @args = (@args1,@args2,@args3,@args4,@args5); + my $sth = $self->dbh->do_query($query,@args) or return; + + my (%result,%obj); + while (my ($method,$source,$count) = $sth->fetchrow_array) { + my $type = Bio::DB::GFF::Typename->new($method,$source); + $result{$type} = $count; + $obj{$type} = $type; + } + return $want_count ? %result : values %obj; +} + +=head2 range_query + + Title : range_query + Usage : $db->range_query($range_type,$refseq,$refclass,$start,$stop,$types,$order_by_group,$attributes,$binsize) + Function: create statement handle for range/overlap queries + Returns : a DBI statement handle + Args : see below + Status : Protected + +This method constructs the statement handle for this module's central +query: given a range and/or a list of feature types, fetch their GFF +records. + +The positional arguments are as follows: + + Argument Description + + $isrange A flag indicating that this is a range. + query. Otherwise an overlap query is + assumed. + + $refseq The reference sequence name (undef if no range). + + $refclass The reference sequence class (undef if no range). + + $start The start of the range (undef if none). + + $stop The stop of the range (undef if none). + + $types Array ref containing zero or feature types in the + format [method,source]. + + $order_by_group A flag indicating that statement handler should group + the features by group id (handy for iterative fetches) + + $attributes A hash containing select attributes. + + $binsize A bin size for generating tables of feature density. + +If successful, this method returns a statement handle. The handle is +expected to return the fields described for get_features(). + +Internally, range_query() makes calls to the following methods, +each of which is expected to be overridden in subclasses: + + $select = $self->make_features_select_part; + $from = $self->make_features_from_part; + $join = $self->make_features_join_part; + ($where,@args) = $self->make_features_by_range_where_part($isrange,$srcseq,$class, + $start,$stop,$types,$class); + +The query that is constructed looks like this: + + SELECT $select FROM $from WHERE $join AND $where + +The arguments that are returned from make_features_by_range_where_part() are +passed to the statement handler's execute() method. + +range_query() also calls a do_straight_join() method, described +below. If this method returns true, then the keyword "straight_join" +is inserted right after SELECT. + +=cut + +sub range_query { + my $self = shift; + my($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$bin) = @_; + + my $dbh = $self->features_db; + + # NOTE: straight_join is necessary in some database to force the right index to be used. + my %a = (refseq=>$refseq,class=>$class,start=>$start,stop=>$stop,types=>$types,attributes=>$attributes,bin_width=>$bin); + my $straight = $self->do_straight_join(\%a) ? 'straight_join' : ''; + my $select = $self->make_features_select_part(\%a); + my $from = $self->make_features_from_part($sparse,\%a); + my $join = $self->make_features_join_part(\%a); + my ($where,@args) = $self->make_features_by_range_where_part($rangetype,\%a); + my ($group_by,@more_args) = $self->make_features_group_by_part(\%a); + my $order_by = $self->make_features_order_by_part(\%a) if $order_by_group; + + my $query = "SELECT $straight $select FROM $from WHERE $join"; + $query .= " AND $where" if $where; + if ($group_by) { + $query .= " GROUP BY $group_by"; + push @args,@more_args; + } + $query .= " ORDER BY $order_by" if $order_by; + + my $sth = $self->dbh->do_query($query,@args); + $sth; +} + +=head2 make_features_by_range_where_part + + Title : make_features_by_range_where_part + Usage : ($string,@args) = + $db->make_features_select_part($isrange,$refseq,$class,$start,$stop,$types) + Function: make where part of the features query + Returns : the list ($query,@bind_args) + Args : see below + Status : Protected + +This method creates the part of the features query that immediately +follows the WHERE keyword and is ANDed with the string returned by +make_features_join_part(). + +The six positional arguments are a flag indicating whether to perform +a range search or an overlap search, the reference sequence, class, +start and stop, all of which define an optional range to search in, +and an array reference containing a list [$method,$souce] pairs. + +The method result is a multi-element list containing the query string +and the list of runtime arguments to bind to it with the execute() +method. + +This method's job is to clean up arguments and perform consistency +checking. The real work is done by the following abstract methods: + + Method Description + + refseq_query() Return the query string needed to match the reference + sequence. + + range_query() Return the query string needed to find all features contained + within a range. + + overlap_query() Return the query string needed to find all features that overlap + a range. + +See Bio::DB::Adaptor::dbi::mysql for an example of how this works. + +=cut + +#' + +sub make_features_by_range_where_part { + my $self = shift; + my ($rangetype,$options) = @_; + $options ||= {}; + my ($refseq,$class,$start,$stop,$types,$attributes) = + @{$options}{qw(refseq class start stop types attributes)}; + + my (@query,@args); + + if ($refseq) { + my ($q,@a) = $self->refseq_query($refseq,$class); + push @query,$q; + push @args,@a; + } + + if (defined $start or defined $stop) { + $start = 0 unless defined($start); + $stop = MAX_SEGMENT unless defined($stop); + + my ($range_query,@range_args) = + $rangetype eq 'overlaps' ? $self->overlap_query($start,$stop) + : $rangetype eq 'contains' ? $self->contains_query($start,$stop) + : $rangetype eq 'contained_in' ? $self->contained_in_query($start,$stop) + : (); + + push @query,$range_query; + push @args,@range_args; + } + + if (defined $types && @$types) { + my ($type_query,@type_args) = $self->types_query($types); + push @query,$type_query; + push @args,@type_args; + } + + if ($attributes) { + my ($attribute_query,@attribute_args) = $self->make_features_by_attribute_where_part($attributes); + push @query,"($attribute_query)"; + push @args,@attribute_args; + } + + my $query = join "\n\tAND ",@query; + return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); +} + +=head2 do_straight_join + + Title : do_straight_join + Usage : $boolean = $db->do_straight_join($refseq,$class,$start,$stop,$types) + Function: optimization flag + Returns : a flag + Args : see range_query() + Status : Protected + +This subroutine, called by range_query() returns a boolean flag. +If true, range_query() will perform a straight join, which can be +used to optimize certain SQL queries. The four arguments correspond +to similarly-named arguments passed to range_query(). + +=cut + +sub do_straight_join { 0 } # false by default + +=head2 string_match + + Title : string_match + Usage : $string = $db->string_match($field,$value) + Function: create a SQL fragment for performing exact or regexp string matching + Returns : query string + Args : the table field and match value + Status : public + +This method examines the passed value for meta characters. If so it +produces a SQL fragment that performs a regular expression match. +Otherwise, it produces a fragment that performs an exact string match. + +This method is not used in the module, but is available for use by +subclasses. + +=cut + +sub string_match { + my $self = shift; + my ($field,$value) = @_; + return qq($field = ?) if $value =~ /^[!@%&a-zA-Z0-9_\'\" ~-]+$/; + return qq($field REGEXP ?); +} + +=head2 exact_match + + Title : exact_match + Usage : $string = $db->exact_match($field,$value) + Function: create a SQL fragment for performing exact string matching + Returns : query string + Args : the table field and match value + Status : public + +This method produces the SQL fragment for matching a field name to a +constant string value. + +=cut + +sub exact_match { + my $self = shift; + my ($field,$value) = @_; + return qq($field = ?); +} + +=head2 meta + + Title : meta + Usage : $value = $db->meta($name [,$newval]) + Function: get or set a meta variable + Returns : a string + Args : meta variable name and optionally value + Status : public + +Get or set a named metavariable for the database. Metavariables can +be used for database-specific settings. This method calls two +class-specific methods which must be implemented: + + make_meta_get_query() Returns a sql fragment which given a meta + parameter name, returns its value. One bind + variable. + make_meta_set_query() Returns a sql fragment which takes two bind + arguments, the parameter name and its value + + +Don't make changes unless you know what you're doing! It will affect the +persistent database. + +=cut + +sub meta { + my $self = shift; + my $param_name = uc shift; + + # getting + if (@_) { + my $value = shift; + my $sql = $self->make_meta_set_query() or return; + my $sth = $self->dbh->prepare_delayed($sql) + or $self->error("Can't prepare $sql: ",$self->dbh->errstr), return; + $sth->execute($param_name,$value) + or $self->error("Can't execute $sql: ",$self->dbh->errstr), return; + $sth->finish; + return $self->{meta}{$param_name} = $value; + } + + elsif (exists $self->{meta}{$param_name}) { + return $self->{meta}{$param_name}; + } + + else { + undef $self->{meta}{$param_name}; # so that we don't check again + my $sql = $self->make_meta_get_query() or return; + my $sth = $self->dbh->prepare_delayed($sql) + or $self->error("Can't prepare $sql: ",$self->dbh->errstr), return; + $sth->execute($param_name) + or $self->error("Can't execute $sql: ",$sth->errstr),return; + my ($value) = $sth->fetchrow_array; + $sth->finish; + return $self->{meta}{$param_name} = $value; + } + +} + +=head2 make_meta_get_query + + Title : make_meta_get_query + Usage : $sql = $db->make_meta_get_query + Function: return SQL fragment for getting a meta parameter + Returns : SQL fragment + Args : none + Status : public + +By default this does nothing; meta parameters are not stored or +retrieved. + +=cut + +sub make_meta_get_query { + return 'SELECT fvalue FROM fmeta WHERE fname=?'; +} + + +sub dna_chunk_size { + my $self = shift; + $self->meta('chunk_size') || DNA_CHUNK_SIZE; +} + +=head2 make_meta_set_query + + Title : make_meta_set_query + Usage : $sql = $db->make_meta_set_query + Function: return SQL fragment for setting a meta parameter + Returns : SQL fragment + Args : none + Status : public + +By default this does nothing; meta parameters are not stored or +retrieved. + +=cut + +sub make_meta_set_query { + return; +} + +=head2 default_meta_values + + Title : default_meta_values + Usage : %values = $db->default_meta_values + Function: empty the database + Returns : a list of tag=>value pairs + Args : none + Status : protected + +This method returns a list of tag=Evalue pairs that contain default +meta information about the database. It is invoked by initialize() to +write out the default meta values. The base class version returns an +empty list. + +For things to work properly, meta value names must be UPPERCASE. + +=cut + +sub default_meta_values { + my $self = shift; + my @values = $self->SUPER::default_meta_values; + return ( + @values, + max_bin => MAX_BIN, + min_bin => MIN_BIN, + straight_join_limit => STRAIGHT_JOIN_LIMIT, + chunk_size => DNA_CHUNK_SIZE, + ); +} + +sub min_bin { + my $self = shift; + return $self->meta('min_bin') || MIN_BIN; +} +sub max_bin { + my $self = shift; + return $self->meta('max_bin') || MAX_BIN; +} + +sub straight_join_limit { + my $self = shift; + return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT; +} + +=head2 get_features_iterator + + Title : get_features_iterator + Usage : $iterator = $db->get_features_iterator($search,$options,$callback) + Function: create an iterator on a features() query + Returns : A Bio::DB::GFF::Adaptor::dbi::iterator object + Args : see get_features() + Status : public + +This method is similar to get_features(), except that it returns an +iterator across the query. See +L. + +=cut + +sub get_features_iterator { + my $self = shift; + my ($search,$options,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + my $sth = $self->range_query(@{$search}{qw(rangetype + refseq + refclass + start + stop + types)}, + @{$options}{qw( + sparse + sort_by_group + ATTRIBUTES + BINSIZE)}) or return; + return Bio::DB::GFF::Adaptor::dbi::iterator->new($sth,$callback); +} + +########################## loading and initialization ##################### + +=head2 do_initialize + + Title : do_initialize + Usage : $success = $db->do_initialize($drop_all) + Function: initialize the database + Returns : a boolean indicating the success of the operation + Args : a boolean indicating whether to delete existing data + Status : protected + +This method will load the schema into the database. If $drop_all is +true, then any existing data in the tables known to the schema will be +deleted. + +Internally, this method calls schema() to get the schema data. + +=cut + +# Create the schema from scratch. +# You will need create privileges for this. +sub do_initialize { + #shift->throw("do_initialize(): must be implemented by subclass"); + my $self = shift; + my $erase = shift; + $self->drop_all if $erase; + + my $dbh = $self->features_db; + my $schema = $self->schema; + foreach my $table_name ($self->tables) { + my $create_table_stmt = $schema->{$table_name}{table} ; + $dbh->do($create_table_stmt) || warn $dbh->errstr; + $self->create_other_schema_objects(\%{$schema->{$table_name}}); + } + + 1; +} + +=head2 finish_load + + Title : finish_load + Usage : $db->finish_load + Function: called after load_gff_line() + Returns : number of records loaded + Args : none + Status : protected + +This method performs schema-specific cleanup after loading a set of +GFF records. It finishes each of the statement handlers prepared by +setup_load(). + +=cut + +sub finish_load { + my $self = shift; + + my $dbh = $self->features_db or return; + $dbh->do('UNLOCK TABLES') if $self->lock_on_load; + + foreach (keys %{$self->{load_stuff}{sth}}) { + $self->{load_stuff}{sth}{$_}->finish; + } + + my $counter = $self->{load_stuff}{counter}; + delete $self->{load_stuff}; + return $counter; +} + + +=head2 create_other_schema_objects + + Title : create_other_schema_objects + Usage : $self->create_other_schema_objects($table_name) + Function: create other schema objects like : indexes, sequences, triggers + Returns : + Args : + Status : Abstract + +=cut + +sub create_other_schema_objects{ + #shift->throw("create_other_schema_objects(): must be implemented by subclass"); + my $self = shift ; + my $table_schema = shift ; + my $dbh = $self->features_db; + foreach my $object_type(keys %$table_schema){ + if ($object_type !~ /table/) { + foreach my $object_name(keys %{$table_schema->{$object_type}}){ + my $create_object_stmt = $table_schema->{$object_type}{$object_name}; + $dbh->do($create_object_stmt) || warn $dbh->errstr; + } + } + } + 1; +} + +=head2 drop_all + + Title : drop_all + Usage : $db->drop_all + Function: empty the database + Returns : void + Args : none + Status : protected + +This method drops the tables known to this module. Internally it +calls the abstract tables() method. + +=cut + +# Drop all the GFF tables -- dangerous! +sub drop_all { + #shift->throw("drop_all(): must be implemented by subclass"); + my $self = shift; + my $dbh = $self->features_db; + my $schema = $self->schema; + + local $dbh->{PrintError} = 0; + foreach ($self->tables) { + $dbh->do("drop table $_") || warn $dbh->errstr; + + #when dropping a table - the indexes and triggers are being dropped automatically + # sequences needs to be dropped - if there are any (Oracle, PostgreSQL) + if ($schema->{$_}{sequence}){ + foreach my $sequence_name(keys %{$schema->{$_}{sequence}}) { + $dbh->do("drop sequence $sequence_name"); + } + } + + #$self->drop_other_schema_objects($_); + + } +} + + +=head1 QUERIES TO IMPLEMENT + +The following astract methods either return DBI statement handles or +fragments of SQL. They must be implemented by subclasses of this +module. See Bio::DB::GFF::Adaptor::dbi::mysql for examples. + + + + +=head2 drop_other_schema_objects + + Title : drop_other_schema_objects + Usage : $self->create_other_schema_objects($table_name) + Function: create other schema objects like : indexes, sequences, triggers + Returns : + Args : + Status : Abstract + + +=cut + +sub drop_other_schema_objects{ + #shift->throw("drop_other_schema_objects(): must be implemented by subclass"); + +} + +=head2 make_features_select_part + + Title : make_features_select_part + Usage : $string = $db->make_features_select_part() + Function: make select part of the features query + Returns : a string + Args : none + Status : Abstract + +This abstract method creates the part of the features query that +immediately follows the SELECT keyword. + +=cut + +sub make_features_select_part { + shift->throw("make_features_select_part(): must be implemented by subclass"); +} + +=head2 tables + + Title : tables + Usage : @tables = $db->tables + Function: return list of tables that belong to this module + Returns : list of tables + Args : none + Status : protected + +This method lists the tables known to the module. + +=cut + +# return list of tables that "belong" to us. +sub tables { + my $schema = shift->schema; + return keys %$schema; +} + +=head2 schema + + Title : schema + Usage : $schema = $db->schema + Function: return the CREATE script for the schema + Returns : a hashref + Args : none + Status : abstract + +This method returns an array ref containing the various CREATE +statements needed to initialize the database tables. The keys are the +table names, and the values are strings containing the appropriate +CREATE statement. + +=cut + +sub schema { + shift->throw("The schema() method must be implemented by subclass"); +} + +=head2 DESTROY + + Title : DESTROY + Usage : $db->DESTROY + Function: disconnect database at destruct time + Returns : void + Args : none + Status : protected + +This is the destructor for the class. + +=cut + +sub DESTROY { + my $self = shift; + $self->features_db->disconnect if defined $self->features_db; +} + +################## query cache ################## + + +######################################### +## Moved from mysql.pm and mysqlopt.pm ## +######################################### + +=head2 make_features_by_name_where_part + + Title : make_features_by_name_where_part + Usage : $db->make_features_by_name_where_part + Function: create the SQL fragment needed to select a feature by its group name & class + Returns : a SQL fragment and bind arguments + Args : see below + Status : Protected + +=cut + +sub make_features_by_name_where_part { + my $self = shift; + my ($class,$name) = @_; + if ($name =~ /\*/) { + $name =~ s/\*/%/g; + return ("fgroup.gclass=? AND fgroup.gname LIKE ?",$class,$name); + } else { + return ("fgroup.gclass=? AND fgroup.gname=?",$class,$name); + } +} + +sub make_features_by_attribute_where_part { + my $self = shift; + my $attributes = shift; + my @args; + my @sql; + foreach (keys %$attributes) { + push @sql,"(fattribute.fattribute_name=? AND fattribute_to_feature.fattribute_value=?)"; + push @args,($_,$attributes->{$_}); + } + return (join(' OR ',@sql),@args); +} + +=head2 make_features_by_id_where_part + + Title : make_features_by_id_where_part + Usage : $db->make_features_by_id_where_part($ids) + Function: create the SQL fragment needed to select a set of features by their ids + Returns : a SQL fragment and bind arguments + Args : arrayref of IDs + Status : Protected + +=cut + +sub make_features_by_id_where_part { + my $self = shift; + my $ids = shift; + my $set = join ",",@$ids; + return ("fdata.fid IN ($set)"); +} + +=head2 make_features_by_gid_where_part + + Title : make_features_by_id_where_part + Usage : $db->make_features_by_gid_where_part($ids) + Function: create the SQL fragment needed to select a set of features by their ids + Returns : a SQL fragment and bind arguments + Args : arrayref of IDs + Status : Protected + +=cut + +sub make_features_by_gid_where_part { + my $self = shift; + my $ids = shift; + my $set = join ",",@$ids; + return ("fgroup.gid IN ($set)"); +} + + +=head2 make_features_from_part + + Title : make_features_from_part + Usage : $string = $db->make_features_from_part() + Function: make from part of the features query + Returns : a string + Args : none + Status : protected + +This method creates the part of the features query that immediately +follows the FROM keyword. + +=cut + +sub make_features_from_part { + my $self = shift; + my $sparse = shift; + my $options = shift || {}; + return $options->{attributes} ? "fdata,ftype,fgroup,fattribute,fattribute_to_feature\n" + : "fdata,ftype,fgroup\n"; +} + + +=head2 make_features_join_part + + Title : make_features_join_part + Usage : $string = $db->make_features_join_part() + Function: make join part of the features query + Returns : a string + Args : none + Status : protected + +This method creates the part of the features query that immediately +follows the WHERE keyword. + +=cut + +sub make_features_join_part { + my $self = shift; + my $options = shift || {}; + return !$options->{attributes} ? <make_features_order_by_part() + Function: make the ORDER BY part of the features() query + Returns : a SQL fragment and bind arguments, if any + Args : none + Status : protected + +This method creates the part of the features query that immediately +follows the ORDER BY part of the query issued by features() and +related methods. + +=cut + +sub make_features_order_by_part { + my $self = shift; + my $options = shift || {}; + return "fgroup.gname"; +} + +=head2 make_features_group_by_part + + Title : make_features_group_by_part + Usage : ($query,@args) = $db->make_features_group_by_part() + Function: make the GROUP BY part of the features() query + Returns : a SQL fragment and bind arguments, if any + Args : none + Status : protected + +This method creates the part of the features query that immediately +follows the GROUP BY part of the query issued by features() and +related methods. + +=cut + +sub make_features_group_by_part { + my $self = shift; + my $options = shift || {}; + my $att = $options->{attributes} or return; + my $key_count = keys %$att; + return unless $key_count > 1; + #return ("fdata.fid having count(fdata.fid) > ?",$key_count-1); + return ("fref,fstart,fstop,fsource,fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start,ftarget_stop,fdata.fid,fdata.gid having count(fdata.fid) > ?",$key_count-1); + + if (my $att = $options->{attributes}) { + my $key_count = keys %$att; + return unless $key_count > 1; + return ("fdata.fid,fref,fstart,fstop,fsource, + fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start, + ftarget_stop,fdata.gid + HAVING count(fdata.fid) > ?",$key_count-1); + } + elsif (my $b = $options->{bin_width}) { + return "fref,fstart,fdata.ftypeid"; + } + +} + +=head2 refseq_query + + Title : refseq_query + Usage : ($query,@args) = $db->refseq_query($name,$class) + Function: create SQL fragment that selects the desired reference sequence + Returns : a list containing the query and bind arguments + Args : reference sequence name and class + Status : protected + +This method is called by make_features_by_range_where_part() to +construct the part of the select WHERE section that selects a +particular reference sequence. It returns a mult-element list in +which the first element is the SQL fragment and subsequent elements +are bind values. + +For example: + + sub refseq_query { + my ($name,$class) = @_; + return ('gff.refseq=? AND gff.refclass=?', + $name,$class); + } + +The current schema does not distinguish among different classes of +reference sequence. + +=cut + +# IMPORTANT NOTE: THE MYSQL SCHEMA IGNORES THE SEQUENCE CLASS +# THIS SHOULD BE FIXED +sub refseq_query { + my $self = shift; + my ($refseq,$refclass) = @_; + my $query = "fdata.fref=?"; + return wantarray ? ($query,$refseq) : $self->dbh->dbi_quote($query,$refseq); +} + +=head2 attributes + + Title : attributes + Usage : @attributes = $db->attributes($id,$name) + Function: get the attributes on a particular feature + Returns : an array of string + Args : feature ID + Status : public + +Some GFF version 2 files use the groups column to store a series of +attribute/value pairs. In this interpretation of GFF, the first such +pair is treated as the primary group for the feature; subsequent pairs +are treated as attributes. Two attributes have special meaning: +"Note" is for backward compatibility and is used for unstructured text +remarks. "Alias" is considered as a synonym for the feature name. + +If no name is provided, then attributes() returns a flattened hash, of +attribute=Evalue pairs. This lets you do: + + %attributes = $db->attributes($id); + +Normally, attributes() will be called by the feature: + + @notes = $feature->attributes('Note'); + +=cut + +sub do_attributes { + my $self = shift; + my ($id,$tag) = @_; + my $from = 'fattribute_to_feature,fattribute'; + my $join = 'fattribute.fattribute_id=fattribute_to_feature.fattribute_id'; + my $where1 = 'fid=? AND fattribute_name=?'; + my $where2 = 'fid=?'; + my $sth = defined($tag) ? $self->dbh->do_query("SELECT fattribute_value FROM $from WHERE $where1 AND $join",$id,$tag) + : $self->dbh->do_query("SELECT fattribute_name,fattribute_value FROM $from WHERE $where2 AND $join",$id); + my @result; + while (my @stuff = $sth->fetchrow_array) { + push @result,@stuff; + } + $sth->finish; + return @result; +} + + + +=head2 overlap_query_nobin + + Title : overlap_query + Usage : ($query,@args) = $db->overlap_query($start,$stop) + Function: create SQL fragment that selects the desired features by range + Returns : a list containing the query and bind arguments + Args : the start and stop of a range, inclusive + Status : protected + +This method is called by make_features_byrange_where_part() to construct the +part of the select WHERE section that selects a set of features that +overlap a range. It returns a multi-element list in which the first +element is the SQL fragment and subsequent elements are bind values. + + +sub overlap_query_nobin { + my ($start,$stop) = @_; + return ('gff.stopE=? AND gff.startE=?', + $start,$stop); + +=cut + +# find features that overlap a given range +sub overlap_query_nobin { + my $self = shift; + my ($start,$stop) = @_; + + my $query = qq(fdata.fstop>=? AND fdata.fstart<=?); + return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop); +} + +=head2 contains_query_nobin + + Title : contains_query + Usage : ($query,@args) = $db->contains_query($start,$stop) + Function: create SQL fragment that selects the desired features by range + Returns : a list containing the query and bind arguments + Args : the start and stop of a range, inclusive + Status : protected + +This method is called by make_features_byrange_where_part() to construct the +part of the select WHERE section that selects a set of features +entirely enclosed by a range. It returns a multi-element list in which +the first element is the SQL fragment and subsequent elements are bind +values. For example: + + sub contains_query_nobin { + my ($start,$stop) = @_; + return ('gff.start>=? AND gff.stop<=?', + $start,$stop); + +=cut + +# find features that are completely contained within a range +sub contains_query_nobin { + my $self = shift; + my ($start,$stop) = @_; + my $query = qq(fdata.fstart>=? AND fdata.fstop<=?); + return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop); +} + +=head2 contained_in_query_nobin + + Title : contained_in_query_nobin + Usage : ($query,@args) = $db->contained_in_query($start,$stop) + Function: create SQL fragment that selects the desired features by range + Returns : a list containing the query and bind arguments + Args : the start and stop of a range, inclusive + Status : protected + +This method is called by make_features_byrange_where_part() to construct the +part of the select WHERE section that selects a set of features +entirely enclosed by a range. It returns a multi-element list in which +the first element is the SQL fragment and subsequent elements are bind +values.For example: + + sub contained_in_query_nobin { + my ($start,$stop) = @_; + return ('gff.start<=? AND gff.stop>=?', + $start,$stop); + } + +=cut + +# find features that are completely contained within a range +sub contained_in_query_nobin { + my $self = shift; + my ($start,$stop) = @_; + my $query = qq(fdata.fstart<=? AND fdata.fstop>=?); + return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop); +} + +=head2 types_query + + Title : types_query + Usage : ($query,@args) = $db->types_query($types) + Function: create SQL fragment that selects the desired features by type + Returns : a list containing the query and bind arguments + Args : an array reference containing the types + Status : protected + +This method is called by make_features_byrange_where_part() to construct the +part of the select WHERE section that selects a set of features based +on their type. It returns a multi-element list in which the first +element is the SQL fragment and subsequent elements are bind values. +The argument is an array reference containing zero or more +[$method,$source] pairs. + +=cut + +# generate the fragment of SQL responsible for searching for +# features with particular types and methods +sub types_query { + my $self = shift; + my $types = shift; + + my @method_queries; + my @args; + for my $type (@$types) { + my ($method,$source) = @$type; + my $meth_query = $self->exact_match('fmethod',$method) if defined $method && length $method; + my $src_query = $self->exact_match('fsource',$source) if defined $source && length $source; + my @pair; + if (defined $method && length $method) { + push @pair,$self->exact_match('fmethod',$method); + push @args,$method; + } + if (defined $source && length $source) { + push @pair,$self->exact_match('fsource',$source); + push @args,$source; + } + push @method_queries,"(" . join(' AND ',@pair) .")" if @pair; + } + my $query = " (".join(' OR ',@method_queries).")\n" if @method_queries; + return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); +} + +=head2 make_types_select_part + + Title : make_types_select_part + Usage : ($string,@args) = $db->make_types_select_part(@args) + Function: create the select portion of the SQL for fetching features type list + Returns : query string and bind arguments + Args : see below + Status : protected + +This method is called by get_types() to generate the query fragment +and bind arguments for the SELECT part of the query that retrieves +lists of feature types. The four positional arguments are as follows: + + $refseq reference sequence name + $start start of region + $stop end of region + $want_count true to return the count of this feature type + +If $want_count is false, the SQL fragment returned must produce a list +of feature types in the format (method, source). + +If $want_count is true, the returned fragment must produce a list of +feature types in the format (method, source, count). + +=cut + +#------------------------- support for the types() query ------------------------ +sub make_types_select_part { + my $self = shift; + my ($srcseq,$start,$stop,$want_count) = @_; + my $query = $want_count ? 'ftype.fmethod,ftype.fsource,count(fdata.ftypeid)' + : 'fmethod,fsource'; + return $query; +} + +=head2 make_types_from_part + + Title : make_types_from_part + Usage : ($string,@args) = $db->make_types_from_part(@args) + Function: create the FROM portion of the SQL for fetching features type lists + Returns : query string and bind arguments + Args : see below + Status : protected + +This method is called by get_types() to generate the query fragment +and bind arguments for the FROM part of the query that retrieves lists +of feature types. The four positional arguments are as follows: + + $refseq reference sequence name + $start start of region + $stop end of region + $want_count true to return the count of this feature type + +If $want_count is false, the SQL fragment returned must produce a list +of feature types in the format (method, source). + +If $want_count is true, the returned fragment must produce a list of +feature types in the format (method, source, count). + +=cut + +sub make_types_from_part { + my $self = shift; + my ($srcseq,$start,$stop,$want_count) = @_; + my $query = defined($srcseq) || $want_count ? 'fdata,ftype' : 'ftype'; + return $query; +} + +=head2 make_types_join_part + + Title : make_types_join_part + Usage : ($string,@args) = $db->make_types_join_part(@args) + Function: create the JOIN portion of the SQL for fetching features type lists + Returns : query string and bind arguments + Args : see below + Status : protected + +This method is called by get_types() to generate the query fragment +and bind arguments for the JOIN part of the query that retrieves lists +of feature types. The four positional arguments are as follows: + + $refseq reference sequence name + $start start of region + $stop end of region + $want_count true to return the count of this feature type + +=cut + +sub make_types_join_part { + my $self = shift; + my ($srcseq,$start,$stop,$want_count) = @_; + my $query = defined($srcseq) || $want_count ? 'fdata.ftypeid=ftype.ftypeid' + : ''; + return $query || '1=1'; +} + +=head2 make_types_where_part + + Title : make_types_where_part + Usage : ($string,@args) = $db->make_types_where_part(@args) + Function: create the WHERE portion of the SQL for fetching features type lists + Returns : query string and bind arguments + Args : see below + Status : protected + +This method is called by get_types() to generate the query fragment +and bind arguments for the WHERE part of the query that retrieves +lists of feature types. The four positional arguments are as follows: + + $refseq reference sequence name + $start start of region + $stop end of region + $want_count true to return the count of this feature type + +=cut + +sub make_types_where_part { + my $self = shift; + my ($srcseq,$start,$stop,$want_count,$typelist) = @_; + my (@query,@args); + if (defined($srcseq)) { + push @query,'fdata.fref=?'; + push @args,$srcseq; + if (defined $start or defined $stop) { + $start = 1 unless defined $start; + $stop = MAX_SEGMENT unless defined $stop; + my ($q,@a) = $self->overlap_query($start,$stop); + push @query,"($q)"; + push @args,@a; + } + } + if (defined $typelist && @$typelist) { + my ($q,@a) = $self->types_query($typelist); + push @query,($q); + push @args,@a; + } + my $query = @query ? join(' AND ',@query) : '1=1'; + return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); +} + +=head2 make_types_group_part + + Title : make_types_group_part + Usage : ($string,@args) = $db->make_types_group_part(@args) + Function: create the GROUP BY portion of the SQL for fetching features type lists + Returns : query string and bind arguments + Args : see below + Status : protected + +This method is called by get_types() to generate the query fragment +and bind arguments for the GROUP BY part of the query that retrieves +lists of feature types. The four positional arguments are as follows: + + $refseq reference sequence name + $start start of region + $stop end of region + $want_count true to return the count of this feature type + +=cut + +sub make_types_group_part { + my $self = shift; + my ($srcseq,$start,$stop,$want_count) = @_; + return unless $srcseq or $want_count; + return 'ftype.ftypeid,ftype.fmethod,ftype.fsource'; +} + + +=head2 get_feature_id + + Title : get_feature_id + Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid) + Function: get the ID of a feature + Returns : an integer ID or undef + Args : none + Status : private + +This internal method is called by load_gff_line to look up the integer +ID of an existing feature. It is ony needed when replacing a feature +with new information. + +=cut + +# this method is called when needed to look up a feature's ID +sub get_feature_id { + my $self = shift; + my ($ref,$start,$stop,$typeid,$groupid) = @_; + my $s = $self->{load_stuff}; + unless ($s->{get_feature_id}) { + my $dbh = $self->features_db; + $s->{get_feature_id} = + $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? AND gid=?'); + } + my $sth = $s->{get_feature_id} or return; + $sth->execute($ref,$start,$stop,$typeid,$groupid) or return; + my ($fid) = $sth->fetchrow_array; + return $fid; +} + + + +=head2 make_abscoord_query + + Title : make_abscoord_query + Usage : $sth = $db->make_abscoord_query($name,$class); + Function: create query that finds the reference sequence coordinates given a landmark & classa + Returns : a DBI statement handle + Args : name and class of landmark + Status : protected + +The statement handler should return rows containing five fields: + + 1. reference sequence name + 2. reference sequence class + 3. start position + 4. stop position + 5. strand ("+" or "-") + +This query always returns "Sequence" as the class of the reference +sequence. + +=cut + +# given sequence name, return (reference,start,stop,strand) +sub make_abscoord_query { + my $self = shift; + my ($name,$class,$refseq) = @_; + #my $query = GETSEQCOORDS; + my $query = $self->getseqcoords_query(); + my $getforcedseqcoords = $self->getforcedseqcoords_query() ; + if ($name =~ /\*/) { + $name =~ tr/*/%/; + $query =~ s/gname=\?/gname LIKE ?/; + } + defined $refseq ? $self->dbh->do_query($getforcedseqcoords,$name,$class,$refseq) + : $self->dbh->do_query($query,$name,$class); +} + +sub make_aliasabscoord_query { + my $self = shift; + my ($name,$class) = @_; + #my $query = GETALIASCOORDS; + my $query = $self->getaliascoords_query(); + if ($name =~ /\*/) { + $name =~ tr/*/%/; + $query =~ s/gname=\?/gname LIKE ?/; + } + $self->dbh->do_query($query,$name,$class); +} + +sub getseqcoords_query { + shift->throw("getseqcoords_query(): must be implemented by a subclass"); +} + +sub getaliascoords_query { + shift->throw("getaliascoords_query(): must be implemented by a subclass"); +} + +sub bin_query { + my $self = shift; + my ($start,$stop,$minbin,$maxbin) = @_; + my ($query,@args); + + $start = 0 unless defined($start); + $stop = $self->meta('max_bin') unless defined($stop); + + my @bins; + $minbin = defined $minbin ? $minbin : $self->min_bin; + $maxbin = defined $maxbin ? $maxbin : $self->max_bin; + my $tier = $maxbin; + while ($tier >= $minbin) { + my ($tier_start,$tier_stop) = (bin_bot($tier,$start),bin_top($tier,$stop)); + if ($tier_start == $tier_stop) { + push @bins,'fbin=?'; + push @args,$tier_start; + } else { + push @bins,'fbin between ? and ?'; + push @args,($tier_start,$tier_stop); + } + $tier /= 10; + } + $query = join("\n\t OR ",@bins); + return wantarray ? ($query,@args) + : $self->dbh->dbi_quote($query,@args); +} + +# find features that overlap a given range +sub overlap_query { + my $self = shift; + my ($start,$stop) = @_; + + my ($bq,@bargs) = $self->bin_query($start,$stop); + my ($iq,@iargs) = $self->overlap_query_nobin($start,$stop); + my $query = "($bq)\n\tAND $iq"; + my @args = (@bargs,@iargs); + + return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); +} + +# find features that are completely contained within a range +sub contains_query { + my $self = shift; + my ($start,$stop) = @_; + my ($bq,@bargs) = $self->bin_query($start,$stop,undef,bin($start,$stop,$self->min_bin)); + my ($iq,@iargs) = $self->contains_query_nobin($start,$stop); + my $query = "($bq)\n\tAND $iq"; + my @args = (@bargs,@iargs); + return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); +} + +# find features that are completely contained within a range +sub contained_in_query { + my $self = shift; + my ($start,$stop) = @_; + my ($bq,@bargs) = $self->bin_query($start,$stop,abs($stop-$start)+1,undef); + my ($iq,@iargs) = $self->contained_in_query_nobin($start,$stop); + my $query = "($bq)\n\tAND $iq"; + my @args = (@bargs,@iargs); + return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); +} + +1; + +__END__ + +=head1 BUGS + +Schemas need work to support multiple hierarchical groups. + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,254 @@ +package Bio::DB::GFF::Adaptor::dbi::caching_handle; + +use strict; +use DBI; +use Bio::Root::Root; +use vars '$AUTOLOAD','@ISA'; +@ISA = qw(Bio::Root::Root); + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::caching_handle -- Cache for database handles + +=head1 SYNOPSIS + + use Bio::DB::GFF::Adaptor::dbi::caching_handle; + $db = Bio::DB::GFF::Adaptor::dbi::caching_handle->new('dbi:mysql:test'); + $sth = $db->prepare('select * from foo'); + @h = $sth->fetch_rowarray; + $sth->finish + +=head1 DESCRIPTION + +This module handles a pool of database handles. It was motivated by +the MYSQL driver's {mysql_use_result} attribute, which dramatically +improves query speed and memory usage, but forbids additional query +statements from being evaluated while an existing one is in use. + +This module is a plug-in replacement for vanilla DBI. It +automatically activates the {mysql_use_result} attribute for the mysql +driver, but avoids problems with multiple active statement handlers by +creating new database handles as needed. + +=head1 USAGE + +The object constructor is +Bio::DB::GFF::Adaptor::dbi::caching_handle-Enew(). This is called +like DBI-Econnect() and takes the same arguments. The returned object +looks and acts like a conventional database handle. + +In addition to all the standard DBI handle methods, this package adds +the following: + +=head2 dbi_quote + + Title : dbi_quote + Usage : $string = $db->dbi_quote($sql,@args) + Function: perform bind variable substitution + Returns : query string + Args : the query string and bind arguments + Status : public + +This method replaces the bind variable "?" in a SQL statement with +appropriately quoted bind arguments. It is used internally to handle +drivers that don't support argument binding. + +=head2 do_query + + Title : do_query + Usage : $sth = $db->do_query($query,@args) + Function: perform a DBI query + Returns : a statement handler + Args : query string and list of bind arguments + Status : Public + +This method performs a DBI prepare() and execute(), returning a +statement handle. You will typically call fetch() of fetchrow_array() +on the statement handle. The parsed statement handle is cached for +later use. + +=head2 debug + + Title : debug + Usage : $debug = $db->debug([$debug]) + Function: activate debugging messages + Returns : current state of flag + Args : optional new setting of flag + Status : public + +=cut + +sub new { + my $class = shift; + my @dbi_args = @_; + my $self = bless { + dbh => [], + args => \@dbi_args, + debug => 0, + },$class; + $self->dbh || $self->throw("Can't connect to database: " . DBI->errstr); + $self; +} + +sub AUTOLOAD { + my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; + return if $func_name eq 'DESTROY'; + my $self = shift or return DBI->$func_name(@_); + $self->dbh->$func_name(@_); +} + +sub debug { + my $self = shift; + my $d = $self->{debug}; + $self->{debug} = shift if @_; + $d; +} + +sub prepare { + my $self = shift; + my $query = shift; + + # find a non-busy dbh + my $dbh = $self->dbh || $self->throw("Can't connect to database: " . DBI->errstr); + if (my $sth = $self->{$dbh}{$query}) { + warn "Using cached statement handler\n" if $self->debug; + return $sth; + } else { + warn "Creating new statement handler\n" if $self->debug; + $sth = $dbh->prepare($query) || $self->throw("Couldn't prepare query $query:\n ".DBI->errstr."\n"); + return $self->{$dbh}{$query} = $sth; + } +} + +sub do_query { + my $self = shift; + my ($query,@args) = @_; + warn $self->dbi_quote($query,@args),"\n" if $self->debug; + my $sth = $self->prepare($query); + $sth->execute(@args) || $self->throw("Couldn't execute query $query:\n ".DBI->errstr."\n"); + $sth; +} + +sub dbh { + my $self = shift; + foreach (@{$self->{dbh}}) { + return $_ if $_->inuse == 0; + } + # if we get here, we must create a new one + warn "(Re)connecting to database\n" if $self->debug; + my $dbh = DBI->connect(@{$self->{args}}) or return; + + $dbh->{PrintError} = 0; + + # for Oracle - to retrieve LOBs, need to define the length (Jul 15, 2002) + $dbh->{LongReadLen} = 100*65535; + $dbh->{LongTruncOk} = 0; + + my $wrapper = Bio::DB::GFF::Adaptor::dbi::faux_dbh->new($dbh); + push @{$self->{dbh}},$wrapper; + $wrapper; +} + +=head2 attribute + + Title : attribute + Usage : $value = $db->attribute(AttributeName , [$newvalue]) + Function: get/set DBI::db handle attribute + Returns : current state of the attribute + Args : name of the attribute and optional new setting of attribute + Status : public + + Under Bio::DB::GFF::Adaptor::dbi::caching_handle the DBI::db + attributes that are usually set using hashref calls are unavailable. + Use attribute() instead. For example, instead of: + + $dbh->{AutoCommit} = 0; + + use + + $dbh->attribute(AutoCommit=>0); + +=cut + +sub attribute { + my $self = shift; + my $dbh = $self->dbh->{dbh}; + return $dbh->{$_[0]} = $_[1] if @_ == 2; + return $dbh->{$_[0]} if @_ == 1; + return; +} + +sub disconnect { + my $self = shift; + $_ && $_->disconnect foreach @{$self->{dbh}}; + $self->{dbh} = []; +} + +sub dbi_quote { + my $self = shift; + my ($query,@args) = @_; + my $dbh = $self->dbh; + $query =~ s/\?/$dbh->quote(shift @args)/eg; + $query; +} + +package Bio::DB::GFF::Adaptor::dbi::faux_dbh; +use vars '$AUTOLOAD'; + +sub new { + my $class = shift; + my $dbh = shift; + bless {dbh=>$dbh},$class; +} + +sub prepare { + my $self = shift; + my $sth = $self->{dbh}->prepare(@_) or return; + $sth->{mysql_use_result} = 1 if $self->{dbh}->{Driver}{Name} eq 'mysql'; + $sth; +} + +sub prepare_delayed { + my $self = shift; + my $sth = $self->{dbh}->prepare(@_) or return; + $sth; +} + +sub inuse { + shift->{dbh}->{ActiveKids}; +} + +sub DESTROY { } + +sub AUTOLOAD { + my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; + return if $func_name eq 'DESTROY'; + my $self = shift; + if( defined $self->{dbh} ) { + $self->{dbh}->$func_name(@_); + } +} + +1; + +__END__ + +=head1 BUGS + +Report to the author. + +=head1 SEE ALSO + +L, L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/iterator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/iterator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,73 @@ +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::iterator - iterator for Bio::DB::GFF::Adaptor::dbi + +=head1 SYNOPSIS + +For internal use only + +=head1 DESCRIPTION + +This is an internal module that is used by the Bio::DB::GFF DBI +adaptor to return an iterator across a sequence feature query. The +object has a single method, next_feature(), that returns the next +feature from the query. The method next_seq() is an alias for +next_feature(). + +=head1 BUGS + +None known yet. + +=head1 SEE ALSO + +L, + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +package Bio::DB::GFF::Adaptor::dbi::iterator; +use strict; + +use constant STH => 0; +use constant CALLBACK => 1; +use constant CACHE => 2; + +*next_seq = \&next_feature; + +sub new { + my $class = shift; + my ($sth,$callback) = @_; + return bless [$sth,$callback,[]],$class; +} + +sub next_feature { + my $self = shift; + return shift @{$self->[CACHE]} if @{$self->[CACHE]}; + my $sth = $self->[STH] or return; + my $callback = $self->[CALLBACK]; + + my $features; + while (1) { + if (my @row = $sth->fetchrow_array) { + $features = $callback->(@row); + last if $features; + } else { + $sth->finish; + undef $self->[STH]; + $features = $callback->(); + last; + } + } + $self->[CACHE] = $features or return; + shift @{$self->[CACHE]}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/mysql.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/mysql.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,847 @@ +package Bio::DB::GFF::Adaptor::dbi::mysql; + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::mysql -- Database adaptor for a specific mysql schema + +=head1 SYNOPSIS + +See L + +=cut + +# a simple mysql adaptor +use strict; +use Bio::DB::GFF::Adaptor::dbi; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() +use Bio::DB::GFF::Util::Binning; +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Adaptor::dbi); + +use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get + +use constant GETSEQCOORDS =><<<< < select * from fgroup where gname='sjj_2L52.1'; + +-------+-------------+------------+ + | gid | gclass | gname | + +-------+-------------+------------+ + | 69736 | PCR_product | sjj_2L52.1 | + +-------+-------------+------------+ + 1 row in set (0.70 sec) + + mysql> select fref,fstart,fstop from fdata,fgroup + where gclass='PCR_product' and gname = 'sjj_2L52.1' + and fdata.gid=fgroup.gid; + +---------------+--------+-------+ + | fref | fstart | fstop | + +---------------+--------+-------+ + | CHROMOSOME_II | 1586 | 2355 | + +---------------+--------+-------+ + 1 row in set (0.03 sec) + +=item ftype + +This table contains the feature types, one per row. Columns are: + + ftypeid the feature type ID (integer) + fmethod the feature type method name (string) + fsource the feature type source name (string) + +The ftype.ftypeid field joins with the fdata.ftypeid field. Example: + + mysql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype + where gclass='PCR_product' + and gname = 'sjj_2L52.1' + and fdata.gid=fgroup.gid + and fdata.ftypeid=ftype.ftypeid; + +---------------+--------+-------+-------------+-----------+ + | fref | fstart | fstop | fmethod | fsource | + +---------------+--------+-------+-------------+-----------+ + | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs | + +---------------+--------+-------+-------------+-----------+ + 1 row in set (0.08 sec) + +=item fdna + +This table holds the raw DNA of the reference sequences. It has three +columns: + + fref reference sequence name (string) + foffset offset of this sequence + fdna the DNA sequence (longblob) + +To overcome problems loading large blobs, DNA is automatically +fragmented into multiple segments when loading, and the position of +each segment is stored in foffset. The fragment size is controlled by +the -clump_size argument during initialization. + +=item fattribute_to_feature + +This table holds "attributes", which are tag/value pairs stuffed into +the GFF line. The first tag/value pair is treated as the group, and +anything else is treated as an attribute (weird, huh?). + + CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B" + CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B" + +The columns of this table are: + + fid feature ID (integer) + fattribute_id ID of the attribute (integer) + fattribute_value text of the attribute (text) + +The fdata.fid column joins with fattribute_to_feature.fid. + +=item fattribute + +This table holds the normalized names of the attributes. Fields are: + + fattribute_id ID of the attribute (integer) + fattribute_name Name of the attribute (varchar) + +=back + +=head2 Data Loading Methods + +In addition to implementing the abstract SQL-generating methods of +Bio::DB::GFF::Adaptor::dbi, this module also implements the data +loading functionality of Bio::DB::GFF. + +=cut + + +=head2 new + + Title : new + Usage : $db = Bio::DB::GFF->new(@args) + Function: create a new adaptor + Returns : a Bio::DB::GFF object + Args : see below + Status : Public + +The new constructor is identical to the "dbi" adaptor's new() method, +except that the prefix "dbi:mysql" is added to the database DSN identifier +automatically if it is not there already. + + Argument Description + -------- ----------- + + -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' or "ens0040" + + -user username for authentication + + -pass the password for authentication + +=cut + +#' + +sub new { + my $class = shift; + my ($dsn,$other) = rearrange([ + [qw(FEATUREDB DB DSN)], + ],@_); + $dsn = "dbi:mysql:$dsn" if !ref($dsn) && $dsn !~ /^(?:dbi|DBI):/; + my $self = $class->SUPER::new(-dsn=>$dsn,%$other); + $self; +} + +=head2 get_dna + + Title : get_dna + Usage : $string = $db->get_dna($name,$start,$stop,$class) + Function: get DNA string + Returns : a string + Args : name, class, start and stop of desired segment + Status : Public + +This method performs the low-level fetch of a DNA substring given its +name, class and the desired range. This should probably be moved to +the parent class. + +=cut + +sub getseqcoords_query { + my $self = shift; + return GETSEQCOORDS ; +} + +sub getaliascoords_query{ + my $self = shift; + return GETALIASCOORDS ; +} + + +sub getforcedseqcoords_query{ + my $self = shift; + return GETFORCEDSEQCOORDS ; +} + + +sub getaliaslike_query{ + my $self = shift; + return GETALIASLIKE ; +} + + +# override parent +sub get_abscoords_bkup { + my $self = shift; + my ($name,$class,$refseq) = @_; + + my $result = $self->SUPER::get_abscoords(@_); + return $result if $result; + + my $sth; + if ($name =~ s/\*/%/g) { + $sth = $self->dbh->do_query(GETALIASLIKE,$name,$class); + } else { + $sth = $self->dbh->do_query(GETALIASCOORDS,$name,$class); + } + my @result; + while (my @row = $sth->fetchrow_array) { push @result,\@row } + $sth->finish; + + if (@result == 0) { + $self->error("$name not found in database"); + return; + } else { + return \@result; + } + +} + + + +sub make_features_select_part { + my $self = shift; + my $options = shift || {}; + my $s; + if (my $b = $options->{bin_width}) { + + $s = <{attributes} && keys %{$options->{attributes}}>1; + $s; +} + + +# IMPORTANT NOTE: +# WHETHER OR NOT THIS WORKS IS CRITICALLY DEPENDENT ON THE RELATIVE MAGNITUDE OF THE +sub make_features_from_part { + my $self = shift; + my $sparse_types = shift; + my $options = shift || {}; + my $sparse_groups = $options->{sparse_groups}; + my $index = $sparse_groups ? ' USE INDEX(gid)' + : $sparse_types ? ' USE INDEX(ftypeid)' + : ''; + return $options->{attributes} ? "fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n" + : "fdata${index},ftype,fgroup\n"; +} + +=head2 search_notes + + Title : search_notes + Usage : @search_results = $db->search_notes("full text search string",$limit) + Function: Search the notes for a text string, using mysql full-text search + Returns : array of results + Args : full text search string, and an optional row limit + Status : public + +This is a mysql-specific method. Given a search string, it performs a +full-text search of the notes table and returns an array of results. +Each row of the returned array is a arrayref containing the following fields: + + column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() + column 2 The text of the note + column 3 A relevance score. + +=cut + +sub search_notes { + my $self = shift; + my ($search_string,$limit) = @_; + my $query = FULLTEXTSEARCH; + $query .= " limit $limit" if defined $limit; + my $sth = $self->dbh->do_query($query,$search_string,$search_string); + my @results; + while (my ($class,$name,$note,$relevance) = $sth->fetchrow_array) { + next unless $class && $name; # sorry, ignore NULL objects + $relevance = sprintf("%.2f",$relevance); # trim long floats + my $featname = Bio::DB::GFF::Featname->new($class=>$name); + push @results,[$featname,$note,$relevance]; + } + @results; +} + + + +################################ loading and initialization ################################## + +=head2 schema + + Title : schema + Usage : $schema = $db->schema + Function: return the CREATE script for the schema + Returns : a list of CREATE statemetns + Args : none + Status : protected + +This method returns a list containing the various CREATE statements +needed to initialize the database tables. + +=cut + +sub schema { + my %schema = ( + fdata =>{ +table=> q{ +#create table fdata ( +# fid int not null auto_increment, +# fref varchar(100) not null, +# fstart int unsigned not null, +# fstop int unsigned not null, +# ftypeid int not null, +# fscore float, +# fstrand enum('+','-'), +# fphase enum('0','1','2'), +# gid int not null, +# ftarget_start int unsigned, +# ftarget_stop int unsigned, +# primary key(fid), +# unique index(fref,fstart,fstop,ftypeid,gid), +# index(ftypeid), +# index(gid) +#) type=MyISAM + + + create table fdata ( + fid int not null auto_increment, + fref varchar(100) not null, + fstart int unsigned not null, + fstop int unsigned not null, + fbin double(20,6) not null, + ftypeid int not null, + fscore float, + fstrand enum('+','-'), + fphase enum('0','1','2'), + gid int not null, + ftarget_start int unsigned, + ftarget_stop int unsigned, + primary key(fid), + unique index(fref,fbin,fstart,fstop,ftypeid,gid), + index(ftypeid), + index(gid) + ) type=MyISAM +} # fdata table +}, # fdata + + fgroup =>{ +table=> q{ +create table fgroup ( + gid int not null auto_increment, + gclass varchar(100), + gname varchar(100), + primary key(gid), + unique(gclass,gname) +) type=MyISAM +} +}, + + ftype => { +table=> q{ +create table ftype ( + ftypeid int not null auto_increment, + fmethod varchar(100) not null, + fsource varchar(100), + primary key(ftypeid), + index(fmethod), + index(fsource), + unique ftype (fmethod,fsource) +)type=MyISAM +} #ftype table +}, #ftype + + fdna => { +table=> q{ +create table fdna ( + fref varchar(100) not null, + foffset int(10) unsigned not null, + fdna longblob, + primary key(fref,foffset) +)type=MyISAM +} # fdna table +},#fdna + + fmeta => { +table=> q{ +create table fmeta ( + fname varchar(255) not null, + fvalue varchar(255) not null, + primary key(fname) +)type=MyISAM +} # fmeta table +},#fmeta + + fattribute => { +table=> q{ +create table fattribute ( + fattribute_id int(10) unsigned not null auto_increment, + fattribute_name varchar(255) not null, + primary key(fattribute_id) +)type=MyISAM +} #fattribute table +},#fattribute + + fattribute_to_feature => { +table=> q{ +create table fattribute_to_feature ( + fid int(10) not null, + fattribute_id int(10) not null, + fattribute_value text, + key(fid,fattribute_id), + key(fattribute_value(48)), + fulltext(fattribute_value) +)type=MyISAM +} # fattribute_to_feature table + }, # fattribute_to_feature +); + return \%schema; +} + + + +=head2 make_classes_query + + Title : make_classes_query + Usage : ($query,@args) = $db->make_classes_query + Function: return query fragment for generating list of reference classes + Returns : a query and args + Args : none + Status : public + +=cut + +sub make_classes_query { + my $self = shift; + return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT ISNULL(gclass)'; +} + + +=head2 make_meta_set_query + + Title : make_meta_set_query + Usage : $sql = $db->make_meta_set_query + Function: return SQL fragment for setting a meta parameter + Returns : SQL fragment + Args : none + Status : public + +By default this does nothing; meta parameters are not stored or +retrieved. + +=cut + +sub make_meta_set_query { + return 'REPLACE INTO fmeta VALUES (?,?)'; +} + +=head2 setup_load + + Title : setup_load + Usage : $db->setup_load + Function: called before load_gff_line() + Returns : void + Args : none + Status : protected + +This method performs schema-specific initialization prior to loading a +set of GFF records. It prepares a set of DBI statement handlers to be +used in loading the data. + +=cut + +sub setup_load { + my $self = shift; + + my $dbh = $self->features_db; + + if ($self->lock_on_load) { + my @tables = map { "$_ WRITE"} $self->tables; + my $tables = join ', ',@tables; + $dbh->do("LOCK TABLES $tables"); + } + + my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?'); + my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)'); + + my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE gname=? AND gclass=?'); + my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)'); + + my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?'); + my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)'); + my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)'); + + my $insert_data = $dbh->prepare_delayed(<{load_stuff}{sth}{lookup_ftype} = $lookup_type; + $self->{load_stuff}{sth}{insert_ftype} = $insert_type; + $self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group; + $self->{load_stuff}{sth}{insert_fgroup} = $insert_group; + $self->{load_stuff}{sth}{insert_fdata} = $insert_data; + $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute; + $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute; + $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value; + $self->{load_stuff}{types} = {}; + $self->{load_stuff}{groups} = {}; + $self->{load_stuff}{counter} = 0; +} + +=head2 load_gff_line + + Title : load_gff_line + Usage : $db->load_gff_line($fields) + Function: called to load one parsed line of GFF + Returns : true if successfully inserted + Args : hashref containing GFF fields + Status : protected + +This method is called once per line of the GFF and passed a series of +parsed data items that are stored into the hashref $fields. The keys are: + + ref reference sequence + source annotation source + method annotation method + start annotation start + stop annotation stop + score annotation score (may be undef) + strand annotation strand (may be undef) + phase annotation phase (may be undef) + group_class class of annotation's group (may be undef) + group_name ID of annotation's group (may be undef) + target_start start of target of a similarity hit + target_stop stop of target of a similarity hit + attributes array reference of attributes, each of which is a [tag=>value] array ref + +=cut + +sub load_gff_line { + my $self = shift; + my $gff = shift; + + my $s = $self->{load_stuff}; + my $dbh = $self->features_db; + local $dbh->{PrintError} = 0; + + defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return; + defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname} => $gff->{gclass})) or return; + + my $bin = bin($gff->{start},$gff->{stop},$self->min_bin); + my $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, + $gff->{start},$gff->{stop},$bin, + $typeid, + $gff->{score},$gff->{strand},$gff->{phase}, + $groupid, + $gff->{tstart},$gff->{tstop}); + + warn $dbh->errstr,"\n" and return unless $result; + + my $fid = $dbh->{mysql_insertid} + || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid); + + + # insert attributes + foreach (@{$gff->{attributes}}) { + defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return; + $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]); + } + + if ( (++$s->{counter} % 1000) == 0) { + print STDERR "$s->{counter} records loaded..."; + print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; + } + + $fid; +} + + +sub insert_sequence { + my $self = shift; + my($id,$offset,$seq) = @_; + my $sth = $self->{_insert_sequence} + ||= $self->dbh->prepare_delayed('replace into fdna values (?,?,?)'); + $sth->execute($id,$offset,$seq) or die $sth->errstr; +} + + +=head2 get_table_id + + Title : get_table_id + Usage : $integer = $db->get_table_id($table,@ids) + Function: get the ID of a group or type + Returns : an integer ID or undef + Args : none + Status : private + +This internal method is called by load_gff_line to look up the integer +ID of an existing feature type or group. The arguments are the name +of the table, and two string identifiers. For feature types, the +identifiers are the method and source. For groups, the identifiers +are group name and class. + +This method requires that a statement handler named I, +have been created previously by setup_load(). It is here to overcome +deficiencies in mysql's INSERT syntax. + +=cut + +#' +# get the object ID from a named table +sub get_table_id { + my $self = shift; + my $table = shift; + my @ids = @_; + + # irritating warning for null id + my $id_key; + { + local $^W=0; + $id_key = join ':',@ids; + } + + my $s = $self->{load_stuff}; + my $sth = $s->{sth}; + my $dbh = $self->features_db; + + unless (defined($s->{$table}{$id_key})) { + + ######################################### + # retrieval of the last inserted id is now located at the adaptor and not in caching_handle + ####################################### + if ( (my $result = $sth->{"lookup_$table"}->execute(@ids)) > 0) { + $s->{$table}{$id_key} = ($sth->{"lookup_$table"}->fetchrow_array)[0]; + } else { + $sth->{"insert_$table"}->execute(@ids) + && ($s->{$table}{$id_key} = $self->insertid($sth->{"insert_$table"})); + #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}{sth}{mysql_insertid}); + #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid); + } + } + + my $id = $s->{$table}{$id_key}; + unless (defined $id) { + warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n"; + return; + } + $id; +} + +sub insertid { + my $self = shift; + my $s = shift ; + $s->{mysql_insertid}; +} + + +=head2 get_feature_id + + Title : get_feature_id + Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid) + Function: get the ID of a feature + Returns : an integer ID or undef + Args : none + Status : private + +This internal method is called by load_gff_line to look up the integer +ID of an existing feature. It is ony needed when replacing a feature +with new information. + +=cut + +# this method is called when needed to look up a feature's ID +sub get_feature_id { + my $self = shift; + my ($ref,$start,$stop,$typeid,$groupid) = @_; + my $s = $self->{load_stuff}; + unless ($s->{get_feature_id}) { + my $dbh = $self->features_db; + $s->{get_feature_id} = + $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? AND gid=?'); + } + my $sth = $s->{get_feature_id} or return; + $sth->execute($ref,$start,$stop,$typeid,$groupid) or return; + my ($fid) = $sth->fetchrow_array; + return $fid; +} + +1; + + +__END__ + +=head1 BUGS + +none ;-) + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/mysqlace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/mysqlace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,114 @@ +package Bio::DB::GFF::Adaptor::dbi::mysqlace; + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::mysqlace -- Unholy union between mysql GFF database and acedb database + +=head1 SYNOPSIS + +Pending + +See L and L + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +use strict; +use Bio::DB::GFF::Adaptor::dbi::mysql; +use Bio::DB::GFF::Adaptor::ace; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() + +use vars '@ISA'; +@ISA = qw(Bio::DB::GFF::Adaptor::dbi::mysql Bio::DB::GFF::Adaptor::ace); + +# Create a new Bio::DB::GFF::Adaptor::dbi object +sub new { + my $class = shift; + my $self = $class->SUPER::new(@_); + my ($dna_db,$acedb) = rearrange([[qw(DNADB DNA FASTA FASTA_DIR)],'ACEDB'],@_); + if ($dna_db) { + if (!ref($dna_db)) { + require Bio::DB::Fasta; + my $fasta_dir = $dna_db; + $dna_db = Bio::DB::Fasta->new($fasta_dir); + $dna_db or $class->throw("new(): Failed to create new Bio::DB::Fasta from files in $fasta_dir"); + } else { + $dna_db->isa('Bio::DB::Fasta') or $class->throw("new(): $dna_db is not a Bio::DB::Fasta object"); + } + $self->dna_db($dna_db); + } + + if ($acedb) { + $acedb->isa('Ace') or $class->throw("$acedb is not an acedb accessor object"); + $self->acedb($acedb); + } + $self; +} + +=head2 freshen_ace + + Title : freshen + Usage : $flag = Bio::DB::GFF->freshen_ace; + Function: Refresh internal acedb handle + Returns : flag if correctly freshened + Args : none + Status : Public + +ACeDB has an annoying way of timing out, leaving dangling database +handles. This method will invoke the ACeDB reopen() method, which +causes dangling handles to be refreshed. It has no effect if you are +not using ACeDB to create ACeDB objects. + +=cut + +######################### +# Moved from mysqlopt.pm +######################### +sub make_object { + my $self = shift; + my ($class,$name,$start,$stop) = @_; + + if (my $db = $self->acedb) { + + # for Notes we just return a text, no database associated + return $class->new(Text=>$name) if $class eq 'Note'; + + # for homols, we create the indicated Protein or Sequence object + # then generate a bogus Homology object (for future compatability??) + if ($start ne '') { + require Ace::Sequence::Homol; + return Ace::Sequence::Homol->new_homol($class,$name,$db,$start,$stop); + } + + # General case: + my $obj = $db->class->new($class=>$name,$self->acedb); + + return $obj if defined $obj; + + # Last resort, return a Text + return $class->new(Text=>$name); + } + + return $self->SUPER::make_object($class,$name,$start,$stop); +} + +sub get_dna { + my $self = shift; + my ($ref,$start,$stop,$class) = @_; + my $dna_db = $self->dna_db or return $self->SUPER::get_dna(@_); + return $dna_db->seq($ref,$start,$stop,$class); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,33 @@ +package Bio::DB::GFF::Adaptor::dbi::mysqlopt; + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::mysqlopt -- Deprecated database adaptor + +=head1 SYNOPSIS + +This adaptor has been superseded by Bio::DB::GFF::Adaptor::dbi::mysql. + +See L and L + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +use strict; +use Bio::DB::GFF::Adaptor::dbi::mysql; +use vars '@ISA'; +@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysql'; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/oracle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/oracle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1013 @@ +package Bio::DB::GFF::Adaptor::dbi::oracle; + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::oracle -- Database adaptor for a specific oracle schema + +=head1 SYNOPSIS + +See L + +=cut + +# a simple oracle adaptor +use strict; +use Bio::DB::GFF::Adaptor::dbi; +#use Bio::DB::GFF::Adaptor::dbi::mysql; +#use Bio::DB::GFF::Adaptor::dbi::mysqlopt; +use Bio::DB::GFF::Util::Binning; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() +use vars qw(@ISA); +#@ISA = qw(Bio::DB::GFF::Adaptor::dbi::mysqlopt); +@ISA = qw(Bio::DB::GFF::Adaptor::dbi); + +use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get +use constant DEFAULT_CHUNK => 2000; + +use constant GETSEQCOORDS =><<<< 100_000_000; + +# this is the smallest bin (1 K) +use constant MIN_BIN => 1000; + +# size of range over which it is faster to force mysql to use the range for indexing +use constant STRAIGHT_JOIN_LIMIT => 200_000; + +############################################################################## + +=head1 DESCRIPTION + +This adaptor implements a specific oracle database schema that is +compatible with Bio::DB::GFF. It inherits from +Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF. + +The schema uses several tables: + +=over 4 + +=item fdata + +This is the feature data table. Its columns are: + + fid feature ID (integer) + fref reference sequence name (string) + fstart start position relative to reference (integer) + fstop stop postion relative to reference (integer) + ftypeid feature type ID (integer) + fscore feature score (float); may be null + fstrand strand; one of "+" or "-"; may be null + fphase phase; one of 0, 1 or 2; may be null + gid group ID (integer) + ftarget_start for similarity features, the target start position (integer) + ftarget_stop for similarity features, the target stop position (integer) + +Note that it would be desirable to normalize the reference sequence +name, since there are usually many features that share the same +reference feature. However, in the current schema, query performance +suffers dramatically when this additional join is added. + +=item fgroup + +This is the group table. There is one row for each group. Columns: + + gid the group ID (integer) + gclass the class of the group (string) + gname the name of the group (string) + +The group table serves multiple purposes. As you might expect, it is +used to cluster features that logically belong together, such as the +multiple exons of the same transcript. It is also used to assign a +name and class to a singleton feature. Finally, the group table is +used to identify the target of a similarity hit. This is consistent +with the way in which the group field is used in the GFF version 2 +format. + +The fgroup.gid field joins with the fdata.gid field. + +Examples: + + sql> select * from fgroup where gname='sjj_2L52.1'; + +-------+-------------+------------+ + | gid | gclass | gname | + +-------+-------------+------------+ + | 69736 | PCR_product | sjj_2L52.1 | + +-------+-------------+------------+ + 1 row in set (0.70 sec) + + sql> select fref,fstart,fstop from fdata,fgroup + where gclass='PCR_product' and gname = 'sjj_2L52.1' + and fdata.gid=fgroup.gid; + +---------------+--------+-------+ + | fref | fstart | fstop | + +---------------+--------+-------+ + | CHROMOSOME_II | 1586 | 2355 | + +---------------+--------+-------+ + 1 row in set (0.03 sec) + +=item ftype + +This table contains the feature types, one per row. Columns are: + + ftypeid the feature type ID (integer) + fmethod the feature type method name (string) + fsource the feature type source name (string) + +The ftype.ftypeid field joins with the fdata.ftypeid field. Example: + + sql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype + where gclass='PCR_product' + and gname = 'sjj_2L52.1' + and fdata.gid=fgroup.gid + and fdata.ftypeid=ftype.ftypeid; + +---------------+--------+-------+-------------+-----------+ + | fref | fstart | fstop | fmethod | fsource | + +---------------+--------+-------+-------------+-----------+ + | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs | + +---------------+--------+-------+-------------+-----------+ + 1 row in set (0.08 sec) + +=item fdna + +This table holds the raw DNA of the reference sequences. It has three +columns: + + fref reference sequence name (string) + foffset offset of this sequence + fdna the DNA sequence (longblob) + +To overcome problems loading large blobs, DNA is automatically +fragmented into multiple segments when loading, and the position of +each segment is stored in foffset. The fragment size is controlled by +the -clump_size argument during initialization. + +=item fattribute_to_feature + +This table holds "attributes", which are tag/value pairs stuffed into +the GFF line. The first tag/value pair is treated as the group, and +anything else is treated as an attribute (weird, huh?). + + CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B" + CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B" + +The columns of this table are: + + fid feature ID (integer) + fattribute_id ID of the attribute (integer) + fattribute_value text of the attribute (text) + +The fdata.fid column joins with fattribute_to_feature.fid. + +=item fattribute + +This table holds the normalized names of the attributes. Fields are: + + fattribute_id ID of the attribute (integer) + fattribute_name Name of the attribute (varchar) + +=back + +=head2 Data Loading Methods + +In addition to implementing the abstract SQL-generating methods of +Bio::DB::GFF::Adaptor::dbi, this module also implements the data +loading functionality of Bio::DB::GFF. + +=cut + + +=head2 new + + Title : new + Usage : $db = Bio::DB::GFF->new(@args) + Function: create a new adaptor + Returns : a Bio::DB::GFF object + Args : see below + Status : Public + +The new constructor is identical to the "dbi" adaptor's new() method, +except that the prefix "dbi:oracle" is added to the database DSN identifier +automatically if it is not there already. + + Argument Description + -------- ----------- + + -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' or "ens0040" + + -user username for authentication + + -pass the password for authentication + +=cut + +#' + +sub new { + my $class = shift; + my ($dsn,$other) = rearrange([ + [qw(FEATUREDB DB DSN)], + ],@_); + $dsn = "dbi:Oracle:$dsn" if !ref($dsn) && $dsn !~ /^(dbi|DBI):/; + my $self = $class->SUPER::new(-dsn=>$dsn,%$other); + $self; +} + +=head2 schema + + Title : schema + Usage : $schema = $db->schema + Function: return the CREATE script for the schema + Returns : a list of CREATE statemetns + Args : none + Status : protected + +This method returns a list containing the various CREATE statements +needed to initialize the database tables. + +=cut + +sub schema { + my %schema = ( + fdata =>{ +table=> q{ +create table fdata ( + fid INTEGER NOT NULL, + fref VARCHAR(100) DEFAULT '' NOT NULL, + fstart INTEGER DEFAULT '0' NOT NULL, + fstop INTEGER DEFAULT '0' NOT NULL, + fbin NUMBER DEFAULT '0.000000' NOT NULL, + ftypeid INTEGER DEFAULT '0' NOT NULL, + fscore NUMBER , + fstrand VARCHAR2(3) CHECK (fstrand IN ('+','-')), + fphase VARCHAR2(3) CHECK (fphase IN ('0','1','2')), + gid INTEGER DEFAULT '0' NOT NULL, + ftarget_start INTEGER , + ftarget_stop INTEGER , + CONSTRAINT fdata_pk PRIMARY KEY (fid) +) +}, # fdata table + +index=>{ + fdata_fref_idx => q{ +CREATE UNIQUE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid) +}, + + fdata_ftypeid_idx => q{ +CREATE INDEX fdata_ftypeid_idx ON fdata (ftypeid) +}, + + fdata_gid_idx => q{ +CREATE INDEX fdata_gid_idx ON fdata (gid) +} + }, # fdata indexes + +sequence=> { + fdata_fid_sq => q{ +CREATE SEQUENCE fdata_fid_sq START WITH 1 +} + }, # fdata sequences + +trigger=> { + fdata_fid_ai => q{ +CREATE OR REPLACE TRIGGER fdata_fid_ai +BEFORE INSERT ON fdata +FOR EACH ROW WHEN (new.fid IS NULL OR new.fid = 0) +BEGIN + SELECT fdata_fid_sq.nextval INTO :new.fid FROM dual; +END; +} + }# fdata triggers + +}, # fdata + + + + fgroup => { +table => q{ +CREATE TABLE fgroup ( + gid INTEGER NOT NULL, + gclass VARCHAR(100) , + gname VARCHAR(100) , + CONSTRAINT fgroup_pk PRIMARY KEY (gid) +) +}, # fgroup table + +index => { + fgroup_gclass_idx => q{ +CREATE UNIQUE INDEX fgroup_gclass_idx ON fgroup (gclass,gname) +} + }, # fgroup indexes + +sequence => { + + fgroup_gid_sq => q{ +CREATE SEQUENCE fgroup_gid_sq START WITH 1 +} + }, # fgroup sequences + + +trigger => { + fgroup_gid_ai => q{ +CREATE OR REPLACE TRIGGER fgroup_gid_ai +BEFORE INSERT ON fgroup +FOR EACH ROW WHEN (new.gid IS NULL OR new.gid = 0) +BEGIN + SELECT fgroup_gid_sq.nextval INTO :new.gid FROM dual; +END; +} + } # fgroup triggers + +}, # fgroup + + ftype => { +table => q{ +CREATE TABLE ftype ( + ftypeid INTEGER NOT NULL, + fmethod VARCHAR(100) DEFAULT '' NOT NULL, + fsource VARCHAR(100), + CONSTRAINT ftype_pk PRIMARY KEY (ftypeid) +) +}, # ftype table + +index => { + ftype_fmethod_idx => q{ +CREATE INDEX ftype_fmethod_idx ON ftype (fmethod) +}, + + ftype_fsource_idx => q{ +CREATE INDEX ftype_fsource_idx ON ftype (fsource) +}, + + ftype_ftype_idx => q{ +CREATE UNIQUE INDEX ftype_ftype_idx ON ftype (fmethod,fsource) +} + }, # ftype indexes + +sequence => { + ftype_ftypeid_sq => q{ +CREATE SEQUENCE ftype_ftypeid_sq START WITH 1 +} + }, #ftype sequences + +trigger => { + ftype_ftypeid_ai => q{ +CREATE OR REPLACE TRIGGER ftype_ftypeid_ai +BEFORE INSERT ON ftype +FOR EACH ROW WHEN (new.ftypeid IS NULL OR new.ftypeid = 0) +BEGIN + SELECT ftype_ftypeid_sq.nextval INTO :new.ftypeid FROM dual; +END; +} + } #ftype triggers +}, # ftype + + + fdna => { +table => q{ +CREATE TABLE fdna ( + fref VARCHAR(100) DEFAULT '' NOT NULL, + foffset INTEGER DEFAULT '0' NOT NULL, + fdna LONG /* LONGBLOB */ , + CONSTRAINT fdna_pk PRIMARY KEY (fref,foffset) +) +} #fdna table + }, #fdna + + fmeta => { +table => q{ +CREATE TABLE fmeta ( + fname VARCHAR(255) DEFAULT '' NOT NULL, + fvalue VARCHAR(255) DEFAULT '' NOT NULL, + CONSTRAINT fmeta_pk PRIMARY KEY (fname) +) +} # fmeta table + }, # fmeta + + + fattribute => { +table => q{ +CREATE TABLE fattribute ( + fattribute_id INTEGER NOT NULL, + fattribute_name VARCHAR(255) DEFAULT '' NOT NULL, + CONSTRAINT fattribute_pk PRIMARY KEY (fattribute_id) +) +}, # fattribute table + +sequence=> { + fattribute_fattribute_id_sq => q{ +CREATE SEQUENCE fattribute_fattribute_id_sq START WITH 1 +} + }, # fattribute sequences + +trigger => { + fattribute_fattribute_id_ai => q{ +CREATE OR REPLACE TRIGGER fattribute_fattribute_id_ai +BEFORE INSERT ON fattribute +FOR EACH ROW WHEN (new.fattribute_id IS NULL OR new.fattribute_id = 0) +BEGIN + SELECT fattribute_fattribute_id_sq.nextval INTO :new.fattribute_id FROM dual; +END; +} + } # fattribute triggers +}, # fattribute + + fattribute_to_feature => { +table => q{ +CREATE TABLE fattribute_to_feature ( + fid INTEGER DEFAULT '0' NOT NULL, + fattribute_id INTEGER DEFAULT '0' NOT NULL, + fattribute_value VARCHAR2(255) /* TEXT */ +) +}, # fattribute_to_feature table + +index => { + fattribute_to_feature_fid => q{ +CREATE INDEX fattribute_to_feature_fid ON fattribute_to_feature (fid,fattribute_id) +} + } # fattribute_to_feature indexes +} # fattribute_to_feature + + +); + return \%schema; +} + + +=head2 do_initialize + + Title : do_initialize + Usage : $success = $db->do_initialize($drop_all) + Function: initialize the database + Returns : a boolean indicating the success of the operation + Args : a boolean indicating whether to delete existing data + Status : protected + +This method will load the schema into the database. If $drop_all is +true, then any existing data in the tables known to the schema will be +deleted. + +Internally, this method calls schema() to get the schema data. + +=cut + +# Create the schema from scratch. +# You will need create privileges for this. +#sub do_initialize { +# my $self = shift; +# my $erase = shift; +# $self->drop_all if $erase; + +# my $dbh = $self->features_db; +# my $schema = $self->schema; + +# foreach my $table_name(keys %$schema) { +# my $create_table_stmt = $$schema{$table_name}{table} ; +# $dbh->do($create_table_stmt) || warn $dbh->errstr; +# } +# 1; +#} + + + +=head2 drop_all + + Title : drop_all + Usage : $db->drop_all + Function: empty the database + Returns : void + Args : none + Status : protected + +This method drops the tables known to this module. Internally it +calls the abstract tables() method. + +=cut + +# Drop all the GFF tables -- dangerous! +#sub drop_all { +# my $self = shift; +# my $dbh = $self->features_db; +# local $dbh->{PrintError} = 0; +# foreach ($self->tables) { +# $dbh->do("drop table $_"); +# } +#} + + + + + + +=head2 setup_load + + Title : setup_load + Usage : $db->setup_load + Function: called before load_gff_line() + Returns : void + Args : none + Status : protected + +This method performs schema-specific initialization prior to loading a +set of GFF records. It prepares a set of DBI statement handlers to be +used in loading the data. + +=cut + +sub setup_load { + my $self = shift; + my $schema = $self->schema; + + my $dbh = $self->features_db; + + if ($self->lock_on_load) { + my @tables = map { "$_ WRITE"} $self->tables; + my $tables = join ', ',@tables; + $dbh->do("LOCK TABLES $tables"); + } + + my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?'); + my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)'); + my $sequence_type = (keys %{$schema->{ftype}{sequence}})[0]; + my $insertid_type = $dbh->prepare_delayed("SELECT $sequence_type.CURRVAL FROM dual"); + + my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE gname=? AND gclass=?'); + my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)'); + my $sequence_group = (keys %{$schema->{fgroup}{sequence}})[0]; + my $insertid_group = $dbh->prepare_delayed("SELECT $sequence_group.CURRVAL FROM dual"); + + my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?'); + my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)'); + my $sequence_attribute = (keys %{$schema->{fattribute}{sequence}})[0]; + my $insertid_attribute = $dbh->prepare_delayed("SELECT $sequence_attribute.CURRVAL FROM dual"); + + my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)'); + + my $insert_data = $dbh->prepare_delayed(<prepare_delayed('DELETE FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND fbin=? AND ftypeid=? AND GID=?'); + my $sequence_data = (keys %{$schema->{fdata}{sequence}})[0]; + my $insertid_data = $dbh->prepare_delayed("SELECT $sequence_data.CURRVAL FROM dual"); + + + + $self->{load_stuff}{sth}{lookup_ftype} = $lookup_type; + $self->{load_stuff}{sth}{insert_ftype} = $insert_type; + $self->{load_stuff}{sth}{insertid_ftype} = $insertid_type; + $self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group; + $self->{load_stuff}{sth}{insert_fgroup} = $insert_group; + $self->{load_stuff}{sth}{insertid_fgroup} = $insertid_group; + $self->{load_stuff}{sth}{insert_fdata} = $insert_data; + $self->{load_stuff}{sth}{insertid_fdata} = $insertid_data; + $self->{load_stuff}{sth}{delete_existing_fdata} = $delete_existing_data; + $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute; + $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute; + $self->{load_stuff}{sth}{insertid_fattribute} = $insertid_attribute; + $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value; + $self->{load_stuff}{types} = {}; + $self->{load_stuff}{groups} = {}; + $self->{load_stuff}{counter} = 0; +} + +=head2 load_gff_line + + Title : load_gff_line + Usage : $db->load_gff_line($fields) + Function: called to load one parsed line of GFF + Returns : true if successfully inserted + Args : hashref containing GFF fields + Status : protected + +This method is called once per line of the GFF and passed a series of +parsed data items that are stored into the hashref $fields. The keys are: + + ref reference sequence + source annotation source + method annotation method + start annotation start + stop annotation stop + score annotation score (may be undef) + strand annotation strand (may be undef) + phase annotation phase (may be undef) + group_class class of annotation's group (may be undef) + group_name ID of annotation's group (may be undef) + target_start start of target of a similarity hit + target_stop stop of target of a similarity hit + attributes array reference of attributes, each of which is a [tag=>value] array ref + +=cut + +sub load_gff_line { + my $self = shift; + my $gff = shift; + + if (defined $gff->{phase}){ + chomp($gff->{phase}); + undef($gff->{phase}) if $gff->{phase} eq '.'; + } + + if (defined $gff->{strand} && $gff->{strand} eq '.'){undef($gff->{strand})}; + if (defined $gff->{score} && $gff->{score} eq '.'){undef($gff->{score})}; + + my $s = $self->{load_stuff}; + my $dbh = $self->features_db; + local $dbh->{PrintError} = 0; + + defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return; + defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname} => $gff->{gclass})) or return; + + my $bin = bin($gff->{start},$gff->{stop},$self->min_bin); + my $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, + $gff->{start},$gff->{stop},$bin, + $typeid, + $gff->{score},$gff->{strand},$gff->{phase}, + $groupid, + $gff->{tstart},$gff->{tstop}); + if (defined ($dbh->errstr)){ + # print $dbh->errstr,"\n" ,%$gff,"\n"; + if ($dbh->errstr =~ /ORA-02290: check constraint/){ + print "PHASE=$gff->{phase}"."===","\n"; + } + + #if ($dbh->errstr =~ /ORA-00001: unique constraint/){ + # $result = $s->{sth}{delete_existing_fdata}->execute($gff->{ref}, + # $gff->{start},$gff->{stop},$bin, + # $typeid, + # $groupid); + # + # print "delete row result=$result\n"; + # $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, + # $gff->{start},$gff->{stop},$bin, + # $typeid, + # $gff->{score},$gff->{strand},$gff->{phase}, + # $groupid, + # $gff->{tstart},$gff->{tstop}); + # + # print "insert row result=$result\n"; + #} + } + warn $dbh->errstr,"\n" and print "ref=",$gff->{ref}," start=",$gff->{start}," stop=",$gff->{stop}," bin=",$bin," typeid=",$typeid," groupid=",$groupid,"\n" + and return unless $result; + + my $fid = $self->insertid($s->{sth},'fdata') + || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid); + + + # insert attributes + foreach (@{$gff->{attributes}}) { + defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return; + $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]); + } + + if ( (++$s->{counter} % 1000) == 0) { + print STDERR "$s->{counter} records loaded..."; + print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; + } + + $fid; +} + + + + +=head2 get_table_id + + Title : get_table_id + Usage : $integer = $db->get_table_id($table,@ids) + Function: get the ID of a group or type + Returns : an integer ID or undef + Args : none + Status : private + +This internal method is called by load_gff_line to look up the integer +ID of an existing feature type or group. The arguments are the name +of the table, and two string identifiers. For feature types, the +identifiers are the method and source. For groups, the identifiers +are group name and class. + +This method requires that a statement handler named I, +have been created previously by setup_load(). It is here to overcome +deficiencies in mysql's INSERT syntax. + +=cut + +#' +# get the object ID from a named table +sub get_table_id { + my $self = shift; + my $table = shift; + my @ids = @_; + + # irritating warning for null id + my $id_key; + { + local $^W=0; + $id_key = join ':',@ids; + } + + my $s = $self->{load_stuff}; + my $sth = $s->{sth}; + my $dbh = $self->features_db; + + unless (defined($s->{$table}{$id_key})) { + $sth->{"lookup_$table"}->execute(@ids); + my @result = $sth->{"lookup_$table"}->fetchrow_array; + if (@result > 0) { + $s->{$table}{$id_key} = $result[0]; + } else { + $sth->{"insert_$table"}->execute(@ids) + && ($s->{$table}{$id_key} = $self->insertid($sth,$table)); + #&& ($s->{$table}{$id_key} = $self->insertid($sth->{"insertid_$table"})); + #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid); + } + } + + my $id = $s->{$table}{$id_key}; + unless (defined $id) { + warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n"; + return; + } + $id; +} + +sub insertid { + my $self = shift; + my $sth = shift ; + my $table = shift; + + my $insert_id; + if ($sth->{"insertid_$table"}->execute()){ + $insert_id = ($sth->{"insertid_$table"}->fetchrow_array)[0]; + } + else{ + warn "No CURRVAL for SEQUENCE of table $table ",$sth->errstr,"\n"; + return; + } + return $insert_id; +} + + +#sub insertid { +# my $self = shift; +# my $insertid_sth = shift ; +# my $insert_id; +# if ($insertid_sth->execute){ +# $insert_id = ($insertid_sth->fetchrow_array)[0]; +# } +# else{ +# warn "No CURRVAL for SEQUENCE ",$insertid_sth->errstr,"\n"; +# return; +# } +# return $insert_id; +#} + +sub insert_sequence { + my $self = shift; + my($id,$offset,$seq) = @_; + my $sth = $self->{_insert_sequence} + ||= $self->dbh->prepare_delayed('insert into fdna values (?,?,?)'); + $sth->execute($id,$offset,$seq) or die $sth->errstr; +} + +=head2 search_notes + + Title : search_notes + Usage : @search_results = $db->search_notes("full text search string",$limit) + Function: Search the notes for a text string, using mysql full-text search + Returns : array of results + Args : full text search string, and an optional row limit + Status : public + +This is a mysql-specific method. Given a search string, it performs a +full-text search of the notes table and returns an array of results. +Each row of the returned array is a arrayref containing the following fields: + + column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() + column 2 The text of the note + column 3 A relevance score. + +=cut + +sub search_notes { + my $self = shift; + my ($search_string,$limit) = @_; + + my @words = $search_string =~ /(\w+)/g; + my $regex = join '|',@words; + my @searches = map {"fattribute_value LIKE '%${_}%'"} @words; + my $search = join(' OR ',@searches); + + my $query = <dbh->do_query($query); + my @results; + while (my ($class,$name,$note) = $sth->fetchrow_array) { + next unless $class && $name; # sorry, ignore NULL objects + my @matches = $note =~ /($regex)/g; + my $relevance = 10*@matches; + my $featname = Bio::DB::GFF::Featname->new($class=>$name); + push @results,[$featname,$note,$relevance]; + last if $limit && @results >= $limit; + } + @results; +} + +=head2 make_meta_set_query + + Title : make_meta_set_query + Usage : $sql = $db->make_meta_set_query + Function: return SQL fragment for setting a meta parameter + Returns : SQL fragment + Args : none + Status : public + +By default this does nothing; meta parameters are not stored or +retrieved. + +=cut + +sub make_meta_set_query { + return 'INSERT INTO fmeta VALUES (?,?)'; +} + +sub make_classes_query { + my $self = shift; + return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT gclass IS NULL'; +} + + +sub chunk_size { + my $self = shift; + $self->meta('chunk_size') || DEFAULT_CHUNK; +} + +sub getseqcoords_query { + my $self = shift; + return GETSEQCOORDS ; +} + +sub getaliascoords_query{ + my $self = shift; + return GETALIASCOORDS ; +} + + +sub getforcedseqcoords_query{ + my $self = shift; + return GETFORCEDSEQCOORDS ; +} + + +sub getaliaslike_query{ + my $self = shift; + return GETALIASLIKE ; +} + + +sub make_features_select_part { + my $self = shift; + my $options = shift || {}; + my $s; + if (my $b = $options->{bin_width}) { + + $s = <{attributes} && keys %{$options->{attributes}}>1; + $s; +} + +sub make_features_from_part_bkup { + my $self = shift; + my $sparse = shift; + my $options = shift || {}; + #my $index = $sparse ? ' USE INDEX(ftypeid)': ''; + my $index = ''; + return $options->{attributes} ? "fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n" + : "fdata${index},ftype,fgroup\n"; +} + + +#################################### +# moved from mysqlopt.pm +################################### +# meta values +sub default_meta_values { + my $self = shift; + my @values = $self->SUPER::default_meta_values; + return ( + @values, + max_bin => MAX_BIN, + min_bin => MIN_BIN, + straight_join_limit => STRAIGHT_JOIN_LIMIT, + ); +} + +sub min_bin { + my $self = shift; + return $self->meta('min_bin') || MIN_BIN; +} +sub max_bin { + my $self = shift; + return $self->meta('max_bin') || MAX_BIN; +} +sub straight_join_limit { + my $self = shift; + return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/oracleace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/dbi/oracleace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,111 @@ +package Bio::DB::GFF::Adaptor::dbi::oracleace; + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::oracleace -- Unholy union between oracle GFF database and acedb database + +=head1 SYNOPSIS + +Pending + +See L and L + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +use strict; +use Bio::DB::GFF::Adaptor::dbi::oracle; +use Bio::DB::GFF::Adaptor::ace; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() + +use vars '@ISA'; +@ISA = qw(Bio::DB::GFF::Adaptor::ace Bio::DB::GFF::Adaptor::dbi::oracle); + +# Create a new Bio::DB::GFF::Adaptor::dbi object +sub new { + my $class = shift; + my $self = $class->SUPER::new(@_); + my ($dna_db,$acedb) = rearrange([[qw(DNADB DNA FASTA FASTA_DIR)],'ACEDB'],@_); + if ($dna_db) { + if (!ref($dna_db)) { + require Bio::DB::Fasta; + my $fasta_dir = $dna_db; + $dna_db = Bio::DB::Fasta->new($fasta_dir); + $dna_db or $class->throw("new(): Failed to create new Bio::DB::Fasta from files in $fasta_dir"); + } else { + $dna_db->isa('Bio::DB::Fasta') or $class->throw("new(): $dna_db is not a Bio::DB::Fasta object"); + } + $self->dna_db($dna_db); + } + + if ($acedb) { + $acedb->isa('Ace') or $class->throw("$acedb is not an acedb accessor object"); + $self->acedb($acedb); + } + $self; +} + +sub make_object { + my $self = shift; + my ($class,$name,$start,$stop) = @_; + + if (my $db = $self->acedb) { + + # for Notes we just return a text, no database associated + return $class->new(Text=>$name) if $class eq 'Note'; + + # for homols, we create the indicated Protein or Sequence object + # then generate a bogus Homology object (for future compatability??) + if ($start ne '') { + require Ace::Sequence::Homol; + return Ace::Sequence::Homol->new_homol($class,$name,$db,$start,$stop); + } + + # General case: + my $obj = $db->class->new($class=>$name,$self->acedb); + + return $obj if defined $obj; + + # Last resort, return a Text + return $class->new(Text=>$name); + } + + return $self->SUPER::make_object($class,$name,$start,$stop); +} + +sub get_dna { + my $self = shift; + my ($ref,$start,$stop,$class) = @_; + my $dna_db = $self->dna_db or return $self->SUPER::get_dna(@_); + return $dna_db->seq($ref,$start,$stop,$class); +} + +=head2 freshen_ace + + Title : freshen + Usage : $flag = Bio::DB::GFF->freshen_ace; + Function: Refresh internal acedb handle + Returns : flag if correctly freshened + Args : none + Status : Public + +ACeDB has an annoying way of timing out, leaving dangling database +handles. This method will invoke the ACeDB reopen() method, which +causes dangling handles to be refreshed. It has no effect if you are +not using ACeDB to create ACeDB objects. + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/memory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/memory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,705 @@ +package Bio::DB::GFF::Adaptor::memory; + +=head1 NAME + +Bio::DB::GFF::Adaptor::dbi::mysql -- Database adaptor for a specific mysql schema + +=head1 SYNOPSIS + + use Bio::DB::GFF; + my $db = Bio::DB::GFF->new(-adaptor=> 'memory', + -file => 'my_features.gff', + -fasta => 'my_dna.fa' + ); + +See L for other methods. + +=head1 DESCRIPTION + +This adaptor implements an in-memory version of Bio::DB::GFF. It can be used to +store and retrieve SHORT GFF files. It inherits from Bio::DB::GFF. + +=head1 CONSTRUCTOR + +Use Bio::DB::GFF-Enew() to construct new instances of this class. +Three named arguments are recommended: + + Argument Description + + -adaptor Set to "memory" to create an instance of this class. + -gff Read the indicated file or directory of .gff file. + -fasta Read the indicated file or directory of fasta files. + -dsn Indicates a directory containing .gff and .fa files + +If you use the -dsn option and the indicated directory is writable by +the current process, then this library will create a FASTA file index +that greatly diminishes the memory usage of this module. + +=head1 METHODS + +See L for inherited methods. + +=head1 BUGS + +none ;-) + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Shuly Avraham Eavraham@cshl.orgE. + +Copyright (c) 2002 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +use strict; +# $Id: memory.pm,v 1.7.2.1 2003/07/05 00:52:31 lstein Exp $ +# AUTHOR: Shulamit Avraham +# This module needs to be cleaned up and documented + +# Bio::DB::GFF::Adaptor::memory -- in-memory db adaptor +# implements the low level handling of data which stored in memory. +# This adaptor implements a specific in memory schema that is compatible with Bio::DB::GFF. +# Inherits from Bio::DB::GFF. + + +#use lib './blib/lib'; +#use lib '/u/swiss/shuly/bioperl-live'; +# use lib '/a/swiss/export/home/shuly/bioperl-live'; +use Bio::DB::GFF; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() +use Bio::DB::GFF::Adaptor::memory_iterator; +use File::Basename 'dirname'; + +use vars qw(@ISA); + +use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get + +@ISA = qw(Bio::DB::GFF); + +sub new { + my $class = shift ; + my ($file,$fasta,$dbdir) = rearrange([ + [qw(GFF FILE DIRECTORY)], + 'FASTA', + [qw(DSN DB DIR)], + ],@_); + + # fill in object + my $self = bless{ data => [] },$class; + $file ||= $dbdir; + $fasta ||= $dbdir; + $self->load_gff($file) if $file; + $self->load_or_store_fasta($fasta) if $fasta; + return $self; +} + +sub load_or_store_fasta { + my $self = shift; + my $fasta = shift; + if ((-f $fasta && -w dirname($fasta)) + or + (-d $fasta && -w $fasta)) { + require Bio::DB::Fasta; + my $dna_db = Bio::DB::Fasta->new($fasta) + or $self->throw("Couldn't create a new Bio::DB::Fasta index from $fasta"); + $self->dna_db($dna_db); + } else { + $self->load_fasta($fasta); + } +} + +sub dna_db { + my $self = shift; + my $d = $self->{dna_db}; + $self->{dna_db} = shift if @_; + $d; +} + +sub insert_sequence { + my $self = shift; + my($id,$offset,$seq) = @_; + $self->{dna}{$id} .= $seq; +} + +# low-level fetch of a DNA substring given its +# name, class and the desired range. +sub get_dna { + my $self = shift; + my ($id,$start,$stop,$class) = @_; + if (my $dna_db = $self->dna_db) { + return $dna_db->seq($id,$start=>$stop); + } + + return $self->{dna}{$id} if !defined $start || !defined $stop; + $start = 1 if !defined $start; + + my $reversed = 0; + if ($start > $stop) { + $reversed++; + ($start,$stop) = ($stop,$start); + } + my $dna = substr($self->{dna}{$id},$start-1,$stop-$start+1); + if ($reversed) { + $dna =~ tr/gatcGATC/ctagCTAG/; + $dna = reverse $dna; + } + + $dna; +} + +# this method loads the feature as a hash into memory - +# keeps an array of features-hashes as an in-memory db +sub load_gff_line { + my $self = shift; + my $feature_hash = shift; + $feature_hash->{strand} = '' if $feature_hash->{strand} && $feature_hash->{strand} eq '.'; + $feature_hash->{phase} = '' if $feature_hash->{phase} && $feature_hash->{phase} eq '.'; + push @{$self->{data}},$feature_hash; +} + +# given sequence name, return (reference,start,stop,strand) +sub get_abscoords { + my $self = shift; + my ($name,$class,$refseq) = @_; + my %refs; + my $regexp; + + if ($name =~ /[*?]/) { # uh oh regexp time + $name =~ quotemeta($name); + $name =~ s/\\\*/.*/g; + $name =~ s/\\\?/.?/g; + $regexp++; + } + + # Find all features that have the requested name and class. + # Sort them by reference point. + for my $feature (@{$self->{data}}) { + + my $no_match_class_name; + my $empty_class_name; + if (defined $feature->{gname} and defined $feature->{gclass}){ + my $matches = $feature->{gclass} eq $class + && ($regexp ? $feature->{gname} =~ /$name/i : $feature->{gname} eq $name); + $no_match_class_name = !$matches; # to accomodate Shuly's interesting logic + } + + else{ + $empty_class_name = 1; + } + + if ($no_match_class_name || $empty_class_name){ + + my $feature_attributes = $feature->{attributes}; + my $attributes = {Alias => $name}; + if (!_matching_attributes($feature_attributes,$attributes)){ + next; + } + } + push @{$refs{$feature->{ref}}},$feature; + } + + # find out how many reference points we recovered + + if (! %refs) { + $self->error("$name not found in database"); + return; + } elsif (keys %refs > 1) { + $self->error("$name has more than one reference sequence in database"); + return; + } + + # compute min and max + my ($ref) = keys %refs; + my @found = @{$refs{$ref}}; + my ($strand,$start,$stop); + foreach (@found) { + $strand ||= $_->{strand}; + $strand = '+' if $strand && $strand eq '.'; + $start = $_->{start} if !defined($start) || $start > $_->{start}; + $stop = $_->{stop} if !defined($stop) || $stop < $_->{stop}; + + my @found_segments; + foreach my $ref (keys %refs) { + next if defined($refseq) and $ref ne $refseq; + my @found = @{$refs{$ref}}; + my ($strand,$start,$stop); + foreach (@found) { + $strand ||= $_->{strand}; + $strand = '+' if $strand && $strand eq '.'; + $start = $_->{start} if !defined($start) || $start > $_->{start}; + $stop = $_->{stop} if !defined($stop) || $stop < $_->{stop}; + } + push @found_segments,[$ref,$class,$start,$stop,$strand]; + + } + return \@found_segments; +} + +sub search_notes { + my $self = shift; + my ($search_string,$limit) = @_; + my @results; + my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g; + + for my $feature (@{$self->{data}}) { + next unless defined $feature->{gclass} && defined $feature->{gname}; # ignore NULL objects + next unless $feature->{attributes}; + my @attributes = @{$feature->{attributes}}; + my @values = map {$_->[1]} @attributes; + my $value = "@values"; + my $matches = 0; + my $note; + for my $w (@words) { + my @hits = $value =~ /($w)/g; + $note ||= $value if @hits; + $matches += @hits; + } + next unless $matches; + + my $relevance = 10 * $matches; + my $featname = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname}); + push @results,[$featname,$note,$relevance]; + last if @results >= $limit; + } + @results; +} + +# attributes - + +# Some GFF version 2 files use the groups column to store a series of +# attribute/value pairs. In this interpretation of GFF, the first such +# pair is treated as the primary group for the feature; subsequent pairs +# are treated as attributes. Two attributes have special meaning: +# "Note" is for backward compatibility and is used for unstructured text +# remarks. "Alias" is considered as a synonym for the feature name. +# If no name is provided, then attributes() returns a flattened hash, of +# attribute=>value pairs. + +sub do_attributes{ + my $self = shift; + my ($feature_id,$tag) = @_; + my $attr ; + + my $feature = ${$self->{data}}[$feature_id]; + my @result; + for my $attr (@{$feature->{attributes}}) { + my ($attr_name,$attr_value) = @$attr ; + if (defined($tag) && $attr_name eq $tag){push @result,$attr_value;} + elsif (!defined($tag)) {push @result,($attr_name,$attr_value);} + } + return @result; +} + + +#sub get_feature_by_attribute{ +sub _feature_by_attribute{ + my $self = shift; + my ($attributes,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + my $count = 0; + my $feature_id = -1; + my $feature_group_id = undef; + + for my $feature (@{$self->{data}}) { + + $feature_id++; + for my $attr (@{$feature->{attributes}}) { + my ($attr_name,$attr_value) = @$attr ; + #there could be more than one set of attributes...... + foreach (keys %$attributes) { + if ($_ eq $attr_name && $attributes->{$_} eq $attr_value){ + $callback->($feature->{ref}, + $feature->{start}, + $feature->{stop}, + $feature->{source}, + $feature->{method}, + $feature->{score}, + $feature->{strand}, + $feature->{phase}, + $feature->{gclass}, + $feature->{gname}, + $feature->{tstart}, + $feature->{tstop}, + $feature_id, + $feature_group_id); + $count++; + } + } + } + } + +} + + + +# This is the low-level method that is called to retrieve GFF lines from +# the database. It is responsible for retrieving features that satisfy +# range and feature type criteria, and passing the GFF fields to a +# callback subroutine. + +sub get_features{ + my $self = shift; + my $count = 0; + my ($search,$options,$callback) = @_; + my $data = \@{$self->{data}}; + + my $found_features; + + $found_features = _get_features_by_search_options($data,$search,$options); + + # only true if the sort by group option was specified + @{$found_features} = sort {"$a->{gclass}:$a->{gname}" cmp "$b->{gclass}:$b->{gname}"} + @{$found_features} if $options->{sort_by_group} ; + + for my $feature (@{$found_features}) { # only true if the sort by group option was specified + $count++; + $callback->( + @{$feature}{qw(ref start stop source method score strand phase gclass gname tstart tstop feature_id feature_group_id)} + ); + } + + return $count; +} + + +# Low level implementation of fetching a named feature. +# GFF annotations are named using the group class and name fields. +# May return zero, one, or several Bio::DB::GFF::Feature objects. + +=head2 _feature_by_name + + Title : _feature_by_name + Usage : $db->get_features_by_name($name,$class,$callback) + Function: get a list of features by name and class + Returns : count of number of features retrieved + Args : name of feature, class of feature, and a callback + Status : protected + +This method is used internally. The callback arguments are those used +by make_feature(). + +=cut + +sub _feature_by_name { + my $self = shift; + my ($class,$name,$location,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + my $count = 0; + my $id = -1; + my $regexp; + + if ($name =~ /[*?]/) { # uh oh regexp time + $name =~ quotemeta($name); + $name =~ s/\\\*/.*/g; + $name =~ s/\\\?/.?/g; + $regexp++; + } + + + for my $feature (@{$self->{data}}) { + $id++; + next unless ($regexp && $feature->{gname} =~ /$name/i) || $feature->{gname} eq $name; + next unless $feature->{gclass} eq $class; + if ($location) { + next if $location->[0] ne $feature->{ref}; + next if $location->[1] && $location->[1] > $feature->{stop}; + next if $location->[2] && $location->[2] < $feature->{start}; + } + $count++; + $callback->(@{$feature}{qw( + ref + start + stop + source + method + score + strand + phase + gclass + gname + tstart + tstop + )},$id,0 + ); + } + return $count; +} + +# Low level implementation of fetching a feature by it's id. +# The id of the feature as implemented in the in-memory db, is the location of the +# feature in the features hash array. +sub _feature_by_id{ + my $self = shift; + my ($ids,$type,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + + my $feature_group_id = undef; + + my $count = 0; + if ($type eq 'feature'){ + for my $feature_id (@$ids){ + my $feature = ${$self->{data}}[$feature_id]; + + $callback->($feature->{ref}, + $feature->{start}, + $feature->{stop}, + $feature->{source}, + $feature->{method}, + $feature->{score}, + $feature->{strand}, + $feature->{phase}, + $feature->{gclass}, + $feature->{gname}, + $feature->{tstart}, + $feature->{tstop}, + $feature_id, + $feature_group_id); + $count++; + + } + } +} + + +# This method is similar to get_features(), except that it returns an +# iterator across the query. +# See Bio::DB::GFF::Adaptor::memory_iterator. + +sub get_features_iterator { + my $self = shift; + my ($search,$options,$callback) = @_; + $callback || $self->throw('must provide a callback argument'); + + my $data = \@{$self->{data}}; + my $results = _get_features_by_search_options($data,$search,$options); + my $results_array = _convert_feature_hash_to_array($results); + + return Bio::DB::GFF::Adaptor::memory_iterator->new($results_array,$callback); +} + + + + +# This method is responsible for fetching the list of feature type names. +# The query may be limited to a particular range, in +# which case the range is indicated by a landmark sequence name and +# class and its subrange, if any. These arguments may be undef if it is +# desired to retrieve all feature types. + +# If the count flag is false, the method returns a simple list of +# Bio::DB::GFF::Typename objects. If $count is true, the method returns +# a list of $name=>$count pairs, where $count indicates the number of +# times this feature occurs in the range. + +sub get_types { + my $self = shift; + my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_; + my(%result,%obj); + + for my $feature (@{$self->{data}}) { + my $feature_start = $feature->{start}; + my $feature_stop = $feature->{stop}; + my $feature_ref = $feature->{ref}; + my $feature_class = $feature->{class}; + my $feature_method = $feature->{method}; + my $feature_source = $feature->{source}; + + if (defined $srcseq){ + next unless $feature_ref eq $srcseq ; + } + + if (defined $class){ + next unless $feature_class eq $class ; + } + + # the requested range should OVERLAP the retrieved features + if (defined $start or defined $stop) { + $start = 1 unless defined $start; + $stop = MAX_SEGMENT unless defined $stop; + next unless $feature_stop >= $start && $feature_start <= $stop; + } + + if (defined $typelist && @$typelist){ + next unless _matching_typelist($feature_method,$feature_source,$typelist); + } + + my $type = Bio::DB::GFF::Typename->new($feature_method,$feature_source); + $result{$type}++; + $obj{$type} = $type; + + } #end features loop + + return $want_count ? %result : values %obj; + +} + + + + +# Internal method that performs a search on the features array, +# sequentialy retrieves the features, and performs a check on each feature +# according to the search options. +sub _get_features_by_search_options{ + my $count = 0; + my ($data,$search,$options) = @_; + my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes) = + (@{$search}{qw(rangetype refseq refclass start stop types)}, + @{$options}{qw(sparse sort_by_group ATTRIBUTES)}) ; + + my @found_features; + + my $feature_id = -1 ; + my $feature_group_id = undef; + + for my $feature (@{$data}) { + + $feature_id++; + + my $feature_start = $feature->{start}; + my $feature_stop = $feature->{stop}; + my $feature_ref = $feature->{ref}; + + if (defined $refseq){ + next unless $feature_ref eq $refseq; + } + + if (defined $start or defined $stop) { + $start = 0 unless defined($start); + $stop = MAX_SEGMENT unless defined($stop); + + if ($rangetype eq 'overlaps') { + next unless $feature_stop >= $start && $feature_start <= $stop; + } elsif ($rangetype eq 'contains') { + next unless $feature_start >= $start && $feature_stop <= $stop; + } elsif ($rangetype eq 'contained_in') { + next unless $feature_start <= $start && $feature_stop >= $stop; + } else { + next unless $feature_start == $start && $feature_stop == $stop; + } + + } + + my $feature_source = $feature->{source}; + my $feature_method = $feature->{method}; + + if (defined $types && @$types){ + next unless _matching_typelist($feature_method,$feature_source,$types); + } + + my $feature_attributes = $feature->{attributes}; + if (defined $attributes){ + next unless _matching_attributes($feature_attributes,$attributes); + } + + # if we get here, then we have a feature that meets the criteria. + # Then we just push onto an array + # of found features and continue. + + my $found_feature = $feature ; + $found_feature->{feature_id} = $feature_id; + $found_feature->{group_id} = $feature_group_id; + push @found_features,$found_feature; + + } + + return \@found_features; +} + + + + + +# this subroutine is needed for convertion of the feature from hash to array in order to +# pass it to the callback subroutine +sub _convert_feature_hash_to_array{ + my @features_hash_array = @_; + + use constant FREF => 0; + use constant FSTART => 1; + use constant FSTOP => 2; + use constant FSOURCE => 3; + use constant FMETHOD => 4; + use constant FSCORE => 5; + use constant FSTRAND => 6; + use constant FPHASE => 7; + use constant GCLASS => 8; + use constant GNAME => 9; + use constant TSTART => 10; + use constant TSTOP => 11; + use constant FID => 12; + use constant GID => 13; + + my @features_array_array; + my $feature_count = 0; + + for my $feature_hash (@{$features_hash_array[0]}){ + my @feature_array; + + $feature_array[FREF] = $feature_hash->{ref}; + $feature_array[FSTART] = $feature_hash->{start}; + $feature_array[FSTOP] = $feature_hash->{stop}; + $feature_array[FSOURCE] = $feature_hash->{source}; + $feature_array[FMETHOD] = $feature_hash->{method}; + $feature_array[FSCORE] = $feature_hash->{score}; + $feature_array[FSTRAND] = $feature_hash->{strand}; + $feature_array[FPHASE ] = $feature_hash->{phase}; + $feature_array[GCLASS] = $feature_hash->{gclass}; + $feature_array[GNAME] = $feature_hash->{gname}; + $feature_array[TSTART] = $feature_hash->{tstart}; + $feature_array[TSTOP] = $feature_hash->{tstop}; + $feature_array[FID] = $feature_hash->{feature_id}; + $feature_array[GID] = $feature_hash->{group_id}; + + $features_array_array[$feature_count] = \@feature_array; + $feature_count++; + } + return \@features_array_array; +} + +sub _matching_typelist{ + my ($feature_method,$feature_source,$typelist) = @_; + foreach (@$typelist) { + my ($search_method,$search_source) = @$_; + next if $search_method ne $feature_method; + next if defined($search_source) && $search_source ne $feature_source; + return 1; + } + return 0; +} + +sub _matching_attributes{ + my ($feature_attributes,$attributes) = @_ ; + foreach (keys %$attributes) { + return 0 if !_match_all_attr_in_feature($_,$attributes->{$_},$feature_attributes) + + } + return 1; +} + +sub _match_all_attr_in_feature{ + my ($attr_name,$attr_value,$feature_attributes) = @_; + for my $attr (@$feature_attributes) { + my ($feature_attr_name,$feature_attr_value) = @$attr ; + next if ($attr_name ne $feature_attr_name || $attr_value ne $feature_attr_value); + return 1; + } + return 0; +} + + +sub do_initialize { 1; } +sub setup_load { } +sub finish_load { 1; } +sub get_feature_by_group_id{ 1; } + +1; + +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Adaptor/memory_iterator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Adaptor/memory_iterator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,83 @@ +=head1 NAME + +Bio::DB::GFF::Adaptor::memory_iterator - iterator for Bio::DB::GFF::Adaptor::memory + +=head1 SYNOPSIS + +For internal use only + +=head1 DESCRIPTION + +This is an internal module that is used by the Bio::DB::GFF in-memory +adaptor to return an iterator across a sequence feature query. The +object has a single method, next_feature(), that returns the next +feature from the query. The method next_seq() is an alias for +next_feature(). + +=head1 BUGS + +None known yet. + +=head1 SEE ALSO + +L, + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +package Bio::DB::GFF::Adaptor::memory_iterator; +use strict; +# $Id: memory_iterator.pm,v 1.4.2.1 2003/07/05 00:52:31 lstein Exp $ +# this module needs to be cleaned up and documented + +#use constant STH => 0; +#use constant CALLBACK => 1; +#use constant CACHE => 2; + +*next_seq = \&next_feature; + +sub new { + my $class = shift; + my ($data,$callback) = @_; + my $pos = 0; + return bless {data => $data, + pos => $pos, + callback => $callback, + cache => []},$class; + #return bless [$sth,$callback,[]],$class; +} + +sub next_feature { + my $self = shift; + return shift @{$self->{cache}} if @{$self->{cache}}; + + my $data = $self->{data} or return; + my $callback = $self->{callback}; + + my $features; + while (1) { + my $feature = $data->[$self->{pos}++]; + if ($feature) { + $features = $callback->(@{$feature}); + last if $features; + } else { + $features = $callback->(); + undef $self->{pos}; + undef $self->{data}; + undef $self->{cache}; + last; + } + } + $self->{cache} = $features or return; + shift @{$self->{cache}}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,592 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator -- Aggregate GFF groups into composite features + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + my $agg1 = Bio::DB::GFF::Aggregator->new(-method => 'cistron', + -main_method => 'locus', + -sub_parts => ['allele','variant'] + ); + + my $agg2 = Bio::DB::GFF::Aggregator->new(-method => 'splice_group', + -sub_parts => 'transcript'); + + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -aggregator => [$agg1,$agg2], + -dsn => 'dbi:mysql:elegans42', + ); + + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator is used to aggregate GFF groups into +composite features. Each composite feature has a "main part", the +top-level feature, and a series of zero or more subparts, retrieved +with the sub_SeqFeature() method. The aggregator class is designed to +be subclassable, allowing a variety of GFF feature types to be +supported. + +The base Bio::DB::GFF::Aggregator class is generic, and can be used to +create specific instances to be passed to the -aggregator argument of +Bio::DB::GFF-Enew() call. The various subclasses of +Bio::DB::GFF::Aggregator are tuned for specific common feature types +such as clones, gapped alignments and transcripts. + +Instances of Bio::DB::GFF::Aggregator have three attributes: + +=over 3 + +=item method + +This is the GFF method field of the composite feature as a whole. For +example, "transcript" may be used for a composite feature created by +aggregating individual intron, exon and UTR features. + +=item main method + +Sometimes GFF groups are organized hierarchically, with one feature +logically containing another. For example, in the C. elegans schema, +methods of type "Sequence:curated" correspond to regions covered by +curated genes. There can be zero or one main methods. + +=item subparts + +This is a list of one or more methods that correspond to the component +features of the aggregates. For example, in the C. elegans database, +the subparts of transcript are "intron", "exon" and "CDS". + +=back + +Aggregators have two main methods that can be overridden in +subclasses: + +=over 4 + +=item disaggregate() + +This method is called by the Adaptor object prior to fetching a list +of features. The method is passed an associative array containing the +[method,source] pairs that the user has requested, and it returns a +list of raw features that it would like the adaptor to fetch. + +=item aggregate() + +This method is called by the Adaptor object after it has fetched +features. The method is passed a list of raw features and is expected +to add its composite features to the list. + +=back + +The disaggregate() and aggregate() methods provided by the base +Aggregator class should be sufficient for many applications. In this +case, it suffices for subclasses to override the following methods: + +=over 4 + +=item method() + +Return the default method for the composite feature as a whole. + +=item main_name() + +Return the default main method name. + +=item part_names() + +Return a list of subpart method names. + +=back + +Provided that method() and part_names() are overridden (and optionally +main_name() as well), then the bare name of the aggregator subclass +can be passed to the -aggregator of Bio::DB::GFF-Enew(). For example, +this is a small subclass that will aggregate features of type "allele" +and "polymorphism" into an aggregate named "mutant": + + package Bio::DB::GFF::Aggregator::mutant; + + use strict; + use Bio::DB::GFF::Aggregator; + + use vars '@ISA'; + @ISA = 'Bio::DB::GFF::Aggregator'; + + sub method { 'mutant' } + + sub part_names { + return qw(allele polymorphism); + } + + 1; + +Once installed, this aggregator can be passed to Bio::DB::GFF-Enew() +by name like so: + + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -aggregator => 'mutant', + -dsn => 'dbi:mysql:elegans42', + ); + +=head1 API + +The remainder of this document describes the public and private +methods implemented by this module. + +=cut + +package Bio::DB::GFF::Aggregator; + +use strict; +use Bio::DB::GFF::Util::Rearrange; # for rearrange() +use Bio::DB::GFF::Feature; +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root); + +my $ALWAYS_TRUE = sub { 1 }; + +=head2 new + + Title : new + Usage : $a = Bio::DB::GFF::Aggregator->new(@args) + Function: create a new aggregator + Returns : a Bio::DB::GFF::Aggregator object + Args : see below + Status : Public + +This is the constructor for Bio::DB::GFF::Aggregator. Named arguments +are as follows: + + -method the method for the composite feature + + -main_method the top-level raw feature, if any + + -sub_parts the list of raw features that will form the subparts + of the composite feature (array reference or scalar) + +=cut + +sub new { + my $class = shift; + my ($method,$main,$sub_parts) = rearrange(['METHOD', + ['MAIN_PART','MAIN_METHOD'], + ['SUB_METHODS','SUB_PARTS'] + ],@_); + return bless { + method => $method, + main_method => $main, + sub_parts => $sub_parts, + },$class; +} + +=head2 disaggregate + + Title : disaggregate + Usage : $a->disaggregate($types,$factory) + Function: disaggregate type list into components + Returns : a true value if this aggregator should be called to reaggregate + Args : see below + Status : Public + +This method is called to disaggregate a list of types into the set of +low-level features to be retrieved from the GFF database. The list of +types is passed as an array reference containing a series of +[method,source] pairs. This method synthesizes a new set of +[method,source] pairs, and appends them to the list of requested +types, changing the list in situ. + +Arguments: + + $types reference to an array of [method,source] pairs + + $factory reference to the Adaptor object that is calling + this method + +Note that the API allows disaggregate() to remove types from the type +list. This feature is probably not desirable and may be deprecated in +the future. + +=cut + +# this is called at the beginning to turn the pseudo-type +# into its component feature types +sub disaggregate { + my $self = shift; + my $types = shift; + my $factory = shift; + + my $sub_features = $factory->parse_types($self->get_part_names); + my $main_feature = $factory->parse_types($self->get_main_name); + + if (@$types) { + my (@synthetic_types,@unchanged); + foreach (@$types) { + my ($method,$source) = @$_; + if (lc $method eq lc $self->get_method) { # e.g. "transcript" + push @synthetic_types,map { [$_->[0],$_->[1] || $source] } @$sub_features,@$main_feature; + } + else { + push @unchanged,$_; + } + } + # remember what we're searching for + $self->components(\@synthetic_types); + $self->passthru(\@unchanged); + @$types = (@unchanged,@synthetic_types); + } + + # we get here when no search types are listed + else { + my @stypes = map { [$_->[0],$_->[1]] } @$sub_features,@$main_feature; + $self->components(\@stypes); + $self->passthru(undef); + } + + return $self->component_count > 0; +} + + +=head2 aggregate + + Title : aggregate + Usage : $features = $a->aggregate($features,$factory) + Function: aggregate a feature list into composite features + Returns : an array reference containing modified features + Args : see below + Status : Public + +This method is called to aggregate a list of raw GFF features into the +set of composite features. The method is called an array reference to +a set of Bio::DB::GFF::Feature objects. It runs through the list, +creating new composite features when appropriate. The method result +is an array reference containing the composite features. + +Arguments: + + $features reference to an array of Bio::DB::GFF::Feature objects + + $factory reference to the Adaptor object that is calling + this method + +NOTE: The reason that the function result contains the raw features as +well as the aggregated ones is to allow queries like this one: + + @features = $segment->features('exon','transcript:curated'); + +Assuming that "transcript" is the name of an aggregated feature and +that "exon" is one of its components, we do not want the transcript +aggregator to remove features of type "exon" because the user asked +for them explicitly. + +=cut + +sub aggregate { + my $self = shift; + my $features = shift; + my $factory = shift; + + my $main_method = $self->get_main_name; + my $matchsub = $self->match_sub($factory) or return; + my $passthru = $self->passthru_sub($factory); + + my (%aggregates,@result); + for my $feature (@$features) { + if ($feature->group && $matchsub->($feature)) { + if ($main_method && lc $feature->method eq lc $main_method) { + $aggregates{$feature->group,$feature->refseq}{base} ||= $feature->clone; + } else { + push @{$aggregates{$feature->group,$feature->refseq}{subparts}},$feature; + } + push @result,$feature if $passthru && $passthru->($feature); + + } else { + push @result,$feature; + } + } + + # aggregate components + my $pseudo_method = $self->get_method; + my $require_whole_object = $self->require_whole_object; + foreach (keys %aggregates) { + if ($require_whole_object && $self->components) { + next unless $aggregates{$_}{base} && $aggregates{$_}{subparts}; + } + my $base = $aggregates{$_}{base}; + unless ($base) { # no base, so create one + my $first = $aggregates{$_}{subparts}[0]; + $base = $first->clone; # to inherit parent coordinate system, etc + $base->score(undef); + $base->phase(undef); + } + $base->method($pseudo_method); + $base->add_subfeature($_) foreach @{$aggregates{$_}{subparts}}; + $base->adjust_bounds; + $base->compound(1); # set the compound flag + push @result,$base; + } + @$features = @result; +} + + +=head2 method + + Title : method + Usage : $string = $a->method + Function: get the method type for the composite feature + Returns : a string + Args : none + Status : Protected + +This method is called to get the method to be assigned to the +composite feature once it is aggregated. It is called if the user did +not explicitly supply a -method argument when the aggregator was +created. + +This is the method that should be overridden in aggregator subclasses. + +=cut + +# no default method +sub method { + my $self = shift; + return; +} + +=head2 main_name + + Title : main_name + Usage : $string = $a->main_name + Function: get the method type for the "main" component of the feature + Returns : a string + Args : none + Status : Protected + +This method is called to get the method of the "main component" of the +composite feature. It is called if the user did not explicitly supply +a -main-method argument when the aggregator was created. + +This is the method that should be overridden in aggregator subclasses. + +=cut + +# no default main method +sub main_name { + my $self = shift; + return; +} + +=head2 part_names + + Title : part_names + Usage : @methods = $a->part_names + Function: get the methods for the non-main various components of the feature + Returns : a list of strings + Args : none + Status : Protected + +This method is called to get the list of methods of the "main component" of the +composite feature. It is called if the user did not explicitly supply +a -main-method argument when the aggregator was created. + +This is the method that should be overridden in aggregator subclasses. + +=cut + +# no default part names +sub part_names { + my $self = shift; + return; +} + +=head2 require_whole_object + + Title : require_whole_object + Usage : $bool = $a->require_whole_object + Function: see below + Returns : a boolean flag + Args : none + Status : Internal + +This method returns true if the aggregator should refuse to aggregate +an object unless both its main part and its subparts are present. + +=cut + +sub require_whole_object { 0; } + +=head2 match_sub + + Title : match_sub + Usage : $coderef = $a->match_sub($factory) + Function: generate a code reference that will match desired features + Returns : a code reference + Args : see below + Status : Internal + +This method is used internally to generate a code sub that will +quickly filter out the raw features that we're interested in +aggregating. The returned sub accepts a Feature and returns true if +we should aggregate it, false otherwise. + +=cut + +sub match_sub { + my $self = shift; + my $factory = shift; + my $types_to_aggregate = $self->components() or return; # saved from disaggregate call + return unless @$types_to_aggregate; + return $factory->make_match_sub($types_to_aggregate); +} + +sub passthru_sub { + my $self = shift; + my $factory = shift; + my $passthru = $self->passthru() or return; + return unless @$passthru; + return $factory->make_match_sub($passthru); +} + +=head2 components + + Title : components + Usage : @array= $a->components([$components]) + Function: get/set stored list of parsed raw feature types + Returns : an array in list context, an array ref in scalar context + Args : new arrayref of feature types + Status : Internal + +This method is used internally to remember the parsed list of raw +features that we will aggregate. The need for this subroutine is +seen when a user requests a composite feature of type +"clone:cosmid". This generates a list of components in which the +source is appended to the method, like "clone_left_end:cosmid" and +"clone_right_end:cosmid". components() stores this information for +later use. + +=cut + +sub components { + my $self = shift; + my $d = $self->{components}; + $self->{components} = shift if @_; + return unless ref $d; + return wantarray ? @$d : $d; +} + +sub component_count { + my @c = shift->components; + scalar @c; +} + +sub passthru { + my $self = shift; + my $d = $self->{passthru}; + $self->{passthru} = shift if @_; + return unless ref $d; + return wantarray ? @$d : $d; +} + +sub clone { + my $self = shift; + my %new = %{$self}; + return bless \%new,ref($self); +} + +=head2 get_part_names + + Title : get_part_names + Usage : @array = $a->get_part_names + Function: get list of sub-parts for this type of feature + Returns : an array + Args : none + Status : Internal + +This method is used internally to fetch the list of feature types that +form the components of the composite feature. Type names in the +format "method:source" are recognized, as are "method" and +Bio::DB::GFF::Typename objects as well. It checks instance variables +first, and if not defined calls the part_names() method. + +=cut + +sub get_part_names { + my $self = shift; + if ($self->{sub_parts}) { + return ref $self->{sub_parts} ? @{$self->{sub_parts}} : $self->{sub_parts}; + } else { + return $self->part_names; + } +} + +=head2 get_main_name + + Title : get_main_name + Usage : $string = $a->get_main_name + Function: get the "main" method type for this feature + Returns : a string + Args : none + Status : Internal + +This method is used internally to fetch the type of the "main part" of +the feature. It checks instance variables first, and if not defined +calls the main_name() method. + +=cut + +sub get_main_name { + my $self = shift; + return $self->{main_method} if defined $self->{main_method}; + return $self->main_name; +} + +=head2 get_method + + Title : get_method + Usage : $string = $a->get_method + Function: get the method type for the composite feature + Returns : a string + Args : none + Status : Internal + +This method is used internally to fetch the type of the method that +will be assigned to the composite feature once it is synthesized. + +=cut + +sub get_method { + my $self = shift; + return $self->{method} if defined $self->{method}; + return $self->method; +} + +1; + +=head1 BUGS + +None known yet. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/alignment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/alignment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,136 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::alignment -- Alignment aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['alignment'], + ); + + ----------------------------- + Aggregator method: alignment + Main method: (none) + Sub methods: similarity, HSP + ----------------------------- + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator::alignment is one of the default aggregators, +and was written to be compatible with the C elegans GFF files. It +aggregates raw "similarity" features into composite features of type +"alignment". A better name for this class might be +"gapped_alignment." + +This aggregator does not insist that there be a single top-level +feature that spans one end of the alignment to the other. As a +result, it can produce truncated alignments if the entire alignment is +not contained within the segment of interest. + +=cut + +package Bio::DB::GFF::Aggregator::alignment; + +use strict; + +use Bio::DB::GFF::Aggregator; +use vars qw(@ISA); + +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 aggregate + + Title : aggregate + Usage : $features = $a->aggregate($features,$factory) + Function: aggregate a feature list into composite features + Returns : an array reference containing modified features + Args : see L + Status : Public + +Because of the large number of similarity features, the aggregate() +method is overridden in order to perform some optimizations. + +=cut + +# we look for features of type Sequence and add them to a pseudotype transcript +sub aggregate { + my $self = shift; + my $features = shift; + my $factory = shift; + + my $matchsub = $self->match_sub($factory) or return; + my $passthru = $self->passthru_sub($factory); + my $method = $self->get_method; + + my (%alignments,%targets,@result); + + warn "running alignment aggregator" if $factory->debug; + for my $feature (@$features) { + + if ($matchsub->($feature)) { + + my $group = $feature->{group}; + my $source = $feature->source; + unless (exists $alignments{$group,$source}) { + my $type = Bio::DB::GFF::Typename->new($method,$source); + + my $f = $feature->clone; + # this is a violation of OO encapsulation, but need to do it this way + # to achieve desired performance + @{$f}{qw(type score phase)} = ($type,undef,undef); + + $alignments{$group,$source} = $f or next; + } + + my $main = $alignments{$group,$source}; + $main->add_subfeature($feature); + push @result,$feature if $passthru && $passthru->($feature); + } else { + push @result,$feature; + } + } + + warn "running aligner adjuster" if $factory->debug; + for my $alignment (values %alignments) { + $alignment->adjust_bounds; + $alignment->compound(1); + push @result,$alignment; + } + warn "aligner done" if $factory->debug; + @$features = @result; +} + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "alignment" + Args : none + Status : Public + +=cut + +sub method { 'alignment' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : the list ("similarity","HSP") + Args : none + Status : Public + +=cut + +sub part_names { + my $self = shift; + return qw(similarity HSP); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/clone.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/clone.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,162 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::clone -- Clone aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ---------------------------------------------------------------------------- + Aggregator method: clone + Main method: -none- + Sub methods: Clone_left_end Clone_right_end Sequence:Genomic_canonical + ---------------------------------------------------------------------------- + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator::clone is one of the default aggregators, and +was written to be compatible with the C elegans GFF files. It +aggregates raw "Clone_left_end", "Clone_right_end", and +"Sequence:Genomic_canonical" features into composite features of type +"clone". + +=cut + +package Bio::DB::GFF::Aggregator::clone; + +use strict; + +use Bio::DB::GFF::Aggregator; +use vars qw(@ISA); + +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 aggregate + + Title : aggregate + Usage : $features = $a->aggregate($features,$factory) + Function: aggregate a feature list into composite features + Returns : an array reference containing modified features + Args : see L + Status : Public + +The WormBase GFF model is unusual in that clones aren't identified as +a single feature with start and stop positions, but as two features, a +"left end" and a "right end". One or both of these features may be +absent. In order to accomodate this, the aggregator will return undef +for the start and/or stop if one or both of the ends are missing. + +=cut + +#' + +# we look for features of type Sequence and add them to a pseudotype transcript +sub aggregate { + my $self = shift; + my $features = shift; + my $factory = shift; + + my $matchsub = $self->match_sub($factory) or return; + my $passthru = $self->passthru_sub($factory); + my $method = $self->get_method; + + my (%clones,%types,@result); + for my $feature (@$features) { + + if ($feature->group && $matchsub->($feature)) { + + if ($feature->method eq 'Sequence' && $feature->source eq 'Genomic_canonical') { + $clones{$feature->group}{canonical} = $feature; + } elsif ($feature->method eq 'Clone_left_end') { + $clones{$feature->group}{left} = $feature; + } elsif ($feature->method eq 'Clone_right_end') { + $clones{$feature->group}{right} = $feature; + } + push @result,$feature if $passthru && $passthru->($feature); + } else { + push @result,$feature; + } + } + + for my $clone (keys %clones) { + my $canonical = $clones{$clone}{canonical} or next; + + # the genomic_canonical doesn't tell us where the clone starts and stops + # so don't assume it + my $duplicate = $canonical->clone; # make a duplicate of the feature + # munge the method and source fields + my $source = $duplicate->source; + my $type = $types{$method,$source} ||= Bio::DB::GFF::Typename->new($method,$source); + $duplicate->type($type); + + my ($start,$stop) = $duplicate->strand > 0 ? ('start','stop') : ('stop','start'); + @{$duplicate}{$start,$stop} =(undef,undef); + + $duplicate->{$start} = $clones{$clone}{left}{$start} if exists $clones{$clone}{left}; + $duplicate->{$stop} = $clones{$clone}{right}{$stop} if exists $clones{$clone}{right}; + $duplicate->method($self->method); + push @result,$duplicate; + } + + @$features = @result; +} + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "clone" + Args : none + Status : Public + +=cut + +sub method { 'clone' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : the list ("Clone_left_end", "Clone_right_end", "Sequence:Genomic_canonical") + Args : none + Status : Public + +=cut + +sub part_names { + my $self = shift; + return qw(Clone_left_end Clone_right_end Sequence:Genomic_canonical); +} + +1; + +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/coding.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/coding.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,104 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::coding -- The Coding Region Aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['coding'], + ); + + ------------------------------------------------------------------------ + Aggregator method: coding + Main method: mRNA + Sub methods: CDS + ------------------------------------------------------------------------ + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator::coding aggregates "CDS" features into a +feature called "coding" and was written to be compatible with the +Sequence Ontology canonical gene. The CDS features are expected to +belong to a parent of type "mRNA," but the aggregator will work even +if this isn't the case. + +=cut + +package Bio::DB::GFF::Aggregator::coding; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "processed_transcript" + Args : none + Status : Public + +=cut + +sub method { 'coding' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : the list (CDS cds) + Args : none + Status : Public + +=cut + +sub part_names { + return qw(CDS cds); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "mRNA" + Args : none + Status : Public + +=cut + +sub main_name { + return 'mRNA'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/match.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/match.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::match -- Match aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['match'], + ); + + ------------------------------------------------- + Aggregator method: match + Main method: match + Sub methods: similarity HSP + ------------------------------------------------- + +=head1 DESCRIPTION + +This aggregator is used for Sequence Ontology-compatible gapped +alignments, in which there is a single top-level alignment called +"match" and a series of subalignments called either "similarity" or +"HSP". + +Also see the "alignment" aggregator. + +=cut + +package Bio::DB::GFF::Aggregator::match; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "match" + Args : none + Status : Public + +=cut + +sub method { 'match' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : the list "similarity", "HSP" + Args : none + Status : Public + +=cut + +sub part_names { + return qw(similarity HSP); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "match" + Args : none + Status : Public + +=cut + +sub main_name { + return 'match'; +} + +sub require_whole_object {1} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/none.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/none.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,45 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::none -- No aggregation + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => 'none' + ); + + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator::none can be used to indicate that you do not +want any aggregation performed. It is equivalent to providing undef +to the B<-aggregator> argument. It overrides disaggregate() and +aggregate() so that they do exactly nothing. + +=cut + +package Bio::DB::GFF::Aggregator::none; + +use strict; +use Bio::DB::GFF::Aggregator; +use vars qw(@ISA); + +@ISA = qw(Bio::DB::GFF::Aggregator); + +sub disaggregate { + my $self = shift; + my $types = shift; + # no change +} + +sub aggregate { + my $self = shift; + my $features = shift; + return; # no change +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/processed_transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/processed_transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,108 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::processed_transcript -- Sequence Ontology Transcript + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['processed_transcript'], + ); + + ------------------------------------------------------------------------ + Aggregator method: processed_transcript + Main method: mRNA + Sub methods: CDS 5'-UTR 3'-UTR transcription_start_site polyA_site + ------------------------------------------------------------------------ + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator::processed_transcript is one of the default +aggregators, and was written to be compatible with the Sequence +Ontology canonical gene. It aggregates raw "CDS", "5'-UTR", "3'-UTR", +"transcription_start_site" and "polyA_site" features into "gene" +features. The UTRs may also be named "untranslated_region," +"five_prime_untranslated_region," or +"three_prime_untranslated_region." + +=cut + +package Bio::DB::GFF::Aggregator::processed_transcript; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "processed_transcript" + Args : none + Status : Public + +=cut + +sub method { 'processed_transcript' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : the list CDS 5'-UTR 3'-UTR transcription_start_site polyA_site + Args : none + Status : Public + +=cut + +sub part_names { + return qw(CDS 5'-UTR 3'-UTR transcription_start_site + polyA_site UTR five_prime_untranslated_region + three_prime_untranslated_region); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "mRNA" + Args : none + Status : Public + +=cut + +sub main_name { + return 'mRNA'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,116 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::transcript -- Transcript aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: transcript + Main method: transcript + Sub methods: exon CDS 5'UTR 3'UTR TSS PolyA + ------------------------------------------------- + +=head1 DESCRIPTION + +Bio::DB::GFF::Aggregator::transcript is one of the default +aggregators, and was written to be compatible with the C elegans GFF +files. It aggregates raw ""exon", "CDS", "5'UTR", "3'UTR", "polyA" +and "TSS" features into "transcript" features. For compatibility with +the idiosyncrasies of the Sanger GFF format, it expects that the full +range of the transcript is contained in a main feature of type +"Transcript" (notice the capital "T"). + +Internally this module is very simple. To override it with one that +recognizes a main feature named "gene", simply follow this +template: + + my $db = Bio::DB::GFF->new(...etc...) + my $aggregator = Bio::DB::GFF::Aggregator->new(-method => 'transcript', + -main_method => 'gene', + -sub_parts => ['exon','CDS']); + $db->add_aggregator($aggregator); + +=cut + +package Bio::DB::GFF::Aggregator::transcript; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "transcript" + Args : none + Status : Public + +=cut + +sub method { 'transcript' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : the list "intron", "exon" and "CDS" + Args : none + Status : Public + +=cut + +sub part_names { + return qw(exon CDS 5'UTR 3'UTR TSS PolyA); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_acembly.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_acembly.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,101 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_acembly -- UCSC acembly aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: transcript + Main method: transcript + Sub methods: exon CDS 5'UTR 3'UTR TSS PolyA + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_acembly; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "acembly" + Args : none + Status : Public + +=cut + +sub method { 'acembly' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:acembly" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:acembly'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,101 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_ensgene -- UCSC ensGene aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: ensgene + Main method: transcript + Sub methods: ensGene + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_ensgene; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "ensgene" + Args : none + Status : Public + +=cut + +sub method { 'ensgene' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:ensGene" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:ensGene'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_genscan.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_genscan.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,100 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_genscan -- UCSC genscan aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: genscan + Main method: transcript + Sub methods: genscan + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_genscan; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "genscan" + Args : none + Status : Public + +=cut + +sub method { 'genscan' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:genscan" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:genscan'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_refgene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_refgene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,100 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_refgene -- UCSC refGene aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: refgene + Main method: transcript + Sub methods: refGene + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_refgene; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "refgene" + Args : none + Status : Public + +=cut + +sub method { 'refgene' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:refGene" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:refGene'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,101 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_sanger22 -- UCSC sanger22 aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: sanger22 + Main method: transcript + Sub methods: sanger22 + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_sanger22; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "sanger22" + Args : none + Status : Public + +=cut + +sub method { 'sanger22' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:sanger22" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:sanger22'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,101 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_sanger22pseudo -- UCSC sanger22pseudo aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: sanger22pseudo + Main method: transcript + Sub methods: sanger22pseudo + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_sanger22pseudo; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "sanger22pseudo" + Args : none + Status : Public + +=cut + +sub method { 'sanger22pseudo' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:sanger22pseudo" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:sanger22pseudo'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_softberry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_softberry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,100 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_softberry -- UCSC softberry aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: softberry + Main method: transcript + Sub methods: softberryGene + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_softberry; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "softberry" + Args : none + Status : Public + +=cut + +sub method { 'softberry' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:softberryGene" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:softberryGene'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,100 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_twinscan -- UCSC twinscan aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: twinscan + Main method: transcript + Sub methods: twinscan + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_twinscan; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "twinscan" + Args : none + Status : Public + +=cut + +sub method { 'twinscan' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript:twinscan" + Args : none + Status : Public + +=cut + +sub main_name { + return 'transcript:twinscan'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_unigene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Aggregator/ucsc_unigene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,102 @@ +=head1 NAME + +Bio::DB::GFF::Aggregator::ucsc_unigene -- UCSC UniGene aggregator + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42', + -aggregator => ['transcript','clone'], + ); + + ------------------------------------------------- + Aggregator method: unigene + Main method: transcript + Sub methods: unigene_2 + ------------------------------------------------- + +=head1 DESCRIPTION + +L + +=cut + +package Bio::DB::GFF::Aggregator::ucsc_unigene; + +use strict; +use Bio::DB::GFF::Aggregator; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Aggregator); + + +=head2 method + + Title : method + Usage : $aggregator->method + Function: return the method for the composite object + Returns : the string "unigene" + Args : none + Status : Public + +=cut + +sub method { 'unigene' } + +=head2 part_names + + Title : part_names + Usage : $aggregator->part_names + Function: return the methods for the sub-parts + Returns : empty list + Args : none + Status : Public + +=cut + +sub part_names { + return (); +} + +=head2 main_name + + Title : main_name + Usage : $aggregator->main_name + Function: return the method for the main component + Returns : the string "transcript" + Args : none + Status : Public + +=cut + +sub main_name { +#transcript + return 'transcript:uniGene_2'; +} + +1; +__END__ + +=head1 BUGS + +None reported. + + +=head1 SEE ALSO + +L, L + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE. + +Copyright (c) 2002 Allen Day, University of California, Los Angeles. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Featname.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Featname.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,155 @@ +=head1 NAME + +Bio::DB::GFF::Featname -- The name of a feature + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', + -dsn => 'dbi:mysql:elegans42'); + + my $feature = Bio::DB::GFF::Featname->new(Locus => 'unc-19'); + my $segment = $db->segment($feature); + +=head1 DESCRIPTION + +Bio::DB::GFF::Featname is the name of a feature. It contains two +fields: name and class. It is typically used by the Bio::DB::GFF +module to denote a group, and is accepted by +Bio::DB::Relsegment-Enew() and Bio::DB::GFF-Esegment() as a +replacement for the -name and -class arguments. + +=head1 METHODS + +=cut + +package Bio::DB::GFF::Featname; +use strict; +use vars '@ISA'; +use Bio::Root::RootI; +@ISA = qw(Bio::Root::RootI); + +use overload + '""' => 'asString', + fallback => 1; + +=head2 new + + Title : new + Usage : $name = Bio::DB::GFF::Featname->new($class,$name) + Function: create a new Bio::DB::GFF::Featname object + Returns : a new Bio::DB::GFF::Featname object + Args : class and ID + Status : Public + +=cut + +sub new { + # use a blessed array for speed + my $pack = shift; + bless [@_],$pack; # class,name +} + +sub _cleanup_methods { return; } + +=head2 id + + Title : id + Usage : $id = $name->id + Function: return a unique ID for the combination of class and name + Returns : a string + Args : none + Status : Public + +This method returns a unique combination of the name and class in the +form "class:name". Coincidentally, this is the same format used +by AceDB. + +=cut + +sub id { + my $self = shift; + return join ':',@$self; +} + +=head2 name + + Title : name + Usage : $name = $name->name + Function: return the name of the Featname + Returns : a string + Args : none + Status : Public + +=cut + +sub name { shift->[1] } + +=head2 class + + Title : class + Usage : $class = $name->class + Function: return the name of the Featname + Returns : a string + Args : none + Status : Public + +=cut + +sub class { shift->[0] } + +=head2 asString + + Title : asString + Usage : $string = $name->asString + Function: same as name() + Returns : a string + Args : none + Status : Public + +This method is used to overload the "" operator. It is equivalent to +calling name(). + +=cut + +sub asString { shift->name } + +=head2 clone + + Title : clone + Usage : $new_clone = $type->clone; + Function: clone this object + Returns : a new Bio::DB::GFF::Featname object + Args : none + Status : Public + +This method creates an exact copy of the object. + +=cut + +sub clone { + my $self = shift; + return bless [@$self],ref $self; +} + +=head1 BUGS + +This module is still under development. + +=head1 SEE ALSO + +L, L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Feature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Feature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1116 @@ +=head1 NAME + +Bio::DB::GFF::Feature -- A relative segment identified by a feature type + +=head1 SYNOPSIS + +See L. + +=head1 DESCRIPTION + +Bio::DB::GFF::Feature is a stretch of sequence that corresponding to a +single annotation in a GFF database. It inherits from +Bio::DB::GFF::RelSegment, and so has all the support for relative +addressing of this class and its ancestors. It also inherits from +Bio::SeqFeatureI and so has the familiar start(), stop(), +primary_tag() and location() methods (it implements Bio::LocationI +too, if needed). + +Bio::DB::GFF::Feature adds new methods to retrieve the annotation's +type, group, and other GFF attributes. Annotation types are +represented by Bio::DB::GFF::Typename objects, a simple class that has +two methods called method() and source(). These correspond to the +method and source fields of a GFF file. + +Annotation groups serve the dual purpose of giving the annotation a +human-readable name, and providing higher-order groupings of +subfeatures into features. The groups returned by this module are +objects of the Bio::DB::GFF::Featname class. + +Bio::DB::GFF::Feature inherits from and implements the abstract +methods of Bio::SeqFeatureI, allowing it to interoperate with other +Bioperl modules. + +Generally, you will not create or manipulate Bio::DB::GFF::Feature +objects directly, but use those that are returned by the +Bio::DB::GFF::RelSegment-Efeatures() method. + +=head2 Important note about start() vs end() + +If features are derived from segments that use relative addressing +(which is the default), then start() will be less than end() if the +feature is on the opposite strand from the reference sequence. This +breaks Bio::SeqI compliance, but is necessary to avoid having the real +genomic locations designated by start() and end() swap places when +changing reference points. + +To avoid this behavior, call $segment-Eabsolute(1) before fetching +features from it. This will force everything into absolute +coordinates. + +For example: + + my $segment = $db->segment('CHROMOSOME_I'); + $segment->absolute(1); + my @features = $segment->features('transcript'); + +=head1 API + +The remainder of this document describes the public and private +methods implemented by this module. + +=cut + +package Bio::DB::GFF::Feature; + +use strict; + +use Bio::DB::GFF::Util::Rearrange; +use Bio::DB::GFF::RelSegment; +use Bio::DB::GFF::Featname; +use Bio::DB::GFF::Typename; +use Bio::DB::GFF::Homol; +use Bio::SeqFeatureI; +use Bio::Root::Root; +use Bio::LocationI; + +use vars qw(@ISA $AUTOLOAD); +@ISA = qw(Bio::DB::GFF::RelSegment Bio::SeqFeatureI + Bio::Root::Root); + +#' + +*segments = \&sub_SeqFeature; +my %CONSTANT_TAGS = (method=>1, source=>1, score=>1, phase=>1, notes=>1, id=>1, group=>1); + +=head2 new_from_parent + + Title : new_from_parent + Usage : $f = Bio::DB::GFF::Feature->new_from_parent(@args); + Function: create a new feature object + Returns : new Bio::DB::GFF::Feature object + Args : see below + Status : Internal + +This method is called by Bio::DB::GFF to create a new feature using + +information obtained from the GFF database. It is one of two similar +constructors. This one is called when the feature is generated from a +RelSegment object, and should inherit that object's coordinate system. + +The 13 arguments are positional (sorry): + + $parent a Bio::DB::GFF::RelSegment object (or descendent) + $start start of this feature + $stop stop of this feature + $method this feature's GFF method + $source this feature's GFF source + $score this feature's score + $fstrand this feature's strand (relative to the source + sequence, which has its own strandedness!) + $phase this feature's phase + $group this feature's group (a Bio::DB::GFF::Featname object) + $db_id this feature's internal database ID + $group_id this feature's internal group database ID + $tstart this feature's target start + $tstop this feature's target stop + +tstart and tstop aren't used for anything at the moment, since the +information is embedded in the group object. + +=cut + +# this is called for a feature that is attached to a parent sequence, +# in which case it inherits its coordinate reference system and strandedness +sub new_from_parent { + my $package = shift; + my ($parent, + $start,$stop, + $method,$source,$score, + $fstrand,$phase, + $group,$db_id,$group_id, + $tstart,$tstop) = @_; + + ($start,$stop) = ($stop,$start) if defined($fstrand) and $fstrand eq '-'; + my $class = $group ? $group->class : $parent->class; + + my $self = bless { + factory => $parent->{factory}, + sourceseq => $parent->{sourceseq}, + strand => $parent->{strand}, + ref => $parent->{ref}, + refstart => $parent->{refstart}, + refstrand => $parent->{refstrand}, + absolute => $parent->{absolute}, + start => $start, + stop => $stop, + type => Bio::DB::GFF::Typename->new($method,$source), + fstrand => $fstrand, + score => $score, + phase => $phase, + group => $group, + db_id => $db_id, + group_id => $group_id, + class => $class, + },$package; + $self; +} + +=head2 new + + Title : new + Usage : $f = Bio::DB::GFF::Feature->new(@args); + Function: create a new feature object + Returns : new Bio::DB::GFF::Feature object + Args : see below + Status : Internal + +This method is called by Bio::DB::GFF to create a new feature using +information obtained from the GFF database. It is one of two similar +constructors. This one is called when the feature is generated +without reference to a RelSegment object, and should therefore use its +default coordinate system (relative to itself). + +The 11 arguments are positional: + + $factory a Bio::DB::GFF adaptor object (or descendent) + $srcseq the source sequence + $start start of this feature + $stop stop of this feature + $method this feature's GFF method + $source this feature's GFF source + $score this feature's score + $fstrand this feature's strand (relative to the source + sequence, which has its own strandedness!) + $phase this feature's phase + $group this feature's group + $db_id this feature's internal database ID + +=cut + +# 'This is called when creating a feature from scratch. It does not have +# an inherited coordinate system. +sub new { + my $package = shift; + my ($factory, + $srcseq, + $start,$stop, + $method,$source, + $score,$fstrand,$phase, + $group,$db_id,$group_id, + $tstart,$tstop) = @_; + + my $self = bless { },$package; + ($start,$stop) = ($stop,$start) if defined($fstrand) and $fstrand eq '-'; + + my $class = $group ? $group->class : 'Sequence'; + + @{$self}{qw(factory sourceseq start stop strand class)} = + ($factory,$srcseq,$start,$stop,$fstrand,$class); + + # if the target start and stop are defined, then we use this information to create + # the reference sequence + # THIS SHOULD BE BUILT INTO RELSEGMENT + if (0 && $tstart ne '' && $tstop ne '') { + if ($tstart < $tstop) { + @{$self}{qw(ref refstart refstrand)} = ($group,$start - $tstart + 1,'+'); + } else { + @{$self}{'start','stop'} = @{$self}{'stop','start'}; + @{$self}{qw(ref refstart refstrand)} = ($group,$tstop + $stop - 1,'-'); + } + + } else { + @{$self}{qw(ref refstart refstrand)} = ($srcseq,1,'+'); + } + + @{$self}{qw(type fstrand score phase group db_id group_id absolute)} = + (Bio::DB::GFF::Typename->new($method,$source),$fstrand,$score,$phase, + $group,$db_id,$group_id,$factory->{absolute}); + + $self; +} + +=head2 type + + Title : type + Usage : $type = $f->type([$newtype]) + Function: get or set the feature type + Returns : a Bio::DB::GFF::Typename object + Args : a new Typename object (optional) + Status : Public + +This method gets or sets the type of the feature. The type is a +Bio::DB::GFF::Typename object, which encapsulates the feature method +and source. + +The method() and source() methods described next provide shortcuts to +the individual fields of the type. + +=cut + +sub type { + my $self = shift; + my $d = $self->{type}; + $self->{type} = shift if @_; + $d; +} + +=head2 method + + Title : method + Usage : $method = $f->method([$newmethod]) + Function: get or set the feature method + Returns : a string + Args : a new method (optional) + Status : Public + +This method gets or sets the feature method. It is a convenience +feature that delegates the task to the feature's type object. + +=cut + +sub method { + my $self = shift; + my $d = $self->{type}->method; + $self->{type}->method(shift) if @_; + $d; +} + +=head2 source + + Title : source + Usage : $source = $f->source([$newsource]) + Function: get or set the feature source + Returns : a string + Args : a new source (optional) + Status : Public + +This method gets or sets the feature source. It is a convenience +feature that delegates the task to the feature's type object. + +=cut + +sub source { + my $self = shift; + my $d = $self->{type}->source; + $self->{type}->source(shift) if @_; + $d; +} + +=head2 score + + Title : score + Usage : $score = $f->score([$newscore]) + Function: get or set the feature score + Returns : a string + Args : a new score (optional) + Status : Public + +This method gets or sets the feature score. + +=cut + +sub score { + my $self = shift; + my $d = $self->{score}; + $self->{score} = shift if @_; + $d; +} + +=head2 phase + + Title : phase + Usage : $phase = $f->phase([$phase]) + Function: get or set the feature phase + Returns : a string + Args : a new phase (optional) + Status : Public + +This method gets or sets the feature phase. + +=cut + +sub phase { + my $self = shift; + my $d = $self->{phase}; + $self->{phase} = shift if @_; + $d; +} + +=head2 strand + + Title : strand + Usage : $strand = $f->strand + Function: get the feature strand + Returns : +1, 0 -1 + Args : none + Status : Public + +Returns the strand of the feature. Unlike the other methods, the +strand cannot be changed once the object is created (due to coordinate +considerations). + +=cut + +sub strand { + my $self = shift; + return 0 unless $self->{fstrand}; + if ($self->absolute) { + return Bio::DB::GFF::RelSegment::_to_strand($self->{fstrand}); + } + return $self->SUPER::strand; +} + +=head2 group + + Title : group + Usage : $group = $f->group([$new_group]) + Function: get or set the feature group + Returns : a Bio::DB::GFF::Featname object + Args : a new group (optional) + Status : Public + +This method gets or sets the feature group. The group is a +Bio::DB::GFF::Featname object, which has an ID and a class. + +=cut + +sub group { + my $self = shift; + my $d = $self->{group}; + $self->{group} = shift if @_; + $d; +} + +=head2 display_id + + Title : display_id + Usage : $display_id = $f->display_id([$display_id]) + Function: get or set the feature display id + Returns : a Bio::DB::GFF::Featname object + Args : a new display_id (optional) + Status : Public + +This method is an alias for group(). It is provided for +Bio::SeqFeatureI compatibility. + +=cut + +=head2 info + + Title : info + Usage : $info = $f->info([$new_info]) + Function: get or set the feature group + Returns : a Bio::DB::GFF::Featname object + Args : a new group (optional) + Status : Public + +This method is an alias for group(). It is provided for AcePerl +compatibility. + +=cut + +*info = \&group; +*display_id = \&group; +*display_name = \&group; + +=head2 target + + Title : target + Usage : $target = $f->target([$new_target]) + Function: get or set the feature target + Returns : a Bio::DB::GFF::Featname object + Args : a new group (optional) + Status : Public + +This method works like group(), but only returns the group if it +implements the start() method. This is typical for +similarity/assembly features, where the target encodes the start and stop +location of the alignment. + +=cut + +sub target { + my $self = shift; + my $group = $self->group or return; + return unless $group->can('start'); + $group; +} + +=head2 hit + + Title : hit + Usage : $hit = $f->hit([$new_hit]) + Function: get or set the feature hit + Returns : a Bio::DB::GFF::Featname object + Args : a new group (optional) + Status : Public + +This is the same as target(), for compatibility with +Bio::SeqFeature::SimilarityPair. + +=cut + +*hit = \⌖ + +=head2 id + + Title : id + Usage : $id = $f->id + Function: get the feature ID + Returns : a database identifier + Args : none + Status : Public + +This method retrieves the database identifier for the feature. It +cannot be changed. + +=cut + +sub id { shift->{db_id} } + +=head2 group_id + + Title : group_id + Usage : $id = $f->group_id + Function: get the feature ID + Returns : a database identifier + Args : none + Status : Public + +This method retrieves the database group identifier for the feature. +It cannot be changed. Often the group identifier is more useful than +the feature identifier, since it is used to refer to a complex object +containing subparts. + +=cut + +sub group_id { shift->{group_id} } + +=head2 clone + + Title : clone + Usage : $feature = $f->clone + Function: make a copy of the feature + Returns : a new Bio::DB::GFF::Feature object + Args : none + Status : Public + +This method returns a copy of the feature. + +=cut + +sub clone { + my $self = shift; + my $clone = $self->SUPER::clone; + + if (ref(my $t = $clone->type)) { + my $type = $t->can('clone') ? $t->clone : bless {%$t},ref $t; + $clone->type($type); + } + + if (ref(my $g = $clone->group)) { + my $group = $g->can('clone') ? $g->clone : bless {%$g},ref $g; + $clone->group($group); + } + + if (my $merged = $self->{merged_segs}) { + $clone->{merged_segs} = { %$merged }; + } + + $clone; +} + +=head2 compound + + Title : compound + Usage : $flag = $f->compound([$newflag]) + Function: get or set the compound flag + Returns : a boolean + Args : a new flag (optional) + Status : Public + +This method gets or sets a flag indicated that the feature is not a +primary one from the database, but the result of aggregation. + +=cut + +sub compound { + my $self = shift; + my $d = $self->{compound}; + $self->{compound} = shift if @_; + $d; +} + +=head2 sub_SeqFeature + + Title : sub_SeqFeature + Usage : @feat = $feature->sub_SeqFeature([$method]) + Function: get subfeatures + Returns : a list of Bio::DB::GFF::Feature objects + Args : a feature method (optional) + Status : Public + +This method returns a list of any subfeatures that belong to the main +feature. For those features that contain heterogeneous subfeatures, +you can retrieve a subset of the subfeatures by providing a method +name to filter on. + +For AcePerl compatibility, this method may also be called as +segments(). + +=cut + +sub sub_SeqFeature { + my $self = shift; + my $type = shift; + my $subfeat = $self->{subfeatures} or return; + $self->sort_features; + my @a; + if ($type) { + my $features = $subfeat->{lc $type} or return; + @a = @{$features}; + } else { + @a = map {@{$_}} values %{$subfeat}; + } + return @a; +} + +=head2 add_subfeature + + Title : add_subfeature + Usage : $feature->add_subfeature($feature) + Function: add a subfeature to the feature + Returns : nothing + Args : a Bio::DB::GFF::Feature object + Status : Public + +This method adds a new subfeature to the object. It is used +internally by aggregators, but is available for public use as well. + +=cut + +sub add_subfeature { + my $self = shift; + my $feature = shift; + my $type = $feature->method; + my $subfeat = $self->{subfeatures}{lc $type} ||= []; + push @{$subfeat},$feature; +} + +=head2 attach_seq + + Title : attach_seq + Usage : $sf->attach_seq($seq) + Function: Attaches a Bio::Seq object to this feature. This + Bio::Seq object is for the *entire* sequence: ie + from 1 to 10000 + Example : + Returns : TRUE on success + Args : a Bio::PrimarySeqI compliant object + +=cut + +sub attach_seq { } + + +=head2 location + + Title : location + Usage : my $location = $seqfeature->location() + Function: returns a location object suitable for identifying location + of feature on sequence or parent feature + Returns : Bio::LocationI object + Args : none + +=cut + +sub location { + my $self = shift; + require Bio::Location::Split unless Bio::Location::Split->can('new'); + require Bio::Location::Simple unless Bio::Location::Simple->can('new'); + + my $location; + if (my @segments = $self->segments) { + $location = Bio::Location::Split->new(-seq_id => $self->seq_id); + foreach (@segments) { + $location->add_sub_Location($_->location); + } + } else { + $location = Bio::Location::Simple->new(-start => $self->start, + -end => $self->stop, + -strand => $self->strand, + -seq_id => $self->seq_id); + } + $location; +} + +=head2 entire_seq + + Title : entire_seq + Usage : $whole_seq = $sf->entire_seq() + Function: gives the entire sequence that this seqfeature is attached to + Example : + Returns : a Bio::PrimarySeqI compliant object, or undef if there is no + sequence attached + Args : none + + +=cut + +sub entire_seq { + my $self = shift; + $self->factory->segment($self->sourceseq); +} + +=head2 merged_segments + + Title : merged_segments + Usage : @segs = $feature->merged_segments([$method]) + Function: get merged subfeatures + Returns : a list of Bio::DB::GFF::Feature objects + Args : a feature method (optional) + Status : Public + +This method acts like sub_SeqFeature, except that it merges +overlapping segments of the same time into contiguous features. For +those features that contain heterogeneous subfeatures, you can +retrieve a subset of the subfeatures by providing a method name to +filter on. + +A side-effect of this method is that the features are returned in +sorted order by their start tposition. + +=cut + +#' + +sub merged_segments { + my $self = shift; + my $type = shift; + $type ||= ''; # prevent uninitialized variable warnings + + my $truename = overload::StrVal($self); + + return @{$self->{merged_segs}{$type}} if exists $self->{merged_segs}{$type}; + my @segs = map { $_->[0] } + sort { $a->[1] <=> $b->[1] || + $a->[2] cmp $b->[2] } + map { [$_, $_->start, $_->type] } $self->sub_SeqFeature($type); + + # attempt to merge overlapping segments + my @merged = (); + for my $s (@segs) { + my $previous = $merged[-1] if @merged; + my ($pscore,$score) = (eval{$previous->score}||0,eval{$s->score}||0); + if (defined($previous) + && $previous->stop+1 >= $s->start + && (!defined($s->score) || $previous->score == $s->score) + && $previous->method eq $s->method + ) { + if ($self->absolute && $self->strand < 0) { + $previous->{start} = $s->{start}; + } else { + $previous->{stop} = $s->{stop}; + } + # fix up the target too + my $g = $previous->{group}; + if ( ref($g) && $g->isa('Bio::DB::GFF::Homol')) { + my $cg = $s->{group}; + $g->{stop} = $cg->{stop}; + } + } elsif (defined($previous) + && $previous->start == $s->start + && $previous->stop == $s->stop) { + next; + } else { + my $copy = $s->clone; + push @merged,$copy; + } + } + $self->{merged_segs}{$type} = \@merged; + @merged; +} + +=head2 sub_types + + Title : sub_types + Usage : @methods = $feature->sub_types + Function: get methods of all sub-seqfeatures + Returns : a list of method names + Args : none + Status : Public + +For those features that contain subfeatures, this method will return a +unique list of method names of those subfeatures, suitable for use +with sub_SeqFeature(). + +=cut + +sub sub_types { + my $self = shift; + my $subfeat = $self->{subfeatures} or return; + return keys %$subfeat; +} + +=head2 attributes + + Title : attributes + Usage : @attributes = $feature->attributes($name) + Function: get the "attributes" on a particular feature + Returns : an array of string + Args : feature ID + Status : public + +Some GFF version 2 files use the groups column to store a series of +attribute/value pairs. In this interpretation of GFF, the first such +pair is treated as the primary group for the feature; subsequent pairs +are treated as attributes. Two attributes have special meaning: +"Note" is for backward compatibility and is used for unstructured text +remarks. "Alias" is considered as a synonym for the feature name. + + @gene_names = $feature->attributes('Gene'); + @aliases = $feature->attributes('Alias'); + +If no name is provided, then attributes() returns a flattened hash, of +attribute=Evalue pairs. This lets you do: + + %attributes = $db->attributes; + +=cut + +sub attributes { + my $self = shift; + my $factory = $self->factory; + defined(my $id = $self->id) or return; + $factory->attributes($id,@_) +} + + +=head2 notes + + Title : notes + Usage : @notes = $feature->notes + Function: get the "notes" on a particular feature + Returns : an array of string + Args : feature ID + Status : public + +Some GFF version 2 files use the groups column to store various notes +and remarks. Adaptors can elect to store the notes in the database, +or just ignore them. For those adaptors that store the notes, the +notes() method will return them as a list. + +=cut + +sub notes { + my $self = shift; + $self->attributes('Note'); +} + +=head2 aliases + + Title : aliases + Usage : @aliases = $feature->aliases + Function: get the "aliases" on a particular feature + Returns : an array of string + Args : feature ID + Status : public + +This method will return a list of attributes of type 'Alias'. + +=cut + +sub aliases { + my $self = shift; + $self->attributes('Alias'); +} + + + +=head2 Autogenerated Methods + + Title : AUTOLOAD + Usage : @subfeat = $feature->Method + Function: Return subfeatures using autogenerated methods + Returns : a list of Bio::DB::GFF::Feature objects + Args : none + Status : Public + +Any method that begins with an initial capital letter will be passed +to AUTOLOAD and treated as a call to sub_SeqFeature with the method +name used as the method argument. For instance, this call: + + @exons = $feature->Exon; + +is equivalent to this call: + + @exons = $feature->sub_SeqFeature('exon'); + +=cut + +=head2 SeqFeatureI methods + +The following Bio::SeqFeatureI methods are implemented: + +primary_tag(), source_tag(), all_tags(), has_tag(), each_tag_value() [renamed get_tag_values()]. + +=cut + +*primary_tag = \&method; +*source_tag = \&source; +sub all_tags { + my $self = shift; + my @tags = keys %CONSTANT_TAGS; + # autogenerated methods + if (my $subfeat = $self->{subfeatures}) { + push @tags,keys %$subfeat; + } + @tags; +} +*get_all_tags = \&all_tags; + +sub has_tag { + my $self = shift; + my $tag = shift; + my %tags = map {$_=>1} $self->all_tags; + return $tags{$tag}; +} + +*each_tag_value = \&get_tag_values; + +sub get_tag_values { + my $self = shift; + my $tag = shift; + return $self->$tag() if $CONSTANT_TAGS{$tag}; + $tag = ucfirst $tag; + return $self->$tag(); # try autogenerated tag +} + +sub AUTOLOAD { + my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; + my $sub = $AUTOLOAD; + my $self = $_[0]; + + # ignore DESTROY calls + return if $func_name eq 'DESTROY'; + + # fetch subfeatures if func_name has an initial cap +# return sort {$a->start <=> $b->start} $self->sub_SeqFeature($func_name) if $func_name =~ /^[A-Z]/; + return $self->sub_SeqFeature($func_name) if $func_name =~ /^[A-Z]/; + + # error message of last resort + $self->throw(qq(Can't locate object method "$func_name" via package "$pack")); +}#' + +=head2 adjust_bounds + + Title : adjust_bounds + Usage : $feature->adjust_bounds + Function: adjust the bounds of a feature + Returns : ($start,$stop,$strand) + Args : none + Status : Public + +This method adjusts the boundaries of the feature to enclose all its +subfeatures. It returns the new start, stop and strand of the +enclosing feature. + +=cut + +# adjust a feature so that its boundaries are synched with its subparts' boundaries. +# this works recursively, so subfeatures can contain other features +sub adjust_bounds { + my $self = shift; + my $g = $self->{group}; + + if (my $subfeat = $self->{subfeatures}) { + for my $list (values %$subfeat) { + for my $feat (@$list) { + + # fix up our bounds to hold largest subfeature + my($start,$stop,$strand) = $feat->adjust_bounds; + $self->{fstrand} = $strand unless defined $self->{fstrand}; + my ($low,$high) = $start < $stop ? ($start,$stop) : ($stop,$start); + if ($self->{fstrand} ne '-') { + $self->{start} = $low if !defined($self->{start}) || $low < $self->{start}; + $self->{stop} = $high if !defined($self->{stop}) || $high > $self->{stop}; + } else { + $self->{start} = $high if !defined($self->{start}) || $high > $self->{start}; + $self->{stop} = $low if !defined($self->{stop}) || $low < $self->{stop}; + } + + # fix up endpoints of targets too (for homologies only) + my $h = $feat->group; + next unless $h && $h->isa('Bio::DB::GFF::Homol'); + next unless $g && $g->isa('Bio::DB::GFF::Homol'); + ($start,$stop) = ($h->{start},$h->{stop}); + if ($start <= $stop) { + $g->{start} = $start if !defined($g->{start}) || $start < $g->{start}; + $g->{stop} = $stop if !defined($g->{stop}) || $stop > $g->{stop}; + } else { + $g->{start} = $start if !defined($g->{start}) || $start > $g->{start}; + $g->{stop} = $stop if !defined($g->{stop}) || $stop < $g->{stop}; + } + } + } + } + + ($self->{start},$self->{stop},$self->strand); +} + +=head2 sort_features + + Title : sort_features + Usage : $feature->sort_features + Function: sort features + Returns : nothing + Args : none + Status : Public + +This method sorts subfeatures in ascending order by their start +position. For reverse strand features, it sorts subfeatures in +descending order. After this is called sub_SeqFeature will return the +features in order. + +This method is called internally by merged_segments(). + +=cut + +# sort features +sub sort_features { + my $self = shift; + return if $self->{sorted}++; + my $strand = $self->strand or return; + my $subfeat = $self->{subfeatures} or return; + for my $type (keys %$subfeat) { + $subfeat->{$type} = [map { $_->[0] } + sort {$a->[1] <=> $b->[1] } + map { [$_,$_->start] } + @{$subfeat->{$type}}] if $strand > 0; + $subfeat->{$type} = [map { $_->[0] } + sort {$b->[1] <=> $a->[1]} + map { [$_,$_->start] } + @{$subfeat->{$type}}] if $strand < 0; + } +} + +=head2 asString + + Title : asString + Usage : $string = $feature->asString + Function: return human-readabled representation of feature + Returns : a string + Args : none + Status : Public + +This method returns a human-readable representation of the feature and +is called by the overloaded "" operator. + +=cut + +sub asString { + my $self = shift; + my $type = $self->type; + my $name = $self->group; + return "$type($name)" if $name; + return $type; +# my $type = $self->method; +# my $id = $self->group || 'unidentified'; +# return join '/',$id,$type,$self->SUPER::asString; +} + +sub name { + my $self =shift; + return $self->group || $self->SUPER::name; +} + +sub gff_string { + my $self = shift; + my ($start,$stop) = ($self->start,$self->stop); + + # the defined() tests prevent uninitialized variable warnings, when dealing with clone objects + # whose endpoints may be undefined + ($start,$stop) = ($stop,$start) if defined($start) && defined($stop) && $start > $stop; + + my ($class,$name) = ('',''); + my @group; + if (my $t = $self->target) { + my $class = $t->class; + my $name = $t->name; + my $start = $t->start; + my $stop = $t->stop; + push @group,qq(Target "$class:$name" $start $stop); + } + + elsif (my $g = $self->group) { + $class = $g->class || ''; + $name = $g->name || ''; + push @group,"$class $name"; + } + push @group,map {qq(Note "$_")} $self->notes; + + my $group_field = join ' ; ',@group; + my $strand = ('-','.','+')[$self->strand+1]; + my $ref = $self->refseq; + my $n = ref($ref) ? $ref->name : $ref; + my $phase = $self->phase; + $phase = '.' unless defined $phase; + return join("\t",$n,$self->source,$self->method,$start||'.',$stop||'.',$self->score||'.',$strand||'.',$phase,$group_field); +} + +=head1 A Note About Similarities + +The current default aggregator for GFF "similarity" features creates a +composite Bio::DB::GFF::Feature object of type "gapped_alignment". +The target() method for the feature as a whole will return a +RelSegment object that is as long as the extremes of the similarity +hit target, but will not necessarily be the same length as the query +sequence. The length of each "similarity" subfeature will be exactly +the same length as its target(). These subfeatures are essentially +the HSPs of the match. + +The following illustrates this: + + @similarities = $segment->feature('similarity:BLASTN'); + $sim = $similarities[0]; + + print $sim->type; # yields "gapped_similarity:BLASTN" + + $query_length = $sim->length; + $target_length = $sim->target->length; # $query_length != $target_length + + @matches = $sim->Similarity; # use autogenerated method + $query1_length = $matches[0]->length; + $target1_length = $matches[0]->target->length; # $query1_length == $target1_length + +If you merge segments by calling merged_segments(), then the length of +the query sequence segments will no longer necessarily equal the +length of the targets, because the alignment information will have +been lost. Nevertheless, the targets are adjusted so that the first +and last base pairs of the query match the first and last base pairs +of the target. + +=cut + +1; + +=head1 BUGS + +This module is still under development. + +=head1 SEE ALSO + +L, L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Homol.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Homol.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,102 @@ +=head1 NAME + +Bio::DB::GFF::Homol -- A segment of DNA that is homologous to another + +=head1 SYNOPSIS + +See L. + +=head1 DESCRIPTION + +Bio::DB::GFF::Homol is a named subtype of Bio::DB::GFF::Segment. It +inherits all the methods of its parent, and was created primarily to +allow for isa() queries and for compatibility with +Ace::Sequence::Homol. + +A Homol object is typically returned as the method result of the +Bio::DB::GFF::Feature-Etarget() method. + +=head1 METHODS + +=cut + +package Bio::DB::GFF::Homol; +use strict; + +use Bio::DB::GFF::Segment; +use vars qw(@ISA); +@ISA = 'Bio::DB::GFF::Segment'; + +=head2 name + + Title : name + Usage : $name = $homol->name + Function: get the ID of the homology object + Returns : a string + Args : none + Status : Public + +=cut + +sub name { shift->refseq } + +=head2 asString + + Title : asString + Usage : $name = $homol->asString + Function: same as name(), for operator overloading + Returns : a string + Args : none + Status : Public + +=cut + +sub asString { shift->name } + + +=head2 id + + Title : id + Usage : $id = $homol->id + Function: get database ID in class:id format + Returns : a string + Args : none + Status : Public + +=cut + +sub id { + my $self = shift; + return "$self->{class}:$self->{name}"; +} + +sub new_from_segment { + my $package = shift; + $package = ref $package if ref $package; + my $segment = shift; + my $new = {}; + @{$new}{qw(factory sourceseq start stop strand class ref refstart refstrand)} + = @{$segment}{qw(factory sourceseq start stop strand class ref refstart refstrand)}; + return bless $new,__PACKAGE__; +} + +=head1 BUGS + +This module is still under development. + +=head1 SEE ALSO + +L, L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/RelSegment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/RelSegment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1099 @@ +=head1 NAME + +Bio::DB::GFF::RelSegment -- Sequence segment with relative coordinate support + +=head1 SYNOPSIS + +See L. + +=head1 DESCRIPTION + +Bio::DB::GFF::RelSegment is a stretch of sequence that can handle +relative coordinate addressing. It inherits from +Bio::DB::GFF::Segment, and is the base class for +Bio::DB::GFF::Feature. + +In addition to the source sequence, a relative segment has a +"reference sequence", which is used as the basis for its coordinate +system. The reference sequence can be changed at will, allowing you +freedom to change the "frame of reference" for features contained +within the segment. For example, by setting a segment's reference +sequence to the beginning of a gene, you can view all other features +in gene-relative coordinates. + +The reference sequence and the source sequence must be on the same +physical stretch of DNA, naturally. However, they do not have to be +on the same strand. The strandedness of the reference sequence +determines whether coordinates increase to the right or the left. + +Generally, you will not create or manipulate Bio::DB::GFF::RelSeg0ment +objects directly, but use those that are returned by the Bio::DB::GFF +module. + +=head2 An Example + +To understand how relative coordinates work, consider the following +example from the C. elegans database. First we create the appropriate +GFF accessor object (the factory): + + my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:elegans', + -adaptor=>'dbi:mysqlopt'); + +Now we fetch out a segment based on cosmid clone ZK909: + + my $seg = $db->segment('ZK909'); + +If we call the segment's refseq() method, we see that the base of the +coordinate system is the sequence "ZK154", and that its start and +stop positions are 1 and the length of the cosmid: + + print $seg->refseq; + => ZK909 + + print $seg->start,' - ',$seg->stop; + => 1 - 33782 + +As a convenience, the "" operator is overloaded in this class, to give +the reference sequence, and start and stop positions: + + print $seg; + => ZK909:1,33782 + +Internally, Bio::DB::GFF::RelSegment has looked up the absolute +coordinates of this segment and maintains the source sequence and the +absolute coordinates relative to the source sequence. We can see this +information using sourceseq() (inherited from Bio::DB::GFF::Segment) +and the abs_start() and abs_end() methods: + + print $seg->sourceseq; + => CHROMOSOME_I + + print $seg->abs_start,' - ',$seg->abs_end; + => 14839545 - 14873326 + +We can also put the segment into absolute mode, so that it behaves +like Bio::DB::Segment, and always represents coordinates on the source +sequence. This is done by passing a true value to the absolute() +method: + + $seq->absolute(1); + print $seg; + => CHROMOSOME_I:14839545,14873326 + +We can change the reference sequence at any time. One way is to call +the segment's ref() method, giving it the ID (and optionally the +class) of another landmark on the genome. For example, if we know +that cosmid ZK337 is adjacent to ZK909, then we can view ZK909 in +ZK337-relative coordinates: + + $seg->refseq('ZK337'); + print $seg; + => ZK337:-33670,111 + +We can call the segment's features() method in order to get the list +of contigs that overlap this segment (in the C. elegans database, +contigs have feature type "Sequence:Link"): + + @links = $seg->features('Sequence:Link'); + +We can now set the reference sequence to the first of these contigs like so: + + $seg->refseq($links[0]); + print $seg; + => Sequence:Link(LINK_Y95D11A):3997326,4031107 + +=cut + +package Bio::DB::GFF::RelSegment; + +use strict; + +use Bio::DB::GFF::Feature; +use Bio::DB::GFF::Util::Rearrange; +use Bio::DB::GFF::Segment; +use Bio::RangeI; + +use vars qw(@ISA); +@ISA = qw(Bio::DB::GFF::Segment); + +use overload '""' => 'asString', + 'bool' => sub { overload::StrVal(shift) }, + fallback=>1; + +=head1 API + +The remainder of this document describes the API for +Bio::DB::GFF::Segment. + +=cut + +=head2 new + + Title : new + Usage : $s = Bio::DB::GFF::RelSegment->new(@args) + Function: create a new relative segment + Returns : a new Bio::DB::GFF::RelSegment object + Args : see below + Status : Public + +This method creates a new Bio::DB::GFF::RelSegment object. Generally +this is called automatically by the Bio::DB::GFF module and +derivatives. + +This function uses a named-argument style: + + -factory a Bio::DB::GFF::Adaptor to use for database access + -seq ID of the source sequence + -class class of the source sequence + -start start of the desired segment relative to source sequence + -stop stop of the desired segment relative to source sequence + -ref ID of the reference sequence + -refclass class of the reference sequence + -offset 0-based offset from source sequence to start of segment + -length length of desired segment + -absolute, -force_absolute + use absolute coordinates, rather than coordinates relative + to the start of self or the reference sequence + +The -seq argument accepts the ID of any landmark in the database. The +stored source sequence becomes whatever the GFF file indicates is the +proper sequence for this landmark. A class of "Sequence" is assumed +unless otherwise specified in the -class argument. + +If the argument to -seq is a Bio::GFF::Featname object (such as +returned by the group() method), then the class is taken from that. + +The optional -start and -stop arguments specify the end points for the +retrieved segment. For those who do not like 1-based indexing, +-offset and -length are provided. If both -start/-stop and +-offset/-length are provided, the latter overrides the former. +Generally it is not a good idea to mix metaphors. + +-ref and -refclass together indicate a sequence to be used for +relative coordinates. If not provided, the source sequence indicated +by -seq is used as the reference sequence. If the argument to -ref is +a Bio::GFF::Featname object (such as returned by the group() method), +then the class is taken from that. + +-force_absolute should be used if you wish to skip the lookup of the +absolute position of the source sequence that ordinarily occurs when +you create a relative segment. In this case, the source sequence must +be a sequence that has been specified as the "source" in the GFF file. + +=cut + +# Create a new Bio::DB::GFF::RelSegment Object +# arguments are: +# -factory => factory and DBI interface +# -seq => $sequence_name +# -start => $start_relative_to_sequence +# -stop => $stop_relative_to_sequence +# -ref => $sequence which establishes coordinate system +# -offset => 0-based offset relative to sequence +# -length => length of segment +# -nocheck => turn off checking, force segment to be constructed +# -absolute => use absolute coordinate addressing +#' +sub new { + my $package = shift; + my ($factory,$name,$start,$stop,$refseq,$class,$refclass,$offset,$length,$force_absolute,$nocheck) = + rearrange([ + 'FACTORY', + [qw(NAME SEQ SEQUENCE SOURCESEQ)], + [qw(START BEGIN)], + [qw(STOP END)], + [qw(REFSEQ REF REFNAME)], + [qw(CLASS SEQCLASS)], + qw(REFCLASS), + [qw(OFFSET OFF)], + [qw(LENGTH LEN)], + [qw(ABSOLUTE)], + [qw(NOCHECK FORCE)], + ],@_); + + $package = ref $package if ref $package; + $factory or $package->throw("new(): provide a -factory argument"); + + # to allow people to use segments as sources + if (ref($name) && $name->isa('Bio::DB::GFF::Segment')) { + $start = 1 unless defined $start; + $stop = $name->length unless defined $stop; + return $name->subseq($start,$stop); + } + + my @object_results; + + # support for Featname objects + if (ref($name) && $name->can('class')) { + $class = $name->class; + $name = $name->name; + } + # if the class of the landmark is not specified then default to 'Sequence' + $class ||= eval{$factory->default_class} || 'Sequence'; + + # confirm that indicated sequence is actually in the database! + my @abscoords; + + # abscoords() will now return an array ref, each element of which is + # ($absref,$absclass,$absstart,$absstop,$absstrand) + + if ($nocheck) { + $force_absolute++; + $start = 1; + } + + if ($force_absolute && defined($start)) { # absolute position is given to us + @abscoords = ([$name,$class,$start,$stop,'+']); + } else { + my $result = $factory->abscoords($name,$class,$force_absolute ? $name : ()) or return; + @abscoords = @$result; + } + + foreach (@abscoords) { + my ($absref,$absclass,$absstart,$absstop,$absstrand,$sname) = @$_; + $sname = $name unless defined $sname; + my ($this_start,$this_stop,$this_length) = ($start,$stop,$length); + + # partially fill in object + my $self = bless { factory => $factory },$package; + + $absstrand ||= '+'; + + # an explicit length overrides start and stop + if (defined $offset) { + warn "new(): bad idea to call new() with both a start and an offset" + if defined $this_start; + $this_start = $offset+1; + } + if (defined $this_length) { + warn "new(): bad idea to call new() with both a stop and a length" + if defined $this_stop; + $this_stop = $this_start + $length - 1; + } + + # this allows a SQL optimization way down deep + $self->{whole}++ if $absref eq $sname and !defined($this_start) and !defined($this_stop); + + $this_start = 1 if !defined $this_start; + $this_stop = $absstop-$absstart+1 if !defined $this_stop; + $this_length = $this_stop - $this_start + 1; + + # now offset to correct subsegment based on desired start and stop + if ($force_absolute) { + ($this_start,$this_stop) = ($absstart,$absstop); + $self->absolute(1); + } elsif ($absstrand eq '+') { + $this_start = $absstart + $this_start - 1; + $this_stop = $this_start + $this_length - 1; + } else { + $this_start = $absstop - ($this_start - 1); + $this_stop = $absstop - ($this_stop - 1); + } + + # handle truncation in either direction + # This only happens if the segment runs off the end of + # the reference sequence + if ($factory->strict_bounds_checking && + (($this_start < $absstart) || ($this_stop > $absstop))) { + # return empty if we are completely off the end of the ref se + next unless $this_start<=$absstop && $this_stop>=$absstart; + if (my $a = $factory->abscoords($absref,'Sequence')) { + my $refstart = $a->[0][2]; + my $refstop = $a->[0][3]; + if ($this_start < $refstart) { + $this_start = $refstart; + $self->{truncated}{start}++; + } + if ($this_stop > $refstop) { + $this_stop = $absstop; + $self->{truncated}{stop}++; + } + } + } + + @{$self}{qw(sourceseq start stop strand class)} + = ($absref,$this_start,$this_stop,$absstrand,$absclass); + + # handle reference sequence + if (defined $refseq) { + $refclass = $refseq->class if $refseq->can('class'); + $refclass ||= 'Sequence'; + my ($refref,$refstart,$refstop,$refstrand) = $factory->abscoords($refseq,$refclass); + unless ($refref eq $absref) { + $self->error("reference sequence is on $refref but source sequence is on $absref"); + return; + } + $refstart = $refstop if $refstrand eq '-'; + @{$self}{qw(ref refstart refstrand)} = ($refseq,$refstart,$refstrand); + } else { + $absstart = $absstop if $absstrand eq '-'; + @{$self}{qw(ref refstart refstrand)} = ($sname,$absstart,$absstrand); + } + push @object_results,$self; + } + + return wantarray ? @object_results : $object_results[0]; +} + +# overridden methods +# start, stop, length +sub start { + my $self = shift; + return $self->strand < 0 ? $self->{stop} : $self->{start} if $self->absolute; + $self->_abs2rel($self->{start}); +} +sub end { + my $self = shift; + return $self->strand < 0 ? $self->{start} : $self->{stop} if $self->absolute; + $self->_abs2rel($self->{stop}); +} +*stop = \&end; + +sub length { + my $self = shift; + return unless defined $self->abs_end; + abs($self->abs_end - $self->abs_start) + 1; +} + +sub abs_start { + my $self = shift; + if ($self->absolute) { + my ($a,$b) = ($self->SUPER::abs_start,$self->SUPER::abs_end); + return ($a<$b) ? $a : $b; + } + else { + return $self->SUPER::abs_start(@_); + } +} +sub abs_end { + my $self = shift; + if ($self->absolute) { + my ($a,$b) = ($self->SUPER::abs_start,$self->SUPER::abs_end); + return ($a>$b) ? $a : $b; + } + + else { + return $self->SUPER::abs_end(@_); + } +} + +=head2 refseq + + Title : refseq + Usage : $ref = $s->refseq([$newseq] [,$newseqclass]) + Function: get/set reference sequence + Returns : current reference sequence + Args : new reference sequence and class (optional) + Status : Public + +This method will get or set the reference sequence. Called with no +arguments, it returns the current reference sequence. Called with +either a sequence ID and class, a Bio::DB::GFF::Segment object (or +subclass) or a Bio::DB::GFF::Featname object, it will set the current +reference sequence and return the previous one. + +The method will generate an exception if you attempt to set the +reference sequence to a sequence that isn't contained in the database, +or one that has a different source sequence from the segment. + +=cut + +#' +sub refseq { + my $self = shift; + my $g = $self->{ref}; + if (@_) { + my ($newref,$newclass); + if (@_ == 2) { + $newclass = shift; + $newref = shift; + } else { + $newref = shift; + $newclass = 'Sequence'; + } + + defined $newref or $self->throw('refseq() called with an undef reference sequence'); + + # support for Featname objects + $newclass = $newref->class if ref($newref) && $newref->can('class'); + + # $self->throw("Cannot define a segment's reference sequence in terms of itself!") + # if ref($newref) and overload::StrVal($newref) eq overload::StrVal($self); + + my ($refsource,undef,$refstart,$refstop,$refstrand); + if ($newref->isa('Bio::DB::GFF::RelSegment')) { + ($refsource,undef,$refstart,$refstop,$refstrand) = + ($newref->sourceseq,undef,$newref->abs_start,$newref->abs_end,$newref->abs_strand >= 0 ? '+' : '-'); + } else { + my $coords = $self->factory->abscoords($newref,$newclass); + foreach (@$coords) { # find the appropriate one + ($refsource,undef,$refstart,$refstop,$refstrand) = @$_; + last if $refsource eq $self->{sourceseq}; + } + + } + $self->throw("can't set reference sequence: $newref and $self are on different sequence segments") + unless $refsource eq $self->{sourceseq}; + + @{$self}{qw(ref refstart refstrand)} = ($newref,$refstart,$refstrand); + $self->absolute(0); + } + return $self->absolute ? $self->sourceseq : $g; +} + + +=head2 abs_low + + Title : abs_low + Usage : $s->abs_low + Function: the absolute lowest coordinate of the segment + Returns : an integer + Args : none + Status : Public + +This is for GadFly compatibility, and returns the low coordinate in +absolute coordinates; + +=cut + +sub abs_low { + my $self = shift; + my ($a,$b) = ($self->abs_start,$self->abs_end); + return ($a<$b) ? $a : $b; +} + +=head2 abs_high + + Title : abs_high + Usage : $s->abs_high + Function: the absolute highest coordinate of the segment + Returns : an integer + Args : none + Status : Public + +This is for GadFly compatibility, and returns the high coordinate in +absolute coordinates; + +=cut + +sub abs_high { + my $self = shift; + my ($a,$b) = ($self->abs_start,$self->abs_end); + return ($a>$b) ? $a : $b; +} + + +=head2 asString + + Title : asString + Usage : $s->asString + Function: human-readable representation of the segment + Returns : a string + Args : none + Status : Public + +This method will return a human-readable representation of the +segment. It is the overloaded method call for the "" operator. + +Currently the format is: + + refseq:start,stop + +=cut + +sub asString { + my $self = shift; + return $self->SUPER::asString if $self->absolute; + my $label = $self->{ref}; + my $start = $self->start || ''; + my $stop = $self->stop || ''; + if (ref($label) && overload::StrVal($self) eq overload::StrVal($label->ref)) { + $label = $self->abs_ref; + $start = $self->abs_start; + $stop = $self->abs_end; + } + return "$label:$start,$stop"; +} + +sub name { shift->asString } + +=head2 absolute + + Title : absolute + Usage : $abs = $s->absolute([$abs]) + Function: get/set absolute coordinates + Returns : a boolean flag + Args : new setting for flag (optional) + Status : Public + +Called with a boolean flag, this method controls whether to display +relative coordinates (relative to the reference sequence) or absolute +coordinates (relative to the source sequence). It will return the +previous value of the setting. + +=cut + +sub absolute { + my $self = shift; + my $g = $self->{absolute}; + $self->{absolute} = shift if @_; + $g; +} + +=head2 features + + Title : features + Usage : @features = $s->features(@args) + Function: get features that overlap this segment + Returns : a list of Bio::DB::GFF::Feature objects + Args : see below + Status : Public + +This method will find all features that overlap the segment and return +a list of Bio::DB::GFF::Feature objects. The features will use +coordinates relative to the reference sequence in effect at the time +that features() was called. + +The returned list can be limited to certain types of feature by +filtering on their method and/or source. In addition, it is possible +to obtain an iterator that will step through a large number of +features sequentially. + +Arguments can be provided positionally or using the named arguments +format. In the former case, the arguments are a list of feature types +in the format "method:source". Either method or source can be +omitted, in which case the missing component is treated as a wildcard. +If no colon is present, then the type is treated as a method name. +Multiple arguments are ORed together. + +Examples: + + @f = $s->features('exon:curated'); # all curated exons + @f = $s->features('exon:curated','intron'); # curated exons and all introns + @f = $s->features('similarity:.*EST.*'); # all similarities + # having something to do + # with ESTs + +The named parameter form gives you control over a few options: + + -types an array reference to type names in the format + "method:source" + + -merge Whether to apply aggregators to the generated features (default yes) + + -rare Turn on an optimization suitable for a relatively rare feature type, + where it will be faster to filter by feature type first + and then by position, rather than vice versa. + + -attributes a hashref containing a set of attributes to match + + -iterator Whether to return an iterator across the features. + + -binsize A true value will create a set of artificial features whose + start and stop positions indicate bins of the given size, and + whose scores are the number of features in the bin. The + class and method of the feature will be set to "bin", + its source to "method:source", and its group to "bin:method:source". + This is a handy way of generating histograms of feature density. + +-merge is a boolean flag that controls whether the adaptor's +aggregators wll be applied to the features returned by this method. + +If -iterator is true, then the method returns a single scalar value +consisting of a Bio::SeqIO object. You can call next_seq() repeatedly +on this object to fetch each of the features in turn. If iterator is +false or absent, then all the features are returned as a list. + +The -attributes argument is a hashref containing one or more +attributes to match against: + + -attributes => { Gene => 'abc-1', + Note => 'confirmed' } + +Attribute matching is simple string matching, and multiple attributes +are ANDed together. + +=cut + +#' + +# return all features that overlap with this segment; +# optionally modified by a list of types to filter on +sub features { + my $self = shift; + my @args = $self->_process_feature_args(@_); + return $self->factory->overlapping_features(@args); +} + +=head2 top_SeqFeatures + + Title : top_SeqFeatures + Usage : + Function: + Example : + Returns : + Args : + +Alias for features(). Provided for Bio::SeqI compatibility. + +=cut + +=head2 all_SeqFeatures + + Title : all_SeqFeatures + Usage : + Function: + Example : + Returns : + Args : + +Alias for features(). Provided for Bio::SeqI compatibility. + +=cut + +=head2 sub_SeqFeatures + + Title : sub_SeqFeatures + Usage : + Function: + Example : + Returns : + Args : + +Alias for features(). Provided for Bio::SeqI compatibility. + +=cut + +*top_SeqFeatures = *all_SeqFeatures = \&features; + +=head2 get_feature_stream + + Title : features + Usage : $stream = $s->get_feature_stream(@args) + Function: get a stream of features that overlap this segment + Returns : a Bio::SeqIO::Stream-compliant stream + Args : see below + Status : Public + +This is the same as features(), but returns a stream. Use like this: + + $stream = $s->get_feature_stream('exon'); + while (my $exon = $stream->next_seq) { + print $exon->start,"\n"; + } + +=cut + +sub get_feature_stream { + my $self = shift; + my @args = $_[0] =~ /^-/ ? (@_,-iterator=>1) : (-types=>\@_,-iterator=>1); + $self->features(@args); +} + +=head2 get_seq_stream + + Title : get_seq_stream + Usage : $stream = $s->get_seq_stream(@args) + Function: get a stream of features that overlap this segment + Returns : a Bio::SeqIO::Stream-compliant stream + Args : see below + Status : Public + +This is the same as feature_stream(), and is provided for Bioperl +compatibility. Use like this: + + $stream = $s->get_seq_stream('exon'); + while (my $exon = $stream->next_seq) { + print $exon->start,"\n"; + } + +=cut + +*get_seq_stream = \&get_feature_stream; + + +=head2 overlapping_features + + Title : overlapping_features + Usage : @features = $s->overlapping_features(@args) + Function: get features that overlap this segment + Returns : a list of Bio::DB::GFF::Feature objects + Args : see features() + Status : Public + +This is an alias for the features() method, and takes the same +arguments. + +=cut + +*overlapping_features = \&features; + +=head2 contained_features + + Title : contained_features + Usage : @features = $s->contained_features(@args) + Function: get features that are contained by this segment + Returns : a list of Bio::DB::GFF::Feature objects + Args : see features() + Status : Public + +This is identical in behavior to features() except that it returns +only those features that are completely contained within the segment, +rather than any that overlap. + +=cut + +# return all features completely contained within this segment +sub contained_features { + my $self = shift; + local $self->{whole} = 0; + my @args = $self->_process_feature_args(@_); + return $self->factory->contained_features(@args); +} + +# *contains = \&contained_features; + +=head2 contained_in + + Title : contained_in + Usage : @features = $s->contained_in(@args) + Function: get features that contain this segment + Returns : a list of Bio::DB::GFF::Feature objects + Args : see features() + Status : Public + +This is identical in behavior to features() except that it returns +only those features that completely contain the segment. + +=cut + +# return all features completely contained within this segment +sub contained_in { + my $self = shift; + local $self->{whole} = 0; + my @args = $self->_process_feature_args(@_); + return $self->factory->contained_in(@args); +} + +=head2 _process_feature_args + + Title : _process_feature_args + Usage : @args = $s->_process_feature_args(@args) + Function: preprocess arguments passed to features, + contained_features, and overlapping_features + Returns : a list of parsed arguents + Args : see feature() + Status : Internal + +This is an internal method that is used to check and format the +arguments to features() before passing them on to the adaptor. + +=cut + +sub _process_feature_args { + my $self = shift; + + my ($ref,$class,$start,$stop,$strand,$whole) + = @{$self}{qw(sourceseq class start stop strand whole)}; + + ($start,$stop) = ($stop,$start) if $strand eq '-'; + + my @args = (-ref=>$ref,-class=>$class); + + # indicating that we are fetching the whole segment allows certain + # SQL optimizations. + push @args,(-start=>$start,-stop=>$stop) unless $whole; + + if (@_) { + if ($_[0] =~ /^-/) { + push @args,@_; + } else { + my @types = @_; + push @args,-types=>\@types; + } + } + push @args,-parent=>$self; + @args; +} + +=head2 types + + Title : types + Usage : @types = $s->types([-enumerate=>1]) + Function: list feature types that overlap this segment + Returns : a list of Bio::DB::GFF::Typename objects or a hash + Args : see below + Status : Public + +The types() method will return a list of Bio::DB::GFF::Typename +objects, each corresponding to a feature that overlaps the segment. +If the optional -enumerate parameter is set to a true value, then the +method will return a hash in which the keys are the type names and the +values are the number of times a feature of that type is present on +the segment. For example: + + %count = $s->types(-enumerate=>1); + +=cut + +# wrapper for lower-level types() call. +sub types { + my $self = shift; + my ($ref,$class,$start,$stop,$strand) = @{$self}{qw(sourceseq class start stop strand)}; + ($start,$stop) = ($stop,$start) if $strand eq '-'; + + my @args; + if (@_ && $_[0] !~ /^-/) { + @args = (-type => \@_) + } else { + @args = @_; + } + $self->factory->types(-ref => $ref, + -class => $class, + -start=> $start, + -stop => $stop, + @args); +} + +=head1 Internal Methods + +The following are internal methods and should not be called directly. + +=head2 new_from_segment + + Title : new_from_segment + Usage : $s = $segment->new_from_segment(@args) + Function: create a new relative segment + Returns : a new Bio::DB::GFF::RelSegment object + Args : see below + Status : Internal + +This constructor is used internally by the subseq() method. It forces +the new segment into the Bio::DB::GFF::RelSegment package, regardless +of the package that it is called from. This causes subclass-specfic +information, such as feature types, to be dropped when a subsequence +is created. + +=cut + +sub new_from_segment { + my $package = shift; + $package = ref $package if ref $package; + my $segment = shift; + my $new = {}; + @{$new}{qw(factory sourceseq start stop strand class ref refstart refstrand)} + = @{$segment}{qw(factory sourceseq start stop strand class ref refstart refstrand)}; + return bless $new,__PACKAGE__; +} + +=head2 _abs2rel + + Title : _abs2rel + Usage : @coords = $s->_abs2rel(@coords) + Function: convert absolute coordinates into relative coordinates + Returns : a list of relative coordinates + Args : a list of absolute coordinates + Status : Internal + +This is used internally to map from absolute to relative +coordinates. It does not take the offset of the reference sequence +into account, so please use abs2rel() instead. + +=cut + +sub _abs2rel { + my $self = shift; + my @result; + return unless defined $_[0]; + + if ($self->absolute) { + @result = @_; + } else { + my ($refstart,$refstrand) = @{$self}{qw(refstart refstrand)}; + @result = defined($refstrand) && $refstrand eq '-' ? map { $refstart - $_ + 1 } @_ + : map { $_ - $refstart + 1 } @_; + } + # if called with a single argument, caller will expect a single scalar reply + # not the size of the returned array! + return $result[0] if @result == 1 and !wantarray; + @result; +} + +=head2 rel2abs + + Title : rel2abs + Usage : @coords = $s->rel2abs(@coords) + Function: convert relative coordinates into absolute coordinates + Returns : a list of absolute coordinates + Args : a list of relative coordinates + Status : Public + +This function takes a list of positions in relative coordinates to the +segment, and converts them into absolute coordinates. + +=cut + +sub rel2abs { + my $self = shift; + my @result; + + if ($self->absolute) { + @result = @_; + } else { + my ($abs_start,$abs_strand) = ($self->abs_start,$self->abs_strand); + @result = $abs_strand < 0 ? map { $abs_start - $_ + 1 } @_ + : map { $_ + $abs_start - 1 } @_; + } + # if called with a single argument, caller will expect a single scalar reply + # not the size of the returned array! + return $result[0] if @result == 1 and !wantarray; + @result; +} + +=head2 abs2rel + + Title : abs2rel + Usage : @rel_coords = $s-abs2rel(@abs_coords) + Function: convert absolute coordinates into relative coordinates + Returns : a list of relative coordinates + Args : a list of absolutee coordinates + Status : Public + +This function takes a list of positions in absolute coordinates +and returns a list expressed in relative coordinates. + +=cut + +sub abs2rel { + my $self = shift; + my @result; + + if ($self->absolute) { + @result = @_; + } else { + my ($abs_start,$abs_strand) = ($self->abs_start,$self->abs_strand); + @result = $abs_strand < 0 ? map { $abs_start - $_ + 1 } @_ + : map { $_ - $abs_start + 1 } @_; + } + # if called with a single argument, caller will expect a single scalar reply + # not the size of the returned array! + return $result[0] if @result == 1 and !wantarray; + @result; +} + +sub subseq { + my $self = shift; + my $obj = $self->SUPER::subseq(@_); + bless $obj,__PACKAGE__; # always bless into the generic RelSegment package +} + +sub strand { + my $self = shift; + if ($self->absolute) { + return _to_strand($self->{strand}); + } + return $self->stop <=> $self->start; +} + +sub _to_strand { + my $s = shift; + return -1 if $s eq '-'; + return +1 if $s eq '+'; + return 0; +} + +=head2 Bio::RangeI Methods + +The following Bio::RangeI methods are supported: + +overlaps(), contains(), equals(),intersection(),union(),overlap_extent() + +=cut + +sub intersection { + my $self = shift; + my (@ranges) = @_; + unshift @ranges,$self if ref $self; + $ranges[0]->isa('Bio::DB::GFF::RelSegment') + or return $self->SUPER::intersection(@_); + + my $ref = $ranges[0]->abs_ref; + my ($low,$high); + foreach (@ranges) { + return unless $_->can('abs_ref'); + $ref eq $_->abs_ref or return; + $low = $_->abs_low if !defined($low) or $low < $_->abs_low; + $high = $_->abs_high if !defined($high) or $high > $_->abs_high; + } + return unless $low < $high; + $self->new(-factory=> $self->factory, + -seq => $ref, + -start => $low, + -stop => $high); +} + +sub overlaps { + my $self = shift; + my($other,$so) = @_; + return $self->SUPER::overlaps(@_) unless $other->isa('Bio::DB::GFF::RelSegment'); + return if $self->abs_ref ne $other->abs_ref; + return if $self->abs_low > $other->abs_high; + return if $self->abs_high < $other->abs_low; + 1; +} + +sub contains { + my $self = shift; + my($other,$so) = @_; + return $self->SUPER::overlaps(@_) unless $other->isa('Bio::DB::GFF::RelSegment'); + return if $self->abs_ref ne $other->abs_ref; + return unless $self->abs_low <= $other->abs_low; + return unless $self->abs_high >= $other->abs_high; + 1; +} + +sub union { + my $self = shift; + my (@ranges) = @_; + unshift @ranges,$self if ref $self; + $ranges[0]->isa('Bio::DB::GFF::RelSegment') + or return $self->SUPER::union(@_); + + my $ref = $ranges[0]->abs_ref; + my ($low,$high); + foreach (@ranges) { + return unless $_->can('abs_ref'); + $ref eq $_->abs_ref or return; + $low = $_->abs_low if !defined($low) or $low > $_->abs_low; + $high = $_->abs_high if !defined($high) or $high < $_->abs_high; + } + $self->new(-factory=> $self->factory, + -seq => $ref, + -start => $low, + -stop => $high); +} + + +1; + +__END__ + +=head1 BUGS + +Schemas need some work. + +=head1 SEE ALSO + +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Segment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Segment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,862 @@ +=head1 NAME + +Bio::DB::GFF::Segment -- Simple DNA segment object + +=head1 SYNOPSIS + +See L. + +=head1 DESCRIPTION + +Bio::DB::GFF::Segment provides the basic representation of a range of +DNA contained in a GFF database. It is the base class from which the +Bio::DB::GFF::RelSegment and Bio::DB::GFF::Feature classes are +derived. + +Generally, you will not create or manipulate Bio::DB::GFF::Segment +objects directly, but use those that are returned by the Bio::DB::GFF +module. + +=cut + +package Bio::DB::GFF::Segment; + +use strict; +use Bio::Root::Root; +use Bio::Annotation::Collection; +use Bio::RangeI; +use Bio::Das::SegmentI; +use Bio::SeqI; + +use vars qw(@ISA); +@ISA = qw(Bio::Root::Root Bio::RangeI Bio::SeqI Bio::Das::SegmentI); + +use overload + '""' => 'asString', + eq => 'equals', + fallback => 1; + +=head1 API + +The remainder of this document describes the API for +Bio::DB::GFF::Segment. + +=cut + +=head2 new + + Title : new + Usage : $s = Bio::DB::GFF::Segment->new(@args) + Function: create a new segment + Returns : a new Bio::DB::GFF::Segment object + Args : see below + Status : Public + +This method creates a new Bio::DB::GFF::Segment object. Generally +this is called automatically by the Bio::DB::GFF module and +derivatives. + +There are five positional arguments: + + $factory a Bio::DB::GFF::Adaptor to use for database access + $sourceseq ID of the source sequence + $sourceclass class of the source sequence + $start start of the desired segment relative to source sequence + $stop stop of the desired segment relative to source sequence + +=cut + +sub new { + my $class = shift; + my ($factory,$segclass,$segname,$start,$stop) = @_; + $segclass = $segname->class if ref($segname) && $segname->can('class'); + $segclass ||= 'Sequence'; + + $factory or $class->throw("->new(): provide a factory argument"); + $class = ref $class if ref $class; + return bless { factory => $factory, + sourceseq => $segname, + class => $segclass, + start => $start, + stop => $stop, + strand => 0, + },$class; +} + +# read-only accessors + +=head2 factory + + Title : factory + Usage : $s->factory + Function: get the factory object + Returns : a Bio::DB::GFF::Adaptor + Args : none + Status : Public + +This is a read-only accessor for the Bio::DB::GFF::Adaptor object used +to create the segment. + +=cut + +sub factory { shift->{factory} } + +# start, stop, length + +=head2 start + + Title : start + Usage : $s->start + Function: start of segment + Returns : integer + Args : none + Status : Public + +This is a read-only accessor for the start of the segment. + +=cut + +sub start { shift->{start} } + +=head2 end + + Title : end + Usage : $s->end + Function: end of segment + Returns : integer + Args : none + Status : Public + +This is a read-only accessor for the end of the segment. + +=cut + +sub end { shift->{stop} } + +=head2 stop + + Title : stop + Usage : $s->stop + Function: stop of segment + Returns : integer + Args : none + Status : Public + +This is an alias for end(), provided for AcePerl compatibility. + +=cut + +*stop = \&end; + +=head2 length + + Title : length + Usage : $s->length + Function: length of segment + Returns : integer + Args : none + Status : Public + +Returns the length of the segment. Always a positive number. + +=cut + +sub length { abs($_[0]->{start} - $_[0]->{stop})+1 } + + +=head2 strand + + Title : strand + Usage : $s->strand + Function: strand of segment + Returns : +1,0,-1 + Args : none + Status : Public + +Returns the strand on which the segment resides, either +1, 0 or -1. + +=cut + +sub strand { + my $self = shift; + 0; +} + +=head2 low + + Title : low + Usage : $s->low + Function: return lower coordinate + Returns : lower coordinate + Args : none + Status : Public + +Returns the lower coordinate, either start or end. + +=cut + +sub low { + my $self = shift; + my ($start,$stop) = ($self->start,$self->stop); + return $start < $stop ? $start : $stop; +} +*abs_low = \&low; + +=head2 high + + Title : high + Usage : $s->high + Function: return higher coordinate + Returns : higher coordinate + Args : none + Status : Public + +Returns the higher coordinate, either start or end. + +=cut + +sub high { + my $self = shift; + my ($start,$stop) = ($self->start,$self->stop); + return $start > $stop ? $start : $stop; +} +*abs_high = \&high; + +=head2 sourceseq + + Title : sourceseq + Usage : $s->sourceseq + Function: get the segment source + Returns : a string + Args : none + Status : Public + +Returns the name of the source sequence for this segment. + +=cut + +sub sourceseq { shift->{sourceseq} } + +=head2 class + + Title : class + Usage : $s->class([$newclass]) + Function: get the source sequence class + Returns : a string + Args : new class (optional) + Status : Public + +Gets or sets the class for the source sequence for this segment. + +=cut + +sub class { + my $self = shift; + my $d = $self->{class}; + $self->{class} = shift if @_; + $d; +} + +=head2 subseq + + Title : subseq + Usage : $s->subseq($start,$stop) + Function: generate a subsequence + Returns : a Bio::DB::GFF::Segment object + Args : start and end of subsequence + Status : Public + +This method generates a new segment from the start and end positions +given in the arguments. If stop E start, then the strand is reversed. + +=cut + +sub subseq { + my $self = shift; + my ($newstart,$newstop) = @_; + my ($refseq,$start,$stop,$class) = ($self->{sourceseq}, + $self->{start},$self->{stop}, + $self->class); + + # We deliberately force subseq to return objects of type RelSegment + # Otherwise, when we get a subsequence from a Feature object, + # its method and source go along for the ride, which is incorrect. + my $new = $self->new_from_segment($self); + if ($start <= $stop) { + @{$new}{qw(start stop)} = ($start + $newstart - 1, $start + $newstop - 1); + } else { + @{$new}{qw(start stop)} = ($start - ($newstart - 1), $start - ($newstop - 1)), + + } + + $new; +} + +=head2 seq + + Title : seq + Usage : $s->seq + Function: get the sequence string for this segment + Returns : a string + Args : none + Status : Public + +Returns the sequence for this segment as a simple string. (-) strand +segments are automatically reverse complemented + +This method is also called dna() and protein() for backward +compatibility with AceDB. + +=cut + +sub seq { + my $self = shift; + my ($ref,$class,$start,$stop,$strand) + = @{$self}{qw(sourceseq class start stop strand)}; +# ($start,$stop) = ($stop,$start) if $strand eq '-'; + $self->factory->dna($ref,$start,$stop,$class); +} + +*protein = *dna = \&seq; + + +=head2 primary_seq + + Title : primary_seq + Usage : $s->primary_seq + Function: returns a Bio::PrimarySeqI compatible object + Returns : a Bio::PrimarySeqI object + Args : none + Status : Public + +This is for compatibility with BioPerl's separation of SeqI +from PrimarySeqI. It just returns itself. + +=cut + +#' + +sub primary_seq { shift } + +=head2 type + + Title : type + Usage : $s->type + Function: return the string "feature" + Returns : the string "feature" + Args : none + Status : Public + +This is for future sequence ontology-compatibility and +represents the default type of a feature on the genome + +=cut + +sub type { "feature" } + +=head2 equals + + Title : equals + Usage : $s->equals($d) + Function: segment equality + Returns : true, if two segments are equal + Args : another segment + Status : Public + +Returns true if the two segments have the same source sequence, start and stop. + +=cut + +sub equals { + my $self = shift; + my $peer = shift; + return unless defined $peer; + return $self->asString eq $peer unless ref($peer) && $peer->isa('Bio::DB::GFF::Segment'); + return $self->{start} eq $peer->{start} + && $self->{stop} eq $peer->{stop} + && $self->{sourceseq} eq $peer->{sourceseq}; +} + +=head2 asString + + Title : asString + Usage : $s->asString + Function: human-readable string for segment + Returns : a string + Args : none + Status : Public + +Returns a human-readable string representing this sequence. Format +is: + + sourceseq/start,stop + +=cut + +sub asString { + my $self = shift; + my $label = $self->refseq; + my $start = $self->start; + my $stop = $self->stop; + return "$label:$start,$stop"; +} + +=head2 clone + + Title : clone + Usage : $copy = $s->clone + Function: make a copy of this segment + Returns : a Bio::DB::GFF::Segment object + Args : none + Status : Public + +This method creates a copy of the segment and returns it. + +=cut + +# deep copy of the thing +sub clone { + my $self = shift; + my %h = %$self; + return bless \%h,ref($self); +} + +=head2 error + + Title : error + Usage : $error = $s->error([$new_error]) + Function: get or set the last error + Returns : a string + Args : an error message (optional) + Status : Public + +In case of a fault, this method can be used to obtain the last error +message. Internally it is called to set the error message. + +=cut + +sub error { + my $self = shift; + my $g = $self->{error}; + $self->{error} = shift if @_; + $g; +} + +=head1 Relative Addressing Methods + +The following methods are provided for compatibility with +Bio::DB::GFF::RelSegment, which provides relative addressing +functions. + +=head2 abs_start + + Title : abs_start + Usage : $s->abs_start + Function: the absolute start of the segment + Returns : an integer + Args : none + Status : Public + +This is an alias to start(), and provided for API compatibility with +Bio::DB::GFF::RelSegment. + +=cut + +*abs_start = \&start; + +=head2 abs_end + + Title : abs_end + Usage : $s->abs_end + Function: the absolute stop of the segment + Returns : an integer + Args : none + Status : Public + +This is an alias to stop(), and provided for API compatibility with +Bio::DB::GFF::RelSegment. + +=cut + +*abs_stop = \&stop; +*abs_end = \&stop; + +=head2 abs_strand + + Title : abs_strand + Usage : $s->abs_strand + Function: the absolute strand of the segment + Returns : +1,0,-1 + Args : none + Status : Public + +This is an alias to strand(), and provided for API compatibility with +Bio::DB::GFF::RelSegment. + +=cut + +sub abs_strand { + my $self = shift; + return $self->abs_end <=> $self->abs_start; +} + +=head2 abs_ref + + Title : abs_ref + Usage : $s->abs_ref + Function: the reference sequence for this segment + Returns : a string + Args : none + Status : Public + +This is an alias to sourceseq(), and is here to provide API +compatibility with Bio::DB::GFF::RelSegment. + +=cut + +*abs_ref = \&sourceseq; + +=head2 refseq + + Title : refseq + Usage : $s->refseq + Function: get or set the reference sequence + Returns : a string + Args : none + Status : Public + +Examine or change the reference sequence. This is an alias to +sourceseq(), provided here for API compatibility with +Bio::DB::GFF::RelSegment. + +=cut + +*refseq = \&sourceseq; + +=head2 ref + + Title : ref + Usage : $s->refseq + Function: get or set the reference sequence + Returns : a string + Args : none + Status : Public + +An alias for refseq() + +=cut + +sub ref { shift->refseq(@_) } + +=head2 seq_id + + Title : seq_id + Usage : $ref = $s->seq_id + Function: get the reference sequence in a LocationI-compatible way + Returns : a string + Args : none + Status : Public + +An alias for refseq() but only allows reading. + +=cut + +sub seq_id { shift->refseq } + +=head2 truncated + + Title : truncated + Usage : $truncated = $s->truncated + Function: Flag indicating that the segment was truncated during creation + Returns : A boolean flag + Args : none + Status : Public + +This indicates that the sequence was truncated during creation. The +returned flag is undef if no truncation occured. If truncation did +occur, the flag is actually an array ref in which the first element is +true if truncation occurred on the left, and the second element +occurred if truncation occurred on the right. + +=cut + +sub truncated { + my $self = shift; + my $hash = $self->{truncated} or return; + CORE::ref($hash) eq 'HASH' or return [1,1]; # paranoia -- not that this would ever happen ;-) + return [$hash->{start},$hash->{stop}]; +} + +=head2 Bio::RangeI Methods + +The following Bio::RangeI methods are supported: + +overlaps(), contains(), equals(),intersection(),union(),overlap_extent() + +=cut + +sub overlaps { + my $self = shift; + my($other,$so) = @_; + if ($other->isa('Bio::DB::GFF::RelSegment')) { + return if $self->abs_ref ne $other->abs_ref; + } + $self->SUPER::overlaps(@_); +} + +sub contains { + my $self = shift; + my($other,$so) = @_; + if ($other->isa('Bio::DB::GFF::RelSegment')) { + return if $self->abs_ref ne $other->abs_ref; + } + $self->SUPER::contains(@_); +} +#sub equals { +# my $self = shift; +# my($other,$so) = @_; +# if ($other->isa('Bio::DB::GFF::RelSegment')) { +# return if $self->abs_ref ne $other->abs_ref; +# } +# $self->SUPER::equals(@_); +#} +sub intersection { + my $self = shift; + my($other,$so) = @_; + if ($other->isa('Bio::DB::GFF::RelSegment')) { + return if $self->abs_ref ne $other->abs_ref; + } + $self->SUPER::intersection(@_); +} +sub union { + my $self = shift; + my($other) = @_; + if ($other->isa('Bio::DB::GFF::RelSegment')) { + return if $self->abs_ref ne $other->abs_ref; + } + $self->SUPER::union(@_); +} + +sub overlap_extent { + my $self = shift; + my($other) = @_; + if ($other->isa('Bio::DB::GFF::RelSegment')) { + return if $self->abs_ref ne $other->abs_ref; + } + $self->SUPER::overlap_extent(@_); +} + + +=head2 Bio::SeqI implementation + +=cut + +=head2 primary_id + + Title : primary_id + Usage : $unique_implementation_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage their + own object ids in a way the implementaiton can control + clients can expect one id to map to one object. + + For sequences with no accession number, this method should + return a stringified memory location. + + Returns : A string + Args : None + Status : Virtual + + +=cut + +sub primary_id { + my ($obj,$value) = @_; + + if( defined $value) { + $obj->{'primary_id'} = $value; + } + if( ! exists $obj->{'primary_id'} ) { + return "$obj"; + } + return $obj->{'primary_id'}; +} + + +=head2 display_name + + Title : display_name + Usage : $id = $obj->display_name or $obj->display_name($newid); + Function: Gets or sets the display id, also known as the common name of + the Seq object. + + The semantics of this is that it is the most likely string + to be used as an identifier of the sequence, and likely to + have "human" readability. The id is equivalent to the LOCUS + field of the GenBank/EMBL databanks and the ID field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id + to embed other information. Bioperl does not use any + embedded information in the ID field, and people are + encouraged to use other mechanisms (accession field for + example, or extending the sequence object) to solve this. + + Notice that $seq->id() maps to this function, mainly for + legacy/convenience issues. + Returns : A string + Args : None or a new id + +Note, this used to be called display_id(), and this name is preserved for +backward compatibility. The default is to return the seq_id(). + +=cut + +sub display_name { shift->seq_id } +*display_id = \&display_name; + +=head2 accession_number + + Title : accession_number + Usage : $unique_biological_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should return + "unknown". + Returns : A string + Args : None + + +=cut + +sub accession_number { + return 'unknown'; +} + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : a string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : none + Status : Virtual + + +=cut + +sub alphabet{ + return 'dna'; # no way this will be anything other than dna! +} + +=head2 desc + + Title : desc + Usage : $seqobj->desc($string) or $seqobj->desc() + Function: Sets or gets the description of the sequence + Example : + Returns : The description + Args : The description or none + + +=cut + +sub desc { shift->asString } + +=head2 species + + Title : species + Usage : $species = $seq->species() or $seq->species($species) + Function: Gets or sets the species + Example : + Returns : Bio::Species object + Args : None or Bio::Species object + +See L for more information + +=cut + +sub species { + my ($self, $species) = @_; + if ($species) { + $self->{'species'} = $species; + } else { + return $self->{'species'}; + } +} + +=head2 annotation + + Title : annotation + Usage : $ann = $seq->annotation or $seq->annotation($annotation) + Function: Gets or sets the annotation + Example : + Returns : Bio::Annotation object + Args : None or Bio::Annotation object + +See L for more information + +=cut + +sub annotation { + my ($obj,$value) = @_; + if( defined $value || ! defined $obj->{'annotation'} ) { + $value = new Bio::Annotation::Collection() unless defined $value; + $obj->{'annotation'} = $value; + } + return $obj->{'annotation'}; + +} + +=head2 is_circular + + Title : is_circular + Usage : if( $obj->is_circular) { /Do Something/ } + Function: Returns true if the molecule is circular + Returns : Boolean value + Args : none + +=cut + +sub is_circular{ + return 0; +} + + +1; +__END__ + +=head1 BUGS + +Report them please. + +=head1 SEE ALSO + +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 CONTRIBUTORS + +Jason Stajich Ejason@bioperl.orgE. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Typename.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Typename.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,188 @@ +=head1 NAME + +Bio::DB::GFF::Typename -- The name of a feature type + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + my $type = Bio::DB::GFF::Typename->new(similarity => 'BLAT_EST_GENOME'); + my $segment = $segment->features($type); + +=head1 DESCRIPTION + +Bio::DB::GFF::Typename objects encapsulate the combination of feature +method and source used by the GFF flat file format. They can be used +in the Bio::DB::GFF modules wherever a feature type is called for. + +Since there are relatively few types and many features, this module +maintains a memory cache of unique types so that two features of the +same type will share the same Bio::DB::GFF::Typename object. + +=head1 METHODS + +=cut + +package Bio::DB::GFF::Typename; + +use strict; +use Bio::Root::Root; +use Bio::Das::FeatureTypeI; +use overload + '""' => 'asString', + fallback => 1; + +use vars '@ISA'; + +@ISA = qw(Bio::Root::Root Bio::Das::FeatureTypeI); + +# cut down on the number of equivalent objects we have to create +my %OBJECT_CACHE; + +=head2 new + + Title : new + Usage : $type = Bio::DB::GFF::Typename->new($method,$source) + Function: create a new Bio::DB::GFF::Typename object + Returns : a new Bio::DB::GFF::Typename object + Args : method and source + Status : Public + +=cut + +sub new { + my $package = shift; + my ($method,$source) = @_; + $method ||= ''; + $source ||= ''; + if ($source eq '' && $method =~ /^(\w+):(\w+)$/) { + $method = $1; + $source = $2; + } + return $OBJECT_CACHE{"$method:$source"} ||= bless [$method,$source],$package; +} + +=head2 method + + Title : method + Usage : $method = $type->method([$newmethod]) + Function: get or set the method + Returns : a method name + Args : new method name (optional) + Status : Public + +=cut + +sub method { + my $self = shift; + my $d = $self->[0]; + $self->[0] = shift if @_; + $d; +} + + +=head2 source + + Title : source + Usage : $source = $type->source([$newsource]) + Function: get or set the source + Returns : a source name + Args : new source name (optional) + Status : Public + +=cut + +sub source { + my $self = shift; + my $d = $self->[1]; + $self->[1] = shift if @_; + $d; +} + +=head2 asString + + Title : asString + Usage : $string = $type->asString + Function: get the method and source as a string + Returns : a string in "method:source" format + Args : none + Status : Public + +This method is used by operator overloading to overload the '""' +operator. + +=cut + +sub asString { + $_[0]->[1] ? join ':',@{$_[0]} : $_[0]->[0]; +} + +=head2 clone + + Title : clone + Usage : $new_clone = $type->clone; + Function: clone this object + Returns : a new Bio::DB::GFF::Typename object + Args : none + Status : Public + +This method creates an exact copy of the object. + +=cut + +sub clone { + my $self = shift; + return bless [@$self],ref $self; +} + +=head2 match + + Title : match + Usage : $boolean = $type->match($type_or_string) + Function: fuzzy match on types + Returns : a flag indicating that the argument matches the object + Args : a Bio::DB::GFF::typename object, or a string in method:source format + Status : Public + +This match allows Sequence:Link and Sequence: to match, but not +Sequence:Link and Sequence:Genomic_canonical. + +=cut + +sub match { + my $self = shift; + my $target = shift; + my ($method,$source); + if (UNIVERSAL::isa($target,'Bio::DB::GFF::Typename')) { + ($method,$source) = ($target->method,$target->source); + } else { + ($method,$source) = split /:/,$target; + } + + return if $method ne '' && $self->method ne '' && $method ne $self->method; + return if $source ne '' && $self->source ne '' && $source ne $self->source; + 1; +} + +1; + +=head1 BUGS + +This module is still under development. + +=head1 SEE ALSO + +L, L, L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Util/Binning.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Util/Binning.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,106 @@ +=head1 NAME + +Bio::DB::GFF::Util::Binning - binning utility for Bio::DB::GFF index + +=head1 SYNOPSIS + + use Bio::DB::GFF::Util::Binning qw(bin bin_bot bin_top); + my $tier = bin($start,$stop,$min); + +=head1 DESCRIPTION + +This is a utility module that exports the functions bin(), bin_bot() +and bin_top(). These functions translate a range on the genome into a +named bin that is used as an index in the Bio::DB::GFF schema. The +index makes certain range retrieval queries much faster. + +=head1 API + +The remainder of the document describes the function calls. No calls +are exported by default, but must be imported explicitly. + +=over 4 + +=cut + +package Bio::DB::GFF::Util::Binning; + +use strict; +require Exporter; +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); +@ISA = 'Exporter'; +@EXPORT_OK = qw(bin bin_bot bin_top); +@EXPORT = @EXPORT_OK; + +=item $bin_name = bin($start,$stop,$bin_size) + +Given a start, stop and bin size on the genome, translate this +location into a bin name. In a list context, returns the bin tier +name and the position that the bin begins. + +=cut + +sub bin { + my ($start,$stop,$min) = @_; + my $tier = $min; + my ($bin_start,$bin_end); + while (1) { + $bin_start = int $start/$tier; + $bin_end = int $stop/$tier; + last if $bin_start == $bin_end; + $tier *= 10; + } + return wantarray ? ($tier,$bin_start) : bin_name($tier,$bin_start); +} + +=item $bottom = bin_bot($tier,$start) + +Given a tier name and a range start position, returns the lower end of +the bin range. + +=cut + +sub bin_bot { + my $tier = shift; + my $pos = shift; + bin_name($tier,int($pos/$tier)); +} + +=item $top = bin_top($tier,$end) + +Given a tier name and the end of a range, returns the upper end of the +bin range. + +=cut + +*bin_top = \&bin_bot; + +sub bin_name { sprintf("%d.%06d",@_) } + +sub log10 { + my $i = shift; + log($i)/log(10); +} + +1; + +=back + +=head1 BUGS + +None known yet. + +=head1 SEE ALSO + +L, + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GFF/Util/Rearrange.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF/Util/Rearrange.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +=head1 NAME + +Bio::DB::GFF::Util::Rearrange - rearrange utility + +=head1 SYNOPSIS + + use Bio::DB::GFF::Util::Rearrange 'rearrange'; + + my ($arg1,$arg2,$arg3,$others) = rearrange(['ARG1','ARG2','ARG3'],@args); + +=head1 DESCRIPTION + +This is a different version of the _rearrange() method from +Bio::Root::Root. It runs as a function call, rather than as a method +call, and it handles unidentified parameters slightly differently. + +It exports a single function call: + +=over 4 + +=item @rearranged_args = rearrange(\@parameter_names,@parameters); + +The first argument is an array reference containing list of parameter +names in the desired order. The second and subsequent arguments are a +list of parameters in the format: + + (-arg1=>$arg1,-arg2=>$arg2,-arg3=>$arg3...) + +The function calls returns the parameter values in the order in which +they were specified in @parameter_names. Any parameters that were not +found in @parameter_names are returned in the form of a hash reference +in which the keys are the uppercased forms of the parameter names, and +the values are the parameter values. + +=back + +=head1 BUGS + +None known yet. + +=head1 SEE ALSO + +L, + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut + +package Bio::DB::GFF::Util::Rearrange; + +use strict; +require Exporter; +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); +@ISA = 'Exporter'; +@EXPORT_OK = qw(rearrange); +@EXPORT = qw(rearrange); + +# default export +sub rearrange { + my($order,@param) = @_; + return unless @param; + my %param; + + if (ref $param[0] eq 'HASH') { + %param = %{$param[0]}; + } else { + return @param unless (defined($param[0]) && substr($param[0],0,1) eq '-'); + + my $i; + for ($i=0;$i<@param;$i+=2) { + $param[$i]=~s/^\-//; # get rid of initial - if present + $param[$i]=~tr/a-z/A-Z/; # parameters are upper case + } + + %param = @param; # convert into associative array + } + + my(@return_array); + + local($^W) = 0; + my($key)=''; + foreach $key (@$order) { + my($value); + if (ref($key) eq 'ARRAY') { + foreach (@$key) { + last if defined($value); + $value = $param{$_}; + delete $param{$_}; + } + } else { + $value = $param{$key}; + delete $param{$key}; + } + push(@return_array,$value); + } + push (@return_array,\%param) if %param; + return @return_array; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GenBank.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GenBank.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +# $Id: GenBank.pm,v 1.47.2.2 2003/07/03 12:31:31 heikki Exp $ +# +# BioPerl module for Bio::DB::GenBank +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# +# Added LWP support - Jason Stajich 2000-11-6 +# completely reworked by Jason Stajich 2000-12-8 +# to use WebDBSeqI + +# Added batch entrez back when determined that new entrez cgi will +# essentially work (there is a limit to the number of characters in a +# GET request so I am not sure how we can get around this). The NCBI +# Batch Entrez form has changed some and it does not support retrieval +# of text only data. Still should investigate POST-ing (tried and +# failed) a message to the entrez cgi to get around the GET +# limitations. + +=head1 NAME + +Bio::DB::GenBank - Database object interface to GenBank + +=head1 SYNOPSIS + + use Bio::DB::GenBank; + $gb = new Bio::DB::GenBank; + + $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID + + # or ... + + $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number + $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version + $seq = $gb->get_Seq_by_gi('405830'); # GI Number + + # get a stream via a query string + my $query = Bio::DB::Query::GenBank->new + (-query =>'Oryza sativa[Organism] AND EST', + -reldate => '30', + -db => 'nucleotide'); + my $seqio = $gb->get_Stream_by_query($query); + + while( my $seq = $seqio->next_seq ) { + print "seq length is ", $seq->length,"\n"; + } + + # or ... best when downloading very large files, prevents + # keeping all of the file in memory + + # also don't want features, just sequence so let's save bandwith + # and request Fasta sequence + $gb = new Bio::DB::GenBank(-retrievaltype => 'tempfile' , + -format => 'Fasta'); + my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] ); + while( my $clone = $seqio->next_seq ) { + print "cloneid is ", $clone->display_id, " ", + $clone->accession_number, "\n"; + } + # note that get_Stream_by_version is not implemented + +=head1 DESCRIPTION + +Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the +GenBank database at NCBI, via an Entrez query. + +WARNING: Please do NOT spam the Entrez web server with multiple +requests. NCBI offers Batch Entrez for this purpose. + +Note that when querying for GenBank accessions starting with 'NT_' you +will need to call $gb-Erequest_format('fasta') beforehand, because +in GenBank format (the default) the sequence part will be left out +(the reason is that NT contigs are rather annotation with references +to clones). + +Some work has been done to automatically detect and retrieve whole NT_ +clones when the data is in that format (NCBI RefSeq clones). More +testing and feedback from users is needed to achieve a good fit of +functionality and ease of use. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey, Jason Stajich + +Email amackey@virginia.edu +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::GenBank; +use strict; +use vars qw(@ISA %PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE); +use Bio::DB::NCBIHelper; + +@ISA = qw(Bio::DB::NCBIHelper); +BEGIN { + $DEFAULTMODE = 'single'; + $DEFAULTFORMAT = 'gp'; + %PARAMSTRING = ( + 'batch' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'query' => { 'usehistory' => 'y', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'gi' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'version' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'single' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + ); +} + +# new is in NCBIHelper + +# helper method to get db specific options + +=head2 new + + Title : new + Usage : $gb = Bio::DB::GenBank->new(@options) + Function: Creates a new genbank handle + Returns : New genbank handle + Args : -delay number of seconds to delay between fetches (3s) + +NOTE: There are other options that are used internally. By NCBI policy, this +module introduces a 3s delay between fetches. If you are fetching multiple genbank +ids, it is a good idea to use get + +=cut + +=head2 get_params + + Title : get_params + Usage : my %params = $self->get_params($mode) + Function: Returns key,value pairs to be passed to NCBI database + for either 'batch' or 'single' sequence retrieval method + Returns : a key,value pair hash + Args : 'single' or 'batch' mode for retrieval + +=cut + +sub get_params { + my ($self, $mode) = @_; + return defined $PARAMSTRING{$mode} ? + %{$PARAMSTRING{$mode}} : %{$PARAMSTRING{$DEFAULTMODE}}; +} + +# from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI + +=head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc($acc); + Function: Gets a Seq object by accession numbers + Returns : a Bio::Seq object + Args : the accession number as a string + Note : For GenBank, this just calls the same code for get_Seq_by_id() + Throws : "id does not exist" exception + +=cut + + +sub get_Seq_by_acc { + my ($self,$seqid) = @_; + $self->SUPER::get_Seq_by_acc("gb|$seqid"); +} + +=head2 get_Seq_by_gi + + Title : get_Seq_by_gi + Usage : $seq = $db->get_Seq_by_gi('405830'); + Function: Gets a Bio::Seq object by gi number + Returns : A Bio::Seq object + Args : gi number (as a string) + Throws : "gi does not exist" exception + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=head1 Routines implemented by Bio::DB::NCBIHelper + +=head2 get_Stream_by_query + + Title : get_Stream_by_query + Usage : $seq = $db->get_Stream_by_query($query); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $query : An Entrez query string or a + Bio::DB::Query::GenBank object. It is suggested that you + create a Bio::DB::Query::GenBank object and get the entry + count before you fetch a potentially large stream. + +=cut + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +=head2 get_Stream_by_gi + + Title : get_Stream_by_gi + Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]); + Function: Gets a series of Seq objects by gi numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of gi numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +NOTE: This method is redundant and deprecated. Use get_Stream_by_id() +instead. + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: HTTP::Request + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/GenPept.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GenPept.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,235 @@ +# $Id: GenPept.pm,v 1.26 2002/11/21 17:45:59 lstein Exp $ +# +# BioPerl module for Bio::DB::GenPept +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +# completely reworked by Jason Stajich to use Bio::DB::WebDBSeqI 2000-12-12 + +=head1 NAME + +Bio::DB::GenPept - Database object interface to GenPept + +=head1 SYNOPSIS + + $gb = new Bio::DB::GenPept; + + $seq = $gb->get_Seq_by_id('195055'); # Unique ID + + # or ... + + $seq = $gb->get_Seq_by_acc('DEECTH'); # Accession Number + + my $seqio = $gb->get_Stream_by_id(['195055', 'DEECTH']); + while( my $seq = $seqio->next_seq ) { + print "seq is is ", $seq->display_id, "\n"; + } + +=head1 DESCRIPTION + +Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the GenPept +database at NCBI, via an Entrez query. + +WARNING: Please do NOT spam the Entrez web server with multiple requests. +NCBI offers Batch Entrez for this purpose. Batch Entrez support will likely +be supported in a future version of DB::GenPept. + +Currently the only return format supported by NCBI Entrez for GenPept +database is GenPept format, so any format specification passed to +GenPept will be ignored still be forced to GenPept format (which is +just GenBank format). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey, Jason Stajich + +Email amackey@virginia.edu +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::GenPept; +use strict; +use vars qw(@ISA $DEFAULTFORMAT $DEFAULTMODE %PARAMSTRING ); +use Bio::DB::NCBIHelper; + +@ISA = qw(Bio::DB::NCBIHelper); +BEGIN { + $DEFAULTMODE = 'single'; + $DEFAULTFORMAT = 'gp'; + %PARAMSTRING = ( + 'batch' => { 'db' => 'protein', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'gi' => { 'db' => 'protein', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'version' => { 'db' => 'protein', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'single' => { 'db' => 'protein', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + ); +} + +# the new way to make modules a little more lightweight +sub new { + my($class, @args) = @_; + my $self = $class->SUPER::new(@args); + $self->request_format($self->default_format); + return $self; +} + +=head2 get_params + + Title : get_params + Usage : my %params = $self->get_params($mode) + Function: Returns key,value pairs to be passed to NCBI database + for either 'batch' or 'single' sequence retrieval method + Returns : a key,value pair hash + Args : 'single' or 'batch' mode for retrieval + +=cut + +sub get_params { + my ($self, $mode) = @_; + return defined $PARAMSTRING{$mode} ? %{$PARAMSTRING{$mode}} : %{$PARAMSTRING{$DEFAULTMODE}}; +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + return $DEFAULTFORMAT; +} + +# from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI + +=head1 Routines from Bio::DB::WebDBSeqI and Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('AAC73346'); + Function: Gets a Seq objects by accession number + Returns : Bio::Seq object + Args : accession number to retrive by + +=head1 Routines implemented by Bio::DB::NCBIHelper + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: HTTP::Request + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + +=head2 get_Stream_by_acc (2) + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Stream_by_acc($acc); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=head2 request_format + + Title : request_format + Usage : my $format = $self->request_format; + $self->request_format($format); + Function: Get/Set sequence format retrieval + Returns : string representing format + Args : $format = sequence format + +=cut + +# oberride to force format to be GenPept regardless +sub request_format { + my ($self) = @_; + return $self->SUPER::request_format($self->default_format()); +} + +1; +__END__ + + + + + + + + + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/InMemoryCache.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/InMemoryCache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,258 @@ +# POD documentation - main docs before the code +# +# + +=head1 NAME + +Bio::DB::InMemoryCache - Abstract interface for a sequence database + +=head1 SYNOPSIS + + $cachedb = Bio::DB::InMemoryCache->new( -seqdb => $real_db, + -number => 1000); + + # + # get a database object somehow using a concrete class + # + + $seq = $cachedb->get_Seq_by_id('ROA1_HUMAN'); + + # + # $seq is a Bio::Seq object + # + +=head1 DESCRIPTION + +This is a memory cache system which saves the objects returned by Bio::DB::RandomAccessI in +memory to a hard limit of sequences. + +=head1 CONTACT + +Ewan Birney + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::DB::InMemoryCache; + +use Bio::DB::SeqI; + +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Seq; + +@ISA = qw(Bio::Root::Root Bio::DB::SeqI); + + +sub new { + my ($class,@args) = @_; + + my $self = Bio::Root::Root->new(); + bless $self,$class; + + my ($seqdb,$number,$agr) = $self->_rearrange([qw(SEQDB NUMBER AGRESSION)],@args); + + if( !defined $seqdb || !ref $seqdb || !$seqdb->isa('Bio::DB::RandomAccessI') ) { + $self->throw("Must be a randomaccess database not a [$seqdb]"); + } + if( !defined $number ) { + $number = 1000; + } + + $self->seqdb($seqdb); + $self->number($number); + $self->agr($agr); + + # we consider acc as the primary id here + $self->{'_cache_number_hash'} = {}; + $self->{'_cache_id_hash'} = {}; + $self->{'_cache_acc_hash'} = {}; + $self->{'_cache_number'} = 1; + + return $self; +} + + + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +sub get_Seq_by_id{ + my ($self,$id) = @_; + + if( defined $self->{'_cache_id_hash'}->{$id} ) { + my $acc = $self->{'_cache_id_hash'}->{$id}; + my $seq = $self->{'_cache_acc_hash'}->{$acc}; + $self->{'_cache_number_hash'}->{$seq->accession} = $self->{'_cache_number'}++; + return $seq; + } else { + return $self->_load_Seq('id',$id); + } +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +sub get_Seq_by_acc{ + my ($self,$acc) = @_; + + #print STDERR "In cache get for $acc\n"; + if( defined $self->{'_cache_acc_hash'}->{$acc} ) { + #print STDERR "Returning cached $acc\n"; + my $seq = $self->{'_cache_acc_hash'}->{$acc}; + $self->{'_cache_number_hash'}->{$seq->accession} = $self->{'_cache_number'}++; + return $seq; + } else { + return $self->_load_Seq('acc',$acc); + } +} + + + +sub number { + my ($self, $number) = @_; + if ($number) { + $self->{'number'} = $number; + } else { + return $self->{'number'}; + } +} + +sub seqdb { + my ($self, $seqdb) = @_; + if ($seqdb) { + $self->{'seqdb'} = $seqdb; + } else { + return $self->{'seqdb'}; + } +} + +sub agr { + my ($self, $agr) = @_; + if ($agr) { + $self->{'agr'} = $agr; + } else { + return $self->{'agr'}; + } +} + + +sub _load_Seq { + my ($self,$type,$id) = @_; + + my $seq; + + if( $type eq 'id') { + $seq = $self->seqdb->get_Seq_by_id($id); + }elsif ( $type eq 'acc' ) { + $seq = $self->seqdb->get_Seq_by_acc($id); + } else { + $self->throw("Bad internal error. Don't understand $type"); + } + + if( $self->agr() ) { + #print STDERR "Pulling out into memory\n"; + my $newseq = Bio::Seq->new( -display_id => $seq->display_id, + -accession_number => $seq->accession, + -seq => $seq->seq, + -desc => $seq->desc, + ); + if( $self->agr() == 1 ) { + foreach my $sf ( $seq->top_SeqFeatures() ) { + $newseq->add_SeqFeature($sf); + } + + $newseq->annotation($seq->annotation); + } + $seq = $newseq; + } + + if( $self->_number_free < 1 ) { + # remove the latest thing from the hash + my @accs = sort { $self->{'_cache_number_hash'}->{$a} <=> $self->{'_cache_number_hash'}->{$b} } keys %{$self->{'_cache_number_hash'}}; + + my $acc = shift @accs; + # remove this guy + my $seq = $self->{'_cache_acc_hash'}->{$acc}; + + delete $self->{'_cache_number_hash'}->{$acc}; + delete $self->{'_cache_id_hash'}->{$seq->id}; + delete $self->{'_cache_acc_hash'}->{$acc}; + } + + # up the number, register this sequence into the hash. + $self->{'_cache_id_hash'}->{$seq->id} = $seq->accession; + $self->{'_cache_acc_hash'}->{$seq->accession} = $seq; + $self->{'_cache_number_hash'}->{$seq->accession} = $self->{'_cache_number'}++; + + return $seq; +} + + +sub _number_free { + my $self = shift; + + return $self->number - scalar(keys %{$self->{'_cache_number_hash'}}); +} + + + + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=cut + + +sub get_Seq_by_version{ + my ($self,@args) = @_; + $self->throw("Not implemented it"); +} + + + +## End of Package + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/MANIFEST --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/MANIFEST Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,19 @@ +GFF.pm +GFF/Adaptor/dbi.pm +GFF/Adaptor/dbi/mysql.pm +GFF/Adaptor/dbi/mysqlopt.pm +GFF/Aggregator.pm +GFF/Aggregator/alignment.pm +GFF/Aggregator/clone.pm +GFF/Aggregator/none.pm +GFF/Aggregator/transcript.pm +GFF/Featname.pm +GFF/Feature.pm +GFF/Homol.pm +GFF/RelSegment.pm +GFF/Segment.pm +GFF/Typename.pm +GFF/Util/Binning.pm +GFF/Util/Rearrange.pm +MANIFEST +Makefile.PL diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Makefile.PL --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Makefile.PL Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,13 @@ +use ExtUtils::MakeMaker; +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. +WriteMakefile( + 'NAME' => 'Bio::DB::GFF', + 'VERSION_FROM' => 'GFF.pm', # finds $VERSION + 'PREREQ_PM' => { 'DBI' => 1.0, + 'Bio::DB::Fasta' => 1.0, + }, # e.g., Module::Name => 1.1 + 'LIBS' => [''], # e.g., '-lm' + 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING' + 'INC' => '', # e.g., '-I/usr/include/other' +); diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/NCBIHelper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/NCBIHelper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,468 @@ +# $Id: NCBIHelper.pm,v 1.24.2.2 2003/06/12 09:29:38 heikki Exp $ +# +# BioPerl module for Bio::DB::NCBIHelper +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# +# Interfaces with new WebDBSeqI interface + +=head1 NAME + +Bio::DB::NCBIHelper - A collection of routines useful for queries to +NCBI databases. + +=head1 SYNOPSIS + + #Do not use this module directly. + + # get a Bio::DB::NCBIHelper object somehow + my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']); + foreach my $seq ( $seqio->next_seq ) { + # process seq + } + +=head1 DESCRIPTION + +Provides a single place to setup some common methods for querying NCBI +web databases. This module just centralizes the methods for +constructing a URL for querying NCBI GenBank and NCBI GenPept and the +common HTML stripping done in L(). + +The base NCBI query URL used is +http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::NCBIHelper; +use strict; +use vars qw(@ISA $HOSTBASE %CGILOCATION %FORMATMAP + $DEFAULTFORMAT $MAX_ENTRIES $VERSION); + +use Bio::DB::WebDBSeqI; +use Bio::DB::Query::GenBank; +use HTTP::Request::Common; +use URI; +use Bio::Root::IO; +use Bio::DB::RefSeq; +use Bio::Root::Root; + +@ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); +$VERSION = '0.8'; + +BEGIN { + $MAX_ENTRIES = 19000; + $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov'; + %CGILOCATION = ( + 'batch' => ['post' => '/entrez/eutils/efetch.fcgi'], + 'query' => ['get' => '/entrez/eutils/efetch.fcgi'], + 'single' => ['get' => '/entrez/eutils/efetch.fcgi'], + 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'], + 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'], + ); + + %FORMATMAP = ( 'gb' => 'genbank', + 'gp' => 'genbank', + 'fasta' => 'fasta', + ); + + $DEFAULTFORMAT = 'gb'; +} + +# the new way to make modules a little more lightweight + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + + +=head2 get_params + + Title : get_params + Usage : my %params = $self->get_params($mode) + Function: Returns key,value pairs to be passed to NCBI database + for either 'batch' or 'single' sequence retrieval method + Returns : a key,value pair hash + Args : 'single' or 'batch' mode for retrieval + +=cut + +sub get_params { + my ($self, $mode) = @_; + $self->throw("subclass did not implement get_params"); +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + return $DEFAULTFORMAT; +} + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: HTTP::Request + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self, @qualifiers) = @_; + my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS + FORMAT QUERY)], + @qualifiers); + + $mode = lc $mode; + ($format) = $self->request_format() unless ( defined $format); + if( !defined $mode || $mode eq '' ) { $mode = 'single'; } + my %params = $self->get_params($mode); + if( ! %params ) { + $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'") + } + my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]); + + unless( defined $uids or defined $query) { + $self->throw("Must specify a query or list of uids to fetch"); + } + + if ($uids) { + if( ref($uids) =~ /array/i ) { + $uids = join(",", @$uids); + } + $params{'id'} = $uids; + } + + elsif ($query && $query->can('cookie')) { + @params{'WebEnv','query_key'} = $query->cookie; + $params{'db'} = $query->db; + } + + elsif ($query) { + $params{'id'} = join ',',$query->ids; + } + + $params{'rettype'} = $format; + if ($CGILOCATION{$mode}[0] eq 'post') { + return POST $url,[%params]; + } else { + $url->query_form(%params); + $self->debug("url is $url \n"); + return GET $url; + } +} + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +NOTE: deprecated API. Use get_Stream_by_id() instead. + +=cut + +*get_Stream_by_batch = sub { + my $self = shift; + $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead'); + $self->get_Stream_by_id(@_) +}; + +=head2 get_Stream_by_query + + Title : get_Stream_by_query + Usage : $seq = $db->get_Stream_by_query($query); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $query : An Entrez query string or a + Bio::DB::Query::GenBank object. It is suggested that you + create a Bio::DB::Query::GenBank object and get the entry + count before you fetch a potentially large stream. + +=cut + +sub get_Stream_by_query { + my ($self, $query) = @_; + unless (ref $query && $query->can('query')) { + $query = Bio::DB::Query::GenBank->new($query); + } + return $self->get_seq_stream('-query' => $query, '-mode'=>'query'); +} + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +# the default method, works for genbank/genpept, other classes should +# override it with their own method. + +sub postprocess_data { + my ($self, %args) = @_; + my $data; + my $type = uc $args{'type'}; + my $location = $args{'location'}; + if( !defined $type || $type eq '' || !defined $location) { + return; + } elsif( $type eq 'STRING' ) { + $data = $$location; + } elsif ( $type eq 'FILE' ) { + open(TMP, $location) or $self->throw("could not open file $location"); + my @in = ; + close TMP; + $data = join("", @in); + } + + # transform links to appropriate descriptions + if ($data =~ /\nCONTIG\s+/) { + $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run."); + my(@batch,@accession,%accessions,@location,$id, + $contig,$stream,$aCount,$cCount,$gCount,$tCount); + + # process GenBank CONTIG join(...) into two arrays + $data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/; + $contig = $1; + $contig =~ s/\n|\)//g; + foreach (split /\s*,\s*/,$contig){ + if (/>(.+)<.+>:(.+)/) { + ($id) = split /\./, $1; + push @accession, $id; + push @location, $2; + $accessions{$id}->{'count'}++; + } elsif( /([\w\.]+):(.+)/ ) { + ($id) = split /\./, $1; + $accessions{$id}->{'count'}++; + push @accession, $id; + push @location, $2; + } + } + + # grab multiple sequences by batch and join based location variable + my @unique_accessions = keys %accessions; + $stream = $self->get_Stream_by_acc(\@unique_accessions); + $contig = ""; + my $ct = 0; + while( my $seq = $stream->next_seq() ) { + if( $seq->accession_number !~ /$unique_accessions[$ct]/ ) { + printf STDERR "warning, %s does not match %s\n", + $seq->accession_number, $unique_accessions[$ct]; + } + $accessions{$unique_accessions[$ct]}->{'seq'} = $seq; + $ct++; + } + for (my $i = 0; $i < @accession; $i++) { + my $seq = $accessions{$accession[$i]}->{'seq'}; + unless( defined $seq ) { + # seq not cached, get next sequence + $self->warn("unable to find sequence $accession[$i]\n"); + return undef; + } + my($start,$end) = split(/\.\./, $location[$i]); + $contig .= $seq->subseq($start,$end-$start); + } + + # count number of each letter in sequence + $aCount = () = $contig =~ /a/ig; + $cCount = () = $contig =~ /c/ig; + $gCount = () = $contig =~ /g/ig; + $tCount = () = $contig =~ /t/ig; + + # remove everything after and including CONTIG + $data =~ s/(CONTIG[\s\S]+)$//i; + + # build ORIGIN part of data file using sequence and counts + $data .= "BASE COUNT $aCount a $cCount c $gCount g $tCount t\n"; + $data .= "ORIGIN \n"; + $data .= "$contig\n//"; + } + else { + $data =~ s/\s*(\S+)\s*<\s*\/a\s*\>/$1/ig; + } + + # fix gt and lt + $data =~ s/>/>/ig; + $data =~ s/</$location") or $self->throw("couldn't overwrite file $location"); + print TMP $data; + close TMP; + } elsif ( $type eq 'STRING' ) { + ${$args{'location'}} = $data; + } + $self->debug("format is ". join(',',$self->request_format()). + " data is\n$data\n"); +} + + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + if( defined $value ) { + $value = lc $value; + if( defined $FORMATMAP{$value} ) { + $self->{'_format'} = [ $value, $FORMATMAP{$value}]; + } else { + # Try to fall back to a default. Alternatively, we could throw + # an exception + $self->{'_format'} = [ $value, $value ]; + } + } + return @{$self->{'_format'}}; +} + +=head2 Bio::DB::WebDBSeqI methods + +Overriding WebDBSeqI method to help newbies to retrieve sequences + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +sub get_Stream_by_acc { + my ($self, $ids ) = @_; + my $newdb = $self->_check_id($ids); + if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) { + return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } else { + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } +} + + +=head2 _check_id + + Title : _check_id + Usage : + Function: + Returns : A Bio::DB::RefSeq reference or throws + Args : $id(s), $string + +=cut + +sub _check_id { + my ($self, $ids) = @_; + + # NT contigs can not be retrieved + $self->throw("NT_ contigs are whole chromosome files which are not part of regular". + "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") + if $ids =~ /NT_/; + + # Asking for a RefSeq from EMBL/GenBank + + if ($ids =~ /N._/) { + $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.". + " Redirecting the request.\n") + if $self->verbose >= 0; + return new Bio::DB::RefSeq; + } +} + +=head2 delay_policy + + Title : delay_policy + Usage : $secs = $self->delay_policy + Function: return number of seconds to delay between calls to remote db + Returns : number of seconds to delay + Args : none + +NOTE: NCBI requests a delay of 3s between requests. This method +implements that policy. + +=cut + +sub delay_policy { + my $self = shift; + return 3; +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Query/GenBank.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Query/GenBank.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,286 @@ +# $Id: GenBank.pm,v 1.4.2.1 2003/09/09 21:28:52 lstein Exp $ +# +# BioPerl module for Bio::DB::Query::GenBank.pm +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::DB::Query::GenBank - Build a GenBank Entrez Query + +=head1 SYNOPSIS + + my $query_string = 'Oryza[Organism] AND EST[Keyword]'; + my $query = Bio::DB::Query::GenBank->new(-db=>'nucleotide', + -query=>$query_string, + -mindate => '2001', + -maxdate => '2002'); + my $count = $query->count; + my @ids = $query->ids; + + # get a genbank database handle + my $gb = new Bio::DB::GenBank; + my $stream = $gb->get_Stream_by_query($query); + while (my $seq = $stream->next_seq) { + ... + } + + # initialize the list yourself + my $query = Bio::DB::Query::GenBank->new(-ids=>[195052,2981014,11127914]); + + +=head1 DESCRIPTION + +This class encapsulates NCBI Entrez queries. It can be used to store +a list of GI numbers, to translate an Entrez query expression into a +list of GI numbers, or to count the number of terms that would be +returned by a query. Once created, the query object can be passed to +a Bio::DB::GenBank object in order to retrieve the entries +corresponding to the query. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::Query::GenBank; +use strict; +use Bio::DB::Query::WebQuery; +use URI::Escape 'uri_unescape'; + +use constant EPOST => 'http://www.ncbi.nih.gov/entrez/eutils/epost.fcgi'; +use constant ESEARCH => 'http://www.ncbi.nih.gov/entrez/eutils/esearch.fcgi'; +use constant DEFAULT_DB => 'protein'; +use constant MAXENTRY => 100; + +use vars qw(@ISA @ATTRIBUTES $VERSION); + +@ISA = 'Bio::DB::Query::WebQuery'; +$VERSION = '0.2'; + +BEGIN { + @ATTRIBUTES = qw(db reldate mindate maxdate datetype); + for my $method (@ATTRIBUTES) { + eval <{'_$method'}; + \$self->{'_$method'} = shift if \@_; + \$d; +} +END + } +} + +=head2 new + + Title : new + Usage : $db = Bio::DB::Query::GenBank->new(@args) + Function: create new query object + Returns : new query object + Args : -db database ('protein' or 'nucleotide') + -query query string + -mindate minimum date to retrieve from + -maxdate maximum date to retrieve from + -reldate relative date to retrieve from (days) + -datetype date field to use ('edat' or 'mdat') + -ids array ref of gids (overrides query) + +This method creates a new query object. Typically you will specify a +-db and a -query argument, possibly modified by -mindate, -maxdate, or +-reldate. -mindate and -maxdate specify minimum and maximum dates for +entries you are interested in retrieving, expressed in the form +DD/MM/YYYY. -reldate is used to fetch entries that are more recent +than the indicated number of days. + +If you provide an array reference of IDs in -ids, the query will be +ignored and the list of IDs will be used when the query is passed to a +Bio::DB::GenBank object's get_Stream_by_query() method. A variety of +IDs are automatically recognized, including GI numbers, Accession +numbers, Accession.version numbers and locus names. + +=cut + +sub new { + my $class = shift; + my $self = $class->SUPER::new(@_); + my ($db,$reldate,$mindate,$maxdate,$datetype,$ids) + = $self->_rearrange([qw(DB RELDATE MINDATE MAXDATE DATETYPE IDS)],@_); + $self->db($db || DEFAULT_DB); + $reldate && $self->reldate($reldate); + $mindate && $self->mindate($mindate); + $maxdate && $self->maxdate($maxdate); + $datetype ||= 'mdat'; + $datetype && $self->datetype($datetype); + $self; +} + +=head2 cookie + + Title : cookie + Usage : ($cookie,$querynum) = $db->cookie + Function: return the NCBI query cookie + Returns : list of (cookie,querynum) + Args : none + +NOTE: this information is used by Bio::DB::GenBank in +conjunction with efetch. + +=cut + +sub cookie { + my $self = shift; + if (@_) { + $self->{'_cookie'} = shift; + $self->{'_querynum'} = shift; + } + + else { + $self->_run_query; + @{$self}{qw(_cookie _querynum)}; + } +} + +=head2 _request_parameters + + Title : _request_parameters + Usage : ($method,$base,@params = $db->_request_parameters + Function: return information needed to construct the request + Returns : list of method, url base and key=>value pairs + Args : none + +=cut + +sub _request_parameters { + my $self = shift; + my ($method,$base); + my @params = map {eval("\$self->$_") ? ($_ => eval("\$self->$_")) : () } @ATTRIBUTES; + push @params,('usehistory'=>'y','tool'=>'bioperl'); + $method = 'get'; + $base = ESEARCH; + push @params,('term' => $self->query); + push @params,('retmax' => $self->{'_count'} || MAXENTRY); + ($method,$base,@params); +} + + +=head2 count + + Title : count + Usage : $count = $db->count; + Function: return count of number of entries retrieved by query + Returns : integer + Args : none + +Returns the number of entries that are matched by the query. + +=cut + +sub count { + my $self = shift; + if (@_) { + my $d = $self->{'_count'}; + $self->{'_count'} = shift; + return $d; + } + else { + $self->_run_query; + return $self->{'_count'}; + } +} + +=head2 ids + + Title : ids + Usage : @ids = $db->ids([@ids]) + Function: get/set matching ids + Returns : array of sequence ids + Args : (optional) array ref with new set of ids + +=cut + +=head2 query + + Title : query + Usage : $query = $db->query([$query]) + Function: get/set query string + Returns : string + Args : (optional) new query string + +=cut + +=head2 _parse_response + + Title : _parse_response + Usage : $db->_parse_response($content) + Function: parse out response + Returns : empty + Args : none + Throws : 'unparseable output exception' + +=cut + +sub _parse_response { + my $self = shift; + my $content = shift; + if (my ($warning) = $content =~ m!(.+)!s) { + warn "Warning(s) from GenBank: $warning\n"; + } + if (my ($error) = $content =~ /([^<]+)/) { + $self->throw("Error from Genbank: $error"); + } + + my ($count) = $content =~ /(\d+)/; + my ($max) = $content =~ /(\d+)/; + my $truncated = $count > $max; + $self->count($count); + if (!$truncated) { + my @ids = $content =~ /(\d+)/g; + $self->ids(\@ids); + } + $self->_truncated($truncated); + my ($cookie) = $content =~ m!(\S+)!; + my ($querykey) = $content =~ m!(\d+)!; + $self->cookie(uri_unescape($cookie),$querykey); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Query/WebQuery.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Query/WebQuery.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,353 @@ +# $Id: WebQuery.pm,v 1.5 2002/12/05 13:46:32 heikki Exp $ +# +# BioPerl module for Bio::DB::WebQuery.pm +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::DB::Query::WebQuery - Helper class for web-based sequence queryies + +=head1 SYNOPSIS + +Do not use this class directly. See Bio::DB::QueryI and one of the +implementor classes (such as Bio::DB::GenBankQuery) for information. + + +=head1 DESCRIPTION + +Do not use this class directly. See Bio::DB::QueryI and one of the +implementor classes (such as Bio::DB::GenBankQuery) for information. + +Those writing subclasses must define _get_params() and +_parse_response(), and possibly override _request_method(). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::Query::WebQuery; +use strict; +use URI; +use LWP::UserAgent; +use HTTP::Request::Common; +use Bio::Root::Root; +use Bio::DB::QueryI; + +use vars qw(@ISA $VERSION); + +@ISA = qw(Bio::Root::Root Bio::DB::QueryI); +$VERSION = '0.1'; + +=head2 new + + Title : new + Usage : $db = Bio::DB::WebQuery->new(@args) + Function: create new query object + Returns : new query object + Args : -db database (e.g. 'protein') + -ids array ref of ids (overrides query) + -verbose turn on verbose debugging + +This method creates a new query object. Typically you will specify a +-db and a -query argument. The value of -query is a database-specific +string. + +If you provide an array reference of IDs in -ids, the query will be +ignored and the list of IDs will be used when the query is passed to +the database. + +=cut + +# Borrowed shamelessly from WebDBSeqI. Some of this code should be +# refactored. +sub new { + my $class = shift; + my $self = $class->SUPER::new(@_); + + my ($query,$ids,$verbose) = $self->_rearrange(['QUERY','IDS','VERBOSE'],@_); + $self->throw('must provide one of the the -query or -ids arguments') + unless defined($query) || defined($ids); + $query ||= join ',',ref($ids) ? @$ids : $ids; + $query && $self->query($query); + $verbose && $self->verbose($verbose); + + my $ua = new LWP::UserAgent; + $ua->agent(ref($self) ."/$VERSION"); + $self->ua($ua); + $self->{'_authentication'} = []; + $self; +} + +=head2 ua + + Title : ua + Usage : my $ua = $self->ua or + $self->ua($ua) + Function: Get/Set a LWP::UserAgent for use + Returns : reference to LWP::UserAgent Object + Args : $ua - must be a LWP::UserAgent + +=cut + +sub ua { + my ($self, $ua) = @_; + my $d = $self->{'_ua'}; + if( defined $ua && $ua->isa("LWP::UserAgent") ) { + $self->{'_ua'} = $ua; + } + $d; +} + +=head2 proxy + + Title : proxy + Usage : $httpproxy = $db->proxy('http') or + $db->proxy(['http','ftp'], 'http://myproxy' ) + Function: Get/Set a proxy for use of proxy + Returns : a string indicating the proxy + Args : $protocol : an array ref of the protocol(s) to set/get + $proxyurl : url of the proxy to use for the specified protocol + $username : username (if proxy requires authentication) + $password : password (if proxy requires authentication) + +=cut + +sub proxy { + my ($self,$protocol,$proxy,$username,$password) = @_; + return undef if ( !defined $self->ua || !defined $protocol + || !defined $proxy ); + $self->authentication($username, $password) + if ($username && $password); + return $self->ua->proxy($protocol,$proxy); +} + +=head2 authentication + + Title : authentication + Usage : $db->authentication($user,$pass) + Function: Get/Set authentication credentials + Returns : Array of user/pass + Args : Array or user/pass + + +=cut + +sub authentication{ + my ($self,$u,$p) = @_; + + if( defined $u && defined $p ) { + $self->{'_authentication'} = [ $u,$p]; + } + return @{$self->{'_authentication'}}; +} + +=head2 ids + + Title : ids + Usage : @ids = $db->ids([@ids]) + Function: get/set matching ids + Returns : array of sequence ids + Args : (optional) array ref with new set of ids + +=cut + +sub ids { + my $self = shift; + if (@_) { + my $d = $self->{'_ids'}; + my $arg = shift; + $self->{'_ids'} = ref $arg ? $arg : [$arg]; + return $d ? @$d : (); + } else { + $self->_fetch_ids; + return @{$self->{'_ids'}}; + } +} + +=head2 query + + Title : query + Usage : $query = $db->query([$query]) + Function: get/set query string + Returns : string + Args : (optional) new query string + +=cut + +sub query { + my $self = shift; + my $d = $self->{'_query'}; + $self->{'_query'} = shift if @_; + $d; +} + +=head2 _fetch_ids + + Title : _fetch_ids + Usage : @ids = $db->_fetch_ids + Function: run query, get ids + Returns : array of sequence ids + Args : none + +=cut + +sub _fetch_ids { + my $self = shift; + $self->_run_query; + $self->_run_query(1) if $self->_truncated; + $self->throw('Id list has been truncated even after maxids requested') + if $self->_truncated; + return @{$self->{'_ids'}} if $self->{'_ids'}; +} + +=head2 _run_query + + Title : _run_query + Usage : $success = $db->_run_query + Function: run query, parse results + Returns : true if successful + Args : none + +=cut + +sub _run_query { + my $self = shift; + my $force = shift; + + # allow the query to be run one extra time if truncated + return $self->{'_ran_query'} if $self->{'_ran_query'}++ && !$force; + + my $request = $self->_get_request; + $self->debug("request is ".$request->url) if $self->verbose; + my $response = $self->ua->request($request); + return unless $response->is_success; + $self->debug("response is ".$response->content) if $self->verbose; + $self->_parse_response($response->content); + 1; +} + +=head2 _truncated + + Title : _truncated + Usage : $flag = $db->_truncated([$newflag]) + Function: get/set truncation flag + Returns : boolean + Args : new flag + +Some databases will truncate output unless explicitly asked +not to. This flag allows a "two probe" attempt. + +=cut + +sub _truncated { + my $self = shift; + my $d = $self->{'_truncated'}; + $self->{'_truncated'} = shift if @_; + $d; +} + +=head2 _get_request + + Title : _get_request + Usage : $http_request = $db->_get_request(@params) + Function: create an HTTP::Request with indicated parameters + Returns : HTTP::Request object + Args : CGI parameter list + +=cut + +sub _get_request { + my $self = shift; + my ($method,$base,@params) = $self->_request_parameters; + my $uri = URI->new($base); + my $request; + if ($method eq 'get') { + $uri->query_form(@params); + $request = GET $uri; + } else { + $request = POST $uri,\@params; + } + + $request->proxy_authorization_basic($self->authentication) + if $self->authentication; + $request; +} + +=head2 _parse_response + + Title : _parse_response + Usage : $db->_parse_response($content) + Function: parse out response + Returns : empty + Args : none + Throws : 'unparseable output exception' + +NOTE: This method must be implemented by subclass. + +=cut + +sub _parse_response { + my $self = shift; + my $content = shift; + $self->throw_not_implemented; +} + +=head2 _request_parameters + + Title : _request_parameters + Usage : ($method,$base,@params = $db->_request_parameters + Function: return information needed to construct the request + Returns : list of method, url base and key=>value pairs + Args : none + +NOTE: This method must be implemented by subclass. + +=cut + +sub _request_parameters { + my $self = shift; + $self->throw_not_implemented; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/QueryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/QueryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,158 @@ +# $Id: QueryI.pm,v 1.1 2002/11/20 08:39:03 lstein Exp $ +# +# BioPerl module for Bio::DB::QueryI.pm +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::DB::QueryI - Object Interface to queryable sequence databases + +=head1 SYNOPSIS + + # using Bio::DB::Query::GenBank as an example + my $query_string = 'Oryza[Organism] AND EST[Keyword]'; + my $query = Bio::DB::Query::GenBank->new(-db=>'nucleotide', + -query=>$query_string); + my $count = $query->count; + my @ids = $query->ids; + + # get a genbank database handle + $gb = new Bio::DB::GenBank; + my $stream = $db->get_Stream_by_query($query); + while (my $seq = $stream->next_seq) { + ... + } + + # initialize the list yourself + my $query = Bio::DB::Query::GenBank->new(-ids=>['X1012','CA12345']); + +=head1 DESCRIPTION + +This interface provides facilities for managing sequence queries such +as those offered by Entrez. A query object is created by calling +new() with a database-specific argument list. From the query object +you can either obtain the list of IDs returned by the query, or a +count of entries that would be returned. You can pass the query +object to a Bio::DB::RandomAccessI object to return the entries +themselves as a list or a stream. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::QueryI; +use strict; +use Bio::Root::RootI; + +use vars qw(@ISA $VERSION); + +@ISA = qw(Bio::Root::RootI); +$VERSION = '0.1'; + +=head2 new + + Title : new + Usage : $db = Bio::DB::QueryI->new(@args); + Function: constructor + Returns : QueryI object + Args : -query a query string + -ids a list of ids as an arrayref + +Create new QueryI object. You may initialize with either a query +string or with a list of ids. If both ids and a query are provided, +the former takes precedence. + +Subclasses may recognize additional arguments. + +=cut + +=head2 count + + Title : count + Usage : $count = $db->count; + Function: return count of number of entries retrieved by query + Returns : integer + Args : none + +Returns the number of entries that are matched by the query. + +=cut + +sub count { + my $self = shift; + my @ids = $self->ids; + scalar @ids; +} + +=head2 ids + + Title : ids + Usage : @ids = $db->ids([@ids]) + Function: get/set matching ids + Returns : array of sequence ids + Args : (optional) array ref with new set of ids + +=cut + +sub ids { + my $self = shift; + $self->throw_not_implemented; +} + +=head2 query + + Title : query + Usage : $query = $db->query([$query]) + Function: get/set query string + Returns : string + Args : (optional) new query string + +=cut + +sub query { + my $self = shift; + $self->throw_not_implemented; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/RandomAccessI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/RandomAccessI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,125 @@ +# POD documentation - main docs before the code +# +# $Id: RandomAccessI.pm,v 1.12 2002/10/22 07:38:29 lapp Exp $ +# + +=head1 NAME + +Bio::DB::RandomAccessI - Abstract interface for a sequence database + +=head1 SYNOPSIS + + # + # get a database object somehow using a concrete class + # + + $seq = $db->get_Seq_by_id('ROA1_HUMAN'); + + # + # $seq is a Bio::Seq object + # + +=head1 DESCRIPTION + +This is a pure interface class - in other words, all this does is define +methods which other (concrete) classes will actually implement. + +The Bio::DB::RandomAccessI class defines what methods a generic database class +should have. At the moment it is just the ability to make Bio::Seq objects +from a name (id) or a accession number. + +=head1 CONTACT + +Ewan Birney originally wrote this class. + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::DB::RandomAccessI; + +use vars qw(@ISA); +use strict; + +use Bio::Root::RootI; +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object or undef if not found + Args : the id (as a string) of a sequence, + +=cut + +sub get_Seq_by_id{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + $seq = $db->get_Seq_by_acc(Locus => 'X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object or undef if not found + Args : accession number (as a string), or a two + element list consisting of namespace=>accession + Throws : "more than one sequences correspond to this accession" + if the accession maps to multiple primary ids and + method is called in a scalar context + +NOTE: The two-element form allows you to choose the namespace for the +accession number. + +=cut + +sub get_Seq_by_acc{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=cut + + +sub get_Seq_by_version{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + + +## End of Package + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/RefSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/RefSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,142 @@ +# +# $Id: RefSeq.pm,v 1.5 2002/10/22 07:38:29 lapp Exp $ +# +# BioPerl module for Bio::DB::EMBL +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::RefSeq - Database object interface for RefSeq retrieval + +=head1 SYNOPSIS + use Bio::DB::RefSeq; + + $db = new Bio::DB::RefSeq; + + # most of the time RefSeq_ID eq RefSeq acc + $seq = $db->get_Seq_by_id('NM_006732'); # RefSeq ID + print "accession is ", $seq->accession_number, "\n"; + + # or changeing to accession number and Fasta format ... + $db->request_format('fasta'); + $seq = $db->get_Seq_by_acc('NM_006732'); # RefSeq ACC + print "seq is ", $seq->seq, "\n"; + + # especially when using versions, you better be prepared + # in not getting what what want + eval { + $seq = $db->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION + }; + print "accesion is ", $seq->accession_number, "\n" unless $@; + + # or ... best when downloading very large files, prevents + # keeping all of the file in memory + + # also don't want features, just sequence so let's save bandwith + # and request Fasta sequence + $db = new Bio::DB::RefSeq(-retrievaltype => 'tempfile' , + -format => 'fasta'); + my $seqio = $db->get_Stream_by_batch(['NM_006732', 'NM_005252'] ); + while( my $seq = $seqio->next_seq ) { + print "seqid is ", $seq->id, "\n"; + } + +=head1 DESCRIPTION + +Allows the dynamic retrieval of sequence objects L from the +RefSeq database using the dbfetch script at EBI: +LEwww.ebi.ac.ukEcgi-binEdbfetch>. + +In order to make changes transparent we have host type (currently only +ebi) and location (defaults to ebi) separated out. This allows later +additions of more servers in different geographical locations. + +The functionality of this module is inherited from L +which implements L. + +This module retrieves entries from EBI although it +retrives database entries produced at NCBI. When read into bioperl +objects, the parser for GenBank format it used. RefSeq is a +NONSTANDARD GenBank file so be ready for surprises. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email Heikki Lehvaslaiho EHeikki@ebi.ac.ukE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::RefSeq; +use strict; +use vars qw(@ISA $MODVERSION %HOSTS %FORMATMAP $DEFAULTFORMAT); + +$MODVERSION = '0.1'; +use Bio::DB::DBFetch; + +@ISA = qw(Bio::DB::DBFetch); + +BEGIN { + # you can add your own here theoretically. + %HOSTS = ( + 'dbfetch' => { + baseurl => 'http://%s/cgi-bin/dbfetch?db=refseq&style=raw', + hosts => { + 'ebi' => 'www.ebi.ac.uk' + } + } + ); + %FORMATMAP = ( 'embl' => 'embl', + 'genbank' => 'genbank', + 'fasta' => 'fasta' + ); + $DEFAULTFORMAT = 'genbank'; +} + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + + $self->{ '_hosts' } = {}; + $self->{ '_formatmap' } = {}; + + $self->hosts(\%HOSTS); + $self->formatmap(\%FORMATMAP); + $self->{'_default_format'} = $DEFAULTFORMAT; + + return $self; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Registry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Registry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ +# POD documentation - main docs before the code + +# $Id: Registry.pm,v 1.12.2.2 2003/06/26 11:07:10 heikki Exp $ + + +=head1 NAME + +Bio::DB::Registry - Access to the Open Bio Database Access registry scheme + +=head1 SYNOPSIS + + use Bio::DB::Registry(); + + $registry = new Bio::DB::Registry(); + + @available_services = $registry->services; + + $db = $registry->get_database('embl'); + # $db is a Bio::DB::SeqI implementing class + + $seq = $db->get_Seq_by_acc("J02231"); + +=head1 DESCRIPTION + +This module provides access to the Open Bio Database Access scheme, +which provides a cross language and cross platform specification of how +to get to databases. + +If the user or system administrator has not installed the default init +file, seqdatabase.ini, in /etc/bioinformatics or ${HOME}/.bioinformatics +then creating the first Registry object copies the default settings from +the net. The Registry object will attempt to store these settings in +${HOME}/.bioinformatics/seqdatabase.ini. + +Users can specify one or more custom locations for the init file by +setting $OBDA_SEARCH_PATH to those directories, where multiple +directories should be separated by ';'. + +=head1 CONTACT + +Ewan Birney originally wrote this class. + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::DB::Registry; + +use vars qw(@ISA $VERSION $OBDA_SPEC_VERSION $OBDA_SEARCH_PATH); +use strict; + +use Bio::Root::Root; +@ISA = qw(Bio::Root::Root); +use Bio::DB::Failover; +use Bio::Root::HTTPget; + +$VERSION = '1.2'; +BEGIN { + $OBDA_SPEC_VERSION = 1.0; + if (defined $ENV{OBDA_SEARCH_PATH}) { + $OBDA_SEARCH_PATH = $ENV{OBDA_SEARCH_PATH} || ''; + + } +} + +my %implement = ( + 'biocorba' => 'Bio::CorbaClient::SeqDB', + 'flat' => 'Bio::DB::Flat', + 'biosql' => 'Bio::DB::BioSQL::BioDatabaseAdaptor', + 'biofetch' => 'Bio::DB::BioFetch' + ); + +my $fallbackRegistryURL = 'http://www.open-bio.org/registry/seqdatabase.ini'; + + +sub new { + my ($class,@args) = shift; + my $self = $class->SUPER::new(@args); + + # open files in order + $self->{'_dbs'} = {}; + $self->_load_registry(); + return $self; +} + + +sub _load_registry { + my ($self) = @_; + + my $home = (getpwuid($>))[7]; + my $f; + + if ( $OBDA_SEARCH_PATH ) { + foreach ( split /;/,$OBDA_SEARCH_PATH ) { + next unless -e $_; + open(F,"$OBDA_SEARCH_PATH/seqdatabase.ini"); + $f = \*F; + last; + } + } + elsif( -e "$home/.bioinformatics/seqdatabase.ini" ) { + open(F,"$home/.bioinformatics/seqdatabase.ini"); + $f = \*F; + } elsif ( -e "/etc/bioinformatics/seqdatabase.ini" ) { + open(F,"/etc/bioinformatics/seqdatabase.ini"); + $f = \*F; + } else { + # waiting for information + $self->warn("No seqdatabase.ini file found in ~/.bioinformatics/ \nor in /etc/bioinformatics/.\nor in directory specified by $OBDA_SEARCH_PATH". + "Using web to get database registry from \n$fallbackRegistryURL"); + + # Last gasp. Try to use HTTPget module to retrieve the registry from + # the web... + + $f = Bio::Root::HTTPget::getFH($fallbackRegistryURL); + + # store the default registry file + mkdir "$home/.bioinformatics" unless -e "$home/.bioinformatics"; + open(F,">$home/.bioinformatics/seqdatabase.ini"); + print F while (<$f>); + close F; + + $self->warn("Stored the default registry configuration into:\n". + " $home/.bioinformatics/seqdatabase.ini"); + + open(F,"$home/.bioinformatics/seqdatabase.ini"); + $f = \*F; + + } + + while( <$f> ) { + /^VERSION=([\d\.]+)/; + $self->throw("Do not know about this version [$1] > $OBDA_SPEC_VERSION") + if $1 > $OBDA_SPEC_VERSION or !$1; + last; + } + + while( <$f> ) { + if( /^#/ ) { + next; + } + if( /^\s/ ) { + next; + } + + if( /\[(\w+)\]/ ) { + my $db = $1; + my $hash = {}; + while( <$f> ) { + chomp(); + /^#/ && next; + /^$/ && last; + my ($tag,$value) = split('=',$_); + $value =~ s/\s//g; + $tag =~ s/\s//g; + $hash->{"\L$tag"} = lc $value; + } + + if( !exists $self->{'_dbs'}->{$db} ) { + my $failover = Bio::DB::Failover->new(); + $self->{'_dbs'}->{$db}=$failover; + } + my $class; + if (defined $implement{$hash->{'protocol'}}) { + $class = $implement{$hash->{'protocol'}}; + } + else { + $self->warn("Registry does not support protocol ".$hash->{'protocol'}); + next; + } + eval "require $class"; + + if ($@) { + $self->verbose && $self->warn("Couldn't load $class"); + next; + } + + else { + eval { + my $randi = $class->new_from_registry(%$hash); + $self->{'_dbs'}->{$db}->add_database($randi); }; + if ($@) { + $self->warn("Couldn't call new_from_registry on [$class]\n$@"); + } + } + next; # back to main loop + } + $self->warn("Uninterpretable line in registry, $_"); + } +} + +=head2 get_database + + Title : get_database + Usage : my $db = $registry->get_database($dbname); + Function: Retrieve a Database object which implements Bio::DB::SeqI interface + Returns : Bio::DB::SeqI object + Args : string describing the name of the database + +=cut + +sub get_database { + my ($self,$dbname) = @_; + + $dbname = lc $dbname; + if( !defined $dbname ) { + $self->warn("must get_database with a database name"); + return undef; + } + if( !exists $self->{'_dbs'}->{$dbname} ) { + $self->warn("No database in with $dbname in registry"); + return undef; + } + return $self->{'_dbs'}->{$dbname}; +} + +=head2 services + + Title : services + Usage : my @available = $registry->services(); + Function: returns list of possible services + Returns : list of strings + Args : none + + +=cut + +sub services{ + my ($self) = @_; + return () unless ( defined $self->{'_dbs'} && + ref( $self->{'_dbs'} ) =~ /HASH/i); + return keys %{$self->{'_dbs'}}; +} + + +## End of Package + +1; + +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/SeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/SeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,179 @@ + +# +# $Id: SeqI.pm,v 1.7 2002/10/22 07:38:29 lapp Exp $ +# +# BioPerl module for Bio::DB::SeqI.pm +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::SeqI - Abstract Interface for Sequence databases + +=head1 SYNOPSIS + + # get a Bio::DB::SeqI somehow + + $seq = $seqdb->get_Seq_by_id('some-id'); + $seq = $seqdb->get_Seq_by_acc('some-accession-number'); + + @ids = $seqdb->get_all_ids(); + $stream = $seqdb->get_PrimarySeq_stream(); + while((my $seq = $stream->next_seq()) { + # $seq is a PrimarySeqI compliant object + } + + +=head1 DESCRIPTION + +Abstract interface for a sequence database + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::DB::SeqI; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::DB::RandomAccessI; +@ISA = qw(Bio::DB::RandomAccessI); + +=head1 Methods inherieted from Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +=head1 Methods [that were] specific for Bio::DB::SeqI + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : $stream = get_PrimarySeq_stream + Function: Makes a Bio::SeqIO compliant object + which provides a single method, next_seq + Returns : Bio::SeqIO + Args : none + +=cut + +sub get_PrimarySeq_stream{ + my ($self,@args) = @_; + + $self->throw("Object did not provide a PrimarySeq stream object"); +} + +=head2 get_all_primary_ids + + Title : get_all_ids + Usage : @ids = $seqdb->get_all_primary_ids() + Function: gives an array of all the primary_ids of the + sequence objects in the database. These + maybe ids (display style) or accession numbers + or something else completely different - they + *are not* meaningful outside of this database + implementation. + Example : + Returns : an array of strings + Args : none + + +=cut + +sub get_all_primary_ids{ + my ($self,@args) = @_; + $self->throw("Object did not provide a get_all_ids method"); +} + + +=head2 get_Seq_by_primary_id + + Title : get_Seq_by_primary_id + Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string); + Function: Gets a Bio::Seq object by the primary id. The primary + id in these cases has to come from $db->get_all_primary_ids. + There is no other way to get (or guess) the primary_ids + in a database. + + The other possibility is to get Bio::PrimarySeqI objects + via the get_PrimarySeq_stream and the primary_id field + on these objects are specified as the ids to use here. + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +sub get_Seq_by_primary_id { + my ($self,@args) = @_; + + $self->throw("Abstract database call of get_Seq_by_primary_id. Your database has not implemented this method!"); + +} + +1; + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/SwissProt.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/SwissProt.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +# +# $Id: SwissProt.pm,v 1.19 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module for Bio::DB::SwissProt +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code +# Reworked to use Bio::DB::WebDBSeqI 2000-12-11 + +=head1 NAME + +Bio::DB::SwissProt - Database object interface to SwissProt retrieval + +=head1 SYNOPSIS + + use Bio::DB::SwissProt; + + $sp = new Bio::DB::SwissProt; + + $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID + # <4-letter-identifier>_ + # or ... + $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC + # [OPQ]xxxxx + + + # In fact in this implementation + # these methods call the same webscript so you can use + # then interchangeably + + # choose a different server to query + $sp = new Bio::DB::SwissProt('-servertype' => 'expasy', + '-hostlocation' => 'us'); + + $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID + +=head1 DESCRIPTION + +SwissProt is a curated database of proteins managed by the Swiss +Bioinformatics Institute. This is in contrast to EMBL/GenBank/DDBJ +which are archives of protein information. Additional tools for +parsing and manipulating swissprot files can be found at +ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/. + +Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the +SwissProt database via an expasy retrieval. Perhaps through SRS +later. + +In order to make changes transparent we have host type (currently only +expasy) and location (default to switzerland) separated out. This +allows the user to pick the closest expasy mirror for running their +queries. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email Jason Stajich Ejason@bioperl.org E + +Thanks go to Alexandre Gattiker Egattiker@isb-sib.chE of Swiss +Institute of Bioinformatics for helping point us in the direction of +the correct expasy scripts and for swissknife references. + +Also thanks to Heikki Lehvaslaiho Eheikki@ebi.ac.ukE for help with +adding EBI swall server. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::SwissProt; +use strict; +use vars qw(@ISA $MODVERSION %HOSTS $DEFAULTFORMAT $DEFAULTSERVERTYPE); + +$MODVERSION = '0.8.1'; +use HTTP::Request::Common; +use Bio::DB::WebDBSeqI; + +@ISA = qw(Bio::DB::WebDBSeqI); + +# global vars +$DEFAULTSERVERTYPE = 'ebi'; +$DEFAULTFORMAT = 'swissprot'; + +# you can add your own here theoretically. +%HOSTS = ( + 'expasy' => { + 'default' => 'us', + 'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl', + 'hosts' => + { + 'switzerland' => 'ch.expasy.org', + 'canada' => 'ca.expasy.org', + 'china' => 'cn.expasy.org', + 'taiwan' => 'tw.expasy.org', + 'australia' => 'au.expasy.org', + 'korea' => 'kr.expasy.org', + 'us' => 'us.expasy.org', + }, + # ick, CGI variables + 'jointype' => ' ', + 'idvar' => 'list', + 'basevars' => [ ], + }, + 'ebi' => { + 'default' => 'uk', + 'baseurl' => 'http://%s/cgi-bin/dbfetch', + 'hosts' => { + 'uk' => 'www.ebi.ac.uk', + }, + 'jointype' => ',', + 'idvar' => 'id', + 'basevars' => [ 'db' => 'swall', + 'style' => 'raw' ], + } + ); + +# new modules should be a little more lightweight and +# should use Bio::Root::Root +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($format, $hostlocation,$servertype) = + $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)], + @args); + + if( $format && $format !~ /(swiss)|(fasta)/i ) { + $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported"); + $format = $self->default_format; + } + $servertype = $DEFAULTSERVERTYPE unless $servertype; + $servertype = lc $servertype; + $self->servertype($servertype); + if ( $hostlocation ) { + $self->hostlocation(lc $hostlocation); + } + + $self->request_format($format); # let's always override the format, as it must be swiss or fasta + return $self; +} + +=head2 Routines from Bio::DB::RandomAccessI + +=cut + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +=cut + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + +=cut + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + +=cut + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from SwissProt 'en masse', rather than one + at a time. This is implemented the same way as get_Stream_by_id, + but is provided here in keeping with access methods of NCBI + modules. + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +=cut + +sub get_Stream_by_batch { + my ($self, $ids) = @_; + return $self->get_Stream_by_id( $ids); +} + +=head2 Implemented Routines from Bio::DB::WebDBSeqI interface + +=cut + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: returns a HTTP::Request object + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self, @qualifiers) = @_; + my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], + @qualifiers); + + if( !defined $uids ) { + $self->throw("Must specify a value for uids to query"); + } + my ($f,undef) = $self->request_format($format); + + my %vars = ( + @{$HOSTS{$self->servertype}->{'basevars'}}, + ( 'format' => $f ) + ); + + my $url = $self->location_url; + + my $uid; + my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' '; + my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id'; + + if( ref($uids) =~ /ARRAY/i ) { + # HTTP::Request automagically converts the ' ' to %20 + $uid = join($jointype, @$uids); + } else { + $uid = $uids; + } + $vars{$idvar} = $uid; + + return POST $url, \%vars; +} + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +# don't need to do anything + +sub postprocess_data { + my ($self, %args) = @_; + return; +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + return $DEFAULTFORMAT; +} + +=head2 Bio::DB::SwissProt specific routines + +=cut + +=head2 servertype + + Title : servertype + Usage : my $servertype = $self->servertype + $self->servertype($servertype); + Function: Get/Set server type + Returns : string + Args : server type string [optional] + +=cut + +sub servertype { + my ($self, $servertype) = @_; + if( defined $servertype && $servertype ne '') { + $self->throw("You gave an invalid server type ($servertype)". + " - available types are ". + keys %HOSTS) unless( $HOSTS{$servertype} ); + $self->{'_servertype'} = $servertype; + $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'}; + + # make sure format is reset properly in that different + # servers have different syntaxes + my ($existingformat,$seqioformat) = $self->request_format; + $self->request_format($existingformat); + } + return $self->{'_servertype'} || $DEFAULTSERVERTYPE; +} + + +=head2 hostlocation + + Title : hostlocation + Usage : my $location = $self->hostlocation() + $self->hostlocation($location) + Function: Set/Get Hostlocation + Returns : string representing hostlocation + Args : string specifying hostlocation [optional] + +=cut + +sub hostlocation { + my ($self, $location ) = @_; + $location = lc $location; + my $servertype = $self->servertype; + $self->throw("Must have a valid servertype defined not $servertype") + unless defined $servertype; + my %hosts = %{$HOSTS{$servertype}->{'hosts'}}; + if( defined $location && $location ne '' ) { + if( ! $hosts{$location} ) { + $self->throw("Must specify a known host, not $location,". + " possible values (". + join(",", sort keys %hosts ). ")"); + } + $self->{'_hostlocation'} = $location; + } + return $self->{'_hostlocation'}; +} + +=head2 location_url + + Title : location + Usage : my $url = $self->location_url() + Function: Get host url + Returns : string representing url + Args : none + +=cut + +sub location_url { + my ($self) = @_; + my $servertype = $self->servertype(); + my $location = $self->hostlocation(); + + if( ! defined $location || !defined $servertype ) { + $self->throw("must have a valid hostlocation and servertype set before calling location_url"); + } + return sprintf($HOSTS{$servertype}->{'baseurl'}, + $HOSTS{$servertype}->{'hosts'}->{$location}); +} + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + if( defined $value ) { + if( $self->servertype =~ /expasy/ ) { + if( $value =~ /sprot/ || $value =~ /swiss/ ) { + $self->{'_format'} = [ 'sprot', 'swiss']; + } elsif( $value =~ /^fa/ ) { + $self->{'_format'} = [ 'fasta', 'fasta']; + } else { + $self->warn("Unrecognized format $value requested"); + $self->{'_format'} = [ 'fasta', 'fasta']; + } + } elsif( $self->servertype =~ /ebi/ ) { + if( $value =~ /sprot/ || $value =~ /swiss/ ) { + $self->{'_format'} = [ 'swissprot', 'swiss' ]; + } elsif( $value =~ /^fa/ ) { + $self->{'_format'} = [ 'fasta', 'fasta']; + } else { + $self->warn("Unrecognized format $value requested"); + $self->{'_format'} = [ 'swissprot', 'swiss']; + } + } + } + return @{$self->{'_format'}}; +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/Universal.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Universal.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,247 @@ + +# +# BioPerl module for Bio::DB::Universal +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Universal - Artificial database that delegates to specific databases + +=head1 SYNOPSIS + + $uni = Bio::DB::Universal->new(); + + # by default connects to web databases. We can also + # substitute local databases + + $embl = Bio::Index::EMBL->new( -filename => '/some/index/filename/locally/stored'); + $uni->use_database('embl',$embl); + + # treat it like a normal database. Recognises strings + # like gb|XXXXXX and embl:YYYYYY + + $seq1 = $uni->get_Seq_by_id("embl:HSHNRNPA"); + $seq2 = $uni->get_Seq_by_acc("gb|A000012"); + + # with no separator, tries to guess database. In this case the + # _ is considered to be indicative of swissprot + $seq3 = $uni->get_Seq_by_id('ROA1_HUMAN'); + +=head1 DESCRIPTION + +Artificial database that delegates to specific databases, with a +"smart" (well, smartish) guessing routine for what the ids. No doubt +the smart routine can be made smarter. + +The hope is that you can make this database and just throw ids at it - +for most easy cases it will sort you out. Personally, I would be +making sure I knew where each id came from and putting it into its own +database first - but this is a quick and dirty solution. + +By default this connects to web orientated databases, with all the +reliability and network bandwidth costs this implies. However you can +subsistute your own local databases - they could be Bio::Index +databases (DBM file and flat file) or bioperl-db based (MySQL based) +or biocorba-based (whatever you like behind the corba interface). + +Internally the tags for the databases are + + genbank - ncbi dna database + embl - ebi's dna database (these two share accession number space) + swiss - swissprot + sptrembl (EBI's protein database) + +We should extend this for RefSeq and other sequence databases which +are out there... ;) + +Inspired by Lincoln Stein, written by Ewan Birney. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::DB::Universal; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::DB::RandomAccessI; + +use Bio::DB::GenBank; +use Bio::DB::SwissProt; +use Bio::DB::EMBL; + + +@ISA = qw(Bio::DB::RandomAccessI Bio::Root::Root); +# new() can be inherited from Bio::Root::Root + +sub new { + my ($class) = @_; + + my $self = {}; + bless $self,$class; + + $self->{'db_hash'} = {}; + + # default databases + + $self->use_database('embl',Bio::DB::EMBL->new); + $self->use_database('genbank',Bio::DB::GenBank->new); + $self->use_database('swiss',Bio::DB::GenBank->new); + + return $self; +} + + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub get_Seq_by_id{ + my ($self,$str) = @_; + + my ($tag,$id) = $self->guess_id($str); + + return $self->{'db_hash'}->{$tag}->get_Seq_by_id($id); +} + + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub get_Seq_by_acc { + my ($self,$str) = @_; + + my ($tag,$id) = $self->guess_id($str); + + return $self->{'db_hash'}->{$tag}->get_Seq_by_acc($id); +} + + + +=head2 guess_id + + Title : guess_id + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub guess_id{ + my ($self,$str) = @_; + + if( $str =~ /(\S+)[:|\/;](\w+)/ ) { + my $tag; + my $db = $1; + my $id = $2; + if( $db =~ /gb/i || $db =~ /genbank/i || $db =~ /ncbi/i ) { + $tag = 'genbank'; + } elsif ( $db =~ /embl/i || $db =~ /emblbank/ || $db =~ /^em/i ) { + $tag = 'embl'; + } elsif ( $db =~ /swiss/i || $db =~ /^sw/i || $db =~ /sptr/ ) { + $tag = 'swiss'; + } else { + # throw for the moment + $self->throw("Could not guess database type $db from $str"); + } + return ($tag,$id); + + } else { + my $tag; + # auto-guess from just the id + if( $str =~ /_/ ) { + $tag = 'swiss'; + } elsif ( $str =~ /^[QPR]\w+\d$/ ) { + $tag = 'swiss'; + } elsif ( $str =~ /[A-Z]\d+/ ) { + $tag = 'genbank'; + } else { + # default genbank... + $tag = 'genbank'; + } + return ($tag,$str); + } + + +} + + +=head2 use_database + + Title : use_database + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub use_database{ + my ($self,$name,$database) = @_; + + $self->{'db_hash'}->{$name} = $database; +} + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/UpdateableSeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/UpdateableSeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,232 @@ +# +# $Id: UpdateableSeqI.pm,v 1.6 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module for Bio::DB::UpdateableSeqI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# +# _history +# June 18, 2000 - module begun +# +# POD Doc - main docs before code + +=head1 NAME + +Bio::DB::UpdateableSeqI - An interface for writing to a database of sequences. + +=head1 SYNOPSIS + + # get a Bio::DB::UpdateableSeqI somehow + eval { + my ( @updatedseqs, @newseqs, @deadseqs); + my $seq = $db->get_Seq_by_id('ROA1_HUMAN'); + $seq->desc('a new description'); + + push @updatedseqs, $seq; + + $db->write_seq(\@updatedseqs, \@newseqs, \@deadseqs); + }; + if( $@ ) { + print STDERR "an error when trying to write seq : $@\n"; + } + +=head1 DESCRIPTION + +This module seeks to provide a simple method for pushing sequence changes +back to a Sequence Database - which can be an SQL compliant database, a file +based database, AceDB, etc. + +=head1 AUTHOR + +Jason Stajich Ejason@bioperl.orgE + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +#Lets start some code + +package Bio::DB::UpdateableSeqI; + +use strict; + +use vars qw( @ISA ); + +use Bio::DB::SeqI; + +@ISA = qw(Bio::DB::SeqI); + +=head2 write_seq + + Title : write_seq + Usage : write_seq(\@updatedseqs, \@addedseqs, \@deadseqs) + Function: updates sequences in first array, + adds sequences in the second array, + and removes sequences in the third array. + Example : + Returns : + Args : arrays of sequence objects that must be obtained from + Bio::DB::UpdateableSeqI. + +=cut + +sub write_seq { + my ($self) = @_; + + $self->throw("Abstract database call of write_seq. Your database has not implemented this method!"); + +} + +=head2 _add_seq + + Title : _add_seq + Usage : _add_seq($seq) + Function: Adds a new sequence + Example : + Returns : will throw an exception if + sequences accession number already exists + Args : a new seq object - should have an accession number + +=cut + +sub _add_seq { + my ($self ) = @_; + + $self->throw("Abstract database call of _add_seq. Your database has not implemented this method!"); + +} + +=head2 _remove_seq + + Title : _remove_seq + Usage : _remove_seq($seq) + Function: Removes an existing sequence + Example : + Returns : will throw an exception if + sequence does not exists for the primary_id + Args : a seq object that was retrieved from Bio::DB::UpdateableSeqI + +=cut + +sub _remove_seq { + my ($self) = @_; + + $self->throw("Abstract database call of _remove_seq. Your database has not implemented this method!"); + +} + +=head2 _update_seq + + Title : _update_seq + Usage : _update_seq($seq) + Function: Updates a sequence + Example : + Returns : will throw an exception if + sequence is out of sync from expected val. + Args : a seq object that was retrieved from Bio::DB::UpdateableSeqI + +=cut + +sub _update_seq { + my ($self) = @_; + + $self->throw("Abstract database call of _update_seq. Your database has not implemented this method!"); + +} + + +=head1 Methods inherieted from Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +=head1 Methods inheirited from Bio::DB::SeqI + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : $stream = get_PrimarySeq_stream + Function: Makes a Bio::DB::SeqStreamI compliant object + which provides a single method, next_primary_seq + Returns : Bio::DB::SeqStreamI + Args : none + + +=cut + +=head2 get_all_primary_ids + + Title : get_all_ids + Usage : @ids = $seqdb->get_all_primary_ids() + Function: gives an array of all the primary_ids of the + sequence objects in the database. These + maybe ids (display style) or accession numbers + or something else completely different - they + *are not* meaningful outside of this database + implementation. + Example : + Returns : an array of strings + Args : none + + +=cut + +=head2 get_Seq_by_primary_id + + Title : get_Seq_by_primary_id + Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string); + Function: Gets a Bio::Seq object by the primary id. The primary + id in these cases has to come from $db->get_all_primary_ids. + There is no other way to get (or guess) the primary_ids + in a database. + + The other possibility is to get Bio::PrimarySeqI objects + via the get_PrimarySeq_stream and the primary_id field + on these objects are specified as the ids to use here. + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +1; + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/WebDBSeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/WebDBSeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,790 @@ +# $Id: WebDBSeqI.pm,v 1.30.2.1 2003/06/12 09:29:38 heikki Exp $ +# +# BioPerl module for Bio::DB::WebDBSeqI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::DB::WebDBSeqI - Object Interface to generalize Web Databases + for retrieving sequences + +=head1 SYNOPSIS + + # get a WebDBSeqI object somehow + # assuming it is a nucleotide db + my $seq = $db->get_Seq_by_id('ROA1_HUMAN') + +=head1 DESCRIPTION + + + + +Provides core set of functionality for connecting to a web based +database for retriving sequences. + +Users wishing to add another Web Based Sequence Dabatase will need to +extend this class (see Bio::DB::SwissProt or Bio::DB::NCBIHelper for +examples) and implement the get_request method which returns a +HTTP::Request for the specified uids (accessions, ids, etc depending +on what query types the database accepts). + + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email E jason@bioperl.org E + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::WebDBSeqI; +use strict; +use vars qw(@ISA $MODVERSION %RETRIEVAL_TYPES $DEFAULT_RETRIEVAL_TYPE + $DEFAULTFORMAT $LAST_INVOCATION_TIME); + +use Bio::DB::RandomAccessI; +use Bio::SeqIO; +use Bio::Root::IO; +use LWP::UserAgent; +use HTTP::Request::Common; +use HTTP::Response; +use File::Spec; +use IO::String; +use Bio::Root::Root; + +@ISA = qw(Bio::DB::RandomAccessI); + +BEGIN { + $MODVERSION = '0.8'; + %RETRIEVAL_TYPES = ( 'io_string' => 1, + 'tempfile' => 1, + 'pipeline' => 1, + ); + $DEFAULT_RETRIEVAL_TYPE = 'pipeline'; + $DEFAULTFORMAT = 'fasta'; + $LAST_INVOCATION_TIME = 0; +} + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($baseaddress, $params, $ret_type, $format,$delay,$db) = + $self->_rearrange([qw(BASEADDRESS PARAMS RETRIEVALTYPE FORMAT DELAY DB)], + @args); + + $ret_type = $DEFAULT_RETRIEVAL_TYPE unless ( $ret_type); + $baseaddress && $self->url_base_address($baseaddress); + $params && $self->url_params($params); + $db && $self->db($db); + $ret_type && $self->retrieval_type($ret_type); + $delay = $self->delay_policy unless defined $delay; + $self->delay($delay); + + # insure we always have a default format set for retrieval + # even though this will be immedietly overwritten by most sub classes + $format = $self->default_format unless ( defined $format && + $format ne '' ); + + $self->request_format($format); + my $ua = new LWP::UserAgent; + $ua->agent(ref($self) ."/$MODVERSION"); + $self->ua($ua); + $self->{'_authentication'} = []; + return $self; +} + +# from Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +sub get_Seq_by_id { + my ($self,$seqid) = @_; + $self->_sleep; + my $seqio = $self->get_Stream_by_id([$seqid]); + $self->throw("id does not exist") if( !defined $seqio ) ; + my @seqs; + while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } + $self->throw("id does not exist") unless @seqs; + if( wantarray ) { return @seqs } else { return shift @seqs } +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + +=cut + +sub get_Seq_by_acc { + my ($self,$seqid) = @_; + $self->_sleep; + my $seqio = $self->get_Stream_by_acc($seqid); + $self->throw("acc does not exist") if( ! defined $seqio ); + my @seqs; + while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } + $self->throw("acc does not exist") unless @seqs; + if( wantarray ) { return @seqs } else { return shift @seqs } +} + + +=head2 get_Seq_by_gi + + Title : get_Seq_by_gi + Usage : $seq = $db->get_Seq_by_gi('405830'); + Function: Gets a Bio::Seq object by gi number + Returns : A Bio::Seq object + Args : gi number (as a string) + Throws : "gi does not exist" exception + +=cut + +sub get_Seq_by_gi { + my ($self,$seqid) = @_; + $self->_sleep; + my $seqio = $self->get_Stream_by_gi($seqid); + $self->throw("gi does not exist") if( !defined $seqio ); + my @seqs; + while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } + $self->throw("gi does not exist") unless @seqs; + if( wantarray ) { return @seqs } else { return shift @seqs } +} + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=cut + +sub get_Seq_by_version { + my ($self,$seqid) = @_; + $self->_sleep; + my $seqio = $self->get_Stream_by_version($seqid); + $self->throw("accession.version does not exist") if( !defined $seqio ); + my @seqs; + while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } + $self->throw("accession.version does not exist") unless @seqs; + if( wantarray ) { return @seqs } else { return shift @seqs } +} + +# implementing class must define these + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: returns a HTTP::Request object + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self) = @_; + my $msg = "Implementing class must define method get_request in class WebDBSeqI"; + $self->throw($msg); +} + +# class methods + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + + +=cut + +sub get_Stream_by_id { + my ($self, $ids) = @_; + my ($webfmt,$localfmt) = $self->request_format; + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single', + '-format' => $webfmt); +} + +*get_Stream_by_batch = sub { + my $self = shift; + $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead'); + $self->get_Stream_by_id(@_) +}; + + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +sub get_Stream_by_acc { + my ($self, $ids ) = @_; + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); +} + + +=head2 get_Stream_by_gi + + Title : get_Stream_by_gi + Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]); + Function: Gets a series of Seq objects by gi numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of gi numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +sub get_Stream_by_gi { + my ($self, $ids ) = @_; + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'gi'); +} + +=head2 get_Stream_by_version + + Title : get_Stream_by_version + Usage : $seq = $db->get_Stream_by_version([$version1, $version2]); + Function: Gets a series of Seq objects by accession.versions + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession.version strings for + the desired sequence entries + Note : For GenBank, this is implemeted in NCBIHelper + +=cut + +sub get_Stream_by_version { + my ($self, $ids ) = @_; +# $self->throw("Implementing class should define this method!"); + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'version'); # how it should work +} + +=head2 get_Stream_by_query + + Title : get_Stream_by_query + Usage : $stream = $db->get_Stream_by_query($query); + Function: Gets a series of Seq objects by way of a query string or oject + Returns : a Bio::SeqIO stream object + Args : $query : A string that uses the appropriate query language + for the database or a Bio::DB::QueryI object. It is suggested + that you create the Bio::DB::Query object first and interrogate + it for the entry count before you fetch a potentially large stream. + +=cut + +sub get_Stream_by_query { + my ($self, $query ) = @_; + return $self->get_seq_stream('-query' => $query, '-mode'=>'query'); +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + return $DEFAULTFORMAT; +} + +# sorry, but this is hacked in because of BioFetch problems... +sub db { + my $self = shift; + my $d = $self->{_db}; + $self->{_db} = shift if @_; + $d; +} + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + + if( defined $value ) { + $self->{'_format'} = [ $value, $value]; + } + return @{$self->{'_format'}}; +} + +=head2 get_seq_stream + + Title : get_seq_stream + Usage : my $seqio = $self->get_seq_sream(%qualifiers) + Function: builds a url and queries a web db + Returns : a Bio::SeqIO stream capable of producing sequence + Args : %qualifiers = a hash qualifiers that the implementing class + will process to make a url suitable for web querying + +=cut + +sub get_seq_stream { + my ($self, %qualifiers) = @_; + my ($rformat, $ioformat) = $self->request_format(); + my $seen = 0; + foreach my $key ( keys %qualifiers ) { + if( $key =~ /format/i ) { + $rformat = $qualifiers{$key}; + $seen = 1; + } + } + $qualifiers{'-format'} = $rformat if( !$seen); + ($rformat, $ioformat) = $self->request_format($rformat); + + my $request = $self->get_request(%qualifiers); + $request->proxy_authorization_basic($self->authentication) + if ( $self->authentication); + $self->debug("request is ". $request->as_string(). "\n"); + + # workaround for MSWin systems + $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O =~ /^MSWin/; + + if ($self->retrieval_type =~ /pipeline/) { + # Try to create a stream using POSIX fork-and-pipe facility. + # this is a *big* win when fetching thousands of sequences from + # a web database because we can return the first entry while + # transmission is still in progress. + # Also, no need to keep sequence in memory or in a temporary file. + # If this fails (Windows, MacOS 9), we fall back to non-pipelined access. + + # fork and pipe: _stream_request()=> + my $result = eval { open(STREAM,"-|") }; + + if (defined $result) { + $DB::fork_TTY = '/dev/null'; # prevents complaints from debugger + if (!$result) { # in child process + $self->_stream_request($request); + kill 9=>$$; # to prevent END{} blocks from executing in forked children + exit 0; + } + else { + return Bio::SeqIO->new('-verbose' => $self->verbose, + '-format' => $ioformat, + '-fh' => \*STREAM); + } + } + else { + $self->retrieval_type('io_string'); + } + } + + if ($self->retrieval_type =~ /temp/i) { + my $dir = $self->io->tempdir( CLEANUP => 1); + my ( $fh, $tmpfile) = $self->io()->tempfile( DIR => $dir ); + close $fh; + my $resp = $self->_request($request, $tmpfile); + if( ! -e $tmpfile || -z $tmpfile || ! $resp->is_success() ) { + $self->throw("WebDBSeqI Error - check query sequences!\n"); + } + $self->postprocess_data('type' => 'file', + 'location' => $tmpfile); + # this may get reset when requesting batch mode + ($rformat,$ioformat) = $self->request_format(); + if( $self->verbose > 0 ) { + open(ERR, "<$tmpfile"); + while() { $self->debug($_);} + } + + return Bio::SeqIO->new('-verbose' => $self->verbose, + '-format' => $ioformat, + '-file' => $tmpfile); + } + + if ($self->retrieval_type =~ /io_string/i ) { + my $resp = $self->_request($request); + my $content = $resp->content_ref; + $self->debug( "content is $$content\n"); + if (!$resp->is_success() || length($$content) == 0) { + $self->throw("WebDBSeqI Error - check query sequences!\n"); + } + ($rformat,$ioformat) = $self->request_format(); + $self->postprocess_data('type'=> 'string', + 'location' => $content); + $self->debug( "str is $$content\n"); + return Bio::SeqIO->new('-verbose' => $self->verbose, + '-format' => $ioformat, + '-fh' => new IO::String($$content)); + } + + # if we got here, we don't know how to handle the retrieval type + $self->throw("retrieval type " . $self->retrieval_type . + " unsupported\n"); +} + +=head2 url_base_address + + Title : url_base_address + Usage : my $address = $self->url_base_address or + $self->url_base_address($address) + Function: Get/Set the base URL for the Web Database + Returns : Base URL for the Web Database + Args : $address - URL for the WebDatabase + +=cut + +sub url_base_address { + my $self = shift; + my $d = $self->{'_baseaddress'}; + $self->{'_baseaddress'} = shift if @_; + $d; +} + + +=head2 proxy + + Title : proxy + Usage : $httpproxy = $db->proxy('http') or + $db->proxy(['http','ftp'], 'http://myproxy' ) + Function: Get/Set a proxy for use of proxy + Returns : a string indicating the proxy + Args : $protocol : an array ref of the protocol(s) to set/get + $proxyurl : url of the proxy to use for the specified protocol + $username : username (if proxy requires authentication) + $password : password (if proxy requires authentication) + +=cut + +sub proxy { + my ($self,$protocol,$proxy,$username,$password) = @_; + return undef if ( !defined $self->ua || !defined $protocol + || !defined $proxy ); + $self->authentication($username, $password) + if ($username && $password); + return $self->ua->proxy($protocol,$proxy); +} + +=head2 authentication + + Title : authentication + Usage : $db->authentication($user,$pass) + Function: Get/Set authentication credentials + Returns : Array of user/pass + Args : Array or user/pass + + +=cut + +sub authentication{ + my ($self,$u,$p) = @_; + + if( defined $u && defined $p ) { + $self->{'_authentication'} = [ $u,$p]; + } + return @{$self->{'_authentication'}}; +} + + +=head2 retrieval_type + + Title : retrieval_type + Usage : $self->retrieval_type($type); + my $type = $self->retrieval_type + Function: Get/Set a proxy for retrieval_type (pipeline, io_string or tempfile) + Returns : string representing retrieval type + Args : $value - the value to store + +This setting affects how the data stream from the remote web server is +processed and passed to the Bio::SeqIO layer. Three types of retrieval +types are currently allowed: + + pipeline Perform a fork in an attempt to begin streaming + while the data is still downloading from the remote + server. Disk, memory and speed efficient, but will + not work on Windows or MacOS 9 platforms. + + io_string Store downloaded database entry(s) in memory. Can be + problematic for batch downloads because entire set + of entries must fit in memory. Alll entries must be + downloaded before processing can begin. + + tempfile Store downloaded database entry(s) in a temporary file. + All entries must be downloaded before processing can + begin. + +The default is pipeline, with automatic fallback to io_string if +pipelining is not available. + +=cut + +sub retrieval_type { + my ($self, $value) = @_; + if( defined $value ) { + $value = lc $value; + if( ! $RETRIEVAL_TYPES{$value} ) { + $self->warn("invalid retrieval type $value must be one of (" . + join(",", keys %RETRIEVAL_TYPES), ")"); + $value = $DEFAULT_RETRIEVAL_TYPE; + } + $self->{'_retrieval_type'} = $value; + } + return $self->{'_retrieval_type'}; +} + +=head2 url_params + + Title : url_params + Usage : my $params = $self->url_params or + $self->url_params($params) + Function: Get/Set the URL parameters for the Web Database + Returns : url parameters for Web Database + Args : $params - parameters to be appended to the URL for the WebDatabase + +=cut + +sub url_params { + my ($self, $value) = @_; + if( defined $value ) { + $self->{'_urlparams'} = $value; + } +} + +=head2 ua + + Title : ua + Usage : my $ua = $self->ua or + $self->ua($ua) + Function: Get/Set a LWP::UserAgent for use + Returns : reference to LWP::UserAgent Object + Args : $ua - must be a LWP::UserAgent + +=cut + +sub ua { + my ($self, $ua) = @_; + if( defined $ua && $ua->isa("LWP::UserAgent") ) { + $self->{'_ua'} = $ua; + } + return $self->{'_ua'}; +} + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +sub postprocess_data { + my ( $self, %args) = @_; + return; +} + +# private methods +sub _request { + + my ($self, $url,$tmpfile) = @_; + my ($resp); + if( defined $tmpfile && $tmpfile ne '' ) { + $resp = $self->ua->request($url, $tmpfile); + } else { $resp = $self->ua->request($url); } + + if( $resp->is_error ) { + $self->throw("WebDBSeqI Request Error:\n".$resp->as_string); + } + return $resp; +} + +# send web request to stdout for streaming purposes +sub _stream_request { + my $self = shift; + my $request = shift; + + # fork so as to pipe output of fetch process through to + # postprocess_data method call. + my $child = open (FETCH,"-|"); + $self->throw("Couldn't fork: $!") unless defined $child; + + if ($child) { + local ($/) = "//\n"; # assume genbank/swiss format + $| = 1; + my $records = 0; + while (my $record = ) { + $records++; + $self->postprocess_data('type' => 'string', + 'location' => \$record); + print STDOUT $record; + } + $/ = "\n"; # reset to be safe; + close(FETCH); + close STDOUT; + close STDERR; + kill 9=>$$; # to prevent END{} blocks from executing in forked children + sleep; + } + else { + $| = 1; + my $resp = $self->ua->request($request, + sub { print shift } + ); + if( $resp->is_error ) { + $self->throw("WebDBSeqI Request Error:\n".$resp->as_string); + } + + close STDOUT; close STDERR; + kill 9=>$$; # to prevent END{} blocks from executing in forked children + sleep; + } + exit 0; +} + +sub io { + my ($self,$io) = @_; + + if(defined($io) || (! exists($self->{'_io'}))) { + $io = Bio::Root::IO->new() unless $io; + $self->{'_io'} = $io; + } + return $self->{'_io'}; +} + + +=head2 delay + + Title : delay + Usage : $secs = $self->delay([$secs]) + Function: get/set number of seconds to delay between fetches + Returns : number of seconds to delay + Args : new value + +NOTE: the default is to use the value specified by delay_policy(). +This can be overridden by calling this method, or by passing the +-delay argument to new(). + +=cut + +sub delay { + my $self = shift; + my $d = $self->{'_delay'}; + $self->{'_delay'} = shift if @_; + $d; +} + +=head2 delay_policy + + Title : delay_policy + Usage : $secs = $self->delay_policy + Function: return number of seconds to delay between calls to remote db + Returns : number of seconds to delay + Args : none + +NOTE: The default delay policy is 0s. Override in subclasses to +implement delays. The timer has only second resolution, so the delay +will actually be +/- 1s. + +=cut + +sub delay_policy { + my $self = shift; + return 0; +} + +=head2 _sleep + + Title : _sleep + Usage : $self->_sleep + Function: sleep for a number of seconds indicated by the delay policy + Returns : none + Args : none + +NOTE: This method keeps track of the last time it was called and only +imposes a sleep if it was called more recently than the delay_policy() +allows. + +=cut + +sub _sleep { + my $self = shift; + my $last_invocation = $LAST_INVOCATION_TIME; + if (time - $LAST_INVOCATION_TIME < $self->delay) { + my $delay = $self->delay - (time - $LAST_INVOCATION_TIME); + warn "sleeping for $delay seconds\n" if $self->verbose; + sleep $delay; + } + $LAST_INVOCATION_TIME = time; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/XEMBL.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/XEMBL.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,184 @@ +# +# $Id: XEMBL.pm,v 1.3 2002/10/22 07:38:29 lapp Exp $ +# +# BioPerl module for Bio::DB::XEMBL +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::XEMBL - Database object interface for XEMBL entry retrieval + +=head1 SYNOPSIS + + use Bio::DB::XEMBL; + + $embl = new Bio::DB::XEMBL; + + # remember that XEMBL_ID does not equal GenBank_ID! + $seq = $embl->get_Seq_by_id('BUM'); # EMBL ID + print "cloneid is ", $seq->id, "\n"; + + # or changeing to accession number and Fasta format ... + $seq = $embl->get_Seq_by_acc('J02231'); # XEMBL ACC + print "cloneid is ", $seq->id, "\n"; + + # especially when using versions, you better be prepared + # in not getting what what want + eval { + $seq = $embl->get_Seq_by_version('J02231.1'); # XEMBL VERSION + } + print "cloneid is ", $seq->id, "\n" unless $@; + + my $seqio = $embl->get_Stream_by_batch(['U83300','U83301','U83302']); + while( my $clone = $seqio->next_seq ) { + print "cloneid is ", $clone->id, "\n"; + } + +=head1 DESCRIPTION + +Allows the dynamic retrieval of Bio::Seq objects from the XEMBL +database. See L for details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email Lincoln Stein Elstein@cshl.orgE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::XEMBL; +use strict; +use Bio::DB::RandomAccessI; +use Bio::DB::XEMBLService 'getNucSeq'; +# bsml parser appears broken... +use Bio::SeqIO::bsml; +use File::Temp 'tempfile'; +use vars qw(@ISA $MODVERSION); + +@ISA = qw(Bio::DB::RandomAccessI); +$MODVERSION = '0.1'; + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + + +=cut + +sub get_Seq_by_id { + my ($self,@args) = @_; + my $seqio = $self->get_Stream_by_batch([@args]); + return $seqio->next_seq; +} + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from XEMBL 'en masse', rather than one + at a time. Currently this is not particularly efficient, as + it loads the entire result into memory and parses it. + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : an array reference containing a list of unique + ids/accession numbers. + +=cut + +sub get_Stream_by_batch { + my ($self, $ids) = @_; + $self->throw("expected an array ref, but got $ids") + unless ref($ids) eq 'ARRAY'; + my @args = @$ids; + my $result = getNucSeq(SOAP::Data->name(format=>'bsml'), + SOAP::Data->name(ids=>"@args")) + or $self->throw('id does not exist'); + my($fh,$filename) = tempfile(File::Spec->tmpdir . '/bsmlXXXXXX',SUFFIX=>'.bsml'); + print $fh $result; + close $fh; + my $seqio = Bio::SeqIO->new(-file=>$filename,-format=>'bsml'); + unlink $filename; + $seqio; +} + +*get_Stream_by_id = \&get_Stream_by_batch; + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + + +=cut + +sub get_Seq_by_acc{ + my ($self,@args) = @_; + return $self->get_Seq_by_id(@args); +} + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=cut + +sub get_Seq_by_version{ + my ($self,@args) = @_; + return $self->get_Seq_by_id(@args); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DB/XEMBLService.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/XEMBLService.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +# +# $Id: XEMBLService.pm,v 1.3 2002/10/22 07:38:29 lapp Exp $ +# +# BioPerl module for Bio::DB::XEMBLService +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::XEMBLService - SOAP service definition for XEMBL + +=head1 SYNOPSIS + + #usage + +=head1 DESCRIPTION + +SOAP service definition for XEMBL. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + + +package Bio::DB::XEMBLService; + +# -- generated by SOAP::Lite (v0.51) for Perl -- soaplite.com -- Copyright (C) 2000-2001 Paul Kulchenko -- +# -- generated from http://www.ebi.ac.uk/xembl/XEMBL.wsdl [Sat Jan 26 14:47:29 2002] + +my %methods = ( + getNucSeq => { + endpoint => 'http://www.ebi.ac.uk:80/cgi-bin/xembl/XEMBL-SOAP.pl', + soapaction => 'http://www.ebi.ac.uk/XEMBL#getNucSeq', + uri => 'http://www.ebi.ac.uk/XEMBL', + parameters => [ + SOAP::Data->new(name => 'format', type => 'xsd:string', attr => {}), + SOAP::Data->new(name => 'ids', type => 'xsd:string', attr => {}), + ], + }, +); + +use SOAP::Lite; +use Exporter; +use Carp (); + +use vars qw(@ISA $AUTOLOAD @EXPORT_OK %EXPORT_TAGS); +@ISA = qw(Exporter SOAP::Lite); +@EXPORT_OK = (keys %methods); +%EXPORT_TAGS = ('all' => [@EXPORT_OK]); + +no strict 'refs'; +for my $method (@EXPORT_OK) { + my %method = %{$methods{$method}}; + *$method = sub { + my $self = UNIVERSAL::isa($_[0] => __PACKAGE__) + ? ref $_[0] ? shift # OBJECT + # CLASS, either get self or create new and assign to self + : (shift->self || __PACKAGE__->self(__PACKAGE__->new)) + # function call, either get self or create new and assign to self + : (__PACKAGE__->self || __PACKAGE__->self(__PACKAGE__->new)); + $self->proxy($method{endpoint} || Carp::croak "No server address (proxy) specified") unless $self->proxy; + my @templates = @{$method{parameters}}; + my $som = $self + -> endpoint($method{endpoint}) + -> uri($method{uri}) + -> on_action(sub{qq!"$method{soapaction}"!}) + -> call($method => map {shift(@templates)->value($_)} @_); + UNIVERSAL::isa($som => 'SOAP::SOM') ? wantarray ? $som->paramsall : $som->result + : $som; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DBLinkContainerI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DBLinkContainerI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,106 @@ +# $Id: DBLinkContainerI.pm,v 1.8 2002/10/22 07:38:24 lapp Exp $ +# +# BioPerl module for Bio::DBLinkContainerI +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DBLinkContainerI - Abstract interface for any object wanting to use + database cross references + +=head1 SYNOPSIS + + # get an objects containing database cross reference + + foreach $obj ( @objs ) { + if( $obj->isa('Bio::DBLinkContainerI') ) { + foreach $dblink ( $obj->each_DBLink() ) { + # do stuff + } + } + } + +=head1 DESCRIPTION + +This interface defines the functions one can expect for any object +wanting to use database cross-references. This class does not actually +provide any implemention, it just provides the definitions of what +methods one can call. + +The database cross-references are implemented as L +objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DBLinkContainerI; +use vars qw(@ISA); +use strict; + +use Carp; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head2 each_DBLink + + Title : each_DBLink + Usage : foreach $ref ( $self->each_DBlink() ) + Function: gets an array of DBlink of objects + Example : + Returns : an array of Bio::Annotation::DBLink objects + Args : none + + +=cut + +sub each_DBLink{ + my ($self) = @_; + my $class = ref($self) || $self; + $self->throw("Class $class did not define method 'each_DBLink' for interface DBLinkContainerI"); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Das/FeatureTypeI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Das/FeatureTypeI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,389 @@ +# $Id: FeatureTypeI.pm,v 1.3 2002/11/11 18:16:30 lapp Exp $ +# +# BioPerl module for Bio::Das::FeatureTypeI +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Das::FeatureTypeI - Simple interface to Sequence Ontology feature types + +=head1 SYNOPSIS + + # Get a Bio::Das::FeatureTypeI object from somewhere + $term = $db->fetch.... + + # Get the name of the term + $definition = $term->name; + + # Get the accession of the term + $accession = $term->accession; + + # Get the definition of the term + $definition = $term->definition; + + # Get the parents of the term, optionally filtered by relationship + @parents = $term->parents($relationship); + + # Get the children of the term, optionally filtered by relationship + @children = $term->children($relationship); + + # Given a parent and child, returns their relationship, or undef if + # not directly related + $relationship = $parent->relationship($child); + + # Return true if two terms are identical + $match = $term1->equals($term2); + + # Return true if $term2 is a descendent of $term1, optionally + # filtering by relationship ("isa" assumed) + $match = $term1->is_descendent($term2,$relationship); + + # Return true if $term2 is a parent of $term1, optionally + # filtering by relationship ("isa" assumed) + $match = $term1->is_parent($term2,$relationship); + + # Return true if $term2 is equal to $term1 or if $term2 descends + # from term 1 via the "isa" relationship + $match = $term1->match($term2); + + # Create a new term de novo + $term = Bio::Das::FeatureTypeI->new(-name => $name, + -accession => $accession, + -definition => $definition); + + # Add a child to a term + $term1->add_child($term2,$relationship); + + # Delete a child from a term + $term1->delete_child($term2); + +=head1 DESCRIPTION + +Bio::Das::FeatureTypeI is an interface to the Gene Ontology +Consortium's Sequence Ontology (SO). The SO, like other ontologies, +is a directed acyclic graph in which a child node may have multiple +parents. The relationship between parent and child is one of a list +of relationships. The SO currently recognizes two relationships "isa" +and "partof". + +The intent of this interface is to interoperate with older software +that uses bare strings to represent feature types. For this reason, +the interface overloads the stringify ("") and string equals (eq) +operations. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::Das::FeatureTypeI; +use strict; + +use vars qw(@ISA); +use Bio::Root::RootI; +use overload '""' => 'name', + eq => 'match', + fallback => 1; + +# Object preamble - inherits from Bio::Root::RootI; +@ISA = qw(Bio::Root::RootI); + +=head2 name + + Title : name + Usage : $string = $term->name + Function: return the term for the type + Returns : a string + Args : none + Status : Public + +=cut + +sub name { shift->throw_not_implemented } + +=head2 accession + + Title : accession + Usage : $string = $term->accession + Function: return the accession number for the term + Returns : a string + Args : none + Status : Public + +=cut + +sub accession { shift->throw_not_implemented } + +=head2 definition + + Title : definition + Usage : $string = $term->definition + Function: return the human-readable definition for the term + Returns : a string + Args : none + Status : Public + +=cut + +sub definition { shift->throw_not_implemented } + +=head2 parents + + Title : parents + Usage : @terms = $term->parents($relationship) + Function: return parent terms + Returns : list of Bio::Das::FeatureTypeI + Args : none + Status : Public + +Returns the parents for the current term, empty if there are none. An +optional relationship argument will return those parents +that are related via the specified relationship type. + +The relationship is one of "isa" or "partof". + +=cut + +sub parents { shift->throw_not_implemented; } + +=head2 children + + Title : children + Usage : @terms = $term->children($relationship) + Function: return children terms + Returns : list of Bio::Das::FeatureTypeI + Args : none + Status : Public + +Returns the children for the current term, empty if there are none. An +optional relationship argument will return those children +that are related via the specified relationship type. + +The relationship is one of "isa" or "partof". + +=cut + +sub children { shift->throw_not_implemented; } + +=head2 relationship + + Title : relationship + Usage : $relationship = $parent->relationship($child) + Function: return the relationship between a parent and a child + Returns : one of "isa" or "partof" + Args : none + Status : Public + +This method returns the relationship between a parent and one of its +immediate descendents. It can return "isa", "partof", or undef if +there is not a direct parent/child relationship (kissing cousins are +*not* recognized). + +=cut + +sub relationship { shift->throw_not_implemented } + +=head2 equals + + Title : equals + Usage : $boolean = $term1->equals($term2) + Function: return true if $term1 and $term2 are the same + Returns : boolean + Args : second term + Status : Public + +The two terms must be identical. In practice, this means that if +term2 is a Bio::Das::FeatureI object, then its accession number must +match the first term's accession number. Otherwise, if term2 is a +bare string, then it must equal (in a case insensitive manner) +the name of term1. + +NOTE TO IMPLEMENTORS: This method is defined in terms of other +methods, so does not need to be implemented. + +=cut + +#' +sub equals { + my $self = shift; + my $term2 = shift; + if ($term2->isa('Bio::Das::FeatureTypeI')) { + return $self->accession eq $term2->accession; + } else { + return lc $self->name eq lc $term2; + } +} + +=head2 is_descendent + + Title : is_descendent + Usage : $boolean = $term1->is_descendent($term2 [,$relationship]) + Function: return true of $term2 is a descendent of $term1 + Returns : boolean + Args : second term + Status : Public + +This method returns true if $term2 descends from $term1. The +operation traverses the tree. The traversal can be limited to the +relationship type ("isa" or "partof") if desired. $term2 can be a +bare string, in which case the term names will be used as the basis +for term matching (see equals()). + +NOTE TO IMPLEMENTORS: this method is defined as the inverse of +is_parent(). Do not implement it directly, but do implement +is_parent(). + +=cut + +sub is_descendent { + my $self = shift; + my ($term,$relationship) = @_; + $self->throw("$term is not a Bio::Das::FeatureTypeI") + unless $term->isa('Bio::Das::FeatureTypeI'); + $term->is_parent($self,$relationship); +} + +=head2 is_parent + + Title : is_parent + Usage : $boolean = $term1->is_parent($term2 [,$relationship]) + Function: return true of $term2 is a parent of $term1 + Returns : boolean + Args : second term + Status : Public + +This method returns true if $term2 is a parent of $term1. The +operation traverses the tree. The traversal can be limited to the +relationship type ("isa" or "partof") if desired. $term2 can be a +bare string, in which case the term names will be used as the basis +for term matching (see equals()). + +NOTE TO IMPLEMENTORS: Implementing this method will also implement +is_descendent(). + +=cut + +sub is_parent { shift->throw_not_implemented } + +=head2 match + + Title : match + Usage : $boolean = $term1->match($term2) + Function: return true if $term1 equals $term2 or if $term2 is an "isa" descendent + Returns : boolean + Args : second term + Status : Public + +This method combines equals() and is_descendent() in such a way that +the two terms will match if they are the same or if the second term is +an instance of the first one. This is also the basis of the operator +overloading of eq. + +NOTE TO IMPLEMENTORS: This method is defined in terms of other methods +and does not need to be implemented. + +=cut + +sub match { + my $self = shift; + my $term2 = shift; + return 1 if $self->equals($term2); + return $self->is_descendent($term2,'isa'); +} + +=head2 new + + Title : new + Usage : $term = Bio::Das::FeatureTypeI->new(@args) + Function: create a new term + Returns : new term + Args : see below + Status : Public + +This method creates a new Bio::Das::FeatureTypeI. Arguments: + + Argument Description + -------- ------------ + + -name Name of this term + + -accession Accession number for the term + + -definition Definition of the term + +=cut + +sub new { shift->throw_not_implemented } + +=head2 add_child + + Title : add_child + Usage : $boolean = $term->add_child($term2,$relationship) + Function: add a child to a term + Returns : a boolean indicating success + Args : new child + Throws : a "cycle detected" exception + Status : Public + +This method adds a new child to the indicated node. It may detect a +cycle in the DAG and throw a "cycle detected" exception. + +=cut + +sub add_child { shift->throw_not_implemented } + + +=head2 delete_child + + Title : delete_child + Usage : $boolean = $term->delete_child($term2); + Function: delete a child of the term + Returns : a boolean indicating success + Args : child to be deleted + Throws : a "not a child" exception + Status : Public + +This method deletes a new child from the indicated node. It will +throw an exception if the indicated child is not a direct descendent. + +=cut + +sub delete_child { shift->throw_not_implemented } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Das/SegmentI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Das/SegmentI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,453 @@ +# $Id: SegmentI.pm,v 1.6 2002/12/22 03:42:22 lstein Exp $ +# +# BioPerl module for Bio::Das::SegmentI +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Das::SegmentI - DAS-style access to a feature database + +=head1 SYNOPSIS + + # Get a Bio::Das::SegmentI object from a Bio::DasI database... + + $segment = $das->segment(-name=>'Landmark', + -start=>$start, + -end => $end); + + @features = $segment->overlapping_features(-type=>['type1','type2']); + # each feature is a Bio::SeqFeatureI-compliant object + + @features = $segment->contained_features(-type=>['type1','type2']); + + @features = $segment->contained_in(-type=>['type1','type2']); + + $stream = $segment->get_feature_stream(-type=>['type1','type2','type3']; + while (my $feature = $stream->next_seq) { + # do something with feature + } + + $count = $segment->features_callback(-type=>['type1','type2','type3'], + -callback => sub { ... { } + ); + +=head1 DESCRIPTION + +Bio::Das::SegmentI is a simplified alternative interface to sequence +annotation databases used by the distributed annotation system. In +this scheme, the genome is represented as a series of landmarks. Each +Bio::Das::SegmentI object ("segment") corresponds to a genomic region +defined by a landmark and a start and end position relative to that +landmark. A segment is created using the Bio::DasI segment() method. + +Features can be filtered by the following attributes: + + 1) their location relative to the segment (whether overlapping, + contained within, or completely containing) + + 2) their type + + 3) other attributes using tag/value semantics + +Access to the feature list uses three distinct APIs: + + 1) fetching entire list of features at a time + + 2) fetching an iterator across features + + 3) a callback + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::Das::SegmentI; +use strict; + +use vars qw(@ISA $VERSION); +use Bio::Root::RootI; + +# Object preamble - inherits from Bio::Root::RootI; +@ISA = qw(Bio::Root::RootI); +$VERSION = 1.00; + +=head2 seq_id + + Title : seq_id + Usage : $ref = $s->seq_id + Function: return the ID of the landmark + Returns : a string + Args : none + Status : Public + +=cut + +sub seq_id { shift->throw_not_implemented } + +=head2 start + + Title : start + Usage : $s->start + Function: start of segment + Returns : integer + Args : none + Status : Public + +This is a read-only accessor for the start of the segment. Alias +to low() for Gadfly compatibility. + +=cut + +sub start { shift->throw_not_implemented } +sub low { shift->start } + +=head2 end + + Title : end + Usage : $s->end + Function: end of segment + Returns : integer + Args : none + Status : Public + +This is a read-only accessor for the end of the segment. Alias to +high() for Gadfly compatibility. + +=cut + +sub end { shift->throw_not_implemented } +sub stop { shift->end } +sub high { shift->end } + +=head2 length + + Title : length + Usage : $s->length + Function: length of segment + Returns : integer + Args : none + Status : Public + +Returns the length of the segment. Always a positive number. + +=cut + +sub length { shift->throw_not_implemented; } + +=head2 seq + + Title : seq + Usage : $s->seq + Function: get the sequence string for this segment + Returns : a string + Args : none + Status : Public + +Returns the sequence for this segment as a simple string. + +=cut + +sub seq {shift->throw_not_implemented} + +=head2 ref + + Title : ref + Usage : $ref = $s->ref([$newlandmark]) + Function: get/set the reference landmark for addressing + Returns : a string + Args : none + Status : Public + +This method is used to examine/change the reference landmark used to +establish the coordinate system. By default, the landmark cannot be +changed and therefore this has the same effect as seq_id(). The new +landmark might be an ID, or another Das::SegmentI object. + +=cut + +sub ref { shift->seq_id } +*refseq = \&ref; + +=head2 absolute + + Title : absolute + Usage : $s->absolute([$new_value]) + Function: get/set absolute addressing mode + Returns : flag + Args : new flag (optional) + Status : Public + +Turn on and off absolute-addressing mode. In absolute addressing +mode, coordinates are relative to some underlying "top level" +coordinate system (such as a chromosome). ref() returns the identity +of the top level landmark, and start() and end() return locations +relative to that landmark. In relative addressing mode, coordinates +are relative to the landmark sequence specified at the time of segment +creation or later modified by the ref() method. + +The default is to return false and to do nothing in response to +attempts to set absolute addressing mode. + +=cut + +sub absolute { return } + +=head2 features + + Title : features + Usage : @features = $s->features(@args) + Function: get features that overlap this segment + Returns : a list of Bio::SeqFeatureI objects + Args : see below + Status : Public + +This method will find all features that intersect the segment in a +variety of ways and return a list of Bio::SeqFeatureI objects. The +feature locations will use coordinates relative to the reference +sequence in effect at the time that features() was called. + +The returned list can be limited to certain types, attributes or +range intersection modes. Types of range intersection are one of: + + "overlaps" the default + "contains" return features completely contained within the segment + "contained_in" return features that completely contain the segment + +Two types of argument lists are accepted. In the positional argument +form, the arguments are treated as a list of feature types. In the +named parameter form, the arguments are a series of -name=Evalue +pairs. + + Argument Description + -------- ------------ + + -types An array reference to type names in the format + "method:source" + + -attributes A hashref containing a set of attributes to match + + -rangetype One of "overlaps", "contains", or "contained_in". + + -iterator Return an iterator across the features. + + -callback A callback to invoke on each feature + +The -attributes argument is a hashref containing one or more +attributes to match against: + + -attributes => { Gene => 'abc-1', + Note => 'confirmed' } + +Attribute matching is simple string matching, and multiple attributes +are ANDed together. More complex filtering can be performed using the +-callback option (see below). + +If -iterator is true, then the method returns an object reference that +implements the next_seq() method. Each call to next_seq() returns a +new Bio::SeqFeatureI object. + +If -callback is passed a code reference, the code reference will be +invoked on each feature returned. The code will be passed two +arguments consisting of the current feature and the segment object +itself, and must return a true value. If the code returns a false +value, feature retrieval will be aborted. + +-callback and -iterator are mutually exclusive options. If -iterator +is defined, then -callback is ignored. + +NOTE: the following methods all build on top of features(), and do not +need to be explicitly implemented. + + overlapping_features() + contained_features() + contained_in() + get_feature_stream() + +=cut + +sub features {shift->throw_not_implemented} + +=head2 overlapping_features + + Title : overlapping_features + Usage : @features = $s->overlapping_features(@args) + Function: get features that overlap this segment + Returns : a list of Bio::SeqFeatureI objects + Args : see below + Status : Public + +This method is identical to features() except that it defaults to +finding overlapping features. + +=cut + +sub overlapping_features { + my $self = shift; + my @args = $_[0] !~ /^-/ ? (@_, -rangetype=>'overlaps') + : (-types=>\@_,-rangetype=>'overlaps'); + $self->features(@args); +} + +=head2 contained_features + + Title : contained_features + Usage : @features = $s->contained_features(@args) + Function: get features that are contained in this segment + Returns : a list of Bio::SeqFeatureI objects + Args : see below + Status : Public + +This method is identical to features() except that it defaults to +a range type of 'contained'. + +=cut + +sub contained_features { + my $self = shift; + my @args = $_[0] !~ /^-/ ? (@_, -rangetype=>'contained') + : (-types=>\@_,-rangetype=>'contained'); + $self->features(@args); +} + +=head2 contained_in + + Title : contained_in + Usage : @features = $s->contained_in(@args) + Function: get features that contain this segment + Returns : a list of Bio::SeqFeatureI objects + Args : see below + Status : Public + +This method is identical to features() except that it defaults to +a range type of 'contained_in'. + +=cut + +sub contained_in { + my $self = shift; + my @args = $_[0] !~ /^-/ ? (@_, -rangetype=>'contained_in') + : (-types=>\@_,-rangetype=>'contained_in'); + $self->features(@args); +} + +=head2 get_feature_stream + + Title : get_feature_stream + Usage : $iterator = $s->get_feature_stream(@args) + Function: get an iterator across the segment + Returns : an object that implements next_seq() + Args : see below + Status : Public + +This method is identical to features() except that it always generates +an iterator. + +NOTE: This is defined in the interface in terms of features(). You do not +have to implement it. + +=cut + +sub get_feature_stream { + my $self = shift; + my @args = defined $_[0] && $_[0] =~ /^-/ ? (@_, -iterator=>1) + : (-types=>\@_,-iterator=>1); + $self->features(@args); +} + +=head2 factory + + Title : factory + Usage : $factory = $s->factory + Function: return the segment factory + Returns : a Bio::DasI object + Args : see below + Status : Public + +This method returns a Bio::DasI object that can be used to fetch +more segments. This is typically the Bio::DasI object from which +the segment was originally generated. + +=cut + +#' + +sub factory {shift->throw_not_implemented} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $s->primary_tag + Function: identifies the segment as type "DasSegment" + Returns : a string named "DasSegment" + Args : none + Status : Public, but see below + +This method provides Bio::Das::Segment objects with a primary_tag() +field that identifies them as being of type "DasSegment". This allows +the Bio::Graphics engine to render segments just like a feature in order +nis way useful. + +This does not need to be implemented. It is defined by the interface. + +=cut + +#' + +sub primary_tag {"DasSegment"} + +=head2 strand + + Title : strand + Usage : $strand = $s->strand + Function: identifies the segment strand as 0 + Returns : the number 0 + Args : none + Status : Public, but see below + +This method provides Bio::Das::Segment objects with a strand() field +that identifies it as being strandless. This allows the Bio::Graphics +engine to render segments just like a feature in order nis way useful. + +This does not need to be implemented. It is defined by the interface. + +=cut + +sub strand { 0 } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DasI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DasI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,415 @@ +# $Id: DasI.pm,v 1.15 2002/11/11 18:16:29 lapp Exp $ +# +# BioPerl module for Bio::DasI +# +# Cared for by Lincoln Stein +# +# Copyright Lincoln Stein +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DasI - DAS-style access to a feature database + +=head1 SYNOPSIS + + # Open up a feature database somehow... + $db = Bio::DasI->new(@args); + + @segments = $db->segment(-name => 'NT_29921.4', + -start => 1, + -end => 1000000); + + # segments are Bio::Das::SegmentI - compliant objects + + # fetch a list of features + @features = $db->features(-type=>['type1','type2','type3']); + + # invoke a callback over features + $db->features(-type=>['type1','type2','type3'], + -callback => sub { ... } + ); + + $stream = $db->get_seq_stream(-type=>['type1','type2','type3']); + while (my $feature = $stream->next_seq) { + # each feature is a Bio::SeqFeatureI-compliant object + } + + # get all feature types + @types = $db->types; + + # count types + %types = $db->types(-enumerate=>1); + + @feature = $db->get_feature_by_name($class=>$name); + @feature = $db->get_feature_by_target($target_name); + @feature = $db->get_feature_by_attribute($att1=>$value1,$att2=>$value2); + $feature = $db->get_feature_by_id($id); + + $error = $db->error; + +=head1 DESCRIPTION + +Bio::DasI is a simplified alternative interface to sequence annotation +databases used by the distributed annotation system (see +L). In this scheme, the genome is represented as a series of +features, a subset of which are named. Named features can be used as +reference points for retrieving "segments" (see L), +and these can, in turn, be used as the basis for exploring the genome +further. + +In addition to a name, each feature has a "class", which is +essentially a namespace qualifier and a "type", which describes what +type of feature it is. Das uses the GO consortium's ontology of +feature types, and so the type is actually an object of class +Bio::Das::FeatureTypeI (see L). Bio::DasI +provides methods for interrogating the database for the types it +contains and the counts of each type. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::DasI; +use strict; + +use vars qw(@ISA); +use Bio::Root::RootI; +use Bio::Das::SegmentI; +use Bio::SeqFeature::CollectionI; +# Object preamble - inherits from Bio::Root::Root; +@ISA = qw(Bio::Root::RootI Bio::SeqFeature::CollectionI); + +=head2 new + + Title : new + Usage : Bio::DasI->new(@args) + Function: Create new Bio::DasI object + Returns : a Bio::DasI object + Args : see below + +The new() method creates a new object. The argument list is either a +single argument consisting of a connection string, or the following +list of -name=Evalue arguments: + + Argument Description + -------- ----------- + + -dsn Connection string for database + -adaptor Name of an adaptor class to use when connecting + -aggregator Array ref containing list of aggregators + "semantic mappers" to apply to database + -user Authentication username + -pass Authentication password + +Implementors of DasI may add other arguments. + +=cut + +sub new {shift->throw_not_implemented} + +=head2 types + + Title : types + Usage : $db->types(@args) + Function: return list of feature types in database + Returns : a list of Bio::Das::FeatureTypeI objects + Args : see below + +This routine returns a list of feature types known to the database. It +is also possible to find out how many times each feature occurs. + +Arguments are -option=Evalue pairs as follows: + + -enumerate if true, count the features + +The returned value will be a list of Bio::Das::FeatureTypeI objects +(see L. + +If -enumerate is true, then the function returns a hash (not a hash +reference) in which the keys are the stringified versions of +Bio::Das::FeatureTypeI and the values are the number of times each +feature appears in the database. + +=cut + +sub types { shift->throw_not_implemented; } + +=head2 segment + + Title : segment + Usage : $db->segment(@args); + Function: create a segment object + Returns : segment object(s) + Args : see below + +This method generates a Bio::Das::SegmentI object (see +L). The segment can be used to find overlapping +features and the raw sequence. + +When making the segment() call, you specify the ID of a sequence +landmark (e.g. an accession number, a clone or contig), and a +positional range relative to the landmark. If no range is specified, +then the entire region spanned by the landmark is used to generate the +segment. + +Arguments are -option=Evalue pairs as follows: + + -name ID of the landmark sequence. + + -class A namespace qualifier. It is not necessary for the + database to honor namespace qualifiers, but if it + does, this is where the qualifier is indicated. + + -version Version number of the landmark. It is not necessary for + the database to honor versions, but if it does, this is + where the version is indicated. + + -start Start of the segment relative to landmark. Positions + follow standard 1-based sequence rules. If not specified, + defaults to the beginning of the landmark. + + -end End of the segment relative to the landmark. If not specified, + defaults to the end of the landmark. + +The return value is a list of Bio::Das::SegmentI objects. If the method +is called in a scalar context and there are no more than one segments +that satisfy the request, then it is allowed to return the segment. +Otherwise, the method must throw a "multiple segment exception". + +=cut + +#' + +sub segment { shift->throw_not_implemented } + +=head2 features + + Title : features + Usage : $db->features(@args) + Function: get all features, possibly filtered by type + Returns : a list of Bio::SeqFeatureI objects + Args : see below + Status : public + +This routine will retrieve features in the database regardless of +position. It can be used to return all features, or a subset based on +their type + +Arguments are -option=Evalue pairs as follows: + + -types List of feature types to return. Argument is an array + of Bio::Das::FeatureTypeI objects or a set of strings + that can be converted into FeatureTypeI objects. + + -callback A callback to invoke on each feature. The subroutine + will be passed each Bio::SeqFeatureI object in turn. + + -attributes A hash reference containing attributes to match. + +The -attributes argument is a hashref containing one or more attributes +to match against: + + -attributes => { Gene => 'abc-1', + Note => 'confirmed' } + +Attribute matching is simple exact string matching, and multiple +attributes are ANDed together. See L for a +more sophisticated take on this. + +If one provides a callback, it will be invoked on each feature in +turn. If the callback returns a false value, iteration will be +interrupted. When a callback is provided, the method returns undef. + +=cut + +sub features { shift->throw_not_implemented } + +=head2 get_feature_by_name + + Title : get_feature_by_name + Usage : $db->get_feature_by_name(-class=>$class,-name=>$name) + Function: fetch features by their name + Returns : a list of Bio::SeqFeatureI objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch named feature(s) from the database. +The -class and -name arguments have the same meaning as in segment(), +and the method also accepts the following short-cut forms: + + 1) one argument: the argument is treated as the feature name + 2) two arguments: the arguments are treated as the class and name + (note: this uses _rearrange() so the first argument must not + begin with a hyphen or it will be interpreted as a named + argument). + +This method may return zero, one, or several Bio::SeqFeatureI objects. +The implementor may allow the name to contain wildcards, in which case +standard C-shell glob semantics are expected. + +=cut + +sub get_feature_by_name { + shift->throw_not_implemented(); +} + +=head2 get_feature_by_target + + Title : get_feature_by_target + Usage : $db->get_feature_by_target($class => $name) + Function: fetch features by their similarity target + Returns : a list of Bio::SeqFeatureI objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a named feature from the database +based on its similarity hit. The arguments are the same as +get_feature_by_name(). If this is not implemented, the interface +defaults to using get_feature_by_name(). + +=cut + +sub get_feature_by_target { + shift->get_feature_by_name(@_); +} + +=head2 get_feature_by_id + + Title : get_feature_by_id + Usage : $db->get_feature_by_target($id) + Function: fetch a feature by its ID + Returns : a Bio::SeqFeatureI objects + Args : the ID of the feature + Status : public + +If the database provides unique feature IDs, this can be used to +retrieve a single feature from the database. If not overridden, this +interface calls get_feature_by_name() and returns the first element. + +=cut + +sub get_feature_by_id { + (shift->get_feature_by_name(@_))[0]; +} + +=head2 get_feature_by_attribute + + Title : get_feature_by_attribute + Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) + Function: fetch features by combinations of attribute values + Returns : a list of Bio::SeqFeatureI objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a set of features from the database. +Attributes are a list of name=Evalue pairs. They will be +logically ANDed together. If an attribute value is an array +reference, the list of values in the array is treated as an +alternative set of values to be ORed together. + +=cut + +sub get_feature_by_attribute { + shift->throw_not_implemented(); +} + + +=head2 search_notes + + Title : search_notes + Usage : $db->search_notes($search_term,$max_results) + Function: full-text search on features, ENSEMBL-style + Returns : an array of [$name,$description,$score] + Args : see below + Status : public + +This routine performs a full-text search on feature attributes (which +attributes depend on implementation) and returns a list of +[$name,$description,$score], where $name is the feature ID, +$description is a human-readable description such as a locus line, and +$score is the match strength. + +Since this is a decidedly non-standard thing to do (but the generic +genome browser uses it), the default method returns an empty list. +You do not have to implement it. + +=cut + +sub search_notes { return } + +=head2 get_seq_stream + + Title : get_seq_stream + Usage : $seqio = $db->get_seq_stream(@args) + Function: Performs a query and returns an iterator over it + Returns : a Bio::SeqIO stream capable of returning Bio::SeqFeatureI objects + Args : As in features() + Status : public + +This routine takes the same arguments as features(), but returns a +Bio::SeqIO::Stream-compliant object. Use it like this: + + $stream = $db->get_seq_stream('exon'); + while (my $exon = $stream->next_seq) { + print $exon,"\n"; + } + +NOTE: In the interface this method is aliased to get_feature_stream(), +as the name is more descriptive. + +=cut + +sub get_seq_stream { shift->throw_not_implemented } +sub get_feature_stream {shift->get_seq_stream(@_) } + +=head2 refclass + + Title : refclass + Usage : $class = $db->refclass + Function: returns the default class to use for segment() calls + Returns : a string + Args : none + Status : public + +For data sources which use namespaces to distinguish reference +sequence accessions, this returns the default namespace (or "class") +to use. This interface defines a default of "Accession". + +=cut + +sub refclass { "Accession" } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/DescribableI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DescribableI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,111 @@ +# $Id: DescribableI.pm,v 1.6 2002/10/25 01:29:37 lapp Exp $ + +# +# This module is licensed under the same terms as Perl itself. You use, +# modify, and redistribute it under the terms of the Perl Artistic License. +# + +=head1 NAME + +Bio::DescribableI - interface for objects with human readable names and descriptions + +=head1 SYNOPSIS + + + # to test this is a describable object + + $obj->isa("Bio::DescribableI") || + $obj->throw("$obj does not implement the Bio::DescribableI interface"); + + # accessors + + $name = $obj->display_name(); + $desc = $obj->description(); + + + +=head1 DESCRIPTION + +This interface describes methods expected on describable objects, ie +ones which have human displayable names and descriptions + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +=cut + +package Bio::DescribableI; +use vars qw(@ISA ); +use strict; +use Bio::Root::RootI; + + +@ISA = qw(Bio::Root::RootI); + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user + the string should have no spaces (ideally, though a cautious + user of this interface would not assumme this) and should be + less than thirty characters (though again, double checking + this is a good idea) + Returns : A scalar + Status : Virtual + +=cut + +sub display_name { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 description + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. The string should not be greater than 255 characters + and clients can feel justified at truncating strings at 255 + characters for the purposes of display + Returns : A scalar + Status : Virtual + +=cut + +sub description { + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/AlignStrainSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/AlignStrainSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,344 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::AlignStrainSlice - Represents the slice of the genome aligned with certain strains (applying the variations/indels) + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + $strainSlice1 = $slice->get_by_Strain($strain_name1); + $strainSlice2 = $slice->get_by_Strain($strain_name2); + + my @strainSlices; + push @strainSlices, $strainSlice1; + push @strainSlices, $strainSlice2; + + $alignSlice = Bio::EnsEMBL::AlignStrainSlice->new( + -SLICE => $slice, + -STRAINS => \@strainSlices + ); + + # Get coordinates of variation in alignSlice + my $alleleFeatures = $strainSlice1->get_all_AlleleFeature_Slice(); + + foreach my $af ( @{$alleleFeatures} ) { + my $new_feature = $alignSlice->alignFeature( $af, $strainSlice1 ); + print( "Coordinates of the feature in AlignSlice are: ", + $new_feature->start, "-", $new_feature->end, "\n" ); + } + +=head1 DESCRIPTION + +A AlignStrainSlice object represents a region of a genome align for +certain strains. It can be used to align certain strains to a reference +slice. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::AlignStrainSlice; +use strict; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +=head2 new + + Arg[1] : Bio::EnsEMBL::Slice $Slice + Arg[2] : listref of Bio::EnsEMBL::StrainSlice $strainSlice + Example : push @strainSlices, $strainSlice1; + push @strainSlices, $strainSlice2; + ..... + push @strainSlices, $strainSliceN; + $alignStrainSlice = Bio::EnsEMBL::AlignStrainSlice->new(-SLICE => $slice, + -STRAIN => \@strainSlices); + Description : Creates a new Bio::EnsEMBL::AlignStrainSlice object that will contain a mapper between + the Slice object, plus all the indels from the different Strains + ReturnType : Bio::EnsEMBL::AlignStrainSlice + Exceptions : none + Caller : general + +=cut + +sub new{ + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($slice, $strainSlices) = rearrange([qw(SLICE STRAINS)],@_); + + #check that both StrainSlice and Slice are identical (must have been defined in the same slice) + foreach my $strainSlice (@{$strainSlices}){ + if (($strainSlice->start != $slice->start) || ($strainSlice->end != $slice->end) || ($strainSlice->seq_region_name ne $slice->seq_region_name)){ + warning("Not possible to create Align object from different Slices"); + return []; + } + } + + return bless{'slice' => $slice, + 'strains' => $strainSlices}, $class; +} + +=head2 alignFeature + + Arg[1] : Bio::EnsEMBL::Feature $feature + Arg[2] : Bio::EnsEMBL::StrainSlice $strainSlice + Example : $new_feature = $alignSlice->alignFeature($feature, $strainSlice); + Description : Creates a new Bio::EnsEMBL::Feature object that aligned to + the AlignStrainSlice object. + ReturnType : Bio::EnsEMBL::Feature + Exceptions : none + Caller : general + +=cut + +sub alignFeature{ + my $self = shift; + my $feature = shift; + + #check that the object is a Feature + if (!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')){ + throw("Bio::EnsEMBL::Feature object expected"); + } + #and align it to the AlignStrainSlice object + my $mapper_strain = $self->mapper(); + + my @results; + + if ($feature->start > $feature->end){ + #this is an Indel, map it with the special method + @results = $mapper_strain->map_indel('Slice',$feature->start, $feature->end, $feature->strand,'Slice'); + #and modify the coordinates according to the length of the indel + $results[0]->end($results[0]->start + $feature->length_diff -1); + } + else{ + @results = $mapper_strain->map_coordinates('Slice',$feature->start, $feature->end, $feature->strand,'Slice'); + } + #get need start and end of the new feature, aligned ot AlignStrainSlice + my @results_ordered = sort {$a->start <=> $b->start} @results; + + my %new_feature = %$feature; #make a shallow copy of the Feature + $new_feature{'start'}= $results_ordered[0]->start(); + $new_feature{'end'} = $results_ordered[-1]->end(); #get last element of the array, the end of the slice + + return bless \%new_feature, ref($feature); + +} + + +#getter for the mapper between the Slice and the different StrainSlice objects +sub mapper{ + my $self = shift; + + if (!defined $self->{'mapper'}){ + #get the alleleFeatures in all the strains + if (!defined $self->{'indels'}){ + #when the list of indels is not defined, get them + $self->{'indels'} = $self->_get_indels(); + } + my $indels = $self->{'indels'}; #gaps in reference slice + my $mapper = Bio::EnsEMBL::Mapper->new('Slice', 'AlignStrainSlice'); + my $start_slice = 1; + my $end_slice; + my $start_align = 1; + my $end_align; + my $length_indel = 0; + my $length_acum_indel = 0; + foreach my $indel (@{$indels}){ + $end_slice = $indel->[0] - 1; + $end_align = $indel->[0] - 1 + $length_acum_indel; #we must consider length previous indels + + $length_indel = $indel->[1] - $indel->[0] + 1; + + + $mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'AlignStrainSlice',$start_align,$end_align); + + $mapper->add_indel_coordinates('Slice',$end_slice + 1,$end_slice,1,'AlignStrainSlice',$end_align + 1,$end_align + $length_indel); + $start_slice = $end_slice + 1; + $start_align = $indel->[1] + 1 + $length_acum_indel; #we must consider legnth previous indels + + $length_acum_indel += $length_indel; + } + if ($start_slice <= $self->length){ + $mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'AlignStrainSlice',$start_align,$start_align + $self->length - $start_slice) + } + $self->{'mapper'} = $mapper; + + } + return $self->{'mapper'}; +} + +#returns the length of the AlignSlice: length of the Slice plus the gaps +sub length{ + my $self = shift; + my $length; + if (!defined $self->{'indels'}){ + #when the list of indels is not defined, get them + $self->{'indels'} = $self->_get_indels(); + } + $length = $self->{'slice'}->length; + map {$length += ($_->[1] - $_->[0] + 1)} @{$self->{'indels'}}; + return $length; +} + +=head2 strains + + Args : None + Description: Returns list with all strains used to + define this AlignStrainSlice object + Returntype : listref of Bio::EnsEMBL::StrainSlice objects + Exceptions : none + Caller : general + +=cut + +sub strains{ + my $self = shift; + + return $self->{'strains'}; +} + +=head2 Slice + + Args : None + Description: Returns slice where the AlignStrainSlice + is defined + Returntype : Bio::EnsEMBL::Slice object + Exceptions : none + Caller : general + +=cut + +sub Slice{ + my $self = shift; + return $self->{'slice'}; +} +#method to retrieve, in order, a list with all the indels in the different strains +sub _get_indels{ + my $self = shift; + + #go throuh all the strains getting ONLY the indels (length_diff <> 0) + my @indels; + foreach my $strainSlice (@{$self->strains}){ + my $differences = $strainSlice->get_all_AlleleFeatures_Slice(); #need to check there are differences.... + foreach my $af (@{$differences}){ + #if length is 0, but is a -, it is still a gap in the strain + if (($af->length_diff != 0) || ($af->length_diff == 0 && $af->allele_string =~ /-/)){ + push @indels, $af; + } + } + } + #need to overlap the gaps using the RangeRegistry module + my $range_registry = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + foreach my $indel (@indels){ + #in the reference and the strain there is a gap + $range_registry->check_and_register(1,$indel->start,$indel->start) if ($indel->length_diff == 0); + #deletion in reference slice + $range_registry->check_and_register(1,$indel->start, $indel->end ) if ($indel->length_diff < 0); + #insertion in reference slice + $range_registry->check_and_register(1,$indel->start,$indel->start + $indel->length_diff - 1) if ($indel->length_diff > 0); + } + #and return all the gap coordinates.... + return $range_registry->get_ranges(1); +} + +=head2 get_all_Slices + + Args : none + Description: This Slice is made of several Bio::EnsEMBL::StrainSlices + sequence. This method returns these StrainSlices (or part of + them) with the original coordinates + Returntype : listref of Bio::EnsEMBL::StrainSlice objects + Exceptions : end should be at least as big as start + Caller : general + +=cut + +sub get_all_Slices { + my $self = shift; + + my @strains; + #add the reference strain + my $dbVar = $self->Slice->adaptor->db->get_db_adaptor('variation'); + unless($dbVar) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return ''; + } + my $indAdaptor = $dbVar->get_IndividualAdaptor(); + my $ref_name = $indAdaptor->get_reference_strain_name; + my $ref_strain = Bio::EnsEMBL::StrainSlice->new( + -START => $self->Slice->{'start'}, + -END => $self->Slice->{'end'}, + -STRAND => $self->Slice->{'strand'}, + -ADAPTOR => $self->Slice->adaptor(), + -SEQ => $self->Slice->{'seq'}, + -SEQ_REGION_NAME => $self->Slice->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->Slice->{'seq_region_length'}, + -COORD_SYSTEM => $self->Slice->{'coord_system'}, + -STRAIN_NAME => $ref_name, + ); + #this is a fake reference alisce, should not contain any alleleFeature + undef $ref_strain->{'alleleFeatures'}; + + push @strains, @{$self->strains}; + my $new_feature; + my $indel; + my $aligned_features; + my $indels = (); #reference to a hash containing indels in the different strains + #we need to realign all Features in the different Slices and add '-' in the reference Slice + foreach my $strain (@{$self->strains}){ + foreach my $af (@{$strain->get_all_AlleleFeatures_Slice()}){ + $new_feature = $self->alignFeature($af); #align feature in AlignSlice coordinates + push @{$aligned_features},$new_feature if($new_feature->seq_region_start <= $strain->end); #some features might map outside slice + if ($af->start != $af->end){ #an indel, need to add to the reference, and realign in the strain + #make a shallow copy of the indel - clear it first! + $indel = undef; + %{$indel} = %{$new_feature}; + bless $indel, ref($new_feature); + $indel->allele_string('-'); + push @{$indels},$indel; #and include in the list of potential indels + } + } + next if (!defined $aligned_features); + undef $strain->{'alleleFeatures'}; #remove all features before adding new aligned features + push @{$strain->{'alleleFeatures'}}, @{$aligned_features}; + undef $aligned_features; + } + push @strains, $ref_strain; + #need to add indels in the different strains, if not present + if (defined $indels){ + foreach my $strain (@strains){ + #inlcude the indels in the StrainSlice object + push @{$strain->{'alignIndels'}},@{$indels}; + } + } + return \@strains; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Analysis.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Analysis.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,621 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Analysis.pm - Stores details of an analysis run + +=head1 SYNOPSIS + + my $obj = new Bio::EnsEMBL::Analysis( + -id => $id, + -logic_name => 'SWIRBlast', + -db => $db, + -db_version => $db_version, + -db_file => $db_file, + -program => $program, + -program_version => $program_version, + -program_file => $program_file, + -gff_source => $gff_source, + -gff_feature => $gff_feature, + -module => $module, + -module_version => $module_version, + -parameters => $parameters, + -created => $created, + -description => 'some warm words about this analysis', + -display_label => 'UNIprot alignment', + -displayable => '1', + -web_data => 'web metadata info' + ); + +=head1 DESCRIPTION + +Object to store details of an analysis run. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Analysis; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Storable; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Scalar::Util qw/isweak weaken/; + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [..] : Takes a set of named arguments + Example : $analysis = new Bio::EnsEMBL::Analysis::Analysis( + -id => $id, + -logic_name => 'SWIRBlast', + -db => $db, + -db_version => $db_version, + -db_file => $db_file, + -program => $program, + -program_version => $program_version, + -program_file => $program_file, + -gff_source => $gff_source, + -gff_feature => $gff_feature, + -module => $module, + -module_version => $module_version, + -parameters => $parameters, + -created => $created ); + Description: Creates a new Analysis object + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my($class,@args) = @_; + + my $self = bless {},$class; + + my ($id, $adaptor, $db, $db_version, $db_file, $program, $program_version, + $program_file, $gff_source, $gff_feature, $module, $module_version, + $parameters, $created, $logic_name, $description, $display_label, + $displayable, $web_data) = + + rearrange([qw(ID + ADAPTOR + DB + DB_VERSION + DB_FILE + PROGRAM + PROGRAM_VERSION + PROGRAM_FILE + GFF_SOURCE + GFF_FEATURE + MODULE + MODULE_VERSION + PARAMETERS + CREATED + LOGIC_NAME + DESCRIPTION + DISPLAY_LABEL + DISPLAYABLE + WEB_DATA + )],@args); + + $displayable ||= 0; + + $self->dbID ($id); + $self->adaptor ($adaptor); + $self->db ($db); + $self->db_version ($db_version); + $self->db_file ($db_file); + $self->program ($program); + $self->program_version($program_version); + $self->program_file ($program_file); + $self->module ($module); + $self->module_version ($module_version); + $self->gff_source ($gff_source); + $self->gff_feature ($gff_feature); + $self->parameters ($parameters); + $self->created ($created); + $self->logic_name ( $logic_name ); + $self->description( $description ); + $self->display_label( $display_label ); + $self->displayable( $displayable ); + $self->web_data ( $web_data ); + return $self; # success - we hope! +} + +=head2 new_fast + + Arg [1] : HashRef $hashref + Value to bless + Description: Bless a hash into this object type + Exceptions : none + Returntype : Bio::EnsEMBL::Analysis + Caller : general, subclass constructors + +=cut + +sub new_fast { + my ($class, $hashref) = @_; + my $self = bless $hashref, ref($class) || $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +=head2 db + + Arg [1] : string $db + Description: get/set for the attribute db + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub db { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_db} = $arg; + } + + return $self->{_db}; +} + + +=head2 db_version + + Arg [1] : string $db_version + Description: get/set for attribute db_version + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub db_version { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_db_version} = $arg; + } + + return $self->{_db_version}; +} + + +=head2 db_file + + Arg [1] : string $db_file + Description: get/set for attribute db_file + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub db_file { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_db_file} = $arg; + } + + return $self->{_db_file}; +} + + + +=head2 program + + Arg [1] : string $program + Description: get/set for attribute program + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub program { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_program} = $arg; + } + + return $self->{_program}; +} + + +=head2 program_version + + Arg [1] : string $program_version + Description: get/set for attribute program_version + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub program_version { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_program_version} = $arg; + } + + return $self->{_program_version}; +} + + +=head2 program_file + + Arg [1] : string $program_file + Description: get/set for attribute program_file + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub program_file { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_program_file} = $arg; + } + + return $self->{_program_file}; +} + + +=head2 module + + Arg [1] : string $module + Description: get/set for attribute module. Usually a RunnableDB perl + module that executes this analysis job. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub module { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_module} = $arg; + } + + return $self->{_module}; +} + + +=head2 module_version + + Arg [1] : string $module_version + Description: get/set for attribute module_version + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub module_version { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_module_version} = $arg; + } + + return $self->{_module_version}; +} + + +=head2 gff_source + + Arg [1] : string $gff_source + Description: get/set for attribute gff_source + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub gff_source { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_gff_source} = $arg; + } + + return $self->{_gff_source}; +} + + +=head2 gff_feature + + Arg [1] : string $gff_feature + Description: get/set for attribute gff_feature + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub gff_feature { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_gff_feature} = $arg; + } + + return $self->{_gff_feature}; +} + + +=head2 parameters + + Arg [1] : string $parameters + Description: get/set for attribute parameters. This should be evaluated + by the module if given or the program that is specified. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub parameters { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_parameters} = $arg; + } + + return $self->{_parameters}; +} + + +=head2 created + + Arg [1] : string $created + Description: get/set for attribute created time. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_created} = $arg; + } + + return $self->{_created}; +} + + +=head2 logic_name + + Arg [1] : string $logic_name + Description: Get/set method for the logic_name, the name under + which this typical analysis is known. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub logic_name { + my ($self, $arg ) = @_; + ( defined $arg ) && + ($self->{_logic_name} = $arg); + $self->{_logic_name}; +} + + +=head2 has_database + + Args : none + Description: tests if the db attribute is set, returns 1 if so, + 0 if not. + Returntype : int 0,1 + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub has_database{ + my ($self,@args) = @_; + + if( defined $self->db ){ return 1; } + return 0; +} + + +=head2 description + + Arg [1] : string $description + Example : none + Description: get/set for attribute description + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_description} = $arg; + } + + return $self->{_description}; +} + + +=head2 display_label + + Arg [1] : string $display_label + Description: get/set for attribute display_label + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_label { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_display_label} = $arg; + } + + return $self->{_display_label}; +} + +=head2 displayable + + Arg [1] : string $displayable + Description: get/set for attribute displayable + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub displayable { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_displayable} = $arg; + } + + return $self->{_displayable}; +} + + +=head2 web_data + + Arg [1] : string $web_data + Description: get/set for attribute web_data + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub web_data { + my ($self,$arg) = @_; + + if (defined($arg)) { + $self->{_web_data} = $arg; + } + + return $self->{_web_data}; +} + +=head2 compare + + Arg 1 : Bio::EnsEMBL::Analysis $ana + The analysis to compare to + Description: returns 1 if this analysis is special case of given analysis + returns 0 if they are equal + returns -1 if they are completely different + Returntype : int -1,0,1 + Exceptions : none + Caller : unknown + Status : Stable + +=cut + +sub compare { + my ($self, $ana ) = @_; + + throw("Object is not a Bio::EnsEMBL::Analysis") + unless $ana->isa("Bio::EnsEMBL::Analysis"); + + my $detail = 0; + + foreach my $methodName ( 'program', 'program_version', 'program_file', + 'db','db_version','db_file','gff_source','gff_feature', 'module', + 'module_version', 'parameters','logic_name' ) { + if( defined $self->$methodName() && ! $ana->can($methodName )) { + $detail = 1; + } + if( defined $self->$methodName() && ! defined $ana->$methodName() ) { + $detail = 1; + } + # if given anal is different from this, defined or not, then its different + if( defined($ana->$methodName()) && defined($self->$methodName()) && + ( $self->$methodName() ne $ana->$methodName() )) { + return -1; + } + } + if( $detail == 1 ) { return 1 }; + return 0; +} + + +1; + + + + + + + + + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Analysis/PairAlign.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Analysis/PairAlign.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,383 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +PairAlign - Dna pairwise alignment module + +=head1 SYNOPSIS + +#To convert between coordinates: + + my $cdna_coord = $pair->genomic2cDNA($gen_coord); + my $gen_coord = $pair->cDNA2genomic($cdna_coord); + +=head1 DESCRIPTION + +Contains list of sub alignments making up a dna-dna alignment + +Creation: + + my $pair = new Bio::EnsEMBL::FeaturePair( + -start => $qstart, + -end => $qend, + -strand => $qstrand, + -hstart => $hstart, + -hend => $hend, + -hend => $hstrand, + ); + + my $pairaln = new Bio::EnsEMBL::Analysis::PairAlign; + $pairaln->addFeaturePair($pair); + +Any number of pair alignments can be added to the PairAlign object + +=cut + +package Bio::EnsEMBL::Analysis::PairAlign; + +use vars qw(@ISA); +use strict; + + +@ISA = qw(Bio::EnsEMBL::Root); + +sub new { + my($class,@args) = @_; + my $self = {}; + bless $self, $class; + + $self->{'_homol'} = []; + + return $self; # success - we hope! +} + +sub addFeaturePair { + my ($self,$pair) = @_; + + $self->throw("Not a Bio::EnsEMBL::FeaturePair object") unless ($pair->isa("Bio::EnsEMBL::FeaturePair")); + + push(@{$self->{'_pairs'}},$pair); + +} + + +=head2 eachFeaturePair + + Title : eachFeaturePair + Example : my @pairs = $pair->eachFeaturePair + Returns : Array of Bio::SeqFeature::FeaturePair + Args : none +=cut + +sub eachFeaturePair { + my ($self) = @_; + + if (defined($self->{'_pairs'})) { + return @{$self->{'_pairs'}}; + } +} + +sub get_hstrand { + my ($self) = @_; + + my @features = $self->eachFeaturePair; + + return $features[0]->hstrand; +} + +=head2 genomic2cDNA + + Title : genomic2cDNA + Usage : my $cdna_coord = $pair->genomic2cDNA($gen_coord) + Function: Converts a genomic coordinate to a cdna coordinate + Returns : int + Args : int +=cut + +sub genomic2cDNA { + my ($self,$coord) = @_; + my @pairs = $self->eachFeaturePair; + + @pairs = sort {$a->start <=> $b->start} @pairs; + + my $newcoord; + + HOMOL: while (my $sf1 = shift(@pairs)) { + next HOMOL unless ($coord >= $sf1->start && $coord <= $sf1->end); + + if ($sf1->strand == 1 && $sf1->hstrand == 1) { + $newcoord = $sf1->hstart + ($coord - $sf1->start); + last HOMOL; + } elsif ($sf1->strand == 1 && $sf1->hstrand == -1) { + $newcoord = $sf1->hend - ($coord - $sf1->start); + last HOMOL; + } elsif ($sf1->strand == -1 && $sf1->hstrand == 1) { + $newcoord = $sf1->hstart + ($sf1->end - $coord); + last HOMOL; + } elsif ($sf1->strand == -1 && $sf1->hstrand == -1) { + $newcoord = $sf1->hend - ($sf1->end - $coord); + last HOMOL; + } else { + $self->throw("ERROR: Wrong strand value in FeaturePair (" . $sf1->strand . "/" . $sf1->hstrand . "\n"); + } + } + + if (defined($newcoord)) { + + return $newcoord; + } else { + $self->throw("Couldn't convert $coord"); + } +} + +=head2 cDNA2genomic + + Title : cDNA2genomic + Usage : my $gen_coord = $pair->genomic2cDNA($cdna_coord) + Function: Converts a cdna coordinate to a genomic coordinate + Example : + Returns : int + Args : int + + +=cut + +sub cDNA2genomic { + my ($self,$coord) = @_; + + my @pairs = $self->eachFeaturePair; + + my $newcoord; + + HOMOL: while (my $sf1 = shift(@pairs)) { + next HOMOL unless ($coord >= $sf1->hstart && $coord <= $sf1->hend); + + if ($sf1->strand == 1 && $sf1->hstrand == 1) { + $newcoord = $sf1->start + ($coord - $sf1->hstart); + last HOMOL; + } elsif ($sf1->strand == 1 && $sf1->hstrand == -1) { + $newcoord = $sf1->start +($sf1->hend - $coord); + last HOMOL; + } elsif ($sf1->strand == -1 && $sf1->hstrand == 1) { + $newcoord = $sf1->end - ($coord - $sf1->hstart); + last HOMOL; + } elsif ($sf1->strand == -1 && $sf1->hstrand == -1) { + $newcoord = $sf1->end - ($sf1->hend - $coord); + last HOMOL; + } else { + $self->throw("ERROR: Wrong strand value in homol (" . $sf1->strand . "/" . $sf1->hstrand . "\n"); + } + } + + if (defined ($newcoord)) { + return $newcoord; + } else { + $self->throw("Couldn't convert $coord\n"); + } +} + +sub find_Pair { + my ($self,$coord) = @_; + + foreach my $p ($self->eachFeaturePair) { + if ($coord >= $p->hstart && $coord <= $p->hend) { + return $p; + } + } +} + +=head2 convert_cDNA_feature + + Title : convert_cDNA_feature + Usage : my @newfeatures = $self->convert_cDNA_feature($f); + Function: Converts a feature on the cDNA into an array of + features on the genomic (for features that span across introns); + Returns : Array of Bio::EnsEMBL::FeaturePair + Args : Bio::EnsEMBL::FeaturePair + +=cut + +sub convert_cDNA_feature { + my ($self,$feature) = @_; + + my $foundstart = 0; + my $foundend = 0; + + my @pairs = $self->eachFeaturePair; + my @newfeatures; + + HOMOL: while (my $sf1 = shift(@pairs)) { + my $skip = 0; + #print STDERR "Looking at cDNA exon " . $sf1->hstart . "\t" . $sf1->hend . "\t" . $sf1->strand ."\n"; + + $skip = 1 unless ($feature->start >= $sf1->hstart + && $feature->start <= $sf1->hend); + + if($skip){ + #print STDERR "Skipping ".$sf1->hstart . "\t" . $sf1->hend . "\t" . $sf1->strand ."\n"; + next HOMOL; + } + if ($feature->end >= $sf1->hstart && $feature->end <= $sf1->hend) { + $foundend = 1; + } + + my $startcoord = $self->cDNA2genomic($feature->start); + my $endcoord; + + if ($sf1->hstrand == 1) { + $endcoord = $sf1->end; + } else { + $endcoord = $sf1->start; + } + + if ($foundend) { + $endcoord = $self->cDNA2genomic($feature->end); + } + + #print STDERR "Making new genomic feature $startcoord\t$endcoord\n"; + + my $tmpf = new Bio::EnsEMBL::Feature(-seqname => $feature->seqname, + -start => $startcoord, + -end => $endcoord, + -strand => $feature->strand); + push(@newfeatures,$tmpf); + last; + } + + # Now the rest of the pairs until we find the endcoord + + while ((my $sf1 = shift(@pairs)) && ($foundend == 0)) { + + if ($feature->end >= $sf1->hstart && $feature->end <= $sf1->hend) { + $foundend = 1; + } + + my $startcoord; + my $endcoord; + + if ($sf1->hstrand == 1) { + $startcoord = $sf1->start; + $endcoord = $sf1->end; + } else { + $startcoord = $sf1->end; + $endcoord = $sf1->start; + } + + if ($foundend) { + $endcoord = $self->cDNA2genomic($feature->end); + } + + # #print STDERR "Making new genomic feature $startcoord\t$endcoord\n"; + + my $tmpf = new Bio::EnsEMBL::Feature(-seqname => $feature->seqname, + -start => $startcoord, + -end => $endcoord, + -strand => $feature->strand); + push(@newfeatures,$tmpf); + } + #print STDERR "Have ".@newfeatures." features from ".$feature."\n"; + return @newfeatures; +} + + +sub convert_FeaturePair { + my ($self,$pair) = @_; + + my $hstrand = $self->get_hstrand; + + my $feat = $self->create_Feature($pair->start, $pair->end, $pair->strand, + $pair->slice); + my @newfeatures = $self->convert_cDNA_feature($feat); + my @newpairs; + + my $hitpairaln = new Bio::EnsEMBL::Analysis::PairAlign; + $hitpairaln->addFeaturePair($pair); + + foreach my $new (@newfeatures) { + + # Now we want to convert these cDNA coords into hit coords + + my $hstart1 = $self->genomic2cDNA($new->start); + my $hend1 = $self->genomic2cDNA($new->end); + + my $hstart2 = $hitpairaln->genomic2cDNA($hstart1); + my $hend2 = $hitpairaln->genomic2cDNA($hend1); + + # We can now put the final feature together + + my $finalstrand = $hstrand * $pair->strand * $pair->hstrand; + + if ($hstart2 > $hend2) { + my $tmp = $hstart2; + $hstart2 = $hend2; + $hend2 = $tmp; + } + + my $finalpair = $self->create_FeaturePair($new->start, $new->end, + $new->strand, + $hstart2, $hend2, + $finalstrand, $pair->score); + + push(@newpairs,$finalpair); + + } + + return @newpairs; +} + +sub create_FeaturePair { + my ($self, $start, $end, $strand, $hstart, $hend, + $hstrand, $score) = @_; + + my $fp = Bio::EnsEMBL::FeaturePair->new( + -start => $start, + -end => $end, + -strand => $strand, + -hstart => $hstart, + -hend => $hend, + -hstrand => $hstrand, + -score => $score, + ); + + + return $fp; +} + +sub create_Feature{ + my ($self, $start, $end, $strand, $slice) = @_; + + my $feat = new Bio::EnsEMBL::Feature(-start => $start, + -end => $end, + -strand => $strand, + -slice => $slice, + ); + return $feat; +} + +1; + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Analysis/Programs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Analysis/Programs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,196 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Analysis::Programs; +use strict; +use vars qw( %Program_Paths ); +use Carp; +use Cwd; +use Sys::Hostname; +use Bio::EnsEMBL::Utils::Exception qw ( throw ) ; + +sub import { + my $pkg = shift; + foreach (@_) { + #print STDERR "importing: $_\n"; + $Program_Paths{ $_ } = 0; + } + my( $home, @PATH, @missing ); + + $home = cwd() or die "Can't save cwd"; + my $H = [ $home, 1 ]; + + @PATH = split /:/, $ENV{'PATH'}; + foreach (@PATH) { + s|/?$|/|; # Append / to each path + } + + # For each program, check there is an executable + foreach my $program (keys %Program_Paths) { + + # Deal with paths + if ($program =~ m|/|) { + _go_home( $H ); + my $path = $program; + # Deal with tildes + $path =~ s{^~([^/]*)}{ $1 ? (getpwnam($1))[7] + : (getpwuid($>))[7] }e; + if (my $real = _is_prog( $H, $path )) { + $Program_Paths{ $program } = $real; + } + } + # Or search through all paths + else { + foreach my $path (@PATH) { + _go_home( $H ); + if (my $real = _is_prog( $H, $path, $program )) { + $Program_Paths{ $program } = $real; + last; + } + } + } + } + _go_home( $H ); # Return to home directory + + # Make a list of all missing programs + foreach my $program (keys %Program_Paths) { + push( @missing, $program ) unless $Program_Paths{ $program }; + } + + # Give informative death message if programs weren't found + if (@missing) { + throw("Unable to locate the following programs as '". (getpwuid($<))[0]. "' on host '". hostname(). "' :\t". + join ( " --> " , @missing )) ; + } +} + +# Recursive function which follows links, or tests final destination +sub _is_prog { + my( $h, $path, $prog ) = @_; + + # Need to split path if $prog not provided + unless ($prog) { + ($path, $prog) = $path =~ m|(.*?)([^/]+)$|; + } + + if (-l "$path$prog") { + # Follow link + _follow( $h, $path ) or return; + unless (-x readlink($prog)) { + confess "Can't read link '$path$prog' : $!"; + } + my $link = $prog; + $path = cwd() or confess "Can't determine cwd"; + return "$path/$prog"; + } elsif (-f _ and -x _) { + # Return full path + _follow( $h, $path ) or return; + $path = cwd() or confess "Can't determine cwd"; + return "$path/$prog"; + } else { + # Not a link or an executable plain file + return; + } +} + +# To avoid unnecessary chdir'ing +sub _follow { + my( $H, $path ) = @_; + + # Chdir without arguments goes to home dir. + # Can't use defined in test since $path may contain + # a real null string. + if ( ! $path and $path ne '0' ) { + return 1; + } elsif (chdir($path)) { + $H->[1] = 0; + return 1; + } else { + return; + } +} +sub _go_home { + my( $H ) = @_; + + # Go home unless we're already there + if ($H->[1] == 0) { + if (chdir( $H->[0] )) { + $H->[1] = 1; + } else { + confess "Can't go home to [ ", $H->[0], ' ]'; + } + } +} + +1; + +__END__ + +=head1 NAME Programs + +=head1 SYSNOPSIS + + use Bio::EnsEMBL::Analysis::Programs qw( efetch getz est2genome + /usr/local/bin/this_one + ~me/some/path/my_prog + ~/../jane/bin/her_prog ); + + # Can also do at run time + Bio::EnsEMBL::Analysis::Programs->import( $someProg ); + + $path_to_prog = $Bio::EnsEMBL::Analysis::Programs::Program_Paths{ $prog }; + +=head1 DESCRIPTION + +B is used to check at compile time for the +presence of executables which will be called from your +script. Arguments passed via the use statement +can be just the program name, or an absolute or +relative path to the program. Tildes are expanded +correctly (I using "glob"). Failure to find any +one program is fatal, and a list of all failures is +printed, along with the host''s name. + +If you want to check for a program during run time, +the import funtion can be called directly, as shown above. + +The paths to each program found are stored in the +B<%Program_Paths> hash, which is keyed on the original +arguments passed. + +=head1 BUGS + +If the executable is in the root directory, then it''s found +path will appear as "//prog" in %Program_Paths, not "/prog". + +=head1 AUTHOR + +B Email jgrg@sanger.ac.uk + + + + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/ApiVersion.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/ApiVersion.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,53 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::ApiVersion + +=head1 SYNOPSIS + + use Bio::EnsEMBL::ApiVersion; + + printf( "The API version used is %s\n", software_version() ); + +=head1 DESCRIPTION + +The module exports the software_version() subroutine which returns the +release version of the Ensembl Core API. + +=cut + +package Bio::EnsEMBL::ApiVersion; + +use strict; +use warnings; + +use Exporter; + +use base qw( Exporter ); + +our @EXPORT = qw( software_version ); + +my $API_VERSION = 68; + +sub software_version { return $API_VERSION } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/ArchiveStableId.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/ArchiveStableId.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,506 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::ArchiveStableId + +=head1 DESCRIPTION + +ArchiveStableId objects are the main workunit for retrieving stable id +archived information from EnsEMBL core database. + +Attributes: + type: Gene, Transcript, Translation, Exon, other, undef + stable_id: eg. ENSG00000000001 + version: e.g. 1 + db_name: eg. homo_sapiens_core_12_31 + release: e.g. 35 + assembly: e.g. NCBI35 + successors: listref of Bio::EnsEMBL::ArchiveStableIds + adaptor: Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor + +Status: At Risk. This module is in development. + +=head1 SEE ALSO + +Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor +Bio::EnsEMBL::StableIdEvent +Bio::EnsEMBL::StableIdHistoryTree + +=cut + +package Bio::EnsEMBL::ArchiveStableId; + +use strict; +use warnings; +no warnings qw(uninitialized); + +use Bio::EnsEMBL::Root; +our @ISA = qw(Bio::EnsEMBL::Root); + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Scalar::Util qw(weaken isweak); + +=head2 new + + Arg [STABLE_ID] : String $stable_id + Arg [VERSION] : Int $version + Arg [CURRENT_VERSION]: Int $current_version + Arg [DB_NAME] : String $db_name + Arg [RELEASE] : String $release + Arg [ASSEMBLY_NAME] : String $assembly + Arg [TYPE] : String $type - "Gene", "Transcript", "Translation", "Exon" + Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor $adaptor + Description : standard constructor with named arguments to create + ArchiveStableId + Returntype : Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general, Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor + Status : At Risk + : under development + +=cut + +sub new { + my $class = shift; + $class = ref( $class ) || $class; + + my $self = bless {}, $class; + + my ($stable_id, $version, $current_version, $db_name, $release, $assembly, + $type, $adaptor) = rearrange([qw( STABLE_ID VERSION CURRENT_VERSION DB_NAME + RELEASE ASSEMBLY TYPE ADAPTOR)], @_ ); + + $self->{'stable_id'} = $stable_id; + $self->{'version'} = $version; + $self->{'current_version'} = $current_version; + $self->{'db_name'} = $db_name; + $self->{'release'} = $release; + $self->{'assembly'} = $assembly; + $self->{'type'} = $type; + $self->adaptor($adaptor); + + return $self; +} + + +=head2 new_fast + + Arg [1] : String $stable_id + Arg [2] : Int $version + Arg [3] : String $db_name + Arg [4] : String $release + Arg [5] : String $assembly + Arg [6] : String $type - "Gene", "Transcript", "Translation", "Exon" + Arg [7] : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor $adaptor + Arg [8] : Int $current_version + Description : faster version of above constructor + Returntype : Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general, Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor + Status : At Risk + : under development + +=cut + +sub new_fast { + my $class = shift; + + $class = ref ($class) || $class; + + my $self = bless { + 'stable_id' => $_[0], + 'version' => $_[1], + 'db_name' => $_[2], + 'release' => $_[3], + 'assembly' => $_[4], + 'type' => $_[5], + 'adaptor' => $_[6], + 'current_version' => $_[7], + }, $class; + + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + + return $self; +} + + +=head2 get_history_tree + + Arg[1] : (optional) Int $num_high_scorers + number of mappings per stable ID allowed when filtering + Arg[2] : (optional) Int $max_rows + maximum number of stable IDs in history tree (used for + filtering) + Example : my $history_tree = $archive_id->get_history_tree; + Description : Returns the history tree of this ArchiveStableId + Return type : Bio::EnsEMBL::StableIdHistoryTree + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_history_tree { + my ($self, $num_high_scorers, $max_rows) = @_; + + unless ($self->{'history'}) { + $self->{'history'} = $self->adaptor->fetch_history_tree_by_stable_id( + $self->stable_id, $num_high_scorers, $max_rows); + } + + return $self->{'history'}; +} + + +=head2 get_all_predecessors + + Args : none + Description : Retrieve a list of ArchiveStableIds that were mapped to this + one. + Returntype : listref of Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_predecessors { + my $self = shift; + + my $predecessors = $self->adaptor->fetch_predecessors_by_archive_id($self); + + foreach my $pre (@$predecessors) { + $pre->successors($self); + } + + return $predecessors; +} + + +=head2 get_all_successors + + Args : none + Description : Retrieve a list of ArchiveStableIds that this one was mapped to. + Returntype : listref Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_successors { + my $self = shift; + + if ($self->{'successors'}) { + return $self->{'successors'}; + } else { + my $successors = $self->adaptor->fetch_successors_by_archive_id($self); + return $self->successors(@$successors); + } +} + + +=head2 get_peptide + + Description : Retrieves the peptide string for this ArchiveStableId. + Returntype : String, or undef if this is not a Translation or cant be found + in the database. + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_peptide { + my $self = shift; + + if ( lc( $self->type() ) eq 'translation' ) { + return $self->adaptor->get_peptide($self); + } else { + return undef; + } +} + + +=head2 get_all_associated_archived + + Example : my ($arch_gene, $arch_tr, $arch_tl, $pep_seq) = + @{ $arch_id->get_all_associated_archived }; + Description : Fetches associated archived stable IDs from the db for this + ArchiveStableId (version is taken into account). + Return type : Listref of + ArchiveStableId archived gene + ArchiveStableId archived transcript + (optional) ArchiveStableId archived translation + (optional) peptide sequence + Caller : webcode, general + Status : At Risk + : under development + +=cut + +sub get_all_associated_archived { + my $self = shift; + return $self->adaptor->fetch_associated_archived($self); +} + + +=head2 get_all_gene_archive_ids + + Example : my @archived_genes = @{ $arch_id->get_all_gene_archive_ids }; + Description : Returns gene ArchiveStableIds associated with this + ArchiveStableId. If this is a gene, it returns itself. + Returntype : listref of Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_gene_archive_ids { + my $self = shift; + + if ($self->type eq "Gene") { + return [$self]; + } else { + return $self->adaptor->fetch_all_by_archive_id($self, 'Gene'); + } +} + + +=head2 get_all_transcript_archive_ids + + Example : my @archived_transcripts = + @{ $arch_id->get_all_transcript_archive_ids }; + Description : Returns transcript ArchiveStableIds associated with this + ArchiveStableId. If this is a transcript, it returns itself. + Returntype : listref of Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_transcript_archive_ids { + my $self = shift; + + if ($self->type eq "Transcript") { + return [$self]; + } else { + return $self->adaptor->fetch_all_by_archive_id($self, 'Transcript'); + } +} + + +=head2 get_all_translation_archive_ids + + Example : my @archived_peptides = + @{ $arch_id->get_all_translation_archive_ids }; + Description : Returns translation ArchiveStableIds associated with this + ArchiveStableId. If this is a translation, it returns itself. + Returntype : listref of Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_translation_archive_ids { + my $self = shift; + + if ($self->type eq "Translation") { + return [$self]; + } else { + return $self->adaptor->fetch_all_by_archive_id($self, 'Translation'); + } +} + + +=head2 current_version + + Example : if (my $v = $arch_id->current_version) { + print "Current version of this stable ID ", $v, "\n"; + } else { + print "This stable ID is not in the current db.\n"; + } + Description : Lazy-loads the current version of stable ID + Return type : Boolean (TRUE is current version found, else FALSE) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub current_version { + my $self = shift; + + if (@_) { + $self->{'current_version'} = shift; + } elsif (! defined $self->{'current_version'}) { + if (defined $self->adaptor()) { + # lazy load + $self->adaptor()->lookup_current($self); + } + } + + return $self->{'current_version'}; +} + + +=head2 is_current + + Example : if ($arch_id->is_current) { + print $arch_id->version, " is the current version of this + stable ID.\n"; + } + Description : Determines if the version of this object is the current version + of this stable ID. Note that this method doesn't lazy-load the + current version of an ArchiveStableId; if you want to be sure, + use current_version() instead. + Return type : Boolean (TRUE if it is current, else FALSE) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_current { + my $self = shift; + return ($self->{'version'} == $self->{'current_version'}); +} + + +=head2 get_latest_incarnation + + Example : my $latest = $arch_id->get_latest_incarnation; + print "Latest version of ".$arch_id->stable_id." is ". + $latest->version."\n"; + Description : Returns the ArchiveStableId representing the latest version + of this stable ID. Returns itself if this already is the latest + version, otherwise fetches it from the db. + Return type : Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_latest_incarnation { + my $self = shift; + + return $self if ($self->is_latest); + + my $latest = $self->adaptor->fetch_by_stable_id($self->stable_id); + return $latest; +} + + +=head2 is_latest + + Arg[1] : (optional) Boolean $is_latest + Example : if ($arch_id->is_latest) { + print "Version ".$arch_id->version." is the latest version + of ".$arch_id->stable_id."\n"; + } + Description : Indicates whether this is the latest version of this stable ID. + Can also be used as a setter if we know this is the latest + version. + Return type : Boolean (TRUE if yes, FALSE if no) + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor->fetch_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub is_latest { + my $self = shift; + $self->{'is_latest'} = shift if (@_); + return ($self->{'is_latest'} || $self->is_current); +} + + +# +# getter/setters for attributes +# + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if (@_); + return $self->{'stable_id'}; +} + +sub version { + my $self = shift; + $self->{'version'} = shift if (@_); + return $self->{'version'}; +} + +sub db_name { + my $self = shift; + $self->{'db_name'} = shift if (@_); + return $self->{'db_name'}; +} + +sub release { + my $self = shift; + $self->{'release'} = shift if (@_); + return $self->{'release'}; +} + +sub assembly { + my $self = shift; + $self->{'assembly'} = shift if (@_); + return $self->{'assembly'}; +} + +sub type { + my $self = shift; + $self->{'type'} = shift if (@_); + return $self->{'type'}; +} + +sub adaptor { + my $self = shift; + weaken($self->{'adaptor'} = shift) if (@_); + return $self->{'adaptor'}; +} + +sub successors { + my $self = shift; + $self->{'successors'} = \@_; + return $self->{'successors'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/AssemblyExceptionFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/AssemblyExceptionFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,186 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::AssemblyExceptionFeature - A feature that represents an assembly exception + +=head1 SYNOPSIS + + use Bio::EnsEMBL::AssemblyExceptionFeature; + + $feature = Bio::EnsEMBL::AssemblyExceptionFeature->new( + -start => 100, + -end => 220, + -type => 'HAP', + -slice => $slice, + -adaptor => $adaptor + ); + +=head1 DESCRIPTION + +Certain features, e.g. Haplotypes and PARs, are represented as +"exceptions" to the normal assembly. This class represents such +features. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::AssemblyExceptionFeature; + +use strict; + +use vars qw(@ISA); + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Scalar::Util qw(weaken isweak); + +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [TYPE] : The type (e.g. HAP for haplotype, PAR for PAR) + Arg [...] : Named arguments passed to superclass + Example : $feature = Bio::EnsEMBL::AssemblyExceptionFeature->new + (-start => 1, + -end => 100, + -slice => $slice, + -alternate_slice => $alt_slice, + -adaptor => $adaptor, + -type => 'HAP') + Description: Constructs a new Bio::EnsEMBL::Feature. Generally subclasses + of this method are instantiated, rather than this class itself. + Returntype : Bio::EnsEMBL::Feature + Exceptions : Thrown on invalid -SLICE arguments + Caller : general, subclass constructors + Status : Stable + +=cut + +sub new { + + my $caller = shift; + + # allow this to be called as class or object method + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($type, $alternate_slice) = rearrange(['TYPE', 'ALTERNATE_SLICE'],@_); + $self->{'type'} = $type; + $self->{'alternate_slice'} = $alternate_slice; + + return $self; +} + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Feature using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Feature + Caller : general, subclass constructors + Status : Stable + +=cut + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 type + + Arg [1] : (optional) string $value + Example : $type = $assembly_exception_feature->type(); + Description: Getter/Setter for the type associated with this + feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub type { + + my $self = shift; + + $self->{'type'} = shift if(@_); + + return $self->{'type'}; +} + + +=head2 alternate_slice + + Arg [1] : (optional) string $value + Example : $alt_slice = $assembly_exception_feature->alternate_slice(); + Description: Getter/Setter for the alternate slice associated with this feature. + The alternate slice represents the "other side" of the AssemblyExceptionFeature. + Returntype : Bio::EnsEMBL::Slice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub alternate_slice { + + my $self = shift; + + $self->{'alternate_slice'} = shift if(@_); + + return $self->{'alternate_slice'}; +} + + + +=head2 display_id + + Arg [1] : none + Example : print $aef->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For assembly exception features + this is the name of the alternate seqregion or '' if the + alternate slice is not defined. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + my $slice = $self->{'alternate_slice'}; + return '' if(!$slice); + return $slice->seq_region_name(); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/AssemblyMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/AssemblyMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,880 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::AssemblyMapper - +Handles mapping between two coordinate systems using the information +stored in the assembly table. + +=head1 SYNOPSIS + + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + $asma = $db->get_AssemblyMapperAdaptor(); + $csa = $db->get_CoordSystemAdaptor(); + + my $chr_cs = $cs_adaptor->fetch_by_name( 'chromosome', 'NCBI33' ); + my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); + + $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); + + # Map to contig coordinate system from chromosomal. + @ctg_coords = + $asm_mapper->map( 'X', 1_000_000, 2_000_000, 1, $chr_cs ); + + # Map to chromosome coordinate system from contig. + @chr_coords = + $asm_mapper->map( 'AL30421.1.200.92341', 100, 10000, -1, + $ctg_cs ); + + # List contig names for a region of chromsome. + @ctg_ids = $asm_mapper->list_ids( '13', 1_000_000, 1, $chr_cs ); + + # List chromosome names for a contig region. + @chr_ids = + $asm_mapper->list_ids( 'AL30421.1.200.92341', 1, 1000, -1, + $ctg_cs ); + +=head1 DESCRIPTION + +The AssemblyMapper is a database aware mapper which faciliates +conversion of coordinates between any two coordinate systems with an +relationship explicitly defined in the assembly table. In the future +it may be possible to perform multiple step (implicit) mapping between +coordinate systems. + +It is implemented using the Bio::EnsEMBL::Mapper object, which is a +generic mapper object between disjoint coordinate systems. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::AssemblyMapper; + +use strict; +use warnings; + +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); +use Scalar::Util qw(weaken); + +my $ASSEMBLED = 'assembled'; +my $COMPONENT = 'component'; + +my $DEFAULT_MAX_PAIR_COUNT = 1000; + + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Arg [2] : Bio::EnsEMBL::CoordSystem $asm_cs + Arg [3] : Bio::EnsEMBL::CoordSystem $cmp_cs + Example : Should use AssemblyMapperAdaptor->fetch_by_CoordSystems() + Description: Creates a new AssemblyMapper + Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Exceptions : Throws if multiple coord_systems are provided + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +sub new { + my ( $proto, $adaptor, @coord_systems ) = @_; + + my $class = ref($proto) || $proto; + + my $self = bless( {}, $class ); + + $self->adaptor($adaptor); + + $adaptor->cache_seq_ids_with_mult_assemblys(); + + if ( @coord_systems != 2 ) { + throw( 'Can only map between two coordinate systems. ' + . scalar(@coord_systems) + . ' were provided' ); + } + + # Set the component and assembled coordinate systems + $self->{'asm_cs'} = $coord_systems[0]; + $self->{'cmp_cs'} = $coord_systems[1]; + + # We load the mapper calling the 'ASSEMBLED' the 'from' coord system + # and the 'COMPONENT' the 'to' coord system. + + $self->{'mapper'} = Bio::EnsEMBL::Mapper->new( $ASSEMBLED, $COMPONENT, + $coord_systems[0], $coord_systems[1] ); + + $self->{'max_pair_count'} = $DEFAULT_MAX_PAIR_COUNT; + + return $self; +} ## end sub new + +=head2 max_pair_count + + Arg [1] : (optional) int $max_pair_count + Example : $mapper->max_pair_count(100000) + Description: Getter/Setter for the number of mapping pairs allowed + in the internal cache. This can be used to override + the default value (1000) to tune the performance and + memory usage for certain scenarios. Higher value + means bigger cache, more memory used. + Return type: int + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub max_pair_count { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'max_pair_count'} = $value; + } + + return $self->{'max_pair_count'}; +} + +=head2 register_all + + Arg [1] : None + Example : $mapper->max_pair_count(10e6); + $mapper->register_all(); + Description: Pre-registers all assembly information in this + mapper. The cache size should be set to a + sufficiently large value so that all of the + information can be stored. This method is useful + when *a lot* of mapping will be done in regions + which are distributed around the genome. After + registration the mapper will consume a lot of memory + but will not have to perform any SQL and will be + faster. + Return type: None + Exceptions : None + Caller : Specialised programs doing a lot of mapping. + Status : Stable + +=cut + +sub register_all { + my ($self) = @_; + + $self->adaptor()->register_all($self); +} + +=head2 map + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM. + Arg [2] : int $frm_start + The start of the region to transform FROM. + Arg [3] : int $frm_end + The end of the region to transform FROM. + Arg [4] : int $strand + The strand of the region to transform FROM. + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM + Example : @coords = + $asm_mapper->map( 'X', 1_000_000, 2_000_000, 1, + $chr_cs ); + Description: Transforms coordinates from one coordinate system to + another. + Return type: List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects. + Exceptions : Throws if if the specified TO coordinat system is not + one of the coordinate systems associated with this + assembly mapper. + Caller : General + Status : Stable + +=cut + +sub map { + throw('Incorrect number of arguments.') if (!( @_ >= 6)); + + my ( $self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, + $frm_cs, $to_slice ) + = @_; + + my $mapper = $self->{'mapper'}; + my $asm_cs = $self->{'asm_cs'}; + my $cmp_cs = $self->{'cmp_cs'}; + my $adaptor = $self->{'adaptor'}; + my $frm; + + + my $seq_region_id = + $self->adaptor() + ->seq_regions_to_ids( $frm_cs, [$frm_seq_region_name] )->[0]; + + # Speed critical section: + # Try to do simple pointer equality comparisons of the coord system + # objects first since this is likely to work most of the time and is + # much faster than a function call. + + if ( $frm_cs == $cmp_cs + || ( $frm_cs != $asm_cs && $frm_cs->equals($cmp_cs) ) ) + { + if ( !$self->{'cmp_register'}->{$seq_region_id} ) { + $adaptor->register_component( $self, $seq_region_id ); + } + $frm = $COMPONENT; + + } elsif ( $frm_cs == $asm_cs || $frm_cs->equals($asm_cs) ) { + + # This can be probably be sped up some by only calling registered + # assembled if needed. + $adaptor->register_assembled( $self, $seq_region_id, $frm_start, + $frm_end ); + $frm = $ASSEMBLED; + + } else { + + throw( + sprintf( "Coordinate system %s %s is neither the assembled " + . "nor the component coordinate system " + . "of this AssemblyMapper", + $frm_cs->name(), $frm_cs->version() ) ); + + } + + return + $mapper->map_coordinates( $seq_region_id, $frm_start, $frm_end, + $frm_strand, $frm ); +} ## end sub map + + +=head2 flush + + Args : None + Example : None + Description: Remove all cached items from this AssemblyMapper. + Return type: None + Exceptions : None + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +sub flush { + my ($self) = @_; + + $self->{'mapper'}->flush(); + $self->{'cmp_register'} = {}; + $self->{'asm_register'} = {}; +} + +=head2 size + + Args : None + Example : $num_of_pairs = $mapper->size(); + Description: Returns the number of pairs currently stored. + Return type: int + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub size { + my ($self) = @_; + + return $self->{'mapper'}->{'pair_count'}; +} + +=head2 fastmap + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM. + Arg [2] : int $frm_start + The start of the region to transform FROM. + Arg [3] : int $frm_end + The end of the region to transform FROM. + Arg [4] : int $strand + The strand of the region to transform FROM. + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM. + Example : @coords = + $asm_mapper->map( 'X', 1_000_000, 2_000_000, 1, + $chr_cs ); + Description: Transforms coordinates from one coordinate system to + another. + Return type: List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects. + Exceptions : Throws if the specified TO coordinat system is not + one of the coordinate systems associated with this + assembly mapper. + Caller : General + Status : Stable + +=cut + +sub fastmap { + if ( @_ != 6 ) { + throw('Incorrect number of arguments.'); + } + + my ( $self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, + $frm_cs ) + = @_; + + my $mapper = $self->{'mapper'}; + my $asm_cs = $self->{'asm_cs'}; + my $cmp_cs = $self->{'cmp_cs'}; + my $adaptor = $self->adaptor(); + my $frm; + + my @tmp; + push @tmp, $frm_seq_region_name; + + my $seq_region_id = + $self->adaptor()->seq_regions_to_ids( $frm_cs, \@tmp )->[0]; + + # Speed critical section: + # Try to do simple pointer equality comparisons of the coord system + # objects first since this is likely to work most of the time and is + # much faster than a function call. + + if ( $frm_cs == $cmp_cs + || ( $frm_cs != $asm_cs && $frm_cs->equals($cmp_cs) ) ) + { + + if ( !$self->{'cmp_register'}->{$seq_region_id} ) { + $adaptor->register_component( $self, $seq_region_id ); + } + $frm = $COMPONENT; + + } elsif ( $frm_cs == $asm_cs || $frm_cs->equals($asm_cs) ) { + + # This can be probably be sped up some by only calling registered + # assembled if needed + $adaptor->register_assembled( $self, $seq_region_id, $frm_start, + $frm_end ); + $frm = $ASSEMBLED; + + } else { + + throw( + sprintf( "Coordinate system %s %s is neither the assembled " + . "nor the component coordinate system " + . "of this AssemblyMapper", + $frm_cs->name(), $frm_cs->version() ) ); + + } + + return + $mapper->fastmap( $seq_region_id, $frm_start, $frm_end, $frm_strand, + $frm ); +} ## end sub fastmap + +=head2 list_ids + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest. + Arg [2] : int $frm_start + The start of the region of interest. + Arg [3] : int $frm_end + The end of the region to transform of interest. + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping IDs of. + Example : foreach my $id ( + $asm_mapper->list_ids( 'X', 1, 1000, $ctg_cs ) ) + { ... } + Description: Retrieves a list of overlapping seq_region names of + another coordinate system. This is the same as the + list_ids method but uses seq_region names rather + internal IDs. + Return type: List of strings. + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub list_ids { + if ( @_ != 5 ) { + throw('Incorrect number of arguments.'); + } + + my ( $self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs ) = + @_; + + my @tmp = ($frm_seq_region_name); + + my $seq_region_id = + $self->adaptor()->seq_regions_to_ids( $frm_cs, \@tmp )->[0]; + + if ( $frm_cs->equals( $self->component_CoordSystem() ) ) { + + if ( !$self->have_registered_component($seq_region_id) ) { + $self->adaptor->register_component( $self, $seq_region_id ); + } + + # Pull out the 'from' identifiers of the mapper pairs. The we + # loaded the assembled side as the 'from' side in the constructor. + + return + map ( { $_->from()->id() } + $self->mapper()->list_pairs( + $seq_region_id, $frm_start, $frm_end, $COMPONENT + ) ); + + } elsif ( $frm_cs->equals( $self->assembled_CoordSystem() ) ) { + + $self->adaptor->register_assembled( $self, $seq_region_id, + $frm_start, $frm_end ); + + # Pull out the 'to' identifiers of the mapper pairs we loaded the + # component side as the 'to' coord system in the constructor. + + return + map ( { $_->to->id() } + $self->mapper()->list_pairs( + $seq_region_id, $frm_start, $frm_end, $ASSEMBLED + ) ); + + } else { + + throw( + sprintf( "Coordinate system %s %s is neither the assembled " + . "nor the component coordinate system " + . "of this AssemblyMapper", + $frm_cs->name(), $frm_cs->version() ) ); + + } +} ## end sub list_ids + +#sub list_seq_regions { +# throw('Incorrect number of arguments.') if(@_ != 5); +# my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs) = @_; + +# if($frm_cs->equals($self->component_CoordSystem())) { + +# if(!$self->have_registered_component($seq_region_id)) { +# $self->adaptor->register_component($self,$seq_region_id); +# } + +# #pull out the 'from' identifiers of the mapper pairs. The +# #we loaded the assembled side as the 'from' side in the constructor +# return +# map {$_->from()->id()} +# $self->mapper()->list_pairs($seq_region_id, $frm_start, +# $frm_end, $COMPONENT); + +# } elsif($frm_cs->equals($self->assembled_CoordSystem())) { + +# $self->adaptor->register_assembled($self, +# $frm_seq_region,$frm_start,$frm_end); + +# #pull out the 'to' identifiers of the mapper pairs +# #we loaded the component side as the 'to' coord system in the constructor +# return +# map {$_->to->id()} +# $self->mapper()->list_pairs($frm_seq_region, $frm_start, +# $frm_end, $ASSEMBLED); +# } else { +# throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . +# " is neither the assembled nor the component coordinate system " . +# " of this AssemblyMapper"); +# } +#} + + +=head2 list_seq_regions + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest. + Arg [2] : int $frm_start + The start of the region of interest. + Arg [3] : int $frm_end + The end of the region to transform of interest. + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping IDs of. + Example : foreach my $id ( + $asm_mapper->list_seq_regions( + 'X', 1, 1000, $chr_cs + ) ) { ... } + Description: Retrieves a list of overlapping seq_region internal + identifiers of another coordinate system. This is + the same as the list_seq_regions method but uses + internal identfiers rather than seq_region strings. + Return type: List of ints. + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub list_seq_regions { + if ( @_ != 5 ) { + throw('Incorrect number of arguments.'); + } + + my ( $self, $frm_seq_region, $frm_start, $frm_end, $frm_cs ) = @_; + + # Retrieve the seq_region names. + + my @seq_ids = + $self->list_ids( $frm_seq_region, $frm_start, $frm_end, $frm_cs ); + + # The seq_regions are from the 'to' coordinate system not the from + # coordinate system we used to obtain them. + + my $to_cs; + if ( $frm_cs->equals( $self->assembled_CoordSystem() ) ) { + $to_cs = $self->component_CoordSystem(); + } else { + $to_cs = $self->assembled_CoordSystem(); + } + + # Convert them to IDs. + return @{ $self->adaptor()->seq_ids_to_regions( \@seq_ids ) }; +} + +#sub list_ids { +# throw('Incorrect number of arguments.') if(@_ != 5); +# my($self, $frm_seq_region, $frm_start, $frm_end, $frm_cs) = @_; + +# #retrieve the seq_region names +# my @seq_regs = +# $self->list_seq_regions($frm_seq_region,$frm_start,$frm_end,$frm_cs); + +# #The seq_regions are from the 'to' coordinate system not the +# #from coordinate system we used to obtain them +# my $to_cs; +# if($frm_cs->equals($self->assembled_CoordSystem())) { +# $to_cs = $self->component_CoordSystem(); +# } else { +# $to_cs = $self->assembled_CoordSystem(); +# } + +# #convert them to ids +# return @{$self->adaptor()->seq_regions_to_ids($to_cs, \@seq_regs)}; +#} + +=head2 have_registered_component + + Arg [1] : string $cmp_seq_region + The name of the sequence region to check for + registration. + Example : if ( $asm_mapper->have_registered_component('AL240214.1') ) {} + Description: Returns true if a given component region has + been registered with this assembly mapper. This + should only be called by this class or the + AssemblyMapperAdaptor. In other words, do not use + this method unless you really know what you are + doing. + Return type: Boolean (0 or 1) + Exceptions : Throws on incorrect arguments. + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub have_registered_component { + my ( $self, $cmp_seq_region ) = @_; + + if ( !defined($cmp_seq_region) ) { + throw('cmp_seq_region argument is required'); + } + + if ( exists( $self->{'cmp_register'}->{$cmp_seq_region} ) ) { + return 1; + } + + return 0; +} + +=head2 have_registered_assembled + + Arg [1] : string $asm_seq_region + The name of the sequence region to check for + registration. + Arg [2] : int $chunk_id + The chunk number of the provided seq_region to check + for registration. + Example : if ( $asm_mapper->have_registered_component( 'X', 9 ) ) { } + Description: Returns true if a given assembled region chunk + has been registered with this assembly mapper. + This should only be called by this class or the + AssemblyMapperAdaptor. In other words, do not use + this method unless you really know what you are + doing. + Return type: Boolean (0 or 1) + Exceptions : Throws on incorrect arguments + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub have_registered_assembled { + my ( $self, $asm_seq_region, $chunk_id ) = @_; + + if ( !defined($asm_seq_region) ) { + throw('asm_seq_region argument is required'); + } + if ( !defined($chunk_id) ) { + throw('chunk_id is required'); + } + + if ( + exists( $self->{'asm_register'}->{$asm_seq_region}->{$chunk_id} ) ) + { + return 1; + } + + return 0; +} + + +=head2 register_component + + Arg [1] : integer $cmp_seq_region + The dbID of the component sequence region to + register. + Example : $asm_mapper->register_component('AL312341.1'); + Description: Flags a given component sequence region as registered + in this assembly mapper. This should only be called + by this class or the AssemblyMapperAdaptor. + Return type: None + Exceptions : Throws on incorrect arguments + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub register_component { + my ( $self, $cmp_seq_region ) = @_; + + if ( !defined($cmp_seq_region) ) { + throw('cmp_seq_region argument is required'); + } + + $self->{'cmp_register'}->{$cmp_seq_region} = 1; +} + +=head2 register_assembled + + Arg [1] : integer $asm_seq_region + The dbID of the sequence region to register. + Arg [2] : int $chunk_id + The chunk number of the provided seq_region to register. + Example : $asm_mapper->register_assembled( 'X', 4 ); + Description: Flags a given assembled region as registered in this + assembly mapper. This should only be called by this + class or the AssemblyMapperAdaptor. Do not call this + method unless you really know what you are doing. + Return type: None + Exceptions : Throws on incorrect arguments + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub register_assembled { + my ( $self, $asm_seq_region, $chunk_id ) = @_; + + if ( !defined($asm_seq_region) ) { + throw('asm_seq_region argument is required'); + } + if ( !defined($chunk_id) ) { + throw('chunk_id srgument is required'); + } + + $self->{'asm_register'}->{$asm_seq_region}->{$chunk_id} = 1; +} + +=head2 mapper + + Arg [1] : None + Example : $mapper = $asm_mapper->mapper(); + Description: Retrieves the internal mapper used by this Assembly + Mapper. This is unlikely to be useful unless you + _really_ know what you are doing. + Return type: Bio::EnsEMBL::Mapper + Exceptions : None + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub mapper { + my ($self) = @_; + + return $self->{'mapper'}; +} + +=head2 assembled_CoordSystem + + Arg [1] : None + Example : $cs = $asm_mapper->assembled_CoordSystem(); + Description: Retrieves the assembled CoordSystem from this + assembly mapper. + Return type: Bio::EnsEMBL::CoordSystem + Exceptions : None + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub assembled_CoordSystem { + my ($self) = @_; + + return $self->{'asm_cs'}; +} + +=head2 component_CoordSystem + + Arg [1] : None + Example : $cs = $asm_mapper->component_CoordSystem(); + Description: Retrieves the component CoordSystem from this + assembly mapper. + Return type: Bio::EnsEMBL::CoordSystem + Exceptions : None + Caller : Internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub component_CoordSystem { + my ($self) = @_; + + return $self->{'cmp_cs'}; +} + +=head2 adaptor + + Arg [1] : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor $adaptor + Description: Getter/set terfor this object's database adaptor. + Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub adaptor { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + weaken($self->{'adaptor'} = $value); + } + + return $self->{'adaptor'}; +} + +=head2 in_assembly + + Description: DEPRECATED, use map() or list_ids() instead. + +=cut + +sub in_assembly { + my ( $self, $object ) = @_; + + deprecate('Use map() or list_ids() instead.'); + + my $csa = $self->db->get_CoordSystemAdaptor(); + + my $top_level = $csa->fetch_top_level(); + + my $asma = + $self->adaptor->fetch_by_CoordSystems( $object->coord_system(), + $top_level ); + + my @list = $asma->list_ids( $object->seq_region(), + $object->start(), + $object->end(), + $object->coord_system() ); + + return ( @list > 0 ); +} + +=head2 map_coordinates_to_assembly + + Description: DEPRECATED, use map() instead. + +=cut + +sub map_coordinates_to_assembly { + my ( $self, $contig_id, $start, $end, $strand ) = @_; + + deprecate('Use map() instead.'); + + # Not sure if contig_id is seq_region_id or name... + return + $self->map( $contig_id, $start, $end, $strand, + $self->contig_CoordSystem() ); + +} + +=head2 fast_to_assembly + + Description: DEPRECATED, use map() instead. + +=cut + +sub fast_to_assembly { + my ( $self, $contig_id, $start, $end, $strand ) = @_; + + deprecate('Use map() instead.'); + + # Not sure if contig_id is seq_region_id or name... + return + $self->map( $contig_id, $start, $end, $strand, + $self->contig_CoordSystem() ); +} + +=head2 map_coordinates_to_rawcontig + + Description: DEPRECATED, use map() instead. + +=cut + +sub map_coordinates_to_rawcontig { + my ( $self, $chr_name, $start, $end, $strand ) = @_; + + deprecate('Use map() instead.'); + + return + $self->map( $chr_name, $start, $end, $strand, + $self->assembled_CoordSystem() ); +} + +=head2 list_contig_ids + + Description: DEPRECATED, use list_ids() instead. + +=cut + +sub list_contig_ids { + my ( $self, $chr_name, $start, $end ) = @_; + + deprecate('Use list_ids() instead.'); + + return + $self->list_ids( $chr_name, $start, $end, + $self->assembled_CoordSystem() ); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Attribute.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Attribute.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,192 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Attribute - A generic Attribute class. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Attribute; + + my $attribute = Bio::EnsEMBL::Attribute->new + (-CODE => 'myCode', + -NAME => 'My Attribute', + -DESCRIPTION => 'This is my attribute description.', + -VALUE => '10023'); + + print $attrib->name(), "\n"; + print $attrib->code(), "\n"; + print $attrib->description(), "\n"; + print $attrib->value(), "\n"; + +=head1 DESCRIPTION + +This is a generic attribute class used to represent attributes +associated with seq_regions (and their Slices) and MiscFeatures. + +=head1 SEE ALSO + +Bio::EnsEMBL::Slice +Bio::EnsEMBL::MiscFeature +Bio::EnsEMBL::DBSQL::AttributeAdaptor + +=cut + +package Bio::EnsEMBL::Attribute; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Scalar::Util qw(weaken isweak); + +=head2 new + + Arg [-CODE] : string - the code for this attribute + Arg [-NAME] : string - a human readable name for this attribute + Arg [-DESCRIPTION] : string - a description for this attribute + Arg [-VALUE] : value - the value of this attribute + Example : my $attribute = Bio::EnsEMBL::Attribute->new + (-CODE => 'myCode', + -NAME => 'My Attribute', + -DESCRIPTION => 'This is my attribute description.', + -VALUE => '10023'); + Description : Constructor. Instantiates a Bio::EnsEMBL::Attribute object. + Returntype : Bio::EnsEMBL::Attribute + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub new { + my $caller = shift; + + # allow to be called as class or object method + my $class = ref($caller) || $caller; + + my ($code, $name, $desc, $value) = + rearrange([qw(CODE NAME DESCRIPTION VALUE)], @_); + + return bless {'code' => $code, + 'name' => $name, + 'description' => $desc, + 'value' => $value}, $class; +} + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Attribute using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Attribute + Caller : general, subclass constructors + Status : Stable + +=cut + + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 code + + Arg [1] : string $code (optional) + Example : $code = $attribute->code(); + Description: Getter/Setter for code attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub code { + my $self = shift; + $self->{'code'} = shift if(@_); + return $self->{'code'}; +} + + +=head2 name + + Arg [1] : string $name (optional) + Example : $name = $attribute->name(); + Description: Getter/Setter for name attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + +=head2 description + + Arg [1] : string $description (optional) + Example : $description = $attribute->description(); + Description: Getter/Setter for description attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if(@_); + return $self->{'description'}; +} + + +=head2 value + + Arg [1] : string $value (optional) + Example : $value = $attribute->value(); + Description: Getter/Setter for value attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub value { + my $self = shift; + $self->{'value'} = shift if(@_); + return $self->{'value'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/BaseAlignFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/BaseAlignFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,897 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::BaseAlignFeature - Baseclass providing a common abstract +implmentation for alignment features + +=head1 SYNOPSIS + + my $feat = new Bio::EnsEMBL::DnaPepAlignFeature( + -slice => $slice, + -start => 100, + -end => 120, + -strand => 1, + -hseqname => 'SP:RF1231', + -hstart => 200, + -hend => 220, + -analysis => $analysis, + -cigar_string => '10M3D5M2I' + ); + + Alternatively if you have an array of ungapped features + + my $feat = + new Bio::EnsEMBL::DnaPepAlignFeature( -features => \@features ); + + Where @features is an array of Bio::EnsEMBL::FeaturePair + + There is a method to manipulate the cigar_string into ungapped features + + my @ungapped_features = $feat->ungapped_features(); + + This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair + + $analysis is a Bio::EnsEMBL::Analysis object + + Bio::EnsEMBL::Feature methods can be used + Bio::EnsEMBL::FeaturePair methods can be used + + The cigar_string contains the ungapped pieces that make up the gapped + alignment + + It looks like: n Matches [ x Deletes or Inserts m Matches ]* + but a bit more condensed like "23M4I12M2D1M" + and evenmore condensed as you can ommit 1s "23M4I12M2DM" + + + To make things clearer this is how a blast HSP would be parsed + + >AK014066 + Length = 146 + + Minus Strand HSPs: + + Score = 76 (26.8 bits), Expect = 1.4, P = 0.74 + Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1 + + Query: 479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300 + G APPP PQG R P P G + P L + + ++ R +A + + Sbjct: 7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64 + + Query: 299 SSPHNPSPLPS 267 + H P+P P+ + Sbjct: 65 PLTHTPTPTPT 75 + + The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2 + and the strand is -1. + + The alignment is made up of the following ungapped pieces : + + sequence 1 start 447 , sequence 2 start 7 , match length 33 , strand -1 + sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1 + sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1 + + These ungapped pieces are made up into the following string (called a cigar + string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75 + hstrand 1 and feature type would be DnaPepAlignFeature + +=cut + + +package Bio::EnsEMBL::BaseAlignFeature; + +use Bio::EnsEMBL::FeaturePair; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Scalar::Util qw(weaken isweak); + +use vars qw(@ISA); +use strict; + +@ISA = qw(Bio::EnsEMBL::FeaturePair); + + +=head2 new + + Arg [..] : List of named arguments. (-cigar_string , -features) defined + in this constructor, others defined in FeaturePair and + SeqFeature superclasses. Either cigar_string or a list + of ungapped features should be provided - not both. + Example : $baf = new BaseAlignFeatureSubclass(-cigar_string => '3M3I12M'); + Description: Creates a new BaseAlignFeature using either a cigarstring or + a list of ungapped features. BaseAlignFeature is an abstract + baseclass and should not actually be instantiated - rather its + subclasses should be. + Returntype : Bio::EnsEMBL::BaseAlignFeature + Exceptions : thrown if both feature and cigar string args are provided + thrown if neither feature nor cigar string args are provided + Caller : general + Status : Stable + +=cut + +sub new { + + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($cigar_string,$features) = rearrange([qw(CIGAR_STRING FEATURES)], @_); + + if (defined($cigar_string) && defined($features)) { + throw("CIGAR_STRING or FEATURES argument is required - not both."); + } elsif (defined($features)) { + $self->_parse_features($features); + + } elsif (defined($cigar_string)) { + $self->{'cigar_string'} = $cigar_string; + } else { + throw("CIGAR_STRING or FEATURES argument is required"); + } + + return $self; +} + + +=head2 new_fast + + Arg [1] : hashref $hashref + A hashref which will be blessed into a PepDnaAlignFeature. + Example : none + Description: This allows for very fast object creation when a large number + of PepDnaAlignFeatures needs to be created. This is a bit of + a hack but necessary when thousands of features need to be + generated within a couple of seconds for web display. It is + not recommended that this method be called unless you know what + you are doing. It requires knowledge of the internals of this + class and its superclasses. + Returntype : Bio::EnsEMBL::BaseAlignFeature + Exceptions : none + Caller : none currently + Status : Stable + +=cut + +sub new_fast { + my ($class, $hashref) = @_; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 cigar_string + + Arg [1] : string $cigar_string + Example : $feature->cigar_string( "12MI3M" ); + Description: get/set for attribute cigar_string + cigar_string describes the alignment. "xM" stands for + x matches (mismatches), "xI" for inserts into query sequence + (thats the ensembl sequence), "xD" for deletions + (inserts in the subject). an "x" that is 1 can be omitted. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub cigar_string { + my $self = shift; + $self->{'cigar_string'} = shift if(@_); + return $self->{'cigar_string'}; +} + + +=head2 alignment_length + + Arg [1] : None + Description: return the alignment length (including indels) based on the cigar_string + Returntype : int + Exceptions : + Caller : + Status : Stable + +=cut + +sub alignment_length { + my $self = shift; + + if (! defined $self->{'_alignment_length'} && defined $self->cigar_string) { + + my @pieces = ( $self->cigar_string =~ /(\d*[MDI])/g ); + unless (@pieces) { + print STDERR "Error parsing cigar_string\n"; + } + my $alignment_length = 0; + foreach my $piece (@pieces) { + my ($length) = ( $piece =~ /^(\d*)/ ); + if (! defined $length || $length eq "") { + $length = 1; + } + $alignment_length += $length; + } + $self->{'_alignment_length'} = $alignment_length; + } + return $self->{'_alignment_length'}; +} + +=head2 ungapped_features + + Args : none + Example : @ungapped_features = $align_feature->get_feature + Description: converts the internal cigar_string into an array of + ungapped feature pairs + Returntype : list of Bio::EnsEMBL::FeaturePair + Exceptions : cigar_string not set internally + Caller : general + Status : Stable + +=cut + +sub ungapped_features { + my ($self) = @_; + + if (!defined($self->{'cigar_string'})) { + throw("No cigar_string defined. Can't return ungapped features"); + } + + return @{$self->_parse_cigar()}; +} + +=head2 strands_reversed + + Arg [1] : int $strands_reversed + Description: get/set for attribute strands_reversed + 0 means that strand and hstrand are the original strands obtained + from the alignment program used + 1 means that strand and hstrand have been flipped as compared to + the original result provided by the alignment program used. + You may want to use the reverse_complement method to restore the + original strandness. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub strands_reversed { + my ($self, $arg) = @_; + + if ( defined $arg ) { + $self->{'strands_reversed'} = $arg ; + } + + $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'}); + + return $self->{'strands_reversed'}; +} + +=head2 reverse_complement + + Args : none + Description: reverse complement the FeaturePair, + modifing strand, hstrand and cigar_string in consequence + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub reverse_complement { + my ($self) = @_; + + # reverse strand in both sequences + $self->strand($self->strand * -1); + $self->hstrand($self->hstrand * -1); + + # reverse cigar_string as consequence + my $cigar_string = $self->cigar_string; + $cigar_string =~ s/(D|I|M)/$1 /g; + my @cigar_pieces = split / /,$cigar_string; + $cigar_string = ""; + while (my $piece = pop @cigar_pieces) { + $cigar_string .= $piece; + } + + $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'}); + + if ($self->strands_reversed) { + $self->strands_reversed(0) + } else { + $self->strands_reversed(1); + } + + $self->cigar_string($cigar_string); +} + + + +=head2 transform + + Arg 1 : String $coordinate_system_name + Arg [2] : String $coordinate_system_version + Example : $feature = $feature->transform('contig'); + $feature = $feature->transform('chromosome', 'NCBI33'); + Description: Moves this AlignFeature to the given coordinate system. + If the feature cannot be transformed to the destination + coordinate system undef is returned instead. + Returntype : Bio::EnsEMBL::BaseAlignFeature; + Exceptions : wrong parameters + Caller : general + Status : Medium Risk + : deprecation needs to be removed at some time + +=cut + +sub transform { + my $self = shift; + + # catch for old style transform calls + if( ref $_[0] eq 'HASH') { + deprecate("Calling transform with a hashref is deprecate.\n" . + 'Use $feat->transfer($slice) or ' . + '$feat->transform("coordsysname") instead.'); + my (undef, $new_feat) = each(%{$_[0]}); + return $self->transfer($new_feat->slice); + } + + my $new_feature = $self->SUPER::transform(@_); + if ( !defined($new_feature) + || $new_feature->length() != $self->length() ) + { + my @segments = @{ $self->project(@_) }; + + if ( !@segments ) { + return undef; + } + + my @ungapped; + foreach my $f ( $self->ungapped_features() ) { + $f = $f->transform(@_); + if ( defined($f) ) { + push( @ungapped, $f ); + } else { + warning( "Failed to transform alignment feature; " + . "ungapped component could not be transformed" ); + return undef; + } + } + + eval { $new_feature = $self->new( -features => \@ungapped ); }; + + if ($@) { + warning($@); + return undef; + } + } ## end if ( !defined($new_feature...)) + + return $new_feature; +} + + +=head2 _parse_cigar + + Args : none + Description: PRIVATE (internal) method - creates ungapped features from + internally stored cigar line + Returntype : list of Bio::EnsEMBL::FeaturePair + Exceptions : none + Caller : ungapped_features + Status : Stable + +=cut + +sub _parse_cigar { + my ( $self ) = @_; + + my $query_unit = $self->_query_unit(); + my $hit_unit = $self->_hit_unit(); + + my $string = $self->{'cigar_string'}; + + throw("No cigar string defined in object") if(!defined($string)); + + my @pieces = ( $string =~ /(\d*[MDI])/g ); + #print "cigar: ",join ( ",", @pieces ),"\n"; + + my @features; + my $strand1 = $self->{'strand'} || 1; + my $strand2 = $self->{'hstrand'}|| 1; + + my ( $start1, $start2 ); + + if( $strand1 == 1 ) { + $start1 = $self->{'start'}; + } else { + $start1 = $self->{'end'}; + } + + if( $strand2 == 1 ) { + $start2 = $self->{'hstart'}; + } else { + $start2 = $self->{'hend'}; + } + + # + # Construct ungapped blocks as FeaturePairs objects for each MATCH + # + foreach my $piece (@pieces) { + + my ($length) = ( $piece =~ /^(\d*)/ ); + if( $length eq "" ) { $length = 1 } + my $mapped_length; + + # explicit if statements to avoid rounding problems + # and make sure we have sane coordinate systems + if( $query_unit == 1 && $hit_unit == 3 ) { + $mapped_length = $length*3; + } elsif( $query_unit == 3 && $hit_unit == 1 ) { + $mapped_length = $length / 3; + } elsif ( $query_unit == 1 && $hit_unit == 1 ) { + $mapped_length = $length; + } else { + throw("Internal error $query_unit $hit_unit, currently only " . + "allowing 1 or 3 "); + } + + if( int($mapped_length) != $mapped_length and + ($piece =~ /M$/ or $piece =~ /D$/)) { + throw("Internal error with mismapped length of hit, query " . + "$query_unit, hit $hit_unit, length $length"); + } + + if( $piece =~ /M$/ ) { + # + # MATCH + # + my ( $qstart, $qend); + if( $strand1 == 1 ) { + $qstart = $start1; + $qend = $start1 + $length - 1; + $start1 = $qend + 1; + } else { + $qend = $start1; + $qstart = $start1 - $length + 1; + $start1 = $qstart - 1; + } + + my ($hstart, $hend); + if( $strand2 == 1 ) { + $hstart = $start2; + $hend = $start2 + $mapped_length - 1; + $start2 = $hend + 1; + } else { + $hend = $start2; + $hstart = $start2 - $mapped_length + 1; + $start2 = $hstart - 1; + } + + + push @features, Bio::EnsEMBL::FeaturePair->new + (-SLICE => $self->{'slice'}, + -SEQNAME => $self->{'seqname'}, + -START => $qstart, + -END => $qend, + -STRAND => $strand1, + -HSLICE => $self->{'hslice'}, + -HSEQNAME => $self->{'hseqname'}, + -HSTART => $hstart, + -HEND => $hend, + -HSTRAND => $strand2, + -SCORE => $self->{'score'}, + -PERCENT_ID => $self->{'percent_id'}, + -ANALYSIS => $self->{'analysis'}, + -P_VALUE => $self->{'p_value'}, + -EXTERNAL_DB_ID => $self->{'external_db_id'}, + -HCOVERAGE => $self->{'hcoverage'}, + -GROUP_ID => $self->{'group_id'}, + -LEVEL_ID => $self->{'level_id'}); + + + # end M cigar bits + } elsif( $piece =~ /I$/ ) { + # + # INSERT + # + if( $strand1 == 1 ) { + $start1 += $length; + } else { + $start1 -= $length; + } + } elsif( $piece =~ /D$/ ) { + # + # DELETION + # + if( $strand2 == 1 ) { + $start2 += $mapped_length; + } else { + $start2 -= $mapped_length; + } + } else { + throw( "Illegal cigar line $string!" ); + } + } + + return \@features; +} + + + + +=head2 _parse_features + + Arg [1] : listref Bio::EnsEMBL::FeaturePair $ungapped_features + Description: creates internal cigarstring and start,end hstart,hend + entries. + Returntype : none, fills in values of self + Exceptions : argument list undergoes many sanity checks - throws under many + invalid conditions + Caller : new + Status : Stable + +=cut + +my $message_only_once = 1; + +sub _parse_features { + my ($self,$features ) = @_; + + my $query_unit = $self->_query_unit(); + my $hit_unit = $self->_hit_unit(); + + if (ref($features) ne "ARRAY") { + throw("features must be an array reference not a [".ref($features)."]"); + } + + my $strand = $features->[0]->strand; + + throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand); + + my @f; + + # + # Sort the features on their start position + # Ascending order on positive strand, descending on negative strand + # + if( $strand == 1 ) { + @f = sort {$a->start <=> $b->start} @$features; + } else { + @f = sort { $b->start <=> $a->start} @$features; + } + + my $hstrand = $f[0]->hstrand; + my $slice = $f[0]->slice(); + my $hslice = $f[0]->hslice(); + my $name = $slice->name() if($slice); + my $hname = $f[0]->hseqname; + my $score = $f[0]->score; + my $percent = $f[0]->percent_id; + my $analysis = $f[0]->analysis; + my $pvalue = $f[0]->p_value(); + my $external_db_id = $f[0]->external_db_id; + my $hcoverage = $f[0]->hcoverage; + my $group_id = $f[0]->group_id; + my $level_id = $f[0]->level_id; + + my $seqname = $f[0]->seqname; + # implicit strand 1 for peptide sequences + $strand ||= 1; + $hstrand ||= 1; + my $ori = $strand * $hstrand; + + throw("No features in the array to parse") if(scalar(@f) == 0); + + my $prev1; # where last feature q part ended + my $prev2; # where last feature s part ended + + my $string; + + # Use strandedness info of query and hit to make sure both sets of + # start and end coordinates are oriented the right way around. + my $f1start; + my $f1end; + my $f2end; + my $f2start; + + if ($strand == 1) { + $f1start = $f[0]->start; + $f1end = $f[-1]->end; + } else { + $f1start = $f[-1]->start; + $f1end = $f[0]->end; + } + + if ($hstrand == 1) { + $f2start = $f[0]->hstart; + $f2end = $f[-1]->hend; + } else { + $f2start = $f[-1]->hstart; + $f2end = $f[0]->hend; + } + + # + # Loop through each portion of alignment and construct cigar string + # + + foreach my $f (@f) { + # + # Sanity checks + # + + if (!$f->isa("Bio::EnsEMBL::FeaturePair")) { + throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair"); + } + if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) { + throw("Inconsistent hstrands in feature array"); + } + if( defined($f->strand()) && ($f->strand != $strand)) { + throw("Inconsistent strands in feature array"); + } + if ( defined($name) && $name ne $f->slice->name()) { + throw("Inconsistent names in feature array [$name - ". + $f->slice->name()."]"); + } + if ( defined($hname) && $hname ne $f->hseqname) { + throw("Inconsistent hit names in feature array [$hname - ". + $f->hseqname . "]"); + } + if ( defined($score) && $score ne $f->score) { + throw("Inconsisent scores in feature array [$score - " . + $f->score . "]"); + } + if (defined($f->percent_id) && $percent ne $f->percent_id) { + throw("Inconsistent pids in feature array [$percent - " . + $f->percent_id . "]"); + } + if(defined($pvalue) && $pvalue != $f->p_value()) { + throw("Inconsistant p_values in feature arraw [$pvalue " . + $f->p_value() . "]"); + } + if($seqname && $seqname ne $f->seqname){ + throw("Inconsistent seqname in feature array [$seqname - ". + $f->seqname . "]"); + } + my $start1 = $f->start; #source sequence alignment start + my $start2 = $f->hstart(); #hit sequence alignment start + + # + # More sanity checking + # + if (defined($prev1)) { + if ( $strand == 1 ) { + if ($f->start < $prev1) { + throw("Inconsistent coords in feature array (forward strand).\n" . + "Start [".$f->start()."] in current feature should be greater " . + "than previous feature end [$prev1]."); + } + } else { + if ($f->end > $prev1) { + throw("Inconsistent coords in feature array (reverse strand).\n" . + "End [".$f->end() ."] should be less than previous feature " . + "start [$prev1]."); + } + } + } + + my $length = ($f->end - $f->start + 1); #length of source seq alignment + my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment + + # using multiplication to avoid rounding errors, hence the + # switch from query to hit for the ratios + + # + # Yet more sanity checking + # + if($query_unit > $hit_unit){ + # I am going to make the assumption here that this situation will + # only occur with DnaPepAlignFeatures, this may not be true + my $query_p_length = sprintf "%.0f", ($length/$query_unit); + my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit); + if( $query_p_length != $hit_p_length) { + throw( "Feature lengths not comparable Lengths:" .$length . + " " . $hlength . " Ratios:" . $query_unit . " " . + $hit_unit ); + } + } else{ + my $query_d_length = sprintf "%.0f", ($length*$hit_unit); + my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit); + if( $length * $hit_unit != $hlength * $query_unit ) { + throw( "Feature lengths not comparable Lengths:" . $length . + " " . $hlength . " Ratios:" . $query_unit . " " . + $hit_unit ); + } + } + + my $hlengthfactor = ($query_unit/$hit_unit); + + # + # Check to see if there is an I type (insertion) gap: + # If there is a space between the end of the last source sequence + # alignment and the start of this one, then this is an insertion + # + + my $insertion_flag = 0; + if( $strand == 1 ) { + if( ( defined $prev1 ) && ( $f->start > $prev1 + 1 )) { + + #there is an insertion + $insertion_flag = 1; + my $gap = $f->start - $prev1 - 1; + if( $gap == 1 ) { + $gap = ""; # no need for a number if gap length is 1 + } + $string .= "$gap"."I"; + + } + + #shift our position in the source seq alignment + $prev1 = $f->end(); + } else { + + if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) { + + #there is an insertion + $insertion_flag = 1; + my $gap = $prev1 - $f->end() - 1; + if( $gap == 1 ) { + $gap = ""; # no need for a number if gap length is 1 + } + $string .= "$gap"."I"; + } + + #shift our position in the source seq alignment + $prev1 = $f->start(); + } + + # + # Check to see if there is a D type (deletion) gap + # There is a deletion gap if there is a space between the end of the + # last portion of the hit sequence alignment and this one + # + if( $hstrand == 1 ) { + if(( defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) { + + #there is a deletion + my $gap = $f->hstart - $prev2 - 1; + my $gap2 = int( $gap * $hlengthfactor + 0.5 ); + + if( $gap2 == 1 ) { + $gap2 = ""; # no need for a number if gap length is 1 + } + $string .= "$gap2"."D"; + + #sanity check, Should not be an insertion and deletion + if($insertion_flag) { + if ($message_only_once) { + warning("Should not be an deletion and insertion on the " . + "same alignment region. cigar_line=$string\n"); + $message_only_once = 0; + } + } + } + #shift our position in the hit seq alignment + $prev2 = $f->hend(); + + } else { + if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) { + + #there is a deletion + my $gap = $prev2 - $f->hend - 1; + my $gap2 = int( $gap * $hlengthfactor + 0.5 ); + + if( $gap2 == 1 ) { + $gap2 = ""; # no need for a number if gap length is 1 + } + $string .= "$gap2"."D"; + + #sanity check, Should not be an insertion and deletion + if($insertion_flag) { + if ($message_only_once) { + warning("Should not be an deletion and insertion on the " . + "same alignment region. prev2 = $prev2; f->hend() = " . + $f->hend() . "; cigar_line = $string;\n"); + $message_only_once = 0; + } + } + } + #shift our position in the hit seq alignment + $prev2 = $f->hstart(); + } + + my $matchlength = $f->end() - $f->start() + 1; + if( $matchlength == 1 ) { + $matchlength = ""; + } + $string .= $matchlength."M"; + } + + $self->{'start'} = $f1start; + $self->{'end'} = $f1end; + $self->{'seqname'} = $seqname; + $self->{'strand'} = $strand; + $self->{'score'} = $score; + $self->{'percent_id'} = $percent; + $self->{'analysis'} = $analysis; + $self->{'slice'} = $slice; + $self->{'hslice'} = $hslice; + $self->{'hstart'} = $f2start; + $self->{'hend'} = $f2end; + $self->{'hstrand'} = $hstrand; + $self->{'hseqname'} = $hname; + $self->{'cigar_string'} = $string; + $self->{'p_value'} = $pvalue; + $self->{'external_db_id'} = $external_db_id; + $self->{'hcoverage'} = $hcoverage; + $self->{'group_id'} = $group_id; + $self->{'level_id'} = $level_id; +} + + + + + + +=head2 _hit_unit + + Args : none + Description: abstract method, overwrite with something that returns + one or three + Returntype : int 1,3 + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub _hit_unit { + my $self = shift; + throw( "Abstract method call!" ); +} + + + +=head2 _query_unit + + Args : none + Description: abstract method, overwrite with something that returns + one or three + Returntype : int 1,3 + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub _query_unit { + my $self = shift; + throw( "Abstract method call!" ); +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/ChainedAssemblyMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/ChainedAssemblyMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,819 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::ChainedAssemblyMapper - +Handles mapping between two coordinate systems using the information +stored in the assembly table + +=head1 SYNOPSIS + + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + $asma = $db->get_AssemblyMapperAdaptor(); + $csa = $db->get_CoordSystemAdaptor(); + + my $chr_cs = $cs_adaptor->fetch_by_name( 'chromosome', 'NCBI33' ); + my $cln_cs = $cs_adaptor->fetch_by_name('clone'); + + $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); + + # Map to contig coordinate system from chromosomal + @cln_coords = + $asm_mapper->map( 'X', 1_000_000, 2_000_000, 1, $chr_cs ); + + # Map to chromosome coordinate system from contig + @chr_coords = + $asm_mapper->map( 'AL30421.1', 100, 10000, -1, $cln_cs ); + + # List contig names for a region of chromsome + @cln_ids = $asm_mapper->list_ids( '13', 1_000_000, 1, $chr_cs ); + + # List chromosome names for a contig region + @chr_ids = + $asm_mapper->list_ids( 'AL30421.1', 1, 1000, -1, $cln_cs ); + +=head1 DESCRIPTION + +The ChainedAssemblyMapper is an extension of the regular AssemblyMapper +that allows for mappings between coordinate systems that require +multi-step mapping. For example if explicit mappings are defined +between the following coordinate systems, + + chromosome <-> contig + contig <-> clone + +the ChainedAssemblyMapper would be able to perform implicit mapping +between the chromosome and clone coordinate systems. This should be +transparent to the user of this module, and users should not even +realise that they are using a chained assembly mapper as opposed to a +normal assembly mapper. + +=head1 METHODS + +=cut + + +my $FIRST = 'first'; +my $MIDDLE = 'middle'; +my $LAST = 'last'; + +package Bio::EnsEMBL::ChainedAssemblyMapper; + +use strict; +use warnings; +use integer; #use proper arithmetic bitshifts + +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); +use Scalar::Util qw(weaken); + +#2^20 = approx 10^6 +my $CHUNKFACTOR = 20; + +# max size of the pair cache in the mappers +my $DEFAULT_MAX_PAIR_COUNT = 6000; + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Arg [2] : Bio::EnsEMBL::CoordSystem $src_cs + Arg [3] : Bio::EnsEMBL::CoordSystem $int_cs + Arg [4] : Bio::EnsEMBL::CoordSystem $dst_cs + Example : Should use AssemblyMapperAdaptor->fetch_by_CoordSystems + Description: Creates a new AssemblyMapper + Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Exceptions : thrown if wrong number of coord_systems are provided + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +sub new { + my ($caller,$adaptor,@coord_systems) = @_; + + my $class = ref($caller) || $caller; + + my $self = {}; + bless $self, $class; + + $self->adaptor($adaptor); + + if(@coord_systems != 3) { + throw('ChainedMapper can only map between 3 coordinate systems. ' . + scalar(@coord_systems) . ' were provided'); + } + + $adaptor->cache_seq_ids_with_mult_assemblys(); + + # Set the component, intermediate and assembled coordinate systems + $self->{'first_cs'} = $coord_systems[0]; + $self->{'mid_cs'} = $coord_systems[1]; + $self->{'last_cs'} = $coord_systems[2]; + + #maps between first and intermediate coord systems + $self->{'first_mid_mapper'} = Bio::EnsEMBL::Mapper->new($FIRST, $MIDDLE); + + #maps between last and intermediate + $self->{'last_mid_mapper'} = Bio::EnsEMBL::Mapper->new($LAST, $MIDDLE); + + #mapper that is actually used and is loaded by the mappings generated + #by the other two mappers + $self->{'first_last_mapper'} = Bio::EnsEMBL::Mapper->new($FIRST, $LAST, + $coord_systems[0], + $coord_systems[2]); + + #need registries to keep track of what regions are registered in source + #and destination coordinate systems + $self->{'first_registry'} = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + $self->{'last_registry'} = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + $self->{'max_pair_count'} = $DEFAULT_MAX_PAIR_COUNT; + + return $self; +} + + +=head2 max_pair_count + + Arg [1] : (optional) int $max_pair_count + Example : $mapper->max_pair_count(100000) + Description: Getter/Setter for the number of mapping pairs allowed in the + internal cache. This can be used to override the default value + (6000) to tune the performance and memory usage for certain + scenarios. Higher value = bigger cache, more memory used + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub max_pair_count { + my $self = shift; + $self->{'max_pair_count'} = shift if(@_); + return $self->{'max_pair_count'}; +} + + + + +=head2 register_all + + Arg [1] : none + Example : $mapper->max_pair_count(10e6); + $mapper->register_all(); + Description: Pre-registers all assembly information in this mapper. The + cache size should be set to a sufficiently large value + so that all of the information can be stored. This method + is useful when *a lot* of mapping will be done in regions + which are distributed around the genome. After registration + the mapper will consume a lot of memory but will not have to + perform any SQL and will be faster. + Returntype : none + Exceptions : none + Caller : specialised programs doing a lot of mapping + Status : Stable + +=cut + +sub register_all { + my $self = shift; + $self->adaptor->register_all_chained($self); + return; +} + + + + +sub flush { + my $self = shift; + $self->{'first_registry'}->flush(); + $self->{'last_registry'}->flush(); + + $self->{'first_mid_mapper'}->flush(); + $self->{'last_mid_mapper'}->flush(); + $self->{'first_last_mapper'}->flush(); +} + +=head2 size + + Args : none + Example : $num_of_pairs = $mapper->size(); + Description: return the number of pairs currently stored. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub size { + my $self = shift; + return ( $self->{'first_last_mapper'}->{'pair_count'} + + $self->{'last_mid_mapper'}->{'pair_count'} + + $self->{'first_mid_mapper'}->{'pair_count'} ); +} + + + +=head2 map + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM + Arg [2] : int $frm_start + The start of the region to transform FROM + Arg [3] : int $frm_end + The end of the region to transform FROM + Arg [4] : int $strand + The strand of the region to transform FROM + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM + Arg [6] : (optional) fastmap + Arg [7] : (optional) Bio::Ensembl::Slice + The slice to transform TO + Example : @coords = $asm_mapper->map('X', 1_000_000, 2_000_000, + 1, $chr_cs); + Description: Transforms coordinates from one coordinate system + to another. + Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects + Exceptions : thrown if the specified TO coordinat system is not one + of the coordinate systems associated with this assembly mapper + Caller : general + Status : Stable + +=cut + +sub map { + throw('Incorrect number of arguments.') if(@_ < 6); + + my ($self, $frm_seq_region_name, $frm_start, + $frm_end, $frm_strand, $frm_cs, $fastmap, $to_slice) = @_; + + my $mapper = $self->{'first_last_mapper'}; + my $first_cs = $self->{'first_cs'}; + my $last_cs = $self->{'last_cs'}; + + my $is_insert = ($frm_end + 1 == $frm_start); + + my $frm; + my $registry; + + + + + my @tmp; + push @tmp, $frm_seq_region_name; + my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; + + #speed critical section: + #try to do simple pointer equality comparisons of the coord system objects + #first since this is likely to work most of the time and is much faster + #than a function call + + if($frm_cs == $first_cs || + ($frm_cs != $last_cs && $frm_cs->equals($first_cs))) { + $frm = $FIRST; + $registry = $self->{'first_registry'}; + } elsif($frm_cs == $last_cs || $frm_cs->equals($last_cs)) { + $frm = $LAST; + $registry = $self->{'last_registry'}; + } else { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the first nor the last coordinate system " . + " of this ChainedAssemblyMapper"); + } + + #the minimum area we want to register if registration is necessary is + #about 1MB. Break requested ranges into chunks of 1MB and then register + #this larger region if we have a registry miss. + + #use bitwise shift for fast and easy integer multiplication and division + my ($min_start, $min_end); + + if($is_insert) { + $min_start = (($frm_end >> $CHUNKFACTOR) << $CHUNKFACTOR); + $min_end = ((($frm_start >> $CHUNKFACTOR) + 1) << $CHUNKFACTOR) - 1 ; + } else { + $min_start = (($frm_start >> $CHUNKFACTOR) << $CHUNKFACTOR); + $min_end = ((($frm_end >> $CHUNKFACTOR) + 1) << $CHUNKFACTOR) - 1 ; + } + + #get a list of ranges in the requested region that have not been registered, + #and register them at the same + + my $ranges; + + if($is_insert) { + $ranges = $registry->check_and_register($seq_region_id, $frm_end, + $frm_start, $min_start, $min_end); + } else { + $ranges = $registry->check_and_register($seq_region_id, $frm_start, + $frm_end, $min_start, $min_end); + } + + if(defined($ranges)) { + if( $self->size() > $self->{'max_pair_count'} ) { + $self->flush(); + + if($is_insert) { + $ranges = $registry->check_and_register + ($seq_region_id, $frm_end, $frm_start, $min_start, $min_end); + } else { + $ranges = $registry->check_and_register + ($seq_region_id, $frm_start, $frm_end, $min_start, $min_end); + } + } + $self->adaptor->register_chained($self,$frm,$seq_region_id,$ranges,$to_slice); + } + + if($fastmap) { + return $mapper->fastmap($seq_region_id, $frm_start, $frm_end, + $frm_strand, $frm); + } + + return $mapper->map_coordinates($seq_region_id, $frm_start, $frm_end, + $frm_strand, $frm); +} + + +sub fastmap { + my $self = shift; + return $self->map(@_,1); +} + + +=head2 list_ids + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} + Description: Retrieves a list of overlapping seq_region internal identifiers + of another coordinate system. This is the same as the + list_seq_regions method but uses internal identfiers rather + than seq_region strings + Returntype : List of ints + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub list_ids { + throw('Incorrect number of arguments.') if(@_ != 5); + my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs) = @_; + + my $is_insert = ($frm_start == $frm_end + 1); + + #the minimum area we want to register if registration is necessary is + #about 1MB. Break requested ranges into chunks of 1MB and then register + #this larger region if we have a registry miss. + + #use bitwise shift for fast and easy integer multiplication and division + my ($min_start, $min_end); + + if($is_insert) { + $min_start = (($frm_end >> $CHUNKFACTOR) << $CHUNKFACTOR); + $min_end = ((($frm_start >> $CHUNKFACTOR) + 1) << $CHUNKFACTOR) - 1; + } else { + $min_start = (($frm_start >> $CHUNKFACTOR) << $CHUNKFACTOR); + $min_end = ((($frm_end >> $CHUNKFACTOR) + 1) << $CHUNKFACTOR) - 1; + } + + my @tmp; + push @tmp, $frm_seq_region_name; + my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; + + if($frm_cs->equals($self->{'first_cs'})) { + my $registry = $self->{'first_registry'}; + + my $ranges; + + + if($is_insert) { + $ranges = $registry->check_and_register + ($seq_region_id, $frm_end, $frm_start, $min_start, $min_end); + } else { + $ranges = $registry->check_and_register + ($seq_region_id, $frm_start, $frm_end, $min_start, $min_end); + } + + if(defined($ranges)) { + $self->adaptor->register_chained($self,$FIRST,$seq_region_id,$ranges); + } + + return map {$_->to()->id()} + $self->first_last_mapper()->list_pairs($seq_region_id, $frm_start, + $frm_end, $FIRST); + + } elsif($frm_cs->equals($self->{'last_cs'})) { + my $registry = $self->{'last_registry'}; + + my $ranges; + if($is_insert) { + $ranges = $registry->check_and_register + ($seq_region_id, $frm_end, $frm_start, $min_start, $min_end); + } else { + $ranges = $registry->check_and_register + ($seq_region_id, $frm_start, $frm_end, $min_start, $min_end); + } + + if(defined($ranges)) { + $self->adaptor->register_chained($self,$LAST,$seq_region_id,$ranges); + } + + return map {$_->from()->id()} + $self->first_last_mapper()->list_pairs($seq_region_id, $frm_start, + $frm_end, $LAST); + } else { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the first nor the last coordinate system " . + " of this ChainedAssemblyMapper"); + } +} + + +=head2 list_seq_regions + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} + Description: Retrieves a list of overlapping seq_region internal identifiers + of another coordinate system. This is the same as the + list_ids method but uses seq_region names rather internal ids + Returntype : List of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_seq_regions { + throw('Incorrect number of arguments.') if(@_ != 5); + my($self, $frm_seq_region, $frm_start, $frm_end, $frm_cs) = @_; + + #retrieve the seq_region names + my @seq_regs = + $self->list_ids($frm_seq_region,$frm_start,$frm_end,$frm_cs); + + #The seq_regions are from the 'to' coordinate system not the + #from coordinate system we used to obtain them + my $to_cs; + if($frm_cs->equals($self->first_CoordSystem())) { + $to_cs = $self->last_CoordSystem(); + } else { + $to_cs = $self->first_CoordSystem(); + } + + #convert them to names + return @{$self->adaptor()->seq_ids_to_regions(\@seq_regs)}; +} + + + + + + +=head2 first_last_mapper + + Args : none + Example : $mapper = $cam->first_last_mapper(); + Description: return the mapper. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub first_last_mapper { + my $self = shift; + return $self->{'first_last_mapper'}; +} + +=head2 first_middle_mapper + + Args : none + Example : $mapper = $cam->first_middle_mapper(); + Description: return the mapper. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : internal + Status : Stable + +=cut + + +sub first_middle_mapper { + my $self = shift; + return $self->{'first_mid_mapper'}; +} + +=head2 last_middle_mapper + + Args : none + Example : $mapper = $cam->last_middle_mapper(); + Description: return the mapper. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub last_middle_mapper { + my $self = shift; + return $self->{'last_mid_mapper'}; +} + + +=head2 first_CoordSystem + + Args : none + Example : $coordsys = $cam->first_CoordSystem(); + Description: return the CoordSystem. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub first_CoordSystem { + my $self = shift; + return $self->{'first_cs'}; +} + + +=head2 middle_CoordSystem + + Args : none + Example : $coordsys = $cam->middle_CoordSystem(); + Description: return the CoordSystem. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub middle_CoordSystem { + my $self = shift; + return $self->{'mid_cs'}; +} + +=head2 last_CoordSystem + + Args : none + Example : $coordsys = $cam->last_CoordSystem(); + Description: return the CoordSystem. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub last_CoordSystem { + my $self = shift; + return $self->{'last_cs'}; +} + +=head2 first_registry + + Args : none + Example : $rr = $cam->first_registry(); + Description: return the Registry. + Returntype : Bio::EnsEMBL::Mapper::RangeRegistry + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub first_registry { + my $self = shift; + return $self->{'first_registry'}; +} + +=head2 last_registry + + Args : none + Example : $rr = $cam->last_registry(); + Description: return the Registry. + Returntype : Bio::EnsEMBL::Mapper::RangeRegistry + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub last_registry { + my $self = shift; + return $self->{'last_registry'}; +} + + +# +# Methods supplied to maintain polymorphism with AssemblyMapper there +# is no real assembled or component in the chained mapper, since the +# ordering is arbitrary and both ends might actually be assembled, but +# these methods provide convenient synonyms +# + +=head2 mapper + + Args : none + Example : $mapper = $cam->mapper(); + Description: return the first_last_mapper. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub mapper { + my $self = shift; + return $self->first_last_mapper(); +} + +=head2 assembled_CoordSystem + + Args : none + Example : $coordsys = $cam->assembled_CoordSystem(); + Description: return the first CoordSystem. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal + Status : Stable + +=cut + + +sub assembled_CoordSystem { + my $self = shift; + return $self->{'first_cs'}; +} + +=head2 component_CoordSystem + + Args : none + Example : $coordsys = $cam->component_CoordSystem(); + Description: return the last CoordSystem. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub component_CoordSystem { + my $self = shift; + return $self->{'last_cs'}; +} + + +=head2 adaptor + + Arg [1] : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor $adaptor + Description: get/set for this objects database adaptor + Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub adaptor { + my $self = shift; + weaken($self->{'adaptor'} = shift) if(@_); + return $self->{'adaptor'}; +} + + +=head2 in_assembly + + Deprecated. Use map() or list_ids() instead + +=cut + +sub in_assembly { + my ($self, $object) = @_; + + deprecate('Use map() or list_ids() instead.'); + + my $csa = $self->db->get_CoordSystemAdaptor(); + + my $top_level = $csa->fetch_top_level(); + + my $asma = $self->adaptor->fetch_by_CoordSystems($object->coord_system(), + $top_level); + + my @list = $asma->list_ids($object->seq_region(), $object->start(), + $object->end(), $object->coord_system()); + + return (@list > 0); +} + + +=head2 map_coordinates_to_assembly + + DEPRECATED use map() instead + +=cut + +sub map_coordinates_to_assembly { + my ($self, $contig_id, $start, $end, $strand) = @_; + + deprecate('Use map() instead.'); + + #not sure if contig_id is seq_region_id or name... + return $self->map($contig_id, $start, $end, $strand, + $self->contig_CoordSystem()); + +} + + +=head2 fast_to_assembly + + DEPRECATED use map() instead + +=cut + +sub fast_to_assembly { + my ($self, $contig_id, $start, $end, $strand) = @_; + + deprecate('Use map() instead.'); + + #not sure if contig_id is seq_region_id or name... + return $self->map($contig_id, $start, $end, $strand, + $self->contig_CoordSystem()); +} + + +=head2 map_coordinates_to_rawcontig + + DEPRECATED use map() instead + +=cut + +sub map_coordinates_to_rawcontig { + my ($self, $chr_name, $start, $end, $strand) = @_; + + deprecate('Use map() instead.'); + + return $self->map($chr_name, $start, $end, $strand, + $self->assembled_CoordSystem()); + +} + +=head2 list_contig_ids + DEPRECATED Use list_ids instead + +=cut + +sub list_contig_ids { + my ($self, $chr_name, $start, $end) = @_; + + deprecate('Use list_ids() instead.'); + + return $self->list_ids($chr_name, $start, $end, + $self->assembled_CoordSystem()); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/CircularSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/CircularSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,3860 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::CircularSlice - Arbitary Slice of a genome + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + # get some attributes of the slice + my $seqname = $slice->seq_region_name(); + my $start = $slice->start(); + my $end = $slice->end(); + + # get the sequence from the slice + my $seq = $slice->seq(); + + # get some features from the slice + foreach $gene ( @{ $slice->get_all_Genes } ) { + # do something with a gene + } + + foreach my $feature ( @{ $slice->get_all_DnaAlignFeatures } ) { + # do something with dna-dna alignments + } + +=head1 DESCRIPTION + +A Slice object represents a region of a genome. It can be used to +retrieve sequence or features from an area of interest. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::CircularSlice; +use vars qw(@ISA); +use strict; + +use Bio::PrimarySeqI; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception + qw(throw deprecate warning stack_trace_dump); +use Bio::EnsEMBL::RepeatMaskedSlice; +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); +use Bio::EnsEMBL::ProjectionSegment; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::DBSQL::MergedAdaptor; + +use Bio::EnsEMBL::StrainSlice; +#use Bio::EnsEMBL::IndividualSlice; +#use Bio::EnsEMBL::IndividualSliceFactory; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use Bio::EnsEMBL::Slice; +use Data::Dumper; +use Scalar::Util qw(weaken isweak); + +my $reg = "Bio::EnsEMBL::Registry"; + +@ISA = qw(Bio::EnsEMBL::Slice); + +=head2 new + + Arg [...] : List of named arguments + Bio::EnsEMBL::CoordSystem COORD_SYSTEM + string SEQ_REGION_NAME, + int START, + int END, + int SEQ_REGION_LENGTH, (optional) + string SEQ (optional) + int STRAND, (optional, defaults to 1) + Bio::EnsEMBL::DBSQL::SliceAdaptor ADAPTOR (optional) + Example : + + $slice = + Bio::EnsEMBL::CircularSlice->new( -coord_system => $cs, + -start => 1, + -end => 10000, + -strand => 1, + -seq_region_name => 'X', + -seq_region_length => 12e6, + -adaptor => $slice_adaptor ); + + Description: Creates a new slice object. A slice represents a + region of sequence in a particular coordinate system. + Slices can be used to retrieve sequence and features + from an area of interest in a genome. + + Coordinates start at 1 and are inclusive. Negative + coordinates or coordinates exceeding the length of + the seq_region are permitted. Start must be less + than or equal. to end regardless of the strand. + + Slice objects are immutable. Once instantiated their + attributes (with the exception of the adaptor) may + not be altered. To change the attributes a new slice + must be created. + + Returntype : Bio::EnsEMBL::CircularSlice + Exceptions : throws if start, end, coordsystem or seq_region_name not + specified or not of the correct type + Caller : general, Bio::EnsEMBL::SliceAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + + #new can be called as a class or object method + my $class = ref($caller) || $caller; + + my ( $seq, $coord_system, $seq_region_name, $seq_region_length, + $start, $end, $strand, $adaptor, $empty ) + = rearrange( [ + qw(SEQ COORD_SYSTEM SEQ_REGION_NAME SEQ_REGION_LENGTH + START END STRAND ADAPTOR EMPTY) ], + @_ ); + + #empty is only for backwards compatibility + if ($empty) { + deprecate( "Creation of empty slices is no longer needed " + . "and is deprecated" ); + my $self = bless( { 'empty' => 1 }, $class ); + $self->adaptor($adaptor); + return $self; + } + + if ( !defined($seq_region_name) ) { + throw('SEQ_REGION_NAME argument is required'); + } + if ( !defined($start) ) { throw('START argument is required') } + if ( !defined($end) ) { throw('END argument is required') } + + if ( !defined($seq_region_length) ) { $seq_region_length = $end } + + if ( $seq_region_length <= 0 ) { + throw('SEQ_REGION_LENGTH must be > 0'); + } + + if ( defined($coord_system) ) { + assert_ref( $coord_system, 'Bio::EnsEMBL::CoordSystem' ); + + if ( $coord_system->is_top_level() ) { + throw('Cannot create circular slice on toplevel CoordSystem.'); + } + } else { + warning("CircularSlice without coordinate system"); + } + + $strand ||= 1; + + if ( $strand != 1 && $strand != -1 ) { + throw('STRAND argument must be -1 or 1'); + } + + if ( defined($adaptor) ) { + assert_ref( $adaptor, 'Bio::EnsEMBL::DBSQL::SliceAdaptor' ); + } + + my $seq1 = { 'coord_system' => $coord_system, + 'seq' => $seq, + 'seq_region_name' => $seq_region_name, + 'seq_region_length' => $seq_region_length, + 'start' => int($start), + 'end' => int($end), + 'strand' => $strand }; + + bless $seq1, $class; + $seq1->adaptor($adaptor); + return $seq1; +} ## end sub new + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Slice using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::CircularSlice + Caller : general + Status : Stable + +=cut + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +=head2 centrepoint + + Arg [1] : none + Example : $cp = $slice->centrepoint(); + Description: Returns the mid position of this slice relative to the + start of the sequence region that it was created on. + Coordinates are inclusive and start at 1. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub centrepoint { + my $self = shift; + + my ( $s, $e, $length ) = + ( $self->{'start'}, $self->{'end'}, $self->{'seq_region_length'} ); + + if ( $s < $e ) { + return ( $s + $e )/2; + } + + my $r1 = $length - $s; + my $r2 = $e; + my $r = ( $r1 + $r2 )/2; + my $m = $s + $r; + + if ( $m > $length ) { + $m = $m - $length; + } + + return $m; +} + +=head2 length + + Arg [1] : none + Example : $length = $slice->length(); + Description: Returns the length of this slice in basepairs + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length { + my ($self) = @_; + + if ( $self->{'start'} < $self->{'end'} ) { + return $self->{'end'} - $self->{'start'} + 1; + } + + my $r1 = $self->{'seq_region_length'} - $self->{'start'}; + my $r2 = $self->{'end'}; + my $ln = $r1 + $r2 + 1; + + return $ln; +} + +=head2 invert + + Arg [1] : none + Example : $inverted_slice = $slice->invert; + Description: Creates a copy of this slice on the opposite strand and + returns it. + Returntype : Bio::EnsEMBL::Slice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub invert { + my $self = shift; + + # make a shallow copy of the slice via a hash copy and flip the strand + my %s = %$self; + $s{'strand'} = $self->{'strand'}*-1; + + # reverse compliment any attached sequence + reverse_comp( \$s{'seq'} ) if ( $s{'seq'} ); + + # bless and return the copy + return bless \%s, ref $self; +} + +=head2 seq + + Arg [1] : none + Example : print "SEQUENCE = ", $slice->seq(); + Description: Returns the sequence of the region represented by this + slice formatted as a string. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq { + my $self = shift; + + # special case for in-between (insert) coordinates + return '' if ( $self->start() == $self->end() + 1 ); + return $self->{'seq'} if ( $self->{'seq'} ); + + if ( $self->adaptor() ) { + + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + if ( $self->{'start'} > $self->{'end'} ) { + my $length = $self->{'seq_region_length'}; + + my $sl1 = + Bio::EnsEMBL::Slice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::Slice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $seq1 = ${ + $seqAdaptor->fetch_by_Slice_start_end_strand( $sl1, 1, undef, + 1 ) }; + my $seq2 = ${ + $seqAdaptor->fetch_by_Slice_start_end_strand( $sl2, 1, undef, + 1 ) }; + return $seq1 . $seq2; + + } else { + my $seq1 = ${ + $seqAdaptor->fetch_by_Slice_start_end_strand( $self, 1, undef, + 1 ) }; + return $seq1; + } + } ## end if ( $self->adaptor() ) + + # no attached sequence, and no db, so just return Ns + return 'N' x $self->length(); +} ## end sub seq + +=head2 subseq + + Arg [1] : int $startBasePair + relative to start of slice, which is 1. + Arg [2] : int $endBasePair + relative to start of slice. + Arg [3] : (optional) int $strand + The strand of the slice to obtain sequence from. Default + value is 1. + Description: returns string of dna sequence + Returntype : txt + Exceptions : end should be at least as big as start + strand must be set + Caller : general + Status : Stable + +=cut + +sub subseq { + my ( $self, $start, $end, $strand ) = @_; + + # handle 'between' case for insertions + return '' if ( $start == $end + 1 ); + + $strand = 1 unless ( defined $strand ); + + if ( $strand != -1 && $strand != 1 ) { + throw("Invalid strand [$strand] in call to Slice::subseq."); + } + my $subseq; + my $length = $self->{'seq_region_length'}; + + if ( $self->adaptor ) { + + my $seqAdaptor = $self->adaptor->db->get_SequenceAdaptor(); + if ( $end < $start ) { + my $subseq1 = ${ + $seqAdaptor->fetch_by_Slice_start_end_strand( $self, $start, + $length, $strand ) + }; + my $subseq2 = ${ + $seqAdaptor->fetch_by_Slice_start_end_strand( $self, 1, $end, + $strand ) }; + $subseq = $subseq1 . $subseq2; + + } else { + $subseq = ${ + $seqAdaptor->fetch_by_Slice_start_end_strand( $self, $start, + $end, $strand ) }; + } + } else { + ## check for gap at the beginning and pad it with Ns + if ( $start < 1 ) { + $subseq = "N" x ( 1 - $start ); + $start = 1; + } + $subseq .= substr( $self->seq(), $start - 1, $end - $start + 1 ); + ## check for gap at the end and pad it with Ns + if ( $end > $self->length() ) { + $subseq .= "N" x ( $end - $self->length() ); + } + reverse_comp( \$subseq ) if ( $strand == -1 ); + } + return $subseq; +} ## end sub subseq + +=head2 project + + Arg [1] : string $name + The name of the coordinate system to project this slice onto + Arg [2] : string $version + The version of the coordinate system (such as 'NCBI34') to + project this slice onto + Example : + my $clone_projection = $slice->project('clone'); + + foreach my $seg (@$clone_projection) { + my $clone = $segment->to_Slice(); + print $slice->seq_region_name(), ':', $seg->from_start(), '-', + $seg->from_end(), ' -> ', + $clone->seq_region_name(), ':', $clone->start(), '-', + $clone->end(), + $clone->strand(), "\n"; + } + Description: Returns the results of 'projecting' this slice onto another + coordinate system. Projecting to a coordinate system that + the slice is assembled from is analagous to retrieving a tiling + path. This method may also be used to 'project up' to a higher + level coordinate system, however. + + This method returns a listref of triplets [start,end,slice] + which represents the projection. The start and end defined the + region of this slice which is made up of the third value of + the triplet: a slice in the requested coordinate system. + Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which + can also be used as [$start,$end,$slice] triplets + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub project { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift; + + throw('Coord_system name argument is required') if ( !$cs_name ); + + my $slice_adaptor = $self->adaptor(); + + if ( !$slice_adaptor ) { + warning("Cannot project without attached adaptor."); + return []; + } + + if ( !$self->coord_system() ) { + warning("Cannot project without attached coord system."); + return []; + } + + my $db = $slice_adaptor->db(); + my $csa = $db->get_CoordSystemAdaptor(); + my $cs = $csa->fetch_by_name( $cs_name, $cs_version ); + + my $sl01 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl02 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my @projection; + my $current_start = 1; + + foreach my $sl2 ( $sl01, $sl02 ) { + my $slice_cs = $sl2->coord_system(); + + if ( !$cs ) { + throw( "Cannot project to unknown coordinate system " + . "[$cs_name $cs_version]" ); + } + +# no mapping is needed if the requested coord system is the one we are in +# but we do need to check if some of the slice is outside of defined regions + if ( $slice_cs->equals($cs) ) { + return $self->_constrain_to_region(); + } + + # decompose this slice into its symlinked components. + # this allows us to handle haplotypes and PARs + my $normal_slice_proj = + $slice_adaptor->fetch_normalized_slice_projection($sl2); + + foreach my $segment (@$normal_slice_proj) { + my $normal_slice = $segment->[2]; + + $slice_cs = $normal_slice->coord_system(); + + my $asma = $db->get_AssemblyMapperAdaptor(); + my $asm_mapper = $asma->fetch_by_CoordSystems( $slice_cs, $cs ); + + # perform the mapping between this slice and the requested system + my @coords; + + if ( defined $asm_mapper ) { + @coords = $asm_mapper->map( $normal_slice->seq_region_name(), + $normal_slice->start(), + $normal_slice->end(), + $normal_slice->strand(), + $slice_cs ); + + } else { + $coords[0] = + Bio::EnsEMBL::Mapper::Gap->new( $normal_slice->start(), + $normal_slice->end() ); + } + + #construct a projection from the mapping results and return it + foreach my $coord (@coords) { + my $coord_start = $coord->start(); + my $coord_end = $coord->end(); + my $length = $coord_end - $coord_start + 1; + + #skip gaps + if ( $coord->isa('Bio::EnsEMBL::Mapper::Coordinate') ) { + my $coord_cs = $coord->coord_system(); + + # If the normalised projection just ended up mapping to the + # same coordinate system we were already in then we should just + # return the original region. This can happen for example, if we + # were on a PAR region on Y which refered to X and a projection to + # 'toplevel' was requested. + + if ( $coord_cs->equals($slice_cs) ) { + # trim off regions which are not defined + return $self->_constrain_to_region(); + } + #create slices for the mapped-to coord system + + my $slice = + $slice_adaptor->fetch_by_seq_region_id( + $coord->id(), $coord_start, + $coord_end, $coord->strand() + ); + + my $current_end = $current_start + $length - 1; + push @projection, + bless( [ $current_start, $current_end, $slice ], + "Bio::EnsEMBL::ProjectionSegment" ); + } ## end if ( $coord->isa('Bio::EnsEMBL::Mapper::Coordinate'...)) + + $current_start += $length; + } ## end foreach my $coord (@coords) + } ## end foreach my $segment (@$normal_slice_proj) + } #foreach + + return \@projection; +} ## end sub project + +sub project_org { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift; + + throw('Coord_system name argument is required') if ( !$cs_name ); + + my $slice_adaptor = $self->adaptor(); + + if ( !$slice_adaptor ) { + warning("Cannot project without attached adaptor."); + return []; + } + + if ( !$self->coord_system() ) { + warning("Cannot project without attached coord system."); + return []; + } + + my $db = $slice_adaptor->db(); + my $csa = $db->get_CoordSystemAdaptor(); + my $cs = $csa->fetch_by_name( $cs_name, $cs_version ); + my $slice_cs = $self->coord_system(); + + if ( !$cs ) { + throw( "Cannot project to unknown coordinate system " + . "[$cs_name $cs_version]" ); + } + + # No mapping is needed if the requested coord system is the one we + # are in. But we do need to check if some of the slice is outside of + # defined regions. + if ( $slice_cs->equals($cs) ) { + return $self->_constrain_to_region(); + } + + my @projection; + my $current_start = 1; + + # Decompose this slice into its symlinked components. This allows us + # to handle haplotypes and PARs. + my $normal_slice_proj = + $slice_adaptor->fetch_normalized_slice_projection($self); + foreach my $segment (@$normal_slice_proj) { + my $normal_slice = $segment->[2]; + + $slice_cs = $normal_slice->coord_system(); + + my $asma = $db->get_AssemblyMapperAdaptor(); + my $asm_mapper = $asma->fetch_by_CoordSystems( $slice_cs, $cs ); + + # perform the mapping between this slice and the requested system + my @coords; + + if ( defined $asm_mapper ) { + @coords = $asm_mapper->map( $normal_slice->seq_region_name(), + $normal_slice->start(), + $normal_slice->end(), + $normal_slice->strand(), + $slice_cs ); + + } else { + $coords[0] = + Bio::EnsEMBL::Mapper::Gap->new( $normal_slice->start(), + $normal_slice->end() ); + } + + #construct a projection from the mapping results and return it + foreach my $coord (@coords) { + my $coord_start = $coord->start(); + my $coord_end = $coord->end(); + my $length = $coord_end - $coord_start + 1; + + #skip gaps + if ( $coord->isa('Bio::EnsEMBL::Mapper::Coordinate') ) { + my $coord_cs = $coord->coord_system(); + + # If the normalised projection just ended up mapping to the + # same coordinate system we were already in then we should just + # return the original region. This can happen for example, + # if we were on a PAR region on Y which refered to X and a + # projection to 'toplevel' was requested. + + if ( $coord_cs->equals($slice_cs) ) { + # trim off regions which are not defined + return $self->_constrain_to_region(); + } + #create slices for the mapped-to coord system + + my $slice = + $slice_adaptor->fetch_by_seq_region_id( $coord->id(), + $coord_start, $coord_end, $coord->strand() ); + + my $current_end = $current_start + $length - 1; + + push @projection, + bless( [ $current_start, $current_end, $slice ], + "Bio::EnsEMBL::ProjectionSegment" ); + } ## end if ( $coord->isa('Bio::EnsEMBL::Mapper::Coordinate'...)) + + $current_start += $length; + } ## end foreach my $coord (@coords) + } ## end foreach my $segment (@$normal_slice_proj) + + return \@projection; +} ## end sub project_org + +sub _constrain_to_region { + my $self = shift; + + my $entire_len = $self->seq_region_length(); + + # If the slice has negative coordinates or coordinates exceeding the + # exceeding length of the sequence region we want to shrink the slice + # to the defined region. + + if ( $self->{'start'} > $entire_len || $self->{'end'} < 1 ) { + #none of this slice is in a defined region + return []; + } + + my $right_contract = 0; + my $left_contract = 0; + if ( $self->{'end'} > $entire_len ) { + $right_contract = $entire_len - $self->{'end'}; + } + if ( $self->{'start'} < 1 ) { + $left_contract = $self->{'start'} - 1; + } + + my $new_slice; + if ( $left_contract || $right_contract ) { + $new_slice = $self->expand( $left_contract, $right_contract ); + } else { + $new_slice = $self; + } + + return [ bless [ 1 - $left_contract, + $self->length() + $right_contract, + $new_slice ], + "Bio::EnsEMBL::ProjectionSegment" ]; +} ## end sub _constrain_to_region + +=head2 expand + + Arg [1] : (optional) int $five_prime_expand + The number of basepairs to shift this slices five_prime + coordinate by. Positive values make the slice larger, + negative make the slice smaller. + coordinate left. + Default = 0. + Arg [2] : (optional) int $three_prime_expand + The number of basepairs to shift this slices three_prime + coordinate by. Positive values make the slice larger, + negative make the slice smaller. + Default = 0. + Arg [3] : (optional) bool $force_expand + if set to 1, then the slice will be contracted even in the case + when shifts $five_prime_expand and $three_prime_expand overlap. + In that case $five_prime_expand and $three_prime_expand will be set + to a maximum possible number and that will result in the slice + which would have only 2pbs. + Default = 0. + Arg [4] : (optional) int* $fpref + The reference to a number of basepairs to shift this slices five_prime + coordinate by. Normally it would be set to $five_prime_expand. + But in case when $five_prime_expand shift can not be applied and + $force_expand is set to 1, then $$fpref will contain the maximum possible + shift + Arg [5] : (optional) int* $tpref + The reference to a number of basepairs to shift this slices three_prime + coordinate by. Normally it would be set to $three_prime_expand. + But in case when $five_prime_expand shift can not be applied and + $force_expand is set to 1, then $$tpref will contain the maximum possible + shift + Example : my $expanded_slice = $slice->expand( 1000, 1000); + my $contracted_slice = $slice->expand(-1000,-1000); + my $shifted_right_slice = $slice->expand(-1000, 1000); + my $shifted_left_slice = $slice->expand( 1000,-1000); + my $forced_contracted_slice = $slice->expand(-1000,-1000, 1, \$five_prime_shift, \$three_prime_shift); + + Description: Returns a slice which is a resized copy of this slice. The + start and end are moved outwards from the center of the slice + if positive values are provided and moved inwards if negative + values are provided. This slice remains unchanged. A slice + may not be contracted below 1bp but may grow to be arbitrarily + large. + Returntype : Bio::EnsEMBL::Slice + Exceptions : warning if an attempt is made to contract the slice below 1bp + Caller : general + Status : Stable + +=cut + +sub expand { + my $self = shift; + my $five_prime_shift = shift || 0; + my $three_prime_shift = shift || 0; + my $force_expand = shift || 0; + my $fpref = shift; + my $tpref = shift; + + if ( $self->{'seq'} ) { + warning( + "Cannot expand a slice which has a manually attached sequence "); + return undef; + } + + my $new_start; + my $new_end; + my $sshift = $five_prime_shift; + my $eshift = $three_prime_shift; + + if ( $self->{'strand'} != 1 ) { + $eshift = $five_prime_shift; + $sshift = $three_prime_shift; + } + + $new_start = $self->{'start'} - $sshift; + $new_end = $self->{'end'} + $eshift; + +# if($new_start > $new_end) { +# if ($force_expand) { # Apply max possible shift, if force_expand is set +# if ($sshift < 0) { # if we are contracting the slice from the start - move the start just before the end +# $new_start = $new_end - 1; +# $sshift = $self->{start} - $new_start; +# } + +# if($new_start > $new_end) { # if the slice still has a negative length - try to move the end +# if ($eshift < 0) { +# $new_end = $new_start + 1; +# $eshift = $new_end - $self->{end}; +# } +# } +# return the values by which the primes were actually shifted +# $$tpref = $self->{strand} == 1 ? $eshift : $sshift; +# $$fpref = $self->{strand} == 1 ? $sshift : $eshift; +# } +# if($new_start > $new_end) { +# throw('Slice start cannot be greater than slice end'); +# } +# } + + #fastest way to copy a slice is to do a shallow hash copy + my %new_slice = %$self; + $new_slice{'start'} = int($new_start); + $new_slice{'end'} = int($new_end); + + return bless \%new_slice, ref($self); +} ## end sub expand + +=head2 sub_Slice_same + + Arg 1 : int $start + Arg 2 : int $end + Arge [3] : int $strand + Description: Makes another Slice that covers only part of this slice + If a slice is requested which lies outside of the boundaries + of this function will return undef. This means that + behaviour will be consistant whether or not the slice is + attached to the database (i.e. if there is attached sequence + to the slice). Alternatively the expand() method or the + SliceAdaptor::fetch_by_region method can be used instead. + Returntype : Bio::EnsEMBL::Slice or undef if arguments are wrong + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub sub_Slice_same { + my ( $self, $start, $end, $strand ) = @_; + + if ( $start < 1 || $start > $self->{'end'} ) { + # throw( "start argument not valid" ); + return undef; + } + + if ( $end < $start || $end > $self->{'end'} ) { + # throw( "end argument not valid" ) + return undef; + } + + my ( $new_start, $new_end, $new_strand, $new_seq ); + if ( !defined $strand ) { + $strand = 1; + } + + if ( $self->{'strand'} == 1 ) { + $new_start = $self->{'start'} + $start - 1; + $new_end = $self->{'start'} + $end - 1; + $new_strand = $strand; + } else { + $new_start = $self->{'end'} - $end + 1; + $new_end = $self->{'end'} - $start + 1; + $new_strand = -$strand; + } + + if ( defined $self->{'seq'} ) { + $new_seq = $self->subseq( $start, $end, $strand ); + } + + #fastest way to copy a slice is to do a shallow hash copy + my %new_slice = %$self; + $new_slice{'start'} = int($new_start); + $new_slice{'end'} = int($new_end); + $new_slice{'strand'} = $new_strand; + if ($new_seq) { + $new_slice{'seq'} = $new_seq; + } + + return bless \%new_slice, ref($self); +} ## end sub sub_Slice_same + +=head2 seq_region_Slice_same + + Arg [1] : none + Example : $slice = $slice->seq_region_Slice(); + Description: Returns a slice which spans the whole seq_region which this slice + is on. For example if this is a slice which spans a small region + of chromosome X, this method will return a slice which covers the + entire chromosome X. The returned slice will always have strand + of 1 and start of 1. This method cannot be used if the sequence + of the slice has been set manually. + Returntype : Bio::EnsEMBL::Slice + Exceptions : warning if called when sequence of Slice has been set manually. + Caller : general + Status : Stable + +=cut + +sub seq_region_Slice_same { + my $self = shift; + + if ( $self->{'seq'} ) { + warning( + "Cannot get a seq_region_Slice of a slice which has manually " + . "attached sequence " ); + return undef; + } + + # quick shallow copy + my $slice; + %{$slice} = %{$self}; + bless $slice, ref($self); + + $slice->{'start'} = 1; + $slice->{'end'} = $slice->{'seq_region_length'}; + $slice->{'strand'} = 1; + + return $slice; +} + +=head2 get_seq_region_id_same + + Arg [1] : none + Example : my $seq_region_id = $slice->get_seq_region_id(); + Description: Gets the internal identifier of the seq_region that this slice + is on. Note that this function will not work correctly if this + slice does not have an attached adaptor. Also note that it may + be better to go through the SliceAdaptor::get_seq_region_id + method if you are working with multiple databases since is + possible to work with slices from databases with different + internal seq_region identifiers. + Returntype : int or undef if slices does not have attached adaptor + Exceptions : warning if slice is not associated with a SliceAdaptor + Caller : assembly loading scripts, general + Status : Stable + +=cut + +sub get_seq_region_id_same { + my ($self) = @_; + + if ( $self->adaptor ) { + return $self->adaptor->get_seq_region_id($self); + } else { + warning('Cannot retrieve seq_region_id without attached adaptor.'); + return undef; + } +} + +=head2 get_all_Attributes + + Arg [1] : optional string $attrib_code + The code of the attribute type to retrieve values for. + Example : ($htg_phase) = @{$slice->get_all_Attributes('htg_phase')}; + @slice_attributes = @{$slice->get_all_Attributes()}; + Description: Gets a list of Attributes of this slice''s seq_region. + Optionally just get Attrubutes for given code. + Returntype : listref Bio::EnsEMBL::Attribute + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + my $result; + my @results; + + if ( !$self->adaptor() ) { + warning('Cannot get attributes without an adaptor.'); + return []; + } + + my $attribute_adaptor = $self->adaptor->db->get_AttributeAdaptor(); + +## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + my $pta_ref = []; + + if ( defined $attrib_code ) { + my @res1 = + grep { uc( $_->code() ) eq uc($attrib_code) } + @{ $attribute_adaptor->fetch_all_by_Slice($sl1) }; + my @res2 = + grep { uc( $_->code() ) eq uc($attrib_code) } + @{ $attribute_adaptor->fetch_all_by_Slice($sl2) }; + my @res; + push @res, @res1, @res2; + $result = \@res; + } else { + my @res1 = @{ $attribute_adaptor->fetch_all_by_Slice($sl1) }; + my @res2 = @{ $attribute_adaptor->fetch_all_by_Slice($sl2) }; + my @res; + push @res, @res1, @res2; + $result = \@res; + } + +## circular EOF + + # if( defined $attrib_code ) { + # @results = grep { uc($_->code()) eq uc($attrib_code) } + # @{$attribute_adaptor->fetch_all_by_Slice( $self )}; + # $result = \@results; + # } else { + # $result = $attribute_adaptor->fetch_all_by_Slice( $self ); + # } + + return $result; +} ## end sub get_all_Attributes + +=head2 get_all_PredictionTranscripts + + Arg [1] : (optional) string $logic_name + The name of the analysis used to generate the prediction + transcripts obtained. + Arg [2] : (optional) boolean $load_exons + If set to true will force loading of all PredictionExons + immediately rather than loading them on demand later. This + is faster if there are a large number of PredictionTranscripts + and the exons will be used. + Example : @transcripts = @{$slice->get_all_PredictionTranscripts}; + Description: Retrieves the list of prediction transcripts which overlap + this slice with logic_name $logic_name. If logic_name is + not defined then all prediction transcripts are retrieved. + Returntype : listref of Bio::EnsEMBL::PredictionTranscript + Exceptions : warning if slice does not have attached adaptor + Caller : none + Status : Stable + +=cut + +sub get_all_PredictionTranscripts { + my ( $self, $logic_name, $load_exons ) = @_; + + if ( !$self->adaptor() ) { + warning( + 'Cannot get PredictionTranscripts without attached adaptor'); + return []; + } + my $pta = $self->adaptor()->db()->get_PredictionTranscriptAdaptor(); + +## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = + @{ $pta->fetch_all_by_Slice( $sl1, $logic_name, $load_exons ) }; + @arr2 = + @{ $pta->fetch_all_by_Slice( $sl2, $logic_name, $load_exons ) }; + push @arr, @arr1, @arr2; + return \@arr; +## circular EOF + + #return $pta->fetch_all_by_Slice($sl1, $logic_name, $load_exons); +} ## end sub get_all_PredictionTranscripts + +=head2 get_all_DnaAlignFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the dna align features + to obtain. + Arg [2] : (optional) float $score + The mimimum score of the features to retrieve + Arg [3] : (optional) string $dbtype + The name of an attached database to retrieve the features from + instead, e.g. 'otherfeatures'. + Arg [4] : (optional) float hcoverage + The minimum hcoverage od the featurs to retrieve + Example : @dna_dna_align_feats = @{$slice->get_all_DnaAlignFeatures}; + Description: Retrieves the DnaDnaAlignFeatures which overlap this slice with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::DnaDnaAlignFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_DnaAlignFeatures { + my ( $self, $logic_name, $score, $dbtype, $hcoverage ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get DnaAlignFeatures without attached adaptor'); + return []; + } + + my $db; + + if ($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if ( !$db ) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $dafa = $db->get_DnaAlignFeatureAdaptor(); + + if ( defined($score) and defined($hcoverage) ) { + warning "cannot specify score and hcoverage. Using score only"; + } + +## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + my %union; + if ( defined($score) ) { + @arr1 = @{ $dafa->fetch_all_by_Slice_and_score( $sl1, $score, + $logic_name ) }; + @arr2 = @{ $dafa->fetch_all_by_Slice_and_score( $sl2, $score, + $logic_name ) }; + push @arr, @arr1, @arr2; + return \@arr; + } + @arr1 = @{ + $dafa->fetch_all_by_Slice_and_hcoverage( $sl1, $hcoverage, + $logic_name ) }; + @arr2 = @{ + $dafa->fetch_all_by_Slice_and_hcoverage( $sl2, $hcoverage, + $logic_name ) }; + push @arr, @arr1, @arr2; + return \@arr; + +## circular EOF + +# if(defined($score)){ +# return $dafa->fetch_all_by_Slice_and_score($self,$score, $logic_name); +# } +# return $dafa->fetch_all_by_Slice_and_hcoverage($self,$hcoverage, $logic_name); +} ## end sub get_all_DnaAlignFeatures + +=head2 get_all_ProteinAlignFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the protein align features + to obtain. + Arg [2] : (optional) float $score + The mimimum score of the features to retrieve + Arg [3] : (optional) string $dbtype + The name of an attached database to retrieve features from + instead. + Arg [4] : (optional) float hcoverage + The minimum hcoverage od the featurs to retrieve + Example : @dna_pep_align_feats = @{$slice->get_all_ProteinAlignFeatures}; + Description: Retrieves the DnaPepAlignFeatures which overlap this slice with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::DnaPepAlignFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_ProteinAlignFeatures { + my ( $self, $logic_name, $score, $dbtype, $hcoverage ) = @_; + if ( !$self->adaptor() ) { + warning('Cannot get ProteinAlignFeatures without attached adaptor'); + return []; + } + my $db; + if ($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if ( !$db ) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $pafa = $db->get_ProteinAlignFeatureAdaptor(); + + if ( defined($score) and defined($hcoverage) ) { + warning "cannot specify score and hcoverage. Using score only"; + } + + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + my ( @arr, @arr1, @arr2 ); + if ( defined($score) ) { + @arr1 = @{ $pafa->fetch_all_by_Slice_and_score( $sl1, $score, + $logic_name ) }; + @arr2 = @{ $pafa->fetch_all_by_Slice_and_score( $sl2, $score, + $logic_name ) }; + + push @arr, @arr1, @arr2; + return \@arr; + } + + @arr1 = @{ + $pafa->fetch_all_by_Slice_and_hcoverage( $sl1, $hcoverage, + $logic_name ) }; + @arr2 = @{ + $pafa->fetch_all_by_Slice_and_hcoverage( $sl2, $hcoverage, + $logic_name ) }; + + push @arr, @arr1, @arr2; + return \@arr; + +# if(defined($score)){ +# return $pafa->fetch_all_by_Slice_and_score($self,$score, $logic_name); +# } +# return $pafa->fetch_all_by_Slice_and_hcoverage($self,$hcoverage, $logic_name); + +} ## end sub get_all_ProteinAlignFeatures + +=head2 get_all_SimilarityFeatures + + Arg [1] : (optional) string $logic_name + the name of the analysis performed on the features to retrieve + Arg [2] : (optional) float $score + the lower bound of the score of the features to be retrieved + Example : @feats = @{$slice->get_all_SimilarityFeatures}; + Description: Retrieves all dna_align_features and protein_align_features + with analysis named $logic_name and with score above $score. + It is probably faster to use get_all_ProteinAlignFeatures or + get_all_DnaAlignFeatures if a sepcific feature type is desired. + If $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_SimilarityFeatures { + my ( $self, $logic_name, $score ) = @_; + + my @out = (); + + push @out, + @{ $self->get_all_ProteinAlignFeatures( $logic_name, $score ) }; + push @out, + @{ $self->get_all_DnaAlignFeatures( $logic_name, $score ) }; + + return \@out; +} + +=head2 get_all_SimpleFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the simple features + to obtain. + Arg [2] : (optional) float $score + The mimimum score of the features to retrieve + Example : @simple_feats = @{$slice->get_all_SimpleFeatures}; + Description: Retrieves the SimpleFeatures which overlap this slice with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::SimpleFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_SimpleFeatures { + my ( $self, $logic_name, $score, $dbtype ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get SimpleFeatures without attached adaptor'); + return []; + } + + my $db; + if ($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if ( !$db ) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $sfa = $db->get_SimpleFeatureAdaptor(); + + return $sfa->fetch_all_by_Slice_and_score( $self, $score, + $logic_name ); +} + +#### STOP !!! ######################################## + +=head2 get_all_RepeatFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the repeat features + to obtain. + Arg [2] : (optional) string $repeat_type + Limits features returned to those of the specified repeat_type + Arg [3] : (optional) string $db + Key for database e.g. core/vega/cdna/.... + Example : @repeat_feats = @{$slice->get_all_RepeatFeatures(undef,'LTR')}; + Description: Retrieves the RepeatFeatures which overlap with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. + Returntype : listref of Bio::EnsEMBL::RepeatFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_RepeatFeatures { + my ( $self, $logic_name, $repeat_type, $dbtype ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get RepeatFeatures without attached adaptor'); + return []; + } + + my $db; + if ($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if ( !$db ) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $rpfa = $db->get_RepeatFeatureAdaptor(); + + return $rpfa->fetch_all_by_Slice( $self, $logic_name, $repeat_type ); +} + +=head2 get_all_LD_values + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Population $population + Description : returns all LD values on this slice. This function will only work correctly if the variation + database has been attached to the core database. If the argument is passed, will return the LD information + in that population + ReturnType : Bio::EnsEMBL::Variation::LDFeatureContainer + Exceptions : none + Caller : contigview, snpview + Status : At Risk + : Variation database is under development. + +=cut + +sub get_all_LD_values { + my $self = shift; + my $population = shift; + + if ( !$self->adaptor() ) { + warning('Cannot get LDFeatureContainer without attached adaptor'); + return []; + } + + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless ($variation_db) { + warning( "Variation database must be attached to core database to " + . "retrieve variation information" ); + return []; + } + + my $ld_adaptor = $variation_db->get_LDFeatureContainerAdaptor; + + if ($ld_adaptor) { + return $ld_adaptor->fetch_by_Slice( $self, $population ); + } else { + return []; + + } + +# my $ld_adaptor = Bio::EnsEMBL::DBSQL::MergedAdaptor->new(-species => $self->adaptor()->db()->species, -type => "LDFeatureContainer"); + +# if( $ld_adaptor ) { +# my $ld_values = $ld_adaptor->fetch_by_Slice($self,$population); +# if (@{$ld_values} > 1){ +# warning("More than 1 variation database attached. Trying to merge LD results"); +# my $ld_value_merged = shift @{$ld_values}; +# #with more than 1 variation database attached, will try to merge in one single LDContainer object. +# foreach my $ld (@{$ld_values}){ +# #copy the ld values to the result hash +# foreach my $key (keys %{$ld->{'ldContainer'}}){ +# $ld_value_merged->{'ldContainer'}->{$key} = $ld->{'ldContainer'}->{$key}; +# } +# #and copy the variationFeatures as well +# foreach my $key (keys %{$ld->{'variationFeatures'}}){ +# $ld_value_merged->{'variationFeatures'}->{$key} = $ld->{'variationFeatures'}->{$key}; +# } + +# } +# return $ld_value_merged; +# } +# else{ +# return shift @{$ld_values}; +# } +# } else { +# warning("Variation database must be attached to core database to " . +# "retrieve variation information" ); +# return []; +# } +} ## end sub get_all_LD_values + +=head2 get_all_VariationFeatures + + Args : $filter [optional] + Description:returns all variation features on this slice. This function will only work + correctly if the variation database has been attached to the core database. + If $filter is "genotyped" return genotyped Snps only... (nice likkle hack); + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : contigview, snpview + Status : At Risk + : Variation database is under development. + +=cut + +sub get_all_VariationFeatures { + my $self = shift; + my $filter = shift; + + $filter ||= ''; + if ( !$self->adaptor() ) { + warning('Cannot get variation features without attached adaptor'); + return []; + } + + my $vf_adaptor = + Bio::EnsEMBL::DBSQL::MergedAdaptor->new( + -species => $self->adaptor()->db()->species, + -type => "VariationFeature" ); + if ($vf_adaptor) { + if ( $filter eq 'genotyped' ) { + return $vf_adaptor->fetch_all_genotyped_by_Slice($self); + } else { + return $vf_adaptor->fetch_all_by_Slice($self); + } + } else { + warning( "Variation database must be attached to core database to " + . "retrieve variation information" ); + return []; + } +} ## end sub get_all_VariationFeatures + +=head2 get_all_IndividualSlice + + Args : none + Example : my $individualSlice = $slice->get_by_Population($population); + Description : Gets the specific Slice for all the individuls in the population + ReturnType : listref of Bio::EnsEMB::IndividualSlice + Exceptions : none + Caller : general + +=cut + +sub get_all_IndividualSlice { + my $self = shift; + + my $individualSliceFactory = + Bio::EnsEMBL::IndividualSliceFactory->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, ); + return $individualSliceFactory->get_all_IndividualSlice(); +} + +=head2 get_by_Individual + + Arg[1] : Bio::EnsEMBL::Variation::Individual $individual + Example : my $individualSlice = $slice->get_by_Individual($individual); + Description : Gets the specific Slice for the individual + ReturnType : Bio::EnsEMB::IndividualSlice + Exceptions : none + Caller : general + +=cut + +sub get_by_Individual { + my $self = shift; + my $individual = shift; + + return Bio::EnsEMBL::IndividualSlice->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + # -SEQ => $self->{'seq'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + -INDIVIDUAL => $individual ); + +} + +=head2 get_by_strain + + Arg[1] : string $strain + Example : my $strainSlice = $slice->get_by_strain($strain); + Description : Gets the specific Slice for the strain + ReturnType : Bio::EnsEMB::StrainSlice + Exceptions : none + Caller : general + +=cut + +sub get_by_strain { + my $self = shift; + my $strain_name = shift; + + return + Bio::EnsEMBL::StrainSlice->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + -SEQ => $self->{'seq'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + -STRAIN_NAME => $strain_name ); + +} + +sub calculate_theta { + my $self = shift; + my $strains = shift; + my $feature = shift + ; #optional parameter. Name of the feature in the Slice you want to calculate + + if ( !$self->adaptor() ) { + warning('Cannot get variation features without attached adaptor'); + return 0; + } + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless ($variation_db) { + warning( "Variation database must be attached to core database to " + . "retrieve variation information" ); + return 0; + } + + #need to get coverage regions for the slice in the different strains + my $coverage_adaptor = $variation_db->get_ReadCoverageAdaptor; + my $strain; + my $differences = []; + my $slices = []; + if ($coverage_adaptor) { + my $num_strains = scalar( @{$strains} ) + 1; + if ( !defined $feature ) { + #we want to calculate for the whole slice + push @{$slices}, + $self; #add the slice as the slice to calculate the theta value + } else { + #we have features, get the slices for the different features + my $features = $self->get_all_Exons(); + map { push @{$slices}, $_->feature_Slice } + @{$features}; #add the slices of the features + } + my $length_regions = 0; + my $snps = 0; + my $theta = 0; + my $last_position = 0; + #get all the differences in the slice coordinates + foreach my $strain_name ( @{$strains} ) { + my $strain = $self->get_by_strain($strain_name) + ; #get the strainSlice for the strain + + my $results = $strain->get_all_differences_Slice; + push @{$differences}, @{$results} if ( defined $results ); + } +#when we finish, we have, in max_level, the regions covered by all the sample +#sort the differences by the genomic position + my @differences_sorted = + sort { $a->start <=> $b->start } @{$differences}; + foreach my $slice ( @{$slices} ) { + my $regions_covered = + $coverage_adaptor->fetch_all_regions_covered( $slice, + $strains ); + if ( defined $regions_covered ) { + foreach my $range ( @{$regions_covered} ) { + $length_regions += + ( $range->[1] - $range->[0] ) + + 1; #add the length of the genomic region + for ( my $i = $last_position; $i < @differences_sorted; $i++ ) + { + if ( $differences_sorted[$i]->start >= $range->[0] + && $differences_sorted[$i]->end <= $range->[1] ) + { + $snps++; #count differences in the region + } elsif ( $differences_sorted[$i]->end > $range->[1] ) { + $last_position = $i; + last; + } + } + } + #when all the ranges have been iterated, calculate rho + #this is an intermediate variable called a in the formula + # a = sum i=2..strains 1/i-1 + } + } ## end foreach my $slice ( @{$slices...}) + my $a = _calculate_a($num_strains); + $theta = $snps/( $a*$length_regions ); + return $theta; + } else { + return 0; + } +} ## end sub calculate_theta + +sub _calculate_a { + my $max_level = shift; + + my $a = 0; + for ( my $i = 2; $i <= $max_level + 1; $i++ ) { + $a += 1/( $i - 1 ); + } + return $a; +} + +sub calculate_pi { + my $self = shift; + my $strains = shift; + my $feature = shift; + + if ( !$self->adaptor() ) { + warning('Cannot get variation features without attached adaptor'); + return 0; + } + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless ($variation_db) { + warning( "Variation database must be attached to core database to " + . "retrieve variation information" ); + return 0; + } + + #need to get coverage regions for the slice in the different strains + my $coverage_adaptor = $variation_db->get_ReadCoverageAdaptor; + my $differences = []; + my $slices = []; + if ($coverage_adaptor) { + my $num_strains = scalar( @{$strains} ) + 1; + if ( !defined $feature ) { + #we want to calculate for the whole slice + push @{$slices}, + $self; #add the slice as the slice to calculate the theta value + } else { + #we have features, get the slices for the different features + my $features = $self->get_all_Exons(); + map { push @{$slices}, $_->feature_Slice } + @{$features}; #add the slices of the features + } + my @range_differences = (); + my $pi = 0; + my $regions = 0; + my $last_position = + 0; #last position visited in the sorted list of differences + my $triallelic = 0; + my $is_triallelic = 0; + foreach my $slice ( @{$slices} ) { + + foreach my $strain_name ( @{$strains} ) { + my $strain = $slice->get_by_strain($strain_name) + ; #get the strainSlice for the strain + my $results = $strain->get_all_differences_Slice; + push @{$differences}, @{$results} if ( defined $results ); + } + my @differences_sorted = + sort { $a->start <=> $b->start } @{$differences}; + + my $regions_covered = + $coverage_adaptor->fetch_all_regions_covered( $slice, + $strains ); +#when we finish, we have, in max_level, the regions covered by all the sample +#sort the differences + if ( defined $regions_covered ) { + foreach my $range ( @{$regions_covered} ) { + for ( my $i = $last_position; $i < @differences_sorted; $i++ ) + { + if ( $differences_sorted[$i]->start >= $range->[0] + && $differences_sorted[$i]->end <= $range->[1] ) + { + #check wether it is the same region or different + if ( !defined $range_differences[0] + || ( $differences_sorted[$i]->start == + $range_differences[0]->start ) ) + { + if ( defined $range_differences[0] + && ( $differences_sorted[$i]->allele_string ne + $range_differences[0]->allele_string ) ) + { + $is_triallelic = 1; + } + push @range_differences, $differences_sorted[$i]; + } else { + #new site, calc pi for the previous one + $pi += + 2* + ( @range_differences/($num_strains) )* + ( 1 - ( @range_differences/$num_strains ) ); + if ($is_triallelic) { + $triallelic++; + $is_triallelic = 0; + } + $regions++; + @range_differences = (); + #and start a new range + push @range_differences, $differences_sorted[$i]; + } + } elsif ( $differences_sorted[$i]->end > $range->[1] ) { + $last_position = $i; + last; + } + } ## end for ( my $i = $last_position...) + #calculate pi for last site, if any + if ( defined $range_differences[0] ) { + $pi += + 2* + ( @range_differences/$num_strains )* + ( 1 - ( @range_differences/$num_strains ) ); + $regions++; + } + } ## end foreach my $range ( @{$regions_covered...}) + } ## end if ( defined $regions_covered) + $pi = $pi/$regions; #calculate average pi + print +"Regions with variations in region $regions and triallelic $triallelic\n\n"; + } ## end foreach my $slice ( @{$slices...}) + return $pi; + } else { + return 0; + } + +} ## end sub calculate_pi + +=head2 get_all_genotyped_VariationFeatures + + Args : none + Description: returns all variation features on this slice that have been genotyped. + This function will only work correctly if the variation database has + been attached to the core database. + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : contigview, snpview, ldview + Status : At Risk + : Variation database is under development. + +=cut + +sub get_all_genotyped_VariationFeatures { + my $self = shift; + my $vfa; + if ( !$self->adaptor() ) { + warning('Cannot get variation features without attached adaptor'); + return []; + } + + my $vf_adaptor = + Bio::EnsEMBL::DBSQL::MergedAdaptor->new( + -species => $self->adaptor()->db()->species, + -type => "VariationFeature" ); + + if ($vf_adaptor) { + return $vf_adaptor->fetch_all_genotyped_by_Slice($self); + } else { + warning( "Variation database must be attached to core database to " + . "retrieve variation information" ); + return []; + } +} + +=head2 get_all_SNPs + + Description: DEPRECATED. Use get_all_VariationFeatures instead + +=cut + +sub get_all_SNPs { + my $self = shift; + + deprecate('Use get_all_VariationFeatures() instead.'); + + my $snps; + my $vf = $self->get_all_genotyped_VariationFeatures(); + if ( $vf->[0] ) { + #necessary to convert the VariationFeatures into SNP objects + foreach my $variation_feature ( @{$vf} ) { + push @{$snps}, $variation_feature->convert_to_SNP(); + } + return $snps; + } else { + return []; + } +} + +=head2 get_all_genotyped_SNPs + + Description : DEPRECATED. Use get_all_genotyped_VariationFeatures instead + +=cut + +sub get_all_genotyped_SNPs { + my $self = shift; + + deprecate("Use get_all_genotyped_VariationFeatures instead"); + my $vf = $self->get_all_genotyped_VariationFeatures; + my $snps; + if ( $vf->[0] ) { + foreach my $variation_feature ( @{$vf} ) { + push @{$snps}, $variation_feature->convert_to_SNP(); + } + return $snps; + } else { + return []; + } +} + +sub get_all_SNPs_transcripts { + my $self = shift; + + deprecate("DEPRECATED"); + + return []; + +} + +=head2 get_all_Genes + + Arg [1] : (optional) string $logic_name + The name of the analysis used to generate the genes to retrieve + Arg [2] : (optional) string $dbtype + The dbtype of genes to obtain. This assumes that the db has + been added to the DBAdaptor under this name (using the + DBConnection::add_db_adaptor method). + Arg [3] : (optional) boolean $load_transcripts + If set to true, transcripts will be loaded immediately rather + than being lazy-loaded on request. This will result in a + significant speed up if the Transcripts and Exons are going to + be used (but a slow down if they are not). + Arg [4] : (optional) string $source + The source of the genes to retrieve. + Arg [5] : (optional) string $biotype + The biotype of the genes to retrieve. + Example : @genes = @{$slice->get_all_Genes}; + Description: Retrieves all genes that overlap this slice. + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : none + Status : Stable + +=cut + +sub get_all_Genes { + my ( $self, $logic_name, $dbtype, $load_transcripts, $source, + $biotype ) + = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get Genes without attached adaptor'); + return []; + } + + my $ga; + if ($dbtype) { + my $db = $reg->get_db( $self->adaptor()->db(), $dbtype ); + if ( defined($db) ) { + $ga = $reg->get_adaptor( $db->species(), $db->group(), "Gene" ); + } else { + $ga = $reg->get_adaptor( $self->adaptor()->db()->species(), + $dbtype, "Gene" ); + } + if ( !defined $ga ) { + warning("$dbtype genes not available"); + return []; + } + } else { + $ga = $self->adaptor->db->get_GeneAdaptor(); + } + +## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ + $ga->fetch_all_by_Slice( $sl1, $logic_name, $load_transcripts, + $source, $biotype ) }; + @arr2 = @{ + $ga->fetch_all_by_Slice( $sl2, $logic_name, $load_transcripts, + $source, $biotype ) }; + push @arr, @arr1, @arr2; + return \@arr; + + ## circular EOF + +} ## end sub get_all_Genes + +=head2 get_all_Genes_by_type + + Arg [1] : string $type + The biotype of genes wanted. + Arg [2] : (optional) string $logic_name + Arg [3] : (optional) boolean $load_transcripts + If set to true, transcripts will be loaded immediately rather + than being lazy-loaded on request. This will result in a + significant speed up if the Transcripts and Exons are going to + be used (but a slow down if they are not). + Example : @genes = @{$slice->get_all_Genes_by_type('protein_coding', + 'ensembl')}; + Description: Retrieves genes that overlap this slice of biotype $type. + This is primarily used by the genebuilding code when several + biotypes of genes are used. + + The logic name is the analysis of the genes that are retrieved. + If not provided all genes will be retrieved instead. + + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : genebuilder, general + Status : Stable + +=cut + +sub get_all_Genes_by_type { + my ( $self, $type, $logic_name, $load_transcripts ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get Genes without attached adaptor'); + return []; + } + + return + $self->get_all_Genes( $logic_name, undef, $load_transcripts, undef, + $type ); +} + +=head2 get_all_Genes_by_source + + Arg [1] : string source + Arg [2] : (optional) boolean $load_transcripts + If set to true, transcripts will be loaded immediately rather + than being lazy-loaded on request. This will result in a + significant speed up if the Transcripts and Exons are going to + be used (but a slow down if they are not). + Example : @genes = @{$slice->get_all_Genes_by_source('ensembl')}; + Description: Retrieves genes that overlap this slice of source $source. + + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Genes_by_source { + my ( $self, $source, $load_transcripts ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get Genes without attached adaptor'); + return []; + } + + return $self->get_all_Genes( undef, undef, $load_transcripts, + $source ); +} + +=head2 get_all_Transcripts + + Arg [1] : (optional) boolean $load_exons + If set to true exons will not be lazy-loaded but will instead + be loaded right away. This is faster if the exons are + actually going to be used right away. + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Arg [3] : (optional) string $db_type + Example : @transcripts = @{$slice->get_all_Transcripts)_}; + Description: Gets all transcripts which overlap this slice. If you want to + specify a particular analysis or type, then you are better off + using get_all_Genes or get_all_Genes_by_type and iterating + through the transcripts of each gene. + Returntype : reference to a list of Bio::EnsEMBL::Transcripts + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Transcripts { + my $self = shift; + my $load_exons = shift; + my $logic_name = shift; + my $dbtype = shift; + if ( !$self->adaptor() ) { + warning('Cannot get Transcripts without attached adaptor'); + return []; + } + + my $ta; + if ($dbtype) { + my $db = $reg->get_db( $self->adaptor()->db(), $dbtype ); + if ( defined($db) ) { + $ta = + $reg->get_adaptor( $db->species(), $db->group(), "Transcript" ); + } else { + $ta = $reg->get_adaptor( $self->adaptor()->db()->species(), + $dbtype, "Transcript" ); + } + if ( !defined $ta ) { + warning("$dbtype genes not available"); + return []; + } + } else { + $ta = $self->adaptor->db->get_TranscriptAdaptor(); + } + +## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = + @{ $ta->fetch_all_by_Slice( $sl1, $load_exons, $logic_name ) }; + @arr2 = + @{ $ta->fetch_all_by_Slice( $sl2, $load_exons, $logic_name ) }; + push @arr, @arr1, @arr2; + return \@arr; +## circular EOF + #return $ta->fetch_all_by_Slice($self, $load_exons, $logic_name); +} ## end sub get_all_Transcripts + +=head2 get_all_Exons + + Arg [1] : none + Example : @exons = @{$slice->get_all_Exons}; + Description: Gets all exons which overlap this slice. Note that these exons + will not be associated with any transcripts, so this may not + be terribly useful. + Returntype : reference to a list of Bio::EnsEMBL::Exons + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Exons { + my $self = shift; + + if ( !$self->adaptor() ) { + warning('Cannot get Exons without attached adaptor'); + return []; + } + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = + @{ $sl1->adaptor->db->get_ExonAdaptor->fetch_all_by_Slice($sl1) }; + @arr2 = + @{ $sl2->adaptor->db->get_ExonAdaptor->fetch_all_by_Slice($sl2) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + + #rturn $self->adaptor->db->get_ExonAdaptor->fetch_all_by_Slice($self); +} ## end sub get_all_Exons + +=head2 get_all_QtlFeatures + + Args : none + Description: returns overlapping QtlFeatures + Returntype : listref Bio::EnsEMBL::Map::QtlFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_QtlFeatures { + my $self = shift; + + if ( !$self->adaptor() ) { + warning('Cannot get QtlFeatures without attached adaptor'); + return []; + } + + my $qfAdaptor; + if ( $self->adaptor() ) { + $qfAdaptor = $self->adaptor()->db()->get_QtlFeatureAdaptor(); + } else { + return []; + } + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ $qfAdaptor->fetch_all_by_Slice_constraint($sl1) }; + @arr2 = @{ $qfAdaptor->fetch_all_by_Slice_constraint($sl2) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + + #return $qfAdaptor->fetch_all_by_Slice_constraint( $self ); +} ## end sub get_all_QtlFeatures + +=head2 get_all_KaryotypeBands + + Arg [1] : none + Example : @kary_bands = @{$slice->get_all_KaryotypeBands}; + Description: Retrieves the karyotype bands which this slice overlaps. + Returntype : listref oif Bio::EnsEMBL::KaryotypeBands + Exceptions : none + Caller : general, contigview + Status : Stable + +=cut + +sub get_all_KaryotypeBands { + my ($self) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get KaryotypeBands without attached adaptor'); + return []; + } + + my $kadp = $self->adaptor->db->get_KaryotypeBandAdaptor(); + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ $kadp->fetch_all_by_Slice($sl1) }; + @arr2 = @{ $kadp->fetch_all_by_Slice($sl2) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + + #return $kadp->fetch_all_by_Slice($self); +} ## end sub get_all_KaryotypeBands + +=head2 get_repeatmasked_seq + + Arg [1] : listref of strings $logic_names (optional) + Arg [2] : int $soft_masking_enable (optional) + Arg [3] : hash reference $not_default_masking_cases (optional, default is {}) + The values are 0 or 1 for hard and soft masking respectively + The keys of the hash should be of 2 forms + "repeat_class_" . $repeat_consensus->repeat_class, + e.g. "repeat_class_SINE/MIR" + "repeat_name_" . $repeat_consensus->name + e.g. "repeat_name_MIR" + depending on which base you want to apply the not default + masking either the repeat_class or repeat_name. Both can be + specified in the same hash at the same time, but in that case, + repeat_name setting has priority over repeat_class. For example, + you may have hard masking as default, and you may want soft + masking of all repeat_class SINE/MIR, but repeat_name AluSp + (which are also from repeat_class SINE/MIR). + Your hash will be something like {"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0} + Example : $rm_slice = $slice->get_repeatmasked_seq(); + $softrm_slice = $slice->get_repeatmasked_seq(['RepeatMask'],1); + Description: Returns Bio::EnsEMBL::Slice that can be used to create repeat + masked sequence instead of the regular sequence. + Sequence returned by this new slice will have repeat regions + hardmasked by default (sequence replaced by N) or + or soft-masked when arg[2] = 1 (sequence in lowercase) + Will only work with database connection to get repeat features. + Returntype : Bio::EnsEMBL::RepeatMaskedSlice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_repeatmasked_seq { + my ( $self, $logic_names, $soft_mask, $not_default_masking_cases ) = + @_; + + return + Bio::EnsEMBL::RepeatMaskedSlice->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + -SEQ => $self->{'seq'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + -REPEAT_MASK => $logic_names, + -SOFT_MASK => $soft_mask, + -NOT_DEFAULT_MASKING_CASES => $not_default_masking_cases + ); +} + +=head2 _mask_features + + Arg [1] : reference to a string $dnaref + Arg [2] : array_ref $repeats + reference to a list Bio::EnsEMBL::RepeatFeature + give the list of coordinates to replace with N or with + lower case + Arg [3] : int $soft_masking_enable (optional) + Arg [4] : hash reference $not_default_masking_cases (optional, default is {}) + The values are 0 or 1 for hard and soft masking respectively + The keys of the hash should be of 2 forms + "repeat_class_" . $repeat_consensus->repeat_class, + e.g. "repeat_class_SINE/MIR" + "repeat_name_" . $repeat_consensus->name + e.g. "repeat_name_MIR" + depending on which base you want to apply the not default masking either + the repeat_class or repeat_name. Both can be specified in the same hash + at the same time, but in that case, repeat_name setting has priority over + repeat_class. For example, you may have hard masking as default, and + you may want soft masking of all repeat_class SINE/MIR, + but repeat_name AluSp (which are also from repeat_class SINE/MIR). + Your hash will be something like {"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0} + Description: replaces string positions described in the RepeatFeatures + with Ns (default setting), or with the lower case equivalent + (soft masking). The reference to a dna string which is passed + is changed in place. + Returntype : none + Exceptions : none + Caller : seq + Status : Stable + +=cut + +sub _mask_features { + my ( $self, $dnaref, $repeats, $soft_mask, + $not_default_masking_cases ) = @_; + + $soft_mask = 0 unless ( defined $soft_mask ); + $not_default_masking_cases = {} + unless ( defined $not_default_masking_cases ); + + # explicit CORE::length call, to avoid any confusion with the Slice + # length method + my $dnalen = CORE::length($$dnaref); + +REP: foreach my $old_f ( @{$repeats} ) { + my $f = $old_f->transfer($self); + my $start = $f->start; + my $end = $f->end; + my $length = ( $end - $start ) + 1; + + # check if we get repeat completely outside of expected slice range + if ( $end < 1 || $start > $dnalen ) { + # warning("Unexpected: Repeat completely outside slice coordinates."); + next REP; + } + + # repeat partly outside slice range, so correct + # the repeat start and length to the slice size if needed + if ( $start < 1 ) { + $start = 1; + $length = ( $end - $start ) + 1; + } + + # repeat partly outside slice range, so correct + # the repeat end and length to the slice size if needed + if ( $end > $dnalen ) { + $end = $dnalen; + $length = ( $end - $start ) + 1; + } + + $start--; + + my $padstr; +# if we decide to define masking on the base of the repeat_type, we'll need +# to add the following, and the other commented line few lines below. +# my $rc_type = "repeat_type_" . $f->repeat_consensus->repeat_type; + my $rc_class = "repeat_class_" . $f->repeat_consensus->repeat_class; + my $rc_name = "repeat_name_" . $f->repeat_consensus->name; + + my $masking_type; +# $masking_type = $not_default_masking_cases->{$rc_type} if (defined $not_default_masking_cases->{$rc_type}); + $masking_type = $not_default_masking_cases->{$rc_class} + if ( defined $not_default_masking_cases->{$rc_class} ); + $masking_type = $not_default_masking_cases->{$rc_name} + if ( defined $not_default_masking_cases->{$rc_name} ); + + $masking_type = $soft_mask unless ( defined $masking_type ); + + if ($masking_type) { + $padstr = lc substr( $$dnaref, $start, $length ); + } else { + $padstr = 'N' x $length; + } + substr( $$dnaref, $start, $length ) = $padstr; + } ## end foreach my $old_f ( @{$repeats...}) +} ## end sub _mask_features + +=head2 get_all_SearchFeatures + + Arg [1] : scalar $ticket_ids + Example : $slice->get_all_SearchFeatures('BLA_KpUwwWi5gY'); + Description: Retreives all search features for stored blast + results for the ticket that overlap this slice + Returntype : listref of Bio::EnsEMBL::SeqFeatures + Exceptions : none + Caller : general (webby!) + Status : Stable + +=cut + +sub get_all_SearchFeatures { + my $self = shift; + my $ticket = shift; + local $_; + unless ($ticket) { + throw("ticket argument is required"); + } + + if ( !$self->adaptor() ) { + warning("Cannot get SearchFeatures without an attached adaptor"); + return []; + } + + my $sfa = $self->adaptor()->db()->get_db_adaptor('blast'); + + my $offset = $self->start - 1; + + my $features = $sfa + ? $sfa->get_all_SearchFeatures( $ticket, $self->seq_region_name, + $self->start, $self->end ) + : []; + + foreach (@$features) { + $_->start( $_->start - $offset ); + $_->end( $_->end - $offset ); + } + return $features; + +} ## end sub get_all_SearchFeatures + +=head2 get_all_AssemblyExceptionFeatures + + Arg [1] : string $set (optional) + Example : $slice->get_all_AssemblyExceptionFeatures(); + Description: Retreives all misc features which overlap this slice. If + a set code is provided only features which are members of + the requested set are returned. + Returntype : listref of Bio::EnsEMBL::AssemblyExceptionFeatures + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_AssemblyExceptionFeatures { + my $self = shift; + my $misc_set = shift; + + my $adaptor = $self->adaptor(); + + if ( !$adaptor ) { + warning('Cannot retrieve features without attached adaptor.'); + return []; + } + + my $aefa = $adaptor->db->get_AssemblyExceptionFeatureAdaptor(); + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ $aefa->fetch_all_by_Slice($sl1) }; + @arr2 = @{ $aefa->fetch_all_by_Slice($sl2) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + #return $aefa->fetch_all_by_Slice($self); +} ## end sub get_all_AssemblyExceptionFeatures + +=head2 get_all_MiscFeatures + + Arg [1] : string $set (optional) + Arg [2] : string $database (optional) + Example : $slice->get_all_MiscFeatures('cloneset'); + Description: Retreives all misc features which overlap this slice. If + a set code is provided only features which are members of + the requested set are returned. + Returntype : listref of Bio::EnsEMBL::MiscFeatures + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_MiscFeatures { + my $self = shift; + my $misc_set = shift; + my $dbtype = shift; + my $msa; + + my $adaptor = $self->adaptor(); + if ( !$adaptor ) { + warning('Cannot retrieve features without attached adaptor.'); + return []; + } + + my $mfa; + if ($dbtype) { + my $db = $reg->get_db( $adaptor->db(), $dbtype ); + if ( defined($db) ) { + $mfa = $reg->get_adaptor( lc( $db->species() ), + $db->group(), "miscfeature" ); + } else { + $mfa = $reg->get_adaptor( $adaptor->db()->species(), + $dbtype, "miscfeature" ); + } + if ( !defined $mfa ) { + warning("$dbtype misc features not available"); + return []; + } + } else { + $mfa = $adaptor->db->get_MiscFeatureAdaptor(); + } + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + + if ($misc_set) { + @arr1 = + @{ $mfa->fetch_all_by_Slice_and_set_code( $sl1, $misc_set ) }; + @arr2 = + @{ $mfa->fetch_all_by_Slice_and_set_code( $sl2, $misc_set ) }; + push @arr, @arr1, @arr2; + return \@arr; + } + @arr1 = @{ $mfa->fetch_all_by_Slice($sl1) }; + @arr2 = @{ $mfa->fetch_all_by_Slice($sl2) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + + # if($misc_set) { + # return $mfa->fetch_all_by_Slice_and_set_code($self,$misc_set); + # } + + # return $mfa->fetch_all_by_Slice($self); +} ## end sub get_all_MiscFeatures + +=head2 get_all_AffyFeatures + + Args : (optional) list of strings - array names + Example : $slice->get_all_AffyFeatures(); + Description: Retrieves all AffyFeature objects which overlap this slice. + Returntype : listref of Bio::EnsEMBL::AffyFeature objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_AffyFeatures { + my $self = shift; + my @arraynames = @_; + + my $sa = $self->adaptor(); + if ( !$sa ) { + warning("Cannot retrieve features without attached adaptor."); + } + my $fa = $sa->db()->get_AffyFeatureAdaptor(); + my $features; + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + + if (@arraynames) { + @arr1 = @{ $fa->fetch_all_by_Slice_arrayname( $sl1, @arraynames ) }; + @arr2 = @{ $fa->fetch_all_by_Slice_arrayname( $sl2, @arraynames ) }; + } else { + @arr1 = @{ $fa->fetch_all_by_Slice($sl1) }; + @arr2 = @{ $fa->fetch_all_by_Slice($sl2) }; + } + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + +# if ( @arraynames ) { +# $features = $fa->fetch_all_by_Slice_arrayname( $self, @arraynames ); +# } else { +# $features = $fa->fetch_all_by_Slice( $self ); +# } +# return $features; +} ## end sub get_all_AffyFeatures + +=head2 get_all_OligoFeatures + + Args : (optional) list of strings - array names + Example : $slice->get_all_OligoFeatures(); + Description: Retrieves all OligoFeature objects which overlap this slice. + Optionally just retrieve OligoFeature objects generated by + probes from named arrays. + Returntype : listref of Bio::EnsEMBL::OligoFeature objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_OligoFeatures { + my $self = shift; + my @arraynames = @_; + + my $sa = $self->adaptor(); + if ( !$sa ) { + warning("Cannot retrieve features without attached adaptor."); + } + my $fa = $sa->db()->get_OligoFeatureAdaptor(); + my $features; + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + + if (@arraynames) { + @arr1 = @{ $fa->fetch_all_by_Slice_arrayname( $sl1, @arraynames ) }; + @arr2 = @{ $fa->fetch_all_by_Slice_arrayname( $sl2, @arraynames ) }; + } else { + @arr1 = @{ $fa->fetch_all_by_Slice($sl1) }; + @arr2 = @{ $fa->fetch_all_by_Slice($sl2) }; + } + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + +# if ( @arraynames ) { +# $features = $fa->fetch_all_by_Slice_arrayname( $self, @arraynames ); +# } else { +# $features = $fa->fetch_all_by_Slice( $self ); +# } +# return $features; +} ## end sub get_all_OligoFeatures + +=head2 get_all_OligoFeatures_by_type + + Arg [1] : string - type of array (e.g. AFFY or OLIGO) + Arg [2] : (optional) string - logic name + Example : $slice->get_all_OligoFeatures_by_type('OLIGO'); + Description: Retrieves all OligoFeature objects which overlap this slice and + were created by probes from the specified type of array. + Returntype : listref of Bio::EnsEMBL::OligoFeature objects + Exceptions : throws if no type + Caller : general + Status : Stable + +=cut + +sub get_all_OligoFeatures_by_type { + my ( $self, $type, $logic_name ) = @_; + + throw('Need type as parameter') if !$type; + + my $sa = $self->adaptor(); + if ( !$sa ) { + warning("Cannot retrieve features without attached adaptor."); + } + my $fa = $sa->db()->get_OligoFeatureAdaptor(); + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ $fa->fetch_all_by_Slice_type( $sl1, $type, $logic_name ) }; + @arr2 = @{ $fa->fetch_all_by_Slice_type( $sl2, $type, $logic_name ) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + +#my $features = $fa->fetch_all_by_Slice_type( $self, $type, $logic_name ); +#return $features; +} ## end sub get_all_OligoFeatures_by_type + +=head2 get_all_MarkerFeatures + + Arg [1] : (optional) string logic_name + The logic name of the marker features to retrieve + Arg [2] : (optional) int $priority + Lower (exclusive) priority bound of the markers to retrieve + Arg [3] : (optional) int $map_weight + Upper (exclusive) priority bound of the markers to retrieve + Example : my @markers = @{$slice->get_all_MarkerFeatures(undef,50, 2)}; + Description: Retrieves all markers which lie on this slice fulfilling the + specified map_weight and priority parameters (if supplied). + Returntype : reference to a list of Bio::EnsEMBL::MarkerFeatures + Exceptions : none + Caller : contigview, general + Status : Stable + +=cut + +sub get_all_MarkerFeatures { + my ( $self, $logic_name, $priority, $map_weight ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot retrieve MarkerFeatures without attached adaptor.'); + return []; + } + + my $ma = $self->adaptor->db->get_MarkerFeatureAdaptor; + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ + $ma->fetch_all_by_Slice_and_priority( $sl1, $priority, $map_weight, + $logic_name ) }; + @arr2 = @{ + $ma->fetch_all_by_Slice_and_priority( $sl2, $priority, $map_weight, + $logic_name ) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + +# my $feats = $ma->fetch_all_by_Slice_and_priority($self, $priority, $map_weight, $logic_name); +# return $feats; +} ## end sub get_all_MarkerFeatures + +=head2 get_all_compara_DnaAlignFeatures + + Arg [1] : string $qy_species + The name of the species to retrieve similarity features from + Arg [2] : string $qy_assembly + The name of the assembly to retrieve similarity features from + Arg [3] : string $type + The type of the alignment to retrieve similarity features from + Arg [4] : compara dbadptor to use. + Example : $fs = $slc->get_all_compara_DnaAlignFeatures('Mus musculus', + 'MGSC3', + 'WGA'); + Description: Retrieves a list of DNA-DNA Alignments to the species specified + by the $qy_species argument. + The compara database must be attached to the core database + for this call to work correctly. As well the compara database + must have the core dbadaptors for both this species, and the + query species added to function correctly. + Returntype : reference to a list of Bio::EnsEMBL::DnaDnaAlignFeatures + Exceptions : warning if compara database is not available + Caller : contigview + Status : Stable + +=cut + +sub get_all_compara_DnaAlignFeatures { + my ( $self, $qy_species, $qy_assembly, $alignment_type, $compara_db ) + = @_; + + if ( !$self->adaptor() ) { + warning( + "Cannot retrieve DnaAlignFeatures without attached adaptor"); + return []; + } + + unless ( $qy_species && $alignment_type # && $qy_assembly + ) + { + throw( +"Query species and assembly and alignmemt type arguments are required" + ); + } + + if ( !defined($compara_db) ) { + $compara_db = + Bio::EnsEMBL::Registry->get_DBAdaptor( "compara", "compara" ); + } + unless ($compara_db) { + warning( + "Compara database must be attached to core database or passed " + . "as an argument to " + . "retrieve compara information" ); + return []; + } + + my $dafa = $compara_db->get_DnaAlignFeatureAdaptor; + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ + $dafa->fetch_all_by_Slice( $sl1, $qy_species, $qy_assembly, + $alignment_type ) }; + @arr2 = @{ + $dafa->fetch_all_by_Slice( $sl2, $qy_species, $qy_assembly, + $alignment_type ) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF +#return $dafa->fetch_all_by_Slice($self, $qy_species, $qy_assembly, $alignment_type); +} ## end sub get_all_compara_DnaAlignFeatures + +=head2 get_all_compara_Syntenies + + Arg [1] : string $query_species e.g. "Mus_musculus" or "Mus musculus" + Arg [2] : string $method_link_type, default is "SYNTENY" + Arg [3] : compara dbadaptor to use. + Description: gets all the compara syntenyies for a specfic species + Returns : arrayref of Bio::EnsEMBL::Compara::SyntenyRegion + Status : Stable + +=cut + +sub get_all_compara_Syntenies { + my ( $self, $qy_species, $method_link_type, $compara_db ) = @_; + + if ( !$self->adaptor() ) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + unless ($qy_species) { + throw("Query species and assembly arguments are required"); + } + + unless ( defined $method_link_type ) { + $method_link_type = "SYNTENY"; + } + + if ( !defined($compara_db) ) { + $compara_db = + Bio::EnsEMBL::Registry->get_DBAdaptor( "compara", "compara" ); + } + unless ($compara_db) { + warning( + "Compara database must be attached to core database or passed " + . "as an argument to " + . "retrieve compara information" ); + return []; + } + my $gdba = $compara_db->get_GenomeDBAdaptor(); + my $mlssa = $compara_db->get_MethodLinkSpeciesSetAdaptor(); + my $dfa = $compara_db->get_DnaFragAdaptor(); + my $sra = $compara_db->get_SyntenyRegionAdaptor(); + + my $this_gdb = + $gdba->fetch_by_core_DBAdaptor( $self->adaptor()->db() ); + my $query_gdb = $gdba->fetch_by_registry_name($qy_species); + my $mlss = + $mlssa->fetch_by_method_link_type_GenomeDBs( $method_link_type, + [ $this_gdb, $query_gdb ] ); + + my $cs = $self->coord_system()->name(); + my $sr = $self->seq_region_name(); + my ($dnafrag) = + @{ $dfa->fetch_all_by_GenomeDB_region( $this_gdb, $cs, $sr ) }; + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ + $sra->fetch_all_by_MethodLinkSpeciesSet_DnaFrag( $mlss, $dnafrag, + $sl1->start, $sl1->end ) + }; + @arr2 = @{ + $sra->fetch_all_by_MethodLinkSpeciesSet_DnaFrag( $mlss, $dnafrag, + $sl2->start, $sl2->end ) + }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF +#return $sra->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $dnafrag, $self->start, $self->end); +} ## end sub get_all_compara_Syntenies + +=head2 get_all_Haplotypes + + Arg [1] : (optional) boolean $lite_flag + if true lightweight haplotype objects are used + Example : @haplotypes = $slice->get_all_Haplotypes; + Description: Retrieves all of the haplotypes on this slice. Only works + if the haplotype adaptor has been attached to the core adaptor + via $dba->add_db_adaptor('haplotype', $hdba); + Returntype : listref of Bio::EnsEMBL::External::Haplotype::Haplotypes + Exceptions : warning is Haplotype database is not available + Caller : contigview, general + Status : Stable + +=cut + +sub get_all_Haplotypes { + my ( $self, $lite_flag ) = @_; + + if ( !$self->adaptor() ) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my $haplo_db = $self->adaptor->db->get_db_adaptor('haplotype'); + + unless ($haplo_db) { + warning( "Haplotype database must be attached to core database to " + . "retrieve haplotype information" ); + return []; + } + + my $haplo_adaptor = $haplo_db->get_HaplotypeAdaptor; + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ $haplo_adaptor->fetch_all_by_Slice( $sl1, $lite_flag ) }; + @arr2 = @{ $haplo_adaptor->fetch_all_by_Slice( $sl2, $lite_flag ) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + +#my $haplotypes = $haplo_adaptor->fetch_all_by_Slice($self, $lite_flag); +#return $haplotypes; +} ## end sub get_all_Haplotypes + +sub get_all_DASFactories { + my $self = shift; + return [ $self->adaptor()->db()->_each_DASFeatureFactory ]; +} + + +sub get_all_DASFeatures_dsn { + my ( $self, $source_type, $dsn ) = @_; + + if ( !$self->adaptor() ) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + my @X = + grep { $_->adaptor->dsn eq $dsn } + $self->adaptor()->db()->_each_DASFeatureFactory; + + return [ $X[0]->fetch_all_Features( $self, $source_type ) ]; +} + +sub get_all_DAS_Features { + my ($self) = @_; + + $self->{_das_features} ||= {}; # Cache + $self->{_das_styles} ||= {}; # Cache + $self->{_das_segments} ||= {}; # Cache + my %das_features; + my %das_styles; + my %das_segments; + my $slice = $self; + + foreach my $dasfact ( @{ $self->get_all_DASFactories } ) { + my $dsn = $dasfact->adaptor->dsn; + my $name = $dasfact->adaptor->name; + # my $type = $dasfact->adaptor->type; + my $url = $dasfact->adaptor->url; + + my ($type) = $dasfact->adaptor->mapping; + if ( ref $type eq 'ARRAY' ) { + $type = shift @$type; + } + $type ||= $dasfact->adaptor->type; + # Construct a cache key : SOURCE_URL/TYPE + # Need the type to handle sources that serve multiple types of features + + my $key = join( '/', $name, $type ); + if ( $self->{_das_features}->{$key} ) { # Use cached + $das_features{$name} = $self->{_das_features}->{$key}; + $das_styles{$name} = $self->{_das_styles}->{$key}; + $das_segments{$name} = $self->{_das_segments}->{$key}; + } else { # Get fresh data + my ( $featref, $styleref, $segref ) = + $dasfact->fetch_all_Features( $slice, $type ); + $self->{_das_features}->{$key} = $featref; + $self->{_das_styles}->{$key} = $styleref; + $self->{_das_segments}->{$key} = $segref; + $das_features{$name} = $featref; + $das_styles{$name} = $styleref; + $das_segments{$name} = $segref; + } + } ## end foreach my $dasfact ( @{ $self...}) + + return ( \%das_features, \%das_styles, \%das_segments ); +} ## end sub get_all_DAS_Features + +=head2 get_all_DASFeatures + + Arg [1] : none + Example : $features = $slice->get_all_DASFeatures; + Description: Retreives a hash reference to a hash of DAS feature + sets, keyed by the DNS, NOTE the values of this hash + are an anonymous array containing: + (1) a pointer to an array of features; + (2) a pointer to the DAS stylesheet + Returntype : hashref of Bio::SeqFeatures + Exceptions : ? + Caller : webcode + Status : Stable + +=cut +sub get_all_DASFeatures { + my ( $self, $source_type ) = @_; + + if ( !$self->adaptor() ) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my %genomic_features = map { + ( $_->adaptor->dsn => + [ $_->fetch_all_Features( $self, $source_type ) ] ) + } $self->adaptor()->db()->_each_DASFeatureFactory; + return \%genomic_features; + +} + +sub old_get_all_DASFeatures { + my ( $self, @args ) = @_; + + if ( !$self->adaptor() ) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my %genomic_features = + map { ( $_->adaptor->dsn => [ $_->fetch_all_by_Slice($self) ] ) } + $self->adaptor()->db()->_each_DASFeatureFactory; + return \%genomic_features; + +} + +=head2 get_all_ExternalFeatures + + Arg [1] : (optional) string $track_name + If specified only features from ExternalFeatureAdaptors with + the track name $track_name are retrieved. + If not set, all features from every ExternalFeatureAdaptor are + retrieved. + Example : @x_features = @{$slice->get_all_ExternalFeatures} + Description: Retrieves features on this slice from external feature adaptors + Returntype : listref of Bio::SeqFeatureI implementing objects in slice + coordinates + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_ExternalFeatures { + my ( $self, $track_name ) = @_; + if ( !$self->adaptor() ) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + my $features = []; + my $xfa_hash = $self->adaptor->db->get_ExternalFeatureAdaptors; + my @xf_adaptors = (); + if ($track_name) { + #use a specific adaptor + if ( exists $xfa_hash->{$track_name} ) { + push @xf_adaptors, $xfa_hash->{$track_name}; + } + } else { + #use all of the adaptors + push @xf_adaptors, values %$xfa_hash; + } + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + foreach my $xfa (@xf_adaptors) { + push @$features, @{ $xfa->fetch_all_by_Slice($sl1) }; + push @$features, @{ $xfa->fetch_all_by_Slice($sl2) }; + } + return $features; + ## circular EOF +} ## end sub get_all_ExternalFeatures + +=head2 get_all_DitagFeatures + + Arg [1] : (optional) string ditag type + Arg [1] : (optional) string logic_name + Example : @dna_dna_align_feats = @{$slice->get_all_DitagFeatures}; + Description: Retrieves the DitagFeatures of a specific type which overlap + this slice with. If type is not defined, all features are + retrieved. + Returntype : listref of Bio::EnsEMBL::DitagFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_DitagFeatures { + my ( $self, $type, $logic_name ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot get DitagFeatures without attached adaptor'); + return []; + } + + my $dfa = $self->adaptor->db->get_DitagFeatureAdaptor(); + + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr, @arr1, @arr2 ); + @arr1 = @{ $dfa->fetch_all_by_Slice( $sl1, $type, $logic_name ) }; + @arr2 = @{ $dfa->fetch_all_by_Slice( $sl2, $type, $logic_name ) }; + push @arr, @arr1, @arr2; + return \@arr; + ## circular EOF + #return $dfa->fetch_all_by_Slice($self, $type, $logic_name); +} ## end sub get_all_DitagFeatures + +# GENERIC FEATURES (See DBAdaptor.pm) + +=head2 get_generic_features + + Arg [1] : (optional) List of names of generic feature types to return. + If no feature names are given, all generic features are + returned. + Example : my %features = %{$slice->get_generic_features()}; + Description: Gets generic features via the generic feature adaptors that + have been added via DBAdaptor->add_GenricFeatureAdaptor (if + any) + Returntype : Hash of named features. + Exceptions : none + Caller : none + Status : Stable + +=cut + +sub get_generic_features { + + my ( $self, @names ) = @_; + + if ( !$self->adaptor() ) { + warning('Cannot retrieve features without attached adaptor'); + return []; + } + + my $db = $self->adaptor()->db(); + + my %features = (); # this will hold the results + + # get the adaptors for each feature + my %adaptors = %{ $db->get_GenericFeatureAdaptors(@names) }; + + foreach my $adaptor_name ( keys(%adaptors) ) { + + my $adaptor_obj = $adaptors{$adaptor_name}; + # get the features and add them to the hash + ## circular BOF + my $sl1 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => $self->{'start'}, + -END => $self->{'seq_region_length'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my $sl2 = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $self->{'coord_system'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -START => 1, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor() ); + + my ( @arr1, @arr2 ); + my $features_ref; + @arr1 = @{ $adaptor_obj->fetch_all_by_Slice($sl1) }; + @arr2 = @{ $adaptor_obj->fetch_all_by_Slice($sl2) }; + push @{$features_ref}, @arr1, @arr2; + ## circular EOF + #my $features_ref = $adaptor_obj->fetch_all_by_Slice($self); + + # add each feature to the hash to be returned + foreach my $feature (@$features_ref) { + $features{$adaptor_name} = $feature; + } + } ## end foreach my $adaptor_name ( ...) + + return \%features; + +} ## end sub get_generic_features + +=head2 project_to_slice + + Arg [1] : Slice to project to. + Example : my $chr_projection = $clone_slice->project_to_slice($chrom_slice); + foreach my $segment ( @$chr_projection ){ + $chr_slice = $segment->to_Slice(); + print $clone_slice->seq_region_name(). ':'. $segment->from_start(). '-'. + $segment->from_end(). ' -> '.$chr_slice->seq_region_name(). ':'. $chr_slice->start(). + '-'.$chr_slice->end(). + $chr_slice->strand(). " length: ".($chr_slice->end()-$chr_slice->start()+1). "\n"; + } + Description: Projection of slice to another specific slice. Needed for where we have multiple mappings + and we want to state which one to project to. + Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which + can also be used as [$start,$end,$slice] triplets. + Exceptions : none + Caller : none + Status : At Risk + +=cut + +sub project_to_slice { + my $self = shift; + my $to_slice = shift; + + throw('Slice argument is required') if ( !$to_slice ); + + my $slice_adaptor = $self->adaptor(); + + if ( !$slice_adaptor ) { + warning("Cannot project without attached adaptor."); + return []; + } + + my $mapper_aptr = $slice_adaptor->db->get_AssemblyMapperAdaptor(); + + my $cs = $to_slice->coord_system(); + my $slice_cs = $self->coord_system(); + + my @projection; + my $current_start = 1; + + # decompose this slice into its symlinked components. + # this allows us to handle haplotypes and PARs + my $normal_slice_proj = + $slice_adaptor->fetch_normalized_slice_projection($self); + foreach my $segment (@$normal_slice_proj) { + my $normal_slice = $segment->[2]; + + $slice_cs = $normal_slice->coord_system(); + + my $asma = $self->adaptor->db->get_AssemblyMapperAdaptor(); + my $asm_mapper = $asma->fetch_by_CoordSystems( $slice_cs, $cs ); + + # perform the mapping between this slice and the requested system + my @coords; + + if ( defined $asm_mapper ) { + @coords = $asm_mapper->map( $normal_slice->seq_region_name(), + $normal_slice->start(), + $normal_slice->end(), + $normal_slice->strand(), + $slice_cs, + undef, + $to_slice ); + } else { + $coords[0] = + Bio::EnsEMBL::Mapper::Gap->new( $normal_slice->start(), + $normal_slice->end() ); + } + + #construct a projection from the mapping results and return it + foreach my $coord (@coords) { + my $coord_start = $coord->start(); + my $coord_end = $coord->end(); + my $length = $coord_end - $coord_start + 1; + + #skip gaps + if ( $coord->isa('Bio::EnsEMBL::Mapper::Coordinate') ) { + my $coord_cs = $coord->coord_system(); + + # If the normalised projection just ended up mapping to the + # same coordinate system we were already in then we should just + # return the original region. This can happen for example, if we + # were on a PAR region on Y which refered to X and a projection to + # 'toplevel' was requested. + # if($coord_cs->equals($slice_cs)) { + # # trim off regions which are not defined + # return $self->_constrain_to_region(); + # } + + #create slices for the mapped-to coord system + my $slice = + $slice_adaptor->fetch_by_seq_region_id( $coord->id(), + $coord_start, $coord_end, $coord->strand() ); + + my $current_end = $current_start + $length - 1; + + push @projection, + bless( [ $current_start, $current_end, $slice ], + "Bio::EnsEMBL::ProjectionSegment" ); + } + + $current_start += $length; + } ## end foreach my $coord (@coords) + } ## end foreach my $segment (@$normal_slice_proj) + +# delete the cache as we may want to map to different set next time and old +# results will be cached. + + $mapper_aptr->delete_cache; + + return \@projection; +} ## end sub project_to_slice + +# +# Bioperl Bio::PrimarySeqI methods: +# + +=head2 id + + Description: Included for Bio::PrimarySeqI interface compliance (0.7) + +=cut + +sub id { name(@_); } + +=head2 display_id + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub display_id { name(@_); } + +=head2 primary_id + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub primary_id { name(@_); } + +=head2 desc + +Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub desc { + return $_[0]->coord_system->name() . ' ' . $_[0]->seq_region_name(); +} + +=head2 moltype + +Description: Included for Bio::PrimarySeqI interface compliance (0.7) + +=cut + +sub moltype { return 'dna'; } + +=head2 alphabet + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub alphabet { return 'dna'; } + +=head2 accession_number + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub accession_number { name(@_); } + +=head2 is_circular + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub is_circular { + my ($self) = @_; + + if ( !defined( $self->{'circular'} ) ) { + my @attrs = + grep { $_ } @{ $self->get_all_Attributes('circular_seq') }; + $self->{'circular'} = @attrs ? 1 : 0; + } + + return $self->{'circular'}; +} + +# sub DEPRECATED METHODS # +############################################################################### + +=head1 DEPRECATED METHODS + +=cut + +=head2 get_all_supercontig_Slices + + DEPRECATED use get_tiling_path("NTcontig") instead + +=cut + +sub get_all_supercontig_Slices { + my $self = shift; + + deprecate("Use get_tiling_path('NTcontig') instead"); + + my $result = []; + + if ( $self->adaptor() ) { + my $superctg_names = + $self->adaptor()->list_overlapping_supercontigs($self); + + for my $name (@$superctg_names) { + my $slice; + $slice = $self->adaptor()->fetch_by_supercontig_name($name); + $slice->name($name); + push( @$result, $slice ); + } + } else { + warning( + "Slice needs to be attached to a database to get supercontigs"); + } + + return $result; +} + +=head2 get_Chromosome + + DEPRECATED use this instead: + $slice_adp->fetch_by_region('chromosome',$slice->seq_region_name) + +=cut + +sub get_Chromosome { + my $self = shift @_; + + deprecate( "Use SliceAdaptor::fetch_by_region('chromosome'," + . '$slice->seq_region_name) instead' ); + + my $csa = $self->adaptor->db->get_CoordSystemAdaptor(); + my ($top_cs) = @{ $csa->fetch_all() }; + + return + $self->adaptor->fetch_by_region( $top_cs->name(), + $self->seq_region_name(), + undef, undef, undef, $top_cs->version() ); +} + +=head2 chr_name + + DEPRECATED use seq_region_name() instead + +=cut + +sub chr_name { + deprecate("Use seq_region_name() instead"); + seq_region_name(@_); +} + +=head2 chr_start + + DEPRECATED use start() instead + +=cut + +sub chr_start { + deprecate('Use start() instead'); + start(@_); +} + +=head2 chr_end + + DEPRECATED use end() instead + + Returntype : int + Exceptions : none + Caller : SliceAdaptor, general + +=cut + +sub chr_end { + deprecate('Use end() instead'); + end(@_); +} + +=head2 assembly_type + + DEPRECATED use version instead + +=cut + +sub assembly_type { + my $self = shift; + deprecate('Use $slice->coord_system()->version() instead.'); + return $self->coord_system->version(); +} + +=head2 get_tiling_path + + DEPRECATED use project instead + +=cut + +sub get_tiling_path { + my $self = shift; + deprecate('Use $slice->project("seqlevel") instead.'); + return []; +} + +=head2 dbID + + Description: DEPRECATED use SliceAdaptor::get_seq_region_id instead + +=cut + +sub dbID { + my $self = shift; + deprecate('Use SliceAdaptor::get_seq_region_id instead.'); + if ( !$self->adaptor ) { + warning('Cannot retrieve seq_region_id without attached adaptor.'); + return 0; + } + return $self->adaptor->get_seq_region_id($self); +} + +=head2 get_all_MapFrags + + DEPRECATED use get_all_MiscFeatures instead + +=cut + +sub get_all_MapFrags { + my $self = shift; + deprecate('Use get_all_MiscFeatures instead'); + return $self->get_all_MiscFeatures(@_); +} + +=head2 has_MapSet + + DEPRECATED use get_all_MiscFeatures instead + +=cut + +sub has_MapSet { + my ( $self, $mapset_name ) = @_; + deprecate('Use get_all_MiscFeatures instead'); + my $mfs = $self->get_all_MiscFeatures($mapset_name); + return ( @$mfs > 0 ); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/CoordSystem.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/CoordSystem.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,366 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::CoordSystem + +=head1 SYNOPSIS + + my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + + my $csa = $db->get_CoordSystemAdaptor(); + + # + # Get all coord systems in the database: + # + foreach my $cs ( @{ $csa->fetch_all() } ) { + my $str = join ':', $cs->name(), $cs->version(), $cs->dbID(); + print "$str\n"; + } + +=head1 DESCRIPTION + +This is a simple object which contains a few coordinate system attributes: +name, internal identifier, version. A coordinate system is uniquely defined +by its name and version. A version of a coordinate system applies to all +sequences within a coordinate system. This should not be confused with +individual sequence versions. + +Take for example the Human assembly. The version 'NCBI33' applies to +to all chromosomes in the NCBI33 assembly (that is the entire 'chromosome' +coordinate system). The 'clone' coordinate system in the same database would +have no version however. Although the clone sequences have their own sequence +versions, there is no version which applies to the entire set of clones. + +Coordinate system objects are immutable. Their name and version, and other +attributes may not be altered after they are created. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::CoordSystem; + +use Bio::EnsEMBL::Storable; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [..] : List of named arguments: + -NAME - The name of the coordinate system + -VERSION - (optional) The version of the coordinate system. + Note that if the version passed in is undefined, + it will be set to the empty string in the + resulting CoordSystem object. + -RANK - The rank of the coordinate system. The highest + level coordinate system should have rank 1, the + second highest rank 2 and so on. An example of + a high level coordinate system is 'chromosome' an + example of a lower level coordinate system is + 'clone'. + -TOP_LEVEL - (optional) Sets whether this is a top-level coord + system. Default = 0. This should only be set to + true if you are creating an artificial toplevel + coordsystem by the name of 'toplevel' + -SEQUENCE_LEVEL - (optional) Sets whether this is a sequence + level coordinate system. Default = 0 + -DEFAULT - (optional) + Whether this is the default version of the + coordinate systems of this name. Default = 0 + -DBID - (optional) The internal identifier of this + coordinate system + -ADAPTOR - (optional) The adaptor which provides database + interaction for this object + Example : $cs = Bio::EnsEMBL::CoordSystem->new(-NAME => 'chromosome', + -VERSION => 'NCBI33', + -RANK => 1, + -DBID => 1, + -ADAPTOR => adaptor, + -DEFAULT => 1, + -SEQUENCE_LEVEL => 0); + Description: Creates a new CoordSystem object representing a coordinate + system. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ( $name, $version, $top_level, $sequence_level, $default, $rank ) = + rearrange( [ 'NAME', 'VERSION', + 'TOP_LEVEL', 'SEQUENCE_LEVEL', + 'DEFAULT', 'RANK' ], + @_ ); + + $version = '' if ( !defined($version) ); + + $top_level = ($top_level) ? 1 : 0; + $sequence_level = ($sequence_level) ? 1 : 0; + $default = ($default) ? 1 : 0; + $rank ||= 0; + + if ( $top_level == 1 ) { + if ( $rank != 0 ) { + throw('RANK argument must be 0 if TOP_LEVEL is 1'); + } + + if ( defined($name) ) { + if ( $name ne 'toplevel' ) { + throw('The NAME argument must be "toplevel" if TOP_LEVEL is 1'); + } + } else { + $name = 'toplevel'; + } + + if ( $sequence_level == 1 ) { + throw("SEQUENCE_LEVEL argument must be 0 if TOP_LEVEL is 1"); + } + + $default = 0; + + } else { + + if ( $rank == 0 ) { + throw("RANK argument must be non-zero unless TOP_LEVEL is 1"); + } + + if ( !defined($name) ) { + throw('The NAME argument is required'); + } elsif ( $name eq 'toplevel' ) { + throw( "Cannot name coord system 'toplevel' " + . "unless TOP_LEVEL is 1" ); + } + + } + + if ( $rank !~ /^\d+$/ ) { + throw('The RANK argument must be a positive integer'); + } + + $self->{'version'} = $version; + $self->{'name'} = $name; + $self->{'top_level'} = $top_level; + $self->{'sequence_level'} = $sequence_level; + $self->{'default'} = $default; + $self->{'rank'} = $rank; + + return $self; +} ## end sub new + + +=head2 name + + Arg [1] : (optional) string $name + Example : print $coord_system->name(); + Description: Getter for the name of this coordinate system + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + return $self->{'name'}; +} + + + +=head2 version + + Arg [1] : none + Example : print $coord->version(); + Description: Getter for the version of this coordinate system. This + will return an empty string if no version is defined for this + coordinate system. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub version { + my $self = shift; + return $self->{'version'}; +} + + + +=head2 species + + Arg [1] : none + Example : print $coord->species(); + Description: Shortcut method to get the species this CoordSystem refers to. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub species { + my $self = shift; + return $self->adaptor->db->species; +} + + + +=head2 equals + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs + The coord system to compare to for equality. + Example : if($coord_sys->equals($other_coord_sys)) { ... } + Description: Compares 2 coordinate systems and returns true if they are + equivalent. The definition of equivalent is sharing the same + name and version. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub equals { + my $self = shift; + my $cs = shift; + + if(!$cs || !ref($cs) || !$cs->isa('Bio::EnsEMBL::CoordSystem')) { + if ($cs->isa('Bio::EnsEMBL::ExternalData::DAS::CoordSystem')) { + return $cs->equals($self); + } + throw('Argument must be a CoordSystem'); + } + + if($self->{'version'} eq $cs->version() && $self->{'name'} eq $cs->name()) { + return 1; + } + + return 0; +} + + + + +=head2 is_top_level + + Arg [1] : none + Example : if($coord_sys->is_top_level()) { ... } + Description: Returns true if this is the toplevel pseudo coordinate system. + The toplevel coordinate system is not a real coordinate system + which is stored in the database, but it is a placeholder that + can be used to request transformations or retrievals to/from + the highest defined coordinate system in a given region. + Returntype : 1 or 0 + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_top_level { + my $self = shift; + return $self->{'top_level'}; +} + + +=head2 is_sequence_level + + Arg [1] : none + Example : if($coord_sys->is_sequence_level()) { ... } + Description: Returns true if this is a sequence level coordinate system + Returntype : 1 or 0 + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_sequence_level { + my $self = shift; + return $self->{'sequence_level'}; +} + + +=head2 is_default + + Arg [1] : none + Example : if($coord_sys->is_default()) { ... } + Description: Returns true if this coordinate system is the default + version of the coordinate system of this name. + Returntype : 1 or 0 + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_default { + my $self = shift; + return $self->{'default'}; +} + + + + +=head2 rank + + Arg [1] : none + Example : if($cs1->rank() < $cs2->rank()) { + print $cs1->name(), " is a higher level coord system than", + $cs2->name(), "\n"; + } + Description: Returns the rank of this coordinate system. A lower number + is a higher coordinate system. The highest level coordinate + system has a rank of 1 (e.g. 'chromosome'). The toplevel + pseudo coordinate system has a rank of 0. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub rank { + my $self = shift; + return $self->{'rank'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DB/ExternalFeatureFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DB/ExternalFeatureFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,301 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DB::ExternalFeatureFactoryI - +Legacy Abstract interface for External Feature +Factories. Bio::EnsEMBL::External::ExternalFeatureAdaptor should be used +instead if possible. + +=head1 SYNOPSIS + + $external_ff = new ImplementingExternalFeatureFactoryClass; + + $database_adaptor = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => 'blah', + -dbname => 'other', + -pass => 'pass' + ); + + # alternatively, you can add external databases to an obj once made + $database_adaptor->add_ExternalFeatureFactory($external_ff); + + # now the ExternalFeatureFactory has been added, Ensembl RawContigs + # and Slices will now have ExternalFeatures on them + $contig = + $db_adaptor->get_RawContigAdaptor->fetch_by_name('AC00056.00001'); + @external = @{ $contig->get_all_ExternalFeatures() }; + + # this works on Slices as well + $slice = + $db_adaptor->get_SliceAdaptor->fetch_by_chr_start_end( '12', 10000, + 30000 ); + @external = @{ $slice->get_all_ExternalFeatures() }; + +=head1 DESCRIPTION + +This is a legacy class. It is included only for backwards +compatibility with ExternalFeatureFactories which are presumably +still used to place data into ensembl. It is recommended that if +you wish to create EnsEMBL features externally that you use the +Bio::EnsEMBL::External::ExternalFeatureAdaptor instead. + +This object defines the abstract interface for External Database access +inside Ensembl. The aim is that one can attach an External Database +which will generate Sequence Features and these Sequence Features will +be accessible along side all the internal Ensembl sequence features, for +drawing, EMBL dumping etc. In particular, the external database does not +have to worry about the transformation of the Sequence Feature objects +into VirtualContigs. + +Sequence Features have to be defined in one of two coordinate systems: +Original EMBL/GenBank coordinates of a particular sequnence version or +the Ensembl contig coordinates. This means you have to calculate your +sequence features in one these two coordinate systems + +The methods that have to be implemented are: + + get_External_SeqFeatures_contig( $ensembl_contig_identifier, + $sequence_version, $start, $end ); + + get_External_SeqFeatures_clone( $embl_accession_number, + $sequence_version, $start, $end ); + +The semantics of this method is as follows: + + $ensembl_contig_identifier - the ensembl contig id (external id). + $sequence_version - embl/genbank sequence version + $embl_accession_number - the embl/genbank accession number + +The $start/$end can be ignored, but methods can take advantage of it. +This is so that ensembl can ask for features only on a region of DNA, +and if desired, the external database can respond with features only in +this region, rather than the entire sequence. + +The hope is that the second method could potentially have a very complex +set of mappings of other embl_accession numbers to one embl_accession +number and provide the complex mapping. + +The methods should return Sequence Features with the following spec: + + a) must implement the Bio::SeqFeatureI interface. + + b) must accept "set" calls on + + start,end,strand + + to provide coordinate transformation of the feature. + + c) must be unique in-memory objects, ie, the implementation is not + allowed to cache the sequence feature in its entirity. Two separate + calls to get_External_SeqFeatures_contig must be able to separately + set start,end,strand information without clobbering each other. The + other information, if so wished, can be cached by each SeqFeature + holding onto another object, but this is left to the implementor to + decide on the correct strategy. + + d) must return an unique identifier when called with method id. + +You must implement both functions. In most cases, one function will +always return an empty list, whereas the other function will actually +query the external database. + +The second way of accessing the External Database from Ensembl is using +unique internal identifiers in that database. The method is: + + get_SeqFeature_by_id($id); + +It should return exactly one Sequence Feature object of the same type as +above. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DB::ExternalFeatureFactoryI; +use Bio::EnsEMBL::External::ExternalFeatureAdaptor; +use vars qw(@ISA); + +@ISA = ( 'Bio::EnsEMBL::External::ExternalFeatureAdaptor' ); + + +=head2 coordinate_systems + + Arg [1] : none + Example : none + Description: This method is present to make the ExternalFeatureFactory + interface behave as an ExternalFeatureAdaptor. It is for + backwards compatibility. + Returntype : none + Exceptions : none + Caller : internal + +=cut + +sub coordinate_systems { + my $self = shift; + return qw(CONTIG); +} + + +=head2 fetch_all_by_contig_name + + Arg [1] : none + Example : none + Description: This method is present to make the ExternalFeatureFactory + interface behave as an ExternalFeatureAdaptor. It is for + backwards compatibility. + Returntype : none + Exceptions : none + Caller : internal + +=cut + +sub fetch_all_by_contig_name { + my ($self, $contig_name) = @_; + + unless($self->db) { + $self->throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureFactory to function correctly'); + } + + my @features = (); + + my $ctg = $self->db->get_RawContigAdaptor->fetch_by_name($contig_name); + my $clone = $ctg->clone; + my $version = $clone->version; + my $ctg_length = $ctg->length; + + #get contig features + push @features, $self->get_Ensembl_SeqFeatures_contig($ctg->name, + $version, + 1, + $ctg_length); + + #get clone features + my $clone_start = $ctg->embl_offset; + my $clone_end = $clone_start + $ctg_length - 1; + my @clone_features = $self->get_Ensembl_SeqFeatures_clone($clone->id, + $version, + $clone_start, + $clone_end); + + #change clone coordinates to contig coordinates + my ($start, $end); + foreach my $f (@clone_features) { + $start = $f->start - $clone_start + 1; + $end = $f->end - $clone_start + 1; + + #skip features outside the contig + next if($end < 1 || $start > $ctg_length); + + $f->start($start); + $f->end($end); + + push @features, $f; + } + + return \@features; +} + +=head2 get_Ensembl_SeqFeatures_contig + + Title : get_Ensembl_SeqFeatures_contig (Abstract) + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub get_Ensembl_SeqFeatures_contig{ + my ($self) = @_; + + $self->warn("Abstract method get_External_SeqFeatures_contig " . + "encountered in base class. Implementation failed to complete it"); + +} + +=head2 get_Ensembl_SeqFeatures_clone + + Title : get_Ensembl_SeqFeatures_clone (Abstract) + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub get_Ensembl_SeqFeatures_clone{ + my ($self) = @_; + + $self->warn("Abstract method get_Ensembl_SeqFeatures_clone " . + "encountered in base class. Implementation failed to complete it"); + +} + +=head2 get_Ensembl_Genes_clone + + Title : get_Ensembl_Genes_clone + Function: returns Gene objects in clone coordinates from a gene id + Returns : An array of Gene objects + Args : clone id + +=cut + +sub get_Ensembl_Genes_clone { + my $self = @_; + + return; +} + +=head2 get_SeqFeature_by_id + + Title : get_SeqFeature_by_id (Abstract) + Usage : + Function: Return SeqFeature object for any valid unique id + Example : + Returns : + Args : id as determined by the External Database + + +=cut + + +sub get_SeqFeature_by_id { + my ($self) = @_; + $self->warn("Abstract method get_SeqFeature_by_id encountered " . + "in base class. Implementation failed to complete it"); +} + + +1; + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBEntry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBEntry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,715 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBEntry - +Object representing an external reference (xref) + +=head1 DESCRIPTION + +This object holds information about external references (xrefs) to +Ensembl objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBEntry; + +use strict; +use warnings; +no warnings qw(uninitialized); + +use Bio::EnsEMBL::Storable; +use Bio::Annotation::DBLink; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(deprecate); +use Scalar::Util qw(weaken isweak); + +our @ISA = qw(Bio::EnsEMBL::Storable Bio::Annotation::DBLink); + + +=head2 new_fast + + Arg [1] : Hashref $hashref - hash reference to bless as new DBEntry object + Description: A very quick constructor that requires internal knowledge of + the class. This is used in speed critical sections of the code + where many objects need to be created quickly. + Returntype : Bio::EnsEMBL::DBEntry + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 new + + Args [...] : list of named parameters + Example : my $dbentry = new Bio::EnsEMBL::DBEntry( + -adaptor => $adaptor, + -primary_id => $pid, + -version => $version, + -dbname => $dbname, + -release => $release, + -display_id => $did, + -description => $description, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table + -linkage_annotation => $object_xref_text); + Description: Creates a new DBEntry object + Returntype : Bio::EnsEMBL::DBEntry + Exceptions : none + Caller : Bio::EnsEMBL::DBEntryAdaptor + Status : At Risk + Due to 'PRIORITY', + 'INFO_TYPE', 'INFO_TEXT', ''DB_DISPLAY_NAME', 'TYPE', + 'SECONDARY_DB_NAME', 'SECONDARY_DB_TABLE' + being under development - if you don't use any of these the + method can be considered Stable + +=cut + +sub new { + my ($class, @args) = @_; + + my $self = bless {},$class; + + my ( $adaptor, $dbID, $primary_id, $version, + $dbname, $release, $display_id, $description, + $priority, + $db_display_name, $info_type, $info_text, $type, + $secondary_db_name, $secondary_db_table, $link_annotation, $analysis) = + rearrange ( ['ADAPTOR','DBID','PRIMARY_ID','VERSION', + 'DBNAME','RELEASE','DISPLAY_ID','DESCRIPTION', + 'PRIORITY', + 'DB_DISPLAY_NAME', 'INFO_TYPE', 'INFO_TEXT', 'TYPE', + 'SECONDARY_DB_NAME', 'SECONDARY_DB_TABLE', 'LINKAGE_ANNOTATION', 'ANALYSIS'], @args ); + + $self->adaptor($adaptor); + $self->{'dbID'} = $dbID; + + if( defined $primary_id ) { $self->primary_id( $primary_id ) } + if( defined $version ) { $self->version( $version ) } else + { $self->version( 0 ); } + if( defined $dbname ) { $self->dbname( $dbname ) } + if( defined $release) { $self->release( $release ) } + if( defined $display_id) { $self->display_id( $display_id ) } + if( defined $description) { $self->description($description) } + if( defined $priority) { $self->priority($priority) } + if( defined $db_display_name) { $self->db_display_name($db_display_name) } + if( defined $info_type) { $self->info_type($info_type) } + if( defined $info_text) { $self->info_text($info_text) } + if( defined $type) { $self->type($type) } + if( defined $secondary_db_name) { $self->secondary_db_name($secondary_db_name) } + if( defined $secondary_db_table) { $self->secondary_db_table($secondary_db_table) } + + $self->linkage_annotation($link_annotation) if defined $link_annotation; + $self->analysis($analysis) if defined $analysis; + + + return $self; +} + + +=head2 primary_id + + Arg [1] : (optional) String $arg - value to set + Example : none + Description: Getter/setter for attribute 'primary_id'. + This is the object's primary id in the external database. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub primary_id { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{primary_id} = $arg; + } + return $self->{primary_id}; +} + + +=head2 display_id + + Arg [1] : (optional) String $arg - value to set + Example : none + Description: Getter/setter for attribute 'display_id'. + The object's preferred display name. This can be the same + as primary_id or ensembl-specific. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_id{ + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{display_id} = $arg; + } + return $self->{display_id}; +} + + +=head2 optional_id + + Args : none + Example : none + Description: Additional getter for attribute 'display_id'. + The object's preferred display name. + Only include for BioPerl interface compliance, please use + $self->display_id(). + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub optional_id { + my $self = shift; + return $self->display_id; +} + + +=head2 dbname + + Arg [1] : (optional) String $arg - value to set + Example : none + Description: Getter/setter for attribute 'dbname'. + The name of the external database. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub dbname { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{dbname} = $arg; + } + return $self->{dbname}; +} + + +=head2 database + + Args : none + Example : none + Description: Additional getter for attribute 'dbname'. + The name of the external database. + Only include for BioPerl interface compliance, please use + $self->dbname(). + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub database { + my $self = shift; + return $self->dbname(); +} + + +=head2 release + + Arg [1] : (optional) String $arg - value to set + Example : none + Description: Getter/setter for attribute 'release'. + The external database release name. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub release { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{release} = $arg; + } + return $self->{release}; +} + + +=head2 version + + Arg [1] : (optional) String $arg - value to set + Example : none + Description: Getter/setter for attribute 'version'. + The object's version in the external database. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub version { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{version} = $arg; + } + return $self->{version}; +} + + +=head2 description + + Arg [1] : (optional) String $arg - value to set + Example : none + Description: Getter/setter for attribute 'description'. + The object's description, from the xref table + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description { + my ( $self, $arg ) = @_; + + if ( defined($arg) ) { $self->{'description'} = $arg } + + return $self->{description}; +} + +=head2 analysis + + Arg [1] : Bio::EnsEMBL::Analysis $analysis + Example : none + Description: get/set for attribute analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub analysis { + my $self = shift; + $self->{analysis} = shift if( @_ ); + return $self->{analysis}; +} + +=head2 comment + + Args : none + Example : none + Description: Additional getter for attribute 'description'. + The object's description. + Only include for BioPerl interface compliance, please use + $self->description(). + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub comment { + my $self = shift; + return $self->description(); +} + + +=head2 priority + + Arg [1] : int $priority + Example : none + Priority : Getter/setter for attribute 'priority'. Note this + is the priority from the external_db table. + Returntype : String + Exceptions : none + Caller : general + Status : At Risk + : due to it being under development + +=cut + +sub priority { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{priority} = $arg; + } + return $self->{priority}; +} + + +=head2 db_display_name + + Arg [1] : String $db_display_name + Example : none + Description: Getter/setter for attribute 'db_display_name'. + The preferred display name for the external database. Has + "Projected " prepended if info_type='PROJECTION'. + Returntype : String + Exceptions : none + Caller : general + +=cut + +sub db_display_name { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{db_display_name} = $arg; + } + + my $name; + if ($self->{info_type} && $self->{info_type} eq "PROJECTION") { + $name = "Projected " . $self->{db_display_name}; + } else { + $name = $self->{db_display_name}; + } + + return $name; +} + + +=head2 info_type + + Arg [1] : String $info_type + Example : none + Description: Getter/setter for attribute 'info_type'. + Returntype : String + Exceptions : none + Caller : general + +=cut + +sub info_type { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{info_type} = $arg; + } + return $self->{info_type}; + } + + +=head2 info_text + + Arg [1] : String $info_text + Example : none + Description: Getter/setter for attribute 'info_text'. + Returntype : String + Exceptions : none + Caller : general + +=cut + +sub info_text { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{info_text} = $arg; + } + return $self->{info_text}; +} + +=head2 linkage_annotation + + Arg [1] : String $object_xref_text + Example : none + Description: Getter/setter for attribute 'linkage_annotation'. + Returntype : String + Exceptions : none + Caller : general + +=cut + +sub linkage_annotation { + my ( $self, $arg ) = @_; + + $self->{linkage_annotation} = $arg if defined $arg; + + return $self->{linkage_annotation}; +} + + +=head2 type + + Arg [1] : String $type + Example : none + Description: Getter/setter for attribute 'type'. + Returntype : String + Exceptions : none + Caller : general + +=cut + +sub type { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{type} = $arg; + } + return $self->{type}; +} + +=head2 secondary_db_name + + Arg [1] : String $secondary_db_name + Description: Getter/setter for attribute 'secondary_db_name'. + Returntype : String + Exceptions : none + Caller : general + +=cut + +sub secondary_db_name { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{secondary_db_name} = $arg; + } + return $self->{secondary_db_name}; +} + + +=head2 secondary_db_table + + Arg [1] : String $secondary_db_table + Description: Getter/setter for attribute 'secondary_db_table'. + Returns : String + Exceptions : none + Caller : general + +=cut + +sub secondary_db_table { + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{secondary_db_table} = $arg; + } + return $self->{secondary_db_table}; +} + + +=head2 add_synonym + + Arg [1] : String $arg - synonym to add + Description: Add a synonym for the external object. + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub add_synonym { + my ( $self, $arg ) = @_; + if( defined $arg ) { + push( @{$self->{synonyms}}, $arg ); + } +} + + +=head2 get_all_synonyms + + Args : none + Example : my @synonyms = @{ $db_entry->get_all_synonyms }; + Description: Get a list of synonyms known for this object. + Synonyms are lazy-loaded if required. + Returntype : listref of strings. May be empty. + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_synonyms { + + my $self = shift; + + # lazy-load synonyms if required + if (!$self->{synonyms} && $self->adaptor()) { + $self->{synonyms} = $self->adaptor()->fetch_all_synonyms($self->dbID()); + } + + return $self->{synonyms}; +} + + +=head2 get_all_dependents + + Args[1] : (optional) Bio::EnsEMBL::Gene, Transcript or Translation object + Example : my @dependents = @{ $db_entry->get_all_dependents }; + Description: Get a list of DBEntrys that are depenednet on the DBEntry. + if an ensembl gene transcript or translation is given then only + the ones on that object will be given + Returntype : listref of DBEntrys. May be empty. + Exceptions : none + Caller : general + Status : UnStable + +=cut + +sub get_all_dependents { + my $self = shift; + my $ensembl_object = shift; + + return $self->adaptor()->get_all_dependents($self->dbID(), $ensembl_object); +} + +=head2 get_all_masters + + Args[1] : (optional) Bio::EnsEMBL::Gene, Transcript or Translation object + Example : my @masters = @{ $db_entry->get_all_masters }; + Description: Get a list of DBEntrys that are the masters of the DBEntry. + if an ensembl gene transcript or translation is given then only + the ones on that object will be given. + Returntype : listref of DBEntrys. May be empty. + Exceptions : none + Caller : general + Status : UnStable + +=cut + +sub get_all_masters { + my $self = shift; + my $ensembl_object = shift; + + return $self->adaptor()->get_all_masters($self->dbID(), $ensembl_object); +} + + +=head2 flush_synonyms + + Args : none + Description: Remove all synonyms from this object. + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flush_synonyms { + my $self = shift; + $self->{synonyms} = []; +} + + +=head2 status + + Arg [1] : (optional) String $arg - value to set + Description: Getter/setter for attribute 'status'. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub status{ + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{status} = $arg; + } + return $self->{status}; +} + +=head2 ensembl_object_type + + Arg [1] : (optional) String $arg - value to set + Description: Getter/setter for attribute ensembl_object_type. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub ensembl_object_type{ + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{ensembl_object_type} = $arg; + } + return $self->{ensembl_object_type}; +} + +=head2 ensembl_id + + Arg [1] : (optional) String $arg - value to set + Description: Getter/setter for attribute ensembl_id. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub ensembl_id{ + my ( $self, $arg ) = @_; + if( defined $arg ) { + $self->{ensembl_id} = $arg; + } + return $self->{ensembl_id}; +} + + + +=head1 DEPRECATED METHODS + +=cut + +=head2 get_synonyms + + DEPRECATED use get_all_synonyms instead + +=cut + +sub get_synonyms { + my $self = shift; + + deprecate("get_synonyms has been renamed get_all_synonyms."); + return $self->get_all_synonyms; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,243 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBFile::CollectionAdaptor + +=head1 SYNOPSIS + +For use with a Bio::EnsEMBL::Collector e.g. + + package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor; + + @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor + Bio::EnsEMBL::Funcgen::Collector::ResultFeature + Bio::EnsEMBL::DBFile::CollectionAdaptor); + #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor + + +Fetch wrapper methods access file based data via read_collection_blob: + + sub _fetch_from_file_by_Slice_ResultSet{ + + #define filepath/config + + my $packed_scores = $self->read_collection_blob( + $filepath, + $efg_sr_id, + $conf->{$window_size}{'byte_offset'}, + $conf->{$window_size}{'byte_length'}, + ); + + #Do unpacking and object creation here + + } + +=head1 DESCRIPTION + +Adaptor for direct collection(.col) file access, which are binary compressed fixed +width format files providing window based values across the genome. Collection files +integrate an index block which contains seq_region byte off set values. + +NOTE: By default all collection files are generated and packed using little endian encoding. +Due to the lack of standards of float encoding(wrt to endianess) perl packs using the +implicit endianess of the underlying architecture. This means that accessing float +collection files located on a big endian architecture will produce unexpected results. + +# endian issues will disappear with knetfile xsubs + +=head1 SEE ALSO + +Bio::EnsEMBL::DBFile::FileAdaptor + +=cut + + + +package Bio::EnsEMBL::DBFile::CollectionAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::DBFile::FileAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor); + + +=head2 initialise_filehandle + + Arg[1] : string - filepath + Example : $self->initialise_filehandle($filepath); + Description: Initialises the filehandle for use, in this case reads + the index (seq_region offsets) + Returntype : None + Exceptions : warns if read fails + Caller : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle + Status : at risk + +=cut + +sub initialise_filehandle{ + my ($self, $filepath) = @_; + my $fh = $self->{file_cache}{$filepath}{filehandle}; + + #offsets include the length of the complete index block + my ($index_size, $read_bytes, $index, $num_keys, %offset_index); + + ### INDEX FORMAT ### + #First block of the index the index size in bytes(not inc size block). + # + #Rest of index is a hash of sr_id(v 2 bytes) key offset(V 4 bytes) value pairs + #V (long) is 4 bytes(via sys/read), which is actually an Config{intsize} i.e. i? + #long is 8 bytes according to Config{longsize}! + + #read uses logical characters not necessarily in bytes + #altho this does seem to read bytes, maybe due to binmode? + #seek is in bytes + #Changed to sysread/read which both use bytes explicitly + #Can't mix sysread/seek due to I/O buffering differences + + + #Read index_size first encoded as v(2 bytes) + $read_bytes = sysread($fh, $index_size, 2); + + if(! ((defined $read_bytes) && ($read_bytes == 2))){ + #! defined is error 0 is end of file + warn "Failed to read index size from $filepath\n$!"; + + #Delete fh as it is useless/unsafe to retry + undef $self->{file_cache}{$filepath}{filehandle}; + } + else{ #Read index + ($index_size) = unpack('v', $index_size); + $read_bytes = sysread($fh, $index, $index_size); #Now read index proper + + if(! ((defined $read_bytes) && ($read_bytes == $index_size))){ + #! defined is error 0 is end of file + warn "Failed to read index from $filepath\n$!"; + + #Delete fh as it is useless/unsafe to retry + undef $self->{file_cache}{$filepath}{filehandle}; + } + else{ + #Number of key-value pairs => $index_size /(size of key(v 2bytes) + size of offset(V 4bytes)) + $num_keys = $index_size/6; + my $unpack_template = '(vV)'.$num_keys,; + + %offset_index = unpack($unpack_template, $index); + $self->{file_cache}{$filepath}{off_sets} = \%offset_index; + } + } + + return $self->{file_cache}{$filepath}{off_sets}; +} + + +=head2 read_collection_blob + + Arg[1] : string - filepath + Arg[2] : int - seq_region_id + Arg[3] : int - seq_region offset. The byte offset required to + locate the required start position + Arg[4] : int - byte length to read + Example : my $blob_substr = $self->read_collection_blob($filepath, + $sr_key, + $sr_offset, + $byte_length); + Description: Reads bytes from file given a seq_region_key, byte offset and byte length. + Sets filehandle to undef if read fails. + Returntype : string - packed binary data + Exceptions : warns if seek or read errors + Caller : general e.g. fetch_from_file_by_Slice_ResultSet + Status : at risk + +=cut + +# We could change this to take a Slice, hence we could check +# whether an EOF error is because the slice is out of range +# and undef only if it is in range i.e. the index/file is corrupt +# overkill? +# This is something the Slice API should warn about +# but will still cause undef'd filehandle here +# Index should also contain ends, so we can validate whether the slice is out of range??? + + +sub read_collection_blob{ + my($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_; + + my $blob_substr; + my $fh = $self->get_filehandle($filepath, {-binmode => 1}); + + if(defined $fh){ + #Return from query cache here? + #cache key = "$filepath:$key:$sr_offset:$byte_length" + + #define total offset + + #if(! exists $self->{file_cache}{$filepath}{off_sets}{$sr_key}){ + # #warn "sr_key($sr_key) is not part of index for $filepath\n"; + #} + #else{ + + if(exists $self->{file_cache}{$filepath}{off_sets}{$sr_key}){ + + my $total_offset = $self->{file_cache}{$filepath}{off_sets}{$sr_key} + $sr_offset; + my $seeked = sysseek($fh, $total_offset, 0);#0(whence) is SEEK_SET. + + if(! $seeked){ + warn("Failed to seek to byte $total_offset in $filepath"); + #Don't undef fh here as this valid Slice maybe out of range + #and we don't want to kill a valid fh + #i.e. Slice start/end is past end of seq_region + } + else{ + my $read_bytes = sysread($fh, $blob_substr, $byte_length); + + if(! ((defined $read_bytes) && ($read_bytes == $byte_length))){ + #! defined is error 0 is end of file + warn "Failed to read from $filepath\n$!"; + + if($read_bytes == 0){ + #This maybe because the slice is out of range! + #The API gives no warning about this + + warn "End Of File encountered\n"; + warn "Total offset:\t".$self->{file_cache}{$filepath}{off_sets}{$sr_key}. + " key($sr_key) + $sr_offset = $total_offset\n"; + + #add some checks against the theoretical/true length of the file? + } + else{ #Delete fh as it is useless/unsafe to retry + undef $self->{file_cache}{$filepath}{filehandle}; + #$blob_substr is now set to empty string by read + undef $blob_substr; + } + } + } + } + } + + return $blob_substr; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBFile/FileAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBFile/FileAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,220 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBFile::FileAdaptor - Base Adaptor for direct file access + +=head1 DESCRIPTION + +Basic wrapper class to provide access to file based data. + +This is primarily aimed at indexed Collection(.col) files which are optimised for Slice +based queries. Collections store fixed width width/windowed data as BLOBS. This makes +it possible to seek to the a required location given slice coordinate and read the only +the required amount of data covering the slice. + +Currently only works as hybrid DBAdaptor e.g. ResultFeatureAdaptor which inherits both from +here and BaseFeatureAdaptor. + +=cut + + + +package Bio::EnsEMBL::DBFile::FileAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use strict; +use warnings; + + +=head2 get_filehandle + + Arg[1] : string - filepath + Arg[2] : HASHREF - Optional params, see open_file + Example : my $fh = $self->get_filehandle($filepath, 1); + Description: Gets and caches a simple file handle. + Returntype : GLOB/undef - filehandle + Exceptions : warns if cache entry exists but is not defined + Caller : general + Status : at risk + +=cut + +sub get_filehandle{ + my ($self, $filepath, $params_hash) = @_; + + my $file_op = '<'; + + if(exists $params_hash->{-file_operator}){ + $file_op = $params_hash->{-file_operator}; + }else{ + $params_hash->{-file_operator} = $file_op; + } + + if(! exists $self->{file_cache}{$filepath}{filehandle}){ + my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, $params_hash); + + if(defined $fh){ + $self->{file_cache}{$filepath}{filehandle} = $fh; + #$self->initialise_filehandle($filepath) if $self->can('initialise_filehandle'); + $self->initialise_filehandle($filepath) if($file_op eq '<'); + } + } + elsif(! defined $self->{file_cache}{$filepath}{filehandle}){ + #This maybe one of several read/seek errors which will have already been warned + warn "Encountered and error with file handle for $filepath\n"; + } + #else + # check against cache file op + # to make sure we aren't trying to open an already open fh with a different operator + + + return $self->{file_cache}{$filepath}{filehandle}; +} + + +=head2 open_file + + Arg[1] : string - filepath + Arg[2] : HASHREF - Optional params: + -binmode => 0|1, # Boolean i.e. treat file as binary + -file_operator => '>' # Default is '<' + #-perms_octal => # Requires FileHandle + Example : my $fh = $self->open_file($filepath, {-binmode = > 1, -file_operator => '>'}); + Description: Opens a file for reading or writing. + Returntype : GLOB/undef - filehandle + Exceptions : warns if file open fails + warns if file operator unsupported + warns if failed to set binmode + Caller : general + Status : at risk + +=cut + +sub open_file{ + my ($self, $filepath, $params_hash) = @_; + + #Validate params_hash? + #rearrange? Will not warn/throw for invalid keys? + #perms octal, requires FileHandle? See EFGUtils::open_file + + + + my $file_op = $params_hash->{-file_operator} || '<'; + + if(($file_op ne '<') && + ($file_op ne '>') && + ($file_op ne '>>')){ + throw("Cannot perform open with unsupported operator:\t${file_op}${filepath}"); + } + + my $fh; + my $success = open($fh, "${file_op}${filepath}"); + #$fh will be still be GLOB on fail + + #These warn instead of throw/die to allow + #open_file to be used to test a file + #this prevents throws/die when an attempting to access an absent file (good for webcode) + #could alternatively change to throw/die and eval where required + #prevents need to catch everywhere else and potential double reporting of error + + if(! $success){ + #undef $fh; + throw("Failed to open:\t$filepath\n$!\n"); + } + elsif($params_hash->{-binmode}){ + $success = binmode $fh; + + if(! $success){ + throw("Failed to set binmode:\t$filepath\n$!"); + #undef $fh; + } + } + + return $fh; +} + + +=head2 validate_file_length + + Arg[1] : string - filepath + Arg[2] : int - expected length in bytes + Example : $self->validate_file_length($filepath, $expected_length); + Description: Utility method which can be used during file creation + Returntype : None + Exceptions : warns if file open fails + throws if file is not expected length + Caller : general + Status : at risk - change to seek to accounts for 'logical characters' + +=cut + +sub validate_file_length{ + my ($self, $filepath, $expected_length, $binmode) = @_; + + #Currently not using cache as we rarely want to + #use the file handle afterwards + + + #THIS WAS USING EFGUtils::open_file imported in the Collector::ResultFeature!!!! + #which is just a sub not a class method, and is in a parallel inheritance path + #No warnings about redefining method :( + #Force use of FileAdaptor::open_file + + my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, {-binmode => $binmode}); + + + #sysseek always returns length in bytes, change to seek which + #uses logical characters i.e. actual encoding? + #Does seek use bytes in binmode and chars in non-binmode? + + my $seeked_bytes = sysseek($fh, 0, 2);# 2 is SEEK_END + #There is no systell function. Use sysseek(FH, 0, 1) for that. + + if($seeked_bytes < $expected_length){ + throw("File is shorter($seeked_bytes) than expected($expected_length):\t$filepath\n"); + } + elsif($seeked_bytes > $expected_length){ + throw("File is longer($seeked_bytes) than expected($expected_length):\t$filepath\n"); + } + + return; +} + + + + + +### STUBB/TEMPLATE METHODS ### +# +# If required hese should be over-ridden in the +# descendant FileAdaptor e.g. CollectionAdaptor +# Listed here rather for visibility (rather than +# using 'can') + + +sub initialise_filehandle{ + return; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBLoader.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBLoader.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,131 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBLoader - Run time database loader + +=head1 SYNOPSIS + + $db = + Bio::EnsEMBL::DBLoader->new( "Bio::EnsEMBL::DBSQL::DBAdaptor/" + . "host=localhost;" + . "dbname=homo_sapiens_core_19_34a;" + . "user=ensro;" ); + + # $db is a database object + $db = Bio::EnsEMBL::DBLoader->standard(); + + # equivalent to + # Bio::EnsEMBL::DBLoader->new( $ENV{'ENSEMBL_DATABASE'} ); + +=head1 DESCRIPTION + +This system provides a run-time loading of the database for ensembl, +allowing two things + + a) Only "using" the database module which is required for a + particular implementation + + b) Providing a simple string method to indicate where the database + is, allowing per sites defaults and other things as such + + +The string is parsed as follows: + +Before the / is the Perl database object to load, after are the +parameters to pass to that database. The parameters are series of +key=values separated by semi-colons. These are passed as a hash to the +new method of the database object + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBLoader; + +use strict; + + +=head2 new + + Arg [1] : string $string + An Ensembl database locator string. + Example : Bio::EnsEMBL::DBSQL::DBLoader->new("Bio::EnsEMBL::DBSQL::DBAdaptor/host=localhost;dbname=homo_sapiens_core_19_34a;user=ensro;" + Description: Connects to an Ensembl database using the module specified in + the locator string. + Returntype : The module specified in the load string is returned. + Exceptions : thrown if the specified module cannot be instantiated or the + locator string cannot be parsed + Caller : ? + Status : Stable + +=cut + +sub new{ + my ($class,$string) = @_; + my ($module,%hash); + + $string =~ /(\S+?)\/([\S+\s*]+)/ || die "Could not parse [$string] as a ensembl database locator. Needs database_module/params"; + $module = $1; + my $param = $2; + + &_load_module($module); + my @param = split(/;/,$param); + foreach my $keyvalue ( @param ) { + $keyvalue =~ /(\S+?)=([\S*\s*]*)/ || do { warn("In loading $keyvalue, could not split into keyvalue for loading $module. Ignoring"); next; }; + + my $key = $1; + my $value = $2; + + $hash{"-$key"} = $value; + } + + my @kv = %hash; + + return "$module"->new(%hash); +} + + +sub _load_module{ + my ($modulein) = @_; + my ($module,$load,$m); + + $module = "_<$modulein.pm"; + $load = "$modulein.pm"; + $load =~ s/::/\//g; + + return 1 if $main::{$module}; + eval { + require $load; + }; + if( $@ ) { + print STDERR <. + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AnalysisAdaptor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $analysis_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "analysis" ); + + my $analysis = $analysis_adaptor->fetch_by_logic_name('genscan'); + +=head1 DESCRIPTION + + Module to encapsulate all db access for persistent class Analysis. + There should be just one per application and database connection. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::AnalysisAdaptor; + +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception; + + +use vars qw(@ISA); +use strict; + +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Args : Bio::EnsEMBL::DBSQL::DBAdaptor + Example : my $aa = new Bio::EnsEMBL::DBSQL::AnalysisAdaptor(); + Description: Creates a new Bio::EnsEMBL::DBSQL::AnalysisAdaptor object and + internally loads and caches all the Analysis objects from the + database. + Returntype : Bio::EnsEMBL::DBSQL::AnalysisAdaptor + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBAdaptor + Status : Stable + +=cut + +sub new { + my ($class, $db) = @_; + + my $self = $class->SUPER::new($db); + + #load and cache all of the Analysis objects + $self->fetch_all; + + return $self; +} + + +=head2 fetch_all + + Args : none + Example : my @analysis = @{$analysis_adaptor->fetch_all()}; + Description: fetches all of the Analysis objects from the database and caches + them internally. + Returntype : listref of Bio::EnsEMBL::Analysis retrieved from the database + Exceptions : none + Caller : AnalysisAdaptor::new + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + my ( $analysis, $dbID ); + my $rowHashRef; + + $self->{_cache} = {}; + $self->{_logic_name_cache} = {}; + + my $sth = $self->prepare( q { + SELECT analysis.analysis_id, logic_name, + program, program_version, program_file, + db, db_version, db_file, + module, module_version, + gff_source, gff_feature, + created, parameters, description, display_label, displayable, web_data + FROM analysis + LEFT JOIN analysis_description + ON analysis.analysis_id = analysis_description.analysis_id } ); + $sth->execute; + + while( $rowHashRef = $sth->fetchrow_hashref ) { + my $analysis = $self->_objFromHashref( $rowHashRef ); + + $self->{_cache}->{$analysis->dbID} = $analysis; + $self->{_logic_name_cache}->{lc($analysis->logic_name())} = $analysis; + } + + my @ana = values %{$self->{_cache}}; + + return \@ana; +} + + +=head2 fetch_all_by_feature_class + + Arg [1] : string $feature_cless - The name of the feature class + Example : my @analyses = @{$analysis_adaptor->fetch_all_by_feature_class('Gene'); + Description: Returns all analyses that correspond to a given + feature class; see feature_classes method for a list. + Returntype : Listref of Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_feature_class { + my $self = shift; + deprecate("Deprecated. Hard-coded logic is not supported"); + my $feat_class = shift || throw( "Need a feature type, e.g. SimpleFeature" ); + + my @feature_classes = $self->feature_classes; # List of all feature classes + my %feat_table_map; + foreach my $class( @feature_classes ){ + # Map e.g. DnaAlignFeature to dna_align_feature + my $table = join( "_", map lc, ( $class =~ /([A-Z][a-z]+)/g ) ); + $feat_table_map{$class} = $table; + } + $feat_table_map{DensityFeature}='density_type'; # analysis_id in diff table + my $feat_table = $feat_table_map{$feat_class} || + ( warning( "No feature type corresponding to $feat_class" ) && + return [] ); + + my $sql_t = qq| +SELECT DISTINCT analysis_id FROM %s |; + + my $sql = sprintf( $sql_t, $feat_table ); + my $sth = $self->prepare( $sql ); + my $rv = $sth->execute(); + my $res = $sth->fetchall_arrayref; + my @analyses; + foreach my $r( @{$res} ){ + my $analysis = $self->fetch_by_dbID($r->[0]) + || throw( "analysis_id $r->[0] from $feat_table table " + . "is not in the analysis table!" ); + push @analyses, $analysis; + } + return [@analyses]; +} + + +=head2 feature_classes + + Arg [1] : NONE + Example : my @fclasses = $analysis_adaptor->feature_classes; + Description: Returns a list of the different classes of Ensembl feature + object that have an analysis + Returntype : List of feature classes + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub feature_classes{ + deprecate("Deprecated. Hard-coded logic is not supported"); + # Can't think of a way to do this programatically, so hard-coded + return qw( + DensityFeature + DnaAlignFeature + Gene + MarkerFeature + PredictionTranscript + ProteinAlignFeature + ProteinFeature + QtlFeature + RepeatFeature + SimpleFeature + ); +} + +=head2 fetch_by_dbID + + Arg [1] : int $internal_analysis_id - the database id of the analysis + record to retrieve + Example : my $analysis = $analysis_adaptor->fetch_by_dbID(1); + Description: Retrieves an Analysis object from the database via its internal + id. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $id = shift; + + if( defined $self->{_cache}->{$id} ) { + return $self->{_cache}->{$id}; + } + + my $query = q{ + SELECT analysis.analysis_id, logic_name, + program, program_version, program_file, + db, db_version, db_file, + module, module_version, + gff_source, gff_feature, + created, parameters, description, display_label, displayable, web_data + FROM analysis + LEFT JOIN analysis_description + ON analysis.analysis_id = analysis_description.analysis_id + WHERE analysis.analysis_id = ? }; + + my $sth = $self->prepare($query); + $sth->bind_param(1,$id,SQL_INTEGER); + $sth->execute(); + my $rowHashRef = $sth->fetchrow_hashref; + if( ! defined $rowHashRef ) { + return undef; + } + + my $anal = $self->_objFromHashref( $rowHashRef ); + $self->{_cache}->{$anal->dbID} = $anal; + $self->{_logic_name_cache}->{lc($anal->logic_name())} = $anal; + return $anal; +} + + +=head2 fetch_by_logic_name + + Arg [1] : string $logic_name the logic name of the analysis to retrieve + Example : my $analysis = $a_adaptor->fetch_by_logic_name('Eponine'); + Description: Retrieves an analysis object from the database using its unique + logic name. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_logic_name { + my ( $self, $logic_name ) = @_; + + my $analysis; + my $rowHash; + + # Check the cache for the logic name + if ( defined( $self->{_logic_name_cache}{ lc($logic_name) } ) ) { + return $self->{_logic_name_cache}{ lc($logic_name) }; + } + + my $sth = $self->prepare( + qq( +SELECT analysis.analysis_id, + logic_name, + program, + program_version, + program_file, + db, + db_version, + db_file, + module, + module_version, + gff_source, + gff_feature, + created, + parameters, + description, + display_label, + displayable, + web_data +FROM analysis + LEFT JOIN analysis_description + ON ( analysis.analysis_id = analysis_description.analysis_id ) +WHERE LOWER(logic_name) = ?) + ); + + $sth->bind_param( 1, lc($logic_name), SQL_VARCHAR ); + $sth->execute(); + my $rowHashRef = $sth->fetchrow_hashref(); + + if ( !defined($rowHashRef) ) { return undef } + + $analysis = $self->_objFromHashref($rowHashRef); + + # place the analysis in the caches, cross referenced by dbID and + # logic_name + $self->{_cache}->{ $analysis->dbID() } = $analysis; + $self->{_logic_name_cache}->{ lc($logic_name) } = $analysis; + + return $analysis; +} ## end sub fetch_by_logic_name + + +=head2 store + + Arg [1] : Bio:EnsEMBL::Analysis $analysis + Example : $analysis_adaptor->store($analysis); + Description: Stores $analysis in db. If the analysis is already stored in + the database its dbID and adaptor are updated, but the analysis + is not stored a second time. + Sets created date if not already set. Sets dbID and adaptor + inside $analysis. Returns dbID. + Returntype : int - dbID of stored analysis + Exceptions : throw on incorrect argument + throw if analysis argument does not have a logic name + Caller : general + Status : Stable + +=cut + +sub store { + my $self = shift; + my $analysis = shift; + + if(!ref($analysis) || !$analysis->isa('Bio::EnsEMBL::Analysis')) { + throw("Bio::EnsEMBL::Analysis argument expected."); + } + + if($analysis->is_stored($self->db())) { + return $analysis->dbID(); + } + + if(!$analysis->logic_name()) { + throw("Analysis cannot be stored without a valid logic_name"); + } + + my $insertion_method = (lc($self->dbc->driver) eq 'sqlite') ? 'INSERT OR IGNORE' : 'INSERT IGNORE'; + + my $rows_inserted = 0; + my $sth; + + if ( $analysis->created() ) { + + # We use insert IGNORE so that this method can be used in a + # multi-process environment. If another process has already written + # this record then there will not be a problem. + + $sth = $self->prepare( + qq{ + $insertion_method INTO analysis + (created, logic_name, db, db_version, db_file, program, program_version, program_file, parameters, module, module_version, gff_source, gff_feature) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + } + ); + $sth->bind_param( 1, $analysis->created(), SQL_DATETIME ); + $sth->bind_param( 2, lc( $analysis->logic_name() ), SQL_VARCHAR ); + $sth->bind_param( 3, $analysis->db(), SQL_VARCHAR ); + $sth->bind_param( 4, $analysis->db_version(), SQL_VARCHAR ); + $sth->bind_param( 5, $analysis->db_file(), SQL_VARCHAR ); + $sth->bind_param( 6, $analysis->program(), SQL_VARCHAR ); + $sth->bind_param( 7, $analysis->program_version(), SQL_VARCHAR ); + $sth->bind_param( 8, $analysis->program_file(), SQL_VARCHAR ); + $sth->bind_param( 9, $analysis->parameters(), SQL_VARCHAR ); + $sth->bind_param( 10, $analysis->module(), SQL_VARCHAR ); + $sth->bind_param( 11, $analysis->module_version(), SQL_VARCHAR ); + $sth->bind_param( 12, $analysis->gff_source(), SQL_VARCHAR ); + $sth->bind_param( 13, $analysis->gff_feature(), SQL_VARCHAR ); + + $rows_inserted = $sth->execute(); + + } else { + $sth = $self->prepare( + qq{ + $insertion_method INTO analysis + (created, logic_name, db, db_version, db_file, program, program_version, program_file, parameters, module, module_version, gff_source, gff_feature) + VALUES (CURRENT_TIMESTAMP, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + } + ); + + $sth->bind_param( 1, $analysis->logic_name, SQL_VARCHAR ); + $sth->bind_param( 2, $analysis->db, SQL_VARCHAR ); + $sth->bind_param( 3, $analysis->db_version, SQL_VARCHAR ); + $sth->bind_param( 4, $analysis->db_file, SQL_VARCHAR ); + $sth->bind_param( 5, $analysis->program, SQL_VARCHAR ); + $sth->bind_param( 6, $analysis->program_version, SQL_VARCHAR ); + $sth->bind_param( 7, $analysis->program_file, SQL_VARCHAR ); + $sth->bind_param( 8, $analysis->parameters, SQL_VARCHAR ); + $sth->bind_param( 9, $analysis->module, SQL_VARCHAR ); + $sth->bind_param( 10, $analysis->module_version, SQL_VARCHAR ); + $sth->bind_param( 11, $analysis->gff_source, SQL_VARCHAR ); + $sth->bind_param( 12, $analysis->gff_feature, SQL_VARCHAR ); + + $rows_inserted = $sth->execute(); + + } ## end else [ if ( $analysis->created...)] + + my $dbID; + # If we need to fetch the timestamp, or the insert failed due to + # existance of an existing entry, we need to retrieve the entry from + # the database. Note: $sth->execute() may return 0E0 on error which + # is zero, but true which is why the $rows_inserted clause was added. + if ( !$analysis->created() || !$rows_inserted || $rows_inserted == 0 ) + { + my $new_analysis = + $self->fetch_by_logic_name( $analysis->logic_name ); + + if ( !$new_analysis ) { + throw("Could not retrieve just stored analysis from database.\n" + . "Possibly incorrect db permissions or missing analysis table\n" + ); + } + + $dbID = $new_analysis->dbID(); + $analysis->created( $new_analysis->created() ); + } + + $dbID ||= $sth->{'mysql_insertid'}; + $sth->finish(); + + # store description and display_label + if( defined( $analysis->description() ) || defined( $analysis->display_label() )|| defined( $analysis->web_data() )) { + $sth = $self->prepare( "INSERT IGNORE INTO analysis_description (analysis_id, display_label, description, displayable, web_data) VALUES (?,?,?,?, ?)"); + + $sth->bind_param(1,$dbID,SQL_INTEGER); + $sth->bind_param(2,$analysis->display_label(),SQL_VARCHAR); + $sth->bind_param(3,$analysis->description,SQL_LONGVARCHAR); + $sth->bind_param(4,$analysis->displayable,SQL_TINYINT); + #$sth->bind_param(5,$analysis->web_data(),SQL_LONGVARCHAR); + my $web_data; + $web_data = $self->dump_data($analysis->web_data()) if ($analysis->web_data()); + $sth->bind_param(5,$web_data,SQL_LONGVARCHAR); + $sth->execute(); + + $sth->finish(); + } + + + + $self->{_cache}->{$dbID} = $analysis; + $self->{_logic_name_cache}{lc($analysis->logic_name)} = $analysis; + + $analysis->adaptor( $self ); + $analysis->dbID( $dbID ); + + return $dbID; +} + + + +=head2 update + + Arg [1] : Bio::EnsEMBL::Analysis $anal + Example : $adaptor->update($anal) + Description: Updates this analysis in the database + Returntype : int 1 if update is performed, undef if it is not + Exceptions : throw if arg is not an analysis object + Caller : ? + Status : Stable + +=cut + +sub update { + my $self = shift; + my $a = shift; + + if (!ref($a) || !$a->isa('Bio::EnsEMBL::Analysis')) { + throw("Expected Bio::EnsEMBL::Analysis argument."); + } + + if(!$a->is_stored($self->db())) { + return undef; + } + + my $sth = $self->prepare + ("UPDATE analysis " . + "SET created = ?, logic_name = ?, db = ?, db_version = ?, db_file = ?, ". + " program = ?, program_version = ?, program_file = ?, ". + " parameters = ?, module = ?, module_version = ?, ". + " gff_source = ?, gff_feature = ? " . + "WHERE analysis_id = ?"); + + + + $sth->bind_param(1,$a->created,SQL_DATETIME); + $sth->bind_param(2,$a->logic_name,SQL_VARCHAR); + $sth->bind_param(3,$a->db,SQL_VARCHAR); + $sth->bind_param(4,$a->db_version,SQL_VARCHAR); + $sth->bind_param(5,$a->db_file,SQL_VARCHAR); + $sth->bind_param(6,$a->program,SQL_VARCHAR); + $sth->bind_param(7,$a->program_version,SQL_VARCHAR); + $sth->bind_param(8,$a->program_file,SQL_VARCHAR); + $sth->bind_param(9,$a->parameters,SQL_VARCHAR); + $sth->bind_param(10,$a->module,SQL_VARCHAR); + $sth->bind_param(11,$a->module_version,SQL_VARCHAR); + $sth->bind_param(12,$a->gff_source,SQL_VARCHAR); + $sth->bind_param(13,$a->gff_feature,SQL_VARCHAR); + $sth->bind_param(14,$a->dbID,SQL_INTEGER); + + $sth->execute(); + + $sth->finish(); + + # also update description & display label - may need to create these if + # not already there + $sth = $self->prepare("SELECT description FROM analysis_description WHERE analysis_id= ?"); + $sth->execute($a->dbID); + my $web_data; #this is an anonymous reference to a hash, will have to be dumped into string before writing to db + if ($sth->fetchrow_hashref) { # update if exists + $web_data = $self->dump_data($a->web_data()) if ($a->web_data()); + $sth = $self->prepare + ("UPDATE analysis_description SET description = ?, display_label = ?, displayable = ?, web_data = ? WHERE analysis_id = ?"); + $sth->bind_param(1,$a->description,SQL_LONGVARCHAR); + $sth->bind_param(2,$a->display_label(),SQL_VARCHAR); + $sth->bind_param(3,$a->displayable,SQL_TINYINT); + # print "after $web_data\n"; + $sth->bind_param(4,$web_data,SQL_LONGVARCHAR); + $sth->bind_param(5,$a->dbID,SQL_INTEGER); + $sth->execute(); + + } else { # create new entry + + if( $a->description() || $a->display_label() || $a->web_data) { + $web_data = $self->dump_data($a->web_data()) if ($a->web_data()); + #my $web_data = $self->dump_data($a->web_data()); + $sth = $self->prepare( "INSERT IGNORE INTO analysis_description (analysis_id, display_label, description, displayable, web_data) VALUES (?,?,?,?,?)"); + $sth->bind_param(1,$a->dbID,SQL_INTEGER); + $sth->bind_param(2,$a->display_label(),SQL_VARCHAR); + $sth->bind_param(3,$a->description,SQL_LONGVARCHAR); + $sth->bind_param(4,$a->displayable,SQL_TINYINT); + #my $web_data = $self->dump_data($a->web_data()); + $sth->bind_param(5,$web_data,SQL_LONGVARCHAR); + $sth->execute(); + + } + + } + + + $sth->finish(); + + # the logic_name cache needs to be re-updated now, since we may have just + # changed the logic_name + $self->fetch_all(); + + return 1; +} + + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::Analysis $anal + Example : $adaptor->remove($anal) + Description: Removes this analysis from the database. This is not really + safe to execute in a multi process environment, so programs + should not remove analysis while out on the farm. + Returntype : none + Exceptions : thrown if $anal arg is not an analysis object + Caller : ? + Status : Stable + +=cut + +sub remove { + my ($self, $analysis) = @_; + + if (!defined $analysis || !ref $analysis) { + throw("called remove on AnalysisAdaptor with a [$analysis]"); + } + + if(!$analysis->is_stored($self->db())) { + return undef; + } + + my $sth = $self->prepare("DELETE FROM analysis WHERE analysis_id = ?"); + $sth->bind_param(1,$analysis->dbID,SQL_INTEGER); + $sth->execute(); + + $sth = $self->prepare("DELETE FROM analysis_description WHERE analysis_id = ?"); + $sth->execute($analysis->dbID()); + + # remove this analysis from the cache + delete $self->{'_cache'}->{$analysis->dbID()}; + delete $self->{'_logic_name_cache'}->{lc($analysis->logic_name)}; + + + # unset the adaptor and dbID + $analysis->dbID(undef); + $analysis->adaptor(undef); + + return; +} + + + +=head2 exists + + Arg [1] : Bio::EnsEMBL::Analysis $anal + Example : if($adaptor->exists($anal)) #do something + Description: Tests whether this Analysis already exists in the database + by checking first if the adaptor and dbID are set and + secondly by whether it is in this adaptors internal cache. + Note that this will not actually check the database and will + not find and analysis which were recently added by other + processes. You are better off simply trying to store an + analysis which will reliably ensure that it is not stored twice + in the database. + Returntype : int dbID if analysis is found, otherwise returns undef + Exceptions : thrown if $anal arg is not an analysis object + Caller : store + Status : Stable + +=cut + +sub exists { + my ($self,$anal) = @_; + + if(!ref($anal) || !$anal->isa("Bio::EnsEMBL::Analysis")) { + throw("Object is not a Bio::EnsEMBL::Analysis"); + } + + #if this analysis is stored in this db already return its dbID + if($anal->is_stored($self->db())) { + return $anal->dbID(); + } + + #this analysis object is not stored but one exactly like it may have been + foreach my $cacheId (keys %{$self->{_cache}}) { + if ($self->{_cache}->{$cacheId}->compare($anal) >= 0) { + # $anal->dbID( $cacheId ); + # $anal->adaptor( $self ); + return $cacheId; + } + } + + #no analysis like this one exists in the database + return undef; +} + + +=head2 _objFromHashref + + Arg [1] : hashref $rowHash + Description: Private helper function generates an Analysis object from a + mysql row hash reference. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::AnalsisAdaptor::fetch_* methods + Status : Stable + +=cut + +sub _objFromHashref { + my $self = shift; + my $h = shift; + + my $web_data = $h->{web_data} ? $self->get_dumped_data($h->{web_data}) : ''; + + return Bio::EnsEMBL::Analysis->new_fast({ + dbID => $h->{analysis_id}, + adaptor => $self, + _db => $h->{db}, + _db_file => $h->{db_file}, + _db_version => $h->{db_version}, + _program => $h->{program}, + _program_version => $h->{program_version}, + _program_file => $h->{program_file}, + _gff_source => $h->{gff_source}, + _gff_feature => $h->{gff_feature}, + _module => $h->{module}, + _module_version => $h->{module_version}, + _parameters => $h->{parameters}, + _created => $h->{created}, + _logic_name => $h->{logic_name}, + _description => $h->{description}, + _display_label => $h->{display_label}, + _displayable => $h->{displayable}, + _web_data => $web_data, + }); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/ArchiveStableIdAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/ArchiveStableIdAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1513 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::ArchiveStableIdAdaptor + +=head1 SYNOPSIS + + my $registry = "Bio::EnsEMBL::Registry"; + + my $archiveStableIdAdaptor = + $registry->get_adaptor( 'Human', 'Core', 'ArchiveStableId' ); + + my $stable_id = 'ENSG00000068990'; + + my $arch_id = $archiveStableIdAdaptor->fetch_by_stable_id($stable_id); + + print("Latest incarnation of this stable ID:\n"); + printf( " Stable ID: %s.%d\n", + $arch_id->stable_id(), $arch_id->version() ); + print(" Release: " + . $arch_id->release() . " (" + . $arch_id->assembly() . ", " + . $arch_id->db_name() + . ")\n" ); + + print "\nStable ID history:\n\n"; + + my $history = + $archiveStableIdAdaptor->fetch_history_tree_by_stable_id( + $stable_id); + + foreach my $a ( @{ $history->get_all_ArchiveStableIds } ) { + printf( " Stable ID: %s.%d\n", $a->stable_id(), $a->version() ); + print(" Release: " + . $a->release() . " (" + . $a->assembly() . ", " + . $a->db_name() + . ")\n\n" ); + } + +=head1 DESCRIPTION + +ArchiveStableIdAdaptor does all SQL to create ArchiveStableIds and works +of + + stable_id_event + mapping_session + peptite_archive + gene_archive + +tables inside the core database. + +This whole module has a status of At Risk as it is under development. + +=head1 METHODS + + fetch_by_stable_id + fetch_by_stable_id_version + fetch_by_stable_id_dbname + fetch_all_by_archive_id + fetch_predecessors_by_archive_id + fetch_successors_by_archive_id + fetch_history_tree_by_stable_id + add_all_current_to_history + list_dbnames + previous_dbname + next_dbname + get_peptide + get_current_release + get_current_assembly + +=head1 RELATED MODULES + + Bio::EnsEMBL::ArchiveStableId + Bio::EnsEMBL::StableIdEvent + Bio::EnsEMBL::StableIdHistoryTree + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor; + +use strict; +use warnings; +no warnings qw(uninitialized); + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +use Bio::EnsEMBL::ArchiveStableId; +use Bio::EnsEMBL::StableIdEvent; +use Bio::EnsEMBL::StableIdHistoryTree; +use Bio::EnsEMBL::Utils::Exception qw(deprecate warning throw); + +use constant MAX_ROWS => 30; +use constant NUM_HIGH_SCORERS => 20; + + +=head2 fetch_by_stable_id + + Arg [1] : string $stable_id + Arg [2] : (optional) string $type + Example : none + Description : Retrives an ArchiveStableId that is the latest incarnation of + given stable_id. + Returntype : Bio::EnsEMBL::ArchiveStableId or undef if not in database + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_by_stable_id { + my $self = shift; + my $stable_id = shift; + + my $arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $stable_id, + -adaptor => $self + ); + + @_ ? $arch_id->type(shift) : $self->_resolve_type($arch_id); + + if ($self->lookup_current($arch_id)) { + + # stable ID is in current release + $arch_id->version($arch_id->current_version); + $arch_id->db_name($self->dbc->dbname); + $arch_id->release($self->get_current_release); + $arch_id->assembly($self->get_current_assembly); + + } else { + + # look for latest version of this stable id + my $extra_sql = defined($arch_id->{'type'}) ? + " AND sie.type = '@{[lc($arch_id->{'type'})]}'" : ''; + + my $r = $self->_fetch_archive_id($stable_id, $extra_sql, $extra_sql); + + if ($r->{'new_stable_id'} and $r->{'new_stable_id'} eq $stable_id) { + # latest event is a self event, use new_* data + $arch_id->version($r->{'new_version'}); + $arch_id->release($r->{'new_release'}); + $arch_id->assembly($r->{'new_assembly'}); + $arch_id->db_name($r->{'new_db_name'}); + } else { + # latest event is a deletion event (or mapping to other ID; this clause + # is only used to cope with buggy data where deletion events are + # missing), use old_* data + $arch_id->version($r->{'old_version'}); + $arch_id->release($r->{'old_release'}); + $arch_id->assembly($r->{'old_assembly'}); + $arch_id->db_name($r->{'old_db_name'}); + } + + $arch_id->type(ucfirst(lc($r->{'type'}))); + } + + if (! defined $arch_id->db_name) { + # couldn't find stable ID in archive or current db + return undef; + } + + $arch_id->is_latest(1); + + return $arch_id; +} + + +=head2 fetch_by_stable_id_version + + Arg [1] : string $stable_id + Arg [2] : int $version + Example : none + Description : Retrieve an ArchiveStableId with given version and stable ID. + Returntype : Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_by_stable_id_version { + my $self = shift; + my $stable_id = shift; + my $version = shift; + + my $arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $stable_id, + -version => $version, + -adaptor => $self + ); + + @_ ? $arch_id->type(shift) : $self->_resolve_type($arch_id); + + if ($self->lookup_current($arch_id) && $arch_id->is_current) { + + # this version is the current one + $arch_id->db_name($self->dbc->dbname); + $arch_id->release($self->get_current_release); + $arch_id->assembly($self->get_current_assembly); + + } else { + + # find latest release this stable ID version is found in archive + my $extra_sql1 = qq(AND sie.old_version = "$version"); + my $extra_sql2 = qq(AND sie.new_version = "$version"); + my $r = $self->_fetch_archive_id($stable_id, $extra_sql1, $extra_sql2); + + if ($r->{'new_stable_id'} and $r->{'new_stable_id'} eq $stable_id + and $r->{'new_version'} == $version) { + # latest event is a self event, use new_* data + $arch_id->release($r->{'new_release'}); + $arch_id->assembly($r->{'new_assembly'}); + $arch_id->db_name($r->{'new_db_name'}); + } else { + # latest event is a deletion event (or mapping to other ID; this clause + # is only used to cope with buggy data where deletion events are + # missing), use old_* data + $arch_id->release($r->{'old_release'}); + $arch_id->assembly($r->{'old_assembly'}); + $arch_id->db_name($r->{'old_db_name'}); + } + + $arch_id->type(ucfirst(lc($r->{'type'}))); + } + + if (! defined $arch_id->db_name) { + # couldn't find stable ID version in archive or current release + return undef; + } + + return $arch_id; +} + + +=head2 fetch_by_stable_id_dbname + + Arg [1] : string $stable_id + Arg [2] : string $db_name + Example : none + Description : Create an ArchiveStableId from given arguments. + Returntype : Bio::EnsEMBL::ArchiveStableId or undef if not in database + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_by_stable_id_dbname { + my $self = shift; + my $stable_id = shift; + my $db_name = shift; + + my $arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $stable_id, + -db_name => $db_name, + -adaptor => $self + ); + + @_ ? $arch_id->type(shift) : $self->_resolve_type($arch_id); + + if ($self->lookup_current($arch_id) and $db_name eq $self->dbc->dbname) { + + # this version is the current one + $arch_id->version($arch_id->current_version); + $arch_id->release($self->get_current_release); + $arch_id->assembly($self->get_current_assembly); + + } else { + + # find version for this dbname in the stable ID archive + my $extra_sql = defined($arch_id->{'type'}) ? + " AND sie.type = '@{[lc($arch_id->{'type'})]}'" : ''; + my $extra_sql1 = $extra_sql . qq( AND ms.old_db_name = "$db_name"); + my $extra_sql2 = $extra_sql . qq( AND ms.new_db_name = "$db_name"); + my $r = $self->_fetch_archive_id($stable_id, $extra_sql1, $extra_sql2); + + if ($r->{'new_stable_id'} and $r->{'new_stable_id'} eq $stable_id + and $r->{'new_db_name'} eq $db_name) { + + # latest event is a self event, use new_* data + $arch_id->release($r->{'new_release'}); + $arch_id->assembly($r->{'new_assembly'}); + $arch_id->version($r->{'new_version'}); + } else { + # latest event is a deletion event (or mapping to other ID; this clause + # is only used to cope with buggy data where deletion events are + # missing), use old_* data + $arch_id->release($r->{'old_release'}); + $arch_id->assembly($r->{'old_assembly'}); + $arch_id->version($r->{'old_version'}); + } + + $arch_id->type(ucfirst(lc($r->{'type'}))); + } + + if (! defined $arch_id->version ) { + # couldn't find stable ID version in archive or current release + return undef; + } + + return $arch_id; +} + +# +# Helper method to do fetch ArchiveStableId from db. +# Used by fetch_by_stable_id(), fetch_by_stable_id_version() and +# fetch_by_stable_id_dbname(). +# Returns hashref as returned by DBI::sth::fetchrow_hashref +# +sub _fetch_archive_id { + my $self = shift; + my $stable_id = shift; + my $extra_sql1 = shift; + my $extra_sql2 = shift; + + # using a UNION is much faster in this query than somthing like + # "... AND (sie.old_stable_id = ? OR sie.new_stable_id = ?)" + my $sql = qq( + (SELECT * FROM stable_id_event sie, mapping_session ms + WHERE sie.mapping_session_id = ms.mapping_session_id + AND sie.old_stable_id = ? + $extra_sql1) + UNION + (SELECT * FROM stable_id_event sie, mapping_session ms + WHERE sie.mapping_session_id = ms.mapping_session_id + AND sie.new_stable_id = ? + $extra_sql2) + ORDER BY created DESC + LIMIT 1 + ); + + my $sth = $self->prepare($sql); + $sth->execute($stable_id,$stable_id); + my $r = $sth->fetchrow_hashref; + $sth->finish; + + return $r; +} + + +=head2 fetch_all_by_archive_id + + Arg [1] : Bio::EnsEMBL::ArchiveStableId $archive_id + Arg [2] : String $return_type - type of ArchiveStableId to fetch + Example : my $arch_id = $arch_adaptor->fetch_by_stable_id('ENSG0001'); + my @archived_transcripts = + $arch_adaptor->fetch_all_by_archive_id($arch_id, 'Transcript'); + Description : Given a ArchiveStableId it retrieves associated ArchiveStableIds + of specified type (e.g. retrieve transcripts for genes or vice + versa). + + See also fetch_associated_archived() for a different approach to + retrieve this data. + Returntype : listref Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : Bio::EnsEMBL::ArchiveStableId->get_all_gene_archive_ids, + get_all_transcript_archive_ids, get_all_translation_archive_ids + Status : At Risk + : under development + +=cut + +sub fetch_all_by_archive_id { + my $self = shift; + my $archive_id = shift; + my $return_type = shift; + + my @result = (); + my $lc_self_type = lc($archive_id->type); + my $lc_return_type = lc($return_type); + + my $sql = qq( + SELECT + ga.${lc_return_type}_stable_id, + ga.${lc_return_type}_version, + m.old_db_name, + m.old_release, + m.old_assembly + FROM gene_archive ga, mapping_session m + WHERE ga.${lc_self_type}_stable_id = ? + AND ga.${lc_self_type}_version = ? + AND ga.mapping_session_id = m.mapping_session_id + ); + + my $sth = $self->prepare($sql); + $sth->bind_param(1, $archive_id->stable_id, SQL_VARCHAR); + $sth->bind_param(2, $archive_id->version, SQL_SMALLINT); + $sth->execute; + + my ($stable_id, $version, $db_name, $release, $assembly); + $sth->bind_columns(\$stable_id, \$version, \$db_name, \$release, \$assembly); + + while ($sth->fetch) { + my $new_arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $stable_id, + -version => $version, + -db_name => $db_name, + -release => $release, + -assembly => $assembly, + -type => $return_type, + -adaptor => $self + ); + + push( @result, $new_arch_id ); + } + + $sth->finish(); + return \@result; +} + + +=head2 fetch_associated_archived + + Arg[1] : Bio::EnsEMBL::ArchiveStableId $arch_id - + the ArchiveStableId to fetch associated archived IDs for + Example : my ($arch_gene, $arch_tr, $arch_tl, $pep_seq) = + @{ $archive_adaptor->fetch_associated_archived($arch_id) }; + Description : Fetches associated archived stable IDs from the db for a given + ArchiveStableId (version is taken into account). + Return type : Listref of + ArchiveStableId archived gene + ArchiveStableId archived transcript + (optional) ArchiveStableId archived translation + (optional) peptide sequence + Exceptions : thrown on missing or wrong argument + thrown if ArchiveStableID has no type + Caller : Bio::EnsEMBL::ArchiveStableId->get_all_associated_archived() + Status : At Risk + : under development + +=cut + +sub fetch_associated_archived { + my $self = shift; + my $arch_id = shift; + + throw("Need a Bio::EnsEMBL::ArchiveStableId") unless ($arch_id + and ref($arch_id) and $arch_id->isa('Bio::EnsEMBL::ArchiveStableId')); + + my $type = $arch_id->type(); + + if ( !defined($type) ) { + throw("Can't deduce ArchiveStableId type."); + } + + $type = lc($type); + + my $sql = qq( + SELECT ga.gene_stable_id, + ga.gene_version, + ga.transcript_stable_id, + ga.transcript_version, + ga.translation_stable_id, + ga.translation_version, + pa.peptide_seq, + ms.old_release, + ms.old_assembly, + ms.old_db_name + FROM (mapping_session ms, gene_archive ga) + LEFT JOIN peptide_archive pa + ON ga.peptide_archive_id = pa.peptide_archive_id + WHERE ga.mapping_session_id = ms.mapping_session_id + AND ga.${type}_stable_id = ? + AND ga.${type}_version = ? + ); + + my $sth = $self->prepare($sql); + $sth->bind_param(1, $arch_id->stable_id, SQL_VARCHAR); + $sth->bind_param(2, $arch_id->version, SQL_SMALLINT); + $sth->execute; + + my @result = (); + + while (my $r = $sth->fetchrow_hashref) { + + my @row = (); + + # create ArchiveStableIds genes, transcripts and translations + push @row, Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $r->{'gene_stable_id'}, + -version => $r->{'gene_version'}, + -db_name => $r->{'old_db_name'}, + -release => $r->{'old_release'}, + -assembly => $r->{'old_assembly'}, + -type => 'Gene', + -adaptor => $self + ); + + push @row, Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $r->{'transcript_stable_id'}, + -version => $r->{'transcript_version'}, + -db_name => $r->{'old_db_name'}, + -release => $r->{'old_release'}, + -assembly => $r->{'old_assembly'}, + -type => 'Transcript', + -adaptor => $self + ); + + if ($r->{'translation_stable_id'}) { + push @row, Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $r->{'translation_stable_id'}, + -version => $r->{'translation_version'}, + -db_name => $r->{'old_db_name'}, + -release => $r->{'old_release'}, + -assembly => $r->{'old_assembly'}, + -type => 'Translation', + -adaptor => $self + ); + + # push peptide sequence onto result list + push @row, $r->{'peptide_seq'}; + } + + push @result, \@row; + } + + return \@result; +} + + +=head2 fetch_predecessors_by_archive_id + + Arg [1] : Bio::EnsEMBL::ArchiveStableId + Example : none + Description : Retrieve a list of ArchiveStableIds that were mapped to the + given one. This method goes back only one level, to retrieve + a full predecessor history use fetch_predecessor_history, or + ideally fetch_history_tree_by_stable_id for the complete + history network. + Returntype : listref Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : Bio::EnsEMBL::ArchiveStableId->get_all_predecessors + Status : At Risk + : under development + +=cut + +sub fetch_predecessors_by_archive_id { + my $self = shift; + my $arch_id = shift; + + my @result; + + if( ! ( defined $arch_id->stable_id() && + defined $arch_id->db_name() )) { + throw( "Need db_name for predecessor retrieval" ); + } + + my $sql = qq( + SELECT + sie.old_stable_id, + sie.old_version, + sie.type, + m.old_db_name, + m.old_release, + m.old_assembly + FROM mapping_session m, stable_id_event sie + WHERE sie.mapping_session_id = m.mapping_session_id + AND sie.new_stable_id = ? + AND m.new_db_name = ? + ); + + my $sth = $self->prepare($sql); + $sth->bind_param(1, $arch_id->stable_id, SQL_VARCHAR); + $sth->bind_param(2, $arch_id->db_name, SQL_VARCHAR); + $sth->execute(); + + my ($old_stable_id, $old_version, $type, $old_db_name, $old_release, $old_assembly); + $sth->bind_columns(\$old_stable_id, \$old_version, \$type, \$old_db_name, \$old_release, \$old_assembly); + + while ($sth->fetch) { + if (defined $old_stable_id) { + my $old_arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $old_stable_id, + -version => $old_version, + -db_name => $old_db_name, + -release => $old_release, + -assembly => $old_assembly, + -type => $type, + -adaptor => $self + ); + push( @result, $old_arch_id ); + } + } + $sth->finish(); + + # if you didn't find any predecessors, there might be a gap in the + # mapping_session history (i.e. databases in mapping_session don't chain). To + # bridge the gap, look in the previous mapping_session for identical + # stable_id.version + unless (@result) { + + $sql = qq( + SELECT + sie.new_stable_id, + sie.new_version, + sie.type, + m.new_db_name, + m.new_release, + m.new_assembly + FROM mapping_session m, stable_id_event sie + WHERE sie.mapping_session_id = m.mapping_session_id + AND sie.new_stable_id = ? + AND m.new_db_name = ? + ); + + $sth = $self->prepare($sql); + + my $curr_dbname = $arch_id->db_name; + + PREV: + while (my $prev_dbname = $self->previous_dbname($curr_dbname)) { + + $sth->bind_param(1,$arch_id->stable_id, SQL_VARCHAR); + $sth->bind_param(2,$prev_dbname, SQL_VARCHAR); + $sth->execute(); + + $sth->bind_columns(\$old_stable_id, \$old_version, \$type, \$old_db_name, \$old_release, \$old_assembly); + + while( $sth->fetch() ) { + if (defined $old_stable_id) { + my $old_arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $old_stable_id, + -version => $old_version, + -db_name => $old_db_name, + -release => $old_release, + -assembly => $old_assembly, + -type => $type, + -adaptor => $self + ); + push( @result, $old_arch_id ); + + last PREV; + } + } + + $curr_dbname = $prev_dbname; + + } + + $sth->finish(); + } + + return \@result; +} + + +=head2 fetch_successors_by_archive_id + + Arg [1] : Bio::EnsEMBL::ArchiveStableId + Example : none + Description : Retrieve a list of ArchiveStableIds that the given one was + mapped to. This method goes forward only one level, to retrieve + a full successor history use fetch_successor_history, or + ideally fetch_history_tree_by_stable_id for the complete + history network. + Returntype : listref Bio::EnsEMBL::ArchiveStableId + Exceptions : none + Caller : Bio::EnsEMBL::ArchiveStableId->get_all_successors + Status : At Risk + : under development + +=cut + +sub fetch_successors_by_archive_id { + my $self = shift; + my $arch_id = shift; + my @result; + + + if( ! ( defined $arch_id->stable_id() && + defined $arch_id->db_name() )) { + throw( "Need db_name for successor retrieval" ); + } + + my $sql = qq( + SELECT + sie.new_stable_id, + sie.new_version, + sie.type, + m.new_db_name, + m.new_release, + m.new_assembly + FROM mapping_session m, stable_id_event sie + WHERE sie.mapping_session_id = m.mapping_session_id + AND sie.old_stable_id = ? + AND m.old_db_name = ? + ); + + my $sth = $self->prepare( $sql ); + $sth->bind_param(1,$arch_id->stable_id,SQL_VARCHAR); + $sth->bind_param(2,$arch_id->db_name,SQL_VARCHAR); + $sth->execute(); + + my ($new_stable_id, $new_version, $type, $new_db_name, $new_release, $new_assembly); + $sth->bind_columns(\$new_stable_id, \$new_version, \$type, \$new_db_name, \$new_release, \$new_assembly); + + while( $sth->fetch() ) { + if( defined $new_stable_id ) { + my $new_arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $new_stable_id, + -version => $new_version, + -db_name => $new_db_name, + -release => $new_release, + -assembly => $new_assembly, + -type => $type, + -adaptor => $self + ); + + push( @result, $new_arch_id ); + } + } + $sth->finish(); + + # if you didn't find any successors, there might be a gap in the + # mapping_session history (i.e. databases in mapping_session don't chain). To + # bridge the gap, look in the next mapping_session for identical + # stable_id.version + unless (@result) { + + $sql = qq( + SELECT + sie.old_stable_id, + sie.old_version, + sie.type, + m.old_db_name, + m.old_release, + m.old_assembly + FROM mapping_session m, stable_id_event sie + WHERE sie.mapping_session_id = m.mapping_session_id + AND sie.old_stable_id = ? + AND m.old_db_name = ? + ); + + $sth = $self->prepare($sql); + + my $curr_dbname = $arch_id->db_name; + + NEXTDB: + while (my $next_dbname = $self->next_dbname($curr_dbname)) { + + $sth->bind_param(1, $arch_id->stable_id, SQL_VARCHAR); + $sth->bind_param(2, $next_dbname, SQL_VARCHAR); + $sth->execute(); + + $sth->bind_columns(\$new_stable_id, \$new_version, \$type, \$new_db_name, \$new_release, \$new_assembly); + + while( $sth->fetch() ) { + if (defined $new_stable_id) { + my $new_arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $new_stable_id, + -version => $new_version, + -db_name => $new_db_name, + -release => $new_release, + -assembly => $new_assembly, + -type => $type, + -adaptor => $self + ); + + push( @result, $new_arch_id ); + + last NEXTDB; + } + } + + $curr_dbname = $next_dbname; + + } + + $sth->finish(); + } + + return \@result; +} + + + +=head2 fetch_history_tree_by_stable_id + + Arg[1] : String $stable_id - the stable ID to fetch the history tree for + Arg[2] : (optional) Int $num_high_scorers + number of mappings per stable ID allowed when filtering + Arg[3] : (optional) Int $max_rows + maximum number of stable IDs in history tree (used for + filtering) + Example : my $history = $archive_adaptor->fetch_history_tree_by_stable_id( + 'ENSG00023747897'); + Description : Returns the history tree for a given stable ID. This will + include a network of all stable IDs it is related to. The + method will try to return a minimal (sparse) set of nodes + (ArchiveStableIds) and links (StableIdEvents) by removing any + redundant entries and consolidating mapping events so that only + changes are recorded. + Return type : Bio::EnsEMBL::StableIdHistoryTree + Exceptions : thrown on missing argument + Caller : Bio::EnsEMBL::ArchiveStableId::get_history_tree, general + Status : At Risk + : under development + +=cut + +sub fetch_history_tree_by_stable_id { + my ($self, $stable_id, $num_high_scorers, $max_rows) = @_; + + throw("Expecting a stable ID argument.") unless $stable_id; + + $num_high_scorers ||= NUM_HIGH_SCORERS; + $max_rows ||= MAX_ROWS; + + # using a UNION is much faster in this query than somthing like + # "... AND (sie.old_stable_id = ?) OR (sie.new_stable_id = ?)" + my $sql = qq( + SELECT sie.old_stable_id, sie.old_version, + ms.old_db_name, ms.old_release, ms.old_assembly, + sie.new_stable_id, sie.new_version, + ms.new_db_name, ms.new_release, ms.new_assembly, + sie.type, sie.score + FROM stable_id_event sie, mapping_session ms + WHERE sie.mapping_session_id = ms.mapping_session_id + AND sie.old_stable_id = ? + UNION + SELECT sie.old_stable_id, sie.old_version, + ms.old_db_name, ms.old_release, ms.old_assembly, + sie.new_stable_id, sie.new_version, + ms.new_db_name, ms.new_release, ms.new_assembly, + sie.type, sie.score + FROM stable_id_event sie, mapping_session ms + WHERE sie.mapping_session_id = ms.mapping_session_id + AND sie.new_stable_id = ? + ); + + my $sth = $self->prepare($sql); + + my $history = Bio::EnsEMBL::StableIdHistoryTree->new( + -CURRENT_DBNAME => $self->dbc->dbname, + -CURRENT_RELEASE => $self->get_current_release, + -CURRENT_ASSEMBLY => $self->get_current_assembly, + ); + + # remember stable IDs you need to do and those that are done. Initialise the + # former hash with the focus stable ID + my %do = ($stable_id => 1); + my %done; + + # while we got someting to do + while (my ($id) = keys(%do)) { + + # if we already have more than MAX_ROWS stable IDs in this tree, we can't + # build the full tree. Return undef. + if (scalar(keys(%done)) > $max_rows) { + # warning("Too many related stable IDs (".scalar(keys(%done)).") to draw a history tree."); + $history->is_incomplete(1); + $sth->finish; + last; + } + + # mark this stable ID as done + delete $do{$id}; + $done{$id} = 1; + + # fetch all stable IDs related to this one from the database + $sth->bind_param(1, $id, SQL_VARCHAR); + $sth->bind_param(2, $id, SQL_VARCHAR); + $sth->execute; + + my @events; + + while (my $r = $sth->fetchrow_hashref) { + + # + # create old and new ArchiveStableIds and a StableIdEvent to link them + # add all of these to the history tree + # + my ($old_id, $new_id); + + if ($r->{'old_stable_id'}) { + $old_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $r->{'old_stable_id'}, + -version => $r->{'old_version'}, + -db_name => $r->{'old_db_name'}, + -release => $r->{'old_release'}, + -assembly => $r->{'old_assembly'}, + -type => $r->{'type'}, + -adaptor => $self + ); + } + + if ($r->{'new_stable_id'}) { + $new_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $r->{'new_stable_id'}, + -version => $r->{'new_version'}, + -db_name => $r->{'new_db_name'}, + -release => $r->{'new_release'}, + -assembly => $r->{'new_assembly'}, + -type => $r->{'type'}, + -adaptor => $self + ); + } + + my $event = Bio::EnsEMBL::StableIdEvent->new( + -old_id => $old_id, + -new_id => $new_id, + -score => $r->{'score'} + ); + + push @events, $event; + + } + + # filter out low-scoring events; the number of highest scoring events + # returned is defined by NUM_HIGH_SCORERS + my @others; + + foreach my $event (@events) { + + my $old_id = $event->old_ArchiveStableId; + my $new_id = $event->new_ArchiveStableId; + + # creation, deletion and mapping-to-self events are added to the history + # tree directly + if (!$old_id || !$new_id || ($old_id->stable_id eq $new_id->stable_id)) { + $history->add_StableIdEvents($event); + } else { + push @others, $event; + } + + } + + #if (scalar(@others) > $num_high_scorers) { + # warn "Filtering ".(scalar(@others) - $num_high_scorers). + # " low-scoring events.\n"; + #} + + my $k = 0; + foreach my $event (sort { $b->score <=> $a->score } @others) { + $history->add_StableIdEvents($event); + + # mark stable IDs as todo if appropriate + $do{$event->old_ArchiveStableId->stable_id} = 1 + unless $done{$event->old_ArchiveStableId->stable_id}; + $do{$event->new_ArchiveStableId->stable_id} = 1 + unless $done{$event->new_ArchiveStableId->stable_id}; + + last if (++$k == $num_high_scorers); + } + + } + + $sth->finish; + + # try to consolidate the tree (remove redundant nodes, bridge gaps) + $history->consolidate_tree; + + # now add ArchiveStableIds for current Ids not found in the archive + $self->add_all_current_to_history($history); + + # calculate grid coordinates for the sorted tree; this will also try to + # untangle the tree + $history->calculate_coords; + + return $history; +} + + +=head2 add_all_current_to_history + + Arg[1] : Bio::EnsEMBL::StableIdHistoryTree $history - + the StableIdHistoryTree object to add the current IDs to + Description : This method adds the current versions of all stable IDs found + in a StableIdHistoryTree object to the tree, by creating + appropriate Events for the stable IDs found in the *_stable_id + tables. This is a helper method for + fetch_history_tree_by_stable_id(), see there for more + documentation. + Return type : none (passed-in object is manipulated) + Exceptions : thrown on missing or wrong argument + Caller : internal + Status : At Risk + : under development + +=cut + +sub add_all_current_to_history { + my $self = shift; + my $history = shift; + + unless ($history and $history->isa('Bio::EnsEMBL::StableIdHistoryTree')) { + throw("Need a Bio::EnsEMBL::StableIdHistoryTree."); + } + + my @ids = @{ $history->get_unique_stable_ids }; + my $id_string = join("', '", @ids); + + my $tmp_id = Bio::EnsEMBL::ArchiveStableId->new(-stable_id => $ids[0]); + my $type = lc($self->_resolve_type($tmp_id)); + return unless ($type); + + # get current stable IDs from db + my $sql = qq( + SELECT stable_id, version FROM ${type} + WHERE stable_id IN ('$id_string') + ); + my $sth = $self->prepare($sql); + $sth->execute; + + while (my ($stable_id, $version) = $sth->fetchrow_array) { + + my $new_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => $stable_id, + -version => $version, + -current_version => $version, + -db_name => $self->dbc->dbname, + -release => $self->get_current_release, + -assembly => $self->get_current_assembly, + -type => $type, + -adaptor => $self + ); + + my $event = $history->get_latest_StableIdEvent($new_id); + next unless ($event); + + if ($event->old_ArchiveStableId and + $event->old_ArchiveStableId->stable_id eq $stable_id) { + + # latest event was a self event + # update it with current stable ID and add to tree + $event->new_ArchiveStableId($new_id); + + } else { + + # latest event was a non-self event + # create a new event where the old_id is the new_id from latest + my $new_event = Bio::EnsEMBL::StableIdEvent->new( + -old_id => $event->new_ArchiveStableId, + -new_id => $new_id, + -score => $event->score, + ); + $history->add_StableIdEvents($new_event); + } + + } + + # refresh node cache + $history->flush_ArchiveStableIds; + $history->add_ArchiveStableIds_for_events; +} + + +=head2 fetch_successor_history + + Arg [1] : Bio::EnsEMBL::ArchiveStableId $arch_id + Example : none + Description : Gives back a list of archive stable ids which are successors in + the stable_id_event tree of the given stable_id. Might well be + empty. + + This method isn't deprecated, but in most cases you will rather + want to use fetch_history_tree_by_stable_id(). + Returntype : listref Bio::EnsEMBL::ArchiveStableId + Since every ArchiveStableId knows about it's successors, this is + a linked tree. + Exceptions : none + Caller : webcode for archive + Status : At Risk + : under development + +=cut + +sub fetch_successor_history { + my $self = shift; + my $arch_id = shift; + + my $current_db_name = $self->list_dbnames->[0]; + my $dbname = $arch_id->db_name; + + if ($dbname eq $current_db_name) { + return [$arch_id]; + } + + my $old = []; + my @result = (); + + push @$old, $arch_id; + + while ($dbname ne $current_db_name) { + my $new = []; + while (my $asi = (shift @$old)) { + push @$new, @{ $asi->get_all_successors }; + } + + if (@$new) { + $dbname = $new->[0]->db_name; + } else { + last; + } + + # filter duplicates + my %unique = map { join(":", $_->stable_id, $_->version, $_->release) => + $_ } @$new; + @$new = values %unique; + + @$old = @$new; + push @result, @$new; + } + + return \@result; +} + + +=head2 fetch_predecessor_history + + Arg [1] : Bio::EnsEMBL::ArchiveStableId $arch_id + Example : none + Description : Gives back a list of archive stable ids which are predecessors + in the stable_id_event tree of the given stable_id. Might well + be empty. + + This method isn't deprecated, but in most cases you will rather + want to use fetch_history_tree_by_stable_id(). + Returntype : listref Bio::EnsEMBL::ArchiveStableId + Since every ArchiveStableId knows about it's successors, this is + a linked tree. + Exceptions : none + Caller : webcode for archive + Status : At Risk + : under development + +=cut + +sub fetch_predecessor_history { + my $self = shift; + my $arch_id = shift; + + my $oldest_db_name = $self->list_dbnames->[-1]; + my $dbname = $arch_id->db_name; + + if ($dbname eq $oldest_db_name) { + return [$arch_id]; + } + + my $old = []; + my @result = (); + + push @$old, $arch_id; + + while ($dbname ne $oldest_db_name) { + my $new = []; + while (my $asi = (shift @$old)) { + push @$new, @{ $asi->get_all_predecessors }; + } + + if( @$new ) { + $dbname = $new->[0]->db_name; + } else { + last; + } + + # filter duplicates + my %unique = map { join(":", $_->stable_id, $_->version, $_->release) => + $_ } @$new; + @$new = values %unique; + + @$old = @$new; + push @result, @$new; + } + + return \@result; +} + + +=head2 list_dbnames + + Args : none + Example : none + Description : A list of available database names from the latest (current) to + the oldest (ordered). + Returntype : listref of strings + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub list_dbnames { + my $self = shift; + + if( ! defined $self->{'dbnames'} ) { + + my $sql = qq( + SELECT old_db_name, new_db_name + FROM mapping_session + ORDER BY created DESC + ); + my $sth = $self->prepare( $sql ); + $sth->execute(); + my ( $old_db_name, $new_db_name ); + + my @dbnames = (); + my %seen; + + $sth->bind_columns( \$old_db_name, \$new_db_name ); + + while( $sth->fetch() ) { + # this code now can deal with non-chaining mapping sessions + push(@{ $self->{'dbnames'} }, $new_db_name) unless ($seen{$new_db_name}); + $seen{$new_db_name} = 1; + + push(@{ $self->{'dbnames'} }, $old_db_name) unless ($seen{$old_db_name}); + $seen{$old_db_name} = 1; + } + + $sth->finish(); + + } + + return $self->{'dbnames'}; +} + + +=head2 previous_dbname + + Arg[1] : String $dbname - focus db name + Example : my $prev_db = $self->previous_dbname($curr_db); + Description : Returns the name of the next oldest database which has mapping + session information. + Return type : String (or undef if not available) + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub previous_dbname { + my $self = shift; + my $dbname = shift; + + my $curr_idx = $self->_dbname_index($dbname); + my @dbnames = @{ $self->list_dbnames }; + + if ($curr_idx == @dbnames) { + # this is the oldest dbname, so no previous one available + return undef; + } else { + return $dbnames[$curr_idx+1]; + } +} + + +=head2 next_dbname + + Arg[1] : String $dbname - focus db name + Example : my $prev_db = $self->next_dbname($curr_db); + Description : Returns the name of the next newest database which has mapping + session information. + Return type : String (or undef if not available) + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub next_dbname { + my $self = shift; + my $dbname = shift; + + my $curr_idx = $self->_dbname_index($dbname); + my @dbnames = @{ $self->list_dbnames }; + + if ($curr_idx == 0) { + # this is the latest dbname, so no next one available + return undef; + } else { + return $dbnames[$curr_idx-1]; + } +} + + +# +# helper method to return the array index of a database in the ordered list of +# available databases (as returned by list_dbnames() +# +sub _dbname_index { + my $self = shift; + my $dbname = shift; + + my @dbnames = @{ $self->list_dbnames }; + + for (my $i = 0; $i < @dbnames; $i++) { + if ($dbnames[$i] eq $dbname) { + return $i; + } + } +} + + +=head2 get_peptide + + Arg [1] : Bio::EnsEMBL::ArchiveStableId $arch_id + Example : none + Description : Retrieves the peptide string for given ArchiveStableId. If its + not a peptide or not in the database returns undef. + Returntype : string or undef + Exceptions : none + Caller : Bio::EnsEMBL::ArchiveStableId->get_peptide, general + Status : At Risk + : under development + +=cut + +sub get_peptide { + my $self = shift; + my $arch_id = shift; + + if ( lc( $arch_id->type() ) ne 'translation' ) { + return undef; + } + + my $sql = qq( + SELECT pa.peptide_seq + FROM peptide_archive pa, gene_archive ga + WHERE ga.translation_stable_id = ? + AND ga.translation_version = ? + AND ga.peptide_archive_id = pa.peptide_archive_id + ); + + my $sth = $self->prepare($sql); + $sth->bind_param( 1, $arch_id->stable_id, SQL_VARCHAR ); + $sth->bind_param( 2, $arch_id->version, SQL_SMALLINT ); + $sth->execute(); + + my ($peptide_seq) = $sth->fetchrow_array(); + $sth->finish(); + + return $peptide_seq; +} ## end sub get_peptide + + +=head2 get_current_release + + Example : my $current_release = $archive_adaptor->get_current_release; + Description : Returns the current release number (as found in the meta table). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_current_release { + my $self = shift; + + unless ($self->{'current_release'}) { + my $mca = $self->db->get_MetaContainer; + my ($release) = @{ $mca->list_value_by_key('schema_version') }; + $self->{'current_release'} = $release; + } + + return $self->{'current_release'}; +} + + +=head2 get_current_assembly + + Example : my $current_assembly = $archive_adaptor->get_current_assembly; + Description : Returns the current assembly version (as found in the meta + table). + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_current_assembly { + my $self = shift; + + unless ($self->{'current_assembly'}) { + my $mca = $self->db->get_MetaContainer; + my ($assembly) = @{ $mca->list_value_by_key('assembly.default') }; + $self->{'current_assembly'} = $assembly; + } + + return $self->{'current_assembly'}; +} + + +=head2 lookup_current + + Arg[1] : Bio::EnsEMBL::ArchiveStableId $arch_id - + the stalbe ID to find the current version for + Example : if ($self->lookup_version($arch_id) { + $arch_id->version($arch_id->current_version); + $arch_id->db_name($self->dbc->dbname); + Description : Look in [gene|transcript|translation]_stable_id if you can find + a current version for this stable ID. Set + ArchiveStableId->current_version if found. + Return type : Boolean (TRUE if current version found, else FALSE) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub lookup_current { + my $self = shift; + my $arch_id = shift; + + my $type = lc( $arch_id->type ); + + unless ($type) { + warning("Can't lookup current version without a type."); + return 0; + } + + my $sql = qq( + SELECT version FROM ${type} + WHERE stable_id = ? + ); + my $sth = $self->prepare($sql); + $sth->execute( $arch_id->stable_id ); + my ($version) = $sth->fetchrow_array; + $sth->finish; + + if ($version) { + $arch_id->current_version($version); + return 1; + } + + # didn't find a current version + return 0; +} ## end sub lookup_current + + +# infer type from stable ID format +sub _resolve_type { + my $self = shift; + my $arch_id = shift; + + my $stable_id = $arch_id->stable_id(); + my $id_type; + + # first, try to infer type from stable ID format + # + # Anopheles IDs + if ($stable_id =~ /^AGAP.*/) { + if ($stable_id =~ /.*-RA/) { + $id_type = "Transcript"; + } elsif ($stable_id =~ /.*-PA/) { + $id_type = "Translation"; + } else { + $id_type = "Gene"; + } + + # standard Ensembl IDs + } elsif ($stable_id =~ /.*G\d+$/) { + $id_type = "Gene"; + } elsif ($stable_id =~ /.*T\d+$/) { + $id_type = "Transcript"; + } elsif ($stable_id =~ /.*P\d+$/) { + $id_type = "Translation"; + } elsif ($stable_id =~ /.*E\d+$/) { + $id_type = "Exon"; + + # if guessing fails, look in db + } else { + my $sql = qq( + SELECT type from stable_id_event + WHERE old_stable_id = ? + OR new_stable_id = ? + ); + my $sth = $self->prepare($sql); + $sth->execute($stable_id, $stable_id); + ($id_type) = $sth->fetchrow_array; + $sth->finish; + } + + warning("Couldn't resolve stable ID type.") unless ($id_type); + + $arch_id->type($id_type); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,234 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AssemblyAdaptor - Retrieves meta information +related to the assembly, density features/counts per chromosome or if none +provided, all top level seq regions + + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::AssemblyAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::DBSQL::MetaContainer; +use Bio::EnsEMBL::Attribute; + +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for + the database this assembly info adaptor is using. + Example : my $aia = new Bio::EnsEMBL::AssemblyAdaptor($dbadaptor); + Description: Creates a new AssemblyAdaptor object + Returntype : Bio::EnsEMBL::DBSQL::AssemblyAdaptor + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBAdaptor + Status : Stable + +=cut + +sub new { + my($class, $dbadaptor) = @_; + + my $self = $class->SUPER::new($dbadaptor); + + return $self; +} + +=head2 fetch_info + + Description: Returns a hash containing information about the assembly + stored in the meta table, such as assembly name, date etc., + a reference to array of top level seq_region names and a + reference to array of all coordinate system versions found + Returntype : reference to a hash with assembly info key and value pairs + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub fetch_info { + my $self = shift; + + #fetch assembly information stored in the meta table + + my $meta_container = $self->db()->get_adaptor('MetaContainer'); + + my @meta_keys = qw(assembly.name assembly.date genebuild.start_date + genebuild.method genebuild.initial_release_date genebuild.last_geneset_update); + my %assembly_info; + + foreach my $meta_key (@meta_keys) { + my @values = @{ $meta_container->list_value_by_key($meta_key) }; + if (@values) { + $assembly_info{$meta_key} = $values[0]; + } + } + + my $schema_build = $self->db()->_get_schema_build(); + if ($schema_build) { + $assembly_info{'schema_build'} = $schema_build; + } + + #fetch available coordinate systems + + my $csa = $self->db()->get_adaptor('CoordSystem'); + my %versions; + foreach my $cs (@{$csa->fetch_all()}) { + $versions{$cs->version()} = 1; + } + my @coord_system_versions = keys %versions; + + $assembly_info{'coord_system_versions'} = \@coord_system_versions; + + #fetch top level seq_region names + + my $sa = $self->db()->get_adaptor('Slice'); + + my $slices = $sa->fetch_all('toplevel'); + + my @top_level_seq_region_names; + + if ($slices) { + @top_level_seq_region_names = sort(map { $_->seq_region_name() } @$slices); + } + + $assembly_info{'top_level_seq_region_names'} = \@top_level_seq_region_names; + + return \%assembly_info; +} + + +=head2 fetch_stats + + Arg [1] : string $seq_region_name (optional) + The name of the toplevel seq_region for which statistics should be fetched + + Description: Returns a reference to a hash containing density features/ density related + seq_region attributes for a toplevel seq_region provided or if none + provided - all top level seq regions + Returntype : hashref + Exceptions : throw if the toplevel slice with seq_region_name provided does not exist + Caller : general + Status : Stable + +=cut + + +sub fetch_stats { + my $self = shift; + + my $seq_region_name = shift; + + my @slices; + + my %assembly_stats; + + my $sa = $self->db()->get_adaptor('Slice'); + + if ($seq_region_name) { + my $slice = $sa->fetch_by_region('toplevel',$seq_region_name); + if (!$slice) { + throw("Top level slice $seq_region_name not found"); + } + push(@slices, $slice); + $assembly_stats{'seq_region_name'} = $seq_region_name; + } else { + @slices = @{$sa->fetch_all('toplevel')}; + } + + my @density_types = qw(genedensity knowngenedensity snpdensity percentgc); + + my @attrib_types = qw(GeneNo% SNPCount); + + my $aa = $self->db()->get_adaptor('Attribute'); + + my $dfa = $self->db()->get_adaptor('DensityFeature'); + + #used to calculate the average density value for density types represented as ratios + + my %density_ft_count = (); + + foreach my $slice (@slices) { + + $assembly_stats{'Length (bps)'} += $slice->length(); + + foreach my $density_type (@density_types) { + + my $density_features = $dfa->fetch_all_by_Slice($slice,$density_type); + + foreach my $density_feature (@$density_features) { + + if ($density_feature->density_type()->value_type() eq 'ratio') { + $density_ft_count{$density_feature->density_type()->analysis()->display_label()} += 1; + } + + $assembly_stats{$density_feature->density_type()->analysis()->display_label()} += $density_feature->density_value(); + } + } + + foreach my $attrib_type (@attrib_types) { + + my $attribs = $aa->fetch_all_by_Slice($slice,$attrib_type); + + foreach my $attrib (@$attribs) { + $assembly_stats{$attrib->description()} += $attrib->value(); + } + } + } + + foreach my $density_analysis (keys %density_ft_count) { + + if ($density_ft_count{$density_analysis} > 1) { + $assembly_stats{$density_analysis} /= $density_ft_count{$density_analysis}; + $assembly_stats{$density_analysis} = sprintf "%.2f", $assembly_stats{$density_analysis}; + $assembly_stats{$density_analysis} .= '%'; + } + } + + return \%assembly_stats; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyExceptionFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyExceptionFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,618 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AssemblyExceptionFeatureAdaptor + +=head1 SYNOPSIS + + my $assembly_exception_feature_adaptor = + $database_adaptor->get_AssemblyExceptionFeatureAdaptor(); + + @assembly_exception_features = + $assembly_exception_feature_adaptor->fetch_all_by_Slice($slice); + +=head1 DESCRIPTION + +Assembly Exception Feature Adaptor - database access for assembly +exception features. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::AssemblyExceptionFeatureAdaptor; + +use strict; +use warnings; +no warnings qw(uninitialized); + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::AssemblyExceptionFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Cache; + +our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +# set the number of slices you'd like to cache +our $ASSEMBLY_EXCEPTION_FEATURE_CACHE_SIZE = 100; + +=head2 new + + Arg [1] : list of args @args + Superclass constructor arguments + Example : none + Description: Constructor which just initializes internal cache structures + Returntype : Bio::EnsEMBL::DBSQL::AssemblyExceptionFeatureAdaptor + Exceptions : none + Caller : implementing subclass constructors + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + # initialize an LRU cache for slices + my %cache; + tie(%cache, 'Bio::EnsEMBL::Utils::Cache', + $ASSEMBLY_EXCEPTION_FEATURE_CACHE_SIZE); + + $self->{'_aexc_slice_cache'} = \%cache; + + return $self; +} + +=head2 fetch_all + + Arg [1] : none + Example : my @axfs = @{$axfa->fetch_all()}; + Description: Retrieves all assembly exception features which are in the + database and builds internal caches of the features. + Returntype : reference to list of Bio::EnsEMBL::AssemblyExceptionFeatures + Exceptions : none + Caller : fetch_by_dbID, fetch_by_Slice + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + + # this is the "global" cache for all assembly exception features in the db + if(defined($self->{'_aexc_cache'})) { + return $self->{'_aexc_cache'}; + } + + my $statement = qq( + SELECT ae.assembly_exception_id, + ae.seq_region_id, + ae.seq_region_start, + ae.seq_region_end, + ae.exc_type, + ae.exc_seq_region_id, + ae.exc_seq_region_start, + ae.exc_seq_region_end, + ae.ori + FROM assembly_exception ae, + coord_system cs, + seq_region sr + WHERE cs.species_id = ? + AND sr.coord_system_id = cs.coord_system_id + AND sr.seq_region_id = ae.seq_region_id); + + my $sth = $self->prepare($statement); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + + $sth->execute(); + + my ($ax_id, $sr_id, $sr_start, $sr_end, + $x_type, $x_sr_id, $x_sr_start, $x_sr_end, $ori); + + $sth->bind_columns(\$ax_id, \$sr_id, \$sr_start, \$sr_end, + \$x_type, \$x_sr_id, \$x_sr_start, \$x_sr_end, \$ori); + + my @features; + my $sa = $self->db()->get_SliceAdaptor(); + + $self->{'_aexc_dbID_cache'} = {}; + + while($sth->fetch()) { + my $slice = $sa->fetch_by_seq_region_id($sr_id); + my $x_slice = $sa->fetch_by_seq_region_id($x_sr_id); + + # each row creates TWO features, each of which has alternate_slice + # pointing to the "other" one + + + my $a = Bio::EnsEMBL::AssemblyExceptionFeature->new + ('-dbID' => $ax_id, + '-start' => $sr_start, + '-end' => $sr_end, + '-strand' => 1, + '-adaptor' => $self, + '-slice' => $slice, + '-alternate_slice' => $x_slice->sub_Slice($x_sr_start, $x_sr_end), + '-type' => $x_type); + + push @features, $a; + $self->{'_aexc_dbID_cache'}->{$ax_id} = $a; + + push @features, Bio::EnsEMBL::AssemblyExceptionFeature->new + ('-dbID' => $ax_id, + '-start' => $x_sr_start, + '-end' => $x_sr_end, + '-strand' => 1, + '-adaptor' => $self, + '-slice' => $x_slice, + '-alternate_slice' => $slice->sub_Slice($sr_start, $sr_end), + '-type' => "$x_type REF" ); + } + + $sth->finish(); + + $self->{'_aexc_cache'} = \@features; + + return \@features; +} + + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + Example : my $axf = $axfa->fetch_by_dbID(3); + Description: Retrieves a single assembly exception feature via its internal + identifier. Note that this only retrieves one of the two + assembly exception features which are represented by a single + row in the assembly_exception table. + Returntype : Bio::EnsEMBL::AssemblyExceptionFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + if(!exists($self->{'_aexc_dbID_cache'})) { + # force loading of cache + $self->fetch_all(); + } + + return $self->{'_aexc_dbID_cache'}->{$dbID}; +} + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + Example : my @axfs = @{$axfa->fetch_all_by_Slice($slice)}; + Description: Retrieves all assembly exception features which overlap the + provided slice. The returned features will be in coordinate + system of the slice. + Returntype : reference to list of Bio::EnsEMBL::AssemblyException features + Exceptions : none + Caller : Feature::get_all_alt_locations, general + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my $self = shift; + my $slice = shift; + + my $key= uc($slice->name()); + + # return features from the slice cache if present + if(exists($self->{'_aexc_slice_cache'}->{$key})) { + return $self->{'_aexc_slice_cache'}->{$key}; + } + + my $all_features = $self->fetch_all(); + + my $mcc = $self->db()->get_MetaCoordContainer(); + my $css = $mcc->fetch_all_CoordSystems_by_feature_type('assembly_exception'); + + my @features; + + my $ma = $self->db()->get_AssemblyMapperAdaptor(); + + foreach my $cs (@$css) { + my $mapper; + if($cs->equals($slice->coord_system)) { + $mapper = undef; + } else { + $mapper = $ma->fetch_by_CoordSystems($cs,$slice->coord_system()); + } + + push @features, @{ $self->_remap($all_features, $mapper, $slice) }; + } + + $self->{'_aexc_slice_cache'}->{$key} = \@features; + + return \@features; +} + + +# +# Given a list of features checks if they are in the correct coord system +# by looking at the first features slice. If they are not then they are +# converted and placed on the slice. +# +# Note that this is a re-implementation of a method with the same name in +# BaseFeatureAdaptor, and in contrast to the latter which maps features in +# place, this method returns a remapped copy of each feature. The reason for +# this is to get around conflicts with caching. +# +sub _remap { + my ($self, $features, $mapper, $slice) = @_; + + # check if any remapping is actually needed + if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') || + $features->[0]->slice == $slice)) { + return $features; + } + + # remapping has not been done, we have to do our own conversion from + # to slice coords + + my @out; + + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + my $slice_cs = $slice->coord_system(); + + my ($seq_region, $start, $end, $strand); + + my $slice_seq_region = $slice->seq_region_name(); + + foreach my $f (@$features) { + # since feats were obtained in contig coords, attached seq is a contig + my $fslice = $f->slice(); + if(!$fslice) { + throw("Feature does not have attached slice.\n"); + } + my $fseq_region = $fslice->seq_region_name(); + my $fcs = $fslice->coord_system(); + + if(!$slice_cs->equals($fcs)) { + # slice of feature in different coord system, mapping required + ($seq_region, $start, $end, $strand) = + $mapper->fastmap($fseq_region,$f->start(),$f->end(),$f->strand(),$fcs); + + # undefined start means gap + next if(!defined $start); + } else { + $start = $f->start(); + $end = $f->end(); + $strand = $f->strand(); + $seq_region = $f->slice->seq_region_name(); + } + + # maps to region outside desired area + next if ($start > $slice_end) || ($end < $slice_start) || + ($slice_seq_region ne $seq_region); + + # create new copies of successfully mapped feaatures with shifted start, + # end and strand + my ($new_start, $new_end); + if($slice_strand == -1) { + $new_start = $slice_end - $end + 1; + $new_end = $slice_end - $start + 1; + } else { + $new_start = $start - $slice_start + 1; + $new_end = $end - $slice_start + 1; + } + + push @out, Bio::EnsEMBL::AssemblyExceptionFeature->new( + '-dbID' => $f->dbID, + '-start' => $new_start, + '-end' => $new_end, + '-strand' => $strand * $slice_strand, + '-adaptor' => $self, + '-slice' => $slice, + '-alternate_slice' => $f->alternate_slice, + '-type' => $f->type, + ); + } + + return \@out; +} + +=head2 store + + Arg[1] : Bio::EnsEMBL::AssemblyException $asx + Arg[2] : Bio::EnsEMBL::AssemblyException $asx2 + + Example : $asx = Bio::EnsEMBL::AssemblyExceptionFeature->new(...) + $asx2 = Bio::EnsEMBL::AssemblyExceptionFeature->new(...) + $asx_seq_region_id = $asx_adaptor->store($asx); + Description: This stores a assembly exception feature in the + assembly_exception table and returns the assembly_exception_id. + Needs 2 features: one pointing to the Assembly_exception, and the + other pointing to the region in the reference that is being mapped to + Will check that start, end and type are defined, and the alternate + slice is present as well. + ReturnType: int + Exceptions: throw if assembly exception not defined (needs start, end, + type and alternate_slice) of if $asx not a Bio::EnsEMBL::AssemblyException + Caller: general + Status: Stable + +=cut + +sub store{ + my $self = shift; + my $asx = shift; + my $asx2 = shift; + + + if (! $asx->isa('Bio::EnsEMBL::AssemblyExceptionFeature')){ + throw("$asx is not a Ensembl assemlby exception -- not stored"); + } + #if already present, return ID in the database + my $db = $self->db(); + if ($asx->is_stored($db)){ + return $asx->dbID(); + } + #do some checkings for the object + #at the moment, the orientation is always 1 + if (! $asx->start || ! $asx->end ){ + throw("Assembly exception does not have coordinates"); + } + if ($asx->type !~ /PAR|HAP|PATCH_NOVEL|PATCH_FIX/){ + throw("Only types of assembly exception features valid are PAR, HAP, PATCH_FIX or PATCH_NOVEL"); + } + if ( !($asx->alternate_slice->isa('Bio::EnsEMBL::Slice')) ){ + throw("Alternate slice should be a Bio::EnsEMBL::Slice"); + } + #now check the other Assembly exception feature, the one pointing to the REF + # region + if (!$asx2->isa('Bio::EnsEMBL::AssemblyExceptionFeature')){ + throw("$asx2 is not a Ensembl assemlby exception -- not stored"); + } + if (! $asx2->start || ! $asx2->end ){ + throw("Assembly exception does not have coordinates"); + } + if ($asx2->type !~ /HAP REF|PAR REF|PATCH_NOVEL REF|PATCH_FIX REF/){ + throw("$asx2 should have type of assembly exception features HAP REF, PAR REF, PATCH_FIX REF or PATCH_NOVEL REF"); + } + if (! ($asx2->alternate_slice->isa('Bio::EnsEMBL::Slice')) ){ + throw("Alternate slice should be a Bio::EnsEMBL::Slice"); + } + #finally check that both features are pointing to each other slice + if ($asx->slice != $asx2->alternate_slice || $asx->alternate_slice != $asx2->slice){ + throw("Slice and alternate slice in both features are not pointing to each other"); + } + #prepare the SQL + my $asx_sql = q{ + INSERT INTO assembly_exception( seq_region_id, seq_region_start, + seq_region_end, + exc_type, exc_seq_region_id, + exc_seq_region_start, exc_seq_region_end, + ori) + VALUES (?, ?, ?, ?, ?, ?, ?, 1) + }; + + my $asx_st = $self->prepare($asx_sql); + my $asx_id = undef; + my $asx_seq_region_id; + my $asx2_seq_region_id; + my $original = $asx; + my $original2 = $asx2; + #check all feature information + ($asx, $asx_seq_region_id) = $self->_pre_store($asx); + ($asx2, $asx2_seq_region_id) = $self->_pre_store($asx2); + + #and store it + $asx_st->bind_param(1, $asx_seq_region_id, SQL_INTEGER); + $asx_st->bind_param(2, $asx->start(), SQL_INTEGER); + $asx_st->bind_param(3, $asx->end(), SQL_INTEGER); + $asx_st->bind_param(4, $asx->type(), SQL_VARCHAR); + $asx_st->bind_param(5, $asx2_seq_region_id, SQL_INTEGER); + $asx_st->bind_param(6, $asx2->start(), SQL_INTEGER); + $asx_st->bind_param(7, $asx2->end(), SQL_INTEGER); + + $asx_st->execute(); + $asx_id = $asx_st->{'mysql_insertid'}; + + #finally, update the dbID and adaptor of the asx and asx2 + $original->adaptor($self); + $original->dbID($asx_id); + $original2->adaptor($self); + $original2->dbID($asx_id); + #and finally update dbID cache with new assembly exception + $self->{'_aexc_dbID_cache'}->{$asx_id} = $original; + #and update the other caches as well + push @{$self->{'_aexc_slice_cache'}->{uc($asx->slice->name)}},$original, $original2; + push @{$self->{'_aexc_cache'}}, $original, $original2; + + return $asx_id; +} + +# +# Helper function containing some common feature storing functionality +# +# Given a Feature this will return a copy (or the same feature if no changes +# to the feature are needed) of the feature which is relative to the start +# of the seq_region it is on. The seq_region_id of the seq_region it is on +# is also returned. +# +# This method will also ensure that the database knows which coordinate +# systems that this feature is stored in. +# Since this adaptor doesn't inherit from BaseFeatureAdaptor, we need to copy +# the code +# + +sub _pre_store { + my $self = shift; + my $feature = shift; + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Expected Feature argument.'); + } + + $self->_check_start_end_strand($feature->start(),$feature->end(), + $feature->strand()); + + + my $db = $self->db(); + + my $slice_adaptor = $db->get_SliceAdaptor(); + my $slice = $feature->slice(); + + if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw('Feature must be attached to Slice to be stored.'); + } + + # make sure feature coords are relative to start of entire seq_region + + if($slice->start != 1 || $slice->strand != 1) { + #move feature onto a slice of the entire seq_region + $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(), + $slice->seq_region_name(), + undef, #start + undef, #end + undef, #strand + $slice->coord_system->version()); + + $feature = $feature->transfer($slice); + + if(!$feature) { + throw('Could not transfer Feature to slice of ' . + 'entire seq_region prior to storing'); + } + } + + # Ensure this type of feature is known to be stored in this coord system. + my $cs = $slice->coord_system; + + my $mcc = $db->get_MetaCoordContainer(); + + $mcc->add_feature_type($cs, 'assembly_exception', $feature->length); + + my $seq_region_id = $slice_adaptor->get_seq_region_id($slice); + + if(!$seq_region_id) { + throw('Feature is associated with seq_region which is not in this DB.'); + } + + return ($feature, $seq_region_id); +} + +# +# helper function used to validate start/end/strand and +# hstart/hend/hstrand etc. +# +sub _check_start_end_strand { + my $self = shift; + my $start = shift; + my $end = shift; + my $strand = shift; + + # + # Make sure that the start, end, strand are valid + # + if(int($start) != $start) { + throw("Invalid Feature start [$start]. Must be integer."); + } + if(int($end) != $end) { + throw("Invalid Feature end [$end]. Must be integer."); + } + if(int($strand) != $strand || $strand < -1 || $strand > 1) { + throw("Invalid Feature strand [$strand]. Must be -1, 0 or 1."); + } + if($end < $start) { + throw("Invalid Feature start/end [$start/$end]. Start must be less " . + "than or equal to end."); + } + + return 1; +} + +=head2 remove + + Arg [1] : $asx Bio::EnsEMBL::AssemblyFeatureException + Example : $asx_adaptor->remove($asx); + Description: This removes a assembly exception feature from the database. + Returntype : none + Exceptions : thrown if $asx arg does not implement dbID(), or if + $asx->dbID is not a true value + Caller : general + Status : Stable + +=cut + +#again, this method is generic in BaseFeatureAdaptor, but since this class +#is not inheriting, need to copy&paste +sub remove { + my ($self, $feature) = @_; + + if(!$feature || !ref($feature) || !$feature->isa('Bio::EnsEMBL::AssemblyExceptionFeature')) { + throw('AssemblyExceptionFeature argument is required'); + } + + if(!$feature->is_stored($self->db)) { + throw("This feature is not stored in this database"); + } + + my $asx_id = $feature->dbID(); + my $key = uc($feature->slice->name); + my $sth = $self->prepare("DELETE FROM assembly_exception WHERE assembly_exception_id = ?"); + $sth->bind_param(1,$feature->dbID,SQL_INTEGER); + $sth->execute(); + + #and clear cache + #and finally update dbID cache + delete $self->{'_aexc_dbID_cache'}->{$asx_id}; + #and remove from cache feature + my @features; + foreach my $asx (@{$self->{'_aexc_slice_cache'}->{$key}}){ + if ($asx->dbID != $asx_id){ + push @features, $asx; + } + } + $self->{'_aexc_slice_cache'}->{$key} = \@features; + @features = (); + foreach my $asx (@{$self->{'_aexc_cache'}}){ + if ($asx->dbID != $asx_id){ + push @features, $asx; + } + } + $self->{'_aexc_cache'} = \@features; + +#unset the feature dbID ad adaptor + $feature->dbID(undef); + $feature->adaptor(undef); + + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1841 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "assemblymapper" ); + + $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "coordsystem" ); + + my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' ); + my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' ); + my $ctg_cs = $csa->fetch_by_name('contig'); + my $clone_cs = $csa->fetch_by_name('clone'); + + my $chr_ctg_mapper = + $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs ); + + my $ncbi33_ncbi34_mapper = + $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 ); + + my $ctg_clone_mapper = + $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs ); + + +=head1 DESCRIPTION + +Adaptor for handling Assembly mappers. This is a I class. +ie: There is only one per database (C). + +This is used to retrieve mappers between any two coordinate systems +whose makeup is described by the assembly table. Currently one step +(explicit) and two step (implicit) pairwise mapping is supported. In +one-step mapping an explicit relationship between the coordinate systems +is defined in the assembly table. In two-step 'chained' mapping no +explicit mapping is present but the coordinate systems must share a +common mapping to an intermediate coordinate system. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::AssemblyMapper; +use Bio::EnsEMBL::ChainedAssemblyMapper; +use Bio::EnsEMBL::TopLevelAssemblyMapper; + +use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +#use Bio::EnsEMBL::Utils::Exception qw(deprecate throw); +use Bio::EnsEMBL::Utils::SeqRegionCache; + +use integer; #do proper arithmetic bitshifts + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6 + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for + the database this assembly mapper is using. + Example : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor); + Description: Creates a new AssemblyMapperAdaptor object + Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBAdaptor + Status : Stable + +=cut + +sub new { + my($class, $dbadaptor) = @_; + + my $self = $class->SUPER::new($dbadaptor); + + $self->{'_asm_mapper_cache'} = {}; + + # use a shared cache (for this database) that contains info about + # seq regions + my $seq_region_cache = $self->db->get_SeqRegionCache(); + $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'}; + $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'}; + + return $self; +} + + + +=head2 cache_seq_ids_with_mult_assemblys + + Example : $self->adaptor->cache_seq_ids_with_mult_assemblys(); + Description: Creates a hash of the component seq region ids that + map to more than one assembly from the assembly table. + Retruntype : none + Exceptions : none + Caller : AssemblyMapper, ChainedAssemblyMapper + Status : At Risk + +=cut + +sub cache_seq_ids_with_mult_assemblys{ + my $self = shift; + my %multis; + + return if (defined($self->{'multi_seq_ids'})); + + $self->{'multi_seq_ids'} = {}; + + my $sql = qq( + SELECT sra.seq_region_id + FROM seq_region_attrib sra, + attrib_type at, + seq_region sr, + coord_system cs + WHERE sra.attrib_type_id = at.attrib_type_id + AND code = "MultAssem" + AND sra.seq_region_id = sr.seq_region_id + AND sr.coord_system_id = cs.coord_system_id + AND cs.species_id = ?); + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + + $sth->execute(); + + my ($seq_region_id); + + $sth->bind_columns(\$seq_region_id); + + while($sth->fetch()) { + $self->{'multi_seq_ids'}->{$seq_region_id} = 1; + } + $sth->finish; +} + + +=head2 fetch_by_CoordSystems + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs1 + One of the coordinate systems to retrieve the mapper + between + Arg [2] : Bio::EnsEMBL::CoordSystem $cs2 + The other coordinate system to map between + Description: Retrieves an Assembly mapper for two coordinate + systems whose relationship is described in the + assembly table. + + The ordering of the coodinate systems is arbitrary. + The following two statements are equivalent: + $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2); + $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1); + Returntype : Bio::EnsEMBL::AssemblyMapper + Exceptions : wrong argument types + Caller : general + Status : Stable + +=cut + +sub fetch_by_CoordSystems { + my $self = shift; + my $cs1 = shift; + my $cs2 = shift; + + if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) { + throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem."); + } + if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) { + throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem."); + } + +# if($cs1->equals($cs2)) { +# throw("Cannot create mapper between same coord systems: " . +# $cs1->name . " " . $cs1->version); +# } + + if($cs1->is_top_level()) { + return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2); + } + if($cs2->is_top_level()) { + return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1); + } + + my $csa = $self->db->get_CoordSystemAdaptor(); + + #retrieve the shortest possible mapping path between these systems + my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)}; + + if(!@mapping_path) { + + # It is perfectly fine not to have a mapping. No warning needed really + # Just check the return code!! + +# warning( +# "There is no mapping defined between these coord systems:\n" . +# $cs1->name() . " " . $cs1->version() . " and " . $cs2->name() . " " . +# $cs2->version() +# ); + return undef; + } + + my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path)); + + my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key}; + + return $asm_mapper if($asm_mapper); + + if(@mapping_path == 1) { + throw("Incorrect mapping path defined in meta table. " . + "0 step mapping encountered between:\n" . + $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " . + $cs2->version()); + } + + if(@mapping_path == 2) { + #1 step regular mapping + $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path); + +# If you want multiple pieces on two seqRegions to map to each other +# you need to make an assembly.mapping entry that is seperated with a # +# instead of an |. + + $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper; + return $asm_mapper; + } + + if(@mapping_path == 3) { + #two step chained mapping + $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path); + #in multi-step mapping it is possible get requests with the + #coordinate system ordering reversed since both mappings directions + #cache on both orderings just in case + #e.g. chr <-> contig <-> clone and clone <-> contig <-> chr + + $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper; + $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path))); + $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper; + return $asm_mapper; + } + + throw("Only 1 and 2 step coordinate system mapping is currently\n" . + "supported. Mapping between " . + $cs1->name() . " " . $cs1->version() . " and " . + $cs2->name() . " " . $cs2->version() . + " requires ". (scalar(@mapping_path)-1) . " steps."); +} + + + +=head2 register_assembled + + Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper + A valid AssemblyMapper object + Arg [2] : integer $asm_seq_region + The dbID of the seq_region to be registered + Arg [3] : int $asm_start + The start of the region to be registered + Arg [4] : int $asm_end + The end of the region to be registered + Description: Declares an assembled region to the AssemblyMapper. + This extracts the relevant data from the assembly + table and stores it in Mapper internal to the $asm_mapper. + It therefore must be called before any mapping is + attempted on that region. Otherwise only gaps will + be returned. Note that the AssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + + +sub register_assembled { + my $self = shift; + my $asm_mapper = shift; + my $asm_seq_region = shift; + my $asm_start = shift; + my $asm_end = shift; + + if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) { + throw("Bio::EnsEMBL::AssemblyMapper argument expected"); + } + + throw("asm_seq_region argument expected") if(!defined($asm_seq_region)); + throw("asm_start argument expected") if(!defined($asm_start)); + throw("asm_end argument expected") if(!defined($asm_end)); + + my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID(); + my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID(); + + #split up the region to be registered into fixed chunks + #this allows us to keep track of regions that have already been + #registered and also works under the assumption that if a small region + #is requested it is likely that other requests will be made in the + #vicinity (the minimum size registered the chunksize (2^chunkfactor) + + my @chunk_regions; + #determine span of chunks + #bitwise shift right is fast and easy integer division + + my($start_chunk, $end_chunk); + + $start_chunk = $asm_start >> $CHUNKFACTOR; + $end_chunk = $asm_end >> $CHUNKFACTOR; + + # inserts have start = end + 1, on boundary condition start_chunk + # could be less than end chunk + if($asm_start == $asm_end + 1) { + ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk); + } + + #find regions of continuous unregistered chunks + my $i; + my ($begin_chunk_region,$end_chunk_region); + for ($i = $start_chunk; $i <= $end_chunk; $i++) { + if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) { + if(defined($begin_chunk_region)) { + #this is the end of an unregistered region. + my $region = [($begin_chunk_region << $CHUNKFACTOR), + (($end_chunk_region+1) << $CHUNKFACTOR)-1]; + push @chunk_regions, $region; + $begin_chunk_region = $end_chunk_region = undef; + } + } else { + $begin_chunk_region = $i if(!defined($begin_chunk_region)); + $end_chunk_region = $i+1; + $asm_mapper->register_assembled($asm_seq_region,$i); + } + } + + #the last part may have been an unregistered region too + if(defined($begin_chunk_region)) { + my $region = [($begin_chunk_region << $CHUNKFACTOR), + (($end_chunk_region+1) << $CHUNKFACTOR) -1]; + push @chunk_regions, $region; + } + + return if(!@chunk_regions); + + # keep the Mapper to a reasonable size + if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) { + $asm_mapper->flush(); + #we now have to go and register the entire requested region since we + #just flushed everything + + @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR) + , (($end_chunk+1) << $CHUNKFACTOR)-1 ] ); + + for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) { + $asm_mapper->register_assembled( $asm_seq_region, $i ); + } + } + +# my $asm_seq_region_id = +# $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id); + + # Retrieve the description of how the assembled region is made from + # component regions for each of the continuous blocks of unregistered, + # chunked regions + + my $q = qq{ + SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr.name, + sr.length, + asm.ori, + asm.asm_start, + asm.asm_end + FROM + assembly asm, seq_region sr + WHERE + asm.asm_seq_region_id = ? AND + ? <= asm.asm_end AND + ? >= asm.asm_start AND + asm.cmp_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? + }; + + my $sth = $self->prepare($q); + + foreach my $region (@chunk_regions) { + my($region_start, $region_end) = @$region; + $sth->bind_param(1,$asm_seq_region,SQL_INTEGER); + $sth->bind_param(2,$region_start,SQL_INTEGER); + $sth->bind_param(3,$region_end,SQL_INTEGER); + $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER); + + $sth->execute(); + + my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori, + $asm_start, $asm_end, $cmp_seq_region_length); + + $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id, + \$cmp_seq_region, \$cmp_seq_region_length, \$ori, + \$asm_start, \$asm_end); + + # + # Load the unregistered regions of the mapper + # + while($sth->fetch()) { + next if($asm_mapper->have_registered_component($cmp_seq_region_id) + and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id})); + $asm_mapper->register_component($cmp_seq_region_id); + $asm_mapper->mapper->add_map_coordinates( + $asm_seq_region, $asm_start, $asm_end, + $ori, + $cmp_seq_region_id, $cmp_start, $cmp_end); + + my $arr = [ $cmp_seq_region_id, $cmp_seq_region, + $cmp_cs_id, $cmp_seq_region_length ]; + + $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr; + } + } + + $sth->finish(); +} + + + +sub _seq_region_name_to_id { + my $self = shift; + my $sr_name = shift; + my $cs_id = shift; + + ($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' . + 'are required'); + + my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"}; + if( $arr ) { + return $arr->[0]; + } + + # Get the seq_region_id via the name. This would be quicker if we just + # used internal ids instead but stored but then we lose the ability + # the transform accross databases with different internal ids + + my $sth = $self->prepare("SELECT seq_region_id, length " . + "FROM seq_region " . + "WHERE name = ? AND coord_system_id = ?"); + + $sth->bind_param(1,$sr_name,SQL_VARCHAR); + $sth->bind_param(2,$cs_id,SQL_INTEGER); + $sth->execute(); + + if(!$sth->rows() == 1) { + throw("Ambiguous or non-existant seq_region [$sr_name] " . + "in coord system $cs_id"); + } + + my ($sr_id, $sr_length) = $sth->fetchrow_array(); + $sth->finish(); + + $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ]; + + $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$sr_id"} = $arr; + + return $sr_id; +} + +sub _seq_region_id_to_name { + my $self = shift; + my $sr_id = shift; + + ($sr_id) || throw('seq_region_is required'); + + my $arr = $self->{'sr_id_cache'}->{"$sr_id"}; + if( $arr ) { + return $arr->[1]; + } + + # Get the seq_region name via the id. This would be quicker if we just + # used internal ids instead but stored but then we lose the ability + # the transform accross databases with different internal ids + + my $sth = $self->prepare("SELECT name, length ,coord_system_id " . + "FROM seq_region " . + "WHERE seq_region_id = ? "); + + $sth->bind_param(1,$sr_id,SQL_INTEGER); + $sth->execute(); + + if(!$sth->rows() == 1) { + throw("non-existant seq_region [$sr_id]"); + } + + my ($sr_name, $sr_length, $cs_id) = $sth->fetchrow_array(); + $sth->finish(); + + $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ]; + + $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$sr_id"} = $arr; + + return $sr_name; +} + + +=head2 register_component + + Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper + A valid AssemblyMapper object + Arg [2] : integer $cmp_seq_region + The dbID of the seq_region to be registered + Description: Declares a component region to the AssemblyMapper. + This extracts the relevant data from the assembly + table and stores it in Mapper internal to the $asm_mapper. + It therefore must be called before any mapping is + attempted on that region. Otherwise only gaps will + be returned. Note that the AssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + +sub register_component { + my $self = shift; + my $asm_mapper = shift; + my $cmp_seq_region = shift; + + if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) { + throw("Bio::EnsEMBL::AssemblyMapper argument expected"); + } + + if(!defined($cmp_seq_region)) { + throw("cmp_seq_region argument expected"); + } + + my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID(); + my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID(); + + #do nothing if this region is already registered or special case + return if($asm_mapper->have_registered_component($cmp_seq_region) + and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region})); + +# my $cmp_seq_region_id = +# $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id); + + # Determine what part of the assembled region this component region makes up + + my $q = qq{ + SELECT + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + sr.name, + sr.length + FROM + assembly asm, seq_region sr + WHERE + asm.cmp_seq_region_id = ? AND + asm.asm_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? + }; + + my $sth = $self->prepare($q); + $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER); + $sth->bind_param(2,$asm_cs_id,SQL_INTEGER); + $sth->execute(); + + if($sth->rows() == 0) { + #this component is not used in the assembled part i.e. gap + $asm_mapper->register_component($cmp_seq_region); + return; + } + + #we do not currently support components mapping to multiple assembled + # make sure that you've got the correct mapping in the meta-table : + # chromosome:EquCab2#contig ( use'#' for multiple mappings ) + # chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings ) + # + if($sth->rows() != 1) { + throw("Multiple assembled regions for single " . + "component region cmp_seq_region_id=[$cmp_seq_region]\n". + "Remember that multiple mappings use the \#-operaator". + " in the meta-table (i.e. chromosome:EquCab2\#contig\n"); + } + + my ($asm_start, $asm_end, $asm_seq_region_id, + $asm_seq_region, $asm_seq_region_length) = $sth->fetchrow_array(); + + my $arr = [ $asm_seq_region_id, $asm_seq_region, + $asm_cs_id, $asm_seq_region_length ]; + + $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr; + + $sth->finish(); + + # Register the corresponding assembled region. This allows a us to + # register things in assembled chunks which allows us to: + # (1) Keep track of what assembled regions are registered + # (2) Use locality of reference (if they want something in same general + # region it will already be registered). + + $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end); +} + + + +=head2 register_chained + + Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper + The chained assembly mapper to register regions on + Arg [2] : string $from ('first' or 'last') + The direction we are registering from, and the name of the + internal mapper. + Arg [3] : string $seq_region_name + The name of the seqregion we are registering on + Arg [4] : listref $ranges + A list of ranges to register (in [$start,$end] tuples). + Arg [5] : (optional) $to_slice + Only register those on this Slice. + Description: Registers a set of ranges on a chained assembly mapper. + This function is at the heart of the chained mapping process. + It retrieves information from the assembly table and + dynamically constructs the mappings between two coordinate + systems which are 2 mapping steps apart. It does this by using + two internal mappers to load up a third mapper which is + actually used by the ChainedAssemblyMapper to perform the + mapping. + + This method must be called before any mapping is + attempted on regions of interest, otherwise only gaps will + be returned. Note that the ChainedAssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + + throw if the mapping between the coordinate systems cannot + be performed in two steps, which means there is an internal + error in the data in the meta table or in the code that creates + the mapping paths. + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + +sub register_chained { + my $self = shift; + my $casm_mapper = shift; + my $from = shift; + my $seq_region_id = shift; + my $ranges = shift; + my $to_slice = shift; + + my $to_seq_region_id; + if(defined($to_slice)){ + if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){ + return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice); + } + $to_seq_region_id = $to_slice->get_seq_region_id(); + if(!defined($to_seq_region_id)){ + die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n"; + } + } + + my ($start_name, $start_mid_mapper, $start_cs, $start_registry, + $end_name, $end_mid_mapper, $end_cs, $end_registry); + + if($from eq 'first') { + $start_name = 'first'; + $start_mid_mapper = $casm_mapper->first_middle_mapper(); + $start_cs = $casm_mapper->first_CoordSystem(); + $start_registry = $casm_mapper->first_registry(); + $end_mid_mapper = $casm_mapper->last_middle_mapper(); + $end_cs = $casm_mapper->last_CoordSystem(); + $end_registry = $casm_mapper->last_registry(); + $end_name = 'last'; + } elsif($from eq 'last') { + $start_name = 'last'; + $start_mid_mapper = $casm_mapper->last_middle_mapper(); + $start_cs = $casm_mapper->last_CoordSystem(); + $start_registry = $casm_mapper->last_registry(); + $end_mid_mapper = $casm_mapper->first_middle_mapper(); + $end_cs = $casm_mapper->first_CoordSystem(); + $end_registry = $casm_mapper->first_registry(); + $end_name = 'first'; + } else { + throw("Invalid from argument: [$from], must be 'first' or 'last'"); + } + + my $combined_mapper = $casm_mapper->first_last_mapper(); + my $mid_cs = $casm_mapper->middle_CoordSystem(); + my $mid_name = 'middle'; + my $csa = $self->db->get_CoordSystemAdaptor(); + + # Check for the simple case where the ChainedMapper is short + if( ! defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } + + + ############## + # obtain the first half of the mappings and load them into the start mapper + # + + #ascertain which is component and which is actually assembled coord system + my @path; + + # check for the simple case, where the ChainedMapper is short + if( defined $mid_cs ) { + @path = @{$csa->get_mapping_path($start_cs, $mid_cs)}; + } else { + @path = @{$csa->get_mapping_path( $start_cs, $end_cs )}; + } + + if(@path != 2 && defined( $path[1] )) { + my $path = join(',', map({$_->name .' '. $_->version} @path)); + my $len = scalar(@path) - 1; + throw("Unexpected mapping path between start and intermediate " . + "coord systems (". $start_cs->name . " " . $start_cs->version . + " and " . $mid_cs->name . " " . $mid_cs->version . ")." . + "\nExpected path length 1, got $len. " . + "(path=$path)"); + } + + my $sth; + my ($asm_cs,$cmp_cs); + $asm_cs = $path[0]; + $cmp_cs = $path[-1]; + + #the SQL varies depending on whether we are coming from assembled or + #component coordinate system + +my $asm2cmp = (<= asm.asm_start AND + asm.cmp_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? +ASMCMP + + +my $cmp2asm = (<= asm.cmp_start AND + asm.asm_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? +CMPASM + + my $asm2cmp_sth; + my $cmp2asm_sth; + if(defined($to_slice)){ + my $to_cs = $to_slice->coord_system; + if($asm_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id"); + } + elsif($cmp_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id"); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + else{ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + } + else{ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + + + + $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth; + + my $mid_cs_id; + + # check for the simple case where the ChainedMapper is short + if( defined $mid_cs ) { + $mid_cs_id = $mid_cs->dbID(); + } else { + $mid_cs_id = $end_cs->dbID(); + } + + my @mid_ranges; + my @start_ranges; + + #need to perform the query for each unregistered range + foreach my $range (@$ranges) { + my ($start, $end) = @$range; + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->bind_param(4,$mid_cs_id,SQL_INTEGER); + $sth->execute(); + + #load the start <-> mid mapper with the results and record the mid cs + #ranges we just added to the mapper + + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, + $ori, $start_start, $start_end); + + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, + \$start_end); + + while($sth->fetch()) { + + if( defined $mid_cs ) { + $start_mid_mapper->add_map_coordinates + ( + $seq_region_id,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } else { + if( $from eq "first" ) { + $combined_mapper->add_map_coordinates + ( + $seq_region_id,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } else { + $combined_mapper->add_map_coordinates + ( + $mid_seq_region_id, $mid_start, $mid_end, $ori, + $seq_region_id,$start_start, $start_end + ); + } + } + + #update sr_name cache + my $arr = [ $mid_seq_region_id, $mid_seq_region, + $mid_cs_id, $mid_length ]; + + $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr; + + push @mid_ranges,[$mid_seq_region_id,$mid_seq_region, + $mid_start,$mid_end]; + push @start_ranges, [ $seq_region_id, $start_start, $start_end ]; + + #the region that we actually register may actually be larger or smaller + #than the region that we wanted to register. + #register the intersection of the region so we do not end up doing + #extra work later + + if($start_start < $start || $start_end > $end) { + $start_registry->check_and_register($seq_region_id,$start_start, + $start_end); + } + } + $sth->finish(); + } + + # in the one step case, we load the mid ranges in the + # last_registry and we are done + if( ! defined $mid_cs ) { + for my $range ( @mid_ranges ) { + $end_registry->check_and_register( $range->[0], $range->[2], + $range->[3] ); + } + + # and thats it for the simple case ... + return; + } + + + ########### + # now the second half of the mapping + # perform another query and load the mid <-> end mapper using the mid cs + # ranges + # + + #ascertain which is component and which is actually assembled coord system + @path = @{$csa->get_mapping_path($mid_cs, $end_cs)}; + if(@path == 2 || ( @path == 3 && !defined $path[1])) { + + $asm_cs = $path[0]; + $cmp_cs = $path[-1]; + } else { + my $path = join(',', map({$_->name .' '. $_->version} @path)); + my $len = scalar(@path)-1; + throw("Unexpected mapping path between intermediate and last" . + "coord systems (". $mid_cs->name . " " . $mid_cs->version . + " and " . $end_cs->name . " " . $end_cs->version . ")." . + "\nExpected path length 1, got $len. " . + "(path=$path)"); + } + + if(defined($to_slice)){ + my $to_cs = $to_slice->coord_system; + if($asm_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id"); + } + elsif($cmp_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id"); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + else{ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + } + + $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth; + + my $end_cs_id = $end_cs->dbID(); + foreach my $mid_range (@mid_ranges) { + my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range; + $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->bind_param(4,$end_cs_id,SQL_INTEGER); + $sth->execute(); + + #load the end <-> mid mapper with the results and record the mid cs + #ranges we just added to the mapper + + my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length, + $ori, $mid_start, $mid_end); + + $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id, + \$end_seq_region, \$end_length, \$ori, \$mid_start, + \$mid_end); + + while($sth->fetch()) { + $end_mid_mapper->add_map_coordinates + ( + $end_seq_region_id, $end_start, $end_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + + #update sr_name cache + my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ]; + + $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr; + + #register this region on the end coord system + $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end); + } + $sth->finish(); + } + + ######### + # Now that both halves are loaded + # Do stepwise mapping using both of the loaded mappers to load + # the final start <-> end mapper + # + + _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper, + $combined_mapper, $start_name); + #all done! + return; +} + + +=head2 _register_chained_special + + Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper + The chained assembly mapper to register regions on + Arg [2] : string $from ('first' or 'last') + The direction we are registering from, and the name of the + internal mapper. + Arg [3] : string $seq_region_name + The name of the seqregion we are registering on + Arg [4] : listref $ranges + A list of ranges to register (in [$start,$end] tuples). + Arg [5] : (optional) $to_slice + Only register those on this Slice. + Description: Registers a set of ranges on a chained assembly mapper. + This function is at the heart of the chained mapping process. + It retrieves information from the assembly table and + dynamically constructs the mappings between two coordinate + systems which are 2 mapping steps apart. It does this by using + two internal mappers to load up a third mapper which is + actually used by the ChainedAssemblyMapper to perform the + mapping. + + This method must be called before any mapping is + attempted on regions of interest, otherwise only gaps will + be returned. Note that the ChainedAssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + + throw if the mapping between the coordinate systems cannot + be performed in two steps, which means there is an internal + error in the data in the meta table or in the code that creates + the mapping paths. + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + +sub _register_chained_special { + my $self = shift; + my $casm_mapper = shift; + my $from = shift; + my $seq_region_id = shift; + my $ranges = shift; + my $to_slice = shift; + my $found = 0; + + my $sth = $self->prepare("SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr.name, + sr.length, + asm.ori, + asm.asm_start, + asm.asm_end + FROM + assembly asm, seq_region sr + WHERE + asm.asm_seq_region_id = ? AND + ? <= asm.asm_end AND + ? >= asm.asm_start AND + asm.cmp_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? AND + asm.cmp_seq_region_id = ?"); + + + my ($start_name, $start_mid_mapper, $start_cs, $start_registry, + $end_name, $end_mid_mapper, $end_cs, $end_registry); + + if($from eq 'first') { + $start_name = 'first'; + $start_mid_mapper = $casm_mapper->first_middle_mapper(); + $start_cs = $casm_mapper->first_CoordSystem(); + $start_registry = $casm_mapper->first_registry(); + $end_mid_mapper = $casm_mapper->last_middle_mapper(); + $end_cs = $casm_mapper->last_CoordSystem(); + $end_registry = $casm_mapper->last_registry(); + $end_name = 'last'; + } elsif($from eq 'last') { + $start_name = 'last'; + $start_mid_mapper = $casm_mapper->last_middle_mapper(); + $start_cs = $casm_mapper->last_CoordSystem(); + $start_registry = $casm_mapper->last_registry(); + $end_mid_mapper = $casm_mapper->first_middle_mapper(); + $end_cs = $casm_mapper->first_CoordSystem(); + $end_registry = $casm_mapper->first_registry(); + $end_name = 'first'; + } else { + throw("Invalid from argument: [$from], must be 'first' or 'last'"); + } + + my $combined_mapper = $casm_mapper->first_last_mapper(); + my $mid_cs = $casm_mapper->middle_CoordSystem(); + my $mid_name = 'middle'; + my $csa = $self->db->get_CoordSystemAdaptor(); + + # Check for the simple case where the ChainedMapper is short + if( ! defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } + + + my @path; + if( defined $mid_cs ) { + @path = @{$csa->get_mapping_path($start_cs, $mid_cs)}; + } else { + @path = @{$csa->get_mapping_path( $start_cs, $end_cs )}; + } + if( ! defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } + + if(@path != 2 && defined( $path[1] )) { + my $path = join(',', map({$_->name .' '. $_->version} @path)); + my $len = scalar(@path) - 1; + throw("Unexpected mapping path between start and intermediate " . + "coord systems (". $start_cs->name . " " . $start_cs->version . + " and " . $mid_cs->name . " " . $mid_cs->version . ")." . + "\nExpected path length 1, got $len. " . + "(path=$path)"); + } + + my ($asm_cs,$cmp_cs); + $asm_cs = $path[0]; + $cmp_cs = $path[-1]; + + $combined_mapper = $casm_mapper->first_last_mapper(); + $mid_cs = $casm_mapper->middle_CoordSystem(); + $mid_name = 'middle'; + $csa = $self->db->get_CoordSystemAdaptor(); + + my $mid_cs_id; + + # Check for the simple case where the ChainedMapper is short + if ( !defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } else { + $mid_cs_id = $mid_cs->dbID(); + } + + my @mid_ranges; + my @start_ranges; + + my $to_cs = $to_slice->coord_system; + foreach my $direction (1, 0){ + my $id1; + my $id2; + if($direction){ + $id1 = $seq_region_id; + $id2 = $to_slice->get_seq_region_id(); + } + else{ + $id2 = $seq_region_id; + $id1 = $to_slice->get_seq_region_id(); + } + + foreach my $range (@$ranges) { + my ($start, $end) = @$range; + $sth->bind_param(1,$id1,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->bind_param(4,$to_cs->dbID,SQL_INTEGER); + $sth->bind_param(5,$id2,SQL_INTEGER); + $sth->execute(); + + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, + $ori, $start_start, $start_end); + + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, + \$start_end); + + while($sth->fetch()) { + $found = 1; + + if( defined $mid_cs ) { + $start_mid_mapper->add_map_coordinates + ( + $id1,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } else { + if( $from eq "first") { + if($direction){ + $combined_mapper->add_map_coordinates + ( + $id1,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } + else{ + $combined_mapper->add_map_coordinates + ( + $mid_seq_region_id, $mid_start, $mid_end, $ori, + $id1,$start_start, $start_end + ); + } + } else { + if($direction){ + $combined_mapper->add_map_coordinates + ( + $mid_seq_region_id, $mid_start, $mid_end, $ori, + $id1,$start_start, $start_end + ); + } + else{ + $combined_mapper->add_map_coordinates + ( + $id1,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } + } + } + + #update sr_name cache + my $arr = [ $mid_seq_region_id, $mid_seq_region, + $mid_cs_id, $mid_length ]; + + $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr; + + push @mid_ranges,[$mid_seq_region_id,$mid_seq_region, + $mid_start,$mid_end]; + push @start_ranges, [ $id1, $start_start, $start_end ]; + + #the region that we actually register may actually be larger or smaller + #than the region that we wanted to register. + #register the intersection of the region so we do not end up doing + #extra work later + + if($start_start < $start || $start_end > $end) { + $start_registry->check_and_register($id1,$start_start, + $start_end); + } + } + $sth->finish(); + } + if($found){ + if( ! defined $mid_cs ) { + for my $range ( @mid_ranges ) { + $end_registry->check_and_register( $range->[0], $range->[2], + $range->[3] ); + } + + # and thats it for the simple case ... + return; + } + } + } +} + + +=head2 register_all + + Arg [1] : Bio::EnsEMBL::AssemblyMapper $mapper + Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2); + + # make cache large enough to hold all of the mappings + $mapper->max_pair_count(10e6); + $asm_mapper_adaptor->register_all($mapper); + + # perform mappings as normal + $mapper->map($slice->seq_region_name(), $sr_start, $sr_end, + $sr_strand, $cs1); + ... + Description: This function registers the entire set of mappings between + two coordinate systems in an assembly mapper. + This will use a lot of memory but will be much more efficient + when doing a lot of mapping which is spread over the entire + genome. + Returntype : none + Exceptions : none + Caller : specialised prograhsm + Status : Stable + +=cut + +sub register_all { + my $self = shift; + my $mapper = shift; + + my $asm_cs_id = $mapper->assembled_CoordSystem()->dbID(); + my $cmp_cs_id = $mapper->component_CoordSystem()->dbID(); + + # retrieve every relevant assembled/component pair from the assembly table + + my $q = qq{ + SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + cmp_sr.name, + cmp_sr.length, + asm.ori, + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + asm_sr.name, + asm_sr.length + FROM + assembly asm, seq_region asm_sr, seq_region cmp_sr + WHERE + asm.cmp_seq_region_id = cmp_sr.seq_region_id AND + asm.asm_seq_region_id = asm_sr.seq_region_id AND + cmp_sr.coord_system_id = ? AND + asm_sr.coord_system_id = ? + }; + + my $sth = $self->prepare($q); + + $sth->bind_param(1,$cmp_cs_id,SQL_INTEGER); + $sth->bind_param(2,$asm_cs_id,SQL_INTEGER); + $sth->execute(); + + # load the mapper with the assembly information + + my ($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $cmp_length, + $ori, + $asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_length); + + $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id, + \$cmp_seq_region, \$cmp_length, \$ori, + \$asm_start, \$asm_end, \$asm_seq_region_id, + \$asm_seq_region, \$asm_length); + + my %asm_registered; + + while($sth->fetch()) { + $mapper->register_component($cmp_seq_region_id); + $mapper->mapper->add_map_coordinates( + $asm_seq_region_id, $asm_start, $asm_end, + $ori, + $cmp_seq_region_id, $cmp_start, $cmp_end); + + my $arr = [$cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_length]; + + $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr; + + # only register each asm seq_region once since it requires some work + if(!$asm_registered{$asm_seq_region_id}) { + $asm_registered{$asm_seq_region_id} = 1; + + # register all chunks from start of seq region to end + my $end_chunk = $asm_length >> $CHUNKFACTOR; + for(my $i = 0; $i <= $end_chunk; $i++) { + $mapper->register_assembled($asm_seq_region_id, $i); + } + + $arr = [$asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_length]; + + $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr; + } + } + + $sth->finish(); + + return; +} + + + +=head2 register_all_chained + + Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper + Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2); + + # make the cache large enough to hold all of the mappings + $mapper->max_pair_count(10e6); + # load all of the mapping data + $asm_mapper_adaptor->register_all_chained($mapper); + + # perform mappings as normal + $mapper->map($slice->seq_region_name(), $sr_start, $sr_end, + $sr_strand, $cs1); + ... + Description: This function registers the entire set of mappings between + two coordinate systems in a chained mapper. This will use a lot + of memory but will be much more efficient when doing a lot of + mapping which is spread over the entire genome. + Returntype : none + Exceptions : throw if mapper is between coord systems with unexpected + mapping paths + Caller : specialised programs doing a lot of genome-wide mapping + Status : Stable + +=cut + +sub register_all_chained { + my $self = shift; + my $casm_mapper = shift; + + my $first_cs = $casm_mapper->first_CoordSystem(); + my $mid_cs = $casm_mapper->middle_CoordSystem(); + my $last_cs = $casm_mapper->last_CoordSystem(); + + my $start_mid_mapper = $casm_mapper->first_middle_mapper(); + my $end_mid_mapper = $casm_mapper->last_middle_mapper(); + my $combined_mapper = $casm_mapper->first_last_mapper(); + + my @ranges; + + my $sth = $self->prepare( + 'SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr_cmp.name, + sr_cmp.length, + asm.ori, + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + sr_asm.name, + sr_asm.length + FROM + assembly asm, seq_region sr_asm, seq_region sr_cmp + WHERE + sr_asm.seq_region_id = asm.asm_seq_region_id AND + sr_cmp.seq_region_id = asm.cmp_seq_region_id AND + sr_asm.coord_system_id = ? AND + sr_cmp.coord_system_id = ?'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + + my @path; + + if ( !defined $mid_cs ) { + @path = @{ $csa->get_mapping_path( $first_cs, $last_cs ) }; + if ( !defined( $path[1] ) ) { + splice( @path, 1, 1 ); + } + } else { + @path = @{ $csa->get_mapping_path( $first_cs, $mid_cs ) }; + # fix for when we have something like supercontig#contig#chromosome + if ( !defined( $path[1] ) ) { + splice( @path, 1, 1 ); + } + } + + if ( @path != 2 ) { + my $path = + join( ',', map( { $_->name . ' ' . $_->version } @path ) ); + my $len = scalar(@path) - 1; + throw( "Unexpected mapping path between start and intermediate " + . "coord systems (" + . $first_cs->name . " " + . $first_cs->version . " and " + . $mid_cs->name . " " + . $mid_cs->version . ")." + . "\nExpected path length 1, got $len. " + . "(path=$path)" ); + } + + my ($asm_cs,$cmp_cs) = @path; + + $sth->{mysql_use_result} = 1; + $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER); + $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER); + $sth->execute(); + + + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, + $ori, $start_start, $start_end, $start_seq_region_id, $start_seq_region, + $start_length); + + if($asm_cs->equals($first_cs)) { + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, + \$start_end, \$start_seq_region_id, \$start_seq_region, + \$start_length); + } else { + $sth->bind_columns(\$start_start, \$start_end, \$start_seq_region_id, + \$start_seq_region, \$start_length, \$ori, + \$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length); + + } + + my ( $mid_cs_id, $start_cs_id, $reg, $mapper ); + if ( !defined $mid_cs ) { + $mid_cs_id = $last_cs->dbID(); + $start_cs_id = $first_cs->dbID(); + $mapper = $combined_mapper; + } else { + $mid_cs_id = $mid_cs->dbID(); + $start_cs_id = $first_cs->dbID(); + $mapper = $start_mid_mapper; + } + + $reg = $casm_mapper->first_registry(); + + while($sth->fetch()) { + $mapper->add_map_coordinates + ( + $start_seq_region_id, $start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + push( @ranges, [$start_seq_region_id, $start_start, $start_end ] ); + + $reg->check_and_register( $start_seq_region_id, 1, $start_length ); + if( ! defined $mid_cs ) { + $casm_mapper->last_registry()->check_and_register + ( $mid_seq_region_id, $mid_start, $mid_end ); + } + + my $arr = [ $mid_seq_region_id, $mid_seq_region, + $mid_cs_id, $mid_length ]; + + $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr; + + $arr = [ $start_seq_region_id, $start_seq_region, + $start_cs_id, $start_length ]; + + $self->{'sr_name_cache'}->{"$start_seq_region:$start_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$start_seq_region_id"} = $arr; + } + + if( ! defined $mid_cs ) { + # thats it for the simple case + return; + } + + + @path = @{ $csa->get_mapping_path( $last_cs, $mid_cs ) }; + if ( defined($mid_cs) ) { + if ( !defined( $path[1] ) ) { + splice( @path, 1, 1 ); + } + } + + if ( @path != 2 ) { + my $path = + join( ',', map( { $_->name . ' ' . $_->version } @path ) ); + my $len = scalar(@path) - 1; + throw( "Unexpected mapping path between intermediate and last " + . "coord systems (" + . $last_cs->name . " " + . $last_cs->version . " and " + . $mid_cs->name . " " + . $mid_cs->version . ")." + . "\nExpected path length 1, got $len. " + . "(path=$path)" ); + } + + ($asm_cs,$cmp_cs) = @path; + + $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER); + $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER); + $sth->execute(); + + + my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, + $end_length); + + if($asm_cs->equals($mid_cs)) { + $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id, + \$end_seq_region, \$end_length, \$ori, + \$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length); + } else { + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$end_start, + \$end_end, \$end_seq_region_id, \$end_seq_region, + \$end_length); + } + + my $end_cs_id = $last_cs->dbID(); + $reg = $casm_mapper->last_registry(); + + while($sth->fetch()) { + $end_mid_mapper->add_map_coordinates + ( + $end_seq_region_id, $end_start, $end_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + + $reg->check_and_register( $end_seq_region_id, 1, $end_length ); + + my $arr = [ $end_seq_region_id, $end_seq_region, + $end_cs_id, $end_length ]; + $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr; + } + + _build_combined_mapper( \@ranges, $start_mid_mapper, $end_mid_mapper, + $combined_mapper, "first" ); + + return; +} + + + +# after both halves of a chained mapper are loaded +# this function maps all ranges in $ranges and loads the +# results into the combined mapper +sub _build_combined_mapper { + my $ranges = shift; + my $start_mid_mapper = shift; + my $end_mid_mapper = shift; + my $combined_mapper = shift; + my $start_name = shift; + + my $mid_name = "middle"; + + foreach my $range (@$ranges) { + my ( $seq_region_id, $start, $end) = @$range; + + my $sum = 0; + + my @initial_coords = $start_mid_mapper->map_coordinates($seq_region_id, + $start,$end,1, + $start_name); + + foreach my $icoord (@initial_coords) { + #skip gaps + if($icoord->isa('Bio::EnsEMBL::Mapper::Gap')) { + $sum += $icoord->length(); + next; + } + + + #feed the results of the first mapping into the second mapper + my @final_coords = + $end_mid_mapper->map_coordinates($icoord->id, $icoord->start, + $icoord->end, + $icoord->strand, $mid_name); + + + foreach my $fcoord (@final_coords) { + #load up the final mapper + + if($fcoord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + my $total_start = $start + $sum; + my $total_end = $total_start + $fcoord->length - 1; + my $ori = $fcoord->strand(); + + if($start_name eq 'first') { # add coords in consistant order + $combined_mapper->add_map_coordinates( + $seq_region_id, $total_start, $total_end, $ori, + $fcoord->id(), $fcoord->start(), $fcoord->end()); + } else { + $combined_mapper->add_map_coordinates( + $fcoord->id(), $fcoord->start(), $fcoord->end(),$ori, + $seq_region_id, $total_start, $total_end); + } + + } + $sum += $fcoord->length(); + } + } + } + #all done! +} + + +=head2 seq_regions_to_ids + + Arg [1] : Bio::EnsEMBL::CoordSystem $coord_system + Arg [2] : listref of strings $seq_regions + Example : my @ids = @{$asma->seq_regions_to_ids($coord_sys, \@seq_regs)}; + Description: Converts a list of seq_region names to internal identifiers + using the internal cache that has accumulated while registering + regions for AssemblyMappers. If any requested regions are + not found in the cache an attempt is made to retrieve them + from the database. + Returntype : listref of ints + Exceptions : throw if a non-existant seqregion is provided + Caller : general + Status : Stable + +=cut + +sub seq_regions_to_ids { + my $self = shift; + my $coord_system = shift; + my $seq_regions = shift; + + my $cs_id = $coord_system->dbID(); + + my @out; + + foreach my $sr (@$seq_regions) { + my $arr = $self->{'sr_name_cache'}->{"$sr:$cs_id"}; + if( $arr ) { + push( @out, $arr->[0] ); + } else { + push @out, $self->_seq_region_name_to_id($sr,$cs_id); + } + } + + return \@out; +} + + +=head2 seq_ids_to_regions + + Arg [1] : listref of seq_region ids + Example : my @ids = @{$asma->ids_to_seq_regions(\@seq_ids)}; + Description: Converts a list of seq_region ids to seq region names + using the internal cache that has accumulated while registering + regions for AssemblyMappers. If any requested regions are + not found in the cache an attempt is made to retrieve them + from the database. + Returntype : listref of strings + Exceptions : throw if a non-existant seq_region_id is provided + Caller : general + Status : Stable + +=cut + +sub seq_ids_to_regions { + my $self = shift; + my $seq_region_ids = shift; + + my @out; + + foreach my $sr (@$seq_region_ids) { + my $arr = $self->{'sr_id_cache'}->{"$sr"}; + if( $arr ) { + push( @out, $arr->[1] ); + } else { + push @out, $self->_seq_region_id_to_name($sr); + } + } + + return \@out; +} + +=head2 delete_cache + + Description: Delete all the caches for the mappings/seq_regions + Returntype : none + Exceptions : none + Caller : General + Status : At risk + +=cut + +sub delete_cache{ + my ($self) = @_; + + %{$self->{'sr_name_cache'}} = (); + %{$self->{'sr_id_cache'}} = (); + + foreach my $key (keys %{$self->{'_asm_mapper_cache'}}){ + $self->{'_asm_mapper_cache'}->{$key}->flush(); + } + %{$self->{'_asm_mapper_cache'}} = (); + return; +} + + +=head2 register_region + + Description: DEPRECATED use register_assembled instead + +=cut + +sub register_region{ + my ($self, $assmapper, $type, $chr_name, $start, $end) = @_; + + deprecate('Use register_assembled instead'); + + $self->register_assembled($assmapper, $chr_name, $start, $end); +} + + +=head2 register_contig + + Description: DEPRECATED use register_component instead + +=cut + +sub register_contig { + my ($self, $assmapper, $type, $contig_id ) = @_; + + deprecate('Use register_component instead'); + + #not sure if the use is passing in a seq_region_name or a + #seq_region_id... + register_component($assmapper, $contig_id); +} + + +=head2 fetch_by_type + + Description: DEPRECATED use fetch_by_CoordSystems instead + +=cut + +sub fetch_by_type{ + my ($self,$type) = @_; + + deprecate('Use fetch_by_CoordSystems instead'); + + #assume that what the user wanted was a mapper between the sequence coord + #level and the top coord level + + my $csa = $self->db()->get_CoordSystemAdaptor(); + + my $cs1 = $csa->fetch_top_level($type); + my $cs2 = $csa->fetch_sequence_level(); + + return $self->fetch_by_CoordSystems($cs1,$cs2); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblySliceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblySliceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,240 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor - adaptor/factory for MappedSlices +representing alternative assemblies + +=head1 SYNOPSIS + + my $slice = + $slice_adaptor->fetch_by_region( 'chromosome', 14, 900000, 950000 ); + + my $msc = Bio::EnsEMBL::MappedSliceContainer->new( -SLICE => $slice ); + + my $asa = Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor->new; + + my ($mapped_slice) = @{ $asa->fetch_by_version( $msc, 'NCBIM36' ) }; + +=head1 DESCRIPTION + +NOTE: this code is under development and not fully functional nor tested +yet. Use only for development. + +This adaptor is a factory for creating MappedSlices representing +alternative assemblies and attaching them to a MappedSliceContainer. A +mapper will be created to map between the reference slice and the common +container slice coordinate system. + +=head1 METHODS + + new + fetch_by_version + +=head1 REALTED MODULES + + Bio::EnsEMBL::MappedSlice + Bio::EnsEMBL::MappedSliceContainer + Bio::EnsEMBL::Compara::AlignSlice + Bio::EnsEMBL::Compara::AlignSlice::Slice + Bio::EnsEMBL::AlignStrainSlice + Bio::EnsEMBL::StrainSlice + +=cut + +package Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::MappedSlice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Example : my $assembly_slice_adaptor = + Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor->new; + Description : Constructor. + Return type : Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + return $self; +} + + +=head2 fetch_by_version + + Arg[1] : Bio::EnsEMBL::MappedSliceContainer $container - the container + to attach MappedSlices to + Arg[2] : String $version - the assembly version to fetch + Example : my ($mapped_slice) = @{ $msc->fetch_by_version('NCBIM36') }; + Description : Creates a MappedSlice representing an alternative assembly + version of the container's reference slice. + Return type : listref of Bio::EnsEMBL::MappedSlice + Exceptions : thrown on wrong or missing arguments + Caller : general, Bio::EnsEMBL::MappedSliceContainer + Status : At Risk + : under development + +=cut + +sub fetch_by_version { + my $self = shift; + my $container = shift; + my $version = shift; + + # arguement check + unless ($container and ref($container) and + $container->isa('Bio::EnsEMBL::MappedSliceContainer')) { + throw("Need a MappedSliceContainer."); + } + + unless ($version) { + throw("Need an assembly version."); + } + + my $slice = $container->ref_slice; + + # project slice onto other assembly and construct MappedSlice for result + my $mapped_slice = Bio::EnsEMBL::MappedSlice->new( + -ADAPTOR => $self, + -CONTAINER => $container, + -NAME => $slice->name."\#mapped_$version", + ); + + my $cs_name = $slice->coord_system_name; + + foreach my $seg (@{ $slice->project($cs_name, $version) }) { + + my $proj_slice = $seg->to_Slice; + + # create a Mapper to map to/from the mapped_slice artificial coord system + my $mapper = Bio::EnsEMBL::Mapper->new('mapped_slice', 'native_slice'); + + # tell the mapper how to map this segment + $mapper->add_map_coordinates( + 'mapped_slice', + $seg->from_start, + $seg->from_end, + ($slice->strand * $proj_slice->strand), + $proj_slice->seq_region_name, + $proj_slice->start, + $proj_slice->end + ); + + # add the Slice/Mapper pair to the MappedSlice + $mapped_slice->add_Slice_Mapper_pair($proj_slice, $mapper); + } + + return [$mapped_slice]; +} + + +=head2 fetch_by_name + + Arg[1] : Bio::EnsEMBL::MappedSliceContainer $container - the container + to attach MappedSlices to + Arg[2] : String $name - the assembly name to fetch + Arg[3] : (optional) String $version -- the version for the new assembly + Example : my ($mapped_slice) = @{ $msc->fetch_by_name('LRG1','1') }; + Description : Creates a MappedSlice representing an alternative assembly + version of the container's reference slice. + Return type : listref of Bio::EnsEMBL::MappedSlice + Exceptions : thrown on wrong or missing arguments + Caller : general, Bio::EnsEMBL::MappedSliceContainer + Status : At Risk + : under development + +=cut + +sub fetch_by_name { + my $self = shift; + my $container = shift; + my $name = shift; + my $version = shift; + + # arguement check + unless ($container and ref($container) and + $container->isa('Bio::EnsEMBL::MappedSliceContainer')) { + throw("Need a MappedSliceContainer."); + } + + unless ($name) { + throw("Need an assembly name."); + } + + $version ||= ''; + my $slice = $container->ref_slice; + + # project slice onto other assembly and construct MappedSlice for result + my $mapped_slice = Bio::EnsEMBL::MappedSlice->new( + -ADAPTOR => $self, + -CONTAINER => $container, + -NAME => $slice->name."\#mapped_$name:$version", + ); + + + foreach my $seg (@{ $slice->project($name, $version) }) { + + my $proj_slice = $seg->to_Slice; + + # create a Mapper to map to/from the mapped_slice artificial coord system + my $mapper = Bio::EnsEMBL::Mapper->new('mapped_slice', 'native_slice'); + + # tell the mapper how to map this segment + $mapper->add_map_coordinates( + 'mapped_slice', + $seg->from_start, + $seg->from_end, + ($slice->strand * $proj_slice->strand), + $proj_slice->seq_region_name, + $proj_slice->start, + $proj_slice->end + ); + + # add the Slice/Mapper pair to the MappedSlice + $mapped_slice->add_Slice_Mapper_pair($proj_slice, $mapper); + } + + return [$mapped_slice]; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,424 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AttributeAdaptor - Provides database interaction for +Bio::EnsEMBL::Attribute objects. + + +=head1 SYNOPSIS + + # $db is a Bio::EnsEMBL::DBSQL::DBAdaptor object: + $attribute_adaptor = $db->get_AttributeAdaptor(); + + $attributes = $attribute_adaptor->fetch_all_by_MiscFeature($feature); + + $attributes = $attribute_adaptor->fetch_all_by_Slice($slice); + + $attribute_adaptor->store_on_Slice( $slice, \@attributes ); + + $attribute_adaptor->store_on_MiscFeature( $misc_feature, + \@attributes ) + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::AttributeAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Attribute; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Arg [...] : Superclass args. See Bio::EnsEMBL::DBSQL::BaseAdaptor + Description: Instantiates a Bio::EnsEMBL::DBSQL::AttributeAdaptor + Returntype : Bio::EnsEMBL::AttributeAdaptor + Exceptions : none + Caller : DBAdaptor + Status : Stable + +=cut + + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + + # cache creation could go here + return $self; +} + +use vars '$AUTOLOAD'; + +sub AUTOLOAD { + my ($self,@args) = @_; + my @array_return=(); + my $ref_return = undef; + $AUTOLOAD =~ /^.*::(\w+_)+(\w+)$/ ; + + my $sub = $1; + my $type = $2; + + + +# print STDERR "AUTO".$AUTOLOAD."\n"; + +# print STDERR "AUTOLOAD reached with call to $sub of type $type\n"; + if($self->can($sub)){ + return $self->$sub($type,@args); + } + else{ + warn("In AttribAdaptor cannot call sub $sub$type\n"); + } + return undef; +} + + + +sub store_on_ { + my $self = shift; + my $type = shift; + my $object = shift; + my $attributes = shift; + my $table; + + + my $object_id; + if($type =~ /[GT][er][na][en]/){ + if (!ref($object)){ + $object_id = $object; + } + else { + $object_id = $object->dbID; + } + $table = lc($type); +# $type = lc($type); + } + else{ + if(!ref($object) || !$object->isa('Bio::EnsEMBL::'.$type)) { + throw("$type argument is required. but you passed $object"); + } + if($type eq "Slice"){ + $object_id = $object->get_seq_region_id(); + $table = "seq_region"; + $type = "seq_region"; + } + else{ + if($type eq "MiscFeature"){ + $type = "misc_feature"; + $table = "misc"; + } + else{ + $table = lc($type); + } + + $object_id = $object->dbID(); + my $db = $self->db(); + + if(!$object->is_stored($db)) { + throw("$type is not stored in this database."); + } + + } + } + my $sth = $self->prepare( "INSERT into ".$table."_attrib ". + "SET ".$type."_id = ?, attrib_type_id = ?, ". + "value = ? " ); + + my $undef_circular_cache = 0; + for my $attrib ( @$attributes ) { + if(!ref($attrib) && $attrib->isa('Bio::EnsEMBL::Attribute')) { + throw("Reference to list of Bio::EnsEMBL::Attribute objects " . + "argument expected."); + } + my $atid = $self->_store_type( $attrib ); + if ((defined $attrib->code) and ($attrib->code eq 'circular_seq')) { + $undef_circular_cache = 1; + } + $sth->bind_param(1,$object_id,SQL_INTEGER); + $sth->bind_param(2,$atid,SQL_INTEGER); + $sth->bind_param(3,$attrib->value,SQL_VARCHAR); + $sth->execute(); + } + + if($table eq "seq_region") { + if ($undef_circular_cache) { + #the slice is circular + $object->{'circular'} = 1; + my $slice_adaptor = $object->adaptor(); + #undefine slice adaptor->is_circular and the circular slice cache + if (defined $slice_adaptor) { + $slice_adaptor->{'is_circular'} = undef; + $slice_adaptor->{'circular_sr_id_cache'} = {}; + } + } + } + + return; +} + + +sub remove_from_{ + my $self = shift; + my $type = shift; + my $object = shift; + my $code = shift; + my $table; + + if(!ref($object) || !$object->isa('Bio::EnsEMBL::'.$type)) { + throw("$type argument is required or a attrib code. but you passed $object"); + } + + my $object_id; + if($type eq "Slice"){ + $object_id = $object->get_seq_region_id(); + $table = "seq_region"; + $type = "seq_region"; + if ((defined $code) and ($code eq 'circular_seq')) { + #undefine slice->is_circular, slice adaptor->is_circular and the circular slice cache + $object->{'circular'} = undef; + my $slice_adaptor = $object->adaptor(); + if (defined $slice_adaptor) { + $slice_adaptor->{'is_circular'} = undef; + $slice_adaptor->{'circular_sr_id_cache'} = {}; + } + } + } + else{ + if($type eq "MiscFeature"){ + $type = "misc_feature"; + $table = "misc"; + } + else{ + $table = lc($type); + } + + $object_id = $object->dbID(); + my $db = $self->db(); + + if(!$object->is_stored($db)) { + throw("$type is not stored in this database."); + } + + } + + if(!defined($object_id)) { + throw("$type must have dbID."); + } + + my $sth; + if(defined($code)){ + $sth = $self->prepare("DELETE a FROM ".$table."_attrib a ,attrib_type at " . + "WHERE a.attrib_type_id = at.attrib_type_id AND ". + "a.".$type."_id = ? AND ". + "at.code like ?"); + $sth->bind_param(1,$object_id,SQL_INTEGER); + $sth->bind_param(2,$code,SQL_VARCHAR); + } + else{ + $sth = $self->prepare("DELETE FROM ".$table."_attrib " . + "WHERE ".$type."_id = ?"); + $sth->bind_param(1,$object_id,SQL_INTEGER); + } + $sth->execute(); + + $sth->finish(); + + return; +} + + + +sub fetch_all_by_{ + my $self = shift; + my $type = shift; + my $object = shift; + my $code = shift; + my $table =undef; + + if(!ref($object) || !$object->isa('Bio::EnsEMBL::'.$type)) { + throw("$type argument is required. but you passed $object"); + } + + my $object_id; + if($type eq "Slice"){ + $object_id = $object->get_seq_region_id(); + $table = "seq_region"; + $type = "seq_region"; + } + else{ + if($type eq "MiscFeature"){ + $type = "misc_feature"; + $table = "misc"; + } + else{ + $table = lc($type); + } + + $object_id = $object->dbID(); + } + + if(!defined($object_id)) { + throw("$type must have dbID."); + } + + + my $sql = "SELECT at.code, at.name, at.description, t.value " . + "FROM ".($table||$type)."_attrib t, attrib_type at ". + "WHERE t.".$type."_id = ? " . + "AND at.attrib_type_id = t.attrib_type_id "; + + if(defined($code)){ + $sql .= 'AND at.code like "'.$code.'" '; + } + + my $sth = $self->prepare($sql); + + $sth->bind_param(1,$object_id,SQL_INTEGER); + $sth->execute(); + + my $results = $self->_obj_from_sth($sth); + + $sth->finish(); + + return $results; + +} + + +sub DESTROY{ +} + + +# +# _id_check +# +# backwards compatibility check: +# check if $ensID is an object; if so, return $obj->dbID +# + +sub _id_check { + my $self = shift; + my $ensID = shift; + + if ($ensID =~ /^\d+$/) { + return $ensID; + + } elsif (ref($ensID) eq 'Bio::EnsEMBL::Gene' or + ref($ensID) eq 'Bio::EnsEMBL::Transcript' or + ref($ensID) eq 'Bio::EnsEMBL::Translation') { + + warning("You should pass a dbID rather than an ensembl object to store the attribute on"); + + if ($ensID->dbID) { + return $ensID->dbID; + } else { + throw("Ensembl object ".$ensID->display_id." doesn't have a dbID, can't store attribute"); + } + + } else { + throw("Invalid dbID"); + } + +} + + +# _store_type + +sub _store_type { + my $self = shift; + my $attrib = shift; + + my $sth1 = $self->prepare + ("INSERT IGNORE INTO attrib_type set code = ?, name = ?, ". + "description = ?" ); + + + $sth1->bind_param(1,$attrib->code,SQL_VARCHAR); + $sth1->bind_param(2,$attrib->name,SQL_VARCHAR); + $sth1->bind_param(3,$attrib->description,SQL_LONGVARCHAR); + + my $rows_inserted = $sth1->execute(); + + my $atid = $sth1->{'mysql_insertid'}; + + if($rows_inserted == 0) { + # the insert failed because the code is already stored + my $sth2 = $self->prepare + ("SELECT attrib_type_id FROM attrib_type " . + "WHERE code = ?"); + $sth2->bind_param(1,$attrib->code,SQL_VARCHAR); + $sth2->execute(); + ($atid) = $sth2->fetchrow_array(); + + $sth2->finish(); + + if(!$atid) { + throw("Could not store or fetch attrib_type code [".$attrib->code."]\n" . + "Wrong database user/permissions?"); + } + } + + $sth1->finish(); + + return $atid; +} + + +sub _obj_from_sth { + my $self = shift; + my $sth = shift; + + my ($code, $name, $desc, $value); + $sth->bind_columns(\$code, \$name, \$desc, \$value); + + my @results; + while($sth->fetch()) { + push @results, Bio::EnsEMBL::Attribute->new_fast + ( {'code' => $code, + 'name' => $name, + 'description' => $desc, + 'value' => $value} ); + } + + return \@results; +} + + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,852 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::BaseAdaptor - Base Adaptor for DBSQL adaptors + +=head1 SYNOPSIS + + # base adaptor provides + + # SQL prepare function + $adaptor->prepare("sql statement"); + + # get of root DBAdaptor object + $adaptor->db(); + + # constructor, ok for inheritence + $adaptor = Bio::EnsEMBL::DBSQL::SubClassOfBaseAdaptor->new($dbobj) + +=head1 DESCRIPTION + +This is a true base class for Adaptors in the Ensembl DBSQL +system. Original idea from Arne + +Adaptors are expected to have the following functions + + $obj = $adaptor->fetch_by_dbID($internal_id); + +which builds the object from the primary key of the object. This +function is crucial because it allows adaptors to collaborate relatively +independently of each other - in other words, we can change the schema +under one adaptor without too many knock on changes through the other +adaptors. + +Most adaptors will also have + + $dbid = $adaptor->store($obj); + +which stores the object. Currently the storing of an object also causes +the objects to set + + $obj->dbID(); + +correctly and attach the adaptor. + +Other fetch functions go by the convention of + + @object_array = @{ $adaptor->fetch_all_by_XXXX($arguments_for_XXXX) }; + +sometimes it returns an array ref denoted by the 'all' in the name of +the method, sometimes an individual object. For example + + $gene = $gene_adaptor->fetch_by_stable_id($stable_id); + +or + + @fp = @{ $simple_feature_adaptor->fetch_all_by_Slice($slice) }; + +Occassionally adaptors need to provide access to lists of ids. In this +case the convention is to go list_XXXX, such as + + @gene_ids = @{ $gene_adaptor->list_geneIds() }; + +(note: this method is poorly named) + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::BaseAdaptor; +require Exporter; +use vars qw(@ISA @EXPORT); +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use DBI qw(:sql_types); +use Data::Dumper; + +@ISA = qw(Exporter); +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBSQL::DBConnection $dbobj + Example : $adaptor = new AdaptorInheritedFromBaseAdaptor($dbobj); + Description: Creates a new BaseAdaptor object. The intent is that this + constructor would be called by an inherited superclass either + automatically or through $self->SUPER::new in an overridden + new method. + Returntype : Bio::EnsEMBL::DBSQL::BaseAdaptor + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBConnection + Status : Stable + +=cut + +sub new { + my ( $class, $dbobj ) = @_; + + my $self = bless {}, $class; + + if ( !defined $dbobj || !ref $dbobj ) { + throw("Don't have a db [$dbobj] for new adaptor"); + } + + if ( $dbobj->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') ) { + $self->db($dbobj); + $self->dbc( $dbobj->dbc ); + $self->species_id( $dbobj->species_id() ); + $self->is_multispecies( $dbobj->is_multispecies() ); + } elsif ( ref($dbobj) =~ /DBAdaptor$/ ) { + $self->db($dbobj); + $self->dbc( $dbobj->dbc ); + } elsif ( ref($dbobj) =~ /DBConnection$/ ) { + $self->dbc($dbobj); + } else { + throw("Don't have a DBAdaptor [$dbobj] for new adaptor"); + } + + return $self; +} + + +=head2 prepare + + Arg [1] : string $string + a SQL query to be prepared by this adaptors database + Example : $sth = $adaptor->prepare("select yadda from blabla") + Description: provides a DBI statement handle from the adaptor. A convenience + function so you dont have to write $adaptor->db->prepare all the + time + Returntype : DBI::StatementHandle + Exceptions : none + Caller : Adaptors inherited from BaseAdaptor + Status : Stable + +=cut + +sub prepare { + my ( $self, $string ) = @_; + + # Uncomment next line to cancel caching on the SQL side. + # Needed for timing comparisons etc. + #$string =~ s/SELECT/SELECT SQL_NO_CACHE/i; + + return $self->dbc->prepare($string); +} + + +=head2 db + + Arg [1] : (optional) Bio::EnsEMBL::DBSQL::DBAdaptor $obj + the database this adaptor is using. + Example : $db = $adaptor->db(); + Description: Getter/Setter for the DatabaseConnection that this adaptor is + using. + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : none + Caller : Adaptors inherited from BaseAdaptor + Status : Stable + +=cut + +sub db { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'db'} = $value; + } + + return $self->{'db'}; +} + +=head2 dbc + + Arg [1] : (optional) Bio::EnsEMBL::DBSQL::DBConnection $obj + the database this adaptor is using. + Example : $db = $adaptor->db(); + Description: Getter/Setter for the DatabaseConnection that this adaptor is + using. + Returntype : Bio::EnsEMBL::DBSQL::DBConnection + Exceptions : none + Caller : Adaptors inherited from BaseAdaptor + Status : Stable + +=cut + +sub dbc { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'dbc'} = $value; + } + + return $self->{'dbc'}; +} + +=head2 is_multispecies + + Arg [1] : (optional) boolean $arg + Example : if ($adaptor->is_multispecies()) { } + Description: Getter/Setter for the is_multispecies boolean of + to use for this adaptor. + Returntype : boolean + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_multispecies { + my ( $self, $arg ) = @_; + + if ( defined($arg) ) { + $self->{_is_multispecies} = $arg; + } + + return $self->{_is_multispecies}; +} + +=head2 species_id + + Arg [1] : (optional) int $species_id + The internal ID of the species in a multi-species database. + Example : $db = $adaptor->db(); + Description: Getter/Setter for the internal ID of the species in a + multi-species database. The default species ID is 1. + Returntype : Integer + Exceptions : none + Caller : Adaptors inherited from BaseAdaptor + Status : Stable + +=cut + +sub species_id { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'species_id'} = $value; + } + + return $self->{'species_id'} || 1; +} + + +# list primary keys for a particular table +# args are table name and primary key field +# if primary key field is not supplied, tablename_id is assumed +# returns listref of IDs +sub _list_dbIDs { + my ( $self, $table, $pk, $ordered ) = @_; + + if ( !defined($pk) ) { $pk = $table . "_id" } + + my $sql = sprintf( "SELECT %s FROM %s", $pk, $table ); + + my $join_with_cs = 0; + if ( $self->is_multispecies() + && $self->isa('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor') + && !$self->isa('Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor') ) + { + + $sql .= q( +JOIN seq_region USING (seq_region_id) +JOIN coord_system cs USING (coord_system_id) +WHERE cs.species_id = ? +); + + $join_with_cs = 1; + } + + if ( defined($ordered) && $ordered ) { + $sql .= " ORDER BY seq_region_id, seq_region_start"; + } + + my $sth = $self->prepare($sql); + + if ($join_with_cs) { + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + } + + eval { $sth->execute() }; + if ($@) { + throw("Detected an error whilst executing SQL '${sql}': $@"); + } + + my $id; + $sth->bind_col( 1, \$id ); + + my @out; + while ( $sth->fetch() ) { + push( @out, $id ); + } + + return \@out; +} ## end sub _list_dbIDs + + +# _straight_join + +# Arg [1] : (optional) boolean $new_val +# Example : $self->_straight_join(1); +# $self->generic_fetch($constraint); +# $self->_straight_join(0); +# Description: PROTECTED Getter/Setter that turns on/off the use of +# a straight join in queries. +# Returntype : boolean +# Exceptions : none +# Caller : general + +sub _straight_join { + my $self = shift; + if(@_) { + $self->{'_straight_join'} = shift; + } + + return $self->{'_straight_join'}; +} + + +=head2 bind_param_generic_fetch + + Arg [1] : (optional) scalar $param + This is the parameter to bind + Arg [2] : (optional) int $sql_type + Type of the parameter (from DBI (:sql_types)) + Example : $adaptor->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + $adaptor->generic_fetch(); + Description: When using parameters for the query, will call the bind_param to avoid + some security issues. If there are no arguments, will return the bind_parameters + ReturnType : listref + Exceptions: if called with one argument + +=cut + +sub bind_param_generic_fetch{ + my $self = shift; + my $param = shift; + my $sql_type = shift; + + if (defined $param && !defined $sql_type){ + throw("Need to specify sql_type for parameter $param\n"); + } + elsif (defined $param && defined $sql_type){ + #check when there is a SQL_INTEGER type that the parameter is really a number + if ($sql_type eq SQL_INTEGER){ + throw "Trying to assign a non numerical parameter to an integer value in the database" if ($param !~ /^\d+$/); + } + #both paramters have been entered, push it to the bind_param array + push @{$self->{'_bind_param_generic_fetch'}},[$param,$sql_type]; + } + elsif (!defined $param && !defined $sql_type){ + #when there are no arguments, return the array + return $self->{'_bind_param_generic_fetch'}; + } + +} + + + +=head2 generic_fetch + + Arg [1] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Arg [2] : (optional) Bio::EnsEMBL::AssemblyMapper $mapper + A mapper object used to remap features + as they are retrieved from the database + Arg [3] : (optional) Bio::EnsEMBL::Slice $slice + A slice that features should be remapped to + Example : $fts = $a->generic_fetch('contig_id in (1234, 1235)'); + Description: Performs a database fetch and returns feature objects in + contig coordinates. + Returntype : listref of Bio::EnsEMBL::SeqFeature in contig coordinates + Exceptions : Thrown if there is an issue with querying the data + Caller : BaseFeatureAdaptor, ProxyDnaAlignFeatureAdaptor::generic_fetch + Status : Stable + +=cut + +sub generic_fetch { + my ($self, $constraint, $mapper, $slice) = @_; + my $sql = $self->_generate_sql($constraint); + my $params = $self->bind_param_generic_fetch(); + $params ||= []; + $self->{_bind_param_generic_fetch} = undef; + my $sth = $self->db()->dbc()->prepare($sql); + my $i = 1; + foreach my $param (@{$params}){ + $sth->bind_param($i,$param->[0],$param->[1]); + $i++; + } + eval { $sth->execute() }; + if ($@) { + throw("Detected an error whilst executing SQL '${sql}': $@"); + } + + my $res = $self->_objs_from_sth($sth, $mapper, $slice); + $sth->finish(); + return $res; +} + +=head2 generic_count + + Arg [1] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Example : $number_feats = $a->generic_count('contig_id in (1234, 1235)'); + Description: Performs a database fetch and returns a count of those features + found. This is analagous to C + Returntype : Integer count of the elements. + Exceptions : Thrown if there is an issue with querying the data + +=cut + +sub generic_count { + my ($self, $constraint) = @_; + my $sql = $self->_generate_sql($constraint, 'count(*)'); + my $params = $self->bind_param_generic_fetch(); + $params ||= []; + $self->{_bind_param_generic_fetch} = undef; + my $h = $self->db()->dbc()->sql_helper(); + my $count = $h->execute_single_result(-SQL => $sql, -PARAMS => $params); + return $count; +} + +sub _generate_sql { + my ($self, $constraint, @input_columns) = @_; + + my @tabs = $self->_tables(); + + my $extra_default_where; + + # Hack for feature types that needs to be restricted to species_id (in + # coord_system). + if ( $self->is_multispecies() + && $self->isa('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor') + && !$self->isa('Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor') ) + { + # We do a check to see if there is already seq_region + # and coord_system defined to ensure we get the right + # alias. We then do the extra query irrespectively of + # what has already been specified by the user. + my %thash = map { $_->[0] => $_->[1] } @tabs; + + my $sr_alias = + ( exists( $thash{seq_region} ) ? $thash{seq_region} : 'sr' ); + my $cs_alias = + ( exists( $thash{coord_system} ) ? $thash{coord_system} : 'cs' ); + + if ( !exists( $thash{seq_region} ) ) { + push( @tabs, [ 'seq_region', $sr_alias ] ); + } + if ( !exists( $thash{coord_system} ) ) { + push( @tabs, [ 'coord_system', $cs_alias ] ); + } + + $extra_default_where = sprintf( + '%s.seq_region_id = %s.seq_region_id ' + . 'AND %s.coord_system_id = %s.coord_system_id ' + . 'AND %s.species_id = ?', + $tabs[0]->[1], $sr_alias, $sr_alias, + $cs_alias, $cs_alias ); + + $self->bind_param_generic_fetch( $self->species_id(), SQL_INTEGER ); + } ## end if ( $self->is_multispecies...) + + @input_columns = $self->_columns() if ! @input_columns; + my $columns = join(', ', @input_columns); + + # + # Construct a left join statement if one was defined, and remove the + # left-joined table from the table list + # + my @left_join_list = $self->_left_join(); + my $left_join_prefix = ''; + my $left_join = ''; + my @tables; + if(@left_join_list) { + my %left_join_hash = map { $_->[0] => $_->[1] } @left_join_list; + while(my $t = shift @tabs) { + my $t_alias = $t->[0] . " " . $t->[1]; + if( exists $left_join_hash{ $t->[0] } || exists $left_join_hash{$t_alias}) { + my $condition = $left_join_hash{ $t->[0] }; + $condition ||= $left_join_hash{$t_alias}; + my $syn = $t->[1]; + $left_join .= + "\n LEFT JOIN " . $t->[0] . " $syn ON $condition ) "; + $left_join_prefix .= '('; + } else { + push @tables, $t; + } + } + } else { + @tables = @tabs; + } + + my $straight_join = ''; + + if($self->_straight_join()) { + $straight_join = "STRAIGHT_JOIN"; + } + + #construct a nice table string like 'table1 t1, table2 t2' + my $tablenames = join(', ', map({ join(' ', @$_) } @tables)); + + my $sql = + "SELECT $straight_join $columns\n" + . "FROM $left_join_prefix ($tablenames) $left_join"; + + my $default_where = $self->_default_where_clause(); + my $final_clause = $self->_final_clause; + + if ($extra_default_where) { + if ($default_where) { + $default_where .= "\n AND $extra_default_where"; + } else { + $default_where = $extra_default_where; + } + } + + #append a where clause if it was defined + if ($constraint) { + $sql .= "\n WHERE $constraint "; + if ($default_where) { + $sql .= " AND\n $default_where "; + } + } elsif ($default_where) { + $sql .= "\n WHERE $default_where "; + } + + #append additional clauses which may have been defined + $sql .= "\n$final_clause"; + + # FOR DEBUG: + #printf(STDERR "SQL:\n%s\n", $sql); + + return $sql; +} + + +=head2 fetch_by_dbID + + Arg [1] : int $id + The unique database identifier for the feature to be obtained + Example : $feat = $adaptor->fetch_by_dbID(1234)); + $feat = $feat->transform('contig'); + Description: Returns the feature created from the database defined by the + the id $id. The feature will be returned in its native + coordinate system. That is, the coordinate system in which it + is stored in the database. In order to convert it to a + particular coordinate system use the transfer() or transform() + method. If the feature is not found in the database then + undef is returned instead + Returntype : Bio::EnsEMBL::Feature or undef + Exceptions : thrown if $id arg is not provided + does not exist + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID{ + my ($self,$id) = @_; + + throw("id argument is required") if(!defined $id); + + #construct a constraint like 't1.table1_id = 123' + my @tabs = $self->_tables; + my ($name, $syn) = @{$tabs[0]}; + $self->bind_param_generic_fetch($id,SQL_INTEGER); + my $constraint = "${syn}.${name}_id = ?"; + + #Should only be one + my ($feat) = @{$self->generic_fetch($constraint)}; + + return undef if(!$feat); + + return $feat; +} + + +=head2 fetch_all_by_dbID_list + + Arg [1] : listref of integers $id_list + The unique database identifiers for the features to + be obtained. + Arg [2] : optional - Bio::EnsEMBL::Slice to map features onto. + Example : @feats = @{$adaptor->fetch_all_by_dbID_list([1234, 2131, 982]))}; + Description: Returns the features created from the database + defined by the the IDs in contained in the provided + ID list $id_list. The features will be returned + in their native coordinate system. That is, the + coordinate system in which they are stored in the + database. In order to convert the features to a + particular coordinate system use the transfer() or + transform() method. If none of the features are + found in the database a reference to an empty list is + returned. + Returntype : listref of Bio::EnsEMBL::Features + Exceptions : thrown if $id arg is not provided + does not exist + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_dbID_list { + my ( $self, $id_list_ref, $slice ) = @_; + + if ( !defined($id_list_ref) || ref($id_list_ref) ne 'ARRAY' ) { + throw("id_list list reference argument is required"); + } + + if ( !@{$id_list_ref} ) { return [] } + + # Construct a constraint like 't1.table1_id = 123' + my @tabs = $self->_tables(); + my ( $name, $syn ) = @{ $tabs[0] }; + + # Ensure that we do not exceed MySQL's max_allowed_packet (defaults to + # 1 MB) splitting large queries into smaller queries of at most 256 KB + # (32768 8-bit characters). Assuming a (generous) average dbID string + # length of 16, this means 2048 dbIDs in each query. + my $max_size = 2048; + + + my %id_list; + $id_list{$_}++ for @{$id_list_ref}; + my @id_list = keys %id_list; + + my @out; + + while (@id_list) { + my @ids; + my $id_str; + + if ( scalar(@id_list) > $max_size ) { + @ids = splice( @id_list, 0, $max_size ); + } else { + @ids = @id_list; + @id_list = (); + } + + if ( scalar(@ids) > 1 ) { + $id_str = " IN (" . join( ',', @ids ) . ")"; + } else { + $id_str = " = " . $ids[0]; + } + + my $constraint = "${syn}.${name}_id $id_str"; + + push @out, @{ $self->generic_fetch($constraint, undef, $slice) }; + } + + return \@out; +} ## end sub fetch_all_by_dbID_list + +# might not be a good idea, but for convenience +# shouldnt be called on the BIG tables though + +sub fetch_all { + my $self = shift; + return $self->generic_fetch(); +} + +=head2 last_insert_id + + Arg [1] : (optional) $field the name of the field the inserted ID was pushed + into + Arg [2] : (optional) HashRef used to pass extra attributes through to the + DBD driver + Arg [3] : (optional) $table the name of the table to use if the adaptor + does not implement C<_tables()> + Description : Delegating method which uses DBI to extract the last inserted + identifier. If using MySQL we just call the DBI method + L since MySQL ignores any extra + arguments. See L for more information about this + delegated method. + Example : my $id = $self->last_insert_id('my_id'); my $other_id = $self->last_insert_id(); + Returntype : Scalar or undef + +=cut + +sub last_insert_id { + my ($self, $field, $attributes, $table) = @_; + my $dbc = $self->dbc(); + my $dbh = $dbc->db_handle(); + my @args; + if($dbc->driver() eq 'mysql') { + @args = (undef,undef,undef,undef); + } + else { + if(!$table) { + ($table) = $self->_tables(); + } + @args = (undef, $dbc->dbname(), $table->[0], $field); + } + $attributes ||= {}; + return $dbh->last_insert_id(@args, $attributes); +} + + +#_tables +# +# Args : none +# Example : $tablename = $self->_table_name() +# Description: ABSTRACT PROTECTED +# Subclasses are responsible for implementing this +# method. It should list of [tablename, alias] pairs. +# Additionally the primary table (with the dbID, +# analysis_id, and score) should be the first table in +# the list. e.g: +# ( ['repeat_feature', 'rf'], +# ['repeat_consensus', 'rc']); +# used to obtain features. +# Returntype : list of [tablename, alias] pairs +# Exceptions : thrown if not implemented by subclass +# Caller : BaseFeatureAdaptor::generic_fetch +# + +sub _tables { + throw( "abstract method _tables not defined " + . "by implementing subclass of BaseAdaptor" ); +} + + +#_columns +# +# Args : none +# Example : $tablename = $self->_columns() +# Description: ABSTRACT PROTECTED +# Subclasses are responsible for implementing this +# method. It should return a list of columns to be +# used for feature creation. +# Returntype : list of strings +# Exceptions : thrown if not implemented by subclass +# Caller : BaseFeatureAdaptor::generic_fetch +# + +sub _columns { + throw( "abstract method _columns not defined " + . "by implementing subclass of BaseAdaptor" ); +} + + +# _default_where_clause +# +# Arg [1] : none +# Example : none +# Description: May be overridden to provide an additional where +# constraint to the SQL query which is generated to +# fetch feature records. This constraint is always +# appended to the end of the generated where clause +# Returntype : string +# Exceptions : none +# Caller : generic_fetch +# + +sub _default_where_clause { return '' } + + +# _left_join + +# Arg [1] : none +# Example : none +# Description: Can be overridden by a subclass to specify any left +# joins which should occur. The table name specigfied +# in the join must still be present in the return +# values of. +# Returntype : a {'tablename' => 'join condition'} pair +# Exceptions : none +# Caller : general +# + +sub _left_join { return () } + + +#_final_clause + +# Arg [1] : none +# Example : none +# Description: May be overriden to provide an additional clause +# to the end of the SQL query used to fetch feature +# records. This is useful to add a required ORDER BY +# clause to the query for example. +# Returntype : string +# Exceptions : none +# Caller : generic_fetch + +sub _final_clause { return '' } + + +#_objs_from_sth + +# Arg [1] : DBI::row_hashref $hashref containing key-value pairs +# for each of the columns specified by the _columns method +# Example : my @feats = $self->_obj_from_hashref +# Description: ABSTRACT PROTECTED +# The subclass is responsible for implementing this +# method. It should take in a DBI row hash reference +# and return a list of created features in contig +# coordinates. +# Returntype : list of Bio::EnsEMBL::*Features in contig coordinates +# Exceptions : thrown if not implemented by subclass +# Caller : BaseFeatureAdaptor::generic_fetch + +sub _objs_from_sth { + throw( "abstract method _objs_from_sth not defined " + . "by implementing subclass of BaseAdaptor" ); +} + +sub dump_data { + my $self = shift; + my $data = shift; + + my $dumper = Data::Dumper->new([$data]); + $dumper->Indent(0); + $dumper->Terse(1); + my $dump = $dumper->Dump(); +# $dump =~ s/'/\\'/g; + # $dump =~ s/^\$VAR1 = //; + return $dump; +} + +sub get_dumped_data { + my $self = shift; + my $data = shift; + + $data =~ s/\n|\r|\f|\\//g; + return eval ($data); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseAlignFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseAlignFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,326 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor - Abstract Base class for +AlignFeatureAdaptors + +=head1 SYNOPSIS + +Abstract class, should not be instantiated. Implementation of abstract +methods must be performed by subclasses. + +=head1 DESCRIPTION + +This is a base adaptor for the align feature adaptors +DnaAlignFeatureAdaptor and ProteinAlignFeatureAdaptor. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor; +use vars qw(@ISA @EXPORT); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +=head2 fetch_all_by_Slice_and_hcoverage + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice from which to obtain align features. + Arg [2] : (optional) float $hcoverage + A lower bound for the hcoverage of feats to obtain. + Arg [3] : (optional) string $logic_name + The logic name of the type of features to obtain. + Example : @feats = @{ + $adaptor->fetch_all_by_Slice_and_hcoverage( $slice, + 50.0 ) }; + Description: Returns a listref of features created from the + database which are on the Slice $slice and with a + hcoverage greater than $hcoverage. If logic name + is defined, only features with an analysis of type + $logic_name will be returned. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + in Slice coordinates + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice_and_hcoverage { + my ( $self, $slice, $hcoverage, $logic_name ) = @_; + + my $constraint; + if ( defined($hcoverage) ) { + $constraint = "hcoverage > $hcoverage"; + } + + return + $self->fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); +} + +=head2 fetch_all_by_Slice_and_external_db + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice from which to obtain align features. + Arg [2] : String $external_db_name + Name of the external DB to which the align features + should be restricted. + Arg [3] : (optional) string $logic_name + The logic name of the type of features to obtain. + Example : @feats = @{ + $adaptor->fetch_all_by_Slice_and_external_db( $slice, + 'EMBL' ) }; + Description: Returns a listref of features created from the + database which are on the Slice $slice and associated + with external DB $external_db_name. If logic name + is defined, only features with an analysis of type + $logic_name will be returned. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + in Slice coordinates + Exceptions : thrown if $external_db_name is not defined or if + the subclass does not return a table alias for the + external_db table from _tables() + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice_and_external_db { + my ( $self, $slice, $external_db_name, $logic_name ) = @_; + + if ( !defined($external_db_name) ) { + throw("Need name of external DB to restrict to"); + } + + my @join_tables = $self->_tables(); + + my $edb_alias; + foreach my $join_table (@join_tables) { + my ( $table, $table_alias ) = @{$join_table}; + if ( $table eq 'external_db' ) { + $edb_alias = $table_alias; + last; + } + } + + if ( !defined($edb_alias) ) { + throw("Can not find alias for external_db table"); + } + + my $constraint = sprintf( "%s.db_name = %s", + $edb_alias, + $self->dbc()->db_handle() + ->quote( $external_db_name, SQL_VARCHAR ) + ); + + return + $self->fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); +} ## end sub fetch_all_by_Slice_and_external_db + +=head2 fetch_all_by_Slice_and_pid + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice from which to obtain align features. + Arg [2] : (optional) float $pid + A lower bound for the percentage identity of features + to obtain. + Arg [3] : (optional) string $logic_name + The logic name of the type of features to obtain. + Example : @feats = + @{ $adaptor->fetch_all_by_Slice_and_pid( $slice, 50.0 ) }; + Description: Returns a listref of features created from the + database which are on the Slice $slice and with a + percentage identity greater than $pid. If logic name + is defined, only features with an analysis of type + $logic_name will be returned. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + in Slice coordinates + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice_and_pid { + my ( $self, $slice, $pid, $logic_name ) = @_; + + # #get the primary table alias + # my @tabs = $self->_tables; + # my $alias = $tabs[0]->[1]; + + # if(defined $pid) { + # $constraint = "${alias}.perc_ident > $pid"; + # } + + my $constraint; + if ( defined($pid) ) { + $constraint = sprintf( "perc_ident > %s", + $self->dbc()->db_handle() + ->quote( $pid, SQL_FLOAT ) ); + } + + return + $self->fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); +} + + +=head2 fetch_all_by_hit_name + + Arg [1] : string $hit_name + The hit_name of the features to obtain + Arg [2] : (optional) string $logic_name + The analysis logic name of the type of features to + obtain. + Example : @feats = + @{ $adaptor->fetch_all_by_hit_name( $name, + $logic_name ); } + Description: Returns a listref of features created from the + database which correspond to the given hit_name. If + logic name is defined, only features with an analysis + of type $logic_name will be returned. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + Exceptions : thrown if hit_name is not defined + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_hit_name { + my ( $self, $hit_name, $logic_name ) = @_; + + if ( !defined($hit_name) ) { + throw("hit_name argument is required"); + } + + # Construct a constraint like 't1.hit_name = "123"' + my @tabs = $self->_tables(); + my ( $name, $syn ) = @{ $tabs[0] }; + + my $constraint = sprintf( "%s.hit_name = %s", + $syn, + $self->dbc()->db_handle()->quote( $hit_name, SQL_VARCHAR ) ); + + if ( defined($logic_name) ) { + # Add the $logic_name constraint + $constraint = + $self->_logic_name_to_constraint( $constraint, $logic_name ); + } + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_hit_name_unversioned + + Arg [1] : string $hit_name + The beginning of the hit_name of the features to + obtain, e.g. AA768786 would retrieve AA768786.1, + AA768786.2 etc. + Arg [2] : (optional) string $logic_name + The analysis logic name of the type of features to + obtain. + Example : @feats = + @{ $adaptor->fetch_all_by_hit_name( $name, + $logic_name ) }; + Description: Returns a listref of features created from the + database which start with the given hit_name. If + logic name is defined, only features with an analysis + of type $logic_name will be returned. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + Exceptions : thrown if hit_name is not defined + Caller : general + Status : At risk + +=cut + +sub fetch_all_by_hit_name_unversioned { + my ( $self, $hit_name, $logic_name ) = @_; + + if ( !defined($hit_name) ) { + throw("hit_name argument is required"); + } + $hit_name =~ s/_/\\_/; + + #construct a constraint like 't1.hit_name = "123"' + my @tabs = $self->_tables; + my ( $name, $syn ) = @{ $tabs[0] }; + + my $constraint = sprintf( "%s.hit_name LIKE %s", + $syn, + $self->dbc()->db_handle()->quote( $hit_name . '.%', SQL_VARCHAR ) ); + + if ( defined($logic_name) ) { + # Add the $logic_name constraint + $constraint = + $self->_logic_name_to_constraint( $constraint, $logic_name ); + } + + return $self->generic_fetch($constraint); +} + + + +=head2 fetch_all_by_RawContig_and_pid + + Description: DEPRECATED use fetch_all_by_Slice_and_pid instead + +=cut + +sub fetch_all_by_RawContig_and_pid { + my($self, $contig, $pid, $logic_name) = @_; + + my $constraint; + + #get the primary table alias + my @tabs = $self->_tables; + my $alias = $tabs[0]->[1]; + + if(defined $pid) { + $constraint = "${alias}.perc_ident > $pid"; + } + + return $self->fetch_all_by_RawContig_constraint($contig, + $constraint, + $logic_name); +} + + + + +##implemented by subclasses: +# store +# _tables +# _columns +# _obj_from_hashref + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1338 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor - An Abstract Base class for all +FeatureAdaptors + +=head1 SYNOPSIS + +Abstract class - should not be instantiated. Implementation of +abstract methods must be performed by subclasses. + +=head1 DESCRIPTION + +This is a base adaptor for feature adaptors. This base class is simply a way +of eliminating code duplication through the implementation of methods +common to all feature adaptors. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use vars qw(@ISA @EXPORT); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Cache; +use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Iterator; + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +our $SLICE_FEATURE_CACHE_SIZE = 4; +our $MAX_SPLIT_QUERY_SEQ_REGIONS = 3; +our $SILENCE_CACHE_WARNINGS = 0; + +=head2 new + + Arg [1] : list of args @args + Superclass constructor arguments + Example : none + Description: Constructor which warns if caching has been switched off + Returntype : Bio::EnsEMBL::BaseFeatureAdaptor + Exceptions : none + Caller : implementing subclass constructors + Status : Stable + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + if ( defined $self->db->no_cache() && $self->db->no_cache() && ! $SILENCE_CACHE_WARNINGS) { + warning( "You are using the API without caching most recent features. " + . "Performance might be affected." ); + } + return $self; +} + +=head2 start_equals_end + + Arg [1] : (optional) boolean $newval + Example : $bfa->start_equals_end(1); + Description: Getter/Setter for the start_equals_end flag. If set + to true sub _slice_fetch will use a simplified sql to retrieve 1bp slices. + Returntype : boolean + Exceptions : none + Caller : EnsemblGenomes variation DB build + Status : Stable + +=cut + +sub start_equals_end { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'start_equals_end'} = $value; + } + return $self->{'start_equals_end'}; +} + + +=head2 clear_cache + + Args : None + Example : my $sa = + $registry->get_adaptor( 'Mus musculus', 'Core', + 'Slice' ); + my $ga = + $registry->get_adaptor( 'Mus musculus', 'Core', + 'Gene' ); + + my $slice = + $sa->fetch_by_region( 'Chromosome', '1', 1e8, + 1.05e8 ); + + my $genes = $ga->fetch_all_by_Slice($slice); + + $ga->clear_cache(); + + Description : Empties the feature cache associated with this + feature adaptor. + Return type : None + Exceptions : None + Caller : General + Status : At risk (under development) + +=cut + +sub clear_cache { + my ($self) = @_; + %{$self->{_slice_feature_cache}} = (); + return; +} + +=head2 _slice_feature_cache + + Description : Returns the feature cache if we are allowed to cache and + will build it if we need to. We will never return a reference + to the hash to avoid unintentional auto-vivfying caching + Returntype : Bio::EnsEMBL::Utils::Cache + Exceptions : None + Caller : Internal + +=cut + +sub _slice_feature_cache { + my ($self) = @_; + return if $self->db()->no_cache(); + if(! exists $self->{_slice_feature_cache}) { + tie my %cache, 'Bio::EnsEMBL::Utils::Cache', $SLICE_FEATURE_CACHE_SIZE; + $self->{_slice_feature_cache} = \%cache; + } + return $self->{_slice_feature_cache}; +} + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Example : $fts = $a->fetch_all_by_Slice($slice, 'Swall'); + Description: Returns a listref of features created from the database + which are on the Slice defined by $slice. If $logic_name is + defined only features with an analysis of type $logic_name + will be returned. + NOTE: only features that are entirely on the slice's seq_region + will be returned (i.e. if they hang off the start/end of a + seq_region they will be discarded). Features can extend over the + slice boundaries though (in cases where you have a slice that + doesn't span the whole seq_region). + Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates + Exceptions : none + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice, $logic_name) = @_; + #fetch by constraint with empty constraint + return $self->fetch_all_by_Slice_constraint($slice, '', $logic_name); +} + + + +=head2 fetch_Iterator_by_Slice_method + + Arg [1] : CODE ref of Slice fetch method + Arg [2] : ARRAY ref of parameters for Slice fetch method + Arg [3] : Optional int: Slice index in parameters array + Arg [4] : Optional int: Slice chunk size. Default=500000 + Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice_method + ($feature_adaptor->can('fetch_all_by_Slice_Arrays'), + \@fetch_method_params, + 0,#Slice idx + ); + + while(my $feature = $slice_iter->next && defined $feature){ + #Do something here + } + + Description: Creates an Iterator which chunks the query Slice to facilitate + large Slice queries which would have previously run out of memory + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : Throws if mandatory params not valid + Caller : general + Status : at risk + +=cut + +#Does not support Collections. See Funcgen ResultFeatureAdaptor::fetch_collection_Iterator_by_Slice_method + +sub fetch_Iterator_by_Slice_method{ + my ($self, $slice_method_ref, $params_ref, $slice_idx, $chunk_size) = @_; + + if(! ( defined $slice_method_ref && + ref($slice_method_ref) eq 'CODE') + ){ + throw('Must pass a valid Slice fetch method CODE ref'); + } + + if (! ($params_ref && + ref($params_ref) eq 'ARRAY')) { + #Don't need to check size here so long as we have valid Slice + throw('You must pass a method params ARRAYREF'); + } + + $slice_idx = 0 if(! defined $slice_idx); + my $slice = $params_ref->[$slice_idx]; + $chunk_size ||= 1000000; + + my @feat_cache; + my $finished = 0; + my $start = 1; #local coord for sub slice + my $end = $slice->length; + my $num_overlaps = 0; + + my $coderef = + sub { + + while (scalar(@feat_cache) == 0 && + ! $finished) { + + my $new_end = ($start + $chunk_size - 1); + + if ($new_end >= $end) { + # this is our last chunk + $new_end = $end; + $finished = 1; + } + + #Chunk by sub slicing + my $sub_slice = $slice->sub_Slice($start, $new_end); + $params_ref->[$slice_idx] = $sub_slice; + @feat_cache = @{ $slice_method_ref->($self, @$params_ref)}; + + #Remove & count overlapping features + splice(@feat_cache, 0, $num_overlaps) if($num_overlaps); + my $i; + + if (scalar(@feat_cache) > 0) { + + my $feat_end = $feat_cache[$#feat_cache]->seq_region_end; + my $slice_end = $sub_slice->end; + $num_overlaps = 0; + + for ($i = $#feat_cache; $i >=0; $i--) { + + if ($feat_end > $slice_end) { + $feat_end = $feat_cache[$i]->end; + $num_overlaps ++; + } else { + last; + } + + } + } + + # update the start coordinate + $start = $new_end + 1; + } + + #this maybe returning from an undef cache + #Need to sub this out even more? + return shift @feat_cache; + }; + + return Bio::EnsEMBL::Utils::Iterator->new($coderef); +} + + +=head2 fetch_Iterator_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Optional string: logic name of analysis + Arg [3] : Optional int: Chunk size to iterate over. Default is 500000 + Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice($slice); + + while(my $feature = $slice_iter->next && defined $feature){ + #Do something here + } + + Description: Creates an Iterator which chunks the query Slice to facilitate + large Slice queries which would have previously run out of memory + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub fetch_Iterator_by_Slice{ + my ($self, $slice, $logic_name, $chunk_size) = @_; + + my $method_ref = $self->can('fetch_all_by_Slice'); + + return $self->fetch_Iterator_by_Slice_method($method_ref, [$slice, $logic_name], 0, $chunk_size); +} + + +=head2 fetch_all_by_Slice_and_score + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) float $score + lower bound of the the score of the features retrieved + Arg [3] : (optional) string $logic_name + the logic name of the type of features to obtain + Example : $fts = $a->fetch_all_by_Slice_and_score($slice,90,'Swall'); + Description: Returns a list of features created from the database which are + are on the Slice defined by $slice and which have a score + greater than $score. If $logic_name is defined, + only features with an analysis of type $logic_name will be + returned. + Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates + Exceptions : none + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_by_Slice_and_score { + my ( $self, $slice, $score, $logic_name ) = @_; + + my $constraint; + if ( defined($score) ) { + # Get the synonym of the primary_table + my @tabs = $self->_tables(); + my $syn = $tabs[0]->[1]; + + $constraint = sprintf( "%s.score > %s", + $syn, + $self->dbc()->db_handle()->quote( $score, SQL_FLOAT ) ); + } + + return + $self->fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); +} + + +=head2 fetch_all_by_Slice_constraint + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Arg [3] : (optional) string $logic_name + the logic name of the type of features to obtain + Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5'); + Description: Returns a listref of features created from the database which + are on the Slice defined by $slice and fulfill the SQL + constraint defined by $constraint. If logic name is defined, + only features with an analysis of type $logic_name will be + returned. + Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates + Exceptions : thrown if $slice is not defined + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_by_Slice_constraint { + my ( $self, $slice, $constraint, $logic_name ) = @_; + + + my @result = (); + + if ( !ref($slice) + || !( $slice->isa('Bio::EnsEMBL::Slice') + or $slice->isa('Bio::EnsEMBL::LRGSlice') ) ) + { + throw("Bio::EnsEMBL::Slice argument expected."); + } + + $constraint ||= ''; + $constraint = + $self->_logic_name_to_constraint( $constraint, $logic_name ); + + # If the logic name was invalid, undef was returned + if ( !defined($constraint) ) { return [] } + + my $key; + my $cache; + + # Will only use feature_cache if hasn't been no_cache attribute set + if ( + !( defined( $self->db()->no_cache() ) && $self->db()->no_cache() ) ) + { + + #strain test and add to constraint if so to stop caching. + if ( $slice->isa('Bio::EnsEMBL::StrainSlice') ) { + my $string = + $self->dbc()->db_handle()->quote( $slice->strain_name() ); + + if ( $constraint ne "" ) { + $constraint .= " AND $string = $string "; + } else { + $constraint .= " $string = $string "; + } + } + + # Check the cache and return the cached results if we have already + # done this query. The cache key is the made up from the slice + # name, the constraint, and the bound parameters (if there are any). + $key = uc( join( ':', $slice->name(), $constraint ) ); + + my $bind_params = $self->bind_param_generic_fetch(); + + if ( defined($bind_params) ) { + $key .= ':' + . join( ':', map { $_->[0] . '/' . $_->[1] } @{$bind_params} ); + } + + $cache = $self->_slice_feature_cache(); + if ( exists( $cache->{$key} ) ) { + # Clear the bound parameters and return the cached data. + $self->{'_bind_param_generic_fetch'} = (); + return $cache->{$key}; + } + } ## end if ( !( defined( $self...))) + + my $sa = $slice->adaptor(); + + # Hap/PAR support: retrieve normalized 'non-symlinked' slices. + my @proj = @{ $sa->fetch_normalized_slice_projection($slice) }; + + + + if ( !@proj ) { + throw( 'Could not retrieve normalized Slices. ' + . 'Database contains incorrect assembly_exception information.' + ); + } + + # Want to get features on the FULL original slice as well as any + # symlinked slices. + + # Filter out partial slices from projection that are on same + # seq_region as original slice. + + my $sr_id = $slice->get_seq_region_id(); + + @proj = grep { $_->to_Slice->get_seq_region_id() != $sr_id } @proj; + + my $segment = bless( [ 1, $slice->length(), $slice ], + 'Bio::EnsEMBL::ProjectionSegment' ); + push( @proj, $segment ); + + # construct list of Hap/PAR boundaries for entire seq region + my @bounds; + + my $ent_slice = $sa->fetch_by_seq_region_id($sr_id); + if ( $slice->strand() == -1 ) { + $ent_slice = $ent_slice->invert(); + } + + my @ent_proj = + @{ $sa->fetch_normalized_slice_projection($ent_slice) }; + shift(@ent_proj); # skip first + + @bounds = map { $_->from_start() - $slice->start() + 1 } @ent_proj; + + + # fetch features for the primary slice AND all symlinked slices + foreach my $seg (@proj) { + + + my $offset = $seg->from_start(); + my $seg_slice = $seg->to_Slice(); + my $features = + $self->_slice_fetch( $seg_slice, $constraint ); + + # If this was a symlinked slice offset the feature coordinates as + # needed. + if ( $seg_slice->name() ne $slice->name() ) { + + FEATURE: + foreach my $f ( @{$features} ) { + if ( $offset != 1 ) { + $f->{'start'} += $offset - 1; + $f->{'end'} += $offset - 1; + } + + # discard boundary crossing features from symlinked regions + foreach my $bound (@bounds) { + if ( $f->{'start'} < $bound && $f->{'end'} >= $bound ) { + next FEATURE; + } + } + + $f->{'slice'} = $slice; + push( @result, $f ); + } + } else { + push( @result, @{$features} ); + } + } ## end foreach my $seg (@proj) + + # Will only use feature_cache when set attribute no_cache in DBAdaptor + if ( defined($key) ) { + $cache->{$key} = \@result; + } + + return \@result; +} ## end sub fetch_all_by_Slice_constraint + + +=head2 fetch_all_by_logic_name + + Arg [3] : string $logic_name + the logic name of the type of features to obtain + Example : $fs = $a->fetch_all_by_logic_name('foobar'); + Description: Returns a listref of features created from the database. + only features with an analysis of type $logic_name will + be returned. If the logic name is invalid (not in the + analysis table), a reference to an empty list will be + returned. + Returntype : listref of Bio::EnsEMBL::SeqFeatures + Exceptions : thrown if no $logic_name + Caller : General + Status : Stable + +=cut + +sub fetch_all_by_logic_name { + my ( $self, $logic_name ) = @_; + + if ( !defined($logic_name) ) { + throw("Need a logic_name"); + } + + my $constraint = $self->_logic_name_to_constraint( '', $logic_name ); + + if ( !defined($constraint) ) { + warning("Invalid logic name: $logic_name"); + return []; + } + + return $self->generic_fetch($constraint); +} + +# Method that creates an object. Called by the _objs_from_sth() method +# in the sub-classes (the various feature adaptors). Overridden by the +# feature collection classes. + +sub _create_feature { + my ( $self, $feature_type, $args ) = @_; + return $feature_type->new( %{$args} ); +} + +# This is the same as the above, but calls the new_fast() constructor of +# the feature type. + +sub _create_feature_fast { + my ( $self, $feature_type, $args ) = @_; + return $feature_type->new_fast($args); +} + +# +# helper function used by fetch_all_by_Slice_constraint method +# +sub _slice_fetch { + my ( $self, $slice, $orig_constraint ) = @_; + + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + my $slice_cs = $slice->coord_system(); + my $slice_seq_region = $slice->seq_region_name(); + my $slice_seq_region_id = $slice->get_seq_region_id(); + + #get the synonym and name of the primary_table + my @tabs = $self->_tables; + my ( $tab_name, $tab_syn ) = @{ $tabs[0] }; + + #find out what coordinate systems the features are in + my $mcc = $self->db->get_MetaCoordContainer(); + my @feat_css = (); + + my $mca = $self->db->get_MetaContainer(); + my $value_list = $mca->list_value_by_key( $tab_name . "build.level" ); + if ( @$value_list and $slice->is_toplevel() ) { + push @feat_css, $slice_cs; + } else { + @feat_css = + @{ $mcc->fetch_all_CoordSystems_by_feature_type($tab_name) }; + } + + my $asma = $self->db->get_AssemblyMapperAdaptor(); + my @features; + + # fetch the features from each coordinate system they are stored in +COORD_SYSTEM: foreach my $feat_cs (@feat_css) { + my $mapper; + my @coords; + my @ids; + + if ( $feat_cs->equals($slice_cs) ) { + # no mapping is required if this is the same coord system + + my $max_len = $self->_max_feature_length() + || $mcc->fetch_max_length_by_CoordSystem_feature_type( $feat_cs, + $tab_name ); + + my $constraint = $orig_constraint; + + my $sr_id; + if ( $slice->adaptor() ) { + $sr_id = $slice->adaptor()->get_seq_region_id($slice); + } else { + $sr_id = + $self->db()->get_SliceAdaptor()->get_seq_region_id($slice); + } + + # If there is mapping information, use the external_seq_region_id + # to get features. + + my @sr_ids = ($sr_id); + + while (1) { + my $ext_sr_id = $self->get_seq_region_id_external($sr_id); + + if ( $ext_sr_id == $sr_id ) { last } + + push( @sr_ids, $ext_sr_id ); + $sr_id = $ext_sr_id; + } + + $constraint .= " AND " if ($constraint); + + + $constraint .= "${tab_syn}.seq_region_id IN (" + . join( ',', @sr_ids ) . ") AND"; + + #faster query for 1bp slices where SNP data is not compressed + if ( $self->start_equals_end && $slice_start == $slice_end ) { + $constraint .= + " AND ${tab_syn}.seq_region_start = $slice_end" . + " AND ${tab_syn}.seq_region_end = $slice_start"; + + } else { + + if ( !$slice->is_circular() ) { + # Deal with the default case of a non-circular chromosome. + $constraint .= " ${tab_syn}.seq_region_start <= $slice_end AND " + . "${tab_syn}.seq_region_end >= $slice_start"; + + if ( $max_len ) { + my $min_start = $slice_start - $max_len; + $constraint .= " AND ${tab_syn}.seq_region_start >= $min_start"; + } + + } else { + # Deal with the case of a circular chromosome. + if ( $slice_start > $slice_end ) { + $constraint .= " ( ${tab_syn}.seq_region_start >= $slice_start " + . "OR ${tab_syn}.seq_region_start <= $slice_end " + . "OR ${tab_syn}.seq_region_end >= $slice_start " + . "OR ${tab_syn}.seq_region_end <= $slice_end " + . "OR ${tab_syn}.seq_region_start > ${tab_syn}.seq_region_end)"; + + } else { + $constraint .= " ((${tab_syn}.seq_region_start <= $slice_end " + . "AND ${tab_syn}.seq_region_end >= $slice_start) " + . "OR (${tab_syn}.seq_region_start > ${tab_syn}.seq_region_end " + . "AND (${tab_syn}.seq_region_start <= $slice_end " + . "OR ${tab_syn}.seq_region_end >= $slice_start)))"; + } + } + + } + + my $fs = $self->generic_fetch( $constraint, undef, $slice ); + + # features may still have to have coordinates made relative to slice + # start + $fs = $self->_remap( $fs, $mapper, $slice ); + + push @features, @$fs; + } else { + $mapper = $asma->fetch_by_CoordSystems( $slice_cs, $feat_cs ); + + next unless defined $mapper; + + # Get list of coordinates and corresponding internal ids for + # regions the slice spans + @coords = + $mapper->map( $slice_seq_region, $slice_start, $slice_end, + $slice_strand, $slice_cs ); + + @coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords; + + next COORD_SYSTEM if ( !@coords ); + + @ids = map { $_->id() } @coords; + #coords are now id rather than name + # @ids = @{$asma->seq_regions_to_ids($feat_cs, \@ids)}; + + # When regions are large and only partially spanned + # by slice it is faster to to limit the query with + # start and end constraints. Take simple approach: + # use regional constraints if there are less than a + # specific number of regions covered. + + if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS ) { + my $constraint = $orig_constraint; + my $id_str = join( ',', @ids ); + $constraint .= " AND " if ($constraint); + $constraint .= "${tab_syn}.seq_region_id IN ($id_str)"; + my $fs = $self->generic_fetch( $constraint, $mapper, $slice ); + + $fs = $self->_remap( $fs, $mapper, $slice ); + + push @features, @$fs; + + } else { + # do multiple split queries using start / end constraints + + my $max_len = ( + $self->_max_feature_length() + || $mcc->fetch_max_length_by_CoordSystem_feature_type( + $feat_cs, $tab_name + ) ); + + my $len = @coords; + for ( my $i = 0; $i < $len; $i++ ) { + my $constraint = $orig_constraint; + $constraint .= " AND " if ($constraint); + $constraint .= + "${tab_syn}.seq_region_id = " + . $ids[$i] . " AND " + . "${tab_syn}.seq_region_start <= " + . $coords[$i]->end() . " AND " + . "${tab_syn}.seq_region_end >= " + . $coords[$i]->start(); + + if ($max_len) { + my $min_start = $coords[$i]->start() - $max_len; + $constraint .= + " AND ${tab_syn}.seq_region_start >= $min_start"; + } + my $fs = $self->generic_fetch( $constraint, $mapper, $slice ); + + $fs = $self->_remap( $fs, $mapper, $slice ); + + push @features, @$fs; + } + } ## end else [ if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS)] + } ## end else [ if ( $feat_cs->equals(...))] + } ## end foreach my $feat_cs (@feat_css) + + return \@features; +} ## end sub _slice_fetch + + +#for a given seq_region_id, gets the one used in an external database, if present, otherwise, returns the internal one +sub get_seq_region_id_external { + my ( $self, $sr_id ) = @_; + my $cs_a = $self->db()->get_CoordSystemAdaptor(); + return ( exists( $cs_a->{'_internal_seq_region_mapping'}->{$sr_id} ) + ? $cs_a->{'_internal_seq_region_mapping'}->{$sr_id} + : $sr_id ); +} + +#for a given seq_region_id and coord_system, gets the one used in the internal (core) database +sub get_seq_region_id_internal{ + my ( $self, $sr_id ) = @_; + my $cs_a = $self->db()->get_CoordSystemAdaptor(); + return ( exists $cs_a->{'_external_seq_region_mapping'}->{$sr_id} + ? $cs_a->{'_external_seq_region_mapping'}->{$sr_id} + : $sr_id); +} + +# +# Helper function containing some common feature storing functionality +# +# Given a Feature this will return a copy (or the same feature if no changes +# to the feature are needed) of the feature which is relative to the start +# of the seq_region it is on. The seq_region_id of the seq_region it is on +# is also returned. +# +# This method will also ensure that the database knows which coordinate +# systems that this feature is stored in. +# + +sub _pre_store { + my $self = shift; + my $feature = shift; + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Expected Feature argument.'); + } + my $slice = $feature->slice(); + + $self->_check_start_end_strand($feature->start(),$feature->end(), + $feature->strand(), $slice); + + + my $db = $self->db(); + + my $slice_adaptor = $db->get_SliceAdaptor(); + + if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw('Feature must be attached to Slice to be stored.'); + } + + # make sure feature coords are relative to start of entire seq_region + + if($slice->start != 1 || $slice->strand != 1) { + #move feature onto a slice of the entire seq_region + $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(), + $slice->seq_region_name(), + undef, #start + undef, #end + undef, #strand + $slice->coord_system->version()); + + $feature = $feature->transfer($slice); + + if(!$feature) { + throw('Could not transfer Feature to slice of ' . + 'entire seq_region prior to storing'); + } + } + + # Ensure this type of feature is known to be stored in this coord system. + my $cs = $slice->coord_system; + + my ($tab) = $self->_tables(); + my $tabname = $tab->[0]; + + my $mcc = $db->get_MetaCoordContainer(); + + $mcc->add_feature_type($cs, $tabname, $feature->length); + + my $seq_region_id = $slice_adaptor->get_seq_region_id($slice); + + if(!$seq_region_id) { + throw('Feature is associated with seq_region which is not in this DB.'); + } + + return ($feature, $seq_region_id); +} + + +# The same function as _pre_store +# This one is used to store user uploaded features in XXX_userdata db + +sub _pre_store_userdata { + my $self = shift; + my $feature = shift; + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Expected Feature argument.'); + } + + my $slice = $feature->slice(); + my $slice_adaptor = $slice->adaptor; + + $self->_check_start_end_strand($feature->start(),$feature->end(), + $feature->strand(), $slice); + + + if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw('Feature must be attached to Slice to be stored.'); + } + + # make sure feature coords are relative to start of entire seq_region + + if($slice->start != 1 || $slice->strand != 1) { + #move feature onto a slice of the entire seq_region + $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(), + $slice->seq_region_name(), + undef, #start + undef, #end + undef, #strand + $slice->coord_system->version()); + + $feature = $feature->transfer($slice); + + if(!$feature) { + throw('Could not transfer Feature to slice of ' . + 'entire seq_region prior to storing'); + } + } + + # Ensure this type of feature is known to be stored in this coord system. + my $cs = $slice->coord_system; + + my ($tab) = $self->_tables(); + my $tabname = $tab->[0]; + + my $db = $self->db; + my $mcc = $db->get_MetaCoordContainer(); + + $mcc->add_feature_type($cs, $tabname, $feature->length); + + my $seq_region_id = $slice_adaptor->get_seq_region_id($slice); + + if(!$seq_region_id) { + throw('Feature is associated with seq_region which is not in this DB.'); + } + + return ($feature, $seq_region_id); +} + + +# +# helper function used to validate start/end/strand and +# hstart/hend/hstrand etc. +# +sub _check_start_end_strand { + my $self = shift; + my $start = shift; + my $end = shift; + my $strand = shift; + my $slice = shift; + + # + # Make sure that the start, end, strand are valid + # + if(int($start) != $start) { + throw("Invalid Feature start [$start]. Must be integer."); + } + if(int($end) != $end) { + throw("Invalid Feature end [$end]. Must be integer."); + } + if(int($strand) != $strand || $strand < -1 || $strand > 1) { + throw("Invalid Feature strand [$strand]. Must be -1, 0 or 1."); + } + if($end < $start && !$slice->is_circular()) { + throw("Invalid Feature start/end [$start/$end]. Start must be less " . + "than or equal to end."); + } + + return 1; +} + + +# +# Given a list of features checks if they are in the correct coord system +# by looking at the first features slice. If they are not then they are +# converted and placed on the slice. +# +sub _remap { + my ( $self, $features, $mapper, $slice ) = @_; + + #check if any remapping is actually needed + if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') || + $features->[0]->slice == $slice)) { + return $features; + } + + #remapping has not been done, we have to do our own conversion from + #to slice coords + + my @out; + + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + my $slice_cs = $slice->coord_system(); + + my ($seq_region, $start, $end, $strand); + + my $slice_seq_region_id = $slice->get_seq_region_id(); + my $slice_seq_region = $slice->seq_region_name(); + + foreach my $f (@$features) { + #since feats were obtained in contig coords, attached seq is a contig + my $fslice = $f->slice(); + if(!$fslice) { + throw("Feature does not have attached slice.\n"); + } + my $fseq_region = $fslice->seq_region_name(); + my $fseq_region_id = $fslice->get_seq_region_id(); + my $fcs = $fslice->coord_system(); + + if(!$slice_cs->equals($fcs)) { + #slice of feature in different coord system, mapping required + + ($seq_region, $start, $end, $strand) = + $mapper->fastmap($fseq_region_id,$f->start(),$f->end(),$f->strand(),$fcs); + + # undefined start means gap + next if(!defined $start); + } else { + $start = $f->start(); + $end = $f->end(); + $strand = $f->strand(); + $seq_region = $f->slice->seq_region_name(); + } + + # maps to region outside desired area + next if ($start > $slice_end) || ($end < $slice_start) || + ($slice_seq_region ne $seq_region); + + #shift the feature start, end and strand in one call + if($slice_strand == -1) { + $f->move( $slice_end - $end + 1, $slice_end - $start + 1, $strand * -1 ); + } else { + $f->move( $start - $slice_start + 1, $end - $slice_start + 1, $strand ); + } + + $f->slice($slice); + + push @out,$f; + } + + return \@out; +} + + +# +# Given a logic name and an existing constraint this will +# add an analysis table constraint to the feature. Note that if no +# analysis_id exists in the columns of the primary table then no +# constraint is added at all +# +sub _logic_name_to_constraint { + my $self = shift; + my $constraint = shift; + my $logic_name = shift; + + return $constraint if(!$logic_name); + + #make sure that an analysis_id exists in the primary table + my ($prim_tab) = $self->_tables(); + my $prim_synonym = $prim_tab->[1]; + + my $found_analysis=0; + foreach my $col ($self->_columns) { + my ($syn,$col_name) = split(/\./,$col); + next if($syn ne $prim_synonym); + if($col_name eq 'analysis_id') { + $found_analysis = 1; + last; + } + } + + if(!$found_analysis) { + warning("This feature is not associated with an analysis.\n" . + "Ignoring logic_name argument = [$logic_name].\n"); + return $constraint; + } + + my $aa = $self->db->get_AnalysisAdaptor(); + my $an = $aa->fetch_by_logic_name($logic_name); + + if ( !defined($an) ) { + return undef; + } + + my $an_id = $an->dbID(); + + $constraint .= ' AND' if($constraint); + $constraint .= " ${prim_synonym}.analysis_id = $an_id"; + return $constraint; +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::SeqFeature + Example : $adaptor->store(@feats); + Description: ABSTRACT Subclasses are responsible for implementing this + method. It should take a list of features and store them in + the database. + Returntype : none + Exceptions : thrown method is not implemented by subclass + Caller : general + Status : At Risk + : throws if called. + +=cut + +sub store{ + my $self = @_; + + throw("Abstract method store not defined by implementing subclass\n"); +} + + +=head2 remove + + Arg [1] : A feature $feature + Example : $feature_adaptor->remove($feature); + Description: This removes a feature from the database. The table the + feature is removed from is defined by the abstract method + _tablename, and the primary key of the table is assumed + to be _tablename() . '_id'. The feature argument must + be an object implementing the dbID method, and for the + feature to be removed from the database a dbID value must + be returned. + Returntype : none + Exceptions : thrown if $feature arg does not implement dbID(), or if + $feature->dbID is not a true value + Caller : general + Status : Stable + +=cut + + +sub remove { + my ($self, $feature) = @_; + + if(!$feature || !ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Feature argument is required'); + } + + if(!$feature->is_stored($self->db)) { + throw("This feature is not stored in this database"); + } + + my @tabs = $self->_tables; + my ($table) = @{$tabs[0]}; + + my $sth = $self->prepare("DELETE FROM $table WHERE ${table}_id = ?"); + $sth->bind_param(1,$feature->dbID,SQL_INTEGER); + $sth->execute(); + + #unset the feature dbID ad adaptor + $feature->dbID(undef); + $feature->adaptor(undef); + + return; +} + + +=head2 remove_by_Slice + + Arg [1] : Bio::Ensembl::Slice $slice + Example : $feature_adaptor->remove_by_Slice($slice); + Description: This removes features from the database which lie on a region + represented by the passed in slice. Only features which are + fully contained by the slice are deleted; features which overlap + the edge of the slice are not removed. + The table the features are removed from is defined by + the abstract method_tablename. + Returntype : none + Exceptions : thrown if no slice is supplied + Caller : general + Status : Stable + +=cut + +sub remove_by_Slice { + my ($self, $slice) = @_; + + if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("Slice argument is required"); + } + + my @tabs = $self->_tables; + my ($table_name) = @{$tabs[0]}; + + my $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice); + my $start = $slice->start(); + my $end = $slice->end(); + + # + # Delete only features fully on the slice, not overlapping ones + # + my $sth = $self->prepare("DELETE FROM $table_name " . + "WHERE seq_region_id = ? " . + "AND seq_region_start >= ? " . + "AND seq_region_end <= ?"); + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->execute(); + $sth->finish(); +} + + +# +# Internal function. Allows the max feature length which is normally +# retrieved from the meta_coord table to be overridden. This allows +# for some significant optimizations to be put in when it is known +# that requested features will not be over a certain size. +# +sub _max_feature_length { + my $self = shift; + return $self->{'_max_feature_length'} = shift if(@_); + return $self->{'_max_feature_length'}; +} + + +# +# Lists all seq_region_ids that a particular feature type is found on. +# Useful e.g. for finding out which seq_regions have genes. +# Returns a listref of seq_region_ids. +# +sub _list_seq_region_ids { + my ($self, $table) = @_; + + my @out; + + my $sql = qq( + SELECT DISTINCT + sr.seq_region_id + FROM seq_region sr, + $table a, + coord_system cs + WHERE sr.seq_region_id = a.seq_region_id + AND sr.coord_system_id = cs.coord_system_id + AND cs.species_id = ?); + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + + $sth->execute(); + + while (my ($id) = $sth->fetchrow) { + push(@out, $id); + } + + $sth->finish; + + return \@out; +} + + +=head1 DEPRECATED METHODS + +=cut + + +=head2 fetch_all_by_RawContig_constraint + + Description: DEPRECATED use fetch_all_by_RawContig_constraint instead + +=cut + +sub fetch_all_by_RawContig_constraint { + my $self = shift; + deprecate('Use fetch_all_by_Slice_constraint() instead.'); + return $self->fetch_all_by_slice_constraint(@_); +} + +=head2 fetch_all_by_RawContig + + Description: DEPRECATED use fetch_all_by_Slice instead + +=cut + +sub fetch_all_by_RawContig { + my $self = shift; + deprecate('Use fetch_all_by_Slice() instead.'); + return $self->fetch_all_by_Slice(@_); +} + +=head2 fetch_all_by_RawContig_and_score + + Description: DEPRECATED use fetch_all_by_Slice_and_score instead + +=cut + +sub fetch_all_by_RawContig_and_score{ + my $self = shift; + deprecate('Use fetch_all_by_Slice_and_score() instead.'); + return $self->fetch_all_by_Slice_and_score(@_); +} + +=head2 remove_by_RawContig + + Description: DEPRECATED use remove_by_Slice instead + +=cut + +sub remove_by_RawContig { + my $self = shift; + deprecate("Use remove_by_Slice instead"); + return $self->remove_by_Slice(@_); +} + + +sub remove_by_analysis_id { + my ($self, $analysis_id) = @_; + + $analysis_id or throw("Must call with analysis id"); + + my @tabs = $self->_tables; + my ($tablename) = @{$tabs[0]}; + + my $sql = "DELETE FROM $tablename WHERE analysis_id = $analysis_id"; +# warn "SQL : $sql"; + + my $sth = $self->prepare($sql); + $sth->execute(); + $sth->finish(); +} + +sub remove_by_feature_id { + my ($self, $features_list) = @_; + + my @feats = @$features_list or throw("Must call store with features"); + + my @tabs = $self->_tables; + my ($tablename) = @{$tabs[0]}; + + my $sql = sprintf "DELETE FROM $tablename WHERE ${tablename}_id IN (%s)", join ', ', @feats; +# warn "SQL : $sql"; + + my $sth = $self->prepare($sql); + $sth->execute(); + $sth->finish(); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseMetaContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/BaseMetaContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,418 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::BaseMetaContainer - Encapsulates all generic access +to database meta information + +=head1 SYNOPSIS + + my $meta_container = $db_adaptor->get_MetaContainer(); + + my @mapping_info = + @{ $meta_container->list_value_by_key('assembly.mapping') }; + +=head1 DESCRIPTION + + An object that encapsulates access to db meta data + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::BaseMetaContainer; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +# new() is inherited from Bio::EnsEMBL::DBSQL::BaseAdaptor + +=head2 get_schema_version + + Arg [1] : none + Example : $schema_ver = $meta_container->get_schema_version(); + Description: Retrieves the schema version from the database meta table + Returntype : int + Exceptions : none + Caller : ? + Status : Medium risk + +=cut + +sub get_schema_version { + my $self = shift; + + my $arrRef = $self->list_value_by_key('schema_version'); + + if (@$arrRef) { + my ($ver) = ( $arrRef->[0] =~ /^\s*(\d+)\s*$/ ); + if ( !defined($ver) ) { # old style format + return 0; + } + return $ver; + } else { + warning( + sprintf( + "Please insert meta_key 'schema_version' " + . "in meta table on core database '%s'\n", + $self->dbc()->dbname() ) ); + } + + return 0; +} + + +=head2 list_value_by_key + + Arg [1] : string $key + the key to obtain values from the meta table with + Example : my @values = @{ $meta_container->list_value_by_key($key) }; + Description: gets a value for a key. Can be anything + Returntype : listref of strings + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_value_by_key { + my ( $self, $key ) = @_; + + $self->{'cache'} ||= {}; + + if ( exists $self->{'cache'}->{$key} ) { + return $self->{'cache'}->{$key}; + } + + my $sth; + + if ( !$self->_species_specific_key($key) ) { + $sth = + $self->prepare( "SELECT meta_value " + . "FROM meta " + . "WHERE meta_key = ? " + . "AND species_id IS NULL " + . "ORDER BY meta_id" ); + } else { + $sth = + $self->prepare( "SELECT meta_value " + . "FROM meta " + . "WHERE meta_key = ? " + . "AND species_id = ? " + . "ORDER BY meta_id" ); + $sth->bind_param( 2, $self->species_id(), SQL_INTEGER ); + } + + $sth->bind_param( 1, $key, SQL_VARCHAR ); + $sth->execute(); + + my @result; + while ( my $arrRef = $sth->fetchrow_arrayref() ) { + push( @result, $arrRef->[0] ); + } + + $sth->finish(); + $self->{'cache'}->{$key} = \@result; + + return \@result; +} ## end sub list_value_by_key + +=head2 single_value_by_key + + Arg [1] : string $key + the key to obtain values from the meta table with + Arg [2] : boolean $warn + If true will cause the code to warn the non-existence of a value + Example : my $value = $mc->single_value_by_key($key); + Description: Gets a value for a key. Can be anything + Returntype : Scalar + Exceptions : Raised if more than 1 meta item is returned + +=cut + +sub single_value_by_key { + my ($self, $key, $warn) = @_; + my $results = $self->list_value_by_key($key); + if(defined $results) { + my $count = scalar(@{$results}); + if($count == 1) { + my ($value) = @{$results}; + return $value; + } + elsif($count == 0) { + if($warn) { + my $group = $self->db()->group(); + my $msg = sprintf(qq{Please insert meta_key '%s' in meta table at %s db\n}, $key, $group); + warning($msg); + } + } + else { + my $values = join(q{,}, @{$results}); + throw sprintf(q{Found the values [%s] for the key '%s'}, $values, $key); + } + } + return; +} ## end sub single_value_by_key + +=head2 store_key_value + + Arg [1] : string $key + a key under which $value should be stored + Arg [2] : string $value + the value to store in the meta table + Example : $meta_container->store_key_value($key, $value); + Description: stores a value in the meta container, accessable by a key + Returntype : none + Exceptions : Thrown if the key/value already exists. + Caller : ? + Status : Stable + +=cut + +sub store_key_value { + my ( $self, $key, $value ) = @_; + + if ( $self->key_value_exists( $key, $value ) ) { + warn( "Key-value pair '$key'-'$value' " + . "already exists in the meta table; " + . "not storing duplicate" ); + return; + } + + my $sth; + + if ( !$self->_species_specific_key($key) ) { + $sth = $self->prepare( + 'INSERT INTO meta (meta_key, meta_value, species_id) ' + . 'VALUES(?, ?, \N)' ); + } else { + $sth = $self->prepare( + 'INSERT INTO meta (meta_key, meta_value, species_id) ' + . 'VALUES (?, ?, ?)' ); + $sth->bind_param( 3, $self->species_id(), SQL_INTEGER ); + } + + $sth->bind_param( 1, $key, SQL_VARCHAR ); + $sth->bind_param( 2, $value, SQL_VARCHAR ); + $sth->execute(); + + $self->{'cache'} ||= {}; + + delete $self->{'cache'}->{$key}; +} ## end sub store_key_value + +=head2 update_key_value + + Arg [1] : string $key + a key under which $value should be updated + Arg [2] : string $value + the value to update in the meta table + Example : $meta_container->update_key_value($key, $value); + Description: update a value in the meta container, accessable by a key + Returntype : none + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub update_key_value { + my ( $self, $key, $value ) = @_; + + my $sth; + + if ( !$self->_species_specific_key($key) ) { + $sth = + $self->prepare( 'UPDATE meta SET meta_value = ? ' + . 'WHERE meta_key = ?' + . 'AND species_id IS NULL' ); + } else { + $sth = + $self->prepare( 'UPDATE meta ' + . 'SET meta_value = ? ' + . 'WHERE meta_key = ? ' + . 'AND species_id = ?' ); + $sth->bind_param( 3, $self->species_id(), SQL_INTEGER ); + } + + $sth->bind_param( 1, $value, SQL_VARCHAR ); + $sth->bind_param( 2, $key, SQL_VARCHAR ); + $sth->execute(); + +} ## end sub update_key_value + + +=head2 delete_key + + Arg [1] : string $key + The key which should be removed from the database. + Example : $meta_container->delete_key('sequence.compression'); + Description: Removes all rows from the meta table which have a meta_key + equal to $key. + Returntype : none + Exceptions : none + Caller : dna_compress script, general + Status : Stable + +=cut + +sub delete_key { + my ( $self, $key ) = @_; + + my $sth; + + if ( !$self->_species_specific_key($key) ) { + $sth = + $self->prepare( 'DELETE FROM meta ' + . 'WHERE meta_key = ?' + . 'AND species_id IS NULL' ); + } else { + $sth = + $self->prepare( 'DELETE FROM meta ' + . 'WHERE meta_key = ? ' + . 'AND species_id = ?' ); + $sth->bind_param( 2, $self->species_id(), SQL_INTEGER ); + } + + $sth->bind_param( 1, $key, SQL_VARCHAR ); + $sth->execute(); + + delete $self->{'cache'}->{$key}; +} + +=head2 delete_key_value + + Arg [1] : string $key + The key which should be removed from the database. + Arg [2] : string $value + The value to be removed. + Example : $meta_container->delete_key('patch', 'patch_39_40_b.sql|xref_unique_constraint'); + Description: Removes all rows from the meta table which have a meta_key + equal to $key, AND a meta_value equal to $value. + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub delete_key_value { + my ( $self, $key, $value ) = @_; + + my $sth; + + if ( !$self->_species_specific_key($key) ) { + $sth = + $self->prepare( 'DELETE FROM meta ' + . 'WHERE meta_key = ? ' + . 'AND meta_value = ?' + . 'AND species_id IS NULL' ); + } else { + $sth = + $self->prepare( 'DELETE FROM meta ' + . 'WHERE meta_key = ? ' + . 'AND meta_value = ? ' + . 'AND species_id = ?' ); + $sth->bind_param( 3, $self->species_id(), SQL_INTEGER ); + } + + $sth->bind_param( 1, $key, SQL_VARCHAR ); + $sth->bind_param( 2, $value, SQL_VARCHAR ); + $sth->execute(); + + delete $self->{'cache'}->{$key}; +} ## end sub delete_key_value + +=head2 key_value_exists + + Arg [1] : string $key + the key to check + Arg [2] : string $value + the value to check + Example : if ($meta_container->key_value_exists($key, $value)) ... + Description: Return true (1) if a particular key/value pair exists, + false (0) otherwise. + Returntype : boolean + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub key_value_exists { + my ( $self, $key, $value ) = @_; + + my $sth; + + if ( !$self->_species_specific_key($key) ) { + $sth = + $self->prepare( 'SELECT meta_value ' + . 'FROM meta ' + . 'WHERE meta_key = ? ' + . 'AND meta_value = ?' + . 'AND species_id IS NULL' ); + } else { + $sth = + $self->prepare( 'SELECT meta_value ' + . 'FROM meta ' + . 'WHERE meta_key = ? ' + . 'AND meta_value = ? ' + . 'AND species_id = ?' ); + $sth->bind_param( 3, $self->species_id(), SQL_INTEGER ); + } + + $sth->bind_param( 1, $key, SQL_VARCHAR ); + $sth->bind_param( 2, $value, SQL_VARCHAR ); + $sth->execute(); + + while ( my $arrRef = $sth->fetchrow_arrayref() ) { + if ( $arrRef->[0] eq $value ) { + $sth->finish(); + return 1; + } + } + + return 0; +} ## end sub key_value_exists + +# This utility method determines whether the key is a species-specific +# meta key or not. If the key is either 'patch' or 'schema_version', +# then it is not species-specific. + +# FIXME variation team messed up in release 65 and added the ploidy +# entry without species_id - this will be corrected for release 66, +# for now, I've added it to the list of allowed non-species specific + +sub _species_specific_key { + my ( $self, $key ) = @_; + + return ( $key ne 'patch' + && $key ne 'schema_version' + && $key ne 'schema_type' + && $key ne 'ploidy'); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/CompressedSequenceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/CompressedSequenceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,205 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::CompressedSequenceAdaptor - Facilitates DB storage and retrieval of compressed sequence + +=head1 SYNOPSIS + + $seq_adptr = $database_adaptor->get_SequenceAdaptor(); + + $dna = + ${ $seq_adptr->fetch_by_Slice_start_end_strand( $slice, 1, 1000, + -1 ) }; + +=head1 DESCRIPTION + +An adaptor for the retrieval of compressed DNA sequence from the EnsEMBL +database + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::CompressedSequenceAdaptor; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::SequenceAdaptor; + +@ISA = qw(Bio::EnsEMBL::DBSQL::SequenceAdaptor); + + +sub _fetch_seq { + my $self = shift; + my $seq_region_id = shift; + my $start = shift; + my $len = shift; + + #calculate the offset and start in the compressed sequence + my $comp_start = ($start-1 >> 2) + 1; + my $comp_len = ($len >> 2) + 2; + + my ($bvector, $nline); + + my $sth = $self->prepare( + "SELECT SUBSTRING( d.sequence, ?, ?), n_line + FROM dnac d + WHERE d.seq_region_id = ?"); + $sth->bind_param(1,$comp_start,SQL_INTEGER); + $sth->bind_param(2,$comp_len ,SQL_INTEGER); + $sth->bind_param(3,$seq_region_id,SQL_INTEGER); + $sth->execute(); + $sth->bind_columns(\$bvector, \$nline); + $sth->fetch(); + $sth->finish(); + + #convert sequence from binary string to 0123 string + my $bitlen = length($bvector) << 2; + my $str = ''; + for(my $i=0; $i < $bitlen; $i++) { + $str .= vec($bvector, $i, 2); + } + + #convert from 0123 to ACTG + $str =~ tr/0123/ACTG/; + + $str = substr($str, ($start-1)%4, $len); + + #expand the nlines and place them back in the sequence + my @nlines = split(/:/, $nline); + foreach my $nl (@nlines) { + my ($offset,$char,$nlen) = $nl =~ /(\d+)(\D)(\d+)/; + + #skip nlines entirely out of range + next if(($offset+$nlen-1) < $start || $offset > ($start+$len-1)); + + #obtain relative offset into requested region + $offset = $offset - $start + 1; + + #nlines that partially overlap requested region have to be shrunk + if($offset < 1) { + $nlen = $nlen - (1-$offset); + $offset = 1; + } + if($offset + $nlen > $start+$len) { + $nlen = $len - $offset + 1; + } + + substr($str,$offset-1,$nlen) = $char x $nlen; + } + + return \$str; +} + + +=head2 store + + Arg [1] : string $seq_region_id the id of the sequence region this dna + will be associated with. + Arg [2] : string reference $sequence the dna sequence to be stored in + the database + Example : $dbID = $seq_adaptor->store(12,\'ACTGGGTACCAAACAAACACAACA'); + Description: stores a dna sequence in the databases dna table and returns the + database identifier for the new record. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store + Status : Stable + +=cut + +sub store { + my ($self, $seq_region_id, $sequence) = @_; + + if(!$seq_region_id) { + throw('seq_region_id is required'); + } + + $sequence = uc($sequence); + + my $bvector = ''; + + #convert sequence to 0s,1s,2s and 3s + $sequence =~ tr/ACTG/0123/; + + #nlines cover sequence which is not ACTG such as N + #nline format is a set of colon delimited int, char, int triplets: + # + my($nline_char,$nline_len,$nline_off); + my @nlines; + + my $len = length($sequence); + for(my $i=0; $i < $len; $i++) { + my $char = substr($sequence,$i,1); + + #quickly check if this character was an A,C,T or G (and was converted to + # a 0,1,2,3) + if($char =~ /[0-3]/) { + vec($bvector, $i,2) = $char; + if($nline_char) { + #end of an nline + push @nlines, "$nline_off$nline_char$nline_len"; + $nline_char = undef; + $nline_len = 0; + $nline_off = 0; + } + } else { + #this was not an ACTG + if($nline_char) { + if($nline_char eq $char) { + #continuation of an nline + $nline_len++; + } else { + #end of a previous nline and start of a new one + push @nlines, "$nline_off$nline_char$nline_len"; + $nline_char = $char; + $nline_len = 1; + $nline_off = $i+1; + } + } else { + #start of a new nline + $nline_char = $char; + $nline_len = 1; + $nline_off = $i+1; + } + $char = 0; #need to put numeric val into bitvector despite nline + } + + vec($bvector, $i,2) = $char; + } + + my $nline = join(':', @nlines); + my $statement = $self->prepare( + "INSERT INTO dnac(seq_region_id, sequence, n_line) VALUES(?,?,?)"); + + $statement->bind_param(1,$seq_region_id,SQL_INTEGER); + $statement->bind_param(2,$bvector,SQL_BLOB); + $statement->bind_param(3,$nline,SQL_LONGVARCHAR); + $statement->execute(); + + $statement->finish(); + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1074 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::CoordSystemAdaptor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "coordsystem" ); + + # + # Get all coord systems in the database: + # + foreach my $cs ( @{ $csa->fetch_all() } ) { + print $cs->name, ' ', $cs->version, "\n"; + } + + # + # Fetching by name: + # + + # use the default version of coord_system 'chromosome' (e.g. NCBI33): + $cs = $csa->fetch_by_name('chromosome'); + + # get an explicit version of coord_system 'chromosome': + $cs = $csa->fetch_by_name( 'chromsome', 'NCBI34' ); + + # get all coord_systems of name 'chromosome': + foreach $cs ( @{ $csa->fetch_all_by_name('chromosome') } ) { + print $cs->name, ' ', $cs->version, "\n"; + } + + # + # Fetching by rank: + # + $cs = $csa->fetch_by_rank(2); + + # + # Fetching the pseudo coord system 'toplevel' + # + + # Get the default top_level coord system: + $cs = $csa->fetch_top_level(); + + # can also use an alias in fetch_by_name: + $cs = $csa->fetch_by_name('toplevel'); + + # can also request toplevel using rank=0 + $cs = $csa->fetch_by_rank(0); + + # + # Fetching by sequence level: + # + + # Get the coord system which is used to store sequence: + $cs = $csa->fetch_sequence_level(); + + # can also use an alias in fetch_by_name: + $cs = $csa->fetch_by_name('seqlevel'); + + # + # Fetching by id + # + $cs = $csa->fetch_by_dbID(1); + + +=head1 DESCRIPTION + +This adaptor allows the querying of information from the coordinate +system adaptor. + +Note that many coordinate systems do not have a concept of a version +for the entire coordinate system (though they may have a per-sequence +version). The 'chromosome' coordinate system usually has a version +(i.e. the assembly version) but the clonal coordinate system does not +(despite having individual sequence versions). In the case where a +coordinate system does not have a version an empty string ('') is used +instead. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::CoordSystemAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::CoordSystem; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Arg [1] : See BaseAdaptor for arguments (none specific to this + subclass) + Example : $cs = $db->get_CoordSystemAdaptor(); #better than new() + Description: Creates a new CoordSystem adaptor and caches the contents + of the coord_system table in memory. + Returntype : Bio::EnsEMBL::DBSQL::CoordSystemAdaptor + Exceptions : none + Caller : + Status : Stable + +=cut + +sub new { + my ( $proto, @args ) = @_; + + my $class = ref($proto) || $proto; + my $self = $class->SUPER::new(@args); + + # + # Cache the entire contents of the coord_system table cross-referenced + # by dbID and name. + # + + # keyed on name, list of coord_system value + $self->{'_name_cache'} = {}; + + # keyed on id, coord_system value + $self->{'_dbID_cache'} = {}; + + # keyed on rank + $self->{'_rank_cache'} = {}; + + # keyed on id, 1/undef values + $self->{'_is_sequence_level'} = {}; + $self->{'_is_default_version'} = {}; + + #cache to store the seq_region_mapping information + #from internal->external + $self->{'_internal_seq_region_mapping'} = {}; + #from external->internal + $self->{'_external_seq_region_mapping'} = {}; + + my $sth = $self->prepare( + 'SELECT coord_system_id, name, rank, version, attrib ' + . 'FROM coord_system ' + . 'WHERE species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my ( $dbID, $name, $rank, $version, $attrib ); + $sth->bind_columns( \( $dbID, $name, $rank, $version, $attrib ) ); + + while ( $sth->fetch() ) { + my $seq_lvl = 0; + my $default = 0; + + if ( defined($attrib) ) { + foreach my $attrib ( split( ',', $attrib ) ) { + $self->{"_is_$attrib"}->{$dbID} = 1; + if ( $attrib eq 'sequence_level' ) { + $seq_lvl = 1; + } elsif ( $attrib eq 'default_version' ) { + $default = 1; + } + } + } + + my $cs = + Bio::EnsEMBL::CoordSystem->new( -DBID => $dbID, + -ADAPTOR => $self, + -NAME => $name, + -VERSION => $version, + -RANK => $rank, + -SEQUENCE_LEVEL => $seq_lvl, + -DEFAULT => $default ); + + $self->{'_dbID_cache'}->{$dbID} = $cs; + + $self->{'_name_cache'}->{ lc($name) } ||= []; + $self->{'_rank_cache'}->{$rank} = $cs; + + push @{ $self->{'_name_cache'}->{ lc($name) } }, $cs; + + } ## end while ( $sth->fetch() ) + $sth->finish(); + + $self->_cache_mapping_paths(); + + $self->_cache_seq_region_mapping(); + + return $self; +} ## end sub new + +sub _cache_seq_region_mapping { + # + # This cache will load the information from the seq_region_table, if + # any, to allow mapping between internal and external seq_region_id. + # + + my ($self) = @_; + + # For a given core database, will return the schema_build information. + my $schema_build = $self->db->_get_schema_build(); + + # Prepare the query to get relation for the current database being + # used. + my $sql = qq( + SELECT s.internal_seq_region_id, + s.external_seq_region_id + FROM seq_region_mapping s, + mapping_set ms, + seq_region sr, + coord_system cs + WHERE ms.mapping_set_id = s.mapping_set_id + AND ms.schema_build = ? + AND s.internal_seq_region_id = sr.seq_region_id + AND sr.coord_system_id = cs.coord_system_id + AND cs.species_id = ?); + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $schema_build, SQL_VARCHAR ); + $sth->bind_param( 2, $self->species_id(), SQL_INTEGER ); + + $sth->execute(); + + # Load the cache: + foreach my $row ( @{ $sth->fetchall_arrayref() } ) { + # internal->external + $self->{'_internal_seq_region_mapping'}->{ $row->[0] } = $row->[1]; + # external->internal + $self->{'_external_seq_region_mapping'}->{ $row->[1] } = $row->[0]; + } + + $sth->finish(); + +} ## end sub _cache_seq_region_mapping + + +sub _cache_mapping_paths { + # Retrieve a list of available mappings from the meta table. This + # may eventually be moved a table of its own if this proves too + # cumbersome. + + my ($self) = @_; + + my %mapping_paths; + my $mc = $self->db()->get_MetaContainer(); + +MAP_PATH: + foreach + my $map_path ( @{ $mc->list_value_by_key('assembly.mapping') } ) + { + my @cs_strings = split( /[|#]/, $map_path ); + + if ( scalar(@cs_strings) < 2 ) { + warning( "Incorrectly formatted assembly.mapping value in meta " + . "table: $map_path" ); + next MAP_PATH; + } + + my @coord_systems; + foreach my $cs_string (@cs_strings) { + my ( $name, $version ) = split( /:/, $cs_string ); + + my $cs = $self->fetch_by_name( $name, $version ); + + if ( !defined($cs) ) { + warning( "Unknown coordinate system specified in meta table " + . " assembly.mapping:\n $name:$version" ); + next MAP_PATH; + } + + push( @coord_systems, $cs ); + } + + # If the delimiter is a '#' we want a special case, multiple parts + # of the same component map to the same assembly part. As this + # looks like the "long" mapping, we just make the path a bit longer + # :-) + + if ( index( $map_path, '#' ) != -1 && scalar(@coord_systems) == 2 ) + { + splice( @coord_systems, 1, 0, (undef) ); + } + + my $cs1 = $coord_systems[0]; + my $cs2 = $coord_systems[$#coord_systems]; + + my $key1 = $cs1->name() . ':' . $cs1->version(); + my $key2 = $cs2->name() . ':' . $cs2->version(); + + if ( exists( $mapping_paths{"$key1|$key2"} ) ) { + warning( "Meta table specifies multiple mapping paths between " + . "coord systems $key1 and $key2.\n" + . "Choosing shorter path arbitrarily." ); + + if ( scalar( @{ $mapping_paths{"$key1|$key2"} } ) < + scalar(@coord_systems) ) + { + next MAP_PATH; + } + } + + $mapping_paths{"$key1|$key2"} = \@coord_systems; + } ## end foreach my $map_path ( @{ $mc... + + # Create the pseudo coord system 'toplevel' and cache it so that only + # one of these is created for each database. + + my $toplevel = + Bio::EnsEMBL::CoordSystem->new( -TOP_LEVEL => 1, + -NAME => 'toplevel', + -ADAPTOR => $self ); + + $self->{'_top_level'} = $toplevel; + $self->{'_mapping_paths'} = \%mapping_paths; + + return 1; +} ## end sub _cache_mapping_paths + +=head2 fetch_all + + Arg [1] : none + Example : foreach my $cs (@{$csa->fetch_all()}) { + print $cs->name(), ' ', $cs->version(), "\n"; + } + Description: Retrieves every coordinate system defined in the DB. + These will be returned in ascending order of rank. I.e. + The highest coordinate system with rank=1 would be first in the + array. + Returntype : listref of Bio::EnsEMBL::CoordSystems + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + + my @coord_systems; + + #order the array by rank in ascending order + foreach my $rank (sort {$a <=> $b} keys %{$self->{'_rank_cache'}}) { + push @coord_systems, $self->{'_rank_cache'}->{$rank}; + } + + return \@coord_systems; +} + + + +=head2 fetch_by_rank + + Arg [1] : int $rank + Example : my $cs = $coord_sys_adaptor->fetch_by_rank(1); + Description: Retrieves a CoordinateSystem via its rank. 0 is a special + rank reserved for the pseudo coordinate system 'toplevel'. + undef is returned if no coordinate system of the specified rank + exists. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_rank { + my $self = shift; + my $rank = shift; + + throw("Rank argument must be defined.") if(!defined($rank)); + throw("Rank argument must be a non-negative integer.") if($rank !~ /^\d+$/); + + if($rank == 0) { + return $self->fetch_top_level(); + } + + return $self->{'_rank_cache'}->{$rank}; +} + + +=head2 fetch_by_name + + Arg [1] : string $name + The name of the coordinate system to retrieve. Alternatively + this may be an alias for a real coordinate system. Valid + aliases are 'toplevel' and 'seqlevel'. + Arg [2] : string $version (optional) + The version of the coordinate system to retrieve. If not + specified the default version will be used. + Example : $coord_sys = $csa->fetch_by_name('clone'); + $coord_sys = $csa->fetch_by_name('chromosome', 'NCBI33'); + # toplevel is an pseudo coord system representing the highest + # coord system in a given region + # such as the chromosome coordinate system + $coord_sys = $csa->fetch_by_name('toplevel'); + #seqlevel is an alias for the sequence level coordinate system + #such as the clone or contig coordinate system + $coord_sys = $csa->fetch_by_name('seqlevel'); + Description: Retrieves a coordinate system by its name + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : throw if no name argument provided + warning if no version provided and default does not exist + Caller : general + Status : Stable + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = lc(shift); #case insensitve matching + my $version = shift; + + throw('Name argument is required.') if(!$name); + + $version = lc($version) if($version); + + + if($name eq 'seqlevel') { + return $self->fetch_sequence_level(); + } elsif($name eq 'toplevel') { + return $self->fetch_top_level($version); + } + + if(!exists($self->{'_name_cache'}->{$name})) { + if($name =~ /top/) { + warning("Did you mean 'toplevel' coord system instead of '$name'?"); + } elsif($name =~ /seq/) { + warning("Did you mean 'seqlevel' coord system instead of '$name'?"); + } + return undef; + } + + my @coord_systems = @{$self->{'_name_cache'}->{$name}}; + + foreach my $cs (@coord_systems) { + if($version) { + return $cs if(lc($cs->version()) eq $version); + } elsif($self->{'_is_default_version'}->{$cs->dbID()}) { + return $cs; + } + } + + if($version) { + #the specific version we were looking for was not found + return undef; + } + + #didn't find a default, just take first one + my $cs = shift @coord_systems; + my $v = $cs->version(); + warning("No default version for coord_system [$name] exists. " . + "Using version [$v] arbitrarily"); + + return $cs; +} + + +=head2 fetch_all_by_name + + Arg [1] : string $name + The name of the coordinate system to retrieve. This can be + the name of an actual coordinate system or an alias for a + coordinate system. Valid aliases are 'toplevel' and 'seqlevel'. + Example : foreach my $cs (@{$csa->fetch_all_by_name('chromosome')}){ + print $cs->name(), ' ', $cs->version(); + } + Description: Retrieves all coordinate systems of a particular name + Returntype : listref of Bio::EnsEMBL::CoordSystem objects + Exceptions : throw if no name argument provided + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_name { + my $self = shift; + my $name = lc(shift); #case insensitive matching + + throw('Name argument is required') if(!$name); + + if($name eq 'seqlevel') { + return [$self->fetch_sequence_level()]; + } elsif($name eq 'toplevel') { + return [$self->fetch_top_level()]; + } + + return $self->{'_name_cache'}->{$name} || []; +} + + + + + +=head2 fetch_by_dbID + + Arg [1] : int dbID + Example : $cs = $csa->fetch_by_dbID(4); + Description: Retrieves a coord_system via its internal + identifier, or undef if no coordinate system with the provided + id exists. + Returntype : Bio::EnsEMBL::CoordSystem or undef + Exceptions : thrown if no coord_system exists for specified dbID + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + throw('dbID argument is required') if(!$dbID); + + my $cs = $self->{'_dbID_cache'}->{$dbID}; + + return undef if(!$cs); + + return $cs; +} + + + +=head2 fetch_top_level + + Arg [1] : none + Example : $cs = $csa->fetch_top_level(); + Description: Retrieves the toplevel pseudo coordinate system. + Returntype : Bio::EnsEMBL::CoordSystem object + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_top_level { + my $self = shift; + + return $self->{'_top_level'}; +} + + +=head2 fetch_sequence_level + + Arg [1] : none + Example : ($id, $name, $version) = $csa->fetch_sequence_level(); + Description: Retrieves the coordinate system at which sequence + is stored at. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : throw if no sequence_level coord system exists at all + throw if multiple sequence_level coord systems exists + Caller : general + Status : Stable + +=cut + +sub fetch_sequence_level { + my $self = shift; + + my @dbIDs = keys %{$self->{'_is_sequence_level'}}; + + throw('No sequence_level coord_system is defined') if(!@dbIDs); + + if(@dbIDs > 1) { + throw('Multiple sequence_level coord_systems are defined.' . + 'Only one is currently supported'); + } + + return $self->{'_dbID_cache'}->{$dbIDs[0]}; +} + + + + +=head2 get_mapping_path + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs1 + Arg [2] : Bio::EnsEMBL::CoordSystem $cs2 + Example : foreach my $cs @{$csa->get_mapping_path($cs1,$cs2); + Description: Given two coordinate systems this will return a mapping path + between them if one has been defined. Allowed Mapping paths are + explicitly defined in the meta table. The following is an + example: + + mysql> select * from meta where meta_key = 'assembly.mapping'; + +---------+------------------+--------------------------------------+ + | meta_id | meta_key | meta_value | + +---------+------------------+--------------------------------------+ + | 20 | assembly.mapping | chromosome:NCBI34|contig | + | 21 | assembly.mapping | clone|contig | + | 22 | assembly.mapping | supercontig|contig | + | 23 | assembly.mapping | chromosome:NCBI34|contig|clone | + | 24 | assembly.mapping | chromosome:NCBI34|contig|supercontig | + | 25 | assembly.mapping | supercontig|contig|clone | + +---------+------------------+--------------------------------------+ + + For a one-step mapping path to be valid there needs to be + a relationship between the two coordinate systems defined in + the assembly table. Two step mapping paths work by building + on the one-step mapping paths which are already defined. + + The first coordinate system in a one step mapping path must + be the assembled coordinate system and the second must be + the component. + + Example of use: + my $cs1 = $cs_adaptor->fetch_by_name('contig'); + my $cs2 = $cs_adaptor->fetch_by_name('chromosome'); + + my @path = @{$cs_adaptor->get_mapping_path($cs1,$cs2)}; + + if(!@path) { + print "No mapping path."; + } + elsif(@path == 2) { + print "2 step mapping path."; + print "Assembled = " . $path[0]->name() . "\n"; + print "Component = " . $path[1]->name() . "\n"; + } else { + print "Multi step mapping path\n"; + } + + Returntype : reference to a list of Bio::EnsEMBL::CoordSystem objects + + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_mapping_path { + my $self = shift; + my $cs1 = shift; + my $cs2 = shift; + + if(!ref($cs1) || !ref($cs2) || + !$cs1->isa('Bio::EnsEMBL::CoordSystem') || + !$cs2->isa('Bio::EnsEMBL::CoordSystem')) { + throw('Two Bio::EnsEMBL::CoordSystem arguments expected.'); + } + + my $key1 = $cs1->name() . ":" . $cs1->version(); + my $key2 = $cs2->name() . ":" . $cs2->version(); + + my $path = $self->{'_mapping_paths'}->{"$key1|$key2"}; + + return $path if($path); + + $path = $self->{'_mapping_paths'}->{"$key2|$key1"}; + + if(!$path) { + # No path was explicitly defined, but we might be able to guess a + # suitable path. We only guess for missing 2 step paths. + + my %mid1; + my %mid2; + + foreach my $path (values(%{$self->{'_mapping_paths'}})) { + next if(@$path != 2); + + my $match = undef; + + if($path->[0]->equals($cs1)) { + $match = 1; + } elsif($path->[1]->equals($cs1)) { + $match = 0; + } + + if(defined($match)) { + my $mid = $path->[$match]; + my $midkey = $mid->name() . ':' . $mid->version(); + + # is the same cs mapped to by other cs? + if($mid2{$midkey}) { + my $path = [$cs1,$mid,$cs2]; + $self->{'_mapping_paths'}->{"$key1|$key2"} = $path; + $key1 =~ s/\:$//; + $key2 =~ s/\:$//; + $midkey =~ s/\:$//; + warning("Using implicit mapping path between '$key1' and '$key2' " . + "coord systems.\n" . + "An explicit 'assembly.mapping' entry should be added " . + "to the meta table.\nExample: " . + "'$key1|$midkey|$key2'\n"); + return $path; + } else { + $mid1{$midkey} = $mid; + } + } + + $match = undef; + + if($path->[0]->equals($cs2)) { + $match = 1; + } elsif($path->[1]->equals($cs2)) { + $match = 0; + } + + + if(defined($match)) { + my $mid = $path->[$match]; + my $midkey = $mid->name() . ':' . $mid->version(); + + # is the same cs mapped to by other cs? + if($mid1{$midkey}) { + my $path = [$cs2,$mid,$cs1]; + $self->{'_mapping_paths'}->{"$key2|$key1"} = $path; + + $key1 =~ s/\:$//; + $key2 =~ s/\:$//; + $midkey =~ s/\:$//; + warning("Using implicit mapping path between '$key1' and '$key2' " . + "coord systems.\n" . + "An explicit 'assembly.mapping' entry should be added " . + "to the meta table.\nExample: " . + "'$key1|$midkey|$key2'\n"); + + return $path; + } else { + $mid2{$midkey} = $mid; + } + } + } + } + + return $path || []; +} + +=head2 store_mapping_path + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs1 + Arg [2] : Bio::EnsEMBL::CoordSystem $cs2 + Arg [3..n] : Bio::EnsEMBL::CoordSystem $cs3..$csN + Example : my $pathref = $csa->store_mapping_path($cs1,$cs2); + Description: Given two or more coordinate systems this will store + mapping paths between them in the database. + + The 'rank' attrib of the CoordSystems is used to + determine the assembled/component relationships between + them. + + For example, if $cs1 represents chrs of version + V1, $cs2 represents contigs, and $cs3 clones then, unless + they already exist, the following entries will be created + in the meta table; + +------------------+---------------------+ + | meta_key | meta_value | + +------------------+---------------------+ + | assembly.mapping | chr:V1|clone | + | assembly.mapping | clone|contig | + | assembly.mapping | chr:V1|clone|contig | + +------------------+---------------------+ + + + For a one-step mapping path to be valid there needs to be + a relationship between the two coordinate systems defined in + the assembly table. Two step mapping paths work by building + on the one-step mapping paths which are already defined. + + The first coordinate system in a one step mapping path must + be the assembled coordinate system and the second must be + the component. + + Returntype : reference to a list of lists of new meta_value mapping strings + created for assembly.mapping + Exceptions : CoordSystems with no rank/duplicated rank + Caller : general + Status : Experimental + +=cut + +sub store_mapping_path{ + my $self = shift; + my @csystems = @_; + + # Validate and sort the args + my %seen_ranks; + @csystems >= 2 or throw('Need two or more CoordSystems'); + my $validate = sub{ + ref($_[0]) && $_[0]->isa('Bio::EnsEMBL::CoordSystem') or + throw('CoordSystem argument expected.'); + my $rank = $_[0]->rank || + throw('CoordSystem has no rank: '.$_[0]->name); + $seen_ranks{$rank} && + throw('CoordSystem '.$_[0]->name." shares rank $rank with ". + $seen_ranks{$rank}->name); + $seen_ranks{$rank} = $_[0]; + }; + @csystems = sort{$a->rank <=> $b->rank} map{&{$validate}($_)} @csystems; + + # Get a list of all existing assembly.mappings + #my %mappings = map{$_=>1} @{$meta->list_value_by_key('assembly.mapping')}; + + # For each pair in the sorted list, store in the DB + my $meta = $self->db->get_MetaContainer; + my @retlist; + for( my $i=1; $i<@csystems; $i++ ){ + for( my $j=0; $j<(@csystems-$i); $j++ ){ + my $mapping = join( "|", + map{join( ':', $_->name, ($_->version||()) )} + @csystems[$j..$j+$i] ); + my $mapping_key = join( "|", + map{join( ':', $_->name, ($_->version||'') )} + @csystems[$j..$j+$i] ); + # Skip existing + next if $self->{'_mapping_paths'}->{$mapping_key}; + + # Update the database + $meta->store_key_value('assembly.mapping',$mapping); + push @retlist, $mapping; + } + } + + if( @retlist ){ + # Update mapping path cache + $self->_cache_mapping_paths; + } + + # Return the mappings that we have just created + return [@retlist]; +} + +=head2 fetch_by_attrib + + Arg [1] : string attrib + Arg [2] : (optional) string version + Example : $csa->fetch_by_attrib('default_version','NCBIM37'); + Description: Retrieves a CoordSystem object from the database that have the specified + attrib and version, if no version is specified, returns the default version + Returntype : Bio::EnsEMBL::CoordSystem object + Exceptions : throw when attrib not present + Caller : general + Status : Stable + +=cut + +sub fetch_by_attrib { + my $self = shift; + my $attrib = shift; + my $version = shift; + + $version = lc($version) if($version); + + my @dbIDs = keys %{$self->{"_is_$attrib"}}; + + throw("No $attrib coordinate system defined") if(!@dbIDs); + + foreach my $dbID (@dbIDs) { + my $cs = $self->{'_dbID_cache'}->{$dbID}; + if($version) { + return $cs if(lc($version) eq $cs->version()); + } elsif($self->{'_is_default_version'}->{$dbID}) { + return $cs; + } + } + + #specifically requested attrib system was not found + if($version) { + throw("$attrib coord_system with version [$version] does not exist"); + } + + #coordsystem with attrib exists but no default is defined: + my $dbID = shift @dbIDs; + my $cs = $self->{'_dbID_cache'}->{$dbID}; + my $v = $cs->version(); + warning("No default version for $attrib coord_system exists. " . + "Using version [$v] arbitrarily"); + + return $cs; +} + + +sub _fetch_by_attrib{ + my $self = shift; + my $attrib = shift; + my $version = shift; + + deprecate("You should be using the public method fetch_by_attrib ". + "(without initial underscore) instead"); + + return $self->fetch_by_attrib($attrib,$version); +} + +=head2 fetch_all_by_attrib + + Arg [1] : string attrib + Example : $csa->fetch_all_by_attrib('default_version'); + Description: Retrieves all CoordSystem object from the database that have the specified + attrib. + Returntype : reference to a list of Bio::EnsEMBL::CoordSystem objects + Exceptions : throw when attrib not present + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_attrib { + my $self = shift; + my $attrib = shift; + + my @coord_systems = (); + foreach my $dbID (keys %{$self->{"_is_$attrib"}}) { + push @coord_systems, $self->{"_dbID_cache"}->{$dbID}; + } + + return \@coord_systems; +} + +sub _fetch_all_by_attrib{ + my $self = shift; + my $attrib = shift; + + deprecate("You should be using the public method fetch_all_by_attrib ". + "(without initial underscore) instead"); + + return $self->fetch_all_by_attrib($attrib); +} + +=head2 store + + Arg [1] : Bio::EnsEMBL::CoordSystem + Example : $csa->store($coord_system); + Description: Stores a CoordSystem object in the database. + Returntype : none + Exceptions : Warning if CoordSystem is already stored in this database. + Caller : none + Status : Stable + +=cut + +sub store { + my $self = shift; + my $cs = shift; + + if(!$cs || !ref($cs) || !$cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('CoordSystem argument expected.'); + } + + my $db = $self->db(); + my $name = $cs->name(); + my $version = $cs->version(); + my $rank = $cs->rank(); + + my $seqlevel = $cs->is_sequence_level(); + my $default = $cs->is_default(); + + my $toplevel = $cs->is_top_level(); + + if($toplevel) { + throw("The toplevel CoordSystem cannot be stored"); + } + + # + # Do lots of sanity checking to prevent bad data from being entered + # + + if($cs->is_stored($db)) { + warning("CoordSystem $name $version is already in db.\n"); + return; + } + + if($name eq 'toplevel' || $name eq 'seqlevel' || !$name) { + throw("[$name] is not a valid name for a CoordSystem."); + } + + if($seqlevel && keys(%{$self->{'_is_sequence_level'}})) { + throw("There can only be one sequence level CoordSystem."); + } + + if(exists $self->{'_name_cache'}->{lc($name)}) { + my @coord_systems = @{$self->{'_name_cache'}->{lc($name)}}; + foreach my $c (@coord_systems) { + if(lc($c->version()) eq lc($version)) { + warning("CoordSystem $name $version is already in db.\n"); + return; + } + if($default && $self->{'_is_default_version'}->{$c->dbID()}) { + throw("There can only be one default version of CoordSystem $name"); + } + } + } + + if($rank !~ /^\d+$/) { + throw("Rank attribute must be a positive integer not [$rank]"); + } + if($rank == 0) { + throw("Only toplevel CoordSystem may have rank of 0."); + } + + if(defined($self->{'_rank_cache'}->{$rank})) { + throw("CoordSystem with rank [$rank] already exists."); + } + + my @attrib; + + push @attrib, 'default_version' if($default); + push @attrib, 'sequence_level' if($seqlevel); + + my $attrib_str = (@attrib) ? join(',', @attrib) : undef; + + # + # store the coordinate system in the database + # + + my $sth = + $db->dbc->prepare( 'INSERT INTO coord_system ' + . 'SET name = ?, ' + . 'version = ?, ' + . 'attrib = ?,' + . 'rank = ?,' + . 'species_id = ?' ); + + $sth->bind_param( 1, $name, SQL_VARCHAR ); + $sth->bind_param( 2, $version, SQL_VARCHAR ); + $sth->bind_param( 3, $attrib_str, SQL_VARCHAR ); + $sth->bind_param( 4, $rank, SQL_INTEGER ); + $sth->bind_param( 5, $self->species_id(), SQL_INTEGER ); + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $sth->finish(); + + if(!$dbID) { + throw("Did not get dbID from store of CoordSystem."); + } + + $cs->dbID($dbID); + $cs->adaptor($self); + + # + # update the internal caches that are used for fetching + # + $self->{'_is_default_version'}->{$dbID} = 1 if($default); + $self->{'_is_sequence_level'}->{$dbID} = 1 if($seqlevel); + + $self->{'_name_cache'}->{lc($name)} ||= []; + push @{$self->{'_name_cache'}->{lc($name)}}, $cs; + + $self->{'_dbID_cache'}->{$dbID} = $cs; + $self->{'_rank_cache'}->{$rank} = $cs; + + return $cs; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DBAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DBAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1065 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DBAdaptor + +=head1 SYNOPSIS + + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -user => 'root', + -dbname => 'pog', + -host => 'caldy', + -driver => 'mysql' + ); + + $gene_adaptor = $db->get_GeneAdaptor(); + + $gene = $gene_adaptor->fetch_by_stable_id($stable_id); + + $slice = + $db->get_SliceAdaptor()->fetch_by_chr_start_end( 'X', 1, 10000 ); + +=head1 DESCRIPTION + +Formerly this class provided database connectivity and a means +to retrieve object adaptors. This class is now provided for +convenience and backwards compatibility, and delegates its connection +responsibilities to the DBConnection class (no longer inherited from) +and its object adaptor retrieval to the static Bio::EnsEMBL::Registry. + +Please use Bio::EnsEMBL::Registry to retrieve object adaptors. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::DBAdaptor; + +use strict; + +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::SeqRegionCache; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Utils::ConfigRegistry; + +my $reg = "Bio::EnsEMBL::Registry"; + +=head2 new + + Arg [-DNADB]: (optional) Bio::EnsEMBL::DBSQL::DBAdaptor DNADB + All sequence, assembly, contig information etc, will + be retrieved from this database instead. + + Arg [-NO_CACHE]: (optional) int 1 + This option will turn off caching for slice features, + so, every time a set of features is retrieved, + they will come from the database instead of the + cache. This option is only recommended for advanced + users, specially if you need to store and retrieve + features. It might reduce performance when querying + the database if not used properly. If in doubt, do + not use it or ask in the developer mailing list. + + Arg [..] : Other args are passed to superclass + Bio::EnsEMBL::DBSQL::DBConnection + + Example : $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -user => 'root', + -dbname => 'pog', + -host => 'caldy', + -driver => 'mysql' + ); + + $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -species => 'Homo_sapiens', + -group => 'core', + -user => 'root', + -dbname => 'pog', + -host => 'caldy', + -driver => 'mysql' + ); + + $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -species => 'staphylococcus_aureus', + -group => 'core', + -user => 'root', + -dbname => 'staphylococcus_collection_1_52_1a', + -multispecies_db => 1, + -host => 'caldy', + -driver => 'mysql' + ); + + Description: Constructor for DBAdaptor. + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my ( $class, @args ) = @_; + + my $self = bless {}, $class; + + my ( $is_multispecies, $species, $species_id, $group, $con, $dnadb, + $no_cache, $dbname ) + = rearrange( [ + 'MULTISPECIES_DB', 'SPECIES', 'SPECIES_ID', 'GROUP', + 'DBCONN', 'DNADB', 'NO_CACHE', 'DBNAME' + ], + @args + ); + + if ( defined($con) ) { $self->dbc($con) } + else { + if(! defined $dbname) { + throw "-DBNAME is a required parameter when creating a DBAdaptor"; + } + $self->dbc( new Bio::EnsEMBL::DBSQL::DBConnection(@args) ); + } + + if ( defined($species) ) { $self->species($species) } + if ( defined($group) ) { $self->group($group) } + + + $self = Bio::EnsEMBL::Utils::ConfigRegistry::gen_load($self); + +# if(!defined($species) ){ +# $reg->find_and_add_aliases($self); +# } + + $self->species_id( $species_id || 1 ); + + $self->is_multispecies( defined($is_multispecies) + && $is_multispecies == 1 ); + + if ( defined($dnadb) ) { $self->dnadb($dnadb) } + if ( defined($no_cache) ) { $self->no_cache($no_cache) } + + return $self; +} ## end sub new + +=head2 clear_caches + + Example : $dba->clear_caches(); + Description : Loops through all linked adaptors and clears their + caches if C is implemented. Not all caches + are cleared & the DBAdaptor instance should be removed from + the registry to clear these remaining essential caches. + Returntype : None + Exceptions : None + +=cut + +sub clear_caches { + my ($self) = @_; + my $adaptors = Bio::EnsEMBL::Registry->get_all_adaptors( + $self->species(), $self->group()); + while (my $adaptor = shift @{$adaptors}) { + if($adaptor->can('clear_cache')) { + $adaptor->clear_cache(); + } + } + return; +} + +=head2 dbc + + Arg[1] : (optional) Bio::EnsEMBL::DBSQL::DBConnection + + Example : $dbc = $dba->dbc(); + Description: Getter/Setter for DBConnection. + Returntype : Bio::EnsEMBL::DBSQL::DBConnection + Exceptions : throws if argument not a Bio::EnsEMBL::DBSQL::DBConnection + Caller : general + Status : Stable + +=cut + +sub dbc{ + my $self = shift; + + if(@_){ + my $arg = shift; + if(defined($arg)){ + if(!$arg->isa('Bio::EnsEMBL::DBSQL::DBConnection')){ + throw("$arg is no a DBConnection\n"); + } + } + $self->{_dbc} = $arg; + } + return $self->{_dbc}; +} + + + +=head2 add_db_adaptor + + Arg [1] : string $name + the name of the database to attach to this database + Arg [2] : Bio::EnsEMBL::DBSQL::DBConnection + the db adaptor to attach to this database + Example : $db->add_db_adaptor('lite', $lite_db_adaptor); + Description: Attaches another database instance to this database so + that it can be used in instances where it is required. + Returntype : none + Exceptions : none + Caller : EnsWeb + Status : At Risk + : may get deprecated, please use add_db from the registry instead + +=cut + +sub add_db_adaptor { + my ($self, $name, $adaptor) = @_; + + unless($name && $adaptor && ref $adaptor) { + throw('adaptor and name arguments are required'); + } + + Bio::EnsEMBL::Registry->add_db($self, $name, $adaptor); + +} + + +=head2 remove_db_adaptor + + Arg [1] : string $name + the name of the database to detach from this database. + Example : $lite_db = $db->remove_db_adaptor('lite'); + Description: Detaches a database instance from this database and returns + it. + Returntype : none + Exceptions : none + Caller : ? + Status : At Risk + : mey get deprecated, use remove_db instead from the Registry + +=cut + +sub remove_db_adaptor { + my ($self, $name) = @_; + + return Bio::EnsEMBL::Registry->remove_db($self, $name); +} + + +=head2 get_all_db_adaptors + + Arg [1] : none + Example : @attached_dbs = values %{$db->get_all_db_adaptors()}; + Description: returns all of the attached databases as + a hash reference of key/value pairs where the keys are + database names and the values are the attached databases + Returntype : hash reference with Bio::EnsEMBL::DBSQL::DBConnection values + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::ProxyAdaptor + Status : At Risk + : may get deprecated soon + : please use Bio::EnsEMBL::Registry->get_all_db_adaptors + +=cut + +sub get_all_db_adaptors { + my ($self) = @_; + return Bio::EnsEMBL::Registry->get_all_db_adaptors($self); +} + + + +=head2 get_db_adaptor + + Arg [1] : string $name + the name of the attached database to retrieve + Example : $lite_db = $db->get_db_adaptor('lite'); + Description: returns an attached db adaptor of name $name or undef if + no such attached database exists + Returntype : Bio::EnsEMBL::DBSQL::DBConnection + Exceptions : none + Caller : ? + Status : At Risk + : may get deprecated soon + : please use Bio::EnsEMBL::Registry->get_db_adaptors + +=cut + +sub get_db_adaptor { + my ($self, $name) = @_; + + return Bio::EnsEMBL::Registry->get_db($self, $name); +} + +=head2 get_available_adaptors + + Example : my %pairs = %{$dba->get_available_adaptors()}; + Description: gets a hash of the available adaptors + ReturnType : reference to a hash + Exceptions : none + Caller : Bio::EnsEMBL::Utils::ConfigRegistry + Status : Stable + +=cut + +sub get_available_adaptors { + my %pairs = ( + # Firstly those that just have an adaptor named after there object + # in the main DBSQL directory. + map( { $_ => "Bio::EnsEMBL::DBSQL::${_}Adaptor" } qw( + Analysis ArchiveStableId Attribute + AssemblyExceptionFeature AssemblyMapper CoordSystem + CompressedSequence DBEntry DnaAlignFeature + DensityFeature DensityType Exon + Gene KaryotypeBand MiscSet + MiscFeature PredictionTranscript PredictionExon + ProteinFeature ProteinAlignFeature RepeatConsensus + RepeatFeature Sequence SeqRegionSynonym SimpleFeature + Slice SupportingFeature Transcript + TranscriptSupportingFeature Translation UnmappedObject + UnconventionalTranscriptAssociation AssemblySlice + SplicingEvent SplicingEventFeature SplicingTranscriptPair + Operon OperonTranscript + DataFile Assembly + IntronSupportingEvidence + ) ), + # Those whose adaptors are in Map::DBSQL + map( { $_ => "Bio::EnsEMBL::Map::DBSQL::${_}Adaptor" } qw( + Marker MarkerFeature QtlFeature Qtl Ditag DitagFeature + ) ), + # Finally the exceptions... those that have non-standard mapping + # between object / adaptor .... + # 'Blast' => 'Bio::EnsEMBL::External::BlastAdaptor', + 'MetaCoordContainer' => 'Bio::EnsEMBL::DBSQL::MetaCoordContainer', + 'MetaContainer' => 'Bio::EnsEMBL::DBSQL::MetaContainer', + 'SNP' => 'Bio::EnsEMBL::DBSQL::ProxySNPAdaptor', + ); + + return ( \%pairs ); +} ## end sub get_available_adaptors + +########################################################### +# +# Support for DAS +# +########################################################### + +=head2 add_DASFeatureFactory + + Arg [1] : Bio::EnsEMBL::ExternalFeatureFactory $value + Example : none + Description: Attaches a DAS Feature Factory to this method. + ExternalFeatureFactory objects are not really used right now. + They may be reintroduced or taken out completely. The fate + of this function is unknown (although it is presently needed). + Returntype : none + Exceptions : none + Caller : EnsWeb + Status : At Risk + : with the new web code this may not be needed/supported + +=cut + +sub add_DASFeatureFactory{ + + my ($self,$value) = @_; + + push(@{$self->{'_das_ff'}},$value); +} + + +sub remove_all_DASFeatureFactories { + $_[0]->{'_das_ff'} = []; +} +=head2 _each_DASFeatureFactory + + Args : none + Example : none + Description: Not sure if this is used, or if it should be removed. It + does not seem to be used at the moment + Returntype : Bio::EnsEMBL::ExternalFeatureFactory + Exceptions : none + Caller : ?? + Status : At Risk + : with the new web code this may not be needed/supported + +=cut + +sub _each_DASFeatureFactory{ + my ($self) = @_; + + return @{$self->{'_das_ff'}||[]} +} + + +################################################################## +# +# SUPPORT FOR EXTERNAL FEATURE FACTORIES +# +################################################################## + + + +=head2 add_ExternalFeatureAdaptor + + Arg [1] : Bio::EnsEMBL::External::ExternalFeatureAdaptor + Example : $db_adaptor->add_ExternalFeatureAdaptor($xfa); + Description: Adds an external feature adaptor to this database adaptor. + Adding the external adaptor in this way allows external + features to be obtained from Slices and from RawContigs. + + The external feature adaptor which is passed to this method + will have its db attribuite set to this DBAdaptor object via + the db accessor method. + + ExternalFeatureAdaptors passed to this method are stored + internally in a hash keyed on the string returned by the + ExternalFeatureAdaptors track_name method. + + If the track name method is not implemented then the + a default key named 'External features' is assigned. In the + event of duplicate key names, a number is appended to the + key name, and incremented for each subsequent adaptor with the + same track name. For example, if no track_names are specified + then the the external feature adaptors will be stored under the + keys 'External features', 'External features2' + 'External features3' etc. + Returntype : none + Exceptions : none + Caller : general + +=cut + +sub add_ExternalFeatureAdaptor { + my ($self, $adaptor) = @_; + + unless($adaptor && ref $adaptor && + $adaptor->isa('Bio::EnsEMBL::External::ExternalFeatureAdaptor')) { + throw("[$adaptor] is not a " . + "Bio::EnsEMBL::External::ExternalFeatureAdaptor"); + } + + unless(exists $self->{'_xf_adaptors'}) { + $self->{'_xf_adaptors'} = {}; + } + + my $track_name = $adaptor->{'_track_name'}; + if(!$track_name) { + $track_name = $adaptor->track_name(); + } + + #use a generic track name if one hasn't been defined + unless(defined $track_name) { + $track_name = "External features"; + } + + #if the track name exists add numbers to the end until a free name is found + if(exists $self->{'_xf_adaptors'}->{"$track_name"}) { + my $num = 2; + $num++ while(exists $self->{'_xf_adaptors'}->{"$track_name$num"}); + $self->{'_xf_adaptors'}->{"$track_name$num"} = $adaptor; + } else { + $self->{'_xf_adaptors'}->{"$track_name"} = $adaptor; + } + + $adaptor->ensembl_db($self); +} + + + +=head2 get_ExternalFeatureAdaptors + + Arg [1] : none + Example : @xfas = values %{$db_adaptor->get_ExternalFeatureAdaptors}; + Description: Retrieves all of the ExternalFeatureAdaptors which have been + added to this DBAdaptor. The ExternalFeatureAdaptors are + returned in a reference to a hash keyed on the track names + of the external adaptors + Returntype : Reference to a hash of ExternalFeatureAdaptors keyed on + their track names. + Exceptions : none + Caller : general + +=cut + +sub get_ExternalFeatureAdaptors { + my $self = shift; + + return $self->{'_xf_adaptors'}; +} + + +=head2 add_ExternalFeatureFactory + + Arg [1] : Bio::EnsEMBL::DB::ExternalFeatureFactoryI $value + Example : $db_adaptor->add_ExternalFeatureFactory + Description: It is recommended that add_ExternalFeatureAdaptor be used + instead. See documentation for + Bio::EnsEMBL::External::ExternalFeatureAdaptor + + Adds an external feature factory to the core database + so that features from external sources can be displayed in + ensembl. This method is still available mainly for legacy + support for external EnsEMBL installations. + Returntype : none + Exceptions : none + Caller : external + +=cut + +sub add_ExternalFeatureFactory{ + my ($self,$value) = @_; + + $self->add_ExternalFeatureAdaptor($value); +} + +# +# OVERWRITABLE STANDARD ADAPTORS +# + +=head2 get_adaptor + + Arg [1] : Canonical data type for which an adaptor is required. + Example : $db_adaptor->get_adaptor("Protein") + Description: Gets an adaptor object for a standard data type. + Returntype : Adaptor Object of arbitrary type or undef + Exceptions : none + Caller : external + Status : Medium Risk + : please use the Registry method, as at some time this + : may no longer be supprted. + +=cut + +sub get_adaptor { + my ($self, $canonical_name, @other_args) = @_; + + return $reg->get_adaptor($self->species(),$self->group(),$canonical_name); +} + + + +=head2 set_adaptor + + Arg [1] : Canonical data type for new adaptor. + Arg [2] : Object defining the adaptor for arg1. + Example : $aa = Bio::EnsEMBL::DBSQL::GeneAdaptor->new($db_adaptor); + : $db_adaptor->set_adaptor("Gene", $ga) + Description: Stores the object which represents the adaptor for the + arg1 data type. + Returntype : none + Exceptions : none + Caller : external + Status : Medium Risk + : please use the Registry method, as at some time this + : may no longer be supprted. + +=cut + +sub set_adaptor { + my ($self, $canonical_name, $module) = @_; + + $reg->add_adaptor($self->species(),$self->group(),$canonical_name,$module); + + return $module; +} + + +# +# GENERIC FEATURE ADAPTORS +# + +=head2 get_GenericFeatureAdaptors + + Arg [1] : List of names of feature adaptors to get. If no + adaptor names are given, all the defined adaptors are returned. + Example : $db->get_GenericFeature("SomeFeature", "SomeOtherFeature") + Description: Returns a hash containing the named feature adaptors (or + all feature adaptors). + Returntype : reference to a Hash containing the named + feature adaptors (or all feature adaptors). + Exceptions : If any of the the named generic feature adaptors do not exist. + Caller : external + +=cut + +sub get_GenericFeatureAdaptors { + + my ($self, @names) = @_; + + my %adaptors = (); + + if (!@names) { + %adaptors = %{$self->{'generic_feature_adaptors'}}; + } else { + foreach my $name (@names) { + if (!exists($self->{'generic_feature_adaptors'}->{$name})) { + throw("No generic feature adaptor has been defined for $name" ); + } + + + $adaptors{$name} = $self->{'generic_feature_adaptors'}->{$name}; + } + } + + return \%adaptors; +} + + +=head2 add_GenericFeatureAdaptor + + Arg [1] : The name of the feature. + Arg [2] : Adaptor object for a generic feature. + Example : $db->add_GenericFeatureAdaptor("SomeFeature", + "Bio::EnsEMBL::DBSQL::SomeFeatureAdaptor") + Description: Stores the object which represents the adaptor for the + named feature type. + Returntype : none + Exceptions : + Caller : external + +=cut + +sub add_GenericFeatureAdaptor { + my ($self, $name, $adaptor_obj) = @_; + + # check that $adaptor is an object that subclasses BaseFeatureAdaptor + if (!$adaptor_obj->isa("Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor")) { + throw("$name is a " . ref($adaptor_obj) . "which is not a " . + "subclass of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor" ); + } + + $self->{'generic_feature_adaptors'}->{$name} = $adaptor_obj; +} + +=head2 species + + Arg [1] : (optional) string $arg + The new value of the species used by this DBAdaptor. + Example : $species = $dba->species() + Description: Getter/Setter for the species of to use for + this connection. There is currently no point in setting + this value after the connection has already been established + by the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub species { + my ( $self, $arg ) = @_; + + if ( defined($arg) ) { + $self->{_species} = $arg; + } + + $self->{_species}; +} + +=head2 all_species + + Args : NONE + Example : @all_species = @{$dba->all_species()}; + Description: Returns the names of all species contained in the + database to which this DBAdaptor is connected. + Returntype : array reference + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub all_species { + my ($self) = @_; + + if ( !$self->is_multispecies() ) { return [ $self->species() ] } + + if ( exists( $self->{'_all_species'} ) ) { + return $self->{'_all_species'}; + } + + my $statement = + "SELECT meta_value " + . "FROM meta " + . "WHERE meta_key = 'species.db_name'"; + + my $sth = $self->dbc()->db_handle()->prepare($statement); + + $sth->execute(); + + my $species; + $sth->bind_columns( \$species ); + + my @all_species; + while ( $sth->fetch() ) { push( @all_species, $species ) } + + $self->{'_all_species'} = \@all_species; + + return $self->{'_all_species'}; +} ## end sub all_species + + +=head2 is_multispecies + + Arg [1] : (optional) boolean $arg + Example : if ($dba->is_multispecies()) { } + Description: Getter/Setter for the is_multispecies boolean of + to use for this connection. There is currently no + point in setting this value after the connection has + already been established by the constructor. + Returntype : boolean + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub is_multispecies { + my ( $self, $arg ) = @_; + + if ( defined($arg) ) { + $self->{_is_multispecies} = $arg; + } + + return $self->{_is_multispecies}; +} + + +=head2 species_id + + Arg [1] : (optional) string $arg + The new value of the species_id used by this DBAdaptor + when dealing with multi-species databases. + Example : $species_id = $dba->species_id() + Description: Getter/Setter for the species_id of to use for this + connection. There is currently no point in setting + this value after the connection has already been + established by the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub species_id { + my ( $self, $arg ) = @_; + + if ( defined($arg) ) { + $self->{_species_id} = $arg; + } + + return $self->{_species_id}; +} + + +=head2 no_cache + + Arg [1] : (optional) int $arg + The new value of the no cache attribute used by this DBAdaptor. + Example : $no_cache = $dba->no_cache(); + Description: Getter/Setter for the no_cache to use for + this connection. There is currently no point in setting + this value after the connection has already been established + by the constructor. + Returntype : int + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub no_cache { + my ($self, $arg ) = @_; + + if ( defined $arg ){ + if ($arg != 1 && $arg != 0){ + throw("$arg is not allowed for this attribute. Only value 1|0 is allowed"); + } + $self->{_no_cache} = $arg; + } + $self->{_no_cache}; +} + + +=head2 group + + Arg [1] : (optional) string $arg + The new value of the group used by this DBAdaptor. + Example : $group = $dba->group() + Description: Getter/Setter for the group of to use for + this connection. There is currently no point in setting + this value after the connection has already been established + by the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub group { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_group} = $arg ); + $self->{_group}; +} + +=head2 get_SeqRegionCache + + Arg [1] : none + Example : my $srcache = $dba->get_SeqRegionCache(); + Description: Retrieves a seq_region cache for this database + Returntype : Bio::EnsEMBL::Utils::SeqRegionCache + Exceptions : none + Caller : SliceAdaptor, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub get_SeqRegionCache { + my $self = shift; + + # use the cache from the database where seq_regions are stored + if($self != $self->dnadb()) { + return $self->dnadb()->get_SeqRegionCache(); + } + + if(!$self->{'seq_region_cache'}) { + $self->{'seq_region_cache'} = Bio::EnsEMBL::Utils::SeqRegionCache->new(); + } + + return $self->{'seq_region_cache'}; +} + + + +#convenient method to retrieve the schema_build version for the database being used + +sub _get_schema_build{ + my ($self) = @_; + + #avoided using dnadb by default to avoid obfuscation of behaviour + + my @dbname = split/_/, $self->dbc->dbname(); + + #warn "dbname is $schema_build"; + + my $schema_build = pop @dbname; + $schema_build = pop(@dbname).'_'.$schema_build; + + + return $schema_build; +} + + +=head2 dnadb + + Title : dnadb + Usage : my $dnadb = $db->dnadb(); + Function: returns the database adaptor where the dna lives + Useful if you only want to keep one copy of the dna + on disk but have other databases with genes and features in + Returns : dna database adaptor + Args : Bio::EnsEMBL::DBSQL::BaseAdaptor + Status : Medium Risk. + : Use the Registry method add_DNAAdaptor/get_DNAAdaptor instead + +=cut + +sub dnadb { + my $self = shift; + + if(@_) { + my $arg = shift; + $reg->add_DNAAdaptor($self->species(),$self->group(),$arg->species(),$arg->group()); + } + +# return $self->{'dnadb'} || $self; + return $reg->get_DNAAdaptor($self->species(),$self->group()) || $self; +} + + +use vars '$AUTOLOAD'; + +sub AUTOLOAD { + my ( $self, @args ) = @_; + + my $type; + if ( $AUTOLOAD =~ /^.*::get_(\w+)Adaptor$/ ) { + $type = $1; + } elsif ( $AUTOLOAD =~ /^.*::get_(\w+)$/ ) { + $type = $1; + } else { + throw( sprintf( "Could not work out type for %s\n", $AUTOLOAD ) ); + } + + my $ret = $reg->get_adaptor( $self->species(), $self->group(), $type ); + + return $ret if $ret; + + warning( sprintf( + "Could not find %s adaptor in the registry for %s %s\n", + $type, $self->species(), $self->group() ) ); + + throw( sprintf( + "Could not get adaptor %s for %s %s\n", + $type, $self->species(), $self->group() ) ); + +} ## end sub AUTOLOAD + +sub DESTROY { } # required due to AUTOLOAD + + +######################### +# sub DEPRECATED METHODS +######################### +=head2 db + + Description: DEPRECATED + +=cut + +sub db{ + my ($self, $arg ) = @_; + deprecate("db Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + return $self->dbc($arg); +} + + +sub source { deprecate('Do not use - this method does nothing'); } + + +=head2 assembly_type + + Description: DEPRECATED - Use CoordSystemAdaptor to obtain default coordinate + system instead. + +=cut + +sub assembly_type{ + my $self = shift; + + deprecate('Use CoordSystemAdaptor $csa->fetch_all->[0]->version() instead'); + + my $csa = $self->get_CoordSystemAdaptor(); + my ($cs) = @{$csa->fetch_all()}; + return ($cs) ? $cs->version() : undef; +} + + + +=head2 list_supported_assemblies + + Description: DEPRECATED - Use CoordSystemAdaptor to obtain list of top-level + coordinate systems instead + +=cut + +sub list_supported_assemblies { + my($self) = @_; + deprecate('Use CoordSystemAdaptor::fetch_all instead'); + + my $csa = $self->get_CoordSystemAdaptor(); + my %versions; + foreach my $cs (@{$csa->fetch_all()}) { + $versions{$cs->version()} = 1; + } + + return keys %versions; +} + + +sub prepare{ + my ($self, @args) = @_; + + deprecate("prepare Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->prepare(@args); +} + +sub dbname{ + my ($self, @args) = @_; + + deprecate("dbname Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->dbname(@args); +} + +sub disconnect_when_inactive{ + my ($self, @args) = @_; + + deprecate("disconnect_when_inactive Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->disconnect_when_inactive(@args); +} + +sub reconnect_when_lost{ + my ($self, @args) = @_; + + deprecate("reconnect_when_lost Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->reconnect_when_lost(@args); +} + + +sub host{ + my ($self, @args) = @_; + + deprecate("host Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->host(@args); +} +sub username{ + my ($self, @args) = @_; + + deprecate("username Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->username(@args); +} +sub password{ + my ($self, @args) = @_; + + deprecate("password Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->password(@args); +} +sub driver{ + my ($self, @args) = @_; + + deprecate("driver Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->driver(@args); +} +sub port{ + my ($self, @args) = @_; + + deprecate("port Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->port(@args); +} + +sub db_handle{ + my ($self, @args) = @_; + + + deprecate("db_handle Should no longer be called from the DBAdaptor. DBConnection should now be used OR preferably the object adaptor itself\n"); + $self->dbc->db_handle(@args); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DBConnection.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DBConnection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1117 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DBConnection + +=head1 SYNOPSIS + + $dbc = Bio::EnsEMBL::DBSQL::DBConnection->new( + -user => 'anonymous', + -dbname => 'homo_sapiens_core_20_34c', + -host => 'ensembldb.ensembl.org', + -driver => 'mysql', + ); + + # SQL statements should be created/executed through this modules + # prepare() and do() methods. + + $sth = $dbc->prepare("SELECT something FROM yourtable"); + + $sth->execute(); + + # do something with rows returned ... + + $sth->finish(); + +=head1 DESCRIPTION + +This class is a wrapper around DBIs datbase handle. It provides some +additional functionality such as the ability to automatically disconnect +when inactive and reconnect when needed. + +Generally this class will be used through one of the object adaptors or +the Bio::EnsEMBL::Registry and will not be instantiated directly. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::DBConnection; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Root; +use DBI; + +use Bio::EnsEMBL::DBSQL::StatementHandle; + +use Bio::EnsEMBL::Utils::Exception qw/deprecate throw info warning/; +use Bio::EnsEMBL::Utils::Argument qw/rearrange/; +use Bio::EnsEMBL::Utils::Scalar qw/assert_ref wrap_array/; +use Bio::EnsEMBL::Utils::SqlHelper; + +@ISA = qw(Bio::EnsEMBL::Root); # for backwards compatibility + +=head2 new + + Arg [DBNAME] : (optional) string + The name of the database to connect to. + Arg [HOST] : (optional) string + The domain name of the database host to connect to. + 'localhost' by default. + Arg [USER] : string + The name of the database user to connect with + Arg [PASS] : (optional) string + The password to be used to connect to the database + Arg [PORT] : (optional) int + The port to use when connecting to the database + 3306 by default if the driver is mysql. + Arg [DRIVER] : (optional) string + The type of database driver to use to connect to the DB + mysql by default. + Arg [DBCONN] : (optional) + Open another handle to the same database as another connection + If this argument is specified, no other arguments should be + specified. + Arg [DISCONNECT_WHEN_INACTIVE]: (optional) boolean + If set to true, the database connection will be disconnected + everytime there are no active statement handles. This is + useful when running a lot of jobs on a compute farm + which would otherwise keep open a lot of connections to the + database. Database connections are automatically reopened + when required.Do not use this option together with RECONNECT_WHEN_CONNECTION_LOST. + Arg [WAIT_TIMEOUT]: (optional) integer + Time in seconds for the wait timeout to happen. Time after which + the connection is deleted if not used. By default this is 28800 (8 hours) + on most systems. + So set this to greater than this if your connection are getting deleted. + Only set this if you are having problems and know what you are doing. + Arg [RECONNECT_WHEN_CONNECTION_LOST]: (optional) boolean + In case you're reusing the same database connection, i.e. DISCONNECT_WHEN_INACTIVE is + set to false and running a job which takes a long time to process (over 8hrs), + which means that the db connection may be lost, set this option to true. + On each prepare or do statement the db handle will be pinged and the database + connection will be reconnected if it's lost. + + Example : $dbc = Bio::EnsEMBL::DBSQL::DBConnection->new + (-user => 'anonymous', + -dbname => 'homo_sapiens_core_20_34c', + -host => 'ensembldb.ensembl.org', + -driver => 'mysql'); + + Description: Constructor for a Database Connection. Any adaptors that require + database connectivity should inherit from this class. + Returntype : Bio::EnsEMBL::DBSQL::DBConnection + Exceptions : thrown if USER or DBNAME are not specified, or if the database + cannot be connected to. + Caller : Bio::EnsEMBL::Utils::ConfigRegistry ( for newer code using the registry) + Bio::EnsEMBL::DBSQL::DBAdaptor ( for old style code) + Status : Stable + +=cut + +sub new { + my $class = shift; + + my ( + $db, $host, $driver, + $user, $password, $port, + $inactive_disconnect, $dbconn, $wait_timeout, $reconnect + ) + = rearrange( [ + 'DBNAME', 'HOST', 'DRIVER', 'USER', 'PASS', 'PORT', + 'DISCONNECT_WHEN_INACTIVE', 'DBCONN', 'WAIT_TIMEOUT', 'RECONNECT_WHEN_CONNECTION_LOST' + ], + @_ + ); + + my $self = {}; + bless $self, $class; + + if($dbconn) { + if($db || $host || $driver || $password || $port || $inactive_disconnect || $reconnect) { + throw("Cannot specify other arguments when -DBCONN argument used."); + } + + $self->driver($dbconn->driver()); + $self->host($dbconn->host()); + $self->port($dbconn->port()); + $self->username($dbconn->username()); + $self->password($dbconn->password()); + $self->dbname($dbconn->dbname()); + + if($dbconn->disconnect_when_inactive()) { + $self->disconnect_when_inactive(1); + } + } else { + $driver ||= 'mysql'; + + if($driver eq 'mysql') { + $user || throw("-USER argument is required."); + $host ||= 'mysql'; + if(!defined($port)){ + $port = 3306; + if($host eq "ensembldb.ensembl.org"){ + if( $db =~ /\w+_\w+_\w+_(\d+)/){ + if($1 >= 48){ + $port = 5306; + } + } + } + } + } + + $wait_timeout ||= 0; + + $self->driver($driver); + $self->host( $host ); + $self->port($port); + $self->username( $user ); + $self->password( $password ); + $self->dbname( $db ); + $self->timeout($wait_timeout); + + if($inactive_disconnect) { + $self->disconnect_when_inactive($inactive_disconnect); + } + if($reconnect) { + $self->reconnect_when_lost($reconnect); + } + } + +# if(defined $dnadb) { +# $self->dnadb($dnadb); +# } + return $self; +} + + +=head2 connect + + Example : $dbcon->connect() + Description: Connects to the database using the connection attribute + information. + Returntype : none + Exceptions : none + Caller : new, db_handle + Status : Stable + +=cut + +sub connect { + my ($self) = @_; + + if ( $self->connected() ) { return } + + $self->connected(1); + + if ( defined( $self->db_handle() ) and $self->db_handle()->ping() ) { + warning( "unconnected db_handle is still pingable, " + . "reseting connected boolean\n" ); + } + + my ( $dsn, $dbh ); + my $dbname = $self->dbname(); + + if ( $self->driver() eq "Oracle" ) { + + $dsn = "DBI:Oracle:"; + + eval { + $dbh = DBI->connect( $dsn, + sprintf( "%s@%s", + $self->username(), $dbname ), + $self->password(), + { 'RaiseError' => 1, 'PrintError' => 0 } ); + }; + + } elsif ( $self->driver() eq "ODBC" ) { + + $dsn = sprintf( "DBI:ODBC:%s", $self->dbname() ); + + eval { + $dbh = DBI->connect( $dsn, + $self->username(), + $self->password(), { + 'LongTruncOk' => 1, + 'LongReadLen' => 2**16 - 8, + 'RaiseError' => 1, + 'PrintError' => 0, + 'odbc_cursortype' => 2 } ); + }; + + } elsif ( $self->driver() eq "Sybase" ) { + my $dbparam = ($dbname) ? ";database=${dbname}" : q{}; + + $dsn = sprintf( "DBI:Sybase:server=%s%s;tdsLevel=CS_TDS_495", + $self->host(), $dbparam ); + + eval { + $dbh = DBI->connect( $dsn, + $self->username(), + $self->password(), { + 'LongTruncOk' => 1, + 'RaiseError' => 1, + 'PrintError' => 0 } ); + }; + + } elsif ( lc( $self->driver() ) eq 'sqlite' ) { + + throw "We require a dbname to connect to a SQLite database" + if !$dbname; + + $dsn = sprintf( "DBI:SQLite:%s", $dbname ); + + eval { + $dbh = DBI->connect( $dsn, '', '', { 'RaiseError' => 1, } ); + }; + + } else { + + my $dbparam = ($dbname) ? "database=${dbname};" : q{}; + + $dsn = sprintf( "DBI:%s:%shost=%s;port=%s", + $self->driver(), $dbparam, + $self->host(), $self->port() ); + + if ( $self->{'disconnect_when_inactive'} ) { + $self->{'count'}++; + if ( $self->{'count'} > 1000 ) { + sleep 1; + $self->{'count'} = 0; + } + } + eval { + $dbh = DBI->connect( $dsn, $self->username(), $self->password(), + { 'RaiseError' => 1 } ); + }; + } + + if ( !$dbh || $@ || !$dbh->ping() ) { + warn( "Could not connect to database " + . $self->dbname() + . " as user " + . $self->username() + . " using [$dsn] as a locator:\n" + . $DBI::errstr ); + + $self->connected(0); + + throw( "Could not connect to database " + . $self->dbname() + . " as user " + . $self->username() + . " using [$dsn] as a locator:\n" + . $DBI::errstr ); + } + + $self->db_handle($dbh); + + if ( $self->timeout() ) { + $dbh->do( "SET SESSION wait_timeout=" . $self->timeout() ); + } + + #print("CONNECT\n"); +} ## end sub connect + + +=head2 connected + + Example : $dbcon->connected() + Description: Boolean which tells if DBConnection is connected or not. + State is set internally, and external processes should not alter state. + Returntype : undef or 1 + Exceptions : none + Caller : db_handle, connect, disconnect_if_idle, user processes + Status : Stable + +=cut + +sub connected { + my $self = shift; + + # Use the process id ($$) as part of the key for the connected flag. + # This forces the opening of another connection in a forked subprocess. + $self->{'connected'.$$} = shift if(@_); + return $self->{'connected'.$$}; +} + +sub disconnect_count { + my $self = shift; + return $self->{'disconnect_count'} = shift if(@_); + $self->{'disconnect_count'}=0 unless(defined($self->{'disconnect_count'})); + return $self->{'disconnect_count'}; +} + +sub timeout{ + my($self, $arg ) = @_; + + (defined $arg) && + ($self->{_timeout} = $arg ); + + return $self->{_timeout}; + +} + +sub query_count { + my $self = shift; + return $self->{'_query_count'} = shift if(@_); + $self->{'_query_count'}=0 unless(defined($self->{'_query_count'})); + return $self->{'_query_count'}; +} + +=head2 equals + + Example : warn 'Same!' if($dbc->equals($other_dbc)); + Description: Equality checker for DBConnection objects + Returntype : boolean + Exceptions : none + Caller : new + Status : Stable + +=cut + + +sub equals { + my ( $self, $dbc ) = @_; + return 0 if ! defined $dbc; + my $return = 0; + my $undef_str = q{!-undef-!}; + my $undef_num = -1; + + $return = 1 if ( + (($self->host() || $undef_str) eq ($dbc->host() || $undef_str)) && + (($self->dbname() || $undef_str) eq ($dbc->dbname() || $undef_str)) && + (($self->port() || $undef_num) == ($dbc->port() || $undef_num)) && + (($self->username() || $undef_str) eq ($dbc->username() || $undef_str)) && + ($self->driver() eq $dbc->driver()) + ); + + return $return; +} + +=head2 driver + + Arg [1] : (optional) string $arg + the name of the driver to use to connect to the database + Example : $driver = $db_connection->driver() + Description: Getter / Setter for the driver this connection uses. + Right now there is no point to setting this value after a + connection has already been established in the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub driver { + my($self, $arg ) = @_; + + (defined $arg) && + ($self->{_driver} = $arg ); + return $self->{_driver}; +} + + +=head2 port + + Arg [1] : (optional) int $arg + the TCP or UDP port to use to connect to the database + Example : $port = $db_connection->port(); + Description: Getter / Setter for the port this connection uses to communicate + to the database daemon. There currently is no point in + setting this value after the connection has already been + established by the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub port { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'_port'} = $value; + } + + return $self->{'_port'}; +} + + +=head2 dbname + + Arg [1] : (optional) string $arg + The new value of the database name used by this connection. + Example : $dbname = $db_connection->dbname() + Description: Getter/Setter for the name of the database used by this + connection. There is currently no point in setting this value + after the connection has already been established by the + constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub dbname { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_dbname} = $arg ); + $self->{_dbname}; +} + + +=head2 username + + Arg [1] : (optional) string $arg + The new value of the username used by this connection. + Example : $username = $db_connection->username() + Description: Getter/Setter for the username used by this + connection. There is currently no point in setting this value + after the connection has already been established by the + constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub username { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_username} = $arg ); + $self->{_username}; +} + + +=head2 host + + Arg [1] : (optional) string $arg + The new value of the host used by this connection. + Example : $host = $db_connection->host() + Description: Getter/Setter for the domain name of the database host use by + this connection. There is currently no point in setting + this value after the connection has already been established + by the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub host { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_host} = $arg ); + $self->{_host}; +} + + +=head2 password + + Arg [1] : (optional) string $arg + The new value of the password used by this connection. + Example : $host = $db_connection->password() + Description: Getter/Setter for the password of to use for this + connection. There is currently no point in setting + this value after the connection has already been + established by the constructor. + Returntype : string + Exceptions : none + Caller : new + Status : Stable + +=cut + +sub password { + my ( $self, $arg ) = @_; + + if ( defined($arg) ) { + # Use an anonymous subroutine that will return the password when + # invoked. This will prevent the password from being accidentally + # displayed when using e.g. Data::Dumper on a structure containing + # one of these objects. + + $self->{_password} = sub { $arg }; + } + + return ( ref( $self->{_password} ) && &{ $self->{_password} } ) || ''; +} + + + +=head2 disconnect_when_inactive + + Arg [1] : (optional) boolean $newval + Example : $db->disconnect_when_inactive(1); + Description: Getter/Setter for the disconnect_when_inactive flag. If set + to true this DBConnection will continually disconnect itself + when there are no active statement handles and reconnect as + necessary. Useful for farm environments when there can be + many (often inactive) open connections to a database at once. + Returntype : boolean + Exceptions : none + Caller : Pipeline + Status : Stable + +=cut + +sub disconnect_when_inactive { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'disconnect_when_inactive'} = $value; + if ($value) { + $self->disconnect_if_idle(); + } + } + + return $self->{'disconnect_when_inactive'}; +} + + +=head2 reconnect_when_lost + + Arg [1] : (optional) boolean $newval + Example : $db->reconnect_when_lost(1); + Description: Getter/Setter for the reconnect_when_lost flag. If set + to true the db handle will be pinged on each prepare or do statement + and the connection will be reestablished in case it's lost. + Useful for long running jobs (over 8hrs), which means that the db + connection may be lost. + Returntype : boolean + Exceptions : none + Caller : Pipeline + Status : Stable + +=cut + +sub reconnect_when_lost { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'reconnect_when_lost'} = $value; + } + + return $self->{'reconnect_when_lost'}; +} + + + +=head2 locator + + Arg [1] : none + Example : $locator = $dbc->locator; + Description: Constructs a locator string for this database connection + that can, for example, be used by the DBLoader module + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub locator { + my ($self) = @_; + + return sprintf( + "%s/host=%s;port=%s;dbname=%s;user=%s;pass=%s", + ref($self), $self->host(), $self->port(), + $self->dbname(), $self->username(), $self->password() ); +} + + +=head2 db_handle + + Arg [1] : DBI Database Handle $value + Example : $dbh = $db_connection->db_handle() + Description: Getter / Setter for the Database handle used by this + database connection. + Returntype : DBI Database Handle + Exceptions : none + Caller : new, DESTROY + Status : Stable + +=cut + +sub db_handle { + my $self = shift; + + # Use the process id ($$) as part of the key for the database handle + # this makes this object fork safe. fork() does not makes copies + # of the open socket which creates problems when one of the forked + # processes disconnects, + return $self->{'db_handle'.$$} = shift if(@_); + return $self->{'db_handle'.$$} if($self->connected); + + $self->connect(); + return $self->{'db_handle'.$$}; +} + + +=head2 prepare + + Arg [1] : string $string + the SQL statement to prepare + Example : $sth = $db_connection->prepare("SELECT column FROM table"); + Description: Prepares a SQL statement using the internal DBI database handle + and returns the DBI statement handle. + Returntype : DBI statement handle + Exceptions : thrown if the SQL statement is empty, or if the internal + database handle is not present + Caller : Adaptor modules + Status : Stable + +=cut + +sub prepare { + my ($self,@args) = @_; + + if( ! $args[0] ) { + throw("Attempting to prepare an empty SQL query."); + } + + #warn "SQL(".$self->dbname."):" . join(' ', @args) . "\n"; + if ( ($self->reconnect_when_lost()) and (!$self->db_handle()->ping()) ) { + $self->reconnect(); + } + my $sth = $self->db_handle->prepare(@args); + + # return an overridden statement handle that provides us with + # the means to disconnect inactive statement handles automatically + bless $sth, "Bio::EnsEMBL::DBSQL::StatementHandle"; + $sth->dbc($self); + $sth->sql($args[0]); + + $self->query_count($self->query_count()+1); + return $sth; +} + +=head2 reconnect + + Example : $dbcon->reconnect() + Description: Reconnects to the database using the connection attribute + information if db_handle no longer pingable. + Returntype : none + Exceptions : none + Caller : new, db_handle + Status : Stable + +=cut + +sub reconnect { + my ($self) = @_; + $self->connected(undef); + $self->db_handle(undef); + $self->connect(); + return; +} + + +=head2 do + + Arg [1] : string $string + the SQL statement to prepare + Example : $sth = $db_connection->do("SELECT column FROM table"); + Description: Executes a SQL statement using the internal DBI database handle. + Returntype : Result of DBI dbh do() method + Exceptions : thrown if the SQL statement is empty, or if the internal + database handle is not present. + Caller : Adaptor modules + Status : Stable + +=cut + +sub do { + my ($self,$string) = @_; + + if( ! $string ) { + throw("Attempting to do an empty SQL query."); + } + + # warn "SQL(".$self->dbname."): $string"; + my $error; + + my $do_result = $self->work_with_db_handle(sub { + my ($dbh) = @_; + my $result = eval { $dbh->do($string) }; + $error = $@ if $@; + return $result; + }); + + throw "Detected an error whilst executing statement '$string': $error" if $error; + + return $do_result; +} + +=head2 work_with_db_handle + + Arg [1] : CodeRef $callback + Example : my $q_t = $dbc->work_with_db_handle(sub { my ($dbh) = @_; return $dbh->quote_identifier('table'); }); + Description: Gives access to the DBI handle to execute methods not normally + provided by the DBConnection interface + Returntype : Any from callback + Exceptions : If the callback paramater is not a CodeRef; all other + errors are re-thrown after cleanup. + Caller : Adaptor modules + Status : Stable + +=cut + +sub work_with_db_handle { + my ($self, $callback) = @_; + my $wantarray = wantarray; + assert_ref($callback, 'CODE', 'callback'); + if( $self->reconnect_when_lost() && !$self->db_handle()->ping()) { + $self->reconnect(); + } + my @results; + eval { + if($wantarray) { + @results = $callback->($self->db_handle()) + } + elsif(defined $wantarray) { + $results[0] = $callback->($self->db_handle()); + } + else { + $callback->($self->db_handle()); + } + }; + my $original_error = $@; + + $self->query_count($self->query_count()+1); + eval { + if($self->disconnect_when_inactive()) { + $self->disconnect_if_idle(); + } + }; + if($@) { + warning "Detected an error whilst attempting to disconnect the DBI handle: $@"; + } + if($original_error) { + throw "Detected an error when running DBI wrapper callback:\n$original_error"; + } + + if(defined $wantarray) { + return ($wantarray) ? @results : $results[0]; + } + return; +} + +=head2 prevent_disconnect + + Arg[1] : CodeRef $callback + Example : $dbc->prevent_disconnect(sub { $dbc->do('do something'); $dbc->do('something else')}); + Description : A wrapper method which prevents database disconnection for the + duration of the callback. This is very useful if you need + to make multiple database calls avoiding excessive database + connection creation/destruction but still want the API + to disconnect after the body of work. + + The value of C is set to 0 no + matter what the original value was & after $callback has + been executed. If C was + already set to 0 then this method will be an effective no-op. + Returntype : None + Exceptions : Raised if there are issues with reverting the connection to its + default state. + Caller : DBConnection methods + Status : Beta + +=cut + +sub prevent_disconnect { + my ($self, $callback) = @_; + assert_ref($callback, 'CODE', 'callback'); + my $original_dwi = $self->disconnect_when_inactive(); + $self->disconnect_when_inactive(0); + eval { $callback->(); }; + my $original_error = $@; + eval { + $self->disconnect_when_inactive($original_dwi); + }; + if($@) { + warning "Detected an error whilst attempting to reset disconnect_when_idle: $@"; + } + if($original_error) { + throw "Detected an error when running DBI wrapper callback:\n$original_error"; + } + return; +} + +=head2 quote_identifier + + Arg [n] : scalar/ArrayRef + Example : $q = $dbc->quote_identifier('table', 'other'); + $q = $dbc->quote_identifier([undef, 'my_db', 'table'], [undef, 'my_db', 'other']); + Description: Executes the DBI C method which will quote + any given string using the database driver's quote character. + Returntype : ArrayRef + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub quote_identifier { + my ($self, @identifiers) = @_; + return $self->work_with_db_handle(sub { + my ($dbh) = @_; + my @output; + foreach my $identifier_array (@identifiers) { + $identifier_array = wrap_array($identifier_array); + push(@output, $dbh->quote_identifier(@{$identifier_array})); + } + return \@output; + }); +} + +=head2 disconnect_if_idle + + Arg [1] : none + Example : $dbc->disconnect_if_idle(); + Description: Disconnects from the database if there are no currently active + statement handles. + It is called automatically by the DESTROY method of the + Bio::EnsEMBL::DBSQL::SQL::StatementHandle if the + disconect_when_inactive flag is set. + Users may call it whenever they want to disconnect. Connection will + reestablish on next access to db_handle() + Returntype : 1 or 0 + 1=problem trying to disconnect while a statement handle was still active + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::SQL::StatementHandle::DESTROY + Bio::EnsEMBL::DBSQL::DBConnection::do + Status : Stable + +=cut + +sub disconnect_if_idle { + my $self = shift; + + return 0 if(!$self->connected()); + my $db_handle = $self->db_handle(); + return 0 unless(defined($db_handle)); + + #printf("disconnect_if_idle : kids=%d activekids=%d\n", + # $db_handle->{Kids}, $db_handle->{ActiveKids}); + + #If InactiveDestroy is set, don't disconnect. + #To comply with DBI specification + return 0 if($db_handle->{InactiveDestroy}); + + #If any statement handles are still active, don't allow disconnection + #In this case it is being called before a query has been fully processed + #either by not reading all rows of data returned, or not calling ->finish + #on the statement handle. Don't disconnect, send warning + if($db_handle->{ActiveKids} != 0) { + warn("Problem disconnect : kids=",$db_handle->{Kids}, + " activekids=",$db_handle->{ActiveKids},"\n"); + return 1; + } + + $db_handle->disconnect(); + $self->connected(undef); + $self->disconnect_count($self->disconnect_count()+1); + #print("DISCONNECT\n"); + $self->db_handle(undef); + return 0; +} + + +=head2 add_limit_clause + + Arg [1] : string $sql + Arg [2] : int $max_number + Example : my $new_sql = $dbc->add_limit_clause($sql,$max_number); + Description: Giving an SQL statement, it adds a limit clause, dependent on the database + (in MySQL, should add a LIMIT at the end, MSSQL uses a TOP clause) + Returntype : String containing the new valid SQL statement + Exceptions : none + Caller : general + Status : at risk + +=cut + + +sub add_limit_clause{ + my $self = shift; + my $sql = shift; + my $max_number = shift; + + my $new_sql = ''; + if ($self->driver eq 'mysql'){ + $new_sql = $sql . ' LIMIT ' . $max_number; + } + elsif ($self->driver eq 'odbc'){ + #need to get anything after the SELECT statement + $sql =~ /select(.*)/i; + $new_sql = 'SELECT TOP ' . $max_number . $1; + } + else{ + warning("Not possible to convert $sql to an unknow database driver: ", $self->driver, " no limit applied"); + $new_sql = $sql; + } + return $new_sql; +} + + +=head2 from_date_to_seconds + + Arg [1] : date $date + Example : my $string = $dbc->from_date_to_seconds($date); + Description: Giving a string representing a column of type date + applies the database function to convert to the number of seconds from 01-01-1970 + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub from_date_to_seconds{ + my $self= shift; + my $column = shift; + + my $string; + if ($self->driver eq 'mysql'){ + $string = "UNIX_TIMESTAMP($column)"; + } + elsif ($self->driver eq 'odbc'){ + $string = "DATEDIFF(second,'JAN 1 1970',$column)"; + } + else{ + warning("Not possible to convert $column due to an unknown database driver: ", $self->driver); + return ''; + } + return $string; +} + + +=head2 from_seconds_to_date + + Arg [1] : int $seconds + Example : my $string = $dbc->from_seconds_to_date($seconds); + Description: Giving an int representing number of seconds + applies the database function to convert to a date + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub from_seconds_to_date{ + my $self= shift; + my $seconds = shift; + + my $string; + if ($self->driver eq 'mysql'){ + if ($seconds){ + $string = "from_unixtime( ".$seconds.")"; + } + else{ + $string = "\"0000-00-00 00:00:00\""; + } + } + elsif ($self->driver eq 'odbc'){ + if ($seconds){ + $string = "DATEDIFF(date,'JAN 1 1970',$seconds)"; + } + else{ + $string = "\"0000-00-00 00:00:00\""; + } + } + else{ + warning("Not possible to convert $seconds due to an unknown database driver: ", $self->driver); + return ''; + + } + return $string; +} + +=head2 sql_helper + + Example : my $h = $dbc->sql_helper(); + Description: Lazy generated instance of L + which provides useful wrapper methods for interacting with a + DBConnection instance. + Returntype : Bio::EnsEMBL::Utils::SqlHelper + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub sql_helper { + my ($self) = @_; + if(! exists $self->{_sql_helper}) { + my $helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $self); + $self->{_sql_helper} = $helper; + } + return $self->{_sql_helper}; +} + +#### +#deprecated functions +#### + +=head2 group + + group is no longer available in DBConnection and should be accessed if needed + from an adaptor. + +=cut + +sub group { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_group} = $arg ); + deprecate "group should not be called from DBConnection but from an adaptor\n"; + return $self->{_group}; +} + +=head2 species + + species is no longer available in DBConnection and should be accessed if needed + from an adaptor. + +=cut + +sub species { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_species} = $arg ); + deprecate "species should not be called from DBConnection but from an adaptor\n"; + return $self->{_species}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DBEntryAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DBEntryAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2067 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DBEntryAdaptor - +MySQL Database queries to load and store external object references. + +=head1 SYNOPSIS + + $db_entry_adaptor = + $registry->get_adaptor( 'Human', 'Core', 'DBEntry' ); + + $db_entry = $db_entry_adaptor->fetch_by_dbID($id); + + my $gene_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Gene' ); + + my $gene = $gene_adaptor->fetch_by_stable_id('ENSG00000101367'); + + @db_entries = @{ $db_entry_adaptor->fetch_all_by_Gene($gene) }; + @gene_ids = $db_entry_adaptor->list_gene_ids_by_extids('BAB15482'); + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::DBEntryAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::IdentityXref; +use Bio::EnsEMBL::OntologyXref; + +use Bio::EnsEMBL::Utils::Exception qw(deprecate throw warning); + +use vars qw(@ISA); +use strict; + +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseAdaptor ); + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + the unique database identifier for the DBEntry to retrieve + Example : my $db_entry = $db_entry_adaptor->fetch_by_dbID($dbID); + Description: Retrieves a dbEntry from the database via its unique + identifier. + Returntype : Bio::EnsEMBL::DBEntry + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my ( $self, $dbID ) = @_; + + my $sth = $self->prepare( + "SELECT xref.xref_id, + xref.dbprimary_acc, + xref.display_label, + xref.version, + exDB.priority, + exDB.db_name, + exDB.db_display_name, + exDB.db_release, + es.synonym, + xref.info_type, + xref.info_text, + exDB.type, + exDB.secondary_db_name, + exDB.secondary_db_table, + xref.description + FROM (xref, external_db exDB) + LEFT JOIN external_synonym es ON + es.xref_id = xref.xref_id + WHERE xref.xref_id = ? + AND xref.external_db_id = exDB.external_db_id" ); + + $sth->bind_param( 1, $dbID, SQL_INTEGER ); + $sth->execute(); + + my $exDB; + + my $max_rows = 1000; + + while ( my $rowcache = $sth->fetchall_arrayref( undef, $max_rows ) ) { + #$description refers to the external_db description, while $desc was referring the xref description + while ( my $arrayref = shift( @{$rowcache} ) ) { + my ( $refID, $dbprimaryId, + $displayid, $version, + $priority, + $dbname, $db_display_name, + $release, $synonym, + $info_type, $info_text, + $type, $secondary_db_name, + $secondary_db_table, $description + ) = @$arrayref; + + if ( !defined($exDB) ) { + $exDB = + Bio::EnsEMBL::DBEntry->new( + -adaptor => $self, + -dbID => $dbID, + -primary_id => $dbprimaryId, + -display_id => $displayid, + -version => $version, + -release => $release, + -dbname => $dbname, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table, + -description => $description + ); + + + } + + if ( defined($synonym) ) { $exDB->add_synonym($synonym) } + + } ## end while ( my $arrayref = shift... + } ## end while ( my $rowcache = $sth... + + $sth->finish(); + + return $exDB; +} ## end sub fetch_by_dbID + + +sub _get_all_dm_loc_sth { + my ($self, $constraint ,$ensembl_object ) = @_; + my $object_type; + if($ensembl_object->isa("Bio::EnsEMBL::Gene")){ + $object_type = "Gene"; + } + elsif($ensembl_object->isa("Bio::EnsEMBL::Transcript")){ + $object_type = "Transcript"; + } + elsif($ensembl_object->isa("Bio::EnsEMBL::Translation")){ + $object_type = "Translation"; + } + elsif($ensembl_object->isa("Bio::EnsEMBL::Operon")){ + $object_type = "Operon"; + } + elsif($ensembl_object->isa("Bio::EnsEMBL::OperonTranscript")){ + $object_type = "OperonTranscript"; + } + else{ + warn(ref($ensembl_object)." is not a Gene Transcript or Translation object??\n"); + return undef; + } + my $sql = "SELECT xref.xref_id, + xref.dbprimary_acc, + xref.display_label, + xref.version, + exDB.priority, + exDB.db_name, + exDB.db_display_name, + exDB.db_release, + es.synonym, + xref.info_type, + xref.info_text, + exDB.type, + exDB.secondary_db_name, + exDB.secondary_db_table, + xref.description + FROM (xref, external_db exDB, dependent_xref dx, object_xref ox) + LEFT JOIN external_synonym es ON + es.xref_id = xref.xref_id + WHERE xref.external_db_id = exDB.external_db_id AND + ox.xref_id = xref.xref_id AND + ox.ensembl_object_type = \'$object_type\' AND + ox.ensembl_id = ".$ensembl_object->dbID(); + + if($constraint){ + $sql .= " AND $constraint"; + } + else{ + die "NO constraint???\n"; + } + + my $sth = $self->prepare($sql) || die "Could not prepare $sql"; + + return $self->_get_all_dm($sth); +} + +sub _get_all_dm_sth { + my ( $self, $constraint) = @_; + + my $sql = "SELECT xref.xref_id, + xref.dbprimary_acc, + xref.display_label, + xref.version, + exDB.priority, + exDB.db_name, + exDB.db_display_name, + exDB.db_release, + es.synonym, + xref.info_type, + xref.info_text, + exDB.type, + exDB.secondary_db_name, + exDB.secondary_db_table, + xref.description + FROM (xref, external_db exDB, dependent_xref dx) + LEFT JOIN external_synonym es ON + es.xref_id = xref.xref_id + WHERE xref.external_db_id = exDB.external_db_id "; + + if($constraint){ + $sql .= "AND $constraint"; + } + else{ + die "NO constraint???\n"; + } + + my $sth = $self->prepare($sql) || die "Could not prepare $sql"; + + return $self->_get_all_dm($sth); +} + + +sub _get_all_dm{ + + my ($self, $sth) = @_; + +# $sth->bind_param( 1, $dm_dbid, SQL_INTEGER ); + +# print $sth."\n"; + $sth->execute() || die "Not able to execute statement handle"; + + my @list =(); + my %seen; + + my $max_rows = 1000; + while ( my $rowcache = $sth->fetchall_arrayref(undef, $max_rows) ) { + while ( my $arrayref = shift( @{$rowcache} ) ) { + my ( $dbID, $dbprimaryId, + $displayid, $version, + $priority, + $dbname, $db_display_name, + $release, $synonym, + $info_type, $info_text, + $type, $secondary_db_name, + $secondary_db_table, $description + ) = @$arrayref; + + if ( !defined($seen{$dbID}) ) { + my $exDB = + Bio::EnsEMBL::DBEntry->new( + -adaptor => $self, + -dbID => $dbID, + -primary_id => $dbprimaryId, + -display_id => $displayid, + -version => $version, + -release => $release, + -dbname => $dbname, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table, + -description => $description + ); + + if ($synonym) { $exDB->add_synonym($synonym) }; + $seen{$dbID} = 1; + push @list, $exDB; + } + + + + } ## end while ( my $arrayref = shift... + } ## end while ( my $rowcache = $sth... + + $sth->finish(); + + return \@list; + +} + + +=head2 get_all_dependents + + Args[1] : dbID of the DBentry to get the dependents of. + Args[2] : (optional) Bio::EnsEMBL::Gene, Transcript or Translation object + Example : my @dependents = @{ $dbe_adaptor->get_all_dependents(1234) }; + Description: Get a list of DBEntrys that are depenednet on the DBEntry. + if an ensembl gene transcript or translation is given then only + the ones on that object will be given + Returntype : listref of DBEntrys. May be empty. + Exceptions : none + Caller : DBEntry->get_all_dependnets + Status : UnStable + +=cut + +sub get_all_dependents { + my ( $self, $dbid, $ensembl_object) = @_; + + if(defined($ensembl_object) and !($ensembl_object->isa("Bio::EnsEMBL::Feature") or $ensembl_object->isa("Bio::EnsEMBL::Translation"))){ + die ref($ensembl_object)." is not an Gene Transcript or Translation"; + } + + my $constraint = " dx.master_xref_id = $dbid AND dx.dependent_xref_id = xref.xref_id"; + if(defined($ensembl_object)){ + return $self->_get_all_dm_loc_sth($constraint, $ensembl_object); + } + else{ + return $self->_get_all_dm_sth($constraint, $ensembl_object); + } + +} + +=head2 get_all_masters + + Args[1] : dbID of the DBentry to get the masters of. + Args[2] : (optional) Bio::EnsEMBL::Gene, Transcript or Translation object + Example : my @masters = @{ $dbe_adaptor->get_all_masters(1234) }; + Description: Get a list of DBEntrys that are the masters of the DBEntry. + if an ensembl gene transcript or translation is given then only + the ones on that object will be given. + Returntype : listref of DBEntrys. May be empty. + Exceptions : none + Caller : DBEntry->get_all_masters + Status : UnStable + +=cut + +sub get_all_masters { + my ( $self, $dbid, $ensembl_object ) = @_; + + if(defined($ensembl_object) and !($ensembl_object->isa("Bio::EnsEMBL::Feature") or $ensembl_object->isa("Bio::EnsEMBL::Translation"))){ + die ref($ensembl_object)." is not an Gene Transcript or Translation"; + } + + my $constraint = "dx.dependent_xref_id = $dbid AND dx.master_xref_id = xref.xref_id"; + + if(defined($ensembl_object)){ + return $self->_get_all_dm_loc_sth($constraint, $ensembl_object); + } + else{ + return $self->_get_all_dm_sth($constraint, $ensembl_object); + } +# return $self->_get_all_dm($constraint, $ensembl_object); +} + + +=head fetch_all_by_name + + Arg [1] : string $name - The name of the external reference. + found in accession, display_label or synonym + Arg [2] : (optional) string $dbname - The name of the database which + the provided name is for. + + Example : my $xref = @{$dbea->fetch_all_by_name('BRAC2','HGNC')}[0]; + print $xref->description(), "\n" if($xref); + Description: Retrieves list of DBEntrys (xrefs) via a name. + The accesion is looked for first then the synonym and finally + the display_label. + NOTE $dbname this is optional but adding this speeds the + process up if you know what you are looking for. + + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : Bio::EnsEMBL::DBSQL::DBEntry + Exceptions : thrown if arguments are incorrect + Caller : general, domainview + Status : Stable + +=cut + +sub fetch_all_by_name { + my ( $self, $name, $dbname ) = @_; + + my $sql = (<prepare($sql); + $sth->bind_param( 1, $name, SQL_VARCHAR ); + $sth->bind_param( 2, $name, SQL_VARCHAR ); + if(defined $dbname){ + $sth->bind_param( 3 , $dbname, SQL_VARCHAR ); + } + $sth->execute(); + + + if ( !$sth->rows() && lc($dbname) eq 'interpro' ) { + # This is a minor hack that means that results still come back even + # when a mistake was made and no interpro accessions were loaded into + # the xref table. This has happened in the past and had the result of + # breaking domainview + + $sth->finish(); + $sth = $self->prepare( + "SELECT NULL, + i.interpro_ac, + i.id, + NULL, + NULL, + 'Interpro', + NULL, + NULL + FROM interpro i + WHERE i.interpro_ac = ?" ); + + $sth->bind_param( 1, $name, SQL_VARCHAR ); + $sth->execute(); + } + + my %exDB; + my @exDBlist; + my $max_rows = 1000; + + while ( my $rowcache = $sth->fetchall_arrayref( undef, $max_rows ) ) { + while ( my $arrayref = shift( @{$rowcache} ) ) { + my ( $dbID, $dbprimaryId, + $displayid, $version, + $priority, + $dbname, $db_display_name, + $release, $synonym, + $info_type, $info_text, + $type, $secondary_db_name, + $secondary_db_table, $description + ) = @$arrayref; + + if ( !defined $exDB{$dbID} ) { + my $entrie = + Bio::EnsEMBL::DBEntry->new( + -adaptor => $self, + -dbID => $dbID, + -primary_id => $dbprimaryId, + -display_id => $displayid, + -version => $version, + -release => $release, + -dbname => $dbname, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table, + -description => $description + ); + $exDB{$dbID} = $entrie; + push @exDBlist, $entrie; + } + if ($synonym) { $exDB{$dbID}->add_synonym($synonym) } + + } ## end while ( my $arrayref = shift... + } ## end while ( my $rowcache = $sth... + + $sth->finish(); + + return \@exDBlist; +} ## end sub fetch_all_by_name + + + +=head2 fetch_by_db_accession + + Arg [1] : string $dbname - The name of the database which the provided + accession is for. + Arg [2] : string $accession - The accesion of the external reference to + retrieve. + Example : my $xref = $dbea->fetch_by_db_accession('Interpro','IPR003439'); + print $xref->description(), "\n" if($xref); + Description: Retrieves a DBEntry (xref) via the name of the database + it is from and its primary accession in that database. + Undef is returned if the xref cannot be found in the + database. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : Bio::EnsEMBL::DBSQL::DBEntry + Exceptions : thrown if arguments are incorrect + Caller : general, domainview + Status : Stable + +=cut + +sub fetch_by_db_accession { + my ( $self, $dbname, $accession ) = @_; + + my $sth = $self->prepare( + "SELECT xref.xref_id, + xref.dbprimary_acc, + xref.display_label, + xref.version, + exDB.priority, + exDB.db_name, + exDB.db_display_name, + exDB.db_release, + es.synonym, + xref.info_type, + xref.info_text, + exDB.type, + exDB.secondary_db_name, + exDB.secondary_db_table, + xref.description + FROM (xref, external_db exDB) + LEFT JOIN external_synonym es ON + es.xref_id = xref.xref_id + WHERE xref.dbprimary_acc = ? + AND exDB.db_name = ? + AND xref.external_db_id = exDB.external_db_id" ); + + $sth->bind_param( 1, $accession, SQL_VARCHAR ); + $sth->bind_param( 2, $dbname, SQL_VARCHAR ); + $sth->execute(); + + if ( !$sth->rows() && lc($dbname) eq 'interpro' ) { + # This is a minor hack that means that results still come back even + # when a mistake was made and no interpro accessions were loaded into + # the xref table. This has happened in the past and had the result of + # breaking domainview + + $sth->finish(); + $sth = $self->prepare( + "SELECT NULL, + i.interpro_ac, + i.id, + NULL, + NULL, + 'Interpro', + NULL, + NULL + FROM interpro i + WHERE i.interpro_ac = ?" ); + + $sth->bind_param( 1, $accession, SQL_VARCHAR ); + $sth->execute(); + } + + my $exDB; + + my $max_rows = 1000; + + while ( my $rowcache = $sth->fetchall_arrayref( undef, $max_rows ) ) { + while ( my $arrayref = shift( @{$rowcache} ) ) { + my ( $dbID, $dbprimaryId, + $displayid, $version, + $priority, + $dbname, $db_display_name, + $release, $synonym, + $info_type, $info_text, + $type, $secondary_db_name, + $secondary_db_table, $description + ) = @$arrayref; + + if ( !defined($exDB) ) { + $exDB = + Bio::EnsEMBL::DBEntry->new( + -adaptor => $self, + -dbID => $dbID, + -primary_id => $dbprimaryId, + -display_id => $displayid, + -version => $version, + -release => $release, + -dbname => $dbname, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table, + -description => $description + ); + + + } + + if ($synonym) { $exDB->add_synonym($synonym) } + + } ## end while ( my $arrayref = shift... + } ## end while ( my $rowcache = $sth... + + $sth->finish(); + + return $exDB; +} ## end sub fetch_by_db_accession + + +=head2 store + + Arg [1] : Bio::EnsEMBL::DBEntry $dbEntry + The DBEntry (xref) to be stored + Arg [2] : Int $ensID + The dbID of an EnsEMBL object to associate with this external + database entry + Arg [3] : string $ensType ('Transcript', 'Translation', 'Gene') + The type of EnsEMBL object that this external database entry is + being associated with. + Arg [4] : boolean $ignore_release + If unset or zero, will require that the release string + of the DBEntry object is identical to the release of the + external database. If set and non-zero, will ignore the + release information. + Example : $dbea->store($db_entry, $transcript_id, 'Transcript'); + Description: Stores a reference to an external database (if it is not stored + already) and associates an EnsEMBL object of a specified type + with the external identifier. + Returntype : int - the dbID of the newly created external refernce + Exceptions : thrown when invalid dbID is passed to this method + Caller : scripts which load Xrefs and ObjectXrefs, etc. into Ensembl + Status : Stable + +=cut + +sub store { + my ( $self, $dbEntry, $ensID, $ensType, $ignore_release ) = @_; + + my $dbJustInserted; + + # + # backwards compatibility check: + # check if $ensID is an object; if so, use $obj->dbID + # + my $ensembl_id; + + if ( defined($ensID) ) { + if ( $ensID =~ /^\d+$/ ) { + $ensembl_id = $ensID; + } elsif ( ref($ensID) eq 'Bio::EnsEMBL::Gene' + or ref($ensID) eq 'Bio::EnsEMBL::Transcript' + or ref($ensID) eq 'Bio::EnsEMBL::Translation' + or ref($ensID) eq 'Bio::EnsEMBL::OperonTranscript' + or ref($ensID) eq 'Bio::EnsEMBL::Operon' + ) + { + warning( "You should pass DBEntryAdaptor->store() " + . "a dbID rather than an ensembl object " + . "to store the xref on" ); + + if ( defined( $ensID->dbID() ) ) { + $ensembl_id = $ensID->dbID(); + } else { + throw( sprintf( "%s %s doesn't have a dbID, can't store xref", + $ensType, $ensID->display_id() ) ); + } + } else { + throw("Invalid dbID passed to DBEntryAdaptor->store()"); + } + } + + + + # Ensure external_db contains a record of the intended xref source + my $dbRef; + $dbRef = $self->_check_external_db($dbEntry,$ignore_release); + + # Attempt to insert DBEntry + my $xref_id = $self->_store_or_fetch_xref($dbEntry,$dbRef); + $dbEntry->dbID($xref_id); #keeps DBEntry in sync with database + ### Attempt to create an object->xref mapping + if ($ensembl_id) {$self->_store_object_xref_mapping($ensembl_id,$dbEntry,$ensType)}; + + return $xref_id; +} + +sub _store_object_xref_mapping { + my $self = shift; + my $ensembl_id = shift; + my $dbEntry = shift; + my $ensembl_type = shift; + + if (not defined ($ensembl_type)) { warning("No Ensembl data type provided for new xref");} + + my $analysis_id; + if ( $dbEntry->analysis() ) { + $analysis_id = $self->db()->get_AnalysisAdaptor->store( $dbEntry->analysis() ); + } else { + $analysis_id = 0; ## This used to be undef, but uniqueness in mysql requires a value + } + + my $sth = $self->prepare(qq( + INSERT IGNORE INTO object_xref + SET xref_id = ?, + ensembl_object_type = ?, + ensembl_id = ?, + linkage_annotation = ?, + analysis_id = ? ) + ); + $sth->bind_param( 1, $dbEntry->dbID(), SQL_INTEGER ); + $sth->bind_param( 2, $ensembl_type, SQL_VARCHAR ); + $sth->bind_param( 3, $ensembl_id, SQL_INTEGER ); + $sth->bind_param( 4, $dbEntry->linkage_annotation(),SQL_VARCHAR ); + $sth->bind_param( 5, $analysis_id, SQL_INTEGER ); + $sth->execute(); + $sth->finish(); + my $object_xref_id = $self->last_insert_id(); + + $dbEntry->adaptor($self); # hand Adaptor to dbEntry for future use with OntologyXrefs + + if ($object_xref_id) { + #no existing object_xref, therefore + if ( $dbEntry->isa('Bio::EnsEMBL::IdentityXref') ) { + $sth = $self->prepare( " + INSERT ignore INTO identity_xref + SET object_xref_id = ?, + xref_identity = ?, + ensembl_identity = ?, + xref_start = ?, + xref_end = ?, + ensembl_start = ?, + ensembl_end = ?, + cigar_line = ?, + score = ?, + evalue = ?" ); + $sth->bind_param( 1, $object_xref_id, SQL_INTEGER ); + $sth->bind_param( 2, $dbEntry->xref_identity, SQL_INTEGER ); + $sth->bind_param( 3, $dbEntry->ensembl_identity, SQL_INTEGER ); + $sth->bind_param( 4, $dbEntry->xref_start, SQL_INTEGER ); + $sth->bind_param( 5, $dbEntry->xref_end, SQL_INTEGER ); + $sth->bind_param( 6, $dbEntry->ensembl_start, SQL_INTEGER ); + $sth->bind_param( 7, $dbEntry->ensembl_end, SQL_INTEGER ); + $sth->bind_param( 8, $dbEntry->cigar_line, SQL_LONGVARCHAR ); + $sth->bind_param( 9, $dbEntry->score, SQL_DOUBLE ); + $sth->bind_param( 10, $dbEntry->evalue, SQL_DOUBLE ); + $sth->execute(); + } elsif ( $dbEntry->isa('Bio::EnsEMBL::OntologyXref') ) { + $sth = $self->prepare( " + INSERT ignore INTO ontology_xref + SET object_xref_id = ?, + source_xref_id = ?, + linkage_type = ? " ); + foreach my $info ( @{ $dbEntry->get_all_linkage_info() } ) { + my ( $linkage_type, $sourceXref ) = @{$info}; + my $sourceXid = undef; + if ($sourceXref) { + $sourceXref->is_stored( $self->dbc ) || $self->store($sourceXref); + $sourceXid = $sourceXref->dbID; + } + $sth->bind_param( 1, $object_xref_id, SQL_INTEGER ); + $sth->bind_param( 2, $sourceXid, SQL_INTEGER ); + $sth->bind_param( 3, $linkage_type, SQL_VARCHAR ); + $sth->execute(); + } #end foreach + } #end elsif + } # end if ($object_xref_id) + return $object_xref_id; +} + +=head2 _check_external_db + + Arg [1] : DBEntry object + Arg [2] : Ignore version flag + Description: Looks for a record of the given external database + Exceptions : Throws on missing external database entry + Returntype : Int + +=cut + +sub _check_external_db { + my ($self,$db_entry,$ignore) = @_; + my ($sql,@bound_params,$sql_helper,$db_name,$db_release); + + $db_name = $db_entry->dbname(); + $db_release = $db_entry->release(); + $sql_helper = $self->dbc->sql_helper; + + $sql = 'SELECT external_db_id FROM external_db WHERE db_name = ?'; + push @bound_params,$db_name; + unless ($ignore) { + if ($db_release) { + $sql .= ' AND db_release = ?'; + push @bound_params,$db_release; + } else { + $sql .= ' AND db_release is NULL'; + } + } + + my ($db_id) = @{ $sql_helper->execute_simple(-SQL => $sql, -PARAMS => \@bound_params) }; + + if ($db_id) { + return $db_id; + } + else { + throw( sprintf( "external_db [%s] release [%s] does not exist", + $db_name, $db_release) + ); + } +} + +=head2 _store_or_fetch_xref + + Arg [1] : DBEntry object + Arg [2] : Database accession for external database + Description: Thread-safe method for adding xrefs, or otherwise returning + an xref ID for the inserted or retrieved xref. Also inserts + synonyms for that xref when entire new + Returns : Int - the DB ID of the xref after insertion +=cut + +sub _store_or_fetch_xref { + my $self = shift; + my $dbEntry = shift; + my $dbRef = shift; + my $xref_id; + + my $sth = $self->prepare( " + INSERT IGNORE INTO xref + SET dbprimary_acc = ?, + display_label = ?, + version = ?, + description = ?, + external_db_id = ?, + info_type = ?, + info_text = ?"); + $sth->bind_param(1, $dbEntry->primary_id,SQL_VARCHAR); + $sth->bind_param(2, $dbEntry->display_id,SQL_VARCHAR); + $sth->bind_param(3, ($dbEntry->version || q{0}),SQL_VARCHAR); + $sth->bind_param(4, $dbEntry->description,SQL_VARCHAR); + $sth->bind_param(5, $dbRef,SQL_INTEGER); + $sth->bind_param(6, ($dbEntry->info_type || 'NONE'), SQL_VARCHAR); + $sth->bind_param(7, ($dbEntry->info_text || ''), SQL_VARCHAR); + + $sth->execute(); + $xref_id = $self->last_insert_id('xref_id',undef,'xref'); + $sth->finish(); + + if ($xref_id) { #insert was successful, store supplementary synonyms + # thread safety no longer an issue. + my $synonym_check_sth = $self->prepare( + "SELECT xref_id, synonym + FROM external_synonym + WHERE xref_id = ? + AND synonym = ?"); + + my $synonym_store_sth = $self->prepare( + "INSERT ignore INTO external_synonym + SET xref_id = ?, synonym = ?"); + + my $synonyms = $dbEntry->get_all_synonyms(); + foreach my $syn ( @$synonyms ) { + $synonym_check_sth->bind_param(1,$xref_id,SQL_INTEGER); + $synonym_check_sth->bind_param(2,$syn,SQL_VARCHAR); + $synonym_check_sth->execute(); + my ($dbSyn) = $synonym_check_sth->fetchrow_array(); + $synonym_store_sth->bind_param(1,$xref_id,SQL_INTEGER); + $synonym_store_sth->bind_param(2,$syn,SQL_VARCHAR); + $synonym_store_sth->execute() if(!$dbSyn); + } + $synonym_check_sth->finish(); + $synonym_store_sth->finish(); + + } else { # xref_id already exists, retrieve it according to fields in the unique key + my $sql = 'SELECT xref_id FROM xref + WHERE dbprimary_acc = ? + AND version =? + AND external_db_id = ? + AND info_type = ? + AND info_text = ?'; + my $info_type = $dbEntry->info_type() || 'NONE'; + my $info_text = $dbEntry->info_text() || q{}; + my $version = $dbEntry->version() || q{0}; + $sth = $self->prepare( $sql ); + $sth->bind_param(1, $dbEntry->primary_id,SQL_VARCHAR); + $sth->bind_param(2, $version, SQL_VARCHAR); + $sth->bind_param(3, $dbRef, SQL_INTEGER); + $sth->bind_param(4, $info_type, SQL_VARCHAR); + $sth->bind_param(5, $info_text, SQL_VARCHAR); + $sth->execute(); + ($xref_id) = $sth->fetchrow_array(); + $sth->finish; + if(!$xref_id) { + my $msg = 'Cannot find an xref id for %s (version=%d) with external db id %d.'; + throw(sprintf($msg, $dbEntry->primary_id(), $version, $dbRef)) + } + } + + return $xref_id; +} + +=head2 exists + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + Example : if($dbID = $db_entry_adaptor->exists($dbe)) { do stuff; } + Description: Returns the db id of this DBEntry if it exists in this database + otherwise returns undef. Exists is defined as an entry with + the same external_db and display_id + Returntype : int + Exceptions : thrown on incorrect args + Caller : GeneAdaptor::store, TranscriptAdaptor::store + Status : Stable + +=cut + +sub exists { + my ($self, $dbe) = @_ ; + + unless($dbe && ref $dbe && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw("arg must be a Bio::EnsEMBL::DBEntry not [$dbe]"); + } + + my $sth = $self->prepare('SELECT x.xref_id + FROM xref x, external_db xdb + WHERE x.external_db_id = xdb.external_db_id + AND x.display_label = ? + AND xdb.db_name = ? + AND x.dbprimary_acc = ?'); + + $sth->bind_param(1,$dbe->display_id,SQL_VARCHAR); + $sth->bind_param(2,$dbe->dbname,SQL_VARCHAR); + $sth->bind_param(3,$dbe->primary_id,SQL_VARCHAR); + $sth->execute(); + + my ($dbID) = $sth->fetchrow_array; + + $sth->finish; + + return $dbID; +} + + +=head2 fetch_all_by_Gene + + Arg [1] : Bio::EnsEMBL::Gene $gene + (The gene to retrieve DBEntries for) + Arg [2] : optional external database name. SQL wildcards are accepted + Arg [3] : optional external_db type. SQL wildcards are accepted + Example : @db_entries = @{$db_entry_adaptor->fetch_all_by_Gene($gene)}; + Description: This returns a list of DBEntries associated with this gene. + Note that this method was changed in release 15. Previously + it set the DBLinks attribute of the gene passed in to contain + all of the gene, transcript, and translation xrefs associated + with this gene. + Returntype : listref of Bio::EnsEMBL::DBEntries; may be of type IdentityXref if + there is mapping data, or OntologyXref if there is linkage data. + Exceptions : thows if gene object not passed + Caller : Bio::EnsEMBL::Gene + Status : Stable + +=cut + +sub fetch_all_by_Gene { + my ( $self, $gene, $ex_db_reg, $exdb_type ) = @_; + + if(!ref($gene) || !$gene->isa('Bio::EnsEMBL::Gene')) { + throw("Bio::EnsEMBL::Gene argument expected."); + } + + return $self->_fetch_by_object_type($gene->dbID(), 'Gene', $ex_db_reg, $exdb_type); +} + +=head2 fetch_all_by_Operon + + Arg [1] : Bio::EnsEMBL::Operon $operon + (The operon to retrieve DBEntries for) + Arg [2] : optional external database name. SQL wildcards are accepted + Arg [3] : optional external_db type. SQL wildcards are accepted + Example : @db_entries = @{$db_entry_adaptor->fetch_all_by_Operon($operon)}; + Description: This returns a list of DBEntries associated with this operon. + Returntype : listref of Bio::EnsEMBL::DBEntries; may be of type IdentityXref if + there is mapping data, or OntologyXref if there is linkage data. + Exceptions : thows if operon object not passed + Caller : general + +=cut + +sub fetch_all_by_Operon { + my ( $self, $gene, $ex_db_reg, $exdb_type ) = @_; + + if(!ref($gene) || !$gene->isa('Bio::EnsEMBL::Operon')) { + throw("Bio::EnsEMBL::Operon argument expected."); + } + + return $self->_fetch_by_object_type($gene->dbID(), 'Operon', $ex_db_reg, $exdb_type); +} + + +=head2 fetch_all_by_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript + Arg [2] : optional external database name. SQL wildcards are accepted + Arg [3] : optional external_db type. SQL wildcards are accepted + Example : @db_entries = @{$db_entry_adaptor->fetch_all_by_Transcript($trans)}; + Description: This returns a list of DBEntries associated with this + transcript. Note that this method was changed in release 15. + Previously it set the DBLinks attribute of the gene passed in + to contain all of the gene, transcript, and translation xrefs + associated with this gene. + Returntype : listref of Bio::EnsEMBL::DBEntries; may be of type IdentityXref if + there is mapping data, or OntologyXref if there is linkage data. + Exceptions : throes if transcript argument not passed + Caller : Bio::EnsEMBL::Transcript + Status : Stable + +=cut + +sub fetch_all_by_Transcript { + my ( $self, $trans, $ex_db_reg, $exdb_type ) = @_; + + if(!ref($trans) || !$trans->isa('Bio::EnsEMBL::Transcript')) { + throw("Bio::EnsEMBL::Transcript argument expected."); + } + + return $self->_fetch_by_object_type( $trans->dbID(), 'Transcript', $ex_db_reg, $exdb_type); +} + + +=head2 fetch_all_by_Translation + + Arg [1] : Bio::EnsEMBL::Translation $trans + (The translation to fetch database entries for) + Arg [2] : optional external database name. SQL wildcards are accepted + Arg [3] : optional externaldb type. SQL wildcards are accepted + Example : @db_entries = @{$db_entry_adptr->fetch_all_by_Translation($trans)}; + Description: Retrieves external database entries for an EnsEMBL translation + Returntype : listref of Bio::EnsEMBL::DBEntries; may be of type IdentityXref if + there is mapping data, or OntologyXref if there is linkage data. + Exceptions : throws if translation object not passed + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Translation { + my ( $self, $trans, $ex_db_reg, $exdb_type ) = @_; + + if(!ref($trans) || !$trans->isa('Bio::EnsEMBL::Translation')) { + throw('Bio::EnsEMBL::Translation argument expected.'); + } + if( ! $trans->dbID ){ + warning( "Cannot fetch_all_by_Translation without a dbID" ); + return []; + } + + return $self->_fetch_by_object_type( $trans->dbID(), 'Translation', $ex_db_reg, $exdb_type ); +} + + + +=head2 remove_from_object + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe - The external reference which + is to be disassociated from an ensembl object. + Arg [2] : Bio::EnsEMBL::Storable $object - The ensembl object the + external reference is to be disassociated from + Arg [3] : string $object_type - The type of the ensembl object. + E.g. 'Gene', 'Transcript', 'Translation' + Example : + # remove all dbentries from this translation + foreach my $dbe (@{$translation->get_all_DBEntries()}) { + $dbe_adaptor->remove($dbe, $translation, 'Translation'); + } + Description: Removes an association between an ensembl object and a + DBEntry (xref). This does not remove the actual xref from + the database, only its linkage to the ensembl object. + Returntype : none + Exceptions : Throw on incorrect arguments. + Warning if object or dbentry is not stored in this database. + Caller : TranscriptAdaptor::remove, GeneAdaptor::remove, + TranslationAdaptor::remove + Status : Stable + +=cut + +sub remove_from_object { + my $self = shift; + my $dbe = shift; + my $object = shift; + my $object_type = shift; + + if(!ref($dbe) || !$dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw("Bio::EnsEMBL::DBEntry argument expected."); + } + + if(!ref($object) || !$dbe->isa('Bio::EnsEMBL::Storable')) { + throw("Bio::EnsEMBL::Storable argument expected."); + } + + if(!$object_type) { + throw("object_type string argument expected."); + } + + # make sure both the dbentry and the object it is allegedly linked to + # are stored in this database + + if(!$object->is_stored($self->db())) { + warning("Cannot remove DBEntries for $object_type " . $object->dbID() . + ". Object is not stored in this database."); + return; + } + + if(!$dbe->is_stored($self->db())) { + warning("Cannot remove DBEntry ".$dbe->dbID() . ". Is not stored " . + "in this database."); + return; + } + + # obtain the identifier of the link from the object_xref table + #No need to compare linkage_annotation here + my $sth = $self->prepare + ("SELECT ox.object_xref_id " . + "FROM object_xref ox ". + "WHERE ox.xref_id = ? " . + "AND ox.ensembl_id = ? " . + "AND ox.ensembl_object_type = ?"); + $sth->bind_param(1,$dbe->dbID,SQL_INTEGER); + $sth->bind_param(2,$object->dbID,SQL_INTEGER); + $sth->bind_param(3,$object_type,SQL_VARCHAR); + $sth->execute(); + + if(!$sth->rows() == 1) { + $sth->finish(); + return; + } + + my ($ox_id) = $sth->fetchrow_array(); + $sth->finish(); + + # delete from the tables which contain additional linkage information + + $sth = $self->prepare("DELETE FROM ontology_xref WHERE object_xref_id = ?"); + $sth->bind_param(1,$ox_id,SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + $sth = $self->prepare("DELETE FROM identity_xref WHERE object_xref_id = ?"); + $sth->bind_param(1,$ox_id,SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + # delete the actual linkage itself + $sth = $self->prepare("DELETE FROM object_xref WHERE object_xref_id = ?"); + $sth->bind_param(1,$ox_id,SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + return; +} + + +=head2 _fetch_by_object_type + + Arg [1] : string $ensID + Arg [2] : string $ensType (object type to be returned) + Arg [3] : optional $exdbname (external database name) + (may be an SQL pattern containing '%' which matches any + number of characters) + Arg [4] : optional $exdb_type (external database type) + (may be an SQL pattern containing '%' which matches any + number of characters) + Example : $self->_fetch_by_object_type( $translation_id, 'Translation' ) + Description: Fetches DBEntry by Object type + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + + + Returntype : arrayref of DBEntry objects; may be of type IdentityXref if + there is mapping data, or OntologyXref if there is linkage data. + Exceptions : none + Caller : fetch_all_by_Gene + fetch_all_by_Translation + fetch_all_by_Transcript + Status : Stable + +=cut + +sub _fetch_by_object_type { + my ( $self, $ensID, $ensType, $exdbname, $exdb_type ) = @_; + + my @out; + + if ( !defined($ensID) ) { + throw("Can't fetch_by_EnsObject_type without an object"); + } + + if ( !defined($ensType) ) { + throw("Can't fetch_by_EnsObject_type without a type"); + } + + # my $sth = $self->prepare(" + my $sql = (<dbc()->db_handle()->quote( $exdbname, SQL_VARCHAR ); + } else { + $sql .= " AND exDB.db_name = " + . $self->dbc()->db_handle()->quote( $exdbname, SQL_VARCHAR ); + } + } + + if ( defined($exdb_type) ) { + if ( index( $exdb_type, '%' ) != -1 ) { + $sql .= " AND exDB.type LIKE " + . $self->dbc()->db_handle()->quote( $exdb_type, SQL_VARCHAR ); + } else { + $sql .= " AND exDB.type = " + . $self->dbc()->db_handle()->quote( $exdb_type, SQL_VARCHAR ); + } + } + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $ensID, SQL_INTEGER ); + $sth->bind_param( 2, $ensType, SQL_VARCHAR ); + $sth->execute(); + + my ( %seen, %linkage_types, %synonyms ); + + my $max_rows = 1000; + + while ( my $rowcache = $sth->fetchall_arrayref( undef, $max_rows ) ) { + while ( my $arrRef = shift( @{$rowcache} ) ) { + my ( $refID, $dbprimaryId, + $displayid, $version, + $priority, + $dbname, $release, + $exDB_status, $exDB_db_display_name, + $exDB_secondary_db_name, $exDB_secondary_db_table, + $objid, $synonym, + $xrefid, $ensemblid, + $xref_start, $xref_end, + $ensembl_start, $ensembl_end, + $cigar_line, $score, + $evalue, $analysis_id, + $linkage_type, $info_type, + $info_text, $type, + $source_xref_id, $link_annotation, + $description + ) = @$arrRef; + + my $linkage_key = + ( $linkage_type || '' ) . ( $source_xref_id || '' ); + + + my $analysis = undef; + if ( defined($analysis_id) ) { + $analysis = + $self->db()->get_AnalysisAdaptor()->fetch_by_dbID($analysis_id); + } + + my %obj_hash = ( 'adaptor' => $self, + 'dbID' => $refID, + 'primary_id' => $dbprimaryId, + 'display_id' => $displayid, + 'version' => $version, + 'release' => $release, + 'info_type' => $info_type, + 'info_text' => $info_text, + 'type' => $type, + 'secondary_db_name' => $exDB_secondary_db_name, + 'secondary_db_table' => $exDB_secondary_db_table, + 'dbname' => $dbname, + 'description' => $description, + 'linkage_annotation' => $link_annotation, + 'analysis' => $analysis, + 'ensembl_object_type' => $ensType, + 'ensembl_id' => $ensID ); + + # Using an outer join on the synonyms as well as on identity_xref, + # we now have to filter out the duplicates (see v.1.18 for + # original). Since there is at most one identity_xref row per + # xref, this is easy enough; all the 'extra' bits are synonyms. + my $source_xref; + if ( !$seen{$refID} ) { + + my $exDB; + if ( ( defined($xrefid) ) ) { # an xref with similarity scores + $exDB = Bio::EnsEMBL::IdentityXref->new_fast( \%obj_hash ); + $exDB->xref_identity($xrefid); + $exDB->ensembl_identity($ensemblid); + + $exDB->cigar_line($cigar_line); + $exDB->xref_start($xref_start); + $exDB->xref_end($xref_end); # was not here before 14th Jan 2009 ???? + $exDB->ensembl_start($ensembl_start); + $exDB->ensembl_end($ensembl_end); + $exDB->score($score); + $exDB->evalue($evalue); + + } elsif ( defined $linkage_type && $linkage_type ne "" ) { + $exDB = Bio::EnsEMBL::OntologyXref->new_fast( \%obj_hash ); + $source_xref = ( defined($source_xref_id) + ? $self->fetch_by_dbID($source_xref_id) + : undef ); + $exDB->add_linkage_type( $linkage_type, $source_xref || () ); + $linkage_types{$refID}->{$linkage_key} = 1; + + } else { + $exDB = Bio::EnsEMBL::DBEntry->new_fast( \%obj_hash ); + } + + if ( defined($exDB_status) ) { $exDB->status($exDB_status) } + + $exDB->priority($priority); + $exDB->db_display_name($exDB_db_display_name); + + push( @out, $exDB ); + $seen{$refID} = $exDB; + + } ## end if ( !$seen{$refID} ) + + # $exDB still points to the same xref, so we can keep adding GO + # evidence tags or synonyms. + + if ( defined($synonym) && !$synonyms{$refID}->{$synonym} ) { + if ( defined($synonym) ) { + $seen{$refID}->add_synonym($synonym); + } + $synonyms{$refID}->{$synonym} = 1; + } + + if ( defined($linkage_type) + && $linkage_type ne "" + && !$linkage_types{$refID}->{$linkage_key} ) + { + $source_xref = ( defined($source_xref_id) + ? $self->fetch_by_dbID($source_xref_id) + : undef ); + $seen{$refID} + ->add_linkage_type( $linkage_type, $source_xref || () ); + $linkage_types{$refID}->{$linkage_key} = 1; + } + } ## end while ( my $arrRef = shift... + } ## end while ( my $rowcache = $sth... + + return \@out; +} ## end sub _fetch_by_object_type + +=head2 list_gene_ids_by_external_db_id + + Arg [1] : string $external_id + Example : @gene_ids = $dbea->list_gene_ids_by_external_db_id(1020); + Description: Retrieve a list of geneid by an external identifier that + is linked to any of the genes transcripts, translations + or the gene itself. + NOTE: If more than one external identifier has the + same primary accession then genes for each of these is + returned. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : list of ints + Exceptions : none + Caller : unknown + Status : Stable + +=cut + +sub list_gene_ids_by_external_db_id{ + my ($self,$external_db_id) = @_; + + my %T = map { ($_, 1) } + $self->_type_by_external_db_id( $external_db_id, 'Translation', 'gene' ), + $self->_type_by_external_db_id( $external_db_id, 'Transcript', 'gene' ), + $self->_type_by_external_db_id( $external_db_id, 'Gene' ); + return keys %T; +} + +=head2 list_gene_ids_by_extids + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Arg [3] : Boolean override, see _type_by_external_id + Example : @gene_ids = $dbea->list_gene_ids_by_extids('CDPX'); + Description: Retrieve a list of geneid by an external identifier that is + linked to any of the genes transcripts, translations or the + gene itself + Returntype : list of ints + Exceptions : none + Caller : unknown + Status : Stable + +=cut + +sub list_gene_ids_by_extids { + my ( $self, $external_name, $external_db_name, $override ) = @_; + + my %T = map { ( $_, 1 ) } + $self->_type_by_external_id( $external_name, 'Translation', 'gene', + $external_db_name, $override ), + $self->_type_by_external_id( $external_name, 'Transcript', 'gene', + $external_db_name, $override ), + $self->_type_by_external_id( $external_name, 'Gene', undef, + $external_db_name, $override ); + + return keys %T; +} + + +=head2 list_transcript_ids_by_extids + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Arg [3] : Boolean override, see _type_by_external_id + Example : @tr_ids = $dbea->list_transcript_ids_by_extids('BCRA2'); + Description: Retrieve a list transcript ids by an external identifier that + is linked to any of the genes transcripts, translations or the + gene itself + Returntype : list of ints + Exceptions : none + Caller : unknown + Status : Stable + +=cut + +sub list_transcript_ids_by_extids { + my ( $self, $external_name, $external_db_name, $override ) = @_; + + my %T = map { ( $_, 1 ) } + $self->_type_by_external_id( $external_name, 'Translation', + 'transcript', $external_db_name, $override + ), + $self->_type_by_external_id( $external_name, 'Transcript', undef, + $external_db_name, $override ); + + return keys %T; +} + + +=head2 list_translation_ids_by_extids + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Arg [3] : Boolean override, see _type_by_external_id + Example : @tr_ids = $dbea->list_translation_ids_by_extids('GO:0004835'); + Description: Gets a list of translation IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : Stable + +=cut + +sub list_translation_ids_by_extids { + my ( $self, $external_name, $external_db_name, $override ) = @_; + + return + $self->_type_by_external_id( $external_name, 'Translation', undef, + $external_db_name, $override ); +} + +=head2 _type_by_external_id + + Arg [1] : string $name - dbprimary_acc + Arg [2] : string $ensType - ensembl_object_type + Arg [3] : (optional) string $extraType + Arg [4] : (optional) string $external_db_name + other object type to be returned + Arg [5] : Boolean override to force _ to be treated as an SQL 'any' + This is usually optimised out for query speed due to + large numbers of names like NM_00... + Example : $self->_type_by_external_id($name, 'Translation'); + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + SQL wildcards can be used in the external id, + but overly generic queries (two characters) will be prevented. + Description: Gets + Returntype : list of dbIDs (gene_id, transcript_id, etc.) + Exceptions : none + Caller : list_translation_ids_by_extids + translationids_by_extids + geneids_by_extids + Status : Stable + +=cut + +sub _type_by_external_id { + my ( $self, $name, $ensType, $extraType, $external_db_name, $override ) = @_; + + # $name has SQL wildcard support + # = or LIKE put into SQL statement, and open queries like % or A% are rejected. + my $comparison_operator; + if ($name =~ /[_%\[]/ ) { + $comparison_operator = "LIKE"; + if ($name =~ /^.?%/ && !$override) { + warn "External $ensType name $name is too vague and will monopolise database resources. Please use a more specific $ensType name.\n"; + return; + } + elsif ($name =~ /^\w\w_/ && !$override) { + # For entries such as NM_00000065, escape the _ so that SQL LIKE does not have to scan entire table + # Escape only the _ in the third character position + $name =~ s/(?<=\w\w)(?=_)/\\/; + } + } + else { + $comparison_operator = "="; + } + + + my $from_sql = ''; + my $where_sql = ''; + my $ID_sql = 'oxr.ensembl_id'; + + if ( defined($extraType) ) { + if ( lc($extraType) eq 'translation' ) { + $ID_sql = 'tl.translation_id'; + } else { + $ID_sql = "t.${extraType}_id"; + } + + if ( lc($ensType) eq 'translation' ) { + $from_sql = 'transcript t, translation tl, '; + $where_sql = qq( + t.transcript_id = tl.transcript_id AND + tl.translation_id = oxr.ensembl_id AND + t.is_current = 1 AND + ); + } else { + $from_sql = 'transcript t, '; + $where_sql = 't.' + . lc($ensType) + . '_id = oxr.ensembl_id AND ' + . 't.is_current = 1 AND '; + } + } + + my $multispecies = $self->db()->is_multispecies(); + + if ( lc($ensType) eq 'gene' ) { + $from_sql = 'gene g, '; + $from_sql .= 'seq_region s, coord_system cs, ' if $multispecies; + + $where_sql = 'g.gene_id = oxr.ensembl_id AND g.is_current = 1 AND '; + if($multispecies) { + $where_sql .= <<'SQL'; +g.seq_region_id = s.seq_region_id AND +s.coord_system_id = cs.coord_system_id AND +cs.species_id = ? AND +SQL + } + } + elsif ( lc($ensType) eq 'transcript' ) { + $from_sql = 'transcript t, '; + $from_sql .= 'seq_region s, coord_system cs, ' if $multispecies; + + $where_sql = 't.transcript_id = oxr.ensembl_id AND t.is_current = 1 AND '; + if($multispecies) { + $where_sql .= <<'SQL'; +t.seq_region_id = s.seq_region_id AND +s.coord_system_id = cs.coord_system_id AND +cs.species_id = ? AND +SQL + } + } + elsif ( lc($ensType) eq 'translation' ) { + $from_sql = 'translation tl, transcript t, '; + $from_sql .= 'seq_region s, coord_system cs, ' if $multispecies; + + $where_sql = 't.transcript_id = tl.transcript_id AND tl.translation_id = oxr.ensembl_id AND t.is_current = 1 AND '; + if($multispecies) { + $where_sql .= <<'SQL'; +t.seq_region_id = s.seq_region_id AND +s.coord_system_id = cs.coord_system_id AND +cs.species_id = ? AND +SQL + } + } + + if ( defined($external_db_name) ) { + # Involve the 'external_db' table to limit the hits to a particular + # external database. + + $from_sql .= 'external_db xdb, '; + $where_sql .= + 'xdb.db_name LIKE ' + . $self->dbc()->db_handle()->quote( $external_db_name . '%' ) + . ' AND xdb.external_db_id = x.external_db_id AND'; + } + + my $query1 = qq( + SELECT $ID_sql + FROM $from_sql + xref x, + object_xref oxr + WHERE $where_sql + ( x.dbprimary_acc $comparison_operator ? OR x.display_label $comparison_operator ? ) + AND x.xref_id = oxr.xref_id + AND oxr.ensembl_object_type = ? + ); + + my $query2; + + if ( defined($external_db_name) ) { + # If we are given the name of an external database, we need to join + # between the 'xref' and the 'object_xref' tables on 'xref_id'. + + $query2 = qq( + SELECT $ID_sql + FROM $from_sql + external_synonym syn, + object_xref oxr, + xref x + WHERE $where_sql + syn.synonym $comparison_operator ? + AND syn.xref_id = oxr.xref_id + AND oxr.ensembl_object_type = ? + AND x.xref_id = oxr.xref_id); + + } else { + # If we weren't given an external database name, we can get away + # with less joins here. + + $query2 = qq( + SELECT $ID_sql + FROM $from_sql + external_synonym syn, + object_xref oxr + WHERE $where_sql + syn.synonym $comparison_operator ? + AND syn.xref_id = oxr.xref_id + AND oxr.ensembl_object_type = ?); + + } + + my %result; + + my $sth = $self->prepare($query1); + + my $queryBind = 1; + $sth->bind_param( $queryBind++, $self->species_id(), SQL_INTEGER ) if $multispecies; + $sth->bind_param( $queryBind++, $name, SQL_VARCHAR ); + $sth->bind_param( $queryBind++, $name, SQL_VARCHAR ); + $sth->bind_param( $queryBind++, $ensType, SQL_VARCHAR ); + $sth->execute(); + my $r; + while ( $r = $sth->fetchrow_array() ) { $result{$r} = 1 } + + $sth = $self->prepare($query2); + + $queryBind = 1; + $sth->bind_param( $queryBind++, $self->species_id(), SQL_INTEGER ) if $multispecies; + $sth->bind_param( $queryBind++, $name, SQL_VARCHAR ); + $sth->bind_param( $queryBind++, $ensType, SQL_VARCHAR ); + $sth->execute(); + + while ( $r = $sth->fetchrow_array() ) { $result{$r} = 1 } + + return keys(%result); + +} ## end sub _type_by_external_id + +=head2 _type_by_external_db_id + + Arg [1] : integer $type - external_db_id + Arg [2] : string $ensType - ensembl_object_type + Arg [3] : (optional) string $extraType + other object type to be returned + Example : $self->_type_by_external_db_id(1030, 'Translation'); + Description: Gets. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : list of dbIDs (gene_id, transcript_id, etc.) + Exceptions : none + Caller : list_translation_ids_by_extids + translationids_by_extids + geneids_by_extids + Status : Stable + +=cut + +sub _type_by_external_db_id{ + my ($self, $external_db_id, $ensType, $extraType) = @_; + + my $from_sql = ''; + my $where_sql = ''; + my $ID_sql = "oxr.ensembl_id"; + + if (defined $extraType) { + if (lc($extraType) eq 'translation') { + $ID_sql = "tl.translation_id"; + } else { + $ID_sql = "t.${extraType}_id"; + } + + if (lc($ensType) eq 'translation') { + $from_sql = 'transcript t, translation tl, '; + $where_sql = qq( + t.transcript_id = tl.transcript_id AND + tl.translation_id = oxr.ensembl_id AND + t.is_current = 1 AND + ); + } else { + $from_sql = 'transcript t, '; + $where_sql = 't.'.lc($ensType).'_id = oxr.ensembl_id AND '. + 't.is_current = 1 AND '; + } + } + + if (lc($ensType) eq 'gene') { + $from_sql = 'gene g, '; + $where_sql = 'g.gene_id = oxr.ensembl_id AND g.is_current = 1 AND '; + } elsif (lc($ensType) eq 'transcript') { + $from_sql = 'transcript t, '; + $where_sql = 't.transcript_id = oxr.ensembl_id AND t.is_current = 1 AND '; + } elsif (lc($ensType) eq 'translation') { + $from_sql = 'transcript t, translation tl, '; + $where_sql = qq( + t.transcript_id = tl.transcript_id AND + tl.translation_id = oxr.ensembl_id AND + t.is_current = 1 AND + ); + } + + my $query = + "SELECT $ID_sql + FROM $from_sql xref x, object_xref oxr + WHERE $where_sql x.external_db_id = ? AND + x.xref_id = oxr.xref_id AND oxr.ensembl_object_type= ?"; + + my %result; + + my $sth = $self->prepare($query); + + $sth->bind_param( 1, "$external_db_id", SQL_VARCHAR ); + $sth->bind_param( 2, $ensType, SQL_VARCHAR ); + $sth->execute(); + + while ( my $r = $sth->fetchrow_array() ) { $result{$r} = 1 } + + return keys(%result); +} + + +=head2 fetch_all_by_description + + Arg [1] : string description to search for. Include % etc in this string + Arg [2] : string $dbname. Name of the database to search + + Example : @canc_refs = @{$db_entry_adaptor->fetch_all_by_description("%cancer%")}; + @db_entries = @{$db_entry_adaptor->fetch_all_by_description("%cancer%","MIM_MORBID")}; + Description: Retrieves DBEntries that match the description. + Optionally you can search on external databases type. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : ref to array of Bio::EnsEMBL::DBSQL::DBEntry + Exceptions : None. + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_description { + my ( $self, $description, $dbname ) = @_; + + my @results = (); + + my $sql = + "SELECT xref.xref_id, xref.dbprimary_acc, xref.display_label, + xref.version, + exDB.priority, + exDB.db_name, exDB.db_display_name, exDB.db_release, es.synonym, + xref.info_type, xref.info_text, exDB.type, exDB.secondary_db_name, + exDB.secondary_db_table, xref.description + FROM (xref, external_db exDB) + LEFT JOIN external_synonym es on es.xref_id = xref.xref_id + WHERE xref.description like ? + AND xref.external_db_id = exDB.external_db_id"; + + if ( defined($dbname) ) { $sql .= " AND exDB.db_name = ? " } + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $description, SQL_VARCHAR ); + + if ( defined($dbname) ) { + $sth->bind_param( 2, $dbname, SQL_VARCHAR ); + } + + $sth->execute(); + + my $max_rows = 1000; + + while ( my $rowcache = $sth->fetchall_arrayref( undef, $max_rows ) ) { + while ( my $arrayref = shift( @{$rowcache} ) ) { + my ( $dbID, $dbprimaryId, + $displayid, $version, + $priority, + $ex_dbname, $db_display_name, + $release, $synonym, + $info_type, $info_text, + $type, $secondary_db_name, + $secondary_db_table, $description + ) = @$arrayref; + + my $exDB = + Bio::EnsEMBL::DBEntry->new( + -adaptor => $self, + -dbID => $dbID, + -primary_id => $dbprimaryId, + -display_id => $displayid, + -version => $version, + -release => $release, + -dbname => $ex_dbname, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table, + -description => $description + ); + + if ($synonym) { $exDB->add_synonym($synonym) } + + push @results, $exDB; + + } ## end while ( my $arrayref = shift... + } ## end while ( my $rowcache = $sth... + + $sth->finish(); + + return \@results; +} ## end sub fetch_all_by_description + + +=head2 fetch_all_by_source + + Arg [1] : string source to search for. Include % etc in this string + if you want to use SQL patterns + + Example : @unigene_refs = @{$db_entry_adaptor->fetch_all_by_source("%unigene%")}; + Description: Retrieves DBEntrys that match the source name. + Returntype : ref to array of Bio::EnsEMBL::DBSQL::DBEntry + Exceptions : None. + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_source { + my ( $self, $source ) = @_; + + my @results = (); + + my $sql = + "SELECT xref.xref_id, xref.dbprimary_acc, xref.display_label, + xref.version, + exDB.priority, + exDB.db_name, exDB.db_display_name, exDB.db_release, es.synonym, + xref.info_type, xref.info_text, exDB.type, exDB.secondary_db_name, + exDB.secondary_db_table, xref.description + FROM (xref, external_db exDB) + LEFT JOIN external_synonym es on es.xref_id = xref.xref_id + WHERE exDB.db_name like ? + AND xref.external_db_id = exDB.external_db_id"; + + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $source, SQL_VARCHAR ); + + $sth->execute(); + + my $max_rows = 1000; + + while ( my $rowcache = $sth->fetchall_arrayref( undef, $max_rows ) ) { + while ( my $arrayref = shift( @{$rowcache} ) ) { + my ( $dbID, $dbprimaryId, + $displayid, $version, + $priority, + $dbname, $db_display_name, + $release, $synonym, + $info_type, $info_text, + $type, $secondary_db_name, + $secondary_db_table, $description + ) = @$arrayref; + + my $exDB = + Bio::EnsEMBL::DBEntry->new( + -adaptor => $self, + -dbID => $dbID, + -primary_id => $dbprimaryId, + -display_id => $displayid, + -version => $version, + -release => $release, + -dbname => $dbname, + -priority => $priority, + -db_display_name => $db_display_name, + -info_type => $info_type, + -info_text => $info_text, + -type => $type, + -secondary_db_name => $secondary_db_name, + -secondary_db_table => $secondary_db_table, + -description => $description + ); + + if ($synonym) { $exDB->add_synonym($synonym) } + + push @results, $exDB; + + } ## end while ( my $arrayref = shift... + } ## end while ( my $rowcache = $sth... + + $sth->finish(); + + return \@results; +} ## end sub fetch_all_by_source + + +=head2 fetch_all_synonyms + + Arg [1] : dbID of DBEntry to fetch synonyms for. Used in lazy loading of synonyms. + + Example : @canc_refs = @{$db_entry_adaptor->fetch_all_synonyms(1234)}; + Description: Fetches the synonyms for a particular DBEntry. + Returntype : listref of synonyms. List referred to may be empty if there are no synonyms. + Exceptions : None. + Caller : General + Status : At Risk + +=cut + + +sub fetch_all_synonyms { + my ( $self, $dbID ) = @_; + + my @synonyms = (); + + my $sth = + $self->prepare( "SELECT synonym " + . "FROM external_synonym " + . "WHERE xref_id = ?" ); + + $sth->bind_param( 1, $dbID, SQL_INTEGER ); + + $sth->execute(); + + my $synonym; + $sth->bind_col(1, \$synonym); + + while ( $sth->fetch() ) { + push( @synonyms, $synonym ); + } + + return \@synonyms; +} + + +=head2 get_db_name_from_external_db_id + + Arg [1] : external_dbid of database to get the database_name + Example : my $db_name = $db_entry_adaptor->get_db_name_from_external_db_id(1100); + Description: Gets the database name for a certain external_db_id + Returntype : scalar + Exceptions : None. + Caller : General + Status : At Risk + +=cut + +sub get_db_name_from_external_db_id{ + my $self = shift; + my $external_db_id = shift; + + my $sth = $self->prepare("SELECT db_name FROM external_db WHERE external_db_id = ?"); + + $sth->bind_param(1, $external_db_id, SQL_INTEGER); + $sth->execute(); + my ($db_name) = $sth->fetchrow_array(); + $sth->finish(); + return $db_name; + +} + +=head2 geneids_by_extids + + Description: DEPRECATED use list_gene_ids_by_extids instead + +=cut + +sub geneids_by_extids{ + my ($self,$name) = @_; + deprecate(" use 'list_gene_ids_by_extids instead"); + return $self->list_gene_ids_by_extids( $name ); +} + + +=head2 translationids_by_extids + + DEPRECATED use list_translation_ids_by_extids instead + +=cut + +sub translationids_by_extids{ + my ($self,$name) = @_; + deprecate("Use list_translation_ids_by_extids instead"); + return $self->list_translation_ids_by_extids( $name ); +} + + +=head2 transcriptids_by_extids + + DEPRECATED use transcriptids_by_extids instead + +=cut + +sub transcriptids_by_extids{ + my ($self,$name) = @_; + deprecate("Use list_transcript_ids_by_extids instead."); + return $self->list_transcript_ids_by_extids( $name ); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DataFileAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DataFileAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,372 @@ +package Bio::EnsEMBL::DBSQL::DataFileAdaptor; + +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DataFileAdaptor + +=head1 SYNOPSIS + + my $dfa = $dba->get_DataFileAdaptor(); + my $file = $dfa->fetch_by_dbID(1); + my $files = $dfa->fetch_all(); + + my $logic_name_files = $dfa->fetch_all_by_logic_name('bam_alignments'); + +=head1 DESCRIPTION + +Provides a database wrapper to store the locations of files and to pull these +records back out. DataFile objects can only provide basic information but they +can return an intended external database adaptor which can be used to +parse the information. This system assumes nothing about the file just that +your parser can access it. + +Files are supported over any protocol your parser supports and locations can be +made absolute, built on the fly or versioned. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::DBSQL::BaseAdaptor/; + +use Bio::EnsEMBL::DataFile; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw/throw warning deprecate/; +use Bio::EnsEMBL::Utils::Scalar qw/:assert/; + +my $GLOBAL_BASE_PATH; + +=head2 global_base_path + + Arg[1] : String; base path + Example : Bio::EnsEMBL::DBSQL::DataFileAdaptor->global_base_path('/base/path'); + Description : Stores a global value to be used when building data file paths + Returntype : String + Exceptions : None + +=cut + +sub global_base_path { + my ($class, $base_path) = @_; + return $GLOBAL_BASE_PATH unless $base_path; + $GLOBAL_BASE_PATH = $base_path; + return $GLOBAL_BASE_PATH; +} + +=head2 get_base_path + + Arg[1] : String; (optional) base path + Example : $dfa->get_base_path(); + Description : If given the path it will return that path; if not it consults + $self->global_base_path() for a value. As a last resort + it will look at the meta table for an entry keyed by + B + Returntype : String + Exceptions : Thrown if nothing is found after consulting all three locations + +=cut + +sub get_base_path { + my ($self, $path) = @_; + return $path if defined $path; + my $global_base_path = $self->global_base_path(); + return $global_base_path if defined $global_base_path; + my $meta_base_path = $self->db()->get_MetaContainer()->single_value_by_key('data_file.base_path', 1); + return $meta_base_path if defined $meta_base_path; + throw "No base path discovered. Either provide a path, set a global using global_base_path() or specify 'data_file.base_path' in meta"; +} + +=head2 DataFile_to_extension + + Deprecated + Arg[1] : Bio::EnsEMBL::DataFile + Example : my $ext = $dfa->DataFile_to_extension($bam_df); + Description : Returns an expected extension for the given DataFile type + Returntype : Scalar of the expected file extension + Exceptions : Raised if the given file type is not understood + +=cut + +sub DataFile_to_extension { + my ($self, $df) = @_; + deprecate("Use DataFile_to_extensions() instead"); + my $extensions = $self->DataFile_to_extensions($df); + return $extensions->[0]; +} + +=head2 DataFile_to_extensions + + Arg[1] : Bio::EnsEMBL::DataFile + Example : my $exts = $dfa->DataFile_to_extensions($bam_df); + Description : Returns all expected extensions for the given DataFile type. The + first returned is the default extension + Returntype : ArrayRef + Exceptions : Raised if the given file type is not understood + +=cut + +sub DataFile_to_extensions { + my ($self, $df) = @_; + my $type = $df->file_type(); + my $extensions = { + BAM => ['bam', 'bam.bai'], +# BIGBED => 'bb', + BIGWIG => ['bw'], + VCF => ['vcf.gz', 'vcf.gz.tbi'], + }->{$type}; + throw sprintf(q{No extensions found for the type '%s'}, $type ) if ! $extensions; + return $extensions; +} + + +=head2 DataFile_to_adaptor + + Arg[1] : Bio::EnsEMBL::DataFile + Arg[2] : (optional) base path + Example : my $bam = $dfa->DataFile_to_adaptor($bam_df); + Description : Returns an adaptor instance which will access the given DataFile + Returntype : Scalar actual return depends upon the given file type + Exceptions : Raised if the given file type is not understood + +=cut + +sub DataFile_to_adaptor { + my ($self, $df, $base) = @_; + my $type = $df->file_type(); + my $dispatch = { + BAM => sub { + require Bio::EnsEMBL::ExternalData::BAM::BAMAdaptor; + return Bio::EnsEMBL::ExternalData::BAM::BAMAdaptor->new($df->path($base)); + }, + BIGBED => sub { + require Bio::EnsEMBL::ExternalData::BigFile::BigBedAdaptor; + return Bio::EnsEMBL::ExternalData::BigFile::BigBedAdaptor->new($df->path($base)); + }, + BIGWIG => sub { + require Bio::EnsEMBL::ExternalData::BigFile::BigWigAdaptor; + return Bio::EnsEMBL::ExternalData::BigFile::BigWigAdaptor->new($df->path($base)); + }, + VCF => sub { + require Bio::EnsEMBL::ExternalData::VCF::VCFAdaptor; + return Bio::EnsEMBL::ExternalData::VCF::VCFAdaptor->new($df->path($base)); + }, + }->{$type}; + throw sprintf(q{No handler found for the type '%s'}, $type ) if ! $dispatch; + return $dispatch->(); +} + +=head2 fetch_all_by_logic_name + + Args [1] : String $logic_name for the linked analysis + Example : my $dfs = $dfa->fetch_all_by_logic_name('bam_alignments'); + Description : Returns all DataFile entries linked to the given analysis + logic name + Returntype : ArrayRef contains Bio::EnsEMBL::DataFile instances + Exceptions : Thrown if logic name does not exist + +=cut + +sub fetch_all_by_logic_name { + my ($self, $logic_name) = @_; + my $analysis = $self->db()->get_AnalysisAdaptor()->fetch_by_logic_name($logic_name); + throw "No analysis found for logic_name '${logic_name}'" if ! $analysis; + return $self->fetch_all_by_Analysis($analysis); +} + +=head2 fetch_all_by_Analysis + + Args [1] : Bio::EnsEMBL::Analysis $analysis to look up by + Example : my $dfs = $dfa->fetch_all_by_Analysis($analysis); + Description : Returns all DataFile entries linked to the given analysis + Returntype : ArrayRef contains Bio::EnsEMBL::DataFile instances + Exceptions : None + +=cut + +sub fetch_all_by_Analysis { + my ($self, $analysis) = @_; + assert_ref($analysis, 'Bio::EnsEMBL::Analysis', 'analysis'); + $self->bind_param_generic_fetch($analysis->dbID(), SQL_INTEGER); + return $self->generic_fetch('df.analysis_id =?'); +} + +=head2 fetch_all_by_CoordSystem + + Args [1] : Bio::EnsEMBL::CoordSystem $coord_system to look up by + Example : my $dfs = $dfa->fetch_all_by_CoordSystem($cs); + Description : Returns all DataFile entries linked to the given coordinate + system. Does B support I + Returntype : ArrayRef contains Bio::EnsEMBL::DataFile instances + Exceptions : None + +=cut + +sub fetch_all_by_CoordSystem { + my ($self, $cs) = @_; + assert_ref($cs, 'Bio::EnsEMBL::CoordSystem', 'coord_system'); + $self->bind_param_generic_fetch($cs->dbID(), SQL_INTEGER); + return $self->generic_fetch('df.coord_system_id =?'); +} + +sub fetch_by_name_and_type { + my ($self, $name, $type) = @_; + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + $self->bind_param_generic_fetch($type, SQL_VARCHAR); + my $results = $self->generic_fetch('df.name =? and df.file_type =?'); + return $results->[0] if @{$results}; + return; +} + +sub generic_fetch { + my ($self, $constraint) = @_; + $constraint ||= q{}; + + my $sql = <<'SQL'; +select df.data_file_id, df.coord_system_id, df.analysis_id, df.name, df.version_lock, df.absolute, df.url, df.file_type +from data_file df +join coord_system cs using (coord_system_id) +where cs.species_id =? +SQL + $sql .= 'AND '.$constraint if $constraint; + + my $params = $self->bind_param_generic_fetch(); + if(defined $params) { + $self->{'_bind_param_generic_fetch'} = (); + } + else { + $params = []; + } + unshift(@{$params}, $self->db()->species_id()); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $aa = $self->db()->get_AnalysisAdaptor(); + + return $self->dbc()->sql_helper()->execute(-SQL => $sql, -PARAMS => $params, -CALLBACK => sub { + my ($row) = @_; + my ($data_file_id, $coord_system_id, $analysis_id, $name, $version_lock, $absolute, $url, $file_type) = @{$row}; + my $hash = { + dbID => $data_file_id, + adaptor => $self, + coord_system => $csa->fetch_by_dbID($coord_system_id), + analysis => $aa->fetch_by_dbID($analysis_id), + name => $name, + version_lock => $version_lock, + absolute => $absolute, + file_type => $file_type, + }; + $hash->{url} = $url if $url; + return Bio::EnsEMBL::DataFile->new_fast($hash); + }); +} + +sub store { + my ($self, $df) = @_; + + assert_ref($df, 'Bio::EnsEMBL::DataFile', 'datafile'); + + if ($df->is_stored($self->db())) { + return $df->dbID(); + } + + throw 'Analysis is not defined for this data file' if ! defined $df->analysis(); + throw 'Coord system is not defined for this data file' if ! defined $df->coord_system(); + + my $sql = <<'SQL'; +INSERT INTO data_file (coord_system_id, analysis_id, name, version_lock, absolute, url, file_type) +VALUES (?,?,?,?,?,?,?) +SQL + my $params = [ + [$df->coord_system()->dbID(), SQL_INTEGER], + [$df->analysis()->dbID(), SQL_INTEGER], + [$df->name(), SQL_VARCHAR], + [$df->version_lock(), SQL_INTEGER], + [$df->absolute(), SQL_INTEGER], + [$df->url(), SQL_VARCHAR], + [$df->file_type(), SQL_VARCHAR], + ]; + $self->dbc()->sql_helper()->execute_update(-SQL => $sql, -PARAMS => $params, -CALLBACK => sub { + my ( $sth, $dbh ) = @_; + $df->dbID($self->last_insert_id()); + return; + }); + $df->adaptor($self); + + return $df->dbID(); +} + +sub update { + my ($self, $df) = @_; + + assert_ref($df, 'Bio::EnsEMBL::DataFile', 'datafile'); + + if (! $df->is_stored($self->db())) { + $self->store($df); + return; + } + + my $sql = <<'SQL'; +UPDATE data_file SET coord_system_id =?, analysis_id=?, name=?, version_lock=?, absolute=?, url=?, file_type=? +WHERE data_file_id =? +SQL + my $params = [ + [$df->coord_system()->dbID(), SQL_INTEGER], + [$df->analysis()->dbID(), SQL_INTEGER], + [$df->name(), SQL_VARCHAR], + [$df->version_lock(), SQL_INTEGER], + [$df->absolute(), SQL_INTEGER], + [$df->url(), SQL_VARCHAR], + [$df->file_type(), SQL_VARCHAR], + [$df->dbID(), SQL_INTEGER], + ]; + $self->dbc()->sql_helper()->execute_update(-SQL => $sql, -PARAMS => $params); + return; +} + +sub delete { + my ($self, $df) = @_; + + assert_ref($df, 'Bio::EnsEMBL::DataFile', 'datafile'); + + if (! $df->is_stored($self->db())) { + throw "Cannot delete the data file if it has not already been stored in this database"; + } + + $self->dbc()->sql_helper()->execute_update( + -SQL => 'DELETE from data_file where data_file_id =?', + -PARAMS => [[$df->dbID(), SQL_INTEGER]], + ); + + return; +} + +sub _tables { + my ($self) = @_; + return ( + [qw/data_file df/] + ); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,627 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor + +=head1 SYNOPSIS + + my $dfa = $database_adaptor->get_DensityFeatureAdaptor(); + + my $interpolate = 1; + my $blocks_wanted = 50; + + @dense_feats = @{ + $dfa->fetch_all_by_Slice( $slice, 'SNPDensity', $blocks_wanted, + $interpolate ); + } + +=head1 DESCRIPTION + +Density Feature Adaptor - An adaptor responsible for the creation of density +features from the database. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor; +use vars qw(@ISA); +use strict; + + +use POSIX; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Cache; +use Bio::EnsEMBL::DensityFeature; +use Bio::EnsEMBL::DensityFeatureSet; +use Bio::EnsEMBL::DensityType; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + +our $DENSITY_FEATURE_CACHE_SIZE = 20; + +=head2 new + + Arg [1] : list of args @args + Superclass constructor arguments + Example : none + Description: Constructor which just initializes internal cache structures + Returntype : Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor + Exceptions : none + Caller : implementing subclass constructors + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #initialize an LRU cache + my %cache; + tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE); + $self->{'_density_feature_cache'} = \%cache; + + return $self; +} + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice - The slice representing the region + to retrieve density features from. + Arg [2] : string $logic_name - The logic name of the density features to + retrieve. + Arg [3] : int $num_blocks (optional; default = 50) - The number of + features that are desired. The ratio between the size of these + features and the size of the features in the database will be + used to determine which database features will be used. + Arg [4] : boolean $interpolate (optional; default = 0) - A flag indicating + whether the features in the database should be interpolated to + fit them to the requested number of features. If true the + features will be interpolated to provide $num_blocks features. + This will not guarantee that exactly $num_blocks features are + returned due to rounding etc. but it will be close. + Arg [5] : float $max_ratio - The maximum ratio between the size of the + requested features (as determined by $num_blocks) and the actual + size of the features in the database. If this value is exceeded + then an empty list will be returned. This can be used to + prevent excessive interpolation of the database values. + Example : #interpolate: + $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 10, 1); + #do not interpoloate, get what is in the database: + $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 50); + #interpolate, but not too much + $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity',50,1,5.0); + Description: Retrieves a set of density features which overlap the region + of this slice. Density features are a discrete representation + of a continuous value along a sequence, such as a density or + percent coverage. Density Features may be stored in chunks of + different sizes in the database, and interpolated to fit the + desired size for the requested region. For example the database + may store a single density value for each 1KB and also for each + 1MB. When fetching for an entire chromosome the 1MB density + chunks will be used if the requested number of blocks is not + very high. + Note that features which have been interpolated are not stored + in the database and as such will have no dbID or adaptor set. + Returntype : Bio::EnsEMBL::DensityFeature + Exceptions : warning on invalid $num_blocks argument + warning if no features with logic_name $logic_name exist + warning if density_type table has invalid block_size value + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_; + + if(defined($num_blocks) && $num_blocks < 1) { + warning("Invalid number of density blocks [$num_blocks] requested.\n" . + "Returning empty list."); + return []; + } + + $num_blocks ||= 50; + my $length = $slice->length(); + + my $wanted_block_size = POSIX::ceil($length/$num_blocks); + + # + # get out all of the density types and choose the one with the + # block size closest to our desired block size + # + + my $dta = $self->db()->get_DensityTypeAdaptor(); + + my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)}; + if( ! @dtypes ){ + my @all_dtypes = @{ $dta->fetch_all() }; + @all_dtypes or warning( "No DensityTypes in $dta" ) && return []; + my $valid_list = join( ", ", map{$_->analysis->logic_name} @all_dtypes ); + warning( "No DensityTypes for logic name $logic_name. ". + "Select from $valid_list" ); + return []; + } + + my $best_ratio = undef; + my $density_type = undef; + my $best_ratio_large = undef; + my $density_type_large = undef; + my %dt_ratio_hash; + + foreach my $dt (@dtypes) { + + my $ratio; + if( $dt->block_size() > 0 ) { + $ratio = $wanted_block_size/$dt->block_size(); + } else { + # This is only valid if the requested seq_region is the one the + # features are stored on. Please use sensibly. Or find better implementation. + + my $block_size = $slice->seq_region_length() / $dt->region_features(); + $ratio = $wanted_block_size / $block_size; + } + + # we prefer to use a block size that's smaller than the required one + # (better results on interpolation). + # give larger bits a disadvantage and make them comparable + if( $ratio < 1 ) { + $ratio = 5/$ratio; + } + + $dt_ratio_hash{ $ratio } = $dt; + } + + $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0]; + + #the ratio was not good enough, or this logic name was not in the + #density type table, return an empty list + if(!defined($best_ratio) || + (defined($max_ratio) && $best_ratio > $max_ratio)) { + return []; + } + + $density_type = $dt_ratio_hash{$best_ratio}; + + my $constraint = "df.density_type_id = " . $density_type->dbID(); + + my @features = + @{$self->fetch_all_by_Slice_constraint($slice,$constraint)}; + + return \@features if(!$interpolate); + + #interpolate the features into new features of a different size + my @out; + #sort the features on start position + @features = sort( { $a->start() <=> $b->start() } @features ); + + #resize the features that were returned + my $start = 1; + my $end = $start+$wanted_block_size-1; + + my $value_type = $density_type->value_type(); + + # create a new density type object for the interpolated features that + # is not stored in the database + $density_type = Bio::EnsEMBL::DensityType->new + (-VALUE_TYPE => $value_type, + -ANALYSIS => $density_type->analysis(), + -BLOCK_SIZE => $wanted_block_size); + + while($start < $length) { +# $end = $length if($end > $length); + + my $density_value = 0.0; + my ($f, $fstart, $fend, $portion); + my @dvalues; + + #construct a new feature using all of the old density features that + #we overlapped + while(($f = shift(@features)) && $end > $f->{'start'}) { + + #what portion of this feature are we using to construct our new block? + $fstart = ($f->{'start'} < $start) ? $start : $f->{'start'}; + $fend = ($f->{'end'} > $end ) ? $end : $f->{'end'}; + $fend = $length if($fend > $length); + + if($value_type eq 'sum') { + + $portion = ($fend-$fstart+1)/$f->length(); + + #take a percentage of density value, depending on how much of the + #feature we overlapped + $density_value += $portion * $f->{'density_value'}; + + } elsif($value_type eq 'ratio') { + + #maintain a running total of the length used from each feature + #and its value + push(@dvalues,[$fend-$fstart+1,$f->{'density_value'}]); + + } else { + throw("Unknown density value type [$value_type]."); + } + + #do not want to look at next feature if we only used part of this one: + last if($fend < $f->{'end'}); + } + + #if we did not completely overlap the last feature, put it back on so + #it can be partially used by the next block + if(defined($f) && (!defined($fend) || $fend < $f->{'end'})) { + unshift(@features, $f); + } + + if($value_type eq 'ratio') { + #take a weighted average of the all the density values of the features + #used to construct this one + my $total_len = $end - $start + 1; + if($total_len > 0) { + foreach my $pair (@dvalues) { + my ($dlen, $dval) = @$pair; + $density_value += $dval * ($dlen/$total_len); + } + } + } + + # interpolated features are not stored in the db so do not set the dbID + # or the adaptor + push @out, Bio::EnsEMBL::DensityFeature->new + (-seq_region => $slice, + -start => $start, + -end => $end, + -density_type => $density_type, + -density_value => $density_value); + + $start = $end + 1; + $end += $wanted_block_size; + } + + return \@out; +} + + +sub _tables { + my $self = shift; + + return (['density_feature', 'df']); +} + + +sub _columns { + my $self = shift; + + return qw( df.density_feature_id + df.seq_region_id df.seq_region_start df.seq_region_end + df.density_value df.density_type_id ); +} + + + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches, + # and a fair bit of gymnastics have been used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $dta = $self->db()->get_DensityTypeAdaptor(); + + my @features; + my %dtype_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($density_feature_id, $seq_region_id, $seq_region_start, $seq_region_end, + $density_value, $density_type_id ); + + $sth->bind_columns(\$density_feature_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$density_value, \$density_type_id); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + FEATURE: while($sth->fetch()) { + #get the density type object + my $density_type = $dtype_hash{$density_type_id} ||= + $dta->fetch_by_dbID($density_type_id); + + #get the slice object + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + my $len = $seq_region_end - $seq_region_start + 1; + + my @coords; + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + + @coords = $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + 1, $sr_cs, 0, $dest_slice); + + } else { + @coords = $mapper->map($sr_name, $seq_region_start, $seq_region_end,1, $sr_cs); + } + + #filter out gaps + @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords; + + #throw out density features mapped to gaps, or split + next FEATURE if(@coords != 1); + + $seq_region_start = $coords[0]->{'start'}; + $seq_region_end = $coords[0]->{'end'}; + $seq_region_id = $coords[0]->{'id'}; + + if($density_type->value_type() eq 'sum') { + #adjust density value so it reflects length of feature actually used + my $newlen = $seq_region_end - $seq_region_start +1; + $density_value *= $newlen/$len if($newlen != $len); + } + + #get a slice in the coord system we just mapped to +# if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + } + } + + #throw away features entirely off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + next FEATURE; + } + $slice = $dest_slice; + } + + push( @features, + $self->_create_feature( 'Bio::EnsEMBL::DensityFeature', { + -dbID => $density_feature_id, + -adaptor => $self, + -start => $seq_region_start, + -end => $seq_region_end, + -seq_region => $slice, + -density_value => $density_value, + -density_type => $density_type + } ) ); + + } + + return \@features; +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$density_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all density features in the + current db + Arg[1] : int. not set to 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("density_feature",undef, $ordered); +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::DensityFeatures @df + the simple features to store in the database + Example : $density_feature_adaptor->store(1234, @density_feats); + Description: Stores a list of density feature objects in the database + Returntype : none + Exceptions : thrown if @df is not defined, if any of the features do not + have an attached slice. + or if any elements of @df are not Bio::EnsEMBL::SeqFeatures + Caller : general + Status : Stable + +=cut + +sub store{ + my ($self,@df) = @_; + + if( scalar(@df) == 0 ) { + throw("Must call store with list of DensityFeatures"); + } +#mysql> desc density_feature; +#+--------------------+---------+------+-----+---------+----------------+ +#| Field | Type | Null | Key | Default | Extra | +#+--------------------+---------+------+-----+---------+----------------+ +#| density_feature_id | int(11) | | PRI | NULL | auto_increment | +#| density_type_id | int(11) | | MUL | 0 | | +#| seq_region_id | int(11) | | | 0 | | +#| seq_region_start | int(11) | | | 0 | | +#| seq_region_end | int(11) | | | 0 | | +#| density_value | float | | | 0 | | +#+--------------------+---------+------+-----+---------+----------------+ + + my $sth = $self->prepare + ("INSERT INTO density_feature (seq_region_id, seq_region_start, " . + "seq_region_end, density_type_id, " . + "density_value) " . + "VALUES (?,?,?,?,?)"); + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + FEATURE: foreach my $df ( @df ) { + + if( !ref $df || !$df->isa("Bio::EnsEMBL::DensityFeature") ) { + throw("DensityFeature must be an Ensembl DensityFeature, " . + "not a [".ref($df)."]"); + } + + # we dont store 0 value density features + next if( $df->density_value == 0 ); + if($df->is_stored($db)) { + warning("DensityFeature [".$df->dbID."] is already stored" . + " in this database."); + next FEATURE; + } + + if(!defined($df->density_type)) { + throw("A density type must be attached to the features to be stored."); + } + + #store the density_type if it has not been stored yet + + if(!$df->density_type->is_stored($db)) { + my $dta = $db->get_DensityTypeAdaptor(); + $dta->store($df->density_type()); + } + + my $original = $df; + my $seq_region_id; + ($df, $seq_region_id) = $self->_pre_store($df); + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$df->start,SQL_INTEGER); + $sth->bind_param(3,$df->end,SQL_INTEGER); + $sth->bind_param(4,$df->density_type->dbID,SQL_INTEGER); + $sth->bind_param(5,$df->density_value,SQL_FLOAT); + $sth->execute(); + + $original->dbID($sth->{'mysql_insertid'}); + $original->adaptor($self); + } +} + +=head2 fetch_Featureset_by_Slice + + Arg [1-5] : see + Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice() + for argument documentation + Example : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1); + Description: wrapper around + Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice() + which returns a Bio::EnsEMBL::DensityFeatureSet and also caches + results + Returntype : Bio::EnsEMBL::DensityFeatureSet + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_Featureset_by_Slice { + my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_; + + my $key = join(":", $slice->name, + $logic_name, + $num_blocks || 50, + $interpolate || 0, + $max_ratio); + + unless ($self->{'_density_feature_cache'}->{$key}) { + my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks, + $interpolate, $max_ratio); + $self->{'_density_feature_cache'}->{$key} = + new Bio::EnsEMBL::DensityFeatureSet($dfeats); + } + return $self->{'_density_feature_cache'}->{$key}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityTypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityTypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,305 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DensityTypeAdaptor + +=head1 SYNOPSIS + + my $density_type_adaptor = + $registry->get_adaptor( 'Human', 'Core', 'DensityType' ); + + my @density_types = @{ $density_type_adaptor->fetch_all() }; + + my $dt = $density_type_adaptor->fetch_by_dbID(12); + +=head1 DESCRIPTION + +DensityTypeAdaptor - Performs database interaction for DensityType objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::DensityTypeAdaptor; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::DensityType; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + + +=head2 new + + Arg [1] : see superclass (Bio::EnsEMBL::DBSQL::BaseAdaptor) arguments + Example : #use this instead of the constructor directly: + my $dta = $db_adaptor->get_DensityTypeAdaptor(); + Description: Constructor. Creates a new DensityTypeAdaptor + Returntype : Bio::EnsEMBL::DBSQL::DensityTypeAdaptor + Exceptions : none + Caller : DBAdaptor + Status : Stable + +=cut + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + $self->{'dbID_cache'} = {}; + + return $self; +} + + + +=head2 fetch_all + + Arg [1] : none + Example : my @density_types = @{$density_type_adaptor->fetch_all}; + Description: Retrieves every density type in the database. + NOTE: In a multi-species database, this method will + return all the entries, not just the ones associated with + the current species. + Returntype : reference to list of Bio::EnsEMBL::DensityType objects + Exceptions : none + Caller : general, new + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + + my @out; + + my $sth = $self->prepare("SELECT density_type_id, analysis_id, block_size,". + " value_type, region_features " . + "FROM density_type"); + + $sth->execute(); + + my($dbID, $analysis_id, $blk_size, $vtype, $region_features ); + $sth->bind_columns(\$dbID, \$analysis_id, \$blk_size, \$vtype, \$region_features ); + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); + + while($sth->fetch()) { + my $analysis = $analysis_adaptor->fetch_by_dbID($analysis_id); + + + my $dt = Bio::EnsEMBL::DensityType->new(-ADAPTOR => $self, + -DBID => $dbID, + -ANALYSIS => $analysis, + -BLOCK_SIZE => $blk_size, + -REGION_FEATURES => $region_features, + -VALUE_TYPE => $vtype); + + $self->{'dbID_cache'}->{$dbID} = $dt; + + push @out, $dt; + } + + return \@out; +} + + + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + Example : my $dt = $density_type_adaptor->fetch_by_dbID($dbID); + Description: Retrieves a density type object via its internal identifier + Returntype : Bio::EnsEMBL::DensityType + Exceptions : throw if dbID argument not defined + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + if(!defined($dbID)) { + throw("dbID argument must be defined"); + } + + if($self->{'dbID_cache'}->{$dbID}) { + return $self->{'dbID_cache'}->{$dbID}; + } + + # go back to database and refill caches + $self->fetch_all(); + + return $self->{'dbID_cache'}->{$dbID}; +} + + +=head2 fetch_all_by_logic_name + + Arg [1] : string $logic_name + Example : my @dts = @{$dtype_adaptor->fetch_all('repeat_coverage')}; + Description: Retrieves all density types with a given logic name. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : reference to list of Bio::EnsEMBL::DensityTypes + Exceptions : thrown if logic_name argument is not provided + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_logic_name { + my $self = shift; + my $logic_name = shift; + + if(!defined($logic_name)) { + throw("logic_name argument is required."); + } + + my $analysis_adaptor = $self->db()->get_AnalysisAdaptor(); + my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name); + + return [] if(!$analysis); + + my $sth = $self->prepare("SELECT density_type_id, block_size,". + " value_type, region_features " . + "FROM density_type " . + "WHERE analysis_id = ?"); + $sth->bind_param(1,$analysis->dbID,SQL_INTEGER); + $sth->execute(); + + my($dbID, $blk_size, $vtype, $region_features ); + $sth->bind_columns(\$dbID, \$blk_size, \$vtype, \$region_features); + + my @out; + + while($sth->fetch()) { + + my $dt = Bio::EnsEMBL::DensityType->new(-ADAPTOR => $self, + -DBID => $dbID, + -ANALYSIS => $analysis, + -BLOCK_SIZE => $blk_size, + -REGION_FEATURES => $region_features, + -VALUE_TYPE => $vtype); + + $self->{'dbID_cache'}->{$dbID} = $dt; + + push @out, $dt; + } + + return \@out; +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::DensityType @dt + the density types to store in the database + Example : $density_type->store(@density_types); + Description: Stores a list of density type objects in the database + Returntype : none + Exceptions : thrown if @dt is not defined + or if any elements of @dt are not Bio::EnsEMBL::DensityType + Caller : general + Status : Stable + +=cut + +sub store { + my ($self,@dt) = @_; + + if( scalar(@dt) == 0 ) { + throw("Must call store with list of Density Types"); + } + + my $sth = $self->prepare + ("INSERT IGNORE INTO density_type (analysis_id,". + "block_size, value_type, region_features ) ". + "VALUES (?, ?, ?, ?)"); + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + FEATURE: foreach my $dt ( @dt ) { + if( !ref $dt || !$dt->isa("Bio::EnsEMBL::DensityType") ) { + throw("Density Type must be an Ensembl DensityType, " . + "not a [".ref($dt)."]"); + } + + if($dt->is_stored($db)) { + next FEATURE; + } + + if(!defined($dt->analysis())) { + throw("An analysis must be attached to the density type to be stored."); + } + + #store the analysis if it has not been stored yet + if(!$dt->analysis->is_stored($db)) { + $analysis_adaptor->store($dt->analysis()); + } + + my $block_size = $dt->block_size(); + $block_size |= 0; + my $region_features = $dt->region_features(); + $region_features |= 0; + + $sth->bind_param(1,$dt->analysis->dbID,SQL_INTEGER); + $sth->bind_param(2,$block_size,SQL_INTEGER); + $sth->bind_param(3,$dt->value_type,SQL_VARCHAR); + $sth->bind_param(4,$region_features, SQL_VARCHAR); + my $inserted = $sth->execute(); + + my $dbID; + + # $inserted can be 0E0 which is true but equal to 0 + if(!$inserted || $inserted == 0) { + # insert failed, presumably because was already stored in database + + my @dts=@{$self->fetch_all_by_logic_name($dt->analysis()->logic_name())}; + my ($stored_dt) = grep {$_->block_size() == $dt->block_size()} @dts; + if(!$stored_dt) { + throw("Could not retrieve or store DensityType from database.\n" . + "Incorrect db permissions or missing density_type table?\n"); + } + $dbID = $stored_dt->dbID(); + } else { + $dbID = $sth->{'mysql_insertid'}; + } + + # next two lines are to set the density type as stored + $dt->dbID($dbID); + $dt->adaptor($self); + + $self->{'dbID_cache'}->{$dbID} = $dt; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/DnaAlignFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DnaAlignFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,583 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DnaAlignFeatureAdaptor - Adaptor for DnaAlignFeatures + +=head1 SYNOPSIS + + $dafa = $registry->get_adaptor( 'Human', 'Core', 'DnaAlignFeature' ); + + my @features = @{ $dafa->fetch_all_by_Slice($slice) }; + + $dafa->store(@features); + +=head1 DESCRIPTION + +This is an adaptor responsible for the retrieval and storage of +DnaDnaAlignFeatures from the database. This adaptor inherits most of its +functionality from the BaseAlignFeatureAdaptor and BaseFeatureAdaptor +superclasses. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::DnaAlignFeatureAdaptor; +use vars qw(@ISA); +use strict; +use Bio::EnsEMBL::DnaDnaAlignFeature; +use Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor); + + +=head2 _tables + + Args : none + Example : @tabs = $self->_tables + Description: PROTECTED implementation of the abstract method inherited from + BaseFeatureAdaptor. Returns list of [tablename, alias] pairs + Returntype : list of listrefs of strings + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor::generic_fetch + Status : Stable + +=cut + +sub _tables { + my $self = shift; + + return (['dna_align_feature', 'daf'],['external_db','exdb']); +} + + +sub _left_join{ + return (['external_db',"exdb.external_db_id = daf.external_db_id"]); +} + +=head2 _columns + + Args : none + Example : @columns = $self->_columns + Description: PROTECTED implementation of abstract superclass method. + Returns a list of columns that are needed for object creation. + Returntype : list of strings + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor::generic_fetch + Status : Stable + +=cut + +sub _columns { + my $self = shift; + + #warning, implementation of _objs_from_sth method depends on order of list + return qw(daf.dna_align_feature_id + daf.seq_region_id + daf.analysis_id + daf.seq_region_start + daf.seq_region_end + daf.seq_region_strand + daf.hit_start + daf.hit_end + daf.hit_name + daf.hit_strand + daf.cigar_line + daf.evalue + daf.perc_ident + daf.score + daf.external_db_id + daf.hcoverage + daf.external_data + daf.pair_dna_align_feature_id + exdb.db_name + exdb.db_display_name); +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::DnaAlignFeatures @feats + the features to store in the database + Example : $dna_align_feature_adaptor->store(@features); + Description: Stores a list of DnaAlignFeatures in the database + Returntype : none + Exceptions : throw if any of the provided features cannot be stored + which may occur if: + * The feature does not have an associate Slice + * The feature does not have an associated analysis + * The Slice the feature is associated with is on a seq_region + unknown to this database + A warning is given if: + * The feature has already been stored in this db + Caller : Pipeline + Status : Stable + +=cut + +sub store { + my ( $self, @feats ) = @_; + + throw("Must call store with features") if ( scalar(@feats) == 0 ); + + my @tabs = $self->_tables; + my ($tablename) = @{ $tabs[0] }; + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + my $sth = $self->prepare( + "INSERT INTO $tablename (seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + hit_start, hit_end, hit_strand, hit_name, + cigar_line, analysis_id, score, evalue, + perc_ident, external_db_id, hcoverage, + pair_dna_align_feature_id) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" # 16 arguments + ); + +FEATURE: + foreach my $feat (@feats) { + if ( !ref $feat || !$feat->isa("Bio::EnsEMBL::DnaDnaAlignFeature") ) + { + throw("feature must be a Bio::EnsEMBL::DnaDnaAlignFeature," + . " not a [" + . ref($feat) + . "]." ); + } + + if ( $feat->is_stored($db) ) { + warning( "DnaDnaAlignFeature [" + . $feat->dbID() + . "] is already stored in this database." ); + next FEATURE; + } + + my $hstart = $feat->hstart(); + my $hend = $feat->hend(); + my $hstrand = $feat->hstrand(); + $self->_check_start_end_strand( $hstart, $hend, $hstrand ); + + my $cigar_string = $feat->cigar_string(); + if ( !$cigar_string ) { + $cigar_string = $feat->length() . 'M'; + warning( "DnaDnaAlignFeature does not define a cigar_string.\n" + . "Assuming ungapped block with cigar_line=$cigar_string ." ); + } + + my $hseqname = $feat->hseqname(); + if ( !$hseqname ) { + throw("DnaDnaAlignFeature must define an hseqname."); + } + + if ( !defined( $feat->analysis ) ) { + throw( + "An analysis must be attached to the features to be stored."); + } + + #store the analysis if it has not been stored yet + if ( !$feat->analysis->is_stored($db) ) { + $analysis_adaptor->store( $feat->analysis() ); + } + + my $original = $feat; + my $seq_region_id; + ( $feat, $seq_region_id ) = $self->_pre_store($feat); + + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->bind_param( 2, $feat->start, SQL_INTEGER ); + $sth->bind_param( 3, $feat->end, SQL_INTEGER ); + $sth->bind_param( 4, $feat->strand, SQL_TINYINT ); + $sth->bind_param( 5, $hstart, SQL_INTEGER ); + $sth->bind_param( 6, $hend, SQL_INTEGER ); + $sth->bind_param( 7, $hstrand, SQL_TINYINT ); + $sth->bind_param( 8, $hseqname, SQL_VARCHAR ); + $sth->bind_param( 9, $cigar_string, SQL_LONGVARCHAR ); + $sth->bind_param( 10, $feat->analysis->dbID, SQL_INTEGER ); + $sth->bind_param( 11, $feat->score, SQL_DOUBLE ); + $sth->bind_param( 12, $feat->p_value, SQL_DOUBLE ); + $sth->bind_param( 13, $feat->percent_id, SQL_FLOAT ); + $sth->bind_param( 14, $feat->external_db_id, SQL_INTEGER ); + $sth->bind_param( 15, $feat->hcoverage, SQL_DOUBLE ); + $sth->bind_param( 16, $feat->pair_dna_align_feature_id, + SQL_INTEGER ); + + $sth->execute(); + + $original->dbID( $sth->{'mysql_insertid'} ); + $original->adaptor($self); + } ## end foreach my $feat (@feats) + + $sth->finish(); +} ## end sub store + + +sub save { + my ($self, $features) = @_; + + my @feats = @$features; + throw("Must call store with features") if( scalar(@feats) == 0 ); + + my @tabs = $self->_tables; + my ($tablename) = @{$tabs[0]}; + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + my $sql = qq{INSERT INTO $tablename (seq_region_id, seq_region_start, seq_region_end, seq_region_strand, hit_start, hit_end, hit_strand, hit_name, cigar_line, analysis_id, score, evalue, perc_ident, external_db_id, hcoverage, pair_dna_align_feature_id, external_data) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)}; + + my %analyses = (); + + my $sth = $self->prepare($sql); + + FEATURE: foreach my $feat ( @feats ) { + if( !ref $feat || !$feat->isa("Bio::EnsEMBL::DnaDnaAlignFeature") ) { + throw("feature must be a Bio::EnsEMBL::DnaDnaAlignFeature," + . " not a [".ref($feat)."]."); + } + + if($feat->is_stored($db)) { + warning("DnaDnaAlignFeature [".$feat->dbID."] is already stored" . + " in this database."); + next FEATURE; + } + + my $hstart = $feat->hstart || 0; # defined $feat->hstart ? $feat->hstart : $feat->start ; + my $hend = $feat->hend || 0; # defined $feat->hend ? $feat->hend : $feat->end; + my $hstrand = $feat->hstrand|| 0; # defined $feat->hstrand ? $feat->hstrand : $feat->strand; + if( $hstart && $hend ) { + if($hend < $hstart) { + throw("Invalid Feature start/end [$hstart/$hend]. Start must be less than or equal to end."); + } + } + my $cigar_string = $feat->cigar_string(); + if(!$cigar_string) { + $cigar_string = $feat->length() . 'M'; + warning("DnaDnaAlignFeature does not define a cigar_string.\n" . + "Assuming ungapped block with cigar_line=$cigar_string ."); + } + + my $hseqname = $feat->hseqname(); + if(!$hseqname) { + throw("DnaDnaAlignFeature must define an hseqname."); + } + + if(!defined($feat->analysis)) { + throw("An analysis must be attached to the features to be stored."); + } + + #store the analysis if it has not been stored yet + if(!$feat->analysis->is_stored($db)) { + $analysis_adaptor->store($feat->analysis()); + } + + $analyses{ $feat->analysis->dbID }++; + + my $original = $feat; + my $seq_region_id; + ($feat, $seq_region_id) = $self->_pre_store_userdata($feat); + + my $extra_data = $feat->extra_data ? $self->dump_data($feat->extra_data) : ''; + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$feat->start,SQL_INTEGER); + $sth->bind_param(3,$feat->end,SQL_INTEGER); + $sth->bind_param(4,$feat->strand,SQL_TINYINT); + $sth->bind_param(5,$hstart,SQL_INTEGER); + $sth->bind_param(6,$hend,SQL_INTEGER); + $sth->bind_param(7,$hstrand,SQL_TINYINT); + $sth->bind_param(8,$hseqname,SQL_VARCHAR); + $sth->bind_param(9,$cigar_string,SQL_LONGVARCHAR); + $sth->bind_param(10,$feat->analysis->dbID,SQL_INTEGER); + $sth->bind_param(11,$feat->score,SQL_DOUBLE); +# $sth->bind_param(11,$feat->score); # if the above statement does not work it means you need to upgrade DBD::mysql, meantime you can replace it with this line + $sth->bind_param(12,$feat->p_value,SQL_DOUBLE); + $sth->bind_param(13,$feat->percent_id,SQL_FLOAT); + $sth->bind_param(14,$feat->external_db_id,SQL_INTEGER); + $sth->bind_param(15,$feat->hcoverage,SQL_DOUBLE); + $sth->bind_param(16,$feat->pair_dna_align_feature_id,SQL_INTEGER); + $sth->bind_param(17,$extra_data,SQL_LONGVARCHAR); + + + $sth->execute(); + $original->dbID($sth->{'mysql_insertid'}); + $original->adaptor($self); + } + + $sth->finish(); + +## js5 hack to update meta_coord table... + if( keys %analyses ) { + + my $sth = $self->prepare( 'select sr.coord_system_id, max(daf.seq_region_end-daf.seq_region_start) from seq_region as sr, dna_align_feature as daf where daf.seq_region_id=sr.seq_region_id and analysis_id in ('.join(',',keys %analyses).') group by coord_system_id' ); + $sth->execute; + + foreach( @{ $sth->fetchall_arrayref } ) { + my $sth2 = $self->prepare( qq(insert ignore into meta_coord values("dna_align_feature",$_->[0],$_->[1])) ); + $sth2->execute; + $sth2->finish; + + $sth2 = $self->prepare( qq(update meta_coord set max_length = $_->[1] where coord_system_id = $_->[0] and table_name="dna_align_feature" and max_length < $_->[1]) ); + $sth2->execute; + $sth2->finish; + } + + $sth->finish; + } + +} + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle $sth + an exectuted DBI statement handle generated by selecting + the columns specified by _columns() from the table specified + by _table() + Example : @dna_dna_align_feats = $self->_obj_from_hashref + Description: PROTECTED implementation of superclass abstract method. + Creates DnaDnaAlignFeature objects from a DBI hashref + Returntype : listref of Bio::EnsEMBL::DnaDnaAlignFeatures + Exceptions : none + Caller : Bio::EnsEMBL::BaseFeatureAdaptor::generic_fetch + Status : Stable + +=cut + +sub _objs_from_sth { + my ( $self, $sth, $mapper, $dest_slice ) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + # In case of userdata we need the features on the dest_slice. In case + # of get_all_supporting_features dest_slice is not provided. + my $sa = ( $dest_slice + ? $dest_slice->adaptor() + : $self->db()->get_SliceAdaptor() ); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @features; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ( $dna_align_feature_id, $seq_region_id, + $analysis_id, $seq_region_start, + $seq_region_end, $seq_region_strand, + $hit_start, $hit_end, + $hit_name, $hit_strand, + $cigar_line, $evalue, + $perc_ident, $score, + $external_db_id, $hcoverage, + $extra_data, $pair_dna_align_feature_id, + $external_db_name, $external_display_db_name ); + + $sth->bind_columns( \( $dna_align_feature_id, $seq_region_id, + $analysis_id, $seq_region_start, + $seq_region_end, $seq_region_strand, + $hit_start, $hit_end, + $hit_name, $hit_strand, + $cigar_line, $evalue, + $perc_ident, $score, + $external_db_id, $hcoverage, + $extra_data, $pair_dna_align_feature_id, + $external_db_name, $external_display_db_name ) + ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if ( defined($mapper) ) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_seq_region_id; + + if ( defined($dest_slice) ) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_seq_region_id = $dest_slice->get_seq_region_id(); + } + +FEATURE: + while ( $sth->fetch() ) { + # Get the analysis object. + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + + # Get the slice object. + my $slice = $slice_hash{ "ID:" . $seq_region_id }; + + if ( !defined($slice) ) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + if ( defined($slice) ) { + $slice_hash{ "ID:" . $seq_region_id } = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system + # if a mapper was provided. + if ( defined($mapper) ) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + # Skip features that map to gaps or coord system boundaries. + if ( !defined($seq_region_id) ) { next FEATURE } + + # Get a slice in the coord system we just mapped to. + if ( $asm_cs == $sr_cs + || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) + { + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } else { + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + } + + # If a destination slice was provided, convert the coords. If the + # dest_slice starts at 1 and is forward strand, nothing needs doing. + if ( defined($dest_slice) ) { + if ( $dest_slice_start != 1 || $dest_slice_strand != 1 ) { + if ( $dest_slice_strand == 1 ) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand = -$seq_region_strand; + } + + # Throw away features off the end of the requested slice. + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_seq_region_id ne $seq_region_id ) ) + { + next FEATURE; + } + } + $slice = $dest_slice; + } + + # Finally, create the new DnaAlignFeature. + push( @features, + $self->_create_feature_fast( + 'Bio::EnsEMBL::DnaDnaAlignFeature', { + 'slice' => $slice, + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'hseqname' => $hit_name, + 'hstart' => $hit_start, + 'hend' => $hit_end, + 'hstrand' => $hit_strand, + 'score' => $score, + 'p_value' => $evalue, + 'percent_id' => $perc_ident, + 'cigar_string' => $cigar_line, + 'analysis' => $analysis, + 'adaptor' => $self, + 'dbID' => $dna_align_feature_id, + 'external_db_id' => $external_db_id, + 'hcoverage' => $hcoverage, + 'extra_data' => ( + $extra_data + ? $self->get_dumped_data($extra_data) + : '' ), + 'dbname' => $external_db_name, + 'db_display_name' => $external_display_db_name, + 'pair_dna_align_feature_id' => $pair_dna_align_feature_id + } ) ); + + } ## end while ( $sth->fetch() ) + + return \@features; +} ## end sub _objs_from_sth + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$dna_align_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all dna align features in + the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("dna_align_feature",undef, $ordered); +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/ExonAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/ExonAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,883 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ExonAdaptor - An adaptor responsible for the retrieval and +storage of exon objects + +=head1 SYNOPSIS + + my $exon_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Exon' ); + + my $exon = $exon_adaptor->fetch_by_dbID($dbID); + +=head1 DESCRIPTION + +The ExonAdaptor is responsible for retrieving and storing Exon objects +from an Ensembl database. Most of the ExonAdaptor functionality is +inherited from the B class. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::ExonAdaptor; + +use strict; + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); + +use vars qw( @ISA ); +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor ); + + +#_tables +# +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns the names, aliases of the tables to use for queries +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal + +sub _tables { + my $self = shift; + + # Allow the table definition to be overridden by certain methods. + if ( defined( $self->{'tables'} ) ) { + return @{ $self->{'tables'} }; + } + + return ( [ 'exon', 'e' ] ); +} + + +# _columns +# +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns a list of columns to use for queries +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + my $created_date = + $self->db->dbc->from_date_to_seconds("created_date"); + my $modified_date = + $self->db->dbc->from_date_to_seconds("modified_date"); + + return ( + 'e.exon_id', 'e.seq_region_id', 'e.seq_region_start', + 'e.seq_region_end', 'e.seq_region_strand', 'e.phase', + 'e.end_phase', 'e.is_current', 'e.is_constitutive', + 'e.stable_id', 'e.version', $created_date, + $modified_date + ); +} + + + +# _final_clause +# +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns a default end for the SQL-query (ORDER BY) +# Returntype : string +# Exceptions : none +# Caller : internal + +sub _final_clause { + my $self = shift; + return $self->{'final_clause'} || ''; +} + + +sub fetch_all { + my ($self) = @_; + + my $constraint = 'e.is_current = 1'; + my @exons = @{ $self->generic_fetch($constraint) }; + return \@exons ; +} + +=head2 fetch_by_stable_id + + Arg [1] : string $stable_id + the stable id of the exon to retrieve + Example : $exon = $exon_adaptor->fetch_by_stable_id('ENSE0000988221'); + Description: Retrieves an Exon from the database via its stable id + Returntype : Bio::EnsEMBL::Exon in native coordinates. + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ($self, $stable_id) = @_; + + my $constraint = "e.stable_id = ? AND e.is_current = 1"; + + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + my ($exon) = @{ $self->generic_fetch($constraint) }; + + return $exon; +} + + +=head2 fetch_all_versions_by_stable_id + + Arg [1] : String $stable_id + The stable ID of the exon to retrieve + Example : my $exon = $exon_adaptor->fetch_all_version_by_stable_id + ('ENSE00000309301'); + Description : Similar to fetch_by_stable_id, but retrieves all versions of an + exon stored in the database. + Returntype : listref of Bio::EnsEMBL::Exon objects + Exceptions : if we cant get the gene in given coord system + Caller : general + Status : At Risk + +=cut + +sub fetch_all_versions_by_stable_id { + my ($self, $stable_id) = @_; + + my $constraint = "e.stable_id = ?"; + + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Example : none + Description: Retrieves all Exons for the Transcript in 5-3 order + Returntype : listref Bio::EnsEMBL::Exon on Transcript slice + Exceptions : throws if transcript has no slice + Caller : Transcript->get_all_Exons() + Status : Stable + +=cut + +sub fetch_all_by_Transcript { + my ( $self, $transcript ) = @_; + + my $tslice = $transcript->slice(); + my $slice; + + if ( !defined($tslice) ) { + throw("Transcript must have attached slice to retrieve exons."); + } + + # use a small slice the same size as the transcript + if ( !$tslice->is_circular() ) { + $slice = + $self->db()->get_SliceAdaptor()->fetch_by_Feature($transcript); + } else { + # Circular. + $slice = $tslice; + } + + # Override the tables definition to provide an additional join to the + # exon_transcript table. For efficiency we cannot afford to have this + # in as a left join every time. + my @tables = $self->_tables(); + + # Be extra cautious so that we do not add 'exon_transcript' twice. + my $found = 0; + foreach my $table (@tables) { + if ( $table->[0] eq 'exon_transcript' ) { + $found = 1; + last; + } + } + if ( !$found ) { + push @tables, [ 'exon_transcript', 'et' ]; + } + + $self->{'tables'} = \@tables; + $self->{'final_clause'} = "ORDER BY et.transcript_id, et.rank"; + + my $constraint = + "et.transcript_id = " + . $transcript->dbID() + . " AND e.exon_id = et.exon_id"; + + # fetch all of the exons + my $exons = $self->fetch_all_by_Slice_constraint($slice, $constraint); + + # un-override the table definition + delete( $self->{'tables'} ); + delete( $self->{'final_clause'} ); + + # remap exon coordinates if necessary + if($slice->name() ne $tslice->name()) { + my @out; + foreach my $ex (@$exons) { + push @out, $ex->transfer($tslice); + } + $exons = \@out; + } + + return $exons; +} + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Exon $exon + the exon to store in this database + Example : $exon_adaptor->store($exon); + Description: Stores an exon in the database + Returntype : none + Exceptions : thrown if exon (or component exons) do not have a contig_id + or if $exon->start, $exon->end, $exon->strand, or $exon->phase + are not defined or if $exon is not a Bio::EnsEMBL::Exon + Caller : general + Status : Stable + +=cut + +sub store { + my ($self, $exon) = @_; + + if( ! $exon->isa('Bio::EnsEMBL::Exon') ) { + throw("$exon is not a EnsEMBL exon - not storing."); + } + + my $db = $self->db(); + + if($exon->is_stored($db)) { + return $exon->dbID(); + } + + if( ! $exon->start || ! $exon->end || + ! $exon->strand || ! defined $exon->phase ) { + throw("Exon does not have all attributes to store"); + } + + # Default to is_current = 1 if this attribute is not set + my $is_current = $exon->is_current(); + if ( !defined($is_current) ) { $is_current = 1 } + + # Default to is_constitutive = 0 if this attribute is not set + my $is_constitutive = $exon->is_constitutive(); + if ( !defined($is_constitutive) ) { $is_constitutive = 0 } + + my $exon_sql = q{ + INSERT into exon ( seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, phase, + end_phase, is_current, is_constitutive + }; + if ( defined($exon->stable_id) ) { + my $created = $self->db->dbc->from_seconds_to_date($exon->created_date()); + my $modified = $self->db->dbc->from_seconds_to_date($exon->modified_date()); + $exon_sql .= ", stable_id, version, created_date, modified_date) VALUES ( ?,?,?,?,?,?,?,?,?,?,". $created . ",". $modified ." )"; + + } else { + $exon_sql .= q{ + ) VALUES ( ?,?,?,?,?,?,?,?) + }; + } + + + my $exonst = $self->prepare($exon_sql); + + my $exonId = undef; + + my $original = $exon; + my $seq_region_id; + ($exon, $seq_region_id) = $self->_pre_store($exon); + + #store the exon + $exonst->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $exonst->bind_param( 2, $exon->start, SQL_INTEGER ); + $exonst->bind_param( 3, $exon->end, SQL_INTEGER ); + $exonst->bind_param( 4, $exon->strand, SQL_TINYINT ); + $exonst->bind_param( 5, $exon->phase, SQL_TINYINT ); + $exonst->bind_param( 6, $exon->end_phase, SQL_TINYINT ); + $exonst->bind_param( 7, $is_current, SQL_TINYINT ); + $exonst->bind_param( 8, $is_constitutive, SQL_TINYINT ); + + if ( defined($exon->stable_id) ) { + + $exonst->bind_param( 9, $exon->stable_id, SQL_VARCHAR ); + my $version = ($exon->version()) ? $exon->version() : 1; + $exonst->bind_param( 10, $version, SQL_INTEGER ); + } + + $exonst->execute(); + $exonId = $exonst->{'mysql_insertid'}; + + # Now the supporting evidence + my $esf_adaptor = $db->get_SupportingFeatureAdaptor; + $esf_adaptor->store($exonId, $exon->get_all_supporting_features); + + # + # Finally, update the dbID and adaptor of the exon (and any component exons) + # to point to the new database + # + + $original->adaptor($self); + $original->dbID($exonId); + + return $exonId; +} + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::Exon $exon + the exon to remove from the database + Example : $exon_adaptor->remove($exon); + Description: Removes an exon from the database. This method is generally + called by the TranscriptAdaptor::store method. Database + integrity will not be maintained if this method is simply + called on its own without taking into account transcripts which + may refer to the exon being removed. + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $exon = shift; + + if(!ref($exon) || !$exon->isa('Bio::EnsEMBL::Exon')) { + throw('Bio::EnsEMBL::Exon argument expected.'); + } + + if(!$exon->is_stored($self->db())) { + warning("Cannot remove exon " .$exon->dbID. + "Is not stored in this database."); + return; + } + + # sanity check: make sure nobdody tries to slip past a prediction exon + # which inherits from exon but actually uses different tables + if($exon->isa('Bio::EnsEMBL::PredictionExon')) { + throw("ExonAdaptor can only remove Exons not PredictionExons."); + } + + # Remove the supporting features of this exon + + my $prot_adp = $self->db->get_ProteinAlignFeatureAdaptor; + my $dna_adp = $self->db->get_DnaAlignFeatureAdaptor; + + my $sth = $self->prepare("SELECT feature_type, feature_id " . + "FROM supporting_feature " . + "WHERE exon_id = ?"); + $sth->bind_param(1, $exon->dbID, SQL_INTEGER); + $sth->execute(); + + # statements to check for shared align_features + my $sth1 = $self->prepare("SELECT count(*) FROM supporting_feature " . + "WHERE feature_type = ? AND feature_id = ?"); + my $sth2 = $self->prepare("SELECT count(*) " . + "FROM transcript_supporting_feature " . + "WHERE feature_type = ? AND feature_id = ?"); + + SUPPORTING_FEATURE: + while(my ($type, $feature_id) = $sth->fetchrow()){ + + # only remove align_feature if this is the last reference to it + $sth1->bind_param(1, $type, SQL_VARCHAR); + $sth1->bind_param(2, $feature_id, SQL_INTEGER); + $sth1->execute; + $sth2->bind_param(1, $type, SQL_VARCHAR); + $sth2->bind_param(2, $feature_id, SQL_INTEGER); + $sth2->execute; + my ($count1) = $sth1->fetchrow; + my ($count2) = $sth2->fetchrow; + if ($count1 + $count2 > 1) { + #warn "shared feature, not removing $type|$feature_id\n"; + next SUPPORTING_FEATURE; + } + + #warn "removing $type|$feature_id\n"; + + if($type eq 'protein_align_feature'){ + my $f = $prot_adp->fetch_by_dbID($feature_id); + $prot_adp->remove($f); + } + elsif($type eq 'dna_align_feature'){ + my $f = $dna_adp->fetch_by_dbID($feature_id); + $dna_adp->remove($f); + } + else { + warning("Unknown supporting feature type $type. Not removing feature."); + } + } + $sth->finish(); + $sth1->finish(); + $sth2->finish(); + + # delete the association to supporting features + + $sth = $self->prepare("DELETE FROM supporting_feature WHERE exon_id = ?"); + $sth->bind_param(1, $exon->dbID, SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + + # delete the exon + + $sth = $self->prepare( "DELETE FROM exon WHERE exon_id = ?" ); + $sth->bind_param(1, $exon->dbID, SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + $exon->dbID(undef); + $exon->adaptor(undef); + + return; +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @exon_ids = @{$exon_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all exons in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("exon",undef, $ordered); +} + + +=head2 list_stable_ids + + Arg [1] : none + Example : @stable_exon_ids = @{$exon_adaptor->list_stable_dbIDs()}; + Description: Gets an array of stable ids for all exons in the current db + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_stable_ids { + my ($self) = @_; + + return $self->_list_dbIDs("exon", "stable_id"); +} + +#_objs_from_sth +# +# Arg [1] : StatementHandle $sth +# Example : none +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of Exons +# Returntype : listref of Bio::EnsEMBL::Exons in target coordinate system +# Exceptions : none +# Caller : internal + +sub _objs_from_sth { + my ( $self, $sth, $mapper, $dest_slice ) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + + my @exons; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ( $exon_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $phase, + $end_phase, $is_current, $is_constitutive, + $stable_id, $version, $created_date, + $modified_date ); + + $sth->bind_columns( \( $exon_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $phase, + $end_phase, $is_current, $is_constitutive, + $stable_id, $version, $created_date, + $modified_date ) ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_cs; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + my $asma; + + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_cs = $dest_slice->coord_system(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + $asma = $self->db->get_AssemblyMapperAdaptor(); + } + +FEATURE: while ( $sth->fetch() ) { + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + + my $slice = $slice_hash{ "ID:" . $seq_region_id }; + my $dest_mapper = $mapper; + + if ( !$slice ) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{ "ID:" . $seq_region_id } = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + #obtain a mapper if none was defined, but a dest_seq_region was + if ( !$dest_mapper + && $dest_slice + && !$dest_slice_cs->equals( $slice->coord_system ) ) + { + $dest_mapper = + $asma->fetch_by_CoordSystems( $dest_slice_cs, + $slice->coord_system ); + $asm_cs = $dest_mapper->assembled_CoordSystem(); + $cmp_cs = $dest_mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # + # Remap the feature coordinates to another coord system if a mapper + # was provided. + # + if ( defined($dest_mapper) ) { + + if (defined $dest_slice && $dest_mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $dest_mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = $dest_mapper->fastmap( $sr_name, $seq_region_start, + $seq_region_end, $seq_region_strand, + $sr_cs ); + } + + # Skip features that map to gaps or coord system boundaries. + if ( !defined($seq_region_id) ) { next FEATURE } + + # Get a slice in the coord system we just mapped to + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + + # + # If a destination slice was provided convert the coords. + # + if ( defined($dest_slice) ) { + if ( $dest_slice_strand == 1 ) { + # On the positive strand. + + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + + if ( ( $seq_region_end > $dest_slice_start || $seq_region_end < 0 || ( $dest_slice_start > $dest_slice_end + && $seq_region_end < 0 ) ) && $dest_slice->is_circular() ) { + # Handle circular chromosomes. + + if ( $seq_region_start > $seq_region_end ) { + # Looking at a feature overlapping the chromsome origin. + + if ( $seq_region_end > $dest_slice_start ) { + # Looking at the region in the beginning of the + # chromosome. + $seq_region_start -= $dest_slice->seq_region_length(); + } + + if ( $seq_region_end < 0 ) { + $seq_region_end += $dest_slice->seq_region_length(); + } + + } else { + if ( $dest_slice_start > $dest_slice_end + && $seq_region_end < 0 ) + { + # Looking at the region overlapping the chromosome + # origin and a feature which is at the beginning of the + # chromosome. + $seq_region_start += $dest_slice->seq_region_length(); + $seq_region_end += $dest_slice->seq_region_length(); + } + } + } + + } else { + # On the negative strand. + + if ( $seq_region_start > $seq_region_end && $dest_slice->is_circular() ) + { + # Handle circular chromosomes. + + if ( $dest_slice_start > $dest_slice_end ) { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = + $dest_slice_end + + $dest_slice->seq_region_length() - + $tmp_seq_region_start + 1; + } else { + + if ( $seq_region_end > $dest_slice_start ) { + # Looking at the region in the beginning of the + # chromosome. + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = + $seq_region_end - + $dest_slice->seq_region_length() - + $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = + $dest_slice_end - + $seq_region_end - + $dest_slice->seq_region_length() + 1; + $seq_region_end = + $dest_slice_end - $tmp_seq_region_start + 1; + } + + } + + } else { + # Non-circular chromosome. + + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + } + + $seq_region_strand = -$seq_region_strand; + + } ## end else [ if ( $dest_slice_strand...)] + + # Throw away features off the end of the requested slice. + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_id != $seq_region_id ) ) + { + next FEATURE; + } + + $slice = $dest_slice; + } ## end if ( defined($dest_slice...)) + + # Finally, create the new exon. + push( @exons, + $self->_create_feature_fast( + 'Bio::EnsEMBL::Exon', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $exon_id, + 'stable_id' => $stable_id, + 'version' => $version, + 'created_date' => $created_date || undef, + 'modified_date' => $modified_date || undef, + 'phase' => $phase, + 'end_phase' => $end_phase, + 'is_current' => $is_current, + 'is_constitutive' => $is_constitutive } ) + ); + + } ## end while ( $sth->fetch() ) + + return \@exons; +} ## end sub _objs_from_sth + +=head1 DEPRECATED METHODS + +=cut + + +=head2 get_stable_entry_info + + Description: DEPRECATED. This method is no longer necessary. Exons are + always fetched with their stable identifiers (if they exist) and + no lazy loading is necessary. + +=cut + +sub get_stable_entry_info { + my ($self,$exon) = @_; + + deprecated( "This method call shouldnt be necessary" ); + + if( !$exon || !ref $exon || !$exon->isa('Bio::EnsEMBL::Exon') ) { + $self->throw("Needs a exon object, not a $exon"); + } + if(!$exon->dbID){ + #$self->throw("can't fetch stable info with no dbID"); + return; + } + + my $created_date = $self->db->dbc->from_date_to_seconds("created_date"); + my $modified_date = $self->db->dbc->from_date_to_seconds("modified_date"); + my $sth = $self->prepare("SELECT stable_id, " . $created_date . ", + " . $modified_date . ", version + FROM exon + WHERE exon_id = "); + + $sth->bind_param(1, $exon->dbID, SQL_INTEGER); + $sth->execute(); + + # my @array = $sth->fetchrow_array(); + if( my $aref = $sth->fetchrow_arrayref() ) { + $exon->{'_stable_id'} = $aref->[0]; + $exon->{'_created'} = $aref->[1]; + $exon->{'_modified'} = $aref->[2]; + $exon->{'_version'} = $aref->[3]; + } + + return 1; +} + + +=head2 fetch_all_by_gene_id + + Description: DEPRECATED. This method should not be needed - Exons can + be fetched by Transcript. + +=cut + +sub fetch_all_by_gene_id { + my ( $self, $gene_id ) = @_; + my %exons; + my $hashRef; + my ( $currentId, $currentTranscript ); + + deprecated( "Hopefully this method is not needed any more. Exons should be fetched by Transcript" ); + + if( !$gene_id ) { + $self->throw("Gene dbID not defined"); + } + + $self->{rchash} = {}; + + my $query = qq { + SELECT + STRAIGHT_JOIN + e.exon_id + , e.contig_id + , e.contig_start + , e.contig_end + , e.contig_strand + , e.phase + , e.end_phase + , e.sticky_rank + FROM transcript t + , exon_transcript et + , exon e + WHERE t.gene_id = ? + AND et.transcript_id = t.transcript_id + AND e.exon_id = et.exon_id + ORDER BY t.transcript_id,e.exon_id + , e.sticky_rank DESC + }; + + my $sth = $self->prepare( $query ); + $sth->bind_param(1,$gene_id,SQL_INTEGER); + $sth->execute(); + + while( $hashRef = $sth->fetchrow_hashref() ) { + if( ! exists $exons{ $hashRef->{exon_id} } ) { + + my $exon = $self->_exon_from_sth( $sth, $hashRef ); + + $exons{$exon->dbID} = $exon; + } + } + delete $self->{rchash}; + + my @out = (); + + push @out, values %exons; + + return \@out; +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/GOTermAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/GOTermAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,66 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::GOTermAdaptor + +=head1 DESCRIPTION + +A specialization of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor, +specifically for Gene Ontology (GO) terms. See the documentation of +Bio::EnsEMBL::DBSQL::OntologyTermAdaptor for further information. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::GOTermAdaptor; + +use strict; +use warnings; + +use base qw( Bio::EnsEMBL::DBSQL::OntologyTermAdaptor ); + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Argument required for parent class + Bio::EnsEMBL::DBSQL::BaseAdaptor. + + Description : Creates an ontology term adaptor for GO terms. + + Example : + + my $go_adaptor = Bio::EnsEMBL::DBSQL::GOTermAdaptor->new( $dba ); + + Return type : Bio::EnsEMBL::DBSQL::GOTermAdaptor + +=cut + +sub new { + my ( $proto, $dba ) = @_; + + my $this = $proto->SUPER::new( $dba, 'GO' ); + + return $this; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2382 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::GeneAdaptor - Database adaptor for the retrieval and +storage of Gene objects + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous', + ); + + $gene_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "gene" ); + + $gene = $gene_adaptor->fetch_by_dbID(1234); + + $gene = $gene_adaptor->fetch_by_stable_id('ENSG00000184129'); + + @genes = @{ $gene_adaptor->fetch_all_by_external_name('BRCA2') }; + + $slice_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" ); + + $slice = + $slice_adaptor->fetch_by_region( 'chromosome', '1', 1, 1000000 ); + + @genes = @{ $gene_adaptor->fetch_all_by_Slice($slice) }; + +=head1 DESCRIPTION + +This is a database aware adaptor for the retrieval and storage of gene +objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::GeneAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Gene; + +use vars '@ISA'; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _tables { + return ( + [ 'gene', 'g' ], + [ 'xref', 'x' ], + [ 'external_db', 'exdb' ] ); +} + + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _columns { + my ($self) = @_; + + my $created_date = + $self->db()->dbc()->from_date_to_seconds("g.created_date"); + my $modified_date = + $self->db()->dbc()->from_date_to_seconds("g.modified_date"); + + return ( + 'g.gene_id', 'g.seq_region_id', + 'g.seq_region_start', 'g.seq_region_end', + 'g.seq_region_strand', 'g.analysis_id', + 'g.biotype', 'g.display_xref_id', + 'g.description', 'g.status', + 'g.source', 'g.is_current', + 'g.canonical_transcript_id', 'g.canonical_annotation', + 'g.stable_id', 'g.version', + $created_date, $modified_date, + 'x.display_label', 'x.dbprimary_acc', + 'x.description', 'x.version', + 'exdb.db_name', 'exdb.status', + 'exdb.db_release', 'exdb.db_display_name', + 'x.info_type', 'x.info_text' + ); +} ## end sub _columns + + +sub _left_join { + return ( + [ 'xref', "x.xref_id = g.display_xref_id" ], + [ 'external_db', "exdb.external_db_id = x.external_db_id" ] ); +} + + +=head2 list_dbIDs + + Example : @gene_ids = @{$gene_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all genes in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : Listref of Ints + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("gene",undef, $ordered); +} + + +=head2 list_stable_ids + + Example : @stable_gene_ids = @{$gene_adaptor->list_stable_ids()}; + Description: Gets an listref of stable ids for all genes in the current db + Returntype : reference to a list of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_stable_ids { + my ($self) = @_; + + return $self->_list_dbIDs("gene", "stable_id"); +} + + +sub list_seq_region_ids { + my $self = shift; + + return $self->_list_seq_region_ids('gene'); +} + +=head2 fetch_by_display_label + + Arg [1] : String $label - display label of gene to fetch + Example : my $gene = $geneAdaptor->fetch_by_display_label("BRCA2"); + Description: Returns the gene which has the given display label or undef if + there is none. If there are more than 1, the gene on the + reference slice is reported or if none are on the reference, + the first one is reported. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_display_label { + my $self = shift; + my $label = shift; + + my $constraint = "x.display_label = ? AND g.is_current = 1"; + $self->bind_param_generic_fetch($label,SQL_VARCHAR); + my @genes = @{ $self->generic_fetch($constraint) }; + my $gene; + if (scalar(@genes) > 1) { + foreach my $gene_tmp (@genes) { + if ($gene_tmp->slice->is_reference) { + $gene = $gene_tmp; + } + last if ($gene); + } + if (!$gene) { + $gene = $genes[0]; + } + + } elsif (scalar(@genes) == 1) { + $gene = $genes[0]; + } + + return $gene; +} + +=head2 fetch_all_by_display_label + + Arg [1] : String $label - display label of genes to fetch + Example : my @genes = @{$geneAdaptor->fetch_all_by_display_label("PPP1R2P1")}; + Description: Returns all genes which have the given display label or undef if + there are none. + Returntype : listref of Bio::EnsEMBL::Gene objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_display_label { + my $self = shift; + my $label = shift; + + my $constraint = "x.display_label = ? AND g.is_current = 1"; + $self->bind_param_generic_fetch($label,SQL_VARCHAR); + my $genes = $self->generic_fetch($constraint) ; + + return $genes; +} + +=head2 fetch_by_stable_id + + Arg [1] : String $id + The stable ID of the gene to retrieve + Example : $gene = $gene_adaptor->fetch_by_stable_id('ENSG00000148944'); + Description: Retrieves a gene object from the database via its stable id. + The gene will be retrieved in its native coordinate system (i.e. + in the coordinate system it is stored in the database). It may + be converted to a different coordinate system through a call to + transform() or transfer(). If the gene or exon is not found + undef is returned instead. + Returntype : Bio::EnsEMBL::Gene or undef + Exceptions : if we cant get the gene in given coord system + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ($self, $stable_id) = @_; + + my $constraint = "g.stable_id = ? AND g.is_current = 1"; + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + my ($gene) = @{ $self->generic_fetch($constraint) }; + + return $gene; +} + + + +=head2 fetch_all_by_biotype + + Arg [1] : String $biotype + listref of $biotypes + The biotype of the gene to retrieve. You can have as an argument a reference + to a list of biotypes + Example : $gene = $gene_adaptor->fetch_all_by_biotype('protein_coding'); + $gene = $gene_adaptor->fetch_all_by_biotypes(['protein_coding', 'sRNA', 'miRNA']); + Description: Retrieves an array reference of gene objects from the database via its biotype or biotypes. + The genes will be retrieved in its native coordinate system (i.e. + in the coordinate system it is stored in the database). It may + be converted to a different coordinate system through a call to + transform() or transfer(). If the gene or exon is not found + undef is returned instead. + Returntype : listref of Bio::EnsEMBL::Gene + Exceptions : if we cant get the gene in given coord system + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_biotype { + my ($self, $biotype) = @_; + + if (!defined $biotype){ + throw("Biotype or listref of biotypes expected"); + } + my $constraint; + if (ref($biotype) eq 'ARRAY'){ + $constraint = "g.biotype IN ("; + foreach my $b (@{$biotype}){ + $constraint .= "?,"; + $self->bind_param_generic_fetch($b,SQL_VARCHAR); + } + chop($constraint); #remove last , from expression + $constraint .= ") and g.is_current = 1"; + + } + else{ + $constraint = "g.biotype = ? and g.is_current = 1"; + $self->bind_param_generic_fetch($biotype,SQL_VARCHAR); + } + my @genes = @{ $self->generic_fetch($constraint) }; + return \@genes ; +} + + +sub fetch_all { + my ($self) = @_; + + my $constraint = 'g.biotype != "LRG_gene" and g.is_current = 1'; + my @genes = @{ $self->generic_fetch($constraint) }; + return \@genes ; +} + + +=head2 fetch_all_versions_by_stable_id + + Arg [1] : String $stable_id + The stable ID of the gene to retrieve + Example : $gene = $gene_adaptor->fetch_all_versions_by_stable_id + ('ENSG00000148944'); + Description : Similar to fetch_by_stable_id, but retrieves all versions of a + gene stored in the database. + Returntype : listref of Bio::EnsEMBL::Gene + Exceptions : if we cant get the gene in given coord system + Caller : general + Status : At Risk + +=cut + +sub fetch_all_versions_by_stable_id { + my ($self, $stable_id) = @_; + + my $constraint = "g.stable_id = ?"; + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + return $self->generic_fetch($constraint); +} + + +=head2 fetch_by_exon_stable_id + + Arg [1] : String $id + The stable id of an exon of the gene to retrieve + Example : $gene = $gene_adptr->fetch_by_exon_stable_id('ENSE00000148944'); + Description: Retrieves a gene object from the database via an exon stable id. + The gene will be retrieved in its native coordinate system (i.e. + in the coordinate system it is stored in the database). It may + be converted to a different coordinate system through a call to + transform() or transfer(). If the gene or exon is not found + undef is returned instead. + Returntype : Bio::EnsEMBL::Gene or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_exon_stable_id { + my ($self, $stable_id, $version) = @_; + + my $sql = qq( + SELECT t.gene_id + FROM transcript as t, + exon_transcript as et, + exon as e + WHERE t.transcript_id = et.transcript_id + AND et.exon_id = e.exon_id + AND e.stable_id = ? + AND e.is_current = 1 + ); + + my $sth = $self->prepare($sql); + $sth->bind_param(1, $stable_id, SQL_VARCHAR); + $sth->execute(); + + my ($dbID) = $sth->fetchrow_array(); + + return undef if(!defined($dbID)); + + my $gene = $self->fetch_by_dbID($dbID); + + return $gene; +} + + +=head2 fetch_all_by_domain + + Arg [1] : String $domain + The domain to fetch genes from + Example : my @genes = @{ $gene_adaptor->fetch_all_by_domain($domain) }; + Description: Retrieves a listref of genes whose translation contain interpro + domain $domain. The genes are returned in their native coord + system (i.e. the coord_system they are stored in). If the coord + system needs to be changed, then tranform or transfer should be + called on the individual objects returned. + Returntype : list of Bio::EnsEMBL::Genes + Exceptions : none + Caller : domainview + Status : Stable + +=cut + +sub fetch_all_by_domain { + my ($self, $domain) = @_; + + throw("domain argument is required") unless ($domain); + + my $sth = $self->prepare(qq( + SELECT tr.gene_id + FROM interpro i, + protein_feature pf, + transcript tr, + translation tl, + seq_region sr, + coord_system cs + WHERE cs.species_id = ? + AND cs.coord_system_id = sr.coord_system_id + AND sr.seq_region_id = tr.seq_region_id + AND tr.is_current = 1 + AND tr.transcript_id = tl.transcript_id + AND tl.translation_id = pf.translation_id + AND pf.hit_name = i.id + AND i.interpro_ac = ? + GROUP BY tr.gene_id)); + + $sth->bind_param( 1, $self->species_id(), SQL_VARCHAR ); + $sth->bind_param( 2, $domain, SQL_VARCHAR ); + + $sth->execute(); + + my @array = @{$sth->fetchall_arrayref()}; + $sth->finish(); + + my @gene_ids = map {$_->[0]} @array; + + return $self->fetch_all_by_dbID_list(\@gene_ids); +} + + + +=head2 fetch_all_by_Slice_and_external_dbname_link + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch genes on. + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Arg [3] : (optional) boolean $load_transcripts + if true, transcripts will be loaded immediately + rather than lazy loaded later. + Arg [4] : Name of the external database + Example : @genes = @{ + $ga->fetch_all_by_Slice_and_external_dbname_link( + $slice, undef, undef, "HUGO" ) }; + Description: Overrides superclass method to optionally load + transcripts immediately rather than lazy-loading them + later. This is more efficient when there are a lot + of genes whose transcripts are going to be used. The + genes are then filtered to return only those with + external database links of the type specified + Returntype : reference to list of genes + Exceptions : thrown if exon cannot be placed on transcript slice + Caller : + Status : Stable + +=cut + +sub fetch_all_by_Slice_and_external_dbname_link { + my ( $self, $slice, $logic_name, $load_transcripts, $db_name ) = @_; + + # Get the external_db_id(s) from the name. + my $sth = $self->prepare( + "SELECT external_db_id FROM external_db WHERE db_name = ?"); + + $sth->bind_param( 1, $db_name, SQL_VARCHAR ); + $sth->execute(); + + my $external_db_id; + $sth->bind_columns( \$external_db_id ); + + my @external_db_ids; + while ( $sth->fetch() ) { + push( @external_db_ids, $external_db_id ); + } + + if ( scalar(@external_db_ids) == 0 ) { + warn sprintf( "Could not find external database " + . "'%s' in the external_db table\n" + . "Available are:\n", + $db_name ); + + $sth = $self->prepare("SELECT DISTINCT db_name FROM external_db"); + + $sth->execute(); + $sth->bind_columns( \$external_db_id ); + + while ( $sth->fetch() ) { + warn "\t$external_db_id\n"; + } + return []; + } + + # Get the gene_ids for those with links. + my $dbe_adaptor = $self->db()->get_DBEntryAdaptor(); + + my %linked_genes; + foreach $external_db_id (@external_db_ids) { + my @linked_genes = + $dbe_adaptor->list_gene_ids_by_external_db_id($external_db_id); + + foreach my $gene_id (@linked_genes) { + $linked_genes{$gene_id} = 1; + } + } + + # Get all the genes on the slice. + my $genes = $self->SUPER::fetch_all_by_Slice_constraint( $slice, + 'g.is_current = 1', $logic_name ); + + # Create a list of those that are in the gene_ids list. + my @genes_passed; + foreach my $gene (@$genes) { + if ( exists( $linked_genes{ $gene->dbID() } ) ) { + push( @genes_passed, $gene ); + } + } + + # Return the list of those that passed. + return \@genes_passed; +} ## end sub fetch_all_by_Slice_and_external_dbname_link + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch genes on. + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Arg [3] : (optional) boolean $load_transcripts + if true, transcripts will be loaded immediately rather than + lazy loaded later. + Arg [4] : (optional) string $source + the source name of the features to obtain. + Arg [5] : (optional) string biotype + the biotype of the features to obtain. + Example : @genes = @{$gene_adaptor->fetch_all_by_Slice()}; + Description: Overrides superclass method to optionally load transcripts + immediately rather than lazy-loading them later. This + is more efficient when there are a lot of genes whose + transcripts are going to be used. + Returntype : reference to list of genes + Exceptions : thrown if exon cannot be placed on transcript slice + Caller : Slice::get_all_Genes + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ( $self, $slice, $logic_name, $load_transcripts, $source, + $biotype ) = @_; + + my $constraint = 'g.is_current = 1'; + + if ( defined($source) ) { + $constraint .= " and g.source = '$source'"; + } + if ( defined($biotype) ) { + $constraint .= " and g.biotype = '$biotype'"; + } + + my $genes = + $self->SUPER::fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); + + # If there are less than two genes, still do lazy-loading. + if ( !$load_transcripts || @$genes < 2 ) { + return $genes; + } + + # Preload all of the transcripts now, instead of lazy loading later, + # faster than one query per transcript. + + # First check if transcripts are already preloaded. + # FIXME: Should check all transcripts. + if ( exists( $genes->[0]->{'_transcript_array'} ) ) { + return $genes; + } + + # Get extent of region spanned by transcripts. + my ( $min_start, $max_end ); + foreach my $g (@$genes) { + if ( !defined($min_start) || $g->seq_region_start() < $min_start ) { + $min_start = $g->seq_region_start(); + } + if ( !defined($max_end) || $g->seq_region_end() > $max_end ) { + $max_end = $g->seq_region_end(); + } + } + + my $ext_slice; + + if ( $min_start >= $slice->start() && $max_end <= $slice->end() ) { + $ext_slice = $slice; + } else { + my $sa = $self->db()->get_SliceAdaptor(); + $ext_slice = $sa->fetch_by_region( + $slice->coord_system->name(), $slice->seq_region_name(), + $min_start, $max_end, + $slice->strand(), $slice->coord_system->version() ); + } + + # Associate transcript identifiers with genes. + + my %g_hash = map { $_->dbID => $_ } @{$genes}; + + my $g_id_str = join( ',', keys(%g_hash) ); + + my $sth = + $self->prepare( "SELECT gene_id, transcript_id " + . "FROM transcript " + . "WHERE gene_id IN ($g_id_str)" ); + + $sth->execute(); + + my ( $g_id, $tr_id ); + $sth->bind_columns( \( $g_id, $tr_id ) ); + + my %tr_g_hash; + + while ( $sth->fetch() ) { + $tr_g_hash{$tr_id} = $g_hash{$g_id}; + } + + my $ta = $self->db()->get_TranscriptAdaptor(); + my $transcripts = $ta->fetch_all_by_Slice( + $ext_slice, + 1, undef, + sprintf( "t.transcript_id IN (%s)", + join( ',', sort { $a <=> $b } keys(%tr_g_hash) ) ) ); + + # Move transcripts onto gene slice, and add them to genes. + foreach my $tr ( @{$transcripts} ) { + if ( !exists( $tr_g_hash{ $tr->dbID() } ) ) { next } + + my $new_tr; + if ( $slice != $ext_slice ) { + $new_tr = $tr->transfer($slice); + if ( !defined($new_tr) ) { + throw("Unexpected. " + . "Transcript could not be transfered onto Gene slice." ); + } + } else { + $new_tr = $tr; + } + + $tr_g_hash{ $tr->dbID() }->add_Transcript($new_tr); + } + + return $genes; +} ## end sub fetch_all_by_Slice + +=head2 fetch_by_transcript_id + + Arg [1] : Int $trans_id + Unique database identifier for the transcript whose gene should + be retrieved. The gene is returned in its native coord + system (i.e. the coord_system it is stored in). If the coord + system needs to be changed, then tranform or transfer should + be called on the returned object. undef is returned if the + gene or transcript is not found in the database. + Example : $gene = $gene_adaptor->fetch_by_transcript_id(1241); + Description: Retrieves a gene from the database via the database identifier + of one of its transcripts. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_transcript_id { + my ($self, $trans_id) = @_; + + # this is a cheap SQL call + my $sth = $self->prepare(qq( + SELECT tr.gene_id + FROM transcript tr + WHERE tr.transcript_id = ? + )); + + $sth->bind_param(1, $trans_id, SQL_INTEGER); + $sth->execute(); + + my ($geneid) = $sth->fetchrow_array(); + + $sth->finish(); + + return undef if( !defined $geneid ); + + my $gene = $self->fetch_by_dbID($geneid); + return $gene; +} + + +=head2 fetch_by_transcript_stable_id + + Arg [1] : string $trans_stable_id + transcript stable ID whose gene should be retrieved + Example : my $gene = $gene_adaptor->fetch_by_transcript_stable_id + ('ENST0000234'); + Description: Retrieves a gene from the database via the stable ID of one of + its transcripts + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_transcript_stable_id { + my ($self, $trans_stable_id) = @_; + + my $sth = $self->prepare(qq( + SELECT gene_id + FROM transcript + WHERE stable_id = ? + AND is_current = 1 + )); + + $sth->bind_param(1, $trans_stable_id, SQL_VARCHAR); + $sth->execute(); + + my ($geneid) = $sth->fetchrow_array(); + $sth->finish; + + return undef if (!defined $geneid); + + my $gene = $self->fetch_by_dbID($geneid); + return $gene; +} + + +=head2 fetch_by_translation_stable_id + + Arg [1] : String $translation_stable_id + The stable id of a translation of the gene to be obtained + Example : my $gene = $gene_adaptor->fetch_by_translation_stable_id + ('ENSP00000278194'); + Description: Retrieves a gene via the stable id of one of its translations. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_translation_stable_id { + my ($self, $translation_stable_id) = @_; + + my $sth = $self->prepare(qq( + SELECT tr.gene_id + FROM transcript tr, + translation tl + WHERE tl.stable_id = ? + AND tr.transcript_id = tl.transcript_id + AND tr.is_current = 1 + )); + + $sth->bind_param(1, $translation_stable_id, SQL_VARCHAR); + $sth->execute(); + + my ($geneid) = $sth->fetchrow_array(); + $sth->finish; + if( !defined $geneid ) { + return undef; + } + return $self->fetch_by_dbID($geneid); +} + + + + +=head2 fetch_all_by_external_name + + Arg [1] : String $external_name + The external identifier for the gene to be obtained + Arg [2] : (optional) String $external_db_name + The name of the external database from which the + identifier originates. + Arg [3] : Boolean override. Force SQL regex matching for users + who really do want to find all 'NM%' + Example : @genes = @{$gene_adaptor->fetch_all_by_external_name('BRCA2')} + @many_genes = @{$gene_adaptor->fetch_all_by_external_name('BRCA%')} + Description: Retrieves a list of genes with an external database + identifier $external_name. The genes returned are in + their native coordinate system, i.e. in the coordinate + system they are stored in the database in. If another + coordinate system is required then the Gene::transfer or + Gene::transform method can be used. + SQL wildcards % and _ are supported in the $external_name, + but their use is somewhat restricted for performance reasons. + Users that really do want % and _ in the first three characters + should use argument 3 to prevent optimisations + Returntype : listref of Bio::EnsEMBL::Gene + Exceptions : none + Caller : goview, general + Status : Stable + +=cut + +sub fetch_all_by_external_name { + my ( $self, $external_name, $external_db_name, $override ) = @_; + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + + my @ids = + $entryAdaptor->list_gene_ids_by_extids( $external_name, + $external_db_name, $override ); + + my %genes_by_dbIDs = + map { $_->dbID(), $_ } @{ $self->fetch_all_by_dbID_list( \@ids ) }; + + my @result = map { $genes_by_dbIDs{$_} } @ids; + + return \@result; +} + +=head2 fetch_all_by_GOTerm + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The GO term for which genes should be fetched. + + Example: @genes = @{ + $gene_adaptor->fetch_all_by_GOTerm( + $go_adaptor->fetch_by_accession('GO:0030326') ) }; + + Description : Retrieves a list of genes that are associated with + the given GO term, or with any of its descendent + GO terms. The genes returned are in their native + coordinate system, i.e. in the coordinate system + in which they are stored in the database. If + another coordinate system is required then the + Gene::transfer or Gene::transform method can be + used. + + Return type : listref of Bio::EnsEMBL::Gene + Exceptions : Throws of argument is not a GO term + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_GOTerm { + my ( $self, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + if ( $term->ontology() ne 'GO' ) { + throw('Argument is not a GO term'); + } + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + + my %unique_dbIDs; + foreach my $accession ( map { $_->accession() } + ( $term, @{ $term->descendants() } ) ) + { + my @ids = + $entryAdaptor->list_gene_ids_by_extids( $accession, 'GO' ); + foreach my $dbID (@ids) { $unique_dbIDs{$dbID} = 1 } + } + + my @result = @{ + $self->fetch_all_by_dbID_list( + [ sort { $a <=> $b } keys(%unique_dbIDs) ] + ) }; + + return \@result; +} ## end sub fetch_all_by_GOTerm + +=head2 fetch_all_by_GOTerm_accession + + Arg [1] : String + The GO term accession for which genes should be + fetched. + + Example : + + @genes = + @{ $gene_adaptor->fetch_all_by_GOTerm_accession( + 'GO:0030326') }; + + Description : Retrieves a list of genes that are associated with + the given GO term, or with any of its descendent + GO terms. The genes returned are in their native + coordinate system, i.e. in the coordinate system + in which they are stored in the database. If + another coordinate system is required then the + Gene::transfer or Gene::transform method can be + used. + + Return type : listref of Bio::EnsEMBL::Gene + Exceptions : Throws of argument is not a GO term accession + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_GOTerm_accession { + my ( $self, $accession ) = @_; + + if ( $accession !~ /^GO:/ ) { + throw('Argument is not a GO term accession'); + } + + my $goAdaptor = + Bio::EnsEMBL::Registry->get_adaptor( 'Multi', 'Ontology', + 'OntologyTerm' ); + + my $term = $goAdaptor->fetch_by_accession($accession); + + return $self->fetch_all_by_GOTerm($term); +} + +=head2 fetch_all_alt_alleles + + Arg [1] : Bio::EnsEMBL::Gene $gene + The gene to fetch alternative alleles for + Example : my @alt_genes = @{ $gene_adaptor->fetch_all_alt_alleles($gene) }; + foreach my $alt_gene (@alt_genes) { + print "Alternate allele: " . $alt_gene->stable_id() . "\n" ; + } + Description: Retrieves genes which are alternate alleles to a provided gene. + Alternate alleles in Ensembl are genes which are similar and are + on an alternative haplotype of the same region. There are not + currently very many of these. This method will return a + reference to an empty list if no alternative alleles are found. + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : throw if incorrect arg provided + warning if gene arg does not have dbID + Caller : Gene::get_all_alt_alleles + Status : Stable + +=cut + +sub fetch_all_alt_alleles { + my $self = shift; + my $gene = shift; + + if(!ref($gene) || !$gene->isa('Bio::EnsEMBL::Gene')) { + throw('Bio::EnsEMBL::Gene argument is required'); + } + + my $gene_id = $gene->dbID(); + + if(!$gene_id) { + warning('Cannot retrieve alternate alleles for gene without dbID'); + return []; + } + + my $sth = $self->prepare("SELECT aa1.gene_id " . + "FROM alt_allele aa1, alt_allele aa2 " . + "WHERE aa1.alt_allele_id = aa2.alt_allele_id " . + "AND aa2.gene_id = ? " . + "AND aa1.gene_id <> ?"); + + $sth->bind_param(1, $gene_id, SQL_INTEGER); + $sth->bind_param(2, $gene_id, SQL_INTEGER); + $sth->execute(); + + my @alt_ids; + my $row; + while($row = $sth->fetchrow_arrayref()) { + push @alt_ids, $row->[0]; + } + $sth->finish(); + + if (@alt_ids) { + return $self->fetch_all_by_dbID_list(\@alt_ids); + } + + return []; +} + +sub is_ref{ + my ( $self, $gene_id) = @_; + my $is_not_ref; + + # easier to find if it is not an alt_Allele do this and then negate it. + my $sth = $self->prepare("select count(1) from alt_allele where gene_id = $gene_id and is_ref = 0"); + $sth->execute(); + $sth->bind_columns(\$is_not_ref); + $sth->fetch; + + if(defined($is_not_ref) and $is_not_ref){ + return 0; + } + + return 1; +} + + +=head2 store_alt_alleles + + + Arg [1] : reference to list of Bio::EnsEMBL::Genes $genes + Example : $gene_adaptor->store_alt_alleles([$gene1, $gene2, $gene3]); + Description: This method creates a group of alternative alleles (i.e. locus) + from a set of genes. The genes should be genes from alternate + haplotypes which are similar. The genes must already be stored + in this database. + Returntype : int alt_allele_id or undef if no alt_alleles were stored + Exceptions : throw on incorrect arguments + throw on sql error (e.g. duplicate unique id) + Caller : general + Status : Stable + +=cut + +sub store_alt_alleles { + my $self = shift; + my $genes = shift; + + if(!ref($genes) eq 'ARRAY') { + throw('List reference of Bio::EnsEMBL::Gene argument expected.'); + } + + my @genes = @$genes; + my $num_genes = scalar(@genes); + + if($num_genes < 2) { + warning('At least 2 genes must be provided to construct alternative alleles (gene id: '. $genes[0]->dbID() .'). Ignoring.'); + return; + } + + my @is_ref; + my @ref_genes = (); + my @non_ref_genes = (); + my @gene_ids = (); + + foreach my $gene (@genes) { + + if(!ref($gene) || !$gene->isa('Bio::EnsEMBL::Gene')) { + throw('List reference of Bio::EnsEMBL::Gene argument expected.'); + } + + my $gene_id = $gene->dbID(); + + if (!$gene_id) { + throw('Genes must have dbIDs in order to construct alternative alleles.'); + } else { + push @gene_ids, $gene_id; + } + + my $is_ref = $gene->slice->is_reference(); + + push @is_ref, $is_ref; + + if ($is_ref) { + push @ref_genes, $gene->dbID(); + } else { + push @non_ref_genes, $gene->dbID(); + } + } + if (scalar(@ref_genes) > 1) { + warning('More than one alternative allele on the reference sequence (gene ids: ' . join(',',@ref_genes) . '). Ignoring.'); + return; + } + + # + #insert the first gene seperately in order to get a unique identifier for + #the set of alleles + # + + my $sth = $self->prepare("INSERT INTO alt_allele (gene_id, is_ref) VALUES (?,?)"); + $sth->bind_param(1, $gene_ids[0], SQL_INTEGER); + $sth->bind_param(2, $is_ref[0], SQL_INTEGER); + eval { + $sth->execute(); + }; + my $alt_allele_id = $sth->{'mysql_insertid'}; + + if (!$alt_allele_id || $@) { + throw("An SQL error occured inserting alternative alleles:\n$@"); + } + $sth->finish(); + # + # Insert all subsequent alt alleles using the alt_allele identifier + # from the first insert + # + + $sth = $self->prepare("INSERT INTO alt_allele (alt_allele_id, gene_id, is_ref) " . + "VALUES (?,?,?)"); + + for (my $i = 1; $i < $num_genes; $i++) { + + $sth->bind_param(1, $alt_allele_id, SQL_INTEGER); + $sth->bind_param(2, $gene_ids[$i], SQL_INTEGER); + $sth->bind_param(3, $is_ref[$i], SQL_INTEGER); + eval { + $sth->execute(); + }; + + if ($@) { + # an error occured, revert the db to the previous state + $sth = $self->prepare("DELETE FROM alt_allele WHERE alt_allele_id = ?"); + $sth->bind_param(1, $alt_allele_id, SQL_INTEGER); + $sth->execute(); + $sth->finish(); + throw("An SQL error occured inserting alternative alleles:\n$@"); + } + } + + $sth->finish(); + + return $alt_allele_id; +} + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Gene $gene + The gene to store in the database + Arg [2] : ignore_release in xrefs [default 1] set to 0 to use release info + in external database references + Example : $gene_adaptor->store($gene); + Description: Stores a gene in the database. + Returntype : the database identifier (dbID) of the newly stored gene + Exceptions : thrown if the $gene is not a Bio::EnsEMBL::Gene or if + $gene does not have an analysis object + Caller : general + Status : Stable + +=cut + +sub store { + my ($self, $gene, $ignore_release) = @_; + + if (!ref $gene || !$gene->isa('Bio::EnsEMBL::Gene') ) { + throw("Must store a gene object, not a $gene"); + } + if(!defined($ignore_release)){ + $ignore_release = 1; + } + my $db = $self->db(); + + if ($gene->is_stored($db)) { + return $gene->dbID(); + } + + # ensure coords are correct before storing + $gene->recalculate_coordinates(); + + my $analysis = $gene->analysis(); + throw("Genes must have an analysis object.") if(!defined($analysis)); + + my $analysis_id; + if ($analysis->is_stored($db)) { + $analysis_id = $analysis->dbID(); + } else { + $analysis_id = $db->get_AnalysisAdaptor->store($analysis); + } + + my $type = $gene->biotype || ""; + + # default to is_current = 1 if this attribute is not set + my $is_current = $gene->is_current; + $is_current = 1 unless (defined($is_current)); + + my $original = $gene; + my $original_transcripts = $gene->get_all_Transcripts(); + + my $seq_region_id; + + ( $gene, $seq_region_id ) = $self->_pre_store($gene); + + my $store_gene_sql = qq( + INSERT INTO gene + SET biotype = ?, + analysis_id = ?, + seq_region_id = ?, + seq_region_start = ?, + seq_region_end = ?, + seq_region_strand = ?, + description = ?, + source = ?, + status = ?, + is_current = ?, + canonical_transcript_id = ?, + canonical_annotation = ? + ); + + if (defined($gene->stable_id)) { + my $created = $self->db->dbc->from_seconds_to_date($gene->created_date()); + my $modified = $self->db->dbc->from_seconds_to_date($gene->modified_date()); + $store_gene_sql .= ", stable_id = ?, version = ?, created_date = " . $created . " , modified_date = " . $modified; + + } + + # column status is used from schema version 34 onwards (before it was + # confidence) + + my $sth = $self->prepare($store_gene_sql); + $sth->bind_param( 1, $type, SQL_VARCHAR ); + $sth->bind_param( 2, $analysis_id, SQL_INTEGER ); + $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); + $sth->bind_param( 4, $gene->start(), SQL_INTEGER ); + $sth->bind_param( 5, $gene->end(), SQL_INTEGER ); + $sth->bind_param( 6, $gene->strand(), SQL_TINYINT ); + $sth->bind_param( 7, $gene->description(), SQL_LONGVARCHAR ); + $sth->bind_param( 8, $gene->source(), SQL_VARCHAR ); + $sth->bind_param( 9, $gene->status(), SQL_VARCHAR ); + $sth->bind_param( 10, $is_current, SQL_TINYINT ); + + # Canonical transcript ID will be updated later. + # Set it to zero for now. + $sth->bind_param( 11, 0, SQL_TINYINT ); + + $sth->bind_param( 12, $gene->canonical_annotation(), SQL_VARCHAR ); + + if ( defined($gene->stable_id) ) { + + $sth->bind_param( 13, $gene->stable_id, SQL_VARCHAR ); + my $version = ($gene->version()) ? $gene->version() : 1; + $sth->bind_param( 14, $version, SQL_INTEGER ); + } + + $sth->execute(); + $sth->finish(); + + my $gene_dbID = $sth->{'mysql_insertid'}; + + # store the dbentries associated with this gene + my $dbEntryAdaptor = $db->get_DBEntryAdaptor(); + + foreach my $dbe ( @{ $gene->get_all_DBEntries } ) { + $dbEntryAdaptor->store( $dbe, $gene_dbID, "Gene", $ignore_release ); + } + + # We allow transcripts not to share equal exons and instead have + # copies. For the database we still want sharing though, to have + # easier time with stable ids. So we need to have a step to merge + # exons together before store. + my %exons; + + foreach my $trans ( @{$gene->get_all_Transcripts} ) { + foreach my $e ( @{$trans->get_all_Exons} ) { + my $key = $e->hashkey(); + if( exists $exons{ $key } ) { + $trans->swap_exons( $e, $exons{$key} ); + } else { + $exons{$key} = $e; + } + } + } + + my $transcript_adaptor = $db->get_TranscriptAdaptor(); + + my $transcripts = $gene->get_all_Transcripts(); + + my $new_canonical_transcript_id; + for ( my $i = 0; $i < @$transcripts; $i++ ) { + my $new = $transcripts->[$i]; + my $old = $original_transcripts->[$i]; + + $transcript_adaptor->store( $new, $gene_dbID, $analysis_id ); + + if ( !defined($new_canonical_transcript_id) + && $new->is_canonical() ) + { + $new_canonical_transcript_id = $new->dbID(); + } + + # update the original transcripts since we may have made copies of + # them by transforming the gene + $old->dbID( $new->dbID() ); + $old->adaptor( $new->adaptor() ); + + if ( $new->translation ) { + $old->translation->dbID( $new->translation()->dbID ); + $old->translation->adaptor( $new->translation()->adaptor ); + } + } + + if ( defined($new_canonical_transcript_id) ) { + # Now the canonical transcript has been stored, so update the + # canonical_transcript_id of this gene with the new dbID. + my $sth = $self->prepare( + q( + UPDATE gene + SET canonical_transcript_id = ? + WHERE gene_id = ?) + ); + + $sth->bind_param( 1, $new_canonical_transcript_id, SQL_INTEGER ); + $sth->bind_param( 2, $gene_dbID, SQL_INTEGER ); + + $sth->execute(); + $sth->finish(); + } + + # update gene to point to display xref if it is set + if(my $display_xref = $gene->display_xref) { + my $dxref_id; + if($display_xref->is_stored($db)) { + $dxref_id = $display_xref->dbID(); + } else { + $dxref_id = $dbEntryAdaptor->exists($display_xref); + } + + if(defined($dxref_id)) { + my $sth = $self->prepare + ("UPDATE gene SET display_xref_id = ? WHERE gene_id = ?"); + $sth->bind_param(1, $dxref_id, SQL_INTEGER); + $sth->bind_param(2, $gene_dbID, SQL_INTEGER); + $sth->execute(); + $sth->finish(); + $display_xref->dbID($dxref_id); + $display_xref->adaptor($dbEntryAdaptor); + $display_xref->dbID($dxref_id); + $display_xref->adaptor($dbEntryAdaptor); + } else { + warning("Display_xref ".$display_xref->dbname().":". + $display_xref->display_id() . " is not stored in database.\n". + "Not storing relationship to this gene."); + $display_xref->dbID(undef); + $display_xref->adaptor(undef); + } + } + + # store gene attributes if there are any + my $attr_adaptor = $db->get_AttributeAdaptor(); + $attr_adaptor->store_on_Gene($gene_dbID, $gene->get_all_Attributes); + + # store unconventional transcript associations if there are any + my $utaa = $db->get_UnconventionalTranscriptAssociationAdaptor(); + foreach my $uta (@{$gene->get_all_unconventional_transcript_associations()}) { + $utaa->store($uta); + } + + # set the adaptor and dbID on the original passed in gene not the + # transfered copy + $original->adaptor($self); + $original->dbID($gene_dbID); + + return $gene_dbID; +} + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::Gene $gene + the gene to remove from the database + Example : $gene_adaptor->remove($gene); + Description: Removes a gene completely from the database. All associated + transcripts, exons, stable_identifiers, descriptions, etc. + are removed as well. Use with caution! + Returntype : none + Exceptions : throw on incorrect arguments + warning if gene is not stored in this database + Caller : general + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $gene = shift; + + if (!ref($gene) || !$gene->isa('Bio::EnsEMBL::Gene')) { + throw("Bio::EnsEMBL::Gene argument expected."); + } + + if ( !$gene->is_stored($self->db()) ) { + warning("Cannot remove gene " . $gene->dbID() . ". Is not stored in " . + "this database."); + return; + } + + # remove all object xrefs associated with this gene + + my $dbe_adaptor = $self->db()->get_DBEntryAdaptor(); + foreach my $dbe (@{$gene->get_all_DBEntries()}) { + $dbe_adaptor->remove_from_object($dbe, $gene, 'Gene'); + } + + # remove all alternative allele entries associated with this gene + my $sth = $self->prepare("DELETE FROM alt_allele WHERE gene_id = ?"); + $sth->bind_param( 1, $gene->dbID, SQL_INTEGER ); + $sth->execute(); + $sth->finish(); + + # remove the attributes associated with this transcript + my $attrib_adaptor = $self->db->get_AttributeAdaptor; + $attrib_adaptor->remove_from_Gene($gene); + + # remove all of the transcripts associated with this gene + my $transcriptAdaptor = $self->db->get_TranscriptAdaptor(); + foreach my $trans ( @{$gene->get_all_Transcripts()} ) { + $transcriptAdaptor->remove($trans); + } + + # remove any unconventional transcript associations involving this gene + + $sth = + $self->prepare( "DELETE FROM unconventional_transcript_association " + . "WHERE gene_id = ? " ); + $sth->bind_param( 1, $gene->dbID, SQL_INTEGER ); + $sth->execute(); + $sth->finish(); + + # remove this gene from the database + + $sth = $self->prepare("DELETE FROM gene WHERE gene_id = ? "); + $sth->bind_param( 1, $gene->dbID, SQL_INTEGER ); + $sth->execute(); + $sth->finish(); + + # unset the gene identifier and adaptor thereby flagging it as unstored + + $gene->dbID(undef); + $gene->adaptor(undef); + + return; +} + + +=head2 get_Interpro_by_geneid + + Arg [1] : String $gene_stable_id + The stable ID of the gene to obtain + Example : @i = @{ + $gene_adaptor->get_Interpro_by_geneid( + $gene->stable_id() ) }; + Description: Gets interpro accession numbers by gene stable id. A hack really + - we should have a much more structured system than this. + Returntype : listref of strings (Interpro_acc:description) + Exceptions : none + Caller : domainview + Status : Stable + +=cut + +sub get_Interpro_by_geneid { + my ($self, $gene_stable_id) = @_; + + my $sql = qq( + SELECT i.interpro_ac, + x.description + FROM transcript t, + translation tl, + protein_feature pf, + interpro i, + xref x, + gene g + WHERE g.stable_id = ? + AND t.gene_id = g.gene_id + AND t.is_current = 1 + AND tl.transcript_id = t.transcript_id + AND tl.translation_id = pf.translation_id + AND i.id = pf.hit_name + AND i.interpro_ac = x.dbprimary_acc); + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $gene_stable_id, SQL_VARCHAR ); + + $sth->execute; + + my @out; + my %h; + while( (my $arr = $sth->fetchrow_arrayref()) ) { + if( $h{$arr->[0]} ) { next; } + $h{$arr->[0]}=1; + my $string = $arr->[0] .":".$arr->[1]; + push(@out,$string); + } + + return \@out; +} + + +=head2 update + + Arg [1] : Bio::EnsEMBL::Gene $gene + The gene to update + Example : $gene_adaptor->update($gene); + Description: Updates the type, analysis, display_xref, status, is_current and + description of a gene in the database. + Returntype : None + Exceptions : thrown if the $gene is not a Bio::EnsEMBL::Gene + Caller : general + Status : Stable + +=cut + +sub update { + my ($self, $gene) = @_; + my $update = 0; + + if ( !defined $gene || !ref $gene || !$gene->isa('Bio::EnsEMBL::Gene') ) { + throw("Must update a gene object, not a $gene"); + } + + my $update_gene_sql = qq( + UPDATE gene + SET biotype = ?, + analysis_id = ?, + display_xref_id = ?, + status = ?, + description = ?, + is_current = ?, + canonical_transcript_id = ?, + canonical_annotation = ? + WHERE gene_id = ? + ); + + my $display_xref = $gene->display_xref(); + my $display_xref_id; + + if ( $display_xref && $display_xref->dbID() ) { + $display_xref_id = $display_xref->dbID(); + } else { + $display_xref_id = undef; + } + + my $sth = $self->prepare( $update_gene_sql ); + + $sth->bind_param( 1, $gene->biotype(), SQL_VARCHAR ); + $sth->bind_param( 2, $gene->analysis->dbID(), SQL_INTEGER ); + $sth->bind_param( 3, $display_xref_id, SQL_INTEGER ); + $sth->bind_param( 4, $gene->status(), SQL_VARCHAR ); + $sth->bind_param( 5, $gene->description(), SQL_VARCHAR ); + $sth->bind_param( 6, $gene->is_current(), SQL_TINYINT ); + + if ( defined( $gene->canonical_transcript() ) ) { + $sth->bind_param( 7, $gene->canonical_transcript()->dbID(), + SQL_INTEGER ); + } else { + $sth->bind_param( 7, 0, SQL_INTEGER ); + } + + $sth->bind_param( 8, $gene->canonical_annotation(), SQL_VARCHAR ); + $sth->bind_param( 9, $gene->dbID(), SQL_INTEGER ); + + $sth->execute(); + + # maybe should update stable id ??? +} + + +# _objs_from_sth + +# Arg [1] : StatementHandle $sth +# Arg [2] : Bio::EnsEMBL::AssemblyMapper $mapper +# Arg [3] : Bio::EnsEMBL::Slice $dest_slice +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of Genes +# Returntype : listref of Bio::EnsEMBL::Genes in target coordinate system +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db()->get_AnalysisAdaptor(); + my $dbEntryAdaptor = $self->db()->get_DBEntryAdaptor(); + + my @genes; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ( + $gene_id, $seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $analysis_id, + $biotype, $display_xref_id, + $gene_description, $status, + $source, $is_current, + $canonical_transcript_id, $canonical_annotation, + $stable_id, $version, + $created_date, $modified_date, + $xref_display_id, $xref_primary_acc, + $xref_desc, $xref_version, + $external_db, $external_status, + $external_release, $external_db_name, + $info_type, $info_text + ); + + $sth->bind_columns( + \( + $gene_id, $seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $analysis_id, + $biotype, $display_xref_id, + $gene_description, $status, + $source, $is_current, + $canonical_transcript_id, $canonical_annotation, + $stable_id, $version, + $created_date, $modified_date, + $xref_display_id, $xref_primary_acc, + $xref_desc, $xref_version, + $external_db, $external_status, + $external_release, $external_db_name, + $info_type, $info_text + ) ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + FEATURE: while($sth->fetch()) { + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to +# if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords. + # + if ( defined($dest_slice) ) { + if ( $dest_slice_strand == 1 ) { + # Positive strand. + + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + + if ( $dest_slice->is_circular() ) { + # Handle cicular chromosomes. + + if ( $seq_region_start > $seq_region_end ) { + # Looking at a feature overlapping the chromsome origin. + + if ( $seq_region_end > $dest_slice_start ) { + # Looking at the region in the beginning of the + # chromosome. + $seq_region_start -= $dest_slice->seq_region_length(); + } + + if ( $seq_region_end < 0 ) { + $seq_region_end += $dest_slice->seq_region_length(); + } + + } else { + + if ( $dest_slice_start > $dest_slice_end + && $seq_region_end < 0 ) + { + # Looking at the region overlapping the chromosome + # origin and a feature which is at the beginning of the + # chromosome. + $seq_region_start += $dest_slice->seq_region_length(); + $seq_region_end += $dest_slice->seq_region_length(); + } + } + + } ## end if ( $dest_slice->is_circular...) + + } else { + # Negative strand. + + if ( $dest_slice->is_circular() + && $seq_region_start > $seq_region_end ) + { + # Handle cicular chromosomes. + + if ( $seq_region_end > $dest_slice_start ) { + # Looking at the region in the beginning of the + # chromosome. + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = + $seq_region_end - + $dest_slice->seq_region_length - + $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = + $dest_slice_end - + $seq_region_end - + $dest_slice->seq_region_length + 1; + $seq_region_end = + $dest_slice_end - $tmp_seq_region_start + 1; + } + + } else { + # Non-circular chromosome. + + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + } + + $seq_region_strand = -$seq_region_strand; + + } ## end else [ if ( $dest_slice_strand...)] + + # Throw away features off the end of the requested slice or on + # different seq_region. + + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_id ne $seq_region_id ) ) + { + next FEATURE; + } + + $slice = $dest_slice; + } ## end if ( defined($dest_slice...)) + + my $display_xref; + + if ($display_xref_id) { + $display_xref = Bio::EnsEMBL::DBEntry->new_fast( { + 'dbID' => $display_xref_id, + 'adaptor' => $dbEntryAdaptor, + 'display_id' => $xref_display_id, + 'primary_id' => $xref_primary_acc, + 'version' => $xref_version, + 'description' => $xref_desc, + 'release' => $external_release, + 'dbname' => $external_db, + 'db_display_name' => $external_db_name, + 'info_type' => $info_type, + 'info_text' => $info_text + } ); + $display_xref->status($external_status); + } + + # Finally, create the new Gene. + push( + @genes, + $self->_create_feature_fast( + 'Bio::EnsEMBL::Gene', + { + 'analysis' => $analysis, + 'biotype' => $biotype, + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $gene_id, + 'stable_id' => $stable_id, + 'version' => $version, + 'created_date' => $created_date || undef, + 'modified_date' => $modified_date || undef, + 'description' => $gene_description, + 'external_name' => undef, # will use display_id + # from display_xref + 'external_db' => $external_db, + 'external_status' => $external_status, + 'display_xref' => $display_xref, + 'status' => $status, + 'source' => $source, + 'is_current' => $is_current, + 'canonical_transcript_id' => $canonical_transcript_id, + 'canonical_annotation' => $canonical_annotation + } ) ); + + } + + return \@genes; +} + + +=head2 cache_gene_seq_mappings + + Example : $gene_adaptor->cache_gene_seq_mappings(); + Description: caches all the assembly mappings needed for genes + Returntype : None + Exceptions : None + Caller : general + Status : At Risk + : New experimental code + +=cut + +sub cache_gene_seq_mappings { + my ($self) = @_; + + # get the sequence level to map too + + my $sql = + 'SELECT name ' + . 'FROM coord_system ' + . 'WHERE attrib like "%%sequence_level%%"' + . 'AND species_id = ?'; + + my $sth = $self->prepare($sql); + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my $sequence_level = $sth->fetchrow_array(); + + $sth->finish(); + + my $csa = $self->db->get_CoordSystemAdaptor(); + my $ama = $self->db->get_AssemblyMapperAdaptor(); + + my $cs1 = $csa->fetch_by_name($sequence_level); + + # get level to map to two + + my $mcc = $self->db->get_MetaCoordContainerAdaptor(); + my $csnew = $mcc->fetch_all_CoordSystems_by_feature_type('gene'); + + foreach my $cs2 (@$csnew) { + my $am = $ama->fetch_by_CoordSystems( $cs1, $cs2 ); + $am->register_all(); + } + +} ## end sub cache_gene_seq_mappings + + +=head2 fetch_all_by_exon_supporting_evidence + + Arg [1] : String $hit_name + Name of supporting feature + Arg [2] : String $feature_type + one of "dna_align_feature" or "protein_align_feature" + Arg [3] : (optional) Bio::Ensembl::Analysis + Example : $genes = $gene_adaptor->fetch_all_by_exon_supporting_evidence( + 'XYZ', 'dna_align_feature'); + Description: Gets all the genes with transcripts with exons which have a + specified hit on a particular type of feature. Optionally filter + by analysis. + Returntype : Listref of Bio::EnsEMBL::Gene + Exceptions : If feature_type is not of correct type. + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_exon_supporting_evidence { + my ($self, $hit_name, $feature_type, $analysis) = @_; + + if ($feature_type !~ /(dna)|(protein)_align_feature/) { + throw("feature type must be dna_align_feature or protein_align_feature"); + } + + my $anal_from = ", analysis a " if ($analysis); + my $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " if ($analysis); + + my $sql = qq( + SELECT DISTINCT(g.gene_id) + FROM gene g, + transcript t, + exon_transcript et, + supporting_feature sf, + $feature_type f + $anal_from + WHERE g.gene_id = t.gene_id + AND g.is_current = 1 + AND t.transcript_id = et.transcript_id + AND et.exon_id = sf.exon_id + AND sf.feature_id = f.${feature_type}_id + AND sf.feature_type = ? + AND f.hit_name=? + $anal_where + ); + + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $feature_type, SQL_VARCHAR); + $sth->bind_param(2, $hit_name, SQL_VARCHAR); + $sth->bind_param(3, $analysis->dbID(), SQL_INTEGER) if ($analysis); + + $sth->execute(); + + my @genes; + + while ( my $id = $sth->fetchrow_array ) { + my $gene = $self->fetch_by_dbID($id); + push(@genes, $gene) if $gene; + } + + return \@genes; +} + + +=head2 fetch_all_by_transcript_supporting_evidence + + Arg [1] : String $hit_name + Name of supporting feature + Arg [2] : String $feature_type + one of "dna_align_feature" or "protein_align_feature" + Arg [3] : (optional) Bio::Ensembl::Analysis + Example : $genes = $gene_adaptor->fetch_all_by_transcript_supporting_evidence('XYZ', 'dna_align_feature'); + Description: Gets all the genes with transcripts with evidence for a + specified hit on a particular type of feature. Optionally filter + by analysis. + Returntype : Listref of Bio::EnsEMBL::Gene. + Exceptions : If feature_type is not of correct type. + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_transcript_supporting_evidence { + my ($self, $hit_name, $feature_type, $analysis) = @_; + + if($feature_type !~ /(dna)|(protein)_align_feature/) { + throw("feature type must be dna_align_feature or protein_align_feature"); + } + + my $anal_from = ", analysis a " if ($analysis); + my $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " if ($analysis); + + my $sql = qq( + SELECT DISTINCT(g.gene_id) + FROM gene g, + transcript t, + transcript_supporting_feature sf, + $feature_type f + $anal_from + WHERE g.gene_id = t.gene_id + AND g.is_current = 1 + AND t.transcript_id = sf.transcript_id + AND sf.feature_id = f.${feature_type}_id + AND sf.feature_type = ? + AND f.hit_name=? + $anal_where + ); + + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $feature_type, SQL_VARCHAR); + $sth->bind_param(2, $hit_name, SQL_VARCHAR); + $sth->bind_param(3, $analysis->dbID(), SQL_INTEGER) if ($analysis); + + $sth->execute(); + + my @genes; + + while( my $id = $sth->fetchrow_array ) { + my $gene = $self->fetch_by_dbID($id); + push(@genes, $gene) if $gene; + } + + return \@genes; +} + +=head2 fetch_nearest_Gene_by_Feature + + Arg [1] : Feature object + Example : $genes = $gene_adaptor->fetch_nearest_Gene_by_Feature($feat); + Description: Gets the nearest gene to the feature + Returntype : Listref of Bio::EnsEMBL::Gene, EMPTY list if no nearest + Caller : general + Status : UnStable + +=cut + +sub fetch_nearest_Gene_by_Feature{ + my $self = shift; + my $feat = shift; + + my $stranded = shift; + my $stream = shift; # 1 up stream -1 downstream + my @genes; + + + my $strand = $feat->strand; + if(defined($stream) and !$strand){ + warn("stream specified but feature has no strand so +ve strand will be used"); + $strand = 1; + } + my $min_dist = 999; + my $gene_id = 0; + + my $overlapping = $feat->get_overlapping_Genes(); + + return $overlapping if(defined(@{$overlapping}[0])); + + my $seq_region_id = $feat->slice->adaptor->get_seq_region_id($feat->slice); + my $start = ($feat->start + $feat->slice->start) -1; + my $end = ($feat->end + $feat->slice->start) -1; + + + my @gene_ids; + if(!defined($stream) or $stream == 0){ + + my $sql1 = "select g.gene_id, (? - g.seq_region_end) as 'dist' from gene g where "; + if($stranded){ + $sql1 .= "g.seq_region_strand = ".$strand." and "; + } + $sql1 .= "seq_region_id = ? and g.seq_region_end < ? order by dist limit 10"; + + # + # MAYBE set the result of prepare to be static in case lots of calls. + # + my $sql1_sth = $self->prepare($sql1) || die "Could not prepare $sql1"; + $sql1_sth->execute($start, $seq_region_id, $start) || die "Could not execute sql"; + $sql1_sth->bind_columns(\$gene_id, \$min_dist) || die "Could mot bin columns"; + + my $last_dist = 99999999999999999; + while($sql1_sth->fetch()){ + if($min_dist <= $last_dist){ + push @gene_ids, $gene_id; + $last_dist = $min_dist; + } + } + $sql1_sth->finish(); + + + + my $sql2 = "select g.gene_id, (g.seq_region_start - ?) as 'dist' from gene g where "; + if($stranded){ + $sql2 .= "g.seq_region_strand = ".$feat->strand." and "; + } + $sql2 .= "seq_region_id = ? and g.seq_region_start > ? order by dist limit 10"; + + my $sql2_sth = $self->prepare($sql2) || die "could not prepare $sql2"; + + my ($tmp_min_dist, $tmp_gene_id); + $sql2_sth->execute($end, $seq_region_id, $end) || die "Could not execute sql"; + $sql2_sth->bind_columns(\$tmp_gene_id, \$tmp_min_dist) || die "Could mot bin columns"; + my $first =1; + while($sql2_sth->fetch()){ + if( $tmp_min_dist <= $last_dist){ + if($first){ + $first = 0; + if($tmp_min_dist < $last_dist){ + @gene_ids = (); #reset + } + } + push @gene_ids, $tmp_gene_id; + $last_dist = $tmp_min_dist; + } + } + $sql2_sth->finish(); + + + } + elsif(($stream*$strand) == 1){ + my $sql1 = "select g.gene_id, (? - g.seq_region_end) as 'dist' from gene g where "; + if($stranded){ + $sql1 .= "g.seq_region_strand = ".$strand." and "; + } + $sql1 .= "seq_region_id = ? and g.seq_region_end < ? order by dist limit 10"; + + # + # MAYBE set the result of prepare to be static in case lots of calls. + # + my $sql1_sth = $self->prepare($sql1) || die "Could not prepare $sql1"; + $sql1_sth->execute($start, $seq_region_id, $start) || die "Could not execute sql"; + $sql1_sth->bind_columns(\$gene_id, \$min_dist) || die "Could mot bin columns"; + + my $last_dist; + my $first = 1; + while($sql1_sth->fetch()){ + if($first){ + $first = 0; + } + else{ + next if ($min_dist > $last_dist); + } + push @gene_ids, $gene_id; + $last_dist = $min_dist; + } + $sql1_sth->finish(); + } + elsif(($stream * $strand) == -1){ + + my $sql2 = "select g.gene_id, (g.seq_region_start - ?) as 'dist' from gene g where "; + if($stranded){ + $sql2 .= "g.seq_region_strand = ".$feat->strand." and "; + } + $sql2 .= "seq_region_id = ? and g.seq_region_start > ? order by dist limit 10"; + + my $sql2_sth = $self->prepare($sql2) || die "could not prepare $sql2"; + + my ($tmp_min_dist, $tmp_gene_id); + $sql2_sth->execute($end, $seq_region_id, $end) || die "Could not execute sql"; + $sql2_sth->bind_columns(\$tmp_gene_id, \$tmp_min_dist) || die "Could mot bin columns"; + my $first =1; + my $last_dist; + while($sql2_sth->fetch()){ + if($first){ + $first = 0; + } + else{ + next if ($tmp_min_dist > $last_dist); + } + push @gene_ids, $tmp_gene_id; + $last_dist = $tmp_min_dist; + } + $sql2_sth->finish(); + } + else{ + die "Invalid stream or strand must be -1, 0 or 1\n"; + } + + + + foreach my $gene_id (@gene_ids){ + push @genes, $self->fetch_by_dbID($gene_id); + } + return \@genes; + +} + +########################## +# # +# DEPRECATED METHODS # +# # +########################## + + +=head2 fetch_by_maximum_DBLink + + DEPRECATED - use fetch_all_by_external_name instead + +=cut + +sub fetch_by_maximum_DBLink { + my ($self, $external_id) = @_; + + deprecate( "use fetch_all_by_external_name instead" ); + + my $genes=$self->fetch_all_by_external_name($external_id); + + my $biggest; + my $max = 0; + my $size = scalar(@$genes); + if ($size > 0) { + foreach my $gene (@$genes) { + my $size = scalar(@{$gene->get_all_Exons}); + if ($size > $max) { + $biggest = $gene; + $max = $size; + } + } + return $biggest; + } + return; +} + + +=head2 get_display_xref + + DEPRECATED use $gene->display_xref + +=cut + +sub get_display_xref { + my ($self, $gene) = @_; + + deprecate( "display xref should retrieved from Gene object directly" ); + + if ( !defined $gene ) { + throw("Must call with a Gene object"); + } + + my $sth = $self->prepare(qq( + SELECT e.db_name, + x.display_label, + x.xref_id + FROM gene g, + xref x, + external_db e + WHERE g.gene_id = ? + AND g.display_xref_id = x.xref_id + AND x.external_db_id = e.external_db_id + )); + + $sth->bind_param(1, $gene->dbID, SQL_INTEGER); + $sth->execute(); + + my ($db_name, $display_label, $xref_id) = $sth->fetchrow_array(); + if ( !defined $xref_id ) { + return undef; + } + + my $db_entry = Bio::EnsEMBL::DBEntry->new( + -dbid => $xref_id, + -adaptor => $self->db->get_DBEntryAdaptor(), + -dbname => $db_name, + -display_id => $display_label + ); + + return $db_entry; +} + + +=head2 get_description + + DEPRECATED, use gene->get_description + +=cut + +sub get_description { + my ($self, $dbID) = @_; + + deprecate( "Gene description should be loaded on gene retrieval. Use gene->get_description()" ); + + if ( !defined $dbID ) { + throw("must call with dbID"); + } + + my $sth = $self->prepare("SELECT description + FROM gene_description + WHERE gene_id = ?"); + + $sth->bind_param(1, $dbID, SQL_INTEGER); + $sth->execute(); + + my @array = $sth->fetchrow_array(); + return $array[0]; +} + + +=head2 fetch_by_Peptide_id + + DEPRECATED, use fetch_by_translation_stable_id() + +=cut + +sub fetch_by_Peptide_id { + my ( $self, $translation_stable_id) = @_; + + deprecate( "Please use better named fetch_by_translation_stable_id \n". + caller(2) ); + + $self->fetch_by_translation_stable_id($translation_stable_id); +} + + +=head2 get_stable_entry_info + + DEPRECATED use $gene->stable_id instead + +=cut + +sub get_stable_entry_info { + my ($self,$gene) = @_; + + deprecated("stable id info is loaded on default, no lazy loading necessary"); + + if ( !defined $gene || !ref $gene || !$gene->isa('Bio::EnsEMBL::Gene') ) { + throw("Needs a gene object, not a $gene"); + } + + my $created_date = $self->db->dbc->from_date_to_seconds("created_date"); + my $modified_date = $self->db->dbc->from_date_to_seconds("modified_date"); + + my $sth = + $self->prepare( "SELECT stable_id, " + . $created_date . "," + . $modified_date + . ", version FROM gene WHERE gene_id = ?" ); + + $sth->bind_param(1, $gene->dbID, SQL_INTEGER); + $sth->execute(); + + my @array = $sth->fetchrow_array(); + $gene->{'stable_id'} = $array[0]; + $gene->{'created'} = $array[1]; + $gene->{'modified'} = $array[2]; + $gene->{'version'} = $array[3]; + + return 1; +} + + +=head2 fetch_all_by_DBEntry + + DEPRECATED - Use fetch_all_by_external_name instead + +=cut + +sub fetch_all_by_DBEntry { + my $self = shift; + + deprecate('Use fetch_all_by_external_name instead.'); + + return $self->fetch_all_by_external_name(@_); +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/IntronSupportingEvidenceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/IntronSupportingEvidenceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,527 @@ +package Bio::EnsEMBL::DBSQL::IntronSupportingEvidenceAdaptor; + +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::DBSQL::IntronSupportingEvidenceAdaptor + +=head1 SYNOPSIS + + my $isea = $dba->get_IntronSupportingEvidenceAdaptor(); + my $ise = $isea->fetch_by_dbID(1); + my $ise_array = $dfa->fetch_all(); + +=head1 METHODS + +=cut + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor/; + +use Bio::EnsEMBL::Intron; +use Bio::EnsEMBL::IntronSupportingEvidence; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/; + +=head2 list_linked_transcript_ids + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence Evidence to search with + Example : my $transcript_ids = @{$isea->list_linked_transcript_ids($ise)}; + Description : Uses the given IntronSupportingEvidence to find all linked + transcript ids + Returntype : ArrayRef[Integer] of transcript_id + Exceptions : Thrown if arguments are not as stated and for DB errors + +=cut + +sub list_linked_transcript_ids { + my ($self, $sf) = @_; + assert_ref($sf, 'Bio::EnsEMBL::IntronSupportingEvidence', 'intron_supporting_evidence'); + my $query = <<'SQL'; +select transcript_id from transcript_intron_supporting_evidence +where intron_supporting_evidence_id =? +SQL + return $self->dbc()->sql_helper()->execute_simple(-SQL => $query, -PARAMS => [$sf->dbID()]); +} + +=head2 fetch_all_by_Transcript + + Arg[1] : Bio::EnsEMBL::Transcript Transcript to search with + Example : my $ises = $isea->fetch_all_by_Transcript($transcript); + Description : Uses the given Transcript to search for all instances of + IntronSupportingEvidence linked to the transcript in the + database + Returntype : ArrayRef of IntronSupportingEvidence objects + Exceptions : Thrown if arguments are not as stated and for DB errors + +=cut + +sub fetch_all_by_Transcript { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + my $query = <<'SQL'; +select intron_supporting_evidence_id from transcript_intron_supporting_evidence where transcript_id =? +SQL + my $ids = $self->dbc()->sql_helper()->execute_simple(-SQL => $query, -PARAMS => [$transcript->dbID()]); + return $self->fetch_all_by_dbID_list($ids); +} + +=head2 fetch_flanking_exon_ids + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence Evidence to search with + Arg[2] : Bio::EnsEMBL::Transcript Transcript to search with + Example : my ($prev_id, $next_id) = @{$isea->fetch_flanking_exon_ids($ise, $transcript)}; + Description : Uses the given IntronSupportingEvidence and Transcript to search + for the recorded previous and next exon database ids + Returntype : ArrayRef 1 row long but with 2 columns representing previous + and next IDs respectivly + Exceptions : Thrown if arguments are not as stated and for DB errors + +=cut + +sub fetch_flanking_exon_ids { + my ($self, $sf, $transcript) = @_; + assert_ref($sf, 'Bio::EnsEMBL::IntronSupportingEvidence', 'intron_supporting_evidence'); + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + my $query = <<'SQL'; +select previous_exon_id, next_exon_id +from transcript_intron_supporting_evidence +where transcript_id =? and intron_supporting_evidence_id =? +SQL + my $ids = $self->dbc()->sql_helper()->execute(-SQL => $query, -PARAMS => [$transcript->dbID(), $sf->dbID()]); + return unless @{$ids}; + return @{$ids->[0]}; +} + +sub _tables { + return ( [ 'intron_supporting_evidence', 'ise' ] ); +} + +sub _columns { + return qw/ + ise.intron_supporting_evidence_id + ise.analysis_id + ise.seq_region_id ise.seq_region_start ise.seq_region_end ise.seq_region_strand + ise.hit_name ise.score ise.score_type + ise.is_splice_canonical + /; +} + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @features; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($id, $analysis_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $hit_name, $score, $score_type, $splice_canonical); + + $sth->bind_columns(\$id, \$analysis_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, \$hit_name, + \$score, \$score_type, \$splice_canonical); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_seq_region_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_seq_region_id = $dest_slice->get_seq_region_id(); + } + + my $count = 0; + FEATURE: while($sth->fetch()) { + $count++; + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= $aa->fetch_by_dbID($analysis_id); + + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } else { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || ( $dest_slice_seq_region_id != $seq_region_id )) { + next FEATURE; + } + $slice = $dest_slice; + } + + push( @features, + $self->_create_feature_fast( + 'Bio::EnsEMBL::IntronSupportingEvidence', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'analysis' => $analysis, + 'adaptor' => $self, + 'dbID' => $id, + 'hit_name' => $hit_name, + 'score' => $score, + 'score_type' => $score_type, + 'is_splice_canonical' => $splice_canonical, + } ) ); + } + + return \@features; +} + +####### STORAGE + +=head2 store + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence Evidence to store + Example : $isea->store($ise); + Description : Stores the IntronSupportingEvidence in the database. Duplicates + are ignored. + Returntype : Integer The assigned database identifier + Exceptions : Thrown if the given object is not a IntronSupportingEvidence, + and for any DB exception. + +=cut + +sub store { + my ($self, $sf) = @_; + assert_ref($sf, 'Bio::EnsEMBL::IntronSupportingEvidence', 'intron_supporting_feature'); + + my $db = $self->db(); + + if($sf->is_stored($db)) { + return $sf->dbID(); + } + + my $analysis = $sf->analysis(); + my $analysis_id = $analysis->is_stored($db) ? $analysis->dbID() : $db->get_AnalysisAdaptor()->store($analysis); + + my $seq_region_id; + ($sf, $seq_region_id) = $self->_pre_store($sf); + + my $sql = <start(), SQL_INTEGER], + [$sf->end(), SQL_INTEGER], + [$sf->strand(), SQL_INTEGER], + [$sf->hit_name(), SQL_VARCHAR], + ]; + + my $params = [ + @{$query_params}, + [$sf->score(), SQL_FLOAT], + [$sf->score_type(), SQL_VARCHAR], + [$sf->is_splice_canonical(), SQL_INTEGER], + ]; + + $self->dbc()->sql_helper()->execute_update(-SQL => $sql, -PARAMS => $params, -CALLBACK => sub { + my ( $sth, $dbh ) = @_; + $sf->dbID($self->last_insert_id('intron_supporting_evidence_id', undef, 'intron_supporting_evidence')); + return; + }); + $sf->adaptor($self); + + if(!$sf->dbID()) { + my $query = <<'SQL'; +select intron_supporting_evidence_id +from intron_supporting_evidence +where analysis_id =? +and seq_region_id =? and seq_region_start =? and seq_region_end =? and seq_region_strand =? +and hit_name =? +SQL + my $id = $self->dbc()->sql_helper()->execute_single_result(-SQL => $query, -PARAMS => $query_params); + $sf->dbID($id); + } + + return $sf->dbID(); +} + +=head2 store_transcript_linkage + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence Evidence to link + Arg[2] : Bio::EnsEMBL::Transcript Transcript to link + Arg[3] : Integer an optional ID to give if the Transcript's own ID is possibly incorrect + Example : $isea->store_transcript_linkage($ise, $transcript); + $isea->store_transcript_linkage($ise, $transcript, $tid); + Description : Links a Transcript to a portion of Intron evidence + Returntype : None + Exceptions : Thrown if the given object is not a Transcript, if the + transcript is not stored, if the supporting evidence is not + stored and for any DB exception. + +=cut + +sub store_transcript_linkage { + my ($self, $sf, $transcript, $transcript_id) = @_; + assert_ref($sf, 'Bio::EnsEMBL::IntronSupportingEvidence', 'intron_supporting_evidence'); + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + throw "Cannot perform the link. The IntronSupportingEvidence must be persisted first" unless $sf->is_stored($self->db()); + + my $sql = <get_Intron($transcript); + my ($previous_exon, $next_exon) = ($intron->prev_Exon(), $intron->next_Exon()); + $transcript_id ||= $transcript->dbID(); + + my $params = [ + [$transcript_id, SQL_INTEGER], + [$sf->dbID(), SQL_INTEGER], + [$previous_exon->dbID(), SQL_INTEGER], + [$next_exon->dbID(), SQL_INTEGER], + ]; + $self->dbc()->sql_helper()->execute_update(-SQL => $sql, -PARAMS => $params); + + return; +} + +####### UPDATE + +=head2 update + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence Evidence to update + Example : $isea->update($ise); + Description : Updates all attributes of an evidence object + Returntype : None + Exceptions : Thrown if the given object is not a IntronSupportingEvidence, + if the object is not stored and for normal DB errors + +=cut + +sub update { + my ($self, $sf) = @_; + assert_ref($sf, 'Bio::EnsEMBL::IntronSupportingEvidence', 'intron_supporting_evidence'); + if (! $sf->is_stored($self->db())) { + throw "Cannot update the supporting evidence if it has not already been stored in this database"; + } + + my $params = [ + [$sf->analysis()->dbID(), SQL_INTEGER], + [$sf->slice()->get_seq_region_id(), SQL_INTEGER], + [$sf->start(), SQL_INTEGER], + [$sf->end(), SQL_INTEGER], + [$sf->strand(), SQL_INTEGER], + [$sf->hit_name(), SQL_VARCHAR], + [$sf->score(), SQL_FLOAT], + [$sf->score_type(), SQL_VARCHAR], + [$sf->is_splice_canonical(), SQL_INTEGER], + [$sf->dbID(), SQL_INTEGER], + ]; + + my $sql = <<'SQL'; +UPDATE intron_supporting_evidence +SET analysis_id =?, seq_region_id =?, seq_region_start =?, +seq_region_end =?, seq_region_strand =?, hit_name =?, score =?, score_type =?, +is_splice_canonical =? +WHERE intron_supporting_evidence_id =? +SQL + + $self->dbc()->sql_helper()->execute_update(-SQL => $sql, -PARAMS => $params); + return; +} + +####### DELETION + +=head2 remove + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence + Example : $isea->remove($ise); + Description : Deletes the given IntronSupportingEvidence from the database. + This can only occur if the object has no linked transcripts + Returntype : None + Exceptions : Thrown if the IntronSupportingEvidence is not stored, if + the object has linked transcripts and in the event of any + database error + +=cut + +sub remove { + my ($self, $sf) = @_; + assert_ref($sf, 'Bio::EnsEMBL::IntronSupportingEvidence', 'intron_supporting_evidence'); + if (! $sf->is_stored($self->db())) { + throw "Cannot delete the supporting evidence if it has not already been stored in this database"; + } + if($sf->has_linked_transcripts()) { + throw sprintf('Cannot delete supporting evidence %d. It still has transcripts attached', $sf->dbID()); + } + $self->dbc()->sql_helper()->execute_update( + -SQL => 'DELETE from intron_supporting_evidence where intron_supporting_evidence_id =?', + -PARAMS => [[$sf->dbID(), SQL_INTEGER]], + ); + return; +} + +=head2 remove_all_transcript_linkages + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence + Example : $isea->remove_all_transcript_linkages($ise); + Description : Deletes the transcript links to the given IntronSupportingEvidence + Returntype : None + Exceptions : See remove_transcript_linkage + +=cut + +sub remove_all_transcript_linkages { + my ($self, $sf) = @_; + foreach my $transcript_id (@{$self->list_linked_transcript_ids($sf)}) { + $self->_remove_transcript_linkage($sf, $transcript_id); + } + return; +} + +=head2 remove_transcript_linkage + + Arg[1] : Bio::EnsEMBL::IntronSupportingEvidence Evidence to unlink + Arg[2] : Bio::EnsEMBL::Transcript Transcript to unlink + Example : $isea->remove_transcript_linkages($ise, $transcript); + Description : Deletes a transcript's link to the given IntronSupportingEvidence + Returntype : None + Exceptions : Thrown if the given object is not a Transcript, if the + transcript is not stored, if the supporting evidence is not + stored and for any DB exception. + +=cut + +sub remove_transcript_linkage { + my ($self, $sf, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + if (! $transcript->is_stored($self->db())) { + throw "Cannot delete the supporting evidence to transcript linkage if the transcript has not already been stored in this database"; + } + $self->_remove_transcript_linkage($sf, $transcript->dbID()); + return; +} + +sub _remove_transcript_linkage { + my ($self, $sf, $transcript_id) = @_; + if (! $sf->is_stored($self->db())) { + throw "Cannot delete the supporting evidence to transcript linkage if the evidence has not already been stored in this database"; + } + $self->dbc()->sql_helper()->execute_update( + -SQL => 'DELETE from transcript_intron_supporting_evidence where intron_supporting_evidence_id =? and transcript_id =?', + -PARAMS => [[$sf->dbID(), SQL_INTEGER], [$transcript_id, SQL_INTEGER]], + ); + return; +} + +1; \ No newline at end of file diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/KaryotypeBandAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/KaryotypeBandAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,234 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::KaryotypeBandAdaptor + +=head1 SYNOPSIS + + $kary_adaptor = $db_adaptor->get_KaryotypeBandAdaptor(); + + foreach $band ( @{ $kary_adaptor->fetch_all_by_Slice($slice) } ) { + # do something with band + } + + $band = $kary_adaptor->fetch_by_dbID($id); + + my @bands = @{ $kary_adaptor->fetch_all_by_chr_name('X') }; + + my $band = $kary_adaptor->fetch_by_chr_band( '4', 'q23' ); + +=head1 DESCRIPTION + +Database adaptor to provide access to KaryotypeBand objects + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::KaryotypeBandAdaptor; + +use strict; + +use vars qw(@ISA); + +use Bio::EnsEMBL::KaryotypeBand; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + +#_tables +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the tables to query +# Returntype : string +# Exceptions : none +# Caller : internal + + +sub _tables { + my $self = shift; + + return (['karyotype','k']) +} + + +#_columns + +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the columns to query +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + #warning _objs_from_sth implementation depends on ordering + return qw ( + k.karyotype_id + k.seq_region_id + k.seq_region_start + k.seq_region_end + k.band + k.stain ); +} + + +sub _objs_from_sth { + my ($self, $sth) = @_; + my $db = $self->db(); + my $slice_adaptor = $db->get_SliceAdaptor(); + + my @features; + my %slice_cache; + + my($karyotype_id,$seq_region_id,$seq_region_start,$seq_region_end, + $band,$stain); + + $sth->bind_columns(\$karyotype_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$band, \$stain); + + while ( $sth->fetch() ) { + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + + my $slice = $slice_cache{$seq_region_id} ||= + $slice_adaptor->fetch_by_seq_region_id($seq_region_id); + + push( @features, + $self->_create_feature( 'Bio::EnsEMBL::KaryotypeBand', { + -START => $seq_region_start, + -END => $seq_region_end, + -SLICE => $slice, + -ADAPTOR => $self, + -DBID => $karyotype_id, + -NAME => $band, + -STAIN => $stain + } ) ); + + } + + return \@features; +} + + + +=head2 fetch_all_by_chr_name + + Arg [1] : string $chr_name + Name of the chromosome from which to retrieve band objects + Example : @bands=@{$karyotype_band_adaptor->fetch_all_by_chr_name('X')}; + Description: Fetches all the karyotype band objects from the database for the + given chromosome. + Returntype : listref of Bio::EnsEMBL::KaryotypeBand in chromosomal + (assembly) coordinates + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_chr_name { + my ($self,$chr_name) = @_; + + throw('Chromosome name argument expected') if(!$chr_name); + + my $slice = + $self->db->get_SliceAdaptor->fetch_by_region(undef, $chr_name); + unless ($slice){ + warning("Cannot retrieve chromosome $chr_name"); + return; + } + return $self->fetch_all_by_Slice($slice); +} + + + +sub fetch_all_by_chr_band { + my ($self, $chr_name, $band) = @_; + + throw('Chromosome name argument expected') if(!$chr_name); + throw('Band argument expected') if(!$band); + + my $slice = $self->db->get_SliceAdaptor->fetch_by_region(undef, + $chr_name); + + my $constraint = "k.band like '$band%'"; + return $self->fetch_all_by_Slice_constraint($slice,$constraint); +} + + +=head2 fetch_by_chr_band + + Arg [1] : string $chr_name + Name of the chromosome from which to retrieve the band + Arg [2] : string $band + The name of the band to retrieve from the specified chromosome + Example : @bands = @{$kary_adaptor->fetch_all_by_chr_band('4', 'q23')}; + Description: Fetches the karyotype band object from the database + for the given chromosome and band name. If no such band + exists, undef is returned instead. This function uses fuzzy + matching of the band name. For example the bands 'q23.1' and + 'q23.4' could be matched by fetch_all_by_chr_band('20', 'q23'); + Returntype : Bio::EnsEMBL::KaryotypeBand in chromosomal coordinates. + Exceptions : throws if chr or band is missing in arguments + Caller : general + Status : Stable + +=cut + +sub fetch_by_chr_band { + my $self = shift; + deprecate('Use fetch_all_by_chr_band instead.'); + + my ($band) = @{$self->fetch_all_by_chr_band(@_)}; + return $band; +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @kary_ids = @{$karyotype_band_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all karyotype bands in the + current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : reference to a list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = shift; + + return $self->_list_dbIDs("karyotype",undef, $ordered); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/MergedAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/MergedAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,188 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::MergedAdaptor + +=head1 SYNOPSIS + + $merged_adaptor = new Bio::EnsEMBL::DBSQL::MergedAdaptor( + -species => "human", + -type => "Population" + ); + +=head1 DESCRIPTION + +The MergedAdaptor object is merely a list of adaptors. AUTOLOAD is used +to call a subroutine on each adaptor and merge the results. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::MergedAdaptor; + + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Registry; +my $reg = "Bio::EnsEMBL::Registry"; + + +=head2 new + + Arg [SPECIES]: (optional) string + species name to get adaptors for + Arg [TYPE] : (optional) string + type to get adaptors for + Arg [GROUPS] : (optional) ref to list + Example : $MergedAdaptor = new + : Bio::EnsEMBL::DBSQL::MergedAdaptor(-species=> 'human', -type =>'Population', -groups => ['Sanger','Ensembl']); + + Description: Creates a new MergedAdaptor + Returntype : Bio::EnsEMBL::DBSQL::MergedAdaptor + Exceptions : throws if species or type not specified + Caller : general + Status : At Risk + : Under development + +=cut + +sub new { + my ($class,@args) = @_; + + my $self ={}; + bless $self,$class; + + my ($species, $type, $groups) = + rearrange([qw(SPECIES TYPE GROUPS)], @args); + + if(!defined($species)|| !defined($type)){ + die "Species and Type must be specified\n"; + } + + my @adaps; + if (!defined ($groups)){ + #get all adaptors for that species and type + @adaps = @{$reg->get_all_adaptors(-species => $species, -type => $type)}; + } + else{ + #get only specified adaptors for the particular groups + foreach my $group (@{$groups}){ + push @adaps, $reg->get_adaptor($species,$group,$type); + } + } + + my @list =(); + push(@list,@adaps); + $self->{'list'}= \@list; + + return $self; +} + +=head2 add_list + + Example : $MergedAdaptor->add_list(@adaptors); + Description: adds a list of adaptors to the Merged adaptor list. + Returntype : none + Exceptions : none + Status : At Risk + : Under development + +=cut + +sub add_list{ + my ($self, @arr) = @_; + + foreach my $adap (@arr){ + $self->add_adaptor($adap); + } +} + +=head2 add_adaptor + + Example : $MergedAdaptor->add_adaptor(@adaptors); + Description: adds an adaptor to the Merged adaptor list. + Returntype : none + Exceptions : none + Status : At Risk + : Under development + +=cut + +sub add_adaptor{ + my ($self,$adaptor)=@_; + + if(!defined ($self->{'list'})){ + my @list =(); + push(@list,$adaptor); + $self->{'list'}= \@list; + } + else{ + push(@{$self->{'list'}},$adaptor); + } +} + + +sub printit{ + my ($self)=@_; + + foreach my $adaptor (@{$self->{'list'}}){ + print "printit $adaptor\t".$adaptor->db->group()."\n"; + } +} + + +use vars '$AUTOLOAD'; + +sub AUTOLOAD { + my ($self,@args) = @_; + my @array_return=(); + my $ref_return = undef; + $AUTOLOAD =~ /^.*::(\w+)+$/ ; + + my $sub = $1; + + foreach my $adaptor (@{$self->{'list'}}) { + my $ref; + if($adaptor->can($sub)){ + $ref = $adaptor->$sub(@args); + if( ref($ref) eq 'ARRAY' ) { + push @array_return, @{$ref}; + } else { + push @array_return, $ref; + } + } + else{ # end of can + warn("In Merged Adaptor $adaptor cannot call sub $sub"); + } + } + return \@array_return; +} + +sub DESTROY{ +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/MetaContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/MetaContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,272 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::MetaContainer - Encapsulates all access to core +database meta information + +=head1 SYNOPSIS + + my $meta_container = + $registry->get_adaptor( 'Human', 'Core', 'MetaContainer' ); + + my @mapping_info = + @{ $meta_container->list_value_by_key('assembly.mapping') }; + + my $scientific_name = $meta_container->get_scientific_name(); + +=head1 DESCRIPTION + + An object that encapsulates specific access to core db meta data + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::MetaContainer; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw/deprecate/; +use Bio::Species; + + +use base qw/Bio::EnsEMBL::DBSQL::BaseMetaContainer/; + +# add well known meta info get-functions below + +=head2 get_production_name + + Args : none + Example : $species = $meta_container->get_production_name(); + Description : Obtains the name of the species in a form usable as, for + example, a table name, file name etc. + Returntype : string + Exceptions : none + Status : Stable + +=cut + +sub get_production_name { + my ($self) = @_; + return $self->single_value_by_key('species.production_name'); +} + +=head2 get_short_name + + Args : none + Example : $species = $meta_container->get_short_name(); + Description : Obtains the name of the species in a form usable as, for + example, a short label in a GUI. + Returntype : string + Exceptions : none + Status : Stable + +=cut + +sub get_short_name { + my ($self) = @_; + return $self->single_value_by_key('species.short_name'); +} + +=head2 get_common_name + + Args : none + Example : $species = $meta_container->get_common_name(); + Description : Obtains the common name of the species. + Returntype : string + Exceptions : none + Status : Stable + +=cut + +sub get_common_name { + my ($self) = @_; + return $self->single_value_by_key('species.common_name'); +} + +=head2 get_scientific_name + + Args : none + Example : $species = $meta_container->get_scientific_name(); + Description : Obtains the full scientific name of the species. + Returntype : string + Exceptions : none + Status : Stable + +=cut +sub get_scientific_name { + my ($self) = @_; + return $self->single_value_by_key('species.scientific_name'); +} + +=head2 get_division + + Args : none + Example : $div = $meta_container->get_division(); + Description : Obtains the Ensembl Genomes division to which the species belongs. + Returntype : string + Exceptions : none + Status : Stable + +=cut +sub get_division { + my ($self) = @_; + return $self->single_value_by_key('species.division'); +} + +=head2 get_Species + + Arg [1] : none + Example : $species = $meta_container->get_Species(); + Description: Obtains the species from this databases meta table. Call is + deprecated; please use other subroutines in this package + Returntype : Bio::Species + Exceptions : none + Caller : ? + Status : Deprecated + +=cut + +sub get_Species { + my ($self) = @_; + + deprecate('Call is deprecated. Use $self->get_common_name() / $self->get_classification() / $self->get_scientific_name() instead'); + + my $common_name = $self->get_common_name(); + my $classification = + $self->list_value_by_key('species.classification'); + if ( !@$classification ) { + return undef; + } + + #Re-create the old classification data structure by adding the scientific + #name back onto the classification but with species before genus e.g. + # sapiens -> Homo -> Hominade + my $scientific_name = $self->get_scientific_name(); + my ($genus, @sp) = split(/ /, $scientific_name); + unshift(@{$classification}, join(q{ }, @sp), $genus); + + my $species = Bio::Species->new(); + $species->common_name($common_name); + + $species->classification($classification, 1); #always force it + + return $species; +} + + +=head2 get_taxonomy_id + + Arg [1] : none + Example : $tax_id = $meta_container->get_taxonomy_id(); + Description: Retrieves the taxonomy id from the database meta table + Returntype : string + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub get_taxonomy_id { + my ($self) = @_; + return $self->single_value_by_key('species.taxonomy_id', 1); +} + + + +=head2 get_default_assembly + + Description: DEPRECATED. Use the version of the coordinate system you are + interested in instead. + + Example: #use this instead + my ($highest_cs) = @{$db->get_CoordSystemAdaptor->fetch_all()}; + my $assembly = $highest_cs->version(); + +=cut + +sub get_default_assembly { + my $self = shift; + + deprecate("Use version of coordinate system you are interested in instead.\n". + "Example:\n". + ' ($cs) = @{$coord_system_adaptor->fetch_all()};'."\n" . + ' $assembly = $cs->version();'); + + my ($cs) = @{$self->db->get_CoordSystemAdaptor->fetch_all()}; + + return $cs->version(); +} + + +# +# TBD This method should be removed/deprecated +# +sub get_max_assembly_contig { + my $self = shift; + deprecate('This method should either be fixed or removed'); + return $self->single_value_by_key('assembly.maxcontig'); +} + +=head2 get_genebuild + + Arg [1] : none + Example : $tax_id = $meta_container->get_genebuild(); + Description: Retrieves the genebuild from the database meta table + Returntype : string + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub get_genebuild { + my ($self) = @_; + return $self->single_value_by_key('genebuild.start_date', 1); +} + +=head2 get_genebuild + + Example : $classification = $meta_container->get_classification(); + Description: Retrieves the classification held in the backing database minus + any species specific levels. This means that the first element + in the array will be subfamily/family level ascending to + superkingdom + Returntype : ArrayRef[String] + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub get_classification { + my ($self) = @_; + my $classification = $self->list_value_by_key('species.classification'); + my $copy = [@{$classification}]; + splice(@{$copy}, 0, 1); # remove the Homo sapiens + return $copy; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/MetaCoordContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/MetaCoordContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,227 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::DBSQL::MetaCoordContainer; + +use strict; +use warnings; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception; + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + # + # Retrieve the list of the coordinate systems that features are stored + # in and cache them. + # + + my @coord_systems = + @{ $self->db()->dnadb()->get_CoordSystemAdaptor->fetch_all() }; + + my @cs_ids; + foreach my $cs (@coord_systems) { push( @cs_ids, $cs->dbID() ) } + + my $sth = $self->prepare( + 'SELECT mc.table_name, mc.coord_system_id, mc.max_length ' + . 'FROM meta_coord mc ' + . 'WHERE mc.coord_system_id in (' + . join( ',', @cs_ids ) + . ')' ); + + $sth->execute(); + + while ( my ( $table_name, $cs_id, $max_length ) = + $sth->fetchrow_array() ) + { + $table_name = lc($table_name); + + $self->{'_feature_cache'}->{$table_name} ||= []; + + push( @{ $self->{'_feature_cache'}->{$table_name} }, $cs_id ); + $self->{'_max_len_cache'}->{$cs_id}->{$table_name} = $max_length; + } + $sth->finish(); + + return $self; +} ## end sub new + + + + +=head2 fetch_all_CoordSystems_by_feature_type + + Arg [1] : string $table - the name of the table to retrieve coord systems + for. E.g. 'gene', 'exon', 'dna_align_feature' + Example : @css = @{$mcc->fetch_all_CoordSystems_by_feature_type('gene')}; + Description: This retrieves the list of coordinate systems that features + in a particular table are stored. It is used internally by + the API to perform queries to these tables and to ensure that + features are only stored in appropriate coordinate systems. + Returntype : listref of Bio::EnsEMBL::CoordSystem objects + Exceptions : throw if name argument not provided + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + +sub fetch_all_CoordSystems_by_feature_type { + my $self = shift; + my $table = lc(shift); #case insensitive matching + + throw('Name argument is required') unless $table; + + if(!$self->{'_feature_cache'}->{$table}) { + return []; + } + + my @cs_ids = @{$self->{'_feature_cache'}->{$table}}; + my @coord_systems; + + my $csa = $self->db->get_CoordSystemAdaptor(); + + foreach my $cs_id (@cs_ids) { + my $cs = $csa->fetch_by_dbID($cs_id); + + if(!$cs) { + throw("meta_coord table refers to non-existant coord_system $cs_id"); + } + + push @coord_systems, $cs; + } + + return \@coord_systems; +} + + + +=head2 fetch_max_length_by_CoordSystem_feature_type + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs + Arg [2] : string $table + Example : $max_len = $mcc->fetch_max_length_by_CoordSystem_feature_type($cs,'gene'); + Description: Returns the maximum length of features of a given type in + a given coordinate system. + Returntype : int or undef + Exceptions : throw on incorrect argument + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + + +sub fetch_max_length_by_CoordSystem_feature_type { + my $self = shift; + my $cs = shift; + my $table = shift; + + if(!ref($cs) || !$cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('Bio::EnsEMBL::CoordSystem argument expected'); + } + + throw("Table name argument is required") unless $table; + + return $self->{'_max_len_cache'}->{$cs->dbID()}->{lc($table)}; +} + + + +=head2 add_feature_type + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs + The coordinate system to associate with a feature table + Arg [2] : string $table - the name of the table in which features of + a given coordinate system will be stored in + Arg [3] : int $length + This length is used to update the max_length in the database + and the internal cache. + Example : $csa->add_feature_table($chr_coord_system, 'gene'); + Description: This function tells the coordinate system adaptor that + features from a specified table will be stored in a certain + coordinate system. If this information is not already stored + in the database it will be added. + Returntype : none + Exceptions : none + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + + +sub add_feature_type { + my $self = shift; + my $cs = shift; + my $table = lc(shift); + my $length = shift; + if(!ref($cs) || !$cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('CoordSystem argument is required.'); + } + + if(!$table) { + throw('Table argument is required.'); + } + + my $cs_ids = $self->{'_feature_cache'}->{$table} || []; + + my ($exists) = grep {$cs->dbID() == $_} @$cs_ids; + if( $exists ) { + if( !$self->{'_max_len_cache'}->{$cs->dbID()}->{$table} || + $self->{'_max_len_cache'}->{$cs->dbID()}->{$table} < $length ) { + my $sth = $self->prepare('UPDATE meta_coord ' . + "SET max_length = $length " . + 'WHERE coord_system_id = ? ' . + 'AND table_name = ? '. + "AND (max_length<$length ". + "OR max_length is null)"); + $sth->execute( $cs->dbID(), $table ); + $self->{'_max_len_cache'}->{$cs->dbID()}->{$table} = $length; + } + return; + } + + #store the new tablename -> coord system relationship in the db + #ignore failures b/c during the pipeline multiple processes may try + #to update this table and only the first will be successful + my $sth = $self->prepare('INSERT IGNORE INTO meta_coord ' . + 'SET coord_system_id = ?, ' . + 'table_name = ?, ' . + 'max_length = ? ' + ); + + $sth->execute($cs->dbID, $table, $length ); + + #update the internal cache + $self->{'_feature_cache'}->{$table} ||= []; + push @{$self->{'_feature_cache'}->{$table}}, $cs->dbID(); + $self->{'_max_len_cache'}->{$cs->dbID()}->{$table} = $length; + + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,648 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::MiscFeatureAdaptor + +=head1 SYNOPSIS + + $mfa = $database_adaptor->get_MiscFeatureAdaptor(); + + # retrieve a misc feature by its dbID + my $misc_feat = $mfa->fetch_by_dbID(1234); + + # retrieve all misc features in a given region + my @misc_feats = @{ $mfa->fetch_all_by_Slice($slice) }; + + # retrieve all misc features in a given region with a given set code + my @misc_clones = + @{ $mfa->fetch_all_by_Slice_and_set_code('cloneset') }; + + # store some misc features in the database + $mfa->store(@misc_features); + +=head1 DESCRIPTION + +This is an adaptor for the retrieval and storage of MiscFeatures. +Misc Features are extremely generic features that can be added with +minimal effort to the database. Currently misc features are used to +describe the locations of clone sets and tiling path information, +but arbitrary features can be stored. Misc features are grouped +into sets and can be fetched according to their grouping using the +fetch_all_by_Slice_and_set_code and fetch_all_by_set_code methods. +MiscFeatures may belong to more than one set. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::MiscFeatureAdaptor; + +use strict; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::MiscFeature; +use Bio::EnsEMBL::Attribute; +use Bio::EnsEMBL::MiscSet; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + + +=head2 fetch_all_by_Slice_and_set_code + + Arg [1] : Bio::EnsEMBL::Slice $slice + A slice representing the region to fetch from + Arg [2...] : string $set_code + The code of the set to retrieve features from + Example : @feats = @{$mfa->fetch_all_by_Slice_and_set_code('cloneset')}; + Description: Retrieves a set of MiscFeatures which have a particular set code + and which lie in a particular region. All features with the + provide set code and which overlap the given slice are returned. + Returntype : listref of Bio::EnsEMBL::MiscFeatures + Exceptions : throw if set_code is not provided + warning if no set for provided set code exists + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice_and_set_code { + my $self = shift; + my $slice = shift; + + throw('Set code argument is required.') unless @_; + + my $msa = $self->db->get_MiscSetAdaptor(); + my @sets = (); + my $max_len = 0; + foreach my $set_code (@_) { + my $set = $msa->fetch_by_code($set_code); + if($set) { + $max_len = $set->longest_feature if $set->longest_feature > $max_len; + push @sets, $set->dbID; + } else { + warning("No misc_set with code [$set_code] exists"); + } + } + my $constraint; + if( @sets > 1 ) { + $constraint = " mfms.misc_set_id in ( @{[join ',',@sets]} ) "; + } elsif( @sets == 1 ) { + $constraint = " mfms.misc_set_id = $sets[0] "; + } else { + return []; + } + + $self->_max_feature_length($max_len); + + my $results = $self->fetch_all_by_Slice_constraint($slice, $constraint); + + $self->_max_feature_length(undef); + + return $results; +} + + + +=head2 fetch_all_by_attribute_type_value + + Arg [1] : string $attrib_type_code + The code of the attribute type to fetch features for + Arg [2] : (optional) string $attrib_value + The value of the attribute to fetch features for + Example : + #get all misc features that have an embl accession + @feats = @{$mfa->fetch_all_by_attrib_type_value('embl_acc')}; + #get the misc feature with synonym 'AL014121' + ($feat)=@{$mfa->fetch_all_by_attrib_type_value('synonym','AL014121'); + Description: Retrieves MiscFeatures which have a particular attribute. + If the attribute value argument is also provided only + features which have the attribute AND a particular value + are returned. The features are returned in their native + coordinate system (i.e. the coordinate system that they + are stored in). + Returntype : listref of Bio::EnsEMBL::MiscFeatures + Exceptions : throw if attrib_type code arg is not provided + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_attribute_type_value { + my $self = shift; + my $attrib_type_code = shift; + my $attrib_value = shift; + + throw("Attrib type code argument is required.") + if ( !$attrib_type_code ); + + # Need to do 2 queries so that all of the ids come back with the + # features. The problem with adding attrib constraints to filter the + # misc_features which come back is that not all of the attributes will + # come back + + my $sql = qq( + SELECT DISTINCT + ma.misc_feature_id + FROM misc_attrib ma, + attrib_type at, + misc_feature mf, + seq_region sr, + coord_system cs + WHERE ma.attrib_type_id = at.attrib_type_id + AND at.code = ? + AND ma.misc_feature_id = mf.misc_feature_id + AND mf.seq_region_id = sr.seq_region_id + AND sr.coord_system_id = cs.coord_system_id + AND cs.species_id = ?); + + if ($attrib_value) { + $sql .= " AND ma.value = ?"; + } + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $attrib_type_code, SQL_VARCHAR ); + $sth->bind_param( 2, $self->species_id(), SQL_INTEGER ); + if ($attrib_value) { + $sth->bind_param( 3, $attrib_value, SQL_VARCHAR ); + } + + $sth->execute(); + + my @ids = map { $_->[0] } @{ $sth->fetchall_arrayref() }; + + $sth->finish(); + + # Construct constraints from the list of ids. Split ids into groups + # of 1000 to ensure that the query is not too big. + my @constraints; + while (@ids) { + my @subset = splice( @ids, 0, 1000 ); + if ( @subset == 1 ) { + push @constraints, "mf.misc_feature_id = $subset[0]"; + } else { + my $id_str = join( ',', @subset ); + push @constraints, "mf.misc_feature_id in ($id_str)"; + } + } + + my @results; + foreach my $constraint (@constraints) { + push @results, @{ $self->generic_fetch($constraint) }; + } + + return \@results; +} ## end sub fetch_all_by_attribute_type_value + + +#_tables +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the tables to query +# Returntype : string +# Exceptions : none +# Caller : internal + + +sub _tables { + my $self = shift; + + return (['misc_feature', 'mf'], + ['misc_feature_misc_set', 'mfms'], + ['misc_attrib', 'ma'], + ['attrib_type', 'at']); +} + + +#_columns + +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the columns to query +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + #warning _objs_from_sth implementation depends on ordering + return qw (mf.misc_feature_id + mf.seq_region_id + mf.seq_region_start + mf.seq_region_end + mf.seq_region_strand + ma.value + at.code + mfms.misc_set_id + at.name + at.description); +} + + + +# _default_where_clause + +# Arg [1] : none +# Example : none +# Description: Overrides superclass method to provide an additional +# table joining constraint before the SQL query is performed. +# Returntype : string +# Exceptions : none +# Caller : generic_fetch + +sub _default_where_clause { + my $self = shift; + + return ''; +} + + +sub _left_join { + my $self = shift; + + return( + ['misc_feature_misc_set','mf.misc_feature_id = mfms.misc_feature_id'], + ['misc_attrib', 'mf.misc_feature_id = ma.misc_feature_id'], + ['attrib_type','ma.attrib_type_id = at.attrib_type_id']); +} + + +sub _final_clause { + my $self = shift; + + return " ORDER BY mf.misc_feature_id"; +} + + +# _objs_from_sth + +# Arg [1] : StatementHandle $sth +# Example : none +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of MiscFeatures from a +# hashref generated from an SQL query +# Returntype : listref of Bio::EnsEMBL::MiscFeatures +# Exceptions : none +# Caller : internal + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $msa = $self->db->get_MiscSetAdaptor(); + + my @features; + my %ms_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($misc_feature_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $attrib_value, $attrib_type_code, $misc_set_id, + $attrib_type_name, $attrib_type_description ); + + $sth->bind_columns( \$misc_feature_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, + \$attrib_value, \$attrib_type_code,\$misc_set_id, + \$attrib_type_name, \$attrib_type_description ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + my $current = -1; + my $throw_away = -1; + my $feat; + my $feat_misc_sets; + my $feat_attribs; + my $seen_attribs; + + FEATURE: while($sth->fetch()) { + #if this feature is not being used, skip all rows related to it + next if($throw_away == $misc_feature_id); + + if ($current == $misc_feature_id) { + #still working on building up attributes and sets for current feature + + #if there is a misc_set, add it to the current feature + if ($misc_set_id) { + my $misc_set = $ms_hash{$misc_set_id} ||= + $msa->fetch_by_dbID($misc_set_id); + if ( ! exists $feat_misc_sets->{$misc_set->{'code'}} ) { + $feat->add_MiscSet( $misc_set ); + $feat_misc_sets->{$misc_set->{'code'}} = $misc_set; + } + } + + #if there is a new attribute add it to the current feature + if ($attrib_value && $attrib_type_code && + !$seen_attribs->{"$attrib_type_code:$attrib_value"}) { + my $attrib = Bio::EnsEMBL::Attribute->new + ( -CODE => $attrib_type_code, + -NAME => $attrib_type_name, + -DESC => $attrib_type_description, + -VALUE => $attrib_value + ); + + + $feat_attribs ||= []; + push @$feat_attribs, $attrib; + $seen_attribs->{"$attrib_type_code:$attrib_value"} = 1; + } + + } else { + if ($feat) { + #start working on a new feature, discard references to last one + $feat = {}; + $feat_attribs = []; + $feat_misc_sets = {}; + $seen_attribs = {}; + } + + $current = $misc_feature_id; + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if (!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if ($mapper) { + + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start,$seq_region_end, $seq_region_strand ) = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start,$seq_region_end, $seq_region_strand ) = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end,$seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + if(!defined($seq_region_id)) { + $throw_away = $misc_feature_id; + next FEATURE; + } + + #get a slice in the coord system we just mapped to +# if ($asm_cs == $sr_cs || +# ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if ($dest_slice) { + if ($dest_slice_start != 1 || $dest_slice_strand != 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + #throw away features off the end of the requested slice + if ($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + #flag this feature as one to throw away + $throw_away = $misc_feature_id; + next FEATURE; + } + $slice = $dest_slice; + } + + + if ($attrib_value && $attrib_type_code) { + my $attrib = Bio::EnsEMBL::Attribute->new + ( -CODE => $attrib_type_code, + -NAME => $attrib_type_name, + -DESC => $attrib_type_description, + -VALUE => $attrib_value + ); + $feat_attribs = [$attrib]; + $seen_attribs->{"$attrib_type_code:$attrib_value"} = 1; + } + + $feat = + $self->_create_feature_fast( 'Bio::EnsEMBL::MiscFeature', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'adaptor' => $self, + 'dbID' => $misc_feature_id, + 'attributes' => $feat_attribs + } ); + + push @features, $feat; + + if ($misc_set_id) { + #get the misc_set object + my $misc_set = $ms_hash{$misc_set_id} ||= + $msa->fetch_by_dbID($misc_set_id); + if ( ! exists $feat_misc_sets->{$misc_set->{'code'}} ) { + $feat->add_MiscSet( $misc_set ); + $feat_misc_sets->{$misc_set->{'code'}} = $misc_set; + } + } + } + } + + return \@features; +} + + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$misc_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all misc_features in the + current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("misc_feature",undef,$ordered); +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::MiscFeatures @misc_features + Example : $misc_feature_adaptor->store(@misc_features); + Description: Stores a list of MiscFeatures in this database. The stored + features will have their + Returntype : none + Exceptions : throw on invalid arguments + warning if misc feature is already stored in this database + throw if start/end/strand attribs are not valid + Caller : general + Status : Stable + +=cut + +sub store { + my $self = shift; + my @misc_features = @_; + + my $db = $self->db(); + + my $feature_sth = $self->prepare + ("INSERT INTO misc_feature SET " . + " seq_region_id = ?, " . + " seq_region_start = ?, " . + " seq_region_end = ?, " . + " seq_region_strand = ?"); + + my $feature_set_sth = $self->prepare + ("INSERT IGNORE misc_feature_misc_set SET " . + " misc_feature_id = ?, " . + " misc_set_id = ?"); + + my $msa = $db->get_MiscSetAdaptor(); + my $aa = $db->get_AttributeAdaptor(); + + FEATURE: + foreach my $mf (@misc_features) { + if(!ref($mf) || !$mf->isa('Bio::EnsEMBL::MiscFeature')) { + throw("List of MiscFeature arguments expeceted"); + } + + if($mf->is_stored($db)) { + warning("MiscFeature [" .$mf->dbID."] is already stored in database."); + next FEATURE; + } + + # do some checking of the start/end and convert to seq_region coords + my $original = $mf; + my $seq_region_id; + ($mf, $seq_region_id) = $self->_pre_store($mf); + + # store the actual MiscFeature + $feature_sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $feature_sth->bind_param(2,$mf->start,SQL_INTEGER); + $feature_sth->bind_param(3,$mf->end,SQL_INTEGER); + $feature_sth->bind_param(4,$mf->strand,SQL_TINYINT); + $feature_sth->execute(); + + my $dbID = $feature_sth->{'mysql_insertid'}; + + $mf->dbID($dbID); + $mf->adaptor($self); + + # store all the attributes + my $attribs = $mf->get_all_Attributes(); + $aa->store_on_MiscFeature($mf, $attribs); + + # store all the sets that have not been stored yet + my $sets = $mf->get_all_MiscSets(); + foreach my $set (@$sets) { + $msa->store($set) if(!$set->is_stored($db)); + + # update the misc_feat_misc_set table to store the set relationship + $feature_set_sth->bind_param(1,$dbID,SQL_INTEGER); + $feature_set_sth->bind_param(2,$set->dbID,SQL_INTEGER); + + $feature_set_sth->execute(); + } + } + + return; +} + +1; + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/MiscSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/MiscSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,317 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::MiscSetAdaptor - Provides database interaction for +Bio::EnsEMBL::MiscSet objects. + +=head1 SYNOPSIS + + my $msa = $registry->get_adaptor( 'Human', 'Core', 'MiscSet' ); + + my $misc_set = $msa->fetch_by_dbID(1234); + + $misc_set = $msa->fetch_by_code('clone'); + +=head1 DESCRIPTION + +This class provides database interactivity for MiscSet objects. +MiscSets are used to classify MiscFeatures into groups. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::MiscSetAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::MiscSet; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Arg [...] : Superclass args. See Bio::EnsEMBL::DBSQL::BaseAdaptor + Description: Instantiates a Bio::EnsEMBL::DBSQL::MiscSetAdaptor and + caches the contents of the MiscSet table. + Returntype : Bio::EnsEMBL::MiscSet + Exceptions : none + Caller : MiscFeatureAdaptor + Status : Stable + +=cut + + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + $self->{'_id_cache'} = {}; + $self->{'_code_cache'} = {}; + + # cache the entire contents of the misc set table + # the table is small and it removes the need to repeatedly query the + # table or join to the table + + $self->fetch_all(); + + return $self; +} + + + + +=head2 fetch_all + + Arg [1] : none + Example : foreach my $ms (@{$msa->fetch_all()}) { + print $ms->code(), ' ', $ms->name(), "\n"; + } + Description: Retrieves every MiscSet defined in the DB. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : listref of Bio::EnsEMBL::MiscSets + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + + my $sth = $self->prepare + ('SELECT misc_set_id, code, name, description, max_length FROM misc_set'); + + $sth->execute(); + + my ($dbID, $code, $name, $desc, $max_len); + $sth->bind_columns(\$dbID, \$code, \$name, \$desc, \$max_len); + + my @all; + + while($sth->fetch()) { + my $ms = Bio::EnsEMBL::MiscSet->new + (-DBID => $dbID, + -ADAPTOR => $self, + -CODE => $code, + -NAME => $name, + -DESCRIPTION => $desc, + -LONGEST_FEATURE => $max_len); + + $self->{'_id_cache'}->{$dbID} = $ms; + $self->{'_code_cache'}->{lc($code)} = $ms; + push @all, $ms; + } + + $sth->finish(); + + return \@all; +} + + + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + The internal identifier of the misc set to retrieve + Example : my $ms = $msa->fetch_by_dbID($dbID); + Description: Retrieves a misc set via its internal identifier + Returntype : Bio::EnsEMBL::MiscSet + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + if(!$self->{'_id_cache'}->{$dbID}) { + # on a cache miss reread the whole table and reload the cache + $self->fetch_all(); + } + + return $self->{'_id_cache'}->{$dbID}; +} + + + +=head2 fetch_by_code + + Arg [1] : string $code + The unique code of the MiscSet to retrieve + Example : my $ms = $msa->fetch_by_code('clone'); + Description: Retrieves a MiscSet via its code + Returntype : Bio::EnsEMBL::MiscSet + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_code { + my $self = shift; + my $code = shift; + + if(!$self->{'_code_cache'}->{lc($code)}) { + # on cache miss, reread whole table and reload cache + $self->fetch_all(); + } + + return $self->{'_code_cache'}->{lc($code)}; +} + + + +=head2 store + + Arg [1] : list of MiscSets @mist_sets + Example : $misc_set_adaptor->store(@misc_sets); + Description: Stores a list of MiscSets in the database, and sets the + dbID and adaptor attributes of the stored sets. + Returntype : none + Exceptions : throw on incorrect arguments + warning if a feature is already stored in this database + Caller : MiscFeatureAdaptor::store + Status : Stable + +=cut + +sub store { + my $self = shift; + my @misc_sets = @_; + + # we use 'insert ignore' so that inserts can occur safely on the farm + # otherwise 2 processes could try to insert at the same time and one + # would fail + + my $sth = $self->prepare + ("INSERT IGNORE INTO misc_set " . + "SET code = ?, " . + " name = ?, " . + " description = ?, " . + " max_length = ?"); + + my $db = $self->db(); + + SET: + foreach my $ms (@misc_sets) { + if(!ref($ms) || !$ms->isa('Bio::EnsEMBL::MiscSet')) { + throw("List of MiscSet arguments expected."); + } + + if($ms->is_stored($db)) { + warning("MiscSet [".$ms->dbID."] is already stored in this database."); + next SET; + } + + $sth->bind_param(1,$ms->code,SQL_VARCHAR); + $sth->bind_param(2,$ms->name,SQL_VARCHAR); + $sth->bind_param(3,$ms->description,SQL_LONGVARCHAR); + $sth->bind_param(4,$ms->longest_feature,SQL_INTEGER); + + my $num_inserted = $sth->execute(); + + my $dbID; + + if($num_inserted == 0) { + # insert failed because set with this code already exists + my $sth2 = $self->prepare("SELECT misc_set_id from misc_set " . + "WHERE code = ?"); + $sth2->bind_param(1,$ms->code,SQL_VARCHAR); + $sth2->execute(); + + if($sth2->rows() != 1) { + throw("Could not retrieve or store MiscSet, code=[".$ms->code."]\n". + "Wrong database user/permissions?"); + } + + ($dbID) = $sth2->fetchrow_array(); + } else { + $dbID = $sth->{'mysql_insertid'}; + } + + $ms->dbID($dbID); + $ms->adaptor($self); + + # update the internal caches + $self->{'_id_cache'}->{$dbID} = $ms; + $self->{'_code_cache'}->{lc($ms->code())} = $ms; + } + + return; +} + +=head2 update + + Arg [1] : Bio::EnsEMBL::MiscSet $miscset + Example : $adaptor->update($miscset) + Description: Updates this misc_set in the database + Returntype : int 1 if update is performed, undef if it is not + Exceptions : throw if arg is not an misc_set object + Caller : ? + Status : Stable + +=cut + +sub update { + my $self = shift; + my $m = shift; + + if (!ref($m) || !$m->isa('Bio::EnsEMBL::MiscSet')) { + throw("Expected Bio::EnsEMBL::MiscSet argument."); + } + + if(!$m->is_stored($self->db())) { + return undef; + } + + my $sth = $self->prepare("UPDATE misc_set ". + "SET code =?, name =?, description = ?, max_length = ? ". + "WHERE misc_set_id = ?"); + + $sth->bind_param(1,$m->code,SQL_VARCHAR); + $sth->bind_param(2,$m->name,SQL_VARCHAR); + $sth->bind_param(3,$m->description,SQL_VARCHAR); + $sth->bind_param(4,$m->longest_feature,SQL_INTEGER); + $sth->bind_param(5,$m->dbID,SQL_INTEGER); + + $sth->execute(); + $sth->finish(); + + # update the internal caches + $self->{'_id_cache'}->{$m->dbID} = $m; + $self->{'_code_cache'}->{lc($m->code())} = $m; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/OntologyDBAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/OntologyDBAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,48 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::OntologyDBAdaptor + +=head1 DESCRIPTION + +Database adaptor for the ontology database ensembl_ontology_NN. +Mostly inheriting from Bio::EnsEMBL::DBSQL::DBAdaptor, overriding its +get_available_adaptors() method. Not doing very much else at this +moment. + +=cut + +package Bio::EnsEMBL::DBSQL::OntologyDBAdaptor; + +use strict; +use warnings; + +use base qw ( Bio::EnsEMBL::DBSQL::DBAdaptor ); + +sub get_available_adaptors { + return { + 'GOTerm' => 'Bio::EnsEMBL::DBSQL::OntologyTermAdaptor', #deprecated + 'SOTerm' => 'Bio::EnsEMBL::DBSQL::OntologyTermAdaptor', #deprecated + 'OntologyTerm' => 'Bio::EnsEMBL::DBSQL::OntologyTermAdaptor' }; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/OntologyTermAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/OntologyTermAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,785 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::OntologyTermAdaptor + +=head1 SYNOPSIS + + my $goa = + $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' ); + + my $term = $goa->fetch_by_accession('GO:0010885'); + + my @children = @{ $goa->fetch_all_by_parent_term($term) }; + my @descendants = @{ $goa->fetch_all_by_ancestor_term($term) }; + + my @parents = @{ $goa->fetch_all_by_child_term($term) }; + my @ancestors = @{ $goa->fetch_all_by_descendant_term($term) }; + + my %ancestor_chart = %{ $goa->_fetch_ancestor_chart($term) }; + +=head1 DESCRIPTION + +An abstract adaptor class for fetching ontology +terms, creates Bio::EnsEMBL::OntologyTerm objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::OntologyTermAdaptor; + +use strict; +use warnings; + +use DBI qw( :sql_types ); + +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); + +use Bio::EnsEMBL::OntologyTerm; + +use base qw( Bio::EnsEMBL::DBSQL::BaseAdaptor ); + +=head2 fetch_all_by_name + + Arg [1] : String, name of term, or SQL pattern + Arg [2] : (optional) String, name of ontology + + Description : Fetches ontology term(s) given a name, a synonym, or a + SQL pattern like "%splice_site%" + + Example : + + my ($term) = + @{ $ot_adaptor->fetch_by_name( 'DNA_binding_site', 'SO' ) }; + + # Will find terms in both SO and GO: + my @terms = @{ $ot_adaptor->fetch_by_name('%splice_site%') }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub fetch_all_by_name { + my ( $this, $pattern, $ontology ) = @_; + + my $statement = q( +SELECT DISTINCT + term.term_id, + term.accession, + term.name, + term.definition, + term.subsets, + ontology.namespace +FROM ontology + JOIN term USING (ontology_id) + LEFT JOIN synonym USING (term_id) +WHERE ( term.name LIKE ? OR synonym.name LIKE ? )); + + if ( defined($ontology) ) { + $statement .= " AND ontology.name = ?"; + } + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $pattern, SQL_VARCHAR ); + $sth->bind_param( 2, $pattern, SQL_VARCHAR ); + + if ( defined($ontology) ) { + $sth->bind_param( 3, $ontology, SQL_VARCHAR ); + } + + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets, $namespace ); + $sth->bind_columns( + \( $dbid, $accession, $name, $definition, $subsets, $namespace ) ); + + my @terms; + + while ( $sth->fetch() ) { + $subsets ||= ''; + + push @terms, + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-namespace' => $namespace, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, ); + } + + return \@terms; +} ## end sub fetch_all_by_name + + +=head2 fetch_by_accession + + Arg [1] : String + + Description : Fetches an ontology term given an accession. + + Example : + + my $term = $ot_adaptor->fetch_by_accession('GO:0030326'); + + Return type : Bio::EnsEMBL::OntologyTerm + +=cut + +sub fetch_by_accession { + my ( $this, $accession ) = @_; + + my $statement = q( +SELECT term.term_id, + term.name, + term.definition, + term.subsets, + ontology.name, + ontology.namespace +FROM ontology + JOIN term USING (ontology_id) +WHERE term.accession = ?); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $accession, SQL_VARCHAR ); + + $sth->execute(); + + my ( $dbid, $name, $definition, $subsets, $ontology, $namespace ); + $sth->bind_columns( + \( $dbid, $name, $definition, $subsets, $ontology, $namespace ) ); + + $sth->fetch(); + $subsets ||= ''; + + my $term = + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $ontology, + '-namespace' => $namespace, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, + '-synonyms' => $this->_fetch_synonyms_by_dbID($dbid) + ); + $sth->finish(); + + return $term; +} ## end sub fetch_by_accession + +=head2 fetch_all_by_parent_term + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The term whose children terms should be fetched. + + Description : Given a parent ontology term, returns a list of + its immediate children terms. + + Example : + + my @children = + @{ $ot_adaptor->fetch_all_by_parent_term($term) }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub fetch_all_by_parent_term { + my ( $this, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + + my @terms; + + if ( !$term->{'child_terms_fetched'} ) { + my $statement = q( +SELECT child_term.term_id, + child_term.accession, + child_term.name, + child_term.definition, + child_term.subsets, + rt.name +FROM term child_term + JOIN relation ON (relation.child_term_id = child_term.term_id) + JOIN relation_type rt USING (relation_type_id) +WHERE relation.parent_term_id = ?); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $term->dbID(), SQL_INTEGER ); + + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets, $relation ); + $sth->bind_columns( + \( $dbid, $accession, $name, $definition, $subsets, $relation ) ); + + while ( $sth->fetch() ) { + $subsets ||= ''; + + my $child_term = + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $term->{'ontology'}, + '-namespace' => $term->{'namespace'}, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, ); + + push( @terms, $child_term ); + push( @{ $term->{'children'}{$relation} }, $child_term ); + } + + $term->{'child_terms_fetched'} = 1; + } else { + foreach my $relation ( values( %{ $term->{'children'} } ) ) { + push( @terms, @{$relation} ); + } + } + + return \@terms; +} ## end sub fetch_all_by_parent_term + +=head2 fetch_all_by_ancestor_term + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The term whose descendant terms should be fetched. + + Description : Given a parent ontology term, returns a list of + all its descendant terms, down to and including + any leaf terms. Relations of the type 'is_a' and + 'part_of' are followed. + + Example : + + my @descendants = + @{ $ot_adaptor->fetch_all_by_ancestor_term($term) }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub fetch_all_by_ancestor_term { + my ( $this, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + + my $statement = q( +SELECT DISTINCT + child_term.term_id, + child_term.accession, + child_term.name, + child_term.definition, + child_term.subsets +FROM term child_term + JOIN closure ON (closure.child_term_id = child_term.term_id) +WHERE closure.parent_term_id = ? + AND closure.distance > 0 +ORDER BY closure.distance, child_term.accession); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $term->dbID(), SQL_INTEGER ); + + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets ); + $sth->bind_columns( + \( $dbid, $accession, $name, $definition, $subsets ) ); + + my @terms; + + while ( $sth->fetch() ) { + $subsets ||= ''; + + push( @terms, + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $term->{'ontology'}, + '-namespace' => $term->{'namespace'}, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, ) ); + } + + return \@terms; +} ## end sub fetch_all_by_ancestor_term + +=head2 fetch_all_by_child_term + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The term whose parent terms should be fetched. + + Description : Given a child ontology term, returns a list of + its immediate parent terms. + + Example : + + my @parents = @{ $ot_adaptor->fetch_all_by_child_term($term) }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub fetch_all_by_child_term { + my ( $this, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + + my @terms; + + if ( !$term->{'parent_terms_fetched'} ) { + my $statement = q( +SELECT parent_term.term_id, + parent_term.accession, + parent_term.name, + parent_term.definition, + parent_term.subsets, + rt.name +FROM term parent_term + JOIN relation ON (relation.parent_term_id = parent_term.term_id) + JOIN relation_type rt USING (relation_type_id) +WHERE relation.child_term_id = ?); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $term->dbID(), SQL_INTEGER ); + + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets, $relation ); + $sth->bind_columns( + \( $dbid, $accession, $name, $definition, $subsets, $relation ) ); + + while ( $sth->fetch() ) { + $subsets ||= ''; + + my $parent_term = + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $term->{'ontology'}, + '-namespace' => $term->{'namespace'}, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, ); + + push( @terms, $parent_term ); + push( @{ $term->{'parents'}{$relation} }, $parent_term ); + } + + $term->{'parent_terms_fetched'} = 1; + } else { + foreach my $relation ( values( %{ $term->{'parents'} } ) ) { + push( @terms, @{$relation} ); + } + } + + return \@terms; +} ## end sub fetch_all_by_child_term + +=head2 fetch_all_by_descendant_term + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The term whose ancestor terms should be fetched. + + Arg [2] : (optional) String + The subset within the ontolgy to which the query + should be restricted. The subset may be specified as + a SQL pattern, e.g., "%goslim%" (but "goslim%" might + not do what you expect), or as a specific subset name, + e.g., "goslim_generic". + + Arg [3] : (optional) Boolean + If true (non-zero), only return the closest + term(s). If this argument is true, and the + previous argument is left undefined, this method + will return the parent(s) of the given term. + + Arg [4] : (optional) Boolean + If true we will allow the retrieval of terms whose distance + to the current term is 0. If false then we will only return + those which are above the current term in the ontology + + Description : Given a child ontology term, returns a list of + all its ancestor terms, up to and including any + root term. Relations of the type 'is_a' and + 'part_of' are followed. Optionally, only terms in + a given subset of the ontology may be returned, + and additionally one may ask to only get the + closest term(s) to the given child term. + + Example : + + my @ancestors = + @{ $ot_adaptor->fetch_all_by_descendant_term($term) }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub fetch_all_by_descendant_term { + my ( $this, $term, $subset, $closest_only, $allow_zero_distance ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + + $closest_only ||= 0; + + my $statement = q( +SELECT DISTINCT + parent_term.term_id, + parent_term.accession, + parent_term.name, + parent_term.definition, + parent_term.subsets, + closure.distance +FROM term parent_term + JOIN closure ON (closure.parent_term_id = parent_term.term_id) +WHERE closure.child_term_id = ? + AND closure.distance > ?); + + if ( defined($subset) ) { + if ( index( $subset, '%' ) != -1 ) { + $statement .= q( + AND parent_term.subsets LIKE ?); + } else { + $statement .= q( + AND FIND_IN_SET(?, parent_term.subsets) > 0); + } + } + + $statement .= q( +ORDER BY closure.distance, parent_term.accession); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $term->dbID(), SQL_INTEGER ); + my $query_distance = ($allow_zero_distance) ? -1 : 0; + $sth->bind_param( 2, $query_distance, SQL_INTEGER ); + + if ( defined($subset) ) { + $sth->bind_param( 3, $subset, SQL_VARCHAR ); + } + + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets, $distance ); + $sth->bind_columns( + \( $dbid, $accession, $name, $definition, $subsets, $distance ) ); + + my @terms; + my $min_distance; + + while ( $sth->fetch() ) { + $subsets ||= ''; + $min_distance ||= $distance; + + if ( !$closest_only || $distance == $min_distance ) { + push( @terms, + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $term->{'ontology'}, + '-namespace' => $term->{'namespace'}, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, ) ); + } else { + $sth->finish(); + last; + } + } + + return \@terms; +} ## end sub fetch_all_by_descendant_term + +sub _fetch_synonyms_by_dbID { + my ( $this, $dbID ) = @_; + + my $statement = q( +SELECT synonym.name +FROM synonym +WHERE synonym.term_id = ?); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $dbID, SQL_INTEGER ); + + $sth->execute(); + + my $synonym; + $sth->bind_col( 1, \$synonym ); + + my @synonyms; + while ( $sth->fetch() ) { + push( @synonyms, $synonym ); + } + + return \@synonyms; +} + + + +=head2 _fetch_ancestor_chart + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The term whose ancestor terms should be fetched. + + Description : Given a child ontology term, returns a hash + structure containing its ancestor terms, up to and + including any root term. Relations of the type + 'is_a' and 'part_of' are included. + + Example : + + my %chart = %{ $ot_adaptor->_fetch_ancestor_chart($term) }; + + Return type : A reference to a hash structure like this: + + { + 'GO:XXXXXXX' => { + 'term' => # ref to Bio::EnsEMBL::OntologyTerm object + 'is_a' => [...], # listref of Bio::EnsEMBL::OntologyTerm + 'part_of' => [...], # listref of Bio::EnsEMBL::OntologyTerm + }, + 'GO:YYYYYYY' => { + # Similarly for all ancestors, + # and including the query term itself. + } + } + +=cut + +sub _fetch_ancestor_chart { + my ( $this, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + + my $statement = q( +SELECT subparent_term.term_id, + parent_term.term_id, + relation_type.name +FROM closure + JOIN relation + ON (relation.parent_term_id = closure.parent_term_id + AND relation.child_term_id = closure.subparent_term_id) + JOIN relation_type USING (relation_type_id) + JOIN term subparent_term + ON (subparent_term.term_id = closure.subparent_term_id) + JOIN term parent_term ON (parent_term.term_id = closure.parent_term_id) +WHERE closure.child_term_id = ? +ORDER BY closure.distance); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $term->dbID(), SQL_INTEGER ); + + $sth->execute(); + + my ( $subparent_id, $parent_id, $relation ); + $sth->bind_columns( \( $subparent_id, $parent_id, $relation ) ); + + my %id_chart; + my %acc_chart; + + while ( $sth->fetch() ) { + if ( !exists( $id_chart{$parent_id} ) ) { + $id_chart{$parent_id} = {}; + } + push( @{ $id_chart{$subparent_id}{$relation} }, $parent_id ); + } + + my @terms = @{ $this->fetch_all_by_dbID_list( [ keys(%id_chart) ] ) }; + + foreach my $term (@terms) { + $id_chart{ $term->dbID() }{'term'} = $term; + $acc_chart{ $term->accession() }{'term'} = $term; + } + + foreach my $term (@terms) { + my $accession = $term->accession(); + my $dbID = $term->dbID(); + + foreach my $relation ( keys( %{ $id_chart{$dbID} } ) ) { + if ( $relation eq 'term' ) { next } + + foreach my $id ( @{ $id_chart{$dbID}{$relation} } ) { + push( @{ $acc_chart{$accession}{$relation} }, + $id_chart{$id}{'term'} ); + } + } + } + + return \%acc_chart; +} ## end sub _fetch_ancestor_chart + +#----------------------------------------------------------------------- +# Useful public methods that implement functionality not properly +# provided by the parent class Bio::EnsEMBL::DBSQL::BaseAdaptor. + +sub fetch_by_dbID { + my ( $this, $dbid ) = @_; + + my $statement = q( +SELECT term.accession, + term.name, + term.definition, + term.subsets, + ontology.name, + ontology.namespace +FROM ontology + JOIN term USING (ontology_id) +WHERE term.term_id = ?); + + my $sth = $this->prepare($statement); + $sth->bind_param( 1, $dbid, SQL_INTEGER ); + + $sth->execute(); + + my ( $accession, $name, $definition, $subsets, $ontology, + $namespace ); + $sth->bind_columns( + \( $accession, $name, $definition, $subsets, $ontology, $namespace + ) ); + + $sth->fetch(); + $subsets ||= ''; + + my $term = + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $ontology, + '-namespace' => $namespace, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, + '-synonyms' => $this->_fetch_synonyms_by_dbID($dbid) + ); + $sth->finish(); + + return $term; +} ## end sub fetch_by_dbID + +sub fetch_all_by_dbID_list { + my ( $this, $dbids ) = @_; + + if ( !@{$dbids} ) { return [] } + + my $stmt = q( +SELECT term.term_id, + term.accession, + term.name, + term.definition, + term.subsets, + ontology.name, + ontology.namespace +FROM ontology + JOIN term USING (ontology_id) +WHERE term.term_id IN (%s)); + + my $statement = sprintf( + $stmt, + join( + ',', + map { + $this->dbc()->db_handle()->quote( $_, SQL_INTEGER ) + } @{$dbids} ) ); + + my $sth = $this->prepare($statement); + + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets, $ontology, + $namespace ); + $sth->bind_columns( \( $dbid, $accession, $name, $definition, + $subsets, $ontology, $namespace ) ); + + my @terms; + + while ( $sth->fetch() ) { + $subsets ||= ''; + + push( @terms, + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $ontology, + '-namespace' => $namespace, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition, ) ); + } + + return \@terms; +} ## end sub fetch_all_by_dbID_list + +sub fetch_all { + my ($this) = @_; + + my $statement = q( +SELECT term.term_id, + term.accession, + term.name, + term.definition, + term.subsets, + ontology.name, + ontology.namespace +FROM ontology + JOIN term USING (ontology_id)); + + my $sth = $this->prepare($statement); + $sth->execute(); + + my ( $dbid, $accession, $name, $definition, $subsets, $ontology, + $namespace ); + $sth->bind_columns( \( $dbid, $accession, $name, $definition, + $subsets, $ontology, $namespace ) ); + + my @terms; + + while ( $sth->fetch() ) { + $subsets ||= ''; + + push( @terms, + Bio::EnsEMBL::OntologyTerm->new( + '-dbid' => $dbid, + '-adaptor' => $this, + '-accession' => $accession, + '-ontology' => $ontology, + '-namespace' => $namespace, + '-subsets' => [ split( /,/, $subsets ) ], + '-name' => $name, + '-definition' => $definition ) ); + } + + return \@terms; +} ## end sub fetch_all + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/OperonAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/OperonAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,794 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::OperonAdaptor - Database adaptor for the retrieval and +storage of Operon objects + +=head1 SYNOPSIS + +my $operon_adaptor = Bio::EnsEMBL::DBSQL::OperonAdaptor->new($dba); +$operon_adaptor->store($operon); +my $operon2 = $operon_adaptor->fetch_by_dbID( $operon->dbID() ); + +=head1 DESCRIPTION + +This is a database aware adaptor for the retrieval and storage of operon +objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::OperonAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Operon; + +use vars '@ISA'; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _tables { + return ( [ 'operon', 'o' ] ); +} + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _columns { + my ($self) = @_; + + my $created_date = + $self->db()->dbc()->from_date_to_seconds("o.created_date"); + my $modified_date = + $self->db()->dbc()->from_date_to_seconds("o.modified_date"); + + return ( 'o.operon_id', 'o.seq_region_id', 'o.seq_region_start', + 'o.seq_region_end', 'o.seq_region_strand', 'o.display_label', + 'o.analysis_id', 'o.stable_id', 'o.version', + $created_date, $modified_date ); +} + +=head2 list_dbIDs + + Example : @operon_ids = @{$operon_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all operons in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : Listref of Ints + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_dbIDs { + my ( $self, $ordered ) = @_; + + return $self->_list_dbIDs( "operon", undef, $ordered ); +} + +=head2 list_stable_ids + + Example : @stable_operon_ids = @{$operon_adaptor->list_stable_ids()}; + Description: Gets an listref of stable ids for all operons in the current db + Returntype : reference to a list of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_stable_ids { + my ($self) = @_; + + return $self->_list_dbIDs( "operon", "stable_id" ); +} + +sub list_seq_region_ids { + my $self = shift; + + return $self->_list_seq_region_ids('operon'); +} + +=head2 fetch_by_name + + Arg [1] : String $label - name of operon to fetch + Example : my $operon = $operonAdaptor->fetch_by_name("accBC"); + Description: Returns the operon which has the given display label or undef if + there is none. If there are more than 1, only the first is + reported. + Returntype : Bio::EnsEMBL::Operon + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_name { + my $self = shift; + my $label = shift; + + my $constraint = "o.display_label = ?"; + $self->bind_param_generic_fetch( $label, SQL_VARCHAR ); + my ($operon) = @{ $self->generic_fetch($constraint) }; + + return $operon; +} + +=head2 fetch_by_stable_id + + Arg [1] : String $id + The stable ID of the operon to retrieve + Example : $operon = $operon_adaptor->fetch_by_stable_id('ENSG00000148944'); + Description: Retrieves a operon object from the database via its stable id. + The operon will be retrieved in its native coordinate system (i.e. + in the coordinate system it is stored in the database). It may + be converted to a different coordinate system through a call to + transform() or transfer(). If the operon or exon is not found + undef is returned instead. + Returntype : Bio::EnsEMBL::Operon or undef + Exceptions : if we cant get the operon in given coord system + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ( $self, $stable_id ) = @_; + + my $constraint = "o.stable_id = ?"; + $self->bind_param_generic_fetch( $stable_id, SQL_VARCHAR ); + my ($operon) = @{ $self->generic_fetch($constraint) }; + + return $operon; +} + +=head2 fetch_all + + Example : $operons = $operon_adaptor->fetch_all(); + Description : Similar to fetch_by_stable_id, but retrieves all + operons stored in the database. + Returntype : listref of Bio::EnsEMBL::Operon + Caller : general + Status : At Risk + +=cut + +sub fetch_all { + my ($self) = @_; + + my $constraint = ''; + my @operons = @{ $self->generic_fetch($constraint) }; + return \@operons; +} + +=head2 fetch_all_versions_by_stable_id + + Arg [1] : String $stable_id + The stable ID of the operon to retrieve + Example : $operon = $operon_adaptor->fetch_all_versions_by_stable_id + ('ENSG00000148944'); + Description : Similar to fetch_by_stable_id, but retrieves all versions of a + operon stored in the database. + Returntype : listref of Bio::EnsEMBL::Operon + Exceptions : if we cant get the operon in given coord system + Caller : general + Status : At Risk + +=cut + +sub fetch_all_versions_by_stable_id { + my ( $self, $stable_id ) = @_; + + my $constraint = "o.stable_id = ?"; + $self->bind_param_generic_fetch( $stable_id, SQL_VARCHAR ); + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch operons on. + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Arg [3] : (optional) boolean $load_transcripts + if true, transcripts will be loaded immediately rather than + lazy loaded later. + Arg [4] : (optional) string $source + the source name of the features to obtain. + Arg [5] : (optional) string biotype + the biotype of the features to obtain. + Example : @operons = @{$operon_adaptor->fetch_all_by_Slice()}; + Description: Overrides superclass method to optionally load transcripts + immediately rather than lazy-loading them later. This + is more efficient when there are a lot of operons whose + transcripts are going to be used. + Returntype : reference to list of operons + Exceptions : thrown if exon cannot be placed on transcript slice + Caller : Slice::get_all_operons + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ( $self, $slice, $logic_name, $load_transcripts ) = @_; + + my $constraint = ''; + my $operons = + $self->SUPER::fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); + + # If there are less than two operons, still do lazy-loading. + if ( !$load_transcripts || @$operons < 2 ) { + return $operons; + } + + # Preload all of the transcripts now, instead of lazy loading later, + # faster than one query per transcript. + + # First check if transcripts are already preloaded. + # FIXME: Should check all transcripts. + if ( exists( $operons->[0]->{'_operon_transcript_array'} ) ) { + return $operons; + } + + # Get extent of region spanned by transcripts. + my ( $min_start, $max_end ); + foreach my $o (@$operons) { + if ( !defined($min_start) || $o->seq_region_start() < $min_start ) { + $min_start = $o->seq_region_start(); + } + if ( !defined($max_end) || $o->seq_region_end() > $max_end ) { + $max_end = $o->seq_region_end(); + } + } + + my $ext_slice; + + if ( $min_start >= $slice->start() && $max_end <= $slice->end() ) { + $ext_slice = $slice; + } else { + my $sa = $self->db()->get_SliceAdaptor(); + $ext_slice = + $sa->fetch_by_region( $slice->coord_system->name(), + $slice->seq_region_name(), + $min_start, + $max_end, + $slice->strand(), + $slice->coord_system->version() ); + } + + # Associate transcript identifiers with operons. + + my %o_hash = map { $_->dbID => $_ } @{$operons}; + + my $o_id_str = join( ',', keys(%o_hash) ); + + my $sth = + $self->prepare( "SELECT operon_id, operon_transcript_id " + . "FROM operon_transcript " + . "WHERE operon_id IN ($o_id_str)" ); + + $sth->execute(); + + my ( $o_id, $tr_id ); + $sth->bind_columns( \( $o_id, $tr_id ) ); + + my %tr_o_hash; + + while ( $sth->fetch() ) { + $tr_o_hash{$tr_id} = $o_hash{$o_id}; + } + + my $ta = $self->db()->get_OperonTranscriptAdaptor(); + my $transcripts = + $ta->fetch_all_by_Slice( $ext_slice, + 1, undef, + sprintf( "ot.operon_transcript_id IN (%s)", + join( ',', + sort { $a <=> $b } + keys(%tr_o_hash) ) ) ); + + # Move transcripts onto operon slice, and add them to operons. + foreach my $tr ( @{$transcripts} ) { + if ( !exists( $tr_o_hash{ $tr->dbID() } ) ) { next } + + my $new_tr; + if ( $slice != $ext_slice ) { + $new_tr = $tr->transfer($slice); + if ( !defined($new_tr) ) { + throw( "Unexpected. " + . "Transcript could not be transfered onto operon slice." + ); + } + } else { + $new_tr = $tr; + } + + $tr_o_hash{ $tr->dbID() }->add_OperonTranscript($new_tr); + } + + return $operons; +} ## end sub fetch_all_by_Slice + +=head2 fetch_by_transcript_id + + Arg [1] : Int $trans_id + Unique database identifier for the transcript whose operon should + be retrieved. The operon is returned in its native coord + system (i.e. the coord_system it is stored in). If the coord + system needs to be changed, then tranform or transfer should + be called on the returned object. undef is returned if the + operon or transcript is not found in the database. + Example : $operon = $operon_adaptor->fetch_by_transcript_id(1241); + Description: Retrieves a operon from the database via the database identifier + of one of its transcripts. + Returntype : Bio::EnsEMBL::Operon + Exceptions : none + Caller : operonral + Status : Stable + +=cut + +sub fetch_by_operon_transcript_id { + my ( $self, $trans_id ) = @_; + + # this is a cheap SQL call + my $sth = $self->prepare( + qq( + SELECT tr.operon_id + FROM operon_transcript tr + WHERE tr.operon_transcript_id = ? + ) ); + + $sth->bind_param( 1, $trans_id, SQL_INTEGER ); + $sth->execute(); + + my ($operonid) = $sth->fetchrow_array(); + + $sth->finish(); + + return undef if ( !defined $operonid ); + + my $operon = $self->fetch_by_dbID($operonid); + return $operon; +} + +=head2 fetch_by_operon_transcript_stable_id + + Arg [1] : string $trans_stable_id + transcript stable ID whose operon should be retrieved + Example : my $operon = $operon_adaptor->fetch_by_operon_transcript_stable_id + ('ENST0000234'); + Description: Retrieves a operon from the database via the stable ID of one of + its transcripts + Returntype : Bio::EnsEMBL::Operon + Exceptions : none + Caller : operonral + Status : Stable + +=cut + +sub fetch_by_operon_transcript_stable_id { + my ( $self, $trans_stable_id ) = @_; + + my $sth = $self->prepare( + qq( + SELECT operon_id + FROM operon_transcript + WHERE stable_id = ? + ) ); + + $sth->bind_param( 1, $trans_stable_id, SQL_VARCHAR ); + $sth->execute(); + + my ($operonid) = $sth->fetchrow_array(); + $sth->finish; + + return undef if ( !defined $operonid ); + + my $operon = $self->fetch_by_dbID($operonid); + return $operon; +} + +sub fetch_by_operon_transcript { + my ( $self, $trans ) = @_; + assert_ref( $trans, 'Bio::EnsEMBL::OperonTranscript' ); + $self->fetch_by_operon_transcript_id( $trans->dbID() ); +} + +=head2 store + + Arg [1] : Bio::EnsEMBL::Operon $operon + The operon to store in the database + Arg [2] : ignore_release in xrefs [default 1] set to 0 to use release info + in external database references + Example : $operon_adaptor->store($operon); + Description: Stores a operon in the database. + Returntype : the database identifier (dbID) of the newly stored operon + Exceptions : thrown if the $operon is not a Bio::EnsEMBL::Operon or if + $operon does not have an analysis object + Caller : general + Status : Stable + +=cut + +sub store { + my ( $self, $operon, $ignore_release ) = @_; + + if ( !ref $operon || !$operon->isa('Bio::EnsEMBL::Operon') ) { + throw("Must store a operon object, not a $operon"); + } + + my $db = $self->db(); + + if ( $operon->is_stored($db) ) { + return $operon->dbID(); + } + my $analysis = $operon->analysis(); + throw("Operons must have an analysis object.") if(!defined($analysis)); + my $analysis_id; + if ( $analysis->is_stored($db) ) { + $analysis_id = $analysis->dbID(); + } else { + $analysis_id = $db->get_AnalysisAdaptor->store( $analysis ); + } + # ensure coords are correct before storing + #$operon->recalculate_coordinates(); + + my $seq_region_id; + + ( $operon, $seq_region_id ) = $self->_pre_store($operon); + + my $store_operon_sql = qq( + INSERT INTO operon + SET seq_region_id = ?, + seq_region_start = ?, + seq_region_end = ?, + seq_region_strand = ?, + display_label = ?, + analysis_id = ? + ); + + if ( defined($operon->stable_id()) ) { + my $created = $self->db->dbc->from_seconds_to_date($operon->created_date()); + my $modified = $self->db->dbc->from_seconds_to_date($operon->modified_date()); + $store_operon_sql .= ", stable_id = ?, version = ?, created_date = " . $created . ",modified_date = " . $modified; + } + + # column status is used from schema version 34 onwards (before it was + # confidence) + + my $sth = $self->prepare($store_operon_sql); + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->bind_param( 2, $operon->start(), SQL_INTEGER ); + $sth->bind_param( 3, $operon->end(), SQL_INTEGER ); + $sth->bind_param( 4, $operon->strand(), SQL_TINYINT ); + $sth->bind_param( 5, $operon->display_label(), SQL_VARCHAR ); + $sth->bind_param( 6, $analysis_id, SQL_INTEGER ); + + if ( defined($operon->stable_id()) ) { + $sth->bind_param( 7, $operon->stable_id(), SQL_VARCHAR ); + my $version = ($operon->version()) ? $operon->version() : 1; + $sth->bind_param( 8, $version, SQL_INTEGER ); + } + + $sth->execute(); + $sth->finish(); + + my $operon_dbID = $sth->{'mysql_insertid'}; + + my $transcripts = $operon->get_all_OperonTranscripts(); + + if ( $transcripts && scalar @$transcripts ) { + my $transcript_adaptor = $db->get_OperonTranscriptAdaptor(); + for my $transcript (@$transcripts) { + $transcript_adaptor->store( $transcript, $operon_dbID ); + } + } + + # store the dbentries associated with this operon + my $dbEntryAdaptor = $db->get_DBEntryAdaptor(); + + foreach my $dbe ( @{ $operon->get_all_DBEntries } ) { + $dbEntryAdaptor->store( $dbe, $operon_dbID, "Operon", $ignore_release ); + } + + # store operon attributes if there are any + my $attrs = $operon->get_all_Attributes(); + if ( $attrs && scalar @$attrs ) { + my $attr_adaptor = $db->get_AttributeAdaptor(); + $attr_adaptor->store_on_Operon( $operon, $attrs ); + } + + # set the adaptor and dbID on the original passed in operon not the + # transfered copy + $operon->adaptor($self); + $operon->dbID($operon_dbID); + + return $operon_dbID; +} ## end sub store + +=head2 remove + + Arg [1] : Bio::EnsEMBL::Operon $operon + the operon to remove from the database + Example : $operon_adaptor->remove($operon); + Description: Removes a operon completely from the database. All associated + transcripts, exons, stable_identifiers, descriptions, etc. + are removed as well. Use with caution! + Returntype : none + Exceptions : throw on incorrect arguments + warning if operon is not stored in this database + Caller : general + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $operon = shift; + + if ( !ref($operon) || !$operon->isa('Bio::EnsEMBL::Operon') ) { + throw("Bio::EnsEMBL::Operon argument expected."); + } + + if ( !$operon->is_stored( $self->db() ) ) { + warning( "Cannot remove operon " + . $operon->dbID() + . ". Is not stored in " + . "this database." ); + return; + } + + # remove all object xrefs associated with this operon + + my $dbe_adaptor = $self->db()->get_DBEntryAdaptor(); + foreach my $dbe ( @{ $operon->get_all_DBEntries() } ) { + $dbe_adaptor->remove_from_object( $dbe, $operon, 'Operon' ); + } + + # remove all of the transcripts associated with this operon + my $transcriptAdaptor = $self->db->get_OperonTranscriptAdaptor(); + foreach my $trans ( @{ $operon->get_all_OperonTranscripts() } ) { + $transcriptAdaptor->remove($trans); + } + + # remove this operon from the database + + my $sth = $self->prepare("DELETE FROM operon WHERE operon_id = ? "); + $sth->bind_param( 1, $operon->dbID, SQL_INTEGER ); + $sth->execute(); + $sth->finish(); + + # unset the operon identifier and adaptor thereby flagging it as unstored + + $operon->dbID(undef); + $operon->adaptor(undef); + + return; +} ## end sub remove + +# _objs_from_sth + +# Arg [1] : StatementHandle $sth +# Arg [2] : Bio::EnsEMBL::AssemblyMapper $mapper +# Arg [3] : Bio::EnsEMBL::Slice $dest_slice +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of Operons +# Returntype : listref of Bio::EnsEMBL::Operon in target coordinate system +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _objs_from_sth { + my ( $self, $sth, $mapper, $dest_slice ) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @operons; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + my ( $stable_id, $version, $created_date, $modified_date, $analysis_id ); + + my ( $operon_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $display_label ); + + $sth->bind_columns( \$operon_id, \$seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$display_label, + \$analysis_id, \$stable_id, + \$version, \$created_date, + \$modified_date ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_seq_region_id; + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_seq_region_id = $dest_slice->get_seq_region_id(); + } + + my $count = 0; + OPERON: while ( $sth->fetch() ) { + $count++; + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + $analysis_hash{$analysis_id} = $analysis; + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + #get the slice object + my $slice = $slice_hash{ "ID:" . $seq_region_id }; + + if ( !$slice ) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{ "ID:" . $seq_region_id } = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if ($mapper) { + + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next OPERON if ( !defined($seq_region_id) ); + + #get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs + || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) + { + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } else { + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if ($dest_slice) { + if ( $dest_slice_start != 1 || $dest_slice_strand != 1 ) { + if ( $dest_slice_strand == 1 ) { + $seq_region_start = + $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = + $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_seq_region_id != $seq_region_id ) ) + { +# print STDERR "IGNORED DUE TO CUTOFF $dest_slice_seq_region_id ne $seq_region_id . $sr_name\n"; + next OPERON; + } + $slice = $dest_slice; + } ## end if ($dest_slice) + + push( @operons, + Bio::EnsEMBL::Operon->new( + -START => $seq_region_start, + -END => $seq_region_end, + -STRAND => $seq_region_strand, + -SLICE => $slice, + -DISPLAY_LABEL => $display_label, + -ADAPTOR => $self, + -DBID => $operon_id, + -STABLE_ID => $stable_id, + -VERSION => $version, + -CREATED_DATE => $created_date || undef, + -MODIFIED_DATE => $modified_date || undef, + -ANALYSIS => $analysis ) ); + + } ## end while ( $sth->fetch() ) + + return \@operons; +} ## end sub _objs_from_sth + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/OperonTranscriptAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/OperonTranscriptAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,876 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::OperonAdaptor - Database adaptor for the retrieval and +storage of OperonTranscript objects + +=head1 SYNOPSIS + + +my $operon_transcript_adaptor = Bio::EnsEMBL::DBSQL::OperonTranscriptAdaptor->new($dba); +$operon_transcript_adaptor->store($operon_transcript); +my $operon_transcript2 = $operon_transcript_adaptor->fetch_by_dbID( $operon->dbID() ); +my $operon_transcripts = $operon_transcript_adaptor->fetch_all_by_gene( $gene ); + +=head1 DESCRIPTION + +This is a database aware adaptor for the retrieval and storage of operon +transcript objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::OperonTranscriptAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Operon; +use Bio::EnsEMBL::OperonTranscript; +use Bio::EnsEMBL::Utils::SqlHelper; + +use vars '@ISA'; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : interna +# Status : Stable + +sub _tables { + return ( [ 'operon_transcript', 'o' ] ); +} + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _columns { + my ($self) = @_; + + my $created_date = + $self->db()->dbc()->from_date_to_seconds("o.created_date"); + my $modified_date = + $self->db()->dbc()->from_date_to_seconds("o.modified_date"); + + return ( 'o.operon_transcript_id', 'o.seq_region_id', + 'o.seq_region_start', 'o.seq_region_end', + 'o.seq_region_strand', 'o.display_label', + 'o.analysis_id', 'o.stable_id', + 'o.version', $created_date, + $modified_date ); +} + +=head2 list_dbIDs + + Example : @ot_ids = @{$ot_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all operon_transcripts in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : Listref of Ints + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_dbIDs { + my ( $self, $ordered ) = @_; + + return $self->_list_dbIDs( "operon_transcript", undef, $ordered ); +} + +=head2 list_stable_ids + + Example : @stable_ot_ids = @{$ot_adaptor->list_stable_ids()}; + Description: Gets an listref of stable ids for all operon_transcripts in the current db + Returntype : reference to a list of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_stable_ids { + my ($self) = @_; + + return $self->_list_dbIDs( "operon_transcript", "stable_id" ); +} + +sub list_seq_region_ids { + my $self = shift; + + return $self->_list_seq_region_ids('operon'); +} + +=head2 fetch_by_stable_id + + Arg [1] : String $id + The stable ID of the operon_transcript to retrieve + Example : $operon_transcript = $operon_transcript_adaptor->fetch_by_stable_id('ENSG00000148944'); + Description: Retrieves a operon_transcript object from the database via its stable id. + The operon_transcript will be retrieved in its native coordinate system (i.e. + in the coordinate system it is stored in the database). It may + be converted to a different coordinate system through a call to + transform() or transfer(). If the operon_transcript or exon is not found + undef is returned instead. + Returntype : Bio::EnsEMBL::OperonTranscript or undef + Exceptions : if we cant get the operon_transcript in given coord system + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ( $self, $stable_id ) = @_; + + my $constraint = "o.stable_id = ?"; + $self->bind_param_generic_fetch( $stable_id, SQL_VARCHAR ); + my ($operon_transcript) = @{ $self->generic_fetch($constraint) }; + + return $operon_transcript; +} + +=head2 fetch_by_name + + Arg [1] : String $label - name of operon transcript to fetch + Example : my $operon_transcript = $operonAdaptor->fetch_by_name("ECK0012121342"); + Description: Returns the operon transcript which has the given display label or undef if + there is none. If there are more than 1, only the first is + reported. + Returntype : Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_name { + my $self = shift; + my $label = shift; + + my $constraint = "o.display_label = ?"; + $self->bind_param_generic_fetch( $label, SQL_VARCHAR ); + my ($operon) = @{ $self->generic_fetch($constraint) }; + + return $operon; +} + +=head2 fetch_all + + Example : $operon_transcripts = $operon_adaptor->fetch_all(); + Description : Retrieves all operon transcripts stored in the database. + Returntype : listref of Bio::EnsEMBL::OperonTranscript + Caller : general + Status : At Risk + +=cut + +sub fetch_all { + my ($self) = @_; + + my $constraint = ''; + my @operon_transcripts = @{ $self->generic_fetch($constraint) }; + return \@operon_transcripts; +} + +=head2 fetch_all_versions_by_stable_id + + Arg [1] : String $stable_id + The stable ID of the operon_transcript to retrieve + Example : $operon_transcript = $operon_transcript_adaptor->fetch_all_versions_by_stable_id + ('ENSG00000148944'); + Description : Similar to fetch_by_stable_id, but retrieves all versions of a + operon_transcript stored in the database. + Returntype : listref of Bio::EnsEMBL::OperonTranscript + Exceptions : if we cant get the operon_transcript in given coord system + Caller : general + Status : At Risk + +=cut + +sub fetch_all_versions_by_stable_id { + my ( $self, $stable_id ) = @_; + + my $constraint = "o.stable_id = ?"; + $self->bind_param_generic_fetch( $stable_id, SQL_VARCHAR ); + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch operon_transcripts on. + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Arg [3] : (optional) boolean $load_transcripts + if true, transcripts will be loaded immediately rather than + lazy loaded later. + Arg [4] : (optional) string $source + the source name of the features to obtain. + Arg [5] : (optional) string biotype + the biotype of the features to obtain. + Example : @operon_transcripts = @{$operon_transcript_adaptor->fetch_all_by_Slice()}; + Description: Overrides superclass method to optionally load transcripts + immediately rather than lazy-loading them later. This + is more efficient when there are a lot of operon_transcripts whose + transcripts are going to be used. + Returntype : reference to list of operon_transcripts + Exceptions : thrown if exon cannot be placed on transcript slice + Caller : Slice::get_all_OperonTranscripts + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ( $self, $slice, $logic_name, $load_transcripts ) = @_; + + my $constraint = ''; + + my $operons = + $self->SUPER::fetch_all_by_Slice_constraint( $slice, $constraint, + $logic_name ); + + # If there are less than two operons, still do lazy-loading. + if ( !$load_transcripts || @$operons < 2 ) { + return $operons; + } + + # Preload all of the transcripts now, instead of lazy loading later, + # faster than one query per transcript. + + # First check if transcripts are already preloaded. + # FIXME: Should check all transcripts. + if ( exists( $operons->[0]->{'_operon_transcript_array'} ) ) { + return $operons; + } + + # Get extent of region spanned by transcripts. + my ( $min_start, $max_end ); + foreach my $o (@$operons) { + if ( !defined($min_start) || $o->seq_region_start() < $min_start ) { + $min_start = $o->seq_region_start(); + } + if ( !defined($max_end) || $o->seq_region_end() > $max_end ) { + $max_end = $o->seq_region_end(); + } + } + + my $ext_slice; + + if ( $min_start >= $slice->start() && $max_end <= $slice->end() ) { + $ext_slice = $slice; + } else { + my $sa = $self->db()->get_SliceAdaptor(); + $ext_slice = + $sa->fetch_by_region( $slice->coord_system->name(), + $slice->seq_region_name(), + $min_start, + $max_end, + $slice->strand(), + $slice->coord_system->version() ); + } + + # Associate transcript identifiers with operon_transcripts. + + my %o_hash = map { $_->dbID => $_ } @{$operons}; + + my $o_id_str = join( ',', keys(%o_hash) ); + + my $sth = + $self->prepare( "SELECT operon_id, operon_transcript_id " + . "FROM operon_transcript " + . "WHERE operon_id IN ($o_id_str)" ); + + $sth->execute(); + + my ( $o_id, $tr_id ); + $sth->bind_columns( \( $o_id, $tr_id ) ); + + my %tr_o_hash; + + while ( $sth->fetch() ) { + $tr_o_hash{$tr_id} = $o_hash{$o_id}; + } + + my $ta = $self->db()->get_OperonTranscriptAdaptor(); + my $transcripts = + $ta->fetch_all_by_Slice( $ext_slice, + 1, undef, + sprintf( "ot.operon_transcript_id IN (%s)", + join( ',', + sort { $a <=> $b } + keys(%tr_o_hash) ) ) ); + +# Move transcripts onto operon_transcript slice, and add them to operon_transcripts. + foreach my $tr ( @{$transcripts} ) { + if ( !exists( $tr_o_hash{ $tr->dbID() } ) ) { next } + + my $new_tr; + if ( $slice != $ext_slice ) { + $new_tr = $tr->transfer($slice); + if ( !defined($new_tr) ) { + throw("Unexpected. " + . "Transcript could not be transfered onto OperonTranscript slice." + ); + } + } else { + $new_tr = $tr; + } + + $tr_o_hash{ $tr->dbID() }->add_OperonTranscript($new_tr); + } + + return $operons; +} ## end sub fetch_all_by_Slice + +=head2 fetch_by_Operon + + Arg [1] : Bio::EnsEMBL::Operon + Example : $ot = $ot_adaptor->fetch_by_Operon($operon); + Description: Retrieves all operon transcripts belonging to an operon + Returntype : arrayref of Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Operon { + my ( $self, $operon ) = @_; + return $self->fetch_by_operon_id( $operon->dbID() ); +} + +=head2 fetch_by_operon_id + + Arg [1] : Int id + Example : $ot = $ot_adaptor->fetch_by_operon_transcript($operon); + Description: Retrieves all operon transcripts belonging to an operon + Returntype : arrayref of Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_operon_id { + my ( $self, $operon_id ) = @_; + + my $constraint = "o.operon_id = ?"; + $self->bind_param_generic_fetch( $operon_id, SQL_INTEGER ); + return $self->generic_fetch($constraint); +} + +=head2 fetch_genes_by_operon_transcript + + Arg [1] : Bio::EnsEMBL::OperonTranscript + Example : $ot = $ot_adaptor->fetch_genes_by_operon_transcript($operon_transcript); + Description: Retrieves all genes attached to an operon transcript + Returntype : arrayref of Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_genes_by_operon_transcript { + my ( $self, $operon_transcript ) = @_; + assert_ref( $operon_transcript, 'Bio::EnsEMBL::OperonTranscript' ); + return $self->fetch_genes_by_operon_transcript_id( + $operon_transcript->dbID() ); +} + +=head2 fetch_genes_by_operon_transcript_id + + Arg [1] : Int id + Example : $ot = $ot_adaptor->fetch_genes_by_operon_transcript($operon_transcript_id); + Description: Retrieves all genes attached to an operon transcript + Returntype : arrayref of Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_genes_by_operon_transcript_id { + my ( $self, $operon_transcript_id ) = @_; + my $helper = + Bio::EnsEMBL::Utils::SqlHelper->new( -DB_CONNECTION => $self->db->dbc() ); + + my $gene_ids = + $helper->execute_simple( + -SQL => +'SELECT gene_id FROM operon_transcript_gene tr WHERE operon_transcript_id =?', + -PARAMS => [$operon_transcript_id] ); + + my $genes = []; + my $gene_adaptor = $self->db()->get_GeneAdaptor(); + for my $gene_id (@$gene_ids) { + push @$genes, $gene_adaptor->fetch_by_dbID($gene_id); + } + return $genes; +} + +=head2 fetch_all_by_gene + + Arg [1] : Bio::EnsEMBL::Gene + Example : $ots = $ot_adaptor->fetch_all_by_gene($gene); + Description: Retrieves all operon transcripts attached to a given gene + Returntype : arrayref of Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_gene { + my ( $self, $gene ) = @_; + assert_ref( $gene, 'Bio::EnsEMBL::Gene' ); + return $self->fetch_all_by_gene_id( $gene->dbID() ); +} + +=head2 fetch_all_by_gene_id + + Arg [1] : Int id of Bio::EnsEMBL::Gene + Example : $ots = $ot_adaptor->fetch_all_by_gene($gene); + Description: Retrieves all operon transcripts attached to a given gene + Returntype : arrayref of Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_gene_id { + my ( $self, $gene_id ) = @_; + my $helper = + Bio::EnsEMBL::Utils::SqlHelper->new( -DB_CONNECTION => $self->db->dbc() ); + + my $ot_ids = $helper->execute_simple( + -SQL => +'SELECT operon_transcript_id FROM operon_transcript_gene tr WHERE gene_id =?', + -PARAMS => [$gene_id] ); + + my $ots = []; + for my $ot_id (@$ot_ids) { + push @$ots, $self->fetch_by_dbID($ot_id); + } + return $ots; +} + +=head2 store + + Arg [1] : Bio::EnsEMBL::OperonTranscript $gene + The gene to store in the database + Arg [2] : ignore_release in xrefs [default 1] set to 0 to use release info + in external database references + Example : $gene_adaptor->store($gene); + Description: Stores a gene in the database. + Returntype : the database identifier (dbID) of the newly stored gene + Exceptions : thrown if the $gene is not a Bio::EnsEMBL::OperonTranscript or if + $gene does not have an analysis object + Caller : general + Status : Stable + +=cut + +sub store { + my ( $self, $operon_transcript, $operon_id ) = @_; + + assert_ref( $operon_transcript, 'Bio::EnsEMBL::OperonTranscript' ); + + my $db = $self->db(); + + if ( $operon_transcript->is_stored($db) ) { + return $operon_transcript->dbID(); + } + + # ensure coords are correct before storing + #$operon->recalculate_coordinates(); + + my $seq_region_id; + + ( $operon_transcript, $seq_region_id ) = + $self->_pre_store($operon_transcript); + my $analysis = $operon_transcript->analysis(); + throw("OperonTranscripts must have an analysis object.") + if ( !defined($analysis) ); + my $analysis_id; + if ( $analysis->is_stored($db) ) { + $analysis_id = $analysis->dbID(); + } else { + $analysis_id = $db->get_AnalysisAdaptor->store($analysis); + } + my $store_operon_transcript_sql = qq( + INSERT INTO operon_transcript + SET seq_region_id = ?, + seq_region_start = ?, + seq_region_end = ?, + seq_region_strand = ?, + display_label = ?, + operon_id = ?, + analysis_id =? + ); + + if ( defined($operon_transcript->stable_id()) ) { + my $created = $self->db->dbc->from_seconds_to_date($operon_transcript->created_date()); + my $modified = $self->db->dbc->from_seconds_to_date($operon_transcript->modified_date()); + $store_operon_transcript_sql .= ", stable_id = ?, version = ?, created_date = " . $created . ",modified_date = " . $modified; + } + + + # column status is used from schema version 34 onwards (before it was + # confidence) + + my $sth = $self->prepare($store_operon_transcript_sql); + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->bind_param( 2, $operon_transcript->start(), SQL_INTEGER ); + $sth->bind_param( 3, $operon_transcript->end(), SQL_INTEGER ); + $sth->bind_param( 4, $operon_transcript->strand(), SQL_TINYINT ); + $sth->bind_param( 5, $operon_transcript->display_label(), SQL_VARCHAR ); + $sth->bind_param( 6, $operon_id, SQL_INTEGER ); + $sth->bind_param( 7, $analysis_id, SQL_INTEGER ); + + if ( defined($operon_transcript->stable_id()) ) { + $sth->bind_param( 8, $operon_transcript->stable_id(), SQL_VARCHAR ); + my $version = ($operon_transcript->version()) ? $operon_transcript->version() : 1; + $sth->bind_param( 9, $version, SQL_INTEGER ); + } + + $sth->execute(); + $sth->finish(); + + my $operon_transcript_dbID = $sth->{'mysql_insertid'}; + + # store the dbentries associated with this gene + my $dbEntryAdaptor = $db->get_DBEntryAdaptor(); + + foreach my $dbe ( @{ $operon_transcript->get_all_DBEntries } ) { + $dbEntryAdaptor->store( $dbe, $operon_transcript_dbID, + "OperonTranscript" ); + } + + # store operon attributes if there are any + my $attrs = $operon_transcript->get_all_Attributes(); + if ( $attrs && scalar @$attrs ) { + my $attr_adaptor = $db->get_AttributeAdaptor(); + $attr_adaptor->store_on_OperonTranscript( $operon_transcript, $attrs ); + } + + # set the adaptor and dbID on the original passed in gene not the + # transfered copy + $operon_transcript->adaptor($self); + $operon_transcript->dbID($operon_transcript_dbID); + + if ( defined $operon_transcript->{_gene_array} ) { + $self->store_genes_on_OperonTranscript( $operon_transcript, + $operon_transcript->{_gene_array} ); + } + + return $operon_transcript_dbID; +} ## end sub store + +=head2 store_genes_on_OperonTranscript + + Arg [1] : Bio::EnsEMBL::OperonTranscript $ot + the operon_transcript to store genes on + Arg [2] : arrayref of Bio::EnsEMBL::Gene $gene + the genes to store on operon transcript + Example : $ot_adaptor->store_genes_on_OperonTranscript(\@genes); + Description: Associates genes with operon transcript + Returntype : none + Exceptions : throw on incorrect arguments + warning if operon_transcript is not stored in this database + Caller : general, store + Status : Stable + +=cut + +sub store_genes_on_OperonTranscript { + my ( $self, $operon_transcript, $genes ) = @_; + assert_ref( $operon_transcript, "Bio::EnsEMBL::OperonTranscript" ); + my $sth = $self->prepare( +'insert into operon_transcript_gene(operon_transcript_id,gene_id) values(' + . $operon_transcript->dbID() + . ',?)' ); + for my $gene ( @{$genes} ) { + assert_ref( $gene, "Bio::EnsEMBL::Gene" ); + $sth->bind_param( 1, $gene->dbID(), SQL_INTEGER ); + $sth->execute(); + } + $sth->finish(); + return; +} + +=head2 remove + + Arg [1] : Bio::EnsEMBL::OperonTranscript $ot + the operon_transcript to remove from the database + Example : $ot_adaptor->remove($ot); + Description: Removes a operon transcript completely from the database. + Returntype : none + Exceptions : throw on incorrect arguments + warning if operon_transcript is not stored in this database + Caller : general + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $operon_transcript = shift; + + assert_ref( $operon_transcript, 'Bio::EnsEMBL::OperonTranscript' ); + + if ( !$operon_transcript->is_stored( $self->db() ) ) { + warning( "Cannot remove operon transcript " + . $operon_transcript->dbID() + . ". Is not stored in " + . "this database." ); + return; + } + + # remove all object xrefs associated with this gene + + my $dbe_adaptor = $self->db()->get_DBEntryAdaptor(); + foreach my $dbe ( @{ $operon_transcript->get_all_DBEntries() } ) { + $dbe_adaptor->remove_from_object( $dbe, $operon_transcript, + 'OperonTranscript' ); + } + + # # remove the attributes associated with this transcript + # my $attrib_adaptor = $self->db->get_AttributeAdaptor; + # $attrib_adaptor->remove_from_OperonTranscript($operon_transcript); + + # remove from the database + my $sth = $self->prepare( + "DELETE FROM operon_transcript WHERE operon_transcript_id = ? "); + $sth->bind_param( 1, $operon_transcript->dbID, SQL_INTEGER ); + $sth->execute(); + $sth->finish(); + + # unset the gene identifier and adaptor thereby flagging it as unstored + + $operon_transcript->dbID(undef); + $operon_transcript->adaptor(undef); + + return; +} ## end sub remove + +# _objs_from_sth + +# Arg [1] : StatementHandle $sth +# Arg [2] : Bio::EnsEMBL::AssemblyMapper $mapper +# Arg [3] : Bio::EnsEMBL::Slice $dest_slice +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of OperonTranscripts +# Returntype : listref of Bio::EnsEMBL::OperonTranscripts in target coordinate system +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _objs_from_sth { + my ( $self, $sth, $mapper, $dest_slice ) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @operons; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + my ( $stable_id, $version, $created_date, $modified_date ); + + my ( $operon_transcript_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $display_label, + $analysis_id ); + + $sth->bind_columns( \$operon_transcript_id, \$seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$display_label, + \$analysis_id, \$stable_id, + \$version, \$created_date, + \$modified_date ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_seq_region_id; + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_seq_region_id = $dest_slice->get_seq_region_id(); + } + + my $count = 0; + OPERON: while ( $sth->fetch() ) { + $count++; + # #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + $analysis_hash{$analysis_id} = $analysis; + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + #get the slice object + my $slice = $slice_hash{ "ID:" . $seq_region_id }; + + if ( !$slice ) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{ "ID:" . $seq_region_id } = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if ($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + + #skip features that map to gaps or coord system boundaries + next OPERON if ( !defined($seq_region_id) ); + + #get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs + || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) + { + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } else { + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if ($dest_slice) { + if ( $dest_slice_start != 1 || $dest_slice_strand != 1 ) { + if ( $dest_slice_strand == 1 ) { + $seq_region_start = + $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = + $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_seq_region_id != $seq_region_id ) ) + { +# print STDERR "IGNORED DUE TO CUTOFF $dest_slice_seq_region_id ne $seq_region_id . $sr_name\n"; + next OPERON; + } + $slice = $dest_slice; + } ## end if ($dest_slice) + + push( @operons, + Bio::EnsEMBL::OperonTranscript->new( + -START => $seq_region_start, + -END => $seq_region_end, + -STRAND => $seq_region_strand, + -SLICE => $slice, + -DISPLAY_LABEL => $display_label, + -ADAPTOR => $self, + -DBID => $operon_transcript_id, + -STABLE_ID => $stable_id, + -VERSION => $version, + -CREATED_DATE => $created_date || undef, + -MODIFIED_DATE => $modified_date || undef, + -ANALYSIS => $analysis ) ); + + } ## end while ( $sth->fetch() ) + + return \@operons; +} ## end sub _objs_from_sth + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/PredictionExonAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/PredictionExonAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,468 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::PredictionExonAdaptor - Performs database interaction for +PredictionExons. + +=head1 SYNOPSIS + + $pea = $database_adaptor->get_PredictionExonAdaptor(); + $pexon = $pea->fetch_by_dbID(); + + my $slice = + $database_adaptor->get_SliceAdaptor->fetch_by_region( 'X', 1, 1e6 ); + + my @pexons = @{ $pea->fetch_all_by_Slice($slice) }; + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::PredictionExonAdaptor; + +use vars qw( @ISA ); +use strict; + + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::PredictionExon; +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); + + +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor ); + + +#_tables +# +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns the names, aliases of the tables to use for queries +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# + +sub _tables { + return ([ 'prediction_exon', 'pe' ] ); +} + + + +#_columns +# +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns a list of columns to use for queries +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + return qw( pe.prediction_exon_id + pe.seq_region_id + pe.seq_region_start + pe.seq_region_end + pe.seq_region_strand + pe.start_phase + pe.score + pe.p_value ); +} + + +# _final_clause +# +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns a default end for the SQL-query (ORDER BY) +# Returntype : string +# Exceptions : none +# Caller : internal + +sub _final_clause { + return "ORDER BY pe.prediction_transcript_id, pe.exon_rank"; +} + + +=head2 fetch_all_by_PredictionTranscript + + Arg [1] : Bio::EnsEMBL::PredcitionTranscript $transcript + Example : none + Description: Retrieves all Exons for the Transcript in 5-3 order + Returntype : listref Bio::EnsEMBL::Exon on Transcript slice + Exceptions : throws if transcript does not have a slice + Caller : Transcript->get_all_Exons() + Status : Stable + +=cut + +sub fetch_all_by_PredictionTranscript { + my ( $self, $transcript ) = @_; + my $constraint = "pe.prediction_transcript_id = ".$transcript->dbID(); + + # use 'keep_all' option to keep exons that are off end of slice + + my $tslice = $transcript->slice(); + my $slice; + + if(!$tslice) { + throw("Transcript must have attached slice to retrieve exons."); + } + + # use a small slice the same size as the prediction transcript + $slice = $self->db->get_SliceAdaptor->fetch_by_Feature($transcript); + + my $exons = $self->fetch_all_by_Slice_constraint($slice, $constraint); + + # remap exon coordinates if necessary + if($slice->name() ne $tslice->name()) { + my @out; + foreach my $ex (@$exons) { + push @out, $ex->transfer($tslice); + } + $exons = \@out; + } + + return $exons; +} + + + +=head2 store + + Arg [1] : Bio::EnsEMBL::PredictionExon $exon + The exon to store in this database + Arg [2] : int $prediction_transcript_id + The internal identifier of the prediction exon that that this + exon is associated with. + Arg [3] : int $rank + The rank of the exon in the transcript (starting at 1) + Example : $pexon_adaptor->store($pexon, 1211, 2); + Description: Stores a PredictionExon in the database + Returntype : none + Exceptions : thrown if exon does not have a slice attached + or if $exon->start, $exon->end, $exon->strand, or $exon->phase + are not defined or if $exon is not a Bio::EnsEMBL::PredictionExon + Caller : general + Status : Stable + +=cut + +sub store { + my ( $self, $pexon, $pt_id, $rank ) = @_; + + if(!ref($pexon) || !$pexon->isa('Bio::EnsEMBL::PredictionExon') ) { + throw("Expected PredictionExon argument"); + } + + throw("Expected PredictionTranscript id argument.") if(!$pt_id); + throw("Expected rank argument.") if(!$rank); + + my $db = $self->db(); + + if($pexon->is_stored($db)) { + warning('PredictionExon is already stored in this DB.'); + return $pexon->dbID(); + } + + if( ! $pexon->start || ! $pexon->end || + ! $pexon->strand || ! defined $pexon->phase ) { + throw("PredictionExon does not have all attributes to store.\n" . + "start, end, strand and phase attributes must be set."); + } + + #maintain reference to original passed-in prediction exon + my $original = $pexon; + my $seq_region_id; + ($pexon, $seq_region_id) = $self->_pre_store($pexon); + + my $sth = $db->dbc->prepare + ("INSERT into prediction_exon (prediction_transcript_id, exon_rank, " . + "seq_region_id, seq_region_start, seq_region_end, " . + "seq_region_strand, start_phase, score, p_value) " . + "VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ? )"); + + $sth->bind_param(1,$pt_id,SQL_INTEGER); + $sth->bind_param(2,$rank,SQL_SMALLINT); + $sth->bind_param(3,$seq_region_id,SQL_INTEGER); + $sth->bind_param(4,$pexon->start,SQL_INTEGER); + $sth->bind_param(5,$pexon->end,SQL_INTEGER); + $sth->bind_param(6,$pexon->strand,SQL_TINYINT); + $sth->bind_param(7,$pexon->phase,SQL_TINYINT); + $sth->bind_param(8,$pexon->score,SQL_DOUBLE); + $sth->bind_param(9,$pexon->p_value,SQL_DOUBLE); + + $sth->execute(); + + my $dbID = $sth->{'mysql_insertid'}; + + #set the adaptor and dbID of the object they passed in + $original->dbID($dbID); + $original->adaptor($self); + + return $dbID; +} + + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::PredictionExon $exon + the exon to remove from the database + Example : $exon_adaptor->remove($exon); + Description: Removes an exon from the database + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $pexon = shift; + + my $db = $self->db(); + + if(!$pexon->is_stored($db)) { + warning('PredictionExon is not in this DB - not removing'); + return undef; + } + + my $sth = $self->prepare( + "DELETE FROM prediction_exon WHERE prediction_exon_id = ?"); + $sth->bind_param( 1, $pexon->dbID, SQL_INTEGER ); + $sth->execute(); + + $pexon->dbID(undef); + $pexon->adaptor(undef); +} + + + +=head2 list_dbIDs + + Arg [1] : none + Example : @exon_ids = @{$exon_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all exons in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("prediction_exon",undef, $ordered); +} + + + +#_objs_from_sth + +# Arg [1] : Hashreference $hashref +# Example : none +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of Genes +# Returntype : listref of Bio::EnsEMBL::Genes in target coordinate system +# Exceptions : none +# Caller : internal +# + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + my $sa = $self->db()->get_SliceAdaptor(); + + my @exons; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($prediction_exon_id,$seq_region_id, + $seq_region_start, $seq_region_end, $seq_region_strand, + $start_phase, $score, $p_value); + + $sth->bind_columns(\$prediction_exon_id,\$seq_region_id, + \$seq_region_start, \$seq_region_end, \$seq_region_strand, + \$start_phase, \$score, \$p_value); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_cs; + my $asma; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_cs = $dest_slice->coord_system; + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + $asma = $self->db->get_AssemblyMapperAdaptor(); + } + + FEATURE: while($sth->fetch()) { + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + my $dest_mapper = $mapper; + + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + #obtain a mapper if none was defined, but a dest_seq_region was + if(!$dest_mapper && $dest_slice && + !$dest_slice_cs->equals($slice->coord_system)) { + $dest_mapper = $asma->fetch_by_CoordSystems($dest_slice_cs, + $slice->coord_system); + $asm_cs = $dest_mapper->assembled_CoordSystem(); + $cmp_cs = $dest_mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($dest_mapper) { + + if (defined $dest_slice && $dest_mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $dest_mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = $dest_mapper->fastmap( $sr_name, $seq_region_start, + $seq_region_end, $seq_region_strand, + $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to +# if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + next FEATURE; + } + + $slice = $dest_slice; + } + + # Finally, create the new PredictionExon. + push( @exons, + $self->_create_feature( 'Bio::EnsEMBL::PredictionExon', { + '-start' => $seq_region_start, + '-end' => $seq_region_end, + '-strand' => $seq_region_strand, + '-adaptor' => $self, + '-slice' => $slice, + '-dbID' => $prediction_exon_id, + '-phase' => $start_phase, + '-score' => $score, + '-p_value' => $p_value + } ) ); + + } + + return \@exons; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/PredictionTranscriptAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/PredictionTranscriptAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,587 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::PredictionTranscriptAdaptor - +Performs database interaction related to PredictionTranscripts + +=head1 SYNOPSIS + + # get a prediction transcript adaptor from the database + $pta = $database_adaptor->get_PredictionTranscriptAdaptor(); + + # get a slice on a region of chromosome 1 + $sa = $database_adaptor->get_SliceAdaptor(); + + $slice = $sa->fetch_by_region( 'chromosome', 'x', 100000, 200000 ); + + # get all the prediction transcripts from the slice region + $prediction_transcripts = @{ $pta->fetch_all_by_Slice($slice) }; + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::PredictionTranscriptAdaptor; + +use vars qw( @ISA ); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::AnalysisAdaptor; +use Bio::EnsEMBL::PredictionTranscript; +use Bio::EnsEMBL::Utils::Exception qw(deprecate throw warning); + +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor ); + + +# _tables +# +# Arg [1] : none +# Example : none +# Description: Implements abstract superclass method to define the table used +# to retrieve prediction transcripts from the database +# Returntype : string +# Exceptions : none +# Caller : generic_fetch + +sub _tables { + my $self = shift; + + return ['prediction_transcript', 'pt']; +} + + +# _columns + +# Arg [1] : none +# Example : none +# Description: Implements abstract superclass method to define the columns +# retrieved in database queries used to create prediction +# transcripts. +# Returntype : list of strings +# Exceptions : none +# Caller : generic_fetch +# + +sub _columns { + my $self = shift; + + return qw( pt.prediction_transcript_id + pt.seq_region_id + pt.seq_region_start + pt.seq_region_end + pt.seq_region_strand + pt.analysis_id + pt.display_label); +} + + +=head2 fetch_by_stable_id + + Arg [1] : string $stable_id + The stable id of the transcript to retrieve + Example : $trans = $trans_adptr->fetch_by_stable_id('GENSCAN00000001234'); + Description: Retrieves a prediction transcript via its display_label. + This method is called fetch_by_stable_id for polymorphism with + the TranscriptAdaptor. Prediction transcript display_labels are + not necessarily stable in that the same identifier may be reused + for a completely different prediction transcript in a subsequent + database release. + Returntype : Bio::EnsEMBL::PredictionTranscript + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my $self = shift; + my $stable_id = shift; + + throw('Stable_id argument expected') if(!$stable_id); + + my $syn = $self->_tables()->[1]; + + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + my $pts = $self->generic_fetch("$syn.display_label = ?"); + + return (@$pts) ? $pts->[0] : undef; +} + + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch transcripts on. + Arg [3] : (optional) boolean $load_exons + if true, exons will be loaded immediately rather than + lazy loaded later. + Example : $transcripts = $ + Description: Overrides superclass method to optionally load exons + immediately rather than lazy-loading them later. This + is more efficient when there are a lot of transcripts whose + exons are going to be used. + Returntype : reference to list of transcripts + Exceptions : thrown if exon cannot be placed on transcript slice + Caller : Slice::get_all_Transcripts + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my $self = shift; + my $slice = shift; + my $logic_name = shift; + my $load_exons = shift; + + my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name); + + # if there are 0 or 1 transcripts still do lazy-loading + if(!$load_exons || @$transcripts < 2) { + return $transcripts; + } + + # preload all of the exons now, instead of lazy loading later + # faster than 1 query per transcript + + # get extent of region spanned by transcripts + my ($min_start, $max_end); + foreach my $tr (@$transcripts) { + if(!defined($min_start) || $tr->seq_region_start() < $min_start) { + $min_start = $tr->seq_region_start(); + } + if(!defined($max_end) || $tr->seq_region_end() > $max_end) { + $max_end = $tr->seq_region_end(); + } + } + +# mades no sense, the limit for the slice will be defined by the transcripts +# $min_start += $slice->start() - 1; +# $max_end += $slice->start() - 1; + + my $ext_slice; + + if($min_start >= $slice->start() && $max_end <= $slice->end()) { + $ext_slice = $slice; + } else { + my $sa = $self->db()->get_SliceAdaptor(); + $ext_slice = $sa->fetch_by_region + ($slice->coord_system->name(), $slice->seq_region_name(), + $min_start,$max_end, $slice->strand(), $slice->coord_system->version()); + } + + # associate exon identifiers with transcripts + + my %tr_hash = map {$_->dbID => $_} @$transcripts; + + my $tr_id_str = '(' . join(',', keys %tr_hash) . ')'; + + my $sth = $self->prepare + ("SELECT prediction_transcript_id, prediction_exon_id, exon_rank " . + "FROM prediction_exon " . + "WHERE prediction_transcript_id IN $tr_id_str"); + + $sth->execute(); + + my ($ex_id, $tr_id, $rank); + $sth->bind_columns(\$tr_id, \$ex_id, \$rank); + + my %ex_tr_hash; + + while($sth->fetch()) { + $ex_tr_hash{$ex_id} ||= []; + push @{$ex_tr_hash{$ex_id}}, [$tr_hash{$tr_id}, $rank]; + } + + $sth->finish(); + + my $ea = $self->db()->get_PredictionExonAdaptor(); + my $exons = $ea->fetch_all_by_Slice($ext_slice); + + # move exons onto transcript slice, and add them to transcripts + foreach my $ex (@$exons) { + $ex = $ex->transfer($slice) if($slice != $ext_slice); + + if(!$ex) { + throw("Unexpected. PredictionExon could not be transfered onto " . + "PredictionTranscript slice."); + } + + foreach my $row (@{$ex_tr_hash{$ex->dbID()}}) { + my ($tr, $rank) = @$row; + $tr->add_Exon($ex, $rank); + } + } + + return $transcripts; +} + + + + + +=head2 _objs_from_sth + + Arg [1] : DBI:st $sth + An executed DBI statement handle + Arg [2] : (optional) Bio::EnsEMBL::Mapper $mapper + An mapper to be used to convert contig coordinates + to assembly coordinates. + Arg [3] : (optional) Bio::EnsEMBL::Slice $slice + A slice to map the prediction transcript to. + Example : $p_transcripts = $self->_objs_from_sth($sth); + Description: Creates a list of Prediction transcripts from an executed DBI + statement handle. The columns retrieved via the statement + handle must be in the same order as the columns defined by the + _columns method. If the slice argument is provided then the + the prediction transcripts will be in returned in the coordinate + system of the $slice argument. Otherwise the prediction + transcripts will be returned in the RawContig coordinate system. + Returntype : reference to a list of Bio::EnsEMBL::PredictionTranscripts + Exceptions : none + Caller : superclass generic_fetch + Status : Stable + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db()->get_AnalysisAdaptor(); + + my @ptranscripts; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ($prediction_transcript_id, + $seq_region_id, + $seq_region_start, + $seq_region_end, + $seq_region_strand, + $analysis_id, + $display_label); + + $sth->bind_columns(\$prediction_transcript_id, + \$seq_region_id, + \$seq_region_start, + \$seq_region_end, + \$seq_region_strand, + \$analysis_id, + \$display_label); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + FEATURE: while($sth->fetch()) { + + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to +# if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + next FEATURE; + } + + + $slice = $dest_slice; + } + + # Finally, create the new PredictionTranscript. + push( @ptranscripts, + $self->_create_feature('Bio::EnsEMBL::PredictionTranscript', { + '-start' => $seq_region_start, + '-end' => $seq_region_end, + '-strand' => $seq_region_strand, + '-adaptor' => $self, + '-slice' => $slice, + '-analysis' => $analysis, + '-dbID' => $prediction_transcript_id, + '-display_label' => $display_label + } ) ); + + } + + return \@ptranscripts; +} + + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::PredictionTranscript @pre_transcripts + Example : $prediction_transcript_adaptor->store(@pre_transcripts); + Description: Stores a list of given prediction transcripts in database. + Puts dbID and Adaptor into each object stored object. + Returntype : none + Exceptions : on wrong argument type + Caller : general + Status : Stable + +=cut + +sub store { + my ( $self, @pre_transcripts ) = @_; + + my $ptstore_sth = $self->prepare + (qq{INSERT INTO prediction_transcript (seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + analysis_id, display_label) + VALUES( ?, ?, ?, ?, ?, ?)}); + + my $ptupdate_sth = $self->prepare + (qq{UPDATE prediction_transcript SET display_label = ? + WHERE prediction_transcript_id = ?}); + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + my $pexon_adaptor = $db->get_PredictionExonAdaptor(); + + FEATURE: foreach my $pt (@pre_transcripts) { + if(!ref($pt) || !$pt->isa('Bio::EnsEMBL::PredictionTranscript')) { + throw('Expected PredictionTranscript argument not [' . ref($pt).']'); + } + + #skip prediction transcripts that have already been stored + if($pt->is_stored($db)) { + warning('Not storing already stored prediction transcript '. $pt->dbID); + next FEATURE; + } + + #get analysis and store it if it is not in the db + my $analysis = $pt->analysis(); + if(!$analysis) { + throw('Prediction transcript must have analysis to be stored.'); + } + if(!$analysis->is_stored($db)) { + $analysis_adaptor->store($analysis); + } + + #ensure that the transcript coordinates are correct, they may not be, + #if somebody has done some exon coordinate juggling and not recalculated + #the transcript coords. + $pt->recalculate_coordinates(); + + my $original = $pt; + my $seq_region_id; + ($pt, $seq_region_id) = $self->_pre_store($pt); + + #store the prediction transcript + $ptstore_sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $ptstore_sth->bind_param(2,$pt->start,SQL_INTEGER); + $ptstore_sth->bind_param(3,$pt->end,SQL_INTEGER); + $ptstore_sth->bind_param(4,$pt->strand,SQL_TINYINT); + $ptstore_sth->bind_param(5,$analysis->dbID,SQL_INTEGER); + $ptstore_sth->bind_param(6,$pt->display_label,SQL_VARCHAR); + + $ptstore_sth->execute(); + + my $pt_id = $ptstore_sth->{'mysql_insertid'}; + $original->dbID($pt_id); + $original->adaptor($self); + + #store the exons + my $rank = 1; + foreach my $pexon (@{$original->get_all_Exons}) { + $pexon_adaptor->store($pexon, $pt_id, $rank++); + } + + # if a display label was not defined autogenerate one + if(!defined($pt->display_label())) { + my $zeros = '0' x (11 - length($pt_id)); + my $display_label = uc($analysis->logic_name()) . $zeros . $pt_id; + $ptupdate_sth->bind_param(1,$display_label,SQL_VARCHAR); + $ptupdate_sth->bind_param(2,$pt_id,SQL_INTEGER); + $ptupdate_sth->execute(); + $original->display_label($display_label); + } + } +} + + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::PredictionTranscript $pt + Example : $prediction_transcript_adaptor->remove($pt); + Description: removes given prediction transcript $pt from database. + Returntype : none + Exceptions : throws if argument not a Bio::EnsEMBL::PredictionTranscript + Caller : general + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $pre_trans = shift; + + if(!ref($pre_trans)||!$pre_trans->isa('Bio::EnsEMBL::PredictionTranscript')){ + throw('Expected PredictionTranscript argument.'); + } + + if(!$pre_trans->is_stored($self->db())) { + warning('PredictionTranscript is not stored in this DB - not removing.'); + return; + } + + #remove all associated prediction exons + my $pexon_adaptor = $self->db()->get_PredictionExonAdaptor(); + foreach my $pexon (@{$pre_trans->get_all_Exons}) { + $pexon_adaptor->remove($pexon); + } + + #remove the prediction transcript + my $sth = $self->prepare( "DELETE FROM prediction_transcript + WHERE prediction_transcript_id = ?" ); + $sth->bind_param(1,$pre_trans->dbID,SQL_INTEGER); + $sth->execute(); + + #unset the adaptor and internal id + $pre_trans->dbID(undef); + $pre_trans->adaptor(undef); +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$prediction_transcript_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all prediction transcript + features in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("prediction_transcript", undef, $ordered); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProteinAlignFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProteinAlignFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,406 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ProteinAlignFeatureAdaptor - +Adaptor for ProteinAlignFeatures + +=head1 SYNOPSIS + + $pafa = + $registry->get_adaptor( 'Human', 'Core', 'ProteinAlignFeature' ); + + my @features = @{ $pafa->fetch_all_by_Slice($slice) }; + + $pafa->store(@features); + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::ProteinAlignFeatureAdaptor; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor; +use Bio::EnsEMBL::DnaPepAlignFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor); + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::DnaPepAlignFeature @feats + Example : $protein_align_feature_adaptor->store(@feats); + Description: stores a list of ProteinAlignFeatures in the database + Returntype : none + Exceptions : throw if any of the provided features cannot be stored + which may occur if: + * The feature does not have an associated Slice + * The feature does not have an associated analysis + * The Slice the feature is associated with is on a seq_region + unknown to this database + A warning is given if: + * The feature has already been stored in this db + Caller : Pipeline + Status : Stable + +=cut + + +sub store{ + my ($self, @feats) = @_; + + throw("Must call store with features") if( scalar(@feats) == 0 ); + + my @tabs = $self->_tables; + my ($tablename) = @{$tabs[0]}; + + my $db = $self->db(); + my $slice_adaptor = $db->get_SliceAdaptor(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + my $sth = $self->prepare( + "INSERT INTO $tablename (seq_region_id, seq_region_start, seq_region_end, + seq_region_strand, hit_start, hit_end, + hit_name, cigar_line, + analysis_id, score, evalue, perc_ident, external_db_id, hcoverage) + VALUES (?,?,?,?,?,?,?,?,?,?, ?, ?, ?, ?)"); + + FEATURE: foreach my $feat ( @feats ) { + if( !ref $feat || !$feat->isa("Bio::EnsEMBL::DnaPepAlignFeature") ) { + throw("feature must be a Bio::EnsEMBL::DnaPepAlignFeature," + . " not a [".ref($feat)."]."); + } + + if($feat->is_stored($db)) { + warning("PepDnaAlignFeature [".$feat->dbID."] is already stored" . + " in this database."); + next FEATURE; + } + + #sanity check the hstart and hend + my $hstart = $feat->hstart(); + my $hend = $feat->hend(); + $self->_check_start_end_strand($hstart,$hend,1); + + my $cigar_string = $feat->cigar_string(); + if(!$cigar_string) { + $cigar_string = $feat->length() . 'M'; + warning("DnaPepAlignFeature does not define a cigar_string.\n" . + "Assuming ungapped block with cigar_string=$cigar_string\n"); + } + + my $hseqname = $feat->hseqname(); + if(!$hseqname) { + throw("DnaPepAlignFeature must define an hseqname."); + } + + if(!defined($feat->analysis)) { + throw("An analysis must be attached to the features to be stored."); + } + + #store the analysis if it has not been stored yet + if(!$feat->analysis->is_stored($db)) { + $analysis_adaptor->store($feat->analysis()); + } + + my $slice = $feat->slice(); + if(!defined($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("A slice must be attached to the features to be stored."); + } + + my $original = $feat; + my $seq_region_id; + ($feat, $seq_region_id) = $self->_pre_store($feat); + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$feat->start,SQL_INTEGER); + $sth->bind_param(3,$feat->end,SQL_INTEGER); + $sth->bind_param(4,$feat->strand,SQL_TINYINT); + $sth->bind_param(5,$feat->hstart,SQL_INTEGER); + $sth->bind_param(6,$feat->hend,SQL_INTEGER); + $sth->bind_param(7,$feat->hseqname,SQL_VARCHAR); + $sth->bind_param(8,$feat->cigar_string,SQL_LONGVARCHAR); + $sth->bind_param(9,$feat->analysis->dbID,SQL_INTEGER); + $sth->bind_param(10,$feat->score,SQL_DOUBLE); + $sth->bind_param(11,$feat->p_value,SQL_DOUBLE); + $sth->bind_param(12,$feat->percent_id,SQL_REAL); + $sth->bind_param(13,$feat->external_db_id,SQL_INTEGER); + $sth->bind_param(14,$feat->hcoverage,SQL_DOUBLE); + + $sth->execute(); + $original->dbID($sth->{'mysql_insertid'}); + $original->adaptor($self); + } + + $sth->finish(); +} + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle $sth + an exectuted DBI statement handle generated by selecting + the columns specified by _columns() from the table specified + by _table() + Example : @dna_dna_align_feats = $self->_obj_from_hashref + Description: PROTECTED implementation of superclass abstract method. + Creates DnaDnaAlignFeature objects from a DBI hashref + Returntype : listref of Bio::EnsEMBL::ProteinAlignFeatures + Exceptions : none + Caller : Bio::EnsEMBL::BaseFeatureAdaptor::generic_fetch + Status : Stable + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @features; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ($protein_align_feature_id, $seq_region_id, $seq_region_start, + $seq_region_end, $analysis_id, $seq_region_strand, $hit_start, + $hit_end, $hit_name, $cigar_line, $evalue, $perc_ident, $score, + $external_db_id, $hcoverage, $external_db_name, $external_display_db_name ); + + $sth->bind_columns(\$protein_align_feature_id, \$seq_region_id, + \$seq_region_start,\$seq_region_end, \$analysis_id, + \$seq_region_strand, \$hit_start,\$hit_end, \$hit_name, + \$cigar_line, \$evalue, \$perc_ident, \$score, + \$external_db_id, \$hcoverage, \$external_db_name, \$external_display_db_name ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + FEATURE: while($sth->fetch()) { + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + # #get a slice in the coord system we just mapped to + # if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"ID:".$seq_region_id} ||= +# $sa->fetch_by_seq_region_id($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + next FEATURE; + } + $slice = $dest_slice; + } + + # Finally, create the new ProteinAlignFeature. + push( + @features, + $self->_create_feature_fast( + 'Bio::EnsEMBL::DnaPepAlignFeature', { + 'slice' => $slice, + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'hseqname' => $hit_name, + 'hstart' => $hit_start, + 'hend' => $hit_end, + 'hstrand' => 1, # dna_pep_align features + # are always hstrand 1 + 'score' => $score, + 'p_value' => $evalue, + 'percent_id' => $perc_ident, + 'cigar_string' => $cigar_line, + 'analysis' => $analysis, + 'adaptor' => $self, + 'dbID' => $protein_align_feature_id, + 'external_db_id' => $external_db_id, + 'hcoverage' => $hcoverage, + 'dbname' => $external_db_name, + 'db_display_name' => $external_display_db_name + } ) ); + + } + + return \@features; +} + + + +sub _tables { + my $self = shift; + + return (['protein_align_feature', 'paf'], ['external_db','exdb']); +} + + +sub _columns { + my $self = shift; + + #warning _objs_from_hashref method depends on ordering of this list + return qw( paf.protein_align_feature_id + paf.seq_region_id + paf.seq_region_start + paf.seq_region_end + paf.analysis_id + paf.seq_region_strand + paf.hit_start + paf.hit_end + paf.hit_name + paf.cigar_line + paf.evalue + paf.perc_ident + paf.score + paf.external_db_id + paf.hcoverage + exdb.db_name + exdb.db_display_name); +} + +sub _left_join{ + return (['external_db',"exdb.external_db_id = paf.external_db_id"]); +} + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$protein_align_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all protein align + features in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : listref of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("protein_align_feature", undef, $ordered); +} + + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,371 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ProteinFeatureAdaptor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $pfa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "proteinfeature" ); + + my @prot_feats = @{ $pfa->fetch_all_by_translation_id(1231) }; + + my $prot_feat = $pfa->fetch_by_dbID(523); + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::ProteinFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::ProteinFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 fetch_all_by_translation_id + + Arg [1] : int $transl + the internal id of the translation corresponding to protein + whose features are desired + Example : @prot_feats = + @{ $prot_feat_adaptor->fetch_by_translation_id(1234) }; + Description: Gets all protein features present on a peptide using the + translations internal identifier. This method will return + an unsorted list of all protein_feature types. The feature + types may be distinguished using the logic name attribute of + the attached analysis objects. + Returntype : listref of Bio::EnsEMBL::ProteinFeatures + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub fetch_all_by_translation_id { + my($self,$translation_id) = @_; + + if(!$translation_id) { + throw("translation_id argument is required\n"); + } + + my @features; + my $analysis_adaptor = $self->db()->get_AnalysisAdaptor(); + + my $sth = $self->prepare + ("SELECT protein_feature_id, p.seq_start, p.seq_end, p.analysis_id, " . + " p.score, p.perc_ident, p.evalue, p.hit_start, p.hit_end, " . + " p.hit_name, x.display_label, i.interpro_ac " . + "FROM protein_feature p " . + "LEFT JOIN interpro AS i ON p.hit_name = i.id " . + "LEFT JOIN xref AS x ON x.dbprimary_acc = i.interpro_ac " . + "WHERE p.translation_id = ?"); + + $sth->bind_param(1,$translation_id,SQL_INTEGER); + $sth->execute(); + + while( my $row = $sth->fetchrow_arrayref) { + my ($dbID, $start, $end, $analysisid, $score, $perc_id, $evalue, $hstart, + $hend,$hid,$desc, $interpro_ac) = @$row; + + my $analysis = $analysis_adaptor->fetch_by_dbID($analysisid); + + if(!$analysis) { + warning("Analysis with dbID=$analysisid does not exist\n" . + "but is referenced by ProteinFeature $dbID"); + } + + my $feat = Bio::EnsEMBL::ProteinFeature->new + (-DBID => $dbID, + -ADAPTOR => $self, + -SEQNAME => $translation_id, + -START => $start, + -END => $end, + -ANALYSIS => $analysis, + -PERCENT_ID => $perc_id, + -P_VALUE => $evalue, + -SCORE => $score, + -HSTART => $hstart, + -HEND => $hend, + -HSEQNAME => $hid, + -IDESC => $desc, + -INTERPRO_AC => $interpro_ac); + + push(@features,$feat); + } + + $sth->finish(); + + return \@features; +} + + +=head2 fetch_by_dbID + + Arg [1] : int $protfeat_id + the unique database identifier of the protein feature to obtain + Example : my $feature = $prot_feat_adaptor->fetch_by_dbID(); + Description: Obtains a protein feature object via its unique id + Returntype : Bio::EnsEMBL::ProteinFeauture + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub fetch_by_dbID{ + my ($self,$protfeat_id) = @_; + + my $sth = $self->prepare( + "SELECT p.seq_start, p.seq_end, p.analysis_id, " . + " p.score, p.perc_ident, p.evalue, " . + " p.hit_start, p.hit_end, p.hit_name, " . + " x.display_label, i.interpro_ac " . + "FROM protein_feature p " . + "LEFT JOIN interpro AS i ON p.hit_name = i.id " . + "LEFT JOIN xref AS x ON x.dbprimary_acc = i.interpro_ac " . + "WHERE p.protein_feature_id = ?"); + + $sth->bind_param(1,$protfeat_id,SQL_INTEGER); + my $res = $sth->execute(); + + if($sth->rows == 0) { + $sth->finish(); + return undef; + } + + my ($start, $end, $analysis_id, $score, $perc_ident, $pvalue, $hstart, + $hend, $hseqname, $idesc, $interpro_ac) = $sth->fetchrow_array(); + + $sth->finish(); + + my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_dbID($analysis_id); + + return Bio::EnsEMBL::ProteinFeature->new + (-ADAPTOR => $self, + -DBID => $protfeat_id, + -START => $start, + -END => $end, + -HSTART => $hstart, + -HEND => $hend, + -HSEQNAME => $hseqname, + -ANALYSIS => $analysis, + -SCORE => $score, + -P_VALUE => $pvalue, + -PERCENT_ID => $perc_ident, + -IDESC => $idesc, + -INTERPRO_AC => $interpro_ac); +} + + + + +=head2 store + + Arg [1] : Bio::EnsEMBL::ProteinFeature $feature + The feature to be stored + Arg [2] : int $translation_id + + Example : $protein_feature_adaptor->store($protein_feature); + Description: Stores a protein feature in the database + Returntype : int - the new internal identifier of the stored protein feature + Exceptions : thrown if arg is not a Bio::EnsEMBL: + Caller : none + Status : Stable + +=cut + +sub store { + my ($self,$feature, $translation_id) = @_; + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::ProteinFeature')) { + throw("ProteinFeature argument is required"); + } + + if(!$translation_id) { + deprecate("Calling ProteinFeatureAdaptor without a translation_id is " . + "deprecated. Pass a translation_id argument rather than " . + "setting the ProteinFeature seqname to be the translation " . + "id"); + $translation_id = $feature->seqname(); + } + + my $db = $self->db(); + + if($feature->is_stored($db)) { + warning("ProteinFeature " . $feature->dbID() . " is already stored in " . + "this database - not storing again"); + } + + my $analysis = $feature->analysis(); + if (!defined($analysis)) { + throw("Feature doesn't have analysis. Can't write to database"); + } + + if(!$analysis->is_stored($db)) { + $db->get_AnalysisAdaptor->store($analysis); + } + + my $sth = + $self->prepare("INSERT INTO protein_feature " . + " SET translation_id = ?, " . + " seq_start = ?, ". + " seq_end = ?, ". + " analysis_id = ?, ". + " hit_start = ?, ". + " hit_end = ?, ". + " hit_name = ?, ". + " score = ?, ". + " perc_ident = ?, ". + " evalue = ?"); + + $sth->bind_param(1,$translation_id,SQL_INTEGER); + $sth->bind_param(2,$feature->start,SQL_INTEGER); + $sth->bind_param(3,$feature->end,SQL_INTEGER); + $sth->bind_param(4,$analysis->dbID,SQL_INTEGER); + $sth->bind_param(5,$feature->hstart,SQL_INTEGER); + $sth->bind_param(6,$feature->hend,SQL_INTEGER); + $sth->bind_param(7,$feature->hseqname,SQL_VARCHAR); + $sth->bind_param(8,$feature->score,SQL_DOUBLE); + $sth->bind_param(9,$feature->percent_id,SQL_FLOAT); + $sth->bind_param(10,$feature->p_value,SQL_DOUBLE); + + $sth->execute(); + + my $dbID = $sth->{'mysql_insertid'}; + + $feature->adaptor($self); + $feature->dbID($dbID); + + $sth->finish(); + + return $dbID; +} + + + +sub fetch_by_translation_id { + deprecate("Use fetch_all_by_translation_id instead."); + fetch_all_by_translation_id(@_); +} + +sub fetch_all_by_feature_and_dbID { + my $self = shift; + my $feature = shift; + my $translation_id = shift; + deprecate("Use fetch_all_by_translation_id instead."); + + print STDERR "translation_id = $translation_id feature = $feature\n"; + + my $features = $self->fetch_all_by_translation_id($translation_id); + + my @out; + foreach my $f (@$features) { + my $logic_name = lc($f->analysis->logic_name()); + print STDERR "LOGIC_NAME = $logic_name | FEATURE = $feature\n"; + push(@out, $f) if($logic_name eq lc($feature)); + } + + return \@out; +} + + +sub save { + + my ($self, $features) = @_; + + my @feats = @$features; + throw("Must call save with features") if( scalar(@feats) == 0 ); + +# my @tabs = $self->_tables; +# my ($tablename) = @{$tabs[0]}; + my $tablename = 'protein_feature'; + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + my $sql = qq{INSERT INTO $tablename (translation_id, seq_start, seq_end, hit_start, hit_end, hit_name, analysis_id, score, evalue, perc_ident, external_data) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)}; + + my $sth = $self->prepare($sql); + + foreach my $feat ( @feats ) { + if( !ref $feat || !$feat->isa("Bio::EnsEMBL::ProteinFeature") ) { + throw("feature must be a Bio::EnsEMBL::ProteinFeature,". " not a [".ref($feat)."]."); + } + + if($feat->is_stored($db)) { + warning("ProteinFeature [".$feat->dbID."] is already stored" . + " in this database."); + next; + } + + my $hstart = defined $feat->hstart ? $feat->hstart : $feat->start ; + my $hend = defined $feat->hend ? $feat->hend : $feat->end; + + if(!defined($feat->analysis)) { + throw("An analysis must be attached to the features to be stored."); + } + + #store the analysis if it has not been stored yet + if(!$feat->analysis->is_stored($db)) { + $analysis_adaptor->store($feat->analysis()); + } + + my $original = $feat; + my $extra_data = $feat->extra_data ? $self->dump_data($feat->extra_data) : ''; + + $sth->bind_param(1,$feat->translation_id,SQL_INTEGER); + $sth->bind_param(2,$feat->start,SQL_INTEGER); + $sth->bind_param(3,$feat->end,SQL_INTEGER); + $sth->bind_param(4,$hstart,SQL_INTEGER); + $sth->bind_param(5,$hend,SQL_INTEGER); + $sth->bind_param(6,$feat->hseqname,SQL_VARCHAR); + $sth->bind_param(7,$feat->analysis->dbID,SQL_INTEGER); + $sth->bind_param(8,$feat->score,SQL_DOUBLE); + $sth->bind_param(9,$feat->p_value,SQL_DOUBLE); + $sth->bind_param(10,$feat->percent_id,SQL_FLOAT); + $sth->bind_param(11,$extra_data,SQL_LONGVARCHAR); + + $sth->execute(); + $original->dbID($sth->{'mysql_insertid'}); + $original->adaptor($self); + } + + $sth->finish(); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProxyDBConnection.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProxyDBConnection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,260 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ProxyDBConnection - Database connection wrapper allowing +for one backing connection to be used for multiple DBs + +=head1 SYNOPSIS + + my $dbc = Bio::EnsEMBL::DBSQL::DBConnection->new(-HOST => 'host', -PORT => 3306, -USER => 'user'); + my $p_h_dbc = Bio::EnsEMBL::DBSQL::ProxyDBConnection->new(-DBC => $dbc, -DBNAME => 'human'); + my $p_m_dbc = Bio::EnsEMBL::DBSQL::ProxyDBConnection->new(-DBC => $dbc, -DBNAME => 'mouse'); + + # With a 10 minute timeout reconnection in milliseconds + my $p_h_rc_dbc = Bio::EnsEMBL::DBSQL::ProxyDBConnection->new(-DBC => $dbc, -DBNAME => 'human', -RECONNECT_INTERVAL => (10*60*1000)); + +=head1 DESCRIPTION + +This class is used to maintain one active connection to a database whilst it +appears to be working against multiple schemas. It does this by checking the +currently connected database before performing any query which could require +a database change such as prepare. + +This class is only intended for internal use so please do not use unless +you are aware of what it will do and what the consequences of its usage are. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::ProxyDBConnection; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Utils::Proxy/; + +use Bio::EnsEMBL::Utils::Argument qw/rearrange/; +use Bio::EnsEMBL::Utils::Exception qw/warning throw/; +use Bio::EnsEMBL::Utils::SqlHelper; + +use Time::HiRes qw/time/; + +sub new { + my ($class, @args) = @_; + my ($dbc, $dbname, $reconnect_interval) = rearrange([qw/DBC DBNAME RECONNECT_INTERVAL/], @args); + throw "No DBConnection -DBC given" unless $dbc; + throw "No database name -DBNAME given" unless $dbname; + my $self = $class->SUPER::new($dbc); + $self->dbname($dbname); + if($reconnect_interval) { + $self->reconnect_interval($reconnect_interval); + $self->_last_used(); + } + return $self; +} + +=head2 switch_database + + Description : Performs a switch of the backing DBConnection if the currently + connected database is not the same as the database this proxy + wants to connect to. It currently supports MySQL, Oracle and + Postges switches is untested with all bar MySQL. If it + cannot do a live DB/schema switch then it will disconnect + the connection and then wait for the next process to + connect therefore switching the DB. + Exceptions : None but will warn if you attempt to switch a DB with + active kids attached to the proxied database handle. + +=cut + +sub switch_database { + my ($self) = @_; + my $proxy = $self->__proxy(); + my $backing_dbname = $proxy->dbname(); + my $dbname = $self->dbname(); + + my $switch = 0; + if(defined $dbname) { + if(defined $backing_dbname) { + $switch = ($dbname ne $backing_dbname) ? 1 : 0; + } + else { + $switch = 1; + } + } + else { + $switch = 1 if defined $backing_dbname; + } + + if($switch) { + $proxy->dbname($dbname); + if($proxy->connected()) { + my $kids = $proxy->db_handle()->{Kids}; + my $driver = lc($proxy->driver()); + #Edit to add other DB switching strategies on a per driver basis + if($driver eq 'mysql') { + $proxy->do('use '.$dbname); + } + elsif($driver eq 'oracle') { + $proxy->do('ALTER SESSION SET CURRENT_SCHEMA = '.$dbname); + } + elsif($driver eq 'pg') { + $proxy->do('set search_path to '.$dbname); + } + else { + if($kids > 0) { + warning "Attempting a database switch from '$backing_dbname' to '$dbname' with $kids active handle(s). Check your logic or do not use a ProxyDBConnection"; + } + $proxy->disconnect_if_idle(); + } + } + } + + return $switch; +} + +=head2 check_reconnection + + Description : Looks to see if the last time we used the backing DBI + connection was greater than the reconnect_interval() + provided at construction or runtime. If enought time has + elapsed then a reconnection is attempted. We do not + attempt a reconnection if: + + - No reconnect_interval was set + - The connection was not active + + Exceptions : None apart from those raised from the reconnect() method + from DBConnection +=cut + +sub check_reconnection { + my ($self) = @_; + #Return early if we had no reconnection interval + return unless $self->{reconnect_interval}; + + my $proxy = $self->__proxy(); + + #Only attempt it if we were connected; otherwise we can just skip + if($proxy->connected()) { + if($self->_require_reconnect()) { + $proxy->reconnect(); + } + $self->_last_used(); + } + return; +} + +# Each time this is called we record the current time in seconds +# to be used by the _require_reconnect() method +sub _last_used { + my ($self) = @_; + $self->{_last_used} = int(time()*1000); + return; +} + +# Uses the _last_used() time and the current reconnect_interval() to decide +# if the connection has been unused for long enough that we should attempt +# a reconnect +sub _require_reconnect { + my ($self) = @_; + my $interval = $self->reconnect_interval(); + return unless $interval; + my $last_used = $self->{_last_used}; + my $time_elapsed = int(time()*1000) - $last_used; + return $time_elapsed > $interval ? 1 : 0; +} + +=head2 reconnect_interval + + Arg[1] : Integer reconnection interval in milliseconds + Description : Accessor for the reconnection interval expressed in milliseconds + Returntype : Int miliseconds for a reconnection interval + +=cut + +sub reconnect_interval { + my ($self, $reconnect_interval) = @_; + $self->{'reconnect_interval'} = $reconnect_interval if defined $reconnect_interval; + return $self->{'reconnect_interval'}; +} + +=head2 dbname + + Arg[1] : String DB name + Description : Accessor for the name of the database we should use whenever + a DBConnection request is made via this class + Returntype : String the name of the database which we should use + Exceptions : None + +=cut + +sub dbname { + my ($self, $dbname) = @_; + $self->{'dbname'} = $dbname if defined $dbname; + return $self->{'dbname'}; +} + +my %SWITCH_METHODS = map { $_ => 1 } qw/ + connect + db_handle + do + prepare + reconnect + work_with_db_handle +/; + + +# Manual override of the SqlHelper accessor to ensure it always gets the Proxy +sub sql_helper { + my ($self) = @_; + if(! exists $self->{_sql_helper}) { + my $helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $self); + $self->{_sql_helper} = $helper; + } + return $self->{_sql_helper}; +} + +sub __resolver { + my ($self, $package, $method) = @_; + if($self->__proxy()->can($method)) { + if($SWITCH_METHODS{$method}) { + return sub { + my ($local_self, @args) = @_; + $local_self->check_reconnection(); + $local_self->switch_database(); + $local_self->_last_used(); + return $local_self->__proxy()->$method(@args); + }; + } + else { + return sub { + my ($local_self, @args) = @_; + return $local_self->__proxy()->$method(@args); + }; + } + } + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProxySNPAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/ProxySNPAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ProxySNPAdaptor + +=head1 SYNOPSIS + +Designed as an abstraction over the database specific SNPAdaptors. This +is written right now to serve as a replacement for a core SNPadaptor +which doesn''t even exist yet and probably never will since SNPs are +taken from external databases. In the future some sort of DBRegistry may +remove the need for this class. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::ProxySNPAdaptor; + +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use vars ('@ISA', '$AUTOLOAD'); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +=head2 fetch_attributes_only + + Arg [1] : int refsnp_id + Arg [2] : (optional) string source + Example : none + Description: Retrieves a snp objcet from the SNP database but does not + populate the location information. This is necessary given + the current state of the snp database because location + information has to be retrieved differently for different + species! + Returntype : Bio::EnsEMBL::SNP + Exceptions : none + Caller : snpview + +=cut + + + +sub fetch_attributes_only{ + my ( $self, @args ) = @_; + + my $lite_db = Bio::EnsEMBL::Registry->get_db($self->db(),'lite'); + my $snp_db = Bio::EnsEMBL::Registry->get_db($self->db(),'SNP'); + + if( defined $snp_db ) { + my $snp_adaptor = $snp_db->get_SNPAdaptor(); + return $snp_adaptor->fetch_attributes_only( @args ); + } + + if( defined $lite_db ) { + my $snp_adaptor = $lite_db->get_SNPAdaptor(); + return $snp_adaptor->fetch_attributes_only( @args ); + } + +} + + + + +=head2 AUTOLOAD + + Arg [1] : list of arbitrary values @args + a list of arguments to pass to the request method + Example : - + Description: AUTOLOAD method should not be called directly. It is called + implicitly when a method requested from this class cannot be + found. This method first tries to execute the requested method + in the primary adaptor. If the method cannot be found then + it searches the other attached databases for equivalent adaptors + and tries then one at a time. + Returntype : arbitrary + Exceptions : thrown if the requested method cannot be found on the primary + adaptor or on any of the attached databases. + Caller : called implicitly by perl + +=cut + +sub AUTOLOAD { + my ($self, @args) = @_; + + #determine the method which was called + my $method = $AUTOLOAD; + + #strip out fully qualified method name + $method =~ s/.*:://; + + my $lite_db = Bio::EnsEMBL::Registry->get_db($self->db(),'lite'); + my $snp_db = Bio::EnsEMBL::Registry->get_db($self->db(),'SNP'); + + if( defined $lite_db ) { + my $snp_adaptor = $lite_db->get_SNPAdaptor(); + if($snp_adaptor->can($method)) { + return $snp_adaptor->$method(@args); + } + } + + if( defined $snp_db ) { + my $snp_adaptor = $snp_db->get_SNPAdaptor(); + if($snp_adaptor->can($method)) { + return $snp_adaptor->$method(@args); + } + } + + + + throw("The requested method $method could not be found in lite or snp" ); +} + +sub DESTROY { +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/RepeatConsensusAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/RepeatConsensusAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::RepeatConsensusAdaptor + +=head1 SYNOPSIS + + $rca = $database_adaptor->get_RepeatConsensusAdaptor(); + + $repeat_consensus = $rca->fetch_by_dbID(132); + $repeat_consensus = $rca->fetch_by_name_class( 'AluSx', 'SINE/Alu' ); + + $rca->store( $rc1, $rc2, $rc3 ); + +=head1 DESCRIPTION + +This is an adaptor for the retrieval and storage of RepeatConsensus +objects. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::RepeatConsensusAdaptor; + +use strict; +use warnings; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::RepeatConsensus; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); + +use base qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +=head2 fetch_all_repeat_types + + Example : my $types = $rca->fetch_all_repeat_types(); + Description : Returns the distinct repeat types available from a database + Returntype : Array + Exceptions : - + +=cut + + +sub fetch_all_repeat_types { + my ($self) = @_; + return $self->dbc()->sql_helper()->execute_simple( + -SQL => 'SELECT DISTINCT repeat_type FROM repeat_consensus'); +} + + +=head2 fetch_by_dbID + + Arg [1] : int $db_id + The database identifier for the RepeatConsensus to obtain + Example : $repeat_consensus = $repeat_consensus_adaptor->fetch_by_dbID(4); + Description: Obtains a RepeatConsensus object from the database via its + primary key. + Returntype : Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : general, Bio::EnsEMBL::RepeatFeatureAdaptor + Status : Stable + +=cut + +sub fetch_by_dbID { + my( $self, $db_id ) = @_; + + my ($rc) = @{$self->_generic_fetch("repeat_consensus_id = $db_id")}; + + return $rc; +} + + + +=head2 fetch_by_name + + Arg [1] : string $name + the name of the repeat consensus to obtain + Example : $rc = $repeat_consensus_adaptor->fetch_by_name('AluSx'); + Description: Obtains a repeat consensus from the database via its name + Returntype : Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_name { + my( $self, $name ) = @_; + + my ($rc) = @{$self->_generic_fetch("repeat_name = '$name'")}; + + return $rc; +} + + +=head2 fetch_by_name_class + + Arg [1] : string $name + the name of the repeat consensus to obtain + Arg [2] : string $class + the class of the repeat consensus to obtain + Example : $rc = $repeat_consensus_adaptor-> + fetch_by_name_class('AluSx', 'SINE/Alu'); + Description: Obtains a repeat consensus from the database + via its name and class + Returntype : Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_name_class { + my( $self, $name, $class ) = @_; + + + my ($rc) = @{$self->_generic_fetch(qq{ + repeat_name = '$name' + AND repeat_class = '$class' + })}; + + return $rc; +} + + +=head2 fetch_all_by_class_seq + + Arg [1] : string $class + the class of the repeat consensus to obtain + Arg [2] : string $seq + the sequence of the repeat consensus to obtain + Example : $rc = $repeat_consensus_adaptor-> + fetch_all_by_class_seq('trf', 'ATGGTGTCA'); + Description: Obtains a repeat consensus from the database + via its class and sequence + Returntype : listREF of Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_class_seq { + my( $self, $class, $seq ) = @_; + + return $self->_generic_fetch(qq{ + repeat_class = '$class' + AND repeat_consensus = '$seq' + }); +} + + +sub fetch_by_class_seq { + deprecate('Use fetch_all_by_class_seq instead'); + fetch_all_by_class_seq(@_); +} + + +=head2 _generic_fetch + + Arg [1] : string $where_clause + Example : none + Description: PRIVATE used to create RepeatConsensus features from an + SQL constraint + Returntype : listref of Bio::EnsEMBL::RepeatConsensus objects + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub _generic_fetch { + my( $self, $where_clause ) = @_; + + my( $repeat_consensus_id, $repeat_name, $repeat_class,$repeat_length, + $repeat_consensus, $repeat_type ); + + my $sth = $self->prepare(qq{ + SELECT repeat_consensus_id + , repeat_name + , repeat_class + , repeat_type + , repeat_consensus + FROM repeat_consensus + WHERE }. $where_clause); + + $sth->execute; + $sth->bind_columns( + \$repeat_consensus_id, + \$repeat_name, + \$repeat_class, + \$repeat_type, + \$repeat_consensus + ); + + + my @consensi; + while ($sth->fetch) { + if ($repeat_consensus =~ /^(\d+)\(N\)$/) { + $repeat_length = $1; + } else { + $repeat_length = CORE::length($repeat_consensus); + } + + push @consensi, Bio::EnsEMBL::RepeatConsensus->new + (-DBID => $repeat_consensus_id, + -NAME => $repeat_name, + -REPEAT_CLASS => $repeat_class, + -REPEAT_TYPE => $repeat_type, + -LENGTH => $repeat_length, + -ADAPTOR => $self, + -REPEAT_CONSENSUS => $repeat_consensus); + } + return \@consensi; +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::RepeatConsensus @consensi + Example : $repeat_consensus_adaptor->store(@consensi); + Description: stores a list of RepeatConsensus objects in the database + Returntype : none + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub store { + my( $self, @consensi ) = @_; + + my $sth = $self->prepare(q{ + INSERT into repeat_consensus( repeat_consensus_id + , repeat_name + , repeat_class + , repeat_type + , repeat_consensus ) + VALUES (NULL, ?,?,?,?) + }); + + foreach my $rc (@consensi) { + my $name = $rc->name + or throw("name not set"); + my $class = $rc->repeat_class + or throw("repeat_class not set"); + my $type = $rc->repeat_type(); + $type = "" unless defined $type; + my $seq = $rc->repeat_consensus + or throw("repeat_consensus not set"); + + $sth->bind_param(1,$name,SQL_VARCHAR); + $sth->bind_param(2,$class,SQL_VARCHAR); + $sth->bind_param(3,$type,SQL_VARCHAR); + $sth->bind_param(4,$seq,SQL_LONGVARCHAR); + + $sth->execute(); + + my $db_id = $sth->{'mysql_insertid'} + or throw("Didn't get an insertid from the INSERT statement"); + + $rc->dbID($db_id); + $rc->adaptor($self); + } +} + +1; + +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/RepeatFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/RepeatFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,511 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor + +=head1 SYNOPSIS + + $rfa = $database_adaptor->get_RepeatFeatureAdaptor(); + + my $repeat = $rfa->fetch_by_dbID(1234); + my @repeats = @{ $rfa->fetch_all_by_Slice($slice) }; + +=head1 DESCRIPTION + +This is an adaptor for the retrieval and storage of RepeatFeature +objects from the database. Most of the implementation is in the +superclass BaseFeatureAdaptor. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor; + +use strict; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::RepeatFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + Arg [2] : (optional) string $logic_name + Limits RepeatFeatures obtained to those having an Analysis with + of the specified logic_name. If no logic name is specified + Repeats of all analysis types are retrieved. + Arg [3] : (optional) string/array $repeat_type + Limits RepeatFeatures obtained to those of specified + repeat_type + Example : @rfeats = @{$rfa->fetch_all_by_Slice($slice, undef, 'Type II Transposons')}; + @rfeats = @{$rfa->fetch_all_by_Slice($slice, undef, ['Type II Transposons', 'RNA repeats'])}; + Description: Retrieves repeat features overlapping the area designated by + the provided slice argument. Returned features will be in + in the same coordinate system as the provided slice and will + have coordinates relative to the slice start. + Returntype : reference to a list of Bio::EnsEMBL::RepeatFeatures. + Exceptions : throw on bad argument + Caller : Slice::get_all_RepeatFeatures + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my $self = shift; + my $slice = shift; + my $logic_name = shift; + my $repeat_type = shift; + + my $constraint = ''; + + # MySQL was optimising the query the incorrect way when joining to + # the repeat_consensus table on type + $self->_straight_join(1); + + if($repeat_type) { + my $rta = wrap_array($repeat_type); + if(scalar(@{$rta}) > 1) { + $constraint .= sprintf('rc.repeat_type IN (%s)', join(q{,}, map {"'${_}'"} @{$rta})); + } + else { + $constraint .= "rc.repeat_type = '${repeat_type}'"; + } + } + + my $result = + $self->fetch_all_by_Slice_constraint($slice,$constraint,$logic_name); + + + $self->_straight_join(0); + + return $result; +} + + +# _tablename +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the tables to query +# Returntype : string +# Exceptions : none +# Caller : internal + + +sub _tables { + my $self = shift; + + return (['repeat_feature', 'r'], ['repeat_consensus', 'rc']); +} + + +# _columns +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the columns to query +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + return qw (r.repeat_feature_id + r.seq_region_id + r.seq_region_start + r.seq_region_end + r.seq_region_strand + r.repeat_consensus_id + r.repeat_start + r.repeat_end + r.analysis_id + r.score + rc.repeat_name + rc.repeat_class + rc.repeat_type + rc.repeat_consensus); +} + + +# _default_where_clause +# Arg [1] : none +# Example : none +# Description: Overrides superclass method to provide an additional +# table joining constraint before the SQL query is performed. +# Returntype : string +# Exceptions : none +# Caller : generic_fetch +# + +sub _default_where_clause { + my $self = shift; + + return 'r.repeat_consensus_id = rc.repeat_consensus_id'; +} + + + +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of RepeatFeatures from a +# hashref generated from an SQL query + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $rca = $self->db()->get_RepeatConsensusAdaptor(); + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @features; + my %rc_hash; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($repeat_feature_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $repeat_consensus_id, $repeat_start, $repeat_end, + $analysis_id, $score, $repeat_name, $repeat_class, $repeat_type, + $repeat_consensus); + + $sth->bind_columns( \$repeat_feature_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, + \$repeat_consensus_id, \$repeat_start,\$repeat_end, + \$analysis_id, \$score, \$repeat_name, \$repeat_class, + \$repeat_type, \$repeat_consensus ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + FEATURE: while($sth->fetch()) { + #create a repeat consensus object + + my $rc = $rc_hash{$repeat_consensus_id} ||= + Bio::EnsEMBL::RepeatConsensus->new_fast + ({'dbID' => $repeat_consensus_id, + 'adaptor' => $rca, + 'name' => $repeat_name, + 'repeat_class' => $repeat_class, + 'repeat_type' => $repeat_type, + 'repeat_consensus' => $repeat_consensus, + 'length' => length($repeat_consensus)}); + + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to +# if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + next FEATURE; + } + $slice = $dest_slice; + } + + # Finally, create the new RepeatFeature. + push( @features, + $self->_create_feature_fast( 'Bio::EnsEMBL::RepeatFeature', { + 'dbID' => $repeat_feature_id, + 'analysis' => $analysis, + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'score' => $score, + 'hstart' => $repeat_start, + 'hend' => $repeat_end, + 'repeat_consensus' => $rc, + 'adaptor' => $self, + 'slice' => $slice + } ) ); + + } + + return \@features; +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::RepeatFeatures $repeat_feature_id + the list of repeat features to store in the database + Example : $repeat_feature_adaptor->store(@repeat_features); + Description: stores a repeat feature in the database + Returntype : none + Exceptions : if the repeat features do not have attached sequences + or if repeat_consensus are not present + Caller : general + Status : Stable + +=cut + +sub store { + my( $self, @repeats ) = @_; + + my $db = $self->db(); + my $rca = $db->get_RepeatConsensusAdaptor(); + my $sa = $db->get_SliceAdaptor(); + my ($cons, $db_id); + + my $sth = $self->prepare(qq{ + INSERT into repeat_feature( repeat_feature_id + , seq_region_id + , seq_region_start + , seq_region_end + , seq_region_strand + , repeat_consensus_id + , repeat_start + , repeat_end + , score + , analysis_id ) + VALUES(NULL, ?,?,?,?,?,?,?,?,?) + }); + + FEATURE: foreach my $rf (@repeats) { + if(!ref($rf) || !$rf->isa('Bio::EnsEMBL::RepeatFeature')) { + throw('Expected RepeatFeature argument not [' . ref($rf) .'].'); + } + + if($rf->is_stored($db)) { + warning("RepeatFeature [".$rf->dbID."] is already stored in this DB."); + next FEATURE; + } + + my $cons = $rf->repeat_consensus(); + throw("Must have a RepeatConsensus attached") if(!defined($cons)); + + # for tandem repeats - simply store consensus and repeat + # one pair per hit. don't need to check consensi stored + # already. consensus has name and class set to 'trf' + + if ($cons->repeat_class eq 'trf') { + + # Look for matches already stored + my @match = @{$rca->fetch_all_by_class_seq('trf', $cons->repeat_consensus)}; + if (@match) { + $cons->dbID($match[0]->dbID()); + } + else { + $rca->store($cons); + } + + } elsif ($cons->repeat_class eq 'Simple_repeat') { + + my $rcon = $cons->name; + $rcon =~ s/\((\S+)\)n/$1/; # get repeat element + $cons->repeat_consensus($rcon); + + # Look for matches already stored + my $match = $rca->fetch_by_name_class($cons->name, 'Simple_repeat'); + if ($match) { + $cons->dbID($match->dbID()); + } + else { + $rca->store($cons); + } + + } else { + + # for other repeats - need to see if a consensus is stored already + if(!$cons->dbID) { + my $match = ($rca->fetch_by_name($cons->name)); + + if($match) { + #set the consensus dbID to be the same as the database one + $cons->dbID($match->dbID()); + } else { + # if we don't match a consensus already stored create a fake one + # and set consensus to 'N' as null seq not allowed + # FIXME: not happy with this, but ho hum ... + warning("Can't find " . $cons->name . "\n"); + $cons->repeat_consensus("N"); + $rca->store($cons); + } + } + + #if (@match > 1) { + #multiple consensi were matched + # $self->warn(@match . " consensi for " . $cons->name . "\n"); + #} + } + + my $slice = $rf->slice(); + if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw("RepeatFeature cannot be stored without an associated slice."); + } + + my $original = $rf; + my $seq_region_id; + ($rf, $seq_region_id) = $self->_pre_store($rf); + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$rf->start,SQL_INTEGER); + $sth->bind_param(3,$rf->end,SQL_INTEGER); + $sth->bind_param(4,$rf->strand,SQL_TINYINT); + $sth->bind_param(5,$rf->repeat_consensus->dbID,SQL_INTEGER); + $sth->bind_param(6,$rf->hstart,SQL_INTEGER); + $sth->bind_param(7,$rf->hend,SQL_INTEGER); + $sth->bind_param(8,$rf->score,SQL_DOUBLE); + $sth->bind_param(9,$rf->analysis->dbID,SQL_INTEGER); + + $sth->execute(); + + my $db_id = $sth->{'mysql_insertid'} + or throw("Didn't get an insertid from the INSERT statement"); + + $original->dbID($db_id); + $original->adaptor($self); + } +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$repeat_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all repeat features in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("repeat_feature", undef, $ordered); +} + +1; + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SOTermAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SOTermAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,67 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SOTermAdaptor + +=head1 DESCRIPTION + +A specialization of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor, +specifically for Sequence ontology (SO) terms. See the +documentation of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor for +further information. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::SOTermAdaptor; + +use strict; +use warnings; + +use base qw( Bio::EnsEMBL::DBSQL::OntologyTermAdaptor ); + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Argument required for parent class + Bio::EnsEMBL::DBSQL::BaseAdaptor. + + Description : Creates an ontology term adaptor for SO terms. + + Example : + + my $go_adaptor = Bio::EnsEMBL::DBSQL::SOTermAdaptor->new( $dba ); + + Return type : Bio::EnsEMBL::DBSQL::SOTermAdaptor + +=cut + +sub new { + my ( $proto, $dba ) = @_; + + my $this = $proto->SUPER::new( $dba, 'SO' ); + + return $this; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SeqRegionSynonymAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SeqRegionSynonymAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,79 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . +INTO +=cut + +package Bio::EnsEMBL::DBSQL::SeqRegionSynonymAdaptor; +use vars qw(@ISA); +use strict; + + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +use Bio::EnsEMBL::SeqRegionSynonym; + +@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); + + +sub get_synonyms{ + my $self = shift; + my $seq_id = shift; + + my @results; + + my $sth = $self->prepare("select seq_region_synonym_id, synonym, external_db_id from seq_region_synonym where seq_region_id = ?"); + $sth->bind_param(1, $seq_id, SQL_INTEGER); + $sth->execute(); + my $dbid; + my $alt_name; + my $ex_db; + $sth->bind_columns(\$dbid, \$alt_name, \$ex_db); + while($sth->fetch()){ + push @results, Bio::EnsEMBL::SeqRegionSynonym->new(-adaptor => $self, + -synonym => $alt_name, + -dbID => $dbid, + -external_db_id => $ex_db, + -seq_region_id => $seq_id); + } + $sth->finish; + + return \@results; +} + +sub store { + my $self = shift; + my $syn = shift; + + return if($syn->is_stored($self->db)); + + if(!defined($syn->seq_region_id)){ + throw("seq_region_id is needed to store a seq_region_synoym"); + } + + my $sth = $self->prepare("INSERT IGNORE INTO seq_region_synonym (seq_region_id, synonym, external_db_id) VALUES (?, ?, ?)"); + $sth->bind_param(1, $syn->seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $syn->name , SQL_VARCHAR); + $sth->bind_param(3, $syn->external_db_id, SQL_INTEGER); + $sth->execute; + $syn->{'dbID'} = $sth->{'mysql_insertid'}; + $sth->finish; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,603 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations + +=head1 SYNOPSIS + + my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' ); + + my $dna = + ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) }; + +=head1 DESCRIPTION + +An adaptor for the retrieval of DNA sequence from the EnsEMBL database + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::SequenceAdaptor; + +use vars qw(@ISA @EXPORT); +use strict; +use warnings; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Utils::Cache; +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB +our $SEQ_CACHE_SZ = 5; +our $SEQ_CACHE_MAX = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ; + +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +=head2 new + + Arg [1] : none + Example : my $sa = $db_adaptor->get_SequenceAdaptor(); + Description: Constructor. Calls superclass constructor and initialises + internal cache structure. + Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor + Exceptions : none + Caller : DBAdaptor::get_SequenceAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + # use an LRU cache to limit the size + my %seq_cache; + tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ); + + $self->{'seq_cache'} = \%seq_cache; + + +# +# See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these +# in a hash. +# + + my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"'); + + $sth->execute(); + my ($seq_region_id, $value); + $sth->bind_columns(\$seq_region_id, \$value); + my %edits; + my $count = 0; + while($sth->fetch()){ + $count++; + push @{$edits{$seq_region_id}}, $value; + } + $sth->finish; + if($count){ + $self->{_rna_edits_cache} = \%edits; + } + + return $self; +} + +=head2 clear_cache + + Example : $sa->clear_cache(); + Description : Removes all entries from the associcated sequence cache + Returntype : None + Exceptions : None + +=cut + +sub clear_cache { + my ($self) = @_; + %{$self->{seq_cache}} = (); + return; +} + + +=head2 fetch_by_Slice_start_end_strand + + Arg [1] : Bio::EnsEMBL::Slice slice + The slice from which you want the sequence + Arg [2] : (optional) int startBasePair + The start base pair relative to the start of the slice. Negative + values or values greater than the length of the slice are fine. + default = 1 + Arg [3] : (optional) int endBasePair + The end base pair relative to the start of the slice. Negative + values or values greater than the length of the slice are fine, + but the end must be greater than or equal to the start + count from 1 + default = the length of the slice + Arg [4] : (optional) int strand + 1, -1 + default = 1 + Example : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, + 1000, -1); + Description: retrieves from db the sequence for this slice + uses AssemblyMapper to find the assembly + Returntype : string + Exceptions : endBasePair should be less or equal to length of slice + Caller : Bio::EnsEMBL::Slice::seq(), Slice::subseq() + Status : Stable + +=cut + +sub fetch_by_Slice_start_end_strand { + my ( $self, $slice, $start, $end, $strand ) = @_; + + if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("Slice argument is required."); + } + + $start = 1 if(!defined($start)); + + + if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) { + + if ( !defined($end) || ($start > $end ) ) { + return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand ); + } + + if ( defined($end) && ($end < 0) ) { + $end += $slice->seq_region_length; + } + + if ($start < 0) { + $start += $slice->seq_region_length; + } + + if($slice->start> $slice->end) { + return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand ); + } + } + + if ( ( !defined($end) ) && (not $slice->is_circular) ) { + $end = $slice->end() - $slice->start() + 1; + } + + if ( $start > $end ) { + throw("Start must be less than or equal to end."); + } + + $strand ||= 1; + + #get a new slice that spans the exact region to retrieve dna from + my $right_expand = $end - $slice->length(); #negative is fine + my $left_expand = 1 - $start; #negative is fine + + if($right_expand || $left_expand) { + $slice = $slice->expand($left_expand, $right_expand); + } + + #retrieve normalized 'non-symlinked' slices + #this allows us to support haplotypes and PARs + my $slice_adaptor = $slice->adaptor(); + my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)}; + + if(@symproj == 0) { + throw('Could not retrieve normalized Slices. Database contains ' . + 'incorrect assembly_exception information.'); + } + + #call this method again with any slices that were 'symlinked' to by this + #slice + if(@symproj != 1 || $symproj[0]->[2] != $slice) { + my $seq; + foreach my $segment (@symproj) { + my $symlink_slice = $segment->[2]; + #get sequence from each symlinked area + $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice, + 1,undef,1)}; + } + if($strand == -1) { + reverse_comp(\$seq); + } + return \$seq; + } + + # we need to project this slice onto the sequence coordinate system + # even if the slice is in the same coord system, we want to trim out + # flanking gaps (if the slice is past the edges of the seqregion) + my $csa = $self->db->get_CoordSystemAdaptor(); + my $seqlevel = $csa->fetch_sequence_level(); + + my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())}; + + my $seq = ''; + my $total = 0; + my $tmp_seq; + + #fetch sequence from each of the sequence regions projected onto + foreach my $segment (@projection) { + my ($start, $end, $seq_slice) = @$segment; + + #check for gaps between segments and pad them with Ns + my $gap = $start - $total - 1; + if($gap) { + $seq .= 'N' x $gap; + } + + my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice); + + $tmp_seq = ${$self->_fetch_seq($seq_region_id, + $seq_slice->start, $seq_slice->length())}; + + #reverse compliment on negatively oriented slices + if($seq_slice->strand == -1) { + reverse_comp(\$tmp_seq); + } + + $seq .= $tmp_seq; + + $total = $end; + } + + #check for any remaining gaps at the end + my $gap = $slice->length - $total; + if($gap) { + $seq .= 'N' x $gap; + } + + #if the sequence is too short it is because we came in with a seqlevel + #slice that was partially off of the seq_region. Pad the end with Ns + #to make long enough + if(length($seq) != $slice->length()) { + $seq .= 'N' x ($slice->length() - length($seq)); + } + + if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){ + $self->_rna_edit($slice,\$seq); + } + + #if they asked for the negative slice strand revcomp the whole thing + reverse_comp(\$seq) if($strand == -1); + + return \$seq; +} + + +sub _fetch_by_Slice_start_end_strand_circular { + my ( $self, $slice, $start, $end, $strand ) = @_; + + assert_ref( $slice, 'Bio::EnsEMBL::Slice' ); + + $strand ||= 1; + if ( !defined($start) ) { + $start ||= 1; + } + + if ( !defined($end) ) { + $end = $slice->end() - $slice->start() + 1; + } + + if ( $start > $end && $slice->is_circular() ) { + my ($seq, $seq1, $seq2); + + my $midpoint = $slice->seq_region_length - $slice->start + 1; + $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1, $midpoint, 1 )}; + $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )}; + + $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1"; + + reverse_comp( \$seq ) if ( $strand == -1 ); + + return \$seq; + } + + + + # Get a new slice that spans the exact region to retrieve dna from + my $right_expand = $end - $slice->length(); #negative is fine + my $left_expand = 1 - $start; #negative is fine + + if ( $right_expand || $left_expand ) { + $slice = + $slice->strand > 0 + ? $slice->expand( $left_expand, $right_expand ) + : $slice->expand( $right_expand, $left_expand ); + } + + # Retrieve normalized 'non-symlinked' slices. This allows us to + # support haplotypes and PARs. + my $slice_adaptor = $slice->adaptor(); + my @symproj = + @{ $slice_adaptor->fetch_normalized_slice_projection($slice) }; + + if ( @symproj == 0 ) { + throw( 'Could not retrieve normalized Slices. Database contains ' + . 'incorrect assembly_exception information.' ); + } + + # Call this method again with any slices that were 'symlinked' to by + # this slice. + if ( @symproj != 1 || $symproj[0]->[2] != $slice ) { + my $seq; + foreach my $segment (@symproj) { + my $symlink_slice = $segment->[2]; + + # Get sequence from each symlinked area. + $seq .= ${ + $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1, + undef, 1 ) }; + } + if ( $strand == -1 ) { + reverse_comp( \$seq ); + } + + return \$seq; + } + + # We need to project this slice onto the sequence coordinate system + # even if the slice is in the same coord system, we want to trim out + # flanking gaps (if the slice is past the edges of the seqregion). + my $csa = $self->db->get_CoordSystemAdaptor(); + my $seqlevel = $csa->fetch_sequence_level(); + + my @projection = + @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) }; + + my $seq = ''; + my $total = 0; + my $tmp_seq; + + # Fetch sequence from each of the sequence regions projected onto. + foreach my $segment (@projection) { + my ( $start, $end, $seq_slice ) = @{$segment}; + + # Check for gaps between segments and pad them with Ns + my $gap = $start - $total - 1; + if ($gap) { + $seq .= 'N' x $gap; + } + + my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice); + + $tmp_seq = ${ + $self->_fetch_seq( $seq_region_id, $seq_slice->start(), + $seq_slice->length() ) }; + + # Reverse compliment on negatively oriented slices. + if ( $seq_slice->strand == -1 ) { + reverse_comp( \$tmp_seq ); + } + + $seq .= $tmp_seq; + + $total = $end; + } + + # Check for any remaining gaps at the end. + my $gap = $slice->length() - $total; + + if ($gap) { + $seq .= 'N' x $gap; + } + + # If the sequence is too short it is because we came in with a + # seqlevel slice that was partially off of the seq_region. Pad the + # end with Ns to make long enough + if ( length($seq) != $slice->length() ) { + $seq .= 'N' x ( $slice->length() - length($seq) ); + } + + if ( defined( $self->{_rna_edits_cache} ) + && defined( + $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) ) + { + $self->_rna_edit( $slice, \$seq ); + } + + return \$seq; +} ## end sub _fetch_by_Slice_start_end_strand_circular + + + + + +sub _rna_edit { + my $self = shift; + my $slice = shift; + my $seq = shift; #reference to string + + my $s_start = $slice->start; #substr start at 0 , but seq starts at 1 (so no -1 here) + my $s_end = $s_start+length($$seq); + + foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){ + my ($start, $end, $txt) = split (/\s+/, $edit); +# check that RNA edit is not outside the requested region : happens quite often with LRG regions + next if ($end < $s_start); + next if ($s_end < $start); + substr($$seq,$start-$s_start, ($end-$start)+1, $txt); + } + return; +} + + +sub _fetch_seq { + my $self = shift; + my $seq_region_id = shift; + my $start = shift; + my $length = shift; + + my $cache = $self->{'seq_cache'}; + + if($length < $SEQ_CACHE_MAX) { + my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR; + my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR; + + # piece together sequence from cached component parts + + my $entire_seq = undef; + for(my $i = $chunk_min; $i <= $chunk_max; $i++) { + if($cache->{"$seq_region_id:$i"}) { + $entire_seq .= $cache->{"$seq_region_id:$i"}; + } else { + # retrieve uncached portions of the sequence + + my $sth = + $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) " + . "FROM dna d " + . "WHERE d.seq_region_id = ?" ); + + my $tmp_seq; + + my $min = ($i << $SEQ_CHUNK_PWR) + 1; + + $sth->bind_param( 1, $min, SQL_INTEGER ); + $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER ); + $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); + + $sth->execute(); + $sth->bind_columns(\$tmp_seq); + $sth->fetch(); + $sth->finish(); + + # always give back uppercased sequence so it can be properly softmasked + $entire_seq .= uc($tmp_seq); + $cache->{"$seq_region_id:$i"} = uc($tmp_seq); + } + } + + # return only the requested portion of the entire sequence + my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1; + # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR; + my $seq = substr( $entire_seq, $start - $min, $length ); + + return \$seq; + } else { + + # do not do any caching for requests of very large sequences + my $sth = + $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) " + . "FROM dna d " + . "WHERE d.seq_region_id = ?" ); + + my $tmp_seq; + + $sth->bind_param( 1, $start, SQL_INTEGER ); + $sth->bind_param( 2, $length, SQL_INTEGER ); + $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); + + $sth->execute(); + $sth->bind_columns(\$tmp_seq); + $sth->fetch(); + $sth->finish(); + + # always give back uppercased sequence so it can be properly softmasked + $tmp_seq = uc($tmp_seq); + + return \$tmp_seq; + } +} + + +=head2 store + + Arg [1] : int $seq_region_id the id of the sequence region this dna + will be associated with. + Arg [2] : string $sequence the dna sequence to be stored + in the database. Note that the sequence passed in will be + converted to uppercase. + Example : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA'); + Description: stores a dna sequence in the databases dna table and returns the + database identifier for the new record. + Returntype : none + Exceptions : throw if the database insert fails + Caller : sequence loading scripts + Status : Stable + +=cut + +sub store { + my ($self, $seq_region_id, $sequence) = @_; + + if(!$seq_region_id) { + throw('seq_region_id is required'); + } + + $sequence = uc($sequence); + + my $statement = + $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)"); + + $statement->bind_param(1,$seq_region_id,SQL_INTEGER); + $statement->bind_param(2,$sequence,SQL_LONGVARCHAR); + $statement->execute(); + + $statement->finish(); + + return; +} + + + + +=head2 fetch_by_assembly_location + + Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead. + +=cut + +sub fetch_by_assembly_location { + my ( $self, $chrStart, $chrEnd, + $strand, $chrName, $assemblyType ) = @_; + + deprecate('Use fetch_by_Slice_start_end_strand() instead'); + + my $csa = $self->db->get_CoordSystem(); + my $top_cs = @{$csa->fetch_all}; + + my $slice_adaptor = $self->db->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName, + $chrStart, $chrEnd, + $strand, $top_cs->version); + + return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1); +} + + +=head2 fetch_by_RawContig_start_end_strand + + Description: DEPRECATED use fetch_by_Slice_start_end_strand instead + +=cut + +sub fetch_by_RawContig_start_end_strand { + deprecate('Use fetch_by_Slice_start_end_strand instead.'); + fetch_by_Slice_start_end_strand(@_); +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SimpleFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SimpleFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,366 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor + +=head1 SYNOPSIS + + my $reg = 'Bio::EnsEMBL::Registry'; + + $reg-> + load_registry_from_db( ... + + my $sfa = + $reg->get_adaptor('homo sapiens', 'core', 'SimpleFeature'); + + print ref($sfa), "\n"; + + my $sf_aref = + $sfa->fetch_all; + + print scalar @$sf_aref, "\n"; + +=head1 DESCRIPTION + +Simple Feature Adaptor - database access for simple features + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::SimpleFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::SimpleFeatures @sf + the simple features to store in the database + Example : $simple_feature_adaptor->store(@simple_feats); + Description: Stores a list of simple feature objects in the database + Returntype : none + Exceptions : thrown if @sf is not defined, if any of the features do not + have an attached slice. + or if any elements of @sf are not Bio::EnsEMBL::SimpleFeatures + Caller : general + Status : Stable + +=cut + +sub store{ + my ($self,@sf) = @_; + + if( scalar(@sf) == 0 ) { + throw("Must call store with list of SimpleFeatures"); + } + + my $sth = $self->prepare + ("INSERT INTO simple_feature (seq_region_id, seq_region_start, " . + "seq_region_end, seq_region_strand, " . + "display_label, analysis_id, score) " . + "VALUES (?,?,?,?,?,?,?)"); + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + FEATURE: foreach my $sf ( @sf ) { + + if( !ref $sf || !$sf->isa("Bio::EnsEMBL::SimpleFeature") ) { + throw("SimpleFeature must be an Ensembl SimpleFeature, " . + "not a [".ref($sf)."]"); + } + + if($sf->is_stored($db)) { + warning("SimpleFeature [".$sf->dbID."] is already stored" . + " in this database."); + next FEATURE; + } + + if(!defined($sf->analysis)) { + throw("An analysis must be attached to the features to be stored."); + } + + #store the analysis if it has not been stored yet + if(!$sf->analysis->is_stored($db)) { + $analysis_adaptor->store($sf->analysis()); + } + + my $original = $sf; + my $seq_region_id; + ($sf, $seq_region_id) = $self->_pre_store($sf); + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$sf->start,SQL_INTEGER); + $sth->bind_param(3,$sf->end,SQL_INTEGER); + $sth->bind_param(4,$sf->strand,SQL_TINYINT); + $sth->bind_param(5,$sf->display_label,SQL_VARCHAR); + $sth->bind_param(6,$sf->analysis->dbID,SQL_INTEGER); + $sth->bind_param(7,$sf->score,SQL_DOUBLE); + + $sth->execute(); + + $original->dbID($sth->{'mysql_insertid'}); + $original->adaptor($self); + } +} + + +=head2 _tables + + Arg [1] : none + Example : none + Description: PROTECTED implementation of superclass abstract method + returns the names, aliases of the tables to use for queries + Returntype : list of listrefs of strings + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub _tables { + my $self = shift; + + return ['simple_feature', 'sf']; +} + + +=head2 _columns + + Arg [1] : none + Example : none + Description: PROTECTED implementation of superclass abstract method + returns a list of columns to use for queries + Returntype : list of strings + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub _columns { + my $self = shift; + + return qw( sf.simple_feature_id + sf.seq_region_id sf.seq_region_start sf.seq_region_end + sf.seq_region_strand sf.display_label sf.analysis_id sf.score ); +} + + +=head2 _objs_from_sth + + Arg [1] : hash reference $hashref + Example : none + Description: PROTECTED implementation of superclass abstract method. + creates SimpleFeatures from an executed DBI statement handle. + Returntype : list reference to Bio::EnsEMBL::SimpleFeature objects + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @features; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + + my($simple_feature_id,$seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $display_label, $analysis_id, $score); + + $sth->bind_columns(\$simple_feature_id,\$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, \$display_label, + \$analysis_id, \$score); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_seq_region_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_seq_region_id =$dest_slice->get_seq_region_id(); + } + + my $count =0; + FEATURE: while($sth->fetch()) { + $count++; + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } else { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_seq_region_id != $seq_region_id )) { +# print STDERR "IGNORED DUE TO CUTOFF $dest_slice_seq_region_id ne $seq_region_id . $sr_name\n"; + next FEATURE; + } + $slice = $dest_slice; + } + + push( @features, + $self->_create_feature_fast( + 'Bio::EnsEMBL::SimpleFeature', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'analysis' => $analysis, + 'adaptor' => $self, + 'dbID' => $simple_feature_id, + 'display_label' => $display_label, + 'score' => $score + } ) ); + + } + + return \@features; +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$simple_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all simple features in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("simple_feature", undef, $ordered); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2272 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SliceAdaptor - A database aware adaptor responsible for +the creation of Slice objects. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Slice qw(split_Slices); + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $slice_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" ); + + # get a slice on the entire chromosome X + $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' ); + + # get a slice for each clone in the database + foreach $cln_slice ( @{ $slice_adaptor->fetch_all('clone') } ) { + # do something with clone + } + + # get a slice which is part of NT_004321 + $spctg_slice = + $slice_adaptor->fetch_by_region( 'supercontig', 'NT_004321', + 200_000, 600_000 ); + + # get all non-redundant slices from the highest possible coordinate + # systems + $slices = $slice_adaptor->fetch_all('toplevel'); + + # include non-reference regions + $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 1 ); + + # include non-duplicate regions + $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 0, 1 ); + + # split up a list of slices into smaller slices + $overlap = 1000; + $max_length = 1e6; + $slices = split_Slices( $slices, $max_length, $overlap ); + + # store a list of slice names in a file + open( FILE, ">$filename" ) or die("Could not open file $filename"); + foreach my $slice (@$slices) { + print FILE $slice->name(), "\n"; + } + close FILE; + + # retreive a list of slices from a file + open( FILE, $filename ) or die("Could not open file $filename"); + while ( $name = ) { + chomp($name); + $slice = $slice_adaptor->fetch_by_name($name); + # do something with slice + } + +=head1 DESCRIPTION + +This module is responsible for fetching Slices representing genomic +regions from a database. A Details on how slices can be used are in the +Bio::EnsEMBL::Slice module. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::SliceAdaptor; +use vars qw(@ISA); +use strict; + + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::CircularSlice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::LRGSlice; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +use Scalar::Util qw/looks_like_number/; + +@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + # use a cache which is shared and also used by the assembly + # mapper adaptor + + my $seq_region_cache = $self->db->get_SeqRegionCache(); + + $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'}; + $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'}; + + $self->{'lrg_region_test'} = undef; + my $meta_container = $self->db->get_MetaContainer(); + my @values = $meta_container->list_value_by_key("LRG"); + if(scalar(@values) and $values[0]->[0]){ + $self->{'lrg_region_test'} = $values[0]->[0]; + } + return $self; +} + + +=head2 fetch_by_region + + Arg [1] : string $coord_system_name (optional) + The name of the coordinate system of the slice to be created + This may be a name of an actual coordinate system or an alias + to a coordinate system. Valid aliases are 'seqlevel' or + 'toplevel'. + Arg [2] : string $seq_region_name + The name of the sequence region that the slice will be + created on. + Arg [3] : int $start (optional, default = 1) + The start of the slice on the sequence region + Arg [4] : int $end (optional, default = seq_region length) + The end of the slice on the sequence region + Arg [5] : int $strand (optional, default = 1) + The orientation of the slice on the sequence region + Arg [6] : string $version (optional, default = default version) + The version of the coordinate system to use (e.g. NCBI33) + Arg [7] : boolean $no_fuzz (optional, default = undef (false)) + If true (non-zero), do not use "fuzzy matching" (see below). + Example : $slice = $slice_adaptor->fetch_by_region('chromosome', 'X'); + $slice = $slice_adaptor->fetch_by_region('clone', 'AC008066.4'); + Description: Retrieves a slice on the requested region. At a minimum the + name the name of the seq_region to fetch must be provided. + + If no coordinate system name is provided than a slice on the + highest ranked coordinate system with a matching + seq_region_name will be returned. If a version but no + coordinate system name is provided, the same behaviour will + apply, but only coordinate systems of the appropriate version + are considered. The same applies if the 'toplevel' coordinate + system is specified, however in this case the version is + ignored. The coordinate system should always be specified if + it is known, since this is unambiguous and faster. + + Some fuzzy matching is performed if no exact match for + the provided name is found. This allows clones to be + fetched even when their version is not known. For + example fetch_by_region('clone', 'AC008066') will + retrieve the sequence_region with name 'AC008066.4'. + + The fuzzy matching can be turned off by setting the + $no_fuzz argument to a true value. + + If the requested seq_region is not found in the database undef + is returned. + + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : throw if no seq_region_name is provided + throw if invalid coord_system_name is provided + throw if start > end is provided + Caller : general + Status : Stable + +=cut + + +# +# ARNE: This subroutine needs simplification!! +# +sub fetch_by_region { + my ( $self, $coord_system_name, $seq_region_name, $start, $end, + $strand, $version, $no_fuzz ) + = @_; + + if ( !defined($start) ) { $start = 1 } + if ( !defined($strand) ) { $strand = 1 } + + if ( !defined($seq_region_name) ) { + throw('seq_region_name argument is required'); + } + + my $cs; + my $csa = $self->db->get_CoordSystemAdaptor(); + + if ( defined($coord_system_name) ) { + $cs = $csa->fetch_by_name( $coord_system_name, $version ); + + ## REMOVE THESE THREE LINES WHEN STICKLEBACK DB IS FIXED! + ## Anne/ap5 (2007-10-09): + # The problem was that the stickleback genebuild called the + # chromosomes 'groups', which meant they weren't being picked out by + # the karyotype drawing code. Apparently they are usually called + # 'groups' in the stickleback community, even though they really are + # chromosomes! + + if ( !defined($cs) && $coord_system_name eq 'chromosome' ) { + $cs = $csa->fetch_by_name( 'group', $version ); + } + + if ( !defined($cs) ) { + throw( sprintf( "Unknown coordinate system:\n" + . "name='%s' version='%s'\n", + $coord_system_name, $version ) ); + } + + # fetching by toplevel is same as fetching w/o name or version + if ( $cs->is_top_level() ) { + $cs = undef; + $version = undef; + } + + } ## end if ( defined($coord_system_name...)) + + my $constraint; + my $sql; + my @bind_params; + my $key; + + if ( defined($cs) ) { + $sql = sprintf( "SELECT sr.name, sr.seq_region_id, sr.length, %d " + . "FROM seq_region sr ", + $cs->dbID() ); + + $constraint = "AND sr.coord_system_id = ?"; + push( @bind_params, [ $cs->dbID(), SQL_INTEGER ] ); + + $key = "$seq_region_name:" . $cs->dbID(); + } else { + $sql = + "SELECT sr.name, sr.seq_region_id, sr.length, cs.coord_system_id " + . "FROM seq_region sr, coord_system cs "; + + $constraint = "AND sr.coord_system_id = cs.coord_system_id " + . "AND cs.species_id = ? "; + push( @bind_params, [ $self->species_id(), SQL_INTEGER ] ); + + if ( defined($version) ) { + $constraint .= "AND cs.version = ? "; + push( @bind_params, [ $version, SQL_VARCHAR ] ); + } + + $constraint .= "ORDER BY cs.rank ASC"; + } + + # check the cache so we only go to the db if necessary + my $length; + my $arr; + + if ( defined($key) ) { $arr = $self->{'sr_name_cache'}->{$key} } + + if ( defined($arr) ) { + $length = $arr->[3]; + } else { + my $sth = + $self->prepare( $sql . "WHERE sr.name = ? " . $constraint ); + + unshift( @bind_params, [ $seq_region_name, SQL_VARCHAR ] ); + + my $pos = 0; + foreach my $param (@bind_params) { + $sth->bind_param( ++$pos, $param->[0], $param->[1] ); + } + + $sth->execute(); + + if ( $sth->rows() == 0 ) { + $sth->finish(); + + + # try synonyms + my $syn_sql_sth = $self->prepare("select s.name from seq_region s, seq_region_synonym ss where s.seq_region_id = ss.seq_region_id and ss.synonym = ?"); + $syn_sql_sth->bind_param(1, $seq_region_name, SQL_VARCHAR); + $syn_sql_sth->execute(); + my $new_name; + $syn_sql_sth->bind_columns( \$new_name); + if($syn_sql_sth->fetch){ + $syn_sql_sth->finish; + return $self->fetch_by_region($coord_system_name, $new_name, $start, $end, $strand, $version, $no_fuzz); + } + $syn_sql_sth->finish; + + + if ($no_fuzz) { return undef } + + # Do fuzzy matching, assuming that we are just missing a version + # on the end of the seq_region name. + + $sth = + $self->prepare( $sql . " WHERE sr.name LIKE ? " . $constraint ); + + $bind_params[0] = + [ sprintf( '%s.%%', $seq_region_name ), SQL_VARCHAR ]; + + $pos = 0; + foreach my $param (@bind_params) { + $sth->bind_param( ++$pos, $param->[0], $param->[1] ); + } + + $sth->execute(); + + my $prefix_len = length($seq_region_name) + 1; + my $high_ver = undef; + my $high_cs = $cs; + + # Find the fuzzy-matched seq_region with the highest postfix + # (which ought to be a version). + + my ( $tmp_name, $id, $tmp_length, $cs_id ); + $sth->bind_columns( \( $tmp_name, $id, $tmp_length, $cs_id ) ); + + my $i = 0; + + while ( $sth->fetch ) { + my $tmp_cs = + ( defined($cs) ? $cs : $csa->fetch_by_dbID($cs_id) ); + + # cache values for future reference + my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ]; + $self->{'sr_name_cache'}->{"$tmp_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$id"} = $arr; + + my $tmp_ver = substr( $tmp_name, $prefix_len ); + + # skip versions which are non-numeric and apparently not + # versions + if ( $tmp_ver !~ /^\d+$/ ) { next } + + # take version with highest num, if two versions match take one + # with highest ranked coord system (lowest num) + if ( !defined($high_ver) + || $tmp_ver > $high_ver + || ( $tmp_ver == $high_ver && $tmp_cs->rank < $high_cs->rank ) + ) + { + $seq_region_name = $tmp_name; + $length = $tmp_length; + $high_ver = $tmp_ver; + $high_cs = $tmp_cs; + } + + $i++; + } ## end while ( $sth->fetch ) + $sth->finish(); + + # warn if fuzzy matching found more than one result + if ( $i > 1 ) { + warning( + sprintf( + "Fuzzy matching of seq_region_name " + . "returned more than one result.\n" + . "You might want to check whether the returned seq_region\n" + . "(%s:%s) is the one you intended to fetch.\n", + $high_cs->name(), $seq_region_name ) ); + } + + $cs = $high_cs; + + # return if we did not find any appropriate match: + if ( !defined($high_ver) ) { return undef } + + } else { + + my ( $id, $cs_id ); + ( $seq_region_name, $id, $length, $cs_id ) = + $sth->fetchrow_array(); + $sth->finish(); + + # cache to speed up for future queries + my $arr = [ $id, $seq_region_name, $cs_id, $length ]; + $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$id"} = $arr; + $cs = $csa->fetch_by_dbID($cs_id); + } + } ## end else [ if ( defined($arr) ) ] + + if ( !defined($end) ) { $end = $length } + + #If this was given then check if we've got a circular seq region otherwise + #let it fall through to the normal Slice method + if ( $end + 1 < $start ) { + my $cs_id = $cs->dbID(); + my $seq_region_id = $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"}->[0]; + if($self->is_circular($seq_region_id)) { + my $new_sl = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $cs, + -SEQ_REGION_NAME => $seq_region_name, + -SEQ_REGION_LENGTH => $length, + -START => $start, + -END => $end, + -STRAND => 1, + -ADAPTOR => $self ); + + return $new_sl; + } + } + + if ( defined( $self->{'lrg_region_test'} ) + and substr( $cs->name, 0, 3 ) eq $self->{'lrg_region_test'} ) + { + return + Bio::EnsEMBL::LRGSlice->new( -COORD_SYSTEM => $cs, + -SEQ_REGION_NAME => $seq_region_name, + -SEQ_REGION_LENGTH => $length, + -START => $start, + -END => $end, + -STRAND => $strand, + -ADAPTOR => $self ); + } else { + return + Bio::EnsEMBL::Slice->new_fast( { + 'coord_system' => $cs, + 'seq_region_name' => $seq_region_name, + 'seq_region_length' => $length, + 'start' => $start, + 'end' => $end, + 'strand' => $strand, + 'adaptor' => $self } ); + } +} ## end sub fetch_by_region + +=head2 fetch_by_toplevel_location + + Arg [1] : string $location + Ensembl formatted location. Can be a format like + C, C, C, + C, C. We can also support strand + specification as a +/- or 1/-1. + + Location names must be separated by a C<:>. All others can be + separated by C<..>, C<:> or C<->. + Arg[2] : boolean $no_warnings + Suppress warnings from this method + Example : my $slice = $sa->fetch_by_toplevel_location('X:1-10000') + my $slice = $sa->fetch_by_toplevel_location('X:1-10000:-1') + Description : Converts an Ensembl location/region into the sequence region + name, start and end and passes them onto C. + The code assumes that the required slice is on the top level + coordinate system. The code assumes that location formatting + is not perfect and will perform basic cleanup before parsing. + Returntype : Bio::EnsEMBL::Slice + Exceptions : If $location is false otherwise see C + Caller : General + Status : Beta + +=cut + +sub fetch_by_toplevel_location { + my ($self, $location, $no_warnings) = @_; + + my ($seq_region_name, $start, $end, $strand) = $self->parse_location_to_values($location, $no_warnings); + + if(! $seq_region_name) { + return; + } + + if(defined $start && defined $end && $start > $end) { + throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end"; + } + + my $coord_system_name = 'toplevel'; + my $slice = $self->fetch_by_region($coord_system_name, $seq_region_name, $start, $end, $strand, undef, 0); + return unless $slice; + + my $srl = $slice->seq_region_length(); + my $name = $slice->seq_region_name(); + if(defined $start && $start > $srl) { + throw "Cannot request a slice whose start ($start) is greater than $srl for $name."; + } + if(defined $end && $end > $srl) { + warning "Requested end ($end) is greater than $srl for $name. Resetting to $srl" if ! $no_warnings; + $slice->{end} = $srl; + } + + return $slice; +} + +=head2 parse_location_to_values + + Arg [1] : string $location + Ensembl formatted location. Can be a format like + C, C, C, + C, C. We can also support strand + specification as a +/- or 1/-1. + + Location names must be separated by a C<:>. All others can be + separated by C<..>, C<:> or C<->. + Arg[2] : boolean $no_warnings + Suppress warnings from this method + Example : my ($name, $start, $end, $strand) = $sa->parse_location_to_values('X:1..100:1); + Description : Takes in an Ensembl location String and returns the parsed + values + Returntype : List. Contains name, start, end and strand + +=cut + + +sub parse_location_to_values { + my ($self, $location, $no_warnings, $no_errors) = @_; + + throw 'You must specify a location' if ! $location; + + #cleanup any nomenclature like 1_000 or 1 000 or 1,000 + my $number_seps_regex = qr/\s+|,|_/; + my $separator_regex = qr/(?:-|[.]{2}|\:)?/; + my $number_regex = qr/[0-9,_ E]+/xms; + my $strand_regex = qr/[+-1]|-1/xms; + + my $regex = qr/^(\w+) \s* :? \s* ($number_regex)? $separator_regex ($number_regex)? $separator_regex ($strand_regex)? $/xms; + my ($seq_region_name, $start, $end, $strand); + if(($seq_region_name, $start, $end, $strand) = $location =~ $regex) { + + if(defined $strand) { + if(!looks_like_number($strand)) { + $strand = ($strand eq '+') ? 1 : -1; + } + } + + if(defined $start) { + $start =~ s/$number_seps_regex//g; + if($start < 1) { + warning "Start was less than 1 (${start}) which is not allowed. Resetting to 1" if ! $no_warnings; + $start = 1; + } + } + if(defined $end) { + $end =~ s/$number_seps_regex//g; + if($end < 1) { + throw "Cannot request negative or 0 end indexes through this interface. Given $end but expected something greater than 0" unless $no_errors; + } + } + + if(defined $start && defined $end && $start > $end) { + throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end" unless $no_errors; + } + } + + return ($seq_region_name, $start, $end, $strand); +} + +=head2 fetch_by_region_unique + + Arg [1] : string $coord_system_name (optional) + The name of the coordinate system of the slice to be created + This may be a name of an actual coordinate system or an alias + to a coordinate system. Valid aliases are 'seqlevel' or + 'toplevel'. + Arg [2] : string $seq_region_name + The name of the sequence region that the slice will be + created on. + Arg [3] : int $start (optional, default = 1) + The start of the slice on the sequence region + Arg [4] : int $end (optional, default = seq_region length) + The end of the slice on the sequence region + Arg [5] : int $strand (optional, default = 1) + The orientation of the slice on the sequence region + Arg [6] : string $version (optional, default = default version) + The version of the coordinate system to use (e.g. NCBI33) + Arg [7] : boolean $no_fuzz (optional, default = undef (false)) + If true (non-zero), do not use "fuzzy matching" (see below). + Example : $slice = $slice_adaptor->fetch_by_region_unique('chromosome', 'HSCHR6_MHC_COX'); + Description: Retrieves a slice on the requested region but returns only the unique + parts of the slice. At a minimum the + name the name of the seq_region to fetch must be provided. + + If no coordinate system name is provided than a slice on the + highest ranked coordinate system with a matching + seq_region_name will be returned. If a version but no + coordinate system name is provided, the same behaviour will + apply, but only coordinate systems of the appropriate version + are considered. The same applies if the 'toplevel' coordinate + system is specified, however in this case the version is + ignored. The coordinate system should always be specified if + it is known, since this is unambiguous and faster. + + Some fuzzy matching is performed if no exact match for + the provided name is found. This allows clones to be + fetched even when their version is not known. For + example fetch_by_region('clone', 'AC008066') will + retrieve the sequence_region with name 'AC008066.4'. + + The fuzzy matching can be turned off by setting the + $no_fuzz argument to a true value. + + If the requested seq_region is not found in the database undef + is returned. + + Returntype : listref Bio::EnsEMBL::Slice + Exceptions : throw if no seq_region_name is provided + throw if invalid coord_system_name is provided + throw if start > end is provided + Caller : general + Status : Stable + +=cut + +sub fetch_by_region_unique { + my $self = shift; + + my @out = (); + my $slice = $self->fetch_by_region(@_); + + + if ( !exists( $self->{'asm_exc_cache'} ) ) { + $self->_build_exception_cache(); + } + + if ( exists( + $self->{'asm_exc_cache'}->{ $self->get_seq_region_id($slice) } + ) ) + { + # Dereference symlinked assembly regions. Take out any regions + # which are symlinked because these are duplicates. + my @projection = + @{ $self->fetch_normalized_slice_projection($slice) }; + + foreach my $segment (@projection) { + if ( $segment->[2]->seq_region_name() eq $slice->seq_region_name() + && $segment->[2]->coord_system->equals( $slice->coord_system ) ) + { + push( @out, $segment->[2] ); + } + } + } + + return \@out; +} ## end sub fetch_by_region_unique + +=head2 fetch_by_name + + Arg [1] : string $name + Example : $name = 'chromosome:NCBI34:X:1000000:2000000:1'; + $slice = $slice_adaptor->fetch_by_name($name); + $slice2 = $slice_adaptor->fetch_by_name($slice3->name()); + Description: Fetches a slice using a slice name (i.e. the value returned by + the Slice::name method). This is useful if you wish to + store a unique identifier for a slice in a file or database or + pass a slice over a network. + Slice::name allows you to serialise/marshall a slice and this + method allows you to deserialise/unmarshal it. + + Returns undef if no seq_region with the provided name exists in + the database. + + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : throw if incorrent arg provided + Caller : Pipeline + Status : Stable + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + + if(!$name) { + throw("name argument is required"); + } + + my @array = split(/:/,$name); + + if(scalar(@array) < 3 || scalar(@array) > 6) { + throw("Malformed slice name [$name]. Format is " . + "coord_system:version:name:start:end:strand"); + } + + # Rearrange arguments to suit fetch_by_region + + my @targetarray; + + $targetarray[0]=$array[0]; + $targetarray[5]=(($array[1]&&$array[1] ne "")?$array[1]:undef); + $targetarray[1]=(($array[2]&&$array[2] ne "")?$array[2]:undef); + $targetarray[2]=(($array[3]&&$array[3] ne "")?$array[3]:undef); + $targetarray[3]=(($array[4]&&$array[4] ne "")?$array[4]:undef); + $targetarray[4]=(($array[5]&&$array[5] ne "")?$array[5]:undef); + return $self->fetch_by_region(@targetarray); +} + + + +=head2 fetch_by_seq_region_id + + Arg [1] : string $seq_region_id + The internal identifier of the seq_region to create this slice + on + Arg [2] : optional start + Arg [3] : optional end + Arg [4] : optional strand + Example : $slice = $slice_adaptor->fetch_by_seq_region_id(34413); + Description: Creates a slice object of an entire seq_region using the + seq_region internal identifier to resolve the seq_region. + Returns undef if no such slice exists. + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_seq_region_id { + my ( $self, $seq_region_id, $start, $end, $strand ) = @_; + + my $arr = $self->{'sr_id_cache'}->{$seq_region_id}; + my ( $name, $length, $cs, $cs_id ); + + + if ( $arr && defined( $arr->[2] ) ) { + ( $name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] ); + $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); + } else { + my $sth = + $self->prepare( "SELECT sr.name, sr.coord_system_id, sr.length " + . "FROM seq_region sr " + . "WHERE sr.seq_region_id = ? " ); + + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->execute(); + + if ( $sth->rows() == 0 ) { return undef } + + ( $name, $cs_id, $length ) = $sth->fetchrow_array(); + $sth->finish(); + + $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); + + #cache results to speed up repeated queries + my $arr = [ $seq_region_id, $name, $cs_id, $length ]; + + $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; + } + + return + Bio::EnsEMBL::Slice->new_fast({ + 'coord_system' => $cs, + 'seq_region_name' => $name, + 'seq_region_length'=> $length, + 'start' => $start || 1, + 'end' => $end || $length, + 'strand' => $strand || 1, + 'adaptor' => $self} ); +} ## end sub fetch_by_seq_region_id + + + +=head2 get_seq_region_id + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch a seq_region_id for + Example : $srid = $slice_adaptor->get_seq_region_id($slice); + Description: Retrieves the seq_region id (in this database) given a slice + Seq region ids are not stored on the slices themselves + because they are intended to be somewhat database independant + and seq_region_ids vary accross databases. + Returntype : int + Exceptions : throw if the seq_region of the slice is not in the db + throw if incorrect arg provided + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + +sub get_seq_region_id { + my $self = shift; + my $slice = shift; + + if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Slice argument is required'); + } + + my $seq_region_name = $slice->seq_region_name(); + my $key = $seq_region_name.":".$slice->coord_system->dbID(); + my $arr = $self->{'sr_name_cache'}->{"$key"}; + + if( $arr ) { + return $arr->[0]; + } + + my $cs_id = $slice->coord_system->dbID(); + + my $sth = $self->prepare("SELECT seq_region_id, length " . + "FROM seq_region " . + "WHERE name = ? AND coord_system_id = ?"); + + #force seq_region_name cast to string so mysql cannot treat as int + $sth->bind_param(1,"$seq_region_name",SQL_VARCHAR); + $sth->bind_param(2,$cs_id,SQL_INTEGER); + $sth->execute(); + + if($sth->rows() != 1) { + throw("Non existant or ambigous seq_region:\n" . + " coord_system=[$cs_id],\n" . + " name=[$seq_region_name],\n"); + + } + + my($seq_region_id, $length) = $sth->fetchrow_array(); + $sth->finish(); + + #cache information for future requests + $arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ]; + + $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; + + return $seq_region_id; +} + + + +=head2 fetch_all + + Arg [1] : string $coord_system_name + The name of the coordinate system to retrieve slices of. + This may be a name of an acutal coordinate system or an alias + to a coordinate system. Valid aliases are 'seqlevel' or + 'toplevel'. + Arg [2] : string $coord_system_version (optional) + The version of the coordinate system to retrieve slices of + Arg [3] : bool $include_non_reference (optional) + If this argument is not provided then only reference slices + will be returned. If set, both reference and non refeference + slices will be rerurned. + Arg [4] : int $include_duplicates (optional) + If set duplicate regions will be returned. + + NOTE: if you do not use this option and you have a PAR + (pseudo-autosomal region) at the beginning of your seq_region + then your slice will not start at position 1, so coordinates + retrieved from this slice might not be what you expected. + + Arg[5] : bool $include_lrg (optional) (default 0) + If set lrg regions will be returned aswell. + + + Example : @chromos = @{$slice_adaptor->fetch_all('chromosome','NCBI33')}; + @contigs = @{$slice_adaptor->fetch_all('contig')}; + + # get even non-reference regions + @slices = @{$slice_adaptor->fetch_all('toplevel',undef,1)}; + + # include duplicate regions (such as pseudo autosomal regions) + @slices = @{$slice_adaptor->fetch_all('toplevel', undef,0,1)}; + + Description: Retrieves slices of all seq_regions for a given coordinate + system. This is analagous to the methods fetch_all which were + formerly on the ChromosomeAdaptor, RawContigAdaptor and + CloneAdaptor classes. Slices fetched span the entire + seq_regions and are on the forward strand. + If the coordinate system with the provided name and version + does not exist an empty list is returned. + If the coordinate system name provided is 'toplevel', all + non-redundant toplevel slices are returned (note that any + coord_system_version argument is ignored in that case). + + Retrieved slices can be broken into smaller slices using the + Bio::EnsEMBL::Utils::Slice module. + + Returntype : listref of Bio::EnsEMBL::Slices + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift || ''; + + my ($include_non_reference, $include_duplicates, $include_lrg) = @_; + + # + # verify existance of requested coord system and get its id + # + my $csa = $self->db->get_CoordSystemAdaptor(); + my $orig_cs = $csa->fetch_by_name($cs_name, $cs_version); + + return [] if ( !$orig_cs ); + + my %bad_vals=(); + + + # + # Get a hash of non reference seq regions + # + if ( !$include_non_reference ) { + my $sth = + $self->prepare( 'SELECT sr.seq_region_id ' + . 'FROM seq_region sr, seq_region_attrib sra, ' + . 'attrib_type at, coord_system cs ' + . 'WHERE at.code = "non_ref" ' + . 'AND sra.seq_region_id = sr.seq_region_id ' + . 'AND at.attrib_type_id = sra.attrib_type_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my ($seq_region_id); + $sth->bind_columns( \$seq_region_id ); + + while ( $sth->fetch() ) { + $bad_vals{$seq_region_id} = 1; + } + } + + # + # if we do not want lrg's then add them to the bad list; + # + if ( !$include_lrg ) { + my $sth = + $self->prepare( 'SELECT sr.seq_region_id ' + . 'FROM seq_region sr, seq_region_attrib sra, ' + . 'attrib_type at, coord_system cs ' + . 'WHERE at.code = "LRG" ' + . 'AND sra.seq_region_id = sr.seq_region_id ' + . 'AND at.attrib_type_id = sra.attrib_type_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my ($seq_region_id); + $sth->bind_columns( \$seq_region_id ); + + while ( $sth->fetch() ) { + $bad_vals{$seq_region_id} = 1; + } + } + + # + # Retrieve the seq_regions from the database + # + + my $sth; + if ( $orig_cs->is_top_level() ) { + $sth = + $self->prepare( 'SELECT sr.seq_region_id, sr.name, ' + . 'sr.length, sr.coord_system_id ' + . 'FROM seq_region sr, seq_region_attrib sra, ' + . 'attrib_type at, coord_system cs ' + . 'WHERE at.code = "toplevel" ' + . 'AND at.attrib_type_id = sra.attrib_type_id ' + . 'AND sra.seq_region_id = sr.seq_region_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + } else { + $sth = + $self->prepare( 'SELECT sr.seq_region_id, sr.name, ' + . 'sr.length, sr.coord_system_id ' + . 'FROM seq_region sr ' + . 'WHERE sr.coord_system_id = ?' ); + + $sth->bind_param( 1, $orig_cs->dbID, SQL_INTEGER ); + $sth->execute(); + } + + my ( $seq_region_id, $name, $length, $cs_id ); + $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) ); + + my $cache_count = 0; + + my @out; + while($sth->fetch()) { + if(!defined($bad_vals{$seq_region_id})){ + my $cs = $csa->fetch_by_dbID($cs_id); + + if(!$cs) { + throw("seq_region $name references non-existent coord_system $cs_id."); + } + + #cache values for future reference, but stop adding to the cache once we + #we know we have filled it up + if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) { + my $arr = [ $seq_region_id, $name, $cs_id, $length ]; + + $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; + + $cache_count++; + } + + my $slice = Bio::EnsEMBL::Slice->new_fast({ + 'start' => 1, + 'end' => $length, + 'strand' => 1, + 'seq_region_name' => $name, + 'seq_region_length'=> $length, + 'coord_system' => $cs, + 'adaptor' => $self}); + + if(!defined($include_duplicates) or !$include_duplicates){ + # test if this slice *could* have a duplicate (exception) region + $self->_build_exception_cache() if(!exists $self->{'asm_exc_cache'}); + if(exists $self->{asm_exc_cache}->{$seq_region_id}) { + + # Dereference symlinked assembly regions. Take out + # any regions which are symlinked because these are duplicates + my @projection = @{$self->fetch_normalized_slice_projection($slice)}; + foreach my $segment ( @projection) { + if($segment->[2]->seq_region_name() eq $slice->seq_region_name() && + $segment->[2]->coord_system->equals($slice->coord_system)) { + push @out, $segment->[2]; + } + } + } else { + # no duplicate regions + push @out, $slice; + } + } else { + # we want duplicates anyway so do not do any checks + push @out, $slice; + } + } + } + + return \@out; +} + +=head2 is_toplevel + Arg : int seq_region_id + Example : my $top = $slice_adptor->is_toplevel($seq_region_id) + Description: Returns 1 if slice is a toplevel slice else 0 + Returntype : int + Caller : Slice method is_toplevel + Status : At Risk + +=cut + +sub is_toplevel { + my $self = shift; + my $id = shift; + + my $sth = $self->prepare( + "SELECT at.code from seq_region_attrib sra, attrib_type at " + . "WHERE sra.seq_region_id = ? " + . "AND at.attrib_type_id = sra.attrib_type_id " + . "AND at.code = 'toplevel'" ); + + $sth->bind_param( 1, $id, SQL_INTEGER ); + $sth->execute(); + + my $code; + $sth->bind_columns( \$code ); + + while ( $sth->fetch ) { + $sth->finish; + return 1; + } + + $sth->finish; + return 0; +} + +=head2 is_reference + Arg : int seq_region_id + Example : my $reference = $slice_adptor->is_reference($seq_region_id) + Description: Returns 1 if slice is a reference slice else 0 + Returntype : int + Caller : Slice method is_reference + Status : At Risk + +=cut + +sub is_reference { + my $self = shift; + my $id = shift; + + my $sth = $self->prepare( + "SELECT at.code from seq_region_attrib sra, attrib_type at " + . "WHERE sra.seq_region_id = ? " + . "AND at.attrib_type_id = sra.attrib_type_id " + . "AND at.code = 'non_ref'" ); + + $sth->bind_param( 1, $id, SQL_INTEGER ); + $sth->execute(); + + my $code; + $sth->bind_columns( \$code ); + + while ( $sth->fetch ) { + $sth->finish; + return 0; + } + + $sth->finish; + return 1; +} + +=head2 is_circular + + Arg[1] : int seq_region_id + Example : my $circular = $slice_adptor->is_circular($seq_region_id); + Description : Indicates if the sequence region was circular or not + Returntype : Boolean + +=cut + +sub is_circular { + my ($self, $id) = @_; + + if (! defined $self->{is_circular}) { + $self->_build_circular_slice_cache(); + } + + return 0 if $self->{is_circular} == 0; + return (exists $self->{circular_sr_id_cache}->{$id}) ? 1 : 0; +} + +=head2 fetch_by_band + + Title : fetch_by_band + Usage : + Function: Does not work please use fetch_by_chr_band + Example : + Returns : Bio::EnsEMBL::Slice + Args : the band name + Status : AT RISK + +=cut + +sub fetch_by_band { + my ($self,$band) = @_; + + my $sth = $self->dbc->prepare + ("select s.name,max(k.seq_region_id)-min(k.seq_region_id, min(k.seq_region_start), max(k.seq_region_id) " . + "from karyotype as k " . + "where k.band like ? and k.seq_region_id = s.seq_region_id"); + + $sth->bind_param(1,"$band%",SQL_VARCHAR); + $sth->execute(); + my ( $seq_region_name, $discrepancy, $seq_region_start, $seq_region_end) = $sth->fetchrow_array; + + if($seq_region_name && $discrepancy>0) { + throw("Band maps to multiple seq_regions"); + } else { + return $self->fetch_by_region('toplevel',$seq_region_name,$seq_region_start,$seq_region_end); + } + throw("Band not recognised in database"); +} + +=head2 fetch_by_chr_band + + Title : fetch_by_chr_band + Usage : + Function: create a Slice representing a series of bands + Example : + Returns : Bio::EnsEMBL::Slice + Args : the band name + Status : Stable + +=cut + +sub fetch_by_chr_band { + my ( $self, $chr, $band ) = @_; + + my $chr_slice = $self->fetch_by_region( 'toplevel', $chr ); + my $seq_region_id = $self->get_seq_region_id($chr_slice); + + my $sth = + $self->prepare( 'SELECT MIN(k.seq_region_start), ' + . 'MAX(k.seq_region_end) ' + . 'FROM karyotype k ' + . 'WHERE k.seq_region_id = ? ' + . 'AND k.band LIKE ?' ); + + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->bind_param( 2, "$band%", SQL_VARCHAR ); + $sth->execute(); + + my ( $slice_start, $slice_end ) = $sth->fetchrow_array; + + if ( defined $slice_start ) { + return + $self->fetch_by_region( 'toplevel', $chr, + $slice_start, $slice_end ); + } + + throw("Band not recognised in database"); +} ## end sub fetch_by_chr_band + + + +=head2 fetch_by_exon_stable_id + + Arg [1] : string $exonid + The stable id of the exon around which the slice is + desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the exon (0 by default) + Example : $slc = $sa->fetch_by_exon_stable_id('ENSE00000302930',10); + Description: Creates a slice around the region of the specified exon. + If a context size is given, the slice is extended by that + number of basepairs on either side of the exon. + + The slice will be created in the exon's native coordinate system + and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : Thrown if the exon is not in the database. + Caller : general + Status : Stable + +=cut + +sub fetch_by_exon_stable_id{ + my ($self,$exonid,$size) = @_; + + throw('Exon argument is required.') if(!$exonid); + + my $ea = $self->db->get_ExonAdaptor; + my $exon = $ea->fetch_by_stable_id($exonid); + + throw("Exon [$exonid] does not exist in DB.") if(!$exon); + + return $self->fetch_by_Feature($exon, $size); +} + +=head2 fetch_by_transcript_stable_id + + Arg [1] : string $transcriptid + The stable id of the transcript around which the slice is + desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the transcript (0 by default) + Example : $slc = $sa->fetch_by_transcript_stable_id('ENST00000302930',10); + Description: Creates a slice around the region of the specified transcript. + If a context size is given, the slice is extended by that + number of basepairs on either side of the + transcript. + + The slice will be created in the transcript's native coordinate + system and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : Thrown if the transcript is not in the database. + Caller : general + Status : Stable + +=cut + +sub fetch_by_transcript_stable_id{ + my ($self,$transcriptid,$size) = @_; + + throw('Transcript argument is required.') if(!$transcriptid); + + my $ta = $self->db->get_TranscriptAdaptor; + my $transcript = $ta->fetch_by_stable_id($transcriptid); + + throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript); + + return $self->fetch_by_Feature($transcript, $size); +} + + + +=head2 fetch_by_transcript_id + + Arg [1] : int $transcriptid + The unique database identifier of the transcript around which + the slice is desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the transcript (0 by default) + Example : $slc = $sa->fetch_by_transcript_id(24, 1000); + Description: Creates a slice around the region of the specified transcript. + If a context size is given, the slice is extended by that + number of basepairs on either side of the + transcript. + + The slice will be created in the transcript's native coordinate + system and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throw on incorrect args + throw if transcript is not in database + Caller : general + Status : Stable + +=cut + +sub fetch_by_transcript_id { + my ($self,$transcriptid,$size) = @_; + + throw('Transcript id argument is required.') if(!$transcriptid); + + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + my $transcript = $transcript_adaptor->fetch_by_dbID($transcriptid); + + throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript); + + return $self->fetch_by_Feature($transcript, $size); +} + + + +=head2 fetch_by_gene_stable_id + + Arg [1] : string $geneid + The stable id of the gene around which the slice is + desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the gene (0 by default) + Example : $slc = $sa->fetch_by_gene_stable_id('ENSG00000012123',10); + Description: Creates a slice around the region of the specified gene. + If a context size is given, the slice is extended by that + number of basepairs on either side of the gene. + + The slice will be created in the gene's native coordinate system + and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throw on incorrect args + throw if transcript does not exist + Caller : general + Status : Stable + +=cut + +sub fetch_by_gene_stable_id { + my ($self,$geneid,$size) = @_; + + throw('Gene argument is required.') if(!$geneid); + + my $gene_adaptor = $self->db->get_GeneAdaptor(); + my $gene = $gene_adaptor->fetch_by_stable_id($geneid); + + throw("Gene [$geneid] does not exist in DB.") if(!$gene); + + return $self->fetch_by_Feature($gene, $size); +} + + + +=head2 fetch_by_Feature + + Arg [1] : Bio::EnsEMBL::Feature $feat + The feature to fetch the slice around + Arg [2] : int size (optional) + The desired number of flanking basepairs around the feature. + The size may also be provided as a percentage of the feature + size such as 200% or 80.5%. + Example : $slice = $slice_adaptor->fetch_by_Feature($feat, 100); + Description: Retrieves a slice around a specific feature. All this really + does is return a resized version of the slice that the feature + is already on. Note that slices returned from this method + are always on the forward strand of the seq_region regardless of + the strandedness of the feature passed in. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throw if the feature does not have an attached slice + throw if feature argument is not provided + Caller : fetch_by_gene_stable_id, fetch_by_transcript_stable_id, + fetch_by_gene_id, fetch_by_transcript_id + Status : Stable + +=cut + +sub fetch_by_Feature{ + my ($self, $feature, $size) = @_; + + $size ||= 0; + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Feature argument expected.'); + } + + my $slice = $feature->slice(); + if(!$slice || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice') )) { + throw('Feature must be attached to a valid slice.'); + } + + + my $fstart = $feature->start(); + my $fend = $feature->end(); + if(!defined($fstart) || !defined($fend)) { + throw('Feature must have defined start and end.'); + } + + #convert the feature slice coordinates to seq_region coordinates + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + if($slice_start != 1 || $slice_strand != 1) { + if($slice_strand == 1) { + $fstart = $fstart + $slice_start - 1; + $fend = $fend + $slice_start - 1; + } else { + my $tmp_start = $fstart; + $fstart = $slice_end - $fend + 1; + $fend = $slice_end - $tmp_start + 1; + } + } + + ## Size may be stored as a %age of the length of the feature + ## Size = 100% gives no context + ## Size = 200% gives context - 50% the size of the feature either side of + ## feature + + $size = int( ($1-100)/200 * ($fend-$fstart+1) ) if( $size =~/([\d+\.]+)%/ ); + + #return a new slice covering the region of the feature + my $S = Bio::EnsEMBL::Slice->new_fast({ + 'seq_region_name' => $slice->seq_region_name, + 'seq_region_length' => $slice->seq_region_length, + 'coord_system' => $slice->coord_system, + 'start' => $fstart - $size, + 'end' => $fend + $size, + 'strand' => 1, + 'adaptor' => $self}); + $S->{'_raw_feature_strand'} = $feature->strand * $slice_strand if $feature->can('strand'); + return $S; +} + + + +=head2 fetch_by_misc_feature_attribute + + Arg [1] : string $attribute_type + The code of the attribute type + Arg [2] : (optional) string $attribute_value + The value of the attribute to fetch by + Arg [3] : (optional) int $size + The amount of flanking region around the misc feature desired. + Example : $slice = $sa->fetch_by_misc_feature_attribute('superctg', + 'NT_030871'); + $slice = $sa->fetch_by_misc_feature_attribute('synonym', + 'AL00012311', + $flanking); + Description: Fetches a slice around a MiscFeature with a particular + attribute type and value. If no value is specified then + the feature with the particular attribute is used. + If no size is specified then 0 is used. + Returntype : Bio::EnsEMBL::Slice + Exceptions : Throw if no feature with the specified attribute type and value + exists in the database + Warning if multiple features with the specified attribute type + and value exist in the database. + Caller : webcode + Status : Stable + +=cut + +sub fetch_by_misc_feature_attribute { + my ($self, $attrib_type_code, $attrib_value, $size) = @_; + + my $mfa = $self->db()->get_MiscFeatureAdaptor(); + + my $feats = $mfa->fetch_all_by_attribute_type_value($attrib_type_code, + $attrib_value); + + if(@$feats == 0) { + throw("MiscFeature with $attrib_type_code=$attrib_value does " . + "not exist in DB."); + } + + if(@$feats > 1) { + warning("MiscFeature with $attrib_type_code=$attrib_value is " . + "ambiguous - using first one found."); + } + + my ($feat) = @$feats; + + return $self->fetch_by_Feature($feat, $size); +} + +=head2 fetch_normalized_slice_projection + + Arg [1] : Bio::EnsEMBL::Slice $slice + Example : ( optional ) + Description: gives back a project style result. The returned slices + represent the areas to which there are symlinks for the + given slice. start, end show which area on given slice is + symlinked + Returntype : [[start,end,$slice][]] + Exceptions : none + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + + +sub fetch_normalized_slice_projection { + my $self = shift; + my $slice = shift; + + my $slice_seq_region_id = $self->get_seq_region_id( $slice ); + + $self->_build_exception_cache() if(!exists($self->{'asm_exc_cache'})); + + my $result = $self->{'asm_exc_cache'}->{$slice_seq_region_id}; + + $result ||= []; + + my (@haps, @pars); + + foreach my $row (@$result) { + my ( $seq_region_id, $seq_region_start, $seq_region_end, + $exc_type, $exc_seq_region_id, $exc_seq_region_start, + $exc_seq_region_end ) = @$row; + + # need overlapping PAR and all HAPs if any + if( $exc_type eq "PAR" ) { + if( $seq_region_start <= $slice->end() && + $seq_region_end >= $slice->start() ) { + push( @pars, [ $seq_region_start, $seq_region_end, $exc_seq_region_id, + $exc_seq_region_start, $exc_seq_region_end ] ); + } + } else { + push( @haps, [ $seq_region_start, $seq_region_end, $exc_seq_region_id, + $exc_seq_region_start, $exc_seq_region_end ] ); + } + } + + if(!@pars && !@haps) { + #just return this slice, there were no haps or pars + return [bless ( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment")]; + } + + my @syms; + if( @haps >= 1 ) { + my @sort_haps = sort { $a->[1] <=> $b->[1] } @haps; + + my $count =0; + my $chr_start = 1; + my $hap_start = 1; + my $last = 0; + + my $seq_reg_slice = $self->fetch_by_seq_region_id($slice_seq_region_id); + my $exc_slice = $self->fetch_by_seq_region_id( $sort_haps[0][2] ); + my $len1 = $seq_reg_slice->length(); + my $len2 = $exc_slice->length(); + my $max_len = ($len1 > $len2) ? $len1 : $len2; + + while($count <= scalar(@sort_haps) and !$last){ + my $chr_end; + my $hap_end; + if(defined($sort_haps[$count]) and defined($sort_haps[$count][0]) ){ + $hap_end = $sort_haps[$count][0]-1; + $chr_end = $sort_haps[$count][3]-1 + } + else{ + $last = 1; + $hap_end = $len1; + $chr_end = $len2; + my $diff = ($hap_end-$hap_start)-($chr_end-$chr_start); + if($diff > 0){ + push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end+$diff] ); + } + elsif($diff < 0){ + push( @syms, [ $hap_start, $hap_end - $diff, $sort_haps[0][2], $chr_start, $chr_end] ); + } + else{ + push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] ); + } + next; + } + if($hap_end and $hap_start < $len1){ # if hap at start or end of chromosome + push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] ); + } + $chr_start = $chr_end + ($sort_haps[$count][4]-$sort_haps[$count][3]) + 2; + $hap_start = $hap_end + ($sort_haps[$count][1]-$sort_haps[$count][0]) + 2; + $count++; + } + + + } + + + # for now haps and pars should not be both there, but in theory we + # could handle it here by cleverly merging the pars into the existing syms, + # for now just: + push( @syms, @pars ); + + my $mapper = Bio::EnsEMBL::Mapper->new( "sym", "org" ); + my $count = 0; + for my $sym ( @syms ) { + $mapper->add_map_coordinates( $slice_seq_region_id, $sym->[0], $sym->[1], + 1, $sym->[2], $sym->[3], $sym->[4] ); + } + + my @linked = $mapper->map_coordinates( $slice_seq_region_id, + $slice->start(), $slice->end(), + $slice->strand(), "sym" ); + + # gaps are regions where there is no mapping to another region + my $rel_start = 1; + + #if there was only one coord and it is a gap, we know it is just the + #same slice with no overlapping symlinks + if(@linked == 1 && $linked[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + return [bless( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment" )]; + } + + my @out; + for my $coord ( @linked ) { + if( $coord->isa( "Bio::EnsEMBL::Mapper::Gap" )) { + my $exc_slice = Bio::EnsEMBL::Slice->new_fast({ + 'start' => $coord->start(), + 'end' => $coord->end(), + 'strand' => $slice->strand(), + 'coord_system' => $slice->coord_system(), + 'adaptor' => $self, + 'seq_region_name' => $slice->seq_region_name(), + 'seq_region_length' => $slice->seq_region_length()}); + push( @out, bless ( [ $rel_start, $coord->length()+$rel_start-1, + $exc_slice ], "Bio::EnsEMBL::ProjectionSegment") ); + } else { + my $exc_slice = $self->fetch_by_seq_region_id( $coord->id() ); + my $exc2_slice = Bio::EnsEMBL::Slice->new_fast({ + + 'start' => $coord->start(), + 'end' => $coord->end(), + 'strand' => $coord->strand(), + 'seq_region_name' => $exc_slice->seq_region_name(), + 'seq_region_length' => $exc_slice->seq_region_length(), + 'coord_system' => $exc_slice->coord_system(), + 'adaptor' => $self + }); + + push( @out, bless( [ $rel_start, $coord->length() + $rel_start - 1, + $exc2_slice ], "Bio::EnsEMBL::ProjectionSegment") ); + } + $rel_start += $coord->length(); + } + + return \@out; +} + + + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Slice $slice + Arg [2] : (optional) $seqref reference to a string + The sequence associated with the slice to be stored. + Example : $slice = Bio::EnsEMBL::Slice->new(...); + $seq_region_id = $slice_adaptor->store($slice, \$sequence); + Description: This stores a slice as a sequence region in the database + and returns the seq region id. The passed in slice must + start at 1, and must have a valid seq_region name and coordinate + system. The attached coordinate system must already be stored in + the database. The sequence region is assumed to start at 1 and + to have a length equalling the length of the slice. The end of + the slice must equal the seq_region_length. + If the slice coordinate system is the sequence level coordinate + system then the seqref argument must also be passed. If the + slice coordinate system is NOT a sequence level coordinate + system then the sequence argument cannot be passed. + Returntype : int + Exceptions : throw if slice has no coord system. + throw if slice coord system is not already stored. + throw if slice coord system is seqlevel and no sequence is + provided. + throw if slice coord system is not seqlevel and sequence is + provided. + throw if slice does not start at 1 + throw if sequence is provided and the sequence length does not + match the slice length. + throw if the SQL insert fails (e.g. on duplicate seq region) + throw if slice argument is not passed + throw if the slice end is not equal to seq_region_length + Caller : database loading scripts + Status : Stable + +=cut + + + +sub store { + my $self = shift; + my $slice = shift; + my $seqref = shift; + + # + # Get all of the sanity checks out of the way before storing anything + # + + if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Slice argument is required'); + } + + my $cs = $slice->coord_system(); + throw("Slice must have attached CoordSystem.") if(!$cs); + + my $db = $self->db(); + if(!$cs->is_stored($db)) { + throw("Slice CoordSystem must already be stored in DB.") + } + + if($slice->start != 1 || $slice->strand != 1) { + throw("Slice must have start==1 and strand==1."); + } + + if($slice->end() != $slice->seq_region_length()) { + throw("Slice must have end==seq_region_length"); + } + + my $sr_len = $slice->length(); + my $sr_name = $slice->seq_region_name(); + + if(!$sr_name) { + throw("Slice must have valid seq region name."); + } + + if($cs->is_sequence_level()) { + if(!$seqref) { + throw("Must provide sequence for sequence level coord system."); + } + if(ref($seqref) ne 'SCALAR') { + throw("Sequence must be a scalar reference."); + } + my $seq_len = length($$seqref); + + if($seq_len != $sr_len) { + throw("Sequence length ($seq_len) must match slice length ($sr_len)."); + } + } else { + if($seqref) { + throw("Cannot provide sequence for non-sequence level seq regions."); + } + } + + #store the seq_region + + my $sth = $db->dbc->prepare("INSERT INTO seq_region " . + "SET name = ?, " . + " length = ?, " . + " coord_system_id = ?" ); + + $sth->bind_param(1,$sr_name,SQL_VARCHAR); + $sth->bind_param(2,$sr_len,SQL_INTEGER); + $sth->bind_param(3,$cs->dbID,SQL_INTEGER); + + $sth->execute(); + + my $seq_region_id = $sth->{'mysql_insertid'}; + + if(!$seq_region_id) { + throw("Database seq_region insertion failed."); + } + + if($cs->is_sequence_level()) { + #store sequence if it was provided + my $seq_adaptor = $db->get_SequenceAdaptor(); + $seq_adaptor->store($seq_region_id, $$seqref); + } + + #synonyms + if(defined($slice->{'synonym'})){ + foreach my $syn (@{$slice->{'synonym'}} ){ + $syn->seq_region_id($seq_region_id); # set the seq_region_id + $syn->adaptor->store($syn); + } + } + + + $slice->adaptor($self); + + return $seq_region_id; +} + + +=head2 store_assembly + + Arg [1] : Bio::EnsEMBL::Slice $asm_slice + Arg [2] : Bio::EnsEMBL::Slice $cmp_slice + Example : $asm = $slice_adaptor->store_assembly( $slice1, $slice2 ); + Description: Creates an entry in the analysis table based on the + coordinates of the two slices supplied. Returns a string + representation of the assembly that gets created. + Returntype : string + Exceptions : throw if either slice has no coord system (cs). + throw unless the cs rank of the asm_slice is lower than the + cmp_slice. + throw if there is no mapping path between coord systems + throw if the lengths of each slice are not equal + throw if there are existing mappings between either slice + and the oposite cs + Caller : database loading scripts + Status : Experimental + +=cut + +sub store_assembly{ + my $self = shift; + my $asm_slice = shift; + my $cmp_slice = shift; + + # + # Get all of the sanity checks out of the way before storing anything + # + + if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Assembled Slice argument is required'); + } + if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw('Assembled Slice argument is required'); + } + + my $asm_cs = $asm_slice->coord_system(); + throw("Slice must have attached CoordSystem.") if(!$asm_cs); + my $cmp_cs = $cmp_slice->coord_system(); + throw("Slice must have attached CoordSystem.") if(!$cmp_cs); + + unless( $asm_cs->rank < $cmp_cs->rank ){ + throw("Assembled Slice CoordSystem->rank must be lower than ". + "the component Slice Coord_system" ); + } + + my @path = + @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) }; + + if ( !@path ) { + throw("No mapping path defined between " + . $asm_cs->name() . " and " + . $cmp_cs->name() ); + } + + if( $asm_slice->length != $cmp_slice->length ){ + throw("The lengths of the assembled and component slices are not equal" ); + } + + # For now we disallow any existing mappings between the asm slice and cmp + # CoordSystem and vice-versa. + # Some cases of multiple mappings may be allowable by the API, but their + # logic needs to be coded below. + + my $asm_proj = $asm_slice->project( $cmp_cs->name, $cmp_cs->version ); + if( @$asm_proj ){ + throw("Regions of the assembled slice are already assembled ". + "into the component CoordSystem" ); + } + my $cmp_proj = $cmp_slice->project( $asm_cs->name, $asm_cs->version ); + if( @$cmp_proj ){ + throw("Regions of the component slice are already assembled ". + "into the assembled CoordSystem" ); + } + + # + # Checks complete. Store the data + # + my $sth = $self->db->dbc->prepare + ("INSERT INTO assembly " . + "SET asm_seq_region_id = ?, " . + " cmp_seq_region_id = ?, " . + " asm_start = ?, " . + " asm_end = ?, " . + " cmp_start = ?, " . + " cmp_end = ?, " . + " ori = ?" ); + + my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice ); + my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice ); + my $ori = $asm_slice->strand * $cmp_slice->strand; + + $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER); + $sth->bind_param(3,$asm_slice->start,SQL_INTEGER); + $sth->bind_param(4,$asm_slice->end,SQL_INTEGER); + $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER); + $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER); + $sth->bind_param(7,$ori,SQL_INTEGER); + + $sth->execute(); + + #use Data::Dumper qw( Dumper ); + #warn Dumper( $self->db->{seq_region_cache} ); + #$self->db->{seq_region_cache} = undef; + #$self->_cache_seq_regions(); + + my $ama = $self->db->get_AssemblyMapperAdaptor(); + $ama->delete_cache(); + + + return $asm_slice->name . "<>" . $cmp_slice->name; + +} + + +=head2 prepare + + Arg [1] : String $sql + Example : ( optional ) + Description: overrides the default adaptor prepare method. + All slice sql will usually use the dna_db. + Returntype : DBD::sth + Exceptions : none + Caller : internal, convenience method + Status : Stable + +=cut + +sub prepare { + my ( $self, $sql ) = @_; + return $self->db()->dnadb()->dbc->prepare($sql); +} + +sub _build_exception_cache { + my $self = shift; + + # build up a cache of the entire assembly exception table + # it should be small anyway + my $sth = + $self->prepare( 'SELECT ae.seq_region_id, ae.seq_region_start, ' + . 'ae.seq_region_end, ae.exc_type, ae.exc_seq_region_id, ' + . 'ae.exc_seq_region_start, ae.exc_seq_region_end ' + . 'FROM assembly_exception ae, ' + . 'seq_region sr, coord_system cs ' + . 'WHERE sr.seq_region_id = ae.seq_region_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my %hash; + $self->{'asm_exc_cache'} = \%hash; + + my $row; + while ( $row = $sth->fetchrow_arrayref() ) { + my @result = @$row; + $hash{ $result[0] } ||= []; + push( @{ $hash{ $result[0] } }, \@result ); + } + $sth->finish(); +} ## end sub _build_exception_cache + +=head2 cache_toplevel_seq_mappings + + Args : none + Example : $slice_adaptor->cache_toplevel_seq_mappings(); + Description: caches all the assembly mappings needed for genes + Returntype : None + Exceptions : None + Caller : general + Status : At Risk + : New experimental code + +=cut + +sub cache_toplevel_seq_mappings { + my ($self) = @_; + + # Get the sequence level to map too + + my $sql = (<prepare($sql); + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my $sequence_level = $sth->fetchrow_array(); + + $sth->finish(); + + my $csa = $self->db->get_CoordSystemAdaptor(); + my $ama = $self->db->get_AssemblyMapperAdaptor(); + + my $cs1 = $csa->fetch_by_name($sequence_level); + + #get level to map too. + + $sql = (<prepare($sql); + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + while ( my $csn = $sth->fetchrow_array() ) { + if ( $csn eq $sequence_level ) { next } + my $cs2 = $csa->fetch_by_name($csn); + my $am = $ama->fetch_by_CoordSystems( $cs1, $cs2 ); + $am->register_all(); + } + +} ## end sub cache_toplevel_seq_mappings + + +sub _build_circular_slice_cache { + my $self = shift; + + # build up a cache of circular sequence region ids + my $sth = + $self->prepare( "SELECT sra.seq_region_id FROM seq_region_attrib sra " + . "INNER JOIN attrib_type at ON sra.attrib_type_id = at.attrib_type_id " + . "INNER JOIN seq_region sr ON sra.seq_region_id = sr.seq_region_id " + . "INNER JOIN coord_system cs ON sr.coord_system_id = cs.coord_system_id " + . "WHERE code = 'circular_seq' and cs.species_id = ?"); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my $id; + my %hash; + if ( ($id) = $sth->fetchrow_array() ) { + $self->{'circular_sr_id_cache'} = \%hash; + $self->{'is_circular'} = 1; + $hash{ $id } = $id; + while ( ($id) = $sth->fetchrow_array() ) { + $hash{ $id } = $id; + } + } else { + $self->{'is_circular'} = 0; + } + $sth->finish(); +} ## end _build_circular_slice_cache + + +##################################### +# sub DEPRECATED METHODs +##################################### + +=head2 fetch_by_mapfrag + + Function: DEPRECATED use fetch_by_misc_feature_attribute('synonym',$mapfrag) + +=cut + +sub fetch_by_mapfrag{ + my ($self,$mymapfrag,$flag,$size) = @_; + deprecate('Use fetch_by_misc_feature_attribute instead'); + $flag ||= 'fixed-width'; # alt.. 'context' + $size ||= $flag eq 'fixed-width' ? 100000 : 0; + return $self->fetch_by_misc_feature_attribute('synonym',$mymapfrag,$size); +} + + + +=head2 fetch_by_chr_start_end + + Description: DEPRECATED use fetch_by_region instead + +=cut + +sub fetch_by_chr_start_end { + my ($self,$chr,$start,$end) = @_; + deprecate('Use fetch_by_region() instead'); + + #assume that by chromosome the user actually meant top-level coord + #system since this is the old behaviour of this deprecated method + my $csa = $self->db->get_CoordSystemAdaptor(); + my ($cs) = @{$csa->fetch_all()}; # get the highest coord system + + return $self->fetch_by_region($cs->name,$chr,$start,$end,1,$cs->version); +} + + + +=head2 fetch_by_contig_name + + Description: Deprecated. Use fetch_by_region(), Slice::project(), + Slice::expand() instead + +=cut + +sub fetch_by_contig_name { + my ($self, $name, $size) = @_; + + deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); + + #previously wanted chromosomal slice on a given contig. Assume this means + #a top-level slice on a given seq_region in the seq_level coord system + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $seq_level = $csa->fetch_sequence_level(); + + my $seq_lvl_slice = $self->fetch_by_region($seq_level->name(), $name); + + if(!$seq_lvl_slice) { + return undef; + } + + my @projection = @{$seq_lvl_slice->project('toplevel')}; + + if(@projection != 1) { + warning("$name is mapped to multiple toplevel locations."); + } + + return $projection[0]->[2]->expand($size, $size); +} + + +=head2 fetch_by_clone_accession + + Description: DEPRECATED. Use fetch_by_region, Slice::project, Slice::expand + instead. + +=cut + +sub fetch_by_clone_accession{ + my ($self,$name,$size) = @_; + + deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $clone_cs = $csa->fetch_by_name('clone'); + + if(!$clone_cs) { + warning('Clone coordinate system does not exist for this species'); + return undef; + } + + #this unfortunately needs a version on the end to work + if(! ($name =~ /\./)) { + my $sth = + $self->prepare( "SELECT sr.name " + . "FROM seq_region sr, coord_system cs " + . "WHERE cs.name = 'clone' " + . "AND cs.coord_system_id = sr.coord_system_id " + . "AND sr.name LIKE '$name.%'" + . "AND cs.species_id = ?" ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + if(!$sth->rows()) { + $sth->finish(); + throw("Clone $name not found in database"); + } + + ($name) = $sth->fetchrow_array(); + + $sth->finish(); + } + + my $clone = $self->fetch_by_region($clone_cs->name(), $name); + return undef if(!$clone); + + my @projection = @{$clone->project('toplevel')}; + + if(@projection != 1) { + warning("$name is mapped to multiple locations."); + } + + return $projection[0]->[2]->expand($size, $size); +} + + +=head2 fetch_by_supercontig_name + + Description: DEPRECATED. Use fetch_by_region(), Slice::project() and + Slice::expand() instead + +=cut + +sub fetch_by_supercontig_name { + my ($self,$name, $size) = @_; + + deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $sc_level = $csa->fetch_by_name('supercontig'); + + if(!$sc_level) { + warning('No supercontig coordinate system exists for this species.'); + return undef; + } + + my $sc_slice = $self->fetch_by_region($sc_level->name(),$name); + + return undef if(!$sc_slice); + + my @projection = @{$sc_slice->project('toplevel')}; + + if(@projection > 1) { + warning("$name is mapped to multiple locations in toplevel"); + } + + return $projection[0]->[2]->expand($size, $size); +} + + + + +=head2 list_overlapping_supercontigs + + Description: DEPRECATED use Slice::project instead + +=cut + +sub list_overlapping_supercontigs { + my ($self,$slice) = @_; + + deprecate('Use Slice::project() instead.'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $sc_level = $csa->fetch_by_name('supercontig'); + + if(!$sc_level) { + warning('No supercontig coordinate system exists for this species.'); + return undef; + } + + my @out; + foreach my $seg (@{$slice->project($sc_level->name(), $sc_level->version)}){ + push @out, $seg->[2]->seq_region_name(); + } + + return \@out; +} + + + +=head2 fetch_by_chr_name + + Description: DEPRECATED. Use fetch by region instead + +=cut + +sub fetch_by_chr_name{ + my ($self,$chr_name) = @_; + deprecate('Use fetch_by_region() instead.'); + + my $csa = $self->db->get_CoordSystemAdaptor(); + + my $top_cs = @{$csa->fetch_all()}; + + return $self->fetch_by_region($top_cs->name(),$chr_name, + undef,undef,undef,$top_cs->version); +} + + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SplicingEventAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SplicingEventAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,532 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SlicingEventAdaptor - Database adaptor for the retrieval and +storage of SplicingEvent objects + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous', + ); + + $se_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "SplicingEvent" ); + + $se = $se_adaptor->fetch_by_dbID(12); + + $slice_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" ); + + $slice = + $slice_adaptor->fetch_by_region( 'chromosome', '1', 1, 1000000 ); + + @ase = @{ $se_adaptor->fetch_all_by_Slice($slice) }; + +=head1 DESCRIPTION + +This is a database aware adaptor for the retrieval and storage ofSlicingEvents +objects. + +=head1 METHODS + +=cut +package Bio::EnsEMBL::DBSQL::SplicingEventAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::SplicingEvent; + +use vars '@ISA'; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + +=head2 list_dbIDs + + Example : @gene_ids = @{$gene_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all genes in the current db + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : Listref of Ints + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("splicing_event", undef, $ordered); +} + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch genes on. + Arg [2] : type of Transcript event + Arg [3] : (optional) boolean $load_features + If true, transcript will be loaded immediately rather + than lazy loaded later. +=cut + +sub fetch_all_by_Slice { + my ( $self, $slice, $type, $load_features ) = @_; + + my $constraint = ''; + + if ( defined($type) ) { + $constraint .= sprintf( " AND at.code = %s", + $self->dbc()->db_handle()->quote($type) ); + } + + my $tes = + $self->SUPER::fetch_all_by_Slice_constraint( $slice, $constraint ); + + # Is there any use in having a splice event without the pairs and + # features?? + + if ( !$load_features || scalar( @{$tes} ) < 2 ) { + return $tes; + } + + # Load pairs and features. + foreach my $te ( @{$tes} ) { + $te->get_all_Features(); + $te->get_all_Pairs(); + } + + return $tes; +} ## end sub fetch_all_by_Slice + + +sub fetch_all_by_Gene { + my ( $self, $gene ) = @_; + + my $sth = $self->dbc->prepare( + q( +SELECT se.splicing_event_id, + se.seq_region_id, + se.seq_region_start, + se.seq_region_end, + se.seq_region_strand, + se.name, + at.code +FROM splicing_event se + JOIN attrib_type at USING (attrib_type_id) +WHERE se.gene_id =) . $gene->dbID() ); + + $sth->execute(); + + my ( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, $type ); + + $sth->bind_columns( + \( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, + $type ) ); + + my @splicing_events; + + my $sa = $self->db()->get_SliceAdaptor(); + + while ( $sth->fetch() ) { + my $slice = + $sa->fetch_by_seq_region_id( $seq_region_id, + $seq_region_start, + $seq_region_end, + $seq_region_strand ); + + push( @splicing_events, + $self->_create_feature_fast( 'Bio::EnsEMBL::SplicingEvent', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $splicing_event_id, + 'name' => $name, + 'gene_id' => $gene->dbID(), + 'type' => $type } ) ); + } + + foreach my $te (@splicing_events) { + $te->get_all_Features(); + $te->get_all_Pairs(); + } + + return \@splicing_events; +} ## end sub fetch_all_by_Gene + +sub fetch_all_by_Exon { + my ( $self, $exon ) = @_; + + my $sth = $self->dbc()->prepare( + q( +SELECT DISTINCT splicing_event_id +FROM splicing_event_feature +WHERE exon_id =) . $exon->dbID() ); + + $sth->execute(); + + my $se_id; + $sth->bind_col( 1, \$se_id ); + + my @list; + while ( $sth->fetch() ) { + push( @list, $se_id ); + } + + $sth = $self->dbc->prepare( + q( +SELECT se.splicing_event_id, + se.seq_region_id, + se.seq_region_start, + se.seq_region_end, + se.seq_region_strand, + se.name, + at.code, + se.gene_id +FROM splicing_event se + JOIN attrib_type at USING (attrib_type_id) +WHERE se.splicing_event_id in ) . '(' . join( ',', @list ) . ')' ); + + $sth->execute(); + + my ( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, $type, $gene_id ); + + $sth->bind_columns( + \( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, + $type, $gene_id ) ); + + my @splicing_events; + + my $sa = $self->db->get_SliceAdaptor(); + + while ( $sth->fetch ) { + my $slice = + $sa->fetch_by_seq_region_id( $seq_region_id, + $seq_region_start, + $seq_region_end, + $seq_region_strand ); + + push( @splicing_events, + $self->_create_feature_fast( 'Bio::EnsEMBL::SplicingEvent', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $splicing_event_id, + 'name' => $name, + 'gene_id' => $gene_id, + 'type' => $type } ) ); + } + + foreach my $te (@splicing_events) { + $te->get_all_Features(); + $te->get_all_Pairs(); + } + + return \@splicing_events; +} ## end sub fetch_all_by_Exon + +sub fetch_all_by_Transcript { + my ( $self, $transcript ) = @_; + + my $sth = $self->dbc->prepare( + q( +SELECT DISTINCT splicing_event_id +FROM splicing_event_feature +WHERE transcript_id =) . $transcript->dbID() ); + + $sth->execute(); + + my $se_id; + $sth->bind_col( 1, \$se_id ); + + my @list; + while ( $sth->fetch() ) { + push( @list, $se_id ); + } + + $sth = $self->dbc->prepare( + q( +SELECT se.splicing_event_id, + se.seq_region_id, + se.seq_region_start, + se.seq_region_end, + se.seq_region_strand, + se.name, + at.code, + se.gene_id +FROM splicing_event se + JOIN attrib_type at USING (attrib_type_id) +WHERE se.splicing_event_id in ) . '(' . join( ',', @list ) . ')' ); + + $sth->execute(); + + my ( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, $type, $gene_id ); + + $sth->bind_columns( + \( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, + $type, $gene_id ) ); + + my @splicing_events; + + my $sa = $self->db()->get_SliceAdaptor(); + + while ( $sth->fetch() ) { + my $slice = + $sa->fetch_by_seq_region_id( $seq_region_id, + $seq_region_start, + $seq_region_end, + $seq_region_strand ); + + push( @splicing_events, + $self->_create_feature_fast( 'Bio::EnsEMBL::SplicingEvent', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $splicing_event_id, + 'name' => $name, + 'gene_id' => $gene_id, + 'type' => $type } ) ); + } + + foreach my $te (@splicing_events) { + $te->get_all_Features(); + $te->get_all_Pairs(); + } + + return \@splicing_events; +} ## end sub fetch_all_by_Transcript + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _tables { + return ( [ 'splicing_event', 'se' ], [ 'attrib_type', 'at' ] ); +} + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _columns { + return ( 'se.splicing_event_id', 'se.seq_region_id', + 'se.seq_region_start', 'se.seq_region_end', + 'se.seq_region_strand', 'se.name', + 'se.gene_id', 'at.code' ); +} + +sub _left_join { + return ( [ 'attrib_type', 'at.attrib_type_id = se.attrib_type_id' ] ); +} + +sub _objs_from_sth { + my ( $self, $sth, $mapper, $dest_slice ) = @_; + + my ( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, $gene_id, $type ); + + $sth->bind_columns( + \( $splicing_event_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $name, + $gene_id, $type ) ); + + my $sa = $self->db()->get_SliceAdaptor(); + + my @splicing_events; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if ( defined($mapper) ) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_cs; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + my $asma; + + if ( defined($dest_slice) ) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_cs = $dest_slice->coord_system(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + $asma = $self->db->get_AssemblyMapperAdaptor(); + } + +FEATURE: + while ( $sth->fetch() ) { + # Need to get the internal_seq_region, if present. + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + + my $slice = $slice_hash{ "ID:" . $seq_region_id }; + my $dest_mapper = $mapper; + + if ( !$slice ) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{ "ID:" . $seq_region_id } = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + # Obtain a mapper if none was defined, but a dest_seq_region was. + if ( !defined($dest_mapper) + && defined($dest_slice) + && !$dest_slice_cs->equals( $slice->coord_system ) ) + { + $dest_mapper = + $asma->fetch_by_CoordSystems( $dest_slice_cs, + $slice->coord_system ); + $asm_cs = $dest_mapper->assembled_CoordSystem(); + $cmp_cs = $dest_mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system if a mapper + # was provided. + if ( defined($dest_mapper) ) { + + if (defined $dest_slice && $dest_mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $dest_mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = $dest_mapper->fastmap( $sr_name, $seq_region_start, + $seq_region_end, $seq_region_strand, + $sr_cs ); + } + + # Skip features that map to gaps or coord system boundaries. + if ( !defined($seq_region_id) ) { next FEATURE } + + # Get a slice in the coord system we just mapped to. + $slice = $slice_hash{ "ID:" . $seq_region_id } ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + + # If a destination slice was provided convert the coords. If the + # dest_slice starts at 1 and is foward strand, nothing needs doing. + if ( defined($dest_slice) ) { + if ( $dest_slice_start != 1 || $dest_slice_strand != 1 ) { + if ( $dest_slice_strand == 1 ) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand = -$seq_region_strand; + } + } + + # Throw away features off the end of the requested slice. + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_id != $seq_region_id ) ) + { + next FEATURE; + } + + $slice = $dest_slice; + } + + # Finally, create the new splicing_event. + push( @splicing_events, + $self->_create_feature_fast( 'Bio::EnsEMBL::SplicingEvent', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $splicing_event_id, + 'name' => $name, + 'gene_id' => $gene_id, + 'type' => $type } ) ); + + } ## end while ( $sth->fetch() ) + + return \@splicing_events; +} ## end sub _objs_from_sth + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SplicingEventFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SplicingEventFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,147 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SlicingEventFeatureAdaptor - Database adaptor for the retrieval and +storage of SplicingEventFeature objects + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous', + ); + + $se_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "SplicingEventFeature" ); + + +=head1 DESCRIPTION + +This is a database aware adaptor for the retrieval and storage of SlicingEventFeatures +objects. + +=head1 METHODS + +=cut +package Bio::EnsEMBL::DBSQL::SplicingEventFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::SplicingEventFeature; + +use vars '@ISA'; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + + +sub fetch_all_by_SplicingEvent{ + my ($self, $splicing_event) = @_; + + + my ($splicing_event_feature_id, $splicing_event_id, $exon_id, $transcript_id, $feature_order, $transcript_association, $type, $start, $end); + + $splicing_event_id = $splicing_event->dbID; + + my $sql = "select splicing_event_feature_id, exon_id, transcript_id, feature_order, transcript_association, type, start, end from splicing_event_feature where splicing_event_id = $splicing_event_id"; + + my $sth = $self->prepare($sql); + + $sth->execute(); + $sth->bind_columns(\$splicing_event_feature_id, \$exon_id, \$transcript_id, \$feature_order, \$transcript_association, \$type, \$start, \$end); + + my @features; + + while($sth->fetch()){ + push( @features, + $self->_create_feature_fast( 'Bio::EnsEMBL::SplicingEventFeature', { + 'start' => $start, + 'end' => $end, + 'adaptor' => $self, + 'dbID' => $splicing_event_feature_id, + 'exon_id' => $exon_id, + 'transcript_id' => $transcript_id, + 'slice' => $splicing_event->slice, + 'type' => $type, + 'feature_order' => $feature_order, + 'transcript_association' => $transcript_association + } ) ); + + } + $sth->finish; + return \@features; + +} + + + + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _tables { + my $self = shift; + + return ([ 'splicing_event_feature', 'sef' ]); +} + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _columns { + my $self = shift; + +# my $created_date = $self->db->dbc->from_date_to_seconds("gsi.created_date"); +# my $modified_date = $self->db->dbc->from_date_to_seconds("gsi.modified_date"); + + return ( 'sef.splicing_event_id','sef.exon_id', 'sef.feature_order', 'sef.transcript_association', 'sef.type', 'sef.start', 'sef.end' ); + +} + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("splicing_event_feature", undef, $ordered); +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SplicingTranscriptPairAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SplicingTranscriptPairAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SlicingTranscriptPairAdaptor - Database adaptor for the retrieval and +storage of SplicingTranscriptPair objects + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous', + ); + + $se_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "SplicingTranscriptPair" ); + + +=head1 DESCRIPTION + +This is a database aware adaptor for the retrieval and storage of SplicingTranscriptPairs +objects. + +=head1 METHODS + +=cut +package Bio::EnsEMBL::DBSQL::SplicingTranscriptPairAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::SplicingTranscriptPair; + +use vars '@ISA'; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + + +sub fetch_all_by_SplicingEvent{ + my ($self, $splicing_event) = @_; + + + my ($splicing_transcript_pair_id, $transcript_id_1, $transcript_id_2); + + my $splicing_event_id = $splicing_event->dbID; + + my $sql = "select splicing_transcript_pair_id, transcript_id_1, transcript_id_2 from splicing_transcript_pair where splicing_event_id = $splicing_event_id"; + + my $sth = $self->prepare($sql); + + $sth->execute(); + $sth->bind_columns(\$splicing_transcript_pair_id, \$transcript_id_1, \$transcript_id_2); + + my @pairs; + + while($sth->fetch()){ + push( @pairs, + $self->_create_feature_fast( 'Bio::EnsEMBL::SplicingTranscriptPair', { + 'adaptor' => $self, + 'dbID' => $splicing_transcript_pair_id, + 'transcript_id_1' => $transcript_id_1, + 'transcript_id_2' => $transcript_id_2, + 'start' => $splicing_event->start, + 'end' => $splicing_event->end + } ) ); + + } + $sth->finish; + return \@pairs; + +} + + + + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _tables { + my $self = shift; + + return ([ 'splicing_transcript_pair', 'stp' ]); +} + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _columns { + my $self = shift; + +# my $created_date = $self->db->dbc->from_date_to_seconds("gsi.created_date"); +# my $modified_date = $self->db->dbc->from_date_to_seconds("gsi.modified_date"); + + return ( 'stp.splicing_transcript_pair_id','stp.transcript_id_1', 'stp.transcript_id_2'); + +} + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("splicing_transcript_pair", undef, $ordered); +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/StatementHandle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/StatementHandle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,264 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::StatementHandle + +=head1 SYNOPSIS + +Do not use this class directly. It will automatically be used by the +Bio::EnsEMBL::DBSQL::DBConnection class. + +=head1 DESCRIPTION + +This class extends DBD::mysql::st so that the DESTROY method may be +overridden. If the DBConnection::disconnect_when_inactive flag is set +this statement handle will cause the database connection to be closed +when it goes out of scope and there are no other open statement handles. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::StatementHandle; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(warning throw); + +use DBI; + +#use Time::HiRes qw(time); + +@ISA = qw(DBI::st); + + +# As DBD::mysql::st is a tied hash can't store things in it, +# so have to have parallel hash +my %dbchash; +my %dbc_sql_hash; + + +sub dbc { + my $self = shift; + + if (@_) { + my $dbc = shift; + if(!defined($dbc)) { + # without delete key space would grow indefinitely causing mem-leak + delete($dbchash{$self}); + } else { + $dbchash{$self} = $dbc; + } + } + + return $dbchash{$self}; +} + +sub sql { + my $self = shift; + + if (@_) { + my $sql = shift; + if(!defined($sql)) { + # without delete key space would grow indefinitely causing mem-leak + delete($dbc_sql_hash{$self}); + } else { + $dbc_sql_hash{$self} = $sql; + } + } + + return $dbc_sql_hash{$self}; +} + +sub DESTROY { + my ($self) = @_; + + my $dbc = $self->dbc; + $self->dbc(undef); + my $sql = $self->sql; + $self->sql(undef); + + # Re-bless into DBI::st so that superclass destroy method is called if + # it exists (it does not exist in all DBI versions). + bless( $self, 'DBI::st' ); + + # The count for the number of kids is decremented only after this + # function is complete. Disconnect if there is 1 kid (this one) + # remaining. + if ( $dbc + && $dbc->disconnect_when_inactive() + && $dbc->connected + && ( $dbc->db_handle->{Kids} == 1 ) ) + { + if ( $dbc->disconnect_if_idle() ) { + warn("Problem disconnect $self around sql = $sql\n"); + } + } +} ## end sub DESTROY + +1; + +# Comment out this "__END__" for printing out handy debug information +# (every query if you want). + +__END__ + +# To stop caching messing up your timings, try doing the following on +# any adapter: +# +# $slice_adaptor->dbc()->db_handle() +# ->do("SET SESSION query_cache_type = OFF"); +# +# To start logging: +# Bio::EnsEMBL::DBSQL::StatementHandle->sql_timing_start(); +# +# To display the results: +# Bio::EnsEMBL::DBSQL::StatementHandle->sql_timing_print(1); +# +# To pause logging: +# Bio::EnsEMBL::DBSQL::StatementHandle->sql_timimg_pause(); +# +# To resume logging after pause: +# Bio::EnsEMBL::DBSQL::StatementHandle->sql_timimg_resume(); + +use Time::HiRes qw(time); + +my @bind_args = (); +my $dump = 0; +my %total_time; +my %min_time; +my %max_time; +my %number_of_times; +my %first_time; +my $grand_total; + +sub sql_timing_start { + %total_time = (); + %number_of_times = (); + %min_time = (); + %max_time = (); + %first_time = (); + $dump = 1; +} + +sub sql_timing_pause { $dump = 0 } +sub sql_timing_resume { $dump = 1 } + +sub sql_timing_print { + my ( $self, $level, $fh ) = @_; + + my $grand_total = 0; + + if ( !defined($fh) ) { + $fh = \*STDERR; + } + + print( ref($fh), "\n" ); + + foreach my $key ( keys %total_time ) { + $grand_total += $total_time{$key}; + + if ( !( defined($level) and $level ) ) { next } + + print( $fh $key, "\n" ); + + print( $fh + "total\t \tnum\tfirst \t\tavg\t \t[min ,max ]\n" ); + + printf( $fh "%6f\t%d\t%6f\t%6f\t[%6f, %6f]\n\n", + $total_time{$key}, $number_of_times{$key}, + $first_time{$key}, ( $total_time{$key}/$number_of_times{$key} ), + $min_time{$key}, $max_time{$key} ); + } + + printf( $fh "\ntotal time %6f\n\n", $grand_total ); + +} ## end sub sql_timing_print + +sub bind_param { + my ( $self, @args ) = @_; + + $bind_args[ $args[0] - 1 ] = $args[1]; + $self->SUPER::bind_param(@args); +} + +sub execute { + my ( $self, @args ) = @_; + + my $retval; + # Skip dumping if !$dump + if ( !$dump ) { + local $self->{RaiseError}; + $retval = $self->SUPER::execute(@args); + if ( !defined($retval) ) { + throw("Failed to execute SQL statement"); + } + return $retval; + } + + my $sql = $self->sql(); + my @chrs = split( //, $sql ); + + my $j = 0; + + for ( my $i = 0; $i < @chrs; $i++ ) { + if ( $chrs[$i] eq '?' && defined( $bind_args[$j] ) ) { + $chrs[$i] = $bind_args[ $j++ ]; + } + } + + my $str = join( '', @chrs ); + + # Uncomment this line if you want to see sql in order. + # print( STDERR "\n\nSQL:\n$str\n\n" ); + + my $time = time(); + { + local $self->{RaiseError}; + $retval = $self->SUPER::execute(@args); + if ( !defined($retval) ) { + throw("Failed to execute SQL statement"); + } + } + # my $res = $self->SUPER::execute(@args); + $time = time() - $time; + + if ( defined( $total_time{$sql} ) ) { + $total_time{$sql} += $time; + $number_of_times{$sql}++; + + if ( $min_time{$sql} > $time ) { $min_time{$sql} = $time } + if ( $max_time{$sql} < $time ) { $max_time{$sql} = $time } + + } else { + $first_time{$sql} = $time; + $max_time{$sql} = $time; + $min_time{$sql} = $time; + $total_time{$sql} = $time; + $number_of_times{$sql} = 1; + } + + return $retval; +} ## end sub execute + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/StrainSliceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/StrainSliceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,361 @@ +=head1 LICENSE + + Copyright (c) 1999-2009 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::StrainSliceAdaptor - adaptor/factory for MappedSlices +representing alternative assemblies + +=head1 SYNOPSIS + + my $slice = + $slice_adaptor->fetch_by_region( 'chromosome', 14, 900000, 950000 ); + + my $msc = Bio::EnsEMBL::MappedSliceContainer->new(-SLICE => $slice); + + # create a new strain slice adaptor and attach it to the MSC + my $ssa = Bio::EnsEMBL::DBSQL::StrainSliceAdaptor->new($sa->db); + $msc->set_StrainSliceAdaptor($ssa); + + # now attach strain + $msc->attach_StrainSlice('Watson'); + +=head1 DESCRIPTION + +NOTE: this code is under development and not fully functional nor tested +yet. Use only for development. + +This adaptor is a factory for creating MappedSlices representing +strains and attaching them to a MappedSliceContainer. A mapper will be created +to map between the reference slice and the common container slice coordinate +system. + +=head1 METHODS + + new + fetch_by_name + +=head1 REALTED MODULES + + Bio::EnsEMBL::MappedSlice + Bio::EnsEMBL::MappedSliceContainer + Bio::EnsEMBL::AlignStrainSlice + Bio::EnsEMBL::StrainSlice + +=cut + +package Bio::EnsEMBL::DBSQL::StrainSliceAdaptor; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::MappedSlice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Example : my $strain_slice_adaptor = + Bio::EnsEMBL::DBSQL::StrainSliceAdaptor->new; + Description : Constructor. + Return type : Bio::EnsEMBL::DBSQL::StrainSliceAdaptor + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + return $self; +} + + +=head2 fetch_by_name + + Arg[1] : Bio::EnsEMBL::MappedSliceContainer $container - the container + to attach MappedSlices to + Arg[2] : String $name - the name of the strain to fetch + Example : my ($mapped_slice) = @{ $msc->fetch_by_name('Watson') }; + Description : Creates a MappedSlice representing a version of the container's + reference slice with variant alleles from the named strain + Return type : listref of Bio::EnsEMBL::MappedSlice + Exceptions : thrown on wrong or missing arguments + Caller : general, Bio::EnsEMBL::MappedSliceContainer + Status : At Risk + : under development + +=cut + +sub fetch_by_name { + my $self = shift; + my $container = shift; + my $name = shift; + + # argueent check + unless ($container and ref($container) and + $container->isa('Bio::EnsEMBL::MappedSliceContainer')) { + throw("Need a MappedSliceContainer."); + } + + unless ($name) { + throw("Need a strain name."); + } + + my $slice = $container->ref_slice; + + # get a connection to the variation DB + my $variation_db = $self->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to retrieve variation information"); + return ''; + } + + # now get an allele feature adaptor + my $af_adaptor = $variation_db->get_AlleleFeatureAdaptor; + + # check we got it + unless(defined $af_adaptor) { + warning("Not possible to retrieve AlleleFeatureAdaptor from variation database"); + return ''; + } + + # now get an individual adaptor + my $ind_adaptor = $variation_db->get_IndividualAdaptor; + + # check we got it + unless(defined $ind_adaptor) { + warning("Not possible to retrieve IndividualAdaptor from variation database"); + return ''; + } + + # fetch individual object for this strain name + my $ind = shift @{$ind_adaptor->fetch_all_by_name($name)}; + + # check we got a result + unless(defined $ind) { + warn("Strain ".$name." not found in the database"); + return ''; + } + + + ## MAP STRAIN SLICE TO REF SLICE + ################################ + + # create a mapper + my $mapper = Bio::EnsEMBL::Mapper->new('mapped_slice', 'ref_slice'); + + # create a mapped_slice object + my $mapped_slice = Bio::EnsEMBL::MappedSlice->new( + -ADAPTOR => $self, + -CONTAINER => $container, + -NAME => $slice->name."\#strain_$name", + ); + + # get the strain slice + my $strain_slice = $slice->get_by_strain($ind->name); + + # get all allele features for this slice and individual + #my @afs = sort {$a->start() <=> $b->start()} @{$af_adaptor->fetch_all_by_Slice($slice, $ind)}; + + # get allele features with coverage info + my $afs = $strain_slice->get_all_AlleleFeatures_Slice(1); + + # check we got some data + #warning("No strain genotype data available for slice ".$slice->name." and strain ".$ind->name) if ! defined $afs[0]; + + + my $start_slice = $slice->start; + my $start_strain = 1; + my $sr_name = $slice->seq_region_name; + #my $sr_name = 'ref_slice'; + my ($end_slice, $end_strain, $allele_length); + + my $indel_flag = 0; + my $total_length_diff = 0; + + # check for AFs + if(defined($afs) && scalar @$afs) { + + # go through each AF + foreach my $af(@$afs) { + + # find out if it changes the length + if($af->length_diff != 0) { + + $indel_flag = 1; + $total_length_diff += $af->length_diff; + + # get the allele length + $allele_length = $af->length + $af->length_diff(); + + $end_slice = $slice->start + $af->start() - 2; + + if ($end_slice >= $start_slice){ + $end_strain = $end_slice - $start_slice + $start_strain; + + #add the sequence that maps + $mapper->add_map_coordinates('mapped_slice', $start_strain, $end_strain, 1, $sr_name, $start_slice, $end_slice); + + #add the indel + $mapper->add_indel_coordinates('mapped_slice', $end_strain+1, $end_strain+$allele_length, 1, $sr_name,$end_slice+1,$end_slice + $af->length); + + $start_strain = $end_strain + $allele_length + 1; + } + + else{ + + #add the indel + $mapper->add_indel_coordinates('mapped_slice', $end_strain+1,$end_strain + $allele_length, 1, $sr_name,$end_slice+1,$end_slice + $af->length); + + $start_strain += $allele_length; + } + + $start_slice = $end_slice + $af->length+ 1; + } + } + } + + # add the remaining coordinates (or the whole length if no indels found) + $mapper->add_map_coordinates('mapped_slice', $start_strain, $start_strain + ($slice->end - $start_slice), 1, $sr_name, $start_slice, $slice->end); + + # add the slice/mapper pair + $mapped_slice->add_Slice_Mapper_pair($strain_slice, $mapper); + + + + ## MAP REF_SLICE TO CONTAINER SLICE + ################################### + + if($total_length_diff > 0) { + + # create a new mapper + my $new_mapper = Bio::EnsEMBL::Mapper->new('ref_slice', 'container'); + + # get existing pairs + my @existing_pairs = $container->mapper->list_pairs('container', 1, $container->container_slice->length, 'container'); + my @new_pairs = $mapper->list_pairs('mapped_slice', 1, $strain_slice->length(), 'mapped_slice'); + + # we need a list of indels (specifically inserts) + my @indels; + + # go through existing first + foreach my $pair(@existing_pairs) { + + if($pair->from->end - $pair->from->start != $pair->to->end - $pair->to->start) { + my $indel; + $indel->{'length_diff'} = ($pair->to->end - $pair->to->start) - ($pair->from->end - $pair->from->start); + + # we're only interested in inserts here, not deletions + next unless $indel->{'length_diff'} > 0; + + $indel->{'ref_start'} = $pair->from->start; + $indel->{'ref_end'} = $pair->from->end; + $indel->{'length'} = $pair->to->end - $pair->to->start; + + push @indels, $indel; + } + } + + # now new ones + foreach my $pair(@new_pairs) { + + if($pair->from->end - $pair->from->start != $pair->to->end - $pair->to->start) { + my $indel; + $indel->{'length_diff'} = ($pair->from->end - $pair->from->start) - ($pair->to->end - $pair->to->start); + + # we're only interested in inserts here, not deletions + next unless $indel->{'length_diff'} > 0; + + $indel->{'ref_start'} = $pair->to->start; + $indel->{'ref_end'} = $pair->to->end; + $indel->{'length'} = $pair->from->end - $pair->from->start; + + push @indels, $indel; + } + } + + # sort them + @indels = sort { + $a->{'ref_start'} <=> $b->{'ref_start'} || # by position + $b->{'length_diff'} <=> $a->{'length_diff'} # then by length diff so we only keep the longest + } @indels; + + # clean them + my @new_indels = (); + my $p = $indels[0]; + push @new_indels, $indels[0] if scalar @indels; + + for my $i(1..$#indels) { + my $c = $indels[$i]; + + if($c->{'ref_start'} != $p->{'ref_start'} && $c->{'ref_end'} != $p->{'ref_end'}) { + push @new_indels, $c; + $p = $c; + } + } + + $start_slice = $slice->start; + $start_strain = 1; + $sr_name = $slice->seq_region_name; + #$sr_name = 'ref_slice'; + + foreach my $indel(@new_indels) { + + $end_slice = $indel->{'ref_end'}; + $end_strain = $start_strain + ($end_slice - $start_slice); + + $allele_length = $indel->{'length'} + $indel->{'length_diff'}; + + $new_mapper->add_map_coordinates($sr_name, $start_slice, $end_slice, 1, 'container', $start_strain, $end_strain); + + $new_mapper->add_indel_coordinates($sr_name,$end_slice+1,$end_slice + $indel->{'length'}, 1, 'container', $end_strain+1, $end_strain+$allele_length); + + $start_strain = $end_strain + $allele_length + 1; + $start_slice = $end_slice + $indel->{'length'} + 1; + } + + $new_mapper->add_map_coordinates($sr_name, $start_slice, $slice->end, 1, 'container', $start_strain, $start_strain + ($slice->end - $start_slice)); + + # replace the mapper with the new mapper + $container->mapper($new_mapper); + + # change the container slice's length according to length diff + $container->container_slice($container->container_slice->expand(undef, $total_length_diff, 1)); + } + + return [$mapped_slice]; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/SupportingFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SupportingFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,258 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SupportingFeatureAdaptor - Retrieves supporting +features from the database. + +=head1 SYNOPSIS + + my $sfa = + $registry->get_adaptor( 'Human', 'Core', 'SupportingFeature' ); + + my @supporting_feats = @{ $sfa->fetch_all_by_Exon($exon) }; + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::SupportingFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +#inherits from BaseAdaptor +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + + +=head2 fetch_all_by_Exon + + Arg [1] : Bio::EnsEMBL::Exon $exon + The exon to fetch supporting features. + Example : @sfs = + @{ $supporting_feat_adaptor->fetch_all_by_Exon($exon) }; + Description: Retrieves supporting features (evidence) for a given + exon. + Returntype : List of Bio::EnsEMBL::BaseAlignFeatures in the same + coordinate system as the $exon argument + Exceptions : Warning if $exon is not in the database (i.e. dbID + not defined). + Throw if a retrieved supporting feature is of unknown + type. + Caller : Bio::EnsEMBL::Exon + Status : Stable + +=cut + +sub fetch_all_by_Exon { + my ( $self, $exon ) = @_; + + my $out = []; + + unless($exon->dbID) { + warning("Cannot retrieve evidence for exon without dbID"); + return []; + } + + my $sth = $self->prepare("SELECT sf.feature_type, sf.feature_id + FROM supporting_feature sf + WHERE exon_id = ?"); + + $sth->bind_param(1,$exon->dbID,SQL_INTEGER); + $sth->execute(); + + my $prot_adp = $self->db->get_ProteinAlignFeatureAdaptor; + my $dna_adp = $self->db->get_DnaAlignFeatureAdaptor; + + my $feature; + while(my ($type, $feature_id) = $sth->fetchrow){ + if($type eq 'protein_align_feature'){ + $feature = $prot_adp->fetch_by_dbID($feature_id); + } elsif($type eq 'dna_align_feature'){ + $feature = $dna_adp->fetch_by_dbID($feature_id); + } else { + warning("Unknown feature type [$type]\n"); + } + + if(!$feature) { + warning("Supporting feature $type $feature_id does not exist in DB"); + } else { + my $new_feature = $feature->transfer($exon->slice()); + push @$out, $new_feature if( $new_feature ); + } + } + + $sth->finish(); + + return $out; +} + +=head2 store + + Arg [1] : Int $exonsID + The dbID of an EnsEMBL exon to associate with + supporting features. + Arg [2] : Ref to array of Bio::EnsEMBL::BaseAlignFeature + (the support) + Example : $sfa->store($exon_id, \@features); + Description: Stores a set of alignment features and associates an + EnsEMBL exon with them + Returntype : none + Exceptions : thrown when invalid dbID is passed to this method + Caller : TranscriptAdaptor + Status : Stable + +=cut + +sub store { + my ( $self, $exon_dbID, $aln_objs ) = @_; + + my $pep_check_sql = + "SELECT protein_align_feature_id " . + "FROM protein_align_feature " . + "WHERE seq_region_id = ? " . + "AND seq_region_start = ? " . + "AND seq_region_end = ? " . + "AND seq_region_strand = ? " . + "AND hit_name = ? " . + "AND hit_start = ? " . + "AND hit_end = ? " . + "AND analysis_id = ? " . + "AND cigar_line = ? "; + + my $dna_check_sql = + "SELECT dna_align_feature_id " . + "FROM dna_align_feature " . + "WHERE seq_region_id = ? " . + "AND seq_region_start = ? " . + "AND seq_region_end = ? " . + "AND seq_region_strand = ? " . + "AND hit_name = ? " . + "AND hit_start = ? " . + "AND hit_end = ? " . + "AND analysis_id = ? " . + "AND cigar_line = ? " . + "AND hit_strand = ? "; + + my $assoc_check_sql = + "SELECT * " . + "FROM supporting_feature " . + "WHERE exon_id = $exon_dbID " . + "AND feature_type = ? " . + "AND feature_id = ? "; + + my $assoc_write_sql = "INSERT into supporting_feature " . + "(exon_id, feature_id, feature_type) " . + "values(?, ?, ?)"; + + my $pep_check_sth = $self->prepare($pep_check_sql); + my $dna_check_sth = $self->prepare($dna_check_sql); + my $assoc_check_sth = $self->prepare($assoc_check_sql); + my $sf_sth = $self->prepare($assoc_write_sql); + + my $dna_adaptor = $self->db->get_DnaAlignFeatureAdaptor(); + my $pep_adaptor = $self->db->get_ProteinAlignFeatureAdaptor(); + + foreach my $f (@$aln_objs) { + # check that the feature is in toplevel coords + + if($f->slice->start != 1 || $f->slice->strand != 1) { + #move feature onto a slice of the entire seq_region + my $tls = $self->db->get_sliceAdaptor->fetch_by_region($f->slice->coord_system->name(), + $f->slice->seq_region_name(), + undef, #start + undef, #end + undef, #strand + $f->slice->coord_system->version()); + $f = $f->transfer($tls); + + if(!$f) { + throw('Could not transfer Feature to slice of ' . + 'entire seq_region prior to storing'); + } + } + + if(!$f->isa("Bio::EnsEMBL::BaseAlignFeature")){ + throw("$f must be an align feature otherwise" . + "it can't be stored"); + } + + my ($sf_dbID, $type, $adap, $check_sth); + + my @check_args = ($self->db->get_SliceAdaptor->get_seq_region_id($f->slice), + $f->start, + $f->end, + $f->strand, + $f->hseqname, + $f->hstart, + $f->hend, + $f->analysis->dbID, + $f->cigar_string); + + if($f->isa("Bio::EnsEMBL::DnaDnaAlignFeature")){ + $adap = $dna_adaptor; + $check_sth = $dna_check_sth; + $type = 'dna_align_feature'; + push @check_args, $f->hstrand; + } elsif($f->isa("Bio::EnsEMBL::DnaPepAlignFeature")){ + $adap = $pep_adaptor; + $check_sth = $pep_check_sth; + $type = 'protein_align_feature'; + } else { + warning("Supporting feature of unknown type. Skipping : [$f]\n"); + next; + } + + $check_sth->execute(@check_args); + $sf_dbID = $check_sth->fetchrow_array; + if (not $sf_dbID) { + $adap->store($f); + $sf_dbID = $f->dbID; + } + + # now check association + $assoc_check_sth->execute($type, + $sf_dbID); + if (not $assoc_check_sth->fetchrow_array) { + $sf_sth->bind_param(1, $exon_dbID, SQL_INTEGER); + $sf_sth->bind_param(2, $sf_dbID, SQL_INTEGER); + $sf_sth->bind_param(3, $type, SQL_VARCHAR); + $sf_sth->execute(); + } + } + + $dna_check_sth->finish; + $pep_check_sth->finish; + $assoc_check_sth->finish; + $sf_sth->finish; + +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1952 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::TranscriptAdaptor - An adaptor which performs database +interaction relating to the storage and retrieval of Transcripts + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $transcript_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( 'Human', 'Core', + 'Transcript' ); + + $transcript = $transcript_adaptor->fetch_by_dbID(1234); + + $transcript = + $transcript_adaptor->fetch_by_stable_id('ENST00000201961'); + + $slice = + $slice_adaptor->fetch_by_region( 'Chromosome', '3', 1, 1000000 ); + @transcripts = @{ $transcript_adaptor->fetch_all_by_Slice($slice) }; + + ($transcript) = + @{ $transcript_adaptor->fetch_all_by_external_name('NP_065811.1') }; + +=head1 DESCRIPTION + +This adaptor provides a means to retrieve and store information related +to Transcripts. Primarily this involves the retrieval or storage of +Bio::EnsEMBL::Transcript objects from a database. + +See Bio::EnsEMBL::Transcript for details of the Transcript class. + +=cut + +package Bio::EnsEMBL::DBSQL::TranscriptAdaptor; + +use strict; + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Gene; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Transcript; +use Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); + +use vars qw(@ISA); +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor ); + + +# _tables +# +# Description: PROTECTED implementation of superclass abstract method. +# Returns the names, aliases of the tables to use for queries. +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _tables { + return ( + [ 'transcript', 't' ], + [ 'xref', 'x' ], + [ 'external_db', 'exdb' ] ); +} + + +#_columns +# +# Description: PROTECTED implementation of superclass abstract method. +# Returns a list of columns to use for queries. +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _columns { + my ($self) = @_; + + my $created_date = + $self->db()->dbc()->from_date_to_seconds("created_date"); + my $modified_date = + $self->db()->dbc()->from_date_to_seconds("modified_date"); + + return ( + 't.transcript_id', 't.seq_region_id', + 't.seq_region_start', 't.seq_region_end', + 't.seq_region_strand', 't.analysis_id', + 't.gene_id', 't.is_current', + 't.stable_id', 't.version', + $created_date, $modified_date, + 't.description', 't.biotype', + 't.status', 'exdb.db_name', + 'exdb.status', 'exdb.db_display_name', + 'x.xref_id', 'x.display_label', + 'x.dbprimary_acc', 'x.version', + 'x.description', 'x.info_type', + 'x.info_text' + ); +} + +sub _left_join { + return ( + [ 'xref', "x.xref_id = t.display_xref_id" ], + [ 'external_db', "exdb.external_db_id = x.external_db_id" ] + ); +} + + +=head2 fetch_by_stable_id + + Arg [1] : String $stable_id + The stable id of the transcript to retrieve + Example : my $tr = $tr_adaptor->fetch_by_stable_id('ENST00000309301'); + Description: Retrieves a transcript via its stable id. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ($self, $stable_id) = @_; + + my $constraint = "t.stable_id = ? AND t.is_current = 1"; + + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + + my ($transcript) = @{ $self->generic_fetch($constraint) }; + + return $transcript; +} + + +sub fetch_all { + my ($self) = @_; + + my $constraint = 't.biotype != "LRG_gene" and t.is_current = 1'; + my @trans = @{ $self->generic_fetch($constraint) }; + return \@trans ; +} + +=head2 fetch_all_versions_by_stable_id + + Arg [1] : String $stable_id + The stable ID of the transcript to retrieve + Example : my $tr = $tr_adaptor->fetch_all_version_by_stable_id + ('ENST00000309301'); + Description : Similar to fetch_by_stable_id, but retrieves all versions of a + transcript stored in the database. + Returntype : listref of Bio::EnsEMBL::Transcript objects + Exceptions : if we cant get the gene in given coord system + Caller : general + Status : At Risk + +=cut + +sub fetch_all_versions_by_stable_id { + my ($self, $stable_id) = @_; + + my $constraint = "t.stable_id = ?"; + + $self->bind_param_generic_fetch($stable_id,SQL_VARCHAR); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_by_translation_stable_id + + Arg [1] : String $transl_stable_id + The stable identifier of the translation of the transcript to + retrieve + Example : my $tr = $tr_adaptor->fetch_by_translation_stable_id + ('ENSP00000311007'); + Description: Retrieves a Transcript object using the stable identifier of + its translation. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_translation_stable_id { + my ($self, $transl_stable_id ) = @_; + + my $sth = $self->prepare(qq( + SELECT t.transcript_id + FROM translation tl, + transcript t + WHERE tl.stable_id = ? + AND tl.transcript_id = t.transcript_id + AND t.is_current = 1 + )); + + $sth->bind_param(1, $transl_stable_id, SQL_VARCHAR); + $sth->execute(); + + my ($id) = $sth->fetchrow_array; + $sth->finish; + if ($id){ + return $self->fetch_by_dbID($id); + } else { + return undef; + } +} + + +=head2 fetch_by_translation_id + + Arg [1] : Int $id + The internal identifier of the translation whose transcript + is to be retrieved + Example : my $tr = $tr_adaptor->fetch_by_translation_id($transl->dbID); + Description: Given the internal identifier of a translation this method + retrieves the transcript associated with that translation. + If the transcript cannot be found undef is returned instead. + Returntype : Bio::EnsEMBL::Transcript or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_translation_id { + my ( $self, $p_dbID ) = @_; + + if ( !defined($p_dbID) ) { + throw("dbID argument is required"); + } + + my $sth = + $self->prepare( "SELECT transcript_id " + . "FROM translation " + . "WHERE translation_id = ?" ); + + $sth->bind_param( 1, $p_dbID, SQL_INTEGER ); + $sth->execute(); + + my ($dbID) = $sth->fetchrow_array(); + $sth->finish(); + + if ($dbID) { + return $self->fetch_by_dbID($dbID); + } + + return undef; +} + +=head2 fetch_all_by_Gene + + Arg [1] : Bio::EnsEMBL::Gene $gene + The gene to fetch transcripts of + Example : my $gene = $gene_adaptor->fetch_by_stable_id('ENSG0000123'); + my @transcripts = { $tr_adaptor->fetch_all_by_Gene($gene) }; + Description: Retrieves Transcript objects for given gene. Puts Genes slice + in each Transcript. + Returntype : Listref of Bio::EnsEMBL::Transcript objects + Exceptions : none + Caller : Gene->get_all_Transcripts() + Status : Stable + +=cut + +sub fetch_all_by_Gene { + my ( $self, $gene ) = @_; + + my $constraint = "t.gene_id = " . $gene->dbID(); + + # Use the fetch_all_by_Slice_constraint method because it handles the + # difficult Haps/PARs and coordinate remapping. + + # Get a slice that entirely overlaps the gene. This is because we + # want all transcripts to be retrieved, not just ones overlapping + # the slice the gene is on (the gene may only partially overlap the + # slice). For speed reasons, only use a different slice if necessary + # though. + + my $gslice = $gene->slice(); + + if ( !defined($gslice) ) { + throw("Gene must have attached slice to retrieve transcripts."); + } + + my $slice; + + if ( $gene->start() < 1 || $gene->end() > $gslice->length() ) { + if ( $gslice->is_circular() ) { + $slice = $gslice; + } else { + $slice = $self->db->get_SliceAdaptor->fetch_by_Feature($gene); + } + } else { + $slice = $gslice; + } + + my $transcripts = + $self->fetch_all_by_Slice_constraint( $slice, $constraint ); + + if ( $slice != $gslice ) { + my @out; + foreach my $tr ( @{$transcripts} ) { + push( @out, $tr->transfer($gslice) ); + } + $transcripts = \@out; + } + + my $canonical_t = $gene->canonical_transcript(); + + foreach my $t ( @{$transcripts} ) { + if ( $t->equals($canonical_t) ) { + $t->is_canonical(1); + last; + } + } + + return $transcripts; +} ## end sub fetch_all_by_Gene + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch transcripts on + Arg [2] : (optional) Boolean $load_exons + If true, exons will be loaded immediately rather than + lazy loaded later + Arg [3] : (optional) String $logic_name + The logic name of the type of features to obtain + ARG [4] : (optional) String $constraint + An extra contraint. + Example : my @transcripts = @{ $tr_adaptor->fetch_all_by_Slice($slice) }; + Description: Overrides superclass method to optionally load exons + immediately rather than lazy-loading them later. This + is more efficient when there are a lot of transcripts whose + exons are going to be used. + Returntype : Listref of Bio::EnsEMBL::Transcript objects + Exceptions : thrown if exon cannot be placed on transcript slice + Caller : Slice::get_all_Transcripts + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ( $self, $slice, $load_exons, $logic_name, $constraint ) = @_; + + my $transcripts; + if ( defined($constraint) && $constraint ne '' ) { + $transcripts = $self->SUPER::fetch_all_by_Slice_constraint( $slice, + 't.is_current = 1 AND ' . $constraint, $logic_name ); + } else { + $transcripts = $self->SUPER::fetch_all_by_Slice_constraint( $slice, + 't.is_current = 1', $logic_name ); + } + + # if there are 0 or 1 transcripts still do lazy-loading + if ( !$load_exons || @$transcripts < 2 ) { + return $transcripts; + } + + # preload all of the exons now, instead of lazy loading later + # faster than 1 query per transcript + + # first check if the exons are already preloaded + # @todo FIXME: Should test all exons. + if ( exists( $transcripts->[0]->{'_trans_exon_array'} ) ) { + return $transcripts; + } + + # get extent of region spanned by transcripts + my ( $min_start, $max_end ); + foreach my $tr (@$transcripts) { + if ( !defined($min_start) || $tr->seq_region_start() < $min_start ) + { + $min_start = $tr->seq_region_start(); + } + if ( !defined($max_end) || $tr->seq_region_end() > $max_end ) { + $max_end = $tr->seq_region_end(); + } + } + + my $ext_slice; + + if ( $min_start >= $slice->start() && $max_end <= $slice->end() ) { + $ext_slice = $slice; + } else { + my $sa = $self->db()->get_SliceAdaptor(); + $ext_slice = $sa->fetch_by_region( + $slice->coord_system->name(), $slice->seq_region_name(), + $min_start, $max_end, + $slice->strand(), $slice->coord_system->version() ); + } + + # associate exon identifiers with transcripts + + my %tr_hash = map { $_->dbID => $_ } @{$transcripts}; + + my $tr_id_str = join( ',', keys(%tr_hash) ); + + my $sth = + $self->prepare( "SELECT transcript_id, exon_id, rank " + . "FROM exon_transcript " + . "WHERE transcript_id IN ($tr_id_str)" ); + + $sth->execute(); + + my ( $tr_id, $ex_id, $rank ); + $sth->bind_columns( \( $tr_id, $ex_id, $rank ) ); + + my %ex_tr_hash; + + while ( $sth->fetch() ) { + $ex_tr_hash{$ex_id} ||= []; + push( @{ $ex_tr_hash{$ex_id} }, [ $tr_hash{$tr_id}, $rank ] ); + } + + my $ea = $self->db()->get_ExonAdaptor(); + my $exons = $ea->fetch_all_by_Slice_constraint( + $ext_slice, + sprintf( "e.exon_id IN (%s)", + join( ',', sort { $a <=> $b } keys(%ex_tr_hash) ) ) ); + + # move exons onto transcript slice, and add them to transcripts + foreach my $ex ( @{$exons} ) { + my $new_ex; + if ( $slice != $ext_slice ) { + $new_ex = $ex->transfer($slice); + if ( !defined($new_ex) ) { + throw("Unexpected. " + . "Exon could not be transfered onto Transcript slice." ); + } + } else { + $new_ex = $ex; + } + + foreach my $row ( @{ $ex_tr_hash{ $new_ex->dbID() } } ) { + my ( $tr, $rank ) = @{$row}; + $tr->add_Exon( $new_ex, $rank ); + } + } + + my $tla = $self->db()->get_TranslationAdaptor(); + + # load all of the translations at once + $tla->fetch_all_by_Transcript_list($transcripts); + + return $transcripts; +} ## end sub fetch_all_by_Slice + + +=head2 fetch_all_by_external_name + + Arg [1] : String $external_name + An external identifier of the transcript to be obtained + Arg [2] : (optional) String $external_db_name + The name of the external database from which the + identifier originates. + Arg [3] : Boolean override. Force SQL regex matching for users + who really do want to find all 'NM%' + Example : my @transcripts = + @{ $tr_adaptor->fetch_all_by_external_name( 'NP_065811.1') }; + my @more_transcripts = + @{$tr_adaptor->fetch_all_by_external_name( 'NP_0658__._')}; + Description: Retrieves all transcripts which are associated with + an external identifier such as a GO term, Swissprot + identifer, etc. Usually there will only be a single + transcript returned in the list reference, but not + always. Transcripts are returned in their native + coordinate system, i.e. the coordinate system in which + they are stored in the database. If they are required + in another coordinate system the Transcript::transfer or + Transcript::transform method can be used to convert them. + If no transcripts with the external identifier are found, + a reference to an empty list is returned. + SQL wildcards % and _ are supported in the $external_name + but their use is somewhat restricted for performance reasons. + Users that really do want % and _ in the first three characters + should use argument 3 to prevent optimisations + Returntype : listref of Bio::EnsEMBL::Transcript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_external_name { + my ( $self, $external_name, $external_db_name, $override) = @_; + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + + my @ids = + $entryAdaptor->list_transcript_ids_by_extids( $external_name, + $external_db_name, $override ); + + return $self->fetch_all_by_dbID_list( \@ids ); +} + +=head2 fetch_all_by_GOTerm + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The GO term for which transcripts should be fetched. + + Example: @transcripts = @{ + $transcript_adaptor->fetch_all_by_GOTerm( + $go_adaptor->fetch_by_accession('GO:0030326') ) }; + + Description : Retrieves a list of transcripts that are + associated with the given GO term, or with any of + its descendent GO terms. The transcripts returned + are in their native coordinate system, i.e. in + the coordinate system in which they are stored + in the database. If another coordinate system + is required then the Transcript::transfer or + Transcript::transform method can be used. + + Return type : listref of Bio::EnsEMBL::Transcript + Exceptions : Throws of argument is not a GO term + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_GOTerm { + my ( $self, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + if ( $term->ontology() ne 'GO' ) { + throw('Argument is not a GO term'); + } + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + + my %unique_dbIDs; + foreach my $accession ( map { $_->accession() } + ( $term, @{ $term->descendants() } ) ) + { + my @ids = + $entryAdaptor->list_transcript_ids_by_extids( $accession, 'GO' ); + foreach my $dbID (@ids) { $unique_dbIDs{$dbID} = 1 } + } + + my @result = @{ + $self->fetch_all_by_dbID_list( + [ sort { $a <=> $b } keys(%unique_dbIDs) ] + ) }; + + return \@result; +} ## end sub fetch_all_by_GOTerm + +=head2 fetch_all_by_GOTerm_accession + + Arg [1] : String + The GO term accession for which genes should be + fetched. + + Example : + + @genes = + @{ $gene_adaptor->fetch_all_by_GOTerm_accession( + 'GO:0030326') }; + + Description : Retrieves a list of genes that are associated with + the given GO term, or with any of its descendent + GO terms. The genes returned are in their native + coordinate system, i.e. in the coordinate system + in which they are stored in the database. If + another coordinate system is required then the + Gene::transfer or Gene::transform method can be + used. + + Return type : listref of Bio::EnsEMBL::Gene + Exceptions : Throws of argument is not a GO term accession + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_GOTerm_accession { + my ( $self, $accession ) = @_; + + if ( $accession !~ /^GO:/ ) { + throw('Argument is not a GO term accession'); + } + + my $goAdaptor = + Bio::EnsEMBL::Registry->get_adaptor( 'Multi', 'Ontology', + 'OntologyTerm' ); + + my $term = $goAdaptor->fetch_by_accession($accession); + + return $self->fetch_all_by_GOTerm($term); +} + +=head2 fetch_by_display_label + + Arg [1] : String $label - display label of transcript to fetch + Example : my $tr = $tr_adaptor->fetch_by_display_label("BRCA2"); + Description: Returns the transcript which has the given display label or + undef if there is none. If there are more than 1, only the first + is reported. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_display_label { + my $self = shift; + my $label = shift; + + my $constraint = "x.display_label = ? AND t.is_current = 1"; + + $self->bind_param_generic_fetch($label,SQL_VARCHAR); + + my ($transcript) = @{ $self->generic_fetch($constraint) }; + + return $transcript; +} + + +=head2 fetch_all_by_exon_stable_id + + Arg [1] : String $stable_id + The stable id of an exon in a transcript + Example : my $tr = $tr_adaptor->fetch_all_by_exon_stable_id + ('ENSE00000309301'); + Description: Retrieves a list of transcripts via an exon stable id. + Returntype : Listref of Bio::EnsEMBL::Transcript objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_exon_stable_id { + my ($self, $stable_id) = @_; + + my @trans ; + + my $sth = $self->prepare(qq( + SELECT t.transcript_id + FROM exon_transcript et, exon e, transcript t + WHERE e.exon_id = et.exon_id + AND et.transcript_id = t.transcript_id + AND e.stable_id = ? + AND t.is_current = 1 + )); + + $sth->bind_param(1, $stable_id, SQL_VARCHAR); + $sth->execute(); + + while( my $id = $sth->fetchrow_array ) { + my $transcript = $self->fetch_by_dbID($id); + push(@trans, $transcript) if $transcript; + } + + if (!@trans) { + return undef; + } + + return \@trans; +} + +=head2 fetch_all_by_biotype + + Arg [1] : String $biotype + listref of $biotypes + The biotype of the gene to retrieve. You can also have a reference + to a list of biotypes in the event of needing several. + Example : $transcript = $transcript_adaptor->fetch_all_by_biotype('pseudogene'); + $transcript = $transcript_adaptor->fetch_all_by_biotype(['protein_coding','ambiguous_orf']); + Description: Retrieves an array reference of transcript objects from the + database via its biotype or biotypes. + The transcript will be retrieved in its native coordinate system + (i.e. in the coordinate system it is stored in the database). + It may be converted to a different coordinate system through a + call to transform() or transfer(). If the transcript is not found + undef is returned instead. + Returntype : listref of Bio::EnsEMBL::Transcript + Exceptions : if we cant get the transcript in given coord system + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_biotype { + my ($self, $biotype) = @_; + + if (!defined $biotype){ + throw("Biotype or listref of biotypes expected"); + } + my $constraint; + if (ref($biotype) eq 'ARRAY'){ + $constraint = "t.biotype IN ("; + foreach my $b (@{$biotype}){ + $constraint .= "?,"; + $self->bind_param_generic_fetch($b,SQL_VARCHAR); + } + chop($constraint); #remove last , from expression + $constraint .= ") and t.is_current = 1"; + + } + else{ + $constraint = "t.biotype = ? and t.is_current = 1"; + $self->bind_param_generic_fetch($biotype,SQL_VARCHAR); + } + my @transcripts = @{ $self->generic_fetch($constraint) }; + return \@transcripts ; +} + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript to be written to the database + Arg [2] : Int $gene_dbID + The identifier of the gene that this transcript is associated + with + Arg [3] : DEPRECATED (optional) Int $analysis_id + The analysis_id to use when storing this gene. This is for + backward compatibility only and used to fall back to the gene + analysis_id if no analysis object is attached to the transcript + (which you should do for new code). + Example : $transID = $tr_adaptor->store($transcript, $gene->dbID); + Description: Stores a transcript in the database and returns the new + internal identifier for the stored transcript. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub store { + my ( $self, $transcript, $gene_dbID, $analysis_id ) = @_; + + if ( !ref($transcript) + || !$transcript->isa('Bio::EnsEMBL::Transcript') ) + { + throw("$transcript is not a EnsEMBL transcript - not storing"); + } + + my $db = $self->db(); + + if ( $transcript->is_stored($db) ) { + return $transcript->dbID(); + } + + # Force lazy-loading of exons and ensure coords are correct. + $transcript->recalculate_coordinates(); + + my $is_current = ( defined( $transcript->is_current() ) + ? $transcript->is_current() + : 1 ); + + # store analysis + my $analysis = $transcript->analysis(); + my $new_analysis_id; + + if ($analysis) { + if ( $analysis->is_stored($db) ) { + $new_analysis_id = $analysis->dbID; + } else { + $new_analysis_id = $db->get_AnalysisAdaptor->store($analysis); + } + } elsif ($analysis_id) { + # Fall back to analysis passed in (usually from gene) if analysis + # wasn't set explicitely for the transcript. This is deprectated + # though. + warning( "You should explicitely attach " + . "an analysis object to the Transcript. " + . "Will fall back to Gene analysis, " + . "but this behaviour is deprecated." ); + $new_analysis_id = $analysis_id; + } else { + throw("Need an analysis_id to store the Transcript."); + } + + # + # Store exons - this needs to be done before the possible transfer + # of the transcript to another slice (in _prestore()). Transfering + # results in copies being made of the exons and we need to preserve + # the object identity of the exons so that they are not stored twice + # by different transcripts. + # + my $exons = $transcript->get_all_Exons(); + my $exonAdaptor = $db->get_ExonAdaptor(); + foreach my $exon ( @{$exons} ) { + $exonAdaptor->store($exon); + } + + my $original_translation = $transcript->translation(); + my $original = $transcript; + my $seq_region_id; + ( $transcript, $seq_region_id ) = $self->_pre_store($transcript); + + # First store the transcript without a display xref. The display xref + # needs to be set after xrefs are stored which needs to happen after + # transcript is stored. + + # + # Store transcript + # + my $store_transcript_sql = qq( + INSERT INTO transcript + SET gene_id = ?, + analysis_id = ?, + seq_region_id = ?, + seq_region_start = ?, + seq_region_end = ?, + seq_region_strand = ?, + biotype = ?, + status = ?, + description = ?, + is_current = ?, + canonical_translation_id = ? + ); + + if ( defined( $transcript->stable_id() ) ) { + + my $created = $self->db->dbc->from_seconds_to_date($transcript->created_date()); + my $modified = $self->db->dbc->from_seconds_to_date($transcript->modified_date()); + $store_transcript_sql .= ", stable_id = ?, version = ?, created_date = " . $created . " , modified_date = " . $modified; + + } + + my $tst = $self->prepare($store_transcript_sql); + $tst->bind_param( 1, $gene_dbID, SQL_INTEGER ); + $tst->bind_param( 2, $new_analysis_id, SQL_INTEGER ); + $tst->bind_param( 3, $seq_region_id, SQL_INTEGER ); + $tst->bind_param( 4, $transcript->start(), SQL_INTEGER ); + $tst->bind_param( 5, $transcript->end(), SQL_INTEGER ); + $tst->bind_param( 6, $transcript->strand(), SQL_TINYINT ); + $tst->bind_param( 7, $transcript->biotype(), SQL_VARCHAR ); + $tst->bind_param( 8, $transcript->status(), SQL_VARCHAR ); + $tst->bind_param( 9, $transcript->description(), SQL_LONGVARCHAR ); + $tst->bind_param( 10, $is_current, SQL_TINYINT ); + + # If the transcript has a translation, this is updated later: + $tst->bind_param( 11, undef, SQL_INTEGER ); + + if ( defined( $transcript->stable_id() ) ) { + + $tst->bind_param( 12, $transcript->stable_id(), SQL_VARCHAR ); + my $version = ($transcript->version()) ? $transcript->version() : 1; + $tst->bind_param( 13, $version, SQL_INTEGER ); + } + + + $tst->execute(); + $tst->finish(); + + my $transc_dbID = $tst->{'mysql_insertid'}; + + # + # Store translation + # + + my $alt_translations = + $transcript->get_all_alternative_translations(); + my $translation = $transcript->translation(); + + if ( defined($translation) ) { + # Make sure that the start and end exon are set correctly. + my $start_exon = $translation->start_Exon(); + my $end_exon = $translation->end_Exon(); + + if ( !defined($start_exon) ) { + throw("Translation does not define a start exon."); + } + + if ( !defined($end_exon) ) { + throw("Translation does not defined an end exon."); + } + + # If the dbID is not set, this means the exon must have been a + # different object in memory than the the exons of the transcript. + # Try to find the matching exon in all of the exons we just stored. + if ( !defined( $start_exon->dbID() ) ) { + my $key = $start_exon->hashkey(); + ($start_exon) = grep { $_->hashkey() eq $key } @$exons; + + if ( defined($start_exon) ) { + $translation->start_Exon($start_exon); + } else { + throw( "Translation's start_Exon does not appear " + . "to be one of the exons in " + . "its associated Transcript" ); + } + } + + if ( !defined( $end_exon->dbID() ) ) { + my $key = $end_exon->hashkey(); + ($end_exon) = grep { $_->hashkey() eq $key } @$exons; + + if ( defined($end_exon) ) { + $translation->end_Exon($end_exon); + } else { + throw( "Translation's end_Exon does not appear " + . "to be one of the exons in " + . "its associated Transcript." ); + } + } + + my $old_dbid = $translation->dbID(); + $db->get_TranslationAdaptor()->store( $translation, $transc_dbID ); + + # Need to update the canonical_translation_id for this transcript. + + my $sth = $self->prepare( + q( + UPDATE transcript + SET canonical_translation_id = ? + WHERE transcript_id = ?) + ); + + $sth->bind_param( 1, $translation->dbID(), SQL_INTEGER ); + $sth->bind_param( 2, $transc_dbID, SQL_INTEGER ); + + $sth->execute(); + + # Set values of the original translation, we may have copied it when + # we transformed the transcript. + $original_translation->dbID( $translation->dbID() ); + $original_translation->adaptor( $translation->adaptor() ); + } ## end if ( defined($translation...)) + + # + # Store the alternative translations, if there are any. + # + + if ( defined($alt_translations) + && scalar( @{$alt_translations} ) > 0 ) + { + foreach my $alt_translation ( @{$alt_translations} ) { + my $start_exon = $alt_translation->start_Exon(); + my $end_exon = $alt_translation->end_Exon(); + + if ( !defined($start_exon) ) { + throw("Translation does not define a start exon."); + } elsif ( !defined($end_exon) ) { + throw("Translation does not defined an end exon."); + } + + if ( !defined( $start_exon->dbID() ) ) { + my $key = $start_exon->hashkey(); + ($start_exon) = grep { $_->hashkey() eq $key } @{$exons}; + + if ( defined($start_exon) ) { + $alt_translation->start_Exon($start_exon); + } else { + throw( "Translation's start_Exon does not appear " + . "to be one of the exon in" + . "its associated Transcript" ); + } + } elsif ( !defined( $end_exon->dbID() ) ) { + my $key = $end_exon->hashkey(); + ($end_exon) = grep { $_->hashkey() eq $key } @$exons; + + if ( defined($end_exon) ) { + $translation->end_Exon($end_exon); + } else { + throw( "Translation's end_Exon does not appear " + . "to be one of the exons in " + . "its associated Transcript." ); + } + } + + $db->get_TranslationAdaptor() + ->store( $alt_translation, $transc_dbID ); + } ## end foreach my $alt_translation... + } ## end if ( defined($alt_translations...)) + + # + # Store the xrefs/object xref mapping. + # + my $dbEntryAdaptor = $db->get_DBEntryAdaptor(); + + foreach my $dbe ( @{ $transcript->get_all_DBEntries() } ) { + $dbEntryAdaptor->store( $dbe, $transc_dbID, "Transcript", 1 ); + } + + # + # Update transcript to point to display xref if it is set. + # + if ( my $dxref = $transcript->display_xref() ) { + my $dxref_id; + + if ( $dxref->is_stored($db) ) { + $dxref_id = $dxref->dbID(); + } else { + $dxref_id = $dbEntryAdaptor->exists($dxref); + } + + if ( defined($dxref_id) ) { + my $sth = + $self->prepare( "UPDATE transcript " + . "SET display_xref_id = ? " + . "WHERE transcript_id = ?" ); + $sth->bind_param( 1, $dxref_id, SQL_INTEGER ); + $sth->bind_param( 2, $transc_dbID, SQL_INTEGER ); + $sth->execute(); + $dxref->dbID($dxref_id); + $dxref->adaptor($dbEntryAdaptor); + $sth->finish(); + } else { + warning(sprintf( + "Display_xref %s:%s is not stored in database.\n" + . "Not storing relationship to this transcript.", + $dxref->dbname(), $dxref->display_id() ) ); + $dxref->dbID(undef); + $dxref->adaptor(undef); + } + } ## end if ( my $dxref = $transcript...) + + # + # Link transcript to exons in exon_transcript table + # + my $etst = $self->prepare( + "INSERT INTO exon_transcript (exon_id,transcript_id,rank) " + . "VALUES (?,?,?)" ); + my $rank = 1; + foreach my $exon ( @{ $transcript->get_all_Exons } ) { + $etst->bind_param( 1, $exon->dbID, SQL_INTEGER ); + $etst->bind_param( 2, $transc_dbID, SQL_INTEGER ); + $etst->bind_param( 3, $rank, SQL_INTEGER ); + $etst->execute(); + $rank++; + } + + $etst->finish(); + + # Now the supporting evidence + my $tsf_adaptor = $db->get_TranscriptSupportingFeatureAdaptor(); + $tsf_adaptor->store( $transc_dbID, + $transcript->get_all_supporting_features() ); + + # store transcript attributes if there are any + my $attr_adaptor = $db->get_AttributeAdaptor(); + + $attr_adaptor->store_on_Transcript( $transc_dbID, + $transcript->get_all_Attributes() ); + + # store the IntronSupportingEvidence features + my $ise_adaptor = $db->get_IntronSupportingEvidenceAdaptor(); + my $intron_supporting_evidence = $transcript->get_all_IntronSupportingEvidence(); + foreach my $ise (@{$intron_supporting_evidence}) { + $ise_adaptor->store($ise); + $ise_adaptor->store_transcript_linkage($ise, $transcript, $transc_dbID); + } + + # Update the original transcript object - not the transfered copy that + # we might have created. + $original->dbID($transc_dbID); + $original->adaptor($self); + + return $transc_dbID; +} ## end sub store + + +=head2 get_Interpro_by_transid + + Arg [1] : String $trans_stable_id + The stable if of the transcript to obtain + Example : @i = $tr_adaptor->get_Interpro_by_transid($trans->stable_id()); + Description: Gets interpro accession numbers by transcript stable id. + A hack really - we should have a much more structured + system than this. + Returntype : listref of strings (Interpro_acc:description) + Exceptions : none + Caller : domainview? , GeneView + Status : Stable + +=cut + +sub get_Interpro_by_transid { + my ($self,$trans_stable_id) = @_; + + my $sth = $self->prepare(qq( + SELECT STRAIGHT_JOIN i.interpro_ac, x.description + FROM transcript t, + translation tl, + protein_feature pf, + interpro i, + xref x + WHERE t.stable_id = ? + AND tl.transcript_id = t.transcript_id + AND tl.translation_id = pf.translation_id + AND i.id = pf.hit_name + AND i.interpro_ac = x.dbprimary_acc + AND t.is_current = 1 + )); + + $sth->bind_param(1, $trans_stable_id, SQL_VARCHAR); + $sth->execute(); + + my @out; + my %h; + while( (my $arr = $sth->fetchrow_arrayref()) ) { + if( $h{$arr->[0]} ) { next; } + $h{$arr->[0]}=1; + my $string = $arr->[0] .":".$arr->[1]; + push(@out,$string); + } + + return \@out; +} + +=head2 is_Transcript_canonical() + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript to query with + Example : $tr_adaptor->is_Transcript_canonical($transcript); + Description : Returns a boolean if the given transcript is considered + canonical with respect to a gene + Returntype : Boolean + Exceptions : None + Caller : Bio::EnsEMBL::Transcript + Status : Beta + + +=cut + +sub is_Transcript_canonical { + my ($self, $transcript) = @_; + return $self->dbc()->sql_helper()->execute_single_result( + -SQL => 'select count(*) from gene where canonical_transcript_id =?', + -PARAMS => [$transcript->dbID()] + ); +} + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript to remove from the database + Example : $tr_adaptor->remove($transcript); + Description: Removes a transcript completely from the database, and all + associated information. + This method is usually called by the GeneAdaptor::remove method + because this method will not preform the removal of genes + which are associated with this transcript. Do not call this + method directly unless you know there are no genes associated + with the transcript! + Returntype : none + Exceptions : throw on incorrect arguments + warning if transcript is not in this database + Caller : GeneAdaptor::remove + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $transcript = shift; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw("Bio::EnsEMBL::Transcript argument expected"); + } + + # sanity check: make sure nobody tries to slip past a prediction transcript + # which inherits from transcript but actually uses different tables + if($transcript->isa('Bio::EnsEMBL::PredictionTranscript')) { + throw("TranscriptAdaptor can only remove Transcripts " . + "not PredictionTranscripts"); + } + + if ( !$transcript->is_stored($self->db()) ) { + warning("Cannot remove transcript ". $transcript->dbID .". Is not stored ". + "in this database."); + return; + } + + # remove the supporting features of this transcript + + my $prot_adp = $self->db->get_ProteinAlignFeatureAdaptor; + my $dna_adp = $self->db->get_DnaAlignFeatureAdaptor; + + my $sfsth = $self->prepare("SELECT feature_type, feature_id " . + "FROM transcript_supporting_feature " . + "WHERE transcript_id = ?"); + + $sfsth->bind_param(1, $transcript->dbID, SQL_INTEGER); + $sfsth->execute(); + + # statements to check for shared align_features + my $sth1 = $self->prepare("SELECT count(*) FROM supporting_feature " . + "WHERE feature_type = ? AND feature_id = ?"); + my $sth2 = $self->prepare("SELECT count(*) " . + "FROM transcript_supporting_feature " . + "WHERE feature_type = ? AND feature_id = ?"); + + SUPPORTING_FEATURE: + while(my ($type, $feature_id) = $sfsth->fetchrow()){ + + # only remove align_feature if this is the last reference to it + $sth1->bind_param(1, $type, SQL_VARCHAR); + $sth1->bind_param(2, $feature_id, SQL_INTEGER); + $sth1->execute; + $sth2->bind_param(1, $type, SQL_VARCHAR); + $sth2->bind_param(2, $feature_id, SQL_INTEGER); + $sth2->execute; + my ($count1) = $sth1->fetchrow; + my ($count2) = $sth2->fetchrow; + if ($count1 + $count2 > 1) { + #warn "transcript: shared feature, not removing $type|$feature_id\n"; + next SUPPORTING_FEATURE; + } + + #warn "transcript: removing $type|$feature_id\n"; + + if($type eq 'protein_align_feature'){ + my $f = $prot_adp->fetch_by_dbID($feature_id); + $prot_adp->remove($f); + } + elsif($type eq 'dna_align_feature'){ + my $f = $dna_adp->fetch_by_dbID($feature_id); + $dna_adp->remove($f); + } + else { + warning("Unknown supporting feature type $type. Not removing feature."); + } + } + $sfsth->finish(); + $sth1->finish(); + $sth2->finish(); + + # delete the association to supporting features + + $sfsth = $self->prepare("DELETE FROM transcript_supporting_feature WHERE transcript_id = ?"); + $sfsth->bind_param(1, $transcript->dbID, SQL_INTEGER); + $sfsth->execute(); + $sfsth->finish(); + + # delete the associated IntronSupportingEvidence and if the ISE had no more + # linked transcripts remove it + my $ise_adaptor = $self->db->get_IntronSupportingEvidenceAdaptor(); + foreach my $ise (@{$transcript->get_all_IntronSupportingEvidence()}) { + $ise_adaptor->remove_transcript_linkage($ise, $transcript); + if(! $ise->has_linked_transcripts()) { + $ise_adaptor->remove($ise); + } + } + + # remove all xref linkages to this transcript + + my $dbeAdaptor = $self->db->get_DBEntryAdaptor(); + foreach my $dbe (@{$transcript->get_all_DBEntries}) { + $dbeAdaptor->remove_from_object($dbe, $transcript, 'Transcript'); + } + + # remove the attributes associated with this transcript + my $attrib_adp = $self->db->get_AttributeAdaptor; + $attrib_adp->remove_from_Transcript($transcript); + + # remove the translation associated with this transcript + + my $translationAdaptor = $self->db->get_TranslationAdaptor(); + if( defined($transcript->translation()) ) { + $translationAdaptor->remove( $transcript->translation ); + } + + # remove exon associations to this transcript + + my $exonAdaptor = $self->db->get_ExonAdaptor(); + foreach my $exon ( @{$transcript->get_all_Exons()} ) { + # get the number of transcript references to this exon + # only remove the exon if this is the last transcript to + # reference it + + my $sth = $self->prepare( "SELECT count(*) + FROM exon_transcript + WHERE exon_id = ?" ); + $sth->bind_param(1, $exon->dbID, SQL_INTEGER); + $sth->execute(); + my ($count) = $sth->fetchrow_array(); + $sth->finish(); + + if($count == 1){ + $exonAdaptor->remove( $exon ); + } + } + + my $sth = $self->prepare( "DELETE FROM exon_transcript + WHERE transcript_id = ?" ); + $sth->bind_param(1, $transcript->dbID, SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + + $sth = $self->prepare( "DELETE FROM transcript + WHERE transcript_id = ?" ); + $sth->bind_param(1, $transcript->dbID, SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + $transcript->dbID(undef); + $transcript->adaptor(undef); + + return; +} + + +=head2 update + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript to update + Example : $tr_adaptor->update($transcript); + Description: Updates a transcript in the database. + Returntype : None + Exceptions : thrown if the $transcript is not a Bio::EnsEMBL::Transcript. + warn if the method is called on a transcript that does not exist + in the database. + Should warn if trying to update the number of attached exons, but + this is a far more complex process and is not yet implemented. + Caller : general + Status : Stable + +=cut + +sub update { + my ( $self, $transcript ) = @_; + + if ( !defined($transcript) + || !ref($transcript) + || !$transcript->isa('Bio::EnsEMBL::Transcript') ) + { + throw("Must update a transcript object, not a $transcript"); + } + + my $update_transcript_sql = qq( + UPDATE transcript + SET analysis_id = ?, + display_xref_id = ?, + description = ?, + biotype = ?, + status = ?, + is_current = ?, + canonical_translation_id = ? + WHERE transcript_id = ? + ); + + my $display_xref = $transcript->display_xref(); + my $display_xref_id; + + if ( defined($display_xref) && $display_xref->dbID() ) { + $display_xref_id = $display_xref->dbID(); + } else { + $display_xref_id = undef; + } + + my $sth = $self->prepare($update_transcript_sql); + + $sth->bind_param( 1, $transcript->analysis()->dbID(), SQL_INTEGER ); + $sth->bind_param( 2, $display_xref_id, SQL_INTEGER ); + $sth->bind_param( 3, $transcript->description(), SQL_LONGVARCHAR ); + $sth->bind_param( 4, $transcript->biotype(), SQL_VARCHAR ); + $sth->bind_param( 5, $transcript->status(), SQL_VARCHAR ); + $sth->bind_param( 6, $transcript->is_current(), SQL_TINYINT ); + $sth->bind_param( 7, ( + defined( $transcript->translation() ) + ? $transcript->translation()->dbID() + : undef ), + SQL_INTEGER ); + $sth->bind_param( 8, $transcript->dbID(), SQL_INTEGER ); + + $sth->execute(); +} ## end sub update + + +=head2 list_dbIDs + + Example : @transcript_ids = @{ $t_adaptor->list_dbIDs }; + Description: Gets a list of internal ids for all transcripts in the db. + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. Returntype : Listref of Ints + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("transcript",undef, $ordered); +} + + +=head2 list_stable_ids + + Example : @stable_trans_ids = @{ $transcript_adaptor->list_stable_ids }; + Description: Gets a list of stable ids for all transcripts in the current + database. + Returntype : Listref of Strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_stable_ids { + my ($self) = @_; + + return $self->_list_dbIDs("transcript", "stable_id"); +} + + +#_objs_from_sth + +# Arg [1] : StatementHandle $sth +# Arg [2] : Bio::EnsEMBL::AssemblyMapper $mapper +# Arg [3] : Bio::EnsEMBL::Slice $dest_slice +# Description: PROTECTED implementation of abstract superclass method. +# Responsible for the creation of Transcripts. +# Returntype : Listref of Bio::EnsEMBL::Transcripts in target coord system +# Exceptions : none +# Caller : internal +# Status : Stable + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + my $dbEntryAdaptor = $self->db()->get_DBEntryAdaptor(); + + my @transcripts; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ( + $transcript_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $analysis_id, + $gene_id, $is_current, $stable_id, + $version, $created_date, $modified_date, + $description, $biotype, $status, + $external_db, $external_status, $external_db_name, + $xref_id, $xref_display_label, $xref_primary_acc, + $xref_version, $xref_description, $xref_info_type, + $xref_info_text + ); + + $sth->bind_columns( + \( + $transcript_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $analysis_id, + $gene_id, $is_current, $stable_id, + $version, $created_date, $modified_date, + $description, $biotype, $status, + $external_db, $external_status, $external_db_name, + $xref_id, $xref_display_label, $xref_primary_acc, + $xref_version, $xref_description, $xref_info_type, + $xref_info_text + ) ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_cs; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + + my $asma; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_cs = $dest_slice->coord_system(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + $asma = $self->db->get_AssemblyMapperAdaptor(); + } + + FEATURE: while($sth->fetch()) { + + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + my $dest_mapper = $mapper; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + #obtain a mapper if none was defined, but a dest_seq_region was + if(!$dest_mapper && $dest_slice && + !$dest_slice_cs->equals($slice->coord_system)) { + $dest_mapper = $asma->fetch_by_CoordSystems($dest_slice_cs, + $slice->coord_system); + $asm_cs = $dest_mapper->assembled_CoordSystem(); + $cmp_cs = $dest_mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($dest_mapper) { + + if (defined $dest_slice && $dest_mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $dest_mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = $dest_mapper->fastmap( $sr_name, $seq_region_start, + $seq_region_end, $seq_region_strand, + $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } else { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + } + + # + # If a destination slice was provided convert the coords. + # + if (defined($dest_slice)) { + if ( $dest_slice_strand == 1 ) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + + if ( $dest_slice->is_circular ) { + if ( $seq_region_start > $seq_region_end ) { + # Looking at a feature overlapping the chromsome origin. + if ( $seq_region_end > $dest_slice_start ) { + # Looking at the region in the beginning of the chromosome + $seq_region_start -= $dest_slice->seq_region_length(); + } + if ( $seq_region_end < 0 ) { + $seq_region_end += $dest_slice->seq_region_length(); + } + } else { + if ( $dest_slice_start > $dest_slice_end + && $seq_region_end < 0 ) + { + # Looking at the region overlapping the chromosome + # origin and a feature which is at the beginning of the + # chromosome. + $seq_region_start += $dest_slice->seq_region_length(); + $seq_region_end += $dest_slice->seq_region_length(); + } + } + } + } else { + if ( $dest_slice->is_circular() + && $seq_region_start > $seq_region_end ) + { + if ( $seq_region_end > $dest_slice_start ) { + # Looking at the region in the beginning of the chromosome. + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = + $seq_region_end - + $dest_slice->seq_region_length() - + $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = + $dest_slice_end - + $seq_region_end - + $dest_slice->seq_region_length() + 1; + $seq_region_end = + $dest_slice_end - $tmp_seq_region_start + 1; + } + + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + } + + $seq_region_strand = -$seq_region_strand; + } ## end else [ if ( $dest_slice_strand...)] + + # Throw away features off the end of the requested slice + if ( $seq_region_end < 1 + || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_id ne $seq_region_id ) ) + { + next FEATURE; + } + + $slice = $dest_slice; + } + + my $display_xref; + + if ($xref_id) { + $display_xref = Bio::EnsEMBL::DBEntry->new_fast( { + 'dbID' => $xref_id, + 'display_id' => $xref_display_label, + 'primary_id' => $xref_primary_acc, + 'version' => $xref_version, + 'description' => $xref_description, + 'info_type' => $xref_info_type, + 'info_text' => $xref_info_text, + 'adaptor' => $dbEntryAdaptor, + 'db_display_name' => $external_db_name, + 'dbname' => $external_db + } ); + } + + + # Finally, create the new Transcript. + push( + @transcripts, + $self->_create_feature_fast( + 'Bio::EnsEMBL::Transcript', + { + 'analysis' => $analysis, + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'adaptor' => $self, + 'slice' => $slice, + 'dbID' => $transcript_id, + 'stable_id' => $stable_id, + 'version' => $version, + 'created_date' => $created_date || undef, + 'modified_date' => $modified_date || undef, + 'external_name' => $xref_display_label, + 'external_db' => $external_db, + 'external_status' => $external_status, + 'external_display_name' => $external_db_name, + 'display_xref' => $display_xref, + 'description' => $description, + 'biotype' => $biotype, + 'status' => $status, + 'is_current' => $is_current, + 'edits_enabled' => 1 + } ) ); + + } + + return \@transcripts; +} + + +=head2 fetch_all_by_exon_supporting_evidence + + Arg [1] : String $hit_name + Name of supporting feature + Arg [2] : String $feature_type + one of "dna_align_feature" or "protein_align_feature" + Arg [3] : (optional) Bio::Ensembl::Analysis + Example : $tr = $tr_adaptor->fetch_all_by_exon_supporting_evidence + ('XYZ', 'dna_align_feature'); + Description: Gets all the transcripts with exons which have a specified hit + on a particular type of feature. Optionally filter by analysis. + Returntype : Listref of Bio::EnsEMBL::Transcript objects + Exceptions : If feature_type is not of correct type. + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_exon_supporting_evidence { + my ($self, $hit_name, $feature_type, $analysis) = @_; + + if($feature_type !~ /(dna)|(protein)_align_feature/) { + throw("feature type must be dna_align_feature or protein_align_feature"); + } + + my $anal_from = ""; + $anal_from = ", analysis a " if ($analysis); + my $anal_where = ""; + $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " + if ($analysis); + + my $sql = qq( + SELECT DISTINCT(t.transcript_id) + FROM transcript t, + exon_transcript et, + supporting_feature sf, + $feature_type f + $anal_from + WHERE t.transcript_id = et.transcript_id + AND t.is_current = 1 + AND et.exon_id = sf.exon_id + AND sf.feature_id = f.${feature_type}_id + AND sf.feature_type = ? + AND f.hit_name=? + $anal_where + ); + + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $feature_type, SQL_VARCHAR); + $sth->bind_param(2, $hit_name, SQL_VARCHAR); + $sth->bind_param(3, $analysis->dbID(), SQL_INTEGER) if ($analysis); + + $sth->execute(); + + my @transcripts; + + while( my $id = $sth->fetchrow_array ) { + my $transcript = $self->fetch_by_dbID( $id ); + push(@transcripts, $transcript) if $transcript; + } + + return \@transcripts; +} + + +=head2 fetch_all_by_transcript_supporting_evidence + + Arg [1] : String $hit_name + Name of supporting feature + Arg [2] : String $feature_type + one of "dna_align_feature" or "protein_align_feature" + Arg [3] : (optional) Bio::Ensembl::Analysis + Example : $transcripts = $transcript_adaptor->fetch_all_by_transcript_supporting_evidence('XYZ', 'dna_align_feature'); + Description: Gets all the transcripts with evidence from a specified hit_name on a particular type of feature, stored in the + transcript_supporting_feature table. Optionally filter by analysis. For hits stored in the supporting_feature + table (linked to exons) use fetch_all_by_exon_supporting_evidence instead. + Returntype : Listref of Bio::EnsEMBL::Transcript objects + Exceptions : If feature_type is not of correct type. + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_transcript_supporting_evidence { + + my ($self, $hit_name, $feature_type, $analysis) = @_; + + if($feature_type !~ /(dna)|(protein)_align_feature/) { + throw("feature type must be dna_align_feature or protein_align_feature"); + } + + my $anal_from = ""; + $anal_from = ", analysis a " if ($analysis); + my $anal_where = ""; + $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " + if ($analysis); + + my $sql = qq( + SELECT DISTINCT(t.transcript_id) + FROM transcript t, + transcript_supporting_feature sf, + $feature_type f + $anal_from + WHERE t.transcript_id = sf.transcript_id + AND t.is_current = 1 + AND sf.feature_id = f.${feature_type}_id + AND sf.feature_type = ? + AND f.hit_name=? + $anal_where + ); + + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $feature_type, SQL_VARCHAR); + $sth->bind_param(2, $hit_name, SQL_VARCHAR); + $sth->bind_param(3, $analysis->dbID(), SQL_INTEGER) if ($analysis); + + $sth->execute(); + + my @transcripts; + + while( my $id = $sth->fetchrow_array ) { + my $transcript = $self->fetch_by_dbID( $id ); + push(@transcripts, $transcript) if $transcript; + } + + return \@transcripts; +} + + +########################## +# # +# DEPRECATED METHODS # +# # +########################## + + +=head2 get_display_xref + + Description: DEPRECATED. Use $transcript->display_xref() instead. + +=cut + +sub get_display_xref { + my ($self, $transcript) = @_; + + deprecate("display_xref should be retreived from Transcript object directly."); + + if ( !defined $transcript ) { + throw("Must call with a Transcript object"); + } + + my $sth = $self->prepare(qq( + SELECT e.db_name, + x.display_label, + e.db_external_name, + x.xref_id + FROM transcript t, + xref x, + external_db e + WHERE t.transcript_id = ? + AND t.display_xref_id = x.xref_id + AND x.external_db_id = e.external_db_id + )); + + $sth->bind_param(1, $transcript->dbID, SQL_INTEGER); + $sth->execute(); + + my ($db_name, $display_label, $xref_id, $display_db_name ) = + $sth->fetchrow_array(); + + if ( !defined $xref_id ) { + return undef; + } + + my $db_entry = Bio::EnsEMBL::DBEntry->new( + -dbid => $xref_id, + -adaptor => $self->db->get_DBEntryAdaptor(), + -dbname => $db_name, + -display_id => $display_label + -db_display_name => $display_db_name + ); + + return $db_entry; +} + + +=head2 get_stable_entry_info + + Description: DEPRECATED. Use $transcript->stable_id() instead. + +=cut + +sub get_stable_entry_info { + my ($self, $transcript) = @_; + + deprecate("Stable ids should be loaded directly now"); + + unless ( defined $transcript && ref $transcript && + $transcript->isa('Bio::EnsEMBL::Transcript') ) { + throw("Needs a Transcript object, not a $transcript"); + } + + my $sth = $self->prepare(qq( + SELECT stable_id, version + FROM transcript + WHERE transcript_id = ? + )); + + $sth->bind_param(1, $transcript->dbID, SQL_INTEGER); + $sth->execute(); + + my @array = $sth->fetchrow_array(); + $transcript->{'_stable_id'} = $array[0]; + $transcript->{'_version'} = $array[1]; + + return 1; +} + + +=head2 fetch_all_by_DBEntry + + Description: DEPRECATED. Use fetch_all_by_external_name() instead. + +=cut + +sub fetch_all_by_DBEntry { + my $self = shift; + deprecate('Use fetch_all_by_external_name instead.'); + return $self->fetch_all_by_external_name(@_); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/TranscriptSupportingFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/TranscriptSupportingFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,259 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::TranscriptSupportingFeatureAdaptor - Retrieves +supporting features from the database. + +=head1 SYNOPSIS + + $supporting_feature_adaptor = + $database_adaptor->get_TranscriptSupportingFeatureAdaptor; + + @supporting_feats = + @{ $supporting_feat_adaptor->fetch_all_by_Transcript($transcript) }; + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::TranscriptSupportingFeatureAdaptor; + +use strict; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +#inherits from BaseAdaptor +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + + +=head2 fetch_all_by_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript to fetch supporting features for + Example : @sfs = @{$supporting_feat_adaptor->fetch_all_by_Transcript($transcript)}; + Description: Retrieves supporting features (evidence) for a given transcript. + Returntype : list of Bio::EnsEMBL::BaseAlignFeatures in the same coordinate + system as the $transcript argument + Exceptions : warning if $transcript is not in the database (i.e. dbID not defined) + throw if a retrieved supporting feature is of unknown type + Caller : Bio::EnsEMBL::Transcript + Status : Stable + +=cut + +sub fetch_all_by_Transcript { + my ( $self, $transcript ) = @_; + + my $out = []; + + unless($transcript->dbID) { + warning("Cannot retrieve evidence for transcript without dbID"); + return []; + } + + my $sth = $self->prepare("SELECT tsf.feature_type, tsf.feature_id + FROM transcript_supporting_feature tsf + WHERE transcript_id = ?"); + + + $sth->bind_param(1,$transcript->dbID,SQL_INTEGER); + $sth->execute(); + + my $prot_adp = $self->db->get_ProteinAlignFeatureAdaptor; + my $dna_adp = $self->db->get_DnaAlignFeatureAdaptor; + + my $feature; + while(my ($type, $feature_id) = $sth->fetchrow){ + if($type eq 'protein_align_feature'){ + $feature = $prot_adp->fetch_by_dbID($feature_id); + } elsif($type eq 'dna_align_feature'){ + $feature = $dna_adp->fetch_by_dbID($feature_id); + } else { + warning("Unknown feature type [$type]\n"); + } + + if(!$feature) { + warning("Supporting feature $type $feature_id does not exist in DB"); + } else { + my $new_feature = $feature->transfer($transcript->slice()); + push @$out, $new_feature if( $new_feature ); + } + } + + $sth->finish(); + + return $out; +} + + + +=head2 store + Arg [2] : Int $transID + The dbID of an EnsEMBL transcript to associate with supporting + features + Arg [1] : Ref to array of Bio::EnsEMBL::BaseAlignFeature (the support) + Example : $dbea->store($transcript_id, \@features); + Description: Stores a set of alignment features and associates an EnsEMBL transcript + with them + Returntype : none + Exceptions : thrown when invalid dbID is passed to this method + Caller : TranscriptAdaptor + Status : Stable + +=cut + +sub store { + my ( $self, $tran_dbID, $aln_objs ) = @_; + + my $pep_check_sql = + "SELECT protein_align_feature_id " . + "FROM protein_align_feature " . + "WHERE seq_region_id = ? " . + "AND seq_region_start = ? " . + "AND seq_region_end = ? " . + "AND seq_region_strand = ? " . + "AND hit_name = ? " . + "AND hit_start = ? " . + "AND hit_end = ? " . + "AND analysis_id = ? " . + "AND cigar_line = ? " . + "AND hcoverage = ? "; + + my $dna_check_sql = + "SELECT dna_align_feature_id " . + "FROM dna_align_feature " . + "WHERE seq_region_id = ? " . + "AND seq_region_start = ? " . + "AND seq_region_end = ? " . + "AND seq_region_strand = ? " . + "AND hit_name = ? " . + "AND hit_start = ? " . + "AND hit_end = ? " . + "AND analysis_id = ? " . + "AND cigar_line = ? " . + "AND hcoverage = ? " . + "AND hit_strand = ? "; + + my $assoc_check_sql = + "SELECT * " . + "FROM transcript_supporting_feature " . + "WHERE transcript_id = $tran_dbID " . + "AND feature_type = ? " . + "AND feature_id = ? "; + + my $assoc_write_sql = "INSERT into transcript_supporting_feature " . + "(transcript_id, feature_id, feature_type) " . + "values(?, ?, ?)"; + + my $pep_check_sth = $self->prepare($pep_check_sql); + my $dna_check_sth = $self->prepare($dna_check_sql); + my $assoc_check_sth = $self->prepare($assoc_check_sql); + my $sf_sth = $self->prepare($assoc_write_sql); + + my $dna_adaptor = $self->db->get_DnaAlignFeatureAdaptor(); + my $pep_adaptor = $self->db->get_ProteinAlignFeatureAdaptor(); + + foreach my $f (@$aln_objs) { + # check that the feature is in toplevel coords + + if($f->slice->start != 1 || $f->slice->strand != 1) { + #move feature onto a slice of the entire seq_region + my $tls = $self->db->get_sliceAdaptor->fetch_by_region($f->slice->coord_system->name(), + $f->slice->seq_region_name(), + undef, #start + undef, #end + undef, #strand + $f->slice->coord_system->version()); + $f = $f->transfer($tls); + + if(!$f) { + throw('Could not transfer Feature to slice of ' . + 'entire seq_region prior to storing'); + } + } + + if(!$f->isa("Bio::EnsEMBL::BaseAlignFeature")){ + throw("$f must be an align feature otherwise" . + "it can't be stored"); + } + + my ($sf_dbID, $type, $adap, $check_sth); + + my @check_args = ($self->db->get_SliceAdaptor->get_seq_region_id($f->slice), + $f->start, + $f->end, + $f->strand, + $f->hseqname, + $f->hstart, + $f->hend, + $f->analysis->dbID, + $f->cigar_string, + $f->hcoverage); + + if($f->isa("Bio::EnsEMBL::DnaDnaAlignFeature")){ + $adap = $dna_adaptor; + $check_sth = $dna_check_sth; + $type = 'dna_align_feature'; + push @check_args, $f->hstrand; + } elsif($f->isa("Bio::EnsEMBL::DnaPepAlignFeature")){ + $adap = $pep_adaptor; + $check_sth = $pep_check_sth; + $type = 'protein_align_feature'; + } else { + warning("Supporting feature of unknown type. Skipping : [$f]\n"); + next; + } + + $check_sth->execute(@check_args); + $sf_dbID = $check_sth->fetchrow_array; + + if (not $sf_dbID) { + + $adap->store($f); + $sf_dbID = $f->dbID; + } + + # now check association + $assoc_check_sth->execute($type, + $sf_dbID); + if (not $assoc_check_sth->fetchrow_array) { + $sf_sth->bind_param(1, $tran_dbID, SQL_INTEGER); + $sf_sth->bind_param(2, $sf_dbID, SQL_INTEGER); + $sf_sth->bind_param(3, $type, SQL_VARCHAR); + $sf_sth->execute(); + } + } + + $dna_check_sth->finish; + $pep_check_sth->finish; + $assoc_check_sth->finish; + $sf_sth->finish; + +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/TranslationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/TranslationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,912 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::TranslationAdaptor - Provides a means to fetch and store +Translation objects from a database. + +=head1 DESCRIPTION + +This adaptor provides a means to retrieve and store +Bio::EnsEMBL::Translation objects from/in a database. + +Translation objects only truly make sense in the context of their +transcripts so the recommended means to retrieve Translations is +by retrieving the Transcript object first, and then fetching the +Translation. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $transcript_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "transcript" ); + + $translation_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "translation" ); + + my $transcript = $transcript_adaptor->fetch_by_dbID(131243); + my $translation = + $translation_adaptor->fetch_by_Transcript($transcript); + + print("Translation Start Site: " + . $translation->start_Exon()->stable_id() . " " + . $translation->start() + . "\n" ); + print("Translation Stop: " + . $translation->end_Exon()->stable_id() . " " + . $translation->end() ); + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::TranslationAdaptor; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); + + +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseAdaptor ); + +=head2 fetch_all_alternative_by_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Example : + + @tl = @{ + $translation_adaptor->fetch_all_alternative_by_Transcript( + $transcript) + }; + + Description: Retrieves all alternative translations associated with a + particular transcript. If no alternative translation is + found, a reference to an empty list is returned. + + Returntype : listref of Bio::EnsEMBL::Translation + Exceptions : throw on incorrect argument + Caller : Transcript + Status : Stable + +=cut + +sub fetch_all_alternative_by_Transcript { + my ( $self, $transcript ) = @_; + + assert_ref($transcript, 'Bio::EnsEMBL::Transcript'); + + my $tl_created_date = + $self->db()->dbc()->from_date_to_seconds('tl.created_date'); + my $tl_modified_date = + $self->db()->dbc()->from_date_to_seconds('tl.modified_date'); + + my $sql = + sprintf( "SELECT tl.translation_id, tl.start_exon_id, " + . "tl.end_exon_id, tl.seq_start, tl.seq_end, " + . "tl.stable_id, tl.version, %s, %s " + . "FROM translation tl " + . "JOIN transcript t " + . "ON (t.transcript_id = tl.transcript_id) " + . "WHERE tl.transcript_id = ? " + . "AND tl.translation_id != t.canonical_translation_id", + $tl_created_date, $tl_modified_date ); + + my $transcript_id = $transcript->dbID(); + my $sth = $self->prepare($sql); + $sth->bind_param( 1, $transcript_id, SQL_INTEGER ); + + $sth->execute(); + + my ( + $translation_id, $start_exon_id, $end_exon_id, + $seq_start, $seq_end, $stable_id, + $version, $created_date, $modified_date + ); + + $sth->bind_columns( + \( + $translation_id, $start_exon_id, $end_exon_id, + $seq_start, $seq_end, $stable_id, + $version, $created_date, $modified_date + ) ); + + # Get all alternative translations. + my $translations = []; + while ( $sth->fetch() ) { + if ( !defined($translation_id) ) { next } + + my ( $start_exon, $end_exon ); + + # this will load all the exons whenever we load the translation + # but I guess thats ok .... + + foreach my $exon ( @{ $transcript->get_all_Exons() } ) { + if ( $exon->dbID() == $start_exon_id ) { $start_exon = $exon } + if ( $exon->dbID() == $end_exon_id ) { $end_exon = $exon } + } + + if ( !( defined($start_exon) && defined($end_exon) ) ) { + throw( + sprintf( + "Could not find start or end exon in transcript_id=%d\n", + $transcript->dbID() ) ); + } + + my $translation = + Bio::EnsEMBL::Translation->new_fast( { + 'dbID' => $translation_id, + 'adaptor' => $self, + 'start' => $seq_start, + 'end' => $seq_end, + 'start_exon' => $start_exon, + 'end_exon' => $end_exon, + 'stable_id' => $stable_id, + 'version' => $version, + 'created_date' => $created_date || undef, + 'modified_date' => $modified_date || undef, + } ); + + $translation->transcript($transcript); + + push( @{$translations}, $translation ); + + } ## end while ( $sth->fetch() ) + + return $translations; +} ## end sub fetch_all_by_Transcript + +=head2 fetch_by_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Example : $tl = $translation_adaptor->fetch_by_Transcript($transcript); + Description: Retrieves a Translation via its associated transcript. + If the Translation is not found, undef is returned. + Returntype : Bio::EnsEMBL::Translation + Exceptions : throw on incorrect argument + Caller : Transcript + Status : Stable + +=cut + +sub fetch_by_Transcript { + my ( $self, $transcript ) = @_; + + assert_ref( $transcript, 'Bio::EnsEMBL::Transcript' ); + + my $tl_created_date = + $self->db()->dbc()->from_date_to_seconds('tl.created_date'); + my $tl_modified_date = + $self->db()->dbc()->from_date_to_seconds('tl.modified_date'); + + my $sql = + sprintf( "SELECT tl.translation_id, tl.start_exon_id, " + . "tl.end_exon_id, tl.seq_start, tl.seq_end, " + . "tl.stable_id, tl.version, %s, %s " + . "FROM translation tl " + . "JOIN transcript tr " + . "ON (tl.translation_id = tr.canonical_translation_id) " + . "WHERE tr.transcript_id = ?", + $tl_created_date, $tl_modified_date ); + + my $transcript_id = $transcript->dbID(); + my $sth = $self->prepare($sql); + $sth->bind_param( 1, $transcript_id, SQL_INTEGER ); + + $sth->execute(); + + my ( + $translation_id, $start_exon_id, $end_exon_id, + $seq_start, $seq_end, $stable_id, + $version, $created_date, $modified_date + ) = $sth->fetchrow_array(); + $sth->finish(); + + if ( !defined($translation_id) ) { return undef } + + my ( $start_exon, $end_exon ); + + # this will load all the exons whenever we load the translation + # but I guess thats ok .... + + foreach my $exon ( @{ $transcript->get_all_Exons() } ) { + if ( $exon->dbID() == $start_exon_id ) { $start_exon = $exon } + if ( $exon->dbID() == $end_exon_id ) { $end_exon = $exon } + } + + if ( !( defined($start_exon) && defined($end_exon) ) ) { + throw( + sprintf( "Could not find start or end exon in transcript_id=%d\n", + $transcript->dbID() ) ); + } + + my $translation = + Bio::EnsEMBL::Translation->new_fast( { + 'dbID' => $translation_id, + 'adaptor' => $self, + 'start' => $seq_start, + 'end' => $seq_end, + 'start_exon' => $start_exon, + 'end_exon' => $end_exon, + 'stable_id' => $stable_id, + 'version' => $version, + 'created_date' => $created_date || undef, + 'modified_date' => $modified_date || undef, + } ); + + $translation->transcript($transcript); + + return $translation; +} ## end sub fetch_by_Transcript + + + +=head2 fetch_all_by_external_name + + Arg [1] : string $external_name + The external identifier for the translation(s) to be + obtained. + Arg [2] : (optional) string $external_db_name + The name of the external database from which the + identifier originates. + Arg [3] : Boolean override. Force SQL regex matching for users + who really do want to find all 'NM%' + Example : my @translations = + @{ $trl_adaptor->fetch_all_by_external_name('BRCA2') }; + my @many_translations = + @{ $trl_adaptor->fetch_all_by_external_name('BRCA%') }; + Description: Retrieves a list of translations fetched via an + external identifier. Note that this may not be a + particularly useful method, because translations + do not make much sense out of the context of + their transcript. It may be better to use the + TranscriptAdaptor::fetch_all_by_external_name instead. + SQL wildcards % and _ are supported in the $external_name + but their use is somewhat restricted for performance reasons. + Users that really do want % and _ in the first three characters + should use argument 3 to prevent optimisations + Returntype : reference to a list of Translations + Exceptions : none + Caller : general + Status : Medium Risk + : At some time may be deprecated to instead use + : TranscriptAdaptor::fetch_all_by_external_name + +=cut + +sub fetch_all_by_external_name { + my ( $self, $external_name, $external_db_name, $override ) = @_; + + my $entry_adaptor = $self->db->get_DBEntryAdaptor(); + + my @ids = $entry_adaptor->list_translation_ids_by_extids( + $external_name, $external_db_name, $override ); + + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + + my @out; + foreach my $id (@ids) { + my $transcript = $transcript_adaptor->fetch_by_translation_id($id); + + if ( defined($transcript) ) { + push @out, $self->fetch_by_Transcript($transcript); + } + } + + return \@out; +} + +=head2 fetch_all_by_GOTerm + + Arg [1] : Bio::EnsEMBL::OntologyTerm + The GO term for which translations should be fetched. + + Example: @translations = @{ + $translation_adaptor->fetch_all_by_GOTerm( + $go_adaptor->fetch_by_accession('GO:0030326') ) }; + + Description : Retrieves a list of translations that are + associated with the given GO term, or with any of + its descendent GO terms. + + Return type : listref of Bio::EnsEMBL::Translation + Exceptions : Throws of argument is not a GO term + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_GOTerm { + my ( $self, $term ) = @_; + + assert_ref( $term, 'Bio::EnsEMBL::OntologyTerm' ); + if ( $term->ontology() ne 'GO' ) { + throw('Argument is not a GO term'); + } + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + + my %unique_dbIDs; + foreach my $accession ( map { $_->accession() } + ( $term, @{ $term->descendants() } ) ) + { + my @ids = + $entryAdaptor->list_translation_ids_by_extids( $accession, 'GO' ); + foreach my $dbID (@ids) { $unique_dbIDs{$dbID} = 1 } + } + + my @result; + if ( scalar( keys(%unique_dbIDs) ) > 0 ) { + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + + foreach my $dbID ( sort { $a <=> $b } keys(%unique_dbIDs) ) { + my $transcript = + $transcript_adaptor->fetch_by_translation_id($dbID); + if ( defined($transcript) ) { + push( @result, $self->fetch_by_Transcript($transcript) ); + } + } + } + + return \@result; +} ## end sub fetch_all_by_GOTerm + +=head2 fetch_all_by_GOTerm_accession + + Arg [1] : String + The GO term accession for which genes should be + fetched. + + Example : + + @genes = + @{ $gene_adaptor->fetch_all_by_GOTerm_accession('GO:0030326') }; + + Description : Retrieves a list of genes that are associated with + the given GO term, or with any of its descendent + GO terms. The genes returned are in their native + coordinate system, i.e. in the coordinate system + in which they are stored in the database. If + another coordinate system is required then the + Gene::transfer or Gene::transform method can be + used. + + Return type : listref of Bio::EnsEMBL::Gene + Exceptions : Throws of argument is not a GO term accession + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_GOTerm_accession { + my ( $self, $accession ) = @_; + + if ( $accession !~ /^GO:/ ) { + throw('Argument is not a GO term accession'); + } + + my $goAdaptor = + Bio::EnsEMBL::Registry->get_adaptor( 'Multi', 'Ontology', + 'OntologyTerm' ); + + my $term = $goAdaptor->fetch_by_accession($accession); + + return $self->fetch_all_by_GOTerm($term); +} + +=head2 store + + Arg [1] : Bio::EnsEMBL::Translation $translation + The translation object to be stored in the database + Example : $transl_id = $translation_adaptor->store($translation); + Description: Stores a translation object in the database + Returntype : int - the new dbID of the stored translation + Exceptions : thrown if the dbID of the start_Exon or end_Exon is not + defined. + thrown if only partial stable id information is present (e.g. + identifier but not version number) + Caller : Transcript::store + Status : Stable + +=cut + +sub store { + my ( $self, $translation, $transcript_id ) = @_; + + my $start_exon = $translation->start_Exon(); + my $end_exon = $translation->end_Exon(); + + if(!$start_exon) { + throw("Translation must define a start_Exon to be stored."); + } + + if(!$end_exon) { + throw("Translation must define an end_Exon to be stored."); + } + + if(!$start_exon->dbID) { + throw("start_Exon must have a dbID for Translation to be stored."); + } + + if(!$end_exon->dbID) { + throw("end_Exon must have a dbID for Translation to be stored."); + } + + my $store_translation_sql = qq( + INSERT INTO translation + SET seq_start = ?, + start_exon_id = ?, + seq_end = ?, + end_exon_id = ?, + transcript_id = ? + ); + + if (defined($translation->stable_id)) { + my $created = $self->db->dbc->from_seconds_to_date($translation->created_date()); + my $modified = $self->db->dbc->from_seconds_to_date($translation->modified_date()); + $store_translation_sql .= ", stable_id = ?, version = ?, created_date = " . $created . " , modified_date = " . $modified; + + } + + my $sth = $self->prepare($store_translation_sql); + $sth->bind_param(1,$translation->start,SQL_INTEGER); + $sth->bind_param(2,$translation->start_Exon->dbID,SQL_INTEGER); + $sth->bind_param(3,$translation->end,SQL_INTEGER); + $sth->bind_param(4,$translation->end_Exon->dbID,SQL_INTEGER); + $sth->bind_param(5,$transcript_id,SQL_INTEGER); + + + if (defined($translation->stable_id)) { + + $sth->bind_param(6, $translation->stable_id,SQL_VARCHAR); + my $version = ($translation->version()) ? $translation->version() : 1; + $sth->bind_param(7, $version,SQL_VARCHAR); + } + + $sth->execute(); + + my $transl_dbID = $sth->{'mysql_insertid'}; + + # + # store object xref mappings to translations + # + + my $dbEntryAdaptor = $self->db()->get_DBEntryAdaptor(); + # store each of the xrefs for this translation + foreach my $dbl ( @{$translation->get_all_DBEntries} ) { + $dbEntryAdaptor->store( $dbl, $transl_dbID, "Translation", 1 ); + } + + #storing the protein features associated with the translation + my $pfadaptor = $self->db->get_ProteinFeatureAdaptor(); + foreach my $pf(@{$translation->get_all_ProteinFeatures}){ + $pfadaptor->store($pf, $transl_dbID); + } + + $translation->get_all_Attributes(); + + # store any translation attributes that are defined + my $attr_adaptor = $self->db->get_AttributeAdaptor(); + $attr_adaptor->store_on_Translation($transl_dbID, + $translation->get_all_Attributes()); + + $translation->dbID($transl_dbID); + $translation->adaptor($self); + + return $transl_dbID; +} + + + +=head2 remove + + Arg [1] : Bio::EnsEMBL::Translation $translation + Example : $translation_adaptor->remove($translation); + Description: Removes a translation completely from the database, and all + associated information including protein features etc. + Returntype : none + Exceptions : throw on incorrect arguments + warning if translation is not in this database + Caller : TranscriptAdaptor::remove + Status : Stable + +=cut + +sub remove { + my $self = shift; + my $translation = shift; + + if(!ref($translation) || !$translation->isa('Bio::EnsEMBL::Translation')) { + throw("Bio::EnsEMBL::Translation argument expected."); + } + + if( !$translation->is_stored($self->db()) ) { + warning("Cannot remove translation " . $translation->dbID() . + ". Is not stored in this database."); + return; + } + + # remove athe attributes associated with this translation + my $attrib_adp = $self->db->get_AttributeAdaptor; + $attrib_adp->remove_from_Translation($translation); + + # remove all xref associations to this translation + my $dbe_adaptor = $self->db()->get_DBEntryAdaptor(); + foreach my $dbe (@{$translation->get_all_DBEntries()}) { + $dbe_adaptor->remove_from_object($dbe, $translation, 'Translation'); + } + + # remove all protein_features on this translation + my $sth = $self->prepare + ("DELETE FROM protein_feature WHERE translation_id = ?"); + $sth->bind_param(1,$translation->dbID,SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + # remove the translation itself + + $sth = $self->prepare("DELETE FROM translation WHERE translation_id = ?" ); + $sth->bind_param(1,$translation->dbID,SQL_INTEGER); + $sth->execute(); + $sth->finish(); + + $translation->dbID( undef ); + $translation->adaptor(undef); + + return +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @translation_ids = @{$translation_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all translations in the current db + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs("translation"); +} + + +=head2 list_stable_ids + + Arg [1] : none + Example : @transl_stable_ids = @{$transl_adaptor->list_stable_dbIDs()}; + Description: Gets an array of stable ids for all translations in the current + db + Returntype : reference to a list of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_stable_ids { + my ($self) = @_; + + return $self->_list_dbIDs("translation", "stable_id"); +} + + + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + The internal identifier of the Translation to obtain + Example : $translation = $translation_adaptor->fetch_by_dbID(1234); + Description: This fetches a Translation object via its internal id. + This is only debatably useful since translations do + not make much sense outside of the context of their + Transcript. Consider using fetch_by_Transcript instead. + Returntype : Bio::EnsEMBL::Translation, or undef if the translation is not + found. + Exceptions : warning if an additional (old style) Transcript argument is + provided + Caller : ? + Status : Stable + +=cut + +sub fetch_by_dbID { + my ( $self, $dbID, $transcript ) = @_; + + if ($transcript) { + deprecate( "Use of fetch_by_dbID " + . "with a Transcript argument is deprecated." + . "Use fetch_by_Transcript instead." ); + } + + if ( !defined($dbID) ) { + throw("dbID argument is required"); + } + + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + $transcript = $transcript_adaptor->fetch_by_translation_id($dbID); + + if ( defined($transcript) ) { + my $translation = $self->fetch_by_Transcript($transcript); + + if ( defined($translation) && $translation->dbID()==$dbID ) { + return $translation; + } + + my @alt_translations = + @{ $self->fetch_all_alternative_by_Transcript($transcript) }; + + foreach my $alt_translation (@alt_translations) { + if ( $alt_translation->dbID() == $dbID ) { + return $alt_translation; + } + } + } + + return undef; +} ## end sub fetch_by_dbID + + +=head2 fetch_by_stable_id + + Arg [1] : string $stable_id + The stable identifier of the Translation to obtain + Example : $translation = $translation_adaptor->fetch_by_stable_id("ENSP00001"); + Description: This fetches a Translation object via its stable id. + This is only debatably useful since translations do + not make much sense outside of the context of their + Transcript. Consider using fetch_by_Transcript instead. + Returntype : Bio::EnsEMBL::Translation or undef if the translation is not + found. + Exceptions : warning if an additional (old style) Transcript argument is + provided + Caller : ? + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ($self,$stable_id) = @_; + + if(!$stable_id) { + throw("stable id argument is required"); + } + + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + my $transcript = + $transcript_adaptor->fetch_by_translation_stable_id($stable_id); + + return undef if(!$transcript); + + return $self->fetch_by_Transcript($transcript); +} + + +=head2 fetch_all_by_Transcript_list + + Arg [1] : reference to list of Bio::EnsEMBL::Transcripts $transcripts + The list of $transcripts to obtain Translation object for. + Example : @translations = @{$tla->fetch_all_by_Transcript_list([$t1,$t2]); + Description: Fetches all translations associated with the list of transcripts + passed to this method. The passed transcripts will also have + their translation set by this method. + Returntype : Reference to list of Bio::EnsEMBL::Translations + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Transcript_list { + my ($self,$transcripts) = @_; + + if(!defined($transcripts) || ref($transcripts) ne 'ARRAY') { + throw("reference to list of Transcripts argument is required"); + } + + return [] if(!@$transcripts); + + my %trans_hash = map {$_->dbID() => $_} @$transcripts; + my @id_list = keys %trans_hash; + + my @out; + + # mysql is faster and we ensure that we do not exceed the max query size by + # splitting large queries into smaller queries of 200 ids + my $max_size = 200; + + my ( $transcript_id, $translation_id, $start_exon_id, $end_exon_id, + $seq_start, $seq_end, $stable_id, $version, + $created_date, $modified_date ); + + my %ex_hash; + + while(@id_list) { + my @ids; + if(@id_list > $max_size) { + @ids = splice(@id_list, 0, $max_size); + } else { + @ids = splice(@id_list, 0); + } + + my $id_str; + if(@ids > 1) { + $id_str = " IN (" . join(',', @ids). ")"; + } else { + $id_str = " = " . $ids[0]; + } + + my $canonical_lookup = $self->dbc()->sql_helper()->execute_into_hash( + -SQL => 'SELECT transcript_id, canonical_translation_id FROM transcript WHERE transcript_id '.$id_str + ); + + my $created_date = $self->db->dbc->from_date_to_seconds("tl.created_date"); + my $modified_date = $self->db->dbc->from_date_to_seconds("tl.modified_date"); + + my $sth = $self->prepare + ("SELECT tl.transcript_id, tl.translation_id, tl.start_exon_id, + tl.end_exon_id, tl.seq_start, tl.seq_end, + tl.stable_id, tl.version, " . $created_date . "," . + $modified_date . + " FROM translation tl + WHERE tl.transcript_id $id_str"); + + $sth->execute(); + + $sth->bind_columns( \$transcript_id, \$translation_id, \$start_exon_id, \$end_exon_id, + \$seq_start, \$seq_end, \$stable_id, \$version, + \$created_date, \$modified_date ); + + while($sth->fetch()) { + my ($start_exon, $end_exon); + + # this will load all the exons whenever we load the translation + # but I guess thats ok .... + + my $tr = $trans_hash{$transcript_id}; + + foreach my $exon (@{$tr->get_all_Exons()}) { + if(!$start_exon && $exon->dbID() == $start_exon_id ) { + $start_exon = $exon; + last if($end_exon); + } + + if(!$end_exon && $exon->dbID() == $end_exon_id ) { + $end_exon = $exon; + last if($start_exon); + } + } + + unless($start_exon && $end_exon) { + throw("Could not find start or end exon in transcript\n"); + } + + my $tl = Bio::EnsEMBL::Translation->new + (-dbID => $translation_id, + -seq_start => $seq_start, + -seq_end => $seq_end, + -start_exon => $start_exon, + -end_exon => $end_exon, + -stable_id => $stable_id, + -version => $version, + -created_date => $created_date || undef, + -modified_date => $modified_date || undef); + + $tl->adaptor($self); + my $canonical_translation_id = $canonical_lookup->{$transcript_id}; + $tr->translation($tl) if $translation_id == $canonical_translation_id; + + push @out, $tl; + } + } + + return \@out; +} + + + +=head2 fetch_all_by_DBEntry + + Description: DEPRECATED, this has been renames fetch_all_by_external_name + +=cut + +sub fetch_all_by_DBEntry { + my $self = shift; + deprecate("Use fetch_all_by_external_name instead."); + return $self->fetch_all_by_external_name(@_); +} + +=head2 get_stable_entry_info + + Description: DEPRECATED - This method should no longer be needed. Stable + id info is fetched when the transcript is. + +=cut + +sub get_stable_entry_info { + my ($self,$translation) = @_; + + deprecate( "This method shouldnt be necessary any more" ); + + unless(defined $translation && ref $translation && + $translation->isa('Bio::EnsEMBL::Translation') ) { + throw("Needs a Translation object, not a [$translation]"); + } + + my $sth = $self->prepare("SELECT stable_id, version + FROM translation + WHERE translation_id = ?"); + $sth->bind_param(1,$translation->dbID,SQL_INTEGER); + $sth->execute(); + + my @array = $sth->fetchrow_array(); + $translation->{'_stable_id'} = $array[0]; + $translation->{'_version'} = $array[1]; + + return 1; +} + +=head2 fetch_all + + Example : $translations = $translation_adaptor->fetch_all(); + Description : Retrieves all canonical and alternative translations + stored in the database. + Returntype : listref of Bio::EnsEMBL::Translation + Caller : general + Status : At Risk + +=cut + +sub fetch_all { + my ($self) = @_; + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + + my @translations; + foreach my $transcript (@{$transcript_adaptor->fetch_all}) { + my $translation = $self->fetch_by_Transcript($transcript); + if ($translation) { + push @translations, $translation; + } + foreach my $alt_translation (@{$self->fetch_all_alternative_by_Transcript($transcript)}) { + push @translations, $alt_translation; + } + } + return \@translations; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/UnconventionalTranscriptAssociationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/UnconventionalTranscriptAssociationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,239 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::UnconventionalTranscriptAssociationAdaptor + +=head1 SYNOPSIS + + $utaa = $registry->get_adaptor( 'Human', 'Core', + 'UnconventionalTranscriptAssociation' ); + + my $uta = $utaa->fetch_all_by_type('antisense'); + +=head1 DESCRIPTION + +This is an adaptor for the retrieval and storage of +UnconventionalTranscriptAssociation objects from the database. Most of +the implementation is in the superclass BaseFeatureAdaptor. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::UnconventionalTranscriptAssociationAdaptor; + +use strict; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::UnconventionalTranscriptAssociation; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + + + +=head2 fetch_all_by_interaction_type + + Arg [1] : String type + the type of associations to obtain + Example : $utas = $utaa->fetch_all_by_type('antisense'); + Description: Obtains all unconventional transcript associations that + have a particular interaction type. + NOTE: In a multi-species database, this method will + return all the entries matching the search criteria, not + just the ones associated with the current species. + Returntype : listREF of Bio::EnsEMBL::UnconventionalTranscriptAssociations + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_all_by_interaction_type { + + my( $self, $type) = @_; + + my $sth = $self->prepare("SELECT transcript_id, gene_id, interaction_type " . + "FROM unconventional_transcript_association " . + "WHERE interaction_type = ?"); + + $sth->bind_param(1, $type, SQL_VARCHAR); + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; + +} + + +=head2 fetch_all_by_gene + + Arg [1] : String gene the gene of associations to obtain + Arg [2] : (optional) An interaction type; if set, only associations of this type will be returned. + Example : $utas = $utaa->fetch_all_by_gene($gene, 'antisense'); + Description: Obtains all unconventional transcript associations that involve + a particular gene. + Returntype : listREF of Bio::EnsEMBL::UnconventionalTranscriptAssociations + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_all_by_gene { + + my( $self, $gene, $type) = @_; + + if(!ref($gene) || !$gene->isa('Bio::EnsEMBL::Gene')) { + throw('$gene must be a Bio::EnsEMBL::Gene'); + } + + my $sql = "SELECT transcript_id, gene_id, interaction_type FROM unconventional_transcript_association WHERE gene_id = ?"; + $sql .= " AND interaction_type = ?" if ($type); + + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $gene->dbID(), SQL_INTEGER); + $sth->bind_param(2, $type, SQL_VARCHAR) if ($type); + + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; + +} + + +=head2 fetch_all_by_transcript + + Arg [1] : String transcript the transcript of associations to obtain + Arg [2] : (optional) An interaction type; if set, only associations of this type will be returned. + Example : $utas = $utaa->fetch_all_by_transcript($transcript, 'antisense'); + Description: Obtains all unconventional transcript associations that involve + a particular transcript. + Returntype : listREF of Bio::EnsEMBL::UnconventionalTranscriptAssociations + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_all_by_transcript { + + my( $self, $transcript, $type) = @_; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw('$transcript must be a Bio::EnsEMBL::Transcript'); + } + + my $sql = "SELECT transcript_id, gene_id, interaction_type FROM unconventional_transcript_association WHERE transcript_id = ?"; + $sql .= " AND interaction_type = ?" if ($type); + + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $transcript->dbID(), SQL_INTEGER); + $sth->bind_param(2, $type, SQL_VARCHAR) if ($type); + + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; + +} + + +=head2 store + + Arg [1] : Bio::EnsEMBL::UnconventionalTranscriptAssociation + the unconventional transcript association to store in the database + Example : $utaa_adaptor->store($uta); + Description: stores unconventional transcript associations in the database + Returntype : none + Exceptions : + Caller : general + Status : At Risk + : under development + +=cut + +sub store { + + my( $self, $uta ) = @_; + + if(!ref($uta) || !$uta->isa('Bio::EnsEMBL::UnconventionalTranscriptAssociation')) { + throw('$uta must be a Bio::EnsEMBL::UnconventionalTranscriptAssociation'); + } + + my $sth = $self->prepare(qq {INSERT into unconventional_transcript_association + (transcript_id, gene_id, interaction_type) VALUES (?,?,?)}); + + $sth->bind_param(1, $uta->transcript()->dbID(), SQL_INTEGER); + $sth->bind_param(2, $uta->gene()->dbID, SQL_INTEGER); + $sth->bind_param(3, $uta->interaction_type(), SQL_VARCHAR); + + $sth->execute(); + +} + + + +sub _objs_from_sth { + + my ($self, $sth) = @_; + + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + my $gene_adaptor = $self->db()->get_GeneAdaptor(); + + my ($gene_id, $transcript_id, $type); + $sth->bind_columns(\$transcript_id, \$gene_id, \$type); + + my @results; + + while($sth->fetch()) { + + my $gene = $gene_adaptor->fetch_by_dbID($gene_id); + my $transcript = $transcript_adaptor->fetch_by_dbID($transcript_id); + + my $obj = Bio::EnsEMBL::UnconventionalTranscriptAssociation->new($transcript, $gene, $type); + push @results, $obj; + + } + + return \@results; +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DBSQL/UnmappedObjectAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/UnmappedObjectAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor + +=head1 SYNOPSIS + + my $uoa = $database_adaptor->get_UnmappedObjectAdaptor(); + + my $missed = @{ $uoa->fetch_all_by_type('xref') }; + +=head1 DESCRIPTION + +Unmapped ObjectAdaptor - An adaptor responsible for the creation, +editing, retrieval of Unmapped Objects. These being the Objects that +where not mapped in a specific process i.e. xref, cDNA, Markers. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor; +use vars qw(@ISA); +use strict; + + +use POSIX; +use Bio::EnsEMBL::Utils::Cache; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::UnmappedObject; +use Bio::EnsEMBL::Analysis; +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + +our %desc_to_id; + +=head2 new + + Arg [1] : list of args @args + Superclass constructor arguments + Example : none + Description: Constructor which just initializes internal cache structures + Returntype : Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor + Exceptions : none + Caller : implementing subclass constructors + Status : At Risk + +=cut + +sub new { + my $proto = shift; + + my $class = ref($proto) || $proto; + + my $self = $class->SUPER::new(@_); + + my $sth = + $self->prepare( "SELECT unmapped_reason_id, full_description " + . "FROM unmapped_reason" ); + + $sth->execute(); + + my ( $id, $desc ); + $sth->bind_columns( \( $id, $desc ) ); + + while ( $sth->fetch() ) { + $desc_to_id{$desc} = $id; + } + + $sth->finish(); + + return $self; +} + + +# _tables +# Arg [1] : none +# Description: PROTECTED implementation of superclass abstract method +# returns the names, aliases of the tables to use for queries +# Returntype : list of listrefs of strings +# Exceptions : none +# Caller : internal +# Status : At Risk +sub _tables { + my $self = shift; + + return (['unmapped_object', 'uo'], + ['unmapped_reason', 'ur']); +} + + +# _columns +# Arg [1] : none +# Example : none +# Description: PROTECTED implementation of superclass abstract method +# returns a list of columns to use for queries +# Returntype : list of strings +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _columns { + my $self = shift; + + return qw(uo.unmapped_object_id uo.type uo.analysis_id uo.external_db_id + uo.identifier uo.unmapped_reason_id uo.query_score uo.target_score + uo.ensembl_id uo.ensembl_object_type + ur.summary_description ur.full_description); +} + +sub _left_join { + return ( [ + 'unmapped_object', "uo.unmapped_reason_id = ur.unmapped_reason_id" + ] ); +} + +=head2 list_dbIDs + + Arg [1] : none + Example : @unmapped_object_ids = @{$unmapped_object_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all unmapped_objects in the current db + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs("unmapped_object"); +} + +=head2 list_unmapped_reasons + + Arg [1] : none + Example : @unmapped_object_reason+ids = + @{$unmapped_object_adaptor->list_unmapped_reasons()}; + Description: Gets an array of internal ids for all unmapped_objects in the current db + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_unmapped_reasons { + my ($self) = @_; + + return $self->_list_dbIDs("unmapped_reason"); +} + + +# _objs_from_sth + +# Arg [1] : StatementHandle $sth +# Example : none +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of UnmappedObjects +# Returntype : listref of Bio::EnsEMBL::UnmappedObjects +# Exceptions : none +# Caller : internal +# Status : At Risk + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my($unmapped_object_id, $type, $analysis_id, $external_db_id, $identifier, + $unmapped_reason_id, $query_score, $target_score, $ensembl_id, + $ensembl_object_type, $summary, $full_desc); + + $sth->bind_columns(\$unmapped_object_id,\$type, \$analysis_id, + \$external_db_id, \$identifier, \$unmapped_reason_id, + \$query_score, \$target_score, \$ensembl_id, + \$ensembl_object_type, \$summary, \$full_desc); + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); + + my @features; + while($sth->fetch()) { + my $analysis = $analysis_adaptor->fetch_by_dbID($analysis_id); + + #print "$identifier\n"; + + push( @features, + $self->_create_feature( + 'Bio::EnsEMBL::UnmappedObject', { + -unmapped_object_id => $unmapped_object_id, + -unmapped_reason_id => $unmapped_reason_id, + -type => $type, + -analysis => $analysis, + -external_db_id => $external_db_id, + -identifier => $identifier, + -query_score => $query_score, + -target_score => $target_score, + -ensembl_id => $ensembl_id, + -ensembl_object_type => $ensembl_object_type, + -summary => $summary, + -full_desc => $full_desc, + -adaptor => $self + } ) ); + + } + return \@features; +} + + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::UnmappedObjects @uo + the unmapped objects to store in the database + Example : $ou_adaptor->store(@uo); + Description: Stores a list of unmapped objects in the database + Returntype : none + Exceptions : thrown if no Analysis, or no list of objects to store. + Caller : general + Status : Stable + +=cut + +sub store{ + my ($self,@uos) = @_; + + if( scalar(@uos) == 0 ) { + throw("Must call store with list of UnmappedObjects"); + } + + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + my $sth_reason = $self->prepare + ("INSERT INTO unmapped_reason (summary_description, full_description)". + " VALUES (?,?)"); + + my $sth_unmapped_object = $self->prepare + ("INSERT INTO unmapped_object (type, analysis_id, external_db_id, + identifier, unmapped_reason_id, query_score, target_score, + ensembl_id, ensembl_object_type)". + " VALUES (?,?,?,?,?,?,?,?,?)"); + + FEATURE: foreach my $uo ( @uos ) { + + if( !ref $uo || !$uo->isa("Bio::EnsEMBL::UnmappedObject") ) { + throw("UnmappedObject must be an Ensembl UnmappedObject, " . + "not a [".ref($uo)."]"); + } + if($uo->is_stored($db)){ + next; + } + + my $analysis = $uo->analysis(); + throw("UnmappedObject must have an analysis object.".$uo->analysis."\n") if(!defined($analysis)); + + my $analysis_id; + if($analysis->is_stored($db)) { + $analysis_id = $analysis->dbID(); + } else { + $analysis_id = $db->get_AnalysisAdaptor->store($analysis); + } + + #First check to see unmapped reason is stored + if(!defined($desc_to_id{$uo->{'description'}})){ + $sth_reason->bind_param(1,$uo->{'summary'},SQL_VARCHAR); + $sth_reason->bind_param(2,$uo->{'description'},SQL_VARCHAR); + $sth_reason->execute(); + $uo->{'unmapped_reason_id'} = $desc_to_id{$uo->{'description'}} + = $sth_reason->{'mysql_insertid'}; + + } + else{ + $uo->{'unmapped_reason_id'} = $desc_to_id{$uo->{'description'}} ; + } + $sth_unmapped_object->bind_param(1,$uo->{'type'},SQL_VARCHAR); + $sth_unmapped_object->bind_param(2,$uo->analysis->dbID,SQL_INTEGER); + $sth_unmapped_object->bind_param(3,$uo->{'external_db_id'},SQL_INTEGER); + $sth_unmapped_object->bind_param(4,$uo->{'identifier'},SQL_VARCHAR); + $sth_unmapped_object->bind_param(5,$uo->{'unmapped_reason_id'},SQL_VARCHAR); + $sth_unmapped_object->bind_param(6,$uo->{'query_score'},SQL_DOUBLE); + $sth_unmapped_object->bind_param(7,$uo->{'target_score'},SQL_DOUBLE); + $sth_unmapped_object->bind_param(8,$uo->{'ensembl_id'},SQL_INTEGER); + $sth_unmapped_object->bind_param(9,$uo->{'ensembl_object_type'},SQL_VARCHAR); + $sth_unmapped_object->execute(); + $uo->dbID($sth_unmapped_object->{'mysql_insertid'}); + } + $sth_reason->finish(); + return; +} + + +=head2 fetch_all_by_type + + Arg [1] : string type. The type of unmapped objects + Example : @unmapped_object = @{$uoa->fetch_all_by_type('xref')}; + Description : Retrieves all the unmapped object for a particular + type. e.g. 'xref','cDNA', 'marker' + Returntype : Array ref of Bio::EnsEMBL::UnmappedObject + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_type { + my ($self, $type) = @_; + + unless($type) { + throw("type argument is required"); + } + $self->bind_param_generic_fetch($type,SQL_VARCHAR); + $self->generic_fetch("uo.type = ?"); + +} + +=head2 fetch_all_by_analysis + + Arg [1] : Bio:EnsEMBL::Analysis object + Arg [2] : (optional) string database name + Example : @unmapped_object = @{$uoa->fetch_all_by_analysis($analysis)}; + Description : Retrieves all the unmapped object for a particular + analysis type with the the option of a particular + database type. + Returntype : array ref of Bio::EnsEMBL::UnmappedObject + Exceptions : thorws if first argument is not an anaylisi object + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_analysis { + my ($self, $analysis,$dbname) = @_; + + unless($analysis) { + throw("analysis argument is required"); + } + $self->bind_param_generic_fetch($analysis->dbID,SQL_INTEGER); + my $constraint = "uo.analysis_id = ?"; + if(defined($dbname)){ + my $db_id =0; + my $sth = $self->prepare('select external_db_id from external_db where db_name like "'. + $dbname.'"'); + $sth->execute; + $sth->bind_columns(\$db_id); + $sth->fetch(); + if(!defined($db_id) or $db_id == 0){ + throw("$dbname could not be found in the external database table\n"); + } + $self->bind_param_generic_fetch($db_id,SQL_INTEGER); + $constraint .= " AND uo.external_db_id = ?"; + } + $self->generic_fetch($constraint); + +} + +=head2 fetch_by_identifier + + Arg [1] : string type. The type of unmapped objects + Arg [2] : (optional) string database name + Example : @unmapped_object = @{$uoa->fetch_by_identifier('Q123345')}; + Description : Retrieves the unmapped object for a particular + identifier/accession + Returntype : array ref of Bio::EnsEMBL::UnmappedObject + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_by_identifier { + my ($self, $identifier, $dbname) = @_; + + unless($identifier) { + throw("identifier argument is required"); + } + $self->bind_param_generic_fetch($identifier,SQL_VARCHAR); + my $constraint = 'uo.identifier like ?'; + + if(defined($dbname)){ + my $db_id =0; + my $sth = $self->prepare('select external_db_id from external_db where db_name like "'. + $dbname.'"'); + $sth->execute; + $sth->bind_columns(\$db_id); + $sth->fetch(); + if(!defined($db_id) or $db_id == 0){ + throw("$dbname could not be found in the external database table\n"); + } + $self->bind_param_generic_fetch($db_id,SQL_INTEGER); + $constraint .= " AND uo.external_db_id = ?"; + } + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_by_object_type_id + + Arg [1] : string - The object type of the ensembl object e.g. Gene + Arg [2] : int - The internal dbID of the ensembl object + Example : my @unmapped_objects = @{$uoa->fetch_all_by_object_type_id('Gene', 12341)}; + Description : Retrieves the unmapped objects for a particular ensembl object + This is a base method which should be called by wrapper methods + defining the correct object type e.g. $uoa->fetch_all_by_Gene($gene) + Returntype : array ref of Bio::EnsEMBL::UnmappedObject objects + Exceptions : Throws if arguments are not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_object_type_id { + my ($self, $object_type, $dbid) = @_; + + if(! ($object_type && $dbid)){ + throw("object_type and dbid arguments required"); + } + + $self->bind_param_generic_fetch($object_type, SQL_VARCHAR); + $self->bind_param_generic_fetch($dbid, SQL_INTEGER); + + my $constraint = 'uo.ensembl_object_type=? and uo.ensembl_id=?'; + + return $self->generic_fetch($constraint); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DataFile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DataFile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +package Bio::EnsEMBL::DataFile; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Storable/; + +use Bio::EnsEMBL::ApiVersion; +use Bio::EnsEMBL::Utils::Argument qw/rearrange/; +use Bio::EnsEMBL::Utils::Exception qw/throw warning/; +use Bio::EnsEMBL::Utils::Scalar qw/:assert/; +use Bio::EnsEMBL::Utils::URI qw/is_uri/; +use File::Spec; +use Scalar::Util qw(weaken isweak); + +=head2 new + + Arg [-ADAPTOR] : Bio::EnsEMBL::DBSQL::DataFileAdaptor + Arg [-DBID] : Integer $dbID + Arg [-COORD_SYSTEM] : Bio::EnsEMBL::CoordSystem $coord_system + Arg [-ANALYSIS] : Bio::EnsEMBL::Analysis $analysis + Arg [-NAME] : String $name + Arg [-VERSION_LOCK] : Boolean $version_lock + Arg [-ABSOLUTE] : Boolean $absolute + Arg [-URL] : String $url + Arg [-FILE_TYPE] : String $file_type + Example : Bio::EnsEMBL::DataFile->new(); + Description : Returns a new instance of this object + Returntype : Bio::EnsEMBL::DataFile + Exceptions : Thrown if data is not as expected + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($coord_system, $analysis, $name, $version_lock, $absolute, $url, $file_type) = + rearrange([qw/coord_system analysis name version_lock absolute url file_type/], @args); + + $self->coord_system($coord_system); + $self->analysis($analysis); + $self->name($name); + $self->version_lock($version_lock); + $self->absolute($absolute); + $self->url($url); + $self->file_type($file_type); + + return $self; +} + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Feature using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Feature + Caller : general, subclass constructors + Status : Stable + +=cut + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +=head2 get_ExternalAdaptor + + Arg[1] : Scalar; optional base path. Uses defaults if not given + Example : my $ea = $df->get_ExternalAdaptor('/base/path'); + Description : Delegates to the parent adaptor to retrieve the external + adaptor for this data type + Returntype : Adaptor; will be an adaptor that can read the given data file + Exceptions : Thrown if there is no attached adaptor. + +=cut + +sub get_ExternalAdaptor { + my ($self, $base_path) = @_; + my $adaptor = $self->adaptor(); + throw "No DataFileAdaptor found in this object. Cannot request ExternalAdaptor" if ! $adaptor; + return $adaptor->DataFile_to_adaptor($self, $base_path); +} + +=head2 path + + Arg[1] : Scalar base of the path to use. Can be ignored if the instance + already represents a canonical path + Example : my $f = $df->path(); + Description : Used to generate the path to the file resource. Can return a + path to the file or a URL but it is up to the using code to + know how to interprate the different returned forms. + + If the data file url is canonical then this is just returned. + If not then a path is generated of the form + B + + Returntype : Scalar the absolute path/url to the given resource + Exceptions : Thrown if the linked Coordinate System lacks a version and the + current database also lacks a default version + Caller : public + +=cut + + +sub path { + my ($self, $base) = @_; + my $all_paths = $self->get_all_paths($base); + return $all_paths->[0]; +} + +sub get_all_paths { + my ($self, $base) = @_; + + return [$self->url()] if $self->absolute(); + + my @all_paths; + + $base = $self->adaptor()->get_base_path($base) if ! $base; + + my $production_name = $self->adaptor()->db()->get_MetaContainer()->get_production_name(); + my $cs_version = $self->coord_system()->version(); + if(! $cs_version) { + my ($highest_cs) = @{$self->adaptor()->db()->get_CoordSystemAdaptor()->fetch_all()}; + $cs_version = $highest_cs->version(); + } + if(!$cs_version) { + my $name = $self->name(); + throw "The file '${name}' in species '${$production_name} is attached to a CoordinateSystem lacking a version and has no default assembly. Please fix"; + } + + my @portions; + push(@portions, $production_name); + push(@portions, $cs_version); + push(@portions, software_version()) if $self->version_lock(); + push(@portions, $self->adaptor()->db()->group()); + + #Targets are the files to generate + my @targets; + #If URL is populated we assume we need to add this onto the end but removing the / + if($self->url()) { + my @split = split(/\//, $self->url()); + push(@targets, [@split]); + } + else { + my $extensions = $self->adaptor()->DataFile_to_extensions($self); + foreach my $ext (@{$extensions}) { + my $filename = sprintf(q{%s.%s}, $self->name(), $ext); + push(@targets, [$filename]); + } + } + + my $is_uri = is_uri($base); + foreach my $t (@targets) { + my $path; + if($is_uri) { + $path = join(q{/}, $base, @portions, @{$t}); + } + else { + $path = File::Spec->catfile($base, @portions, @{$t}); + } + push(@all_paths, $path); + } + return \@all_paths; +} + +=head2 coord_system + + Arg[1] : Bio::EnsEMBL::CoordSystem Optional setter + Description : Mutator for the coord system field. All files are linked to one + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : Thrown if not of the expected type + +=cut + + +sub coord_system { + my ($self, $coord_system) = @_; + if(defined $coord_system) { + assert_ref($coord_system, 'Bio::EnsEMBL::CoordSystem', 'coord_system'); + $self->{'coord_system'} = $coord_system; + } + return $self->{'coord_system'}; +} + +=head2 analysis + + Arg[1] : Bio::EnsEMBL::Analysis Optional setter + Description : Mutator for the analysis field. All files are linked to one + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Thrown if not of the expected type + +=cut + +sub analysis { + my ($self, $analysis) = @_; + if(defined $analysis) { + assert_ref($analysis, 'Bio::EnsEMBL::Analysis', 'analysis'); + $self->{'analysis'} = $analysis; + } + return $self->{'analysis'}; +} + +=head2 name + + Arg[1] : String Optional setter + Description : Mutator for the name of the file. Can be used in file location + generation + Returntype : String + +=cut + +sub name { + my ($self, $name) = @_; + if(defined $name) { + $self->{'name'} = $name; + } + return $self->{'name'}; +} + +=head2 version_lock + + Arg[1] : Boolean Optional setter + Description : Boolean indicating if the file is linked to the version of the + database it was found in. + Returntype : Boolean + +=cut + +sub version_lock { + my ($self, $version_lock) = @_; + if(defined $version_lock) { + assert_boolean($version_lock, 'version_lock'); + $self->{'version_lock'} = $version_lock; + } + return $self->{'version_lock'}; +} + +=head2 absolute + + Arg[1] : Boolean Optional setter + Description : Indicates if the URL of this file is an absolute one i.e. + should be used verbatim or not. + Returntype : Boolean + +=cut + +sub absolute { + my ($self, $absolute) = @_; + if(defined $absolute) { + assert_boolean($absolute, 'absolute'); + $self->{'absolute'} = $absolute; + } + return $self->{'absolute'}; +} + +=head2 url + + Arg[1] : String Optional setter + Description : Location of the file. Can be optional and if set means once + we are in an automatic location use this value to locate + the file. + Returntype : String + +=cut + +sub url { + my ($self, $url) = @_; + $self->{'url'} = $url if defined $url; + return $self->{'url'}; +} + +=head2 file_type + + Arg[1] : String Optional setter + Description : The type of file we are working with. Can be used to generate + a file name. + Returntype : String + +=cut + +sub file_type { + my ($self, $file_type) = @_; + $self->{'file_type'} = $file_type if defined $file_type; + return $self->{'file_type'}; +} + +#=head2 files +# +# Args : +# Example : my $files = @{$df->files()}; +# Description : Returns all the file names we expect to cover for a flat file +# Returntype : type return_description +# Exceptions : +# Caller : caller +# Status : status +# +#=cut +# +# +#sub files { +# my ($self) = @_; +# +#} + +1; \ No newline at end of file diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DensityFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DensityFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,301 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DensityFeature - A feature representing a density, or +precentage coverage etc. in a given region. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::DensityFeature; + + $feature = Bio::EnsEMBL::DensityFeature->new( + -seq_region => $region, + -start => 1, + -end => 1e6, + -density_type => $dt, + -density_value => 98.5 + ); + +=head1 DESCRIPTION + +A density feature represents a count, density, or percentage coverage, +etc. for a given region. + +This module is part of the Ensembl project http://www.ensembl.org + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::DensityFeature; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::DensityType; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [SEQ_REGION] : the sequence over which the density was calculated. + + Arg [START] : start point on the seq at which density was calulated. + + Arg [END] : end point on the seq at which density was calulated. + + Arg [DENSITY_TYPE] : the type of density calculated. + + Arg [DENSITY_VALUE] : the density. + + Arg [...] : Named arguments passed to superclass + Example : $feature = Bio::EnsEMBL::DensityFeature->new + (-seq_region => $region, + -start => 1, + -end => 1e6, + -density_type => $dt, + -density_value => 98.5) + + Description: Creates a new density feature. + Returntype : Bio::EnsEMBL::DensityFeature + Exceptions : throw if invalid density value type is provided + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + + #allow constructor to be called as class or object method + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my($seq_region, $start, $end, $dt, $dv) = + rearrange(['SEQ_REGION', 'START', 'END', 'DENSITY_TYPE', 'DENSITY_VALUE'], + @_); + + throw("Density value must be >= 0.") if($dv < 0); + + if(!defined($dt)){ + throw("Density Type is NOT optional."); + } + + $self->{'density_type'} = $dt; + $self->{'density_value'} = $dv; + + $self->{'slice'} = $seq_region; + $self->{'start'} = $start; + $self->{'end'} = $end; + + + return $self; +} + + +=head2 new_fast + + Arg [...] : none + Example : $feature = Bio::EnsEMBL::DensityFeature->new_fast(); + Description: Creates a new density feature. + Returntype : Bio::EnsEMBL::DensityFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new_fast{ + my $caller = shift; + + #allow constructor to be called as class or object method + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + return $self; +} + + +=head2 strand + + Arg [1] : none + Example : $strand = $df->strand(); + Description: Getter fot the strand attribute. Density features always have + strand 0 and this attribute is not settable. + Returntype : int (always 0) + Exceptions : warning if an attempt is made to set the strand + Caller : general + Status : Stable + +=cut + +sub strand { + my $self = shift; + warning("DensityFeature strand is not settable") if(@_); + return 0; +} + + + +=head2 density_value + + Arg [1] : (optional) float $density_value + Example : $dv = $density_feature->density_value(); + Description: Getter/Setter for the density value of this DensityFeature. + The density value may be a count, a percentage, or a coverage + of a feature type in the area defined by this feature. + Returntype : float + Exceptions : throw if a negative density value is provided + Caller : general + Status : Stable + +=cut + +sub density_value { + my $self = shift; + + if(@_) { + my $density_value = shift; + throw("Density value must be >= 0.") if($density_value < 0); + $self->{'density_value'} = $density_value; + } + + return $self->{'density_value'}; +} + + + +=head2 analysis + + Arg [1] : (optional) Bio::EnsEMBL::Analysis $analysis + New value for the analysis of the attached DensityType + Example : print $df->analysis()->logic_name(); + Description: Overridden superclass analysis method, to chain to analysis + method on attached DensityType. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub analysis { + my $self = shift; + + my $dt = $self->density_type(); + + return undef if(!$dt); + + return $dt->analysis(@_); +} + + + +=head2 density_type + + Arg [1] : string $newval (optional) + The new value to set the density_value_type attribute to + Example : $density_value_type = $obj->density_value_type() + Description: Getter/Setter for the density_value_type attribute + Returntype : Bio::EnsEMBL::DensityType + Exceptions : if object passed is not of type DensityType + Caller : general + Status : Stable + +=cut + +sub density_type{ + my $self = shift; + if(@_) { + my $type = shift; + if( !ref $type || !$type->isa("Bio::EnsEMBL::DensityType")){ + throw("object passed must be an ensembl DensityType ". + "not a [".ref($type)."]"); + } + else{ + $self->{'density_type'}=$type; + } + } + return $self->{'density_type'}; +} + + +###BG######## + +=head2 scaledvalue + + Title : scaledvalue + Usage : $obj->scaledvalue($newval) + Function: + Returns : scalar - object's scaled value + Args : newvalue (optional) + Status : Stable + +=cut + +sub scaledvalue{ + my $obj = shift; + if( @_ ) { + my $scaledvalue = shift; + $obj->{'scaledvalue'} = $scaledvalue; + } + return $obj->{'scaledvalue'}; +} + + + +=head2 url + + Title : url + Usage : $obj->url($newval) + Function: + Returns : String containing this object's url + Args : newvalue (optional) + Status : Stable + + +=cut + +sub url{ + my $obj = shift; + if( @_ ) { + my $url = shift; + $obj->{'url'} = $url; + } + return $obj->{'url'}; + +} + + +1; + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DensityFeatureSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DensityFeatureSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,287 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DensityFeatureSet - +A feature representing a set of density features + +=head1 SYNOPSIS + + use Bio::EnsEMBL::DensityFeatureSet; + + my $densitySet = Bio::EnsEMBL::DensityFeatureSet->new( + -bin_array = \@out, + -stretch = 1, + ); + +=head1 DESCRIPTION + +A density feature set is a wrap around a array of density features with +additional information about the collective density feature set, such as +max_min_values and scale factors etc. a given region. + +This module is part of the Ensembl project http://www.ensembl.org + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DensityFeatureSet; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +=head2 new + + Description: Creates a new density feature set. + Returntype : Bio::EnsEMBL::DensityFeatureSet + Exceptions : throw if invalid density value type is provided + Caller : general + Status : Stable + +=cut + +sub new { + my $class = shift; + + my $max_value = undef; + my $min_value = undef; + + my($dfeats, $stretch, $scale_to_fit) = + rearrange(['FEATURES', 'STRETCH', 'SCALE_TO_FIT'], @_); + foreach (@$dfeats){ + my $value = $_->density_value; + $max_value = $value if (!defined($max_value) || $value > $max_value); + $min_value = $value if (!defined($min_value) || $value < $min_value); + } + + return bless {'bin_array' => $dfeats, + 'stretch' => $stretch, + 'scale_to_fit' => $scale_to_fit, + 'min_value' => $min_value, + 'max_value' => $max_value}, $class; +} + + +=head2 stretch + + Title : stretch + Usage : $obj->stretch($newval) + Function: gets/sets a boolean for whether we should stretch the data over the + range (i.e. from min to max rather than absolute numbers). + Returns : value of _stretch + Args : newvalue (optional) + Status : Stable + +=cut + +sub stretch{ + my $self = shift; + $self->{'stretch'} = shift if(@_); + return $self->{'stretch'}; +} + + +=head2 scale_to_fit + + Title : scale_to_fit + Usage : $obj->scale_to_fit($newval) + Function: gets/sets the number that the BinValues are to be scaled against - + i.e. the greatest BinValue->value will be scaled to this number, and the rest + scaled in proportion. + Returns : scale_to_fit value + Args : newvalue (optional) + Status : Stable + + +=cut + +sub scale_to_fit{ + my $self = shift; + $self->{'scale_to_fit'} = shift if (@_); + return $self->{'scale_to_fit'}; + +} + +=head2 colour + + Title : colour + Usage : $obj->colour($newval) + Function: + Returns : value of colour + Args : newvalue (optional) + Status : Stable + + +=cut + + +sub colour{ + my $self = shift; + $self->{'color'} = shift if(@_); + return $self->{'color'}; + +} + +=head2 label + + Title : label + Usage : $obj->label($newval) + Function: + Returns : String containing label + Args : newvalue (optional) + Status : Stable + + +=cut + +sub label{ + my $self = shift; + $self->{'label'} = shift if (@_); + return $self->{'label'}; + +} + + +=head2 label2 + + Title : label2 + Usage : $obj->label2($newval) + Function: + Returns : String containing label2 + Args : newvalue (optional) + Status : Stable + + +=cut + +sub label2{ + my $self = shift; + $self->{'label2'} = shift if (@_); + return $self->{'label2'}; +} + + + +=head2 get_all_binvalues + + Arg [1] : none + Example : @binvalues = @{$dfs->get_all_binvalues}; + Description: Scales all of the contained DensityFeatures by $scalefactor + and returns them. + Returntype : reference to a list of DensityFeatures + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_binvalues{ + my $self = shift; + my $max_value = $self->max_value(); + my $min_value = $self->min_value(); + + return [] if(!@{$self->{'bin_array'}}); + + my $width = $self->scale_to_fit(); + return [] unless defined($width); + # throw("Cannot scale values - scale_to_fit has not been set"); + + if ($self->stretch && ($max_value-$min_value) ){ + foreach my $bv (@{ $self->{'bin_array'}}){ + my $scaledval = (($bv->density_value - $min_value) / + ($max_value-$min_value) )* $width; + $bv->scaledvalue($scaledval); + } + } elsif($max_value) { + foreach my $bv (@{ $self->{'bin_array'}}){ + my $scaledval = ($bv->density_value / $max_value) * $width; + $bv->scaledvalue($scaledval); + } + } else { + foreach my $bv (@{ $self->{'bin_array'}}){ + $bv->scaledvalue(0); + } + } + + return $self->{'bin_array'}; +} + + +=head2 max_value + + Arg [1] : none + Example : my $max = $dfs->max_value(); + Description: Returns the maximum density feature value from the density + feature set + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub max_value{ $_[0]->{'max_value'};} + + +=head2 min_value + + Arg [1] : none + Example : my $min = $dfs->min_value(); + Description: Returns the minimum density feature value from the density + feature set. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub min_value{ $_[0]->{'min_value'};} + + + +=head2 size + + Arg [1] : none + Example : my $num_features = $dfs->size(); + Description: Returns the number of density features in this density feature + set. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub size { + my $self = shift; + return scalar @{$self->{'bin_array'}}; +} + +1; + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DensityPlot/BinValue.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DensityPlot/BinValue.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,167 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DensityPlot::BinValue + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +This object deals with the raw data to built the density plots + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DensityPlot::BinValue; +use vars qw($AUTOLOAD @ISA); +use strict; + +# Object preamble - inheriets from Bio::Root::Object + +use Bio::EnsEMBL::Root; + +@ISA = qw(Bio::EnsEMBL::Root Exporter); +#@EXPORT_OK = qw(); +# new() is inherited from Bio::Root::Object + +# _initialize is where the heavy stuff will happen when new is called + +sub new { + my ($class,@args) = @_; + + my $self = {}; + bless $self,$class; + return $self; +} + +=head2 chromosomestart + + Title : ChromosomeStart + Usage : $obj->ChromosomeStart($newval) + Function: + Returns : value of ChromosomeStart + Args : newvalue (optional) + + +=cut + +sub chromosomestart{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'chromosomestart'} = $value; + } + return $obj->{'chromosomestart'}; + +} + +=head2 chromosomeend + + Title : chromosomesnd + Usage : $obj->chromosomeend($newval) + Function: + Returns : value of chromosomeend + Args : newvalue (optional) + + +=cut + +sub chromosomeend{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'chromosomeend'} = $value; + } + return $obj->{'chromosomeend'}; + +} + + +=head2 value + + Title : value + Usage : $obj->value($newval) + Function: + Returns : value of value + Args : newvalue (optional) + + +=cut + +sub value{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'value'} = $value; + } + return $obj->{'value'}; + +} + + + +=head2 scaledvalue + + Title : scaledvalue + Usage : $obj->scaledvalue($newval) + Function: + Returns : this object's scaled value + Args : newvalue (optional) + + +=cut + +sub scaledvalue{ + my $obj = shift; + if( @_ ) { + my $scaledvalue = shift; + $obj->{'scaledvalue'} = $scaledvalue; + } + return $obj->{'scaledvalue'}; + +} + + + +=head2 url + + Title : url + Usage : $obj->url($newval) + Function: + Returns : this object's url + Args : newvalue (optional) + + +=cut + +sub url{ + my $obj = shift; + if( @_ ) { + my $url = shift; + $obj->{'url'} = $url; + } + return $obj->{'url'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DensityPlot/BinValueSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DensityPlot/BinValueSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,341 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +=head1 NAME + +Bio::EnsEMBL::DensityPlot::BinValueSet + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DensityPlot::BinValueSet; + +use vars qw($AUTOLOAD @ISA); +use strict; +use Bio::EnsEMBL::DensityPlot::BinValue; + +# Object preamble - inheriets from Bio::Root::Object + +use Bio::EnsEMBL::Root; + +@ISA = qw(Bio::EnsEMBL::Root Exporter); +#@EXPORT_OK = qw(); +# new() is inherited from Bio::Root::Object + +# _initialize is where the heavy stuff will happen when new is called + +sub new { + my ($class,@args) = @_; + + my $self = {}; + bless $self,$class; + $self->{'_bin_array'} = []; + return $self; +} + + + +=head2 add_binvalue + + Title : add_binValue + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub add_binvalue{ + my ($self,$value) = @_; + + defined ($value->chromosomestart) || $self->throw( "Bin Value object does not contain a ChromosomeStart method" ); + defined ($value->chromosomeend) || $self->throw( "Bin Value object does not contain a ChromosomeEnd method" ); + defined ($value->value) || $self->throw( "Bin Value object does not contain a Value method" ); + $self->_store_biggest($value->value); + $self->_store_smallest($value->value); + + push(@{$self->{'_bin_array'}},$value); +} + +=head2 get_binvalues + + Title : get_binvalues + Usage : my @binvalue_objects = $BVSet->get_binvalues + Function: scales all the binvalues by the scale_factor and returns them. + Example : + Returns : array of BinValue objects + Args : none + + +=cut + +sub get_binvalues{ + my $self = shift; + my $biggest_value = $self->{'_biggest_value'} || 0; + my $smallest_value = $self->{'_smallest_value'} || 0; + + if (!defined ($biggest_value)||!defined($smallest_value)){ + $self->throw("Cannot scale - no values to scale against"); + } + + my $width = $self->scale_to_fit(); + + if ($self->stretch && ($biggest_value-$smallest_value) ){ + foreach my $bv (@{ $self->{'_bin_array'}}){ + my $scaledval = (($bv->value - $smallest_value) / ($biggest_value-$smallest_value) )* $width; + $bv->scaledvalue($scaledval); + } + } elsif($biggest_value) { + foreach my $bv (@{ $self->{'_bin_array'}}){ + my $scaledval = ($bv->value / $biggest_value) * $width; + $bv->scaledvalue($scaledval); + } + } else { + foreach my $bv (@{ $self->{'_bin_array'}}){ + $bv->scaledvalue(0); + } + } + + return ( @{ $self->{'_bin_array'}} ); + +} + +sub size { + my $self = shift; + return scalar @{$self->{'_bin_array'}}; +} + +=head2 position + + Title : position + Usage : $obj->position($newval) + Function: + Returns : value of position + Args : newvalue (optional) + + +=cut + +sub position{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'position'} = $value; + } + return $self->{'position'}; + +} + + +=head2 label + + Title : label + Usage : $obj->label($newval) + Function: + Returns : value of label + Args : newvalue (optional) + + +=cut + +sub label{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'label'} = $value; + } + return $self->{'label'}; + +} + + +=head2 label2 + + Title : label2 + Usage : $obj->label2($newval) + Function: + Returns : value of label2 + Args : newvalue (optional) + + +=cut + +sub label2{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'label2'} = $value; + } + return $self->{'label2'}; + +} + + + +=head2 color + + Title : color + Usage : $obj->color($newval) + Function: + Returns : value of color + Args : newvalue (optional) + + +=cut + +sub color{ + my $self = shift; + + + if( @_ ) { + my $value = shift; + $self->{'color'} = $value; + } + return $self->{'color'}; + +} + +=head2 shape + + Title : shape + Usage : $obj->shape($newval) + Function: + Returns : value of shape + Args : newvalue (optional) + + +=cut + +sub shape{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'shape'} = $value; + } + return $self->{'shape'}; + +} + + + +=head2 stretch + + Title : stretch + Usage : $obj->stretch($newval) + Function: gets/sets a boolean for whether we should stretch the data over the + range (i.e. from min to max rather than absolute numbers). + Returns : value of _stretch + Args : newvalue (optional) + + +=cut + +sub stretch{ + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_stretch'} = $value; + } + return $self->{'_stretch'}; +} + + +=head2 scale_to_fit + + Title : scale_to_fit + Usage : $obj->scale_to_fit($newval) + Function: gets/sets the number that the BinValues are to be scaled against - + i.e. the greatest BinValue->value will be scaled to this number, and the rest + scaled in proportion. + Returns : scale_to_fit value + Args : newvalue (optional) + + +=cut + +sub scale_to_fit{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'_scale_to_fit'} = $value; + } + return $self->{'_scale_to_fit'}; + +} + + +=head2 _store_biggest + + Title : _store_biggest + Usage : $self->_store_biggest($newval) + Function: internal method for storing the largest BinValue->value in this set. + Returns : biggest value seen so far + Args : value + + +=cut + +sub _store_biggest { + my ($self,$val) = @_; + + if (!defined $self->{'_biggest_value'} || + $val > $self->{'_biggest_value'}){ + $self->{'_biggest_value'}=$val; + } + + return $self->{'_biggest_value'}; +} + + + +=head2 _store_smallest + + Title : _store_smallest + Usage : $self->_store_smallest($newval) + Function: internal method for storing the smallest BinValue->value in this set. + Returns : smallest value seen so far + Args : value + +=cut + +sub _store_smallest { + my ($self,$val) = @_; + + if (!defined($self->{'_smallest_value'})){ + $self->{'_smallest_value'}=$val; + } + + if (!defined($self->{'_smallest_value'}) || + $val < $self->{'_smallest_value'}){ + $self->{'_smallest_value'}=$val; + } + return $self->{'_smallest_value'}; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DensityType.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DensityType.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,195 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DensityType - A type representing a density, or percentage +coverage etc. in a given region. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::DensityType; + + $type = Bio::EnsEMBL::DensityType->new( + -analysis => $analysis, + -blocksize => 1000000, + -value_type => $type + ); + +=head1 DESCRIPTION + +A density type represents a density, or percentage coverage etc. in a +given region. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::DensityType; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + +=head2 new + + Arg [..] : Takes a set of named arguments + Example : $dt = new Bio::EnsEMBL::DensityType::DensityType( + -analysis => $analysis, + -blocksize => 1e6, + -value_type => 'sum') + + Description: Creates a new Density Type object + Returntype : Bio::EnsEMBL::DensityType + Exceptions : blocksize > 0, + valuetype must be 'sum' or 'ratio', + valid analysis object must be passed + Caller : general + Status : Stable + +=cut + + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + my ($analysis, $block_size, $value_type, $region_features) = + rearrange(['ANALYSIS','BLOCK_SIZE','VALUE_TYPE','REGION_FEATURES'],@_); + + if($analysis) { + if(!ref($analysis) || !$analysis->isa('Bio::EnsEMBL::Analysis')) { + throw('-ANALYSIS argument must be a Bio::EnsEMBL::Analysis not '. + $analysis); + } + } + + if($value_type ne "sum" and $value_type ne "ratio"){ + throw('-VALUE_TYPE argument must be "ratio" or "sum" not *'. + $value_type."*"); + } + + $block_size |= 0; + $region_features |= 0; + + if(! ($block_size xor $region_features )){ + throw('Set either -BLOCK_SIZE or -REGION_FEATURES, not both'); + } + + if( $block_size <0 or $region_features < 0 ) { + throw( 'No negative values for -BLOCK_SIZE or -REGION_FEATURES' ); + } + + + $self->{'analysis'} = $analysis; + $self->{'block_size'} = $block_size; + $self->{'value_type'} = $value_type; + $self->{'region_features'} = $region_features; + + return $self; +} + +=head2 analysis + + Arg [1] : Bio::EnsEMBL::Analysis + Description: get/set for attribute analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub analysis{ + my $self = shift; + + if(@_) { + my $a = shift; + if(defined($a) && (!ref($a) || !$a->isa('Bio::EnsEMBL::Analysis'))) { + throw("Argument must be undef or a Bio::EnsEMBL::Analysis object."); + } + $self->{'analysis'} = $a; + } + return $self->{'analysis'}; +} + +=head2 value_type + + Arg [1] : string $value_type + Description: gettter/setter for the type + Returntype : float + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub value_type{ + my $self = shift; + $self->{'value_type'} = shift if(@_); + return $self->{'value_type'}; +} + + +=head2 block_size + + Arg [1] : int + Description: getter/setter for attribute block_size + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub block_size{ + my $self = shift; + $self->{'block_size'} = shift if(@_); + return $self->{'block_size'}; +} + + +=head2 region_features + + Arg [1] : int $region_features + Example : The number of features per seq_region inside this density_type.. + Description: get/set for attribute region_features + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub region_features { + my $self = shift; + $self->{'region_features'} = shift if( @_ ); + return $self->{'region_features'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,400 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DnaDnaAlignFeature - Ensembl specific dna-dna pairwise +alignment feature + +=head1 SYNOPSIS + + See BaseAlignFeature + +=cut + + +package Bio::EnsEMBL::DnaDnaAlignFeature; + +use strict; + +use Bio::EnsEMBL::BaseAlignFeature; + +use vars qw(@ISA); +use Bio::SimpleAlign; +use Bio::LocatableSeq; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +@ISA = qw( Bio::EnsEMBL::BaseAlignFeature ); + + +=head2 new + + Arg [..] : List of named arguments. (-pair_dna_align_feature_id) defined + in this constructor, others defined in BaseFeaturePair and + SeqFeature superclasses. + Example : $daf = new DnaDnaAlignFeature(-cigar_string => '3M3I12M'); + Description: Creates a new DnaDnaAlignFeature using either a cigarstring or + a list of ungapped features. + Returntype : Bio::EnsEMBL::DnaDnaAlignFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($pair_dna_align_feature_id) = rearrange([qw(PAIR_DNA_ALIGN_FEATURE_ID)], @_); + if (defined $pair_dna_align_feature_id){ + $self->{'pair_dna_align_feature_id'} = $pair_dna_align_feature_id; + } + return $self; +} + + +=head2 pair_dna_align_feature_id + + Arg[1] : (optional) String $arg - value to set + Example : $self->pair_dna_align_feature_id($pair_feature_id); + Description: Getter/setter for attribute 'pair_dna_align_feature_id' + The id of the dna feature aligned + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub pair_dna_align_feature_id{ + my ($self, $arg) = @_; + if (defined $arg){ + $self->{pair_dna_align_feature_id} = $arg; + } + return $self->{pair_dna_align_feature_id}; +} + +=head2 _hit_unit + + Arg [1] : none + Description: PRIVATE implementation of abstract superclass method. Returns + 1 as the 'unit' used for the hit sequence. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::BaseAlignFeature + Status : Stable + +=cut + +sub _hit_unit { + return 1; +} + + + +=head2 _query_unit + + Arg [1] : none + Description: PRIVATE implementation of abstract superclass method Returns + 1 as the 'unit' used for the hit sequence. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::BaseAlignFeature + Status : Stable + +=cut + +sub _query_unit { + return 1; +} + +=head2 restrict_between_positions + + Arg [1] : int $start + Arg [2] : int $end + Arg [3] : string $flags + SEQ = $start and $end apply to the seq sequence + i.e. start and end methods + HSEQ = $start and $end apply to the hseq sequence + i.e. hstart and hend methods + Example : $daf->restrict_between_positions(150,543,"SEQ") + Description: Build a new DnaDnaAlignFeature object that fits within + the new specified coordinates and sequence reference, cutting + any pieces hanging upstream and downstream. + Returntype : Bio::EnsEMBL::DnaDnaAlignFeature object + Exceptions : + Caller : + Status : Stable + +=cut + +sub restrict_between_positions { + my ($self,$start,$end,$seqref) = @_; + + unless (defined $start && $start =~ /^\d+$/) { + $self->throw("The first argument is not defined or is not an integer"); + } + unless (defined $end && $end =~ /^\d+$/) { + $self->throw("The second argument is not defined or is not an integer"); + } + unless (defined $seqref && + ($seqref eq "SEQ" || $seqref eq "HSEQ")) { + $self->throw("The third argument is not defined or is not equal to 'SEQ' or 'HSEQ'"); + } + +# symbolic method references should be forbidden! +# need to be rewrite at some stage. + + my ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) = + qw(start end strand hstart hend hstrand); + + if ($seqref eq "HSEQ") { + ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) = + qw(hstart hend hstrand start end strand); + } + + my @restricted_features; + + foreach my $ungapped_feature ($self->ungapped_features) { + + if ($ungapped_feature->$start_method1() > $end || + $ungapped_feature->$end_method1() < $start) { + + next; + + } elsif ($ungapped_feature->$end_method1() <= $end && + $ungapped_feature->$start_method1() >= $start) { + + push @restricted_features, $ungapped_feature; + + } else { + + if ($ungapped_feature->$strand_method1() eq $ungapped_feature->$strand_method2()) { + + if ($ungapped_feature->$start_method1() < $start) { + + my $offset = $start - $ungapped_feature->$start_method1(); + $ungapped_feature->$start_method1($start); + $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset); + + } + if ($ungapped_feature->$end_method1() > $end) { + + my $offset = $ungapped_feature->$end_method1() - $end; + $ungapped_feature->$end_method1($end); + $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset); + + } + } else { + + if ($ungapped_feature->$start_method1() < $start) { + + my $offset = $start - $ungapped_feature->$start_method1(); + $ungapped_feature->$start_method1($start); + $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset); + + } + if ($ungapped_feature->$end_method1() > $end) { + + my $offset = $ungapped_feature->$end_method1() - $end; + $ungapped_feature->$end_method1($end); + $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset); + + } + } + + push @restricted_features, $ungapped_feature; + } + } + + if (scalar @restricted_features) { + my $DnaDnaAlignFeature = new Bio::EnsEMBL::DnaDnaAlignFeature('-features' =>\@restricted_features); + if (defined $self->slice) { + $DnaDnaAlignFeature->slice($self->slice); + } + if (defined $self->hslice) { + $DnaDnaAlignFeature->hslice($self->hslice); + } + return $DnaDnaAlignFeature; + } else { + return undef; + } +} + +=head2 alignment_strings + + Arg [1] : list of string $flags + FIX_SEQ = does not introduce gaps (dashes) in seq aligned sequence + and delete the corresponding insertions in hseq aligned sequence + FIX_HSEQ = does not introduce gaps (dashes) in hseq aligned sequence + and delete the corresponding insertions in seq aligned sequence + NO_SEQ = return the seq aligned sequence as an empty string + NO_HSEQ = return the hseq aligned sequence as an empty string + This 2 last flags would save a bit of time as doing so no querying to the core + database in done to get the sequence. + Example : $daf->alignment_strings or + $daf->alignment_strings("FIX_HSEQ") or + $daf->alignment_strings("NO_SEQ","FIX_SEQ") + Description: Allows to rebuild the alignment string of both the seq and hseq sequence + using the cigar_string information and the slice and hslice objects + Returntype : array reference containing 2 strings + the first corresponds to seq + the second corresponds to hseq + Exceptions : + Caller : + Status : Stable + +=cut + + +sub alignment_strings { + my ( $self, @flags ) = @_; + + # set the flags + my $seq_flag = 1; + my $hseq_flag = 1; + my $fix_seq_flag = 0; + my $fix_hseq_flag = 0; + + for my $flag ( @flags ) { + $seq_flag = 0 if ($flag eq "NO_SEQ"); + $hseq_flag = 0 if ($flag eq "NO_HSEQ"); + $fix_seq_flag = 1 if ($flag eq "FIX_SEQ"); + $fix_hseq_flag = 1 if ($flag eq "FIX_HSEQ"); + } + + my ($seq, $hseq); + $seq = $self->slice->subseq($self->start, $self->end, $self->strand) if ($seq_flag || $fix_seq_flag); + $hseq = $self->hslice->subseq($self->hstart, $self->hend, $self->hstrand) if ($hseq_flag || $fix_hseq_flag); + + my $rseq= ""; + # rseq - result sequence + my $rhseq= ""; + # rhseq - result hsequence + + my $seq_pos = 0; + my $hseq_pos = 0; + + my @cig = ( $self->cigar_string =~ /(\d*[DIM])/g ); + + for my $cigElem ( @cig ) { + my $cigType = substr( $cigElem, -1, 1 ); + my $cigCount = substr( $cigElem, 0 ,-1 ); + $cigCount = 1 unless $cigCount; + + if( $cigType eq "M" ) { + $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag); + $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag); + $seq_pos += $cigCount; + $hseq_pos += $cigCount; + } elsif( $cigType eq "D" ) { + if( ! $fix_seq_flag ) { + $rseq .= "-" x $cigCount if ($seq_flag); + $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag); + } + $hseq_pos += $cigCount; + } elsif( $cigType eq "I" ) { + if( ! $fix_hseq_flag ) { + $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag); + $rhseq .= "-" x $cigCount if ($hseq_flag); + } + $seq_pos += $cigCount; + } + } + return [ $rseq,$rhseq ]; +} + +=head2 get_SimpleAlign + + Arg [1] : list of string $flags + translated = by default, the sequence alignment will be on nucleotide. With translated flag + the aligned sequences are translated. + uc = by default aligned sequences are given in lower cases. With uc flag, the aligned + sequences are given in upper cases. + Example : $daf->get_SimpleAlign or + $daf->get_SimpleAlign("translated") or + $daf->get_SimpleAlign("translated","uc") + Description: Allows to rebuild the alignment string of both the seq and hseq sequence + using the cigar_string information and the slice and hslice objects + Returntype : a Bio::SimpleAlign object + Exceptions : + Caller : + Status : Stable + +=cut + +sub get_SimpleAlign { + my ( $self, @flags ) = @_; + + # setting the flags + my $uc = 0; + my $translated = 0; + + for my $flag ( @flags ) { + $uc = 1 if ($flag =~ /^uc$/i); + $translated = 1 if ($flag =~ /^translated$/i); + } + + my $sa = Bio::SimpleAlign->new(); + + #Hack to try to work with both bioperl 0.7 and 1.2: + #Check to see if the method is called 'addSeq' or 'add_seq' + my $bio07 = 0; + if(!$sa->can('add_seq')) { + $bio07 = 1; + } + + my ($sb_seq,$qy_seq) = @{$self->alignment_strings}; + + my $loc_sb_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $sb_seq : lc $sb_seq, + -START => $self->seq_region_start, + -END => $self->seq_region_end, + -ID => $self->seqname, + -STRAND => $self->strand); + + $loc_sb_seq->seq($uc ? uc $loc_sb_seq->translate->seq + : lc $loc_sb_seq->translate->seq) if ($translated); + + my $loc_qy_seq = Bio::LocatableSeq->new(-SEQ => $uc ? uc $qy_seq : lc $qy_seq, + -START => $self->hseq_region_start, + -END => $self->hseq_region_end, + -ID => $self->hseqname, + -STRAND => $self->hstrand); + + $loc_qy_seq->seq($uc ? uc $loc_qy_seq->translate->seq + : lc $loc_qy_seq->translate->seq) if ($translated); + + if($bio07) { + $sa->addSeq($loc_sb_seq); + $sa->addSeq($loc_qy_seq); + } else { + $sa->add_seq($loc_sb_seq); + $sa->add_seq($loc_qy_seq); + } + + return $sa; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/DnaPepAlignFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DnaPepAlignFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,110 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DnaPepAlignFeature - Ensembl specific dna-pep pairwise +alignment feature + +=head1 SYNOPSIS + + See BaseAlignFeature + +=cut + + +package Bio::EnsEMBL::DnaPepAlignFeature; + +use strict; + +use Bio::EnsEMBL::BaseAlignFeature; +use Scalar::Util qw(weaken isweak); + +use vars qw(@ISA); + +@ISA = qw( Bio::EnsEMBL::BaseAlignFeature ); + + +=head2 new_fast + + Arg [1] : hashref $hashref + A hashref which will be blessed into a PepDnaAlignFeature. + Example : none + Description: This allows for very fast object creation when a large number + of DnaPepAlignFeatures needs to be created. This is a bit of + a hack but necessary when thousands of features need to be + generated within a couple of seconds for web display. It is + not recommended that this method be called unless you know what + you are doing. It requires knowledge of the internals of this + class and its superclasses. + Returntype : Bio::EnsEMBL::DnaPepAlignFeature + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::ProteinAlignFeatureAdaptor + Status : Stable + +=cut + +sub new_fast { + my ($class, $hashref) = @_; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 _hit_unit + + Arg [1] : none + Description: PRIVATE implementation of abstract superclass method. Returns + 1 as the 'unit' used for the hit sequence. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::BaseAlignFeature + Status : Stable + + +=cut + +sub _hit_unit { + return 1; +} + + +=head2 _query_unit + + Arg [1] : none + Description: PRIVATE implementation of abstract superclass method. Returns + 3 as the 'unit' used for the query sequence. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::BaseAlignFeature + Status : Stable + + +=cut + +sub _query_unit { + return 3; +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Exon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Exon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1661 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Exon - A class representing an Exon + +=head1 SYNOPSIS + + $ex = new Bio::EnsEMBL::Exon( + -START => 100, + -END => 200, + -STRAND => 1, + -SLICE => $slice, + -DBID => $dbID, + -ANALYSIS => $analysis, + -STABLE_ID => 'ENSE000000123', + -VERSION => 2 + ); + + # seq() returns a Bio::Seq + my $seq = $exon->seq->seq(); + + # Peptide only makes sense within transcript context + my $pep = $exon->peptide($transcript)->seq(); + + # Normal feature operations can be performed: + $exon = $exon->transform('clone'); + $exon->move( $new_start, $new_end, $new_strand ); + print $exon->slice->seq_region_name(); + +=head1 DESCRIPTION + +This is a class which represents an exon which is part of a transcript. +See Bio::EnsEMBL:Transcript + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Exon; + +use strict; + +use Bio::EnsEMBL::Feature; +use Bio::Seq; # exons have to have sequences... + +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); +use Bio::EnsEMBL::DBSQL::SupportingFeatureAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [-SLICE]: Bio::EnsEMBL::SLice - Represents the sequence that this + feature is on. The coordinates of the created feature are + relative to the start of the slice. + Arg [-START]: The start coordinate of this feature relative to the start + of the slice it is sitting on. Coordinates start at 1 and + are inclusive. + Arg [-END] : The end coordinate of this feature relative to the start of + the slice it is sitting on. Coordinates start at 1 and are + inclusive. + Arg [-STRAND]: The orientation of this feature. Valid values are 1,-1,0. + Arg [-SEQNAME] : (optional) A seqname to be used instead of the default name + of the of the slice. Useful for features that do not have an + attached slice such as protein features. + Arg [-dbID] : (optional) internal database id + Arg [-ADAPTOR]: (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor + Arg [-PHASE] : the phase. + Arg [-END_PHASE]: the end phase + Arg [-STABLE_ID]: (optional) the stable id of the exon + Arg [-VERSION] : (optional) the version + Arg [-CREATED_DATE] : (optional) the created date + Arg [-MODIFIED_DATE]: (optional) the last midifeid date + + Example : none + Description: create an Exon object + Returntype : Bio::EnsEMBL::Exon + Exceptions : if phase is not valid (i.e. 0,1, 2 -1) + Caller : general + Status : Stable + +=cut + +sub new { + my $class = shift; + + $class = ref $class || $class; + + my $self = $class->SUPER::new( @_ ); + + my ( $phase, $end_phase, $stable_id, $version, $created_date, + $modified_date, $is_current, $is_constitutive ) + = rearrange( [ + "PHASE", "END_PHASE", + "STABLE_ID", "VERSION", + "CREATED_DATE", "MODIFIED_DATE", + "IS_CURRENT", "IS_CONSTITUTIVE" + ], + @_ + ); + + if ( defined($phase) ) { # make sure phase is valid. + $self->phase($phase); + } + + $self->{'end_phase'} = $end_phase; + $self->{'stable_id'} = $stable_id; + $self->{'version'} = $version; + $self->{'created_date'} = $created_date; + $self->{'modified_date'} = $modified_date; + + # Default is_current + if ( !defined($is_current) ) { $is_current = 1 } + $self->{'is_current'} = $is_current; + + # Default is_constitutive + if ( !defined($is_constitutive) ) { $is_constitutive = 0 } + $self->{'is_constitutive'} = $is_constitutive; + + return $self; +} + + +# =head2 new_fast + +# Arg [1] : Bio::EnsEMBL::Slice $slice +# Arg [2] : int $start +# Arg [3] : int $end +# Arg [4] : int $strand (1 or -1) +# Example : none +# Description: create an Exon object +# Returntype : Bio::EnsEMBL::Exon +# Exceptions : throws if end < start +# Caller : general +# Status : Stable + +# =cut + +# sub new_fast { +# my ($class, $slice, $start, $end, $strand) = @_; + +# my $self = bless {}, $class; + +# # Swap start and end if they're in the wrong order +# # We assume that the strand is correct and keep the input value. + +# if ($start > $end) { +# throw( "End smaller than start not allowed" ); +# } + +# $self->start ($start); +# $self->end ($end); +# $self->strand($strand); +# $self->slice($slice); + +# return $self; +# } + + +=head2 end_phase + + Arg [1] : (optional) int $end_phase + Example : $end_phase = $feat->end_phase; + Description: Gets/Sets the end phase of the exon. + end_phase = number of bases from the last incomplete codon of + this exon. + Usually, end_phase = (phase + exon_length)%3 + but end_phase could be -1 if the exon is half-coding and its 3 + prime end is UTR. + Returntype : int + Exceptions : warning if end_phase is called without an argument and the + value is not set. + Caller : general + Status : Stable + +=cut + +sub end_phase { + my $self = shift; + if (@_) { + $self->{'end_phase'} = shift; + } + else { + if ( !defined( $self->{'end_phase'} ) ) { + warning("No end phase set in Exon. You must set it explicitly."); + } + } + return $self->{'end_phase'}; +} + + +=head2 phase + + Arg [1] : (optional) int $phase + Example : my $phase = $exon->phase; + $exon->phase(2); + Description: Gets/Sets the phase of the exon. + Returntype : int + Exceptions : throws if phase is not (0, 1 2 or -1). + Caller : general + Status : Stable + + +Get or set the phase of the Exon, which tells the +translation machinery, which makes a peptide from +the DNA, where to start. + +The Ensembl phase convention can be thought of as +"the number of bases of the first codon which are +on the previous exon". It is therefore 0, 1 or 2 +(or -1 if the exon is non-coding). In ascii art, +with alternate codons represented by B<###> and +B<+++>: + + Previous Exon Intron This Exon + ...------------- -------------... + + 5' Phase 3' + ...#+++###+++### 0 +++###+++###+... + ...+++###+++###+ 1 ++###+++###++... + ...++###+++###++ 2 +###+++###+++... + +Here is another explanation from Ewan: + +Phase means the place where the intron lands +inside the codon - 0 between codons, 1 between +the 1st and second base, 2 between the second and +3rd base. Exons therefore have a start phase and +a end phase, but introns have just one phase. + +=cut + +sub phase { + my ($self,$value) = @_; + + if (defined($value)) { + # Value must be 0,1,2, or -1 for non-coding + if ($value =~ /^(-1|0|1|2)$/) { + #print STDERR "Setting phase to $value\n"; + $self->{'phase'} = $value; + } else { + throw("Bad value ($value) for exon phase. Should only be" . + " -1,0,1,2\n"); + } + } + return $self->{'phase'}; +} + + +=head2 frame + + Arg [1] : none + Example : $frame = $exon->frame + Description: Gets the frame of this exon + Returntype : int + Exceptions : thrown if an arg is passed + thrown if frame cannot be calculated due to a bad phase value + Caller : general + Status : Stable + +=cut + +sub frame { + my ($self,$value) = @_; + + if( defined $value ) { + throw("Cannot set frame. Deduced from seq_start and phase"); + } + + # frame is mod 3 of the translation point + + if( $self->phase == -1 ) { + return '.'; # gff convention for no frame info + } + if( $self->phase == 0 ) { + return $self->start%3; + } + + if( $self->phase == 1 ) { + return ($self->start+2)%3; + } + + if( $self->phase == 2 ) { + return ($self->start+1)%3; + } + + throw("bad phase in exon ".$self->phase); + +} + + +=head2 start + + Arg [1] : int $start (optional) + Example : $start = $exon->start(); + Description: Getter/Setter for the start of this exon. The superclass + implmentation is overridden to flush the internal sequence + cache if this value is altered + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub start { + my $self = shift; + # if an arg was provided, flush the internal sequence cache + delete $self->{'_seq_cache'} if(@_); + return $self->SUPER::start(@_); +} + + +=head2 end + + Arg [1] : int $end (optional) + Example : $end = $exon->end(); + Description: Getter/Setter for the end of this exon. The superclass + implmentation is overridden to flush the internal sequence + cache if this value is altered + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub end { + my $self = shift; + # if an arg was provided, flush the internal sequence cache + delete $self->{'_seq_cache'} if(@_); + return $self->SUPER::end(@_); +} + + +=head2 strand + + Arg [1] : int $strand (optional) + Example : $start = $exon->strand(); + Description: Getter/Setter for the strand of this exon. The superclass + implmentation is overridden to flush the internal sequence + cache if this value is altered + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub strand { + my $self = shift; + # if an arg was provided, flush the internal sequence cache + delete $self->{'_seq_cache'} if(@_); + return $self->SUPER::strand(@_); +} + +=head2 cdna_start + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which cDNA coordinates should be + relative to. + Example : $cdna_start = $exon->cdna_start($transcript); + Description : Returns the start position of the exon in cDNA + coordinates. + Since an exon may be part of one or more transcripts, + the relevant transcript must be given as argument to + this method. + Return type : Integer + Exceptions : Throws if the given argument is not a transcript. + Throws if the first part of the exon maps into a gap. + Throws if the exon can not be mapped at all. + Caller : General + Status : Stable + +=cut + +sub cdna_start { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + my $id = $transcript->dbID(); + + if(defined $id && exists $self->{cdna_start}->{$id}) { + return $self->{cdna_start}->{$id}; + } + + my $cdna_start; + my @coords = $transcript->genomic2cdna($self->start(), $self->end(), $self->strand()); + if(@coords && !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + $cdna_start = $coords[0]->start(); + } + elsif(@coords) { + throw "First part of exon maps into gap"; + } + else { + throw "Can not map exon"; + } + + if(defined $id) { + $self->{cdna_start}->{$id} = $cdna_start; + } + + return $cdna_start; +} ## end sub cdna_start + +=head2 cdna_end + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which cDNA coordinates should be + relative to. + Example : $cdna_end = $exon->cdna_end($transcript); + Description : Returns the end position of the exon in cDNA + coordinates. + Since an exon may be part of one or more transcripts, + the relevant transcript must be given as argument to + this method. + Return type : Integer + Exceptions : Throws if the given argument is not a transcript. + Throws if the last part of the exon maps into a gap. + Throws if the exon can not be mapped at all. + Caller : General + Status : Stable + +=cut + +sub cdna_end { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + my $id = $transcript->dbID(); + + if(defined $id && exists $self->{cdna_end}->{$id}) { + return $self->{cdna_end}->{$id}; + } + + my $cdna_end; + my @coords = $transcript->genomic2cdna($self->start(), $self->end(), $self->strand()); + if(@coords && !$coords[-1]->isa('Bio::EnsEMBL::Mapper::Gap')) { + $cdna_end = $coords[-1]->end(); + } + elsif(@coords) { + throw "Last part of exon maps into gap"; + } + else { + throw "Can not map exon"; + } + + if(defined $id) { + $self->{cdna_end}->{$id} = $cdna_end; + } + + return $cdna_end; +} ## end sub cdna_end + +=head2 cdna_coding_start + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which cDNA coordinates should be + relative to. + Example : $cdna_coding_start = $exon->cdna_coding_start($transcript); + Description : Returns the start position of the coding region of the + exon in cDNA coordinates. Returns undef if the whole + exon is non-coding. + Since an exon may be part of one or more transcripts, + the relevant transcript must be given as argument to + this method. + Return type : Integer or undef + Exceptions : Throws if the given argument is not a transcript. + Caller : General + Status : Stable + +=cut + +sub cdna_coding_start { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + my $id = $transcript->dbID(); + + if(defined $id && exists $self->{cdna_coding_start}->{$id}) { + return $self->{cdna_coding_start}->{$id}; + } + + my $cdna_coding_start; + my $transcript_coding_start = $transcript->cdna_coding_start(); + if(defined $transcript_coding_start) { + my $cdna_start = $self->cdna_start($transcript); + + if ( $transcript_coding_start < $cdna_start ) { + # Coding region starts upstream of this exon... + + if ( $transcript->cdna_coding_end() < $cdna_start ) { + # ... and also ends upstream of this exon. + $cdna_coding_start = undef; + } + else { + # ... and does not end upstream of this exon. + $cdna_coding_start = $cdna_start; + } + } else { + # Coding region starts either within or downstream of this + # exon. + + if ( $transcript_coding_start <= $self->cdna_end($transcript) ) { + # Coding region starts within this exon. + $cdna_coding_start = $transcript_coding_start; + } + else { + # Coding region starts downstream of this exon. + $cdna_coding_start = undef; + } + } + } + else { + $cdna_coding_start = undef; + } + + if(defined $id) { + $self->{cdna_coding_start}->{$id} = $cdna_coding_start; + $self->{cdna_coding_end}->{$id} = undef if ! defined $cdna_coding_start; + } + + return $cdna_coding_start; +} ## end sub cdna_coding_start + +=head2 cdna_coding_end + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which cDNA coordinates should be + relative to. + Example : $cdna_coding_end = $exon->cdna_coding_end($transcript); + Description : Returns the end position of the coding region of the + exon in cDNA coordinates. Returns undef if the whole + exon is non-coding. + Since an exon may be part of one or more transcripts, + the relevant transcript must be given as argument to + this method. + Return type : Integer or undef + Exceptions : Throws if the given argument is not a transcript. + Caller : General + Status : Stable + +=cut + +sub cdna_coding_end { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + my $id = $transcript->dbID(); + + if(defined $id && exists $self->{cdna_coding_end}->{$id}) { + return $self->{cdna_coding_end}->{$id}; + } + + my $cdna_coding_end; + my $transcript_coding_end = $transcript->cdna_coding_end(); + if(defined $transcript_coding_end) { + my $cdna_end = $self->cdna_end($transcript); + + if ( $transcript_coding_end > $cdna_end ) { + + # Coding region ends downstream of this exon... + if ( $transcript->cdna_coding_start() > $cdna_end ) { + # ... and also starts downstream of this exon. + $cdna_coding_end = undef; + } + else { + # ... and does not start downstream of this exon. + $cdna_coding_end = $cdna_end; + } + } + else { + # Coding region ends either within or upstream of this + # exon. + + if ( $transcript_coding_end >= $self->cdna_start($transcript) ) { + # Coding region ends within this exon. + $cdna_coding_end = $transcript_coding_end; + } + else { + # Coding region ends upstream of this exon. + $cdna_coding_end = undef; + } + } + } + else { + $cdna_coding_end = undef; + } + + if(defined $id) { + $self->{cdna_coding_end}->{$id} = $cdna_coding_end; + $self->{cdna_coding_start}->{$id} = undef if ! defined $cdna_coding_end; + } + + return $cdna_coding_end; +} ## end sub cdna_coding_end + +=head2 coding_region_start + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Example : $coding_region_start = + $exon->coding_region_start($transcript); + Description : Returns the start position of the coding region + of the exon in slice-relative coordinates on the + forward strand. Returns undef if the whole exon is + non-coding. + Since an exon may be part of one or more transcripts, + the relevant transcript must be given as argument to + this method. + Return type : Integer or undef + Exceptions : Throws if the given argument is not a transcript. + Caller : General + Status : Stable + +=cut + +# The implementation of this method is analogous to the implementation +# of cdna_coding_start(). + +sub coding_region_start { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + my $id = $transcript->dbID(); + + if(defined $id && exists $self->{coding_region_start}->{$id}) { + return $self->{coding_region_start}->{$id}; + } + + my $coding_region_start; + my $transcript_coding_start = $transcript->coding_region_start(); + if(defined $transcript_coding_start) { + my $start = $self->start(); + + if ( $transcript_coding_start < $start ) { + # Coding region starts upstream of this exon... + + if ( $transcript->coding_region_end() < $start ) { + # ... and also ends upstream of this exon. + $coding_region_start = undef; + } + else { + # ... and does not end upstream of this exon. + $coding_region_start = $start; + } + } + else { + # Coding region starts either within or downstream of this + # exon. + + if ( $transcript_coding_start <= $self->end() ) { + # Coding region starts within this exon. + $coding_region_start = $transcript_coding_start; + } + else { + # Coding region starts downstream of this exon. + $coding_region_start = undef; + } + } + } + else { + $coding_region_start = undef; + } + + if(defined $id) { + $self->{coding_region_start}->{$id} = $coding_region_start; + $self->{coding_region_end}->{$id} = undef if ! defined $coding_region_start; + } + + return $coding_region_start; +} ## end sub coding_region_start + +=head2 coding_region_end + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Example : $coding_region_end = + $exon->coding_region_end($transcript); + Description : Returns the end position of the coding region of + the exon in slice-relative coordinates on the + forward strand. Returns undef if the whole exon is + non-coding. + Since an exon may be part of one or more transcripts, + the relevant transcript must be given as argument to + this method. + Return type : Integer or undef + Exceptions : Throws if the given argument is not a transcript. + Caller : General + Status : Stable + +=cut + +# The implementation of this method is analogous to the implementation +# of cdna_coding_end(). + +sub coding_region_end { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + + my $id = $transcript->dbID(); + + if(defined $id && exists $self->{coding_region_end}->{$id}) { + return $self->{coding_region_end}->{$id}; + } + + my $coding_region_end; + my $transcript_coding_end = $transcript->coding_region_end(); + if(defined $transcript_coding_end) { + + my $end = $self->end(); + if($transcript_coding_end > $end) { + # Coding region ends downstream of this exon... + + if ( $transcript->coding_region_start() > $end ) { + # ... and also starts downstream of this exon. + $coding_region_end = undef; + } + else { + # ... and does not start downstream of this exon. + $coding_region_end = $end; + } + } + else { + # Coding region ends either within or upstream of this + # exon. + if ( $transcript_coding_end >= $self->start() ) { + $coding_region_end = $transcript_coding_end; + } + else { + $coding_region_end = undef; + } + } + } + else { + # This is a non-coding transcript. + $coding_region_end = undef; + } + + if(defined $id) { + $self->{coding_region_end}->{$id} = $coding_region_end; + $self->{coding_region_start}->{$id} = undef if ! defined $coding_region_end; + } + + return $coding_region_end; +} ## end sub coding_region_end + +=head2 slice + + Arg [1] : Bio::EnsEMBL::Slice + Example : $slice = $exon->slice(); + Description: Getter/Setter for the slice this exon is on. The superclass + implmentation is overridden to flush the internal sequence + cache if this value is altered + Returntype : Bio::EnsEMBL::Slice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub slice { + my ( $self, $slice ) = @_; + + if ( defined($slice) ) { + # If a new slice was provided, flush the internal sequence cache and + # transfer all supporting evidence to the new slice. + + delete $self->{'_seq_cache'}; + + if ( exists( $self->{'_supporting_evidence'} ) ) { + my @new_features; + + for my $old_feature ( @{ $self->{'_supporting_evidence'} } ) { + + my $new_feature; + + if ( defined( $old_feature->slice() ) ) { + $new_feature = $old_feature->transfer($slice); + } else { + # If the old feature does not have a slice, assume transfer is + # not necessary. + $new_feature = $old_feature; + } + + push( @new_features, $new_feature ); + } + + $self->{'_supporting_evidence'} = \@new_features; + } + + return $self->SUPER::slice($slice); + } elsif ( @_ > 1 ) { + return $self->SUPER::slice(undef); + } else { + return $self->SUPER::slice(); + } +} ## end sub slice + +=head2 equals + + Arg [1] : Bio::EnsEMBL::Exon exon + Example : if ($exonA->equals($exonB)) { ... } + Description : Compares two exons for equality. + The test for eqality goes through the following list + and terminates at the first true match: + + 1. If Bio::EnsEMBL::Feature::equals() returns false, + then the exons are *not* equal. + 2. If both exons have stable IDs: if these are the + same, the exons are equal, otherwise not. + 3. If the exons have the same start, end, strand, and + phase, then they are equal, otherwise not. + + Return type : Boolean (0, 1) + + Exceptions : Thrown if a non-transcript is passed as the argument. + +=cut + +sub equals { + my ( $self, $exon ) = @_; + + if ( !defined($exon) ) { return 0 } + if ( $self eq $exon ) { return 1 } + + assert_ref( $exon, 'Bio::EnsEMBL::Exon' ); + + my $feature_equals = $self->SUPER::equals($exon); + if ( defined($feature_equals) && $feature_equals == 0 ) { + return 0; + } + + if ( defined( $self->stable_id() ) && defined( $exon->stable_id() ) ) + { + if ( $self->stable_id() eq $exon->stable_id() ) { + return 1; + } + else { + return 0; + } + } + + if ( $self->start() == $exon->start() && + $self->end() == $exon->end() && + $self->strand() == $exon->strand() && + $self->phase() == $exon->phase() && + $self->end_phase() == $exon->end_phase() ) + { + return 1; + } + + return 0; +} ## end sub equals + +=head2 move + + Arg [1] : int start + Arg [2] : int end + Arg [3] : (optional) int strand + Example : None + Description: Sets the start, end and strand in one call rather than in + 3 seperate calls to the start(), end() and strand() methods. + This is for convenience and for speed when this needs to be + done within a tight loop. This overrides the superclass + move() method so that the internal sequence cache can be + flushed if the exon if moved. + Returntype : none + Exceptions : Thrown is invalid arguments are provided + Caller : general + Status : Stable + +=cut + +sub move { + my $self = shift; + # flush the internal sequence cache + delete $self->{'_seq_cache'}; + return $self->SUPER::move(@_); +} + + +=head2 transform + + Arg 1 : String $coordinate_system_name + Arg [2] : String $coordinate_system_version + Description: moves this exon to the given coordinate system. If this exon has + attached supporting evidence, they move as well. + Returntype : Bio::EnsEMBL::Exon + Exceptions : wrong parameters + Caller : general + Status : Stable + +=cut + +sub transform { + my $self = shift; + + # catch for old style transform calls + if( !@_ || ( ref $_[0] && + ($_[0]->isa( "Bio::EnsEMBL::Slice" ) or $_[0]->isa( "Bio::EnsEMBL::LRGSlice" )) + )) { + deprecate('Calling transform without a coord system name is deprecated.'); + return $self->_deprecated_transform(@_); + } + + my $new_exon = $self->SUPER::transform( @_ ); + if (not defined $new_exon or + $new_exon->length != $self->length) { + return undef; + } + + if( exists $self->{'_supporting_evidence'} ) { + my @new_features; + for my $old_feature ( @{$self->{'_supporting_evidence'}} ) { + my $new_feature = $old_feature->transform( @_ ); + if (defined $new_feature) { + push( @new_features, $new_feature ); + } + } + $new_exon->{'_supporting_evidence'} = \@new_features; + } + + #dont want to share the same sequence cache + delete $new_exon->{'_seq_cache'}; + + return $new_exon; +} + + +=head2 transfer + + Arg [1] : Bio::EnsEMBL::Slice $destination_slice + Example : none + Description: Moves this Exon to given target slice coordinates. If Features + are attached they are moved as well. Returns a new exon. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub transfer { + my $self = shift; + + my $new_exon = $self->SUPER::transfer( @_ ); + return undef unless $new_exon; + + if( exists $self->{'_supporting_evidence'} ) { + my @new_features; + for my $old_feature ( @{$self->{'_supporting_evidence'}} ) { + my $new_feature = $old_feature->transfer( @_ ); + push( @new_features, $new_feature ); + } + $new_exon->{'_supporting_evidence'} = \@new_features; + } + + #dont want to share the same sequence cache + delete $new_exon->{'_seq_cache'}; + + return $new_exon; +} + + +=head2 add_supporting_features + + Arg [1] : Bio::EnsEMBL::Feature $feature + Example : $exon->add_supporting_features(@features); + Description: Adds a list of supporting features to this exon. + Duplicate features are not added. + If supporting features are added manually in this + way, prior to calling get_all_supporting_features then the + get_all_supporting_features call will not retrieve supporting + features from the database. + Returntype : none + Exceptions : throw if any of the features are not Feature + throw if any of the features are not in the same coordinate + system as the exon + Caller : general + Status : Stable + +=cut + +sub add_supporting_features { + my ($self,@features) = @_; + + return unless @features; + + $self->{_supporting_evidence} ||= []; + + # check whether this feature object has been added already + FEATURE: foreach my $feature (@features) { + unless($feature && $feature->isa("Bio::EnsEMBL::Feature")) { + throw("Supporting feat [$feature] not a " . + "Bio::EnsEMBL::Feature"); + } + + if ((defined $self->slice() && defined $feature->slice())&& + ( $self->slice()->name() ne $feature->slice()->name())){ + throw("Supporting feat not in same coord system as exon\n" . + "exon is attached to [".$self->slice()->name()."]\n" . + "feat is attached to [".$feature->slice()->name()."]"); + } + + foreach my $added_feature ( @{ $self->{_supporting_evidence} } ){ + # compare objects + if ( $feature == $added_feature ){ + # this feature has already been added + next FEATURE; + } + } + + # no duplicate was found, add the feature + push(@{$self->{_supporting_evidence}},$feature); + } +} + + +=head2 flush_supporting_features + + Example : $exon->flush_supporting_features; + Description : Removes all supporting evidence from the exon. + Return type : (Empty) listref + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flush_supporting_features { + my $self = shift; + $self->{'_supporting_evidence'} = []; +} + + +=head2 get_all_supporting_features + + Arg [1] : none + Example : @evidence = @{$exon->get_all_supporting_features()}; + Description: Retreives any supporting features added manually by + calls to add_supporting_features. If no features have been + added manually and this exon is in a database (i.e. it h + Returntype : listreference of Bio::EnsEMBL::BaseAlignFeature objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_supporting_features { + my $self = shift; + + if( !exists $self->{_supporting_evidence} ) { + if($self->adaptor) { + my $sfa = $self->adaptor->db->get_SupportingFeatureAdaptor(); + $self->{_supporting_evidence} = $sfa->fetch_all_by_Exon($self); + } + } + + return $self->{_supporting_evidence} || []; +} + + +=head2 find_supporting_evidence + +# This method is only for genebuild backwards compatibility. +# Avoid using it if possible + + Arg [1] : Bio::EnsEMBL::Feature $features + The list of features to search for supporting (i.e. overlapping) + evidence. + Arg [2] : (optional) boolean $sorted + Used to speed up the calculation of overlapping features. + Should be set to true if the list of features is sorted in + ascending order on their start coordinates. + Example : $exon->find_supporting_evidence(\@features); + Description: Looks through all the similarity features and + stores as supporting features any feature + that overlaps with an exon. + Returntype : none + Exceptions : none + Caller : general + Status : Medium Risk + +=cut + +sub find_supporting_evidence { + my ($self,$features,$sorted) = @_; + + foreach my $f (@$features) { + # return if we have a sorted feature array + if ($sorted == 1 && $f->start > $self->end) { + return; + } + if ($f->sub_SeqFeature) { + my @subf = $f->sub_SeqFeature; + + $self->find_supporting_evidence(\@subf); + } + else { + if ($f->entire_seq()->name eq $self->slice()->name) { + if ($f->end >= $self->start && $f->start <= $self->end && $f->strand == $self->strand) { + $self->add_supporting_features($f); + } + } + } + } +} + + +=head2 stable_id + + Arg [1] : string $stable_id + Example : none + Description: get/set for attribute stable_id + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if( @_ ); + return $self->{'stable_id'}; +} + + +=head2 created_date + + Arg [1] : string $created_date + Example : none + Description: get/set for attribute created_date + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created_date { + my $self = shift; + $self->{'created_date'} = shift if ( @_ ); + return $self->{'created_date'}; +} + + +=head2 modified_date + + Arg [1] : string $modified_date + Example : none + Description: get/set for attribute modified_date + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub modified_date { + my $self = shift; + $self->{'modified_date'} = shift if ( @_ ); + return $self->{'modified_date'}; +} + + +=head2 version + + Arg [1] : string $version + Example : none + Description: get/set for attribute version + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub version { + my $self = shift; + $self->{'version'} = shift if( @_ ); + return $self->{'version'}; +} + + +=head2 is_current + + Arg [1] : Boolean $is_current + Example : $exon->is_current(1) + Description: Getter/setter for is_current state of this exon. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_current { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'is_current'} = $value; + } + return $self->{'is_current'}; +} + +=head2 is_constitutive + + Arg [1] : Boolean $is_constitutive + Example : $exon->is_constitutive(0) + Description: Getter/setter for is_constitutive state of this exon. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_constitutive { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'is_constitutive'} = $value; + } + return $self->{'is_constitutive'}; +} + + +=head2 adjust_start_end + + Arg 1 : int $start_adjustment + Arg 2 : int $end_adjustment + Example : none + Description: returns a new Exon with this much shifted coordinates + Returntype : Bio::EnsEMBL::Exon + Exceptions : none + Caller : Transcript->get_all_translateable_Exons() + Status : Stable + +=cut + +sub adjust_start_end { + my ( $self, $start_adjust, $end_adjust ) = @_; + + my $new_exon = Bio::EnsEMBL::Exon->new(); + %{$new_exon} = %{$self}; + + #invalidate the sequence cache + delete $new_exon->{'_seq_cache'}; + + if( $self->strand() == 1 ) { + $new_exon->start( $self->start() + $start_adjust ); + $new_exon->end( $self->end() + $end_adjust ) + } else { + $new_exon->start( $self->start() - $end_adjust ); + $new_exon->end( $self->end() - $start_adjust ) + } + + return $new_exon; +} + + +=head2 peptide + + Arg [1] : Bio::EnsEMBL::Transcript $tr + Example : my $pep_str = $exon->peptide($transcript)->seq; + Description: Retrieves the portion of the transcripts peptide + encoded by this exon. The transcript argument is necessary + because outside of the context of a transcript it is not + possible to correctly determine the translation. Note that + an entire amino acid will be present at the exon boundaries + even if only a partial codon is present. Therefore the + concatenation of all of the peptides of a transcripts exons + is not the same as a transcripts translation because the + summation may contain duplicated amino acids at splice sites. + In the case that this exon is entirely UTR, a Bio::Seq object + with an empty sequence string is returned. + Returntype : Bio::Seq + Exceptions : thrown if transcript argument is not provided + Caller : general + Status : Stable + +=cut + +sub peptide { + my $self = shift; + my $tr = shift; + + unless($tr && ref($tr) && $tr->isa('Bio::EnsEMBL::Transcript')) { + throw("transcript arg must be Bio::EnsEMBL:::Transcript not [$tr]"); + } + + #convert exons coordinates to peptide coordinates + my $tmp_exon = $self->transfer($tr->slice); + if (!$tmp_exon) { + throw("Couldn't transfer exon to transcript's slice"); + } + + my @coords = + $tr->genomic2pep($tmp_exon->start, $tmp_exon->end, $tmp_exon->strand); + + #filter out gaps + @coords = grep {$_->isa('Bio::EnsEMBL::Mapper::Coordinate')} @coords; + + #if this is UTR then the peptide will be empty string + my $pep_str = ''; + + + if(scalar(@coords) > 1) { + my $coord = $self->_merge_ajoining_coords(\@coords); + if($coord) { + @coords = ($coord); + } + else { + my ($e_id, $tr_id) = ($self->stable_id(), $tr->stable_id()); + throw("Error. Exon maps to multiple locations in peptide and those". + " locations are not continuous." . + " Is this exon [$e_id] a member of this transcript [$tr_id]?"); + } + } + elsif(scalar(@coords) == 1) { + my $c = $coords[0]; + my $pep = $tr->translate; + + #bioperl doesn't give back residues for incomplete codons + #make sure we don't subseq too far... + my ($start, $end); + $end = ($c->end > $pep->length) ? $pep->length : $c->end; + $start = ($c->start < $end) ? $c->start : $end; + $pep_str = $tr->translate->subseq($start, $end); + } + + return + Bio::Seq->new( -seq => $pep_str, + -moltype => 'protein', + -alphabet => 'protein', + -id => $self->display_id ); +} + +=head2 _merge_ajoining_coords + + Arg [1] : ArrayRef of Bio::EnsEMBL::Mapper::Coordinate objects + Example : + Description : Merges coords which are ajoining or overlapping + Returntype : Bio::EnsEMBL::Mapper::Coordinate or undef if it cannot happen + Exceptions : Exception if the cooords cannot be condensed into one location + Caller : internal + Status : Development + +=cut + +sub _merge_ajoining_coords { + my ($self, $coords) = @_; + + my $okay = 1; + my $coord = shift @{$coords}; + my $start = $coord->start(); + my $last_end = $coord->end(); + foreach my $other_coord (@{$coords}) { + if( ($last_end + 1) >= $other_coord->start() ) { + $last_end = $other_coord->end(); + } + else { + $okay = 0; + last; + } + } + + if(!$okay) { + return; + } + + my $new_coord = Bio::EnsEMBL::Mapper::Coordinate->new( + $coord->id(), $start, $last_end, $coord->strand(), $coord->rank()); + return $new_coord; +} + + + + +=head2 seq + + Arg [1] : none + Example : my $seq_str = $exon->seq->seq; + Description: Retrieves the dna sequence of this Exon. + Returned in a Bio::Seq object. Note that the sequence may + include UTRs (or even be entirely UTR). + Returntype : Bio::Seq or undef + Exceptions : warning if argument passed, + warning if exon does not have attatched slice + warning if exon strand is not defined (or 0) + Caller : general + Status : Stable + +=cut + +sub seq { + my ( $self, $arg ) = @_; + + if ( defined $arg ) { + warning("seq setting on Exon not supported currently"); + $self->{'_seq_cache'} = $arg->seq(); + } + + if ( !defined( $self->{'_seq_cache'} ) ) { + my $seq; + + if ( !defined $self->slice() ) { + warning("Cannot retrieve seq for exon without slice\n"); + return undef; + } + + if ( !$self->strand() ) { + warning("Cannot retrieve seq for unstranded exon\n"); + return undef; + } + + if ($self->slice->is_circular() ) { + if ( $self->slice->start > $self->slice->end) { +# Normally exons overlapping chromosome origin will have negative feature start, but slice will be from 1 .. length +# But in case you got an exon attached to a sub slice try this + my $mid_point = $self->slice()->seq_region_length() - $self->slice()->start() + 1; + my $seq1 = $self->slice()->subseq( $self->start(), $mid_point, $self->strand() ); + + my $seq2 = $self->slice()->subseq( $mid_point + 1, $self->end(), $self->strand() ); + + $seq = $self->strand() > 0 ? "$seq1$seq2" : "$seq2$seq1"; + } elsif ( $self->start < 0 || $self->start > $self->end) { +# Normally exons overlapping chromosome origin will be 0 based, and can have negative start +# But if you go via sub_Slice it gives you chromosome based coordinates, i.e it will have start greater then end + my $start_point = $self->slice->seq_region_length + $self->slice->start; + my $mid_point = $self->slice->seq_region_length; + my $seq1 = $self->slice->subseq( $self->start, $mid_point, $self->strand); + my $seq2 = $self->slice->subseq(1, $self->end, $self->strand ); + $seq = $self->strand > 0 ? "$seq1$seq2" : "$seq2$seq1"; + } else { +# End this is the case for genes not overlapping the origin + $seq = $self->slice()->subseq( $self->start(), $self->end(), $self->strand() ); + } + } else { + $seq = $self->slice()->subseq( $self->start(), $self->end(), $self->strand() ); + } + + $self->{'_seq_cache'} = $seq; + } ## end if ( !defined( $self->...)) + + return + Bio::Seq->new( -seq => $self->{'_seq_cache'}, + -id => $self->display_id, + -moltype => 'dna', + -alphabet => 'dna' ); +} ## end sub seq + + +=head2 hashkey + + Arg [1] : none + Example : if(exists $hash{$exon->hashkey}) { do_something(); } + Description: Returns a unique hashkey that can be used to uniquely identify + this exon. Exons are considered to be identical if they share + the same seq_region, start, end, strand, phase, end_phase. + Note that this will consider two exons on different slices + to be different, even if they actually are not. + Returntype : string formatted as slice_name-start-end-strand-phase-end_phase + Exceptions : thrown if not all the necessary attributes needed to generate + a unique hash value are set + set + Caller : general + Status : Stable + +=cut + +sub hashkey { + my $self = shift; + + my $slice = $self->{'slice'}; + my $slice_name = ($slice) ? $slice->name() : undef; + my $start = $self->{'start'}; + my $end = $self->{'end'}; + my $strand = $self->{'strand'}; + my $phase = $self->{'phase'}; + my $end_phase = $self->{'end_phase'}; + + if(!defined($slice_name)) { + throw('Slice must be set to generate correct hashkey.'); + } + + if(!defined($start)) { + warning("start attribute must be defined to generate correct hashkey."); + } + + if(!defined($end)) { + throw("end attribute must be defined to generate correct hashkey."); + } + + if(!defined($strand)) { + throw("strand attribute must be defined to generate correct hashkey."); + } + + if(!defined($phase)) { + throw("phase attribute must be defined to generate correct hashkey."); + } + + if(!defined($end_phase)) { + throw("end_phase attribute must be defined to generate correct hashkey."); + } + + return "$slice_name-$start-$end-$strand-$phase-$end_phase"; +} + + +=head2 display_id + + Arg [1] : none + Example : print $exons->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For exons this is (depending on + availability and in this order) the stable Id, the dbID or an + empty string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'stable_id'} || $self->dbID || ''; +} + + +=head2 load + + Args : None + Example : $exon->load(); + Description : The Ensembl API makes extensive use of + lazy-loading. Under some circumstances (e.g., + when copying genes between databases), all data of + an object needs to be fully loaded. This method + loads the parts of the object that are usually + lazy-loaded. + Returns : Nothing. + +=cut + +sub load { + my ($self) = @_; + + $self->analysis(); + $self->stable_id(); + $self->get_all_supporting_features(); +} + +=head1 DEPRECATED METHODS + +=cut + + +=head2 _get_stable_entry_info + + Description: DEPRECATED. + +=cut + +sub _get_stable_entry_info { + my $self = shift; + deprecate( "This function shouldnt be called any more" ); + if( !defined $self->adaptor ) { + return undef; + } + $self->adaptor->get_stable_entry_info($self); +} + + +=head2 temporary_id + + Description: DEPRECATED. This should not be necessary. + +=cut + +sub temporary_id { + my $self = shift; + deprecate('It should not be necessary to use this method.'); + $self->{'tempID'} = shift if(@_); + return $self->{'tempID'}; +} + + +=head2 created + + Description: DEPRECATED. Do not use. + +=cut + +sub created { + my ($self,$value) = @_; + deprecate( "Created attribute not supported any more." ); + if(defined $value ) { + $self->{'_created'} = $value; + } + return $self->{'_created'}; +} + +=head2 modified + + Description: DEPRECATED. Do not use. + +=cut + + +sub modified { + my ($self,$value) = @_; + deprecate( "Modified attribute not supported any more." ); + if( defined $value ) { + $self->{'_modified'} = $value; + } + return $self->{'_modified'}; +} + + +=head2 type + + Description: DEPRECATED. Do not use. + +=cut + +sub type { + my ($self,$value) = @_; + deprecate("Type attribute not supported anymore."); + if (defined($value)) { + $self->{'type'} = $value; + } + return $self->{'type'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/External/BlastAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/External/BlastAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1190 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::External::BlastAdaptor; + +use strict; +use DBI; +use Storable qw(freeze thaw); +use Data::Dumper qw( Dumper ); +use Time::Local; + +use vars qw(@ISA); + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::Search::HSP::EnsemblHSP; # This is a web module + +@ISA = qw( Bio::EnsEMBL::DBSQL::BaseAdaptor ); +#@ISA = qw( Bio::EnsEMBL::DBSQL::DBAdaptor ); + + +#---------------------------------------------------------------------- +# Define SQL + +#--- CREATE TABLES --- +our $SQL_CREATE_TICKET = " +CREATE TABLE blast_ticket ( + ticket_id int(10) unsigned NOT NULL auto_increment, + create_time datetime NOT NULL default '0000-00-00 00:00:00', + update_time datetime NOT NULL default '0000-00-00 00:00:00', + ticket varchar(32) NOT NULL default '', + status enum('CURRENT','DELETED') NOT NULL default 'CURRENT', + object longblob, + PRIMARY KEY (ticket_id), + UNIQUE KEY ticket (ticket), + KEY create_time (create_time), + KEY update_time (update_time) +) ENGINE=MyISAM"; + +our $SQL_CREATE_TABLE_LOG = " +CREATE TABLE blast_table_log ( + table_id int(10) unsigned NOT NULL auto_increment, + table_name varchar(32), + table_type enum('TICKET','RESULT','HIT','HSP') default NULL, + table_status enum('CURRENT','FILLED','DELETED') default NULL, + use_date date default NULL, + create_time datetime default NULL, + delete_time datetime default NULL, + num_objects int(10) default NULL, + PRIMARY KEY (table_id), + KEY table_name (table_name), + KEY table_type (table_type), + KEY use_date (use_date), + KEY table_status (table_status) +) ENGINE=MyISAM"; + + +our $SQL_CREATE_DAILY_RESULT = " +CREATE TABLE %s ( + result_id int(10) unsigned NOT NULL auto_increment, + ticket varchar(32) default NULL, + object longblob, + PRIMARY KEY (result_id), + KEY ticket (ticket) +) ENGINE=MyISAM"; + +our $SQL_CREATE_DAILY_HIT = " +CREATE TABLE %s ( + hit_id int(10) unsigned NOT NULL auto_increment, + ticket varchar(32) default NULL, + object longblob, + PRIMARY KEY (hit_id), + KEY ticket (ticket) +) ENGINE=MyISAM"; + +our $SQL_CREATE_DAILY_HSP = " +CREATE TABLE %s ( + hsp_id int(10) unsigned NOT NULL auto_increment, + ticket varchar(32) default NULL, + object longblob, + chr_name varchar(32) default NULL, + chr_start int(10) unsigned default NULL, + chr_end int(10) unsigned default NULL, + PRIMARY KEY (hsp_id), + KEY ticket (ticket) +) ENGINE=MyISAM MAX_ROWS=705032704 AVG_ROW_LENGTH=4000"; + +#--- TABLE LOG --- +our $SQL_SELECT_TABLE_LOG_CURRENT = " +SELECT use_date +FROM blast_table_log +WHERE table_type = ? +AND table_status = 'CURRENT' +ORDER BY use_date DESC"; + +our $SQL_TABLE_LOG_INSERT = " +INSERT into blast_table_log + ( table_name, table_status, table_type, use_date, create_time) +VALUES ( ?, ?, ?, ?, NOW() )"; + +our $SQL_TABLE_LOG_UPDATE = " +UPDATE blast_table_log +SET table_status = ?, + delete_time = ?, + num_objects = ? +WHERE table_name = ?"; + +#--- TICKETS --- + +our $SQL_SEARCH_MULTI_STORE = " +INSERT INTO blast_ticket ( create_time, update_time, object, ticket ) +VALUES ( NOW(), NOW(), ? , ? )"; + +our $SQL_SEARCH_MULTI_UPDATE = " +UPDATE blast_ticket +SET object = ?, + update_time = NOW() +WHERE ticket = ?"; + +our $SQL_SEARCH_MULTI_RETRIEVE = " +SELECT object +FROM blast_ticket +WHERE ticket = ? "; + +#--- RESULTS --- + +our $SQL_RESULT_STORE = " +INSERT INTO blast_result%s ( object, ticket ) +VALUES ( ? , ? )"; + +our $SQL_RESULT_UPDATE = " +UPDATE blast_result%s +SET object = ?, + ticket = ? +WHERE result_id = ?"; + +our $SQL_RESULT_RETRIEVE = " +SELECT object +FROM blast_result%s +WHERE result_id = ? "; + +our $SQL_RESULT_RETRIEVE_TICKET = " +SELECT object +FROM blast_result%s +WHERE ticket = ? "; + +#--- HITS --- + +our $SQL_HIT_STORE = " +INSERT INTO blast_hit%s ( object, ticket ) +VALUES ( ? , ? )"; + +our $SQL_HIT_UPDATE = " +UPDATE blast_hit%s +SET object = ?, + ticket = ? +WHERE hit_id = ?"; + +our $SQL_HIT_RETRIEVE = " +SELECT object +FROM blast_hit%s +WHERE hit_id = ? "; + +#--- HSPS --- + +our $SQL_HSP_STORE = " +INSERT INTO blast_hsp%s ( object, ticket, chr_name, chr_start, chr_end ) +VALUES ( ? , ? , ? , ? , ? )"; + +our $SQL_HSP_UPDATE = " +UPDATE blast_hsp%s +SET object = ?, + ticket = ?, + chr_name = ?, + chr_start = ?, + chr_end = ? +WHERE hsp_id = ?"; + +our $SQL_HSP_RETRIEVE = " +SELECT object +FROM blast_hsp%s +WHERE hsp_id = ? "; + +our $SQL_HSP_REMOVE = " +UPDATE blast_hsp%s +SET chr_name = NULL, + chr_start = NULL, + chr_end = NULL +WHERE hsp_id = ?"; + + +#=head2 new +# +# Arg [1] : +# Function : +# Returntype: +# Exceptions: +# Caller : +# Example : +# +#=cut +# +# +sub new { + my $caller = shift; +#warn "DB - @_"; + my $connection = Bio::EnsEMBL::DBSQL::DBConnection->new(@_); + my $self = $caller->SUPER::new($connection); + $self->{'disconnect_flag'} = 1; + return $self; +} + + +sub new_fast{ + my ($caller,$connection) = @_; + my $self = $caller->SUPER::new($connection); + $self->{'disconnect_flag'} = 1; + return $self; +} + +#---------------------------------------------------------------------- + +sub species { + my ($self, $arg ) = @_; + ( defined $arg ) && + ( $self->{_species} = $arg ); + $self->{_species}; +} + +#---------------------------------------------------------------------- + +=head2 ticket + + Arg [1] : string ticket (optional) + Function : Get/get the blast ticket attribute + Returntype: string ticket + Exceptions: + Caller : + Example : + +=cut + +sub ticket{ + my $key = "_ticket"; + my $self = shift; + if( @_ ){ $self->{$key} = shift } + return $self->{$key}; +} + +#---------------------------------------------------------------------- + +=head2 store + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub store { + my $self = shift; + my $obj = shift; + my $ret_value = undef; + if( $obj->isa("Bio::Tools::Run::SearchMulti") ) { + $ret_value = $self->store_search_multi( $obj, @_ ); +# warn "Just stored as Bio::Tools::Run::SearchMulti"; + } elsif( $obj->isa( "Bio::Search::Result::ResultI" ) ) { + $ret_value = $self->store_result( $obj, @_ ); +# warn "Just stored as Bio::Tools::Result::ResultI"; + } elsif( $obj->isa( "Bio::Search::Hit::HitI" ) ) { + $ret_value = $self->store_hit( $obj, @_ ); +# warn "Just stored as Bio::Tools::Hit::HitI"; + } elsif( $obj->isa( "Bio::Search::HSP::HSPI" ) ) { + $ret_value = $self->store_hsp( $obj, @_ ); +# warn "Just stored as Bio::Tools::HSP::HSPI"; + } else { +# warn "DID NOT STORE ".ref($obj); + $self->throw( "Do not know how to store objects of type ".ref($obj) ); + return undef; + } +# if( $self->{'disconnect_flag'} ) { +# warn "HERE WE ARE DISCONNECTING...."; +# $self->dbc->db_handle->disconnect(); +# $self->dbc->connected(0); +# warn "AND WE ARE RECONNECTING...."; +# $self->dbc->connect(); +# } + return $ret_value; +} + +sub prepare { + my $self = shift; +# warn( "==> ", $self->dbc->dbname, " ", $self->dbc->db_handle ); +#warn @_; + my $T = $self->SUPER::prepare( @_ ); +# warn( "<== ", $self->dbc->dbname, " ", $self->dbc->db_handle ); + return $T; +} +#---------------------------------------------------------------------- + +=head2 retrieve + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub retrieve { + my $self = shift; + my $caller = shift; + my %METHODS = qw( + Bio::Tools::Run::EnsemblSearchMulti search_multi + Bio::Search::Result::ResultI result + Bio::Search::Hit::HitI hit + Bio::Search::HSP::HSPI hsp + ); + foreach my $type (keys %METHODS) { + if( UNIVERSAL::isa($caller, $type) ) { + my $method = "retrieve_$METHODS{$type}"; + return $self->$method( @_ ); + } + } + return undef if UNIVERSAL::isa($caller,'Bio::Tools::Run::Search'); + $self->throw( "Do not know how to retrieve objects of type ". + ( ref($caller)? ref($caller) : $caller ) ); +} + +#---------------------------------------------------------------------- + +=head2 remove + + Arg [1] : + Function : TODO: implement remove functions + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub remove { + my $self = shift; + my $obj = shift; + return 1 if $obj->isa("Bio::Tools::Run::EnsemblSearchMulti"); # Nothing to do here { return $self->remove_search_multi( @_ ); } + return 1 if $obj->isa("Bio::Search::Result::ResultI"); # Nothing to do here { return $self->remove_result( @_ ); } + return 1 if $obj->isa("Bio::Search::Hit::HitI"); # Nothing to do here { return $self->remove_hit( @_ ); } + return $self->remove_hsp( $obj ) if $obj->isa("Bio::Search::HSP::HSPI"); + return undef(); # Do not know how to remove objects of this type +} +#---------------------------------------------------------------------- +=head2 store_search_multi + + Arg [1] : Bio::Tools::Run::EnsemblSearchMulti obj + Function : Stores the ensembl SearchMulti container object in the database + Returntype: scalar (token) + Exceptions: + Caller : + Example : my $container_token = $blast_adpt->store_ticket( $container ); + +=cut + +sub store_search_multi{ + my $self = shift; + my $search_multi = shift || + $self->throw( "Need a Bio::Tools::Run::EnsemblSearchMulti obj" ); + + my $frozen = shift || $search_multi->serialise; + + my $dbh = $self->dbc->db_handle; + my $ticket = $search_multi->token || $self->throw( "Bio::Tools::Run::EnsemblSearchMulti obj has no ticket" ); + + my $sth = $self->prepare( $SQL_SEARCH_MULTI_RETRIEVE ); + my $rv = $sth->execute( $ticket ) || $self->throw( $sth->errstr ); + $sth->finish; + + if( $rv < 1 ){ # Insert (do first to minimise risk of race) + my $sth = $self->prepare( $SQL_SEARCH_MULTI_STORE ); + $sth->execute( $frozen, $ticket ) || $self->throw( $sth->errstr ); + #$search_multi->token( $self->dbh->{mysql_insertid} ); + $sth->finish; + } + else{ # Update + my $sth = $self->prepare( $SQL_SEARCH_MULTI_UPDATE ); + $sth->execute( $frozen, $ticket ) || $self->throw( $sth->errstr ); + $sth->finish; + } + my $sth = $self->prepare('show tables'); $sth->execute(); $sth->finish; + return $search_multi->token(); +} + +#---------------------------------------------------------------------- + +=head2 retrieve_search_multi + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub retrieve_search_multi { + my $self = shift; + my $ticket = shift || $self->throw( "Need an EnsemblSearchMulti ticket" ); + + my $dbh = $self->dbc->db_handle; +warn $dbh; +warn $SQL_SEARCH_MULTI_RETRIEVE; + my $sth = $self->prepare( $SQL_SEARCH_MULTI_RETRIEVE ); +warn $sth; + my $rv = $sth->execute( $ticket ) || $self->throw( $sth->errstr ); + if( $rv < 1 ){ $self->throw( "Token $ticket not found" ) } + my ( $frozen ) = $sth->fetchrow_array; + $frozen || $self->throw( "Object from ticket $ticket is empty" ); + $sth->finish; + return $frozen; +} + + + +#---------------------------------------------------------------------- +=head2 store_result + + Arg [1] : Bio::Search::Result::EnsemblResult obj + Function : Stores the ensembl Result in the database + Returntype: scalar (token) + Exceptions: + Caller : + Example : my $result_token = $blast_adpt->store_result( $result ); + +=cut + +sub store_result{ + my $self = shift; + my $res = shift || $self->throw( "Need a Bio::Search::Result::EnsemblResult obj" ); + my $frozen = shift || $res->serialise; + my $dbh = $self->dbc->db_handle; + my $sth; + + my ( $id, $use_date ) = split( '!!', $res->token || '' ); + $use_date ||= $self->use_date( 'RESULT' ); + #my $ticket = $res->group_ticket || warn( "Result $id has no ticket" ); + my $ticket = $self->ticket || warn("Result $id BlastAdaptor has no ticket"); + + my $rv = 0; + if( $id ){ + $sth = $self->prepare( sprintf $SQL_RESULT_RETRIEVE, $use_date ); + $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + if( $rv < 1 ){ # We have no result with this token string Insert + my $use_date = $res->use_date() || $res->use_date($self->use_date('RESULT')); + $sth = $self->prepare( sprintf $SQL_RESULT_STORE, $use_date ); + $sth->execute( $frozen, $ticket ) || $self->throw( $sth->errstr ); + my $id = $dbh->{mysql_insertid}; + $res->token( join( '!!', $id, $use_date ) ); + $sth->finish; + } else { # Update + $sth = $self->prepare( sprintf $SQL_RESULT_UPDATE, $use_date ); + $sth->execute( $frozen, $ticket, $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + return $res->token(); +} + +sub store_result_2{ + my $self = shift; + my $res = shift || $self->throw( "Need a Bio::Search::Result::EnsemblResult obj" ); + my $frozen = shift || $res->serialise; + my $dbh = $self->dbc->db_handle; + my $sth; + + my ( $id, $use_date ) = split( '!!', $res->token || '' ); + $use_date ||= $self->use_date( 'RESULT' ); + #my $ticket = $res->group_ticket || warn( "Result $id has no ticket" ); + my $ticket = $self->ticket || warn("Result $id BlastAdaptor has no ticket"); + + my $rv = 0; + if( $ticket ){ + $sth = $self->prepare( sprintf $SQL_RESULT_RETRIEVE_TICKET, $use_date ); + $rv = $sth->execute( $ticket ) || $self->throw( $sth->errstr ); + $sth->finish; + } + if( !$rv && $id ){ + $sth = $self->prepare( sprintf $SQL_RESULT_RETRIEVE, $use_date ); + $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + if( $rv < 1 ){ # We have no result with this token string Insert + my $use_date = $res->use_date() || $res->use_date($self->use_date('RESULT')); + $sth = $self->prepare( sprintf $SQL_RESULT_STORE, $use_date ); + $sth->execute( $frozen, $ticket ) || $self->throw( $sth->errstr ); + my $id = $dbh->{mysql_insertid}; + $res->token( join( '!!', $id, $use_date ) ); + $sth->finish; + } else { # Update + $sth = $self->prepare( sprintf $SQL_RESULT_UPDATE, $use_date ); + $sth->execute( $frozen, $ticket, $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + return $res->token(); +} + +#---------------------------------------------------------------------- + +=head2 retrieve_result + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub retrieve_result{ + my $self = shift; + my $token = shift || $self->throw( "Need a Hit token" ); + my ( $id, $use_date ) = split( '!!',$token); + $use_date ||= ''; + + my $dbh = $self->dbc->db_handle; + my $sth = $self->prepare( sprintf $SQL_RESULT_RETRIEVE, $use_date ); + my $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + if( $rv < 1 ){ $self->throw( "Token $id not found" ) } + my ( $frozen ) = $sth->fetchrow_array; + $frozen || $self->throw( "Object from result $id is empty" ); + $sth->finish; + return $frozen; +} + +#---------------------------------------------------------------------- +=head2 store_hit + + Arg [1] : Bio::Search::Hit::EnsemblHit obj + Function : Stores the ensembl Hit in the database + Returntype: scalar (token) + Exceptions: + Caller : + Example : my $hit_token = $blast_adpt->store_hit( $hit ); + +=cut + +sub store_hit{ + my $self = shift; + my $hit = shift || + $self->throw( "Need a Bio::Search::Hit::EnsemblHit obj" ); + my $frozen = shift || $hit->serialise; + + my $dbh = $self->dbc->db_handle; + + my ( $id, $use_date ) = split( '!!', $hit->token || '' ); + $use_date ||= $hit->use_date() || $hit->use_date($self->use_date('HIT'));; + #my $ticket = $hit->group_ticket || warn( "Hit $id has no ticket" ); + my $ticket = $self->ticket || warn("Hit $id BlastAdaptor has no ticket"); + + my $rv = 0; + if( $id ){ + my $sth = $self->prepare( sprintf $SQL_HIT_RETRIEVE, $use_date ); + $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + if( $rv < 1 ){ # Insert + my $sth = $self->prepare( sprintf $SQL_HIT_STORE, $use_date ); + $sth->execute( $frozen, $ticket ) || $self->throw( $sth->errstr ); + my $id = $dbh->{mysql_insertid}; + $hit->token( join( '!!', $id, $use_date ) ); + $sth->finish; + } + else{ # Update + my $sth = $self->prepare( sprintf $SQL_HIT_UPDATE, $use_date ); + $sth->execute( $frozen, $ticket, $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + return $hit->token(); +} + +#---------------------------------------------------------------------- + +=head2 retrieve_hit + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub retrieve_hit{ + my $self = shift; + my $token = shift || $self->throw( "Need a Hit token" ); + my ( $id, $use_date ) = split( '!!',$token); + $use_date ||= ''; + my $dbh = $self->dbc->db_handle; + my $sth = $self->prepare( sprintf $SQL_HIT_RETRIEVE, $use_date ); + my $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + if( $rv < 1 ){ $self->throw( "Token $token not found" ) } + my ( $frozen ) = $sth->fetchrow_array; + $frozen || $self->throw( "Object from hit $id is empty" ); + $sth->finish; + return $frozen; +} + +#---------------------------------------------------------------------- +=head2 store_hsp + + Arg [1] : Bio::Search::HSP::EnsemblHSP obj + Function : Stores the ensembl HSP in the database + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub store_hsp{ + my $self = shift; + my $hsp = shift || + $self->throw( "Need a Bio::Search::HSP::EnsemblHSP obj" ); + my $frozen = shift || $hsp->serialise; + + my $dbh = $self->dbc->db_handle; + my ( $id, $use_date ) = split( '!!', $hsp->token || ''); + $use_date ||= $hsp->use_date() || $hsp->use_date($self->use_date('HSP')); + + #my $ticket = $hsp->group_ticket || warn( "HSP $id has no ticket" ); + my $ticket = $self->ticket || warn( "HSP $id BlastAdaptor has no ticket" ); + + my $chr_name = ''; + my $chr_start = 0; + my $chr_end = 0; + if( my $genomic = $hsp->genomic_hit ){ + $chr_name = $genomic->seq_region_name; + $chr_start = $genomic->start; + $chr_end = $genomic->end; + } + my $rv = 0; + if( $id ){ + my $sth = $self->prepare( sprintf $SQL_HSP_RETRIEVE, $use_date ); + $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + $sth->finish; + } + if( $rv < 1 ){ # Insert + my $use_date = $hsp->use_date() || $hsp->use_date($self->use_date('HSP')); + my $sth = $self->prepare( 'show tables' ); $sth->execute(); $sth->finish(); + $sth = $self->prepare( sprintf $SQL_HSP_STORE, $use_date ); + my @bound = ( $frozen, $ticket, $chr_name, $chr_start, $chr_end ); + $sth->execute( @bound ) || $self->throw( $sth->errstr ); + my $id = $dbh->{mysql_insertid}; + $hsp->token( join( '!!', $id, $use_date ) ); + $sth->finish; + } + else{ # Update + my $sth = $self->prepare( sprintf $SQL_HSP_UPDATE, $use_date ); + my @bound = ( $frozen, $ticket, $chr_name, $chr_start, $chr_end, $id ); + $sth->execute( @bound ) || $self->throw( $sth->errstr ); + $sth->finish; + } + return $hsp->token(); +} + +#---------------------------------------------------------------------- + +=head2 retrieve_hsp + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub retrieve_hsp{ + my $self = shift; + my $token = shift || $self->throw( "Need an HSP token" ); + my ( $id, $use_date ) = split( '!!',$token); + $use_date ||= ''; + my $dbh = $self->dbc->db_handle; + my $sth = $self->prepare( sprintf $SQL_HSP_RETRIEVE, $use_date ); + my $rv = $sth->execute( $id ) || $self->throw( $sth->errstr ); + if( $rv < 1 ){ $self->throw( "Token $token not found" ) } + my ( $frozen ) = $sth->fetchrow_array; + $frozen || $self->throw( "Object from hsp $id is empty" ); + $sth->finish; + return $frozen; +} + +#---------------------------------------------------------------------- + +=head2 remove_hsp + + Arg [1] : $hsp object to be removed + Function : 'removes' hsp from e.g. contigview by setting chr fields + to null + Returntype: + Exceptions: + Caller : $self->remove + Example : + +=cut + +sub remove_hsp { + my $self = shift; + my $hsp = shift || + $self->throw( "Need a Bio::Search::HSP::EnsemblHSP obj" ); + + my $dbh = $self->dbc->db_handle; + + my ( $id, $use_date ) = split( '!!', $hsp->token || ''); + $use_date ||= $hsp->use_date() || $hsp->use_date($self->use_date('HSP')); + + my $sth = $self->prepare( sprintf $SQL_HSP_REMOVE, $use_date ); + my @bound = ( $id ); + my $rv = $sth->execute( @bound ) || $self->throw( $sth->errstr ); + $sth->finish; + return 1; +} + + + +#---------------------------------------------------------------------- + +=head2 get_all_HSPs + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub get_all_HSPs { + my $self = shift; + my $ticket = shift || $self->throw( "Need a search ticket!"); + my $chr_name = shift || undef; + my $chr_start = shift || undef; + my $chr_end = shift || undef; + my ( $id, $use_date ) = split( '!!', $ticket ); + $use_date ||= ''; + + my $SQL = qq( +SELECT object, hsp_id +FROM blast_hsp%s +WHERE ticket = ? ); + + my $CHR_SQL = qq( +AND chr_name = ? ); + + my $RANGE_SQL = qq( +AND chr_start <= ? +AND chr_end >= ? ); + + my $q = sprintf( $SQL, $use_date ); + my @binded = ( $id ); + + if( $chr_name ){ + $q .= $CHR_SQL; + push @binded, $chr_name; + + if( $chr_start && $chr_end ){ + $q .= $RANGE_SQL; + push @binded, $chr_end, $chr_start; + } + } + my $sth = $self->dbc->db_handle->prepare($q); + my $rv = $sth->execute( @binded ) || $self->throw( $sth->errstr ); + + my @hsps = (); + foreach my $row( @{$sth->fetchall_arrayref()} ){ + # Retrieve HSP and reset token + my $hsp = thaw( $row->[0] ); + my $hsp_id = $row->[1]; + $hsp->token( join( '!!', $hsp_id, $use_date ) ); + push @hsps, $hsp; + } + $sth->finish; + return [@hsps]; +} + + + +#---------------------------------------------------------------------- + +=head2 get_all_SearchFeatures + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +sub get_all_SearchFeatures { + my $self = shift; + my $hsps = $self->get_all_HSPs(@_); + my $ticket = shift; + + $self->dynamic_use( ref($hsps->[0] ) ); + my @feats = (); + foreach my $hsp( @$hsps ){ + my $base_align = $hsp->genomic_hit || next; + + ( $ticket ) = split( "!!", $ticket ); + my $hsp_id = join( "!!", $ticket, $hsp->token ); + + $base_align->hseqname( join( ":", $base_align->hseqname, $hsp_id ) ); + push @feats, $base_align; + } + return [ @feats ]; +} + +sub dynamic_use { + my( $self, $classname ) = @_; + my( $parent_namespace, $module ) = $classname =~/^(.*::)(.*?)$/; + no strict 'refs'; + return 1 if $parent_namespace->{$module.'::'}; # return if already used + eval "require $classname"; + if($@) { + warn "DrawableContainer: failed to use $classname\nDrawableContainer: $@"; + return 0; + } + $classname->import(); + return 1; +} + +#---------------------------------------------------------------------- + +=head2 use_date + + Arg [1] : + Function : + Returntype: + Exceptions: + Caller : + Example : + +=cut + +my %valid_table_types = ( HIT=>1, HSP=>1, RESULT=>1 ); +sub use_date { + my $key = '_current_table'; + my $self = shift; + my $type = uc( shift ); +#warn "$self --- $key --- $type $self"; + $valid_table_types{$type} || + $self->throw( "Need a table type (Result, Hit or HSP)" ); + + $self->{$key} ||= {}; + if( ! $self->{$key}->{$type} ){ + my $sth = $self->dbc->db_handle->prepare( " +SELECT table_type, use_date + FROM blast_table_log + WHERE table_status = 'CURRENT' +ORDER BY use_date ASC" ); +#warn "prepare... $sth"; +#warn $SQL_SELECT_TABLE_LOG_CURRENT; +#warn $type; + my $rv = $sth->execute();# $type ); +#warn $rv; + unless( $rv ) { + $sth->finish; + warn( $sth->errstr ); + return; + } +#warn "exec..."; + foreach my $r (@{ $sth->fetchall_arrayref }) { + my $date = $r->[1]; + $date =~ s/-//g; + $self->{$key}->{$r->[0]} = $date; +#warn "$r->[0] ---> $r->[1] ---> $date"; + } +# $rv > 0 || ( warn( "No current $type table found" ) && return ); + $sth->finish; +#warn "end of finish..."; + } + return $self->{$key}->{$type}; +} + + + +#---------------------------------------------------------------------- + +=head2 clean_blast_database + + Arg [1] : int $days + Function : Removes blast tickets older than $days days + Returntype: + Exceptions: SQL errors + Caller : + Example : $ba->clean_blast_database(14) + +=cut + +sub clean_blast_database{ + my $self = shift; + my $days = shift || $self->throw( "Missing arg: number of days" ); + $days =~ /\D/ && $self->throw( "Bad arg: number of days $days not int" ); + my $dbh = $self->dbc->db_handle; + + # Get list of tickets > $days days old + my $q = qq( + SELECT ticket_id + FROM blast_ticket + WHERE update_time < SUBDATE( NOW(), INTERVAL $days DAY ) ); + + my $sth = $self->dbc->db_handle->prepare($q); + my $rv = $sth->execute() || $self->throw( $sth->errstr ); + my $res = $sth->fetchall_arrayref; + $sth->finish; + + # Delete result and ticket rows associated with old tickets + my $q_del_tmpl = qq( + DELETE + FROM blast_ticket + WHERE ticket_id = %s); + + my $c = 0; + foreach my $row( @$res ){ + my $ticket_id = $row->[0]; + $c++; + my $q_del = sprintf( $q_del_tmpl, $ticket_id ); + my $sth = $self->dbc->db_handle->prepare($q_del); + my $rv = $sth->execute() || $self->throw( $sth->errstr ); + } + warn "Purging $days days: Deleted $c rows\n"; + + # Drop daily Result, Hit and HSP tables not updated within $days days + my $q_find = 'show table status like ?'; + my $sth2 = $self->prepare( $q_find ); + $sth2->execute( "blast_result%" ) || $self->throw( $sth2->errstr ); + my $res_res = $sth2->fetchall_arrayref(); + $sth2->execute( "blast_hit%" ) || $self->throw( $sth2->errstr ); + my $hit_res = $sth2->fetchall_arrayref(); + $sth2->execute( "blast_hsp%" ) || $self->throw( $sth2->errstr ); + my $hsp_res = $sth2->fetchall_arrayref(); + + my @deletable_hit_tables; + foreach my $row( @$res_res, @$hit_res, @$hsp_res ){ + my $table_name = $row->[0]; ## table name + my $num_rows = $row->[4]; ## # Rows... + my $update_time = $row->[12]; ## update time --- Should be a string like 2003-08-15 10:36:56 + next unless $update_time; #cope with an occasional innodb table that has no update time + my @time = split( /[-:\s]/, $update_time ); + + my $epoch_then = timelocal( $time[5], $time[4], $time[3], + $time[2], $time[1]-1, $time[0] - 1900 ); + my $secs_old = time() - $epoch_then; + my $days_old = $secs_old / ( 60 * 60 * 24 ); + if( $days_old > $days ){ + warn( "Dropping table $table_name: $num_rows rows\n" ); + my $sth_drop = $self->prepare( "DROP table $table_name" ); + my $sth_log = $self->prepare( $SQL_TABLE_LOG_UPDATE ); + $sth_drop->execute || $self->throw( $sth_drop->errstr ); + my( $se,$mi,$hr,$da,$mo,$yr ) = (localtime)[0,1,2,3,4,5]; + my $now = sprintf( "%4d-%2d-%2d %2d:%2d:%2d", + $yr+1900,$mo+1,$da,$hr,$mi,$se ); + $sth_log->execute + ('DELETED',$now,$num_rows,$table_name) || + $self->throw( $sth_log->errstr ); + } + } + + return 1; +} + +#---------------------------------------------------------------------- + +=head2 create_tables + + Arg [1] : none + Function : Creates the blast_ticket and blast_table_log + tables in the database indicated by the database handle. + Checks first to make sure they do not exist + Returntype: boolean + Exceptions: + Caller : + Example : + +=cut + +sub create_tables { + my $self = shift; + my $dbh = $self->dbc->db_handle; + + # Get list of existing tables in database + my $q = 'show tables like ?'; + my $sth = $self->prepare( $q ); + my $rv_tck = $sth->execute("blast_ticket") || $self->throw($sth->errstr); + my $rv_log = $sth->execute("blast_table_log" )|| $self->throw($sth->errstr); + $sth->finish; + + if( $rv_tck == 0 ){ + warn( "Creating blast_ticket table\n" ); + my $sth = $self->prepare( $SQL_CREATE_TICKET ); + my $rv = $sth->execute() || $self->throw( $sth->errstr ); + $sth->finish; + } + else{ warn( "blast_ticket table already exists\n" ) } + + if( $rv_log == 0 ){ + warn( "Creating blast_result table\n" ); + my $sth = $self->prepare( $SQL_CREATE_TABLE_LOG ); + my $rv = $sth->execute() || $self->throw( $sth->errstr ); + $sth->finish; + } + else{ warn( "blast_table_log table already exists\n" ) } + + return 1; +} + +#---------------------------------------------------------------------- + +=head2 rotate_daily_tables + + Arg [1] : none + Function : Creates the daily blast_result{date}, blast_hit{date} + and blast_hsp{date} tables in the database indicated by + the database handle. + Checks first to make sure they do not exist. + Sets the new table to 'CURRENT' in the blast_table_log. + Sets the previous 'CURRENT' table to filled. + Returntype: boolean + Exceptions: + Caller : + Example : + +=cut + +sub rotate_daily_tables { + my $self = shift; + my $dbh = $self->dbc->db_handle; + + # Get date + my( $day, $month, $year ) = (localtime)[3,4,5]; + my $date = sprintf( "%04d%02d%02d", $year+1900, $month+1, $day ); + + my $res_table = "blast_result$date"; + my $hit_table = "blast_hit$date"; + my $hsp_table = "blast_hsp$date"; + + # Get list of existing tables in database + my $q = 'show table status like ?'; + my $sth = $self->prepare( $q ); + my $rv_res = $sth->execute($res_table) || $self->throw($sth->errstr); + my $rv_hit = $sth->execute($hit_table) || $self->throw($sth->errstr); + my $rv_hsp = $sth->execute($hsp_table) || $self->throw($sth->errstr); + $sth->finish; + + if( $rv_res == 0 ){ + warn( "Creating today's $res_table table\n" ); + + # Create new table + my $q = sprintf($SQL_CREATE_DAILY_RESULT, $res_table); + my $sth1 = $self->prepare( $q ); + my $rv = $sth1->execute() || $self->throw( $sth1->errstr ); + + # Flip current table in blast_table_tog + my $last_date = $self->use_date( "RESULT" ) || ''; + my $sth2 = $self->prepare( $SQL_TABLE_LOG_INSERT ); + my $sth3 = $self->prepare( $SQL_TABLE_LOG_UPDATE ); + $sth2->execute( "$res_table",'CURRENT','RESULT',$date ) + || die( $self->throw( $sth2->errstr ) ); + $sth3->execute( 'FILLED','0',0,"blast_result$last_date") + || die( $self->throw( $sth3->errstr ) ); + $sth1->finish(); + $sth2->finish(); + $sth3->finish(); + } + else{ warn( "Today's $res_table table already exists\n" ) } + + if( $rv_hit == 0 ){ + warn( "Creating today's $hit_table table\n" ); + + # Create new table + my $q = sprintf($SQL_CREATE_DAILY_HIT, $hit_table); + my $sth1 = $self->prepare( $q ); + my $rv = $sth1->execute() || $self->throw( $sth1->errstr ); + + # Flip current table in blast_table_tog + my $last_date = $self->use_date( "HIT" ) || ''; + my $sth2 = $self->prepare( $SQL_TABLE_LOG_INSERT ); + my $sth3 = $self->prepare( $SQL_TABLE_LOG_UPDATE ); + $sth2->execute( "$hit_table",'CURRENT','HIT',$date ) + || die( $self->throw( $sth2->errstr ) ); + $sth3->execute( 'FILLED','0',0,"blast_hit$last_date") + || die( $self->throw( $sth3->errstr ) ); + $sth1->finish(); + $sth2->finish(); + $sth3->finish(); + } + else{ warn( "Today's $hit_table table already exists\n" ) } + + if( $rv_hsp == 0 ){ + warn( "Creating today's $hsp_table table\n" ); + + # Create new table + my $q = sprintf($SQL_CREATE_DAILY_HSP, $hsp_table ); + my $sth1 = $self->prepare( $q ); + my $rv = $sth1->execute() || $self->throw( $sth1->errstr ); + + # Flip current table in blast_table_tog + my $last_date = $self->use_date( "HSP" ) || ''; + my $sth2 = $self->prepare( $SQL_TABLE_LOG_INSERT ); + my $sth3 = $self->prepare( $SQL_TABLE_LOG_UPDATE ); + $sth2->execute( "$hsp_table",'CURRENT','HSP',$date ) + || die( $self->throw( $sth2->errstr ) ); + $sth3->execute( 'FILLED','0',0,"blast_hsp$last_date") + || die( $self->throw( $sth3->errstr ) ); + $sth1->finish(); + $sth2->finish(); + $sth3->finish(); + } + else{ warn( "Today's $hsp_table table already exists\n" ) } + return 1; +} + +#---------------------------------------------------------------------- + + +=head2 cleanup_processes + + Arg [1] : none + Function : Kills any sleeping processes older that 1000 + Returntype: boolean + Exceptions: + Caller : + Example : + +=cut + +sub cleanup_processes { + my $self = shift; + my $dbh = $self->dbc->db_handle; + my $sth = $self->prepare( 'show processlist' ); + my $kill_sth = $self->prepare('kill ?'); + $sth->execute; + my $res = $sth->fetchall_arrayref([0,3,4,5]); + my $c = 0; + foreach my $ps (@$res) { + my ($pid,$db,$stat,$time) = @$ps; + if ($db eq 'ensembl_blast') { + if ( ($stat eq 'Sleep') && ($time > 1000) ) { + $kill_sth->execute($pid); + $c++; + } + } + } + warn "Killed $c processes"; + return 1; +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/External/ExternalFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/External/ExternalFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,702 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::External::ExternalFeatureAdaptor + +=head 1 SUMMARY + +Allows features created externally from Ensembl in a single +coordinate system to be retrieved in several other (Ensembl-style) +coordinate systems. This is intended to be a replacement for the old +Bio::EnsEMBL::DB::ExternalFeatureFactoryI interface. + +=head1 SYNOPSIS + + $database_adaptor = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => 'kaka.sanger.ac.uk', + -dbname => 'homo_sapiens_core_9_30', + -pass => 'anonymous' + ); + + $xf_adaptor = new ExternalFeatureAdaptorSubClass; + + # Connect the Ensembl core database: + $xf_adaptor->db($database_adaptor); + + # get some features in vontig coords + @feats = + @{ $xf_adaptor->fetch_all_by_contig_name('AC000087.2.1.42071') }; + + # get some features in assembly coords + @feats = + @{ $xf_adaptor->fetch_all_by_chr_start_end( 'X', 100000, 200000 ) }; + + # get some features in clone coords + @feats = @{ $xf_adaptor->fetch_all_by_clone_accession('AC000087') }; + + # Add the adaptor to the ensembl core dbadaptor (implicitly sets db + # attribute) + $database_adaptor->add_ExternalFeatureAdaptor($xf_adaptor); + + # get some features in Slice coords + $slice_adaptor = $database_adaptor->get_SliceAdaptor; + $slice = + $slice_adaptor->fetch_all_by_chr_start_end( 1, 100000, 200000 ); + @feats = @{ $xf_adaptor->fetch_all_by_Slice($slice) }; + + # now features can be retrieved directly from Slice + @feats = @{ $slice->get_all_ExternalFeatures }; + +=head1 DESCRIPTION + +This class is intended to be used as a method of getting external +features into EnsEMBL. To work, this class must be extended and must +implement the the coordinate_systems method. As well, the subclass +is required to implement a single fetch method so that the external +features may be retrieved. By implementing a single fetch_method in a +single coordinate system all of the other ExternalFeatureAdaptor fetch +methods become available for retrieving the data in several different +coordinate systems. + +The coordinate_systems method should return a list of strings indicating +which coordinate system(s) have been implemented. If a given string is +returned from the coordinate_systems method then the corresponding fetch +method must be implemented. The reverse is also true: if a fetch method +is implemented then coordinate_systems must return the appropriate +string in its list of return values. The following are the valid +coordinate system values and the corresponding fetch methods that must +be implemented: + + COORD SYSTEM STRING FETCH_METHOD + ------------------- ------------ + 'ASSEMBLY' fetch_all_by_chr_start_end + 'CLONE' fetch_all_by_clone_accession + 'CONTIG' fetch_all_by_contig_name + 'SUPERCONTIG' fetch_all_by_supercontig_name + 'SLICE' fetch_all_by_Slice + +The objects returned by the fetch methods should be EnsEMBL or BioPerl +style Feature objects. These objects MUST have start, end and strand +methods. + +Before the non-overridden ExternalFeatureAdaptor fetch methods may +be called an EnsEMBL core database adaptor must be attached to the +ExternalFeatureAdaptor . This database adaptor is required to perform +the remappings between various coordinate system. This may be done +implicitly by adding the ExternalFeatureAdaptor to the database adaptor +through a call to the DBAdaptor add_ExternalFeatureAdaptor method or +explicitly by calling the ExternalFeatureAdaptor ensembl_db method. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::External::ExternalFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(warning throw); + + +=head2 new + + Arg [1] : none + Example : $xfa = new Bio::EnsEMBL::External::ExternalFeatureAdaptor; + Description: Creates a new ExternalFeatureAdaptor object. You may wish to + extend this constructor and provide your own set of paremeters. + Returntype : Bio::EnsEMBL::External::ExternalFeatureAdaptor + Exceptions : none + Caller : general + +=cut + +sub new { + my $class = shift; + + if(ref $class) { + return bless {}, ref $class; + } + + return bless {}, $class; +} + + + +=head2 ensembl_db + + Arg [1] : (optional) Bio::EnsEMBL::DBSQL::DBAdaptor + Example : $external_feature_adaptor->ensembl_db($new_val); + Description: none + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : none + Caller : internal + +=cut + +sub ensembl_db { + my ($self, $value) = @_; + + if($value) { + $self->{'ensembl_db'} = $value; + } + + return $self->{'ensembl_db'}; +} + + + +=head2 coordinate_systems + + Arg [1] : none + Example : @implemented_coord_systems = $ext_adaptor->coordinate_systems; + Description: ABSTRACT method. Must be implemented by all + ExternalFeatureAdaptor subclasses. This method returns a list + of coordinate systems which are implemented by the subclass. + A minimum of on valid coordinate system must be implemented. + Valid coordinate systems are: 'SLICE', 'ASSEMBLY', 'CONTIG', + and 'CLONE'. + Returntype : list of strings + Exceptions : none + Caller : internal + +=cut + +sub coordinate_systems { + my $self = shift; + + throw("abstract method coordinate_systems not implemented\n"); + + return ''; +} + + +=head2 track_name + + Arg [1] : none + Example : $track_name = $xf_adaptor->track_name; + Description: Currently this is not really used. In the future it may be + possible to have ExternalFeatures automatically displayed by + the EnsEMBL web code. By default this method returns + 'External features' but you are encouraged to override this + method and provide your own meaningful name for the features + your adaptor provides. This also allows you to distinguish the + type of features retrieved from Slices. See + the PODs for Bio::EnsEMBL::Slice::get_all_ExternalFeatures and + Bio::EnsEMBL::DBSQL::DBAdaptor::add_ExternalFeatureAdaptor + methods. + Returntype : string + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBAdaptor::add_ExternalFeatureAdaptor + +=cut + +sub track_name { + my $self = shift; + + return 'External features'; +} + + + +=head2 feature_type + + Arg [1] : none + Example : $feature_type = $xf_adaptor->track_name + Description: Currently this is not used. In the future it may be possible + to have ExternalFeatures automatically displayed by the EnsEMBL + web code. This method would then be used do determine the + type of glyphs used to draw the features which are returned + from this external adaptor. + Returntype : string + Exceptions : none + Caller : none + +=cut + +sub feature_type { + my $self = shift; + + return qw(SIMPLE); +} + + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + Example : @features = @{$ext_adaptor->fetch_all_by_Slice($slice)}; + Description: Retrieves all features which lie in the region defined + by $slice in slice coordinates. + + If this method is overridden then the coordinate_systems method + must return 'SLICE' as one of its values. + + This method will work as is (i.e. without overriding it) + providing at least one of the other fetch methods is overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the Slice + coordinate system + Exceptions : Thrown on incorrect input arguments + Caller : general, fetch_all_by_chr_start_end + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice) = @_; + + unless($slice && ref $slice && $slice->isa('Bio::EnsEMBL::Slice')) { + throw("[$slice] is not a Bio::EnsEMBL::Slice"); + } + + my $out = []; + + my $csa = $self->ensembl_db->get_CoordSystemAdaptor(); + + my $slice_start = $slice->start; + my $slice_end = $slice->end; + my $slice_strand = $slice->strand; + my $slice_seq_region = $slice->seq_region_name; + my $slice_seq_region_id = $slice->get_seq_region_id; + my $coord_system = $slice->coord_system; + + if($self->_supported('SLICE')) { + throw("ExternalFeatureAdaptor supports SLICE coordinate system" . + " but fetch_all_by_Slice not implemented"); + } + + my %features; + my $from_coord_system; + + my $fetch_method; + + # + # Get all of the features from whatever coord system they are computed in + # + if($self->_supported('CLONE')) { + $fetch_method = sub { + my $self = shift; + my $name = shift; + my ($acc, $ver) = split(/\./, $name); + $self->fetch_all_by_clone_accession($acc,$ver,@_); + }; + $from_coord_system = $csa->fetch_by_name('clone'); + } elsif($self->_supported('ASSEMBLY')) { + $from_coord_system = $csa->fetch_by_name('chromosome'); + $fetch_method = $self->can('fetch_all_by_chr_start_end'); + } elsif($self->_supported('CONTIG')) { + $from_coord_system = $csa->fetch_by_name('contig'); + $fetch_method = $self->can('fetch_all_by_contig_name'); + } elsif($self->_supported('SUPERCONTIG')) { + $from_coord_system = $csa->fetch_by_name('supercontig'); + $fetch_method = $self->can('fetch_all_by_supercontig_name'); + } else { + $self->_no_valid_coord_systems(); + } + + if($from_coord_system->equals($coord_system)) { + $features{$slice_seq_region} = &$fetch_method($self, $slice_seq_region, + $slice_start,$slice_end); + } else { + foreach my $segment (@{$slice->project($from_coord_system->name, + $from_coord_system->version)}) { + my ($start,$end,$pslice) = @$segment; + $features{$pslice->seq_region_name } ||= []; + push @{$features{$pslice->seq_region_name }}, + @{&$fetch_method($self, $pslice->seq_region_name, + $pslice->start(), + $pslice->end())}; + } + } + + my @out; + + if(!$coord_system->equals($from_coord_system)) { + my $asma = $self->ensembl_db->get_AssemblyMapperAdaptor(); + my $mapper = $asma->fetch_by_CoordSystems($from_coord_system, + $coord_system); + my %slice_cache; + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + my $slice_setter; + + #convert the coordinates of each of the features retrieved + foreach my $fseq_region (keys %features) { + my $feats = $features{$fseq_region}; + next if(!$feats); + $slice_setter = _guess_slice_setter($feats) if(!$slice_setter); + + foreach my $f (@$feats) { + my($sr_id, $start, $end, $strand) = + $mapper->fastmap($fseq_region,$f->start,$f->end,$f->strand, + $from_coord_system); + + #maps to gap + next if(!defined($sr_id)); + + #maps to unexpected seq region, probably error in the externally + if($sr_id ne $slice_seq_region_id) { + warning("Externally created Feature mapped to [$sr_id] " . + "which is not on requested seq_region_id [$slice_seq_region_id]"); + next; + } + + #update the coordinates of the feature + &$slice_setter($f,$slice); + $f->start($start); + $f->end($end); + $f->strand($strand); + push @out, $f; + } + } + } else { + #we already know the seqregion the featues are on, we just have + #to put them on the slice + @out = @{$features{$slice_seq_region}}; + my $slice_setter = _guess_slice_setter(\@out); + + foreach my $f (@out) { + &$slice_setter($f,$slice); + } + } + + #shift the feature coordinates again if + #the requested slice is not the full seqregion + if($slice->start != 1 || $slice->strand != 1) { + #convert from assembly coords to slice coords + my($f_start, $f_end, $f_strand); + foreach my $f (@out) { + if($slice_strand == 1) { + $f_start = $f->start - $slice_start + 1; + $f_end = $f->end - $slice_start + 1; + $f_strand = $f->strand; + } else { + $f_start = $slice_end - $f->end + 1; + $f_end = $slice_end - $f->start + 1; + $f_strand = $f->strand * -1; + } + + $f->start($f_start); + $f->end($f_end); + $f->strand($f_strand); + } + } + + return \@out; +} + + +sub _guess_slice_setter { + my $features = shift; + + #we do not know what type of features these are. They might + #be bioperl features or old ensembl features, hopefully they are new + #style features. Try to come up with a setter method for the + #slice. + + return undef if(!@$features); + + my ($f) = @$features; + + my $slice_setter; + foreach my $method (qw(slice contig attach_seq)) { + last if($slice_setter = $f->can($method)); + } + + if(!$slice_setter) { + if($f->can('seqname')) { + $slice_setter = sub { $_[0]->seqname($_[1]->seq_region_name()); }; + } else { + $slice_setter = sub{} if(!$slice_setter); + } + } + + return $slice_setter; +} + + +=head2 fetch_all_by_contig_name + + Arg [1] : string $contig_name + Arg [2] : int $start (optional) + The start of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Arg [3] : int $end (optional) + The end of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Example : @fs = @{$self->fetch_all_by_contig_name('AB00879.1.1.39436')}; + Description: Retrieves features on the contig defined by the name + $contig_name in contig coordinates. + + If this method is overridden then the coordinate_systems + method must return 'CONTIG' as one of its values. + + This method will work as is (i.e. without being overridden) + providing at least one other fetch method has + been overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the contig + coordinate system. + Exceptions : thrown if the input argument is incorrect + thrown if the coordinate_systems method returns the value + 'CONTIG' and this method has not been overridden. + Caller : general, fetch_all_by_Slice + +=cut + +sub fetch_all_by_contig_name { + my ($self, $contig_name, $start, $end) = @_; + + unless($contig_name) { + throw("contig_name argument not defined"); + } + + if($self->_supported('CONTIG')) { + throw("ExternalFeatureAdaptor supports CONTIG coordinate system" . + " but fetch_all_by_contig_name is not implemented"); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region('contig', $contig_name, + $start, $end); + return $self->fetch_all_by_Slice($slice); +} + + + +=head2 fetch_all_by_supercontig_name + + Arg [1] : string $supercontig_name + Arg [2] : int $start (optional) + The start of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Arg [3] : int $end (optional) + The end of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Example : @fs = @{$self->fetch_all_by_contig_name('NT_004321')}; + Description: Retrieves features on the contig defined by the name + $supercontigname in supercontig coordinates. + + If this method is overridden then the coordinate_systems + method must return 'SUPERCONTIG' as one of its values. + + This method will work as is (i.e. without being overridden) + providing at least one other fetch method has + been overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the contig + coordinate system. + Exceptions : thrown if the input argument is incorrect + thrown if the coordinate_systems method returns the value + 'SUPERCONTIG' and this method has not been overridden. + Caller : general, fetch_all_by_Slice + +=cut + + +sub fetch_all_by_supercontig_name { + my ($self, $supercontig_name, $start, $end) = @_; + + unless($supercontig_name) { + throw("supercontig_name argument not defined"); + } + + if($self->_supported('SUPERCONTIG')) { + throw("ExternalFeatureAdaptor supports SUPERCONTIG coordinate system" . + " but fetch_all_by_supercontig_name is not implemented"); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region('supercontig', $supercontig_name, + $start, $end); + return $self->fetch_all_by_Slice($slice); +} + + +=head2 fetch_all_by_clone_accession + + Arg [1] : string $acc + The EMBL accession number of the clone to fetch features from. + Arg [2] : (optional) string $ver + Arg [3] : (optional) int $start + Arg [4] : (optional) int $end + + Example : @fs = @{$self->fetch_all_by_clone_accession('AC000093')}; + Description: Retrieves features on the clone defined by the $acc arg in + Clone coordinates. + + If this method is overridden then the coordinate_systems method + must return 'CLONE' as one of its values. The arguments + start, end, version are passed if this method is overridden and + can optionally be used to reduce the scope of the query and + improve performance. + + This method will work as is - providing at least one other + fetch method has been overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the Clone + coordinate system + Exceptions : thrown if the input argument is incorrect + thrown if the coordinate system method returns the value 'CLONE' + and this method is not overridden. + thrown if the coordinate systems method does not return any + valid values. + Caller : general, fetch_all_by_clone_accession + +=cut + +sub fetch_all_by_clone_accession { + my ($self, $acc, $version, $start, $end) = @_; + + unless($acc) { + throw("clone accession argument not defined"); + } + + if($self->_supported('CLONE')) { + throw('ExternalFeatureAdaptor supports CLONE coordinate system ' . + 'but does not implement fetch_all_by_clone_accession'); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + if(defined($version)) { + $acc = "$acc.$version"; + } elsif(!$acc =~ /\./) { + $acc = "$acc.1"; + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor; + + my $slice = $slice_adaptor->fetch_by_region('clone', $acc, $start, $end); + + return $self->fetch_all_by_Slice($slice); +} + + + +=head2 fetch_all_by_chr_start_end + + Arg [1] : string $chr_name + The name of the chromosome to retrieve features from + Arg [2] : int $start + The start coordinate of the chromosomal region to retrieve + features from. + Arg [3] : int $end + The end coordinate of the chromosomal region to retrieve + features from. + Example : @features + Description: Retrieves features on the region defined by the $chr_name, + $start, and $end args in assembly (chromosomal) coordinates. + + If this method is overridden then the coordinate_systems method + must return 'ASSEMBLY' as one of its values. + + This method will work as is (i.e. without overriding it) + providing at least one of the other fetch methods is overridden. + Returntype : reference to a list of Bio::SeqFeatures + Exceptions : Thrown if the coordinate_systems method returns ASSEMBLY as a + value and this method is not overridden. + Thrown if any of the input arguments are incorrect + Caller : general, fetch_all_by_Slice + +=cut + +sub fetch_all_by_chr_start_end { + my ($self, $chr_name, $start, $end) = @_; + + unless($chr_name && defined $start && defined $end && $start < $end) { + throw("Incorrect start [$start] end [$end] or chr [$chr_name] arg"); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + + my $slice = $slice_adaptor->fetch_by_region('toplevel', $chr_name, $start, + $end); + + return $self->fetch_all_by_Slice($slice); +} + + +=head2 _no_valid_coord_system + + Arg [1] : none + Example : none + Description: PRIVATE method - throws an error with a descriptive message + Returntype : none + Exceptions : always thrown + Caller : internal + +=cut + +sub _no_valid_coord_system { + my $self = shift; + + throw("This ExternalFeatureAdaptor does not support a known " . + "coordinate system.\n Valid coordinate systems are: " . + "[SLICE,ASSEMBLY,SUPERCONTIG,CONTIG,CLONE].\n This External Adaptor " . + "supports: [" . join(', ', $self->coordinate_systems) . "]"); +} + + + + +=head2 _supported + + Arg [1] : string $system + Example : print "CONTIG system supported" if($self->_supported('CONTIG')); + Description: PRIVATE method. Tests if the coordinate system defined by + the $system argument is implemented. + Returntype : boolean + Exceptions : none + Caller : internal + +=cut + +sub _supported { + my ($self, $system) = @_; + + #construct the hash of supported features if it has not been already + unless(exists $self->{_supported}) { + $self->{_supported} = {}; + foreach my $coord_system ($self->coordinate_systems) { + $self->{_supported}->{$coord_system} = 1; + } + } + + return $self->{_supported}->{$system}; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Feature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Feature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1582 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Feature - Ensembl specific sequence feature. + +=head1 SYNOPSIS + + my $feat = new Bio::EnsEMBL::Feature( + -start => 100, + -end => 220, + -strand => -1, + -slice => $slice -analysis => $analysis + ); + + my $start = $feat->start(); + my $end = $feat->end(); + my $strand = $feat->strand(); + + # Move the feature to the chromosomal coordinate system + $feature = $feature->transform('chromosome'); + + # Move the feature to a different slice (possibly on another coord + # system) + $feature = $feature->transfer($new_slice); + + # Project the feature onto another coordinate system possibly across + # boundaries: + @projection = @{ $feature->project('contig') }; + + # Change the start, end, and strand of the feature in place + $feature->move( $new_start, $new_end, $new_strand ); + +=head1 DESCRIPTION + +This is the Base feature class from which all Ensembl features inherit. +It provides a bare minimum functionality that all features require. It +basically describes a location on a sequence in an arbitrary coordinate +system. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Feature; + +use strict; +use warnings; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); +use Bio::EnsEMBL::Utils::Scalar qw(check_ref assert_ref); +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::StrainSlice; +use vars qw(@ISA); + +use Scalar::Util qw(weaken isweak); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [-SLICE]: Bio::EnsEMBL::SLice - Represents the sequence that this + feature is on. The coordinates of the created feature are + relative to the start of the slice. + Arg [-START]: The start coordinate of this feature relative to the start + of the slice it is sitting on. Coordinates start at 1 and + are inclusive. + Arg [-END] : The end coordinate of this feature relative to the start of + the slice it is sitting on. Coordinates start at 1 and are + inclusive. + Arg [-STRAND]: The orientation of this feature. Valid values are 1,-1,0. + Arg [-SEQNAME] : A seqname to be used instead of the default name of the + of the slice. Useful for features that do not have an + attached slice such as protein features. + Arg [-dbID] : (optional) internal database id + Arg [-ADAPTOR]: (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor + Example : $feature = Bio::EnsEMBL::Feature->new(-start => 1, + -end => 100, + -strand => 1, + -slice => $slice, + -analysis => $analysis); + Description: Constructs a new Bio::EnsEMBL::Feature. Generally subclasses + of this method are instantiated, rather than this class itself. + Returntype : Bio::EnsEMBL::Feature + Exceptions : Thrown on invalid -SLICE, -ANALYSIS, -STRAND ,-ADAPTOR arguments + Caller : general, subclass constructors + Status : Stable + +=cut + + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my ( $start, $end, $strand, $slice, $analysis,$seqname, $dbID, $adaptor ) = + rearrange(['START','END','STRAND','SLICE','ANALYSIS', 'SEQNAME', + 'DBID', 'ADAPTOR'], @_); + if($slice) { + if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw('-SLICE argument must be a Bio::EnsEMBL::Slice not '.$slice); + } + } + + if($analysis) { + if(!ref($analysis) || !$analysis->isa('Bio::EnsEMBL::Analysis')) { + throw('-ANALYSIS argument must be a Bio::EnsEMBL::Analysis not '. + $analysis); + } + } + + if(defined($strand)) { + if(!($strand == 1) && !($strand == -1) && !($strand == 0)) { + throw('-STRAND argument must be 1, -1, or 0'); + } + } + + if(defined($start) && defined($end)) { + if (($start =~ /\d+/) && ($end =~ /\d+/)) { + if($end+1 < $start) { + throw(sprintf('Start (%d) must be less than or equal to end+1 (%d)', $start, ($end+1))); + } + } else { + throw('Start and end must be integers'); + } + } + + my $self = bless({'start' => $start, + 'end' => $end, + 'strand' => $strand, + 'slice' => $slice, + 'analysis' => $analysis, + 'seqname' => $seqname, + 'dbID' => $dbID}, $class); + + $self->adaptor($adaptor); + return $self; +} + + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Feature using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Feature + Caller : general, subclass constructors + Status : Stable + +=cut + + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +=head2 start + + Arg [1] : (optional) int $start + The start of this feature relative to the start of the slice + that it is on. + Example : $start = $feat->start() + Description: Getter/Setter for the start of this feature relative to the + start of the slice it is on. Note that negative values, or + values exceeding the length of the slice are permitted. + Start must be less than or equal to the end regardless of the + strand. Coordinate values start at 1 and are inclusive. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub start { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'start'} = $value; + } + + return $self->{'start'}; +} + + + +=head2 end + + Arg [1] : (optional) int $end + Example : $end = $feat->end(); + Description: Getter/Setter for the end of this feature relative to the + start of the slice that it is on. Note that negative values, + of values exceeding the length of the slice are permitted. End + must be greater than or equal to start regardless of the strand. + Coordinate values start at 1 and are inclusive. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub end { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'end'} = $value; + } + + return $self->{'end'}; +} + + + + +=head2 strand + + Arg [1] : (optional) int $strand + Example : $feat->strand(-1); + Description: Getter/Setter for the strand of this feature relative to the + slice it is on. 0 is an unknown or non-applicable strand. + -1 is the reverse (negative) strand and 1 is the forward + (positive) strand. No other values are permitted. + Returntype : int + Exceptions : thrown if an invalid strand argument is passed + Caller : general + Status : Stable + +=cut + +sub strand { + my ( $self, $strand ) = @_; + + if ( defined($strand) ) { + if ( $strand != 0 && $strand != 1 && $strand != -1 ) { + throw('strand argument must be 0, -1 or 1'); + } + + $self->{'strand'} = $strand; + } + + return $self->{'strand'}; +} + +=head2 move + + Arg [1] : int start + Arg [2] : int end + Arg [3] : (optional) int strand + Example : None + Description: Sets the start, end and strand in one call rather than in + 3 seperate calls to the start(), end() and strand() methods. + This is for convenience and for speed when this needs to be + done within a tight loop. + Returntype : none + Exceptions : Thrown is invalid arguments are provided + Caller : general + Status : Stable + +=cut + +sub move { + my $self = shift; + + throw('start and end arguments are required') if(@_ < 2); + + my $start = shift; + my $end = shift; + my $strand = shift; + + if(defined($start) && defined($end) && $end < $start) { + throw('start must be less than or equal to end'); + } + if(defined($strand) && $strand != 0 && $strand != -1 && $strand != 1) { + throw('strand must be 0, -1 or 1'); + } + + $self->{'start'} = $start; + $self->{'end'} = $end; + $self->{'strand'} = $strand if(defined($strand)); +} + + + +=head2 length + + Arg [1] : none + Example : $length = $feat->length(); + Description: Returns the length of this feature + Returntype : Integer + Exceptions : Throws if end < start and the feature is not on a + circular slice + Caller : general + Status : Stable + +=cut + +sub length { + my ($self) = @_; + + if ( $self->{'end'} < $self->{'start'} ) { + # if circular, we can work out the length of an origin-spanning + # feature using the size of the underlying region. + if ( $self->slice() && $self->slice()->is_circular() ) { + my $len = + $self->slice()->seq_region_length() - + ( $self->{'start'} - $self->{'end'} ) + 1; + return $len; + } else { + throw( "Cannot determine length of non-circular feature " + . "where start > end" ); + } + } + + return $self->{'end'} - $self->{'start'} + 1; +} + +=head2 analysis + + Arg [1] : (optional) Bio::EnsEMBL::Analysis $analysis + Example : $feature->analysis(new Bio::EnsEMBL::Analysis(...)) + Description: Getter/Setter for the analysis that is associated with + this feature. The analysis describes how this feature + was derived. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : thrown if an invalid argument is passed + Caller : general + Status : Stable + +=cut + +sub analysis { + my $self = shift; + + if(@_) { + my $an = shift; + if(defined($an) && (!ref($an) || !$an->isa('Bio::EnsEMBL::Analysis'))) { + throw('analysis argument must be a Bio::EnsEMBL::Analysis'); + } + $self->{'analysis'} = $an; + } + + return $self->{'analysis'}; +} + + + +=head2 slice + + Arg [1] : (optional) Bio::EnsEMBL::Slice $slice + Example : $seqname = $feature->slice()->name(); + Description: Getter/Setter for the Slice that is associated with this + feature. The slice represents the underlying sequence that this + feature is on. Note that this method call is analagous to the + old SeqFeature methods contig(), entire_seq(), attach_seq(), + etc. + Returntype : Bio::EnsEMBL::Slice + Exceptions : thrown if an invalid argument is passed + Caller : general + Status : Stable + +=cut + +sub slice { + my ( $self, $slice ) = @_; + + if ( defined($slice) ) { + if ( !check_ref( $slice, 'Bio::EnsEMBL::Slice' ) + && !check_ref( $slice, 'Bio::EnsEMBL::LRGSlice' ) ) + { + throw('slice argument must be a Bio::EnsEMBL::Slice'); + } + + $self->{'slice'} = $slice; + } elsif ( @_ > 1 ) { + delete($self->{'slice'}); + } + + return $self->{'slice'}; +} + +=head2 equals + + Arg [1] : Bio::EnsEMBL::Feature object + Example : if ($featureA->equals($featureB)) { ... } + Description : Compares two features using various criteria. The + test for eqality goes through the following list and + terminates at the first true match: + + 1. If the two features are the same object, they are + equal. + 2. If they are of different types (e.g., transcript + and gene), they are *not* equal. + 3. If they both have dbIDs: if these are the same, + then they are equal, otherwise not. + 4. If they both have slices and analysis objects: + if the analysis dbIDs are the same and the + seq_region_id are the same, along with + seq_region_start and seq_region_end, then they are + equal, otherwise not. + + If none of the above is able to determine equality, + undef is returned. + + Return type : tri-Boolean (0, 1, undef = "unknown") + + Exceptions : Thrown if a non-feature is passed as the argument. + +=cut + +sub equals { + my ( $self, $feature ) = @_; + + # If the features are the same object, they are equal. + if ( !defined($feature) ) { return 0 } + if ( $self eq $feature ) { return 1 } + + assert_ref( $feature, 'Bio::EnsEMBL::Feature' ); + + # If the features have different types, they are *not* equal. + if ( ref($self) ne ref($feature) ) { + return 0; + } + + # If the features has the same dbID, they are equal. + if ( defined( $self->dbID() ) && defined( $feature->dbID() ) ) { + if ( $self->dbID() == $feature->dbID() ) { return 1 } + else { return 0 } + } + + # We now know that one of the features do not have a dbID. + + # If the features have the same start, end, strand and seq_region_id, + # and analysis_id, they are equal. + if ( + ( defined( $self->analysis() ) && defined( $feature->analysis() ) ) + && ( defined( $self->slice() ) && defined( $feature->slice() ) ) ) + { + if ( ( $self->start() == $feature->start() ) && + ( $self->end() == $feature->end() ) && + ( $self->strand() == $feature->strand() ) && + ( $self->slice()->get_seq_region_id() == + $feature->slice()->get_seq_region_id() ) && + ( $self->analysis()->dbID() == $feature->analysis()->dbID() ) ) + { + return 1; + } + else { return 0 } + } + + # We now know that one of the features does not have either analysis + # or slice. + + # We don't know if the features are equal. This happens if they are + # not the same object but are of the same type, and one of them lacks + # dbID, and if there aren't slice and analysis objects attached to + # them both. + return undef; +} ## end sub equals + + +=head2 transform + + Arg [1] : string $coord_system + The coord system to transform this feature to. + Arg [2] : string $version (optional) + The version of the coord system to transform this feature to. + Example : $feature = $feature->transform('contig'); + next if(!defined($feature)); + Description: Returns a copy of this feature, but converted to a different + coordinate system. The converted feature will be placed on a + slice which spans an entire sequence region of the new + coordinate system. If the requested coordinate system is the + same coordinate system it is simply placed on a slice which + spans the entire seq_region (as opposed to the original slice + which may have only partially covered the seq_region). + + If a feature spans a boundary in the new coordinate system, + undef is returned instead. + + For example, transforming an exon in contig coordinates to one + in chromosomal coodinates will place the exon on a slice of an + entire chromosome. + Returntype : Bio::EnsEMBL::Feature (or undef) + Exceptions : thrown if an invalid coordinate system is provided + warning if Feature is not attached to a slice + Caller : general, transfer() + Status : Stable + +=cut + +sub transform { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift; + my $to_slice = shift; + + # + # For backwards compatibility check if the arguments are old style args + # + if(!$cs_name || ref($cs_name)) { + deprecate('Calling transform without a coord system name is deprecated.'); + return $self->_deprecated_transform($cs_name); + } + + my $slice = $self->{'slice'}; + + if(!$slice) { + warning("Feature cannot be transformed without attached slice."); + return undef; + } + + if(!$slice->adaptor()) { + warning("Feature cannot be transformed without adaptor on" . + " attached slice."); + return undef; + } + + #use db from slice since this feature may not yet be stored in a database + my $db = $slice->adaptor->db(); + my $cs = $db->get_CoordSystemAdaptor->fetch_by_name($cs_name, $cs_version); + my $current_cs = $slice->coord_system(); + + if(!$current_cs) { + warning("Feature cannot be transformed without CoordSystem on " . + "attached slice."); + return undef; + } + + if(!$cs) { + throw("Cannot transform to unknown coordinate system " . + "[$cs_name $cs_version]\n"); + } + + # if feature is already in the requested coordinate system, we can just + # return a copy + if( $cs->equals( $current_cs ) && $slice->start() == 1 && + $slice->strand() == 1 ) { + my $new_feature; + %$new_feature = %$self; + bless $new_feature, ref $self; + return $new_feature; + } + my $projection; + if(defined($to_slice)){ + $projection = $self->project_to_slice( $to_slice ); } + else{ + $projection = $self->project( $cs_name, $cs_version ); + } + + if(@$projection == 0){ + return undef; + } + if( @$projection != 1 and !defined($to_slice)) { +# warn "MORE than one projection and NO slice specified "; +# warn "from ".$self->slice->name." to $cs_name, $cs_version\n"; + return undef; + } + my $index = 0; + if(defined($to_slice)){ + my $found = 0; + my $i = 0; + foreach my $proj (@{$projection}) { + my $slice = $proj->[2]; + if($to_slice->get_seq_region_id eq $slice->get_seq_region_id){ + $found =1; + $index = $i; + } + $i++; + } + if(!$found){ + if(@$projection != 1){ + if(@$projection == 0){ + warn "number of mappings is ".@$projection."\n"; + warn "could not project feature ".ref($self)." from ".$self->slice->seq_region_name." to ".$to_slice->seq_region_name."\n"; + warn "In the region of ".$self->slice->start." <-> ".$self->slice->end."\n"; + warn "feat start=".($self->slice->start+$self->start)."\tend=".($self->slice->start+$self->end)."\n"; + } + else{ + foreach my $proj (@{$projection}) { + my $slice = $proj->[2]; + warn "available slice ".$slice->seq_regon_name."\n"; + } + warn "MORE than one projection and none to slice specified (".$to_slice->seq_region_name.")\n"; + } + } + else { + foreach my $proj (@{$projection}) { + warn "Mapping is to ".$proj->[2]->seq_region_name."\n"; + } + warn "One projection but none to slice specified\n"; + } + return undef; + } + } + + my $p_slice = $projection->[$index]->[2]; + my $slice_adaptor = $db->get_SliceAdaptor; + $slice = $slice_adaptor->fetch_by_region($p_slice->coord_system()->name(), + $p_slice->seq_region_name(), + undef, #start + undef, #end + 1, #strand + $p_slice->coord_system()->version); + + my $new_feature; + %$new_feature = %$self; + bless $new_feature, ref $self; + $new_feature->{'start'} = $p_slice->start(); + $new_feature->{'end'} = $p_slice->end(); + $new_feature->{'strand'} = + ($self->{'strand'} == 0) ? 0 : $p_slice->strand(); + $new_feature->{'slice'} = $slice; + return $new_feature; + +} + + + +=head2 transfer + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to transfer this feature to + Example : $feature = $feature->transfer($slice); + next if(!defined($feature)); + Description: Returns a copy of this feature which has been shifted onto + another slice. + + If the new slice is in a different coordinate system the + feature is transformed first and then placed on the slice. + If the feature would be split across a coordinate system + boundary or mapped to a gap undef is returned instead. + + If the feature cannot be placed on the provided slice because + it maps to an entirely different location, undef is returned + instead. + + Returntype : Bio::EnsEMBL::Feature (or undef) + Exceptions : throw on incorrect argument + throw if feature does not have attached slice + Caller : general, transform() + Status : Stable + +=cut + + +sub transfer { + my $self = shift; + my $slice = shift; + + if(!$slice || !ref($slice) || (!$slice->isa('Bio::EnsEMBL::Slice') && !$slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Slice argument is required'); + } + + #make a shallow copy of the feature to be transfered + my $feature; + %{$feature} = %{$self}; + bless $feature, ref($self); + weaken $feature->{adaptor}; + + my $current_slice = $self->{'slice'}; + + if(!$current_slice) { + warning("Feature cannot be transfered without attached slice."); + return undef; + } + + my $cur_cs = $current_slice->coord_system(); + my $dest_cs = $slice->coord_system(); + + #if we are not in the same coord system a transformation step is needed first + if(!$dest_cs->equals($cur_cs)) { + $feature = $feature->transform($dest_cs->name, $dest_cs->version, $slice); + return undef if(!defined($feature)); + $current_slice = $feature->{'slice'}; + } + + # feature went to entirely different seq_region + if($current_slice->seq_region_name() ne $slice->seq_region_name()) { + return undef; + } + + #if the current feature positions are not relative to the start of the + #seq region, convert them so they are + my $cur_slice_start = $current_slice->start(); + my $cur_slice_strand = $current_slice->strand(); + if($cur_slice_start != 1 || $cur_slice_strand != 1) { + my $fstart = $feature->{'start'}; + my $fend = $feature->{'end'}; + + if($cur_slice_strand == 1) { + $feature->{'start'} = $fstart + $cur_slice_start - 1; + $feature->{'end'} = $fend + $cur_slice_start - 1; + } else { + my $cur_slice_end = $current_slice->end(); + $feature->{'start'} = $cur_slice_end - $fend + 1; + $feature->{'end'} = $cur_slice_end - $fstart + 1; + $feature->{'strand'} *= -1; + } + } + + my $fstart = $feature->{'start'}; + my $fend = $feature->{'end'}; + + #convert to destination slice coords + if($slice->strand == 1) { + $feature->{'start'} = $fstart - $slice->start() + 1; + $feature->{'end'} = $fend - $slice->start() + 1; + } else { + $feature->{'start'} = $slice->end() - $fend + 1; + $feature->{'end'} = $slice->end() - $fstart + 1; + $feature->{'strand'} *= -1; + } + + $feature->{'slice'} = $slice; + + return $feature; +} + +=head2 project_to_slice + + Arg [1] : slice to project to + + + Example : + my $clone_projection = $feature->project_to_slice($slice); + + foreach my $seg (@$clone_projection) { + my $clone = $seg->to_Slice(); + print "Features current coords ", $seg->from_start, '-', + $seg->from_end, " project onto clone coords " . + $clone->seq_region_name, ':', $clone->start, '-', $clone->end, + $clone->strand, "\n"; + } + Description: Returns the results of 'projecting' this feature onto another + slice . This is useful to see where a feature + would lie in a coordinate system in which it + crosses a boundary. + + This method returns a reference to a list of + Bio::EnsEMBL::ProjectionSegment objects. + ProjectionSegments are blessed arrays and can also be used as + triplets [from_start,from_end,to_Slice]. The from_start and + from_end are the coordinates relative to the feature start. + For example, if a feature is current 100-200bp on a slice + then the triplets returned might be: + [1,50,$slice1], + [51,101,$slice2] + + The to_Slice is a slice spanning the region on the requested + coordinate system that this feature projected to. + + If the feature projects entirely into a gap then a reference to + an empty list is returned. + + Returntype : listref of Bio::EnsEMBL::ProjectionSegments + which can also be used as [$start,$end,$slice] triplets + Exceptions : slice does not have an adaptor + Caller : general + Status : At Risk + +=cut + +sub project_to_slice { + my $self = shift; + my $to_slice = shift; + my $slice = $self->{'slice'}; + + if(!$slice) { + warning("Feature cannot be projected without attached slice."); + return []; + } + + + #get an adaptor from the attached slice because this feature may not yet + #be stored and may not have its own adaptor + my $slice_adaptor = $slice->adaptor(); + + if(!$slice_adaptor) { + throw("Cannot project feature because associated slice does not have an " . + " adaptor"); + } + + my $strand = $self->strand() * $slice->strand(); + #fetch by feature always gives back forward strand slice: + $slice = $slice_adaptor->fetch_by_Feature($self); + $slice = $slice->invert if($strand == -1); + return $slice->project_to_slice($to_slice); +} + + +=head2 project + + Arg [1] : string $name + The name of the coordinate system to project this feature onto + Arg [2] : string $version (optional) + The version of the coordinate system (such as 'NCBI34') to + project this feature onto + Example : + my $clone_projection = $feature->project('clone'); + + foreach my $seg (@$clone_projection) { + my $clone = $seg->to_Slice(); + print "Features current coords ", $seg->from_start, '-', + $seg->from_end, " project onto clone coords " . + $clone->seq_region_name, ':', $clone->start, '-', $clone->end, + $clone->strand, "\n"; + } + Description: Returns the results of 'projecting' this feature onto another + coordinate system. This is useful to see where a feature + would lie in a coordinate system in which it + crosses a boundary. + + This method returns a reference to a list of + Bio::EnsEMBL::ProjectionSegment objects. + ProjectionSegments are blessed arrays and can also be used as + triplets [from_start,from_end,to_Slice]. The from_start and + from_end are the coordinates relative to the feature start. + For example, if a feature is current 100-200bp on a slice + then the triplets returned might be: + [1,50,$slice1], + [51,101,$slice2] + + The to_Slice is a slice spanning the region on the requested + coordinate system that this feature projected to. + + If the feature projects entirely into a gap then a reference to + an empty list is returned. + + Returntype : listref of Bio::EnsEMBL::ProjectionSegments + which can also be used as [$start,$end,$slice] triplets + Exceptions : slice does not have an adaptor + Caller : general + Status : Stable + +=cut + +sub project { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift; + + my $slice = $self->{'slice'}; + + if(!$slice) { + warning("Feature cannot be projected without attached slice."); + return []; + } + + + #get an adaptor from the attached slice because this feature may not yet + #be stored and may not have its own adaptor + my $slice_adaptor = $slice->adaptor(); + + if(!$slice_adaptor) { + throw("Cannot project feature because associated slice does not have an " . + " adaptor"); + } + + my $strand = $self->strand() * $slice->strand(); + #fetch by feature always gives back forward strand slice: + $slice = $slice_adaptor->fetch_by_Feature($self); + $slice = $slice->invert if($strand == -1); + return $slice->project($cs_name, $cs_version); +} + + + +=head2 seqname + + Arg [1] : (optional) $seqname + Example : $seqname = $feat->seqname(); + Description: Getter/Setter for the name of the sequence that this feature + is on. Normally you can get away with not setting this value + and it will default to the name of the slice on which this + feature is on. It is useful to set this value on features which + do not ordinarily sit on features such as ProteinFeatures which + sit on peptides. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seqname { + my $self = shift; + + if(@_) { + $self->{'seqname'} = shift; + } + + if(!$self->{'seqname'} && $self->slice()) { + return $self->slice->name(); + } + + return $self->{'seqname'}; +} + + + + +=head2 display_id + + Arg [1] : none + Example : print $f->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. It is overridden by subclasses to + return an appropriate value for objects of that particular + class. If no appropriate display id is available an empty + string is returned instead. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return ''; +} + + +=head2 feature_Slice + + Args : none + Example : $slice = $feature->feature_Slice() + Description: This is a convenience method to return a slice that covers the + Area of this feature. The feature start will be at 1 on it, and + it will have the length of this feature. + Returntype : Bio::EnsEMBL::Slice or undef if this feature has no attached + Slice. + Exceptions : warning if Feature does not have attached slice. + Caller : web drawing code + Status : Stable + +=cut + +sub feature_Slice { + my $self = shift; + + my $slice = $self->slice(); + + if(!$slice) { + warning('Cannot obtain Feature_Slice for feature without attached slice'); + return undef; + } + + if($slice->isa("Bio::EnsEMBL::StrainSlice")){ + return Bio::EnsEMBL::StrainSlice->new + (-seq_region_name => $slice->seq_region_name, + -seq_region_length => $slice->seq_region_length, + -coord_system => $slice->coord_system, + -start => $self->seq_region_start(), + -end => $self->seq_region_end(), + -strand => $self->seq_region_strand(), + -adaptor => $slice->adaptor(), + -strain_name => $slice->strain_name()); + } + else{ + return Bio::EnsEMBL::Slice->new + (-seq_region_name => $slice->seq_region_name, + -seq_region_length => $slice->seq_region_length, + -coord_system => $slice->coord_system, + -start => $self->seq_region_start(), + -end => $self->seq_region_end(), + -strand => $self->seq_region_strand(), + -adaptor => $slice->adaptor()); + } +} + + +=head2 seq_region_name + + Arg [1] : none + Example : print $feature->seq_region_name(); + Description: Gets the name of the seq_region which this feature is on. + Returns undef if this Feature is not on a slice. + Returntype : string or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq_region_name { + my $self = shift; + my $slice = $self->{'slice'}; + + return ($slice) ? $slice->seq_region_name() : undef; +} + + +=head2 seq_region_length + + Arg [1] : none + Example : print $feature->seq_region_length(); + Description: Returns the length of the seq_region which this feature is on + Returns undef if this Feature is not on a slice. + Returntype : int (unsigned) or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub seq_region_length { + my $self = shift; + my $slice = $self->{'slice'}; + + return ($slice) ? $slice->seq_region_length() : undef; +} + + +=head2 seq_region_strand + + Arg [1] : none + Example : print $feature->seq_region_strand(); + Description: Returns the strand of the seq_region which this feature is on + (i.e. feature_strand * slice_strand) + Returns undef if this Feature is not on a slice. + Returntype : 1,0,-1 or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub seq_region_strand { + my $self = shift; + my $slice = $self->{'slice'}; + + return ($slice) ? $slice->strand() * $self->{'strand'} : undef; +} + + +=head2 seq_region_start + + Arg [1] : none + Example : print $feature->seq_region_start(); + Description: Convenience method which returns the absolute start of this + feature on the seq_region, as opposed to the relative (slice) + position. + + Returns undef if this feature is not on a slice. + Returntype : int or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq_region_start { + my ($self) = @_; + + my $slice = $self->slice(); + + if ( defined($slice) ) { + my $start; + + if ( $slice->strand() == 1 ) { + if ( defined( $self->start() ) ) { + if ($self->start < 0 && $slice->is_circular) { + $start = $slice->seq_region_length + $self->start; + } else { + $start = $slice->start() + $self->start() - 1; + } + } + } else { + if ( defined( $self->end() ) ) { + $start = $slice->end() - $self->end() + 1; + } + } + + if ( defined($start) + && $slice->is_circular() + && $start > $slice->seq_region_length() ) + { + $start -= $slice->seq_region_length(); + } + + return $start; + } + + return undef; +} ## end sub seq_region_start + + +=head2 seq_region_end + + Arg [1] : none + Example : print $feature->seq_region_end(); + Description: Convenience method which returns the absolute end of this + feature on the seq_region, as opposed to the relative (slice) + position. + + Returns undef if this feature is not on a slice. + Returntype : int or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq_region_end { + my ($self) = @_; + + my $slice = $self->slice(); + + if ( defined($slice) ) { + my $end; + + if ( $slice->strand() == 1 ) { + if ( defined( $self->end() ) ) { + $end = $slice->start() + $self->end() - 1; + } + } else { + if ( defined( $self->start() ) ) { + $end = $slice->end() - $self->start() + 1; + } + } + + if ( defined($end) + && $slice->is_circular() + && $end > $slice->seq_region_length() ) + { + $end -= $slice->seq_region_length(); + } + + return $end; + } + + return undef; +} ## end sub seq_region_end + + +=head2 coord_system_name + + Arg [1] : none + Example : print $feature->coord_system_name() + Description: Gets the name of the coord_system which this feature is on. + Returns undef if this Feature is not on a slice. + Returntype : string or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coord_system_name { + my $self = shift; + my $slice = $self->{'slice'}; + return ($slice) ? $slice->coord_system_name() : undef; +} + + +=head2 seq + + Args : none + Example : my $dna_sequence = $simple_feature->seq(); + Description: Returns the dna sequence from the attached slice and + attached database that overlaps with this feature. + Returns undef if there is no slice or no database. + Returns undef if this feature is unstranded (i.e. strand=0). + Returntype : undef or string + Exceptions : warning if this feature is not stranded + Caller : general + Status : Stable + +=cut + + +sub seq { + my $self = shift; + + if( ! defined $self->{'slice'} ) { + return undef; + } + + if(!$self->strand()) { + warning("Cannot retrieve sequence for unstranded feature."); + return undef; + } + + return $self->{'slice'}->subseq($self->start(), $self->end(), + $self->strand()); + +} + + + + +=head2 get_all_alt_locations + + Arg [1] : none + Example : @features = @{$feature->get_all_alt_locations()}; + foreach $f (@features) { + print $f->slice->seq_region_name,' ',$f->start, $f->end,"\n"; + } + + Description: Retrieves shallow copies of this feature in its alternate + locations. A feature can be considered to have multiple + locations when it sits on a alternative structural haplotype + or when it is on a pseudo autosomal region. Most features will + just return a reference to an empty list though. + The features returned by this method will be on a slice which + covers the entire alternate region. + + Currently this method does not take into account alternate + locations on the alternate locations (e.g. a reference + sequence may have multiple alternate haplotypes. Asking + for alternate locations of a feature on one of the alternate + haplotypes will give you back the reference location, but not + locations on the other alternate haplotypes). + + Returntype : listref of features of the same type of this feature. + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_alt_locations { + my $self = shift; + my $return_all = shift || 0; + + my $slice = $self->{'slice'} or return []; + my $sa = $slice->adaptor() or return []; + + # get slice of entire region + $slice = $sa->fetch_by_seq_region_id($slice->get_seq_region_id); + + my $axfa = $sa->db->get_AssemblyExceptionFeatureAdaptor(); + my $axfs = $axfa->fetch_all_by_Slice($slice); + + my (@haps, @alt); + + foreach my $axf (@$axfs) { + if(uc($axf->type()) eq 'HAP') { + push @haps, $axf; + } elsif(uc($axf->type()) =~ 'PAR') { + push @alt, $axf; + } elsif( $axf->type() eq "PATCH_FIX"){ + push @haps, $axf; + } elsif( $axf->type() eq "PATCH_FIX REF"){ + push @haps, $axf if $return_all > 0 ; + } elsif( $axf->type() eq "HAP REF" ) { + push @haps, $axf if $return_all > 0 ; + # do nothing when you are on REF + } elsif( $axf->type() eq "PATCH_NOVEL"){ + push @haps, $axf; + }elsif( $axf->type() eq "PATCH_NOVEL REF"){ + push @haps, $axf if $return_all > 0 ; + } else { + warning("Unknown exception feature type ". $axf->type()."- ignoring."); + } + } + + # regions surrounding hap are those of interest, not hap itself + # convert hap alt. exc. features to regions around haps instead + foreach my $h (@haps) { + my $haslice = $h->alternate_slice(); + my $hacs = $haslice->coord_system(); + + if($h->start() > 1 && $haslice->start() > 1) { + my $aslice = $sa->fetch_by_region($hacs->name(), + $haslice->seq_region_name(), + 1, + $haslice->start()-1, + $haslice->strand(), + $hacs->version()); + + push @alt, Bio::EnsEMBL::AssemblyExceptionFeature->new + (-start => 1, + -end => $h->start()-1, + -alternate_slice => $aslice); + } + + if($h->end() < $slice->seq_region_length() && + $haslice->end < $haslice->seq_region_length()) { + my $aslice = $sa->fetch_by_region($hacs->name(), + $haslice->seq_region_name(), + $haslice->end()+1, + $haslice->seq_region_length(), + $haslice->strand(), + $hacs->version()); + + push @alt, Bio::EnsEMBL::AssemblyExceptionFeature->new + (-start => $h->end() + 1, + -end => $slice->seq_region_length(), + -alternate_slice => $aslice); + } + } + + + # check if exception regions contain our feature + + my @features; + + foreach my $axf (@alt) { + # ignore other region if feature is not entirely on it + next if($self->seq_region_start() < $axf->start() || + $self->seq_region_end() > $axf->end()); + + # quick shallow copy of the feature + my $f; + %$f = %$self; + bless $f, ref($self); + + my $aslice = $axf->alternate_slice(); + + # position feature on entire slice of other region + $f->{'start'} = $f->seq_region_start() - $axf->start() + $aslice->start(); + $f->{'end'} = $f->seq_region_end() - $axf->start() + $aslice->start(); + $f->{'strand'} *= $aslice->strand(); + + $f->{'slice'} = $sa->fetch_by_seq_region_id($aslice->get_seq_region_id()); + + push @features, $f; + } + + return \@features; +} + + +=head2 overlaps + + Arg [1] : Bio::EnsEMBL::Feature $f + The other feature you want to check overlap with this feature + for. + Description: This method does a range comparison of this features start and + end and compares it with another features start and end. It will + return true if these ranges overlap and the features are on the + same seq_region. + Returntype : TRUE if features overlap, FALSE if they don't + Exceptions : warning if features are on different seq_regions + Caller : general + Status : Stable + +=cut + +sub overlaps { + my $self = shift; + my $f = shift; + + my $sr1_name = $self->seq_region_name; + my $sr2_name = $f->seq_region_name; + + if ($sr1_name and $sr2_name and ($sr1_name ne $sr2_name)) { + warning("Bio::EnsEMBL::Feature->overlaps(): features are on different seq regions."); + return undef; + } + + return ($self->seq_region_end >= $f->seq_region_start and $self->seq_region_start <= $f->seq_region_end); +} + + +=head2 get_overlapping_Genes + + Description: Get all the genes that overlap this feature. + Returntype : list ref of Bio::EnsEMBL::Gene + Caller : general + Status : UnStable + +=cut + +sub get_overlapping_Genes{ + my $self = shift; + + my $slice = $self->feature_Slice; + return $slice->get_all_Genes(); +} + +# query for absolute nearest. +# select x.display_label, g.gene_id, g.seq_region_start, ABS(cast((32921638 - g.seq_region_end) as signed)) as 'dist' from gene g, xref x where g.display_xref_id = x.xref_id and seq_region_id = 27513 order by ABS(cast((32921638 - g.seq_region_end) as signed)) limit 10; + +=head2 get_nearest_Gene + + Description: Get all the nearest gene to the feature + Returntype : Bio::EnsEMBL::Gene + Caller : general + Status : UnStable + +=cut + +sub get_nearest_Gene { + my $self = shift; + my $stranded = shift; + my $stream = shift; + + my $ga = Bio::EnsEMBL::Registry->get_adaptor($self->adaptor->db->species,"core","Gene"); + + return $ga->fetch_nearest_Gene_by_Feature($self, $stranded, $stream); + +} + +=head2 summary_as_hash + + Example : $feature_summary = $feature->summary_as_hash(); + Description : Retrieves a textual summary of this Feature. + Should be overidden by subclasses for specific tweaking + Returns : hashref of arrays of descriptive strings + Status : Intended for internal use +=cut + +sub summary_as_hash { + my $self = shift; + my %summary; + $summary{'ID'} = $self->display_id; + $summary{'start'} = $self->seq_region_start; + $summary{'end'} = $self->seq_region_end; + $summary{'strand'} = $self->strand; + $summary{'seq_region_name'} = $self->seq_region_name; + return \%summary; +} + +=head2 species + + Example : $feature->species(); + Description : Shortcut to the feature's DBAdaptor and returns its species name + Returntype : String the species name + Exceptions : Thrown if there is no attached adaptor + Caller : Webcode + +=cut + +sub species { + my ($self) = @_; + throw "Can only call this method if you have attached an adaptor" if ! $self->adaptor(); + return $self->adaptor()->db()->species(); +} + + +############################################## +# Methods included for backwards compatibility +############################################## + + +=head2 contig + + This method is deprecated and included for backwards compatibility only. + Use slice() instead +=cut +sub contig { + deprecate('Use slice() instead'); + slice(@_); +} + + + +=head2 sub_SeqFeature + + This method is deprecated and only for genebuild backwards compatibility. + Avoid using it if possible +=cut +sub sub_SeqFeature{ + my ($self) = @_; + return @{$self->{'_gsf_sub_array'}} if($self->{'_gsf_sub_array'}); +} + +=head2 add_sub_SeqFeature + + This method is deprecated and only for genebuild backwards compatibility. + Avoid using it if possible +=cut +sub add_sub_SeqFeature{ + my ($self,$feat,$expand) = @_; + my ($p, $f, $l) = caller; + if( $expand eq 'EXPAND' ) { + # if this doesn't have start/end set - forget it! + if( ! $self->start && ! $self->end ) { + + $self->start($feat->start()); + $self->end($feat->end()); + $self->strand($feat->strand); + } else { + if( $feat->start < $self->start ) { + $self->start($feat->start); + } + + if( $feat->end > $self->end ) { + $self->end($feat->end); + } + } + } else { + if($self->start > $feat->start || $self->end < $feat->end) { + throw("$feat is not contained within parent feature, " . + "and expansion is not valid"); + } + } + + push(@{$self->{'_gsf_sub_array'}},$feat); +} + +=head2 flush_sub_SeqFeature + + This method is deprecated and only for genebuild backwards compatibility. + Avoid using it isf possible +=cut +sub flush_sub_SeqFeature { + my ($self) = @_; + $self->{'_gsf_sub_array'} = []; +} + + +sub _deprecated_transform { + my $self = shift; + my $arg = shift; + + if(!$arg) { + warning("Calling transform() with no arguments is deprecated.\n". + "A coordinate system name argument should be used instead.\n". + "You probably wanted transform('seqlevel') or transform('contig')."); + return $self->transform('seqlevel'); + } + + if(ref($arg) eq 'Bio::EnsEMBL::Slice') { + if($arg->{'empty'}) { + warning("Calling transform with an empty slice is deprecated.\n" . + "A coordinate system name argument should be used instead.\n". + "You probably wanted transform('chromosome') or " . + "transform('toplevel')"); + return $self->transform('toplevel'); + } + warning("Calling transform with a slice is deprecated.\n" . + "Use the transfer method instead"); + return $self->transfer($arg); + } + + warning("Calling transform with a [".ref($arg)."] arg is no longer " . + "(or never was) supported. Doing nothing instead."); + + return $self; +} + + +=head2 id + +This method is deprecated and only included for backwards compatibility. +Use display_id, hseqname, dbID or stable_id instead + +=cut + +sub id { + my $self = shift; + deprecate("id method is not used - use display_id instead"); + return $self->{'stable_id'} if($self->{'stable_id'}); + return $self->{'hseqname'} if($self->{'hseqname'}); + return $self->{'seqname'} if($self->{'seqname'}); + return $self->{'dbID'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/FeaturePair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/FeaturePair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,990 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::FeaturePair - Stores sequence Features which are +themselves hits to other sequence features. + +=head1 SYNOPSIS + + my $feat = Bio::EnsEMBL::FeaturePair->new( + -start => 132_231, + -end => 132_321, + -strand => -1, + -slice => $slice, + -hstart => 10, + -hend => 100, + -hstrand => 1, + -score => 100, + -percent_id => 92.0, + -hseqname => 'ALUSX10.1', + -analysis => $analysis + ); + + my $hit_start = $feat->hstart(); + my $hit_end = $feat->hend(); + my $hit_strand = $feat->hstrand(); + my $analysis = $feat->analysis(); + +=head1 DESCRIPTION + +A sequence feature object where the feature is itself a feature on +another sequence - e.g. a blast hit where residues 1-40 of a protein +sequence SW:HBA_HUMAN has hit to bases 100 - 220 on a genomic sequence +HS120G22. The genomic sequence coordinates are represented by the +start, end, strand attributes while the protein (hit) coordinates are +represented by the hstart, hend, hstrand attributes. + + $clone = $slice_adpator->fetch_by_region( 'clone', 'HS120G22' ); + + $fp = Bio::EnsEMBL::FeaturePair( + -start => 100, + -end => 220, + -strand => 1, + -slice => $clone, + -hstart => 1, + -hend => 40, + -hstrand => 1, + -percent_id => 92.0, + -score => 100, + -hseqname => 'SW:HBA_HUMAN', + -species => 'Homo sapiens', + -hspecies => 'Homo sapiens' + ); + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::FeaturePair; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +@ISA = qw(Bio::EnsEMBL::Feature); + + + +=head2 new + + Arg [HSTART] : int - The start of the hit region (optional) + Arg [HEND] : int - The end of the hit region (optional) + Arg [HSTRAND] : (0,1,-1) - The strand of the hit region (optional) + Arg [PERCENT_ID]: float - The precentage identity of the hit (optional) + Arg [SCORE] : float - The score of the hit (optional) + Arg [HSEQNAME] : string - The name of the hit sequence (optional) + Arg [P_VALUE] : float - The pvalue or evalue (optional) + Arg [SPECIES] : string - The species the query sequence is from (optional) + Arg [HSPECIES] : string - The species the hit sequence is from (optional) + Arg [COVERAGE] : string - The % of the query that this feature pair covers + Arg [HCOVERAGE] : string - The % of the target this this feature pair covers + Arg [...] : Named superclass constructor args (Bio::EnsEMBL::Feature) + Example : $feat = Bio::EnsEMBL::FeaturePair->new(-start => 132_231, + -end => 132_321, + -strand => -1, + -slice => $slice, + -hstart => 10, + -hend => 100, + -hstrand => 1, + -score => 100, + -percent_id => 92.0, + -hseqname => 'ALUSX10.1', + -analysis => $analysis); + Description: Creates a new Bio::EnsEMBL::FeaturePair object + Returntype : Bio::EnsEMBL::FeaturePair + Exceptions : throw if start > end + throw if invalid strand is provided + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($hstart,$hend,$hstrand,$percent_id,$score, $species, $hspecies, + $p_value, $hseqname, $f1,$f2, $coverage, $hcoverage, $group_id,$level_id, $external_db_id, $extra_data, $external_db_name, $external_display_db_name) = + rearrange(['HSTART','HEND','HSTRAND','PERCENT_ID','SCORE','SPECIES', + 'HSPECIES', 'P_VALUE', 'HSEQNAME', 'FEATURE1','FEATURE2', + 'COVERAGE', 'HCOVERAGE', 'GROUP_ID','LEVEL_ID', 'EXTERNAL_DB_ID', 'EXTRA_DATA', 'DBNAME', 'DB_DISPLAY_NAME'], @_); + + if(defined($hstart) && defined($hend) && ($hend < $hstart)) { + throw('HSTART must be less than or equal to HEND'); + } + + if(defined($hstrand) && $hstrand != 1 && $hstrand != -1 && $hstrand != 0) { + throw('HSTRAND must be one of (0,1,-1)'); + } + + $self->{'hstart'} = $hstart; + $self->{'hend'} = $hend; + $self->{'hstrand'} = $hstrand; + $self->{'score'} = $score; + $self->{'percent_id'} = $percent_id; + $self->{'species'} = $species; + $self->{'hspecies'} = $hspecies; + $self->{'hseqname'} = $hseqname; + $self->{'coverage'} = $coverage; + $self->{'hcoverage'} = $hcoverage; + $self->{'p_value'} = $p_value; + $self->{'group_id'} = $group_id; + $self->{'level_id'} = $level_id; + $self->{'external_db_id'} = $external_db_id; + $self->{'extra_data'} = $extra_data; + $self->{'dbname'} = $external_db_name; + $self->{'db_display_name'} = $external_display_db_name; + + # + # Feature1 and Feature2 arg handling for backwards compatibility + # + if($f1) { + deprecate("Using FEATURE1 arg to construct FeaturePairs" . + " is deprecated.\nUse the args START,END,STRAND,SLICE instead"); + + #eval because we are not exactly sure what f1 arg will look like + eval { + $self->{'start'} = $f1->start(); + $self->{'end'} = $f1->end(); + $self->{'strand'} = $f1->strand(); + $self->{'slice'} = $f1->contig(); + $self->{'analysis'} = $f1->analysis() if($f1->analysis()); + }; + } + + if($f2) { + deprecate("Using FEATURE2 arg to construct FeaturePairs is deprecated" . + "\nUse the args HSTART,HEND,HSTRAND,HSEQNAME instead"); + + #eval because we are not exactly sure what f2 arg will look like + eval { + $self->{'hseqname'} = $f2->seqname(); + $self->{'hstart'} = $f2->start(); + $self->{'hend'} = $f2->end(); + $self->{'hstrand'} = $f2->strand(); + $self->{'analysis'} = $f2->analysis() if($f2->analysis()); + }; + } + + return $self; +} + + + +=head2 hseqname + + Arg [1] : string $hseqname (optional) + Example : $hseqname = $fp->hseqname(); + Description: Getter/Setter for the name of the hit sequence + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hseqname { + my $self = shift; + $self->{'hseqname'} = shift if(@_); + return $self->{hseqname}; +} + + + +=head2 hstart + + Arg [1] : string $hstart (optional) + Example : $hstart = $fp->hstart(); + Description: Getter/Setter for the start coordinate on the hit sequence + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hstart{ + my $self = shift; + $self->{'hstart'} = shift if(@_); + return $self->{'hstart'}; +} + + +=head2 hend + + Arg [1] : string $hend (optional) + Example : $hend = $fp->hend(); + Description: Getter/Setter for the end coordinate on the hit sequence + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hend{ + my $self = shift; + $self->{'hend'} = shift if(@_); + return $self->{'hend'}; +} + + + +=head2 hstrand + + Arg [1] : int $hstrand (optional) + Example : $hstrand = $fp->hstrand + Description: Getter/Setter for the orientation of the hit on the hit sequence + Returntype : 0,1,-1 + Exceptions : thrown + Caller : general + Status : Stable + +=cut + +sub hstrand{ + my $self = shift; + + if(@_) { + my $hstrand = shift; + if(defined($hstrand) && $hstrand != 1 && $hstrand != 0 && $hstrand != -1) { + throw('hstrand must be one of (-1,0,1)'); + } + $self->{'hstrand'} = $hstrand; + } + + return $self->{'hstrand'}; +} + +=head2 hslice + + Arg [1] : (optional) Bio::EnsEMBL::Slice $slice + Example : $hseqname = $featurepair->hslice()->seq_region_name(); + Description: Getter/Setter for the Slice that is associated with this + hit feature. The slice represents the underlying sequence that this + feature is on. Note that this method call is analagous to the + old SeqFeature methods contig(), entire_seq(), attach_seq(), + etc. + Returntype : Bio::EnsEMBL::Slice + Exceptions : thrown if an invalid argument is passed + Caller : general + Status : Stable + +=cut + +sub hslice { + my $self = shift; + + if(@_) { + my $sl = shift; + if(defined($sl) && (!ref($sl) || !($sl->isa('Bio::EnsEMBL::Slice') ) )) { + throw('slice argument must be a Bio::EnsEMBL::Slice'); + } + + $self->{'hslice'} = $sl; + } + + return $self->{'hslice'}; +} + +=head2 hseq_region_name + + Arg [1] : none + Example : print $feature->hseq_region_name(); + Description: Gets the name of the hseq_region which this feature is on. + Returns undef if this Feature is not on a hslice. + Returntype : string or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hseq_region_name { + my $self = shift; + my $slice = $self->{'hslice'}; + + return ($slice) ? $slice->seq_region_name() : undef; +} + + +=head2 hseq_region_strand + + Arg [1] : none + Example : print $feature->hseq_region_strand(); + Description: Returns the strand of the hseq_region which this feature is on + (i.e. feature_strand * slice_strand) + Returns undef if this Feature is not on a hslice. + Returntype : 1,0,-1 or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub hseq_region_strand { + my $self = shift; + my $slice = $self->{'hslice'}; + + return ($slice) ? $slice->strand() * $self->{'hstrand'} : undef; +} + +=head2 hseq_region_start + + Arg [1] : none + Example : print $feature->hseq_region_start(); + Description: Convenience method which returns the absolute start of this + feature on the hseq_region, as opposed to the relative (hslice) + position. + + Returns undef if this feature is not on a hslice. + Returntype : int or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hseq_region_start { + my $self = shift; + my $slice = $self->{'hslice'}; + + return undef if(!$slice); + + if($slice->strand == 1) { + return undef if(!defined($self->{'hstart'})); + return $slice->start() + $self->{'hstart'} - 1; + } else { + return undef if(!defined($self->{'hend'})); + return $slice->end() - $self->{'hend'} + 1; + } +} + + +=head2 hseq_region_end + + Arg [1] : none + Example : print $feature->hseq_region_end(); + Description: Convenience method which returns the absolute end of this + feature on the hseq_region, as opposed to the relative (hslice) + position. + + Returns undef if this feature is not on a hslice. + Returntype : int or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hseq_region_end { + my $self = shift; + my $slice = $self->{'hslice'}; + + return undef if(!$slice); + + if($slice->strand == 1) { + return undef if(!defined($self->{'hend'})); + return $slice->start() + $self->{'hend'} - 1; + } else { + return undef if(!defined($self->{'hstart'})); + return $slice->end() - $self->{'hstart'} + 1; + } +} + +=head2 score + + Arg [1] : float $score (optional) + Example : $score = $fp->score(); + Description: Getter/Setter for the score of this feature pair + Returntype : float + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub score{ + my $self = shift; + $self->{'score'} = shift if(@_); + return $self->{'score'}; +} + + + +=head2 percent_id + + Arg [1] : float $percent_id (optional) + Example : $percent_id = $fp->percent_id(); + Description: Getter/Setter for the percentage identity of this feature pair + Returntype : float + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub percent_id { + my $self = shift; + $self->{'percent_id'} = shift if(@_); + return $self->{'percent_id'}; +} + + + +=head2 species + + Arg [1] : string $genus_species_name (optional) + e.g. Homo_sapiens or Mus_musculus + Example : $species = $fp->species(); + Description: get/set on the species of feature1 + Returntype : string + Execeptions: none + Caller : general + Status : Stable + +=cut + +sub species{ + my $self = shift; + $self->{'species'} = shift if(@_); + return $self->{'species'}; +} + + +=head2 hspecies + + Arg [1] : string $genus_species_name (optional) + e.g. Homo_sapiens or Mus_musculus + Example : $hspecies = $fp->hspecies + Description: get/set on the species of feature2 + Returntype : string + Execeptions: none + Caller : general + Status : Stable + +=cut + +sub hspecies{ + my $self = shift; + $self->{'hspecies'} = shift if(@_); + return $self->{'hspecies'}; +} + + +=head2 coverage + + Arg [1] : number (percentage) $coverage (optional) + Example : $cov = $fp->coverage(); + Description: Getter/Setter for the % of the query covered by the feature + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coverage { + my $self = shift; + $self->{'coverage'} = shift if(@_); + return $self->{'coverage'}; +} + + +=head2 hcoverage + + Arg [1] : number (percentage) $hcoverage (optional) + Example : $hcov = $fp->hcoverage(); + Description: Getter/Setter for the % of the target covered by the feature + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hcoverage { + my $self = shift; + $self->{'hcoverage'} = shift if(@_); + return $self->{'hcoverage'}; +} + +=head2 external_db_id + + Arg [1] : int $external_db_id (optional) + Example : $ex_db = $fp->external_db_id(); + Description: Getter/Setter for the external_db_id taregt source database feature + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub external_db_id { + my $self = shift; + $self->{'external_db_id'} = shift if(@_); + return $self->{'external_db_id'}; +} + + +=head2 db_name + + Arg [1] : string $external_db_name (optional) + Example : $ex_db_name = $fp->dbname(); + Description: Getter/Setter for the external_db_name attribute, name of external database + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub db_name { + my $self = shift; + $self->{'dbname'} = shift if(@_); + return $self->{'dbname'}; +} + +=head2 db_display_name + + Arg [1] : string $db_display_name (optional) + Example : $ex_db_display_name = $fp->db_display_name(); + Description: Getter/Setter for the db_display_name attribute + The preferred display name for the external database. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub db_display_name { + my $self = shift; + $self->{'db_display_name'} = shift if(@_); + return $self->{'db_display_name'}; +} + + + +=head2 p_value + + Arg [1] : float $p_value (optional) + Example : $eval = $fp->p_value + Description: Getter Setter for the evalue / pvalue of this feature + Returntype : float + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub p_value{ + my $self = shift; + $self->{'p_value'} = shift if(@_); + return $self->{'p_value'}; +} + + + +=head2 display_id + + Arg [1] : none + Example : print $fp->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For feature pairs this is the + hseqname if it is available otherwise it is an empty string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'hseqname'} || ''; +} + + +=head2 identical_matches + + Arg [1] : int $identical_matches (optional) + Example : + Description: get/set on the number of identical matches + Returntype : int + Execeptions: none + Caller : general + Status : Stable + +=cut + +sub identical_matches{ + my ($self,$arg) = @_; + + if (defined($arg)) { + return $self->{'_identical_matches'} = $arg; + } + return $self->{'_identical_matches'}; +} + +=head2 positive_matches + + Arg [1] : int $positive_matches (optional) + Example : + Description: get/set on the number of positive matches + Returntype : int + Execeptions: none + Caller : general + Status : Stable + +=cut + +sub positive_matches{ + my ($self,$arg) = @_; + + if (defined($arg)) { + return $self->{'_positive_matches'} = $arg; + } + return $self->{'_positive_matches'}; +} + +=head2 group_id + + Arg [1] : int $group_id + Example : none + Description: get/set for attribute group_id + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub group_id { + my ($self, $arg) = @_; + + if ( defined $arg ) { + $self->{'group_id'} = $arg ; + } + return $self->{'group_id'}; +} + +=head2 level_id + + Arg [1] : int $level_id + Example : none + Description: get/set for attribute level_id + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub level_id { + my ($self, $arg) = @_; + + if ( defined $arg ) { + $self->{'level_id'} = $arg ; + } + return $self->{'level_id'}; +} + + + + + + +=head1 DEPRECATED METHODS + +=cut + + +=head2 feature1 + + Description: DEPRECATED use start(), end(), strand(), slice(), etc. + methods instead + +=cut + +sub feature1 { + my ($self,$arg) = @_; + + deprecate('Use start(), end(), strand(), slice(), etc. methods instead.'); + + if($arg) { + $self->start($arg->start()); + $self->end($arg->end()); + $self->strand($arg->strand()); + $self->score($arg->score()); + $self->percent_id($arg->percent_id()); + $self->analysis($arg->analysis); + if($arg->contig){ + $self->slice($arg->contig); + } + } + + return $self; +} + +=head2 feature2 + + Description: DEPRECATED use hstart(), hend(), hstrand() etc. + methods instead + +=cut + +sub feature2 { + my ($self,$arg) = @_; + + deprecate('Use hstart(),hend(),hstrand(),hseqname() methods instead.'); + + if (defined($arg)) { + $self->hstart($arg->start()); + $self->hend($arg->end()); + $self->hstrand($arg->strand()); + $self->hseqname($arg->seqname()); + return $arg; + } + + return new Bio::EnsEMBL::Feature( + -START => $self->hstart(), + -END => $self->hend(), + -STRAND => $self->hstrand(), + -SCORE => $self->score(), + -PERCENT_ID => $self->percent_id(), + -ANALYSIS => $self->analysis, + -SEQNAME => $self->hseqname()); +} + + + + +=head2 invert + + Arg [1] : (optional) Bio::EnsEMBL::Slice $newslice + Example : $feature->invert(); + Description: This method is used to swap the hit and query sides of this + feature in place. A new slice may optionally provided which + this feature will be placed on. If no slice is provided the + feature slice will be set to undef. + Returntype : none + Exceptions : none + Caller : pipeline (BlastMiniGenewise) + +=cut + +sub invert { + my ($self,$slice) = @_; + + if (! defined $slice && defined $self->hslice) { + $slice = $self->hslice; + } + + my $hstart = $self->{'hstart'}; + my $hend = $self->{'hend'}; + my $hstrand = $self->{'hstrand'}; + my $hspecies = $self->{'hspecies'}; + my $hseqname = $self->{'hseqname'}; + + my $start = $self->{'start'}; + my $end = $self->{'end'}; + my $strand = $self->{'strand'}; + my $species = $self->{'species'}; + my $seqname = $self->seqname(); + + $self->{'start'} = $hstart; + $self->{'end'} = $hend; + $self->{'strand'} = $hstrand; + $self->{'species'} = $hspecies; + $self->{'seqname'} = $hseqname if(defined($hseqname)); + + $self->{'hstart'} = $start; + $self->{'hend'} = $end; + $self->{'hstrand'} = $strand; + $self->{'hseqname'} = $seqname; + $self->{'hspecies'} = $species; + + $self->{'hslice'} = $self->slice; + $self->{'slice'} = $slice; +} + + + +=head2 validate + + Description: DEPRECATED do not use + +=cut + +sub validate { + my ($self) = @_; + + deprecate('This method does nothing and should not be used.'); +} + +=head2 validate_prot_feature + + Description: DEPRECATED do not use + +=cut + +sub validate_prot_feature{ + my ($self) = @_; + + deprecate('This method does nothing and should not be used.'); +} + + +=head2 set_featurepair_fields + + Description: DEPRECATED do not use + +=cut + +sub set_featurepair_fields { + my ($self, $start, $end, $strand, $score, $seqname, $hstart, $hend, + $hstrand, $hseqname, $analysis, $e_value, $perc_id, + $phase, $end_phase) = @_; + + deprecate("Use individual Getter/Setters or Constructor arguments " . + " instead.\nThere is no advantage to using this method."); + + throw('interface fault') if (@_ < 12 or @_ > 15); + + $self->start($start); + $self->end($end); + $self->strand($strand); + $self->score($score); + $self->seqname($seqname); + $self->hstart($hstart); + $self->hend($hend); + $self->hstrand($hstrand); + $self->hseqname($hseqname); + $self->analysis($analysis); + $self->p_value ($e_value) if (defined $e_value); + $self->percent_id ($perc_id) if (defined $perc_id); + $self->phase ($phase) if (defined $phase); + $self->end_phase ($end_phase) if (defined $end_phase); +} + + +=head2 gffstring + + Description: DEPRECATED do not use + +=cut + +sub gffstring { + my ($self) = @_; + + deprecate('Do not use'); + + my $str .= (defined $self->slice) ? $self->slice->name()."\t": "\t"; + $str .= "\t"; #source tag + $str .= "\t"; #primary tag + $str .= (defined $self->start) ? $self->start."\t" : "\t"; + $str .= (defined $self->end) ? $self->end."\t" : "\t"; + $str .= (defined $self->score) ? $self->score."\t" : "\t"; + $str .= (defined $self->strand) ? $self->strand."\t" : ".\t"; + $str .= ".\t"; #phase + $str .= ".\t"; #end phase + + my $hstrand = "+"; + + if (($self->hstrand)&&($self->hstrand == -1)) { + $hstrand = "-"; + } + + #Append a few FeaturePair specific things + $str .= (defined $self->hseqname) ? $self->hseqname."\t" : "\t"; + $str .= (defined $self->hstart) ? $self->hstart."\t" : "\t"; + $str .= (defined $self->hend) ? $self->hend."\t" : "\t"; + $str .= (defined $self->hstrand) ? $hstrand."\t" : "\t"; + $str .= (defined $self->hphase) ? $self->hphase."\t" : ".\t"; + + return $str; +} + + + + +=head2 hphase + + Description: DEPRECATED do not use + +=cut + +sub hphase { + my ($self, $value) = @_; + + deprecate('This method does nothing useful.'); + + if (defined($value)) { + $self->{_hphase} = $value; + } + + return $self->{_hphase}; +} + + +=head2 hend_phase + + Description: DEPRECATED do not use + +=cut + +sub hend_phase { + my ($self, $value) = @_; + + deprecate('This method does nothing useful.'); + + if (defined($value)) { + $self->{_hend_phase} = $value; + } + return $self->{_hend_phase}; +} + +sub extra_data { + my $self = shift; + $self->{'extra_data'} = shift if(@_); + return $self->{'extra_data'}; +} + +sub type { + my $self = shift; + $self->{'extra_data'}->{'type'} = shift if(@_); + if (exists $self->{'extra_data'}) { + return $self->{'extra_data'}->{'type'}; + } + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/AnnotatedFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/AnnotatedFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,273 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::AnnotatedFeature +# +# You may distribute this module under the same terms as Perl itself + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::AnnotatedFeature - A module to represent a feature mapping as +predicted by the eFG pipeline. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; + +my $feature = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new + ( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -SUMMIT => 1_000_019, + -END => 1_000_024, + -STRAND => -1, + -DISPLAY_LABEL => $text, + -SCORE => $score, + -FEATURE_SET => $fset, + ); + + + +=head1 DESCRIPTION + +An AnnotatedFeature object represents the genomic placement of a prediction +generated by the eFG analysis pipeline. This normally represents the +output of a peak calling analysis. It can have a score and/or a summit, the +meaning of which depend on the specific Analysis used to infer the feature. +For example, in the case of a feature derived from a peak call over a ChIP-seq +experiment, the score is the peak caller score, and summit is the point in the +feature where more reads align with the genome. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::DBSQL::AnnotatedFeatureAdaptor + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::AnnotatedFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::SetFeature; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::SetFeature); + + +=head2 new + + Arg [-SLICE] : Bio::EnsEMBL::Slice - The slice on which this feature is. + Arg [-START] : int - The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int -The end coordinate of this feature relative to the start of the slice + Arg [-STRAND] : int - The orientation of this feature. Valid values are 1, -1 and 0. + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-DISPLAY_LABEL]: string - Display label for this feature + Arg [-SUMMIT] : optional int - seq_region peak summit position + Arg [-SCORE] : optional int - Score assigned by analysis pipeline + Arg [-dbID] : optional int - Internal database ID. + Arg [-ADAPTOR] : optional Bio::EnsEMBL::DBSQL::BaseAdaptor - Database adaptor. + Example : my $feature = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new + ( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -FEATURE_SET => $fset, + -DISPLAY_LABEL => $text, + -SCORE => $score, + -SUMMIT => 1_000_019, + ); + + + Description: Constructor for AnnotatedFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::AnnotatedFeature + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + #Hard code strand => 0 here? And remove from input params? + my ($score, $summit) = rearrange(['SCORE', 'SUMMIT'], @_); + + #Direct assingment here removes need for set arg test in method + + $self->{'score'} = $score if defined $score; + $self->{'summit'} = $summit if defined $summit; + + return $self; +} + + +=head2 score + + Arg [1] : (optional) int - score + Example : my $score = $feature->score(); + Description: Getter for the score attribute for this feature. + Returntype : int + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub score { + my $self = shift; + return $self->{'score'}; +} + +=head2 summit + + Arg [1] : (optional) int - summit postition + Example : my $peak_summit = $feature->summit; + Description: Getter for the summit attribute for this feature. + Returntype : int + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub summit { + my $self = shift; + return $self->{'summit'}; +} + + +=head2 display_label + + Example : my $label = $feature->display_label(); + Description: Getter for the display label of this feature. + Returntype : String + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub display_label { + my $self = shift; + + #auto generate here if not set in table + #need to go with one or other, or can we have both, split into diplay_name and display_label? + + if(! $self->{'display_label'} && $self->adaptor){ + $self->{'display_label'} = $self->feature_type->name()." -"; + $self->{'display_label'} .= " ".$self->cell_type->name(); + $self->{'display_label'} .= " Enriched Site"; + } + + return $self->{'display_label'}; +} + + +=head2 is_focus_feature + + Args : None + Example : if($feat->is_focus_feature){ ... } + Description: Returns true if AnnotatedFeature is part of a focus + set used in the RegulatoryBuild + Returntype : Boolean + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub is_focus_feature{ + my $self = shift; + + #Do we need to test for FeatureSet here? + + return $self->feature_set->is_focus_set; +} + + +=head2 get_underlying_structure + + Example : my @loci = @{ $af->get_underlying_structure() }; + Description: Returns and array of loci consisting of: + (start, (motif_feature_start, motif_feature_end)*, end) + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : At Risk - This is TFBS specific and could move to TranscriptionFactorFeature + +=cut + +#This should really be precomputed and stored in the DB to avoid the MF attr fetch +#Need to be aware of projecting here, as these will expire if we project after this method is called + +sub get_underlying_structure{ + my $self = shift; + + if(! defined $self->{underlying_structure}){ + my @loci = ($self->start); + + foreach my $mf(@{$self->get_associated_MotifFeatures}){ + push @loci, ($mf->start, $mf->end); + } + + push @loci, $self->end; + + $self->{underlying_structure} = \@loci; + } + + return $self->{underlying_structure}; +} + +=head2 get_associated_MotifFeatures + + Example : my @assoc_mfs = @{ $af->get_associated_MotifFeatures }; + Description: Returns and array associated MotifFeature i.e. MotifFeatures + representing a relevanting PWM/BindingMatrix + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : At Risk - This is TFBS specific and could move to TranscriptionFactorFeature + +=cut + +sub get_associated_MotifFeatures{ + my ($self) = @_; + + if(! defined $self->{'assoc_motif_features'}){ + my $mf_adaptor = $self->adaptor->db->get_MotifFeatureAdaptor; + + #These need reslicing! + + $self->{'assoc_motif_features'} = $mf_adaptor->fetch_all_by_AnnotatedFeature($self, $self->slice); + } + + return $self->{'assoc_motif_features'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Array.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Array.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,588 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::Array +# +# You may distribute this module under the same terms as Perl itself + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Array - A module to represent a nucleotide microarray. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::Array; + +my $array = Bio::EnsEMBL::Funcgen::Array->new( + -NAME => 'Array-1', + -FORMAT => 'Tiled', + -SIZE => '1', + -VENDOR => 'Nimblegen', + -DESCRIPTION => $desc, + -TYPE => 'OLIGO', + -CLASS => 'VENDOR_FORMAT' +); + +my $db_adaptor = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...); +my $array_adaptor = $db_adaptor->get_ArrayAdaptor(); +my $array = $array_adaptor->fetch_by_name($array_name) + +=head1 DESCRIPTION + +An Array object represents a nucleotide (OLIGO, PCR etc.) microarray. The data +(currently the name, format, size, species, vendor and description) are stored +in the array table. + +=cut + + +use strict; +use warnings; + + +package Bio::EnsEMBL::Funcgen::Array; + + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA);# %VALID_TYPE); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +# Possible types for OligoArray objects +#This should match the vendor enum values? +#%VALID_TYPE = ( +# 'AFFY' => 1, +# 'OLIGO' => 1, +#); + + +=head2 new + + Arg [-NAME] : string - the name of this array + Arg [-VENDOR] : string - the vendor of this array (AFFY, NIMBLEGEN etc) + Arg [-TYPE] : string - type of array e.g. OLIGO, PCR + Arg [-FORMAT] : string - the format of this array (TILED, TARGETTED, GENE etc) + Arg [-DESCRIPTION] : strin - description of the array + +#array_chips is array of hashes or design_id and name, dbID will be populated on store, this should be a simple object! + + Example : my $array = Bio::EnsEMBL::Funcgen::Array->new( + -NAME => 'Array-1', + -FORMAT => 'Tiled', + -SIZE => '1', + -VENDOR => 'Nimblegen', + -TYPE => 'OLIGO', + -DESCRIPTION => $desc, + -CLASS => 'VENDOR_FORMAT',#e.g. AFFY_UTR, ILLUMINA_WG + ); + Description: Creates a new Bio::EnsEMBL::Funcgen::Array object. + Returntype : Bio::EnsEMBL::Funcgen::Array + Exceptions : None ? should throw if mandatort params not set/valid + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($name, $format, $size, $vendor, $type, $desc, $aclass) + = rearrange( ['NAME', 'FORMAT', 'SIZE', 'VENDOR', 'TYPE', 'DESCRIPTION', 'CLASS'], @_ ); + + #mandatory params? + #name, format, vendor + #enum on format? + + my @stack = caller(); + + if($self->dbID() && $stack[0] ne "Bio::EnsEMBL::Funcgen::DBSQL::ArrayAdaptor"){ + throw("You must use the ArrayAdaptor($stack[0]) to generate Arrays with a dbID i.e. from the DB, as this module accomodates updating which may cause incorrect data if the object is not generated form the DB"); + } + + + throw("Must provide a vendor parameter") if ! $vendor; + throw("Must provide a name parameter") if ! $name; + #any others? + + + $self->name($name); + $self->format($format) if defined $format; + + if(defined $format && $format eq 'EXPRESSION' && ! defined $class){ + throw('You must defined a class if you are importing and array with an EXPRESSION format'); + } + + $self->class(uc($aclass)) if defined $aclass; + $self->size($size) if defined $size; + $self->vendor($vendor); + $self->description($desc) if defined $desc; + $self->type($type) if defined $type; + + return $self; +} + +=head2 get_all_Probes + + Args : None + Example : my $probes = $array->get_all_Probes(); + Description: Returns all probes on an array. Needs a database connection. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Probe objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_all_Probes { + my $self = shift; + + if ( $self->dbID() && $self->adaptor() ) { + my $opa = $self->adaptor()->db()->get_ProbeAdaptor(); + my $probes = $opa->fetch_all_by_Array($self); + return $probes; + } else { + warning('Need database connection to retrieve Probes'); + return []; + } +} + +=head2 get_all_Probe_dbIDs + + Args : None + Example : my @dbids = @{$array->get_all_Probe_dbIDs}; + Description: Returns an array ref of all the Probe database IDs for this array + Returntype : arrayref of ints + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_all_Probe_dbIDs { + my $self = shift; + + if(! $self->{probe_dbids}){ + #check for adaptor here? + + if(! $self->adaptor){ + throw('Must have set an adaptor to get_all_Probe_dbIDs'); + } + + $self->{probe_dbids} = $self->adaptor->fetch_Probe_dbIDs_by_Array($self); + } + + return $self->{probe_dbids}; +} + + + + +#Nath new get methods + +=head2 get_all_ProbeSets + + Args : None + Example : my $probesets = $array->get_all_ProbeSets(); + Description: Returns all probesets on an array. Needs a database connection. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ProbeSets objects + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_all_ProbeSets { + my $self = shift; + + if ( $self->dbID() && $self->adaptor() ) { + my $opsa = $self->adaptor()->db()->get_ProbeSetAdaptor(); + my $probesets = $opsa->fetch_all_by_Array($self); + return $probesets; + } else { + warning('Need database connection to retrieve ProbeSets'); + return []; + } +} + + +#All the array_chip methods will be migrated to ArrayChip.pm + +=head2 get_array_chip_ids + + Example : my @ac_ids = @{$array->get_array_chip_ids()}; + Description: Returns all array_chip_ids for this array. + Returntype : Listref of array_chip ids + Exceptions : Throws if none retrieved + Caller : General + Status : At Risk + +=cut + +sub get_array_chip_ids { + my $self = shift; + + my @ac_ids; + + + $self->get_ArrayChips(); + + #should we get_ArrayChips is we have none cached? + #this may cause problem + + + foreach my $achip(values %{$self->{'array_chips'}}){ + push @ac_ids, $achip->dbID(); + } + + if(! @ac_ids){ + throw("No array_chip_ids available"); # should this be warn? + } + + return \@ac_ids; +} + +=head2 get_design_ids + + Example : my @design_ids = @{$array->get_design_ids()}; + Description: Returns a the design_ids for each array_chip contained within this array + Returntype : list + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + + + +sub get_design_ids{ + my $self = shift; + return [keys %{$self->{'array_chips'}}]; +} + + + +=head2 name + + Arg [1] : (optional) string - the name of this array + Example : my $name = $array->name(); + Description: Getter, setter of the name attribute for Array + objects. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub name{ + my $self = shift; + + $self->{'name'} = shift if @_; + + #do we need this? + #if ( !exists $self->{'name'} && $self->dbID() && $self->adaptor() ) { + # $self->adaptor->fetch_attributes($self); + #} + + return $self->{'name'}; +} + + +=head2 type + + Arg [1] : (optional) string - the type of this array + Example : $array->type('OLIGO'); + Description: Getter, setter of the type attribute for Array + objects. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub type{ + my $self = shift; + + $self->{'type'} = shift if @_; + + return $self->{'type'}; +} + + +=head2 format + + Arg [1] : (optional) string - the format of the array + Example : my $format = $array->format(); + Description: Getter, setter of format attribute for + Array objects e.g. Tiled, Targetted etc... + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub format { + my $self = shift; + + $self->{'format'} = shift if @_; + + #do we need this? + #if ( !exists $self->{'format'} && $self->dbID() && $self->adaptor() ) { + # $self->adaptor->fetch_attributes($self); + #} + + return $self->{'format'}; +} + +=head2 class + + Arg [1] : (optional) string - the class of the array + Example : my $class = $array->class('AFFY_UTR'); + Description: Getter, setter of class attribute for + Array objects e.g. AFFY_UTR, AFFY_ST + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub class { + my $self = shift; + + $self->{'class'} = shift if @_; + + return $self->{'class'}; +} + + +=head2 size + + Arg [1] : (optional) int - the number of ? in the array + Example : my $size = $array->size(); + Description: Getter of size attribute for Array objects. This + simply counts the constituent ArrayChips + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub size { + my $self = shift; + + return scalar(keys %{$self->{'array_chips'}}); +} + + +=head2 vendor + + Arg [1] : (optional) string - the name of the array vendor + Example : my $vendor = $array->vendor(); + Description: Getter, setter of vendor attribute for + Array objects. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub vendor { + my $self = shift; + $self->{'vendor'} = shift if @_; + + #do we need this? + #if ( !exists $self->{'vendor'} && $self->dbID() && $self->adaptor() ) { + # $self->adaptor->fetch_attributes($self); + #} + + return $self->{'vendor'}; +} + +=head2 description + + Arg [1] : (optional) string - the description of the array + Example : my $size = $array->description(); + Description: Getter, setter of description attribute for + Array objects. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if @_; + + #do we need this? + #if ( !exists $self->{'description'} && $self->dbID() && $self->adaptor() ) { + # $self->adaptor->fetch_attributes($self); + #} + + return $self->{'description'}; +} + +=head2 probe_count + + Example : my $num_probes = $array->probe_count(); + Description: Return number of probes on array + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub probe_count { + my ($self) = @_; + #Do we want a distinct flag here? + + if(! defined $self->{'probe_count'}){ + $self->{'probe_count'} = $self->adaptor->fetch_probe_count_by_Array($self); + } + + return $self->{'probe_count'}; +} + + + +=head2 get_ArrayChips + + Example : my @achips = @{$array->get_ArrayChips()}; + Description: Getter, setter and lazy loader of array_chip hashes + Returntype : Arrays of ArrayChip objects + Exceptions : Throws exception if none found for array_id + Caller : General + Status : High Risk - migrate to ArrayChip.pm + +=cut + +sub get_ArrayChips { + my $self = shift; + + #lazy loaded as we won't want this for light DB + #should do meta check and want here + + if ( ! exists $self->{'array_chips'}){ + + if( $self->dbID() && $self->adaptor() ) { + #$self->adaptor->fetch_attributes($self); + #need to do this differently as we're accessing a different table + $self->{'array_chips'} = {}; + + foreach my $achip(@{$self->adaptor->db->get_ArrayChipAdaptor->fetch_all_by_array_id($self->dbID())}){ + $self->{'array_chips'}{$achip->design_id} = $achip; + #%{$self->{'array_chips'}} = %{$self->adaptor->db->get_ArrayAdaptor->_fetch_array_chips_by_array_dbID($self->dbID())}; + } + } + else{ + throw("Need array dbID and DB connection to retrieve array_chips"); + } + } + + return [ values %{$self->{'array_chips'}} ]; +} + +=head2 get_ArrayChip_by_design_id + + Arg [1] : (mandatory) int - design_id + Example : my %ac = %{$array->get_ArrayChip_by_design_id('1234')}; + Description: Getter for array_chip hashes + Returntype : Hashref + Exceptions : Throws exception if no design_id defined, warns if not part of array + Caller : General + Status : At risk + +=cut + +sub get_ArrayChip_by_design_id{ + my ($self, $design_id) = @_; + + + #warn "This needs to get the array chip if not defined?? but we're using it to test whether is has been stored same problem as probe_design?"; + + my ($achip); + throw("Must supply a valid array chip design_id") if (! defined $design_id); + + if(defined $self->{'array_chips'}{$design_id}){ + $achip = $self->{'array_chips'}{$design_id}; + }else{ + #No we use this to check whether it has been stored with the array + #warn("should this throw? Array does not contain ArrayChip:$design_id\n"); + } + + return $achip; +} + +=head2 add_ArrayChip + + Arg [1] : mandatory - Bio::EnsEMBL::Funcgen::ArrayChip + Example : $array->add_ArrayChip($array_chip); + Description: Setter for array chips + Returntype : None + Exceptions : Throws if arg not a Bio::EnsEMBL::Funcgen::ArrayChip, or Array not stored + Caller : General + Status : Ar risk + +=cut + +#This uses previosuly stored array_chips withotu warning +#Need to implement fetch_store method? + +sub add_ArrayChip{ + my ($self, $array_chip) = @_; + + throw("You must supply a stored Bio::EnsEMBL::Funcgen::ArrayChip") if(! ($array_chip && + $array_chip->isa("Bio::EnsEMBL::Funcgen::ArrayChip") && + $array_chip->dbID())); + + if ($self->dbID() && $self->adaptor()){ + $self->get_ArrayChips() if (! $self->{'array_chips'}); + + if(exists $self->{'array_chips'}{$array_chip->design_id}){ + $array_chip = $self->{'array_chips'}{$array_chip->design_id}; + #warn("Array chip for ".$array_chip->design_id()." already exists, using previous stored array chip\n"); + }else{ + $self->{'array_chips'}{$array_chip->design_id} = $array_chip; + } + + }else{ + throw("Array must be stored before adding an array_chip"); + } + + return; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ArrayChip.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ArrayChip.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,205 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ArrayChip +# +# You may distribute this module under the same terms as Perl itself + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::ArrayChip - A simple module to represent the concept/template of +a chip/slide within an array, of which the physical manifestation is an ExperimentalChip. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Funcgen::ArrayChip; + + my $ec = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -ARRAY_ID => $array->dbID(), + -NAME => $desc, + -DESIGN_ID => $design_id, + ); + +#add more methods here? + + +=head1 DESCRIPTION + +An ArrayChip object represent the concept of an array chip/slide withing a given array/chipset. +The data for ArrayChips is stored in the array_chip table. + + +=cut + +use strict; +use warnings; + + +package Bio::EnsEMBL::Funcgen::ArrayChip; + + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-ARRAY_ID] : int - the dbID of the parent array + Arg [-ARRAY] : Bio::EnsEMBL::Funcgen::Array + Arg [-DESIGN_ID] : string - the unqiue deisng ID defined by the array vendor + Arg [-NAME] : string - the name of the array chip + + + Example : my $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -ARRAY_ID => $array->dbID(), + -NAME => $desc, + -DESIGN_ID => $design_id, + ); ); + Description: Creates a new Bio::EnsEMBL::Funcgen::ArrayChip object. + Returntype : Bio::EnsEMBL::Funcgen::ArrayChip + Exceptions : None ? should throw if mandaotry params not set + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($array_id, $name, $design_id, $array) + = rearrange( ['ARRAY_ID', 'NAME', 'DESIGN_ID', 'ARRAY'], @_ ); + + + #Remove array_id so we can remove checking below? + + throw("Must define a name($name) and design_id($design_id)") if(! $name || ! $design_id); + + + #Make these mutually exclusive to avoid checking + if($array_id && $array){ + throw('Must provide either -array or -array_id but not both'); + } + + if(defined $array){ + + if(!(ref($array) && $array->isa('Bio::EnsEMBL::Funcgen::Array'))){ + throw('array paramter must be a valid Bio::EnsEMBL::Funcgen::Array'); + } + + $self->{'array'} = $array; + } + + + + $self->array_id($array_id) if defined $array_id; + $self->name($name); + $self->design_id($design_id); + + return $self; +} + + +=head2 array_id + + Arg [1] : (optional) int - the parent array dbID + Example : my $array_id = $array_chip->array_id(); + Description: Getter, setter array_id attribute. + Returntype : int + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub array_id { + my $self = shift; + $self->{'array_id'} = shift if @_; + return $self->{'array_id'}; +} + +=head2 name + + Arg [1] : (optional) string - the array chip name + Example : my $ac_name = $array_chip->name(); + Description: Getter, setter for the name attribute + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if @_; + return $self->{'name'}; +} + +=head2 design_id + + Arg [1] : (optional) string - the array_chip unique design id as deinfed by the array vendor + Example : my $design_id = $array_chip->design_id(); + Description: Getter, setter for the design_id attribute + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub design_id { + my $self = shift; + $self->{'design_id'} = shift if @_; + return $self->{'design_id'}; +} + + +=head2 get_Array + + Example : my $array = $array_chip->get_array(); + Description: Getter for the array attribute + Returntype : Bio::EnsEMBL::Funcgen::Array + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Array { + my $self = shift; + + if(! defined $self->{'array'}){ + $self->{'array'} = $self->adaptor->db->get_ArrayAdaptor()->fetch_by_dbID($self->array_id()); + } + + return $self->{'array'}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/BindingMatrix.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,529 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::BindingMatrix +# +# You may distribute this module under the same terms as Perl itself + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::BindingMatrix - A module to represent a BindingMatrix. +In EFG this represents the binding affinities of a Transcription Factor to DNA. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::BindingMatrix; + +my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new( + -name => "MA0122.1", + -type => "Jaspar", + -description => "Nkx3-2 Jaspar Matrix", + ); +$matrix->frequencies("A [ 4 1 13 24 0 0 6 4 9 ] + C [ 7 4 1 0 0 0 0 6 7 ] + G [ 4 5 7 0 24 0 18 12 5 ] + T [ 9 14 3 0 0 24 0 2 3 ]"); + +print $matrix->relative_affinity("TGGCCACCA")."\n"; + +print $matrix->threshold."\n"; + +=head1 DESCRIPTION + +This class represents information about a BindingMatrix, containing the name +(e.g. the Jaspar ID, or an internal name), and description. A BindingMatrix +is always associated to an Analysis (indicating the origin of the matrix e.g. +Jaspar) and a FeatureType (the binding factor). + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::BindingMatrix; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + +=head2 new + + Arg [-name]: string - name of Matrix + Arg [-analysis]: Bio::EnsEMBL::Analysis - analysis describing how the matrix was obtained + Arg [-frequencies]: (optional) string - frequencies representing the binding affinities of a Matrix + Arg [-threshold]: (optional) float - minimum relative affinity for binding sites of this matrix + Arg [-description]: (optional) string - descriptiom of Matrix + Example : my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new( + -name => "MA0122.1", + -analysis => $analysis, + -description => "Jaspar Matrix", + ); + Description: Constructor method for BindingMatrix class + Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix + Exceptions : Throws if name or/and type not defined + Caller : General + Status : Medium risk + +=cut + +sub new { + my $caller = shift; + + my $obj_class = ref($caller) || $caller; + my $self = $obj_class->SUPER::new(@_); + + my ( $name, $analysis, $freq, $desc, $ftype, $thresh ) = rearrange + ( [ + 'NAME', 'ANALYSIS', 'FREQUENCIES', 'DESCRIPTION', 'FEATURE_TYPE', 'THRESHOLD' + ], @_); + + + if(! defined $name){ + throw("Must supply a name\n"); + } + + if(! ((ref $analysis) && $analysis->isa('Bio::EnsEMBL::Analysis') )){ + throw("You must define a valid Bio::EnsEMBL::Analysis"); + #leave is stored test to adaptor + } + + if(! (ref($ftype) && $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType'))){ + throw("You must define a valid Bio::EnsEMBL::Funcgen::FeatureType"); + #leave is stored test to adaptor + } + + $self->name($name); + $self->{analysis} = $analysis; + $self->{feature_type} = $ftype; + $self->frequencies($freq) if $freq; + $self->description($desc) if $desc; + $self->threshold($thresh) if $thresh; + + return $self; +} + + +=head2 feature_type + + Example : my $ft_name = $matrix->feature_type()->name(); + Description: Getter for the feature_type attribute for this matrix. + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub feature_type { + my $self = shift; + + return $self->{'feature_type'}; +} + + +=head2 name + + Arg [1] : (optional) string - name + Example : my $name = $matrix->name(); + Description: Getter and setter of name attribute + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if @_; + return $self->{'name'}; +} + +=head2 description + + Arg [1] : (optional) string - description + Example : my $desc = $matrix->description(); + Description: Getter and setter of description attribute + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if @_; + return $self->{'description'}; +} + +=head2 threshold + + Arg [1] : (optional) float - threshold + Example : my $thresh = $matrix->threshold(); + Description: Getter and setter of threshold attribute + Returntype : float + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub threshold { + my $self = shift; + $self->{'threshold'} = shift if @_; + return $self->{'threshold'}; +} + + +=head2 analysis + Example : $matrix->analysis()->logic_name(); + Description: Getter for the feature_type attribute for this matrix. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub analysis { + my $self = shift; + + return $self->{'analysis'}; +} + + +=head2 frequencies + + Arg [1] : (optional) string - frequencies + Example : $matrix->frequencies($frequencies_string); + Description: Getter and setter of frequencies attribute + + The attribute is a string representing the matrix binding + affinities in the Jaspar format. E.g. + "> + [ ] + " + + Returntype : string + Exceptions : Throws if the string attribute is not a properly + formed matrix in the Jaspar format + Caller : General + Status : At Risk + +=cut + +sub frequencies { + my $self = shift; + + my $frequencies = shift if @_; + if($frequencies){ + $self->_weights($frequencies); + $self->{'frequencies'} = $frequencies; + } + return $self->{'frequencies'}; +} + +=head2 frequencies_revcomp + + Example : $matrix->frequencies_revcomp(); + Description: Getter for the reverse complement frequencies attribute + + The attribute represents the reverse complement of frequencies + + Returntype : string + Caller : General + Status : At Risk + +=cut + +sub frequencies_revcomp { + my $self = shift; + + return $self->{'frequencies_revcomp'}; +} + + +=head2 relative_affinity + + Arg [1] : string - Binding Site Sequence + Arg [2] : (optional) boolean - 1 if results are to be in linear scale (default is log scale) + Example : $matrix->relative_affinity($sequence); + Description: Calculates the binding affinity of a given sequence + relative to the optimal site for the matrix + The site is taken as if it were in the proper orientation + Considers a purely random background p(A)=p(C)=p(G)=p(T) + Returntype : double + Exceptions : Throws if the sequence length does not have the matrix length + or if the sequence has unclear bases (N is not accepted) + Caller : General + Status : At Risk + +=cut + +sub relative_affinity { + my ($self, $sequence, $linear) = (shift, shift, shift); + $sequence =~ s/^\s+//; + $sequence =~ s/\s+$//; + + throw "No sequence given" if !$sequence; + $sequence = uc($sequence); + if($sequence =~ /[^ACGT]/){ + throw "Sequence $sequence contains invalid characters: Only Aa Cc Gg Tt accepted"; + } + + my $weight_matrix = $self->_weights; + my $matrix_length = scalar(@{$weight_matrix->{'A'}}); + if(length($sequence) != $matrix_length){ + throw "Sequence $sequence does not have length $matrix_length"; + } + + my $log_odds = 0; + my @bases = split(//,$sequence); + for(my $i=0;$i<$matrix_length;$i++){ + $log_odds += $weight_matrix->{$bases[$i]}->[$i]; + } + + #This log scale may be quite unrealistic... but usefull just for comparisons... + if(!$linear){ + return ($log_odds - $self->_min_bind) / ($self->_max_bind - $self->_min_bind); + } else { + return (exp($log_odds) - exp($self->_min_bind)) / (exp($self->_max_bind) - exp($self->_min_bind)); + } + +} + +=head2 is_position_informative + + Arg [1] : int - 1-based position withing the matrix + Arg [2] : (optional) double - threshold [0-2] for information content [default is 1.5] + Example : $matrix->is_position_informative($pos); + Description: Returns true if position information content is over threshold + Returntype : boolean + Exceptions : Throws if position or threshold out of bounds + Caller : General + Status : At High Risk + +=cut + +sub is_position_informative { + my ($self,$position,$threshold) = (shift,shift,shift); + throw "Need a position" if(!defined($position)); + throw "Position out of bounds" if(($position<1) || ($position > $self->length)); + if(!defined($threshold)){ $threshold = 1.5; } + throw "Threshold out of bounds" if(($threshold<0) || ($threshold>2)); + return ($self->{'ic'}->[$position-1] >= $threshold); +} + + + +=head2 length + + Example : $bm->length(); + Description: Returns the length of the the matrix (e.g. 19bp long) + Returntype : int with the length of this binding matrix + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub length { + my $self = shift; + + my $weight_matrix = $self->_weights; + + return scalar(@{$weight_matrix->{'A'}}); +} + +=head2 _weights + + Arg [1] : (optional) string - frequencies + Example : _weights($frequencies); + Description: Private Getter Setter for the weight matrix based on frequencies + Returntype : HASHREF with the weights of this binding matrix + Exceptions : Throws if the frequencies attribute string does not correspond + to 4 rows of an equal number of integer numbers. + Caller : Self + Status : At Risk + +=cut + +sub _weights { + my $self = shift; + + #for the moment use equiprobability and constant pseudo-count + my $pseudo = 0.1; + + #TODO allow for it to be passed as parameters? + my $frequencies = shift if @_; + if($frequencies){ + $frequencies =~ s/^(>.*?\n)//; + my $header = $1; + + my ($a,$c,$g,$t) = split(/\n/,$frequencies); + my @As = split(/\s+/,_parse_matrix_line('[A\[\]]',$a)); + my @Cs = split(/\s+/,_parse_matrix_line('[C\[\]]',$c)); + my @Gs = split(/\s+/,_parse_matrix_line('[G\[\]]',$g)); + my @Ts = split(/\s+/,_parse_matrix_line('[T\[\]]',$t)); + if((scalar(@As)!=scalar(@Cs)) || (scalar(@As)!=scalar(@Gs)) || (scalar(@As)!=scalar(@Ts)) ){ + throw "Frequencies provided are not a valid frequency matrix" + } + $self->_calc_ic(\@As,\@Cs,\@Gs,\@Ts,$pseudo); + + #Create the reverse complement + my @revT = reverse(@As); + my @revA = reverse(@Ts); + my @revC = reverse(@Gs); + my @revG = reverse(@Cs); + my $revcomp = $header; + $revcomp.= "A [ ".join("\t",@revA)." ]\n"; + $revcomp.= "C [ ".join("\t",@revC)." ]\n"; + $revcomp.= "G [ ".join("\t",@revG)." ]\n"; + $revcomp.= "T [ ".join("\t",@revT)." ]\n"; + $self->{'frequencies_revcomp'} = $revcomp; + + my @totals; + for(my $i=0;$i{'weights'} = \%weights; + + my $max = 0; my $min = 0; + for(my $i=0;$i_max_bind($max); + $self->_min_bind($min); + } + + return $self->{'weights'}; + +} + +=head2 _calc_ic + + Example : _calc_ic($as,$cs,$gs,$ts,$pseudo); + Description: Private function to calculate the matrix information content per position + Caller : self + Status : At Risk + +=cut + +sub _calc_ic { + my ($self,$as, $cs, $gs, $ts,$pseudo) = (shift,shift, shift, shift, shift, shift); + my @ic = (); + for (my $i=0;$i[$i] + $cs->[$i] + $gs->[$i] + $ts->[$i] + (4*$pseudo); + my $fas = ($as->[$i] + $pseudo) / $total_i; + my $fcs = ($cs->[$i] + $pseudo) / $total_i; + my $fgs = ($gs->[$i] + $pseudo) / $total_i; + my $fts = ($ts->[$i] + $pseudo) / $total_i; + my $ic_i = 2 + ($fas * log($fas)/log(2)) + ($fcs * log($fcs)/log(2)) + ($fgs * log($fgs)/log(2)) + ($fts * log($fts)/log(2)); + push @ic, $ic_i; + } + $self->{'ic'} = \@ic; +} + +sub _parse_matrix_line { + my ($pat,$line) = (shift,shift); + $line=~s/$pat//g; $line=~s/^\s+//; $line=~s/\s+$//; + return $line; +} + +sub _max { return _min_max(shift, 0); } + +sub _min { return _min_max(shift, 1); } + +sub _min_max { + my ($list,$min) = (shift, shift); + my $min_max = $list->[0]; + map { if($min ? $_ < $min_max : $_ > $min_max){ $min_max = $_; } } @$list; + return $min_max; +} + + +=head2 _max_bind + + Arg [1] : (optional) double - maximum binding affinity + Example : $matrix->_max_bind(10.2); + Description: Private Getter and setter of max_bind attribute (not to be called directly) + Returntype : float with the maximum binding affinity of the matrix + Exceptions : None + Caller : Self + Status : At Risk + +=cut + +sub _max_bind { + my $self = shift; + + $self->{'max_bind'} = shift if @_; + + return $self->{'max_bind'}; +} + +=head2 _min_bind + + Arg [1] : (optional) double - minimum binding affinity + Example : $matrix->_min_bind(-10.2); + Description: Private Getter and setter of min_bind attribute (not to be called directly) + Returntype : float with the minimum binding affinity of the matrix + Exceptions : None + Caller : Self + Status : At Risk + +=cut + +sub _min_bind { + my $self = shift; + + $self->{'min_bind'} = shift if @_; + + return $self->{'min_bind'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/CellType.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/CellType.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,232 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::CellType +# +# You may distribute this module under the same terms as Perl itself + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::CellType - A module to represent a CellType. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::CellType; + +#Fetch from adaptor +my $ctype = $cell_type_adaptor->fetch_by_name($ctype_name); + +#Create from new +my $ctype = Bio::EnsEMBL::Funcgen::CellType->new + ( + -name => 'H1-ESC', + -display_label => 'H1-ESC', + -description => 'Human Embryonic Stem Cell', + -efo_id => 'efo:EFO_0003042', + -tissue => 'embryonic stem cell', + ); + +print $ctype->name.' is a '.$ctype->description."\n"; + +#H1-ESC is a Human Embryonic Stem Cell + + +=head1 DESCRIPTION + +This is a simple class to represent information about a cell type. This may represent +harvested cells, a cell line or a more generic tissue type. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::CellType; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Storable); + +my %valid_genders = ( + male => 1, + female => 1, + hermaphrodite => 1, + ); + +=head2 new + + Arg [1] : String - name of CellType + Arg [2] : String - display label of CellType. Defaults to name + Arg [3] : String - description of CellType + Arg [4] : String - gender e.g. male, female or NULL + Arg [5] : String - Experimental Factor Ontology ID e.g. EFO_0002869 + + Example : my $ct = Bio::EnsEMBL::Funcgen::CellType->new + ( + -name => "U2OS", + -display_label => "U20S", + -description => "Human Bone Osteosarcoma Epithelial Cells", + -gender => 'female', + -efo_id => 'EFO_0002869', + ); + + Description: Constructor method for CellType class + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : Throws if name not defined. + Caller : General + Status : Stable + +=cut + +#-type/class => "TISSUE", enum? Mandatory. +#remove display label? + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($name, $dlabel, $desc, $gender, $efo_id, $tissue) = rearrange + (['NAME', 'DISPLAY_LABEL', 'DESCRIPTION','GENDER', 'EFO_ID', 'TISSUE'], @_); + + throw("Must supply a CellType name") if ! defined $name; + + if(defined $gender){ + + if( ! exists $valid_genders{lc($gender)} ){ #enum will not force this so validate here + throw("Gender not valid, must be one of:\t".join(' ', keys %valid_genders)); + } + + $self->{gender} = $gender; + } + + #Set explicitly to enable faster getter only methods + $self->{name} = $name; + $self->{display_label} = $dlabel || $name; + $self->{description} = $desc if defined $desc; + $self->{efo_id} = $efo_id if defined $efo_id; + $self->{tissue} = $tissue if defined $tissue; + return $self; +} + + +=head2 name + + Example : my $name = $ct->name(); + Description: Getter of name attribute for CellType objects + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub name { + return $_[0]->{'name'}; +} + + +=head2 gender + + Example : my $gender = $ct->gender(); + Description: Getter for the gender attribute for CellType objects + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub gender { + my $self = shift; + $self->{'gender'} = shift if @_; + return $self->{'gender'}; +} + + +=head2 description + + Example : my $desc = $ct->description(); + Description: Getter of description attribute for CellType objects + Returntype : string + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub description { + return $_[0]->{'description'}; +} + + +=head2 display_label + + Example : my $display_label = $ct->display_label(); + Description: Getter of display_label attribute for CellType objects. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub display_label { + return $_[0]->{'display_label'}; +} + + +=head2 efo_id + + Example : my $efo_id = $ft->efo_id; + Description: Getter of the Experimental Factor Ontology ID + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub efo_id{ + return $_[0]->{'efo_id'}; +} + + +=head2 tissue + + Example : my $tissue = $ft->tissue; + Description: Getter of the tissue attribute for a given cell type + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub tissue{ + return $_[0]->{'tissue'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Channel.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Channel.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,207 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::Channel +# +# You may distribute this module under the same terms as Perl itself + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Channel - A module to represent a single channel of an ExperimentalChip + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::Channel; + +my $array = Bio::EnsEMBL::Funcgen::Channel->new( + -EXPERIMENTAL_CHIP_ID => $ec_id, + -SAMPLE_ID => $sample_id, + -TYPE => $type, + -DYE => $dye, + ); + +#-replace TYPE with DENOMINATOR? + +my $db_adaptor = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...); +my $chan_a = $db_adaptor->get_ChannelAdaptor(); +my $chan = $chan_a->fetch_by_type_ExperimentalChip($type, $ExpChip); + +=head1 DESCRIPTION + +A Channel object represents a single channel on an ExperimentalChip. The data +are stored in the channel table, and associated expermental variables are +stored in the experimental_variable table. + +=cut + +use strict; +use warnings; + + +package Bio::EnsEMBL::Funcgen::Channel; + + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-EXPERIMENTAL_CHIP_ID]: int - the experimental chip dbID + + + + Example : my $array = Bio::EnsEMBL::Funcgen::Channel->new( + -EXPERIMENTAL_CHIP_ID => $ec_id, + -SAMPLE_ID => $sample_id, + -TYPE => $type, + -DYE => $dye, + ); + Description: Creates a new Bio::EnsEMBL::Funcgen::Channel object. + Returntype : Bio::EnsEMBL::Funcgen::Channel + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #can we lc these? + my ($ec_id, $sample_id, $type, $dye) + = rearrange( ['EXPERIMENTAL_CHIP_ID', 'SAMPLE_ID', 'TYPE', 'DYE'], @_ ); + + $self->sample_id($sample_id) if defined $sample_id; + $self->experimental_chip_id($ec_id) if defined $ec_id; + $self->type($type) if defined $type; + $self->dye($dye) if defined $dye; + + + return $self; +} + +=head2 sample_id + + Arg [1] : (optional) string - the sample id for this Channel + Example : my $sample_id = $chan->sample_id(); + Description: Getter, setter and lazy loader of sample_id attribute. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub sample_id { + my $self = shift; + $self->{'sample_id'} = shift if @_; + + if ( ! exists $self->{'sample_id'} && $self->dbID() && $self->adaptor() ) { + $self->adaptor->fetch_attributes($self); + } + + return $self->{'sample_id'}; +} + + + +=head2 experimental_chip_id + + Arg [1] : (optional) int - the experimenta chip dbID + Example : my $ec_id = $chan->experimental_chip_id(); + Description: Getter, setter and lazy loader of experimental_chip_id attribute + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub experimental_chip_id { + my $self = shift; + + $self->{'experimental_chip_id'} = shift if @_; + if ( !exists $self->{'experimental_chip_id'} && $self->dbID() && $self->adaptor() ) { + $self->adaptor->fetch_attributes($self); + } + return $self->{'experimental_chip_id'}; +} + +=head2 type + + Arg [1] : (optional) string - the channel type e.g. EXPERIMENTAL or CONTROL + Example : my $type = $chan->type(); + Description: Getter, setter and lazy loader of type attribute + Returntype : string + Exceptions : + Caller : General + Status : Medium Risk + +=cut + +sub type { + my $self = shift; + + $self->{'type'} = shift if @_; + + #warn "we need to control EXPERIMENTAL OR CONTROL here or enum on DB"; + + if ( !exists $self->{'type'} && $self->dbID() && $self->adaptor() ) { + $self->adaptor->fetch_attributes($self); + } + return $self->{'type'}; +} + +=head2 dye + + Arg [1] : (optional) string - the channel type e.g. EXPERIMENTAL or CONTROL + Example : my $dye = $chan->dye(); + Description: Getter, setter and lazy loader of dye attribute + Returntype : string + Exceptions : + Caller : General + Status : Medium Risk + +=cut + +sub dye { + my $self = shift; + + $self->{'dye'} = shift if @_; + + #if ( !exists $self->{'dye'} && $self->dbID() && $self->adaptor() ) { + # $self->adaptor->fetch_attributes($self); + #} + return $self->{'dye'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collection/ResultFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collection/ResultFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,169 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::Collection::ResultFeature +# +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Collection::ResultFeature - A module to represent a lightweight ResultFeature collection + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::Collection::ResultFeature; + +my $rfeature = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast({ + start => $start, + end => $end, + scores => [ $score ] +}); + +my @rfeatures = @{$rset->get_displayable_ResultFeatures_by_Slice($slice)}; + +foreach my $rfeature (@rfeatures){ + my $score = $rfeature->score(); + my $rf_start = $rfeature->start(); + my $rf_end = $rfeature->end(); +} + +=head1 DESCRIPTION + +This is a Collection feature which is designed to store compressed/collected +feature information for a defined window/bin size over a complete seq_region. +Or alternatively a single feature at the natural resolution i.e. window_size == 0. +The complete seq_region collections are cropped to provide a ResultFeature on any +given Slice. ResultFeatures are primarily stored in the result_feature table, +but can also be generated on the fly from unprocessed data in the array result +tables. + + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor +Bio::EnsEMBL::Funcgen::Collector::ResultFeature + +=cut + + +#This is distinct from a normal feature as the collection may have differen attributes and methods from the normal feature +#implementation. For example a Bio::EnsEMBL::Collection::Gene would only have summary information over several genes. +#Altho', unlikely that we'll ever collect genes. + +use strict; +use warnings; + + +package Bio::EnsEMBL::Funcgen::Collection::ResultFeature; +use base ('Bio::EnsEMBL::Feature');#@ISA + +#This needs to inherit from Bio::EnsEMBL::Collection +#Which can host some of the below methods + +#Reverted to hash implementation as we no longer deal with +#huge amounts of features due to collections. +#This enables use of transform/seq_region_start/end methods +#and enable us to store on slices that do not begin at 1 +#Altho need to remove code stipulating this + + + +#To do +#Can we move any of these methods to a base Collection class? +#Should probably now use normal new method with validation? + +=head2 new_fast + + Args : Array with attributes start, end, strand, scores, probe, result_set_id, window_size, slice IN THAT ORDER. + WARNING: None of these are validated, hence can omit some where not needed + Example : none + Description: Fast and list version of new. Only works if the code is very disciplined. + Returntype : Bio::EnsEMBL::Funcgen::ResultFeature + Exceptions : None + Caller : ResultSetAdaptor + Status : At Risk + +=cut + +sub new_fast { + #This is agnostic towards to type of reference + return bless ($_[1], $_[0]); + +} + + + +=head2 scores + + Example : my $score = $rf->score(); + Description: Getter of the scores attribute for ResultFeature + objects + Returntype : Arrayref. + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub scores { + return $_[0]->{scores}; +} + + + +=head2 probe + + Example : my $probe = $rf->probe(); + Description: Getter of the probe attribute for ResultFeature + objects + Returntype : Bio::EnsEMBL::Funcgen::Probe + Exceptions : None + Caller : General + Status : At Risk - This can only be used for Features with window 0. + +=cut + +#probe_id is currently not available in the result_feature table, so this would be a result/probe_feature query. + +sub probe { + return $_[0]->{probe}; +} + + +sub result_set_id { + return $_[0]->{result_set_id}; +} + +sub window_size { + return $_[0]->{window_size}; +} + + +sub get_min_max_scores{ + + if(! defined $_[0]->{'min_max_scores'}){ + my @sorted_scores = sort { $a <=> $b } @{$_[0]->{'scores'}}; + $_[0]->{'min_max_scores'} = [$sorted_scores[0], $sorted_scores[$#sorted_scores]]; + } + + return $_[0]->{'min_max_scores'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1770 @@ +# $Id: Collector.pm,v 1.7 2011/01/10 11:27:34 nj1 Exp $ + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +#Your Bio::Ensembl::Collection::Feature defs module should inherit from here +#This could be a local defs file which you have created and require'd into your script + +#If your collections defs module refers to a Bio::EnsEMBL::Feature, +#then it's adaptor should inherit from the collections defs module + + + +package Bio::EnsEMBL::Funcgen::Collector; +#Move this to Bio::EnsEMBL::Utils::Collector for 59? + + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument ('rearrange'); +use Bio::EnsEMBL::Utils::Exception ('throw'); +use Bio::EnsEMBL::Funcgen::ResultFeature; + +#use base('Bio::EnsEMBL::Collection');#ISA + +our ($pack_template, $packed_size, @window_sizes); #These get set in the FeatureAdaptor +#Make these constants and remove setter functionality in methods? +#Only really important for pack template and windows, maybe these if we are going to start + + +our $max_data_type_size = 16777216; #Default is 16MB for long blob +#we need to deduct the size of the rest of the record here! +#For a 2byte packet the smallest window size possible is: +#(slice->length/(16777216/2) +#so int(bin_size)+1 +#Obviously have to use the largest slice here, for human chr1: +#249,250,621/(16777216/2) = 29.7??? +#We may need to up this slightly to account for larger chrs? +#Implications on memory usage? Is it 4 times for blob manipulation? +#Does substr require this manipulation? +#This max_allowed_packet_size does not seem to translate directly to the size of the +#data being stored e.g. quite a bit more is needed. ISG haven't got to the bottom of this yet. +#But have simply upped the config to 67108864 to handle the largest human chr. + +our $max_view_width = 500000;#Max width in Region In Detail; + + +#our %VALID_BINNING_METHODS +#Remove this in favour of can->('calculate_.$method) and coderefs? + + + + +#To do +# 1 DONE Merge in Collection code, (no need to do this, removed inheritance) +# 2 Write simple BED input to flat file output. +# 3 Separate store method so we can simply get, then wrap store around this +# 4 Test get method with slice adjusts +# 5 separate set_config? +# 6 optimise generate_bin_chunks to handle just one window size for display? +# 7 Handle packed_size pack_template as methods constants +# 8 Provide override method in basefeature adaptor which will use package constant in feature adaptor +# This is because these are really adaptor config, the collector only needs to know the +# packed_size, and in the absence of an feature adaptor also provides the default methods for both. +# If we substr in the API then we need to set sensible limits on blob size, otherwise we will have to unpack a lot of data +# to get at the slice we want. +# OR +# Change adaptor to substr in DB based on known blob ranges/window size +# and stitch together any which cross boundaries. This depends on speed of substr at end of large blob TEST! +# Load with current code first and test this before making either change! +# Delete empty (non-0) collections? i.e. For seq_regions which do not have any source features. +# +# 9 Handle PAR/HAP regions using fetch_normalised_slice_projections This has to be done in the feature adaptor! Then restrict to non_dup regions in calling script + + + +=head2 new + + Args : None + Example : my $collector = Bio::EnsEMBL::(Funcgen|Compara|Variation::)Collector::FEATURE->new; + $collector->store_windows_by_Slice($slice); + Description: Simple new method to enable use of collector when not inherited by + a descendant of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor + Returntype : Bio::EnsEMBL::Funcgen::Collector + Exceptions : None + Caller : Collector script + Status : At Risk + +=cut + +sub new{ + return bless {}, $_[0];#Simple blesses this class as an empty hash + + #Do not set anything here + #As will not be first in ISA for feature adaptors + #Hence not guaranteed to be called + +} + +#Setter/Getter methods if we don't have a dedicated Collector module +#to set the package variables in? Also to allow overriding of defaults. +#This can be used by the write_collection method +#to determine when to build and store a compressed collection +#Effectively the max size of the data type you are using to store +#a compressed score. defaults to max for long blob 16MB + + + +#Generic method, but only ever called by write_collection in descendant + +sub new_assembly{ + my ($self, $new_ass) = @_; + + if($new_ass){ + #Validate new assm to project to + + + $self->{'new_assembly'} = $new_ass; + } + + return $self->{'new_assembly'}; +} + + +sub max_data_type_size{ + my ($self, $size) = @_; + + #Validate is sensible integer? + + if($size && ! int($size)){ + throw("max_data_type_size must be a integer of bytes, not $size"); + } + elsif($size){ + $self->{'max_data_type_size'} = $size; + } + elsif(! defined $self->{'max_data_type_size'}){ + #default set at head of this module or in descendant Collector + $self->{'max_data_type_size'} = $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size; + } + + return $self->{'max_data_type_size'}; +} + +sub max_view_width{ + my ($self, $size) = @_; + + #Validate is sensible integer? + + if($size && ! int($size)){ + throw("max_view_width must be a integer, not $size"); + } + elsif($size){ + $self->{'max_view_width'} = $size; + } + elsif(! defined $self->{'max_view_width'}){ + #default set at head of this module or in descendant Collector + $self->{'max_view_width'} = $Bio::EnsEMBL::Funcgen::Collector::max_view_width; + } + + return $self->{'max_view_width'}; +} + + +sub bins_per_record(){ +#$collector_class::bins_per_record = ($collector_class::max_data_type_size/$collector_class::packed_size);#This should be done dynamically as we may redefine either of these variables? + + my ($self) = shift; + + return int($self->max_data_type_size/$self->packed_size); +} + + +#The defaults for these should be defined in the feature/format specific Collector descendant +#either by specifying the package variables or using config attrs to set methods? +#general config should be parsed here. +#rename bin_method? + +sub bin_method{ + my ($self, $method) = @_; + + if($method || ! $self->{'bin_method'}){ + + if($method){ + $self->{'bin_method'} = $method; + #should test can here? or validate versus hash? + } + elsif(! $self->{'bin_method'}){ + + if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_method){ + throw('Must pass a bin_method in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_method in your Collector'); + } + + $self->{'bin_method'} = $Bio::EnsEMBL::Funcgen::Collector::bin_method; + } + + #or current validate method if we are keeping the method in the if/else block + + + #if(! $self->can("calculate_${method}"))){ + #throw("$method is no a valid a valid binning method"); + #} + + } + + return $self->{'bin_method'}; +} + +#We could replace this with a hash of bin_methods and models? +#This could then be used to validate +#Altho if we are going to commodotise the bin methods, then we need to be able to +#define this in the child Collector. Could still do this by modifying the method/model +#hash from the child Collector + +sub bin_model{ + my ($self, $bin_model) = @_; + + if($bin_model || ! $self->{'bin_model'}){ + + if($bin_model){ + $self->{'bin_model'} = $bin_model; + } + elsif(! $self->{'bin_model'}){ + + #Set as global constant defined in descendant Collector + if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_model){ + throw('Must pass -bin_model in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_model in your Collector'); + } + + $self->{'bin_model'} = $Bio::EnsEMBL::Funcgen::Collector::bin_model; + } + + #Need to validate bin models here + throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models') if $self->{'bin_model'} ne 'SIMPLE'; + } + + return $self->{'bin_model'}; +} + +#This can be overridden by adaptor method +#At present this could cause problems as we can pass window sizes in the config, but they will never be set +#as adaptor method is not a setter. Adaptor method should throw if we try and set them as this could cause problems when fetching and not knowing the custom sizes? + +sub window_sizes{ + my ($self, $sizes) = @_; + + if($sizes || ! $self->{'window_sizes'}){ + + if($sizes){ + $self->{'window_sizes'} = $sizes; + } + else{#! $self->{'windows_sizes' + + if (! @window_sizes){ + throw('Must pass -windows_sizes in the config or define @Bio::EnsEMBL::Funcgen::Collector::window_sizes in your Collector'); + } + + @{$self->{'window_sizes'}} = @window_sizes; + } + + if(ref($self->{'window_sizes'}) ne 'ARRAY' || + scalar(@{$self->{'window_sizes'}}) == 0){ + throw('window_sizes must be an arrayref of at least one window size'); + } + } + + return $self->{'window_sizes'}; +} + + + + +#Optional attrs dependant on whether Collection is packed +#Can be redefined in the adaptor but becareful never to redefine the actual values +#As these should really be constants for a given Collector +#What is best here? We only want pack methods for storing/fetching compressed collections +#Move this to base feature adaptor and define attrs as constants using +#package variable? Or directly in new? +#Then direct modification will be caught. +#Just leave here for now. + +#Caller _obj_from_sth/store + +sub pack_template{ + my ($self, $template) = @_; + + if($template){ + $self->{'pack_template'} = $template; + } + elsif(! $self->{'pack_template'}){ + + #Set as global constant defined in descendant Collector + + if (! defined $Bio::EnsEMBL::Funcgen::Collector::pack_template){ + throw('Must pass a per score pack_template in the config or define $Bio::EnsEMBL::Funcgen::Collector::pack_template in your Collector'); + } + + $self->{'pack_template'} = $Bio::EnsEMBL::Funcgen::Collector::pack_template; + } + + return $self->{'pack_template'}; + +} + +#Caller _obj_from_sth/store & current_packed_size + +sub packed_size{ + my ($self, $size) = @_; + + if($size){ + + if(! int($size)){ + throw("$size is not an integer, must pass a size integer for packed_size which specifies size of pack_template:\t".$self->pack_template); + } + + $self->{'packed_size'} = $size; + } + elsif(! $self->{'packed_size'}){ + + #Set as global constant defined in descendant Collector + + if (! defined $Bio::EnsEMBL::Funcgen::Collector::packed_size){ + throw('Must pass a packed_size(wrt to pack_template) in the config or define $Bio::EnsEMBL::Funcgen::Collector::packed_size in your Collector'); + } + + $self->{'packed_size'} = $Bio::EnsEMBL::Funcgen::Collector::packed_size; + } + + return $self->{'packed_size'}; + +} + +#These methods are used by the descendant Collector +#For caching infor whilst building collections +#This is used to log how big a collection has grown before storing + +sub current_packed_size{ + my ($self, $wsize) = @_; + + #$self->{'current_packed_size'}{$wsize} ||= 0; + + #if(defined $cps){ +# $self->{'current_packed_size'}{$wsize} = $cps; +# } +# else{ +# return $self->{'current_packed_size'}{$wsize}; +# } + + return (scalar(@{$self->score_cache($wsize)})*$self->packed_size); + +} + + +sub score_cache{ + my ($self, $wsize, $scores) = @_; + + $self->{'score_cache'}{$wsize} ||= []; + + if(defined $scores){ + push @{$self->{'score_cache'}{$wsize}}, @{$scores}; + } + else{ + #Do this here to stop passing the ref everytime + #Will this be faster? + #Would certainly be faster if we were not returning a ref + return $self->{'score_cache'}{$wsize}; + } +} + +#These last methods are only used for the 0 wsize +#natural resolution and ar wrt the orig_slice passed +#to store_windows_by_Slice + +sub collection_start{ + my ($self, $wsize, $sr_start) = @_; + + if(defined $sr_start){ + $self->{'collection_start'}{$wsize} = $sr_start; + } + else{ + return $self->{'collection_start'}{$wsize}; + } +} + +sub collection_end{ + my ($self, $wsize, $sr_end) = @_; + + if(defined $sr_end){ + $self->{'collection_end'}{$wsize} = $sr_end; + } + else{ + return $self->{'collection_end'}{$wsize}; + } +} + +sub collection_strand{ + my ($self, $wsize, $strand) = @_; + + if(defined $strand){ + $self->{'collection_strand'}{$wsize} = $strand; + } + else{ + return $self->{'collection_strand'}{$wsize}; + } +} + + + +=pod + +sub _create_feature { + my ( $this, $feature_type, $args ) = @_; + + my $feature = $this->SUPER::_create_feature( $feature_type, $args ); + + if ( !$this->_lightweight() ) { + my ( $phase, $end_phase, $stable_id, $version, $created_date, + $modified_date, $is_current ) + = rearrange( [ 'PHASE', 'END_PHASE', + 'STABLE_ID', 'VERSION', + 'CREATED_DATE', 'MODIFIED_DATE', + 'IS_CURRENT' + ], + %{$args} ); + + push( @{$feature}, + $phase, $end_phase, $stable_id, $version, $created_date, + $modified_date, $is_current ); + } + + return $feature; +} + +sub _create_feature_fast { + my ( $this, $feature_type, $args ) = @_; + + my $feature = + $this->SUPER::_create_feature_fast( $feature_type, $args ); + + return $feature; +} + + +#This might not be sensible for Features which are split across tables + +sub _tables { + my ($this) = @_; + + my @tables = $this->SUPER::_tables(); + + if ( $this->_lightweight() ) { + return ( $tables[0] ); + } + + return @tables; +} + + +sub _columns { + my ($this) = @_; + + my @columns = $this->SUPER::_columns(); + + if ( $this->_lightweight() ) { + + #What is this doing? + #Probably not sensible for ResultFeature + @columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns ); + } + + return @columns; +} + +#Also not sensible for objects spread across several tables + +sub _default_where_clause { + my ($this) = @_; + + if ( $this->_lightweight() ) { + return ''; + } + + return $this->SUPER::_default_where_clause(); +} + +=cut + + + +#This need to be generic +#Again we need to pass an accessor method/reference? +#Will be some sort of generic fetch for feature adaptors +#or while loop for in flat file accessor +#rollback to be handled in caller? + +# To do +# +# 1 Allow variable chunks lengths (so we only have one resolution of windows?) +# This will allow SNP collections which currently define classification i.e colour +# Density of SNPs within window will define shading. Count will be displayed in zmenu +# This maybe something we have to do in the descendant +# +# 2 Implement collection param definition in/from descendant + + +# return collection config from adaptor fetch +# window size +# fixed width? +# render/collection style? +# This chould be implemented in BaseFeatureAdaptor::generic_fetch? +# Or could be done in the calling fetchmethod? +# + +#need to change this to get_window_bin_by_Slice +#to enable generating bins on uncompressed data +#Need to remove all counts and store based code to store caller +#this would mean removing any pack based code too +#separate set_config method + + +#Probelm here is size of slice? +#We need to generate bins all in one go, but also need to store at interval +#so as not to explode memory +#Do we need to separate the window generation from the bin generation code? + + +#Define the optimal way to generate windowed data by +#finding the most common denominator + +sub _define_window_chunks{ + my ($self, $window_sizes, $max_view_size) = @_; + + ### DEFINE CHUNKS WRT WINDOWS + + #Shortcut for on the fly uncompressed collection retrieval + #if(scalar(@wsizes) = 1){ + # + #} + #else{ + + #Calulate sensible slice length based on window sizes + my @wsizes = sort {$a <=> $b} @$window_sizes; + + #We need a default when only calculating 0 resolution + #Will binning code work with only 0 resolution? + if((scalar(@wsizes) == 1) && + $wsizes[0] == 0){ + return { $self->max_view_width => [0] }; + } + + + my $multiplier = int($max_view_size/$wsizes[$#wsizes]); + my $chunk_length = $multiplier * $wsizes[$#wsizes]; + my $not_divisible = 1; + my %chunk_windows;#Registry of chunk lengths to run with windows + my %workable_chunks = map {$_ => {}} @wsizes; + delete $workable_chunks{'0'};#get rid of natural resolution as this will always work + + + while($not_divisible && $chunk_length != 0){ + $not_divisible = 0; + + foreach my $wsize(@wsizes){ + next if $wsize == 0;#Special wsize for normal data + + #Set not divisible if modulus is true + if($chunk_length % $wsize){ + $not_divisible = 1; + } + else{ + #No need to listref here? + $workable_chunks{$wsize}{$chunk_length} = []; + } + + #warn "chunk length is $chunk_length and not_divisible is $not_divisible"; + } + + #Gradually shrink the length until we find a workable slice length for all windows + $chunk_length -= $wsizes[$#wsizes] if $not_divisible; + } + + my %chunk_sets; + + + if($chunk_length == 0){ + print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n"; + + foreach my $wsize(keys %workable_chunks){ + #Loop through windows, seeing if they are workable in the other windows + + foreach my $chunk(keys %{$workable_chunks{$wsize}}){ + + foreach my $other_wsize(keys %workable_chunks){ + next if $wsize == $other_wsize; + + if(exists $workable_chunks{$other_wsize}{$chunk}){ + #only push it onto the other wsize, as we will do the reverse later + $chunk_sets{$chunk}{$wsize} = undef; + } + } + } + } + + #Now we have a register of co-occurence of wsizes with repect to chunks + #Loop through finding the least amount of sets with the longest chunk length? + #There is no way to decide which is best? + #we could calculate the number of loops? Factored by the chunk length? + #Let's just print out and see what we get + + #warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets); + + + #For now let's just take the one which has the most windows and the longest chunk + #Then we just get the largest which handles the rest. + + #define possible set lengths + my $i = 0; + my %set_lengths; + map {$set_lengths{$i} = []; $i++} @wsizes; + delete $set_lengths{'0'};#get rid of natural resolution as this will always work + + + #store chunks lengths for each set size + foreach my $chunk(keys %chunk_sets){ + my $set_size = scalar(values %{$chunk_sets{$chunk}}); + push @{$set_lengths{$set_size}}, $chunk; + } + + #Now we get the biggest set with the longest length; + my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop + my $found_largest_set = 0; + + while(! $found_largest_set){ + $largest_size--; + + if(scalar(@{$set_lengths{$largest_size}}>0)){ + $found_largest_set = 1; + } + } + + + #We should be able to loop this bit, to find all the biggest sets. + my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}}; + #we could even be selective here, but let's just take the first one for now + + my @largest_windows = keys %{$chunk_sets{$largest_chunk}}; + @{$chunk_windows{$largest_chunk}} = @largest_windows; + print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n"; + + my %remaining_windows = map {$_ => {}} @wsizes; + delete $remaining_windows{'0'};#get rid of natural resolution as this will always work + map { delete $remaining_windows{$_} } @largest_windows; + my $remaining_set_size = scalar(keys %remaining_windows); + + #swapping to array here for practicality, would need to maintain hash if we need to iterate + my @rwindows = keys %remaining_windows; + + #This could just be one window, but this will not be inthe co-occurence hash %chunk_sets + #Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window. + my $next_chunk; + + if(scalar(@rwindows) == 1){ + #we just want to find a suitably large chunk for this one window. + my ($last_window) = @rwindows; + + $multiplier = int(500000/$last_window); + $next_chunk = $multiplier * $last_window; + } + else{ + #Now were are doing something very similar to above + #populating a set_size chunk length registry + #my %seen_hash; + + + foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){ + my $seen_count = 0; + + foreach my $rwindow(@rwindows){ + + $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}}); + } + + if ($seen_count == $remaining_set_size){ + + $next_chunk = $chunk; + last; + } + } + } + + @{$chunk_windows{$next_chunk}} = @rwindows; + + + if($next_chunk){ + + print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n"; + + #Now we want to cycle through all the set lengths which could contain the ones not in the first + #so we need to + } + else{ + warn "Need to write iterative sub for set definition"; + throw('Could not find workable slice length for remaining windows: '. + join(', ', @rwindows)); + + } + } + else{ + @{$chunk_windows{$chunk_length}} = keys(%workable_chunks); + print "Found workable chunk length($chunk_length) for all window sizes:\t". + join(' ', @{$chunk_windows{$chunk_length}})."\n"; + } + + return \%chunk_windows; +} + + +#Let's concentrate on store function first before we split out into store and fetch methods +#How will this work with the Bed parser? +#The descendant collector will sort the input and detect the current slice before calling +#store_window_bins_by_Slice. This may require some caching of line or seeking as we will see the next slice before we have a chance to set it. +#This will store as ResultFeature collections, so maybe we need to separate the input from output code? +#i.e. Bed parser/wrapper +# ResultFeatureAdaptor wrapper +#These + +#Problem with passing window_sizes here +#We need to check that they aren't already defined a class variables as this could potentially +#screw up retrieval, expect for only 0 or all but 0 +#Should we remove this config and force the class variable to be set in the 'adaptor' +#Method is then only used internally, make private or only getter? Set by changing class vars? + +sub store_window_bins_by_Slice{ + my ($self, $slice, %config) = @_; + + my ($window_sizes, $logic_name, $bin_method, $fetch_method_ref, $max_view_width, + $max_data_type_size, $pack_template, $packed_size, $bin_model, $new_assm, $skip_zero_window) = + rearrange( [ 'WINDOW_SIZES', 'LOGIC_NAME', 'BIN_METHOD', 'FETCH_METHOD_REF', 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', 'PACK_TEMPLATE', 'PACKED_SIZE', 'BIN_MODEL', 'NEW_ASSEMBLY', 'SKIP_ZERO_WINDOW'], %config ); + + warn "Need to be careful here about cleaning start end strand caches between serially run slices"; + + + + ### VAILDATE VARS/CONFIG + + #This could be done once in set_config, could then remove setter bahviour from attr methods? + #All default defs params/methods can be overridden by config params + #Attrs used in this method + $bin_method = $self->bin_method($bin_method); + $bin_model = $self->bin_model($bin_model); + #$window_sizes = $self->window_sizes($window_sizes);#Now done below + #Set to undef if we ave empty array + $window_sizes = undef if (ref($window_sizes) eq 'ARRAY' && scalar(@$window_sizes) == 0); + #Attrs used in other (store) methods + $self->pack_template($pack_template); + $self->packed_size($packed_size); + $self->max_data_type_size($max_data_type_size); + $self->max_view_width($max_view_width); + + #Other vars + $self->new_assembly($new_assm); + + #Need to validate slice here + + warn "temp hack for bin_method validation"; + $bin_method = $self->validate_bin_method($bin_method); + + ### Set window_sizes + + if($self->new_assembly){ + print "Assembly projection may cause problems for large Collections, defaulting to window_sizes = (0)\n"; + #Then build the bins on the projected 0 level single ResultFeatures + + #Test we haven't explicity set window_sizes to be soemthing else + + if($window_sizes && + ! ( scalar(@$window_sizes) == 1 && $window_sizes[0] == 0)){ + + throw("You have set window_sizes config which are not safe when projecting to a new assembly($new_assm), please omit window_sizes config or set to 0"); + + } + $window_sizes = $self->window_sizes([0]); + } + else{ + + if($window_sizes && $skip_zero_window && grep/^0$/,@$window_sizes){ + throw("You have specied skip_zero_window and window_size 0 in your config, please remove one of these"); + } + elsif($window_sizes && ! grep/^0$/,@$window_sizes){ + $skip_zero_window = 1; + unshift @$window_sizes, 0;#re-add 0 window as we need this to build the collections + } + + $window_sizes = $self->window_sizes($window_sizes); + } + + + #This is already done in the script + if($skip_zero_window && $new_assm){ + throw("You cannot -skip_zero_window or omit 0 from -window_sizes when projecting to a new assembly($new_assm) which should only be generated using window_size=0"); + } + + + + ### Rollback previously stored features + + if($self->can('rollback_Features_by_Slice')){ + $self->rollback_Features_by_Slice($slice); + } + else{ + #This is currently the only warn output we can't get rid off + warn ref($self)." cannot rollback_Features_by_Slice. This may result in duplicate Collections being stored if there is pre-existing data"; + } + + + + ### PROCESS CHUNKS + + #Not lightweight as we will be storing them + # Temporarily set the collection to be lightweight??? + #my $old_value = $this->_lightweight(); + #if ( defined($lightweight) ) { $this->_lightweight($lightweight) } + #else { $this->_lightweight(1) } + + my %chunk_windows = %{$self->_define_window_chunks($self->window_sizes, $self->max_view_width)}; + my (%counts, $store_natural); + $store_natural = grep/^0/, @$window_sizes; + $counts{0}=0;#Set natural res count to 0 + my $slice_end = $slice->end; + my $orig_slice = $slice; + my $orig_start = $slice->start; + #my $slice_adj = $slice->start - 1;#Removed this as we are now generating features local to orig_slice + #start/end conversion will be done in write/store_collection + my $region = $slice->coord_system_name; + my $version = $slice->coord_system->version; + my $seq_region_name = $slice->seq_region_name; + my $strand = $slice->strand; + my $only_natural = 0; + #my $slice_adj = 0; + + + #We need to account for only 0 here when doing projection + #The chunk window is set to max_view_widht in _define_chunk_windows + + $only_natural = 1 if $store_natural && scalar(@$window_sizes) == 1; + $store_natural = 0 if $skip_zero_window; + #SHould really test these two, but should already be caught by now + + #Set the initial collection_start to orig_start + #Could default to 1, but we may not be starting from 1 + #This is not the case for 0 wsize where it must always be + #The first feature start + + + for my $wsize(@{$self->window_sizes}){ + + next if $wsize == 0;# && $skip_zero_window;#We never want to assume start of 0 window collection + $self->collection_start($wsize, $orig_start); + } + + + + foreach my $chunk_length(sort keys %chunk_windows){ + + print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}}). + " with chunk length $chunk_length\n"; + map $counts{$_} = 0, @{$chunk_windows{$chunk_length}}; #Set window counts to 0 + + #Now walk through slice using slice length chunks and build all windows in each chunk + my $in_slice = 1; + my $start_adj = 0; + my ($sub_end, $features, $bins); + my $sub_start = 1; + my $slice_length = $slice->length; + + + #Can we subslice and then exclusivly use bin_start(local to orig_slice) + #Then we never have to deal with sr coord until we store + #This should me we never have to do the sr conversion unless we + #use a slice which doesn't start at 1(PAR or test) + #Always create in local coords for fetch + #Then change to seq_region coords for store if required + + + while($in_slice){ + #$sr_start = $slice_start + $start_adj; + $sub_start += $start_adj; + + #$slice_start = $sr_start;#Keep for next slice + #$sr_end = $sr_start + $chunk_length - 1; + $sub_end = $sub_start + $chunk_length - 1; + + #Last chunk might not be the correct window length + #Hence why we should do this on whole chromosomes + if($sub_end >= $slice_length){ + #$sub_end = $slice_end; + #No longer set to slice end, as we don't want to corrupt the bin definition? + #Surplus bins are removed in store/write_collection in caller + #We could simply add the largest window the the end of the slice? + #Then we will only build the minimum of excess bins? + #This should be okay for bin calcs + #But may screw up bin trimming in caller as we currently expect $ub_end to be a valid bin end + #for all wsizes + #bin trimming should handle this, but this will corrupt the bin definition??? + #bin definition is depedant on method + #So this method need to be agnostic + #And deal with the rest in descendant + $in_slice = 0; + } + + + $slice = $slice->adaptor->fetch_by_region($region, $seq_region_name, ($sub_start + $orig_start -1), ($sub_end + $orig_start - 1), $strand, $version); + #Can't subslice as this will not clip if we go over the length of the slice, unlike normal slice fetching + #hence we cannot rely on this + #$slice = $orig_slice->sub_Slice($sub_start, $sub_end, $orig_slice->strand); + #warn "got sub slice $slice as $sub_start - $sub_end from ".$orig_slice->name; + + + ### Grab features and shift chunk coords + #features may already be a 0 wsize collection if we have projected from an old assembly + #Could move this check to get_Features_by_Slice? + + #e.g. [ $features, \%config ] + $features = $self->get_Features_by_Slice($slice); + #next if scalar(@$features) == 0;#We want to store values for all windows + + if( (@$features) && + (ref($features->[0]) =~ /Bio::EnsEMBL::Funcgen::Collection/) ){#Change to isa 'Bio::EnsEMBL::Collection + + #Check that the returned feature/collections support window_size + + if($features->[0]->can('window_size')){ + + if($features->[0]->window_size != 0){ + throw("You are trying to generated Collections from a non-zero window sized Collection:\t".$features->[1]->{'window_size'}); + } + + #This should never happen + if(! $skip_zero_window){ + throw('You have retrieved data from a Collection which without using -skip_zero_window i.e. you are trying to generate overwrite the data you are generating the Collections from'); + } + } + else{ + throw('Something si wrong, the Collection you have retrieved does not support the method window_size'); + } + } + + + + #Set collection start here for 0 window_size + if(@$features && $store_natural && ! defined $self->collection_start(0)){ + $self->collection_start(0, ($features->[0]->start + $sub_start)); + } + + + + $start_adj = $chunk_length if($in_slice); + + + + #This should return a hash of window size => bin array pairs + if(! $only_natural){ + + $bins = $self->_bin_features_by_window_sizes( + -slice => $slice, + -window_sizes => $chunk_windows{$chunk_length}, + -bin_method => $bin_method, + -features => $features, + ); + + + + + } + + #my $bin_start = $sr_start + $slice_adj;#This was only required for storing individual bins + #Could calc bin_start + slice_adjust ahere for all features + #Doing this will break old code for single window collections + + #This is sr start and should be local to orig_slice! + + + + + #We need to handle strandedness of slice!? + + #Store all normal features in result_feature + + + + if($store_natural){ + + foreach my $feature(@$features){ + $counts{0}++; + #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->scores->[0])); + + + #Should we handle bin trimming here for overhanging slices + #Then counts wil be correct and wont have to do in caller + + #We could stop here if the feature seq_region start > orig_slice end + #Current done in write/store_collection + #This may mean working in seq_region values rather than slice values + + + #write_collection is implemented in descendant e.g. Bio::EnsEMBL::Funcgen::Collector::ResultFeature + #as wrapper to adaptor store method or print to file + + #These params need to be generated in a way defined by the descendant + # + + if($bin_model eq 'SIMPLE'){ + #We need to pass the slice with this so we can sub slice when storing + #the collection and set the start/end to 1 and length of slice + #we still need to store the first start to be able to sub slice correctly + + $self->collection_start(0, ($feature->start + $sub_start)); + + #Need to pass strand for 0 resolution + $self->write_collection(0, + $orig_slice, + #These are now wrt orig_slice + #($feature->start + $sub_start), + ($feature->end + $sub_start), + $feature->strand, + $feature->scores, + ); + + #We can have problems here if the original score type + #does not match the collected score type + #For max magnitude this is not an issue + #as we take the larget value from the bin + #But for other methods this may not be true + #e.g. count + #Hence, if we want to preserve the 0 window + #We must account for this in the feature collector + #e.g. set_collection_defs_by_ResultSet_window_size? + #Just omit 0 window for reads + + } + } + + print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins for:\t".$slice->name."\n"; + } + + #Now store bins + # my ($bin_end, $bin_scores); + my $num_bins; + + foreach my $wsize(sort keys %{$bins}){ + $num_bins = scalar(@{$bins->{$wsize}}); + #warn "$num_bins bin scores for $wsize:\t".join(',', @{$bins->{$wsize}}); + + #Should we handle bin trimming here for overhanging slices + #Then counts wil be correct and wont have to do in caller + + + $counts{$wsize}+= $num_bins; + + + + #We don't need this loop for collections as we can simply push all the scores at once + #Just use the slice start and end + if($bin_model eq 'SIMPLE'){ + + $self->write_collection($wsize, + $orig_slice, + #$sub_start, + $sub_end, + $orig_slice->strand,#This is most likely 1! + #Override this woth 0 in descendant Collector if required. + $bins->{$wsize}, + ); + + } + else{ + throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models'); + #i.e. More than one score + } + + + +# #Reset start and end for new wsize +# $bin_start = $slice->start; +# $bin_end = $slice->start; +# +# +# +# #We don't need this loop for collections as we can simply push all the scores at once +# +# +# foreach my $bin_index(0..$#{$bins->{$wsize}}){ +# +# +# +# #default method to handle simple fixed width bin? +# #bin_end need to be defined dependant on the bin type +# #($bin_start) = $self->process_default_bin($bins->{$wsize}->[$bin_index], $wsize);#? +# +# +# +# #either define default bin method in descendant +# #Or can we set a process_bin_method var? +# #No just pass all this info to write collection and handle it there? +# +# #Can we have just predefined rotueines handling different bin types? +# #Simple +# #Simple compressed +# #Clipped +# #This will prevent hanving to make attrs/method for storing persistent start/end/score info +# +# +# +# #Need validate bin_type method +# #Could convert these to numbers for speed as with binning methods +# +# if($bin_model eq 'SIMPLE'){ +# +# $bin_scores = $bins->{$wsize}->[$bin_index]; +# +# warn "bin scores is $bin_scores"; +# +# +# #next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores +# +# $bin_end += $wsize; +# +# #if($bin_score){#Removed this as we always want to write the score even if it is 0 +# +# #This is a little backwards as we are generating the object to store it +# #If we are aiming for speed the maybe we could also commodotise the store method +# #store by args arrays? store_fast? +# #Speed not essential for storing! +# +# #Note: list ref passed +# +# #Don't need to pass all this info for fixed width blob collections +# #Need to write some default handlers depedant on the collection type +# #Simple(original) +# #Simple compressed +# #Multi compressed +# #Clipped uncompressed? +# +# +# $self->write_collection($wsize, +# $orig_slice, +# ($bin_start + $slice_adj), +# ($bin_end + $slice_adj), +# $orig_slice->strand,#This is most likely 0 +# $bin_scores, +# ); +# +# #Only count if we have a stored(projected?) feature +# $count++;#Change this to attr/method? +# #} +# +# $bin_start += $wsize; +# } +# else{ +# throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models'); + # } + # } + + #warn "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins"; + #$counts{$wsize}+= $count; + } + } + + $store_natural = 0; #Turn off storing of natural resolution for next chunk length sets + } + + #Now need to write last collections for each wsize + + foreach my $wsize(@{$self->window_sizes}){ + + next if $wsize == 0 && ! $store_natural; + next if $wsize != 0 && $only_natural; + + print "Writing final $wsize window_size collection, this may result in slightly different bin numbers from counts due to removing overhanging bins past end of slice\n"; + + $self->write_collection($wsize, $orig_slice);#store last collection + } + + + #Print some counts here + foreach my $wsize(sort (keys %counts)){ + print "Generated ".$counts{$wsize}." bins for window size $wsize for ".$orig_slice->name."\n"; + #Some may have failed to store if we are projecting to a new assembly + #Need collection count here too, but would need methods for this? + } + + #Return this counts hash so we can print/log from the caller, hence we don't print in here? + + return; +} + + + +=head2 _bin_features_by_window_sizes + + Args[0] : Bio::EnsEMBL::Slice + Args[1] : ARRAYREF of window sizes + Args[2] : int - bin method, currently defined by validate_bin_methods + Args[3] : ARRAYREF of Bio::EnsEMBL::Features + Example : $bins = $self->_bin_features_by_window_sizes( + -slice => $slice, + -window_sizes => $chunk_windows{$chunk_length}, + -bin_method => $bin_method, + -features => $features, + ); + Description: Bins feature scores for a given list of window sizes and predefined method number + Returntype : HASHREF of scores per bin per window size + Exceptions : Throws if bin method not supported + Caller : store_window_bins_by_Slice + Status : At Risk + +=cut + + +#To do +# 1 Remove Bio::EnsEMBL::Feature dependancy? Or just create Features for non adaptor Collectors. +# Is there a way we can skip the object generation in the adaptor completely and just +# pass the values we need? +# 2 Separate methods, so we can define custom methods in descendants? +# 3 Expand %bins model to optionally be one of +# the following dependant on binning method +# Simple: fixed width containing arrays of scores for each window +# Multi: fixed width containing multiple arrays of scores for each window +# Non-simple?: Separate aggregated features, either fixed width or not, not BLOB! +# Clipped: default fixed width with option to clip start and end. Needs start/end attrs +# Can't store this in a blob due to non-standard start ends? +# Most likely want more than one score here? Count/Density SNPs? +# Removes data skew from standard window bins, would need to store each bin and post +# process. Or do in line to avoid 2nd post-processing loop,requires awareness of when +# we have moved to a new bin between features. This holds for overlapping and +# non-overlapping features. Once we have observed a gap we need to clip the end of the +# last bin and clip the start of the new bin. This requires knowing the greatest end +# values from the last bin's feature. what if two overlapping features had the same +# start and different end, would we see the longest last? Check default slice_fetch sort + +sub _bin_features_by_window_sizes{ + my $this = shift; + my ( $slice, $window_sizes, $method, $features ) = + rearrange( [ 'SLICE', 'WINDOW_SIZES', 'BIN_METHOD', 'FEATURES' ], @_ ); + + + #Do this conditional on the Collection type + #i.e. is collection seq_region blob then no else yes + #if ( !defined($features) || !@{$features} ) { return {} } + + #warn 'Processing '.scalar(@$features).' features for window sizes '.join(', ',@$window_sizes).' for slice '.$slice->name."\n"; + + #Set up some hashes to store data by window_size + my (%bins, %nbins, %bin_counts); + my $slice_start = $slice->start(); + + #Default handlers for + #my($first_bin); + #if ( $method == 0 || # 'count' or 'density' + # $method == 3 || # 'fractional_count' or 'weight' + # $method == 4 # 'coverage' + # ){ + # # For binning methods where each bin contain numerical values. + # $first_bin = 0; + # } + # else { + # # For binning methods where each bin does not contain numerical + # # values. + # + # #Remove this + # $first_bin = undef; + # } + + + #Set up some bin data for the windows + my $slice_length = $slice->length; + + foreach my $wsize (@$window_sizes) { + #TO DO: Need to modify this block if default 0's are undesirable for collection type + #i.e. should it be undef instead? May have prolbems representing undef in blob + + $nbins{$wsize} = int($slice_length / $wsize); #int rounds down + #nbins is actually the index of the bin not the 'number' + #Unless slice_Length is a multiple! + $nbins{$wsize}-- if(! ($slice_length % $wsize)); + + #Create default bins with 0 + @{$bins{$wsize}} = (); + map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize}); + + #Set bin counts to 0 for each bin + @{$bin_counts{$wsize}} = (); + + #This is adding an undef to the start of the array!? + map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}}; + + foreach my $bin(@{$bins{$wsize}}){ + $bin_counts{$wsize}->[$bin] = 0; + } + } + + #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts); + #This fails for slices which are smaller than the chunk length; + my $feature_index = 0; + my ($bin_index, @bin_masks); + + foreach my $feature ( @{$features} ) { + #Set up the bins for each window size + + #Omit test for Bio::EnsEMBL::Feature here for speed + #Only needs start/end methods + + foreach my $wsize (@$window_sizes) { + + #We have already highjacked the object creation by here + #This is done in core BaseFeatureAdaptor + #We probably don't want to do this for ResultFeatures as we don't use the + #standard feature implementation + #we already use an array and we don't store the slice + #as this is already known by the caller + #and we always build on top level so we don't need to remap + + #We do however need the slice to store, as we only store local starts when generating + #We need a store by Slice method? + #This will remove the need to inherit from Feature. + #These will need to be regenerated everytime we import a new build + #As we do with the probe_features themselves + #This also mean the result_feature status has to be associated with a coord_system_id + + #Which bins do the start and end lie in for this feature? + #Already dealing with local starts, so no slice subtraction + #Could wrap these start/end methods via the descendant Collector + #to remove the Feature dependancy? Or just create Features when parsing in the caller + my $start_bin = int(($feature->start ) / $wsize); + my $end_bin = int(($feature->end) / $wsize ); + $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize}; + + + + #Slightly obfuscated code to match method number(faster) + #by avoiding string comparisons. + #Could call methods directly using coderef set in validate_bin_method + #Accessor may slow things down, but should be uniform for all methods + #rather than being dependant on position in if/else block below + + #reserve 0 for descendant defined method? + #There fore always fastest in this block, or use coderefs? + if ( $method == 0 ) { + # ---------------------------------------------------------------- + # For 'count' and 'density'. + + for ( $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + + $bins{$wsize}->[$bin_index]++; + + #warn "setting $wsize bin $bin_index to ". $bins{$wsize}->[$bin_index]; + + } + } + +=pod + + } elsif ( $method == 1 ) { + # ---------------------------------------------------------------- + # For 'indices' and 'index' + + + #How is this useful? + #Is this not just count per bin? + #No this is a list of the feature indices + #So forms a distribution? + + throw('Not implemented for method for index'); + + for ( my $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + push( @{ $bins[$bin_index] }, $feature_index ); + } + + ++$feature_index; + + } elsif ( $method == 2 ) { + # ---------------------------------------------------------------- + # For 'features' and 'feature'. + + throw('Not implemented for method for feature/features'); + + for ( my $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + push( @{ $bins[$bin_index] }, $feature ); + } + + } elsif ( $method == 3 ) { + # ---------------------------------------------------------------- + # For 'fractional_count' and 'weight'. + + + throw('Not implemented for method for fractional_count/weight'); + + if ( $start_bin == $end_bin ) { + ++$bins[$start_bin]; + } else { + + my $feature_length = + $feature->[FEATURE_END] - $feature->[FEATURE_START] + 1; + + # The first bin... + $bins[$start_bin] += + ( ( $start_bin + 1 )*$bin_length - + ( $feature->[FEATURE_START] - $slice_start ) )/ + $feature_length; + + # The intermediate bins (if there are any)... + for ( my $bin_index = $start_bin + 1 ; + $bin_index <= $end_bin - 1 ; + ++$bin_index ) { + $bins[$bin_index] += $bin_length/$feature_length; + } + + # The last bin... + $bins[$end_bin] += + ( ( $feature->[FEATURE_END] - $slice_start ) - + $end_bin*$bin_length + + 1 )/$feature_length; + + } ## end else [ if ( $start_bin == $end_bin) + + } + elsif ( $method == 4 ) { + # ---------------------------------------------------------------- + # For 'coverage'. + + #What exactly is this doing? + #This is coverage of bin + #Rather than coverage of feature as in fractional_count + + + # my $feature_start = $feature->[FEATURE_START] - $slice_start; + # my $feature_end = $feature->[FEATURE_END] - $slice_start; + # + # if ( !defined( $bin_masks[$start_bin] ) + # || ( defined( $bin_masks[$start_bin] ) + # && $bin_masks[$start_bin] != 1 ) ) { + # # Mask the $start_bin from the start of the feature to the end + # # of the bin, or to the end of the feature (whichever occurs + # # first). + # my $bin_start = int( $start_bin*$bin_length ); + # my $bin_end = int( ( $start_bin + 1 )*$bin_length - 1 ); + # for ( my $pos = $feature_start; + # $pos <= $bin_end && $pos <= $feature_end ; + # ++$pos ) { + # $bin_masks[$start_bin][ $pos - $bin_start ] = 1; + # } + # } + # + # for ( my $bin_index = $start_bin + 1 ; + # $bin_index <= $end_bin - 1 ; + # ++$bin_index ) { + # # Mark the middle bins between $start_bin and $end_bin as fully + # # masked out. + # $bin_masks[$bin_index] = 1; + # } + # + # if ( $end_bin != $start_bin ) { + # + # if ( !defined( $bin_masks[$end_bin] ) + # || ( defined( $bin_masks[$end_bin] ) + # && $bin_masks[$end_bin] != 1 ) ) { + # # Mask the $end_bin from the start of the bin to the end of + # # the feature, or to the end of the bin (whichever occurs + # # first). + # my $bin_start = int( $end_bin*$bin_length ); + # my $bin_end = int( ( $end_bin + 1 )*$bin_length - 1 ); + # for ( my $pos = $bin_start ; + # $pos <= $feature_end && $pos <= $bin_end ; + # ++$pos ) { + # $bin_masks[$end_bin][ $pos - $bin_start ] = 1; + # } + # } + # } + # } ## end elsif ( $method == 4 ) + +=cut + + + elsif ( $method == 5 ) { + #$self->$method($bin_index, $start_bin, $end_bin, $wsize, \%bins, \%bin_counts); + + + #average score + #This is simple an average of all the scores for features which overlap this bin + #No weighting with respect to the bin or the feature + + for ( $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + + #we should really push onto array here so we can have median or mean. + $bins{$wsize}->[$bin_index] += $this->get_score_by_Feature($feature); + $bin_counts{$wsize}->[$bin_index]++; + } + } + elsif( $method == 6){ + #Max magnitude + #Take the highest value +ve or -ve score + for ( $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + + #we really need to capture the lowest -ve and higest +ve scores here and post process + #To pick between them + + my $score = $this->get_score_by_Feature($feature); + #Write score method as wrapper to scores? + + $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve + + + #warn "Comparing wsize $wsize bin $bin_index score $score to ". $bins{$wsize}->[$bin_index]->[0].' '.$bins{$wsize}->[$bin_index]->[1]."\n"; + + if($score < $bins{$wsize}->[$bin_index]->[0]){ + #warn "setting -ve bin to $score\n"; + $bins{$wsize}->[$bin_index]->[0] = $score; + } + elsif($score > $bins{$wsize}->[$bin_index][1]){ + #warn "setting +ve bin to $score\n"; + $bins{$wsize}->[$bin_index]->[1] = $score; + } + } + } + else { + throw("Only accomodates average score method"); + } + + + } + + } ## end foreach my $feature ( @{$features... + + + #Now do post processing of bins + +=pod + + if ( $method == 4 ) { + + # ------------------------------------------------------------------ + # For the 'coverage' method: Finish up by going through @bin_masks + # and sum up the arrays. + + for ( my $bin_index = 0 ; $bin_index < $nbins ; ++$bin_index ) { + if ( defined( $bin_masks[$bin_index] ) ) { + if ( !ref( $bin_masks[$bin_index] ) ) { + $bins[$bin_index] = 1; + } else { + $bins[$bin_index] = + scalar( grep ( defined($_), @{ $bin_masks[$bin_index] } ) )/ + $bin_length; + } + } + } + } + +=cut + + if( $method == 5){ + #For average score, need to divide bins by bin_counts + + foreach my $wsize(keys %bins){ + + foreach my $bin_index(0..$#{$bins{$wsize}}){ + + if($bin_counts{$wsize}->[$bin_index]){ + $bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index]; + } + #warn "bin_index $wsize:$bin_index has score ".$bins{$wsize}->[$bin_index]; + } + } + } + elsif( $method == 6){ + #Max magnitude + #Take the highest value +ve or -ve score + + foreach my $wsize(keys %bins){ + + foreach my $bin_index(0..$#{$bins{$wsize}}){ + + #So we have the potential that we have no listref in a given bin + + #default value if we haven't seen anything is 0 + #we actually want an array of -ve +ve values + + #warn "Are we storing 0 values for absent data?"; + #Not for max_magnitude, but maybe for others? + + if($bins{$wsize}->[$bin_index]){ + #warn $wsize.':'.$bin_index.':'.$bins{$wsize}->[$bin_index]->[0].'-'.$bins{$wsize}->[$bin_index]->[1]; + my $tmp_minus = $bins{$wsize}->[$bin_index]->[0] * -1; + + if($tmp_minus > $bins{$wsize}->[$bin_index]->[1]){ + $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[0]; + } + else{ + $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[1]; + } + + #warn "bin $bin_index now ". $bins{$wsize}->[$bin_index]; + } + } + } + } + elsif($method != 0){#Do no post processing for count(0) + throw('Collector currently only accomodates average_score, count and max magnitude methods'); + } + + + #Could return bin_counts too summary reporting in zmenu + #Could also do counting of specific type + + #warn "returning bins ".Data::Dumper::Dumper(\%bins); + + return \%bins; +} ## end sub _bin_features + + +=pod + +#These could potentially be used as code refs to avoid having the if else block +#This way we can also define new methods in the descendant Collector? +#Would have to have pass args and refs to bin hashes +#This would slow things down over direct access here +#But speed is no longer that critical as we do not use the Collector for display +#purposes, only to build the Collections which are then used for display directly. + +sub calculate_average_score{ + my $self = shift; + + if ( $method == 5 ) { + #average score + #This is simple an average of all the scores for features which overlap this bin + #No weighting with respect to the bin or the feature + + for ( my $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + + #we should really push onto array here so we can have median or mean. + $bins{$wsize}->[$bin_index] += $feature->score; + $bin_counts{$wsize}->[$bin_index]++; + } + } + + + +} + + +sub post_process_average_score{ + +} + + +sub calculate_max_magnitude{ + my $self = shift; + + #Max magnitude + #Take the highest value +ve or -ve score + for ( my $bin_index = $start_bin ; + $bin_index <= $end_bin ; + ++$bin_index ) { + + #we really need to capture the lowest -ve and higest +ve scores here and post process + #To pick between them + + my $score = $feature->score; + $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve + + if($score < $bins{$wsize}->[$bin_index]->[0]){ + $bins{$wsize}->[$bin_index]->[0] = $score; + } + elsif($score > $bins{$wsize}->[$bin_index][1]){ + $bins{$wsize}->[$bin_index]->[1] = $score; + } + } +} + + +sub post_process_max_magnitude{ + +} + +=cut + +#separated to allow addition of non-standard methods +#Could potentially add these in new +#and put this back in _bin_features + + +sub validate_bin_method{ + my ($self, $method) = @_; + + + #change this to set the coderefs + #Just set anonymous sub to immediately return for non post processed methods + #No need for coderef, just set the method name? + + #if(! $self->can('calculate_'.$method)){ + #throw("$method method does not have a valid calculate_${method} method"); + #} + + #if($self->can('post_process_'.$method)){ + ##set post process flag? + #or simply do this can in line in the _bin_features sub? + #} + + + + + #Add average_score to avoid changing Collection.pm + my $class = ref($self); + ${$class::VALID_BINNING_METHODS}{'average_score'} = 5; + ${$class::VALID_BINNING_METHODS}{'max_magnitude'} = 6; + ${$class::VALID_BINNING_METHODS}{'count'} = 0; + + + + #foreach my $method_name(keys %{$class::VALID_BINNING_METHODS}){ +# warn "valid method is $method name"; +# } + + + if ( ! exists( ${$class::VALID_BINNING_METHODS}{$method} ) ) { + throw( + sprintf( + "Invalid binning method '%s', valid methods are:\n\t%s\n", + $method, + join( "\n\t", sort( keys(%{$class::VALID_BINNING_METHODS}) ) ) ) ); + } + else{ + #warn "found valid method $method with index ".${$class::VALID_BINNING_METHODS}{$method}; + } + + return ${$class::VALID_BINNING_METHODS}{$method}; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector/ResultFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector/ResultFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,752 @@ +# $Id: ResultFeature.pm,v 1.11 2011/02/18 14:35:41 nj1 Exp $ + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +package Bio::EnsEMBL::Funcgen::Collector::ResultFeature; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument ('rearrange'); +use Bio::EnsEMBL::Utils::Exception ('throw'); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (open_file); +use Bio::EnsEMBL::Funcgen::Collection::ResultFeature; +use Bio::EnsEMBL::Funcgen::ProbeFeature; #Only used for _pre_storing slice/seq_region details +#use POSIX;#ceil + +use base qw(Bio::EnsEMBL::Utils::Collector Bio::EnsEMBL::DBFile::CollectionAdaptor);#@ISA + + +### Global config variables +# See associated Collector methods for more info + +$Bio::EnsEMBL::Utils::Collector::bin_model = 'SIMPLE'; + +#Default is for read coverage, array intensity config defined in set_collection_defs_by_ResultSet +#Kept here as example of basic package config +$Bio::EnsEMBL::Utils::Collector::window_sizes = [30, 65, 130, 260, 450, 648, 950, 1296]; + + +#May need to drop 30 here due to RPKM float value doubling packed size + +# Tuning 900(Most used display size) - 172(non-drawable area) = 772 pixels/bins +# Optimal window sizes over zoom levels +# 1kb 5kb 10kb 50kb 100kb 200kb 500kb 1mb +# 2 7 13 65 130 260 648 1296 +#Theoretical max for default 16MB max_allowed_packet_size is 30(~23kb) - sufficient min for read coverage. +#0 will be used for array intensies until 65 is more appropriate(see adaptor) +#450 and 950 added to handle middle ground, but are these really needed? +#New windows will use more memory due to smaller window sizes +#But we have dropped a window size from the mid-upper range +#Largest size is essential so we aren't retrieving/drawing more features than pixels +#Smaller sizes are desirable as more resolution required when zoomed in. + +$Bio::EnsEMBL::Utils::Collector::bin_method = 'RPKM';#'count';#only used by collector +#small integers(little endian) +#RPKM is now float(native) +$Bio::EnsEMBL::Utils::Collector::packed_size = 4;#2;#per score +$Bio::EnsEMBL::Utils::Collector::pack_template = 'f';#'v';#per score + + +#Length is completely unreliable here +#returns 2 for single v and 5 for single f? + +#Does appear to be 4 from read bytes + +### Mandatory methods required by the base Collector + + + + +# You must have a methods like this to perform the store + +sub store_window_bins_by_Slice_ResultSet { + my ($self, $slice, $rset, %config) = @_; + + $self->source_set_type('result');#required by get_Feature_by_Slice + $self->set_collection_defs_by_ResultSet($rset); + $self->set_config(%config); + + + #Need to test for existing collection + + $self->store_window_bins_by_Slice($slice); + + return; +} + + + +sub store_window_bins_by_Slice_Parser{ + my ($self, $slice, $imp, %config) = @_; + + + #We need to test for new_assm if we have a reads result set and fail + #currently don't have access to new assm for validation? + #Can project to new_assm as this would require two passes storing initially on the 0 window level + #which we don't want to do + #throw if new_assm as we need to remap before running this + #This is done in Collector so why are we parsing it here? + #skip_zero window will always be set, so this would fail!!! + + #test parse config here, will this strip it out of the hash before + #we pass it to the super method? + #Also need to test skip_zero_window or window_sizes dependant on result set type(sequencing, array) + #This needs to be defined when craeting the ResultSet in the Importer + #NEED TO CHANGE ALL ResultSet generation to add type!!!!! + + my ($skip_zero_window, $force) = rearrange( [ 'SKIP_ZERO_WINDOW', 'FORCE'], %config ); + + + + #There is no way of setting IMPORTED status for slice based jobs + #We need the accumulator to set the IMPORTED/RESULT_FEATURE_SET status + #Would need to turn parse_and_import into RunnableDB? + #To run these as slice jobs we would need to code to configure some slice based input ids + + + $self->source_set_type('input');#required by get_Feature_by_Slice + $self->set_collection_defs_by_ResultSet($imp->result_set); + + $self->parser($imp); + + #For safety, set skip_zero window if we are using SEQUENCING data + + if(! $skip_zero_window){ + #Assume we only have one set here(enforced in define_and_validate_sets) + my ($iset) = @{$imp->result_set->get_InputSets}; + + if($iset->format eq 'SEQUENCING'){ + $config{'-skip_zero_window'} = 1; + } + } + + $self->set_config(%config, (-method_config => { + #RPKM method config + -dnadb => $imp->db->dnadb, + -total_features => $imp->total_features, + -gender => $imp->cell_type->gender, + -window_sizes => $self->window_sizes, + #pass here as Collector is currently 'readding' 0 wsize + } + )); + + + ### Check for existing dbfile dir/files + my $rset_dir = $self->result_set->dbfile_data_dir; + my $dbfile_data_dir = $imp->get_dir('output'); + + if((! $force ) && + $rset_dir && + ($rset_dir ne $dbfile_data_dir)){ + throw("ResultSet dbfile_data_dir($rset_dir) and -output_dir($dbfile_data_dir) do not match. Please rectify or specify -force to update"); + } + + #Update/set ResultSet dbfile_data_dir + if( (! $rset_dir) || + ( $rset_dir && + ($rset_dir ne $dbfile_data_dir) && + $force ) ){ + $self->result_set->dbfile_data_dir($dbfile_data_dir); + $self->result_set->adaptor->store_dbfile_data_dir($self->result_set); + } + + if((! defined $dbfile_data_dir) || + (! -d $dbfile_data_dir)){ + throw('Your -dbfile_data_dir is either not set or not a valid directory'); + } + + + #Check and open each window_size file + for my $wsize(@{$self->window_sizes}){ + + if($wsize == 0){ + #Would not normally happen + warn ("Need to fix 0bp window_size re-adding in Collector?"); + next; + } + else{ + my $col_file_name = $self->result_set->get_dbfile_path_by_window_size($wsize, $slice); + + if(-e $col_file_name && $force){ + unlink($col_file_name) or warn("Failed to remove exisiting col file:\t$col_file_name\n$!"); + #no throw as file will be over-written anyway + } + + #Generate and cache filhandle here to avoid failing after data has been generated + my $fh = $self->get_filehandle($col_file_name, {-file_operator => '>'}); + + if(! defined $fh){ + throw("Could not get_filehandle for >${col_file_name}"); + } + + #Set AUTOFLUSH to enable validate_file_length in store_collection + $fh->autoflush; + } + } + + $self->store_window_bins_by_Slice($slice); + + #Index creation and merging done in post-processing script + return; +} + + +=head2 rollback_Features_by_Slice + + Args[0] : Bio::EnsEMBL::Slice + Example : $collector->get_Feature_by_Slice($slice); + Description: Wrapper method to fetch input features for building the Collections + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Collection::ResultFeatures + Exceptions : None + Caller : Bio::EnsEMBL::Utils::Collector::store_window_bins_by_Slice + Status : At Risk + +=cut + +#optional method to perform feature rollback + +sub rollback_Features_by_Slice{ + my ($self, $slice) = @_; + + #Point to Helper here + #This is already done in the InputSet importer for + #bed/sam seq imports + #but not for array based imports + #Need to take account of wsizes + + +} + + + +=head2 get_Features_by_Slice + + Args[0] : Bio::EnsEMBL::Slice + Example : $collector->get_Feature_by_Slice($slice); + Description: Wrapper method to fetch input features for building the Collections + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Collection::ResultFeatures + Exceptions : None + Caller : Bio::EnsEMBL::Utils::Collector::store_window_bins_by_Slice + Status : At Risk + +=cut + +#Can remove this method if we pass a method ref +#Default for normal FeatureAdaptors would fetch_all_by_Slice +#However cannot do this as we don't get self passed via a code ref +#we could however manually pass this when we call the method? +#self will either be a collector or a parser +#Just maintain this wrapper for now for simplicity + +sub get_Features_by_Slice{ + my ($self, $slice) = @_; + + #Can we pass this as method ref to Collector, this would prvent the need to write + #this wrapper for standard fetch_all_by_Slice based access + my $features; + my $source_set_type = $self->source_set_type; + + if ($source_set_type eq 'result'){ + #Add more args here? status? + #Add default window size 0, if ResultSet is alread a RESULT_FEATURE_SET + #This may occur if you are projecting features from an old assembly to + #0 window size before generating the other windows in a second pass + $features = $self->result_set->get_ResultFeatures_by_Slice($slice, undef, undef, undef, 0); + } + elsif($source_set_type eq 'input'){ + $features = $self->parser->parse_Features_by_Slice($slice); + + #This method assumes a sorted file handle + #Can we set markers for disk seeking on a sorted handle? + #Either we make it handle an slice passed + #Or we assume the next query slice will be after the last + #Just restrict to one slice at a time for now + + #Also needs to hold cache of long features + #Kind of reinventing the BaseFeatureAdaptor wheel here? + + + } + else{ + #We should have already validated this by now + throw('get_Features_by_Slice only support input and result set_types'); + } + + return $features; + +} + + + +=head2 write_collection + + Args[0] : int : window_size + Args[1] : Bio::EnsEMBL::Slice + Args[2] : optional int : Feature end + Args[3] : optional int : Feature strand + Args[4] : optional ARRAYREF of scores + Example : $self->store_collection($wsize, $slice, $self->collection_start($wsize), $self->collection_end($wsize), $self->collection_strand($wsize)); + Description: Writes Collections by caching scores, defining collection end and strand and + storing when end of collection seen or max_packed_size is exceeded (currently does + not support fragmented collections). Stores last Collection if only window_size + and Slice are passed. + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Utils::Collector::store_window_bins_by_Slice + Status : At Risk + +=cut + +sub write_collection{ + my ($self, $wsize, $slice, $slice_end, $strand, $scores) = @_; + + if(defined $slice_end){ + #collection_start is defined in Collector for first bin in collection + $self->collection_end($wsize, $slice_end); + $self->collection_strand($wsize, $strand); + } + + + if(defined $scores){ + my $new_cps = $self->current_packed_size($wsize) + (scalar(@$scores)*$self->packed_size); + #This loop is very similar to the _obj_from_sth loop + #We need to set the slice and modify the start end appropriately + + + if($new_cps >= $self->max_data_type_size){ + warn("Have found $wsize collection larger that max_data_type_size(16MB). Need to implement cross collection querying"); + #Via a union of two substr queries + #This is no set to 64MB, but this does not directly translate to the maximum size allowed in a single insert + } + + + + if($wsize == 0){ + #We need to 0 wsize collections to be projected as single features + #Other wise the assembly projection will not work + #Then use the 0 wsize ResultFeatures to generate + + #if($new_cps >= $self->max_data_type_size){ + # warn('Have found collection larger that max_data_type_size(16MB). Need to implement cross collection querying'); + # #Via a union of two substr queries + # } + + + #Need to cache current score for 0 window + $self->score_cache($wsize, $scores) if $wsize == 0; + $self->store_collection($wsize, $slice, $self->collection_start($wsize), $slice_end, $strand); + + } + + $self->score_cache($wsize, $scores) if $wsize != 0; + } + else{#No score info, so we store the remaining last collections for each wsize + $self->store_collection($wsize, $slice, $self->collection_start($wsize), $self->collection_end($wsize), $self->collection_strand($wsize)); + } + + return; + +} + + +#To get this to print to file, we need to set up a slice/window specific file handle +#Doing this on a slice basis would also reduce the initial seek time +#at the expense of the number of open file handles (similar to partitions) +#Hence need to cat together slices for same window in defined order in a post processing step + + +=head2 get_score_by_Feature + + +=cut + +#Not required for count/density based collections + +sub get_score_by_Feature{ + my ($self, $feature) = @_; + + #For speed, assume we have a valid Bio::EnsEMBL::Funcgen::Collection::ResultFeature + #This will always be a 0 wsize collection, hence we always want the first score + + return $feature->scores->[0]; + +} + + + +=head2 reinitialise_input + + +=cut + +#optional method - resets file handle for iterative slice parsing for different chunk length sets + +sub reinitialise_input{ + my $self = shift; + + #No need to do this for DB queries + return if $self->source_set_type eq 'result'; + + #This will make all the parser counts go screwy + #as we are potentially counting the same features again + + + #Move this to parser method + #change to seek fh, 0, 0 + $self->parser->file_handle->close; + $self->parser->file_handle( open_file($self->parser->input_file, $self->parser->input_file_operator) ); + + #Reset caches + $self->parser->{'last_slice'} = undef; + $self->parser->{'overhang_features'} = []; + $self->parser->last_line(''); + return; +} + + +############################################################## +### Following methods are not mandatory for a Collector +### but support above methods for this ResultFeature Collector +############################################################## + + + + +=head2 store_collection + + Args[0] : int : window_size + Args[1] : Bio::EnsEMBL::Slice + Args[2] : int : Feature start + Args[3] : int : Feature end + Args[4] : int : Feature strand + Example : $self->store_collection($wsize, $slice, $self->collection_start($wsize), $self->collection_end($wsize), $self->collection_strand($wsize)); + Description: Collection storage method. Resets seq_regios_start/end and scores appropriately + if collection exceeds storage slice. Resets score cache and next collection_start. + Returntype : None + Exceptions : None + Caller : write_collection and Bio::EnsEMBL::Utils::Collector::store_window_bins_by_Slice + Status : At Risk + +=cut + +sub store_collection{ + my ($self, $wsize, $full_slice, $slice_start, $slice_end, $strand) = @_; + + warn "Storing collection $wsize, $full_slice, $slice_start, $slice_end, $strand"; + + my $sr_start = $slice_start; + my $sr_end = $slice_end; + $strand = 0 if ! defined $strand; + #Overwriting strand value with 0 + #As we have collected features from both strands? + $strand = 0 if $wsize != 0; + + + #This happens if the last collection was already written in the loop + #Handle this here so we don't have to set/test collection_start for every record + return if($sr_start == ($sr_end + 1)); + + + #Set store slice to start at 1 (PARs, test slices) + #Can we remove this store_slice now? + #How are we going to handle storing/fetching on PARs? + my $store_slice = $full_slice; + + + if($store_slice->start != 1){ + #Alter sr_start/end + $sr_start = $slice_start + $full_slice->start - 1; + $sr_end = $slice_start + $full_slice->start - 1; + $store_slice = $store_slice->adaptor->fetch_by_region(undef, $store_slice->seq_region_name); + } + + + ### Splice scores if collection overhang store slice + #We reassign this below to avoid any weird ref updating behaviour + my $scores_ref = $self->score_cache($wsize); + + if($sr_end > $full_slice->end){ + #Can happen for 0 wsize if we are running with small test slice + #But we never want to splice scores for 0 wsize + + if($wsize == 0){ #Throw away any which are not at least partially on this slice + return if $sr_start > $full_slice->end; + } + else{ + + #Trim the scores and sr_end to the nearest bin + my $tmp_end = int($full_slice->end/$wsize) * $wsize; + $tmp_end += $wsize if $tmp_end < $full_slice->end; + my $overhang_length = ($sr_end - $tmp_end)/$wsize; + #This should always be an int as $slice_end is always a valid bin end + $sr_end = $tmp_end; + + #Now remove the surplus scores from the cache + splice(@$scores_ref, ($#{$scores_ref} - $overhang_length), $overhang_length); + } + + } + + print 'Storing '.scalar(@$scores_ref)." bins(window_size=$wsize) for:\t".$store_slice->name."\n"; + + + if($wsize == 0){ + + $self->store([Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast + ({ + start => $sr_start, + end => $sr_end, + strand => $strand, + scores => $scores_ref, + #probe => undef, + result_set_id => $self->result_set->dbID, + window_size => $wsize, + slice => $store_slice,#Full slice + } + )], $self->result_set, $self->new_assembly); + } + else{ #Write to col file! + #This needs moving to ResultFeatureAdaptor::write_to_file? + + #store slice should always be the full length slice here + #and sr_start should always ==1 + #and sr_end should always >= slice->end/length + + my $col_file_name = $self->result_set->get_dbfile_path_by_window_size($wsize, $store_slice); + my $fh = $self->get_filehandle($col_file_name); + #, {-file_operator => '>'});#not strictly needed here as it should be open for writing. + + + # STORE SEQ_REGION + if(! $self->get_seq_region_id_by_Slice($store_slice)){ + $self->_pre_store(Bio::EnsEMBL::Funcgen::ProbeFeature->new + ( + -slice => $store_slice, + -start => 1, + -end => 1, + -strand => 0, + ) + ); + } + + # PACK AND PRINT + my $pack_template = '('.$self->pack_template.')'.scalar(@{$scores_ref}); + warn "Pack template is $pack_template"; + + print $fh pack($pack_template, @{$scores_ref}); + + # VALIDATE + my $total_packed_size = $self->get_total_packed_size($sr_start, $sr_end, $wsize); + #This will only work due to autoflush set when opening in store_window_bins_by_Slice_Parser{ + #This will never be true is the collection end does not match the slice end? + #This effectively validates the pack template + $self->validate_file_length($col_file_name, $total_packed_size, 1);#1 is binmode; + } + + #Reassign/clean score_cache reference to avoid any reference updating problems + + $self->{'score_cache'}{$wsize} = []; + + #Set collection start, this is reset to the actual feature start for wsize == 0 + $self->collection_start($wsize, ($slice_end+1)); + + return; +} + + + + +sub get_total_packed_size{ + my ($self, $sr_start, $sr_end, $wsize) = @_; + + #Can afford to validate here as this is not for fetch purposes + my $start_bin = ($sr_start + $wsize - 1) / $wsize; + my $end_bin = $sr_end / $wsize; + + if(($start_bin != int($start_bin)) || + ($end_bin != int($end_bin))){ + throw("The seq_region_start/end($sr_start/$sr_end) are not valid bin bounds for window_size $wsize"); + } + + my $bin_count = ($sr_end - $sr_start +1) / $wsize; + #Could use POSIX::ceil here + my $tmp_bc = int($bin_count); + $bin_count = $tmp_bc + 1 if $tmp_bc != $bin_count; + + return ($bin_count * $self->packed_size); +} + + + + +=head2 set_collection_defs_by_ResultSet + + Args[0] : Bio::EnsEMBL::Funcgen::ResultSet + Example : $self->set_collection_defs_by_ResultSet($rset); + Description: Sets the packed_size and pack_template and bin_method dependant on the + ResultSet type(reads/array) + Returntype : None + Exceptions : throws if supporting InputSet is not of type result (i.e. short reads import) + throws if supporting InputSet format is not SEQUENCING (i.e. short reads import) + throws if ResultSet is not and input_set or experimental_chip based ResultSet (i.e. channel etc) + Caller : store_window_bins_by_Slice_ResultSet + Bio::EnsEMBL::Funcgen:DBSQL::ResultFeatureAdaptor::fetch_all_by_Slice_ResultSet + Status : At Risk + +=cut + +#This needs merging with set_collection_config_by_ResultSets + +sub set_collection_defs_by_ResultSet{ + my ($self, $rset) = @_; + + $self->result_set($rset); + + if(defined $rset){ + #This is most likely already done in the caller + #$self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); + + if($rset->table_name eq 'experimental_chip'){ #Array Intensities i.e. single float + #perl only offers native endian order for floats (http://www.perlmonks.org/?node_id=629530) + #$Bio::EnsEMBL::Utils::Collector::packed_size = 4;#per score + #$Bio::EnsEMBL::Utils::Collector::pack_template = 'f';#per score + #Use defaults for these now + + $Bio::EnsEMBL::Utils::Collector::bin_method = 'max_magnitude';#only used by collector + $Bio::EnsEMBL::Utils::Collector::window_sizes->[0] = 0;#Can have natural resolution for low density array data + } + elsif($rset->table_name eq 'input_set'){ + #Need to reset this as we may be doing serial queries. + $Bio::EnsEMBL::Utils::Collector::window_sizes->[0] = 30; + + #Currently only expecting int from InputSet + my @isets = @{$rset->get_InputSets}; + my @tmp_isets = grep(!/result/, (map $_->feature_class, @isets)); + + if(@tmp_isets){ + throw("Bio::EnsEMBL::Funcgen::Collector::ResultFeature only supports result type InputSets, not @tmp_isets types"); + } + + #We still have no way of encoding pack_type for result_feature InputSets + + @tmp_isets = grep(!/SEQUENCING/, (map $_->format, @isets)); + + if(@tmp_isets){ + throw("Bio::EnsEMBL::Funcgen::Collector::ResultFeature only supports SEQUENCING format InputSets, not @tmp_isets formats"); + } + } + else{ + throw('Bio::EnsEMBL::Funcgen::Collector:ResultFeature does not support ResultSets of type'.$rset->table_name); + } + } + + + #Do we need to validate the smallest non-0 window size + #against the max pack size? + #This should be done in the Collector + + return; +} + + + +=head2 result_set + + Args[0] : optional Bio::EnsEMBL::Funcgen::ResultSet + Example : $self->store_collection($wsize, $slice, $self->collection_start($wsize), $self->collection_end($wsize), $self->collection_strand($wsize)); + Description: result_set attribute method required for fetch wrapper method + Returntype : Bio::EnsEMBL::Funcgen::ResultSet + Exceptions : throws if arg is not valid + Caller : get_Features_by_Slice + Status : At Risk + +=cut + +sub result_set{ + my ($self, $rset) = @_; + + + #Can't use is_stored_and_valid here + + if($rset && ! (ref($rset) && $rset->isa('Bio::EnsEMBL::Funcgen::ResultSet'))){ + throw('You must pass a valid Bio::EnsEMBL::Funcgen::ResultSet'); + } + elsif($rset){ + + #Now done in the caller + #if($rset->has_status('RESULT_FEATURE_SET')){ + #throw('ResultSet('.$rset->name.') already has precomputed ResultFeatures stored, please rollback ResultFeature first'); + #Retrieving ResultFeatures here would end up retrieving them from the result_feature table + #which is where we want to store them + #They are only retrived from the probe/result/probe_feature table if they do not have this status. + #REMEMBER TO SET THIS IN THE CALLING SCRIPT!? + #} + + $self->{'result_set'} = $rset; + } + + return $self->{'result_set'}; +} + + +=head2 parser + + Args[0] : optional Bio::EnsEMBL::Funcgen::Parsers::InputSet + Example : $self->parser($parser); + Description: Getter/Setter for parser attribute if this ResultFeature Collector + Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet + Exceptions : throws if arg is not valid + Caller : general + Status : At Risk + +=cut + +sub parser{ + my ($self, $parser) = @_; + + #Can't use is_stored_and_valid here + + if($parser && ! (ref($parser) && $parser->isa('Bio::EnsEMBL::Funcgen::Parsers::InputSet'))){ + throw('You must pass a valid Bio::EnsEMBL::Funcgen::Parsers::InputSet'); + } + elsif($parser){ + + $self->{'parser'} = $parser; + } + + return $self->{'parser'}; +} + + + + +sub source_set_type{ + my ($self, $type) = @_; + + $self->{source_set_type} = $type if $type; + + return $self->{source_set_type}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/CoordSystem.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/CoordSystem.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,566 @@ + # +# EnsEMBL module for Bio::EnsEMBL::Funcgen::CoordSystem +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::CoordSystem + +=head1 SYNOPSIS + + my $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...); + + my $csa = $db->get_CoordSystemAdaptor(); + + # + # Get default chromosome coord system for the 39_36a DB: + # + my $cs = $csa->fetch_by_name_schema_build_version('chromosome', '39_36a'); + my $str = join ':', $cs->name(),$cs->version(),$cs->dbID(); + print "$str\n"; + + +=head1 DESCRIPTION + +This has been adapted from the core CoordSystem object to accomodate the multi-assembly +aspects of the eFG schema, namely hadnling the schema_build of the referenced core DB. + +This is a simple object which contains a few coordinate system attributes: +name, internal identifier, version and schema_build. A coordinate system is +uniquely defined by its name and version and which DB it came from i.e. schema_build. +A version of a coordinate system applies to all sequences within a coordinate system. +This should not be confused with individual sequence versions. + +Take for example the Human assembly. The version 'NCBI33' applies to +to all chromosomes in the NCBI33 assembly (that is the entire 'chromosome' +coordinate system). The 'clone' coordinate system in the same database would +have no version however. Although the clone sequences have their own sequence +versions, there is no version which applies to the entire set of clones. + +Coordinate system objects are immutable. Their name and version, and other +attributes may not be altered after they are created. + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::CoordSystem; + +use Bio::EnsEMBL::Storable; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + +my %warnings; + + +=head2 new + + Arg [..] : List of named arguments: + -NAME - The name of the coordinate system + -VERSION - (optional) The version of the coordinate system. + Note that if the version passed in is undefined, + it will be set to the empty string in the + resulting CoordSystem object. + -RANK - The rank of the coordinate system. The highest + level coordinate system should have rank 1, the + second highest rank 2 and so on. An example of + a high level coordinate system is 'chromosome' an + example of a lower level coordinate system is + 'clone'. + -SCHEMA_BUILD - The schema and data build version of the DB of + origin. + -TOP_LEVEL - (optional) Sets whether this is a top-level coord + system. Default = 0. This should only be set to + true if you are creating an artificial toplevel + coordsystem by the name of 'toplevel' + -SEQUENCE_LEVEL - (optional) Sets whether this is a sequence + level coordinate system. Default = 0 + -DEFAULT - (optional) + Whether this is the default version of the + coordinate systems of this name. Default = 0 + -DBID - (optional) The internal identifier of this + coordinate system + -ADAPTOR - (optional) The adaptor which provides database + interaction for this object + Example : $cs = Bio::EnsEMBL::CoordSystem->new(-NAME => 'chromosome', + -VERSION => 'NCBI33', + -RANK => 1, + -DBID => 1, + -SCHEMA_BUILD => '39_36a', + -ADAPTOR => adaptor, + -DEFAULT => 1, + -SEQUENCE_LEVEL => 0); + Description: Creates a new CoordSystem object representing a coordinate + system. + Returntype : Bio::EnsEMBL::Funcgen::CoordSystem + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + + #Can we just hadnle schema_build here and call super->new for the rest. + #We will also have to handle the top/default levels issues with multiple DBs + + + #my ($name, $version, $sbuild, $top_level, $sequence_level, $default, $rank) = + # rearrange(['NAME','VERSION', 'SCHEMA_BUILD','TOP_LEVEL', 'SEQUENCE_LEVEL', + # 'DEFAULT', 'RANK'], @_); + + my ($name, $version) = rearrange(['NAME','VERSION'], @_); + + + throw('A name argument is required') if(! $name); + + + $version = '' if(!defined($version)); + + + #$top_level = ($top_level) ? 1 : 0; + #$sequence_level = ($sequence_level) ? 1 : 0; + #$default = ($default) ? 1 : 0; + #$rank ||= 0; + + #if($top_level) { + # if($rank) { + # throw('RANK argument must be 0 if TOP_LEVEL is 1'); + # } + + # if($name) { + # if($name ne 'toplevel') { + # throw('The NAME argument must be "toplevel" if TOP_LEVEL is 1') + # } + # } else { + # $name = 'toplevel'; + # } + + # if($sequence_level) { + # throw("SEQUENCE_LEVEL argument must be 0 if TOP_LEVEL is 1"); + # } + + # $default = 0; + +# } else { +# if(!$rank) { +# throw("RANK argument must be non-zero if not toplevel CoordSystem"); +# } +# if($name eq 'toplevel') { +# throw("Cannot name coord system 'toplevel' unless TOP_LEVEL is 1"); +# } +# } + +# if($rank !~ /^\d+$/) { +# throw('The RANK argument must be a positive integer'); +# } + + + $self->{'core_cache'} = {}; + $self->{'version'} = $version; + $self->{'name'} = $name; + #$self->{'schema_build'} = $sbuild; + #$self->{'top_level'} = $top_level; + #$self->{'sequence_level'} = $sequence_level; + #$self->{'default'} = $default; + #$self->{'rank'} = $rank; + + + + + return $self; +} + + +=head2 add_core_coord_system_info + + Arg [1] : mandatory hash: + + -RANK => $rank, + -SEQUENCE_LEVEL => $seq_lvl, + -DEFAULT => $default, + -SCHEMA_BUILD => $sbuild, + -CORE_COORD_SYSTEM_ID => $ccs_id, + -IS_STORED => $stored_status, + + Example : $cs->add_core_coord_system_info( + -RANK => $rank, + -SEQUENCE_LEVEL => $seq_lvl, + -DEFAULT => $default, + -SCHEMA_BUILD => $sbuild, + -CORE_COORD_SYSTEM_ID => $ccs_id, + -IS_STORED => 1, + ); + + Description: Setter for core coord system information + Returntype : none + Exceptions : throws if: + rank not 0 when toplevel + name not 'TOPLEVEL" when toplevel + sequence level and top level + no schema_build defined + no rank + rank 0 when not toplevel + name 'TOPLEVEL' when not toplevel + + Caller : Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor and ? + Status : at risk - replace with add_core_CoordSystem? implement top level? + +#this does not check name and version! + + +=cut + +sub add_core_coord_system_info { + my ($self) = shift; + + my ($sbuild, $top_level, $sequence_level, $default, $rank, $stored, $ccs_id) = + rearrange(['SCHEMA_BUILD','TOP_LEVEL', 'SEQUENCE_LEVEL', + 'DEFAULT', 'RANK', 'IS_STORED', 'CORE_COORD_SYSTEM_ID'], @_); + + + throw('Must provide a schema_build') if ! $sbuild; + throw('Must provide a core_coord_system_id') if ! $ccs_id; + + + #$top_level = ($top_level) ? 1 : 0; + $sequence_level = ($sequence_level) ? 1 : 0; + $default = ($default) ? 1 : 0; + $stored ||=0; + + $rank ||= 0; + + if($top_level) { + if($rank) { + throw('RANK argument must be 0 if TOP_LEVEL is 1'); + } + + if($self->name()) { + if($self->name() ne 'toplevel') { + throw('The NAME argument must be "toplevel" if TOP_LEVEL is 1') + } + } else { + throw('toplevel not yet implemented'); + #$name = 'toplevel'; + } + + if($sequence_level) { + throw("SEQUENCE_LEVEL argument must be 0 if TOP_LEVEL is 1"); + } + + $default = 0; + + } else { + if(!$rank) { + throw("RANK argument must be non-zero if not toplevel CoordSystem"); + } + if($self->name() eq 'toplevel') { + throw("Cannot name coord system 'toplevel' unless TOP_LEVEL is 1"); + } + } + + if($rank !~ /^\d+$/) { + throw('The RANK argument must be a positive integer'); + } + + + #We can add unstored coord systems here + #But will these ever have valid entries in the seq_region cache + #Initialising this cache key turning off the warning in equals about + #Using the nearest coord_system + + $self->{'core_cache'}{$sbuild} = {( + RANK => $rank, + SEQUENCE_LEVEL => $sequence_level, + DEFAULT => $default, + CORE_COORD_SYSTEM_ID => $ccs_id, + IS_STORED => $stored, + )}; + + + + + return; +} + + +#remove all but schema_buil and equals? +#depends on how we handle levels + +=head2 name + + Arg [1] : (optional) string $name + Example : print $coord_system->name(); + Description: Getter for the name of this coordinate system + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + return $self->{'name'}; +} + + +=head2 get_latest_schema_build + + Example : my $db_schema_build = $coord_system->get_latest_schema_build(); + Description: Getter for the most recent schema_build of this coordinate system + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub get_latest_schema_build { + my $self = shift; + + return (sort (keys %{$self->{'core_cache'}}))[0]; +} + + +=head2 contains_schema_build + + Example : if ($coord_system->contains_schema_build('43_36e')){..do some coord system things ..}; + Description: Returns true is the CoordSystem maps to the corresponding core CoordSystem + Returntype : Boolean + Exceptions : throws if schema_build not defined + Caller : general + Status : at risk + +=cut + +sub contains_schema_build { + my ($self, $schema_build) = @_; + + throw('Must pass a schema_build') if ! $schema_build; + + return (exists $self->{'core_cache'}{$schema_build}) ? 1 : 0; +} + +=head2 version + + Arg [1] : none + Example : print $coord->version(); + Description: Getter/Setter for the version of this coordinate system. This + will return an empty string if no version is defined for this + coordinate system. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub version { + my $self = shift; + + return $self->{'version'}; +} + + + + +=head2 equals + + Arg [1] : Bio::EnsEMBL::Funcgen::CoordSystem $cs + The coord system to compare to for equality. + Example : if($coord_sys->equals($other_coord_sys)) { ... } + Description: Compares 2 coordinate systems and returns true if they are + equivalent. The definition of equivalent is sharing the same + name and version. + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub equals { + my $self = shift; + my $cs = shift; + + if(!$cs || !ref($cs) || + (! $cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem') && + ! $cs->isa('Bio::EnsEMBL::CoordSystem'))){ + throw('Argument must be a Bio::EnsEMBL::Funcgen::CoordSystem'); + } + + + #need to add check on schema_build here + #all schema_builds should have been added by BaseFeatureAdaptor during import + #fails if we are using two different versions with the same cs's + + if(($self->version() eq $cs->version()) && + ($self->name() eq $cs->name())){ + + #we need to make sure these are default CS, otherwise we can get into trouble with + #re-used or mismatched seq_region_ids between DB wih different default assemblies + + if (! $self->contains_schema_build($self->adaptor->db->_get_schema_build($cs->adaptor()))) { + + #Only warn first time this is seen + my $warning_key = $self->adaptor->db->_get_schema_build($cs->adaptor()).':'.$self->name().':'.$self->version; + + if(! exists $warnings{$warning_key}){ + warn 'You are using a schema_build('.$self->adaptor->db->_get_schema_build($cs->adaptor()).') which has no CoordSystem stored for '.$cs->version.". Defaulting to closest name version match.\n"; + $warnings{$warning_key} = 1; + } + } + return 1; + } + + return 0; +} + + + + +=head2 is_top_level + + Arg [1] : none + Example : if($coord_sys->is_top_level()) { ... } + Description: Returns true if this is the toplevel pseudo coordinate system. + The toplevel coordinate system is not a real coordinate system + which is stored in the database, but it is a placeholder that + can be used to request transformations or retrievals to/from + the highest defined coordinate system in a given region. + Returntype : 0 or 1 + Exceptions : none + Caller : general + Status : at risk - not implemented yet + +=cut + +sub is_top_level { + my $self = shift; + + throw('Not yet implmented, need to test against the core cache using dnadb/schema_build'); + + return $self->{'top_level'}; +} + + +#These attribute methods are largely redundant +#is_default is used by Feature Adaptors to restrict features to +#current default assembly for non slice based methods +#Especially redundant now we have implemented this in fetch_all + +=head2 is_sequence_level + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Example : if($coord_sys->is_sequence_level($dnadb)) { ... } + Description: Returns true if this is a sequence level coordinate system + for a given dnadb + Returntype : 0 or 1 + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub is_sequence_level { + my ($self, $dnadb) = @_; + + return $self->get_coord_system_attribute('sequence_level', $dnadb); +} + + +=head2 is_default + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Example : if($coord_sys->is_default($dnadb)) { ... } + Description: Returns true if this coordinate system is the default + version of the coordinate system of this name for a given dnadb. + Returntype : 0 or 1 + Exceptions : none + Caller : general - Used + Status : at risk + +=cut + +sub is_default { + my ($self, $dnadb) = @_; + + return $self->get_coord_system_attribute('default', $dnadb); +} + +sub get_coord_system_attribute{ + my($self, $attr_name, $dnadb) = @_; + + if(! ($dnadb && ref($dnadb) && $dnadb->isa('Bio::EnsEMBL::DBSQL::DBAdaptor'))){ + throw("You must pass a dnadb to access the CoordSystem attribute:\t $attr_name"); + } + + my $schema_build = $self->adaptor->db->_get_schema_build($dnadb); + + if(! $self->contains_schema_build($schema_build)){ + throw("CoordSystem does not contain the schema_build:\t$schema_build"); + } + + return $self->{'core_cache'}{$schema_build}{uc($attr_name)}; + +} + + +=head2 rank + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Example : if($cs1->rank($dnadb) < $cs2->rank($dnadb)) { + print $cs1->name(), " is a higher level coord system than", + $cs2->name(), "\n"; + } + Description: Returns the rank of this coordinate system for a given dnadb. + A lower number is a higher coordinate system. The highest level coordinate + system has a rank of 1 (e.g. 'chromosome'). The toplevel + pseudo coordinate system has a rank of 0. + Returntype : int + Exceptions : none + Caller : general + Status : at risk - not yet implemented + +=cut + +sub rank { + my ($self, $dnadb) = @_; + return $self->get_coord_system_attribute('rank', $dnadb); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/AnnotatedFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/AnnotatedFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,402 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::AnnotatedFeatureAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::AnnotatedFeatureAdaptor - A database adaptor for fetching and +storing AnnotatedFeature objects. + +=head1 SYNOPSIS + +my $afa = $db->get_AnnotatedFeatureAdaptor(); + +my $features = $afa->fetch_all_by_Slice($slice); + + +=head1 DESCRIPTION + +The AnnotatedFeatureAdaptor is a database adaptor for storing and retrieving +AnnotatedFeature objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::AnnotatedFeatureAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor); + +#Query extension stuff +use constant TRUE_TABLES => [['annotated_feature', 'af'], ['feature_set', 'fs']]; +use constant TABLES => [['annotated_feature', 'af'], ['feature_set', 'fs']]; + +#SetFeatureAdaptor::_default_where_clause specifies join to fs table + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + af.annotated_feature_id af.seq_region_id + af.seq_region_start af.seq_region_end + af.seq_region_strand af.feature_set_id + af.display_label af.score + af.summit + ); +} + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates AnnotatedFeature objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::AnnotatedFeature objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + + #For EFG this has to use a dest_slice from core/dnaDB whether specified or not. + #So if it not defined then we need to generate one derived from the species_name and schema_build of the feature we're retrieving. + + # This code is ugly because caching is used to improve speed + + + my ($sa, $seq_region_id); + $sa = $dest_slice->adaptor->db->get_SliceAdaptor() if($dest_slice);#don't really need this if we're using DNADBSliceAdaptor? + $sa ||= $self->db->dnadb->get_SliceAdaptor(); + + + #Some of this in now probably overkill as we'll always be using the DNADB as the slice DB + #Hence it should always be on the same coord system + my $fset_adaptor = $self->db->get_FeatureSetAdaptor(); + my @features; + my (%fset_hash, %slice_hash, %sr_name_hash, %sr_cs_hash); + + my ( + $annotated_feature_id, $efg_seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $fset_id, + $display_label, $score, + $summit + ); + + $sth->bind_columns( + \$annotated_feature_id, \$efg_seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$fset_id, + \$display_label, \$score, + \$summit + ); + + + my $asm_cs; + my $cmp_cs; + my $asm_cs_name; + my $asm_cs_vers; + my $cmp_cs_name; + my $cmp_cs_vers; + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + } + + + FEATURE: while ( $sth->fetch() ) { + #Need to build a slice adaptor cache here? + #Would only ever want to do this if we enable mapping between assemblies?? + #Or if we supported the mapping between cs systems for a given schema_build, which would have to be handled by the core api + + #get core seq_region_id + #This fails if we are using a 'comparable' CoordSystem as we don't have a cache + #for the new DB. Wasn't this fixed with the tmp seq_region_cache? + $seq_region_id = $self->get_core_seq_region_id($efg_seq_region_id); + + if(! $seq_region_id){ + warn "Cannot get slice for eFG seq_region_id $efg_seq_region_id\n". + "The region you are using is not present in the current seq_region_cache.\n". + "Maybe you need to redefine the dnadb or update_DB_for_release?"; + next; + } + + #Get the FeatureSet object + $fset_hash{$fset_id} = $fset_adaptor->fetch_by_dbID($fset_id) if(! exists $fset_hash{$fset_id}); + + + # Get the slice object + my $slice = $slice_hash{'ID:'.$seq_region_id}; + + if (! $slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{'ID:'.$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system if a mapper was provided + if ($mapper) { + + throw("Not yet implmented mapper, check equals are Funcgen calls too!"); + + ($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand) + = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); + + # Skip features that map to gaps or coord system boundaries + next FEATURE if !defined $sr_name; + + # Get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} + ||= $sa->fetch_by_region($cmp_cs_name, $sr_name, undef, undef, undef, $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} + ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # If a destination slice was provided convert the coords + # If the destination slice starts at 1 and is forward strand, nothing needs doing + if ($dest_slice) { + unless ($dest_slice_start == 1 && $dest_slice_strand == 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + # Throw away features off the end of the requested slice + if(! $self->force_reslice){ + #force_reslice set by RegulatoryFeature::regulatory_attributes + #so we don't lose attrs which are not on the dest_slice + + next FEATURE if $seq_region_end < 1 || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_name ne $sr_name ); + + $slice = $dest_slice; + } + } + + + + push @features, Bio::EnsEMBL::Funcgen::AnnotatedFeature->new_fast + ({ + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'adaptor' => $self, + 'dbID' => $annotated_feature_id, + 'score' => $score, + 'summit' => $summit, + 'display_label' => $display_label, + 'set' => $fset_hash{$fset_id}, + }); + } + + return \@features; +} + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::AnnotatedFeature objects + Example : $ofa->store(@features); + Description: Stores given AnnotatedFeature objects in the database. Should only + be called once per feature because no checks are made for + duplicates. Sets dbID and adaptor on the objects that it stores. + Returntype : Listref of stored AnnotatedFeatures + Exceptions : Throws if a list of AnnotatedFeature objects is not provided or if + the Analysis, CellType and FeatureType objects are not attached or stored + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @pfs) = @_; + + if (scalar(@pfs) == 0) { + throw('Must call store with a list of AnnotatedFeature objects'); + } + + my $sth = $self->prepare(" + INSERT INTO annotated_feature ( + seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + feature_set_id, display_label, score, summit + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + "); + + + my $db = $self->db(); + + FEATURE: foreach my $pf (@pfs) { + + if( !ref $pf || !$pf->isa('Bio::EnsEMBL::Funcgen::AnnotatedFeature') ) { + throw('Feature must be an AnnotatedFeature object'); + } + + if ( $pf->is_stored($db) ) { + #does not accomodate adding Feature to >1 feature_set + warning('AnnotatedFeature [' . $pf->dbID() . '] is already stored in the database'); + next FEATURE; + } + + + #Only need to check FeatureSet here, as FeatureSet store will check analysis + + if (! $pf->feature_set->is_stored($db)) { + throw('A stored Bio::EnsEMBL::Funcgen::FeatureSet must be attached to the AnnotatedFeature objects to be stored.'); + } + + + my $seq_region_id; + ($pf, $seq_region_id) = $self->_pre_store($pf); + + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $pf->start(), SQL_INTEGER); + $sth->bind_param(3, $pf->end(), SQL_INTEGER); + $sth->bind_param(4, $pf->strand(), SQL_TINYINT); + $sth->bind_param(5, $pf->feature_set->dbID(), SQL_INTEGER); + $sth->bind_param(6, $pf->display_label(), SQL_VARCHAR); + $sth->bind_param(7, $pf->score(), SQL_DOUBLE); + $sth->bind_param(8, $pf->summit, SQL_INTEGER); + + $sth->execute(); + $pf->dbID( $sth->{'mysql_insertid'} ); + $pf->adaptor($self); + } + + return \@pfs; +} + + +=head2 fetch_all_by_associated_MotifFeature + + Arg [1] : Bio::EnsEMBL::Funcgen::MotifFeature + Example : my $assoc_afs = $af_adaptor->fetch_all_by_associated_MotifFeature($motif_feature); + Description: Fetches all associated AnnotatedFeatures for a given MotifFeature. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::AnnotatedFeature objects + Exceptions : Throws if arg is not valid or stored + Caller : General + Status : At risk + +=cut + +sub fetch_all_by_associated_MotifFeature{ + my ($self, $mf) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::MotifFeature', $mf); + push @{$self->TABLES}, ['associated_motif_feature', 'amf']; + my $table_name = $mf->adaptor->_main_table->[0]; + + my $constraint = 'amf.annotated_feature_id=af.annotated_feature_id AND amf.motif_feature_id='.$mf->dbID; + my $afs = $self->generic_fetch($constraint); + $self->reset_true_tables; + + return $afs; +} + +### DEPRECATED METHODS ### + +sub fetch_all_by_Slice_FeatureSet { + my ($self, $slice, $fset) = @_; + deprecate('Please use generic method SetFeatureAdaptor::fetch_all_by_Slice_FeatureSets'); + return $self->fetch_all_by_Slice_FeatureSets($slice, [$fset]); +} + + +#sub fetch_all_by_Slice_Experiment # removed in v68 was never functional + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ArrayAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ArrayAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,541 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ArrayAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::ArrayAdaptor - A database adaptor for fetching and +storing Funcgen Array objects. + +=head1 SYNOPSIS + +my $oaa = $db->get_ArrayAdaptor(); + +my $array = $oaa->fetch_by_name_vendor('HG-U133A', 'AFFY'); +my @arrays = @{$oaa->fetch_all()}; + +=head1 DESCRIPTION + +The ArrayAdaptor is a database adaptor for storing and retrieving +Funcgen Array objects. + + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::Array +Bio::EnsEMBL::Funcgen::ArrayChip + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ArrayAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::Array; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +use constant TRUE_TABLES => [['array', 'a']]; +use constant TABLES => [['array', 'a']]; + + +=head2 fetch_by_array_chip_dbID + + Arg [1] : Int - dbID of an ArrayChip + Example : my $array = $array_adaptor->fetch_by_array_chip_dbID($ac_dbid); + Description: Retrieves Array object based on one of it's constituent ArrayChip dbIDs + Returntype : Bio::EnsEMBL::Funcgen::Array + Exceptions : None + Caller : General + Status : Stable + +=cut + +#Changed to use simple query extension +#Removed 1 query +#3.7 % or 1.04 times faster + +sub fetch_by_array_chip_dbID { + my ($self, $ac_dbid) = @_; + + throw('Must provide an ArrayChip dbID') if ! $ac_dbid; + + #Extend query tables + push @{$self->TABLES}, (['array_chip', 'ac']); + + #Extend query and group + my $array = $self->generic_fetch('ac.array_chip_id='.$ac_dbid.' and ac.array_id=a.array_id GROUP by a.array_id')->[0]; + $self->reset_true_tables; + + return $array; +} + + +=head2 fetch_by_name_vendor + + Arg [1] : String - Name of an array e.g. MoGene-1_0-st-v1 + Arg [2] : String - (optional) Name of vendor e.g. AFFY + Example : #You can list the possible array name and vendor values as follows + map print $_->vendor.' '.$_->name, @{$array_adaptor->fetch_all}; + + #Use some of the above values with this method + my $array = $array_adaptor->fetch_by_name_vendor('MoGene-1_0-st-v1', 'AFFY'); + Description: Retrieves a named Array object from the database. + Returntype : Bio::EnsEMBL::Funcgen::Array + Exceptions : Throws is name argument not defined + Throws if vendor argument not defined and more than one arrays is found + Caller : General + Status : Stable + +=cut + +sub fetch_by_name_vendor { + my ($self, $name, $vendor) = @_; + + throw("Must provide and name") if (! $name); + my @arrays; + + if(! $vendor){ + @arrays = @{$self->generic_fetch("a.name = '$name'")}; + + if(scalar(@arrays) > 1){ + throw("There is more than one array with this name please specify a vendor argument as one of:\t".join(' ', (map $_->vendor, @arrays))); + } + } + else{ + #name vendor is unique key so will only ever return 1 + @arrays = @{$self->generic_fetch("a.name = '$name' and a.vendor='".uc($vendor)."'")}; + } + + return $arrays[0]; +} + + +=head2 fetch_by_name_class + + Arg [1] : String - Name of an array + Arg [2] : String - Class of array e.g. AFFY_UTR + Example : #You can list the possible array name and class values as follows + map print $_->class.' '.$_->name, @{$array_adaptor->fetch_all}; + + #Use some of the above values with this method + my $array = $array_adaptor->fetch_by_name_class('HuGene_1_0_st_v1', 'AFFY_ST'); + Description: Retrieves Array object from the database based on name and class. + Returntype : Bio::EnsEMBL::Funcgen::Array + Exceptions : Throws is name and class not passed + Caller : General + Status : Stable + +=cut + +sub fetch_by_name_class { + my ($self, $name, $class) = @_; + throw("Must provide and array and class e.g.'HuGene_1_0_st_v1', 'AFFY_ST'") if (! ($name && $class)); + + #name class is unique key so will only ever return 1 + return $self->generic_fetch("a.name = '$name' and a.class='".uc($class)."'")->[0]; +} + + +=head2 fetch_all_by_class + + Arg [1] : String - Class e.g. ILLUMINA_WG + Example : #You can list the possible array class values as follows + map print $_->class, @{$array_adaptor->fetch_all}; + + #Use some one the above values with this method + my $array = $array_adaptor->fetch_all_by_class('AFFY_ST'); + Description: Retrieves Array object from the database based class. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : Throws if nor class passed + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_class { + my ($self, $class) = @_; + + throw("Must provide and array class e.g.'AFFY_ST'") if (! defined $class); + return $self->generic_fetch("a.class='".uc($class)."'"); +} + + +=head2 fetch_all_by_type + + Arg [1] : List of strings - type(s) (e.g. OLIGO, PCR) + Example : my @arrays = @{$array_adaptor->fetch_all_by_type('OLIGO')}; + Description: Fetch all arrays of a particular type. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : Throws if no type is provided + Caller : General + Status : at risk - needs deprecating and changing to fetch_all_by_types + +=cut + +#Is this used in the API anywhere? + +sub fetch_all_by_type { + my ($self, @types) = @_; + + throw('Need type as parameter') if ! @types; + + my $constraint; + if (scalar @types == 1) { + $constraint = qq( a.type = '$types[0]' ); + } else { + $constraint = join q(','), @types; + $constraint = qq( a.type IN ('$constraint') ); + } + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_Experiment + + Arg [1] : Bio::EnsEMBL::Funcgen::Experiment + Example : my @arrays = @{$array_adaptor->fetch_all_by_Experiment($exp)}; + Description: Fetch all arrays associated with a given Experiment + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : none + Caller : General + Status : Stable + +=cut + +#Changed to use simple query extension +#Removed 2 queries +#96.4% or 26.7 times faster!!! + +sub fetch_all_by_Experiment{ + my ($self, $exp) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Experiment', $exp); + + #Extend query tables + push @{$self->TABLES}, (['array_chip', 'ac'], ['experimental_chip', 'ec']); + + #Extend query and group + my $arrays = $self->generic_fetch($exp->dbID.'=ec.experiment_id and ec.array_chip_id=ac.array_chip_id and ac.array_id=a.array_id GROUP by a.array_id'); + + $self->reset_true_tables; + + return $arrays; +} + + +=head2 fetch_all_by_ProbeSet + + Arg [1] : Bio::EnsEMBL::Funcgen::ProbeSet + Example : my @arrays = @{$aa->fetch_all_by_ProbeSet($probeset)}; + Description: Fetch all arrays containing a given ProbeSet + This is a convenience method to hide the 2 adaptors required + for this call. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : none + Caller : General + Status : at risk + +=cut + + +#Changed to use simple query extension +#Removed 1 query and hash loop +#This is only 1.04 times faster or ~ 4% + +sub fetch_all_by_ProbeSet{ + my ($self, $pset) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ProbeSet', $pset); + + #Extend query tables + push @{$self->TABLES}, (['array_chip', 'ac'], ['probe', 'p']); + + #Extend query and group + my $arrays = $self->generic_fetch('p.probe_set_id='.$pset->dbID.' and p.array_chip_id=ac.array_chip_id and ac.array_id=a.array_id GROUP BY a.array_id'); + # ORDER BY NULL');#Surpresses default order by group columns. Actually slower? Result set too small? + + $self->reset_true_tables; + + return $arrays; +} + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : Stable + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : Stable + +=cut + +sub _columns { + my $self = shift; + + return qw( a.array_id a.name a.format a.vendor a.description a.type a.class); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $array_id, $name, $format, $vendor, $description, $type, $class); + + $sth->bind_columns(\$array_id, \$name, \$format, \$vendor, \$description, \$type, \$class); + + while ( $sth->fetch() ) { + + my $array = Bio::EnsEMBL::Funcgen::Array->new + ( + -dbID => $array_id, + -adaptor => $self, + -name => $name, + -format => $format, + -vendor => $vendor, + -description => $description, + -type => $type, + -class => $class, + ); + + push @result, $array; + + + } + return \@result; +} + + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::Array objects + Example : $oaa->store($array1, $array2, $array3); + Description: Stores given Array objects in the database. This + method checks for arrays previously stored and updates + and new array_chips accordingly. + Returntype : Listref of stored Array objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +#This works slightly differently as arary_chip are not stored as objects, +#yet we need to retrieve a dbID for the array before we know about all the array_chips + +sub store { + my $self = shift; + my @args = @_; + + my ($sarray); + + my $sth = $self->prepare(" + INSERT INTO array + (name, format, vendor, description, type, class) + VALUES (?, ?, ?, ?, ?, ?)"); + + + foreach my $array (@args) { + if ( !$array->isa('Bio::EnsEMBL::Funcgen::Array') ) { + warning('Can only store Array objects, skipping $array'); + next; + } + + if (!( $array->dbID() && $array->adaptor() == $self )){ + #try and fetch array here and set to array if valid + $sarray = $self->fetch_by_name_vendor($array->name(), $array->vendor());#this should be name_vendor? + + if( ! $sarray){ + #sanity check here + throw("Array name must not be longer than 30 characters") if (length($array->name) > 40); + $sth->bind_param(1, $array->name(), SQL_VARCHAR); + $sth->bind_param(2, $array->format(), SQL_VARCHAR); + $sth->bind_param(3, $array->vendor(), SQL_VARCHAR); + $sth->bind_param(4, $array->description(), SQL_VARCHAR); + $sth->bind_param(5, $array->type(), SQL_VARCHAR); + $sth->bind_param(6, $array->class(), SQL_VARCHAR); + + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $array->dbID($dbID); + $array->adaptor($self); + } + else{ + #warn("Array already stored, using previously stored array\n");# validating array_chips\n"); + $array = $sarray; + } + } + } + + return \@args; +} + + + + +=head2 fetch_probe_count_by_Array + + Args : None + Example : my $probe_count = @{$array_adaptor->fetch_probe_count_by_Array($array)}; + Description: Counts probes on given array + Returntype : ints + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_probe_count_by_Array{ + my ($self, $array) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Array', $array); + + my ($count) = @{$self->dbc->db_handle->selectrow_arrayref('select count(distinct probe_id) from array_chip ac, probe p where ac.array_id='.$array->dbID.' and ac.array_chip_id=p.array_chip_id')}; + + return $count; +} + + +=head2 fetch_Probe_dbIDs_by_Array + +Arg [1] : Bio::EnsEMBL::Funcgen::Array +Example : my @dbids = @{$array_adaptor->fetch_Probe_dbIDs_by_Array($array)} +Description: Fetches a arrayref of Probe dbIDs for a given Array +Returntype : arrayref of Probe dbIDs +Exceptions : None +Caller : General +Status : at risk + +=cut + +sub fetch_Probe_dbIDs_by_Array{ + my ($self, $array) = @_; + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Array', $array); + + my $sql = sprintf(qq/ +SELECT distinct p.probe_id +FROM probe p +WHERE array_chip_id in( %s ) /, join( ',', @{$array->get_array_chip_ids} ) ); + + my $sth = $self->prepare( $sql ); + $sth->execute || die ($sth->errstr); + my @dbids = map{$_->[0]} @{$sth->fetchall_arrayref}; + + return \@dbids; +} + +=head2 fetch_Probe_name2dbID_by_Array + +Arg [1] : Bio::EnsEMBL::Funcgen::Array +Example : my %name2dbid = %{$array_adaptor->fetch_Probe_name2dbID_by_Array($array)} +Description: Fetches a hashref of Probe dbIDs keyed by probe name + for all Probes of a given Array +Returntype : Hashref for Probe dbIDs keyed by probe name +Exceptions : None +Caller : General +Status : at risk + +=cut + +sub fetch_Probe_name2dbID_by_Array{ + my ($self, $array) = @_; + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Array', $array); + + my $sql = sprintf(qq/ +SELECT p.name, p.probe_id +FROM probe p +WHERE array_chip_id in( %s ) /, join( ',', @{$array->get_array_chip_ids} ) ); + + my $sth = $self->prepare( $sql ); + $sth->execute || die ($sth->errstr); + my %mapping; + map{$mapping{$_->[0]}=$_->[1]} @{$sth->fetchall_arrayref}; + return \%mapping; +} + + + +sub check_status_by_class{ + my ($self, $status, $class) = @_; + + foreach my $array(@{$self->fetch_all_by_class($class)}){ + + foreach my $ac(@{$array->get_ArrayChips}){ + + if(! $ac->has_status($status)){ + throw('Found '.$class.' ArrayChip '.$ac->name." without $status status"); + } + } + } + + return; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ArrayChipAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ArrayChipAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,344 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ArrayChipAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::ArrayChipAdaptor - A database adaptor for fetching and +storing Funcgen ArrayChip objects. + +=head1 SYNOPSIS + +my $ac_adaptor = $db->get_ArrayChipAdaptor(); + +my @achips = @{$ec_adaptor->fetch_all_by_Array($array)}; + + +=head1 DESCRIPTION + +The ArrayChipAdaptor is a database adaptor for storing and retrieving +Funcgen ArrayChip objects. + + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ArrayChipAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::ArrayChip; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); + + +#May need to our this? +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +=head2 fetch_all_by_array_id + + Arg [1] : int - dbID of Array + Example : my @ccs = @{$ec_a->fetch_all_by_array_dbID($array->dbID()); + Description: Does what it says on the tin + Returntype : Listref of Bio::EnsEMBL::Funcgen::ArrayChip + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub fetch_all_by_array_id { + my $self = shift; + my $array_id = shift; + + throw("Must specify an array dbID") if(! $array_id); + + my $constraint = "ac.array_id='$array_id'"; + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_Array + + Arg [1] : Bio::EnsEMBL::Funcgen::Array + Example : my @ccs = @{$ec_a->fetch_all_by_Array($array); + Description: Returns all ArrayChips which belong to the given Array + Returntype : Arrayref of Bio::EnsEMBL::Funcgen::ArrayChip objects + Exceptions : Throws if ARg not valid and stored + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Array { + my ($self, $array) = @_; + + if(! (ref($array) && $array->isa->('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())){ + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::Array"); + } + + return $self->fetch_all_by_array_id($array->dbID); +} + + +=head2 fetch_all_by_ExperimentalChips + + Arg [1] : arrayref of - Bio::EnsEMBL::Funcgen::ExperimentalChips + Example : my @achips = @{$ec_a->fetch_all_by_ExperimentalChips($echips); + Description: Gets a non-redundant list of the corresponding ArrayChips + Returntype : Listref of Bio::EnsEMBL::Funcgen::ArrayChips + Exceptions : Throws if no ExperimentalChips passed or if ExperimentalChips are not stored or valid + Caller : General + Status : at risk + +=cut + +sub fetch_all_by_ExperimentalChips { + my ($self, $echips) = @_; + + my %ac_ids; + + foreach my $echip(@$echips){ + + if(! ($echip->isa('Bio::EnsEMBL::Funcgen::ExperimentalChip') && $echip->dbID())){ + throw('Must provide an arrayref of valid stored Bio::EnsEMBL::Funcgen::ExperimentalChips'); + } + + $ac_ids{$echip->array_chip_id} = 1; + + } + + if(! keys(%ac_ids)){ + throw('Must provide an arrayref of valid stored Bio::EnsEMBL::Funcgen::ExperimentalChips'); + } + + return $self->generic_fetch('ac.array_chip_id IN ('.join(', ', keys(%ac_ids)).')'); +} + + +=head2 fetch_by_array_design_ids + + Arg [1] : int - dbID of Array + Arg [2] : string - Design ID of ArrayChip + Example : my $ac = $ac_adaptpr->fetch_by_array_design_ids($array->dbID, $design_id); + Description: Does what it says on the tin + Returntype : Bio::EnsEMBL::Funcgen::ArrayChip + Exceptions : Throws if args not met. + Caller : General + Status : Medium Risk + +=cut + +sub fetch_by_array_design_ids{ + my ($self, $array_id, $design_id) = @_; + + if( ! ($array_id && $design_id)){ + throw('You must pass an Array dbID and a ArrayChip design ID'); + } + + my $constraint = "ac.array_id='$array_id' and ac.design_id='$design_id'"; + + my ($ac) = @{$self->generic_fetch($constraint)}; + #unique key means this always has just one element + + return $ac; +} + + + + + +#fetch by Array_array_chip_name?? +#would need this if we're going to check for previously imported ArrayChips, as there's no guarantee that the design_name will be populated. + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _tables { + my $self = shift; + + return ['array_chip', 'ac']; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( ac.array_chip_id ac.design_id ac.array_id ac.name); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ArrayChip objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $ac_id, $design_id, $array_id, $name); + + $sth->bind_columns(\$ac_id, \$design_id, \$array_id, \$name); + + while ( $sth->fetch() ) { + my $array = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -dbID => $ac_id, + -design_id => $design_id, + -array_id => $array_id, + -name => $name, + -adaptor => $self, + ); + + push @result, $array; + + } + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ArrayChip objects + Example : $arraychip_adaptor->store($ac1, $ac2, $ac3); + Description: Stores given ArrayChip objects in the database. Should only be + called once per ArrayChip because no checks are made for duplicates. + Returntype : ARRAYREF + Exceptions : warns if passed non-ArrayChip arg + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + #my ($stored_ac); + + #Should we implement a throw here is the caller is not Array? + #make private _store? + + + my $sth = $self->prepare(" + INSERT INTO array_chip + (design_id, array_id, name) + VALUES (?, ?, ?)" + ); + + + + foreach my $ac (@args) { + if ( ! $ac->isa('Bio::EnsEMBL::Funcgen::ArrayChip') ) { + warning('Can only store ExperimentalChip objects, skipping $ec'); + next; + } + + throw("ArrayChip must have an array_id to be stored") if ! $ac->array_id(); + + #check for array_id here? this is done by not null in sql + + + #check for previously stored array_chips is done in Array via add_ArrayChip + + + + if (!( $ac->dbID() && $ac->adaptor() == $self )){ + + my $s_ac = $self->fetch_by_array_design_ids($ac->array_id(), $ac->design_id()); + + if(! $s_ac){ + + $sth->bind_param(1, $ac->design_id(), SQL_VARCHAR); + $sth->bind_param(2, $ac->array_id(), SQL_INTEGER); + $sth->bind_param(3, $ac->name(), SQL_VARCHAR); + $sth->execute(); + + my $dbID = $sth->{'mysql_insertid'}; + $ac->dbID($dbID); + $ac->adaptor($self); + + }else{ + $ac = $s_ac; + # my @states = @{$self->db->fetch_all_states('experimental_chip', $ec->dbID())}; + # warn("Using previously stored ExperimentalChip (".$ec->unique_id().") with states\t@states\n"); + } + } + } + return \@args; +} + + +=head2 list_dbIDs + + Args : None + Example : my @array_ids = @{$ec_a->list_dbIDs()}; + Description: Gets an array of internal IDs for all ArrayChip objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + return $self->_list_dbIDs('array_chip'); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/BaseAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/BaseAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,912 @@ + +# +# Perl module for Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor - Simple wrapper class for Funcgen StorableAdaptors + +=head1 SYNOPSIS + +$adaptor->store_states($storable); + +=head1 DESCRIPTION + +This is a simple wrapper class to hold common methods to all Funcgen StorableAdaptors. +Includes status methods. + +=head1 SEE ALSO + +Bio::EnsEMBL::DBSQL::BaseAdaptor + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; +require Exporter; +use vars qw(@ISA @EXPORT); +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use DBI qw(:sql_types); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor Exporter); +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + + + +=head2 compose_constraint_query + + Arg [1] : Hash - Params hash containing 'constraints' key value pairs + Example : my @fsets = $fs_adaptopr->fetch_all_by_FeatureType($type); + Description: Retrieves FeatureSet objects from the database based on feature_type id. + Returntype : Listref of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if arg is not a valid FeatureType + Caller : General + Status : At Risk + +=cut + +#This approach originated in the need for a fully flexible method call +#for the FeatureSetAdaptor to support the Experiment view. +#Add support for final_clause? +#Will still be useful to define an array of valid constraints in the descendant adaptor +#This will enable us to restrict the generic constraints(in here) for a given adaptor +#and dynamically provide a list of valid constraint in the error output + +sub compose_constraint_query{ + my ($self, $params) = @_; + + if($params && + (ref($params) ne 'HASH') ){ + throw('You must pass a valid params HASHREF to compose_constraint_query'); + } + + + my @constraints; + + if( exists ${$params}{constraints}){ + + my @filter_names = keys (%{$params->{constraints}}); + + foreach my $constraint_key(keys (%{$params->{constraints}})){ + + my $constrain_method = '_constrain_'.$constraint_key; + + if(! $self->can($constrain_method)){ + throw($constraint_key." is not a valid constraint"); + + #Need to add test on and list valid constraints + + # please specify values for one of:\t". + # join(', ', keys(%constraint_config))); + } + + my ($constraint, $constraint_conf) = $self->$constrain_method($params->{constraints}{$constraint_key}); + push @constraints, $constraint; + + + #Currently only handle tables here but could also + #set other dynamic config e.g. final clause + + if (exists ${$constraint_conf}{tables}) { + push @{$self->TABLES}, @{$constraint_conf->{tables}}; + } + } # END OF CONSTRAINTS + } # END OF $PARAMS + + return join(' AND ', @constraints) || ''; +} + + + +sub reset_true_tables{ + my $self = shift; + + #deref to avoid modifying TRUE_TABLES + @{$self->TABLES} = @{$self->TRUE_TABLES}; + return; +} + +=head2 store_states + + Arg [1] : Bio::EnsEMBL::Funcgen::Storable + Example : $rset_adaptor->store_states($result_set); + Description: Stores states of Storable in status table. + Returntype : None + Exceptions : Throws if Storable is not stored + Caller : General + Status : At Risk + +=cut + +#do we want to keep the IMPORTED_CS status for feature_sets/array_chips? +#rename to MAPPED_CS_N? + +sub store_states{ + my ($self, $storable) = @_; + + throw('Must pass a Bio::EnsEMBL::Funcgen::Storable') if(! $storable->isa("Bio::EnsEMBL::Funcgen::Storable")); + + foreach my $state(@{$storable->get_all_states()}){ + + $self->store_status($state, $storable) if (! $self->has_stored_status($state, $storable)); + } + + return; + +} + +=head2 fetch_all + + Arg[1] : string - optional status name e.g. 'DISPLAYABLE' + Example : my @dsets = @{$dsa->fetch_all()}; + Description: Gets all available objects from the DB, which + might not be a good idea, shouldnt be called on + the BIG tables though + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all { + my ($self, $status) = @_; + + my ($constraint); + #Can we throw here if we're trying to get all from known large tables + + + $constraint = $self->status_to_constraint($status) if $status; + + + if(defined $status && ! defined $constraint){ + my @tables = $self->_tables; + my ($table_name, undef) = @{$tables[0]}; + + warn "You are trying to fetch $table_name entries which have a $status status, which is not present in the DB"; + return []; + } + + return $self->generic_fetch($constraint); + +} + + + +=head2 fetch_all_displayable + + Example : my @displayable_dset = @{$dsa->fetch_all_displayable()}; + Description: Gets all displayable DataSets + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : At Risk - can we just reimplement fetch_all with optional status arg + +=cut + +sub fetch_all_displayable{ + my $self = shift; + return $self->fetch_all_by_status('DISPLAYABLE'); +} + +=head2 fetch_all_by_status + + Arg [1] : string - status e.g. 'DISPLAYABLE' + Example : my @displayable_dset = @{$dsa->fetch_all_by_status('DISPLAYABLE')}; + Description: Gets all DataSets with given status + Returntype : ARRAYREF + Exceptions : Throws is no status defined + Warns if + Caller : General + Status : At Risk - To be removed + +=cut + +sub fetch_all_by_status{ + my ($self, $status) = @_; + + deprecate('Use fetch_all($status) instead'); + return $self->fetch_all($status); + +} + + +#Can we not just re implement fetch_all here to have the optional status arg? + + + +=head2 status_to_constraint + + Arg [1] : string - status e.g. 'DISPLAYABLE' + Arg [2] : string - Constraint + Example : $sql = $self->status_to_constraint($self, $constraint, $status); + Description: Appends the appropriate status constraint dependant on the BaseAdaptor sub class. + Returntype : string - constraint + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::DBSQL::"BaseAdaptors" + Status : At risk + +=cut + +sub status_to_constraint{ + my ($self, $status) = @_; + + #This is now supported in compose_constraint_query + #Which avoid some of the problems below + + my $constraint; + + #This will throw if status not valid, but still may be absent + my $status_id = $self->_get_status_name_id($status); + + + return if (! $status_id); + + my @tables = $self->_tables; + my ($table_name, $syn) = @{$tables[0]}; + + my @status_ids; + + my $sql = "SELECT table_id from status where table_name='$table_name' and status_name_id='$status_id'"; + @status_ids = map $_ = "@$_", @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + + #This is causing problems as we might get none, which will invalidate the sql + #Hence we return nothing + + $constraint = " $syn.${table_name}_id IN (".join(',', @status_ids).")" if @status_ids; + return $constraint; +} + + + +=head2 _test_funcgen_table + + Arg [1] : Bio::EnsEMBL::"OBJECT" + Example : $status_a->_is_funcgen_object($experimental_chip)}; + Description: Tests if the object is a valid funcgen object with an identifiable table_name + Returntype : string - table_name + Exceptions : Throws if argument if a Bio::EnsEMBL::Funcgen::"OBJECT" not supplied + Throws if not table name identified + Caller : general + Status : At risk + +=cut + + + +sub _test_funcgen_table{ + my ($self, $obj) = @_; + + #Can we change this to is_stored_and_valid for a Storable? + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Storable', $obj); + #Does this test for ad + + my @tables = $obj->adaptor->_tables; + + my ($table) = @{$tables[0]}; + #InputSubSet fix, as doesn't have own adaptor + $table = 'input_subset' if $obj->isa('Bio::EnsEMBL::Funcgen::InputSubset'); + + return $table || $self->throw("Cannot identify table name from $obj adaptor"); +} + + + +=head2 fetch_all_states + + Arg [1] : Bio::EnsEMBL::"OBJECT" + Arg [2...] : listref of states + Example : my @ec_states = @{$status_a->fetch_all_states($experimental_chip)}; + Description: Retrieves all states associated with the given "OBJECT" + Returntype : ARRAYREF + Exceptions : None + Caller : general + Status : At risk + +=cut + +sub fetch_all_states{ + my ($self, $obj) = @_; + + my $table = $self->_test_funcgen_table($obj); + + + my $sql = "SELECT name FROM status_name sn, status s WHERE s.table_name='$table' AND s.table_id='".$obj->dbID()."' and s.status_name_id=sn.status_name_id"; + + my @states = map $_ = "@$_", @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + + return \@states; +} + + + + +=head2 has_stored_status + + Arg [1] : string - status e.g. IMPORTED, DISPLAYABLE + Arg [2] : Bio::EnsEMBL::Storable + Example : if($status_a->has_stored_status('IMPORTED', $array){ ... skip import ... }; + Description: Tests wether a given object has a given state + Returntype : BOOLEAN + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::BaseAdaptor + Status : At risk + +=cut + + + +sub has_stored_status{ + my ($self, $state, $obj) = @_; + + my (@row); + + #Only used for set_status, merge with set_status? + my $status_id = $self->_get_status_name_id($state); + my $table = $self->_test_funcgen_table($obj); + + + + if($status_id){ + my $sql = "SELECT status_name_id FROM status WHERE table_name=\"$table\" AND table_id=\"".$obj->dbID()."\" AND status_name_id=\"$status_id\""; + + #could just return the call directly? + @row = $self->db->dbc->db_handle->selectrow_array($sql); + } + + return (@row) ? 1 : 0; +} + + + + +=head2 store_status + + Arg [1] : string - status e.g. IMPORTED, DISPLAYABLE + Arg [2] : Bio::EnsEMBL::"OBJECT" + Example : $status_a->store_status('IMPORTED', $array_chip); + Description: Sets a state for a given object + Returntype : None + Exceptions : None + Caller : general + Status : At risk - Move to Status.pm? + +=cut + + +sub store_status{ + my ($self, $state, $obj) = @_; + + my $sql; + my $table = $self->_test_funcgen_table($obj); + + if(! $self->has_stored_status($state, $obj)){ + my $status_id = $self->_get_status_name_id($state); + + if(! $status_id){ + throw("$state is not a valid status_name for $obj:\t".$obj->dbID); + } + + $sql = "INSERT into status(table_id, table_name, status_name_id) VALUES('".$obj->dbID()."', '$table', '$status_id')"; + $self->db->dbc->do($sql); + + #Setting it in the obj if it is not already present. + $obj->add_status($state) if(! $obj->has_status($state, $obj)); + } + + return; +} + + +=head2 revoke_status + + Arg [1] : string - status name e.g. 'IMPORTED' + Arg [2] : Bio::EnsEMBL::Funcgen::Storable + Example : $rset_adaptor->revoke_status('DAS DISPLAYABLE', $result_set); + Description: Revokes the given state of Storable in status table. + Returntype : None + Exceptions : Warns if storable does not have state + Throws is status name is not valid + Throws if not state passed + Caller : General + Status : At Risk + +=cut + + +sub revoke_status{ + my ($self, $state, $storable) = @_; + + throw('Must provide a status name') if(! defined $state); + my $table_name = $self->_test_funcgen_table($storable); + my $status_id = $self->_get_status_name_id($state); + + #hardcode for InputSubset + $table_name = 'input_subset' if $storable->isa('Bio::Ensembl::Funcgen:InputSubset'); + + + if(! $self->has_stored_status($state, $storable)){ + warn $storable.' '.$storable->dbID()." does not have status $state to revoke\n"; + return; + } + + #do sanity checks on table to ensure that IMPORTED does not get revoke before data deleted? + #how would we test this easily? + + my $sql = "delete from status where table_name='${table_name}'". + " and status_name_id=${status_id} and table_id=".$storable->dbID(); + + $self->db->dbc->db_handle->do($sql); + + #now splice from status array; + #splice in loop should work as we will only see 1 + #Just hash this? + + for my $i(0..$#{$storable->{'states'}}){ + + if($storable->{'states'}->[0] eq $state){ + splice @{$storable->{'states'}}, $i, 1; + last; + } + } + + return; +} + +=head2 revoke_states + + Arg [1] : Bio::EnsEMBL::Funcgen::Storable + Example : $rset_adaptor->revoke_status($result_set); + Description: Revokes all states of Storable in status table. + Returntype : Bio::EnsEMBL::Funcgen::Storable + Exceptions : None + Caller : General + Helper rollback methods + Status : At Risk + +=cut + + +sub revoke_states{ + my ($self, $storable) = @_; + + my $table_name = $self->_test_funcgen_table($storable); + + #hardcode for ExperimentalSubset as this uses the ExperimentalSetAdaptor + $table_name = 'experimental_subset' if $storable->isa('Bio::Ensembl::Funcgen:ExperimentalSubset'); + + my $sql = "delete from status where table_name='${table_name}'". + " and table_id=".$storable->dbID(); + + $self->db->dbc->db_handle->do($sql); + + #Clear stored states + undef $storable->{'states'}; + + return $storable; +} + + +=head2 set_imported_states_by_Set + + Arg [1] : Bio::EnsEMBL::Funcgen::Set e.g. a FeatureSet or ResultSet + Example : $self->set_imported_states_by_Set($set); + Description: Sets default states for imported Feature|ResultSets + Returntype : None + Exceptions : None + Caller : Import parsers and RunnableDBs + Status : At risk - move to BaseImporter + +=cut + +#This needs to be used by RunnableDBs too! +#All state stuff is handled by BaseAdaptor? +#Can we put this in the SetAdaptor? + +sub set_imported_states_by_Set{ + my ($self, $set) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Set', $set); + #This should really be restricted to FeatureSet and ResultSet + + #Store default states for FeatureSets + #DAS_DISPLAYABLE IMPORTED_'CSVERSION' + #These need to insert ignore as may already be present? + #Insert ignore may not catch an invalid status + #So add states and store states as this checks + $set->adaptor->store_status('DAS_DISPLAYABLE', $set); + + + #To get assembly version here we need to + # 1 get the current default chromosome version + # or/and + # 2 Use the assembly param to guess it from the coord_sys table + # #This may pose problems for DB names which use numbers in their genebuild version + # Would need to set this as a flag and/or specify the genebuild version too + # Currently dnadb is set to last dnadb with 'assembly' as default version + # We should match as test, just to make sure + + #Get default chromosome version for this dnadb + my $cs_version = $self->db->dnadb->get_CoordSystemAdaptor->fetch_by_name('chromosome')->version; + + #Sanity check that it matches the assembly param? + #Woould need to do this if ever we loaded on a non-default cs version + + $set->adaptor->store_status("IMPORTED_${cs_version}", $set); +} + + + + +=head2 status_filter + + Arg [1] : string - status e.g. IMPORTED, DISPLAYABLE + Arg [2] : string - table name e.g. experimental_chip + Arg [3] : list - table dbIDs + Exmaple : my @displayable_ec_ids = @{$ec_adaptor->status_filter('DISPLAYABLE', + 'experimental_chip', + (map $_->dbID, @echips))}; + Description: Quick method for filtering dbIDs based on their table and and status + Returntype : ARRAYREF + Exceptions : Warns if state already set + Throws is status name is not already stored. + Caller : general - ResultSetAdaptor->fetch_Resultfeatures_by_Slice + Status : At risk - Move to Status? + +=cut + + +sub status_filter{ + my ($self, $status, $table_name, @table_ids) = @_; + + + my @status_ids; + + my $status_id = $self->_get_status_name_id($status); + + + return \@status_ids if(! $status_id); + + throw("Must provide a table_name and table_ids to filter non-displayable ids") if(! ($table_name && @table_ids)); + + my $sql = "SELECT table_id from status where table_name='$table_name' and table_id in (".join(", ", @table_ids).") and status.status_name_id='$status_id'"; + + + @status_ids = map $_ = "@$_", @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + + return \@status_ids; + +} + + +=head2 _get_status_name_id + + Arg [1] : string - status e.g. IMPORTED, DISPLAYABLE + Example : my $status_id = $self->_get_status_name_id('IMPORTED'); + Description: Retrieves the dbID of a given status_name + Returntype : INT + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::BaseAdaptor + Status : At risk - Move to Status? + +=cut + + +sub _get_status_name_id{ + my ($self, $status) = @_; + + #$self->_validate_status($status); + + my $sql = "SELECT status_name_id from status_name where name='$status'"; + my $ref = $self->db->dbc->db_handle->selectrow_arrayref($sql); + + my ($status_id) = @$ref if $ref; + + + #we should throw here? + #To force manual addition of the status_name + #need to make sure all status_names which are explicitly used by API + #are stored in all DBs, else we could find ourselves with broken code + #for sparsely populated DBs + + throw("Status name $status is not valid. Maybe you need to add it to the status_name table?") if ! $status_id; + + + return $status_id; +} + + + + +=head2 fetch_all_by_external_name + + Arg [1] : String $external_name + An external identifier of the feature to be obtained + Arg [2] : (optional) String $external_db_name + The name of the external database from which the + identifier originates. + Example : my @features = + @{ $adaptor->fetch_all_by_external_name( 'NP_065811.1') }; + Description: Retrieves all features which are associated with + an external identifier such as an Ensembl Gene or Transcript + stable ID etc. Usually there will only be a single + feature returned in the list reference, but not + always. Features are returned in their native + coordinate system, i.e. the coordinate system in which + they are stored in the database. If they are required + in another coordinate system the Feature::transfer or + Feature::transform method can be used to convert them. + If no features with the external identifier are found, + a reference to an empty list is returned. + Returntype : arrayref of Bio::EnsEMBL::Funcgen::Storable objects + Maybe any Feature, FeatureType, Probe or ProbeSet + Exceptions : Warns if method not available for given object adaptor + Caller : general + Status : at risk + +=cut + +sub fetch_all_by_external_name { + my ( $self, $external_name, $external_db_name ) = @_; + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + my (@ids, $type, $type_name); + ($type = ref($self)) =~ s/.*:://; + $type =~ s/Adaptor$//; + ($type_name = $type) =~ s/Feature$/_feature/; + my $xref_method = 'list_'.lc($type_name).'_ids_by_extid'; + + if(! $entryAdaptor->can($xref_method)){ + warn "Does not yet accomodate $type external names"; + return []; + } + + #Would be better if _list_ method returned and arrayref + return $self->fetch_all_by_dbID_list([$entryAdaptor->$xref_method($external_name, $external_db_name)]); +} + + +=head2 fetch_all_by_external_names + + Arg [1] : ARRAYREF of strings. External identifiers of the features to be obtained + Arg [2] : (optional) String $external_db_name + The name of the external database from which the + identifier originates. + Example : my @features = + @{ $adaptor->fetch_all_by_external_names(['ENST00003548913', ...])}; + Description: Retrieves all features which are associated with + the external identifiers such as a Ensembl gene or transcript + stable IDs, etc. Features are returned in their native + coordinate system, i.e. the coordinate system in which + they are stored in the database. If they are required + in another coordinate system the Feature::transfer or + Feature::transform method can be used to convert them. + If no features with the external identifier are found, + a reference to an empty list is returned. + Returntype : arrayref of Bio::EnsEMBL::Funcgen::Storable objects + Maybe any Feature, FeatureType, Probe or ProbeSet + Exceptions : Warns if xref method not available for given object adaptor + Caller : general + Status : at risk + +=cut + +sub fetch_all_by_external_names{ + my ( $self, $external_names, $external_db_name ) = @_; + + my $entryAdaptor = $self->db->get_DBEntryAdaptor(); + my ($type, $type_name); + ($type = ref($self)) =~ s/.*:://; + $type =~ s/Adaptor$//; + ($type_name = $type) =~ s/Feature$/_feature/; + my $xref_method = 'list_'.lc($type_name).'_ids_by_extids'; + + + if(! $entryAdaptor->can($xref_method)){ + warn "Does not yet accomodate $type external names"; + return []; + } + + + #Would be better if _list_ method returned and arrayref + my @ids = $entryAdaptor->$xref_method($external_names, $external_db_name); + + return $self->fetch_all_by_dbID_list(\@ids); +} + + +=head2 fetch_all_by_linked_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript + Example : my @psets = + @{ $probe_set_adaptor->fetch_all_by_linked_Transcript($tx_obj); + Description: Retrieves all features which are associated with + the given Ensembl Transcript. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Storable objects + Maybe any Feature, FeatureType, Probe or ProbeSet + Exceptions : Throws if arguments not valid + Caller : general + Status : at risk + +=cut + +sub fetch_all_by_linked_Transcript{ + my ($self, $tx) = @_; + + if(! $tx || + ! (ref($tx) && $tx->isa('Bio::EnsEMBL::Transcript') && $tx->dbID)){ + throw('You must provide a valid stored Bio::EnsEMBL:Transcript object'); + } + + return $self->fetch_all_by_external_name($tx->stable_id, $self->db->species.'_core_Transcript') +} + +=head2 fetch_all_by_linked_transcript_Gene + + Arg [1] : Bio::EnsEMBL::Gene + Example : my @psets = + @{ $probe_set_adaptor->fetch_all_by_linked_transcript_Gene($gene_obj); + Description: Retrieves all features which are indirectly associated with + the given Ensembl Gene, through it's Transcripts. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Storable objects + Maybe any Feature, FeatureType, Probe or ProbeSet + Exceptions : Throws if arguments not valid + Caller : general + Status : at risk + +=cut + + +sub fetch_all_by_linked_transcript_Gene{ + my ( $self, $gene ) = @_; + + if(! $gene || + ! (ref($gene) && $gene->isa('Bio::EnsEMBL::Gene') && $gene->dbID)){ + throw('You must provide a valid stored Bio::EnsEMBL:Gene object'); + } + #No need to quote param here as this is a known int from the DB. + my $tx_sids = $gene->adaptor->db->dbc->db_handle->selectcol_arrayref('select tsid.stable_id from transcript_stable_id tsid, transcript t where t.gene_id='.$gene->dbID.' and t.transcript_id=tsid.transcript_id'); + + return $self->fetch_all_by_external_names($tx_sids, $self->db->species.'_core_Transcript'); +} + + + +=head2 store_associated_feature_types + + Arg [1] : Bio::EnsEMBL::Funcgen::Sotrable + Example : $ext_feat_adaptor->store_associated_feature_type($ext_feat); + Description : Store FeatureTypes assoicated with a given Storable + Returntype : None + Exceptions : Throws if FeatureTypes are not valid or stored + Caller : Adaptors + Status : At risk + +=cut + +sub store_associated_feature_types { + my ($self, $storable) = @_; + + #Direct access to avoid lazy loading with an unstored SetFeature + my $assoc_ftypes = $storable->{'associated_feature_types'}; + + #Could be undef or empty + return if ! defined $assoc_ftypes || scalar(@$assoc_ftypes) == 0; + + my $table_name = $storable->adaptor->_main_table->[0]; + my $dbid = $storable->dbID; + + my $sql = 'INSERT into associated_feature_type(table_id, table_name, feature_type_id) values (?,?,?)'; + + foreach my $ftype(@$assoc_ftypes){ + + #We have already tested the class but not whether it is stored + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype); + + my $sth = $self->prepare($sql); + $sth->bind_param(1, $dbid, SQL_INTEGER); + $sth->bind_param(2, $table_name, SQL_VARCHAR); + $sth->bind_param(3, $ftype->dbID, SQL_INTEGER); + $sth->execute(); + } + + + return; +} + + +=head2 fetch_all_by_associated_FeatureType + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType + Example : my $assoc_ftypes = $ft_adaptor->fetch_all_by_associated_SetFeature($ext_feature); + Description: Fetches all objects which have associated FeatureType. + Note this is not the main FeatureType for this object. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Storable objects + Exceptions : Throws if FeatureType not valid or stored + Caller : General + Status : At risk + +=cut + + +sub fetch_all_by_associated_FeatureType{ + my ($self, $ftype) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype); + my ($table_name, $table_syn) = @{$self->_main_table}; + + push @{$self->TABLES}, ['associated_feature_type', 'aft']; + my $constraint = "aft.feature_type_id=? AND aft.table_name='${table_name}' AND aft.table_id=${table_syn}.${table_name}_id"; + + $self->bind_param_generic_fetch($ftype->dbID, SQL_INTEGER); + my $objs = $self->generic_fetch($constraint); + $self->reset_true_tables; + + return $objs; +} + + +=head2 _main_table + + Example : my $syn = $adaptor->_main_table->[1]; + Description: Convenience method to retrieve the main table or main table synonym for this adaptor + Entirely dependent on ensembl convention of always having main table as first element + of tables array. + Returntype : Array ref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub _main_table{ + my $self = shift; + + #Need to do this to put it in list context to avoid just returning the last value + my @tables = $self->_tables(); + return $tables[0]; +} + + +=head2 list_dbIDs + + Args : None + Example : my @feature_ids = @{$ofa->list_dbIDs()}; + Description: Gets an array of internal IDs for all objects in + the main table of this class. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +sub list_dbIDs { + my $self = shift; + return $self->_list_dbIDs($self->_main_table->[0]); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/BaseFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/BaseFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1334 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor - A Base class for all +Funcgen FeatureAdaptors + +=head1 SYNOPSIS + +Abstract class - should not be instantiated. Implementation of +abstract methods must be performed by subclasses. + +=head1 DESCRIPTION + +This is a base adaptor for Funcgen feature adaptors. This base class is simply a way +to redefine some methods to use with the Funcgen DB. + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; +use vars qw(@ISA @EXPORT); +use strict; + + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Cache; +use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + + +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +### To do ### +#Add Translation method for fetch by FeatureSets methods? +#Test and implement feature mapping between coord systems +#Correct/Document methods!!! +#Implement externaldb db_name version registry + +my %warnings; + +=head2 generic_fetch + + Arg [1] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Arg [2] : (optional) Bio::EnsEMBL::AssemblyMapper $mapper + A mapper object used to remap features + as they are retrieved from the database + Arg [3] : (optional) Bio::EnsEMBL::Slice $slice + A slice that features should be remapped to + Example : $fts = $a->generic_fetch('contig_id in (1234, 1235)', 'Swall'); + Description: Wrapper method for core BaseAdaptor, build seq_region cache for features + Returntype : ARRAYREF of Bio::EnsEMBL::SeqFeature in contig coordinates + Exceptions : none + Caller : FeatureAdaptor classes + Status : at risk + +=cut + +sub generic_fetch { + my $self = shift; + + #need to wrap _generic_fetch to always generate the + #seq_region_cache other wise non-slice based fetch methods fail + + #build seq_region cache here once for entire query + #Using default schema_build here + #So would need to set dnadb appropriately + #This is cleaning tmp cache values such that + #nested feature fetches cause failure e.g. when regfeats retrieve their reg_attrs + #We need a way of always generating the tmp cache, or having it persist? + #This is because we haven't built the tmp cache in non_slice based methods i.e. we haven't run get_seq_region_id_by_Slice + #This is the same for all non-Slice based methods + #And the is no way around it as we are not providing that info about the new DB by passing a slice! + #The only way to get around this is to make the tmp_cache persistant + + $self->build_seq_region_cache(); + + return $self->SUPER::generic_fetch(@_); +} + + +=head2 fetch_all_by_Slice_constraint + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Arg [3] : (optional) string $logic_name + the logic name of the type of features to obtain + Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5'); + Description: Returns a listref of features created from the database which + are on the Slice defined by $slice and fulfill the SQL + constraint defined by $constraint. If logic name is defined, + only features with an analysis of type $logic_name will be + returned. + Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates + Exceptions : thrown if $slice is not defined + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +#Remove this now we don't have array based ResultFeatures? + +sub fetch_all_by_Slice_constraint { + my($self, $slice, $constraint, $logic_name) = @_; + + my @result; + + if(!ref($slice) || !$slice->isa("Bio::EnsEMBL::Slice")) { + throw("Bio::EnsEMBL::Slice argument expected."); + } + + $constraint ||= ''; + + my $fg_cs = $self->db->get_FGCoordSystemAdaptor->fetch_by_name( + $slice->coord_system->name(), + $slice->coord_system->version() + ); + + + if(! defined $fg_cs){ + warn "No CoordSystem present for ".$slice->coord_system->name().":".$slice->coord_system->version(); + return \@result; + } + + + #build seq_region cache here once for entire query + $self->build_seq_region_cache($slice); + my $syn = $self->_main_table->[1]; + + $constraint = $self->_logic_name_to_constraint($constraint, $logic_name); + + + #if the logic name was invalid, undef was returned + return [] if(!defined($constraint)); + + #check the cache and return if we have already done this query + + #Added schema_build to feature cache for EFG + + my $key; + my $cache; + if ( !$self->db->no_cache() ) { + my $bind_params = $self->bind_param_generic_fetch(); + $key = uc(join(':', $slice->name, $constraint, $self->db->_get_schema_build($slice->adaptor->db()))); + + if ( defined($bind_params) ) { + $key .= ':' + . join( ':', map { $_->[0] . '/' . $_->[1] } @{$bind_params} ); + } + $cache = $self->_slice_feature_cache(); + if(exists($cache->{$key})){ + $self->{'_bind_param_generic_fetch'} = (); + return $cache->{$key}; + } + } + + my $sa = $slice->adaptor(); + + # Hap/PAR support: retrieve normalized 'non-symlinked' slices + my @proj = @{$sa->fetch_normalized_slice_projection($slice)}; + + + if(@proj == 0) { + throw('Could not retrieve normalized Slices. Database contains ' . + 'incorrect assembly_exception information.'); + } + + # Want to get features on the FULL original slice + # as well as any symlinked slices + + # Filter out partial slices from projection that are on + # same seq_region as original slice + + my $sr_id = $slice->get_seq_region_id(); + @proj = grep { $_->to_Slice->get_seq_region_id() != $sr_id } @proj; + my $segment = bless([1,$slice->length(),$slice ], + 'Bio::EnsEMBL::ProjectionSegment'); + push( @proj, $segment ); + + # construct list of Hap/PAR boundaries for entire seq region + my @bounds; + my $ent_slice = $sa->fetch_by_seq_region_id($sr_id); + $ent_slice = $ent_slice->invert() if($slice->strand == -1); + my @ent_proj = @{$sa->fetch_normalized_slice_projection($ent_slice)}; + + shift @ent_proj; # skip first + @bounds = map {$_->from_start - $slice->start() + 1} @ent_proj; + + + # fetch features for the primary slice AND all symlinked slices + foreach my $seg (@proj) { + + my $offset = $seg->from_start(); + my $seg_slice = $seg->to_Slice(); + my $features = $self->_slice_fetch($seg_slice, $constraint); ## NO RESULTS? This is a problem with the cs->equals method? + + # if this was a symlinked slice offset the feature coordinates as needed + if($seg_slice->name() ne $slice->name()) { + + FEATURE: + foreach my $f (@$features) { + if($offset != 1) { + $f->{'start'} += $offset-1; + $f->{'end'} += $offset-1; + } + + # discard boundary crossing features from symlinked regions + foreach my $bound (@bounds) { + if($f->{'start'} < $bound && $f->{'end'} >= $bound) { + + next FEATURE; + } + } + + $f->{'slice'} = $slice; + push @result, $f; + } + } + else { + push @result, @$features; + } + } + + if(defined $key) { + $cache->{$key} = \@result; + } + + return \@result; +} + +=head2 build_seq_region_cache + + Arg [1] : optional - Bio::EnsEMBL::Slice + the slice from which to obtain features + Example : $self->build_seq_region_cache(); + Description: Builds the seq_region_id caches based on the species and schema_build + Returntype : None + Exceptions : thrown if optional Slice argument is not valid + Caller : self + Status : At risk - should be private _build_seq_region_cache? Change arg to DBAdaptor? Or remove if we are building the full cache? + +=cut + +#do we even need to have the coord system? +#so long as we are only using one schema build i.e. one dnadb defualt = current +#slice and fg_cs are optional +#need to look at this + +sub build_seq_region_cache{ + my ($self, $slice) = @_; + + #instead of building this for each query + #we need to store the current cached coord_system + #and test whether it contains the query coord_system + #No as this then restricts us to one coord_system + #where as we want to cover all from one DB/schema_build + + if(defined $slice){ + throw('Optional argument must be a Bio::EnsEMBL::Slice') if(! ( ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))); + } + + my $dnadb = (defined $slice) ? $slice->adaptor->db() : $self->db->dnadb(); + my $schema_build = $self->db->_get_schema_build($dnadb); + my $sql = 'select sr.core_seq_region_id, sr.seq_region_id from seq_region sr'; + my @args = ($schema_build); + + if($self->is_multispecies()) { + $sql.= ', coord_system cs where sr.coord_system_id = cs.coord_system_id and cs.species_id=? and'; + unshift(@args, $self->species_id()); + } + else { + $sql.= ' where'; + } + $sql.=' sr.schema_build =?'; + + #Check we have not already got the right cache + my $cache_key = join(':', @args); + + #Do we already have a valid cache? + return if($self->cache_key eq $cache_key); + + + #Can't maintain these caches as we may be adding to them when storing + #we should just maintain the cache with the cache_key + + #$self->{'seq_region_cache'}{$cache_key} ||= {}; + #$self->{'core_seq_region_cache'} = {}; + + my $sth = $self->prepare($sql); + $sth->execute(@args); + while(my $ref = $sth->fetchrow_arrayref()) { + $self->{seq_region_cache}->{$cache_key}->{$ref->[0]} = $ref->[1]; + $self->{core_seq_region_cache}->{$cache_key}->{$ref->[1]} = $ref->[0]; + } + $sth->finish(); + + $self->cache_key($cache_key); + + return; +} + + +=head2 cache_key + + Arg [1] : optional string - species_id.schema_build e.g. 1.55_37 + the slice from which to obtain features + Example : $self->build_seq_region_cache(); + Description: Getter/Setter for the seq_region cache_key + Returntype : string + Exceptions : None + Caller : self + Status : At risk + +=cut + +sub cache_key{ + my ($self, $key) = @_; + + $self->{'cache_key'} = $key if $key; + return $self->{'cache_key'} || ''; +} + + + +#Need to separate these into by slice and by cs? +#We could just use an old cs/schema_build to grab the correct slice based on the name +#however we want to at least warn that the db needs updating +#The problem is that we want to detect whether the seq_region_id is present for _pre_store +#but automatically use a comparable seq_region for the normal fetch_methods. + +#So we need to split or add $? + +sub get_seq_region_id_by_Slice{ + my ($self, $slice, $fg_cs, $test_present) = @_; + + if(! ($slice && ref($slice) && $slice->isa("Bio::EnsEMBL::Slice"))){ + throw('You must provide a valid Bio::EnsEMBL::Slice'); + } + + #We really need to validate the schema_build of the slice to make sure it + #present in the current default coord_system i.e. the one which was used to + #generate the seq_region_cache + #This is set with the dnadb or with a slice query + #This may not always have been done. + + #Now all we have to do is test the cache_key + #Or can we just build_seq_region_cache as this checks and rebuilds if not correct + #This may generate a mistmach between the dnadb and the schema_build used to generate the cache + #This will be reset if required for subesquent queries using the cache key + $self->build_seq_region_cache($slice); + + my ($core_sr_id, $fg_sr_id); + + + #Slice should always have an adaptor, no? + if( $slice->adaptor() ) { + $core_sr_id = $slice->adaptor()->get_seq_region_id($slice); + } + else { + $core_sr_id = $self->db->dnadb->get_SliceAdaptor()->get_seq_region_id($slice); + } + + + #This does not work!! When updating for a new schema_build we get the first + #seq_region stored, than for each subsequent one, it arbitrarily assigns a value from the hash even tho the + #the exists condition isn't met! + #my $fg_sr_id = $self->{'seq_region_cache'}{$core_sr_id} if exists $self->{'seq_region_cache'}{$core_sr_id}; + #Can't replicate this using a normal hash + + #This cache has been built based on the schema_build + if (exists $self->{'seq_region_cache'}{$self->cache_key}{$core_sr_id}){ + $fg_sr_id = $self->{'seq_region_cache'}{$self->cache_key}{$core_sr_id}; + } + + + if(! $fg_sr_id && ref($fg_cs)){ + #This is used to store new seq_region info along side previous stored seq_regions of the same version + + if( ! $fg_cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')){ + throw('Must pass as valid Bio::EnsEMBL::Funcgen:CoordSystem to retrieve seq_region_ids for forwards compatibility, passed '.$fg_cs); + } + + my $sql = 'select seq_region_id from seq_region where coord_system_id =? and name =?'; + my $sth = $self->prepare($sql); + $sth->execute($fg_cs->dbID(), $slice->seq_region_name()); + + #This may not exist, so we need to catch it here? + ($fg_sr_id) = $sth->fetchrow_array(); + $sth->finish(); + + #if we are providing forward compatability + #Then we know the eFG DB doesn't have the core seq_region_ids in the DB + #Hence retrieving the slice will fail in _obj_from_sth + #So we need to set it internally here + #Then pick it up when get_core_seq_region_id is called for the first time(from where?) + #and populate the cache with the value + + #This only works if there is a comparable slice + #If we are dealing with a new assembly, then no $fg_sr_id will be returned + #So need to catch this in the caller + + #Can we remove this now the cache is persistant? + #i.e. Cache is not regenerated everytime, hence we don't lose the data? + + if($fg_sr_id){ + $self->{'_tmp_core_seq_region_cache'}{$self->cache_key} = {( + $fg_sr_id => $core_sr_id + )}; + } + } + elsif(! $fg_sr_id && ! $test_present) { + #This generally happens when using a new core db with a efg db that hasn't been updated + #Default to name match or throw if not present in DB + my $schema_build = $self->db->_get_schema_build($slice->adaptor->db()); + my $core_cs = $slice->coord_system; + + #Avoids mapping of core to efg seq_region_ids + #via schema_build(of the new core db) as we are matching directly to the seq_name + + my $sql = 'select distinct(seq_region_id) from seq_region sr, coord_system cs where sr.coord_system_id=cs.coord_system_id and sr.name=? and cs.name =?'; + my @args = ($slice->seq_region_name(), $core_cs->name()); + + if($core_cs->version()) { + $sql.= ' and cs.version =?'; + push(@args, $core_cs->version()); + } + if($self->is_multispecies()) { + $sql.=' and cs.species_id=?'; + push(@args, $self->species_id()); + } + + my $sth = $self->prepare($sql); + $sth->execute(@args); + ($fg_sr_id) = $sth->fetchrow_array(); + $sth->finish(); + + if(! $fg_sr_id){ + #Warn instead of throw so we can catch absent seq_region without eval + #warn('Cannot find previously stored seq_region for: '.$core_cs->name.':'.$core_cs->version.':'.$slice->seq_region_name. + # "\nYou need to update your eFG seq_regions to match your core DB using: update_DB_for_release.pl\n"); + } + + #Only warn first time this is seen + my $warning_key = $core_cs->name.':'.$core_cs->version.':'.$slice->seq_region_name; + + if(! exists $warnings{$warning_key}){ + warn 'Defaulting to previously store seq_region for: '.$warning_key. + "\nYou need to update your eFG seq_regions to match your core DB using: update_DB_for_release.pl\n"; + $warnings{$warning_key} = 1; + } + } + + return $fg_sr_id; +} + +sub get_core_seq_region_id{ + my ($self, $fg_sr_id) = @_; + + #This is based on what the current schema_build is + #to enable multi-schema/assmelby look up + #we will have to nest these in schema_build caches + #and use the schema_build of the slice which is passed to acquire the core_seq_region_id + #likewise for reverse, i.e. when we store. + + my $core_sr_id = $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; + + if(! defined $core_sr_id && exists $self->{'_tmp_core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}){ + #Do we need to test the cache_key here, might it have changed since get_seq_region_id_by_Slice? + #Or will build_seq_region_cache handle this? + + #Why do we reset this in the main cache here if we are returning the value + #Is this not corrupting the main cache? But we always want this value? + $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id} = $self->{'_tmp_core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; + + #Delete here so we don't have schema_build specific sr_ids hanging around for another query? + #delete $self->{'_tmp_core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; + + #These are valid values and would most likely be required for subsequent queries. + #Cache key is now also schema_build specific so this is no longer a problem + #Removed this as this causes RegFeat retrieval to fail + #As all non slice based methods would fail due to the lack of info about unstored DB i.e. we need a slice + + + $core_sr_id = $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id}; + + } + elsif(! defined $core_sr_id){#No cache entry, so get from dnadb + + my $sql = "select distinct(name) from seq_region where seq_region_id=$fg_sr_id"; + my $slice_name = $self->db->dbc->db_handle->selectall_arrayref($sql); + ($slice_name) = @{$slice_name->[0]}; + + #if(scalar(@names) != 1){#This should never happen + + #We should really grab the coord sys name above too. + my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor; + $core_sr_id = $slice_adaptor->get_seq_region_id($slice_adaptor->fetch_by_region(undef, $slice_name)); + + #Set this in the cache so we don't have to retrieve it again + $self->{'core_seq_region_cache'}{$self->cache_key}{$fg_sr_id} = $core_sr_id; + } + + return $core_sr_id; +} + +=head2 _pre_store + + Arg [1] : Bio::EnsEMBL::Feature + Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5'); + Description: Helper function containing some common feature storing functionality + Given a Feature this will return a copy (or the same feature if no changes + to the feature are needed) of the feature which is relative to the start + of the seq_region it is on. The seq_region_id of the seq_region it is on + is also returned. This method will also ensure that the database knows which coordinate + systems that this feature is stored in. This supercedes the core method, to trust the + slice the feature has been generated on i.e. from the dnadb. Also handles multi-coordsys + aspect, generating new coord_system_ids as appropriate and assembly projection. + Returntype : Bio::EnsEMBL::Feature and the seq_region_id it is mapped to + Exceptions : thrown if $slice is not defined + Caller : Bio::EnsEMBL::"Type"FeatureAdaptors + Status : At risk + +=cut + + +sub _pre_store { + my ($self, $feature, $new_assembly) = @_; + #May want to add cs_level arg? + #What about ignore length flag? + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Expected Feature argument.'); + } + + $self->_check_start_end_strand($feature->start(),$feature->end(), + $feature->strand()); + + my $db = $self->db(); + my $slice = $feature->slice(); + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Feature must be attached to Slice to be stored.'); + } + + # make sure feature coords are relative to start of entire seq_region + if($slice->start != 1 || $slice->strand != 1) { + + #throw("You must generate your feature on a slice starting at 1 with strand 1"); + #We did remove this transfer it uses direct hash access which + #did not work with old array based ResultFeatures + + #move feature onto a slice of the entire seq_region + $slice = $slice->adaptor->fetch_by_region($slice->coord_system->name(), + $slice->seq_region_name(), + undef, #start + undef, #end + undef, #strand + $slice->coord_system->version()); + $feature = $feature->transfer($slice); + + if(!$feature) { + throw('Could not transfer Feature to slice of ' . + 'entire seq_region prior to storing'); + } + } + + + + #Project here before we start building sr caches and storing CSs + if($new_assembly){ + #my $cs_level = + #Don't set this for old and new slice as + #at some point in the future we have mappings between different levels. + + + #warn "Projecting ".$feature->start.'-'.$feature->end." on "..$feature->slice->name." to $new_assembly"; + + + my @segments = @{$feature->feature_Slice->project($slice->coord_system->name, $new_assembly)}; + # do some sanity checks on the projection results: + # discard the projected feature if + # 1. it doesn't project at all (no segments returned) + # 2. the projection is fragmented (more than one segment) + # 3. the projection doesn't have the same length as the original + # feature + + # this tests for (1) and (2) + if (scalar(@segments) == 0) { + warn "Feature doesn't project to $new_assembly\n"; + return; + } + elsif (scalar(@segments) > 1) { + warn "Feature projection is fragmented in $new_assembly\n"; + return; + } + + # test (3) + my $proj_slice = $segments[0]->to_Slice; + + if ($feature->length != $proj_slice->length) { + + #if(! $conf->param('ignore_length')){ + warn "Feature projection is wrong length in $new_assembly\n"; + return; + # } + } + + #warn "proj ".$proj_slice->name; + + # everything looks fine, so adjust the coords of your feature + #Have to generate new_slice here as we are not sure it is going to be + #on the same slice as the old assembly + $slice = $proj_slice->adaptor->fetch_by_region($proj_slice->coord_system->name, $proj_slice->seq_region_name); + + #These are just callers for ResultFeature! + #For speed. + + $feature->start($proj_slice->start); + $feature->end($proj_slice->end); + $feature->slice($slice); + + #warn "new feature ".$feature->feature_Slice->name; + } + + + # Ensure this type of feature is known to be stored in this coord system. + my $cs = $slice->coord_system;#from core/dnadb + + #retrieve corresponding Funcgen coord_system and set id in feature + my $csa = $self->db->get_FGCoordSystemAdaptor();#had to call it FG as we were getting the core adaptor + my $fg_cs = $csa->validate_and_store_coord_system($cs); + $fg_cs = $csa->fetch_by_name($cs->name(), $cs->version());#Why are we refetching this? + my $tabname = $self->_main_table->[0]; + + + #Need to do this for Funcgen DB + my $mcc = $db->get_MetaCoordContainer(); + + $mcc->add_feature_type($fg_cs, $tabname, $feature->length); + + + #build seq_region cache here once for entire query + $self->build_seq_region_cache($slice); + + #Now need to check whether seq_region is already stored + #1 is test present flag + my $seq_region_id = $self->get_seq_region_id_by_Slice($slice, undef, 1); + + + if(! $seq_region_id){ + #check whether we have an equivalent seq_region_id + $seq_region_id = $self->get_seq_region_id_by_Slice($slice, $fg_cs); + my $schema_build = $self->db->_get_schema_build($slice->adaptor->db()); + my $sql; + my $core_sr_id = $slice->get_seq_region_id; + my @args = ($slice->seq_region_name(), $fg_cs->dbID(), $core_sr_id, $schema_build); + + #Add to comparable seq_region + if($seq_region_id) { + $sql = 'insert into seq_region(seq_region_id, name, coord_system_id, core_seq_region_id, schema_build) values (?,?,?,?,?)'; + unshift(@args, $seq_region_id); + } + #No compararble seq_region + else{ + $sql = 'insert into seq_region(name, coord_system_id, core_seq_region_id, schema_build) values (?,?,?,?)'; + } + + my $sth = $self->prepare($sql); + + #Need to eval this + eval{$sth->execute(@args);}; + + if(!$@){ + $seq_region_id = $sth->{'mysql_insertid'}; + } + + #Now we need to add this to the seq_region caches + #As we are not regenerating them every time we query. + $self->{seq_region_cache}{$self->cache_key}{$core_sr_id} = $seq_region_id; + $self->{core_seq_region_cache}{$self->cache_key}{$seq_region_id} = $core_sr_id; + } + + #Need to return seq_region_id as they are not stored + #in the slice retrieved from slice adaptor + return ($feature, $seq_region_id); +} + + +#This is causing problems with remapping +#Need to merge this back into the core code, subbing out the relevant parts +#required for funcgen support + +sub _slice_fetch { + my $self = shift; + my $slice = shift; + my $orig_constraint = shift; + + + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + my $slice_cs = $slice->coord_system(); + #my $slice_seq_region = $slice->seq_region_name(); + + #We Need to translate the seq_regions IDs to efg seq_region_ids + #we need to fetch the seq_region ID based on the coord_system id and the name + #we don't want to poulate with the eFG seq_region_id, jsut the core one, as we need to maintain core info in the slice. + + #we need to cache the seq_region_id mappings for the coord_system and schema_build, + #so we don't do it for every feature in a slice + #should we just reset it in temporarily in fetch above, other wise we lose cache between projections + #should we add seq_region cache to coord_system/adaptor? + #Then we can access it from here and never change the slice seq_region_id + #how are we accessing the eFG CS? As we only have the core CS from the slice here? + #No point in having in eFG CS if we're just having to recreate it for everyquery + #CSA already has all CS's cached, so query overhead would be mininal + #But cache overhead for all CS seq_region_id caches could be quite large. + #we're going to need it in the _pre_store, so having it in the CS would be natural there + #can we just cache dynamically instead? + #we're still calling for csa rather than accessing from cache + #could we cache the csa in the base feature adaptor? + + + + + #my $slice_seq_region_id = $slice->get_seq_region_id(); + + #get the synonym and name of the primary_table + my ($tab_name, $tab_syn) = @{$self->_main_table}; + + #find out what coordinate systems the features are in + my $mcc = $self->db->get_MetaCoordContainer(); + my @feat_css=(); + + + my $mca = $self->db->get_MetaContainer(); + my $value_list = $mca->list_value_by_key( $tab_name."build.level" ); + if( @$value_list and $slice->is_toplevel()) { + push @feat_css, $slice_cs; + } + else{ + @feat_css = @{$mcc->fetch_all_CoordSystems_by_feature_type($tab_name)}; + } + + my $asma = $self->db->get_AssemblyMapperAdaptor(); + my @features; + + + # fetch the features from each coordinate system they are stored in + COORD_SYSTEM: foreach my $feat_cs (@feat_css) { + + my $mapper; + my @coords; + my @ids; + + #This is not returning true if the feat_cs is no default + # + + if($feat_cs->equals($slice_cs)) { + # no mapping is required if this is the same coord system + + + #eFG change + #We want to set this to undef if we are dealing with result_features which + #are not wsize==0! + #This is only required if we load array data at the natural resolution! + #This test will effect ever single feature query, can we omit this somehow? + my $max_len; + + + if(! (($self->can('_window_size') && + $self->_window_size))){ + $max_len = $self->_max_feature_length() || + $mcc->fetch_max_length_by_CoordSystem_feature_type($feat_cs,$tab_name); + } + + my $constraint = $orig_constraint; + my $sr_id = $self->get_seq_region_id_by_Slice($slice, $feat_cs); + + + #If we have no slice id here, we know we don't have anything in the DB for this slice + return [] if ! $sr_id; + + + $constraint .= " AND " if($constraint); + $constraint .= + "${tab_syn}.seq_region_id = $sr_id AND " . + "${tab_syn}.seq_region_start <= $slice_end AND " . + "${tab_syn}.seq_region_end >= $slice_start"; + + + if($max_len) { + + my $min_start = $slice_start - $max_len; + $constraint .= + " AND ${tab_syn}.seq_region_start >= $min_start"; + } + + + my $fs = $self->generic_fetch($constraint,undef,$slice); + # features may still have to have coordinates made relative to slice + # start + + + + + $fs = $self->_remap($fs, $mapper, $slice); + push @features, @$fs; + } + else { + #Table contains some feature on a CS that differs from the Slice CS + #can't do CS remapping yet as AssemblyMapper expects a core CS + #change AssemblyMapper? + #or do we just create a core CS just for the remap and convert back when done? + + # $mapper = $asma->fetch_by_CoordSystems($slice_cs, $feat_cs); + + # next unless defined $mapper; + + # Get list of coordinates and corresponding internal ids for + # regions the slice spans + # @coords = $mapper->map($slice_seq_region, $slice_start, $slice_end, + # $slice_strand, $slice_cs); + + # @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords; + + # next COORD_SYSTEM if(!@coords); + + # @ids = map {$_->id()} @coords; + ##coords are now id rather than name + ## @ids = @{$asma->seq_regions_to_ids($feat_cs, \@ids)}; + + # When regions are large and only partially spanned by slice + # it is faster to to limit the query with start and end constraints. + # Take simple approach: use regional constraints if there are less + # than a specific number of regions covered. + + # if(@coords > $MAX_SPLIT_QUERY_SEQ_REGIONS) { + # my $constraint = $orig_constraint; + # my $id_str = join(',', @ids); + # $constraint .= " AND " if($constraint); + # $constraint .= "${tab_syn}.seq_region_id IN ($id_str)"; + + # my $fs = $self->generic_fetch($constraint, $mapper, $slice); + + # $fs = _remap($fs, $mapper, $slice); + + # push @features, @$fs; + + # } else { + # do multiple split queries using start / end constraints + + # my $max_len = $self->_max_feature_length() || + # $mcc->fetch_max_length_by_CoordSystem_feature_type($feat_cs, + # $tab_name); + # my $len = @coords; + # for(my $i = 0; $i < $len; $i++) { + # my $constraint = $orig_constraint; + # $constraint .= " AND " if($constraint); + # $constraint .= + # "${tab_syn}.seq_region_id = " . $ids[$i] . " AND " . + # "${tab_syn}.seq_region_start <= " . $coords[$i]->end() . " AND ". + # "${tab_syn}.seq_region_end >= " . $coords[$i]->start(); + + # if($max_len) { + # my $min_start = $coords[$i]->start() - $max_len; + # $constraint .= + # " AND ${tab_syn}.seq_region_start >= $min_start"; + # } + + # my $fs = $self->generic_fetch($constraint,$mapper,$slice); + + # $fs = _remap($fs, $mapper, $slice); + +# push @features, @$fs; + # } + # } + } + } #COORD system loop + + return \@features; +} + + + + +# +# Given a list of features checks if they are in the correct coord system +# by looking at the first features slice. If they are not then they are +# converted and placed on the slice. +# + +#We have to have this here as this is a sub not a method, hence the _remap +#in the core BaseFeatureAdaptor is not available here + +#Need to over-ride this for Iterator and remap per feature? +#Can we skip this altogether if the check test is done before hand? +#Would have to account for each CS set of features +#but we are currently only handling the query slice CS, so we can skip this completely for now + + +sub _remap { + my ($self, $features, $mapper, $slice) = @_; + + #check if any remapping is actually needed + if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') || + $features->[0]->slice == $slice)) { + return $features; + } + + #remapping has not been done, we have to do our own conversion from + #to slice coords + + my @out; + + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + my $slice_cs = $slice->coord_system(); + + my ($seq_region, $start, $end, $strand); + + #my $slice_seq_region_id = $slice->get_seq_region_id(); + my $slice_seq_region = $slice->seq_region_name(); + + foreach my $f (@$features) { + #since feats were obtained in contig coords, attached seq is a contig + my $fslice = $f->slice(); + + if(!$fslice) { + throw("Feature does not have attached slice.\n"); + } + my $fseq_region = $fslice->seq_region_name(); + my $fseq_region_id = $fslice->get_seq_region_id(); + my $fcs = $fslice->coord_system(); + + if(!$slice_cs->equals($fcs)) { + #slice of feature in different coord system, mapping required + + ($seq_region, $start, $end, $strand) = + $mapper->fastmap($fseq_region_id,$f->start(),$f->end(),$f->strand(),$fcs); + + # undefined start means gap + next if(!defined $start); + } else { + $start = $f->start(); + $end = $f->end(); + $strand = $f->strand(); + $seq_region = $f->slice->seq_region_name(); + } + + # maps to region outside desired area + next if ($start > $slice_end) || ($end < $slice_start) || + ($slice_seq_region ne $seq_region); + + #shift the feature start, end and strand in one call + if($slice_strand == -1) { + $f->move( $slice_end - $end + 1, $slice_end - $start + 1, $strand * -1 ); + } else { + $f->move( $start - $slice_start + 1, $end - $slice_start + 1, $strand ); + } + + $f->slice($slice); + + push @out,$f; + } + + return \@out; +} + + +=head2 fetch_all_by_stable_Storable_FeatureSets + + Arg [1] : Bio::EnsEMBL::Storable + Arg [5] : arrayref - Bio::EnsEMBL::Funcgen::FeatureSet + Example : ($start, $end) = $self->_set_bounds_by_regulatory_feature_xref + ($trans_chr, $start, $end, $transcript, $fsets); + Description: Internal method to set an xref Slice bounds given a list of + FeatureSets. + Returntype : List - ($start, $end); + Exceptions : throw if incorrent args provided + Caller : self + Status : at risk + +=cut + +#We really only want this method to work on storables +#Namely all SetFeatures, could extend to FeatureTypes +#This is a reimplementation of what has been used in the +#Funcgen::SliceAdaptor::_set_bounds_by_xref_FeatureSet + +#we could have a fourth arg here which is a coderef to filter the feature returned? +#This could be used by the SliceAdaptor to only return features which exceed a current slice +#We would still have to test the start and end in the caller, so not a massive win +#for too much code obfucation. Altho we are maintaining two versions of this code +#which could get out of sync, epsecially with respect to external_db names + +#This is currently feature generic +#So can call from one adaptor but will return features from all adaptors? +#Where can we put this? +#DBEntry adaptor? +#The only logical place for this is really a convinience method in the core BaseFeatureAdaptor +#This removes the problem of the generic nature of the returned data, as the focus is with the +#caller which is a singular feature, therefore, not mixing of data types + +#restrict to one feature type for now. but this will be slower as we will have to make the list call +#for each type of feature_set passed, so if these are mixed they will be redundant +#Let's just put a warning in +#Keep this structure so we can port it to a convinience method somewhere +#and slim down later? + + +#This does not descend Gene>Transcript>Translation + +#Now we have conflicting standards +#FeatureSets was kep as a list to prevent having to pass [$fset] for one FeatureSet +#This also elimated the need to test ref == ARRAY +#But now we want an extra optional descend flag +#Should we change back? +#Or should we wrap this method in one which will decend? +#Write wrappers for now, as we will have to write the descend loop anyway +#But this does not resolve the @fsets vs [$fset] issue + +#Will have to do this in the wrappers anyway, so change back +#and live with the dichotomy of FeatureSet implementations for now + +sub fetch_all_by_stable_Storable_FeatureSets{ + my ($self, $obj, $fsets) = @_; + + my ($extdb_name); + my $dbe_adaptor = $self->db->get_DBEntryAdaptor; + + + #Do we need a central registry for ensembl db names, what about versions? + + + if(ref($obj) && $obj->isa('Bio::EnsEMBL::Storable') && $obj->can('stable_id')){ + my @tmp = split/::/, ref($obj); + my $obj_type = pop @tmp; + my $group = $obj->adaptor->db->group; + #Could sanity check for funcgen here? + + + if (! defined $group){ + throw('You must pass a stable Bio::EnsEMBL::Feature with an attached DBAdaptor with the group attribute set'); + } + + $extdb_name = 'ensembl_'.$group.'_'.$obj_type; + + #warn "ext db name is $extdb_name"; + + } + else{ + throw('Must pass a stable Bio::EnsEMBL::Feature, you passed a '.$obj); + } + + + #Set which eFG features we want to look at. + + if(ref($fsets) ne 'ARRAY' || scalar(@$fsets) == 0){ + throw('Must define an array of Bio::EnsEMBL::FeatureSets to extend xref Slice bound. You passed: '.$fsets); + } + + my %feature_set_types; + + foreach my $fset(@$fsets){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + + $feature_set_types{$fset->feature_class} ||= []; + push @{$feature_set_types{$fset->feature_class}}, $fset; + } + + + #We can list the outer loop here and put in the BaseFeatureAdaptor, or possible storable as we do have FeatureType xrefs. + #This would be useful for fetching all the efg features for a given xref and FeatureSets + #Don't implement as a parent sub and call from here as this would mean looping through array twice. + #Altho we could pass a code ref to do the filtering? + #Just copy and paste for now to avoid obfuscation + + my @features; + + #Get xrefs for each eFG feature set type + foreach my $fset_type(keys %feature_set_types){ + + #my $xref_method = 'list_'.$fset_type.'_feature_ids_by_extid'; + #e.g. list_regulatory_feature_ids_by_extid + + + #Do type test here + my $adaptor_type = ucfirst($fset_type).'FeatureAdaptor'; + + #remove this for generic method + next if ref($self) !~ /$adaptor_type/; + + #This is for generic method + #my $adaptor_method = 'get_'.ucfirst($fset_type).'FeatureAdaptor'; + #my $adaptor = $self->db->$adaptor_method; + + my %feature_set_ids; + map $feature_set_ids{$_->dbID} = 1, @{$feature_set_types{$fset_type}}; + + my $cnt = 0; + + #Change this self to adaptor for generic method + foreach my $efg_feature(@{$self->fetch_all_by_external_name($obj->stable_id, $extdb_name)}){ + + #Skip if it is not in one of our FeatureSets + next if ! exists $feature_set_ids{$efg_feature->feature_set->dbID}; + push @features, $efg_feature; + } + } + + + return \@features; +} + + +sub fetch_all_by_Gene_FeatureSets{ + my ($self, $gene, $fsets, $dblinks) = @_; + + if(! ( ref($gene) && $gene->isa('Bio::EnsEMBL::Gene'))){ + throw("You must pass a valid Bio::EnsEMBL::Gene object"); + } + + my @features = @{$self->fetch_all_by_stable_Storable_FeatureSets($gene, $fsets)}; + + if($dblinks){ + + foreach my $transcript(@{$gene->get_all_Transcripts}){ + push @features, @{$self->fetch_all_by_Transcript_FeatureSets($transcript, $fsets, $dblinks)}; + } + } + + return \@features; +} + +sub fetch_all_by_Transcript_FeatureSets{ + my ($self, $transc, $fsets, $dblinks) = @_; + + if(! ( ref($transc) && $transc->isa('Bio::EnsEMBL::Transcript'))){ + throw("You must pass a valid Bio::EnsEMBL::Transcript object"); + } + + + my @features = @{$self->fetch_all_by_stable_Storable_FeatureSets($transc, $fsets)}; + + if($dblinks){ + my $translation = $transc->translation; + push @features, @{$self->fetch_all_by_stable_Storable_FeatureSets($translation, $fsets)} if $translation; + } + + return \@features; +} + + + +=head2 fetch_all_by_display_label + + Arg [1] : String $label - display label of feature to fetch + Example : my $feat = $adaptor->fetch_all_by_display_label('abd-A_dpp:REDFLY:TF000092'); + Description: Returns the features with the given display label. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Feature objects + Exceptions : none + Caller : general + Status : At risk + +=cut + +#There is no result_feature.display_label attribute. +#Move this to individual feature adaptors to avoid over-riding? + + +sub fetch_all_by_display_label { + my ($self, $label) = @_; + + throw('You must defined a display_label argument') if ! defined $label; + + my $table_syn = $self->_main_table->[1]; + my $constraint = "${table_syn}.display_label = ?"; + $self->bind_param_generic_fetch($label, SQL_VARCHAR); + + return $self->generic_fetch($constraint); +} + + + + +=head2 _get_coord_system_ids + + Arg [1] : optional - ARRAYREF of Bio::EnsEMBL::Funcgen::CoordSystem objects + Example : my $cs_ids = $self->_get_coord_system_ids([$cs1, $cs2], ...); + Description: Returns the IDs of the CoordSystems specified or all default CoordSystems. + Returntype : ARRAYREF + Exceptions : Throws if CoordSystem object are not stored and valid + Caller : BaseFeatureAdaptor.pm + Status : At risk + +=cut + +#Can we remove the need for this by restricting the sr cache to default entries? + +sub _get_coord_system_ids{ + my ($self, $coord_systems) = @_; + + my @cs_ids; + + if($coord_systems){ + + if(ref($coord_systems) eq 'ARRAY' && + (scalar(@$coord_systems) >0)){ + + foreach my $cs(@$coord_systems){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::CoordSystem', $cs); + push @cs_ids, $cs->dbID; + } + } + else{ + throw('CoordSystems parameter must be an arrayref of one or more Bio::EnsEMBL::Funcgen::CoordSystems'); + } + } + else{ + + #Get current default cs's + foreach my $cs(@{$self->db->get_FGCoordSystemAdaptor->fetch_all($self->db->dnadb, 'DEFAULT')}){ + push @cs_ids, $cs->dbID; + } + } + + #This should never happen + if(scalar(@cs_ids) == 0){ + throw('Could not find any default CoordSystems'); + } + + return \@cs_ids; +} + + +=head2 count_features_by_field_id + + Arg [1] : string - table field to count + Arg [2] : string/int - id to count + Example : my $probe_feature_count = $pfa->count_feature_by_field_id('probe_id', $probe_id); + Description: Returns a count of the features with the acciated field id + Returntype : string/int - count of features + Exceptions : Throws args are not set + Caller : FeatureAdaptors + Status : At risk + +=cut + +#This does not assume one record per feature +#But does assume primary key if ${table_name}_id +#Can't move to core due to cs issues, but could mirror implementation. + +sub count_features_by_field_id{ + my ($self, $count_field, $count_id) = @_; + #Any other params here? + + if(! ($count_field && $count_id)){ + throw('You must provide a count name and a count id to count by'); + } + + my ($table_name, $table_syn) = @{$self->_main_table}; + my $table_id = "${table_name}_id"; + my @cs_ids = @{$self->_get_coord_system_ids}; + my $sql = "SELECT count(distinct($table_id)) from $table_name $table_syn, seq_region sr where ${table_syn}.${count_field}=? and ${table_syn}.seq_region_id in(select distinct(seq_region_id) from seq_region where coord_system_id in(".join(',', @cs_ids).'))'; + my $sth = $self->prepare($sql); + + $sth->bind_param(1, $count_id, SQL_INTEGER); + $sth->execute; + + return $sth->fetchrow_array; +} + + + +=head2 force_reslice + + Arg [1] : Optional - Boolean + Example : if($self->force_reslice){ + # Reslice features past ends of destination Slice + } + Description: Sets/Returns force_reslice boolean + Returntype : Boolean + Exceptions : None + Caller : FeatureAdaptors::_objs_from_sth + Status : At risk + +=cut + + +sub force_reslice{ + my ($self, $force) = @_; + + + if(defined $force){ + $self->{force_reslice} = $force; + } + + return $self->{force_reslice}; +} + + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/BindingMatrixAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/BindingMatrixAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,329 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor - A database adaptor for fetching and +storing Funcgen BindingMatrix objects. + +=head1 SYNOPSIS + +my $matrix_adaptor = $db->get_BindingMatrixAdaptor(); +my @matrices = @{$matrix_adaptor->fetch_all_by_name("MA0122.1")}; + +=head1 DESCRIPTION + +The BindingMatrixAdaptor is a database adaptor for storing and retrieving +Funcgen BindingMatrix objects. + + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::BindingMatrix + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::BindingMatrix; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + + +=head2 fetch_all_by_name + + Arg [1] : string - name of Matrix + Arg [2] : Bio::EnsEMBL::Analysis (optional) Analysis indicating Matrix origin + Example : my @matrices = @{$matrix_adaptor->fetch_all_by_name('MA0122.1')}; + Description: Fetches matrix objects given a name and an optional Analysis object. + If both are specified, only one unique BindingMatrix will be returned + Returntype : Arrayref of Bio::EnsEMBL::Funcgen::BindingMatrix objects + Exceptions : Throws if no name if defined + Caller : General + Status : At risk - Change this to fetch_all_by_name_FeatureType + +=cut + +sub fetch_all_by_name{ + my ($self, $name, $analysis) = @_; + + throw("Must specify a BindingMatrix name") if(! $name); + + my $constraint = " bm.name = ? "; + $constraint .= " AND bm.analysis_id = ?" if $analysis; + + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + $self->bind_param_generic_fetch($analysis->dbID, SQL_INTEGER) if $analysis; + + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_by_name_FeatureType + + Arg [1] : string - name of Matrix + Arg [2] : Bio::EnsEMBL::Funcgen::FeatureType + Arg [3] : Bio::EnsEMBL::Analysis (optional) Analysis indicating Matrix origin + Description: Fetches matrix objects given a name and a FeatureType. + Returntype : Arrayref of Bio::EnsEMBL::Funcgen::BindingMatrix objects + Exceptions : Throws if no name if defined or if FeatureType is not valid + Caller : General + Status : At risk + +=cut + +sub fetch_all_by_name_FeatureType{ + my ($self, $name, $ftype, $analysis) = @_; + + throw("Must specify a BindingMatrix name") if(! $name); + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype); + + my $constraint = " bm.name = ? and bm.feature_type_id = ?"; + $constraint .= " AND bm.analysis_id = ?" if $analysis; + + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + $self->bind_param_generic_fetch($ftype->dbID, SQL_INTEGER); + $self->bind_param_generic_fetch($analysis->dbID, SQL_INTEGER) if $analysis; + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_FeatureType + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType + Arg [2] : Bio::EnsEMBL::Analysis (optional) Analysis indicating Matrix origin + Example : my @matrices = @{$matrix_adaptor->fetch_all_by_FeatureType($ftype)}; + Description: Fetches BindingMatrix objects given it's FeatureType + Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix + Exceptions : Throws if FeatureType is not valid + Caller : General + Status : At risk + +=cut + +sub fetch_all_by_FeatureType{ + my ($self, $ftype, $analysis) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype); + + my $constraint = " bm.feature_type_id = ?"; + $constraint .= " AND bm.analysis_id = ?" if $analysis; + + $self->bind_param_generic_fetch($ftype->dbID, SQL_INTEGER); + $self->bind_param_generic_fetch($analysis->dbID, SQL_INTEGER) if $analysis; + + return $self->generic_fetch($constraint); +} + + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return (['binding_matrix', 'bm']); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( bm.binding_matrix_id bm.name bm.analysis_id bm.frequencies bm.description bm.feature_type_id bm.threshold); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates objects from an executed DBI statement handle. + Returntype : Arrayref of Bio::EnsEMBL::Funcgen::BindingMatrix objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $matrix_id, $name, $analysis_id, $freq, $desc, $ftype_id, $thresh); + $sth->bind_columns(\$matrix_id, \$name, \$analysis_id, \$freq, \$desc, \$ftype_id, \$thresh); + + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + my %ftype_cache; + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor; + my %analysis_cache; + + while ( $sth->fetch() ) { + + if(! exists $ftype_cache{$ftype_id}){ + $ftype_cache{$ftype_id} = $ftype_adaptor->fetch_by_dbID($ftype_id); + } + + if(! exists $analysis_cache{$analysis_id}){ + $analysis_cache{$analysis_id} = $analysis_adaptor->fetch_by_dbID($analysis_id); + } + + my $matrix = Bio::EnsEMBL::Funcgen::BindingMatrix->new + ( + -dbID => $matrix_id, + -NAME => $name, + -ANALYSIS => $analysis_cache{$analysis_id}, + -FREQUENCIES => $freq, + -DESCRIPTION => $desc, + -FEATURE_TYPE => $ftype_cache{$ftype_id}, + -THRESHOLD => $thresh, + -ADAPTOR => $self, + ); + + push @result, $matrix; + + } + + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::BindingMatrix objects + Example : $matrix_adaptor->store($m1, $m2, $m3); + Description: Stores given Matrix objects in the database. + Sets dbID and adaptor on the objects that it stores. + Returntype : None + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + my $sth = $self->prepare(" + INSERT INTO binding_matrix + (name, analysis_id, frequencies, description, feature_type_id, threshold) + VALUES (?, ?, ?, ?, ?, ?)"); + + my $s_matrix; + + foreach my $matrix (@args) { + + if ( ! $matrix->isa('Bio::EnsEMBL::Funcgen::BindingMatrix') ) { + throw('Can only store BindingMatrix objects, skipping $matrix'); + } + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $matrix->feature_type); + + + + if (!( $matrix->dbID() && $matrix->adaptor() == $self )){ + + #Check for previously stored BindingMatrix + ($s_matrix) = @{$self->fetch_all_by_name_FeatureType($matrix->name(), $matrix->feature_type, $matrix->analysis())}; + + if(! $s_matrix){ + + $sth->bind_param(1, $matrix->name(), SQL_VARCHAR); + $sth->bind_param(2, $matrix->analysis()->dbID(), SQL_INTEGER); + $sth->bind_param(3, $matrix->frequencies(), SQL_LONGVARCHAR); + $sth->bind_param(4, $matrix->description(), SQL_VARCHAR); + $sth->bind_param(5, $matrix->feature_type->dbID(), SQL_INTEGER); + $sth->bind_param(6, $matrix->threshold(), SQL_DOUBLE); + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $matrix->dbID($dbID); + $matrix->adaptor($self); + + $self->store_associated_feature_types($matrix); + } + else{ + $matrix = $s_matrix; + warn("Using previously stored Matrix:\t".$matrix->name()."\n"); + #Could update associated FeatureTypes here + } + } + } + + return \@args; +} + + +=head2 list_dbIDs + + Args : None + Example : my @matrix_ids = @{$matrix_adaptor->list_dbIDs()}; + Description: Gets an array of internal IDs for all Matrix objects in the current database. + Returntype : ArrayRef of BindingMatrix + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('binding_matrix'); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/CellTypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/CellTypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,243 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::CellTypeAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::CellTypeAdaptor - A database adaptor for fetching and +storing Funcgen CellType objects. + +=head1 SYNOPSIS + +my $ct_adaptor = $efgdba->get_CellTypeAdaptor(); + +my $cell_type = $ct_adaptor->fetch_by_name("HeLa-S3"); + + +=head1 DESCRIPTION + +The CellTypeAdaptor is a database adaptor for storing and retrieving +Funcgen CellType objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::CellTypeAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::CellType; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use vars qw(@ISA); + + +#May need to our this? +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +=head2 fetch_by_name + + Arg [1] : string - name of CellType + Arg [1] : optional string - class of CellType + Example : my $ct = $ct_adaptor->fetch_by_name('HeLa'); + Description: Retrieves CellType objects by name. + Returntype : Bio::EnsEMBL::Funcgen::CellType object + Exceptions : Throws no name given + Caller : General + Status : At risk + +=cut + +sub fetch_by_name{ + my ($self, $name) = @_; + + throw("Must specify a CellType name") if(! $name); + + my $constraint = "ct.name ='$name'"; + + my @ctype = @{$self->generic_fetch($constraint)}; + #name is unique so we should only have one + + return $ctype[0]; +} + + +#fetch_all_by_efo_id +#fetch_all_by_tissue +#fetch_all_by_lineage + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + ['cell_type', 'ct'], + ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( ct.cell_type_id ct.name ct.display_label ct.description ct.gender ct.efo_id ct.tissue); + #type/class = enum cell, cell line, tissue +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Channel objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::CellType objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $ct_id, $name, $dlabel, $desc, $gender, $efo_id, $tissue); + + $sth->bind_columns(\$ct_id, \$name, \$dlabel, \$desc, \$gender, \$efo_id, \$tissue); + + while ( $sth->fetch() ) { + my $ctype = Bio::EnsEMBL::Funcgen::CellType->new( + -dbID => $ct_id, + -NAME => $name, + -DISPLAY_LABEL => $dlabel, + -DESCRIPTION => $desc, + -GENDER => $gender, + -EFO_ID => $efo_id, + -ADAPTOR => $self, + ); + + push @result, $ctype; + + } + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::CellType objects + Example : $chan_a->store($c1, $c2, $c3); + Description: Stores CellType objects in the database. + Returntype : None + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + + my $sth = $self->prepare(" + INSERT INTO cell_type + (name, display_label, description, gender, efo_id, tissue) + VALUES (?, ?, ?, ?, ?, ?)"); + + + + foreach my $ct (@args) { + if ( ! $ct->isa('Bio::EnsEMBL::Funcgen::CellType') ) { + warning('Can only store CellType objects, skipping $ct'); + next; + } + + if ( $ct->dbID() && $ct->adaptor() == $self ){ + warn("Skipping previously stored CellType dbID:".$ct->dbID().")"); + next; + } + + + $sth->bind_param(1, $ct->name, SQL_VARCHAR); + $sth->bind_param(2, $ct->display_label, SQL_VARCHAR); + $sth->bind_param(3, $ct->description, SQL_VARCHAR); + $sth->bind_param(4, $ct->gender, SQL_VARCHAR); + $sth->bind_param(5, $ct->efo_id, SQL_VARCHAR); + $sth->bind_param(6, $ct->tissue, SQL_VARCHAR); + $sth->execute(); + $ct->dbID($sth->{'mysql_insertid'}); + $ct->adaptor($self); + } + + return \@args; +} + + +=head2 list_dbIDs + + Args : None + Example : my @ct_ids = @{$ct_a->list_dbIDs()}; + Description: Gets an array of internal IDs for all CellType objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : At risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('cell_type'); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ChannelAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ChannelAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,386 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ChannelAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::ChannelAdaptor - A database adaptor for fetching and +storing Funcgen Channel objects. + +=head1 SYNOPSIS + +my $chan_a = $db->get_ChannelAdaptor(); + +my @channels = @{$chan_a->fetch_all_by_ExperimentalChip($exp)}; + + +=head1 DESCRIPTION + +The ChannelAdaptor is a database adaptor for storing and retrieving +Funcgen Channel objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ChannelAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::Channel; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); + + +#May need to our this? +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +=head2 fetch_by_type_experimental_chip_id + + Arg [1] : string - type + Arg [2] : Bio::EnsEMBL::Funcgen::ExperimentalChip + Example : my $chan = $chan_a->fetch_by_type_experimental_chip_dbID('EXPERIMENTAL', $ec_dbid); + Description: Does what it says on the tin + Returntype : Bio::EnsEMBL::Funcgen::Channel object + Exceptions : Throws if args not met + Caller : General + Status : At risk + +=cut + +sub fetch_by_type_experimental_chip_id { + my ($self, $type, $ec_id) = @_; + + throw("Must specify a channel type e.g. EXPERIMENTAL, TOTAL and an ExperimentalChip id") if(! ($type && $ec_id)); + + + my $constraint = 'c.experimental_chip_id = ? AND c.type = ?'; + + $self->bind_param_generic_fetch($ec_id, SQL_INTEGER); + $self->bind_param_generic_fetch($type, SQL_VARCHAR); + + + return $self->generic_fetch($constraint); + } + + +=head2 fetch_by_dye_experimental_chip_dbID + + Arg [1] : string - dye + Arg [1] : int - dbID of Experiment + Example : my $chan = $chan_a->fetch_by_type_experimental_chip_dbID('Cy5', $ec_dbid); + Description: Does what it says on the tin + Returntype : Bio::EnsEMBL::Funcgen::Channel object + Exceptions : Throws is experiment dbID or dye not passed + Caller : General + Status : At Risk + +=cut + +sub fetch_by_dye_experimental_chip_dbID { + my $self = shift; + my $dye = shift; + my $ec_dbid = shift; + + throw("not yet impemented"); + + my ($chan_id, @results); + + throw("Must specify an experiemntal dbID") if(! $ec_dbid); + #Need to validate dye against VendorDefs? Or leave and just return whatever is in DB e.g. nothing if dye name is wrong. + + + my $sth = $self->prepare(" + SELECT c.channel_id + FROM experimental_chip ec, channel c + WHERE c.experimental_chip_id = ec.experimental_chip_id + AND ec.experimental_chip_id = $ec_dbid + "); + + + + #can we do a generic fetch here? + + + $sth->execute(); + + + while ($chan_id = $sth->fetchrow()){ + push @results, $self->fetch_by_dbID($chan_id); + } + + return \@results; +} + + +=head2 fetch_all_by_experimental_chip_dbID + + Arg [1] : int - dbID of Experiment + Example : my @chans = @{$ec_a->fetch_all_by_experimental_chip_dbID($ac_dbid); + Description: Does what it says on the tin + Returntype : Listref of Bio::EnsEMBL::Funcgen::Channel object + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_experimental_chip_dbID { + my $self = shift; + my $ec_dbid = shift; + + my ($chan_id, @results); + + throw("Must specify an experiemntal dbID") if(! $ec_dbid); + + + #my $sth = $self->prepare(" + #SELECT c.channel_id + # FROM experimental_chip ec, channel c + # WHERE c.experimental_chip_id = ec.experimental_chip_id + # AND ec.experimental_chip_id = $ec_dbid + # "); + + + my $constraint = "c.experimental_chip_id=$ec_dbid"; + + #can we do a generic fetch here? + + + #$sth->execute(); + + + #while ($chan_id = $sth->fetchrow()){ + # push @results, $self->fetch_by_dbID($chan_id); + # } + + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_by_ExperimentalChip + + Arg [1] : Bio::EnsEMBL::Funcgen::ExperimentalChip + Example : my @chans = @{$ec_a->fetch_all_by_ExperimentalChip($echip); + Description: Returns all channels associated with a given ExperimentalChip + Returntype : Listref of Bio::EnsEMBL::Funcgen::Channel objects + Exceptions : Throws if no ExperimentalChip defined + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_ExperimentalChip{ + my ($self, $exp) = @_; + throw("Must provide an ExperimentChip object") if(! $exp->isa('Bio::EnsEMBL::Funcgen::ExperimentalChip')); + return $self->fetch_all_by_experimental_chip_dbID($exp->dbID()); +} + +=head2 fetch_attributes + + Arg [1] : Bio::EnsEMBL::Funcgen::Array - array to fetch attributes for + Example : None + Description: This function is solely intended to lazy load attributes into + empty Array objects. You should not need to call this. + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Array getters + Status : Medium Risk + +=cut + +sub fetch_attributes { + my $self = shift; + my $array = shift; + + my $tmp_array = $self->fetch_by_dbID( $array->dbID() ); + %$array = %$tmp_array; +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ['channel', 'c']; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( c.channel_id c.experimental_chip_id c.sample_id c.dye c.type); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Channel objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Channel objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $ec_id, $chan_id, $sample_id, $type, $dye); + + $sth->bind_columns(\$chan_id, \$ec_id, \$sample_id, \$dye, \$type); + + while ( $sth->fetch() ) { + my $chan = Bio::EnsEMBL::Funcgen::Channel->new( + -DBID => $chan_id, + -EXPERIMENTAL_CHIP_ID => $ec_id, + -SAMPLE_ID => $sample_id, + -TYPE => $type, + -DYE => $dye, + -ADAPTOR => $self, + ); + + push @result, $chan; + + } + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::Channel objects + Example : $chan_a->store($c1, $c2, $c3); + Description: Stores given Channel objects in the database. Should only be + called once per array because no checks are made for duplicates. + Sets dbID and adaptor on the objects that it stores. + Returntype : None + Exceptions : Throws if object is not a Bio::EnsEMBL::Funcgen::Channel + Throws if object is already present in the DB but has no dbID + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + my $sth = $self->prepare(" + INSERT INTO channel + (experimental_chip_id, sample_id, dye, type) + VALUES (?, ?, ?, ?)"); + + + foreach my $chan (@args) { + throw('Can only store Channel objects') if ( ! $chan->isa('Bio::EnsEMBL::Funcgen::Channel')); + + if (!( $chan->dbID() && $chan->adaptor() == $self )){#use is_stored? + + + my $s_chan = $self->fetch_by_type_experimental_chip_id($chan->type(), $chan->experimental_chip_id()); + throw("Channel already exists in the database with dbID:".$s_chan->dbID(). + "\nTo reuse/update this Channel you must retrieve it using the ChannelAdaptor". + "\nMaybe you want to use the -recover option?") if $s_chan; + + #if(! $s_chan){ + $sth->bind_param(1, $chan->experimental_chip_id(), SQL_INTEGER); + $sth->bind_param(2, $chan->sample_id(), SQL_VARCHAR); + $sth->bind_param(3, $chan->dye() , SQL_VARCHAR); + $sth->bind_param(4, $chan->type(), SQL_VARCHAR); + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $chan->dbID($dbID); + $chan->adaptor($self); + #} + #else{ + # #do some status checks here, check IMPORTED + # #Need to account for recover in Importer? + # $chan = $s_chan; + + # my @states = @{$self->db->fetch_all_states('channel', $chan->dbID())}; + + # #need better id than dbID? + # warn("Using previously stored Channel (".$chan->experimental_chip_id().":".$chan->type().") with states\t@states\n"); + #} + }else{ + #assume we want to update the states + warn('You may want to use $chan->adaptor->store_states($chan)'); + $self->store_states($chan); + } + } + + return \@args; +} + + +=head2 list_dbIDs + + Args : None + Example : my @array_ids = @{$ec_a->list_dbIDs()}; + Description: Gets an array of internal IDs for all ExperimentalChip objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : At risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('channel'); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/CoordSystemAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/CoordSystemAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1253 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor + +=head1 SYNOPSIS + + my $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...); + + my $csa = $db->get_CoordSystemAdaptor(); + + # + # Fetch by name, schema_build and version(opt). + # + + $cs = $csa->fetch_by_name_schema_build_version('chromosome', '39_36a', 'NCBI36'); + + #As this is a multi-assembly DB, we have to accomodate the idea of schema versions, which will + #enable a mapping from the feature table back to assembly/core DB of origin. + + + #Old core methods, some may not work as they assume that there will only be one default version + #where are there maybe multiple default versions, one for each assembly/schema_build + + # + # Get all coord systems in the database: + # + foreach my $cs (@{$csa->fetch_all()}) { + print $cs->name, ' ', $cs->version, "\n"; + } + + # + # Fetching by name: + # + + #use the default version of coord_system 'chromosome' (e.g. NCBI33): + $cs = $csa->fetch_by_name('chromosome'); + + #get an explicit version of coord_system 'chromosome': + $cs = $csa->fetch_by_name('chromsome', 'NCBI34'); + + #get all coord_systems of name 'chromosome': + foreach $cs (@{$csa->fetch_all_by_name('chromosome')}) { + print $cs->name, ' ', $cs->version, "\n"; + } + + # + # Fetching by rank: + # + $cs = $csa->fetch_by_rank(2); + + # + # Fetching the pseudo coord system 'toplevel' + # + + #Get the default top_level coord system: + $cs = $csa->fetch_top_level(); + + #can also use an alias in fetch_by_name: + $cs = $csa->fetch_by_name('toplevel'); + + #can also request toplevel using rank=0 + $cs = $csa->fetch_by_rank(0); + + # + # Fetching by sequence level: + # + + #Get the coord system which is used to store sequence: + $cs = $csa->fetch_sequence_level(); + + #can also use an alias in fetch_by_name: + $cs = $csa->fetch_by_name('seqlevel'); + + # + # Fetching by id + # + $cs = $csa->fetch_by_dbID(1); + + +=head1 DESCRIPTION + +The Funcgen CoordSystemAdaptor works slighty different to the core version. As +the Funcgen DB stores features mapped to multiple core/dna DBs the schema and +data versions(i.e. the last bit of the DB name) have to be stored. This maintains +a link between the seq_region_id stored in the Funcgen DB and the seq_region and assembly +tables stored in the core DB on which the features were originally built. + +Default versions or ranking has not yet been tested. + +This adaptor allows the querying of information from the coordinate system +adaptor. + +Note that many coordinate systems do not have a concept of a version +for the entire coordinate system (though they may have a per-sequence version). +The 'chromosome' coordinate system usually has a version (i.e. the +assembly version) but the clonal coordinate system does not (despite having +individual sequence versions). In the case where a coordinate system does +not have a version an empty string ('') is used instead. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Funcgen::CoordSystem; +my %cs_warnings; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 new + + Arg [1] : See BaseAdaptor for arguments (none specific to this + subclass) + Example : $cs = $db->get_CoordSystemAdaptor(); #better than new() + Description: Creates a new CoordSystem adaptor and caches the contents + of the coord_system table in memory. + Returntype : Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor + Exceptions : none + Caller : + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + # + # Cache the entire contents of the coord_system table cross-referenced + # by dbID and name + # + + #Funcgen specific + #Added extra key on schema_build for all + #keyed on name, list of coord_system value + $self->{'_name_cache'} = {}; + + #keyed on id, coord_system value + $self->{'_dbID_cache'} = {}; + + #keyed on rank + #$self->{'_rank_cache'} = {}; + + #keyed on id, 1/undef values + $self->{'_is_sequence_level'} = {}; + $self->{'_is_default_version'} = {}; + + my $sql = 'SELECT coord_system_id, name, rank, version, attrib, schema_build, core_coord_system_id FROM coord_system'; + my @args; + if($self->is_multispecies()) { + $sql.=' where species_id =?'; + push(@args, $self->species_id()); + } + $sql.=' order by coord_system_id'; + my $sth = $self->prepare($sql); + $sth->execute(@args); + + my ($dbID, $name, $rank, $version, $attrib, $sbuild, $ccs_id, $cs); + $sth->bind_columns(\$dbID, \$name, \$rank, \$version, \$attrib, \$sbuild, \$ccs_id); + + while($sth->fetch()) { + my $seq_lvl = 0; + my $default = 0; + + + #what we need is an add schema_build, seq_level, default, rank method + #name and version shoudl be same for one CS + + if($attrib) { + foreach my $attrib (split(',', $attrib)) { + $self->{"_is_$attrib"}->{$dbID} = 1; + if($attrib eq 'sequence_level') { + $seq_lvl = 1; + } elsif($attrib eq 'default_version') { + $default = 1; + } + } + } + + + #Found new name, version pair + if(! $cs || ($dbID != $cs->dbID())){ + + if($cs){ + + #handle caching here + + #the get methods which utilise these caches need to sort the results based on the latest schema build. + #or maybe instead of just having one name, where cat the schema_build, but point to the same cs + #so loop through all the schema build for one CS? + + + $self->{'_dbID_cache'}->{$cs->dbID()} = $cs; + + + + #Right then + #Unless we're querying by cs_id from the eFG DB then we will always need + #schema_build&level||rank or name&version + + #No point in having NR rank cache, need to resolve with schema_build? + #Name + #have schema_build as optional arg in all methods, get from BDAdaptor if not defined? + #This will just match the schema build to the current eFG DB + + $self->{'_name_cache'}->{lc($cs->name())} ||= []; + #$self->{'_rank_cache'}->{$rank} ||= []; + #push @{$self->{'_rank_cache'}->{$rank}}, $cs; + push @{$self->{'_name_cache'}->{lc($cs->name())}}, $cs; + } + + + $cs = Bio::EnsEMBL::Funcgen::CoordSystem->new + (-DBID => $dbID, + -ADAPTOR => $self, + -NAME => $name, + -VERSION => $version, + #-IS_CURRENT => $is_current, + ); + } + + + #could we fetch the actual core CS here, and add it to the eFG coord sys? + #or should we just handle the individual args? + #do we need to write generic method in DBAdaptor for this, then we can use the + #CSAdaptor as a cache for all DBAdaptor(CSs) should we not use reg for this? + + #we could populate objects from new rather than from db, then create adaptor as required? + #still need to store is stored in CD? and also we need to test everytime to see if we have an adaptor + + $cs->add_core_coord_system_info( + -RANK => $rank, + -SEQUENCE_LEVEL => $seq_lvl, + -DEFAULT => $default, + -SCHEMA_BUILD => $sbuild, + -CORE_COORD_SYSTEM_ID => $ccs_id, + -IS_STORED => 1, + ); + + #orig + + + #if($attrib) { + # foreach my $attrib (split(',', $attrib)) { + # $self->{"_is_$attrib"}->{$dbID} = 1; + # if($attrib eq 'sequence_level') { + # $seq_lvl = 1; + # } elsif($attrib eq 'default_version') { + # $default = 1; + # } + # } + #} + + #my $cs = Bio::EnsEMBL::Funcgen::CoordSystem->new + # (-DBID => $dbID, + # -ADAPTOR => $self, + # -NAME => $name, + # -VERSION => $version, + # -RANK => $rank, + # -SEQUENCE_LEVEL => $seq_lvl, + # -DEFAULT => $default, + # -SCHEMA_BUILD => $sbuild, + # ); + + + #can we change these caches to use just the name and version rather than schema_build? + + #$self->{'_sb_name_cache'}->{$sbuild.":".lc($name)} ||= []; + #$self->{'_dbID_cache'}->{$dbID} = $cs; + #$self->{'_sb_rank_cache'}->{$sbuild.":".$rank} = $cs; + #push @{$self->{'_sb_name_cache'}->{$sbuild.":".lc($name)}}, $cs; + } + + + #handle last cs + + if($cs){ + $self->{'_dbID_cache'}->{$cs->dbID()} = $cs; + #push @{$self->{'_rank_cache'}->{$rank}}, $cs; + push @{$self->{'_name_cache'}->{lc($cs->name())}}, $cs; + } + + $sth->finish(); + + + + #Get rid? Let core handle this + #No mapping paths present in meta table! + + # + # Retrieve a list of available mappings from the meta table. + # this may eventually be moved a table of its own if this proves too + # cumbersome + # + + #my %mapping_paths; + #my $mc = $self->db()->get_MetaContainer(); + + + #MAP_PATH: + #foreach my $map_path (@{$mc->list_value_by_key('assembly.mapping')}) { + # my @cs_strings = split(/[|#]/, $map_path); + +# if(@cs_strings < 2) { +# warning("Incorrectly formatted assembly.mapping value in meta " . +# "table: $map_path"); +# next MAP_PATH; +# } + +# my @coord_systems; +# foreach my $cs_string (@cs_strings) { +# my($name, $version) = split(/:/, $cs_string); +# my $cs = $self->fetch_by_name($name, $version); +# if(!$cs) { +# warning("Unknown coordinate system specified in meta table " . +# " assembly.mapping:\n $name:$version"); +# next MAP_PATH; +# } +# push @coord_systems, $cs; +# } + + # if the delimiter is a # we want a special case, multiple parts of the same + # componente map to same assembly part. As this looks like the "long" mapping + # we just make the path a bit longer :-) + +# if( $map_path =~ /\#/ && scalar( @coord_systems ) == 2 ) { +# splice( @coord_systems, 1, 0, ( undef )); +# } + +# my $cs1 = $coord_systems[0]; +# my $cs2 = $coord_systems[$#coord_systems]; + +# my $key1 = $cs1->name().':'.$cs1->version(); +# my $key2 = $cs2->name().':'.$cs2->version(); + +# if(exists($mapping_paths{"$key1|$key2"})) { +# warning("Meta table specifies multiple mapping paths between " . +# "coord systems $key1 and $key2.\n" . +# "Choosing shorter path arbitrarily.");# + +# next MAP_PATH if(@{$mapping_paths{"$key1|$key2"}} < @coord_systems); +# } + +# $mapping_paths{"$key1|$key2"} = \@coord_systems; +# } + + # + # Create the pseudo coord system 'toplevel' and cache it so that + # only one of these is created for each db... + # + + + #Not yet implemented across multiple dbs + #my $toplevel = Bio::EnsEMBL::Funcgen::CoordSystem->new(-TOP_LEVEL => 1, + # -NAME => 'toplevel', + # -ADAPTOR => $self); + # $self->{'_top_level'} = $toplevel; + + #$self->{'_mapping_paths'} = \%mapping_paths; + + return $self; +} + + +=head2 fetch_by_name + + Arg [1] : string $name + The name of the coordinate system to retrieve. Alternatively + this may be an alias for a real coordinate system. Valid + aliases are 'toplevel' and 'seqlevel'. + Arg [2] : optional - string $version + The version of the coordinate system to retrieve. If not + specified the default version for the appropriate schema_build + will be used. + Example : $coord_sys = $csa->fetch_by_name('chromosome', 'NCBI36'); + # toplevel is an pseudo coord system representing the highest + # coord system in a given region + # such as the chromosome coordinate system + $coord_sys = $csa->fetch_by_name('toplevel'); + #seqlevel is an alias for the sequence level coordinate system + #such as the clone or contig coordinate system + $coord_sys = $csa->fetch_by_name('seqlevel'); + Description: Retrieves a coordinate system by its name + Returntype : Bio::EnsEMBL::Funcgen::CoordSystem + Exceptions : throw if no name argument provided + warning if no version provided and default does not exist + Caller : general + Status : At risk + +=cut + + +#we need the schema_build for the top/sequence_level!!!!!!!!!!!!!!!!!!!!!!!!! +#if schema_build not defined them we need to use ->db->dnadb schema_build +#careful, this could be using the default dnadb already +#but this is the desired behaviour is it not? + +#need a generic method to fetch the best cs based on dnadb or latest schema_build +#also need generic method in DBAdaptor to set dnadb by Experiment +#need to populate schema_build in Experiment? +#how can we do this dynamically? all Experiment(ec, channel, rset) based methods should set dnadb appropriately? +#could this potentially mean this is called too many times for one query? +#or we could just let the user manage it? +#we need to check whether different/non-comparable schema_builds are added to the same result set +#use latest schema_build i.e. gene set or original schema_build. + +sub fetch_by_name{ + my $self = shift; + my $name = lc(shift); + my $version = lc(shift); + my $sbuild = $self->db->_get_schema_build($self->db->dnadb()); + my ($cs, $found_cs); + + throw('Mandatory argument \'name\'') if(! $name); + + #Set default_version if not specified + if(! $version){ + $version = $self->db->get_CoordSystemAdaptor->fetch_by_name($name)->version(); + } + + #can we not just use + #if(($name eq 'toplevel' || $name eq 'seqlevel') && ! $schema_build){ + # throw('To access toplevel or seqlevel you must provide a the third schema_build argument'); + # } + + warn "Using dnadb(".$sbuild.") to acquire $name" if($name =~ /level/); + + if($name eq 'seqlevel') { + return $self->fetch_sequence_level_by_schema_build($sbuild); + } elsif($name eq 'toplevel') { + return $self->fetch_top_level_by_schema_build($sbuild); + } + + if(! exists($self->{'_name_cache'}->{$name})) { + if($name =~ /top/) { + warn("Did you mean 'toplevel' coord system instead of '$name'?"); + } elsif($name =~ /seq/) { + warn("Did you mean 'seqlevel' coord system instead of '$name'?"); + } + return undef; + } + + + my @coord_systems = @{$self->{'_name_cache'}->{$name}}; + + #Filter versions if or get the default for the schema_build or comparable + + #This will only get non-versioned CSs if there are already loaded on a given schema_build + #Hence we can never retrieve a 'comparable' supercontig if it has not been loaded onto the current schema_build + #Hence we end up loading a new CS for each non-versioned level. + + + foreach $cs (@coord_systems) { + #Versions are only relevant to assembled levels e.g. chromosome & scaffold? + #No some DBs have version for contig, supercontig etc if they have no assembled level + #Issues around unassembled levels with version i.e. not being able to access data from + #old version, even tho' 'assembly' of contig should be identical. This should not be a big + #problem as the only species we are likely to do this with will have mappings between levels + #or we can just re import for new version of unassembled level? This will give redundant data in + #the DB for those contigs which appear in both versions and are identical. + + if($version) {#Assembled level + + if(lc($cs->version()) eq $version){ + #This will pick the right CS even if the dnadb schema_build is not present + $found_cs = $cs; + last; + } + } + else{#We have an unassembled/non-versioned level and can use any as there should only be 1 + $found_cs = $cs; + last; + } + } + + #should these throw? + if(! $found_cs){ + if($version) { + warn "No coord system found for $sbuild version '$version'"; + return undef; + }else{ + warn "Could not find $name CoordSystem."; + return undef + } + } + + + + #didn't find a default, just take first one + #my $cs = shift @coord_systems; + #warning("No default version for coord_system [$name] exists. " . + # "Using version [".$cs->version()."] arbitrarily"); + + return $found_cs; +} + + + +=head2 fetch_all + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Arg [2] : optional string - attribute e.g. DEFAULT, SEQUENCE_LEVEL, RANK + Example : foreach my $cs (@{$csa->fetch_all()}) { + print $cs->name(), ' ', $cs->version(), "\n"; + } + Description: Retrieves every coordinate system defined in the DB. Will + restrict to those from a particular dnadb if one is passed. + #These will be returned in ascending order of rank. I.e. + #The highest coordinate system with rank=1 would be first in the + #array. + Returntype : listref of Bio::EnsEMBL::Funcgen::CoordSystems + Exceptions : none + Caller : general + Status : at risk - make arg optional so we can truly retrieve all + +=cut + +sub fetch_all { + my ($self, $dnadb, $attribute) = @_; + + if(! $dnadb || ! (ref($dnadb) && $dnadb->isa('Bio::EnsEMBL::DBSQL::DBAdaptor'))){ + throw('Not yet implement full fetch_all, please pass a dnadb'); + } + + my @coord_systems; + my $schema_build = $self->db->_get_schema_build($dnadb); + + foreach my $cs(values %{$self->{'_dbID_cache'}}){ + + if ($cs->contains_schema_build($schema_build)){ + next if($attribute && ! $cs->get_coord_system_attribute(uc($attribute), $dnadb)); + push @coord_systems, $cs; + } + } + + ##order the array by rank in ascending order + #foreach my $rank (sort {$a <=> $b} keys %{$self->{'_rank_cache'}}) { + # push @coord_systems, $self->{'_rank_cache'}->{$rank}; + #} + + return \@coord_systems; +} + + + +=head2 fetch_by_rank + + Arg [1] : int $rank + Example : my $cs = $coord_sys_adaptor->fetch_by_rank(1); + Description: Retrieves a CoordinateSystem via its rank. 0 is a special + rank reserved for the pseudo coordinate system 'toplevel'. + undef is returned if no coordinate system of the specified rank + exists. + Returntype : Bio::EnsEMBL::Funcgen::CoordSystem + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub fetch_by_rank { + my $self = shift; + my $rank = shift; + + thrw('not implemented rank cache yet'); + + throw("Rank argument must be defined.") if(!defined($rank)); + throw("Rank argument must be a non-negative integer.") if($rank !~ /^\d+$/); + + if($rank == 0) { + return $self->fetch_top_level(); + } + + return $self->{'_rank_cache'}->{$rank}; +} + + + +=head2 fetch_all_by_name + + Arg [1] : string $name + The name of the coordinate system to retrieve. This can be + the name of an actual coordinate system or an alias for a + coordinate system. Valid aliases are 'toplevel' and 'seqlevel'. + Example : foreach my $cs (@{$csa->fetch_all_by_name('chromosome')}){ + print $cs->name(), ' ', $cs->version(); + } + Description: Retrieves all coordinate systems of a particular name + Returntype : listref of Bio::EnsEMBL::Funcgen::CoordSystem objects + Exceptions : throw if no name argument provided + Caller : general + Status : Medium + +=cut + +sub fetch_all_by_name { + my $self = shift; + my $name = lc(shift); #case insensitive matching + + throw('Name argument is required') if(!$name); + + if($name eq 'seqlevel') { + return [$self->fetch_sequence_level()]; + } elsif($name eq 'toplevel') { + return [$self->fetch_top_level()]; + } + + return $self->{'_name_cache'}->{$name} || []; +} + + + + + +=head2 fetch_by_dbID + + Arg [1] : int dbID + Example : $cs = $csa->fetch_by_dbID(4); + Description: Retrieves a coord_system via its internal + identifier, or undef if no coordinate system with the provided + id exists. + Returntype : Bio::EnsEMBL::Funcgen::CoordSystem or undef + Exceptions : thrown if no coord_system exists for specified dbID + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + throw('dbID argument is required') if(!$dbID); + + my $cs = $self->{'_dbID_cache'}->{$dbID}; + + return undef if(!$cs); + + return $cs; +} + + + +=head2 fetch_top_level + + Arg [1] : none + Example : $cs = $csa->fetch_top_level(); + Description: Retrieves the toplevel pseudo coordinate system. + Returntype : a Bio::EnsEMBL::Funcgen::CoordSystem object + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub fetch_top_level { + my $self = shift; + + throw("Not yet implemented with schema_build"); + + return $self->{'_top_level'}; +} + + +=head2 fetch_sequence_level + + Arg [1] : none + Example : ($id, $name, $version) = $csa->fetch_sequence_level(); + Description: Retrieves the coordinate system at which sequence + is stored at. + Returntype : Bio::EnsEMBL::Funcgen::CoordSystem + Exceptions : throw if no sequence_level coord system exists at all + throw if multiple sequence_level coord systems exists + Caller : general + Status : At risk + +=cut + +sub fetch_sequence_level { + my $self = shift; + + throw("Not yet implemented with schema_build"); + + my @dbIDs = keys %{$self->{'_is_sequence_level'}}; + + throw('No sequence_level coord_system is defined') if(!@dbIDs); + + if(@dbIDs > 1) { + throw('Multiple sequence_level coord_systems are defined.' . + 'Only one is currently supported'); + } + + return $self->{'_dbID_cache'}->{$dbIDs[0]}; +} + + + + +=head2 get_mapping_path + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs1 + Arg [2] : Bio::EnsEMBL::CoordSystem $cs2 + Example : foreach my $cs @{$csa->get_mapping_path($cs1,$cs2); + Description: Given two coordinate systems this will return a mapping path + between them if one has been defined. Allowed Mapping paths are + explicitly defined in the meta table. The following is an + example: + + mysql> select * from meta where meta_key = 'assembly.mapping'; + +---------+------------------+--------------------------------------+ + | meta_id | meta_key | meta_value | + +---------+------------------+--------------------------------------+ + | 20 | assembly.mapping | chromosome:NCBI34|contig | + | 21 | assembly.mapping | clone|contig | + | 22 | assembly.mapping | supercontig|contig | + | 23 | assembly.mapping | chromosome:NCBI34|contig|clone | + | 24 | assembly.mapping | chromosome:NCBI34|contig|supercontig | + | 25 | assembly.mapping | supercontig|contig|clone | + +---------+------------------+--------------------------------------+ + + For a one-step mapping path to be valid there needs to be + a relationship between the two coordinate systems defined in + the assembly table. Two step mapping paths work by building + on the one-step mapping paths which are already defined. + + The first coordinate system in a one step mapping path must + be the assembled coordinate system and the second must be + the component. + + Example of use: + my $cs1 = $cs_adaptor->fetch_by_name('contig'); + my $cs2 = $cs_adaptor->fetch_by_name('chromosome'); + + my @path = @{$cs_adaptor->get_mapping_path($cs1,$cs2)}; + + if(!@path) { + print "No mapping path."; + } + elsif(@path == 2) { + print "2 step mapping path."; + print "Assembled = " . $path[0]->name() . "\n"; + print "Component = " . $path[1]->name() . "\n"; + } else { + print "Multi step mapping path\n"; + } + + Returntype : reference to a list of Bio::EnsEMBL::CoordSystem objects + + Exceptions : none + Caller : general + Status : At risk + +=cut + + +#Need to be redirected to the core/dnadb of interest + +sub get_mapping_path { + my $self = shift; + my $cs1 = shift; + my $cs2 = shift; + + if(!ref($cs1) || !ref($cs2) || + !$cs1->isa('Bio::EnsEMBL::CoordSystem') || + !$cs2->isa('Bio::EnsEMBL::CoordSystem')) { + throw('Two Bio::EnsEMBL::CoordSystem arguments expected.'); + } + + my $key1 = $cs1->name() . ":" . $cs1->version(); + my $key2 = $cs2->name() . ":" . $cs2->version(); + + my $path = $self->{'_mapping_paths'}->{"$key1|$key2"}; + + return $path if($path); + + $path = $self->{'_mapping_paths'}->{"$key2|$key1"}; + + if(!$path) { + # No path was explicitly defined, but we might be able to guess a + # suitable path. We only guess for missing 2 step paths. + + my %mid1; + my %mid2; + + foreach my $path (values(%{$self->{'_mapping_paths'}})) { + next if(@$path != 2); + + my $match = undef; + + if($path->[0]->equals($cs1)) { + $match = 1; + } elsif($path->[1]->equals($cs1)) { + $match = 0; + } + + if(defined($match)) { + my $mid = $path->[$match]; + my $midkey = $mid->name() . ':' . $mid->version(); + + # is the same cs mapped to by other cs? + if($mid2{$midkey}) { + my $path = [$cs1,$mid,$cs2]; + $self->{'_mapping_paths'}->{"$key1|$key2"} = $path; + $key1 =~ s/\:$//; + $key2 =~ s/\:$//; + $midkey =~ s/\:$//; + warning("Using implicit mapping path between '$key1' and '$key2' " . + "coord systems.\n" . + "An explicit 'assembly.mapping' entry should be added " . + "to the meta table.\nExample: " . + "'$key1|$midkey|$key2'\n"); + return $path; + } else { + $mid1{$midkey} = $mid; + } + } + + $match = undef; + + if($path->[0]->equals($cs2)) { + $match = 1; + } elsif($path->[1]->equals($cs2)) { + $match = 0; + } + + + if(defined($match)) { + my $mid = $path->[$match]; + my $midkey = $mid->name() . ':' . $mid->version(); + + # is the same cs mapped to by other cs? + if($mid1{$midkey}) { + my $path = [$cs2,$mid,$cs1]; + $self->{'_mapping_paths'}->{"$key2|$key1"} = $path; + + $key1 =~ s/\:$//; + $key2 =~ s/\:$//; + $midkey =~ s/\:$//; + warning("Using implicit mapping path between '$key1' and '$key2' " . + "coord systems.\n" . + "An explicit 'assembly.mapping' entry should be added " . + "to the meta table.\nExample: " . + "'$key1|$midkey|$key2'\n"); + + return $path; + } else { + $mid2{$midkey} = $mid; + } + } + } + } + + return $path || []; +} + +=head2 _fetch_by_attribute + + Arg [1] : + Example : + Description: + Returntype : + Exceptions : + Caller : + Status : At risk + +=cut + +sub _fetch_by_attrib { + my $self = shift; + my $attrib = shift; + my $version = shift; + + $version = lc($version) if($version); + + my @dbIDs = keys %{$self->{"_is_$attrib"}}; + + throw("No $attrib coordinate system defined") if(!@dbIDs); + + foreach my $dbID (@dbIDs) { + my $cs = $self->{'_dbID_cache'}->{$dbID}; + if($version) { + return $cs if(lc($version) eq $cs->version()); + } elsif($self->{'_is_default_version'}->{$dbID}) { + return $cs; + } + } + + #specifically requested attrib system was not found + if($version) { + throw("$attrib coord_system with version [$version] does not exist"); + } + + #coordsystem with attrib exists but no default is defined: + my $dbID = shift @dbIDs; + my $cs = $self->{'_dbID_cache'}->{$dbID}; + my $v = $cs->version(); + warning("No default version for $attrib coord_system exists. " . + "Using version [$v] arbitrarily"); + + return $cs; +} + +=head2 _fetch_all_by_attribute + + Arg [1] : + Example : + Description: + Returntype : + Exceptions : + Caller : + Status : At risk + +=cut + +sub _fetch_all_by_attrib { + my $self = shift; + my $attrib = shift; + + my @coord_systems = (); + foreach my $dbID (keys %{$self->{"_is_$attrib"}}) { + push @coord_systems, $self->{"_dbID_cache"}->{$dbID}; + } + + return \@coord_systems; +} + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Funcgen::CoordSystem + Example : $csa->store($coord_system); + Description: Stores a CoordSystem object in the database. + Returntype : none + Exceptions : Warning if CoordSystem is already stored in this database. + Caller : none + Status : At risk + +=cut + +sub store { + my $self = shift; + my $cs = shift; + + if(!$cs || !ref($cs) || !$cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')) { + throw('CoordSystem argument expected.'); + } + + my $sth; + my $db = $self->db(); + my $name = $cs->name(); + my $version = $cs->version(); + + if($name eq 'toplevel' || $name eq 'seqlevel' || !$name) { + throw("[$name] is not a valid name for a storable CoordSystem."); + } + + foreach my $sbuild(keys %{$cs->{'core_cache'}}){ + my $rank = $cs->{'core_cache'}->{$sbuild}->{'RANK'}; + my $seqlevel = $cs->{'core_cache'}->{$sbuild}->{'SEQUENCE_LEVEL'}; + my $default = $cs->{'core_cache'}->{$sbuild}->{'DEFAULT'}; + my $ccs_id = $cs->{'core_cache'}->{$sbuild}->{'CORE_COORD_SYSTEM_ID'}; + + # + # Do lots of sanity checking to prevent bad data from being entered + # + + if($cs->{'core_cache'}->{$sbuild}->{'IS_STORED'}) { + #Doesn't this only check on dbID? + next; + } + + + #if($seqlevel && keys(%{$self->{'_is_sequence_level'}})) { + # throw("There can only be one sequence level CoordSystem."); + #} + + #if(exists $self->{'_name_cache'}->{lc($name)}) { + # my @coord_systems = @{$self->{'_name_cache'}->{lc($name)}}; + + # foreach my $c (@coord_systems) { + # if(lc($c->version()) eq lc($version)) { + # warning("CoordSystem $name $version is already in db.\n"); + # return; + # } + # if($default && $self->{'_is_default_version'}->{$c->dbID()}) { + # throw("There can only be one default version of CoordSystem $name"); + # } + # } + #} + + if($rank !~ /^\d+$/) { + throw("Rank attribute must be a positive integer not [$rank]"); + } + + if($rank == 0) { + throw("Only toplevel CoordSystem may have rank of 0."); + } + + #if(defined($self->{'_rank_cache'}->{$rank})) { + # throw("CoordSystem with rank [$rank] already exists."); + #} + + my @attrib; + + push @attrib, 'default_version' if($default); + push @attrib, 'sequence_level' if($seqlevel); + + my $attrib_str = (@attrib) ? join(',', @attrib) : undef; + + # + # store the coordinate system in the database + # + + if(! $cs->dbID()){ + + $sth = $self->prepare('insert into coord_system (name, version, attrib, rank, schema_build, core_coord_system_id, species_id) values (?,?,?,?,?,?,?)'); + + $sth->bind_param(1, $name, SQL_VARCHAR); + $sth->bind_param(2, $version, SQL_VARCHAR); + $sth->bind_param(3, $attrib_str, SQL_VARCHAR); + $sth->bind_param(4, $rank, SQL_INTEGER); + $sth->bind_param(5, $sbuild, SQL_VARCHAR); + $sth->bind_param(6, $ccs_id, SQL_INTEGER); + $sth->bind_param(7, $self->species_id(), SQL_INTEGER); + + + #Here we are getting failures due to concurrent processes storing the same CS. + #There is no abolsolute way of protecting again this unless we lock the tables + #before we query. + #This could happen with seq_region also, but it is higly unlikley that we will write at the same time. + + $sth->execute(); + + #eval { $sth->execute() }; + + #if($@){ + + #} + + + my $dbID = $sth->{'mysql_insertid'}; + $sth->finish(); + + if(!$dbID) { + throw("Did not get dbID from store of CoordSystem."); + } + + $cs->dbID($dbID); + $cs->adaptor($self); + }else{ + #can we prep this out of the loop + #we don't know until we're in it + my $sql = 'insert into coord_system (coord_system_id, name, version, attrib, rank, schema_build, core_coord_system_id, species_id) values (?,?,?,?,?,?,?,?)'; + $sth = $db->dbc->prepare($sql); + + $sth->bind_param(1, $cs->dbID(), SQL_INTEGER); + $sth->bind_param(2, $name, SQL_VARCHAR); + $sth->bind_param(3, $version, SQL_VARCHAR); + $sth->bind_param(4, $attrib_str, SQL_VARCHAR); + $sth->bind_param(5, $rank, SQL_INTEGER); + $sth->bind_param(6, $sbuild, SQL_VARCHAR); + $sth->bind_param(7, $ccs_id, SQL_INTEGER); + $sth->bind_param(8, $self->species_id(), SQL_INTEGER); + $sth->execute(); + $sth->finish(); + } + + $cs->{'core_cache'}{$sbuild}{'IS_STORED'} = 1; + + + } + + + # + # update the internal caches that are used for fetching + # + #$self->{'_is_default_version'}->{$dbID} = 1 if($default); + #$self->{'_is_sequence_level'}->{$dbID} = 1 if($seqlevel); + + $self->{'_name_cache'}->{lc($name)} ||= []; + #$self->{'_rank_cache'}->{$rank} ||= []; + $self->{'_dbID_cache'}->{$cs->dbID()} = $cs; + #this will duplicate CS in cache if we add a core cs and then store + #same with rank cache, need to replace + my $push = 1; + + foreach my $name_cs(@{$self->{'_name_cache'}->{lc($name)}}){ + + if($name_cs->version() eq $cs->version()){ + $push = 0; + $name_cs = $cs; + } + } + + push @{$self->{'_name_cache'}->{lc($name)}}, $cs if $push; + + #$push = 1; + + + + #this could result in mixed rank cs in the same rank cache + #push @{$self->{'_rank_cache'}->{$rank}}, $cs; + #need to rethink rank cache? make it schema_rank cache + + + return $cs; +} + + +=head2 validate_and_store_coord_system + + Arg [1] : Bio::EnsEMBL::CoordSystem (could also be Funcgen::CoordSystem) + Example : my $funcgen_cs = $csa->validate_coord_system($core_cs); + Description: Given a CoordSystem retrieves the corresponding Funcgen CoordSystem + or generates new one + Returntype : Bio::EnsEMBL::Funcgen::CoordSystem + Exceptions : throw if arg not valid and stored + Caller : general + Status : At risk - just have validate and let DBAdaptor store totally new CSs? + +=cut + +#currently get cs from slice, and need to validate for dnadb too +#can take FGCoordSystem or CoordSystem + +sub validate_and_store_coord_system{ + my ($self, $cs) = @_; + + if (! (ref($cs) && $cs->isa('Bio::EnsEMBL::CoordSystem') && $cs->dbID())) { + throw('Must provide a valid stored Bio::EnsEMBL::CoordSystem'); + } + + + #Need to add to Funcgen coord_system here + #check if name and version are present and reset coord_system_id to that one, else get last ID and create a new one + #coord_system_ids will not match those in core DBs, so we need ot be mindful about this. + #can't use is_stored as this simply checks the dbID + #seq_region_ids may change between schemas with the same assembly version + #Store schema_version in coord_system and create seq_region translation + #table to maintain the seq_region_id mapping back to each core DB + + + #Do we need to check the the dnadb and the slice db match? + #Do we have to have specified a dnadb at this point? No. + #But need to put checks in place for dnadb methods i.e. seq/slice retrieval + + + + my $sbuild = $self->db->_get_schema_build($cs->adaptor->db()); + + #this should implicitly use the current schema_build + #hence providing specificty for non-version CS's e.g. supercontig etc... + my $fg_cs = $self->fetch_by_name($cs->name(), $cs->version()); + + #this needs to satify both schema_build and version + #retrieving by name version should retunr the lastest schema_build unless the it is not the toplevel or highest expected rank? + + my $version; + + if (! $fg_cs) { + + #if($cs->name ne 'clone' && (! $cs->version)){ + #NO VERSION for assembled level !! + #Assume the default version + #we could get this from meta, but is unreliable + #get from default chromosome version + #my $tmp_cs = $cs->adaptor->fetch_by_name('chromosome'); + #$version = $tmp_cs->version; + #} + + + $fg_cs = Bio::EnsEMBL::Funcgen::CoordSystem->new( + -NAME => $cs->name(), + -VERSION => $cs->version(), + ); + + warn "Created new CoordSystem:\t".$fg_cs->name().":".$fg_cs->version()."\n"; + } + + + #This is done in BaseFeatureAdaptor->_pre_store + #to avoid users without write permission trying + #to store olf assemblies on new shema_builds + #on old schema_build which are already present + #If the CS can't be found then you're probably + #importing for the first time and have write permissions + + #re-instated as we don't want any extra calls in _pre_store + #as this iterates over every features stored + #increasing import time. + + if (! $fg_cs->contains_schema_build($sbuild)) { + + #Need to set all attribs here. + + $fg_cs->add_core_coord_system_info( + -RANK => $cs->rank(), + -SEQUENCE_LEVEL => $cs->is_sequence_level(), + -DEFAULT => $cs->is_default(), + -SCHEMA_BUILD => $sbuild, + -CORE_COORD_SYSTEM_ID => $cs->dbID(), + -IS_STORED => 0, + ); + + eval { $fg_cs = $self->store($fg_cs) }; + + if ($@ && (! exists $cs_warnings{$fg_cs->name.':'.$fg_cs->version})) { + $cs_warnings{$fg_cs->name.':'.$fg_cs->version} = 1; + warning("$@\nYou do not have permisson to store the CoordSystem for schema_build $sbuild\n". + "Using comparable CoordSystem:\t".$fg_cs->name.':'.$fg_cs->version."\n"); + } + } + + + + return $fg_cs; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/DBAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/DBAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,787 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor + +=head1 SYNOPSIS + +my $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new + ( + -host => "ensembldb.ensembl.org", + -dbname => "mus_musculus_funcgen_41_36b", + -species => "Mus_musculus", + -user => "anonymous", + -dnadb => $mouse_core_db, + -port => '3307', + ); + +my $experiment_adaptor = $db->get_ExperimentAdaptor(); + +=back + +=head1 DESCRIPTION + +An adaptor to access the funcgen database and expose other available adaptors. + +=cut + +################################################################################ + + +package Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; + +use strict; +use base qw(Bio::EnsEMBL::DBSQL::DBAdaptor); #@ISA + +use DBI; + +use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump); +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +my $reg = "Bio::EnsEMBL::Registry"; + + +=head2 new + + Arg [-DNADB_HOST] : String - Overrides defaults (ensembldb or registry) + Arg [-DNADB_USER] : String - Overrides defaults (ensembldb or registry) + Arg [-DNADB_PASS] : String - Overrides defaults (ensembldb or registry) + Arg [-DNADB_PORT] : String - Overrides defaults (ensembldb or registry) + Arg [-DNADB_NAME] : String - Overrides defaults (ensembldb or registry) + Arg [-DNADB_ASSEMBLY] : String - Overrides defaults (ensembldb or registry) + + Arg [...] : Other args are passed to superclass Bio::EnsEMBL::DBSQL::DBAdaptor + Example1 : $db = new Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor + ( + -user => 'readonly', #No password + -dbname => 'pog', + -host => 'caldy', + ); + + #If dnadb is not defined in registry, this will automatically + #set it from the default dnadb_host (e.g. ensembldb) + + Exmaple2 : $db = new Bio::EnsEMBL::DBSQL::Funcgen::DBAdaptor + ( + -user => 'write', + -pass => 'password', + -dbname => 'pog', + -host => 'caldy', + -dnadb_assmebly => '36', + ); + #This will specifically look for a dnadb with assembly version 36 + #on the default dnadb_host + + Exmaple2 : $db = new Bio::EnsEMBL::DBSQL::Funcgen::DBAdaptor + ( + -user => 'write', + -pass => 'password', + -dbname => 'pog', + -host => 'caldy', + -dnadb_name => 'my_homo_sapiens_funcgen_67_37', + -dnadb_host => 'my_host', + ); + #This will over-ride the default dnadb setting in favour of the dnadb params + + + Description: Constructor for DBAdaptor. Will automatically set the dnadb based on dnadb params. + This makes some assumptions about how the DBs name are defined i.e. the last part must + conform to the _RELEASE_ASSEMBLY format e.g. 67_37 in homo_sapiens_funcgen_67_37 + Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor + Exceptions : Throws if conflicting dnadb params found + Caller : general + Status : Stable + +=cut + +sub new { + my ($class, @args) = @_; + + #Force group to be funcgen as this is the only valid group. + my $self = $class->SUPER::new(@args, '-group', 'funcgen'); + + if($self->species eq 'DEFAULT'){ #Auto set species if not set + + #Can't do list_value_by_key as this depends on species, so we get a circular reference + #This has already been set in the registry in SUPER::new above as DEFAULT! + #So we need to reset this in the registry here? + + $self->{'_species'} = ${$self->get_MetaContainer->list_value_by_key('species.production_name')}[0] || 'DEFAULT'; + + if($self->species ne 'DEFAULT'){ #Reset this in the registry to the correct species + $self = Bio::EnsEMBL::Utils::ConfigRegistry::gen_load($self); + #This causes duplicate software vs DB release warnings + } + } + #else should we redefine the species as the standard alias if it does not match? + #This would prevent external_db species name testing + #Maybe the solution is to make external_db multi_species + + my ( $dnadb_host, $dnadb_user, $dnadb_port, $dnadb_pass, $dnadb_assm, $dnadb_name, $dnadb) + = rearrange( [ 'DNADB_HOST', 'DNADB_USER', + 'DNADB_PORT', 'DNADB_PASS', + 'DNADB_ASSEMBLY', 'DNADB_NAME', + 'DNADB' + ], + @args ); + + my $default_dnadb = $self->SUPER::dnadb; + my ($default_host, $default_port, $default_user, $default_pass, $default_assm, $default_name, $efg_assm); + my ($dnadb_predefined, $dnadb_params); + + if( $dnadb_host || $dnadb_user || $dnadb_port || $dnadb_pass || $dnadb_assm || $dnadb_name){ + $dnadb_params = 1; + + if($dnadb){ + throw('You cannot specific -dnadb and other dnadb params'); + } + } + + + if($default_dnadb->group eq 'core'){ + #This means you have loaded a registry or passed a dnadb to the efg DBAdaptor + $default_host = $default_dnadb->dbc->host; + $default_port = $default_dnadb->dbc->port; + $default_user = $default_dnadb->dbc->username; + $default_pass = $default_dnadb->dbc->password; + $default_name = $default_dnadb->dbc->dbname; + ($default_assm = (split/_/, $self->_get_schema_build($default_dnadb))[1]) =~ s/[a-z]//; + $dnadb_predefined = 1; + } + + #We need to test dnadb_name vs dnadb_assm here before we over-ride we set defaults + #We can expect a mistmatch if we have only defined one or the other + if( ($dnadb_assm && $dnadb_name) && + ($dnadb_name !~ /_${dnadb_assm}_/) ){ + throw("You have specified conflicting -dnadb_name(${dnadb_name}) and -dnadb_assembly(${dnadb_assm}) parameters"); + } + elsif($dnadb_name){ #Get dnadb_assm from name + #This is not strictly required, but means we don't set is incorrectly below + ($dnadb_assm = $dnadb_name) =~ s/.*_([0-9a-z]+)$/$1/; + } + + + #Defaults now set in dnadb as we want to test registry first; + #These will pick default_dnadb values if params not explicitly set + $self->{'dnadb_host'} = $dnadb_host || $default_host || 'ensembldb.ensembl.org'; + $self->{'dnadb_port'} = $dnadb_port || $default_port || 3306; + $self->{'dnadb_user'} = $dnadb_user || $default_user || 'anonymous'; + $self->{'dnadb_pass'} = $dnadb_pass || $default_pass || undef; + $self->{'dnadb_name'} = $dnadb_name || $default_name || undef; + ($efg_assm = (split/_/, $self->_get_schema_build($self))[1]) =~ s/[a-z]//; + $self->{'dnadb_assm'} = $dnadb_assm || $default_assm || $efg_assm; + $dnadb_assm = $self->{'dnadb_assm'}; #reset here as we use below. + + + #This only tries to _set_dnadb if we set some dnadb_params + #or the dnadb_assm doesn't match the default/predefined dnadb + + if($dnadb_params || + ($self->_get_schema_build($self->dnadb()) !~ /[0-9]+_${dnadb_assm}[a-z]*$/) ){ + + if(! $dnadb_params){ + + warn ':: WARNING: Unable to match assembly version between the dnadb name ('. + $self->dnadb->dbc->dbname.') and the specified -dnadb_assm '.$self->dnadb_assembly. + "\nMaybe you need to rename your DBs according to the Ensembl naming convention e.g. myprefix_homo_sapiens_55_37"; + } + elsif($dnadb_predefined){ + #No can't be dnadb as we throw if we have conflicting -dnadb and dnadb params + warn ":: Over-riding pre-defined dnadb regsitry values with dnadb params:\t". + $self->dnadb_user.'@'.$self->dnadb_host.':'.$self->dnadb_port; + } + + $self->_set_dnadb; + } + + return $self; +} + + +#Move these stored method to BaseAdaptor? + +=head2 is_stored_and_valid + + Arg [1] : string - class namespace + Arg [1] : Bio::EnsEMBL::Funcgen::Storable e.g. ResultSet etc. + Example : $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); + DESCRIPTION: Validates object class and stored status + Returntype : none + Exceptions : Throws if Storable is not valid or stored + Caller : general - Adaptors, objects will probably be better off implementing in situ. + This is to avoid having to test for the adaptor for every object which could slow things down + Status : At risk + +=cut + + +#This has to be in the DBAdaptor rather than Storable as we're +#calling isa on self otherwise which we don't know whether we can + +sub is_stored_and_valid{ + my ($self, $class, $obj) = @_; + + if(! (ref($obj) && $obj->isa($class) && $obj->is_stored($self))){ + #is_stored checks adaptor params and dbID, but not whether the adaptor matches the class + throw('Must provide a valid stored '.$class."\nParameter provided was:\t$obj"); + } + + return; +} + +=head2 are_stored_and_valid + + Arg [1] : string - class namespace + Arg [2] : ARRAYREF os Bio::EnsEMBL::Funcgen::Storable objects e.g. ResultSet + Arg [3] : String : return value method name + Example : $db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', \@rsets); + DESCRIPTION: Wrapper for is_stored_and_valid. Will optionally return array of values + defined by calling method name arg on each object passed + Returntype : ARRAYREF - contents defined by optional method name arg + Exceptions : Throws if object list is not an ARRAY with at least one element + Caller : general + Status : At risk + +=cut + +#Add method params? + +sub are_stored_and_valid{ + my ($self, $class, $obj_list, $method_name) = @_; + + my @return_vals; + + if( (ref($obj_list) ne 'ARRAY') || + (scalar(@$obj_list) <=0) ){ + throw('You must provide an ARRAYREF of objects to validate'); + } + + foreach my $obj(@$obj_list){ + $self->is_stored_and_valid($class, $obj); + + if($method_name){ + #test can method here? + push @return_vals, $obj->$method_name; + } + } + + return \@return_vals; +} + + + +#Move these to Helper.pm! Check method dependencies first! + +=head2 load_table_data + + Arg [1] : string - table name + Arg [1] : string - file path for file to load + Example : $db->load_table_data("result", $self->get_dir($results_dir)."/result.txt"); + DESCRIPTION: Generic method to load a file into a specified table + Returntype : none + Exceptions : Throws if argument not supplied + Caller : general + Status : At risk - only used by for results at present, to be removed + +=cut + +sub load_table_data{ + my ($self, $table, $file, $ssh) = @_; + + chmod 0755, $file; + # warn("Importing $table data from $file"); + my $cmd = 'mysqlimport -L '.$self->connect_string().' '.$file; + system($cmd) == 0 || throw("Failed to load data from $file\nExit code:\t".($?>>8)."\n$!"); + + return; +} + + + +=head2 get_available_adaptors + + Example : my %pairs = %{$dba->get_available_adaptors()}; + Description: gets a hash of the available adaptors + ReturnType : reference to a hash + Exceptions : none + Caller : Bio::EnsEMBL::Utils::ConfigRegistry + Status : Stable + +=cut + + +#will adding SliceAdaptor here use the dna DB? i.e. the core DB rather than the efg DB? + +sub get_available_adaptors{ + my ($self) = shift; + + my %pairs = ( + 'Channel' => 'Bio::EnsEMBL::Funcgen::DBSQL::ChannelAdaptor', + 'ExperimentalChip' => 'Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalChipAdaptor', + 'ArrayChip' => 'Bio::EnsEMBL::Funcgen::DBSQL::ArrayChipAdaptor', + 'Array' => 'Bio::EnsEMBL::Funcgen::DBSQL::ArrayAdaptor', + 'ProbeSet' => 'Bio::EnsEMBL::Funcgen::DBSQL::ProbeSetAdaptor', + 'Probe' => 'Bio::EnsEMBL::Funcgen::DBSQL::ProbeAdaptor', + 'ProbeFeature' => 'Bio::EnsEMBL::Funcgen::DBSQL::ProbeFeatureAdaptor', + 'AnnotatedFeature' => 'Bio::EnsEMBL::Funcgen::DBSQL::AnnotatedFeatureAdaptor', + 'RegulatoryFeature' => 'Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor', + 'Experiment' => 'Bio::EnsEMBL::Funcgen::DBSQL::ExperimentAdaptor', + 'ExperimentalGroup' => 'Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalGroupAdaptor', + 'DataSet' => 'Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor', + 'FeatureType' => 'Bio::EnsEMBL::Funcgen::DBSQL::FeatureTypeAdaptor', + 'FGCoordSystem' => 'Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor',#prepended FG to override core adaptor? + 'MetaCoordContainer' => 'Bio::EnsEMBL::Funcgen::DBSQL::MetaCoordContainer', + 'FeatureSet' => 'Bio::EnsEMBL::Funcgen::DBSQL::FeatureSetAdaptor', + 'ResultSet' => 'Bio::EnsEMBL::Funcgen::DBSQL::ResultSetAdaptor', + 'DataSet' => 'Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor', + 'InputSet' => 'Bio::EnsEMBL::Funcgen::DBSQL::InputSetAdaptor', + 'ExternalFeature' => 'Bio::EnsEMBL::Funcgen::DBSQL::ExternalFeatureAdaptor', + 'CellType' => 'Bio::EnsEMBL::Funcgen::DBSQL::CellTypeAdaptor', + 'DBEntry' => 'Bio::EnsEMBL::Funcgen::DBSQL::DBEntryAdaptor', + 'Slice' => 'Bio::EnsEMBL::Funcgen::DBSQL::SliceAdaptor', + 'ResultFeature' => 'Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor', + 'MotifFeature' => 'Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor', + 'BindingMatrix' => 'Bio::EnsEMBL::Funcgen::DBSQL::BindingMatrixAdaptor', + 'SegmentationFeature'=> 'Bio::EnsEMBL::Funcgen::DBSQL::SegmentationFeatureAdaptor', + + + #add required EnsEMBL(core) adaptors here + #Should write/retrieve from efg not dna db + 'UnmappedObject' => 'Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor', + 'Analysis' => 'Bio::EnsEMBL::DBSQL::AnalysisAdaptor', + "MetaContainer" => 'Bio::EnsEMBL::DBSQL::MetaContainer', + ); + + return (\%pairs); +} + +=head2 _get_schema_build + + Arg [1] : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor or Bio::EnsEMBL::DBSQL::DBAdaptor + Example : my $shema_build = $db->_get_schema_build($slice->adaptor->db()); + DESCRIPTION: + Returntype : string + Exceptions : Throws if argument not supplied + Caller : general + Status : At risk - replace with MetaContainer method + +=cut + + +#Slightly hacky convinience method to get the data/schema.version/build from a feature slice + +sub _get_schema_build{ + my ($self, $db) = @_; + + #Have to explicitly pass self->db to this method if required, this highlights which db is being tested + throw("Need to define a DBAdaptor to retrieve the schema_build from") if (! $db); + #avoided using dnadb by default to avoid obfuscation of behaviour + + my @dbname = split/_/, $db->dbc->dbname(); + + + + my $schema_build = pop @dbname; + $schema_build = pop(@dbname).'_'.$schema_build; + return $schema_build; +} + + +#Remove all this dnadb setter functionality + +sub dnadb_host{ + my ($self, $host) = @_; + $self->{'dnadb_host'} = $host if $host; + return $self->{'dnadb_host'}; +} + +sub dnadb_name{ + return $_[0]->{dnadb_name}; +} + +sub dnadb_port{ + my ($self, $port) = @_; + $self->{'dnadb_port'} = $port if $port; + return $self->{'dnadb_port'}; +} + +sub dnadb_pass{ + my ($self, $pass) = @_; + $self->{'dnadb_pass'} = $pass if $pass; + return $self->{'dnadb_pass'}; +} + +sub dnadb_user{ + my ($self, $user) = @_; + $self->{'dnadb_user'} = $user if $user; + return $self->{'dnadb_user'}; +} + +sub dnadb_assembly{ + my ($self, $assm) = @_; + $self->{'dnadb_assm'} = $assm if $assm; + return $self->{'dnadb_assm'}; +} + +#Redefine dbadb here to add coordsystem + +=head2 dnadb + + Arg [1]: Bio::EnsEMBL::DBSQL::DBAdaptor + Arg [2]: string - coord_system name e.g. chromosome + Usage : my $dnadb = $db->dnadb(); + Description: returns the database adaptor where the dna lives i.e. the core db for a given species + There are at least 2 cases where you need to set this explicitly + 1. If you want to retrieve features on an assembly which is not the default in + the correspeonding core DB with matching schema_build + 2. If the corresponding core DB is not available on the default ensembl DB + server(ensembldb/ens-livemirror) i.e. before a new release. + Status : At risk. - Might remove validation of CS + +=cut + +sub dnadb { + my ($self, $dnadb, $cs_name) = @_; + + + if($dnadb || $self->SUPER::dnadb->group() ne 'core'){ + + if(! $dnadb){#Guess/set dnadb by assembly or dnadb params + return $self->_set_dnadb; + } + + $self->SUPER::dnadb($dnadb); + + #set default coordsystem here + #there might not be a chromosome level if we just have a scaffold assembly + #supercontig is already loaded as we use toplevel? + #This should really get all the default CSs from the core CSAdaptor? + #We should also do this during the update? + #This will also enable people to query using clone/contig/supercontig level slices + #How will this work? Where will the mapping between CSs be done? + + my @cs_names; + @cs_names = ($cs_name) if $cs_name; + #$cs_name ||= 'chromosome'; + + if(! $cs_name){ + + foreach my $cs(@{$dnadb->get_CoordSystemAdaptor->fetch_all_by_attrib('default_version')}){ + push @cs_names, $cs->name; + } + } + + + foreach my $cs_name(@cs_names){ + + my $cs; + eval { $cs = $dnadb->get_CoordSystemAdaptor->fetch_by_name($cs_name)}; + my $error = $@; + + if($error){ + my ($schema, $build) = split/_/, $self->_get_schema_build($dnadb); + $build =~ s/[a-z]//; + throw("It appears that the schema of ".$dnadb->dbc->dbname. + ' is incompatible with your current core API version('.$reg->software_version. + "). You could try using the $schema version of the core API, or alternatively try specifying ". + "different -dnadb/registry_host parameters to point to a make recent version containing build $build\n"); + + } + + + #this will only add the default assembly for this DB, if we're generating on another we need to add it separately. + + #!!! This is a non-obvious store behaviour !!! + #This can result in coord_system entries being written unknowingly if you are using the efg DB with a write user + $self->get_FGCoordSystemAdaptor->validate_and_store_coord_system($cs); + } + } + + return $self->SUPER::dnadb();#never pass @_ here! +} + + +=head2 _set_dnadb + + Usage : $self->_set_dnadb; + Description: Sets the dnadb to the latest version given the assembly version + Exceptions: Throws if no assembly version provided or cannot for appropriate dnadb on ensembldb + Caller: Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor::new or dnadb + Status : At risk + +=cut + + +sub _set_dnadb{ + my $self = shift; + + my $assm_ver = $self->dnadb_assembly; + my $dnadb_name = $self->dnadb_name; + + #Sanity check, but this should always be the case when called from new/dnadb + if(! ($assm_ver || $dnadb_name)){ + throw("You need to have define at least a dnadb_assembly($assm_ver) or dnadb_name($dnadb_name) before calling this method"); + } + + throw('Must provide and assembly version to set the dnadb') if ! defined $assm_ver; + + my $reg_lspecies = $reg->get_alias($self->species()); + #The registry has incremented the species as we have recreated the efg DB + #possibly using a different schema_build + #This set true lspecies to allow dnadb detection + #in multi DB environments e.g. DAS server + my $lspecies = $reg_lspecies; + $lspecies =~ s/[0-9]+$// if($lspecies =~ /[0-9]$/); + + + throw('Either specify a species parameter or set species.production_name in the meta table to set dnadb automatically, alternatively pass a dnadb parameter') if $lspecies eq 'default'; + + #Wse params first + #else registry params + #else ensembldb + + my @ports = ($self->dnadb_port); + + #Start with lastest MySQL instances + #We are over-riding specified port here, only for known hosts + #we should really account for this and make it nr + if($self->dnadb_host eq 'ensdb-archive'){# + @ports = (5304, 3304); + } + elsif($self->dnadb_host eq 'ensembldb.ensembl.org'){ + @ports = (5306, 3306); + } + + + if(! $dnadb_name){ + $dnadb_name = $lspecies.'_core_%_'.$assm_ver.'%'; + } + + my $sql = 'show databases like "'.$dnadb_name.'"'; + my ($dbh, @dbnames, $port, $host_port); + + foreach $port(@ports){ + #This is probably duplicating connections and over-riding any connection + #pooling going on in the base DBConnection if we are using the same host port + #as the registry connection + + $dbh = DBI->connect('DBI:mysql:host='.$self->dnadb_host.";port=${port}", + $self->dnadb_user, + $self->dnadb_pass, + {'RaiseError' => 1}); + + eval { @dbnames = map {$_ = "@$_"} @{$dbh->selectall_arrayref($sql)}; }; + + if($@){ + throw('Failed to fetch dna DB names from '.$self->dnadb_host.":${port}"."\n$@"); + } + + + #Will always take the latest release, not the latest genebuild version + #Which is probably what we want anyway + + @dbnames = grep(/core_[0-9]/, sort @dbnames); + + if(scalar(@dbnames)==0){ + warn(':: Failed to find dnadb like '.$dnadb_name.', using ' + .$self->dnadb_user.'@'.$self->dnadb_host.':'.$port); + } + else{ + $host_port = $port; + last; + } + } + + throw("Failed to find dnadb like $dnadb_name.") if(scalar(@dbnames)==0); + + + warn ":: Auto-selecting build $assm_ver core DB as:\t". + $self->dnadb_user.'@'.$dbnames[$#dbnames].':'.$self->dnadb_host.':'.$host_port."\n"; + + + my $db = $reg->reset_DBAdaptor($reg_lspecies, 'core', $dbnames[$#dbnames], $self->dnadb_host, $host_port, $self->dnadb_user, $self->dnadb_pass); + + $self->dnadb($db); + return $db; +} + + + + + +#General Status methods +#will Move to Bio::EnsEMBL::Funcgen::DBSQL::Status + +=head2 fetch_all_states + + Arg [1] : string - table name + Arg [2] : int - table id + Example : my @states = @{$db->fetch_all_states('channel', 1)}; + Description: Retrieves all states associated with the given table record + Returntype : Listref + Exceptions : Throws if arguments not supplied + Caller : general + Status : At risk - Move to Status + +=cut + +sub fetch_all_states{ + my ($self, $table, $id) = @_; + + + throw("DBAdaptor::fetch_all_states is deprecated"); + + + throw("Need to specifiy a table and an id to retrieve status") if (! $table || ! $id); + + + my $sql = "SELECT state FROM status WHERE table_name=\"$table\" AND table_id=\"$id\""; + + my @states = map{ $_ = "@$_"} @{$self->dbc->db_handle->selectall_arrayref($sql)}; + + return \@states; +} + + +=head2 fetch_status_by_name + + Arg [1] : string - table name + Arg [2] : int - table id + Arg [3] : string - status + Example : if($db->fetch_status_by_name('channel', 1, 'IMPORTED'){ ... }; + Description: Retrieves given state associated with the table record + Returntype : ARRAYREF + Exceptions : Throws if arguments not supplied + Caller : general + Status : At risk - Move to Stasus + +=cut + + + +sub fetch_status_by_name{ + my ($self, $table, $id, $state) = @_; + + throw("DBAdaptor::fetch_status_by_name is deprecated"); + + throw("Need to specify a table and an id to retrieve status") if (! $table || ! $id || ! $state); + + #should we enum the state? + + + my $sql = "SELECT state FROM status WHERE table_name=\"$table\" AND table_id=\"$id\" AND state=\"$state\""; + return $self->dbc->db_handle->selectrow_array($sql); +} + + +=head2 set_status + + Arg [1] : string - table name + Arg [2] : int - table id + Arg [3] : string - status + Example : $db->set_status('channel', 1, 'IMPORTED'); + DESCRIPTION: RETRIEVES GIVEN STATE ASSOCIATED WITH THE table record + Returntype : ARRAYREF + Exceptions : Throws if arguments not supplied + Caller : general + Status : At risk - Move to Status + +=cut + + +sub set_status{ + my ($self, $table, $id, $state) = @_; + + throw("DBAdaptor::set_status is deprecated"); + + throw("Need to supply a table, dbid and a valid status") if (!($table && $id && $state)); + + my $sql = "INSERT INTO status(table_id, table_name, state) VALUES(\"$id\", \"$table\", \"$state\")"; + $self->dbc->do($sql); + + return; +} + + +sub stable_id_prefix{ + my $self = shift; + + if(! defined $self->{'stable_id_prefix'}){ + ($self->{'stable_id_prefix'}) = @{$self->dnadb->get_MetaContainer->list_value_by_key('species.stable_id_prefix')}; + + #Only add R if it is defined + $self->{'stable_id_prefix'} .= 'R' if $self->{'stable_id_prefix'}; + } + + return $self->{'stable_id_prefix'} +} + + +=head2 connect_string + + Example : my $import_cmd = 'mysqlimport '.$db->connect_string()." $table_file"; + Description: Retrieves the mysql cmdline connection string + Returntype : String + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub connect_string{ + my $self = shift; + + return '-h'.$self->dbc->host().' -u'.$self->dbc->username().' -p'.$self->dbc->password() + .' -P'.$self->dbc->port().' '.$self->dbc->dbname(); +} + + + +# DEPRECATED METHODS # + +sub fetch_group_details{ + my ($self, $gname) = @_; + + deprecate("Please use ExperimentalGroupAdaptor"); + + throw("Need to specify a group name") if ! $gname; + my $sql = "SELECT * from experimental_group where name=\"$gname\""; + return $self->dbc->db_handle->selectrow_array($sql); +} + +sub import_group{ + my ($self, $gname, $loc, $contact) = @_; + + deprecate('Please use ExperimentalGroup/Adaptor to import experimental_group info'); + throw('import_group no longer supported'); +} + +sub set_dnadb_by_assembly_version{ + my $self = shift; + deprecate('Please use _set_dnadb'); + $self->{assembly_version} = shift; + return $self->_set_dnadb; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/DBEntryAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/DBEntryAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,724 @@ + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::DBEntryAdaptor - +MySQL Database queries to load and store external object references. + +=head1 SYNOPSIS + +$db_entry_adaptor = $db_adaptor->get_DBEntryAdaptor(); +$db_entry = $db_entry_adaptor->fetch_by_dbID($id); + +my $gene = $db_adaptor->get_GeneAdaptor->fetch_by_stable_id('ENSG00000101367'); +@db_entries = @{$db_entry_adaptor->fetch_all_by_Gene($gene)}; +@gene_ids = $db_entry_adaptor->list_gene_ids_by_extids('BAB15482'); + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::DBEntryAdaptor; + +use Bio::EnsEMBL::DBSQL::DBEntryAdaptor; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::DBEntry; +#use Bio::EnsEMBL::IdentityXref; +#use Bio::EnsEMBL::OntologyXref; + +use Bio::EnsEMBL::Utils::Exception qw(deprecate throw warning); + +use vars qw(@ISA @EXPORT); +use strict; + +@ISA = qw( Bio::EnsEMBL::DBSQL::DBEntryAdaptor Bio::EnsEMBL::DBSQL::BaseAdaptor); +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + + +=head2 fetch_all_by_FeatureType + + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType $feature_type + (The feature type to retrieve DBEntries for) + Arg [2] : optional external database name + Arg [3] : optional external_db type + Example : @db_entries = @{$db_entry_adaptor->fetch_by_FeatureType($feature_type)}; + Description: This returns a list of DBEntries associated with this feature type. + Returntype : listref of Bio::EnsEMBL::DBEntries; may be of type IdentityXref if + there is mapping data, or GoXref if there is linkage data. + Exceptions : throws if feature type object not passed + Caller : Bio::EnsEMBL::Funcgen::FeatureType + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType { + my ( $self, $feature_type, $ex_db_reg, $exdb_type ) = @_; + + if(!ref($feature_type) || !$feature_type->isa('Bio::EnsEMBL::Funcgen::FeatureType')) { + throw("Bio::EnsEMBL::Funcgen::FeatureType argument expected."); + } + + return $self->_fetch_by_object_type($feature_type->dbID(), 'FeatureType', $ex_db_reg, $exdb_type); +} + + +=head2 list_feature_type_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_feature_type_ids_by_extid('BEAF-32'); + Description: Gets a list of feature type IDs by external display ID + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_feature_type_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + return $self->_type_by_external_id( $external_name, 'FeatureType', + undef, $external_db_name ); +} + + +=head2 list_feature_type_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_feature_type_ids_by_extids(['ENST00012398371', ...]); + Description: Gets a list of feature type IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + + +sub list_feature_type_ids_by_extids { + my ( $self, $external_names, $external_db_name ) = @_; + + return $self->_type_by_external_ids( $external_names, 'FeatureType', + undef, $external_db_name ); +} + + +=head2 list_regulatory_feature_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_regulatory_feature_ids_by_extid('GO:0004835'); + Description: Gets a list of regulatory_feature IDs by external display ID + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_regulatory_feature_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + + return $self->_type_by_external_id( $external_name, 'RegulatoryFeature', + undef, $external_db_name ); +} + + +=head2 list_regulatory_feature_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_regulatory_feature_ids_by_extids(['ENSG00283757289', ...]); + Description: Gets a list of regulatory_feature IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + + +sub list_regulatory_feature_ids_by_extids { + my ( $self, $external_names, $external_db_name ) = @_; + + + return $self->_type_by_external_ids( $external_names, 'RegulatoryFeature', + undef, $external_db_name ); +} + + +=head2 list_external_feature_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_external_feature_ids_by_extid('GO:0004835'); + Description: Gets a list of external_feature IDs by external display ID + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_external_feature_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + return + $self->_type_by_external_id( $external_name, 'ExternalFeature', undef, + $external_db_name ); +} + + +=head2 list_external_feature_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_external_feature_ids_by_extids('ENSG00085672387',...]); + Description: Gets a list of external_feature IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + + +sub list_external_feature_ids_by_extids { + my ( $self, $external_names, $external_db_name ) = @_; + + return + $self->_type_by_external_ids( $external_names, 'ExternalFeature', undef, + $external_db_name ); +} + + +=head2 list_annotated_feature_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_annotated_feature_ids_by_extid('GO:0004835'); + Description: Gets a list of annotated_feature IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_annotated_feature_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + return + $self->_type_by_external_id( $external_name, 'AnnotatedFeature', undef, + $external_db_name ); +} + + +=head2 list_annotated_feature_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_annotated_feature_ids_by_extids('ENSG00023847582', ...]); + Description: Gets a list of annotated_feature IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + + +sub list_annotated_feature_ids_by_extids{ + my ( $self, $external_names, $external_db_name ) = @_; + + return + $self->_type_by_external_ids( $external_names, 'AnnotatedFeature', undef, + $external_db_name ); +} + + + +=head2 list_probe_feature_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_annotated_feature_ids_by_extid('ENST000000000001'); + Description: Gets a list of annotated_feature IDs by external display ID + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_probe_feature_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + return + $self->_type_by_external_id( $external_name, 'ProbeFeature', undef, + $external_db_name ); +} + + +=head2 list_probe_feature_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_annotated_feature_ids_by_extids(['ENST000000000001', ...]); + Description: Gets a list of annotated_feature IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_probe_feature_ids_by_extids { + my ( $self, $external_names, $external_db_name ) = @_; + + return + $self->_type_by_external_ids( $external_names, 'ProbeFeature', undef, + $external_db_name ); +} + + + + +=head2 list_probe_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_probe_id_by_extid('ENST000000000001'); + Description: Gets a list of probe IDs by external display ID + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_probe_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + return + $self->_type_by_external_id( $external_name, 'Probe', undef, + $external_db_name ); +} + + +=head2 list_probe_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_probe_id_by_extids(['ENST000000000001'], ...); + Description: Gets a list of probe IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_probe_ids_by_extids { + my ( $self, $external_names, $external_db_name ) = @_; + + return + $self->_type_by_external_ids( $external_names, 'Probe', undef, + $external_db_name ); +} + +=head2 list_probeset_ids_by_extid + + Arg [1] : string $external_name + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_probeset_ids_by_extid('ENST000000000001'); + Description: Gets a list of probeset IDs by external display ID + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_probeset_ids_by_extid { + my ( $self, $external_name, $external_db_name ) = @_; + + return + $self->_type_by_external_id( $external_name, 'ProbeSet', undef, + $external_db_name ); +} + +=head2 list_probeset_ids_by_extids + + Arg [1] : ARRAYREF of external name strings + Arg [2] : (optional) string $external_db_name + Example : @tr_ids = $dbea->list_probeset_ids_by_extids(['ENST000000000001'], ...); + Description: Gets a list of probeset IDs by external display IDs + Returntype : list of Ints + Exceptions : none + Caller : unknown + Status : At risk + +=cut + +sub list_probeset_ids_by_extids { + my ( $self, $external_names, $external_db_name ) = @_; + + return + $self->_type_by_external_ids( $external_names, 'ProbeSet', undef, + $external_db_name ); +} + + + +=head2 list_regulatory_feature_ids_by_external_db_id + + Arg [1] : string $external_id + Example : @gene_ids = $dbea->list_regulatory_feature_ids_by_external_db_id(1020); + Description: Retrieve a list of regulatory_feature ids by an external identifier that is + linked to any of the genes transcripts, translations or the + gene itself. NOTE: if more than one external identifier has the + same primary accession then genes for each of these is returned. + Returntype : list of ints + Exceptions : none + Caller : unknown + Status : Stable + +=cut + +sub list_regulatory_feature_ids_by_external_db_id{ + my ($self,$external_db_id) = @_; + + my %T = map { ($_, 1) } + $self->_type_by_external_db_id( $external_db_id, 'RegulatoryFeature' ); + return keys %T; +} + + + + + +=head2 _type_by_external_id + + Arg [1] : string $name - dbprimary_acc + Arg [2] : string $ensType - ensembl_object_type + Arg [3] : (optional) string $extraType + Arg [4] : (optional) string $external_db_name + other object type to be returned + Example : $self->_type_by_external_id($name, 'regulatory_feature'); + Description: Gets + Returntype : list of dbIDs (regulatory_feature, external_feature ) + Exceptions : none + Caller : list_regulatory/external_feature_ids_by_extid + Status : Stable + +=cut + +sub _type_by_external_id { + my ( $self, $name, $ensType, $extraType, $external_db_name ) = @_; + + #Can't deprecate/remove this as it is part of the core API interface + + return $self->_type_by_external_ids([$name], $ensType, $extraType, $external_db_name); +} ## end sub _type_by_external_id + + + +=head2 _type_by_external_ids + + Arg [1] : ARRAYREF of external names strings + Arg [2] : string $ensType - ensembl_object_type + Arg [3] : (optional) string $extraType + Arg [4] : (optional) string $external_db_name + other object type to be returned + Example : $self->_type_by_external_ids([$name, $name2] 'regulatory_feature'); + Description: Gets + Returntype : list of dbIDs (regulatory_feature, external_feature ) + Exceptions : none + Caller : list_regulatory/external_feature_ids_by_extid + Status : Stable + +=cut + + +sub _type_by_external_ids { + my ( $self, $names, $ensType, $extraType, $external_db_name ) = @_; + + my $from_sql = ''; + my $where_sql = ''; + my $ID_sql = "oxr.ensembl_id"; + + if ( defined $extraType ) { + #This was DBLinks query? We could do this for ProbeSet->Probe->ProbeFeature? + #See core method for missing code + throw('Extra types not accomodated in eFG xref schema'); + } + + + + + #These were specifying is_current for transcript/gene joins + #Also ensure no old/orphaned DBEntries are returned + #HCs ensure these are now removed + #Would need to re-instate these for any status filtering or query extension + #i.e. direct restrieval of nested ensembl objects + + +# if(lc($ensType) eq 'regulatoryfeature'){ +# $from_sql = 'regulatory_feature rf, '; +# $where_sql = qq( rf.regulatory_feature_id = oxr.ensembl_id AND ); +# } +# elsif(lc($ensType) eq 'externalfeature'){ +# $from_sql = 'external_feature ef, '; +# $where_sql = qq( ef.external_feature_id = oxr.ensembl_id AND ); +# } +# elsif(lc($ensType) eq 'annotatedfeature'){ +# $from_sql = 'annotated_feature af, '; +# $where_sql = qq( af.annotated_feature_id = oxr.ensembl_id AND ); +# } +# elsif(lc($ensType) eq 'featuretype'){ +# $from_sql = 'feature_type ft, '; +# $where_sql = qq( ft.feature_type_id = oxr.ensembl_id AND ); +# } +# elsif(lc($ensType) eq 'probefeature'){ +# $from_sql = 'probe_feature pf, '; +# $where_sql = qq( pf.probe_feature_id = oxr.ensembl_id AND ); +# } +# elsif(lc($ensType) eq 'probe'){ +# $from_sql = 'probe p, '; +# $where_sql = qq( p.probe_id = oxr.ensembl_id AND ); +# } +# elsif(lc($ensType) eq 'probeset'){ +# $from_sql = 'probe_set ps, '; +# $where_sql = qq( ps.probe_set_id = oxr.ensembl_id AND ); +# } +# else{ +# throw("ensembl_object_type $ensType is not accommodated"); +# } + + + if ( defined($external_db_name) ) { + # Involve the 'external_db' table to limit the hits to a particular + # external database. + + $from_sql .= 'external_db xdb, '; + $where_sql .= + 'xdb.db_name LIKE ' + . $self->dbc()->db_handle()->quote( $external_db_name . '%' ) + . ' AND xdb.external_db_id = x.external_db_id AND'; + } + + + my $in_clause = '('.join(', ', (map $self->db->dbc->db_handle->quote($_), @$names)).')'; + #For use with use selectcol_arrayref + + #my $in_clause = '('.join(', ', (('?') x scalar(@$names))).')'; + #For use with fetchall_hashref + + my @queries = ( + "SELECT $ID_sql + FROM $from_sql xref x, object_xref oxr + WHERE $where_sql x.dbprimary_acc IN $in_clause AND + x.xref_id = oxr.xref_id AND + oxr.ensembl_object_type= '${ensType}'", + "SELECT $ID_sql + FROM $from_sql xref x, object_xref oxr + WHERE $where_sql x.display_label IN $in_clause AND + x.xref_id = oxr.xref_id AND + oxr.ensembl_object_type= '${ensType}'" + ); + + if ( defined $external_db_name ) { + # If we are given the name of an external database, we need to join + # between the 'xref' and the 'object_xref' tables on 'xref_id'. + + push @queries, "SELECT $ID_sql + FROM $from_sql xref x, object_xref oxr, external_synonym syn + WHERE $where_sql syn.synonym IN $in_clause AND + x.xref_id = oxr.xref_id AND + oxr.ensembl_object_type= '${ensType}' AND + syn.xref_id = oxr.xref_id"; + } else { + # If we weren't given an external database name, we can get away + # with less joins here. + + push @queries, "SELECT $ID_sql + FROM $from_sql object_xref oxr, external_synonym syn + WHERE $where_sql syn.synonym IN $in_clause AND + oxr.ensembl_object_type= '${ensType}' AND + syn.xref_id = oxr.xref_id"; + } + + # Increase speed of query by splitting the OR in query into three + # separate queries. This is because the 'or' statments renders the + # index useless because MySQL can't use any fields in it. + + # Changed this to a UNION and grab the col arrayref directly + + + #UNION will return NR row but ensembl_ids may have >1 ox row + #hence selectcol won't necessarily return NR ID list, + #as caller will use fetch_all_by_dbID_list which will make it NR + #change to selectall_hashref if this changes + + + + return @{$self->db->dbc->db_handle->selectcol_arrayref(join(' UNION ', @queries))}; + +} ## end sub _type_by_external_ids + + + + +=head2 _type_by_external_db_id + + Arg [1] : string $type - external_db type + Arg [2] : string $ensType - ensembl_object_type + Arg [3] : (optional) string $extraType + other object type to be returned + Example : $self->_type_by_external_id('1030', 'Translation'); + Description: Gets + Returntype : list of dbIDs (gene_id, transcript_id, etc.) + Exceptions : none + Caller : list_translation_ids_by_extids + translationids_by_extids + geneids_by_extids + Status : Stable + +=cut + +sub _type_by_external_db_id{ + my ($self, $external_db_id, $ensType, $extraType) = @_; + + my $from_sql = ''; + my $where_sql = ''; + my $ID_sql = "oxr.ensembl_id"; + + if (defined $extraType) { + #See core method for missing code + throw('Extra types not accomodated in eFG xref schema'); + } + + + if(lc($ensType) eq 'regulatoryfeature'){ + $from_sql = 'regulatory_feature rf, '; + $where_sql = qq( rf.regulatory_feature_id = oxr.ensembl_id AND ); + } + elsif(lc($ensType) eq 'externalfeature'){ + $from_sql = 'external_feature ef, '; + $where_sql = qq( ef.external_feature_id = oxr.ensembl_id AND ); + } + elsif(lc($ensType) eq 'annotatedfeature'){ + $from_sql = 'annotated_feature af, '; + $where_sql = qq( af.annotated_feature_id = oxr.ensembl_id AND ); + } + elsif(lc($ensType) eq 'featuretype'){ + $from_sql = 'featuretype ft, '; + $where_sql = qq( ft.feature_type_id = oxr.ensembl_id AND ); + } + elsif(lc($ensType) eq 'probefeature'){ + $from_sql = 'probe_feature pf, '; + $where_sql = qq( pf.probe_feature_id = oxr.ensembl_id AND ); + } + elsif(lc($ensType) eq 'probe'){ + $from_sql = 'probe p, '; + $where_sql = qq( p.probe_id = oxr.ensembl_id AND ); + } + elsif(lc($ensType) eq 'probeset'){ + $from_sql = 'probe_set ps, '; + $where_sql = qq( pf.probe_set_id = oxr.ensembl_id AND ); + } + else{ + throw("ensembl_object_type $ensType is not accommodated"); + } + + my $query = + "SELECT $ID_sql + FROM $from_sql xref x, object_xref oxr + WHERE $where_sql x.external_db_id = ? AND + x.xref_id = oxr.xref_id AND oxr.ensembl_object_type= ?"; + +# Increase speed of query by splitting the OR in query into three separate +# queries. This is because the 'or' statments render the index useless +# because MySQL can't use any fields in the index. + + my %hash = (); + my @result = (); + + + my $sth = $self->prepare( $query ); + $sth->bind_param(1, "$external_db_id", SQL_VARCHAR); + $sth->bind_param(2, $ensType, SQL_VARCHAR); + $sth->execute(); + while( my $r = $sth->fetchrow_array() ) { + if( !exists $hash{$r} ) { + $hash{$r} = 1; + push( @result, $r ); + } + } + return @result; +} + +#Placeholders to catch error from inherited methods +#These now work in reverse as the Gene/Transcript/Translation +#is the xref not the ensembl_object as with the core code + +sub fetch_all_by_Gene { + my ( $self, $gene) = @_; + + if(! (ref($gene) && $gene->isa('Bio::EnsEMBL::Gene'))) { + throw("Bio::EnsEMBL::Gene argument expected."); + } + + throw('Not yet implemented for eFG, maybe you want the core DBEntryAdaptor?'); + + + #This is going to be a bit of a work around as we should really have a separate fetch method + #fetch_all_by_external_name_object_type? + #No!! Because this simply pulls back the xrefs, not the object xrefs!! + #This is the same for the fetch_by_dbID method??? + + #_fetch_by_external_id + #The problem here is that we want to return ox info aswell. + #Just rewrite _fetch_by_object_type? +} + + +sub fetch_all_by_Transcript { + my ( $self, $trans) = @_; + + throw('Not implemented in eFG, maybe you want the core DBEntryAdaptor?'); + + return; +} +sub fetch_all_by_Translation { + my ( $self, $trans) = @_; + + throw('Not implemented in eFG, maybe you want the core DBEntryAdaptor?'); + + return; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/DataSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/DataSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,793 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::DataSetAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::DataSetAdaptor - A database adaptor for fetching and +storing DataSet objects. + +=head1 SYNOPSIS + +my $dset_adaptor = $db->get_DataSetAdaptor(); + +my $dset = $dset_adaptor->fetch_by_dbID(1); +my @displayable_dsets = $dset_adaptor->fetch_all_displayable(); + +=head1 DESCRIPTION + +The DataSetAdaptor is a database adaptor for storing and retrieving +DataSet objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +use strict; +use warnings; + +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); #do we need this? +#we can't pass the slice through the BaseAdaptor +#So we either don't use it, or use the slice on all the DataSet calls? + + + +#Generates DataSet contains info about DataSet content +#do we need to accomodate different classes of data or multiple feature types in one set? i.e. A combi experiment (Promot + Histone mod)? +#schema can handle this...API? ignore for now but be mindful. +#This is subtley different to handling different experiments with different features in the same DataSet. +#Combi will have same sample. +#See DataSet for definitions of set types + + +#This needs one call to return all displayable sets, grouped by cell_line and ordered by FeatureType +#needs to be restricted to cell line, feature type, but these fields have to be disparate from result_feature +#as this is only a simple linker table, and connections may not always be present +#so cell tpye and feature type constraints have to be performed on load, then can assume that associated features and results +# have same cell type/feature +#so we need to group by cell_type in sql and then order by feature_type_id in sql or rearrange in code? +#This will not know about chip sets, just that a feature set is linked to various result sets +#There fore we need to use the chip_set_id or link back to the experimental_chip chip_set_ids +#this would require a self join on experimental_chip + + + +#what other methods do we need? all with displayable option +#fetch by Experiment_Slice? +#fetch by FeatureType_Slice? +#fetch by CellType_Slice? +#set_chips (duplicates)? We are now using result_set_id as the chip_set key, so if we didn't know the sets previosuly, then we would have to alter the result_set_id retrospectively i.e. change the result_set_id. This would also require a check on result_feature to see if any feature_sets had been associated, and cleaning up of duplicate result_feature entries if the same feature were attached to both of the previously separate result sets. +#this may require an on duplicate key call....delete one. + +#wouldn't it be better to associate the chip set info with the ec's rather than the result set? + +#how are we going to accomodate a combi exp? Promot + Histone mods? +#These would lose their exp set association, i.e. same exp & sample different exp method +#we're getting close to defining the regulon here, combined results features from the same exp +#presently want them displayed as a group but ordered appropriately +#was previously treating each feature as a separate result set + + +#for storing/making link we don't need the Slice context +#store should check all +#so do we move the slice context to the object methods or make optional +#then object method can check for slice and throw or take a Slice as an optional argument +#this will enable generic set to be created to allow loading and linking of features to results +#we still need to know which feature arose from which chip!!!! Not easy to do and may span two. +#Need to genericise this to the chip_set(or use result_set_id non unique) +#We need to disentangle setting the feature to chip/set problem from the displayable problem. +#change the way StatusAdaptor works to accomodate result_set_id:table_name:table_id, as this will define unique results +# + +#can we extend this to creating skeleton result sets and loading raw results too? +# + +#Result.pm should be lightweight by default to enable fast web display, do we need oligo_probe_id? + + +#how are we going to overcome unlinked but displayable sets? +#incomplete result_feature records will be hack to update/alter? +#could have attach_result to feature method? +#force association when loading features + +=head2 fetch_by_name + + Arg [1] : string - name of DataSet + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my $dset = $dset_adaptor->fetch_by_name('RegulatoryFeatures:MultiCell'); + Description: Fetch DataSet with a given name + Returntype : Bio::EnsEMBL::Funcgen::DataSet + Exceptions : Throws if no name passed + Caller : General + Status : At Risk + +=cut + +sub fetch_by_name { + my ($self, $name, $status) = @_; + + throw("Must provide a name argument") if (! defined $name); + + my $sql = "ds.name='".$name."'"; + + if ($status) { + my $constraint = $self->status_to_constraint($status) if $status; + $sql = (defined $constraint) ? $sql." ".$constraint : undef; + } + + return (defined $sql) ? $self->generic_fetch($sql)->[0] : []; + +} + +=head2 fetch_all_by_supporting_set_type + + Arg [1] : string - type of supporting_sets i.e. result or feature + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my $dsets = $dset_adaptor->fetch_all_by_supporting_set('feature'); + Description: Fetch all DataSets whose pre-processed data consists of a particular set type + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : Throws if no supporting_set_type passed + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_supporting_set_type { + my ($self, $type, $status) = @_; + + throw("Must provide a supporting_set type argument") if (! defined $type); + + my $sql = "ss.type='".$type."'"; + + if ($status) { + my $constraint = $self->status_to_constraint($status) if $status; + $sql = (defined $constraint) ? $sql." ".$constraint : undef; + } + + return (defined $sql) ? $self->generic_fetch($sql) : []; + +} + +=head2 fetch_all_by_product_FeatureSet_type + + Arg [1] : string - product feature set type for this data_set e.g. 'annotated', 'regulatory' + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my $dsets = $dset_adaptor->fetch_all_by_product_FeatureSet_type('regulatory'); + Description: Fetch all DataSets of a given product feature set type + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : Throws if no product feaure set type passed + Caller : General + Status : At Risk - not yet implmented + +=cut + +sub fetch_all_by_product_FeatureSet_type { + my ($self, $type, $status) = @_; + + throw('Not yet implemented'); + #left join? same for result_set? + #Or do we do a sneaky workaround and use the FeatureSet adaptor here to get the feature sets we require + #and then call self? + #or do we just use feature_sets for reg feats and forget about the possiblity of displaying the suppoorting sets? + #will we ever want to display supporting sets in contig/neighbourhood view? + #or will we only use expanded view in a feature context, hence we can just use the Regulatory Attribs? + #don't really want to expand regulatory feats on contig view, so access FeatureSet directly + + throw("Must provide a supporting_set type argument") if (! defined $type); + + my $sql = "ss.type='".$type."'"; + + if ($status) { + my $constraint = $self->status_to_constraint($status) if $status; + $sql = (defined $constraint) ? $sql." ".$constraint : undef; + } + + return (defined $sql) ? $self->generic_fetch($sql) : []; + +} + + +=head2 fetch_by_product_FeatureSet + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureSet + Example : my @dsets = $fs_adaptopr->fetch_by_product_FeatureSet($fset); + Description: Retrieves DataSet objects from the database based on the FeatureSet. + Returntype : Bio::EnsEMBL::Funcgen::DataSet + Exceptions : Throws if arg is not a valid FeatureSet + Caller : General + Status : Deprecated - use fetch_all_by_product_FeatureSet + +=cut + +sub fetch_all_by_FeatureSet { + my $self = shift; + + deprecate('Use fetch_by_product_FeatureSet'); + + return $self->fetch_by_product_FeatureSet(@_); + +} + + +=head2 fetch_by_product_FeatureSet + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureSet + Example : my @dsets = $fs_adaptopr->fetch_by_product_FeatureSet($fset); + Description: Retrieves DataSet objects from the database based on the FeatureSet. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : Throws if arg is not a valid FeatureSet + Caller : General + Status : At Risk + +=cut + + +#This is main FeatureSet, i.e. the result of the analysis of the supporting_sets +#Supporting sets could also be FeatureSets!!! Confusion! + +sub fetch_by_product_FeatureSet { + my $self = shift; + my $fset = shift; + + + if (! ($fset && $fset->isa("Bio::EnsEMBL::Funcgen::FeatureSet") && $fset->dbID())) { + throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::FeatureSet object"); + } + + my $sql = "ds.feature_set_id = '".$fset->dbID()."'"; + + + return $self->generic_fetch($sql)->[0]; +} + + +=head2 fetch_all_by_ResultSet + + Arg [1] : Bio::EnsEMBL::Funcgen::ResultSet + Example : my @dsets = $fs_adaptopr->fetch_all_by_ResultSet($rset); + Description: Retrieves DataSet objects from the database based on the ResultSet. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : Throws if arg is not a valid ResultSet + Caller : General + Status : At Risk - to be removed + +=cut + +sub fetch_all_by_ResultSet { + my $self = shift; + my $rset = shift; + + deprecate('Use fetch_all_by_supporting_set'); + + return $self->fetch_all_by_supporting_set($rset); + + + #if(! ($rset && $rset->isa("Bio::EnsEMBL::Funcgen::ResultSet") && $rset->dbID())){ + # throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::ResultSet object"); + #} + + + ##self join here to make sure we get all linked result_sets + #my $sql = 'ds.data_set_id IN (SELECT ds.data_set_id from data_set ds where result_set_id='.$rset->dbID().')'; + + + #return $self->generic_fetch($sql); +} + + + +=head2 fetch_all_by_supporting_set + + Arg [1] : Bio::EnsEMBL::Funcgen::Result|FeatureSet + Example : my @dsets = $fs_adaptopr->fetch_all_by_supporting_set($rset); + Description: Retrieves DataSet objects from the database based on the + given supporting Result or FeatureSet. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : Throws if arg is not a valid Result|FeatureSet + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_supporting_set { + my $self = shift; + my $set = shift; + + if (! (ref($set) && + ( $set->isa("Bio::EnsEMBL::Funcgen::ResultSet") || + $set->isa("Bio::EnsEMBL::Funcgen::FeatureSet") || + $set->isa("Bio::EnsEMBL::Funcgen::InputSet")) + && $set->dbID())) { + throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::ResultSet, FeatureSet or InputSet object"); + } + + #self join here to make sure we get all linked result_sets + my $sql = ' ds.data_set_id IN (SELECT data_set_id from supporting_set where type="'.$set->set_type.'" and supporting_set_id='.$set->dbID().')'; + + return $self->generic_fetch($sql); +} + + + + + +=head2 fetch_all_by_feature_type_class + + Arg [1] : string - class of associated FeatureSet FeatureType + Arg [2] : optional: string - status e.g. 'DISPLAYABLE' + Example : my @dsets = @{$ds_adaptopr->fetch_all_by_feature_type_class('HISTONE')}; + Description: Retrieves DataSet objects from the database based on the FeatureSet FeatureType class. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : Throws if no class arg defined + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_feature_type_class { + my ($self, $class, $status) = @_; + + throw ('Must provide a FeatureType class to retrieve DataSets') if ! defined $class; + + my ($constraint, @dsets); + + if ($status) { + $constraint = $self->status_to_constraint($status) if $status; + return [] if ! defined $constraint; + } + + + #This is fetching all feature sets! + #we need to left join this? + #we can't do it for class + #but we can do it for product feature_set type + + foreach my $dset (@{$self->generic_fetch($constraint)}) { + #uc both here to avoid case sensitivities + push @dsets, $dset if uc($dset->product_FeatureSet->feature_type->class()) eq uc($class); + } + + return \@dsets; +} + +=head2 fetch_all_displayable_by_feature_type_class + + Arg [1] : string - class of associated FeatureSet FeatureType + Example : my @dsets = @{$ds_adaptopr->fetch_all_displayable_by_feature_type_class('HISTONE')}; + Description: Wrapper method, retrieves all displayable DataSets with given FeatureSet FeatureType class + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all_displayable_by_feature_type_class { + my ($self, $class) = @_; + + return $self->fetch_all_by_feature_type_class($class, 'DISPLAYABLE'); +} + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + [ 'data_set', 'ds' ], + [ 'supporting_set', 'ss'], + ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + #my $self = shift; + + #will this work? May have multiple record/result_set_id + + + return qw( + ds.data_set_id ds.feature_set_id + ds.name ss.type + ss.supporting_set_id + ); +} + + +=head2 _left_join + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _left_join { + my $self = shift; + + return (['supporting_set', 'ds.data_set_id = ss.data_set_id']); +} + + +#=head2 _final_clause +# +# Args : None +# Example : None +# Description: PROTECTED implementation of superclass abstract method. +# Returns an ORDER BY clause. Sorting by oligo_feature_id would be +# enough to eliminate duplicates, but sorting by location might +# make fetching features on a slice faster. +# Returntype : String +# Exceptions : None +# Caller : generic_fetch +# Status : At Risk +# +#=cut + + +#this should be another left join? on feature_set and a join on feature_type so we can sort lexically on class, name +#should we implement a default sort in the data_set adaptor which could be superceeded by a custom list? + +#sub _final_clause { +# return ' ORDER BY fs.feature_type_id, fs.cell_type_id'; #group on cell_type_id +#} + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::DataSet objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@data_sets, @supporting_sets, $data_set, $dbID, $set_id); + my ($fset_id, $fset, $set, $name, $ss_type, $ss_id); + + my %set_adaptors = ( + feature => $self->db->get_FeatureSetAdaptor(), + result => $self->db->get_ResultSetAdaptor(), + input => $self->db->get_InputSetAdaptor(), + ); + + $sth->bind_columns(\$dbID, \$fset_id, \$name, \$ss_type, \$ss_id); + + while ( $sth->fetch() ) { + + if ((! $data_set) || ($data_set->dbID() != $dbID)) { + + if ($data_set) { + $data_set->add_supporting_sets(\@supporting_sets); + push @data_sets, $data_set; + #do not set to empty array as this will cause failure of check in DataSet->new + undef @supporting_sets; + } + + #handle absent sets, dbIDs of 0 + $fset = ($fset_id) ? $set_adaptors{'feature'}->fetch_by_dbID($fset_id) : undef; + + $data_set = Bio::EnsEMBL::Funcgen::DataSet->new( + -DBID => $dbID, + -NAME => $name, + -FEATURE_SET => $fset, + -ADAPTOR => $self, + ); + + } + + if ($ss_id) { + my $sset = $set_adaptors{$ss_type}->fetch_by_dbID($ss_id); + + if (! $sset) { + throw("Could not find supporting $ss_type set with dbID $ss_id"); + } + + push @supporting_sets, $sset; + } + } + + + #handle last set + if ($data_set) { + # warn "ssets are @supporting_sets"; + $data_set->add_supporting_sets(\@supporting_sets); + push @data_sets, $data_set; + } + + + #As we're not (quite) constraining how DataSets are associated + #it's valid to have a combined exp, i.e. Promoter plus Histone features and results? + #It can be valid to have several feature_types in the same set? + #This is entirely dependent on sensible experimental design and how we want to view the data. + #We could then have multiple cell types too? Getting too many dimensions to display sensibly within e! + #Logically we need one constant to key on, cell_type, feature_type, but also allow support combined info + #i.e. all Histone mods for one cell_type, but also have promoter data too? + #This is getting close to a 'regulon', as we're incorporating all sorts of supporting data + #There should be an order of display for and within each feature class + #(or if we have same feature across several cell types then we order alphabetically?) + #Other possible variables to order on: + #analysis_id, this would also separate the features as we can't base a feature on mutliple analyses of the same data + #So we either accomodate everything, where the only contraint is that we have one constant in the set + #Or we restrict the Set to handle just one feature_set and it's supporting result_sets + #Start simple, let's just take the one feature/data set problem first + + return \@data_sets; + } + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::DataSet objects + Example : $dsa->store(@dsets); + Description: Stores given DataSet objects in the database. Sets dbID and adaptor + on the objects that it stores. + Returntype : ARRAYREF of stored DataSet objects + Exceptions : Throws if no DataSet objects passed + Throws if DataSet object has already been stored + Throws if any supporting sets have not been stored + Caller : General + Status : At Risk + +=cut + + sub store{ + my ($self, @dsets) = @_; + + throw('Must pass a list of DataSet objects to store') if(! @dsets || $#dsets < 0); + + + my $sth = $self->prepare("INSERT INTO data_set (feature_set_id, name) + VALUES (?, ?)"); + my $sth2 = $self->prepare("INSERT INTO supporting_set (data_set_id, supporting_set_id, type) + VALUES (?, ?, ?)"); + + my ($fset_id); + + my $db = $self->db(); + + + foreach my $dset (@dsets) { + + throw('Must pass a DataSet object to store') if( ! ( ref $dset && + $dset->isa('Bio::EnsEMBL::Funcgen::DataSet'))); + + if ( $dset->is_stored($db) ) { + throw('DataSet [' . $dset->dbID() . '] is already stored in the database,'. + 'use store_updated_sets method to add new supporting sets in this DataSet'); + } + + $fset_id = (defined $dset->product_FeatureSet()) ? $dset->product_FeatureSet->dbID() : 0; + + $sth->bind_param(1, $fset_id, SQL_INTEGER); + $sth->bind_param(2, $dset->name(), SQL_VARCHAR); + $sth->execute(); + $dset->dbID( $sth->{'mysql_insertid'} ); + $dset->adaptor($self); + + + + foreach my $sset (@{$dset->get_supporting_sets()}) { + + throw("All supporting Feature and ResultSets must be stored previously.". + " Use store_updated_sets method if your DataSet has been stored") if(! $sset->is_stored($db)); + + $sth2->bind_param(1, $dset->dbID(), SQL_INTEGER); + $sth2->bind_param(2, $sset->dbID(), SQL_INTEGER); + $sth2->bind_param(3, $sset->set_type(), SQL_VARCHAR); #enum feature/result/experimental + $sth2->execute(); + } + } + + return \@dsets + } + + +=head2 store_updated_sets + + Args : List of previously stored Bio::EnsEMBL::Funcgen::DataSet objects + Example : $dsa->store_updated_sets(@dsets); + Description: Updates added supporting sets for a given previously stored DataSet + Returntype : ARRAYREF of updated DataSet objects + Exceptions : Throws if a list of DataSet objects is not provided + Throws if DataSet has not been previosuly stored + Throws if supporting set has not been previously stored + ? should we throw or warn if a set has been deleted? + Caller : General + Status : At Risk + +=cut + + #This needs to cahnge to an arrayref of dset and an overwrite flag + + + sub store_updated_sets{ + my ($self, $dsets, $overwrite) = @_; + + throw('Must pass a list of DataSet objects to store') if(! @$dsets || $#{$dsets} < 0); + my ($sql); + my $db = $self->db(); + my $sth = $self->prepare("INSERT INTO supporting_set (data_set_id, supporting_set_id, type) + VALUES (?, ?, ?)"); + + foreach my $dset (@$dsets) { + throw('Must pass a DataSet object to update') if( ! ( ref $dset && + $dset->isa('Bio::EnsEMBL::Funcgen::DataSet'))); + + throw('DataSet [' . $dset->dbID() . '] must be previous stored in the database') if (! $dset->is_stored($db) ); + my $stored_dset = $self->fetch_by_name($dset->name); + + + #Update product FeatureSet + #We need to do this first so we cacn check wether we're updated supporting_sets + #for a data set which has already got a product FeatureSet...not wise + + my $fset = $dset->product_FeatureSet; + my $stored_fset = $stored_dset->product_FeatureSet; + #This fset check is slight overkill, as you have to severly mangle a dataset to fail this validation + + if (defined $stored_fset) { + + if (! defined $fset) { + #How will this have ever happened? + warn("Populating absent product FeatureSet from DB for DataSet:\t".$dset->name); + } else { + #validate sets + if ($fset->dbID != $stored_fset->dbID) { + my $msg = 'Found product FeatureSet mismatch whilst updating DataSet('.$dset->name. + "):\tStored:".$stored_fset->name."\tUpdate:".$fset->name; + throw($msg) if ! $overwrite; + warn $msg; + } + } + } else { + #update data_set table + $sql = 'update data_set set feature_set_id='.$fset->dbID.' where data_set_id='.$dset->dbID; + $self->dbc->do($sql); + } + + my @sorted_ssets = sort {$a->dbID <=> $b->dbID} @{$dset->get_supporting_sets}; + my @stored_ssets = sort {$a->dbID <=> $b->dbID} @{$stored_dset->get_supporting_sets}; + my $mismatch = 0; + + $mismatch = 1 if(scalar(@sorted_ssets) != scalar(@stored_ssets)); + + if (! $mismatch) { + + for my $i (0..$#stored_ssets) { + + if ($stored_ssets[$i]->dbID != $sorted_ssets[$i]->dbID) { + $mismatch=1; + last; + } + } + } + + #Delete old supporting_sets + #We really only want to do this if there is a difference + #batched jobs cause race condition here + #unless updated once before submission + if ($mismatch && + $overwrite) { + $sql = 'DELETE from supporting_set where data_set_id='.$dset->dbID; + eval { $self->db->dbc->do($sql) }; + throw("Couldn\'t delete supporting_sets for DataSet:\t".$stored_dset->name."\n$@") if $@; + @stored_ssets = (); + } + + + #Update supporting sets + my %stored_dbids; + map {$stored_dbids{$_->dbID} = undef} @stored_ssets if @stored_ssets; + + foreach my $sset (@sorted_ssets) { + my $dbid = $sset->dbID; + + if (! grep(/^${dbid}$/, keys %stored_dbids)) { + throw("All supporting sets must be stored previously.") if(! $sset->is_stored($db)); + + #This causes problems when we want to re-run by slice + #Currently need + #warn "temporarily suspended throw for existing feature_set"; + throw('You are trying to update supporting sets for a data set which already has a product FeatureSet('.$stored_fset->name.'). Either rollback the FeatureSet before adding more supporting sets or specify the overwrite flag.') if defined $stored_fset && ! $overwrite; + + $sth->bind_param(1, $dset->dbID, SQL_INTEGER); + $sth->bind_param(2, $sset->dbID, SQL_INTEGER); + $sth->bind_param(3, $sset->set_type, SQL_VARCHAR); + #How is this failing? + #Is the index not being updated after the delete + $sth->execute(); + } + } + } + + return $dsets + } + + +=head2 list_dbIDs + + Args : None + Example : my @feature_ids = @{$ofa->list_dbIDs()}; + Description: Gets an array of internal IDs for all OligoFeature objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Stable + +=cut + + sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs('data_set'); + } + + 1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,385 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ExperimentAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::ExperimentAdaptor - A database adaptor for fetching and +storing Funcgen Experiment objects. + +=head1 SYNOPSIS + +my $exp_a = $db->get_ExperimentAdaptor(); +my $exp = $exp_a->fetch_by_name($name); + + +=head1 DESCRIPTION + +The ExperimentAdaptor is a database adaptor for storing and retrieving +Funcgen Experiment objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ExperimentAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Funcgen::Experiment; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); + + +#May need to our this? +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + + +=head2 fetch_by_name + + Arg [1] : string - name of an Experiment + Example : my $exp = $exp_a->fetch_by_name('Exp-1'); + Description: Retrieves a named Experiment object from the database. + Returntype : Bio::EnsEMBL::Funcgen::Experiment + Exceptions : Throws if no name defined or if more than one returned + Caller : General + Status : Medium risk + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + + throw("Need to specify and experiment name argument") if (! defined $name); + + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + my $result = $self->generic_fetch("e.name = ?"); + + if (scalar @$result > 1) { + throw("Experiment $name is not unique in the database, but only one result has been returned"); + #should have unique key of group_id and experiment_name + } + return $result->[0]; +} + + + +=head2 get_all_experiment_names + + Arg [1] : (optional) boolean - flag to denote whether experiment is flagged for web display + Example : my @names = @{$exp_a->get_all_experiment_names()}; + Description: Retrieves names of all experiments. + Returntype : ARRAYREF + Exceptions : none + Caller : General + Status : At Risk - rename fetch? + +=cut + +sub get_all_experiment_names{ + my ($self, $displayable) = @_; + + + my ($constraint); + + my $sql = "SELECT e.name FROM experiment e"; + $sql .= ", status s WHERE e.experiment_id =\"s.table_id\" AND s.table_name=\"experiment\" AND s.state=\"DISPLAYABLE\"" if($displayable); + + return $self->db->dbc->db_handle->selectcol_arrayref($sql); +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At risk + +=cut + +sub _tables { + my $self = shift; + + return ['experiment', 'e']; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw(e.experiment_id e.name e.experimental_group_id e.date e.primary_design_type e.description e.mage_xml_id); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Experiment objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $exp_id, $name, $group_id, $p_design_type, $date, $description, $xml_id); + + my $eg_adaptor = $self->db->get_ExperimentalGroupAdaptor(); + + $sth->bind_columns(\$exp_id, \$name, \$group_id, \$date, \$p_design_type, \$description, \$xml_id); + + + while ( $sth->fetch() ) { + + my $group = $eg_adaptor->fetch_by_dbID($group_id);#cache these in ExperimentalGroupAdaptor + + my $exp = Bio::EnsEMBL::Funcgen::Experiment->new + ( + -DBID => $exp_id, + -ADAPTOR => $self, + -NAME => $name, + -EXPERIMENTAL_GROUP => $group, + -DATE => $date, + -PRIMARY_DESIGN_TYPE => $p_design_type, + -DESCRIPTION => $description, + -MAGE_XML_ID => $xml_id, + ); + + push @result, $exp; + + } + return \@result; +} + + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::Experiment objects + Example : $oaa->store($exp1, $exp2, $exp3); + Description: Stores given Experiment objects in the database. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::Experiment objects + Exceptions : Throws is group not present in DB + Throws if object is not a Bio::EnsEMBL::Funcgen::Experiment + Throws if object is already present in the DB but has no dbID + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + my ($s_exp); + + my $sth = $self->prepare('INSERT INTO experiment + (name, experimental_group_id, date, primary_design_type, description, mage_xml_id) + VALUES (?, ?, ?, ?, ?, ?)'); + + foreach my $exp (@args) { + throw('Can only store Experiment objects') if ( ! $exp->isa('Bio::EnsEMBL::Funcgen::Experiment')); + + if (!( $exp->dbID() && $exp->adaptor() == $self )){ + + + my $exp_group = $exp->experimental_group; + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ExperimentalGroup', $exp_group); + + + $s_exp = $self->fetch_by_name($exp->name());#validate on group too! + throw("Experimental already exists in the database with dbID:".$s_exp->dbID(). + "\nTo reuse/update this Experimental you must retrieve it using the ExperimentalAdaptor". + "\nMaybe you want to use the -recover option?") if $s_exp; + + + $exp = $self->update_mage_xml_by_Experiment($exp) if(defined $exp->mage_xml()); + + $sth->bind_param(1, $exp->name(), SQL_VARCHAR); + $sth->bind_param(2, $exp->experimental_group()->dbID, SQL_INTEGER); + $sth->bind_param(3, $exp->date(), SQL_VARCHAR);#date? + $sth->bind_param(4, $exp->primary_design_type(), SQL_VARCHAR); + $sth->bind_param(5, $exp->description(), SQL_VARCHAR); + $sth->bind_param(6, $exp->mage_xml_id(), SQL_INTEGER); + + $sth->execute(); + $exp->dbID($sth->{'mysql_insertid'}); + $exp->adaptor($self); + + } + else{ + #assume we want to update the states + warn('You may want to use $exp->adaptor->store_states($exp)'); + $self->store_states($exp); + } + } + + return \@args; + } + +=head2 fetch_mage_xml_by_Experiment + + Args : Bio::EnsEMBL::Funcgen::Experiment + Example : my $xml = $exp_adaptor->fetch_mage_xml_by_Experiment($exp); + Description: Gets the MAGE XML for this experiment + Returntype : string + Exceptions : throws if arg is not a valid stored Experiment + Caller : general + Status : at Risk + +=cut + +sub fetch_mage_xml_by_Experiment{ + my ($self, $exp) = @_; + + if(!($exp and $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())){ + throw('You must provide a valid stored Bio::EnsEMBL::Funcgen::Experiment'); + } + + return if ! $exp->mage_xml_id(); + + my $sql = 'SELECT xml FROM mage_xml WHERE mage_xml_id='.$exp->mage_xml_id; + + return $self->db->dbc->db_handle->selectall_arrayref($sql)->[0]; +} + +=head2 fetch_mage_xml_by_experiment_name + + Args : + Example : my $xml = $exp_adaptor->fetch_mage_xml_by_Experiment($exp); + Description: Gets the MAGE XML for this experiment + Returntype : string + Exceptions : throws if no arg passed + Caller : general + Status : at Risk + +=cut + +sub fetch_mage_xml_by_experiment_name{ + my ($self, $exp_name) = @_; + + if(! defined $exp_name){ + throw('You must provide an Experiment name argument'); + } + + my $sql = 'SELECT mx.xml FROM mage_xml mx, experiment e WHERE e.name="'.$exp_name.'" and e.mage_xml_id=mx.mage_xml_id'; + + return $self->db->dbc->db_handle->selectall_arrayref($sql)->[0]; +} + + + +=head2 update_mage_xml_by_Experiment + + Args : Bio::EnsEMBL::Funcgen::Experiment + Example : my $xml = $exp_adaptor->fetch_mage_xml_by_Experiment($exp); + Description: Gets the MAGE XML for this experiment + Returntype : string + Exceptions : throws if arg is not a valid stored Experiment + Caller : general + Status : at Risk + +=cut + +sub update_mage_xml_by_Experiment{ + my ($self, $exp) = @_; + + + if(!($exp and $exp->isa('Bio::EnsEMBL::Funcgen::Experiment'))){ + throw('You must provide a valid Bio::EnsEMBL::Funcgen::Experiment'); + } + + if($exp->mage_xml_id()){ + #potentially calling dbID on a un-stored obj, implicit that it + warn('Overwriting mage_xml entry for Experiment: '.$exp->name); + my $sql = "UPDATE mage_xml set xml='".$exp->mage_xml()."'"; + $self->db->dbc->do($sql); + + }else{ + my $sql = "INSERT INTO mage_xml (xml) VALUES('".$exp->mage_xml()."')"; + #need to get a statement handle to retrieve insert id + my $sth = $self->prepare($sql); + $sth->execute(); + $exp->mage_xml_id($sth->{'mysql_insertid'}); + + $sql = "UPDATE experiment set mage_xml_id=".$exp->mage_xml_id()." where experiment_id =".$exp->dbID(); + $sth = $self->prepare($sql); + $sth->execute(); + } + + return $exp; +} + + +=head2 list_dbIDs + + Args : None + Example : my @exp_ids = @{$exp_a->list_dbIDs()}; + Description: Gets an array of internal IDs for all Experiment objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('experiment'); +} + + +### DEPRECATED METHODS ### + +sub fetch_experiment_filter_counts{ + deprecate('Please use FeatureSetAdaptor::fetch_feature_set_filter_counts') +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentalChipAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentalChipAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,455 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalChipAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalChipAdaptor - A database adaptor for fetching and +storing Funcgen ExperimentalChip objects. + +=head1 SYNOPSIS + +my $ec_a = $db->get_ExperimentalChipAdaptor(); + +my @ecs = @{$ec_a->fetch_all_by_Experiment($exp)}; + + +=head1 DESCRIPTION + +The ExperimentalChipAdaptor is a database adaptor for storing and retrieving +Funcgen ExperimentalChip objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalChipAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::ExperimentalChip; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); + + +#May need to our this? +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +=head2 fetch_all_by_experiment_dbID + + Arg [1] : int - dbID of Experiment + Example : my @ecs = @{$ec_a->fetch_all_by_experiment_dbID($ac_dbid); + Description: Does what it says on the tin + Returntype : Listref of Bio::EnsEMBL::Funcgen::ExperimentalChip + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub fetch_all_by_experiment_dbID { + my $self = shift; + my $e_dbid = shift; + + my ($ec_id, @results); + + throw("Must specify an experiemntal dbID") if(! $e_dbid); + + + my $sth = $self->prepare(" + SELECT ec.experimental_chip_id + FROM experimental_chip ec, experiment e + WHERE ec.experiment_id = $e_dbid + "); + + + + #can we do a generic fetch here? + + + $sth->execute(); + + + while ($ec_id = $sth->fetchrow()){ + #warn("got ec id $ec_id\n"); + push @results, $self->fetch_by_dbID($ec_id); + } + + return \@results; +} + +=head2 fetch_all_by_Experiment + + Arg [1] : Bio::EnsEMBL::Funcgen::Experiment + Example : my @ecs = @{$ec_a->fetch_all_by_Experiment($exp)}; + Description: Does what it says on the tin + Returntype : Listref of Bio::EnsEMBL::Funcgen::ExperimentalChips + Exceptions : throws if valid stored Experiment not passed + Caller : General + Status : at risk + +=cut + + +sub fetch_all_by_Experiment(){ + my ($self, $exp) = @_; + + if(! ($exp && $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())){ + throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::Experiment'); + } + + $self->generic_fetch("ec.experiment_id=".$exp->dbID()); +} + +=head2 fetch_all_by_ArrayChip + + Arg [1] : Bio::EnsEMBL::Funcgen::ArrayChip + Example : my @ecs = @{$ec_a->fetch_all_by_ArrayChip($achip)}; + Description: Retrieves all ExperimentChips which have the corresponding ArrayChip design + Returntype : Listref of Bio::EnsEMBL::Funcgen::ExperimentalChips + Exceptions : throws if valid stored ArrayChip not passed + Caller : General + Status : at risk + +=cut + + +sub fetch_all_by_ArrayChip{ + my ($self, $achip) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ArrayChip', $achip); + + $self->generic_fetch("ec.array_chip_id=".$achip->dbID()); +} + +=head2 fetch_by_unique_and_experiment_id + + Arg [2] : int - unique_id + Arg [1] : int - dbID of Experiment + Example : my $ec = ec_a->fetch_by_unique_and_experiment_id($c_uid, $exp_dbid); + Description: Does what it says on the tin + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalChip + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub fetch_by_unique_and_experiment_id { + my ($self, $c_uid, $e_dbid) = @_; + + + + + throw("Must provide and unique_id and and experiment_id") if(! $c_uid || ! $e_dbid); + + my $sth = $self->prepare(" + SELECT ec.experimental_chip_id + FROM experimental_chip ec + WHERE ec.unique_id ='$c_uid' + AND ec.experiment_id = $e_dbid + "); + + + $sth->execute(); + my ($ec_id) = $sth->fetchrow(); + + + return $self->fetch_by_dbID($ec_id) if $ec_id; +} + + +=head2 fetch_by_unique_id_vendor + + Arg [2] : string - unique_id + Arg [1] : string - name of array vendor e.g. NIMBLEGEN + Example : my $ec = $ec_a->fetch_by_unique_id_vendor($c_uid, $vendor); + Description: Fetches a unique ExperimentalChip for a given unique id and vendor + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalChip + Exceptions : None + Caller : General + Status : at Risk + +=cut + +sub fetch_by_unique_id_vendor { + my ($self, $c_uid, $vendor) = @_; + + throw("Must provide and unique_id and and vendor") if(! $c_uid || ! $vendor); + + my ($ec_id, $ac_id, @ecids, $avendor); + + + my $sth = $self->prepare(" + SELECT ec.experimental_chip_id, ec.array_chip_id + FROM experimental_chip ec + WHERE ec.unique_id ='$c_uid' + "); + + + $sth->execute(); + + while (($ec_id, $ac_id) = $sth->fetchrow()){ + + my $sql = "SELECT a.vendor from array a, array_chip ac where a.array_id=ac.array_id and ac.array_chip_id=$ac_id;"; + ($avendor) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; + push @ecids, $ec_id if (uc($avendor) eq uc($vendor)); + } + + #This check shouldn't be necessary if this control is illicited on import + #no unique key possible so just for safety until import fully tested. + if(scalar(@ecids) > 1){ + throw("Found more than one ExperimentalChip with the same unique_id($c_uid) for $vendor"); + + } + + return $self->fetch_by_dbID($ecids[0]) if $ecids[0]; +} + + + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _tables { + my $self = shift; + + return ['experimental_chip', 'ec']; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( ec.experimental_chip_id ec.unique_id + ec.experiment_id ec.array_chip_id + ec.feature_type_id ec.cell_type_id + ec.biological_replicate ec.technical_replicate); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $ec_id, $c_uid, $exp_id, $ac_id, $ftype_id, $ctype_id, $brep, $trep, $ftype, $ctype); + + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + + + $sth->bind_columns(\$ec_id, \$c_uid, \$exp_id, \$ac_id, \$ftype_id, \$ctype_id, \$brep, \$trep); + + while ( $sth->fetch() ) { + + $ftype = (defined $ftype_id) ? $ft_adaptor->fetch_by_dbID($ftype_id) : undef; + $ctype = (defined $ctype_id) ? $ct_adaptor->fetch_by_dbID($ctype_id) : undef; + + my $array = Bio::EnsEMBL::Funcgen::ExperimentalChip->new( + -dbID => $ec_id, + -unique_id => $c_uid, + -experiment_id => $exp_id, + -array_chip_id => $ac_id, + -feature_type => $ftype, + -cell_type => $ctype, + -biological_replicate => $brep, + -technical_replicate => $trep, + -adaptor => $self, + ); + + push @result, $array; + + } + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ExperimentalChip objects + Example : $oaa->store($ec1, $ec2, $ec3); + Description: Stores given ExperimentalChip objects in the database. Should only be + called once per array because no checks are made for duplicates. + Sets dbID and adaptor on the objects that it stores. + Returntype : ARRAYREF + Exceptions : Throws if passed non-ExperimentalChip arg or if ExperimentalChip already stored but arg has no dbID + Caller : General + Status : Medium Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + my ($sarray); + + my $sth = $self->prepare(" + INSERT INTO experimental_chip + (unique_id, experiment_id, array_chip_id, feature_type_id, + cell_type_id, biological_replicate, technical_replicate) + VALUES (?, ?, ?, ?, ?, ?, ?)"); + + foreach my $ec (@args) { + throw('Can only store ExperimentalChip objects') if ( ! $ec->isa('Bio::EnsEMBL::Funcgen::ExperimentalChip') ); + + if (!( $ec->dbID() && $ec->adaptor() == $self )){ + + #Some slight jiggery pokery here to get the vendor as the EC does not yet have an adaptor + #cache these? + my $vendor = $self->db->get_ArrayChipAdaptor->fetch_by_dbID($ec->array_chip_id)->get_Array->vendor(); + my $s_ec = $self->fetch_by_unique_id_vendor($ec->unique_id(), $vendor); + + throw("ExperimentalChip already exists in the database with dbID:".$s_ec->dbID(). + "\nTo reuse/update this ExperimentalChip you must retrieve it using the ExperimentalChipAdaptor". + "\nMaybe you want to use the -recover option?") if $s_ec; + + my $ftype_id = (defined $ec->feature_type()) ? $ec->feature_type->dbID() : undef; + my $ctype_id = (defined $ec->cell_type()) ? $ec->cell_type->dbID() : undef; + + $sth->bind_param(1, $ec->unique_id(), SQL_VARCHAR); + $sth->bind_param(2, $ec->experiment_id(), SQL_VARCHAR); + $sth->bind_param(3, $ec->array_chip_id(), SQL_VARCHAR); + $sth->bind_param(4, $ftype_id, SQL_INTEGER); + $sth->bind_param(5, $ctype_id, SQL_INTEGER); + $sth->bind_param(6, $ec->biological_replicate(), SQL_VARCHAR); + $sth->bind_param(7, $ec->technical_replicate(), SQL_VARCHAR); + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $ec->dbID($dbID); + $ec->adaptor($self); + + #} + #else{ + # $ec = $s_ec; + + #my @states = @{$self->db->fetch_all_states('experimental_chip', $ec->dbID())}; + # my @states = @{$self->db->get_StatusAdaptor->fetch_all_states($ec)}; + # warn("Using previously stored ExperimentalChip (".$ec->unique_id().") with states\t@states\n"); + # } + }else{ + #assume we want to update the states + warn('You may want to use $exp_chip->adaptor->store_states($exp_chip)'); + $self->store_states($ec); + } + } + + return \@args; +} + + + + +sub update_replicate_types{ + my ($self, $echip) = @_; + + if(! ($echip && $echip->isa('Bio::EnsEMBL::Funcgen::ExperimentalChip') && $echip->dbID())){ + throw('Must provide a valid store Bio::EnsEMBL::Funcgen::ExperimentalChip'); + } + + my $sql = 'UPDATE experimental_chip set biological_replicate="'.$echip->biological_replicate(). + '" where experimental_chip_id='.$echip->dbID(); + + $self->db->dbc->do($sql); + + $sql = 'UPDATE experimental_chip set technical_replicate="'.$echip->technical_replicate(). + '" where experimental_chip_id='.$echip->dbID(); + $self->db->dbc->do($sql); + + + if(defined $echip->cell_type()){ + $sql = 'UPDATE experimental_chip set cell_type_id="'.$echip->cell_type()->dbID. + '" where experimental_chip_id='.$echip->dbID(); + $self->db->dbc->do($sql); + } + + if(defined $echip->feature_type()){ + $sql = 'UPDATE experimental_chip set feature_type_id="'.$echip->feature_type()->dbID. + '" where experimental_chip_id='.$echip->dbID(); + $self->db->dbc->do($sql); + } + + + + return; +} + +=head2 list_dbIDs + + Args : None + Example : my @array_ids = @{$ec_a->list_dbIDs()}; + Description: Gets an array of internal IDs for all ExperimentalChip objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + return $self->_list_dbIDs('experimental_chip'); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentalGroupAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentalGroupAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,245 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalGroupAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalGroupAdaptor - A database adaptor for fetching and +storing Funcgen ExperimentalGroup objects. + +=head1 SYNOPSIS + +my $eg_adaptor = $db->get_ExperimentalGroupAdaptor(); + +my $group = $eg_adaptor->fetch_by_name("EBI"); + +=head1 DESCRIPTION + +The ExperimentalGroupAdaptor is a database adaptor for storing and retrieving +Funcgen ExperimentalGroup objects. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::ExperimentalGroup + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalGroupAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); +use Bio::EnsEMBL::Funcgen::ExperimentalGroup; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + + +=head2 fetch_by_name + + Arg [1] : string - name of ExperimentalGroup + Example : my $group = $eg_adaptor->fetch_by_name('EBI'); + Description: Fetches the group with the given name + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalGroup object + Exceptions : + Caller : General + Status : At risk + +=cut + +sub fetch_by_name{ + my ($self, $name) = @_; + + throw("Must specify a Group name") if(! $name); + + my $constraint = " name = ?"; + + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + + my @fts = @{$self->generic_fetch($constraint)}; + + return $fts[0]; +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return (['experimental_group', 'eg']); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( eg.experimental_group_id eg.name eg.location eg.contact eg.url eg.description eg.is_project); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Channel objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ExperimentalGroup objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $eg_id, $name, $location, $contact, $url, $desc, $is_project); + + $sth->bind_columns(\$eg_id, \$name, \$location, \$contact, \$url, \$desc, \$is_project); + + while ( $sth->fetch() ) { + my $group = Bio::EnsEMBL::Funcgen::ExperimentalGroup->new( + -dbID => $eg_id, + -NAME => $name, + -LOCATION => $location, + -CONTACT => $contact, + -URL => $url, + -DESCRIPTION => $desc, + -IS_PROJECT => $is_project, + -ADAPTOR => $self, + ); + + push @result, $group; + + } + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ExperimentalGroup objects + Example : $eg_adaptor->store($g1, $g2, $g3); + Description: Stores given ExperimentalGroup objects in the database. + Returntype : None + Exceptions : Throws if ExperimentalGroup not valid + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @args = @_; + + + my $sth = $self->prepare(" + INSERT INTO experimental_group + (name, location, contact, url, description, is_project) + VALUES (?, ?, ?, ?, ?, ?)"); + + + + foreach my $group (@args) { + + if ( ! (ref($group) && $group->isa('Bio::EnsEMBL::Funcgen::ExperimentalGroup') )) { + throw('Can only store ExperimentalGroup objects, skipping $group'); + } + + if (!( $group->dbID() && $group->adaptor() == $self )){ + + #Check for previously stored FeatureType + my $s_eg = $self->fetch_by_name($group->name()); + + if(! $s_eg){ + $sth->bind_param(1, $group->name(), SQL_VARCHAR); + $sth->bind_param(2, $group->location(), SQL_VARCHAR); + $sth->bind_param(3, $group->contact(), SQL_VARCHAR); + $sth->bind_param(4, $group->url(), SQL_VARCHAR); + $sth->bind_param(5, $group->description(), SQL_VARCHAR); + $sth->bind_param(6, $group->is_project(), SQL_BOOLEAN); + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $group->dbID($dbID); + $group->adaptor($self); + } + else{ + $group = $s_eg; + warn("Using previously stored ExperimentalGroup:\t".$group->name()."\n"); + } + } + } + + return \@args; +} + + +=head2 list_dbIDs + + Args : None + Example : my @group_ids = @{$eg_adaptor->list_dbIDs()}; + Description: Gets an array of internal IDs for all ExperimentalGroup objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : At risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('experimental_group'); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentalSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExperimentalSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,462 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::ExperimentalSetAdaptor +# +# You may distribute this module under the same terms as Perl itself + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::ExperimentalSetAdaptor - A database adaptor for fetching and +storing ExperimentalSet objects. + +=head1 SYNOPSIS + +my $rset_adaptor = $db->get_ExperimentalSetAdaptor(); + +my @rsets = @{$rset_adaptor->fetch_all_ExperimentalSets_by_Experiment()}; +my @displayable_rsets = @{$rset_adaptor->fetch_all_displayable_ExperimentalSets()}; + +#Other methods? +#by FeatureType, CellType all with displayable flag? + + +=head1 DESCRIPTION + +The ExperimentalSetAdaptor is a database adaptor for storing and retrieving +ExperimentalSet objects. + +=head1 AUTHOR + +This module was created by Nathan Johnson. + +This module is part of the Ensembl project: http://www.ensembl.org/ + +=head1 CONTACT + +Post comments or questions to the Ensembl development list: ensembl-dev@ebi.ac.uk + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ExperimentalSetAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::ExperimentalSet; +use Bio::EnsEMBL::Funcgen::ResultFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(mean median); + +use vars qw(@ISA); + + +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + + + +=head2 fetch_all_by_FeatureType + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType + Example : + Description: Retrieves a list of features on a given slice that are created + by probes from the specified type of array. + Returntype : Listref of Bio::EnsEMBL::OligoFeature objects + Exceptions : Throws if no array type is provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType { + my ($self, $ftype) = @_; + + if( !(ref($ftype) && $ftype->isa("Bio::EnsEMBL::Funcgen::FeatureType") && $ftype->dbID())){ + throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType"); + } + + my $constraint = "es.feature_type_id =".$ftype->dbID(); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_CellType + + Arg [1] : Bio::EnsEMBL::Funcgen::CellType + Example : + Description: + Returntype : Arrayref of Bio::EnsEMBL::Funcgen::ExperimentalSet objects + Exceptions : Throws if no CellType is provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_CellType { + my ($self, $ctype) = @_; + + if( !(ref($ctype) && $ctype->isa("Bio::EnsEMBL::Funcgen::CellType") && $ctype->dbID())){ + throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::CellType"); + } + + my $constraint = "es.cell_type_id =".$ctype->dbID(); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_Experiment + + Arg [1] : Bio::EnsEMBL::Funcgen::Experiment + Example : $exp_set = $eseta->fetch_by_Experiment($exp); + Description: Retrieves a ExperimentalSet based on the given Experiment + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalSet + Exceptions : Throws if no valid stored Experiment provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Experiment { + my ($self, $exp) = @_; + + if( ! ( ref($exp) && + $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && + $exp->dbID())){ + throw('Need to pass a valid stored Bio::EnsEMBL::Funcgen::Experiment'); + } + + return $self->generic_fetch('es.experiment_id = '.$exp->dbID()); +} + +=head2 fetch_by_name + + Arg [1] : string - ExperimentalSet name + Example : $exp_set = $eseta->fetch_by_Experiment('exp_set_1'); + Description: Retrieves a ExperimentalSet based on the ExperimetnalSet name + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalSet + Exceptions : Throws if no name provided + Caller : General + Status : At Risk + +=cut + +sub fetch_by_name { + my ($self, $name) = @_; + + throw('Need to pass a name argument') if( ! defined $name); + + return $self->generic_fetch("es.name ='${name}'")->[0]; +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + [ 'experimental_set', 'es' ], + [ 'experimental_subset', 'ess' ], + ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + es.experimental_set_id es.experiment_id + es.feature_type_id es.cell_type_id + es.format es.vendor + es.name ess.name + ess.experimental_subset_id + ); + + +} + +#=head2 _default_where_clause +# +# Args : None +# Example : None +# Description: PROTECTED implementation of superclass abstract method. +# Returns an additional table joining constraint to use for +# queries. +# Returntype : List of strings +# Exceptions : None +# Caller : Internal +# Status : At Risk +# +#=cut + +#sub _default_where_clause { +# my $self = shift; + +# return 'es.experimental_set_id = ess.experimental_set_id'; + +#} + +=head2 _left_join + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _left_join { + my $self = shift; + + return (['experimental_subset', 'es.experimental_set_id = ess.experimental_set_id']); +} + + + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Experiment objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my ($dbid, $exp_id, $ftype_id, $ctype_id, $format, $vendor, $name, $ess_name, $ess_id); + my ($eset, @esets, $ftype, $ctype); + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + $sth->bind_columns(\$dbid, \$exp_id, \$ftype_id, \$ctype_id, \$format, \$vendor, \$name, \$ess_name, \$ess_id); + + #this fails if we delete entries from the joined tables + #causes problems if we then try and store an rs which is already stored + + while ( $sth->fetch() ) { + + if(! $eset || ($eset->dbID() != $dbid)){ + + push @esets, $eset if $eset; + $ftype = (defined $ftype_id) ? $ft_adaptor->fetch_by_dbID($ftype_id) : undef; + $ctype = (defined $ctype_id) ? $ct_adaptor->fetch_by_dbID($ctype_id) : undef; + + $eset = Bio::EnsEMBL::Funcgen::ExperimentalSet->new( + -DBID => $dbid, + -EXPERIMENT => $exp_adaptor->fetch_by_dbID($exp_id), + -FORMAT => $format, + -VENDOR => $vendor, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -ADAPTOR => $self, + -NAME => $name, + ); + } + + #This assumes logical association between chip from the same exp, confer in store method????????????????? + + + #we're not controlling ctype and ftype during creating new ExperimentalSets to store. + #we should change add_table_id to add_ExperimentalChip and check in that method + if(defined $ess_name){ + + $eset->add_new_subset($ess_name, Bio::EnsEMBL::Funcgen::ExperimentalSubset->new( -name => $ess_name, + -dbID => $ess_id, + -adaptor => $self, + -experimental_set => $eset, + )); + + } + } + + push @esets, $eset if $eset; + + return \@esets; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ExperimentalSet objects + Example : $rsa->store(@esets); + Description: Stores or updates previously stored ExperimentalSet objects in the database. + Returntype : None + Exceptions : Throws if a List of ExperimentalSet objects is not provided or if + an analysis is not attached to any of the objects + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @exp_sets) = @_; + + throw("Must provide a list of ExperimentalSet objects") if(scalar(@exp_sets == 0)); + + + + my $sth = $self->prepare('INSERT INTO experimental_set (experiment_id, feature_type_id, + cell_type_id, format, vendor, name) + VALUES (?, ?, ?, ?, ?, ?)'); + + my $db = $self->db(); + + foreach my $set (@exp_sets) { + + if( ! ref $set || ! $set->isa('Bio::EnsEMBL::Funcgen::ExperimentalSet') ) { + throw('Must be an ExperimentalSet object to store'); + } + + + if ( $set->is_stored($db) ) { + throw('ExperimentalSet [' . $set->dbID() . '] is already stored in the database\nExperimentalSetAdaptor does not yet accomodate updating ExperimentalSets'); + #would need to retrive stored result set and update table_ids + } + + + my $ct_id = (defined $set->cell_type()) ? $set->cell_type->dbID() : undef; + my $ft_id = (defined $set->feature_type()) ? $set->feature_type->dbID() : undef; + + $sth->bind_param(1, $set->get_Experiment->dbID(), SQL_INTEGER); + $sth->bind_param(2, $ft_id, SQL_INTEGER); + $sth->bind_param(3, $ct_id, SQL_INTEGER); + $sth->bind_param(4, $set->format, SQL_VARCHAR); + $sth->bind_param(5, $set->vendor, SQL_VARCHAR); + $sth->bind_param(6, $set->name, SQL_VARCHAR); + + + $sth->execute(); + + $set->dbID( $sth->{'mysql_insertid'} ); + $set->adaptor($self); + + + $self->store_ExperimentalSubsets($set->get_subsets()) if @{$set->get_subsets()}; + } + + return \@exp_sets; +} + + +=head2 store_ExperimentalSubsets + + Args : Bio::EnsEMBL::Funcgen::ExperimentalSet + Example : $esa->store_ExperimentalSubsets(\@e_subsets); + Description: Convenience methods extracted from store to allow updating of ExperimentalSubset entries + during inline result processing which would otherwise be troublesome due to the need + for an ExperimentalSet + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalSet + Exceptions : Throws if a stored ExperimentalSet object is not provided + Throws if no ExperimentalSubsets present + Caller : General + Status : At Risk + +=cut + + +sub store_ExperimentalSubsets{ + my ($self, $ssets) = @_; + + my $sth = $self->prepare(" + INSERT INTO experimental_subset ( + experimental_set_id, name + ) VALUES (?, ?) + "); + + throw('Must provide at least one ExperimentalSubset') if(! @$ssets); + + #Store and set all previously unstored table_ids + foreach my $sset(@$ssets){ + + #use is_stored here? + if($sset->dbID()){ + warn "Skipping ExperimentalSubset ".$sset->name()." - already stored in the DB"; + next; + } + + + $sth->bind_param(1, $sset->experimental_set->dbID(), SQL_INTEGER); + $sth->bind_param(2, $sset->name(), SQL_VARCHAR); + $sth->execute(); + + $sset->dbID($sth->{'mysql_insertid'}); + $sset->adaptor($self); + + + #No need to set it as we're working on the hasref here, so should be updated in the class. + #add directly to avoid name clash warnings + #$exp_set->{'subsets'}{$sub_set_name} = Bio::EnsEMBL::Funcgen::ExperimentalSubset->new + # ( + # -dbID => $sth->{'mysql_insertid'}, + # -name => $sub_set_name, + # -adaptor => $self, + # #-experimental_set_id? + # ); + + } + + #don't really need to return as we're passing the ref + return $ssets; +} + +=head2 list_dbIDs + + Args : None + Example : my @sets_ids = @{$esa->list_dbIDs()}; + Description: Gets an array of internal IDs for all ExperimentalSet objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : general + Status : stable + +=cut + +sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs('result_set'); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExternalFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ExternalFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,354 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::ExternalFeatureAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::ExternalFeatureAdaptor - A database adaptor for fetching and +storing ExternalFeature objects. + +=head1 SYNOPSIS + +my $afa = $db->get_ExternalFeatureAdaptor(); + +my $features = $afa->fetch_all_by_Slice($slice); + + +=head1 DESCRIPTION + +The ExternalFeatureAdaptor is a database adaptor for storing and retrieving +ExternalFeature objects. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ExternalFeatureAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::ExternalFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor); + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + [ 'external_feature', 'ef' ], + [ 'feature_set', 'fs' ],#this is required for fetching on analysis, external_db (cell_type or feature_type). + ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + ef.external_feature_id ef.seq_region_id + ef.seq_region_start ef.seq_region_end + ef.seq_region_strand ef.display_label + ef.feature_type_id ef.feature_set_id + ef.interdb_stable_id + ); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates ExternalFeature objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::ExternalFeature objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + #For EFG this has to use a dest_slice from core/dnaDB whether specified or not. + #So if it not defined then we need to generate one derived from the species_name and schema_build of the feature we're retrieving. + + + my ($sa, @features, $seq_region_id, %fset_hash, %slice_hash, %sr_name_hash, %sr_cs_hash, %ftype_hash); + $sa = $dest_slice->adaptor->db->get_SliceAdaptor() if($dest_slice);#don't really need this if we're using DNADBSliceAdaptor? + $sa ||= $self->db->get_SliceAdaptor(); + + my $fset_adaptor = $self->db->get_FeatureSetAdaptor(); + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor(); + my ( + $external_feature_id, $efg_seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $fset_id, + $display_label, $ftype_id, + $interdb_stable_id + ); + + $sth->bind_columns( + \$external_feature_id, \$efg_seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$display_label, + \$ftype_id, \$fset_id, + \$interdb_stable_id + ); + + + #abstract to BaseFeatureAdaptor? + my ($asm_cs, $cmp_cs, $asm_cs_name, $asm_cs_vers, $cmp_cs_name, $cmp_cs_vers); + + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my ($dest_slice_start, $dest_slice_end, $dest_slice_strand, $dest_slice_length, $dest_slice_sr_name); + + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + } + + + FEATURE: while ( $sth->fetch() ) { + #Need to build a slice adaptor cache here? + #Would only ever want to do this if we enable mapping between assemblies?? + #Or if we supported the mapping between cs systems for a given schema_build, + #which would have to be handled by the core api + + #get core seq_region_id + $seq_region_id = $self->get_core_seq_region_id($efg_seq_region_id); + + if(! $seq_region_id){ + warn "Cannot get slice for eFG seq_region_id $efg_seq_region_id\n". + "The region you are using is not present in the current dna DB"; + next; + } + + + #Get the FeatureSet/Type objects + $fset_hash{$fset_id} = $fset_adaptor->fetch_by_dbID($fset_id) if(! exists $fset_hash{$fset_id}); + $ftype_hash{$ftype_id} = $ftype_adaptor->fetch_by_dbID($ftype_id) if(! exists $ftype_hash{$ftype_id}); + + # Get the slice object + my $slice = $slice_hash{'ID:'.$seq_region_id}; + + if (! $slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{'ID:'.$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + #abstract to BaseFeatureAdaptor? + # Remap the feature coordinates to another coord system if a mapper was provided + if ($mapper) { + + throw("Not yet implmented mapper, check equals are Funcgen calls too!"); + + ($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand) + = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); + + # Skip features that map to gaps or coord system boundaries + next FEATURE if !defined $sr_name; + + # Get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} + ||= $sa->fetch_by_region($cmp_cs_name, $sr_name, undef, undef, undef, $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} + ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # If a destination slice was provided convert the coords + # If the destination slice starts at 1 and is forward strand, nothing needs doing + if ($dest_slice) { + unless ($dest_slice_start == 1 && $dest_slice_strand == 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + # Throw away features off the end of the requested slice + next FEATURE if $seq_region_end < 1 || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_name ne $sr_name ); + + $slice = $dest_slice; + } + + push @features, Bio::EnsEMBL::Funcgen::ExternalFeature->new_fast + ({ + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'analysis' => $fset_hash{$fset_id}->analysis(), + 'adaptor' => $self, + 'dbID' => $external_feature_id, + 'display_label' => $display_label, + 'set' => $fset_hash{$fset_id}, + 'feature_type' => $ftype_hash{$ftype_id}, + 'interdb_stable_id', => $interdb_stable_id, + }); + } + + return \@features; +} + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ExternalFeature objects + Example : $ofa->store(@features); + Description: Stores given ExternalFeature objects in the database. Should only + be called once per feature because no checks are made for + duplicates. Sets dbID and adaptor on the objects that it stores. + Returntype : Arrayref of stored ExternalFeatures + Exceptions : Throws if a list of ExternalFeature objects is not provided or if + the Analysis, CellType and FeatureType objects are not attached or stored + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @efs) = @_; + + if (scalar(@efs) == 0) { + throw('Must call store with a list of ExternalFeature objects'); + } + + my $sth = $self->prepare(" + INSERT INTO external_feature ( + seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + display_label, feature_type_id, + feature_set_id + ) VALUES (?, ?, ?, ?, ?, ?, ?) + "); + + my $db = $self->db(); + + FEATURE: foreach my $ef (@efs) { + + if(! (ref($ef) && $ef->isa('Bio::EnsEMBL::Funcgen::ExternalFeature') )) { + throw('Feature must be an ExternalFeature object'); + } + + if ( $ef->is_stored($db) ) { + #does not accomodate adding Feature to >1 feature_set + warning('ExternalFeature [' . $ef->dbID() . '] is already stored in the database'); + next FEATURE; + } + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $ef->feature_set); + + my $seq_region_id; + ($ef, $seq_region_id) = $self->_pre_store($ef); + + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $ef->start(), SQL_INTEGER); + $sth->bind_param(3, $ef->end(), SQL_INTEGER); + $sth->bind_param(4, $ef->strand(), SQL_TINYINT); + $sth->bind_param(5, $ef->display_label(), SQL_VARCHAR); + $sth->bind_param(6, $ef->feature_type->dbID(), SQL_INTEGER); + $sth->bind_param(7, $ef->feature_set->dbID(), SQL_INTEGER); + + $sth->execute(); + $ef->dbID( $sth->{'mysql_insertid'} ); + $ef->adaptor($self); + $self->store_associated_feature_types($ef) if (defined $ef->{'associated_feature_types'}); + } + + return \@efs; +} + +=head2 fetch_by_interdb_stable_id + + Arg [1] : Integer $stable_id - The 'interdb stable id' of the ExternalFeature to retrieve + Example : my $rf = $rf_adaptor->fetch_by_interdb_stable_id(1); + Description: Retrieves a ExternalFeature via its interdb_stable id. This is really an internal + method to facilitate inter DB linking. + Returntype : Bio::EnsEMBL::Funcgen::ExternalFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_interdb_stable_id { + my ($self, $stable_id) = @_; + + $self->bind_param_generic_fetch($stable_id, SQL_INTEGER); + + return $self->generic_fetch('ef.interdb_stable_id=?')->[0]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/FeatureSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/FeatureSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,905 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::FeatureSetAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::FeatureSetAdaptor - A database adaptor for fetching and +storing Funcgen feature sets. + +=head1 SYNOPSIS + +my $fs_adaptor = $db->get_FeatureSetAdaptor(); + + +my @displayable_fsets = @{$fs_adaptor->fetch_all_displayable()}; + +=head1 DESCRIPTION + +The FeatureSetAdaptor is a database adaptor for storing and retrieving +Funcgen feature set. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::FeatureSetAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); +#use base does not import %true_tables or %tables or in fact %sql_types +#so cannot use base for any of the adaptors + +#Exported from BaseAdaptor +#$true_tables{feature_set} = [ [ 'feature_set', 'fs' ] ]; +#This derefs as we push onto tables +#and we don't want to alter true table +#@{$tables{feature_set}} = @{$true_tables{feature_set}}; + + +#Look into using ReadOnly::array? +#This will provide true read only, but will need to be our'd and export/imported +#use in conjuction? +#use Readonly; +#Readonly::Array my @true_tables => ([ 'feature_set', 'fs' ]); +#use constant TRUE_TABLES => [ @true_tables ]; +#does not work + +use constant TRUE_TABLES => [[ 'feature_set', 'fs' ]]; +use constant TABLES => [[ 'feature_set', 'fs' ]]; + +#Currently these need to be listrefs [[], ...] and push directly onto TABLE rather than _tables + + + + +#@{$tables{feature_set}} = (TRUE_TABLES); +#use constant TABLES => (TRUE_TABLES); #this does not deref, hence pushes affect TRUE_TABLES! + + +#use constant here still allows the contents of the ref to be modified +#Simply prevents need for import/export + +#Now change %tables to an attribute! +#Can't set as an attribute here, would have to be done in new or _tables + + +#Had to use hashes to prevent different adaptors resetting package level global vars +#TRUE_TABLES does not require this. + + +#No need for true_final_clause + + +=head2 fetch_all + + Arg [1] : optional HASHREF - Parameter hash containing contraints config lists e.g. + {'constraints' => + { + cell_types => [$cell_type, ...], # Bio::EnsEMBL::Funcgen::CellType + feature_types => [$ftype, ...], # Bio::EnsEMBL::Funcgen::FeatureType + evidence_types => [$evidence_type, ...], # String e.g. 'DNase1 & TFBS' or 'Hists & Pols' + status => $status_name, # String e.g. IMPORTED + analyses => [$analysis, ...] # Bio::EnsEMBL::Analysis + projects => [$proj_name, ...] # String (experimental_group.name is_project=1) e.g. ENCODE + } + } + Example : + Description: Retrieves a list of FeatureSets. Optional parameters hash allows for flexible query terms. + Returntype : ARRAYREF of Bio::EnsEMBL::FeatureSet objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all{ + my ($self, $params_hash) = @_; + + my $results = $self->generic_fetch($self->compose_constraint_query($params_hash)); + #@{$tables{feature_set}} = @{$true_tables{feature_set}}; #in case we have added tables e.g. status + $self->reset_true_tables; + + return $results; +} + + + +=head2 fetch_all_by_FeatureType + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my @fsets = $fs_adaptopr->fetch_all_by_FeatureType($type); + Description: Retrieves FeatureSet objects from the database based on feature_type id. + Returntype : Listref of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if arg is not a valid FeatureType + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType { + my ($self, $ftype, $status) = @_; + + my $params = {constraints => {feature_types => [$ftype]}}; + $params->{constraints}{state} = $status if $status; + #No need to reset tables for these + return $self->generic_fetch($self->compose_constraint_query($params)); +} + + +=head2 fetch_all_by_type + + Arg [1] : String - Type of feature set i.e. 'annotated', 'regulatory', 'segmentation' or 'external' + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my @fsets = $fs_adaptopr->fetch_all_by_type('annotated'); + Description: Retrieves FeatureSet objects from the database based on feature_set type. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if type not defined + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_type { + my $self = shift; + my $type = shift; + my $status = shift; + + #deprecate this? + + return $self->fetch_all_by_feature_class($type, $status); +} + + +=head2 fetch_all_by_feature_class + + Arg [1] : String - feature class i.e. 'annotated', 'regulatory', 'segmentation' or 'external' + Arg [2] : String (optional) - status e.g. 'DISPLAYABLE' + Arg [2] : Bio::EnsEMBL::Funcgen::CellType (optional) or a HASH parameters + containing contraint config e.g. + + $feature_set_adaptor->fetch_all_displayable_by_type + ('annotated', + {'constraints' => + { + cell_types => [$cell_type], #Bio::EnsEMBL::Funcgen::CellType + projects => ['ENCODE'], + evidence_types => ['Hists & Pols'], + feature_types => [$ftype], #Bio::EnsEMBL::Funcgen::FeatureType + } + }); + + Example : my @fsets = $fs_adaptopr->fetch_all_by_feature_class('annotated'); + Description: Retrieves FeatureSet objects from the database based on feature_set type. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if type not defined + Caller : General + Status : At Risk - Move status to params hash + +=cut + +sub fetch_all_by_feature_class { + my ($self, $type, $status, $params) = @_; + + throw('Must provide a feature_set type') if(! defined $type); + my $sql = "fs.type = '".$type."'"; + + if(defined $params){ #Some redundancy over $ctype arg and $params cell_type + + if( ref($params) eq 'Bio::EnsEMBL::Funcgen::CellType'){ + $params = {constraints => {cell_types => [$params]}}; + } + elsif(ref($params) ne 'HASH'){ + throw('Argument must be a Bio::EnsEMBL::Funcgen::CellType or a params HASH'); + } + } + + + if($status){ + $params->{constraints}{status} = $status; + } + + + #Deal with params constraints + my $constraint = $self->compose_constraint_query($params); + $sql .= " AND $constraint " if $constraint; + + + #Get result and reset true tables + my $result = (defined $sql) ? $self->generic_fetch($sql) : []; + #@{$tables{feature_set}} = @{$true_tables{feature_set}}; + $self->reset_true_tables; + + + + + return $result; +} + + + + + +=head2 fetch_all_displayable_by_type + + Arg [1] : String - Type of feature set i.e. 'annotated', 'regulatory' or 'supporting' + Arg [2] : Bio::EnsEMBL::Funcgen::CellType (optional) or parameters HASH + Example : my @fsets = $fs_adaptopr->fetch_all_by_type('annotated'); + Description: Wrapper method for fetch_all_by_type + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all_displayable_by_type { + my ($self, $type, $ctype_or_params) = @_; + + #Move status to config hash + $self->fetch_all_by_feature_class($type, 'DISPLAYABLE', $ctype_or_params); + +} + + +=head2 fetch_all_by_CellType + + Arg [1] : Bio::EnsEMBL::Funcgen::CellType + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my @fsets = $fs_adaptopr->fetch_all_by_CellType($ctype); + Description: Retrieves FeatureSet objects from the database based on the CellType. + Returntype : Listref of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if arg is not a valid CellType + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_CellType { + my ($self, $ctype, $status) = @_; + + my $params = {constraints => {cell_types => [$ctype]}}; + $params->{constraints}{status} = $status if ($status); + my $results = $self->generic_fetch($self->compose_constraint_query($params)); + #@{$tables{feature_set}} = @{$true_tables{feature_set}}; #in case we have added status + $self->reset_true_tables; + return $results; +} + + + +=head2 fetch_all_by_FeatureType_Analysis + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType + Arg [2] : Bio::EnsEMBL::Analysis + Arg [3] : (optional) Bio::EnsEMBL::Funcgen::CellType + Example : my @fsets = $fs_adaptopr->fetch_all_by_FeatureType_Analysis($ftype, $anal, $ctype); + Description: Retrieves FeatureSet objects from the database based on FeatureType, Analysis and + CellType if defined. + Returntype : Listref of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if args 1 and 2 are not valid or stored + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType_Analysis { + my ($self, $ftype, $anal, $ctype) = @_; + + my $params = {constraints => + { + feature_types => [$ftype], + analyses => [$anal], + } + }; + + $params->{constraints}{cell_types} = [$ctype] if $ctype; + return $self->generic_fetch($self->compose_constraint_query($params)); + +} + +=head2 fetch_by_name + + Arg [1] : string - name of FeatureSet + Arg [2] : (optional) string - status e.g. 'DISPLAYABLE' + Example : my @fsets = @{$fset_adaptor->fetch_by_name('feature_set-1')}; + Description: Fetch all FeatureSets wit a given name + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : Throws if no name passed + Caller : General + Status : At Risk + +=cut + +sub fetch_by_name { + my ($self, $name, $status) = @_; + + throw("Must provide a name argument") if (! defined $name); + + my $sql = "fs.name='".$name."'"; + + if($status){ + my $constraint = $self->status_to_constraint($status) if $status; + $sql = (defined $constraint) ? $sql." ".$constraint : undef; + } + + return (defined $sql) ? $self->generic_fetch($sql)->[0] : []; + +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _tables { + my $self = shift; + + #return @{$tables{feature_set}}; + return ( @{$self->TABLES} ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( fs.feature_set_id fs.feature_type_id + fs.analysis_id fs.cell_type_id + fs.name fs.type + fs.description fs.display_label + fs.input_set_id); +} + + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates OligoArray objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::FeatureSet objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@fsets, $fset, $analysis, %analysis_hash, $feature_type, $cell_type, $name, $type, $display_label, $desc); + my ($feature_set_id, $ftype_id, $analysis_id, $ctype_id, $input_set_id, %ftype_hash, %ctype_hash); + + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $anal_adaptor = $self->db->get_AnalysisAdaptor(); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + $ctype_hash{'NULL'} = undef; + + $sth->bind_columns(\$feature_set_id, \$ftype_id, \$analysis_id, \$ctype_id, + \$name, \$type, \$desc, \$display_label, \$input_set_id); + + while ( $sth->fetch()) { + + $ctype_id ||= 'NULL'; + + # Get the analysis object + $analysis_hash{$analysis_id} = $anal_adaptor->fetch_by_dbID($analysis_id) if(! exists $analysis_hash{$analysis_id}); + + # Get the feature type object + $ftype_hash{$ftype_id} = $ft_adaptor->fetch_by_dbID($ftype_id) if(! exists $ftype_hash{$ftype_id}); + + # Get the cell_type object + $ctype_hash{$ctype_id} = $ct_adaptor->fetch_by_dbID($ctype_id) if(! exists $ctype_hash{$ctype_id}); + + #Use new_fast here and strip the prefixed -'s + $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new + ( + -dbID => $feature_set_id, + -adaptor => $self, + -feature_type => $ftype_hash{$ftype_id}, + -analysis => $analysis_hash{$analysis_id}, + -cell_type => $ctype_hash{$ctype_id}, + -name => $name, + -feature_class => $type, + -display_label => $display_label, + -description => $desc, + -input_set_id => $input_set_id, + ); + + push @fsets, $fset; + + } + + return \@fsets; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::FeatureSet objects + Example : $oaa->store($fset1, $fset2, $fset3); + Description: Stores FeatureSet objects in the database. + Returntype : Listref of stored FeatureSet objects + Exceptions : Throws if FeatureSet does not have a stored FeatureType + Throws if invalid FeatureSet passed + Throws if not FeatureSets passed + Warns if external_db_name not defined is type is external + Throws if external_db is not present in the db + Caller : General + Status : At Risk + +=cut + +sub store { + my $self = shift; + my @fsets = @_; + + throw('Must supply a list of FeatureSets to store') if(scalar(@fsets) == 0); + + my $sth = $self->prepare + ( + "INSERT INTO feature_set + (feature_type_id, analysis_id, cell_type_id, name, type, description, display_label, input_set_id) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + ); + + my $db = $self->db; + my ($sql, $edb_id, %edb_hash); + + foreach my $fset (@fsets) { + throw('Can only store FeatureSet objects, skipping $fset') if ( ! $fset->isa('Bio::EnsEMBL::Funcgen::FeatureSet')); + + + if (! $fset->is_stored($db) ) { + + # Check FeatureType and Analysis + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $fset->feature_type); + $self->db->is_stored_and_valid('Bio::EnsEMBL::Analysis', $fset->analysis); + + + # Check optional InputSet and CellType + my $ctype_id; + my $ctype = $fset->cell_type; + + if($ctype){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::CellType', $ctype); + $ctype_id = $ctype->dbID; + } + + my $input_set_id; + my $input_set = $fset->get_InputSet; + + if($input_set){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::InputSet', $input_set); + $input_set_id = $input_set->dbID; + } + + + $sth->bind_param(1, $fset->feature_type->dbID, SQL_INTEGER); + $sth->bind_param(2, $fset->analysis->dbID, SQL_INTEGER); + $sth->bind_param(3, $ctype_id, SQL_INTEGER); + $sth->bind_param(4, $fset->name, SQL_VARCHAR); + $sth->bind_param(5, $fset->feature_class, SQL_VARCHAR); + $sth->bind_param(6, $fset->description, SQL_VARCHAR); + $sth->bind_param(7, $fset->display_label, SQL_VARCHAR); + $sth->bind_param(8, $input_set_id, SQL_INTEGER); + + $sth->execute(); + $fset->dbID($sth->{'mysql_insertid'}); + $fset->adaptor($self); + } + else{ + warn('FeatureSet '.$fset->name.'is already stored, updating status entries'); + $self->store_states($fset); + } + } + return \@fsets; +} + +=head2 list_dbIDs + + Args : None + Example : my @array_ids = @{$oaa->list_dbIDs()}; + Description: Gets an array of internal IDs for all OligoArray objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('feature_set'); +} + + +=head2 fetch_focus_set_config_by_FeatureSet + + Args : Bio::EnsEMBL::Funcgen::FeatureSet + Example : $self->{'focus_set'} = $self->adaptor->fetch_focus_set_config_by_FeatureSet($self); + Description: Caches and returns focus set config for a given FeatureSet + Returntype : Boolean + Exceptions : Warns if meta entry not present + Caller : Bio::EnsEMBL::Funcgen::FeatureSet::is_focus_set + Status : At Risk + +=cut + +sub fetch_focus_set_config_by_FeatureSet{ + my ($self, $fset) = @_; + + $self->{focus_set_config} ||= {}; + + if(! defined $self->{focus_set_config}->{$fset->dbID}){ + + #Is is an attribute set? + if($self->fetch_attribute_set_config_by_FeatureSet($fset)){ + + #Need to define these as RegBuild config + if( ($fset->feature_type->class eq 'Transcription Factor') || + ($fset->feature_type->class eq 'Open Chromatin') ){ + $self->{focus_set_config}->{$fset->dbID} = 1; + } + } + } + + return $self->{focus_set_config}->{$fset->dbID}; +} + + +=head2 fetch_attribute_set_config_by_FeatureSet + + Args : Bio::EnsEMBL::Funcgen::FeatureSet + Example : $self->{'attribute_set'} = $self->adaptor->fetch_attribute_set_config_by_FeatureSet($self); + Description: Caches and returns attribute set config for a given FeatureSet + Returntype : Boolean + Exceptions : Warns if meta entry not present + Caller : Bio::EnsEMBL::Funcgen::FeatureSet::is_attribute_set + Status : At Risk + +=cut + +sub fetch_attribute_set_config_by_FeatureSet{ + my ($self, $fset) = @_; + + $self->{attribute_set_config} ||= {}; + + if (! defined $self->{attribute_set_config}->{$fset->dbID}) { + $self->{attribute_set_config}->{$fset->dbID} = 0; #set cache default + my $string_key = 'regbuild.'.$fset->cell_type->name.'.feature_set_ids'; + + #list_value_by_key caches, so we don't need to implement this in the adaptor + #my ($attr_ids) = @{$self->db->get_MetaContainer->list_value_by_key($meta_key)}; + + my $species_id = $self->db->species_id; + my ($attr_ids) = $self->db->dbc->db_handle->selectrow_array("SELECT string from regbuild_string where name='${string_key}' and species_id=$species_id"); + + if (! defined $attr_ids) { + warn("Cannot detect attribute set as regbuild_string table does not contain $string_key"); + } + else { + + foreach my $aid (split/,\s*/, $attr_ids) { + $self->{attribute_set_config}->{$aid} = 1; + } + } + } + + return $self->{attribute_set_config}->{$fset->dbID}; +} + + + + +sub fetch_feature_set_filter_counts{ + my $self = shift; + + my $sql = 'SELECT count(*), eg.name, eg.description, eg.is_project, ft.class, ct.name, ct.description '. + 'FROM experimental_group eg, experiment e, feature_set fs, feature_type ft, cell_type ct, '. + 'status s, status_name sn, input_set inp '. + 'WHERE fs.input_set_id=inp.input_set_id and inp.experiment_id=e.experiment_id '. + 'AND e.experimental_group_id=eg.experimental_group_id '. + 'AND fs.feature_type_id=ft.feature_type_id AND fs.cell_type_id=ct.cell_type_id '. + 'AND fs.feature_set_id=s.table_id AND fs.type="annotated" AND s.table_name="feature_set" '. + 'AND s.status_name_id=sn.status_name_id and sn.name="DISPLAYABLE" '. + 'GROUP BY eg.name, eg.is_project, ft.class, ct.name'; + + #warn $sql; + #Need to write HC around this as we sometimes get less than expect. + + + my @rows = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + my $ftype_info = $self->db->get_FeatureTypeAdaptor->get_regulatory_evidence_info; + + my %filter_info = ( + #Project=> {}, + #'Cell/Tissue' => {}, + All => + { All =>{ count => 0, + description => 'All experiments', + } + } + + ); + + foreach my $row(@rows){ + + my ($count, $project, $proj_desc, $is_proj, $ft_class, $ct_name, $ct_desc) = @$row; + + #All counts + $filter_info{All}{All}{count} += $count; + + #Project counts + if($is_proj){ + + if(! exists $filter_info{Project}{$project}){ + $filter_info{Project}{$project} = + { count => 0, + description => $proj_desc, + }; + } + + $filter_info{Project}{$project}{count} += $count; + } + + #Cell/Tissue counts + if(! exists $filter_info{'Cell/Tissue'}{$ct_name}){ + $filter_info{'Cell/Tissue'}{$ct_name} = + { count => 0, + description => $ct_desc, + }; + } + $filter_info{'Cell/Tissue'}{$ct_name}{count} += $count; + + #Evidence class counts + #Do we want to split this into ft.class + #i.e. split 'DNase1 & TFBS' + my $ft_class_label = $ftype_info->{$ft_class}{label}; + + if(! exists $filter_info{'Evidence type'}{$ft_class_label}){ + $filter_info{'Evidence type'}{$ft_class_label} = + { count => 0, + description => $ftype_info->{$ft_class}{long_name}, + }; + } + $filter_info{'Evidence type'}{$ft_class_label}{count} += $count; + } + + return \%filter_info; + + #Do we need to add an 'in_build' filter /data field? + +} + + + + +#Need to bind param these as they come from URL parameters and are not tested + +#Could move a lot of these to the BaseAdaptor if we have a valid_constraints config available +#and we use generic main_table approach for building constraint + +#and reuse between adaptors if we use the _tables method to get the table syn +#This may mean contraints can be specified for classes which do not contain +#the relevant fields. +#Allow this flexiblity or validate fields/constraint? +#Or implicit by location of contraint config, i.e. put it in the relevant +#parent adaptors + + +#All these _constrain methods must return a valid constraint string, and a hashref of any other constraint config + +sub _constrain_projects{ + my ($self, $egs) = @_; + + #enable query extension + my $constraint_conf = {tables => [['input_set', 'inp'], ['experiment', 'e']]}; + + + if ( (ref($egs) ne 'ARRAY') || + scalar(@$egs) == 0 ) { + throw('Must pass an arrayref of project names'); + } + my @eg_ids; + + foreach my $eg (@$egs) { + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ExperimentalGroup', $eg); + + if (! $eg->is_project) { + throw("You have passed an ExperimentalGroup which is not a project:\t".$eg->name); + } + + push @eg_ids, $eg->dbID; + } + + #Don't need to bind param this as we validate above + my $constraint = ' fs.input_set_id=inp.input_set_id and inp.experiment_id=e.experiment_id AND '. + 'e.experimental_group_id IN ('.join(', ', @eg_ids).')'; + + return ($constraint, $constraint_conf); +} + + +sub _constrain_evidence_types { + my ($self, $etypes) = @_; + + my $constraint_conf = {tables => [['feature_type', 'ft']]}; + + my %in_values = + ( + 'DNase1 & TFBS' => ['Open Chromatin', + 'Transcription Factor', + 'Transcription Factor Complex'], + + 'Hists & Pols' => ['Histone', + 'Polymerase'], + ); + + my @ft_classes; + + if ( (ref($etypes) ne 'ARRAY') || + scalar(@$etypes) == 0 ) { + throw('Must pass an arrayref of evidence types'); + } + + foreach my $etype (@$etypes) { + + if (! exists $in_values{$etype}) { + throw("You have passed an invalid evidence type filter argument($etype)\n". + "Please use one of the following:\t".join(' ,', keys(%in_values))); + } + push @ft_classes, @{$in_values{$etype}}; + } + + #Don't need to bind param this as we validate above + my $constraint = ' fs.feature_type_id=ft.feature_type_id AND ft.class IN ("'. + join('", "', @ft_classes).'")'; + + return ($constraint, $constraint_conf); +} + + + +sub _constrain_cell_types { + my ($self, $cts) = @_; + + my $constraint = ' fs.cell_type_id IN ('. + join(', ', @{$self->db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::CellType', $cts, 'dbID')} + ).')'; + + #{} = no futher contraint config + return ($constraint, {}); +} + + +#Generic, move to BaseAdaptor? + +sub _constrain_status { + my ($self, $state) = @_; + + my $constraint_conf = { tables => [['status', 's']]}; #,['status_name', 'sn']), + + #This can't use IN without duplicating the result + #Would need to add a default_final_clause to group + + #my @sn_ids; + #if( (ref($states) ne 'ARRAY') || + #scalar(@$states) == 0 ){ + #throw('Must pass an arrayref of status_names'); + #} + #foreach my $sn(@$states){ + ##This will throw if status not valid, but still may be absent + # push @sn_ids, $self->_get_status_name_id($sn); + # } + + + + my @tables = $self->_tables; + my ($table_name, $syn) = @{$tables[0]}; + + my $constraint = " $syn.${table_name}_id=s.table_id AND ". + "s.table_name='$table_name' AND s.status_name_id=".$self->_get_status_name_id($state); + #"s.table_name='$table_name' AND s.status_name_id IN (".join(', ', @sn_ids.')'; + + return ($constraint, $constraint_conf); + } + + +sub _constrain_feature_types { + my ($self, $fts) = @_; + + + my @tables = $self->_tables; + my (undef, $syn) = @{$tables[0]}; + + #Don't need to bind param this as we validate + my $constraint = " ${syn}.feature_type_id IN (". + join(', ', @{$self->db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $fts, 'dbID')}).')'; + + #{} = not futher constraint conf + return ($constraint, {}); +} + + + + + +sub _constrain_analyses { + my ($self, $anals) = @_; + + #Don't need to bind param this as we validate + my $constraint = ' fs.analysis_id IN ('. + join(', ', @{$self->db->are_stored_and_valid('Bio::EnsEMBL::Analysis', $anals, 'dbID')}).')'; + + #{} = not futher constraint conf + return ($constraint, {}); +} + + # add other fetch args + #type + #name + + + + + + +1; + +__END__ + +#Methods to add? + +#fetch_all_by_InputSet - would require input_set_id index diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/FeatureTypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/FeatureTypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,482 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::FeatureTypeAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::FeatureTypeAdaptor - A database adaptor for fetching and +storing Funcgen FeatureType objects. + +=head1 SYNOPSIS + +my $ft_adaptor = $db->get_FeatureTypeAdaptor; + +my $feature_type = $ft_adaptor->fetch_by_name("H3K4me3"); + + +=head1 DESCRIPTION + +The FeatureTypeAdaptor is a database adaptor for storing and retrieving +Funcgen FeatureType objects. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::FeatureType + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::FeatureTypeAdaptor; + +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +#Query extension stuff +use constant TRUE_TABLES => [['feature_type', 'ft']]; +use constant TABLES => [['feature_type', 'ft']]; + +#Regulatory evidence feature type information + +my $core_label = 'DNase1 & TFBS'; +my $attr_label = 'Hists & Pols'; + +my %regulatory_evidence_labels = + ( + $core_label => { + name => 'Open chromatin & TFBS', + long_name => 'Open chromatin & Transcription factor binding sites', + label => $core_label, + }, + + + $attr_label => { + name => 'Histones & polymerases', + long_name => 'Histone modifications & RNA polymerases', + label => $attr_label, + } + ); + + +my %regulatory_evidence_info = + ( + 'Transcription Factor' => $regulatory_evidence_labels{$core_label}, + 'Transcription Factor Complex' => $regulatory_evidence_labels{$core_label}, + 'Open Chromatin' => $regulatory_evidence_labels{$core_label}, + 'Polymerase' => $regulatory_evidence_labels{$attr_label}, + 'Histone' => $regulatory_evidence_labels{$attr_label}, + ); + +# + + +=head2 fetch_by_name + + Arg [1] : string - name of FeatureType + Arg [2] : optional string - class of FeatureType + Arg [3] : optional Bio::EnsEMBL::Analysis - Analysis used to generate FeatureType + Example : my $ft = $ft_adaptor->fetch_by_name('H3K4me2'); + Description: Does what it says on the tin + Returntype : Bio::EnsEMBL::Funcgen::FeatureType object (or ARRAY if called in ARRAY context) + Exceptions : Throws if more than one FeatureType for a given name found. + Throws if Analysis is defined but not valid. + Caller : General + Status : At risk + +=cut + +#Should really change this to fetch_all_by_name +#incorporating that functionality here will conditioanlly change the return type!! + + +sub fetch_by_name{ + my ($self, $name, $class, $analysis) = @_; + + throw("Must specify a FeatureType name") if(! $name); + + my $constraint = ' name = ? '; + + $constraint .= ' AND class = ? ' if $class; + + if($analysis){ + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Analysis', $analysis); + $constraint .= ' AND analysis_id = ? '; + } + + + + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + $self->bind_param_generic_fetch($class, SQL_VARCHAR) if $class; + $self->bind_param_generic_fetch($analysis->dbID, SQL_INTEGER) if $analysis; + + + my @fts = @{$self->generic_fetch($constraint)}; + + + #This can happen if using a redundant name between classes or analyses + #remove? + if( wantarray && (scalar @fts >1) ){ + $class ||= ''; + throw("Found more than one FeatureType:$class $name"); + } + + return (wantarray) ? @fts : $fts[0]; +} + + +=head2 fetch_all_by_Analysis + + Arg [1] : Bio::EnsEMBL::Analysis + Example : my @fts = @{$ft_adaptor->fetch_all_by_Analysis($analysis); + Description: Fetches all FeatureTypes for a given Analysis. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::FeatureType objects + Exceptions : Throws if Analysis not valid + Caller : General + Status : At risk + +=cut + +sub fetch_all_by_Analysis{ + my ($self, $analysis) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Analysis', $analysis); + my $constraint = ' analysis_id = ? '; + + #Use bind param method to avoid injection + $self->bind_param_generic_fetch($analysis->dbID, SQL_INTEGER); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_class + + Arg [1] : string - class of FeatureType + Example : my @fts = @{$ft_adaptor->fetch_all_by_class('Histone')}; + Description: Fetches all FeatureTypes of a given class. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::FeatureType objects + Exceptions : Throws if class not defined + Caller : General + Status : At risk + +=cut + +sub fetch_all_by_class{ + my ($self, $class) = @_; + + throw("Must specify a FeatureType class") if(! defined $class); + + + my $constraint = " class = ? "; + + + #Use bind param method to avoid injection + $self->bind_param_generic_fetch($class, SQL_VARCHAR); + + return $self->generic_fetch($constraint); +} + + + + + +sub fetch_all_by_associated_SetFeature{ + my ($self, $sfeat) = @_; + deprecate('Please use the more generic fetch_all_by_association method'); + return $self->fetch_all_by_association($sfeat); +} + +=head2 fetch_all_by_association + + Arg [1] : Bio::EnsEMBL::Funcgen::Storable + Example : my $assoc_ftypes = $ft_adaptor->fetch_all_by_association($ext_feature); + Description: Fetches all associated FeatureTypes for a given Storable. + Note: Where appropriate, the main FeatureType for a Storable is + accessible via the feature_type method. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::FeatureType objects + Exceptions : Throws if arg is not valid or stored + Caller : General + Status : At risk + +=cut + +sub fetch_all_by_association{ + my ($self, $storable) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::Storable', $storable); + + push @{$self->TABLES}, ['associated_feature_type', 'aft']; + + my $table_name = $storable->adaptor->_main_table->[0]; + + my $constraint = 'aft.feature_type_id=ft.feature_type_id AND aft.table_name="'.$table_name. + '" AND aft.table_id='.$storable->dbID; + + my $feature_types = $self->generic_fetch($constraint); + $self->reset_true_tables; + + return $feature_types; +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( ft.feature_type_id ft.name ft.class ft.analysis_id ft.description ft.so_accession ft.so_name); +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Channel objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::FeatureType objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $ft_id, $name, $class, $anal_id, $desc, $so_acc, $so_name, %analysis_hash); + my $anal_a = $self->db->get_AnalysisAdaptor; + + $sth->bind_columns(\$ft_id, \$name, \$class, \$anal_id, \$desc, \$so_acc, \$so_name); + + $analysis_hash{0} = undef; + + while ( $sth->fetch() ) { + + $anal_id ||= 0; + + if (! exists $analysis_hash{$anal_id}){ + $analysis_hash{$anal_id} = $anal_a->fetch_by_dbID($anal_id); + } + + my $ftype = Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -dbID => $ft_id, + -NAME => $name, + -CLASS => $class, + -ANALYSIS => $analysis_hash{$anal_id}, + -DESCRIPTION => $desc, + -SO_ACCESSION => $so_acc, + -SO_NAME => $so_name, + -ADAPTOR => $self + ); + + push @result, $ftype; + } + + return \@result; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::FeatureType objects + Example : $chan_a->store($c1, $c2, $c3); + Description: Stores given Channel objects in the database. Should only be + called once per array because no checks are made for duplicates. + Sets dbID and adaptor on the objects that it stores. + Returntype : None + Exceptions : Throws if FeatureType not valid + Throws if Analysis defined but not valid + Caller : General + Status : At Risk + +=cut + +sub store { + my ($self, @args) = @_; + + #Prepare once for all ftypes + my $sth = $self->prepare('INSERT INTO feature_type'. + '(name, class, analysis_id, description, so_accession, so_name)'. + 'VALUES (?, ?, ?, ?, ?, ?)'); + + #Process each ftype + foreach my $ft (@args) { + + #Validate ftype + if ( ! (ref($ft) && $ft->isa('Bio::EnsEMBL::Funcgen::FeatureType') )) { + throw('Can only store FeatureType objects, skipping $ft'); + } + + #Validate analysis + my $anal_id; + my $analysis = $ft->analysis; + + if($analysis){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Analysis', $analysis); + $anal_id = $analysis->dbID; + } + + #Is already stored? + if ( $ft->dbID && ( $ft->adaptor == $self) ) { + warn "Skipping previous stores FeatureType:\t".$ft->name.'('.$ft->dbID.")\n"; + } else { + #Check for previously stored FeatureType + my $s_ft = $self->fetch_by_name($ft->name, $ft->class, $ft->analysis); + + if (! $s_ft) { + $sth->bind_param(1, $ft->name, SQL_VARCHAR); + $sth->bind_param(2, $ft->class, SQL_VARCHAR); + $sth->bind_param(3, $anal_id, SQL_INTEGER); + $sth->bind_param(4, $ft->description, SQL_VARCHAR); + $sth->bind_param(5, $ft->so_accession, SQL_VARCHAR); + $sth->bind_param(6, $ft->so_name, SQL_VARCHAR); + + + $sth->execute(); + my $dbID = $sth->{mysql_insertid}; + $ft->dbID($dbID); + $ft->adaptor($self); + } + else { + $ft = $s_ft; + + #Check other fields match + my @failed_methods; + + for my $method (qw(description so_accession so_name)) { + #Allow nulls/undefs to match empty strings + my $ft_val = $ft->$method || ''; + my $s_ft_val = $s_ft->$method || ''; + + if ($ft_val ne $s_ft_val) { + push @failed_methods, "$method does not match between existing(${s_ft_val}) and new(${ft_val}) FeatureTypes"; + } + } + + if (@failed_methods) { + #Could throw, but maybe easier to patch after import? + warn("Used existing FeatureType with disparities:\n\t".join("\n\t", @failed_methods)); + } + } + } + } + + return \@args; +} + +=head2 list_regulatory_evidence_classes + + Args : None + Example : + Description: + Returntype : Array of Strings + Exceptions : None + Caller : web code + Status : At risk - remove in favour of get_regulatory_evidence_info + +=cut + +sub list_regulatory_evidence_classes { + my ($self) = @_; + + #change this to return the whole hash + + return keys(%regulatory_evidence_info); +} + +=head2 get_regulatory_evidence_labels + + Args : None + Example : + Description: + Returntype : HASREF + Exceptions : None + Caller : web code + Status : At risk + +=cut + +sub get_regulatory_evidence_labels{ + return \%regulatory_evidence_labels; +} + +=head2 get_regulatory_evidence_info + + Args : None + Example : + Description: Returns all regulatory evidence info keyed on feature type class + Returntype : HASHREF + Exceptions : None + Caller : web code + Status : At risk + +=cut + +sub get_regulatory_evidence_info{ + return \%regulatory_evidence_info; +} + + +#Deprecated + +#list_dbIDs now uses inherited BaseAdaptor::list_dbIDs + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/InputSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/InputSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,539 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::InputSetAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::InputSetAdaptor - A database adaptor for fetching and +storing InputSet objects. + +=head1 SYNOPSIS + +my $rset_adaptor = $db->get_InputSetAdaptor(); + +my @rsets = @{$rset_adaptor->fetch_all_InputSets_by_Experiment()}; +my @displayable_rsets = @{$rset_adaptor->fetch_all_displayable_InputSets()}; + +#Other methods? +#by FeatureType, CellType all with displayable flag? + + +=head1 DESCRIPTION + +The InputSetAdaptor is a database adaptor for storing and retrieving +InputSet objects. + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::InputSetAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +#Query extension stuff +use constant TRUE_TABLES => [ [ 'input_set', 'inp' ], [ 'input_subset', 'iss' ] ]; +use constant TABLES => [ [ 'input_set', 'inp' ], [ 'input_subset', 'iss' ] ]; + + + + +=head2 fetch_all + + Arg [1] : optional HASHREF - Parameter hash containing contraints config e.g. + {'constraints' => + { + cell_types => [$cell_type, ...], #Bio::EnsEMBL::Funcgen::CellType + feature_types => [$ftype, ...], #Bio::EnsEMBL::Funcgen::FeatureType + experiments => [$ecp, ...], #Bio::EnsEMBL::Funcgen::Experiment + format => $inpset_format, #String e.g. SEQUENCING + } + } + Example : + Description: Retrieves a list of InputSets. Optional paramters hash allows for flexible query terms. + Returntype : ARRAYREF of Bio::EnsEMBL::InputSet objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all{ + my ($self, $params_hash) = @_; + + my $results = $self->generic_fetch($self->compose_constraint_query($params_hash)); + $self->reset_true_tables; #in case we have added tables e.g. status + + return $results; +} + + + +=head2 fetch_all_by_FeatureType + + Arg [1] : Bio::EnsEMBL::Funcgen::FeatureType + Example : + Description: Retrieves a list of features on a given slice that are created + by probes from the specified type of array. + Returntype : Listref of Bio::EnsEMBL::InputSet objects + Exceptions : Throws if no valid FeatureType type is provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType { + my ($self, $ftype) = @_; + my $params = {constraints => {feature_types => [$ftype]}}; + return $self->generic_fetch($self->compose_constraint_query($params)); +} + + +=head2 fetch_all_by_CellType + + Arg [1] : Bio::EnsEMBL::Funcgen::CellType + Example : + Description: + Returntype : Arrayref of Bio::EnsEMBL::Funcgen::InputSet objects + Exceptions : Throws if no CellType is provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_CellType { + my ($self, $ctype) = @_; + my $params = {constraints => {cell_types => [$ctype]}}; + return $self->generic_fetch($self->compose_constraint_query($params)); +} + + +=head2 fetch_all_by_Experiment + + Arg [1] : Bio::EnsEMBL::Funcgen::Experiment + Example : $exp_set = $eseta->fetch_by_Experiment($exp); + Description: Retrieves a InputSet based on the given Experiment + Returntype : Bio::EnsEMBL::Funcgen::InputSet + Exceptions : Throws if no valid stored Experiment provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Experiment { + my ($self, $exp) = @_; + my $params = {constraints => {experiments => [$exp]}}; + return $self->generic_fetch($self->compose_constraint_query($params)); +} + +=head2 fetch_by_name + + Arg [1] : string - InputSet name + Example : $exp_set = $eseta->fetch_by_name('exp_set_1'); + Description: Retrieves a InputSet based on the ExperimentalSet name + Returntype : Bio::EnsEMBL::Funcgen::InputSet + Exceptions : Throws if no name provided + Caller : General + Status : At Risk + +=cut + +sub fetch_by_name { + my ($self, $name) = @_; + + throw('Need to pass a name argument') if( ! defined $name); + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + + return $self->generic_fetch("inp.name = ?")->[0]; +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + #can't have 'is' as an alias as it is reserved + return qw( + inp.input_set_id inp.experiment_id + inp.feature_type_id inp.cell_type_id + inp.format inp.vendor + inp.name inp.type + inp.replicate iss.name + iss.input_subset_id iss.archive_id + iss.display_url iss.replicate + iss.is_control + ); + + +} + +=head2 _left_join + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +#rather than default_where so we still get back InputSets without InputSubsets + +sub _left_join { + my $self = shift; + + return (['input_subset', 'inp.input_set_id = iss.input_set_id']); +} + + + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Experiment objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my ($dbid, $exp_id, $ftype_id, $ctype_id, $format, $vendor, $name, $ess_name, $ess_id, $type); + my ($inp_rep, $eset, @esets, $ftype, $ctype, $archive_id, $display_url, $iss_rep, $is_control); + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + $sth->bind_columns(\$dbid, \$exp_id, \$ftype_id, \$ctype_id, \$format, + \$vendor, \$name, \$type, \$inp_rep, \$ess_name, \$ess_id, + \$archive_id, \$display_url, \$iss_rep, \$is_control); + + + while ( $sth->fetch() ) { + + if(! $eset || ($eset->dbID() != $dbid)){ + + push @esets, $eset if $eset; + $ftype = (defined $ftype_id) ? $ft_adaptor->fetch_by_dbID($ftype_id) : undef; + throw("Could not fetch FeatureType with dbID $ftype_id for InputSet $name") if ! $ftype; + + $ctype = (defined $ctype_id) ? $ct_adaptor->fetch_by_dbID($ctype_id) : undef; + throw("Could not fetch CellType with dbID $ctype_id for InputSet $name") if ! $ctype; + + + $eset = Bio::EnsEMBL::Funcgen::InputSet->new( + -DBID => $dbid, + -EXPERIMENT => $exp_adaptor->fetch_by_dbID($exp_id), + -FORMAT => $format, + -VENDOR => $vendor, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -FEATURE_CLASS=> $type, + -ADAPTOR => $self, + -NAME => $name, + -REPLICATE => $inp_rep, + ); + } + + + if(defined $ess_name){ + + $eset->add_new_subset($ess_name, + Bio::EnsEMBL::Funcgen::InputSubset->new( + -name => $ess_name, + -dbID => $ess_id, + -adaptor => $self, + -input_set => $eset, + -archive_id => $archive_id, + -display_url => $display_url, + -replicate => $iss_rep, + -is_control => $is_control, + ) + ); + } + } + + push @esets, $eset if $eset; + + return \@esets; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::InputSet objects + Example : $rsa->store(@esets); + Description: Stores or updates previously stored InputSet objects in the database. + Returntype : None + Exceptions : Throws if a List of InputSet objects is not provided or if + an analysis is not attached to any of the objects + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @exp_sets) = @_; + + throw("Must provide a list of InputSet objects") if(scalar(@exp_sets == 0)); + + + + my $sth = $self->prepare('INSERT INTO input_set (experiment_id, feature_type_id, + cell_type_id, format, vendor, name, type, replicate) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)'); + + my $db = $self->db(); + + foreach my $set (@exp_sets) { + + if( ! ref $set || ! $set->isa('Bio::EnsEMBL::Funcgen::InputSet') ) { + throw('Must be an InputSet object to store'); + } + + + if ( $set->is_stored($db) ) { + throw('InputSet [' . $set->dbID() . '] is already stored in the database\nInputSetAdaptor does not yet accomodate updating InputSets'); + #would need to retrive stored result set and update table_ids + } + + + my $ct_id = (defined $set->cell_type()) ? $set->cell_type->dbID() : undef; + my $ft_id = (defined $set->feature_type()) ? $set->feature_type->dbID() : undef; + + + + + $sth->bind_param(1, $set->get_Experiment->dbID(), SQL_INTEGER); + $sth->bind_param(2, $ft_id, SQL_INTEGER); + $sth->bind_param(3, $ct_id, SQL_INTEGER); + $sth->bind_param(4, $set->format, SQL_VARCHAR); + $sth->bind_param(5, $set->vendor, SQL_VARCHAR); + $sth->bind_param(6, $set->name, SQL_VARCHAR); + $sth->bind_param(7, $set->feature_class, SQL_VARCHAR); + $sth->bind_param(7, $set->replicate, SQL_INTEGER); + + + $sth->execute(); + + $set->dbID( $sth->{'mysql_insertid'} ); + $set->adaptor($self); + + #This should never happen as InputSubset now tests for stored InputSet first + $self->store_InputSubsets($set->get_InputSubsets()) if @{$set->get_InputSubsets()}; + } + + return \@exp_sets; +} + + +=head2 store_InputSubsets + + Args : Bio::EnsEMBL::Funcgen::InputSet + Example : $esa->store_InputSubsets(\@e_subsets); + Description: Convenience methods extracted from store to allow updating of InputSubset entries + during inline result processing which would otherwise be troublesome due to the need + for an InputSet + Returntype : Bio::EnsEMBL::Funcgen::InputSet + Exceptions : Throws if a stored InputSet object is not provided + Throws if no InputSubsets present + Caller : General + Status : At Risk + +=cut + + +sub store_InputSubsets{ + my ($self, $ssets) = @_; + + my $sth = $self->prepare(" + INSERT INTO input_subset ( + input_set_id, name, archive_id, display_url, replicate, is_control + ) VALUES (?, ?, ?, ? ,?, ?) + "); + + throw('Must provide at least one InputSubset') if(! @$ssets); + + #Store and set all previously unstored table_ids + foreach my $sset(@$ssets){ + + #use is_stored here? + if($sset->dbID()){ + warn "Skipping InputSubset ".$sset->name()." - already stored in the DB"; + next; + } + + + $sth->bind_param(1, $sset->input_set->dbID, SQL_INTEGER); + $sth->bind_param(2, $sset->name, SQL_VARCHAR); + $sth->bind_param(3, $sset->archive_id, SQL_VARCHAR); + $sth->bind_param(4, $sset->display_url, SQL_VARCHAR); + $sth->bind_param(5, $sset->replicate, SQL_INTEGER); + $sth->bind_param(6, $sset->is_control, SQL_INTEGER); + $sth->execute(); + + $sset->dbID($sth->{'mysql_insertid'}); + $sset->adaptor($self); + } + + #don't really need to return as we're working on a ref + return $ssets; +} + +=head2 list_dbIDs + + Args : None + Example : my @sets_ids = @{$esa->list_dbIDs()}; + Description: Gets an array of internal IDs for all InputSet objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : general + Status : stable + +=cut + +sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs('input_set'); +} + + + + +#All these _constrain methods must return a valid constraint string, and a hashref of any other constraint config + +#Need to bind param any of these which come from URL parameters and are not tested + + +sub _constrain_cell_types { + my ($self, $cts) = @_; + + #Don't need to bind param this as we validate + my $constraint = ' inp.cell_type_id IN ('. + join(', ', @{$self->db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::CellType', $cts, 'dbID')}).')'; + + #{} = no further config + return ($constraint, {}); +} + + +sub _constrain_feature_types { + my ($self, $fts) = @_; + + + my @tables = $self->_tables; + my (undef, $syn) = @{$tables[0]}; + + #Don't need to bind param this as we validate + my $constraint = " ${syn}.feature_type_id IN (". + join(', ', @{$self->db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $fts, 'dbID')}).')'; + + #{} = not futher constraint conf + return ($constraint, {}); +} + + +sub _constrain_experiments { + my ($self, $exps) = @_; + + #Don't need to bind param this as we validate + my $constraint = ' inp.experiment_id IN ('. + join(', ', @{$self->db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::Experiment', $exps, 'dbID')}).')'; + + #{} = not futher constraint conf + return ($constraint, {}); +} + +# remove format? + +sub _constrain_format { + my ($self, $format) = @_; + + #Is not currently enum'd so have to hardcode current values for now + #likely to change + #SEQUENCING EQTL + + my %valid_formats = (SEQUENCING=>1); + #SEGMENTATION? + + if (! exists $valid_formats{uc($format)}) { + throw("$format is not a valid InputSet format, please specify one of:\t". + join(', ', keys %valid_formats)); + } + + my $constraint = ' inp.format="'.uc($format).'"'; + + #{} = not futher constraint conf + return ($constraint, {}); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/MetaCoordContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/MetaCoordContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,224 @@ + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + package Bio::EnsEMBL::Funcgen::DBSQL::MetaCoordContainer; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception; + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +#Can remove this whole class from the API and use the core class if we can resolve problems below +#remove new and inherit from Bio::EnsEMBL::DBSQL::MetaCoordContainer? + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + # + # Retrieve the list of the coordinate systems that features are stored in + # and cache them + # + + my $sql = 'SELECT mc.table_name, mc.coord_system_id, mc.max_length FROM meta_coord mc'; + my @args; + if($self->is_multispecies()) { + $sql .= ' join coord_system cs using (coord_system_id) where cs.species_id =?'; + push(@args, $self->species_id()); + } + + my $sth = $self->prepare($sql); + $sth->execute(@args); + + while(my ($table_name, $cs_id, $max_length) = $sth->fetchrow_array()) { + $self->{'_feature_cache'}->{lc($table_name)} ||= []; + push @{$self->{'_feature_cache'}->{lc($table_name)}}, $cs_id; + $self->{'_max_len_cache'}->{$cs_id}->{lc($table_name)} = $max_length; + } + $sth->finish(); + + return $self; +} + + + + +=head2 fetch_all_CoordSystems_by_feature_type + + Arg [1] : string $table - the name of the table to retrieve coord systems + for. E.g. 'gene', 'exon', 'dna_align_feature' + Example : @css = @{$mcc->fetch_all_CoordSystems_by_feature_type('gene')}; + Description: This retrieves the list of coordinate systems that features + in a particular table are stored. It is used internally by + the API to perform queries to these tables and to ensure that + features are only stored in appropriate coordinate systems. + Returntype : listref of Bio::EnsEMBL::Funcgen::CoordSystem objects + Exceptions : throw if name argument not provided + Caller : BaseFeatureAdaptor + Status : At risk + +=cut + +# can remove this if we can get get_CoordSystemAdaptor to return Funcgen rather than core + +sub fetch_all_CoordSystems_by_feature_type { + my $self = shift; + my $table = lc(shift); #case insensitive matching + + throw('Name argument is required') unless $table; + + if(!$self->{'_feature_cache'}->{$table}) { + return []; + } + + my @cs_ids = @{$self->{'_feature_cache'}->{$table}}; + my @coord_systems; + + my $csa = $self->db->get_FGCoordSystemAdaptor(); + + foreach my $cs_id (@cs_ids) { + my $cs = $csa->fetch_by_dbID($cs_id); + + if(!$cs) { + throw("meta_coord table refers to non-existant coord_system $cs_id"); + } + + push @coord_systems, $cs; + } + + return \@coord_systems; +} + + + +=head2 fetch_max_length_by_CoordSystem_feature_type + + Arg [1] : Bio::EnsEMBL::Funcgen::CoordSystem $cs + Arg [2] : string $table + Example : $max_len = + $mcc->fetch_max_length_by_CoordSystem_feature_type($cs,'gene'); + Description: Returns the maximum length of features of a given type in + a given coordinate system. + Returntype : int or undef + Exceptions : throw on incorrect argument + Caller : BaseFeatureAdaptor + Status : At risk + +=cut + +#can remove this if we can get Funcgen::Coordsystem to inherit from core CoordSystem + +sub fetch_max_length_by_CoordSystem_feature_type { + my $self = shift; + my $cs = shift; + my $table = shift; + + if(!ref($cs) || !$cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')) { + throw('Bio::EnsEMBL::Funcgen::CoordSystem argument expected'); + } + + throw("Table name argument is required") unless $table; + + return $self->{'_max_len_cache'}->{$cs->dbID()}->{lc($table)}; +} + + + +=head2 add_feature_type + + Arg [1] : Bio::EnsEMBL::Funcgen::CoordSystem $cs + The coordinate system to associate with a feature table + Arg [2] : string $table - the name of the table in which features of + a given coordinate system will be stored in + Arg [3] : int $length + This length is used to update the max_length in the database + and the internal cache. + Example : $csa->add_feature_table($chr_coord_system, 'gene'); + Description: This function tells the coordinate system adaptor that + features from a specified table will be stored in a certain + coordinate system. If this information is not already stored + in the database it will be added. + Returntype : none + Exceptions : none + Caller : BaseFeatureAdaptor + Status : At risk + +=cut + + +#Can also be removed if inheritance/get_CoordSystemAdaptor issues resolved + +sub add_feature_type { + my $self = shift; + my $cs = shift; + my $table = lc(shift); + my $length = shift; + if(!ref($cs) || !$cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')) { + throw('CoordSystem argument is required.'); + } + + if(!$table) { + throw('Table argument is required.'); + } + + my $cs_ids = $self->{'_feature_cache'}->{$table} || []; + + my ($exists) = grep {$cs->dbID() == $_} @$cs_ids; + if( $exists ) { + if( !$self->{'_max_len_cache'}->{$cs->dbID()}->{$table} || + $self->{'_max_len_cache'}->{$cs->dbID()}->{$table} < $length ) { + my $sth = $self->prepare('UPDATE meta_coord ' . + "SET max_length = $length " . + 'WHERE coord_system_id = ? ' . + 'AND table_name = ? '. + "AND (max_length<$length ". + "OR max_length is null)"); + $sth->execute( $cs->dbID(), $table ); + $self->{'_max_len_cache'}->{$cs->dbID()}->{$table} = $length; + } + return; + } + + #store the new tablename -> coord system relationship in the db + #ignore failures b/c during the pipeline multiple processes may try + #to update this table and only the first will be successful + my $sth = $self->prepare('INSERT IGNORE INTO meta_coord ' . + 'SET coord_system_id = ?, ' . + 'table_name = ?, ' . + 'max_length = ? ' + ); + + $sth->execute($cs->dbID, $table, $length ); + + #update the internal cache + $self->{'_feature_cache'}->{$table} ||= []; + push @{$self->{'_feature_cache'}->{$table}}, $cs->dbID(); + $self->{'_max_len_cache'}->{$cs->dbID()}->{$table} = $length; + + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/MotifFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/MotifFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,657 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::MotifFeatureAdaptor +# +# You may distribute this module under the same terms as Perl itself + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor - A database adaptor for fetching and +storing MotifFeature objects. + +=head1 SYNOPSIS + +my $mfa = $db->get_MotifFeatureAdaptor(); + +my @mfeatures = @{$mfa->fetch_all_by_Slice_CellType($slic, $ctype)}; + + +=head1 DESCRIPTION + +The MotifFeatureAdaptor is a database adaptor for storing and retrieving +MotifFeature objects. + + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::MotifFeature + + +=head1 LICENSE + + Copyright (c) 1999-2009 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::MotifFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor); + +#Need to re-implement some of the base methods in SetFeatureAdaptor + +#Define tables here to enable query extension i.e. cell_type querys +# Need to add motif/pwm/binding_matrix here +#associated_motif_feature, annotated_feature and finally feature_set.cell_type_id + +use constant TRUE_TABLES => [['motif_feature', 'mf']]; +use constant TABLES => [['motif_feature', 'mf']]; + + +my $true_final_clause = ' ORDER by mf.seq_region_id, mf.seq_region_start, mf.seq_region_end'; +# ORDER by required by fetch_all_by_dbID_list when fetching as regulatory_attributes +# was '' to avoid use of undef warning from BaseAdaptor +my $final_clause = $true_final_clause; + + +=head2 fetch_all_by_AnnotatedFeature + + Arg [1] : Bio::EnsEMBL::AnnotatedFeature + Arg [2] : optional - Bio::EnsEMBL::Slice + Example : my $features = $ofa->fetch_all_by_AnnotatedFeature($af); + Description: Retrieves a list of all MotifFeatures linked to the given + AnnotatedFeature + Returntype : Listref of Bio::EnsEMBL::Funcgen::MotifFeature objects + Exceptions : Throws if AnnotatedFeature not stored and valid + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_AnnotatedFeature { + my ($self, $feature, $slice) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::AnnotatedFeature', $feature); + + #Extend query tables + push @{$self->TABLES}, (['associated_motif_feature', 'amf']); + my $constraint = 'mf.motif_feature_id = amf.motif_feature_id AND amf.annotated_feature_id=?'; + #No need for group here as we are restricting to one af + + $self->bind_param_generic_fetch($feature->dbID, SQL_INTEGER); + + my $mfs = $self->generic_fetch($constraint, undef, $slice); + $self->reset_true_tables; + + return $mfs; +} + +=head2 fetch_all_by_Slice_CellType + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::Funcgen::CellType + #Arg [3] : (optional) string - type e.g. Jaspar/Inferred + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_CellType($slice, $ct); + Description: Retrieves a list of features on a given slice, specific for a given CellType. + Returntype : Listref of Bio::EnsEMBL::MotifFeature objects + Exceptions : Throws if CellType is not valid + Caller : General + Status : At Risk - implement/change type to Analysis + +=cut + +sub fetch_all_by_Slice_CellType { + my ($self, $slice, $ctype, $type) = @_; + + #could add logic_name here for motif mapper analysis, motif source analysis + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::CellType', $ctype); + + #Extend query tables + push @{$self->TABLES}, (['feature_set', 'fs'], ['associated_motif_feature', 'amf'], ['annotated_feature', 'af']); + + my $constraint = 'mf.motif_feature_id = amf.motif_feature_id AND '. + 'amf.annotated_feature_id=af.annotated_feature_id and '. + 'af.feature_set_id=fs.feature_set_id AND fs.cell_type_id = ?'; + + #Group here as the mf may be linked to multiple afs + $final_clause = ' GROUP BY mf.motif_feature_id'; + + + $self->bind_param_generic_fetch( $ctype->dbID(), SQL_INTEGER); + my $mfs = $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $mfs; +} + + +=head2 fetch_all_by_Slice_BindingMatrix + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::Funcgen::BindingMatrix + #Arg [3] : (optional) string - type e.g. Jaspar/Inferred + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_BindingMatric($slice, $bm); + Description: Retrieves a list of features on a given slice, specific for a given BindingMatrix. + Returntype : Listref of Bio::EnsEMBL::MotifFeature objects + Exceptions : Throws if BindinMatrix is not valid + Caller : General + Status : At Risk - implement/change type to Analysis + +=cut + +sub fetch_all_by_Slice_BindingMatrix { + my ($self, $slice, $bm, $type) = @_; + + #could add logic_name here for motif mapper analysis, motif source analysis + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::BindingMatrix', $bm); + + my $constraint = 'mf.binding_matrix_id = ?'; + + $self->bind_param_generic_fetch( $bm->dbID(), SQL_INTEGER); + my $mfs = $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); + $self->reset_true_tables; + + return $mfs; +} + + +=head2 fetch_all_by_Slice_FeatureSets + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : arrayref of Bio::EnsEMBL::Funcgen::FeatureSet objects + #Arg [3] : (optional) string - type e.g. Jaspar/Inferred + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_FeatureSet($slice, $fset); + Description: Retrieves a list of features on a given slice, specific for a given FeatureSet. + Returntype : Listref of Bio::EnsEMBL::MotifFeature objects + Exceptions : Throws if FeatureSet is not valid + Caller : General + Status : At Risk - implement/change type to Analysis + +=cut + +sub fetch_all_by_Slice_FeatureSets { + my ($self, $slice, $fsets, $type) = @_; + + #could add logic_name here for motif mapper analysis, motif source analysis + #$self->db->are_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fsets); + foreach my $fset (@$fsets){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + } + + #Extend query tables + push @{$self->TABLES}, (['associated_motif_feature', 'amf'], ['annotated_feature', 'af']); + + my $constraint = 'mf.motif_feature_id = amf.motif_feature_id AND '. + 'amf.annotated_feature_id=af.annotated_feature_id and af.feature_set_id IN('.join(',', (map $_->dbID, @$fsets)).')'; + #Can't bind_param in lists + + #Group here as the mf may be linked to multiple afs across fsets + $final_clause = ' GROUP BY mf.motif_feature_id'; + + my $mfs = $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $mfs; +} + + +=head2 _final_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an ORDER BY clause. Sorting by probe_feature_id would be + enough to eliminate duplicates, but sorting by location might + make fetching features on a slice faster. + Returntype : String + Exceptions : None + Caller : generic_fetch + Status : At Risk + +=cut + + +sub _final_clause { + return $final_clause; +} + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + mf.motif_feature_id mf.seq_region_id + mf.seq_region_start mf.seq_region_end + mf.seq_region_strand mf.binding_matrix_id + mf.display_label mf.score + mf.interdb_stable_id + ); +} + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates MotifFeature objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::MotifFeature objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + my ($sa, $seq_region_id); + $sa = $dest_slice->adaptor->db->get_SliceAdaptor() if($dest_slice); + $sa ||= $self->db->dnadb->get_SliceAdaptor(); + + + my $bm_adaptor = $self->db->get_BindingMatrixAdaptor(); + my @features; + my (%bm_hash, %slice_hash, %sr_name_hash, %sr_cs_hash); + + my ( + $motif_feature_id, $efg_seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $bm_id, + $display_label, $score, + $stable_id + ); + + $sth->bind_columns( + \$motif_feature_id, \$efg_seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$bm_id, + \$display_label, \$score, + \$stable_id + ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_name; + my $asm_cs_vers; + my $cmp_cs_name; + my $cmp_cs_vers; + + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + } + + + FEATURE: while ( $sth->fetch() ) { + + #Build a slice adaptor cache here if we want to enable mapping between assemblies?? + #Or if we supported the mapping between cs systems for a given schema_build, which would have to be handled by the core api + + #get core seq_region_id + #This fails if we are using a 'comparable' CoordSystem as we don't have a cache + #for the new DB. Wasn't this fixed with the tmp seq_region_cache? + $seq_region_id = $self->get_core_seq_region_id($efg_seq_region_id); + + if(! $seq_region_id){ + warn "Cannot get slice for eFG seq_region_id $efg_seq_region_id\n". + "The region you are using is not present in the current seq_region_cache.\n". + "Maybe you need to redefine the dnadb or update_DB_for_release?"; + next; + } + + #Get the BindingMatrix object + $bm_hash{$bm_id} = $bm_adaptor->fetch_by_dbID($bm_id) if(! exists $bm_hash{$bm_id}); + + + # Get the slice object + my $slice = $slice_hash{'ID:'.$seq_region_id}; + + if (! $slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{'ID:'.$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system if a mapper was provided + if ($mapper) { + + throw("Not yet implmented mapper, check equals are Funcgen calls too!"); + + ($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand) + = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); + + # Skip features that map to gaps or coord system boundaries + next FEATURE if !defined $sr_name; + + # Get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} + ||= $sa->fetch_by_region($cmp_cs_name, $sr_name, undef, undef, undef, $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} + ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # If a destination slice was provided convert the coords + # If the destination slice starts at 1 and is forward strand, nothing needs doing + if ($dest_slice) { + + unless ($dest_slice_start == 1 && $dest_slice_strand == 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + # Throw away features off the end of the requested slice + if(! $self->force_reslice){ + #force_reslice set by RegulatoryFeature::regulatory_attributes + #so we don't lose attrs which are not on the dest_slice + + next FEATURE if $seq_region_end < 1 || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_name ne $sr_name ); + } + + $slice = $dest_slice; + } + + + + push @features, Bio::EnsEMBL::Funcgen::MotifFeature->new_fast + ( + {'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'adaptor' => $self, + 'dbID' => $motif_feature_id, + 'score' => $score, + 'display_label' => $display_label, + 'binding_matrix' => $bm_hash{$bm_id}, + 'interdb_stable_id', => $stable_id, + } ); + } + + return \@features; +} + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::MotifFeature objects + Example : $ofa->store(@features); + Description: Stores given MotifFeature objects in the database. Should only + be called once per feature because no checks are made for + duplicates. Sets dbID and adaptor on the objects that it stores. + Returntype : Listref of stored MotifFeatures + Exceptions : Throws if a list of MotifFeature objects is not provided or if + the Analysis, CellType and FeatureType objects are not attached or stored + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @mfs) = @_; + + if (scalar(@mfs) == 0) { + throw('Must call store with a list of MotifFeature objects'); + } + + my $sth = $self->prepare(" + INSERT INTO motif_feature ( + seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + binding_matrix_id, display_label, score, interdb_stable_id + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + "); + + my $db = $self->db(); + + FEATURE: foreach my $mf (@mfs) { + + if( ! (ref($mf) && $mf->isa('Bio::EnsEMBL::Funcgen::MotifFeature'))){ + throw('Feature must be an MotifFeature object'); + } + + + + #Check for preexiting MF should be done in caller + #as there is currently no unique key to restrict duplicates + + + if ( $mf->is_stored($db) ) { + warning('MotifFeature [' . $mf->dbID() . '] is already stored in the database'); + next FEATURE; + } + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::BindingMatrix', $mf->binding_matrix); + + + my $seq_region_id; + ($mf, $seq_region_id) = $self->_pre_store($mf); + + my $dlabel = $mf->display_label; + + if(! defined $dlabel){ + $dlabel = $mf->binding_matrix->feature_type->name.':' + .$mf->binding_matrix->name(); + } + + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $mf->start(), SQL_INTEGER); + $sth->bind_param(3, $mf->end(), SQL_INTEGER); + $sth->bind_param(4, $mf->strand(), SQL_TINYINT); + $sth->bind_param(5, $mf->binding_matrix->dbID(), SQL_INTEGER); + $sth->bind_param(6, $dlabel, SQL_VARCHAR); + $sth->bind_param(7, $mf->score(), SQL_DOUBLE); + $sth->bind_param(8, $mf->interdb_stable_id(), SQL_INTEGER); + + $sth->execute(); + + $mf->dbID( $sth->{'mysql_insertid'} ); + $mf->adaptor($self); + + #Don't store assoicated AF/TFF here + #do this explicitly in the caller via store_associated_AnnotatedFeature + } + + return \@mfs; +} + + +=head2 store_associated_AnnotatedFeature + + Args[1] : Bio::EnsEMBL::Funcgen::MotifFeature + Args[2] : Bio::EnsEMBL::Funcgen::AnnotatedFeature + Example : $esa->store_AnnotatedFeature_association($mf, $af); + Description: Store link between AnnotatedFeatures representing TF peaks + and MotifFeatures + Returntype : Bio::EnsEMBL::Funcgen::MotifFeature + Exceptions : Throws if args are not valid, warns if association already exists + Caller : General + Status : At Risk - likely to change to TranscriptFactorFeature + +=cut + +#Will always load AFs first, as we need them to generate BMs and MFs +#Hence we may not have all the associated AFs when we store the MF for the first time +#So we need to be able to update these, and will most likely be 1 AF at a time + +#Where will these be stored? +#reciprocal methods here and in AF/TFF? +#Just in MF for now untill we write TFF + +#When loading MFs, need a wayway to identify if +#it has already been loaded for a previous set +#possiblity of parallel processes loading same MF at same time. +#load scripts should handle multiple fsets, and only ever be run in series +#wrt to fset, can chunk slice wise + +#Should only ever rollback associations relevant to fsets in question +#this may leave orphaned MFs which may then cause duplicates + +#Duplicate may also be cause if some fsets are run in series, so we definitely +#need to test for MF using feature Slice and BindingMatrix + + +sub store_associated_AnnotatedFeature{ + my ($self, $mf, $af) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::MotifFeature', $mf); + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::AnnotatedFeature', $af); + + + #Check for existing association + + foreach my $existing_af(@{$mf->associated_annotated_features}){ + + if( $existing_af->dbID == $af->dbID ){ + warn "You are trying to store a pre-exiting AnnotatedFeature association"; + return; + } + } + + + #Validate MotifFeature is entirely contained within the AnnotatedFeature + + #if(! (( $mf->seq_region_start <= $af->seq_region_end) && + #( $mf->seq_region_end >= $af->seq_region_start))){ + # + # throw('MotifFeature is not entirely contained within associated AnnotatedFeature'); + # } + + + my $sth = $self->prepare(" + INSERT INTO associated_motif_feature ( + annotated_feature_id, motif_feature_id + ) VALUES (?, ?) + "); + + $sth->bind_param(1, $af->dbID, SQL_INTEGER); + $sth->bind_param(2, $mf->dbID, SQL_INTEGER); + $sth->execute(); + + + push @{$mf->{associated_annotated_features}}, $af; + + return $mf; +} + + + + +=head2 fetch_by_interdb_stable_id + + Arg [1] : Integer $stable_id - The 'interdb stable id' of the motif feature to retrieve + Example : my $rf = $rf_adaptor->fetch_by_interdb_stable_id(1); + Description: Retrieves a motif feature via its stable id. This is really an internal + method to facilitate inter DB linking. + Returntype : Bio::EnsEMBL::Funcgen::MotifFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_interdb_stable_id { + my ($self, $stable_id) = @_; + + $self->bind_param_generic_fetch($stable_id, SQL_INTEGER); + + return $self->generic_fetch('mf.interdb_stable_id=?')->[0]; +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ProbeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ProbeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,584 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::ProbeAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ProbeAdaptor - A database adaptor for fetching and +storing Probe objects. + +=head1 SYNOPSIS + +my $opa = $db->get_ProbeAdaptor(); + +my $probe = $opa->fetch_by_array_probe_probeset('Array-1', 'Probe-1', undef); + +=head1 DESCRIPTION + +The ProbeAdaptor is a database adaptor for storing and retrieving +Probe objects. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::Probe + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::ProbeAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Probe; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor;#Have to use here to import @EXPORT + +use base qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); #@ISA +#change to parent with perl 5.10 + + +=head2 fetch_by_array_probe_probeset_name + + Arg [1] : string - name of array + Arg [2] : string - name of probe + Arg [3] : (optional) string - name of probeset + Example : my $probe = $opa->fetch_by_array_probeset_probe('Array-1', 'Probe-1', 'ProbeSet-1'); + Description: Returns a probe given a combination of array name, probeset and + probe name. This will uniquely define an Affy probe. Only one + probe is ever returned. + Returntype : Bio::EnsEMBL::Funcgen::Probe + Exceptions : throws if array or probe name not defined + Caller : General + Status : At Risk - rename to fetch_by_probe_array_probeset_name? + +=cut + +#This does not currently capture on plate replicate probes with different names +#Only returns the record corresponding to the given name and not the other replicate name + +sub fetch_by_array_probe_probeset_name { + my ($self, $array_name, $probe_name, $probeset_name) = @_; + + if(! (defined $array_name && defined $probe_name)){ + throw('You must provide at least and array and probe name'); + } + + my $tables = 'probe p, array_chip ac, array a'; + $tables .= ', probe_set ps' if defined $probeset_name; + + my $sql = "SELECT distinct(p.probe_id) FROM $tables WHERE a.name=? and a.array_id=ac.array_id and ac.array_chip_id=p.array_chip_id and p.name=?"; + $sql .= ' AND p.probe_set_id=ps.probe_set_id and ps.name=?' if defined $probeset_name; + my $sth = $self->db->dbc->prepare($sql); + $sth->bind_param(1, $array_name, SQL_VARCHAR); + $sth->bind_param(2, $probe_name, SQL_VARCHAR); + $sth->bind_param(3, $probeset_name, SQL_VARCHAR) if defined $probeset_name; + $sth->execute; + + #This should only return one result + #The only possible way this would not return one result + #is if an identically named array(:probeset):probe which had a different sequence + #As Import array would separate these based on the sequence hash + my ($dbid) = $sth->fetchrow_array; + + return (defined $dbid) ? $self->fetch_by_dbID($dbid) : undef; +} + + + + +=head2 fetch_all_by_name + + Arg [1] : string - probe name + Example : my @probes = @{$opa->fetch_all_by_name('Probe1')}; + Description: Convenience method to re-instate the functionality of + $core_dbentry_adpator->fetch_all_by_External_name('probe_name'); + WARNING: This may not be the probe you are expecting as + probe names are not unqiue across arrays and vendors. + These should ideally be validated using the attached array + information or alternatively use fetch_by_array_probe_probeset_name + Returns a probe with the given name. + Returntype : Arrayref + Exceptions : Throws if name not passed + Caller : General + Status : At Risk + +=cut + + +sub fetch_all_by_name{ + my ($self, $name) = @_; + + + throw('Must provide a probe name argument') if ! defined $name; + + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + + return $self->generic_fetch('p.name=?'); +} + + + + +=head2 fetch_all_by_ProbeSet + + Arg [1] : Bio::EnsEMBL::ProbeSet + Example : my @probes = @{$opa->fetch_all_by_ProbeSet($pset)}; + Description: Fetch all probes in a particular ProbeSet. + Returntype : Listref of Bio::EnsEMBL::Probe objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_ProbeSet { + my ($self, $probeset) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ProbeSet', $probeset); + return $self->generic_fetch('p.probe_set_id = '.$probeset->dbID); +} + + + +=head2 fetch_all_by_Array + + Arg [1] : Bio::EnsEMBL::Funcgen::Array + Example : my @probes = @{$opa->fetch_all_by_Array($array)}; + Description: Fetch all probes on a particular array. + Returntype : Listref of Bio::EnsEMBL::Probe objects. + Exceptions : throws if arg is not valid or stored + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Array { + my $self = shift; + my $array = shift; + + if(! (ref($array) && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())){ + throw('Need to pass a valid stored Bio::EnsEMBL::Funcgen::Array'); + } + + #get all array_chip_ids, for array and do a multiple OR statement with generic fetch + + return $self->generic_fetch('p.array_chip_id IN ('.join(',', @{$array->get_array_chip_ids()}).')'); +} + +=head2 fetch_all_by_ArrayChip + + Arg [1] : Bio::EnsEMBL::Funcgen::ArrayChip + Example : my @probes = @{$opa->fetch_all_by_ArrayChip($array_chip)}; + Description: Fetch all probes on a particular ArrayChip. + Returntype : Listref of Bio::EnsEMBL::Probe objects. + Exceptions : throw is arg is not valid or stored + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_ArrayChip { + my $self = shift; + my $array_chip = shift; + + if(! (ref($array_chip) && $array_chip->isa('Bio::EnsEMBL::Funcgen::ArrayChip') && $array_chip->dbID())){ + throw('Need to pass a valid stored Bio::EnsEMBL::Funcgen::ArrayChip'); + } + + return $self->generic_fetch("p.array_chip_id =".$array_chip->dbID()); +} + + + +=head2 fetch_by_ProbeFeature + + Arg [1] : Bio::EnsEMBL::Funcgen::ProbeFeature + Example : my $probe = $opa->fetch_by_ProbeFeature($feature); + Description: Returns the probe that created a particular feature. + Returntype : Bio::EnsEMBL::Probe + Exceptions : Throws if argument is not a Bio::EnsEMBL::Funcgen::ProbeFeature object + Caller : General + Status : At Risk + +=cut + +sub fetch_by_ProbeFeature { + my $self = shift; + my $feature = shift; + + if ( + !ref($feature) + || !$feature->isa('Bio::EnsEMBL::Funcgen::ProbeFeature') + || !$feature->{'probe_id'} + ) { + throw('fetch_by_ProbeFeature requires a stored Bio::EnsEMBL::Funcgen::ProbeFeature object'); + } + + return $self->fetch_by_dbID($feature->{'probe_id'}); +} + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return (['probe', 'p']); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( p.probe_id p.probe_set_id p.name p.length p.array_chip_id p.class p.description); + +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Probe objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Probe objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $current_dbid, $arraychip_id, $probe_id, $probe_set_id, $name, $class, $probelength, $desc); + my ($array, %array_cache, %probe_set_cache); + + $sth->bind_columns(\$probe_id, \$probe_set_id, \$name, \$probelength, \$arraychip_id, \$class, \$desc); + + + #Complex query extension + #We want the arrays, array_chip and probeset information setting + #So the probe feature zmenu can just do one query to populate the zmenu unstead of 4 + #Let's just do this with probset to start with as this is more simple + #The object creation for the linked adaptors need to be commodotised + #So we can say something like $probeset_adaptor->create_obj_from_sth_args + #Caches will need to be built in the calling adaptor rather than the true object adaptor + + #No group required as we will always want intermediate data + #Therefore cannot use this in combined with simple extension??? + #Unless we explicitly state group by primary keys. + + #Need to build array of adaptor column hashes(order important) + #Need to build and array of bound columns dependant + #Then we need to parse output dependant on primary keys of each table + #So we would need bolumns bound to hash values + + #We need a way to define the extended tables + #Pass param hash to caller which uses BaseAdaptor method to add tables and columns + #This would have to take into account anything added by the caller + + #Maybe the better way of doing this test would be to include all probes in a probeset? + #Or maybe all probe_features for a probe? + + + + + my $probe; + + while ( $sth->fetch() ) { + + #warn("Need to sort array cacheing, have redundant cache!!"); + #This is nesting array and probeset objects in probe! + + + #Is this required? or should we lazy load this? + #Should we also do the same for probe i.e. nest or lazy load probeset + #Setting here prevents, multiple queries, but if we store the array cache in the adaptor we can overcome this + #danger of eating memory here, but it's onld the same as would be used for generating all the probesets + #what about clearing the cache? + #also as multiple array_chips map to same array, cache would be redundant + #need to store only once and reference. + #have array_cache and arraychip_map + #arraychip_map would give array_id which would be key in array cache + #This is kinda reinventing the wheel, but reducing queries and redundancy of global cache + #cache would never be populated if method not called + #there for reducing calls and memory, increasing speed of generation/initation + #if method were called + #would slightly slow down processing, and would slightly increase memory as cache(small as non-redundant) + #and map hashes would persist + + + ####MAKE THIS LAZY LOADED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #Can we even do this given we won't then have the array context? + #We should just force this for efficiency and make people keep the array if they ever want to use that info? + #Will this affect any other methods? + + + + #Can we not change this to an ArrayChip cache and just reimplement the array method? + + $array = $array_cache{$arraychip_id} || $self->db->get_ArrayAdaptor()->fetch_by_array_chip_dbID($arraychip_id); + + + #I don't think we need this? Certainly not for storing + + #$probe_set = $probe_set_cache{$probe_set_id} || $self->db->get_ArrayAdaptor()->fetch_by_array_chip_dbID($arraychip_id); + #probe_set cache would be substantially bigger!! + #potentially as many as the probes + + #Just build cache and nest for now,may want to just return ID and lazy load + + #This is a prime target for compound query extension + #Either extend query by default and nest probe_set + #Or lazy load probeset using cache somehow? + #Use persistant probeset cache in ProbeSetAdaptor for dbID/Probe style queries + + my ($probeset); + + if($probe_set_id){ + $probeset = $probe_set_cache{$probe_set_id} || $self->db->get_ProbeSetAdaptor()->fetch_by_dbID($probe_set_id); + } + + if (!$current_dbid || $current_dbid != $probe_id) { + # New probe + + #UC??? or does rearrange handle this? + + $probe = Bio::EnsEMBL::Funcgen::Probe->new + ( + -dbID => $probe_id, + -name => $name, + -array_chip_id => $arraychip_id, + -array => $array, + -probe_set => $probeset, + -length => $probelength, + -class => $class, + -description => $desc, + -adaptor => $self, + ); + push @result, $probe; + $current_dbid = $probe_id; + } else { + # Extend existing probe + # Probe methods depend on preloading of Array objects + $probe->add_array_chip_probename($arraychip_id, $name, $array); + } + } + return \@result; +} + +=head2 store + + Arg [1] : List of Bio::EnsEMBL::Funcgen::Probe objects + Example : $opa->store($probe1, $probe2, $probe3); + Description: Stores given Probe objects in the database. Should only be + called once per probe because no checks are made for duplicates + Sets dbID and adaptor on the objects that it stores. + Returntype : ARRAYREF + Exceptions : Throws if arguments are not Probe objects + Caller : General + Status : At Risk + +=cut + +sub store { + my ($self, @probes) = @_; + + my ($sth, $dbID, @panals, $pd_sth); + my $pd_sql = "INSERT IGNORE into probe_design(probe_id, analysis_id, score, coord_system_id) values(?, ?, ?, ?)"; + my $db = $self->db(); + throw('Must call store with a list of Probe objects') if (scalar @probes == 0); + + #mv all prep statements here? + #or at least main probe insert + + + PROBE: foreach my $probe (@probes) { + undef $dbID; + + if ( !ref $probe || ! $probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) { + throw("Probe must be an Probe object ($probe)"); + } + + if ( $probe->is_stored($db) ) { + warning('Probe [' . $probe->dbID() . '] is already stored in the database'); + next PROBE; + } + + # Get all the arrays this probe is on and check they're all in the database + my %array_hashes; + + foreach my $ac_id (keys %{$probe->{'arrays'}}) { + + if (defined ${$probe->{'arrays'}}{$ac_id}->dbID()) { + #Will this ever work as generally we're creating from scratch and direct access to keys above by passes DB fetch + $array_hashes{$ac_id} = $probe->{'arrays'}{$ac_id}; + } + } + + throw('Probes need attached arrays to be stored in the database') if ( ! %array_hashes ); + + # Insert separate entry (with same oligo_probe_id) in oligo_probe + # for each array/array_chip the probe is on + foreach my $ac_id (keys %array_hashes) { + my $ps_id = (defined $probe->probeset()) ? $probe->probeset()->dbID() : undef; + + foreach my $name(@{$probe->get_all_probenames($array_hashes{$ac_id}->name)}){ + + if (defined $dbID) { # Already stored + + $sth = $self->prepare + ( + "INSERT INTO probe( probe_id, probe_set_id, name, length, array_chip_id, class, description )". + "VALUES (?, ?, ?, ?, ?, ?, ?)" + ); + + $sth->bind_param(1, $dbID, SQL_INTEGER); + $sth->bind_param(2, $ps_id, SQL_INTEGER); + $sth->bind_param(3, $name, SQL_VARCHAR); + $sth->bind_param(4, $probe->length(), SQL_INTEGER); + $sth->bind_param(5, $ac_id, SQL_INTEGER); + $sth->bind_param(6, $probe->class(), SQL_VARCHAR); + $sth->bind_param(7, $probe->description, SQL_VARCHAR); + $sth->execute(); + } + else { + # New probe + $sth = $self->prepare + ( + "INSERT INTO probe( probe_set_id, name, length, array_chip_id, class, description)". + "VALUES (?, ?, ?, ?, ?, ?)" + ); + + $sth->bind_param(1, $ps_id, SQL_INTEGER); + $sth->bind_param(2, $name, SQL_VARCHAR); + $sth->bind_param(3, $probe->length(), SQL_INTEGER); + $sth->bind_param(4, $ac_id, SQL_INTEGER); + $sth->bind_param(5, $probe->class(), SQL_VARCHAR); + $sth->bind_param(6, $probe->description, SQL_VARCHAR); + $sth->execute(); + $dbID = $sth->{'mysql_insertid'}; + $probe->dbID($dbID); + $probe->adaptor($self); + } + } + } + + if(@panals = @{$probe->get_all_design_scores(1)}){#1 is no fetch flag + #we need to check for duplicates here, or can we just ignore them in the insert statement? + #ignoring would be convenient but may lose info about incorrect duplicates + #also not good general practise + #solution would be nest them with a dbid value aswell as score + #use ignore for now and update implementation when we create BaseProbeDesign? + + $pd_sth ||= $self->prepare($pd_sql); + + foreach my $probe_analysis(@panals){ + my ($analysis_id, $score, $cs_id) = @{$probe_analysis}; + $cs_id ||=0;#NULL + + $pd_sth->bind_param(1, $probe->dbID(), SQL_INTEGER); + $pd_sth->bind_param(2, $analysis_id, SQL_INTEGER); + $pd_sth->bind_param(3, $score, SQL_VARCHAR); + $pd_sth->bind_param(4, $cs_id, SQL_INTEGER); + $pd_sth->execute(); + + } + } + } + + return \@probes; +} + + + +=head2 fetch_all_design_scores + + Arg [1] : Bio::EnsEMBL::Funcgen::Probe + Example : my @probe_analyses = @{$pa->fetch_all_design_scores($probe)}; + Description: Fetchs all probe design analysis records as analysis_id, score and coord_system_id + Returntype : ARRAYREF + Exceptions : throws if not passed a valid stored Probe + Caller : General + Status : at risk + +=cut + +sub fetch_all_design_scores{ + my ($self, $probe) = @_; + + if(! ($probe && $probe->isa('Bio::EnsEMBL::Funcgen::Probe') && $probe->dbID())){ + throw('Must pass a valid stored Bio::EnsEMBL::Funcgen::Probe'); + } + + my $sql = 'SELECT analysis_id, score, coord_system_id from probe_design WHERE probe_id='.$probe->dbID.';'; + return @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; +} + + + +=head2 list_dbIDs + + Arg [1] : none + Example : my @feature_ids = @{$opa->list_dbIDs()}; + Description: Gets an array of internal IDs for all Probe objects in the + current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : At Risk + +=cut + +sub list_dbIDs { + my ($self) = @_; + + return $self->_list_dbIDs('probe'); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ProbeFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ProbeFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,874 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::ProbeFeatureAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::ProbeFeatureAdaptor - A database adaptor for fetching and +storing ProbeFeature objects. + +=head1 SYNOPSIS + +my $ofa = $db->get_ProbeFeatureAdaptor(); + +my $features = $ofa->fetch_all_by_Probe($probe); +$features = $ofa->fetch_all_by_Slice_arrayname($slice, 'Array-1', 'Array-2'); + +=head1 DESCRIPTION + +The ProbeFeatureAdaptor is a database adaptor for storing and retrieving +ProbeFeature objects. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::ProbeFeature + +=cut + + +package Bio::EnsEMBL::Funcgen::DBSQL::ProbeFeatureAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Funcgen::ProbeFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +use strict; +use warnings; + +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +use constant TRUE_TABLES => [ [ 'probe_feature', 'pf' ], [ 'probe', 'p' ]]; +use constant TABLES => [ [ 'probe_feature', 'pf' ], [ 'probe', 'p' ]]; + + +my $true_final_clause = ' ORDER BY pf.seq_region_id, pf.seq_region_start, pf.probe_feature_id'; +#Could drop pf.probe_feature_id from the ORDER as is implicit from the group? +#still uses filesort for ac clause +my $final_clause = $true_final_clause; + + +=head2 fetch_all_by_Probe + + Arg [1] : Bio::EnsEMBL::Funcgen::Probe + Example : my $features = $ofa->fetch_all_by_Probe($probe); + Description: Fetchs all features that a given probe creates. + Returntype : Listref of Bio::EnsEMBL::PasteFeature objects + Exceptions : Throws if argument is not a stored Probe object + Caller : Probe->get_all_ProbeFeatures() + Status : At Risk + +=cut + +sub fetch_all_by_Probe { + my ($self, $probe, $coord_systems) = @_; + + if (! (ref($probe) && $probe->isa('Bio::EnsEMBL::Funcgen::Probe'))) { + throw('fetch_all_by_Probe requires a Bio::EnsEMBL::Funcgen::Probe object'); + } + + if ( !defined $probe->dbID() ) { + throw('fetch_all_by_Probe requires a stored Bio::EnsEMBL::Funcgen::Probe object'); + } + + return $self->fetch_all_by_probe_id($probe->dbID, $coord_systems); +} + +=head2 fetch_all_by_probe_id + + Arg [1] : int - Probe dbID + Example : my @features = @{$ofa->fetch_all_by_Probe_id($pid)}; + Description: Fetchs all features that a given probe creates. + Returntype : Listref of Bio::EnsEMBL::PasteFeature objects + Exceptions : Throws if argument not defined + Caller : Probe->get_all_ProbeFeatures() + Status : At Risk + +=cut + +sub fetch_all_by_probe_id { + my ($self, $pid, $coord_systems) = @_; + + if ( ! defined $pid ) { + throw('Need to specify a probe _id'); + } + + my @cs_ids = @{$self->_get_coord_system_ids($coord_systems)}; + push @{$self->TABLES}, (['seq_region', 'sr']); + + my $cs_ids = join(', ', @cs_ids); + my $constraint = " pf.probe_id=$pid AND pf.seq_region_id=sr.seq_region_id and sr.coord_system_id IN ($cs_ids)"; + $final_clause = ' GROUP by pf.probe_feature_id '.$final_clause; + + + my $features = $self->generic_fetch($constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + + return $features; +} + + + +=head2 fetch_all_by_probeset_name + + Arg [1] : String - probeset name + Arg [2] : ARRAYREF (optional) - Bio::EnsEMBL::CoordSystem objects + Example : my $features = $ofa->fetch_all_by_probeset('Set-1'); + Description: Fetchs all features that a given probeset creates. + Returntype : Listref of Bio::EnsEMBL::ProbeFeature objects + Exceptions : Throws if no probeset argument + Caller : General + Status : At Risk - add vendor/class to this? + +=cut + + +sub fetch_all_by_probeset_name { + my ($self, $probeset, $coord_systems) = @_; + + if (! $probeset) { + throw('fetch_all_by_probeset requires a probeset name argument'); + } + + #Restrict to default coord_systems + #Can we remove the need for this by restricting the sr cache to default entries? + my @cs_ids = @{$self->_get_coord_system_ids($coord_systems)}; + push @{$self->TABLES}, (['probe_set', 'ps'], ['seq_region', 'sr']); + + #Need to protect against SQL injection here due to text params + my $cs_ids = join(', ', @cs_ids); + my $constraint = " ps.name=? AND ps.probe_set_id=p.probe_set_id AND pf.seq_region_id=sr.seq_region_id and sr.coord_system_id IN ($cs_ids)"; + $final_clause = ' GROUP by pf.probe_feature_id '.$final_clause; + + $self->bind_param_generic_fetch($probeset, SQL_VARCHAR); + + my $features = $self->generic_fetch($constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $features; +} + + +=head2 fetch_all_by_ProbeSet + + Arg [1] : Bio::EnsEMBL::Funcgen::ProbeSet + Arg [2] : ARRAYREF (optional) - Bio::EnsEMBL::CoordSystem objects + Example : my @features = @{$probe_feature_adaptor->fetch_all_by_ProbeSet($pset)}; + Description: Fetches all ProbeFeatures from a given ProbeSet. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::ProbeFeature objects + Exceptions : Throws if no probeset argument + Caller : General + Status : At Risk - add vendor/class to this? + +=cut + + +sub fetch_all_by_ProbeSet { + my ($self, $pset, $coord_systems) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ProbeSet', $pset); + + + #Restrict to default coord_systems + #Can we remove the need for this by restricting the sr cache to default entries? + my @cs_ids = @{$self->_get_coord_system_ids($coord_systems)}; + push @{$self->TABLES}, (['seq_region', 'sr']); + + my $cs_ids = join(', ', @cs_ids); + my $constraint = ' p.probe_set_id='.$pset->dbID." AND pf.seq_region_id=sr.seq_region_id and sr.coord_system_id IN ($cs_ids)"; + $final_clause = ' GROUP by pf.probe_feature_id '.$final_clause; + + + warn $constraint; + + my $features = $self->generic_fetch($constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $features; +} + + +=head2 fetch_all_by_Slice_ExperimentalChips + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : ARRAY ref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects + Example : my $features = $pfa->fetch_all_by_Slice_ExperimentalChips($slice, \@echips); + Description: Retrieves a list of features on a given slice that are created + by probes from the given ExperimentalChips. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ProbeFeature objects + Exceptions : Throws if args not valid + Caller : + Status : At Risk + +=cut + +sub fetch_all_by_Slice_ExperimentalChips { + my ($self, $slice, $exp_chips) = @_; + + my %nr; + + foreach my $ec(@$exp_chips){ + + throw("Need pass listref of valid Bio::EnsEMBL::Funcgen::ExperimentalChip objects") + if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip"); + + $nr{$ec->array_chip_id()} = 1; + } + + my $constraint = " p.array_chip_id IN (".join(", ", keys %nr).") AND p.probe_id = pf.probe_id "; + + return $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); +} + + + +#Need to Group in the following methods as we may get array_chip +#to probe product if probe is presenton >1 array_chip. +#This will be slowing as GROUP implies order +#Does _objects_from_sth handle this without assuming order? + + + +=head2 fetch_all_by_Slice_array_vendor + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : string - array name e.g. HG-U133A + Arg [3] : string - vendor e.g. AFFY + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_by_Slice_array_vendor($slice, $array_name, $vendor_name); + Description: Retrieves a list of features on a given slice that are created + by probes from the specified array. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ProbeFeature objects + Exceptions : Throws if no array name is provided + #Caller : Slice->get_all_ProbesFeatures() + Status : At Risk + +=cut + +sub fetch_all_by_Slice_array_vendor { + my ($self, $slice, $array, $vendor) = @_; + + if(! ($array && $vendor)){ + throw('You must provide and array name and a vendor name'); + } + + push @{$self->TABLES}, (['array', 'a'], ['array_chip', 'ac']); + + #Need to protect against SQL injection here due to text params + my $constraint = ' a.name=? and a.vendor=? and a.array_id=ac.array_id and ac.array_chip_id=p.array_chip_id'; + $final_clause = ' GROUP by pf.probe_feature_id '.$final_clause; + $self->bind_param_generic_fetch($array, SQL_VARCHAR); + $self->bind_param_generic_fetch($vendor, SQL_VARCHAR); + + my $features = $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $features; +} + + +=head2 fetch_all_by_Slice_Array + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::Funcgen::Array + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $pfa->fetch_all_by_Slice_Array($slice, $array); + Description: Retrieves a list of features on a given slice that are created + by probes from the given Array. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ProbeFeature objects + Exceptions : Throws if no array name is provided + Caller : + Status : At Risk + +=cut + +sub fetch_all_by_Slice_Array { + my ($self, $slice, $array) = @_; + + throw("Need pass a valid stored Bio::EnsEMBL::Funcgen::Array object") + if (! (ref($array) && $array->isa("Bio::EnsEMBL::Funcgen::Array") && $array->dbID)); + + push @{$self->TABLES}, (['array_chip', 'ac']); + my $constraint = ' ac.array_id='.$array->dbID.' and ac.array_chip_id=p.array_chip_id '; + $final_clause = ' GROUP by pf.probe_feature_id '.$final_clause; + + my $features = $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $features; +} + + +=head2 fetch_all_by_Slice_Arrays + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : ARRAYREF of Bio::EnsEMBL::Funcgen::Array objects + Arg [3] : HASHREF - optional params hash e.g. {logic_name => 'AFFY_ProbeTranscriptAlign'} + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $pfa->fetch_all_by_Slice_Arrays($slice, \@arrays); + Description: Retrieves a list of features on a given slice that are created + by probes from the given Arrays. + Returntype : Listref of Bio::EnsEMBL::Funcgen::ProbeFeature objects + Exceptions : Throws if ARRAYREF of arrays is not provided + Caller : + Status : At Risk + +=cut + +sub fetch_all_by_Slice_Arrays{ + my ($self, $slice, $arrays, $params) = @_; + + my $logic_name; + $logic_name = $params->{'logic_name'} if exists ${$params}{'logic_names'}; + + + if(!(ref($arrays) eq 'ARRAY' && @$arrays)){ + throw('Must pass an ARRAYREF of Bio::EnsEMBL::Funcgen::Array objects'); + } + + my $array_ids = join(',', (map $_->dbID, @$arrays)); + + push @{$self->TABLES}, (['array_chip', 'ac']); + my $constraint = " ac.array_id IN ($array_ids) and ac.array_chip_id=p.array_chip_id "; + + $final_clause = ' GROUP by pf.probe_feature_id '.$final_clause; + my $features = $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint, $logic_name); + $self->reset_true_tables; + $final_clause = $true_final_clause; + + return $features; +} + + +=head2 fetch_Iterator_by_Slice_Arrays + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : ARRAYREF of Bio::EnsEMBL::Funcgen::Array objects + Arg [3] : HASHREF - optional params hash e.g. {logic_name => 'AFFY_ProbeTranscriptAlign'} + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $pfa->fetch_Iterator_by_Slice_Arrays($slice, \@arrays); + Description: Retrieves a list of features on a given slice that are created + by probes from the given Array. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : Throws if ARRAYREF of arrays is not provided + Caller : + Status : At Risk + +=cut + +sub fetch_Iterator_by_Slice_Arrays{ + my ($self, $slice, $arrays, $params) = @_; + + + return $self->fetch_Iterator_by_Slice_method + ($self->can('fetch_all_by_Slice_Arrays'), + [$slice, $arrays, $params], + 0,#Slice idx + #500 #chunk length + ); +} + + + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + pf.probe_feature_id pf.seq_region_id + pf.seq_region_start pf.seq_region_end + pf.seq_region_strand pf.probe_id + pf.analysis_id pf.mismatches + pf.cigar_line p.name + p.probe_set_id + ); +} + +=head2 _default_where_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut +sub _default_where_clause { + my $self = shift; + + return 'pf.probe_id = p.probe_id'; +} + +=head2 _final_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an ORDER BY clause. Sorting by probe_feature_id would be + enough to eliminate duplicates, but sorting by location might + make fetching features on a slice faster. + Returntype : String + Exceptions : None + Caller : generic_fetch + Status : At Risk + +=cut + + +sub _final_clause { + return $final_clause; +} + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates ProbeFeature objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::ProbeFeature objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + #For EFG this has to use a dest_slice from core/dnaDB whether specified or not. + #So if it not defined then we need to generate one derived from the species_name and schema_build of the feature we're retrieving. + + + + # This code is ugly because caching is used to improve speed + my ($seq_region_id); + my $sa = $self->db->get_SliceAdaptor(); + $sa = $dest_slice->adaptor->db->get_SliceAdaptor() if($dest_slice);#don't really need this if we're using DNADBSliceAdaptor? + + #Some of this in now probably overkill as we'll always be using the DNADB as the slice DB + #Hence it should always be on the same coord system, unless we're projecting + + my $aa = $self->db->get_AnalysisAdaptor(); + my @features; + my (%analysis_hash, %slice_hash, %sr_name_hash, %sr_cs_hash); + + my ( + $probe_feature_id, $efg_seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $mismatches, + $probe_id, $analysis_id, + $probe_name, $cigar_line, + $probeset_id + ); + $sth->bind_columns( + \$probe_feature_id, \$efg_seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$probe_id, + \$analysis_id, \$mismatches, + \$cigar_line, \$probe_name, + \$probeset_id + ); + + my ($asm_cs, $cmp_cs, $asm_cs_name, $asm_cs_vers ,$cmp_cs_name, $cmp_cs_vers); + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my ($dest_slice_start, $dest_slice_end, $dest_slice_strand); + my ($dest_slice_length, $dest_slice_sr_name); + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + } + + #This has already been done by + #build seq_region_cache based on slice + #$self->build_seq_region_cache_by_Slice($slice); + + + my $last_pfid; + + FEATURE: while ( $sth->fetch() ) { + #Need to build a slice adaptor cache here? + #Would only ever want to do this if we enable mapping between assemblies?? + #Or if we supported the mapping between cs systems for a given schema_build, which would have to be handled by the core api + + #This is only required due to multiple records being returned + #From nr probe entries due to being present on multiple ArrayChips + #Group instead? + next if($last_pfid && ($last_pfid == $probe_feature_id)); + $last_pfid = $probe_feature_id; + + #get core seq_region_id + $seq_region_id = $self->get_core_seq_region_id($efg_seq_region_id); + + if(! $seq_region_id){ + #warn "Cannot get slice for eFG seq_region_id $efg_seq_region_id for probe_feature $probe_feature_id\n". + #"The region you are using is not present in the current dna DB"; + #This can happen as non slice fetches only restrict on cs_id + #Hence for the non-versioned cs's there may be seq_regions which have + #disappeared in the current assembly and hence won't be in the sr cache. + #We could get around this by adding an sr_id IN(all the sr_ids from this DB) + #but this will most likely just slow things down for data which is not present on + #just one assembly + #So preferable to clear old data! + next; + } + + + # Get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= $aa->fetch_by_dbID($analysis_id); + + # Get the slice object + my $slice = $slice_hash{'ID:'.$seq_region_id}; + + if (!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + + + if(! $slice){ + warn "Cannot get slice for seq_region_id $seq_region_id for probe_feature $probe_feature_id"; + } + + + + $slice_hash{'ID:'.$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + #need to check once more here as it may not be in the DB, + #i.e. a supercontig(non-versioned) may have been deleted between releases + + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system if a mapper was provided + if ($mapper) { + + throw("Not yet implmented mapper, check equals are Funcgen calls too!"); + + ($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand) + = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); + + # Skip features that map to gaps or coord system boundaries + next FEATURE if !defined $sr_name; + + # Get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} + ||= $sa->fetch_by_region($cmp_cs_name, $sr_name, undef, undef, undef, $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} + ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # If a destination slice was provided convert the coords + # If the destination slice starts at 1 and is forward strand, nothing needs doing + if ($dest_slice) { + unless ($dest_slice_start == 1 && $dest_slice_strand == 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + # Throw away features off the end of the requested slice + next FEATURE if $seq_region_end < 1 || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_name ne $sr_name ); + + $slice = $dest_slice; + } + + push @features, Bio::EnsEMBL::Funcgen::ProbeFeature->new_fast + ({ + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'analysis' => $analysis,#we should lazy load this from analysis adaptor cache? + 'adaptor' => $self, + 'dbID' => $probe_feature_id, + 'mismatchcount' => $mismatches, + 'cigar_string' => $cigar_line, + 'probe_id' => $probe_id, + #Do these need to be private? + '_probeset_id' => $probeset_id,#Used for linking feature glyphs + '_probe_name' => $probe_name,#?? There can be >1. Is this for array design purposes? + } ); + + + + } + + return \@features; +} + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ProbeFeature objects + Example : $ofa->store(@features); + Description: Stores given ProbeFeature objects in the database. Should only + be called once per feature because no checks are made for + duplicates. Sets dbID and adaptor on the objects that it stores. + Returntype : None + Exceptions : Throws if a list of ProbeFeature objects is not provided or if + an analysis is not attached to any of the objects + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @ofs) = @_; + + if (scalar(@ofs) == 0) { + throw('Must call store with a list of ProbeFeature objects'); + } + + my $sth = $self->prepare(" + INSERT INTO probe_feature ( + seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + probe_id, analysis_id, + mismatches, cigar_line + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + "); + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + FEATURE: foreach my $of (@ofs) { + + if( ! ref $of || ! $of->isa('Bio::EnsEMBL::Funcgen::ProbeFeature') ) { + throw('Feature must be an ProbeFeature object'); + } + + if ( $of->is_stored($db) ) { + warn('ProbeFeature [' . $of->dbID() . '] is already stored in the database'); + next FEATURE; + } + + if ( !defined $of->analysis() ) { + throw('An analysis must be attached to the ProbeFeature objects to be stored.'); + } + + # Store the analysis if it has not been stored yet + if ( !$of->analysis->is_stored($db) ) { + $analysis_adaptor->store( $of->analysis() ); + } + + my $seq_region_id; + ($of, $seq_region_id) = $self->_pre_store($of); + + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $of->start(), SQL_INTEGER); + $sth->bind_param(3, $of->end(), SQL_INTEGER); + $sth->bind_param(4, $of->strand(), SQL_TINYINT); + $sth->bind_param(5, $of->probe_id(), SQL_INTEGER); + $sth->bind_param(6, $of->analysis->dbID(), SQL_INTEGER); + $sth->bind_param(7, $of->mismatchcount(), SQL_TINYINT); + $sth->bind_param(8, $of->cigar_string(), SQL_VARCHAR); + + $sth->execute(); + $of->dbID( $sth->{'mysql_insertid'} ); + $of->adaptor($self); + + + } + + #No need to return this really as the dbID and adaptor has been + #updated in the passed arrays of features via the object + #reference + return \@ofs +} + +=head2 list_dbIDs + + Args : None + Example : my @feature_ids = @{$ofa->list_dbIDs()}; + Description: Gets an array of internal IDs for all ProbeFeature objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs('probe_feature'); +} + + +#Probe cache methods? + +=head2 reassign_feature_to_probe + + Arg[0] : ARRAYREF - feature dbIDs to reassign + Arg[1] : int - probe dbID to reassign to + Example : $ofa->reassign_feature_to_probe(\@fids, $pid); + Description: Update features to link to given probe dbID + Returntype : None + Exceptions : Throws is args not met + Caller : Importer + Status : At Risk + +=cut + +sub reassign_feature_to_probe{ + my ($self, $fids_ref, $pid) = @_; + + if(! @$fids_ref || ! $pid){ + throw('Need to pass a ref to an array of feature ids and a probe id to reassign to'); + } + + my $cmd = 'UPDATE probe_feature SET probe_id='.$pid.' WHERE probe_feature_id IN ('.join(',', @$fids_ref).')'; + $self->db->dbc->do($cmd); + + return; +} + +=head2 delete_features + + Arg[0] : ARRAYREF - feature dbIDs to reassign + Example : $pfa->delete_feature(\@fids); + Description: Deletes feature with given probe_feature_ids + Returntype : None + Exceptions : Throws if not arg defines + Caller : Importer + Status : At Risk + +=cut + + +#This does not rollback associated xrefs! + +sub delete_features{ + my ($self, $fids_ref) = @_; + + if(! @$fids_ref){ + throw('Need to pass a ref to an array of feature ids'); + } + + my $cmd = 'DELETE from probe_feature WHERE probe_feature_id IN ('.join(',', @$fids_ref).')'; + $self->db->dbc->do($cmd); + + return; +} + + +=head2 count_probe_features_by_probe_id + + Arg [1] : string/int - id to count + Example : my $probe_feature_count = $pfa->count_features_by_probe_id($probe_id); + Description: Returns a count of ProbeFeatures for a given probe id + Returntype : string/int - count of features + Exceptions : None + Caller : FeatureAdaptors + Status : At risk + +=cut + + +sub count_probe_features_by_probe_id { + my ($self, $probe_id) = @_; + + return $self->count_features_by_field_id('probe_id', $probe_id); +} + +### DEPRECATED METHODS ### + +sub fetch_all_by_probeset { #deprecated in v68 + my ($self, @args) = @_; + + deprecate('This method is deprecated, please use fetch_all_by_probeset_name or fetch_all_by_ProbeSet'); + + return $self->fetch_all_by_probeset_name(@args); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ProbeSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ProbeSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,399 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::ProbeSetAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::DBSQL::ProbeSetAdaptor - A database adaptor for fetching and +storing ProbeSet objects. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + use Bio::EnsEMBL::Funcgen::ProbeSet; + + + my $reg = Bio::EnsEMBL::Registry->load_adaptors_from_db(-host => 'ensembldb.ensembl.org', + -user => 'anonymous'); + + + my $pset_a = Bio::EnsEMBL::Resgitry->get_adpator($species, 'funcgen', 'ProbeSet'); + + #Fetching a probeset by name + my $probeset = $pset_a->fetch_by_array_probeset_name('Array-1', 'ProbeSet-1'); + + ### Fetching probeset with transcript annotations ### + # Generated by the Ensembl array mapping pipeline + + my @probesets = @{$pset_a->fetch_all_by_linked_Transcript($transcript)}; + + #Note: Associated linkage annotation is stored in the associated DBEntries + +=head1 DESCRIPTION + +The ProbeSetAdaptor is a database adaptor for storing and retrieving +ProbeSet objects. + +=head1 SEE ALSO + + Bio::EnsEMBL::Funcgen::ProbeSet + ensembl-functgenomics/scripts/examples/microarray_annotation_example.pl + + Or for details on how to run the array mapping pipeline see: + ensembl-functgenomics/docs/array_mapping.txt + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ProbeSetAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::ProbeSet; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); + +#Query extension stuff +use constant TRUE_TABLES => [[ 'probe_set', 'ps' ]]; +use constant TABLES => [[ 'probe_set', 'ps' ]]; + + +#Not currently using final clause as we don't group by default + +=head2 fetch_by_array_probeset_name + + Arg [1] : string - name of array + Arg [2] : string - name of probeset + Example : my $probeset = $opsa->fetch_by_array_probeset_name('Array-1', 'Probeset-1'); + Description: Returns a probeset given the array name and probeset name + This will uniquely define a probeset. Only one + probeset is ever returned. + Returntype : Bio::EnsEMBL::ProbeSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_by_array_probeset_name{ + my ($self, $array_name, $probeset_name) = @_; + + if(! ($array_name && $probeset_name)){ + throw('Must provide array_name and probeset_name arguments'); + } + + #Extend query tables + push @{$self->TABLES}, (['probe', 'p'], ['array_chip', 'ac'], ['array', 'a']); + + my $constraint = 'ps.name= ? AND ps.probe_set_id=p.probe_set_id AND p.array_chip_id=ac.array_chip_id AND ac.array_id=a.array_id AND a.name= ? GROUP by ps.probe_set_id'; + + #bind params as we have unsafe string args + $self->bind_param_generic_fetch($probeset_name, SQL_VARCHAR); + $self->bind_param_generic_fetch($array_name, SQL_VARCHAR); + + my $pset = $self->generic_fetch($constraint)->[0]; + $self->reset_true_tables; + + return $pset; +} + + + +=head2 fetch_all_by_name + + Arg [1] : string - probe set name + Example : my @probes = @{$pdaa->fetch_all_by_name('ProbeSet1')}; + Description: Convenience method to re-instate the functionality of + $core_dbentry_adpator->fetch_all_by_external_name('probeset_name'); + WARNING: This may not be the probeset you are expecting as + probeset names are not unqiue across arrays and vendors. + These should ideally be validated using the attached array + information or alternatively use fetch_by_array_probeset_name + Returns a probe with the given name. + Returntype : Arrayref + Exceptions : Throws if name not passed + Caller : General + Status : At Risk + +=cut + + +sub fetch_all_by_name{ + my ($self, $name) = @_; + + throw('Must provide a probeset name argument') if ! defined $name; + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + + return $self->generic_fetch('ps.name=?'); +} + + +=head2 fetch_by_ProbeFeature + + Arg [1] : Bio::EnsEMBL::ProbeFeature + Example : my $probeset = $opsa->fetch_by_ProbeFeature($feature); + Description: Returns the probeset that created a particular feature. + Returntype : Bio::EnsEMBL::ProbeSet + Exceptions : Throws if argument is not a Bio::EnsEMBL::ProbeFeature object + Caller : General + Status : At Risk + +=cut + +#This is a good candidate for complex query extension +#As we will most likely want the probe also if we are fetching the ProbeSet +#For a given feature. +#We could also set the probe in the ProbeFeature object, so we don't re-query +#should the user use ProbeFeature->get_probe +#This is also a case for passing the array name to automatically set +#the probe name? As we will likely know the array name beforehand. + +#Could we also bring back annotations for this Probe/ProbeSet? +# + +sub fetch_by_ProbeFeature { + my ($self, $pfeature) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ProbeFeature', $pfeature); + + #Extend query + push @{$self->TABLES}, (['probe', 'p']); + my $pset = $self->generic_fetch('p.probe_id='.$pfeature->probe_id.' and p.probe_set_id=ps.probe_set_id GROUP by ps.probe_set_id')->[0]; + $self->reset_true_tables; + + return $pset; +} + + +=head2 fetch_all_by_Array + +Arg [1] : Bio::EnsEMBL::Funcgen::Array +Example : my @probesets = @{$pset_adaptor->fetch_all_by_Array($array)}; +Description: Fetch all ProbeSets on a particular array. +Returntype : Listref of Bio::EnsEMBL::ProbeSet objects. +Exceptions : throws if arg is not valid or stored +Caller : General +Status : At Risk + +=cut + +#This is quicker than query extension? + +sub fetch_all_by_Array { + my $self = shift; + my $array = shift; + + if(! (ref($array) && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())){ + throw('Need to pass a valid stored Bio::EnsEMBL::Funcgen::Array'); + } + + #get all array_chip_ids, for array and do a subselect statement with generic fetch + my $constraint = ( " ps.probe_set_id in" + ." ( SELECT distinct(p.probe_set_id)" + ." from probe p where" + ." p.array_chip_id IN (".join(",", @{$array->get_array_chip_ids()}).")" + ." )" ); + + return $self->generic_fetch($constraint); +} + + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return @{$self->TABLES}; +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + #remove xref_id and use xref tables + return qw( ps.probe_set_id ps.name ps.size ps.family); + +} + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates ProbeSet objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::ProbeSet objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@result, $current_dbid, $probeset_id, $name, $size, $family); + my ($array, %array_cache); + + $sth->bind_columns( \$probeset_id, \$name, \$size, \$family); + + + #do not have array_chip adaptor + #use array adaptor directly + #how are we going ot handle the cache here????? + + my $probeset; + while ( $sth->fetch() ) { + #$array = $array_cache{$array_id} || $self->db->get_ArrayAdaptor()->fetch_by_dbID($array_id); + + #This is nesting array object in probeset! + #$array = $array_cache{$arraychip_id} || $self->db->get_ArrayAdaptor()->fetch_by_array_chip_dbID($arraychip_id); + + #Is this required? or should we lazy load this? + #Should we also do the same for probe i.e. nest or lazy load probeset + #Setting here prevents, multiple queries, but if we store the array cache in the adaptor we can overcome this + #danger of eating memory here, but it's onld the same as would be used for generating all the probesets + #what about clearing the cache? + #also as multiple array_chips map to same array, cache would be redundant + #need to store only once and reference. + #have array_cache and arraychip_map + #arraychip_map would give array_id which would be key in array cache + #This is kinda reinventing the wheel, but reducing queries and redundancy of global cache + #cache would never be populated if method not called + #there for reducing calls and memory, increasing speed of generation/initation + #if method were called + #would slightly slow down processing, and would slightly increase memory as cache(small as non-redundant) + #and map hashes would persist + + #Do we even need this???? + + #warn("Can we lazy load the arrays from a global cache, which is itself lazy loaded and non-redundant?\n"); + + + #this current id stuff is due to lack of probeset table in core + #if (!$current_dbid || $current_dbid != $probeset_id) { + + # New probeset + $probeset = Bio::EnsEMBL::Funcgen::ProbeSet->new + ( + -dbID => $probeset_id, + -name => $name, + -size => $size, + # -array => $array, + -family => $family, + -adaptor => $self, + ); + push @result, $probeset; + + #$current_dbid = $probeset_id; + #} else { + # # Extend existing probe + # $probe->add_Array_probename($array, $name); + #} + } + return \@result; +} + +=head2 store + + Arg [1] : List of Bio::EnsEMBL::Funcgen::ProbeSet objects + Example : $opa->store($probeset1, $probeset2, $probeset3); + Description: Stores given ProbeSet objects in the database. Should only be + called once per probe because no checks are made for duplicates.??? It certainly looks like there is :/ + Sets dbID and adaptor on the objects that it stores. + Returntype : None + Exceptions : Throws if arguments are not Probe objects + Caller : General + Status : At Risk + +=cut + +sub store { + my ($self, @probesets) = @_; + + my ($sth, $array); + + if (scalar @probesets == 0) { + throw('Must call store with a list of Probe objects'); + } + + my $db = $self->db(); + + PROBESET: foreach my $probeset (@probesets) { + + if ( !ref $probeset || !$probeset->isa('Bio::EnsEMBL::Funcgen::ProbeSet') ) { + throw('ProbeSet must be an ProbeSet object'); + } + + if ( $probeset->is_stored($db) ) { + warning('ProbeSet [' . $probeset->dbID() . '] is already stored in the database'); + next PROBESET; + } + + $sth = $self->prepare(" + INSERT INTO probe_set + (name, size, family) + VALUES (?, ?, ?) + "); + $sth->bind_param(1, $probeset->name(), SQL_VARCHAR); + $sth->bind_param(2, $probeset->size(), SQL_INTEGER); + $sth->bind_param(3, $probeset->family(), SQL_VARCHAR); + + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $probeset->dbID($dbID); + $probeset->adaptor($self); + } + + return \@probesets; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/RegulatoryFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/RegulatoryFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,984 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::RegulatoryFeatureAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::RegulatoryFeatureAdaptor + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Funcgen::RegulatoryFeature; + +my $reg = Bio::EnsEMBL::Registry->load_adaptors_from_db(-host => 'ensembldb.ensembl.org', + -user => 'anonymous'); + +my $regfeat_adaptor = $reg->get_adaptor($species, 'funcgen', 'RegulatoryFeature'); + + +#Fetch MultiCell RegulatoryFeatures +my @features = @{$regfeat_adaptor->fetch_all_by_Slice($slice)}; + +#Fetch cell type specific RegulatoryFeatures +my @ctype_features = @{$regfeat_adaptor->fetch_all_by_Slice_FeatureSets($slice, [$ctype_fset1, $ctype_fset2])}; + +#Fetch all cell type RegulatoryFeatures for a given stable ID +my @ctype_features = @{$regfeat_adaptor->fetch_all_by_stable_ID('ENSR00001348194')}; + + +=head1 DESCRIPTION + +The RegulatoryFeatureAdaptor is a database adaptor for storing and retrieving +RegulatoryFeature objects. The FeatureSet class provides convenient wrapper +methods to the Slice functionality within this adaptor. + +=cut + + +package Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; + +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::RegulatoryFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor; + +use base qw(Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor); #@ISA +#change to parent with perl 5.10 + +my %valid_attribute_features = ( + 'Bio::EnsEMBL::Funcgen::MotifFeature' => 'motif', + 'Bio::EnsEMBL::Funcgen::AnnotatedFeature' => 'annotated', + ); + +=head2 fetch_all + + Arg [1] : optional - Bio::EnsEMBL::FeatureSet + Example : my $rfs = $rf_adaptor->fetch_all(); + Description: Over-ride generic fetch_all method to return only MultiCell features by default. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::RegulatoryFeature objects + Exceptions : none + Caller : general + Status : At risk + +=cut + +#Change to Iterator? + +sub fetch_all{ + my ($self, $fset) = @_; + + if($fset){ + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset) + }else{ + $fset = $self->_get_current_FeatureSet; + } + + my $constraint = ' rf.feature_set_id='.$fset->dbID; + + + return $self->fetch_all($constraint); +} + + + +=head2 _get_current_FeatureSet + + Example : my $regf_featureset = $self->_get_current_FeatureSet; + Description: Convenience method to get and test the current + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet + Exceptions : Throws is FeatureSet is not available + Caller : general + Status : at risk - change to _get_core_FeatureSet? + +=cut + +sub _get_current_FeatureSet{ + my $self = shift; + + + if(! $self->{'multicell_set'}){ + $self->{'multicell_set'} = $self->db->get_FeatureSetAdaptor->fetch_by_name('RegulatoryFeatures:MultiCell'); + + if(! $self->{'multicell_set'}){ + warn('Could not retrieve current default RegulatoryFeatures:MuiltiCell FeatureSet'); + } + } + + return $self->{'multicell_set'}; +} + + +=head2 fetch_by_stable_id + + Arg [1] : String $stable_id - The stable id of the regulatory feature to retrieve + Arg [2] : optional - Bio::EnsEMBL::FeatureSet + Example : my $rf = $rf_adaptor->fetch_by_stable_id('ENSR00000309301'); + Description: Retrieves a regulatory feature via its stable id. + Returntype : Bio::EnsEMBL::Funcgen::RegulatoryFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_stable_id { + my ($self, $stable_id, $fset) = @_; + + $fset ||= $self->_get_current_FeatureSet; + + + #remove this ternary operatory when changes to fetch_all_by_stable_id_FeatureSets done? + #No, this should not die when no data is present! + return (defined $fset) ? $self->fetch_all_by_stable_id_FeatureSets($stable_id, $fset)->[0] : undef; +} + +=head2 fetch_all_by_stable_id_FeatureSets + + Arg [1] : String $stable_id - The stable id of the regulatory feature to retrieve + Arg [2] : optional list of FeatureSets + Example : my $rf = $rf_adaptor->fetch_by_stable_id('ENSR00000309301'); + Description: Retrieves a regulatory feature via its stable id. + Returntype : Array ref of Bio::EnsEMBL::Funcgen::RegulatoryFeature objects + Exceptions : throws if no stable ID provided or FeatureSets aren't valid + warns if not FeatureSets defined + Caller : general + Status : at risk + +=cut + + +#change this to fetch_all_by_stable_id and remove method of same name below or vice versa? +#check usage of this method first + + +sub fetch_all_by_stable_id_FeatureSets { + my ($self, $stable_id, @fsets) = @_; + + #Change this to arrayref of fsets + + #Standard implementation exposes logic name as a parameter + #But it will always be RegulatoryFeature/Build + + throw('Must provide a stable ID') if ! defined $stable_id; + + $stable_id =~ s/ENS[A-Z]*R0*//; + + + #Need to test stable_id here as there is a chance that this argument has been omitted and we are dealing with + #a feature set object + $self->bind_param_generic_fetch($stable_id, SQL_INTEGER); + my $constraint = 'rf.stable_id=?'; + + + #Change this to use _generate_feature_set_id_clause + + if(@fsets){ + + #need to catch empty array and invalid FeatureSets + if(scalar(@fsets == 0)){ + warning("You have not specified any FeatureSets to fetch the RegulatoryFeature from, defaulting to all"); + } + else{ + + #validate FeatureSets + #Need to check $fset->feature_class eq 'regulatory' too? + map { $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $_)} @fsets; + + if(scalar(@fsets) == 1){ + $constraint .= ' and rf.feature_set_id=?'; + $self->bind_param_generic_fetch($fsets[0]->dbID, SQL_INTEGER); + }else{ + #How can we bind param this? + + my @bind_slots; + + foreach my $dbid(map $_->dbID, @fsets){ + push @bind_slots, '?'; + $self->bind_param_generic_fetch($dbid, SQL_INTEGER); + } + + $constraint .= ' AND rf.feature_set_id IN ('.join(', ', @bind_slots).')'; + } + } + } + + return $self->generic_fetch($constraint); +} + + + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + [ 'regulatory_feature', 'rf' ], + [ 'feature_set', 'fs' ], + [ 'regulatory_attribute', 'ra' ], + ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + rf.regulatory_feature_id rf.seq_region_id + rf.seq_region_start rf.seq_region_end + rf.seq_region_strand rf.bound_seq_region_start + rf.bound_seq_region_end rf.display_label + rf.feature_type_id rf.feature_set_id + rf.stable_id rf.binary_string + rf.projected ra.attribute_feature_id + ra.attribute_feature_table + ); +} + + +=head2 _left_join + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _left_join { + my $self = shift; + + return (['regulatory_attribute', 'rf.regulatory_feature_id = ra.regulatory_feature_id']); +} + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates RegulatoryFeature objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::RegulatoryFeature objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + #For EFG this has to use a dest_slice from core/dnaDB whether specified or not. + #So if it not defined then we need to generate one derived from the species_name and schema_build of the feature we're retrieving. + # This code is ugly because caching is used to improve speed + + my ($sa, $reg_feat);#, $old_cs_id); + $sa = ($dest_slice) ? $dest_slice->adaptor->db->get_SliceAdaptor() : $self->db->get_SliceAdaptor(); + + #Some of this in now probably overkill as we'll always be using the DNADB as the slice DB + #Hence it should always be on the same coord system + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $fset_adaptor = $self->db->get_FeatureSetAdaptor(); + my (@features, $seq_region_id); + my (%fset_hash, %slice_hash, %sr_name_hash, %sr_cs_hash, %ftype_hash); + my $skip_feature = 0; + + my %feature_adaptors = ( + 'annotated' => $self->db->get_AnnotatedFeatureAdaptor, + 'motif' => $self->db->get_MotifFeatureAdaptor, + #external + ); + + my $stable_id_prefix = $self->db->stable_id_prefix; + + my ( + $dbID, $efg_seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $bound_seq_region_start, + $bound_seq_region_end, $display_label, + $ftype_id, $fset_id, + $stable_id, $attr_id, + $attr_type, $bin_string, + $projected + ); + + $sth->bind_columns( + \$dbID, \$efg_seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$bound_seq_region_start, + \$bound_seq_region_end, \$display_label, + \$ftype_id, \$fset_id, + \$stable_id, \$bin_string, + \$projected, \$attr_id, + \$attr_type + ); + + my ($asm_cs, $cmp_cs, $asm_cs_name); + my ($asm_cs_vers, $cmp_cs_name, $cmp_cs_vers); + + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my ($dest_slice_start, $dest_slice_end); + my ($dest_slice_strand, $dest_slice_length, $dest_slice_sr_name); + + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + } + + my $slice; + my %reg_attrs = ( + annotated => {}, + motif => {}, + #external + ); + + #Set 'unique' set of feature_set_ids + my @fset_ids; + + if($self->{params_hash}{unique}){ + #FeatureSet have been pre-validated in the fetch method + @fset_ids = map $_->dbID, @{$self->{params_hash}{feature_sets}}; + } + + + my $skip_stable_id = 0;#stable IDs are never 0 + my $no_skip_stable_id = 0; + my @other_rf_ids; + + FEATURE: while ( $sth->fetch() ) { + + #Handle non-unique skipping first + + if ( $stable_id && + ($skip_stable_id == $stable_id) ) { + #Faster for queries which need to skip if we have this first + next; + } elsif (@fset_ids) { + #have no_skip_stable_id too + #so we don't keep doing _fetch_other_feature_set_ids_by_stable_feature_set_ids + #for ID s we have already checked + + if ($no_skip_stable_id != $stable_id) { + @other_rf_ids = @{$self->_fetch_other_dbIDs_by_stable_feature_set_ids + ($stable_id, + \@fset_ids, + { + include_projected => $self->{params_hash}{include_projected}})}; + + if (@other_rf_ids) { + $skip_stable_id = $stable_id; + #warn "skipping\n"; + next; + } else { + $no_skip_stable_id = $stable_id; + } + } + #else don't skip this stable ID + } + + + + if (! $reg_feat || ($reg_feat->dbID != $dbID)) { + + if ($skip_feature) { + undef $reg_feat; #so we don't duplicate the push for the feature previous to the skip feature + $skip_feature = 0; + } + + if ($reg_feat) { #Set the previous attr cache and reset + $reg_feat->attribute_cache(\%reg_attrs); + push @features, $reg_feat; + + + %reg_attrs = ( + annotated => {}, + motif => {}, + #external + ); + } + + #Would need to build a slice adaptor cache here to enable mapping between assemblies + #Or if mapping between cs systems for a given schema_build + #which would have to be handled by the core api + + #get core seq_region_id + $seq_region_id = $self->get_core_seq_region_id($efg_seq_region_id); + + if (! $seq_region_id) { + warn "Cannot get slice for eFG seq_region_id $efg_seq_region_id\n". + "The region you are using is not present in the current dna DB"; + next; + } + + #if($old_cs_id && ($old_cs_id+ != $cs_id)){ + # throw("More than one coord_system for feature query, need to implement SliceAdaptor hash?"); + #} + #$old_cs_id = $cs_id; + #Need to make sure we are restricting calls to Experiment and channel(i.e. the same coord_system_id) + + #Get the FeatureSet object + $fset_hash{$fset_id} = $fset_adaptor->fetch_by_dbID($fset_id) if(! exists $fset_hash{$fset_id}); + + #Get the FeatureType object + $ftype_hash{$ftype_id} = $ft_adaptor->fetch_by_dbID($ftype_id) if(! exists $ftype_hash{$ftype_id}); + + # Get the slice object + $slice = $slice_hash{'ID:'.$seq_region_id}; + + if (!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{'ID:'.$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system if a mapper was provided + if ($mapper) { + + throw("Not yet implmented mapper, check equals are Funcgen calls too!"); + + ($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand) + = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); + + # Skip features that map to gaps or coord system boundaries + if (! defined $sr_name) { + $skip_feature = 1; + next FEATURE; + } + + # Get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} + ||= $sa->fetch_by_region($cmp_cs_name, $sr_name, undef, undef, undef, $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} + ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # If a destination slice was provided convert the coords + # If the destination slice starts at 1 and is forward strand, nothing needs doing + if ($dest_slice) { + + unless ($dest_slice_start == 1 && $dest_slice_strand == 1) { + + #can remove the if $bound_seq_region_start/end once we have updated all reg feature entries and store API + + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + + #if as we never have a seq_region start of 0; + $bound_seq_region_start = $bound_seq_region_start - $dest_slice_start + 1 if $bound_seq_region_start; + $bound_seq_region_end = $bound_seq_region_end - $dest_slice_start + 1 if $bound_seq_region_end; + } else { + my $tmp_seq_region_start = $seq_region_start; + my $tmp_bound_seq_region_start = $bound_seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $bound_seq_region_start = $dest_slice_end - $bound_seq_region_end + 1 if $bound_seq_region_end; + $bound_seq_region_end = $dest_slice_end - $tmp_bound_seq_region_start + 1 if $bound_seq_region_start; + $seq_region_strand *= -1; + } + } + + # Throw away features off the end of the requested slice + #Do not account for bounds here. + if ($seq_region_end < 1 || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_name ne $sr_name )) { + $skip_feature = 1; + next FEATURE; + } + + $slice = $dest_slice; + } + + + #This stops un init warning when sid is absent for sid mapping + my $sid = (defined $stable_id) ? sprintf($stable_id_prefix."%011d", $stable_id) : undef; + + $reg_feat = Bio::EnsEMBL::Funcgen::RegulatoryFeature->new_fast + ({ + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'bound_start' => $bound_seq_region_start, + 'bound_end' => $bound_seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'analysis' => $fset_hash{$fset_id}->analysis(), + 'adaptor' => $self, + 'dbID' => $dbID, + 'display_label' => $display_label, + 'binary_string' => $bin_string, + 'projected' => $projected, + 'set' => $fset_hash{$fset_id}, + 'feature_type' => $ftype_hash{$ftype_id}, + 'stable_id' => $sid, + }); + + } + + + #populate attributes cache + if (defined $attr_id && ! $skip_feature) { + + $reg_attrs{$attr_type}->{$attr_id} = undef; + } + } + + #handle last record + if ($reg_feat) { + + $reg_feat->attribute_cache(\%reg_attrs); + push @features, $reg_feat; + } + + + #reset params hash + $self->{params_hash} = undef; + + return \@features; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::RegulatoryFeature objects + Example : $ofa->store(@features); + Description: Stores given RegulatoryFeature objects in the database. Should only + be called once per feature because no checks are made for + duplicates. Sets dbID and adaptor on the objects that it stores. + Returntype : Listref of stored RegulatoryFeatures + Exceptions : Throws if a list of RegulatoryFeature objects is not provided or if + the Analysis, CellType and FeatureType objects are not attached or stored. + Throws if analysis of set and feature do not match + Warns if RegulatoryFeature already stored in DB and skips store. + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @rfs) = @_; + + if (scalar(@rfs) == 0) { + throw('Must call store with a list of RegulatoryFeature objects'); + } + + my $sth = $self->prepare(" + INSERT INTO regulatory_feature ( + seq_region_id, seq_region_start, + seq_region_end, bound_seq_region_start, + bound_seq_region_end, seq_region_strand, + display_label, feature_type_id, + feature_set_id, stable_id, + binary_string, projected + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); + + my $sth2 = $self->prepare(" + INSERT INTO regulatory_attribute ( + regulatory_feature_id, attribute_feature_id, attribute_feature_table + ) VALUES (?, ?, ?)"); + + my $db = $self->db(); + + foreach my $rf (@rfs) { + + if( ! ref $rf || ! $rf->isa('Bio::EnsEMBL::Funcgen::RegulatoryFeature') ) { + throw('Feature must be an RegulatoryFeature object'); + } + + if ( $rf->is_stored($db) ) { + #does not accomodate adding Feature to >1 feature_set + warning('Skipping RegulatoryFeature [' . $rf->dbID() . '] as it is already stored in the database'); + next; + } + + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $rf->feature_set); + + + my ($sid, $seq_region_id); + ($rf, $seq_region_id) = $self->_pre_store($rf); + $rf->adaptor($self);#Set adaptor first to allow attr feature retreival for bounds + #This is only required when storing + + + #Actually never happens, as we always assign stable_ids after storing + ($sid = $rf->stable_id) =~ s/ENS[A-Z]*R0*// if defined $rf->stable_id; + + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $rf->start(), SQL_INTEGER); + $sth->bind_param(3, $rf->end(), SQL_INTEGER); + $sth->bind_param(4, $rf->bound_start(), SQL_INTEGER); + $sth->bind_param(5, $rf->bound_end(), SQL_INTEGER); + $sth->bind_param(6, $rf->strand(), SQL_TINYINT); + $sth->bind_param(7, $rf->{'display_label'}, SQL_VARCHAR);#Direct access so we always store the binary string + $sth->bind_param(8, $rf->feature_type->dbID(), SQL_INTEGER); + $sth->bind_param(9, $rf->feature_set->dbID(), SQL_INTEGER); + $sth->bind_param(10, $sid, SQL_INTEGER); + $sth->bind_param(11, $rf->binary_string, SQL_VARCHAR); + $sth->bind_param(12, $rf->is_projected, SQL_BOOLEAN); + + #Store and set dbID + $sth->execute(); + $rf->dbID( $sth->{'mysql_insertid'} ); + + + #Store regulatory_attributes + #Attr cache now only holds dbids not objs + + my %attrs = %{$rf->attribute_cache}; + + foreach my $fclass(keys %attrs){ + + foreach my $attr_id(keys %{$attrs{$fclass}}){ + $sth2->bind_param(1, $rf->dbID, SQL_INTEGER); + $sth2->bind_param(2, $attr_id, SQL_INTEGER); + $sth2->bind_param(3, $fclass, SQL_VARCHAR); + $sth2->execute(); + } + } + } + + return \@rfs; +} + + + + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::Slice + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $regf_adaptor->fetch_all_by_Slice($slice); + Description: Retrieves a list of features on a given slice, specific for the current + default RegulatoryFeature set. + Returntype : Listref of Bio::EnsEMBL::RegulatoryFeature objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice, $fset) = @_; + + $fset ||= $self->_get_current_FeatureSet; #This get the MultiCell sets + + #Ternary operator essential here, as we don't want to die if there is no data! + return (defined $fset) ? $self->fetch_all_by_Slice_FeatureSets($slice, [$fset]) : undef; +} + + +=head2 fetch_all_by_Slice_FeatureSets + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Arrayref of Bio::EnsEMBL::FeatureSet objects + Arg [3] : optional HASHREF - params: + { + unique => 0|1, #Get RegulatoryFeatures unique to these FeatureSets + include_projected => 0|1, #Consider projected features + } + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_FeatureSets($slice, \@fsets); + Description: Simple wrapper to set unique flag. + Retrieves a list of features on a given slice, specific for a given list of FeatureSets. + Returntype : Listref of Bio::EnsEMBL::RegulatoryFeature objects + Exceptions : Throws if params hash is not valid or keys are not recognised + Caller : General + Status : At Risk + +=cut + +#These FeatureSets are not optional +#otherwise could have just re-implemented fetch_all_by_Slice +#with extra optional fsets args + +#To implement unique flag, would either need to self join on stable ID to compare counts +#or set flag(would have to be array of fset IDs if we want more than one fset) +#to do look up in objs_from_sth method +#Would need to add wrapper for fetch_all_by_Slice_FeatureSets here + +#This will slow down all regfeat fetches +#how can we do this without impacting obj_from_sth performance? +#is this worth worrying about? It's a fairly fast track anyway? +#could have method vars/code refs in obj_from_sth, but still adding method call instead of var test. +#impact likely negligable + +#Could pre_store this in DB? 'Impossible' for 1 fset without doing self join comparison? + +#We really need to expose the constraint here to enable more complex combined queries +#i.e. projected, FeatureType filter etc + +#Need to do full re-implementation here rather than wrapper as we can't +#pass a preformed constraint to a method which might be exposed directly by the website +#as this may enable someone to inject SQL via a URL. +#Also can't handle unique/include_projected in generic SetFeatureAdaptor method + +sub fetch_all_by_Slice_FeatureSets { + my ($self, $slice, $fsets, $params_hash) = @_; + + if($params_hash){ + + if(ref($params_hash) eq 'HASH'){ + #Really only need feature_sets as unique is implicit at present + #define unique for clarify + $self->{params_hash}{unique} = $params_hash->{unique}; + $self->{params_hash}{feature_sets} = $fsets; + $self->{params_hash}{include_projected} = $params_hash->{include_projected}; + #include_projected as we also need to constrain + #_fetch_other_dbIDs_by_stable_feature_set_ids + #Waht happens when we have include_projected on it's own and set to 0 + } + else{ + throw("The params_hash argument must be a valid HASHREF, not:\t".ref($params_hash)); + } + } + + + my $constraint = 'rf.feature_set_id '.$self->_generate_feature_set_id_clause($fsets); + $params_hash ||= {}; #To avoid deref fail below + my $inc_proj = $params_hash->{include_projected}; + + if( ($params_hash->{unique} && ( ! $inc_proj)) || + (defined $inc_proj && ($inc_proj == 0) )){ + $constraint .= ' AND '. ' rf.projected=0 '; + } + + #explicit super call, just in case we ever re-implement in here + return $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); +} + + + + +sub _fetch_other_dbIDs_by_stable_feature_set_ids{ + my ($self, $stable_id_int, $fset_ids, $params_hash) = @_; + #Args and originating objects have been prevalidated + + #($stable_id = $stable_id) =~ s/^[A-Z0]+//; + #Do this in caller, as we already have the stripped ID from _objs_from_sth + + my @fset_ids = @$fset_ids; #Deref here as we are pushing, and don't want modify in the caller + my $projected_constraint = ''; + + #This is internal, so we can assume $param_hash is a valid + #HASHREF if defined. + $params_hash ||= {};#quick way to prevent deref fail below + + if(! $params_hash->{include_projected}){ + $projected_constraint = ' AND projected=0 '; + } + + #Handle MultiCell set, as this will always be present + if(! $params_hash->{include_multicell}){ + push @fset_ids, $self->_get_current_FeatureSet->dbID; + } + + + my @other_rf_ids = @{$self->db->dbc->db_handle->selectcol_arrayref + ('SELECT regulatory_feature_id from regulatory_feature '. + "WHERE stable_id=${stable_id_int} ".$projected_constraint. + ' AND feature_set_id not in('.join(',', @fset_ids).')')}; + + return \@other_rf_ids; +} + + + + +=head2 fetch_all_by_stable_ID + + Arg [1] : string - stable ID e.g. ENSR00000000001 + Example : my @cell_type_regfs = @{$regf_adaptor->fetch_all_by_stable_ID('ENSR00000000001'); + Description: Retrieves a list of RegulatoryFeatures with associated stable ID. One for each CellType or + 'core' RegulatoryFeature set which contains the specified stable ID. + Returntype : Listref of Bio::EnsEMBL::RegulatoryFeature objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_stable_ID { + my ($self, $stable_id) = @_; + + #Add fsets here? + + throw('You must provide a stable_id argument') if ! $stable_id; + $stable_id =~ s/[A-Z0]+//; + + $self->bind_param_generic_fetch($stable_id, SQL_INTEGER); + return $self->generic_fetch('rf.stable_id=?'); +} + +=head2 fetch_all_by_attribute_feature + + Arg [1] : Bio::Ensembl::Funcgen::AnnotatedFeature or MotifFeature + Example : my @regfs = @{$regf_adaptor->fetch_all_by_attribute_feature($motif_feature)}; + Description: Retrieves a list of RegulatoryFeatures which contain the givven attribute feature. + Returntype : Listref of Bio::EnsEMBL::RegulatoryFeature objects + Exceptions : Throws is argument not valid + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_attribute_feature { + my ($self, $attr_feat) = @_; + + #add fsets here as optional arg + + my $attr_class = ref($attr_feat); + + if(! $valid_attribute_features{$attr_class}){ + #This assigns null to the hash value but we throw straight away + throw("Attribute feature must be one of:\n\t".join("\n\t", keys(%valid_attribute_features))); + } + + $self->db->is_stored_and_valid($attr_class, $attr_feat); + my $attr_feat_table = $valid_attribute_features{$attr_class}; + + + #Don't retrict via existing left join as we want to get all + #the reg_attrs not just those define by this query + + #Was originally doing a subselect, but this was doing a filesort on ALL rf with no key! + #Separating the queries makes this a range query and uses the primary key + #still files sort, but just on exact number of rows rather than ALL( I guess because it can't do it in the buffer for some reason) + + + my ($rf_ids) = $self->db->dbc->db_handle->selectrow_array("SELECT group_concat(regulatory_feature_id) from regulatory_attribute ". + "WHERE attribute_feature_table='${attr_feat_table}' and attribute_feature_id=".$attr_feat->dbID); + + return (defined $rf_ids) ? $self->generic_fetch("rf.regulatory_feature_id in(${rf_ids})") : []; +} + + + +=head2 fetch_type_config_by_RegulatoryFeatures + + Arg [1] : Bio::EnsEMBL::Funcgen::RegulatoryFeature + Example : my $config = $regf_adaptor->fetch_type_config_by_RegualtoryFeature($rf); + Description: Retrieves a config hash of CellType and FeatureType names and dbIDs supporting + the given RegualtoryFeature + Returntype : HASHREF + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub fetch_type_config_by_RegulatoryFeature{ + my ($self, $rf) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::RegulatoryFeature', $rf); + + + my $sql = 'SELECT ft.name, ft.feature_type_id from '. + 'feature_type ft, feature_set fs, regulatory_attribute ra, annotated_feature af '. + 'WHERE ft.feature_type_id=fs.feature_type_id AND fs.feature_set_id=af.feature_set_id AND '. + 'af.annotated_feature_id=ra.attribute_feature_id and ra.attribute_feature_table="annotated" AND '. + 'ra.regulatory_feature_id=? group by ft.name order by ft.name'; + + my $sth = $self->prepare($sql); + $sth->bind_param(1, $rf->dbID, SQL_INTEGER); + $sth->execute; + my @ftype_config = @{$sth->fetchall_arrayref}; + $sth->finish; + + + #Don't need cell type query here if we have a cell type sepcific set + #What is quicker here? + + $sql = 'SELECT ct.name, ct.cell_type_id from '. + 'cell_type ct, feature_set fs, regulatory_attribute ra, annotated_feature af '. + 'WHERE ct.cell_type_id=fs.cell_type_id AND fs.feature_set_id=af.feature_set_id AND '. + 'af.annotated_feature_id=ra.attribute_feature_id and ra.attribute_feature_table="annotated" AND '. + 'ra.regulatory_feature_id=? group by ct.name order by ct.name'; + + $sth = $self->prepare($sql); + $sth->bind_param(1, $rf->dbID, SQL_INTEGER); + $sth->execute; + my @ctype_config = @{$sth->fetchall_arrayref}; + $sth->finish; + + return { + feature_types => \@ftype_config, + cell_types => \@ctype_config, + }; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ResultFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ResultFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1084 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::ResultFeatureAdaptor +# +# You may distribute this module under the same terms as Perl itself + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::ResultFeatureAdaptor - adaptor for fetching and storing ResultFeature objects + +=head1 SYNOPSIS + +my $rfeature_adaptor = $db->get_ResultFeatureAdaptor(); + +my @result_features = @{$rfeature_adaptor->fetch_all_by_ResultSet_Slice($rset, $slice)}; + + +=head1 DESCRIPTION + +The ResultFeatureAdaptor is a database adaptor for storing and retrieving +ResultFeature objects. + +This will automatically query the web optimised result_feature +table if a data is present, else it will query the underlying raw data tables. + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +#How are we going to track the association between raw experimental_chip and collection result set? +#ExperimentalChip/Channel ResultSets will probably go away, so ignore this association problem for now. +#Association between FeatureSet and Input/ResultSet to be handled in import pipeline + +#ResultFeature ResultSets now have a RESULT_FEATURE_SET status entry. +#Add some details about file based collections + + +package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor; + +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::ResultSet; +use Bio::EnsEMBL::Funcgen::ResultFeature; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(mean median); +#Bareword SQL_TYPES not exported from BaseFeatureAdpator unless it is 'use'd first +use Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Funcgen::Collector::ResultFeature; + +use base qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor + Bio::EnsEMBL::Funcgen::Collector::ResultFeature); + # Bio::EnsEMBL::DBFile::BigWigAdaptor);#@ISA + +warn "removed hardcoded inc BigWigAdaptor"; + + +#Private vars to used to maintain simple implementation of Collector +#Should be set in each method to enable trimmingof the start and end bins. +#Cannot depend on $dest_slice_start/end in _objs_from_sth +#As _collection_start/end are adjusted to the nearest bin +my ($_scores_field, $_collection_start, $_collection_end); +#my ($_window_size);#Need to be a method as it is used by the BaseFeatureAdaptor. or our? + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ([ 'result_feature', 'rf' ]); +} + + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return ('rf.seq_region_start', 'rf.seq_region_end', 'rf.seq_region_strand', + "$_scores_field", 'rf.result_set_id'); +} + + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Experiment objects + Exceptions : None + Caller : Internal + Status : At Risk - Moving to DBFile implementation + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + if(! $dest_slice){ + throw('ResultFeatureAdaptor always requires a dest_slice argument'); + #Is this correct? + #ExperimentalChip based normalisation? + #Currently does not use this method + #Never have non-Slice based fetchs, so will always have dest_slice and seq_region info. + } + + my (%rfeats, $start, $end, $strand, $scores, $rset_id); + my $window_size = 0; #Always natural resolution from table + + #Could dynamically define simple obj hash dependant on whether feature is stranded and new_fast? + #We never call _obj_from_sth for extended queries + #This is only for result_feature table queries i.e. standard/new queries + $sth->bind_columns(\$start, \$end, \$strand, \$scores, \$rset_id); + + + #Test slice is loaded in eFG? + my $slice_adaptor = $self->db->get_SliceAdaptor; + + if(! $slice_adaptor->get_seq_region_id($dest_slice)){ + warn "Cannot get eFG slice for:".$dest_slice->name. + "\nThe region you are using is not present in the current dna DB"; + return; + } + + if($mapper) { + throw('Cannot dynamically assembly map Collections yet'); + #See GeneAdaptor for mapping code if required + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + + + my (@scores, $slice, $start_pad, $end_pad); + #Set up the %rfeats arrays here to prevent having to test in loop + #This will speed up 0 wsize, but most likely slow others? + + FEATURE: while ( $sth->fetch() ) { + + if($window_size == 0){ + warn "0bp window size array based result_features are no longer supported"; + #Remove completely or re-instate? + } + else{ + + #Cannot have a collection which does not start at 1 + #As we cannot compute the bin start/ends correctly? + #Actually we can do so long as they have been stored correctly + #i.e. start and end are valid bin bounds(extending past the end of the seq_region if needed) + #Let's keep it simple for now + throw("Collections with a window size > 0 must start at 1, not ($start)") if $start !=1; + #Can remove this if we test start and end are valid bin bounds for the given wsize + + #Account for oversized slices + #This is if the slice seq_region_start/end are outside of the range of the record + #As collections should really represent a complete seq_region + #This should only happen if a slice is defined outside the the bounds of a seq_region + #i.e. seq_region_start < collection_start or seq_region_end > slice length + #if a test slice has been stored which does not represent the complete seq_region + #We don't need to pad at all, just adjust the $_collection_start/ends!!! + #Don't need to account for slice start < 1 + #Start and end should always be valid bin bounds + #These could be removed if we force only full length seq_region collections + $_collection_start = $start if($_collection_start < $start); + $_collection_end = $end if($_collection_end > $end); + + #warn "col start now $_collection_start"; + #warn "col end now $_collection_end"; + + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # else convert coords + # These need to use $_collection_start/end rather than dest_slice_start/end + + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $start = $_collection_start - $dest_slice_start + 1; + $end = $_collection_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $_collection_start; + $start = $dest_slice_end - $_collection_end + 1; + $end = $dest_slice_end - $tmp_seq_region_start + 1; + $strand *= -1; + } + } + #What about 0 strand slices? + + #throw away features off the end of the requested slice or on different seq_region + if($end < 1 || $start > $dest_slice_length){# || + #( $dest_slice_sr_id ne $seq_region_id )) { + #This would only happen if assembly mapper had placed it on a different seq_region + #Dynamically mapped features are not guaranteed to come back in correct order? + + next FEATURE; + } + + @scores = unpack('('.$self->pack_template.')'.(($_collection_end - $_collection_start + 1)/$window_size ), $scores); + } + + + push @{$rfeats{$rset_id}}, Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast({ + start => $start, + end => $end, + strand =>$strand, + scores => [@scores], + #undef, + #undef, + window_size => $window_size, + slice => $dest_slice, + }); + + } + + + + return \%rfeats; +} + + +=head2 store + + Args[0] : List of Bio::EnsEMBL::Funcgen::ResultFeature objects + Args[1] : Bio::EnsEMBL::Funcgen::ResultSet + Args[2] : Optional - Assembly to project to e.g. GRCh37 + Example : $rfa->store(@rfeats); + Description: Stores ResultFeature objects in the result_feature table. + Returntype : None + Exceptions : Throws if a List of ResultFeature objects is not provided or if + any of the attributes are not set or valid. + Caller : General + Status : At Risk - Moving to DBFile implementation + +=cut + +sub store{ + my ($self, $rfeats, $rset, $new_assembly) = @_; + + #We can't project collections to a new assembly after they have been generated + #as this will mess up the standardised bin bounds. + #Just project the 0 window size and then rebuild other window_sizes form there + + $self->set_collection_defs_by_ResultSet($rset); + throw("Must provide a list of ResultFeature objects") if(scalar(@$rfeats == 0)); + + #These are in the order of the ResultFeature attr array(excluding probe_id, which is the result/probe_feature query only attr)) + my $sth = $self->prepare('INSERT INTO result_feature (result_set_id, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, scores) VALUES (?, ?, ?, ?, ?, ?)'); + my $db = $self->db(); + my ($pack_template, $packed_string); + + + + #my @max_allowed_packet = $self->dbc->db_handle->selectrow_array('show variables like "max_allowed_packet"'); + #warn "@max_allowed_packet"; + + FEATURE: foreach my $rfeat (@$rfeats) { + + if( ! (ref($rfeat) && $rfeat->isa('Bio::EnsEMBL::Funcgen::Collection::ResultFeature'))) { + throw('Must be a Bio::EnsEMBL::Funcgen::Collection::ResultFeature object to store'); + } + + if($rfeat->window_size == 0){ + throw('Non 0bp window_size ResultFeatures cannot be stored in the result_feature table, write a col file instead'); + } + + + #This is the only validation! So all the validation must be done in the caller as we are simply dealing with ints? + #Remove result_feature_set from result_set and set as status? + + my $seq_region_id; + ($rfeat, $seq_region_id) = $self->_pre_store($rfeat, $new_assembly); + + next if ! $rfeat;#No projection to new assembly + #Is there a way of logging which ones don't make it? + + #This captures non full length collections at end of seq_region + $pack_template = '('.$self->pack_template.')'.scalar(@{$rfeat->scores}); + + + #Check that we have non-0 values in compressed collections + if($rfeat->window_size != 0){ + + if(! grep { /[^0]/ } @{$rfeat->scores} ){ + warn('Collection contains no non-0 scores. Skipping store for '. + $rfeat->slice->name.' '.$rfeat->window_size." window_size\n"); + next; + } + } + + + $packed_string = pack($pack_template, @{$rfeat->scores}); + + $sth->bind_param(1, $rfeat->result_set_id, SQL_INTEGER); + $sth->bind_param(2, $seq_region_id, SQL_INTEGER); + $sth->bind_param(3, $rfeat->start, SQL_INTEGER); + $sth->bind_param(4, $rfeat->end, SQL_INTEGER); + $sth->bind_param(5, $rfeat->strand, SQL_INTEGER); + $sth->bind_param(7, $packed_string, SQL_BLOB); + $sth->execute(); + } + + return $rfeats; +} + +#This is no applicable to ResultFeatures + +=head2 list_dbIDs + + Args : None + Example : my @rsets_ids = @{$rsa->list_dbIDs()}; + Description: Gets an array of internal IDs for all ResultFeature objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : general + Status : stable + +=cut + +sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs('result_feature'); +} + + +=head2 _window_size + + Args : None + Example : my $wsize = $self->_window_size + Description: Gets the window_size of the current ResultFeature query. + This needs to be a method rather than just a private variable + as it is used by the BaseFeatureAdaptor. + Returntype : int + Exceptions : None + Caller : Bio::EnsEMBL::BaseFeatureAdaptor + Status : At risk - ??? Is this required anymore + +=cut + + +sub _window_size{ + my $self = shift; + + return $self->{'window_size'}; +} + + + + +=head2 set_collection_config_by_Slice_ResultSets + + Args[0] : Bio::EnsEMBL::Slice + Args[1] : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultSet object + Args[2] : int - Maximum number of bins required i.e. number of pixels in drawable region + Example : $self->set_collection_defs_by_ResultSet([$rset]); + Description: Similar to set_collection_defs_by_ResultSet, but used + to set a config hash used for multi-ResultSet fetches. + Returntype : None + Exceptions : throws is args are not valid + throws if supporting InputSet is not of type result (i.e. short reads import) + throws if supporting InputSet format is not SEQUENCING (i.e. short reads import) + throws if ResultSet is not and input_set or experimental_chip based ResultSet (i.e. channel etc) + Caller : ResultFeatureAdaptor::fetch_all_by_Slice_ResultSets + Status : At Risk + +=cut + +sub set_collection_config_by_Slice_ResultSets{ + my ($self, $slice, $rsets, $max_bins, $window_size) = @_; + + if(ref($rsets) ne 'ARRAY'){ + throw('You must pass an ARRAYREF of Bio::EnsEMBL::ResultSet objects.'); + } + + if(! (ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))){ + throw('You must pass a valid Bio::EnsEMBL::Slice'); + } + + my ($wsize, $window_element, @rsets, %wsize_config); + my ($collection_start, $collection_end); + + if($window_size && $max_bins){ + warn "Over-riding max_bins with specific window_size, omit window_size to calculate window_size using max_bins"; + } + + my ($is_rf_set); + my $rf_source = 'file'; + + foreach my $rset(@{$rsets}){ + $is_rf_set = $rset->has_status('RESULT_FEATURE_SET') || 0; + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); + + if(! ($rset->table_name eq 'input_set' || $is_rf_set)){ + warn("Skipping non-ResultFeature ResultSet:\t".$rset->name); + next; + } + + #Eventually will want to completely remove DB sets? + #Depends on how we handle 0bp window sets, BigWig? + #Leave 0bp sets in DB for v62 which means we need an extra status + #Assume all input_set rsets are file based + + #NOTE: + #Don't need to check for packed_size and packed_template differences as they are now the + #same for all collections i.e. float. These would need to be separate queries otherwise. + #This method makes assumptions about the window_sizes array structure + #If this is to change more the the 0 - 30 bp change below then the config hash generation + #needs to be reviewed + + if($rset->table_name eq 'experimental_chip'){ #Array Intensities i.e. single float + $Bio::EnsEMBL::Utils::Collector::window_sizes->[0] = 0;#Can have natural resolution for low density array data + } + elsif($rset->table_name eq 'input_set'){ + + $Bio::EnsEMBL::Utils::Collector::window_sizes->[0] = 30; + + #Currently only expecting int from InputSet + my @isets = @{$rset->get_InputSets}; + my @tmp_isets = grep { !/result/ } (map { $_->feature_class } @isets ); + + if(@tmp_isets){ + throw("Bio::EnsEMBL::Funcgen::Collector::ResultFeature only supports result type InputSets, not @tmp_isets types"); + } + + #We still have no way of encoding pack_type for result_feature InputSets + @tmp_isets = grep { !/SEQUENCING/ } (map { $_->format } @isets); + + if(@tmp_isets){ + throw("Bio::EnsEMBL::Funcgen::Collector::ResultFeature only supports SEQUENCING format InputSets, not @tmp_isets formats"); + } + } + else{ + throw('Bio::EnsEMBL::Funcgen::Collector:ResultFeature does not support ResultSets of type'.$rset->table_name); + } + + + + + ### SET ResultSet CONFIG BASED ON OPTIMAL WINDOW SIZE + # This sets all based on $rf_source=file + # For wsize==0 (experimental_chip), the rf_source is fixed after this loop + # This is done to prevent cyclical dependancy and improve cache checking speed + + if( (defined $window_element ) && + exists $wsize_config{$rf_source}{$Bio::EnsEMBL::Utils::Collector::window_sizes->[$window_element]} ){ + $wsize_config{$rf_source}{$Bio::EnsEMBL::Utils::Collector::window_sizes->[$window_element]}{'result_sets'}{$rset->dbID} = $rset; + } + else{ #We have not seen this wsize before + + + #This is currently entirely based on the position of the first wsize. + #We can't strictly rely that the same window_element will be optimal for each collection + #However this will work if they are size ordered and only the first element changes e.g. 0 || 30 + #Will need to do for each set if window_sizes change + + if(! defined $window_element){ + + my @sizes = @{$self->window_sizes}; + #we need to remove wsize 0 if ResultSet was generated from high density seq reads + #0 should always be first + + shift @sizes if ($sizes[0] == 0 && ($rset->table_name eq 'input_set')); + $max_bins ||= 700;#This is default size of display? + + #The speed of this track is directly proportional + #to the display size, unlike other tracks! + #e.g + #let's say we have 300000bp + #700 pixels will use 450 wsize > Faster but lower resolution + #2000 pixels will use 150 wsize > Slower but higher resolution + + if(defined $window_size){ + + if(! grep { /^${window_size}$/ } @sizes){ + warn "The ResultFeature window_size specifed($window_size) is not valid, the next largest will be chosen from:\t".join(', ', @sizes); + } + else{ + $wsize = $window_size; + } + } + else{#! defined $window_size + + #Work out window size here based on Slice length + #Select 0 wsize if slice is small enough + #As loop will never pick 0 + #probably half 150 max length for current wsize + #Will also be proportional to display size + #This depends on size ordered window sizes arrays + + $window_size = ($slice->length)/$max_bins; + + if($Bio::EnsEMBL::Utils::Collector::window_sizes->[0] == 0){ + + my $zero_wsize_limit = ($max_bins * $sizes[1])/2; + + if($slice->length <= $zero_wsize_limit){ + $wsize = 0; + $window_element = 0; + } + } + } + + #Let's try and avoid this loop if we have already grep'd or set to 0 + #In the browser this is only ever likely to speed up the 0 window + + if (! defined $wsize) { + #default is maximum + $wsize = $sizes[-1]; #Last element + + #Try and find the next biggest window + #As we don't want more bins than there are pixels + + for (my $i = 0; $i <= $#sizes; $i++) { + #We have problems here if we want to define just one window size + #In the store methods, this resets the wsizes so we can only pick from those + #specified, hence we cannot force the use of 0 + #@sizes needs to always be the full range of valid windows sizes + #Need to always add 0 and skip_zero window if 0 not defined in window_sizes? + + if ($window_size <= $sizes[$i]) { + $window_element = $i; + $wsize = $sizes[$i]; + last; + } + } + } + } + else{ #ASSUME the same window_element has the optimal window_size + $wsize = $Bio::EnsEMBL::Utils::Collector::window_sizes->[$window_element]; + } + + + + #Set BLOB access & collection_start/end config + + if ( $wsize == 0) { + $wsize_config{$rf_source}{$wsize}{scores_field} = 'rf.scores'; + #No need to set start end config here as these are normal features + } else { + #We want a substring of a whole seq_region collection + + #Correct to the nearest bin bounds + #int rounds towards 0, not always down! + #down if +ve or up if -ve + #This causes problems with setting start as we round up to zero + + + #Sub this lot as we will use it in the Collector for building indexes? + + + my $start_bin = $slice->start/$wsize; + $collection_start = int($start_bin); + + #Add 1 here so we avoid multiply by 0 below + if ($collection_start < $start_bin) { + $collection_start +=1; #Add 1 to the bin due to int rounding down + } + + $collection_start = ($collection_start * $wsize) - $wsize + 1 ; #seq_region + $collection_end = int($slice->end/$wsize) * $wsize; #This will be <= $slice->end + + #Add another window if the end doesn't meet the end of the slice + if (($collection_end > 0) && + ($collection_end < $slice->end)) { + $collection_end += $wsize; + } + + #Now correct for packed size + #Substring on a blob returns bytes not 2byte ascii chars! + #start at the first char of the first bin + my $sub_start = (((($collection_start - 1)/$wsize) * $self->packed_size) + 1); #add first char + #Default to 1 as mysql substring starts < 1 do funny things + $sub_start = 1 if $sub_start < 1; + my $sub_end = (($collection_end/$wsize) * ($self->packed_size)); + + #Set local start end config for collections + $wsize_config{$rf_source}{$wsize}{collection_start} = $collection_start; + $wsize_config{$rf_source}{$wsize}{collection_end} = $collection_end; + + + if($rf_source eq 'file'){ #file BLOB access config + $wsize_config{$rf_source}{$wsize}{'byte_offset'} = $sub_start -1; #offset is always start - 1 + $wsize_config{$rf_source}{$wsize}{'byte_length'} = ($sub_end - $sub_start + 1); + #warn "byte_offset = $sub_start"; + #warn "byte_length = ($sub_end - $sub_start + 1) => ". ($sub_end - $sub_start + 1); + } + else{ #DB BLOB access config + #Finally set scores column for fetch + $wsize_config{$rf_source}{$wsize}{scores_field} = "substring(rf.scores, $sub_start, ". + ($sub_end - $sub_start + 1).')'; + } + } + + + #Set the result_set and scores field config + #Would also need to set pack template here if this + #were to change between collections + $wsize_config{$rf_source}{$wsize}{result_sets}{$rset->dbID} = $rset; + } + } + + #Fix the 0 wsize source as will be set to 'file' but should be 'db' + + if(exists $wsize_config{'file'}{0}){ + $wsize_config{'db'}{0} = $wsize_config{'file'}{0}; + delete $wsize_config{'file'}{0}; + + if(keys(%{$wsize_config{'file'}}) == 0){ + delete $wsize_config{'file'}; + } + } + + $self->{'_collection_config'} = \%wsize_config; + return $self->{'_collection_config'}; +} + + + + + +=head2 fetch_all_by_Slice_ResultSets + + Arg[1] : Bio::EnsEMBL::Slice - Slice to retrieve results from + Arg[2] : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultSets - ResultSet to retrieve results from + Arg[3] : OPTIONAL int - max bins, maximum number of scores required + Arg[4] : OPTIONAL int - window size, size of bin/window size in base pirs + Arg[5] : OPTIONAL string - sql contraint for use only with DB collections + Arg[6] : OPTIONAL hasref - config hash + Example : my %rfeatures = %{$rsa->fetch_all_by_Slice_ResultSets($slice, [@rsets])}; + Description: Gets a list of lightweight ResultFeature collection(s) for the ResultSets and Slice passed. + Returntype : HASHREF of ResultSet dbID keys with a LISTREF of ResultFeature collection values + Exceptions : Warns and skips ResultSets which are not RESULT_FEATURE_SETS. + Caller : general + Status : At risk + +=cut + +sub fetch_all_by_Slice_ResultSets{ + my ($self, $slice, $rsets, $max_bins, $window_size, $orig_constraint) = @_; + + $orig_constraint .= (defined $orig_constraint) ? ' AND ' : ''; + #currently does not accomodate raw experimental_chip ResultSets! + my $conf = $self->set_collection_config_by_Slice_ResultSets($slice, $rsets, $max_bins, $window_size); + + #Loop through each wsize build constraint set private vars and query + my (%rset_rfs, $constraint); + + + #Remove this block now we don't really support table based RFs + + my $rf_conf = $conf->{db}; + + foreach my $wsize(keys(%{$rf_conf})){ + $self->{'window_size'} = $wsize; + $_scores_field = $rf_conf->{$wsize}->{scores_field}; + $constraint = 'rf.result_set_id IN ('.join(', ', keys(%{$rf_conf->{$wsize}->{result_sets}})).')'. + " AND rf.window_size=$wsize"; + + if ($wsize != 0){ + $_collection_start = $rf_conf->{$wsize}->{collection_start}; + $_collection_end = $rf_conf->{$wsize}->{collection_end}; + } + + my ($rset_results) = @{$self->fetch_all_by_Slice_constraint($slice, $orig_constraint.$constraint)}; + + #This maybe undef if the slice is not present + $rset_results ||= {}; + + #Will this work for wsize split queries? + #We are not setting values for empty queries + #Which is causing errors on deref + + %rset_rfs = (%rset_rfs, + %{$rset_results}); + + #Account for DB rsets which return no features + foreach my $rset(values %{$rf_conf->{$wsize}{result_sets}}){ + + if(! exists $rset_rfs{$rset->dbID}){ + $rset_rfs{$rset->dbID} = []; + } + } + } + + + #Two blocks to prevent extra if-conditional for each rset. + #Can easily remove one block if/when we deprecate DB collections + #Should always be the same wsize if we are using a file i.e. no 0bp + #Unless we have differing window_size ranges + my $rf; + $rf_conf = $conf->{file}; + + foreach my $wsize(keys(%{$rf_conf})){ + + foreach my $file_rset(values %{$rf_conf->{$wsize}{result_sets}}){ + $rf = $self->_fetch_from_file_by_Slice_ResultSet($slice, $file_rset, $wsize, $rf_conf); + $rset_rfs{$file_rset->dbID} = defined($rf) ? [$rf] : []; + } + } + + return \%rset_rfs; +} + + + +=head2 _fetch_from_file_by_Slice_ResultSet + + Arg[1] : Bio::EnsEMBL::Slice - Slice to retrieve results from + Arg[2] : Bio::EnsEMBL::Funcgen::ResultSets - ResultSet to retrieve results from + Arg[3] : int - valid window size defined by set_collection_config_by_ResultSets + Arg[4] : HASHREF - Window size config for file based ResultSets + Example : my $rfeat = $self->_fetch_from_file_by_Slice_ResultSet($slice, $rset, $wsize, $conf); + Description: Generates ResultFeature collection reading packed scores from a 'col' file. + Returntype : Bio::EnsEMBL::Funcgen::Collection::ResultFeature + Exceptions : + Caller : fetch_all_by_Slice_ResultSets + Status : At risk + +=cut + + +#Set/store filepath in ResultSet to avoid having to regenerate? + +sub _fetch_from_file_by_Slice_ResultSet{ + my ($self, $slice, $rset, $window_size, $conf) = @_; + #private as window_size needs to ba valid i.e. generated by set_collection_config_by_ResultSets + #and no class tests + + #Cache this as ResultSet::get_dbfile_path_prefix? + #Is there any point if the rsets aren't cached? + #Data is cached anyway, so no redundant calls + #ResultSets are always regenerated + #Key would either have to be query or dbID + #Former would be hard?(constraint key, is this already done for features?) + #Later would be object cache, so we would still do sql query but skip the object generation + #given the dbID is enough to pull a valid object from the cache + + #How can we cache result/feature sets? + #Would this really speed anything up? + + #if(! exists $conf->{$window_size}){ + # throw("Specified window_size($window_size) is not present in config.\n". + # "Have you called this private method directly?\n". + # "Try using the fetch_by_Slice_ResultSets warpapper method\n". + # "Or set the RESULT_FEATURE_FILE_SET status and window_size correctly."); + # } + + my $rf; + my $efg_sr_id = $self->get_seq_region_id_by_Slice($slice); + + if($efg_sr_id){ + + my $packed_scores = $self->read_collection_blob + ( + $rset->get_dbfile_path_by_window_size($window_size), + #Would be in analysis object for unique analysis tracks/data + $efg_sr_id, + $conf->{$window_size}{'byte_offset'}, + $conf->{$window_size}{'byte_length'}, + ); + + my ($start, $end, @scores); + + + if(defined $packed_scores){ + ($start, $end) = ($conf->{$window_size}{collection_start}, + $conf->{$window_size}{collection_end}); + + + #Need to capture unpack failure here and undef the fh? + #i.e. pack/unpack repeat count overflow + + @scores = unpack('('.$self->pack_template.')'.(($end - $start + 1)/$window_size), + $packed_scores); + + #could validate scores size here + #will only ever be <= than expected value + #as unpack will discard excess + #better validate length of $packed_scores + + $rf = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast + ({ + start => $start, + end => $end, + strand => 0, #These are strandless features + scores => [@scores], + window_size => $window_size, + }); + } + } + + return $rf; +} + + + +=head2 fetch_all_by_Slice_ResultSet + + Arg[1] : Bio::EnsEMBL::Slice - Slice to retrieve results from + Arg[2] : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from + Arg[3] : OPTIONAL int - max bins, maximum number of scores required + Arg[4] : OPTIONAL int - window size, size of bin/window size in base pirs + Arg[5] : OPTIONAL string - sql contraint for use only with DB collections + Arg[6] : OPTIONAL hasref - config hash + Example : my %rfeatures = %{$rsa->fetch_all_by_Slice_ResultSets($slice, [@rsets])}; + Description: Gets a list of lightweight Collection of ResultFeatures for the ResultSet and Slice passed. + NOTE: ExperimentalChip/Channel based ResultFeatures was removed in version 63. + Returntype : Bio::EnsEMBL::Funcgen::Collection::ResultFeature + Exceptions : None + Caller : general + Status : At risk + +=cut + + + +#To do +#remove Bio::EnsEMBL::Funcgen::ResultFeature in favour of Collection::ResultFeature? + +sub fetch_all_by_Slice_ResultSet{ + my ($self, $slice, $rset, $max_bins, $window_size, $orig_constraint) = @_; + + #Do this first to avoid double validation of rset + my $rf_hashref = $self->fetch_all_by_Slice_ResultSets($slice, [$rset], $max_bins, $window_size, $orig_constraint); + + return $rf_hashref->{$rset->dbID}; +} + + + +sub fetch_Iterator_by_Slice_ResultSet{ + my ($self, $slice, $rset, $max_bins, $window_size, $constraint, $chunk_size) = @_; + + return $self->fetch_collection_Iterator_by_Slice_method + ($self->can('fetch_all_by_Slice_ResultSet'), + [$slice, $rset, $max_bins, $window_size, $constraint], + 0,#Slice idx + $chunk_size #Iterator chunk length + ); + +} + +=head2 fetch_collection_Iterator_by_Slice_method + + Arg [1] : CODE ref of Slice fetch method + Arg [2] : ARRAY ref of parameters for Slice fetch method + Arg [3] : Optional int: Slice index in parameters array + Arg [4] : Optional int: Slice chunk size. Default=500000 + Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice_method + ($feature_adaptor->can('fetch_all_by_Slice_Arrays'), + \@fetch_method_params, + 0,#Slice idx + #500 #chunk length + ); + + while(my $feature = $slice_iter->next && defined $feature){ + #Do something here + } + + Description: Creates an Iterator which chunks the query Slice to facilitate + large Slice queries which would have previously run out of memory + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : Throws if mandatory params not valid + Caller : general + Status : at risk - move to core BaseFeatureAdaptor + +=cut + + +#Essentially the only difference here we have one feature with an array of 'scores' + +sub fetch_collection_Iterator_by_Slice_method{ + my ($self, $slice_method_ref, $params_ref, $slice_idx, $chunk_size) = @_; + + if(! ( defined $slice_method_ref && + ref($slice_method_ref) eq 'CODE') + ){ + throw('Must pass a valid Slice fetch method CODE ref'); + } + + if (! ($params_ref && + ref($params_ref) eq 'ARRAY')) { + #Don't need to check size here so long as we have valid Slice + throw('You must pass a method params ARRAYREF'); + } + + $slice_idx = 0 if(! defined $slice_idx); + my $slice = $params_ref->[$slice_idx]; + $chunk_size ||= 1000000; + + my $collection; + my $finished = 0; + my $start = 1; #local coord for sub slice + my $end = $slice->length; + my $overlap = 0; + + my $coderef = + sub { + my $collection; + + if(! $finished) { + + my $new_end = ($start + $chunk_size - 1); + + if ($new_end >= $end) { + # this is our last chunk + $new_end = $end; + $finished = 1; + } + + #Chunk by sub slicing + my $sub_slice = $slice->sub_Slice($start, $new_end); + $params_ref->[$slice_idx] = $sub_slice; + ($collection) = @{ $slice_method_ref->($self, @$params_ref)}; + + + if(! $collection){ + $finished = 1; + } + else{ + + #Trim score and start if overlapping found + #Will only ever be on overlapping 'score' + if($overlap){ + shift @{$collection->scores}; + $collection->{start} = $collection->start + $collection->window_size; + $overlap = 0; + } + + + if ( $collection->end > $sub_slice->end ){ + $overlap = 1; + } + + $start = $new_end + 1; + } + } + + return $collection; + }; + + return Bio::EnsEMBL::Utils::Iterator->new($coderef); +} + + + +# Over-ride/deprecate generic methods +# which do not work with ResultFeature Collections + +sub fetch_all{ + deprecate('The fetch_all method has been disabled as it is not appropriate for the ResultFeatureAdaptor'); + return; +} + +sub fetch_by_dbID{ + warn 'The fetch_by_dbID method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; + #Could use it for 0 wsize DB based data, but not useful. + return; +} + +sub fetch_all_by_dbID_list { + warn 'The fetch_all_by_dbID_list method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; + #Could use it for 0 wsize DB based data, but not useful. + return; +} + +sub fetch_all_by_logic_name { + warn 'The fetch_all_by_logic_name method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; + #Could use it for 0 wsize DB based data, but not useful. + return; +} + +sub _list_seq_region_ids{ + warn 'The _list_seq_region_ids method has been disabled as it is not appropriate for the ResultFeatureAdaptor'; + #Could use it for 0 wsize DB based data, but not useful. + return +} + +#Over-ride fetch_all_by_display_label? Or move this to the individual FeatureAdaptors? +#Same with fetch_all_by_stable_Storable_FeatureSEts and wrappers (fetch_all_by_external_name)? +#Basically this is not a DBAdaptor anymore so should inherit from somewhere else. +#Need to separate the common utility methods and have co-inheritance e.g. +#DBFile::Adaptor Utils::FeatureAdaptor +#DBAdaptor::Adaptor Utils::FeatureAdaptor + + + +#Deprecated/Removed + +=head2 resolve_replicates_by_ResultSet + + Arg[0] : HASHREF - result_set_input_id => @scores pairs + #Arg[1] : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from + Example : my @rfeatures = @{$rsa->fetch_ResultFeatures_by_Slice_ResultSet($slice, $rset, 'DISPLAYABLE')}; + Description: Gets a list of lightweight ResultFeatures from the ResultSet and Slice passed. + Replicates are combined using a median of biological replicates based on + their mean techinical replicate scores + Returntype : List of Bio::EnsEMBL::Funcgen::ResultFeature + Exceptions : None + Caller : general + Status : deprecated + +=cut + + +sub resolve_replicates_by_ResultSet{ + die('ExperimentalChip/Channel based ResultFeature support was removed in version 63'); +} + + +=head2 fetch_results_by_probe_id_ResultSet + + Arg [1] : int - probe dbID + Arg [2] : Bio::EnsEMBL::Funcgen::ResultSet + Example : my @probe_results = @{$ofa->fetch_results_by_ProbeFeature_ResultSet($pid, $result_set)}; + Description: Gets result for a given probe in a ResultSet + Returntype : ARRAYREF + Exceptions : throws if args not valid + Caller : General + Status : deprecated + +=cut + +sub fetch_results_by_probe_id_ResultSet{ + die('ExperimentalChip/Channel based ResultFeature support was removed in version 63'); +} + + + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ResultSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/ResultSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,863 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::ResultSetAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::ResultSetAdaptor - A database adaptor for fetching and +storing ResultSet objects. + +=head1 SYNOPSIS + +my $rset_adaptor = $db->get_ResultSetAdaptor(); + +my @rsets = @{$rset_adaptor->fetch_all_ResultSets_by_Experiment()}; +#my @displayable_rsets = @{$rset_adaptor->fetch_all_displayable_ResultSets()}; + + + + +=head1 DESCRIPTION + +The ResultSetAdaptor is a database adaptor for storing and retrieving +ResultSet objects which encapsulate ResultFeatures defining individual points of +experimental 'signal' data. This can be raw signal(Channel) or +normalised(ExperimentalChip) data from an Array Experiment. A ResultSet can also +encapsulate processed signal/read data(InputSet) from a sequencing Experiment e.g RPKM. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DBSQL::ResultSetAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::ResultSet; +use Bio::EnsEMBL::Funcgen::ResultFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(mean median); +use base qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor); #@ISA + +#Generates ResultSet contains info about ResultSet content +#and actual results for channel or for chips in contig set? +#omit channel handling for now as we prolly won't ever display them +#but we might use it for running analyses and recording in result_set...change to result_group or result_analyses +#data_set!! Then we can keep other tables names and retain ResultFeature +#and change result_feature to result_set, this makes focus of result set more accurate and ResultFeatures are lightweight result objects. +#do we need to accomodate different classes of data or multiple feature types in one set? i.e. A combi experiment (Promot + Histone mod)? +#schema can handle this...API? ignore for now but be mindful. +#This is subtley different to handling different experiments with different features in the same ResultSet. +#Combi will have same sample. + + +#This needs one call to return all displayable sets, grouped by cell_line and ordered by FeatureType +#needs to be restricted to cell line, feature type, but these fields have to be disparate from result_feature +#as this is only a simple linker table, and connections may not always be present +#so cell tpye and feature type constraints have to be performed on load, then can assume that associated features and results +# have same cell type/feature +#so we need to group by cell_type in sql and then order by feature_type_id in sql or rearrange in code? +#This will not know about chip sets, just that a feature set is linked to various result sets +#There fore we need to use the chip_set_id or link back to the experimental_chip chip_set_ids +#this would require a self join on experimental_chip + + + + +#Result_set_id is analagous to the chip_set key, altho' we may have NR instances of the same chip set with different analysis +#if we didn't know the sets previosuly, then we would have to alter the result_set_id retrospectively i.e. change the result_set_id.#All chips in exp to be in same set until we know sets, or all in separate set? +#Do not populate data_set until we know sets as this would cause hacky updating in data_set too. + + +#how are we going to accomodate a combi exp? Promot + Histone mods? +#These would lose their exp set association, i.e. same exp & sample different exp method +#we're getting close to defining the regulon here, combined results features from the same exp +#presently want them displayed as a group but ordered appropriately +#was previously treating each feature as a separate result set + + +#for storing/making link we don't need the Slice context +#store should check all +#so do we move the slice context to the object methods or make optional +#then object method can check for slice and throw or take a Slice as an optional argument +#this will enable generic set to be created to allow loading and linking of features to results +#we still need to know which feature arose from which chip!!!! Not easy to do and may span two. +#Need to genericise this to the chip_set(or use result_set_id non unique) +#We need to disentangle setting the feature to chip/set problem from the displayable problem. +#change the way StatusAdaptor works to accomodate result_set_id:table_name:table_id, as this will define unique results +# + +#can we extend this to creating skeleton result sets and loading raw results too? +# + +#Result.pm should be lightweight by default to enable fast web display, do we need oligo_probe_id? + + +#how are we going to overcome unlinked but displayable sets? +#incomplete result_feature records will be hack to update/alter? +#could have attach_result to feature method? +#force association when loading features + + +=head2 fetch_all_linked_by_ResultSet + + Arg [1] : Bio::EnsEMBL::Funcgen::ResultSet + Arg [2] : Bio::EnsEMBL::Analysis + Example : my @rsets = @{$rset_adaptor->fetch_all_linked_by_ResultSet($rset)}; + Description: Retrieves a list of Bio::EnsEMBL::Funcgen::ResultSets which are linked + to the supplied ResultSet (i.e. replicate relationships/import sets) + Returntype : Listref of Bio::EnsEMBL::Funcgen::ResultSet objects + Exceptions : Throws if ResultSet not valid or stored + Caller : general + Status : At Risk + +=cut + +sub fetch_all_linked_by_ResultSet{ + my ($self, $rset) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); + + + my $constraint = ' rsi.result_set_id in (SELECT distinct(result_set_id) from result_set_input where result_set_input_id in('.join(', ', @{$rset->result_set_input_ids}).')) '; + + my @tmp = @{$self->generic_fetch($constraint)}; + + #Now remove query set + my @linked_sets; + + map {push @linked_sets, $_ if $_->dbID != $rset->dbID} @tmp; + + return \@linked_sets; + +} + + + +=head2 fetch_all_by_Experiment_Analysis + + Arg [1] : Bio::EnsEMBL::Funcgen::Experiment + Arg [2] : Bio::EnsEMBL::Analysis + Example : my @rsets = @{$rset_adaptor->fetch_all_by_Experiment_Analysis($exp, $anal)}; + Description: Retrieves a list of Bio::EnsEMBL::Funcgen::ResultSets with the given Analysis from the Experiment + Returntype : Listref of Bio::EnsEMBL::Funcgen::ResultSet objects + Exceptions : Throws if Analysis is not valid and stored + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Experiment_Analysis{ + my ($self, $exp, $analysis) = @_; + + if ( !($analysis && $analysis->isa("Bio::EnsEMBL::Analysis") && $analysis->dbID())) { + throw("Need to pass a valid stored Bio::EnsEMBL::Analysis"); + } + + + my $join = $self->get_Experiment_join_clause($exp)." AND rs.analysis_id=".$analysis->dbID(); + + return ($join) ? $join." AND rs.analysis_id=".$analysis->dbID() : []; +} + +sub get_Experiment_join_clause{ + my ($self, $exp) = @_; + + if ( !($exp && $exp->isa("Bio::EnsEMBL::Funcgen::Experiment") && $exp->dbID())) { + throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::Experiment"); + } + + + my $constraint; + + my @ecs = @{$exp->get_ExperimentalChips()}; + + if (@ecs) { + + my $ec_ids = join(', ', (map $_->dbID, @ecs)); #get ' separated list of ecids + + + my @chans = map @$_, (map $_->get_Channels(), @ecs); + my $chan_ids = join(', ', (map $_->dbID(), @chans)); #get ' separated list of chanids + #These give empty strings which are defined + #This will not work for single IDs of 0, but this will never happen. + + if ($ec_ids && $chan_ids) { + $constraint = '(((rsi.table_name="experimental_chip" AND rsi.table_id IN ('.$ec_ids. + ')) OR (rsi.table_name="channel" AND rsi.table_id IN ('.$chan_ids.'))))'; + #This could probably be sped up using UNION + #But result set is too small for cost of implementation + } elsif ($ec_ids) { + $constraint = 'rsi.table_name="experimental_chip" AND rsi.table_id IN ('.$ec_ids.')'; + } elsif ($chan_ids) { + $constraint = 'rsi.table_name="channel" AND rsi.table_id IN ('.$chan_ids.')'; + } + } else { #Assume we have an InputSet Experiment? + #We could possibly have an expeirment with an array and an input set + #Currently nothing to stop this, but would most likely be loaded as separate experiments + my $input_setids = join(', ', (map $_->dbID, @{$self->db->get_InputSetAdaptor->fetch_all_by_Experiment($exp)})); + + $constraint = 'rsi.table_name="input_set" AND rsi.table_id IN ('.$input_setids.')'; + } + + return $constraint; +} + + +=head2 fetch_all_by_Experiment + + Arg [1] : Bio::EnsEMBL::Funcgen::Experiment + Example : my @rsets = @{$rset_adaptor->fetch_all_by_Experiment($exp)}; + Description: Retrieves a list of Bio::EnsEMBL::Funcgen::ResultSets from the Experiment + Returntype : Listref of Bio::EnsEMBL::Funcgen::ResultSet objects + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Experiment{ + my ($self, $exp) = @_; + + #my $constraint = "ec.experiment_id=".$exp->dbID(); + #This was much easier with the more complicated default where join + #should we reinstate and just have a duplication of cell/feature_types? + + + my $join = $self->get_Experiment_join_clause($exp); + + return ($join) ? $self->generic_fetch($join) : []; +} + + + +=head2 fetch_all_by_FeatureType + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : string - type of array (e.g. AFFY or OLIGO) + Arg [3] : (optional) string - logic name + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_by_Slice_type($slice, 'OLIGO'); + Description: Retrieves a list of features on a given slice that are created + by probes from the specified type of array. + Returntype : Listref of Bio::EnsEMBL::OligoFeature objects + Exceptions : Throws if no array type is provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType { + my ($self, $ftype) = @_; + + if ( !($ftype && $ftype->isa("Bio::EnsEMBL::Funcgen::FeatureType") && $ftype->dbID())) { + throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType"); + } + + my $constraint = "rs.feature_type_id =".$ftype->dbID(); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_name_Analysis + + Arg [1] : string - ResultSet name + Arg [2] : Bio::EnsEMBL::Funcgen::Analysis + Example : ($rset) = @{$rseta->fetch_by_name($exp->name().'_IMPORT')}; + Description: Retrieves a ResultSet based on the name attribute + Returntype : Bio::EnsEMBL::Funcgen::ResultSet + Exceptions : Throws if no name provided + Caller : General + Status : At Risk - remove all, there should only be one? + +=cut + +sub fetch_all_by_name_Analysis { + my ($self, $name, $anal) = @_; + return $self->fetch_all_by_name($name, undef, undef, $anal); +} + +=head2 fetch_all_by_name + + Arg [0] : Mandatory string - ResultSet name + Arg [1] : Optional Bio::EnsEMBL::Funcgen::FeatureType + Arg [2] : Optional Bio::EnsEMBL::Funcgen::CellType + Arg [3] : Optional Bio::EnsEMBL::Analysis + Example : ($rset) = @{$rseta->fetch_all_by_name($exp->name().'_IMPORT')}; + Description: Retrieves ResultSets based on the name attribute + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultSet objects + Exceptions : Throws if no name provided or optional arguments are not valid + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_name{ + my ($self, $name, $ftype, $ctype, $anal) = @_; + + if ( ! defined $name) { + throw('Need to pass a ResultSet name'); + } + + my $constraint = 'rs.name = ?'; + $self->bind_param_generic_fetch($name, SQL_VARCHAR); + + if ($ftype) { + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType',$ftype); + $constraint .= ' AND rs.feature_type_id=?'; + $self->bind_param_generic_fetch($ftype->dbID, SQL_INTEGER); + } + + if ($ctype) { + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::CellType',$ctype); + $constraint .= ' AND rs.cell_type_id=?'; + $self->bind_param_generic_fetch($ctype->dbID, SQL_INTEGER); + } + + if ($anal) { + $self->db->is_stored_and_valid('Bio::EnsEMBL::Analysis',$anal); + $constraint .= ' AND rs.analysis_id=?'; + $self->bind_param_generic_fetch($anal->dbID, SQL_INTEGER); + } + + return $self->generic_fetch($constraint); +} + + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + [ 'result_set', 'rs' ], + [ 'result_set_input', 'rsi'], + [ 'dbfile_registry', 'dr' ], + ); +} + + +=head2 _left_join + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the left join clasnames and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _left_join { + return (['dbfile_registry', '(rs.result_set_id=dr.table_id AND dr.table_name="result_set")']); +} + + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + rs.result_set_id rs.analysis_id + rsi.table_name rsi.result_set_input_id + rsi.table_id rs.name + rs.cell_type_id rs.feature_type_id + dr.path + ); + + +} + +=head2 _default_where_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : Medium Risk + +=cut + +sub _default_where_clause { + my $self = shift; + + return 'rs.result_set_id = rsi.result_set_id'; +} + +=head2 _final_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an ORDER BY clause. Sorting by oligo_feature_id would be + enough to eliminate duplicates, but sorting by location might + make fetching features on a slice faster. + Returntype : String + Exceptions : None + Caller : generic_fetch + Status : Medium Risk + +=cut + + +sub _final_clause { + #do not mess with this! + return ' GROUP by rsi.result_set_input_id, rsi.result_set_id ORDER BY rs.result_set_id, rs.cell_type_id, rs.feature_type_id'; +} + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates Array objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Experiment objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my (@rsets, $last_id, $rset, $dbid, $anal_id, $anal, $ftype, $ctype, $table_id, $name); + my ($sql, $table_name, $cc_id, $ftype_id, $ctype_id, $rf_set, $dbfile_dir); + my $a_adaptor = $self->db->get_AnalysisAdaptor(); + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + $sth->bind_columns(\$dbid, \$anal_id, \$table_name, \$cc_id, + \$table_id, \$name, \$ctype_id, \$ftype_id, \$dbfile_dir); + + + my $dbfile_data_root = $self->dbfile_data_root; + + #Need c/ftype cache here or rely on query cache? + + while ( $sth->fetch() ) { + + + if(! $rset || ($rset->dbID() != $dbid)){ + + push @rsets, $rset if $rset; + + $anal = (defined $anal_id) ? $a_adaptor->fetch_by_dbID($anal_id) : undef; + $ftype = (defined $ftype_id) ? $ft_adaptor->fetch_by_dbID($ftype_id) : undef; + $ctype = (defined $ctype_id) ? $ct_adaptor->fetch_by_dbID($ctype_id) : undef; + + + if(defined $dbfile_dir){ + $dbfile_dir = $dbfile_data_root.'/'.$dbfile_dir; + } + + + + $rset = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -DBID => $dbid, + -NAME => $name, + -ANALYSIS => $anal, + -TABLE_NAME => $table_name, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -ADAPTOR => $self, + -DBFILE_DATA_DIR => $dbfile_dir, + ); + } + + #This assumes logical association between chip from the same exp, confer in store method????????????????? + + if(defined $rset->feature_type()){ + throw("ResultSet does not accomodate multiple FeatureTypes") if ($ftype_id != $rset->feature_type->dbID()); + } + + if(defined $rset->cell_type()){ + throw("ResultSet does not accomodate multiple CellTypes") if ($ctype_id != $rset->cell_type->dbID()); + } + + #we're not controlling ctype and ftype during creating new ResultSets to store. + #we should change add_table_id to add_ExperimentalChip and check in that method + + #add just the ids here, as we're aiming at quick web display. + $rset->add_table_id($table_id, $cc_id); + + } + + push @rsets, $rset if $rset; + + return \@rsets; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::ResultSet objects + Example : $rsa->store(@rsets); + Description: Stores or updates previously stored ResultSet objects in the database. + Returntype : None + Exceptions : Throws if a List of ResultSet objects is not provided or if + an analysis is not attached to any of the objects + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @rsets) = @_; + + throw("Must provide a list of ResultSet objects") if(scalar(@rsets == 0)); + + my (%analysis_hash); + + my $sth = $self->prepare('INSERT INTO result_set (analysis_id, name, cell_type_id, feature_type_id) VALUES (?, ?, ?, ?)'); + + my $db = $self->db(); + my $analysis_adaptor = $db->get_AnalysisAdaptor(); + + FEATURE: foreach my $rset (@rsets) { + + if( ! ref $rset || ! $rset->isa('Bio::EnsEMBL::Funcgen::ResultSet') ) { + throw('Must be an ResultSet object to store'); + } + + + if ( $rset->is_stored($db) ) { + throw('ResultSet [' . $rset->dbID() . '] is already stored in the database\nResultSetAdaptor does not yet accomodate updating ResultSets'); + #would need to retrive stored result set and update table_ids + } + + #above does not check if it has been generated from scratch but is identical i.e. recovery. + #Need to check table_id and analysis and that it has the correct status + + + + if ( ! defined $rset->analysis() ) { + throw('An analysis must be attached to the ResultSet objects to be stored.'); + } + + # Store the analysis if it has not been stored yet + if ( ! $rset->analysis->is_stored($db) ) { + warn("Will this not keep storing the same analysis if we keep passing the same unstored analysis?"); + $analysis_adaptor->store( $rset->analysis() ); + } + + + my $ct_id = (defined $rset->cell_type()) ? $rset->cell_type->dbID() : undef; + my $ft_id = (defined $rset->feature_type()) ? $rset->feature_type->dbID() : undef; + + $sth->bind_param(1, $rset->analysis->dbID(), SQL_INTEGER); + $sth->bind_param(2, $rset->name(), SQL_VARCHAR); + $sth->bind_param(3, $ct_id, SQL_INTEGER); + $sth->bind_param(4, $ft_id, SQL_INTEGER); + + + + $sth->execute(); + + $rset->dbID( $sth->{'mysql_insertid'} ); + $rset->adaptor($self); + + $self->store_states($rset); + $self->store_chip_channels($rset); + $self->store_dbfile_data_dir($rset) if $rset->dbfile_data_dir; + } + + return \@rsets; +} + + + + + +=head2 dbfile_data_root + + Arg[1] : Optional String: Root path of dbfile data directory + Example : $rset_adaptor->dbfile_data_dir('/over-ride/path); + Description: This allows the root path to be over-ridden. The default path + is stored in the meta table as 'dbfile.data_root'. This can be + over-ridden by the webcode by setting REGULATION_FILE_PATH in + DEFAULTS.ini + Returntype : String + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::DBAdaptor::ResultSet + Status : at risk - move this to SetAdaptor/FileAdaptor? + +=cut + +sub dbfile_data_root{ + my ($self, $root) = @_; + + if($root){ + $self->{dbfile_data_root} = $root; + } + elsif(! defined $self->{dbfile_data_root}){ + + #Grab it from the meta table + $self->{dbfile_data_root} = $self->db->get_MetaContainer->single_value_by_key('dbfile.data_root'); + + #set to empty string if undef, so we don't keep querying the meta table + $self->{dbfile_data_root} ||= ''; + } + + return $self->{dbfile_data_root}; +} + + + + + +=head2 store_dbfile_data_dir + + Arg[1] : Bio::EnsEMBL::Funcgen::ResultSet + Example : $rset_adaptor->store_dbfile_data_dir; + Description: Updater/Setter for the root dbfile data directory for this ResultSet + Returntype : None + Exceptions : Throws if ResultSet is not stored and valid + Throws if dbfile_dat_dir not defined + Caller : Bio::EnsEMBL::Funcgen::Collector::ResultFeature + Status : at risk + +=cut + + + +sub store_dbfile_data_dir{ + my ($self, $rset) = @_; + + + if ( ! $rset->is_stored($self->db()) ) { + throw('ResultSet must be stored in the database before storing chip_channel entries'); + } + + my $data_dir = $rset->dbfile_data_dir; + + if(! $data_dir){ + throw('You need to pass a dbfile_data_dir argument to update the ResultSet'); + } + + #Check we have a record + my $sql = 'SELECT path from dbfile_registry where table_name="result_set" and table_id=?'; + my $sth = $self->prepare($sql); + $sth->bind_param(1, $data_dir, SQL_VARCHAR); + $sth->execute; + + my ($db_dir) = $sth->fetch(); + + if($db_dir && + ($db_dir ne $data_dir)){#UPDATE + $sql = 'UPDATE dbfile_registry set path=? where table_name="result_set" and table_id=?'; + $sth = $self->prepare($sql); + $sth->bind_param(1, $data_dir, SQL_VARCHAR); + $sth->bind_param(2, $rset->dbID, SQL_INTEGER); + $sth->execute; + } + elsif(! $db_dir){#STORE + $sql = 'INSERT INTO dbfile_registry(table_id, table_name, path) values(?, "result_set", ?)'; + $sth = $self->prepare($sql); + $sth->bind_param(1, $data_dir, SQL_VARCHAR); + $sth->bind_param(2, $rset->dbID, SQL_INTEGER); + $sth->execute; + } + + return; +} + + +=head2 store_chip_channels + + Args : Bio::EnsEMBL::Funcgen::ResultSet + Example : $rsa->store_chip_channel(@rset); + Description: Convinience methods extracted from store to allow updating of chip_channel entries + during inline result processing which would otherwise be troublesome due to the need + for a chip_channel_id in the result table before the ResultSet would normally be stored + i.e. after it has been fully populated with data. + Returntype : Bio::EnsEMBL::Funcgen::ResultSet + Exceptions : Throws if a stored ResultSet object is not provided + Caller : General + Status : At Risk + +=cut + + +sub store_chip_channels{ + my ($self, $rset) = @_; + + if(! ($rset && $rset->isa("Bio::EnsEMBL::Funcgen::ResultSet"))){ + throw("You must pasas a valid Bio::EnsEMBL::Funcgen::ResultSet"); + } + + if ( ! $rset->is_stored($self->db()) ) { + throw('ResultSet must be stored in the database before storing chip_channel entries'); + } + + my $sth = $self->prepare(" + INSERT INTO result_set_input ( + result_set_id, table_id, table_name + ) VALUES (?, ?, ?) + "); + + my $sth1 = $self->prepare(" + INSERT INTO result_set_input ( + result_set_input_id, result_set_id, table_id, table_name + ) VALUES (?, ?, ?, ?) + "); + + + #Store and set all previously unstored table_ids + foreach my $table_id(@{$rset->table_ids()}){ + my $cc_id = $rset->get_result_set_input_id($table_id); + + if(! defined $cc_id){ + $sth->bind_param(1, $rset->dbID(), SQL_INTEGER); + $sth->bind_param(2, $table_id, SQL_INTEGER); + $sth->bind_param(3, $rset->table_name(), SQL_VARCHAR); + + $sth->execute(); + + $cc_id = $sth->{'mysql_insertid'}; + $rset->add_table_id($table_id, $sth->{'mysql_insertid'}); + }else{ + + #this should only store if not already stored for this rset + #this is because we may want to add chip_channels to a previously stored rset + my $sql = 'SELECT result_set_input_id from result_set_input where result_set_id='.$rset->dbID(). + " AND result_set_input_id=${cc_id}"; + my ($loaded) = map $_ = "@$_", @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + + if(! $loaded){ + $sth1->bind_param(1, $cc_id, SQL_INTEGER); + $sth1->bind_param(2, $rset->dbID(), SQL_INTEGER); + $sth1->bind_param(3, $table_id, SQL_INTEGER); + $sth1->bind_param(4, $rset->table_name(), SQL_VARCHAR); + $sth1->execute();#this could still fail is some one duplicates a result_set_id, table_id, table_name entry + } + } + } + return $rset; +} + +=head2 list_dbIDs + + Args : None + Example : my @rsets_ids = @{$rsa->list_dbIDs()}; + Description: Gets an array of internal IDs for all ProbeFeature objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : general + Status : stable + +=cut + +sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs('result_set'); +} + + +=head2 fetch_ResultFeatures_by_Slice_ResultSet + + Arg[0] : Bio::EnsEMBL::Slice - Slice to retrieve results from + Arg[1] : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from + Arg[2] : optional string - STATUS e.g. 'DIPLAYABLE' + Example : my @rfeatures = @{$rsa->fetch_ResultFeatures_by_Slice_ResultSet($slice, $rset, 'DISPLAYABLE')}; + Description: Gets a list of lightweight ResultFeatures from the ResultSet and Slice passed. + Replicates are combined using a median of biological replicates based on + their mean techinical replicate scores + Returntype : List of Bio::EnsEMBL::Funcgen::ResultFeature + Exceptions : Warns if not experimental_chip ResultSet + Throws if no Slice passed + Warns if + Caller : general + Status : At risk + +=cut + + +#Could we also have an optional net size for grouping ResultFeature into Nbp pseudo ResultFeatures? + + +#This does not account for strandedness!! +###??? +#Is this sensible? Do we really want the full probe object alongside the ResultFeatures? +#strandedness? +#what about just the probe name? +#we could simplt add the probe_id to the ResultFeature +#This would prevent creating probe features for all of the features which do not have results for a given resultset +#This will mean the probe will still have to be individually created +#But we're only creating it for those which we require +#and we're now creating the lightweight ResultFeature instead of the ProbeFeature +#However, if we're dealing with >1 rset in a loop +#Then we'll be recreating the same ResultFeatures and probes for each set. +#We're already restricting the ProbeFeatures to those within the rset +#What we want is to get the score along side the ProbeFeature? +#But we want the probe name!! +#We really want something that will give Probe and ResultFeature +#Let's set the Probe as an optional ResultFeature attribute + + +sub fetch_ResultFeatures_by_Slice_ResultSet{ + my ($self, $slice, $rset, $ec_status, $with_probe) = @_; + + warn "Bio::EnsEMBL::Funcgen::DBSQL::ResultSetAdaptor::fetch_ResultFeatures_by_Slice_ResultSet is now deprecated\n". + "Please use the ResultFeatureAdaptor directly"; + + return $self->db->get_ResultFeatureAdaptor->fetch_all_by_Slice_ResultSet($slice, $rset, $ec_status, $with_probe); + +} +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/SegmentationFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/SegmentationFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,367 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::SegmentationFeatureAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::SegmentationFeatureAdaptor - A database adaptor for fetching and +storing SegmentationFeature objects. + +=head1 SYNOPSIS + +my $afa = $db->get_SegmentationFeatureAdaptor(); + +my $features = $afa->fetch_all_by_Slice($slice); + + +=head1 DESCRIPTION + +The SegmentationFeatureAdaptor is a database adaptor for storing and retrieving +SegmentationFeature objects. + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::SegmentationFeatureAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::SegmentationFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor); + +#This adaptor does not yet use query extension + +=head2 _tables + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns the names and aliases of the tables to use for queries. + Returntype : List of listrefs of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _tables { + my $self = shift; + + return ( + ['segmentation_feature', 'sf'], + ['feature_set', 'fs'] + ); +} + +=head2 _columns + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns a list of columns to use for queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _columns { + my $self = shift; + + return qw( + sf.segmentation_feature_id sf.seq_region_id + sf.seq_region_start sf.seq_region_end + sf.seq_region_strand sf.feature_type_id + sf.feature_set_id sf.score + sf.display_label + ); +} + + + +=head2 _objs_from_sth + + Arg [1] : DBI statement handle object + Example : None + Description: PROTECTED implementation of superclass abstract method. + Creates SegmentationFeature objects from an executed DBI statement + handle. + Returntype : Listref of Bio::EnsEMBL::SegmentationFeature objects + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + + #For EFG this has to use a dest_slice from core/dnaDB whether specified or not. + #So if it not defined then we need to generate one derived from the species_name and schema_build of the feature we're retrieving. + + + my ($sa, $seq_region_id); + #don't really need this if we're using DNADBSliceAdaptor? + $sa = $dest_slice->adaptor->db->get_SliceAdaptor() if($dest_slice); + $sa ||= $self->db->dnadb->get_SliceAdaptor(); + + + #Some of this in now probably overkill as we'll always be using the DNADB as the slice DB + #Hence it should always be on the same coord system + my $fset_adaptor = $self->db->get_FeatureSetAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + my @features; + my (%fset_hash, %slice_hash, %sr_name_hash, %sr_cs_hash, %ftype_hash); + + my ( + $segmentation_feature_id, $efg_seq_region_id, + $seq_region_start, $seq_region_end, + $seq_region_strand, $ftype_id, + $fset_id, $score, + $display_label + + ); + + $sth->bind_columns( + \$segmentation_feature_id, \$efg_seq_region_id, + \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$ftype_id, + \$fset_id, \$score, + \$display_label, + ); + + + my $asm_cs; + my $cmp_cs; + my $asm_cs_name; + my $asm_cs_vers; + my $cmp_cs_name; + my $cmp_cs_vers; + if ($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + if ($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + } + + + FEATURE: while ( $sth->fetch() ) { + #Need to build a slice adaptor cache here? + #Would only ever want to do this if we enable mapping between assemblies?? + #Or if we supported the mapping between cs systems for a given schema_build, which would have to be handled by the core api + + #get core seq_region_id + #This fails if we are using a 'comparable' CoordSystem as we don't have a cache + #for the new DB. Wasn't this fixed with the tmp seq_region_cache? + $seq_region_id = $self->get_core_seq_region_id($efg_seq_region_id); + + if(! $seq_region_id){ + warn "Cannot get slice for eFG seq_region_id $efg_seq_region_id\n". + "The region you are using is not present in the current seq_region_cache.\n". + "Maybe you need to redefine the dnadb or update_DB_for_release?"; + next; + } + + #Get the FeatureSet object + $fset_hash{$fset_id} = $fset_adaptor->fetch_by_dbID($fset_id) if ! exists $fset_hash{$fset_id}; + $ftype_hash{$ftype_id} = $ftype_adaptor->fetch_by_dbID($ftype_id) if ! exists $fset_hash{$ftype_id}; + + + # Get the slice object + my $slice = $slice_hash{'ID:'.$seq_region_id}; + + if (! $slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{'ID:'.$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + # Remap the feature coordinates to another coord system if a mapper was provided + if ($mapper) { + + throw("Not yet implmented mapper, check equals are Funcgen calls too!"); + + ($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand) + = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); + + # Skip features that map to gaps or coord system boundaries + next FEATURE if !defined $sr_name; + + # Get a slice in the coord system we just mapped to + if ( $asm_cs == $sr_cs || ( $cmp_cs != $sr_cs && $asm_cs->equals($sr_cs) ) ) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} + ||= $sa->fetch_by_region($cmp_cs_name, $sr_name, undef, undef, undef, $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} + ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # If a destination slice was provided convert the coords + # If the destination slice starts at 1 and is forward strand, nothing needs doing + if ($dest_slice) { + unless ($dest_slice_start == 1 && $dest_slice_strand == 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + # Throw away features off the end of the requested slice + if(! $self->force_reslice){ + #force_reslice set by RegulatoryFeature::regulatory_attributes + #so we don't lose attrs which are not on the dest_slice + + next FEATURE if $seq_region_end < 1 || $seq_region_start > $dest_slice_length + || ( $dest_slice_sr_name ne $sr_name ); + + $slice = $dest_slice; + } + } + + + + push @features, Bio::EnsEMBL::Funcgen::SegmentationFeature->new_fast + ( { + start => $seq_region_start, + end => $seq_region_end, + strand => $seq_region_strand, + slice => $slice, + #'analysis' => $fset_hash{$fset_id}->analysis(), + #Need to pass this to keep Feature.pm happy + #Let's grab this from the FeatureSet in SetFeature new and pass + score => $score, + adaptor => $self, + dbID => $segmentation_feature_id, + display_label => $display_label, + set => $fset_hash{$fset_id}, + feature_type => $ftype_hash{$ftype_id}, + } ); + } + + return \@features; +} + + + +=head2 store + + Args : List of Bio::EnsEMBL::Funcgen::SegmentationFeature objects + Example : $ofa->store(@features); + Description: Stores given SegmentationFeature objects in the database. Should only + be called once per feature because no checks are made for + duplicates. Sets dbID and adaptor on the objects that it stores. + Returntype : Listref of stored SegmentationFeatures + Exceptions : Throws if a list of SegmentationFeature objects is not provided or if + the Analysis, CellType and FeatureType objects are not attached or stored + Caller : General + Status : At Risk + +=cut + +sub store{ + my ($self, @segs) = @_; + + if (scalar(@segs) == 0) { + throw('Must call store with a list of SegmentationFeature objects'); + } + + my $sth = $self->prepare(" + INSERT INTO segmentation_feature ( + seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, + feature_type_id, feature_set_id, score, display_label + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + "); + + + my $db = $self->db(); + + + FEATURE: foreach my $seg (@segs) { + + if( !ref $seg || !$seg->isa('Bio::EnsEMBL::Funcgen::SegmentationFeature') ) { + throw('Feature must be an SegmentationFeature object'); + } + + if ( $seg->is_stored($db) ) { + #does not accomodate adding Feature to >1 feature_set + warning('SegmentationFeature [' . $seg->dbID() . '] is already stored in the database'); + next FEATURE; + } + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $seg->feature_set); + + + my $seq_region_id; + ($seg, $seq_region_id) = $self->_pre_store($seg); + + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->bind_param(2, $seg->start(), SQL_INTEGER); + $sth->bind_param(3, $seg->end(), SQL_INTEGER); + $sth->bind_param(4, $seg->strand(), SQL_TINYINT); + $sth->bind_param(5, $seg->feature_type->dbID(), SQL_INTEGER); + $sth->bind_param(6, $seg->feature_set->dbID(), SQL_INTEGER); + $sth->bind_param(7, $seg->score(), SQL_DOUBLE); + $sth->bind_param(8, $seg->display_label(), SQL_VARCHAR); + + + $sth->execute(); + $seg->dbID( $sth->{'mysql_insertid'} ); + $seg->adaptor($self); + } + + return \@segs; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/SetFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/SetFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,515 @@ +# +# Ensembl module for Bio::EnsEMBL::DBSQL::Funcgen::SetFeatureAdaptor +# + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::DBSQL::Funcgen::SetFeatureAdaptor - A base database adaptor for SetFeature adaptors. +storing SetFeature objects. + +=head1 SYNOPSIS + +my $afa = $db->get_SetFeatureAdaptor(); + +my $features = $afa->fetch_all_by_Slice($slice); + + +=head1 DESCRIPTION + +The SetFeatureAdaptor is a base adaptor for all SetFeature adaptors. +e.g. AnnotatedFeatureAdaptor, RegulatoryFeatureAdaptor etc. +It provides common methods across all feature types. + + + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::SetFeature + +=cut + +package Bio::EnsEMBL::Funcgen::DBSQL::SetFeatureAdaptor; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::SetFeature; +use Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor; + +use vars qw(@ISA @EXPORT); +@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor); +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); +#required for child adaptor's store and _obj_from_sth methods + +=head2 fetch_all_by_FeatureType_FeatureSets + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::FeatureType + Arg [3] : (optional) hashref - params hash + associated => 1, #Also return feature which have the associated FeatureType + logic_name => 'analysis.logic_name' + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_FeatureType($slice, $ft); + Description: Retrieves a list of features on a given slice with main or associated FeatureType. + This is mainly used by external FeatureSets which can sometimes have more + than one associated FeatureType. NOTE: This is intended to work for FeatureTypes at the + feature level, not the more generic FeatureSet level FeatureTypes. + Returntype : Listref of Bio::EnsEMBL::SetFeature objects + Exceptions : Throws if no FeatureType object provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_FeatureType_FeatureSets { + my ($self, $ftype, $fsets, $params) = @_; + + if ($self->_feature_class eq 'annotated'){ + throw("This method is not appropriate for FeatureSets with feature_class 'annotated', please use standard fetch_all_by_FeatureSets or get_all_Features on an individual FeatureSet"); + #There is feature_type_id in annotated_feature + #Could warn and redirect + } + + $params->{'logic_name'} ||= undef; + $params->{'associated'} ||= undef; + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype); + my ($table_name, $table_syn) = @{$self->_main_table}; + + my $constraint = $table_syn.'.feature_set_id = fs.feature_set_id AND '. + $table_syn.'.feature_type_id='.$ftype->dbID.' AND '.$table_syn.".feature_set_id ".$self->_generate_feature_set_id_clause($fsets); + + #Should really pass the params hash on here + $constraint = $self->_logic_name_to_constraint($constraint, $params->{logic_name}); + + + my @features = @{$self->generic_fetch($constraint)}; + + + #This is an interim solution and really needs changing! + #Can we genericise this as a lazy loader function? + #Isn't there already something like this in core? + if ($params->{'associated'}){ + + for my $fset(@{$fsets}){ + #We want to bring back features from the same set + #We are not bringing back different class of feature i.e. AnnotatedFeatures and ExternalFeatures + #Let's not catch this, but let it silently fail so peaople. + + next if $table_name ne lc($fset->feature_class).'_feature'; + + my $sql = 'SELECT table_id from associated_feature_type where table_name="'.$table_name.'" and feature_type_id='.$ftype->dbID; + + #CR We are not restricting to feature_set here!!! + + + my @dbids = map $_ = "@$_", @{$self->dbc->db_handle->selectall_arrayref($sql)}; + + if(@dbids){ + $constraint = " $table_syn.${table_name}_id in (".join(',',@dbids).') ' if @dbids; + push @features, @{$self->generic_fetch($constraint, $params->{logic_name})}; + } + } + } + + return \@features; +} + + +=head2 fetch_all_by_Feature_associated_feature_types + + Arg [1] : Bio::EnsEMBL::SetFeature + Arg [2] : (optional) hashref - params hash, all entries optional + -logic_name => 'analysis.logic_name' + -include_direct_links => 1, #Also return feature which are linked by Feature->feature_type + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $sfa->fetch_all_by_Feature_associated_feature_types($feature); + Description: Retrieves a list of all features linked via the associated FeatureTypes of + the given Feature in the same FeatureSet. This is mainly used by external + FeatureSets which can sometimes have more than one associated FeatureType. + Returntype : Listref of Bio::EnsEMBL::Funcgen::SetFeature objects + Exceptions : Throws if SetFeature not stored and valid + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Feature_associated_feature_types { + my ($self, $feature, $params) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::SetFeature', $feature); + + #We always want associated! + #Otherwise it would be just a normal fetch_all_by_FeatureType_FeatureSets query + + $params->{'include_direct_links'} ||= undef; + $params->{'logic_name'} ||= undef; + + my %dbIDs;#Use a hash to reduce dbID redundancy + my ($table_name, $table_syn) = @{$self->_main_table}; + my $fset = $feature->feature_set; + my ($sql, $constraint, @features); + + #Direct FeatureType + if($params->{'include_direct_links'}){ + #Just grab dbIDs here rather than use generic fetch + $sql = "SELECT ${table_name}_id from $table_name where feature_type_id=".$feature->feature_type->dbID.' and feature_set_id='.$fset->dbID; + + #This just sets each value to a key with an undef value + map {$dbIDs{"@$_"} = undef} @{$self->dbc->db_handle->selectall_arrayref($sql)}; + } + + + + my @assoc_ftypes = @{$feature->associated_feature_types}; + + if(@assoc_ftypes){ + + my $ftype_ids = join(', ', (map $_->dbID, @assoc_ftypes)); + + #Now we need to restrict the Features based on the FeatureSet of the original Feature, we could make this optional. + $sql = "SELECT table_id from associated_feature_type aft, $table_name $table_syn where aft.table_name='".$fset->feature_class."_feature' and aft.feature_type_id in ($ftype_ids) and aft.table_id=${table_syn}.${table_name}_id and ${table_syn}.feature_set_id=".$fset->dbID; + #This just sets each value to a key with an undef value + + map {$dbIDs{"@$_"} = undef} @{$self->dbc->db_handle->selectall_arrayref($sql)}; + } + + + #CR do we need to fetch based on main ftype vs assoc ftpes and vice versa + + + if(keys %dbIDs){ + $constraint = " $table_syn.${table_name}_id in (".join(',', keys %dbIDs).')'; + push @features, @{$self->generic_fetch($constraint, $params->{logic_name})}; + } + + + return \@features; +} + + + + +=head2 fetch_all_by_Slice_FeatureType + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::FeatureType + Arg [3] : (optional) string - analysis logic name + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_FeatureType($slice, $ft); + Description: Retrieves a list of features on a given slice, specific for a given FeatureType. + Returntype : Listref of Bio::EnsEMBL::SetFeature objects + Exceptions : Throws if no FeatureType object provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Slice_FeatureType { + my ($self, $slice, $type, $logic_name) = @_; + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $type); + + my $ft_id = $type->dbID(); + + my $constraint = $self->_main_table->[1].".feature_set_id = fs.feature_set_id AND ". + "fs.feature_type_id = '$ft_id'"; + + $constraint = $self->_logic_name_to_constraint($constraint, $logic_name); + + return $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); +} + + +=head2 fetch_all_by_FeatureSets + + Arg [1] : Arrayref of Bio::EnsEMBL::FeatureSet objects + Arg [2] : optional - analysis.logic_name + Example : my $features = $set_feature_adaptor->fetch_all_by_FeatureSets(@fsets); + Description: Retrieves a list of features specific for a given list of FeatureSets. + Returntype : Listref of Bio::EnsEMBL::SetFeature objects + Exceptions : Throws if list provided does not contain FeatureSets or if none provided + Caller : General + Status : At Risk - Maybe not sensible to fetch all data in one call. + +=cut + +sub fetch_all_by_FeatureSets { + my ($self, $fsets, $logic_name) = @_; + + my $constraint = $self->_main_table->[1].'.feature_set_id '.$self->_generate_feature_set_id_clause($fsets); + $constraint = $self->_logic_name_to_constraint($constraint, $logic_name); + + return $self->generic_fetch($constraint); +} + + +=head2 _generate_feature_set_id_clause + + Arg [1] : Arrayref of Bio::EnsEMBL::FeatureSet objects + Example : my $fset_d_clause = $self->_generate_feature_set_id_clause($fsets); + Description: Build feature_set id clause for FeatureSet methods + Returntype : string + Exceptions : Throws if FeatureSets are passed + Throws if FeatureSet feature_class does not match adaptor feature_class + Throws if FeatureSet is not valid + Caller : Bio::EnsEMBL::DBSQL::SetFeatureAdaptor + Status : At Risk + +=cut + +sub _generate_feature_set_id_clause{ + my ($self, $fsets) = @_; + + my @fs_ids; + + if(! ( (ref($fsets) eq 'ARRAY') && + scalar(@{$fsets}) > 0) ){ + throw('Must provide a list of Bio::EnsEMBL::FeatureSet objects'); + } + + my $fclass = $self->_feature_class; + + foreach my $fset (@{$fsets}) { + + if (! (ref($fset) && $fset->isa("Bio::EnsEMBL::Funcgen::FeatureSet"))){ + throw('Not a FeatureSet object'); + } + + if($fset->feature_class ne $fclass){ + throw('FeatureSet feature_class \''.$fclass.'\' does not match adaptor feature_class \''.$fset->feature_class.'\''); + } + + $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + + push (@fs_ids, $fset->dbID()); + } + + return scalar(@fs_ids >1) ? 'IN ('.join(',', @fs_ids).')' : '= '.$fs_ids[0]; +} + + + +=head2 fetch_all_by_Slice_FeatureSets + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Arrayref of Bio::EnsEMBL::FeatureSet objects + Arg [3] : optional - analysis.logic_name + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $features = $ofa->fetch_all_by_Slice_FeatureSets($slice, \@fsets); + Description: Retrieves a list of features on a given slice, specific for a given list of FeatureSets. + Returntype : Listref of Bio::EnsEMBL::SetFeature objects + Exceptions : Throws if list provided does not contain FeatureSets or if none provided + Caller : General + Status : At Risk + +=cut + +sub fetch_all_by_Slice_FeatureSets { + my ($self, $slice, $fsets, $logic_name) = @_; + + my $constraint = $self->_main_table->[1].'.feature_set_id '. + $self->_generate_feature_set_id_clause($fsets); + + $constraint = $self->_logic_name_to_constraint($constraint, $logic_name); + + return $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); +} + + + + +=head2 fetch_Iterator_by_Slice_FeatureSets + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Arrayref of Bio::EnsEMBL::FeatureSet objects + Arg [3] : optional - analysis.logic_name + Arg [4] : optional - iterator chunk length. Default is 1MB + Example : my $slice = $sa->fetch_by_region('chromosome', '1'); + my $iter = $ofa->fetch_Iterator_by_Slice_FeatureSets($slice, \@fsets); + + + Description: Simple Iterator wrapper method for fetch_all_by_Slice_FeatureSets + Returntype : Listref of Bio::EnsEMBL::SetFeature objects + Exceptions : Throws if list provided does not contain FeatureSets or if none provided + Caller : General + Status : At Risk + +=cut + + +sub fetch_Iterator_by_Slice_FeatureSets{ + my ($self, $slice, $fsets, $logic_name, $chunk_length) = @_; + + return $self->fetch_Iterator_by_Slice_method + ($self->can('fetch_all_by_Slice_FeatureSets'), + [$slice, $fsets, $logic_name], + 0,#Slice idx + $chunk_length #default is 1000000 + ); +} + + + + +#This is all done at the level of the feature_set + + + +sub _logic_name_to_constraint { + my $self = shift; + my $constraint = shift; + my $logic_name = shift; + + return $constraint if (!$logic_name); + + my $aa = $self->db->get_AnalysisAdaptor(); + my $an = $aa->fetch_by_logic_name($logic_name); + + if(! $an) { + #warn or throw? + warn("No analysis associated with logic_name $logic_name"); + return undef; + } + + my $an_id = $an->dbID(); + + $constraint .= ' AND' if($constraint); + $constraint .= " fs.analysis_id = $an_id"; + return $constraint; +} + + + +=head2 _default_where_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an additional table joining constraint to use for + queries. + Returntype : List of strings + Exceptions : None + Caller : Internal + Status : At Risk + +=cut + +sub _default_where_clause { + my $self = shift; + + return $self->_main_table->[1].'.feature_set_id = fs.feature_set_id'; +} + + +=head2 _final_clause + + Args : None + Example : None + Description: PROTECTED implementation of superclass abstract method. + Returns an GROUP/ORDER BY clause. + Returntype : String + Exceptions : None + Caller : generic_fetch + Status : At Risk + +=cut + +sub _final_clause { + my $self = shift; + #return ''; + return ' ORDER BY '.$self->_main_table->[1].'.seq_region_id, '.$self->_main_table->[1].'.seq_region_start'; +} + + + +=head2 fetch_all_by_logic_name + + Arg [1] : string $logic_name the logic name of the analysis of features to obtain + Example : $fs = $a->fetch_all_by_logic_name('foobar'); + Description : Returns an arrayref of features created from the database. only + features with an analysis of type $logic_name will be returned. + Returntype : arrayref of Bio::EnsEMBL::SetFeatures + Exceptions : thrown if $logic_name not defined + Caller : General + Status : At risk + +=cut + +#re-implemented for non-standard feature table i.e. points at feature_set + +sub fetch_all_by_logic_name { + my $self = shift; + my $logic_name = shift || throw( "Need a logic_name" ); + + my $constraint; + my $an = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($logic_name); + $constraint = ' '.$self->_main_table->[1].'.feature_set_id=fs.feature_set_id and fs.analysis_id = '.$an->dbID() if($an); + + return (defined $constraint) ? $self->generic_fetch($constraint) : undef; +} + + +=head2 list_dbIDs + + Args : None + Example : my @feature_ids = @{$ofa->list_dbIDs()}; + Description: Gets an array of internal IDs for all SetFeature objects in + the current database. + Returntype : List of ints + Exceptions : None + Caller : ? + Status : Medium Risk + +=cut + +#Put this in the BaseAdaptor? + +sub list_dbIDs { + my $self = shift; + + return $self->_list_dbIDs($self->_main_table->[0]); +} + + +=head2 _feature_class + + Example : if($self->feature_class ne $fset->feature_class){ throw('some error'); } + Description: Convenience method to retrieve the feature class for this adaptor + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub _feature_class{ + my $self = shift; + + #use the first word of the table name as the class + my $fclass; + ($fclass = $self->_main_table->[0]) =~ s/_.*//;#use the first word of the table name as the class + + return $fclass; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/SliceAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DBSQL/SliceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,299 @@ + +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DBSQL::SliceAdaptor +# +# +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DBSQL::SliceAdaptor - A database aware adaptor responsible for +the creation of Slices in the context of eFG objects. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; + + + $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...); + + $slice_adaptor = $db->get_SliceAdaptor(); + + # get a slice on the entire chromosome X + $gene_regulation_slice = $slice_adaptor->fetch_by_Gene_FeatureSets($gene, \@fsets); + + +=head1 DESCRIPTION + +This module is simple wrapper class for the core SliceAdaptor, extending new +methods to generate Slices for eFG features associated with a given gene or transcript. + +=cut + + +package Bio::EnsEMBL::Funcgen::DBSQL::SliceAdaptor; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); + + +@ISA = ('Bio::EnsEMBL::DBSQL::SliceAdaptor'); + +sub new { + my ($caller, $efgdb) = @_; + + my $class = ref($caller) || $caller; + + #Now have to reset the efg db to the dnadb + #Does work using just efg.seq_region_id + #But this is undesirable as we want to perform + #other non seq_region table style queries on core DB + my $self = $class->SUPER::new($efgdb->dnadb); + $self->efgdb($efgdb); + return $self; +} + + +sub efgdb{ + my ($self, $efgdb) = @_; + + if($efgdb){ + + if(! (ref($efgdb) && $efgdb->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + throw('Must provide a Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + $self->{'efgdb'} = $efgdb; + } + + return $self->{'efgdb'}; +} + + + +=head2 fetch_by_Gene_FeatureSets + + Arg [1] : Bio::EnsEMBL::Gene + Example : my $gene_reg_slice = $efg_slice_adaptor->fetch_by_gene($gene); + Description: Fetches a slice by finding the associated regulatory + elements associated with a given gene(and it's transcripts and + translations), and extending the gene feature slice to + encapsulate the associated regulatory elements. + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : throw if incorrent arg provided + Caller : General + Status : At risk + +=cut + +sub fetch_by_Gene_FeatureSets { + my ($self, $gene, $fsets) = @_; + + if(! ( ref($gene) && $gene->isa('Bio::EnsEMBL::Gene'))){ + throw("You must pass a valid Bio::EnsEMBL::Gene object"); + } + + #Now we want to get all the eFG feature associated with this gene and build the slice + #resitrict to RegFeats for now. + my $cs_name = $gene->slice->coord_system_name; + my $gene_chr = $gene->seq_region_name; + my $start = $gene->seq_region_start; + my $end = $gene->seq_region_end; + + + ($start, $end) = $self->_set_bounds_by_xref_FeatureSets + ($gene_chr, $start, $end, $gene, $fsets); + + #Now need to do this for all transcripts and proteins too? + + foreach my $trans(@{$gene->get_all_Transcripts}){ + ($start, $end) = $self->_set_bounds_by_xref_FeatureSets + ($gene_chr, $start, $end, $trans, $fsets); + + my $translation = $trans->translation; + ($start, $end) = $self->_set_bounds_by_xref_FeatureSets + ($gene_chr, $start, $end, $translation, $fsets) if $translation; + } + + return $self->fetch_by_region($cs_name, $gene_chr, $start, $end, $gene->strand); +} + +=head2 fetch_by_Transcript_FeatureSets + + Arg [1] : Bio::EnsEMBL::Transcript + Example : my $trans_reg_slice = $efg_slice_adaptor->fetch_by_transcript($transcript); + Description: Fetches a slice by finding the associated regulatory + elements associated with a given transcript(and it's translation), and + extending the gene feature slice to encapsulate the associated regulatory + elements. + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : throw if incorrent arg provided + Caller : self + Status : at risk + +=cut + +sub fetch_by_Transcript_FeatureSets{ + my ($self, $transcript, $fsets) = @_; + + + if(! ( ref($transcript) && $transcript->isa('Bio::EnsEMBL::Transcript'))){ + throw("You must pass a valid Bio::EnsEMBL::Transcript object"); + } + + #Now we want to get all the eFG feature associated with this gene and build the slice + #resitrict to RegFeats for now. + my $cs_name = $transcript->slice->coord_system_name; + my $trans_chr = $transcript->seq_region_name; + my $start = $transcript->seq_region_start; + my $end = $transcript->seq_region_end; + + + ($start, $end) = $self->_set_bounds_by_xref_FeatureSets + ($trans_chr, $start, $end, $transcript, $fsets); + + #Now need to do this for the protein too + my $translation = $transcript->translation; + ($start, $end) = $self->_set_bounds_by_xref_FeatureSets + ($trans_chr, $start, $end, $translation, $fsets) if $translation; + + + return $self->fetch_by_region($cs_name, $trans_chr, $start, $end, $transcript->strand); +} + +#Do we need fetch_by_translation? + +=head2 _set_bounds_by_xref_FeatureSets + + Arg [1] : string - seq_region_name i.e. chromosome name. + Arg [2] : string - seq_region_start of current slice bound + Arg [3] : string - seq_region_end of current slice bound. + Arg [4] : Bio::EnsEMBL::Gene|Transcript|Translation + Arg [5] : arrayref - Bio::EnsEMBL::Funcgen::FeatureSet + Example : ($start, $end) = $self->_set_bounds_by_regulatory_feature_xref + ($trans_chr, $start, $end, $transcript, $fsets); + Description: Internal method to set an xref Slice bounds given a list of + FeatureSets. + Returntype : List - ($start, $end); + Exceptions : throw if incorrent args provided + Caller : self + Status : at risk + +=cut + +sub _set_bounds_by_xref_FeatureSets{ + my ($self, $chr, $start, $end, $obj, $fsets) = @_; + + my ($extdb_name, $efg_feature); + my $dbe_adaptor = $self->efgdb->get_DBEntryAdaptor; + + + #do we need to test for start/end/chr here? + #Is implicit if we test for obj and fsets, but + #does not check if defined. + + #Set ext_dbname and validate obj + #Do we need a central store for ensembl db names? + + + + + if($obj->isa('Bio::EnsEMBL::Gene')){ + $extdb_name = 'ensembl_core_Gene'; + } + elsif($obj->isa('Bio::EnsEMBL::Transcript')){ + $extdb_name = 'ensembl_core_Transcript'; + } + elsif($obj->isa('Bio::EnsEMBL::Translation')){ + $extdb_name = 'ensembl_core_Translation'; + } + else{ + throw('Currently only handles Ensembl Gene, Transcript and Translation xrefs'); + } + + + #warn "Setting bounds by $obj ".$obj->stable_id; + + #Set which eFG features we want to look at. + + if(ref($fsets) ne 'ARRAY' || scalar(@$fsets) == 0){ + throw('Must define an array of Bio::EnsEMBL::FeatureSets to extend xref Slice bound. You passed: '.$fsets); + } + + my %feature_set_types; + + foreach my $fset(@$fsets){ + $self->efgdb->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + + $feature_set_types{$fset->feature_class} ||= []; + push @{$feature_set_types{$fset->feature_class}}, $fset; + } + + + #We can list the outer loop here and put in the BaseFeatureAdaptor, or possible storable as we do have FeatureType xrefs. + #This would be useful for fetching all the efg features for a given xref and FeatureSets + #Don't implement as a parent sub and call from here as this would mean looping through array twice. + #Altho we could pass a code ref to do the filtering? + #Just copy and paste for now to avoid obfuscation + + + #Get xrefs for each eFG feature set type + foreach my $fset_type(keys %feature_set_types){ + + my $xref_method = 'list_'.$fset_type.'_feature_ids_by_extid'; + #e.g. list_regulatory_feature_ids_by_extid + + my $adaptor_method = 'get_'.ucfirst($fset_type).'FeatureAdaptor'; + + my $adaptor = $self->efgdb->$adaptor_method; + + my %feature_set_ids; + map $feature_set_ids{$_->dbID} = 1, @{$feature_set_types{$fset_type}}; + + #my $cnt = 0; + + + #This should use fetch_all_by_external_name method + + foreach my $efg_feature(@{$adaptor->fetch_all_by_external_name($obj->stable_id, $extdb_name)}){ + next if ! exists $feature_set_ids{$efg_feature->feature_set->dbID}; + next if $efg_feature->seq_region_name ne $chr; + + + #warn "found xref ".$efg_feature->display_label.' with start '.$efg_feature->seq_region_start; + #$cnt ++; + + $start = $efg_feature->seq_region_start if $efg_feature->seq_region_start < $start; + $end = $efg_feature->seq_region_end if $efg_feature->seq_region_end > $end; + } + + #warn "Found $cnt $fset_type xrefs"; + + } + + + + return ($start, $end); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/DataSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/DataSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,595 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::DataSet +# +# You may distribute this module under the same terms as Perl itself + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::DataSet - A module to represent DataSet object. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::DataSet; + +my $data_set = Bio::EnsEMBL::Funcgen::DataSet->new( + -DBID => $dbID, + -ADAPTOR => $self, + -SUPPORTING_SETS => [$rset], + -FEATURE_SET => $fset, + -DISPLAYABLE => 1, + -NAME => 'DATASET1', + ); + + + +=head1 DESCRIPTION + +A DataSet object provides access to either or both raw results and AnnotatedFeatures +for a given experiment within a Slice, associated with set wide experimental meta data. +This was aimed primarily at easing access to data via the web API by creating +a wrapper class with convenience methods. The focus of this class is to contain raw and +associated processed/analysed data to be displayed as a set within the browser i.e. an +experiment may have different cell lines, features or time points, these would require different DataSets. +# However a DataSet may contain mixed data types i.e. promoter & histone???? No give separate sets? +May have duplicates for raw data but only one predicted features track?? +The data in this class is kept as lightweight as possible with data being loaded dynamically. + + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::DataSet; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); +#Should not be a Set as is sufficiently different + + +=head2 new + + + + Example : my $dset = Bio::EnsEMBL::Funcgen::DataSet->new( + -SUPPORTING_SETS => [$fset1, $fset2], + -FEATURE_SET => $fset, + -DISPLAYABLE => 1, + -NAME => 'DATASET1', + ); + + Description: Constructor for DataSet objects. + Returntype : Bio::EnsEMBL::Funcgen::DataSet + Exceptions : Throws if no experiment_id defined + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #do we need to add $fg_ids to this? Currently maintaining one feature_group focus.(combi exps?) + my ($fset, $sets, $name) + = rearrange(['FEATURE_SET', 'SUPPORTING_SETS', 'NAME'], @_); + + + my @caller = caller(); + + #do we need to passexperiment_id to check that table_name/id correspond for storage? + #'EXPERIMENT_ID', 'EXPERIMENT_IDS', + + #Can have more than one experiment_id for a combined feature set. But shouldn't query like that. + #therefore we need to be able to track back from feature to ec's rather than exps. + #as there may be mixed data in an exp which didn't necessarily contribute to the combined feature + #We are now separating potentially different featuretype from the same exp into different result_groups + #therefore we only have to track back to the result_group e.g. the contig chip set + + #We also need a way of pulling back GOLDEN/combined resultssets based on feature_set_id + #Set status as GOLDEN, then pull back displayable or GOLDEN raw results + + #Could link experiment_feature_type/feature_set to ec or result_set table? + #latter would mean we don't have to specifiy which ec, just part of set. + #This will make it easier for populating pfs but will mean that we can't easily track back to a particular ec without doing some probe/slice look up via the array chip. + #Not really a requirement, so let's take this hit. + + #Could then maybe use DataSet to store pfs, otherwise we'd have to pass the rset or at the very least the result_set_id. + #do we need some control of creating new objects with dbID and adding result_groups/feature_sets and them storing/updating them + #potential for someone to create one from new using a duplicate dbID and then linking incorrect data to a pre-existing ResultGroup + #can we check wether caller is DataSetAdaptor if we have dbID? + + if($self->dbID() && $caller[0] ne "Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor"){ + throw('You must use the DataSetAdaptor to generate DataSets with dbID i.e. from the DB,'. + ' as this module accomodates updating which may cause incorrect data if the object'. + ' is not generated from the DB'); + } + + + $self->{'supporting_sets'} ||= {}; + #throw("Must specify at least one Result/FeatureSet") if((! $sets) && (! $fset)); + #removed this to allow generation of DataSets without feature sets + #could reimplement this if we change the DataSetAdaptor::_obj_from_sth + + $self->add_supporting_sets($sets) if $sets; + $self->product_FeatureSet($fset) if $fset; + $self->name($name) if $name; + + return $self; +} + + + + + + + +#methods +#set wide display label(predicted_feature) + more wordy label for wiggle tracks? +#defined by experiment type i.e. time course would require timepoint in display label +#deal with this dynamically or have display_label in table +#Need call on type, or fetch all would + +#_get_ec_ids or contigsets? +#this should now be an intrinsic part of this class/adaptor + +#cell line +#feature_type +#displayable...should have one for the whole set and one for each raw and predicted? + +#have analysis as arg? Or do we get all analysis sets? +#we need to be able to set analyses for DataSets dynamically from DB +#pick up all DataSets +#displayable field in DataSets also? + +#If we have mixed types in the same experiment then we could get promoter features and histone wiggle tracks displayed togeter +#Not v.good for display purposes? We may want to separate the promoter and histone tracks, or we may want ll the experiment data together but of mixed types. +#We need to be able to pull back the experiment type for each set, therefore this needs setting on an ec level, not an experiment level. +#This is also v.reliant on putting contig set info in place, otherwise we may get mixed chip types in same set. + +#get_raw_analysis_name +#get_predicted_feature_analysis_name +#set ResultFeatures and AnnotatedFeatures in hash keyed by analysis_name? + +#Need to change to simple accessor +#or should we maintain to provide explicit method for delineating between parent and supporting FeatureSets? +#yes, and sub the feature_type/cell_type checks + + +=head2 product_FeatureSet + + Arg [1] : (optional) Bio::EnsEMBL::Funcgen::FeatureSet + Example : $data_set->product_FeatureSet($fset); + Description: Getter and setter for the main feature_set attribute for this DataSet. + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet + Exceptions : Throws not a valid FeatureSet or if main feature_set has already been set. + Caller : General + Status : At Risk - change to get_product_FeatureSet + +=cut + +sub product_FeatureSet { + my ($self, $fset) = @_; + + if($fset){ + + if (! ($fset && ref($fset) && $fset->isa("Bio::EnsEMBL::Funcgen::FeatureSet"))){ + throw("Need to pass a valid Bio::EnsEMBL::Funcgen::FeatureSet") + } + + if(defined $self->{'feature_set'}){ + throw("The main feature_set has already been set for this DataSet, maybe you want add_SupportingSets?"); + } + else{ + $self->_validate_and_set_types($fset); + $self->{'feature_set'} = $fset; + } + } + + return $self->{'feature_set'}; +} + + +=head2 add_supporting_sets + + Arg [1] : Array of Bio::EnsEMBL::Feature/ResultSet object + Example : $dset->add_supporting_sets($rset); + Description: Adds Result/FeatureSets to the DataSet + Returntype : none + Exceptions : Throws if set not valid for supporting_set type of DataSet + Throws if supporting_sets is not an array ref + Caller : General + Status : At Risk + +=cut + + +sub add_supporting_sets { + my ($self, $sets) = @_; + + #should we handle displayable here, and propogate to the ResultSet if update_status is set + #is there scope to write a Funcgen::Storable, which provides convenience methods to StatusAdaptor? + #would have to make sure Feature object also inherited from Funcgen::Storable aswell as BaseFeature + + throw("Supporting sets need to be a reference to an ARRAY:\t".$sets) if ref($sets) ne 'ARRAY'; + + foreach my $set(@$sets){ + + if(!(ref($set) && $set->isa('Bio::EnsEMBL::Funcgen::Set') && $set->set_type ne 'data' && $set->dbID)){ + throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::Set which is not a DataSet:\t$set"); + } + #set type cannot be data at present as it does not inherit from Set.pm + + + + #Only validate if we are dealing with result type data + #As we can have various cell/feature_types for compound analyses e.g. RegulatoryFeatures + + $self->_validate_and_set_types($set) if $set->set_type() ne 'feature'; + + #should ResultSet/Adaptor contain all the fetch_methods, and leave DataSet as a kind of organisational class as a single point of access. + #DataSetAdaptor to perform the ordering according to feature/celltype + #This will still not resolve the complex data sets which can be accomodated by the DB. + #Maybe we can keep the data sets as simple as there are and confer the association by tracking back to the experiment? + #Would there only ever be one experiment for a complex data_set? + + + #Can have more than one experiment for a compound feature set, would we ever want to display raw data? + #This is actually an easier problem unless we are displaying two feature types(i.e. complex and compound) + + + $self->{'supporting_sets'}->{$set->analysis->dbID()} ||= (); + push @{$self->{'supporting_sets'}->{$set->analysis->dbID()}}, $set; + } + + return; +} + + +=head2 _validate_and_set_types + + Arg [1] : Bio::EnsEMBL::Feature/ResultSet object + Example : $dset->_validate_and_set_types($rset); + Description: Validates and sets DataSet cell and feature types + Returntype : none + Exceptions : Throws if types not valid + Caller : General + Status : At Risk + +=cut + + +sub _validate_and_set_types{ + my ($self, $set) = @_; + + #slightly dodgy bypassing methods, but extendable + + #This currently restricts all set types to one cell and feature type + #this is incorrect for feature_set types as we want to munge several feature and possibly cell types + #into one combined data set. + #this should set it to the FeatureSet type if is feature_set data_set + #this only works as we only validate supporting_sets if type is not feature + + for my $type('feature_type', 'cell_type'){ + + if(defined $self->{$type}){ + + #Need to test isa here? Why is this passing the defined test if not set? + if($set->{$type}->name() ne $self->{$type}->name()){ + + throw(ref($set)." $type(".$set->{$type}->name(). + ") does not match DataSet $type(".$self->{$type}->name().")"); + + } + } + else{ + $self->{$type} = $set->{$type}; + } + } + + return; +} + + + +=head2 get_supporting_sets_by_Analysis + + Arg [1] : Bio::EnsEMBL::Funcgen:Analysis + Arg [2] : (optional) status - e.g 'DISPLAYABLE' + Example : my $anal_sets = @{$result_set->get_ResultSets_by_Analysis($analysis)}; + Description: Getter for the SupportingSet objects of a given Analysis. + Returntype : ARRAYREF + Exceptions : Throws if arg is not a valid stored Bio::EnsEMBL::Anaylsis + Caller : General + Status : At Risk + +=cut + +sub get_supporting_sets_by_Analysis { + my ($self, $analysis, $status) = @_; + + + my @rsets; + + + #should we handle displayable here, and propogate to the ResultSet if update_status is set + #is there scope to write a Funcgen::Storable, which provides convenience methods to StatusAdaptor? + #would have to make sure Feature object also inherited from Funcgen::Storable aswell as BaseFeature + + + if (! ($analysis->isa("Bio::EnsEMBL::Analysis") && $analysis->dbID())){ + throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::ResultSet"); + } + + #will have to generate new array of object here if we want to filter displayable + #This may result in returning a ref to the stored ResultSets for no status + #And a ref to the abstracted/filtered i.e. non-stored ResultSets if we have a status + #This could cause problems if people want to edit the real ResultSets via the refs + #If we edit the ResultSets like this, we would still store via their adaptor + #so would need to refresh DataSet anyway. + + #should ResultSet/Adaptor contain all the fetch_methods, and leave DataSet as a kind of organisational class as a single point of access. + #DataSetAdaptor to perform the ordering according to feature/celltype + #This will still not resolve the complex data sets which can be accomodated by the DB. + #Maybe we can keep the data sets as simple as there are and confer the association by tracking back to the experiment? + #Would there only ever be one experiment for a complex data_set? + + + #Can have more than one experiment for a compound feature set, would we ever want to display raw data? + #This is actually an easier problem unless we are displaying two feature types(i.e. complex and compound) + + #could we have >1 rset with the same analysis? + + foreach my $anal_rset(@{$self->{'supporting_sets'}->{$analysis->dbID()}}){ + + if(! defined $status){ + push @rsets, $anal_rset; + } + elsif($anal_rset->has_status($status)){ + push @rsets, $anal_rset; + } + } + + return \@rsets; +} + + + +=head2 get_supporting_sets + + Arg [1] : (optional) status - e.g 'DISPLAYABLE' + Example : my @status_sets = @{$data_set->get_supporting_sets($status)}; + Description: Getter for the ResultSets for this DataSet. + Returntype : Arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_supporting_sets{ + my ($self, $status, $set_type) = @_; + #swap the args here + + #Add analysis here and make above method wrapper + + #Validate type here + if($set_type && + ($set_type ne 'result' && + $set_type ne 'feature' && + $set_type ne 'input')){ + throw("You have specified an invalid supporting set type:\t$set_type"); + } + + + my @ssets; + + foreach my $anal_id(keys %{$self->{'supporting_sets'}}){ + + foreach my $sset(@{$self->{'supporting_sets'}->{$anal_id}}){ + + if(defined $status && + (! $sset->has_status($status))){ + next; + } + + if(defined $set_type && + ($sset->set_type ne $set_type)){ + next; + } + + push @ssets, $sset; + } + } + + return \@ssets; +} + + + + +=head2 get_displayable_supporting_sets + + Example : my @displayable_rsets = @{$result_set->get_displayable_supporting_sets()}; + Description: Convenience method for web display + Returntype : Arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_displayable_supporting_sets{ + my ($self, $set_type) = @_; + + return $self->get_supporting_sets('DISPLAYABLE', $set_type); +} + + + +=head2 get_displayable_product_FeatureSet + + Example : my $fset = $data_set->get_displayable_product_FeatureSet(); + Description: Convenience method for web display + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_displayable_product_FeatureSet{ + my $self = shift; + + return $self->product_FeatureSet->has_status('DISPLAYABLE') ? $self->product_FeatureSet() : undef; +} + + + + + +=head2 name + + Example : my $dset->name('DATASET1'); + Description: Getter/Setter for the name of this DataSet. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub name { + my $self = shift; + + $self->{'name'} = shift if @_; + + return $self->{'name'}; +} + + + + +#The following attributes are generated dynamically from the consituent Result/FeatureSets + +=head2 cell_type + + Example : my $dset_ctype_name = $dset->cell_type->name(); + Description: Getter for the cell_type for this DataSet. + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub cell_type { + my $self = shift; + + return $self->{'cell_type'}; +} + +=head2 feature_type + + Example : my $dset_ftype_name = $dset->feature_type->name(); + Description: Getter for the feature_type for this DataSet. + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub feature_type { + my $self = shift; + + return $self->{'feature_type'}; +} + + + + + +=head2 display_label + + Example : print $rset->display_label(); + Description: Getter for the display_label attribute for this DataSet. + This is more appropriate for teh predicted_features of the set. + Use the individual display_labels for each raw result set. + Returntype : str + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub display_label { + my $self = shift; + + + #Add display label in table? + + if(! $self->{'display_label'}){ + + #This does not account for DataSet without a product FeatureSet + my $fset = $self->product_FeatureSet; + + if($fset && ($fset->feature_type->class() eq 'Regulatory Feature')){ + $self->{'display_label'} = 'Regulatory Features'; + } + else{ + + $self->{'display_label'} = $self->feature_type->name()." -"; + $self->{'display_label'} .= " ".($self->cell_type->display_label() || + $self->cell_type->description() || + $self->cell_type()->name()); + $self->{'display_label'} .= " Enriched Sites"; + } + } + + return $self->{'display_label'}; +} + + +#sub get_type_config{ +# my ($self) = @_; +# +# if (! defined $self->{type_config}){ +# $self->{type_config} = $self->adaptor->fetch_type_config_by_DataSet($self); +# } +# +# return $self->{type_config}; +#} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Experiment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Experiment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,439 @@ + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Experiment + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::Experiment; + +my $exp = Bio::EnsEMBL::Funcgen::Experiment->new + ( + -ADAPTOR => $self, + -NAME => $name, + -EXPERIMENTAL_GROUP => $experimental_group, + -DATE => $date, + -PRIMARY_DESIGN_TYPE => 'binding_site_indentification', + -DESCRIPTION => $description, + -ARCHIVE_ID => $archive_id, + ); + +my $db_adaptor = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...); +my $exp_adaptor = $db_adaptor->get_ExperimentAdaptor(); +my $exp = $exp_adaptor->fetch_by_name($exp_name) + +=head1 DESCRIPTION + +The Experiment class represents an instance of an experiment i.e. a discrete set of data + +=cut + + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Experiment; + +use warnings; +use strict; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::Storable; +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + + +=head2 new + + Arg [-NAME] : String - experiment name + Arg [-EXPERIMENTAL_GROUP] : Bio::EnsEMBL::Funcgen ExperimentalGroup associated with this experiment + Arg [-DATE] : String - Date of the experiment (YYYY-MM-DD) + Arg [-PRIMARY_DESIGN_TYPE] : String - MGED term for the primary design of teh experiment e.g. binding_site_identification + Arg [-DESCRIPTION] : String + + Example : my $array = Bio::EnsEMBL::Funcgen::Experiment->new + ( + -NAME => $name, + -EXPERIMENTAL_GROUP => $group, + -DATE => $date, + -PRIMARY_DESIGN_TYPE => $p_design_type, + -DESCRIPTION => $description, + ); + + Description: Creates a new Bio::EnsEMBL::Funcgen::Experiment object. + Returntype : Bio::EnsEMBL::Funcgen::Experiment + Exceptions : Throws if name not defined + Throws if ExperimentalGroup not valid + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($name, $group, $date, $p_dtype, $desc, $archive_id, $data_url, $xml_id, $xml) + = rearrange( ['NAME', 'EXPERIMENTAL_GROUP', 'DATE', 'PRIMARY_DESIGN_TYPE', + 'DESCRIPTION','ARCHIVE_ID', 'DATA_URL', 'MAGE_XML', 'MAGE_XML_ID'], @_ ); + + + #Added in v68 + #Remove in v69 + if($data_url || $archive_id){ + throw('The -data_url and -archive_id parameters have been moved to the InputSubSet class'); + } + + + #Mandatory attr checks + + if(ref($group) ne 'Bio::EnsEMBL::Funcgen::ExperimentalGroup'){ + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::ExperimentalGroup object"); + } + + if(! defined $name){ + throw('You must provide a name parameter'); + } + + #test date format here? + + + #Direct assignment here so we avoid setter test in methods + $self->{name} = $name; + $self->{group} = $group; + $self->{date} = $date if defined $date; + $self->{primary_design_type} = $p_dtype if defined $p_dtype; #MGED term for primary design type + $self->{description} = $desc if defined $desc; + + #Maintain setter funcs here as these are populated after initialisation + $self->mage_xml_id($xml_id) if defined $xml_id; + $self->mage_xml($xml) if defined $xml; + + return $self; +} + + +### ACCESSOR METHODS ### + +=head2 name + + Example : my $exp_name = $exp->name; + Description : Getter for the experiment name + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub name{ + return $_[0]->{'name'}; +} + + +=head2 experimental_group + + Example : my $exp_group_name = $exp->experimental_group()->name(); + Description : Getter for the experimental group + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalGroup + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub experimental_group{ + return $_[0]->{'group'}; +} + + +=head2 date + + Example : my $exp_date = $exp->date; + Description : Getter for the date + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub date{ + return $_[0]->{'date'}; +} + + +=head2 description + + Example : my $exp_desc = $exp->description + Description : Getter for the experiment description + Returntype : String + Exceptions : None + Caller : General + Status : At risk - Not used, was stable until v64 + +=cut + +sub description{ + return $_[0]->{'description'}; +} + + + + + +=head2 primary_design_type + + Example : my $pdt = $exp->primary_design_type; + Description : Getter for the primary design type + Returntype : String - MGED term + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub primary_design_type{ + return $_[0]->{'primary_design_type'}; +} + + +# Accessor/Setter methods + +=head2 mage_xml + + Arg [1] : string(optional) - MAGE XML + Example : my $xml = $exp->mage_xml(); + Description : Getter/Setter for the mage_xml attribute + Returntype : String + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub mage_xml{ + my ($self) = shift; + + $self->{'mage_xml'} = shift if(@_); + + #use fetch_attrs? + if(! exists $self->{'mage_xml'} && $self->mage_xml_id()){ + $self->{'mage_xml'} = $self->adaptor->fetch_mage_xml_by_Experiment($self); + } + + return (exists $self->{'mage_xml'}) ? $self->{'mage_xml'} : undef; +} + + +=head2 mage_xml_id + + Arg [1] : int (optional) - mage_xml_id + Example : $exp->group_db_id('1'); + Description : Getter/Setter for the mage_xml attribute + Returntype : String + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub mage_xml_id{ + my $self = shift; + + $self->{'mage_xml_id'} = shift if @_; + + return $self->{'mage_xml_id'}; +} + + + + + + +#These convenience methods are to provide a registry for the experimental chips of the experiment + +=head2 get_ExperimentalChips + + Example : my $exp_chips = @{$exp->get_ExperimentalChips()} + Description : Retrieves all ExperiemntalChips + Returntype : Listref of ExperimentalChips + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub get_ExperimentalChips{ + my ($self) = shift; + + #should this also store echips? + + #Need to retrieve all from DB if not defined, then check whether already present and add and store if not + #what about immediate access to dbID + #should we separate and have add_ExperimentalChip? + + + + if(! exists $self->{'experimental_chips'}){ + $self->{'experimental_chips'} = {}; + + #need to warn about DBAdaptor here? + + foreach my $echip(@{$self->adaptor->db->get_ExperimentalChipAdaptor->fetch_all_by_experiment_dbID($self->dbID())}){ + $self->{'experimental_chips'}->{$echip->unique_id()} = $echip; + } + } + + #is this returning a list or a listref? + return [values %{$self->{'experimental_chips'}}]; +} + +=head2 add_ExperimentalChip + + Example : $exp_chip = $exp->add_ExperimentalChip($exp_chip) + Description : Adds and stores an ExperiemntalChip for this Experiment + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalChip + Exceptions : Throws is not passed a valid stored Bio::EnsENBML::Funcgen::ExperimentalChip + Caller : General + Status : At risk + +=cut + +sub add_ExperimentalChip{ + my ($self, $echip) = @_; + + + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::ExperimentalChip object") + if(! $echip->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip") || ! $echip->dbID()); + + if(! exists $self->{'experimental_chips'}){ + $self->get_ExperimentalChips(); + $self->{'experimental_chips'}->{$echip->unique_id()} = $echip; + #do this here without checking to avoid probelm of retrieving first stored chip + }elsif(exists $self->{'experimental_chips'}->{$echip->unique_id()}){ + warn("You cannot add the same ExperimentalChip(".$echip->unique_id().")to an Experiment more than once, check your code"); + }else{ + $self->{'experimental_chips'}->{$echip->unique_id()} = $echip; + } + + return; +} + +=head2 get_ExperimentalChip_by_unique_id + + Example : $exp_chip = $exp->add_ExperimentalChip($exp_chip) + Description : Adds and stores an ExperiemntalChip for this Experiment + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalChip + Exceptions : Throws if no uid supplied + Caller : General + Status : At risk + +=cut + +sub get_ExperimentalChip_by_unique_id{ + my ($self, $uid) = @_; + + my ($echip); + + throw("Must supply a ExperimentalChip unque_id") if(! defined $uid); + + $self->{'experimental_chips'} || $self->get_ExperimentalChips(); + + if(exists $self->{'experimental_chips'}->{$uid}){ + $echip = $self->{'experimental_chips'}->{$uid}; + } + #should we warn here if not exists? + + return $echip; +} + + +=head2 get_ExperimentalChip_unique_ids + + Example : foreach my $uid(@{$self->experiment->get_ExperimentalChip_unique_ids()}){ ... } + Description : retrieves all ExperimentalChip unique_ids + Returntype : ListRef + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub get_ExperimentalChip_unique_ids{ + my $self = shift; + + $self->{'experimental_chips'} || $self->get_ExperimentalChips(); + + return [keys %{ $self->{'experimental_chips'}}]; +} + + + + +### Deprecated methods ### + + +sub group{ + my $self = shift; + + deprecate("group is deprecated experimental_group instead"); + throw("You are trying to set a experimental group name using a deprecated method") if @_; + return $self->experimental_group()->name; +} + + + +sub group_id{ + my ($self) = shift; + + deprecate("Experiment->group_id is deprecated. Use exp->experimental_group->dbID instead"); + return $self->experimental_group()->dbID; +} + + + +sub archive_id{ #deprecated in v68 + #would deprecate, but no easy way of doing this reliably + throw("Use InputSubset->archive_id"); +} + + +sub data_url{ #deprecated in v68 + #would deprecate, but no easy way of doing this reliably + throw("Use InputSubset->display_url"); +} + + +sub source_info{ #deprecated in v68 + #would deprecate, but no easy way of doing this reliably + throw("Use InputSubset->source_info"); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalChip.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalChip.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,460 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ExperimentalChip +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::ExperimentalChip - A module to represent a physical unique experimental chip. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ExperimentalChip; + +my $ec = Bio::EnsEMBL::Funcgen::ExperimentalChip->new( + -dbID => $ec_id, + -unique_id => $c_uid, + -experiment_id => $exp_id, + -array_chip_id => $ac_id, + -feature_type => $ftpye, + -cell_type => $ctype, + -chip_set_id => 1, + ); + + +=head1 DESCRIPTION + +An ExperimentalChip object represent a physical array chip/slide used in an experiment. The data +(currently the unique_id, experiment_id, array_chip_id, and description) are stored +in the experimental_chip table. + +=cut + +use strict; +use warnings; + + +package Bio::EnsEMBL::Funcgen::ExperimentalChip; + + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-unique_id] : int - the unique id of this individual experimental chip + Arg [-experiment_id] : int - the experiment dbID + Arg [-array_chip_id] : int - the dbID or the array_chip + Arg [-feature_type ] : Bio::EnsEMBL::Funcgen::FeatureType + Arg [-cell_type ] : Bio::EnsEMBL::Funcgen::CellType + Arg [-biological_replicate ] : string - the name to define the biological replicate set + Arg [-technical_replicate ] : string - the name to define the technical replicate set + + + Example : my $array = Bio::EnsEMBL::Funcgen::ExperimentalChip->new( + -dbID => $ec_id, + -unique_id => $c_uid, + -experiment_id => $exp_id, + -array_chip_id => $ac_id, + -feature_type => $ftype, + -cell_type => $ftype, + -biological_replicate => 'BIOREP1', + -technical_replicate => 'techrep_1', + ); + Description: Creates a new Bio::EnsEMBL::Funcgen::ExperimentalChip object. + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalChip + Exceptions : None ? should throw if mandaotry params not set + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #can we lc these? + my ($c_uid, $exp_dbid, $ac_id, $ftype, $ctype, $brep, $trep) + = rearrange( ['UNIQUE_ID', 'EXPERIMENT_ID', 'ARRAY_CHIP_ID', 'FEATURE_TYPE', 'CELL_TYPE', 'BIOLOGICAL_REPLICATE', 'TECHNICAL_REPLICATE'], @_ ); + + + $self->unique_id($c_uid) if defined $c_uid; + $self->experiment_id($exp_dbid) if defined $exp_dbid; + $self->array_chip_id($ac_id) if defined $ac_id; + $self->feature_type($ftype) if defined $ftype; + $self->cell_type($ctype) if defined $ctype; + $self->biological_replicate($brep) if defined $brep; + $self->technical_replicate($trep) if defined $trep; + return $self; +} + +=head2 get_Experiment + + Args : None + Example : my $exp = $exp_chip->get_Experiment(); + Description: Returns the Experiment which this ExperimentalChip belongs to. + Returntype : Bio::EnsEMBL::Funcgen::Experiment + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Experiment { + my $self = shift; + + if (! $self->{'experiment'}){ + + if ($self->dbID() && $self->adaptor() ) { + $self->{'experiment'} = $self->adaptor->db->get_ExperimentAdaptor->fetch_by_dbID($self->experiment_id); + } else { + warning('Need database connection to retrieve Experiment'); + } + } + + + return $self->{'experiment'}; +} + + + +=head2 get_Channels + + Args : None + Example : my $channels = $exp_chip->get_Channels(); + Description: Returns all channels on a ExperimentalChip. Needs a database connection. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Channel objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Channels { + my $self = shift; + + if (! $self->{'channels'}){ + + if ($self->dbID() && $self->adaptor() ) { + foreach my $channel (@{$self->adaptor->db->get_ChannelAdaptor->fetch_all_by_ExperimentalChip($self)}){ + $self->add_Channel($channel); + } + } else { + warning('Need database connection to retrieve Channels'); + } + } + + return [values %{$self->{'channels'}}]; +} + + +=head2 add_Channel + + Args : Bio::EnsEMBL::Funcgen::Channel + Example : $exp_chip->add_channel($chan); + Description: Sets ad channel object for the ExperimentalChip + Returntype : Listref of Bio::EnsEMBL::Funcgen::Channel objects + Exceptions : warns if Channel already set + Caller : General + Status : At Risk + +=cut + +sub add_Channel{ + my ($self, $chan) = @_; + + + if(! ($chan && $chan->isa("Bio::EnsEMBL::Funcgen::Channel") && $chan->dbID())){ + throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::Channel object"); + } + + + $self->{'channels'} ||= {}; + + if (exists $self->{'channels'}->{$chan->dbID()}){ + #should this throw? + #This currently prevents haveing to check whether a channel has already been added + #If we were duplicating then we probably would have a different dbID + warn("You cannot add the same Channel to an ExperimentalChip more than once"); + }else{ + + + ##change this to key on freq? + + $self->{'channels'}{$chan->dbID()} = $chan; + } + + return; +} + + + +=head2 get_channel_ids + + Args : None + Example : my @channel_ids = @{$array->get_channel_ids()}; + Description: Returns all channel ids for an ExperimentalChip. Needs a database connection. + Returntype : List of ints + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_channel_ids{ + my $self = shift; + + $self->get_Channels(); + + return [keys %{$self->{'channels'}}]; +} + +=head2 get_Channel_by_dye + + Args : string - dye used in channel + Example : my $chan = $echip->get_Channel_by_dye("CY5"); + Description: Returnsthe channel corresponding to the frequency specified + Returntype : Bio::EnsEMBL::Funcgen::Channel + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Channel_by_dye{ + my ($self, $dye) = @_; + + my @chans; + + foreach my $chan(@{$self->get_Channels()}){ + push @chans, $chan if uc($chan->dye()) eq uc($dye); + } + + throw("Found more than one Channels with the same dye") if(scalar(@chans) > 1); + + + return (@chans) ? $chans[0] : undef; +} + +=head2 contains_Channel + + Args [1] : Bio::EnsEMBL::Funcgen::Channel + Example : if(! $echip->contains_Channel($chan){..add channel ..}; + Description: Checks whether this Channel has already been added to the ExperimentalChip + Returntype : Boolean + Exceptions : Throws if arg not a valid stored Bio::EnseMBL::Funcgen::Channel + Caller : General + Status : At Risk + +=cut + +sub contains_Channel{ + my ($self, $chan) = @_; + + if(! ($chan && $chan->isa("Bio::EnsEMBL::Funcgen::Channel") && $chan->dbID())){ + throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::Channel object"); + } + + $self->get_Channels(); + + my $contains = 0; + + $contains = 1 if(exists $self->{'channels'}->{$chan->dbID()}); + + return $contains; +} + + + +=head2 unique_id + + Arg [1] : (optional) int - the unique chip id for this ExperimentalChip + Example : my $c_uid = $array->unique_id(); + Description: Getter, setter unique_id attribute. + Returntype : string + Exceptions : None + Caller : General + Status : at Risk + +=cut + +sub unique_id { + my $self = shift; + $self->{'unique_id'} = shift if @_; + return $self->{'unique_id'}; +} + +=head2 feature_type + + Arg [1] : (optional) Bio::EnsEMBL::Funcgen::FeatureType + Example : $ec->feature_type($ftype); + Description: Getter/Setter thefeature_type attribute. + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : Throws if arg is not a Bio::EnsEMBL::FeatureType + Caller : General + Status : At Risk + +=cut + +sub feature_type { + my $self = shift; + + if(@_){ + throw("Must pass a valid Bio::EnsEMBL::Funcgen::FeatureType object") if (! $_[0]->isa("Bio::EnsEMBL::Funcgen::FeatureType")); + $self->{'feature_type'} = shift; + } + + return $self->{'feature_type'}; +} + + +=head2 cell_type + + Arg [1] : (optional) Bio::EnsEMBL::Funcgen::CellType + Example : $ec->cell_type($ctype); + Description: Getter/Setter the cell_type attribute. + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : Throws if arg is not a Bio::EnsEMBL::CellType + Caller : General + Status : At Risk + +=cut + +sub cell_type { + my $self = shift; + + if(@_){ + throw("Must pass a valid Bio::EnsEMBL::Funcgen::CellType object") if (! $_[0]->isa("Bio::EnsEMBL::Funcgen::CellType")); + $self->{'cell_type'} = shift; + } + + return $self->{'cell_type'}; +} + + + +=head2 biological_replicate + + Arg [1] : (optional) string - the name or number of the chip biological replicate set + Example : $ec->biological_replicate('SAMPLENAME_BR1'); + Description: Getter, setter for the biological_replicate attribute. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub biological_replicate { + my $self = shift; + $self->{'biological_replicate'} = shift if @_; + return $self->{'biological_replicate'}; +} + +=head2 technical_replicate + + Arg [1] : (optional) string - the name or number of the chip technical replicate set + Example : $ec->technical_replicate('SAMPLENAME_BR1_TR1'); + Description: Getter, setter for the technical_replicate attribute. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub technical_replicate { + my $self = shift; + $self->{'technical_replicate'} = shift if @_; + return $self->{'technical_replicate'}; +} + + + +=head2 experiment_id + + Arg [1] : (optional) int - the experiment dbID + Example : my $exp_id = $array->experiment_id(); + Description: Getter, setter experiment_id attribute + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub experiment_id { + my $self = shift; + $self->{'experiment_id'} = shift if @_; + return $self->{'experiment_id'}; +} + +=head2 array_chip_id + + Arg [1] : (optional) int - the array_chip dbID + Example : my $ac_id = $ec->array_chip_id(); + Description: Getter, setter array_chip_id attribute + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub array_chip_id { + my $self = shift; + $self->{'array_chip_id'} = shift if @_; + return $self->{'array_chip_id'}; +} + +=head2 get_ArrayChip + + Example : my $array_chip = $exp_chip->get_ArrayChip(); + Description: Getter for the array_chip attribute + Returntype : Bio::EnsEMBL::Funcgen::ArrayChip + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_ArrayChip { + my $self = shift; + + if(! defined $self->{'array_chip'}){ + $self->{'array_chip'} = $self->adaptor->db->get_ArrayChipAdaptor()->fetch_by_dbID($self->array_chip_id()); + } + + return $self->{'array_chip'}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalGroup.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalGroup.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,225 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ExperimentalGroup +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::ExperimentalGroup - A module to represent +an ExperimentalGroup. i.e. the authors of an experiment. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ExperimentalGroup; + + + +=head1 DESCRIPTION + +This is a simple class to represent information about an ExperimentalGroup, +containing a name and a more detailed description +This module is part of the Ensembl project: http://www.ensembl.org/ + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ExperimentalGroup; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-name]: string - name of ExperimentalGroup + Arg [-location]: (optional) string - location of ExperimentalGroup + Arg [-contact]: (optional) string - contact of ExperimentalGroup + Arg [-url]: (optional) string - url containing information for the ExperimentalGroup + Arg [-description]: (optional) string - descriptiom of ExperimentalGroup + Arg [-project]: (optional) boolean - True if this is part of a large project (eg. ENCODE) + Example : my $group = Bio::EnsEMBL::Funcgen::ExperimentalGroup->new( + -name => "EBI", + -location => "Hinxton", + -contact => "dev@ensembl.org", + -url => "http://www.ebi.ac.uk/", + -description => "European Bioinformatics Institute", + -is_project => 0, + ); + Description: Constructor method for ExperimentalGroup class + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalGroup + Exceptions : Throws if name not defined + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $obj_class = ref($caller) || $caller; + my $self = $obj_class->SUPER::new(@_); + + my ( + $name, + $location, + $contact, + $url, + $desc, + $is_project + ) = rearrange([ + 'NAME', 'LOCATION', 'CONTACT', 'URL', 'DESCRIPTION', 'IS_PROJECT' + ], @_); + + + if($name){ + $self->name($name); + }else{ + throw("Must supply a Group name\n"); + } + $self->location($location) if $location; + $self->contact($contact) if $contact; + $self->url($url) if $url; + $self->description($desc) if $desc; + $self->is_project($is_project) if $is_project; + + return $self; +} + + + +=head2 name + + Arg [1] : string - name + Example : my $name = $ft->name(); + Description: Getter and setter of name attribute for ExperimentalGroup objects + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if @_; + return $self->{'name'}; +} + +=head2 description + + Arg [1] : (optional) string - description + Example : my $desc = $group->description(); + Description: Getter and setter of description attribute for ExperimentalGroup objects. + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if @_; + return $self->{'description'}; +} + +=head2 location + + Arg [1] : (optional) string - location + Example : my $location = $group->location(); + Description: Getter and setter of location attribute for ExperimentalGroup objects. + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub location { + my $self = shift; + $self->{'location'} = shift if @_; + return $self->{'location'}; +} + + +=head2 contact + + Arg [1] : (optional) string - contact + Example : my $contact = $group->contact(); + Description: Getter and setter of contact attribute for ExperimentalGroup objects. + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub contact { + my $self = shift; + $self->{'contact'} = shift if @_; + return $self->{'contact'}; +} + + +=head2 url + + Arg [1] : (optional) string - url + Example : my $url = $group->url(); + Description: Getter and setter of url attribute for ExperimentalGroup objects. + Returntype : string + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub url { + my $self = shift; + $self->{'url'} = shift if @_; + return $self->{'url'}; +} + +=head2 is_project + + Arg [1] : (optional) Boolean - is_project + Example : $group->is_project(); + Description: Getter and setter of is_project attribute for ExperimentalGroup objects. + Returntype : string + Exceptions : None + Caller : General + Status : High Risk + +=cut + +sub is_project { + my $self = shift; + $self->{'is_project'} = shift if @_; + return $self->{'is_project'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,281 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ExperimentalSubset +# +# You may distribute this module under the same terms as Perl itself + +=head1 NAME + +Bio::EnsEMBL::ExperimentalSet - A module to represent ExperimentalSet object. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ExpeimentalSet; + +my $data_set = Bio::EnsEMBL::Funcgen::ExperimentalSet->new( + -DBID => $dbID, + -ADAPTOR => $self, + -EXPERIMENT => $exp, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -FORMAT => 'READ_FORMAT', + -VENDOR => 'SOLEXA', + -NAME => 'ExpSet1', + ); + + + +=head1 DESCRIPTION + +An ExperimentalSet object provides a generic container for any non-array based feature import, +allowing tracking of file import via the status table and integration into Data and FeatureSets to +provide traceability to the source experiment from a given FeatureSet. + +=head1 AUTHOR + +This module was created by Nathan Johnson. + +This module is part of the Ensembl project: http://www.ensembl.org/ + +=head1 CONTACT + +Post comments or questions to the Ensembl development list: ensembl-dev@ebi.ac.uk + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ExperimentalSet; + +use Bio::EnsEMBL::Funcgen::ExperimentalSubset; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); +use Bio::EnsEMBL::Funcgen::Set; + + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Set);#change to Set once we have implemented analysis properly + + +=head2 new + + + + Example : my $eset = Bio::EnsEMBL::Funcgen::ExperimentalSet->new( + -EXPERIMENT => $exp, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -FORMAT => 'READ_FORMAT', + -VENDOR => 'SOLEXA', + -NAME => 'ExpSet1', + -ANALYSIS => $anal, + ); + + Do we want to define subsets likes this or are we more likely to add them one by one? + + Description: Constructor for ExperimentalSet objects. + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalSet + Exceptions : Throws if no Experiment defined + Throws if CellType or FeatureType are not valid or stored + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #do we need to add $fg_ids to this? Currently maintaining one feature_group focus.(combi exps?) + my ($exp, $format, $vendor) + = rearrange(['EXPERIMENT', 'FORMAT', 'VENDOR'], @_); + + if (! (ref $exp && $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())){ + throw('Must specify a valid stored Bio::EnsEMBL::Funcgen::Experiment'); + } + + + #These are set in Set, just validate here + throw ('Must provide a FeatureType') if(! defined $self->feature_type); + throw ('Must provide a CellType') if(! defined $self->cell_type); + + + + if(! defined $self->analysis){ + #default analysis hack for v47 + #Set directly to avoid dbID boolean check + $self->{'analysis'} = Bio::EnsEMBL::Analysis->new(-logic_name => 'external', + -id => 0,#??someone needs to rewrite analysis + ); + } + + $self->format($format) if defined $format; + $self->vendor($vendor) if defined $vendor; + $self->{'experiment'} = $exp; + $self->{'subsets'} = {}; + + return $self; +} + + +=head2 add_new_subset + + Arg [1] : string - sub set name e.g. the file name (not path as we're restricted to 30 chars) + Example : $expset->add_new_subset($ss_name, $exp_subset); + Description: Adds experimental_subset + Returntype : none + Exceptions : Throws if set is already present + Throws if ExperimentalSubset is not valid or stored + Caller : General + Status : At Risk + +=cut + +sub add_new_subset { + my ($self, $ss_name, $exp_sset) = @_; + + if($self->get_subset_by_name($ss_name)){ + throw("Subset $ss_name is already present in this ExperimentalSet, maybe you need to alter the filename?"); + } + + if(defined $exp_sset){ + + if(!(ref($exp_sset) && $exp_sset->isa('Bio::EnsEMBL::Funcgen::ExperimentalSubset') && $exp_sset->dbID())){ + throw('ExperimentalSubsets must be valid and stored'); + } + } + else{ + + $exp_sset = Bio::EnsEMBL::Funcgen::ExperimentalSubset->new( + -name => $ss_name, + -experimental_set => $self, + ); + } + + $self->{'subsets'}{$ss_name} = $exp_sset; + + return $self->{'subsets'}{$ss_name}; +} + + +=head2 get_Experiment + + Example : my $exp = $exp_set->get_Experiment(); + Description: Getter for the Experiment of this DataSet. + Returntype : Bio::EnsEMBL::Fuuncgen::Experiment + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Experiment{ + my $self = shift; + + return $self->{'experiment'}; +} + + +=head2 get_subsets + + Example : my @subsets = @{$exp_set->get_subsets()}; + Description: Getter for the subsets for this ExperimentalSet. + Returntype : Arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_subsets{ + my ($self) = shift; + + return [ values %{$self->{'subsets'}} ]; +} + +=head2 get_subset_by_name + + Example : my $subsets = $exp_set->get_subset_by_name('subset1'); + Description: Getter for the subset of a given name for this ExperimentalSet. + Returntype : Bio::EnsEMBL::Funcgen::ExpeirmentalSubset + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_subset_by_name{ + my ($self, $name) = @_; + + return (exists $self->{'subsets'}{$name}) ? $self->{'subsets'}{$name} : undef; +} + +=head2 get_subset_names + + Example : my @subset_names = @{$exp_set->get_subset_names()}; + Description: Getter for the subset names for this ExperimentalSet. + Returntype : Arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_subset_names{ + my ($self) = shift; + + return [ keys %{$self->{'subsets'}} ]; +} + + + + +=head2 vendor + + Arg[1] : string - vendor + Example : my $eset->vendor('SOLEXA'); + Description: Getter/Setter for the vendor attribute of this DataSet. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub vendor { + my $self = shift; + + $self->{'vendor'} = shift if @_; + + return $self->{'vendor'}; +} + + +=head2 format + + Arg[1] : string - format i.e. product type/format + Example : my $eset->format('DATASET1'); + Description: Getter/Setter for the format attribute of this ExperimentalSet. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub format { + my $self = shift; + + $self->{'format'} = shift if @_; + + return $self->{'format'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalSubset.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExperimentalSubset.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,139 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ExperimentalSubset +# +# You may distribute this module under the same terms as Perl itself + +=head1 NAME + +Bio::EnsEMBL::ExperimentalSet - A module to represent ExperimentalSubset object. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ExperimetnalSubset; + +my $data_set = Bio::EnsEMBL::Funcgen::ExperimentalSubset->new( + -DBID => $dbID, + -ADAPTOR => $self, + -NAME => $name, + -EXPERIMENTAL_SET => $eset, + ); + + + +=head1 DESCRIPTION + +An ExperimentalSubset object is a very simple skeleton class to enable storage of associated subset states. As such there +are only very simple accessor methods for basic information, and there is no namesake adaptor, rather is is handled by the +ExperimentalSetAdaptor. + +=head1 AUTHOR + +This module was created by Nathan Johnson. + +This module is part of the Ensembl project: http://www.ensembl.org/ + +=head1 CONTACT + +Post comments or questions to the Ensembl development list: ensembl-dev@ebi.ac.uk + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ExperimentalSubset; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Example : my $eset = Bio::EnsEMBL::Funcgen::ExperimentalSubset->new( + -DBID => $dbID, + -ADAPTOR => $self, + -NAME => $name, + -EXPERIMENTAL_SET => $eset, + ); + + + Description: Constructor for ExperimentalSubset objects. + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalSubset + Exceptions : Throws if no name defined + Throws if CellType or FeatureType are not valid or stored + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #do we need to add $fg_ids to this? Currently maintaining one feature_group focus.(combi exps?) + my ($name, $eset) + = rearrange(['NAME', 'EXPERIMENTAL_SET'], @_); + + + throw('Must provide a name argument') if ! defined $name; + + if(!(ref($eset) && + $eset->isa('Bio::EnsEMBL::Funcgen::ExperimentalSet') + && $eset->dbID())){ + throw('Must provide a valid stored experimental_set argument'); + } + + + $self->{'name'} = $name; + $self->{'experimental_set'} = $eset; + + return $self; +} + + +=head2 name + + Example : my $name = $exp_sset->name(); + Description: Getter for the name of this ExperimentalSubset. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub name { + my $self = shift; + return $self->{'name'}; +} + +=head2 experimental_set + + Example : my $eset = $exp_sset->experimental_set(); + Description: Getter for the experimental_set attribute of this ExperimentalSubset. + Returntype : Bio::EnsEMBL::Funcgen::ExperimentalSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub experimental_set { + my $self = shift; + return $self->{'experimental_set'}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExternalFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ExternalFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,160 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ExternalFeature +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::ExternalFeature - A module to represent an externally curated feature +mapping from an external_db. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ExternalFeature; + +my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -DISPLAY_LABEL => $text, + -FEATURE_SET => $fset, + -FEATURE_TYPE => $ftype, +); + + + +=head1 DESCRIPTION + +An ExternalFeature object represents the genomic placement of an externally curated +feature from and DB external to Ensembl. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ExternalFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::SetFeature; +use Bio::EnsEMBL::Funcgen::FeatureType; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::SetFeature); + + +=head2 new + + + Arg [-FEATURE_SET] : Bio::EnsEMBL::Funcgen::FeatureSet + Arg [-FEATURE_TYPE] : Bio::EnsEMBL::Funcgen::FeatureType + Arg [-ANALYSIS] : Bio::EnsEMBL::Analysis + Arg [-SLICE] : Bio::EnsEMBL::Slice - The slice on which this feature is. + Arg [-START] : int - The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int -The end coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-DISPLAY_LABEL]: string - Display label for this feature + Arg [-STRAND] : int - The orientation of this feature. Valid values are 1, -1 and 0. + Arg [-dbID] : (optional) int - Internal database ID. + Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor - Database adaptor. + Example : my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -DISPLAY_LABEL => $text, + -FEATURE_SET => $fset, + -FEATURE_TYPE => $ftpe, + ); + + + Description: Constructor for ExternalFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::ExternalFeature + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + #Remove this method if we interdb_stable_id to SetFeature + ($self->{'interdb_stable_id'}) = rearrange(['INTERDB_STABLE_ID'], @_); + + return $self; +} + +=head2 interdb_stable_id + + Arg [1] : (optional) int - stable_id e.g 1 + Example : my $idb_sid = $feature->interdb_stable_id(); + Description: Getter for the interdb_stable_id attribute for this feature. + This is simply to avoid using internal db IDs for inter DB linking + Returntype : int + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub interdb_stable_id { + return $_[0]->{'interdb_stable_id'}; +} + + + + +=head2 display_label + + Example : my $label = $feature->display_label(); + Description: Getter for the display label of this feature. + Returntype : String + Exceptions : None + Caller : General + Status : Medium risk + +=cut + +sub display_label { + my $self = shift; + + if(! $self->{'display_label'} && $self->adaptor){ + + $self->{'display_label'} = $self->feature_set->feature_type->name().' - '; + $self->{'display_label'} .= $self->cell_type->name() if $self->cell_type(); + $self->{'display_label'} .= $self->feature_type->name() if(defined $self->{'feature_type'}); + } + + return $self->{'display_label'}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/FeatureSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/FeatureSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,458 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::FeatureSet +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::FeatureSet - A module to represent FeatureSet. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::FeatureSet; + +my $result_set = Bio::EnsEMBL::Funcgen::FeatureSet->new( + +); + + + +=head1 DESCRIPTION + +A FeatureSet object provides access to a set of feature predictions and their details, which may have been generated from a +single or multiple Experiments with potentially differing analyses. The FeatureSet itself will only have a single analysis +which may be one or a combination of programs but will be represented by one analysis record. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::FeatureSet; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); +use Bio::EnsEMBL::Funcgen::Set; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Set); + +my %valid_classes = ( annotated => undef, + regulatory => undef, + external => undef, + segmentation => undef, ); + +=head2 new + + -name => $name, + -feature_type => $ftype, + -cell_type => $ctype, + -name => $name, + -description => 'Release 3.1', + -display_label => 'Short name', + -analysis => $analysis, + Arg [-EXPERIMENT_ID] : Experiment dbID + -dbid => $dbid, +Arg [-ADAPTOR] + + Example : my $feature = Bio::EnsEMBL::Funcgen::FeatureSet->new( + -dbid => $dbid, + -analysis => $analysis, + -feature_type => $ftype, + -cell_type => $ctype, + -name => $name, + -feature_class => 'annotated', + -description => 'Release 3.1', + -display_label => 'Short name', + -input_set => $iset, + ); + Description: Constructor for FeatureSet objects. + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet + Exceptions : Throws if FeatureType defined + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ( $desc, $dlabel, $iset_id, $iset, $exp_id, $exp ) = + rearrange( [ + 'DESCRIPTION', 'DISPLAY_LABEL', + 'INPUT_SET_ID', 'INPUT_SET', 'EXPERIMENT_ID', 'EXPERIMENT' + ], + @_ ); + + + if($exp_id || $exp){ + throw('Passing an Experiment or an experiment_id is now deprecated,'. + ' please use -input_set or -input_set_id instead'); + } + + #Allow exp or exp_id to be passed to support storing and lazy loading + + #Mandatory params checks here (setting done in Set.pm) + throw('Must provide a FeatureType') + if ( !defined $self->feature_type ); + + #explicit type check here to avoid invalid types being imported as NULL + #subsequently throwing errors on retrieval + my $type = $self->feature_class; + + if ( !( $type && exists $valid_classes{$type} ) ) { + throw( 'You must define a valid FeatureSet type e.g. ' . + join( ', ', keys %valid_classes ) ); + } + + #Direct assignment to prevent need for set arg test in method + + $self->{'description'} = $desc if defined $desc; + $self->{'display_label'} = $dlabel if defined $dlabel; + $self->{'input_set_id'} = $iset_id if defined $iset_id; + + if ( defined $iset ) { + #Exp obj is only passed during object storing + #so let the adaptor do is_stored_and_valid + $self->{'input_set'} = $iset; + } + + return $self; +} ## end sub new + + + +=head2 new_fast + + Args : Hashref with all internal attributes set + Example : none + Description: Quick and dirty version of new. Only works if the code is very + disciplined. + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub new_fast { + return bless ($_[1], $_[0]); +} + + +=head2 description + + Example : print "Feature set description is:\t".$fset->description."\n"; + Description: Getter for the description of this FeatureSet. e.g. Release 3.1 + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub description { + return $_[0]->{'description'}; +} + + + +=head2 display_label + + Example : print $rset->display_label; + Description: Getter for the display_label attribute for this FeatureSet. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub display_label { + my $self = shift; + + if ( !$self->{'display_label'} ) { + + if ( $self->feature_type->class() eq 'Regulatory Feature' ) { + $self->{'display_label'} = $self->name; + } + else { + #This still fails here if we don't have a class or a cell_type set + + $self->{'display_label'} = + $self->feature_type->name() . " - " . $self->cell_type->name() . + " Enriched Sites"; + } + } + + return $self->{'display_label'}; +} + + + + +=head2 get_FeatureAdaptor + + Example : + Description: Retrieves and caches FeatureAdaptor of feature_set type + Returntype : Bio::EnsEMBL::Funcgen::DBSQL::ucfirst($self->feature_class())FeatureAdaptor + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub get_FeatureAdaptor{ + my $self = shift; + + if(! exists $self->{'adaptor_refs'}){ + + foreach my $valid_class(keys %valid_classes){ + my $method = 'get_'.ucfirst($valid_class).'FeatureAdaptor'; + + $self->{'adaptor_refs'}{$valid_class} = $self->adaptor->db->$method; + } + } + + return $self->{'adaptor_refs'}->{$self->feature_class()}; + +} + + + +=head2 get_Features_by_Slice + + Example : my @features = @{$FeatureSet->get_Features_by_Slice($slice)}; + Description: Retrieves all Features for this FeatureSet for a given Slice + Returntype : ARRAYREF containing Features of the feature_set type i.e. Annotated, Regulatory or Supporting; + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Features_by_Slice{ + my ($self, $slice) = @_; + + return $self->get_FeatureAdaptor->fetch_all_by_Slice_FeatureSets($slice, [$self]); +} + + +=head2 get_Features_by_FeatureType + + Arg[0] : Bio::EnsEMBL::Funcgen::FeatureType + Example : my @features = @{$FeatureSet->get_Features_by_FeatureType($ftype)}; + Description: Retrieves all Features for this FeatureSet for a given FeatureType + or associated FeatureType. This is mainly used by external FeatureSets + which can sometimes have more than one associated FeatureType. + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub get_Features_by_FeatureType{ + my ($self, $type) = @_; + + return $self->get_FeatureAdaptor->fetch_all_by_FeatureType_FeatureSets($type, [$self]); +} + + +=head2 get_all_Features + + Example : my @features = @{$FeatureSet->get_all_Features}; + Description: Retrieves all Features for this FeatureSet + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_all_Features{ + my $self = shift; + + return $self->get_FeatureAdaptor->fetch_all_by_FeatureSets([$self]); +} + + + + +=head2 is_focus_set + + Args : None + Example : if($fset->is_focus_set){ ... } + Description: Returns true if FeatureSet is a focus set used in the RegulatoryBuild + Returntype : Boolean + Exceptions : Throws if meta entry not present + Caller : General + Status : At Risk + +=cut + +sub is_focus_set{ + my $self = shift; + + if(! defined $self->{focus_set}){ + + if(! defined $self->cell_type){ + warn "FeatureSet without an associated CellType cannot be a focus set:\t".$self->name; + $self->{focus_set} = 0; + } + else{ + $self->{focus_set} = $self->adaptor->fetch_focus_set_config_by_FeatureSet($self); + } + } + + return $self->{focus_set}; +} + + +=head2 is_attribute_set + + Args : None + Example : if($fset->is_attribute_set){ ... } + Description: Returns true if FeatureSet is a supporting/attribute(focus or not) set used in the RegulatoryBuild + Returntype : Boolean + Exceptions : Throws if meta entry not present + Caller : General + Status : At Risk + +=cut + +sub is_attribute_set{ + my $self = shift; + + if(! defined $self->{attribute_set}){ + + if(! defined $self->cell_type){ + warn "FeatureSet without an associated CellType cannot be a attribute set:\t".$self->name; + $self->{attribute_set} = 0; + } + else{ + $self->{attribute_set} = $self->adaptor->fetch_attribute_set_config_by_FeatureSet($self); + } + } + + return $self->{attribute_set}; +} + + +=head2 get_InputSet + + Example : my $input_set = $FeatureSet->get_InputSet; + Description: Retrieves the InputSet for this FeatureSet + Returntype : Bio::EnsEMBL::Funcgen::InputSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_InputSet{ + my $self = shift; + + if( (! defined $self->{input_set}) && + (defined $self->{input_set_id}) ){ + $self->{input_set} = $self->adaptor->db->get_InputSetAdaptor->fetch_by_dbID($self->{input_set_id}); + } + + return $self->{input_set}; +} + + + +=head2 source_label + + Example : my $source_label = $fset->source_label; + Description: Retrieves the source label this FeatureSet, used in zmenus + Returntype : Arrayref of Strings + Exceptions : None + Caller : Webcode zmenus + Status : At Risk - remove, to be done by webcode? + +=cut + +#These are used to link through to the experiment view based on feature_set_id + +sub source_label{ + my $self = shift; + + if (! defined $self->{source_label}) { + my $input_set = $self->get_InputSet; + my @source_labels; + + if ($input_set) { + + + foreach my $isset(@{$input_set->get_InputSubsets}){ + + if(defined $isset->archive_id){ + push @source_labels, $isset->archive_id; + } + #Archive IDs e.g. SRX identifiers or undef. + } + + #Append project name + my $exp_group = $input_set->get_Experiment->experimental_group; + + if ($exp_group && + $exp_group->is_project) { + push @source_labels, $exp_group->name; + } + } + + + $self->{source_label} = join(' ', @source_labels); + } + + return $self->{source_label}; +} + + + + +### DEPRECATED ### + +=head2 get_Experiment +# +# Example : my $exp = $FeatureSet->get_Experiment; +# Description: Retrieves the Experiment for this FeatureSet +# Returntype : Bio::EnsEMBL::Funcgen::Experiment +# Exceptions : None +# Caller : General + Status : DEPRECATED + +=cut + +sub get_Experiment{ + throw('FeatureSet::get_Experiment is not longer supported, please use FeatureSet::get_InputSet'); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/FeatureType.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/FeatureType.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,346 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::FeatureType +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::FeatureType - A module to represent a FeatureType. i.e. the target of an experiment. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::FeatureType; + + + +=head1 DESCRIPTION + +This is a simple class to represent information about a FeatureType, containing the name i.e Brno nomenclature or other controlled/validated name relevant to the class (HISTONE, PROMOTER etc), and description. This module is part of the Ensembl project: http://www.ensembl.org/ + +=cut + +#To do +# add coding_transcript/gene methods. Store as xrefs or custom feature_type_coding table? (miRanda etc) +# + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::FeatureType; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-name] : String - name of FeatureType + Arg [-class] : String - class of FeatureType + Arg [-description] : String - descriptiom of FeatureType + Arg [-analysis] : optional Bio::EnsEMBL::Analysis used to generate FeatureType + Arg [-so_accession] : optional String - Sequence ontology accession + Arg [-so_name] : optional String - Sequence ontology name + + Example : my $ft = Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => "H3K9Me", + -class => "HISTONE", + -description => "Generalised methylation of Histone 3 Lysine 9", + -analysis => $analysis, + -so_name => $so_name, + -so_accession => $so_accession + ); + Description: Constructor method for FeatureType class + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : Throws if name or class not defined + Throws if analysis is defined but not valid + Caller : General + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $obj_class = ref($caller) || $caller; + my $self = $obj_class->SUPER::new(@_); + + my ($name, $desc, $class, $analysis, $so_acc, $so_name) = + rearrange(['NAME', 'DESCRIPTION', 'CLASS', 'ANALYSIS', 'SO_ACCESSION', 'SO_NAME'], @_); + + throw("Must supply a FeatureType name\n") if ! defined $name; + throw("Must supply a FeatureType class\n") if ! defined $class; + + #Direct assignments here prevent set arg test in getter only method + $self->{name} = $name; + $self->{class} = $class; + #add test for class and enum? Validate names against Brno etc? + + $self->{description} = $desc if defined $desc; + $self->{so_name} = $so_name if defined $so_name; + $self->{so_accession} = $so_acc if defined $so_acc; + + if($analysis){ + + if(ref($analysis) ne 'Bio::EnsEMBL::Analysis'){ + throw('Optional Analysis parameter must be a valid Bio::EnsEMBL::Analysis'); + #is_stored checks done in other fetch and store methods + } + + $self->{'analysis'} = $analysis; + #Direct assignment prevents arg test in getter only method + } + + return $self; +} + + + +=head2 name + + Example : my $name = $ft->name; + Description: Getter of name attribute for FeatureType objects + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub name { + return $_[0]->{'name'}; +} + +=head2 description + + Example : my $desc = $ft->description; + Description: Getter of description attribute for FeatureType objects. + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub description { + return $_[0]->{'description'}; +} + + +=head2 class + + Example : my $ft_class = $ft->class; + Description: Getter of class attribute for FeatureType objects. + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub class{ + return $_[0]->{'class'}; +} + +=head2 so_accession + + Example : my $ft_class = $ft->class; + Description: Getter of sequence ontoloy accession for FeatureType objects. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub so_accession{ + return $_[0]->{'so_accession'}; +} + +=head2 so_name + + Example : my $so_name = $ft->so_name; + Description: Getter of sequence ontology name for FeatureType objects. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub so_name{ + return $_[0]->{'so_name'}; +} + +=head2 analysis + + Example : my $ft_anal = $ft->analysis; + Description: Getter of the Analysis for FeatureType objects. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub analysis{ + my $self = shift; + return $self->{'analysis'}; +} + +=head2 evidence_type_label + + Example : my $track_label = $fsets[0]->feature_type->evidence_type_label.' MultiCell'; + Description: Getter for short evidence type label used in track label and field headers etc. + Returntype : string + Exceptions : None + Caller : Web code + Status : At risk + +=cut + +sub evidence_type_label{ + my $self = shift; + + #Could get undef key warn here + #But only used in webcode so omit for speed + + + if(! exists $self->{evidence_type_label}){ + $self->{evidence_type_label} = $self->adaptor->get_regulatory_evidence_info->{$self->class}->{label}; + } + + return $self->{evidence_type_label}; +} + +=head2 evidence_type_name + + Example : my $name = $fsets[0]->feature_type->evidence_type_name; + Description: Getter for evidence type name used in browser. + Returntype : string + Exceptions : None + Caller : Web code + Status : At risk + +=cut + +sub evidence_type_name{ + my $self = shift; + + #Could get undef key warn here + #But only used in webcode so omit for speed + if(! exists $self->{evidence_type_name}){ + $self->{evidence_type_name} = $self->adaptor->get_regulatory_evidence_info($self->class)->{name}; + } + + return $self->{evidence_type_name}; +} + +=head2 evidence_type_long_name + + Example : my $long_name = $fsets[0]->feature_type->evidence_type_long_name; + Description: Getter for evidence type name used in browser. + Returntype : string + Exceptions : None + Caller : Web code + Status : At risk + +=cut + +sub evidence_type_long_name{ + my $self = shift; + + #Could get undef key warn here + #But only used in webcode so omit for speed + if(! exists $self->{evidence_type_long_name}){ + $self->{evidence_type_long_name} = $self->adaptor->get_regulatory_evidence_info($self->class)->{long_name}; + } + + return $self->{evidence_type_long_name}; +} + + + +=head2 compare + + Arg[1] : Bio::EnsEMBL::Func + The analysis to compare to + Example : none + Description: returns 1 if this FeatureType is the same + returns 0 if there is a mistmatch apart from the dbID/DB/Adaptor + Returntype : Boolean + Exceptions : Throws if arg is not valid + Caller : General + Status : At risk + +=cut + +#simplified version of Analysis:compare +#move to storable and take method args + +sub compare{ + my ($self, $ftype) = @_; + + if(ref($ftype) ne 'Bio::EnsEMBL::Funcgen::FeatureType'){ + throw('You must pass a valid Bio::EnsEMBL::Funcgen::FeatureType to compare'); + } + + my $same = 1; + + foreach my $methodName ( 'name', 'class', 'so_accession','so_name','description'){ + + if( defined $self->$methodName() && ! $ftype->can($methodName )) { + $same = 0; + last; + } + if( defined $self->$methodName() && ! defined $ftype->$methodName() ) { + $same = 0; + last; + } + + if( defined($ftype->$methodName()) && defined($self->$methodName()) && + ( $ftype->$methodName() ne $ftype->$methodName() )) { + $same = 0; + last; + } + } + + + #This would be in a wrapper method + if($self->analysis && $ftype->analysis){ + + if($self->analysis->compare($ftype->analysis)){ + #analysis compare returns the opposite of what you expect + $same = 0; + } + } + elsif( ! ((! $self->analysis) && (! $ftype->analysis)) ){#Only one has analysis + $same = 0; + } + + return $same; +} +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Alignment_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Alignment_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,277 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf; + +=head1 SYNOPSIS + + # TODO: this could be easily merged with the Peaks pipeline... + # TODO: allow subfolders which will represent replicates... + # Allow semaphores so jobs can be run truly in parallel (see SemaStart and SemaLongMult_conf) + + # Example 1: specifying only the mandatory options (initial params are taken from defaults) +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf -password + + # Example 2: specifying the mandatory options as well as setting initial params: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf -password -p1name p1value -p2name p2value + + # Example 3: do not re-create the database, just load more tasks into an existing one: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf -job_topup -password -p1name p1value -p2name p2value + + +=head1 DESCRIPTION + + This is the Config file for the Alignment Pipeline + + Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here. + + The Alignment pipeline consists of several "analysis": + * SetupAlignmentPipeline verifies the existence of the files and creates alignment jobs ... + * RunAlignment makes the alignment... + * WrapUpAlignment merges the alignments, some QC and fills in the data tracking db + + Please see the implementation details in Runnable modules themselves. + +=head1 CONTACT + + Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions. + +=cut + + +package Bio::EnsEMBL::Funcgen::HiveConfig::Alignment_conf; + +use strict; +use warnings; +use Data::Dumper; +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); +# All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly + + +=head2 default_options + + Description : Implements default_options() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options. + +=cut + +sub default_options { + my ($self) = @_; + return { + %{$self->SUPER::default_options}, + + 'pipeline_db' => { + -host => $self->o('dbhost'), + -port => $self->o('dbport'), + -user => $self->o('dbuser'), + -pass => $self->o('dbpass'), + #The aligments are independent of the EFG DB but since we should call the collections and the peaks, we can keep it + #-dbname => $ENV{USER}.'_alignments_'.$self->o('efgdb_name'), + -dbname => $self->o('pipedb_name'), + }, + + 'bin_dir' => '/software/ensembl/funcgen', + + #'efgdb_host' => ... + #We can add default values for all these but tend to avoid since people quickly forget what those are... + + #We could add a dummy default dataset, just so we can create an "empty" pipeline and add sets as needed... + #This dummy dataset would have to be specifically detected and ignored in a setup step... + #'experiment_name' => 'Dummy', + #'cell_type' => 'Dummy', + #'feature_type' => 'Dummy', + + }; +} + +=head2 resource_classes + + Description : Implements resource_classes() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available + +=cut + +sub resource_classes { + my ($self) = @_; + return { + +#Use this section when running on Sanger Farm + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'long high memory', 'LSF' => '-q long -M5000000 -R"select[mem>5000] rusage[mem=5000]"' }, + 2 => { -desc => 'normal high memory', 'LSF' => ' -M5000000 -R"select[mem>5000] rusage[mem=5000]"' }, + +#Use this section when running on EBI cluster +# 0 => { -desc => 'default', 'LSF' => '' }, +# 1 => { -desc => 'long high memory', 'LSF' => '-M5000 -R"select[mem>5000] rusage[mem=5000]"' }, +# 2 => { -desc => 'normal high memory', 'LSF' => '-M5000 -R"select[mem>5000] rusage[mem=5000]"' }, + + }; +} + + +=head2 pipeline_wide_parameters + + Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + %{$self->SUPER::pipeline_wide_parameters}, # inheriting database and hive tables creation + 'pipeline_name' => 'alignments'.'_'.$self->o('dbname'), # name used by the beekeeper to prefix job names on the farm + + + 'work_dir' => $self->o('work_dir'), # data directories and filenames + #This will be used for temp files and debug + #'output_dir' => $self->o('output_dir').'/ehive/'.$self->o('dbname').'/hive_results', + #'hive_output_dir' => $self->o('output_dir').'/ehive/'.$self->o('dbname').'/hive_debug', + 'output_dir' => $self->o('output_dir').'/alignments/results', + 'hive_output_dir' => $self->o('output_dir').'/alignments/hive_debug', + + + #Maybe use parameters instead of ENV Variables... use ENV variables in the pipeline ENV + "dnadb" => { + "-host" => $self->o('dnadb_host'), + "-port" => $self->o('dnadb_port'), + "-user" => $self->o('dnadb_user'), + "-dbname" => $self->o('dnadb_name'), + }, + "efgdb" => { + "-host" => $self->o('dbhost'), + "-port" => $self->o('dbport'), + "-user" => $self->o('dbuser'), + "-pass" => $self->o('dbpass'), + "-dbname" => $self->o('dbname'), + }, + #This could be inferred from the db, but it's probably safer(?) to pass as parameter... + "species" => $self->o('species'), + #May pass this to input_id... to allow for files of different assemblies in the same pipeline run. + "assembly" => $self->o('assembly'), + + #Make sure the bwa_indexes were generated with the same version!!! + #Just use the default bwa: should be in in /software/varinfo/bin + "bwa_bin" => $self->o('bin_dir')."/bwa", + #"bwa_bin" => "/nfs/users/nfs_d/ds19/src/bwa-0.5.8a/bwa", + #get new versions of bwa in /software/ensembl/bin/bwa + 'bin_dir' => $self->o('bin_dir'), + + #Size of each sequence chunk to be aligned (nbr of reads * 4) + "fastq_chunk_size" => "16000000" #This should run in 30min-1h + + }; +} + + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands + that will create and set up the Hive database. + +=cut + +sub pipeline_create_commands { + my ($self) = @_; + return [ + #HiveGeneric assumes ensembl-hive folder while if you use the stable version it's ensembl-hive_stable! + + @{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables creation + + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + + # standard eHive tables and procedures: + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', + + #Create hive output folders as required + #'mkdir -p '.$self->o('work_dir').'/ehive/'.$self->o('efgdb_name').'/hive_debug', + #'mkdir -p '.$self->o('work_dir').'/ehive/'.$self->o('efgdb_name').'/hive_results', + 'mkdir -p '.$self->o('output_dir').'/alignments/results', + 'mkdir -p '.$self->o('output_dir').'/alignments/hive_debug', + + ]; +} + + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + + +=cut + +sub pipeline_analyses { + my ($self) = @_; + return [ + { + + -logic_name => 'setup_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::SetupAlignmentPipeline', + -parameters => {}, + -input_ids => [ + # No initial input_ids... these will be added as needed by init_pipeline -job_topup + { 'cell_type' => $self->o('cell_type'), 'feature_type' => $self->o('feature_type'), 'experiment_name' => $self->o('experiment_name') }, + ], + -flow_into => { + '1->A' => [ 'run_alignments' ], + 'A->2' => [ 'wrap_up_pipeline' ], + }, + # These jobs can be run in parallell... don't put too many since it may generate many jobs...jobs + -limit => 1, + -batch_size => 1, + -hive_capacity => 10, + -rc_id => 0, + }, + + { + -logic_name => 'run_alignments', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::RunBWA', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-3 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 100, + # Better safe than sorry... size of datasets tends to increase... + -rc_id => 2, + #No need to wait once since it is independent from everything else + #-wait_for => [ 'setup_pipeline' ] + }, + + { + -logic_name => 'wrap_up_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::WrapUpAlignment', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-1 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 10, + -rc_id => 1, + # No need to wait, if we use semaphores... + # -wait_for => [ 'run_alignments' ], + }, + + #Temporary thing, while replicates are not handled properly... + #{ + # -logic_name => 'converge_replicates', + # -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::ConvergeReplicates', + # -parameters => { }, + # -input_ids => [ + # (jobs for this analysis will be flown_into via branch-1 from 'setup_pipeline' jobs above) + #jobs ], + # -hive_capacity => 10, + # -rc_id => 1, + # No need to wait, if we use semaphores... + # -wait_for => [ 'run_alignments' ], + #}, + + ]; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Annotation_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Annotation_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,191 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Funcgen::HiveConfig::Annotation_conf; + + +=head1 DESCRIPTION + + This is the Config file for the Annotation Pipeline + + Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here. + + The Annotation pipeline implements Damian;s scripts in /scripts/regulatory_annotation: + + Please see the implementation details in the Runnable modules. + +=head1 CONTACT + + Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions. + +=cut + +package Bio::EnsEMBL::Funcgen::HiveConfig::Annotation_conf; + +use strict; +use warnings; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +sub default_options { + my ($self) = @_; + return { + 'ensembl_cvs_root_dir' => $ENV{'SRC'}, # some Compara developers might prefer $ENV{'HOME'}.'/ensembl_main' + + 'pipeline_db' => { + -host => $self->o('dbhost'), + -port => $self->o('dbport'), + -user => $self->o('dbuser'), + -pass => $self->o('dbpass'), + -dbname => $ENV{USER}.'_regfeat_annotation_'.$self->o('dbname'), + #-dbname => $self->o('pipedb_name'), + }, + + }; +} + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'urgent', 'LSF' => '-q yesterday' }, + 2 => { -desc => 'normal ens-genomics2', 'LSF' => '-R"select[myens_genomics2<1000] rusage[myens_genomics2=10:duration=10:decay=1]"' }, + 3 => { -desc => 'long ens-genomics2', 'LSF' => '-q long -R"select[myens_genomics2<1000] rusage[myens_genomics2=10:duration=10:decay=1]"' }, + 4 => { -desc => 'long high memory', 'LSF' => '-q long -M4000000 -R"select[mem>4000] rusage[mem=4000]"' }, + 5 => { -desc => 'long ens-genomics2 high memory', 'LSF' => '-q long -M4000000 -R"select[myens_genomics2<1000 && mem>4000] rusage[myens_genomics2=10:duration=10:decay=1:mem=4000]"' }, + }; +} + + +=head2 pipeline_wide_parameters + + Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + + 'pipeline_name' => $self->o('pipeline_db', '-dbname'), # name used by the beekeeper to prefix job names on the farm + + 'work_dir' => $self->o('work_dir'), # data directories and filenames + #use this as scratch dir or create one specifically? + 'output_dir' => $self->o('output_dir').'/annotation/results', + 'hive_output_dir' => $self->o('output_dir').'/annotation/hive_debug', + + #Maybe use parameters instead of ENV Variables... use ENV variables in the pipeline ENV + "dnadb" => { + "-host" => $self->o('dnadb_host'), + "-port" => $self->o('dnadb_port'), + "-user" => $self->o('dnadb_user'), + "-dbname" => $self->o('dnadb_name'), + }, + "efgdb" => { + "-host" => $self->o('dbhost'), + "-port" => $self->o('dbport'), + "-user" => $self->o('dbuser'), + "-pass" => $self->o('dbpass'), + "-dbname" => $self->o('dbname'), + }, + + "workdb" => { + "-host" => $self->o('workdb_host'), + "-port" => $self->o('workdb_port'), + "-user" => $self->o('workdb_user'), + #workdbpass? + "-pass" => $self->o('dbpass'), + }, + #This could be inferred from the db, but it's probably safer(?) to pass as parameter... + "species" => $self->o('species'), + + #"release" => $self->o('release'), + + + }; +} + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands + that will create and set up the Hive database. + +=cut + +sub pipeline_create_commands { + my ($self) = @_; + return [ + #HiveGeneric assumes ensembl-hive folder while if you use the stable version it's ensembl-hive_stable! + @{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables creation + + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + + # standard eHive tables and procedures: + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', + + #Create hive output folders as required + 'mkdir -p '.$self->o('output_dir').'/annotation/results', + 'mkdir -p '.$self->o('output_dir').'/annotation/hive_debug', + + + ]; +} + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + + +=cut + +sub pipeline_analyses { + my ($self) = @_; + + return [ + { + + -logic_name => 'setup_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::SetupAnnotationPipeline', + -parameters => {}, + -input_ids => [ + # No initial input_ids... these will be added as needed by init_pipeline -job_topup + { }, + + ], + -flow_into => { + 2 => [ 'annotate_regulatory_features' ], + #3 => [ 'wrap_up_pipeline' ], + }, + #These jobs cannot run in parallel due to race conditions! Do NOT change this setting unless you know what you're doing + -hive_capacity => 1, + -rc_id => 0, + }, + + { + -logic_name => 'annotate_regulatory_features', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::AnnotateRegulatoryFeatures', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-2 from 'setup_pipeline' jobs above) + ], + #Since all the weight is in the database it is safer to run only one at a time... or a small number at least + -hive_capacity => 1, + #Control files should be handled by setup_pipeline. + -rc_id => 5, # Better safe than sorry... size of datasets tends to increase... + -wait_for => [ 'setup_pipeline' ] + }, + + ]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Dnase_profile_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Dnase_profile_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,156 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Hive::PipeConfig::Dnase_profile_conf + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + + This is an example pipeline put together from basic building blocks: + + Analysis_1: JobFactory.pm is used to turn the list of tables of the given database into jobs + + these jobs are sent down the branch #2 into the second analysis + + Analysis_2: SystemCmd.pm is used to run these dumping+compression jobs in parallel. + +=head1 CONTACT + + Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions. + +=cut + +package Bio::EnsEMBL::Funcgen::HiveConfig::Dnase_profile_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly + +=head2 default_options + + Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options. + o('password') your read-write password for creation and maintenance of the hive database +=cut + +sub default_options { + my ($self) = @_; + return { + 'ensembl_cvs_root_dir' => $ENV{'SRC'}, # some Compara developers might prefer + 'pipeline_name' => 'dnase_profile', # name used by the beekeeper to prefix job names on the farm + + 'pipeline_db' => { # connection parameters + -host => 'ens-genomics1', + -port => 3306, + -user => 'ensadmin', + -pass => $self->o('password'), # a rule where a previously undefined parameter is used (which makes either of them obligatory) + -dbname => $ENV{USER}.'_'.$self->o('pipeline_name'), # a rule where a previously defined parameter is used (which makes both of them optional) + }, + + 'is_male' => 0, # include table creation statement before inserting the data + + 'work_dir' => '/lustre/scratch101/ensembl/ds19/Dnase_Footprint_ENCODE', # + }; + } + + +=head2 pipeline_wide_parameters + +Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + 'work_dir' => $self->o('work_dir'), # data directories and filenames + 'hive_output_dir' => $self->o('work_dir').'/hive_debug', + }; +} + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database. + In addition to the standard creation of the database and populating it with Hive tables and procedures it also creates a directory for storing the output. + +=cut + +sub pipeline_create_commands { + my ($self) = @_; + return [ + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + + # standard eHive tables and procedures: + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', + + 'mkdir -p '.$self->o('work_dir').'/hive_debug', + ]; +} + +=head2 resource_classes + + Description : Implements resource_classes() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available + +=cut + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'urgent', 'LSF' => '-q yesterday' }, + 2 => { -desc => 'normal high memory', 'LSF' => '-M8000000 -R"select[mem>8000] rusage[mem=8000]"' }, + 3 => { -desc => 'normal huge memory', 'LSF' => '-M12000000 -R"select[mem>12000] rusage[mem=12000]"' }, + }; +} + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + +=cut + +sub pipeline_analyses { + my ($self) = @_; + return [ + { -logic_name => 'make_profile', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile', + -parameters => { + }, + -input_ids => [ + { + 'matrix' => $self->o('matrix'), + 'dnase' => $self->o('dnase'), + 'is_male' => $self->o('is_male'), + }, + ], + -hive_capacity => 100, # allow several workers to perform identical tasks in parallel + -rc_id => 2, + -flow_into => { + 2 => [ 'run_centipede' ], # will create a fan of jobs + }, + }, + + { -logic_name => 'run_centipede', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::RunCentipede', + -parameters => { + }, + -hive_capacity => 100, # allow several workers to perform identical tasks in parallel + -rc_id => 2, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-2 from 'get_tables' jobs above) + ], + }, + ]; + } + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/ImportMotifFeatures_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/ImportMotifFeatures_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,198 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Funcgen::HiveConfig::ImportMotifFeatures_conf; + +=head1 SYNOPSIS + + # Example 1: specifying only the mandatory options (initial params are taken from defaults) +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::*_conf -password + + # Example 2: specifying the mandatory options as well as setting initial params: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::*_conf -password -p1name p1value -p2name p2value + + # Example 3: do not re-create the database, just load more tasks into an existing one: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::*_conf -job_topup -password -p1name p1value -p2name p2value + + +=head1 DESCRIPTION + + This is the Config file for the Import Pipeline + + Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here. + + The Import pipeline consists of several "analysis": + * SetupPipeline is equivalent to the "prepare" in parse_and_import.pl + * LoadMotifFeatures loads motif features per each slice... + * WrapUpPipeline finalizes when all partial imports are done... + + Please see the implementation details in LoadMotifFeatures Runnable module + +=head1 CONTACT + + Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions. + +=cut + + +package Bio::EnsEMBL::Funcgen::HiveConfig::ImportMotifFeatures_conf; + +use strict; +use warnings; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); +# All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly + + +=head2 default_options + + Description : Implements default_options() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options. + +=cut + +sub default_options { + my ($self) = @_; + return + { + 'ensembl_cvs_root_dir' => $ENV{'SRC'}, + # some Compara developers might prefer $ENV{'HOME'}.'/ensembl_main' + + 'pipeline_db' => + { + -host => $self->o('host'), + -port => $self->o('port'), + -user => $self->o('user'), + -pass => $self->o('pass'), + -dbname => $ENV{'USER'}.'_motif_import_'.$self->o('dbname'), + }, + + #Need to change this to use $ENV{OUT_ROOT} so we can switch scratch usage easily + 'output_dir' => '/lustre/scratch103/ensembl/funcgen/output/'.$self->o('dbname'), + 'slices' => '', + + }; +} + +=head2 resource_classes + + Description : Implements resource_classes() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available + +=cut + +sub resource_classes { + my ($self) = @_; + return + { + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'urgent', 'LSF' => '-q yesterday' }, + 2 => { -desc => 'normal ens-genomics1', 'LSF' => '-M1000000 -R"select[myens_genomics1<1000 && mem>1000] rusage[myens_genomics1=10:duration=10:decay=1:mem=1000]"' }, + 3 => { -desc => 'long ens-genomics1', 'LSF' => '-q long -R"select[myens_genomics1<1000] rusage[myens_genomics1=10:duration=10:decay=1]"' }, + 4 => { -desc => 'long high memory', 'LSF' => '-q long -M4000000 -R"select[mem>4000] rusage[mem=4000]"' }, + 5 => { -desc => 'long ens-genomics1 high memory', 'LSF' => '-q long -M4000000 -R"select[myens_genomics1<600 && mem>4000] rusage[myens_genomics1=12:duration=5:decay=1:mem=4000]"' }, + + }; +} + + +=head2 pipeline_wide_parameters + + Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + + 'pipeline_name' => $self->o('pipeline_db', '-dbname'), # name used by the beekeeper to prefix job names on the farm + 'hive_output_dir' => $self->o('output_dir')."/motif_features/hive_output", + 'output_dir' => $self->o('output_dir')."/motif_features/results", + + 'host' => $self->o('host'), + 'port' => $self->o('port'), + 'user' => $self->o('user'), + 'pass' => $self->o('pass'), + 'dbname' => $self->o('dbname'), + + 'dnadb_host' => $self->o('dnadb_host'), + 'dnadb_port' => $self->o('dnadb_port'), + 'dnadb_user' => $self->o('dnadb_user'), + 'dnadb_name' => $self->o('dnadb_name'), + + 'efg_src' => $self->o('efg_src'), + + 'slices' => $self->o('slices'), + + }; +} + + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands + that will create and set up the Hive database. + +=cut + + +sub pipeline_create_commands { + my ($self) = @_; + + return + [ + #HiveGeneric assumes ensembl-hive folder while if you use the stable version its ensembl-hive_stable! + @{$self->SUPER::pipeline_create_commands}, + # inheriting database and hive tables creation rather than doing the following + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + # standard eHive tables and procedures: + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.mysql', + + + #Create hive output folders as required + 'mkdir -p '.$self->o('output_dir')."/motif_features/hive_output", + 'mkdir -p '.$self->o('output_dir')."/motif_features/results", + ]; +} + + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + + +=cut + +sub pipeline_analyses { + my ($self) = @_; + + return + [ + { + -logic_name => 'run_import', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::ImportMotifFeatures', + -parameters => { }, + -hive_capacity => 1, # allow several workers to perform identical tasks in parallel + -batch_size => 1, + -input_ids => [ + #For the moment it only receives the matrix, and deduces feature_type(s) from there... + { 'matrix' => $self->o('matrix'), 'file' => $self->o('file') } + ], + -rc_id => 2, + }, + ]; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Import_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Import_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,279 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Funcgen::HiveConfig::Import_conf; + +=head1 SYNOPSIS + + # Example 1: specifying only the mandatory options (initial params are taken from defaults) +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Import_conf -password + + # Example 2: specifying the mandatory options as well as setting initial params: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Import_conf -password -p1name p1value -p2name p2value + + # Example 3: do not re-create the database, just load more tasks into an existing one: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Import_conf -job_topup -password -p1name p1value -p2name p2value + + +=head1 DESCRIPTION + + This is the Config file for the Import Pipeline + + Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here. + + The Import pipeline consists of several "analysis": + * SetupImportPipeline is equivalent to the "prepare" in parse_and_import.pl + * RunImport loads the reads per each slice... + * WrapUpImport finalizes when all partial imports are done... + + Please see the implementation details in Runnable modules themselves. + +=head1 CONTACT + + Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions. + +=cut + + +package Bio::EnsEMBL::Funcgen::HiveConfig::Import_conf; + +use strict; +use warnings; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); +# All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly + + +=head2 default_options + + Description : Implements default_options() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options. + +=cut + +sub default_options { + my ($self) = @_; + return { + 'ensembl_cvs_root_dir' => $ENV{'SRC'}, + + 'pipeline_db' => { + -host => $self->o('host'), + -port => $self->o('port'), + -user => $self->o('user'), + -pass => $self->o('pass'), + #-dbname => $ENV{USER}.'_peaks_'.$self->o('dbname'), + -dbname => $self->o('pipedb_name'), + }, + + 'verbose' => 0, + + 'format' => "SEQUENCING", + 'vendor' => "SEQUENCING", + 'parser' => "Bed", #Bed + 'group' => "efg", + 'location' => "Hinxton", + 'contact' => 'ensembl-dev@ebi.ac.uk', + + 'input_feature_class' => 'result', + 'registry_host' => 'ens-livemirror', + 'registry_port' => 3306, + 'registry_user' => 'ensro', + #'assembly' => 37, + + 'host' => 'ens-genomics1', + 'port' => 3306, + 'feature_analysis' => 'bwa_samse', + 'recover' => 1, + + 'data_dir' =>'/lustre/scratch103/ensembl/funcgen', + 'output_dir' =>'/lustre/scratch103/ensembl/funcgen/output/'.$self->o('dbname'), + + #you can only add a slice at a time... see if list of values can be passed as params... + #'slice' => undef, + + #why is this needed?? + 'farm' => 1, + + + }; +} + +=head2 resource_classes + + Description : Implements resource_classes() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available + +=cut + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'urgent', 'LSF' => '-q yesterday' }, + 2 => { -desc => 'normal ens-genomics1', 'LSF' => '-R"select[myens_genomics1<1000] rusage[myens_genomics1=10:duration=10:decay=1]"' }, + 3 => { -desc => 'long ens-genomics1', 'LSF' => '-q long -R"select[myens_genomics1<1000] rusage[myens_genomics1=10:duration=10:decay=1]"' }, + 4 => { -desc => 'long high memory', 'LSF' => '-q long -M4000000 -R"select[mem>4000] rusage[mem=4000]"' }, + # + 5 => { -desc => 'long ens-genomics1 high memory', 'LSF' => '-q long -M6000000 -R"select[myens_genomics1<600 && mem>6000] rusage[myens_genomics1=12:duration=5:decay=1:mem=6000]"' }, + }; +} + + +=head2 pipeline_wide_parameters + + Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + + 'pipeline_name' => $self->o('pipedb_name'), # name used by the beekeeper to prefix job names on the farm + #Add a parameter $self->o('output_dir') + 'output_dir' => $self->o('output_dir')."/".$self->o('vendor'), + 'hive_output_dir' => $self->o('output_dir')."/".$self->o('vendor')."/hive_debug", + "species" => $self->o('species'), + "dbname" => $self->o("dbname"), + "user" => $self->o("user"), + "pass" => $self->o("pass"), + "input_dir" => $self->o('input_dir'), + + 'verbose' => $self->o('verbose'), + + 'format' => $self->o('format'), + 'vendor' => $self->o('vendor'), + 'parser' => $self->o('parser'), + 'group' => $self->o('group'), + 'location' => $self->o('location'), + 'contact' => $self->o('contact'), + + 'input_feature_class' => $self->o('input_feature_class'), + 'registry_host' => $self->o('registry_host'), + 'registry_port' => $self->o('registry_port'), + 'registry_user' => $self->o('registry_user'), + 'registry_version' => $self->o('registry_version'), + 'assembly' => $self->o('assembly'), + + 'host' => $self->o('host'), + 'port' => $self->o('port'), + 'feature_analysis' => $self->o('feature_analysis'), + 'recover' => $self->o('recover'), + + 'data_dir' => $self->o('data_dir'), + + #you can only add a slice at a time... see if list of values can be passed as params... + #'slice' => undef, + + #why is this needed?? + 'farm' => $self->o('farm'), + + + }; +} + + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands + that will create and set up the Hive database. + +=cut + + +sub pipeline_create_commands { + my ($self) = @_; + + + return [ + + #HiveGeneric assumes ensembl-hive folder while if you use the stable version its ensembl-hive_stable! + @{$self->SUPER::pipeline_create_commands}, + # inheriting database and hive tables creation + + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + + # standard eHive tables and procedures: + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', + + #Create hive output folders as required + 'mkdir -p '.$self->o('output_dir')."/".$self->o("vendor"), + 'mkdir -p '.$self->o('output_dir')."/".$self->o('vendor')."/hive_debug", + + ]; +} + + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + + +=cut + +sub pipeline_analyses { + my ($self) = @_; + + return [ + { + + -logic_name => 'setup_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::Import', + -parameters => { 'batch_job' => 0, 'prepared' => 0 }, + -input_ids => [ + # No initial input_ids... these will be added as needed by init_pipeline -job_topup + { 'cell_type' => $self->o('cell_type'), 'feature_type' => $self->o('feature_type'), 'input_set' => $self->o('input_set'), 'result_file' => $self->o('result_file') }, + #allow slice(s) for partial import... + #{ 'cell_type' => $self->o('cell_type'), 'feature_type' => $self->o('feature_type'), 'input_set' => $self->o('input_set'), 'result_file' => $self->o('result_file'), 'slice' => $self->o('slice') }, + ], + -flow_into => { + 1 => [ 'import_reads' ], + 2 => [ 'wrap_up_pipeline' ], + }, + -hive_capacity => 10, + -rc_id => 0, + #this really need revising as this is sorting the bed files + #Need to change resource to reserve tmp space + }, + + { + -logic_name => 'import_reads', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::Import', + -parameters => { 'batch_job' => 1, 'prepared' => 1 }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-1 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 50, + #Control files should be handled by setup_pipeline. + -rc_id => 5, # Better safe than sorry... size of datasets tends to increase... + #use semaphores... + #-wait_for => [ 'setup_pipeline' ] + }, + + { + -logic_name => 'wrap_up_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::Import', + -parameters => { 'wrap_up' => 1 }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-2 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 10, + -rc_id => 0, + #Use semaphores... + #-wait_for => [ 'run_peaks_DNAse', 'run_peaks' ], + }, + ]; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/MotifFinder_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/MotifFinder_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,234 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Funcgen::HiveConfig::MotifFinder_conf; + +=head1 SYNOPSIS + + # Example 1: specifying only the mandatory options (initial params are taken from defaults) +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::*_conf -password + + # Example 2: specifying the mandatory options as well as setting initial params: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::*_conf -password -p1name p1value -p2name p2value + + # Example 3: do not re-create the database, just load more tasks into an existing one: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::*_conf -job_topup -password -p1name p1value -p2name p2value + + +=head1 DESCRIPTION + + This is the Config file for the Motif Finder Pipeline + + Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here. + + The Motif Finder pipeline consists of several "analysis": + * SetupMotifPipeline + * InferSubMotifs + * ClusterMotifs + + Please see the implementation details in Runnable modules themselves. + +=head1 CONTACT + + Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions. + +=cut + + +package Bio::EnsEMBL::Funcgen::HiveConfig::MotifFinder_conf; + +use strict; +use warnings; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); +# All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly + + +=head2 default_options + + Description : Implements default_options() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options. + +=cut + +sub default_options { + my ($self) = @_; + return { + 'ensembl_cvs_root_dir' => $ENV{'SRC'}, + + 'pipeline_db' => { + -host => $self->o('dbhost'), + -port => $self->o('dbport'), + -user => $self->o('pipeuser'), + -pass => $self->o('pipepass'), + #-dbname => $ENV{USER}.'_peaks_'.$self->o('dbname'), + -dbname => $self->o('pipedb_name'), + }, + + 'dnadb_host' => 'ens-livemirror', + 'dnadb_port' => 3306, + "dnadb_user" => 'ensro', + + 'bin_dir' => "/software/ensembl/funcgen", + + 'bin_size' => 500, + 'window_size' => 50, + + }; +} + +=head2 resource_classes + + Description : Implements resource_classes() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available + +=cut + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'urgent', 'LSF' => '-q yesterday' }, + 2 => { -desc => 'normal ens-genomics1', 'LSF' => '-R"select[myens_genomics1<600 && myens_livemirror<600] rusage[myens_livemirror=10:myens_genomics1=10:duration=10:decay=1]"' }, + 3 => { -desc => 'long ens-genomics1', 'LSF' => '-q long -R"select[myens_genomics1<1000] rusage[myens_genomics1=10:duration=10:decay=1]"' }, + 4 => { -desc => 'long high memory', 'LSF' => '-q long -M4000000 -R"select[mem>4000] rusage[mem=4000]"' }, + + }; +} + + +=head2 pipeline_wide_parameters + + Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + + 'pipeline_name' => $self->o('pipedb_name'), # name used by the beekeeper to prefix job names on the farm + + 'output_dir' => $self->o('work_dir')."/motifs/results", + 'hive_output_dir' => $self->o('work_dir')."/motifs/hive_output", + + 'dbhost' => $self->o('dbhost'), + 'dbport' => $self->o('dbport'), + "dbuser" => $self->o("dbuser"), + "dbname" => $self->o("dbname"), + + 'dnadb_host' => $self->o('dnadb_host'), + 'dnadb_port' => $self->o('dnadb_port'), + "dnadb_user" => $self->o("dnadb_user"), + "dnadb_name" => $self->o("dnadb_name"), + + "species" => $self->o("species"), + + 'bin_dir' => $self->o('bin_dir'), + + 'bin_size' => $self->o('bin_size'), + 'window_size' => $self->o('window_size'), + + + + }; +} + + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands + that will create and set up the Hive database. + +=cut + + +sub pipeline_create_commands { + my ($self) = @_; + + + return [ + + @{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables creation + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + # standard eHive tables and procedures: + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_hive_root_dir').'/sql/tables.sql', + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_hive_root_dir').'/sql/procedures.sql', + + #Create hive output folders as required + 'mkdir -p '.$self->o('work_dir')."/motifs/results", + 'mkdir -p '.$self->o('work_dir')."/motifs/hive_output", + + ]; +} + + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + + +=cut + +sub pipeline_analyses { + my ($self) = @_; + + return [ + { + + -logic_name => 'setup_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::SetupMotifInference', + -parameters => { 'batch_job' => 0, 'prepared' => 0 }, + -input_ids => [ + # No initial input_ids... these will be added as needed by init_pipeline -job_topup + { 'feature_set' => $self->o('feature_set') }, + ], + -flow_into => { + 2 => [ 'infer_submotifs' ], + 3 => [ 'cluster_motifs' ], + }, + -hive_capacity => 10, + -rc_id => 0, + }, + + { + #This basically consists on running a command... + -logic_name => 'infer_submotifs', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-1 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 100, + -rc_id => 2, + #use semaphores... + #-wait_for => [ 'setup_pipeline' ] + }, + + { + #This basically consists on running a command... + -logic_name => 'cluster_motifs', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-2 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 10, + -rc_id => 0, + #Use semaphores... + #-wait_for => [ 'run_peaks_DNAse', 'run_peaks' ], + }, + ]; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Peaks_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/HiveConfig/Peaks_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ + +=pod + +=head1 NAME + + Bio::EnsEMBL::Funcgen::HiveConfig::Peaks_conf; + +=head1 SYNOPSIS + + # Example 1: specifying only the mandatory options (initial params are taken from defaults) +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Peaks_conf -password + + # Example 2: specifying the mandatory options as well as setting initial params: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Peaks_conf -password -p1name p1value -p2name p2value + + # Example 3: do not re-create the database, just load more tasks into an existing one: +init_pipeline.pl Bio::EnsEMBL::Funcgen::HiveConfig::Peaks_conf -job_topup -password -p1name p1value -p2name p2value + + +=head1 DESCRIPTION + + This is the Config file for the Peaks Pipeline + + Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here. + + The Peaks pipeline consists of several "analysis": + * SetupPeaksPipeline verifies the existence of experiments etc... + * RunSWEmbl make the peak calling and stores the annotated features... + * WrapUpSWEmbl do some filtering when needed and QC + + Please see the implementation details in Runnable modules themselves. + +=head1 CONTACT + + Please contact ensembl-dev@ebi.ac.uk mailing list with questions/suggestions. + +=cut + + +package Bio::EnsEMBL::Funcgen::HiveConfig::Peaks_conf; + +use strict; +use warnings; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); +# All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly + + +=head2 default_options + + Description : Implements default_options() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options. + +=cut + +sub default_options { + my ($self) = @_; + return { + %{$self->SUPER::default_options}, # inheriting database and hive tables creation + + 'pipeline_db' => { + -host => $self->o('dbhost'), + -port => $self->o('dbport'), + -user => $self->o('dbuser'), + -pass => $self->o('dbpass'), + #-dbname => $ENV{USER}.'_peaks_'.$self->o('dbname'), + -dbname => $self->o('pipedb_name'), + }, + + #'efgdb_host' => ... + #We can add default values for all these but tend to avoid since people quickly forget what those are... + + 'skip_control' => 0, + 'control_feature' => 'WCE', + 'bin_dir' => '/software/ensembl/funcgen', + 'control_file' => '' + + }; +} + +=head2 resource_classes + + Description : Implements resource_classes() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the LSF resource classes available + +=cut + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => '' }, + 1 => { -desc => 'urgent', 'LSF' => '-q yesterday' }, + 2 => { -desc => 'normal ens-genomics1', 'LSF' => '-R"select[myens_genomics1<1000] rusage[myens_genomics1=10:duration=10:decay=1]"' }, + 3 => { -desc => 'long ens-genomics1', 'LSF' => '-q long -R"select[myens_genomics1<1000] rusage[myens_genomics1=10:duration=10:decay=1]"' }, + 4 => { -desc => 'long high memory', 'LSF' => '-q long -M4000000 -R"select[mem>4000] rusage[mem=4000]"' }, + 5 => { -desc => 'long ens-genomics1 high memory', 'LSF' => '-q long -M4000000 -R"select[myens_genomics1<1000 && mem>4000] rusage[myens_genomics1=10:duration=10:decay=1:mem=4000]"' }, + }; +} + + +=head2 pipeline_wide_parameters + + Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. + The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically). + Please see existing PipeConfig modules for examples. + +=cut + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + %{$self->SUPER::pipeline_wide_parameters}, # inheriting database and hive tables creation + 'pipeline_name' => $self->o('pipedb_name'), # name used by the beekeeper to prefix job names on the farm + + + 'work_dir' => $self->o('work_dir'), # data directories and filenames + #'output_dir' => $self->o('work_dir').'/ehive/'.$self->o('efgdb_name').'/hive_results', + #'hive_output_dir' => $self->o('work_dir').'/ehive/'.$self->o('efgdb_name').'/hive_debug', + #'output_dir' => $self->o('output_dir').'/ehive/'.$self->o('efgdb_name').'/hive_results', + #'hive_output_dir' => $self->o('output_dir').'/ehive/'.$self->o('efgdb_name').'/hive_debug', + 'output_dir' => $self->o('output_dir').'/peaks/results', + 'hive_output_dir' => $self->o('output_dir').'/peaks/hive_debug', + 'bin_dir' => $self->o('bin_dir'), + + #Maybe use parameters instead of ENV Variables... use ENV variables in the pipeline ENV + "dnadb" => { + "-host" => $self->o('dnadb_host'), + "-port" => $self->o('dnadb_port'), + "-user" => $self->o('dnadb_user'), + "-dbname" => $self->o('dnadb_name'), + }, + "efgdb" => { + "-host" => $self->o('dbhost'), + "-port" => $self->o('dbport'), + "-user" => $self->o('dbuser'), + "-pass" => $self->o('dbpass'), + "-dbname" => $self->o('dbname'), + }, + #This could be inferred from the db, but it's probably safer(?) to pass as parameter... + "species" => $self->o('species'), + #May pass this to input_id... to allow for files of different assemblies in the same pipeline run. + "assembly" => $self->o('assembly'), + + }; +} + + +=head2 pipeline_create_commands + + Description : Implements pipeline_create_commands() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands + that will create and set up the Hive database. + +=cut + +sub pipeline_create_commands { + my ($self) = @_; + return [ + #HiveGeneric assumes ensembl-hive folder while if you use the stable version it's ensembl-hive_stable! + @{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables creation + + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + + # standard eHive tables and procedures: + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + #'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', + + #Create hive output folders as required + 'mkdir -p '.$self->o('output_dir').'/peaks/results', + 'mkdir -p '.$self->o('output_dir').'/peaks/hive_debug', + + #This command makes the init_pipeline script fail. + #Any value in meta can be added with pipeline_wide_parameters() + #'mysql '.$self->dbconn_2_mysql($pipeline_db, 1).' -e "INSERT INTO meta (meta_key, meta_value) VALUES (\'hive_output_dir\',\''.$self->o('work_dir').'/ehive/'.$self->o('efgdb_name').'/hive_debug\');"', + + ]; +} + + +=head2 pipeline_analyses + + Description : Implements pipeline_analyses() interface method of + Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc. + + +=cut + +sub pipeline_analyses { + my ($self) = @_; + + return [ + { + + -logic_name => 'setup_pipeline', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::SetupPeaksPipeline', + -parameters => {}, + -input_ids => [ + # No initial input_ids... these will be added as needed by init_pipeline -job_topup + { 'cell_type' => $self->o('cell_type'), 'feature_type' => $self->o('feature_type'), 'experiment_name' => $self->o('experiment_name'), 'file_type' => $self->o('file_type'), 'analysis' => $self->o('analysis_name'), 'skip_control' => $self->o('skip_control'), 'control_feature' => $self->o('control_feature'), 'control_file' => $self->o('control_file') }, +# , 'control_file' =>$self->o('cell_type')."_".$self->o('control_feature')."_".$self->o('experiment_name').".samse.".$self->o('file_type').".gz" }, + ], + -flow_into => { + 3 => [ 'run_peaks' ], + 4 => [ 'run_peaks_wide' ], + #2 => [ 'wrap_up_pipeline' ], + }, + #These jobs cannot run in parallel due to race conditions! Do NOT change this setting unless you know what you're doing + -hive_capacity => 1, + -rc_id => 0, + }, + + { + -logic_name => 'run_peaks', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::RunSWEmbl', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-3 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 10, + #Control files should be handled by setup_pipeline. + -rc_id => 5, # Better safe than sorry... size of datasets tends to increase... + -wait_for => [ 'setup_pipeline' ] + }, + + { + -logic_name => 'run_peaks_wide', + -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT', + -parameters => { }, + -input_ids => [ + # (jobs for this analysis will be flown_into via branch-3 from 'setup_pipeline' jobs above) + ], + -hive_capacity => 10, + #Control files should be handled by setup_pipeline. + -rc_id => 2, # CCAT does not need much + -wait_for => [ 'setup_pipeline' ] + }, + + #{ + # -logic_name => 'wrap_up_pipeline', + # -module => 'Bio::EnsEMBL::Funcgen::RunnableDB::WrapUpPeaksPipeline', + # -parameters => {}, + # -input_ids => [ + # # (jobs for this analysis will be flown_into via branch-1 from 'setup_pipeline' jobs above) + # ], + # #TODO see if it can run in paralell, usually we should be able to + # -hive_capacity => 10, + # -wait_for => [ 'run_peaks_DNAse', 'run_peaks' ], + #}, + ]; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2708 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Importer + +=head1 SYNOPSIS + +my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); +$imp->register_experiment(); + + +=head1 DESCRIPTION + +B is the main class coordinating import of tiling array design and experimental data. +It utilises several underlying parser classes specific to array vendor or import file type. + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Importer; + +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd); +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Experiment; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +use File::Path; +use strict; +use vars qw(@ISA); + + +################################################################################ + +=head2 new + + Description : Constructor method + + Arg [1] : hash containing optional attributes: + -name Name of Experiment(dir) + -format of array e.g. Tiled(default) + -vendor name of array vendor + -description of the experiment + -pass DB password + -host DB host + -user DB user + -port DB port + -registry_host Host to load registry from + -registry_port Port for registry host + -registry_user User for registry host + -registry_pass Password for registry user + -ssh Flag to set connection over ssh via forwarded port to localhost (default = 0); remove? + -group name of experimental/research group +` -location of experimental/research group + -contact e/mail address of primary contact for experimental group + -species + -assembly Genome assembly version i.e. 36 for NCBI36 + -recover Recovery flag (default = 0) + -data_dir Root data directory (default = $ENV{'EFG_DATA'}) + -output_dir review these dirs ??????? + -input_dir ????????? + -import_dir ??????? + -norm_dir ?????? + -fasta dump FASTA flag (default =0) + -array_set Flag to treat all chip designs as part of same array (default = 0) + -array_name Name for array set + -array_file Path of array file to import for sanger ENCODE array + -result_set_name Name to give the raw and normalised result sets (default uses experiment and analysis name) + -norm_method Normalisation method (Nimblegen default = VSN_GLOG); + -dbname Override for autogeneration of funcgen dbaname + -reg_config path to local registry config file (default = ~/ensembl.init || undef) + -design_type MGED term (default = binding_site_identification) get from meta/MAGE? + + -farm Flag to submit jobs to farm e.g. normalisation jobs + -batch_job Flag to signify that this Importer is running as a prepared batch/farm job + -prepared Flag to signify result files have been previously imported in prepare mode + and file names will differ to those record in InputSubset + + + #-use_defaults This changes some mandatory parameters to optional, instead using either DEFAULT or the input file name for the following options -name, -input_set, -feature_type, -cell_type etc ??? + + -verbose + ReturnType : Bio::EnsEMBL::Funcgen::Importer + Example : my $Exp = Bio::EnsEMBL::Importer->new(%params); + Exceptions : throws if mandatory params are not set or DB connect fails + Caller : General + Status : Medium - potential for %params names to change, remove %attrdata? + +=cut + +################################################################################ + +sub new{ + my ($caller) = shift; + + #my $reg = "Bio::EnsEMBL::Registry"; + my $class = ref($caller) || $caller; + + #$user, $host, $port, $pass, $dbname, $db, $assm_version, $reg_config, $reg_db, $ucsc_coords, $verbose, $release, $reg_host, $reg_port, $reg_user, $reg_pass + #$ftype_name, $ctype_name, + + my ($name, $format, $vendor, $group, $location, $contact, + $array_name, $array_set, $array_file, $data_dir, $result_files, + $exp_date, $desc, $design_type, $output_dir, $input_dir, + $batch_job, $farm, $prepared, + $norm_method, $old_dvd_format, $parser_type) + = rearrange(['NAME', 'FORMAT', 'VENDOR', 'GROUP', 'LOCATION', 'CONTACT', + 'ARRAY_NAME', 'ARRAY_SET', 'ARRAY_FILE', 'DATA_DIR', 'RESULT_FILES', + 'EXPERIMENT_DATE', 'DESCRIPTION', + 'DESIGN_TYPE', 'OUTPUT_DIR', 'INPUT_DIR', #to allow override of defaults + 'BATCH_JOB', 'FARM', 'PREPARED', 'NORM_METHOD', + 'OLD_DVD_FORMAT', 'PARSER'], @_); + + + + #### Define parent parser class based on vendor + throw("Mandatory argument -vendor not defined") if ! defined $vendor; + + #This will override the default Vendor Parser type + #Evals simply protect from messy errors if parser type not found + my $parser_error; + my $vendor_parser = ucfirst(lc($vendor)); + + + #WARNING evaling these parsers to enable pluggability hides errors in parser + #use a perl -MBio::EnsEMBL::Funcgen::Parsers:ParserType to debug + #get rid of all this case guessing and force correct parser name usage? + + + #WARNING + #Dynamic setting of ISA in this way reports the resultant object as Importer, when + #some throws/methods are actually in other base/custom Parsers + #This can seem a little counterintuitive, but allows plugability + #With out the need for separate control scripts + + + #Change this to be set and required/detected in the parse_and_import.pl script + #Then we can have Importer.pm as the base class and get rid of this. + #as well as set_config methods? + + + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${vendor_parser}.pm";}; + + if ($@) { + #Don't warn/throw yet as we might have a standard parser format + + $parser_error .= "There is no valid parser for the vendor your have specified:\t".$vendor. + "\nMaybe this is a typo or maybe you want to specify a default import format using the -parser option\n".$@; + } + + + + if (defined $parser_type) { + + #try normal case first + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; + + if ($@) { + $parser_type = ucfirst(lc($parser_type)); + + #Now eval the new parser + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; + + if ($@) { + + #Might be no default + my $txt = "There is no valid parser for the -parser format your have specified:\t".$parser_type."\n"; + + if (! $parser_error) { + $txt .= "Maybe this is a typo or maybe you want run with the default $vendor_parser parser\n"; + } + + throw($txt.$@); + } + + #warn about over riding vendor parser here + if (! $parser_error) { + #Can't log this as we haven't blessed the Helper yet + warn("WARNING\t::\tYou are over-riding the default ".$vendor." parser with -parser ".$parser_type); + } + } + } else { + throw($parser_error) if $parser_error; + $parser_type = $vendor_parser; + } + + + #we should now really set parser_type as an attrtibute? + unshift @ISA, 'Bio::EnsEMBL::Funcgen::Parsers::'.$parser_type; + #change this to be called explicitly from the load script? + + #### Create object from parent class + + my $self = $class->SUPER::new(@_); + + #### Set vars and test minimum mandatory params for any import type + + $self->{'name'} = $name || throw('Mandatory param -name not met'); #This is not mandatory for array design import + ## $self->{'user'} = $user || $ENV{'EFG_WRITE_USER'}; + $self->vendor(uc($vendor)); #already tested + $self->{'format'} = uc($format) || 'TILED'; #remove default? + $self->group($group) if $group; + $self->location($location) if $location; + $self->contact($contact) if $contact; + ## $species || throw('Mandatory param -species not met'); + $self->array_name($array_name) if $array_name; + $self->array_set($array_set) if $array_set; + $self->array_file($array_file) if $array_file; + $self->{'data_dir'} = $data_dir || $ENV{'EFG_DATA'}; + $self->result_files($result_files)if $result_files; + $self->experiment_date($exp_date) if $exp_date; + $self->description($desc) if $desc; #experiment + ## $self->feature_set_description($fset_desc) if $fset_desc; + + #$assm_version || throw('Mandatory param -assembly not met'); + #Only required if setting DB by params e.g db not passed or generated from reg + #i.e. most of the time + #Why was this made mandatory? + #Default to dnadb||efgdb assm from the dbname + + $self->{'design_type'} = $design_type || 'binding_site_identification'; #remove default? + $self->{'output_dir'} = $output_dir if $output_dir; #config default override + $self->{'input_dir'} = $input_dir if $input_dir; #config default override + $self->farm($farm) if $farm; + $self->batch_job($batch_job); + $self->prepared($prepared); + ## $self->{'ssh'} = $ssh || 0; + ## $self->{'_dump_fasta'} = $fasta || 0; + #$self->{'recover'} = $recover || 0; Now in BaseImporter + #check for ~/.ensembl_init to mirror general EnsEMBL behaviour + ## $self->{'reg_config'} = $reg_config || ((-f "$ENV{'HOME'}/.ensembl_init") ? "$ENV{'HOME'}/.ensembl_init" : undef); + $self->{'old_dvd_format'} = $old_dvd_format || 0; + ## $self->{'ucsc_coords'} = $ucsc_coords || 0; + ## $self->{'verbose'} = $verbose || 0; + ## $self->{'release'} = $release; + + + + ## if($reg_host && $self->{'reg_config'}){ + ## warn "You have specified registry parameters and a config file:\t".$self->{'reg_config'}. + ## "\nOver-riding config file with specified paramters:\t${reg_user}@${reg_host}:$reg_port"; + ## } + + + #Will a general norm method be applicable for all imports? + #Already casued problems with Bed imports... remove? + #Could set NORM_METHOD in Parser!! + #warn "Need to fully implement norm_method is validate_mage, remove ENV NORM_METHOD?"; + $self->{'norm_method'} = $norm_method; # || $ENV{'NORM_METHOD'}; + + #if ($self->vendor ne 'NIMBLEGEN'){ + # $self->{'no_mage'} = 1; + # warn "Hardcoding no_mage for non-NIMBLEGEN imports"; + # } + + + # if($self->{'no_mage'} && $self->{'write_mage'}){ + # throw('-no_mage and -write_mage options are mutually exclusive, please select just one'); + # } + + ## #### Set up DBs and load and reconfig registry + ## + ## ### Load Registry + ## #Can we load the registry using the assembly version, then just redefine the efg DB? + ## #We have problems here if we try and load on a dev version, where no dev DBs are available on ensembldb + ## #Get the latest API version for the assembly we want to use + ## #Then load the registry from that version + ## #Then we can remove some of the dnadb setting code below? + ## #This may cause problems with API schema mismatches + ## #Can we just test whether the current default dnadb contains the assembly? + ## #Problem with this is that this will not have any other data e.g. genes etc + ## #which may be required for some parsers + ## + ## #How does the registry pick up the schema version?? + ## + ## #We should really load the registry first given the dnadb assembly version + ## #Then reset the eFG DB as appropriate + ## + ## + ## if ($reg_host || ! defined $self->{'_reg_config'}) { + ## #defaults to current ensembl DBs + ## $reg_host ||= 'ensembldb.ensembl.org'; + ## $reg_user ||= 'anonymous'; + ## + ## #Default to the most recent port for ensdb + ## if( (! $reg_port) && + ## ($reg_host eq 'ensdb-archive') ){ + ## $reg_port = 5304; + ## } + ## + ## #This will try and load the dev DBs if we are using v49 schema or API? + ## #Need to be mindful about this when developing + ## #we need to tip all this on it's head and load the reg from the dnadb version!!!!!!! + ## + ## my $version_text= ($self->{'release'}) ? 'version '.$self->{'release'} : 'current version'; + ## $self->log("Loading $version_text registry from $reg_user".'@'.$reg_host); + ## + ## #Note this defaults API version, hence running with head code + ## #And not specifying a release version will find not head version + ## #DBs on ensembldb, resulting in an exception from reset_DBAdaptor below + ## $reg->load_registry_from_db( + ## -host => $reg_host, + ## -user => $reg_user, + ## -port => $reg_port, + ## -pass => $reg_pass, + ## #-host => "ens-staging", + ## #-user => 'ensro', + ## -db_version => $self->{'release'},#51 + ## -verbose => $self->verbose, + ## ); + ## + ## throw('Not sensible to set the import DB as the default eFG DB from '.$reg_host.', please define db params') if ((! $dbname) && (! $db)); + ## } + ## else{ + ## $self->log("Loading registry from:\t".$self->{'_reg_config'}); + ## $reg->load_all($self->{'_reg_config'}, 1); + ## } + ## + ## + ## #Need to test the DBs here, as we may not have loaded any! + ## #get_alias wil fail otherwise? + ## #This is a cyclical dependancy as we need alias to get species which we use to grab the DB + ## #alias is dependant on core DB being loaded with relevant meta entries. + ## #revise this when we split the Importer + ## + ## + ## #Validate species + ## my $alias = $reg->get_alias($species) || throw("Could not find valid species alias for $species\nYou might want to clean up:\t".$self->get_dir('output')); + ## $self->species($alias); + ## $self->{'param_species'} = $species;#Only used for dir generation + ## + ## + ## #SET UP DBs + ## if($db){ + ## #db will have been defined before reg loaded, so will be present in reg + ## + ## if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + ## $self->throw('-db must be a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + ## } + ## } + ## else{ #define eFG DB from params or registry + ## + ## if($reg_db){#load eFG DB from reg + ## + ## if($dbname){ + ## throw("You cannot specify DB params($dbname) and load from the registry at the same time."); + ## } + ## + ## $self->log('WARNING: Loading eFG DB from Registry'); + ## $db = $reg->get_DBAdaptor($self->species(), 'funcgen'); + ## throw("Unable to retrieve ".$self->species." funcgen DB from the registry") if ! $db; + ## } + ## else{#resets the eFG DB in the custom or generic registry + ## + ## $dbname || throw('Must provide a -dbname if not using default custom registry config'); + ## #$user || throw('Must provide a -user parameter');#make this default to EFG_WRITE_USER? + ## $pass || throw('Must provide a -pass parameter'); + ## + ## #remove this and throw? + ## if(! defined $host){ + ## $self->log('WARNING: Defaulting to localhost'); + ## $host = 'localhost'; + ## } + ## + ## $port ||= 3306; + ## my $host_ip = '127.0.0.1';#is this valid for all localhosts? + ## + ## if ($self->{'ssh'}) { + ## $host = `host localhost`; #mac specific? nslookup localhost wont work on server/non-PC + ## #will this always be the same? + ## + ## if (! (exists $ENV{'EFG_HOST_IP'})) { + ## warn "Environment variable EFG_HOST_IP not set for ssh mode, defaulting to $host_ip for $host"; + ## } else { + ## $host_ip = $ENV{'EFG_HOST_IP'}; + ## } + ## + ## if ($self->host() ne 'localhost') { + ## warn "Overriding host ".$self->host()." for ssh connection via localhost($host_ip)"; + ## } + ## } + ## + ## + ## #data version is only used if we don't want to define the dbname + ## #This should never be guessed so don't need data_version here + ## #$dbname ||= $self->species()."_funcgen_".$self->data_version(); + ## + ## + ## #Remove block below when we can + ## my $dbhost = ($self->{'ssh'}) ? $host_ip : $host; + ## + ## #This isn't set yet!? + ## #When we try to load, say 49, when we only have 48 on ensembldb + ## #This fails because there is not DB set for v49, as it is not on ensembl DB + ## #In this case we need to load from the previous version + ## #Trap this and suggest using the -schema_version/release option + ## #Can we autodetect this and reload the registry? + ## #We want to reload the registry anyway with the right version corresponding to the dnadb + ## #We could either test for the db in the registry or just pass the class. + ## + ## $db = $reg->reset_DBAdaptor($self->species(), 'funcgen', $dbname, $dbhost, $port, $self->user, $pass, + ## { + ## -dnadb_host => $reg_host, + ## -dnadb_port => $reg_port, + ## -dnadb_assembly => $assm_version, + ## -dnadb_user => $reg_user, + ## -dnadb_pass => $reg_pass, + ## }); + ## + ## + ## #ConfigRegistry will try and set this + ## #This will fail if there is already one in the registry as it will try + ## #and defined a new unique species so as not to overwrite the original + ## #e.g. homo_sapiens1 + ## + ## #This is why it was orignally written backwards as we can't easily dynamically redefine + ## #an adaptor in the registry without ConfigRegistry trying to change the name + ## #the very act of creating a new db to redefine the registry with causes ConfigRegistry + ## #to try and register it with a unique species name + ## + ## #Delete the old funcgen DB from the registry first + ## #$reg->remove_DBAdaptor($self->species, 'funcgen'); + ## + ## #ConfigRegistry will automatically configure this new db + ## + ## #$db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + ## # -user => $user, + ## # -host => ($self->{'ssh'}) ? $host_ip : $host, + ## # -port => $port, + ## # -pass => $pass, + ## # #we need to pass dbname else we can use non-standard dbs + ## # -dbname => $dbname, + ## # -species => $self->species(), + ## # -group => 'funcgen', + ## # ); + ## + ## + ## #if we get a species like homo_sapiens1 here + ## #This is because ConfigRegistry is try to make the dbname different between the + ## #one already present and the one you're trying to add + ## } + ## } + ## + ## + ## ### VALIDATE DNADB + ## #This is now done in DBAdaptor + ## + ## #We can change this to just use the assembly version + ## #we could even have the wordy assmelby version from the meta table + ## #do the standard ensembl subs + ## #s/[A-Za-z]//g; + ## #s/\.//g; + ## #And then validate? + ## #Just stick to number version for now. + ## + ## + ## #Now we need to set the dnadb_host params to avoid ensembldb defaults + ## #This should check the registry first + ## #Then load from the registry db? + ## #If we have a multi host registry config file this isn't going to work! + ## + ## #Is this required anymore as the DBAdaptor handles this? + ## #Not if we pass a db with an incorrect dnadb attached. + ## + ## #if($db->_get_schema_build($db->dnadb()) !~ /_[0-9]+_${assm_version}[a-z]*$/){ + ### my $warning = "WARNING: dnadb does not match assembly_version $assm_version. Using ensembldb.enembl.org to define the dnadb"; + ### $warning .= ' rather than the reg_config' if (defined $self->{'_reg_config'}); + ## + ## #We need to account for reg_config DBs which may have custom info in + ## #So try reg_config host first, then try ensembldb with warning + ## #Could have a reg_config only flag for core dbs + ## #Need to implement more params in set_dnadb_by_assembly_version + ### $self->log($warning); + ## + ### $db->set_dnadb_by_assembly_version($assm_version); + ### } + ## + ## + ## + ## + ## #Test connections + ## $self->db($db); + ## $db->dbc->db_handle; + ## $db->dnadb->dbc->db_handle; + ## #Set re/disconnect options + ## + ## #These really need setting dependant on the import parser + ## $db->dbc->disconnect_when_inactive(1); + ## $db->dnadb->dbc->disconnect_when_inactive(1); + + + + ## ### Check analyses/feature_type/cell_type + ## if($feature_analysis){ + ## my $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($feature_analysis); + ## throw("The Feature Analysis $feature_analysis does not exist in the database") if(!$fanal); + ## $self->feature_analysis($fanal); + ## + ## #This currently fails before the config gets loaded! + ## #Need to load config before this validation! + ## } + ## + ## if($ctype_name){ + ## my $ctype = $self->db->get_CellTypeAdaptor->fetch_by_name($ctype_name); + ## throw("The CellType $ctype_name does not exist in the database") if(!$ctype); + ## $self->cell_type($ctype); + ## } + ## + ## if ($ftype_name) { + ## my $ftype = $self->db->get_FeatureTypeAdaptor->fetch_by_name($ftype_name); + ## throw("The FeatureType $ftype_name does not exist in the database") if(!$ftype); + ## $self->feature_type($ftype); + ## } + + + #Set config here instead? + #So we can check all mandatory params + #Set vendor specific attr dependent vars + + #Generic input dir + $self->{'input_dir'} ||= $self->get_dir("data").'/input/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name(); + + if (! -d $self->get_dir('input')) { + + if (@{$self->result_files}) { + #This is really InputSet specific + #Could go in init_experiment_import + $self->log("Processing files:\n\t\t".join("\n\t\t",@{$self->result_files})); + } else { + throw('input_dir is not defined or does not exist ('.$self->get_dir('input').')'); + } + } + + #Parser specific config + $self->set_config(); + + + $self->debug(2, "Importer class instance created."); + $self->debug_hash(3, \$self); + + return ($self); +} + +=head2 registry_host + + Example : my $reg_host = $imp->registry_host; + Description: Accessor for registry host attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_host{ + return $_[0]->{'reg_host'}; +} + +=head2 registry_user + + Example : my $reg_user = $imp->registry_user; + Description: Accessor for registry user attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_user{ + return $_[0]->{'reg_user'}; +} + +=head2 registry_port + + Example : my $reg_port = $imp->registry_port; + Description: Accessor for registry port attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_port{ + return $_[0]->{'reg_port'}; +} + +=head2 registry_pass + + Example : my $reg_pass = $imp->registry_pass; + Description: Accessor for registry pass attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_pass{ + return $_[0]->{'reg_pass'}; +} + + +#init method kept separate from new due to differing madatory check and set up + +=head2 init_array_import + + Example : $self->init_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register_array_design + +=cut + +sub init_array_import{ + + my ($self) = shift; + + # we need to define which paramters we'll be storing + #use the logic names of the analyses as the field headers + + #need to test for vendor here + + #Sanger, NIMBLEGEN(no design_id issue, could get from the ndf, but we want it in the DesignNotes.txt) + #Then we can change the Array/Chip generation to soley use the DesignNotes.txt rather than SampleKey + #which is experiment specific + #or eFG format. + + $self->create_output_dirs('caches', 'fastas'); + + +} + + +=head2 init_experiment_import + + Example : $self->init_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register exeriment + +=cut + +sub init_experiment_import{ + my ($self) = shift; + + #Change this to take config mandatory params? + #No specific to exp import + #Name is used in set_config anyway + #Currently we only have array and experiment import, both of which should have names + #Make mandatory? + + foreach my $tmp ("group", "data_dir") { #name now generically mandatory + throw("Mandatory arg $tmp not been defined") if (! defined $self->{$tmp}); + } + #Should we separate path on group here too, so we can have a dev/test group? + + #Create output dirs + #This should be moved to the Parser to avoid generating directories which are needed for different imports + $self->create_output_dirs('raw', 'norm', 'caches', 'fastas'); + throw("No result_files defined.") if (! defined $self->result_files()); + + #Log input files + #if (@{$self->result_files()}) { + # $self->log("Found result files arguments:\n\t".join("\n\t", @{$self->result_files()})); + #} + #This is done in new + + #check for cell||feature and warn if no met file supplied? + + if ($self->norm_method) { + my $norm_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method); + + #should we list the valid analyses? + throw($self->norm_method.' is not a valid analysis') if ! $norm_anal; + $self->norm_analysis($norm_anal); + } else { + $self->log('WARNING: No normalisation analysis specified'); + } + + #warn "Need to check env vars here or in Parser or just after set_config?"; + #Need generic method for checking ENV vars in Helper + #check for ENV vars? + #R_LIBS + #R_PATH if ! farm + #R_FARM_PATH + + $self->validate_group(); #import experimental_group + + #Get experiment + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + my $exp = $exp_adaptor->fetch_by_name($self->name()); #, $self->group()); + $self->process_experiment_config if $self->can('process_experiment_config'); #Parsers::MAGE::process_experiment_config + + #Moved MAGE support form here to MAGE.pm + + #Recovery now set so deal with experiment + if ($self->recovery() && ($exp)) { + $self->log("Using previously stored Experiment:\t".$exp->name); + } elsif ((! $self->recovery()) && $exp) { + throw("Your experiment name is already registered in the database, please choose a different \"name\", this will require renaming you input directory, or specify -recover if you are working with a failed/partial import."); + #can we skip this and store, and then check in register experiment if it is already stored then throw if not recovery + } else { # (recover && exp) || (recover && ! exp) + + + $exp = Bio::EnsEMBL::Funcgen::Experiment->new( + -EXPERIMENTAL_GROUP => $self->{egroup}, + -NAME => $self->name(), + -DATE => $self->experiment_date(), + -PRIMARY_DESIGN_TYPE => $self->design_type(), + -DESCRIPTION => $self->description(), + -ADAPTOR => $self->db->get_ExperimentAdaptor(), + ); + + ($exp) = @{$exp_adaptor->store($exp)}; + } + + + $self->experiment($exp); + + #remove and add specific report, this is catchig some Root stuff + #$self->log("Initiated efg import with following parameters:\n".Data::Dumper::Dumper(\$self)); + + return; +} + + +#Move this to new or init_experiment + +=head2 validate_group + + Example : $self->validate_group(); + Description: Validates groups details + Returntype : none + Exceptions : throws if insufficient info defined to store new Group and is not already present + Caller : general + Status : Medium - check location and contact i.e. group name clash? + +=cut + +sub validate_group{ + my ($self) = shift; + + my $egroup = $self->db->get_ExperimentalGroupAdaptor->fetch_by_name($self->group); + + if (! defined $egroup) { + + throw("validate_group does not yet fully support ExperimentalGroups, please add manually"); + + #if ($self->location() && $self->contact()) { + # $self->db->import_group($self->group(), $self->location, $self->contact()); + #} else { + # throw("Group ".$self->group()." does not exist, please specify a location and contact to register the group"); + #} + } + + $self->{egroup} = $egroup; + + return; +} + +=head2 create_output_dirs + + Example : $self->create_output_dirs(); + Description: Does what it says on the tin, creates dirs in + the root output dir foreach @dirnames, also set paths in self + Arg [1] : mandatory - list of dir names + Returntype : none + Exceptions : none + Caller : general + Status : Medium - add throw? + +=cut + +sub create_output_dirs{ + my ($self, @dirnames) = @_; + + #output dir created in control script + #avoids errors when logs generated first + + + foreach my $name (@dirnames) { + + if ($name eq 'caches') { + $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/".$self->db->dbc->dbname() if(! defined $self->{"${name}_dir"}); + } elsif ($name eq 'fastas') { + $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/" if(! defined $self->{"${name}_dir"}); + } else { + $self->{"${name}_dir"} = $self->get_dir('output')."/${name}" if(! defined $self->{"${name}_dir"}); + } + + if (! (-d $self->get_dir($name) || (-l $self->get_dir($name)))) { + $self->log("Creating directory:\t".$self->get_dir($name)); + #This did not throw with mkdir!! + mkpath $self->get_dir($name) || throw('Failed to create directory: '. $self->get_dir($name)); + chmod 0744, $self->get_dir($name); + } + } + + return; +} + +=head2 vendor + + Example : $imp->vendor("NimbleGen"); + Description: Getter/Setter for array vendor + Arg [1] : optional - vendor name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub vendor{ + my ($self) = shift; + + if (@_) { + $self->{'vendor'} = shift; + $self->{'vendor'} = uc($self->{'vendor'}); + } + + return $self->{'vendor'}; +} + + +=head2 feature_type + + Example : $imp->feature_type($ftype); + Description: Getter/Setter for Experiment FeatureType + Arg [1] : optional - Bio::EnsEMBL::Funcgen::FeatureType + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub feature_type{ + my ($self) = shift; + + if (@_) { + my $ftype = shift; + + #do we need this as we're checking in new? + if (! ($ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType') && $ftype->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType"); + } + + $self->{'feature_type'} = $ftype; + } + + return $self->{'feature_type'}; +} + +=head2 feature_analysis + + Example : $imp->feature_analysis($fanal); + Description: Getter/Setter for Analysis used for creating the imported Features + Arg [1] : optional - Bio::EnsEMBL::Analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub feature_analysis{ + my ($self) = shift; + + if (@_) { + my $fanal = shift; + + #do we need this as we're checking in new? + if (! (ref ($fanal) && $fanal->isa('Bio::EnsEMBL::Analysis') && $fanal->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->{'feature_analysis'} = $fanal; + } + + return $self->{'feature_analysis'}; +} + +=head2 norm_analysis + + Example : $imp->norm_analysis($anal); + Description: Getter/Setter for the normalisation analysis + Arg [1] : optional - Bio::EnsEMBL::Analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub norm_analysis{ + my ($self) = shift; + + if (@_) { + my $anal = shift; + + #do we need this as we're checking in new? + if (! (ref($anal) && $anal->isa('Bio::EnsEMBL::Analysis') && $anal->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->{'norm_analysis'} = $anal; + } + + return $self->{'norm_analysis'}; +} + + + +=head2 cell_type + + Example : $imp->cell_type($ctype); + Description: Getter/Setter for Experiment CellType + Arg [1] : optional - Bio::EnsEMBL::Funcgen::CellType + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub cell_type{ + my ($self) = shift; + + if (@_) { + my $ctype = shift; + + #do we need this as we're checking in new? + if (! ($ctype->isa('Bio::EnsEMBL::Funcgen::CellType') && $ctype->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::CellType"); + } + + $self->{'cell_type'} = $ctype; + } + + return $self->{'cell_type'}; +} + + +##=head2 ucsc_coords +## +## Example : $start += 1 if $self->ucsc_coords; +## Description: Getter for UCSC coordinate usage flag +## Returntype : boolean +## Exceptions : none +## Caller : general +## Status : at risk +## +##=cut +## +##sub ucsc_coords{ +## my $self = shift; +## return $self->{'ucsc_coords'}; +##} + + + +=head2 array_file + + Example : my $array_file = $imp->array_file(); + Description: Getter/Setter for sanger/design array file + Arg [1] : optional - path to adf or gff array definition/mapping file + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_file{ + my ($self) = shift; + $self->{'array_file'} = shift if(@_); + return $self->{'array_file'}; +} + +=head2 array_name + + Example : my $array_name = $imp->array_name(); + Description: Getter/Setter for array name + Arg [1] : optional string - name of array + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_name{ + my ($self) = shift; + $self->{'array_name'} = shift if(@_); + return $self->{'array_name'}; +} + + +=head2 array_set + + Example : $imp->array_set(1); + Description: Getter/Setter for array set flag + Arg [1] : optional boolean - treat all array chips as the same array + Returntype : boolean + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_set{ + my ($self) = shift; + $self->{'array_set'} = shift if(@_); + return $self->{'array_set'}; +} + + +=head2 add_Array + + Arg [1] : Bio::EnsEMBL::Funcgen::Array + Example : $self->add_Array($array); + Description: Setter for array elements + Returntype : none + Exceptions : throws if passed non Array or if more than one Array set + Caller : Importer + Status : At risk - Implement multiple arrays? Move to Experiment? + +=cut + +sub add_Array{ + my $self = shift; + + #do we need to check if stored? + if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Array')) { + throw("Must supply a Bio::EnsEMBL::Funcgen::Array"); + } elsif (@_) { + push @{$self->{'arrays'}}, @_; + } + + throw("Does not yet support multiple array imports") if(scalar (@{$self->{'arrays'}}) > 1); + #need to alter read_probe data at the very least + + return; +} + + + +=head2 arrays + + Example : foreach my $array(@{$imp->arrays}){ ...do an array of things ...}; + Description: Getter for the arrays attribute + Returntype : ARRAYREF + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub arrays{ + my $self = shift; + + if (! defined $self->{'arrays'}) { + $self->{'arrays'} = $self->db->get_ArrayAdaptor->fetch_all_by_Experiment($self->experiment()); + } + + return $self->{'arrays'}; +} + + + +=head2 location + + Example : $imp->vendor("Hinxton"); + Description: Getter/Setter for group location + Arg [1] : optional - location + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub location{ + my ($self) = shift; + $self->{'location'} = shift if(@_); + return $self->{'location'}; +} + + +=head2 contact + + Example : my $contact = $imp->contact(); + Description: Getter/Setter for the group contact + Arg [1] : optional - contact name/email/address + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub contact{ + my ($self) = shift; + $self->{'contact'} = shift if(@_); + return $self->{'contact'}; +} + +=head2 name + + Example : $imp->name('Experiment1'); + Description: Getter/Setter for the experiment name + Arg [1] : optional - experiment name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub name{ + my ($self) = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + +=head2 result_files + + Example : $imp->result_files(\@files); + Description: Getter/Setter for the result file paths + Arg [1] : Listref of file paths + Returntype : Listref + Exceptions : none + Caller : general + Status : At risk + +=cut + + +sub result_files{ + my ($self) = shift; + $self->{'result_files'} = shift if(@_); + return $self->{'result_files'}; +} + + + + + +=head2 experiment_date + + Example : $imp->experiment_date('2006-11-02'); + Description: Getter/Setter for the experiment date + Arg [1] : optional - date string in yyyy-mm-dd + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + + + +sub experiment_date{ + my ($self) = shift; + + if (@_) { + my $date = shift; + + if ($date !~ /[0-9]{4}-[0-9]{2}[0-9]{2}/o) { + throw('Parameter -experiment_date needs to fe in the format: YYYY-MM-DD'); + } + + $self->{'experiment_date'} = $date; + } elsif ($self->vendor() eq "nimblegen" && ! defined $self->{'experiment_date'}) { + $self->{'experiment_date'} = &get_date("date", $self->get_config("chip_file")), + } + + return $self->{'experiment_date'}; +} + + + +=head2 group + + Example : my $exp_group = $imp->group(); + Description: Getter/Setter for the group name + Arg [1] : optional - group name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub group{ + my ($self) = shift; + $self->{'group'} = shift if(@_); + return $self->{'group'}; +} + + +=head2 description + + Example : $imp->description("Human chrX H3 Lys 9 methlyation"); + Description: Getter/Setter for the experiment element + Arg [1] : optional - experiment description + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description{ + my $self = shift; + + if (@_) { + $self->{'description'} = shift; + } + + return $self->{'description'}; +} + +##=head2 feature_set_description +## +## Example : $imp->description("ExperimentalSet description"); +## Description: Getter/Setter for the FeatureSet description for an +## InputSet import e.g. preprocessed GFF/Bed data +## Arg [1] : optional - string feature set description +## Returntype : string +## Exceptions : none +## Caller : general +## Status : At risk +## +##=cut +## +##sub feature_set_description{ +## my $self = shift; +## +## $self->{'feature_set_description'} = shift if @_; +## +## return $self->{'feature_set_description'}; +##} + +=head2 format + + Example : $imp->format("Tiled"); + Description: Getter/Setter for the array format + Arg [1] : optional - array format + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub format{ + my ($self) = shift; + $self->{'format'} = shift if(@_); + return $self->{'format'}; +} + +=head2 experiment + + Example : my $exp = $imp->experiment(); + Description: Getter/Setter for the Experiment element + Arg [1] : optional - Bio::EnsEMBL::Funcgen::Experiment + Returntype : Bio::EnsEMBL::Funcgen::Experiment + Exceptions : throws if arg is not an Experiment + Caller : general + Status : Stable + +=cut + +sub experiment{ + my ($self) = shift; + + if (@_) { + + if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Experiment')) { + throw("Must pass a Bio::EnsEMBL::Funcgen::Experiment object"); + } + + $self->{'experiment'} = shift; + } + + return $self->{'experiment'}; +} + +##=head2 db +## +## Example : $imp->db($funcgen_db); +## Description: Getter/Setter for the db element +## Arg [1] : optional - Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor +## Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor +## Exceptions : throws if arg is not an DBAdaptor +## Caller : general +## Status : Stable +## +##=cut +## +##sub db{ +## my $self = shift; +## +## if (defined $_[0] && $_[0]->isa("Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor")) { +## $self->{'db'} = shift; +## } elsif (defined $_[0]) { +## throw("Need to pass a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor"); +## } +## +## return $self->{'db'}; +##} +## +##=head2 pass +## +## Example : $imp->pass("password"); +## Description: Getter/Setter for the db password +## Arg [1] : optional - db password +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +## +##sub pass{ +## my $self = shift; +## $self->{'pass'} = shift if(@_); +## return $self->{'pass'}; +##} +## +##=head2 pass +## +## Example : $imp->host("hoastname"); +## Description: Getter/Setter for the db hostname +## Arg [1] : optional - db hostname +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub host{ +## my $self = shift; +## $self->{'host'} = shift if(@_); +## return $self->{'host'}; +##} +## +##=head2 port +## +## Example : $imp->port(3306); +## Description: Getter/Setter for the db port number +## Arg [1] : optional - db port number +## Returntype : int +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub port{ +## my $self = shift; +## $self->{'port'} = shift if(@_); +## return $self->{'port'}; +##} +## +##=head2 user +## +## Example : $imp->user("user_name"); +## Description: Getter/Setter for the db user name +## Arg [1] : optional - db user name +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub user{ +## my $self = shift; +## $self->{'user'} = shift if(@_); +## return $self->{'user'}; +##} + +##=head2 dump_fasta +## +## Example : if($self->dump_fasta()){...do fasta dump...} +## Description: Getter/Setter for the dump_fasta flag +## Arg [1] : optional - 0 or 1 +## Returntype : boolean +## Exceptions : none +## Caller : self +## Status : Stable +## +##=cut +## +## +##sub dump_fasta{ +## my $self = shift; +## $self->{'_dump_fasta'} = shift if @_; +## return $self->{'_dump_fasta'}; +##} +## + + +##=head2 species +## +## Example : $imp->species("homo_sapiens"); +## Description: Getter/Setter for species +## Arg [1] : optional - species name(alias?) +## Returntype : string +## Exceptions : none ? throw if no alias found? +## Caller : general +## Status : Medium - may move reg alias look up to this method +## +##=cut +## +##sub species{ +## my $self = shift; +## +## #should we do reg alias look up here? +## #Will we ever want to redefine species? +## #Change to just getter? +## +## $self->{'species'} = shift if(@_); +## +## return $self->{'species'}; +##} + +=head2 get_dir + + Example : $imp->get_dir("import"); + Description: Retrieves full path for given directory + Arg [1] : mandatory - dir name + Returntype : string + Exceptions : none + Caller : general + Status : at risk - move to Helper? + +=cut + +sub get_dir{ + my ($self, $dirname) = @_; + return $self->get_data("${dirname}_dir"); +} + +=head2 norm_method + + Example : my $norm_method = $imp->norm_method() + Description: Getter/Setter for normalisation method + Arg [1] : mandatory - method name + Returntype : string + Exceptions : none ? throw if no analysis with logic name + Caller : general + Status : At risk - restrict to logic_name and validate against DB, allow multiple + +=cut + +#Move to Nimblegen? +#Do we ever want to normalise other data? + +sub norm_method{ + my $self = shift; + + if (@_) { + $self->{'norm_method'} = shift; + } elsif (! defined $self->{'norm_method'}) { + $self->{'norm_method'}= $self->get_config('norm_method'); + } + + return $self->{'norm_method'}; +} + + +=head2 get_config + + Arg [1] : mandatory - name of the data element to retrieve from the config hash + Example : %dye_freqs = %{$imp->get_config('dye_freqs')}; + Description: returns data from the definitions hash + Returntype : various + Exceptions : none + Caller : Importer + Status : at risk - replace with direct calls in the inherited Defs class? + +=cut + + +sub get_config{ + my ($self, $data_name) = @_; + return $self->get_data('config', $data_name); #will this cause undefs? +} + + + + + +=head2 register_experiment + + Example : $imp->register_experiment() + Description: General control method, performs all data import and normalisations + Arg [1] : optional - dnadb DBAdaptor + Returntype : none + Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor + Caller : general + Status : Medium + +=cut + + +#Need to split this method? +#Pre farm process +#Define and store all sets, +#pre process input file once if required +#How are we going to be able to tell wether this has been done successfully? +#runner will catch error, therefore safe to assume that file is complete if farm job +#unless we are not running with farm + +#farm specific processes +#actually parse and store data + +#Can we separate these for different import types? + +sub register_experiment{ + my ($self) = shift; + + #Need to check for dnadb passed with adaptor to contructor + if (@_) { + + if ( ! $_[0]->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")) { + throw("You need to pass a valid dnadb adaptor to register the experiment"); + } + $self->db->dnadb($_[0]); + } elsif ( ! $self->db) { + throw("You need to pass/set a DBAdaptor with a DNADB attached of the relevant data version"); + } + + #This could still be the default core db for the current version + #warn here if not passed DB? + #These should be vendor independent, only the read methods should need specific order? + + $self->init_experiment_import(); + #can we just have init here instead? + + + #This could do with a rewrite to move some things to the parsers + #$self::SUPER->register_experiment + + $self->write_validate_experiment_config if $self->can('write_validate_experiment_config'); + + #This is too array specific! + #Can we have an array of import_methods in the config? + #foreach my $method(@{$self->get_config{'import_methods'}}){ + #$self->method; + #} + #We're already doing this in read_data for each of these data_types + + #Need to be able to run this separately, so we can normalise previously imported sets with different methods + #should be able t do this without raw data files e.g. retrieve info from DB + #Is this implemented? + + $self->read_data("probe"); + $self->read_data("results"); + + + my $norm_method = $self->norm_method(); + + if (defined $norm_method) { + warn "norm method is $norm_method"; + + $self->R_norm($norm_method); + #change this to $self->$norm_method + #so we can have non-R_norm normalisation + } + + + return; +} + +#Move array specific ones to Nimblegen.pm? +#Also used by ArrayDesign and Sanger.pm +#So need to create Array.pm baseclass, which is a Helper. + +=head2 store_set_probes_features + + Arg [1] : mandatory - array chip id + Arg [2] : optional - Bio::EnsEMBL::Funcgen::ProbeSet + Arg [3] : mandatory - hashref of keys probe id, values are + hash of probe/features with values + Bio::EnsEMBL::Funcgen::Probe/Features for a given + probe set if defined. + Example : $self->store_set_probes_features($ac->dbID(), $ops, \%pfs); + Description: Stores probe set, probes and probe features + Returntype : none + Exceptions : none + Caller : self + Status : Medium + +=cut + +sub store_set_probes_features{ + my ($self, $ac_id, $pf_hash, $ops) = @_; + + ### Deal with ProbeSets + if ($ops) { + $ops->size(scalar(keys %$pf_hash)); + ($ops) = $self->db->get_ProbeSetAdaptor->store($ops); + } + + #If we're going to validate fully, we need to check for probes in this probeset on this array chip + #Update size if we have any new probes + #Overkill? Only do on recover? Do not read if array chip is IMPORTED + #This does not make any attempt to validate probes/set vs previously stored data + + for my $probe_id (keys %$pf_hash) { + + #set probeset in probe and store + #the process corresponding feature + my $probe = $pf_hash->{$probe_id}->{'probe'}; + $probe->probeset($ops) if $ops; + ($probe) = @{$self->db->get_ProbeAdaptor->store($probe)}; + + #Can't use get_all_Arrays here as we can't guarantee this will only ever be the array we've generated + #Might dynamically load array if non-present + #This is allowing multiple dbIDs per probe??? Is this wrong? + #$self->cache_probe_info($probe->get_probename(), $probe->dbID());###########Remove as we're now importing all then resolving + + + foreach my $feature (@{$pf_hash->{$probe_id}->{'features'}}) { + $feature->probe($probe); + ($feature) = @{$self->db->get_ProbeFeatureAdaptor->store($feature)}; + } + } + + undef $ops; #Will this persist in the caller? + undef %{$pf_hash}; + + return; +} + + +=head2 cache_slice + + Arg [0] : string - region_name e.g. X + Arg [1] : optional - coordinate system name e.g. supercontig, defaults to chromosome + Example : my $slice = $self->cache_slice(12); + Description: Caches or retrieves from cache a given slice + Returntype : Bio::EnsEMBL::Slice + Exceptions : throws f no region name specified + Caller : self + Status : At risk + +=cut + +sub cache_slice{ + my ($self, $region_name, $cs_name, $total_count) = @_; + + throw("Need to define a region_name to cache a slice from") if ! $region_name; + $self->{'slice_cache'} ||= {}; + $region_name =~ s/chr//; + $region_name = "MT" if $region_name eq "M"; + + if (! exists ${$self->{'seen_slice_cache'}}{$region_name}) { + my $slice = $self->slice_adaptor->fetch_by_region($cs_name, $region_name); + + #Set seen cache so we don't try this again + $self->{seen_slice_cache}{$region_name} = $slice; + + if (! $slice) { + warn("-- Could not generate a slice for ${cs_name}:$region_name\n"); + } else { + + my $sr_name = $slice->seq_region_name; #In case we passed a slice name + + if (@{$self->{seq_region_names}}) { + return if ! grep(/^${sr_name}$/, @{$self->{seq_region_names}}); #not on required slice + } + } + + $self->{'slice_cache'}->{$region_name} = $slice; + } + + if ($total_count && exists ${$self->{'seen_slice_cache'}}{$region_name}) { + #This is an InputSet specific method + $self->count('total_features') if $self->can('count'); + } + + #Only return if exists to avoid creating hash key + return (exists $self->{'slice_cache'}->{$region_name}) ? $self->{'slice_cache'}->{$region_name} : undef; +} + +=head2 slice_cache + + Example : my @seen_slices = values(%{$self->slice_cache});; + Description: Returns the slice cache i.e. all the Slices seen in the data filtered + by the defined slices. This method can be used to run only the appropriate + slice jobs after a prepare stage. + Returntype : Hashref of seq_region name Bio::EnsEMBL::Slice pairs + Exceptions : None + Caller : self + Status : At risk + +=cut + + +sub slice_cache{ + my $self = shift; + + return $self->{'slice_cache'}; +} + + + + +=head2 cache_probe_info + + Arg [0] : mandatory - probe name + Arg [1] : mandatory - probe dbID + Arg [2] : optioanl int - x coord of probe on array + Arg [3] : optional int - y coord of probe on array + Example : $self->cache_probe_info("Probe1", $probe->dbID()); + Or for result files which do not have X & Y, we need to cache + X & Y from the design files: $self->cache_probe_info('Probe2', $probe->dbID(), $x, $y); + Description: Setter for probe cache values + Returntype : none + Exceptions : throws is cache conflict encountered + Caller : self + Status : At risk - merge with following? + +=cut + +sub cache_probe_info{ + my ($self, $pname, $pid, $x, $y) = @_; + + throw('Deprecated, too memory expensive, now resolving DB duplicates and using Tied File cache'); + throw("Must provide a probe name and id") if (! defined $pname || ! defined $pid); + + + #do we need to loop through the file here? + #if we make sure we're testing for a previous dbID before storing probes then we don't need to do this + #we can catch the error when we get the probe id as we can check for >1 id for the same probe name + #if (defined $self->{'_probe_cache'}->{$pname} && ($self->{'_probe_cache'}->{$pname}->[0] != $pid)) { + # throw("Found two differing dbIDs for $pname, need to sort out redundant probe entries"); + #} + + $self->{'_probe_cache'}->{$pname} = (defined $x && defined $y) ? [$pid, $x, $y] : [$pid]; + + return; +} + + +=head2 get_probe_id_by_name_Array + + Arg [1] : mandatory - probe name + Example : $pid = $self->get_probe_id_by_name($pname); + Description: Getter for probe cache values + Returntype : int + Exceptions : none + Caller : self + Status : At risk - merge with previous, move to importer? + +=cut + +sub get_probe_id_by_name_Array{ + my ($self, $name, $array) = @_; + + #this is only ever called for fully imported ArrayChips, as will be deleted if recovering + $self->resolve_probe_data() if(! exists $self->{'_probe_cache'}{$array->name()}); + + #we want to cycle through the given cache starting from the last position or 0. + #we don't want to have to test for the size of the cache each time as this will be quite expensive + #so we should store sizes separately along with current position + + + my ($pid, $line); + + #check current line + if ($line = $self->{'_probe_cache'}{$array->name()}{'current_line'}) { + if ($line =~ /^\Q${name}\E\t/) { + $pid = (split/\t/o, $line)[1]; + } + } + + + if (! $pid) { + while ($line = $self->{'_probe_cache'}{$array->name()}{'handle'}->getline()) { + + if ($line =~ /^\Q${name}\E\t/) { + $pid = (split/\t/o, $line)[1]; + $self->{'_probe_cache'}{$array->name()}{'current_line'} = $line; + last; + } + } + } + + #do not remove this + if (! $pid) { + throw("Did not find probe name ($name) in cache, cache may need rebuilding, results may need sorting, or do you have an anomolaous probe?") + } else { + chomp $pid; + } + + return $pid; +} + +=head2 get_probe_cache_by_Array + + Arg[1] : Bio::EnsEMBL::Funcgen::Array + Arg[2] : boolean - from db flag, only to be used by Importer->resolve_probe_data ! + Example : $self->get_probe_cache_by_Array(); + Description: Gets the probe info cache which is an array tied to a file + Returntype : Boolean - True if cache has been generated and set successfully + Exceptions : none + Caller : general + Status : At risk + +=cut + +#from db flag should only be used by importer +#this is because there is no guarantee that it will be resolved unless +#called by resolve_probe_data +#which then renames the file and resets the handle +#can we clean this up and protect/hide this functionality? +#can we check the cache file name in the get methods and throw if it contains unresolved? +#make this private? + +sub get_probe_cache_by_Array{ + my ($self, $array, $from_db) = @_; + + my $msg = "Getting probe cache for ".$array->name(); + $msg .= " from DB" if $from_db; + $self->log($msg); #, 1); + + if (! ($array && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())) { + throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::Array object'); + } + + my $set = 0; + my $cache_file = $self->get_dir('caches').'/'.$array->name().'.probe_cache'; + + ### Generate and resolve fresh cache from DB + if ($from_db) { + + $cache_file .= '.unresolved'; #This will be renamed by the caller if it is resolved + + if (exists $self->{'_probe_cache'}{$array->name()}) { + $self->log('Rebuilding probe_cache from DB for '.$array->name(), 1); + + + #untie @{$self->{'_probe_cache'}{$array->name()}{'entries'}}; + #close($self->{'_probe_cache'}{$array->name()}{'handle'});#do we need to do this? + delete $self->{'_probe_cache'}{$array->name()}; #implicitly closes + $self->log('Deleted old cache', 1); + } else { + $self->log('Building probe_cache from DB for '.$array->name(), 1); + } + + #Move this to ProbeAdaptor? + #This is where we'd set the unique key for a vendor and resolves duplicates based on the key + my $cmd = 'SELECT name, probe_id from probe WHERE array_chip_id IN ('.join(',', @{$array->get_array_chip_ids()}).') ORDER by name, probe_id'; + $cmd = 'mysql '.$self->db->connect_string()." -e \"$cmd\" >".$cache_file; + run_system_cmd($cmd); + + } + + ### Set cache + if (-f $cache_file) { + $self->log('MD5 check here?',1); + $self->{'_probe_cache'}{$array->name()}{'current_line'} = undef; + $self->{'_probe_cache'}{$array->name()}{'handle'} = open_file($cache_file); + + #can we do a select count instead? and do this instead of the MD5? + #$cmd = "wc -l $cache_file"; + #my $size = `$cmd`; + + $set = 1; + } else { + warn 'Failed to get probe cache for array:'.$array->name(); + } + + return $set; +} + + +#should reorganise these emthods to split reading the array data, and the actual data +#currently: +#meta reads array and chip data +#probe reads probe_set, probes, which should definitely be in array, probe_feature? and results +#native data format may not map to these methods directly, so may need to call previous method if required data not defined + + +=head2 read_data + + Example : $self->read_data("probe") + Description: Calls each method in data_type array from config hash + Arg [1] : mandatory - data type + Returntype : none + Exceptions : none + Caller : self + Status : At risk + +=cut + +sub read_data{ + my($self, $data_type) = @_; + + map {my $method = "read_${_}_data"; $self->$method()} @{$self->get_config("${data_type}_data")}; + return; +} + + +=head2 design_type + + Example : $self->design_type("binding_site_identification") + Description: Getter/Setter for experimental design type + Arg [1] : optional - design type + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub design_type{ + my $self = shift; + return $self->{'design_type'}; +} + + +=head2 get_chr_seq_region_id + + Example : $seq_region_id = $self->get_seq_region_id('X'); + Description: Calls each method in data_type array from config hash + Arg [1] : mandatory - chromosome name + Arg [2] : optional - start value + Arg [3] : optional - end value + Returntype : int + Exceptions : none + Caller : self + Status : At risk + +=cut + +#convinience wrapper method +#could we use the seq region cache instead? +#this seems like a lot of overhead for getting the id +sub get_chr_seq_region_id{ + my ($self, $chr, $start, $end) = @_; + #what about strand info? + + #do we need the start and stop? + + #use start and stop to prevent problems with scaffodl assemblies, i.e. >1 seq_region_id + #my $slice = $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end); + #we could pass the slice back to the slice adaptor for this, to avoid dbid problems betwen DBs + + ###would need to implement other cs's here + + return $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end)->get_seq_region_id(); +} + + + +=head2 vsn_norm + + Example : $self->vsn_norm(); + Description: Convinience/Wrapper method for vsn R normalisation + Returntype : none + Exceptions : none + Caller : general + Status : At risk + +=cut + +#Have Norm class or contain methods in importer? +#Need to have analysis set up script for all standard analyses. + +sub vsn_norm{ + my $self = shift; + return $self->R_norm("VSN_GLOG"); +} + + +=head2 farm + + Arg [1] : Boolean + Example : $importer->farm(1); + Description: Flag to turn farm submission on + Returntype : Boolean + Exceptions : Throws is argument not a boolean + Caller : general + Status : At risk + +=cut + + +sub farm{ + my ($self, $farm) = @_; + + $self->{'farm'} ||= undef; #define farm + + if (defined $farm) { + throw("Argument to farm must be a boolean 1 or 0") if(! ($farm == 1 || $farm == 0)); + $self->{'farm'} = $farm; + } + + return $self->{'farm'}; + +} + +=head2 batch_job + + Arg [1] : Boolean + Example : $importer->batch_job(1); + Description: Flag to turn on batch_job status + Returntype : Boolean + Exceptions : Throws is argument not a boolean + Caller : general + Status : At risk + +=cut + + +sub batch_job{ + my ($self, $batch_job) = @_; + + #$self->{'batch_job'} ||= undef; + + if (defined $batch_job) { + throw("Argument to batch_job must be a boolean 1 or 0") if(! ($batch_job == 1 || $batch_job == 0)); + $self->{'batch_job'} = $batch_job; + } + + return $self->{'batch_job'}; + +} + +=head2 prepared + + Arg [1] : Boolean + Example : $importer->prepared(1); + Description: Flag to turn on prepared file status + This signifies that the files have been previously imported + using prepare mode and may not match the InputSubset names + Returntype : Boolean + Exceptions : None + Caller : general + Status : At risk + +=cut + + +sub prepared{ + my ($self, $prepared) = @_; + $self->{'prepared'} = $prepared if (defined $prepared); + return $self->{'prepared'}; +} + + +=head2 R_norm + + Example : $self->R_norm(@logic_names); + Description: Performs R normalisations for given logic names + Returntype : none + Exceptions : Throws if R exits with error code or if data not not valid for analysis + Caller : general + Status : At risk + +=cut + +sub R_norm{ + my ($self, @logic_names) = @_; + #This currently normalises a single two colour chip at a time + #rather than normalising across a set of chip + #also does in sets of analyses + #good for keeping data separate, but not efficient in terms of querying + #convert to use one script which only queries for each echip once, then does each anal + + + my $aa = $self->db->get_AnalysisAdaptor(); + my $rset_adaptor = $self->db->get_ResultSetAdaptor(); + my $ra_id = $aa->fetch_by_logic_name("RawValue")->dbID(); + my %r_config = ( + "VSN_GLOG" => {( libs => ['vsn'], + #norm_method => 'vsn', + )}, + + "T.Biweight" => {( + libs => ['affy'], + #norm_method => 'tukey.biweight', + )}, + ); + + + foreach my $logic_name (@logic_names) { + #throw("Not yet implemented TukeyBiweight") if $logic_name eq "Tukey_Biweight"; + + #this has already been chcecked and set as the norm_analysis + #need to resolve this multi method approach + my $norm_anal = $aa->fetch_by_logic_name($logic_name); + + + + #This only creates an RSet for the IMPORT set + #So if we re-run with a different analysis + #tab2mage will have already been validated + #So RSet generation will be skipped + #We need to recreate the each non-import RSet for this norm analysis + + #This also means the RSets are being created before the data has been imported + #This avoids having to parse tab2mage each time but means we have an uncertain status of these Rsets + + my $rset = $self->get_import_ResultSet($norm_anal, 'experimental_chip'); + + my @chips = (); + + if (! $rset) { + $self->log("All ExperimentalChips already have status:\t${logic_name}"); + } else { #Got data to normalise and import + my @dbids; + my $R_file = $self->get_dir("norm")."/${logic_name}.R"; + my $job_name = $self->experiment->name()."_${logic_name}"; + my $resultfile = $self->get_dir("norm")."/result.${logic_name}.txt"; + my $outfile = $self->get_dir("norm")."/${logic_name}.out"; + + #How do we get farm job output i.e. run time memusage + #from interactive job? + #This assumes R_PATH + my $errfile = $self->get_dir("norm")."/${logic_name}.err"; + + #Let's build this better so we capture the farm output aswell as the job output. + my $cmdline = "$ENV{'R_PATH'} --no-save < $R_file"; # >$errfile 2>&1"; + #-K option waits for job to complete + my $bsub = "bsub -K -J $job_name ".$ENV{'R_BSUB_OPTIONS'}. + " -e $errfile -o $outfile $ENV{'R_FARM_PATH'} CMD BATCH $R_file"; + + #Can we separate the out and err for commandline? + my $r_cmd = (! $self->farm()) ? "$cmdline >$outfile 2>&1" : $bsub; + + $self->backup_file($resultfile); #Need to do this as we're appending in the loop + + #setup qurey + #warn "Need to add host and port here"; + #Set up DB, defaults and libs for each logic name + my $query = "options(scipen=20);library(RMySQL);library(Ringo);"; + #scipen is to prevent probe_ids being converted to exponents + #Ringo is for default QC + + #foreach my $ln(@logic_names){ + + foreach my $lib (@{$r_config{$logic_name}{'libs'}}) { + $query .= "library($lib);"; + } + #} + + $query .= "con<-dbConnect(dbDriver(\"MySQL\"), host=\"".$self->db->dbc->host()."\", port=".$self->db->dbc->port().", dbname=\"".$self->db->dbc->dbname()."\", user=\"".$self->db->dbc->username()."\""; + + #should use read only pass here as we are printing this to file + $query .= ", pass=\"".$self->db->dbc->password."\")\n"; + + + #Build queries for each chip + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + + #should implement logic name here? + #can't as we need seperate ResultSet for each + + + if ($echip->has_status($logic_name)) { + $self->log("ExperimentalChip ".$echip->unique_id()." already has status:\t$logic_name"); + } else { + + #warn "Need to roll back here if recovery, as norm import may fail halfway through"; + + push @chips, $echip; + my $cc_id = $rset->get_chip_channel_id($echip->dbID()); + + #if ($self->recovery()){ + # $self->log('Rolling back results for ExperimentalChip('.$echip->dbID().") $logic_name"); + # $self->db->rollback_results($cc_id) if $self->recovery(); + # } + + $self->log("Building $logic_name R cmd for ".$echip->unique_id()); + @dbids = (); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq "EXPERIMENTAL") { + push @dbids, $chan->dbID(); + } else { + unshift @dbids, $chan->dbID(); + } + } + + throw("vsn does not accomodate more than 2 channels") if (scalar(@dbids > 2) && $logic_name eq "VSN_GLOG"); + + #should do some of this with maps? + #HARDCODED metric ID for raw data as one + + #Need to get total and experimental here and set db_id accordingly + #can probably do this directly into one df + + $query .= "c1<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as CONTROL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[0]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; + + $query .= "c2<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as EXPERIMENTAL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[1]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; + + #Can we define some of the basic structures here and reuse in the QC and each norm method? + + + #Is this going to eat up memory? + #can we strip out and separate the data from c1 and c2 into RGList and + #individual vector for probe_ids, then rm c1 and c2 to free up memory + + #create RGList object + $query .= "R<-as.matrix(c1['CONTROL_score'])\nG<-as.matrix(c2['EXPERIMENTAL_score'])\n"; + $query .= "genes<-cbind(c1['PROBE_ID'], c1['X'], c1['Y'])\n"; + $query .= "testRG<-new('RGList', list(R=R, G=G, genes=genes))\n"; + + + #QC plots here before doing norm + + #open pdf device + $query .= "pdf('".$self->get_dir('norm').'/'.$echip->unique_id."_QC.pdf', paper='a4', height = 15, width = 9)\n"; + #set format + $query .= "par(mfrow = c(2,2), font.lab = 2)\n"; + + #Channel densisties + #These need limma or Ringo + $query .= "plotDensities(testRG)\n"; + + #MvA Plot + + $query .= 'meanLogA<-((log(testRG$R, base=exp(2)) + log(testRG$G, base=exp(2)))/2)'."\n"; + $query .= 'logIntRatioM<-(log(testRG$R, base=exp(2)) - log(testRG$G, base=exp(2)))'."\n"; + $query .= "yMin<-min(logIntRatioM)\n"; + $query .= "yMax<-max(logIntRatioM)\n"; + + + + #Need to validate yMax here + #If is is Inf then we need to sort the vector and track back until we find the high real number + #count number of Infs and note on MvA plot + $query .= "infCount<-0\n"; + $query .= "if( yMax == Inf){; sortedM<-sort(logIntRatioM); lengthM<-length(logIntRatioM); indexM<-lengthM\n" + ."while (yMax == Inf){; indexM<-(indexM-1); yMax<-sortedM[indexM];}; infCount<-(lengthM-indexM);}\n"; + + # + $query .= "if(infCount == 0){\n"; + $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'")'."\n"; + $query .= "} else {\n"; + $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'", sub=paste(infCount, " Inf values not plotted"));'."}\n"; + + + #$query .= 'plot(log(testRG$R*testRG$G, base=exp(2))/2, log(testRG$R/testRG$G, base=exp(2)),xlab="A",ylab="M",pch=".",ylim=c(-3,3), main="'.$echip->unique_id."\")\n"; + + #Plate plots + $query .= 'image(testRG, 1, channel = "green", mycols = c("black", "green4", "springgreen"))'."\n"; + $query .= 'image(testRG, 1, channel = "red", mycols = c("black", "green4", "springgreen"))'."\n"; + + $query .= "dev.off()\n"; + #Finished QC pdf printing + + + + #The simple preprocess step of Ringo is actually vsn, so we can nest these in line + + + ### Build Analyses cmds ### + + if ($logic_name eq 'T.Biweight') { + + #log2 ratios + $query .= 'lr_df<-cbind((log(c2["EXPERIMENTAL_score"], base=exp(2)) - log(c1["CONTROL_score"], base=exp(2))))'."\n"; + + #Adjust using tukey.biweight weighted average + #inherits first col name + $query .= 'norm_df<-(lr_df["EXPERIMENTAL_score"]-tukey.biweight(as.matrix(lr_df)))'."\n"; + $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", norm_df[,1]), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; + + } elsif ($logic_name eq 'VSN_GLOG') { + #could do this directly + $query .= "raw_df<-cbind(c1[\"CONTROL_score\"], c2[\"EXPERIMENTAL_score\"])\n"; + #variance stabilise + $query .= "norm_df<-vsn(raw_df)\n"; + + + #do some more calcs here and print report? + #fold change exponentiate? See VSN docs + #should do someplot's of raw and glog and save here? + #set log func and params + #$query .= "par(mfrow = c(1, 2)); log.na = function(x) log(ifelse(x > 0, x, NA));"; + #plot + #$query .= "plot(exprs(glog_df), main = \"vsn\", pch = \".\");". + # "plot(log.na(exprs(raw_df)), main = \"raw\", pch = \".\");"; + #FAILS ON RAW PLOT!! + #par(mfrow = c(1, 2)) + #> meanSdPlot(nkid, ranks = TRUE) + #> meanSdPlot(nkid, ranks = FALSE) + + + #Now create table structure with glog values(diffs) + #3 sig dec places on scores(doesn't work?!) + $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", (exprs(norm_df[,2]) - exprs(norm_df[,1]))), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; + + } + #load back into DB + #c3results<-cbind(rep("", length(c3["probe_id"])), c3["probe_id"], c3["c3_score"], rep(1, length(c3["probe_id"])), rep(1, length(c3["probe_id"]))) + #may want to use safe.write here + #dbWriteTable(con, "result", c3results, append=TRUE) + #dbWriteTable returns true but does not load any data into table!!! + + $query .= "write.table(formatted_df, file=\"${resultfile}\", sep=\"\\t\", col.names=FALSE, row.names=FALSE, quote=FALSE, append=TRUE)\n"; + + #tidy up here?? + } + } + + $query .= "q();"; + + open(RFILE, ">$R_file") || throw("Cannot open $R_file for writing"); + print RFILE $query; + close(RFILE); + + my $submit_text = "Submitting $logic_name job"; + $submit_text .= ' to farm' if $self->farm; + $self->log("${submit_text}:\t".localtime()); + run_system_cmd($r_cmd); + $self->log("Finished $logic_name job:\t".localtime()); + $self->log('See '.$self->get_dir('norm').' for ExperimentalChip QC files'); + + #Now load file and update status + #Import directly here to avoid having to reparse all results if we crash!!!! + $self->log("Importing:\t$resultfile"); + $self->db->load_table_data("result", $resultfile); + $self->log("Finishing importing:\t$resultfile"); + + + foreach my $echip (@chips) { + $echip->adaptor->store_status($logic_name, $echip); + } + + #Recreate all non-import RSets for analysis if not already present + # + + my $rset_a = $self->db->get_ResultSetAdaptor(); + my %seen_rsets; + + foreach my $anal_rset (@{$rset_a->fetch_all_by_Experiment($self->experiment)}) { + next if($anal_rset->name =~ /_IMPORT$/o); + next if(exists $seen_rsets{$anal_rset->name}); + next if $anal_rset->analysis->logic_name eq $norm_anal->logic_name; + $seen_rsets{$rset->name} = 1; + $anal_rset->analysis($norm_anal); + $anal_rset->{'dbID'} = undef; + $anal_rset->{'adaptor'} = undef; + + #add the chip_channel_ids from the new anal IMPORT set + foreach my $table_id (@{$anal_rset->table_ids}) { + $anal_rset->{'table_id_hash'}{$table_id} = $rset->get_chip_channel_id($table_id); + } + + $self->log('Adding new ResultSet '.$anal_rset->name.' with analysis '.$norm_anal->logic_name); + $rset_a->store($anal_rset); + } + + + + } + } + + return; +} + +#can we sub this? args: table_name, logic_name +#also use result_set_name +#would also clean all data for result set if recovery +#return would be result_set +#Can we extend this to incorporate InputSet parser define_sets? + +sub get_import_ResultSet{ + my ($self, $anal, $table_name) = @_; + + if (!($anal && $anal->isa("Bio::EnsEMBL::Analysis") && $anal->dbID())) { + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->log("Getting import $table_name ResultSet for analysis:\t".$anal->logic_name()); + + my ($rset, @new_chip_channels); + my $result_adaptor = $self->db->get_ResultSetAdaptor(); + my $logic_name = $anal->logic_name; + my $status = ($logic_name eq "RawValue") ? "IMPORTED" : $logic_name; + + if (($logic_name) eq 'RawValue' && ($table_name eq 'experimental_chip')) { + throw("Cannot have an ExperimentalChip ResultSet with a RawValue analysis, either specify 'channel' or another analysis"); + } + + #Build IMPORT Set for $table_name + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + #clean chip import and generate rset + + if ($table_name eq 'experimental_chip') { + + if ($echip->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status + $self->log("ExperimentalChip(".$echip->unique_id().") already has status:\t".$status); + } else { + $self->log("Found ExperimentalChip(".$echip->unique_id().") without status $status"); + + push @new_chip_channels, $echip; + } + + } else { #channel + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status + $self->log("Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") already has status:\t".$status); + } else { + $self->log("Found Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") without status $status"); + push @new_chip_channels, $chan; + } + } + } + + if (( ! $rset) && @new_chip_channels) { + my(@tmp) = @{$result_adaptor->fetch_all_by_name_Analysis($self->name()."_IMPORT", $anal)}; + + if (scalar(@tmp) > 1) { + throw('Found more than one IMPORT ResultSet for '.$self->name().'_IMPORT with analysis '.$logic_name); + } + + $rset = shift @tmp; + + + #do we need to throw here if not recovery? + #what if we want the import result set elsewhere during the first import? + + #if ($self->recovery()) { + + #fetch by anal and experiment_id + #Need to change this to result_set.name! + # warn("add chip set handling here"); + + #my @tmp = @{$result_adaptor->fetch_all_by_Experiment_Analysis($self->experiment(), $anal)}; + #throw("Found more than one ResultSet for Experiment:\t".$self->experiment->name()."\tAnalysis:\t".$anal->logic_name().')' if (scalar(@tmp) >1); + #$rset = $tmp[0]; + + #warn "fetching rset with ".$self->name()."_IMPORT ". $anal->logic_name; + + #$rset = $result_adaptor->fetch_by_name_Analysis($self->name()."_IMPORT", $anal); + warn("Warning: Could not find recovery ResultSet for analysis ".$logic_name) if ! $rset; + #} + + if (! $rset) { + $self->log("Generating new ResultSet for analysis ".$logic_name); + + $rset = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -analysis => $anal, + -table_name => $table_name, + -name => $self->name()."_IMPORT", + -feature_type => $self->feature_type(), + -cell_type => $self->cell_type(), + ); + + #These types should be set to NULL during the MAGE-XML validation if we have more than one type in an experiment + + ($rset) = @{$result_adaptor->store($rset)}; + } + } + } + + #do we need this here as we're rolling back in the read methods? + #we only want to roll back those chips/channels which have not been registered + + if ($self->recovery()) { + + my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); + + foreach my $cc (@new_chip_channels) { + + #only roll back if already part of import set + #Not previously registered if not + if ($rset->contains($cc) && $rset->get_chip_channel_id($cc->dbID())) { + + if ($table_name eq 'channel') { + my $chan_name = $ec_adaptor->fetch_by_dbID($cc->experimental_chip_id())->unique_id()."_". + $self->get_config('dye_freqs')->{$cc->dye()}; + $self->log("Rolling back results for $table_name:\t".$chan_name); + + } else { + $self->log("Rolling back results for $table_name:\t".$cc->unique_id); + } + + $self->rollback_results([$rset->get_chip_channel_id($cc->dbID())]); + } + } + } + + + #check whether it is present in the ResultSet and add if not + if ($rset) { + #ids will already be present if not rset i.e. already imported + + foreach my $cc (@new_chip_channels) { + $rset->add_table_id($cc->dbID()) if(! $rset->contains($cc)); + } + } + + + + if ($rset) { + $result_adaptor->store_chip_channels($rset); + } else { + $self->log("All ExperimentalChips have status:\t$status"); + } + + #this only returns a result set if there is new data to import + return $rset; +} + + + +=head2 resolve_probe_data + + Example : $self->resolve_probe_data(); + Description: Resolves DB probe duplicates and builds local probe cache + Returntype : none + Exceptions : ???? + Caller : general + Status : At risk + +=cut + +sub resolve_probe_data{ + my $self = shift; + + $self->log("Resolving probe data", 1); + + warn "Probe cache resolution needs to accomodate probesets too!"; + + foreach my $array (@{$self->arrays()}) { + my $resolve = 0; + + if ($self->get_probe_cache_by_Array($array)) { #cache already generated + + #check if we have any new unresolved array chips to add to the cache + foreach my $achip (@{$array->get_ArrayChips()}) { + + if ($achip->has_status('RESOLVED')) { + $self->log("ArrayChip has RESOLVED status:\t".$achip->design_id()); #, 1); + next; + } else { + $self->log("Found un-RESOLVED ArrayChip:\t".$achip->design_id()); + $resolve = 1; + last; + } + } + } else { #no cache file + $resolve = 1; + $self->log('No probe cache found for array '.$array->name()); + } + + if ($resolve) { + $self->log('Resolving array duplicates('.$array->name().') and rebuilding probe cache.', 1); + $self->get_probe_cache_by_Array($array, 1); #get from DB + + #we need ot make sure we mark cache as unresolved, so we don't use it by mistake. + + + + my ($line, $name, $pid, @pids); + #my $index = 0; + my $tmp_name = ''; + my $tmp_id = ''; + + #miss the header + + while ($line = $self->{'_probe_cache'}{$array->name}{'handle'}->getline()) { + ($name, $pid) = split/\t/o, $line; + + if ($name eq $tmp_name) { + + if ($pid != $tmp_id) { + push @pids, $pid; + #should reset to pid here if we have x y data else undef + #ignore this and force result to have x y + } + + #can't do this naymore unless we figure out how to move the line pointer + #would still need to sed the file anyway, better to regen from DB? + #undef $self->{'_probe_cache'}{$array->name}{'entries'}->[$i];#delete true or to be resolved duplicate + } elsif ($name ne $tmp_name) { #new probe + $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); + $tmp_name = $name; + $tmp_id = $pid; + @pids = ($pid); + #$index = $i + 1; + } + } + + $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); + + #rename resovled cache and reset cache handle + my $cmd = 'mv '.$self->get_dir('caches').'/'.$array->name().'.probe_cache.unresolved '. + $self->get_dir('caches').'/'.$array->name().'.probe_cache'; + + run_system_cmd($cmd); + $self->get_probe_cache_by_Array($array); #This sets the caches + + + #warn "Only generate MD5 here, as this is guranteed to be correct"; + + foreach my $achip (@{$array->get_ArrayChips()}) { + + if (! $achip->has_status('RESOLVED')) { + $self->log("Updating ArrayChip to RESOLVED status:\t".$achip->design_id()); + $achip->adaptor->store_status('RESOLVED', $achip); + } + } + + $self->log('Finished building probe cache for '.$array->name(), 1); + } + } + + $self->log('Finished resolving probe data', 1); + + return; +} + + +sub tidy_duplicates{ + my ($self, $pids) = @_; + + my $pfa = $self->db->get_ProbeFeatureAdaptor(); + my ($feature, %features); + + foreach my $dup_id (@$pids) { + + foreach $feature(@{$pfa->fetch_all_by_Probe_id($dup_id)}) { + #can we safely assume end will be same too? + push @{$features{$feature->seq_region_name().':'.$feature->start()}}, $feature; + } + } + + my (@reassign_ids, @delete_ids); + + foreach my $seq_start_key (keys %features) { + my $reassign_features = 1; + + foreach $feature(@{$features{$seq_start_key}}) { + + if ($feature->probe_id() == $pids->[0]) { + $reassign_features = 0; + } else { + push @delete_ids, $feature->dbID(); + } + } + + #This assumes that we actually have at least one element to every seq_start_key array + if ($reassign_features) { + my $new_fid = pop @delete_ids; + push @reassign_ids, $new_fid; + } + } + + #resolve features first so we don't get any orphaned features if we crash. + $pfa->reassign_features_to_probe(\@reassign_ids, $pids->[0]) if @reassign_ids; + $pfa->delete_features(\@delete_ids) if @delete_ids; + + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,384 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::InputSet +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::InputSet - A module to represent InputSet object. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::InputSet; + +#Create an InputSet + +my $inp_set = Bio::EnsEMBL::Funcgen::InputSet->new + ( + -DBID => $dbID, + -ADAPTOR => $self, + -EXPERIMENT => $exp, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -FORMAT => 'READ_FORMAT', + -VENDOR => 'SOLEXA', + -NAME => 'ExpSet1', + -REPLICATE => 1, + ); + +# Add some InputSubsets + +$inp_set->add_new_subsets($subset_name, $ + + + + +=head1 DESCRIPTION + +An InputSet object provides a generic container for any non-array based feature import, +allowing tracking of file import via the status table and integration into Data and FeatureSets to +provide traceability to the source experiment from a given FeatureSet. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::InputSet; + +use Bio::EnsEMBL::Funcgen::InputSubset; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); +use Bio::EnsEMBL::Funcgen::Set; +use Bio::EnsEMBL::Analysis; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Set); + + +=head2 new + + + + Example : my $eset = Bio::EnsEMBL::Funcgen::InputSet->new( + -EXPERIMENT => $exp, + -FEATURE_TYPE => $ftype, + -CELL_TYPE => $ctype, + -FORMAT => 'READ_FORMAT', + -VENDOR => 'SOLEXA', + -NAME => 'ExpSet1', + -ANALYSIS => $anal, + -FEATURE_CLASS => 'annotated', + ); + + Do we want to define subsets likes this or are we more likely to add them one by one? + + Description: Constructor for InputSet objects. + Returntype : Bio::EnsEMBL::Funcgen::InputSet + Exceptions : Throws if no Experiment defined + Throws if CellType or FeatureType are not valid or stored + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + #Add set_type here to overwrite default ref parsing in Set::set_type + #This need to stay like this until we patch the DB + my $self = $class->SUPER::new(@_); + + my ($exp, $format, $vendor, $rep) + = rearrange(['EXPERIMENT', 'FORMAT', 'VENDOR', 'REPLICATE'], @_); + + if (! (ref $exp && $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())){ + throw('Must specify a valid stored Bio::EnsEMBL::Funcgen::Experiment'); + } + + + #These are set in Set, just validate here + throw ('Must provide a FeatureType') if(! defined $self->feature_type); + throw ('Must provide a CellType') if(! defined $self->cell_type); + + my $type = $self->feature_class; + + #Need to move these types to config + + if(! ($type && grep /^${type}$/, ('annotated', 'result', 'segmentation'))){ + throw("You must define a valid InputSet feature_class e.g. 'annotated' or 'result'"); + } + + if(($type eq 'result') && + ($format ne 'SEQUENCING')){ + throw('InputSet does not yet support a result type InputSet which does not have the \'SEQUENCING\' format'); + + } + + + #if(! defined $self->analysis){ + ##default analysis hack for v47 + ##Set directly to avoid dbID boolean check + #This is to support supporting_set cache in data_set? + $self->{'analysis'} = Bio::EnsEMBL::Analysis->new + (-logic_name => 'external', + -id => 0,#??someone needs to rewrite analysis + ); + + #Change to direct setting for speed + $self->{format} = $format; + $self->{vendor} = $vendor; + $self->{replicate} = $rep; + $self->{experiment} = $exp; + $self->{subsets} = {}; + + return $self; +} + + +=head2 add_new_subset + + Arg [1] : string - sub set name e.g. the file name (not path as we're restricted to 30 chars) + Arg [2] : Bio::EnsEMBL::Funcgen::InputSubset - optional + If not defined will create a sparse InputSubset based on the name + Example : $expset->add_new_subset($ss_name, $exp_subset); + Description: Adds input_subset + Returntype : none + Exceptions : Throws if set is already present + Throws if InputSubset is not valid or stored + Caller : General + Status : At Risk + +=cut + +#Do we still use the optional subset function? + +sub add_new_subset { + my ($self, $ss_name, $exp_sset) = @_; + + #Need to test $ss_name here + if(! ($ss_name && ref(\$ss_name) eq 'SCALAR')){#ref($exp_sset) would be 'REF' + throw('You must pass a InputSubset name'); + } + + if($self->get_subset_by_name($ss_name)){ + throw("Subset $ss_name is already present in this InputSet, maybe you need to alter the filename?"); + } + + if(defined $exp_sset){ + + if(!(ref($exp_sset) && $exp_sset->isa('Bio::EnsEMBL::Funcgen::InputSubset') && $exp_sset->dbID())){ + throw('InputSubsets must be valid and stored'); + } + } + else{ + + $exp_sset = Bio::EnsEMBL::Funcgen::InputSubset->new( + -name => $ss_name, + -input_set => $self, + ); + } + + $self->{subsets}{$ss_name} = $exp_sset; + + return $self->{subsets}{$ss_name}; +} + + +=head2 get_Experiment + + Example : my $exp = $exp_set->get_Experiment(); + Description: Getter for the Experiment of this DataSet. + Returntype : Bio::EnsEMBL::Fuuncgen::Experiment + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_Experiment{ return $_[0]->{experiment}; } + + +=head2 get_InputSubsets + + Example : my @subsets = @{$exp_set->get_InputSubsets()}; + Description: Getter for the InputSubsets for this InputSet. + Returntype : Arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_InputSubsets{ + my ($self) = shift; + + return [ values %{$self->{'subsets'}} ]; +} + + + + +=head2 get_subset_by_name + + Example : my $subsets = $exp_set->get_subset_by_name('subset1'); + Description: Getter for the subset of a given name for this InputSet. + Returntype : Bio::EnsEMBL::Funcgen::InputSubset + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_subset_by_name{ + my ($self, $name) = @_; + return (exists $self->{'subsets'}{$name}) ? $self->{'subsets'}{$name} : undef; +} + + +=head2 get_subset_names + + Example : my @subset_names = @{$exp_set->get_subset_names()}; + Description: Getter for the subset names for this InputSet. + Returntype : Arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_subset_names{ + my ($self) = shift; + return [ keys %{$self->{'subsets'}} ]; +} + + + + +=head2 vendor + + Arg[1] : String - vendor e.g. ILLUMINA + Example : my $iset_vendor = $iset->vendor; + Description: Getter for the vendor attribute of this InputSet. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub vendor { return $_[0]->{vendor}; } + + +=head2 format + + Arg[1] : string - format i.e. product type/format + Example : my $iset_format = $iset->format; + Description: Getter for the format attribute of this InputSet. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub format { return $_[0]->{format}; } + + +=head2 replicate + + Arg[1] : Integer - replicate 0 = merged or NA, >0 refers to individual replicate + Example : if($iset->replicate){ #Do something replicate specific in here } + Description: Getter for the replicate attribute of this InputSet. + Returntype : Integer + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub replicate { return $_[0]->{replicate}; } + + + +=head2 source_info + + Example : my $source_info = $input_set->source_info; + Description: Getter for the experiment source info i.e. [ $label, $url ] + Returntype : Listref + Exceptions : None + Caller : General + Status : At risk + +=cut + +#Currently handling redundant/absent InputSubset data + +sub source_info{ + my $self = shift; + + if(! defined $self->{source_info}){ + #could have data_url as highest priority here + #but we need to ensure removal when adding archive ids + #so we link to the archive and not the old data url + + my $exp_group = $self->get_Experiment->experimental_group; + my %source_info; #Handles redundant InputSubsets + my ($proj_name, $proj_link, $source_label, $source_link); + + if($exp_group->is_project){ + $proj_name = $exp_group->name; + $proj_link = $exp_group->url; + } + + foreach my $isset(@{$self->get_InputSubsets}){ + + if(defined $isset->archive_id ){ + $source_label = $isset->archive_id; + + if(! exists $source_info{$source_label}){ + $source_info{$source_label} = [$source_label, undef]; + #source_link can is undef here as archive_id overrides display url + #undef links will automatically go to the SRA + } + } + elsif(defined $proj_name){ + #$source_label = $self->experimental_group->name; + $source_link = $isset->display_url || $proj_link; + + if(! exists $source_info{$source_link}){ + $source_info{$source_link} = [$proj_name, $source_link]; + } + } + } + + $self->{source_info} = [values %source_info]; + } + + return $self->{source_info}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSubset.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSubset.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,210 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::InputSubset +# + + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::InputSubset - A module to represent InputSubset object. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::InputSubset; + +my $input_subset = Bio::EnsEMBL::Funcgen::InputSubset->new + ( + -DBID => $dbID, + -ADAPTOR => $self, + -NAME => $name, + -INPUT_SET => $iset, + -archive_id => $archive_id, + -display_url => $display_url, + -replicate => $iss_rep, + -is_control => $is_control, + ); + + + +=head1 DESCRIPTION + +An InputSubset object represents an individual distinct input within a given InputSet. This +normally translates to single file or replicate. There is no dedicated InputSubsetAdaptor, +store and fetch functionality is embedded within the InputSetAdaptor. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::InputSubset; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Example : my $eset = Bio::EnsEMBL::Funcgen::InputSubset->new + ( + -DBID => $dbID, + -ADAPTOR => $self, + -NAME => $name, + -INPUT_SET => $iset, + -archive_id => $archive_id, + -display_url => $display_url, + -replicate => $iss_rep, + -is_control => $is_control, + ); + + + Description: Constructor for InputSubset objects. + Returntype : Bio::EnsEMBL::Funcgen::InputSubset + Exceptions : Throws if no name defined + Caller : InputSetAdaptor + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #do we need to add $fg_ids to this? Currently maintaining one feature_group focus.(combi exps?) + my ($name, $eset, $archive_id, + $display_url, $rep, $is_control) + = rearrange(['NAME', 'INPUT_SET', 'ARCHIVE_ID', + 'DISPLAY_URL', 'REPLICATE', 'IS_CONTROL'], @_); + + + throw('Must provide a name argument') if ! defined $name; + + if(!(ref($eset) && + $eset->isa('Bio::EnsEMBL::Funcgen::InputSet') + && $eset->dbID())){ + throw('Must provide a valid stored input_set argument'); + } + + + $self->{name} = $name; + $self->{input_set} = $eset; + $self->{archive_id} = $archive_id; + $self->{display_url} = $display_url; + $self->{replicate} = $rep; + $self->{is_control} = $is_control; + + return $self; +} + + +=head2 name + + Example : my $name = $exp_sset->name(); + Description: Getter for the name of this InputSubset. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub name { return $_[0]->{name}; } + + +=head2 input_set + + Example : my $input_set = $input_sset->input_set; + Description: Getter for the input_set attribute of this InputSubset. + Returntype : Bio::EnsEMBL::Funcgen::InputSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub input_set { return $_[0]->{input_set}; } + + +=head2 archive_id + + Example : my $archive_id = $inp_sset->archive_id; + Description: Getter for the archive of this InputSubset. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub archive_id { return $_[0]->{archive_id}; } + + +=head2 display_url + + Example : my $url = $inp_sset->displau_url; + Description: Getter for the display_url of this InputSubset. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub display_url{ return $_[0]->{display_url}; } + + +=head2 replicate + + Example : my $rep = $inp_sset->replicate; + Description: Getter for the replicate attribute of this InputSubset. + Returntype : Integer + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub replicate { return $_[0]->{replicate}; } + + +=head2 is_control + + Example : if($input_sset->is_control){ # Do some control specific stuff here } + Description: Getter for the is_control attribute of this InputSubset. + Returntype : Boolean + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub is_control { return $_[0]->{is_control}; } + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/MotifFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/MotifFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,360 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::MotifFeature +# +# You may distribute this module under the same terms as Perl itself + +=head1 NAME + +Bio::EnsEMBL::MotifFeature - A module to represent a feature mapping as based +on a binding matrix e.g position weight matrix + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::MotifFeature; + +my $feature = Bio::EnsEMBL::Funcgen::MotifFeature->new( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -DISPLAY_LABEL => $text, + -SCORE => $score, + -FEATURE_TYPE => $ftype, + -INTERDB_STABLE_ID => 1, +); + + + +=head1 DESCRIPTION + +A MotifFeature object represents the genomic placement of a sequence motif. +For example a transcription factor binding site motif associated with a +position weight matrix. These are generally associated with AnnotatedFeatures +of the corresponding FeatureType. + + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor + + +=head1 LICENSE + + Copyright (c) 1999-2009 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::MotifFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Funcgen::Storable; + + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + + Arg [-SCORE] : (optional) int - Score assigned by analysis pipeline + Arg [-SLICE] : Bio::EnsEMBL::Slice - The slice on which this feature is. + Arg [-BINDING_MATRIX] : Bio::EnsEMBL::Funcgen::BindingMatrix - Binding Matrix associated to this feature. + Arg [-START] : int - The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int -The end coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-DISPLAY_LABEL] : string - Display label for this feature + Arg [-STRAND] : int - The orientation of this feature. Valid values are 1, -1 and 0. + Arg [-dbID] : (optional) int - Internal database ID. + Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor - Database adaptor. + + Example : my $feature = Bio::EnsEMBL::Funcgen::MotifFeature->new( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -BINDING_MATRIX => $bm, + -DISPLAY_LABEL => $text, + -SCORE => $score, + -INTERDB_STABLE_ID => 1, + ); + + + Description: Constructor for MotifFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::MotifFeature + Exceptions : Throws if BindingMatrix not valid + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + my $bmatrix; + ($self->{'score'}, $bmatrix, $self->{'display_label'}, $self->{'interdb_stable_id'}) + = rearrange(['SCORE', 'BINDING_MATRIX', 'DISPLAY_LABEL', 'INTERDB_STABLE_ID'], @_); + + + if(! (ref($bmatrix) && $bmatrix->isa('Bio::EnsEMBL::Funcgen::BindingMatrix'))){ + throw('You must pass be a valid Bio::EnsEMBL::Funcgen::BindingMatrix'); + } + + $self->{'binding_matrix'} = $bmatrix; + + return $self; +} + +=head2 new_fast + + Args : Hashref with all internal attributes set + Example : none + Description: Quick and dirty version of new. Only works if the calling code + is very disciplined. + Returntype : Bio::EnsEMBL::Funcgen::MotifFeature + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub new_fast { + return bless ($_[1], $_[0]); +} + + + +=head2 binding_matrix + + Example : my $bmatrix_name = $mfeat->binding_matrix->name; + Description: Getter for the BindingMatrix attribute for this feature. + Returntype : Bio::EnsEMBL::Funcgen::BindingMatrix + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub binding_matrix{ + return $_[0]->{'binding_matrix'}; +} + +=head2 score + + Example : my $score = $feature->score(); + Description: Getter for the score attribute for this feature. + Returntype : double + Exceptions : None + Caller : General + Status : Low Risk + +=cut + +sub score { + return $_[0]->{'score'}; +} + + +=head2 display_label + + Example : my $label = $feature->display_label(); + Description: Getter for the display label of this feature. + Returntype : str + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub display_label { + #If not set in new before store, a default is stored as: + #$mf->binding_matrix->feature_type->name.':'.$mf->binding_matrix->name(); + + return $_[0]->{'display_label'}; +} + + + +=head2 associated_annotated_features + + Example : my @associated_afs = @{$feature->associated_annotated_features()}; + Description: Getter/Setter for associated AnntoatedFeatures. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen:AnnotatedFeature objects + Exceptions : None + Caller : General + Status : At risk - may change to associated_transcript_factor_features + +=cut + +sub associated_annotated_features{ + my ($self, $afs) = @_; + + #Lazy load as we don't want to have to do a join on all features when most will not have any + + + if(defined $afs){ + + if(ref($afs) eq 'ARRAY'){ + + foreach my $af(@$afs){ + + if( ! $af->isa('Bio::EnsEMBL::Funcgen::AnnotatedFeature') ){ + throw('You must pass and ARRAYREF of stored Bio::EnsEMBL::Funcgen::AnnotatedFeature objects'); + } + #test is stored in adaptor + } + + if(defined $self->{'associated_annotated_features'}){ + warn('You are overwriting associated_annotated_features for the MotifFeature'); + #we could simply add the new ones and make them NR. + } + + $self->{'associated_annotated_features'} = $afs; + } + else{ + throw('You must pass and ARRAYREF of stored Bio::EnsEMBL::Funcgen::AnnotatedFeature objects'); + } + } + + + if(! defined $self->{'associated_annotated_features'}){ + + if(defined $self->adaptor){ + $self->{'associated_annotated_features'} = + $self->adaptor->db->get_AnnotatedFeatureAdaptor->fetch_all_by_associated_MotifFeature($self); + } + } + + #This has the potential to return undef, or an arrayref which may be empty. + return $self->{'associated_annotated_features'}; +} + + +=head2 is_position_informative + + Arg [1] : int - 1-based position within the Motif + Example : $mf->is_position_informative($pos); + Description: Indicates if a given position within the motif is highly informative + Returntype : boolean + Exceptions : throws if position out of bounds ( < 1 or > length of motif) + Caller : General + Status : At High risk + +=cut + +sub is_position_informative { + my ($self,$position) = (shift,shift); + throw "Need a position" if(!defined($position)); + throw "Position out of bounds" if(($position<1) || ($position>$self->binding_matrix->length)); + #if on the opposite strand, then need to reverse complement the position + if($self->strand < 0){ $position = $self->binding_matrix->length - $position + 1; } + return $self->binding_matrix->is_position_informative($position); +} + + +=head2 infer_variation_consequence + + Arg [1] : Bio::EnsEMBL::Variation::VariationFeature + Arg [2] : boolean - 1 if results in linear scale (default is log scale) + Example : my $vfs = $vf_adaptor->fetch_all_by_Slice($slice_adaptor->fetch_by_region('toplevel',$mf->seq_region_name,$mf->start,$mf->end,$mf->strand)); + foreach my $vf (@{$vfs}){ + print $mf->infer_variation_consequence($vf)."\n"; + } + + Description: Calculates the potential influence of a given variation in a motif feature. + Returns a value between -100% (lost) and +100% (gain) indicating the difference + in strength between the motif in the reference and after the variation. + + The variation feature slice needs to be the motif feature, including the strand + Returntype : float + Exceptions : throws if argument is not a Bio::EnsEMBL::Variation::VariationFeature + throws if the variation feature is not contained in the motif feature + Caller : General + Status : At High risk + +=cut + +sub infer_variation_consequence{ + my ($self, $vf, $linear) = (shift, shift, shift); + + if(! $vf->isa('Bio::EnsEMBL::Variation::VariationFeature')){ + throw "We expect a Bio::EnsEMBL::Variation::VariationFeature object, not a ".$vf->class; + } + + #See if these checks are required or if there are more efficient ways to do the checks... + #if(($self->slice->seq_region_name ne $vf->slice->seq_region_name) || + # ($self->slice->start != $vf->slice->start) || + # ($self->slice->end != $vf->slice->end) ){ + # throw "Variation and Motif are on distinct slices"; + #} + #if(!(($vf->start >= $self->start) && ($vf->end <= $self->end ))){ + # throw "Variation should be entirely contained in the Motif"; + #} + + if( ($vf->start < 1) || ($vf->end > $self->binding_matrix->length)){ throw "Variation not entirely contained in the motif feature"; } + + if(!($vf->allele_string =~ /^[ACTG]\/[ACTG]$/)){ throw "Currently only SNPs are supported"; } + + my $ref_seq = $self->seq; + + my $variant = $vf->allele_string; + $variant =~ s/^.*\///; + $variant =~ s/\s*$//; + + my ($vf_start,$vf_end) = ($vf->start, $vf->end); + if($vf->strand == -1){ + #Needed for insertions + $variant = reverse($variant); + $variant =~ tr/ACGT/TGCA/; + } + my $var_seq = substr($ref_seq,0, $vf_start - 1).$variant.substr($ref_seq, $vf_start+length($variant)-1); + + my $bm = $self->{'binding_matrix'}; + return 100 * ($bm->relative_affinity($var_seq,$linear) - $bm->relative_affinity($ref_seq,$linear)); + +} + +=head2 interdb_stable_id + + Arg [1] : (optional) int - stable_id e.g 1 + Example : my $idb_sid = $feature->interdb_stable_id(); + Description: Getter for the interdb_stable_id attribute for this feature. + This is simply to avoid using internal db IDs for inter DB linking + Returntype : int + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub interdb_stable_id { + return $_[0]->{'interdb_stable_id'}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/ArrayDesign.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/ArrayDesign.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,434 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign + +=head1 SYNOPSIS + + my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign"; + push @INC, $parser_type; + my $imp = $class->SUPER::new(@_); my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); + + $imp->set_config(); + + +=head1 DESCRIPTION + +This is a definitions class which should not be instatiated directly, it +normally inherited from the Importer. ArrayDesign contains meta data and methods +specific to handling array designs only (i.e. no experimental data), which have +been produced from the eFG array design software. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign; + +use Bio::EnsEMBL::Funcgen::Array; +use Bio::EnsEMBL::Funcgen::ProbeSet; +use Bio::EnsEMBL::Funcgen::Probe; +use Bio::EnsEMBL::Funcgen::ProbeFeature; +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Funcgen::ExperimentalChip; +use Bio::EnsEMBL::Funcgen::ArrayChip; +use Bio::EnsEMBL::Funcgen::Channel; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file); +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use strict; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + + + +=head2 new + + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for ArrayDesign class + Returntype : Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign + Exceptions : throws if Experiment name not defined or if caller is not Importer + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub new{ + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(); + + throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + + $self->{'config'} = + {( + probe_data => ["probe"], + prb_fields => ['SEQ_ID', 'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'], + notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'], + )}; + + return $self; +} + + + +=head2 set_config + + Example : my $self->set_config; + Description: Sets attribute dependent config + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + +sub set_config{ + my ($self) = @_; + + #placeholder method + #set paths + + return; +} + + +=head2 read_array_data + + Example : $imp->read_array_data(); + Description: Parses NimbleGen style DesignNotes.txt format files to create and store new Arrays + Returntype : none + Exceptions : None + Caller : general + Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats? + +=cut + +#this is currently OLIGO specific. + +sub read_array_data{ + my ($self, $design_notes) = @_; + + $self->log("Reading and importing array data"); + + throw('You need to pass the path to a DesignNotes.txt file') if ! defined $design_notes; + $self->{'design_notes'} = $design_notes; + + my ($line, $array, $array_chip, @data, %hpos); + my $oa_adaptor = $self->db->get_ArrayAdaptor(); + my $ac_adaptor = $self->db->get_ArrayChipAdaptor(); + my $fh = open_file("<", $self->{'design_notes'}); + + while ($line = <$fh>){ + + $line =~ s/\r*\n//;#chump + @data = split/\t/o, $line; + + + + + #We need to have a DESIGN vendor type? + #also need to be able to set file path independently of config + + if($. == 1){ + %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))}; + next; + } + + ### CREATE AND STORE Array and ArrayChips + if(! defined $array ){ + #This is treating each array chip as a separate array, unless arrayset is defined + #AT present we have no way of differentiating between different array_chips on same array???!!! + #Need to add functionality afterwards to collate array_chips into single array + + #This will use a stored array if present + + $array = Bio::EnsEMBL::Funcgen::Array->new + ( + -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], + -FORMAT => uc($self->format()), + -VENDOR => uc($self->vendor()), + -TYPE => 'OLIGO', + -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here + ); + + ($array) = @{$oa_adaptor->store($array)}; + + + $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -ARRAY_ID => $array->dbID(), + -NAME => $data[$hpos{'DESIGN_NAME'}], + -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], + #add description? + ); + + #This will use a stored array_chip if present + ($array_chip) = @{$ac_adaptor->store($array_chip)}; + $array->add_ArrayChip($array_chip); + + } + elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ + + $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n"); + + $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -ARRAY_ID => $array->dbID(), + -NAME => $data[$hpos{'DESIGN_NAME'}], + -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], + ); + + ($array_chip) = @{$ac_adaptor->store($array_chip)}; + $array->add_ArrayChip($array_chip); + + } + elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ + throw("Found experiment with more than one design without -array_set"); + } + } + + + $self->add_Array($array); + + close($fh); + + return; + +} + + + + + +=head2 read_probe_data + + Example : $imp->read_probe_data(); + Description: Parses and imports probes, probe sets and features for a given array design + Returntype : none + Exceptions : throws is not tiling format + Caller : Importer + Status : at risk + +=cut + + +#Assumes one chip_design per experimental set. +sub read_probe_data{ + my ($self, $array_file) = @_; + + $self->log("Reading and importing probe data"); + + + my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes); + my $aa = $self->db->get_AnalysisAdaptor(); + my $manal = $aa->fetch_by_logic_name('MASCycles'); + my $uanal = $aa->fetch_by_logic_name('UScore'); + my $tmanal= $aa->fetch_by_logic_name('NimblegenTM'); + + $array_file ||= $self->array_file(); + $self->log("Parsing ".$self->vendor()." probe data (".localtime().")"); + throw("ArrayDesign only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED'); + + ### Read in + # eFG prb file, not chiip info yet so only one ArrayChip per design + # potential to have pos file here for probes built on generic slices of genome + + #We need to handle different coord systems and possibly different assmemblies + my $slice_a = $self->db->get_SliceAdaptor(); + my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version( + 'chromosome', + $self->db->_get_schema_build($self->db->dnadb()) + ); + + + #sanity check we're only dealing with one array/chip + my @arrays = @{$self->arrays()}; + if(scalar(@arrays) != 1){ + throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'}); + } + + my @achips = @{$arrays[0]->get_ArrayChips()}; + if(scalar(@achips) != 1){ + throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'}); + } + + my $achip = $achips[0]; + + #foreach my $array(@{$self->arrays()}){ + + + # foreach my $achip(@{$array->get_ArrayChips()}){ + + $self->log("Importing array design(".$achip->name().") from ".$array_file); + + + if($achip->has_status('IMPORTED')){ + $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); + return; + }elsif($self->recovery()){ + $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id()); + $self->db->rollback_ArrayChip([$achip]); + } + + $self->log("Importing ArrayChip:".$achip->design_id()); + + + #OPEN PROBE IN/OUT FILES + $fh = open_file("<", $array_file); + my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'}); + my ($op, $of, %pfs); + + #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap + my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##??? + my $strand = 0; #default for TileMap, should be config hash? + my $fasta = ""; + + while($line = <$fh>){ + $line =~ s/\r*\n//; + @data = split/\t/o, $line; + my $loc = ""; + + #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES + #X 3000001 3000100 3000041 56 TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT 73.09 132 15 + + + if ($. == 1){ + %hpos = %{$self->set_header_hash(\@data, $self->get_config('prb_fields'))}; + next; + } + + + #This assumes tiling format with no feature/probe sets + + + if(%pfs){ + $self->store_set_probes_features($achip->dbID(), \%pfs); + undef %pfs; + } + + + #PROBE + $op = Bio::EnsEMBL::Funcgen::Probe->new( + -NAME => $data[$hpos{'PROBE_ID'}], + -LENGTH => $data[$hpos{'LENGTH'}], + -ARRAY => $arrays[0], + -ARRAY_CHIP_ID => $achip->dbID(), + -CLASS => 'DESIGN', + ); + + $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]); + $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]); + $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]); + + #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs + #options: + #associate cs dependent scores with features + #we are duplicating cs in probe_design table + + #do we need another object? ProbeDesign + #-cs would be empty for all but uscore + #-mas_cycles analysis_id + #-uscore analysis_id cs_id + #-tm analysis_id (anal most likely wont change so highly redundant) + #this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other + #would however provide for extensible design attributes + #would only need one tm and mas_cycles for each probe irrespective of cs + #could have separate table probe_design_feature? + #mmm doesn't have location, just cs + #just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL') + #or just make uscore method dependent on cs. + #or split table? + #ProbeDesign::add_analysis_score + #ProbeDesign::add_coord_sys_analysis_score + + #can we add this directly to the probe? + #separate retrieval in ProbeAdaptor so we're not joining everytime + #this would require generic get_analysis/analysis_coord_system_attribute method + + + + %{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( + probe => $op, + features => [], + ); + + + #PROBE FEATURE + if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){ + warn("Skipping non-standard probe chromosome"); + undef %pfs; + next; + } + + my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]); + + if ($self->{'_dump_fasta'}){ + $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};"; + } + + + $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new + ( + -START => $data[$hpos{'POSITION'}], + -END => $end, + -STRAND => $strand, + -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]), + -ANALYSIS => $anal, + -MISMATCHCOUNT => 0, + -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M', + -PROBE => undef,#Need to update this in the store method + ); + + + push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; + + if($self->{'_dump_fasta'}){ + #filter controls/randoms? Or would it be sensible to see where they map + #wrap seq here? + $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. + "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; + } + } + + #need to store last data here + $self->store_set_probes_features($achip->dbID(), \%pfs); + $self->log(join("\n", @log)); + $achip->adaptor->set_status("IMPORTED", $achip); + $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); + + if ($self->{'_dump_fasta'}){ + print $f_out $fasta if($self->{'_dump_fasta'}); + close($f_out); + } + + $self->log("Finished parsing probe data"); + #Total probe_sets:\t$psid\n". + # "Total probes:\t$pid\nTotal probe_features:\t$fid"); + + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/BaseExternalParser.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/BaseExternalParser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,269 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; + +use strict; + +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Analysis; +#use Bio::EnsEMBL::Funcgen::Parsers::BaseImporter; +#use vars qw(@ISA) +#@ISA = ('Bio::EnsEMBL::Funcgen::Utils::Helper'); + +use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10 + + + +# Base functionality for external_feature parsers + +#Make this inherit from Helper? +#Then change all the prints to logs + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + #validate and set type, analysis and feature_set here + my ($type, $db, $archive, $import_fsets) = rearrange(['TYPE', 'DB', 'ARCHIVE', 'IMPORT_SETS'], @_); + + #What is ExternalParser specific here? + #archive? + #type? is this even used? + + + #throw('You must define a type of external_feature to import') if(! defined $type); + + if (! ($db && ref($db) && + $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + throw('You can only specify either -clobber|rollback or -archive, but not both') if($self->rollback && $archive); + + $self->{display_name_cache} = {}; + $self->{'db'} = $db; + #$self->{type} = $type; + $self->{archive} = $archive if defined $archive; + + + #This is not fully implemented yet and need to be validated against the config feature_set + #pass something like set1,set2 and split and validate each. + #Or do this in the calling script? + + throw('-import_sets not fully implemented yet') if defined $import_fsets; + $self->{'import_sets'} = (defined $import_fsets) ? @{$import_fsets} : undef; + + $self->log("Parsing and loading $type ExternalFeatures"); + + return $self; + +} + + +=head2 import_sets + + Args : None + Example : foreach my $import_set_name(@{$self->import_sets}){ ... do the import ... } + Description: Getter for the list of import feature set names, defaults to all in parser config. + Returntype : Arrayref of import feature_set names + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub import_sets{ + my $self = shift; + + return $self->{'import_sets'} || [keys %{$self->{static_config}{feature_sets}}]; +} + + +=head2 set_feature_sets + + Args : None + Example : $self->set_feature_sets; + Description: Imports feature sets defined by import_sets. + Returntype : None + Exceptions : Throws if feature set already present and rollback or archive not set + Caller : General + Status : Medium Risk + +=cut + +#This is done after validate and store feature_types +#Updating this will require making all external parsers use 'static_config' + +sub set_feature_sets{ + my $self = shift; + + throw('Must provide a set feature_set config hash') if ! defined $self->{static_config}{feature_sets}; + + + my $fset_adaptor = $self->db->get_FeatureSetAdaptor; + my $analysis_adaptor = $self->db->get_AnalysisAdaptor; + + foreach my $fset_name(@{$self->import_sets}){ + + $self->log("Defining FeatureSet:\t$fset_name"); + my $fset = $fset_adaptor->fetch_by_name($fset_name); + + #we don't need data sets for external_feature sets! + #Compare against config after we have merged with defined_anld_validate etc + + if(defined $fset){ + $self->log("Found previous FeatureSet $fset_name"); + + if($self->rollback){ + + $self->rollback_FeatureSet($fset);#Need to pass \@slices here? + } + elsif($self->{'archive'}){ + my $archive_fset = $fset_adaptor->fetch_by_name($fset_name."_v".$self->{'archive'}); + + if(defined $archive_fset){ + throw("You are trying to create an archive external feature_set which already exists:\t${fset_name}_v".$self->{archive}); + } + + my $sql = "UPDATE feature_set set name='$fset_name}_v".$self->{archive}."' where name='$fset_name'"; + $self->db->dbc->do($sql); + undef $fset; + }else{ + throw("You are trying to create an external feature_set which already exists:\t$fset_name\nMaybe to want to rollback or archive?"); + } + } + + #Assume using static config for now + #Will need to resolve this when it become generic + #Maybe we set outside of config! + #simply as analyses, feature_sets and feature_types? + my $fset_config = $self->{static_config}{feature_sets}{$fset_name}{feature_set}; + + + if(! defined $fset){ + my ($name, $analysis, $ftype, $display_label, $desc); + my $fset_analysis_key = (exists ${$fset_config}{-analysis}) ? '-analysis' : '-ANALYSIS'; + my $fset_name_key = (exists ${$fset_config}{-name}) ? '-name' : '-NAME'; + my $fset_ftype_key = (exists ${$fset_config}{-feature_type}) ? '-feature_type' : '-FEATURE_TYPE'; + my $fset_dlabel_key = (exists ${$fset_config}{-display_label}) ? '-display_label' : '-DISPLAY_LABEL'; + my $fset_desc_key = (exists ${$fset_config}{-description}) ? '-description' : '-DESCRIPTION'; + my $display_name = (exists ${$fset_config}{$fset_dlabel_key}) ? $fset_config->{$fset_dlabel_key} : $fset_name; + #fset config name be different from key name + my $fs_name = (exists ${$fset_config}{$fset_name_key}) ? $fset_config->{$fset_name_key} : $fset_name; + #warn if they are different? + + + #Can't just deref config hash here as we need to deref the nested feature_type and analysis attrs + + $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new( + -name => $fs_name, + -feature_class=> 'external', + -analysis => ${$fset_config->{$fset_analysis_key}}, + -feature_type => ${$fset_config->{$fset_ftype_key}}, + -display_label => $display_name, + -description => $fset_config->{$fset_desc_key} + ); + + ($fset) = @{$self->db->get_FeatureSetAdaptor->store($fset)}; + } + + #Now replace config hash with object + #Will this reset in hash or just locally? + #$fset_config = $fset; + $self->{static_config}{feature_sets}{$fset_name}{feature_set} = $fset; + } + + return; +} + +#Can't use this anymore as we have to use static_config for all parsers which use set_feature_sets + +# +#=head2 validate_and_store_feature_types +# +# Args : None +# Example : $self->validate_and_store_feature_types; +# Description: Imports feature types defined by import_sets. +# Returntype : None +# Exceptions : None +# Caller : General +# Status : High Risk - Now using BaseImporter::validate_and_store_config for vista +# +#=cut +# +##Change all external parsers to use BaseImporter::validate_and_store_config +# +#sub validate_and_store_feature_types{ +# my $self = shift; +# +# #This currently only stores ftype associated with the feature_sets +# #Havent't we done this already in the InputSet parser +# #Need to write BaseImporter and inherit from there. +# +# #InputSet does all loading, but depends on 'user_config' +# #Where as we are using hardcoded config here +# #Which are import_sets currently defaults to feature_sets keys +# +# #we could simply call this static_config and let user_config over-write static config with warnings? +# #on an key by key basis? (top level only?) +# +# +# my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; +# +# foreach my $import_set(@{$self->import_sets}){ +# +# my $ftype_config = ${$self->{static_config}{feature_sets}{$import_set}{feature_type}}; +# my $ftype = $ftype_adaptor->fetch_by_name($ftype_config->{'name'}); +# +# $self->log("Validating $import_set FeatureType:\t".$ftype_config->{'name'}); +# +# if(! defined $ftype){ +# $self->log("FeatureType '".$ftype_config->{'name'}."' for external feature_set ".$self->{'type'}." not present"); +# $self->log("Storing using type hash definitions"); +# +# $ftype = Bio::EnsEMBL::Funcgen::FeatureType->new( +# -name => $ftype_config->{'name'}, +# -class => $ftype_config->{'class'}, +# -description => $ftype_config->{'description'}, +# ); +# ($ftype) = @{$ftype_adaptor->store($ftype)}; +# } +# +# #Replace hash config with object +# $self->{static_config}{feature_types}{$ftype_config->{'name'}} = $ftype; +# } +# +# return; +#} +# + + + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/BaseImporter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/BaseImporter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1056 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::BaseImporter + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +This is the start of merging the Importer with the InputSet & BaseExternal Parsers. +This class holds all generic methods used for importing data into the funcgen schema. +Move all generic methods from Importer to here, and move format specific methods to new parsers. +Then remove Importer completely. + +=head1 SEE ALSO + + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::BaseImporter; + +use strict; + + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Registry; + + +use base qw(Bio::EnsEMBL::Funcgen::Utils::Helper); #@ISA change to parent with perl 5.10 + + +# Need: +# Parser::Experiment? All but external +# Parser::ArrayExperiment (Inherits from ArrayDesign?) Or Nimblegen? + +#new +#edb_release over-ride, to enable loading of old data. +#How do we handle set rollback? (with force for associated sets) +# + +#Force was actually specific to store_window_bin_by_Slice_Parser +# + +=head2 new + + Arg [1] : HASH containing attributes: + -rollback Performs full rollback of imported features. + -recover Performs rollback of features/sets which do + not have an associated Imported status. + + -config_file Path to file containing config hash. + + - + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for Bed class + Returntype : Bio::EnsEMBL::Funcgen::Parsers::Simple + Exceptions : throws if caller is not Importer + Caller : Bio::EnsEMBL::Funcgen::Parsers:Simple + Status : at risk + +=cut + + + +#Remove rollback/clobber from here +#The idea being we completely separate the script which runs the import/rollback +#so we never accidentally rollback a set with a stray -rollback flag. +#Implement this at the script level? Then we can re-use the methods to perform the rollback? +#This means all the rollback function will be tied up with all the import pre-reqs +#want to rollback by filepath(InputSubset name)/InputSet name + +#Would just need to revoke states before calling validate_files +#Can do this manually before import + + +#Inheritance fix +#Can we move Importer new to here? ISA dodgyness and alter the scripts accordingly? +#The we can move all the set_config stuff the the parser new methods and remove set_config +#Issues : +#1 Non-generic params +#2 How will Importer methods be inherited if we remove it from ISA like this +# Either make Inheritance Parser->Importer->BaseImporter until we remove the Importer +# Or fix in one go. +# Or just move DB creation stuff here such that we can maintain current broken inheritance +# But allow removal of set_config +# So how much of the DB creation stuff can we add here without clashing with BaseExternalParser? +# BaseExternalParser has DB passed, whilst Importer allows both params and DB to be passed. + + +sub new{ + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my $reg = "Bio::EnsEMBL::Registry"; + my ($config_file, $clobber, $rollback, $species, $fset_desc, + $user, $host, $port, $pass, $dbname, $db, $ssh, + $assm_version, $release, $reg_config, $verbose, $slices, + $reg_db, $reg_host, $reg_port, $reg_user, $reg_pass, + $ftype_name, $ctype_name, $feature_analysis, $no_disconnect); + + + #Set some directly here to allow faster accessor only methods + + ($ftype_name, $ctype_name, $feature_analysis, $self->{feature_set_desc}, + $species, $db, $user, $host, + $port, $pass, $dbname, $assm_version, $ssh, + $release, $reg_config, $reg_db, $reg_host, + $reg_port, $reg_user, $reg_pass, $verbose, + $slices, $self->{recover}, $clobber, $rollback, + $config_file, $self->{ucsc_coords}, $self->{_dump_fasta}, $no_disconnect, + ) = rearrange + (['FEATURE_TYPE_NAME', 'CELL_TYPE_NAME', 'FEATURE_ANALYSIS', 'FEATURE_SET_DESCRIPTION', + 'species', 'db', 'user', 'host', + 'PORT', 'PASS', 'DBNAME', 'ASSEMBLY', 'SSH', + 'RELEASE', 'REG_CONFIG', 'REGISTRY_DB', 'REGISTRY_HOST', + 'REGISTRY_PORT', 'REGISTRY_USER', 'REGISTRY_PASS', 'VERBOSE', + 'slices', 'recover', 'clobber', 'rollback', + 'config_file', 'ucsc_coords', 'DUMP_FASTA', 'no_disconnect'], + @_); + + + + #Other stuff to bring in here: + # dirs + # prepared/batch/farm + # analysis/cell/feature_type/feature_set_description + # This is redundant wrt config leave for now and catch + + #Can we move all of this DB/reg handling stuff out + #So it can be re-used by other modules/scripts? + + #Need to compare these to BaseExternalParser + + if( (! $species) && + $db ){ + $species = $db->species; + } + + $species || throw('Mandatory param -species not met'); + + # Registry and DB handling - re/move to separate method? + + if($reg_host && $self->{'reg_config'}){ + warn "You have specified registry parameters and a config file:\t".$self->{'reg_config'}. + "\nOver-riding config file with specified paramters:\t${reg_user}@${reg_host}:$reg_port"; + } + + + #### Set up DBs and load and reconfig registry + + # Load Registry using assembly version + # Then just redefine the efg DB + + #We have problems here if we try and load on a dev version, where no dev DBs are available on ensembldb + #Get the latest API version for the assembly we want to use + #Then load the registry from that version + #Then we can remove some of the dnadb setting code below? + #This may cause problems with API schema mismatches + #Can we just test whether the current default dnadb contains the assembly? + #Problem with this is that this will not have any other data e.g. genes etc + #which may be required for some parsers + + #How does the registry pick up the schema version?? + + #We should really load the registry first given the dnadb assembly version + #Then reset the eFG DB as appropriate + $self->{'reg_config'} = $reg_config || ((-f "$ENV{'HOME'}/.ensembl_init") ? "$ENV{'HOME'}/.ensembl_init" : undef); + + if ($reg_host || ! defined $self->{'_reg_config'}) { + #defaults to current ensembl DBs + $reg_host ||= 'ensembldb.ensembl.org'; + $reg_user ||= 'anonymous'; + + #Default to the most recent port for ensdb + if( (! $reg_port) && + ($reg_host eq 'ensdb-archive') ){ + $reg_port = 5304; + } + + #This will try and load the dev DBs if we are using v49 schema or API? + #Need to be mindful about this when developing + #we need to tip all this on it's head and load the reg from the dnadb version + + my $version = $release || $reg->software_version; + $self->log("Loading v${version} registry from $reg_user".'@'.$reg_host); + + #Note this defaults API version, hence running with head code + #And not specifying a release version will find not head version + #DBs on ensembldb, resulting in an exception from reset_DBAdaptor below + + #This currently loads from reg_host + #Which triggers the funcgen DB to try and _set_dnadb + #even if we have specified as db/dnadb already,as this is reset after this + #we probably just want to use the dnadb_host as the default reg_host + #This also need cleaning up wrt DBAdaptor behaviours + #Simply and/or remove + + my $num_dbs = $reg->load_registry_from_db + ( + -host => $reg_host, + -user => $reg_user, + -port => $reg_port, + -pass => $reg_pass, + -db_version => $version, + -verbose => $verbose, + ); + + if(! $num_dbs){ + throw("Failed to load any DBs from $reg_user".'@'.$reg_host." for release $version.\n". + "This will result in a failure to validate the species.\n". + "Please define a valid -release for the registry and/or registry params/config\n". + "Or select a -registry_host which matches the API version:\t".$reg->software_version); + } + + if ((! $dbname) && (! $db)){ + throw('Not sensible to set the import DB as the default eFG DB from ' + .$reg_host.', please define db params'); + } + } + else{ + $self->log("Loading registry from:\t".$self->{'_reg_config'}); + $reg->load_all($self->{'_reg_config'}, 1); + } + + + #Need to test the DBs here, as we may not have loaded any! + #get_alias will fail otherwise + #This is a cyclical dependancy as we need alias to get species which we use to grab the DB + #alias is dependant on core DB being loaded with relevant meta entries. + #revise this when we split the Importer + + #Validate species + my $alias = $reg->get_alias($species) || throw("Could not find valid species alias for $species"); + #You might want to clean up:\t".$self->get_dir('output')); + $self->{species} = $alias; + $self->{'param_species'} = $species; #Only used for dir generation + + + + #SET UP DBs + if($db){ + #db will have been defined before reg loaded, so will be present in reg + + if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + $self->throw('-db must be a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + } + else{ #define eFG DB from params or registry + + if($reg_db){#load eFG DB from reg + + if($dbname){ + throw("You cannot specify DB params($dbname) and load from the registry at the same time."); + } + + $self->log('WARNING: Loading eFG DB from Registry'); + $db = $reg->get_DBAdaptor($self->species(), 'funcgen'); + throw("Unable to retrieve ".$self->species." funcgen DB from the registry") if ! $db; + } + else{#resets the eFG DB in the custom or generic registry + + $dbname || throw('Must provide a -dbname if not using default custom registry config'); + $pass || throw('Must provide a -pass parameter'); + + #remove this and throw? + if(! defined $host){ + $self->log('WARNING: Defaulting to localhost'); + $host = 'localhost'; + } + + $port ||= 3306; + my $host_ip = '127.0.0.1';#is this valid for all localhosts? + + if ($ssh) { + $host = `host localhost`; #mac specific? nslookup localhost wont work on server/non-PC + #will this always be the same? + + if (! (exists $ENV{'EFG_HOST_IP'})) { + warn "Environment variable EFG_HOST_IP not set for ssh mode, defaulting to $host_ip for $host"; + } else { + $host_ip = $ENV{'EFG_HOST_IP'}; + } + + if ($self->host() ne 'localhost') { + warn "Overriding host ".$self->host()." for ssh connection via localhost($host_ip)"; + } + } + + + #data version is only used if we don't want to define the dbname + #This should never be guessed so don't need data_version here + #$dbname ||= $self->species()."_funcgen_".$self->data_version(); + + + #Remove block below when we can + my $dbhost = ($ssh) ? $host_ip : $host; + + #This isn't set yet!? + #When we try to load e.g. 49, when we only have 48 on ensembldb + #This fails because there is not DB set for v49, as it is not on ensembl DB + #In this case we need to load from the previous version + #Trap this and suggest using the -schema_version/release option + #Can we autodetect this and reload the registry? + #We want to reload the registry anyway with the right version corresponding to the dnadb + #We could either test for the db in the registry or just pass the class. + + $db = $reg->reset_DBAdaptor($self->species, 'funcgen', $dbname, $dbhost, $port, $user, $pass, + { + -dnadb_host => $reg_host, + -dnadb_port => $reg_port, + -dnadb_assembly => $assm_version, + -dnadb_user => $reg_user, + -dnadb_pass => $reg_pass, + }); + } + } + + + #Test connections + $self->{db} = $db; + $db->dbc->db_handle; + $db->dnadb->dbc->db_handle; + #Set re/disconnect options + + #These really need setting dependant on the import parser + $db->dbc->disconnect_when_inactive(1) if ! $no_disconnect; + $db->dnadb->dbc->disconnect_when_inactive(1) if ! $no_disconnect; + + + + #Catch config clashes/redundancy + + if($self->feature_set_description && + ($config_file || exists ${$self}{static_config}) ){ + throw('You have specified a -feature_set_description alongside user/static_config. Please define this in the config'); + } + + if( ($feature_analysis || $ctype_name || $ftype_name) && + exists ${$self}{static_config} ){ + throw('You have specified a analysis/cell/feature_type params alongside static_config. Please define this in the static_config'); + } + + #Catch no config here? Or can this be imported via Parser specific meta/config files + + + ### Check analyses/feature_type/cell_type + if($feature_analysis){ + my $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($feature_analysis); + throw("The Feature Analysis $feature_analysis does not exist in the database") if(!$fanal); + $self->feature_analysis($fanal); + + #This currently fails before the config gets loaded! + #Need to load config before this validation! + } + + if($ctype_name){ + my $ctype = $self->db->get_CellTypeAdaptor->fetch_by_name($ctype_name); + throw("The CellType $ctype_name does not exist in the database") if(!$ctype); + $self->cell_type($ctype); + } + + if ($ftype_name) { + my $ftype = $self->db->get_FeatureTypeAdaptor->fetch_by_name($ftype_name); + throw("The FeatureType $ftype_name does not exist in the database") if(!$ftype); + $self->feature_type($ftype); + } + + + + #Set some attrs to allow setter only methods + $self->{slice_adaptor} = $db->dnadb->get_SliceAdaptor; + $self->slices($slices) if defined $slices; + $self->{rollback} = $rollback || $clobber; + $self->{counts} = {}; + $self->{seq_region_names} = [];#Used for slice based import + + + # USER CONFIG # + #Here we need to read config based on external file + #Should do something similar to set_feature_sets + #and validate_and_store_feature_types in BaseExternalParser + #but we are using define and validate sets instead + + #BaseExternalParser and BaseImporter really need to be merged + #After we have stripped out all the array/experiment specific stuff + + + if($config_file){ + my $config; + + $self->log("Reading config file:\t".$config_file); + + if(! ($config = do "$config_file")){ + throw("Couldn't parse config file:\t$config_file:\n$@") if $@; + throw("Couldn't do config:\t$config_file\n$!") if ! defined $config; + throw("Couldn't compile config_file:\t$config_file") if ! $config; + } + + #At least check it is hash + if(ref($config) ne 'HASH'){ + throw("Config file does not define a valid HASH:\t$config_file"); + } + + $self->{user_config} = $config; + + #Can call validate_and_store_config directly ehre once we have remove set_config stuff + + } + + #$self->debug(2, "BaseImporter class instance created."); + #$self->debug_hash(3, \$self); + + return $self; +} + + + + + + + +=head2 validate_and_store_config + + Args : None + Example : $self->validate_and_store_config; + Description: Imports feature types defined by import_sets. + Returntype : None + Exceptions : None + Caller : General + Status : At Risk + +=cut + +#Taken from InputSet +#Needs to support 'static' config from external parsers + +#Need to get the config has format working with this and set_feature_sets? +#Or just migrate define_and_validate_sets here now? + +#Now takes arrayref, need to change in other callers + +#All analyses and ftypes now stored here(inc fset only defined), so don't need to do this in the define sets method + +#define/set sets method should write to user config first, call validate_store, before defining sets? + +#Should we allow empty hashes to fetch from DB? + +sub validate_and_store_config{ + my ($self, $fset_names) = @_; + + if( (ref($fset_names) ne 'ARRAY') || + (scalar(@$fset_names) == 0) ){ + throw('Must pass FeatureSet names to validate_and_store_config'); + } + + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + + + #Do we want to enable no config? Just don't call this if we have no config! + + my ($static_config, $user_config, $fset_config); + + #Set here to avoid auto-vivifying in tests below + $user_config = $self->{user_config} if exists ${$self}{user_config}; + $static_config = $self->{static_config} if exists ${$self}{static_config}; + my $config = $user_config || $static_config; + + if(! $config){ + #throw('No user or static config found'); + warn('No user or static config found'); + } + elsif($user_config && $static_config){ + throw('BaseImporter does not yet support overriding static config with user config'); + #Would need to over-ride on a key by key basis, account for extra config in either static or user config? + } + else{ + #Store config for each feature set + #inc associated feature_types and analyses + #add cell_types in here? + + foreach my $import_set(@$fset_names){ + #warn "validating config for $import_set"; + + + if(exists ${$config}{feature_sets}{$import_set}){ + $fset_config = $config->{feature_sets}{$import_set}; + } + + if(! $fset_config){ + throw("Could not find config for:\t$import_set"); + } + + $self->log("Validating and storing config for:\t$import_set"); + + + #If we grab the ftype and analysis from the feature set first + #Then we can remove the reundancy in the config + #would need to handle case i.e. -ANALYSIS -analysis + #use rearrange! + + #Set in analyses and feature_types config, should use same ref if already exist + #else we have a duplicated entry using the same name, which may have different attrs! + + #This is going to be a problem as we are assigning a new value to the key? + #Do we need to use references in feature_sets? + + if(exists ${$fset_config}{feature_set}){ + #my ($fset_analysis, $fset_ftype) = rearrange(['ANALYSIS', 'FEATURE_TYPE'], %{$fset_config->{feature_set}}); + + #Can't use rearrange for key we are setting as we need to now the case + #This is grabbing top level config keys, not hashes + #config names refer to top level keys and may not match logic_name or ftype name + my $fset_params = $fset_config->{feature_set}; + my $fset_analysis_key = (exists ${$fset_params}{-analysis}) ? '-analysis' : '-ANALYSIS'; + my $analysis_config_name = (exists ${$fset_params}{$fset_analysis_key} ) ? $fset_params->{$fset_analysis_key} : undef; + my $fset_ftype_key = (exists ${$fset_params}{-feature_type}) ? '-feature_type' : '-FEATURE_TYPE'; + my $ftype_config_name = (exists ${$fset_params}{$fset_ftype_key}) ? $fset_params->{$fset_ftype_key} : undef; + + + #Check we have these in feature_sets analyses/feature_types already? + #Should that be a hash with empty {} values + #Then we can use the top level analyses hash if required + #Setting feature_set analysis/feature_type only if not present i.e. we don't over-write the top level + #defining hash + #Postponing checking here will lose context if we need to throw + #i.e. we won't know where the error has come from if the name is not present in the top level hash + #Can we call direct from here instead? + + + + #Leave to store to catch these mandatory attrs + + if($analysis_config_name){ + + if(ref(\$analysis_config_name) ne 'SCALAR'){ + throw("You must set feature_set($import_set) -analysis config as a string value i.e. a key referencing the top level analyses config"); + } + + if(! exists ${$fset_config}{analyses}{$analysis_config_name}){ #Set in analyses to validate/store below + $fset_config->{analyses}{$analysis_config_name} = {}; #Don't have to set a ref to top level here, as these will all get set to the same obj ref in validate/store + } + + #Only use refs in the feature_set + #top level and feature_set analyses get set to same obj at same time + $fset_config->{feature_set}{$fset_analysis_key} = \$fset_config->{analyses}{$analysis_config_name}; + + } + + + if($ftype_config_name){ + + if(ref(\$ftype_config_name) ne 'SCALAR'){ + throw("You must set feature_set($import_set) -feature_type config as a string value i.e. a key referencing the top level feature_types config"); + } + + if(! exists ${$fset_config}{feature_types}{$ftype_config_name}){ #Set in analyses to validate/store below + $fset_config->{feature_types}{$ftype_config_name} = {}; + } + + $fset_config->{feature_set}{$fset_ftype_key} = \$fset_config->{feature_types}{$ftype_config_name}; + + } + } + + + #Can self ref user config if 'do' will work with %config, specified as the last line + #Merge these two loops? + #Need to account for additional config keys in user/static config + + + if(exists ${$fset_config}{'analyses'}){ + + foreach my $logic_name(keys %{$fset_config->{'analyses'}}){ + $fset_config->{'analyses'}{$logic_name} = + $self->validate_and_store_analysis($logic_name, $fset_config->{'analyses'}{$logic_name}); + } + } + + if(exists ${$fset_config}{'feature_types'}){ + + foreach my $ftype_key(keys %{$fset_config->{'feature_types'}}){ + $fset_config->{'feature_types'}{$ftype_key} = + $self->validate_and_store_feature_type($ftype_key, $fset_config->{'feature_types'}{$ftype_key}); + } + } + + #if(exists ${$fset_config}{'feature_set'}){ + # #Just ignore this for now, as set_feature_set and define_and_validate_sets deal with this repectively + # #Solution is to extend define_and_validate_set to support external feature static config + # #Then remove set_feature_sets + # Also need to consider InputSet::define_sets + #} + } +} + return; +} + + +#Need some method to get and add analyses and ftypes to reduce hardcoding of key strings +#Can we add Class->new to config to provide some of this? +#Would proliferate constructor calls into config +#May also prevent some logic imposed by validate/set methods i.e. defaults? +#Has to be one or other as we can't assume we can use methods if we are dealing with a hash + +#Should we allow empty hashes to default to DB entry? +#Move HASH test into these methods + +#Need to use rearrange in these methods for case safety +#rearrange is quite slow, fine for import + +#Currently can't use empty hash to default to DB, as this gives us no key in the config +#Can we set this to the string required for the fetch method in the referenced hash(analyses/feature_types)? +# eq test would work, but would have to skip HASH test + +#Maybe we just set the analysis in the feature_set by the logic_name string, rather than a ref +#Then we test for presence in the relevant hash +#Would have to set obj and ref when validating/storing +#Would also have to do this in the feature_set feature_types/analyses i.e. we would be able to ref the whole hash +#would have to be an arrayrefs + +#Do we want to support no entries in top level definition if we can find it in the DB? + + +#Could merge these two using a config hash to define adaptor method, params etc? +#Would be easy to add more types to store e.g. cell_type + +sub validate_and_store_analysis{ + my ($self, $logic_name, $analysis_params) = @_; + my $analysis; + eval {$self->db->is_stored_and_valid('Bio::EnsEMBL::Analysis', $analysis_params)}; + + if(! $@){ #Can assume we have already set this valid Analysis + $analysis = $analysis_params + } + else{ + my $analysis_adaptor = $self->db->get_AnalysisAdaptor; + + ### Validate config entries + #Catches undef or empty {} at feature_sets and top levels, defaults to DB + my $invalid_entry = 1; + my $obj_logic_name; + my $got_config = 0; + + + if( (! defined $analysis_params ) || + (ref($analysis_params) eq 'HASH') ){ #We have a valid entry + + if( (ref($analysis_params) eq 'HASH') && + (%{$analysis_params}) ){ #number of keys + $got_config = 1; + $invalid_entry = 0; + } + else{ #empty feature_set analyses hash -> check top level analyses first + #$invalid_entry = 1; + + if(exists ${$self->{static_config}{analyses}}{$logic_name}){ + $analysis_params = $self->{static_config}{analyses}{$logic_name}; + + if( (! defined $analysis_params ) || + (ref($analysis_params) eq 'HASH') ){ + $invalid_entry = 0; + + if( (ref($analysis_params) eq 'HASH') && + (%{$analysis_params}) ){ #number of keys + $got_config = 1; + }#else is empty top level {} + + }#else is invalid + } + else{ #No top level config, assume we want to use the DB + $invalid_entry = 0; + } + } + } + + + if($invalid_entry){ + throw("You have defined a none HASH value in your config for analysis:\t$logic_name\n". + "Please define config as HASH, or use empty HASH or undef to use existing default config or DB"); + } + + if($got_config){ + ($obj_logic_name) = rearrange(['LOGIC_NAME'], %{$analysis_params}); + }else{ + $obj_logic_name = $logic_name; + } + + + if($logic_name ne $obj_logic_name){ #Not a show stopper as this is just the config key + warn "Found analysis key name - logic_name mismatch in config:\t$logic_name vs $obj_logic_name\n"; + } + + $analysis = $analysis_adaptor->fetch_by_logic_name($obj_logic_name); + + + + if($got_config){ + + my $config_anal = Bio::EnsEMBL::Analysis->new(%{$analysis_params}); + + if(! defined $analysis){ + $self->log('Analysis '.$obj_logic_name." not found in DB, storing from config"); + $analysis_adaptor->store($config_anal); + $analysis = $analysis_adaptor->fetch_by_logic_name($obj_logic_name); + } + else{ + + my $not_same = $analysis->compare($config_anal); + #Analysis::compare returns the opposite of what you expect! + + if($not_same){ + throw('There is a param mismatch between the '.$obj_logic_name. + ' Analysis in the DB and config. Please rectify or define a new logic_name'); + } + } + } + elsif(! defined $analysis){ + throw("Cannot fetch $obj_logic_name analysis from DB, please check your config key or define new top level analyses config"); + } +} + + return $analysis; +} + +sub validate_and_store_feature_type{ + my ($self, $ftype_name, $ftype_params) = @_; + my $ftype; + eval {$self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype_params)}; + + if(! $@){ #Can assume we have already set this valid Analysis + $ftype = $ftype_params + } + else{ + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + + #Validate config entries + #Catches undef or empty {} at feature_sets and top levels, defaults to DB + my $invalid_entry = 1; + my ($name, $class); + + my $got_config = 0; + + if( (! defined $ftype_params ) || + (ref($ftype_params) eq 'HASH') ){ #We have a valid entry + + if( (ref($ftype_params) eq 'HASH') && + (%{$ftype_params}) ){ #number of keys + $got_config = 1; + $invalid_entry = 0; + } + else{ #empty feature_set ftype hash -> check top level ftype first + #$invalid_entry = 1; + + if(exists ${$self->{static_config}{feature_types}}{$ftype_name}){ + $ftype_params = $self->{static_config}{feature_types}{$ftype_name}; + + if( (! defined $ftype_params ) || + (ref($ftype_params) eq 'HASH') ){ + $invalid_entry = 0; + + if( (ref($ftype_params) eq 'HASH') && + (%{$ftype_params}) ){ #number of keys + $got_config = 1; + }#else is empty top level {} + + }#else is invalid + } + else{ #No top level config, assume we want to use the DB + $invalid_entry = 0; + } + } + } + + + if($invalid_entry){ + throw("You have defined a none HASH value in your config for feature_type:\t$ftype_name\n". + "Please define config as HASH, or use empty HASH or undef to use existing default config or DB"); + } + + if($got_config){ + ($name, $class) = rearrange(['NAME', 'CLASS'], %{$ftype_params}); + }else{ + $name = $ftype_name; + } + + + + #Can't use rearrange for key we are setting as we need to now the case + my $analysis_key = (exists ${$ftype_params}{-analysis}) ? '-analysis' : '-ANALYSIS'; + my $analysis; + + if(exists ${$ftype_params}{$analysis_key}){ + #This is slightly redundant as we may have already validated this analysis + my ($lname) = rearrange(['LOGIC_NAME'], %{$ftype_params->{$analysis_key}}); + $ftype_params->{$analysis_key} = $self->validate_and_store_analysis($lname, $ftype_params->{$analysis_key}); + $analysis = $ftype_params->{$analysis_key}; + } + + my @ftypes = $ftype_adaptor->fetch_by_name($name, $class, $analysis); + + if(scalar(@ftypes) > 1){ + throw("Unable to fetch unique feature_type $name. Please specify top level config to define class (and analysis)"); + } + else{ + $ftype = $ftypes[0];#can be undef + } + + + if($got_config){ + my $config_ftype = Bio::EnsEMBL::Funcgen::FeatureType->new(%{$ftype_params}); + + if($ftype){ + + if(! $ftype->compare($config_ftype)){ + my $label = $name."($class"; + $label .= (defined $analysis) ? ' '.$analysis->logic_name.')' : ')'; + + throw('There is a param mismatch between the '.$name. + ' FeatureType in the DB and config. Please rectify in the config.'); + } + } + else{ + $self->log('FeatureType '.$name." not found in DB, storing from config"); + ($ftype) = @{$ftype_adaptor->store($config_ftype)}; + } + } + elsif(! defined $ftype){ + throw("Cannot fetch $name feature_type from DB, please check your config key or define new top level feature_types config"); + } +} + + return $ftype; +} + + + + + +sub counts{ + my ($self, $count_type) = @_; + + if($count_type){ + $self->{'_counts'}{$count_type} ||=0; + return $self->{'_counts'}{$count_type}; + } + + return $self->{'_counts'} +} + + + +sub slices{ + my ($self, $slices) = @_; + + if(defined $slices){ + + if (ref($slices) ne 'ARRAY'){ + throw("-slices parameter must be an ARRAYREF of Bio::EnsEMBL::Slices (i.e. not $slices)"); + } + + foreach my $slice(@$slices){ + + if(! ($slice && ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))){ + throw("-slices parameter must be Bio::EnsEMBL::Slices (i.e. not $slice)"); + } + + #Removed cache_slice from here as this was + #preventing us from identifying the seq_region in an input file + + my $full_slice = $self->slice_adaptor->fetch_by_name($slice->name); + + if(($slice->start != 1) || + ($slice->end != $full_slice->end)){ + throw("InputSet Parser does not yet accomodate partial Slice based import i.e. slice start > 1 or slice end < slice length:\t".$slice->name); + + } + + push @{$self->{seq_region_names}}, $slice->seq_region_name; + } + $self->{slices} = $slices; + } + + return $self->{slices} || []; +} + + +sub count{ + my ($self, $count_type) = @_; + + $self->{'_counts'}{$count_type} ||=0; + $self->{'_counts'}{$count_type}++; + return; +} + + +sub rollback{ return $_[0]->{rollback}; } + +sub recovery{ return $_[0]->{recover}; } + +=head2 db + + Example : my $db = $imp->db; + Description: Getter for the db attribute + Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor + Exceptions : None + Caller : general + Status : At risk + +=cut + +sub db{ return $_[0]->{db}; } + + +=head2 species + + Example : my $species = $imp->species; + Description: Getter for species attribute + Returntype : String + Exceptions : None + Caller : general + Status : At risk + +=cut + +sub species{ return $_[0]->{species}; } + + +=head2 ucsc_coords + + Example : $start += 1 if $self->ucsc_coords; + Description: Getter for UCSC coordinate usage flag + Returntype : Boolean + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub ucsc_coords{ return $_[0]->{ucsc_coords}; } + + +=head2 dump_fasta + + Example : if($self->dump_fasta()){...do fasta dump...} + Description: Getter for the dump_fasta flag + Returntype : Boolean + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub dump_fasta{ return $_[0]->{_dump_fasta}; } + +sub slice_adaptor{ return $_[0]->{slice_adaptor}; } + + +=head2 feature_set_description + + Example : $imp->description("ExperimentalSet description"); + Description: Getter for the FeatureSet description + Returntype : String + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub feature_set_description{ return $_[0]->{feature_set_desc}; } + + + + +=head2 project_feature + + Args [0] : Bio::EnsEMBL::Feature + Args [1] : string - Assembly e.g. NCBI37 + Example : $self->project($feature, $new_assembly); + Description: Projects a feature to a new assembly via the AssemblyMapper + Returntype : Bio::EnsEMBL::Feature + Exceptions : Throws is type is not valid. + Caller : General + Status : At risk + +=cut + +#Was in BaseExternalParser + +# -------------------------------------------------------------------------------- +# Project a feature from one slice to another +sub project_feature { + my ($self, $feat, $new_assembly) = @_; + + # project feature to new assembly + my $feat_slice = $feat->feature_Slice; + + + if(! $feat_slice){ + throw('Cannot get Feature Slice for '.$feat->start.':'.$feat->end.':'.$feat->strand.' on seq_region '.$feat->slice->name); + } + + my @segments = @{ $feat_slice->project('chromosome', $new_assembly) }; + + if(! @segments){ + $self->log("Failed to project feature:\t".$feat->display_label); + return; + } + elsif(scalar(@segments) >1){ + $self->log("Failed to project feature to distinct location:\t".$feat->display_label); + return; + } + + my $proj_slice = $segments[0]->to_Slice; + + if($feat_slice->length != $proj_slice->length){ + $self->log("Failed to project feature to comparable length region:\t".$feat->display_label); + return; + } + + + # everything looks fine, so adjust the coords of the feature + $feat->start($proj_slice->start); + $feat->end($proj_slice->end); + $feat->strand($proj_slice->strand); + my $slice_new_asm = $self->slice_adaptor->fetch_by_region('chromosome', $proj_slice->seq_region_name, undef, undef, undef, $new_assembly); + $feat->slice($slice_new_asm); + + return $feat; + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Bed.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Bed.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,500 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Bed +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::Bed + +=head1 SYNOPSIS + + my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Bed"; + push @INC, $parser_type; + my $imp = $class->SUPER::new(@_); + + +=head1 DESCRIPTION + +This is a definitions class which should not be instatiated directly, it +normally set by the Importer as the parent class. Bed contains meta +data and methods specific to data in bed format, to aid +parsing and importing of experimental data. + +=cut + +#Import/Parser rework +#We now have potential to use indexed DBFile and Parsers +#Importer should become BaseImporter, inherited from InputSet/Nimblegen +#Have Bed(format) importer which sets the generic Bed(format) Parser + +package Bio::EnsEMBL::Funcgen::Parsers::Bed; + +use Bio::EnsEMBL::Funcgen::Parsers::InputSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(open_file is_bed); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use File::Basename; +use strict; + +use Data::Dumper; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet); + +#To do +# Extend this so we can import features to annotated_feature +# and profiles/reads to result_feature +# This will replace individual tables for current bed das sources +# Altho' idea of separate tables for each set is not necessarily a bad one +# We would have to have extra table or fields details registration +# Easier admin i.e. drop tables +# What about partitioning? Irrelevant if we are moving to matrix +# Not easy to patch dynamically named tables? Can't assign values to user varaible from query! +# Would need to be a stored procedure +# Could have separate matrix files for result_features as we probably wouldn't want +# to do any cross set querying at this level. +# Could we also use this in the RunnableDBs? +# Simply separate those methods required by both normal bed annotated_feature import +# and RunnableDB based annotated_feature import +# Importing into result_feature requires a result_set which assumes a chip experiment +# We need to alter result_set such that it can be optionally associated with ExperimentalSets +# Or should we just go for a different table? read_feature? +# NOTE: PARITION by key may run into problems if different sets have different windows +# Would also need to modify ResultFeatureAdaptor? + +=head2 new + + Arg[0] : hash containing optional attributes: + -bed_reads => 0|1, #Set input as read alignments rather than peak calls (default is 0) + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for Bed class + Returntype : Bio::EnsEMBL::Funcgen::Parsers::Bed + Exceptions : throws if Experiment name not defined or if caller is not Importer + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + +sub new{ + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_, no_disconnect => 1); + + throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") + if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + + #This was over-riding InputSet new config + #$self->{'config'} = + # {( + # #order of these method arrays is important! + # #remove method arrays and execute serially? + # #Just move these to individual Parser register_experiment methods + # array_data => [],#['experiment'], + # probe_data => [],#["probe"], + # results_data => ["and_import"], + # norm_method => undef, + # + # #Need to make these definable? + # #have protocolfile arg and just parse tab2mage protocol section format + # #SEE NIMBLEGEN FOR EXAMPLE + # protocols => {()}, + # )}; + + $self->{'overhang_features'} = []; #Move to InputSet? + #Maybe used by other formats + + return $self; +} + + +=head2 set_config + + Example : my $self->set_config; + Description: Sets attribute dependent config + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk -remove + +=cut + + +sub set_config{ + my $self = shift; + + $self->SUPER::set_config; + #dir are not set in config to enable generic get_dir method access + #Need to define output dir if we are processing reads + + return; +} + + + +sub pre_process_file{ + my ($self, $filepath, $prepare) = @_; + + #Test file format + throw("Input file is not bed format:\t$filepath") if ! &is_bed($filepath); + + + #separate sort keys stop lexical sorting of start/end + #when faced with a non numerical seq_region_name + my $sort = ($prepare || ! $self->prepared) ? 'sort -k1,1 -k2,2n -k3,3n ' : ''; + + + if($self->input_gzipped){ + $sort .= '|' if $sort; + $self->input_file_operator("gzip -dc %s | $sort "); + } + else{ + #This is really only required for read alignments + $self->input_file_operator("$sort %s |"); + } + + + if(! defined $self->output_file && $self->input_feature_class eq 'result'){ + my ($name) = fileparse($filepath); + $name =~ s/\.gz// if $self->input_gzipped; + + if($prepare){ + #This will be filtered for seq_region_name + $self->output_file($self->get_dir('output')."/prepared.${name}.gz"); + } + else{ + #Not currently used as we use direct import + #via AnnotatedFeatures or ResultFeature Collections + + #output_file would only be used DAS read mysqlimport loading + #This could also be used by SAM/BAM so put in InputSet + + #Or do we need a file for each discrete set? + #Each import consititutes one discrete data set + #So this is okay for replicates in a result set + #But not for different cell/feature types + #Use one file + #This is imposed case in INputSet::validate_files + #We can't pipe this to gzip as we need to mysqlimport it, which does not support gzip? + #gzip -dc file.sql | mysql would work but would be much slower due ti individual inserts + #$self->output_file($self->get_dir('output')."/result_feature.${name}.txt"); + } + } + + return $filepath; +} + + +#sub parse_header{ +# my ($self, $fh) = @_; +# +# +# #This will not work on a sorted file, so would have to +# #store header and test match every line! +# #just test for track name= for now +# +# +# warn "PARSING HEADER"; +# my $nr = 0; +# +# for my $line(<$fh>){ +# $nr++; +# +# #my $nr = $fh->input_line_number();#This always return 3451? Length of file? +# #This is not yet reliable here!!!! +# #Is this because of the gzip sort? +# #So let's depend on count? +# #If we don't know when the header is supposed to finish (i.e. multi line header) +# #We will need to decrement the seek position somehow +# +# warn "INPUT LINE = $nr $line"; +# +# #exit; +# if ($nr == 1){#$INPUT_LINE_NUMBER; +# #sanity check here +# return if($line =~ /track name=/o); +# $self->log(":: WARNING ::\tBED file does not appear to have valid header. First line($nr) will be treated as data:\t$line"); +# } +# +# exit; +# +# } +# +# +# exit; +# +# return; +#} + + + +sub parse_line{ + my ($self, $line, $prepare) = @_; + + #Need to handle header here for bed is always $.? + #Also files which do not have chr prefix? i.e. Ensembl BED rather than UCSC Bed with is also half open coords + + #if ($. == 0){#$INPUT_LINE_NUMBER; + # #sanity check here + return 0 if($line =~ /track name=/o); + $line =~ s/\r*\n//o;#chump accounts for windows files + + + my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Shoudl this not be \t? + #Should we define minimum fields or microbed format with no naqme and just score? + #my ($chr, $start, $end, $score) = split/\t/o, $line;#Mikkelson hack + #Validate variables types here beofre we get a nasty error from bind_param? + + #Any more valid BED fields here? + # thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). + # thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). + # itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. + # blockCount - The number of blocks (exons) in the BED line. + # blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + # blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + + my $slice = $self->cache_slice($chr, undef, $prepare); + #prepare counts total features for RPKM + #This also filter slices for those defined + + if(! $slice){ + return 0; + } + else{ + my $sr_name = $slice->seq_region_name; + my $slice_name = $slice->name; + + + if(! $prepare){ + + $strand = $self->set_strand($strand); + $start += 1 if $self->ucsc_coords; + + # Set name dependantly on input class + my %name_param; + + if($self->input_feature_class eq 'segmentation'){ + #Expand this into pluggable config + #defined by field position against the input param name + + #Need to define ftype and analysis config outside + #of this parser anyway + + #Can we use similar set up to external parser config + #but extract actual config to separate file + + + + if(! exists $self->{user_config}{feature_sets}{$self->name}{feature_types}{$name}){ + #No need to test is valid as we have already done this + #just need to make sure we don't initialise the hash key + throw("Found segmentation BED name which is not defined in the feature_types". + " config for your feature_set:\t$name"); + } + + #warn "$name ftype is ".$self->{user_config}{feature_sets}{$self->name}{feature_types}{$name}; + $name_param{'-FEATURE_TYPE'} = $self->{user_config}{feature_sets}{$self->name}{feature_types}{$name}; + + #DISPLAY_LABEL Let this get autogenerated? + + } + else{#annotated + $name_param{'-DISPLAY_LABEL'} = $name; + } + + + $self->{_feature_params} = { + -START => $start, + -END => $end, + -STRAND => $strand, + -SLICE => $self->cache_slice($chr), + -SCORE => $score, + -FEATURE_SET => $self->data_set->product_FeatureSet, + %name_param, + }; + + $self->load_feature_and_xrefs; + } + } + + return 1; +} + + +#For the purposes of creating ResultFeature Collections +#Dependancy on creating features is overkill +#altho not critical as this is never used for display + +#Should really move this to InputSet parser +#Altho this would require an extra method call per line to parse the record + +sub parse_Features_by_Slice{ + my ($self, $slice) = @_; + + #Slice should have been checked by now in caller + if($slice->strand != 1){ + throw("Bed Parser does not support parsing features by non +ve/forward strand Slices\n". + 'This is to speed up generation of ResultFeature Collections for large sequencing data sets'); + } + + my $slice_chr = $slice->seq_region_name; + + #This method assumes that method calls will walk through a seq_region + #using adjacent slices + + #We need to maintain a feature cache, which contains all the features which over hang + #the current slice, such that we can include them in the next batch of features returned + + my @features; + my $slice_end = $slice->end; + my $slice_start = $slice->start; + my $last_slice = $self->last_slice; + my $last_slice_end = ($last_slice) ? $last_slice->end : ($slice_start - 1); + my $last_slice_name = ($last_slice) ? $last_slice->seq_region_name : $slice->seq_region_name; + my $rset_id = $self->result_set->dbID; + + if(! ($slice_start == ($last_slice_end + 1) && + ($slice->seq_region_name eq $last_slice_name))){ + #Need to reopen the file as we are doing a second pass over the same data + #This is not guaranteed to work for re-reading sets of slices + #This would also not be caught by this test + + #To be safe we need to reset the file handle from the caller context + + throw("Bed parser does not yet support parsing features from successive non-adjacent Slices\n". + "Last slice end - Next slice start:\t$last_slice_name:${last_slice_end} - ". + $slice->seq_region_name.':'.$slice_start); + } + + + #Deal with 5' overhang first + foreach my $feature(@{$self->overhang_features}){ + $feature = $feature->transfer($slice); + push @features, $feature if $feature;#This should always be true + } + + $self->{'overhang_features'} = []; #reset overhang features + my $fh = $self->file_handle; + my ($line, $feature); + my $parse = 1; + #Add counts here, or leave to Collector? + my $seen_chr = 0; + + #This currently parses the rest of the file once we have seen the data we want + + while((defined ($line = <$fh>)) && $parse){ + #This does not catch the end of the file! + + + if($self->last_line){#Deal with previous line first + $line = $self->last_line; + $self->last_line(''); + } + else{ + $line = <$fh>; + } + + #Still need to chump here in case no other fields + $line =~ s/\r*\n//o if $line;#chump accounts for windows files + + if(! $line){ + warn("Skipping empty line"); + next; + } + + #We could use a generic method to parse here + #But it is small enough and simple enough to have twice + my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Shoudl this not be \t? + + if($slice_chr eq $chr){#Skim through the file until we find the slice + $seen_chr = 1; + if($end >= $slice_start){ + + if($start <= $slice_end){#feature is on slice + + $feature = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast + ({ + start => ($start - $slice_start + 1), + end => ($end - $slice_start + 1), + strand => $strand, + scores => [$score], + result_set_id => $rset_id, + window_size => 0,#wsize + slice => $slice, + }); + push @features, $feature; + + if($end > $slice_end){ + #This will also capture last feature which may not be part of current slice + $self->overhang_features($feature); + } + } + else{#feature is past end of current slice + $parse = 0; + $self->last_line($line);#But maybe part of next slice chunk + } + } + } + elsif($seen_chr){ + #We have reached the end of the chromsome! + $self->last_line($line);#in case we are parsing slice serially + $parse = 0; + } + } + + $self->last_slice($slice); + #$self->log("Added logging of parsing (seen = $seen_chr) for memory footprinting through file", 'logmemflag'); + + return \@features; +} + +#Move these potentially generic methods to InputSet Parser for use by other Parsers + + +sub last_line{ + my ($self, $lline) = @_; + + $self->{'last_line'} = $lline if defined $lline; + return $self->{'last_line'}; + +} + +sub last_slice{ + my ($self, $lslice) = @_; + + $self->{'last_slice'} = $lslice if $lslice; + return $self->{'last_slice'}; +} + + +sub overhang_features{ + my ($self, $feature) = @_; + + push @{$self->{'overhang_features'}}, $feature if $feature; + + return $self->{'overhang_features'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/GFF.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/GFF.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,195 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::GFF +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +#Could this be based on a Generic Flat file parser? + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::GFF + +=head1 SYNOPSIS + + my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::GFF"; + push @INC, $parser_type; + my $imp = $class->SUPER::new(@_); + + +=head1 DESCRIPTION + +This is a definitions class which should not be instatiated directly, it +normally set by the Importer as the parent class. GFF contains meta +data and methods specific to data in bed format, to aid +parsing and importing of experimental data. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::GFF; + +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use strict; + + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::ExperimentalSet); + +=head2 new + + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for GFF class + Returntype : Bio::EnsEMBL::Funcgen::Parsers::GFF + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub new{ + my $caller = shift; + + my $class = ref($caller) || $caller; + + #define default fields here and pass + #We also need to be able to take custom attrs mappings + + #keys are array index of field, key are Feature paramter names + #reverse this? + #Unless we have a compound field which we name accordingly + #And then call e.g. parse_attrs + #Which will return a hash with the relevant Feature attributes + + #Is splitting this up simply going to make the parse slower due to acessor methods? + + #Pass or just set directly here? + # [attributes] [comments] + + + #Some of these may be highly redundant due to the nature of the data. + #We can hash things to lessen the impact but we're still going to be checking if exists for each one + #No way around this? Unless it is marginally faster to set a permanent type and then only check a boolean. + #Yes there is, this is the exhaustive GFF definition, we can just redefine or delete some entries dynamically to + #avoid ever considering a particular field index. + + + #Don't need any of this? Can we simply define process fields? + #This will remove the ability to define custom formats + #But then again we can only have custom format if it has ensembl compliant data + #i.e. no preprocessing has to be done before populating the feature_params hash + + #my %fields = ( +# 0 => 'fetch_slice', +# 1 => 'get_source', +# 2 => 'get_feature_type', +# 3 => '-start', + # 4 => '-end', +# 5 => '-strand',#Will most likely be , need to convert to -.+ > -1 0 1 + #6 => 'frame',#will most likely be . +# 7 => 'get_attributes', +# ); + + #We want to be able to define mappings between attributes and fields + #we're basically just dealing with display_label for annotated_feature + #e.g -display_label_format => ID+ACC + #Or maybe format of several fields and attrs + text? + #We need a separator which will not be used in the GFF attr names + #we also need to be able to differentiate + #First check standard GFF field, then check attrs + ##No no no, just have method, generate display label + #forget this for now and just use one field + + my $display_label_field = 'ID';#default + + #We still need to define the field name here as a global hash to allow this display_label_field look up. + + + my $self = $class->SUPER::new(@_);#, -fields => \%fields); + + ($display_label_field) = rearrange(['DISPLAY_LABEL_FIELD'], @_); + + #We need to define meta header method, starting with '##' + #Also need to skip comments '#' at begining or end of line + #Do we also need to skip field header? No methinks not. + + #Define result method + # $self->{'file_ext'} => 'gff';#Could use vendor here? + + #define this if we want to override the generic method in Simple + #$self->{'config'}{'results_data'} => ["and_import_gff"]; + + $self->display_label_field($display_label_field); + + + return $self; +} + + +=head2 set_config + + Example : my $self->set_config; + Description: Sets attribute dependent config + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub set_config{ + my $self = shift; + + $self->SUPER::set_config; + + #GFF specific stuff here. + + return; +} + +#Need to implement this! +sub parse_line{ + my ($self, $line) = @_; + + #return if $line ~= + + #my %fields = ( +# 0 => 'fetch_slice', +# 1 => 'get_source', +# 2 => 'get_feature_type', +# 3 => '-start', + # 4 => '-end', +# 5 => '-strand',#Will most likely be , need to convert to -.+ > -1 0 1 + #6 => 'frame',#will most likely be . +# 7 => 'get_attributes', +# ); + + + + my ($chr, $start, $end, $pid, $score) = split/\t/o, $line; + + #we need to return feature_params and seq if defined? + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/InputSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/InputSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,955 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::InputSet + +=head1 SYNOPSIS + + use vars qw(@ISA); + @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet); + + +=head1 DESCRIPTION + +This is a base class to support simple file format parsers. For simple imports the vendor is +set to the parser type i.e. the file format. The generic read_and_import_simple_data assumes +a one line per feature format, other format need there own read_and_import_format_data method, +which will need defining in the result_data config element. Features are stored either as +ResultFeature collections or AnnotatedFeatures dependan ton the input feature class. + +=cut + +# To do +# Add Parsers for BAM/SAM +# Rename to InputSet +# Handle mysqlimport for large data sets e.g. reads +# Incorporate collection code +# Implement matrix storage + +package Bio::EnsEMBL::Funcgen::Parsers::InputSet; + +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Funcgen::SegmentationFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +#use Bio::EnsEMBL::Funcgen::Utils::Helper; +use strict; + +#config stuff, move to BaseImporter? +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::Funcgen::FeatureType; + + +use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10 + +#use vars qw(@ISA); +#@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + +my %valid_types = ( + result => undef, + annotated => undef, + segmentation => undef, + ); + + +=head2 new + + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for Bed class + Returntype : Bio::EnsEMBL::Funcgen::Parsers::Simple + Exceptions : throws if caller is not Importer + Caller : Bio::EnsEMBL::Funcgen::Parsers:Simple + Status : at risk + +=cut + + +sub new{ + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + # my $config_file; + + + ($self->{'input_set_name'}, + $self->{'input_feature_class'}, + #$self->{'slices'}, + $self->{total_features}, + $self->{force}, #Is this generic enough to go in Importer? used by store_window_bins_by_Slice_Parser + $self->{dbfile_data_root}, #only appropriate for result input_feature_class + # $config_file, #User defined config hash file + ) = rearrange(['input_set_name', 'input_feature_class', + 'total_features', 'force', 'dbfile_data_root'], @_); + + + #Could potentially take fields params directly to define a custom format + #Take direct field mappings, plus special fields which needs parsing differently + #i.e. default is tab delimited, and GFF would define Attrs field as compound field and provide special parsing and field mapping + + + throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") + if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + $self->{'config'} = + {( + #can we omit these? + array_data => [],#['experiment'], + probe_data => [],#["probe"], + norm_method => undef, + #protocols => {()}, + 'results_data' => ["and_import"], + )}; + + #set up feature params + $self->{'_feature_params'} = {}; + $self->{'_dbentry_params'} = []; + + #$self->{'counts'} = {}; + #$self->{'slices'} = []; + #$self->{'seq_region_names'} = [];#Used for slice based import + + + # USER CONFIG # + #Here we need to read config based on external file + #Should do something similar to set_feature_sets + #and validate_and_store_feature_types in BaseExternalParser + #but we are using define and validate sets instead + + #BaseExternalParser and BaseImporter really need to be merged + #After we have stripped out all the array/experiment specific stuff + + #Do dev here so we are not developing more stuff in the Importer which will need migrating + #to the BaseImporter + + #if($config_file){ +# my $config; + +# $self->log("Reading config file:\t".$config_file); + +# if(! ($config = do "$config_file")){ +# throw("Couldn't parse config file:\t$config_file:\n$@") if $@; +# throw("Couldn't do config:\t$config_file\n$!") if ! defined $config; +# throw("Couldn't compile config_file:\t$config_file") if ! $config; +# } + +# #At least check it is hash +# if(ref($config) ne 'HASH'){ +# throw("Config file does not define a valid HASH:\t$config_file"); +# } +# +# $self->{user_config} = $config; +# } + + + return $self; +} + + +sub output_file{ + my ($self, $output_file) = @_; + + $self->{'output_file'} = $output_file if $output_file; + return $self->{'output_file'}; +} + +sub input_file{ + my ($self, $input_file) = @_; + + $self->{'input_file'} = $input_file if $input_file; + return $self->{'input_file'}; +} + + + +=head2 set_config + + Example : my $self->set_config; + Description: Sets attribute dependent config + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub set_config{ + my $self = shift; + + #Move all this to new when we fix the inheritance in Importer + + #We could set input_set_name to experiment name + #But we would have to make warning in define_and_validate_sets mention -input_set_name + + throw('Must provide an -input_set name for a '.uc($self->vendor).' import') if ! defined $self->input_set_name(); + + #Mandatory checks + if(! defined $self->feature_analysis){ + throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports'); + } + + + if(! exists $valid_types{$self->input_feature_class}){ + throw("You must define a valid input_feature_class:\t". + join(', ', keys %valid_types)); + } + + $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature'; + + + #We need to undef norm method as it has been set to the env var + $self->{'config'}{'norm_method'} = undef; + + #dirs are not set in config to enable generic get_dir method access + $self->{dbfile_data_root} ||= $self->get_dir('output');#Required for Collector + + #some convenience methods + my $adaptor_method = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor'; + $self->{'feature_adaptor'} = $self->db->$adaptor_method; + $self->{'dbentry_adaptor'} = $self->db->get_DBEntryAdaptor; + $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor; + ##$self->{'slice_adaptor'} = $self->db->dnadb->get_SliceAdaptor; + + #Validate slices + $self->slices($self->{'slices'}) if defined $self->{'slices'}; + + #Move to new when we sort out inheritance + $self->validate_and_store_config([$self->name]); + #Could use input_set_name here? + #This was to support >1 input set per experiment (name) + + #This current breaks for no config imports + #i.e. standard Bed import e.g. result_feature collections + #segmentation imports use Bed and config + #allow no config imports in BaseImporter? + #or ultimately set the params as part of the user_config? + + return; +} + + +sub define_sets{ + my ($self) = @_; + + my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name); + + if(! defined $eset){ + $eset = Bio::EnsEMBL::Funcgen::InputSet->new + ( + -name => $self->input_set_name(), + -experiment => $self->experiment(), + -feature_type => $self->feature_type(), + -cell_type => $self->cell_type(), + -vendor => $self->vendor(), + -format => $self->format(), + -analysis => $self->feature_analysis, + -feature_class => $self->input_feature_class, + ); + ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)}; + } + + #Use define_and_validate with fetch/append as we may have a pre-existing set + #This now needs to handle ResultSets based on InputSets + + + my $dset = $self->define_and_validate_sets + ( + -dbadaptor => $self->db, + -name => $self->input_set_name,#or name? + -feature_type => $self->feature_type, + -cell_type => $self->cell_type, + -analysis => $self->feature_analysis, + -feature_class=> $self->input_feature_class, + -description => $self->feature_set_description, + #-append => 1,#Omit append to ensure we only have this eset + -recovery => $self->recovery, + -supporting_sets => [$eset], + -slices => $self->slices, + #Can't set rollback here, as we don't know until we've validated the files + #Can't validate the files until we have the sets. + #So we're doing this manually in validate_files + ); + + #We are now using IMPORTED to define wheather a FeatureSet has been imported succesfully + #However we already have IMPORTED on the InputSubSet + #We should add it to FeatureSet to remain consistent. + #See Helper::define_and_validate_sets for more notes on + #potential problems with FeatureSet IMPORTED status + + + #define_and_validate_sets should also replicate ResultSets? + #Questionable, mapped reads are never normalised across replicates + #There are generally used as input for peak calling individually. + #So files in this instance are expected to be separate parts of the same replicate + #e.g. different chromosomes + #Force one input file? + #What if we want to link several assays(feature/cell_types) to the same experiment? + + $self->{'_data_set'} = $dset; + + return $dset; +} + + + + +#we have rollback functionality incorporated here + +sub validate_files{ + my ($self, $prepare) = @_; + + #Get file + if (! @{$self->result_files()}) { + my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.';#.lc($self->vendor);#could use vendor here? Actually need suffix attr + my @rfiles = `$list`; + $self->result_files(\@rfiles); + } + + #We don't yet support multiple files + if(scalar(@{$self->result_files()}) >1){ + warn('Found more than one '.$self->vendor." file:\n". + join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files(e.g. replicates).". + " We need to resolve how we are going handle replicates with random cluster IDs"); + #do we even need to? + } + + #Here were are tracking the import of individual files by adding them as InputSubSets + #Recovery would never know what to delete + #So would need to delete all, Hence no point in setting status? + #We do not rollback IMPORTED data here. This is done via separate scripts + #To reduce the rick of accidentally deleting/overwriting data by leaving a stry -rollback + #flag in the run script + + ### VALIDATE FILES ### + #We need validate all the files first, so the import doesn't fall over half way through + #Or if we come across a rollback halfway through + my (%new_data, $eset); + my $dset = $self->data_set; + + if((scalar(@{$self->slices}) > 1) && + ! $prepare){ + throw('Validate files does not yet support multiple Slice rollback'); + } + + #This all assumes that there is only ever 1 InputSet + + if($self->input_feature_class eq 'result'){ + $eset = $dset->get_supporting_sets->[0]->get_InputSets->[0]; + } + else{#annotated/segmentation + $eset = $dset->get_supporting_sets->[0]; + } + + + #IMPORTED status here may prevent + #futher slice based imports + #so we have wait to set this until we know all the slices + #are loaded, unless we store slice based IMPORTED states + #We currently get around this be never settign IMPORTED for slice based jobs + #and always rolling back by slice before import + + #This loop supports multiple files + my (@rollback_sets, %file_paths); + my $auto_rollback = ($self->rollback) ? 0 : 1; + + foreach my $filepath( @{$self->result_files} ) { + my ($filename, $sub_set); + chomp $filepath; + ($filename = $filepath) =~ s/.*\///; + $file_paths{$filename} = $filepath; + $filename =~ s/^prepared\.// if $self->prepared; #reset filename to that originally used to create the Inputsubsets + + $self->log('Validating '.$self->vendor." file:\t$filename"); + throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath);#Can deal with links + + if( $sub_set = $eset->get_subset_by_name($filename) ){ + #IMPORTED status here is just for the file + #Any changes to analysis or coord_system should result in different InputSubset(file) + #Will only ever be imported into one Feature|ResultSet + + #Currently conflating recover_unimported and rollback + #as they serve the same purpose until we implement InputSubset level recovery + + + if( $sub_set->has_status('IMPORTED') ){ + $new_data{$filepath} = 0; + $self->log("Found previously IMPORTED InputSubset:\t$filename"); + } + else{ + $self->log("Found existing InputSubset without IMPORTED status:\t$filename"); + push @rollback_sets, $sub_set; + } + } + else{ + $self->log("Found new InputSubset:\t${filename}"); + throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared; + $new_data{$filepath} = 1; + $sub_set = $eset->add_new_subset($filename); + $self->input_set_adaptor->store_InputSubsets([$sub_set]); + } + } + + + #Does -recover allow a single extra new file to be added to an existing InputSet? + + if(@rollback_sets && #recoverable sets i.e. exists but not IMPORTED + ( (! $self->recovery) && (! $self->rollback) ) ){ + throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n". + "You must specify -recover or -rollback to perform a full rollback"); + + if($self->recovery){ + #Change these to logger->warn + $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined"); + $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n"); + } + } + + + if($self->rollback){ + #Check we have all existing InputSubsets files before we do full rollback + #Can probably remove this if we support InputSubset(file/slice) level rollback + $self->log('Rolling back all InputSubsets'); + @rollback_sets = @{$eset->get_InputSubsets}; + + foreach my $isset(@rollback_sets){ + + if(! exists $file_paths{$isset->name}){ + throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name. + "\nAborting rollback as data will be lost. Please specifying all existing InputSubset file names"); + } + } + } + + + foreach my $esset(@rollback_sets){ + #This needs to be mapped to the specified filepaths + my $fp_key = $esset->name; + $fp_key = 'prepared.'.$fp_key if $self->prepared; + + $new_data{$file_paths{$fp_key}} = 1; + $self->log("Revoking states for InputSubset:\t\t\t".$esset->name); + $eset->adaptor->revoke_states($esset); + + if(! $prepare){ + #This was to avoid redundant rollback in prepare step + $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name); + + if($self->input_feature_class eq 'result'){ + #Can we do this by slice for parallelisation? + #This will only ever be a single ResultSet due to Helper::define_and_validate_sets + #flags are rollback_results and force(as this won't be a direct input to the product feature set) + $self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1); + #Do no have rollback_InputSet here as we may have parallel Slice based imports running + } + else{#annotated/segmentation + $self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]); + $self->rollback_InputSet($eset); + last; + } + } + } + + return \%new_data; +} + + + + +sub set_feature_separator{ + my ($self, $separator) = @_; + + #How do we test if something undefined was passed? + #Rather than nothing passed at all? + #Can't do this as this is the accessor + #Need to split method + + throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') ); + + $self->{'_feature_separator'} = $separator; + +} + +# SIMPLE ACCESSORS +# Some of these can be called for each record +# Trim the access time as much as possible + +sub input_feature_class{ return $_[0]->{'input_feature_class'}; } + +sub input_set_name{ return $_[0]->{'input_set_name'}; } #Set in new + +sub feature_adaptor{ return $_[0]->{'feature_adaptor'}; } + +sub dbentry_adaptor{ return $_[0]->{'dbentry_adaptor'}; } + +sub input_set_adaptor{ return $_[0]->{'input_set_adaptor'}; } + +sub set{ return $_[0]->{'set'}; } #Feature or Result, set in define_sets + +##sub slice_adaptor{ return $_[0]->{'slice_adaptor'}; } + +sub data_set{ return $_[0]->{'_data_set'}; } + +sub feature_separator{ return $_[0]->{'_feature_separator'}; } + +sub feature_params{ return $_[0]->{'_feature_params'}; } + +sub dbentry_params{ return $_[0]->{'_dbentry_params'}; } + +sub input_gzipped{ return $_[0]->{'input_gzipped'}; } + + +sub input_file_operator{ + my ($self, $op) = @_; + #Should be set in format parser + $self->{'input_file_operator'} = $op if defined $op; + + return $self->{'input_file_operator'}; +} + +# prepare boolean, simply stores the sets and preprocesses the file +# so we don't get each batch job trying to sort etc + + +#Still need to implement prepare in other Parsers!! + +sub read_and_import_data{ + my ($self, $prepare) = @_; + + my $action = ($prepare) ? 'preparing' : 'importing'; + + $self->log("Reading and $action ".$self->vendor()." data"); + my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines); + + if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){ + throw('prepare mode is only currently implemented for the Bed parser'); + } + + + #Test for conflicting run modes + if($prepare && + ($self->batch_job || $self->prepared)){ + #prepare should be called once by the runner, not in each batch_job + #don't prepare if already prepared + throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job'); + } + + my $dset = $self->define_sets; + + #We also need to account for bsub'd slice based import + #seq alignments loaded into a ResultSet + #Cannot have 0 window for ChIP Seq alignments + #As this would mean storing all the individual reads + #Hence we need to remap to a new assm before we import! + + if($self->input_feature_class eq 'result'){ + $output_set = $dset->get_supporting_sets->[0]; + $eset = $output_set->get_InputSets->[0]; + $self->result_set($output_set);#required for ResultFeature Collector and Bed Parser + } + else{#annotated/segmentation + $output_set = $dset->product_FeatureSet; + $eset = $dset->get_supporting_sets->[0]; + } + + + #If we can do these the other way araound we can get define_sets to rollback the FeatureSet + #Cyclical dependency for the sets :| + my $new_data = $self->validate_files($prepare); + my $seen_new_data = 0; + + + ### READ AND IMPORT FILES ### + foreach my $filepath(@{$self->result_files()}) { + chomp $filepath; + + ($filename = $filepath) =~ s/.*\///; + $self->input_file($filepath); #This is only used by Collector::ResultFeature::reinitialise_input method + + if($new_data->{$filepath} ){ #This will currently autovivify! + $seen_new_data = 1; + $self->{'input_gzipped'} = &is_gzipped($filepath); + + $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file'); + + $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath); + + #We need to be able to optional open pipe to gzip | sort here + #i.e. define open command + $fh = open_file($filepath, $self->input_file_operator); + + #This my become way too large for some reads files + #Currently no problems + #This is not working as we are sorting the file! + #$self->parse_header($fh) if $self->can('parse_header'); + + #For result features some times we want to run + #locally and just sort without dumping + #i.e if we are not a batch job + #as there is no need to dump if it is a single process + + + #Should this be prepared? + + if((($self->input_feature_class eq 'result') && ! $prepare)){ + #(($self->input_feature_class eq 'result') && (! $self->batch_job))){ #Local run on just 1 chr + # + + #Use the ResultFeature Collector here + #Omiting the 0 wsize + #How are we going to omit 0 wsize when doing the fetch? + #simply check table name in ResultSet? + + #Should we do this for multiple chrs? + #or fail here + # we need to pass self + #for access to get_Features_by_Slice + #which should be in the specific parser e.g Bed + + #Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice? + #Could really do with separating the pure file parsers from the importer code, so these can be reused + #by other code. Then simply use Bed import parser for specific import functions and as wrapper to + #Bed file parser + #So should really have + #Parsers::File::Bed + #and + #Parsers::Import::Bed + #This can probably wait until we update BioPerl and just grab the Bed parser from there? + + my $slices = $self->slices; + + #Should this be caught in new? + if(! @$slices){ + throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name); + } + + + if(scalar(@$slices) > 1){ + throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n" + ."Please submit these to the farm as single slice jobs"); + } + + #restrict to just 1 slice as we don't yet support disk seeking + #if the slices are not in the same order as they appear in the file + #also we want to parallelise this + + #Set as attr for parse_Features_by_Slice in format sepcific Parsers + + + $self->file_handle(open_file($filepath, $self->input_file_operator)); + + + foreach my $slice(@$slices){ + $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self, + ( + #Force needs reimplementing here? + -force => $self->{force}, + -dbfile_data_root => $self->{dbfile_data_root}, + )); + } + + warn "Need to update InputSubset status to IMPORTED after all slices have been loaded"; + #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures? + + + + warn "Closing $filename\nDisregard the following 'Broken pipe' warning"; + + #Closing the read end of a pipe before the process writing to it at the other end + #is done writing results in the writer receiving a SIGPIPE. If the other end can't + #handle that, be sure to read all the data before closing the pipe. + #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file? + #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead? + #Or maybe an eval will catch the STDERR better? + #sub catch_SIGPIPE{ + # my $sig = shift @_; + # print " Caught SIGPIPE: $sig $1 \n"; + # return; + # + #} + #$SIG{PIPE} = \&catch_SIGPIPE; + #my $retval = eval { no warnings 'all'; $fh->close }; + #if($@){ + # warn "after eval with error $@\nretval is $retval"; + #} + #Neither of these catch gzip: stdout: Broken pipe + + #IO::UnCompress::Gunzip? + + + $fh->close; + } + else{ + + + #Revoke FeatureSet IMPORTED state here incase we fail halfway through + $output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare)); + + #What about IMPORTED_"CSVERSION" + #This may leave us with an incomplete import which still has + #an IMPORTED_CSVERSION state + #We need to depend on IMPORTED for completeness of set + #DAS currently only uses IMPORTED_CSVERSION + #This is okayish but we also need to write HCs for any sets + #which do not have IMPORTED state! + my ($line, @outlines, $out_fh); + + + if($prepare && ! $self->batch_job){ + #Assume we want gzipped output + #filename is actull based on input, so may not have gz in file name + $out_fh = open_file($self->output_file, "| gzip -c > %s"); + } + + + while(defined ($line=<$fh>)){ + #Generic line processing + #Move these to parse_line? + $line =~ s/\r*\n//o; + next if $line =~ /^\#/; + next if $line =~ /^$/; + + #This has now been simplified to process_line method + #my @fields = split/\t/o, $line; + #start building parameters hash + #foreach my $field_index(@{$self->get_field_indices}){ + # my $field = $self->get_field_by_index($field_index); + # $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]); + # } + + + #We also need to enable different parse line methods if we have different file + #e.g. cisRED + #Code refs? + + + if($self->parse_line($line, $prepare)){ + $self->count('total parsed lines'); + + #Cache or print to sorted file + if($prepare && ! $self->batch_job){ + + if(scalar(@outlines) >1000){ + print $out_fh join("\n", @outlines)."\n"; + @outlines = (); + } + else{ + push @outlines, $line; + } + } + } + } + + close($fh); + + #Print last of sorted file + if($prepare && ! $self->batch_job){ + print $out_fh join("\n", @outlines)."\n"; + close($out_fh); + @outlines = (); + } + + if(! $prepare){ + #Now we need to deal with anything left in the read cache + $self->process_params if $self->can('process_params'); + + #To speed things up we may need to also do file based import here with WRITE lock? + #mysqlimport will write lock the table by default? + + #reset filename to that originally used to create the Inputsubsets + $filename =~ s/^prepared\.// if $self->prepared; + + my $sub_set = $eset->get_subset_by_name($filename); + $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job; + } + } + + + if($prepare){ + $self->log("Finished preparing import from:\t$filepath"); + } + else{ + #Need to tweak this for slice based import + $self->log('Finished importing '.$self->counts('features').' '. + $output_set->name." features from:\t$filepath"); + + } + + + #This currently fails here if the uncaught file sort was not successful + + foreach my $key (keys %{$self->counts}){ + $self->log("Count $key:\t".$self->counts($key)); + } + } + } + + #Here we should set IMPORTED on the FeatureSet + #We could also log the first dbID of each feature in a subset to facilitate subset rollback + #in feature table + #this would be sketchy at best + #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id + #This may already have IMPORTED status as we don't revoke the status whilst + #updating to protect the feature set due to lack of supportingset tracking + #see Helper::defined_and_validate_sets for more notes. + #Is there any point in setting it if we don't revoke it? + #To allow consistent status handling across sets. Just need to be aware of fset status caveat. + #Also currently happens with ResultFeatures loaded by slice jobs, as this may already be set by a parallel job + + if(! $prepare){ + $output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job; + $self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data}; + $self->log("Finished parsing and importing results"); + } + + return; +} + + +#Should be called from format parser e.g. BED, GFF, eQTL etc +#Why don't we pass feature_params and dbentry_params directly? + +sub load_feature_and_xrefs{ + my $self = shift; + + #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n"; + #This now only fails once on the first run and then + #Need to count based on feature_type? + + #new rather than new fast here as we want to validate the import + my $feature = $self->{feature_class}->new(%{$self->feature_params}); + ($feature) = @{$self->feature_adaptor->store($feature)}; + $self->count('features'); + + #Add count based on FeatureType, should be ftype name and analysis to reflect unique ftype key? + + + + ##This needs to be handled in caller as we are validating loci? + #if($self->ucsc_coords){ + # $start += 1; + # $end += 1; + # } + + #This needs to be put in a separate sub and called by the caller + #if(! $self->cache_slice($chr)){ + # warn "Skipping AnnotatedFeature import, cound non standard chromosome: $chr"; + #} + #else{ + #grab seq if dump fasta and available + #my $seq; + #if(exists $self->feature_params->{'sequence'}){ + # $seq = $self->feature_params->{'sequence'}; + # delete $self->feature_params->{'sequence'}; + # } + # else{ + # $self->log('No fasta sequence available for '.$self->feature_params->display_label); + # } + # } + + #dump fasta here + #if ($self->dump_fasta){ + # $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n"; + # } + + #Store the xrefs + + foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){ + my $ftype = $dbentry_hash->{feature_type}; + delete $dbentry_hash->{feature_type}; + + my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash}); + $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is ignore release flag + #count here? no count in caller + } + + + #Clean data cache + $self->{'_feature_params'} = {}; + $self->{'_dbentry_params'} = []; + + return $feature; +} + +#This should really be handled in Bio::EnsEMBL::Feature? +#Move to Helper? + +sub set_strand{ + my ($self, $strand) = @_; + + my $ens_strand = 0; + + my %strand_vals = ( + '1' => 1, + '0' => 0, + '-1' => -1, + '+' => 1, + '-' => -1, + '.' => 0, + ); + + if($strand){ + + if(exists $strand_vals{$strand}){ + $ens_strand = $strand_vals{$strand}; + } + else{ + throw("Could not identify strand value for $strand"); + } + } + + return $ens_strand; +} + +sub total_features{ + my ($self, $total) = @_; + + $self->{'total_features'} = $total if defined $total; + return $self->{'total_features'}; +} + +#Currently only required for Bed::parse_Features_by_Slice + +#filehandle + + +sub file_handle{ + my ($self, $fh) = @_; + + $self->{'file_handle'} = $fh if defined $fh; + return $self->{'file_handle'}; +} + +sub result_set{ + my ($self, $rset) = @_; + + #already tested/created by self + + $self->{'result_set'} = $rset if $rset; + return $self->{'result_set'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/MAGE.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/MAGE.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1038 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::MAGE + +=head1 SYNOPSIS + +my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); +$imp->register_experiment(); + + +=head1 DESCRIPTION + +B is a base main class for all MAGE type array importers(e.g. Nimblegen). + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Parsers::MAGE; + +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd); +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::MAGE::XMLUtils; + + +use File::Path; +use strict; +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + + + +################################################################################ + +=head2 new + + Description : Constructor method + + Arg [1] : hash containing optional attributes: + + ReturnType : Bio::EnsEMBL::Funcgen::MAGE + Example : my $Exp = Bio::EnsEMBL::Nimblegen->new(%params); + Exceptions : throws if mandatory params are not set or DB connect fails + Caller : General + Status : Medium - potential for %params names to change, remove %attrdata? + +=cut + +################################################################################ + +sub new{ + my ($caller) = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + #This needs to be an Importer! + throw("This is base class for the experiment Bio::EnsEMBL::Funcgen::Parsers, needs to inherit from Bio::EnsEMBL::Funcgen::Importer") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + + #Are we not passing any Helper params? + #Log file etc is set in the run script + + my ($write_mage, $no_mage, $vendor) + = rearrange(['WRITE_MAGE', 'NO_MAGE', 'VENDOR'], @_); + + + #$self->{'update_xml'} = $update_xml || 0; + $self->{'write_mage'} = $write_mage || 0; + $self->{'no_mage'} = $no_mage || 0; + $self->{'vendor'} = $vendor; + + + if ($self->vendor ne 'NIMBLEGEN'){ + $self->{'no_mage'} = 1; + warn "Hardcoding no_mage for non-NIMBLEGEN imports"; + } + + + if($self->{'no_mage'} && $self->{'write_mage'}){ + throw('-no_mage and -write_mage options are mutually exclusive, please select just one'); + } + + return $self; +} + +=head2 process_experiment_config + + Example : $self->init_experiment_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register exeriment + +=cut + +#This is actually processing the tab2mage file & writing XML + +sub process_experiment_config{ + my $self = shift; + + #Here, this is where we need to call the a Parser from Importer to do this for only MAGE experiments + #validate_import? + + #This is only used for the first test below. + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + my $xml = $exp_adaptor->fetch_mage_xml_by_experiment_name($self->name);# if $self->{'write_xml'}; + + #DO NOT CHANGE THIS LOGIC! + #write mage if we specify or we don't have a the final xml or the template + #recovery is turned on to stop exiting when previously stored chips are found from the 'write_mage' run. + #This does mean that if you import without running the write_mage step + #you could potentially be overwriting someone elses experiment info + #No way of getting around this, need to make warning obvious, add to end of log!!! + #We always want to write and update xml and ResultSets if we're running the 2nd stage of the import + #Why would we ever want to skip the validate process? + #Leave for now as this is working as we want it + #But propose to remove skip functionality + + if( ! $self->{'no_mage'}){ + + if($self->{'write_mage'} || !( -f $self->get_config('tab2mage_file') || $xml)){ + $self->{'write_mage'} = 1; + $self->backup_file($self->get_config('tab2mage_file')); + } + #elsif($xml && (! $self->{'update_xml'})){#Changed this so we always update + #elsif(! $self->{'update_xml'}){ + + + + #Here, we need to always update_xml + #If we are doing the 2nd stage + #Currently this is skipping as we haven't explicitly set it + #To remove this... + #what we need to do is check that we don't test for update_xml, + # i.e. assuming that we're running the second stage of the import. + # Therefore we need a boolean to set whether it is the first stage..else update_xml implicit + # write mage is explicit flag + # Or if we have not tab2mage file? + # we can then override this explicitly with update_xml? + # WE're never likely edit the xml directly, so we always want to validate and update + # so write mage flag become update_experiment? No this is no obvious behaviour + # We need to warn about removing the write_mage flag after we have updated it + # Otherwise we will never get to 2nd stage + + + #No mage is still valid as we may want to jus import and experiment + #Before receiving correct meta data + #When we can then rerun the import with -write_mage to update the resultsets + + # $self->{'recover'} = 1; + # $self->{'skip_validate'} = 1; + #} + elsif( -f $self->get_config('tab2mage_file')){#Run Tab2Mage + + $self->backup_file($self->get_config('mage_xml_file')); + my $cmd = 'tab2mage.pl -e '.$self->get_config('tab2mage_file').' -k -t '.$self->get_dir('output'). + ' -c -d '.$self->get_dir('results'); + + $self->log('Reading tab2mage file'); + my $t2m_exit_code = run_system_cmd($cmd, 1);#no exit flag due to non-zero exit codes + warn "tab2mage exit code is $t2m_exit_code"; + + if(! ($t2m_exit_code > -1) && ($t2m_exit_code <255)){ + throw("tab2mage failed. Please check and correct:\t".$self->get_config('tab2mage_file')."\n...and try again"); + } + + $self->{'recover'} = 1; + } + } + + return; +} + +=heead init_tab2mage_export + + Example : $self->init_tab2mage_export; + Description: Writes the standard experiment section of the tab2mage file + Returntype : FileHandle + Exceptions : ??? + Caller : general + Status : at risk + +=cut + +sub init_tab2mage_export{ + my $self = shift; + + $self->backup_file($self->get_config('tab2mage_file')) if(-f $self->get_config('tab2mage_file')); + + my $t2m_file = open_file($self->get_config('tab2mage_file'), '>'); + + #reformat this + my $exp_section = "experiment section\ndomain\t".(split/@/, $self->contact())[1]."\naccession\t\n". + "quality_control\tbiological_replicate\nexperiment_design_type\tbinding_site_identification\n". + "name\t".$self->name()."\nrelease_date\t\nsubmission_date\t\nsubmitter\t???\n". + "submitter_email\t???\ninvestigator\t???\ninvestigator_email\t???\norganization\t???\naddress\t". + "???\npublication_title\t\nauthors\t\njournal\t\nvolume\t\nissue\t\npages\t\nyear\t\npubmed_id\t\n"; + + my $protocol_section = "Protocol section\naccession\tname\ttext\tparameters\n"; + + foreach my $protocol(sort (keys %{$self->get_config('protocols')})){ + $protocol_section .= $self->get_config('protocols')->{$protocol}->{'accession'}. + "\t".$self->get_config('protocols')->{$protocol}->{'name'}. + "\t".$self->get_config('protocols')->{$protocol}->{'text'}."\t"; + + $protocol_section .= (defined $self->get_config('protocols')->{$protocol}->{'parameters'}) ? + $self->get_config('protocols')->{$protocol}->{'parameters'}."\t\n" : "\t\n"; + } + + #File[raw] Array[accession] Array[serial] Protocol[grow] Protocol[treatment] Protocol[extraction] Protocol[labeling] Protocol[hybridization] Protocol[scanning] BioSource Sample Extract LabeledExtract Immunoprecipitate Hybridization BioSourceMaterial SampleMaterial ExtractMaterial LabeledExtractMaterial Dye BioMaterialCharacteristics[Organism] BioMaterialCharacteristics[BioSourceType] BioMaterialCharacteristics[StrainOrLine] BioMaterialCharacteristics[CellType] BioMaterialCharacteristics[Sex] FactorValue[StrainOrLine] FactorValue[Immunoprecipitate] + + + #Need to do this bit better? + #have array of fields. We can then populate a hash in the read method based on field names, then use the array to print in order + + my $hyb_header = "\nHybridization section\n".join("\t", @{$self->hybridisation_fields()}); + + print $t2m_file $exp_section."\n".$protocol_section."\n".$hyb_header."\n"; + + return $t2m_file; +} + + +#Move to MAGE package? + +sub hybridisation_fields{ + my $self = shift; + + return ['File[raw]', 'Array[accession]', 'Array[serial]', + (map 'Protocol['.$_.']', (sort (keys %{$self->get_config('protocols')}))), + 'BioSource', 'Sample', 'Extract', 'LabeledExtract', 'Immunoprecipitate', 'Hybridization', + 'BioSourceMaterial', 'SampleMaterial', 'ExtractMaterial', 'LabeledExtractMaterial', + 'Dye', 'BioMaterialCharacteristics[Organism]', 'BioMaterialCharacteristics[BioSourceType]', + 'BioMaterialCharacteristics[StrainOrLine]', 'BioMaterialCharacteristics[CellType]', + 'BioMaterialCharacteristics[Sex]', 'FactorValue[StrainOrLine]', 'FactorValue[Immunoprecipitate]']; +} + + + +#=head2 register_experiment +# +# Example : $imp->register_experiment() +# Description: General control method, performs all data import and normalisations +# Arg [1] : optional - dnadb DBAdaptor +# Returntype : none +# Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor +# Caller : general +# Status : Medium +# +#=cut + +#write/validate_mage + +sub write_validate_experiment_config{ + my $self = shift; + + + if($self->{'write_mage'} || $self->{'no_mage'}){ + $self->read_data("array"); + + if(! $self->{'no_mage'}){ + $self->log("PLEASE CHECK AND EDIT AUTOGENERATED TAB2MAGE FILE:\t".$self->get_config('tab2mage_file')); + #we could make this print only if it was set by the user, not by the Importer + $self->log('REMEMBER TO REMOVE -write_mage FLAG BEFORE UPDATING'); + exit; + } + } + elsif(! $self->{'no_mage'}){#This should be a no_channel flag, set dependent on import mode(gff_chip, gff_chan) + #Need to accomodate chip level imports in validate? + + if (! $self->{'skip_validate'}){ + + $self->log("Validating mage file:\t".$self->get_config('mage_xml_file')); + + + #Updating ResultSets: + #Given that we might want to add a chip to an experiment we will also need to update the tab2MAGE + #mage_xml and ResultSets accordingly. + + #This should happen if we specify update_xml + #Should recovery also always force update? + #Considering the two run modes, write tab2mage & validate and import + #There is a subtle difference between recovery and update mage_xml + #Do we always run in recovery mode for the validate&import step? + #Yes we do, so can't guarantee the this means we want to update. + #So we need to change update_xml to update to reflect the changed functionality on ResultSets + + #If we run an update without on then chips will be loaded but xml and ResultSets will not be altered :( + #If we're running the 2nd stage we should always be updating the xml anyway!!!! + #As there is no reason to rerun the validate & import step without it.(unless we're debugging maybe) + #So why should we ever run without it? + + #To update ResultSets we validate as normal and then update where appropriate + #What has precedence? Replicate name? + #Update echip types as appropriate + #What if this invalidates original rsets? + #Then list sets not covered for removal by script? + + + + my (%echips, @log); + my $rset_adaptor = $self->db->get_ResultSetAdaptor; + my $chan_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name('RawValue'); + + #need to change this to default analysis + #There we issues with setting VSN_GLOG as an env var + #as this is tested for and the norm was skipped for some reason? + my $chip_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method()); + + #Try import sets like this first, so we know ther is new data + my $chan_rset = $self->get_import_ResultSet($chan_anal, 'channel'); + my $rset = $self->get_import_ResultSet($chip_anal, 'experimental_chip'); + + + #Else get them anyway and log + if(! $rset){ + + if($chan_rset){ + $self->log('Identified partial Channel only import, updating MAGE-XML'); + } + else{ + ($chan_rset) = @{$rset_adaptor->fetch_all_by_name_Analysis($self->experiment->name.'_IMPORT', $chan_anal)}; + #Don't need to test for >1 here as this has already been done in get_import_ResultSet + $self->log('All ExperimentalChips imported, updating MAGE-XML only'); + } + + ($rset) = @{$rset_adaptor->fetch_all_by_name_Analysis($self->experiment->name.'_IMPORT', $chip_anal)}; + } + + + #This will never happen now due to the change tab2mage rules in init_experiment + #Remove? + if(! $rset){ + throw('Cannot find ResultSet, are you trying to import a new experiment which already has a tab2mage file present? Try removing the file, or specifying the -write_mage flag to parse_and_import.pl'); + } + + if(! -l $self->get_dir('output').'/MAGE-ML.dtd'){ + system('ln -s '.$ENV{'EFG_DATA'}.'/MAGE-ML.dtd '.$self->get_dir('output').'/MAGE-ML.dtd') == 0 || + throw('Failed to link MAGE-ML.dtd'); + } + + + $self->log('VALIDATING MAGE XML'); + my $reader = Bio::MAGE::XML::Reader->new(); + my $mage_xml ||= $self->get_config('mage_xml_file'); + $self->{'mage'} = $reader->read($mage_xml); + + #this should only ever return 1 for an import + foreach my $mage_exp(@{$self->{'mage'}->getExperiment_package->getExperiment_list()}){ + + if($mage_exp->getName() ne $self->name()){ + $self->log('MAGE experiment name ('.$mage_exp->getName().') does not match import name ('.$self->name().')'); + } + + #add more experiment level validation here? + + foreach my $assay (@{$mage_exp->getBioAssays()}){ + + if($assay->isa('Bio::MAGE::BioAssay::PhysicalBioAssay')){#channel + $self->log('Validating PhysicalBioAssay "'.$assay->getName()."'\n");#hyb name(this is the file name for measured assays + + my $bioassc = $assay->getBioAssayCreation();#This is a Hybridisation + my $array = $bioassc->getArray();#this is an ArrayChip + my $design_id = $array->getArrayDesign->getIdentifier(); + my $chip_uid = $array->getArrayIdentifier(); + + + foreach my $echip(@{$rset->get_ExperimentalChips()}){ + + + if($echip->unique_id() eq $chip_uid){ + $self->log("Found ExperimentalChip:\t".$chip_uid); + + if(! exists $echips{$chip_uid}){ + $echips{$chip_uid} = {( + total_biorep => undef, + total_biotechrep => undef, + experimental_biorep => undef, + experimental_biotechrep => undef, + total_dye => undef, + experimental_dye => undef, + cell_type => undef, + feature_type => undef, + )}; + } + + #Validate ArrayChip + my ($achip) = @{$self->db->get_ArrayChipAdaptor->fetch_all_by_ExperimentalChips([$echip])}; + + if($achip->design_id() ne $design_id){ + push @log, "ArrayDesign Identifier (${design_id}) does not match ArrayChip design ID (". + $achip->design_id().")\n\tSkipping channel and replicate validation"; + #skip the channel/replicate validation here? + } + else { #validate channels and replicate names + + foreach my $src_biomat (@{$bioassc->getSourceBioMaterialMeasurements()}) { #Channel materials(X1)? + my $biomat = $src_biomat->getBioMaterial(); #LabelledExtract (IP/Control) + #we could sub this passing $echip and biomat? + #messy to pass regexs and populate correct echip hash attrs + #also messy to populate log + #keeping nested loop also prevents further obfuscation + #do we need to do all the defined checks, or maybe just the first one? + #Then we can skip all following warning? + + foreach my $treat (@{$biomat->getTreatments()}) { + #As there is effectively one more level of material extraction for the IP channel + #this loop will returns materials an iteration out of sync for each channel + + foreach my $ssrc_biomat (@{$treat->getSourceBioMaterialMeasurements()}) { #Channel measurement(x1) + my $sbiomat = $ssrc_biomat->getBioMaterial(); + #This will either be techrep name for control of IP name for experimental channel + #SOM0035_BR1_TR2 IP #Immunoprecicpitate + #SOM0035_BR1_TR2 #Extract + + if ($sbiomat->getName() =~ /BR[0-9]+_TR[0-9]+$/o) { #Total + + if (! defined $echips{$chip_uid}{'total_biotechrep'}) { + $echips{$chip_uid}{'total_biotechrep'} = $sbiomat->getName(); + } + else{ + push @log, "Found two TOTAL Channels on same chip with biotechreps:\t".$sbiomat->getName(). + " and ".$echips{$chip_uid}{'total_biotechrep'}; + } + }else{#Experimental + + #get feature type from assay + my $fv_ref = $assay->getBioAssayFactorValues(); + if(! defined $fv_ref){ + throw('No FactorValues found, you must populate the "Immunoprecipitate" field. Maybe you forgot to specify -feature_type?'); + } + + my ($feature_type); + + foreach my $fvalue(@{$fv_ref}){ + + if($fvalue->getValue()->getCategory() eq 'Immunoprecipitate'){ + $feature_type = $fvalue->getName(); + $feature_type =~ s/anti\s*-\s*//; + $feature_type =~ s/\s*antibody\s*//; + } + } + $echips{$chip_uid}{'feature_type'} = $feature_type; + } + + foreach my $ttreat (@{$sbiomat->getTreatments()}) { + + foreach my $tsrc_biomat (@{$ttreat->getSourceBioMaterialMeasurements()}) { + my $tbiomat = $tsrc_biomat->getBioMaterial(); + #SOM0035_BR1_TR2 #Extract (exp) + #SOM0035_BR1 #Sample (total) + + if ($tbiomat->getName() =~ /BR[0-9]+_TR[0-9]+$/o) { #experimental + + if (! defined $echips{$chip_uid}{'experimental_biotechrep'}) { + $echips{$chip_uid}{'experimental_biotechrep'} = $tbiomat->getName(); + } + else{ + push @log, "Found two EXPERIMENTAL Channels on same chip with biotechreps:\t".$tbiomat->getName(). + " and ".$echips{$chip_uid}{'experimental_biotechrep'}; + } + + my $dye = $biomat->getLabels()->[0]->getName(); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq 'EXPERIMENTAL') { + + if (uc($dye) ne uc($chan->dye())) { + push @log, "EXPERIMENTAL channel dye mismatch:\tMAGE = ".uc($dye).' vs DB '.uc($chan->dye); + } else { + $echips{$chip_uid}{'experimental_dye'} = uc($dye); + } + } + } + } + else { #control + + if (! defined $echips{$chip_uid}{'total_biorep'}) { + $echips{$chip_uid}{'total_biorep'} = $tbiomat->getName(); + } + else{ + push @log, "Found two TOTAL Channels on same chip with biotechreps:\t".$tbiomat->getName(). + " and ".$echips{$chip_uid}{'total_biorep'}; + } + + my $dye = $biomat->getLabels()->[0]->getName(); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq 'TOTAL') { + + if (uc($dye) ne uc($chan->dye())) { + push @log, "TOTAL channel dye mismatch:\tMAGE = ".uc($dye).' vs DB '.uc($chan->dye); + } + else { + $echips{$chip_uid}{'total_dye'} = uc($dye); + } + } + } + } + #could do one more iteration and get Source and FeatureType? + #we should really extend this, and then update the EC cell_type and feature_types + #these features might not be biotmats tho...need to check + + + foreach my $ftreat (@{$tbiomat->getTreatments()}) { + + foreach my $fsrc_biomat (@{$ftreat->getSourceBioMaterialMeasurements()}) { + my $fbiomat = $fsrc_biomat->getBioMaterial(); + #EXPERIMENTAL - biorep + #TOTAL - source/cell type + my $cell_type; + + if($fbiomat->getName() =~ /BR[0-9]+$/o){#EXPERIMETNAL + + if(! defined $echips{$chip_uid}{'experimental_biorep'}){ + $echips{$chip_uid}{'experimental_biorep'} = $fbiomat->getName(); + } + else{ + push @log, "Found two Experimental Channels on same chip with bioreps:\t".$fbiomat->getName(). + " and ".$echips{$chip_uid}{'experimental_biorep'}; + } + + + #last treatment/measurement/biomat level should go here + #as TOTAL channel does not have another level and will fail + foreach my $xtreat (@{$fbiomat->getTreatments()}) { + + foreach my $xsrc_biomat (@{$xtreat->getSourceBioMaterialMeasurements()}) { + my $xbiomat = $xsrc_biomat->getBioMaterial(); + + foreach my $char(@{$xbiomat->getCharacteristics()}){ + $cell_type = $char->getValue() if($char->getCategory() eq 'CellType'); + } + } + } + + }else{#this should be BioSource + #which should have CellType as characteristic + #we could change tab2mage and have this as a factor value, + #but don't want to start messing with "standard" format + + foreach my $char(@{$fbiomat->getCharacteristics()}){ + $cell_type = $char->getValue() if($char->getCategory() eq 'CellType'); + } + } + + #can have cell_type validation here + if(! defined $echips{$chip_uid}{'cell_type'}){ + $echips{$chip_uid}{'cell_type'} = $cell_type; + } + elsif( $echips{$chip_uid}{'cell_type'} ne $cell_type){ + push @log, "Found Channels on same chip (${chip_uid}) with different cell types:\t". + $cell_type." and ".$echips{$chip_uid}{'cell_type'}; + } + } + } + } + } + } + } + } + } + } #end of echip + } #end of foreach echip + } #end of physbioassay + } #end of foreach assay + } #end of foreach exp + + + + #we should fail here with log before we update the result sets + + #we need to build rep names + #we're currently using sample labels, in the tab2mage file + #altho' previous sets have been using exp name + #these have been manually patched afterwards + + #More desirable to have exp name as rset name, but no way of doing BR validation + #based on sample label, if we don't have it in the tab2mage + #if we change it in the DB then we need to update the tab2mage + + #no way to do this when generating tab2mage as the user hasn't yet defined the reps + #we could just make reps based on sample labels + #then we just assume that alterations made by the user are correct + #as we can no longer validate using sample labels + #can still validate using cell/feature type + + #no longer need vendor specific validation as this will be done in tab2mage generation + + + #We need to validate reps here + #then update ec records as appropriate and then create rsets + + my (%bio_reps, %tech_reps); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + +#select rs.*, ec.*, c.* from result_set rs, chip_channel cc, channel c, experimental_chip ec where rs.result_set_id=cc.result_set_id and cc.table_name='experimental_chip' and cc.table_id=ec.experimental_chip_id and cc.table_id=c.experimental_chip_id order by name; + + foreach my $echip (@{$rset->get_ExperimentalChips()}) { + + my ($biorep, $biotechrep); + + if (! exists $echips{$echip->unique_id()}) { + push @log, "No MAGE entry found for ExperimentalChip:\t".$echip->unique_id(); + } + else { + + foreach my $chan_type('total', 'experimental'){ + + $biorep = $echips{$echip->unique_id()}{$chan_type.'_biorep'}; + $biotechrep = $echips{$echip->unique_id()}{$chan_type.'_biotechrep'}; + + if (! defined $biotechrep) { + push @log, 'ExperimentalChip('.$echip->unique_id().') Extract field do not meet naming convention(SAMPLE_BRN_TRN)'; + } #! defined biorep? will never occur at present + elsif ($biotechrep !~ /\Q$biorep\E/) { + push @log, "Found Extract(techrep) vs Sample(biorep) naming mismatch\t${biotechrep}\tvs\t$biorep"; + } + + if ( ! $echips{$echip->unique_id()}{$chan_type.'_dye'}) { + push @log, "No ".uc($chan_type)." channel found for ExperimentalChip:\t".$echip->unique_id(); + } + + } + + #Is this is really implicit in the test above + if($echips{$echip->unique_id()}{'experimental_biorep'} ne $echips{$echip->unique_id()}{'total_biorep'}){ + push @log, "Found biorep mismatch between channels of ExperimentalChip ".$echip->unique_id().":\n". + "\tEXPERIMENTAL\t".$echips{$echip->unique_id()}{'experimental_biorep'}."\tTOTAL\t". + $echips{$echip->unique_id()}{'total_biorep'}; + } + + #Is this is really implicit in the test above + if($echips{$echip->unique_id()}{'experimental_biotechrep'} ne $echips{$echip->unique_id()}{'total_biotechrep'}){ + push @log, "Found biotechrep mismatch between channels of ExperimentalChip ".$echip->unique_id().":\n". + "\tEXPERIMENTAL\t".$echips{$echip->unique_id()}{'experimental_biotechrep'}."\tTOTAL\t". + $echips{$echip->unique_id()}{'total_biotechrep'}; + } + + + } + + + #Now we need to validate ec has same feature/cell type as other ecs in this br + #this does not handle import sets which ARE allowed to have same name but different types + + #warn "Processing ".$echip->unique_id()." $biorep $biotechrep"; + + + if(exists $bio_reps{$biorep}){ + + + if(! defined $bio_reps{$biorep}{'cell_type'}){ + push @log, "Found undefined CellType for biorep $biorep"; + } + elsif($bio_reps{$biorep}{'cell_type'}->name() ne $echips{$echip->unique_id()}{'cell_type'}){ + push @log, "Found CellType mismatch between $biorep and ExperimentalChip ".$echip->unique_id(); + } + + + if(! defined $bio_reps{$biorep}{'feature_type'}){ + push @log, "Found undefined FeatureType for biorep $biorep"; + } + elsif($bio_reps{$biorep}{'feature_type'}->name() ne $echips{$echip->unique_id()}{'feature_type'}){ + push @log, "Found FeatureType mismatch between $biorep and ExperimentalChip ".$echip->unique_id(); + } + + #warn "$biorep exists with\t".$bio_reps{$biorep}{'cell_type'}->name().' '.$bio_reps{$biorep}{'feature_type'}->name(); + + #We need to set the tech rep here too! + #Do we need to validate this also, as above. + #This would be overkill due to the inherant nature of the TR to BR relationship + + if(! exists $tech_reps{$biotechrep}){ + $tech_reps{$biotechrep}{'cell_type'} = $bio_reps{$biorep}{'cell_type'}; + $tech_reps{$biotechrep}{'feature_type'} = $bio_reps{$biorep}{'feature_type'}; + } + + + }else{ + + #warn "Creating new BR $biorep and TR $biotechrep"; + + if(defined $echips{$echip->unique_id()}{'cell_type'}){ + + my $cell_type = $ct_adaptor->fetch_by_name($echips{$echip->unique_id()}{'cell_type'}); + + if(! defined $cell_type){ + push @log, 'CellType '.$echips{$echip->unique_id()}{'cell_type'}.' does not exist in the database, please use the import_type.pl script'; + }else{ + $bio_reps{$biorep}{'cell_type'} = $cell_type; + $tech_reps{$biotechrep}{'cell_type'} = $cell_type; + # warn "Setting ".$echip->unique_id()." $biorep $biotechrep ".$cell_type->name; + } + }else{ + warn "No CellType specified for ExperimentalChip:\t".$echip->unique_id()."\n"; + } + + + if(defined $echips{$echip->unique_id()}{'feature_type'}){ + my $feature_type = $ft_adaptor->fetch_by_name($echips{$echip->unique_id()}{'feature_type'}); + + if(! defined $feature_type){ + push @log, 'FeatureType '.$echips{$echip->unique_id()}{'feature_type'}.' does not exist in the database, please use the import_type.pl script'; + } + else{ + $bio_reps{$biorep}{'feature_type'} = $feature_type; + $tech_reps{$biotechrep}{'feature_type'} = $feature_type; + + #warn "Setting ".$echip->unique_id()." $biorep $biotechrep ".$feature_type->name; + } + }else{ + warn "No FeatureType specified for ExperimentalChip:\t".$echip->unique_id()."\n"; + } + } + + push @{$tech_reps{$biotechrep}{'echips'}}, $echip->unique_id(); + push @{$bio_reps{$biorep}{'echips'}}, $echip->unique_id(); + } + + + + + if (@log) { + $self->log("MAGE VALIDATION REPORT\n::\t".join("\n::\t", @log)); + throw("MAGE VALIDATION FAILED\nPlease correct tab2mage file and try again:\t".$self->get_config('tab2mage_file')); + } else { + $self->log('MAGE VALDIATION SUCCEEDED'); + } + + + #we also need to build the tech rep results sets(not displayable) + #do we need to have result sets for each biorep too? + #update ExperimentalChip replicate info + my (%rsets); + my %types = ( + feature => {}, + cell => {}, + ); + + + + #This needs to update and split the import/top level sets so they are of same types + #update ec type here as we have ec context + #careful not to update multiple times, just once for each ec + + my $eca = $self->db->get_ExperimentalChipAdaptor(); + + + foreach my $echip (@{$rset->get_ExperimentalChips()}) { + my ($cell_type, $feature_type); + + #Set biorep info and rset + foreach my $biorep (keys %bio_reps){ + + foreach my $chip_uid(@{$bio_reps{$biorep}{'echips'}}){ + + if($chip_uid eq $echip->unique_id()){ + $echip->biological_replicate($biorep); + $cell_type = $bio_reps{$biorep}{'cell_type'}; + $feature_type = $bio_reps{$biorep}{'feature_type'}; + + if(! defined $rsets{$biorep}){ + + $rsets{$biorep} = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -NAME => $biorep,#this may not be unique, prepend with exp name? Force method to use Experiment_and_name? + -ANALYSIS => $rset->analysis(), + -TABLE_NAME => 'experimental_chip', + -FEATURE_TYPE => $feature_type, + -CELL_TYPE => $cell_type, + ); + + #record cell and feature types + $types{'feature'}{$feature_type->name()} = $feature_type; + $types{'cell'}{$cell_type->name()} = $cell_type; + $self->log("Created BioRep ResultSet:\t".$rsets{$biorep}->log_label); + } + + $rsets{$biorep}->add_table_id($echip->dbID(), $rset->get_chip_channel_id($echip->dbID())); + } + } + } + + #reset echip types + $echip->feature_type($feature_type); + $echip->cell_type($cell_type); + + + #set tech rep info and rset + foreach my $techrep(keys %tech_reps){ + + foreach my $chip_uid(@{$tech_reps{$techrep}{'echips'}}){ + + if($chip_uid eq $echip->unique_id()){ + $echip->technical_replicate($techrep); + + if(! defined $rsets{$techrep}){ + $rsets{$techrep} = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -NAME => $techrep,#this may not be unique, prepend with exp name? Force method to use Experiment_and_name? + -ANALYSIS => $rset->analysis(), + -TABLE_NAME => 'experimental_chip', + -FEATURE_TYPE => $tech_reps{$techrep}{'feature_type'}, + -CELL_TYPE => $tech_reps{$techrep}{'cell_type'}, + ); + + $self->log("Created TechRep ResultSet:\t".$rsets{$techrep}->log_label); + } + $rsets{$techrep}->add_table_id($echip->dbID(), $rset->get_chip_channel_id($echip->dbID())); + } + } + } + + $echip->adaptor->update_replicate_types($echip);#store rep info + } + + + ### Reset/Update/Clean import sets type fields + my $sql; + + if(scalar keys %{$types{'feature'}} >1){ + $self->log('Resetting IMPORT FeatureType to NULL for multi-FeatureType Experiment'); + $sql = "UPDATE result_set set feature_type_id='NULL' where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + + }else{ + my ($ftype) = values %{$types{'feature'}}; + + if(! defined $rset->feature_type()){ + $self->log('Updating IMPORT FeatureType to '.$ftype->name()); + $sql = "UPDATE result_set set feature_type_id=".$ftype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + } + elsif($rset->feature_type->dbID ne $ftype->dbID()){ + $self->log('WARNING: FeatureType mismatch. Updating IMPORT FeatureType('.$rset->feature_type->name().') to match meta('.$ftype->name.')'); + $sql = "UPDATE result_set set feature_type_id=".$ftype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + + } + } + + $self->db->dbc->do($sql) if $sql; + + undef $sql; + + if(scalar keys %{$types{'cell'}} >1){ + $self->log('Resetting IMPORT CellType to NULL for multi-CellType Experiment'); + my $sql = "UPDATE result_set set cell_type_id='NULL' where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + }else{ + my ($ctype) = values %{$types{'cell'}}; + + if(! defined $rset->cell_type()){ + $self->log('Updating IMPORT CellType to '.$ctype->name()); + $sql = "UPDATE result_set set cell_type_id=".$ctype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + } + elsif($rset->cell_type->dbID ne $ctype->dbID()){ + $self->log('WARNING: CellType mismatch. Updating IMPORT CellType('.$rset->cell_type->name().') to match meta('.$ctype->name.')'); + $sql = "UPDATE result_set set cell_type_id=".$ctype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + } + } + + $self->db->dbc->do($sql) if $sql; + + ### Generate new top level sets here based on br type combos + #we risk duplicating sets here if import set is set to one cell/featuretype + #duplicate anyway, as import is really just for easy tracking of all chips during import + + my %toplevel_sets; + my $toplevel_cnt = 1; + #could tidy up toplevel_sets implmentation + + foreach my $new_rset(values %rsets){ + + my $ftype_name = (defined $new_rset->{'feature_type'}) ? $new_rset->{'feature_type'}->name() : undef; + my $ctype_name = (defined $new_rset->{'cell_type'}) ? $new_rset->{'cell_type'}->name() : undef; + + if(! exists $toplevel_sets{$ftype_name}){ + $toplevel_sets{$ftype_name} = {}; + $toplevel_sets{$ftype_name}{'feature_type'} = $new_rset->{'feature_type'}; + } + + + + if(! exists $toplevel_sets{$ftype_name}{$ctype_name}){ + $toplevel_sets{$ftype_name}{$ctype_name}{'cell_type'} = $new_rset->{'cell_type'}; + $toplevel_sets{$ftype_name}{$ctype_name}{'rsets'} = [$new_rset]; + }else{ + push @{$toplevel_sets{$ftype_name}{$ctype_name}{'rsets'}}, $new_rset; + } + } + + + + #build toplevel sets for each feature/cell type combo using constituent rsets + foreach my $ftype_name(keys %toplevel_sets){ + + foreach my $ctype_name(keys %{$toplevel_sets{$ftype_name}}){ + + next if $ctype_name eq 'feature_type';#skip feature type + + #we need to give these a different key so we're not overwriting in the rset hash + $rsets{$self->experiment->name().'_'.$toplevel_cnt} = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -NAME => $self->experiment->name(), + -ANALYSIS => $rset->analysis(), + -TABLE_NAME => 'experimental_chip', + -FEATURE_TYPE => $toplevel_sets{$ftype_name}{'feature_type'}, + -CELL_TYPE => $toplevel_sets{$ftype_name}{$ctype_name}{'cell_type'}, + ); + + $self->log("Created toplevel ResultSet for:\t". $rsets{$self->experiment->name().'_'.$toplevel_cnt}->log_label); + + #add consituent table ids + foreach my $new_rset(@{$toplevel_sets{$ftype_name}{$ctype_name}{'rsets'}}){ + + foreach my $ec_id(@{$new_rset->table_ids()}){ + + #Only add it if it has not already been added + if(! $rsets{$self->experiment->name().'_'.$toplevel_cnt}->get_chip_channel_id($ec_id)){ + $rsets{$self->experiment->name().'_'.$toplevel_cnt}->add_table_id($ec_id, $new_rset->get_chip_channel_id($ec_id)); + } + } + } + $toplevel_cnt++; + } + } + + #ResultSet update strategy + #To avoid messyness in resolving result_set differences + #Simply delete all that are not used as supporting sets + #and load new ones, log old supporting rsets for manual + #reassignment and rollback. + #If we have clash between an old set and a new set, rename old + #set and log + #We might not always have the previous data files. + #But we might want to maintain all the previous rsets and just add a new one + #At present this would require acquiring the previous Tab2Mage file + #and adding the new data to it. + #We could do with a way to merge data already in the DB with new meta data to form a new Tab2Mage file + #and validate that + + + my @previous_rep_sets; + my @supporting_rset_dsets; + + + #Get non-import Sets + map {push @previous_rep_sets, $_ if $_->name !~ /_IMPORT$/} + @{$rset_adaptor->fetch_all_by_Experiment_Analysis($self->experiment, $chip_anal)}; + + + #rollback_ResultSet if possible? + #This is just checking if they are supporting, not actually rolling them back + if(@previous_rep_sets){ + $self->log('Found previously stored ResultSets'); + + foreach my $prev_rset(@previous_rep_sets){ + #This should not rollback anything, just return skipped sets + #i.e. sets which have a product feature set + #It also used to delete the supporting set records which maybe important for redefining the DataSet below + my $rset_dset = $self->rollback_ResultSet($prev_rset); + push @supporting_rset_dsets, $rset_dset if @$rset_dset; + } + } + + #Note: If we remove chips from an experiment, they are only removed from the non-import sets + #To fully remove them, you need to use the rollback_experiment.pl script with -chip_ids + #can we log this in get_import_ResultSet? + + $self->log('Storing ResultSets'); + + #Store new tech, biol and toplevel type rsets + foreach my $new_rset(values %rsets){ + my $replace_txt; + + #Rename old set if we have a name/anal/type clash + foreach my $prs(@supporting_rset_dsets){ + + my ($pset, $dset) = @$prs; + + if($pset->log_label eq $new_rset->log_label){ + my $new_name = "OLD_".$rset->log_label; + $self->log("Found update supporting ResultSet clash, renaming to:\t${new_name}"); + $self->unlink_ResultSet_DataSet($rset, $dset, $new_name); + + #This pset dbID has already been removed + #Will get updated with new rset dbID when updating DataSet + $replace_txt = 'Proposed ResultSet(dbID) replacement for DataSet('.$dset->name."):\t".$pset->dbID.' > '; + } + } + + + $new_rset->add_status('DAS_DISPLAYABLE'); + my ($new_rset) = @{$rset_adaptor->store($new_rset)}; + + if(defined $replace_txt){ + $self->log($replace_txt.$new_rset->dbID); + } + } + + my $xml_file = open_file($self->get_config('mage_xml_file')); + + #slurp in changing separator to null so we get it all in one string. + $self->experiment->mage_xml(do{ local ($/); <$xml_file>}); + close($xml_file); + + $self->experiment($self->db->get_ExperimentAdaptor->update_mage_xml_by_Experiment($self->experiment())); + } + } + + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Nimblegen.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Nimblegen.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1221 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Nimblegen +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::Nimblegen + +=head1 SYNOPSIS + + my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Nimblegen"; + push @INC, $parser_type; + my $imp = $class->SUPER::new(@_); + + +=head1 DESCRIPTION + +This is a parser class which should not be instatiated directly, it +normally set by the Importer as the parent class. Nimblegen contains meta +data and methods specific to NimbleGen arrays to aid parsing and importing of +experimental data. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::Nimblegen; + +use Bio::EnsEMBL::Funcgen::Array; +use Bio::EnsEMBL::Funcgen::ProbeSet; +use Bio::EnsEMBL::Funcgen::Probe; +use Bio::EnsEMBL::Funcgen::ProbeFeature; +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Funcgen::ExperimentalChip; +use Bio::EnsEMBL::Funcgen::ArrayChip; +use Bio::EnsEMBL::Funcgen::Channel; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Parsers::MAGE; + +use strict; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::MAGE); + +=head2 new + + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for Nimblegen class + Returntype : Bio::EnsEMBL::Funcgen::Parser::Nimblegen + Exceptions : throws if Experiment name not defined or if caller is not Importer + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub new{ + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + #should we provide args override for all of these? + + $self->{'config'} = + {( + #order of these data arrays is important! + #Remove these method arrays, snd just run them serially? + array_data => ['experiment'],#Rename this!! + probe_data => ["probe"], + results_data => ["and_import_results"], + sample_key_fields => ['DESIGN_ID', 'CHIP_ID', 'DYE', 'PROMOT_SAMPLE_TYPE'],# 'SAMPLE_LABEL'],label now optional + # 'SAMPLE_DESCRIPTION removed due to naming disparities + ndf_fields => ['CONTAINER', 'PROBE_SEQUENCE', 'MISMATCH','FEATURE_ID', 'PROBE_ID'],#MISMATCH is always 0! + pos_fields => ['CHROMOSOME', 'PROBE_ID', 'POSITION', 'COUNT'], + result_fields => ['PROBE_ID', 'PM', 'X', 'Y'], + notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'], + norm_method => 'VSN_GLOG', + dye_freqs => {( + Cy5 => 635, + Cy3 => 532, + )}, + + + #Need to make these definable? + #have protocolfile arg and just parse tab2mage protocol section format + protocols => {( + grow => {( + accession => 'GROW_NIMB', + name => 'GROW NIMBLEGEN CULTURE CONDITIONS', + text => 'Nimblegen culture conditions description here. Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', + paramters => undef, + )}, + treatment => {( + accession => 'CROSSLINK_NIMB', + name => 'NIMBLEGEN CHROMATIN PREPARATION', + text => 'Nimblegen X-linking and DNA extraction protocol.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', + paramters => undef, + )}, + extraction => {( + accession => 'CHROMATIN_IP_NIMB', + name => 'NIMBLEGEN CHROMATIN IMMUNOPRECIPITATION and DNA RECOVERY', + text => 'Nimblegen chromatin immunoprecipitation and DNA extraction protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', + paramters => undef, + )}, + labeling => {( + accession => 'LABELLING_NIMB', + name => 'NIMBLEGEN LABELLING', + text => 'Nimblegen labelling protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', + paramteres => undef, + )}, + hybridization => {( + accession => 'HYBRIDISATION_NIMB', + name => 'NIMBLEGEN HYBRIDISATION', + text => 'Nimblegen chip hybridisation protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', + parameters => undef, + )}, + scanning => {( + accession => 'SCANNING_NIMB', + name => 'NIMBLESCAN', + text => 'Nimblegen Nimblescan protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', + paramters => undef, + )}, + )}, + + )}; + + return $self; +} + + +=head2 set_config + + Example : my $self->set_config; + Description: Sets attribute dependent config + Returntype : None + Exceptions : None + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub set_config{ + my $self = shift; + + #This should be general config for all types of import + #dirs are not set in config to enable generic get_dir method access + #This is really just setting paths rather than config rename? + + #This is generic for all imports + + + + if($self->{'old_dvd_format'}){ + $self->{'design_dir'} = $self->get_dir('input').'/DesignFiles'; + }else{ + $self->{'design_dir'} = $self->get_dir('input').'/Design_information'; + } + + + if($self->{'old_dvd_format'}){ + $self->{'config'}{'notes_file'} = $self->get_dir('input').'/DesignNotes.txt'; + }else{ + $self->{'config'}{'notes_file'} = $self->get_dir('design').'/DesignNotes.txt'; + } + + $self->{'config'}{'chip_file'} = $self->get_dir('input').'/SampleKey.txt'; + + + + #Experiment(output) specific + #This should already be set in the run script + #As we could get log write errors before we have created the output dir otherwise + $self->{'output_dir'} ||= $self->get_dir("data").'/output/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name(); + + $self->{'config'}{'tab2mage_file'} = $self->get_dir('output').'/E-TABM-'.$self->name().'.txt'; + + $self->{'config'}{'mage_xml_file'} = $self->get_dir('output').'/{UNASSIGNED}.xml'; + + if($self->{'old_dvd_format'}){ + $self->{'results_dir'} = $self->get_dir('input').'/PairData'; + }else{ + $self->{'results_dir'} = $self->get_dir('input').'/Raw_data_files'; + } + + return; +} + + + + +=head2 read_array_data + + Example : $imp->read_array_data(); + Description: Parses NimbleGen DesignNotes.txt files to create and store new Arrays + Returntype : none + Exceptions : None + Caller : general + Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats? + +=cut + + +sub read_array_data{ + my ($self, $notes_file) = @_; + + + $notes_file ||= $self->get_config('notes_file'); + my ($line, $array, $array_chip, @data, %hpos); + my $oa_adaptor = $self->db->get_ArrayAdaptor(); + my $ac_adaptor = $self->db->get_ArrayChipAdaptor(); + + #Slurp file to string, sets local delimtter to null and subs new lines + my $fh = open_file($notes_file); + #($design_desc = do { local ($/); <$fh>;}) =~ s/\r*\n$//; + #close($fh); + + #Would be better if we only import the design info for the chips listed in the SampleKey.txt file + #Some cyclical dependency going on here :| + + while ($line = <$fh>){ + + $line =~ s/\r*\n//;#chump + @data = split/\t/o, $line; + + #We need to have a DESIGN vendor type? + #also need to be able to set file path independently of config + + if($. == 1){ + %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))}; + next; + } + + ### CREATE AND STORE Array and ArrayChips + if(! defined $array ){ + #This is treating each array chip as a separate array, unless arrayset is defined + #AT present we have no way of differentiating between different array_chips on same array???!!! + #Need to add functionality afterwards to collate array_chips into single array + + #This will use a stored array if present + + $array = Bio::EnsEMBL::Funcgen::Array->new + ( + -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], + -FORMAT => uc($self->format()), + -VENDOR => uc($self->vendor()), + -TYPE => 'OLIGO', + -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here + ); + + ($array) = @{$oa_adaptor->store($array)}; + + $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -ARRAY_ID => $array->dbID(), + -NAME => $data[$hpos{'DESIGN_NAME'}], + -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], + #add description? + ); + + #This will use a stored array_chip if present + ($array_chip) = @{$ac_adaptor->store($array_chip)}; + $array->add_ArrayChip($array_chip); + + } + elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ + + $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].") for same Array:\t".$array->name()."\n"); + + $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -ARRAY_ID => $array->dbID(), + -NAME => $data[$hpos{'DESIGN_NAME'}], + -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], + ); + + ($array_chip) = @{$ac_adaptor->store($array_chip)}; + $array->add_ArrayChip($array_chip); + + } + elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ + throw("Found experiment with more than one design without -array_set"); + } + } + + + $self->add_Array($array); + + close($fh); + + return; + +} + + +=head2 read_experiment_data + + Example : $imp->read_array_chip_data(); + Description: Parses and imports array & experimental chip meta data/objects + Returntype : none + Exceptions : throws if more than one array/design found and not an "array set" + Caller : Importer + Status : At risk + +=cut + +sub read_experiment_data{ + my $self = shift; + + $self->read_array_data(); + + my $t2m_file = $self->init_tab2mage_export() if $self->{'write_mage'}; + + + my ($design_desc, $line, $tmp_uid, $channel, $echip, $sample_label); + my ($sample_desc, %hpos, @data, %uid_reps, %did_reps, %sample_reps); + my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); + my $chan_adaptor = $self->db->get_ChannelAdaptor(); + my $br_cnt = 1; + my $tr_cnt = 1; + + #Currently 1 design = 1 chip = 1 array /DVD + #Different designs are not currently collated into a chip_set/array in any ordered manner + #Register each design as an array and an array_chip + #May want to group array_chips into array/chip sets by association though the API + + + warn("Harcoded for one array(can have multiple chips from the same array) per experiment\n"); + my $fh = open_file($self->get_config("chip_file")); + $self->log("Reading chip data"); + + + #warn "Do we need to validate each line here against the header array?"; + + while ($line = <$fh>){ + next if $line =~ /^\s+\r*\n/; + $line =~ s/\r*\n//;#chump + @data = split/\t/o, $line; + #we could validate line against scalar of header array + #ORD_ID CHIP_ID DYE DESIGN_NAME DESIGN_ID SAMPLE_LABEL SAMPLE_SPECIES SAMPLE_DESCRIPTION TISSUE_TREATMENT PROMOT_SAMPLE_TYPE + if ($. == 1){ + %hpos = %{$self->set_header_hash(\@data, $self->get_config('sample_key_fields'))}; + + #we need to set the sample description field name, as it can vary :((( + @data = grep(/SAMPLE_DESCRIPTION/, keys %hpos); + $sample_desc = $data[0]; + + throw("More than one sample description(@data) in ".$self->get_config("chip_file")."\n") if(scalar @data >1); + next; + } + + #Need to handle array class here i.e. two channel arrays will have two lines + #validate species here + #look up alias from registry and match to self->species + #registry may not be loaded for local installation + + + ### CREATE AND STORE ExperimentalChips + if ((! $tmp_uid) || ($data[$hpos{'CHIP_ID'}] ne $tmp_uid)){ + + #Test both channels are available, i.e. the SampleKey has two TOTAL channels + if($echip){ + + for my $type('TOTAL', 'EXPERIMENTAL'){ + + my $test_chan = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID()); + throw("ExperimentalChip(".$echip->unique_id(). + ") does not have a $type channel, please check the SampleKey.txt file") if ! $test_chan; + + } + } + + $tmp_uid = $data[$hpos{'CHIP_ID'}]; + $echip = $ec_adaptor->fetch_by_unique_id_vendor($data[$hpos{'CHIP_ID'}], 'NIMBLEGEN'); + + if($echip){ + + if(! $self->recovery()){ + throw("ExperimentalChip(".$echip->unqiue_id(). + " already exists in the database\nMaybe you want to recover?"); + } + }else{ + + $echip = Bio::EnsEMBL::Funcgen::ExperimentalChip->new + ( + -EXPERIMENT_ID => $self->experiment->dbID(), + -DESCRIPTION => $data[$hpos{$sample_desc}], + -FEATURE_TYPE => $self->feature_type, + -CELL_TYPE => $self->cell_type, + -ARRAY_CHIP_ID => $self->arrays->[0]->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])->dbID(), + -UNIQUE_ID => $data[$hpos{'CHIP_ID'}], + #-BIOLOGICAL_REPLICATE => , + #-TECHNICAL_REPLICATE => , + ); + + ($echip) = @{$ec_adaptor->store($echip)}; + $self->experiment->add_ExperimentalChip($echip); + } + } + + + ### CREATE AND STORE Channels + my $type = uc($data[$hpos{'PROMOT_SAMPLE_TYPE'}]); + my $sample_label = (! exists $hpos{'SAMPLE_LABEL'}) ? '' : $data[$hpos{'SAMPLE_LABEL'}]; + + + $type = 'TOTAL' if ($type ne 'EXPERIMENTAL'); + $channel = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID()); + + if($channel){ + if(! $self->recovery()){ + throw("Channel(".$echip->unqiue_id().":".uc($data[$hpos{'PROMOT_SAMPLE_TYPE'}]). + " already exists in the database\nMaybe you want to recover?"); + }else{ + #push @{$self->{'_rollback_ids'}}, $channel->dbID(); + #No point in doing this as all Channels mey be pre-registered in recovery mode + #Hence all will be rolled back + } + }else{ + #Handles single/mutli + + $channel = Bio::EnsEMBL::Funcgen::Channel->new + ( + -EXPERIMENTAL_CHIP_ID => $echip->dbID(), + -DYE => $data[$hpos{'DYE'}], + -SAMPLE_ID => $sample_label, + -TYPE => $type, + ); + + #-SPECIES => $self->species(),#on channel/sample to enable multi-species chip/experiment + #would never happen on one chip? May happen between chips in one experiment + + ($channel) = @{$chan_adaptor->store($channel)}; + + } + + #we need to build the channel level tab2mage line here + + #For each BR there will be two sample_labels, one for each channel + #These will be used across multiple chips. + #If two chips the same design ID and the same sample labels, then we have a technical replicate + #else if they have different sample labels then we have another biological replicate + #We have a problem of associating channels to the same BR with differing sample labels + #This is solved by checking whether the chip ID has already been registered in a BR + + #This fails if more than one sample label is used for any given BR + #This will result in the BR being split into the number of unique sample label pairs(Experimental/Control channel) + #This also fails if the same sample label has been used for two different BRs + + if($self->{'write_mage'}){ + #my $sample_name = ($sample_label eq '') ? '???' : substr($sample_label, 0, (length($sample_label)-1)); + my $ctype_name = (defined $self->cell_type()) ? $self->cell_type->name() : '???'; + my $ftype_name = (defined $self->feature_type()) ? $self->feature_type->name() : '???'; + my $ctype_desc = (defined $self->cell_type()) ? $self->cell_type->description() : '???'; + + #define reps + + + #we need one to get the biorep based on the sample label and making sure the unique ID are the same + #we need to define the tech rep by matching the sample label and the making sure the design_id isn't already used + + #Is this doing the BR assignment properly? + + if(exists $sample_reps{$sample_label}){#Found chip in a previously seen BR + #Register the BR of this chip ID + $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'} = $sample_reps{$sample_label}; + + } + elsif(exists $uid_reps{$data[$hpos{'CHIP_ID'}]}){#Found the other channel + $sample_reps{$sample_label} = $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'}; + } + else{#assign new br + $sample_reps{$sample_label} = $br_cnt; #Assign BR to sample label + $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'} = $br_cnt; #Assign BR to chip id + $br_cnt++; + } + + + + #Something is going awry here. The TR is not being reset for some new BRs + + if(! exists $uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'}){ + #we only assign a new tr here if this design has not been seen in any of the reps + #i.e. we need to get the first tr which does not contain this design_id + + my $create_rep = 1; + my $tr; + my @chip_ids; + my $br = $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'}; + + foreach my $chip_id(keys %uid_reps){ + + push @chip_ids, $chip_id if($uid_reps{$chip_id}{'br'} == $br); + } + + #This is looping through all the TRs for all the design IDs + + foreach my $rep(sort keys %did_reps){ + #Doesn't exist for the given BR? + #So we need to get all the chip_ids for a given br + #Check wether it exists and wether it exists in did_reps and check wether the chip_id value is part of the BR set + #else we add it + + if(! exists $did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]}){ + #Not seen in a TR of this $rep yet + $create_rep = 0; + }elsif(! grep(/$did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]}/, @chip_ids)){ + #Not seen in this BR with this TR $rep + $create_rep = 0; + } + + if(! $create_rep){ + #Design ID not seen so add to this TR + $did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]} = $data[$hpos{'CHIP_ID'}]; #don't really need to assign this + $tr = $rep; + last;#do not remove this or we get wierd TR incrementing + } + } + + + if($create_rep){ + #Get the next TR value for this given BR + my @trs; + + foreach my $rep(keys %did_reps){ + + foreach my $chip_id(values %{$did_reps{$rep}}){ + + #Push TR if chip_id is present in this BR + push @trs, $rep if(grep(/$chip_id/, @chip_ids)); + } + } + ($tr) = sort {$b<=>$a} @trs; + + $tr ||=0; + $tr++; + + #register design ID to chip ID mapping for this TR + $did_reps{$tr}{$data[$hpos{'DESIGN_ID'}]} = $data[$hpos{'CHIP_ID'}]; + } + + #register TR for this chip ID + $uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'} = $tr; + } + + my $br = $self->experiment->name().'_BR'. $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'}; + my $tr = $br.'_TR'.$uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'}; + + + #File[raw] + my $tsm_line = $echip->unique_id().'_'.$self->get_config('dye_freqs')->{$data[$hpos{'DYE'}]}.'_pair.txt'; + #Array[accession] # Should this be left blank for AE accession? + $tsm_line .= "\t".$data[$hpos{'DESIGN_ID'}]; + #Array[serial] + $tsm_line .= "\t".$echip->unique_id(); + + #Protocol(s)[grow][treatment][extraction][labelling][hybridisation][scanning] + foreach my $protocol(sort (keys %{$self->get_config('protocols')})){ + $tsm_line .= "\t".$self->get_config('protocols')->{$protocol}->{'accession'}; + } + + + #BioSource + $tsm_line .= "\t$ctype_name"; + #Sample + $tsm_line .= "\t$br"; + #Extract + $tsm_line .= "\t$tr"; + + #LabeledExtract & Immunoprecipitate + if($type eq 'EXPERIMENTAL'){ + $tsm_line .= "\t$sample_label - IP of $tr with anti $ftype_name (Ab vendor, Ab ID)"; + $tsm_line .= "\t$tr IP"; + }else{ + $tsm_line .= "\t$sample_label - Input control DNA of $tr\t"; + } + + #Hybridization + #U2OS BR1_TR1 ChIP H3KAc 46092 hyb + $tsm_line .= "\t$ctype_name $tr ChIP $ftype_name ".$echip->unique_id().' hyb'; + + #BioSourceMaterial SampleMaterial ExtractMaterial LabeledExtractMaterial + $tsm_line .= "\tcell\tgenomic_DNA\tgenomic_DNA\tsynthetic_DNA"; + + #Dye + $tsm_line .= "\t".$data[$hpos{'DYE'}]; + + #BioMaterialCharacteristics[Organism] + $tsm_line .= "\t".$self->species(); + + #BioMaterialCharacteristics[BioSourceType] + $tsm_line .= "\tfrozen_sample"; + + #BioMaterialCharacteristics[StrainOrLine] + $tsm_line .= "\t$ctype_name"; + + #BioMaterialCharacteristics[CellType] + $tsm_line .= "\t$ctype_name"; + + #BioMaterialCharacteristics[Sex] + $tsm_line .= "\t???"; + #FactorValue[StrainOrLine] + $tsm_line .= "\t$ctype_name"; + #FactorValue[Immunoprecipitate] + $tsm_line .= ($type eq 'EXPERIMENTAL') ? "\tanti-${ftype_name} antibody\n" : "\t\n"; + + print $t2m_file $tsm_line; + + } + + } + + close($t2m_file) if $self->{'write_mage'}; + close($fh); + + return; +} + + + + +=head2 read_probe_data + + Example : $imp->read_probe_data(); + Description: Parses and imports probes, probe sets and features of a given array + No duplicate handling or probe caching is performed due to memory + issues, this is done in resolve_probe_data. + Returntype : none + Exceptions : none + Caller : Importer + Status : Medium + +=cut + + +#Assumes one chip_design per experimental set. +sub read_probe_data{ + my ($self) = shift; + + my ($fh, $line, @data, %hpos, %probe_pos);#, %duplicate_probes); + $self->log("Parsing and importing ".$self->vendor()." probe data (".localtime().")", 1); + + ### Read in + #ndf file: probe_set, probe and probe_feature(.err contains multiple mappings) + #pos file: probe chromosome locations + + #Need to change how probe_names are generated for nimblegen? + #native probe_ids may not be unique, but should be when combined with the seq_id which is currently being used as the xref_id + #Handle with API!! + + #READ REGION POSITIONS + #We need to handle different coord systems and possibly different assmemblies + my $slice_a = $self->db->get_SliceAdaptor(); + my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name('chromosome'); + + + #TIED FILE CACHE!!! + #We need to rebuild the cache from the DB before we start adding new probe info + #We only need to rebuild cache if we find a chip that hasn't been imported? + #No, we just need to import without cache, then re-do the resolve step + #Are we still going to get disconnects when we dump the cache? + + + #warn "Read probe data should only read in the array chips which are specified by the ExperimentalChip? Not just what is present in the DesignNotes file?"; + + + foreach my $array(@{$self->arrays()}){ + + foreach my $achip(@{$array->get_ArrayChips()}){ + + my (@log, %probe_pos, $fasta_file, $f_out); + #do we need to fetch probe by seq and array? + #this would also id non-unique seqs in design + + #warn "We need to account for different cs feature amppings here + + if($achip->has_status('IMPORTED')){ + $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); + next; + }elsif($self->recovery()){ + $self->log("Rolling back ArrayChip:\t".$achip->design_id()); + $self->rollback_ArrayChips([$achip]); + } + + $self->log("Importing ArrayChip:".$achip->design_id()); + + #Always use pos file, ndf file cannot be guranteed to contain all location info + #pos file also gives a key to which probes should be considered 'EXPERIMENTAL' + + #CACHE PROBE POSITIONS + $fh = open_file($self->get_dir("design")."/".$achip->name().".pos"); + + #don't % = map ! Takes a lot longer than a while ;) + while($line = <$fh>){ + $line =~ s/\r*\n//o;#Not using last element + @data = split/\t/o, $line; + + #SEQ_ID CHROMOSOME PROBE_ID POSITION COUNT + if ($. == 1){ + %hpos = %{$self->set_header_hash(\@data, $self->get_config('pos_fields'))}; + next; + } + + #Skip probe if there is a duplicate + if(exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ + + if($data[$hpos{'CHROMOSOME'}] eq $probe_pos{$data[$hpos{'PROBE_ID'}]}->{chr} && + ($data[$hpos{'POSITION'}]+1) eq $probe_pos{$data[$hpos{'PROBE_ID'}]}->{start}){ + #log or warn here? + + #Not matching probe length here + + next; + #Do we need to skip this in the ndf file too? + + } + else{ + throw("Found duplicate mapping for ".$data[$hpos{'PROBE_ID'}]. + " need implement duplicate logging/cleaning"); + } + #need to build duplicate hash to clean elements from hash + # $duplicate_probes{$data[$hpos{'PROBE_ID'}]} = 1; + #next; + } + + + my $random = 0; + + if(! $self->cache_slice($data[$hpos{'CHROMOSOME'}])){ + push @log, "Skipping feature import for probe ".$data[$hpos{'PROBE_ID'}]." with non-standard region ".$data[$hpos{'CHROMOSOME'}]; + + #should we try and resolve the random chrs here? + #at least store probe/set/result and skip feature + #can we just import as chr with no start postition? + + #if ($data[$hpos{'CHROMOSOME'}] =~ /_random/){ + + # if(! $self->cache_slice($data[$hpos{'CHROMOSOME'}])){ + #push @log, "Skipping probe ".$data[$hpos{'PROBE_ID'}]." with non-standard region ".$data[$hpos{'CHROMOSOME'}]; + #}else{ + #we should really log this in a seprate file to avoid overloading the lgo file + # push @log, "Importing random probe ".$data[$hpos{'PROBE_ID'}]." on ".$data[$hpos{'CHROMOSOME'}]." omitting position"; + + #} + + #} + + } + + #This is not handling probes with random chrs + + $probe_pos{$data[$hpos{'PROBE_ID'}]} = {( + chr => $data[$hpos{'CHROMOSOME'}], + start => ($data[$hpos{'POSITION'}] +1),#default UCSC->Ensembl coord conversion + )}; + + } + + + + + #Remove duplicate probes + + $self->log("Built position cache from : ".$achip->name().".pos", 1); + close($fh); + + $self->log("Importing design probes from : ".$achip->name().".ndf"); + #OPEN PROBE IN/OUT FILES + $fh = open_file($self->get_dir("design")."/".$achip->name().".ndf"); + #Need to set these paths in each achip hash, file names could be tablename.chip_id.txt + + #Need to add dbname/port/species/vendor to this path? + #.efg DropDatabase should also clean the fasta dumps and caches for a given DB + + if($self->dump_fasta()){ + $fasta_file = $self->get_dir('fastas').'/'.$achip->name().".fasta"; + $self->backup_file($fasta_file); + $f_out = open_file($fasta_file, '>'); + } + + + my ($length, $ops, $op, $of, %pfs); + + #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap + my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("VendorMap"); + + + my $strand = 0; #default for nimblegen, should be config hash? + + #my $cig_line = "50M"; #default for nimblegen, should be config hash? + #probe length can change within design, should be built from length + + + my $fasta = ""; + + #$self->Timer()->mark("Starting probe loop"); + + + #This is leaking about 30-60MB for each normal density chip? + #need Devel::Monitor here? + + + while($line = <$fh>){ + $line =~ s/\r*\n//; + @data = split/\t/o, $line; + my $loc = ""; + my $class = "EXPERIMENTAL"; + + #PROBE_DESIGN_ID CONTAINER DESIGN_NOTE SELECTION_CRITERIA SEQ_ID PROBE_SEQUENCE MISMATCH MATCH_INDEX FEATURE_ID ROW_NUM COL_NUM PROBE_CLASS PROBE_ID POSITION DESIGN_ID X Y + #2067_0025_0001 BLOCK1 0 chrX TTAGTTTAAAATAAACAAAAAGATACTCTCTGGTTATTAAATCAATTTCT 0 52822449 52822449 1 25 experimental chrXP10404896 10404896 2067 25 1 + + if ($. == 1){ + %hpos = %{$self->set_header_hash(\@data, $self->get_config('ndf_fields'))}; + next; + } + + + if (! exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ + push @log, "Skipping non-experimental probe:\t".$data[$hpos{'PROBE_ID'}]; + next; + } + + #Which non-experimental probes might we want to store? + #if($data[$hpos{'CONTAINER'}] =~ /control/io){ + # $class = "CONTROL"; + #} + #elsif($data[$hpos{'CONTAINER'}] =~ /random/io){ + # $class = "RANDOM"; + #} + #elsif($data[$hpos{'PROBE_CLASS'}] !~ /experimental/io){ + # $class = "OTHER"; + #} + #elsif(! exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ #HACKY HACKY HACK HACK!! Needed for valid region retrival + # $class = "OTHER"; + #} + #SPIKE INS? + + + + #This assumes all probes in feature/probeset are next to each other!!!!!!!!!! + + + if($data[$hpos{'FEATURE_ID'}] != $data[$hpos{'MATCH_INDEX'}]){#Probe set data + #print "Generating new probeset:\tFeature id:\t".$data[$hpos{'FEATURE_ID'}]."\tmatchindex:\t".$data[$hpos{'MATCH_INDEX'}]."\n"; + + if($ops && ($data[$hpos{'FEATURE_ID'}] ne $ops->name())){ + #THis is where we chose to update/validate + #Do we need to pass probes if they're already stored..may aswell to reduce mysql load? + #No point as we have to query anyway + $self->store_set_probes_features($achip->dbID(), \%pfs, $ops); + throw("ops still defined in caller") if defined $ops; + } + + $ops = Bio::EnsEMBL::Funcgen::ProbeSet->new( + -NAME => $data[$hpos{'FEATURE_ID'}], + -SIZE => undef, + -FAMILY => $data[$hpos{'CONTAINER'}], + #xref_id => $data[$hpos{'SEQ_ID'}],#Need to populate xref table + ); + + #should we store straight away or build a probeset/probe/feature set, and then store and validate in turn? + #Store directly have separate method to validate and update? + #would need to check if one exists before storing anyway, else we could potentially duplicate the same probe/probeset from a different array + #remember for affy we need duplicate probe records with identical probe ids, probeset records unique across all arrays + + undef %pfs + } + elsif($. > 2){#may have previous ops set, but next has no ops, or maybe just no ops's at all + $self->store_set_probes_features($achip->dbID(), \%pfs, $ops); + throw("ops still defined in caller") if defined $ops; + } + + + ###PROBES + #should we cat $xref_id to $probe_id here to generate unique id? + #would be messy to handle in the code, but would have to somewhere(in the retrieval code) + + $length = length($data[$hpos{'PROBE_SEQUENCE'}]); + #$probe_string .= "\t${psid}\t".$data[$hpos{'PROBE_ID'}]."\t${length}\t$ac_id\t${class}\n"; + + $op = Bio::EnsEMBL::Funcgen::Probe->new( + -NAME => $data[$hpos{'PROBE_ID'}], + -LENGTH => $length, + -ARRAY => $array, + -ARRAY_CHIP_ID => $achip->dbID(), + -CLASS => $class, + ); + + + %{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( + probe => $op, + features => [], + ); + + + ###PROBE FEATURES + #How can we be certain that we have the same mapping in the DB? + #Put checks in here for build? + #Need to handle controls/randoms here + #won't have features but will have results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #The format of the pos file looks like it should have all the data required, but + # chromsome is missing, first undef :( + #Need to use $pos here instead of .pos file + #However have problems defining probe class, as not populated in test set + #derive from container! :( + #Ignore controls/random as they won't have a region + #Also need to handle multiple mappings? + #As results reference probes, no features, can have multiple features on different builds + + #if($class eq "EXPERIMENTAL"){ + + #if(exists $regions{$data[$hpos{'SEQ_ID'}]}){ + # $fid++; + # $pf_string .= "\t".$regions{$data[$hpos{'SEQ_ID'}]}{'seq_region_id'}."\t".$data[$hpos{'POSITION'}]."\t". + # ($data[$hpos{'POSITION'}] + $length)."\t${strand}\t".$regions{$data[$hpos{'SEQ_ID'}]}{'coord_system_id'}. + # "\t${pid}\t${anal_id}\t".$data[$hpos{'MISMATCH'}]."\t${cig_line}\n"; + #$loc .= $regions{$data[$hpos{'SEQ_ID'}]}{'seq_region_id'}.":".$data[$hpos{'POSITION'}]. + # "-".($data[$hpos{'POSITION'}] + $length).";" if ($self->{'_dump_fasta'}); + # } + # else{ + # die("No regions defined for ".$data[$hpos{'SEQ_ID'}]." ".$data[$hpos{'PROBE_ID'}]. + #" with family ".$data[$hpos{'CONTAINER'}]); + # } + + + + + if ($self->dump_fasta()){ + #(my $chr = $probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}) =~ s/chr//; + + #$loc .= $chr.":".$probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}."-". + # ($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}+ $length).";"; + + #$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. + # "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; + + + + #filter controls/randoms? Or would it be sensible to see where they map + #wrap seq here? + #$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; + + + #To use this for mapping, we really need the dbID nr fasta + #This can be generated after the import, or maybe during resolve? + #This is also currently done on a chip level, where as the cache is resolved at the array level + #We could simply cat the files before resolving the fasta file + #Need to do this otherwise we risk overwriting the fasta file with incomplete data. + #Can we validate sequence across probes with same name in this step? + #Just use probe name for now. + + #We could cat and sort the fastas to make sure we have the same sequences + #Need to dump the design_id in the fasta header + #This would also reduce IO on the DB as identical probe will be consecutive, hence just one query to get the id. + + #Changed th format an content of this to facilitate dbID nr fasta file generation and sequence validation + + $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$achip->design_id."\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; + + #Print fasta every 10000 lines + if(! ($. % 10000)){ + print $f_out $fasta; + $fasta = ''; + } + } + + + #Hack!!!!!! Still importing probe (and result?) + next if(! $self->cache_slice($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'})); + #warn("Skipping non standard probe (".$data[$hpos{'PROBE_ID'}].") with location:\t$loc\n"); + + + $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new + ( + -START => $probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}, + -END =>($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'} + $length), + -STRAND => $strand, + -SLICE => $self->cache_slice($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}), + -ANALYSIS => $anal, + -MISMATCHCOUNT => $data[$hpos{'MISMATCH'}],#Is this always 0 for import? remove from header hash? + -PROBE => undef, #Need to update this in the store method + ); + + + push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; + + } + + #need to store last data here + $self->store_set_probes_features($achip->dbID(), \%pfs, $ops); + $self->log(join("\n", @log)); + $achip->adaptor->store_status("IMPORTED", $achip); + $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); + + if ($self->dump_fasta()){ + print $f_out $fasta; + close($f_out); + } + + $self->log("Imported design from:\t".$achip->name().".ndf", 1); + + + + #$self->{'_probe_cache'} = undef;#As we can't get Y and Y info from the DB, this is only possible as the results files contain X and Y info + } + + #Should we build hash of probe_names:probe_feature_ids here for results import + #Should we dump this as a lookup file for easier recoverability + #This is the biggest step in the import + #Building the hash would be fastest but least recoverable + #Would have to write recover statments for each step i.e. build the most recent data structure required for the next import step + #Individual queries for each result would take ages + #This is all assuming there are no random records in the table i.e. ID series is linear with no gaps. + + + #Could throw a random validation check in every X entries? + #This would only work for non-parallel imports + #periodic import when hit new probe_set, but no new data printed + + } + + + $self->log("Finished parsing probe data"); + #Total probe_sets:\t$psid\n". + # "Total probes:\t$pid\nTotal probe_features:\t$fid"); + + + $self->resolve_probe_data(); + + return; +} + + + + +=head2 read_and_import_results_data + + Example : $imp->read_results_data(); + Description: Parses and dumps raw results to file + Returntype : none + Exceptions : none + Caller : Importer + Status : at risk + +=cut + + +sub read_and_import_results_data{ + my $self = shift; + + $self->log("Parsing ".$self->vendor()." results"); + my (@header, @data, @design_ids, @lines); + my ($fh, $pid, $line, $file); + my $anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("RawValue"); + my $result_set = $self->get_import_ResultSet($anal, 'channel'); + + + if ($result_set) { #we have some new data to import + + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + #if( ! $echip->has_status('IMPORTED')){ + + foreach my $chan (@{$echip->get_Channels()}) { + + if ( ! $chan->has_status('IMPORTED')) { + + #we need to set write_mage here + + my $array = $echip->get_ArrayChip->get_Array(); + + $self->get_probe_cache_by_Array($array) || throw('Failed to get the probe cache handle for results import, resolve cache here?'); + my ($probe_elem, $score_elem, %hpos); + my $cnt = 0; + my $r_string = ""; + my $chan_name = $echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}; + my $cc_id = $result_set->get_chip_channel_id($chan->dbID()); + + + #if ($self->recovery()) { + # $self->log("Rolling back results for channel:\t${chan_name}"); + # $self->db->rollback_results($cc_id); + # } + + #open/backup output + my $out_file = $self->get_dir("raw")."/result.".$chan_name.".txt"; + $self->backup_file($out_file); + my $r_out = open_file($out_file, '>'); + + (my $alt_chan_name = $chan_name) =~ s/\_/\_1h\_/; + my $found = 0; + + FILE: foreach my $name($chan_name, $alt_chan_name){ + + foreach my $suffix ("_pair.txt", ".pair", ".txt") { + + $file = $self->get_dir("results")."/".$name.$suffix; + + if (-f $file) { + $found = 1; + last FILE; + } + } + } + + throw("Could not find result file for Channel(${chan_name}) in ".$self->get_dir('results')) if ! $found; + + #open/slurp input + $self->log("Reading result for channel $chan_name:\t$file", 1); + $fh = open_file($file); + @lines = <$fh>; + close($fh); + + + ###PROCESS HEADER + + foreach my $i (0..$#lines) { + + if ($lines[$i] =~ /PROBE_ID/o) { + $lines[$i] =~ s/\r*\n//o; + @data = split/\t/o, $lines[$i]; + + %hpos = %{$self->set_header_hash(\@data, $self->get_config('result_fields'))}; + + #remove header + splice @lines, $i, 1; + + last; #finished processing header + } + } + + #we need to sort the result files based on the unique key(name at present, should replace with seq at some point) + @lines = sort {(split/\t/o, $a)[$hpos{'PROBE_ID'}] cmp (split/\t/o, $b)[$hpos{'PROBE_ID'}]} @lines; + + $self->log('Parsing results', 1); + + foreach $line(@lines) { + + #can we preprocess effectively? + next if $line =~ /^#/; + next if $line =~ /NGS_CONTROLS/; + next if $line =~ /V_CODE/; + next if $line =~ /H_CODE/; + next if $line =~ /RANDOM/; + + $line =~ s/\r*\n//o; + @data = split/\t/o, $line; + + ###PROCESS HEADER + #if ($line =~ /PROBE_ID/o){ + # + # %hpos = %{$self->set_header_hash(\@data, $self->get_config('result_fields'))}; + # next;#finished processing header + #} + + ###PROCESS DATA + #Is this string concat causing the slow down, would it befaster to use an array and print a join? + + if ($pid = $self->get_probe_id_by_name_Array($data[$hpos{'PROBE_ID'}], $array)) { + $cnt ++; + $r_string .= '\N'."\t${pid}\t".$data[$hpos{'PM'}]."\t${cc_id}\t".$data[$hpos{'X'}]."\t".$data[$hpos{'Y'}]."\n"; + } else { + warn "Found unfiltered non-experimental probe in input $data[$hpos{'PROBE_ID'}]"; + } + + ###PRINT SOME RESULTS + if ($cnt > 10000) { + $cnt = 0; + print $r_out $r_string; + $r_string =""; + #could we fork here and import in the background? + } + + } + #PRINT/CLOSE Channel file + print $r_out $r_string; + close($r_out); + $self->log("Finished parsing $chan_name result", 1); + + #Import directly here to avoid having to reparse all results if we crash!!!! + $self->log("Importing:\t$out_file"); + $self->db->load_table_data("result", $out_file); + $self->log("Finished importing:\t$out_file", 1); + $chan->adaptor->store_status('IMPORTED', $chan); + + + } + } + } + } + else { + $self->log("Skipping results parse and import"); + } + + $self->log("Finished parsing and importing results"); + + return; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Sanger.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Sanger.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,577 @@ +# +# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Sanger +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Parsers::Sanger + +=head1 SYNOPSIS + + my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Sanger"; + push @INC, $parser_type; + my $imp = $class->SUPER::new(@_); + + +=head1 DESCRIPTION + +This is a definitions class which should not be instatiated directly, it +normally inherited from the Importer. Sanger contains meta data and methods +specific to Sanger PCR arrays to aid parsing and importing of experimental data. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::Sanger; + +use Bio::EnsEMBL::Funcgen::Array; +use Bio::EnsEMBL::Funcgen::ProbeSet; +use Bio::EnsEMBL::Funcgen::Probe; +use Bio::EnsEMBL::Funcgen::ProbeFeature; +use Bio::EnsEMBL::Funcgen::FeatureType; +use Bio::EnsEMBL::Funcgen::ExperimentalChip; +use Bio::EnsEMBL::Funcgen::ArrayChip; +use Bio::EnsEMBL::Funcgen::Channel; +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file); +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use strict; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + +=head2 new + + Example : my $self = $class->SUPER::new(@_); + Description: Constructor method for Sanger class + Returntype : Bio::EnsEMBL::Funcgen::Parsers::Sanger + Exceptions : throws if Experiment name not defined or if caller is not Importer + Caller : Bio::EnsEMBL::Funcgen::Importer + Status : at risk + +=cut + + +sub new{ + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(); + + throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + $self->{'config'} = {( + #order of these data arrays is important! + array_data => [], #["array_chip"], + probe_data => ["array_probe"], + results_data => ["and_import_result"], + #import_methods => [], + #data paths here? + norm_method => undef, + #is this disabling -input_dir override option? + )}; + + + + return $self; +} + +=head2 set_config + + Example : $imp->set_config(); + Description: Sets a attribute dependent variables + Returntype : none + Exceptions : None + Caller : Importer + Status : At risk + +=cut + +sub set_config{ + my ($self) = @_; + + #placeholder method for setting any attr dependant vars e.g. file paths etc. + + + return; +} + + +sub read_array_probe_data{ + my ($self, $array_file) = @_; + + warn("Remove hard coding for Sanger array import, and accomodate adf format"); + + + $array_file ||= $self->array_file(); + my ($line, $fh, @list, $array_file_format, $cmd); + my ($op, $of, $imported, $fimported, $fanal); + my $oa_adaptor = $self->db->get_ArrayAdaptor(); + my $op_adaptor = $self->db->get_ProbeAdaptor(); + my $of_adaptor = $self->db->get_ProbeFeatureAdaptor(); + my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); + my $ac_adaptor = $self->db->get_ArrayChipAdaptor(); + my $slice_adaptor = $self->db->get_SliceAdaptor(); + my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("SangerPCR")->dbID(); + #have LiftOver? Could then use liftover in pipeline to redo mappings + + #store now checks whether already stored and updates array chips accordingly + my $array = Bio::EnsEMBL::Funcgen::Array->new + ( + -NAME => $self->array_name(), + -FORMAT => uc($self->format()), + -VENDOR => uc($self->vendor()), + -TYPE => 'PCR', + -DESCRIPTION => "Sanger ENCODE PCR array 3.1.1", + ); + + ($array) = @{$oa_adaptor->store($array)}; + + #This is treating each array chip as a separate array, unless arrayset is defined + #AT present we have no way of differentiating between different array_chips on same array???!!! + #Need to add functionality afterwards to collate array_chips into single array + my $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( + -NAME => $array->name(), + -DESIGN_ID => $array->name(), + -ARRAY_ID =>$array->dbID(), + ); + + ($array_chip) = @{$ac_adaptor->store($array_chip)}; + $array->add_ArrayChip($array_chip); + $self->add_Array($array); + + + #we also need to test wether the array as been imported as well as the mappings + #THis needs to use coord_sys-id not schema_build!! Duplcaite entries for different schema_builds + #with same assembly + + my $dnadb_cs = $self->db->dnadb->get_CoordSystemAdaptor->fetch_by_name('chromosome'); + my $fg_cs = $self->db->get_FGCoordSystemAdaptor->validate_and_store_coord_system($dnadb_cs); + + + #This fails if we're pointing to an old DB during the release cycle. Will be fine if we manage to cs mapping dynamically + + + if ($array_chip->has_status('IMPORTED')) { + $imported = 1; + $self->log("Skipping ArrayChip probe import (".$array_chip->name().") already fully imported"); + + #need to build cache here, from file first else from DB???? + #This is required for feature only imports + #as we won't have the probe dbID available + + if(! $self->get_probe_cache_by_Array($array)){ + $self->get_probe_cache_by_Array($array, 1); + } + + + + } elsif ($self->recovery()) { + $self->log("Rolling back partially imported ArrayChip:\t".$array_chip->name()); + $self->db->rollback_ArrayChip([$array_chip]); #This should really remove all CS imports too? + } + + + #should never really have CS imports if not IMPORTED + #there is however the potential to trash a lot of data if we were to remove the CS importes by mistake + #do we need to check whether any other sets are using the data? + #we have to check for result using relevant cs_id and cc_id + #no removal of probes is the key thing here as nothing is dependent on the feature_ids + #get all result sets by array chip? or get all ExperimentalChips by array chip + #would have to be result set as we would find our own ecs. May find our own rset + + + throw('This needs updating'); + + if ($array_chip->has_status('IMPORTED_CS_'.$fg_cs->dbID())) { + $fimported = 1; + $self->log("Skipping ArrayChip feature import (".$array_chip->name().") already fully imported for ".$self->data_version()); + } elsif ($self->recovery()) { + $self->log("Rolling back partially imported ArrayChip features:\t".$array_chip->name()); + $self->db->rollback_ArrayChip_features($array_chip, $fg_cs); + } + + + #need to check whether already imported on specified schema_build + #check for appropriate file given format in input dir or take path + + #if (! $fimported) {#now need to do this irrespective of import status due to x y requirements + #need only do this once, i.e. if the cache isn't defined yet + #this is assuming cache will be built properly + #may cause problems if not cleaned up properly after use. + + #ignore xy requirements for now, these should be associated with results file + + + + #if (! defined $self->{'_probe_cache'}) { + if (! $fimported) { + + + + if (! $array_file) { + + if (! defined $self->get_dir('input')) { + throw("No input_dir defined, if you are running in a non Experiment context please use -array_file"); + } + + #hacky ..do better? + for my $suffix ("gff", "adf") { + $cmd = $self->get_dir('input')."/".$self->array_name()."*".$suffix; + @list = `ls $cmd 2>/dev/null`; + + if ((scalar(@list) == 1) && + ($list[0] !~ /No such file or directory/o)) { ###this is only printed to STDERR? + + if (! defined $array_file) { + $array_file = $list[0]; + } else { + throw("Found more than one array file : $array_file\t$list[0]\nSpecify one with -array_file"); + } + } + } + + throw("Cannot find array file. Specify one with -array_file") if (! defined $array_file); + } + + + if ($array_file =~ /gff/io) { + $array_file_format = "GFF"; + } elsif ($array_file =~ /adf/io) { + $array_file_format = "ADF"; + throw("Does not yet accomodate Sanger adf format"); + } else { + throw("Could not determine array file format: $array_file"); + } + + + #if (! $fimported) { + $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name(($array_file_format eq "ADF") ? "VendorMap" : "LiftOver"); + #} + + $self->log("Parsing ".$self->vendor()." array data (".localtime().")"); + $fh = open_file($array_file); + my @lines = <$fh>; + close($fh); + + + + my ($chr, $start, $end, $strand, $pid);#, $x, $y, $meta_x, $meta_y, @xy); + + #avoid mutliple calls for same array + my $ac_dbid = $array->get_ArrayChip_by_design_id($array->name())->dbID(); + + #sort file to enable probe cache method for new feature imports + @lines = sort {(split/\t|\;/o, $a)[8] cmp (split/\t|\;/o, $b)[8]} @lines; + + #This is not sorting properly!! + + #my @tmp = map ((split/\t|\;/o, $_)[8], @lines); + #@tmp = sort @tmp; + + + #$self->log('Tmp sorted array is :\n'.join("\n", @tmp)."\n"); + + + + + foreach $line(@lines) { + $line =~ s/\r*\n//; + + #($chr, $start, $end, $ratio, $pid) = split/\t/o, $line; + #($chr, undef, undef, $start, $end, undef, $strand, undef, $pid, $x, $y, $meta_x, $meta_y) = split/\t|\;/o, $line; + ($chr, undef, undef, $start, $end, undef, $strand, undef, $pid) = split/\t|\;/o, $line; + + + if($self->ucsc_coords){ + $start += 1; + } + + + #$meta_x =~ s/META_X=//; + #$x =~ s/X=//; + #$x = $x + (($meta_x -1)*26); + #$meta_y =~ s/META_Y=//; + #$y =~ s/Y=//; + #$y = $y + (($meta_y -1)*25); + $pid =~ s/reporter_id=//o; + $chr =~ s/chr//; + $strand = ($strand eq "+") ? 0 : 1; + + #Hack!!!!!! This is still maintaining the probe entry (and result?) + if (! $self->cache_slice($chr)) { + warn("-- Skipping non standard probe (${pid}) with location:\t${chr}:${start}-${end}\n"); + next; + } + + + #need to parse dependant on file format + #also need to account for duplicate probes on grid + + #need to test for imprted here for rebuilding the probe_info cache + #this will result in always using first x y for the inital import (i.e. skip any probe already in cache) + #or using last x y for previosuly imported as we can't check the cache as it will already be there + #could check for x y + #should always check x y as this will also implicitly check if it is in the cache + + #if (! $self->get_probe_id_by_name($pid)) { #already present in cache + #if(! (@xy = @{$self->get_probe_x_y_by_name($pid)})){ + + #can we not use store_set_probes_features + #would have to add x y to probe, which is not logical as probe can have many x y's + #keep like this and just change cache_probe_info + + if (! $imported) { + #when we utilise array coords, we need to look up probe cache and store again with new coords + #we're currently storing duplicates i.e. different ids with for same probe + #when we should be storing two records for the same probe/id + #the criteria for this will be different for each vendor, may have to check container etc for NimbleGen + + $op = Bio::EnsEMBL::Funcgen::Probe->new( + -NAME => $pid, + -LENGTH => ($end - $start), + -ARRAY => $array, + -ARRAY_CHIP_ID => $ac_dbid, + -CLASS => 'EXPERIMENTAL', + ); + + ($op) = @{$op_adaptor->store($op)}; + #$self->cache_probe_info($pid, $op->dbID, $x, $y); + } else { + #update XY cache for previously imported array + #$self->cache_probe_info($pid, $self->get_probe_id_by_name($pid), $x, $y); + } + + #if (! $fimported) { + $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new( + -START => $start, + -END => $end, + -STRAND => $strand, + -SLICE => $self->cache_slice($chr), + -ANALYSIS => $fanal, + -MISMATCHCOUNT => 0, + -PROBE_ID => ($imported) ? + $self->get_probe_id_by_name_Array($pid, $array) : $op->dbID(), + ); + + #get_probe_id will throw if not in cache, which means that we have an unimported probe + #for an ArrayChip which is flagged as imported, must have been omitted from the import deisgn + #probably a manual fix required. Can we log these and write an update/repair script. + + $of_adaptor->store($of); + #} + + #} else { + #warn("Sanger does not accomodate on plate duplicates yet, result are not linked to X Y coords, using first coords for probe if present in results for $pid\n");¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡¡ + #} + } + + $array_chip->adaptor->set_status('IMPORTED_CS_'.$fg_cs->dbID(), $array_chip) if ! $fimported; + $self->log("ArrayChip:\t".$array_chip->design_id()." has been IMPORTED_CS_".$fg_cs->dbID()); + + } + + + + if (! $imported) { + $array_chip->adaptor->set_status('IMPORTED', $array_chip); + $self->log("ArrayChip:\t".$array_chip->design_id()." has been IMPORTED"); + $self->resolve_probe_data(); + } + + $self->log("Finished parsing ".$self->vendor()." array/probe data (".localtime().")"); + #warn("Finished parsing ".$self->vendor()." array/probe data (".localtime().")"); + + return; +} + +=head2 read_and_import_result_data + + Example : $imp->read_and_import_result_data(); + Description: Parses and imports result for the sanger PCR array platform + Returntype : none + Exceptions : none + Caller : Importer + Status : At risk + +=cut + +sub read_and_import_result_data{ + my $self = shift; + + #change this to read_gff_chip_results + #as opposed to gff channel results + #This should also use the default logic names for the Vendor, or take a user defined list + $self->log("Reading ".$self->vendor()." result data (".localtime().")"); + + my ($file, $chip_uid, $line, $echip); + my ($ratio, $pid, %chip_files, %roll_back); + my $of_adaptor = $self->db->get_ProbeFeatureAdaptor(); + my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); + my $chan_adaptor = $self->db->get_ChannelAdaptor(); + my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("SangerPCR"); + my $result_adaptor = $self->db->get_ResultSetAdaptor(); + #this is done to avoid having to self->array_name in loop, will make multiple array loop easier + my $array = ${$self->arrays()}[0]; + + #This works a little differently as we're not parsing a meta file + #so the echips haven't been added yet. + #This is treating each array chip as a separate array, unless arrayset is defined + #AT present we have no way of differentiating between different array_chips on same array???!!! + #Need to add functionality afterwards to collate array_chips into single array + + #First add the echips to the Experiment + + if (! @{$self->result_files()}) { + my $list = "ls ".$self->input_dir().'/[0-9]*-[0-9a-zA-Z]*\.all\.*'; + my @rfiles = `$list`; + $self->result_files(\@rfiles); + } + + + foreach $file(@{$self->result_files()}) { + chomp $file; + ($chip_uid = $file) =~ s/.*\///; + $chip_uid =~ s/\..*//; + + $self->log("Found SANGER results file for $chip_uid:\t$file"); + $chip_files{$chip_uid} = $file; + + + $echip = $ec_adaptor->fetch_by_unique_id_vendor($chip_uid, 'SANGER'); + + #this should throw if not recovery + #Nee to check Nimbelgen methods + + if ($echip) { + + if (! $self->recovery()) { + throw("ExperimentalChip(".$echip->unqiue_id().") already exists in the database\nMaybe you want to recover?"); + }else{ + #log pre-reg'd chips for rollback + $roll_back{$echip->dbID()} = 1; + } + } else { + + $echip = Bio::EnsEMBL::Funcgen::ExperimentalChip->new + ( + -EXPERIMENT_ID => $self->experiment->dbID(), + -ARRAY_CHIP_ID => $self->arrays->[0]->get_ArrayChip_by_design_id($array->name())->dbID(), + -UNIQUE_ID => $chip_uid, + ); + + ($echip) = @{$ec_adaptor->store($echip)}; + $self->experiment->add_ExperimentalChip($echip); #if we need a contains method in here , always add!! + } + + #do we need DUMMY entries any more? + + #sub this passing the echip? + foreach my $type ('DUMMY_TOTAL', 'DUMMY_EXPERIMENTAL') { + + my $channel = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID()); + + if ($channel) { + if (! $self->recovery()) { + throw("Channel(".$echip->unique_id().":$type) already exists in the database\nMaybe you want to recover?"); + } + } else { + + $channel = Bio::EnsEMBL::Funcgen::Channel->new + ( + -EXPERIMENTAL_CHIP_ID => $echip->dbID(), + -TYPE => $type, + ); + + ($channel) = @{$chan_adaptor->store($channel)}; + } + } + } + + + + #Now get rset using experiment echips + my $rset = $self->get_import_ResultSet($analysis, 'experimental_chip'); + + if ($rset) { #we have some new data + + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + if ($echip->has_status('IMPORTED_SangerPCR', $echip)) { + $self->log("ExperimentalChip(".$echip->unique_id().") has already been imported"); + } else { + + my $cc_id = $rset->get_chip_channel_id($echip->dbID()); + + if ($self->recovery() && $roll_back{$echip->dbID()}){ + $self->log("Rolling back results for ExperimentalChip:\t".$echip->unique_id()); + $self->rollback_results($cc_id); + } + + $self->log("Reading SANGER result file for ".$echip->unique_id().":\t".$chip_files{$echip->unique_id()}); + $self->get_probe_cache_by_Array($array) || throw('Failed to reset probe cache handle'); + my $fh = open_file($chip_files{$echip->unique_id()}); + my @lines = <$fh>; + close($fh); + + my $rfile_path = $self->get_dir("norm")."/result.SangerPCR.".$echip->unique_id().".txt"; + my $rfile = open_file($rfile_path, '>'); + my $r_string = ""; + + + @lines = sort {(split/\t|\:/o, $a)[5] cmp (split/\t|\:/o, $b)[5]} @lines; + + foreach my $line (@lines) { + $line =~ s/\r*\n//o; + + ($ratio, undef, $pid) = (split/\t|\:/o, $line)[3..5]; + $pid =~ s/.*://o; + + $ratio = '\N' if $ratio eq 'NA'; #NULL is still useful info to store in result + #my ($x, $y) = @{$self->get_probe_x_y_by_name($pid)}; + + #this is throwing away the encode region which could be used for the probeset/family? + $r_string .= '\N'."\t".$self->get_probe_id_by_name_Array($pid, $array)."\t${ratio}\t${cc_id}\t".'\N'."\t".'\N'."\n";#${x}\t${y}\n"; + } + + print $rfile $r_string; + close($rfile); + + $self->log("Importing:\t$rfile_path"); + $self->db->load_table_data("result", $rfile_path); + $self->log("Finished importing:\t$rfile_path"); + $echip->adaptor->set_status('IMPORTED_SangerPCR', $echip); + } + } + + + + } else { + $self->log("No new data, skipping result parse"); + } + + $self->log("Finished reading and importing ".$self->vendor()." result data (".localtime().")"); + return; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/biotiffin.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/biotiffin.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,259 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::biotiffin; + +use strict; + +use File::Basename; + +# To get files for bioTIFFIN, download the following GFF file (e.g. via wget): +# +# http://td-blade.gurdon.cam.ac.uk/tad26/fly-tiffinScan-tiffin12.dm3.gff.gz + +# Thomas Down + +# +# 3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0 +# 3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0 +# 3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0 + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'BioTiffin'); + + #Set default feature_type and feature_set config + + #We need to capture version/release/data of external feature sets. + #This can be nested in the description? Need to add description to feature_set? + + $self->{static_config}{feature_types} = + { + 'BioTIFFIN Motif' => { + name => 'BioTIFFIN Motif', + class => 'Regulatory Motif', + description => 'BioTIFFIN motif', + } + }; + + $self->{static_config}{analyses} = + { + 'BioTIFFIN Motif' => { + -logic_name => 'BioTIFFIN Motif', + -description => 'BioTIFFIN regulatory motif database', + -display_label => 'BioTIFFIN motifs', + -displayable => 1, + }, + }; + + $self->{static_config}{feature_sets} = + { + 'BioTIFFIN Motif' => + { + feature_set => { + -feature_type => 'BioTIFFIN Motif', + -analysis => 'BioTIFFIN Motif', + }, + xrefs => 0, + } + }; + + + #Move xref flag here? + $self->{config} = { + 'BioTIFFIN Motif' => { + file => $ENV{'EFG_DATA'}.'/input/BioTIFFIN/fly-tiffinScan-tiffin12.dm3.gff', + gff_attrs => { + 'ID' => 1, + }, + }, + }; + + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + + + +sub parse_and_load { + my ($self, $files, $old_assembly, $new_assembly) = @_; + + if(scalar(@$files) != 1){ + throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files)); + } + + + my %slice_cache; + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + # this object is only used for projection + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'BioTIFFINProjection');#do we need this? + my $species = $self->db->species; + if(! $species){ + throw('Must define a species to define the external_db'); + } + #Just to make sure we hav homo_sapiens and not Homo Sapiens + ($species = lc($species)) =~ s/ /_/; + + + if(scalar @{$self->import_sets} != 1){ + throw('biotiffin parser currently only supports one import FeatureSet'); + } + + my ($import_set) = @{$self->import_sets}; + + + #foreach my $import_set(@{$self->import_sets}){ + $self->log_header("Parsing $import_set data"); + + my %motif_cache; # name -> factor_id + my $config = $self->{'config'}{$import_set}; + my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set}; + my %gff_attrs = %{$config->{'gff_attrs'}}; + + + # Parse motifs.txt file + #my $file = $config->{'file'}; + my $file = $files->[0]; + my $skipped = 0; + my $motif_cnt = 0; + my $factor_xref_cnt = 0; + my $feature_cnt = 0; + my $feature_target_cnt = 0; + + open (FILE, "<$file") || die("Can't open $file\n$!\n"); + + LINE: while (my $line = ) { + chomp $line; + + #GFF3 + #3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0 + #3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0 + #3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0 + #3R MotifScanner TIFDMEM0000001 17973965 17973975 0.0 + 0 + + #seq_name, source, feature, start, end, score, strand, frame, [attrs] + my ($chromosome, $program, $feature, $start, $end, $score, $strand, undef) = split /\t/o, $line; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + } else { + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for motif $feature;\n"; + $skipped++; + next; + } + + if(! exists $motif_cache{$feature}){ + + $motif_cache{$feature} = $ftype_adaptor->fetch_by_name($feature); + + if(! defined $motif_cache{$feature}){ + + ($motif_cache{$feature}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => $feature, + -class => $fset->feature_type->class, + -description => $fset->feature_type->description, + ))}; + + $motif_cnt ++; + } + } + + my $feature_type = $motif_cache{$feature}; + + #Now build actual feature + + $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $feature, + -start => $start, + -end => $end, + -strand => (($strand eq '+') ? 1 : -1), + -feature_type => $feature_type, + -feature_set => $fset, + -slice => $slice_cache{$chromosome}, + ); + + + # project if necessary + if ($new_assembly) { + $feature = $self->project_feature($feature, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + } + + ($feature) = @{$extf_adaptor->store($feature)}; + $feature_cnt++; + + + } + + close FILE; + + $self->log("Loaded ".$fset->name); + $self->log("$motif_cnt feature types"); + $self->log("$feature_cnt features"); + $self->log("Skipped $skipped features"); + +#} + + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/cisred.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/cisred.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,487 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::cisred; + +use strict; + +use File::Basename; + +# To get files for CisRed data, download the following 2 files (e.g. via wget): +# +# http://www.cisred.org/content/databases_methods/human_2/data_files/motifs.txt +# +# http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt + + +#No longer valid urls, now use the following for ensembl formats for all species: +#http://www.bcgsc.ca/downloads/cisred/temp/cisRED4Ensembl/ +#naminf may not be obvious, may have to cross reference with above previous urls to get build info + +# Format of motifs.txt (note group_name often blank) + +#name chromosome start end strand group_name ensembl_gene +#craHsap1 1 168129978 168129997 -1 1 ENSG00000000457 +#craHsap2 1 168129772 168129781 -1 2 ENSG00000000457 +#craHsap3 1 168129745 168129756 -1 3 ENSG00000000457 +#craHsap4 1 168129746 168129753 -1 4 ENSG00000000457 +#craHsap5 1 168129745 168129752 -1 5 ENSG00000000457 +#craHsap6 1 168129741 168129757 -1 6 ENSG00000000457 + + +# Format of search_regions.txt +# name chromosome start end strand ensembl_gene_id +# 1 17 39822200 39824467 -1 ENSG00000005961 +# 8 17 23151483 23153621 -1 ENSG00000007171 +# 14 1 166434638 166437230 -1 ENSG00000007908 +# 19 1 23602820 23605631 -1 ENSG00000007968 + + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw ); + + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + + + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'cisRED'); + + #Set default feature_type and feature_set config + $self->{static_config}{feature_types} = + { + 'cisRED Search Region' => { + -name => 'cisRED Search Region', + -class => 'Search Region', + -description => 'cisRED search region', + }, + 'cisRED Motif' => { + -name => 'cisRED Motif', + -class => 'Regulatory Motif', + -description => 'cisRED atomic motif', + }, + }; + + $self->{static_config}{analyses} = + { + cisRED => { + -logic_name => 'cisRED', + -description => 'cisRED motif search (www.cisred.org)', + -display_label => 'cisRED', + -displayable => 1, + }, + }; + + $self->{static_config}{feature_sets} = + { + 'cisRED search regions' => + { + analyses => $self->{static_config}{analyses}, + feature_types => $self->{static_config}{feature_types}, + feature_set => { + #feature_type and analysis here are the keys from above + -feature_type => 'cisRED Search Region', + -display_label => 'cisRED searches', + -analysis => 'cisRED', + }, + xrefs => 1, + }, + + + 'cisRED motifs' => + { + analyses => $self->{static_config}{analyses}, + feature_types => $self->{static_config}{feature_types}, + feature_set => { + #feature_type and analysis here are the keys from above + -feature_type => 'cisRED Motif', + -analysis => 'cisRED', + }, + xrefs => 1, + }, + }; + + #$self->validate_and_store_feature_types; + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + +#To do +# 1 This needs to take both file names, motifs, then search regions. Like the Bed/GFF importers do. + + +sub parse_and_load { + my ($self, $files, $old_assembly, $new_assembly) = @_; + $self->log_header("Parsing cisRED data"); + + if(scalar(@$files) != 2){ + throw('You must currently define a motif and search file to load cisRED features from:\t'.join(' ', @$files)); + } + + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); + #my %features_by_group; # name -> factor_id + my %groups; + my %slice_cache; + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + #my $display_name_cache = $self->build_display_name_cache('gene'); + # this object is only used for projection + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisREDProjection'); + + # ---------------------------------------- + # We need a "blank" factor for those features which aren't assigned factors + # Done this way to maintain referential integrity + #my $blank_factor_id = $self->get_blank_factor_id($db_adaptor); + + #More validation of files here? + my ($motif_file) = grep(/motif/, @$files); + my ($search_file) = grep(/search/, @$files); + my $species = $self->db->species; + if(! $species){ + throw('Must define a species to define the external_db'); + } + #Just to make sure we hav homo_sapiens and not Homo Sapiens + ($species = lc($species)) =~ s/ /_/; + + + # Parse motifs.txt file + $self->log_header("Parsing cisRED motifs from $motif_file"); + my $skipped = 0; + my $skipped_xref = 0; + #my $coords_changed = 0; + my $cnt = 0; + my $set = $self->{static_config}{feature_sets}{'cisRED motifs'}{feature_set}; + + + + open (FILE, "<$motif_file") || die "Can't open $motif_file"; + ; # skip header + + while () { + next if ($_ =~ /^\s*\#/o || $_ =~ /^\s*$/o); + chomp; + + #name chromosome start end strand group_name ensembl_gene + #craHsap1 1 168129978 168129997 - crtHsap40066,crtHsap40060 ENSG00000000457 + #craHsap2 1 168129772 168129781 - crtHsap40068,crtHsap40193,crtHsap40130 ENSG00000000457 + + #So we only ever have one atomic motif, which may belong to several groups + #Do not store atmoic motifs as feature types as this is the actual feature + #simply use the feature_set->feature_type and store the atmoic motif id as the name + + + my ($motif_name, $chromosome, $start, $end, $strand, $groups, $gene_id) = split/\t/o; + #($gene_id) = $gene_id =~ /(ENS.*G\d{11})/; + my @group_names = split/,/, $groups; + + #These are stranded features, so either - or +, never 0; + $strand = ($strand eq '-') ? -1 : 1; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for motif $motif_name\n"; + $skipped++; + next; + } + + + #get feature_type first + + #we are not maintaining this link in the DB! + #Do we need another xref for this or a different table? + + + #if ($group_name && $group_name ne '' && $group_name !~ /\s/o) { +# +# if(! exists $features_by_group{$group_name}){ +# $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name); +# +# if(! defined $features_by_group{$group_name}){ +# ($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new +# ( +# -name => 'crtHsap'.$group_name, +# -class => 'Regulatory Motif', +# -description => 'cisRED group', +# ))}; +# } +# } +# } + #}else{ + # throw("Found cisRED feature $motif_name with no group_name, unable to defined feature_type"); + #} + + foreach my $group(@group_names){ + + next if exists $groups{$group}; + + #else store the new group as a feature_type and set $group to be the feature_type + ($group) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => $group, + -class => 'Regulatory Motif', + -description => 'cisRED group', + ))}; + } + + + + #my $ftype = (defined $features_by_group{$group_name}) ? $features_by_group{$group_name} : $self->{'feature_sets'}{'cisRED group motifs'}->feature_type; + + + my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $motif_name, + -start => $start, + -end => $end, + -strand => $strand, + #-feature_type => $ftype, + -associated_feature_types => \@group_names, + -feature_set => $set, + -slice => $slice_cache{$chromosome}, + ); + + + + # project if necessary + if ($new_assembly) { + $feature = $self->project_feature($feature, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + + } + + ($feature) = @{$extf_adaptor->store($feature)}; + $cnt++; + + + + #We don't care so much about loading features for retired Genes here + #as the Genes are only used to define the search regions + #Not a direct alignment as with the miRanda set + + #However, adding an xref will create dead link in the browser + + #Build Xref here + if (! $gene_id) { + warn("No xref available for motif $motif_name\n"); + $skipped_xref++; + next; + } + + my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); + + #Handle release/version in xref version as stable_id version? + + my $dbentry = Bio::EnsEMBL::DBEntry->new + ( + -dbname => $species.'_core_Gene', + #-release => $self->db->_get_schema_build($self->db->dnadb), + #-release => '49_36b',#harcoded for human + -release => '49_37b', #hardcoded for mouse + -status => 'KNOWNXREF', + #-display_label_linkable => 1, + -db_display_name => 'EnsemblGene', + -type => 'MISC',#this is external_db.type + -primary_id => $gene_id, + -display_id => $display_name, + -info_type => 'MISC', + -info_text => 'GENE', + -linkage_annotation => 'cisRED motif gene', + -analysis => $set->analysis, + #could have version here if we use the correct dnadb to build the cache + ); + $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag + } + + + close FILE; + + $self->log("Stored $cnt cisRED ExternalFeature motif"); + $self->log("Skipped $skipped cisRED ExternalFeature motif imports"); + $self->log("Skipped an additional $skipped_xref DBEntry imports"); + + #Now store states + foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ + $set->adaptor->store_status($status, $set); + } + + + + # ---------------------------------------- + # Search regions + # read search_regions.txt from same location as $file + + #my $search_regions_file = dirname($file) . "/search_regions.txt"; + #my $search_file; + #($search_regions_file = $file) =~ s/motifs/searchregions/; + + $skipped = 0; + $cnt = 0; + $skipped_xref = 0; + $set = $self->{static_config}{feature_sets}{'cisRED search regions'}{feature_set}; + + $self->log_header("Parsing cisRED search regions from $search_file"); + open (SEARCH_REGIONS, "<$search_file") || die "Can't open $search_file"; + ; # skip header + + while () { + chomp; + my ($id, $chromosome, $start, $end, $strand, $gene_id) = split; + my $display_id = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); + my $name = "CisRed_Search_$id"; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for search region $name\n"; + next; + } + + + + + + my $search_feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $name, + -start => $start, + -end => $end, + -strand => $strand, + -feature_set => $set, + -slice => $slice_cache{$chromosome}, + ); + + + + # project if necessary + if ($new_assembly) { + $search_feature = $self->project_feature($search_feature); + + if(! defined $search_feature){ + $skipped ++; + next; + } + } + + $extf_adaptor->store($search_feature); + $cnt++; + + #Build Xref here + #need to validate gene_id here!! + + if (! $gene_id) { + warn("Can't get internal ID for $gene_id\n"); + $skipped_xref++; + next; + } + + my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); + + my $dbentry = Bio::EnsEMBL::DBEntry->new + ( + -dbname => $species.'_core_Gene', + #-release => $self->db->dnadb->dbc->dbname, + -status => 'KNOWNXREF', + #-display_label_linkable => 1, + #-db_display_name => $self->db->dnadb->dbc->dbname, + -db_display_name => 'EnsemblGene', + -type => 'MISC', + -primary_id => $gene_id, + -display_id => $display_name, + -info_type => 'MISC', + -info_text => 'GENE', + -linkage_annotation => 'cisRED search region gene',#omit? + -analysis => $set->analysis, + #could have version here if we use the correct dnadb to build the cache + ); + $dbentry_adaptor->store($dbentry, $search_feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag + } + + close(SEARCH_REGIONS); + + + $self->log("Stored $cnt cisRED search region ExternalFeatures"); + $self->log("Skipped $skipped cisRED search region ExternalFeatures"); + $self->log("Skipped an additional $skipped_xref cisRED search region DBEntry imports"); + + #No MART_DISPLAYABLE here + $set->adaptor->store_status('DISPLAYABLE', $set); + + + #print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly); + + return; + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/miranda.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/miranda.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,556 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::miranda; + +use strict; + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'miRanda'); + + #Set default feature_type and feature_set config + $self->{static_config}{feature_types} = + { + 'miRanda Target' => { + -name => 'miRanda Target', + -class => 'RNA', + -description => 'miRanda microRNA target', + }, + }; + + $self->{static_config}{analyses} = + { + miRanda => { + -logic_name => 'miRanda', + -description => 'miRanda microRNA target predictions', + -display_label => 'miRanda Targets', + -displayable => 1, + }, + }; + + $self->{static_config}{feature_sets}{'miRanda miRNA targets'} = + { + #analyses => $self->{static_config}{analyses}, + #feature_types => $self->{static_config}{feature_types}, + feature_set => + { + -feature_type => 'miRanda Target', + -display_name => 'miRanda Targets', + -description => $self->{static_config}{analyses}{miRanda}{-description}, + -analysis => 'miRanda', + }, + xrefs => 1, + }; + + + + + #$self->validate_and_store_feature_types; + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + +#TO DO +# In loop logging should only be enable with verbose or change to debug? +# Use count methods? +# Sort input to enable cache clearing(caches max out at ~200MB so not essential) +# Optimise slice cache testing(load only takes 8 min sso not essential) +# Add verbose logging/debug to show reassigned xrefs? + +sub parse_and_load{ + my ($self, $files, $old_assembly, $new_assembly) = @_; + + #Add num files to config and check this in BaseImporter(generically) + if(scalar(@$files) != 1){ + throw('You must currently define a single file to load miRanda features from:\t'.join(' ', @$files)); + } + + my $file = $files->[0]; + $self->log_header("Parsing miRanda data from:\t$file"); + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor(); + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $trans_adaptor = $self->db->dnadb->get_TranscriptAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $set = $self->{static_config}{feature_sets}{'miRanda miRNA targets'}{feature_set}; + + my ($dbentry, $schema_build, %features_by_name, %slice_cache, $ens_display_name, %feature_cache); + my (@txs, @sid_dnames, %xref_cache, %seen_mifeat_trans, %skipped_features, %failed_reassignment); + my ($overlaps_3_utr); + # this object is only used for projection + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'miRandaProjection'); + + #Some hairy counting to make sure the XREF reassignment + #is doing something sensible + #Can probably change all this to use the count methods + my $slice_skipped = 0; + my $old_mid_skipped = 0; + my $cnt = 0; + my $proj_skipped = 0; + my $seq_skipped = 0; + my $row_cnt = 0; + my $new_xrefs = 0; + my $old_as_new_xrefs = 0; + my $failed_new_sids = 0; + my $utr_failed_old_sids = 0; + my $no_proj_failed_old_sids = 0; + my $previously_skipped = 0; + my $with_overlapping_tx = 0; + my $retired_sids = 0; + my $existing_sids = 0; + my $total_xrefs = 0; + my $species = $self->db->species; + my $analysis = $set->analysis; + + + #Need to allow this to be passed + $schema_build ||= $self->db->_get_schema_build($self->db->dnadb); + + if(! $species){ + throw('Must define a species to define the external_db'); + } + #Just to make sure we hav homo_sapiens and not Homo Sapiens + ($species = lc($species)) =~ s/ /_/; + + my $edb_name = $species.'_core_Transcript', + + + + open (FILE, "<$file") || die "Can't open $file"; + + #We used to have redundant target features wrt xrefs + #Similarity mmu-miR-192 miRanda miRNA_target 5 127885168 127885188 - . 15.2253 4.048320e-03 ENSMUST00000031367 Slc15a4 + #Similarity mmu-miR-192 miRanda miRNA_target 5 127885168 127885188 - . 15.2397 3.945600e-03 ENSMUST00000075376 Slc15a4 + + #Have now changed this to nr features with multiple xrefs + #Hence have had to remove transcript sid from diplay_label + + #Several 'seen' caches have be implemented to prevent redundant storing + #Currently these span the whole data set, but could be cleaned after every feature + #if the input is sorted correctly + + #Need to make sure we are counting correctly + #skipped counts will reflect lines in input, not nr features + + + #Could add UnmappedObjects in here + + LINE: while () { + next LINE if ($_ =~ /^\s*\#/o || $_ =~ /^\s*$/o); + $row_cnt ++; + #Added next for old miRbase IDs. + + #Sanger + ##GROUP SEQ METHOD FEATURE CHR START END STRAND PHASE SCORE PVALUE_OG TRANSCRIPT_ID EXTERNAL_NAME + #Similarity mmu-miR-707 miRanda miRNA_target 2 120824620 120824640 + . 15.3548 2.796540e-02 ENST00000295228 INHBB + + + my ($group, $id, $method, $feature, $chr, $start, $end, $strand, undef, undef, undef, $ens_id, $display_name) = split; + #We never use $diplay_name now, as it never match the transcript display name + #which as the transcript number suffic attached to the gene display name + #e.g. BRCA2-001 + + #%seen_mifeat_trans handles redundancy between existant(i.e. not retired) sids in input + #file and those identified when trying to reannotated a retired sid + + #ALREADY ANNOTATED OR SKIPPED + if(exists $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$ens_id} ){ + $self->log("Skipping previously reannotated miRNA target:\t".$id.':'.$chr.':'.$start.':'.$end.' - '.$ens_id); + $old_as_new_xrefs ++; + } + elsif(exists $skipped_features{$id.':'.$chr.':'.$start.':'.$end}){ + $self->log("Skipping previous failed feature:\t".$id.':'.$chr.':'.$start.':'.$end."\t". + $skipped_features{$id.':'.$chr.':'.$start.':'.$end}); + $previously_skipped++; + #Could increment count in here to count old xrefs failed for each fail type + next; + } + + + #Added next for old miRbase IDs. + if ( $id =~ /\*$/o ){ + $self->log("Skipping old miRbase ID:\t$id"); + + $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Old invalid miRbase ID'; + $old_mid_skipped ++; + + #$old_xrefs_skipped ++; + #Just want to make sure we have comparable + #old skipped xrefs and re-annotated xrefs + #so this is not useful here + + + next LINE; + } + + $strand = ($strand =~ /\+/o) ? 1 : -1; + + #Now moved transript xref info exclusively to xrefs + ##my $id = $ens_id =~ s/[\"\']//g; # strip quotes + #my $id = $ens_id.':'.$seq; + + + #change this to only test once + #if exists and not defined then skip + + if(! defined $slice_cache{$chr}){ + + #Was originally limiting to chromosome + + if($old_assembly){ + $slice_cache{$chr} = $self->slice_adaptor->fetch_by_region(undef, + $chr, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chr} = $self->slice_adaptor->fetch_by_region(undef, $chr); + } + + if(! defined $slice_cache{$chr}){ + warn "Can't get slice $chr for sequence $id\n"; + + $slice_skipped ++; + $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = "Failed to fetch slice $chr"; + + #Add UnmappedObject here? + next LINE; + } + } + + + #We can add coding xref to feature type based on the miRbase name + #.e.g hsa-mir-24-1 + #However, this isn't stored as an xref + #It is stored in the gene.description + #e.g. hsa-mir-24-1 [Source:miRBase;Acc:MI0000080] + #Not easy to fetch as descriptions not indexed! + # + + #Cache/store FeatureType + + if(! exists $features_by_name{$id}){ + $features_by_name{$id} = $ftype_adaptor->fetch_by_name($id); + + if(! defined $features_by_name{$id}){ + ($features_by_name{$id}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => $id, + -class => 'RNA', + -description => $method.' '.$feature, + ))}; + + #Need to add source gene xref here to enable target conequences implied by source variation + + } + } + + + #Make sure we have the target transcript before we store the feature + #Have to do this as we can't always run with the correct core DB + #as it may be too old. Hence we have to hard code the edb.release + + + ##This should enever happen, as the search regions are defined by ens transcript + ##i.e there is always a ensembl iD + + if (! $ens_id) { + warn("No xref available for miRNA $id\n"); + $skipped_xref++; + next; + } + + + + if(exists $feature_cache{$id.':'.$chr.':'.$start.':'.$end}){ + $feature = $feature_cache{$id.':'.$chr.':'.$start.':'.$end}; + } + else{ + + $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $id, + -start => $start, + -end => $end, + -strand => $strand, + -feature_type => $features_by_name{$id}, + -feature_set => $set, + -slice => $slice_cache{$chr}, + ); + + # project if necessary + if ($new_assembly) { + + $feature = $self->project_feature($feature, $new_assembly); + + if (! defined $feature) { + $proj_skipped ++; + $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Failed projection'; + next; + } + + #This was failing as old assembly seqs are not stored, hence are returned as Ns + #if ($old_seq ne $feature->seq){ + # $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Projected sequence mismatch'; + # $seq_skipped++; + # next; + #} + + #Assembly mapping tends to ignore single seq mismatches, but does handle gaps. + #This is not generally a problemas a new assembly is normally a reshuffle of existing + #clones/contigs which will have exactly the same seq(actually there is a small amount of change) + #but this only affected a handful of transcripts + #old super(contigs) which have be integrated into a chromosome are not mapped + #so we will lose these + #the rest (retired clones/contigs and new clones/contigs) are aligned + #but again mismatches tend to be ignored. + + #There may be a tendency for this later case to contain more mismatches due to an entirely new + #clone seqeunce, hence let's filter/count these? + + #Tested and import using the following hack using old 67 DB to test seq mismatches + #only 1 failed. + #my $old_slice = $v67_sa->fetch_by_region( undef, $chr, $start, $end, $strand); + + #if ($old_slice->seq ne $feature->seq){ + # $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Projected sequence mismatch'; + # $seq_skipped++; + # next LINE; + #} + } + } + + + + ### DEFINE TRANSCRIPT XREF INFO + + #Check transcript exists + $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $ens_id, 'transcript'); + + + # ATTEMPT TO REANNOTATE + if (! defined $display_name){ # Transcript does not exist in current release + + #Set to 0 as we are not storing this sid + $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$ens_id} = 0; + + $self->log("$id $ens_id stable ID has been retired"); + $retired_sids ++; + + #Try and re-annotate on newer overlapping transcripts + @txs = @{$trans_adaptor->fetch_all_by_Slice($feature->feature_Slice)}; + @sid_dnames = (); + + #COUNT unique miRNA features which have failed reannotated + if(! exists $failed_reassignment{$id.':'.$chr.':'.$start.':'.$end}){ + $failed_reassignment{$id.':'.$chr.':'.$start.':'.$end} = 1; + } + + + + if (@txs) { #OVERLAPPING TRANSCRIPTS + $with_overlapping_tx ++; + + foreach my $tx(@txs){ + + #Check we have previously seen this xref + if(! exists $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$tx->stable_id}){ + + #Do UTR checking here + $overlaps_3_utr = 0; + + #Transcript will always return wrt to feature_Slice of miRNA feature. + #As miRNA are complimentary to the the mRNA, which is complimentary + #to the sense strand of the gene, these should always be on the same strand. + + #Could add some more detailed strand/UTR fail counts in here + #but leave for now + #$same_strand = 1; + + if($tx->seq_region_strand != $strand){ + #$same_strand = 0; + } + elsif($strand == 1){ + + if( ($end <= $tx->seq_region_end) && + ($start > $tx->coding_region_end) ){ + $overlaps_3_utr = 1; + } + + } + else{#Must be -1 + + if( ($end < $tx->coding_region_start) && + ($start >= $tx->seq_region_start) ){ + $overlaps_3_utr = 1; + } + } + + #could count no utr match here if we set same_strand boolean + + if($overlaps_3_utr){ + $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, + $tx->stable_id, + 'transcript'); + push @sid_dnames, [$tx->stable_id, $display_name]; + $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$tx->stable_id} = 1; + #want to count re-annotated xrefs here + #These may have been represented in the original file + $new_xrefs ++; + } + else{ + # FAILED annotate new sid + # This currently includes +ve and -ve strand transcripts + $failed_new_sids ++; + } + } + + if(! @sid_dnames){ + #FAILED TO REANNOTATE retired sid xref + $utr_failed_old_sids ++; + + next LINE; + } + } + } + else{ #FAILED TO REANNOTATE XREF - NO OVERLAPPING TRANSCRIPTS + $no_proj_failed_old_sids ++; + next LINE; + } + + + #COUNT unique miRNA features which have failed reannotated + if(@sid_dnames){ + $failed_reassignment{$id.':'.$chr.':'.$start.':'.$end} = 0 + } + + } + else { #Add xref for existing transcript + $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end} = {$ens_id => 1}; + #$display_name = $ens_display_name; + @sid_dnames = ([$ens_id, $display_name]); + $existing_sids++; + } + + + #shouldn't need this if as we call next for everycase above + #if (@xref_details) { + #Only if we have target transcripts + + # STORE FEATURE + if (! exists $feature_cache{$id.':'.$chr.':'.$start.':'.$end}){ + ($feature) = @{$extf_adaptor->store($feature)}; + $feature_cache{$id.':'.$chr.':'.$start.':'.$end} = $feature; + $cnt++; + } + + + # STORE XREFS + foreach my $xref_info(@sid_dnames) { + + #Handle release/version in xref version as stable_id version? + + $dbentry = Bio::EnsEMBL::DBEntry->new + ( + -dbname => $edb_name, + -release => $schema_build, + #-release => '58_37k',#'46_36h', #Hard coded due to schema to old to use with API + -status => 'KNOWNXREF', + #-display_label_linkable => 1, + -db_display_name => 'EnsemblTranscript', + -type => 'MISC', + -primary_id => $xref_info->[0], + -display_id => $xref_info->[1], + -info_type => 'MISC', + -info_text => 'TRANSCRIPT', + -linkage_annotation => 'miRanda target - negative influence', + #could have version here if we use the correct dnadb to build the cache + -analysis => $analysis, + ); + + $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1); #1 is ignore release flag + $total_xrefs++; + } + } + + close FILE; + + #scalar context returns count + my $miRNA_failed_reassignment = grep/1/, values %failed_reassignment; + my $miRNA_skipped = $old_mid_skipped + $slice_skipped + + $proj_skipped + $seq_skipped + $miRNA_failed_reassignment; + + $self->log_header($set->name." Import Report"); + $self->log(sprintf("%-090s", "miRNA feature:target pairs seen(i.e. input rows):").$row_cnt); + $self->log(sprintf("%-090s","Stored NR miRanda miRNA ExternalFeatures:").$cnt); + $self->log(sprintf("%-090s","Total skipped miRanda miRNA target features(inc reassigned):").$miRNA_skipped); + $self->log(sprintf("%-090s","Old miRbase IDs skipped").$old_mid_skipped); + $self->log(sprintf("%-090s","Skipped features on unknown slice:").$slice_skipped."\n"); + + if($new_assembly){ + $self->log(sprintf("%-090s","Skipped due to failed assembly projection:").$proj_skipped); + $self->log(sprintf("%-090s","Skipped due to seq mismatch for assembly projection:")."$seq_skipped\n"); + } + + $self->log("The following numbers are counted from the mappable/valid miRNA target features"); + $self->log(sprintf("%-090s", "Total stored Transcript xrefs(current and retired re-assigned):"). $total_xrefs); + $self->log(sprintf("%-090s", "Total current Transcript xrefs:"). $existing_sids); + $self->log(sprintf("%-090s", "Total skipped miRanda miRNA target xrefs:").($previously_skipped + $miRNA_skipped)); + $self->log(sprintf("%-090s", "Retired Transcript xrefs:").$retired_sids); + $self->log(sprintf("%-090s", "Unique miRNA features which completely failed reassignment:").$miRNA_failed_reassignment); + $self->log(sprintf("%-090s", "Total new Xrefs assigned due to retired Transcript:").$new_xrefs); + $self->log(sprintf("%-090s", "Previously assigned Transcript Xrefs skipped:").$old_as_new_xrefs); + $self->log(sprintf("%-090s", "Retired Transcript Xrefs with no new overlapping Transcript:").$no_proj_failed_old_sids); + $self->log(sprintf("%-090s", "Retired Transcript Xrefs with new overlapping Transcript(s):").$with_overlapping_tx); + $self->log(sprintf("%-090s", "Retired Transcript Xrefs with new overlapping Transcript(s), all fail 3'UTR/strand test:"). + $utr_failed_old_sids); + #This figure may include transcripts which would have failed in the original set! + $self->log(sprintf("%-090s","Total new Transcript xrefs considered which fail 3' UTR/strand test:"). $failed_new_sids); + $self->log(sprintf("%-090s","True new Xref assignments due to retired Transcripts:").($new_xrefs - $old_as_new_xrefs)); + + #We also want unique miRNA feature which we re-assigned + #Hard to calculate as the re-assignment might be to a current transcript in the input + + #Now set states + foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ + $set->adaptor->store_status($status, $set); + } + + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/redfly.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/redfly.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,430 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::redfly; + +use strict; + +use File::Basename; + +# To get files for REDfly, download the following 2 GFF3 files (e.g. via wget): +# +# http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff +# http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff + +#contact? + +#TFBS +#2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp"; +#2L REDfly regulatory_region 2456352 2456369 . . . ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp"; + +#CRMs +#2L REDfly regulatory_region 2455781 2457764 . . . ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304"; +#2L REDfly regulatory_region 2445769 2446581 . . . ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051"; + + + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + + + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'REDfly'); + + #Set default feature_type and feature_set config + + #We need to capture version/release/data of external feature sets. + #This can be nested in the description? Need to add description to feature_set? + + $self->{static_config}{feature_types} = + { + 'REDfly TFBS' => { + -name => 'REDfly TFBS', + -class => 'Transcription Factor', + -description => 'REDfly transciption factor binding site', + }, + 'REDfly CRM' => { + -name => 'REDfly CRM', + -class => 'Regulatory Motif', + -description => 'REDfly cis regulatory motif', + }, + }; + + + $self->{static_config}{analyses} = + { + 'REDfly TFBS' => { + -logic_name => 'REDfly TFBS', + -description => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)', + -display_label => 'REDfly TFBS', + -displayable => 1, + }, + + 'REDfly CRM' => { + -logic_name => 'REDfly CRM', + -description => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)', + -display_label => 'REDfly CRM', + -displayable => 1, + }, + }; + + $self->{static_config}{feature_sets} = + { + 'REDfly TFBSs' => { + feature_set => + { + -feature_type => 'REDfly TFBS', + -analysis => 'REDfly TFBS', + }, + xrefs => 1, + }, + + 'REDfly CRMs' => { + feature_set => + { + -feature_type => 'REDfly CRM', + -analysis => 'REDfly CRM', + }, + + xrefs => 1, + }, + }; + + + #Move xref flag here? + $self->{config} = { + 'REDfly CRMs' => { + #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_crm.gff', + gff_attrs => { + 'ID' => 1, + }, + }, + + 'REDfly TFBSs' => { + #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_tfbs.gff', + gff_attrs => { + 'ID' => 1, + 'Factor' => 1, + 'Target' => 1, + }, + desc_suffix => ' binding site', + } + }; + + + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + + + +sub parse_and_load { + my ($self, $files, $old_assembly, $new_assembly) = @_; + + if(scalar(@$files) != 2){ + throw('You must currently define a crm and tfbs file to load redfly features from:\t'.join(' ', @$files)); + } + + #More validation of files here? + $self->{config}{'REDfly CRMs'}{file} = grep(/crm/, @$files); + $self->{config}{'REDfly TFBSs'}{file} = grep(/tfbs/, @$files); + + my %slice_cache; + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + # this object is only used for projection + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this? + my $species = $self->db->species; + + if(! $species){ + throw('Must define a species to define the external_db'); + } + + #Just to make sure we hav homo_sapiens and not Homo Sapiens + ($species = lc($species)) =~ s/ /_/; + + + foreach my $import_set(@{$self->import_sets}){ + $self->log_header("Parsing $import_set data"); + + my %factor_cache; # name -> factor_id + my %target_cache; + my $config = $self->{'config'}{$import_set}; + my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set}; + my %gff_attrs = %{$config->{'gff_attrs'}}; + + + # Parse motifs.txt file + my $file = $config->{'file'}; + my $skipped = 0; + my $factor_cnt = 0; + my $factor_xref_cnt = 0; + my $feature_cnt = 0; + my $feature_target_cnt = 0; + + open (FILE, "<$file") || die("Can't open $file\n$!\n"); + ; # skip header + + LINE: while (my $line = ) { + next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o); + chomp $line; + my %attr_cache;#Can we move this outside the loop and rely on it being reset each time? + + + #GFF3 + #Is this format valid, missing " after REDfly xref + #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp"; + #seq_name, source, feature, start, end, score, strand, frame, [attrs] + my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line; + my @attrs = split/\;\s+/o, $attrs; + + + #UCSC coords + $start ++; + $end ++; + + + + foreach my $gff_attr(keys %gff_attrs){ + + if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){ + $attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/; + + #warn "attr cache is $attr_cache{$gff_attr} "; + + } + else{ + warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line"; + next LINE; + } + } + + + #For TFBS + #Factor = coding gene name display_label + #Target = Target gene? + #Ignore other xrefs for name, just put ID in feature as display_label + + #These are mixed up! and where not getting any coding xrefs! + + + #For CRM + #Can we split the ID and have Reguatory XREF? + #e.g. ID="dpp_dpp813"; => dpp + + + + + #This can be moved to the BaseExternalParser + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n"; + $skipped++; + next; + } + + + #get feature_type first + + #we are not maintaining this link in the DB! + #Do we need another xref for this or a different table? + + my $feature_type; + + #TFBSs + if(exists $attr_cache{'Factor'}){ + + if(! exists $factor_cache{$attr_cache{'Factor'}}){ + + $factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'}); + + if(! defined $factor_cache{$attr_cache{'Factor'}}){ + + #Would need to add CODING DBEntry here! + #Will this work on a scalar ref to a hash? + my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef; + + ($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => $attr_cache{'Factor'}, + -class => $fset->feature_type->class, + -description => $desc, + ))}; + + $feature_type = $factor_cache{$attr_cache{'Factor'}}; + $factor_cnt ++; + my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'}); + + #Handle release/version in xref version as stable_id version? + + if(! defined $stable_id){ + warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'}."\n"; + }else{ + #warn "got $stable_id for ".$attr_cache{'Factor'}; + my $dbentry = Bio::EnsEMBL::DBEntry->new( + -dbname => $species.'_core_Gene', + #-release => $self->db->dnadb->dbc->dbname, + -status => 'KNOWNXREF',#This is for the external DB + #-display_label_linkable => 1, + -#db_display_name => $self->db->dnadb->dbc->dbname, + -db_display_name => 'EnsemblGene', + -type => 'MISC',#Is for the external_db + -primary_id => $stable_id, + -display_id => $attr_cache{'Factor'}, + -info_type => 'MISC', + -into_text => 'GENE', + -linkage_annotation => 'REDfly Coding' + -analysis => $fset->analysis, + + #-description => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly + #could have version here if we use the correct dnadb to build the cache + ); + + $dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag + $factor_xref_cnt ++; + } + } + } + } + else{ + #CRMs + $feature_type = $fset->feature_type; + } + + + #Now build actual feature + $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $attr_cache{'ID'}, + -start => $start, + -end => $end, + -strand => 0, + -feature_type => $feature_type, + -feature_set => $fset, + -slice => $slice_cache{$chromosome}, + ); + + # project if necessary + if ($new_assembly) { + $feature = $self->project_feature($feature, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + } + + ($feature) = @{$extf_adaptor->store($feature)}; + $feature_cnt++; + + my $target = (exists $attr_cache{'Target'}) ? $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0]; + my $stable_id; + + if($target ne 'Unspecified'){ + $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target); + } + + + if(! defined $stable_id){ + warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'}."\n" if $target ne 'Unspecified'; + } + else{ + #Handle release/version in xref version as stable_id version? + my $dbentry = Bio::EnsEMBL::DBEntry->new( + -dbname => $species.'_core_Gene', + #-release => $self->db->dnadb->dbc->dbname, + -status => 'KNOWNXREF', + #-display_label_linkable => 1, + -#db_display_name => $self->db->dnadb->dbc->dbname, + -db_display_name => 'EnsemblGene', + -type => 'MISC',# + -primary_id => $stable_id, + -display_id => $target, + -info_type => 'MISC', + -info_text => 'GENE', + -linkage_annotation => $fset->feature_type->name.' Target', + -analysis => $fset->analysis, + + #could have version here if we use the correct dnadb to build the cache + ); + + $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag + + $feature_target_cnt ++; + } + } + + close FILE; + + $self->log("Loaded ".$fset->name); + $self->log("$factor_cnt feature types"); + $self->log("$factor_xref_cnt feature type coding xrefs"); + $self->log("$feature_cnt features"); + $self->log("$feature_target_cnt feature target xrefs"); + $self->log("Skipped $skipped features"); + + } + + return; +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,246 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::vista; + +use strict; + +# Parse data from LBL enhancers, see http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1 +# e.g. +# +# >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12] +# AACTGAAGGGACCCCGTTAGCATAtaaacaaaaggtggggggtagccccgagcctcttct +# ctgacagccagtggcggcagtgatgaatttgtgaagttatctaattttccactgttttaa +# ttagagacttgggctctgaggcctcgcagctggcttctttgtgctgtattctgttgcctg +# acagag + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::Utils::Exception qw( throw ); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'Vista'); + + #Set default feature_type and feature_set config + $self->{static_config}{feature_types} = {( + 'VISTA Target' => { + -name => 'VISTA Target', + -class => 'Search Region', + -description => 'VISTA target region', + }, + 'VISTA Enhancer' => { + -name => 'VISTA Enhancer', + -class => 'Enhancer', + -description => 'Enhancer identified by positive VISTA assay', + }, + 'VISTA Target - Negative' => { + -name => 'VISTA Target - Negative', + -class => 'Search Region', + -description => 'Enhancer negative region identified by VISTA assay', + }, + )}; + + $self->{static_config}{analyses} = { + VISTA => { + -logic_name => 'VISTA', + -description => 'VISTA Enhancer Assay (http://enhancer.lbl.gov/)', + -display_label => 'VISTA', + -displayable => 1, + }, + }; + + #This is used as the entry point to store/validate + #So all of the above needs to be referenced in here + $self->{static_config}{feature_sets} = { + 'VISTA enhancer set' => + { + #Stored in this order + + #Entries here are flexible + #Can be omited if defined in feature_set + #top level analyses/feature_types definition required if no DB defaults available + #These can be a ref to the whole or subset of the top level analyses/feature_types hash + #A key with an empty hash or undef(with or without a matching key in the top level analyses/feature_types hash + + #analyses => $self->{static_config}{analyses}, + feature_types => $self->{static_config}{feature_types}, + + #feature_type and analysis values must be string key to top level hash + #This wont work for feature_types as they are not unique by name!!!!!! + #This is why we have top level hash where we can define a unique compound key name + + feature_set => + { + -feature_type => 'VISTA Target',#feature_types config key name not object + -display_label => 'VISTA Enhancers', + -description => 'Experimentally validated enhancers', + -analysis => 'VISTA',#analyses config key name not object + }, + } + }; + + #$self->validate_and_store_feature_types; + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + + +sub parse_and_load{ + my ($self, $files, $old_assembly, $new_assembly) = @_; + + if(scalar(@$files) != 1){ + throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files));; + } + + my $file = $files->[0]; + $self->log_header("Parsing and loading LBNL VISTA enhancer data from:\t$file"); + + my $extfeat_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'EnhancerProjection'); # this object is only used for projection + my $fset_config = $self->{static_config}{feature_sets}{'VISTA enhancer set'}; + my $feature_positive = $fset_config->{'feature_types'}{'VISTA Enhancer'}; + my $feature_negative = $fset_config->{'feature_types'}{'VISTA Target - Negative'}; + my $set = $fset_config->{feature_set}; + + use Bio::EnsEMBL::Registry; + my %id_prefixes = ( + homo_sapiens => 'hs', + mus_musculus => 'mm', + ); + + my $species = Bio::EnsEMBL::Registry->get_alias($self->db->species); + + if( (! defined $species) || + (! exists $id_prefixes{$species}) ){ + throw("Failed to get a VISTA ID prefix for species alias:\t$species"); + } + + $species = $id_prefixes{$species}; + + ### Read file + open (FILE, "<$file") || die "Can't open $file"; + my $cnt = 0; + my $skipped = 0; + + + while () { + + next if ($_ !~ /^>/o); # only read headers + + # OLD >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12] + # v66 >Mouse|chr12:112380949-112381824 | element 3 | positive | neural tube[4/4] | hindbrain (rhombencephalon)[4/4] | forebrain[4/4] + + #ID naming scheme change from LBNL-1 to hs1 or mm1 + #But the flat file and url use two different naming schemes! + #VISTA URL is: where experiment id is element number and species_id 1 = human and 2 = mouse + #http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=1&organism_id=1 + + #Add links to cell_type for tissues in @expression_pattern? + #This would be vista specific cell_type_annotation? Or we could just have associated_cell_type? (without annotation) + #Just link for now + + my (undef, $coords, $element, $posneg, @expression_patterns) = split /\s*\|\s*/o;#was \s+ + # parse co-ordinates & id + my ($chr, $start, $end) = $coords =~ /chr([^:]+):(\d+)-(\d+)/o; + my ($element_number) = $element =~ /\s*element\s*(\d+)/o; + + # seq_region ID and co-ordinates + my $chr_slice; + + if ($old_assembly) { + $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly); + } else { + $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr); + } + + if (!$chr_slice) { + warn "Can't get slice for chromosme $chr\n"; + next; + } + + my $seq_region_id = $chr_slice->get_seq_region_id; + throw("Can't get seq_region_id for chromosome $chr") if (!$seq_region_id); + + # Assume these are all on the positive strand? Is this correct? + my $strand = 1; + + + my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -start => $start,#is this in UCSC coords? + -end => $end, #is this in UCSC coords? + -strand => $strand, + -feature_type => $posneg eq 'positive' ? $feature_positive : $feature_negative, + -slice => $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, $strand, $old_assembly), + -display_label => $species.$element_number,#"LBNL-$element_number", + -feature_set => $set, + ); + + + # project if necessary + if ($new_assembly) { + + $feature = $self->project_feature($feature, $dummy_analysis, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + } + + $cnt ++; + $extfeat_adaptor->store($feature); + } + + close FILE; + + $self->log('Parsed '.($cnt+$skipped).' features'); + $self->log("Loaded $cnt features"); + $self->log("Skipped $skipped features"); + + #Now set states + foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ + $set->adaptor->store_status($status, $set); + } + + + return; + +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Probe.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Probe.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,807 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::Probe +# + + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Probe - A module to represent a nucleotide probe. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::Probe; + +# + +my $probe = Bio::EnsEMBL::Funcgen::Probe->new( + -PROBE_SET => $probe_set, + -NAME => 'Probe-1', + -ARRAY => $array, + -ARRAY_CHIP_ID => $ac_dbid, + -CLASS => "EXPERIMENTAL", +); + +=head1 DESCRIPTION + +An Probe object represents an probe on a microarray. The data (currently the +name, probe_set_id, length, pair_index and class) are stored +in the oligo_probe table. + +For Affy arrays, a probe can be part of more than one array, but only part of +one probeset. On each Affy array the probe has a slightly different name. For +example, two different complete names for the same probe might be +DrosGenome1:AFFX-LysX-5_at:535:35; and Drosophila_2:AFFX-LysX-5_at:460:51;. In +the database, these two probes will have the same oligo_probe_id. Thus the same +Affy probe can have a number of different names and complete names depending on +which array it is on. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::Probe; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-NAME] : string - probe name + Used when the probe is on one array. + Arg [-NAMES] : Listref of strings - probe names + Used when the probe is on multiple arrays. + Arg [-ARRAY] : Bio::EnsEMBL::Funcgen::Array + Used when the probe is on one array. + Arg [-ARRAYS] : Listref of Bio::EnsEMBL::Funcgen::Array + Used when the probe is on multiple arrays. + Arg [-ARRAY_CHIP_ID] : int - array_chip db ID + Used when the probe is on one array. + Arg [-ARRAY_CHIP_IDS] : Listref of ints - array_chip dbIDs + Used when the probe is on multiple array chips + Arg [-NAMES] : Listref of ints - arary_chip db IDs + Used when the probe is on multiple arrays. + Arg [-PROBE_SET] : Bio::EnsEMBL::ProbeSet + Each probe is part of one(and only one) probeset, if not probe set + then probeset = probe i.e. probe_set size = 1 + Arg [-LENGTH] : int - probe length + Will obviously be the same for all probes if same probe + is on multiple arrays. + Arg [-CLASS] : string - probe class e.g. CONTROL, EXPERIMENTAL + Will be the same for all probes if same probe is on + multiple arrays. + Arg [-DESCRIPTION] : (optional) string - description + + + Example : my $probe = Bio::EnsEMBL::Probe->new( + -NAME => 'Probe-1', + -PROBE_SET => $probe_set, + -ARRAY => $array, + -ARRAY_CHIP_ID => $array_chip_id, + -LENGTH => 25, + -CLASS => 'EXPERIMENTAL', + -DESCRIPTION => 'Some useful description', + + ); + Description: Creates a new Bio::EnsEMBL::Probe object. + Returntype : Bio::EnsEMBL::Probe + Exceptions : Throws if not supplied with probe name(s) and array(s) + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ( + $names, $name, + $array_chip_ids, $array_chip_id, + $arrays, $array, + $probeset, $aclass, + $length, $desc + ) = rearrange([ + 'NAMES', 'NAME', + 'ARRAY_CHIP_IDS', 'ARRAY_CHIP_ID', + 'ARRAYS', 'ARRAY', + 'PROBE_SET', 'CLASS', + 'LENGTH', 'DESCRIPTION' + ], @_); + + + @$names = ($name) if(ref($names) ne "ARRAY"); + @$array_chip_ids = ($array_chip_id) if (ref($array_chip_ids) ne "ARRAY"); + @$arrays = ($array) if (ref($arrays) ne "ARRAY"); + + #We need to record duplicates for each probe_set i.e. each array. + #the relationship is really array_chip to name, as everything else stays the same + #can't have same probe_set_id as this wouldn't maintain relationship + #need unique ps id's or array_chip_id in probe table? + #Then we can miss probeset id's out totally if required + #or should we just duplicate everything with unique db IDs + + + if (defined $$names[0]) { + + if(scalar(@$names) != scalar(@$array_chip_ids)){ + throw("You have not specified valid name:array_chip_id pairs\nYou need a probe name for each Array"); + } + + if(defined $$arrays[0]){ + if(scalar(@$names) != scalar(@$arrays)){ + throw("You have not specified valid name:Array pairs\nYou need a probe name for each Array\n"); + } + } + else{ + warn("You have not specified and Array objects, this will result in multiple/redundant queries based on the array_chip_id\nYou should pass Array objects to speed up this process"); + #Is this true? We should cache this in the ArrayChip and make sure we're caching it in the caller. + } + + # Probe(s) have been specified + # Different names reflect different array + + for my $i(0..$#{$names}){ + $self->add_array_chip_probename($$array_chip_ids[$i], $$names[$i], $$arrays[$i]); + } + } else { + throw('You need to provide a probe name (or names) to create an Probe'); + } + + $self->probeset($probeset) if defined $probeset; + $self->class($aclass) if defined $aclass; + $self->length($length) if defined $length; + $self->description($desc) if defined $desc; + + return $self; +} + +#only takes single values for array and array_chip +#as we're shortcuting the constructor and simply blessing the hash +#therefore attr keys should not be lc and not prefix with '-' + +=head2 new_fast + + Args : Hashref with all internal attributes set + Example : none + Description: Quick and dirty version of new. Only works if the code is very + disciplined. Cannot add array chip probe names unless we recreate + the data structure in the caller. + Returntype : Bio::EnsEMBL::Funcgen::Probe + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub new_fast { + bless ($_[1], $_[0]); +} + + + +=head2 add_array_chip_probename + + Arg [1] : int - db ID of array_chip + Arg [2] : string - probe name + Arg [3] : Bio::EnsEMBL::Funcgen::Array + Example : $probe->add_array_chip_probename($ac_dbid, $probename, $array); + Description: Adds a probe name / array pair to a probe, allowing incremental + generation of a probe. + Returntype : None + Exceptions : None + Caller : General, + Probe->new(), + ProbeAdaptor->_obj_from_sth(), + AffyProbeAdaptor->_obj_from_sth() + Status : Medium Risk - Change to take ArrayChip object. + +=cut + +sub add_array_chip_probename { + my $self = shift; + my ($ac_dbid, $probename, $array) = @_; + $self->{ 'arrays' } ||= {}; + $self->{ 'probenames' } ||= {}; + + #mass redundancy here, possibility of fetching same array over and over!!!!!!!!!!!!!! + #Need to implement cache in caller i.e. adaptor + #Made mandatory to force creation of cache + #we need access to adaptor before we can test is valid and stored + #let's no test each time for adaptor as this would slow down + #Just test here instead. + + if(! (ref($array) && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID)){ + #$array = $self->adaptor()->db()->get_ArrayAdaptor()->fetch_by_array_chip_dbID($ac_dbid); + throw('You must pass a valid Bio::EnsEMBL::Funcgen::Array. Maybe you want to generate a cache in the caller?'); + } + + #mapping between probename and ac_dbid is conserved through array name between hashes + #only easily linked from arrays to probenames,as would have to do foreach on array name + + #Can we change the implementation of this so we're only storing the array once, reverse + #the cache? But we want access to the array and using an object reference as a key is ???? + #How would this impact on method functionality? + + #We now handle multiple names per probe/array + #This will not capture the relationship between + #probe name and position on array! + #Not a problem for affy as name is position + #Currently not a problem for nimblegen as probes never have more than 1 name??? + + $self->{ 'arrays' }->{$ac_dbid} = $array; + + $self->{ 'probenames' }->{$array->name()} ||= []; + push @{$self->{ 'probenames' }->{$array->name()}}, $probename; + + return; +} + + +=head2 get_all_ProbeFeatures + + Args : None + Example : my $features = $probe->get_all_ProbeFeatures(); + Description: Get all features produced by this probe. The probe needs to be + database persistent. + Returntype : Listref of Bio::EnsEMBL:Funcgen::ProbeFeature objects + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_all_ProbeFeatures { + my $self = shift; + if ( $self->adaptor() && $self->dbID() ) { + return $self->adaptor()->db()->get_ProbeFeatureAdaptor()->fetch_all_by_Probe($self); + } else { + warning('Need database connection to retrieve Features'); + return []; + } +} + +=head2 get_all_Arrays + + Args : None + Example : my $arrays = $probe->get_all_Arrays(); + Description: Returns all arrays that this probe is part of. Only works if the + probe was retrieved from the database or created using + add_Array_probename (rather than add_arrayname_probename). + Returntype : Listref of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_all_Arrays { + my $self = shift; + + #Arrays are currently preloaded using a cache in _objs_from_sth + return [ values %{$self->{'arrays'}} ]; +} + +=head2 get_names_Arrays + + Args : None + Example : my %name_array_pairs = %{$probe->get_names_Arrays}; + Description: Returns Array name hash + Returntype : hashref of probe name Bio::EnsEMBL::Funcgen::Array pairs + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_names_Arrays { + my $self = shift; + + #Arrays are currently preloaded using a cache in _objs_from_sth + return $self->{'arrays'}; +} + + + + + +=head2 get_all_probenames + + Arg [1] : Optional - list of array names, defaults to all available + Example : my @probenames = @{$probe->get_all_probenames()}; + Description: Retrieves all names for this probe. Only makes sense for probes + that are part of a probeset (i.e. Affy probes), in which case + get_all_complete_names() would be more appropriate. + Returntype : Listref of strings + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_all_probenames { + my ($self, @array_names) = @_; + + my @names; + @array_names = keys %{$self->{'probenames'}} if ! @array_names; + + foreach my $array(@array_names){ + push @names, @{$self->{'probenames'}->{$array}}; + } + + return \@names; +} + + + +=head2 get_probename + + Arg [1] : string - array name + Example : my $probename = $probe->get_probename('Array-1'); + Description: For a given array, retrieve the name for this probe. + Returntype : string + Exceptions : Throws if the array name is required but not specified + Warns if probe has more than one name for the given array. + Caller : General + Status : Medium Risk + +=cut + + +#we can have dulplicate probes on same array for Nimblegen +#what defines and unique probe? +#If we have a duplicate on the same array or even on the same array_chip, then we can still return the same name +#Needs more work + +sub get_probename { + my ($self, $arrayname) = @_; + + + my $probename; + + if (! $arrayname){ + + #Sanity check that there is only one non-AFFY array + my @ac_ids = keys %{$self->{'arrays'}}; + + if((scalar @ac_ids == 1) && ($self->get_all_Arrays()->[0]->vendor() ne "AFFY")){ + $arrayname = $self->get_all_Arrays()->[0]->name(); + } + else{ + throw('Cannot retrieve name for Probe('.$self->dbID.") without arrayname if more than 1 array chip(@ac_ids) and not NIMBELGEN(".$self->get_all_Arrays()->[0]->vendor().")\n"); + } + } + + + #Need to check if this exists before derefing + #Warn here? + return if(! exists ${$self->{'probenames'}}{$arrayname}); + + + my @names = @{$self->{'probenames'}->{$arrayname}}; + + + if(scalar(@names) > 1){ + my $p_info = ''; + + if($self->probeset){ + $p_info = " probeset ".$self->probeset->name; + } + + warn("Found replicate probes with different names for array ${arrayname}${p_info}.Returning comma separated string list:\t".join(',', @names)."\n"); + return join(',', @names); + + } + else{ + ($probename) = @{$self->{'probenames'}->{$arrayname}}; + } + + return $probename; +} + + + +=head2 get_all_complete_names + + Args : None + Example : my @compnames = @{$probe->get_all_complete_names()}; + Description: Retrieves all complete names for this probe. The complete name + is a concatenation of the array name, the probeset name and the + probe name. + Returntype : Listref of strings + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_all_complete_names { + my $self = shift; + + my ($probeset, @result); + my $pset = $self->probeset; + + if($pset){ + $probeset = $pset->name; + } + + $probeset .= ':' if $probeset; + + + #warn "For Nimblegen this need to be Container:Seqid::probeid?"; + + while ( my (undef, $array) = each %{$self->{'arrays'}} ) { + #would have to put test in here for $self->arrays()->vendor() + #if($array->vendor() eq "AFFY"){ + + foreach my $name(@{$self->{'probenames'}{$array->name()}}){ + + push @result, $array->name().":$probeset".$name; + } + } + + return \@result; +} + + + +#For affy this matters as name will be different, but not for Nimblegen +#Need to consolidate this +#have get name method which throws if there is more than one array +#detects array vendor and does appropriate method + +=head2 get_complete_name + + Arg [1] : string - array name + Example : my $compname = $probe->get_complete_name('Array-1'); + Description: For a given array, retrieve the complete name for this probe. + Returntype : string + Exceptions : Throws if the array name not specified or not known for this probe + Caller : General + Status : Medium Risk + +=cut + +sub get_complete_name { + my $self = shift; + my $arrayname = shift; + + + throw('Must provide and array name argument to retreive the complete name') if ! defined $arrayname; + + my $probename = $self->get_probename($arrayname); + + if (!defined $probename) { + throw('Unknown array name'); + } + + my $probeset = $self->probeset()->name(); + $probeset .= ':' if $probeset; + + return "$arrayname:$probeset$probename"; +} + +=head2 probeset + + Arg [1] : (optional) Bio::EnsEMBL::Funcgen::ProbeSet + Example : my $probe_set = $probe->probeset(); + Description: Getter and setter of probe_set attribute for Probe objects. + Returntype : Bio::EnsEMBL::Funcgen::ProbeSet + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub probeset { + my $self = shift; + + $self->{'probe_set'} = shift if @_; + return $self->{'probe_set'}; +} + +=head2 class + + Arg [1] : (optional) string - class + Example : my $class = $probe->class(); + Description: Getter and setter of class attribute for Probe + objects e.g. CONTROL, EXPERIMENTAL + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub class { + my $self = shift; + $self->{'class'} = shift if @_; + return $self->{'class'}; +} + +=head2 length + + Arg [1] : (optional) int - probe length + Example : my $probelength = $probe->length(); + Description: Getter and setter of length attribute for Probe + objects. + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub length { + my $self = shift; + $self->{'length'} = shift if @_; + return $self->{'length'}; +} + +=head2 description + + Arg [1] : (optional) string - description + Example : my $pdesc = $probe->description(); + Description: Getter and setter of description attribute for Probe + objects. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if @_; + return $self->{'description'}; +} + + +=head2 feature_count + + Arg[0] : recount flag + Example : my $num_features = $probe->feature_count(); + Description: Counts the number of ProbeFeatures associated with this Probe + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + + +sub feature_count{ + my ($self, $recount) = @_; + + if($recount || + (! $self->{feature_count})){ + $self->{feature_count} = $self->adaptor->db->get_ProbeFeatureAdaptor->count_probe_features_by_probe_id($self->dbID); + } + + return $self->{feature_count}; +} + + + + +### ARRAY DESIGN SPECIFIC METHODS + +=head2 add_Analysis_score + + Arg [1] : Bio::EnsEMBL::Analysis + Arg [2] : string - analysis score (as string a precision may differ between analyses)?? + Example : $probe->add_Analysis_score($analysis, $score); + Description: Setter for probe analysis attributes from an array design + Returntype : None + Exceptions : throws if args are not met or valid + Caller : General + Status : at risk + +=cut + +sub add_Analysis_score{ + my ($self, $anal, $score) = @_; + + if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){ + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + throw("Must provide a score to add to the probe") if ! defined $score; + + $self->{'analysis'}{$anal->dbID()} = $score; + + return; +} + +=head2 add_Analysis_CoordSystem_score + + Arg [1] : Bio::EnsEMBL::Analysis + Arg [2] : Bio::EnsEMBL::CoordSystem + Arg [3] : string - analysis score (as string a precision may differ between analyses)?? + Example : $probe->add_Analysis_CoordSystem_score($analysis, $coord_sys, $score); + Description: Setter for coord system dependant probe analysis attributes from an array design + Returntype : None + Exceptions : throws if args are not met or valid + Caller : General + Status : at risk + +=cut + +sub add_Analysis_CoordSystem_score{ + my ($self, $anal, $cs, $score) = @_; + + if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){ + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + if(! ($cs && $cs->dbID() && $cs->isa("Bio::EnsEMBL::Funcgen::CoordSystem"))){ + throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::CoordSystem"); + } + + throw("Must provide a score to add to the probe") if ! defined $score; + + $self->{'analysis_coord_system'}{$anal->dbID()}{$cs->dbID()} = $score; + + return; +} + +=head2 get_score_by_Analysis + + Arg [1] : Bio::EnsEMBL::Analysis + Example : my $anal_score = $probe->get_analysis_score($analysis); + Description: Setter for probe analysis attributes from an array design + Returntype : string + Exceptions : throws if args are not met or valid + Caller : General + Status : at risk + +=cut + +sub get_score_by_Analysis{ + my ($self, $anal) = @_; + + $self->get_all_design_scores() if ! defined $self->{'analysis'}; + + if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){ + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + + return (exists $self->{'analysis'}{$anal->dbID()}) ? $self->{'analysis'}{$anal->dbID()} : undef; +} + +=head2 get_score_by_Analysis_CoordSystem + + Arg [1] : Bio::EnsEMBL::Analysis + Arg [2] : Bio::EnsEMBL::CoordSystem + Arg [3] : string - analysis score (as string a precision may differ between analyses)?? + Example : $probe->add_analysis($analysis, $coord_sys, $score); + Description: Setter for coord system dependant probe analysis attributes from an array design + Returntype : None + Exceptions : throws if args are not met or valid + Caller : General + Status : at risk + +=cut + +sub get_score_by_Analysis_CoordSystem{ + my ($self, $anal, $cs) = @_; + + $self->get_all_design_scores() if ! defined $self->{'analysis_coord_system'}; + + if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){ + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + if(! ($cs && $cs->dbID() && $cs->isa("Bio::EnsEMBL::Funcgen::CoordSystem"))){ + throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::CoordSystem"); + } + + my $score = undef; + + if(exists $self->{'analysis_coord_system'}{$anal->dbID()} && + exists $self->{'analysis_coord_system'}{$anal->dbID()}{$cs->dbID()}){ + $score = $self->{'analysis_coord_system'}{$anal->dbID()}{$cs->dbID()}; + } + + return $score; +} + + +=head2 get_all_design_scores + + Arg [1] : Boolean - No fetch flag, to fetch design scores from DB, used in adaptor + To avoid testing DB for each probe when no design scores have been added. + Example : my @anal_score_coordsets = @{$probe->get_all_design_scores()}; + Description: Gets all design scores as analysis_id, score and optionally coord_system_id + Returntype : ARRAYREF + Exceptions : throws if no fetch flag is not defined and adaptor or probe is not defined and or stored. + Caller : General + Status : at risk + +=cut + +#not named get_all_Analysis_scores as this would imply only non-cs dependent scores +#hence named after table, as this returns simple table records + +sub get_all_design_scores{ + my ($self, $no_fetch) = @_; + + my ($analysis_id, $cs_id, $score, @design_scores); + + if(! $no_fetch){#can assume we have none stored already due to implementation of add methods + + throw("Probe must have and adaptor to fetch design scores from the DB") if(! $self->adaptor()); + + foreach my $probe_analysis(@{$self->adaptor->fetch_all_design_records($self)}){ + #we can't use the add methods here as would be cyclical + #nor do we need extra validation + + ($analysis_id, $cs_id, $score) = @$probe_analysis; + + if($cs_id){ + $self->{'analysis_coord_system'}{$analysis_id}{$cs_id} = $score; + }else{ + $self->{'analysis'}{$analysis_id} = $score; + } + } + } + + #populate array from attrs + if(exists $self->{'analysis_coord_system'}){ + + foreach $analysis_id(keys %{$self->{'analysis_coord_system'}}){ + + foreach $cs_id(keys %{$self->{'analysis_coord_system'}{$analysis_id}}){ + push @design_scores, [$analysis_id, $self->{'analysis_coord_system'}{$analysis_id}{$cs_id}, $cs_id]; + } + } + } + + if(exists $self->{'analysis'}){ + + foreach $analysis_id(keys %{$self->{'analysis'}}){ + + push @design_scores, [$analysis_id, $self->{'analysis'}{$analysis_id}]; + } + } + + + return \@design_scores; + +} + + +#do we need get_all methods for Analysis and Analysis_CoordSystem? +#maybe if we split into another Class and Adaptor + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,508 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe +genomic mapping. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ProbeFeature; + +my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( + -PROBE => $probe, + -MISMATCHCOUNT => 0, + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -ANALYSIS => $analysis, + -CIGAR_STRING => '1U2M426D2M1m21M', +); + + +=head1 DESCRIPTION + +An ProbeFeature object represents the genomic placement of an Probe +object. The data are stored in the probe_feature table. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ProbeFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Funcgen::Storable; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe + A ProbeFeature must have a probe. This probe must already be stored if + you plan to store the feature. + Arg [-MISMATCHCOUNT]: int + Number of mismatches over the length of the probe. + Arg [-SLICE] : Bio::EnsEMBL::Slice + The slice on which this feature is. + Arg [-START] : int + The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int + The end coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-STRAND] : int + The orientation of this feature. Valid values are 1, -1 and 0. + Arg [-dbID] : (optional) int + Internal database ID. + Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor + Database adaptor. + Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( + -PROBE => $probe, + -MISMATCHCOUNT => 0, + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -ANALYSIS => $analysis, + -CIGARLINE => '15M2m3d4M', + #Can represent transcript alignment as gapped genomic alignments + #D(eletions) representing introns + #Lowercase m's showing sequence mismatches + ); + Description: Constructor for ProbeFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($probe, $mismatchcount, $pid, $cig_line) + = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_); + + #remove mismatch? + #mandatory args? + + #warn "creating probe feature with $pid"; + $self->{'probe_id'} = $pid if $pid; + $self->probe($probe) if $probe; + $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed + $self->cigar_string($cig_line) if defined $cig_line; + + #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb? + + #do we need this coordsys id if we're passing a slice? We should have the method but not in here? + + return $self; +} + +=head2 new_fast + + Args : Hashref with all internal attributes set + Example : none + Description: Quick and dirty version of new. Only works if the code is very + disciplined. + Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub new_fast { + bless ($_[1], $_[0]); +} + +=head2 probeset + + Arg [1] : (optional) string - probeset + Example : my $probeset = $feature->probeset(); + Description: Getter and setter for the probeset for this feature. Shortcut + for $feature->probe->probeset(), which should be used instead. + Probeset is not persisted if set with this method. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + : Use $feature->probe->probeset() because this may be removed + +=cut + +sub probeset { + my $self = shift; + + $self->{'probeset'} = shift if @_; + + if (! $self->{'probeset'}) { + $self->{'probeset'} = $self->probe()->probeset(); + } + + #We could bypass this entirely and call directly using proveset_id? + + + return $self->{'probeset'}; +} + + +#Only ever needs to be set in _objs_from_sth +#This is to allow linkage of probe_feature glyphs without retrieving the probeset. + +sub probeset_id{ + my $self = shift; + + return $self->{'_probeset_id'}; +} + +=head2 mismatchcount + + Arg [1] : int - number of mismatches + Example : my $mismatches = $feature->mismatchcount(); + Description: Getter and setter for number of mismatches for this feature. + Returntype : int + Exceptions : None + Caller : General + Status : High Risk + +=cut + +sub mismatchcount { + my $self = shift; + + + #replace with dynamic check of cigarline? + + $self->{'mismatchcount'} = shift if @_; + + return $self->{'mismatchcount'}; +} + + +=head2 cigar_string + + Arg [1] : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment) + Example : my $cg = $feature->cigar_string(); + Description: Getter and setter for number of the cigar line attribute for this feature. + Returntype : str + Exceptions : None + Caller : General + Status : High Risk + +=cut + +sub cigar_string { + my $self = shift; + + $self->{'cigar_string'} = shift if @_; + + return $self->{'cigar_string'}; +} + + +=head2 probe + + Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe + Example : my $probe = $feature->probe(); + Description: Getter, setter and lazy loader of probe attribute for + ProbeFeature objects. Features are retrieved from the database + without attached probes, so retrieving probe information for a + feature will involve another query. + Returntype : Bio::EnsEMBL::Funcgen::Probe + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub probe { + my $self = shift; + my $probe = shift; + + #can we not use _probe_id here? + #why is probe_id not set sometimes? + + + #warn "in pf and probe is ".$self->{'probe_id'}; + + if ($probe) { + + #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'}; + + if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) { + throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object'); + } + $self->{'probe'} = $probe; + } + + if ( ! defined $self->{'probe'}){ + # && $self->dbID() && $self->adaptor() ) { + #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self); + #warn "fetching probe with dbID ".$self->probe_id(); + $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id()); + } + return $self->{'probe'}; +} + + +=head2 probe_id + + Example : my $probe_id = $pfeature->probe_id(); + Description: Getter for the probe db id of the ProbeFeature + Returntype : int + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub probe_id{ + my $self = shift; + + return $self->{'probe_id'} || $self->probe->dbID(); +} + +=head2 get_results_by_channel_id + + Arg [1] : int - channel_id (mandatory) + Arg [2] : string - Analysis name e.g. RawValue, VSN (optional) + Example : my @results = $feature->results(); + Description: Getter, setter and lazy loader of results attribute for + ProbeFeature objects. + Returntype : List ref to arrays containing ('score', 'Analysis logic_name'); + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_results_by_channel_id { + my $self = shift; + my $channel_id = shift; + my $anal_name = shift; + + warn("This method not fully implemented, remove/deprecate?"); + + #$self->{'results'} ||= {}; + $self->{'results_complete'} ||= 0; + + if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){ + #fetch all, set complete set flag + $self->{'results_complete'} ||= 1 if(! $anal_name); + + foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(), + $channel_id, $anal_name)}){ + + $self->{'results'}{$$results_ref[1]} = $$results_ref[0]; + } + } + + return $self->{'results'} +} + + +#The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment +#This may be called with no preceding Experiment specificity +#this would return results for all experiments +#do we need to set a default Experiment? + + +#THis should return both Chip and Channel based results +#just Chip for now +#maybe retrieve and hash all if not Analysis object passed? Then return what? + + +=head2 get_result_by_Analysis_ExperimentalChips + + Arg [1] : Bio::EnsEMBL::Analysis + Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip + Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips); + Description: Getter of results attribute for a given Analysis and set of ExperimentalChips + Returntype : float + Exceptions : Throws is no Analysis or ExperimentalChips are not passed? + Caller : General + Status : High Risk + +=cut + + +#make ExperimentalChips optional? + +#or have ResultSetAdaptor? Do we need a ResultSet? +#may not have ExperimentalChip, so would need to return ec dbID aswell + + +######This will break/return anomalous if +#ECs are passed from different experiments +#ECs are passed from different Arrays + + +sub get_result_by_Analysis_ExperimentalChips{ + my ($self, $anal, $exp_chips) = @_; + + throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0); + throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis"); + + my (%query_ids, %all_ids, %ac_ids); + my $anal_name = $anal->logic_name(); + + foreach my $ec(@$exp_chips){ + + throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects") + if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip"); + + #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID(); + + #$array_id ||= $tmp_id; + + #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id) + + #if(exists $ac_ids{$ec->array_chip_id()}){ +# throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates"); + # } + + $ac_ids{$ec->array_chip_id()} = 1; + $all_ids{$ec->dbID()} = 1; + $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()}); + + } + + + my @ec_ids = keys %query_ids; + my @all_ids = keys %all_ids; + + + #warn "ec ids @ec_ids\n"; + #warn "all ids @all_ids\n"; + + #$self->{'results'} ||= {}; + #$self->{'results_complete'} ||= 0;#do we need this now? + + if((scalar(@all_ids) - scalar(@ec_ids))> 1){ + throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids"); + } + elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){ + #fetch all, set complete set flag + #$self->{'results_complete'} ||= 1 if(! $anal_name); + #would need to look up chip and channel analyses here and call relevant fetch + #or pass the chip and then build the query as = or IN dependent on context of logic name + #if there are multiple results, last one will overwrite others + #could do foreach here to deal with retrieving all i.e. no logic name + #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips + #build the cache based on logic name and table_id + #cahce key?? should we cat the ec_ids together? + + my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(), + $anal, + \@ec_ids)}; + + #Remove lines with no result + while(@result_refs && (! $result_refs[0]->[0])){ + shift @result_refs; + } + + my $num_results = scalar(@result_refs); + my ($result, $mpos); + #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1); + + #No sort needed as we sort in the query + + if($num_results == 1){ + $result = $result_refs[0]->[0]; + } + elsif($num_results == 2){#mean + $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2; + + } + elsif($num_results > 2){#median or mean of median flanks + $mpos = $num_results/2; + + if($mpos =~ /\./){#true median + $mpos =~ s/\..*//; + $mpos ++; + $result = $result_refs[$mpos]->[0]; + }else{ + $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ; + } + } + + $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result; + } + + #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips + + #how are we going to retrieve the result for one of many possible ec id keys? + #options, cat ec dbids as key, and grep them to find full key, then return result + #this may hide the duplicate chip problem + #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result + #We shoulld pick up duplicates before this happens + #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data + + my @keys; + foreach my $id(@all_ids){ + my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); + #Hacky needs sorting, quick fix for release!! + + if(@tmp){ + push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); + + last; + } + + } + + throw("Got more than one key for the results cache") if scalar(@keys) > 1; + + return $self->{'results'}{$anal_name}{$keys[0]}; +} + + +#Will this be too slow, can we not do one query across all tables + +sub get_result_by_ResultSet{ + my ($self, $rset) = @_; + + my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset); + + return median($results); +} + + + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,300 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ProbeSet +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::ProbeSet - A module to represent a probeset. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + use Bio::EnsEMBL::Funcgen::ProbeSet; + + + my $reg = Bio::EnsEMBL::Registry->load_adaptors_from_db(-host => 'ensembldb.ensembl.org', + -user => 'anonymous'); + + my $pset_adaptor = $reg->get_adaptor($species, 'funcgen', 'ProbeSet'); + + ### Creating/storing a ProbeSet ### + + my $probe_set = Bio::EnsEMBL::Funcgen::ProbeSet->new(-NAME => 'ProbeSet-1', + -SIZE => 1, + -FAMILY => "ENCODE REGIONS",#optional + ); + + $pset_adaptor->store($probe_set); + + + ### Fetching associated transcripts ### + # Generated by the Ensembl array mapping pipeline + + my @dbentries = @{$probe_set->fetch_all_Transcript_DBEntries}; + my $trans_adaptor = $reg->get_adpator($species, 'core', 'Transcript'); + + foreach my $dbe(@dbentries){ + + my $tx = $trans_adaptor->fetch_by_stable_id($dbe->primary_id); + + #Print the transcript info and the linkage annotation + print $tx->stable_id."\t".$probe_set->name.' '.$dbe->linkage_annotation."\n"; + } + + #Alternatively these annotations are also available in a transcript centric manner + #using the ProbeSetAdaptor + + +=head1 DESCRIPTION + +A ProbeSet object represents a set of probes on a microarray. The +data (currently the name, size, and family) are stored in the probe_set +table. ProbeSets are only really relevant for Affy probes, or when +avaliable these will be analagous to Nimblegen feature sets. + +For Affy arrays, a probeset can be part of more than one array, containing unique +probes. + +#Need to rewrite this bit +#Something about array_chip_id i.e. experimental validation etc +On each Affy array the probe has a slightly different name. For +example, two different complete names for the same probe might be +DrosGenome1:AFFX-LysX-5_at:535:35; and Drosophila_2:AFFX-LysX-5_at:460:51;. In +the database, these two probes will have the same probe_id. Thus the same +Affy probe can have a number of different names and complete names depending on +which array it is on. + + +=head1 SEE ALSO + + Bio::EnsEMBL::Funcgen::ProbeSetAdaptor + ensembl-functgenomics/scripts/examples/microarray_annotation_example.pl + + Or for details on how to run the array mapping pipeline see: + ensembl-functgenomics/docs/array_mapping.txt + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ProbeSet; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-NAME] : string - probeset name + Arg [-SIZE] : int - probe set size + Will be the same for all probes sets if same probe set + is on multiple arrays. + Arg [-FAMILY] : string - probe set family, generic descriptor for probe set e.g. ENCODE REGIONS, RANDOM + Will be the same for all probes sets if same probe set is on multiple arrays. + Example : my $probeset = Bio::EnsEMBL::Funcgen::ProbeSet->new( + -NAME => 'ProbeSet-1', + -SIZE => 1, + -FAMILY => "ENCODE_REGIONS", + ); + Description: Creates a new Bio::EnsEMBL::Funcgen::ProbeSet object. + Returntype : Bio::EnsEMBL::Funcgen::ProbeSet + Exceptions : Throws if not supplied with probeset name and array chip id(s) + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #warn("The only way to get array names/ids, is to retrieve all the probes!!!"); + + + my ( + $name, $size, + $family + ) = rearrange([ + 'NAME', 'SIZE', + 'FAMILY', + ], @_); + + + $self->name($name) if defined $name; + $self->family($family) if defined $family; + $self->size($size) if defined $size; + + return $self; +} + + + +#=head2 get_all_ProbeFeatures + +# Args : None +# Example : my $features = $probeset->get_all_ProbeFeatures(); +# Description: Get all features produced by this probeset. The probeset needs to be +# database persistent. +# Returntype : Listref of Bio::EnsEMBL::Funcgen::ProbeFeature objects +# Exceptions : None +# Caller : General +# Status : Medium Risk + +#=cut + +sub get_all_ProbeFeatures { + my $self = shift; + + throw("Not implemented yet, do we want to do this for ProbeSet or just probe?"); + + if ( $self->adaptor() && $self->dbID() ) { + return $self->adaptor()->db()->get_ProbeFeatureAdaptor()->fetch_all_by_ProbeSet($self); + } else { + warning('Need database connection to retrieve Features'); + return []; + } +} + +=head2 get_all_Arrays + + Args : None + Example : my $arrays = $probeset->get_all_Arrays(); + Description: Returns all arrays that this probeset is part of. Only works if the + probedet was retrieved from the database or created using + add_Array_probename. + Returntype : Listref of Bio::EnsEMBL::Funcgen::Array objects + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_all_Arrays { + my $self = shift; + + if (defined $self->{'arrays'}) { + return $self->{'arrays'}; + } + else{ + $self->{arrays} = $self->adaptor->db->get_ArrayAdaptor->fetch_all_by_ProbeSet($self); + } + + $self->{arrays} +} + + +=head2 get_all_Probes + + Args : None + Example : my @probes = @{$probeset->get_all_Probes(); + Description: Returns all probes belonging to this ProbeSet + Returntype : Listref of Bio::EnsEMBL::Funcgen::Probe objects + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_all_Probes { + my $self = shift; + + if (defined $self->{'probes'}) { + return $self->{'probes'}; + } + else{ + $self->{probes} = $self->adaptor->db->get_ProbeAdaptor->fetch_all_by_ProbeSet($self); + } + + $self->{probes} +} + + + +#sub get_all_array_chips_ids? +#sub get_all_Results? from_Experiment? + +=head2 name + + Arg [1] : string - aprobeset name + Example : my $probesetname = $probeset->name('probeset-1'); + Description: Getter/Setter for the name attribute of ProbeSet objects. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if @_; + return $self->{'name'}; +} + + +=head2 family + + Arg [1] : (optional) string - family + Example : my $family = $probe->family(); + Description: Getter and setter of family attribute for ProbeSet + objects. e.g. EXPERIMENTAL or CONTROL + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub family { + my $self = shift; + $self->{'family'} = shift if @_; + return $self->{'family'}; +} + +=head2 size + + Arg [1] : (optional) int - probeset size + Example : my $probeset_size = $probeset->size(); + Description: Getter and setter of probeset size attribute for ProbeSet + objects. + Returntype : int + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub size { + my $self = shift; + $self->{'size'} = shift if @_; + return $self->{'size'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RegulatoryFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RegulatoryFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,814 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RegulatoryFeature + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + use Bio::EnsEMBL::Funcgen::RegulatoryFeature; + my $reg = Bio::EnsEMBL::Registry->load_adaptors_from_db + ( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + my $regfeat_adaptor = $reg->get_adaptor($species, 'funcgen', 'RegulatoryFeature'); + + + ### Creating/storing a RegulatoryFeature Set ### + my $feature = Bio::EnsEMBL::Funcgen::RegulatoryFeature->new + ( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => 0, + -DISPLAY_LABEL => $text, + -FEATURE_SET => $fset, + -FEATURE_TYPE => $reg_ftype, + -ATTRIBUTE_CACHE => \%attr_cache, + ); + + my ($stored_feat) = @{$regfeat_adaptor->store([$feature])}; + + + ### Fetching some RegualtoryFeatures + my @regfeats = @{$regfeat_adaptor->fetch_all_by_Slice_FeatureSets($slice, \@fsets)}; + + + ### Print the bound and core loci + print join(' - ', ($reg_feat->bound_start, + $reg_feat->start, + $reg_feat->end, + $reg_feat->bound_end)."\n"; + + + ### Getting some supporting evidence for a RegualtoryFeatures + my @reg_attrs = @{$reg_feat->regulatory_attributes('annotated')}; + + +=head1 DESCRIPTION + +A RegulatoryFeature object represents the output of the Ensembl RegulatoryBuild: + http://www.ensembl.org/info/docs/funcgen/regulatory_build.html + +It is comprises many possible histone, transcription factor, polymerase and open +chromatin features, which have been combined to provide a summary view and +classification of the regulatory status at a given loci. + + +=head1 SEE ALSO + +Bio::EnsEMBL:Funcgen::DBSQL::RegulatoryFeatureAdaptor +Bio::EnsEMBL::Funcgen::SetFeature + +=cut + + +package Bio::EnsEMBL::Funcgen::RegulatoryFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Funcgen::SetFeature); #@ISA + + +=head2 new + + Arg [-SLICE] : Bio::EnsEMBL::Slice - The slice on which this feature is located. + Arg [-START] : int - The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int -The end coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-FEATURE_SET] : Bio::EnsEMBL::Funcgen::FeatureSet - Regulatory Feature set + Arg [-FEATURE_TYPE] : Bio::EnsEMBL::Funcgen::FeatureType - Regulatory Feature sub type + Arg [-BINARY_STRING] : (optional) string - Regulatory Build binary string + Arg [-STABLE_ID] : (optional) string - Stable ID for this RegualtoryFeature e.g. ENSR00000000001 + Arg [-DISPLAY_LABEL] : (optional) string - Display label for this feature + Arg [-ATTRIBUTE_CACHE] : (optional) HASHREF of feature class dbID|Object lists + Arg [-PROJECTED] : (optional) boolean - Flag to specify whether this feature has been projected or not + Arg [-dbID] : (optional) int - Internal database ID. + Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor - Database adaptor. + + Example : my $feature = Bio::EnsEMBL::Funcgen::RegulatoryFeature->new( + -SLICE => $chr_1_slice, + -START => 1000000, + -END => 1000024, + -DISPLAY_LABEL => $text, + -FEATURE_SET => $fset, + -FEATURE_TYPE => $reg_ftype, + -ATTRIBUTE_CACHE => \%attr_cache, + ); + + + Description: Constructor for RegulatoryFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::RegulatoryFeature + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($stable_id, $attr_cache, $bin_string, $projected) + = rearrange(['STABLE_ID', 'ATTRIBUTE_CACHE', 'BINARY_STRING', 'PROJECTED'], @_); + + #None of these are mandatory at creation + #under different use cases + $self->{binary_string} = $bin_string if defined $bin_string; + $self->{stable_id} = $stable_id if defined $stable_id; + $self->{projected} = $projected if defined $projected; + $self->attribute_cache($attr_cache) if $attr_cache; + + return $self; +} + + +=head2 display_label + + Example : my $label = $feature->display_label(); + Description: Getter for the display label of this feature. + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub display_label { + my $self = shift; + + if(! defined $self->{'display_label'}){ + $self->{'display_label'} = $self->feature_type->name.' Regulatory Feature'; + + if( defined $self->cell_type ){ + $self->{'display_label'} .= ' - '.$self->cell_type->name; + } + } + + return $self->{display_label}; +} + +=head2 display_id + + Example : print $feature->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. In this case the stable Id is + preferred + Returntype : String + Exceptions : none + Caller : web drawing code, Region Report tool + Status : Stable + +=cut + +sub display_id { return $_[0]->{stable_id}; } + + +=head2 binary_string + + Arg [1] : optional string - binary string from regulatory build + Example : my $bin_string = $feature->binary_string(); + Description: Getter and setter for the binary_string for this feature. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk - May change to BLOB, remove setter functionality + +=cut + +sub binary_string{ + my ($self, $bin_string) = @_; + + if (defined $bin_string){ + #added v67 + warn "RegualtoryFeature::binary_string setter functionality is being removed\n"; + $self->{binary_string} = $bin_string; + } + + return $self->{binary_string}; +} + + +=head2 stable_id + + Arg [1] : (optional) string - stable_id e.g ENSR00000000001 + Example : my $stable_id = $feature->stable_id(); + Description: Getter and setter for the stable_id attribute for this feature. + Returntype : string + Exceptions : None + Caller : General + Status : At risk - setter functionality to be removed + +=cut + +sub stable_id { + my $self = shift; + + if (@_){ + #added v67 + warn "RegualtoryFeature::stable_id setter functionality is being removed\n"; + $self->{stable_id} = shift; + } + + return $self->{stable_id}; +} + + +=head2 regulatory_attributes + + Arg [1] : String (optional) - Class of feature e.g. annotated or motif + Example : print "Regulatory Attributes:\n\t".join("\n\t", (map $_->feature_type->name, @{$feature->regulatory_attributes()}))."\n"; + Description: Getter for the regulatory_attributes for this feature. + Returntype : ARRAYREF + Exceptions : Throws if feature class not valid + Caller : General + Status : At Risk + +=cut + + +sub regulatory_attributes{ + my ($self, $feature_class) = @_; + + #Incorporating the MFs like this does cause some redundancy in the DB + #But will speed up display of the RegFeat image including the MFs + #Redefine the cache to have class keys e.g. TFBS, OpenChromatin, Histone Mods + #Can't do this as we need the table key to be able to fetch the features + #Really need something to be able to draw the image first, then create the zmenu details later. + + my %adaptors = ( + 'annotated' => $self->adaptor->db->get_AnnotatedFeatureAdaptor, + 'motif' => $self->adaptor->db->get_MotifFeatureAdaptor, + #external + ); + + my @fclasses; + + if(defined $feature_class){ + + if(exists $adaptors{lc($feature_class)}){ + @fclasses = (lc($feature_class)); + } + else{ + throw("The feature class you specified is not valid:\t$feature_class\n". + "Please use one of:\t".join(', ', keys %adaptors)); + } + } + else{ + @fclasses = keys %adaptors; + } + + foreach my $fclass(@fclasses){ + #Now structured as hash to facilitate faster has_attribute method + #Very little difference to array based cache + + my @attr_dbIDs = keys %{$self->{'attribute_cache'}{$fclass}}; + + + if(scalar(@attr_dbIDs) > 0){ + + if( ! ( ref($self->{'regulatory_attributes'}{$fclass}->[0]) && + ref($self->{'regulatory_attributes'}{$fclass}->[0])->isa('Bio::EnsEMBL::Feature') )){ + + $adaptors{$fclass}->force_reslice(1);#So we don't lose attrs which aren't on the slice + $self->{'regulatory_attributes'}{$fclass} = $adaptors{$fclass}->fetch_all_by_dbID_list(\@attr_dbIDs, $self->slice); + + #Having problems here if we are trying to project between Y PAR and X + #Current dest_slice mapping code simply changes the start end values assuming the slice is correct + #currently no test for seq_region name match + + + #foreach my $attr(@{ $self->{'regulatory_attributes'}{$fclass}}){ + # warn "$attr ".$attr->dbID." ".$attr->feature_Slice->name."\n"; + #} + + + $adaptors{$fclass}->force_reslice(0); + + #Problems here with attrs not being returning when they do not lie on dest slice + #i.e. core projected to cell line, but dest slice only over laps a region of the core which + #actually has no attrs. + #either use the feature_Slice and reslice everthing to the dest slice + #or skip test in attr obj_frm_sth? + # + + #This method transfers to the query slice, do not use fetch_by_dbID + #It also should use _final_clause + #This is currently only specified in the MotifFeatureAdaptor + #as these are required to be sorted to relate to the structure string + + #but we are stll storing in has where order is not preserved!! + #so this will not match order of underlying strcture! + + #separate so we can have ordered array returned + #do we need redundant caches? + #defo need db id cache for 'has' methods + + #foreach my $attr(@{$fclass_attrs}){ + # $self->{'regulatory_attributes'}{$fclass}{$attr->dbID} = $attr; + #} + } + } + else{ + $self->{'regulatory_attributes'}{$fclass} = []; + } + } + + return [ map { @{$self->{'regulatory_attributes'}{$_}} } @fclasses ]; +} + +=head2 has_attribute + + Arg [1] : Attribute Feature dbID + Arg [2] : Attribute Feature class e.g. motif or annotated + Example : if($regf->has_attribute($af->dbID, 'annotated'){ #do something here } + Description: Identifies whether this RegualtoryFeature has a given attribute + Returntype : Boolean + Exceptions : Throws if args are not defined + Caller : General + Status : Stable + +=cut + + +sub has_attribute{ + my ($self, $dbID, $fclass) = @_; + + throw('Must provide a dbID and a Feature class argument') if ! $dbID && $fclass; + + return exists ${$self->attribute_cache}{$fclass}{$dbID}; +} + +=head2 get_focus_attributes + + Arg [1] : None + Example : my @focus_attrs = @{$regf->get_focus_attributes}; + Description: Getter for the focus features of this RegualtoryFeature, used to defined the core region + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub get_focus_attributes{ + my $self = shift; + + if(! exists $self->{'focus_attributes'} || + ! @{$self->{'focus_attributes'}}){ + $self->_sort_attributes; + } + + + return $self->{'focus_attributes'}; +} + + +=head2 get_nonfocus_attributes + + Arg [1] : None + Example : my @non_focus_attrs = @{$regf->get_nonfocus_attributes}; + Description: Getter for the non-focus features of this RegulatoryFeature, used to defined + the non core region i.e. the whiskers. + Returntype : ARRAYREF + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub get_nonfocus_attributes{ + my $self = shift; + + #Test focus here as we may not have any nonfocus + #But focus will show that we have sorted already + if(! exists $self->{'focus_attributes'} || + ! @{$self->{'focus_attributes'}}){ + $self->_sort_attributes; + } + + return $self->{'nonfocus_attributes'}; +} + +#Add pod here + +sub _sort_attributes{ + my $self = shift; + + $self->{'focus_attributes'} = []; + $self->{'nonfocus_attributes'} = []; + + foreach my $attrf(@{$self->regulatory_attributes}){ + + if($attrf->isa('Bio::EnsEMBL::Funcgen::MotifFeature') || + $attrf->feature_set->is_focus_set){ + push @{$self->{'focus_attributes'}}, $attrf; + } + else{ + push @{$self->{'nonfocus_attributes'}}, $attrf; + } + } + + return; +} + + +=head2 attribute_cache + + Arg [1] : optional - HASHREF of attribute table keys with values as either a list of attribute + feature dbIDs or objects. If passing object, any MotifFeature objects should be in position + order with respect to the slice. + Example : $feature->attribute_cache(\%attribute_feature_info); + Description: Setter for the regulatory_attribute cache for this feature. This is a short cut method used by the + regulatory build and the webcode to avoid unnecessary fetching and enable enable lazy loading + Returntype : HASHREF + Exceptions : Throws if trying to overwrite existing cache + Caller : RegulatoryFeatureAdaptor.pm and build_regulatory_features.pl + Status : At Risk + +=cut + + +sub attribute_cache{ + my ($self, $attr_hash) = @_; + +# if(! defined $attr_hash){ +# $self->regulatory_attributes; #Fetch the attrs? +# +# +# #Do we need to do this now we have separated the caches? +# +# } + + if(defined $attr_hash){ + + foreach my $fclass(keys %{$attr_hash}){ + + if(exists $self->{'attribute_cache'}{$fclass}){ + throw("You are trying to overwrite a pre-existing regulatory attribute cache entry for feature class:\t$fclass"); + } + else{ + $self->{'attribute_cache'}{$fclass} = $attr_hash->{$fclass}; + } + } + } + + return $self->{'attribute_cache'} || {}; +} + + +=head2 bound_start + + Example : my $bound_start = $feature->bound_start(); + Description: Getter for the bound_start attribute for this feature. + Gives the 5' most start value of the underlying attribute + features. + Returntype : string + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub bound_start { + my $self = shift; + $self->get_underlying_structure if ! defined $self->{'bound_start'}; + + return $self->{'bound_start'}; +} + + +=head2 bound_end + + Example : my $bound_end = $feature->bound_start(); + Description: Getter for the bound_end attribute for this feature. + Gives the 3' most end value of the underlying attribute + features. + Returntype : string + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub bound_end { + my $self = shift; + $self->get_underlying_structure if ! defined $self->{'bound_end'}; + + return $self->{'bound_end'}; +} + + +=head2 bound_seq_region_start + + Example : my $bound_sr_start = $feature->bound_seq_region_start; + Description: Getter for the seq_region bound_start attribute for this feature. + Gives the 5' most start value of the underlying attribute + features. + Returntype : string + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub bound_seq_region_start { + my $self = shift; + + if(! defined $self->{bound_seq_region_start}){ + + if($self->slice->strand == 1){ + $self->{bound_seq_region_start} = $self->slice->start + $self->bound_start - 1; + } + else{ #strand = -1 + $self->{bound_seq_region_start} = $self->slice->end - $self->bound_end + 1; + } + } + + return $self->{bound_seq_region_start}; +} + + +=head2 bound_seq_region_end + + Example : my $bound_sr_end = $feature->bound_seq_region_end; + Description: Getter for the seq_region bound_end attribute for this feature. + Gives the 3' most end value of the underlying attribute + features. + Returntype : string + Exceptions : None + Caller : General + Status : Stable + +=cut + + +sub bound_seq_region_end { + my $self = shift; + + if(! defined $self->{bound_seq_region_end}){ + + if($self->slice->strand == 1){ + $self->{bound_seq_region_end} = $self->slice->start + $self->bound_end - 1; + } + else{ #strand = -1 + $self->{bound_seq_region_end} = $self->slice->end - $self->bound_start + 1; + } + } + + return $self->{bound_seq_region_end}; +} + + + + + + +=head2 is_projected + + Arg [1] : optional - boolean + Example : if($regf->is_projected){ #do something different here } + Description: Getter/Setter for the projected attribute. + Returntype : boolean + Exceptions : None + Caller : General + Status : At risk - remove setter functionality + +=cut + +sub is_projected { + my $self = shift; + + if(@_){ + #added v67 + warn "RegulatoryFeature::is_projected setter functionality is being removed\n"; + $self->{'projected'} = shift; + } + + return $self->{'projected'}; +} + + +=head2 get_underlying_structure + + Example : $self->get_underlying_structure() if(! exists $self->{'bound_end'}); + Description: Getter for the bound_end attribute for this feature. + Gives the 3' most end value of the underlying attribute + features. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +#This should really be precomputed and stored in the DB to avoid the MF attr fetch +#Need to be aware of projecting here, as these will expire if we project after this method is called + +sub get_underlying_structure{ + my $self = shift; + + if(! defined $self->{underlying_structure}){ + + my @attrs = @{$self->regulatory_attributes()}; + + if(! @attrs){ + throw('No underlying regulatory_attribute features to get_underlying_structure for dbID '.$self->dbID); + #This should never happen even with a projection build + } + else{ + + + #We only need to set the bounds when storing on full slice/seq_region values + #else they should be fetched from the DB + + if(! defined $self->{'bound_start'}){ + + my (@start_ends); + + foreach my $attr(@attrs){ + push @start_ends, ($attr->start, $attr->end); + } + + #Accounts for core region, where data may be absent on this cell type + push @start_ends, ($self->start, $self->end); + + @start_ends = sort { $a <=> $b } @start_ends; + + $self->{'bound_end'} = pop @start_ends; + $self->{'bound_start'} = shift @start_ends; + + #Need to account for projection build here + #i.e. attr extremeties may not extend past core start/end + + if($self->is_projected){ + $self->{'bound_end'} = $self->end if $self->end > $self->{'bound_end'}; + $self->{'bound_start'} = $self->start if $self->start < $self->{'bound_start'}; + } + } + + #Now deal with MotifFeature loci + my @mf_loci; + + foreach my $mf(@{$self->regulatory_attributes('motif')}){ + push @mf_loci, ($mf->start, $mf->end); + } + + $self->{underlying_structure} = [$self->{'bound_start'}, $self->start, @mf_loci, $self->end, $self->{'bound_end'}]; + } + } + + return $self->{underlying_structure}; +} + + +=head2 is_unique_to_FeatureSets + + Arg[1] : optional - ARRAYREF of regualtory Bio::EnsEMBL::Funcgen::FeatureSet objects + Default is FeatureSet of given RegulatoryFeature, else need to be + defined explicitly. + Arg[2] : optional - HASHREF Params hash: + { + include_projected => 0|1, # Boolean, include 'projected' features + } + Example : if($reg_feat->is_unique_to_FeatureSets($fsets)}{ + #then do some analysis here + } + Description: Identifies whether this RegualtoryFeature is unique to a set of FeatureSets. + Returntype : boolean + Exceptions : Throw is arguments not stored or valid. + Caller : General + Status : At risk + +=cut + +#Probably want to add in an FeatureType constraint here +#e.g. so we can compare active vs inactive or poised promoters + +#omit include_multi doesn't make sense here + +sub is_unique_to_FeatureSets{ + my ($self, $fsets, $params_hash) = @_; + + $fsets ||= [$self->feature_set]; + my @fset_ids; + + + #define to avoid deref fails below. + $params_hash ||= {}; + if(ref($params_hash) ne 'HASH'){ + throw("The params hash argument must be a valid HASHREF:\t".ref($params_hash)); + } + + + foreach my $fset(@$fsets){ + #assume we have an adaptor set + $self->adaptor->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + + if($fset->feature_class ne 'regulatory'){ + throw('Found non-regulatory FeatureSet'); + } + + push @fset_ids, $fset->dbID; + } + + my $stable_id; + ($stable_id = $self->stable_id) =~ s/^[A-Z0]+//; + + + my @other_rf_ids = @{$self->adaptor->_fetch_other_dbIDs_by_stable_feature_set_ids + ($stable_id, + \@fset_ids, + { include_projected => $params_hash->{include_projected}} )}; + + return (@other_rf_ids) ? 0 : 1; +} + + + +=head2 get_other_RegulatoryFeatures + + Arg[1] : optional - ARRAYREF of regualtory Bio::EnsEMBL::Funcgen::FeatureSet objects + Default is FeatureSet of given RegulatoryFeature, else need to be + defined explicitly. + Arg[2] : optional - HASHREF Params hash: + { + include_projected => 0|1, # Boolean, include 'projected' features + include_multicell => 0|1, # Boolean, include MultiCell features + } + Example : my @other_fsets = @{$reg_feat->get_other_FeatureSets($fsets)}; + Description: Gets other RegualtoryFeatures (linked via the stable ID) which are present in the + specified list of FeatureSets. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::RegulatoryFeature objects + Exceptions : Throw is arguments not stored or valid. + Caller : General + Status : At risk + +=cut + +sub get_other_RegulatoryFeatures{ + my ($self, $fsets, $params_hash) = @_; + + #define to avoid deref fails below. + $params_hash ||= {}; + if(ref($params_hash) ne 'HASH'){ + throw("The params hash argument must be a valid HASHREF:\t".ref($params_hash)); + } + + $fsets ||= [$self->feature_set]; + my @fset_ids; + + foreach my $fset(@$fsets){ + #assume we have an adaptor set + $self->adaptor->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + + if($fset->feature_class ne 'regulatory'){ + throw('Found non-regulatory FeatureSet'); + } + + push @fset_ids, $fset->dbID; + } + + my $stable_id; + ($stable_id = $self->stable_id) =~ s/^[A-Z0]+//; + + my @other_fsets_ids = @{$self->adaptor->_fetch_other_dbIDs_by_stable_feature_set_ids + ($stable_id, \@fset_ids, + { + include_projected => $params_hash->{include_projected}, + include_multicell => $params_hash->{include_multicell}, + })}; + + return $self->adaptor->fetch_all_by_dbID_list(\@other_fsets_ids); +} + + + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ResultFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ResultFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,258 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ResultFeature +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::ResultFeature - A module to represent a lightweight ResultFeature object + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ResultFeature; + +my $rfeature = Bio::EnsEMBL::Funcgen::ResultFeature->new_fast([$start, $end, $score ]); + +my @rfeatures = @{$rset->get_displayable_ResultFeature_by_Slice($slice)}; + +foreach my $rfeature (@rfeatures){ + my $score = $rfeature->score(); + my $rf_start = $rfeature->start(); + my $rf_end = $rfeature->end(); +} + +=head1 DESCRIPTION + +This is a very sparse class designed to be as lightweight as possible to enable fast rendering in the web browser. +As such only the information absolutely required is contained. Any a piori information is omitted e.g. seq_region_id, +this will already be known as ResultFeatures are retrieved via a Slice method in ResultSet via the ResultSetAdaptor, +likewise with analysis and experimental_chip information. ResultFeatures are transient objects, in that they are not +stored in the DB, but are a very small subset of information from the result and oligo_feature tables. ResultFeatures +should only be generated by the ResultSetAdaptor as there is no parameter checking in place. + +=cut + +use strict; +use warnings; + +#Could set global named vars here for element names. Would take more memory + +package Bio::EnsEMBL::Funcgen::ResultFeature; + +use base ('Bio::EnsEMBL::Feature'); + +=head2 new_fast + + Args : Array with attributes start, end, strand, score, probe, result_set_id, winow_size IN THAT ORDER. + WARNING: None of these are validated, hence can omit some where not needed + Example : none + Description: Fast and list version of new. Only works if the code is very disciplined. + Returntype : Bio::EnsEMBL::Funcgen::ResultFeature + Exceptions : None + Caller : ResultSetAdaptor + Status : At Risk + +=cut + +sub new_fast { + my ($class, @args) = @_; + #return bless ($arrayref, $class); + + #Passing arrayref here may cause problems with changing vars after obj creation + + #warn "in new fast with @args"; + + bless \@args, $class; +} + + + + +=head2 start + + Example : my $start = $rf->start(); + Description: Getter of the start attribute for ResultFeature + objects. + Returntype : int + Exceptions : None + Caller : General + Status : At Risk - Now also sets to enable projection + +=cut + +sub start { + $_[0]->[0] = $_[1] if $_[1]; + $_[0]->[0]; +} + + +=head2 end + + Example : my $start = $rf->end(); + Description: Getter of the end attribute for ResultFeature + objects. + Returntype : int + Exceptions : None + Caller : General + Status : At Risk - Now also sets to enable projection + +=cut + +sub end { + $_[0]->[1] = $_[1] if $_[1]; + $_[0]->[1]; +} + + +#Do we need to chacnge this to strand and have slice strand context, as with start and end + +sub strand { $_[0]->[2];} + +=head2 score + + Example : my $score = $rf->score(); + Description: Getter of the score attribute for ResultFeature + objects + Returntype : string/float/double? + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub score { $_[0]->[3];} + + +=head2 probe + + Example : my $probe = $rf->probe(); + Description: Getter of the probe attribute for ResultFeature + objects + Returntype : Bio::EnsEMBL::Funcgen::Probe + Exceptions : None + Caller : General + Status : At Risk - This can only be used for Features with window 0. + +=cut + +#probe_id is currently not available in the result_feature table, so this would be a result/probe_feature query. + +sub probe { $_[0]->[4];} + + +#The following are only used for storage and retrieval, hence why they are not included in new_fast which is streamlined +#for performance +#These have no validation so all thi smust be done in the caller/storer i.e. the adaptor + +sub result_set_id { $_[0]->[5];} +sub window_size { $_[0]->[6];} + +#May not ever need this +#We pass the slice to store +#Don't normally want to remap, so don't need furing fetch +#Now also sets for to enable projection + +sub slice { + $_[0]->[7] = $_[1] if $_[1]; + $_[0]->[7]; +} + + +#Had to reimplement these as they used direct hash calls rather than acessor +#redefined to use accessors to array + +sub length { + my $self = shift; + return $self->end - $self->start + 1; +} + +=head2 move + + Arg [1] : int start + Arg [2] : int end + Arg [3] : (optional) int strand + Example : None + Description: Sets the start, end and strand in one call rather than in + 3 seperate calls to the start(), end() and strand() methods. + This is for convenience and for speed when this needs to be + done within a tight loop. + Returntype : none + Exceptions : Thrown is invalid arguments are provided + Caller : general + Status : Stable + +=cut + +sub move { + my $self = shift; + + throw('start and end arguments are required') if(@_ < 2); + + my $start = shift; + my $end = shift; + my $strand = shift; + + if(defined($start) && defined($end) && $end < $start) { + throw('start must be less than or equal to end'); + } + if(defined($strand) && $strand != 0 && $strand != -1 && $strand != 1) { + throw('strand must be 0, -1 or 1'); + } + + $self->[0] = $start; + $self->[1] = $end; + $self->[2] = $strand if(defined($strand)); +} + + + +=head2 feature_Slice + + Args : none + Example : $slice = $feature->feature_Slice() + Description: Reimplementation of Bio::EnsEMBL::Feature method to enable + assembly mapping + Returntype : Bio::EnsEMBL::Slice or undef if this feature has no attached + Slice. + Exceptions : warning if Feature does not have attached slice. + Caller : web drawing code + Status : Stable + +=cut + + +sub feature_Slice { + my ($self) = @_; + + my $slice = $self->[7]; + + if(!$slice) { + warning('Cannot obtain Feature_Slice for feature without attached slice'); + return undef; + } + + return $slice->sub_Slice($self->[0], $self->[1]); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/ResultSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ResultSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,585 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ResultSet +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::ResultSet - A module to represent ResultSet. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ResultSet; + +my $result_set = Bio::EnsEMBL::Funcgen::ResultSet->new( + -dbid => $dbid, + -analysis => $analysis, + -table_name => 'experimental_chip', + -table_id => $ec_id, +); + + + +=head1 DESCRIPTION + +A ResultSet object provides access to a set raw results from an Experiment. A set will be one or more +contiguous chips to be treated as one set, with the same analysis. Duplicate sets will form a separate +result set, as will the same raw data analysed or normalised in a different manner. + +=cut + +#To do +#Change add_table_id to add_ExperimentalChip_Channel? + + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ResultSet; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate); +use Bio::EnsEMBL::Funcgen::Set; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Set); + + +=head2 new + + Arg [-ANALYSIS] : + + + + Example : my $feature = Bio::EnsEMBL::Funcgen::ResultSet->new( + -dbid => $dbid, + -analysis => $analysis, + -table_name => 'experimental_chip', + -table_id => $ec_id, + -result_feature_set => 1, + ); + Description: Constructor for ResultSet objects. + Returntype : Bio::EnsEMBL::Funcgen::ResultSet + Exceptions : Throws if no experiment_id defined + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, ('-feature_class' => 'result')); + + my ($table_name, $table_id, $rf_set, $dbfile_data_dir) + = rearrange(['TABLE_NAME', 'TABLE_ID', 'RESULT_FEATURE_SET', 'DBFILE_DATA_DIR'], @_); + + $self->{'table_id_hash'} = {}; + + #maybe don't need tha analysis args as mandatory as we're testing in the adaptor store method + if (! $table_name){ + throw("Need to pass the following arg:\t-table_name"); + } + + #do we need some control of creating new objects with dbID and adding result_groups/feature_sets and them storing/updating them + #potential for someone to create one from new using a duplicate dbID and then linking incorrect data to a pre-existing ResultGroup + #we need to verify that each table_name/id in the set is from the same experiment + $self->table_name($table_name); + $self->add_table_id($table_id) if $table_id; + $self->result_feature_set($rf_set) if $rf_set; + $self->dbfile_data_dir($dbfile_data_dir) if $dbfile_data_dir; + + return $self; +} + + +#These are CollectionContainer? methods +#For a core track the would probably be in the Analysis +#All other collection methods are in ResultFeatureAdaptor(and parents) + +=head2 get_dbfile_path_by_window_size + + Arg[1] : int - window size + Arg[2] : OPTIONAL Bio::EnsEMBL::Slice Used when generating individual seq_region Collections + Example : my $filepath = $self->get_dbfile_path_by_ResultSet_window_size($rset, $wsize); + Description: Generates the default dbfile path for a given ResultSet and window_size + Returntype : string + Exceptions : Throws if Slice is not valid + Caller : general + Status : At risk + +=cut + +sub get_dbfile_path_by_window_size{ + my ($self, $window_size, $slice) = @_; + + if($slice){ + + if(! (ref($slice) && $slice->isa("Bio::EnsEMBL::Slice"))){ + throw('You must provide a valid Bio::EnsEMBL::Slice'); + } + + $window_size .= '.'.$slice->seq_region_name; + } + + return $self->dbfile_data_dir.'/result_features.'.$self->name.'.'.$window_size.'.col'; +} + + +=head2 dbfile_data_dir + + Arg[1] : OPTIONAL string - data directory for this ResultSet + Example : my $dbfile_data_dir = $self->dbfile_data_dir; + Description: Getter/Setter for the root dbfile data directory for this ResultSet + Returntype : string + Exceptions : None + Caller : self + Status : at risk + +=cut + + + +sub dbfile_data_dir{ + my ($self, $data_dir) = @_; + + $self->{'dbfile_data_dir'} = $data_dir if defined $data_dir; + + return $self->{'dbfile_data_dir'}; +} + + + +=head2 result_feature_set + + Arg [1] : optional - boolean 0 or 1. + Example : if($rset->result_feature_set){ ...use result_feature table ...}; + Description: Getter and setter for the result_feature_set attribute. + Returntype : boolean + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub result_feature_set{ + my $self = shift; + + $self->{'result_feature_set'} = shift if @_;; + return $self->{'result_feature_set'}; +} + + +=head2 table_name + + Arg [1] : (optional) string - table_name (experimental_chip, channel or input_set) + Example : $result_set->experiment_id($exp_id); + Description: Getter and setter for the table_name for this ResultSet. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub table_name{ + my $self = shift; + + if (@_){ + + if($self->{'table_name'} && ($self->{'table_name'} ne $_[0])){ + throw("Cannot mix table name/types of a ResultSet"); + } + + $self->{'table_name'} = $_[0]; + } + + return $self->{'table_name'}; +} + + + +=head2 add_table_id + + Example : $result_set->add_table_id($ec_id, $cc_id); + Description: Caches table_id result_set_input_id to the ResultSet. In the case of an + array ResultSet, the unique result_set_input_id is used to key into the + result table, it also reduces redundancy and enable mapping of results to chips + rather than just the ResultSet. This enables result retrieval + based on chips in the same set which have a differing status. + In the case of a sequencing ResultSet, this simply refers to the InputSet ids. + Returntype : None + Exceptions : Throws if no table_id defined + Caller : General + Status : At Risk + +=cut + +sub add_table_id { + my ($self, $table_id, $cc_id) = @_; + + if (! defined $table_id){ + throw("Need to pass a table_id"); + }else{ + + if((exists $self->{'table_id_hash'}->{$table_id}) && (defined $self->{'table_id_hash'}->{$table_id})){ + throw("You are attempting to redefine a result_set_input_id which is already defined"); + } + + $self->{'table_id_hash'}->{$table_id} = $cc_id; + + } + + return; +} + + +=head2 table_ids + + Example : $result_set->feature_group_id($fg_id); + Description: Getter and setter for the feature_group_id for this ResultSet. + Returntype : int + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub table_ids { + my $self = shift; + + return [ keys %{$self->{'table_id_hash'}} ]; +} + + +sub chip_channel_ids { + my $self = shift; + + deprecate('ResultSet::chip_channel_ids is deprecated, please use result_set_input_ids'); + + return $self->result_set_input_ids; +} + +=head2 result_set_input_ids + + Example : my @rset_rsi_ids = @{$result_set->result_set_input_ids()}; + Description: Getter for the input ids for this ResultSet. + Returntype : arrayref + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub result_set_input_ids { + my $self = shift; + + return [ values %{$self->{'table_id_hash'}} ]; +} + + + + +=head2 contains + + Example : if($result_set->contains($chip_or_channel)){...do some chip or channel erpartions here...}; + Description: Returns true if the given Channel or ExperimentalChip is part of this ResultSet + Returntype : boolean + Exceptions : warns if ResultSet table name is not of argument type + Caller : General + Status : At Risk + +=cut + + +sub contains{ + my ($self, $chip_channel) = @_; + + my $contains = 0; + my @tables = $chip_channel->adaptor->_tables(); + my ($table_name, undef) = @{$tables[0]}; + + if($table_name ne $self->table_name()){ + warn("ResultSet(".$self->table_name().") cannot contain ${table_name}s"); + }else{ + $contains = 1 if (exists $self->{'table_id_hash'}->{$chip_channel->dbID()}); + } + + return $contains; +} + +=head2 get_result_set_input_id + + Arg [1] : int - dbID (experimental_chip, channel or input_set) + Example : $result_set->get_result_set_input_id($ec_id); + Description: Retrieves a result_set_input_id from the cache given a dbID + Returntype : int + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub get_result_set_input_id{ + my ($self, $table_id) = @_; + + return (exists $self->{'table_id_hash'}->{$table_id}) ? $self->{'table_id_hash'}->{$table_id} : undef; +} + + +sub get_chip_channel_id{ + my ($self, $table_id) = @_; + + deprecate('ResultSet::get_chip_channel_ids is dperecated, please us get_result_set_input_id'); + return $self->get_result_set_input_id($table_id); +} + + + +=head2 get_InputSets + + Example : my @ecs = @{$result_set->get_ExperimentalChips()}; + Description: Retrieves a chip_channel_id from the cahce given an ExperimentalChip dbID + Returntype : Listref of ExperimentalChip object + Exceptions : warns is not an experimental_chip ResultSet + Caller : General + Status : At Risk + +=cut + +sub get_InputSets{ + my $self = shift; + + if($self->table_name ne 'input_set'){ + warn 'Cannot get_InputSets for an array based ResultSet'; + return; + } + + + + if(! defined $self->{'input_sets'}){ + my $is_adaptor = $self->adaptor->db->get_InputSetAdaptor(); + + foreach my $is_id(@{$self->table_ids()}){ + push @{$self->{'input_sets'}}, $is_adaptor->fetch_by_dbID($is_id); + } + } + + return $self->{'input_sets'}; +} + + +=head2 get_ExperimentalChips + + Example : my @ecs = @{$result_set->get_ExperimentalChips()}; + Description: Retrieves a chip_channel_id from the cahce given an ExperimentalChip dbID + Returntype : Listref of ExperimentalChip object + Exceptions : warns is not an experimental_chip ResultSet + Caller : General + Status : At Risk + +=cut + +sub get_ExperimentalChips{ + my $self = shift; + + if($self->table_name eq 'input_set'){ + warn 'Cannot get_ExperimentalChips for an InputSet ResultSet'; + return; + } + + if(! defined $self->{'experimental_chips'}){ + my $ec_adaptor = $self->adaptor->db->get_ExperimentalChipAdaptor(); + + if($self->table_name() eq "experimental_chip"){ + + foreach my $ec_id(@{$self->table_ids()}){ + #warn "Getting ec with id $ec_id"; + push @{$self->{'experimental_chips'}}, $ec_adaptor->fetch_by_dbID($ec_id); + #should this be hashed on chip_channel_id? + } + }else{ + #warn("Retrieving ExperimentalChips for a Channel ResultSet"); + + my %echips; + my $chan_adaptor = $self->adaptor->db->get_ChannelAdaptor(); + + foreach my $chan_id(@{$self->table_ids()}){ + my $chan = $chan_adaptor->fetch_by_dbID($chan_id); + $echips{$chan->experimental_chip_id} ||= $ec_adaptor->fetch_by_dbID($chan->experimental_chip_id); + } + + @{$self->{'experimental_chips'}} = values %echips; + } + } + + return $self->{'experimental_chips'}; +} + + + +=head2 get_replicate_set_by_result_set_input_id + + Arg[0] : int - chip_channel_id + Example : my $rep_set_name = $result_set->get_replicate_set_by_result_set_input_id($cc_id); + Description: Retrieves the replicate set name defined by the corresponding ExperimentalChip + Returntype : String - replicate set name + Exceptions : + Caller : General + Status : At Risk - implement for Channels? + +=cut + +#Where is this used? + +sub get_replicate_set_by_result_set_input_id{ + my ($self, $cc_id) = @_; + + if( ! defined $self->{'_replicate_cache'}){ + + warn "Generating replicate cache!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"; + + + foreach my $ec (@{$self->get_ExperimentalChips()}){ + + $self->{'_replicate_cache'}{$self->get_result_set_input_id($ec->dbID())} = $ec->replicate(); + + + } + } + + + #warn here of absent replicate info? + + return (exists $self->{'_replicate_cache'}{$cc_id}) ? $self->{'_replicate_cache'}{$cc_id} : undef; + +} + +sub get_replicate_set_by_chip_channel_id{ + my ($self, $cc_id) = @_; + + deprecate('Please use get_replicate_set_by_result_set_input_id instead'); + return $self->get_replicate_set_by_result_set_input_id($cc_id); +} + + +=head2 get_displayable_ResultFeatures_by_Slice + + Arg[1] : Bio::EnsEMBL::Slice + Arg[2] : Boolean - with probe flag, will nest Probe object in ResultFeature + Example : my @results = @{$ResultSet->get_all_displayable_ResultFeatures_by_Slice($slice)}; + Description: Simple wrapper method for ResultFeatureAdaptor::fetch_all_by_Slice_ResultSet + Returntype : Arrayref of ResultFeatures + Exceptions : None + Caller : General + Status : At Risk + +=cut + + +sub get_displayable_ResultFeatures_by_Slice{ + my ($self, $slice, $with_probe, $max_bins, $window_size, $constraint) = @_; + return $self->adaptor->fetch_ResultFeatures_by_Slice_ResultSet($slice, $self, 'DISPLAYABLE', $with_probe, $max_bins, $window_size, $constraint); +} + + + + +=head2 get_ResultFeatures_by_Slice + + Arg[1] : Bio::EnsEMBL::Slice + Arg[2] : string - Status name e.g. 'DISPLAYABLE' + Arg[3] : Boolean - with probe flag, will nest Probe object in ResultFeature + Arg[4] : int - Max bins i.e. pixel width of display + Arg[5] : int - window_size + Arg[6] : string - constraint + Example : my @rfs_with_rpobe = @{$ResultSet->get_all_ResultFeatures_by_Slice($slice, undef, 1)}; + Description: Simple wrapper method for ResultFeatureAdaptor::fetch_all_by_Slice_ResultSet + Returntype : Arrayref of ResultFeatures + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub get_ResultFeatures_by_Slice{ + my ($self, $slice, $status, $with_probe, $max_bins, $window_size, $constraint) = @_; + return $self->adaptor->db->get_ResultFeatureAdaptor->fetch_all_by_Slice_ResultSet($slice, $self, $status, $with_probe, $max_bins, $window_size, $constraint); +} + + + +#Floats unpack inaccurately so need 3 sigfiging +#This should match the format in which they are originally stored +#This is dependant on ResultSet type i.e. reads or intensity? +#No format for reads! +#Should this be set in the ResultSet instead? +#It may be more efficient for the caller to test for format first rather than blindly printf'ing +#even if there is no format? +#This needs setting in new, so we don't have to eval for every score. + +sub score_format{ + return '%.3f'; +} + + + + +=head2 log_label + + Example : print $rset->log_label(); + Description: Get a string of the unique key fields for logging purposes + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub log_label { + my $self = shift; + + my $label; + + if(defined $self->feature_type()){ + $label = $self->feature_type->name.":"; + }else{ + $label = "Unknown FeatureType:"; + } + + if(defined $self->cell_type()){ + $label .= $self->cell_type->name; + }else{ + $label .= "Uknown CellType"; + } + + return $self->name.":".$self->analysis->logic_name.":".$label; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Alignment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Alignment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,67 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::Alignment; + +=head1 DESCRIPTION + +'Alignment' Is a base class for other classes dealing with Alignment +It contains virtually nothing so it may disappear and just pass to Funcgen + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::Alignment; + +use warnings; +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +sub fetch_input { + my $self = shift @_; + + $self->param("file_type","sam"); + + $self->SUPER::fetch_input(); + + #Just override input folders... maybe consider overriding output folders too? + my $input_dir = $self->_work_dir()."/fastq/".$self->_species()."/".$self->_experiment_name()."/". + $self->_cell_type()->name."_".$self->_feature_type()->name; + $self->_input_dir($input_dir); + + #Folder where to send the final results... + my $repository = $self->_work_dir()."/alignments/".$self->_species."/".$self->_assembly()."/".$self->_experiment_name(); + $self->_repository($repository); + + return 1; +} + +#Private getter / setter to the input folder +sub _input_dir { + return $_[0]->_getter_setter('input_dir',$_[1]); +} + +#Private getter / setter to the repository folder +sub _repository { + return $_[0]->_getter_setter('repository',$_[1]); +} + +#Private getter / setter to an input file +sub _input_file { + return $_[0]->_getter_setter('input_file',$_[1]); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/AnnotateRegulatoryFeatures.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/AnnotateRegulatoryFeatures.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1651 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::AnnotateRegulatoryFeatures + +=head1 DESCRIPTION + +'AnnotateRegulatoryFeatures' + implements all Damian's regulatory annotation scripts in one single process + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::AnnotateRegulatoryFeatures; + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Annotation'); + +use constant NO_ROWS => '0E0'; + +sub fetch_input { # fetch parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $cell_type = $self->param('cell_type') || throw "No cell type defined"; + $self->_cell_type($cell_type); + + return 1; +} + +sub run { # Check parameters and do appropriate database/file operations... + my $self = shift @_; + + #Create a database for this cell_type... maybe add this step to setup? + my %wparams = %{$self->_workdb_params}; + my $dbparams = "-h".$wparams{'-host'}." -u".$wparams{'-user'}." -P".$wparams{'-port'}." -p".$wparams{'-pass'}; + #my $dbname = "annotation_".$self->_species."_".$self->_release."_".$self->_cell_type; + my $ct = $self->_cell_type; + $ct =~ s/\-/_/g; + my $dbname = "annotation_".$self->_efgdba->dbc->dbname."_".$ct; + + system("mysql $dbparams -e 'drop database if exists ".$dbname."; create database ".$dbname."';") && throw "Could not create work database $dbname"; + $wparams{'-dbname'} = $dbname; + + eval{ + $self->_workdb(Bio::EnsEMBL::DBSQL::DBConnection->new( %wparams )); + }; + if($@) { throw "Error creating the Work DB Connection: $@"; } + if(!$self->_workdb()){ throw "Could not create Work DB server connection"; } + + my $workdb = $self->_workdb; + + my $dump_dir = $self->_output_dir."/tmp_".$self->_cell_type."_".int(rand(100)); + + open(REPORT,">".$self->_output_dir."/".$self->_cell_type.".report"); + + _copy_data($self->_dnadba->dbc,$self->_efgdba->dbc,$workdb,$dump_dir); + + #Perhaps make a feature factory?... Each feature knows which data it needs... + # ORDER MATTERS!!?? + _create_transcript_features($workdb); + _create_repeat_features($workdb); + _create_gene_features($workdb); + _create_PolIII_features($workdb); + _create_exon_features($workdb); + _create_intergenic_features($workdb); + _cage_ditag_transcript_tss($workdb); + + #Not in use at the moment... + #&karyotype_features($dbh,$dbu); + #&cpg_features($dbh,$dbu); + + #Process + _process_regulatory_features($self->_dnadba->dbc,$self->_efgdba->dbc,$workdb,$self->_cell_type); + + _generate_random_frags($workdb); + + #read from file or parameters... + my @gen_feats; + push @gen_feats, "protein_coding_exon1_plus_enhancer"; + push @gen_feats, "protein_coding_transcript_downstream_2500"; + push @gen_feats, "protein_coding_single_exon_gene_plus_enhancer"; + push @gen_feats, "protein_coding_intron1"; + push @gen_feats, "protein_coding_gene_body"; + push @gen_feats, "intergenic_2500"; + push @gen_feats, "RNA_gene_single_exon_gene_plus_enhancer"; + # This one needs cage_ditag_transcript_tss + push @gen_feats, "tss_centred_5000"; + push @gen_feats, "PolIII_transcribed_gene_plus_enhancer"; + + _calculate_associations($workdb, $self->_output_dir."/".$self->_cell_type, \@gen_feats, $self->_species); + + #&clean_temp(); + + close(REPORT); + + return 1; +} + + +sub write_output { + my $self = shift @_; + + return 1; + +} + +# Dumps data from core and other relevant databases and imports it into a working db +sub _copy_data { + my ($dnadb, $efgdb, $workdb, $dump_dir) = (shift,shift,shift,shift); + + # Copy data from core + system("mkdir -p ".$dump_dir) && throw "Error creating $dump_dir folder"; + + my $dump_templ = "mysqldump --opt --skip-lock-tables -h ".$dnadb->host. + " -u ".$dnadb->username. + " -P ".$dnadb->port. + " ".$dnadb->dbname. + ' %s '. + " > ${dump_dir}/".'%s'.".dump"; + + my $load_templ = "mysql -h ".$workdb->host. + " -u ".$workdb->username. + " -P ".$workdb->port. + " -p".$workdb->password. + " ".$workdb->dbname. + " < ${dump_dir}/".'%s'.".dump"; + + my @core_tables = ( + 'transcript', + 'transcript_stable_id', + 'gene', + 'gene_attrib', + 'attrib_type', + 'exon', + 'exon_transcript', + 'coord_system', + 'meta_coord', + 'seq_region', + 'seq_region_attrib', + 'assembly', + 'seq_region_attrib', + 'simple_feature', + 'analysis', + 'karyotype', + 'xref', + 'object_xref', + 'external_db', + 'translation', + 'analysis_description' + ); + + push @core_tables, ('repeat_feature','repeat_consensus'); + + foreach my $table (@core_tables){ + + #unless( $core_dbu->table_exists($table)){die ("ERROR: The core database does not contain table $table")} + + my $command = sprintf($dump_templ,$table,$table); + system($command) && throw "Error running $command"; + $command = sprintf($load_templ,$table); + system($command) && throw "Error running $command"; + } + + # copy data from funcgen + $dump_templ = "mysqldump --opt --skip-lock-tables -h ".$efgdb->host. + " -u ".$efgdb->username. + " -P ".$efgdb->port; + if($efgdb->password){ + $dump_templ .= " -p".$efgdb->password; + } + $dump_templ .= " ".$efgdb->dbname. + ' %s '. + " > ${dump_dir}/".'%s'.".dump"; + + my @funcgen_tables =( 'regulatory_attribute', + 'feature_type', + 'regulatory_feature', + 'feature_set', + 'data_set', + 'supporting_set', + 'external_feature', + 'meta', + 'annotated_feature' + ); + + + foreach my $table (@funcgen_tables){ + + #Do some checking?? + #unless( $source_dbu->table_exists($table)){die ("ERROR: The funcgen database does not contain table $table")} + + my $command = sprintf($dump_templ,$table,$table); + system($command) && throw "Error running $command"; + $command = sprintf($load_templ,$table); + system($command) && throw "Error running $command"; + } + + system("rm -r -f ".$dump_dir) && throw "Error deleting $dump_dir folder"; + +} + +sub _create_PolIII_features{ + my($dbc) = @_; + + my @sql; + push @sql, "insert into PolIII_transcribed_gene select * from all_repeats where feature_type = 'PolIII_transcribed_repeat'"; + push @sql, "insert into PolIII_transcribed_gene select * from all_repeats where feature_type = 'tRNA_repeat'"; + + push @sql, "drop table if exists PolIII_transcribed_gene_plus_enhancer"; + push @sql, "create table PolIII_transcribed_gene_plus_enhancer select seq_region_id, seq_region_name,feature_id, 'PolIII_transcribed_gene_plus_enhancer' as feature_type,if(feature_strand = 1,feature_start-2500,feature_start) as feature_start, if(feature_strand = 1,feature_end,feature_end +2500) as feature_end, feature_strand from PolIII_transcribed_gene"; + + push @sql, _col_types_and_indices('PolIII_transcribed_gene_plus_enhancer'); + + _run_sql($dbc,@sql) or throw "cannot run sql"; +} + + +sub _create_gene_features{ + my ($dbc) = @_; + + my @sql; + push @sql,"drop table if exists protein_coding_gene"; + push @sql,"create table protein_coding_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,'protein_coding_gene' as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('protein_coding')"; + push @sql,"alter table protein_coding_gene add index(seq_region_name)"; + push @sql,"alter table protein_coding_gene add index(seq_region_id)"; + + + push @sql,"drop table if exists pseudogene"; + push @sql,"create table pseudogene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('pseudogene','repeat','retrotransposed')"; + push @sql,"alter table pseudogene add index(seq_region_name)"; + push @sql,"alter table pseudogene add index(seq_region_id)"; + + + push @sql,"drop table if exists RNA_gene"; + push @sql,"create table RNA_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype not like '%pseudogene%'"; + push @sql,"alter table RNA_gene add index(seq_region_name)"; + push @sql,"alter table RNA_gene add index(seq_region_id)"; + + + #This section is possibly not necessary... + push @sql,"drop table if exists processed_transcript"; # used for QC + push @sql,"create table processed_transcript select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('processed_transcript')"; + push @sql,"alter table processed_transcript add index(seq_region_name)"; + push @sql,"alter table processed_transcript add index(seq_region_id)"; + + + push @sql,"drop table if exists RNA_pseudogene"; + push @sql,"create table RNA_pseudogene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype like '%pseudogene%'"; + push @sql,"alter table RNA_pseudogene add index(seq_region_name)"; + push @sql,"alter table RNA_pseudogene add index(seq_region_id)"; + + + push @sql,"drop table if exists lincRNA_gene"; + push @sql,"create table lincRNA_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id,concat(biotype,'_gene') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype = 'lincRNA' "; + push @sql,"alter table lincRNA_gene add index(seq_region_name)"; + push @sql,"alter table lincRNA_gene add index(seq_region_id)"; + + _run_sql($dbc,@sql) or throw "cannot run sql"; + + @sql=(); + push @sql,"drop table if exists PolIII_transcribed_gene"; + push @sql,"create table PolIII_transcribed_gene select * from lincRNA_gene where 1=0"; # make empty table + + foreach my $desc ('Nuclear RNase P','RNase MRP','Vault','Y RNA','5S ribosomal RNA','7SK RNA',){ + push @sql,"insert into PolIII_transcribed_gene select sr.seq_region_id,sr.name as seq_region_name,gene_id as feature_id, '$desc' as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from gene t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.description like '$desc".'%'."'"; + } + push @sql,"alter table PolIII_transcribed_gene add index(seq_region_name)"; + push @sql,"alter table PolIII_transcribed_gene add index(seq_region_id)"; + + _run_sql($dbc,@sql) or throw "cannot run sql"; + +} + +# creates tables storing transcript related features +sub _create_transcript_features{ + my ($dbc) = @_; + + my @sql; + push @sql,"drop table if exists protein_coding_transcript"; + push @sql,"create table protein_coding_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,'protein_coding_transcript' as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('protein_coding')"; + push @sql,"alter table protein_coding_transcript add index(seq_region_name)"; + push @sql,"alter table protein_coding_transcript add index(seq_region_id)"; + + push @sql,"drop table if exists protein_coding_transcript_downstream_2500"; + push @sql,"create table protein_coding_transcript_downstream_2500 select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,'protein_coding_transcript_downstream_2500' as feature_type,if(seq_region_strand = 1, t.seq_region_end+1,t.seq_region_start -2500) as feature_start,if(seq_region_strand = 1,t.seq_region_end+2500,t.seq_region_start - 1) as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('protein_coding')"; + push @sql,"alter table protein_coding_transcript_downstream_2500 add index(seq_region_name)"; + push @sql,"alter table protein_coding_transcript_downstream_2500 add index(seq_region_id)"; + + push @sql,"drop table if exists pseudogene_transcript"; + push @sql,"create table pseudogene_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,concat(biotype,'_transcript') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype in ('pseudogene','repeat','retrotransposed')"; + push @sql,"alter table pseudogene_transcript add index(seq_region_name)"; + push @sql,"alter table pseudogene_transcript add index(seq_region_id)"; + + push @sql,"drop table if exists RNA_transcript"; + push @sql,"create table RNA_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,concat(biotype,'_transcript') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype not like '%pseudogene%'"; + push @sql,"alter table RNA_transcript add index(seq_region_name)"; + push @sql,"alter table RNA_transcript add index(seq_region_id)"; + + push @sql,"drop table if exists RNA_pseudogene_transcript"; + push @sql,"create table RNA_pseudogene_transcript select sr.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id,concat(biotype,'_transcript') as feature_type, t.seq_region_start as feature_start,t.seq_region_end as feature_end, t.seq_region_strand as feature_strand from transcript t, seq_region sr where t.seq_region_id = sr.seq_region_id and t.biotype like '%RNA%' and t.biotype like '%pseudogene%'"; + push @sql,"alter table RNA_pseudogene_transcript add index(seq_region_name)"; + push @sql,"alter table RNA_pseudogene_transcript add index(seq_region_id)"; + + _run_sql($dbc,@sql) or throw "Couldn't create transcript features"; + +} + +#for repeats on top level sequence +sub _create_repeat_features{ + my($dbc)=@_; + + my @sql; + my $temp1 = "tmp_".int(rand(1000)); + # we lose a few mappings because some mappings are on parts of the contigs + # which are not included in the assembly + push @sql,"drop table if exists $temp1"; + + my $q= "create table $temp1 + select f.*, + f.seq_region_start as chr_start, + f.seq_region_end as chr_end, + f.seq_region_id as chromosome_id, + asm_sr.name as chr_name +from repeat_feature f, + analysis an, + seq_region asm_sr +where f.analysis_id = an.analysis_id +and an.logic_name = 'RepeatMask' +and f.seq_region_id = asm_sr.seq_region_id "; + + $q =~ tr/\n/ /; + push @sql, $q; + + push @sql,"alter table $temp1 add index(repeat_consensus_id)"; + # now add the repeat type + push @sql,"drop table if exists all_repeats"; + push @sql,"create table all_repeats select t1.chromosome_id as seq_region_id,t1.chr_name as seq_region_name,repeat_feature_id as feature_id,concat(rc.repeat_class,'_repeat') as feature_type,chr_start as feature_start,chr_end as feature_end , 0 as feature_strand from $temp1 t1, repeat_consensus rc where rc.repeat_consensus_id = t1.repeat_consensus_id "; + # we want the U6 snRNA, 5S rRNA, 7SLRNA etc separately for pol3 transcribed regions + push @sql,"insert into all_repeats select t1.chromosome_id as seq_region_id,t1.chr_name as seq_region_name,repeat_feature_id as feature_id,'PolIII_transcribed_repeat' as feature_type,chr_start as feature_start,chr_end as feature_end , 0 as feature_strand from $temp1 t1, repeat_consensus rc where rc.repeat_consensus_id = t1.repeat_consensus_id and rc.repeat_name in ('U6','5S','7SLRNA','7SK') "; + + push @sql, "alter table all_repeats add index(feature_type)"; + + push @sql,"drop table $temp1"; + + + _run_sql($dbc,@sql) or throw "cannot run sql"; + + my $aref = ['Satellite_repeat','Satellite/centr_repeat','LTR/ERV1_repeat','LTR/MaLR_repeat','SINE/Alu_repeat' ]; + @sql = (); + foreach my $type (@$aref){ + @sql = (); + my $table = $type; + $table =~ tr/\//_/; + $table =~ tr/-/_/; + $table =~ tr/?/_/; + push @sql,"drop table if exists $table"; + push @sql,"create table $table select * from all_repeats where feature_type = '$type'"; + push @sql,"update $table set feature_type = '$table'"; + push @sql, _col_types_and_indices("$table"); + _run_sql($dbc,@sql) or throw "cannot run sql for $type"; + #print "$table\t$table\t$table\t0\t0\t0\n"; + } + +} + +sub _create_exon_features{ + my($dbc)=@_; + my @sql; + + my $temp1 = "tmp_".int(rand(1000))."_1"; + # denormalise exon, transcript and exon_transcript + push @sql, "drop table if exists $temp1"; + push @sql, "create table $temp1 select e.*,et.rank,et.transcript_id,t.biotype from exon e, exon_transcript et, transcript t where e.exon_id = et.exon_id and et.transcript_id = t.transcript_id order by et.transcript_id,e.seq_region_start"; + push @sql, "alter table $temp1 add index(transcript_id)"; + + # get rank of last exon for each transcript + my $temp2 = "tmp_".int(rand(1000))."_2"; + push @sql, "drop table if exists $temp2"; + push @sql, "create table $temp2 select transcript_id,max(rank) as max_rank from $temp1 group by transcript_id"; + push @sql, "alter table $temp2 add index(transcript_id)"; + my $temp3 = "tmp_".int(rand(1000))."_3"; + push @sql, "drop table if exists $temp3"; + push @sql, "create table $temp3 select t1.*,t2.max_rank from $temp1 t1, $temp2 t2 where t1.transcript_id = t2.transcript_id"; + push @sql, "drop table if exists $temp2"; + push @sql, "drop table if exists $temp1"; + push @sql, "alter table $temp3 add index(transcript_id)"; + push @sql, "alter table $temp3 add index(max_rank)"; + push @sql, "alter table $temp3 add index(rank)"; + push @sql, "alter table $temp3 add index(seq_region_id)"; + + + # get first intron for each transcript + my $temp4 = "tmp_".int(rand(1000))."_4"; + push @sql, "create table $temp4 select t3.seq_region_id,sr.name as seq_region_name,t3.transcript_id as feature_id, concat(t3.biotype,'_intron1') as feature_type,if(t3.seq_region_strand = 1,t3.seq_region_end+1,b.seq_region_end+1) as feature_start, if(t3.seq_region_strand = 1,b.seq_region_start-1,t3.seq_region_start -1) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, $temp3 b, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.transcript_id = b.transcript_id and t3.rank = 1 and b.rank = 2"; + push @sql,"drop table if exists intron1"; + push @sql,"alter table $temp4 rename as intron1"; + push @sql,_col_types_and_indices("intron1"); + push @sql,_split_by_biotype("intron1"); + _run_sql($dbc,@sql); + + # promoter defined as 500 bp upstream + push @sql, "drop table if exists exon1_plus_promoter"; + push @sql, "create table exon1_plus_promoter select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_exon1_plus_promoter') as feature_type,if(seq_region_strand = 1,t3.seq_region_start-500,seq_region_start) as feature_start, if(seq_region_strand = 1,seq_region_end,seq_region_end +500) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.rank = 1"; + push @sql,_col_types_and_indices("exon1_plus_promoter"); + push @sql,_split_by_biotype("exon1_plus_promoter"); + + + # enhancer defined as 2500 bp upstream + push @sql, "drop table if exists exon1_plus_enhancer"; + push @sql, "create table exon1_plus_enhancer select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_exon1_plus_enhancer') as feature_type,if(seq_region_strand = 1,t3.seq_region_start-2500,seq_region_start) as feature_start, if(seq_region_strand = 1,seq_region_end,seq_region_end +2500) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.rank = 1"; + push @sql,_col_types_and_indices("exon1_plus_enhancer"); + push @sql,_split_by_biotype("exon1_plus_enhancer"); + + push @sql, "drop table if exists single_exon_gene"; + push @sql, "create table single_exon_gene select t3.seq_region_id,sr.name as seq_region_name,exon_id as feature_id, concat(t3.biotype, '_single_exon_gene') as feature_type,t3.seq_region_start as feature_start, t3.seq_region_end as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.max_rank = 1"; + push @sql,_col_types_and_indices("single_exon_gene"); + push @sql,_split_by_biotype("single_exon_gene"); + + push @sql, "drop table if exists single_exon_gene_plus_enhancer"; + push @sql, "create table single_exon_gene_plus_enhancer select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_single_exon_gene_plus_enhancer') as feature_type,if(seq_region_strand = 1,t3.seq_region_start-2500,seq_region_start) as feature_start, if(seq_region_strand = 1,seq_region_end,seq_region_end +2500) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.max_rank = 1"; + push @sql,_col_types_and_indices("single_exon_gene_plus_enhancer"); + push @sql,_split_by_biotype("single_exon_gene_plus_enhancer"); + + # gene body is all exons and introns except the first of each + push @sql, "drop table if exists gene_body"; + push @sql, "create table gene_body select t3.seq_region_id,sr.name as seq_region_name,transcript_id as feature_id, concat(t3.biotype,'_gene_body') as feature_type,min(t3.seq_region_start) as feature_start, max(t3.seq_region_end) as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id and t3.max_rank != 1 and t3.rank != 1 group by transcript_id"; + push @sql,_col_types_and_indices("gene_body"); + push @sql,_split_by_biotype("gene_body"); + + + push @sql, "drop table if exists exon_plus_flanks_500"; + push @sql, "create table exon_plus_flanks_500 select distinct t3.seq_region_id,sr.name as seq_region_name,exon_id as feature_id, concat(t3.biotype,'_exon_plus_flanks_500') as feature_type,t3.seq_region_start-500 as feature_start, seq_region_end + 500 as feature_end,t3.seq_region_strand as feature_strand from $temp3 t3, seq_region sr where t3.seq_region_id = sr.seq_region_id "; + push @sql,_col_types_and_indices("exon_plus_flanks_500"); + push @sql,_split_by_biotype("exon_plus_flanks_500"); + + + push @sql, "drop table if exists $temp3"; + + _run_sql($dbc,@sql) or throw "cannot run sql"; + +} + +sub _split_by_biotype{ + my($feat_name) = @_; + + my @sql; + + my $prots = "'protein_coding_$feat_name', 'IG_V_gene_$feat_name', 'IG_C_gene_$feat_name', 'IG_J_gene_$feat_name', 'IG_D_gene_$feat_name'"; + push @sql,"drop table if exists protein_coding_$feat_name"; + push @sql,"create table protein_coding_$feat_name select * from $feat_name where feature_type in ($prots)"; + push @sql,"update protein_coding_$feat_name set feature_type = 'protein_coding_$feat_name'"; + push @sql,_col_types_and_indices("protein_coding_$feat_name"); + + push @sql,"drop table if exists RNA_gene_$feat_name"; + push @sql,"create table RNA_gene_$feat_name select * from $feat_name where feature_type like '%RNA_".$feat_name."'"; + push @sql,_col_types_and_indices("RNA_gene_$feat_name"); + + if($feat_name eq 'exon1_plus_enhancer'){ + push @sql,"drop table if exists snRNA_gene_$feat_name"; + push @sql,"create table snRNA_gene_$feat_name select * from $feat_name where feature_type like '%snRNA_".$feat_name."'"; + push @sql,"update snRNA_gene_$feat_name set feature_type = 'snRNA_gene_$feat_name'"; + push @sql,_col_types_and_indices("snRNA_gene_$feat_name"); + + push @sql,"drop table if exists snoRNA_gene_$feat_name"; + push @sql,"create table snoRNA_gene_$feat_name select * from $feat_name where feature_type like '%snoRNA_".$feat_name."'"; + push @sql,"update snoRNA_gene_$feat_name set feature_type = 'snoRNA_gene_$feat_name'"; + push @sql,_col_types_and_indices("snoRNA_gene_$feat_name"); + + push @sql,"drop table if exists miRNA_gene_$feat_name"; + push @sql,"create table miRNA_gene_$feat_name select * from $feat_name where feature_type like '%miRNA_".$feat_name."'"; + push @sql,"update miRNA_gene_$feat_name set feature_type = 'miRNA_gene_$feat_name'"; + push @sql,_col_types_and_indices("miRNA_gene_$feat_name"); + + push @sql,"drop table if exists miscRNA_gene_$feat_name"; + push @sql,"create table miscRNA_gene_$feat_name select * from $feat_name where feature_type like '%miscRNA_".$feat_name."'"; + push @sql,"update miscRNA_gene_$feat_name set feature_type = 'miscRNA_gene_$feat_name'"; + push @sql,_col_types_and_indices("miscRNA_gene_$feat_name"); + + push @sql,"drop table if exists rRNA_gene_$feat_name"; + push @sql,"create table rRNA_gene_$feat_name select * from $feat_name where feature_type like '%rRNA_".$feat_name."'"; + push @sql,"update rRNA_gene_$feat_name set feature_type = 'rRNA_gene_$feat_name'"; + push @sql,_col_types_and_indices("rRNA_gene_$feat_name"); + } + + my $pseuds = "'pseudogene_$feat_name','repeat_$feat_name','retrotransposed_$feat_name'"; + push @sql,"drop table if exists pseudogene_$feat_name"; + push @sql,"create table pseudogene_$feat_name select * from $feat_name where feature_type in ($pseuds)"; + push @sql,_col_types_and_indices("pseudogene_$feat_name"); + + return @sql; +} + + +sub _cage_ditag_transcript_tss{ + my($dbc)=@_; + + my $dbh = $dbc->db_handle; + + my @sql; + my $temp1 = "tmp_".int(rand(1000))."_1"; + push @sql, "drop table if exists $temp1"; + push @sql, "create table $temp1 select f.seq_region_id,sr.name as seq_region_name,if(f.seq_region_strand = 1,f.seq_region_start,f.seq_region_end) as feature_start,f.seq_region_strand as feature_strand from transcript f , seq_region sr where sr.seq_region_id = f.seq_region_id and f.biotype not like '%pseudogene%' and f.biotype not in('repeat','retrotransposed')"; + + if($dbh->selectrow_array("select count(*) from information_schema.tables where table_schema ='".$dbc->dbname."' and table_name='ditag_feature'") > 0){ + #if($dbu->table_exists("ditag_feature")){ + push @sql, "insert into $temp1 select f.seq_region_id,sr.name as seq_region_name,if(f.seq_region_strand = 1,f.seq_region_start,f.seq_region_end) as feature_start,f.seq_region_strand from ditag_feature f , seq_region sr where sr.seq_region_id = f.seq_region_id"; + } + + my $temp2 = "tmp_".int(rand(1000))."_2"; + push @sql, "drop table if exists $temp2"; + push @sql, "create table $temp2 select distinct * from $temp1"; + + # we may want to do clustering here to reduce the total number of features + + push @sql, "drop table if exists tss_upstream_500"; + push @sql, "create table tss_upstream_500 select seq_region_id,seq_region_name,if(feature_strand = -1,feature_start,feature_start-500) as feature_start,if(feature_strand = -1,feature_start+500,feature_start) as feature_end,feature_strand,'tss_upstream_500' as feature_type from $temp2"; +push @sql,"alter table tss_upstream_500 add column feature_id int(10) not null auto_increment primary key"; + push @sql,_col_types_and_indices("tss_upstream_500"); + + + push @sql, "drop table if exists tss_downstream_500"; + push @sql, "create table tss_downstream_500 select seq_region_id,seq_region_name,if(feature_strand = -1,feature_start-500,feature_start) as feature_start,if(feature_strand = -1,feature_start,feature_start+500) as feature_end,feature_strand,'tss_downstream_500' as feature_type from $temp2"; +push @sql,"alter table tss_downstream_500 add column feature_id int(10) not null auto_increment primary key"; + push @sql,_col_types_and_indices("tss_downstream_500"); + + + push @sql, "drop table if exists tss_centred_500"; + push @sql, "create table tss_centred_500 select seq_region_id,seq_region_name,feature_start-250 as feature_start,feature_start+250 as feature_end,feature_strand,'tss_centred_500' as feature_type from $temp2"; +push @sql,"alter table tss_centred_500 add column feature_id int(10) not null auto_increment primary key"; + push @sql,_col_types_and_indices("tss_centred_500"); + + + + push @sql, "drop table if exists tss_centred_5000"; + push @sql, "create table tss_centred_5000 select seq_region_id,seq_region_name,feature_start-2500 as feature_start,feature_start+2500 as feature_end,feature_strand,'tss_centred_5000' as feature_type from $temp2"; +push @sql,"alter table tss_centred_5000 add column feature_id int(10) not null auto_increment primary key"; + push @sql,_col_types_and_indices("tss_centred_5000"); + + # + _run_sql($dbh,@sql) or die; + + @sql = (); + push @sql,"drop table if exists $temp1"; + push @sql,"drop table if exists $temp2"; + _run_sql($dbh,@sql) or die; + +} + +sub _create_intergenic_features{ + my ($dbc) = @_; + + my $dbh = $dbc->db_handle; + + # we process one seq_region ata a time + # if we want to include more biotypes of gene then we should select from + # the gene table + my $q = "select distinct seq_region_id from protein_coding_gene"; + my $regions_aref = $dbh->selectcol_arrayref($q); + unless(@$regions_aref > 0){ throw "no data returned by :\n$q\n" } + + my @sql; + my $temp1 = "tmp_".int(rand(1000)); + push @sql, "drop table if exists $temp1"; + # start and end are signed so we can use -ve numbers as flags + push @sql, "create table $temp1 (seq_region_id int(10) unsigned,feature_start int(10), feature_end int(10))"; + _run_sql($dbh,@sql) or throw "error running sql"; + + # first we get the regions which lie around/between the genes + my @intergenics; + foreach my $id (@$regions_aref){ + + @intergenics = (); + + my $region_length = $dbh->selectrow_array("select length from seq_region where seq_region_id = $id"); + + $q = "select feature_start,feature_end from protein_coding_gene where seq_region_id = $id order by feature_start"; + my $genes_aaref = $dbh->selectall_arrayref($q); + unless(defined $genes_aaref){ + throw "query failed on:\n$q\n".$dbh->errstr; + } + + my $last_end; + if($genes_aaref->[0]->[0] == 1){ + $last_end = $genes_aaref->[0]->[1]; + }else{ + my @intergenic; + $intergenic[0] = -1; # use -ve to flag chromosome end + $intergenic[1] = $genes_aaref->[0]->[0] -1; + push @intergenics,\@intergenic; + $last_end = $genes_aaref->[0]->[1]; + } + + + for(my $i=1;$i< @$genes_aaref;$i++){ + + #print "gene $i ".join("\t",@{$genes_aaref->[$i]})."\n"; + my @intergenic; + if($genes_aaref->[$i]->[0] > $last_end+1){ + $intergenic[0] = $last_end+1; + $intergenic[1] = $genes_aaref->[$i]->[0] -1; + push @intergenics,\@intergenic; + } + + if($genes_aaref->[$i]->[1] > $last_end){ + $last_end = $genes_aaref->[$i]->[1]; + } + + } + + if($last_end < $region_length){ + my @intergenic; + $intergenic[0] = $last_end+1; + $intergenic[1] = - $region_length; # use -ve to flag chromosome end + push @intergenics,\@intergenic; + } + + @sql = (); + foreach my $aref (@intergenics){ + my $q = "insert into $temp1 values($id,"; + $q .= join(',',@$aref); + $q .= ")"; + push @sql,$q; + + } + _run_sql($dbh,@sql) or throw "Error running sql"; + + } + + # now we create gene-distal regions by shortening the regions in $temp1 + _intergenic_variant($dbh,$temp1,2500); + _intergenic_variant($dbh,$temp1,5000); + _intergenic_variant($dbh,$temp1,10000); + + _run_sql($dbh,"drop table $temp1") or throw "cannot drop $temp1 table"; + +} + + +sub _intergenic_variant{ + my($dbh,$temp1,$dist) = @_; + + my $final_table = 'intergenic_'.$dist; + my @sql; + push @sql,"drop table if exists $final_table"; + my $temp2 = "tmp_".int(rand(1000)); + push @sql,"drop table if exists $temp2"; + push @sql,"create table $temp2 select seq_region_id,feature_start,feature_end from $temp1 where 1=0"; + # only add/subtract $dist if we are not at the chromosome end + push @sql,"insert into $temp2 select seq_region_id,if(feature_start> 0,feature_start +$dist,feature_start * -1) as feature_start,if(feature_end > 0,feature_end - $dist, feature_end * -1) as feature_end from $temp1"; + + push @sql,"delete from $temp2 where feature_end <= feature_start"; + push @sql,"create table $final_table select t2.seq_region_id,sr.name as seq_region_name,'$final_table' as feature_type,t2.feature_start,t2.feature_end,'0' as feature_strand from $temp2 t2, seq_region sr where t2.seq_region_id = sr.seq_region_id"; + push @sql,"alter table $final_table add column feature_id int(10) not null auto_increment primary key"; + + push @sql,_col_types_and_indices($final_table); + + _run_sql($dbh,@sql) or throw "cannot run sql"; + _run_sql($dbh,"drop table $temp2") or throw "cannot drop $temp2 table"; + +} + + +sub _col_types_and_indices{ + my $table = shift; + + my @sql; + + push @sql,"alter table $table modify column feature_start int(10) unsigned"; + push @sql,"alter table $table modify column feature_end int(10) unsigned"; + push @sql,"alter table $table modify column feature_strand tinyint(2)"; + push @sql,"alter table $table modify column feature_type varchar(45)"; + push @sql,"alter table $table add index(seq_region_id)"; + push @sql,"alter table $table add index(seq_region_name)"; + push @sql,"alter table $table add index(feature_strand)"; + push @sql,"alter table $table add index(feature_start)"; + push @sql,"alter table $table add index(feature_end)"; + return @sql; +} + +sub _process_regulatory_features { + my ($dnadb, $efgdb, $workdb, $cell_name) = @_; + + _copy_with_rename($efgdb->db_handle,'seq_region',$workdb->db_handle,'func_seq_region'); + + my @sql = (); + + push @sql, "drop table if exists seq_name_lookup"; + push @sql, "create table seq_name_lookup select distinct seq_region_id,name as seq_region_name from func_seq_region"; + push @sql, "alter table seq_name_lookup add index(seq_region_id)"; + push @sql, "alter table seq_name_lookup add index(seq_region_name)"; + _run_sql($workdb,@sql) or throw "cannot create seq_name_lookup"; + + #add seq_region_name to regulatory_features + @sql = (); + my $temp1 = "tmp_".int(rand(1000)); + push @sql, "create table $temp1 select r.*,s.seq_region_name from regulatory_feature r,seq_name_lookup s where r.seq_region_id = s.seq_region_id"; + push @sql, "drop table regulatory_feature"; + push @sql, "alter table $temp1 rename as regulatory_feature"; + push @sql, "alter table regulatory_feature add index(feature_set_id)"; + _run_sql($workdb,@sql) or throw "cannot update regulatory_feature table"; + + @sql = (); + # remove all but the current set of features + my $feature_set_name ='RegulatoryFeatures'; + if($cell_name){ + $feature_set_name .= ':'.$cell_name; + } + + my $kh = "select feature_set_id from feature_set where name = '$feature_set_name'"; + my $feature_set_id = $workdb->db_handle->selectrow_array($kh); + if($feature_set_id == 0){ throw "Could not find feature_set_id for $feature_set_name"; } + + push @sql, "delete from regulatory_feature where feature_set_id != $feature_set_id"; + push @sql, "alter table regulatory_feature add index(regulatory_feature_id)"; + push @sql, "alter table regulatory_feature add index(seq_region_start)"; + push @sql, "alter table regulatory_feature add index(seq_region_end)"; + push @sql, "alter table regulatory_feature add index(seq_region_name)"; + push @sql, "alter table regulatory_feature add index(feature_type_id)"; + _run_sql($workdb,@sql) or throw "could not delete irrelevant regulatory features"; + + # we remove features which have very long 'whiskers' + # these tend to contain multiple focus features which results in + # a group of regulatory features all of which have the same attributes + @sql = (); + my $filtered_features_table = 'regulatory_features_filtered'; + push @sql,"drop table if exists $filtered_features_table"; + push @sql,"create table $filtered_features_table select *, 'regulatory_feature' as feature_type from regulatory_feature"; + push @sql, "alter table $filtered_features_table add index(seq_region_name)"; + push @sql, "alter table $filtered_features_table add index(feature_type_id)"; + push @sql, "alter table $filtered_features_table add index(feature_set_id)"; + push @sql, "alter table $filtered_features_table add index(regulatory_feature_id)"; + push @sql, "alter table $filtered_features_table add index(seq_region_start)"; + push @sql, "alter table $filtered_features_table add index(seq_region_end)"; + + my $temp2 = "tmp_".int(rand(1000)); + my $temp3 = "tmp_".int(rand(1000)); + + push @sql,"create table $temp3 select ra.regulatory_feature_id , af.* from regulatory_attribute ra, annotated_feature af where ra.attribute_feature_id = af.annotated_feature_id"; + + push @sql,"create table $temp2 select j.regulatory_feature_id,count(*) as n_attribs,count(distinct j.feature_set_id) as n_attrib_types,max(j.seq_region_end)-min(j.seq_region_start) as len,min(j.seq_region_start)as attribs_start,max(j.seq_region_end) as attribs_end,rf.binary_string from $temp3 j, $filtered_features_table rf where j.regulatory_feature_id=rf.regulatory_feature_id group by regulatory_feature_id "; + push @sql,"alter table $temp2 add index(regulatory_feature_id)"; + push @sql,"alter table $temp2 add index(attribs_start)"; + push @sql,"alter table $temp2 add index(attribs_end)"; + + push @sql,"delete r from $filtered_features_table r, $temp2 t2 where r.regulatory_feature_id = t2.regulatory_feature_id and t2.len > 5000"; + + push @sql, "alter table $filtered_features_table add index(seq_region_name)"; + push @sql, "alter table $filtered_features_table add index(feature_type_id)"; + push @sql, "alter table $filtered_features_table add index(feature_set_id)"; + push @sql, "alter table $filtered_features_table add index(regulatory_feature_id)"; + push @sql, "alter table $filtered_features_table add index(seq_region_start)"; + push @sql, "alter table $filtered_features_table add index(seq_region_end)"; + push @sql, "alter table $filtered_features_table add index(binary_string)"; + # we want to remove features which occur in the same place and have the same + # attributes as one another, leaving only one representative + push @sql, "delete b from $filtered_features_table a, $filtered_features_table b, $temp2 t2a, $temp2 t2b where a.seq_region_name=b.seq_region_name and b.binary_string = a.binary_string and a.regulatory_feature_id < b.regulatory_feature_id and a.regulatory_feature_id = t2a.regulatory_feature_id and b.regulatory_feature_id = t2b.regulatory_feature_id and t2a.attribs_start=t2b.attribs_start and t2a.attribs_end = t2b.attribs_end"; + + push @sql,"drop table if exists $temp2"; + + _run_sql($workdb,@sql) or throw "cannot run sql"; + + # reg features which overlap centromeric repeats tend to contain too many marks# which suggests there is something wrong with the mappings. + # so we get rid of them + + @sql = (); + push @sql,"drop table if exists $temp3"; + + push @sql,"create table $temp3 select f.regulatory_feature_id from $filtered_features_table f, Satellite_centr_repeat s where s.seq_region_name = f.seq_region_name and s.feature_end >= f.seq_region_start and s.feature_start <= f.seq_region_end"; + push @sql,"delete f from $filtered_features_table f,$temp3 t where f.regulatory_feature_id = t.regulatory_feature_id"; + + + push @sql,"drop table if exists $temp3"; + + push @sql,"create table $temp3 select f.regulatory_feature_id from $filtered_features_table f, Satellite_repeat s where s.seq_region_name = f.seq_region_name and s.feature_end >= f.seq_region_start and s.feature_start <= f.seq_region_end"; + push @sql,"delete f from $filtered_features_table f,$temp3 t where f.regulatory_feature_id = t.regulatory_feature_id"; + + push @sql,"drop table if exists $temp3"; + + # reg feats on the mitochondrial DNA are unlikely to use the same + # 'histone code' as the rest of the genome. they might even be artefacts. + # so we remove them (this should not be necessary as there should not be features on MT.. + push @sql, "delete from $filtered_features_table where seq_region_name = 'MT'"; + _run_sql($workdb,@sql) or throw "cannot run sql"; + + @sql=(); + # we make the col names compatible with the genomic features in the analysis + push @sql,"alter table $filtered_features_table change column seq_region_start feature_start int(10) unsigned"; + push @sql,"alter table $filtered_features_table change column seq_region_end feature_end int(10) unsigned"; + push @sql,"alter table $filtered_features_table change column seq_region_strand feature_strand tinyint(2)"; + push @sql,"alter table $filtered_features_table modify column feature_type varchar(45)"; + + _run_sql($workdb,@sql) or throw "cannot run sql"; + +} + +sub _generate_random_frags { + my ($workdb) = @_; + + my $originals_table = "regulatory_features_filtered"; + my $outtable = "mockreg_features_filtered"; + + my @sql; + + push @sql,"drop table if exists banned_region_table"; + push @sql,"create table banned_region_table ( seq_region_name varchar(10), feature_start int(11), feature_end int(11))"; + my $dbh = $workdb->db_handle; + _run_sql($dbh,@sql) or throw "Cannot run sql"; + + @sql = (); + + # first we need the lengths of the seq regions so we know the max + # tfrag_enc_end value we can generate + my %enc_len; + my $q = "select distinct seq_region_name from $originals_table"; + my $col_ref = $dbh->selectcol_arrayref($q) or throw $dbh->errstr; + my $clause = " where name in('"; + $clause .= join("','",@$col_ref); + $clause .= "')"; + + $q="select name, length from seq_region" .$clause; + my $aaref = $dbh->selectall_arrayref($q) or throw $dbh->errstr; + foreach my $aref (@$aaref){ + $enc_len{$aref->[0]} = $aref->[1]; + } + + my $tmp_table = "tmp_".int(rand(1000)); + push @sql, "drop table if exists $tmp_table"; + push @sql, "create table $tmp_table (regulatory_feature_id int(10) unsigned,seq_region_name char(10),feature_start int(10) unsigned,feature_end int(10) unsigned,feature_strand tinyint(2),feature_type varchar(40))"; + + # next we need the seq_region and length of each original feature + my %frag_len; + $q = "select seq_region_name, feature_end - feature_start +1 as length,feature_type,feature_strand,regulatory_feature_id from $originals_table order by (feature_end - feature_start +1) desc "; + $aaref = $dbh->selectall_arrayref($q) or throw "failed on \n$q\n".$dbh->errstr; + + foreach my $aref (@$aaref){ + my $enc = $aref->[0]; + my $len = $aref->[1]; + my $type = $aref->[2]; + my $ori = $aref->[3]; + my $id = $aref->[4]; + + my($start,$end) = _get_rand_region($dbh,$enc,$len,%enc_len); + + my $q = "insert into $tmp_table values("; + $q .= $id.","; + $q .= "\'".$enc."\'," ; + $q .= $start.","; + $q .= $end.","; + $q .= "\'".$ori."\'," ; + $q .= "\'".$type."\'" ; + $q .= ")"; + + push @sql,$q; + + #Add frag to the banned list (so it won't be generated again)... + my $frag_q = "insert into banned_region_table values('$enc',$start,$end) "; + _run_sql($dbh,$frag_q) or throw("failed on \n$q\n".$dbh->errstr); + + } + + push @sql, "drop table if exists $outtable"; + push @sql, "create table $outtable select t.*,o.binary_string from $tmp_table t, $originals_table o where t.regulatory_feature_id = o.regulatory_feature_id"; # select f.seq_region_name, f.feature_type,f.feature_start,f.feature_end ,f.feature_strand from temp f"; + + push @sql, "alter table $outtable add index(regulatory_feature_id)"; + push @sql, "alter table $outtable add index(seq_region_name)"; + push @sql, "alter table $outtable add index(feature_start)"; + push @sql, "alter table $outtable add index(feature_type)"; + push @sql, "alter table $outtable add index(feature_strand)"; + + push @sql, "drop table if exists banned_region_table"; + push @sql, "drop table if exists $tmp_table"; + + _run_sql($dbh,@sql) or throw "Cannot run sql"; + + +} + +sub _calculate_associations{ + my ($dbc, $dumpdir, $gen_feats, $species) = @_; + + #Pass these as parameters? + my $reg_feat_table = 'regulatory_features_filtered'; + my $mock_reg_table = 'mockreg_features_filtered'; + my $gen_feat_table = 'genomic_features'; + my $overlaps_table = 'reg_feat_gen_feat_overlaps'; + my $mock_olaps_table = 'mock_feat_gen_feat_overlaps'; + my $flags_table = 'regulatory_feature_association_flags'; + my $types_table = 'regulatory_features_classified'; + + my $dbh = $dbc->db_handle; + + system("mkdir -p ".$dumpdir) && throw "Error creating $dumpdir folder"; + + #@gen_feats is the list of feature classes relevant for classification... + _create_gen_feats_table($dbh,$gen_feats,$gen_feat_table); + + + # open a file for writing the results + my $reg_results_file = $dumpdir.'/reg_results.tab'; + my $mock_results_file = $dumpdir.'/mock_results.tab'; + + my $reg_ofh; + my $mock_ofh; + open($reg_ofh,"> $reg_results_file") or die "couldn't open $reg_results_file"; + open($mock_ofh,"> $mock_results_file") or die "couldn't open $mock_results_file"; + # filename for temp storage of genomic features + my $gen_file = $dumpdir.'/genomic_features'; + + my $q = "select distinct seq_region_name from $gen_feat_table"; + my $chrs_ref = $dbh->selectcol_arrayref($q); + foreach my $chr (@$chrs_ref){ + + my $command = "mysql -h ".$dbc->host." -u ".$dbc->username." -P ".$dbc->port." -p".$dbc->password." ".$dbc->dbname.' -BN -e"'. + "select feature_type_id,feature_start,feature_end from genomic_features where seq_region_name = '$chr' order by feature_start,feature_end".'"'. + " > $gen_file"; + system($command) && throw "Error dumping genomics features for $chr"; + + _overlap_analysis($dbh,$gen_file,$chr,$reg_feat_table,$reg_ofh,scalar(@{$gen_feats})); + _overlap_analysis($dbh,$gen_file,$chr,$mock_reg_table,$mock_ofh,scalar(@{$gen_feats})); + } + close($reg_ofh); + close($mock_ofh); + + _load_results($dbh,$reg_results_file,$overlaps_table,$reg_feat_table,$gen_feats); + _load_results($dbh,$mock_results_file,$mock_olaps_table,$mock_reg_table,$gen_feats); + + my $pat_bits = $dbh->selectrow_array("select length(binary_string) from regulatory_features_filtered limit 1"); + my @pats = _sub_pats($dbh,$reg_feat_table,$pat_bits); + + my $patt_count_thresh = 100; + + my $overlap_results_file = $dumpdir.'/overlap_results.tab'; + my $overlap_ofh; + open($overlap_ofh,"> $overlap_results_file") or throw "couldn't open $overlap_results_file"; + #for each pattern we want the number of reg feats with that pattern + # and then for each genomic feature we want the overlap counts for the + # real and mock reg feats with that pattern + my($reg_count,$real_count,$mock_count); + foreach my $pat (@pats){ + $reg_count = $dbh->selectrow_array("select count(1) from $overlaps_table where pattern like '$pat'"); + if($reg_count >= $patt_count_thresh){ + print $overlap_ofh "$pat\t$reg_count"; + foreach my $gen (@{$gen_feats}){ + $real_count = $dbh->selectrow_array("select count(1) from $overlaps_table where pattern like '$pat' and $gen"); + $mock_count = $dbh->selectrow_array("select count(1) from $mock_olaps_table where pattern like '$pat' and $gen"); + print $overlap_ofh "\t$real_count\t$mock_count"; + } + print $overlap_ofh "\n" or throw "failed to print to file"; + } + } + close($overlap_ofh); + + _create_pattern_results_tables($dbh, $overlap_results_file,$gen_feats); + + my $assoc_thresh = 51; #70; + my $second_thresh = 50; + _association_tables($dbh,$gen_feats,'pattern_overlap_summary','pattern_overlap_summary_chi',$assoc_thresh,$second_thresh); + + _create_flags_table($dbh,$flags_table,$gen_feats); + + _create_types_table($dbh,$flags_table,$types_table,$species); + + system("rm -r -f ".$dumpdir) && throw "Error deleting $dumpdir folder"; + + +} + +sub _create_gen_feats_table{ + my($dbh,$gen_feats_aref,$table)=@_; + + my @sql; + # create a temporary table of gen_feat,id + my $temp1 = "tmp_".int(rand(100))."_1";; + push @sql,"drop table if exists $temp1"; + push @sql,"create table $temp1 (feature_type varchar(60),feature_type_id int(10) unsigned)"; + for(my $i=0;$i < @$gen_feats_aref;$i++){ + push @sql, "insert into $temp1 values( '".$gen_feats_aref->[$i]."','".$i."')"; + } + push @sql,"alter table $temp1 add index(feature_type)"; + + # concatenate all the gen feats into one table + my $temp2 = "tmp_".int(rand(100))."_2";; + push @sql,"drop table if exists $temp2"; + push @sql, "create table $temp2 (feature_type varchar(60), seq_region_name varchar(40), feature_strand tinyint(2), feature_start int(10) unsigned, feature_end int(10) unsigned)"; + foreach my $feat (@$gen_feats_aref){ + push @sql, "insert into $temp2 select distinct feature_type, seq_region_name, feature_strand, feature_start, feature_end from $feat"; + } + + push @sql,"alter table $temp2 add index(feature_type)"; + + push @sql,"drop table if exists $table"; + push @sql, "create table $table select t2.*,t1.feature_type_id from $temp1 t1, $temp2 t2 where t1.feature_type = t2.feature_type"; + push @sql,"alter table $table add index(seq_region_name)"; + + push @sql,"drop table if exists $temp1"; + push @sql,"drop table if exists $temp2"; + + _run_sql($dbh,@sql) or throw "Error running sql"; + + +} + + +sub _overlap_analysis{ + my($dbh,$gen_file,$chr,$reg_feat_table,$ofh,$columns)=@_; + + my $zeros = '0' x $columns; + my @zero = split('',$zeros); + + my $q = "select regulatory_feature_id,feature_start,feature_end from $reg_feat_table where seq_region_name = '$chr' order by feature_start,feature_end"; + my $reg_aaref = $dbh->selectall_arrayref($q); + unless(defined $reg_aaref){ throw "failed on \n $q\n".$dbh->errstr; } + + # regulatory features may not be present on all chrs + if($reg_aaref eq NO_ROWS){return} + #if($reg_aaref == 0 ){return} + + # create a hash to store the overlap flags + my %flags; + foreach my $reg_ref (@$reg_aaref){ + my @zero = split('',$zeros); + $flags{$reg_ref->[0]} = \@zero; + } + + open(IN,$gen_file) or throw "failed to open $gen_file"; + + while(){ + chop; + my($type,$start,$end) = split("\t",$_); + + foreach my $reg_ref (@$reg_aaref){ + + # if the reg and gen feats overlap set the flag for that gen feat + if($reg_ref->[1] <= $end && $reg_ref->[2] >= $start){ + $flags{$reg_ref->[0]}->[$type] += 1; + } + + # trim the regulatory feature array + # remove any reg feat whose end < this gen feat start + if($reg_ref->[2] < $start){shift(@$reg_aaref)} + + # if the start of the reg feat is > end of the gen feat + # we can move onto the next gen feat + if($reg_ref->[1] > $end){last} + } + + } + + while(my($id,$aref)=each(%flags)){ + print $ofh $id."\t".join("\t",@$aref)."\n"; + } + +} + + +sub _load_results{ + my($dbh,$results_file,$overlaps_table,$reg_feat_table,$gen_feat_ref)=@_; + + my @sql; + + #create the summary table + push @sql,"drop table if exists $overlaps_table"; + my $q = "create table $overlaps_table (regulatory_feature_id int(10) unsigned, "; + $q .= join(' int(1) unsigned, ',@$gen_feat_ref); + $q .= " int(1) unsigned)"; + push @sql, $q; + + push @sql, "load data local infile '$results_file' into table $overlaps_table"; + push @sql, "alter table $overlaps_table add index(regulatory_feature_id)"; + + my $temp1 = "tmp_".int(rand(100)); + push @sql,"drop table if exists $temp1"; + + push @sql,"create table $temp1 select o.*,r.binary_string as pattern from $reg_feat_table r, $overlaps_table o where o.regulatory_feature_id = r.regulatory_feature_id"; + push @sql,"drop table $overlaps_table"; + push @sql,"alter table $temp1 rename as $overlaps_table "; + + _run_sql($dbh,@sql) or throw "cannot run sql"; +} + + +sub _sub_pats{ + my($dbh,$reg_feat_table,$pat_bits)=@_; + + my $aref = $dbh->selectcol_arrayref("select binary_string from regulatory_features_filtered group by binary_string having count(*) > 1 "); + unless(defined $aref && @$aref >0){ die "no binary strings found with query :- \n select binary_string from regulatory_features_filtered group by binary_string having count(*) > 1"} + + my %pats; + foreach my $orig (@$aref){ + my $zero_count = $orig =~ tr/0/_/; + #if($zero_count >= $pat_bits-2){next}#dont want pats with only one mark + # following the introduction of projected builds and the use of TFBS + # as focus features, patterns with only one mark are not necessarily + # just a focus feature. we therefore want to analyse them + if($zero_count == $pat_bits){next} + if($zero_count == $pat_bits - 1){ # just one bit set + $pats{$orig} = 1; + next; + } + + my @ch = split('',$orig); + # we create the set of all patterns which comprise the original but + # with one of the set bits made into '_' + for(my $i = 0;$i<@ch;$i++){ + if($ch[$i] eq '1'){ + $ch[$i]='_'; + $pats{(join('',@ch))} = 1; + $ch[$i] = 1; + + } + $pats{$orig} = 1; + } + } + + return keys(%pats); + +} + +sub _create_pattern_results_tables{ + my($dbh, $overlap_results_file,$gen_feats_ref)=@_; + + my @sql; + push @sql,"drop table if exists raw_overlap_results"; + + my $bitsize=255; + my $q = "create table raw_overlap_results (pattern varchar(${bitsize}),total_reg_feats int(10)"; + foreach my $gen (@$gen_feats_ref){ + $q .= ",$gen"."_real int(10), $gen"."_mock int(10)"; + } + $q .= ")"; + push @sql,$q; + #Similar as mysqlimport... + push @sql, "load data local infile '$overlap_results_file' into table raw_overlap_results"; + + push @sql,"drop table if exists pattern_overlap_summary"; + push @sql,"drop table if exists pattern_overlap_summary_chi"; + $q = "create table pattern_overlap_summary (pattern varchar(${bitsize})"; + my $q1 = "create table pattern_overlap_summary_chi (pattern varchar(${bitsize})"; + my $q2 = "insert into pattern_overlap_summary select pattern"; + my $q3 = "insert into pattern_overlap_summary_chi select pattern"; + foreach my $gen (@$gen_feats_ref){ + # using int(10) to remove non integer part of %age + $q .= ",$gen int(10)"; + $q1 .= ",$gen float"; + $q2 .= ",100* $gen".'_real/total_reg_feats '."as $gen"; + #$q3 .= ",($gen"."_real- $gen"."_mock)*($gen"."_real- $gen"."_mock)/ $gen"."_mock as $gen" + $q3 .= ",($gen"."_real- $gen"."_mock)*($gen"."_real- $gen"."_mock)/ if($gen"."_mock,$gen"."_mock,1) as $gen" + } + $q .= ")"; + $q1 .= ")"; + $q2 .= " from raw_overlap_results"; + $q3 .= " from raw_overlap_results"; + push @sql,$q; + push @sql,$q1; + push @sql,$q2; + push @sql,$q3; + push @sql,"alter table pattern_overlap_summary_chi add index(pattern)"; + push @sql,"alter table pattern_overlap_summary add index(pattern)"; + + _run_sql($dbh,@sql) or die; + +} + +sub _association_tables{ + my($dbh, $gen_feat_ref,$summary_table,$chi_table,$assoc_thresh,$second_thresh)=@_; + + + foreach my $feat (@$gen_feat_ref){ + # scan for initial associations + my $q = "select perc.pattern, perc.$feat from $summary_table perc, $chi_table chi where perc.$feat >= $assoc_thresh and chi.$feat > 8 and perc.pattern = chi.pattern"; + + my $aaref = $dbh->selectall_arrayref($q); + unless(defined $aaref){ throw "failed on \n$q\n".$dbh->errstr} + foreach my $aref (@$aaref){ + push @$aref,$feat; + } + + my @sql; + my $assoc_table = $feat."_assoc"; + my $not_assoc_table = $feat."_not_assoc"; + my $zero_one_table = $feat."_0_1"; + push @sql,"drop table if exists $assoc_table"; + push @sql,"create table $assoc_table select * from $summary_table where 1 = 0"; + + push @sql,"drop table if exists $not_assoc_table"; + push @sql,"create table $not_assoc_table select * from $summary_table where 1 = 0"; + push @sql,"drop table if exists $zero_one_table"; + push @sql,"create table $zero_one_table select pattern from $summary_table where 1 = 0"; + _run_sql($dbh,@sql) or throw "error running sql"; + + foreach my $aref (@$aaref){ + my $pattern = $aref->[0]; + + _run_sql($dbh, "insert into $assoc_table select * from $summary_table where pattern like '$pattern'") or throw "error running sql"; + + + # if none of the patterns have percent < $second_thresh then the original pattern + # is OK and it stays in the table. we remove all the + # other patterns which have percent < $assoc_thresh + $q = "select pattern from $assoc_table where $feat < $second_thresh"; + my $not_ref = $dbh->selectcol_arrayref($q); + die $q."\n".$dbh->errstr unless(defined $not_ref); + + if(@$not_ref > 0){ + foreach my $pat (@$not_ref){ + _run_sql($dbh,"insert into $not_assoc_table select * from $assoc_table where pattern = '$pat'") or throw "error running sql"; + _run_sql($dbh,"delete from $assoc_table where pattern = '$pat'") or throw "error running sql"; + + my $zero_one = _add_nots($pattern,$pat); + _run_sql($dbh,"insert into $zero_one_table values('$zero_one')") or throw "error running sql"; + } + } + + + @sql=(); + push @sql,"drop table if exists temp_$$"; + push @sql,"create table temp_$$ select distinct * from $assoc_table"; + push @sql,"drop table $assoc_table"; + push @sql,"alter table temp_$$ rename as $assoc_table"; + push @sql,"alter table $assoc_table add index(pattern)"; + _run_sql($dbh,@sql); + } + + @sql = (); + # the assoc table currently contains patterns which lie between + # the assoc_thresh and the second_thresh + push @sql,"delete from $assoc_table where $feat < $assoc_thresh"; + # distinct assoc table + # **** and add back the two gene bits to the patterns **** + push @sql,"drop table if exists temp_$$"; + push @sql,"create table temp_$$ select distinct * from $assoc_table"; + push @sql,"drop table $assoc_table"; + push @sql,"alter table temp_$$ rename as $assoc_table"; + #push @sql,"update $assoc_table set pattern = concat(pattern,'__')"; + push @sql,"alter table $assoc_table add index(pattern)"; + + # distinct not_assoc table + # **** and add the two gene bits to the patterns **** + # distinct not_assoc table + # **** and add the two gene bits to the patterns **** + push @sql,"drop table if exists temp_$$"; + push @sql,"create table temp_$$ select distinct * from $not_assoc_table"; + push @sql,"drop table $not_assoc_table"; + push @sql,"alter table temp_$$ rename as $not_assoc_table"; + #push @sql,"update $not_assoc_table set pattern = concat(pattern,'__')"; + push @sql,"alter table $assoc_table add index(pattern)"; + + _run_sql($dbh,@sql) or die; + } + +} + +sub _add_nots{ + my($pat,$not) = @_; + + my @pat = split('',$pat); + my @not = split('',$not); + + for(my $i = 0;$i<@pat;$i++){ + if($pat[$i] ne '1' && $not[$i] eq '1'){ + $pat[$i] = 0; + } + } + return join('',@pat); +} + +sub _create_flags_table{ + my($dbh,$flags_table,$gen_feat_ref)=@_; + + + my @sql; + push @sql,"drop table if exists $flags_table"; + push @sql, "create table $flags_table select regulatory_feature_id,binary_string from regulatory_feature"; + push @sql, "alter table $flags_table add index(binary_string)"; + + _run_sql($dbh,@sql) or throw "error running sql"; + + + @sql = (); + foreach my $feat (@$gen_feat_ref){ + + my $assoc_table = $feat."_assoc"; + my $not_assoc_table = $feat."_not_assoc"; + + #what do we want here? check if the assoc table is empty or see if it exists? + # hack covering specific cases where there are no associations!!! + unless( ($feat eq 'intergenic_2500') + || ($feat eq 'protein_coding_gene_body') + || ($feat eq 'protein_coding_exon1_plus_enhancer') + || ($feat eq 'protein_coding_intron1') + || ($feat eq 'tss_centred_5000') + || ($feat eq 'PolIII_transcribed_gene_plus_enhancer') + || ($dbh->selectrow_array("select count(*) from $assoc_table") > 0)){ + next; + } + + push @sql, "alter table $flags_table add column $feat int(1) default 0"; + + my $q = "select pattern from $assoc_table"; + my $pat_ref = $dbh->selectcol_arrayref($q); + unless(defined $pat_ref){die "failed on \n$q\n".$dbh->errstr} + foreach my $pat (@$pat_ref){ + push @sql, "update $flags_table set $feat = 1 where binary_string like '$pat'"; + + } + + $q = "select pattern from $not_assoc_table"; + $pat_ref = $dbh->selectcol_arrayref($q); + unless(defined $pat_ref){die "failed on \n$q\n".$dbh->errstr} + foreach my $pat (@$pat_ref){ + push @sql, "update $flags_table set $feat = 0 where binary_string like '$pat'"; + } + } + + # hack covering specific cases where there are no associations!!! +# unless(_column_exists($dbh,$flags_table,'intergenic_2500')){ +# push @sql, "alter table $flags_table add column intergenic_2500 int(1) default 0"; +# } +# unless(_column_exists($dbh,$flags_table,'protein_coding_gene_body')){ +# push @sql, "alter table $flags_table add column protein_coding_gene_body int(1) default 0"; +# } +# unless(_column_exists($dbh,$flags_table,'protein_coding_exon1_plus_enhancer')){ +# push @sql, "alter table $flags_table add column protein_coding_exon1_plus_enhancer int(1) default 0"; +# } +# unless(_column_exists($dbh,$flags_table,'protein_coding_intron1')){ +# push @sql, "alter table $flags_table add column protein_coding_intron1 int(1) default 0"; +# } +# unless(_column_exists($dbh,$flags_table,'tss_centred_5000')){ +# push @sql, "alter table $flags_table add column tss_centred_5000 int(1) default 0"; +# } +# unless(_column_exists($dbh,$flags_table,'PolIII_transcribed_gene_plus_enhancer')){ +# push @sql, "alter table $flags_table add column PolIII_transcribed_gene_plus_enhancer int(1) default 0"; +# } + + _run_sql($dbh,@sql) or throw "error running sql"; + +} + + +sub _column_exists { + + my $dbh = shift; + my $table = shift; + my $col = shift; + + my $query = "select $col from $table limit 1"; + my $sth; + + unless($sth=$dbh->prepare($query)){ + throw("ERROR: preparation of statement $query failed"); + } + + unless($sth->execute){ + $sth->finish; + return(0); + } + + $sth->finish; + return(1); + +} + +sub _create_types_table{ + my($dbh,$flags_table,$types_table,$species)=@_; + + my @sql; + + push @sql, "drop table if exists $types_table"; + + # The same for all species... + #if($species eq 'homo_sapiens'){ + push @sql, "create table $types_table select regulatory_feature_id,binary_string,0 as cell_type_specific,tss_centred_5000 as promoter_associated,protein_coding_gene_body as gene_associated, intergenic_2500 as non_gene_associated,0 as unclassified, PolIII_transcribed_gene_plus_enhancer as poliii_transcription_associated from $flags_table"; + #}else{ + # push @sql, "create table $types_table select regulatory_feature_id,binary_string,0 as cell_type_specific,if(protein_coding_exon1_plus_enhancer+protein_coding_intron1 > 0, 1,0) as promoter_associated,protein_coding_gene_body as gene_associated, intergenic_2500 as non_gene_associated,0 as unclassified from $flags_table"; + #} + + # apply arbitrary rules to resolve conflicts + push @sql, "update $types_table set promoter_associated = 0 where promoter_associated and gene_associated"; # as used for v58 + #push @sql, "update $types_table set gene_associated = 0 where promoter_associated and gene_associated"; # + + push @sql, "update $types_table set gene_associated = 0 where poliii_transcription_associated and gene_associated"; + push @sql, "update $types_table set non_gene_associated = 0 where poliii_transcription_associated and non_gene_associated"; + push @sql, "update $types_table set poliii_transcription_associated = 0 where poliii_transcription_associated and promoter_associated"; + + # use unclassified col to flag conflicts + push @sql, "update $types_table set unclassified = 1 where promoter_associated and non_gene_associated"; + push @sql, "update $types_table set unclassified = 1 where gene_associated and non_gene_associated"; + # set both of the conflicting cols to 0 where there is a conflict + push @sql, "update $types_table set promoter_associated = 0 where unclassified"; + push @sql, "update $types_table set gene_associated = 0 where unclassified"; push @sql, "update $types_table set non_gene_associated = 0 where unclassified"; + + # set unclassified for rows with no flags + push @sql, "update $types_table set unclassified = 1 where gene_associated + non_gene_associated + promoter_associated+poliii_transcription_associated = 0"; + + _run_sql($dbh,@sql) or throw "error running sql"; + + # add the feature_type_id column + + _run_sql($dbh, "alter table $types_table add column feature_type_id int(10) unsigned") or throw "could not add feature_type_id column in $types_table"; + # set the type_id + # now that we have moved to single cell line classification there are + # no Cell type specific classifications + @sql = (); + foreach my $ft ('Gene Associated', + 'Non-Gene Associated', + 'Promoter Associated', + 'Unclassified', + 'PolIII Transcription Associated', + ){ + + my $ftid = $dbh->selectrow_array("select feature_type_id from feature_type where name = '$ft' and class = 'Regulatory Feature'"); + if(!defined($ftid) || ($ftid <1)){ throw "feature_type_id for $ft does not exist or value is less than 1"} + my $cts = 0; + my $col; + if($ft =~ 'specific'){ + $cts = 1; + ($col) = $ft =~ /(.+ *.+) -.*/; + $col = lc($col); + $col =~ tr/ /_/; + $col =~ tr/-/_/; + }else{ + $col = lc($ft); + $col =~ tr/ /_/; + $col =~ tr/-/_/; + } + print "ft $ft col $col\n"; + push @sql, "update $types_table set feature_type_id = $ftid where cell_type_specific = $cts and $col = 1"; + + } + _run_sql($dbh,@sql) or throw "cannot run sql"; + + # summary report + # remove and not cell_type_specific... + my $res = $dbh->selectrow_array(" select count(*) from regulatory_features_classified where promoter_associated and not cell_type_specific "); + _commentary("promoter_associated $res\n"); + $res = $dbh->selectrow_array(" select count(*) from regulatory_features_classified where gene_associated and not cell_type_specific"); + _commentary("gene_associated $res\n"); + $res = $dbh->selectrow_array(" select count(*) from regulatory_features_classified where non_gene_associated and not cell_type_specific"); + _commentary("non_gene_associated $res\n"); + $res = $dbh->selectrow_array(" select count(*) from regulatory_features_classified where poliii_transcription_associated and not cell_type_specific"); + _commentary("PolIII_transcription_associated $res\n"); + + $res = $dbh->selectrow_array(" select count(*) from regulatory_features_classified where unclassified and not cell_type_specific"); + _commentary("unclassified $res\n"); + + _qc($dbh,); + + + +} + + +sub _qc{ + my($dbh) = @_; + + my @sql; + #goto GENE_ASSOC; + push @sql, "drop table if exists promoter_associated_temp"; + push @sql, "create table promoter_associated_temp select rfc.regulatory_feature_id, seq_region_name,seq_region_start,seq_region_end,rfc.binary_string from regulatory_feature rf, regulatory_features_classified rfc where rfc.promoter_associated and rfc.regulatory_feature_id =rf.regulatory_feature_id order by seq_region_name;"; + push @sql, "alter table promoter_associated_temp add index(seq_region_name)"; + push @sql, "drop table if exists promoter_features"; + push @sql, "create table promoter_features select * from protein_coding_exon1_plus_enhancer "; + #push @sql, "insert into promoter_features select * from protein_coding_intron1"; + push @sql, "insert into promoter_features select * from RNA_gene_exon1_plus_enhancer"; + push @sql, " alter table promoter_features add index(seq_region_name)"; + + _run_sql($dbh,@sql) or throw "cannot run sql"; + + my $res; + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, promoter_features e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("promoter_associated features\n"); + _commentary("$res overlap an exon1_plus_2.5kb (both RNA and prot_cod)\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, protein_coding_intron1 e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("$res overlap a protein coding intron1\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, processed_transcript e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= if(e.feature_strand = -1 ,e.feature_end+2500,e.feature_end) and pa.seq_region_end >= if(e.feature_strand = -1 ,e.feature_start,e.feature_start - 2500)"); + _commentary("$res overlap a 'processed transcript'\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, protein_coding_gene_body e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("$res overlap a protein coding gene body\n"); + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, intergenic_2500 e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start > e.feature_start and pa.seq_region_end < e.feature_end"); + _commentary("$res are at least 2500bp from any part of a protein coding gene\n"); + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from promoter_associated_temp pa, pseudogene_exon1_plus_enhancer e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start > e.feature_start and pa.seq_region_end < e.feature_end"); + _commentary("$res overlap a pseudogene exon1_plus_2.5kb \n"); + + @sql=(); + push @sql, "drop table if exists gene_associated_temp"; + push @sql, "create table gene_associated_temp select rfc.regulatory_feature_id, seq_region_name,seq_region_start,seq_region_end from regulatory_feature rf, regulatory_features_classified rfc where rfc.gene_associated and rfc.regulatory_feature_id =rf.regulatory_feature_id order by seq_region_name"; + push @sql, "alter table gene_associated_temp add index(seq_region_name)"; + _run_sql($dbh,@sql) or throw "cannot run sql"; + + _commentary("gene_associated features\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, protein_coding_gene e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("$res overlap some part of a protein coding gene\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, RNA_gene e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("$res overlap some part of an RNA gene\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, intergenic_2500 e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start > e.feature_start and pa.seq_region_end < e.feature_end"); + _commentary("$res are at least 2500bp from any part of a protein coding gene\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, pseudogene_transcript e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("$res overlap some part of a pseudogene\n"); + + $res = $dbh->selectrow_array("select count( distinct pa.regulatory_feature_id ) from gene_associated_temp pa, processed_transcript e where pa.seq_region_name=e.seq_region_name and pa.seq_region_start <= e.feature_end and pa.seq_region_end >= e.feature_start"); + _commentary("$res overlap some part of a 'processed_transcript'\n"); +} + +sub _commentary{ + my ($comm) = @_; + print $comm; + print REPORT $comm; +} + + +# returns start and end +sub _get_rand_region{ + my ($dbh,$enc,$len,%enc_len) = @_; + + my $bad = 1; + my $end; + my $start; + my $try = 1; + while($bad){ + $try++; + if($try > 1000000){ + throw "can't find a random region for seq_region $enc, length $len\n"; + } + + # get a random number which is within the encode region + $end = int(rand($enc_len{$enc})); + + # the number must be higher than the length of the frag + if($end > $len){ + $bad = 0; + } + $start = $end - $len +1; + + if($bad == 0){ + #Avoid regions that were already taken... + my $q = "select * from banned_region_table where seq_region_name = '$enc' and $start <= feature_end and $end >= feature_start limit 1 "; + my $aaref = $dbh->selectall_arrayref($q); + unless(defined $aaref){die $dbh->errstr} + if(scalar(@$aaref) > 0){ $bad = 1; } + } + } + + return ($start,$end); + +} + + +#Help function to copy a table between databases with different names... +#Require DBI db handle objects +sub _copy_with_rename{ + my($s_dbh,$source_table,$t_dbh,$targ_table)=@_; + + my $q = "desc $source_table"; + my $desc_aaref=$s_dbh->selectall_arrayref($q); + unless(defined $desc_aaref){die "failed on:\n $q\n".$s_dbh->errstr} + + _run_sql($t_dbh,"drop table if exists $targ_table"); + $q = " create table $targ_table ("; + my $select; + foreach my $aref (@$desc_aaref){ + $q .= $aref->[0].' '.$aref->[1].','; + $select .= $aref->[0].','; + } + chop $q; #remove last comma + chop $select; + $q .= ")"; + print $q."\n"; + _run_sql($t_dbh,$q) or throw "Error creating table"; + + $q = "select $select from $source_table"; + my $aaref = $s_dbh->selectall_arrayref($q); + unless(defined $aaref){die "failed on:\n $q\n".$s_dbh->errstr} + my @sql; + foreach my $aref (@$aaref){ + $q= "insert into $targ_table values('".join("','",@$aref); + $q .= "')"; + push @sql,$q; + + } + + _run_sql($t_dbh,@sql) or throw "Error copying data"; + +} + + +# _run_sql +# +# Arg [1] : scalar database handle from DBI module +# Arg [2] : array of scalars containing lines of text in the format of SQL statements +# Function : submits each element of Arg[2] to the $dbh->prepare and +# $dbh->execute methods and checks for successful execution. +# Returns 0 on failure, 1 on success. Emits error messages. +# Returntype: int +# Exceptions: none +# Example : _run_sql($dbh, @array); + +sub _run_sql{ + my $dbh = shift; + my (@array)=@_; + + my $sth; + + foreach my $query(@array){ + + eval { + unless($sth=$dbh->prepare($query)){ + warn("Error: preparation of statement failed on: $query\n"); + warn("database error message: ".$dbh->errstr."\n"); + return(0); + } + + unless($sth->execute){ # returns true on success + warn("Error: statement execution failed on: $query\n"); + warn("statement handle error message:".$sth->errstr."\n"); + return(0); + } + }; + if($@){ warn "Error running SQL"; return(0); } + + } + + return(1); +} + +#Private getter / setter to the Work DB Connection +sub _workdb { + return $_[0]->_getter_setter('workdb',$_[1]); +} + +#Private getter / setter to the cell type +sub _cell_type { + return $_[0]->_getter_setter('cell_type',$_[1]); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Annotation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Annotation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,160 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::Annotation + +=head1 DESCRIPTION + +'Annotation' is a base class for runnables running the Annotatin Pipeline +It performs common tasks such as connecting to the EFG DB etc... + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::Annotation; + +use warnings; +use strict; + +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; +use DBI; + +#use base ('Bio::EnsEMBL::Hive::ProcessWithParams'); +use base ('Bio::EnsEMBL::Hive::Process'); + +#This defines a set of parameters based on given parameters from the pipeline: +sub fetch_input { # nothing to fetch... just the DB parameters... + my $self = shift @_; + + my $dnadb_params = $self->param('dnadb') || throw "No parameters for Core DB"; + eval{ $self->_dnadba(Bio::EnsEMBL::DBSQL::DBAdaptor->new(%{ $dnadb_params })); }; + if($@) { throw "Error creating the Core DB Adaptor: $@"; } + if(!$self->_dnadba()){ throw "Could not connect to Core DB"; } + $self->_dnadb_params($dnadb_params); + + #Get efg connection, otherwise fail.. + my $efgdb_params = $self->param('efgdb') || throw "No parameters for EFG DB"; + + eval{ + $self->_efgdba(Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + %{ $efgdb_params }, + #Why is this not working??? + #-dnadb => $self->_dnadba, + -dnadb_user => $self->_dnadba->dbc->username, + -dnadb_port => $self->_dnadba->dbc->port, + -dnadb_host => $self->_dnadba->dbc->host, + -dnadb_dbname => $self->_dnadba->dbc->dbname, + )); + }; + + if($@) { throw "Error creating the EFG DB Adaptor: $@"; } + if(!$self->_efgdba()){ throw "Could not connect to EFG DB"; } + $self->_efgdb_params($efgdb_params); + + #Get work db connection, otherwise fail.. + my $workdb_params = $self->param('workdb') || throw "No parameters for Work server"; + $self->_workdb_params($workdb_params); + + my $species = $self->param('species') || throw "No species defined"; + $self->_species($species); + + #Not is use, currently... + #my $release = $self->param('release') || throw "Release number not specified"; + #$self->_release($release); + + my $work_dir = $self->param('work_dir') || throw "'work_dir' is a required parameter"; + $self->_work_dir($work_dir); + + #Work with conventions here too?? work_dir/output/dbname ?? + my $output_dir = $self->param('output_dir') || throw "'output_dir' is a required parameter"; + $self->_output_dir($output_dir); + + return 1; +} + + +sub run { + my $self = shift @_; + + return 1; +} + + +sub write_output { + my $self = shift @_; + + return 1; + +} + + +#Private Generic getter and setter +sub _getter_setter { + my ($self, $param_name, $param_value) = @_; + if(!$param_name){ return undef; } + if(!$param_value){ + $param_value = $self->param($param_name); + } else { + $self->param($param_name, $param_value); + } + return $param_value; +} + +# Private getter / setters : Maybe do some validation in some cases... + +#Private getter / setter to the Work DB Connection params +sub _workdb_params { + return $_[0]->_getter_setter('workdb_params',$_[1]); +} + +#Private getter / setter to the EFG DB Adaptor +sub _efgdba { + return $_[0]->_getter_setter('efgdb',$_[1]); +} + +#Private getter / setter to the EFG DB Connection params +sub _efgdb_params { + return $_[0]->_getter_setter('efgdb_params',$_[1]); +} + +#Private getter / setter to the Core DB Adaptor +sub _dnadba { + return $_[0]->_getter_setter('dnadb',$_[1]); +} + +#Private getter / setter to the Core DB Connection params +sub _dnadb_params { + return $_[0]->_getter_setter('dnadb_params',$_[1]); +} + +#Private getter / setter to the Species name +sub _species { + return $_[0]->_getter_setter('species',$_[1]); +} + +#Private getter / setter to the work folder +sub _work_dir { + return $_[0]->_getter_setter('work_dir',$_[1]); +} + +#Private getter / setter to the output folder +sub _output_dir { + return $_[0]->_getter_setter('output_dir',$_[1]); +} + +#Private getter / setter to the release number +sub _release { + return $_[0]->_getter_setter('release',$_[1]); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ClusterMotifs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ClusterMotifs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,233 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs + +=head1 DESCRIPTION + +'ClusterMotifs' + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs; + +use warnings; +use strict; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + return 1; +} + +sub run { + my $self = shift @_; + my @matrices; + + my $base = $self->_output_dir."/".$self->_feature_set->name; + + my $motif_file = $base.".transfac"; + #Maybe check if files are empty or not there at all! + + #Need to clump all motif files together... + run_system_cmd("cat ".$self->_output_dir."/*.tmp_TRANSFAC > ".$motif_file); + + eval { + #RUN STAMP here to cluster all motifs... change the Jaspar directories!! + my $bin_folder = $self->_bin_folder(); + run_system_cmd("${bin_folder}/STAMP -tf $motif_file -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -chp -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out $base"); + + #TODO Refactor this section... + my $is_top = 0; + my $top_matrix; #Is the one with smallest p + my %matrix_scores; + my $min_p = 1; + open(FILE,$motif_file); + while(){ + my $line = $_; + if($line =~ /^DE\s+(\S+)\s+(\S+)/){ + my $name = $1; my $p = $2; + $matrix_scores{$name} = $p; + if($p < $min_p){ + $top_matrix = ""; + $is_top = 1; + $min_p = $p; + #import the top matrix... + #Also check if it matches any known matrix... + my $ff = $base."_match_pairs.txt"; + if(-e $ff){ + open(FIM,$ff); + while(){ + if(/^>\s*(\S+)\s+/){ + if($1 eq $name){ + my $fline = ; chomp($fline); my ($jaspar, $score) = split(/\s+/, $fline); + if($score < 0.005){ $line =~ s/\n/\t${jaspar}\n/; } + last; + } + } + } + close FIM; + } else { warn $ff." was not found!"; } + } + } + if($is_top){ + $top_matrix .= $line; + if($line =~ /^XX/){ $is_top = 0; } + } + + } + close FILE; + if($top_matrix){ + push @matrices, _transfac_to_jaspar($top_matrix); + } else { warn "No top matrix found!!"; } + + #Just check what are the most similar matrices to the clusters... + run_system_cmd("${bin_folder}/STAMP -tf ${base}_tree_clusters.txt -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out ${base}_global"); + + #Recluster based on matrix similarity... + push @matrices, @{_recluster_motifs($self->_feature_set->name, $self->_bin_folder(),$base,\%matrix_scores)}; + }; + if ($@){ warn "No motif found: ".$@; } + + #run_system_cmd("rm -f ".$self->_output_dir."/*.tmp_TRANSFAC"); + + $self->_matrix_to_store(\@matrices); + + return 1; +} + + +sub write_output { # Nothing is written at this stage (for the moment) + my $self = shift @_; + + #Store the final matrices obtained... + open(FO,">".$self->_output_dir."/".$self->_feature_set->name.".final"); + print FO join("\n",@{$self->_matrix_to_store()}); + close FO; + + return 1; +} + +#TODO Need refactoring and optimization... +sub _recluster_motifs { + my ($fset, $bin_folder, $base, $scores) = (shift, shift, shift, shift); + my %clusters; + my @result_matrix = (); + + open(FILE,$base."_tree_clusters.txt"); + while(){ + if(/^DE\s+(\S+)\s*$/){ + #try to obtain a score here!... + my $cluster_id = $1; + my $matrix = $_; + while(){ $matrix .= $_; last if(/^XX/); } + my $cluster_members = ; + chomp($cluster_members); + $cluster_members =~ s/^XX\s+\Cluster_Members:\s+//; + $clusters{$cluster_id}{"matrix"}=$matrix; + push @{$clusters{$cluster_id}{"elements"}}, split(/\s+/,$cluster_members); + } + } + close FILE; + + #recalculate scores + foreach my $cluster (keys %clusters){ + my $score = 0; + map { $score += $scores->{$_}; } @{$clusters{$cluster}{"elements"}}; + $score = $score / scalar(@{$clusters{$cluster}{"elements"}}); + $clusters{$cluster}{"matrix"} =~ s/\n/\t${score}\n/; + $clusters{$cluster}{"score"} = $score; + } + + + my %reclust; + open(FILE,$base."_global_match_pairs.txt"); + while(){ + chomp; + if(/^>\s+(\S+)\s*$/){ + my $clust_id = $1; + my $match = ; + chomp($match); + my ($jaspar, $score, undef, undef) = split(/\s/,$match); + #Make this an alpha-parameter? + if($score < 0.005){ + #add the matrix in the first line... + $clusters{$clust_id}{"matrix"} =~ s/\n/\t${jaspar}\n/; + push @{$reclust{$jaspar}}, $clust_id; + } + } + } + close FILE; + + foreach my $jaspar (keys %reclust) { + if(scalar(@{$reclust{$jaspar}})>1){ + my $score = 0; + map { $score += $clusters{$_}{"score"}; } @{$reclust{$jaspar}}; + $score = $score / scalar(@{$reclust{$jaspar}}); + my $clust_file = $base."_cluster_".$jaspar; + open(FO,">".$clust_file); + map { print FO $clusters{$_}{"matrix"}; } @{$reclust{$jaspar}}; + close FO; + run_system_cmd("${bin_folder}/STAMP -tf $clust_file -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -chp -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out $clust_file"); + my $matrix; + open(FILE,"${clust_file}FBP.txt"); + #Re-add the averaged score here! + while(){ + if(/^DE/){ + $matrix = "DE ".$fset."_cluster_${jaspar}\t".$score."\t".$jaspar."\n"; + } else { $matrix .= $_; } + } + close FILE; + push @result_matrix, _transfac_to_jaspar($matrix); + map { delete $clusters{$_}; } @{$reclust{$jaspar}}; + } + } + + map { push @result_matrix, _transfac_to_jaspar($clusters{$_}{"matrix"}); } keys %clusters; + + return \@result_matrix; + +} + + +#private function that transforms a transfac matrix to jaspar format +sub _transfac_to_jaspar{ + my ($transfac) = shift; + my $jaspar; + + my @lines = split(/\n/,$transfac); + pop @lines; #XX + my $title = shift @lines; + $title =~ s/^DE/>/; + $jaspar = $title."\n"; + my @as; my @cs; my @gs; my @ts; + foreach my $line (@lines){ + my (undef,$a,$c,$g,$t,undef) = split(/\s+/,$line); + #convert it to integers if necessary to make it simpler after + if($a<=1 && $c<=1 && $g<=1 && $t<=1){ $a = int(100*$a); $c=int(100*$c); $g=int(100*$g); $t=int(100*$t); } + push @as, $a; push @cs, $c; push @gs, $g; push @ts, $t; + } + $jaspar .= "A [ ".join("\t",@as)." ]\n" ; + $jaspar .= "C [ ".join("\t",@cs)." ]\n" ; + $jaspar .= "G [ ".join("\t",@gs)." ]\n" ; + $jaspar .= "T [ ".join("\t",@ts)." ]\n" ; + + return $jaspar; +} + +#Private getter / setter to the matrices +sub _matrix_to_store { + return $_[0]->_getter_setter('matrix_to_store',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ConvergeReplicates.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ConvergeReplicates.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,108 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::ConvergeReplicates + +=head1 DESCRIPTION + +'ConvergeReplicates' Just merges the replicates BAM and convert it to a SAM +into the alignments folder for the peaks pipeline... + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::ConvergeReplicates; + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +#use Data::Dumper; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Alignment'); + +#TODO... Maybe use and update the tracking database... +sub fetch_input { # fetch parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $species = $self->_species(); + my $gender = $self->_cell_type()->gender; + $gender = $gender ? $gender : "male"; + my $assembly = $self->_assembly(); + + my $sam_header = $self->_work_dir()."/sam_header/".$species."/".$species."_".$gender."_".$assembly."_unmasked.header.sam"; + $self->_sam_header($sam_header); + + my $repository = $self->_repository(); + if(! -d $repository){ + system("mkdir -p $repository") && throw("Couldn't create directory $repository"); + } + + #my $nbr_replicates = $self->param('nbr_replicates') || throw "Number of replicates not given"; + my $nbr_replicates = $self->param('nbr_replicates') || 2; + $self->_nbr_replicates($nbr_replicates); + + return 1; +} + +sub run { + my $self = shift @_; + + my $sam_header = $self->_sam_header(); + + my $output_file_prefix = $self->_repository()."/".$self->_set_name(); + my $file_prefix = $self->_output_dir()."/".$self->_set_name(); + my $merge_cmd; + if($self->_nbr_replicates()>1){ + $merge_cmd="samtools merge -h $sam_header ${file_prefix}.bam ${file_prefix}.[0-9]*.sorted.bam "; + } else { + $merge_cmd = "cp ${file_prefix}.[0-9]*.sorted.bam ${file_prefix}.bam"; + } + if(system($merge_cmd) != 0){ throw "Error merging replicate bam files: $merge_cmd"; } + + + my $convert_cmd = "samtools view -h ${file_prefix}.bam > ${output_file_prefix}.samse.sam"; + if(system($convert_cmd) != 0){ throw "Error converting merged bam to sam: $convert_cmd"; } + + my $zip_cmd = "gzip ${output_file_prefix}.samse.sam"; + if(system($zip_cmd) != 0){ warn "Error zipping sam"; } + + #Merge the alignment logs too... + my $alignment_log = $output_file_prefix.".alignment.log"; + my $log_cmd="echo \"Alignment QC - total reads as input: \" >> ${alignment_log}"; + $log_cmd="${log_cmd};samtools flagstat ${file_prefix}.bam | head -n 1 >> ${alignment_log}"; + $log_cmd="${log_cmd}; echo \"Alignment QC - mapped reads: \" >> ${alignment_log} "; + $log_cmd="${log_cmd};samtools view -u -F 4 ${file_prefix}.bam | samtools flagstat - | head -n 1 >> ${alignment_log}"; + $log_cmd="${log_cmd}; echo \"Alignment QC - reliably aligned reads (mapping quality >= 1): \" >> ${alignment_log}"; + $log_cmd="${log_cmd};samtools view -u -F 4 -q 1 ${file_prefix}.bam | samtools flagstat - | head -n 1 >> ${alignment_log}"; + + if(system($log_cmd) != 0){ warn "Error making the alignment statistics"; } + + my $rm_cmd="rm -f ${file_prefix}.bam"; + if(system($rm_cmd) != 0){ warn "Error removing temp files. Remove them manually: $rm_cmd"; } + + return 1; +} + + +sub write_output { # Nothing to do here + my $self = shift @_; + + return 1; + +} + +#Private getter / setter to the sam header +sub _sam_header { + return $_[0]->_getter_setter('sam_header',$_[1]); +} + +#Private getter / setter to the sam header +sub _nbr_replicates { + return $_[0]->_getter_setter('nbr_replicates',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Funcgen.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Funcgen.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,399 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen + +=head1 DESCRIPTION + +'Funcgen' is a base class for other runnables of the Funcgen Hive Pipeline +It performs common tasks such as connecting to the EFG DB etc... + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen; + +use warnings; +use strict; + +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +#use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; +use base ('Bio::EnsEMBL::Hive::Process'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +#This defines a set of parameters based on given parameters from the pipeline: +sub fetch_input { # nothing to fetch... just the DB parameters... + my $self = shift @_; + + #An example of debug, in case needed + #print Dumper $self->param('dnadb'); + if(!$self->param('bin_dir')){ throw "Folder with funcgen binaries bin_dir required"; } + $self->_bin_dir($self->param('bin_dir')); + + my $dnadb_params = $self->param('dnadb') || throw "No parameters for Core DB"; + my $efgdb_params = $self->param('efgdb') || throw "No parameters for EFG DB"; + + #Get efg connection, otherwise fail.. + eval{ + $self->_efgdba(Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new + ( + %{ $efgdb_params }, + #let efg dba hanle dnadb + { + -dnadb_name => $dnadb_params->{-dbname}, + -dnadb_user => $dnadb_params->{-user}, + -dnadb_host => $dnadb_params->{-host}, + -dnadb_port => $dnadb_params->{-port}, + -dnadb_pass => $dnadb_params->{-pass}, + + } + )); + + #Actually test connections + $self->_efgdba->dbc->db_handle; + $self->_efgdba->dnadb->dbc->db_handle; + }; + + if($@) { throw "Error creating the EFG DBAdaptor and/or dna DBAdaptor $@"; } + + #Set some params + my $cell_type = $self->param('cell_type') || throw "No cell_type given"; + my $feature_type = $self->param('feature_type') || throw "No feature_type given"; + my $experiment_name = $self->param('experiment_name') || throw "No experiment_name given"; + $self->_experiment_name($experiment_name); + my $set_name = $self->param('set_name') || $cell_type."_".$feature_type."_".$experiment_name; + $self->_set_name($set_name); + my $group_name = $self->param('group') || 'efg'; + my $species = $self->param('species') || throw "No species defined"; + $self->_species($species); + my $assembly = $self->param('assembly') || throw "No assembly version given"; + $self->_assembly($assembly); + my $file_type = $self->param('file_type') || throw "No file type given"; + $self->_file_type($file_type); + my $work_dir = $self->param('work_dir') || throw "'work_dir' is a required parameter"; + $self->_work_dir($work_dir); + + + + #Configure DBAdaptors + my $efgdba = $self->_efgdba(); + #To avoid farm issues... + $efgdba->dbc->disconnect_when_inactive(1); + $efgdba->dnadb->dbc->disconnect_when_inactive(1); + + #Fetch & Set object params + #CellType + my $cta = $efgdba->get_CellTypeAdaptor(); + my $ct_obj = $cta->fetch_by_name($cell_type); + if(!$ct_obj){ throw "Cell type $cell_type does not exist in the database"; } + $self->_cell_type($ct_obj); + + #FeatureType + my $fta = $efgdba->get_FeatureTypeAdaptor(); + my $ft_obj = $fta->fetch_by_name($feature_type); + if(!$ft_obj){ throw "Feature type $feature_type does not exist in the database"; } + $self->_feature_type($ft_obj); + + #ExperimentalGroup + my $ega = $efgdba->get_ExperimentalGroupAdaptor(); + my $eg_obj = $ega->fetch_by_name($group_name); + if(!$eg_obj){ throw "Experimental Group $group_name does not exist in the database"; } + $self->_group($eg_obj); + + + if($file_type eq 'sam'){ + #Change the directory structure so it will agree with the rest, without the need to do uc() + my $sam_header = $self->_work_dir()."/sam_header/".$species."/".$species."_"; + $sam_header .= $ct_obj->gender() ? $ct_obj->gender() : 'male'; + #Carefull with naming standards... + #$sam_header .= "_".$assembly."_unmasked.fa.fai"; + $sam_header .= "_".$assembly."_unmasked.fasta.fai"; + $self->_sam_header($sam_header); + } + + #Work with conventions here too?? work_dir/output/dbname ?? + my $output_dir = $self->param('output_dir') || throw "'output_dir' is a required parameter"; + $self->_output_dir($output_dir."/".$experiment_name); + + return 1; +} + + +sub run { + my $self = shift @_; + + return 1; +} + + +sub write_output { + my $self = shift @_; + + return 1; + +} + + +#Private Function to check and create Experiment and Feature/Data sets as needed +#Requires some global parameters that are not set in Funcgen->fetch_input, such as +#'analysis', 'feature_set_name', 'data_set_name' (these could be given as local parameters...) +sub _check_Experiment { + + #Todo make it more generic and accept multiple input_subsets + #Also maybe pass parameters as hash list... + my ($self, $analysis, $input_subset, $fset_name) = @_; + + #Global parameters set in Funcgen->fetch_input + my $efgdba = $self->_efgdba(); + my $set_name = $self->_set_name(); + my $group = $self->_group(); + my $cell_type = $self->_cell_type(); + my $feature_type = $self->_feature_type(); + + my $iset_name = $set_name; + my $dset_name = $fset_name; + + # set experiment: Reuse if already exists? (This comes from result sets) + my $ea = $efgdba->get_ExperimentAdaptor; + my $exp = $ea->fetch_by_name($set_name); + + my @date = (localtime)[5,4,3]; + $date[0] += 1900; $date[1]++; + + if (! defined $exp) { + + #Group needs to be set manually, like Cell_Type and Feature_Type + #Do not create Group on the fly here, as it will cause concurrency issues... + $exp = Bio::EnsEMBL::Funcgen::Experiment->new + ( + -NAME => $set_name, + -EXPERIMENTAL_GROUP => $group, + -DATE => join('-', @date), + -PRIMARY_DESIGN_TYPE => 'binding_site_identification', + -ADAPTOR => $ea, + ); + + ($exp) = @{$ea->store($exp)}; + + } + throw("Can't create experiment $set_name ") unless $exp; + + my $isa = $efgdba->get_InputSetAdaptor(); + my $iset = $isa->fetch_by_name($iset_name); + + if (! defined $iset){ + + $iset = Bio::EnsEMBL::Funcgen::InputSet->new + ( + -name => $iset_name, + -experiment => $exp, + -feature_type => $feature_type, + -cell_type => $cell_type, + -vendor => 'SOLEXA', + -format => 'SEQUENCING', + -feature_class => 'result' + # Analysis is not being used?? + #-analysis => $self->feature_analysis, + ); + warn "Storing new InputSet:\t$iset_name\n"; + ($iset) = @{$isa->store($iset)}; + + $iset->add_new_subset($input_subset); + $iset->adaptor->store_InputSubsets($iset->get_InputSubsets); + } else { + + #We only expect one subset here (? why??)... + #shouldn't we be adding the control file also when used?? But this is SWEmbl-specific... + #And it should be the same file name... + #Maybe do some file checking here??? + warn "InputSet already exists:\t$iset_name\n"; + my @issets = @{$iset->get_InputSubsets}; + + #if(scalar(@issets) > 1){ + # throw("InputSet $iset_name has more than one InputSubset:\t".join("\t", (map $_->name, @issets))); + #} elsif((scalar(@issets) == 1) && ($issets[0]->name ne $self->param('input_file'))){ + # throw("InputSet $iset_name already has an InputSubset(".$issets[0]->name.") which does not match ".$self->param('input_file')); + #} elsif(scalar(@issets) == 0){ #we can just add this InputSubset + # $iset->add_new_subset($self->input_id); + # $iset->adaptor->store_InputSubsets($iset->get_InputSubsets); + #} + + if(scalar(@issets)==0){ + #we can just add this InputSubset. Add an extra 'input:' as prefix? + $iset->add_new_subset($input_subset); + $iset->adaptor->store_InputSubsets($iset->get_InputSubsets); + } else { + #warn("Need to uncomment this section!! - it was commented just for testing purposes!!"); + #we just need to check if our file(s) is(are) already here... + if(!$iset->get_subset_by_name($input_subset)){ + #throw("InputSet $iset_name has InputSubsets(".join("\t", (map $_->name, @issets)).") which do not match ".$input_subset); + #warn("InputSet $iset_name has InputSubsets(".join("\t", (map $_->name, @issets)).") which do not match ".$input_subset); + } + } + } + + my $fsa = $efgdba->get_FeatureSetAdaptor(); + my $fset = $fsa->fetch_by_name($fset_name); + + if ( ! defined $fset ) { + + $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new + ( + -analysis => $analysis, + -feature_type => $feature_type, + -cell_type => $cell_type, + -name => $fset_name, + -feature_class => 'annotated', + -experiment_id => $exp->dbID, + #The adaptor is needed to store! + -adaptor => $fsa + + ); + + warn "Storing new FeatureSet:\t$fset_name\n"; + ($fset) = @{$fsa->store($fset)}; + + } + else { + warn "FeatureSet already exists:\t$fset_name\n"; + + if(@{$efgdba->get_AnnotatedFeatureAdaptor->fetch_all_by_FeatureSets([$fset])}){ + throw "Feature Set $set_name already contains data. Please rollback before rerunning"; + } + + } + + my $dsa = $efgdba->get_DataSetAdaptor; + my $dset = $dsa->fetch_by_name($dset_name); + + + if ( ! defined $dset ) { + + $dset = Bio::EnsEMBL::Funcgen::DataSet->new + ( + -SUPPORTING_SETS => [$iset], + -FEATURE_SET => $fset, + -DISPLAYABLE => 1, + -NAME => $dset_name, + -SUPPORTING_SET_TYPE => 'input', + ); + + warn "Storing new DataSet:\t$dset_name\n"; + ($dset) = @{$dsa->store($dset)} + } + else { + + warn "DataSet already exists:\t$dset_name\n"; + + # need to check whether InputSets and supporting_sets are the same and + # possibly add InputSet to supporting_sets + + my $ssets = $dset->get_supporting_sets(); + + my %ssets_dbIDs = (); + map { $ssets_dbIDs{$_->dbID}='' } (@{$ssets}); + $dset->add_supporting_sets([ $iset ]) if (! exists $ssets_dbIDs{$iset->dbID}); + + } + +} + + +#Private Generic getter and setter +sub _getter_setter { + my ($self, $param_name, $param_value) = @_; + if(!$param_name){ return undef; } + if(!$param_value){ + $param_value = $self->param($param_name); + } else { + $self->param($param_name, $param_value); + } + return $param_value; +} + +# Private getter / setters : Maybe do some validation in some cases... + +#Private getter / setter to the EFG DB Adaptor +sub _efgdba { + return $_[0]->_getter_setter('efgdb',$_[1]); +} + +#Private getter / setter to the Core DB Adaptor +sub _dnadba { + return $_[0]->_getter_setter('dnadb',$_[1]); +} + +#Private getter / setter to the Cell Type object +sub _cell_type { + return $_[0]->_getter_setter('cell_type',$_[1]); +} + +#Private getter / setter to the Feature Type object +sub _feature_type { + return $_[0]->_getter_setter('feature_type',$_[1]); +} + +#Private getter / setter to the Species name +sub _species { + return $_[0]->_getter_setter('species',$_[1]); +} + +#Private getter / setter to the assembly name +sub _assembly { + return $_[0]->_getter_setter('assembly',$_[1]); +} + +#Private getter / setter to the Analysis object +sub _analysis { + return $_[0]->_getter_setter('analysis',$_[1]); +} + +#Private getter / setter to the Group +sub _group { + return $_[0]->_getter_setter('group',$_[1]); +} + +#Private getter / setter to the Experiment Name (do not mix with the Set Name) +sub _experiment_name { + return $_[0]->_getter_setter('experiment_name',$_[1]); +} + +#Private getter / setter to the Set Name +sub _set_name { + return $_[0]->_getter_setter('set_name',$_[1]); +} + +#Private getter / setter to the file type +sub _file_type { + return $_[0]->_getter_setter('file_type',$_[1]); +} + +#Private getter / setter to the sam header (only set when file type is sam) +sub _sam_header { + return $_[0]->_getter_setter('sam_header',$_[1]); +} + +#Private getter / setter to the work folder +sub _work_dir { + return $_[0]->_getter_setter('work_dir',$_[1]); +} + +#Private getter / setter to the output folder +sub _output_dir { + return $_[0]->_getter_setter('output_dir',$_[1]); +} + +#Private getter / setter to the bin folder +sub _bin_dir { + return $_[0]->_getter_setter('bin_dir',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Import.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Import.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,192 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::Import + +=head1 DESCRIPTION + +'Import' is the base Runnable for the Import Pipeline + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::Import; + +use base ('Bio::EnsEMBL::Hive::Process'); + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (strip_param_args generate_slices_from_names strip_param_flags run_system_cmd); +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Importer; +#use Data::Dumper; + +#global values for the Helper... maybe pass as parameters... +$main::_debug_level = 0; +$main::_tee = 0; +$main::_no_log = 1; + +sub fetch_input { # fetch parameters... + my $self = shift @_; + + if(!defined($ENV{EFG_SRC})){ throw "NEED to define EFG_SRC"; } + + #Either test $ENV{EFG_DATA} or use a changed version of Importer.pm + + $self->param('name',$self->param('input_set')); + $self->param('output_dir',$self->param('output_dir')."/".$self->param('input_set')); + + $ENV{EFG_DATA} = $self->param('output_dir'); + + #if($self->param('prepared')){ + # $self->param('result_file', $self->param('output_dir')."/".$self->param('result_file')) + #} else { + # + #} + + my $Imp = Bio::EnsEMBL::Funcgen::Importer->new + ( + -name => $self->param('name'), + -format => $self->param('format'), + -vendor => $self->param('vendor'), + -parser => $self->param('parser'), + -dbname => $self->param('dbname'), + -pass => $self->param('pass'), + -host => $self->param('host'), + -user => $self->param('user'), + -port => $self->param('port'), + # -registry_pass => $self->param('registry_pass'), + -registry_host => $self->param('registry_host'), + -registry_user => $self->param('registry_user'), + -registry_port => $self->param('registry_port'), + -release => $self->param('registry_version'), + # -ssh => $ssh, + -group => $self->param('group'), + -location => $self->param('location'), + -contact => $self->param('contact'), + # -array_set => $array_set, + -input_set_name => $self->param('input_set'), + -input_feature_class => $self->param('input_feature_class'), + # -array_name => $array_name, + -result_set_name => $self->param('input_set'), #not implemented yet + -feature_type_name => $self->param('feature_type'), + -feature_analysis => $self->param('feature_analysis'), + -cell_type_name => $self->param('cell_type'), + # -write_mage => $write_mage, + # -update_xml => $update_xml, + # -no_mage => $no_mage, + -assembly => $self->param('assembly'), + -data_dir => $self->param('data_dir'), + -output_dir => $self->param('output_dir'), + -recover => $self->param('recover'), + # -dump_fasta => $fasta, + # -norm_method => , + -species => $self->param('species'), + -farm => $self->param('farm'), + -batch_job => $self->param('batch_job'), + -prepared => $self->param('prepared'), + -verbose => $self->param('verbose'), + -input_dir => $self->param('input_dir'), + # -exp_date => $exp_date, + -result_files => [ $self->param('result_file') ], + -total_features => $self->param('total_features'), + # -old_dvd_format => $old_dvd_format, + # -ucsc_coords => $ucsc, + # -release => $release, + _no_log => 1, + ); + + if(!$Imp){ throw "Could not create importer"; } + $self->param('importer', $Imp); + + #print "Output_dir: ".$Imp->get_dir('output')."\n"; + + return 1; +} + +sub run { # Check parameters and do appropriate database/file operations... + my $self = shift @_; + + my $Imp = $self->param('importer'); + + if($self->param('wrap_up')){ + + # run the merge script here... + my $cmd = $ENV{EFG_SRC}."/scripts/import/merge_and_index_collections.pl ". + " -dbhost ".$self->param('host'). + " -dbport ".$self->param('port'). + " -dbuser ".$self->param('user'). + " -dbpass ". $self->param('pass'). + " -dbname ". $self->param('dbname'). + " -data_dir ". $self->param('output_dir'). + " -result_set_name ".$self->param('input_set'); + run_system_cmd($cmd); + + + #set appropriate states... + #dont get the result set directly from $Imp as it is not initialized... + my $rseta = $Imp->db->get_ResultSetAdaptor(); + my ($rset) = @{$rseta->fetch_all_by_name($self->param("name"))}; + if(!$rset){ throw "Could not find ResultSet"; } + $rseta->set_imported_states_by_Set($rset); + return 1; + } + + my @slices; + my @skip_slices; + #Allow for a list of slices as input and not only one? + push(@slices, $self->param('slice')) if $self->param('slice'); + #@slices = @{&generate_slices_from_names($Imp->slice_adaptor,\@slices, \@skip_slices, $toplevel, $nonref, $incdups)}; + @slices = @{&generate_slices_from_names($Imp->slice_adaptor, \@slices, \@skip_slices, 1, 0, 1)};#toplevel, nonref, incdups + $Imp->slices(\@slices); + + if(!$self->param('prepared')){ + + $Imp->init_experiment_import; #Define and store sets once here before setting off parallel farm jobs. + + #Preparing data... + $Imp->read_and_import_data('prepare'); + + #and now prepare all the jobs to be run... + my @rep_out_ids; + #Create the necessary LoadReads jobs + #Get the slices from the preparation process itself? + foreach my $slice (@slices){ + my $new_input_id = eval($self->input_id); + $new_input_id->{"slice"} = $slice->seq_region_name; + $new_input_id->{"result_file"} = $Imp->output_file; + $new_input_id->{"total_features"} = $Imp->counts('total_features'); + push(@rep_out_ids,$new_input_id); + } + + # Carefull with flow numbers... 6,7... maybe pass as parameter?? + #Wrapup job + my ($funnel_job_id) = @{ $self->dataflow_output_id($self->input_id, 2, { -semaphore_count => scalar(@rep_out_ids) } ) }; + #All the fanned jobs... + $self->dataflow_output_id(\@rep_out_ids, 1, { -semaphored_job_id => $funnel_job_id } ); + + + } else { + + #eval { + #This eval is because a few will crash because the slice is not in the sequence!! + #Check that this is working as it should!... + $Imp->register_experiment(); + #}; + #if($@){ warn "Carefull with possible failure in import: $@"; } + } + + return 1; +} + + +sub write_output { # Create the relevant jobs + my $self = shift @_; + + return 1; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ImportMotifFeatures.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ImportMotifFeatures.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,232 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::ImportMotifFeatures + +=head1 DESCRIPTION + +'Import' is the base Runnable for the Import Pipeline + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::ImportMotifFeatures; + +use base ('Bio::EnsEMBL::Hive::Process'); + + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd generate_slices_from_names); +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Importer; +#use Data::Dumper; + +#global values for the Helper... maybe pass as parameters... +$main::_debug_level = 0; +$main::_tee = 0; +$main::_no_log = 1; + +sub fetch_input { # fetch parameters... + my $self = shift @_; + + throw "No matrix given" if ! $self->param('matrix'); + #throw "No feature type given" if ! $self->param('feature_type'); + throw "No file given" if ! $self->param('file'); + my $file_name = $self->param('file'); + + ### Create DB connections + my $coredb = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -user => $self->param('dnadb_user'), + -port => $self->param('dnadb_port'), + -host => $self->param('dnadb_host'), + -dbname => $self->param('dnadb_name'), + ); + + + ### Create DB connections + my $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + -dbname => $self->param('dbname'), + -port => $self->param('port'), + -pass => $self->param('pass'), + -host => $self->param('host'), + -user => $self->param('user'), + -dnadb => $coredb, + ); + + + #test db connections + $db->dbc->db_handle; + + $self->param('dba', $db); + + if(! -e $file_name ){ throw " Could not find ".$file_name; } + + my $fta = $db->get_FeatureTypeAdaptor(); + + #We're associating to this matrix the associated feature types + my $bma = $db->get_BindingMatrixAdaptor(); + + #Get by name... + my @bms = @{$bma->fetch_all_by_name($self->param('matrix'))}; + if(scalar(@bms) == 0){ + throw $self->param('matrix')." could not be found"; + } + if(scalar(@bms) > 1){ + throw "More than one matrix with the same name is not currently supported"; + } + my $bm = $bms[0]; + $self->param('matrix', $bm); + + #check if there is already data for this matrix stored... if there is, throw an error... + #TODO test specifically for data in slices... + my $count = $db->dbc->db_handle->selectrow_array("select count(1) from motif_feature where binding_matrix_id=".$bm->dbID); + if(($count>0) && !($self->param('slices'))){ throw "Data for ".$bm->name." already exists! Remove it first"; } + + my $ft = $bm->feature_type; + my @fts = ($ft); + + #Should we be doing this? + push @fts, @{$fta->fetch_all_by_association($ft)}; + + my @fsets; + my $fsa = $db->get_FeatureSetAdaptor(); + map { map { push @fsets, $_->dbID; } @{$fsa->fetch_all_by_FeatureType($_)} } @fts; + + my $output_file_name = $self->param('output_dir')."/annotated_features_".$bm->name.".tab"; + + my $query = "select distinct sr.name, af.seq_region_start, af.seq_region_end, af.annotated_feature_id from ". + " annotated_feature af, seq_region sr where af.seq_region_id=sr.seq_region_id and ". + " feature_set_id in (".join(",",@fsets).")"; + my $cmd = "mysql -e \"".$query."\" -quick -h".$self->param('host')." -P".$self->param('port'). + " -u".$self->param('user')." -p".$self->param('pass').' '.$self->param('dbname')." >".$output_file_name; + + print $cmd."\n"; + + system($cmd) && throw "Error dumping Annotated Features for ".$bm->name." $cmd"; + + $self->param('output_file_name',$output_file_name); + + return 1; +} + +sub run { # Check parameters and do appropriate database/file operations... + + my $self = shift @_; + my $sa = $self->param('dba')->get_SliceAdaptor; + my $mfa = $self->param('dba')->get_MotifFeatureAdaptor; + + my $bm = $self->param('matrix'); + + + + my $results = $self->param('output_dir')."/overlaps_".$bm->name.".tab"; + my $cmd = "perl ".$self->param('efg_src')."/scripts/miscellaneous/cooccur.pl ".$self->param('file')." ".$self->param('output_file_name')." > ".$results; + system($cmd) && throw "Error executing coocur: $cmd"; + + my @slices; + if($self->param('slices')){ + @slices = split(/,/,$self->param('slices')); + } + + my %data; + my %slice_cache; + open(FILE,$results); + while(){ + chomp; + my ($sr,$start,$end,$desc,$score,$strand,$bm_name,$sr_af,$start_af,$end_af,$af_id) = split("\t"); + # only include those completely included in the annotated feature + next if ($start < $start_af); + next if ($end > $end_af); + + #Quick hack to only import specific slices... + my $filter = 0; + if(scalar(@slices)>0){ + $filter=1; + foreach my $slice (@slices){ + if($slice eq $sr){ $filter = 0; } + } + } + + next if($filter); + + if(!defined($slice_cache{$sr})){ + my $slice = $sa->fetch_by_region('toplevel',$sr); + if($slice) { + $slice_cache{$sr} = $slice; + } else { + warn "Slice $sr not found: silently ignoring entry"; + } + } + + #maybe double check overlap?? Should be fine, though... + #maybe also cross-check bm_name with $bm->name? + if($bm->name ne $bm_name){ warn "Entry is for $bm_name, not for ".$bm->name." : entry ignored"; next; } + $data{$sr}{$start}{$end}{$strand}{'score'} = $score; + #Carefull there may be duplicates. Maybe turn into an array instead + $data{$sr}{$start}{$end}{$strand}{'assoc_feats'}{$af_id}=1; + + } + close FILE; + + #New procedure may be of loading all Filtered matches, + # and storing all associated annotated features... + #Carefull the score may need to be the relative_affinity... + + #Keep the threshold; + my $min_relative_affinity=1; + foreach my $sr (keys %data){ + my $slice = $slice_cache{$sr}; + foreach my $start (keys %{$data{$sr}}){ + foreach my $end (keys %{$data{$sr}{$start}}){ + foreach my $strand (keys %{$data{$sr}{$start}{$end}}){ + + my $motif_slice = $sa->fetch_by_region('toplevel',$sr, $start, $end, $strand); + my $relative_affinity = $bm->relative_affinity($motif_slice->seq); + if($relative_affinity < $min_relative_affinity){ + $min_relative_affinity = $relative_affinity; + } + my $mf = Bio::EnsEMBL::Funcgen::MotifFeature->new + ( + -slice => $slice, + -start => $start, + -end => $end, + -strand => $strand, + -binding_matrix => $bm, + #-score => $data{$sr}{$start}{$end}{$strand}{'score'}, + #Only round on the MotifFeature, not the BindingMatrix + -score => sprintf("%.3f", $relative_affinity), + ); + if($mf){ + $mfa->store($mf); #($mf) = store gives null? + foreach my $af_id (keys %{$data{$sr}{$start}{$end}{$strand}{'assoc_feats'}}){ + my $sql = "INSERT INTO associated_motif_feature (annotated_feature_id, motif_feature_id) VALUES ($af_id, ".$mf->dbID.")"; + $self->param('dba')->dbc->do($sql); + } + + #A safer way is to fetch the af and save it through the API + #$mfa->store_associated_AnnotatedFeature($mf,$af); + } + } + } + } + } + + #Update the matrix threshold (need to do it directly in SQL?) + my $sql = "UPDATE binding_matrix set threshold=".$min_relative_affinity." where binding_matrix_id=".$bm->dbID; + $self->param('dba')->dbc->do($sql); + + return 1; +} + + +sub write_output { + my $self = shift @_; + + return 1; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/InferMotifs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/InferMotifs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,154 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs + +=head1 DESCRIPTION + +'Funcgen::InferMotifs' + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs; + +use warnings; +use strict; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + if(!$self->param('bin')){ throw "No bin defined!"; } + $self->_bin($self->param('bin')); + + return 1; +} + +sub run { + + my $self = shift @_; + + my $afa = $self->_efgdba()->get_AnnotatedFeatureAdaptor(); + #Order features descending by score + my @features = sort { $b->score <=> $a->score } @{$afa->fetch_all_by_FeatureSets( [ $self->_feature_set ] )}; + + # Print the sequences for this bin + my $sa = $self->_efgdba()->dnadb->get_SliceAdaptor(); + my $fasta_file = $self->_output_dir."/bin.".$self->_bin.".fasta"; + open(FO,">".$fasta_file) or throw "Couldn't open output file"; + my $bin_start = ($self->_bin - 1)*$self->_bin_size; + for(my $i=$bin_start; $i<($bin_start+$self->_bin_size); $i++){ + my $ft = $features[$i]; + my $sr_name = $ft->seq_region_name; + my $start = POSIX::floor($ft->summit - $self->_window_size); + my $end = POSIX::floor($ft->summit + $self->_window_size); + my $slice = $sa->fetch_by_region( undef, $sr_name, $start, $end); + if(defined($slice)){ + print FO ">".$sr_name.",".$start.",".$end."\n".$slice->get_repeatmasked_seq()->seq."\n"; + } else { + warn $sr_name.",".$start.",".$end." could not be found\n"; + } + } + close FO; + + #Create a background to try and avoid repetitive A's + #fasta-get-markov is part of the meme package (but is very simple to replicate if needed) + #Use a pre-compiled background unique for the whole of human... + #run_system_cmd("cat ".$fasta_file." | fasta-get-markov -m 3 > ".$fasta_file.".bg"); + + #Run MEME for this bin + my $species = $self->_species; + my $meme_file = $self->_output_dir."/bin.".$self->_bin.".meme"; + #Replace meme parameters by parameters from an analysis in the database... + #run_system_cmd("meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 0.00001 -nmotifs 10 -minsites 50 -minw 6 -maxw 20 -bfile ~/src/STAMP.v1.1/tests/${species}_masked.bg $fasta_file > ".$meme_file); + run_system_cmd($self->_bin_folder()."/meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 1e-20 -nmotifs 5 -minsites 50 -minw 6 -maxw 20 $fasta_file > ".$meme_file); + + my $basename = $self->_feature_set->name.".bin_".$self->_bin; + if(_process_meme_to_STAMP($meme_file, $basename) == 0){ + warn "No motif found "; + } + + #Eliminate the temp files... + #run_system_cmd("rm -f ".$fasta_file); + #run_system_cmd("rm -f ".$meme_file); + + return 1; +} + + +sub write_output { # Nothing is written at this stage (for the moment) + + my $self = shift @_; + + return 1; + +} + + +sub _process_meme_to_STAMP { + my ($meme_file, $basename) = (shift, shift, shift); + + my $num_motifs = 0; + my $log_odds = 1; + open(FO,">".$meme_file.".tmp_TRANSFAC"); + open(FILE, $meme_file); + while(){ + chomp; + if(/^MOTIF\s+.*E-value\s*=\s+(\S+)/) { $log_odds = $1; } + if(/^BL\s+MOTIF\s+(\d+)\s+width=(\d+)\s+seqs=(\d+)\s*$/){ + my $motif_num = $1; + my $motif_width = $2; + my $num_seqs = $3; + + + my @init = (0) x $motif_width; + my %matrix; + #Initialize with 0 + for (my $j=0; $j<$motif_width; $j++){ + $matrix{"A"}->[$j] = $matrix{"C"}->[$j] = $matrix{"G"}->[$j] = $matrix{"T"}->[$j] = 0; + } + + for (my $i=1;$i<=$num_seqs;$i++){ + my $line = ; + $line =~ /^\S+\s+\(\s*\d+\)\s+(\w+)\s+\d+\s*$/; + my @seq = split(//,uc($1)); + if(scalar(@seq) != $motif_width){ throw "Problems parsing motif file"; } + for (my $j=0; $j<$motif_width; $j++){ $matrix{$seq[$j]}->[$j]++ } + } + my $end = ; + if(! $end =~ /^\/\/\s*$/){ + throw "Error in the MEME file: expecting \/\/ after $num_seqs lines "; + } + print FO "DE ".$basename.".".$motif_num."\t".$log_odds."\n"; + for(my $i=0; $i<$motif_width; $i++){ + print FO $i."\t".$matrix{"A"}->[$i]."\t".$matrix{"C"}->[$i]."\t".$matrix{"G"}->[$i]."\t".$matrix{"T"}->[$i]."\n" + } + print FO "XX\n"; + + $num_motifs++; + + } + + } + close FILE; + close FO; + + return $num_motifs; + +} + +#Private getter / setter to the bin +sub _bin { + return $_[0]->_getter_setter('bin',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,169 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile + +=head1 DESCRIPTION + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile; + +use warnings; +use strict; +use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; +use base ('Bio::EnsEMBL::Hive::ProcessWithParams'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $work_dir = $self->param('work_dir'); + my $matrix = $self->param('matrix'); + my $dnase = $self->param('dnase'); + #throw "Dnase file $dnase does not exist in $work_dir" if(! -e $work_dir."/".$dnase); + throw "Matrix file $matrix does not exist in $work_dir" if(! -e $work_dir."/matches/".$matrix); + my $is_male = $self->param('is_male'); + throw "Need to define is_male" if(!defined($is_male)); + #throw "mapability.bedGraph file expected in $work_dir" if(! -e $work_dir."/mapability.bedGraph"); + + return 1; +} + +sub run { # Create Groups and Analysis and stuff... + my $self = shift @_; + + my $hlen = 100; + my $work_dir = $self->param('work_dir'); + my $matrix = $self->param('matrix'); + my $dnase = $self->param('dnase'); + my $is_male = $self->param('is_male'); + + my $motif_size; + my %motifs; + #Or remove // from matrix name + open(FILE,$work_dir."/matches_vf/".$matrix); + ; #title + while(){ + chomp; + #10 71532 71542 Gata1 8.55866 1 MA0035.2 + my ($chr,$start,$end,$name,$score,$strand)=split("\t"); + next if(!$is_male && ($chr =~ /Y/)); + #Ignore Mitochondria... + next if($chr =~ /MT/); + if(!defined($motif_size)){ $motif_size = $end - $start; } + $motifs{$chr}{$start}{$strand}=$score; + } + close FILE; + + #Maybe pass this as parameters + my $out_file = $work_dir."/output/".$matrix."_".$dnase.".counts"; + open(FO,">".$out_file); + + foreach my $chr (sort keys %motifs){ + my %datap = (); + my %datan = (); + foreach my $start (sort keys %{$motifs{$chr}}){ + for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + $datap{$i} = 0; + $datan{$i} = 0; + } + } + + #Add a mappability score to extra filter... + #A lot more time and didn't see much evidence of improvement... + #my %mappability = (); + #foreach my $start (sort keys %{$motifs{$chr}}){ + # for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + # $mappability{$i} = 0; + # } + #} + #open(FILE,$work_dir."/mapability.bedGraph"); + #while(){ + # chomp; + # my ($cur_chr,$start,$end,$score)=split(); + # next if($cur_chr ne $chr); + #for(my $i=$start; $i<$end; $i++){ + # if(defined($mappability{$i})){ $mappability{$i}=$score; } + # } + #} + #close FILE; + + my $folder; + if($is_male){ + $folder = "male"; + } else { + $folder = "female"; + } + #open(FILE,$work_dir."/dnase/".$folder."/".$dnase); + open(FILE,"gzip -dc ${work_dir}/dnase/${folder}/${dnase}".'AlnRep*.bam.unique.tagAlign.gz |'); + while(){ + chomp; + my ($cur_chr,$start,$end,undef,undef,$strand)=split("\t"); + next if(($cur_chr ne $chr) || !defined($datap{$start})); + if($strand eq '+'){ + $datap{$start}++; + } else { + $datan{$end-1}++; + } + } + close FILE; + + foreach my $start (sort keys %{$motifs{$chr}}){ + foreach my $strand (keys %{$motifs{$chr}{$start}}){ + my $score = $motifs{$chr}{$start}{$strand}; + + #filter out those that have no mappability in more than 20% of their extension... + #my $count = 0; + #for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + # if($mappability{$i} <= 0.5){ $count++; } + #} + #Remove if more than 20% of region is "non_mappable" (approach from Centipede paper) + #if(($count / (2*$hlen + $motif_size)) > 0.2){ + # warn "Motif at chr: $chr start: $start strand: $strand was ignored due to low mappability"; + # next; + #} + + print FO $chr."\t".$start."\t".($start+$motif_size)."\t".$matrix."\t".$score."\t".$strand; + for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + if($strand eq '+'){ + print FO "\t".$datap{$i}; + } else { + print FO "\t".$datan{$i}; + } + } + for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + if($strand eq '+'){ + print FO "\t".$datan{$i}; + } else { + print FO "\t".$datap{$i}; + } + } + print FO "\n"; + } + } + } + + close FO; + + return 1; +} + + +sub write_output { # Nothing to do here + my $self = shift @_; + + $self->dataflow_output_id($self->input_id, 2); + + return 1; + + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Motif.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Motif.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,163 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::Motif + +=head1 DESCRIPTION + +'Funcgen::Motif' + +TODO: Try to collate that to Funcgen + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::Motif; + +use warnings; +use strict; + +use base ('Bio::EnsEMBL::Hive::Process'); + +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + if(!$self->param('bin_dir')){ throw "No binary folder given"; } + $self->_bin_folder($self->param('bin_dir')); + + if(!$self->param('species')){ throw "No species given"; } + $self->_species($self->param('species')); + + if(!$self->param('bin_size')){ throw "No bin size given"; } + $self->_bin_size($self->param('bin_size')); + + if(!$self->param('window_size')){ throw "No window size given"; } + $self->_window_size($self->param('window_size')); + + #Get the core db... possibly make these non-mandatory + if(!$self->param('dnadb_host')){ throw "No core host given"; } + if(!$self->param('dnadb_port')){ throw "No core port given"; } + if(!$self->param('dnadb_user')){ throw "No core user given"; } + #if(!$self->param('dnadb_name')){ throw "No core dbname given"; } + my $dba; + eval{ + $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -host => $self->param('dnadb_host'), + -port => $self->param('dnadb_port'), + -user => $self->param('dnadb_user'), + -dbname => $self->param('dnadb_name'), + -species => $self->_species, + ); + }; + if($@) { throw "Error creating the Core DB Adaptor: $@"; } + if(!$dba){ throw "Could not connect to Core DB"; } + + #Get the efg db... always read only user + if(!$self->param('dbhost')){ throw "No host given"; } + if(!$self->param('dbport')){ throw "No port given"; } + if(!$self->param('dbuser')){ throw "No user given"; } + if(!$self->param('dbname')){ throw "No dbname given"; } + eval{ + $self->_efgdba(Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + -host => $self->param('dbhost'), + -port => $self->param('dbport'), + -user => $self->param('dbuser'), + -dbname => $self->param('dbname'), + -species => $self->_species, + -dnadb => $dba, + )); + }; + if($@) { throw "Error creating the EFG DB Adaptor: $@"; } + if(!$self->_efgdba()){ throw "Could not connect to EFG DB"; } + + my $dnadbc = $self->_efgdba()->dnadb->dbc; + warn $dnadbc->host." ".$dnadbc->port." ".$dnadbc->username." ".$dnadbc->dbname; + + + if(!$self->param('feature_set')){ throw "No feature set given"; } + my $fseta = $self->_efgdba()->get_FeatureSetAdaptor(); + my $fset = $fseta->fetch_by_name($self->param('feature_set')); + if(!$fset){ + throw $self->param('feature_set')." is not a valid Feature Set"; + } + $self->_feature_set($fset); + + if(!$self->param('output_dir')){ throw "No output dir given"; } + $self->_output_dir($self->param('output_dir')."/".$self->_feature_set->name); + + return 1; +} + +sub run { + my $self = shift @_; + + return 1; +} + + +sub write_output { # Nothing is written at this stage (for the moment) + + my $self = shift @_; + + return 1; + +} + +#Private Generic getter and setter +sub _getter_setter { + my ($self, $param_name, $param_value) = @_; + if(!$param_name){ return undef; } + if(!$param_value){ + $param_value = $self->param($param_name); + } else { + $self->param($param_name, $param_value); + } + return $param_value; +} + +# Private getter / setters : Maybe do some validation in some cases... + +#Private getter / setter to the bin folder +sub _bin_folder { + return $_[0]->_getter_setter('bin_folder',$_[1]); +} + +#Private getter / setter to the EFG DB Adaptor +sub _efgdba { + return $_[0]->_getter_setter('efgdb',$_[1]); +} + +#Private getter / setter to the bin size +sub _bin_size { + return $_[0]->_getter_setter('bin_size',$_[1]); +} + +#Private getter / setter to the window size +sub _window_size { + return $_[0]->_getter_setter('window_size',$_[1]); +} + +#Private getter / setter to the output dir +sub _output_dir { + return $_[0]->_getter_setter('output_dir',$_[1]); +} + +#Private getter / setter to the feature set +sub _feature_set { + return $_[0]->_getter_setter('feature_set',$_[1]); +} + +#Private getter / setter to the species +sub _species { + return $_[0]->_getter_setter('species',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunBWA.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunBWA.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,103 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::RunBWA + +=head1 DESCRIPTION + +'RunBWA' runs BWA with an input file + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::RunBWA; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Alignment'); + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +sub fetch_input { # fetch parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + $self->_input_dir($self->_output_dir()); + + my $input_file = $self->param('input_file') || throw "No input file given"; + $input_file = $self->_input_dir()."/".$input_file; + if(! -e $input_file){ throw " Couldn't find $input_file"; } + $self->_input_file($input_file); + + my $species = $self->_species(); + my $gender = $self->_cell_type()->gender(); + $gender = $gender ? $gender : "male"; + my $assembly = $self->_assembly(); + #TODO: Maybe check if the index file is really there? Eventually the bwa output will tell you though + my $bwa_index = $self->_work_dir()."/bwa_indexes/".$species."/".$species."_".$gender."_".$assembly."_unmasked.fasta"; + $self->_bwa_index($bwa_index); + + my $bwa_bin = $self->param('bwa_bin') || $self->_bin_dir().'/bwa'; + $self->_bwa_bin($bwa_bin); + + return 1; +} + +sub run { + my $self = shift @_; + + my $input_file = $self->_input_file(); + my $bwa_index = $self->_bwa_index(); + + my $bwa_bin = $self->_bwa_bin(); + + #TODO Pass the location of the binary to be sure we'll be running the right version? +# my $bwa_cmd = "$bwa_bin aln $bwa_index $input_file"; + #Allow this to work with paired reads?? Maybe not for the moment... + #in that case pass bwa algorithm as parameter... + #If using -q make sure we've got the correct Sanger quality scores... +# $bwa_cmd .= " | $bwa_bin samse $bwa_index - $input_file"; +# $bwa_cmd .= " | samtools view -uS - "; +# $bwa_cmd .= " | samtools sort - ${input_file}.sorted"; +# if(system($bwa_cmd) != 0){ throw "Problems running $bwa_cmd"; } + + #Silent errors seem to be passing... running one command at a time!? + my $bwa_cmd = "$bwa_bin aln $bwa_index $input_file > ${input_file}.aln"; + if(system($bwa_cmd) != 0){ throw "Problems running $bwa_cmd"; } + $bwa_cmd = "$bwa_bin samse $bwa_index ${input_file}.aln $input_file > ${input_file}.aln.sam"; + if(system($bwa_cmd) != 0){ throw "Problems running $bwa_cmd"; } + if(system("rm ${input_file}.aln") != 0){ warn "Couldn't remove tmp file. Remove it manually."; } + $bwa_cmd = $self->_bin_dir()."/samtools view -uS ${input_file}.aln.sam > ${input_file}.aln.bam"; + if(system($bwa_cmd) != 0){ throw "Problems running $bwa_cmd"; } + if(system("rm ${input_file}.aln.sam") != 0){ warn "Couldn't remove tmp file. Remove it manually."; } + $bwa_cmd = $self->_bin_dir()."/samtools sort ${input_file}.aln.bam ${input_file}.sorted"; + if(system($bwa_cmd) != 0){ throw "Problems running $bwa_cmd"; } + if(system("rm ${input_file}.aln.bam") != 0){ warn "Couldn't remove tmp file. Remove it manually."; } + + if(system("rm $input_file") != 0){ warn "Couldn't remove tmp file. Remove it manually."; } + + return 1; +} + + +sub write_output { # Nothing to write + my $self = shift @_; + + return 1; + +} + + +#Private getter / setter to the bwa indexes +sub _bwa_index { + return $_[0]->_getter_setter('bwa_index',$_[1]); +} + +#Private getter / setter to the bwa bin +sub _bwa_bin { + return $_[0]->_getter_setter('bwa_bin',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,286 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT + +=head1 DESCRIPTION + +'RunCCAT' Runs CCAT "broad peak" caller and stores peaks as an annotated feature set. +Assumes Files are organized and named with a specific convention +($repository_folder)/experiment_name/cell-type_feature-type/ +unless specific files are given + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT; + +use warnings; +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); + + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl'); + +sub fetch_input { # fetch and preprocess the input file plus do some more checking + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $efgdba = $self->_efgdba(); + my $fsa = $efgdba->get_FeatureSetAdaptor(); + if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){ + throw "Feature Set was not Created"; + } + + my $bin_dir = $self->_bin_dir(); + + my $analysis = $self->_analysis(); + #Use the definitions that are on the database + + my $cell_type = $self->_cell_type()->name; + my $feature_type = $self->_feature_type()->name; + my $experiment_name = $self->_experiment_name(); + + my $file_type = $self->_file_type(); + + if(($file_type ne 'sam') && ($file_type ne 'bed') ){ + throw "Only sam and bed currently supported for CCAT"; + } + + my $output_dir = $self->_output_dir(); + + my $size_file = $output_dir."/".$self->_set_name.".sizes"; + + #Get the size file... similar to the sam header... + open(FILE, $self->_sam_header); + #Consider having it pregenerated... + open(SIZES,">".$size_file); + while(){ + chomp; + /^(\S+)\s+(\d+)\s+/; + my $slice = $1; + my $slice_size = $2; + if(!$slice || !$slice_size){ throw " Could not process sam header line $_ "; } + + #Mouse Hack!! + next if(($self->_species eq 'mus_musculus') && !($slice =~ /chromosome/)); + print SIZES $slice."\t".$slice_size."\n"; + } + close SIZES; + close FILE; + + $self->_size_file($size_file); + + my $input_dir = $self->_input_dir(); + + my $file_name = $self->_input_file(); + my $input_file = $input_dir."/".$file_name; + if(-e $input_file){ + my $output_file = $output_dir."/".$file_name; + if(! $self->param('reenter')){ + #TODO Validate if existent file is ok. + $self->_preprocess_file($input_file, $output_file, $file_type) || + throw "Error processing data file $input_file"; + } else { + if(! -e $output_file){ warn "$output_file does not exist. May need to rerun from scratch."; } + } + $self->_input_file($output_file); + + if(!$self->_results_file($self->param('results_file'))){ + $self->_results_file($output_file.".".$analysis->logic_name); + } + } else { throw "No valid data file was given: ".$input_file; } + + #Always require a control file... + my $control_file = $output_dir."/".$self->_control_file(); + if(! -e $control_file){ throw "No valid control file was given: ".$control_file; } + $self->_control_file($control_file); + + #May need to convert it... + if($file_type eq 'sam'){ + + my $input_file_bed = $self->_input_file; + $input_file_bed =~ s/\.sam/\.bed/; + + #Mouse hack + if($self->_species eq 'mus_musculus'){ + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${input_file_bed}"); + } + } else { + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed >${input_file_bed}"); + } + } + $self->_input_file($input_file_bed); + + my $control_file_bed = $self->_control_file; + $control_file_bed =~ s/\.sam/\.bed/; + #Mouse Hack + if($self->_species eq 'mus_musculus'){ + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${control_file_bed}"); + } + } else { + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed >${control_file_bed}"); + } + } + $self->_control_file($control_file_bed); + + } + + return 1; +} + +sub run { # call SWEmbl + my $self = shift @_; + + if($self->param('reenter')){ return 1; } + + my $analysis = $self->_analysis; + #/bin/CCAT + # + my $bin_dir = $self->_bin_dir(); + my $command = $bin_dir."/".$analysis->program_file . + " ".$self->_input_file() . + " ".$self->_control_file() . + " ". $self->_size_file() . + " ".$bin_dir."/ccat_config/".$analysis->parameters . + " " . $self->_results_file; + warn "Running analysis:\t$command"; + run_system_cmd($command); + + return 1; +} + +sub write_output { # Store results + my $self = shift @_; + + $self->_parse_result_file(); + + return 1; +} + +sub _parse_result_file { + + my $self = shift @_; + + ### annotated features + my $fset = $self->_feature_set(); + + my $efgdba = $self->_efgdba(); + my $sa = $efgdba->get_SliceAdaptor(); + + #Cache slices and features... + my %slice; + my @af; + + my %cache_af; + + open(FILE,$self->_results_file().".significant.region"); + while(){ + chomp; + + #Content of CCAT output file + my ($seqid,$summit,$start,$end,$chipreads,$ctrlreads,$fold,$fdr)= split("\t"); + #Hardcode a minimum fdr... may pass as parameter + next if ($fdr>0.05); + my $score = $fold; + + #This seqid may vary depending on the input given to SWEmbl... + # handle it manually at least for the moment... namely the sam seqid... + #Make sure to test thoroughly to see if it works... + #e.g. chromosome:GRCh37:15:1:102531392:1 + if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; } + #In case UCSC input is used... + $seqid =~ s/^chr//i; + + if($self->param('slice') && ($seqid ne $self->param('slice'))){ + warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:". + $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + next; + } + + #May have some sort of repeats(?). Since it is ordered with significance, ignore remaining hits. + next if(defined($cache_af{$seqid."_".$start})); + $cache_af{$seqid."_".$start} = 1; + + #next if ($seqid =~ m/^M/); + + # filtering is done as a post-processing e.g. FilterBlacklist.pm + #$summit = int($summit);#Round up? + + unless (exists $slice{"$seqid"}) { + $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid); + } + + if( ($start < 1) || ($end > $slice{"$seqid"}->end)){ + warn "Feature being ignored due to coordinates out of slice: Region:". + $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + } + + #Gracefully handle errors... + my $af; + eval{ + $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new + ( + -slice => $slice{"$seqid"}, + -start => $start, + -end => $end, + -strand => 0, + -score => $score, + -summit => $summit, + -feature_set => $fset, + ); + }; + if($@) { warn($@); next; } + if(!$af) { warn("Could not create feature - Region:". + $seqid." Start:".$start." End:".$end." Score:".$score. + " Summit:".$summit); next; } + + push(@af, $af); + } + close FILE; + + # Batch store features... + if(scalar(@af>0)){ + $efgdba->get_AnnotatedFeatureAdaptor->store(@af); + } else { + warn "No significant features detected!"; + } + #Do this on a wrapup runnable...so it will only be visible after reads are loaded... + $fset->adaptor->set_imported_states_by_Set($fset); + + # Status should not be set at this stage + +} + + +#Private getter / setter to the feature set +sub _feature_set { + return $_[0]->_getter_setter('feature_set',$_[1]); +} + +#Private getter / setter to the results file +sub _results_file { + return $_[0]->_getter_setter('results_file',$_[1]); +} + +#Private getter / setter to the size file +sub _size_file { + return $_[0]->_getter_setter('size_file',$_[1]); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCentipede.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCentipede.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,70 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::RunCentipede + +=head1 DESCRIPTION + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::RunCentipede; + +use warnings; +use strict; +use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; +use base ('Bio::EnsEMBL::Hive::ProcessWithParams'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $work_dir = $self->param('work_dir'); + my $matrix = $self->param('matrix'); + my $dnase = $self->param('dnase'); + + my $file = $work_dir."/output/".$matrix."_".$dnase.".counts"; + throw "expected file $file does not exist" if(! -e $file); + + return 1; +} + +sub run { # Create Groups and Analysis and stuff... + my $self = shift @_; + + my $hlen = 100; + my $work_dir = $self->param('work_dir'); + my $matrix = $self->param('matrix'); + my $dnase = $self->param('dnase'); + my $is_male = $self->param('is_male'); + + my $file = $work_dir."/output/".$matrix."_".$dnase.".counts"; + open(FO,">".$file.".r"); + print FO "library(CENTIPEDE);\n"; + #print FO "pdf(file='".$file.".pdf')\n"; + print FO 'data<-read.table("'.$file.'");'."\n"; + print FO 'fit <- fitCentipede(Xlist = list(DNase=as.matrix(data[,7:dim(data)[2]])), Y=cbind(rep(1, dim(data)[1]), data[,5]));'."\n"; + print FO 'write.table(data[which(fit$PostPr>0.99),1:6],file="'.$file.'.sites",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE);'."\n"; + #print FO 'plotProfile(fit$LambdaParList[[1]])'."\n"; + #print FO "dev.off();\n"; + close FO; + + system("/software/bin/R-2.11.1 CMD BATCH --slave ${file}.r ${file}.Rout"); + + system("gzip $file"); + #system("rm -f $file"); + + return 1; +} + + +sub write_output { # Nothing to do here + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunSWEmbl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunSWEmbl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,300 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::RunSWEmbl + +=head1 DESCRIPTION + +'RunSWEmbl' Runs SWEmbl peak caller and stores peaks as an annotated feature set. +Assumes Files are organized and named with a specific convention +($repository_folder)/experiment_name/cell-type_feature-type/ +unless specific files are given + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::RunSWEmbl; + +use warnings; +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +#use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (strip_param_args generate_slices_from_names strip_param_flags); + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl'); + +#use Data::Dumper; + +sub fetch_input { # fetch and preprocess the input file plus do some more checking + my $self = shift @_; + + $self->SUPER::fetch_input(); + + # Consider using: Bio::EnsEMBL::Helper to debug and/or log to a file (given parameter) + + my $efgdba = $self->_efgdba(); + my $fsa = $efgdba->get_FeatureSetAdaptor(); + if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){ + throw "Feature Set was not Created"; + } + + #if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){ + # throw "Feature set was not created. Maybe you're skipping the Setup Step of the Pipeline"; + #} + + my $analysis = $self->_analysis(); + #Use the definitions that are on the database + $self->_command($analysis->program_file); + $self->_command_options($analysis->parameters); + + my $cell_type = $self->_cell_type()->name; + my $feature_type = $self->_feature_type()->name; + my $experiment_name = $self->_experiment_name(); + + my $file_type = $self->_file_type(); + + my $output_dir = $self->_output_dir(); + + my $input_dir = $self->_input_dir(); + + my $file_name = $self->_input_file(); + my $input_file = $input_dir."/".$file_name; + if(-e $input_file){ + my $output_file = $output_dir."/".$file_name; + #TODO Validate if existent file is ok. + #TODO Add flag to enable override existent file... + if(!$self->param('reenter')){ + $self->_preprocess_file($input_file, $output_file, $file_type) || + throw "Error processing data file $input_file"; + } else { + if(! -e $output_file){ warn "$output_file not found. May need to rerun from scratch"; } + } + $self->_input_file($output_file); + + if(!$self->_results_file($self->param('results_file'))){ + $self->_results_file($output_file.".".$analysis->logic_name); + } + } else { throw "No valid data file was given: ".$input_file; } + + if(!$self->_skip_control()){ + my $control_file = $output_dir."/".$self->_control_file(); + if(! -e $control_file){ throw "No valid control file was given: ".$control_file; } + $self->_control_file($control_file); + } + + return 1; +} + + + + +sub run { # call SWEmbl + my $self = shift @_; + + if($self->param('reenter')){ return 1; } + + #Don't leave this here... as it can go wrong!! + #if( -e $self->_results_file()){ + # return 1; + #} + + my %fswitches = ( + bed => '-B', + sam => '-S', + #maq => '-M', + #eland => '-E', + #bam => '-?', + ); + + #Ideally it will unify to sam or bam + my $format_switch = $fswitches{$self->_file_type()}; + + throw("Could not identify valid SWEmbl input format") if(! $format_switch); + + my $command = $self->_bin_dir()."/".$self->_command() . " $format_switch -z -i " . $self->_input_file() . ' ' . + $self->_command_options() . ' -o ' . $self->_results_file(); + + if($self->_control_file() && !$self->_skip_control()){ + $command = $command." -r ".$self->_control_file(); + } + + #warn "Running analysis:\t$command"; + if(system($command)) { throw("FAILED to run $command"); } + + return 1; +} + +sub write_output { # Store SWEmbl results + my $self = shift @_; + + #This is now handled in SetupPeaksPipeline... + #$self->_add_experiment_to_db(); + + #TODO Add an option to only process certain slices... + #$self->_parse_result_file(@slices); + + $self->_parse_result_file(); + + #idea of calling the load reads only after peak calling... + #my $input_file = $self->_input_file(); + #my $nbr_reads = $self->_get_number_of_reads($input_file, $self->_file_type()); + #if($nbr_reads < 1) { throw "$input_file is empty"; } + #Get relevant slices to avoid creating unnecessary jobs... + #my @slices = $self->_get_reads_slices($input_file, $self->_file_type()); + + #my @rep_out_ids; + #Create the necessary LoadReads jobs + #@slices = @{&generate_slices_from_names($slice_adaptor, @slices, undef, 1, 1, 1)};#toplevel, nonref, incdups + #foreach my $slice (@slices){ + # my $new_input_id = eval($self->input_id); + # $new_input_id->{"slice"} = $slice; + # $new_input_id->{"nbr_reads"} = $nbr_reads; + # push(@rep_out_ids,$new_input_id); + #} + + #WrapUp... + #my ($funnel_job_id) = @{ $self->dataflow_output_id($new_input_id, 3, { -semaphore_count => scalar(@rep_out_ids) }) }; + #All the fanned jobs... + #my $fan_job_ids = $self->dataflow_output_id(\@rep_out_ids, 2, { -semaphored_job_id => $funnel_job_id } ); + + return 1; +} + + + +sub _parse_result_file { + + my $self = shift @_; + + ### annotated features + my $fset = $self->_feature_set(); + + my $efgdba = $self->_efgdba(); + my $sa = $efgdba->get_SliceAdaptor(); + + #Cache slices and features... + my %slice; + my @af; + + + #Parse the output file + my $results_file = $self->_results_file(); + if(!open(FILE,$results_file)){ throw "Could not open results file : ".$results_file; } + while(){ + chomp; + #Ignore headers + if(/^#/ || /^Region/){ next; } + #Parse SWEmbl output here... not much sense in making a parser, + # unless SWEmbl ouput is used elsewhere (or becomes more complex to parse); + my ($seqid, $start, $end, undef, undef, undef, $score, undef, undef, $summit) = split(/\s+/); + + #This seqid may vary depending on the input given to SWEmbl... + # handle it manually at least for the moment... namely the sam seqid... + #Make sure to test thoroughly to see if it works... + #e.g. chromosome:GRCh37:15:1:102531392:1 + if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; } + #In case UCSC input is used... carefull names may not match with ensembl db! + $seqid =~ s/^chr//i; + + if($self->param('slice') && ($seqid ne $self->param('slice'))){ + warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:". + $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + next; + } + + #Just in case some of the non-aligned were missed in a filtering step... though this seriously affects peak calling + #if($seqid =~ /\*/){ next; } + + # skip mitochondria calls - remove this when we have pre-processing step to filter alignments using blacklist? + #next if ($seqid =~ m/^M/); + + # filtering is done as a post-processing e.g. FilterBlacklist.pm + $summit = int($summit);#Round up? + + unless (exists $slice{"$seqid"}) { + $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid); + } + + #Do not enter peaks that are invalid ENSEMBL slices... + #See if this slows down the process significantly... + #May do that as post-processing?? Cannot cache since each feature should be independent... + + #This step seems irrelevant as negative coordinates are still passing and errors are likely in further steps... + #my $feature_slice = $sa->fetch_by_region(undef, $seqid, $start, $end); + #if(!$slice{"$seqid"} || !$feature_slice){ + + #Sometimes there are naming issues with the slices... e.g. special contigs... which are not "valid" slices in ENSEMBL + #if(!$slice{"$seqid"} || !(($start =~ /^-?\d+$/) && ($end =~ /^\d+$/))){ + + # warn "Feature being ignored due to incorrect coordinates: Region:".$seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + # next; + #} + + if( ($start < 1) || ($end > $slice{"$seqid"}->end)){ + warn "Feature being ignored due to coordinates out of slice: Region:".$seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + } + + #Gracefully handle errors... + my $af; + eval{ + $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new + ( + -slice => $slice{"$seqid"}, + -start => $start, + -end => $end, + -strand => 0, + -score => $score, + -summit => $summit, + -feature_set => $fset, + ); + }; + if($@) { warn($@); next; } + if(!$af) { warn("Could not create feature - Region:".$seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit); next; } + + #if($self->param('slice') && ($af->seq_region_name ne $self->param('slice'))){ + # warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:". + # $af->seq_region_name." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + # next; + #} + + push(@af, $af); + } + close FILE; + + # Batch store features... + $efgdba->get_AnnotatedFeatureAdaptor->store(@af); + + #Do this on a wrapup runnable...so it will only be visible after reads are loaded... + $fset->adaptor->set_imported_states_by_Set($fset); + +} + + +#Private getter / setter to the feature set +sub _feature_set { + return $_[0]->_getter_setter('feature_set',$_[1]); +} + +#Private getter / setter to the command to be executed +sub _command { + return $_[0]->_getter_setter('command',$_[1]); +} + +#Private getter / setter to the command options +sub _command_options { + return $_[0]->_getter_setter('command_options',$_[1]); +} + +#Private getter / setter to the results file +sub _results_file { + return $_[0]->_getter_setter('results_file',$_[1]); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,219 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl; + +=head1 DESCRIPTION + +'SWEmbl' Is a base class for other classes dealing with SWEmbl +It contains virtually nothing so it may disappear and just pass to Funcgen + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl; + +use warnings; +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +sub fetch_input { + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $efgdba = $self->_efgdba; + + my $aa = $efgdba->get_AnalysisAdaptor; + my $analysis = $self->param('analysis') || throw "Need to specify the analysis"; + my $analysis_obj = $aa->fetch_by_logic_name($analysis); + #Here we do not create or modify the analysis on the fly!! - similarly as cell or feature type... + if(!$analysis_obj){ throw "Analysis $analysis is not in the database"; } + $self->_analysis($analysis_obj); + + $self->_feature_set_name($self->_set_name()."_".$analysis); + + my $cell_type = $self->_cell_type()->name; + my $feature_type = $self->_feature_type()->name; + my $file_type = $self->_file_type(); + my $experiment_name = $self->_experiment_name(); + + #TODO Change this to accept samse and sampe and others?? - add an extra parameter when needed? + #Also not necessarily .gz ... maybe force the use of the parameter 'data_file' in the input_id + #my $input_file = $self->param('data_file') || $cell_type."_".$feature_type.".samse.".$file_type.".gz"; + my $input_file = $self->param('data_file') || $self->_set_name().".samse.".$file_type.".gz"; + $self->_input_file($input_file); + + my $work_dir = $self->_work_dir."/alignments/".$self->_species()."/".$self->_assembly()."/".$experiment_name; + my $input_dir = $self->param('input_dir') || $work_dir; + $self->_input_dir($input_dir); + + my $skip_control = $self->_skip_control($self->param('skip_control')); + if(!$skip_control){ + #May also be passed as input_id, but then input_id may need to be TEXT + #Control must be in same dir as input file... maybe change this... + if(!$self->param('control_file')){ + #For the moment the control file has to be the same file type as the input file, but does not need to be the case... + #TODO Need to validate here if that's the case... + my $control_feature = $self->param('control_feature') || throw "Need to define 'control_feature'"; + my $control_file = $cell_type."_".$control_feature."_".$experiment_name.".samse.".$file_type.".gz"; + $self->_control_file($control_file); + } + } + + return 1; +} + + +# Private function only to be called by subclasses of this class +# prepares data to be used with SWEMBL... +# sorts the input, removes mythochondria and unaligned reads... +sub _preprocess_file{ + + #Consider using hash to process input + my ($self, $input, $output, $file_type) = (shift, shift, shift, shift); + + #For the moment we always overwrite any existent file... + #Maybe reuse previously cached files? How to check if they are corrupted? + #Maybe create a -reuse flag? + + #running piped system commands is a potential source of untraceable errors!! + #TODO try changing this... e.g. with Bio::DB::Sam ... ? + my $command; + if($file_type eq 'bed'){ + $command = "gzip -dc ${input}"; + + #Remove mitochondria before SWEMBL + warn "Excluding mytochondria reads before passing to SWEMBL. Make sure Bed file has MT for mytochondria"; + warn "Duplicates are not removed in Bed files while they are in sam files"; + #This probably won't work for many BED files... e.g chrM or sam-like "beds"... + $command = " | grep -v '^MT' "; + $command .= " | sort -k1,1 -k2,2n -k3,3n | gzip -c > $output"; + + } elsif($file_type eq 'sam'){ + + #Manual Alternative to samtools + #$command = "gzip -dc ".$self->param($parameter); + #$command .= ' | grep -vE \'^@\' | grep -vE "^[^[:space:]]+[[:blank:]]4[[:blank:]].*$" | sort -k3,3 -k4,4n | gzip -c > '.$work_file; + + #Sometimes the input sam file may have incorrect headings which will mess up subsequent steps... + #$command = "gzip -dc $input | grep -v '^\@' | "; # This is not likely to happen now + $command = "gzip -dc $input | "; + + #PREPROCESSING... remove mitochondria before SWEMBL : we need to cater for different approaches + #TODO do this in a better, more generic way + $command .= "grep -vE '^[^[:space:]]+[[:blank:]][^[:space:]]+[[:blank:]][^[:space:]]+\:[^[:space:]]+\:MT\:' | "; + $command .= "grep -v '^MT' | grep -v '^chrM' | "; + + #Remove unmapped reads... + $command .= $self->_bin_dir()."/samtools view -uSh -t ".$self->_sam_header()." -F 4 - | "; + + # This piped sort is not working!! (this problem has been reported in the samtools mailing list) + #TODO Check why this pipe in the sort is not working... + #$command .= "samtools sort -o - ".$self->param($parameter)."_tmp | " ; + #$command .= "samtools view -h - | gzip -c > $work_file"; + + # Create temp file for now and remove it when we figure out what's wrong with the piped sort! + $command .= $self->_bin_dir()."/samtools sort - ${input}_tmp ; " ; + + #Add a remove duplicates step (this is not supported with BED files for the moment) + $command .= $self->_bin_dir()."/samtools rmdup -s ${input}_tmp.bam - | "; + $command .= $self->_bin_dir()."/samtools view -h - | gzip -c > $output"; + + #Alternative with no rmdup... + #$command .= $self->_bin_dir()."/samtools view -h ${input}_tmp.bam | gzip -c > $output"; + + $command .= " ; rm -f ${input}_tmp.bam"; + + } + else{ + throw("$file_type file format not supported"); + } + + system($command) && throw("Failed processing $input with command $command"); + + return 1; +} + +# Private function only to be called by subclasses of this class +# gets the number of reads in a sam or bed file +#sub _get_number_of_reads { +# my ($self, $file, $file_type) = (shift, shift, shift); +# if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; } +# my $nbr_reads = 0; +# #If needed, add an option to check if is zipped or not... +# my $open_cmd = "gzip -dc $file |"; +# open(FILE,$open_cmd); +# while(){ +# if($file_type eq "sam"){ +# next if /^\@SQ/; +# }else { +# next if /track name=/o; +# } +# $nbr_reads++; +# } +# close FILE; +# return $nbr_reads; +#} + +# Private function only to be called by subclasses of this class +# gets the number of reads in a sam or bed file +#sub _get_slices { +# #NOT DONE!! +# my ($self, $file, $file_type) = (shift, shift, shift); +# if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; } +# my $nbr_reads = 0; +# #If needed, add an option to check if is zipped or not... +# my $open_cmd = "gzip -dc $file |"; +# open(FILE,$open_cmd); +# while(){ +# if($file_type eq "sam"){ +# next if /^@SQ/; +# }else { +# next if /track name=/o; +# } +# $nbr_reads++; +# } +# close FILE; +# return $nbr_reads; +#} + + +#Private getter / setter to the Feature Set Name +sub _feature_set_name { + return $_[0]->_getter_setter('feature_set_name',$_[1]); +} + +#Private getter / setter to the input folder +sub _input_dir { + return $_[0]->_getter_setter('input_dir',$_[1]); +} + +#Private getter / setter to the Input subset (usually a file name) +sub _input_file { + return $_[0]->_getter_setter('input_file',$_[1]); +} + +#Private getter / setter to the skip control option +sub _skip_control { + return $_[0]->_getter_setter('skip_control',$_[1]); +} + +#Private getter / setter to the control file +sub _control_file { + return $_[0]->_getter_setter('control_file',$_[1]); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupAlignmentPipeline.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupAlignmentPipeline.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,172 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::SetupAlignmentPipeline + +=head1 DESCRIPTION + +'SetupAlignmentPipeline' Does all the setup before the Alignment is run +Checks for existence of input files, etc... +This Runnable CAN be run multiple times in parallell! + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::SetupAlignmentPipeline; + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(is_gzipped); +use Data::Dumper; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Alignment'); + +#TODO... Maybe use and update the tracking database... +sub fetch_input { # fetch parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + #Magic default number... + my $fastq_chunk_size = 8000000; + if($self->param("fastq_chunk_size")){ $fastq_chunk_size = $self->param("fastq_chunk_size")}; + $self->_fastq_chunk_size($fastq_chunk_size); + + #Sets up the output dir for this experiment_name + my $output_dir = $self->_output_dir(); + if(! -d $output_dir){ + system("mkdir -p $output_dir") && throw("Couldn't create output directory $output_dir"); + } + + my $input_dir = $self->_input_dir(); + if(! -d $input_dir ){ throw " Couldn't find input directory $input_dir"; } + + opendir(DIR,$input_dir); + my @dirs = grep(/^\d/,readdir(DIR)); + closedir(DIR); + + if(scalar(@dirs)==0){ throw "No replicates found in $input_dir"; } + + my @input_files; + my @replicates; + foreach my $dir (@dirs){ + #TODO: maybe use some other code for replicates? (e.g. Rep\d ) + if($dir =~ /^(\d)$/){ + my $replicate = $1; + + opendir(DIR,$input_dir."/".$replicate); + my @files = grep(/.fastq/,readdir(DIR)); + closedir(DIR); + + if(scalar(@files)==0){ throw "No files for replicate $replicate"; } + + my $file_count = 0; + for my $file (@files){ + push @input_files, { + path => $input_dir."/".$replicate."/".$file, + replicate => $replicate, + file_index => $file_count++, + }; + } + + push @replicates, $replicate; + } else { warn "Invalid replicate $dir ignored"; } + } + + $self->_input_files(\@input_files); + $self->_replicates(\@replicates); + + return 1; +} + +sub run { + my $self = shift @_; + + my $fastq_chunk_size = $self->_fastq_chunk_size(); + + my @output_ids; + my $set_name = $self->_set_name(); + + foreach my $file_info (@{$self->_input_files()}){ + my $file = $file_info->{'path'}; + my $replicate = $file_info->{'replicate'}; + my $file_index = $file_info->{'file_index'}; + + my $cmd; + + if($file =~ /^(.*.fastq).gz$/){ + $cmd = "gunzip -c"; + } + elsif($file =~ /^(.*.fastq).bz2$/){ + $cmd = "bunzip2 -c" + } + else { + $cmd = "cat"; + } + + $cmd .= ' '.$file.' | split -d -a 4 -l '.$fastq_chunk_size.' - '. $self->_output_dir().'/'.$set_name."_".$replicate.'_'.$file_index.'_'; + + if(system($cmd) != 0){ throw "Problems running $cmd"; } + } + + + return 1; +} + + +sub write_output { # Create the relevant job + my $self = shift @_; + + my $set_name = $self->_set_name; + + my (@align_output_ids, @merge_output_ids); + + opendir(DIR,$self->_output_dir()); + for my $split_file ( grep(/^${set_name}_\d+_\d+_\d+$/,readdir(DIR)) ){ + my $output = eval($self->input_id); + $output->{input_file} = $split_file; + push @align_output_ids, $output; + } + closedir(DIR); + + # merge data for each replicate + + for my $rep (@{$self->_replicates}){ + my $output = eval($self->input_id); + $output->{replicate} = $rep; + push @merge_output_ids, $output; + } + + + # files to align + $self->dataflow_output_id(\@align_output_ids, 1); + + # merge data acros replicates + $self->dataflow_output_id($self->input_id, 2);#input_id + return 1; + +} + +#Private getter / setter to the fastq chunk size +sub _fastq_chunk_size { + return $_[0]->_getter_setter('fastq_chunk_size',$_[1]); +} + +#Private getter / setter to the output_ids list +sub _output_ids { + return $_[0]->_getter_setter('output_ids',$_[1]); +} + +#Private getter / setter to the output_ids list +sub _replicates { + return $_[0]->_getter_setter('replicates',$_[1]); +} + +sub _input_files { + return $_[0]->_getter_setter('input_files',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupAnnotationPipeline.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupAnnotationPipeline.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,75 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::SetupAnnotationPipeline + +=head1 DESCRIPTION + +'SetupAnnotationPipeline' Checks cell types and creates annotation processes for each +This Runnable CANNOT be run multiple times in parallell! + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::SetupAnnotationPipeline; + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Annotation'); + +sub fetch_input { # fetch parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + #Sets up the output dir + my $output_dir = $self->_output_dir(); + if(! -d $output_dir){ + system("mkdir -p $output_dir") && throw("Couldn't create output directory $output_dir"); + } + + return 1; +} + +sub run { # Check parameters and do appropriate database/file operations... + my $self = shift @_; + + my $efgdba = $self->_efgdba(); + #Check how many different cell types exist with regulatory features for current species + #Creates the appropriate jobs + my @reg_sets = @{$efgdba->get_FeatureSetAdaptor->fetch_all_by_type('regulatory')}; + my @cell_types; + foreach my $set (@reg_sets){ + if($set->cell_type->name ne 'MultiCell'){ + push @cell_types, $set->cell_type->name; + } + } + $self->_cell_types_to_run(\@cell_types); + + return 1; +} + + +sub write_output { # Create the relevant job + my $self = shift @_; + + foreach my $cell_type (@{$self->_cell_types_to_run()}){ + my $new_input_id = eval($self->input_id); + $new_input_id->{"cell_type"} = $cell_type; + $self->dataflow_output_id($new_input_id, 2, { } ); + } + + return 1; + +} + +#Private getter / setter to the cell_types_to_run list +sub _cell_types_to_run { + return $_[0]->_getter_setter('cell_types_to_run',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupMotifInference.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupMotifInference.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,81 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::SetupMotifInference + +=head1 DESCRIPTION + +'SetupMotifInference' + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::SetupMotifInference; + +use warnings; +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; +use POSIX qw(floor); + + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif'); + + + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + if(! -d $self->_output_dir){ + system('mkdir -p '. $self->_output_dir) && throw "Error creating output dir ". $self->_output_dir; + } + + return 1; +} + +sub run { # Create Subtasks of binsize peaks each, ignoring the last set of peaks ( < binsize peaks ) + my $self = shift @_; + + my $afa = $self->_efgdba()->get_AnnotatedFeatureAdaptor(); + my @features = @{$afa->fetch_all_by_FeatureSets( [ $self->_feature_set ] )}; + my $bins = POSIX::floor(scalar(@features)/$self->param('bin_size')); + if($bins < 1){ + warn "Insuficient peaks. Please select a smaller bin size."; + } + warn "Number of bins is $bins"; + + #Create jobs + my @bin_input_ids; + for (my $i=1;$i<=$bins;$i++){ + #Need to add the specific file to the input_id... + my $new_input_id = eval($self->input_id); + $new_input_id->{"bin"} = $i; + push(@bin_input_ids, $new_input_id); + } + $self->_output_ids(\@bin_input_ids); + + return 1; +} + + +sub write_output { # Nothing is written at this stage (for the moment) + my $self = shift @_; + + if($self->_output_ids && scalar($self->_output_ids)>0){ + my ($converge_job_id) = @{ $self->dataflow_output_id($self->input_id, 3, { -semaphore_count => scalar(@{$self->_output_ids}) }) }; + $self->dataflow_output_id($self->_output_ids, 2, { -semaphored_job_id => $converge_job_id }); + } + return 1; + +} + +#Private getter / setter to the output ids +sub _output_ids { + return $_[0]->_getter_setter('output_ids',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupPeaksPipeline.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupPeaksPipeline.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,146 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Hive::RunnableDB::Funcgen::SetupPeaksPipeline + +=head1 DESCRIPTION + +'SetupPeakPipeline' Does all the setup before the Peaks Pipeline +Checks for existence of Cell and Feature Type +(only flags if they do not exist, does not try to create them!!) +Creates Experiment and Set Entries, Checks Analysis, Group etc... +This Runnable CANNOT be run multiple times in parallell! + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::SetupPeaksPipeline; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl'); + +use warnings; +use strict; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +#use Data::Dumper; + +sub fetch_input { # fetch parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $efgdba = $self->_efgdba(); + #my $dnadba = $self->param('dnadb'); + + #Need to change this... + + #my $group = $self->_group_name(); + my $group = 'efg'; + $efgdba->fetch_group_details($group) || throw "Group $group does not exist in the database. Please create it"; + + #Theoretically we should be getting this at the registry... + #my $csa = $dnadba->get_adaptor('coordsystem'); + + my $assembly = $self->_assembly(); + ##Check if the assembly is allowed in the current db we're working on... + ##In certain DBs this doesn't work: e.g. dev_*_funcgen + #if(!grep(/$assembly/, map($_->version, @{ $csa->fetch_all()}))){ throw $assembly." is not a valid assembly"; } + + #Checks if the input dir for this experiment_name exists... all input files (including controls must be in this folder) + my $input_dir = $self->_input_dir(); + if(! -d $input_dir){ throw("Didn't find input directory $input_dir"); } + + #Sets up the output dir for this experiment_name + my $output_dir = $self->_output_dir(); + if(! -d $output_dir){ + system("mkdir -p $output_dir") && throw("Couldn't create output directory $output_dir"); + } + + ##Better pass this as a parameter... as sometimes the db does not return the species name... + #my $species = $dnadba->species; + my $species = $self->_species(); + + my $file_type = $self->_file_type(); + if(($file_type ne 'sam') && ($file_type ne 'bed')){ throw "File type $file_type currently not supported"; } + + #infer sam_header from species and assembly... + if($file_type eq 'sam'){ + #Change the directory structure so it will agree with the rest, without the need to do uc() + my $sam_header = $self->_sam_header(); + if(! -e $sam_header){ throw " File type is sam but could not find sam header $sam_header"; } + } + + #Check if input file exists... do not do preprocessing here... as this can be done in parallel... + my $input_file = $self->_input_file(); + if(! -e $input_dir."/".$input_file){ throw " Couldn't find input file $input_file in $input_dir"; } + + #Check if control file exists + if(!$self->_skip_control()){ + my $control_file = $self->_control_file(); + if(! -e $input_dir."/".$control_file ){ + #Force throw or just warn? + throw "Couldn't find control file ${input_dir}/${control_file}"; + #$self->_skip_control(1); + } + } else { + throw "CCAT requires a control. Cannot skip it" if($self->_analysis()->logic_name =~ /ccat/i); + } + + return 1; +} + +sub run { # Check parameters and do appropriate database/file operations... + my $self = shift @_; + + #Preprocess control file if needed. this cannot be done in parallel as several sets may require same control file + if(!$self->_skip_control()){ + #Do not mix this "input" file with the true data input file... + my $input_file = $self->_input_dir().'/'.$self->_control_file(); + my $output_file = $self->_output_dir().'/'.$self->_control_file(); + #Only do it if it hasn't already been done.. + if(! -e $output_file){ + $self->_preprocess_file($input_file, $output_file, $self->_file_type()) || throw "Error processing file $input_file"; + } + } + + # Check experiment, data set, feature set and create as appropriate... + $self->_check_Experiment($self->_analysis(), $self->_input_file(), $self->_feature_set_name()); + + if(!$self->_skip_control()){ + #Add control file as a subset, if needed... + my $input_set = $self->_efgdba()->get_InputSetAdaptor()->fetch_by_name($self->_set_name()); + if(!$input_set) { throw "Input set was not created"; } + my $isubset_name = $self->_control_file(); + if(!$input_set->get_subset_by_name($isubset_name)){ + #Change InputSetAdaptor so one subset could be stored each time? + $input_set->add_new_subset($isubset_name); + #this expects a behavior where subsets already stored will just be ignored and no error is thrown + $input_set->adaptor->store_InputSubsets($input_set->get_InputSubsets); + } + } + + return 1; +} + + +sub write_output { # Create the relevant job + my $self = shift @_; + + #These numbers need to be parameters... + ## If feature type is H3K36me3 or H3K27me3 use broad peak caller... Maybe all histone data? + #if($self->_feature_type()->class eq 'Histone'){ + #if(($self->_feature_type()->name eq 'H3K36me3') || ($self->_feature_type()->name eq 'H3K27me3')){ + # $self->dataflow_output_id( $self->input_id, $self->_broad_peak_id); + + if($self->_analysis()->logic_name =~ /ccat/i){ + $self->dataflow_output_id( $self->input_id, 4); + } else { + $self->dataflow_output_id( $self->input_id, 3); + } + + return 1; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/WrapUpAlignment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/WrapUpAlignment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,133 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::WrapUpAlignment; + +=head1 DESCRIPTION + +'WrapUpAlignment' Merges all results from alignment jobs into a single file + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::WrapUpAlignment; + +use warnings; +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Alignment'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; +use File::Copy; + +sub fetch_input { + my $self = shift @_; + + $self->SUPER::fetch_input(); + + #my $replicate = $self->param('replicate') || throw "No replicate given"; + #$self->_replicate($replicate); + + #my $nbr_subfiles = $self->param('nbr_subfiles') || throw "Number of subfiles not given"; + #if($nbr_subfiles<1){ throw "We need at least one subfile. Empty or invalid set given: $nbr_subfiles"; } + #$self->_nbr_subfiles($nbr_subfiles); + + my $species=$self->_species(); + my $gender=$self->_cell_type->gender(); + $gender = $gender ? $gender : "male"; + my $assembly=$self->_assembly(); + my $sam_header = $self->_work_dir()."/sam_header/".$species."/".$species."_".$gender."_".$assembly."_unmasked.header.sam"; + $self->_sam_header($sam_header); + + + my $repository = $self->_repository(); + if(! -d $repository){ + system("mkdir -p $repository") && throw("Couldn't create directory $repository"); + } + + return 1; +} + +sub run { + my $self = shift @_; + + # my $replicate = $self->_replicate(); + my $sam_header = $self->_sam_header(); + + my $set_name = $self->_set_name(); + my $file_prefix = $self->_output_dir()."/".$set_name; + my $input_file_pattern = "${file_prefix}_[0-9]*_[0-9]*_[0-9]*.sorted.bam"; + my $merge_cmd="samtools merge -f -h $sam_header ${file_prefix}.sorted.bam $input_file_pattern"; + + #Maybe remove duplicates as default behavior ?. + #my $merge_cmd="samtools merge -h $sam_header - ${file_prefix}.[0-9]*.sorted.bam | "; + #$merge_cmd.=" samtools rmdup -s - ${file_prefix}.sorted.bam"; + + if(system($merge_cmd) != 0){ throw "Error merging file: $merge_cmd"; } + + #keep end result as bam... it's more compact: only when passing to the peak caller we pass to sam or other format... + #merge_cmd.=" samtools view -h - | gzip -c > ${file_prefix}${align_type}.sam.gz" + + my $alignment_log = $file_prefix.".alignment.log"; + + my $log_cmd="echo \"Alignment QC - total reads as input: \" >> ${alignment_log}"; + $log_cmd.=";samtools flagstat ${file_prefix}.sorted.bam | head -n 1 >> ${alignment_log}"; + $log_cmd.=";echo \"Alignment QC - mapped reads: \" >> ${alignment_log} "; + $log_cmd.=";samtools view -u -F 4 ${file_prefix}.sorted.bam | samtools flagstat - | head -n 1 >> ${alignment_log}"; + $log_cmd.="; echo \"Alignment QC - reliably aligned reads (mapping quality >= 1): \" >> ${alignment_log}"; + $log_cmd.=";samtools view -u -F 4 -q 1 ${file_prefix}.sorted.bam | samtools flagstat - | head -n 1 >> ${alignment_log}"; + #Maybe do some percentages? + + if(system($log_cmd) != 0){ warn "Error making the alignment statistics"; } + print STDERR "logged\n"; + my $repository = $self->_repository(); + + move("${file_prefix}.sorted.bam","${repository}/${set_name}.samse.bam"); + + my $convert_cmd = "samtools view -h ${repository}/${set_name}.samse.bam | gzip -c - > ${repository}/${set_name}.samse.sam.gz"; + + + #print STDERR "using convert cmd: ".$convert_cmd; + + if(system($convert_cmd) != 0){ warn "Error converting to zipped sam"; } + #print STDERR "converted\n"; + my $rm_cmd="rm -f $input_file_pattern"; + if(system($rm_cmd) != 0){ warn "Error removing temp files. Remove them manually"; } + #$rm_cmd="rm -f ${file_prefix}.sorted.bam"; + #if(system($rm_cmd) != 0){ warn "Error removing final bam files. Remove it manually"; } + + return 1; +} + + +sub write_output { + my $self = shift @_; + + return 1; + +} + +#Private getter / setter to the sam header +sub _sam_header { + return $_[0]->_getter_setter('sam_header',$_[1]); +} + +#Private getter / setter to the replicate number +sub _replicate { + return $_[0]->_getter_setter('replicate',$_[1]); +} + +#Private getter / setter to the replicate number +sub _nbr_subfiles { + return $_[0]->_getter_setter('nbr_subfiles',$_[1]); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/SegmentationFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/SegmentationFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,168 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::SegmentationFeature +# +# You may distribute this module under the same terms as Perl itself + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::SegmentationFeature - Genomic segmentation feature + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::SegmentationFeature; + +my $feature = Bio::EnsEMBL::Funcgen::SegmentationFeature->new( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => 0, + -FEATURE_SET => $fset, + -FEATURE_TYPE => $ftype, +); + + + +=head1 DESCRIPTION + +An SegmentationFeature object represents the genomic placement of a prediction +generated by the eFG analysis pipeline. This normally represents the +output of a peak calling analysis. It can have a score and/or a summit, the +meaning of which depend on the specific Analysis used to infer the feature. +For example, in the case of a feature derived from a peak call over a ChIP-seq +experiment, the score is the peak caller score, and summit is the point in the +feature where more reads align with the genome. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::DBSQL::SegmentationFeatureAdaptor + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::SegmentationFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Funcgen::SetFeature; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::SetFeature); + +=head2 new + + Arg [-SLICE] : Bio::EnsEMBL::Slice - The slice on which this feature is. + Arg [-START] : int - The start coordinate of this feature relative to the start of + the slice it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int -The end coordinate of this feature relative to the start of + the slice it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-STRAND] : int - The orientation of this feature. Valid values are 1, -1 and 0. + Arg [-FEATURE_SET] : Bio::EnsEMBL::Funcgen::FeatureSet + Arg [-FEATURE_TYPE] : Bio::Ensembl::Funcgen::FeatureType + Arg [-DISPLAY_LABEL]: optional string - Display label for this feature + Arg [-SCORE] : optional int - Score assigned by analysis pipeline + Arg [-dbID] : optional int - Internal database ID. + Arg [-ADAPTOR] : optional Bio::EnsEMBL::DBSQL::BaseAdaptor - Database adaptor. + + Example : my $feature = Bio::EnsEMBL::Funcgen::SegmentationFeature->new + ( + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -FEATURE_SET => $fset, + -FEATURE_TYPE => $ftype, + -DISPLAY_LABEL => $text, + -SCORE => $score, + ); + + + Description: Constructor for SegmentationFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::SegmentationFeature + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + #Hard code strand => 0 here? And remove from input params? + my ($score, $ftype) = rearrange(['SCORE', 'FEATURE_TYPE'], @_); + + #test ftype as SetFeature method defaults to feature_set->feature_type + throw('You must pass a valid FeatureType') if ! defined $ftype; + + $self->{score} = $score if $score; + $self->{feature_type} = $ftype; + + return $self; +} + + +=head2 score + + Arg [1] : (optional) int - score + Example : my $score = $feature->score(); + Description: Getter and setter for the score attribute for this feature. + Returntype : int + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub score { + my $self = shift; + return $self->{'score'}; +} + + +=head2 display_label + + Example : my $label = $feature->display_label(); + Description: Getter for the display label of this feature. + Returntype : String + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub display_label { + my $self = shift; + + if(! $self->{'display_label'} && $self->adaptor){ + $self->{'display_label'} = $self->feature_type->name()." -"; + $self->{'display_label'} .= " ".$self->cell_type->name(); + } + + return $self->{'display_label'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Set.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Set.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,348 @@ + +# +# Ensembl module for Bio::EnsEMBL::Funcgen::Set +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Set - A module to represent a base Set object. + + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::Set; + +@INC = qw (Bio::EnsEMBL::Funcgen::Set) + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + +} + +=head1 DESCRIPTION + +A base Set object which provides access common methods available across all Funcgen Set classes. + + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::Set; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); +use Bio::EnsEMBL::Funcgen::Storable; + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Example : my $self = $class->SUPER::new(@_); + Description: Constructor for Set objects. + Returntype : Bio::EnsEMBL::Funcgen::Set + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + #TYPE was never parsed here? + #Only in inheritants that used it i.e. FeatureSet + + my ($name, $anal, $ftype, $ctype, $set_type, $fclass, $type) + = rearrange(['NAME', 'ANALYSIS', 'FEATURE_TYPE', 'CELL_TYPE', 'SET_TYPE', 'FEATURE_CLASS', 'TYPE'], @_); + + throw('Need to specify a name') if ! defined $name; + + $self->set_type($set_type); + $self->feature_class($fclass); + $self->feature_class($type) if $type;#Remove this when fully implemented + $self->{'name'} = $name; + $self->cell_type($ctype) if $ctype; + $self->feature_type($ftype) if $ftype; + + if(defined $anal){ + $self->analysis($anal); + }elsif($self->set_type ne 'input'){ + #Could move this to child Sets and just set analysis here + #As with ftype + throw('Must pass a valid -analysis parameter for a '.ref($self)); + } + + return $self; +} + + + + + + +=head2 name + + Example : my $set->name('SET1'); + Description: Getter/Setter for the name of this Set. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub name { + my $self = shift; + + return $self->{'name'}; +} + +=head2 cell_type + + Example : my $dset_ctype_name = $dset->cell_type->name(); + Description: Getter for the cell_type for this DataSet. + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : throws if arg not valid + Caller : General + Status : At Risk + +=cut + +sub cell_type { + my ($self, $ctype) = @_; + + if(defined $ctype){ + + if(! (ref($ctype) eq 'Bio::EnsEMBL::Funcgen::CellType' + && $ctype->dbID())){ + throw('Must pass a valid stored Bio::EnsEMBL::Funcgen::CellType'); + } + $self->{'cell_type'} = $ctype; + } + + return $self->{'cell_type'}; +} + +=head2 feature_type + + Example : my $dset_ftype_name = $dset->feature_type->name(); + Description: Getter for the feature_type for this DataSet. + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : Throws if arg not valid + Caller : General + Status : At Risk + +=cut + +sub feature_type { + my ($self, $ftype) = @_; + + if(defined $ftype){ + + if(! (ref($ftype) eq 'Bio::EnsEMBL::Funcgen::FeatureType' + && $ftype->dbID())){ + throw('Must pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType'); + } + $self->{'feature_type'} = $ftype; + } + + + return $self->{'feature_type'}; +} + + +=head2 feature_class + + Arg[0] : string - feature class e.g. result, annotated, regulatory or external. + Example : my $fclass = $dset->feature_class; + Description: Getter for the feature_type for this Set. + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +#Supercededs type method in FeatureSet + +sub feature_class { + my ($self, $fclass) = @_; + + if(defined $fclass){ + + #Leave this an implement in inheritants + #if(! grep /^${fclass}$/, ('annotated', 'result', 'external', 'regulatory')){ + # throw("You have no supplied a valid feature class:\t$fclass"); + #} + + $self->{'feature_class'} = $fclass; + } + + return $self->{'feature_class'}; +} + + + +=head2 analysis + + Example : my $anal_name = $set->analysis->logic_name(); + Description: Getter for the analysis attribute for this Set. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub analysis { + my $self = shift; + + if(@_){ + throw('Must pass a valid stored Analysis') if (! (ref($_[0]) eq 'Bio::EnsEMBL::Analysis' + && $_[0]->dbID())); + $self->{'analysis'} = shift; + } + + + return $self->{'analysis'}; +} + +=head2 display_label + + Example : print $set->display_label(); + Description: Getter for the display_label attribute for this Set. + This is more appropriate for teh predicted_features of the set. + Use the individual display_labels for each raw result set. + Returntype : str + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub display_label { + my $self = shift; + + + #Add display label in table? + #Can we aborc ResultSet method into this? + + if(! $self->{'display_label'}){ + + #if($self->product_FeatureSet->feature_type->class() eq 'Regulatory Feature'){ + # $self->{'display_label'} = 'Regulatory Features'; + #} + #else{ + + #This only works for annotated/regulatory_feature sets and result sets + #Move to other Set classes? + + $self->{'display_label'} = $self->feature_type->name()." -"; + $self->{'display_label'} .= " ".($self->cell_type->display_label() || + $self->cell_type->description() || + $self->cell_type()->name()); + + + if($self->set_type eq 'result'){ + $self->{'display_label'} .= " signal"; + } + else{ + $self->{'display_label'} .= " enriched sites"; + } + } + + return $self->{'display_label'}; +} + + + +=head2 set_type + + Example : my $set_type = $set->set_type; + Description: Getter for the Set type for this Set. + Returntype : string e.g. result, feature, data, input + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub set_type { + my ($self, $set_type) = @_; + + if(defined $set_type){ + $self->{'_set_type'} = $set_type; + } + elsif(! defined $self->{'_set_type'}){ + my @namespace = split/\:\:/, ref($self); + ($self->{'_set_type'} = lc($namespace[$#namespace])) =~ s/set//; + + } + + return $self->{'_set_type'}; +} + +=head2 type + + Example : my $type = $set->type; + Description: Getter for the type for this Set. + e.g. annotated, external, regulatory for FeatureSets + or + array, sequencing for InputSets + Currently not applicable to DataSets or ResultSets + Exceptions : None + Returntype : string + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub type { + my $self = shift; + + deprecate("Please use feature_class instead"); + + return $self->feature_class(@_); + + #$self->{'feature_class'} = shift if @_; + + #return $self->{'feature_class'}; +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/SetFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/SetFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,280 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::SetFeature +# + + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::SetFeature - Base class for features of a Set. + +=head1 SYNOPSIS + + # Would normally be created from an in inheriting class e.g. AnnotatedFeature.pm + + use base qw(Bio::Ensembl::Funcgen::SetFeature); + + sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + # More construction here + } + + # Alternative direct contruction + + my $feat = Bio::EnsEMBL::Funcgen::SetFeature-> + ( + -start => 100, + -end => 220, + -strand => -1, + -slice => $slice, + -set => $fset, + -feature_type => $ftype, + -display_label => $label, + ); + + # Accessing some attributes + + + my $start = $feat->start; + my $end = $feat->end; + my $strand = $feat->strand; + my $fset = $feat->set; + my $cell_type = $feat->cell_type; + + # Printing some information + + print $feature->display_label.' has the FeatureType '.$feat->feature_type->name."\n"; + +=head1 DESCRIPTION + +This is a base class for features which are contained within a Funcgen FeatureSet or ResultSet. +It provides generic methods for attributes which are common across all inheriting classes. + +=cut + + +package Bio::EnsEMBL::Funcgen::SetFeature; + +use strict; +use warnings; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Funcgen::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); +#can't use base with dual inheritance + +=head2 new + + + Arg [-SET] : Bio::EnsEMBL::Funcgen::ResultSet or FeatureSet. + Arg [-DISPLAY_LABEL]: (optional) String - Display label for this feature + Arg [-FEATURE_TYPE] : (optional) Bio::EnsEMBL::Funcgen::FeatureType. + Defaults to Feature/ResultSet FeatureType. + + #Bio::EnsEMBL::Feature arguments + Arg [-SLICE] : Bio::EnsEMBL::Slice - The slice on which this feature is. + Arg [-STRAND] : (optional) Int - The orientation of this feature relative to the + strand it is on. Valid values are 1, -1 and 0. + Arg [-START] : Int - The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : Int -The end coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-ANALYSIS] : (optional) Bio::EnsEMBL::Analysis. Defaults to Feature/ResultSet Analysis. + Arg [-dbID] : (optional) Int - Internal database ID. + Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor + + + + Example : my $feature = Bio::EnsEMBL::Funcgen::SetFeature->new + ( + -SLICE => $chr_1_slice, + -START => 1000000, + -END => 1000024, + -STRAND => -1, + -DISPLAY_LABEL => $text, + -SET => $fset, + ); + + Description: Constructor for SetFeature objects. Should never be called directly, only by its children. + Returntype : Bio::EnsEMBL::Funcgen::SetFeature + Exceptions : Throws if no valid ResultSet or FeatureSet passed + Throws if FeatureType is passed but not valid + Caller : General + Status : At Risk - FEATURE_SET arg to be removed, superceded by SET in v67 + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($display_label, $fset, $ftype, $set) + = rearrange(['DISPLAY_LABEL', 'FEATURE_SET', 'FEATURE_TYPE', 'SET'], @_); + + $set ||= $fset; + + if( ( ref($set) ne 'Bio::EnsEMBL::Funcgen::FeatureSet') && + ( ref($set) ne 'Bio::EnsEMBL::Funcgen::ResultSet') ){ + throw("Must pass valid Bio::EnsEMBL::Funcgen::FeatureSet or ResultSet object"); + } + + #Grab FeatureSet first so we can pass analysis to base Feature class + #Funcgen analysis is currently always at the Set level + #if this ever changes the SetFeature->analysis method will also need changing + my $self = $class->SUPER::new(@_, -analysis => $set->analysis); + + if($ftype){ + + if (ref($ftype) ne 'Bio::EnsEMBL::Funcgen::FeatureType') { + throw('feature_type param must be a valid Bio::EnsEMBL::Funcgen::FeatureType'); + } + + $self->{feature_type} = $ftype; + } + + #Setting attrs directly removes the need for setter code in methods + $self->{set} = $set; + $self->{display_label} = $display_label if defined $display_label; + + return $self; +} + + + +=head2 feature_set + + Example : my $set = $efeature->feature_set(); + Description: WARNING: Can now also return ResultSet aswell as FeatureSet attribute for this feature. + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet or ResultSet + Exceptions : None + Caller : General + Status : At Risk - marked as to be removed in v67 + +=cut + +sub feature_set { + #??? deprecate + #check webcode? + + return $_[0]->{set}; +} + + +=head2 set + + Example : my $set = $set_feature->set(); + Description: Getter for the set attribute for this feature. + Returntype : Bio::EnsEMBL::Funcgen::FeatureSet or ResultSet + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub set { + return $_[0]->{set}; +} + +=head2 cell_type + + Example : my $cell_name = $set_feature->cell_type->name; + Description: Getter for the CellType attribute for the Set of this Feature. + May not always be for some Set types e.g. ExternalFeatures. + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : None + Caller : General + Status : stable + +=cut + +sub cell_type{ + return $_[0]->set->cell_type; +} + + +=head2 feature_type + + Example : my $ft_name = $set_feature->feature_type->name; + Description: Getter for the FeatureType attribute for this feature. + If not explicitly set, defaults to the Set FeatureType + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : None + Caller : General + Status : stable + +=cut + +sub feature_type{ + my $self = shift; + + if(! defined $self->{feature_type}){ + $self->{feature_type} = $self->set->feature_type; + } + + return $self->{feature_type}; +} + + +=head2 analysis + + Example : my $analysis = $setfeature->analysis; + Description: Getter for the Analysis attribute for this feature. + Re-implementation of Bio::EnsEMBL::Feature->analysis. + Returntype : Bio::EnsEMBL::Analysis + Exceptions : None + Caller : General + Status : stable + +=cut + +#what about MFs? add as feature_set as MOODS/PWM analysis not represented +#This is a mandatory requirement for Bio::EnsEMBL::Feature +#Do we ever actually have analysis at the feature level? + +sub analysis{ + return $_[0]->set->analysis; +} + + +=head2 display_label + + Example : my $label = $feature->display_label; + Description: Getter for the display label of this feature. + This will most likely be over-ridden by inheriting class + Returntype : String + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub display_label{ + return $_[0]->{display_label}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Storable.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Storable.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,519 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::Storable +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Storable + +=head1 SYNOPSIS + + my $dbID = $storable_object->dbID(); + my $adaptor = $storable_object->adaptor(); + if($storable_object->is_stored($db_adaptor))) { + + } +=head1 DESCRIPTION + +This is a simple wrapper class to provide convenience methods for the StorableAdaptor. +Only get type methods have been implemented here to avoid obfuscating DB writes which +should only be done by the specific 'Storable'Adaptors. + +=head1 SEE ALSO + +Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::Storable; + + +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Storable; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + +=head2 new + + Arg [-STATES] : Arrayref of states + Arg [-dbID] : database internal id + Example : none + Caller : internal calls + Description : create a new Storable object + Returntype : Bio::EnsEMBL::Storable + Exceptions : Adaptor not a Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + + my ($states, $assoc_ftypes) = rearrange(['STATES', 'ASSOCIATED_FEATURE_TYPES'] ,@_); + + if ($self->adaptor() && (! $self->adaptor->isa("Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor"))){ + throw("Adaptor muct be a valid Bio::EnsEMBL::Funcgen::DBSQL::BaseAdaptor"); + } + + #will these break using _new_fast + #THerefore ResultFeature, Probe and ProbeFeature should not be Funcgen::Storables + + @{$self->{'states'}} = @$states if $states; + $self->associated_feature_types($assoc_ftypes) if(defined $assoc_ftypes); + + + return $self; +} + + + + + + +=head2 has_status + + Arg [1] : string - status e.g. IMPORTED, DISPLAYABLE + Example : if($experimental_chip->has_status('IMPORTED'){ ... skip import ... }; + Description: Tests whether storable has a given status + Returntype : BOOLEAN + Exceptions : Throws if not status is provided + Caller : general + Status : At risk + +=cut + + + +sub has_status{ + my ($self, $status) = @_; + + throw("Must provide a status to check") if ! $status; + + my @state = grep(/$status/, @{$self->get_all_states()}); + my $boolean = scalar(@state);#will be 0 or 1 due to table contraints + + return $boolean; +} + + + +#There is a potential to create an obj from scratch which may already exist in the db +#If we add a state to this (obj has not dbID so will not retrieve stored states) +# and then try and store it, this will result in adding the state to the previously stored obj. +#The behaviour is silent and could cause problems. + +#To resolve this the adaptor implementations must throw if we find a matching object +#We must force the user to generate the obj from the db(use recover) rather than from scratch +#to make them aware of the situation. This is useful to protect objects where we do not want to overwrite previous data +#e.g. experiment, experimental_chip, channel +#For objects which are routinely resued, we must make sure we always try the db first(not just when recover is set) +#Then warn/throw if there are differing attributes + +#This is not possible for set objects, but is not a problem as it will just create another set entry rather than overwriting +#All update/store_states methods should be okay so long as we have a dbID first. + + + + +=head2 get_all_states + + Example : my @ec_states = @{$experimental_chip->get_all_states()}; + Description: Retrieves all states from DB and merges with current states array + Returntype : LISTREF + Exceptions : None + Caller : general + Status : At risk + +=cut + + + +sub get_all_states{ + my ($self) = @_; + + my %states; + + #This could miss states in the DB for storables which have been created and had states added + #but already exist with states in the DB + #The way to get around this is to throw if we try and store an object without a dbID which matches + #something in the DB. + #Remove func in adaptors(ec and channel only?) to automatically use prestored objects, throw instead if no dbID and matches. + #force use of recover to retrieve object from DB and then skip to relevant step based on states. + #Have states => next method hash in Importer/ArrayDefs? + + + + if($self->is_stored($self->adaptor->db()) && ! $self->{'states'}){ + @{$self->{'states'}} = @{$self->adaptor->fetch_all_states($self)}; + } + + return $self->{'states'}; +} + + +=head2 add_status + + Example : $ec->add_state('DISPLAYABLE'); + Description: Adds a state to a new or previously stored Storable + Returntype : None + Exceptions : Throws if no status supplied + Caller : general + Status : At risk + +=cut + + + +sub add_status{ + my ($self, $status) = @_; + + throw("Must pass a status to add e.g. 'DISPLAYABLE'") if ! $status; + + + + #this does not resolve the problem!!??? + #can add a status to an unstored object which + + if($self->adaptor && $self->is_stored($self->adaptor->db()) && ! $self->{'states'}){ + @{$self->{'states'}} = @{$self->adaptor->fetch_all_states($self)}; + } + + push @{$self->{'states'}}, $status; + + return; +} + +sub is_displayable{ + my $self = shift; + + return $self->has_status('DISPLAYABLE'); +} + +#These DBEntry methods are here to enable xrefs to FeatureType, Probe, ProbeSet & ProbeFeature +#They only work as SetFeature has Storable as the second element of @ISA and Bio::EnsEMBL::Feature +#get_all_DBEntries be incorporated into Bio::EnsEMBL::Storable as generic method +#With the other wrapper methods in the Storables of the non-core APIs? +#Or can these be moved to core as a supra core class? +#i.e. Is common to all non-core APIs but not relevant for core? +#Can we bring these together to stop code propogation? + + +=head2 get_all_Gene_DBEntries + + Example : my @gene_dbentries = @{ $storable->get_all_Gene_DBEntries }; + Description: Retrieves Ensembl Gene DBEntries (xrefs) for this Storable. + This does _not_ include the corresponding translations + DBEntries (see get_all_DBLinks). + + This method will attempt to lazy-load DBEntries from a + database if an adaptor is available and no DBEntries are present + on the transcript (i.e. they have not already been added or + loaded). + Returntype : Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : general + Status : at risk + +=cut + +#Need to add optional Transcript/Gene param so we can filter +#Filter here or would be better to restrict in sql query ni DBEntryAdaptor? + +sub get_all_Gene_DBEntries { + my $self = shift; + + + #We wouldn't need this if we made the xref schema multi species + #my $species = $self->adaptor->db->species; + my $species = Bio::EnsEMBL::Registry->get_alias($self->adaptor->db->species); + + if(!$species){ + throw('You must specify a DBAdaptor -species to retrieve DBEntries based on the external_db.db_name'); + } + + #safety in case we get Homo sapiens + ($species = lc($species)) =~ s/ /_/; + + + + return $self->get_all_DBEntries($species.'_core_Gene'); +} + +=head2 get_all_Transcript_DBEntries + + Arg[0] : optional - Bio::EnsEMBL::Transcript to filter DBEntries on. + Example : my @transc_dbentries = @{ $set_feature->get_all_Transcript_DBEntries }; + Description: Retrieves ensembl Transcript DBEntries (xrefs) for this Storable. + This does _not_ include the corresponding translations + DBEntries (see get_all_DBLinks). + + This method will attempt to lazy-load DBEntries from a + database if an adaptor is available and no DBEntries are present + on the Storable (i.e. they have not already been added or + loaded). + Returntype : Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : general + Status : at risk + +=cut + +#change the to ensembl_core when we implement Gene/Transcript/Protein|Translation links on the same external_db + +sub get_all_Transcript_DBEntries { + my ($self, $transcript) = @_; + + #We wouldn't need this if we made the xref schema multi species + my $species = Bio::EnsEMBL::Registry->get_alias($self->adaptor->db->species); + #Need to make sure this is latin name + + + if(!$species){ + throw('You must specify a DBAdaptor -species to retrieve DBEntries based on the external_db.db_name'); + } + + #safety in case we get Homo sapiens + #($species = lc($species)) =~ s/ /_/; + + my $dbes = $self->get_all_DBEntries($species.'_core_Transcript'); + + #This needs to be moved to the DBEntryAdaptor and restrict the query using the + #dbprimary_acc + + if($transcript){ + my $sid = $transcript->stable_id; + + #Test for sid here? + + if(ref($transcript) && $transcript->isa('Bio::EnsEMBL::Transcript')){ + my @dbes; + + foreach my $dbe(@$dbes){ + if($dbe->primary_id eq $sid){ + push @dbes, $dbe; + } + } + $dbes = \@dbes; + } + else{ + throw('Transcript argument must be a valid Bio::EnsEMBL::Transcript'); + } + } + + + return $dbes; +} + + +=head2 get_all_UnmappedObjects + + Example : my @uos = @{$storable->get_all_UnmappedObjects }; + Description: Retrieves UnamappedObjects for this Storable. + Returntype : arrayref of Bio::EnsEMBL::UnmappedObject objects + Exceptions : none + Caller : general + Status : At risk - move to Bio::Ensembl::Storable? + +=cut + +sub get_all_UnmappedObjects{ + my $self = shift; + #Do we want to add external_db or analysis param here? + + my $class = ref($self); + $class =~ s/.*:://; + + return $self->adaptor->db->get_UnmappedObjectAdaptor->fetch_all_by_object_type_id($class, $self->dbID); +} + + +=head2 get_all_DBEntries + + Arg[1] : string - External DB name e.g. ensembl_core_Gene + Arg[2] : string - External DB type + Example : my @dbentries = @{ $set_feature->get_all_DBEntries }; + Description: Retrieves DBEntries (xrefs) for this SetFeature. + This does _not_ include the corresponding translations + DBEntries (see get_all_DBLinks). + + This method will attempt to lazy-load DBEntries from a + database if an adaptor is available and no DBEntries are present + on the SetFeature (i.e. they have not already been added or + loaded). + Returntype : Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : general, get_all_DBLinks + Status : Stable - at risk move to storable + +=cut + + +#We could add 3rd arg here which would be xref(info_)type e.g. Gene/Transcript etc. +#Move info_type to ox.linkage_type to sit along side linkage_annotated + + +sub get_all_DBEntries { + my $self = shift; + my $ex_db_exp = shift; + my $ex_db_type = shift; + + my $cache_name = "dbentries"; + + if(defined($ex_db_exp)){ + $cache_name .= $ex_db_exp; + } + if(defined($ex_db_type)){ + $cache_name .= $ex_db_type; + } + + #Need to add tests for valid objects for xrefs + + # if not cached, retrieve all of the xrefs for this gene + + #This is not using the caching optimally + #It seems for naive(ex_db_exp,ex_db_type) queries we create a naive cache + #This means that further more specific queries will make another query and not use the cache + + + if( (! defined $self->{$cache_name}) && $self->adaptor() ){ + + my @tables = $self->adaptor->_tables; + @tables = split/_/, $tables[0]->[0];#split annotated_feature + my $object_type = join('', (map ucfirst($_), @tables));#change to AnnotatedFeature + + $self->{$cache_name} = + $self->adaptor->db->get_DBEntryAdaptor->_fetch_by_object_type($self->dbID(), $object_type, $ex_db_exp, $ex_db_type); + } + elsif( ! defined $self->{$cache_name} ){ + throw('You must have set and adaptor to be able to get_all_DBEntries'); + } + + + $self->{$cache_name} ||= []; + + return $self->{$cache_name}; +} + + +=head2 add_DBEntry + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + The dbEntry to be added + Example : my $dbe = Bio::EnsEMBL::DBEntry->new(...); + $transcript->add_DBEntry($dbe); + Description: Associates a DBEntry with this object. Note that adding + DBEntries will prevent future lazy-loading of DBEntries for this + storable (see get_all_DBEntries). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_DBEntry { + my $self = shift; + my $dbe = shift; + + unless($dbe && ref($dbe) && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw('Expected DBEntry argument'); + } + + $self->{'dbentries'} ||= []; + push @{$self->{'dbentries'}}, $dbe; +} + + +=head2 associated_feature_types + + Example : my @associated_ftypes = @{$feature->associated_feature_types()}; + Description: Getter/Setter for other associated FeatureTypes. + Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen:FeatureType objects + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub associated_feature_types{ + my ($self, $ftypes) = @_; + + #Lazy load as we don't want to have to do a join on all features when most will not have any + + + if(defined $ftypes){ + + if(ref($ftypes) eq 'ARRAY'){ + + foreach my $ftype(@$ftypes){ + + if( ! $ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType') ){ + throw('You must pass and ARRAYREF of stored Bio::EnsEMBL::Funcgen::FeatureType objects'); + } + #test is stored in adaptor + } + + if(defined $self->{'associated_feature_types'}){ + warn('You are overwriting associated feature types'); + #we could simply add the new ones and make them NR. + } + + $self->{'associated_feature_types'} = $ftypes; + } + else{ + throw('You must pass and ARRAYREF of stored Bio::EnsEMBL::Funcgen::FeatureType objects'); + } + } + + + if(! defined $self->{'associated_feature_types'}){ + #This will fail if we have not stored yet + + if(defined $self->adaptor){ + $self->{'associated_feature_types'} = $self->adaptor->db->get_FeatureTypeAdaptor->fetch_all_by_association($self); + } + + } + + + #This has the potential to return undef, or an arrayref which may be empty. + return $self->{'associated_feature_types'}; +} + + + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,673 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Utils::EFGUtils + +=head1 DESCRIPTION + +This module collates a variety of miscellaneous methods. + + +=head1 SYNOPSIS + + BEGIN + { + unshift(@INC,"/path/of/local/src/modules"); + } + + use Utils; + + &Utils::send_mail($to_address, $title, $message); + +=cut + + +# No API/Object based methods in here + +############################################################################### + +package Bio::EnsEMBL::Funcgen::Utils::EFGUtils; +require Exporter; +@ISA = qw(Exporter); +@EXPORT_OK = qw(get_date species_name get_month_number species_chr_num + open_file median mean run_system_cmd backup_file + is_gzipped is_sam is_bed get_file_format strip_param_args + generate_slices_from_names strip_param_flags + get_current_regulatory_input_names add_external_db); + +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use File::Path qw (mkpath); +use File::Basename qw (dirname); +use strict; +use Time::Local; +use FileHandle; +use Carp; + +sub get_date{ + my ($format, $file) = @_; + + my ($time, $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst); + + + throw("File does not exist or is not a regular file:\t$file") if $file && ! -f $file; + + + ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = (defined $file) ? + localtime((stat($file))[9]) : localtime(); + + #print " ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)\n"; + + if((! defined $format && ! defined $file) || $format eq "date"){ + $time = ($year+1900)."-".$mday."-".($mon+1); + } + elsif($format eq "time"){#not working! + $time = "${hour}:${min}:${sec}"; + } + elsif($format eq "timedate"){# + $time = localtime(); + } + else{#add mysql formats here, datetime etc... + croak("get_date does not handle format:\t$format"); + } + + return $time; +} + + +#migrate this data to defs file!!?? +#must contain all E! species and any other species which are used in local DB extractions +#NEED TO ADD FLY!! + +sub species_name{ + my($species) = @_; + my %species_names = ( + "HOMO_SAPIENS", "human", + "MUS_MUSCULUS", "mouse", + "RATTUS_NORVEGICUS", "rat", + "CANIS_FAMILIARIS", "dog", + "PAN_TROGOLODYTES", "chimp", + "GALLUS_GALLUS", "chicken", + "SACCHAROMYCES_CEREVISIAE", "yeast", + "HUMAN", "HOMO_SAPIENS", + "MOUSE", "MUS_MUSCULUS", + "RAT","RATTUS_NORVEGICUS", + "DOG", "CANIS_FAMILIARIS", + "CHIMP", "PAN_TROGOLODYTES", + "CHICKEN", "GALLUS_GALLUS", + "YEAST", "SACCHAROMYCES_CEREVISIAE", + ); + + return $species_names{uc($species)}; +} + +sub get_month_number{ + my($mon) = @_; + my %month_nos =( + "jan", "01", + "feb", "02", + "mar", "03", + "apr", "04", + "may", "05", + "jun", "06", + "jul", "07", + "aug", "08", + "sep", "09", + "oct", "10", + "nov", "11", + "dec", "12", + ); + return $month_nos{lc($mon)}; +} + + +sub species_chr_num{ + my ($species, $val) = @_; + + ($species = lc($species)) =~ s/ /_/; + + my %species_chrs = ( + homo_sapiens => {( + 'x' => 23, + 'y' => 24, + 'mt' => 25, + )}, + + mus_musculus => {( + 'x' => 20, + 'y' => 21, + 'mt' => 22, + )}, + + rattus_norvegicus => {( + 'x' => 21, + 'y' => 22, + 'mt' => 23, + )}, + ); + + die("species not defined in chromosome hash") if(! exists $species_chrs{$species}); + + return (exists $species_chrs{$species}{lc($val)}) ? $species_chrs{$species}{lc($val)} : $val; +} + +#Sort should always be done in the caller if required + +sub median{ + my ($scores, $sort) = shift; + + return undef if (! @$scores); + + + my ($median); + my $count = scalar(@$scores); + my $index = $count-1; + #need to deal with lines with no results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #deal with one score fastest + return $scores->[0] if ($count == 1); + + + if($sort){ + #This is going to sort the reference here, so will affect + #The array in the caller + #We need to deref to avoid this + } + + #taken from Statistics::Descriptive + #remeber we're dealing with size starting with 1 but indices starting at 0 + + if ($count % 2) { #odd number of scores + $median = $scores->[($index+1)/2]; + } + else { #even, get mean of flanks + $median = ($scores->[($index)/2] + $scores->[($index/2)+1] ) / 2; + } + + + return $median; +} + + +sub mean{ + my $scores = shift; + + my $total = 0; + + map $total+= $_, @$scores; + my $mean = $total/(scalar(@$scores)); + + return $mean; + +} + +#Should really extend this to detect previous file? +#Or do in caller? + +sub open_file{ + my ($file, $operator, $file_permissions) = @_; + + my $dir_permissions = $file_permissions || 0755; + + $operator ||= '<'; + + if($operator !~ /%/){ + $operator = "$operator $file"; + } + else{ + #We have some piping to do + $operator = sprintf($operator, $file); + } + + #Get dir here and create if not exists + my $dir = dirname($file); + mkpath($dir, {verbose => 1, mode => $dir_permissions}) if(! -d $dir); + my $fh = new FileHandle "$operator"; + + + #This does not catch incorrectly defined named pipes + + if(! defined $fh){ + croak("Failed to open $operator"); + } + + + #Have to chmod here as umask will over-ride permissions passed to FileHandle + if(defined $file_permissions){ + + #Catch non-numeric here as chmod still returns true + if($file_permissions =~ /[^0-9]/){ + croak("Failed to change $file permissions using:\t$file_permissions"); + } + + #chmod requires a literal octal number e.g. 0775 not '0775' + #should catch numbers as strings here, but perl makes this very hard to test + #Can't even system this as if we build the cmd line with an octal it will be converted to a decimal + #These is still no way of testing for non-octal number or string + #eval/sprintf will also not fail if there are non-octal digits i.e. 1999 + + #eval will treat octal number and string as true octal number + #else will pass non-octal string/number which we can't catch + chmod(eval($file_permissions), $file); + } + + return $fh; +} + + + +################################################################################ + +=head2 run_system_cmd + + Description : Method to control the execution of the standard system() command + + ReturnType : none + + Example : $Helper->debug(2,"dir=$dir file=$file"); + + Exceptions : throws exception if system command returns none zero + +=cut + +################################################################################ + +sub run_system_cmd{ + my ($command, $no_exit) = @_; + + my $redirect = ''; + + #$self->debug(3, "system($command)"); + + # decide where the command line output should be redirected + + #This should account for redirects + + #if ($self->{_debug_level} >= 3){ + + # if (defined $self->{_debug_file}){ + # $redirect = " >>".$self->{_debug_file}." 2>&1"; + # } + # else{ + # $redirect = ""; + # } + #} + #else{ + #$redirect = " > /dev/null 2>&1"; + #} + + # execute the passed system command + my $status = system("$command $redirect"); + my $exit_code = $status >> 8; + + + if ($status == -1) { + warn "Failed to execute: $!\n"; + } + elsif ($status & 127) { + warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without'); + } + elsif($status != 0) { + warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code + } + + #We're not catchign error message here! + + if ($exit_code != 0){ + + if (! $no_exit){ + throw("System command failed:\t$command\n"); + } + else{ + warn("System command returned non-zero exit code:\t$command\n"); + } + } + + #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/ + + return $exit_code; +} + + +sub backup_file{ + my $file_path = shift; + + throw("Must define a file path to backup") if(! $file_path); + + if (-f $file_path) { + #$self->log("Backing up:\t$file_path"); + system ("mv ${file_path} ${file_path}.".`date '+%T'`) == 0 || return 0; + } + + return 1; + +} + + +sub get_file_format{ + my $file = shift; + + my $format = &is_bed($file); + + if(! $format){ + $format = &is_sam($file); + + #Add more testes here + } + + + return $format; +} + +sub is_gzipped { + my ($file, $fail_if_compressed) = @_; + + throw ("File does not exist:\t$file") if ! -e $file; + + open(FILE, "file -L $file |") + or throw("Can't execute command 'file' on '$file'"); + my $file_info = ; + close FILE; + + my $gzip = 0; + + if($file_info =~ /compressed data/){ + + if($file_info =~ /gzip/){ + $gzip = 1; + } + else{ + throw("File is compressed but not with gzip, please unzip or gzip:\t$file_info"); + } + } + + return $gzip; +} + +#Change these to also return the gz status + +sub is_sam{ + my $file = shift; + + warn "Only checking file suffix for is_sam"; + #Could check for header here altho this is not mandatory! + #Can we use web format guessing code? + + my $gz = (&is_gzipped($file, 1)) ? '.gz' : ''; + + return ($file =~ /.sam${gz}/) ? 'sam' : 0; +} + +#need is bam here too! + +sub is_bed { + my $file = shift; + + #Use open_file here! + + if(&is_gzipped($file, 1)){ + + open(FILE, "zcat $file 2>&1 |") or throw("Can't open file via zcat:\t$file"); + } + else{ + open(FILE, $file) or throw("Can't open file:\t$file"); + } + + my @line; + #$verbose =1; + + + while () { + chomp; + @line = split("\t", $_); + last; + } + close FILE; + + if (scalar @line < 6) { + warn "Infile '$file' does not have 6 or more columns. We expect bed format:\t". + "CHROM START END NAME SCORE STRAND.\n"; + return 0; + #} elsif ($line[0] !~ m/^((chr)?[MTXYNT_\d]+)$/) { + # warn ("1st column must contain name of seq_region (e.g. chr1 or 1) in '$file'"); + # return 0; + #Commented this out for now due to HSCHR_RANDOM seqs + #How does the webcode handle this? + } + elsif ($line[1] !~ m/^\d+$/ && $line[2] =~ m/^\d+$/) { + warn "2nd and 3rd column must contain start and end respectively in '$file'\n"; + return 0; + } + elsif ($line[5] !~ m/^[+-\.]$/) { + warn "6th column must define strand (either +, - or .) in '$file'\n"; + return 0; + } + + return 'bed'; +} + + +#These subs are useful for implementing +#a farm mode in a run script, where a script can +#submit itself to the farm as slice based jobs + +#strip cmd line params and associated arguments from a list +#should not be used to remove flag options i.e. no following args +#as this may cause removal of any following @ARGV; +#Can this be used on flattened args hash? + +sub strip_param_args{ + my ($args, @strip_params) = @_; + + my $param_name; + my $seen_opt = 0; + + foreach my $i(0..$#{$args}){ + + if($args->[$i] =~ /^[-]+/){ + $seen_opt = 0;#Reset seen opt if we seen a new one + + ($param_name = $args->[$i]) =~ s/^[-]+//; + + if(grep/^${param_name}$/, @strip_params){ + $seen_opt = 1; + } + } + + #$args->[$i] = '' if $args->[$i] =~ /^[-]+farm/;#Only remove current flag + #$seen_opt = 1 if $args->[$i] =~ /^[-]+skip_slices/; + #$seen_opt = 1 if $args->[$i] =~ /^[-]+slice/;#Don't have full param name incase we have just specified -slice + + $args->[$i] = '' if $seen_opt;#Remove option and args following option + } + + return $args; +} + + +sub strip_param_flags{ + my ($args, @strip_params) = @_; + + my @args = @$args; + + foreach my $flag(@strip_params){ + @args = grep(!/[-]+${flag}$/, @args); + } + + return \@args; +} + +#Generates slices from names or optionally alll default top level nonref + +sub generate_slices_from_names{ + my ($slice_adaptor, $slice_names, $skip_slices, $highestlevel, $non_ref, $inc_dups, $assembly) = @_; + + #Test if $assembly is old? + + + + my (@slices, $slice, $sr_name); + + if(@$slice_names){ + + foreach my $name(@$slice_names){ + $slice = $slice_adaptor->fetch_by_region(undef, $name, undef, undef, undef, $assembly); + + #WHy is this failing for hap regions? + + if(! $slice){ + + #Need to eval this as it will break with incorrect formating + + eval { $slice = $slice_adaptor->fetch_by_name($name) }; + + if(! $slice){ + throw("Could not fetch slice by region or name:\t".$name); + } + } + + $sr_name = $slice->seq_region_name; + + next if(grep/^${sr_name}$/, @$skip_slices); + push @slices, $slice; + } + } + elsif($highestlevel){ + + my $level = 'toplevel'; + + if($assembly){ + $level = 'chromosome'; + warn "Cannot get toplevel for old assembly version $assembly, defaulting to 'chromosome' level"; + #Would ignore old assembly and just fetch current assembly otherwise as there is no toplevel for old assemblies + #No need for projection on non-ref unassembled seqs as these will/should be identical + #Only need need to project assembled seq i.e. haps(lrgs?). + #Only rollback toplevel data when cleaning after projection, otherwise we may lose some data. + #Change default delete to use all toplevel ref seqs (and non-ref with cs version e.g. haps but not lrgs) + } + + my @tmp_slices = @{$slice_adaptor->fetch_all($level, $assembly, $non_ref, $inc_dups)}; + + if(@$skip_slices){ + + foreach $slice(@tmp_slices){ + $sr_name = $slice->seq_region_name; + push @slices, $slice if ! grep/^${sr_name}$/, @$skip_slices; + } + } + else{ + @slices = @tmp_slices; + } + } + else{ + throw('You must either pass an arrayref of slice names or specify the toplevel flag'); + } + + + if(! @slices){ + throw("You have specified slice_names and skip_slices paramters which have generated no slices.\nslice_names:\t".join(' ',@$slice_names)."\nskip_slices:\t".join(' ', @$skip_slices)); + } + + return \@slices; +} + + +# Tracking DB methods +# Move to DBAdaptor? Can we add this as a separate package in the same module? + +sub get_current_regulatory_input_names{ + my ($tdb, $efg_db, $focus) = @_; + + #Validate is production? + my $sql; + + + + if($focus){ + $focus = 'Focus'; + $sql = 'SELECT efgdb_set_name from dataset where is_focus=true and is_current=true and species="'.$efg_db->species.'"'; + } + else{ + $focus = 'Non-focus'; + #0 rather than false so we don't get NULLs + $sql = 'SELECT efgdb_set_name from dataset where is_focus=0 and is_current=true and species="'.$efg_db->species.'"'; + } + + + #Currently efgdb_set_name can either be data_set or feature_set name! + #Need to standardise this + + my @prd_names = @{$tdb->db_handle->selectcol_arrayref($sql)}; + my @names; + my @failed_sets; + + foreach my $prd_name(@prd_names){ + + $sql = "SELECT name from feature_set where name like '${prd_name}%'"; + my @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)}; + + #This is causing problems with multiple feature sets with differing analyses + + #Do this via InputSets(using query extension?) instead of using like? + + #This is very hacky right now to get it to work + #Need to standardise and review tracking db data. + + if(scalar(@tmp_names) > 1){ + + $sql = "SELECT name from feature_set where name ='${prd_name}_ccat_histone'"; + @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)}; + + if(scalar(@tmp_names) == 1){ + push @names, $tmp_names[0]; + }else{ + push @failed_sets, $prd_name; + } + } + elsif(scalar(@tmp_names) == 0){ + push @failed_sets, $prd_name; + } + else{ + push @names, $tmp_names[0]; + } + + } + + if(@failed_sets){ + throw("Failed to find unique $focus FeatureSets for production dataset names:\n\t". + join("\n\t", @failed_sets)."\n"); + } + + return @names; +} + +#Handy function to add an external_db entry when needed +sub add_external_db{ + my ($efg_db, $db_name,$db_release,$db_display_name) = @_; + my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$db_release'"; + my ($db_id) = $efg_db->dbc->db_handle->selectrow_array($sql); + if($db_id){ + warn "External DB $db_name $db_release already exists in db with db_id $db_id\n"; + } else { + #TODO check if it there was a failure + $efg_db->dbc->do("insert into external_db (db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type) values('$db_name', '$db_release', 'KNOWNXREF', '1', '5', '$db_display_name', 'MISC')"); + } + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/Encode.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/Encode.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,96 @@ +# Ensembl module for Bio::EnsEMBL::Funcgen::Utils::Encode +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Utils::Encode - provides some handy methods to deal +with Encode regions. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Funcgen::Utils::Encode qw(get_encode_regions); + + my $encode_regions = &get_encode_regions($dnadb, $assembly_version); + +=head1 DESCRIPTION + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::Utils::Encode; + +require Exporter; +use vars qw(@ISA @EXPORT_OK); +@ISA = qw(Exporter); +@EXPORT_OK = qw(get_encode_regions); + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use POSIX; +use Data::Dumper; + +=head2 get_encode_regions + + Arg [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Description : fetch encode region coordinates from core db + Returns : reference to list of Bio::EnsEMBL::MiscFeatures Encode region slices + Exceptions : Throws if no Bio::EnsEMBL::DBSQL::DBAdaptor defined + Example : get_encode_regions($dnadb); + +=cut + +sub get_encode_regions { + + my $db = shift; + + throw("Need to pass a valid Bio::EnsEMBL::DBSQL::DBAdaptor") + if (! ($db && $db->isa("Bio::EnsEMBL::DBSQL::DBAdaptor"))); + + my $sa = $db->get_SliceAdaptor(); + my $tls = $sa->fetch_all('toplevel'); + #map { print Dumper $_->name } @$tls; + + my $mfa = $db->get_MiscFeatureAdaptor(); + + my @encode_regions; + map { + push @encode_regions, @{$mfa->fetch_all_by_Slice_and_set_code($_, 'encode')}; + } @$tls; + + return \@encode_regions; + + + #my %encode_regions; + #map { $encode_regions{$_->display_id} = sprintf + # ("%s:%s:%s:%d:%d:%d", + # $_->slice->coord_system_name, + # $_->slice->coord_system()->version(), + # $_->slice->seq_region_name, + # $_->start, $_->end, $_->strand); + # } @encode_regions; + # + #return \%encode_regions; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1086 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Utils::HealthChecker + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +B provides several methods to health check and update tables prior to +release. Using the updte_DB_for_release method runs the following: + + validate_new_seq_regions - _pre_stores seq_region & coord_system info from new core DB + check_regbuild_strings - Validates or inserts regbuild_string entries + check_meta_species_version - Validates meta species and version wrt dbname + set_current_coord_system - Updates coord_system.is_current to 1 for current schema_build (required for mart) + update_meta_coord - Regenerates meta_coord.max_length values (required for Slice range queries) + clean_xrefs - Removes old unused xref and external_db records + validate_DataSets - Performs various checks on Data/Feature/ResultSets links and states + check_stable_ids - Check for any NULL stable IDs + log_data_sets - Logs all DISPLAYABLE DataSets + analyse_and_optimise_tables - Does what is says + + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Utils::HealthChecker; + +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::Funcgen::ProbeFeature; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Analysis; +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + + + +#TO DO +# 1 DONE Print all fails and warnings in summary at end of script. +# 2 validate_RegulatoryFeature_Sets +# 3 Some of these can be migrated or mirrored in java HCs for safety + + +################################################################################ + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + #validate and set type, analysis and feature_set here + my ($db, $builds, $skip_mc, $check_displayable, $skip_analyse, $meta_coord_tables, $skip_xrefs, $fix) = + rearrange(['DB', 'BUILDS', 'SKIP_META_COORD', 'CHECK_DISPLAYABLE', 'SKIP_ANALYSE', 'META_COORD_TABLES', 'SKIP_XREF_CLEANUP', 'FIX'], @_); + + + if (! ($db && ref($db) && + $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + #test connection + $db->dbc->db_handle; + + $self->{'db'} = $db; + $self->{'mysql_connect_string'} = 'mysql -h'.$db->dbc->host.' -u'.$db->dbc->username.' -p' + .$db->dbc->password.' '.$db->dbc->dbname.' -P'.$db->dbc->port; + $self->{'dbname'} = $db->dbc->dbname; + $self->{'builds'} = (scalar(@$builds)>0) ? $builds : []; + $self->{'skip_meta_coord'} = $skip_mc; + $self->{'skip_xrefs'} = $skip_xrefs; + $self->{'skip_analyse'} = $skip_analyse; + $self->{'check_displayable'} = $check_displayable; + $self->{fix} = $fix; + + if(defined $meta_coord_tables){ + + throw('-skip_meta_coord is set, Cannot build meta_coord entries for tables '.join(', ', @$meta_coord_tables)); + + if(! ref($meta_coord_tables) eq 'ARRAY'){ + throw('-meta_coord_tables parameter must be an array ref'); + } + + @{$self->{'meta_coord_tables'}} = @$meta_coord_tables; + } + + return $self; +} + +sub db{ return $_[0]->{db}; } + +sub fix{ return $_[0]->{fix}; } + +=head2 update_db_for_release + + Arg[0] : + Example : + Description: Wrapper method to perform all common update functions + Returntype : + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub update_db_for_release{ + my ($self, @args) = @_; + + if(@args){ + } + + #do seq_region_update to validate dnadb first + #hence avoiding redoing longer methods + $self->validate_new_seq_regions;#($force_srs); + #$self->update_meta_schema_version; + $self->check_regbuild_strings; + $self->check_meta_species_version; + $self->set_current_coord_system; + $self->update_meta_coord; + $self->clean_xrefs; + $self->validate_DataSets; + $self->check_stable_ids; + $self->log_data_sets(); + $self->analyse_and_optimise_tables;#ALWAYS LAST!! + + $self->log_header('??? Have you dumped/copied GFF dumps ???'); + + #Log footer? Pass optional counts hash? + $self->log('Finished updating '.$self->{'dbname'}." for release\n\n"); +} + +sub validate_new_seq_regions{ + my ($self, $force) = @_; + + + #We need to add some functionality to handle non-standard schema_build progression here + #This should be used before any data is loaded + #It should also warn if there any duplicates if it is not run before coord_systems are duplicated + #Should this just be handled in the BaseFeatureAdaptor/CoordSystemAdaptor? + + + + + #do we need to add the none default levels here? + #or are we only bothered about those which constitute the toplevel? + + #To make sure we have all the correct levels in eFG we need to get all the names. + #then get all by name from the core db and set them as the dnadb. + # we also need to get all the toplevel seq_regions and store them in the seq_region table + #use BaseFeatureAdaptor::_pre_store with and array of pseudo feature on each top level slice + + #Validate the efgdb and dnadb schema version are the same first + #This is because we expect the schem_build to be the same for a release + #However, this will autoset the dnadb if no defined, so will always match! + + if(! $force){ + my $efgdb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})}); + my $dnadb_sm = join('_', @{$self->get_schema_and_build($self->db->dnadb->dbc->dbname)}); + + if($efgdb_sm ne $dnadb_sm){ + $self->report("WARNING Skipped validate_new_seq_regions as schema_versions are mismatched:\t". + "efgdb $efgdb_sm\tdnadb $dnadb_sm"); + return 0; + } + } + + my $pf_adaptor = $self->db->get_ProbeFeatureAdaptor(); + my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor(); + my $dnadb_csa = $self->db->dnadb->get_CoordSystemAdaptor; + + $self->log_header('Validating new coord_systems/seq_regions'); + + my @slices; + my %versioned_levels; + my $default_version; + + #Grab unversioned top level slices and versioned levels + #May miss some old versioned level if the new assembly no longer has them + foreach my $slice(@{$slice_adaptor->fetch_all('toplevel', undef, 1)}){ + + if (! $slice->coord_system->version){ + push @slices, $slice; + } + else{ + + if($default_version && + ($default_version ne $slice->coord_system->version)){ + throw("Found more than one default CoordSystem version:\t${default_version}\t".$slice->coord_system->version); + } + else{ + $default_version = $slice->coord_system->version; + } + } + } + + + #Get all versioned levels for all builds + foreach my $cs(@{$dnadb_csa->fetch_all}){ + + if($cs->version){ + $versioned_levels{$cs->version} ||= []; + push @{$versioned_levels{$cs->version}}, $cs->name; + } + } + + push @{$self->{'builds'}}, $default_version if scalar(@{$self->{'builds'}}) == 0; + + + + #Grab slices for each versioned level + foreach my $build(@{$self->{'builds'}}){ + + if(! exists $versioned_levels{$build}){ + throw("CoordSystem version $build does not exist in the dnadb ".$self->db->dnadb->dbc->dbname); + } + + foreach my $level(@{$versioned_levels{$build}}){ + $self->log("Getting slices for $level $build"); + push @slices, @{$slice_adaptor->fetch_all($level, $build)}; + } + } + + $self->log("Importing seq_region/coord_system info for builds:\t".join(',', @{$self->{'builds'}})); + + foreach my $slice(@slices){ + + if($slice->start() != 1){ + $self->log("Reslicing slice:\t".$slice->name()); + #we must have some sort of PAR linked region i.e. Y + $slice = $slice_adaptor->fetch_by_region($slice->coord_system_name(), $slice->seq_region_name()); + } + + + #we need test if it needs doing first? + #we would need to test for the coord_systems outside of this loop + #and then for each seq_region inside the loop if the coord_system is present + + $self->log("_pre_storing seq_region info for slice:\t".$slice->name()); + + my $pseudo_feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new + ( + -slice => $slice, + -start => 0, + -end => 0, + -strand => 0, + ); + + $pf_adaptor->_pre_store($pseudo_feature); + #This will create a meta_coord entry of max_length 1 for features which have an absent meta_coord entry + + } + + + $self->log("Finished validating seq_regions\n"); + + return; +} + + + + +sub update_meta_coord{ + my ($self, @table_names) = @_; + + my $species_id = $self->db->species_id; + + if($self->{'skip_meta_coord'}){ + $self->log("Skipping meta_coord update\n"); + return; + } + + + $self->log_header('Updating meta_coord table'); + + + #set default table_name + if(! @table_names || scalar(@table_names) == 0){ + + #Can we do this via DBAdaptor and get all available adaptors which are BaseFeatureAdaptors then grab the first table name + + if(defined $self->{'meta_coord_tables'}){ + @table_names = @{$self->{'meta_coord_tables'}}; + } + else{#default + + @table_names = qw( + regulatory_feature + probe_feature + external_feature + annotated_feature + result_feature + segmentation_feature + ); + } + } + + #backup meta coord + if(system($self->{'mysql_connect_string'}." -e 'SELECT * FROM meta_coord'" + . '> '.$self->{'dbname'}.'meta_coord.backup' + ) != 0 ){ + + throw("Can't dump the original meta_coord for back up");#will this get copied to log? + } + else { + $self->log('Original meta_coord table backed up in '. $self->{'dbname'}.'.meta_coord.backup'); + } + + + #Update each max_length for table_name and coord_system + + my $sql; + + foreach my $table_name(@table_names){ + + $sql = "select distinct(cs.name), mc.coord_system_id, cs.version, mc.max_length from coord_system cs, meta_coord mc where mc.table_name='$table_name' and mc.coord_system_id=cs.coord_system_id and cs.species_id = $species_id"; + + $self->log(''); + $self->log("Updating meta_coord max_length for $table_name:"); + $self->log("name\tcoord_system_id\tversion\tmax_length"); + + #can we test for emtpy array here? Then skip delete. + + my @info = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + + #log this + map {$self->log(join("\t", @{$_}))} @info; + + # Clean old entries + $self->log("Deleting old meta_coord entries"); + $sql = "DELETE mc FROM meta_coord mc, coord_system cs WHERE mc.table_name ='$table_name' and mc.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"; + $self->db->dbc->db_handle->do($sql); + + # Generate new max_lengths + $self->log("Generating new max_lengths"); + + #Is this query running for each redundant cs_id? + #would it be more efficient to retrieve the NR cs_ids first and loop the query for each cs_id? + + #Can we get the dbID of the largest feature for ease of checking? + #This won't work as we're grouping by coord_system + #would need to select distinct coord_system_id for table first + #This may well slow down quite a bit doing it this way + + $sql = "select distinct s.coord_system_id from coord_system cs, seq_region s, $table_name t WHERE t.seq_region_id = s.seq_region_id and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"; + my @cs_ids = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + #Convert single element arrayrefs to scalars + map $_ = ${$_}[0], @cs_ids; + + $self->log("New max_lengths for $table_name are:"); + + $self->log(join("\t", ('coord_system_id', 'max_length', 'longest record dbID'))); + + foreach my $cs_id(@cs_ids){ + #This will always give a length of 1 even if there are no features present + + my @cs_lengths; + + #The probe_feature table is now too big to do this in one go + #We need to break this down into sr_ids + + $sql = "SELECT distinct t.seq_region_id from $table_name t, seq_region sr where t.seq_region_id=sr.seq_region_id and sr.coord_system_id=$cs_id"; + + my @sr_ids = @{$self->db->dbc->db_handle->selectcol_arrayref($sql)}; + + + #Get longest feature for all seq_regions + foreach my $sr_id(@sr_ids){ + $sql = "SELECT (t.seq_region_end - t.seq_region_start + 1 ) as max, t.${table_name}_id " + . "FROM $table_name t " + . "WHERE t.seq_region_id = $sr_id "; + $sql .= ' and t.window_size=0' if $table_name eq 'result_feature'; + $sql .= " order by max desc limit 1"; + + + #Problem here is that DBs without 0 wsize result_feture entries will not get a meta_coord entry + #We need to implement this in the _pre_store method too? + + + my ($cs_length, $table_id); + ($cs_length, $table_id) = $self->db->dbc->db_handle->selectrow_array($sql); + push @cs_lengths, [$cs_length, $table_id] if $cs_length; + } + + + if(@cs_lengths){ + #This will now contain a list of arrays refs contain the max length and feature id for + #each seq_region in this coord_system + #Now sort to get the longest + #Can't sort on 2 day array in the normal way + #One list list lists, the comparatee is no longer a list but a reference + @cs_lengths = sort { $b->[0] <=> $a->[0] } @cs_lengths; + $self->log(join("\t\t", ($cs_id, @{$cs_lengths[0]}))); + $sql = "INSERT INTO meta_coord values('${table_name}', $cs_id, ".$cs_lengths[0][0].')'; + $self->db->dbc->db_handle->do($sql); + } + } + } + + $self->log("Finished updating meta_coord max_lengths\n"); + + return; +} + + +#change this to check_meta_species_version + +sub check_meta_species_version{ + my ($self) = @_; + + $self->log_header('Checking meta species.production_name and schema_version against dbname'); + + my $dbname = $self->db->dbc->dbname; + (my $dbname_species = $dbname) =~ s/_funcgen_.*//; + my $mc = $self->db->get_MetaContainer; + my $schema_version = $mc->list_value_by_key('schema_version')->[0]; + + if(! defined $schema_version){ + $self->report("FAIL:\tNo meta schema_version defined"); + } + elsif($dbname !~ /funcgen_${schema_version}_/){ + $self->report("FAIL:\tMeta schema_version ($schema_version) does not match the dbname ($dbname)."); + } + + + my @latin_names = @{$mc->list_value_by_key('species.production_name')}; + + if(scalar(@latin_names) > 1){ + $self->report("FAIL:\tFound more than one species.production_name in meta:\t".join(", ", @latin_names)); + } + elsif(scalar(@latin_names) == 1 && ($latin_names[0] ne $dbname_species)){ + $self->report("FAIL:\tFound mismatch between meta species.production_name and dbname:\t".$latin_names[0]." vs $dbname_species"); + } + elsif(scalar(@latin_names) == 0){ + $self->report("WARNING:\tFound no meta species.production_name setting as:\t$dbname_species"); + $self->db->dbc->db_handle->do("INSERT into meta(species_id, meta_key, meta_value) values(1, 'species.production_name', '$dbname_species')"); + } + #else is okay + + return; +} + + + +#Move to Java HC? Or update if update flag specified +#Using same code used by build_reg_feats! + +sub check_regbuild_strings{ + my ($self) = @_; + + #Removed $update arg as we would always want to do this manually + + $self->log_header('Checking regbuild strings'); + my $species_id = $self->db()->species_id(); + + + my @regf_fsets; + my $passed = 1; + my $fset_a = $self->db->get_FeatureSetAdaptor; + #my $mc = $self->db->get_MetaContainer; + my $regf_a = $self->db->get_RegulatoryFeatureAdaptor; + #We now want to chek all build + @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')}; + + + if(scalar(@regf_fsets) == 0){ + $self->report('WARNING:check_regbuild_strings found no regulatory FeatureSets (fine if '.$self->db->species.' your species does not have a regulatory build'); + } + else{ + + warn "Need to check/update regbuild.version and regbuild.initial_release_date regbuild.last_annotation_update"; + + + #How do we validate this? + #Check all feature_sets exist + #Pull back some features from a test slice and check the number of bits match. + #Check the feature_type string exists and matches else create. + + + foreach my $fset(@regf_fsets){ + $self->log_header("Validating regbuild_string entries for FeatureSets:\t".$fset->name); + + #Fail for old versions as we want to remove these + if( $fset->name =~ /_v[0-9]+$/){ + $self->report("FAIL:\t".$fset->name." is an old RegulatoryFeature set, please remove!"); + next; + } + + my $cell_type = (defined $fset->cell_type) ? $fset->cell_type->name : 'core'; + + #This has been lifted from build_regulatory_features.pl store_regbuild_strings + #Need to move this to a RegulatoryBuilder module + my $dset = $self->db->get_DataSetAdaptor->fetch_by_product_FeatureSet($fset); + + my @ssets = @{$dset->get_supporting_sets}; + + if(! @ssets){ + throw('You must provide a DataSet with associated supporting sets'); + } + + + + my %reg_strings = + ( + "regbuild.${cell_type}.feature_set_ids" => join(',', map { + $_->dbID} sort {$a->name cmp $b->name + } @ssets), + + "regbuild.${cell_type}.feature_type_ids" => join(',', map { + $_->feature_type->dbID} sort {$a->name cmp $b->name + } @ssets), + ); + + my @ffset_ids; + + #Skip this now as we use the ftype classes for defining the focus sets + #foreach my $fset(@ssets){ + + + # #This might fail if soem TFs haven't been included as focus i.e. are part of PolII/III + # #e.g. TFIIIC-110 +## +# if( ($fset->feature_type->class eq 'Transcription Factor') || +# ($fset->feature_type->class eq 'Open Chromatin') ){ +# push @ffset_ids, $fset->dbID; +# } +# } + + + my ($sql, %db_reg_string); + + foreach my $string_key(keys %reg_strings){ + my ($string)= $self->db->dbc->db_handle->selectrow_array("select string from regbuild_string where name='${string_key}' and species_id=$species_id"); + + if (! defined $string) { + $sql = "insert into regbuild_string (species_id, name, string) values ($species_id, '${string_key}', '$reg_strings{${string_key}}');"; + + $self->report("WARNING:\tInserting absent $string_key into regbuild_string table"); + eval { $self->db->dbc->do($sql) }; + die("Couldn't store $string_key in regbuild_string table\n$sql\n$@") if $@; + } + elsif ($string ne $reg_strings{$string_key}){ + $sql = "update regbuild_string set string='".$reg_strings{$string_key}."' where name='${string_key}';"; + + if($self->fix){ + $self->report("WARNING:\tUpdating mismatched $string_key found in regbuild_string table:\t${string}");#\tUpdate using:\t$sql"); + eval { $self->db->dbc->do($sql) }; + die("Couldn't update $string_key in regbuild_string table\n$sql\n$@") if $@; + + } + else{ + $self->report("FAIL:\tMismatched $string_key found in regbuild_string table:\t${string}\n\tUpdate using:\t$sql"); + } + } + + $db_reg_string{$string_key} = $string; + } + + + #Now need to tidy this block wrt new code added above + my $fset_string_key = "regbuild.${cell_type}.feature_set_ids"; + my $ftype_string_key = "regbuild.${cell_type}.feature_type_ids"; + my $fset_string = $db_reg_string{$fset_string_key}; + my $ftype_string = $db_reg_string{$ftype_string_key}; + + if(! ($fset_string && $ftype_string)){ + $self->report("FAIL:\tSkipping fset vs ftype string test for $cell_type") + } + else{ + + #This is now effectively handled by the loop above + + $self->log("Validating :\t$fset_string_key vs $ftype_string_key"); + + my @fset_ids = split/,/, $fset_string; + my @ftype_ids = split/,/, $ftype_string; + my @new_ftype_ids; + my $ftype_fail = 0; + + #Now need to work backwards through ftypes to remove pseudo ftypes before validating + #New string should be A,A,A;S,S,S,S,S,S;P,P,P + #Where A is and Anchor/Seed set + #S is a supporting set + #P is a pseudo feature type e.g. TSS proximal + + + if(scalar(@fset_ids) != scalar(@ftype_ids)){ + $self->report("FAIL:\tLength mismatch between:\n\t$fset_string_key(".scalar(@fset_ids).")\t$fset_string\n\tAND\n\t$ftype_string_key(".scalar(@ftype_ids).")\t$ftype_string"); + } + + foreach my $i(0..$#fset_ids){ + my $supporting_set_id = $fset_ids[$i]; + my $sset = $fset_a->fetch_by_dbID($supporting_set_id); + + if(! defined $sset){ + $self->report("FAIL:\t$fset_string_key $supporting_set_id does not exist in the DB"); + } + else{ + #test/build ftype string + + if(defined $ftype_string){ + + if($sset->feature_type->dbID != $ftype_ids[$i]){ + $ftype_fail = 1; + $self->report("FAIL:\t$fset_string_key $supporting_set_id(".$sset->name.") FeatureType(".$sset->feature_type->name.") does not match $ftype_string_key $ftype_ids[$i]"); + } + } + + push @new_ftype_ids, $sset->feature_type->dbID; + + } + } + + + #Set ftype_string + #This will not account for pseudo ftypes? Remove!!!? + my $new_ftype_string = join(',', @new_ftype_ids); + + if(! defined $ftype_string){ + $self->log("Updating $ftype_string_key to:\t$new_ftype_string"); + $self->db->dbc->db_handle->do("INSERT into regbuild_string(species_id, name, string) values($species_id, '$ftype_string_key', '$new_ftype_string')"); + } + elsif($ftype_fail){ + $self->report("FAIL:\t$ftype_string_key($ftype_string) does not match $fset_string_key types($new_ftype_string)"); + } + + + #Finally validate versus a reg feat + #Need to change this to ftype string rather than fset string? + my $id_row_ref = $self->db->dbc->db_handle->selectrow_arrayref('select regulatory_feature_id from regulatory_feature where feature_set_id='.$fset->dbID.' limit 1'); + + if(! defined $id_row_ref){ + $self->report("FAIL:\tNo RegulatoryFeatures found for FeatureSet ".$fset->name); + } + else{ + my ($regf_dbID) = @$id_row_ref; + my $rf_string = $regf_a->fetch_by_dbID($regf_dbID)->binary_string; + + if(length($rf_string) != scalar(@fset_ids)){ + $self->report("FAIL:\tRegulatory string length mismatch between RegulatoryFeature($regf_dbID) and $fset_string_key:\n$rf_string(".length($rf_string).")\n$fset_string(".scalar(@fset_ids).")"); + } + } + } + } + } + + return; +} + + +#Change this to log sets and incorporate RegFeat FeatureSet as standard +#Grab all reg fsets +#grab all displayable data sets which aren't reg sets? + +sub log_data_sets{ + my $self = shift; + + my $dset_adaptor = $self->db->get_DataSetAdaptor; + my ($status); + my $txt = 'Checking '; + $status = 'DISPLAYABLE' if($self->{'check_displayable'}); + $txt.= $status.' ' if $status; + $txt .= 'DataSets'; + $self->log_header($txt); + + #Check for status first to avoid warning from BaseAdaptor. + eval { $dset_adaptor->_get_status_name_id($status) }; + + if($@){ + $self->report("FAIL: You have specified check_displayable, but the DISPLAYABLE status_name is not present in the DB"); + return; + } + + + my @dsets; + my $dsets = $dset_adaptor->fetch_all($status); + @dsets = @$dsets if defined $dsets; + + + $self->log('Found '.scalar(@dsets).' DataSets'); + + + foreach my $dset(@dsets){ + $self->log_set("DataSet:\t\t", $dset) ; + + my $fset = $dset->product_FeatureSet; + $self->log_set("Product FeatureSet:\t", $fset) if $fset; + + my @supporting_sets = @{$dset->get_supporting_sets}; + + $self->log('Found '.scalar(@supporting_sets).' supporting sets:'); + + if(my @supporting_sets = @{$dset->get_supporting_sets}){ + #type here could be result, experimental or feature + #and feature could be annotated or experimental + #Move this to log set? + + map { my $type = ref($_); + $type =~ s/.*://; + $type .= '('.$_->feature_class.')' if($type eq 'FeatureSet'); + #Need to sprintf $type here to fixed width + $self->log_set($type.":\t", $_)} @supporting_sets; + } + $self->log(); + } + + return; +} + +sub log_set{ + my ($self, $text, $set) = @_; + + $text .= $set->display_label.'('.$set->name.')'; + $text .= "\tDISPLAYABLE" if($set->is_displayable); + $self->log($text); + + return; +} + + +sub check_stable_ids{ + my ($self, @slices) = @_; + + my $species_id = $self->db()->species_id(); + + $self->log_header('Checking stable IDs'); + + my $fset_a = $self->db->get_FeatureSetAdaptor; + + my @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')}; + + if(!@regf_fsets){ + $self->report('WARNING: No regulatory FeatureSets found (fine if '.$self->db->species.' does not have a regulatory build)'); + } + else{ + + foreach my $fset(@regf_fsets){ + + if($fset->name =~ /_v[0-9]$/){ + $self->log("Skipping stable_id test on archived set:\t".$fset->name); + next; + } + + #Can't count NULL field, so have to count regulatory_feature_id!!! + + #getting SR product here!! + my $sql = "select count(rf.regulatory_feature_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.stable_id is NULL and rf.seq_region_id = sr.seq_region_id and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.feature_set_id=".$fset->dbID; + + + + my ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; + + if($null_sids){ + $self->report("FAIL: Found a total of $null_sids NULL stable IDs for ".$fset->name); + + my $slice_a = $self->db->get_SliceAdaptor; + + if(! @slices){ + @slices = @{$slice_a->fetch_all('toplevel', 1)}; + } + + foreach my $slice(@slices){ + my $sr_name=$slice->seq_region_name; + $sql = 'select count(rf.stable_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.seq_region_id=sr.seq_region_id and sr.name="'.$sr_name.'" and sr.coord_system_id = cs.coord_system_id and cs.species_id = '.$species_id.' and rf.stable_id is NULL and rf.feature_set_id='.$fset->dbID; + ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; + + #This is not reporting properly. + + $self->log($fset->name.":\t$null_sids NULL stable IDs on ".$slice->name) if $null_sids; + } + } + else{ + $self->log($fset->name.":\tNo NULL stable IDs found"); + } + } + } + + return; + +} + + +#This is for mart to enable them to join to the seq_region table without +#creating a product from all the reundant seq_region entries for each schema_build + +sub set_current_coord_system{ + my ($self) = @_; + + + + my $schema_build = $self->db->_get_schema_build($self->db->dnadb); + $self->log_header("Setting current coord_system on $schema_build"); + + my $sql = "update coord_system set is_current=False where schema_build !='$schema_build'"; + $self->db->dbc->do($sql); + $sql = 'update coord_system set is_current=True where schema_build ="'.$schema_build.'" and attrib like "%default_version%"'; + $self->db->dbc->do($sql); + + return; +} + +sub validate_DataSets{ + my $self = shift; + + $self->log_header('Validating DataSets'); + + + #checks regualtory feature data and supporting sets + #links between DataSet and FeatureSet, i.e. correct naming, not linking to old set + #Displayable, DAS_DISPLAYABLE, IMPORTED_ASSM, MART_DISPLAYABLE + #Naming of result_set should match data/feature_set + #warn non-attr feature/result sets which are DISPLAYABL + #warn about DISPLAYABLE sets which do not have displayable set in analysis_description.web_data + + + my $fset_a = $self->db->get_FeatureSetAdaptor; + my $dset_a = $self->db->get_DataSetAdaptor; + my ($dset_states, $rset_states, $fset_states) = $self->get_regbuild_set_states($self->db); + + my %rf_fsets; + my %set_states; + my $sql; + + RF_FSET: foreach my $rf_fset(@{$fset_a->fetch_all_by_type('regulatory')}){ + my $rf_fset_name = $rf_fset->name; + + + $self->log("Validating $rf_fset_name"); + + + $rf_fsets{$rf_fset_name} = $rf_fset;#Do we only need the name for checking the dsets independantly? + + if($rf_fset_name =~ /_v[0-9]+$/){ + $self->report("FAIL:\tFound archived regulatory FeatureSet:\t$rf_fset_name"); + next RF_FSET; + } + + foreach my $state(@$fset_states){ + + if(! $rf_fset->has_status($state)){ + $self->report("WARNING:\tUpdating FeatureSet $rf_fset_name with status $state"); + + $sql = 'INSERT into status select '. $rf_fset->dbID. + ", 'feature_set', status_name_id from status_name where name='$state'"; + + $self->db->dbc->db_handle->do($sql); + } + } + + #Do we need to warn about other states? + + + + my $rf_dset = $dset_a->fetch_by_product_FeatureSet($rf_fset); + + if(! $rf_dset){ + $self->report("FAIL:\tNo DataSet for FeatureSet:\t$rf_fset_name"); + next RF_FSET; + } + + + + if($rf_fset_name ne $rf_dset->name){ + $self->report("FAIL:\tFound Feature/DataSet name mismatch:\t$rf_fset_name vs ".$rf_dset->name); + next RF_FSET; + } + + + foreach my $state(@$dset_states){ + + if(! $rf_dset->has_status($state)){ + #Can we just update and warn here? + #Or do this separately in case we want some control over this? + $self->report("WARNING:\tUpdating DataSet $rf_fset_name with status $state"); + + $sql = 'INSERT into status select '.$rf_dset->dbID. + ", 'data_set', status_name_id from status_name where name='$state'"; + $self->db->dbc->db_handle->do($sql); + } + } + + + + #We should check fset ctype matches all attr_set ctypes? + #May have problems if we want to merge two lines into the same ctype build + + + + foreach my $ra_fset(@{$rf_dset->get_supporting_sets}){ + + foreach my $state(@$fset_states){ + + if(! $ra_fset->has_status($state)){ + #Can we just update and warn here? + #Or do this separately in case we want some control over this? + $self->report("WARNING:\tUpdating FeatureSet ".$ra_fset->name." with status $state"); + + $sql = 'INSERT into status select '.$ra_fset->dbID. + ", 'feature_set', status_name_id from status_name where name='$state'"; + $self->db->dbc->db_handle->do($sql); + } + } + + + + my $ra_dset = $dset_a->fetch_by_product_FeatureSet($ra_fset); + my @ssets = @{$ra_dset->get_supporting_sets(undef, 'result')}; + my @displayable_sets; + + foreach my $sset(@ssets){ + + if($sset->has_status('DISPLAYABLE')){ + push @displayable_sets, $sset; + } + } + #Change this to get all then check status + #else print update sql + + if(scalar(@displayable_sets) > 1){#There should only be one + $self->report("FAIL:\tThere should only be one DISPLAYABLE supporting ResultSet for DataSet:\t".$ra_dset->name); + } + elsif(scalar(@displayable_sets) == 0){ + + my $msg; + + if(scalar(@ssets) == 1){ + + #fix here? + + $msg = "Found unique non-DISPLAYABLE ResultSet:\t".$ssets[0]->name. + "\n\tinsert into status select ".$ssets[0]->dbID. + ", 'result_set', status_name_id from status_name where name='DISPLAYABLE';"; + } + else{ + $msg = "Found ".scalar(@ssets)." ResultSets ".join("\t", map($_->name, @ssets)); + } + + $self->report("FAIL:\tThere are no DISPLAYABLE supporting ResultSets for DataSet:\t". + $ra_dset->name."\n$msg"); + + + + next; #$ra_fset + } + + my $ra_rset = $ssets[0]; + + foreach my $state(@$rset_states){ + + if(! $ra_rset->has_status($state)){ + #Can we just update and warn here? + #Or do this separately in case we want some control over this? + $self->report("WARNING:\tUpdating ResultSet ".$ra_rset->name." with status $state"); + + $sql = 'INSERT into status select '.$ra_rset->dbID. + ", 'result_set', status_name_id from status_name where name='$state'"; + $self->db->dbc->db_handle->do($sql); + } + } + } + } + + return; +} # End of validate_DataSets + + + + + + +sub analyse_and_optimise_tables{ + my $self = shift; + + #myisamchk --analyze. or analyze statement + + if($self->{'skip_analyse'}){ + $self->log_header('Skipping analyse/optimise tables'); + return; + } + + $self->log_header("Analysing and optimising tables"); + + + my $sql = 'show tables;'; + my @tables = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + map $_ = "@{$_}", @tables; + my $analyse_sql = 'analyze table '; + my $optimise_sql = 'optimize table '; + + + + foreach my $table(@tables){ + $self->log("Analysing and optimising $table"); + + + + #Remove analyse as optimise does everything this does + my @anal_info = @{$self->db->dbc->db_handle->selectall_arrayref($analyse_sql.$table)}; + + foreach my $line_ref(@anal_info){ + my $status = $line_ref->[3]; + $self->report("FAIL: analyse $table status $status") if (!($status eq 'OK' || $status eq 'Table is already up to date')); + } + + my @opt_info = @{$self->db->dbc->db_handle->selectall_arrayref($optimise_sql.$table)}; + + foreach my $line_ref(@opt_info){ + + my $status = $line_ref->[3]; + $self->report("FAIL: optimise $table status $status") if (!( $status eq 'OK' || $status eq 'Table is already up to date')); + } + + } + + return; +}# end of analyse_and_optimise_tables + + +sub clean_xrefs{ + my ($self) = @_; + + if($self->{'skip_xrefs'}){ + $self->log_header('Skipping clean_xrefs'); + return; + } + + $self->log_header("Cleaning unlinked xref records"); + + my $sql = 'DELETE x FROM xref x LEFT JOIN object_xref ox ON ox.xref_id = x.xref_id WHERE ox.xref_id IS NULL'; + #Should this also take accoumt of unmapped_objects? + #No, as unmapped_object doesn't use xref, but probably should + + my $row_cnt = $self->db->dbc->do($sql); + + $self->reset_table_autoinc('xref', 'xref_id', $self->db); + $row_cnt = 0 if $row_cnt eq '0E0'; + $self->log("Deleted $row_cnt unlinked xref records"); + + + #Now remove old edbs + $self->log_header("Cleaning unlinked external_db records"); + + #Need to account for xref and unmapped_object here + $sql = 'DELETE edb FROM external_db edb '. + 'LEFT JOIN xref x ON x.external_db_id = edb.external_db_id '. + 'LEFT JOIN unmapped_object uo ON uo.external_db_id=edb.external_db_id '. + 'WHERE x.external_db_id IS NULL and uo.external_db_id is NULL'; + $row_cnt = $self->db->dbc->do($sql); + + $self->reset_table_autoinc('external_db', 'external_db_id', $self->db); + $row_cnt = 0 if $row_cnt eq '0E0'; + $self->log("Deleted $row_cnt unlinked external_db records"); + + + #Shouldn't clean orphaned oxs here as this means a rollback been done underneath the ox data + #or we have xref_id=0! + #Leave this to HC? + + + + + + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/Helper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/Helper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2388 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Utils::Helper + +=head1 SYNOPSIS + + + e.g. + + + my $object = Bio::EnsEMBL::Object->new + ( + logging => 1, + log_file => "/tmp/Misc.log", + debug_level => 2, + debug_file => "/tmp/Misc.dbg", + ); + + $object->log("This is a log message."); + $object->debug(1,"This is a debug message."); + $object->system("rmdir /tmp/test"); + + + ---------------------------------------------------------------------------- + + +=head1 OPTIONS + +=over 8 + + +=item B<-debug> + +Turns on and defines the verbosity of debugging output, 1-3, default = 0 = off + +=over 8 + +=item B<-log_file|l> + +Defines the log file, default = "${instance}.log" + +=item B<-help> + +Print a brief help message and exits. + +=item B<-man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B performs several debugging and logging functions, aswell as providing several inheritable EFGUtils methods. + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Utils::Helper; + +use Bio::Root::Root; +use Data::Dumper; +use Bio::EnsEMBL::Utils::Exception qw (throw stack_trace); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (get_date); +#use Devel::Timer; +use Carp;#? Can't use unless we can get it to redirect +use File::Basename; + + +use strict; +use vars qw(@ISA); +@ISA = qw(Bio::Root::Root); + +#List of valid rollback levels +#To be used in conjunction with -full_delete +my @rollback_tables = ('data_set', 'feature_set', 'result_set', 'input_set', 'experiment', 'array', 'array_chip', 'experimental_chip'); + +#Some local filevars to avoid assigning to package typeglobs +my ($DBGFILE, $LOGFILE); + +################################################################################ + +=head2 new + + Description : Constructor method to create a new object with passed or + default attributes. + + Arg [1] : hash containing optional attributes :- + log_file - name of log file (default = undef -> STDOUT) + debug_level - level of detail of debug message [1-3] (default = 0 = off) + debug_file - name of debug file (default = undef -> STDERR) + + ReturnType : Helper + + Example : my $Helper = new Bio::EnsEMBL::Helper( + debug_level => 3, + debug_file => "/tmp/efg.debug", + log_file => "/tmp/efg.log", + ); + + Exceptions : throws exception if failed to open debug file + : throws exception if failed to open log file + +=cut + +################################################################################ + +#To do , change to rearrange + +sub new{ + my ($caller, %args) = @_; + + my ($self, %attrdata, $argname); + my $class = ref($caller) || $caller; + + #Create object from parent class + $self = $class->SUPER::new(%args); + + #we need to mirror ensembl behaviour here + #use rearrange and set default afterwards if not defined + + # objects private data and default values + #Not all of these need to be in main + + %attrdata = ( + _tee => $main::_tee, + _debug_level => $main::_debug_level, + _debug_file => $main::_debug_file, + _log_file => $main::_log_file,#default should be set in caller + _no_log => $main::_no_log,#suppresses log file generation if log file not defined + _default_log_dir => $main::_default_log_dir, + ); + + # set each class attribute using passed value or default value + foreach my $attrname (keys %attrdata){ + ($argname = $attrname) =~ s/^_//; # remove leading underscore + $self->{$attrname} = (exists $args{$argname}) ? $args{$argname} : $attrdata{$attrname}; + } + + + $self->{'_tee'} = 1 if $self->{'_no_log'}; + #should we undef log_file here too? + #This currently only turns off default logging + + $self->{_default_log_dir} ||= $ENV{'HOME'}.'/logs'; + $self->{'_report'} = []; + + + # DEBUG OUTPUT & STDERR + + #should default to lowest or highest debug level here! + + if(defined $self->{_debug_level} && $self->{_debug_level}){ + $main::_debug_level = $self->{_debug_level}; + + if(defined $self->{_debug_file}){ + $main::_debug_file = $self->{_debug_file}; + + open($DBGFILE, '>>', $self->{_debug_file}) + or throw("Failed to open debug file : $!"); + + #open (DBGFILE, "{_debug_file});#Mirrors STDERR to debug file + } + else{ + open($DBGFILE, '>&STDERR'); + } + + select $DBGFILE; $| = 1; # make debug file unbuffered + + $self->debug(1,"Debugging started ".localtime()." on $0 at level ".$self->{_debug_level}." ..."); + } + + my $log_file = $self->{_log_file}; + + + # LOG OUTPUT + if (defined $self->{_log_file}){ + + #This causes print on unopened file as we try and log in the DESTROY + throw('You have specified mutually exclusive parameters log_file and no_log') if($self->{'_no_log'}); + $main::_log_file = $self->{_log_file}; + + #we need to implment tee here + if($self->{'_tee'}){ + open($LOGFILE, ' | tee -a '.$log_file); + } + else{ + open($LOGFILE, '>>', $log_file) + or throw("Failed to open log file : $log_file\nError: $!"); + } + } + else{ + #Change this to get the name of the control script and append with PID.out + #This is to ensure that we always capture output + #We need to also log params + #We will have to call this from the child class. + + + #Only do this if we don't have supress default logs set + #To avoid loads of loags during testing + if(! $self->{'_no_log'}){ + + my @stack = stack_trace(); + my $top_level = $stack[$#stack]; + my (undef, $file) = @{$top_level}; + $file =~ s/.*\///; + + $self->run_system_cmd('mkdir '.$self->{_default_log_dir}) if(! -e $self->{_default_log_dir}); + $self->{'_log_file'} = $self->{_default_log_dir}.'/'.$file.'.'.$$.'.log'; + warn "No log file defined, defaulting to:\t".$self->{'_log_file'}."\n"; + + #we should still tee here + if($self->{'_tee'}){ + open($LOGFILE, '| tee -a '.$self->{'_log_file'}); + } + else{ + open($LOGFILE, '>', $self->{'_log_file'}) + or throw('Failed to open log file : '.$self->{'_log_file'}."\nError: $!"); + } + + } + else{ + #Have to include STD filehandles in operator + open($LOGFILE, '>&STDOUT'); + } + } + + select $LOGFILE; $| = 1; # make log file unbuffered + $self->log("\n\nLogging started at ".localtime()."..."); + + # RESET STDOUT TO DEFAULT + select STDOUT; $| = 1; + + $self->debug(2,"Helper class instance created."); + + return $self; +} + + +################################################################################ + +=head2 DESTROY + + Description : Called by gargbage collection to enable tidy up before object deleted + + ReturnType : none + + Example : none - should not be called directly + + Exceptions : none + +=cut + +################################################################################ + +sub DESTROY{ + my ($self) = @_; + + + $self->report; + + if($self->{_log_file}){ + $self->log("Logging complete ".localtime()."."); + $self->log('Virtual Memory '.`ps -p $$ -o vsz |tail -1`); + $self->log('Resident Memory '.`ps -p $$ -o rss |tail -1`); + + + + + # close LOGFILE; # if inherited object then cannot close filehandle !!! + } + + if($self->{_debug_level}){ + $self->debug(1,"Debugging complete ".localtime()."."); + # close DBGFILE; # if inherited object then cannot close filehandle !!! + } + + if(defined $self->{'_timer'}){ + $self->{'_timer'}->report(); + } + + $self->debug(2,"Bio::EnsEMBL::Helper class instance destroyed."); + + return; +} + + + + +##Need generic method in here to get stack and line info +###Use Root.pm stack methods! +# and replace this with caller line method for logging +sub _get_stack{ + my ($self) = shift; + + + #need to resolve this method with that in debug, pass log or debug arg for different format + + my @prog = (caller(2)) ? caller(2) : (caller(1)) ? caller(1) : (undef,"undef",0); + + return "[".localtime()." - ".basename($prog[1]).":$prog[2]]"; +} + + +################################################################################ + +=head2 log + + Arg[0] : string - log message. + Arg[1] : boolean - memory usage, appends current process memory stats + Description : Method to write messages to a previously set up log file. + Return type : none + Example : $root->log("Processing file $filename ...", 1); + Exceptions : none + +=cut + +################################################################################ + +sub log{ + my ($self, $message, $mem, $date, $no_return) = @_; + + if($mem){ + $message.= " :: ".`ps -p $$ -o vsz |tail -1`; + chomp $message; + $message .= " KB"; + } + + if($date){ + my $time = localtime(); + chomp($time); + $message .= ' - '.localtime(); + } + + $message .= "\n" if ! $no_return; + + print $LOGFILE "::\t$message"; + + # Add to debug file if not printing to STDERR? + # only if verbose? + # this would double print everything to STDOUT if tee and debug has not redefined STDERR + + $self->debug(1,$message); +} + +################################################################################ + + +=head2 report + + Arg[0] : optional string - log message. + Arg[1] : optional boolean - memory usage, appends current process memory stats + Description : Wrapper method for log, which also stores message for summary reporting + Return type : none + Example : $root->report("WARNING: You have not done this or that and want it reported at the end of a script"); + Exceptions : none + +=cut + +################################################################################ + +sub report{ + my ($self, $message, $mem) = @_; + + if(defined $message){ + + $self->log($message, $mem); + + push @{$self->{'_report'}}, $message; + } + elsif(scalar(@{$self->{'_report'}})){ + print $LOGFILE "\n::\tSUMMARY REPORT\t::\n"; + print $LOGFILE join("\n", @{$self->{'_report'}})."\n"; + + $self->{'_report'} = []; + } + + return; +} + + + + + + +################################################################################ + +=head2 log_header + + Arg[0] : string - log message. + Arg[1] : boolean - memory usage, appends current process memory stats + Description : Wrapper method to format a log as a header line + Return type : none + Example : $root->log("Processing file $filename ...", 1); + Exceptions : none + +=cut + +################################################################################ + +sub log_header{ + my ($self, $message, $mem, $date) = @_; + + print $LOGFILE "\n\n"; + $self->log("::\t$message\t::\t::", $mem, $date); + print $LOGFILE "\n"; +} + + + + + +################################################################################ + +=head2 debug + + Description : Method to write debug info to a previously set up debug file. + Over-rides Root.pm on/off style debugging + + Args : int: debug level and string: log message. + + ReturnType : none + + Example : $root->debug(2,"dir=$dir file=$file"); + + Exceptions : none + +=cut + +################################################################################ + +sub debug{ + my ($self,$level,$message) = @_; + + + + #Can we not detect whther message is a scalar, array or hash and Dump or print accordingly? + + my (@call,$cnt,$prog_name,$prog_line,$call_name,$call_line); + + $prog_name = $call_name = "undef"; + $prog_line = $call_line = $cnt = 0; + + # if debug on at the requested level then output the passed message + if (defined $self->{_debug_level} && $level <= $self->{_debug_level}){ + + ######Replace this with Carp method? + while (@call = caller($cnt++)){ + + if ($cnt == 2){ + $call_name = basename($call[1]); + $call_line = $call[2] + } + + $prog_name = basename($call[1]); + $prog_line = $call[2]; + } + + #This still attempts to print if file not opened + print $DBGFILE "debug $message\t: [$$ - $prog_name:$prog_line $call_name:$call_line]\n"; + + #carp("carping $message"); + } +} + + +################################################################################ + +=head2 debug_hash + + Description : Method to write the contents of passed hash to debug output. + + Args : int: debug level and hashref. + + ReturnType : none + + Example : $Helper->debug_hash(3,\%hash); + + Exceptions : none + +=cut + +################################################################################ + +sub debug_hash{ + my ($self,$level,$hashref) = @_; + + my ($attr); + + # if debug on at the requested level then output the passed hash + if (defined $self->{_debug_level} && $level <= $self->{_debug_level}){ + print $DBGFILE Data::Dumper::Dumper(\$hashref)."\n"; + } +} + + + +################################################################################ + +=head2 run_system_cmd + + Description : Method to control the execution of the standard system() command + + ReturnType : none + + Example : $Helper->debug(2,"dir=$dir file=$file"); + + Exceptions : throws exception if system command returns none zero + +=cut + +################################################################################ + + +#Move most of this to EFGUtils.pm +#Maintain wrapper here with throws, only warn in EFGUtils + +sub run_system_cmd{ + my ($self, $command, $no_exit) = @_; + + my $redirect = ''; + + $self->debug(3, "system($command)"); + + # decide where the command line output should be redirected + + #This should account for redirects + #This just sends everything to 1 no? + + if (defined $self->{_debug_level} && $self->{_debug_level} >= 3){ + + if (defined $self->{_debug_file}){ + $redirect = " >>".$self->{_debug_file}." 2>&1"; + } + else{ + $redirect = ""; + } + } + else{ + #$redirect = " > /dev/null 2>&1"; + } + + # execute the passed system command + my $status = system("$command $redirect"); + my $exit_code = $status >> 8; + + if ($status == -1) { + warn "Failed to execute: $!\n"; + } + elsif ($status & 127) { + warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without'); + } + elsif($status != 0) { + warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code + } + + if ($exit_code != 0){ + + if (! $no_exit){ + throw("System command failed:\t$command\nExit code:\t$exit_code\n$!"); + } + else{ + warn("System command returned non-zero exit code:\t$command\nExit code:\t$exit_code\n$!"); + } + } + + #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/ + + return $exit_code; +} + + +#add sys_get method ehre to handle system calls which retrieve data? +#i.e.backtick commands `find . -name *fasta` +#or use want or flag with above method? +#should open pipe instead to capture error? + +sub get_data{ + my ($self, $data_type, $data_name) = @_; + + #This method is just to provide standard checking for specific get_data/config methods + + if(defined $data_name){ + throw("Defs data name $data_name for type '$data_type' does not exist\n") if (! exists $self->{"${data_type}"}{$data_name}); + }else{ + throw("Defs data type $data_type does not exist\n") if (! exists $self->{"${data_type}"}); + } + + return (defined $data_name) ? $self->{"${data_type}"}{$data_name} : $self->{"${data_type}"}; +} + + +#sub Timer{ +# my ($self) = shift; + +# $self->{'_timer'} = new Devel::Timer() if(! defined $self->{'_timer'}); + +# return $self->{'_timer'}; + +#} + + +sub set_header_hash{ + my ($self, $header_ref, $fields) = @_; + + my %hpos; + + for my $x(0..$#{$header_ref}){ + $hpos{$header_ref->[$x]} = $x; + } + + + if($fields){ + + foreach my $field(@$fields){ + + if(! exists $hpos{$field}){ + throw("Header does not contain mandatory field:\t${field}"); + } + } + } + + return \%hpos; +} + +#Move this to EFGUtils? + +sub backup_file{ + my ($self, $file_path) = @_; + + throw("Must define a file path to backup") if(! $file_path); + + if (-f $file_path) { + $self->log("Backing up:\t$file_path"); + system ("mv ${file_path} ${file_path}.".`date '+%T'`); + } + + return; + +} + +#This should move to Utils +#as it is a simple string manipulation + +sub get_schema_and_build{ + my ($self, $dbname) = @_; + my @dbname = split/_/, $dbname; + return [$dbname[($#dbname -1)], $dbname[($#dbname )]]; +} + +=head2 get_regbuild_set_states + + Arg [1] : Bio::EnsEMBL::DBAdaptor + Example : my ($dset_states, $rset_states, $fset_states) = $helper->get_regbuild_set_states($db); + Description: Returns Array refs of appropriate states for sets use din the regulatory build + Returntype : Array + Exceptions : Warns if cannot find chromosome CoordSystem + Caller : HealthChecker & regulatory build code + Status : At risk + +=cut + + +sub get_regbuild_set_states{ + my ($self, $db) = @_; + + my $cs_a = $db->get_CoordSystemAdaptor; + + #These states need to be mirrored in RegulatorySets.java + + my $chrom_cs = $cs_a->fetch_by_name('chromosome'); + my (@dset_states, @rset_states, @fset_states); + + if(! defined $chrom_cs){ + #This species most likely does not have a regbuild + #really just need to get the 'highest' level here + warn "Could not find Chromosome CoordSystem. ".$db->dbc->dbname.". most likely does not contain a RegulatoryBuild"; + } + else{ + my $imp_cs_status = 'IMPORTED_'.$cs_a->fetch_by_name('chromosome')->version; + + #What about non-chromosome assemblies? + #top level will not return version...why not? + @dset_states = ('DISPLAYABLE'); + @rset_states = (@dset_states, 'DAS_DISPLAYABLE', $imp_cs_status); + @fset_states = (@rset_states, 'MART_DISPLAYABLE'); + } + + return (\@dset_states, \@rset_states, \@fset_states); +} + + + +=head2 define_and_validate_sets + + Arg [1] : hash - set constructor parameters: + -dbadaptor Bio::EnsEMBL::Funcgen::DBAdaptor + -name Data/FeatureSet/ResultSet name to create + -feature_type Bio::EnsEMBL::Funcgen::FeatureType + -cell_type Bio::EnsEMBL::Funcgen::CellType + -analysis FeatureSet Bio::EnsEMBL::Analysis + -feature_class e.g. annotated or regulatory + -description FeatureSet description + -recovery Allows definition of extant sets so long as they match + -append Boolean - Forces import on top of previously imported data + -rollback Rolls back product feature set. + -supporting_sets Complete set of pre-stored supporting or input sets for this DataSet + -slices ARRAYREF of Slices to rollback + Example : my $dset = $self->define_and_validate_Set(%params); + Description: Checks whether set is already in DB based on set name, rolls back features + if roll back flag set. Or creates new DataSet and Feature|ResultSet if not present. + Returntype : Bio::EnsEMBL::Funcgen::DataSet + Exceptions : Throws if DBAdaptor param not valid + Caller : Importers and Parsers + Status : At risk + +=cut + +sub define_and_validate_sets{ + my $self = shift; + + #change slice to slices to support multi slice import from InputSet::define_sets + #Can't do full rollback in slice mode + #This may not be safe in slice mode as we will then have mixed inputs/outputs + + my ($name, $anal, $ftype, $ctype, $type, $append, $db, $ssets, $description, $rollback, $recovery, $slices, $display_label) = rearrange(['NAME', 'ANALYSIS', 'FEATURE_TYPE', 'CELL_TYPE', 'FEATURE_CLASS', 'APPEND', + 'DBADAPTOR', 'SUPPORTING_SETS', 'DESCRIPTION', 'ROLLBACK', 'RECOVERY', 'SLICES', 'DISPLAY_LABEL'], @_); + + + #VALIDATE CONFIG HASH + #$config_hash ||= {};#default so exists will work without testing + #if(keys %{$config_hash}){ + # #There is a module to handle config hashes somewhere! + # throw('config_hash not yet implemented for define_and_validate_sets'); + #my @known_config = ('full_delete');#We never want full delete here as this is a create method! + #Can we set vars from has by refs like getopts? + #map { + # throw("Found unsupported config hash parameter:\t$_") if ! grep(/^${_}$/, @known_config); + #} keys %{$config_hash}; + # } + + #define rollback level + #extract this to _set_rollback_level($rollback_mode, $feature_class) + my $rollback_level = 0; + + #These should be globally defined so all rollback methods can use them + my %valid_rollback_modes = + ( + product_features => 1, + #Just product features and FeatureSet status, what about DataSet status? + #full delete does nothing here? + + sets => 2, + #Includes product_features and + #deletes supporting_sets entries unless we specify append + #revoke all states on Feature/Data/InputSets + #Full delete removes Feature/Data/InputSet entries + #Never includes ResultSets! + + supporting_features => 3, + #Includes product_feature and sets + #Removes all states and supporting features + #inc. ResultSet results/ResultFeatures + #Full_delete remove supporting set entries + #Otherwise just rollback states for affected sets + ); + + if($rollback){ + if(! exists $valid_rollback_modes{$rollback}){ + #Default to some sensible values + $rollback = 'product_features';#default for FeatureSets + + #Always want overwrite supporting sets if there is a difference + $rollback = 'sets' if ($type eq 'regulatory'); + $rollback = 'supporting_sets' if ($type eq 'result'); + + warn ("You have not set a valid rollback mode(product_features|sets|supporting_features), defaulting to $rollback for feature class $type\n"); + } + + $rollback_level = $valid_rollback_modes{$rollback}; + } + + + if($slices && (ref($slices) ne 'ARRAY')){ + throw('-slices param must be an ARRAYREF of Bio::EnsEMBL::Slice objects'); + #Rest of validation done in other methods + } + + + + #But how are we going to resolve the append behaviour when we also want to validate the ssets? + #Can't, so append also functions to enable addition in the absence of some or all previous data/esets? + #No this is not true, we want to be able to fetch an extant set for import, + #we just need to be aware of sset IMPORTED status? + #This should be a recovery thing, allow fetch, but validate sets? + + + #Check mandatory params + if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + throw('Must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + throw('Must provide a -name ') if(! defined $name); + + #Not necessarily, just do rollback then append? + #But then we'd potentially have a supporting set associated which has had it's data removed from the feature set. + #Generating sets for an ExpSet will always have append set + #This could be valid for generically grabing/creating sets for adding new supporting sets e.g. reg build + throw('-append and -rollback are mutually exclusive') if $rollback_level && $append; + + #This will never happen due to previous test? append will always fail? + #warn('You are defining a pre-existing FeatureSet without rolling back'. + # ' previous data, this could result in data duplication') if $append && ! $rollback_level; + #Is this really possible, surely the supporting set will fail to store due to unique key? + + + #Should we warn here about append && recovery? + #Aren't these mutually exclusive? + #Do we know if we have new data? append should override recovery, or just specifiy append + #This will stop the import and highlight the issue to the user + #We need to be able to run with both otherwise the import will not work + + + throw('Must provide a -feature_class e.g. annotated, external, result or regulatory') if(! defined $type); + #Check for annotated, external, regulatory etc here? + #Should never be external as we don't have DataSets for external sets? + + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureType', $ftype); + if (defined $ctype){ + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::CellType', $ctype); + } + elsif($type ne 'regulatory'){ + throw('Only Data/FeatureSets with type \'regulatory\' can have an undefined CellType'); + #Coudl extend this to core set by name eq 'RegulatoryFeatures'? + } + + $db->is_stored_and_valid('Bio::EnsEMBL::Analysis', $anal); + + my $dset_adaptor = $db->get_DataSetAdaptor; + my $fset_adaptor = $db->get_FeatureSetAdaptor; + my $rset_adaptor = $db->get_ResultSetAdaptor; + + #DataSet centric definition to enable multiple DataSets + #to be generated from the same supporting sets + my $dset = $dset_adaptor->fetch_by_name($name); + my ($fset, $rset, @input_sets); + + #Validate stored vs passed set data + + if(defined $dset){ + $self->log('Found Stored DataSet '.$dset->name); + + if($type ne 'result'){#i.e. annotated + + #Does this account for regulatory? + + $fset = $dset->product_FeatureSet; + #Here we have the possiblity that a feature_set with a different name may have + #been associated with the DataSet + + if(defined $fset){ + $self->log("Found associated product FeatureSet:\t".$fset->name); + + #if(! $clobber && + if($fset->name ne $name){ + throw('Invalid product FeatureSet name ('.$fset->name.') for DataSet ('.$name.'). Rollback will overwrite the FeatureSet and mismatched name will be retained.'); + #Need to clobber both or give explicit name for datasets or rename dataset??? + #Force this throw for now, make this fix manual as we may end up automatically overwriting data + } + } + + #This needs to be modified to support InputSets in ResultSets? + #Would never have mixed Input/ResultSets so no need + #Could potential need to do it for mixed Result/FeatureSets + #if we ever use an analysis which uses both set types + + #check supporting_sets here if defined + #We have the problem here of wanting to add ssets to a previously existing dset + #we may not know the original sset, or which of the ssets are new + #Hence there is a likelihood of a mismatch. + + #Much of this is replicated in store_udpated sets + + + if(defined $ssets){ + my @sorted_ssets = sort {$a->dbID <=> $b->dbID} @{$ssets}; + my @stored_ssets = sort {$a->dbID <=> $b->dbID} @{$dset->get_supporting_sets}; + my $mismatch = 0; + + $mismatch = 1 if(scalar(@sorted_ssets) != scalar(@stored_ssets)); + + if(! $mismatch){ + + for my $i(0..$#stored_ssets){ + + if($stored_ssets[$i]->dbID != $sorted_ssets[$i]->dbID){ + $mismatch=1; + last; + } + } + } + + + + + if($mismatch){ + #We're really print this names here which may hide the true cell/feature/anal type differences. + my $mismatch = 'There is a (name/type/analysis) mismatch between the supplied supporting_sets and the'. + ' supporting_sets in the DB for DataSet '.$dset->name."\n\nStored:\n" + .join(', ', (map { $_->name } @stored_ssets))."\n\nSupplied supporting_sets:\n" + .join(', ', (map { $_->name } @sorted_ssets)); + + + if($append){ + warn($mismatch."\n\nAppending supporting set data to unvalidated supporting sets"); + } + elsif($rollback_level > 1){#supporting set rollback + warn($mismatch."\n\nReplacing previously stored supporting sets with newly defined sets\n"); + + if($slices){ + warn("WARNING:\tPerforming supporting_set rollback in slice mode. This may corrupt the supporting_set definition for other slices in this DataSet if they are not re-generated using the same supporting_sets\n"); + } + + #Remove supporting_set entries + #This should be in a rollback_DataSet method + #This has moved to DataSetAdaptor::store_update_sets + + #Reset supporting sets + $dset->{'supporting_sets'} = undef; + $dset->add_supporting_sets(\@sorted_ssets); + #Move this to last block? + #This will currently fail as it test for product_FeatureSet + #How do we get around this? Remove IMPORTED status and only throw if fset has IMPORTED status? + + #warn "pre store sset ".@{$dset->get_supporting_sets}; + + #($dset) = @{$dset_adaptor->store_updated_sets([$dset], $rollback_level)}; + #$dset->adaptor->store_regbuild_meta_strings($dset, $rollback_level) if $type eq 'regulatory'; + } + else{ + throw($mismatch); + } + } + } + else{ + warn("No supporting sets defined, skipping supporting set validation for definition of DataSet:\t".$name); + } + } + else{#result_features from InputSet + #Do we ever pass supporting sets here? + #Do we need to test vs stored_sets? + + + #There is the potential for more than one ResultSet to be associated with DataSet + #But as we are using the same name, this restricts the number wrt the cardinality + #of the name field. i.e. 1 name per analysis/cell_type/feature_type. + #This now works slightly differently to the rest of this method as we + #need to treat the ResultSet as we are currently treating the FeatureSet below. + + #However, the use case of this method is for one InputSet giving rise to one ResultSet + #Hence just throw if we find more than one or have a name mismatch??? + my @stored_sets = @{$dset->get_supporting_sets}; + + + + #THis assumes we will always have supporting sets + #and is failing as we have removed this test in DataSet::new + #But where are we storing it without the supporting set? + + if(scalar(@stored_sets) > 1){ + throw('define_and_validate_sets does not yet support DataSets with multiple supporting ResultSets for result_features'); + } + elsif(! @stored_sets){ + throw("DataSet($name) does not have any stored supporting sets. These should have been defined when storing the DataSet"); + #Or should we handle this? + } + + $rset = $stored_sets[0]; + + if($rset->set_type ne 'result'){ + throw("DataSet already contains a supporting set which is not a ResultSet:\t".$rset->set_type."\t".$stored_sets[0]->name); + } + elsif($ssets){ + #Do we ever pass supporting sets, test for completeness + + #Just test we have the same supplied ssets if it is defined + if(scalar(@$ssets) != 1){ + throw("ResultFeature data sets currently only support one supporting ResultSet.\nSupproting sets:\t". + join(', ', (map { $_->name.'('.$_->set_type } @$ssets))); + } + elsif(! ($rset->dbID == $ssets->[0]->dbID) && + ($ssets->[0]->set_type eq 'result')){ + throw('Supplied supporting set('.$ssets->[0]->name.') does not match stored supporting set('.$rset->name.')'); + } + } + + @input_sets = @{$rset->get_InputSets}; + } + } + + + + if($type eq 'result'){ + + #Validate the defined InputSets + if (scalar(@$ssets) > 1) { + throw("define_and_validate_sets does not yet support multiple InputSets for defining a ResultSet:\t".$name); + + } + + if ($ssets->[0]->set_type ne 'input') { + throw("To define a ResultSet($name) containing result_features, you must provide and InputSet as a supporting set\nArray based ResultSets(i.e. experimental_chip/channel) are not defined using this method, see specific Import Parsers."); + } + + + #Try and grab the rset just in case it has been orphaned somehow + if (! defined $rset) { + $rset = $rset_adaptor->fetch_all_by_name($name, $ftype, $ctype, $anal)->[0]; + #Should only ever be one given all parts of unique key + @input_sets = @{$rset->get_InputSets} if $rset; + + } + + + if (defined $rset) { #Validate stored InputSets + + if (scalar(@input_sets) != scalar(@$ssets)) { + throw('Found mismatch between number of previously stored InputSets('.scalar(@input_sets).') and defined InputSets('.scalar(@$ssets).'). You must provide a complete list of InputSets to define your ResultSet.'); + } + + if ($input_sets[0]->dbID != $ssets->[0]->dbID) { + throw('Found dbID mismatch between previously stored InputSet('.$input_sets[0]->name.') and define InputSet('.$ssets->[0]->name.')'); + } + + #rollback ResultSet/InputSet here? + if($rollback_level > 2){ + warn "rollback not yet fully implemented for Result/InputSets"; + + #Does this need to be by slice? + #What about states if we are running in parallel? + + if($slices){ + map {$self->rollback_ResultSet($rset, $rollback, $_)} @$slices; + } + else{ + $self->rollback_ResultSet($rset, $rollback); + } + + } + + } + else{#define ResultSet + ($rset) = @{$rset_adaptor->store(Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -name => $name, + -feature_type => $ftype, + -cell_type => $ctype, + -table_name => 'input_set', + -table_id => $ssets->[0]->dbID, + -analysis => $anal + ) + )}; + + } + } + else{#annotated/regulatory/external i.e. FeatureSet + + #Try and grab the fset just in case it has been orphaned somehow + if(! defined $fset){ + $fset = $fset_adaptor->fetch_by_name($name); + + if(defined $fset){ + #Now we need to test whether it is attached to a dset + #Will be incorrect dset if it is as we couldn't get it before + #else we test the types and rollback + $self->log("Found stored orphan FeatureSet:\t".$fset->name); + + my $stored_dset = $dset_adaptor->fetch_by_product_FeatureSet($fset); + + if(defined $stored_dset){ + throw('Found FeatureSet('.$name.') associated with incorrect DataSet('.$stored_dset->name. + ").\nTry using another -name in the set parameters hash"); + + } + } + } + + #Rollback or create FeatureSet + if(defined $fset){ + + if($rollback_level){ + #Don't check for IMPORTED here as we want to rollback anyway + #Not forcing delete here as this may be used as a supporting set itself. + + $self->rollback_FeatureSet($fset, undef, $slices); + } + elsif ($append || $recovery) { + #This is only true if we have an sset mismatch + + #Do we need to revoke IMPORTED here too? + #This behaves differently dependant on the supporting set. + #InputSet status refers to loading in FeatureSet, where as ResultSet status refers to loading into result table + + #So we really want to revoke it + #But this leaves us vulnerable to losing data if the import crashes after this point + #because we have no way of assesing which is complete data and which is incomplete data + #within a feature set. + #This means we need a status on supporting_set, not InputSet or ResultSet + #as this has to be in the context of a dataset. + #Grrr, this means we need a SupportingSet class which simply wraps the InputSet/ResultSet + #We also need a single dbID for the supporting_set table + #Which means we will have to do some wierdity with the normal dbID implementation + #i.e. Have supporting_set_id, so we can still access all the normal dbID method for the given Set class + #This will have to be hardcoded into the state methods + #Also will need to specify when we want to store as supporting_status or normal set status. + + #This is an awful lot to protect against vulnerability + #Also as there easy way to track what features came from which supporting set + #There isn't currently a viable way to rollback, hence will have to redo the whole set. + + #Maybe we can enforce this by procedure? + #By simply not associating the supporting set until it has been loaded into the feature set? + #This may cause even more tracking problems + + #Right then, simply warn and do not revoke feature_set IMPORTED to protect old data? + #Parsers should identify supporting_sets(InputSets) which exist but do not have IMPORTED + #status and fail, specifying -recover which will rollback_FeatureSet which will revoke the IMPORTED status + + #This can mean a failed import can leave a partially imported feature set with the IMPORTED status!!! + + #We just need to handle InputSets and ResultSets differently. + #In parsers or here? + #Probably best in the parsers as this is where the states are set. + + + #Should we throw here for ResultSet? + #Force rollback of FeatureSet first or create new one? + #And throw for InputSet? + #This again comes back to whether we will ever have more than one file + #for a give InputSet, currently not. + + $self->log("WARNING\t::\tAdding data to a extant FeatureSet:\t".$fset->name); + } else { + throw('Found extant FeatureSet '.$fset->name.'. Maybe you want to specify the rollback, append or recovery parameter or roll back the FeatureSet separately?'); + } + } else { + #create a new one + $self->log("Creating new FeatureSet:\t".$name); + + $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new( + -name => $name, + -feature_type => $ftype, + -cell_type => $ctype, + -analysis => $anal, + -feature_class => $type, + -description => $description, + -display_label => $display_label, + ); + ($fset) = @{$fset_adaptor->store($fset)}; + } + } + + #Create/Update the DataSet + if(defined $dset){ + #Could do these updates above? + #But delayed to reduce redundancy + + if($type ne 'result'){ + + if(! defined $dset->product_FeatureSet){ + $self->log("Updating DataSet with new product FeatureSet:\t".$fset->name); + $dset->product_FeatureSet($fset); + } + + $dset = $dset_adaptor->store_updated_sets([$dset], $rollback_level)->[0]; + #This cannot store the focus sets as we don't know which are which yet + #Only the script knows this + # $dset->adaptor->store_regbuild_meta_strings($dset, $rollback_level) if $type eq 'regulatory'; + } + else{ + #We may have the case where we have a DataSet(with a FeatureSet) but no ResultSet + #i.e. Load result_features after peak calls + #So update dset with ResultSet + + if(! @{$dset->get_supporting_sets}){ + $self->log("Updating DataSet with new ResultSet:\t".$rset->name); + $dset->add_supporting_sets([$rset]); + $dset = $dset_adaptor->store_updated_sets([$dset], $rollback_level)->[0]; + } + } + } + else{ + $self->log("Creating new ${type}_feature DataSet:\t".$name); + + if($type ne 'result'){ + ($dset) = @{$dset_adaptor->store(Bio::EnsEMBL::Funcgen::DataSet->new + ( + -name => $name, + -feature_set => $fset, + -supporting_sets => $ssets, + ))}; + #$dset->adaptor->store_regbuild_meta_strings($dset, $rollback_level) if $type eq 'regulatory'; + } + else{ + warn "creating dataset $name with supporting set $rset"; + ($dset) = @{$dset_adaptor->store(Bio::EnsEMBL::Funcgen::DataSet->new + ( + -name => $name, + -supporting_sets => [$rset], + ))}; + } + } + + return $dset; +} + + +#Rollback/load methods migrated from DBAdaptor +#Move to SetAdaptors, better located and will remove cyclical dependancy + +=head2 rollback_FeatureSet + + Arg [0] : Bio::EnsEMBL::Funcgen::FeatureSet + Arg [1] : optional - boolean force delete flag, if this FeatureSet is use as a support + for another DataSet. + Arg [2] : optional - arrayref of Bio::EnsEMBL::Slice objects to rollback + Arg [3] : optional - boolean flag to perform full rollback i.e. default will just remove feature + specifying this with also delete the feature_set record + Example : $self->rollback_FeatureSet($fset); + Description: Deletes all status and feature entries for this FeatureSet. + Checks whether FeatureSet is a supporting set in any other DataSet. + Returntype : none + Exceptions : Throws if any deletes fails or if db method unavailable + Caller : Importers and Parsers + Status : At risk + +=cut + + +sub rollback_FeatureSet{ + my ($self, $fset, $force_delete, $slices, $full_delete) = @_; + + #Remove force delete and just throw? + #Currently only used in project_feature_set. + #May want to keep an old RegBuild for mapping/comparison? + #Coudl get around this by simply deleting the data_set? Unknown impact. + #Move to config hash? + #No need for rollback_level here as we always want to do the same thing + + my ($sql, $slice_name); + my $slice_join = ''; + my $table = $fset->feature_class.'_feature'; + my $adaptor = $fset->adaptor || throw('FeatureSet must have an adaptor'); + my $db = $adaptor->db; + #Cyclical dpendancy here, so not strictly necessary. + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset); + + + $self->log_header('Rolling back '.$fset->feature_class." FeatureSet:\t".$fset->name); + + if($slices){ + + if($full_delete){ + throw("Cannot specify a full_delete for a Slice based rollback:\t".$fset->name); + } + + + if(! ref($slices) eq 'ARRAY'){ + throw('Slices must be an ARRAYREF of Slice objects'); + } + + map { throw("Must pass a valid Bio::EnsEMBL::Slice") if (! (ref($_) && $_->isa('Bio::EnsEMBL::Slice'))) } @$slices; + $self->log("Restricting to slices:\n\t\t".join("\n\t\t", (map { $_->name } @$slices)) ); + #Allow subslice rollback only for one slice at a time + my $subslice = (scalar(@$slices) == 1) ? 1 : 0; + my @sr_ids; + + foreach my $slice(@$slices){ + my $efg_sr_id = $fset->get_FeatureAdaptor->get_seq_region_id_by_Slice($slice); + + if(! $efg_sr_id){ + $self->log("Slice is not present in eFG DB:\t".$slice->name); + }else{ + + if(! $subslice){#Test is not subslice + my $full_slice = $slice->adaptor->fetch_by_region(undef, $slice->seq_region_name); + + if(($slice->start != 1) || + ($full_slice->end != $slice->end)){ + throw("Can only rollback subslices one at a time:\nRollback slice:\t" + .$slice->name."\nFull slice:\t".$full_slice->name); + } + } + + push @sr_ids, $efg_sr_id; + } + } + + if(scalar(@sr_ids) == 1){ + #Allow sub slice rollback + #add range here from meta coord? + $slice_join = " and f.seq_region_id=$sr_ids[0] and f.seq_region_start<=".$slices->[0]->end.' and f.seq_region_end>='.$slices->[0]->start; + } + else{ + $slice_join = ' and f.seq_region_id in ('.join(', ', @sr_ids).')'; + } + } + + + + #Check whether this is a supporting set for another data_set + my @dsets = @{$db->get_DataSetAdaptor->fetch_all_by_supporting_set($fset)}; + + if(@dsets){ + my $txt = $fset->name." is a supporting set of the following DataSets:\t".join(', ', (map {$_->name} @dsets)); + + if($force_delete){ + $self->log("WARNING:\t$txt\n"); + } + else{ + throw($txt."\nPlease resolve or specify the force_delete argument") + } + } + + #Remove states + if(! $slices){ + $fset->adaptor->revoke_states($fset); + + #Revoke InputSet states here as this refers to whether + #they are imported in the FeatureSet + #Do this in FeatureSet->revoke_states? + + my $dset = $db->get_DataSetAdaptor->fetch_by_product_FeatureSet($fset); + + #Account for absent dset if we have an external_feature set + + if((! defined $dset) && + $fset->feature_class ne 'external'){ + warn "WARNING:\tFeatureSet ".$fset->name." does not have an associated DataSet. Rollback may be incomplete"; + } + + if($dset){ + + foreach my $sset(@{$dset->get_supporting_sets}){ + #Maybe skip this if we defined slice? + + #??? Do we want to do this? + #This is dependant on the feature_class of the InputSet + #result InputSets may have been imported as ResultFeatureCollections + #So we want to leave those in place + #annotated feature_class InputSets are directly imports, so the status of these refers + #to the FeatureSet import status + #Where is the imported status set for SWEmbl? + + if(($sset->feature_class eq 'annotated') && + $sset->isa('Bio::EnsEMBL::Funcgen::InputSet')){ + + $self->rollback_InputSet($sset) if $sset->isa('Bio::EnsEMBL::Funcgen::InputSet'); + $self->rollback_InputSet($sset);#add full delete here? + #Do not want to rollback here for other type of sset + } + } + } + } + else{ + $self->log('Skipping '.$fset->name.' revoke_states for partial Slice rollback, maybe revoke IMPORTED? '); + } + + #should add some log statements here? + + my $row_cnt; + + #Rollback reg attributes + if($fset->feature_class eq 'regulatory'){ + $sql = "DELETE ra from regulatory_attribute ra, $table f where f.${table}_id=ra.${table}_id and f.feature_set_id=".$fset->dbID.$slice_join; + $self->rollback_table($sql, 'regulatory_attribute', undef, $db); + + + + if($full_delete){ + #Now delete meta entries + #This is messy as we use the following meta_key nomencalture + #which do not match the fset names + #regbuild.feature_set_ids_v5 + #regbuild.feature_type_ids_v5 + #regbuild.focus_feature_set_ids + #regbuild.initial_release_date_v6 + #regbuild.last_annotation_update_v6 + #regbuild.version NEED TO ADD THIS + #Also need to revise how these are generated by build_reg_feats. + #WHat about new cell_type level feature sets? + #How will we model these in the meta table? + + warn "Need to revise meta table entries before we add a delete here, remove manually for now for:\t".$fset->name; + + #We would only remove meta entries if we are performing a full rollback + my $version; + ($version = $fset->name) =~ s/.*_v([0-9]+)$/$1/; + $version = ($version eq $fset->name) ? '' : "_v${version}"; + + #These are versionless meta_keys and apply to all sets + #handle these in reg build script + #'regbuild.initial_release_date', + #'regbuild.last_annotation_update' + #'regbuild.version' + + foreach my $mkey('regbuild.%s.feature_set_ids', + 'regbuild.%s.feature_type_ids', + 'regbuild.%s.focus_feature_set_ids'){ + + my $meta_key = sprintf($mkey, $fset->cell_type->name).$version; + $sql = "DELETE from meta where meta_key='${meta_key}'"; + $self->rollback_table($sql, 'meta', undef, $db); + } + } + } + + + #Need to remove object xrefs here + #Do not remove xrefs as these may be used by something else! + $sql = "DELETE ox from object_xref ox, $table f where ox.ensembl_object_type='".ucfirst($fset->feature_class)."Feature' and ox.ensembl_id=f.${table}_id and f.feature_set_id=".$fset->dbID.$slice_join; + $self->rollback_table($sql, 'object_xref', 'object_xref_id', $db); + + + #Remove associated_feature_type records + #Do not remove actual feature_type records as they may be used by something else. + + $sql ="DELETE aft from associated_feature_type aft, $table f where f.feature_set_id=".$fset->dbID." and f.${table}_id=aft.table_id and aft.table_name='".$fset->feature_class."_feature'".$slice_join; + $self->rollback_table($sql, 'associated_feature_type', undef, $db); + + + + #Remove features + $sql = "DELETE f from $table f where f.feature_set_id=".$fset->dbID.$slice_join; + $self->rollback_table($sql, $table, "${table}_id", $db); + + if($full_delete){ #Also delete feature/data_set records + + $sql = "DELETE from feature_set where feature_set_id=".$fset->dbID; + $self->rollback_table($sql, 'feature_set', 'feature_set_id', $db); + $self->log("Deleted feature_set entry for:\t".$fset->name); + + + $sql = "DELETE from data_set where feature_set_id=".$fset->dbID; + $self->rollback_table($sql, 'data_set', 'data_set_id', $db); + $self->log("Deleted associated data_set entry for:\t".$fset->name); + } + + return; +} + + +=head2 rollback_ResultSet + + Arg[1] : Bio::EnsEMBL::Funcgen::ResultSet + Arg[2] : Boolean - optional flag to roll back array results + Example : $self->rollback_ResultSet($rset); + Description: Deletes all status. chip_channel and result_set entries for this ResultSet. + Will also rollback_results sets if rollback_results specified. This will also + update or delete associated ResultSets where appropriate. + Returntype : Arrayref containing the ResultSet and associated DataSet which have not been rolled back + Exceptions : Throws if ResultSet not valid + Throws is result_rollback flag specified but associated product FeatureSet found. + Caller : General + Status : At risk + +=cut + +#Need to change slice to slices ref here +#Need to add full rollback, which will specify to remove all sets +#as well as results and +#These params need clarifying as their nature changes between input_set and array rsets +#Don't we always want to rollback_results? +#force should only really be used to rollback InputSet ResultFeature sets +#i.e. Read collections which are not used as direct input for the linked product FeatureSet +#This should fail with array data associated with a product feature set + +#Do we want to separate ResultFeature rollback from result rollback? +#Currently the array based collection rollback is done by hand +#Could be done via the ResultFeature Collector, but should probably use this method. + + +#rollback_results is only used in the MAGE parser to identify sets which have an +#associated product fset. +#Can't really separate due to integrated functionality + +sub rollback_ResultSet{ + my ($self, $rset, $rollback_results, $slice, $force, $full_delete) = @_; + + if(! (ref($rset) && $rset->can('adaptor') && defined $rset->adaptor)){ + throw('Must provide a valid stored Bio::EnsEMBL::ResultSet'); + } + + if($slice && $rset->table_name ne 'input_set'){ + throw('Can only rollback_ResultSet by Slice if the ResultSet contains InputSets'); + } + + #We're still validating against itself?? + #And reciprocating part of the test :| + my $sql; + my $db = $rset->adaptor->db;#This needs to be tested + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); + $self->log("Rolling back ResultSet:\t".$rset->name); + my $dset_adaptor = $self->db->get_DataSetAdaptor; + my $rset_adaptor = $self->db->get_ResultSetAdaptor; + my @skipped_sets; + + ### Check if this ResultSet is part of a DataSet with a product feature set + + foreach my $dset(@{$dset_adaptor->fetch_all_by_supporting_set($rset)}){ + + if (defined $dset){ + $self->log('Found linked DataSet('.$dset->name.") for ResultSet:\t".$rset->log_label); + + if(my $fset = $dset->product_FeatureSet){ + @skipped_sets = ($rset,$dset); + + #What impact does this have on result_rollback? + #None as we never get there + #But what if we have specified rollback results? + #We should throw here as we can't perform the rollback + + if($rollback_results){ + + if($rset->table_name ne 'input_set' || + (! $force)){#is an input_set/reads collection + #This will always throws for non-input_set ResultSets + + throw("Could not rollback supporting ResultSet and results for:\t".$rset->log_label. + "\nEither manually resolve the supporting/feature set relationship or set the 'force' flag.\n"); + # ."Alternatively omit the rollback_results argument if you simply want to redefine the ResultSet without loading any new data"); + #This last bit is no longer true + #Remove rollback_results? + } + else{ + @skipped_sets = (); + $self->log("Forcing results rollback for InputSet based ResultSet:\t".$rset->log_label); + } + } + + if(@skipped_sets){ + $self->log('Skipping rollback. Found product FeatureSet('.$fset->name.") for supporting ResultSet:\t".$rset->log_label); + } + + } + elsif((! defined $slice) && + $full_delete){ + #Found rset in dset, but not yet processed so can remove safely. + $self->unlink_ResultSet_DataSet($rset, $dset); + } + } + } + + + #Now do similar for all associated ResultSets + if(! @skipped_sets){ + + + #Rollback results if required + if($rollback_results){ + + $self->log("Rolling back results for ResultSet:\t".$rset->log_label); + #Check result_set_input_ids are present in other result sets. + my @assoc_rsets = @{$rset_adaptor->fetch_all_linked_by_ResultSet($rset)}; + my $feature_supporting = 0; + + foreach my $assoc_rset(@assoc_rsets){ + + foreach my $dset(@{$dset_adaptor->fetch_all_by_supporting_set($assoc_rset)}){ + + #Check for other product_FeatureSets + if(my $fset = $dset->product_FeatureSet){ + $feature_supporting++; + $self->log('Found product FeatureSet('.$fset->name. + ") for associated supporting ResultSet:\t".$rset->log_label); + + if($rset->table_name ne 'input_set' || + (! $force)){#is an input_set/reads collection + $feature_supporting++; + } + } + } + } + + + if(! $feature_supporting){ + + #RollBack result_feature table first + $self->rollback_ResultFeatures($rset, $slice); + + #Now rollback other states + $rset->adaptor->revoke_states($rset); + + + #This also handles Echip status rollback + if ($rset->table_name ne 'input_set'){ + $self->log("Rolling back result table for ResultSet:\t".$rset->log_label); + $self->rollback_results($rset->result_set_input_ids); + } + + $self->log('Removing result_set_input entries from associated ResultSets') if @assoc_rsets; + + if((! $slice) && + $full_delete){ + + #Now remove result_set_input_ids from associated rsets. + foreach my $assoc_rset(@assoc_rsets){ + $sql = 'DELETE from result_set_input where result_set_id='.$assoc_rset->dbID. + ' and result_set_input_id in('.join', ', @{$assoc_rset->result_set_input_ids}.')'; + $db->dbc->do($sql); + + # we need to delete complete subsets from the result_set table. + my $subset = 1; + + foreach my $cc_id(@{$assoc_rset->result_set_input_ids}){ + + if(! grep { /$cc_id/ } @{$rset->result_set_input_ids}){ + $subset = 0; + last; + } + } + + #$assoc_rset is complete subset of $rset so can delete + #We know this does not have an assoicated product feature set + #Only if it is not derived from an input_set + if($subset){ + $self->log("Deleting associated subset ResultSet:\t".$assoc_rset->log_label); + + #Delete status entries first + $assoc_rset->adaptor->revoke_states($assoc_rset); + + #All cc records will have already been deleted + $sql = 'DELETE from result_set where result_set_id='.$assoc_rset->dbID; + $db->dbc->do($sql); + } + } + } + + + #Now warn about Echips in Experiments which may need removing. + if($rset->table_name ne 'input_set'){ + my %experiment_chips; + + foreach my $echip(@{$rset->get_ExperimentalChips}){ + $experiment_chips{$echip->experiment->name}{$echip->unique_id} = undef; + } + + foreach my $exp(keys %experiment_chips){ + $self->log("Experiment $exp has had ".scalar(values %{$experiment_chips{$exp}}). + " ExperimentalChips rolled back:\t".join('; ', values %{$experiment_chips{$exp}}). + ".\nTo fully remove these, use the rollback_experiment.pl (with -chip_ids) script"); + } + } + else{ + #Should only be one to rollback + foreach my $iset(@{$rset->get_InputSets}){ + $self->rollback_InputSet($iset); + } + } + } + else{ + #$self->log("Skipping result rollback, found $feature_supporting associated supporting ResultSets for:\t".$rset->log_label); + #warn("Skipping result rollback, found $feature_supporting associated supporting ResultSets for:\t".$rset->log_label); + #do we need to return this info in skipped_rsets? + #This is just to allow importer to know which ones + #weren't rolled back to avoid naming clashes. + #so no. + + #But the results persist on the same chip_channel_ids + #So not returning this rset may result in loading of more data + #This should fail as status entries will not have been removed + #Still we should throw here as we'll most likely want to manually resolve this + #Besides this would be obfuscating the function + + throw("Could not rollback ResultSet and results, found $feature_supporting associated supporting ". + "ResultSets for:\t".$rset->log_label."\nManually resolve the supporting/feature set relationship or omit the ". + "rollback_results argument if you simply want to redefine the ResultSet without loading any new data"); + } + } + else{ + $self->log('Skipping results rollback'); + + if($rset->name =~ /_IMPORT$/){ + throw("Rolling back an IMPORT set without rolling back the result can result in ophaning result records for a whole experiment. Specify the result_rollback flag if you want to rollback the results for:\t".$rset->log_label); + } + } + + #Delete chip_channel and result_set records + #This should only be done with full delete + if((! $slice) && + $full_delete){ + $sql = 'DELETE from result_set_input where result_set_id='.$rset->dbID; + $self->rollback_table($sql, 'result_set_input', 'result_set_input_id', $db); + + $sql = 'DELETE from result_set where result_set_id='.$rset->dbID; + $db->dbc->do($sql); + $self->rollback_table($sql, 'result_set', 'result_set_id', $db); + } + } + + return \@skipped_sets; +} + + + +sub unlink_ResultSet_DataSet{ + my ($self, $rset, $dset, $new_name) = @_; + + #validate set vars + + my $db = $rset->adaptor->db; + + $self->log("Removing supporting ResultSet from DataSet:\t".$dset->name."\tResultSet:".$rset->log_label); + my $sql = 'DELETE from supporting_set where data_set_id='.$dset->dbID. + ' and type="result" and supporting_set_id='.$rset->dbID; + + warn "Removing ".$rset->log_label." as a supporting set to DataSet:\t".$dset->name. + "\nThis may result in a DataSet with no supporting sets"; + $db->dbc->do($sql); + + if($new_name){ + #We risk overwriting any previously renamed result sets. + #Should use datestamp? + $sql = 'UPDATE result_set set name="OLD_'.$rset->name.'" where result_set_id='.$rset->dbID; + $self->db->dbc->do($sql); + + if($dset->product_FeatureSet){ + $self->log('Associated DataSet('.$dset->name.') has already been processed. It is not wise to replace a supporting set without first rolling back the FeatureSet, as there may be additional supporting data'); + warn 'Associated DataSet('.$dset->name.') has already been processed. It is not wise to replace a supporting set without first rolling back the FeatureSet, as there may be additional supporting data'; + } + } + + return; +} + +=head2 rollback_InputSet + + Arg[1] : Bio::EnsEMBL::Funcgen::InputSet + Example : $self->rollback_InputSet($eset); + Description: Deletes all status entries for this InputSet and it's Subsets + Returntype : none + Exceptions : Throws if any deletes fails or if db method unavailable + Caller : Importers and Parsers + Status : At risk + +=cut + + +sub rollback_InputSet{ + my ($self, $eset, $force_delete, $full_delete) = @_; + + + #Need to implement force_delete!!!!!!!!!!!!!!!!!!!!!! + #Need to check this is not used in a DataSet/ResultSet + + my $adaptor = $eset->adaptor || throw('InputSet must have an adaptor'); + my $db = $adaptor->db; + + + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::InputSet', $eset); + + $self->log("Rolling back InputSet:\t".$eset->name); + + #SubSets + foreach my $esset(@{$eset->get_InputSubsets}){ + $esset->adaptor->revoke_states($esset); + } + + #InputSet + $eset->adaptor->revoke_states($eset); + + return; +} + + +=head2 rollback_results + + Arg[1] : Arrayref of chip_channel ids + Example : $self->rollback_results($rset->chip_channels_ids); + Description: Deletes all result records for the given chip_channel ids. + Also deletes all status records for associated experimental_chips or channels + Returntype : None + Exceptions : Throws if no chip_channel ids provided + Caller : General + Status : At risk + +=cut + +#changed implementation to take arrayref + +sub rollback_results{ + my ($self, $cc_ids) = @_; + + my @cc_ids = @{$cc_ids}; + + #Need to test for $self->db here? + + + if(! scalar(@cc_ids) >0){ + throw('Must pass an array ref of result_set_input_ids to rollback'); + } + + #Rollback status entries + #Cannot use revoke_states here? + #We can if we retrieve the Chip or Channel first + #Add to ResultSet adaptor + my $sql = 'DELETE s from status s, result_set_input rsi WHERE rsi.result_set_input_id IN ('.join(',', @cc_ids). + ') AND rsi.table_id=s.table_id AND rsi.table_name=s.table_name'; + + if(! $self->db->dbc->do($sql)){ + throw("Status rollback failed for result_set_input_ids:\t@cc_ids\n".$self->db->dbc->db_handle->errstr()); + } + + + #Rollback result entries + $sql = 'DELETE from result where result_set_input_id in ('.join(',', @cc_ids).');'; + $self->rollback_table($sql, 'result', 'result_id', $self->db); + return; +} + + +=head2 rollback_ResultFeatures + + Arg[0] : Bio::EnsEMBL::Funcgen::ResultSet + Arg[1] : Optional - Bio::EnsEMBL::Slice + Arg[2] : Optional - no_revoke Boolean. This is only used when generating new windows + from a 0 window size which has been projected from a previous assembly. + Example : $self->rollback_result_features($rset); + Description: Deletes all result_feature records for the given ResultSet. + Also deletes 'RESULT_FEATURE_SET' status. + Returntype : None + Exceptions : Throws if ResultSet not provided + Caller : General + Status : At risk + +=cut + + +sub rollback_ResultFeatures{ + my ($self, $rset, $slice, $no_revoke) = @_; + + if(! (ref($rset) && $rset->can('adaptor') && defined $rset->adaptor)){ + throw('Must provide a valid stored Bio::EnsEMBL::ResultSet'); + } + + if(! $slice && $no_revoke){ + throw("Cannot rollback_ResultFeatures with no_reovke unless you specify a Slice"); + } + #else warn if slice and no_revoke? + + my ($sql, $slice_name, $slice_constraint); + + if($slice){ + + if(ref($slice) && $slice->isa('Bio::EnsEMBL::Slice')){ + my $sr_id = $rset->adaptor->db->get_ResultFeatureAdaptor->get_seq_region_id_by_Slice($slice); + + if($sr_id){ + + #Need to test for full slice here + my $full_slice = $slice->adaptor->fetch_by_region(undef, $slice->seq_region_name); + $slice_name = "\t".$slice->name; + $slice_constraint = ' and seq_region_id='.$sr_id; + + if(($slice->start != 1) || + ($slice->end != $full_slice->end)){ + + throw("rollback_ResultFeatures does not yet support non-full length Slices:\t".$slice_name); + + #Need to test whether we have non-0 wsize collections without the exact seq_region values + #$sql='SELECT window_size from result_feature where result_feature_id='.$rset->dbID. + # ' and window_size!=0 and seq_region_start!='.$slice->start.' and seq_region_end!='.$slice->end.$slice_constraint; + } + } + else{#seq_region is not yet present in DB + return; + } + } + else{ + throw('slice argument must be a valid Bio::EnsEMBL::Slice'); + } + } + + #We're still validating against itself?? + #And reciprocating part of the test :| + my $db = $rset->adaptor->db; + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset); + + #Do this conditionally on whether it is a result_feature_set? + #This may break if we have removed the status but not finished the rollback so no! + $self->log("Rolling back result_feature table for ResultSet:\t".$rset->name.$slice_name); + + #Rollback status entry + if($rset->has_status('RESULT_FEATURE_SET') && ! $no_revoke){ + $rset->adaptor->revoke_status('RESULT_FEATURE_SET', $rset); + } + + #Cannot use revoke_states here? + #We can if we retrieve the Chip or Channel first + #Add to ResultSet adaptor + $sql = 'DELETE from result_feature where result_set_id='.$rset->dbID.$slice_constraint; + $self->rollback_table($sql, 'result_feature', 'result_feature_id', $db); + + return; +} + + + +=head2 rollback_ArrayChips + + Arg[1] : ARRAYREF: Bio::EnsEMBL::Funcgen::ArrayChip objects + Example : $self->rollback_ArrayChips([$achip1, $achip2]); + Description: Deletes all Probes, ProbeSets, ProbeFeatures and + states associated with this ArrayChip + Returntype : None + Exceptions : Throws if ArrayChip not valid and stored + Throws if ArrayChips are not of same class + Caller : General + Status : At risk + +=cut + +#This should be tied to a CS id!!! +#And analysis dependant? +#We may not want to delete alignment by different analyses? +#In practise the slice methods ignore analysis_id for this table +#So we currently never use this! +#So IMPORTED status should be tied to CS id and Analysis id? + +sub rollback_ArrayChips{ + my ($self, $acs, $mode, $force, $keep_xrefs, $no_clean_up, $force_clean_up) = @_; + + #no_clean_up and force_clean_up allow analyze/optimize to be skipped until the last rollback + #We could get around this by specifying all ArrayChips for all formats at the same time? + #Need to implement in RollbackArrays + + $mode ||= 'probe'; + + if($mode && ($mode ne 'probe' && + $mode ne 'probe_feature' && + $mode ne 'ProbeAlign' && + $mode ne 'ProbeTranscriptAlign' && + $mode ne 'probe2transcript')){ + throw("You have passed an invalid mode argument($mode), you must omit or specify either 'probe2transcript', 'probe', 'ProbeAlign, 'ProbeTranscriptAlign' or 'probe_feature' for all of the Align output"); + } + + if($force && ($force ne 'force')){ + throw("You have not specified a valid force argument($force), you must specify 'force' or omit"); + } + + if($keep_xrefs && ($keep_xrefs ne 'keep_xrefs')){ + throw("You have not specified a valid keep_xrefs argument($keep_xrefs), you must specify 'keep_xrefs' or omit"); + } + + + if($keep_xrefs){ + + if($mode eq 'probe' || $mode eq 'probe2transcript'){ + throw("You cannot specify 'keep_xrefs' with mode $mode, you can only rollback features e.g. probe_feature, ProbeAlign or ProbeTranscriptAlign"); + } + + if($force){ + throw("You cannot 'force' delete the probe2transcript xrefs and 'keep_xrefs' at the same time. Please specify just one."); + } + } + + + + + my ($adaptor, $db, %classes); + + foreach my $ac(@$acs){ + $adaptor ||= $ac->adaptor || throw('ArrayChip must have an adaptor'); + $db ||= $adaptor->db; + $db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ArrayChip', $ac); + + if(! $ac->get_Array->class){ + throw('The ArrayChip you are trying to rollback does not have a class attribute'); + } + + + $classes{$ac->get_Array->class} = undef; + + #if($class && ($class ne $ac->get_Array->class)){ + # throw('You can only rollback_ArrayChips for ArrayChips with the same class'); + #} + } + + + #This is always the case as we register the association before we set the Import status + #Hence the 2nd stage of the import fails as we have an associated ExperimentalChip + #We need to make sure the ExperimentalChip and Channel have not been imported!!! + warn "NOTE: rollback_ArrayChips. Need to implement ExperimentlChip check, is the problem that ExperimentalChips are registered before ArrayChips imported?"; + #Check for dependent ExperimentalChips + #if(my @echips = @{$db->get_ExperimentalChipAdaptor->fetch_all_by_ArrayChip($ac)}){ +# my %exps; +# my $txt = "Experiment\t\t\t\tExperimentalChip Unique IDs\n"; + +# foreach my $ec(@echips){ +# $exps{$ec->get_Experiment->name} ||= ''; + +# $exps{$ec->get_Experiment->name} .= "\t".$ec->unique_id; +# } + +# map {$txt.= "\t".$_.":".$exps{$_}."\n"} keys %exps; + +# throw("Cannot rollback ArrayChip:\t".$ac->name. +# "\nFound Dependent Experimental Data:\n".$txt); +# } + + + my $ac_names = join(', ', (map { $_->name } @$acs)); + my $ac_ids = join(', ', (map { $_->dbID } @$acs)); + + + $self->log("Rolling back ArrayChips $mode entries:\t$ac_names"); + my ($row_cnt, $probe_join, $sql); + #$ac->adaptor->revoke_states($ac);#This need to be more specific to the type of rollback + my $species = $db->species; + + if(!$species){ + throw('Cannot rollback probe2transcript level xrefs without specifying a species for the DBAdaptor'); + } + #Will from registry? this return Homo sapiens? + #Or homo_sapiens + ($species = lc($species)) =~ s/ /_/; + + my $transc_edb_name = "${species}_core_Transcript"; + my $genome_edb_name = "${species}_core_Genome"; + + #Maybe we want to rollback ProbeAlign and ProbeTranscriptAlign output separately so we + #can re-run just one part of the alignment step. + + + #We want this Probe(Transcript)Align rollback available in the environment + #So we can do it natively and before we get to the RunnableDB stage, + #where we would be trying multiple rollbacks in parallel + #Wrapper script? + #Or do we keep it simple here and maintain probe_feature wide rollback + #And just the ProbeAlign/ProbeTranscriptAlign roll back in the environment? + + + #We can restrict the probe deletes using the ac_id + #We should test for other ac_ids using the same probe_id + #Then fail unless we have specified force delete + + #These should be deleted for all other modes but only if force is set? + #This may delete xrefs for other ArrayChips + + #The issues is if we need to specify force for one delete but don't want to delete something else? + #force should only be used to delete upto and including the mode specified + #no mode equates to probe mode + #if no force then we fail if previous levels/modes have xrefs etc... + + + #Let's grab the edb ids first and use them directly, this will avoid table locks on edb + #and should also speed query up? + + + if($mode eq 'probe2transcript' || + $force){ + + #Delete ProbeFeature UnmappedObjects + $self->log("Deleting probe2transcript ProbeFeature UnmappedObjects"); + $sql = "DELETE uo FROM analysis a, unmapped_object uo, probe p, probe_feature pf, external_db e WHERE a.logic_name ='probe2transcript' AND a.analysis_id=uo.analysis_id AND p.probe_id=pf.probe_id and pf.probe_feature_id=uo.ensembl_id and uo.ensembl_object_type='ProbeFeature' and uo.external_db_id=e.external_db_id AND e.db_name ='${transc_edb_name}' AND p.array_chip_id IN($ac_ids)"; + $self->rollback_table($sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up); + + + #Delete ProbeFeature Xrefs/DBEntries + $self->log("Deleting probe2transcript ProbeFeature Xrefs"); + $sql = "DELETE ox FROM xref x, object_xref ox, probe p, probe_feature pf, external_db e WHERE x.external_db_id=e.external_db_id AND e.db_name ='${transc_edb_name}' AND x.xref_id=ox.xref_id AND ox.ensembl_object_type='ProbeFeature' AND ox.ensembl_id=pf.probe_feature_id AND pf.probe_id=p.probe_id AND ox.linkage_annotation!='ProbeTranscriptAlign' AND p.array_chip_id IN($ac_ids)"; + $self->rollback_table($sql, 'object_xref', 'object_xref_id', $db, $no_clean_up); + + + #Probe/Set specific entries + for my $xref_object('Probe', 'ProbeSet'){ + $probe_join = ($xref_object eq 'ProbeSet') ? 'p.probe_set_id' : 'p.probe_id'; + + #Delete Probe/Set UnmappedObjects + + $self->log("Deleting probe2transcript $xref_object UnmappedObjects"); + + $sql = "DELETE uo FROM analysis a, unmapped_object uo, probe p, external_db e WHERE a.logic_name='probe2transcript' AND a.analysis_id=uo.analysis_id AND uo.ensembl_object_type='${xref_object}' AND $probe_join=uo.ensembl_id AND uo.external_db_id=e.external_db_id AND e.db_name='${transc_edb_name}' AND p.array_chip_id IN($ac_ids)"; + #.' and edb.db_release="'.$schema_build.'"'; + $self->rollback_table($sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up); + + #Delete Probe/Set Xrefs/DBEntries + $sql = "DELETE ox FROM xref x, object_xref ox, external_db e, probe p WHERE x.xref_id=ox.xref_id AND e.external_db_id=x.external_db_id AND e.db_name ='${transc_edb_name}' AND ox.ensembl_object_type='${xref_object}' AND ox.ensembl_id=${probe_join} AND p.array_chip_id IN($ac_ids)"; + $self->log("Deleting probe2transcript $xref_object xref records"); + $self->rollback_table($sql, 'object_xref', 'object_xref_id', $db, $no_clean_up); + } + } + elsif(! $keep_xrefs){#Need to check for existing xrefs if not force + #we don't know whether this is on probe or probeset level + #This is a little hacky as there's not way we can guarantee this xref will be from probe2transcript + #until we get the analysis_id moved from identity_xref to xref + #We are also using the Probe/Set Xrefs as a proxy for all other Xrefs and UnmappedObjects + #Do we need to set a status here? Would have problem rolling back the states of associated ArrayChips + + for my $xref_object('Probe', 'ProbeSet'){ + + $probe_join = ($xref_object eq 'ProbeSet') ? 'p.probe_set_id' : 'p.probe_id'; + + $row_cnt = $db->dbc->db_handle->selectrow_array("SELECT COUNT(*) FROM xref x, object_xref ox, external_db e, probe p WHERE x.xref_id=ox.xref_id AND e.external_db_id=x.external_db_id AND e.db_name ='${transc_edb_name}' and ox.ensembl_object_type='${xref_object}' and ox.ensembl_id=${probe_join} AND p.array_chip_id IN($ac_ids)"); + + if($row_cnt){ + throw("Cannot rollback ArrayChips($ac_names), found $row_cnt $xref_object Xrefs. Pass 'force' argument or 'probe2transcript' mode to delete"); + } + else{ + #$self->log("Found $row_cnt $xref_object Xrefs"); + } + } + } + + + #ProbeFeatures inc ProbeTranscriptAlign xrefs + + if($mode ne 'probe2transcript'){ + + if(($mode eq 'probe' && $force) || + $mode eq 'probe_feature' || + $mode eq 'ProbeAlign' || + $mode eq 'ProbeTranscriptAlign'){ + + + #Should really revoke some state here but we only have IMPORTED + + #ProbeTranscriptAlign Xref/DBEntries + + #my (@anal_ids) = @{$db->get_AnalysisAdaptor->generic_fetch("a.module='ProbeAlign'")}; + #Grrrr! AnalysisAdaptor is not a standard BaseAdaptor implementation + #my @anal_ids = @{$db->dbc->db_handle->selectall_arrayref('select analysis_id from analysis where module like "%ProbeAlign"')}; + #@anal_ids = map {$_= "@$_"} @anal_ids; + + if($mode ne 'ProbeAlign'){ + my $lnames = join(', ', (map { "'${_}_ProbeTranscriptAlign'" } keys(%classes))); + + $sql = "DELETE ox from object_xref ox, xref x, probe p, probe_feature pf, external_db e WHERE ox.ensembl_object_type='ProbeFeature' AND ox.linkage_annotation='ProbeTranscriptAlign' AND ox.xref_id=x.xref_id AND e.external_db_id=x.external_db_id and e.db_name='${transc_edb_name}' AND ox.ensembl_id=pf.probe_feature_id AND pf.probe_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; + $self->log("Deleting ProbeFeature Xref/DBEntry records for:\t$lnames"); + $self->rollback_table($sql, 'object_xref', 'object_xref_id', $db, $no_clean_up); + + + #Can't include uo.type='ProbeTranscriptAlign' in these deletes yet as uo.type is enum'd to xref or probe2transcript + #will have to join to analysis and do a like "%ProbeTranscriptAlign" on the the logic name? + #or/and ur.summary_description='Promiscuous probe'? + + $sql = "DELETE uo from unmapped_object uo, probe p, external_db e, analysis a WHERE uo.ensembl_object_type='Probe' AND uo.analysis_id=a.analysis_id AND a.logic_name in (${lnames}) AND e.external_db_id=uo.external_db_id and e.db_name='${transc_edb_name}' AND uo.ensembl_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; + + $self->log("Deleting UnmappedObjects for:\t${lnames}"); + $self->rollback_table($sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up); + + + #Now the actual ProbeFeatures + $sql = "DELETE pf from probe_feature pf, probe p, analysis a WHERE a.logic_name in(${lnames}) AND a.analysis_id=pf.analysis_id AND pf.probe_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; + $self->log("Deleting ProbeFeatures for:\t${lnames}"); + $self->rollback_table($sql, 'probe_feature', 'probe_feature_id', $db, $no_clean_up); + } + + if($mode ne 'ProbeTranscriptAlign'){ + my $lnames = join(', ', (map { "'${_}_ProbeAlign'" } keys(%classes))); + + $sql = "DELETE uo from unmapped_object uo, probe p, external_db e, analysis a WHERE uo.ensembl_object_type='Probe' AND uo.analysis_id=a.analysis_id AND a.logic_name=(${lnames}) AND e.external_db_id=uo.external_db_id and e.db_name='${genome_edb_name}' AND uo.ensembl_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; + $self->log("Deleting UnmappedObjects for:\t${lnames}"); + $self->rollback_table($sql, 'unmapped_object', 'unmapped_object_id', $db, $no_clean_up); + + + $sql = "DELETE pf from probe_feature pf, probe p, analysis a WHERE a.logic_name in(${lnames}) AND a.analysis_id=pf.analysis_id AND pf.probe_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; + $self->log("Deleting ProbeFeatures for:\t${lnames}"); + $self->rollback_table($sql, 'probe_feature', 'probe_feature_id', $db, $no_clean_up); + } + } + else{ + #Need to count to see if we can carry on with a unforced probe rollback? + #Do we need this level of control here + #Can't we assume that if you want probe you also want probe_feature? + #Leave for safety, at least until we get the dependant ExperimetnalChip test sorted + #What about if we only want to delete one array from an associated set? + #This would delete all the features from the rest? + + $sql = "select count(*) from object_xref ox, xref x, probe p, external_db e WHERE ox.ensembl_object_type='ProbeFeature' AND ox.linkage_annotation='ProbeTranscriptAlign' AND ox.xref_id=x.xref_id AND e.external_db_id=x.external_db_id and e.db_name='${transc_edb_name}' AND ox.ensembl_id=p.probe_id AND p.array_chip_id IN($ac_ids)"; + $row_cnt = $db->dbc->db_handle->selectrow_array($sql); + + if($row_cnt){ + throw("Cannot rollback ArrayChips($ac_names), found $row_cnt ProbeFeatures. Pass 'force' argument or 'probe_feature' mode to delete"); + } + else{ + $self->log("Found $row_cnt ProbeFeatures"); + } + } + + if($mode eq 'probe'){ + #Don't need to rollback on a CS as we have no dependant EChips? + #Is this true? Should we enforce a 3rd CoordSystem argument, 'all' string we delete all? + + foreach my $ac(@$acs){ + $ac->adaptor->revoke_states($ac);#Do we need to change this to revoke specific states? + #Current states are only IMPORTED, so not just yet, but we could change this for safety? + } + + #ProbeSets + $sql = "DELETE ps from probe p, probe_set ps where p.array_chip_id IN($ac_ids) and p.probe_set_id=ps.probe_set_id"; + $self->rollback_table($sql, 'probe_set', 'probe_set_id', $db, $no_clean_up); + + #Probes + $sql = "DELETE from probe where array_chip_id IN($ac_ids)"; + $self->rollback_table($sql, 'probe', 'probe_id', $db, $no_clean_up); + } + } + + $self->log("Finished $mode roll back for ArrayChip:\t$ac_names"); + return; +} + + +#This will just fail silently if the reset value +#Is less than the true autoinc value +#i.e. if there are parallel inserts going on +#So we can never assume that the $new_auto_inc will be used + + +sub rollback_table{ + my ($self, $sql, $table, $id_field, $db, $no_clean_up, $force_clean_up) = @_; + + my $row_cnt; + eval { $row_cnt = $db->dbc->do($sql) }; + + if($@){ + throw("Failed to rollback table $table using sql:\t$sql\n$@"); + } + + $row_cnt = 0 if $row_cnt eq '0E0'; + $self->log("Deleted $row_cnt $table records"); + + if($force_clean_up || + ($row_cnt && ! $no_clean_up)){ + $self->refresh_table($table, $id_field, $db); + } + + return; +} + +#Now separated so that we can do this once at the end of a rollback of many Sets + +sub refresh_table{ + my ($self, $table, $id_field, $db) = @_; + + #This only works if the new calue is available + #i.e. do not need lock for this to be safe + $self->reset_table_autoinc($table, $id_field, $db) if $id_field; + + $self->log("Optimizing and Analyzing $table"); + + $db->dbc->do("optimize table $table");#defrag data, sorts indices, updates table stats + $db->dbc->do("analyze table $table");#analyses key distribution + + return; +} + + + +sub reset_table_autoinc{ + #Is this called elsewhere or can we merge with + my($self, $table_name, $autoinc_field, $db) = @_; + + if(! ($table_name && $autoinc_field && $db)){ + throw('You must pass a table_name and an autoinc_field to reset the autoinc value'); + } + + if(! (ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor'))){ + throw('Must pass a valid Bio::EnsEMBL::DBSQL::DBAdaptor'); + } + + #my $sql = "show table status where name='$table_name'"; + #my ($autoinc) = ${$db->dbc->db_handle->selectrow_array($sql)}[11]; + #11 is the field in the show table status table + #We cannot select just the Auto_increment, so this will fail if the table format changes + + #Why do we need autoinc here? + + my $sql = "select $autoinc_field from $table_name order by $autoinc_field desc limit 1"; + my ($current_auto_inc) = $db->dbc->db_handle->selectrow_array($sql); + my $new_autoinc = ($current_auto_inc) ? ($current_auto_inc + 1) : 1; + $sql = "ALTER TABLE $table_name AUTO_INCREMENT=$new_autoinc"; + $db->dbc->do($sql); + return; +} + + + + +=head2 get_core_display_name_by_stable_id + + Args [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Args [2] : stable ID from core DB. + Args [3] : stable feature type e.g. gene, transcript, translation + Example : $self->validate_and_store_feature_types; + Description: Builds a cache of stable ID to display names. + Returntype : string - display name + Exceptions : Throws is type is not valid. + Caller : General + Status : At risk + +=cut + +# -------------------------------------------------------------------------------- +# Build a cache of ensembl stable ID -> display_name +# Return hashref keyed on {$type}{$stable_id} +#Need to update cache if we're doing more than one 'type' at a time +# as it will never get loaded for the new type! + +sub get_core_display_name_by_stable_id{ + my ($self, $cdb, $stable_id, $type) = @_; + + $type = lc($type); + + if($type !~ /(gene|transcript|translation)/){ + throw("Cannot get display_name for stable_id $stable_id with type $type"); + } + + if(! exists $self->{'display_name_cache'}->{$stable_id}){ + ($self->{'display_name_cache'}->{$stable_id}) = $cdb->dbc->db_handle->selectrow_array("SELECT x.display_label FROM $type t, xref x where t.display_xref_id=x.xref_id and t.stable_id='${stable_id}'"); + } + + return $self->{'display_name_cache'}->{$stable_id}; +} + + +=head2 get_core_stable_id_by_display_name + + Args [1] : Bio::EnsEMBL::DBSQL::DBAdaptor + Args [2] : display name (e.g. from core DB or GNC name) + Example : + Description: Builds a cache of stable ID to display names. + Returntype : string - gene stable ID + Exceptions : None + Caller : General + Status : At risk + +=cut + +# -------------------------------------------------------------------------------- +# Build a cache of ensembl stable ID -> display_name +# Return hashref keyed on {$type}{$stable_id} +#Need to update cache if we're doing more than one 'type' at a time +# as it will never get loaded for the new type! + +sub get_core_stable_id_by_display_name{ + my ($self, $cdb, $display_name) = @_; + + #if($type !~ /(gene|transcript|translation)/){ +# throw("Cannot get display_name for stable_id $stable_id with type $type"); +# } + + if(! exists $self->{'stable_id_cache'}->{$display_name}){ + ($self->{'stable_id_cache'}->{$display_name}) = $cdb->dbc->db_handle->selectrow_array("SELECT g.stable_id FROM gene g, xref x where g.display_xref_id=x.xref_id and and x.display_label='${display_name}'"); + } + + return $self->{'stable_id_cache'}->{$display_name}; +} + + + + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Gene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Gene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1618 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Gene - Object representing a genes + +=head1 SYNOPSIS + + my $gene = Bio::EnsEMBL::Gene->new( + -START => 123, + -END => 1045, + -STRAND => 1, + -SLICE => $slice + ); + + # print gene information + print("gene start:end:strand is " + . join( ":", map { $gene->$_ } qw(start end strand) ) + . "\n" ); + + # set some additional attributes + $gene->stable_id('ENSG000001'); + $gene->description('This is the gene description'); + +=head1 DESCRIPTION + +A representation of a Gene within the Ensembl system. A gene is a set of one or +more alternative transcripts. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Gene; + +use strict; + +use POSIX; +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [-START] : + int - start postion of the gene + Arg [-END] : + int - end position of the gene + Arg [-STRAND] : + int - 1,-1 tehe strand the gene is on + Arg [-SLICE] : + Bio::EnsEMBL::Slice - the slice the gene is on + Arg [-STABLE_ID] : + string - the stable identifier of this gene + Arg [-VERSION] : + int - the version of the stable identifier of this gene + Arg [-EXTERNAL_NAME] : + string - the external database name associated with this gene + Arg [-EXTERNAL_DB] : + string - the name of the database the external name is from + Arg [-EXTERNAL_STATUS]: + string - the status of the external identifier + Arg [-DISPLAY_XREF]: + Bio::EnsEMBL::DBEntry - The external database entry that is used + to label this gene when it is displayed. + Arg [-TRANSCRIPTS]: + Listref of Bio::EnsEMBL::Transcripts - this gene's transcripts + Arg [-CREATED_DATE]: + string - the date the gene was created + Arg [-MODIFIED_DATE]: + string - the date the gene was last modified + Arg [-DESCRIPTION]: + string - the genes description + Arg [-BIOTYPE]: + string - the biotype e.g. "protein_coding" + Arg [-STATUS]: + string - the gene status i.e. "KNOWN","NOVEL" + Arg [-SOURCE]: + string - the genes source, e.g. "ensembl" + Arg [-IS_CURRENT]: + Boolean - specifies if this is the current version of the gene + Arg [-CANONICAL_TRANSCRIPT]: + Bio::EnsEMBL::Transcript - the canonical transcript of this gene + Arg [-CANONICAL_TRANSCRIPT_ID]: + integer - the canonical transcript dbID of this gene, if the + transcript object itself is not available. + Arg [-CANONICAL_ANNOTATION]: + string - canonical annotation + + Example : $gene = Bio::EnsEMBL::Gene->new(...); + Description: Creates a new gene object + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + my ( + $stable_id, $version, + $external_name, $type, + $external_db, $external_status, + $display_xref, $description, + $transcripts, $created_date, + $modified_date, $confidence, + $biotype, $source, + $status, $is_current, + $canonical_transcript_id, $canonical_transcript, + $canonical_annotation + ) + = rearrange( [ + 'STABLE_ID', 'VERSION', + 'EXTERNAL_NAME', 'TYPE', + 'EXTERNAL_DB', 'EXTERNAL_STATUS', + 'DISPLAY_XREF', 'DESCRIPTION', + 'TRANSCRIPTS', 'CREATED_DATE', + 'MODIFIED_DATE', 'CONFIDENCE', + 'BIOTYPE', 'SOURCE', + 'STATUS', 'IS_CURRENT', + 'CANONICAL_TRANSCRIPT_ID', 'CANONICAL_TRANSCRIPT', + 'CANONICAL_ANNOTATION' + ], + @_ + ); + + + if ($transcripts) { + $self->{'_transcript_array'} = $transcripts; + $self->recalculate_coordinates(); + } + + $self->stable_id($stable_id); + $self->version($version); + $self->{'created_date'} = $created_date; + $self->{'modified_date'} = $modified_date; + + $self->external_name($external_name) if ( defined $external_name ); + $self->external_db($external_db) if ( defined $external_db ); + $self->external_status($external_status) + if ( defined $external_status ); + $self->display_xref($display_xref) if ( defined $display_xref ); + $self->biotype($type) if ( defined $type ); + $self->biotype($biotype) if ( defined $biotype ); + $self->description($description); + $self->status($confidence); # incase old naming is used. + # kept to ensure routine is backwards compatible. + $self->status($status); # add new naming + $self->source($source); + + # default to is_current + $is_current = 1 unless (defined($is_current)); + $self->{'is_current'} = $is_current; + + # Add the canonical transcript if we were given one, otherwise add the + # canonical transcript internal ID if we were given one. + if ( defined($canonical_transcript) ) { + $self->canonical_transcript($canonical_transcript); + } elsif ( defined($canonical_transcript_id) ) { + $self->{'canonical_transcript_id'} = $canonical_transcript_id; + } + + $self->canonical_annotation($canonical_annotation) + if ( defined $canonical_annotation ); + + return $self; +} + + +=head2 is_known + + Example : print "Gene ".$gene->stable_id." is KNOWN\n" if $gene->is_known; + Description: Returns TRUE if this gene has a status of 'KNOWN' + Returntype : TRUE if known, FALSE otherwise + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub is_known{ + my $self = shift; + return ( $self->{'status'} eq "KNOWN" || $self->{'status'} eq "KNOWN_BY_PROJECTION" ); +} + + +=head2 external_name + + Arg [1] : (optional) String - the external name to set + Example : $gene->external_name('BRCA2'); + Description: Getter/setter for attribute external_name. + Returntype : String or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub external_name { + my $self = shift; + + $self->{'external_name'} = shift if (@_); + + if (defined $self->{'external_name'}) { + return $self->{'external_name'}; + } + + my $display_xref = $self->display_xref(); + + if (defined $display_xref) { + return $display_xref->display_id(); + } else { + return undef; + } +} + + +=head2 status + + Arg [1] : (optional) String - status to set + Example : $gene->status('KNOWN'); + Description: Getter/setter for attribute status + Returntype : String + Exceptions : none + Caller : general + Status : Medium Risk + +=cut + +sub status { + my $self = shift; + $self->{'status'} = shift if( @_ ); + return $self->{'status'}; +} + + +=head2 source + + Arg [1] : (optional) String - the source to set + Example : $gene->source('ensembl'); + Description: Getter/setter for attribute source + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub source { + my $self = shift; + $self->{'source'} = shift if( @_ ); + return ( $self->{'source'} || "ensembl" ); +} + + +=head2 external_db + + Arg [1] : (optional) String - name of external db to set + Example : $gene->external_db('HGNC'); + Description: Getter/setter for attribute external_db. The db is the one that + belongs to the external_name. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub external_db { + my $self = shift; + + $self->{'external_db'} = shift if( @_ ); + + if( exists $self->{'external_db'} ) { + return $self->{'external_db'}; + } + + my $display_xref = $self->display_xref(); + + if( defined $display_xref ) { + return $display_xref->dbname() + } else { + return undef; + } +} + + +=head2 external_status + + Arg [1] : (optional) String - status of the external db + Example : $gene->external_status('KNOWNXREF'); + Description: Getter/setter for attribute external_status. The status of + the external db of the one that belongs to the external_name. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub external_status { + my $self = shift; + + $self->{'_ext_status'} = shift if ( @_ ); + return $self->{'_ext_status'} if exists $self->{'_ext_status'}; + + my $display_xref = $self->display_xref(); + + if( defined $display_xref ) { + return $display_xref->status() + } else { + return undef; + } +} + + +=head2 description + + Arg [1] : (optional) String - the description to set + Example : $gene->description('This is the gene\'s description'); + Description: Getter/setter for gene description + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if( @_ ); + return $self->{'description'}; +} + + +=head2 equals + + Arg [1] : Bio::EnsEMBL::Gene gene + Example : if ($geneA->equals($geneB)) { ... } + Description : Compares two genes for equality. + The test for eqality goes through the following list + and terminates at the first true match: + + 1. If Bio::EnsEMBL::Feature::equals() returns false, + then the genes are *not* equal. + 2. If the biotypes differ, then the genes are *not* + equal. + 3. If both genes have stable IDs: if these are the + same, the genes are equal, otherwise not. + 4. If both genes have the same number of transcripts + and if these are (when compared pair-wise sorted by + start-position and length) the same, then they are + equal, otherwise not. + + Return type : Boolean (0, 1) + + Exceptions : Thrown if a non-gene is passed as the argument. + +=cut + +sub equals { + my ( $self, $gene ) = @_; + + if ( !defined($gene) ) { return 0 } + if ( $self eq $gene ) { return 1 } + + assert_ref( $gene, 'Bio::EnsEMBL::Gene' ); + + my $feature_equals = $self->SUPER::equals($gene); + if ( defined($feature_equals) && $feature_equals == 0 ) { + return 0; + } + + if ( $self->biotype() ne $gene->biotype() ) { + return 0; + } + + if ( defined( $self->stable_id() ) && defined( $gene->stable_id() ) ) + { + if ( $self->stable_id() eq $gene->stable_id() ) { return 1 } + else { return 0 } + } + + my @self_transcripts = sort { + $a->start() <=> $b->start() || + $a->length() <=> $b->length() + } @{ $self->get_all_Transcripts() }; + my @gene_transcripts = sort { + $a->start() <=> $b->start() || + $a->length() <=> $b->length() + } @{ $gene->get_all_Transcripts() }; + + if ( scalar(@self_transcripts) != scalar(@gene_transcripts) ) { + return 0; + } + + while (@self_transcripts) { + my $self_transcript = shift(@self_transcripts); + my $gene_transcript = shift(@gene_transcripts); + + if ( !$self_transcript->equals($gene_transcript) ) { + return 0; + } + } + + return 1; +} ## end sub equals + +=head2 canonical_transcript + + Arg [1] : (optional) Bio::EnsEMBL::Transcipt - canonical_transcript object + Example : $gene->canonical_transcript($canonical_transcript); + Description: Getter/setter for the canonical_transcript + Returntype : Bio::EnsEMBL::Transcript + Exceptions : Throws if argument is not a transcript object. + Caller : general + Status : Stable + +=cut + +sub canonical_transcript { + my ( $self, $transcript ) = @_; + + if ( defined($transcript) ) { + # We're attaching a new canonical transcript. + + assert_ref( $transcript, 'Bio::EnsEMBL::Transcript' ); + + # If there's already a canonical transcript, make sure it doesn't + # think it's still canonical. + if ( defined( $self->{'canonical_transcript'} ) ) { + $self->{'canonical_transcript'}->is_canonical(0); + } + + $self->{'canonical_transcript'} = $transcript; + $self->{'canonical_transcript_id'} = $transcript->dbID(); + + $transcript->is_canonical(1); + + } elsif ( !defined( $self->{'canonical_transcript'} ) + && defined( $self->{'canonical_transcript_id'} ) + && $self->{'canonical_transcript_id'} != 0 ) + { + # We have not attached a canoncical transcript, but we have the dbID + # of one. + + if ( defined( $self->adaptor() ) ) { + my $transcript_adaptor = + $self->adaptor()->db()->get_TranscriptAdaptor(); + + my $canonical_transcript = + $transcript_adaptor->fetch_by_dbID( + $self->{'canonical_transcript_id'} ); + + if ( defined($canonical_transcript) ) { + # Recusive call... + $self->canonical_transcript($canonical_transcript); + } + + } else { + warning( "Gene has no adaptor " + . "when trying to fetch canonical transcript." ); + } + + } ## end elsif ( !defined( $self->...)) + + return $self->{'canonical_transcript'}; +} ## end sub canonical_transcript + + +=head2 canonical_annotation + + Arg [1] : (optional) String - canonical_annotation + Example : $gene->canonical_annotation('This is the canonical_annotation'); + Description: Getter/setter for the canonical_annotation + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub canonical_annotation { + my $self = shift; + $self->{'canonical_annotation'} = shift if( @_ ); + return $self->{'canonical_annotation'}; +} + +=head2 get_all_Attributes + + Arg [1] : (optional) String $attrib_code + The code of the attribute type to retrieve values for + Example : my ($author) = @{ $gene->get_all_Attributes('author') }; + my @gene_attributes = @{ $gene->get_all_Attributes }; + Description: Gets a list of Attributes of this gene. + Optionally just get Attributes for given code. + Returntype : Listref of Bio::EnsEMBL::Attribute + Exceptions : warning if gene does not have attached adaptor and attempts lazy + load. + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + if ( ! exists $self->{'attributes' } ) { + if (!$self->adaptor() ) { + return []; + } + + my $attribute_adaptor = $self->adaptor->db->get_AttributeAdaptor(); + $self->{'attributes'} = $attribute_adaptor->fetch_all_by_Gene($self); + } + + if ( defined $attrib_code ) { + my @results = grep { uc($_->code()) eq uc($attrib_code) } + @{$self->{'attributes'}}; + return \@results; + } else { + return $self->{'attributes'}; + } +} + + +=head2 add_Attributes + + Arg [1-N] : list of Bio::EnsEMBL::Attribute's @attribs + Attribute(s) to add + Example : my $attrib = Bio::EnsEMBL::Attribute->new(...); + $gene->add_Attributes($attrib); + Description: Adds an Attribute to the Gene. If you add an attribute before + you retrieve any from database, lazy loading will be disabled. + Returntype : none + Exceptions : throw on incorrect arguments + Caller : general + Status : Stable + +=cut + +sub add_Attributes { + my $self = shift; + my @attribs = @_; + + if( ! exists $self->{'attributes'} ) { + $self->{'attributes'} = []; + } + + for my $attrib ( @attribs ) { + if( ! $attrib->isa( "Bio::EnsEMBL::Attribute" )) { + throw( "Argument to add_Attribute has to be an Bio::EnsEMBL::Attribute" ); + } + push( @{$self->{'attributes'}}, $attrib ); + } + + return; +} + + +=head2 add_DBEntry + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + The dbEntry to be added + Example : my $dbe = Bio::EnsEMBL::DBEntery->new(...); + $gene->add_DBEntry($dbe); + Description: Associates a DBEntry with this gene. Note that adding DBEntries + will prevent future lazy-loading of DBEntries for this gene + (see get_all_DBEntries). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_DBEntry { + my $self = shift; + my $dbe = shift; + + unless($dbe && ref($dbe) && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw('Expected DBEntry argument'); + } + + $self->{'dbentries'} ||= []; + push @{$self->{'dbentries'}}, $dbe; +} + + +=head2 get_all_DBEntries + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @dbentries = @{ $gene->get_all_DBEntries() }; + + Description: Retrieves DBEntries (xrefs) for this gene. This does + *not* include DBEntries that are associated with the + transcripts and corresponding translations of this + gene (see get_all_DBLinks()). + + This method will attempt to lazy-load DBEntries + from a database if an adaptor is available and no + DBEntries are present on the gene (i.e. they have not + already been added or loaded). + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : get_all_DBLinks, GeneAdaptor::store + Status : Stable + +=cut + +sub get_all_DBEntries { + my ( $self, $db_name_exp, $ex_db_type ) = @_; + + my $cache_name = 'dbentries'; + + if ( defined($db_name_exp) ) { + $cache_name .= $db_name_exp; + } + + if ( defined($ex_db_type) ) { + $cache_name .= $ex_db_type; + } + + # if not cached, retrieve all of the xrefs for this gene + if ( !defined( $self->{$cache_name} ) && defined( $self->adaptor() ) ) + { + $self->{$cache_name} = + $self->adaptor()->db()->get_DBEntryAdaptor() + ->fetch_all_by_Gene( $self, $db_name_exp, $ex_db_type ); + } + + $self->{$cache_name} ||= []; + + return $self->{$cache_name}; +} ## end sub get_all_DBEntries + +=head2 get_all_object_xrefs + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @oxrefs = @{ $gene->get_all_object_xrefs() }; + + Description: Retrieves xrefs for this gene. This does *not* + include xrefs that are associated with the + transcripts or corresponding translations of this + gene (see get_all_xrefs()). + + This method will attempt to lazy-load xrefs from a + database if an adaptor is available and no xrefs are + present on the gene (i.e. they have not already been + added or loaded). + + NB: This method is an alias for the + get_all_DBentries() method. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + + Status : Stable + +=cut + +sub get_all_object_xrefs { + my $self = shift; + return $self->get_all_DBEntries(@_); +} + +=head2 get_all_DBLinks + + Arg [1] : String database name (optional) + SQL wildcard characters (_ and %) can be used to + specify patterns. + + Example : @dblinks = @{ $gene->get_all_DBLinks() }; + @dblinks = @{ $gene->get_all_DBLinks('Uniprot%') }; + + Description: Retrieves *all* related DBEntries for this gene. This + includes all DBEntries that are associated with the + transcripts and corresponding translations of this + gene. + + If you only want to retrieve the DBEntries + associated with the gene (and not the transcript + and translations) then you should use the + get_all_DBEntries() call instead. + + Note: Each entry may be listed more than once. No + uniqueness checks are done. Also if you put in an + incorrect external database name no checks are done + to see if this exists, you will just get an empty + list. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_DBLinks { + my ( $self, $db_name_exp, $ex_db_type ) = @_; + + my @links = + @{ $self->get_all_DBEntries( $db_name_exp, $ex_db_type ) }; + + # Add all of the transcript and translation xrefs to the return list. + foreach my $transcript ( @{ $self->get_all_Transcripts() } ) { + push( @links, + @{$transcript->get_all_DBLinks( $db_name_exp, $ex_db_type ) } + ); + } + + return \@links; +} + +=head2 get_all_xrefs + + Arg [1] : String database name (optional) + SQL wildcard characters (_ and %) can be used to + specify patterns. + + Example : @xrefs = @{ $gene->get_all_xrefs() }; + @xrefs = @{ $gene->get_all_xrefs('Uniprot%') }; + + Description: Retrieves *all* related xrefs for this gene. This + includes all xrefs that are associated with the + transcripts and corresponding translations of this + gene. + + If you want to retrieve the xrefs associated + with only the gene (and not the transcript + or translations) then you should use the + get_all_object_xrefs() method instead. + + Note: Each entry may be listed more than once. No + uniqueness checks are done. Also if you put in an + incorrect external database name no checks are done + to see if this exists, you will just get an empty + list. + + NB: This method is an alias for the + get_all_DBLinks() method. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + + Status : Stable + +=cut + +sub get_all_xrefs { + my $self = shift; + return $self->get_all_DBLinks(@_); +} + +=head2 get_all_Exons + + Example : my @exons = @{ $gene->get_all_Exons }; + Description: Returns a set of all the exons associated with this gene. + Returntype : Listref of Bio::EnsEMBL::Exon objects + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub get_all_Exons { + my $self = shift; + + my %h; + my @out = (); + + foreach my $trans ( @{$self->get_all_Transcripts} ) { + foreach my $e ( @{$trans->get_all_Exons} ) { + $h{$e->start()."-".$e->end()."-".$e->strand()."-".$e->phase()."-".$e->end_phase()} = $e; + } + } + + push @out, values %h; + + return \@out; +} + + +=head2 get_all_homologous_Genes + + Description: Queries the Ensembl Compara database and retrieves all + Genes from other species that are orthologous. + REQUIRES properly setup Registry conf file. Meaning that + one of the aliases for each core db has to be "Genus species" + e.g. "Homo sapiens" (as in the name column in genome_db table + in the compara database). + Returntype : listref [ + Bio::EnsEMBL::Gene, + Bio::EnsEMBL::Compara::Homology, + string $species # needed as cannot get spp from Gene + ] + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_homologous_Genes { + my $self = shift; + + if( exists( $self->{'homologues'} ) ){ + return $self->{'homologues'}; + } + $self->{'homologues'} = []; + + # TODO: Find a robust way of retrieving compara dba directly. + # For now look through all DBAs + my $compara_dba; + foreach my $dba( @{Bio::EnsEMBL::Registry->get_all_DBAdaptors} ){ + if( $dba->isa('Bio::EnsEMBL::Compara::DBSQL::DBAdaptor') ){ + $compara_dba = $dba; + last; + } + } + unless( $compara_dba ){ + warning("No compara in Bio::EnsEMBL::Registry"); + return $self->{'homologues'}; + } + + # Get the compara 'member' corresponding to self + my $member_adaptor = $compara_dba->get_adaptor('Member'); + my $query_member = $member_adaptor->fetch_by_source_stable_id + ("ENSEMBLGENE",$self->stable_id); + unless( $query_member ){ return $self->{'homologues'} }; + + # Get the compara 'homologies' corresponding to 'member' + my $homology_adaptor = $compara_dba->get_adaptor('Homology'); + my @homolos = @{$homology_adaptor->fetch_all_by_Member($query_member)}; + unless( scalar(@homolos) ){ return $self->{'homologues'} }; + + # Get the ensembl 'genes' corresponding to 'homologies' + foreach my $homolo( @homolos ){ + foreach my $member_attrib( @{$homolo->get_all_Member_Attribute} ){ + my ($member, $attrib) = @{$member_attrib}; + my $hstable_id = $member->stable_id; + next if ($hstable_id eq $query_member->stable_id); # Ignore self + my $hgene = undef; + eval { $hgene = $member->get_Gene;} ; + unless( $hgene ){ + # Something up with DB. Create a new gene is best we can do + $hgene = Bio::EnsEMBL::Gene->new + ( -stable_id=>$hstable_id, + -description=>$member->description, ); + } + my $hspecies = $member->genome_db->name; + push @{$self->{'homologues'}}, [$hgene,$homolo,$hspecies]; + } + } + return $self->{'homologues'}; +} + + +=head2 biotype + + Arg [1] : (optional) String - the biotype to set + Example : $gene->biotype("protein_coding"); + Description: Getter/setter for the attribute biotype + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub biotype { + my $self = shift; + + $self->{'biotype'} = shift if( @_ ); + return ( $self->{'biotype'} || "protein_coding" ); +} + + +=head2 add_Transcript + + Arg [1] : Bio::EnsEMBL::Transcript $trans + The transcript to add to the gene + Example : my $transcript = Bio::EnsEMBL::Transcript->new(...); + $gene->add_Transcript($transcript); + Description: Adds another Transcript to the set of alternatively + spliced Transcripts of this gene. If it shares exons + with another Transcript, these should be object-identical. + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub add_Transcript { + my ($self, $trans) = @_; + + if( !ref $trans || ! $trans->isa("Bio::EnsEMBL::Transcript") ) { + throw("$trans is not a Bio::EnsEMBL::Transcript!"); + } + + $self->{'_transcript_array'} ||= []; + push(@{$self->{'_transcript_array'}},$trans); + + $self->recalculate_coordinates(); +} + + +=head2 get_all_Transcripts + + Example : my @transcripts = @{ $gene->get_all_Transcripts }; + Description: Returns the Transcripts in this gene. + Returntype : Listref of Bio::EnsEMBL::Transcript objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Transcripts { + my $self = shift; + + if( ! exists $self->{'_transcript_array'} ) { + if( defined $self->adaptor() ) { + my $ta = $self->adaptor()->db()->get_TranscriptAdaptor(); + my $transcripts = $ta->fetch_all_by_Gene( $self ); + $self->{'_transcript_array'} = $transcripts; + } + } + return $self->{'_transcript_array'}; +} + + +=head2 get_all_alt_alleles + + Example : my @alt_genes = @{ $gene->get_all_alt_alleles }; + foreach my $alt_gene (@alt_genes) { + print "Alternate allele: " . $alt_gene->stable_id() . "\n"; + } + Description: Returns a listref of Gene objects that represent this Gene on + an alternative haplotype. Empty list if there is no such + Gene (eg there is no overlapping haplotype). + Returntype : listref of Bio::EnsEMBL::Gene objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_alt_alleles { + my $self = shift; + my $result = $self->adaptor()->fetch_all_alt_alleles( $self ); + return $result; +} + + +=head2 version + + Arg [1] : (optional) Int + A version number for the stable_id + Example : $gene->version(2); + Description: Getter/setter for version number + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub version { + my $self = shift; + $self->{'version'} = shift if(@_); + return $self->{'version'}; +} + + +=head2 stable_id + + Arg [1] : (optional) String - the stable ID to set + Example : $gene->stable_id("ENSG0000000001"); + Description: Getter/setter for stable id for this gene. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if(@_); + return $self->{'stable_id'}; +} + + +=head2 is_current + + Arg [1] : Boolean $is_current + Example : $gene->is_current(1) + Description: Getter/setter for is_current state of this gene. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_current { + my $self = shift; + $self->{'is_current'} = shift if (@_); + return $self->{'is_current'}; +} + + +=head2 created_date + + Arg [1] : (optional) String - created date to set (as a UNIX time int) + Example : $gene->created_date('1141948800'); + Description: Getter/setter for attribute created_date + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created_date { + my $self = shift; + $self->{'created_date'} = shift if ( @_ ); + return $self->{'created_date'}; +} + + +=head2 modified_date + + Arg [1] : (optional) String - modified date to set (as a UNIX time int) + Example : $gene->modified_date('1141948800'); + Description: Getter/setter for attribute modified_date + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub modified_date { + my $self = shift; + $self->{'modified_date'} = shift if ( @_ ); + return $self->{'modified_date'}; +} + + +=head2 transform + + Arg [1] : String - coordinate system name to transform to + Arg [2] : String - coordinate system version + Example : my $new_gene = $gene->transform('supercontig'); + Description: Moves this gene to the given coordinate system. If this gene has + Transcripts attached, they move as well. + Returntype : Bio::EnsEMBL::Gene + Exceptions : throw on wrong parameters + Caller : general + Status : Stable + +=cut + +sub transform { + my $self = shift; + + # catch for old style transform calls + if( !@_ || ( ref $_[0] && ($_[0]->isa( "Bio::EnsEMBL::Slice" ) or $_[0]->isa( "Bio::EnsEMBL::LRGSlice" )) )) { + deprecate('Calling transform without a coord system name is deprecated.'); + return $self->_deprecated_transform(@_); + } + + my $new_gene = $self->SUPER::transform(@_); + + if ( !defined($new_gene) ) { + # check if this gene projects at all to requested coord system, + # if not we are done. + my @segments = @{ $self->project(@_) }; + if ( !@segments ) { + return undef; + } + } + + # + # If you are transforming the gene then make sure the transcripts and exons are loaded + # + + foreach my $tran (@{$self->get_all_Transcripts}){ + $tran->get_all_Exons(); + } + + if( exists $self->{'_transcript_array'} ) { + my @new_transcripts; + my ( $strand, $slice ); + my $low_start = POSIX::INT_MAX; + my $hi_end = POSIX::INT_MIN; + for my $old_transcript ( @{$self->{'_transcript_array'}} ) { + my $new_transcript = $old_transcript->transform( @_ ); + # this can fail if gene transform failed + + return undef unless $new_transcript; + + if( ! defined $new_gene ) { + if( $new_transcript->start() < $low_start ) { + $low_start = $new_transcript->start(); + } + if( $new_transcript->end() > $hi_end ) { + $hi_end = $new_transcript->end(); + } + $slice = $new_transcript->slice(); + $strand = $new_transcript->strand(); + } + push( @new_transcripts, $new_transcript ); + } + + if( ! defined $new_gene ) { + %$new_gene = %$self; + bless $new_gene, ref( $self ); + + $new_gene->start( $low_start ); + $new_gene->end( $hi_end ); + $new_gene->strand( $strand ); + $new_gene->slice( $slice ); + } + + $new_gene->{'_transcript_array'} = \@new_transcripts; + } + return $new_gene; +} + + +=head2 transfer + + Arg [1] : Bio::EnsEMBL::Slice $destination_slice + Example : my $new_gene = $gene->transfer($slice); + Description: Moves this Gene to given target slice coordinates. If Transcripts + are attached they are moved as well. Returns a new gene. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub transfer { + my $self = shift; + + my $new_gene = $self->SUPER::transfer( @_ ); + return undef unless $new_gene; + + if( exists $self->{'_transcript_array'} ) { + my @new_transcripts; + for my $old_transcript ( @{$self->{'_transcript_array'}} ) { + my $new_transcript = $old_transcript->transfer( @_ ); + push( @new_transcripts, $new_transcript ); + } + $new_gene->{'_transcript_array'} = \@new_transcripts; + } + return $new_gene; +} + + +=head2 display_xref + + Arg [1] : (optional) Bio::EnsEMBL::DBEntry - the display xref to set + Example : $gene->display_xref($db_entry); + Description: Getter/setter display_xref for this gene. + Returntype : Bio::EnsEMBL::DBEntry + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_xref { + my $self = shift; + $self->{'display_xref'} = shift if(@_); + return $self->{'display_xref'}; +} + + +=head2 display_id + + Example : print $gene->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For genes this is (depending on + availability and in this order) the stable Id, the dbID or an + empty string. + Returntype : String + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'stable_id'} || $self->dbID || ''; +} + + +=head2 recalculate_coordinates + + Example : $gene->recalculate_coordinates; + Description: Called when transcript added to the gene, tries to adapt the + coords for the gene. + Returntype : none + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub recalculate_coordinates { + my $self = shift; + + my $transcripts = $self->get_all_Transcripts(); + + return if(!$transcripts || !@$transcripts); + + my ( $slice, $start, $end, $strand ); + $slice = $transcripts->[0]->slice(); + $strand = $transcripts->[0]->strand(); + $start = $transcripts->[0]->start(); + $end = $transcripts->[0]->end(); + + my $transsplicing = 0; + + for my $t ( @$transcripts ) { + if( $t->start() < $start ) { + $start = $t->start(); + } + + if( $t->end() > $end ) { + $end = $t->end(); + } + + if( $t->slice()->name() ne $slice->name() ) { + throw( "Transcripts with different slices not allowed on one Gene" ); + } + + if( $t->strand() != $strand ) { + $transsplicing = 1; + } + } + if( $transsplicing ) { + warning( "Gene contained trans splicing event" ); + } + + $self->start( $start ); + $self->end( $end ); + $self->strand( $strand ); + $self->slice( $slice ); +} + + +=head2 get_all_DASFactories + + Example : $dasref = $prot->get_all_DASFactories + Description: Retrieves a listref of registered DAS objects + TODO: Abstract to a DBLinkContainer obj + Returntype : [ DAS_objects ] + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_DASFactories { + my $self = shift; + return [ $self->adaptor()->db()->_each_DASFeatureFactory ]; +} + + +=head2 get_all_DAS_Features + + Example : $features = $prot->get_all_DAS_Features; + Description: Retreives a hash reference to a hash of DAS feature + sets, keyed by the DNS, NOTE the values of this hash + are an anonymous array containing: + (1) a pointer to an array of features + (2) a pointer to the DAS stylesheet + Returntype : hashref of Bio::SeqFeatures + Exceptions : none + Caller : webcode + Status : Stable + +=cut + +sub get_all_DAS_Features{ + my ($self, @args) = @_; + my $slice = $self->feature_Slice; + return $self->SUPER::get_all_DAS_Features($slice); +} + + + + +=head2 add_unconventional_transcript_association + + Arg [1] : Bio::EnsEMBL::Transcript $trans + The transcript to associate with the gene, in an unconventional manner. + Arg [2] : String $type + The type of association between this gene and this transcript, e.g. + "antisense","sense_intronic","sense_overlaping_exonic","chimeric_sense_exonic" + Example : my $transcript = Bio::EnsEMBL::Transcript->new(...); + $gene->add_unconventional_transcript_association($transcript, "antisense"); + Description: Associate a transcript with this gene in a way that is + non-conventional. + Returntype : none + Exceptions : none + Caller : general + Status : At Risk. + +=cut + +sub add_unconventional_transcript_association { + + my ($self, $transcript, $type) = @_; + + if( !ref $transcript || ! $transcript->isa("Bio::EnsEMBL::Transcript") ) { + throw("$transcript is not a Bio::EnsEMBL::Transcript!"); + } + + my $uta = Bio::EnsEMBL::UnconventionalTranscriptAssociation->new($transcript, $self, $type); + $self->{'_unconventional_transcript_array'} ||= []; + push(@{$self->{'_unconventional_transcript_array'}}, $uta); + +} + + +=head2 get_all_unconventional_transcript_associations + + Arg [1] : (optional) String - Only get transcripts where the association + between this gene and the transcripts is of a certain type. + Example : my @transcripts = @{ $gene->get_all_unconventional_transcript_associations, "antisense" }; + Description: Returns the unconventional transcripts associated with this gene. + Returntype : Listref of Bio::EnsEMBL::UnconventionalTranscriptAssociation objects + Exceptions : none + Caller : general + Status : At risk. + +=cut + +sub get_all_unconventional_transcript_associations { + + my ($self, $type) = @_; + + if( ! exists $self->{'_unconventional_transcript_array'} ) { + $self->{'_unconventional_transcript_array'} = []; + if( defined $self->adaptor() ) { + my $utaa = $self->adaptor()->db()->get_UnconventionalTranscriptAssociationAdaptor(); + my $utas = $utaa->fetch_all_by_gene( $self, $type ); + $self->{'_unconventional_transcript_array'} = $utas; + } + } + + return $self->{'_unconventional_transcript_array'}; +} + +=head2 remove_unconventional_transcript_associations + + Args : None + Example : $gene->remove_unconventional_transcript_associations(); + Description: Returns the unconventional transcripts associated with this gene. + Returntype : Listref of Bio::EnsEMBL::UnconventionalTranscriptAssociation objects + Exceptions : none + Caller : general + Status : At risk. + +=cut + +sub remove_unconventional_transcript_associations { + + my $self = shift; + + $self->{'_unconventional_transcript_array'} = []; + +} + +=head2 load + + Arg [1] : Boolean $load_xrefs + Load (or don't load) xrefs. Default is to load xrefs. + Example : $gene->load(); + Description : The Ensembl API makes extensive use of + lazy-loading. Under some circumstances (e.g., + when copying genes between databases), all data of + an object needs to be fully loaded. This method + loads the parts of the object that are usually + lazy-loaded. It will also call the equivalent + method on all the transcripts of the gene. + Returns : + +=cut + +sub load { + my ( $self, $load_xrefs ) = @_; + + if ( !defined($load_xrefs) ) { $load_xrefs = 1 } + + foreach my $transcript ( @{ $self->get_all_Transcripts() } ) { + $transcript->load($load_xrefs); + } + + $self->analysis(); + $self->get_all_Attributes(); + $self->stable_id(); + $self->canonical_transcript(); + + if ($load_xrefs) { + $self->get_all_DBEntries(); + } +} + +=head2 is_ref + + Description: getter setter for the gene attribute is_ref + Arg [1] : (optional) 1 or 0 + return : boolean + +=cut + +sub is_reference{ + my ( $self, $is_ref) = @_; + + if(defined($is_ref)){ + $self->{'is_ref'} = $is_ref; + } + else{ + $self->{'is_ref'} = $self->adaptor->is_ref($self->dbID); + } + return $self->{'is_ref'}; +} + +=head2 summary_as_hash + + Example : $gene_summary = $gene->summary_as_hash(); + Description : Extends Feature::summary_as_hash + Retrieves a summary of this Gene object. + + Returns : hashref of arrays of descriptive strings + Status : Intended for internal use +=cut + +sub summary_as_hash { + my $self = shift; + my $summary_ref = $self->SUPER::summary_as_hash; + $summary_ref->{'description'} = $self->description; + $summary_ref->{'biotype'} = $self->biotype; + $summary_ref->{'external_name'} = $self->external_name; + return $summary_ref; +} + + +########################### +# DEPRECATED METHODS FOLLOW +########################### + +=head2 DEPRECATED add_DBLink + + Description: DEPRECATED This method has been deprecated in favour of the + add_DBEntry method. Objects are responible for holding only + xrefs directly associated with themselves now. + +=cut + + +sub add_DBLink{ + my ($self,$value) = @_; + + throw("add_DBLink is deprecated. You probably want add_DBEntry."); + + # unless(defined $value && ref $value + # && $value->isa('Bio::Annotation::DBLink') ) { + # throw("This [$value] is not a DBLink"); + # } + + # if( !defined $self->{'_db_link'} ) { + # $self->{'_db_link'} = []; + # } + + # push(@{$self->{'_db_link'}},$value); +} + + +=head2 temporary_id + + Function: DEPRECATED: Use dbID or stable_id or something else instead + +=cut + +sub temporary_id { + my ($obj,$value) = @_; + deprecate( "I cant see what a temporary_id is good for, please use " . + "dbID or stableID or\n try without an id." ); + if( defined $value) { + $obj->{'temporary_id'} = $value; + } + return $obj->{'temporary_id'}; +} + + +=head2 chr_name + + Description: DEPRECATED. Use project, tranform, or transfer to obtain this + gene in another coord system. Use $gene->slice->seq_region_name + to get the name of the underlying coord system. Or + $gene->slice->name(). + +=cut + +sub chr_name { + my $self = shift; + + deprecate( "Use project() to obtain other coordinate systems" ); + + my $gene_slice = $self->slice(); + if( $gene_slice->coord_system()->name eq "chromosome" ) { + return $gene_slice->seq_region_name(); + } + + my $coords = $self->project( "toplevel" ); + + if( @$coords ) { + return $coords->[0]->[2]->seq_region_name(); + } +} + + +=head2 fetch_coded_for_regulatory_factors + + Arg [1] : none + Example : $gene->fetch_coded_for_regualtory_factors() + Description: DEPRECATED: Fetches any regulatory_factors that are coded for by + this gene. + Returntype : Listref of Bio::Ensembl::RegulatoryFactor + Exceptions : + Caller : ? + Status : At Risk + : under development + +=cut + +sub fetch_coded_for_regulatory_factors { + + my ($self) = @_; + + my $rfa = $self->adaptor->db->get_RegulatoryFactorAdaptor(); + + return $rfa->fetch_factors_coded_for_by_gene($self); + +} + + +=head2 type + + Description: DEPRECATED. Use biotype() instead. + +=cut + +sub type { + deprecate("Use biotype() instead"); + biotype(@_); +} + + +=head2 confidence + + Description: DEPRECATED. Use status() instead. + +=cut + +sub confidence { + deprecate("Use status() instead"); + status(@_); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/Archiver.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/Archiver.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,334 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive + +=head1 SYNOPSIS + + my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache + ); + + # create gene and peptide archive + $archiver->create_archive($mapping_session_id); + + # dump existing archive tables to file + my $num_entries = + $archiver->dump_table_to_file( 'source', 'gene_archive', + 'gene_archive_existing.txt', 1 ); + +=head1 DESCRIPTION + +This module creates the gene_archive and peptide_archive +tables. Data is written to a file as tab-delimited text for +loading into a MySQL database (this can be done manually, or using +StableIdmapper->upload_file_into_table()). + +An archive entry for a given source gene is created if no target +gene exists, or if any of its transcripts or their translations +changed. Non-coding transcripts only have an entry in gene_archive (i.e. +without a corresponding peptide_archive entry). + +=head1 METHODS + + create_archive + dump_gene + dump_tuple + dump_nc_row + mapping_session_id + +=cut + + +package Bio::EnsEMBL::IdMapping::Archiver; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Digest::MD5 qw(md5_hex); + + +# instance variables +my $pa_id; + + +=head2 create_archive + + Arg[1] : Int $mapping_session_id - the mapping_session_id for this run + Example : $archiver->create_archive($stable_id_mapper->mapping_session_id); + Description : Creates the gene_archive and peptide_archive tables and writes + the data to a tab-delimited file. The decision as to what to + archive is deferred to dump_gene(), see documentation there for + details. + Return type : none + Exceptions : Thrown on missing argument. + Caller : id_mapping.pl + Status : At Risk + : under development + +=cut + +sub create_archive { + my $self = shift; + my $mapping_session_id = shift; + + # argument check + unless ($mapping_session_id) { + $self->logger->warning("No mapping_session_id set."); + } + + $self->mapping_session_id($mapping_session_id); + + # get filehandles to write gene and peptide archive + my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables'); + my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables'); + + # get the currently highest peptide_archive_id from the source db + my $s_dba = $self->cache->get_DBAdaptor('source'); + my $s_dbh = $s_dba->dbc->db_handle; + my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive); + $pa_id = $self->fetch_value_from_db($s_dbh, $sql); + + unless ($pa_id) { + $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1); + $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1); + } + + $pa_id++; + $self->logger->debug("Starting with peptide_archive_id $pa_id.\n"); + + # lookup hash of target gene stable IDs + my %target_genes = map { $_->stable_id => $_ } + values %{ $self->cache->get_by_name("genes_by_id", 'target') }; + + # loop over source genes and dump to archive (dump_gene() will decide whether + # to do full or partial dump) + foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id", + 'source') }) { + + $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id}, + $ga_fh, $pa_fh); + } + + close($ga_fh); + close($pa_fh); +} + + +=head2 dump_gene + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene + Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene + Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data + Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data + Example : my $target_gene = $gene_mappings{$source_gene->stable_id}; + $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh); + Description : Given a source gene, it will write a gene_achive and + peptide_achive entry for it if no target gene exists, or if any + of its transcripts or their translation changed. + Return type : none + Exceptions : none + Caller : create_archive() + Status : At Risk + : under development + +=cut + +sub dump_gene { + my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_; + + # private method, so no argument check done for performance reasons + + # deal with ncRNA differently + # hope this simple biotype regex is accurate enough... + my $is_ncRNA = 0; + $is_ncRNA = 1 if ($s_gene->biotype =~ /RNA/); + + # loop over source transcripts + foreach my $s_tr (@{ $s_gene->get_all_Transcripts }) { + my $s_tl = $s_tr->translation; + + # we got a coding transcript + if ($s_tl) { + + # do a full dump of this gene if no target gene exists + if (! $t_gene) { + $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); + + # otherwise, only dump if any transcript or its translation changed + } else { + + my $changed_flag = 1; + + foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) { + my $t_tl = $t_tr->translation; + next unless ($t_tl); + + if (($s_tr->stable_id eq $t_tr->stable_id) and + ($s_tl->stable_id eq $t_tl->stable_id) and + ($s_tl->seq eq $t_tl->seq)) { + $changed_flag = 0; + } + } + + if ($changed_flag) { + $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); + } + } + + # now deal with ncRNAs (they don't translate but we still want to archive + # them) + } elsif ($is_ncRNA) { + + if (! $t_gene) { + + $self->dump_nc_row($s_gene, $s_tr, $ga_fh); + + } else { + + my $changed_flag = 1; + + foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) { + $changed_flag = 0 if ($s_tr->stable_id eq $t_tr->stable_id); + } + + if ($changed_flag) { + $self->dump_nc_row($s_gene, $s_tr, $ga_fh); + } + + } + } + } +} + + +=head2 dump_tuple + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive + Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript + Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation + Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data + Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data + Example : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); + Description : Writes entry lines for gene_archive and peptide_archive. + Return type : none + Exceptions : none + Caller : dump_gene() + Status : At Risk + : under development + +=cut + +sub dump_tuple { + my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_; + + # private method, so no argument check done for performance reasons + + # gene archive + print $ga_fh join("\t", + $gene->stable_id, + $gene->version, + $tr->stable_id, + $tr->version, + $tl->stable_id, + $tl->version, + $pa_id, + $self->mapping_session_id + ); + print $ga_fh "\n"; + + # peptide archive + my $pep_seq = $tl->seq; + print $pa_fh join("\t", $pa_id, md5_hex($pep_seq), $pep_seq); + print $pa_fh "\n"; + + # increment peptide_archive_id + $pa_id++; +} + + +=head2 dump_nc_row + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive + Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript + Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data + Example : $archive->dump_nc_row($s_gene, $s_tr, $ga_fh); + Description : Writes an entry line for gene_archive for non-coding + transcripts. + Return type : none + Exceptions : none + Caller : dump_gene() + Status : At Risk + : under development + +=cut + +sub dump_nc_row { + my ($self, $gene, $tr, $ga_fh) = @_; + + # private method, so no argument check done for performance reasons + + # gene archive + print $ga_fh join("\t", + $gene->stable_id, + $gene->version, + $tr->stable_id, + $tr->version, + '\N', + '\N', + '\N', + $self->mapping_session_id + ); + print $ga_fh "\n"; +} + + +=head2 mapping_session_id + + Arg[1] : (optional) Int - mapping_session_id to set + Example : my $msi = $archiver->mapping_session_id; + Description : Getter/setter for mapping_session_id. + Return type : Int + Exceptions : none + Caller : create_archive() + Status : At Risk + : under development + +=cut + +sub mapping_session_id { + my $self = shift; + $self->{'_mapping_session_id'} = shift if (@_); + return $self->{'_mapping_session_id'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,478 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::BaseObject - base object for IdMapping objects + +=head1 SYNOPSIS + + # this object isn't instantiated directly but rather extended + use Bio::EnsEMBL::IdMapping::BaseObject; + our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +=head1 DESCRIPTION + +This is the base object for some of the objects used in the IdMapping +application. An object that extends BaseObject will have a ConfParser, +Logger and Cache object. BaseObject also implements some useful utility +functions related to file and db access. + +This isn't very clean OO design but it's efficient and easy to use... + +=head1 METHODS + + new + get_filehandle + file_exists + fetch_value_from_db + dump_table_to_file + upload_file_into_table + logger + conf + cache + +=cut + + +package Bio::EnsEMBL::IdMapping::BaseObject; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +=head2 new + + Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object + Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object + Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object + Example : my $object = Bio::EnsEMBL::IdMapping::BaseObjectSubclass->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache + ); + Description : Constructor + Return type : implementing subclass type + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_); + + unless ($logger and ref($logger) and + $logger->isa('Bio::EnsEMBL::Utils::Logger')) { + throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); + } + + unless ($conf and ref($conf) and + $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { + throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); + } + + unless ($cache and ref($cache) and + $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) { + throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object."); + } + + my $self = {}; + bless ($self, $class); + + # initialise + $self->logger($logger); + $self->conf($conf); + $self->cache($cache); + + return $self; +} + + +=head2 get_filehandle + + Arg[1] : String $filename - filename for filehandle + Arg[2] : String $path_append - append subdirectory name to basedir + Arg[3] : String $mode - filehandle mode (<|>|>>) + Example : my $fh = $object->get_filehandle('mapping_stats.txt', 'stats', + '>'); + print $fh "Stats:\n"; + Description : Returns a filehandle to a file for reading or writing. The file + is qualified with the basedir defined in the configuration and + an optional subdirectory name. + Return type : filehandle + Exceptions : thrown on missing filename + Caller : general + Status : At Risk + : under development + +=cut + +sub get_filehandle { + my $self = shift; + my $filename = shift; + my $path_append = shift; + my $mode = shift; + + throw("Need a filename for this filehandle.") unless (defined($filename)); + + my $path = $self->conf->param('basedir'); + $path = path_append($path, $path_append) if (defined($path_append)); + + $mode ||= '>'; + + open(my $fh, $mode, "$path/$filename") or + throw("Unable to open $path/$filename: $!"); + + return $fh; +} + + +=head2 file_exists + + Arg[1] : String $filename - filename to test + Arg[2] : Boolean $path_append - turn on pre-pending of basedir + Example : unless ($object->file_exists('gene_mappings.ser', 1)) { + $object->do_gene_mapping; + } + Description : Tests if a file exists and has non-zero size. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub file_exists { + my $self = shift; + my $filename = shift; + my $path_append = shift; + + my $path = $self->conf->param('basedir'); + $path = path_append($path, $path_append) if (defined($path_append)); + + return (-s "$path/$filename"); +} + + +=head2 fetch_value_from_db + + Arg[1] : DBI::db $dbh - a DBI database handle + Arg[2] : String $sql - SQL statement to execute + Example : my $num_genes = $object->fetch_value_from_db($dbh, + 'SELECT count(*) FROM gene'); + Description : Executes an SQL statement on a db handle and returns the first + column of the first row returned. Useful for queries returning a + single value, like table counts. + Return type : Return type of SQL statement + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_value_from_db { + my $self = shift; + my $dbh = shift; + my $sql = shift; + + throw("Need a db handle.") unless ($dbh and $dbh->isa('DBI::db')); + throw("Need an SQL query to execute.") unless ($sql); + + my $sth = $dbh->prepare($sql); + $sth->execute; + my ($retval) = $sth->fetchrow_array; + + return $retval; +} + + +=head2 dump_table_to_file + + Arg[1] : String $dbtype - db type (source|target) + Arg[2] : String $table - name of table to dump + Arg[3] : String $filename - name of dump file + Arg[4] : Boolean $check_existing - turn on test for existing dump + Example : my $rows_dumped = $object->dump_table_to_file('source', + 'stable_id_event', 'stable_id_event_existing.txt'); + Description : Dumps the contents of a db table to a tab-delimited file. The + dump file will be written to a subdirectory called 'tables' + under the basedir from your configuration. + Return type : Int - the number of rows dumped + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub dump_table_to_file { + my $self = shift; + my $dbtype = shift; + my $table = shift; + my $filename = shift; + my $check_existing = shift; + + # argument check + unless (($dbtype eq 'source') or ($dbtype eq 'target')) { + throw("Missing or unknown db type: $dbtype."); + } + throw("Need a table name.") unless ($table); + throw("Need a filename.") unless ($filename); + + # conditionally check if table was already dumped + if ($check_existing and $self->file_exists($filename, 'tables')) { + $self->logger->info("$filename exists, won't dump again.\n"); + return 0; + } + + my $fh = $self->get_filehandle($filename, 'tables'); + + my $dba = $self->cache->get_DBAdaptor($dbtype); + my $dbh = $dba->dbc->db_handle; + my $sth = $dbh->prepare("SELECT * FROM $table"); + $sth->execute; + + my $i = 0; + + while (my @row = $sth->fetchrow_array) { + $i++; + + # use '\N' for NULL values + for (my $j = 0; $j < scalar(@row); $j++) { + $row[$j] = '\N' unless (defined($row[$j])); + } + + print $fh join("\t", @row); + print $fh "\n"; + } + + $sth->finish; + + return $i; +} + + +=head2 upload_file_into_table + + Arg[1] : String $dbtype - db type (source|target) + Arg[2] : String $table - name of table to upload the data to + Arg[3] : String $filename - name of dump file + Arg[4] : Boolean $no_check_empty - don't check if table is empty + Example : my $rows_uploaded = $object->upload_file_into_table('target', + 'stable_id_event', 'stable_id_event_new.txt'); + Description : Uploads a tab-delimited data file into a db table. The data file + will be taken from a subdirectory 'tables' under your configured + basedir. If the db table isn't empty and $no_check_empty isn't + set, no data is uploaded (and a warning is issued). + Return type : Int - the number of rows uploaded + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub upload_file_into_table { + my $self = shift; + my $dbtype = shift; + my $table = shift; + my $filename = shift; + my $no_check_empty = shift; + + # argument check + unless ( ( $dbtype eq 'source' ) or ( $dbtype eq 'target' ) ) { + throw("Missing or unknown db type: $dbtype."); + } + throw("Need a table name.") unless ($table); + throw("Need a filename.") unless ($filename); + + # sanity check for dry run + if ( $self->conf->param('dry_run') ) { + $self->logger->warning( + "dry_run - skipping db upload for $filename.\n"); + return; + } + + my $file = + join( '/', $self->conf->param('basedir'), 'tables', $filename ); + my $r = 0; + + if ( -s $file ) { + + $self->logger->debug( "$file -> $table\n", 1 ); + + my $dba = $self->cache->get_DBAdaptor($dbtype); + my $dbh = $dba->dbc->db_handle; + + my $idtable = 0; + if ( $table =~ /^([^_]+)_stable_id/ ) { + # This is a stable_id table we're working with. + $idtable = 1; + $table = $1; + } + + # check table is empty + my ( $sql, $sth ); + unless ($no_check_empty) { + if ($idtable) { + $sql = + qq(SELECT count(*) FROM $table WHERE stable_id IS NOT NULL); + } + else { + $sql = qq(SELECT count(*) FROM $table); + } + $sth = $dbh->prepare($sql); + $sth->execute; + my ($c) = $sth->fetchrow_array; + $sth->finish; + + if ( $c > 0 ) { + if ($idtable) { + $self->logger->warning( + "Table $table contains $c stable IDs.\n", + 1 ); + } + else { + $self->logger->warning( + "Table $table not empty: found $c entries.\n", + 1 ); + } + $self->logger->info( "Data not uploaded!\n", 1 ); + return $r; + } + } ## end unless ($no_check_empty) + + # now upload the data + if ($idtable) { + # Create a temporary table, upload the data into it, and then + # update the main table. + $dbh->do( + qq( CREATE TABLE stable_id_$$ ( object_id INTEGER UNSIGNED, + stable_id VARCHAR(255), + version SMALLINT UNSIGNED, + created_date DATETIME, + modified_date DATETIME, + PRIMARY KEY(object_id) ) ) + ); + + $dbh->do( + qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE stable_id_$$)); + + $dbh->do( + qq( + UPDATE $table, stable_id_$$ + SET $table.stable_id=stable_id_$$.stable_id, + $table.version=stable_id_$$.version, + $table.created_date=stable_id_$$.created_date, + $table.modified_date=stable_id_$$.modified_date + WHERE $table.${table}_id = stable_id_$$.object_id ) + ); + + $dbh->do(qq(DROP TABLE stable_id_$$)); + } ## end if ($idtable) + else { + $dbh->do(qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE $table)); + } + $dbh->do(qq(OPTIMIZE TABLE $table)); + + } ## end if ( -s $file ) + else { + $self->logger->warning( "No data found in file $filename.\n", 1 ); + } + + return $r; +} ## end sub upload_file_into_table + + +=head2 logger + + Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set + Example : $object->logger->info("Starting ID mapping.\n"); + Description : Getter/setter for logger object + Return type : Bio::EnsEMBL::Utils::Logger + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub logger { + my $self = shift; + $self->{'_logger'} = shift if (@_); + return $self->{'_logger'}; +} + + +=head2 conf + + Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration + to set + Example : my $basedir = $object->conf->param('basedir'); + Description : Getter/setter for configuration object + Return type : Bio::EnsEMBL::Utils::ConfParser + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub conf { + my $self = shift; + $self->{'_conf'} = shift if (@_); + return $self->{'_conf'}; +} + + +=head2 cache + + Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set + Example : $object->cache->read_from_file('source'); + Description : Getter/setter for cache object + Return type : Bio::EnsEMBL::IdMapping::Cache + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub cache { + my $self = shift; + $self->{'_cache'} = shift if (@_); + return $self->{'_cache'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/Cache.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/Cache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1356 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the +IdMapping application + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::IdMapping::Cache; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::IdMapping::TinyGene; +use Bio::EnsEMBL::IdMapping::TinyTranscript; +use Bio::EnsEMBL::IdMapping::TinyTranslation; +use Bio::EnsEMBL::IdMapping::TinyExon; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Storable qw(nstore retrieve); +use Digest::MD5 qw(md5_hex); + +# define available cache names here +my @cache_names = qw( + exons_by_id + transcripts_by_id + transcripts_by_exon_id + translations_by_id + genes_by_id + genes_by_transcript_id +); + + +=head2 new + + Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object + Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object + Example : my $cache = Bio::EnsEMBL::IdMapping::Cache->new( + -LOGGER => $logger, + -CONF => $conf, + ); + Description : constructor + Return type : Bio::EnsEMBL::IdMapping::Cache object + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logger, $conf, $load_instance) = + rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_); + + unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) { + throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); + } + + unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { + throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); + } + + my $self = {}; + bless ($self, $class); + + # initialise + $self->logger($logger); + $self->conf($conf); + + if ($load_instance) { + $self->read_instance_from_file; + } + + return $self; +} + + +=head2 build_cache_by_slice + + Arg[1] : String $dbtype - db type (source|target) + Arg[2] : String $slice_name - the name of a slice (format as returned by + Bio::EnsEMBL::Slice->name) + Example : my ($num_genes, $filesize) = $cache->build_cache_by_slice( + 'source', 'chromosome:NCBI36:X:1:1000000:-1'); + Description : Builds a cache of genes, transcripts, translations and exons + needed by the IdMapping application and serialises the resulting + cache object to a file, one slice at a time. + Return type : list of the number of genes processed and the size of the + serialised cache file + Exceptions : thrown on invalid slice name + Caller : general + Status : At Risk + : under development + +=cut + +sub build_cache_by_slice { + my $self = shift; + my $dbtype = shift; + my $slice_name = shift; + + # set cache method (required for loading cache later) + $self->cache_method('BY_SEQ_REGION'); + + my $dba = $self->get_DBAdaptor($dbtype); + my $sa = $dba->get_SliceAdaptor; + + my $slice = $sa->fetch_by_name($slice_name); + unless ($slice) { + throw("Could not retrieve slice $slice_name."); + } + + my $genes = $slice->get_all_Genes( undef, undef, 1 ); + + # find common coord_system + my $common_cs_found = $self->find_common_coord_systems; + + # find out whether native coord_system is a common coord_system. + # if so, you don't need to project. + # also don't project if no common coord_system present + my $need_project = 1; + + my $csid = join( ':', + $slice->coord_system_name, + $slice->coord_system->version ); + + if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) { + $need_project = 0; + } + + # build cache + my $type = "$dbtype.$slice_name"; + my $num_genes = + $self->build_cache_from_genes( $type, $genes, $need_project ); + undef $genes; + + # write cache to file, then flush cache to reclaim memory + my $size = $self->write_all_to_file($type); + + return $num_genes, $size; +} ## end sub build_cache_by_slice + + +=head2 build_cache_all + + Arg[1] : String $dbtype - db type (source|target) + Example : my ($num_genes, $filesize) = $cache->build_cache_all('source'); + Description : Builds a cache of genes, transcripts, translations and exons + needed by the IdMapping application and serialises the + resulting cache object to a file. All genes across the genome + are processed in one go. This method should be used when + build_cache_by_seq_region can't be used due to a large number + of toplevel seq_regions (e.g. 2x genomes). + Return type : list of the number of genes processed and the size of the + serialised cache file + Exceptions : thrown on invalid slice name + Caller : general + Status : At Risk + : under development + +=cut + +sub build_cache_all { + my $self = shift; + my $dbtype = shift; + + # set cache method (required for loading cache later) + $self->cache_method('ALL'); + + my $dba = $self->get_DBAdaptor($dbtype); + my $ga = $dba->get_GeneAdaptor; + + my $genes = $ga->fetch_all; + + # find common coord_system + my $common_cs_found = $self->find_common_coord_systems; + + # Build cache. Setting $need_project to 'CHECK' will cause + # build_cache_from_genes() to check the coordinate system for each + # gene. + my $type = "$dbtype.ALL"; + my $need_project = 'CHECK'; + my $num_genes = + $self->build_cache_from_genes( $type, $genes, $need_project ); + + undef $genes; + + # write cache to file, then flush cache to reclaim memory + my $size = $self->write_all_to_file($type); + + return $num_genes, $size; +} + + +=head2 build_cache_from_genes + + Arg[1] : String $type - cache type + Arg[2] : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache + from + Arg[3] : Boolean $need_project - indicate if we need to project exons to + common coordinate system + Example : $cache->build_cache_from_genes( + 'source.chromosome:NCBI36:X:1:100000:1', \@genes); + Description : Builds the cache by fetching transcripts, translations and exons + for a list of genes from the database, and creating lightweight + Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the + data needed by the IdMapping application. These objects are + attached to a name cache in this cache object. Exons only need + to be projected to a commond coordinate system if their native + coordinate system isn't common to source and target assembly + itself. + Return type : int - number of genes after filtering + Exceptions : thrown on wrong or missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub build_cache_from_genes { + my $self = shift; + my $type = shift; + my $genes = shift; + my $need_project = shift; + + throw("You must provide a type.") unless $type; + throw("You must provide a listref of genes.") + unless ( ref($genes) eq 'ARRAY' ); + + # biotype filter + if ( $self->conf()->param('biotypes') || + $self->conf()->param('biotypes_include') || + $self->conf()->param('biotypes_exclude') ) + { + $genes = $self->filter_biotypes($genes); + } + my $num_genes = scalar(@$genes); + + # initialise cache for the given type. + $self->{'cache'}->{$type} = {}; + + #my $i = 0; + #my $num_genes = scalar(@$genes); + #my $progress_id = $self->logger->init_progress($num_genes); + + # loop over genes sorted by gene location. + # the sort will hopefully improve assembly mapper cache performance and + # therefore speed up exon sequence retrieval + foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) { + #$self->logger->log_progressbar($progress_id, ++$i, 2); + #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1); + + if ( $need_project eq 'CHECK' ) { + # find out whether native coord_system is a common coord_system. + # if so, you don't need to project. + # also don't project if no common coord_system present + if ( $self->highest_common_cs ) { + my $csid = join( ':', + $gene->slice->coord_system_name, + $gene->slice->coord_system->version ); + if ( $self->is_common_cs($csid) ) { + $need_project = 0; + } + } + else { + $need_project = 0; + } + } + + # create lightweigt gene + my $lgene = + Bio::EnsEMBL::IdMapping::TinyGene->new_fast( [ + $gene->dbID, $gene->stable_id, + $gene->version, $gene->created_date, + $gene->modified_date, $gene->start, + $gene->end, $gene->strand, + $gene->slice->seq_region_name, $gene->biotype, + $gene->status, $gene->analysis->logic_name, + ( $gene->is_known ? 1 : 0 ), ] ); + + # build gene caches + $self->add( 'genes_by_id', $type, $gene->dbID, $lgene ); + + # transcripts + foreach my $tr ( @{ $gene->get_all_Transcripts } ) { + my $ltr = + Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast( [ + $tr->dbID, $tr->stable_id, + $tr->version, $tr->created_date, + $tr->modified_date, $tr->start, + $tr->end, $tr->strand, + $tr->length, md5_hex( $tr->spliced_seq ), + ( $tr->is_known ? 1 : 0 ) ] ); + + $ltr->biotype( $tr->biotype() ); + $lgene->add_Transcript($ltr); + + # build transcript caches + $self->add( 'transcripts_by_id', $type, $tr->dbID, $ltr ); + $self->add( 'genes_by_transcript_id', $type, $tr->dbID, $lgene ); + + # translation (if there is one) + if ( my $tl = $tr->translation ) { + my $ltl = + Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast( [ + $tl->dbID, $tl->stable_id, + $tl->version, $tl->created_date, + $tl->modified_date, $tr->dbID, + $tr->translate->seq, ( $tr->is_known ? 1 : 0 ), + ] ); + + $ltr->add_Translation($ltl); + + $self->add( 'translations_by_id', $type, $tl->dbID, $ltl ); + + undef $tl; + } + + # exons + foreach my $exon ( @{ $tr->get_all_Exons } ) { + my $lexon = + Bio::EnsEMBL::IdMapping::TinyExon->new_fast( [ + $exon->dbID, + $exon->stable_id, + $exon->version, + $exon->created_date, + $exon->modified_date, + $exon->start, + $exon->end, + $exon->strand, + $exon->slice->seq_region_name, + $exon->slice->coord_system_name, + $exon->slice->coord_system->version, + $exon->slice->subseq( $exon->start, $exon->end, + $exon->strand ), + $exon->phase, + $need_project, ] ); + + # get coordinates in common coordinate system if needed + if ($need_project) { + my @seg = @{ + $exon->project( $self->highest_common_cs, + $self->highest_common_cs_version ) }; + + if ( scalar(@seg) == 1 ) { + my $sl = $seg[0]->to_Slice; + $lexon->common_start( $sl->start ); + $lexon->common_end( $sl->end ); + $lexon->common_strand( $sl->strand ); + $lexon->common_sr_name( $sl->seq_region_name ); + } + } + + $ltr->add_Exon($lexon); + + $self->add( 'exons_by_id', $type, $exon->dbID, $lexon ); + $self->add_list( 'transcripts_by_exon_id', + $type, $exon->dbID, $ltr ); + + undef $exon; + } ## end foreach my $exon ( @{ $tr->get_all_Exons...}) + + undef $tr; + } ## end foreach my $tr ( @{ $gene->get_all_Transcripts...}) + + undef $gene; + } ## end foreach my $gene ( sort { $a...}) + + return $num_genes; +} ## end sub build_cache_from_genes + + +=head2 filter_biotypes + + Arg[1] : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter + Example : my @filtered = @{ $cache->filter_biotypes(\@genes) }; + + Description : Filters a list of genes by biotype. Biotypes are + taken from the IdMapping configuration parameter + 'biotypes_include' or 'biotypes_exclude'. + + If the configuration parameter 'biotypes_exclude' is + defined, then rather than returning the genes whose + biotype is listed in the configuration parameter + 'biotypes_include' the method will return the genes + whose biotype is *not* listed in the 'biotypes_exclude' + configuration parameter. + + It is an error to define both these configuration + parameters. + + The old parameter 'biotypes' is equivalent to + 'biotypes_include'. + + Return type : Listref of Bio::EnsEMBL::Genes (or empty list) + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub filter_biotypes { + my ( $self, $genes ) = @_; + + my @filtered; + my @biotypes; + my $opt_reverse; + + if ( defined( $self->conf()->param('biotypes_include') ) || + defined( $self->conf()->param('biotypes') ) ) + { + if ( defined( $self->conf()->param('biotypes_exclude') ) ) { + $self->logger() + ->error( "You may not use both " . + "'biotypes_include' and 'biotypes_exclude' " . + "in the configuration" ); + } + + if ( defined( $self->conf()->param('biotypes_include') ) ) { + @biotypes = $self->conf()->param('biotypes_include'); + } + else { + @biotypes = $self->conf()->param('biotypes'); + } + $opt_reverse = 0; + } + else { + @biotypes = $self->conf()->param('biotypes_exclude'); + $opt_reverse = 1; + } + + foreach my $gene ( @{$genes} ) { + my $keep_gene; + + foreach my $biotype (@biotypes) { + if ( $gene->biotype() eq $biotype ) { + if ($opt_reverse) { $keep_gene = 0 } + else { $keep_gene = 1 } + last; + } + } + + if ( defined($keep_gene) ) { + if ($keep_gene) { + push( @filtered, $gene ); + } + } + elsif ($opt_reverse) { + push( @filtered, $gene ); + } + } + + return \@filtered; +} ## end sub filter_biotypes + + +=head2 add + + Arg[1] : String $name - a cache name (e.g. 'genes_by_id') + Arg[2] : String type - a cache type (e.g. "source.$slice_name") + Arg[3] : String $key - key of this entry (e.g. a gene dbID) + Arg[4] : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache + Example : $cache->add('genes_by_id', + 'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene); + Description : Adds a TinyFeature object to a named cache. + Return type : Bio::EnsEMBL::IdMapping::TinyFeature + Exceptions : thrown on wrong or missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub add { + my $self = shift; + my $name = shift; + my $type = shift; + my $key = shift; + my $val = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + throw("You must provide a cache key (e.g. a gene dbID).") unless $key; + + $self->{'cache'}->{$type}->{$name}->{$key} = $val; + + return $self->{'cache'}->{$type}->{$name}->{$key}; +} + +=head2 add_list + + Arg[1] : String $name - a cache name (e.g. 'genes_by_id') + Arg[2] : String type - a cache type (e.g. "source.$slice_name") + Arg[3] : String $key - key of this entry (e.g. a gene dbID) + Arg[4] : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values + to cache + Example : $cache->add_list('transcripts_by_exon_id', + 'source.chromosome:NCBI36:X:1:1000000:1', '1234', + $tiny_transcript1, $tiny_transcript2); + Description : Adds a list of TinyFeature objects to a named cache. + Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects + Exceptions : thrown on wrong or missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub add_list { + my $self = shift; + my $name = shift; + my $type = shift; + my $key = shift; + my @vals = @_; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + throw("You must provide a cache key (e.g. a gene dbID).") unless $key; + + push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals; + + return $self->{'cache'}->{$type}->{$name}->{$key}; +} + +sub get_by_key { + my $self = shift; + my $name = shift; + my $type = shift; + my $key = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + throw("You must provide a cache key (e.g. a gene dbID).") unless $key; + + # transparently load cache from file unless already loaded + unless ($self->{'instance'}->{'loaded'}->{"$type"}) { + $self->read_and_merge($type); + } + + return $self->{'cache'}->{$type}->{$name}->{$key}; +} + +sub get_by_name { + my $self = shift; + my $name = shift; + my $type = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + + # transparently load cache from file unless already loaded + unless ($self->{'instance'}->{'loaded'}->{$type}) { + $self->read_and_merge($type); + } + + return $self->{'cache'}->{$type}->{$name} || {}; +} + + +sub get_count_by_name { + my $self = shift; + my $name = shift; + my $type = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + + # transparently load cache from file unless already loaded + unless ($self->{'instance'}->{'loaded'}->{$type}) { + $self->read_and_merge($type); + } + + return scalar(keys %{ $self->get_by_name($name, $type) }); +} + + +sub find_common_coord_systems { + my $self = shift; + + # get adaptors for source db + my $s_dba = $self->get_DBAdaptor('source'); + my $s_csa = $s_dba->get_CoordSystemAdaptor; + my $s_sa = $s_dba->get_SliceAdaptor; + + # get adaptors for target db + my $t_dba = $self->get_DBAdaptor('target'); + my $t_csa = $t_dba->get_CoordSystemAdaptor; + my $t_sa = $t_dba->get_SliceAdaptor; + + # find common coord_systems + my @s_coord_systems = @{ $s_csa->fetch_all }; + my @t_coord_systems = @{ $t_csa->fetch_all }; + my $found_highest = 0; + +SOURCE: + foreach my $s_cs (@s_coord_systems) { + if ( !$s_cs->is_default() ) { next SOURCE } + + TARGET: + foreach my $t_cs (@t_coord_systems) { + if ( !$t_cs->is_default() ) { next TARGET } + + if ( $s_cs->name eq $t_cs->name ) { + + # test for identical coord_system version + if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) { + next TARGET; + } + + # test for at least 50% identical seq_regions + if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) { + $self->add_common_cs($s_cs); + + unless ($found_highest) { + $self->highest_common_cs( $s_cs->name ); + $self->highest_common_cs_version( $s_cs->version ); + } + + $found_highest = 1; + + next SOURCE; + } + } + } ## end foreach my $t_cs (@t_coord_systems) + } ## end foreach my $s_cs (@s_coord_systems) + + return $found_highest; +} ## end sub find_common_coord_systems + + +sub seq_regions_compatible { + my $self = shift; + my $cs = shift; + my $s_sa = shift; + my $t_sa = shift; + + unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('You must provide a CoordSystem'); + } + + unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor') + and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) { + throw('You must provide a source and target SliceAdaptor'); + } + + my %sr_match; + my $equal = 0; + + my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version); + my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version); + + # sanity check to prevent divison by zero + my $s_count = scalar(@$s_seq_regions); + my $t_count = scalar(@$t_seq_regions); + return(0) if ($s_count == 0 or $t_count == 0); + + foreach my $s_sr (@$s_seq_regions) { + $sr_match{$s_sr->seq_region_name} = $s_sr->length; + } + + foreach my $t_sr (@$t_seq_regions) { + if (exists($sr_match{$t_sr->seq_region_name})) { + $equal++; + + # return false if we have a region with same name but different length + return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length); + } + } + + if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) { + return(1); + } else { + $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n"); + return(0); + } + +} + + +sub check_db_connection { + my $self = shift; + my $dbtype = shift; + + my $err = 0; + + eval { + my $dba = $self->get_DBAdaptor($dbtype); + $dba->dbc->connect; + }; + + if ($@) { + $self->logger->warning("Can't connect to $dbtype db: $@\n"); + $err++; + } else { + $self->logger->debug("Connection to $dbtype db ok.\n"); + $self->{'_db_conn_ok'}->{$dbtype} = 1; + } + + return $err; +} + + +sub check_db_read_permissions { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + my %privs = %{ $self->get_db_privs($dbtype) }; + + unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) { + $self->logger->warning("User doesn't have read permission on $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("Read permission on $dbtype db ok.\n"); + } + + return $err; +} + + +sub check_db_write_permissions { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + + unless ($self->do_upload) { + $self->logger->debug("No uploads, so write permission on $dbtype db not required.\n"); + return $err; + } + + my %privs = %{ $self->get_db_privs($dbtype) }; + + unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) { + $self->logger->warning("User doesn't have write permission on $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("Write permission on $dbtype db ok.\n"); + } + + return $err; +} + + +sub do_upload { + my $self = shift; + + if ($self->conf->param('dry_run') or + ! ($self->conf->param('upload_events') or + $self->conf->param('upload_stable_ids') or + $self->conf->param('upload_archive'))) { + return 0; + } else { + return 1; + } +} + + +sub get_db_privs { + my ( $self, $dbtype ) = @_; + + my %privs = (); + my $rs; + + # get privileges from mysql db + eval { + my $dbc = $self->get_DBAdaptor($dbtype)->dbc(); + my $sql = qq(SHOW GRANTS FOR ) . $dbc->username(); + my $sth = $dbc->prepare($sql); + $sth->execute(); + $rs = $sth->fetchall_arrayref(); + #$sth->finish(); + }; + + if ($@) { + $self->logger->warning( + "Error obtaining privileges from $dbtype db: $@\n"); + return {}; + } + + # parse the output + foreach my $r ( map { $_->[0] } @{$rs} ) { + $r =~ s/GRANT (.*) ON .*/$1/i; + foreach my $p ( split( ',', $r ) ) { + # trim leading and trailing whitespace + $p =~ s/^\s+//; + $p =~ s/\s+$//; + $privs{ uc($p) } = 1; + } + } + + return \%privs; +} ## end sub get_db_privs + + +sub check_empty_tables { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + my $c = 0; + + if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) { + $self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n"); + return $err; + } + + eval { + my @tables = + qw( + gene_stable_id + transcript_stable_id + translation_stable_id + exon_stable_id + stable_id_event + mapping_session + gene_archive + peptide_archive + ); + + my $dba = $self->get_DBAdaptor($dbtype); + foreach my $table (@tables) { + if ( $table =~ /^([^_]+)_stable_id/ ) { + $table = $1; + if ( $c = + $self->fetch_value_from_db( + $dba, + "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL" + ) ) + { + $self->logger->warning( + "$table table in $dbtype db has $c stable IDs.\n"); + $err++; + } + } + else { + if ( $c = + $self->fetch_value_from_db( + $dba, "SELECT COUNT(*) FROM $table" + ) ) + { + $self->logger->warning( + "$table table in $dbtype db has $c entries.\n"); + $err++; + } + } + } ## end foreach my $table (@tables) + }; + + if ($@) { + $self->logger->warning( +"Error retrieving stable ID and archive table row counts from $dbtype db: $@\n" + ); + $err++; + } + elsif ( !$err ) { + $self->logger->debug( + "All stable ID and archive tables in $dbtype db are empty.\n"); + } + return $err; +} + + +sub check_sequence { + my ( $self, $dbtype ) = @_; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ( $self->{'_db_conn_ok'}->{$dbtype} ); + + my $err = 0; + my $c = 0; + + eval { + my $dba = $self->get_DBAdaptor($dbtype); + unless ( $c = + $self->fetch_value_from_db( + $dba->dnadb(), "SELECT COUNT(*) FROM dna" + ) ) + { + $err++; + } + }; + + if ($@) { + $self->logger->warning( "Error retrieving dna table row count " + . "from $dbtype database: $@\n" ); + $err++; + } elsif ($err) { + $self->logger->warning("No sequence found in $dbtype database.\n"); + } else { + $self->logger->debug( + ucfirst($dbtype) . " db has sequence ($c entries).\n" ); + } + + return $err; +} ## end sub check_sequence + + +sub check_meta_entries { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + my $assembly_default; + my $schema_version; + + eval { + my $dba = $self->get_DBAdaptor($dbtype); + $assembly_default = $self->fetch_value_from_db($dba, + qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default')); + $schema_version = $self->fetch_value_from_db($dba, + qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version')); + }; + + if ($@) { + $self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n"); + return ++$err; + } + + unless ($assembly_default) { + $self->logger->warning("No meta.assembly.default value found in $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("meta.assembly.default value found ($assembly_default).\n"); + } + + unless ($schema_version) { + $self->logger->warning("No meta.schema_version value found in $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("meta.schema_version value found ($schema_version).\n"); + } + + return $err; +} + + +sub fetch_value_from_db { + my ( $self, $dba, $sql ) = @_; + + assert_ref( $dba, 'Bio::EnsEMBL::DBSQL::DBAdaptor' ); + + if ( !defined($sql) ) { + throw("Need an SQL statement to execute.\n"); + } + + my $sth = $dba->dbc->prepare($sql); + $sth->execute(); + + my ($c) = $sth->fetchrow_array; + return $c; +} + +sub get_DBAdaptor { + my ( $self, $prefix ) = @_; + + unless ( $self->{'_dba'}->{$prefix} ) { + # connect to database + my $dba = + new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => $self->conf->param("${prefix}host"), + -port => $self->conf->param("${prefix}port"), + -user => $self->conf->param("${prefix}user"), + -pass => $self->conf->param("${prefix}pass"), + -dbname => $self->conf->param("${prefix}dbname"), + -group => $prefix, ); + + if ( !defined( $self->conf->param("${prefix}host_dna") ) ) { + # explicitely set the dnadb to itself - by default the Registry + # assumes a group 'core' for this now + $dba->dnadb($dba); + } else { + my $dna_dba = + new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => $self->conf->param("${prefix}host_dna"), + -port => $self->conf->param("${prefix}port_dna"), + -user => $self->conf->param("${prefix}user_dna"), + -pass => $self->conf->param("${prefix}pass_dna"), + -dbname => $self->conf->param("${prefix}dbname_dna"), + -group => $prefix, ); + $dba->dnadb($dna_dba); + } + + $self->{'_dba'}->{$prefix} = $dba; + } ## end unless ( $self->{'_dba'}->...) + + return $self->{'_dba'}->{$prefix}; +} ## end sub get_DBAdaptor + + +sub cache_file_exists { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + my $cache_file = $self->cache_file($type); + + if (-e $cache_file) { + $self->logger->info("Cache file found for $type.\n", 2); + $self->logger->debug("Will read from $cache_file.\n", 2); + return 1; + } else { + $self->logger->info("No cache file found for $type.\n", 2); + $self->logger->info("Will build cache from db.\n", 2); + return 0; + } +} + + +sub cache_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + return $self->dump_path."/$type.object_cache.ser"; +} + + +sub instance_file { + my $self = shift; + + return $self->dump_path."/cache_instance.ser"; +} + + +sub dump_path { + my $self = shift; + + $self->{'dump_path'} ||= path_append($self->conf->param('basedir'), 'cache'); + + return $self->{'dump_path'}; +} + + +sub write_all_to_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + my $size = 0; + $size += $self->write_to_file($type); + $size += $self->write_instance_to_file; + + return parse_bytes($size); +} + + +sub write_to_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + unless ($self->{'cache'}->{$type}) { + $self->logger->warning("No features found in $type. Won't write cache file.\n"); + return; + } + + my $cache_file = $self->cache_file($type); + + eval { nstore($self->{'cache'}->{$type}, $cache_file) }; + if ($@) { + throw("Unable to store $cache_file: $@\n"); + } + + my $size = -s $cache_file; + return $size; +} + + +sub write_instance_to_file { + my $self = shift; + + my $instance_file = $self->instance_file; + + eval { nstore($self->{'instance'}, $instance_file) }; + if ($@) { + throw("Unable to store $instance_file: $@\n"); + } + + my $size = -s $instance_file; + return $size; +} + + +sub read_from_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + my $cache_file = $self->cache_file($type); + + if (-s $cache_file) { + + #$self->logger->info("Reading cache from file...\n", 0, 'stamped'); + #$self->logger->info("Cache file $cache_file.\n", 1); + eval { $self->{'cache'}->{$type} = retrieve($cache_file); }; + if ($@) { + throw("Unable to retrieve cache: $@"); + } + #$self->logger->info("Done.\n", 0, 'stamped'); + + } else { + $self->logger->warning("Cache file $cache_file not found or empty.\n"); + } + + + return $self->{'cache'}->{$type}; +} + + +sub read_and_merge { + my $self = shift; + my $dbtype = shift; + + unless ($dbtype eq 'source' or $dbtype eq 'target') { + throw("Db type must be 'source' or 'target'."); + } + + # read cache from single or multiple files, depending on caching strategy + my $cache_method = $self->cache_method; + if ($cache_method eq 'ALL') { + $self->read_from_file("$dbtype.ALL"); + } elsif ($cache_method eq 'BY_SEQ_REGION') { + foreach my $slice_name (@{ $self->slice_names($dbtype) }) { + $self->read_from_file("$dbtype.$slice_name"); + } + } else { + throw("Unknown cache method: $cache_method."); + } + + $self->merge($dbtype); + + # flag as being loaded + $self->{'instance'}->{'loaded'}->{$dbtype} = 1; +} + + +sub merge { + my $self = shift; + my $dbtype = shift; + + unless ($dbtype eq 'source' or $dbtype eq 'target') { + throw("Db type must be 'source' or 'target'."); + } + + foreach my $type (keys %{ $self->{'cache'} || {} }) { + next unless ($type =~ /^$dbtype/); + + foreach my $name (keys %{ $self->{'cache'}->{$type} || {} }) { + + foreach my $key (keys %{ $self->{'cache'}->{$type}->{$name} || {} }) { + if (defined $self->{'cache'}->{$dbtype}->{$name}->{$key}) { + # warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n"); + } else { + $self->{'cache'}->{$dbtype}->{$name}->{$key} = + $self->{'cache'}->{$type}->{$name}->{$key}; + } + + delete $self->{'cache'}->{$type}->{$name}->{$key}; + } + + delete $self->{'cache'}->{$type}->{$name}; + } + + delete $self->{'cache'}->{$type}; + + } +} + + +sub read_instance_from_file { + my $self = shift; + + my $instance_file = $self->instance_file; + + unless (-s $instance_file) { + throw("No valid cache instance file found at $instance_file."); + } + + eval { $self->{'instance'} = retrieve($instance_file); }; + if ($@) { + throw("Unable to retrieve cache instance: $@"); + } + + return $self->{'instance'}; +} + + +sub slice_names { + my $self = shift; + my $dbtype = shift; + + throw("You must provide a db type (source|target).") unless $dbtype; + + my $dba = $self->get_DBAdaptor($dbtype); + my $sa = $dba->get_SliceAdaptor; + + my @slice_names = (); + + if ( $self->conf->param('chromosomes') ) { + # Fetch the specified chromosomes. + foreach my $chr ( $self->conf->param('chromosomes') ) { + my $slice = $sa->fetch_by_region( 'chromosome', $chr ); + push @slice_names, $slice->name; + } + + } + elsif ( $self->conf->param('region') ) { + # Fetch the slices on the specified regions. Don't use + # SliceAdaptor->fetch_by_name() since this will fail if assembly + # versions are different for source and target db. + my ( $cs, $version, $name, $start, $end, $strand ) = + split( /:/, $self->conf->param('region') ); + + my $slice = $sa->fetch_by_region( $cs, $name, $start, $end ); + + push @slice_names, $slice->name; + + } + else { + # Fetch all slices that have genes on them. + my $ga = $dba->get_GeneAdaptor; + + foreach my $srid ( @{ $ga->list_seq_region_ids } ) { + my $slice = $sa->fetch_by_seq_region_id($srid); + + if ( !$slice->is_reference() ) { + my $slices = + $slice->adaptor() + ->fetch_by_region_unique( $slice->coord_system_name(), + $slice->seq_region_name() ); + + push( @slice_names, map { $_->name() } @{$slices} ); + } + else { + push @slice_names, $slice->name(); + } + } + } + + return \@slice_names; +} ## end sub slice_names + + +sub logger { + my $self = shift; + $self->{'logger'} = shift if (@_); + return $self->{'logger'}; +} + +sub conf { + my $self = shift; + $self->{'conf'} = shift if (@_); + return $self->{'conf'}; +} + + +sub cache_method { + my $self = shift; + $self->{'instance'}->{'cache_method'} = shift if (@_); + return $self->{'instance'}->{'cache_method'}; +} + + +sub highest_common_cs { + my $self = shift; + $self->{'instance'}->{'hccs'} = shift if (@_); + return $self->{'instance'}->{'hccs'}; +} + + +sub highest_common_cs_version { + my $self = shift; + $self->{'instance'}->{'hccsv'} = shift if (@_); + return $self->{'instance'}->{'hccsv'}; +} + + +sub add_common_cs { + my $self = shift; + my $cs = shift; + + unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('You must provide a CoordSystem'); + } + + my $csid = join(':', $cs->name, $cs->version); + + $self->{'instance'}->{'ccs'}->{$csid} = 1; +} + + +sub is_common_cs { + my $self = shift; + my $csid = shift; + + return $self->{'instance'}->{'ccs'}->{$csid}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/Entry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/Entry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,178 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::Entry - object representing a ScoredMappingMatrix entry + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +This object represents a ScoredMappingMatrix entry. It is defined by a +pair of a source and target object's internal Id and a score for this +mapping. + +=head1 METHODS + + new + new_fast + source + target + score + to_string + +=cut + +package Bio::EnsEMBL::IdMapping::Entry; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 new + + Example : my $entry = Bio::EnsEMBL::IdMapping::Entry->new(); + Description : Constructor. This is a no-argument constructor, so you need to + populate the object manually. Rarely used since in most cases + new_fast() is preferred. + Return type : a Bio::EnsEMBL::IdMapping::Entry object + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = []; + bless ($self, $class); + + return $self; +} + + +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the Entry + object + Example : my $entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([ + $source_gene->id, $target_gene->id, 0.9]); + Description : Fast constructor. + Return type : a Bio::EnsEMBL::IdMapping::Entry object + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new_fast { + my $class = shift; + my $array_ref = shift; + return bless $array_ref, $class; +} + + +=head2 source + + Arg[1] : (optional) Int - source object's internal Id + Description : Getter/setter for source object's internal Id. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub source { + my $self = shift; + $self->[0] = shift if (@_); + return $self->[0]; +} + + +=head2 target + + Arg[1] : (optional) Int - target object's internal Id + Description : Getter/setter for target object's internal Id. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub target { + my $self = shift; + $self->[1] = shift if (@_); + return $self->[1]; +} + + +=head2 score + + Arg[1] : (optional) Float - a score + Description : Getter/setter for score for the mapping between source and + target object. + Return type : Float + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub score { + my $self = shift; + $self->[2] = shift if (@_); + return $self->[2]; +} + + +=head2 to_string + + Example : print LOG $entry->to_string, "\n"; + Description : Returns a string representation of the Entry object. Useful for + debugging and logging. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub to_string { + my $self = shift; + return sprintf('%-10s%-10s%-5.6f', $self->source, $self->target, $self->score); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/ExonScoreBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,831 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from +Java application. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::ExonScoreBuilder; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::ScoreBuilder; +our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder); + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append); +use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; + + +# +# exon scoring is done in two steps: +# 1. map exons by overlap (if a common coord_system exists) +# 2. map remaining and poorly scoring exons using exonerate +# +sub score_exons { + my $self = shift; + + $self->logger->info( "-- Scoring exons...\n\n", 0, 'stamped' ); + + # score using overlaps, then exonerate + my $matrix = $self->overlap_score; + + my $exonerate_matrix = $self->exonerate_score($matrix); + + # log stats before matrix merging + $self->logger->info("\nOverlap scoring matrix:\n"); + $self->log_matrix_stats($matrix); + $self->logger->info("\nExonerate scoring matrix:\n"); + $self->log_matrix_stats($exonerate_matrix); + + # merge matrices + $self->logger->info( "\nMerging scoring matrices...\n", 0, + 'stamped' ); + $matrix->merge($exonerate_matrix); + + $self->logger->info( "Done.\n\n", 0, 'stamped' ); + + # debug logging + if ( $self->logger->loglevel eq 'debug' ) { + $matrix->log( 'exon', $self->conf->param('basedir') ); + } + + # log stats of combined matrix + $self->logger->info("Combined scoring matrix:\n"); + $self->log_matrix_stats($matrix); + + $self->logger->info( "\nDone with exon scoring.\n\n", 0, 'stamped' ); + + return $matrix; +} ## end sub score_exons + + +# +# direct mapping by overlap (if common coord_system exists) +# +sub overlap_score { + my $self = shift; + + my $dump_path = + path_append( $self->conf->param('basedir'), 'matrix' ); + + my $matrix = + Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_overlap_matrix.ser', + ); + + my $overlap_cache = $matrix->cache_file; + + if ( -s $overlap_cache ) { + + # read from file + $self->logger->info( + "Reading exon overlap scoring matrix from file...\n", + 0, 'stamped' ); + $self->logger->debug( "Cache file $overlap_cache.\n", 1 ); + $matrix->read_from_file; + $self->logger->info( "Done.\n", 0, 'stamped' ); + + } + else { + + # build scoring matrix + $self->logger->info( + "No exon overlap scoring matrix found. Will build new one.\n"); + + if ( $self->cache->highest_common_cs ) { + $self->logger->info( "Overlap scoring...\n", 0, 'stamped' ); + $matrix = $self->build_overlap_scores($matrix); + $self->logger->info( "Done.\n", 0, 'stamped' ); + } + + # write scoring matrix to file + $matrix->write_to_file; + + } + + return $matrix; +} ## end sub overlap_score + + +# +# map the remaining exons using exonerate +# +sub exonerate_score { + my $self = shift; + my $matrix = shift; + + unless ( $matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $dump_path = + path_append( $self->conf->param('basedir'), 'matrix' ); + + my $exonerate_matrix = + Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_exonerate_matrix.ser', + ); + + my $exonerate_cache = $exonerate_matrix->cache_file; + + if ( -s $exonerate_cache ) { + + # read from file + $self->logger->info( "Reading exonerate matrix from file...\n", + 0, 'stamped' ); + $self->logger->debug( "Cache file $exonerate_cache.\n", 1 ); + $exonerate_matrix->read_from_file; + $self->logger->info( "Done.\n", 0, 'stamped' ); + + } + else { + + # build scoring matrix + $self->logger->info( + "No exonerate matrix found. Will build new one.\n"); + + # dump exons to fasta files + my $dump_count = $self->dump_filtered_exons($matrix); + + if ($dump_count) { + # run exonerate + $self->run_exonerate; + + # parse results + $self->parse_exonerate_results($exonerate_matrix); + + } + else { + + $self->logger->info( "No source and/or target exons dumped, " . + "so don't need to run exonerate.\n" ); + + } + + # write scoring matrix to file + $exonerate_matrix->write_to_file; + + } ## end else [ if ( -s $exonerate_cache)] + + return $exonerate_matrix; +} ## end sub exonerate_score + +# +# Algorithm: +# Get a lists of exon containers for source and target. Walk along both +# lists, set a flag when you first encounter an exon (i.e. it starts). +# Record all alternative exons until you encounter the exon again (exon +# ends), then score against all alternative exons you've recorded. +# +sub build_overlap_scores { + my ( $self, $matrix ) = @_; + + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # get sorted list of exon containers + $self->logger->info( "Reading sorted exons from cache...\n", + 1, 'stamped' ); + + my @source_exons = + $self->sort_exons( [ + values %{ $self->cache->get_by_name( 'exons_by_id', 'source' ) } + ] ); + + my @target_exons = + $self->sort_exons( [ + values %{ $self->cache->get_by_name( 'exons_by_id', 'target' ) } + ] ); + + $self->logger->info( "Done.\n", 1, 'stamped' ); + + # get first source and target exon container + my $source_ec = shift(@source_exons); + my $target_ec = shift(@target_exons); + + my %source_overlap = (); + my %target_overlap = (); + + $self->logger->info( "Scoring...\n", 1, 'stamped' ); + + while ( $source_ec or $target_ec ) { + my $add_source = 0; + my $add_target = 0; + + # compare exon containers + if ( $source_ec && $target_ec ) { + my $cmp = + $self->compare_exon_containers( $source_ec, $target_ec ); + if ( $cmp <= 0 ) { $add_source = 1 } + if ( $cmp >= 0 ) { $add_target = 1 } + } + elsif ($source_ec) { + $add_source = 1; + } + elsif ($target_ec) { + $add_target = 1; + } + else { + die('The world is a strange place'); + } + + if ($add_source) { + if ( $source_overlap{ $source_ec->[0] } ) { + # remove exon from list of overlapping source exons to score + # target against + delete $source_overlap{ $source_ec->[0] }; + } + else { + # add exon to list of overlapping source exons to score target + # against + $source_overlap{ $source_ec->[0] } = $source_ec->[0]; + + # score source exon against all target exons in current overlap + # list + foreach my $target_exon ( values %target_overlap ) { + if ( defined( $matrix->get_score( + $source_ec->[0]->id, $target_exon->id + ) ) ) + { + next; + } + + $self->calc_overlap_score( $source_ec->[0], $target_exon, + $matrix ); + } + } ## end else [ if ( $source_overlap{ ...})] + + # get next source exon container + $source_ec = shift(@source_exons); + } ## end if ($add_source) + + if ($add_target) { + if ( $target_overlap{ $target_ec->[0] } ) { + # remove exon from list of overlapping target exons to score + # source against + delete $target_overlap{ $target_ec->[0] }; + } + else { + # add exon to list of overlapping target exons to score source + # against + $target_overlap{ $target_ec->[0] } = $target_ec->[0]; + + # score target exon against all source exons in current overlap + # list + foreach my $source_exon ( values %source_overlap ) { + if ( defined( $matrix->get_score( + $source_exon->id, $target_ec->[0]->id + ) ) ) + { + next; + } + + $self->calc_overlap_score( $source_exon, $target_ec->[0], + $matrix ); + } + } + + # get next target exon container + $target_ec = shift(@target_exons); + } ## end if ($add_target) + } ## end while ( $source_ec or $target_ec) + + $self->logger->info( "Done.\n", 1, 'stamped' ); + + return $matrix; +} ## end sub build_overlap_scores + + +# +# Return a list of exon containers, sorted by seq_region_name, then location +# (where location is either start-1 or end, so each exon is in the list twice). +# An exon container is a listrefs of a TinyExon object and its location. This +# implements the ExonSortContainer in the java application. +# +sub sort_exons { + my $self = shift; + my $exons = shift; + + return sort { + ( $a->[0]->common_sr_name cmp $b->[0]->common_sr_name ) || + ( $a->[1] <=> $b->[1] ) + } ( map { [ $_, $_->common_start - 1 ] } @$exons ), + ( map { [ $_, $_->common_end ] } @$exons ); +} + +sub compare_exon_containers { + my $self = shift; + my $e1 = shift; + my $e2 = shift; + + return ( ( $e1->[0]->common_sr_name cmp $e2->[0]->common_sr_name ) || + ( $e1->[1] <=> $e2->[1] ) ); +} + +# +# Calculates overlap score between two exons. Its done by dividing +# overlap region by exons sizes. 1.0 is full overlap on both exons. +# Score of at least 0.5 are added to the exon scoring matrix. +# +sub calc_overlap_score { + my $self = shift; + my $source_exon = shift; + my $target_exon = shift; + my $matrix = shift; + + my ( $start, $end ); + + # don't score if exons on different strand + return unless ( $source_exon->strand == $target_exon->strand ); + + # determine overlap start + if ( $source_exon->start > $target_exon->start ) { + $start = $source_exon->start; + } + else { + $start = $target_exon->start; + } + + # determine overlap end + if ( $source_exon->end < $target_exon->end ) { + $end = $source_exon->end; + } + else { + $end = $target_exon->end; + } + + # + # Calculate score, which is defined as average overlap / exon length + # ratio. + # + + my $overlap = $end - $start + 1; + my $source_length = $source_exon->end - $source_exon->start + 1; + my $target_length = $target_exon->end - $target_exon->start + 1; + + my $score = ( $overlap/$source_length + $overlap/$target_length )/2; + + # The following penalty was removed because it meant that an exon + # whose sequence and position had not changed would become a + # candidate for similarity mapping when its phase changed. This + # caused lots of criss-crossing stable ID history entries between + # genes/transcripts/translations in some gene families. + # + ## PENALTY: + ## penalise by 10% if phase if different + #if ( $source_exon->phase != $target_exon->phase ) { + #$score *= 0.9; + #} + + # add score to scoring matrix if it's at least 0.5 + if ( $score >= 0.5 ) { + $matrix->add_score( $source_exon->id, $target_exon->id, $score ); + } + +} ## end sub calc_overlap_score + + +sub run_exonerate { + my $self = shift; + + my $source_file = $self->exon_fasta_file('source'); + my $target_file = $self->exon_fasta_file('target'); + my $source_size = -s $source_file; + my $target_size = -s $target_file; + + # check if fasta files exist and are not empty + unless ($source_size and $target_size) { + throw("Can't find exon fasta files."); + } + + # create an empty lsf log directory + my $logpath = path_append($self->logger->logpath, 'exonerate'); + system("rm -rf $logpath") == 0 or + $self->logger->error("Unable to delete lsf log dir $logpath: $!\n"); + system("mkdir -p $logpath") == 0 or + $self->logger->error("Can't create lsf log dir $logpath: $!\n"); + + # delete exonerate output from previous runs + my $dump_path = $self->cache->dump_path; + + opendir(DUMPDIR, $dump_path) or + $self->logger->error("Can't open $dump_path for reading: $!"); + + while (defined(my $file = readdir(DUMPDIR))) { + next unless /exonerate_map\.\d+/; + + unlink("$dump_path/$file") or + $self->logger->error("Can't delete $dump_path/$file: $!"); + } + + closedir(DUMPDIR); + + # determine number of jobs to split task into + my $bytes_per_job = $self->conf->param('exonerate_bytes_per_job') + || 250000; + my $num_jobs = $self->conf->param('exonerate_jobs'); + $num_jobs ||= int( $source_size/$bytes_per_job + 1 ); + + my $percent = + int( ( $self->conf->param('exonerate_threshold') || 0.5 )*100 ); + my $lsf_name = 'idmapping_exonerate_' . time; + my $exonerate_path = $self->conf->param('exonerate_path'); + my $exonerate_extra_params = + $self->conf->param('exonerate_extra_params'); + + # + # run exonerate jobs using lsf + # + my $exonerate_job = + qq{$exonerate_path } . + qq{--query $source_file --target $target_file } . + q{--querychunkid $LSB_JOBINDEX } . + qq{--querychunktotal $num_jobs } . + q{--model ungapped -M 1000 -D 100 } . + q{--showalignment FALSE --subopt no } . qq{--percent $percent } . + $self->conf->param('exonerate_extra_params') . " " . + q{--ryo 'myinfo: %qi %ti %et %ql %tl\n' } . + qq{| grep '^myinfo:' > $dump_path/exonerate_map.\$LSB_JOBINDEX} . + "\n"; + + $self->logger->info("Submitting $num_jobs exonerate jobs to lsf.\n"); + $self->logger->debug("$exonerate_job\n\n"); + + my $bsub_cmd = sprintf( + "|bsub -J%s[1-%d]%%%d -o %s/exonerate.%%I.out %s", + $lsf_name, + $num_jobs, + $self->conf()->param('exonerate_concurrent_jobs') || 200, + $logpath, + $self->conf()->param('lsf_opt_exonerate') ); + + local *BSUB; + open( BSUB, $bsub_cmd ) + or $self->logger->error("Could not open open pipe to bsub: $!\n"); + + print BSUB $exonerate_job; + $self->logger->error("Error submitting exonerate jobs: $!\n") + unless ($? == 0); + close BSUB; + + # submit dependent job to monitor finishing of exonerate jobs + $self->logger->info("Waiting for exonerate jobs to finish...\n", 0, 'stamped'); + + my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . + qq{-o $logpath/exonerate_depend.out /bin/true}; + + system($dependent_job) == 0 or + $self->logger->error("Error submitting dependent job: $!\n"); + + $self->logger->info("All exonerate jobs finished.\n", 0, 'stamped'); + + # + # check results + # + my @missing; + my @error; + + for (my $i = 1; $i <= $num_jobs; $i++) { + + # check that output file exists + my $outfile = "$dump_path/exonerate_map.$i"; + push @missing, $outfile unless (-f "$outfile"); + + # check no errors occurred + my $errfile = "$logpath/exonerate.$i.err"; + push @error, $errfile if (-s "$errfile"); + } + + if (@missing) { + $self->logger->info("Couldn't find all exonerate output files. These are missing:\n"); + foreach (@missing) { + $self->logger->info("$_\n", 1); + } + + exit(1); + } + + if (@error) { + $self->logger->info("One or more exonerate jobs failed. Check these error files:\n"); + foreach (@error) { + $self->logger->info("$_\n", 1); + } + + exit(1); + } + +} + + +sub exon_fasta_file { + my $self = shift; + my $type = shift; + + throw("You must provide a type.") unless $type; + + return $self->cache->dump_path."/$type.exons.fasta"; +} + + +sub dump_filtered_exons { + my $self = shift; + my $matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('You must provide a ScoredMappingMatrix.'); + } + + # write exons to fasta files + my $source_count = $self->write_filtered_exons('source', $matrix); + my $target_count = $self->write_filtered_exons('target', $matrix); + + # return true if both source and target exons were written; otherwise we + # don't need to run exonerate + return (($source_count > 0) and ($target_count > 0)); +} + + +sub write_filtered_exons { + my $self = shift; + my $type = shift; + my $matrix = shift; + + throw("You must provide a type.") unless $type; + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('You must provide a ScoredMappingMatrix.'); + } + + $self->logger->info("\nDumping $type exons to fasta file...\n", 0, 'stamped'); + + # don't dump exons shorter than this + my $min_exon_length = $self->conf->param('min_exon_length') || 15; + + # counters + my $total_exons = 0; + my $dumped_exons = 0; + + # filehandle for fasta files + my $fh; + my $file = $self->exon_fasta_file($type); + open($fh, '>', $file) or throw("Unable to open $file for writing: $!"); + + # loop over exons, dump sequence to fasta file if longer than threshold and + # score < 1 + EXON: + foreach my $eid (sort { $b <=> $a } + keys %{ $self->cache->get_by_name('exons_by_id', $type) }) { + + my $exon = $self->cache->get_by_key('exons_by_id', $type, $eid); + + $total_exons++; + + # skip if exon shorter than threshold + next EXON if ($exon->length < $min_exon_length); + + # skip if overlap score with any other exon is 1 + if ( $type eq 'source' ) { + foreach my $target ( @{ $matrix->get_targets_for_source($eid) } ) + { + if ( $matrix->get_score( $eid, $target ) > 0.9999 ) { + next EXON; + } + } + } else { + foreach my $source ( @{ $matrix->get_sources_for_target($eid) } ) + { + if ( $matrix->get_score( $source, $eid ) > 0.9999 ) { + next EXON; + } + } + } + + # write exon to fasta file + print $fh '>', $eid, "\n", $exon->seq, "\n"; + + $dumped_exons++; + + } + + close($fh); + + # log + my $fmt = "%-30s%10s\n"; + my $size = -s $file; + $self->logger->info(sprintf($fmt, 'Total exons:', $total_exons), 1); + $self->logger->info(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1); + $self->logger->info(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1); + $self->logger->info("Done.\n\n", 0, 'stamped'); + + return $dumped_exons; +} + +sub parse_exonerate_results { + my ( $self, $exonerate_matrix ) = @_; + + unless ( defined($exonerate_matrix) + && + $exonerate_matrix->isa( + 'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') + ) + { + throw('You must provide a ScoredMappingMatrix.'); + } + + $self->logger->info( "Parsing exonerate results...\n", 0, 'stamped' ); + + # loop over all result files + my $dump_path = $self->cache->dump_path; + my $num_files = 0; + my $num_lines = 0; + + opendir( DUMPDIR, $dump_path ) or + $self->logger->error("Can't open $dump_path for reading: $!"); + + my $penalised = 0; + my $killed = 0; + + while ( defined( my $file = readdir(DUMPDIR) ) ) { + unless ( $file =~ /exonerate_map\.\d+/ ) { next } + + $num_files++; + + open( F, '<', "$dump_path/$file" ); + + my $threshold = $self->conf->param('exonerate_threshold') || 0.5; + + while () { + $num_lines++; + chomp; + + # line format: + # myinfo: source_id target_id match_length source_length target_length + my ( undef, $source_id, $target_id, $match_length, $source_length, + $target_length ) + = split; + + my $score = 0; + + if ( $source_length == 0 or $target_length == 0 ) { + $self->logger->warning( + "Alignment length is 0 for $source_id or $target_id.\n"); + } + else { + $score = 2*$match_length/( $source_length + $target_length ); + + } + + if ( $score > $threshold ) { + my $source_sr = + $self->cache() + ->get_by_key( 'exons_by_id', 'source', $source_id ) + ->seq_region_name(); + my $target_sr = + $self->cache() + ->get_by_key( 'exons_by_id', 'target', $target_id ) + ->seq_region_name(); + + if ( $source_sr ne $target_sr ) { + # PENALTY: The target and source are not on the same + # seq_region. + $score *= 0.70; + + # With a penalty of 0.7, exonerate scores need to be above + # approximately 0.714 to make the 0.5 threshold. + + ++$penalised; + } + + if ( $score > $threshold ) { + $exonerate_matrix->add_score( $source_id, $target_id, + $score ); + } + else { + ++$killed; + } + } ## end if ( $score > $threshold) + + } ## end while () + + close(F); + } ## end while ( defined( my $file...)) + + closedir(DUMPDIR); + + $self->logger->info( + "Done parsing $num_lines lines from $num_files result files.\n", + 0, 'stamped' ); + $self->logger->info( "Penalised $penalised exon alignments " . + "for not being on the same seq_region " . + "($killed killed).\n", + 0, + 'stamped' ); + + return $exonerate_matrix; +} ## end sub parse_exonerate_results + + +sub non_mapped_transcript_rescore { + my $self = shift; + my $matrix = shift; + my $transcript_mappings = shift; + + # argument checks + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw( + 'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.'); + } + + unless ( $transcript_mappings + and + $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') ) + { + throw( + 'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.'); + } + + # create of lookup hash of mapped source transcripts to target + # transcripts + my %transcript_lookup = + map { $_->source => $_->target } + @{ $transcript_mappings->get_all_Entries }; + + my $i = 0; + + foreach my $entry ( @{ $matrix->get_all_Entries } ) { + + my @source_transcripts = @{ + $self->cache->get_by_key( 'transcripts_by_exon_id', 'source', + $entry->source ) }; + my @target_transcripts = @{ + $self->cache->get_by_key( 'transcripts_by_exon_id', 'target', + $entry->target ) }; + + my $found_mapped = 0; + + TR: + foreach my $source_tr (@source_transcripts) { + foreach my $target_tr (@target_transcripts) { + my $mapped_target = $transcript_lookup{ $source_tr->id }; + + if ( $mapped_target and ( $mapped_target == $target_tr->id ) ) { + $found_mapped = 1; + last TR; + } + } + } + + unless ($found_mapped) { + # PENALTY: The exon appears to belong to a transcript that has not + # been mapped. + $matrix->set_score( $entry->source(), $entry->target(), + 0.9*$entry->score() ); + $i++; + } + } ## end foreach my $entry ( @{ $matrix...}) + + $self->logger->debug( "Scored exons in non-mapped transcripts: $i\n", + 1 ); +} ## end sub non_mapped_transcript_rescore + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/GeneScoreBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from +Java application. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::GeneScoreBuilder; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::ScoreBuilder; +our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; + + +sub score_genes { + my $self = shift; + my $transcript_matrix = shift; + + unless ($transcript_matrix and + $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + $self->logger->info("-- Scoring genes...\n\n", 0, 'stamped'); + + # build scores based on transcript scores + my $matrix = $self->scores_from_transcript_scores($transcript_matrix); + + # debug logging + if ($self->logger->loglevel eq 'debug') { + $matrix->log('gene', $self->conf->param('basedir')); + } + + # log stats of combined matrix + my $fmt = "%-40s%10.0f\n"; + + $self->logger->info("Scoring matrix:\n"); + + $self->logger->info(sprintf($fmt, "Total source genes:", + $self->cache->get_count_by_name('genes_by_id', 'source')), 1); + + $self->logger->info(sprintf($fmt, "Total target genes:", + $self->cache->get_count_by_name('genes_by_id', 'target')), 1); + + $self->log_matrix_stats($matrix); + + $self->logger->info("\nDone with gene scoring.\n\n"); + + return $matrix; +} + + +sub scores_from_transcript_scores { + my $self = shift; + my $transcript_matrix = shift; + + unless ($transcript_matrix and + $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $dump_path = path_append($self->conf->param('basedir'), 'matrix'); + + my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_matrix.ser', + ); + + my $gene_cache = $matrix->cache_file; + + if (-s $gene_cache) { + + # read from file + $self->logger->info("Reading gene scoring matrix from file...\n", 0, 'stamped'); + $self->logger->debug("Cache file $gene_cache.\n", 1); + $matrix->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # build scoring matrix + $self->logger->info("No gene scoring matrix found. Will build new one.\n"); + + $self->logger->info("Gene scoring...\n", 0, 'stamped'); + $matrix = $self->build_scores($matrix, $transcript_matrix); + $self->logger->info("Done.\n\n", 0, 'stamped'); + + # write scoring matrix to file + $matrix->write_to_file; + + } + + return $matrix; +} + + +sub build_scores { + my $self = shift; + my $matrix = shift; + my $transcript_matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($transcript_matrix and + $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # first find out which source and target genes have scoring transcripts and + # build a "flag" matrix for these genes (all scores are 1) + $self->flag_matrix_from_transcript_scores($matrix, $transcript_matrix); + + # now calculate the actual scores for the genes in the flag matrix + my $final_matrix = $self->score_matrix_from_flag_matrix($matrix, + $transcript_matrix); + + return $final_matrix; +} + + +sub flag_matrix_from_transcript_scores { + my $self = shift; + my $matrix = shift; + my $transcript_matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($transcript_matrix and + $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # initialise progress logger + my $i; + my $num_entries = $transcript_matrix->get_entry_count; + my $progress_id = $self->logger->init_progress($num_entries, 100); + + $self->logger->info("Creating flag matrix...\n", 1); + + # for every transcript scoring matrix entry, make an entry in the gene flag + # matrix. + foreach my $entry (@{ $transcript_matrix->get_all_Entries }) { + + $self->logger->log_progress($progress_id, ++$i, 1); + + my $source_gene = $self->cache->get_by_key('genes_by_transcript_id', + 'source', $entry->source); + + my $target_gene = $self->cache->get_by_key('genes_by_transcript_id', + 'target', $entry->target); + + $matrix->add_score($source_gene->id, $target_gene->id, 1); + } + + $self->logger->info("\n"); + + return $matrix; +} + + +sub score_matrix_from_flag_matrix { + my $self = shift; + my $flag_matrix = shift; + my $transcript_matrix = shift; + my $simple_scoring = shift; + + unless ( $flag_matrix and + $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ( $transcript_matrix + and + $transcript_matrix->isa( + 'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') + ) + { + throw( + 'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.' + ); + } + + # create a new scoring matrix which will replace the flag matrix + my $matrix = + Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $flag_matrix->dump_path, + -CACHE_FILE => $flag_matrix->cache_file_name, + ); + + # initialise progress logger + my $i; + my $num_entries = $flag_matrix->get_entry_count; + my $progress_id = $self->logger->init_progress( $num_entries, 100 ); + + $self->logger->info( "Creating score matrix from flag matrix...\n", + 1 ); + + my $gene_score_threshold = + $self->conf()->param('gene_score_threshold') || 0; + + # loop over flag matrix and do proper scoring for each entry + foreach my $entry ( @{ $flag_matrix->get_all_Entries } ) { + + $self->logger->log_progress( $progress_id, ++$i, 1 ); + + my $score = 0; + my $source_gene = + $self->cache->get_by_key( 'genes_by_id', 'source', + $entry->source ); + my $target_gene = + $self->cache->get_by_key( 'genes_by_id', 'target', + $entry->target ); + + if ($simple_scoring) { + + # simple scoring (used for rescoring purposes) + $score = + $self->simple_gene_gene_score( $source_gene, $target_gene, + $transcript_matrix ); + + } + else { + + # full scoring + $score = + $self->complex_gene_gene_score( $source_gene, $target_gene, + $transcript_matrix ); + } + + if ( $score > $gene_score_threshold ) { + $matrix->add_score( $entry->source, $entry->target, $score ); + } + } ## end foreach my $entry ( @{ $flag_matrix...}) + + $self->logger->info("\n"); + + return $matrix; +} ## end sub score_matrix_from_flag_matrix + + +sub complex_gene_gene_score { + my $self = shift; + my $source_gene = shift; + my $target_gene = shift; + my $transcript_matrix = shift; + + my $source_gene_score = 0; + my $target_gene_score = 0; + my $source_gene_accum_length = 0; # sum of all transcript lengths + my $target_gene_accum_length = 0; # sum of all transcript lengths + + # We are only interested in scoring with transcripts that are in the target + # gene. The scored mapping matrix may contain scores for transcripts that + # aren't in this gene so create a hash of the target genes's transcripts + my %target_transcripts = map { $_->id => 1 } + @{ $target_gene->get_all_Transcripts }; + + # loop over source transcripts + foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) { + + # now loop over target transcripts and find the highest scoring target + # transcript belonging to the target gene + my $max_source_score = -1; + + foreach my $target_transcript_id (@{ $transcript_matrix->get_targets_for_source($source_transcript->id) }) { + + next unless ($target_transcripts{$target_transcript_id}); + + my $score = $transcript_matrix->get_score( + $source_transcript->id, $target_transcript_id); + $max_source_score = $score if ($score > $max_source_score); + } + + if ($max_source_score > 0) { + $source_gene_score += ($max_source_score * $source_transcript->length); + } + + $source_gene_accum_length += $source_transcript->length; + } + + # now do the same for target genes + my %source_transcripts = map { $_->id => 1 } + @{ $source_gene->get_all_Transcripts }; + + # loop over target transcripts + foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) { + + # now loop over source transcripts and find the highest scoring source + # transcript belonging to the source gene + my $max_target_score = -1; + + foreach my $source_transcript_id (@{ $transcript_matrix->get_sources_for_target($target_transcript->id) }) { + + next unless ($source_transcripts{$source_transcript_id}); + + my $score = $transcript_matrix->get_score( + $source_transcript_id, $target_transcript->id); + $max_target_score = $score if ($score > $max_target_score); + } + + if ($max_target_score > 0) { + $target_gene_score += ($max_target_score * $target_transcript->length); + } + + $target_gene_accum_length += $target_transcript->length; + } + + # calculate overall score for this gene + my $gene_score = 0; + + if (($source_gene_accum_length + $target_gene_accum_length) > 0) { + + $gene_score = ($source_gene_score + $target_gene_score) / + ($source_gene_accum_length + $target_gene_accum_length); + + } else { + + $self->logger->warning("Combined transcript length of source (".$source_gene->id.") and target (".$target_gene->id.") gene is zero!\n", 1); + + } + + if ($gene_score > 1) { + $self->logger->warning("Illegal gene score: $gene_score (". + join("|", $source_gene_score, $target_gene_score, + $source_gene_accum_length, $target_gene_accum_length).")\n", 1); + } + + return $gene_score; +} + + +# +# Simplified scoring for genes. Score is best scoring transcript pair. +# This is used when the more elaborate gene representing score does not +# distinguish very well. +# +sub simple_gene_gene_score { + my $self = shift; + my $source_gene = shift; + my $target_gene = shift; + my $transcript_matrix = shift; + + my $gene_score = 0; + + foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) { + foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) { + + my $score = $transcript_matrix->get_score($source_transcript->id, + $target_transcript->id); + + $gene_score = $score if ($score > $gene_score); + } + } + + return $gene_score; +} + + +sub simple_gene_rescore { + my $self = shift; + my $gene_matrix = shift; + my $transcript_matrix = shift; + + $gene_matrix = $self->score_matrix_from_flag_matrix($gene_matrix, + $transcript_matrix, 1); +} + +# +# penalise scores between genes with different biotypes. +# entries are modified in place +# +sub biotype_gene_rescore { + my $self = shift; + my $matrix = shift; + + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $i = 0; + + foreach my $entry ( @{ $matrix->get_all_Entries } ) { + + my $source_gene = + $self->cache->get_by_key( 'genes_by_id', 'source', + $entry->source ); + + my $target_gene = + $self->cache->get_by_key( 'genes_by_id', 'target', + $entry->target ); + + if ( $source_gene->biotype() ne $target_gene->biotype() ) { + # PENALTY: Lower the score for mappings that differ in biotype. + $matrix->set_score( $entry->source(), $entry->target(), + 0.9*$entry->score() ); + $i++; + } + } + + $self->logger->debug( "Scored genes with biotype mismatch: $i\n", 1 ); +} ## end sub biotype_gene_rescore + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,652 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::IdMapping::InternalIdMapper; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append); +use Bio::EnsEMBL::IdMapping::Entry; +use Bio::EnsEMBL::IdMapping::MappingList; +use Bio::EnsEMBL::IdMapping::SyntenyFramework; + + +# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this +use constant SIMILAR_SCORE_RATIO => 0.01; + + +sub map_genes { + my $self = shift; + my $gene_scores = shift; + my $transcript_scores = shift; + my $gsb = shift; + + # argument checks + unless ($gene_scores and + $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($transcript_scores and + $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($gsb and + $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.'); + } + + $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create gene mappings + $self->logger->info("No gene mappings found. Will calculate them now.\n"); + + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_gene'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings0.ser', + ); + my @mappings = (); + my $i = 0; + + # + # run the scoring chain + # + foreach my $plugin (@plugins) { + ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++, + $gsb, $new_mappings, $gene_scores, $transcript_scores); + + push(@mappings, $new_mappings); + } + + # report remaining ambiguities + $self->logger->info($gene_scores->get_source_count. + " source genes are ambiguous with ". + $gene_scores->get_target_count." target genes.\n\n"); + + $self->log_ambiguous($gene_scores, 'gene'); + + # merge mappings and write to file + $mappings->add_all(@mappings); + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('gene', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; +} + + +sub map_transcripts { + my $self = shift; + my $transcript_scores = shift; + my $gene_mappings = shift; + my $tsb = shift; + + # argument checks + unless ($transcript_scores and + $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + unless ($tsb and + $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.'); + } + + $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'transcript_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading transcript mappings from file...\n", 0, + 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create transcript mappings + $self->logger->info("No transcript mappings found. Will calculate them now.\n"); + + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'transcript_mappings0.ser', + ); + my @mappings = (); + my $i = 0; + + # + # run the scoring chain + # + foreach my $plugin (@plugins) { + ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin, + $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings); + + push(@mappings, $new_mappings); + } + + # report remaining ambiguities + $self->logger->info($transcript_scores->get_source_count. + " source transcripts are ambiguous with ". + $transcript_scores->get_target_count." target transcripts.\n\n"); + + $self->log_ambiguous($transcript_scores, 'transcript'); + + # merge mappings and write to file + $mappings->add_all(@mappings); + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('transcript', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; + +} + + +sub map_exons { + my $self = shift; + my $exon_scores = shift; + my $transcript_mappings = shift; + my $esb = shift; + + # argument checks + unless ($exon_scores and + $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.'); + } + + unless ($transcript_mappings and + $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.'); + } + + unless ($esb and + $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.'); + } + + $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading exon mappings from file...\n", 0, + 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create exon mappings + $self->logger->info("No exon mappings found. Will calculate them now.\n"); + + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_exon'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_mappings0.ser', + ); + my @mappings = (); + my $i = 0; + + # + # run the scoring chain + # + foreach my $plugin (@plugins) { + ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++, + $esb, $new_mappings, $exon_scores); + + push(@mappings, $new_mappings); + } + + # report remaining ambiguities + $self->logger->info($exon_scores->get_source_count. + " source exons are ambiguous with ". + $exon_scores->get_target_count." target exons.\n\n"); + + $self->log_ambiguous($exon_scores, 'exon'); + + # merge mappings and write to file + $mappings->add_all(@mappings); + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('exon', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; + +} + + +# +# this is not implemented as a plugin, since a) it's too simple and b) it's +# tied to transcripts so there are no translation scores or score builder. +# +sub map_translations { + my $self = shift; + my $transcript_mappings = shift; + + # argument checks + unless ($transcript_mappings and + $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.'); + } + + $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'translation_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading translation mappings from file...\n", 0, + 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create translation mappings + $self->logger->info("No translation mappings found. Will calculate them now.\n"); + + $self->logger->info("Translation mapping...\n", 0, 'stamped'); + + # + # map translations for mapped transcripts + # + my $i = 0; + + foreach my $entry (@{ $transcript_mappings->get_all_Entries }) { + + my $source_tl = $self->cache->get_by_key('transcripts_by_id', + 'source', $entry->source)->translation; + my $target_tl = $self->cache->get_by_key('transcripts_by_id', + 'target', $entry->target)->translation; + + if ($source_tl and $target_tl) { + + # add mapping for the translations; note that the score is taken from + # the transcript mapping + my $tl_entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([ + $source_tl->id, $target_tl->id, $entry->score + ]); + $mappings->add_Entry($tl_entry); + + } else { + $i++; + } + + } + + $self->logger->debug("Skipped transcripts without translation: $i\n", 1); + $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n"); + + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('translation', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; + +} + + +sub delegate_to_plugin { + my $self = shift; + my $plugin = shift; + my $num = shift; + my $score_builder = shift; + my $mappings = shift; + my $scores = shift; + + # argument checks + unless ($score_builder and + $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.'); + } + + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList.'); + } + + unless ($scores and + $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # split plugin name into module and method + $plugin =~ /(.*)::(\w+)$/; + my $module = $1; + my $method = $2; + + unless ($module and $method) { + throw("Unable to determine module and method name from $plugin.\n"); + } + + # instantiate the plugin unless we already have an instance + my $plugin_instance; + if ($self->has_plugin($module)) { + + # re-use an existing plugin instance + $plugin_instance = $self->get_plugin($module); + + } else { + + # inject and instantiate the plugin module + inject($module); + $plugin_instance = $module->new( + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache + ); + $self->add_plugin($plugin_instance); + + } + + # run the method on the plugin + # + # pass in a sequence number (number of method run, used for generating + # checkpoint files), the scores used for determining the mapping, and all + # other arguments passed to this method (these will vary for different object + # types) + # + # return the scores and mappings to feed into the next plugin in the chain + return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_); +} + + +sub has_plugin { + my $self = shift; + my $module = shift; + + defined($self->{'_plugins'}->{$module}) ? (return 1) : (return 0); +} + + +sub get_plugin { + my $self = shift; + my $module = shift; + + return $self->{'_plugins'}->{$module}; +} + + +sub add_plugin { + my $self = shift; + my $plugin_instance = shift; + + $self->{'_plugins'}->{ref($plugin_instance)} = $plugin_instance; +} + + +sub log_ambiguous { + my $self = shift; + my $matrix = shift; + my $type = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # create dump directory if it doesn't exist + my $debug_path = $self->conf->param('basedir').'/debug'; + unless (-d $debug_path) { + system("mkdir -p $debug_path") == 0 or + throw("Unable to create directory $debug_path.\n"); + } + + my $logfile = "$debug_path/ambiguous_${type}.txt"; + + open(my $fh, '>', $logfile) or + throw("Unable to open $logfile for writing: $!"); + + my @low_scoring = (); + my @high_scoring = (); + my $last_id; + + # log by source + foreach my $entry (sort { $a->source <=> $b->source } + @{ $matrix->get_all_Entries }) { + + $last_id ||= $entry->target; + + if ($last_id != $entry->source) { + $self->write_ambiguous($type, 'source', $fh, \@low_scoring, + \@high_scoring); + $last_id = $entry->source; + } + + if ($entry->score < 0.5) { + push @low_scoring, $entry; + } else { + push @high_scoring, $entry; + } + } + + # write last source + $self->write_ambiguous($type, 'source', $fh, \@low_scoring, \@high_scoring); + + # now do the same by target + $last_id = undef; + foreach my $entry (sort { $a->target <=> $b->target } + @{ $matrix->get_all_Entries }) { + + $last_id ||= $entry->target; + + if ($last_id != $entry->target) { + $self->write_ambiguous($type, 'target', $fh, \@low_scoring, + \@high_scoring); + $last_id = $entry->target; + } + + if ($entry->score < 0.5) { + push @low_scoring, $entry; + } else { + push @high_scoring, $entry; + } + } + + # write last target + $self->write_ambiguous($type, 'target', $fh, \@low_scoring, \@high_scoring); + + close($fh); +} + + +sub write_ambiguous { + my ($self, $type, $db_type, $fh, $low, $high) = @_; + + # if only source or target are ambiguous (i.e. you have only one mapping from + # this perspective) then log from the other perspective + if (scalar(@$low) + scalar(@$high) <= 1) { + @$low = (); + @$high = (); + return; + } + + my $first_id; + if (@$low) { + $first_id = $low->[0]->$db_type; + } else { + $first_id = $high->[0]->$db_type; + } + + my $other_db_type; + if ($db_type eq 'source') { + $other_db_type = 'target'; + } else { + $other_db_type = 'source'; + } + + print $fh "$db_type $type $first_id scores ambiguously:\n"; + + # high scorers + if (@$high) { + print $fh " high scoring ${other_db_type}s\n"; + + while (my $e = shift(@$high)) { + print $fh " ", $e->$other_db_type, " ", $e->score, "\n"; + } + } + + # low scorers + if (@$low) { + print $fh " low scoring ${other_db_type}s\n "; + + my $i = 1; + + while (my $e = shift(@$low)) { + print $fh "\n " unless (($i++)%10); + print $fh $e->$other_db_type, ", "; + } + } + + print $fh "\n"; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,274 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::MappingList; + +# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this +use constant SIMILAR_SCORE_RATIO => 0.01; + +# +# find the highest unambiguous score for all sources and targets in a scoring +# matrix +# +sub basic_mapping { + my $self = shift; + my $matrix = shift; + my $mapping_name = shift; + + # argument checks + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + throw('Need a name for serialising the mapping.') + unless ($mapping_name); + + # Create a new MappingList object. Specify AUTO_LOAD to load + # serialised existing mappings if found + my $dump_path = + path_append( $self->conf->param('basedir'), 'mapping' ); + + my $mappings = + Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => "${mapping_name}.ser", + -AUTO_LOAD => 1, ); + + # checkpoint test: return a previously stored MappingList + if ( $mappings->loaded ) { + $self->logger->info( + "Read existing mappings from ${mapping_name}.ser.\n"); + return $mappings; + } + + my $sources_done = {}; + my $targets_done = {}; + + # sort scoring matrix entries by descending score + my @sorted_entries = + sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries }; + + # debug + #my $idx = substr($mapping_name, -1); + + while ( my $entry = shift(@sorted_entries) ) { + + #$self->logger->debug("\nxxx$idx ".$entry->to_string." "); + + # we already found a mapping for either source or target + next + if ( $sources_done->{ $entry->source } + or $targets_done->{ $entry->target } ); + + #$self->logger->debug('d'); + + # there's a better mapping for either source or target + next + if ( $self->higher_score_exists( + $entry, $matrix, $sources_done, $targets_done + ) ); + + #$self->logger->debug('h'); + + # check for ambiguous mappings; they are dealt with later + my $other_sources = []; + my $other_targets = []; + + if ( $self->ambiguous_mapping( $entry, $matrix, + $other_sources, $other_targets ) ) + { + #$self->logger->debug('a'); + + $other_sources = + $self->filter_sources( $other_sources, $sources_done ); + $other_targets = + $self->filter_targets( $other_targets, $targets_done ); + + next if ( scalar(@$other_sources) or scalar(@$other_targets) ); + } + + #$self->logger->debug('A'); + + # this is the best mapping, add it + $mappings->add_Entry($entry); + + $sources_done->{ $entry->source } = 1; + $targets_done->{ $entry->target } = 1; + } ## end while ( my $entry = shift...) + + # create checkpoint + $mappings->write_to_file; + + return $mappings; +} ## end sub basic_mapping + +sub higher_score_exists { + my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_; + + my $source = $entry->source; + my $target = $entry->target; + my $score = $entry->score; + + foreach + my $other_source ( @{ $matrix->get_sources_for_target($target) } ) + { + if ( $other_source != $source + and !$sources_done->{$other_source} + and $score < $matrix->get_score( $other_source, $target ) ) + { + return 1; + } + } + + foreach + my $other_target ( @{ $matrix->get_targets_for_source($source) } ) + { + if ( $other_target != $target + and !$targets_done->{$other_target} + and $score < $matrix->get_score( $source, $other_target ) ) + { + return 1; + } + } + + return 0; +} ## end sub higher_score_exists + +# +# find ambiguous mappings (see scores_similar() for definition) +# +sub ambiguous_mapping { + my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_; + + my $source = $entry->source; + my $target = $entry->target; + my $score = $entry->score; + + my $retval = 0; + + foreach + my $other_source ( @{ $matrix->get_sources_for_target($target) } ) + { + my $other_score = $matrix->get_score( $other_source, $target ); + + if ( $other_source != $source + and ( $self->scores_similar( $score, $other_score ) + or $score < $other_score ) ) + { + $retval = 1; + push @{$other_sources}, $other_source; + } + } + + foreach + my $other_target ( @{ $matrix->get_targets_for_source($source) } ) + { + my $other_score = $matrix->get_score( $source, $other_target ); + + if ( $other_target != $target + and ( $self->scores_similar( $score, $other_score ) + or $score < $other_score ) ) + { + $retval = 1; + push @{$other_targets}, $other_target; + } + } + + return $retval; +} ## end sub ambiguous_mapping + +# +# rule for similarity taken from java code... +# +sub scores_similar { + my ( $self, $s1, $s2 ) = @_; + + # always give priority to exact matches over very similar ones + return 0 if ( $s1 == 1 and $s2 < 1 ); + + my $diff = $s1 - $s2; + $diff = -$diff if ( $diff < 0 ); + + my $pc = 2*$diff/( $s1 + $s2 ); + + return ( $pc < SIMILAR_SCORE_RATIO ); +} + +sub filter_sources { + my ( $self, $other_sources, $sources_done ) = @_; + + unless ( scalar( @{$other_sources} ) + and scalar( keys %{$sources_done} ) ) + { + return $other_sources; + } + + my @tmp = (); + + foreach my $e ( @{$other_sources} ) { + push @tmp, $e unless ( $sources_done->{$e} ); + } + + return \@tmp; +} + +sub filter_targets { + my ( $self, $other_targets, $targets_done ) = @_; + + unless ( scalar( @{$other_targets} ) + and scalar( keys %{$targets_done} ) ) + { + return $other_targets; + } + + my @tmp = (); + + foreach my $e ( @{$other_targets} ) { + push @tmp, $e unless ( $targets_done->{$e} ); + } + + return \@tmp; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,121 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; +our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +# +# basic mapping +# +sub init_basic { + my $self = shift; + my $num = shift; + my $esb = shift; + my $mappings = shift; + my $exon_scores = shift; + + $self->logger->info("Basic exon mapping...\n", 0, 'stamped'); + + $mappings = $self->basic_mapping($exon_scores, "exon_mappings$num"); + $num++; + my $new_scores = $esb->create_shrinked_matrix($exon_scores, $mappings, + "exon_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# reduce score for mappings of exons which do not belong to mapped +# transcripts +# +sub mapped_transcript { + my $self = shift; + my $num = shift; + my $esb = shift; + my $mappings = shift; + my $exon_scores = shift; + + $self->logger->info("Exons in mapped transcript...\n", 0, 'stamped'); + + unless ($exon_scores->loaded) { + $esb->non_mapped_transcript_rescore($exon_scores, $mappings); + $exon_scores->write_to_file; + } + + $mappings = $self->basic_mapping($exon_scores, "exon_mappings$num"); + $num++; + my $new_scores = $esb->create_shrinked_matrix($exon_scores, $mappings, + "exon_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# selectively rescore by penalising scores between exons with +# different internalIDs +# +sub internal_id { + my $self = shift; + my $num = shift; + my $esb = shift; + my $mappings = shift; + my $exon_scores = shift; + + $self->logger->info( "Retry with internalID disambiguation...\n", + 0, 'stamped' ); + + if ( !$exon_scores->loaded() ) { + $esb->internal_id_rescore($exon_scores); + $exon_scores->write_to_file(); + } + + $mappings = $self->basic_mapping( $exon_scores, "exon_mappings$num" ); + $num++; + my $new_scores = + $esb->create_shrinked_matrix( $exon_scores, $mappings, + "exon_matrix$num" ); + + return ( $new_scores, $mappings ); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,188 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric - default Ensembl +InternalIdMapper implementation for genes + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; +our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +# +# basic mapping +# +sub init_basic { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + $self->logger->info("Basic gene mapping...\n", 0, 'stamped'); + + $mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $mappings, + "gene_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# build the synteny from unambiguous mappings +# +sub synteny { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + unless ($gene_scores->loaded) { + $self->logger->info("Synteny Framework building...\n", 0, 'stamped'); + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'synteny_framework.ser', + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache, + ); + $sf->build_synteny($mappings); + + # use it to rescore the genes + $self->logger->info("\nSynteny assisted mapping...\n", 0, 'stamped'); + $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores); + + # checkpoint + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +# +# rescore with simple scoring function and try again +# +sub best_transcript { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + my $transcript_scores = shift; + + $self->logger->info("Retry with simple best transcript score...\n", 0, 'stamped'); + + unless ($gene_scores->loaded) { + $gsb->simple_gene_rescore($gene_scores, $transcript_scores); + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +# +# rescore by penalising scores between genes with different biotypes +# +sub biotype { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped'); + + unless ($gene_scores->loaded) { + $gsb->biotype_gene_rescore($gene_scores); + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +# +# selectively rescore by penalising scores between genes with different +# internalIDs +# +sub internal_id { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped'); + + unless ($gene_scores->loaded) { + $gsb->internal_id_rescore($gene_scores); + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,305 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; +our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +# +# basic mapping +# +sub init_basic { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Basic transcript mapping...\n", 0, 'stamped'); + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# handle cases with exact match but different translation +# +sub non_exact_translation { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $tsb->different_translation_rescore($transcript_scores); + $transcript_scores->write_to_file; + } + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# reduce score for mappings of transcripts which do not belong to mapped +# genes +# +sub mapped_gene { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + my $gene_mappings = shift; + + $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings); + $transcript_scores->write_to_file; + } + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + +# +# rescore by penalising scores between transcripts with different biotypes +# +sub biotype { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info( "Retry with biotype disambiguation...\n", + 0, 'stamped' ); + + unless ( $transcript_scores->loaded() ) { + $tsb->biotype_transcript_rescore($transcript_scores); + $transcript_scores->write_to_file(); + } + + my $new_mappings = $self->basic_mapping( $transcript_scores, + "transcript_mappings$num" ); + $num++; + my $new_scores = + $tsb->create_shrinked_matrix( $transcript_scores, $new_mappings, + "transcript_matrix$num" ); + + return ( $new_scores, $new_mappings ); +} + +# +# selectively rescore by penalising scores between transcripts with +# different internalIDs +# +sub internal_id { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $tsb->internal_id_rescore($transcript_scores); + $transcript_scores->write_to_file; + } + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# handle ambiguities between transcripts in single genes +# +sub single_gene { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Transcripts in single genes...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $transcript_scores->write_to_file; + } + + $mappings = $self->same_gene_transcript_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# modified basic mapper that maps transcripts that are ambiguous within one gene +# +sub same_gene_transcript_mapping { + my $self = shift; + my $matrix = shift; + my $mapping_name = shift; + + # argument checks + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + throw('Need a name for serialising the mapping.') unless ($mapping_name); + + # Create a new MappingList object. Specify AUTO_LOAD to load serialised + # existing mappings if found + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => "${mapping_name}.ser", + -AUTO_LOAD => 1, + ); + + # checkpoint test: return a previously stored MappingList + if ($mappings->loaded) { + $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n"); + return $mappings; + } + + my $sources_done = {}; + my $targets_done = {}; + + # sort scoring matrix entries by descending score + my @sorted_entries = sort { $b->score <=> $a->score || + $a->source <=> $b->source || $a->target <=> $b->target } + @{ $matrix->get_all_Entries }; + + while (my $entry = shift(@sorted_entries)) { + + # $self->logger->debug("\nxxx4 ".$entry->to_string." "); + + # we already found a mapping for either source or target yet + next if ($sources_done->{$entry->source} or + $targets_done->{$entry->target}); + + #$self->logger->debug('d'); + + my $other_sources = []; + my $other_targets = []; + my %source_genes = (); + my %target_genes = (); + + if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) { + #$self->logger->debug('a'); + + $other_sources = $self->filter_sources($other_sources, $sources_done); + $other_targets = $self->filter_targets($other_targets, $targets_done); + + $source_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'source', $entry->source)} = 1; + $target_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'target', $entry->target)} = 1; + + foreach my $other_source (@{ $other_sources }) { + $source_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'source', $other_source)} = 1; + } + + foreach my $other_target (@{ $other_targets }) { + $target_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'target', $other_target)} = 1; + } + + # only add mapping if only one source and target gene involved + if (scalar(keys %source_genes) == 1 and scalar(keys %target_genes) == 1) { + #$self->logger->debug('O'); + $mappings->add_Entry($entry); + } + + } else { + #$self->logger->debug('A'); + + # this is the best mapping, add it + $mappings->add_Entry($entry); + } + + $sources_done->{$entry->source} = 1; + $targets_done->{$entry->target} = 1; + } + + # create checkpoint + $mappings->write_to_file; + + return $mappings; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/MappingList.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/MappingList.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,263 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::MappingList - object holding a list of Entries + +=head1 SYNOPSIS + + # create a new MappingList + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', + ); + + # add entries + my $mappings->add_Entry($entry1); + my $mappings->add_all( $entry2, $entry3 ); + + # serialise to file + $mappings->write_to_file; + + # later, read these mappings from file + my $mappings1 = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', + ); + $mappings1->read_from_file; + +=head1 DESCRIPTION + +This object represents a list of Bio::EnsEMBL::IdMapping::Entry +objects. It's essentially an OO wrapper for an array with some type +checking and convenience methods. + +=head1 METHODS + + new + add_Entry + get_all_Entries + add_all + get_entry_count + log + to_string + +=cut + +package Bio::EnsEMBL::IdMapping::MappingList; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::Serialisable; +our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +=head2 new + + Arg[1-N] : see superclass + Example : my $gene_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', + ); + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::MappingList + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + # initialise internal datastructure unless we loaded a serialised object + unless ($self->loaded) { + $self->{'cache'}->{'entries'} = []; + } + + return $self; +} + + +=head2 add_Entry + + Arg[1] : Bio::EnsEMBL::IdMapping::Entry - Entry to add + Example : $mappings->add_Entry($entry); + Description : Adds an Entry to the MappingList. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub add_Entry { + my $self = shift; + my $entry = shift; + + unless ($entry and $entry->isa('Bio::EnsEMBL::IdMapping::Entry')) { + throw("Need a Bio::EnsEMBL::IdMapping::Entry"); + } + + push @{ $self->{'cache'}->{'entries'} }, $entry; +} + + +=head2 get_all_Entries + + Example : foreach my $entry (@{ $mappings->get_all_Entries }) { + # do something with the entry + } + Description : Gets all Entries in the MappingList. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_Entries { + my $self = shift; + return $self->{'cache'}->{'entries'}; +} + + +=head2 add_all + + Arg[1] : List of Bio::EnsEMBL::IdMapping::Entry objects + Example : my @entries = ($entry1, $entry2); + $mappings->add_all(@entries); + Description : Adds a list of Entries to the MappingList. + Return type : none + Exceptions : thrown on wrong argument + Caller : general + Status : At Risk + : under development + +=cut + +sub add_all { + my $self = shift; + my @mappings = @_; + + foreach my $mapping (@mappings) { + + unless ($mapping->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList"); + } + + push @{ $self->{'cache'}->{'entries'} }, @{ $mapping->get_all_Entries }; + } +} + + +=head2 get_entry_count + + Example : my $num_entries = $mappings->get_entry_count; + Description : Returns the number of Entries in the MappingList. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_entry_count { + my $self = shift; + return scalar(@{ $self->{'cache'}->{'entries'} }); +} + + +=head2 log + + Arg[1] : String $type - object type (e.g. 'gene') + Arg[2] : String $dump_path - path for writing output + Example : $mappings->log('gene', $conf->param('basedir')); + Description : Logs all Entries in the MappingList to a file. Used for + debugging. + Return type : none + Exceptions : thrown on I/0 error + Caller : general + Status : At Risk + : under development + +=cut + +sub log { + my $self = shift; + my $type = shift; + my $dump_path = shift; + + my $debug_path = path_append($dump_path, 'debug'); + my $logfile = "$debug_path/${type}_final_scores.txt"; + + open(my $fh, '>', $logfile) or + throw("Unable to open $logfile for writing: $!"); + + foreach my $entry (@{ $self->get_all_Entries }) { + print $fh ($entry->to_string."\n"); + } + + close($fh); +} + + +=head2 to_string + + Example : print LOG $mappings->to_string, "\n"; + Description : Returns a string representation of the MappingList. This is + simply a multi-line string, where each line is a stringified + Entry. + Useful for debugging and logging. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub to_string { + my $self = shift; + + my $string = ''; + + foreach my $entry (@{ $self->get_all_Entries }) { + $string .= $entry->to_string."\n"; + } + + return $string; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,794 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::ResultAnalyser - analyse stable Id mapping results + +=head1 SYNOPSIS + + # get a result analyser + my $analyser = Bio::EnsEMBL::IdMapping::ResultAnalyser->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache + ); + + # analyse results + $analyser->analyse( $gene_mappings, + $stable_id_mapper->get_all_stable_id_events('similarity') ); + + # write results to file + $analyser->write_results_to_file; + + # create click lists + $analyser->create_clicklist; + + # mapping_summary + $analyser->create_mapping_summary; + +=head1 DESCRIPTION + +This is a utility module which analyses the stable Id mapping results +by providing various sorts of mapping statistics. It also creates +clicklists and a mapping summary. + +=head1 METHODS + + analyse + analyse_db + classify_source_genes_by_type + classify_genes_by_mapping_simple + classify_genes_by_mapping + add + get + get_all_by_subclass + get_all_by_class + get_count_by_subclass + get_count_by_class + get_all_classes + class_key + write_results_to_file + create_clicklist + create_mapping_summary + read_from_file + +=cut + + +package Bio::EnsEMBL::IdMapping::ResultAnalyser; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +=head2 analyse + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $gene_mappings - the gene + mappings to analyse + Arg[2] : Arrayref of Strings - similarity events + Example : $analyser->analyse($gene_mappings, + $stable_id_mapper->get_all_stable_id_events('similarity')); + Description : Analyses the results of a stable Id mapping run. + Return type : none + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub analyse { + my $self = shift; + my $gene_mappings = shift; + my $similarity_events = shift; + + # argument check + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes."); + } + + unless ($similarity_events and ref($similarity_events) eq 'ARRAY') { + throw("Need a list of similarity events."); + } + + # classify source genes by type (status-logic_name-biotype) + $self->classify_source_genes_by_type; + + # classify source genes by mapping status + $self->classify_genes_by_mapping($gene_mappings, $similarity_events); +} + + +=head2 classify_source_genes_by_type + + Example : $analyser->classify_source_genes_by_type; + Description : Classifies source genes by type and adds them to the internal + datastructure. For the format of the classification string see + class_key(). + Return type : none + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub classify_source_genes_by_type { + my $self = shift; + + foreach my $s_gene (values %{ $self->cache->get_by_name('genes_by_id', 'source') }) { + $self->add('source', $self->class_key($s_gene), 'all', $s_gene->stable_id); + } +} + + +=head2 classify_genes_by_mapping_simple + + Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene + mappings to classify + Example : $analyser->classify_genes_by_mapping_simple; + Description : Classifies target genes by mapping ('mapped' or 'unmapped'). + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : This method is not in use at the momen. + Status : At Risk + : under development + +=cut + +sub classify_genes_by_mapping_simple { + my $self = shift; + my $gene_mappings = shift; + + # argument check + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes."); + } + + my %result = (); + + # firrst, create a lookup hash of source genes by target internal ID + my %source_genes_by_target = (); + foreach my $e (@{ $gene_mappings->get_all_Entries }) { + my $s_gene = $self->cache->get_by_key('genes_by_id', 'source', $e->source); + my $t_gene = $self->cache->get_by_key('genes_by_id', 'target', $e->target); + $source_genes_by_target{$t_gene->id} = $s_gene; + } + + # now loop over target genes + foreach my $t_gene (values %{ $self->cache->get_by_name('genes_by_id', 'target') }) { + + # check if target gene has all required properties set + unless ($t_gene->status and $t_gene->logic_name and $t_gene->biotype) { + $self->logger->warning("Missing data for target gene: ". + $t_gene->to_string."\n", 1); + } + + my $class = $self->class_key($t_gene); + + # classify as '1' if mapped (using source gene's stable ID), otherwise '0' + if (my $s_gene = $source_genes_by_target{$t_gene->id}) { + $self->add('target', $class, 'mapped', $s_gene->stable_id); + } else { + $self->add('target', $class, 'unmapped', $t_gene->stable_id); + } + + } +} + + +=head2 classify_genes_by_mapping + + Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene + mappings to classify + Arg[2] : Arrayref of Strings - similarity events + Example : $analyser->classify_genes_by_mapping; + Description : Classifies genes by mapping. Status is + 'mapped' => stable Id was mapped + 'lost_similar' => stable Id not mapped, but there is a + similarity entry for the source Id + 'lost_definite' => not mapped and no similarity + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : This method is not in use at the momen. + Status : At Risk + : under development + +=cut + +sub classify_genes_by_mapping { + my $self = shift; + my $gene_mappings = shift; + my $similarity_events = shift; + + # argument check + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes."); + } + + unless ($similarity_events and ref($similarity_events) eq 'ARRAY') { + throw("Need a list of similarity events."); + } + + # mapped genes + foreach my $e (@{ $gene_mappings->get_all_Entries }) { + my $s_gene = $self->cache->get_by_key('genes_by_id', 'source', $e->source); + $self->add('source', $self->class_key($s_gene), 'mapped', + $s_gene->stable_id); + } + + # lookup hash for similarities + my %similar = (); + foreach my $event (@{ $similarity_events }) { + my ($stable_id) = split("\t", $event); + $similar{$stable_id} = 1; + } + + # deleted genes + foreach my $s_gene (values %{ $self->cache->get_by_name('genes_by_id', 'source') }) { + + my $stable_id = $s_gene->stable_id; + my $class = $self->class_key($s_gene); + + unless ($self->get('source', $class, 'mapped', $stable_id)) { + + # sub-classify as 'lost_similar' or 'lost_definite' + if ($similar{$stable_id}) { + $self->add('source', $class, 'lost_similar', $stable_id); + } else { + $self->add('source', $class, 'lost_definite', $stable_id); + } + + } + } + +} + + +=head2 add + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Arg[4] : String $stable_id - gene stable Id + Arg[5] : String $val - value (usually 0 or 1) + Example : $analyser->add('source', 'KNOWN-ensembl-protein_coding', + 'mapped', 'ENSG00002342', 1); + Description : Add a stable Id / property pair to a name/dbtype lookup hash. + + The datastructure is a bit of a bloat, but is general enough to + be used as a lookup hash and to generate statistics (counts by + type) and debug lists (dump by type). + Return type : String - the added value + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub add { + my ($self, $dbtype, $class, $subclass, $stable_id, $val) = @_; + + # private method, so no argument check done for performance reasons + + # default to a value of '1' + $val = 1 unless (defined($val)); + + $self->{$dbtype}->{$class}->{$subclass}->{$stable_id} = $val; +} + + +=head2 get + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Arg[4] : String $stable_id - gene stable Id + Example : my $mapping_status = $analyser->get('source', + 'KNOWN-ensembl-protein_coding', 'mapped', 'ENSG00002342'); + Description : Gets a stable Id mapping status from the internal datastructure. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub get { + my ($self, $dbtype, $class, $subclass, $stable_id) = @_; + + # private method, so no argument check done for performance reasons + + return $self->{$dbtype}->{$class}->{$subclass}->{$stable_id}; +} + + +=head2 get_all_by_subclass + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Example : my @mapped_stable_ids = @{ + $analyser->get_all_by_subclass( + 'source', 'KNOWN-ensembl-protein_coding', + 'mapped' + ) }; + Description : Gets a list of stable Id for a given subclass. + Return type : Arrayref of String (stable Ids) + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_all_by_subclass { + my ($self, $dbtype, $class, $subclass) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + throw("Need a subclass.") unless ($subclass); + + return [ keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} } ]; +} + + +=head2 get_all_by_class + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Example : my @stable_ids = @{ + $analyser->get_all_by_class( 'source', + 'KNOWN-ensembl-protein_coding' ) }; + Description : Gets a list of stable Id for a given class. + Return type : Arrayref of String (stable Ids) + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_all_by_class { + my ($self, $dbtype, $class) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + + my %merged = (); + + foreach my $subclass (keys %{ $self->{$dbtype}->{$class} || {} }) { + while (my ($key, $val) = each(%{ $self->{$dbtype}->{$class}->{$subclass} })) { + $merged{$key} = $val; + } + } + + return [ keys %merged ]; +} + + +=head2 get_count_by_subclass + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost') + Example : my $num_mapped = $analyser->get_count_by_subclass('source', + 'KNOWN-ensembl-protein_coding', 'mapped'); + Description : Gets the number of stable Ids for a given subclass. + Return type : Int + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_count_by_subclass { + my ($self, $dbtype, $class, $subclass) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + throw("Need a subclass.") unless ($subclass); + + return scalar(keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} }); +} + + +=head2 get_count_by_class + + Arg[1] : String $dbtype - db type ('source' or 'target') + Arg[2] : String $class - key identifying a gene type (see class_key()) + Example : my $num_mapped = $analyser->get_count_by_class('source', + 'KNOWN-ensembl-protein_coding'); + Description : Gets the number of stable Ids for a given class. + Return type : Int + Exceptions : thrown on missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_count_by_class { + my ($self, $dbtype, $class) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + throw("Need a class.") unless ($class); + + return scalar(@{ $self->get_all_by_class($dbtype, $class) }); +} + + +=head2 get_all_classes + + Arg[1] : String $dbtype - db type ('source' or 'target') + Example : foreach my $class (@{ $analyser->get_all_classes('source') }) { + print "$class\n"; + } + Description : Gets a list of classes in the ResultAnalyser. + Return type : Arrayref of String + Exceptions : thrown on missing argument + Caller : internal + Status : At Risk + : under development + +=cut + +sub get_all_classes { + my ($self, $dbtype) = @_; + + # argument check + throw("Need a dbtype (source|target).") unless ($dbtype); + + return [ sort keys %{ $self->{$dbtype} || {} } ]; +} + + +=head2 class_key + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - a gene object + Example : my $class = $analyser->class_key($gene); + Description : Generates a key identifying a gene class. This identifier is + composed from the gene's status, logic naame, and biotye. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub class_key { + my ($self, $gene) = @_; + return join('-', map { $gene->$_ } qw(status logic_name biotype)); +} + + +=head2 write_results_to_file + + Example : $analyser->write_results_to_file; + Description : Writes the results of the result analysis to a file. This is a + human-readable text detailing the mapping statistics. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub write_results_to_file { + my $self = shift; + + my $fh = $self->get_filehandle('gene_detailed_mapping_stats.txt', 'stats'); + + my $fmt1 = "%-60s%-16s%-16s%-16s\n"; + my $fmt2 = "%-60s%5.0f (%7s) %5.0f (%7s) %5.0f (%7s)\n"; + my $fmt3 = "%3.2f%%"; + + print $fh "Gene detailed mapping results:\n\n"; + + print $fh sprintf($fmt1, "Gene type", "mapped", "lost (similar)", + "lost (definite)"); + + print $fh ('-'x108), "\n"; + + foreach my $class (@{ $self->get_all_classes('source') }) { + next if ($class eq 'all'); + + my $total = $self->get_count_by_class('source', $class); + + # avoid division by zero error + unless ($total) { + $self->logger->warning("No count found for $class.\n", 1); + next; + } + + my $mapped = $self->get_count_by_subclass('source', $class, 'mapped'); + my $similar = $self->get_count_by_subclass('source', $class, + 'lost_similar'); + my $lost = $self->get_count_by_subclass('source', $class, 'lost_definite'); + + print $fh sprintf($fmt2, + $class, + $mapped, sprintf($fmt3, $mapped/$total*100), + $similar, sprintf($fmt3, $similar/$total*100), + $lost, sprintf($fmt3, $lost/$total*100)); + } + + close($fh); +} + + +=head2 create_clicklist + + Example : $analyser->create_clicklist; + Description : Writes an html file which contains a list of all lost genes, + with hyperlinks to the appropriate archive website. This is to + manually check lost genes. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub create_clicklist { + my $self = shift; + + my $fh = $self->get_filehandle('genes_lost.html', 'stats'); + + # start html output + print $fh qq(\n); + print $fh qq(); + print $fh "\n"; + print $fh "Lost genes "; + print $fh $self->conf->param('sourcedbname'), ' -> ', + $self->conf->param('targetdbname'); + print $fh "\n"; + print $fh "\n\n"; + + my $prefix = $self->conf->param('urlprefix'); + unless ($prefix) { + $self->logger->warning("No urlprefix set, clicklists might not be useable.\n", 1); + } + + my $navigation; + my $clicklist; + + foreach my $class (@{ $self->get_all_classes('source') }) { + next if ($class eq 'all'); + + $navigation .= "$class "; + $clicklist .= "

$class

\n"; + + foreach my $subclass (qw(lost_similar lost_definite)) { + + # navigation + $navigation .= qq($subclass ); + + # clicklist + $clicklist .= "

$subclass

\n"; + + foreach my $stable_id (@{ $self->get_all_by_subclass('source', $class, $subclass) }) { + $clicklist .= qq($stable_id
\n); + } + + } + + $navigation .= "
\n"; + } + + # print navigation and clicklist + print $fh "$navigation\n\n"; + print $fh "$clicklist\n\n"; + + # html footer + print $fh "\n"; + + close($fh); +} + + +=head2 create_mapping_summary + + Example : $analyser->create_mapping_summary(); + Description : Writes a text file containing a summary of the mapping stats. + This will be emailed to the genebuilder for evaluation (you will + have to manually send the email, using the text in + "mapping_summary.txt" as the template). + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub create_mapping_summary { + my $self = shift; + + my $fh = $self->get_filehandle('mapping_summary.txt'); + + # + # title + # + print $fh qq(Stable ID mapping results\n); + print $fh qq(=========================\n\n); + + # + # timing + # + print $fh "Run at: ".localtime()."\n"; + print $fh "Runtime: "; + print $fh $self->logger->runtime, "\n\n"; + + # + # parameters used for this run + # + print $fh $self->conf->list_param_values; + print $fh "\n"; + + # + # mapping stats + # + foreach my $type (qw(exon transcript translation gene gene_detailed)) { + my $filename = "${type}_mapping_stats.txt"; + + if ($self->file_exists($filename, 'stats')) { + print $fh $self->read_from_file($filename, 'stats'); + print $fh "\n\n"; + } else { + print $fh "No mapping stats found for $type.\n\n"; + } + } + + # + # db uploads + # + my @uploads = ( + ['stable_ids' => 'Stable IDs'], + ['events' => 'Stable ID events and mapping session'], + ['archive' => 'Gene and peptide archive'], + ); + + my $fmt1 = "%-40s%-20s\n"; + + print $fh qq(Data uploaded to db:\n); + print $fh qq(====================\n\n); + + if ($self->conf->param('dry_run')) { + + print $fh "None (dry run).\n"; + + } else { + + foreach my $u (@uploads) { + my $uploaded = 'no'; + $uploaded = 'yes' if ($self->conf->is_true("upload_".$u->[0])); + print $fh sprintf($fmt1, $u->[1], $uploaded); + } + + } + + print $fh "\n"; + + # + # stats and clicklist + # + my @output = ( + ['stats' => 'statistics (including clicklists of deleted IDs)'], + ['debug' => 'detailed mapping output for debugging'], + ['tables' => 'data files for db upload'], + ); + + my $fmt2 = "%-20s%-50s\n"; + + print $fh qq(\nOutput directories:\n); + print $fh qq(===================\n\n); + + print $fh sprintf($fmt2, qw(DIRECTORY DESCRIPTION)); + print $fh ('-'x72), "\n"; + + print $fh sprintf($fmt2, 'basedir', $self->conf->param('basedir')); + + foreach my $o (@output) { + print $fh sprintf($fmt2, '$basedir/'.$o->[0], $o->[1]); + } + + print $fh "\n"; + + # + # clicklist of first 10 deleted genes + # + print $fh qq(\nFirst 10 deleted known genes:\n); + print $fh qq(=============================\n\n); + + my $in_fh = $self->get_filehandle('genes_lost.txt', 'debug', '<'); + my $prefix = $self->conf->param('urlprefix'); + my $i; + + while (<$in_fh>) { + last if (++$i > 10); + + chomp; + my ($stable_id, $type) = split(/\s+/); + + next unless ($type eq 'known'); + + print $fh sprintf($fmt2, $stable_id, "${prefix}$stable_id"); + } + + close($in_fh); + close($fh); +} + + +=head2 read_from_file + + Arg[1] : String $filename - name of file to read + Arg[2] : (optional) String $append - directory name to append to basedir + Example : my $stats_text = $analyser->read_from_file('gene_mapping_stats', + 'stats'); + Description : Reads mapping stats from a file. + Return type : String + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub read_from_file { + my $self = shift; + my $filename = shift; + my $append = shift; + + my $in_fh = $self->get_filehandle($filename, $append, '<'); + + my $txt; + + while (<$in_fh>) { + $txt .= $_; + } + + return $txt; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/ScoreBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,266 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::ScoreBuilder - score builder base class + +=head1 SYNOPSIS + +This class is not instantiated. Please see subclasses for usage examples +(e.g. GeneScoreBuilder). + +=head1 DESCRIPTION + +This is the base class for the score builders used in the stable Id +mapping application. It contains methods which are used by more than one +ScoreBuilder. + +=head1 METHODS + + create_shrinked_matrix + internal_id_rescore + log_matrix_stats + +=cut + +package Bio::EnsEMBL::IdMapping::ScoreBuilder; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; + + +=head2 create_shrinked_matrix + + Arg[1] : Bio::EnsEMBL::Idmapping::ScoredMappingMatrix $matrix - a scoring + matrix + Arg[2] : Bio::EnsEMBL::Idmapping::MappingList $mappings - mappings + Arg[3] : String $cache_file - base name of a cache file (extension '.ser' + will be added automatically) for the returned matrix + Example : my $new_scores = $score_builder->create_shrinked_matrix( + $gene_scores, $mappings, "gene_matrix1"); + Description : Create a shrinked scoring matrix which doesn't contain entries + which were already mapped. It also logs how many new mappings + were added in this process. + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : thrown on wrong or missing arguments + Caller : InternalIdMapper plugin + Status : At Risk + : under development + +=cut + +# TODO: shrinked = shrunken? +sub create_shrinked_matrix { + my $self = shift; + my $matrix = shift; + my $mappings = shift; + my $cache_file = shift; # base name, extension '.ser' will be added + + # argument checks + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + throw('Need a cache file name.') unless ($cache_file); + + my $dump_path = path_append($self->conf->param('basedir'), 'matrix'); + $cache_file .= '.ser'; + + my $shrinked_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => $cache_file, + -AUTO_LOAD => 1, + ); + + # if we already found a saved matrix, just return it + if ($shrinked_matrix->loaded) { + + $self->logger->info("Read existing scoring matrix from $cache_file.\n"); + + } else { + + # create lookup hashes for sources and targets in the MappingList + my %sources = (); + my %targets = (); + + foreach my $entry (@{ $mappings->get_all_Entries }) { + $sources{$entry->source} = 1; + $targets{$entry->target} = 1; + } + + # add all entries to shrinked matrix which are not in the MappingList + foreach my $entry (@{ $matrix->get_all_Entries }) { + unless ($sources{$entry->source} or $targets{$entry->target}) { + $shrinked_matrix->add_Entry($entry); + } + } + + } + + # log shrinking stats + $self->logger->info('Sources '.$matrix->get_source_count.' --> '. + $shrinked_matrix->get_source_count."\n"); + $self->logger->info('Targets '.$matrix->get_target_count.' --> '. + $shrinked_matrix->get_target_count."\n"); + $self->logger->info('Entries '.$matrix->get_entry_count.' --> '. + $shrinked_matrix->get_entry_count."\n"); + $self->logger->info('New mappings: '.$mappings->get_entry_count."\n\n"); + + return $shrinked_matrix; +} + + +=head2 internal_id_rescore + + Arg[1] : Bio::EnsEMBL::Idmapping::ScoredMappingMatrix $matrix - a scoring + matrix + Example : $score_builder->internal_id_rescore($gene_scores); + Description : Rescore ambiguous mappings based on internal Ids. This is the + last disambiguation step and is only useful if objects with the + same internal Id were used in source and target dbs (e.g. in + patch builds or if objects were copied from source to target). + + If a source and target gene have the same internal Id and there + are mappings to other target genes then these *other* mappings + are penalised. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + +sub internal_id_rescore { + my $self = shift; + my $matrix = shift; + + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $i = 0; + + foreach my $source ( @{ $matrix->get_all_sources } ) { + my @entries = + sort { $b <=> $a } @{ $matrix->get_Entries_for_source($source) }; + + # nothing to do if we only have one mapping + if ( scalar(@entries) == 1 ) { next } + + # only penalise if mappings are ambiguous + if ( $entries[0]->score != $entries[1]->score ) { next } + + # only penalise if one source id == target id where score == best + # score + my $ambiguous = 0; + + foreach my $e (@entries) { + if ( $e->target == $source and $e->score == $entries[0]->score() ) + { + $ambiguous = 1; + last; + } + } + + if ( !$ambiguous ) { next } + + # now penalise those where source id != target id and score == best + # score + foreach my $e (@entries) { + if ( $e->target != $source and $e->score == $entries[0]->score() ) + { + # PENALTY: Reduce score for ambiguous mappings. + $matrix->set_score( $source, $e->target(), 0.9*$e->score() ); + $i++; + } + } + + } ## end foreach my $source ( @{ $matrix...}) + + $self->logger->debug("Scored entries with internal ID mismatch: $i\n", + 1 ); +} ## end sub internal_id_rescore + + +=head2 log_matrix_stats + + Arg[1] : Bio::EnsEMBL::Idmapping::ScoredMappingMatrix $matrix - a scoring + matrix + Example : $score_builder->log_matrix_stats; + Description : Logs scoring matrix statistics (number of entries, min/max/avg + scores). + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub log_matrix_stats { + my $self = shift; + my $matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('You must provide a ScoredMappingMatrix.'); + } + + my $fmt1 = "%-40s%10.0f\n"; + my $fmt2 = "%-40s%10.5f\n"; + + $self->logger->info(sprintf($fmt1, "Scoring matrix entries:", + $matrix->get_entry_count), 1); + + $self->logger->info(sprintf($fmt1, "Scoring matrix sources:", + $matrix->get_source_count), 1); + + $self->logger->info(sprintf($fmt1, "Scoring matrix targets:", + $matrix->get_target_count), 1); + + $self->logger->info(sprintf($fmt2, "Average score:", + $matrix->get_average_score), 1); + + my ($min, $max) = @{ $matrix->get_min_max_scores }; + $self->logger->info(sprintf($fmt2, "Min. score:", $min), 1); + $self->logger->info(sprintf($fmt2, "Max. score:", $max), 1); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/ScoredMappingMatrix.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,809 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::ScoredMappingMatrix - object holding a list of scored +Entries + +=head1 SYNOPSIS + + # create a new ScoredMappingMatrix + my $gene_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_scores.ser', + ); + + # add entries + my $gene_scores->add_Entry($entry1); + + # serialise to file + $gene_scores->write_to_file; + + # later, read these gene_scores from file + my $gene_scores1 = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_gene_scores.ser', + ); + $gene_scores1->read_from_file; + +=head1 DESCRIPTION + +This object represents a collection of scores between source and target +objects. It holds a list of Bio::EnsEMBL::IdMapping::Entry objects and +has methods to retrieve indiviual or all Entries, as well as derived +data like number of unique sources or targets, or various counts and +averages. + +It is the main collection for dealing with scored relationships in the +stable Id mapping application. + +=head1 METHODS + + new + flush + sub_matrix + add_Entry + update_Entry + remove_Entry + add_score + set_score + get_Entry + get_score + get_targets_for_source + get_Entries_for_source + get_sources_for_target + get_Entries_for_target + get_all_Entries + get_all_sources + get_all_targets + get_entry_count + size + get_source_count + get_target_count + get_min_max_scores + get_average_score + merge + log + to_string + +=cut + +package Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::Serialisable; +our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable); + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::Entry; + + +=head2 new + + Arg[1-N] : see superclass + Example : my $gene_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_scores.ser', + ); + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + # initialise internal datastructure + unless ($self->loaded) { + $self->{'cache'}->{'matrix'} = {}; + $self->{'cache'}->{'source_list'} = {}; + $self->{'cache'}->{'target_list'} = {}; + } + + return $self; +} + + +=head2 flush + + Example : $gene_scores->flush; + Description : Flushes (empties) the scoring matrix. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub flush { + my $self = shift; + + # reset caches + $self->{'cache'}->{'matrix'} = {}; + $self->{'cache'}->{'source_list'} = {}; + $self->{'cache'}->{'target_list'} = {}; +} + + +=head2 sub_matrix + + Arg[1] : Int $start - start index (inclusive) + Arg[2] : Int $end - end index (inclusive) + Example : # get the first 1000 elements in the matrix + my $sub_matrix = $gene_scores->sub_matrix(1, 1000); + Description : Returns a sub-matrix of the ScoredMappingMatrix. The arguments + ($start and $end) specify the position of the first and last + element to return (inclusive, counting starts with element 1, + not 0) + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub sub_matrix { + my $self = shift; + my $start = shift; + my $end = shift; + + # default to returning the full matrix if no start/end provided + $start ||= 1; + $end ||= $self->size; + + my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $self->dump_path, + -CACHE_FILE => $self->cache_file_name, + ); + my $i = 0; + + foreach my $key (sort keys %{ $self->{'cache'}->{'matrix'} }) { + $i++; + next if ($i < $start); + last if ($i > $end); + + my ($source, $target) = split(/:/, $key); + $sub_matrix->add_score($source, $target, + $self->{'cache'}->{'matrix'}->{$key}); + } + + return $sub_matrix; +} + + +=head2 add_Entry + + Arg[1] : Bio::EnsEMBL::IdMapping::Entry $entry - Entry to add + Example : $gene_scores->add_Entry($entry); + Description : Adds an Entry to the scoring matrix. + Return type : Float - the Entry's score + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub add_Entry { + my $self = shift; + my $entry = shift; + + unless ($entry and $entry->isa('Bio::EnsEMBL::IdMapping::Entry')) { + throw("Need a Bio::EnsEMBL::IdMapping::Entry"); + } + + return $self->add_score($entry->source, $entry->target, $entry->score); +} + + +=head2 update_Entry + + Arg[1] : Bio::EnsEMBL::IdMapping::Entry $entry - Entry to update + Example : $gene_scores->update_Entry($entry); + Description : Updates an Entry (or rather its score) in the scoring matrix. + Actually delegates to add_Entry(), only there as an intuitively + named wrapper. + Return type : Float - the Entry's score + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub update_Entry { + return $_[0]->add_Entry($_[1]); +} + + +# +# not needed in the current application, so not implemented +# +sub remove_Entry { + warning('Method ScoredMappingMatrix->remove_Entry not implemented (yet).'); +} + + +=head2 add_score + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Arg[3] : Float $score - score for source/target pair + Example : $gene_scores->add_score(1234, 5678, 0.997); + Description : Adds a score for a source/target pair to the scoring matrix. + This is a low-level version of add_Entry(). + Return type : Float - the score + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub add_score { + my $self = shift; + my $source = shift; + my $target = shift; + my $score = shift; + + # make sure you don't put duplicates on the source and target lists + unless (exists($self->{'cache'}->{'matrix'}->{"$source:$target"})) { + push @{ $self->{'cache'}->{'source_list'}->{$source} }, $target; + push @{ $self->{'cache'}->{'target_list'}->{$target} }, $source; + } + + $self->{'cache'}->{'matrix'}->{"$source:$target"} = $score; +} + + +=head2 set_score + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Arg[3] : Float $score - score for source/target pair + Example : $gene_scores->set_score(1234, 5678, 0.997); + Description : Sets the score for a source/target pair in the scoring matrix. + This method is similar to add_score, but assumes that the Entry + has been added before, so won't update the sources and target + lists. + Return type : Float - the score + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub set_score { + my $self = shift; + my $source = shift; + my $target = shift; + my $score = shift; + + $self->{'cache'}->{'matrix'}->{"$source:$target"} = $score; +} + + +=head2 get_Entry + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Example : my $entry = $gene_scores->get_Entry($source_gene->id, + $target_gene->id); + Description : Gets an Entry from the scoring matrix for a given source and + target object. + Return type : Bio::EnsEMBL::IdMapping::Entry or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_Entry { + my $self = shift; + my $source = shift; + my $target = shift; + + if (exists($self->{'cache'}->{'matrix'}->{"$source:$target"})) { + return Bio::EnsEMBL::IdMapping::Entry->new_fast( + [$source, $target, $self->{'cache'}->{'matrix'}->{"$source:$target"}] + ); + } else { + return undef; + } +} + + +=head2 get_score + + Arg[1] : Int $source - source object's internal Id ("dbID") + Arg[2] : Int $target - target object's internal Id ("dbID") + Example : my $score = $gene_scores->get_score($source_gene->id, + $target_gene->id); + Description : Gets the score from the scoring matrix for a given source and + target object. + Return type : Float or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_score { + my $self = shift; + my $source = shift; + my $target = shift; + + + if (exists($self->{'cache'}->{'matrix'}->{"$source:$target"})) { + return $self->{'cache'}->{'matrix'}->{"$source:$target"}; + } else { + return undef; + } +} + + +=head2 get_targets_for_source + + Arg[1] : Int $source - source object's internal Id ("dbID") + Example : my @targets = @{ $gene_scores->get_targets_for_source(1234) }; + Description : Returns a list of all targets which have a score against a given + source object. + Return type : Arrayref of Int (target objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_targets_for_source { + my $self = shift; + my $source = shift; + + return $self->{'cache'}->{'source_list'}->{$source} || []; +} + + +=head2 get_Entries_for_source + + Arg[1] : Int $source - source object's internal Id ("dbID") + Example : my @entries = @{ $gene_scores->get_Entries_for_source(1234) }; + Description : Returns a list of all Entries in the scoring matrix for a given + source object. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_Entries_for_source { + my $self = shift; + my $source = shift; + + return [ map { $self->get_Entry($source, $_) } + @{ $self->{'cache'}->{'source_list'}->{$source} || [] } ]; +} + + +=head2 get_sources_for_target + + Arg[1] : Int $target - target object's internal Id ("dbID") + Example : my @sources = @{ $gene_scores->get_sources_for_target(5678) }; + Description : Returns a list of all sources which have a score against a given + target object. + Return type : Arrayref of Int (source objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_sources_for_target { + my $self = shift; + my $target = shift; + + return $self->{'cache'}->{'target_list'}->{$target} || []; +} + + +=head2 get_Entries_for_target + + Arg[1] : Int $target - target object's internal Id ("dbID") + Example : my @entries = @{ $gene_scores->get_Entries_for_target(5678) }; + Description : Returns a list of all Entries in the scoring matrix for a given + target object. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_Entries_for_target { + my $self = shift; + my $target = shift; + + return [ map { $self->get_Entry($_, $target) } + @{ $self->{'cache'}->{'target_list'}->{$target} || [] } ]; +} + + +=head2 get_all_Entries + + Example : foreach my $entry (@{ $gene_scores->get_all_Entries }) { + # do something with the entry + } + Description : Returns a list of all Entries in the scoring matrix. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_Entries { + my $self = shift; + + my @result = (); + + foreach my $key (keys %{ $self->{'cache'}->{'matrix'} }) { + my ($source, $target) = split(/:/, $key); + push @result, Bio::EnsEMBL::IdMapping::Entry->new_fast( + [$source, $target, $self->{'cache'}->{'matrix'}->{$key}] + ); + } + + return \@result; +} + + +=head2 get_all_sources + + Example : my @sources = @{ $gene_scores->get_all_sources }; + Description : Returns a list of all sources in the scoring matrix. + Return type : Arrayref of Int (source objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_sources { + my $self = shift; + return [keys %{ $self->{'cache'}->{'source_list'} }]; +} + + +=head2 get_all_targets + + Example : my @targets = @{ $gene_scores->get_all_targets }; + Description : Returns a list of all targets in the scoring matrix. + Return type : Arrayref of Int (target objects' internal Ids) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_targets { + my $self = shift; + return [keys %{ $self->{'cache'}->{'target_list'} }]; +} + + +=head2 get_entry_count + + Example : my $num_entries = $gene_scores->get_entry_count; + Description : Returns the number of Entries in the scoring matrix. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_entry_count { + my $self = shift; + return scalar(keys %{ $self->{'cache'}->{'matrix'} }); +} + + +=head2 size + + Example : my $size = $gene_scores->size; + Description : Returns the size of the scoring matrix. Same value as returned + by get_entry_count(). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub size { + return $_[0]->get_entry_count; +} + + +=head2 get_source_count + + Example : my $num_sources = $gene_scores->get_source_count; + Description : Returns the number of distinct sources in the scoring matrix. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_source_count { + my $self = shift; + return scalar(keys %{ $self->{'cache'}->{'source_list'} }); +} + + +=head2 get_target_count + + Example : my $num_targets = $gene_scores->get_target_count; + Description : Returns the number of distinct targets in the scoring matrix. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_target_count { + my $self = shift; + return scalar(keys %{ $self->{'cache'}->{'target_list'} }); +} + + +=head2 get_min_max_scores + + Example : my ($min_score, $max_score) = + @{ $gene_scores->get_min_max_scores }; + Description : Returns the mininum and maximum score in the scoring matrix. + Return type : Arrayref of Float [min_score, max_score] + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_min_max_scores { + my $self = shift; + + my @keys = keys %{ $self->{'cache'}->{'matrix'} }; + + return [undef, undef] unless (@keys); + + # initialise; this should make loop quicker + my $min = $self->{'cache'}->{'matrix'}->{$keys[0]}; + my $max = $self->{'cache'}->{'matrix'}->{$keys[0]}; + + foreach my $key (@keys) { + $min = $self->{'cache'}->{'matrix'}->{$key} if ($min > $self->{'cache'}->{'matrix'}->{$key}); + $max = $self->{'cache'}->{'matrix'}->{$key} if ($max < $self->{'cache'}->{'matrix'}->{$key}); + } + + return [$min, $max]; +} + + +=head2 get_average_score + + Example : my $avg_score = $gene_scores->get_average_score; + Description : Returns the average (mean) score in the matrix. + Return type : Float + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_average_score { + my $self = shift; + + my @keys = keys %{ $self->{'cache'}->{'matrix'} }; + + return undef unless (@keys); + + my $total = 0; + + foreach my $key (@keys) { + $total += $self->{'cache'}->{'matrix'}->{$key}; + } + + return $total/scalar(@keys); +} + + +=head2 merge + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - another + matrix to merge with + Example : my $update_count = $gene_scores->merge($more_gene_scores); + Description : Merges two scoring matrices. If there's an Entry for a + source/target pair in both matrices, the higher score will be + retained. + Return type : Int - number of Entries added or updated + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub merge { + my $self = shift; + my $matrix = shift; + + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw( + 'You must provide a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix' + ); + } + + my $c = 0; + + # merge the matrices + foreach my $key ( keys %{ $matrix->{'cache'}->{'matrix'} } ) { + if ( !defined( $self->{'cache'}->{'matrix'}->{$key} ) + or ( $self->{'cache'}->{'matrix'}->{$key} < + $matrix->{'cache'}->{'matrix'}->{$key} ) ) + { + $self->{'cache'}->{'matrix'}->{$key} = + $matrix->{'cache'}->{'matrix'}->{$key}; + $c++; + } + } + + # merge sources and target lists + foreach my $key ( keys %{ $matrix->{'cache'}->{'source_list'} } ) { + if ( defined( $self->{'cache'}->{'source_list'}->{$key} ) ) { + # need to merge lists + my %unique = + map { $_ => 1 } @{ $self->get_targets_for_source($key) }; + map { $unique{$_} = 1 } + @{ $matrix->get_targets_for_source($key) }; + $self->{'cache'}->{'source_list'}->{$key} = [ keys %unique ]; + } else { + # no merging needed + $self->{'cache'}->{'source_list'}->{$key} = + $matrix->{'cache'}->{'source_list'}->{$key}; + } + } + + foreach my $key ( keys %{ $matrix->{'cache'}->{'target_list'} } ) { + if ( defined( $self->{'cache'}->{'target_list'}->{$key} ) ) { + # need to merge lists + my %unique = + map { $_ => 1 } @{ $self->get_sources_for_target($key) }; + map { $unique{$_} = 1 } + @{ $matrix->get_sources_for_target($key) }; + $self->{'cache'}->{'target_list'}->{$key} = [ keys %unique ]; + } else { + # no merging needed + $self->{'cache'}->{'target_list'}->{$key} = + $matrix->{'cache'}->{'target_list'}->{$key}; + } + } + + return $c; +} ## end sub merge + + +=head2 log + + Arg[1] : String $type - object type (e.g. 'gene') + Arg[2] : String $dump_path - path for writing output + Example : $gene_scores->log('gene', $conf->param('basedir')); + Description : Logs all Entries in the scoring matrix to a file. Used for + debugging. + Return type : none + Exceptions : thrown on I/0 error + Caller : general + Status : At Risk + : under development + +=cut + +sub log { + my $self = shift; + my $type = shift; + my $dump_path = shift; + + my $debug_path = path_append($dump_path, 'debug'); + my $logfile = "$debug_path/${type}_scores.txt"; + + open(my $fh, '>', $logfile) or + throw("Unable to open $logfile for writing: $!"); + + foreach my $entry (@{ $self->get_all_Entries }) { + print $fh ($entry->to_string."\n"); + } + + close($fh); +} + + +=head2 to_string + + Example : print LOG $gene_scores->to_string, "\n"; + Description : Returns a string representation of the scoring matrix. This is + simply a multi-line string, where each line is a stringified + Entry. + Useful for debugging and logging. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub to_string { + my $self = shift; + + my $string = ''; + + foreach my $entry (@{ $self->get_all_Entries }) { + $string .= $entry->to_string."\n"; + } + + return $string; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/Serialisable.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/Serialisable.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,277 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::Serialisable - base class for serialisable objects + +=head1 SYNOPSIS + + # instantiate an object which extends Serialisable + my $object = YourObject->new( + -DUMP_PATH => '/tmp', + -CACHE_FILE => 'object_cache.ser', + ); + + # serialise object to file + my $filesize = $object->write_to_file; + print LOG "Serialised object to file of size $filesize.\n"; + + # later, create another object defining the same serialisation + # location. specifying -LOAD_AUTO will automatically load it from the + # serialisation file. + my $object1 = YourObject->new( + -DUMP_PATH => '/tmp', + -CACHE_FILE => 'object_cache.ser', + -LOAD_AUTO => 1, + ); + + # alternatively, manually load the object from file + $object1->load_from_file; + +=head1 DESCRIPTION + +This is the base class for serialisable objects used by the +stable Id mapping. It's essentially an OO wrapper for Storable, +providing a method to store (write_to_file(()) and one to retrieve +(read_from_file()) serialised objects. + +This class is not instantiated itself, but rather extended by +implementing classes. + +=head1 METHODS + + new + write_to_file + read_from_file + dump_path + cache_file_name + cache_file + loaded + +=cut + +package Bio::EnsEMBL::IdMapping::Serialisable; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes); +use Storable qw(nstore retrieve); + + +=head2 new + + Arg [DUMP_PATH] : String - path for object serialisation + Arg [CACHE_FILE] : String - filename of serialised object + Arg [AUTO_LOAD] : Boolean - determines whether object should be automatically + loaded on instantiation + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::Serialisable implementing object + Exceptions : thrown on missing argument + Caller : implementing subclass + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($dump_path, $cache_file, $auto_load) = + rearrange([qw(DUMP_PATH CACHE_FILE AUTO_LOAD)], @_); + + throw("You must provide a cache file name") unless ($cache_file); + + my $self = {}; + bless ($self, $class); + + # initialise internal datastructure + $self->{'dump_path'} = $dump_path || '.'; + $self->{'cache_file_name'} = $cache_file; + + # automatically load serialised object from file if requested + if ($auto_load) { + if (-s $self->cache_file) { + $self->read_from_file; + $self->{'loaded'} = 1; + } + } + + return $self; +} + + +=head2 write_to_file + + Example : my $filesize = $object->write_to_file; + Description : Serialises an object to a file (determined by + $self->cache_file). + Return type : String - size of serialisation file + Exceptions : thrown on I/O errors + Caller : general + Status : At Risk + : under development + +=cut + +sub write_to_file { + my $self = shift; + + # create dump directory if it doesn't exist + if (my $dump_path = $self->dump_path) { + unless (-d $dump_path) { + system("mkdir -p $dump_path") == 0 or + throw("Unable to create directory $dump_path.\n"); + } + } + + my $cache_file = $self->cache_file; + + eval { nstore($self->{'cache'}, $cache_file) }; + if ($@) { + throw("Unable to store $cache_file: $@\n"); + } + + my $size = -s $cache_file; + return parse_bytes($size); +} + + +=head2 read_from_file + + Example : $object->read_from_file; + Description : Reads a serialised object from file (determined by + $self->cache_file). + Return type : Bio::EnsEMBL::IdMapping::Serialisable implementing object + Exceptions : thrown on I/O errors + Caller : general + Status : At Risk + : under development + +=cut + +sub read_from_file { + my $self = shift; + + my $cache_file = $self->cache_file; + + unless (-s $cache_file) { + throw("No valid cache file found at $cache_file."); + } + + eval { $self->{'cache'} = retrieve($cache_file); }; + if ($@) { + throw("Unable to retrieve cache: $@"); + } + + return $self; +} + + +=head2 dump_path + + Arg[1] : String - dump path for serialisation + Example : $object->dump_path('/tmp'); + Description : Getter/setter for the dump path for serialisation. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub dump_path { + my $self = shift; + $self->{'dump_path'} = shift if (@_); + return $self->{'dump_path'}; +} + + +=head2 cache_file_name + + Arg[1] : String - file name for serialisation + Example : $object->cache_file_name('object_cache.ser'); + Description : Getter/setter for the file name for serialisation. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub cache_file_name { + my $self = shift; + $self->{'cache_file_name'} = shift if (@_); + return $self->{'cache_file_name'}; +} + + +=head2 cache_file + + Example : my $cache_file = $object->cache_file; + Description : Returns the path and name of the serialised object file. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub cache_file { + my $self = shift; + return $self->dump_path.'/'.$self->cache_file_name; +} + + +=head2 loaded + + Arg[1] : Boolean - "loaded" status + Example : if ($object->loaded) { + # do something with the object that was loaded from a file + } else { + # the object wasn't loaded but is new, so fill it + } + Description : Indicates whether a given object was loaded from its serialised + state on disk. + Return type : Boolean - TRUE if loaded from disk, FALSE otherwise + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub loaded { + my $self = shift; + $self->{'loaded'} = shift if (@_); + return $self->{'loaded'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/AedesAegypti.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/AedesAegypti.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,68 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::IdMapping::StableIdGenerator::AedesAegypti; + +# Package that implements incrementing and verification of Aedes aegypti +# stable IDs as used by the VectorBase project. + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric); + +sub increment_stable_id { + + # This method will increment a stable ID. For Aedes aegypti, it will + # pick out the numerical part of the stable ID (no matter what type of + # stable ID it is) and increment it by one. It will then replace the + # numerical part by the incremented value and return the new stable + # ID. The parsing of the stable ID is very naive. + + my ( $self, $stable_id ) = @_; + + if ( !$self->is_valid($stable_id) ) { + throw("Unknown or missing stable ID: $stable_id."); + } + + $stable_id =~ /^(\D*)(\d+)(\D*)/; + + my $number_as_string = "$2"; + my $number = $2 + 1; + $stable_id = sprintf( + "%s" . sprintf( "%%0%dd", length($number_as_string) ) . "%s", + $1, $number, $3 ); + + return $stable_id; +} + +sub is_valid { + + # A stable ID is a valid Aedes aegypti stable ID if it begins with the + # character string "AAEL". + + my ( $self, $stable_id ) = @_; + + if ( !( defined($stable_id) && $stable_id =~ /^AAEL/ ) ) { return 0 } + + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/AnophelesGambiae.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/AnophelesGambiae.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,77 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::IdMapping::StableIdGenerator::AedesAegypti; + +# Package that implements incrementing and verification of Aedes aegypti +# stable IDs as used by the VectorBase project. +# Based on Aedes_Aegypti.pm +# Differs from Aedes in that Exon stable ids like Ennnnnn not AAEL.ennnnnn +# and gene/transcript/translation start AGAP not AAEL +# also need to exclude old Ensembl-style ids + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric); + +sub increment_stable_id { + + # This method will increment a stable ID. For Anopheles, it will + # pick out the numerical part of the stable ID (no matter what type of + # stable ID it is) and increment it by one. It will then replace the + # numerical part by the incremented value and return the new stable + # ID. The parsing of the stable ID is very naive. + + my ( $self, $stable_id ) = @_; + + if ( !$self->is_valid($stable_id) ) { + throw("Unknown or missing stable ID: $stable_id."); + } + + $stable_id =~ /^(\D*)(\d+)(\D*)/; + + my $number_as_string = "$2"; + my $number = $2 + 1; + $stable_id = sprintf( + "%s" . sprintf( "%%0%dd", length($number_as_string) ) . "%s", + $1, $number, $3 ); + + return $stable_id; +} + +sub is_valid { + + # A stable ID is a valid Anopheles stable ID if it begins with the + # character string "AGAP" or (for exons) just "E" + # explicitly make the exon one E+digits to exclude old-style ENSANG ids + # otherwise ENSANGnnn found as higher then AGAPnnn + # when initial_stable_id method checks archive tables + + my ( $self, $stable_id ) = @_; + + if ( !( defined($stable_id) && ( $stable_id =~ /^AGAP/ || $stable_id =~ /^E\d+$/ ) ) ) { + return 0; + } + + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/EnsemblGeneric.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/EnsemblGeneric.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,310 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric - default Ensembl +StableIdGenerator implementation + +=head1 SYNOPSIS + + # inject the confiured StableIdGenerator plugin + my $stable_id_generator = $conf->param('plugin_stable_id_generator'); + inject($stable_id_generator); + + # create a new StableIdGenerator object + my $generator_instance = $stable_id_generator->new( + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache + ); + + # determine starting stable ID for new assignments + my $new_stable_id = $generator_instance->initial_stable_id('gene'); + + # loop over genes + foreach my $target_gene (@all_target_genes) { + + # if the stable Id for this gene was mapped, assign it + if ( $mapping_exists{ $target_gene->id } ) { + my $source_gene = $mappings{ $target_gene->id }; + $target_gene->stable_id( $source_gene->stable_id ); + + # calculate and set version + my $version = + $generator_instance->calculate_version( $source_gene, + $target_gene ); + $target_gene->version($version); + + # no mapping exists, assign a new stable Id + } else { + $target_gene->stable_id($new_stable_id); + $target_gene->version('1'); + + # increment the stable Id (to be assigned to the next unmapped gene) + $new_stable_id = + $generator_instance->increment_stable_id($new_stable_id); + } + } + +=head1 DESCRIPTION + +This is the default implementation for a StableIdGenerator, which +is used by Bio::EnsEMBL::IdMapping::StableIdMapper to generate new +stable Ids and increment versions on mapped stable Ids. Refer to the +documentation in this module if you would like to implement your own +StableIdGenerator. + +The stable Id mapping application allows you to plugin your own +implementation by specifying it with the --plugin_stable_id_generator +configuration parameter. + +Requirements for a StableIdGenerator plugin: + + - inherit from Bio::EnsEMBL::IdMapping::BaseObject + - implement all methods listed in METHODS below (see method POD for + signatures) + +=head1 METHODS + + initial_stable_id + increment_stable_id + calculate_version + +=cut + +package Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 initial_stable_id + + Arg[1] : String $type - an entity type (gene|transcript|translation|exon) + Example : my $new_stable_id = $generator->initial_stable_id('gene'); + Description : Determine the initial stable Id to use for new assignments. This + method is called once at the beginning of stable Id mapping. + Return type : String - a stable Id of appropriate type + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::StableIdMapper::map_stable_ids() + Status : At Risk + : under development + +=cut + +sub initial_stable_id { + my ( $self, $type ) = @_; + + my $init_stable_id; + + # Use stable ID from configuration if set. + $init_stable_id = $self->conf->param("starting_${type}_stable_id"); + if ( defined($init_stable_id) ) { + $self->logger->debug( "Using pre-configured $init_stable_id " . + "as base for new $type stable IDs.\n" ); + return $init_stable_id; + } + + my $s_dba = $self->cache->get_DBAdaptor('source'); + my $s_dbh = $s_dba->dbc->db_handle; + + # look in the ${type} table first + my $sql = qq( + SELECT MAX(stable_id) + FROM ${type} + WHERE stable_id LIKE "ENS%" + OR stable_id LIKE "ASMPATCH%" + ); + + $init_stable_id = $self->fetch_value_from_db( $s_dbh, $sql ); + + # Also look in gene_archive to make sure there are no larger IDs + # there. + if ( $type ne 'exon' ) { + $sql = qq(SELECT MAX(${type}_stable_id) FROM gene_archive); + my $archived_stable_id = $self->fetch_value_from_db( $s_dbh, $sql ); + if ( $archived_stable_id && + $self->is_valid($archived_stable_id) && + ( $archived_stable_id gt $init_stable_id ) ) + { + $init_stable_id = $archived_stable_id; + } + } + + if ( defined($init_stable_id) ) { + # Since $init_stable_id now is the highest existing stable ID for + # this object type, we need to increment it to find the first one we + # want to use for new assignments. + $init_stable_id = $self->increment_stable_id($init_stable_id); + + $self->logger->debug( + "Using $init_stable_id as base for new $type stable IDs.\n"); + + } + else { + $self->logger->warning( + "Can't find highest ${type}_stable_id in source db.\n" ); + } + + return $init_stable_id; +} ## end sub initial_stable_id + + +=head2 increment_stable_id + + Arg[1] : String $stable_id - the stable Id to increment + Example : $next_stable_id = $generator->increment_stable_id( + $current_stable_id); + Description : Increments the stable Id used for new assigments. This method is + called after each new stable Id assigment to generate the next + stable Id to be used. + Return type : String - the next new stable Id + Exceptions : thrown on missing or malformed argument + Caller : Bio::EnsEMBL::IdMapping::StableIdMapper::map_stable_ids() + Status : At Risk + : under development + +=cut + +sub increment_stable_id { + my $self = shift; + my $stable_id = shift; + + if ( !$self->is_valid($stable_id) ) { + throw( sprintf( "Unknown or missing stable ID '%s'", $stable_id ) ); + } + + if ( $stable_id =~ /^LRG/ ) { + throw( sprintf( "We do not increment LRG genes... (got '%s'). " + . "Something's wrong.", + $stable_id ) ); + } + + $stable_id =~ /^(ENS|ASMPATCH)([A-Z]+)(\d+)$/; + + my $number = $3; + my $new_stable_id = $1 . $2 . ( ++$number ); + + return $new_stable_id; +} + + +=head2 is_valid + + Arg[1] : String $stable_id - the stable Id to check + Example : unless ($generator->is_valid($stable_id)) { + die "Invalid stable Id: $stable_id.\n"; + } + Description : Tests a stable Id to be valid (according to the Ensembl stable + Id format definition). + Return type : Boolean - TRUE if valid, FALSE otherwise + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_valid { + my ( $self, $stable_id ) = @_; + + if ( defined($stable_id) ) { + if ( $stable_id =~ /^(ENS|ASMPATCH)([A-Z]+)(\d+)$/ + || $stable_id =~ /^LRG/ ) + { + return 1; + } + } + + return 0; +} + + +=head2 calculate_version + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyFeature $s_obj - source object + Arg[2] : Bio::EnsEMBL::IdMapping::TinyFeature $t_obj - target object + Example : my $version = $generator->calculate_version($source_gene, + $target_gene); + $target_gene->version($version); + Description : Determines the version for a mapped stable Id. For Ensembl + genes, the rules for incrementing the version number are: + - exons: if exon sequence changed + - transcript: if spliced exon sequence changed + - translation: if translated sequence changed + - gene: if any of its transcript changed + Return type : String - the version to be used + Exceptions : thrown on wrong argument + Caller : Bio::EnsEMBL::IdMapping::StableIdMapper::map_stable_ids() + Status : At Risk + : under development + +=cut + +sub calculate_version { + my ( $self, $s_obj, $t_obj ) = @_; + + my $version = $s_obj->version(); + + if ( $s_obj->isa('Bio::EnsEMBL::IdMapping::TinyExon') ) { + # increment version if sequence changed + if ( $s_obj->seq() ne $t_obj->seq() ) { ++$version } + } + elsif ( $s_obj->isa('Bio::EnsEMBL::IdMapping::TinyTranscript') ) { + # increment version if spliced exon sequence changed + if ( $s_obj->seq_md5_sum() ne $t_obj->seq_md5_sum() ) { ++$version } + } + elsif ( $s_obj->isa('Bio::EnsEMBL::IdMapping::TinyTranslation') ) { + # increment version if transcript or translation sequences changed + if ( $s_obj->seq() ne $t_obj->seq() ) { ++$version } + } + elsif ( $s_obj->isa('Bio::EnsEMBL::IdMapping::TinyGene') ) { + # increment version if any transcript changed + + my $s_tr_ident = join( + ":", + map { $_->stable_id() . '.' . $_->version() } sort { + $a->stable_id() cmp $b->stable_id() + } @{ $s_obj->get_all_Transcripts() } ); + my $t_tr_ident = join( + ":", + map { $_->stable_id() . '.' . $_->version() } sort { + $a->stable_id() cmp $b->stable_id() + } @{ $t_obj->get_all_Transcripts() } ); + + if ( $s_tr_ident ne $t_tr_ident ) { ++$version } + } + else { + throw( "Unknown object type: " . ref($s_obj) ); + } + + return $version; +} ## end sub calculate_version + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/PristionchusPacificus.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdGenerator/PristionchusPacificus.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,68 @@ +=head1 LICENSE + + Copyright 2010, The WormBase Consortium. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are + permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list + of conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE WORMBASE CONSORTIUM ``AS IS'' AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + The views and conclusions contained in the software and documentation are those of the + authors and should not be interpreted as representing official policies, either expressed + or implied, of The WormBase Consortium. + +=head1 CONTACT + + Please email comments or questions to the public WormBase + help list at . + +=cut + +package Bio::EnsEMBL::IdMapping::StableIdGenerator::PristionchusPacificus; + +# Class to generate WormBase conform Pristionchus IDs to be injected into the mapper + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric); + +# PPAxxxxx +# ups the id by one +sub increment_stable_id { + my ( $self, $lastId ) = @_; + + throw("invalid stable ID: $lastId.") unless ($lastId=~/PPA/); + + $lastId =~ /^PPA(\d+)/; + + my $number = $1+1; + my $stable_id = sprintf("PPA%05d",$number); + + return $stable_id; +} + +# just in case it is actually used somewhere +sub is_valid { + my ( $self, $stableId ) = @_; + + if ( defined($stableId) && $stableId =~ /^PPA\d+/) + {return 1} else {return undef} +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/StableIdMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,762 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::StableIdMapper; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append); +use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; +use POSIX qw(strftime); + + +# instance variables +my %debug_mappings; + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + # inject a StableIdGenerator + # + # If you write your own generators, make sure they extend + # Bio::EnsEMBL::Idmapping::BaseObject and additionally implement these three + # methods: initial_stable_id(), increment_stable_id() and calculate_version(). + my $stable_id_generator = $self->conf->param('plugin_stable_id_generator') || + 'Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric'; + $self->logger->debug("Using $stable_id_generator to generate stable Ids.\n"); + inject($stable_id_generator); + + # create a new StableIdGenerator object + my $generator_instance = $stable_id_generator->new( + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache + ); + $self->stable_id_generator($generator_instance); + + return $self; +} + + +sub generate_mapping_session { + my $self = shift; + + # only run this method once + return if ($self->mapping_session_date); + + $self->logger->info("Generating new mapping_session...\n"); + + $self->mapping_session_date(time); + $self->mapping_session_date_fmt(strftime("%Y-%m-%d %T", + localtime($self->mapping_session_date))); + + my $s_dba = $self->cache->get_DBAdaptor('source'); + my $s_dbh = $s_dba->dbc->db_handle; + my $t_dba = $self->cache->get_DBAdaptor('target'); + my $t_dbh = $t_dba->dbc->db_handle; + + # check if mapping_session_id was manually set by the configuration + my $mapping_session_id = $self->conf->param('mapping_session_id'); + + if ($mapping_session_id) { + + $self->logger->debug("Using manually configured mapping_session_id $mapping_session_id\n", 1); + + } else { + + # calculate mapping_session_id from db + my $sql = qq(SELECT MAX(mapping_session_id) FROM mapping_session); + $mapping_session_id = $self->fetch_value_from_db($s_dbh, $sql); + + unless ($mapping_session_id) { + $self->logger->debug("No previous mapping_session found.\n", 1); + } + + # increment last mapping_session_id + $mapping_session_id++; + + $self->logger->debug("Using mapping_session_id $mapping_session_id\n", 1); + } + + $self->mapping_session_id($mapping_session_id); + + # write old mapping_session table to a file + my $i; + my $fh = $self->get_filehandle('mapping_session.txt', 'tables'); + + my $sth1 = $s_dbh->prepare("SELECT * FROM mapping_session"); + $sth1->execute; + + while (my @row = $sth1->fetchrow_array) { + $i++; + print $fh join("\t", @row); + print $fh "\n"; + } + + $sth1->finish; + + # append the new mapping_session to the file + my $release_sql = qq( + SELECT meta_value FROM meta WHERE meta_key = 'schema_version' + ); + my $old_release = $self->fetch_value_from_db($s_dbh, $release_sql); + my $new_release = $self->fetch_value_from_db($t_dbh, $release_sql); + + my $assembly_sql = qq( + SELECT meta_value FROM meta WHERE meta_key = 'assembly.default' + ); + my $old_assembly = $self->fetch_value_from_db($s_dbh, $assembly_sql); + my $new_assembly = $self->fetch_value_from_db($t_dbh, $assembly_sql); + + unless ($old_release and $new_release and $old_assembly and $new_assembly) { + $self->logger->warning("Not all data for new mapping_session found:\n", 1); + $self->logger->info("old_release: $old_release, new_release: $new_release"); + $self->logger->info("old_assembly: $old_assembly, new_assembly $new_assembly\n", 2); + } + + print $fh join("\t", + $mapping_session_id, + $self->conf->param('sourcedbname'), + $self->conf->param('targetdbname'), + $old_release, + $new_release, + $old_assembly, + $new_assembly, + $self->mapping_session_date_fmt); + + print $fh "\n"; + close($fh); + + $self->logger->info("Done writing ".++$i." mapping_session entries.\n\n"); +} + + +sub map_stable_ids { + my $self = shift; + my $mappings = shift; + my $type = shift; + + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw("Need a Bio::EnsEMBL::IdMapping::MappingList of ${type}s."); + } + + # generate a new mapping_session and write all mapping_session data to a file + $self->generate_mapping_session; + + $self->logger->info("== Stable ID mapping for $type...\n\n", 0, 'stamped'); + + # check if there are any objects of this type at all + my %all_sources = %{ $self->cache->get_by_name("${type}s_by_id", 'source') }; + my %all_targets = %{ $self->cache->get_by_name("${type}s_by_id", 'target') }; + unless (scalar(keys %all_sources)) { + $self->logger->info("No cached ${type}s found.\n\n"); + return; + } + + my %stats = map { $_ => 0 } + qw(mapped_known mapped_novel new lost_known lost_novel); + + # create some lookup hashes from the mappings + my %sources_mapped = (); + my %targets_mapped = (); + my %scores_by_target = (); + + foreach my $e (@{ $mappings->get_all_Entries }) { + $sources_mapped{$e->source} = $e->target; + $targets_mapped{$e->target} = $e->source; + $scores_by_target{$e->target} = $e->score; + } + + # determine starting stable ID for new assignments + my $new_stable_id = $self->stable_id_generator->initial_stable_id($type); + + # + # assign mapped and new stable IDs + # + foreach my $tid (keys %all_targets) { + + my $t_obj = $all_targets{$tid}; + + # a mapping exists, assign stable ID accordingly + if (my $sid = $targets_mapped{$tid}) { + + my $s_obj = $all_sources{$sid}; + + # set target's stable ID and created_date + $t_obj->stable_id($s_obj->stable_id); + $t_obj->created_date($s_obj->created_date); + + # calculate and set version + $t_obj->version($self->stable_id_generator->calculate_version( + $s_obj, $t_obj)); + + # change modified_date if version changed + if ($s_obj->version == $t_obj->version) { + $t_obj->modified_date($s_obj->modified_date); + } else { + $t_obj->modified_date($self->mapping_session_date); + } + + # create a stable_id_event entry (not for exons) + unless ( $type eq 'exon' ) { + # Only add events when something changed. + if ( !( $s_obj->stable_id eq $t_obj->stable_id && + $s_obj->version == $t_obj->version && + $scores_by_target{$tid} > 0.9999 ) ) + { + my $key = join( "\t", + $s_obj->stable_id, $s_obj->version, + $t_obj->stable_id, $t_obj->version, + $self->mapping_session_id, $type, + $scores_by_target{$tid} ); + $self->add_stable_id_event( 'new', $key ); + } + } + + # add to debug hash + push @{ $debug_mappings{$type} }, [ $sid, $tid, $t_obj->stable_id ]; + + # stats + if ($s_obj->is_known) { + $stats{'mapped_known'}++; + } else { + $stats{'mapped_novel'}++; + } + + # no mapping was found, assign a new stable ID + } else { + + $t_obj->stable_id($new_stable_id); + $t_obj->version(1); + $t_obj->created_date($self->mapping_session_date); + $t_obj->modified_date($self->mapping_session_date); + + # create a stable_id_event entry (not for exons) + unless ($type eq 'exon') { + my $key = join("\t", + '\N', + 0, + $t_obj->stable_id, + $t_obj->version, + $self->mapping_session_id, + $type, + 0 + ); + $self->add_stable_id_event('new', $key); + } + + # increment the stable Id (to be assigned to the next unmapped object) + $new_stable_id = $self->stable_id_generator->increment_stable_id( + $new_stable_id); + + # stats + $stats{'new'}++; + + } + + } + + # + # deletion events for lost sources + # + my $fh; + if ($type eq 'gene' or $type eq 'transcript') { + $fh = $self->get_filehandle("${type}s_lost.txt", 'debug'); + } + + foreach my $sid (keys %all_sources) { + + my $s_obj = $all_sources{$sid}; + + # no mapping exists, add deletion event + unless ($sources_mapped{$sid}) { + unless ($type eq 'exon') { + my $key = join("\t", + $s_obj->stable_id, + $s_obj->version, + '\N', + 0, + $self->mapping_session_id, + $type, + 0 + ); + $self->add_stable_id_event('new', $key); + } + + # stats + my $status; + if ($s_obj->is_known) { + $stats{'lost_known'}++; + $status = 'known'; + } else { + $stats{'lost_novel'}++; + $status = 'novel'; + } + + # log lost genes and transcripts (for debug purposes) + # + # The Java app did this with a separate method + # (StableIdMapper.dumpLostGeneAndTranscripts()) which also claims to log + # losses due to merge. Since at that point this data isn't available yet + # the logging can be done much more efficient here + if ($type eq 'gene' or $type eq 'transcript') { + print $fh $s_obj->stable_id, "\t$status\n"; + } + } + } + + close($fh) if (defined($fh)); + + # + # write stable IDs to file + # + $self->write_stable_ids_to_file($type, \%all_targets); + + # also generate and write stats to file + $self->generate_mapping_stats($type, \%stats); + + $self->logger->info("Done.\n\n"); +} + + +sub generate_similarity_events { + my ( $self, $mappings, $scores, $type ) = @_; + + # argument checks + unless ( $mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') ) + { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + unless ( $scores and + $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + throw("Need a type (gene|transcript|translation).") unless ($type); + + my $mapped; + + # + # add similarities for mapped entries + # + foreach my $e ( @{ $mappings->get_all_Entries } ) { + + # create lookup hash for mapped sources and targets; we'll need this + # later + $mapped->{'source'}->{ $e->source } = 1; + $mapped->{'target'}->{ $e->target } = 1; + + # loop over all other entries which contain either source or target; + # add similarity if score is within 1.5% of this entry (which is the + # top scorer) + my @others = @{ $scores->get_Entries_for_target( $e->target ) }; + push @others, @{ $scores->get_Entries_for_source( $e->source ) }; + + while ( my $e2 = shift(@others) ) { + + # skip self + if ( ( $e->source eq $e2->source ) and + ( $e->target eq $e2->target ) ) + { + next; + } + + if ( $e2->score > ( $e->score*0.985 ) ) { + + my $s_obj = + $self->cache->get_by_key( "${type}s_by_id", 'source', + $e2->source ); + my $t_obj = + $self->cache->get_by_key( "${type}s_by_id", 'target', + $e2->target ); + + my $key = join( "\t", + $s_obj->stable_id, $s_obj->version, + $t_obj->stable_id, $t_obj->version, + $self->mapping_session_id, $type, + $e2->score ); + $self->add_stable_id_event( 'similarity', $key ); + + } + + # [todo] add overlap hack here? (see Java code) + # probably better solution: let synteny rescoring affect this + # decision + } ## end while ( my $e2 = shift(@others...)) + + } ## end foreach my $e ( @{ $mappings...}) + + # + # similarities for other entries + # + foreach my $dbtype ( keys %$mapped ) { + + # note: $dbtype will be either 'source' or 'target' + my $m1 = "get_all_${dbtype}s"; + my $m2 = "get_Entries_for_${dbtype}"; + + foreach my $id ( @{ $scores->$m1 } ) { + + # skip if this is a mapped source/target + if ( $mapped->{$dbtype}->{$id} ) { next } + + my @entries = + sort { $b->score <=> $a->score } @{ $scores->$m2($id) }; + + unless (@entries) { next } + + # skip if top score < 0.75 + my $top_score = $entries[0]->score; + if ( $top_score < 0.75 ) { next } + + # add similarities for all entries within 5% of top scorer + while ( my $e = shift(@entries) ) { + + if ( $e->score > ( $top_score*0.95 ) ) { + + my $s_obj = + $self->cache->get_by_key( "${type}s_by_id", 'source', + $e->source ); + my $t_obj = + $self->cache->get_by_key( "${type}s_by_id", 'target', + $e->target ); + + my $key = join( "\t", + $s_obj->stable_id, $s_obj->version, + $t_obj->stable_id, $t_obj->version, + $self->mapping_session_id, $type, + $e->score ); + $self->add_stable_id_event( 'similarity', $key ); + + } + } + + } ## end foreach my $id ( @{ $scores...}) + } ## end foreach my $dbtype ( keys %$mapped) + +} ## end sub generate_similarity_events + + +sub filter_same_gene_transcript_similarities { + my $self = shift; + my $transcript_scores = shift; + + # argument checks + unless ($transcript_scores and + $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of transcripts.'); + } + + # create a new matrix for the filtered entries + my $filtered_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => path_append($self->conf->param('basedir'), 'matrix'), + -CACHE_FILE => 'filtered_transcript_scores.ser', + ); + + # lookup hash for all target transcripts + my %all_targets = map { $_->stable_id => 1 } + values %{ $self->cache->get_by_name("transcripts_by_id", 'target') }; + + my $i = 0; + + foreach my $e (@{ $transcript_scores->get_all_Entries }) { + + my $s_tr = $self->cache->get_by_key('transcripts_by_id', 'source', + $e->source); + my $s_gene = $self->cache->get_by_key('genes_by_transcript_id', 'source', + $e->source); + my $t_gene = $self->cache->get_by_key('genes_by_transcript_id', 'target', + $e->target); + # workaround for caching issue: only gene objects in 'genes_by_id' cache + # have a stable ID assigned + #$t_gene = $self->cache->get_by_key('genes_by_id', 'target', $t_gene->id); + + #$self->logger->debug("xxx ".join(":", $s_tr->stable_id, $s_gene->stable_id, + # $t_gene->stable_id)."\n"); + + # skip if source and target transcript are in same gene, BUT keep events for + # deleted transcripts + if (($s_gene->stable_id eq $t_gene->stable_id) and + $all_targets{$s_tr->stable_id}) { + $i++; + next; + } + + $filtered_scores->add_Entry($e); + } + + $self->logger->debug("Skipped $i same gene transcript mappings.\n"); + + return $filtered_scores; +} + + +sub generate_translation_similarity_events { + my $self = shift; + my $mappings = shift; + my $transcript_scores = shift; + + # argument checks + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + unless ($transcript_scores and + $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # create a fake translation scoring matrix + my $translation_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => path_append($self->conf->param('basedir'), 'matrix'), + -CACHE_FILE => 'translation_scores.ser', + ); + + foreach my $e (@{ $transcript_scores->get_all_Entries }) { + + my $s_tl = $self->cache->get_by_key('transcripts_by_id', 'source', + $e->source)->translation; + my $t_tl = $self->cache->get_by_key('transcripts_by_id', 'target', + $e->target)->translation; + + # add an entry to the translation scoring matrix using the score of the + # corresponding transcripts + if ($s_tl and $t_tl) { + $translation_scores->add_score($s_tl->id, $t_tl->id, $e->score); + } + } + + # now generate similarity events using this fake scoring matrix + $self->generate_similarity_events($mappings, $translation_scores, + 'translation'); +} + + +sub write_stable_ids_to_file { + my $self = shift; + my $type = shift; + my $all_targets = shift; + + $self->logger->info("Writing ${type} stable IDs to file...\n"); + + my $fh = $self->get_filehandle("${type}_stable_id.txt", 'tables'); + + my @sorted_targets = map { $all_targets->{$_} } sort { $a <=> $b } + keys %$all_targets; + + foreach my $obj (@sorted_targets) { + + # check for missing created and modified dates + my $created_date = $obj->created_date; + unless ($created_date) { + #$self->logger->debug("Missing created_date for target ". + # $obj->to_string."\n", 1); + $created_date = $self->mapping_session_date; + } + + my $modified_date = $obj->modified_date; + unless ($modified_date) { + #$self->logger->debug("Missing modified_date for target ". + # $obj->to_string."\n", 1); + $modified_date = $self->mapping_session_date; + } + + my $row = join("\t", + $obj->id, + $obj->stable_id, + $obj->version, + strftime("%Y-%m-%d %T", localtime($created_date)), + strftime("%Y-%m-%d %T", localtime($modified_date)), + ); + + print $fh "$row\n"; + } + + close($fh); + + $self->logger->info("Done writing ".scalar(@sorted_targets)." entries.\n\n"); +} + + +sub generate_mapping_stats { + my $self = shift; + my $type = shift; + my $stats = shift; + + my $result = ucfirst($type)." mapping results:\n\n"; + + my $fmt1 = "%-10s%-10s%-10s%-10s\n"; + my $fmt2 = "%-10s%6.0f %6.0f %4.2f%%\n"; + + $result .= sprintf($fmt1, qw(TYPE MAPPED LOST PERCENTAGE)); + $result .= ('-'x40)."\n"; + + my $mapped_total = $stats->{'mapped_known'} + $stats->{'mapped_novel'}; + my $lost_total = $stats->{'lost_known'} + $stats->{'lost_novel'}; + my $known_total = $stats->{'mapped_known'} + $stats->{'lost_known'}; + my $novel_total = $stats->{'mapped_novel'} + $stats->{'lost_novel'}; + + # no split into known and novel for exons + unless ( $type eq 'exon' ) { + $result .= sprintf( $fmt2, + 'known', + $stats->{'mapped_known'}, + $stats->{'lost_known'}, + ($known_total ? $stats->{'mapped_known'}/$known_total*100 : 0) + ); + + $result .= sprintf( $fmt2, + 'novel', + $stats->{'mapped_novel'}, + $stats->{'lost_novel'}, + ($novel_total ? $stats->{'mapped_novel'}/$novel_total*100 : 0) + ); + } ## end unless ( $type eq 'exon' ) + + $result .= sprintf($fmt2, 'total', $mapped_total, $lost_total, + $mapped_total/($known_total + $novel_total)*100); + + # log result + $self->logger->info($result."\n"); + + # write result to file + my $fh = $self->get_filehandle("${type}_mapping_stats.txt", 'stats'); + print $fh $result; + close($fh); +} + + +sub dump_debug_mappings { + my $self = shift; + + foreach my $type (keys %debug_mappings) { + + $self->logger->debug("Writing $type mappings to debug/${type}_mappings.txt...\n"); + + my $fh = $self->get_filehandle("${type}_mappings.txt", 'debug'); + + foreach my $row (@{ $debug_mappings{$type} }) { + print $fh join("\t", @$row); + print $fh "\n"; + } + + close($fh); + + $self->logger->debug("Done.\n"); + } +} + + +sub write_stable_id_events { + my $self = shift; + my $event_type = shift; + + throw("Need an event type (new|similarity).") unless ($event_type); + + $self->logger->debug("Writing $event_type stable_id_events to file...\n"); + + my $fh = $self->get_filehandle("stable_id_event_${event_type}.txt", 'tables'); + my $i = 0; + + foreach my $event (@{ $self->get_all_stable_id_events($event_type) }) { + print $fh "$event\n"; + $i++; + } + + close($fh); + + $self->logger->debug("Done writing $i entries.\n"); +} + + +sub add_stable_id_event { + my ($self, $type, $event) = @_; + + # argument check + throw("Need an event type (new|similarity).") unless ($type); + + $self->{'stable_id_events'}->{$type}->{$event} = 1; +} + + +sub get_all_stable_id_events { + my ($self, $type) = @_; + + # argument check + throw("Need an event type (new|similarity).") unless ($type); + + return [ keys %{ $self->{'stable_id_events'}->{$type} } ]; +} + + +sub mapping_session_id { + my $self = shift; + $self->{'_mapping_session_id'} = shift if (@_); + return $self->{'_mapping_session_id'}; +} + + +sub mapping_session_date { + my $self = shift; + $self->{'_mapping_session_date'} = shift if (@_); + return $self->{'_mapping_session_date'}; +} + + +sub mapping_session_date_fmt { + my $self = shift; + $self->{'_mapping_session_date_fmt'} = shift if (@_); + return $self->{'_mapping_session_date_fmt'}; +} + + +sub stable_id_generator { + my $self = shift; + $self->{'_stable_id_generator'} = shift if (@_); + return $self->{'_stable_id_generator'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,561 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic +regions across the genome + +=head1 SYNOPSIS + + # build the SyntenyFramework from unambiguous gene mappings + my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'synteny_framework.ser', + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache, + ); + $sf->build_synteny($gene_mappings); + + # use it to rescore the genes + $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores); + +=head1 DESCRIPTION + +The SyntenyFramework is a set of SyntenyRegions. These are pairs of +locations very analoguous to the information in the assembly table (the +locations dont have to be the same length though). They are built from +genes that map uniquely between source and target. + +Once built, the SyntenyFramework is used to score source and target gene +pairs to determine whether they are similar. This process is slow (it +involves testing all gene pairs against all SyntenyRegions), this module +therefor has built-in support to run the process in parallel via LSF. + +=head1 METHODS + + new + build_synteny + _by_overlap + add_SyntenyRegion + get_all_SyntenyRegions + rescore_gene_matrix_lsf + rescore_gene_matrix + logger + conf + cache + +=cut + +package Bio::EnsEMBL::IdMapping::SyntenyFramework; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::Serialisable; +our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable); + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::SyntenyRegion; +use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; + +use FindBin qw($Bin); +FindBin->again; + + +=head2 new + + Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object + Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object + Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object + Arg [DUMP_PATH] : String - path for object serialisation + Arg [CACHE_FILE] : String - filename of serialised object + Example : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'synteny_framework.ser', + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache, + ); + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework + Exceptions : thrown on wrong or missing arguments + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_); + + unless ($logger and ref($logger) and + $logger->isa('Bio::EnsEMBL::Utils::Logger')) { + throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); + } + + unless ($conf and ref($conf) and + $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { + throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); + } + + unless ($cache and ref($cache) and + $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) { + throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object."); + } + + # initialise + $self->logger($logger); + $self->conf($conf); + $self->cache($cache); + $self->{'cache'} = []; + + return $self; +} + + +=head2 build_synteny + + Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings + to build the SyntenyFramework from + Example : $synteny_framework->build_synteny($gene_mappings); + Description : Builds the SyntenyFramework from unambiguous gene mappings. + SyntenyRegions are allowed to overlap. At most two overlapping + SyntenyRegions are merged (otherwise we'd get too large + SyntenyRegions with little information content). + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + +sub build_synteny { + my $self = shift; + my $mappings = shift; + + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + # create a synteny region for each mapping + my @synteny_regions = (); + + foreach my $entry (@{ $mappings->get_all_Entries }) { + + my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', + $entry->source); + my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', + $entry->target); + + my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ + $source_gene->start, + $source_gene->end, + $source_gene->strand, + $source_gene->seq_region_name, + $target_gene->start, + $target_gene->end, + $target_gene->strand, + $target_gene->seq_region_name, + $entry->score, + ]); + + push @synteny_regions, $sr; + } + + unless (@synteny_regions) { + $self->logger->warning("No synteny regions could be identified.\n"); + return; + } + + # sort synteny regions + #my @sorted = sort _by_overlap @synteny_regions; + my @sorted = reverse sort { + $a->source_seq_region_name cmp $b->source_seq_region_name || + $a->source_start <=> $b->source_start || + $a->source_end <=> $b->source_end } @synteny_regions; + + $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n"); + + # now create merged regions from overlapping syntenies, but only merge a + # maximum of 2 regions (otherwise you end up with large synteny blocks which + # won't contain much information in this context) + my $last_merged = 0; + my $last_sr = shift(@sorted); + + while (my $sr = shift(@sorted)) { + #$self->logger->debug("this ".$sr->to_string."\n"); + + my $merged_sr = $last_sr->merge($sr); + + if (! $merged_sr) { + unless ($last_merged) { + $self->add_SyntenyRegion($last_sr->stretch(2)); + #$self->logger->debug("nnn ".$last_sr->to_string."\n"); + } + $last_merged = 0; + } else { + $self->add_SyntenyRegion($merged_sr->stretch(2)); + #$self->logger->debug("mmm ".$merged_sr->to_string."\n"); + $last_merged = 1; + } + + $last_sr = $sr; + } + + # deal with last synteny region in @sorted + unless ($last_merged) { + $self->add_SyntenyRegion($last_sr->stretch(2)); + $last_merged = 0; + } + + #foreach my $sr (@{ $self->get_all_SyntenyRegions }) { + # $self->logger->debug("SRs ".$sr->to_string."\n"); + #} + + $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n"); + +} + + +# +# sort SyntenyRegions by overlap +# +sub _by_overlap { + # first sort by seq_region + my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name); + return $retval if ($retval); + + # then sort by overlap: + # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap + if ($a->source_end < $b->source_start) { return 1; } + if ($a->source_start < $b->source_end) { return -1; } + return 0; +} + + +=head2 add_SyntenyRegion + + Arg[1] : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add + Example : $synteny_framework->add_SyntenyRegion($synteny_region); + Description : Adds a SyntenyRegion to the framework. For speed reasons (and + since this is an internal method), no argument check is done. + Return type : none + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub add_SyntenyRegion { + push @{ $_[0]->{'cache'} }, $_[1]; +} + + +=head2 get_all_SyntenyRegions + + Example : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) { + # do something with the SyntenyRegion + } + Description : Get a list of all SyntenyRegions in the framework. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_SyntenyRegions { + return $_[0]->{'cache'}; +} + + +=head2 rescore_gene_matrix_lsf + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene + scores to rescore + Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores); + Description : This method runs rescore_gene_matrix() (via the + synteny_resocre.pl script) in parallel with lsf, then combines + the results to return a single rescored scoring matrix. + Parallelisation is done by chunking the scoring matrix into + several pieces (determined by the --synteny_rescore_jobs + configuration option). + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : thrown on wrong or missing argument + thrown on filesystem I/O error + thrown on failure of one or mor lsf jobs + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + +sub rescore_gene_matrix_lsf { + my $self = shift; + my $matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # serialise SyntenyFramework to disk + $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped'); + $self->write_to_file; + $self->logger->debug("Done.\n", 0, 'stamped'); + + # split the ScoredMappingMatrix into chunks and write to disk + my $matrix_size = $matrix->size; + $self->logger->debug("Scores before rescoring: $matrix_size.\n"); + + my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20; + $num_jobs++; + + my $dump_path = path_append($self->conf->param('basedir'), + 'matrix/synteny_rescore'); + + $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped'); + foreach my $i (1..$num_jobs) { + my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1; + my $end = int($matrix_size/($num_jobs-1)) * $i; + $self->logger->debug("$start-$end\n", 1); + my $sub_matrix = $matrix->sub_matrix($start, $end); + + $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser"); + $sub_matrix->dump_path($dump_path); + + $sub_matrix->write_to_file; + } + $self->logger->debug("Done.\n", 0, 'stamped'); + + # create an empty lsf log directory + my $logpath = path_append($self->logger->logpath, 'synteny_rescore'); + system("rm -rf $logpath") == 0 or + $self->logger->error("Unable to delete lsf log dir $logpath: $!\n"); + system("mkdir -p $logpath") == 0 or + $self->logger->error("Can't create lsf log dir $logpath: $!\n"); + + # build lsf command + my $lsf_name = 'idmapping_synteny_rescore_'.time; + + my $options = $self->conf->create_commandline_options( + logauto => 1, + logautobase => "synteny_rescore", + logpath => $logpath, + interactive => 0, + is_component => 1, + ); + + my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX}; + + my $bsub_cmd = + sprintf( "|bsub -J%s[1-%d] " + . "-o %s/synteny_rescore.%%I.out " + . "-e %s/synteny_rescore.%%I.err %s", + $lsf_name, $num_jobs, $logpath, $logpath, + $self->conf()->param('lsf_opt_synteny_rescore') ); + + # run lsf job array + $self->logger->info("Submitting $num_jobs jobs to lsf.\n"); + $self->logger->debug("$cmd\n\n"); + + local *BSUB; + open( BSUB, $bsub_cmd ) + or $self->logger->error("Could not open open pipe to bsub: $!\n"); + + print BSUB $cmd; + $self->logger->error("Error submitting synteny rescoring jobs: $!\n") + unless ($? == 0); + close BSUB; + + # submit dependent job to monitor finishing of jobs + $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped'); + + my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . + qq{-o $logpath/synteny_rescore_depend.out /bin/true}; + + system($dependent_job) == 0 or + $self->logger->error("Error submitting dependent job: $!\n"); + + $self->logger->info("All jobs finished.\n", 0, 'stamped'); + + # check for lsf errors + sleep(5); + my $err; + foreach my $i (1..$num_jobs) { + $err++ unless (-e "$logpath/synteny_rescore.$i.success"); + } + + if ($err) { + $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n"); + } + + # merge and return matrix + $self->logger->debug("Merging rescored matrices...\n"); + $matrix->flush; + + foreach my $i (1..$num_jobs) { + # read partial matrix created by lsf job from file + my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => "gene_matrix_synteny$i.ser", + ); + $sub_matrix->read_from_file; + + # merge with main matrix + $matrix->merge($sub_matrix); + } + + $self->logger->debug("Done.\n"); + $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n"); + + return $matrix; +} + + +# +# +=head2 rescore_gene_matrix + + Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene + scores to rescore + Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores); + Description : Rescores a gene matrix. Retains 70% of old score and builds + other 30% from the synteny match. + Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix + Exceptions : thrown on wrong or missing argument + Caller : InternalIdMapper plugins + Status : At Risk + : under development + +=cut + +sub rescore_gene_matrix { + my $self = shift; + my $matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $retain_factor = 0.7; + + foreach my $entry (@{ $matrix->get_all_Entries }) { + my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', + $entry->source); + + my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', + $entry->target); + + my $highest_score = 0; + + foreach my $sr (@{ $self->get_all_SyntenyRegions }) { + my $score = $sr->score_location_relationship($source_gene, $target_gene); + $highest_score = $score if ($score > $highest_score); + } + + #$self->logger->debug("highscore ".$entry->to_string." ". + # sprintf("%.6f\n", $highest_score)); + + $matrix->set_score($entry->source, $entry->target, + ($entry->score * 0.7 + $highest_score * 0.3)); + } + + return $matrix; +} + + +=head2 logger + + Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set + Example : $object->logger->info("Starting ID mapping.\n"); + Description : Getter/setter for logger object + Return type : Bio::EnsEMBL::Utils::Logger + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub logger { + my $self = shift; + $self->{'_logger'} = shift if (@_); + return $self->{'_logger'}; +} + + +=head2 conf + + Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration + to set + Example : my $basedir = $object->conf->param('basedir'); + Description : Getter/setter for configuration object + Return type : Bio::EnsEMBL::Utils::ConfParser + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub conf { + my $self = shift; + $self->{'_conf'} = shift if (@_); + return $self->{'_conf'}; +} + + +=head2 cache + + Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set + Example : $object->cache->read_from_file('source'); + Description : Getter/setter for cache object + Return type : Bio::EnsEMBL::IdMapping::Cache + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub cache { + my $self = shift; + $self->{'_cache'} = shift if (@_); + return $self->{'_cache'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,528 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions + +=head1 SYNOPSIS + + # create a new SyntenyRegion from a source and a target gene + my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [ + $source_gene->start, $source_gene->end, + $source_gene->strand, $source_gene->seq_region_name, + $target_gene->start, $target_gene->end, + $target_gene->strand, $target_gene->seq_region_name, + $entry->score, + ] ); + + # merge with another SyntenyRegion + my $merged_sr = $sr->merge($sr1); + + # score a gene pair against this SyntenyRegion + my $score = + $sr->score_location_relationship( $source_gene1, $target_gene1 ); + +=head1 DESCRIPTION + +This object represents a synteny between a source and a target location. +SyntenyRegions are built from mapped genes, and the their score is +defined as the score of the gene mapping. For merged SyntenyRegions, +scores are combined. + +=head1 METHODS + + new_fast + source_start + source_end + source_strand + source_seq_region_name + target_start + target_end + target_strand + target_seq_region_name + score + merge + stretch + score_location_relationship + to_string + +=cut + +package Bio::EnsEMBL::IdMapping::SyntenyRegion; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the + SyntenyRegion object + Example : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ + ]); + Description : Constructor. On instantiation, source and target regions are + reverse complemented so that source is always on forward strand. + Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub new_fast { + my $class = shift; + my $array_ref = shift; + + # reverse complement source and target so that source is always on forward + # strand; this will make merging and other comparison operations easier + # at later stages + if ($array_ref->[2] == -1) { + $array_ref->[2] = 1; + $array_ref->[6] = -1 * $array_ref->[6]; + } + + return bless $array_ref, $class; +} + + +=head2 source_start + + Arg[1] : (optional) Int - source location start coordinate + Description : Getter/setter for source location start coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_start { + my $self = shift; + $self->[0] = shift if (@_); + return $self->[0]; +} + + +=head2 source_end + + Arg[1] : (optional) Int - source location end coordinate + Description : Getter/setter for source location end coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_end { + my $self = shift; + $self->[1] = shift if (@_); + return $self->[1]; +} + + +=head2 source_strand + + Arg[1] : (optional) Int - source location strand + Description : Getter/setter for source location strand. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_strand { + my $self = shift; + $self->[2] = shift if (@_); + return $self->[2]; +} + + +=head2 source_seq_region_name + + Arg[1] : (optional) String - source location seq_region name + Description : Getter/setter for source location seq_region name. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_seq_region_name { + my $self = shift; + $self->[3] = shift if (@_); + return $self->[3]; +} + + +=head2 target_start + + Arg[1] : (optional) Int - target location start coordinate + Description : Getter/setter for target location start coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_start { + my $self = shift; + $self->[4] = shift if (@_); + return $self->[4]; +} + + +=head2 target_end + + Arg[1] : (optional) Int - target location end coordinate + Description : Getter/setter for target location end coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_end { + my $self = shift; + $self->[5] = shift if (@_); + return $self->[5]; +} + + +=head2 target_strand + + Arg[1] : (optional) Int - target location strand + Description : Getter/setter for target location strand. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_strand { + my $self = shift; + $self->[6] = shift if (@_); + return $self->[6]; +} + + +=head2 target_seq_region_name + + Arg[1] : (optional) String - target location seq_region name + Description : Getter/setter for target location seq_region name. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_seq_region_name { + my $self = shift; + $self->[7] = shift if (@_); + return $self->[7]; +} + + +=head2 score + + Arg[1] : (optional) Float - score + Description : Getter/setter for the score between source and target location. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub score { + my $self = shift; + $self->[8] = shift if (@_); + return $self->[8]; +} + + +=head2 merge + + Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another + SyntenyRegion + Example : $merged_sr = $sr->merge($other_sr); + Description : Merges two overlapping SyntenyRegions if they meet certain + criteria (see documentation in the code for details). Score is + calculated as a combined distance score. If the two + SyntenyRegions aren't mergeable, this method returns undef. + Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef + Exceptions : warns on bad scores + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub merge { + my ($self, $sr) = @_; + + # must be on same seq_region + if ($self->source_seq_region_name ne $sr->source_seq_region_name or + $self->target_seq_region_name ne $sr->target_seq_region_name) { + return 0; + } + + # target must be on same strand + return 0 unless ($self->target_strand == $sr->target_strand); + + # find the distance of source and target pair and compare + my $source_dist = $sr->source_start - $self->source_start; + my $target_dist; + if ($self->target_strand == 1) { + $target_dist = $sr->target_start - $self->target_start; + } else { + $target_dist = $self->target_end - $sr->target_end; + } + + # prevent division by zero error + if ($source_dist == 0 or $target_dist == 0) { + warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n"); + return 0; + } + + # calculate a distance score + my $dist = $source_dist - $target_dist; + $dist = -$dist if ($dist < 0); + my $d1 = $dist/$source_dist; + $d1 = -$d1 if ($d1 < 0); + my $d2 = $dist/$target_dist; + $d2 = -$d2 if ($d2 < 0); + my $dist_score = 1 - $d1 - $d2; + + # distance score must be more than 50% + return 0 if ($dist_score < 0.5); + + my $new_score = $dist_score * ($sr->score + $self->score)/2; + + if ($new_score > 1) { + warn("WARNING: Bad merge score: $new_score\n"); + } + + # extend SyntenyRegion to cover both sources and targets, set merged score + # and return + if ($sr->source_start < $self->source_start) { + $self->source_start($sr->source_start); + } + if ($sr->source_end > $self->source_end) { + $self->source_end($sr->source_end); + } + + if ($sr->target_start < $self->target_start) { + $self->target_start($sr->target_start); + } + if ($sr->target_end > $self->target_end) { + $self->target_end($sr->target_end); + } + + $self->score($new_score); + + return $self; +} + + +=head2 stretch + + Arg[1] : Float $factor - stretching factor + Example : $stretched_sr = $sr->stretch(2); + Description : Extends this SyntenyRegion to span a $factor * $score more area. + Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub stretch { + my ($self, $factor) = @_; + + my $source_adjust = int(($self->source_end - $self->source_start + 1) * + $factor * $self->score); + $self->source_start($self->source_start - $source_adjust); + $self->source_end($self->source_end + $source_adjust); + #warn sprintf(" sss %d %d %d\n", $source_adjust, $self->source_start, + # $self->source_end); + + my $target_adjust = int(($self->target_end - $self->target_start + 1) * + $factor * $self->score); + $self->target_start($self->target_start - $target_adjust); + $self->target_end($self->target_end + $target_adjust); + + return $self; +} + + +=head2 score_location_relationship + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene + Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene + Example : my $score = $sr->score_location_relationship($source_gene, + $target_gene); + Description : This function calculates how well the given source location + interpolates on given target location inside this SyntenyRegion. + + Scoring is done the following way: Source and target location + are normalized with respect to this Regions source and target. + Source range will then be somewhere close to 0.0-1.0 and target + range anything around that. + + The extend of the covered area between source and target range + is a measurement of how well they agree (smaller extend is + better). The extend (actually 2*extend) is reduced by the size + of the regions. This will result in 0.0 if they overlap + perfectly and bigger values if they dont. + + This is substracted from 1.0 to give the score. The score is + likely to be below zero, but is cut off at 0.0f. + + Finally, the score is multiplied with the score of the synteny + itself. + Return type : Float + Exceptions : warns if score out of range + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + + + +sub score_location_relationship { + my ($self, $source_gene, $target_gene) = @_; + + # must be on same seq_region + if (($self->source_seq_region_name ne $source_gene->seq_region_name) or + ($self->target_seq_region_name ne $target_gene->seq_region_name)) { + return 0; + } + + # strand relationship must be the same (use logical XOR to find out) + if (($self->source_strand == $source_gene->strand) xor + ($self->target_strand == $target_gene->strand)) { + return 0; + } + + # normalise source location + my $source_rel_start = ($source_gene->start - $self->source_start) / + ($self->source_end - $self->source_start + 1); + + my $source_rel_end = ($source_gene->end - $self->source_start + 1) / + ($self->source_end - $self->source_start + 1); + + #warn " aaa ".$self->to_string."\n"; + #warn sprintf(" bbb %.6f %.6f\n", $source_rel_start, $source_rel_end); + + # cut off if the source location is completely outside + return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1); + + # normalise target location + my ($target_rel_start, $target_rel_end); + my $t_length = $self->target_end - $self->target_start + 1; + + if ($self->target_strand == 1) { + + $target_rel_start = ($target_gene->start - $self->target_start) / $t_length; + + $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length; + + } else { + $target_rel_start = ($self->target_end - $target_gene->end) / $t_length; + $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length; + } + + my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end : + $source_rel_end) - + (($target_rel_start < $source_rel_start) ? $target_rel_start : + $source_rel_start); + + my $score = $self->score * (1 - (2 * $added_range - $target_rel_end - + $source_rel_end + $target_rel_start + $source_rel_start)); + + #warn " ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range, + # $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end); + + $score = 0 if ($score < 0); + + # sanity check + if ($score > 1) { + warn "Out of range score ($score) for ".$source_gene->id.":". + $target_gene->id."\n"; + } + + return $score; +} + + +=head2 to_string + + Example : print LOG $sr->to_string, "\n"; + Description : Returns a string representation of the SyntenyRegion object. + Useful for debugging and logging. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub to_string { + my $self = shift; + return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f", + $self->source_seq_region_name, + $self->source_start, + $self->source_end, + $self->source_strand, + $self->target_seq_region_name, + $self->target_start, + $self->target_end, + $self->target_strand, + $self->score + ); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyExon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyExon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,466 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::TinyExon - lightweight exon object + +=head1 SYNOPSIS + + # fetch an exon from the db and create a lightweight exon object + # from it + my $exon = $exon_adaptor->fetch_by_stable_id('ENSE000345437'); + my $lightweight_exon = Bio::EnsEMBL::IdMapping::TinyExon->new_fast( [ + $exon->dbID, + $exon->stable_id, + $exon->version, + $exon->created_date, + $exon->modified_date, + $exon->start, + $exon->end, + $exon->strand, + $exon->slice->seq_region_name, + $exon->slice->coord_system_name, + $exon->slice->coord_system->version, + $exon->slice->subseq( $exon->start, $exon->end, $exon->strand ), + $exon->phase, + $need_project, + ] ); + +=head1 DESCRIPTION + +This is a lightweight exon object for the stable Id mapping. See the +documentation in TinyFeature for general considerations about its +design. + +=head1 METHODS + + start + end + strand + seq_region_name + coord_system_name + coord_system_version + seq + phase + need_project + common_start + common_end + common_strand + common_sr_name + length + is_known + +=cut + + +package Bio::EnsEMBL::IdMapping::TinyExon; + +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 start +# 6 end +# 7 strand +# 8 seq_region_name +# 9 coord_system_name +# 10 coord_system_version +# 11 seq +# 12 phase +# 13 need_project +# 14 common_start +# 15 common_end +# 16 common_strand +# 17 common_sr_name + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::TinyFeature; +our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 start + + Arg[1] : (optional) Int - the exon's start coordinate + Description : Getter/setter for the exon's start coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub start { + my $self = shift; + $self->[5] = shift if (@_); + return $self->[5]; +} + + +=head2 end + + Arg[1] : (optional) Int - the exon's end coordinate + Description : Getter/setter for the exon's end coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub end { + my $self = shift; + $self->[6] = shift if (@_); + return $self->[6]; +} + + +=head2 strand + + Arg[1] : (optional) Int - the exon's strand + Description : Getter/setter for the exon's strand. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub strand { + my $self = shift; + $self->[7] = shift if (@_); + return $self->[7]; +} + + +=head2 seq_region_name + + Arg[1] : (optional) String - seq_region name + Description : Getter/setter for the seq_region name of the slice the exon is + on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq_region_name { + my $self = shift; + $self->[8] = shift if (@_); + return $self->[8]; +} + + +=head2 coord_system_name + + Arg[1] : (optional) String - coord_system name + Description : Getter/setter for the coord_system name of the slice the exon is + on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub coord_system_name { + my $self = shift; + $self->[9] = shift if (@_); + return $self->[9]; +} + + +=head2 coord_system_version + + Arg[1] : (optional) String - coord_system version + Description : Getter/setter for the coord_system version of the slice the + exon is on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub coord_system_version { + my $self = shift; + $self->[10] = shift if (@_); + return $self->[10]; +} + + +=head2 seq + + Arg[1] : (optional) String - the exon's sequence + Description : Getter/setter for the exon's sequence. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq { + my $self = shift; + $self->[11] = shift if (@_); + return $self->[11]; +} + + +=head2 phase + + Arg[1] : (optional) Int - the exon's phase + Description : Getter/setter for the exon's phase. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub phase { + my $self = shift; + $self->[12] = shift if (@_); + return $self->[12]; +} + + +=head2 need_project + + Arg[1] : (optional) Boolean - attribute to set + Description : Getter/setter for the attribute determining whether an exon + needs to be projected onto a common coord_system. You don't need + to do so if the native coord_system is common to the source and + target assemblies, or if no common coord_system is found (the + Cache object has methods to determine this). + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub need_project { + my $self = shift; + $self->[13] = shift if (@_); + return $self->[13]; +} + + +=head2 common_start + + Arg[1] : (optional) Int - the exon's start in common coord_system + coordinates + Description : Getter/setter for the exon's start in common coord_system + coordinates. Will return $self->start if no projection to a + common coord_system is required. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub common_start { + my $self = shift; + + # when used as a setter, always set a value + $self->[14] = shift if (@_); + + # when used as a getter + if (scalar(@$self) > 14) { + # return value for common coord_system if available (but avoid + # autovivification gotcha!) + return $self->[14]; + } elsif ($self->need_project) { + # return undef if common value expected but not there (e.g. no projection + # found + return undef; + } else { + # return native value + return $self->start; + } +} + + +=head2 common_end + + Arg[1] : (optional) Int - the exon's end in common coord_system + coordinates + Description : Getter/setter for the exon's end in common coord_system + coordinates. Will return $self->end if no projection to a + common coord_system is required. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub common_end { + my $self = shift; + + # when used as a setter, always set a value + $self->[15] = shift if (@_); + + # when used as a getter + if (scalar(@$self) > 14) { + # return value for common coord_system if available (but avoid + # autovivification gotcha!) + return $self->[15]; + } elsif ($self->need_project) { + # return undef if common value expected but not there (e.g. no projection + # found + return undef; + } else { + # return native value + return $self->end; + } +} + + +=head2 common_strand + + Arg[1] : (optional) Int - the exon's strand in common coord_system + coordinates + Description : Getter/setter for the exon's strand in common coord_system + coordinates. Will return $self->strand if no projection to a + common coord_system is required. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub common_strand { + my $self = shift; + + # when used as a setter, always set a value + $self->[16] = shift if (@_); + + # when used as a getter + if (scalar(@$self) > 14) { + # return value for common coord_system if available (but avoid + # autovivification gotcha!) + return $self->[16]; + } elsif ($self->need_project) { + # return undef if common value expected but not there (e.g. no projection + # found + return undef; + } else { + # return native value + return $self->strand; + } +} + + +=head2 common_sr_name + + Arg[1] : (optional) String - seq_region name of the exon's slice on the + common coord_system + Description : Getter/setter for the seq_region name of the exon's slice on the + common coord_system coordinates. Will return + $self->seq_region_name if no projection to a common coord_system + is required. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub common_sr_name { + my $self = shift; + + # when used as a setter, always set a value + $self->[17] = shift if (@_); + + # when used as a getter + if (scalar(@$self) > 14) { + # return value for common coord_system if available (but avoid + # autovivification gotcha!) + return $self->[17]; + } elsif ($self->need_project) { + # return undef if common value expected but not there (e.g. no projection + # found + return undef; + } else { + # return native value + return $self->seq_region_name; + } +} + + +=head2 length + + Description : Returns the exon length (distance between start and end). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub length { + my $self = shift; + return ($self->end - $self->start + 1); +} + + +=head2 is_known + + Description : Determine whether an exon is known. In the context of stable Id + mapping, this is true for all exons. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_known { + return 1; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,213 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::TinyFeature - lightweight feature object + +=head1 SYNOPSIS + +This object isn't instantiated. See objects which inherit from it +(TinyGene, TinyTranscript, etc.) for examples. + +=head1 DESCRIPTION + +This is the base class for the lightweight feature objects used by the +stable Id maping application. For performance reasons, these objects +are instantiated using a new_fast() method. The internal implementation +is an arrayref (rather than the more common hashref), which optimises +memory usage. + +There are no adaptors to fetch TinyFeatures from the database. You +rather use the normal feature adaptors and then create the TinyFeatures +from the heavy objects you get. The memory saving will therefore mainly +take effect when serialising and reloading these objects. + +Also note that TinyFeatures don't have a slice attached to them - all +location information (where required) is stored on the feature object +directly. + +=head1 METHODS + + new_fast + id + stable_id + version + created_date + modified_date + to_string + +=cut + +package Bio::EnsEMBL::IdMapping::TinyFeature; + +# internal data structure (array indices): +# +# 0 dbID +# 1 stable_id +# 2 version +# 3 created_date +# 4 modified_date +# +# other instance variables differ by subclass implementation, so look there. + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the new object + Description : Constructor. + Return type : Bio::EnsEMBL::IdMapping::TinyFeature implementing class + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + +sub new_fast { + my $class = shift; + my $array_ref = shift; + return bless $array_ref, $class; +} + + +=head2 id + + Arg[1] : (optional) Int - the feature's internal Id ("dbID") + Description : Getter/setter for the feature's internal Id. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + +sub id { + my $self = shift; + $self->[0] = shift if (@_); + return $self->[0]; +} + + +=head2 stable_id + + Arg[1] : (optional) String - the feature's stable Id + Description : Getter/setter for the feature's stable Id. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + +sub stable_id { + my $self = shift; + $self->[1] = shift if (@_); + return $self->[1]; +} + + +=head2 version + + Arg[1] : (optional) Int - the feature's stable Id version + Description : Getter/setter for the feature's stable Id version. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + +sub version { + my $self = shift; + $self->[2] = shift if (@_); + return $self->[2]; +} + + +=head2 created_date + + Arg[1] : (optional) String - the feature's stable Id creation date + Description : Getter/setter for the feature's stable Id creation date. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + +sub created_date { + my $self = shift; + $self->[3] = shift if (@_); + return $self->[3]; +} + + +=head2 modified_date + + Arg[1] : (optional) String - the feature's stable Id modification date + Description : Getter/setter for the feature's stable Id modification date. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::Cache + Status : At Risk + : under development + +=cut + +sub modified_date { + my $self = shift; + $self->[4] = shift if (@_); + return $self->[4]; +} + + +=head2 to_string + + Example : print LOG "Created ", $f->to_string, "\n"; + Description : Prints a string representation of the feature for debug + purposes. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub to_string { + my $self = shift; + return $self->id.':'.$self->stable_id.'.'.$self->version; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyGene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyGene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,303 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::TinyGene - lightweight gene object + +=head1 SYNOPSIS + + # fetch a gene from the db and create a lightweight gene object from it + my $gene = $gene_adaptor->fetch_by_stable_id('ENSG000345437'); + my $lightweight_gene = Bio::EnsEMBL::IdMapping::TinyGene->new_fast( [ + $gene->dbID, $gene->stable_id, + $gene->version, $gene->created_date, + $gene->modified_date, $gene->start, + $gene->end, $gene->strand, + $gene->slice->seq_region_name, $gene->biotype, + $gene->status, $gene->analysis->logic_name, + ( $gene->is_known ? 1 : 0 ), + ] ); + +=head1 DESCRIPTION + +This is a lightweight gene object for the stable Id mapping. See the +documentation in TinyFeature for general considerations about its +design. + +=head1 METHODS + + start + end + strand + seq_region_name + biotype + status + logic_name + is_known + add_Transcript + get_all_Transcripts + length + +=cut + +package Bio::EnsEMBL::IdMapping::TinyGene; + +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 start +# 6 end +# 7 strand +# 8 seq_region_name +# 9 biotype +# 10 status +# 11 logic_name +# 12 is_known +# 13 [transcripts] + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::TinyFeature; +our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 start + + Arg[1] : (optional) Int - the gene's start coordinate + Description : Getter/setter for the gene's start coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub start { + my $self = shift; + $self->[5] = shift if (@_); + return $self->[5]; +} + + +=head2 end + + Arg[1] : (optional) Int - the gene's end coordinate + Description : Getter/setter for the gene's end coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub end { + my $self = shift; + $self->[6] = shift if (@_); + return $self->[6]; +} + + +=head2 strand + + Arg[1] : (optional) Int - the gene's strand + Description : Getter/setter for the gene's strand. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub strand { + my $self = shift; + $self->[7] = shift if (@_); + return $self->[7]; +} + + +=head2 seq_region_name + + Arg[1] : (optional) String - seq_region name + Description : Getter/setter for the seq_region name of the slice the gene is + on. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq_region_name { + my $self = shift; + $self->[8] = shift if (@_); + return $self->[8]; +} + + +=head2 biotype + + Arg[1] : (optional) String - the gene's biotype + Description : Getter/setter for the gene's biotype. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub biotype { + my $self = shift; + $self->[9] = shift if (@_); + return $self->[9]; +} + + +=head2 strand + + Arg[1] : (optional) String - the gene's status + Description : Getter/setter for the gene's status. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub status { + my $self = shift; + $self->[10] = shift if (@_); + return $self->[10]; +} + + +=head2 logic_name + + Arg[1] : (optional) String - the gene's analysis' logic_name + Description : Getter/setter for the gene's analysis' logic_name. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub logic_name { + my $self = shift; + $self->[11] = shift if (@_); + return $self->[11]; +} + + +=head2 is_known + + Arg[1] : (optional) Boolean - the gene's "known" status + Description : Getter/setter for the gene's "known" status. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_known { + my $self = shift; + $self->[12] = shift if (@_); + return $self->[12]; +} + + +=head2 add_Transcript + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - the transcript to + add + Example : $tiny_gene->add_Transcript($tiny_transcript); + Description : Adds a transcript to a gene. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub add_Transcript { + my $self = shift; + my $tr = shift; + + unless ($tr && $tr->isa('Bio::EnsEMBL::IdMapping::TinyTranscript')) { + throw('Need a Bio::EnsEMBL::IdMapping::TinyTranscript.'); + } + + push @{ $self->[13] }, $tr; +} + + +=head2 get_all_Transcripts + + Example : foreach my $tr (@{ $tiny_gene->get_all_Transcripts }) { + # do something with transcript + } + Description : Returns all transcripts attached to that gene. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::TinyTranscript objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_Transcripts { + return $_[0]->[13] || []; +} + + +=head2 length + + Description : Returns the gene length (distance between start and end). + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub length { + my $self = shift; + return ($self->end - $self->start + 1); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyTranscript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyTranscript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,299 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::TinyTranscript - lightweight transcript object + +=head1 SYNOPSIS + + # fetch a transcript from the db and create a lightweight + # transcript object from it + my $tr = $transcript_adaptor->fetch_by_stable_id('ENST000345437'); + my $lightweight_tr = + Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast( [ + $tr->dbID, $tr->stable_id, + $tr->version, $tr->created_date, + $tr->modified_date, $tr->start, + $tr->end, $tr->strand, + $tr->length, md5_hex( $tr->spliced_seq ), + ( $tr->is_known ? 1 : 0 ), + ] ); + +=head1 DESCRIPTION + +This is a lightweight transcript object for the stable Id mapping. See +the documentation in TinyFeature for general considerations about its +design. + +=head1 METHODS + + start + end + strand + length + seq_md5_sum + is_known + add_Translation + translation + add_Exon + get_all_Exons + +=cut + +package Bio::EnsEMBL::IdMapping::TinyTranscript; + +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 start +# 6 end +# 7 strand +# 8 length +# 9 seq_md5_sum +# 10 is_known +# 11 translation +# 12 [exons] +# 13 biotype + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::TinyFeature; +our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 start + + Arg[1] : (optional) Int - the transcript's start coordinate + Description : Getter/setter for the transcript's start coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub start { + my $self = shift; + $self->[5] = shift if (@_); + return $self->[5]; +} + + +=head2 end + + Arg[1] : (optional) Int - the transcript's end coordinate + Description : Getter/setter for the transcript's end coordinate. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub end { + my $self = shift; + $self->[6] = shift if (@_); + return $self->[6]; +} + + +=head2 strand + + Arg[1] : (optional) Int - the transcript's strand + Description : Getter/setter for the transcript's strand. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub strand { + my $self = shift; + $self->[7] = shift if (@_); + return $self->[7]; +} + + +=head2 length + + Arg[1] : (optional) Int - the transcript's length + Description : Getter/setter for the transcript's length. Note that this is + *not* the distance between start and end, but rather the sum of + the lengths of all exons. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub length { + my $self = shift; + $self->[8] = shift if (@_); + return $self->[8]; +} + + +=head2 seq_md5_sum + + Arg[1] : (optional) String - the md5 digest of the transcript's sequence + Description : Getter/setter for the md5 digest of the transcript's sequence. + Note that when used as a setter, you are expected to pass a + digest, not the raw sequence (i.e. the digest is not created for + you). + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq_md5_sum { + my $self = shift; + $self->[9] = shift if (@_); + return $self->[9]; +} + + +=head2 is_known + + Arg[1] : (optional) Boolean - the transcript's "known" status + Description : Getter/setter for the transcript's "known" status. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_known { + my $self = shift; + $self->[10] = shift if (@_); + return $self->[10]; +} + + +=head2 add_Translation + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - the translation + to add + Example : $tiny_transcript->add_Translation($tiny_translation); + Description : Adds a translation to this transcript. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub add_Translation { + my $self = shift; + my $tl = shift; + + unless ($tl && $tl->isa('Bio::EnsEMBL::IdMapping::TinyTranslation')) { + throw('Need a Bio::EnsEMBL::IdMapping::TinyTranslation.'); + } + + $self->[11] = $tl; +} + + +=head2 translation + + Description : Getter for the transcript's translation. + Return type : Bio::EnsEMBL::IdMapping::TinyTranslation + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub translation { + return $_[0]->[11]; +} + + +=head2 add_Exon + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyExon $exon - the exon to add + Example : $tiny_transcript->add_Exon($tiny_exon); + Description : Adds an exon to this transcript. + Return type : none + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub add_Exon { + my $self = shift; + my $exon = shift; + + unless ($exon && $exon->isa('Bio::EnsEMBL::IdMapping::TinyExon')) { + throw('Need a Bio::EnsEMBL::IdMapping::TinyExon.'); + } + + push @{ $self->[12] }, $exon; +} + + +=head2 get_all_Exons + + Example : foreach my $exon (@{ $tiny_transcript->get_all_Exons }) { + # do something with exon + } + Description : Returns all exons attached to that transcript. + Return type : Arrayref of Bio::EnsEMBL::IdMapping::TinyExon objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_Exons { + return $_[0]->[12] || []; +} + +sub biotype { + my $self = shift; + $self->[13] = shift if (@_); + return $self->[13]; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyTranslation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/TinyTranslation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,130 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::TinyTranslation - lightweight translation object + +=head1 SYNOPSIS + + if ( my $tl = $tr->translation ) { + my $lightweight_tl = + Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast( [ + $tl->dbID, $tl->stable_id, + $tl->version, $tl->created_date, + $tl->modified_date, $tr->dbID, + $tr->translate->seq, ( $tr->is_known ? 1 : 0 ), + ] ); + } + +=head1 DESCRIPTION + +This is a lightweight translation object for the stable Id mapping. See +the documentation in TinyFeature for general considerations about its +design. + +=head1 METHODS + + transcript_id + seq + is_known + +=cut + +package Bio::EnsEMBL::IdMapping::TinyTranslation; + +# internal data structure (array indices): +# +# 0-4 see TinyFeature +# 5 transcript_id +# 6 seq +# 7 is_known + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::TinyFeature; +our @ISA = qw(Bio::EnsEMBL::IdMapping::TinyFeature); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 transcript_id + + Arg[1] : (optional) Int - the transcript internal Id ("dbID") + Description : Getter/setter for the transcript internal Id this translation is + attached to. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub transcript_id { + my $self = shift; + $self->[5] = shift if (@_); + return $self->[5]; +} + + +=head2 seq + + Arg[1] : (optional) String - the translation's sequence + Description : Getter/setter for the translation's sequence. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq { + my $self = shift; + $self->[6] = shift if (@_); + return $self->[6]; +} + + +=head2 is_known + + Arg[1] : (optional) Boolean - the translation's "known" status + Description : Getter/setter for the translation's "known" status. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_known { + my $self = shift; + $self->[7] = shift if (@_); + return $self->[7]; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdMapping/TranscriptScoreBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/TranscriptScoreBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,498 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + +Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from +Java application. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::ScoreBuilder; +our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; + + +sub score_transcripts { + my $self = shift; + my $exon_matrix = shift; + + unless ($exon_matrix and + $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + $self->logger->info("-- Scoring transcripts...\n\n", 0, 'stamped'); + + # build scores based on exon scores + my $matrix = $self->scores_from_exon_scores($exon_matrix); + + # debug logging + if ($self->logger->loglevel eq 'debug') { + $matrix->log('transcript', $self->conf->param('basedir')); + } + + # log stats of combined matrix + my $fmt = "%-40s%10.0f\n"; + + $self->logger->info("Scoring matrix:\n"); + + $self->logger->info(sprintf($fmt, "Total source transcripts:", + $self->cache->get_count_by_name('transcripts_by_id', 'source')), 1); + + $self->logger->info(sprintf($fmt, "Total target transcripts:", + $self->cache->get_count_by_name('transcripts_by_id', 'target')), 1); + + $self->log_matrix_stats($matrix); + + $self->logger->info("\nDone with transcript scoring.\n\n"); + + return $matrix; +} + + +sub scores_from_exon_scores { + my $self = shift; + my $exon_matrix = shift; + + unless ($exon_matrix and + $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $dump_path = path_append($self->conf->param('basedir'), 'matrix'); + + my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'transcript_matrix.ser', + ); + + my $transcript_cache = $matrix->cache_file; + + if (-s $transcript_cache) { + + # read from file + $self->logger->info("Reading transcript scoring matrix from file...\n", 0, 'stamped'); + $self->logger->debug("Cache file $transcript_cache.\n", 1); + $matrix->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # build scoring matrix + $self->logger->info("No transcript scoring matrix found. Will build new one.\n"); + + $self->logger->info("Transcript scoring...\n", 0, 'stamped'); + $matrix = $self->build_scores($matrix, $exon_matrix); + $self->logger->info("Done.\n\n", 0, 'stamped'); + + # write scoring matrix to file + $matrix->write_to_file; + + } + + return $matrix; +} + + +sub build_scores { + my $self = shift; + my $matrix = shift; + my $exon_matrix = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($exon_matrix and + $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # first find out which source and target transcripts have scoring exons and + # build a "flag" matrix for these transcripts (all scores are 1) + $self->flag_matrix_from_exon_scores($matrix, $exon_matrix); + + # now calculate the actual scores for the transcripts in the flag matrix + my $final_matrix = + $self->score_matrix_from_flag_matrix($matrix, $exon_matrix); + + return $final_matrix; +} + + +sub flag_matrix_from_exon_scores { + my $self = shift; + my $matrix = shift; + my $exon_matrix = shift; + + # initialise progress logger + my $i; + my $num_transcripts = + scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') }); + my $progress_id = $self->logger->init_progress($num_transcripts, 100); + + $self->logger->info("Creating flag matrix...\n", 1); + + # loop over source transcripts + foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) { + + # log progress + $self->logger->log_progress($progress_id, ++$i, 1); + + # get all exons for the source transcript + foreach my $source_exon (@{ $source_transcript->get_all_Exons }) { + + # get target exons for this source exon from scoring matrix + foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) { + + # get target transcripts that contain this exon + foreach my $target_transcript (@{ $self->cache->get_by_key('transcripts_by_exon_id', 'target', $target_exon_id) }) { + + # add scoring flag for these two transcripts + $matrix->add_score($source_transcript->id, $target_transcript->id, 1); + + } + } + } + } + + $self->logger->info("\n"); + + return $matrix; +} + + +sub score_matrix_from_flag_matrix { + my $self = shift; + my $flag_matrix = shift; + my $exon_matrix = shift; + + unless ($flag_matrix and + $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($exon_matrix and + $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $transcript_score_threshold = + $self->conf->param('transcript_score_threshold') || 0; + + # create a new scoring matrix which will replace the flag matrix + my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( + -DUMP_PATH => $flag_matrix->dump_path, + -CACHE_FILE => $flag_matrix->cache_file_name, + ); + + # initialise progress logger + my $i; + my $num_transcripts = + scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') }); + my $progress_id = $self->logger->init_progress($num_transcripts, 100); + + $self->logger->info("Creating score matrix from flag matrix...\n", 1); + + # debug + my $fmt_d1 = "%-14s%-15s%-14s%-14s%-14s\n"; + my $fmt_d2 = "%.6f"; + + # loop over source transcripts + foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) { + + # log progress + $self->logger->log_progress($progress_id, ++$i, 1); + + # We are only interested in scoring with exons that are in the target + # transcript. The ScoredMappingMatrix may contain scores for exons that + # aren't in this transcript so create a hash of the target transcript's + # exons + my %source_exons = map { $_->id => 1 } + @{ $source_transcript->get_all_Exons }; + + my $source_transcript_length = $source_transcript->length; + + # get all corresponding target transcripts from the flag matrix + foreach my $target_transcript_id (@{ $flag_matrix->get_targets_for_source($source_transcript->id) }) { + + my $target_transcript = $self->cache->get_by_key('transcripts_by_id', 'target', $target_transcript_id); + + my $source_transcript_score = 0; + my $target_transcript_score = 0; + my $target_transcript_length = $target_transcript->length; + + my %target_exons = map { $_->id => 1 } + @{ $target_transcript->get_all_Exons }; + + # now loop over source exons and find the highest scoring target exon + # belonging to the target transcript + + foreach my $source_exon (@{ $source_transcript->get_all_Exons }) { + + my $max_source_score = -1; + + foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) { + + next unless ($target_exons{$target_exon_id}); + + my $score = $exon_matrix->get_score($source_exon->id, + $target_exon_id); + $max_source_score = $score if ($score > $max_source_score); + } + + if ($max_source_score > 0) { + $source_transcript_score += ($max_source_score * $source_exon->length); + + } + } + + # now do the same for target exons + + foreach my $target_exon (@{ $target_transcript->get_all_Exons }) { + + my $max_target_score = -1; + + foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) { + + next unless ($source_exons{$source_exon_id}); + + my $score = $exon_matrix->get_score( + $source_exon_id, $target_exon->id); + $max_target_score = $score if ($score > $max_target_score); + } + + if ($max_target_score > 0) { + $target_transcript_score += ($max_target_score * $target_exon->length); + + } + } + + # + # calculate transcript score and add to scoring matrix + # + if (($source_transcript_length + $target_transcript_length) > 0) { + + # sanity check + if (($source_transcript_score > $source_transcript_length) or + ($target_transcript_score > $target_transcript_length)) { + + $self->logger->warning("Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1); + + } else { + +=cut + # debug + $self->logger->info($source_transcript->id.":".$target_transcript->id. + " source score: $source_transcript_score". + " source length: $source_transcript_length". + " target score: $target_transcript_score". + " target length: $target_transcript_length\n"); +=cut + + # everything is fine, add score to matrix + my $transcript_score = + ($source_transcript_score + $target_transcript_score) / + ($source_transcript_length + $target_transcript_length); + + if ($transcript_score > $transcript_score_threshold) { + $matrix->add_score($source_transcript->id, $target_transcript->id, + $transcript_score); + } + + } + + } else { + + $self->logger->warning("Combined length of source (".$source_transcript->id.") and target (".$target_transcript->id.") transcript is zero!\n", 1); + + } + + } + } + + $self->logger->info("\n"); + + return $matrix; + +} + + +# +# penalise scores between genes with different biotypes. +# entries are modified in place +# +sub biotype_transcript_rescore { + my ( $self, $matrix ) = @_; + + if ( defined($matrix) && + !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'); + } + + my $i = 0; + + foreach my $entry ( @{ $matrix->get_all_Entries } ) { + + my $source_transcript = + $self->cache->get_by_key( 'transcripts_by_id', 'source', + $entry->source() ); + + my $target_transcript = + $self->cache->get_by_key( 'transcripts_by_id', 'target', + $entry->target() ); + + if ($source_transcript->biotype() ne $target_transcript->biotype() ) + { + # PENALTY: Lower the score for mappings to transcripts of + # different biotype. + $matrix->set_score( $entry->source(), $entry->target(), + 0.9*$entry->score() ); + $i++; + } + } + + $self->logger->debug("Scored transcripts with biotype mismatch: $i\n", + 1 ); +} ## end sub biotype_transcript_rescore + + +sub different_translation_rescore { + my $self = shift; + my $matrix = shift; + + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + my $i = 0; + + foreach my $entry ( sort { $b->score <=> $a->score } + @{ $matrix->get_all_Entries } ) + { + + # we only do this for perfect matches, i.e. transcript score == 1 + last if ( $entry->score < 1 ); + + my $source_tl = + $self->cache->get_by_key( 'transcripts_by_id', 'source', + $entry->source )->translation; + my $target_tl = + $self->cache->get_by_key( 'transcripts_by_id', 'target', + $entry->target )->translation; + + # no penalty if both transcripts have no translation + next if ( !$source_tl and !$target_tl ); + + if ( !$source_tl + or !$target_tl + or ( $source_tl->seq ne $target_tl->seq ) ) + { + # PENALTY: The transcript stable ID is now on a transcript with a + # different translation. + $matrix->set_score( $entry->source(), $entry->target(), + 0.9*$entry->score() ); + $i++; + } + + } ## end foreach my $entry ( sort { ...}) + + $self->logger->debug( + "Non-perfect translations on perfect transcripts: $i\n", + 1 ); +} ## end sub different_translation_rescore + + +sub non_mapped_gene_rescore { + my $self = shift; + my $matrix = shift; + my $gene_mappings = shift; + + # argument checks + unless ($matrix + and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') ) + { + throw( + 'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.' + ); + } + + unless ( $gene_mappings + and $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') ) + { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + # create of lookup hash of mapped source genes to target genes + my %gene_lookup = + map { $_->source => $_->target } + @{ $gene_mappings->get_all_Entries }; + + my $i = 0; + + foreach my $entry ( @{ $matrix->get_all_Entries } ) { + + my $source_gene = + $self->cache->get_by_key( 'genes_by_transcript_id', 'source', + $entry->source ); + my $target_gene = + $self->cache->get_by_key( 'genes_by_transcript_id', 'target', + $entry->target ); + + my $mapped_target = $gene_lookup{ $source_gene->id }; + + if ( !$mapped_target or ( $mapped_target != $target_gene->id ) ) { + # PENALTY: The transcript stable ID has been mapped to an + # un-mapped gene. + $matrix->set_score( $entry->source(), $entry->target(), + 0.9*$entry->score() ); + $i++; + } + } + + $self->logger->debug( "Scored transcripts in non-mapped genes: $i\n", + 1 ); +} ## end sub non_mapped_gene_rescore + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IdentityXref.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdentityXref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,522 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdentityXref + +=head1 SYNOPSIS + + my $xref = Bio::EnsEMBL::IdentityXref->new( + -XREF_IDENTITY => 80.4, + -ENSEMBL_IDENTITY => 90.1, + -SCORE => 90, + -EVALUE => 12, + -CIGAR_LINE => '23MD3M2I40M', + -XREF_START => 1, + -XREF_END => 68, + -ENSEMBL_START => 10, + -ENSEMBL_END => 77, + -ADAPTOR => $adaptor, + -PRIMARY_ID => $primary_id, + -DBNAME => 'SwissProt' + ); + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IdentityXref; +use vars qw(@ISA $AUTOLOAD); +use strict; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( deprecate ); + +@ISA = qw( Bio::EnsEMBL::DBEntry ); + + +=head2 new + + Arg [...] : XREF_IDENTITY ENSEMBL_IDENTITY SCORE EVALUE CIGAR_LINE + : XREF_START XREF_END ENSEMBL_START ENSEMBL_END + : ANALYSIS pairs + Example : see synopsis + Description: Create a new Bio::EnsEMBL::IdentityXref object + Returntype : Bio::EnsEMBL::IdentityXref + Exceptions : none + Caller : general + Status : Stable + +=cut + +my $error_shown = 0; + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($query_identity, $target_identity, $score, $evalue, + $cigar_line, $query_start, $query_end, $translation_start, + $translation_end, $analysis, $xref_identity, $ensembl_identity, + $xref_start, $xref_end, $ensembl_start, $ensembl_end) = rearrange( + [qw(QUERY_IDENTITY TARGET_IDENTITY SCORE EVALUE CIGAR_LINE + QUERY_START QUERY_END TRANSLATION_START TRANSLATION_END + ANALYSIS XREF_IDENTITY ENSEMBL_IDENTITY XREF_START XREF_END ENSEMBL_START ENSEMBL_END)], @args); + + if((defined($query_identity) or defined($target_identity) or defined($query_start) or defined ($query_end) or + defined($translation_start) or defined($translation_end)) and !$error_shown){ + print STDERR "Arguments have now been changed to stop confusion so please replace the following\n"; + print STDERR "\tQUERY_IDENTITY\t->\tXREF_IDENTITY\n"; + print STDERR "\tTARGET_IDENTITY\t->\tENSEMBL_IDENTITY\n"; + print STDERR "\tQUERY_START\t->\tXREF_START\n"; + print STDERR "\tQUERY_END\t->\tXREF_END\n"; + print STDERR "\tTRANSLATION_START\t->\tENSEMBL_START\n"; + print STDERR "\tTRANSLATION_END\t->\tENSEMBL_END\n"; + print STDERR "The old arguments will be removed in a futute release so please change your code to the new names\n"; + $error_shown = 1; + } + $self->{'xref_identity'} = $query_identity || $xref_identity; + $self->{'ensembl_identity'} = $target_identity || $ensembl_identity; + $self->{'score'} = $score; + $self->{'evalue'} = $evalue; + $self->{'cigar_line'} = $cigar_line; + $self->{'xref_start'} = $query_start || $xref_start; + $self->{'xref_end'} = $query_end || $xref_start; + $self->{'ensembl_start'} = $translation_start || $ensembl_start; + $self->{'ensembl_end'} = $translation_end || $ensembl_end; + $self->{'analysis'} = $analysis; + + return $self; +} + +=head2 xref_identity + + Arg [1] : (optional) string $value + Example : $xref_identity = $id_xref->xref_identity; + Description: Getter/Setter for xref identity + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub xref_identity{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'xref_identity'} = $value; + } + return $obj->{'xref_identity'}; + +} + + +=head2 ensembl_identity + + Arg [1] : (optional) string $value + Example : $ensembl_identity = $id_xref->ensembl_identity; + Description: Getter/Setter for ensembl identity + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub ensembl_identity{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'ensembl_identity'} = $value; + } + return $obj->{'ensembl_identity'}; + +} + + + +=head2 cigar_line + + Arg [1] : string $cigar_line + Example : none + Description: get/set for attribute cigar_line + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub cigar_line { + my $self = shift; + $self->{'cigar_line'} = shift if( @_ ); + return $self->{'cigar_line'}; +} + + +=head2 ensembl_start + + Arg [1] : string $ensembl_start + Example : none + Description: get/set for attribute ensembl_start + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub ensembl_start { + my $self = shift; + $self->{'ensembl_start'} = shift if( @_ ); + return $self->{'ensembl_start'}; +} + + +=head2 ensembl_end + + Arg [1] : string $ensembl_end + Example : none + Description: get/set for attribute ensembl_end + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub ensembl_end { + my $self = shift; + $self->{'ensembl_end'} = shift if( @_ ); + return $self->{'ensembl_end'}; +} + + +=head2 xref_start + + Arg [1] : string $xref_start + Example : none + Description: get/set for attribute xref_start + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub xref_start { + my $self = shift; + $self->{'xref_start'} = shift if( @_ ); + return $self->{'xref_start'}; +} + + +=head2 xref_end + + Arg [1] : string $xref_end + Example : none + Description: get/set for attribute xref_end + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub xref_end { + my $self = shift; + $self->{'xref_end'} = shift if( @_ ); + return $self->{'xref_end'}; +} + + +=head2 score + + Arg [1] : string $score + Example : none + Description: get/set for attribute score + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub score { + my $self = shift; + $self->{'score'} = shift if( @_ ); + return $self->{'score'}; +} + + +=head2 evalue + + Arg [1] : string $evalue + Example : none + Description: get/set for attribute evalue + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub evalue { + my $self = shift; + $self->{'evalue'} = shift if( @_ ); + return $self->{'evalue'}; +} + + + + +=head2 get_mapper + + Args : none + Example : none + Description: produces a mapper object that takes coordinates from one side + of the alignment to the other side. "ensembl" and "external" + are the two coordinate systems contained. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : general, ProteinDAS subsystem + Status : Stable + +=cut + + +sub get_mapper { + my ( $self ) = @_; + # parse the cigar_line and create a mapper ... + if( exists $self->{'_cached_mapper'} ) { + return $self->{'_cached_mapper'}; + } + + my ( @lens, @chars ); + + # if there is no cigar line, nothing is going to be loaded + if( $self->cigar_line() ) { + my @pre_lens = split( '[DMI]', $self->cigar_line() ); + @lens = map { if( ! $_ ) { 1 } else { $_ }} @pre_lens; + @chars = grep { /[DMI]/ } split( //, $self->cigar_line() ); + } + my $translation_start = $self->ensembl_start(); + my $translation_end = $self->ensembl_end(); + my $query_start = $self->xref_start(); + my $query_end = $self->xref_end(); + + # my $hit_id = $self->display_id(); + my $ensembl_id = "ensembl_id"; + my $external_id = "external_id"; + # now build the mapper + my $mapper = Bio::EnsEMBL::Mapper->new( "external", "ensembl" ); + + + for( my $i=0; $i<=$#lens; $i++ ) { + my $length = $lens[$i]; + my $char = $chars[$i]; + if( $char eq "M" ) { + $mapper->add_map_coordinates( $external_id, $query_start, + $query_start + $length - 1, 1, + $ensembl_id, $translation_start, + $translation_start + $length - 1); + $query_start += $length; + $translation_start += $length; + + } elsif( $char eq "D" ) { + $translation_start += $length; + } elsif( $char eq "I" ) { + $query_start += $length; + } + } + + $self->{'_cached_mapper'} = $mapper; + + return $mapper; +} + + + +=head2 transform_feature + + Arg [1] : a feature type with start and end $feature + This doesnt need to be a Bio::EnsEMBL::Feature as it doesnt + need an attached slice. We may have to introduce an appropriate + object type. + Example : my $ens_prot_feature_list = + $ident_xref->transform_feature( $swiss_prot_feature ); + Description: a list of potential partial features which represent all + mappable places + of the original feature in ensembl translation coordinates. + Returntype : listref of whatever was put in + Exceptions : none + Caller : general, ProteinDAS subsystem + Status : Stable + +=cut + + +sub transform_feature { + my $self= shift; + my $feature = shift; + + my $fstart = $feature->start(); + my $fend = $feature->end(); + + my $mapper = $self->get_mapper(); + my @result; + + my @coords = $mapper->map_coordinates( "external_id", $fstart, $fend, + 1, "external" ); + + for my $coord ( @coords ) { + if( $coord->isa( "Bio::EnsEMBL::Mapper::Coordinate" )) { + my $new_feature; + %{$new_feature} = %$feature; + bless $new_feature, ref( $feature ); + $new_feature->start( $coord->start() ); + $new_feature->end( $coord->end() ); + + push( @result, $new_feature ); + } + } + + return \@result; +} + + + +=head2 map_feature + + Arg [1] : a start,end capable feature object + Example : none + Description: + Returntype : list of Coordinates/Gaps which represents the mapping + Exceptions : none + Caller : another way of doing ProteinDAS + Status : Stable + +=cut + +sub map_feature { + my $self = shift; + my $feature = shift; + + + my $fstart = $feature->start(); + my $fend = $feature->end(); + + my $mapper = $self->get_mapper(); + my @coords = $mapper->map_coordinates( "external_id", $fstart, $fend, 1, + "external" ); + + return @coords; +} + + +########################### +# DEPRECATED METHODS FOLLOW +########################### + +=head2 query_identity + + Description: DEPRECATED. Use xref_identity() instead. + Status : At Risk + +=cut + +sub query_identity{ + deprecate("Use xref_identity instead"); + xref_identity(@_); +} + + +=head2 target_identity + + Description: DEPRECATED. Use ensembl_identity() instead. + Status : At Risk + +=cut + +sub target_identity{ + deprecate("Use ensembl_identity instead"); + ensembl_identity(@_); +} + + + +=head2 translation_start + + Description: DEPRECATED. Use ensembl_start() instead. + Status : At Risk + +=cut + +sub translation_start { + deprecate("Use ensembl_start instead"); + ensembl_start(@_); + +} + + +=head2 translation_end + + Description: DEPRECATED. Use ensembl_end() instead. + Status : At Risk + +=cut + +sub translation_end { + deprecate("Use ensembl_end instead"); + ensembl_end(@_); +} + + + +=head2 query_start + + Description: DEPRECATED. Use xref_start() instead. + Status : At Risk + +=cut + +sub query_start { + deprecate("Use xref_start instead"); + xref_start(@_); + +} + + + +=head2 query_end + + Description: DEPRECATED. Use xref_end() instead. + Status : At Risk + +=cut + +sub query_end { + deprecate("Use xref_end instead"); + xref_end(@_); + +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IndividualSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IndividualSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,655 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::IndividualSlice - SubClass of the Slice. Represents the +slice of the genome for a certain individual (applying the alleles for +this individual) + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + $individualSlice = $slice->get_by_Individual($individual_name); + + # Get the sequence from the Individual Slice: will contain IUPAC codes + # for SNPs and Ensembl ambiguity codes for indels + my $seq = $individualSlice->seq(); + print $seq; + + # Get a subSlice of the Strain + my $subSlice_individual = + $individualSlice->sub_Slice( 5_000, 8_000, 1 ) + + # Compare two different individuals in the same Slice + my $sliceIndividual2 = $slice->get_by_Individual($individual_name2); + my $differences = + $individualSlice->get_all_differences_IndividualSlice( + $sliceIndividual2); + + foreach my $af ( @{$differences} ) { + print + "There is a difference between $individual_name " + . "and $individual_name2 at ", + $af->start, "-", $af->end, + " with allele ", $af->allele_string(), "\n"; + } + +=head1 DESCRIPTION + +A IndividualSlice object represents a region of a genome for a certain +individual. It can be used to retrieve sequence or features from a +individual. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IndividualSlice; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +@ISA = qw(Bio::EnsEMBL::Slice); + + +=head2 new + + Arg [1..N] : List of named arguments + Bio::EnsEMBL::CoordSystem COORD_SYSTEM + string SEQ_REGION_NAME, + int START, + int END, + string VERSION (optional, defaults to '') + int STRAND, (optional, defaults to 1) + Bio::EnsEMBL::DBSQL::SliceAdaptor ADAPTOR (optional) + Arg[N+1] : string $individual_name + Example : $individualSlice = Bio::EnsEMBL::IndividualSlice->new(-coord_system => $cs, + -start => 1, + -end => 10000, + -strand => 1, + -seq_region_name => 'X', + -seq_region_length => 12e6, + -individual_name => $individual_name); + Description : Creates a new Bio::EnsEMBL::IndividualSlice object that will contain a shallow copy of the + Slice object, plus additional information such as the individual this Slice refers to + and listref of Bio::EnsEMBL::Variation::AlleleFeatures of differences with the + reference sequence + ReturnType : Bio::EnsEMBL::IndividualSlice + Exceptions : none + Caller : general + +=cut + +sub new{ + my $caller = shift; + my $class = ref($caller) || $caller; + + #create the IndividualSlice object as the Slice, plus the individual attribute + my ($individual_name, $sample_id) = rearrange(['INDIVIDUAL', 'SAMPLE_ID'],@_); + + my $self = $class->SUPER::new(@_); + + $self->{'individual_name'} = $individual_name; + $self->{'sample_id'} = $sample_id; + + return $self; + +} + +=head2 individual_name + + Arg [1] : (optional) string $individual_name + Example : my $individual_name = $individualSlice->individual_name(); + Description : Getter/Setter for the name of the individual in the slice + ReturnType : string + Exceptions : none + Caller : general + +=cut + +sub individual_name{ + my $self = shift; + if (@_){ + $self->{'individual_name'} = shift @_; + } + return $self->{'individual_name'}; +} + +=head2 seq + + Arg [1] : none + Example : print "SEQUENCE = ", $strainSlice->seq(); + Description: Returns the sequence of the region represented by this + StrainSlice formatted as a string. + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub seq { + my $self = shift; + + # special case for in-between (insert) coordinates + return '' if($self->start() == $self->end() + 1); + + return $self->{'seq'} if($self->{'seq'}); + + if($self->adaptor()) { + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + my $reference_sequence = $seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1); #get the reference sequence for that slice + #apply all differences to the reference sequence + + # sort edits in reverse order to remove complication of + # adjusting downstream edits + my @allele_features_ordered = sort {$b->start() <=> $a->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + + foreach my $af (@allele_features_ordered){ + $af->apply_edit($reference_sequence); #change, in the reference sequence, the af + } +# return substr(${$reference_sequence},0,1) if ($self->length == 1); + return ${$reference_sequence}; #returns the reference sequence, applying the alleleFeatures + } + + # no attached sequence, and no db, so just return Ns + return 'N' x $self->length(); +} + +=head2 get_all_differences_Slice + + Args : none + Example : my $differences = $individualSlice->get_all_differences_Slice() + Description : Gets all differences between the IndividualSlice object and the Slice is defined + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : general + +=cut + +sub get_all_differences_Slice{ + my $self = shift; + my $differences; #reference to the array with the differences between Slice and StrainSlice + my $ref_allele; + foreach my $difference (@{$self->{'alleleFeatures'}}){ + if ($difference->length_diff == 0){ + #the difference is a SNP, check if it is the same as the reference allele + $ref_allele = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand); + $ref_allele = '-' if ($ref_allele eq ''); + if ($ref_allele ne $difference->allele_string){ + #when the alleleFeature is different from the reference allele, add to the differences list + push @{$differences},$difference; + } + } + else{ + push @{$differences},$difference; + } + } + + return $differences; + +} + +=head2 get_all_differences_IndividualSlice + + Arg[1] : Bio::EnsEMBL::IndividualSlice $is + Example : my $differences = $individualSlice->get_all_differences_IndividualSlice($individualslice) + Description : Gets differences between 2 IndividualSlice objects + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : thrown on bad argument + Caller : general + +=cut + +sub get_all_differences_IndividualSlice{ + my $self = shift; + my $individualSlice = shift; + + if (!ref($individualSlice) || !$individualSlice->isa('Bio::EnsEMBL::IndividualSlice')){ + throw('Bio::EnsEMBL::IndividualSlice arg expected'); + } + if ( @{$self->{'alleleFeatures'}} == 0 && @{$individualSlice->{'alleleFeatures'}} == 0){ + return undef; #there are no differences in any of the Individuals + + } + my $differences; #differences between individuals + if (@{$individualSlice->{'alleleFeatures'}} == 0){ + #need to create a copy of alleleFeature for the first Individual + foreach my $difference (@{$self->{'alleleFeatures'}}){ + my %vf = %$difference; + push @{$differences},bless \%vf,ref($difference); + } + } + elsif (@{$self->{'alleleFeatures'}} == 0){ + #need to create a copy of AlleleFeature, but changing the allele by the allele in the reference sequence + foreach my $difference (@{$individualSlice->{'alleleFeatures'}}){ + push @{$differences}, $individualSlice->_convert_difference($difference); + } + } + else{ + #both individuals have differences + #create a hash with the differences in the first slice + my %allele_features_self = map {$_->start.'-'.$_->end => $_} @{$self->{'alleleFeatures'}}; + foreach my $difference (@{$individualSlice->{'alleleFeatures'}}){ + #there is no difference in the other individual slice, convert the allele + if (!defined $allele_features_self{$difference->start.'-'.$difference->end}){ + push @{$differences},$individualSlice->_convert_difference($difference); + } + else{ + #if it is defined and have the same allele, delete from the hash since it is not a difference + #between the individuals + if ($allele_features_self{$difference->start.'-'.$difference->end}->allele_string eq $difference->allele_string){ + delete $allele_features_self{$difference->start.'-'.$difference->end}; + } + } + } + #and finally, make a shallow copy of the differences in the first individual + foreach my $difference (values %allele_features_self){ + my %vf = %$difference; + push @{$differences},bless \%vf,ref($difference); + } + + } + #need to map differences to the first individual, self, since the coordinates are in the Slice coordinate system + my $mapper = $self->mapper(); #now that we have the differences, map them in the IndividualSlice + my @results; + foreach my $difference (@{$differences}){ + @results = $mapper->map_coordinates('Slice',$difference->start,$difference->end,$difference->strand,'Slice'); + #we can have 3 possibilities: + #the difference is an insertion and when mapping returns the boundaries of the insertion in the IndividualSlice + if (@results == 2){ + #the first position in the result is the beginning of the insertion + if($results[0]->start < $results[1]->start){ + $difference->start($results[0]->end+1); + $difference->end($results[1]->start-1); + } + else{ + #it is the second position the beginning of the insertion + $difference->start($results[1]->end+1); + $difference->end($results[0]->start-1); + } + $difference->strand($results[0]->strand); + } + else{ + #it can be either a SNP or a deletion, and we have the coordinates in the result, etither a Bio::EnsEMBL::Mapper::Coordinate + # or a Bio::EnsEMBL::Mapper::IndelCoordinate + $difference->start($results[0]->start); + $difference->end($results[0]->end); + $difference->strand($results[0]->strand); + } + } + + return $differences; +} + +#for a given AlleleFeature, converts the allele into the reference allele and returns +#the converted AlleleFeature + +sub _convert_difference{ + my $self = shift; + my $difference = shift; + my %new_af = %$difference; #make a copy of the alleleFeature + #and change the allele with the one from the reference Slice + $new_af{'allele_string'} = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand); + return bless \%new_af,ref($difference); +} + +=head2 mapper + + Args : none + Description: Getter for the mapper between the between the IndividualSlice and the Slice it refers to. + It is done automatically when necessary to create subSlice or to get the differences between individuals + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : Internal function + +=cut + +sub mapper{ + my $self = shift; + + if (@_) { + #allow to create again the mapper + delete $self->{'mapper'}; + } + if(!defined $self->{'mapper'}){ + #create the mapper between the Slice and StrainSlice + my $mapper = Bio::EnsEMBL::Mapper->new('Slice','IndividualSlice'); + #align with Slice + #get all the VariationFeatures in the Individual Slice, from start to end in the Slice + my @allele_features_ordered = sort {$a->start() <=> $b->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + + my $start_slice = 1; + my $end_slice; + my $start_individual = 1; + my $end_individual; + my $length_allele; + my $total_length_diff = 0; + #we will walk from left to right in the slice object, updating the start and end individual every time + #there is a new alleleFeature in the Individual + foreach my $allele_feature (@allele_features_ordered){ + #we have a insertion/deletion: marks the beginning of new slice move coordinates + if ($allele_feature->length_diff != 0){ + $total_length_diff += $allele_feature->length_diff; + $length_allele = $allele_feature->length + $allele_feature->length_diff(); #length of the allele in the Individual + $end_slice = $allele_feature->start() - 1; #set the end of the slice before the alleleFeature + if ($end_slice >= $start_slice){ + #normal cases (not with gaps) + $end_individual = $end_slice - $start_slice + $start_individual; #set the end of the individual from the beginning plus the offset + #add the sequence that maps + $mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'IndividualSlice',$start_individual,$end_individual); + #and add the indel + $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $allele_feature->length,1,'IndividualSlice',$end_individual+1,$end_individual + $length_allele); + $start_individual = $end_individual + $length_allele + 1; #set the beginning of the individual after the allele + } + else{ + #add the indel + $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $allele_feature->length,1,'IndividualSlice',$end_individual+1,$end_individual + $length_allele); + $start_individual += $length_allele; + } + $start_slice = $end_slice + $allele_feature->length+ 1; #set the beginning of the slice after the variation feature + } + } + if ($start_slice <= $self->length){ + #if we haven't reached the end of the IndividualSlice, add the final map coordinates between the individual and the slice + $mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'IndividualSlice',$start_individual,$start_individual + $self->length - $start_slice); + } + + $mapper->add_map_coordinates('Slice', -$self->start+1, 0,1, 'IndividualSlice', -$self->start +1,0) if ($self->start > 0); #before individualSlice + $mapper->add_map_coordinates('Slice', $self->length + 1,$self->seq_region_length - ($self->length +1),1, 'IndividualSlice', $self->length + 1 + $total_length_diff,$self->seq_region_length + $total_length_diff - ($self->length +1) ) if ($self->length <= $self->seq_region_length); #after strainSlice + $self->{'mapper'} = $mapper; + } + return $self->{'mapper'}; +} + +=head2 sub_Slice + + Arg 1 : int $start + Arg 2 : int $end + Arge [3] : int $strand + Example : none + Description: Makes another IndividualSlice that covers only part of this IndividualSlice + with the appropriate differences to the reference Slice + If a slice is requested which lies outside of the boundaries + of this function will return undef. This means that + behaviour will be consistant whether or not the slice is + attached to the database (i.e. if there is attached sequence + to the slice). Alternatively the expand() method or the + SliceAdaptor::fetch_by_region method can be used instead. + Returntype : Bio::EnsEMBL::IndividualSlice or undef if arguments are wrong + Exceptions : thrown when trying to get the subSlice in the middle of a + insertion + Caller : general + +=cut + +sub sub_Slice { + my ( $self, $start, $end, $strand ) = @_; + my $mapper = $self->mapper(); + #map from the Individual to the Slice to get the sub_Slice, and then, apply the differences in the subSlice + my @results = $mapper->map_coordinates('IndividualSlice',$start,$end,$strand,'IndividualSlice'); + my $new_start; + my $new_end; + my $new_strand; + my $new_seq; + #Get need start and end for the subSlice of the IndividualSlice + my @results_ordered = sort {$a->start <=> $b->start} grep {ref($_) eq 'Bio::EnsEMBL::Mapper::Coordinate'} @results; + $new_start = $results_ordered[0]->start(); + $new_strand = $results_ordered[0]->strand() if (ref($results_ordered[0]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); +# $new_strand = $results_ordered[-1]->strand() if (ref($results_ordered[-1]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); + $new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice + + my $subSlice = $self->SUPER::sub_Slice($new_start,$new_end,$new_strand); + $subSlice->{'individual_name'} = $self->{'individual_name'}; + + my $new_alleles; #reference to an array that will contain the variationFeatures in the new subSlice + #update the VariationFeatures in the sub_Slice of the Individual + my %af; + my $new_allele_feature; + foreach my $alleleFeature (@{$self->{'alleleFeatures'}}){ + $new_allele_feature = $alleleFeature->transfer($subSlice); + #only transfer the coordinates to the SubSlice that are within the boundaries + if ($new_allele_feature->start >= 1 && $new_allele_feature->end <= $subSlice->length){ + push @{$new_alleles}, $new_allele_feature; + } + } + $subSlice->{'alleleFeatures'} = $new_alleles; + return $subSlice; + +} + +=head2 subseq + + Arg [1] : int $startBasePair + relative to start of slice, which is 1. + Arg [2] : int $endBasePair + relative to start of slice. + Arg [3] : (optional) int $strand + The strand of the individual slice to obtain sequence from. Default + value is 1. + Description: returns string of dna sequence + Returntype : txt + Exceptions : end should be at least as big as start + strand must be set + Caller : general + +=cut + +sub subseq { + my ( $self, $start, $end, $strand ) = @_; + + if ( $end+1 < $start ) { + throw("End coord + 1 is less than start coord"); + } + + # handle 'between' case for insertions + return '' if( $start == $end + 1); + + $strand = 1 unless(defined $strand); + + if ( $strand != -1 && $strand != 1 ) { + throw("Invalid strand [$strand] in call to Slice::subseq."); + } + + my $subseq; + my $seq; + if($self->adaptor){ + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + $subseq = ${$seqAdaptor->fetch_by_Slice_start_end_strand($self,$start,$end,$strand)}; #get the reference sequence for that slice + #apply all differences to the reference sequence + # sort edits in reverse order to remove complication of + # adjusting downstream edits + my @allele_features_ordered = sort {$b->start() <=> $a->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + my $af_start; + my $af_end; + foreach my $af (@allele_features_ordered){ + if (($af->start - $start +1 > 0) && ($end - $af->end > 0)){ + #save the current start and end of the alleleFeature before changing for apply_edit + $af_start = $af->start; + $af_end = $af->end; + #apply the difference if the feature is in the new slice + $af->start($af->start - $start +1); + $af->end($af->end - $start +1); + $af->apply_edit(\$subseq); #change, in the reference sequence, the af + #restore the initial values of alleleFeature start and end + $af->start($af_start); + $af->end($af_end); + + } + } + } + else { + ## check for gap at the beginning and pad it with Ns + if ($start < 1) { + $subseq = "N" x (1 - $start); + $start = 1; + } + $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); + ## check for gap at the end and pad it with Ns + if ($end > $self->length()) { + $subseq .= "N" x ($end - $self->length()); + } + reverse_comp(\$subseq) if($strand == -1); + } + return $subseq; + +} + +=head2 get_all_Transcripts + + Args : None + Example : @transcripts = @{$individualslice->get_all_Transcripts)}; + Description: Gets all transcripts which overlap this Individual Slice. If you want to + specify a particular analysis or type, then you are better off + using get_all_Genes or get_all_Genes_by_type and iterating + through the transcripts of each gene. + Returntype : reference to a list of Bio::EnsEMBL::Transcripts + Exceptions : none + Caller : general + +=cut + +sub get_all_Transcripts { + my $self = shift; + + my $transcripts = $self->SUPER::get_all_Transcripts(1); + $self->map_to_Individual($transcripts); + + return $transcripts; +} + + +=head2 get_all_Exons + + Arg [1] : (optional) string $dbtype + The dbtype of exons to obtain. This assumes that the db has + been added to the DBAdaptor under this name (using the + DBConnection::add_db_adaptor method). + Example : @exons = @{$individualSlice->get_all_Exons}; + Description: Gets all exons which overlap this IndividualSlice. Note that these exons + will not be associated with any transcripts, so this may not + be terribly useful. + Returntype : reference to a list of Bio::EnsEMBL::Exons + Exceptions : none + Caller : general + +=cut + +sub get_all_Exons { + my $self = shift; + my $dbtype = shift; + + my $exons = $self->SUPER::get_all_Exons($dbtype); + $self->map_to_Individual($exons); #map the exons to the Individual + + return $exons; +} + +=head2 get_all_Genes + + Arg [1] : (optional) string $logic_name + The name of the analysis used to generate the genes to retrieve + Arg [2] : (optional) string $dbtype + The dbtype of genes to obtain. This assumes that the db has + been added to the DBAdaptor under this name (using the + DBConnection::add_db_adaptor method). + Example : @genes = @{$individualSlice->get_all_Genes}; + Description: Retrieves all genes that overlap this slice. + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : none + +=cut + +sub get_all_Genes{ + my ($self, $logic_name, $dbtype) = @_; + + my $genes = $self->SUPER::get_all_Genes($logic_name, $dbtype, 1); + + $self->map_to_Individual($genes); + + foreach my $gene (@{$genes}){ + $self->map_to_Individual($gene->get_all_Exons); #map the Exons to the Individual + $self->map_to_Individual($gene->get_all_Transcripts); #map the Transcripts to the Individual + } + + return $genes; +} + +=head2 map_to_Individual + + Arg[1] : ref $features + Example : $individualSlice->map_to_Individual($exons); + Description : Gets the features from the Slice and maps it in the IndividualSlice, using the mapper + between Slice and IndividualSlice + ReturnType : None + Exceptions : None + Caller : general + +=cut + +sub map_to_Individual{ + my $self = shift; + my $features = shift; + + my $mapper = $self->mapper(); + my (@results, @results_ordered, $new_start, $new_end, $new_strand); + #foreach of the transcripts, map them to the IndividualSlice and replace the Slice with the IndividualSlice + foreach my $feature (@{$features}){ + $feature->slice($self); #replace the IndividualSlice as the Slice for this feature (the Slice plus the AlleleFeatures) + #map from the Slice to the Individual Slice + my @results = $mapper->map_coordinates('Slice',$feature->start,$feature->end,$feature->strand,'Slice'); + #from the results, order them but filter out those that are not coordinates + @results_ordered = sort {$a->start <=> $b->start} grep {ref($_) eq 'Bio::EnsEMBL::Mapper::Coordinate'} @results; + $new_start = $results_ordered[0]->start(); + $new_strand = $results_ordered[0]->strand(); + $new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice + $feature->start($new_start); #update new coordinates + $feature->end($new_end); + $feature->strand($new_strand); + } +} + +sub alleleFeatures{ + my $self = shift; + return $self->{'alleleFeatures'}; +} + +sub add_AlleleFeature{ + my $self = shift; + + if (@_){ + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::AlleleFeature')) { + throw("Bio::EnsEMBL::Variation::AlleleFeature argument expected"); + } + #add the alleleFeature to the individualSlice + push @{$self->{'alleleFeatures'}},shift; + } +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IndividualSliceFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IndividualSliceFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,159 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::IndividualSliceFactory; + +use strict; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); +use Scalar::Util qw(weaken); + +=head2 new +=cut + +sub new{ + my $caller = shift; + my $class = ref($caller) || $caller; + + #creates many IndividualSlice objects from the Population + + my ($population_name, $coord_system, $start, $end, $strand, $seq_region_name, $seq_region_length, $adaptor) = rearrange(['POPULATION', 'COORD_SYSTEM','START','END','STRAND','SEQ_REGION_NAME','SEQ_REGION_LENGTH', 'ADAPTOR'],@_); + + my $self = bless { + population_name => $population_name, + coord_system => $coord_system, + start => $start, + end => $end, + strand => $strand, + seq_region_name => $seq_region_name, + seq_region_length => $seq_region_length},$class; + + $self->adaptor($adaptor); + return $self; +} + +sub adaptor { + my $self = shift; + + if(@_) { + my $ad = shift; + if($ad && (!ref($ad) || !$ad->isa('Bio::EnsEMBL::DBSQL::BaseAdaptor'))) { + throw('Adaptor argument must be a Bio::EnsEMBL::DBSQL::BaseAdaptor'); + } + weaken($self->{'adaptor'} = $ad); + } + + return $self->{'adaptor'} +} + +sub get_all_IndividualSlice{ + my $self = shift; + + my $slice; + if(!$self->adaptor) { + warning('Cannot get IndividualSlice features without attached adaptor'); + return ''; + } + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return ''; + } + #get the AlleleFeatures in the Population + my $af_adaptor = $variation_db->get_AlleleFeatureAdaptor; + + if( $af_adaptor ) { + #set the adaptor to retrieve data from genotype table + $af_adaptor->from_IndividualSlice(1); + #get the Individual for the given strain + my $population_adaptor = $variation_db->get_PopulationAdaptor; + my $individual_adaptor = $variation_db->get_IndividualAdaptor; + if ($population_adaptor && $individual_adaptor){ + $slice = Bio::EnsEMBL::Slice->new(-coord_system => $self->{'coord_system'}, + -start => $self->{'start'}, + -end => $self->{'end'}, + -strand => $self->{'strand'}, + -seq_region_name => $self->{'seq_region_name'}, + -seq_region_length => $self->{'seq_region_length'}, + -adaptor => $self->adaptor + ); + my $population = $population_adaptor->fetch_by_name($self->{'population_name'}); + #check that there is such population in the database + if (defined $population){ + #get all the AlleleFeatures in the $population and the Slice given + my $allele_features = $af_adaptor->fetch_all_by_Slice($slice,$population); + #get Individuals in the Population + my $individuals = $individual_adaptor->fetch_all_by_Population($population); + return $self->_rearrange_Individuals_Alleles($individuals,$allele_features); + } + else{ + warning("Population not in the database"); + return ''; + + } + } + else{ + warning("Not possible to retrieve PopulationAdaptor from the variation database"); + return ''; + } + } + + else{ + warning("Not possible to retrieve AlleleFeatureAdaptor from variation database"); + return ''; + } +} + +sub _rearrange_Individuals_Alleles{ + my $self = shift; + my $individuals = shift; + my $allele_features; + my $individual_slice; + #create the hash with all the individuals + my %individuals_ids; + #foreach of the individual, create the IndividualSlice object and add it to the mapping hash + foreach my $individual (@{$individuals}){ + $individual_slice = Bio::EnsEMBL::Variation::IndividualSlice->new( + -coord_system => $self->{'coord_system'}, + -start => $self->{'$start'}, + -end => $self->{'end'}, + -strand => $self->{'strand'}, + -seq_region_name => $self->{'seq_region_name'}, + -seq_region_length => $self->{'seq_region_length'}, + -individual => $individual->name); + + $individuals_ids{$individual->dbID} = $individual_slice; + } + + #and rearrange all the AlleleFeatures to the individuals + foreach my $allele_feature (@{$allele_features}){ + $individuals_ids{$allele_feature->{'_sample_id'}}->add_AlleleFeature($allele_feature); + } + my @result = values %individuals_ids; + return \@result; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Intron.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Intron.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,199 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME Bio::EnsEMBL::Intron - A class representing an Intron + +=head1 SYNOPSIS + + $intron = Bio::EnsEMBL::Intron->new( exon1, exon2, $analysis ); + +=cut + + +package Bio::EnsEMBL::Intron; +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw ); + +use base qw(Bio::EnsEMBL::Feature); + +=head2 new + + Arg [1] : Bio::EnsEMBL::Exon The 5' exon for the intron; required + Arg [2] : Bio::EnsEMBL::Exon The 3' exon for the intron; required + Arg [3] : Bio::EnsEMBL::Analysis Analysis to link to this Intron + Example : $intron = new Bio::EnsEMBL::Intron($exon1, $exon2) + Description: Create an Intron object from two exons and an optional analysis + Returntype : Bio::EnsEMBL::Intron + Exceptions : exons not on the same strand or slice. + Caller : general + Status : Stable + +=cut + +sub new { + my ( $proto, $e1, $e2, $analysis ) = @_; + + my $class = ref $proto || $proto; + + my $self = $class->SUPER::new(); + + if ( $e1->strand() == -1 ) { + $self->{'end'} = $e1->start() - 1; + $self->{'start'} = $e2->end() + 1; + } else { + $self->{'start'} = $e1->end() + 1; + $self->{'end'} = $e2->start() - 1; + } + + if ( $e1->strand() != $e2->strand() ) { + # throw("Exons on different strand. Not allowed"); + } else { + $self->{'strand'} = $e1->strand(); + } + + if ( $e1->slice() ne $e2->slice() ) { + if ( ( $e1->slice()->seq_region_name() ne + $e2->slice()->seq_region_name() ) + && ( $e1->slice()->coord_system_name() ne + $e2->slice()->coord_system_name() ) ) + { + throw("Exons on different slices. Not allowed"); + } else { + warning("Exons have different slice references to the same seq_region"); + } + } else { + $self->{'slice'} = $e1->slice(); + } + + if($analysis) { + $self->analysis($analysis); + } + + $self->{'prev'} = $e1; + $self->{'next'} = $e2; + + return $self; +} ## end sub new + +=head2 length + + Args : none + Example : $length = $intron->length(); + Description: Returns the length of this intron + Returntype : Integer + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length { + my ($self) = @_; + + # TODO: Introns on circular slices, see Feature.pm but allow for + # zero-length introns. + + return $self->{'end'} - $self->{'start'} + 1; +} + + +=head2 prev_Exon + + Args : none + Example : $exon = $intron->prev_Exon + Description: Returns the exon before this Intron + Returntype : Bio::EnsEMBL::Exon + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub prev_Exon { + my ($self) = shift; + + return $self->{'prev'}; +} + + +=head2 next_Exon + + Args : none + Example : $exon = $intron->next_Exon + Description: Returns the exon after this Intron + Returntype : Bio::EnsEMBL::Exon + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub next_Exon { + my ($self) = shift; + + return $self->{'next'}; +} + +=head2 is_splice_canonical + + Example : my $canonical = $intron->is_splice_canonical(); + Description : Indicates if the splice site is considered normal. This means + splice site variants equal to (D == donor, A == acceptor) + GT (D) => AG (A) + AT (D) => AC (A) + GC (D) => AG (A) + Returntype : Boolean indicating if the splice was as expected + Exceptions : See splice_seq + +=cut + +sub is_splice_canonical { + my ($self) = @_; + my $splice = join q{}, @{$self->splice_seq()}; + my $canonical = { + 'GTAG' => 1, 'ATAC' => 1, 'GCAG' => 1 + }->{$splice}; + return $canonical || 0; +} + +=head2 splice_seq + + Example : my ($donor, $acceptor) = @{$intron->splice_seq}; + Description : Get the donor and acceptor splice sites for this intron + Returntype : ArrayRef[String] The donor and acceptor sequences as Strings + Exceptions : Thrown if a feature Slice cannot be found + +=cut + +sub splice_seq { + my ($self) = @_; + my $slice = $self->feature_Slice(); + throw "Cannot retrieve feature_Slice() for this Intron" unless $slice; + my $length = $self->length(); + my $donor_seq = uc($slice->subseq(1,2)); + my $acceptor_seq = uc($slice->subseq($length - 1, $length)); + return [$donor_seq, $acceptor_seq]; +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/IntronSupportingEvidence.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IntronSupportingEvidence.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,329 @@ +package Bio::EnsEMBL::IntronSupportingEvidence; + +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::IntronSupportingEvidence + +=head1 DESCRIPTION + +Formalises an Intron with information about why it is a believed Intron. This +serves as a parallel object to Bio::EnsEMBL::Intron which you can use +to populate values in this field from. They are different objects though +due to Intron's non-existence as a DB data structure. + +=head1 SYNOPSIS + + #Example setups a ISE from the first two Exons + my ($five_prime_exon, $three_prime_exon) = @{$transcript->get_all_Exons()}[0..1]; + my $intron = Bio::EnsEMBL::Intron->new($five_prime_exon, $three_prime_exon); + +=head1 METHODS + +=cut + + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Feature/; + +use Bio::EnsEMBL::Intron; +use Bio::EnsEMBL::Utils::Argument qw/rearrange/; +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/; + +our %SUPPORTED_TYPES = map { $_ => 1 } qw/NONE DEPTH/; + +=head2 new + + Arg [-ANALYSIS] : Bio::EnsEMBL::Analysis The analysis this intron is linked to + Arg [-START] : int - start postion of the IntronSupportingEvidence + Arg [-END] : int - end position of the IntronSupportingEvidence + Arg [-STRAND] : int - strand the IntronSupportingEvidence is on + Arg [-SLICE] : Bio::EnsEMBL::Slice - the slice the IntronSupportingEvidence is on + Arg [-INTRON] : Bio::EnsEMBL::Intron Intron the evidence is based + on. Useful if you are not specifying the location + parameters as we will take them from this + Arg [-HIT_NAME] : String The name of the hit + Arg [-SCORE] : Double The score associated with the supporting evidence + Arg [-SCORE_TYPE] : String The type of score we are representing + Example : Bio::EnsEMBL::IntronSupportingEvidence->new( + -ANALYSIS => $analysis, -INTRON => $intron, + -SCORE => 100, -SCORE_TYPE => 'DEPTH'); + Description : Returns a new instance of this object + Returntype : Bio::EnsEMBL::IntronSupportEvidence + Exceptions : Thrown if data is not as requested + +=cut + +sub new { + my ($class, @args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($intron, $hit_name, $score, $score_type, $is_splice_canonical) = + rearrange([qw/intron hit_name score score_type is_splice_canonical/], @args); + + if($intron) { + $self->set_values_from_Intron($intron); + } + $self->hit_name($hit_name) if $hit_name; + $self->score($score) if $score; + $self->score_type($score_type) if $score_type; + $self->is_splice_canonical($is_splice_canonical) if $is_splice_canonical; + + return $self; +} + +=head2 set_values_from_Intron + + Arg [1] : Bio::EnsEMBL::Intron The intron to base this object on + Example : $ise->set_values_from_Intron($intron); + Description : Sets the start, end, strand and slice of this ISE instance + using values from the given Intron object. + Returntype : None + Exceptions : Thrown if data is not as requested + +=cut + +sub set_values_from_Intron { + my ($self, $intron) = @_; + assert_ref($intron, 'Bio::EnsEMBL::Intron', 'intron'); + $self->start($intron->start()); + $self->end($intron->end()); + $self->strand($intron->strand()); + $self->slice($intron->slice()); + $self->is_splice_canonical($intron->is_splice_canonical()); + return; +} + +=head2 is_splice_canonical + + Arg [1] : Boolean + Example : $ise->is_splice_canonical(1); + Description : Getter/setter for is_splice_canonical. Splice canonical + indicates those Introns which have a splice junction which + is structured as expected + Returntype : Boolean + Exceptions : + +=cut + +sub is_splice_canonical { + my ($self, $is_splice_canonical) = @_; + $self->{'is_splice_canonical'} = $is_splice_canonical if defined $is_splice_canonical; + return $self->{'is_splice_canonical'}; +} + +=head2 get_Intron + + Arg [1] : Bio::EnsEMBL::Transcript + Example : my $intron = $ise->intron($transcript); + Description : Provides access to an Intron object by using a given transcript + object and its associcated array of Exons. + Returntype : Bio::EnsEMBL::Intron + Exceptions : None + +=cut + +sub get_Intron { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript', 'transcript'); + my $five_prime = $self->find_previous_Exon($transcript); + my $three_prime = $self->find_next_Exon($transcript); + return Bio::EnsEMBL::Intron->new($five_prime, $three_prime); +} + +=head2 hit_name + + Arg [1] : String name of the hit + Example : $ise->hit_name('hit'); + Description : Getter/setter for hit name i.e. an identifier for the alignments + Returntype : String + Exceptions : None + +=cut + +sub hit_name { + my ($self, $hit_name) = @_; + $self->{'hit_name'} = $hit_name if defined $hit_name; + return $self->{'hit_name'}; +} + +=head2 score + + Arg [1] : Number; the score associated with this feature + Example : $ise->score(100); + Description : Getter/setter for score + Returntype : Number + Exceptions : None + +=cut + +sub score { + my ($self, $score) = @_; + $self->{'score'} = $score if defined $score; + return $self->{'score'}; +} + +=head2 score_type + + Arg [1] : String the enum type. Currently only allowed NONE or DEPTH + Example : $ise->score_type('DEPTH'); + Description : Gets and sets the type of score this instance represents + Returntype : String + Exceptions : Thrown if given an unsupported type of data + +=cut + +sub score_type { + my ($self, $score_type) = @_; + if(defined $score_type) { + if(! $SUPPORTED_TYPES{$score_type}) { + my $values = join(q{, }, keys %SUPPORTED_TYPES); + throw "The score_type '$score_type' is not allowed. Allowed values are [${values}]"; + } + } + $self->{'score_type'} = $score_type if defined $score_type; + return $self->{'score_type'}; +} + +=head2 has_linked_transcripts + + Example : $ise->has_linked_transcripts(); + Description : Returns true if we have transcripts linked to this ISE + Returntype : Boolean + Exceptions : Thrown if we do not have an attached adaptor + +=cut + +sub has_linked_transcripts { + my ($self) = @_; + throw "No attached adaptor. Cannot find linked Transcripts unless this is a persisted object" unless $self->adaptor(); + my $transcript_ids = $self->adaptor()->list_linked_transcript_ids($self); + return scalar(@{$transcript_ids}) ? 1 : 0; +} + +=head2 equals + + Arg [1] : Bio::EnsEMBL::IntronSupportEvidence Object to compare to + Example : $ise->equals($another_ise); + Description : Asserts if the given IntronSupportEvidence instance was equal to this + Returntype : Boolean + Exceptions : None + +=cut + +sub equals { + my ($self, $other) = @_; + my $equal = $self->SUPER::equals($other); + return 0 if ! $equal; + return ( + ($self->hit_name()||q{}) eq ($other->hit_name()||q{}) && + ($self->score_type() eq $other->score_type()) && + ($self->score() == $other->score())) ? 1 : 0; +} + +=head2 find_previous_Exon + + Arg [1] : Bio::EnsEMBL::Transcript Transcript to search for the Exons from + Example : $ise->find_previous_Exon($transcript); + Description : Loops through those Exons available from the Transcript and + attempts to find one which was the 5' flanking exon. If the + object has already been persisted we will use dbIDs to + find the Exons + Returntype : Bio::EnsEMBL::Exon + Exceptions : None + +=cut + +sub find_previous_Exon { + my ($self, $transcript) = @_; + + #Use DB IDs if we have them + my $exon_id; + if($self->adaptor()) { + my @ids = $self->adaptor()->fetch_flanking_exon_ids($self, $transcript); + $exon_id = $ids[0] if @ids; + } + + my $exons = $transcript->get_all_Exons(); + + my $start = $self->start(); + my $end = $self->end(); + foreach my $exon (@{$exons}) { + if($exon_id) { + return $exon if $exon->dbID() == $exon_id; + next; + } + if($self->strand() == 1) { + return $exon if $exon->end() == $start-1; + } + else { + return $exon if $exon->start() == $end+1; + } + } + return; +} + +=head2 find_next_Exon + + Arg [1] : Bio::EnsEMBL::Transcript Transcript to search for the Exons from + Example : $ise->find_next_Exon($transcript); + Description : Loops through those Exons available from the Transcript and + attempts to find one which was the 3' flanking exon. If the + object has already been persisted we will use dbIDs to + find the Exons + Returntype : Bio::EnsEMBL::Exon + Exceptions : None + +=cut + +sub find_next_Exon { + my ($self, $transcript) = @_; + + #Use DB IDs if we have them + my $exon_id; + if($self->adaptor()) { + my @ids = $self->adaptor()->fetch_flanking_exon_ids($self, $transcript); + $exon_id = $ids[1] if @ids; + } + + my $exons = $transcript->get_all_Exons(); + my $start = $self->start(); + my $end = $self->end(); + foreach my $exon (@{$exons}) { + if($exon_id) { + return $exon if $exon->dbID() == $exon_id; + next; + } + if($self->strand() == 1) { + return $exon if $exon->start() == $end+1; + } + else { + return $exon if $exon->end() == $start-1; + } + } + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/KaryotypeBand.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/KaryotypeBand.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,234 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::KaryotypeBand + +=head1 SYNOPSIS + + use Bio::EnsEMBL::KaryotypeBand; + + # Create and populate a karyotype band (normally done by adaptor) + $kb = Bio::EnsEMBL::KaryotyeBand( + -START => 1, + -END => 1_000_000, + -SLICE => $chrX_slice, + -NAME => 'q31', + -STAIN => 'gpos50', + -ADAPTOR => $db->get_KaryotypeBandAdaptor(), + -DBID => 10 + ); + + # Can tranform this band into other coord systems, just like other + # features + $kb = $kb->transform('supercontig'); + + $start = $kb->start(); + $end = $kb->end(); + $seq_region = $kb->slice->seq_region_name(); + + # Karyotypes have internal ids as well + $kary_id = $kb->dbID(); + +=head1 DESCRIPTION + +KaryotypeBand objects encapsulate data pertaining to a +single karyotype band. Access these objects through a +Bio::EnsEMBL::DBSQL::KaryotypeBandAdaptor. + +KarytoypeBand inherits from Bio::EnsEMBL::Feature and can be used just +as any other feature can be. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::KaryotypeBand; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(deprecate warning); + +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [NAME] : string (optional) + The name of this band + Arg [STAIN]: string (optional) + The stain of this band + Arg [...] : Arguments passed to superclass constructor. + See Bio::EnsEMBL::Feature + Example : $kb = Bio::EnsEMBL::KaryotypeBand->new(-START => $start, + -END => $end, + -SLICE => $slice, + -NAME => 'q11.21', + -STAIN => 'gneg'); + Description: Constructor. Creates a new KaryotypeBand object, which can be + treated as any other feature object. Note that karyotypes + bands always have strand = 0. + Returntype : Bio::EnsEMBL::KarytotypeBand + Exceptions : none + Caller : Bio::EnsEMBL::KaryotypeBandAdaptor + Status : Stable + +=cut + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + my ($name, $stain) = rearrange(['NAME','STAIN'],@_); + $self->{'name'} = $name; + $self->{'stain'} = $stain; + $self->{'strand'} = 0; + + return $self; +} + + +=head2 name + + Arg [1] : (optional) string $value + Example : my $band_name = $band->name(); + Description: Getter/Setter for the name of this band + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name{ + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + + +=head2 stain + + Arg [1] : (optional) string $value + Example : my $band_stain = $band->stain(); + Description: get/set for the band stain (e.g. 'gpos50') + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stain{ + my $self = shift; + $self->{'stain'} = shift if(@_); + return $self->{'stain'}; +} + + + +=head2 strand + + Arg [1] : none + Example : $strand = $qtl_feat->strand(); + Description: Overrides the Feature strand method to always return a + value of 0 for qtl features (they are unstranded features) + Returntype : int (always 0) + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub strand { + my $self = shift; + return 0; +} + + +=head2 move + + Arg [1] : $start - The new end of this band + Arg [2] : $end - The new start of this band + Arg [3] : $strand - ignored always set to 0 + Example : $kb->move(1, 10_000); + Description: Overrides superclass move() method to ensure strand is always 0. + See Bio::EnsEMBL::Feature::move + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub move { + my ($self, $start, $end, $strand) = @_; + + #maintain a strandedness of 0 + return $self->SUPER::move($start,$end,0); +} + + +=head2 display_id + + Arg [1] : none + Example : print $kb->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For karyotype bands this is the + name of the karyotype band or '' if no name is defined. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'name'} || ''; +} + + +=head2 chr_name + + Description: DEPRECATED - use $kary_band->slice()->seq_region_name() instead + +=cut + +sub chr_name { + my $self = shift; + + deprecate('Use $kary_band->slice()->seq_region_name() instead.'); + if(!$self->slice) { + warning('KaryotypeBand does not have Slice - cannot get seq_region_name.'); + return ''; + } + + return $self->slice->seq_region_name(); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/LRGSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/LRGSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,271 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::LRGSlice - Arbitary Slice of a genome + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'LRG', 'LRG3'); + + # get some attributes of the slice + my $seqname = $slice->seq_region_name(); + my $start = $slice->start(); + my $end = $slice->end(); + + # get the sequence from the slice + my $seq = $slice->seq(); + + # get some features from the slice + foreach $gene ( @{ $slice->get_all_Genes } ) { + # do something with a gene + } + + foreach my $feature ( @{ $slice->get_all_DnaAlignFeatures } ) { + # do something with dna-dna alignments + } + +=head1 DESCRIPTION + +A LRG Slice object represents a region of a genome. It can be used to retrieve +sequence or features from an area of interest. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::LRGSlice; +use vars qw(@ISA); +use strict; + +use Bio::PrimarySeqI; + +use Bio::EnsEMBL::Slice; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Slice); + +sub new{ + my $class = shift; + + my $self = bless {}, $class ; + + my $slice = $self = $class->SUPER::new( @_); + + return $self; +} + +sub stable_id { + my $self = shift; + return $self->seq_region_name; +} + + +sub display_xref { + my $self = shift; + return $self->seq_region_name; +} + +sub feature_Slice { + my $self = shift; + return $self->{_chrom_slice} if defined($self->{_chrom_slice}); + + my $max=-99999999999; + my $min=9999999999; + my $chrom; + my $strand; + +# print STDERR "working out feature slcie\n"; + foreach my $segment (@{$self->project('chromosome')}) { + my $from_start = $segment->from_start(); + my $from_end = $segment->from_end(); + my $to_name = $segment->to_Slice->seq_region_name(); + $chrom = $to_name; + + my $to_start = $segment->to_Slice->start(); + my $to_end = $segment->to_Slice->end(); + if($to_start > $max){ + $max = $to_start; + } + if($to_start < $min){ + $min = $to_start; + } + if($to_end > $max){ + $max = $to_end; + } + if($to_end < $min){ + $min = $to_end; + } + my $ori = $segment->to_Slice->strand(); + $strand = $ori; + } + if(!defined($chrom)){ + warn "Could not project to chromosome for ".$self->name."??\n"; + return undef; + } + my $chrom_slice = $self->adaptor->fetch_by_region("chromosome",$chrom, $min, $max, $strand); + $self->{_chrom_slice} = $chrom_slice; + return $self->{_chrom_slice}; +} + +sub DESTROY{ +} + +sub get_all_differences{ + my $self = shift; + + my @results; + + # get seq_region_attrib diffs (always same-length substitutions) + ################################################################ + + my $sth = $self->adaptor->prepare(qq{ + SELECT sra.value + FROM seq_region_attrib sra, attrib_type at + WHERE at.code = '_rna_edit' + AND at.attrib_type_id = sra.attrib_type_id + AND sra.seq_region_id = ? + }); + + $sth->execute($self->get_seq_region_id); + + my $edit_string; + + $sth->bind_columns(\$edit_string); + + while($sth->fetch()) { + my ($start, $end, $edit) = split " ", $edit_string; + + my $slice = $self->sub_Slice($start, $end); + my $chr_proj = $slice->project("chromosome"); + my $ref_seq = '-'; + if(scalar @$chr_proj == 1) { + $ref_seq = $chr_proj->[0]->[2]->seq; + } + + + my $diff = { + 'start' => $start, + 'end' => $end, + 'type' => 'substitution', + 'seq' => $edit, + 'ref' => $ref_seq, + }; + + push @results, $diff; + } + + # get more complex differences via projections + ############################################## + + # project the LRG slice to contig coordinates + my @segs = @{$self->project("contig")}; + + # if the LRG projects into more than one segment + if(scalar @segs > 1) { + + my ($prev_end, $prev_chr_start, $prev_chr_end, $prev_was_chr); + + foreach my $seg(@segs) { + + # is this a novel LRG contig, or does it project to a chromosome? + my @chr_proj = @{$seg->[2]->project("chromosome")}; + + # if it is a normal contig + if(scalar @chr_proj) { + + # check if there has been a deletion in LRG + if($prev_was_chr && $prev_end == $seg->[0] - 1) { + + # check it's not just a break in contigs + unless( + ($chr_proj[0]->[2]->strand != $self->strand && $prev_chr_start == $chr_proj[0]->[2]->end + 1) || + ($chr_proj[0]->[2]->strand != $self->strand && $prev_chr_end == $chr_proj[0]->[2]->start - 1) + ) { + + # now get deleted slice coords, depends on the strand rel to LRG + my ($s, $e); + + # opposite strand + if($chr_proj[0]->[2]->strand != $self->strand) { + ($s, $e) = ($prev_chr_start - 1, $chr_proj[0]->[2]->end + 1); + } + + # same strand + else { + ($s, $e) = ($prev_chr_end + 1, $chr_proj[0]->[2]->start - 1); + } + + if($s > $e) { + warn "Oops, trying to create a slice from $s to $e (could have been ", $prev_chr_start - 1, "-", $chr_proj[0]->[2]->end + 1, " or ", $prev_chr_end + 1, "-", $chr_proj[0]->[2]->start - 1, ")"; + } + + else { + # get a slice representing the sequence that was deleted + my $deleted_slice = $self->adaptor->fetch_by_region("chromosome", $chr_proj[0]->[2]->seq_region_name, $s, $e); + + my $diff = { + 'start' => $seg->[0], + 'end' => $prev_end, + 'type' => 'deletion', + 'seq' => '-', + 'ref' => $deleted_slice->seq." ".$deleted_slice->start.'-'.$deleted_slice->end, + }; + + push @results, $diff; + } + } + } + + $prev_was_chr = 1; + + $prev_chr_start = $chr_proj[0]->[2]->start; + $prev_chr_end = $chr_proj[0]->[2]->end; + } + + # if it is an LRG made-up contig for an insertion + else { + $prev_was_chr = 0; + + my $diff = { + 'start' => $seg->[0], + 'end' => $seg->[1], + 'type' => 'insertion', + 'seq' => substr($self->seq, $seg->[0] - 1, $seg->[1] - $seg->[0] + 1), + 'ref' => '-', + }; + + push @results, $diff; + } + + $prev_end = $seg->[1]; + } + } + + # return results sorted by start, then end position + return [sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @results]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/DitagAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/DitagAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,386 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DBSQL::DitagAdaptor + +=head1 SYNOPSIS + + my $ditagadaptor = $db->get_DitagAdaptor(); + my @ditags = @{ $ditagadaptor->fetch_by_type("ZZ11") }; + +=head1 DESCRIPTION + +Provides database interaction for the Bio::EnsEMBL::Map::Ditag object + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DBSQL::DitagAdaptor; + +use strict; +use vars ('@ISA'); + +use Bio::EnsEMBL::Map::Ditag; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 fetch_all_by_name + + Arg [1] : ditag name + Example : $tag = $ditag_adaptor->fetch_by_name("U3"); + Description: Retrieves ditags from the database using the name + Returntype : listref of Bio::EnsEMBL::Map::Ditag + Caller : general + +=cut + +sub fetch_all_by_name { + my ($self, $tagname) = @_; + + if(!$tagname){ + throw "must be called with a name of a ditag."; + } + my $sth = $self->prepare("SELECT d.ditag_id, d.name, d.type, d.tag_count, d.sequence + FROM ditag d + WHERE d.name = ?"); + $sth->execute($tagname); + my $result = $self->_fetch($sth); + + return $result; + +} + + +=head2 fetch_by_dbID + + Arg [1] : ditag type + Example : @all_tags = @{$ditag_adaptor->fetch_by_dbID(1003)}; + Description: Retrieve ditags with a certain id from the database + Returntype : Bio::EnsEMBL::Map::Ditag + Caller : general + +=cut + +sub fetch_by_dbID { + my ($self, $tagid) = @_; + + if(!$tagid){ + throw "must be called with the type of a ditag."; + } + my $sth = $self->prepare("SELECT d.ditag_id, d.name, d.type, d.tag_count, d.sequence + FROM ditag d + WHERE d.ditag_id = ?"); + $sth->execute($tagid); + my $result = $self->_fetch($sth); + + return $result->[0]; +} + + +=head2 fetch_all_by_type + + Arg [1] : ditag type + Example : @all_tags = @{$ditag_adaptor->fetch_by_type("SME005")}; + Description: Retrieves all ditags of a certain type from the database + Returntype : listref of Bio::EnsEMBL::Map::Ditag + Caller : general + +=cut + +sub fetch_all_by_type { + my ($self, $tagtype) = @_; + + if(!$tagtype){ + throw "must be called with the type of a ditag."; + } + my $sth = $self->prepare("SELECT d.ditag_id, d.name, d.type, d.tag_count, d.sequence + FROM ditag d + WHERE d.type = ?"); + $sth->execute($tagtype); + my $result = $self->_fetch($sth); + + return $result; +} + + +=head2 fetch_by_name_and_type + + Arg [1] : ditag name + Arg [2] : ditag type + Example : $tag = $ditag_adaptor->fetch_by_name_and_type("U3", "SME005"); + Description: Retrieves ditags from the database using name/type combination + which should be non-ambiguous + Returntype : Bio::EnsEMBL::Map::Ditag + Caller : general + +=cut + +sub fetch_by_name_and_type { + my ($self, $tagname, $tagtype) = @_; + + if(!$tagname or !$tagtype){ + throw "must be called with a name and type of a ditag."; + } + my $sth = $self->prepare("SELECT d.ditag_id, d.name, d.type, d.tag_count, d.sequence + FROM ditag d + WHERE d.name = ? and d.type = ?"); + $sth->execute($tagname, $tagtype); + my $result = $self->_fetch($sth); + + return $result->[0]; +} + + +=head2 fetch_all + + Args : none + Example : @all_tags = @{$ditag_adaptor->fetch_all}; + Description: Retrieves all ditags from the database + Returntype : listref of Bio::EnsEMBL::Map::Ditag + Caller : general + +=cut + +sub fetch_all { + my ($self) = @_; + + my $sth = $self->prepare("SELECT d.ditag_id, d.name, d.type, d.tag_count, d.sequence + FROM ditag d"); + $sth->execute; + my $result = $self->_fetch($sth); + + return $result; +} + + +=head2 fetch_all_with_limit + + Arg [1] : ditag type + Arg [2] : row limit + Arg [3] : row offset + Description: fetch_by_type with row limit and offset + Returntype : listref of Bio::EnsEMBL::Map::Ditag + Caller : general + +=cut + +sub fetch_all_with_limit { + my ($self, $tagtype, $limit, $offset) = @_; + + my @ditags = (); + my $sql = "SELECT d.ditag_id, d.name, d.type, d.tag_count, d.sequence ". + "FROM ditag d ". + "WHERE d.type = ? LIMIT ? OFFSET ?;"; + my $sth = $self->db->dbc->prepare($sql); + $sth->execute($tagtype, $limit, $offset); + my $result = $self->_fetch($sth); + + return $result; +} + + +=head2 _fetch + + Arg [1] : statement handler object + Description: generic sql-fetch function for the ditag fetch methods + Returntype : listref of Bio::EnsEMBL::Map::Ditag + Caller : private + +=cut + +sub _fetch { + my ($self, $sth) = @_; + + my($tag_id, $name, $type, $count, $sequence); + my @tags; + + $sth->bind_columns(\$tag_id, \$name, \$type, \$count, \$sequence); + while($sth->fetch) { + push @tags, Bio::EnsEMBL::Map::Ditag->new ( + -dbID => $tag_id, + -name => $name, + -type => $type, + -tag_count => $count, + -sequence => $sequence, + -adaptor => $self, + ); + } + + return \@tags; +} + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Map::Ditag + Example : $ditag_adaptor->store(\@ditags); + Description: Stores a single ditag or + a list of ditags in this database. + Returntype : none + Caller : general + +=cut + +sub store { + my ($self, $ditags) = @_; + + if(ref $ditags eq 'ARRAY'){ + if(scalar(@$ditags) == 0){ + throw("Must call store with ditag or list ref of ditags"); + } + } + elsif($ditags){ + my @ditags; + push @ditags, $ditags; + $ditags = \@ditags; + } + else{ + throw("Must call store with ditag or list ref of ditags not ".$ditags); + } + + my $db = $self->db() or throw "Couldn t get database connection."; + + TAG: + foreach my $ditag (@$ditags) { + + if ( !ref $ditag || !$ditag->isa("Bio::EnsEMBL::Map::Ditag") ) { + throw( "Object must be an Ensembl Ditag, " . "not a [" . ref($ditag) . "]" ); + } + + if ( $ditag->is_stored($db) ) { + warning( "Ditag [" . $ditag->dbID . "] is already stored in this database." ); + next TAG; + } + + #check if tag with same name/type already exists + my $sth = $self->prepare( "SELECT COUNT(*) FROM ditag + WHERE name = ? AND type = ?" ); + $sth->execute($ditag->name, $ditag->type); + if($sth->fetchrow() > 0){ + warning( "Ditag with name/type ".$ditag->name." / ".$ditag->type. + " is already stored in this database.\n". + "Use update_ditag() instead."); + next TAG; + } + + if ( $ditag->dbID ) { + my $sth = $self->prepare( "INSERT INTO ditag( ditag_id , name, type, tag_count, sequence ) ". + "VALUES( ?,?,?,?,? )" ); + $sth->bind_param(1,$ditag->dbID,SQL_INTEGER); + $sth->bind_param(2,$ditag->name,SQL_VARCHAR); + $sth->bind_param(3,$ditag->type,SQL_VARCHAR); + $sth->bind_param(4,$ditag->tag_count,SQL_VARCHAR); + $sth->bind_param(5,$ditag->sequence,SQL_VARCHAR); + $sth->execute(); + } else { + my $sth = $self->prepare( "INSERT INTO ditag( name, type, tag_count, sequence ) ". + "VALUES( ?,?,?,? )" ); + $sth->bind_param(1,$ditag->name,SQL_VARCHAR); + $sth->bind_param(2,$ditag->type,SQL_VARCHAR); + $sth->bind_param(3,$ditag->tag_count,SQL_VARCHAR); + $sth->bind_param(4,$ditag->sequence,SQL_VARCHAR); + $sth->execute(); + my $dbID = $sth->{'mysql_insertid'}; + $ditag->dbID($dbID); + $ditag->adaptor($self); + } + } + + return 1; +} + + +=head2 print_creation + + Arg [1] : ditag probe name + Arg [2] : ditag type + Arg [3] : ditag count + Arg [4] : ditag sequence + Arg [5] : (optional) ditag dbID + Description: convenience method to produce SQL insert statements + to speed up the creation of new ditags + Returntype : string + +=cut + +sub print_creation { + my ($self, $probe_name, $type, $count, $sequence, $dbid) = @_; + my $string; + if($dbid){ + $string = "INSERT INTO ditag( ditag_id, name, type, tag_count, sequence ) ". + "VALUES($dbid, '".$probe_name."', '".$type."', ".$count."'".$sequence."');\n"; + } + else { + $string = "INSERT INTO ditag( name, type, tag_count, sequence ) ". + "VALUES('".$probe_name."', '".$type."', ".$count.", '".$sequence."');\n"; + } + + return $string; +} + + +=head2 update_ditag + + Arg [1] : ditag to update + Description: update an existing ditag with new values + Returntype : true on success + +=cut + +sub update_ditag { + my ($self, $ditag) = @_; + + my $sth = $self->prepare( "UPDATE ditag SET name=?, type=?, tag_count=?, sequence=? where ditag_id=?;" ); + my $result =$sth->execute( + $ditag->name, + $ditag->type, + $ditag->tag_count, + $ditag->sequence, + $ditag->dbID, + ); + + return $result; +} + + +=head2 list_dbIDs + + Args : None + Example : my @feature_ids = @{$da->list_dbIDs()}; + Description: Gets an array of internal IDs for all Ditag objects in + the current database. + Returntype : List of ints + Exceptions : None + +=cut + +sub list_dbIDs { + my ($self, $ordered) = shift; + + return $self->_list_dbIDs('ditag'); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/DitagFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/DitagFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,724 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DBSQL::DitagFeatureAdaptor + +=head1 SYNOPSIS + + my $dfa = $db->get_DitagFeatureAdaptor; + my $ditagFeatures = $dfa->fetch_all_by_Slice( $slice, "SME005" ); + + foreach my $ditagFeature (@$ditagFeatures) { + print $ditagFeature->ditag_id . " " + . $ditagFeature->slice . " " + . $ditagFeature->start . "-" + . $ditagFeature->end . " " + . $ditagFeature->strand; + } + +=head1 DESCRIPTION + +Provides database interaction for the Bio::EnsEMBL::Map::DitagFeature +object + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DBSQL::DitagFeatureAdaptor; + +use strict; +use vars ('@ISA'); + +use Bio::EnsEMBL::Map::Ditag; +use Bio::EnsEMBL::Map::DitagFeature; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw( throw warning ); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 fetch_all + + Arg [1] : none + Example : @all_tags = @{$ditagfeature_adaptor->fetch_all}; + Description: Retrieves all ditagFeatures from the database; + Usually not a good idea, use fetch_all_by_Slice instead. + Returntype : listref of Bio::EnsEMBL::Map::DitagFeature + Caller : general + Status : At Risk + +=cut + +sub fetch_all { + my $self = shift; + + my $sth = $self->prepare("SELECT df.ditag_feature_id, df.ditag_id, df.seq_region_id, + df.seq_region_start, df.seq_region_end, df.seq_region_strand, + df.analysis_id, df.hit_start, df.hit_end, df.hit_strand, + df.cigar_line, df.ditag_side, df.ditag_pair_id, d.tag_count + FROM ditag_feature df, ditag d + WHERE df.ditag_id=d.ditag_id" ); + $sth->execute; + + my $result = $self->_fetch($sth); + + return $result; +} + + +=head2 fetch_by_dbID + + Arg [1] : ditagFeature dbID + Example : @my_tags = @{$ditagfeature_adaptor->fetch_by_dbID($my_id)}; + Description: Retrieves a ditagFeature from the database. + Returntype : Bio::EnsEMBL::Map::DitagFeature + Caller : general + Status : At Risk + +=cut + +sub fetch_by_dbID { + my ($self, $dbid) = @_; + + my $sth = $self->prepare("SELECT df.ditag_feature_id, df.ditag_id, df.seq_region_id, + df.seq_region_start, df.seq_region_end, df.seq_region_strand, + df.analysis_id, df.hit_start, df.hit_end, df.hit_strand, + df.cigar_line, df.ditag_side, df.ditag_pair_id, d.tag_count + FROM ditag_feature df, ditag d + WHERE df.ditag_id=d.ditag_id AND df.ditag_feature_id = ?" ); + $sth->execute($dbid); + + my $result = $self->_fetch($sth); + + return $result->[0]; +} + + +=head2 fetch_all_by_ditagID + + Arg [1] : ditag dbID + Arg [2] : (optional) ditag-pair dbID + Arg [3] : (optional) analysis ID + Example : @my_tags = @{$ditagfeature_adaptor->fetch_all_by_ditag_id($my_id)}; + Description: Retrieves all ditagFeatures from the database linking to a specific ditag-id + Returntype : listref of Bio::EnsEMBL::Map::DitagFeature + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_ditagID { + my ($self, $ditag_id, $ditag_pair_id, $analysis_id) = @_; + + my $arg = $ditag_id; + my $sql = "SELECT df.ditag_feature_id, df.ditag_id, df.seq_region_id, + df.seq_region_start, df.seq_region_end, df.seq_region_strand, + df.analysis_id, df.hit_start, df.hit_end, df.hit_strand, + df.cigar_line, df.ditag_side, df.ditag_pair_id, d.tag_count + FROM ditag_feature df, ditag d + WHERE df.ditag_id=d.ditag_id AND df.ditag_id = ? "; + if($ditag_pair_id){ + $sql .= "AND df.ditag_pair_id = ? "; + $arg .= ", $ditag_pair_id"; + } + if($analysis_id){ + $sql .= "AND df.analysis_id = ? "; + $arg .= ", $analysis_id"; + } + $sql .= "ORDER BY df.ditag_pair_id"; + my $sth = $self->prepare($sql); + $sth->execute(split(",",$arg)); + + my $result = $self->_fetch($sth); + + return $result; +} + + +=head2 fetch_all_by_type + + Arg [1] : ditag type + Example : @my_tags = @{$ditagfeature_adaptor->fetch_all_by_type($type)}; + Description: Retrieves all ditagFeatures from the database linking to a specific ditag-type + Returntype : listref of Bio::EnsEMBL::Map::DitagFeature + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_type { + my ($self, $ditag_type) = @_; + + my $sth = $self->prepare("SELECT df.ditag_feature_id, df.ditag_id, df.seq_region_id, + df.seq_region_start, df.seq_region_end, df.seq_region_strand, + df.analysis_id, df.hit_start, df.hit_end, df.hit_strand, + df.cigar_line, df.ditag_side, df.ditag_pair_id, d.tag_count + FROM ditag_feature df, ditag d + WHERE df.ditag_id=d.ditag_id AND d.type = ? + ORDER BY df.ditag_id, df.ditag_pair_id" ); + $sth->execute($ditag_type); + + my $result = $self->_fetch($sth); + + return $result; +} + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : (optional) ditag type name (specific library) or an aray ref with multiple type names + Arg [3] : (optional) analysis logic_name + Example : $tags = $ditagfeature_adaptor->fetch_all_by_Slice($slice, "SME005"); + Description: Retrieves ditagFeatures from the database overlapping a specific region + and (optional) of a specific ditag type or analysis. + Start & end locations are returned in slice coordinates, now. + Returntype : listref of Bio::EnsEMBL::Map::DitagFeatures + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice, $tagtype, $logic_name) = @_; + + my @result; + my $moresql; + + if(!ref($slice) || !$slice->isa("Bio::EnsEMBL::Slice")) { + throw("Bio::EnsEMBL::Slice argument expected not $slice."); + } + + #get affected ditag_feature_ids + my $sql = "SELECT df.ditag_feature_id, df.ditag_id, df.seq_region_id, df.seq_region_start, + df.seq_region_end, df.seq_region_strand, df.analysis_id, df.hit_start, df.hit_end, + df.hit_strand, df.cigar_line, df.ditag_side, df.ditag_pair_id, + d.tag_count + FROM ditag_feature df, ditag d + WHERE df.ditag_id=d.ditag_id"; + if($tagtype){ + my $tagtypes = ''; + #check if array + if(ref $tagtype eq 'ARRAY'){ + my @arraytype_mod; + foreach my $arraytype (@$tagtype){ push @arraytype_mod, '"'.$arraytype.'"' } + $tagtypes = join(", ", @arraytype_mod); + } + else{ + $tagtypes = '"'.$tagtype.'"'; + } + $sql .= " AND d.type IN(".$tagtypes.")"; + } + if($logic_name){ + my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($logic_name); + if(!$analysis) { + return undef; + } + $sql .= " AND df.analysis_id = ".$analysis->dbID(); + } + $sql .= " AND df.seq_region_id = ".$slice->get_seq_region_id. + " AND df.seq_region_start <= ".$slice->end. + " AND df.seq_region_end >= ".$slice->start; + + my $sth = $self->prepare($sql); + $sth->execute(); + + my $result = $self->_fetch($sth, $slice); + push(@result, @$result); + + return \@result; +} + + +=head2 fetch_pairs_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : (optional) ditag type (specific library) + Arg [3] : (optional) analysis logic_name + Example : my $ditagfeatures = $dfa->fetch_pairs_by_Slice($slice); + foreach my $ditagfeature (@$ditagfeatures){ + $minstart = $$ditagfeature2{'start'}; + $maxend = $$ditagfeature2{'end'}; + $bothstrand = $$ditagfeature2{'strand'}; + $tag_count = $$ditagfeature2{'tag_count'}; + print "$minstart, $maxend, $bothstrand, $tag_count\n"; + } + Description: Retrieves ditagFeature information in pairs from the database overlapping a specific region + and (optional) of a specific ditag type or analysis. The absotute start and end points are + fetched. + Slices should be SMALL! + Returntype : array ref with hash ref of artifical DitagFeature object + Caller : general + Status : At Risk + +=cut + +sub fetch_pairs_by_Slice { + my ($self, $slice, $tagtype, $logic_name) = @_; + my ($tag_id, $pair_id, $seq_region_id, $start, $end, $strand, $analysis_id, $tag_count); + my @result; + + my $sql = "SELECT df.ditag_id, df.ditag_pair_id, df.seq_region_id, MIN(df.seq_region_start), ". + "MAX(df.seq_region_end), df.seq_region_strand, df.analysis_id, d.tag_count ". + "FROM ditag_feature df, ditag d ". + "WHERE df.ditag_id=d.ditag_id "; + if($tagtype){ + $sql .= "AND d.type = \"".$tagtype."\""; + } + $sql .= " AND df.seq_region_id = ".$slice->get_seq_region_id. + " AND df.seq_region_start <= ".$slice->end. + " AND df.seq_region_end >= ".$slice->start; + if($logic_name){ + my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($logic_name); + if(!$analysis) { + return undef; + } + $sql .= " AND df.analysis_id = ".$analysis->dbID(); + } + $sql .= " GROUP BY df.ditag_id, df.ditag_pair_id;"; + my $sth = $self->prepare($sql); + $sth->execute(); + $sth->bind_columns( \$tag_id, \$pair_id, \$seq_region_id, \$start, \$end, \$strand, \$analysis_id ,\$tag_count); + while ( $sth->fetch ) { + #convert into relative slice coordinates + if($slice->strand == 1) { + $start = $start - $slice->start + 1; + $end = $end - $slice->start + 1; + } + else{ + $start = $slice->end - $end + 1; + $end = $slice->end - $start + 1; + $strand *= -1; + } + + my %ditag_feature_pair = ( + ditag => $tag_id, + pair_id => $pair_id, + region => $seq_region_id, + start => $start, + end => $end, + strand => $strand, + analysis => $analysis_id, + tag_count => $tag_count + ); + push(@result, \%ditag_feature_pair); + } + + return \@result; +} + + +=head2 _fetch + + Arg [1] : statement handler + Arg [2] : (optional) target-slice for the feature + Description: generic sql-fetch function for the DitagFeature fetch methods + Returntype : listref of Bio::EnsEMBL::Map::DitagFeatures + Caller : private + Status : At Risk + +=cut + +sub _fetch { + my ($self, $sth, $dest_slice) = @_; + + my ( $tag_id, $mothertag_id, $seqreg, $seqstart, $seqend, $strand, $analysis_id, $hit_start, + $hit_end, $hit_strand, $cigar_line, $ditag_side, $ditag_pair_id, $tag_count ); + $sth->bind_columns( \$tag_id, \$mothertag_id, \$seqreg, + \$seqstart, \$seqend, \$strand, + \$analysis_id, \$hit_start, \$hit_end, + \$hit_strand, \$cigar_line, \$ditag_side, + \$ditag_pair_id, \$tag_count ); + + my @ditag_features; + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + } + + while ( $sth->fetch ) { + my $analysis_obj = $self->db->get_AnalysisAdaptor->fetch_by_dbID($analysis_id); + my $slice = $self->db->get_SliceAdaptor->fetch_by_seq_region_id($seqreg); + + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seqstart = $seqstart - $dest_slice_start + 1; + $seqend = $seqend - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seqstart; + $seqstart = $dest_slice_end - $seqend + 1; + $seqend = $dest_slice_end - $tmp_seq_region_start + 1; + $strand *= -1; + } + $slice = $dest_slice; + } + } + + push @ditag_features, + Bio::EnsEMBL::Map::DitagFeature->new( -dbid => $tag_id, + -slice => $slice, + -start => $seqstart, + -end => $seqend, + -strand => $strand, + -analysis => $analysis_obj, + -hit_start => $hit_start, + -hit_end => $hit_end, + -hit_strand => $hit_strand, + -ditag_id => $mothertag_id, + -cigar_line => $cigar_line, + -ditag_side => $ditag_side, + -ditag_pair_id => $ditag_pair_id, + -ditag => undef, + -tag_count => $tag_count, + -adaptor => $self, + ); + } + + return \@ditag_features; +} + + +=head2 sequence + + Arg [1] : dbID of DitagFeature + Example : $ditagfeature_adaptor->get_sequence($ditagFeature->dbID) + Description: get the part of the sequence of a ditag, + that is actully aligned to the genome. + Returntype : string + Exceptions : thrown if not all data needed for storing is populated in the + ditag features + Caller : Bio::EnsEMBL::Map::DitagFeature + Status : At Risk + +=cut + +sub sequence { + my ($self, $dbID) = @_; + + my $sequence = undef; + my $db = $self->db() or throw "Couldn t get database connection."; + my $sql = "SELECT d.sequence, df.hit_start, df.hit_end, df.hit_strand ". + "FROM ditag d, ditag_feature df ". + "WHERE df.ditag_id=d.ditag_id and df.ditag_feature_id = ?"; + my $sth = $db->dbc->prepare($sql); + $sth->execute( $dbID ); + my ($seq, $start, $end, $strand) = $sth->fetchrow_array(); + if($seq and $start and $end and $strand){ + $sequence = substr($seq, ($start-1), ($end-$strand)); + if($strand == -1) { + $sequence =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + } + } + + return $sequence; +} + + +=head2 store + + Arg [1] : (Array ref of) Bio::EnsEMBL::Map::DitagFeature + Example : $ditagfeature_adaptor->store(@ditag_features); + Description: Stores a single ditagFeature or + a list of ditagFeatures in this database. + Returntype : none + Exceptions : thrown if not all data needed for storing is populated in the + ditag features + Caller : general + Status : At Risk + +=cut + +sub store { + my ( $self, $ditag_features ) = @_; + + if ( ref $ditag_features eq 'ARRAY' ) { + if ( scalar(@$ditag_features) == 0 ) { + throw( "Must call store with ditag_feature or list ref of ditags_features" ); + } + } elsif ($ditag_features) { + my @ditag_features; + push @ditag_features, $ditag_features; + $ditag_features = \@ditag_features; + } else { + throw( "Must call store with ditag_feature or list ref of ditag_features." ); + } + + my $db = $self->db() or throw "Couldn t get database connection."; + + my $sth1 = $self->prepare( "INSERT INTO ditag_feature( ditag_id, seq_region_id, seq_region_start, + seq_region_end, seq_region_strand, analysis_id, hit_start, hit_end, + hit_strand, cigar_line, ditag_side, ditag_pair_id ) + VALUES( ?,?,?,?,?,?,?,?,?,?,?,? )" ); + my $sth2 = $self->prepare( "INSERT INTO ditag_feature( ditag_feature_ID, ditag_id, seq_region_id, + seq_region_start, seq_region_end, seq_region_strand, analysis_id, hit_start, + hit_end, hit_strand, cigar_line, ditag_side, ditag_pair_id ) + VALUES( ?,?,?,?,?,?,?,?,?,?,?,?,? )" ); +# my $sth3 = $self->prepare( "SELECT COUNT(*) FROM ditag_feature +# WHERE ditag_id = ?" ); + +TAG: + foreach my $ditag_feature (@$ditag_features) { + + if ( !ref $ditag_feature || !$ditag_feature->isa("Bio::EnsEMBL::Map::DitagFeature") ) { + throw( "Object must be an Ensembl DitagFeature, " + . "not a " . ref($ditag_feature) ); + } + if ( $ditag_feature->is_stored($db) ) { + warning( "DitagFeature " . $ditag_feature->dbID . + " is already stored in this database,". + " maybe you need to use the update() method?" ); + next TAG; + } + if(!$ditag_feature->ditag_id or !($self->db->get_DitagAdaptor->fetch_by_dbID($ditag_feature->ditag_id))){ + throw("DitagFeature must be supplied with the id of a corresponding Ditag object."); + } + if(!$ditag_feature->ditag or !$ditag_feature->ditag->isa("Bio::EnsEMBL::Map::Ditag")){ + throw("DitagFeature must be linked to a valid Ditag object."); + } + + +# #check if more than x tags with this ditag id exist +# $sth3->execute( $ditag_feature->ditag_id ); +# my ($num) = $sth3->fetchrow_array(); +# if ( ($num) and ($num > 1) ) { +# warning( "There are already at least 2 DitagFeatures relating to Ditag ". +# $ditag->ditag_id." stored in this database." ); +# if ( $num > 4 ) { +# warning( "not storing" ); +# next TAG; +# } +# } + + if ( $ditag_feature->dbID ) { + $sth2->bind_param( 1, $ditag_feature->dbID, SQL_INTEGER ); + $sth2->bind_param( 2, $ditag_feature->ditag_id, SQL_INTEGER ); + $sth2->bind_param( 3, ($ditag_feature->slice->get_seq_region_id), SQL_INTEGER ); + $sth2->bind_param( 4, $ditag_feature->start, SQL_INTEGER ); + $sth2->bind_param( 5, $ditag_feature->end, SQL_INTEGER ); + $sth2->bind_param( 6, $ditag_feature->strand, SQL_VARCHAR ); + $sth2->bind_param( 7, $ditag_feature->analysis->dbID, SQL_INTEGER ); + $sth2->bind_param( 8, $ditag_feature->hit_start, SQL_INTEGER ); + $sth2->bind_param( 9, $ditag_feature->hit_end, SQL_INTEGER ); + $sth2->bind_param( 10, $ditag_feature->hit_strand, SQL_VARCHAR ); + $sth2->bind_param( 11, $ditag_feature->cigar_line, SQL_VARCHAR ); + $sth2->bind_param( 12, $ditag_feature->ditag_side, SQL_VARCHAR ); + $sth2->bind_param( 13, $ditag_feature->ditag_pair_id, SQL_VARCHAR ); + $sth2->execute(); + } + else{ + $sth1->bind_param( 1, $ditag_feature->ditag_id, SQL_INTEGER ); + $sth1->bind_param( 2, ($ditag_feature->slice->get_seq_region_id), SQL_INTEGER ); + $sth1->bind_param( 3, $ditag_feature->start, SQL_INTEGER ); + $sth1->bind_param( 4, $ditag_feature->end, SQL_INTEGER ); + $sth1->bind_param( 5, $ditag_feature->strand, SQL_VARCHAR ); + $sth1->bind_param( 6, $ditag_feature->analysis->dbID, SQL_INTEGER ); + $sth1->bind_param( 7, $ditag_feature->hit_start, SQL_INTEGER ); + $sth1->bind_param( 8, $ditag_feature->hit_end, SQL_INTEGER ); + $sth1->bind_param( 9, $ditag_feature->hit_strand, SQL_VARCHAR ); + $sth1->bind_param( 10, $ditag_feature->cigar_line, SQL_VARCHAR ); + $sth1->bind_param( 11, $ditag_feature->ditag_side, SQL_VARCHAR ); + $sth1->bind_param( 12, $ditag_feature->ditag_pair_id, SQL_VARCHAR ); + $sth1->execute(); + my $dbID = $sth1->{'mysql_insertid'}; + $ditag_feature->dbID($dbID); + $ditag_feature->adaptor($self); + } + + } +} + + +=head2 batch_store + + Arg [1] : (Array ref of) Bio::EnsEMBL::Map::DitagFeatures + Arg [2] : bool have_dbIDs + Example : $ditagfeature_adaptor->batch_store(\@ditag_features); + Description: Stores a list of ditagFeatures in this database. + DitagFeatures are expected to have no dbID yet unless flag "have_dbIDs" is true. + They are inserted in one combined INSERT for better performance. + Returntype : none + Exceptions : thrown if not all data needed for storing is given for the + ditag features + Caller : general + Status : At Risk + +=cut + +sub batch_store { + my ( $self, $ditag_features, $have_dbIDs ) = @_; + + my @good_ditag_features; + my ($sql, $sqladd); + my $inserts = 0; + + if ( ref $ditag_features eq 'ARRAY' ) { + if ( scalar(@$ditag_features) == 0 ) { + throw( "Must call store with ditag_feature or list ref of ditag_features." ); + } + } elsif ($ditag_features) { + my @ditag_features; + push @ditag_features, $ditag_features; + $ditag_features = \@ditag_features; + } else { + throw( "Must call store with ditag_feature or list ref of ditag_features." ); + } + + my $db = $self->db() or throw "Couldn t get database connection."; + + #check whether it s a DitagFeature object and is not stored already + foreach my $ditag_feature (@$ditag_features) { + + if ( !ref $ditag_feature || !$ditag_feature->isa("Bio::EnsEMBL::Map::DitagFeature") ) { + throw( "Object must be an Ensembl DitagFeature, " + . "not a " . ref($ditag_feature) ); + } + if(!$ditag_feature->ditag_id or !($self->db->get_DitagAdaptor->fetch_by_dbID($ditag_feature->ditag_id))){ + throw("DitagFeature must be supplied with the id of a corresponding Ditag object."); + } + + if(!$ditag_feature->ditag or !$ditag_feature->ditag->isa("Bio::EnsEMBL::Map::Ditag")){ + throw("DitagFeature must be linked to a valid Ditag object."); + } + if ( $ditag_feature->is_stored($db) ) { + warning( "DitagFeature " . $ditag_feature->dbID + . " is already stored in this database." ); + next; + } + push(@good_ditag_features, $ditag_feature); + } + $ditag_features = undef; + + #create batch INSERT + if($have_dbIDs){ + $sql = "INSERT INTO ditag_feature ( ditag_feature_id, ditag_id, seq_region_id, seq_region_start, ". + "seq_region_end, seq_region_strand, analysis_id, hit_start, hit_end, ". + "hit_strand, cigar_line, ditag_side, ditag_pair_id ) VALUES "; + foreach my $ditag_feature (@good_ditag_features) { + $sqladd = ""; + if($inserts){ $sqladd = ", " } + $sqladd .= "(". $ditag_feature->ditag_feature_id.", ".$ditag_feature->ditag_id.", ". + ($ditag_feature->slice->get_seq_region_id).", ". $ditag_feature->start.", ". + $ditag_feature->end.", '".$ditag_feature->strand."', ".$ditag_feature->analysis->dbID.", ". + $ditag_feature->hit_start.", ".$ditag_feature->hit_end.", '".$ditag_feature->hit_strand. + "', '".$ditag_feature->cigar_line."', '".$ditag_feature->ditag_side."', ". + $ditag_feature->ditag_pair_id.")"; + $sql .= $sqladd; + $inserts++; + } + } + else{ + $sql = "INSERT INTO ditag_feature ( ditag_id, seq_region_id, seq_region_start, ". + "seq_region_end, seq_region_strand, analysis_id, hit_start, hit_end, ". + "hit_strand, cigar_line, ditag_side, ditag_pair_id ) VALUES "; + foreach my $ditag_feature (@good_ditag_features) { + $sqladd = ""; + if($inserts){ $sqladd = ", " } + $sqladd .= "(". $ditag_feature->ditag_id.", ".($ditag_feature->slice->get_seq_region_id).", ". + $ditag_feature->start.", ".$ditag_feature->end.", '".$ditag_feature->strand."', ". + $ditag_feature->analysis->dbID.", ".$ditag_feature->hit_start.", ".$ditag_feature->hit_end. + ", '".$ditag_feature->hit_strand."', '".$ditag_feature->cigar_line."', '". + $ditag_feature->ditag_side."', ".$ditag_feature->ditag_pair_id.")"; + $sql .= $sqladd; + $inserts++; + } + } + + #STORE + if($inserts){ + print STDERR "\nHave $inserts Features.\n"; + eval{ + $db->dbc->do($sql); + }; + if($@){ + warning("Problem inserting ditag feature batch!".$@."\n"); + } + } + else{ + warn "Nothing stored!"; + } + +} + + +=head2 update + + Arg [1] : ditagFeature to update + Description: update an existing ditagFeature with new values + Returntype : 1 on success + Status : At Risk + +=cut + +sub update { + my ($self, $ditagFeature) = @_; + + my $sth = $self->prepare( "UPDATE ditag_feature + SET ditag_id=?, seq_region_id=?, seq_region_start=?, seq_region_end=?, + seq_region_strand=?, analysis_id=?, hit_start=?, hit_end=?, hit_strand=?, + cigar_line=?, ditag_side=?, ditag_pair_id=? + where ditag_feature_id=?;" ); + + $sth->bind_param(1, $ditagFeature->ditag_id, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->seq_region_id, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->seq_region_start, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->seq_region_end, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->seq_region_strand, SQL_TINYINT); + $sth->bind_param(1, $ditagFeature->hit_start, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->hit_end, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->hit_strand, SQL_TINYINT); + $sth->bind_param(1, $ditagFeature->cigar_line, SQL_LONGVARCHAR); + $sth->bind_param(1, $ditagFeature->ditag_side, SQL_VARCHAR); + $sth->bind_param(1, $ditagFeature->ditag_pair_id, SQL_INTEGER); + $sth->bind_param(1, $ditagFeature->dbID, SQL_INTEGER); + + my $result =$sth->execute(); + + return $result; +} + + +=head2 list_dbIDs + + Args : None + Example : my @feature_ids = @{$dfa->list_dbIDs()}; + Description: Gets an array of internal IDs for all DitagFeature objects in + the current database. + Arg[1] : int. not 0 for the ids to be sorted by the seq_region. + Returntype : List of ints + Exceptions : None + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = shift; + + return $self->_list_dbIDs('ditag_feature', undef, $ordered); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/MarkerAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/MarkerAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,462 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DBSQL::MarkerAdaptor + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Provides database interaction for the Bio::EnsEMBL::Map::Marker object + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DBSQL::MarkerAdaptor; + +use strict; + +use vars ('@ISA'); + +use Bio::EnsEMBL::Map::Marker; +use Bio::EnsEMBL::Map::MapLocation; +use Bio::EnsEMBL::Map::MarkerSynonym; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(warning); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + + + +=head2 fetch_all + + Arg [1] : none + Example : @all_markers = @{$marker_adaptor->fetch_all}; + Description: Retrieves all markers from the database + Returntype : listref of Bio::EnsEMBL::Map::Markers + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all { + my $self = shift; + my $dbID = shift; + + my $sth = $self->prepare("SELECT m.marker_id, m.priority, m.left_primer, + m.right_primer, m.type, + m.min_primer_dist, m.max_primer_dist, + ms.marker_synonym_id, ms.name, ms.source + FROM marker m + LEFT JOIN marker_synonym ms + ON ms.marker_synonym_id = + m.display_marker_synonym_id"); + + $sth->execute; + + my( $marker_id, $priority, $left_primer, $right_primer, $type, + $min_pdist, $max_pdist, $ms_id, $ms_name, $ms_src); + + $sth->bind_columns(\$marker_id, \$priority, + \$left_primer, \$right_primer, \$type, \$min_pdist, \$max_pdist, + \$ms_id, \$ms_name, \$ms_src); + + my @out; + while($sth->fetch) { + #create a display marker synonym for each marker created, if one is defined + my $synonym; + if($ms_id) { + $synonym = Bio::EnsEMBL::Map::MarkerSynonym->new + ($ms_id, $ms_src, $ms_name); + } + + push @out, Bio::EnsEMBL::Map::Marker->new + ($marker_id, $self, $left_primer, $right_primer, $min_pdist, $max_pdist, + $priority, $type, $synonym); + } + + return \@out; +} + + + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + The internal identifier of the Marker to retrieve + Example : $marker = $marker_adaptor->fetch_by_dbID(123); + Description: Retrieves a marker object from the database via its internal + identifier. + Returntype : Bio::EnsEMBL::Map::Marker + Exceptions : thrown if no marker with $dbID is present in the database + Caller : general + Status : stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + $self->throw('dbID argument is required') unless($dbID); + + my $sth = $self->prepare("SELECT m.marker_id, m.display_marker_synonym_id, + m.priority, m.left_primer, m.right_primer, + m.type, + m.min_primer_dist, m.max_primer_dist, + ms.marker_synonym_id, + ms.source, ms.name + FROM marker m, marker_synonym ms + WHERE m.marker_id = ? + AND ms.marker_id = m.marker_id"); + + $sth->execute($dbID); + + my( $marker_id, $display_ms_id, $priority, $left_primer, $right_primer, + $type, $min_pdist, $max_pdist, $ms_id, $ms_src, $ms_name); + + $sth->bind_columns(\$marker_id, \$display_ms_id, \$priority, + \$left_primer, \$right_primer, \$type, + \$min_pdist, \$max_pdist, + \$ms_id, \$ms_src, \$ms_name); + + my $display_synonym; + my @synonyms; + while($sth->fetch) { + #create a new synonym for each row + my $s = new Bio::EnsEMBL::Map::MarkerSynonym->new($ms_id, $ms_src, + $ms_name); + $display_synonym = $s if($display_ms_id == $ms_id); + push @synonyms, $s; + } + + $sth->finish; + + unless($marker_id) { + $self-> warning("marker with dbID=[$dbID] not present in database"); + return undef; + } + + #now create the marker + return new Bio::EnsEMBL::Map::Marker->new( + $marker_id, $self, $left_primer, $right_primer,$min_pdist, $max_pdist, + $priority, $type,$display_synonym, \@synonyms); +} + + +=head2 fetch_all_by_synonym + + Arg [1] : string $synonym + An name of this marker + Arg [2] : (opional) string $source + The source of this name + Example : @markers = @{$marker_adaptor->fetch_all_by_synonym($id)}; + Description: Retrieves a list of markers with the synonym (alias) $synonym + and from source $source. In most cases the list will have a + single element, however it is possible that multiple markers + with the same synonym exist. + Returntype : listref of Bio::EnsEMBL::Map::Markers + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all_by_synonym { + my ($self, $synonym, $source) = @_; + + $self->throw("synonym argument is required") unless($synonym); + + my $q = "SELECT marker_id + FROM marker_synonym ms + WHERE ms.name = ?"; + + my @bind_vals = ($synonym); + + if($source) { + $q .= " AND ms.source = ?"; + push(@bind_vals, $source); + } + + my $sth = $self->prepare($q); + $sth->execute(@bind_vals); + + my @out = (); + my $marker_id; + my %seen; + + #fetch the markers and filter out duplictes + while(($marker_id) = $sth->fetchrow_array) { + next if $seen{$marker_id}; + + # some synonyms point to markers that don't exist, so only add genuine ones + my $marker = $self->fetch_by_dbID($marker_id); + push @out, $marker if ($marker); + $seen{$marker_id} = 1; + } + + $sth->finish; + + return \@out; +} + + + +=head2 fetch_attributes + + Arg [1] : Bio::EnsEMBL::Map::Marker $marker + Example : $marker_adaptor->fetch_attributes($marker); + Description: Fetches the marker_synonym and map_location attributes of + a marker. This is done so that these attributes can be + lazy-loaded on request. + Returntype : none + Exceptions : none + Caller : Bio::EnsEMBL::Map::Marker::marker + Status : stable + +=cut + +sub fetch_attributes { + my $self = shift; + my $marker = shift; + + my $m_id = $marker->dbID; + + $self->throw('Marker argument does not have a dbID') unless($m_id); + + # + # First Retrieve synonyms for this marker + # + $marker->flush_MarkerSynonyms; + + my $sth = $self->prepare("SELECT ms.marker_synonym_id, ms.source, ms.name + FROM marker_synonym ms + WHERE ms.marker_id = ?"); + + my @syns = (); + my ($ms_id, $ms_src, $ms_name); + + $sth->execute($m_id); + $sth->bind_columns(\$ms_id, \$ms_src, \$ms_name); + + while($sth->fetch) { + push @syns, Bio::EnsEMBL::Map::MarkerSynonym->new($ms_id,$ms_src,$ms_name); + } + $sth->finish; + + $marker->add_MarkerSynonyms(@syns) if(@syns); + + # + # Now retrieve map locations for this marker + # + $marker->flush_MapLocations; + + $sth = $self->prepare("SELECT mloc.chromosome_name, mloc.position, + mloc.lod_score, map.map_name, ms.name + FROM marker_map_location mloc, map map, + marker_synonym ms + WHERE mloc.marker_id = ? + AND map.map_id = mloc.map_id + AND ms.marker_synonym_id = mloc.marker_synonym_id"); + + my($chr_name, $pos, $lod, $mname, $name); + my @mlocs; + + $sth->execute($m_id); + + $sth->bind_columns(\$chr_name, \$pos, \$lod, \$mname, \$name); + + while($sth->fetch) { + push(@mlocs, Bio::EnsEMBL::Map::MapLocation->new($name, $mname, + $chr_name,$pos,$lod)); + } + + $sth->finish; + + $marker->add_MapLocations(@mlocs); +} + + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Map::Marker + Example : $marker_adaptor->store(@markers); + Description: Stores a list of markers in this database. + The dbID and adaptor of each marker will be set on successful + storing. + Returntype : 1 on success + Exceptions : thrown if not all data needed for storing is populated in the + marker + Caller : general + Status : stable + +=cut + +sub store { + my ($self, @markers) = @_; + + MARKER:foreach my $marker( @markers ){ + + if($marker->dbID){ + if($self->fetch_by_dbID($marker->dbID)){ + next MARKER; + } + } + # + # Sanity check + # + if(!$marker || + !ref($marker) || + !$marker->isa('Bio::EnsEMBL::Map::Marker')) { + $self->throw('Incorrect argument [$marker] to store. Expected ' . + 'Bio::EnsEMBL::Map::Marker'); + } + + # Don't store if already stored + if($marker->is_stored($self->db())) { + warning('Marker ['.$marker->dbID.'] is already stored in this DB.'); + next; + } + + # Get/test the display marker synonym + my $display_synonym = $marker->display_MarkerSynonym; + if(!$display_synonym || !ref($display_synonym) || + !$display_synonym->isa('Bio::EnsEMBL::Map::MarkerSynonym')) { + $self->throw('Cannot store Marker without an associated '. + 'display_MarkerSynonym'); + } + + # Store the Marker itself + my $q = qq( +INSERT INTO marker ( left_primer, right_primer, + min_primer_dist, max_primer_dist, + priority, type) + VALUES ( ?,?,?,?,?,?) ); + + my $sth = $self->prepare($q); + + $sth->execute( $marker->left_primer || '', + $marker->right_primer || '', + $marker->min_primer_dist || 0, + $marker->max_primer_dist || 0, + $marker->priority, + $marker->type ); + + my $dbID = $sth->{'mysql_insertid'}; + $marker->dbID($dbID); + $marker->adaptor($self); + + if(!$display_synonym->dbID) { + # Store synonym + $self->_store_MarkerSynonym($marker,$display_synonym); + } + my $display_synonym_id = $display_synonym->dbID || + $self->throw('display_MarkerSynonym must have dbID to store Marker'); + + # Update the marker with the display synonym + my $qup = qq( +UPDATE marker +SET display_marker_synonym_id = $display_synonym_id + WHERE marker_id = ? ); + my $sthup = $self->prepare($qup); + $sthup->execute($dbID); + + # Loop through all MarkerSynonyms and store if needed + foreach my $synonym( @{$marker->get_all_MarkerSynonyms} ){ + if(!$synonym->dbID) { + $self->_store_MarkerSynonym($marker,$synonym); + } + } + + # Loop through all MapLocations and store if needed + foreach my $loc( @{$marker->get_all_MapLocations} ){ + # Dunno how to implement this :( Just bomb out + $self->throw( 'Storing of MapLocation objects is not yet implemented' ) + } + + } + return 1; +} + + +=head2 _store_MarkerSynonym + + Arg [1] : Bio::EnsEMBL::Map::Marker + Arg [2] : Bio::EnsEMBL::Map::MarkerSynonym + Example : $marker_adaptor->_store_MarkerSynonym($marker,$ms); + Description: Stores a marker synonym attached to the marker in the database + The dbID of each MarkerSynonym will be set on successful + storing. + Returntype : dbID of the MarkerSynonym + Exceptions : thrown if not all data needed for storing is populated + Caller : $self->store + Status : stable + +=cut + +sub _store_MarkerSynonym{ + my $self = shift; + my $marker = shift; + my $ms = shift; + + # Sanity check + if(!$marker || !ref($marker) || + !$marker->isa('Bio::EnsEMBL::Map::Marker')) { + $self->throw("Incorrect argument [$marker] to _store_MarkerSynonym." . + "Expected Bio::EnsEMBL::Map::Marker"); + } + if(!$ms || !ref($ms) || + !$ms->isa('Bio::EnsEMBL::Map::MarkerSynonym')) { + $self->throw("Incorrect argument [$ms] to _store_MarkerSynonym." . + "Expected Bio::EnsEMBL::Map::MarkerSynonym"); + } + + # Don't store if already stored + if($ms->dbID) { + warning('MarkerSynonym ['.$ms->dbID.'] is already stored in this DB.'); + return; + } + + my $marker_id = $marker->dbID || + throw( "Marker has no dbID. Cannot store MarkerSynonym" ); + + # Store the synonym + my $q = qq( +INSERT INTO marker_synonym ( marker_id, source, name ) +VALUES ( ?,?,?) ); + + my $sth = $self->prepare($q); + + $sth->execute( $marker_id, + $ms->source, + $ms->name ); + + my $dbID = $sth->{'mysql_insertid'}; + $ms->dbID($dbID); + return $dbID; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/MarkerFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/MarkerFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,418 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DBSQL::MarkerFeatureAdaptor + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +This object is responisble for all database interaction involving marker +features including the fetching and storing of marker features. + +The bulk of this objects' methods are inherited from +Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DBSQL::MarkerFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::Map::MarkerFeature; +use Bio::EnsEMBL::Map::Marker; +use Bio::EnsEMBL::Map::MarkerSynonym; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + + +=head2 fetch_all_by_Marker + + Arg [1] : Bio::EnsEMBL::Map::Marker + Example : @ms = @{$marker_feature_adaptor->fetch_by_Marker($mrkr)}; + Description: Retrieves a list of MarkerFeatures for a given marker + Returntype : listref of Bio::EnsEMBL::MarkerFeatures + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all_by_Marker { + my $self = shift; + my $marker = shift; + + my $constraint = 'm.marker_id = ' . $marker->dbID; + + return $self->generic_fetch($constraint, @_); +} + +=head2 fetch_all_by_Slice_and_MarkerName + + Arg [1] : Bio::EnsEMBL::Slice $slice + Arg [2] : string marker name + Example : @ms = @{$marker_feature_adaptor->fetch_all_by_Slice_and_MarkerName($slice, $name)}; + Description: Retrieves a list of MarkerFeatures for a given marker name + Returntype : listref of Bio::EnsEMBL::Map::MarkerFeatures + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all_by_Slice_and_MarkerName { + my ($self, $slice, $name) = @_; + return unless $slice && $name; + + my $constraint = 'ms.name = "' . $name . '"'; + my $results = $self->fetch_all_by_Slice_constraint($slice, $constraint); + return $results; +} + + +=head2 fetch_all_by_Slice_and_priority + + Arg [1] : Bio::EnsEMBL::Slice $slice + Arg [2] : (optional) int $priority + Arg [3] : (optional) int $map_weight + Arg [3] : (optional) string $logic_name + Example : @feats = @{$mfa->fetch_all_by_Slice_and_priority($slice,80,2)}; + Description: Retrieves all marker features above a specified threshold + priority which overlap the provided slice, below a + a specified map_weight. + Returntype : listref of Bio::EnsEMBL::Map::MarkerFeatures in slice coords + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all_by_Slice_and_priority { + my ($self, $slice, $priority, $map_weight, @args) = @_; + + my $constraint = ''; + if(defined $priority) { + $constraint = "m.priority > $priority"; + } + + if(defined $map_weight) { + if($constraint) { + $constraint .= " AND mf.map_weight < $map_weight"; + } else { + $constraint = "mf.map_weight < $map_weight"; + } + } + + return $self->fetch_all_by_Slice_constraint($slice, $constraint, @args); +} + + + +=head2 fetch_all_by_RawContig_and_priority + + Description: DEPRECATED use fetch_all_by_Slice_and_priority instead + +=cut + +sub fetch_all_by_RawContig_and_priority { + deprecate('Use fetch_all_by_Slice_and_priority() instead.'); + fetch_all_by_Slice_and_priority(@_); +} + + +sub fetch_all_by_Slice_and_score { + my $self = shift; + throw('fetch_all_by_Slice_and_score should not be' . + 'used to fetch marker_features'); +} + +sub _columns { + my $self = shift; + + return ('mf.marker_feature_id', 'mf.marker_id', + 'mf.seq_region_id', 'mf.seq_region_start', 'mf.seq_region_end', + 'mf.analysis_id', 'mf.map_weight', + 'm.left_primer', 'm.right_primer', 'm.min_primer_dist', + 'm.max_primer_dist', 'm.priority', 'm.type', 'ms.marker_synonym_id', + 'ms.name', 'ms.source'); +} + +sub _tables { + my $self = shift; + + return (['marker_feature', 'mf'], #primary table + ['marker', 'm'], + ['marker_synonym', 'ms']); +} + +sub _left_join { + my $self = shift; + + return ( [ 'marker_synonym', + 'm.display_marker_synonym_id = ms.marker_synonym_id' ] ); +} + +sub _default_where_clause { + my $self = shift; + + return ('mf.marker_id = m.marker_id'); +} + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + my ($marker_feature_id, $marker_id, + $seq_region_id, $seq_region_start, $seq_region_end, + $analysis_id, $map_weight, + $left_primer, $right_primer, $min_primer_dist, $max_primer_dist, + $priority, $type, $ms_id, $ms_name, $ms_source); + + #warning: ordering depends on _columns function implementation + $sth->bind_columns(\$marker_feature_id, \$marker_id, + \$seq_region_id, \$seq_region_start, \$seq_region_end, + \$analysis_id, \$map_weight, + \$left_primer, \$right_primer, \$min_primer_dist, \$max_primer_dist, + \$priority, \$type, \$ms_id, \$ms_name, \$ms_source); + + my @out = (); + + my %marker_cache; + my %slice_hash; +# my %sr_name_hash; + my %sr_cs_hash; + my %analysis_cache; + my $marker_adp = $self->db->get_MarkerAdaptor; + my $sa = $self->db->get_SliceAdaptor; + my $analysis_adp = $self->db->get_AnalysisAdaptor; + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + } + + FEATURE: while($sth->fetch) { + #create a new marker unless this one has been seen already + my $marker; + if(!($marker = $marker_cache{$marker_id})) { + #create a new marker synonym for the display synonym (if defined) + my $ms; + if($ms_id) { + $ms = Bio::EnsEMBL::Map::MarkerSynonym->new + ($ms_id, $ms_source, $ms_name); + } + + #create a new marker + $marker = Bio::EnsEMBL::Map::Marker->new + ($marker_id, $marker_adp, + $left_primer, $right_primer, $min_primer_dist, $max_primer_dist, + $priority, $type, $ms); + $marker_cache{$marker_id} = $marker; + } + + #get the slice object + my $slice = $slice_hash{$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{$seq_region_id} = $slice; +# $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + #retrieve analysis + my $analysis; + unless($analysis = $analysis_cache{$analysis_id}) { + $analysis = $analysis_adp->fetch_by_dbID($analysis_id); + $analysis_cache{$analysis_id} = $analysis; + } + + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { +# my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + ($seq_region_id,$seq_region_start,$seq_region_end) = + $mapper->fastmap($slice->seq_region_name(), $seq_region_start, $seq_region_end, 0, $sr_cs); + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to + $slice = $slice_hash{"$seq_region_id"} ||= + $sa->fetch_by_seq_region_id($seq_region_id); + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { + next FEATURE; + } + } + $slice = $dest_slice; + } + + #now create a new marker_feature using the marker + push @out, Bio::EnsEMBL::Map::MarkerFeature->new + ($marker_feature_id, $self, + $seq_region_start, $seq_region_end, $slice, + $analysis, $marker_id, $map_weight, $marker); + } + + return \@out; +} + + + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Map::MarkerFeature + Example : $marker_feature_adaptor->store(@marker_features); + Description: Stores a list of marker features in this database. + The dbID and adaptor of each marker will be set on successful + storing. + Returntype : none + Exceptions : thrown if not all data needed for storing is populated in the + marker features + Caller : general + Status : stable + +=cut + +sub store { + my ($self, @mfs) = @_; + + foreach my $mf (@mfs) { + + # + # Sanity checking! + # + if(!ref($mf) || !$mf->isa('Bio::EnsEMBL::Map::MarkerFeature')) { + $self->throw("Incorrect argument [$mf] to store. Expected " . + 'Bio::EnsEMBL::Map::MarkerFeature'); + } + + #don't store this feature if it has already been stored + if($mf->is_stored($self->db())) { + warning('MarkerFeature ['.$mf->dbID.'] is already stored in this DB.'); + next; + } + + # Get/test the marker + my $marker = $mf->marker; + if(!$marker || !ref($marker) || + !$marker->isa('Bio::EnsEMBL::Map::Marker')) { + throw('Cannot store MarkerFeature without an associated Marker'); + } + + #store the marker if it has not been stored yet + if(!$marker->is_stored($self->db())) { + my $marker_adaptor = $self->db->get_adaptor('Marker'); + $marker_adaptor->store($marker); + } + my $marker_id = $marker->dbID || + throw('Associated Marker must have dbID to store MarkerFeature'); + + # Get/test the analysis + my $analysis = $mf->analysis; + if(!$analysis || !ref($analysis) || + !$analysis->isa('Bio::EnsEMBL::Analysis')) { + throw('Cannot store MarkerFeature without an associated Analysis'); + } + + #store the analysis if it has not been stored yet + if(!$analysis->is_stored($self->db())) { + my $analysis_adaptor = $self->db->get_adaptor('Analysis'); + $analysis_adaptor->store($mf->analysis()); + } + my $analysis_id = $analysis->dbID || + throw('Associated Analysis must have dbID to store MarkerFeature'); + + # Store the marker feature itself + my $original = $mf; + my $seq_region_id; + ($mf, $seq_region_id) = $self->_pre_store($mf); + + my $sth = + $self->prepare("INSERT INTO marker_feature (marker_id, + seq_region_id, seq_region_start, seq_region_end, + analysis_id, map_weight) + VALUES (?, ?, ?, ?, ?, ?)"); + $sth->execute($marker_id, + $seq_region_id, $mf->start, $mf->end, + $analysis_id, $mf->map_weight || 0); + + my $dbID = $sth->{'mysql_insertid'}; + + $original->dbID($dbID); + $original->adaptor($self); + } +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/QtlAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/QtlAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,271 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DBSQL::QtlAdaptor + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +This module is responsible of retrieving QTLs from the database. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DBSQL::QtlAdaptor; + +use strict; + +use Bio::EnsEMBL::Map::Qtl; +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 fetch_by_dbID + + Arg 1 : int $dbID + Example : none + Description: get by database internal identifier + Returntype : Bio::EnsEMBL::Map::Qtl + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + return unless $dbID; + + my $res = $self->_generic_fetch( [ "q.qtl_id = $dbID" ] ); + return $res->[0]; +} + + +=head2 fetch_all + + Example : none + Description: get all the qtl's + Returntype : listref Bio::EnsEMBL::Map::Qtl + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all { + my $self = shift; + $self->_generic_fetch( [] ); +} + + + + +=head2 fetch_all_by_trait + + Arg [1] : string $trait + The phenotype we are looking for + Example : none + Description: get by phenotype/trait string + Returntype : listref Bio::EnsEMBL::Map::Qtl + Exceptions : none + Caller : general + Status : stable + +=cut + + +sub fetch_all_by_trait { + my $self = shift; + my $trait = shift; + + return [] unless $trait; + + return $self->_generic_fetch( [ "q.trait = '$trait'" ] ); +} + + + + +=head2 fetch_all_by_source_database + + Arg 1 : string $database_name + Name of the database that provides the Qtl information + Arg [2] : string $database_primary_id + The primary id of the qtl in that database + Example : none + Description: retrieve Qtl by given information + Returntype : listref Bio::EnsEMBL::Map::Qtl + Exceptions : none + Caller : general + Status : stable + +=cut + +sub fetch_all_by_source_database { + + my $self = shift; + my $database_name = shift; + my $database_primary_id = shift; + + return [] unless $database_name; + + my @conditions; + + if( $database_name ) { + push( @conditions, "q.source_database=\"$database_name\"" ); + } + + if( $database_primary_id ) { + push( @conditions, "q.source_primary_id=\"$database_primary_id\"" ); + } + + return $self->_generic_fetch( \@conditions ); +} + + +sub _generic_fetch { + my $self = shift; + my $conditions = shift; + + my $where = ''; + + if( @$conditions ) { + $where = "WHERE ".join( " and ", @$conditions ); + } + + my $query = "SELECT ". + join( ", ", $self->_columns() ). + " FROM qtl q LEFT JOIN qtl_synonym qs ON q.qtl_id = qs.qtl_id ". + $where; + + my $sth = $self->prepare( $query ); + $sth->execute(); + + return $self->_obj_from_sth( $sth ); +} + + +sub _columns { + return ( 'q.qtl_id','qs.source_database','qs.source_primary_id', + 'q.trait','q.lod_score','q.flank_marker_id_1', + 'q.flank_marker_id_2','q.peak_marker_id' ); +} + + +sub _obj_from_sth { + my $self = shift; + my $sth = shift; + + my ( $qtl_id, $source_database, + $source_primary_id, $trait, $lod_score, $flank_marker_id_1, + $flank_marker_id_2, $peak_marker_id ); + + #warning: ordering depends on _columns function implementation + $sth->bind_columns( \$qtl_id, + \$source_database, \$source_primary_id, \$trait, + \$lod_score, \$flank_marker_id_1, + \$flank_marker_id_2, \$peak_marker_id ); + + my @out = (); + my %already_seen; + + while( $sth->fetch()) { + + #multiple columns with same qtl are multiple synonyms + if(my $qtl = $already_seen{$qtl_id}) { + $qtl->add_synonym($source_database, $source_primary_id); + next; + } + + my $mad = $self->db()->get_MarkerAdaptor(); + + my $flank_marker_1 = $flank_marker_id_1 ? $mad->fetch_by_dbID( $flank_marker_id_1 ) : undef ; + my $flank_marker_2 = $flank_marker_id_2 ? $mad->fetch_by_dbID( $flank_marker_id_2 ) : undef; + my $peak_marker = $peak_marker_id ? $mad->fetch_by_dbID( $peak_marker_id ) : undef; + + my $qtl = Bio::EnsEMBL::Map::Qtl->new + ( + $qtl_id, + $self->db->get_QtlAdaptor(), + $flank_marker_1, + $peak_marker, + $flank_marker_2, + $trait, + $lod_score, + {$source_database => $source_primary_id} + ); + + push @out, $qtl; + $already_seen{$qtl_id} = $qtl; + } + + return \@out; +} + + + +=head2 list_traits + + Args : none + Example : none + Description: list of all the different traits + Returntype : listref string + Exceptions : none + Caller : general + Status : stable + +=cut + + + +sub list_traits { + my $self = shift; + + my $sth = $self->prepare( " + SELECT DISTINCT trait + FROM qtl q + " ); + + my $res = []; + + $sth->execute(); + push ( @$res , + map { $_->[0] } @{$sth->fetchall_arrayref()} + ); + + return $res; +} + + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/QtlFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DBSQL/QtlFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,181 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DBSQL::QtlFeatureAdaptor + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + +This module is responsible of retrieving QtlFeatures (and their +associated Qtls) from the database. + +The bulk of this objects' methods are inherited from +Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DBSQL::QtlFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::Map::Qtl; +use Bio::EnsEMBL::Map::QtlFeature; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + + +=head2 fetch_all_by_Qtl + + Arg [1] : Bio::EnsEMBL::Map::Qtl + Example : none + Description: Retrieves a list of QtlFeatures for a given Qtl + Returntype : listref of Bio::EnsEMBL::QtlFeatures + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Qtl { + my $self = shift; + my $qtl = shift; + + my $constraint = 'q.qtl_id = ' . $qtl->dbID; + + return $self->generic_fetch($constraint, @_); +} + + + + +sub _columns { + my $self = shift; + + return ( 'qf.seq_region_id', 'qf.seq_region_start', 'qf.seq_region_end', + 'q.qtl_id', + 'qf.analysis_id', + 'qs.source_database', 'qs.source_primary_id', + 'q.trait', 'q.lod_score', 'q.flank_marker_id_1', + 'q.flank_marker_id_2', 'q.peak_marker_id' ); +} + +sub _tables { + my $self = shift; + + return (['qtl_feature', 'qf'], #primary table + ['qtl', 'q'], + ['qtl_synonym', 'qs']); +} + +sub _left_join { + return ( [ 'qtl_synonym', 'q.qtl_id = qs.qtl_id' ] ); +} + +sub _default_where_clause { + my $self = shift; + + return ('qf.qtl_id = q.qtl_id'); +} + + +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my ( $seq_region_id, $seq_region_start, $seq_region_end, $qtl_id, + $analysis_id, $source_database, + $source_primary_id, $trait, $lod_score, $flank_marker_id_1, + $flank_marker_id_2, $peak_marker_id ); + + #warning: ordering depends on _columns function implementation + $sth->bind_columns( \$seq_region_id, \$seq_region_start, \$seq_region_end, + \$qtl_id, \$analysis_id, + \$source_database, \$source_primary_id, \$trait, + \$lod_score, \$flank_marker_id_1, + \$flank_marker_id_2, \$peak_marker_id ); + + my @out = (); + my %already_seen; + + my $mad = $self->db()->get_MarkerAdaptor(); + my $aad = $self->db()->get_AnalysisAdaptor(); + my $sad = $self->db()->get_SliceAdaptor(); + + while( $sth->fetch()) { + + my $flank_marker_1 = $flank_marker_id_1 ? + $mad->fetch_by_dbID( $flank_marker_id_1 ) : + undef; + my $flank_marker_2 = $flank_marker_id_2 ? + $mad->fetch_by_dbID( $flank_marker_id_2 ) : + undef; + my $peak_marker = $peak_marker_id ? + $mad->fetch_by_dbID( $peak_marker_id ) : + undef; + + my $analysis = $aad->fetch_by_dbID( $analysis_id ); + + my $slice = $sad->fetch_by_seq_region_id($seq_region_id); + + #rows with the same qtl contain additional synonyms of the qtl + if(my $qtl = $already_seen{$qtl_id}) { + $qtl->add_synonym($source_database, $source_primary_id); + next; + } + + my $qtl = Bio::EnsEMBL::Map::Qtl->new + ( + $qtl_id, + $self->db->get_QtlAdaptor(), + $flank_marker_1, + $peak_marker, + $flank_marker_2, + $trait, + $lod_score, + {$source_database => $source_primary_id} + ); + + $already_seen{$qtl_id} = $qtl; + + #now create a new marker_feature using the marker + push @out, Bio::EnsEMBL::Map::QtlFeature->new + ($self, + $slice, + $seq_region_start, + $seq_region_end, + $qtl, + $analysis); + } + + return \@out; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/Ditag.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/Ditag.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,229 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::Ditag + +=head1 SYNOPSIS + + my $feature = Bio::EnsEMBL::Map::Ditag->new( + -dbID => $tag_id, + -name => $name, + -type => $type, + -tag_count => $tag_count, + -sequence => $sequence, + -adaptor => $dbAdaptor + ); + +=head1 DESCRIPTION + +Represents an unmapped ditag object in the EnsEMBL database. +Corresponds to original tag containing the full sequence. This can be +a single piece of sequence like CAGE tags or a ditag with concatenated +sequence from 5' and 3' end like GIS or GSC tags. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::Ditag; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [1] : (optional) int $dbID + Arg [2] : (optional) string name + Arg [3] : (optional) string type + Arg [4] : (optional) int tag_count + Arg [5] : (optional) string sequence + Arg [6] : (optional) Bio::EnsEMBL::Map::DBSQL::DitagAdaptor $adaptor + + Description: Creates a new ditag + Returntype : Bio::EnsEMBL::Map::Ditag + Exceptions : none + Caller : general + +=cut + +sub new { + my ($caller, @args) = @_; + my ($dbID, $name, $type, $tag_count, $sequence, $adaptor) = rearrange( + [ 'DBID', 'NAME', 'TYPE', 'TAG_COUNT', 'SEQUENCE', 'ADAPTOR' ], @args); + my $class = ref($caller) || $caller; + + if(!$name or !$type or !$sequence) { + throw('Missing information for Ditag object: + Bio::EnsEMBL::Map::Ditag->new ( + -dbID => $tag_id, + -name => $name, + -type => $type, + -tag_count => $tag_count, + -sequence => $sequence, + -adaptor => $dbAdaptor + );'); + } + + if(!$tag_count){ $tag_count = 0; } + + if(!($sequence =~ /^[ATCGN]+$/i)){ + throw('ditag sequence contains non-standard characters: '.$sequence); + } + + my $self = bless( {'dbID' => $dbID, + 'name' => $name, + 'type' => $type, + 'tag_count' => $tag_count, + 'sequence' => $sequence + }, $class); + + $self->adaptor($adaptor); + return $self; +} + +=head2 name + + Arg [1] : (optional) string $type + Example : $type = $ditag->name; + Description: Getter/Setter for the name of a ditag + Returntype : text + Caller : general + +=cut + +sub name { + my $self = shift; + + if(@_) { + $self->{'name'} = shift; + } + + return $self->{'name'}; +} + +=head2 dbID + + Arg [1] : (optional) int id + Example : $ditag_id = $ditag->dbID; + Description: Getter/Setter for the dbID of a ditag + Returntype : int + Caller : general + +=cut + +sub dbID { + my $self = shift; + + if(@_) { + $self->{'dbID'} = shift; + } + + return $self->{'dbID'}; +} + + +=head2 type + + Arg [1] : (optional) string $type + Example : $type = $ditag->type; + Description: Getter/Setter for the type of a ditag + Returntype : text + Caller : general + +=cut + +sub type { + my $self = shift; + + if(@_) { + $self->{'type'} = shift; + } + + return $self->{'type'}; +} + +=head2 tag_count + + Arg [1] : (optional) string $tag_count + Example : $type = $ditag->tag_count; + Description: Getter/Setter for the tag_count of a ditag + Returntype : int + Caller : general + +=cut + +sub tag_count { + my $self = shift; + + if(@_) { + $self->{'tag_count'} = shift; + } + + return $self->{'tag_count'}; +} + +=head2 sequence + + Arg [1] : (optional) string $sequence + Example : $sequence = $ditag->sequence; + Description: Getter/Setter for the sequence of a ditag + Returntype : text + Caller : general + +=cut + +sub sequence { + my $self = shift; + + if(@_) { + $self->{'sequence'} = shift; + } + + return $self->{'sequence'}; +} + + +=head2 get_ditagFeatures + + Arg : none + Example : @features = @{$ditag->get_ditagFeatures}; + Description: Fetch ditag_features created from this ditag + Returntype : listref of Bio::EnsEMBL::Map::DitagFeature + Caller : general + +=cut + +sub get_ditagFeatures { + my $self = shift; + + return $self->adaptor->db->get_adaptor("ditagFeature") + ->fetch_all_by_ditagID($self->dbID); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/DitagFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/DitagFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,558 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::DitagFeature + +=head1 SYNOPSIS + +my $feature = Bio::EnsEMBL::Map::DitagFeature->new( + -slice => $slice, + -start => $qstart, + -end => $qend, + -strand => $qstrand, + -hit_start => $tstart, + -hit_end => $tend, + -hit_strand => $tstrand, + -ditag_id => $ditag_id, + -ditag_side => $ditag_side, + -ditag_pair_id => $ditag_pair_id, + -cigar_line => $cigar_line, + -analysis => $analysis, +); + +=head1 DESCRIPTION + +Represents a mapped ditag object in the EnsEMBL database. These are +the original tags separated into start ("L") and end ("R") parts if +applicable, successfully aligned to the genome. Two DitagFeatures +usually relate to one parent Ditag. Alternatively there are CAGE tags +e.g. which only have a 5\'tag ("F"). + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::DitagFeature; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +@ISA = qw(Bio::EnsEMBL::Feature); + +=head2 new + + Arg [1] : (optional) int dbID + Arg [2] : (optional) Bio::EnsEMBL::DitagFeatureAdaptor $adaptor + Arg [3] : int start + Arg [4] : int end + Arg [5] : int strand + Arg [6] : Bio::EnsEMBL::Slice $slice + Arg [7] : (optional) Bio::EnsEMBL::Analysis + Arg [8] : int hit_start + Arg [9] : int hit_end + Arg [10] : int hit_strand + Arg [11] : int ditag_id + Arg [12] : string ditag_side + Arg [13] : (optional) sring cigar_line + Arg [14] : (optional) int ditag_pair_id + Arg [15] : (optional) int tag_count, only used for imported mappings where + identical positions where collapsed into into one feature. + Default: 1 + Arg [16] : (optional) ditag object + + Example : $ditag = Bio::EnsEMBL::Map::DitagFeature->new + (-dbID => 123, -adaptor => $adaptor, ...); + Description: Creates a new DitagFeature + Returntype : Bio::EnsEMBL::Map::DitagFeature + Caller : general + Status : At Risk + +=cut + +sub new { + my ($caller, @args) = @_; + my ( $dbID, $adaptor, $start, $end, $strand, $slice, $analysis, $hit_start, $hit_end, + $hit_strand, $ditag_id, $ditag_side, $cigar_line, $ditag_pair_id, $tag_count, $ditag ) = + rearrange( [ 'dbid', 'adaptor' ,'start', 'end', 'strand', 'slice', 'analysis', 'hit_start', + 'hit_end', 'hit_strand', 'ditag_id', 'ditag_side', 'cigar_line', 'ditag_pair_id' ,'tag_count', 'ditag'], + @args ); + my $class = ref($caller) || $caller; + + if($analysis) { + if(!ref($analysis) || !$analysis->isa('Bio::EnsEMBL::Analysis')) { + throw('-ANALYSIS argument must be a Bio::EnsEMBL::Analysis not '. + $analysis); + } + } + if(defined($strand)) { + if(!($strand =~ /^-?\d$/) or !($strand == 1) && !($strand == -1) && !($strand == 0)) { + throw('-STRAND argument must be 1, -1, or 0'); + } + } + if(defined($hit_strand)) { + if(!($hit_strand == 1) && !($hit_strand == -1) && !($hit_strand == 0)) { + throw('-HIT_STRAND argument must be 1, -1, or 0 not '.$hit_strand); + } + } + if(defined($start) && defined($end)) { + if($end+1 < $start) { + throw('Start must be less than or equal to end+1.'); + } + } + else{ + throw('Need start and end location.'); + } + if(!(defined($hit_start) && defined($hit_end))) { + throw('Need hit start and hit end location.'); + } + if(!defined($tag_count) or (!$tag_count =~ /^[\d]+$/)){ + $tag_count = 1; + } + + my $self = bless( {'dbID' => $dbID, + 'analysis' => $analysis, + 'slice' => $slice, + 'start' => $start, + 'end' => $end, + 'strand' => $strand, + 'hit_start' => $hit_start, + 'hit_end' => $hit_end, + 'hit_strand' => $hit_strand, + 'ditag_id' => $ditag_id, + 'ditag_pair_id' => $ditag_pair_id, + 'ditag_side' => $ditag_side, + 'cigar_line' => $cigar_line, + 'tag_count' => $tag_count, + 'ditag' => $ditag, + }, $class); + + $self->adaptor($adaptor); + return $self; +} + + +=head2 fetch_ditag + + Description: Deprecated, use ditag() instead + +=cut + +sub fetch_ditag { + throw("Deprecated method, please use ditag() instead.\n") +} + + +=head2 ditag + + Arg [1] : (optional) ditag object + Description: Get/Set the ditag object of this DitagFeature + Returntype : Bio::EnsEMBL::Map::Ditag + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ditag { + my $self = shift; + + if(@_) { + $self->{'ditag'} = shift; + } elsif(!$self->{'ditag'}) { + if($self->{'ditag_id'}) { + #lazy load the ditag + my $ditag_adaptor = $self->analysis->adaptor->db->get_DitagAdaptor; + $self->{'ditag'} = $ditag_adaptor->fetch_by_dbID($self->ditag_id); + } + else{ + throw "Could not get Ditag for DitagFeature ".$self->dbID; + } + } + + return($self->{'ditag'}); +} + + +=head2 get_ditag_location + + Arg [1] : none + Description: Get the start and end location (and strand ) of the start-end pair + this DitagFeature belongs to. + If it is not a paired ditag, these will be identical + to DitagFeature->start() & DitagFeature->end(). + Please note that the returned start/end are min/max locations. + Returntype : int (start, end, strand) + Exceptions : throws if the 2 features of a pair are found on different strands + or if the second one cannot be found. + Caller : general + Status : At Risk + +=cut + +sub get_ditag_location { + my $self = shift; + + my ($start, $end, $strand); + if($self->ditag_side eq "F"){ + $start = $self->start; + $end = $self->end; + } + else{ + my ($ditag_a, $ditag_b, $more); + eval{ + ($ditag_a, $ditag_b, $more) = @{$self->adaptor->fetch_all_by_ditagID($self->ditag_id, + $self->ditag_pair_id, + $self->analysis->dbID)}; + }; + if($@ or !defined($ditag_a) or !defined($ditag_b)){ + throw("Cannot find 2nd tag of pair (".$self->dbID.", ".$self->ditag_id.", ". + $self->ditag_pair_id.", ".$self->analysis->dbID.")\n".$@); + } + else{ +# if(defined $more){ +# throw("More than two DitagFeatures were returned for ".$self->dbID.", ".$self->ditag_id +# .", ".$self->ditag_pair_id); +# } + + ($ditag_a->start < $ditag_b->start) ? ($start = $ditag_a->start) : ($start = $ditag_b->start); + ($ditag_a->end > $ditag_b->end) ? ($end = $ditag_a->end) : ($end = $ditag_b->end); + if($ditag_a->strand != $ditag_b->strand){ + throw('the strands of the two ditagFeatures are different! '.$ditag_a->strand.'/'.$ditag_b->strand); + } + } + } + + return($start, $end, $self->strand); +} + + +=head2 ditag_id + + Arg [1] : (optional) value + Description: Getter/Setter for the ditag_id + of this DitagFeature + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ditag_id { + my $self = shift; + + if(@_) { + $self->{'ditag_id'} = shift; + } + + return $self->{'ditag_id'}; +} + +=head2 slice + + Arg [1] : (optional) value + Description: Getter/Setter for the slice + of this DitagFeature + Returntype : slice object + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub slice { + my $self = shift; + + if(@_) { + $self->{'slice'} = shift; + } + + return $self->{'slice'}; +} + +=head2 ditag_pair_id + + Arg [1] : (optional) value + Description: Getter/Setter for the ditag_pair_id + of this DitagFeature + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut +sub ditag_pair_id { + my $self = shift; + + if(@_) { + $self->{'ditag_pair_id'} = shift; + } + + return $self->{'ditag_pair_id'}; +} + +=head2 ditag_side + + Arg [1] : (optional) value + Description: Getter/Setter for the ditag_side + of this DitagFeature + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ditag_side { + my $self = shift; + + if(@_) { + $self->{'ditag_side'} = shift; + } + + return $self->{'ditag_side'}; +} + +=head2 hit_start + + Arg [1] : (optional) value + Description: Getter/Setter for the hit_start + of this DitagFeature + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub hit_start { + my $self = shift; + + if(@_) { + $self->{'hit_start'} = shift; + } + + return $self->{'hit_start'}; +} + +=head2 hit_end + + Arg [1] : (optional) value + Description: Getter/Setter for the hit_end + of this DitagFeature + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub hit_end { + my $self = shift; + + if(@_) { + $self->{'hit_end'} = shift; + } + + return $self->{'hit_end'}; +} + +=head2 hit_strand + + Arg [1] : (optional) value + Description: Getter/Setter for the hit_strand + of this DitagFeature + Returntype : 1/-1/0 + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub hit_strand { + my $self = shift; + + if(@_) { + $self->{'hit_strand'} = shift; + } + + return $self->{'hit_strand'}; +} + +=head2 cigar_line + + Arg [1] : (optional) value + Description: Getter/Setter for the cigar_line + of this DitagFeature + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub cigar_line { + my $self = shift; + + if(@_) { + $self->{'cigar_line'} = shift; + } + + return $self->{'cigar_line'}; +} + +=head2 start + + Arg [1] : (optional) value + Description: Getter/Setter for the start + of this DitagFeature + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub start { + my $self = shift; + + if(@_) { + $self->{'start'} = shift; + } + + return $self->{'start'}; +} + +=head2 end + + Arg [1] : (optional) value + Description: Getter/Setter for the end + of this DitagFeature + Returntype : int or string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub end { + my $self = shift; + + if(@_) { + $self->{'end'} = shift; + } + + return $self->{'end'}; +} + +=head2 strand + + Arg [1] : (optional) value + Description: Getter/Setter for the strand + of this DitagFeature + Returntype : 1/-1/0 + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub strand { + my $self = shift; + + if(@_) { + $self->{'strand'} = shift; + } + + return $self->{'strand'}; +} + +=head2 dbID + + Arg [1] : (optional) value + Description: Getter/Setter for the dbID + of this DitagFeature + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub dbID { + my $self = shift; + + if(@_) { + $self->{'dbID'} = shift; + } + + return $self->{'dbID'}; +} + +=head2 sequence + + Arg [1] : (optional) value + Description: Getter/Setter for the sequence + of this DitagFeature + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub sequence { + my $self = shift; + + $self->{'sequence'} = $self->adaptor->sequence($self->dbID()); + + return $self->{'sequence'}; +} + + +=head2 tag_count + + Arg [1] : (optional) value + Description: Getter/Setter for the tag_count + of this DitagFeature + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub tag_count { + my $self = shift; + + if(@_) { + $self->{'tag_count'} = shift; + } + + return $self->{'tag_count'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/MapLocation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/MapLocation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,207 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::MapLocation + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Represents a location on a genetic map, yac map, radition hybrid map, +etc. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Map::MapLocation; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Utils::Exception qw(deprecate); + +=head2 new + + Arg [1] : (optional) string $name + Arg [2] : (optional) string $map_name + Arg [3] : (optional) string $chromosome_name + Arg [4] : (optional) string $position + Arg [5] : (optional) float $lod_score + Example : $map_location = Bio::EnsEMBL::Map::MapLocation('DS1234', + 'genethon', + 'X', + '12.39', + 50.12); + Description: Creates a new MapLocation + Returntype : Bio::EnsEMBL::Map::MapLocation + Exceptions : none + Caller : general + Status : stable + +=cut + +sub new { + my ($caller, $name, $map_name, $chromosome_name, $position, $lod_score) = @_; + + my $class = ref($caller) || $caller; + + return bless( {'map_name' => $map_name, + 'name' => $name, + 'chromosome_name' => $chromosome_name, + 'position' => $position, + 'lod_score' => $lod_score}, $class ); +} + + + +=head2 map_name + + Arg [1] : string $map_name + Example : $map_name = $map_location->map_name; + Description: Getter/Setter for the map name + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub map_name { + my $self = shift; + $self->{'map_name'} = shift if(@_); + return $self->{'map_name'}; +} + + + +=head2 name + + Arg [1] : (optional) string $name + Example : $name = $map_location->name; + Description: A name associated with the marker at this position. For + example if this is a genethon map location the name will be + the synonym of source genethon. + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + +=head2 chromosome_name + + Arg [1] : (optional) string $chromosome_name + Example : $chr_name = $map_location->chromosome_name; + $map_location->chromosome_name('X'); + Description: The name of the chromosome associated with this map location + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub chromosome_name{ + my $self = shift; + $self->{'chromosome_name'} = shift if(@_); + return $self->{'chromosome_name'}; +} + + + +=head2 position + + Arg [1] : (optional) string $position + Example : $pos = $map_location->position; + Description: Getter/Setter for the position of this map location + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub position { + my $self = shift; + $self->{'position'} = shift if(@_); + return $self->{'position'}; +} + + + +=head2 lod_score + + Arg [1] : (optional) float $lod + Example : $lod = $map_location->lod_score; + Description: Getter/Setter for lod score of this map location + Returntype : float + Exceptions : none + Caller : general + Status : stable + +=cut + +sub lod_score { + my $self = shift; + $self->{'lod_score'} = shift if(@_); + return $self->{'lod_score'}; +} + + + +=head2 chromosome + + Description: DEPRECATED use chromosome_name() instead + +=cut + +sub chromosome { + my $self = shift; + deprecate('use chromosome_name instead'); + + if(@_) { + my $chr = shift; + if(ref($chr)) { + $self->chromosome_name($chr->seq_region_name()); + } else { + $self->chromosome_name($chr); + } + } + + #this object has no way to talk to db and thus no way to + #get a chromosome object + return $self->chromosome_name(); +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/Marker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/Marker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,510 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::Marker + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Represents a marker in the EnsEMBL database. The marker object +is unpositioned on the genome. Markers which are positioned are +represented by the MarkerFeature object. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::Marker; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(throw); + +@ISA = qw(Bio::EnsEMBL::Storable); + + + +=head2 new + + Arg [1] : (optional) int $dbID + Arg [2] : (optional) Bio::EnsEMBL::Map::DBSQL::MarkerAdaptor $adaptor + Arg [3] : (optional) string $left_primer + Arg [4] : (optional) string $right_primer + Arg [5] : (optional) int $primer_distance + Arg [6] : (optional) int $priority + Arg [7] : (optional) string $type + Arg [8] : (optional) Bio::EnsEMBL::Map::MarkerSynonym $display_synonym + Arg [9] : (optional) listref of Bio::EnsEMBL::Map::MarkerSynonyms $syns + Arg [10] : (optional) listref of Bio::EnsEMBL::Map::MapLocations $locs + Example : $marker = Bio::EnsEMBL::Map::MarkerSynonym->new + (123, $adaptor, + $left_primer, $right_primer, 400, + 80, $ms1, [$ms1, $ms2], [$mloc1, $mloc2]); + Description: Creates a new Marker + Returntype : Bio::EnsEMBL::Map::Marker + Exceptions : none + Caller : general + Status : stable + +=cut + +sub new { + my ($caller, $dbID, $adaptor, $left_primer, $right_primer, + $min_primer_dist, $max_primer_dist, $priority, $type, $display_synonym, + $syns, $mlocs) = @_; + + my $class = ref($caller) || $caller; + + my $self = bless( {'dbID' => $dbID, + 'left_primer' => $left_primer, + 'right_primer' => $right_primer, + 'min_primer_dist' => $min_primer_dist, + 'max_primer_dist' => $max_primer_dist, + 'priority' => $priority, + 'type' => $type, + 'display_marker_synonym' => $display_synonym + }, $class); + + $self->adaptor($adaptor); + + #only load the marker synononyms if they were supplied, otherwise they + # will be lazy-loaded + if($syns && @$syns) { + $self->{'marker_synonyms'} = $syns; + } + + #only load the map_locations if they were supplied, otherwise they will + # be lazy-loaded + if($mlocs) { + foreach my $ml (@$mlocs) { + $self->add_MapLocation($ml); + } + } + + return $self; +} + + +=head2 left_primer + + Arg [1] : (optional) string $left_primer + Example : $left_primer = $marker->left_primer; + Description: Getter/Setter for the left primer sequence of this marker + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub left_primer { + my $self = shift; + + if(@_) { + $self->{'left_primer'} = shift; + } + + return $self->{'left_primer'}; +} + + + +=head2 right_primer + + Arg [1] : (optional) string $right_primer + Example : $right_primer = $marker->right_primer; + Description: Getter/Setter for the right primer sequence of this marker + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub right_primer { + my $self = shift; + + if(@_) { + $self->{'right_primer'} = shift; + } + + return $self->{'right_primer'}; +} + + + +=head2 min_primer_dist + + Arg [1] : (optional) string $min + Example : $dist = $marker->min_primer_dist; + Description: Getter/Setter for the minimum seperation distance between the + left and right primers of this marker + Returntype : int + Exceptions : none + Caller : general + Status : stable + +=cut + +sub min_primer_dist { + my $self = shift; + + if(@_) { + $self->{'min_primer_dist'} = shift; + } + + return $self->{'min_primer_dist'}; +} + + +=head2 max_primer_dist + + Arg [1] : (optional) string $max + Example : $dist = $marker->max_primer_dist; + Description: Getter/Setter for the maximum seperation distance between the + left and right primers of this marker + Returntype : int + Exceptions : none + Caller : general + Status : stable + +=cut + +sub max_primer_dist { + my $self = shift; + + if(@_) { + $self->{'max_primer_dist'} = shift; + } + + return $self->{'max_primer_dist'}; +} + + + +=head2 priority + + Arg [1] : (optional) int $priority + Example : $priority = $marker->priority; + Description: Getter/Setter for priority of this marker which can be used to + determine which markers are displayed. + Returntype : int + Exceptions : none + Caller : general + Status : stable + +=cut + +sub priority { + my $self = shift; + + if(@_) { + $self->{'priority'} = shift; + } + + return $self->{'priority'}; +} + + + + +=head2 type + + Arg [1] : (optional) string $type + Example : $type = $marker->type; + Description: Getter/Setter for type of this marker. Rat markers are typed + as 'est' or 'microsatellite'. Other markers may not have + defined types. + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub type { + my $self = shift; + + if(@_) { + $self->{'type'} = shift; + } + + return $self->{'type'}; +} + + + + +=head2 get_all_MarkerSynonyms + + Arg [1] : none + Example : @synonyms = @{$marker->get_all_MarkerSynonyms}; + Description: Retrieves a list of marker synonyms associated with this + marker. If this marker is connected to the datbase (i.e. it + has an adaptor and + Returntype : listref of Bio::EnsEMBL::Map::MarkerSynonyms + Exceptions : none + Caller : general + Status : stable + +=cut + +sub get_all_MarkerSynonyms { + my $self = shift; + + #lazy-load the marker synonyms if they haven't been retrieved + if(!exists $self->{'marker_synonyms'} && + $self->adaptor && $self->{'dbID'}) { + $self->adaptor->fetch_attributes($self); + } + + return $self->{'marker_synonyms'} || []; +} + + + +=head2 add_MarkerSynonyms + + Arg [1] : Bio::EnsEMBL::MarkerSynonym $ms + Example : $marker->add_MarkerSynonym($ms); + Description: Associates a new synonym with this marker. Adding marker + synonyms to a marker which has not yet retrieved its + synonyms from the database will prevent the loading of these + from the database at request time (unless flush_MarkerSynonyms + is called first). + Returntype : none + Exceptions : thrown if incorrect argument is passed + Caller : general + Status : stable + +=cut + +sub add_MarkerSynonyms { + my ($self, @ms) = @_; + + #create the array if it does not exist it + $self->{'marker_synonyms'} ||= []; + + push(@{$self->{'marker_synonyms'}}, @ms); +} + + + +=head2 flush_MarkerSynonyms + + Arg [1] : none + Example : $marker->flush_MarkerSynonyms; + Description: clears all of the marker sysnonyms which have been added to + this marker. + Returntype : none + Exceptions : none + Caller : general + Status : stable + +=cut + +sub flush_MarkerSynonyms { + my $self = shift; + + delete $self->{'marker_synonyms'}; +} + + + +=head2 display_MarkerSynonym + + Arg [1] : (optional) Bio::EnsEMBL::DBSQL::MarkerSynonym $ms + Example : none + Description: Getter/Setter for the 'display' synonym of this marker + Returntype : Bio::EnsEMBL::Map::MarkerSynonym + Exceptions : thrown if the argument is invalid + Caller : general + Status : stable + +=cut + +sub display_MarkerSynonym { + my $self = shift; + + if(@_) { + my $ms = shift; + if($ms && !(ref $ms && $ms->isa('Bio::EnsEMBL::Map::MarkerSynonym'))) { + throw("ms arg must be Bio::EnsEMBL::Map::MarkerSynonym not [$ms]"); + } + $self->{'display_marker_synonym'} = $ms; + } + + + return $self->{'display_marker_synonym'}; +} + + + +=head2 get_all_MarkerFeatures + + Arg [1] : none + Example : @marker_features = @{$marker->get_all_MarkerFeatures}; + Description: Retrieves the marker features which are associated with this + marker. I.e. locations where this marker has been mapped to + the genome via e-PCR + Returntype : listref of Bio::EnsEMBL::Map::MarkerFeatures + Exceptions : none + Caller : general + Status : stable + +=cut + +sub get_all_MarkerFeatures { + my $self = shift; + + my $marker_feature_adaptor = $self->adaptor->db->get_MarkerFeatureAdaptor; + + #these results are not cached to avoid a circular reference loop + return $marker_feature_adaptor->fetch_all_by_Marker($self); +} + + + +=head2 get_all_MapLocations + + Arg [1] : none + Example : @map_locations = @{$marker->get_all_MapLocations}; + Description: Retrieves all map locations which are associated with this + marker. + Returntype : listref of Bio::EnsEMBL::Map::MapLocations + Exceptions : none + Caller : general + Status : stable + +=cut + +sub get_all_MapLocations { + my $self = shift; + + #lazy-load the map locations if they have not been fetched yet + if(!exists $self->{'map_locations'} && + $self->adaptor && $self->{'dbID'}) { + $self->adaptor->fetch_attributes($self); + } + + my @out = values %{$self->{'map_locations'}}; + + return \@out; +} + + + +=head2 get_MapLocation + + Arg [1] : string $map_name + Example : $map_location = $marker->get_MapLocation('genethon'); + Description: Retrieves the location of this marker in a specified map. + If this marker is not defined in the specified map then + undef is returned. + Returntype : Bio::EnsEMBL::Map::MapLocation + Exceptions : thrown if the map_name arg is not provided + Caller : general + Status : stable + +=cut + +sub get_MapLocation { + my $self = shift; + my $map_name = shift; + + #lazy-load the map locations if they have not been fetched yet + if(!exists $self->{'map_locations'} && + $self->adaptor && $self->{'dbID'}) { + $self->adaptor->fetch_attributes($self); + } + + unless($map_name) { + throw('map_name argument is required, or use get_all_MapLocations'); + } + + return $self->{'map_locations'}->{$map_name}; +} + + + +=head2 add_MapLocations + + Arg [1..n] : @mlocs list of Bio::EnsEMBL::MapLocations + Example : $marker->add_MapLocations(@mlocs); + Description: Associates 1 or more map locations with this marker + using this function to manually load map locations will prevent + lazy-loading of locations from the database. + Returntype : listref of Bio::EnsEMBL::MapLocations + Exceptions : throws if map location has no name + Caller : general + Status : stable + +=cut + +sub add_MapLocations { + my ($self, @mlocs) = @_; + + foreach my $ml (@mlocs) { + unless($ml && ref $ml && $ml->isa('Bio::EnsEMBL::Map::MapLocation')) { + throw("args must be Bio::EnsEMBL::Map::MapLocations not [$ml]"); + } + + my $mname = $ml->map_name; + unless($mname) { + throw("map location arg [$ml] does not define a map name"); + } + + $self->{'map_locations'}->{$mname} = $ml; + } +} + + + + +=head2 flush_MapLocations + + Arg [1] : none + Example : $marker->get_all_MapLocations; + Description: Removes map locations associated with this marker. Markers may + be lazy-loaded from the database (again) after this. + Returntype : none + Exceptions : + Caller : + Status : stable + +=cut + +sub flush_MapLocations{ + my $self = shift; + + delete $self->{'map_locations'}; +} + + +1; + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/MarkerFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/MarkerFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,200 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::MarkerFeature + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Represents a marker feature in the EnsEMBL database. A marker feature +is a marker which has been mapped to the genome by ePCR. Each marker +has one marker feature per mapped location. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::MarkerFeature; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Feature; + +@ISA = qw(Bio::EnsEMBL::Feature); + + + +=head2 new + + Arg [1] : (optional) int $dbID + Arg [2] : (optional) Bio::EnsEMBL::Adaptor $adaptor + Arg [3] : (optional) int $start + Arg [4] : (optional) int $end + Arg [5] : (optional) Bio::EnsEMBL::Slice $slice + Arg [6] : (optional) Bio::EnsEMBL::Analysis + Arg [7] : (optional) int $marker_id + Arg [8] : (optional) int $map_weight + Arg [9] : (optional) Bio::EnsEMBL::Map::Marker $marker + Example : $marker = Bio::EnsEMBL::Map::MarkerFeature->new(123, $adaptor, + 100, 200, + $ctg, 123); + Description: Creates a new MarkerFeature + Returntype : Bio::EnsEMBL::Map::MarkerFeature + Exceptions : none + Caller : general + Status : stable + +=cut + +sub new { + my ($caller, $dbID, $adaptor, $start, $end, $slice, $analysis, + $marker_id, $map_weight, $marker) = @_; + + my $class = ref($caller) || $caller; + + my $self = bless( { + 'dbID' => $dbID, + 'start' => $start, + 'end' => $end, + 'strand' => 0, + 'slice' => $slice, + 'analysis' => $analysis, + 'marker_id' => $marker_id, + 'marker' => $marker, + 'map_weight' => $map_weight }, $class); + + $self->adaptor($adaptor); + return $self; +} + + + +=head2 _marker_id + + Arg [1] : (optional) int $marker_id + Example : none + Description: PRIVATE Getter/Setter for the internal identifier of the marker + associated with this marker feature + Returntype : int + Exceptions : none + Caller : internal + Status : stable + +=cut + +sub _marker_id { + my $self = shift; + + if(@_) { + $self->{'marker_id'} = shift; + } + + return $self->{'marker_id'}; +} + + + +=head2 marker + + Arg [1] : (optional) Bio::EnsEMBL::Map::Marker $marker + Example : $marker = $marker_feature->marker; + Description: Getter/Setter for the marker associated with this marker feature + If the marker has not been set and the database is available + the marker will automatically be retrieved (lazy-loaded). + Returntype : Bio::EnsEMBL::Map::Marker + Exceptions : none + Caller : general + Status : stable + +=cut + +sub marker { + my $self = shift; + + if(@_) { + $self->{'marker'} = shift; + } elsif(!$self->{'marker'} && $self->adaptor && $self->{'marker_id'}) { + #lazy load the marker if it is not already loaded + my $ma = $self->adaptor->db->get_MarkerAdaptor; + $self->{'marker'} = $ma->fetch_by_dbID($self->{'marker_id'}); + } + + return $self->{'marker'}; +} + + + +=head2 map_weight + + Arg [1] : (optional) int $map_weight + Example : $map_weight = $marker_feature->map_weight; + Description: Getter/Setter for the map weight of this marker. This is the + number of times that this marker has been mapped to the genome. + E.g. a marker iwth map weight 3 has been mapped to 3 locations + in the genome. + Returntype : int + Exceptions : none + Caller : general + Status : stable + +=cut + +sub map_weight { + my $self = shift; + + if(@_) { + $self->{'map_weight'} = shift; + } + + return $self->{'map_weight'}; +} + + + +=head2 display_id + + Arg [1] : none + Example : print $mf->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For marker features this is the + name of the display synonym or '' if it is not defined. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : stable + +=cut + +sub display_id { + my $self = shift; + my $marker = $self->{'marker'}; + + return '' if(!$marker); + my $ms = $marker->display_MarkerSynonym(); + return '' if(!$ms); + return $ms->name() || ''; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/MarkerSynonym.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/MarkerSynonym.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,135 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::MarkerSynonym + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Represents an alias for a marker in the EnsEMBL database. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::MarkerSynonym; + +use strict; +use vars qw(@ISA); + + +=head2 new + + Arg [1] : (optional) int $dbID + Arg [2] : (optional) string $source + Arg [3] : (optional) string $name + Example : $synonym = Bio::EnsEMBL::Map::MarkerSynonym->new(12,$src,$name); + Description: Creates a new MarkerSynonym + Returntype : Bio::EnsEMBL::Map::MarkerSynonym + Exceptions : non + Caller : general + Status : stable + +=cut + +sub new { + my ($caller, $dbID, $source, $name) = @_; + + my $class = ref($caller) || $caller; + + return bless( {'dbID' => $dbID, + 'source' => $source, + 'name' => $name}, $class ); +} + + +=head2 dbID + + Arg [1] : (optional) int $dbID + Example : $mid = $marker_synonym->dbID; + Description: Getter/Setter for the internal id of this synonym + Returntype : int + Exceptions : none + Caller : general + Status : stable + +=cut + +sub dbID { + my $self = shift; + + if(@_) { + $self->{'dbID'} = shift; + } + + return $self->{'dbID'}; +} + + +=head2 source + + Arg [1] : (optional) string $source + Example : $source = $marker_synonym->source; + Description: Getter/Setter for the source of this marker synonym + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub source { + my $self = shift; + + if(@_) { + $self->{'source'} = shift; + } + + return $self->{'source'}; +} + + +=head2 name + + Arg [1] : (optional) string $name + Example : $name = $marker_synonym->name; + Description: Getter/Setter for the name/identifier of this synonym + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub name { + my $self = shift; + + if(@_) { + $self->{'name'} = shift; + } + + return $self->{'name'} +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/Qtl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/Qtl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,340 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::Qtl + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Represents a Qtl in the EnsEMBL database. A quantitative trait locus is +defined by three markers, two flanking and one peak (optional) marker. +Its a region (or more often a group of regions) which is likely to +affect the phenotype (trait) described in this Qtl. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::Qtl; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); + +@ISA = qw(Bio::EnsEMBL::Storable); + + + +=head2 new + + Arg [1] : int $dbID + Arg [2] : Bio::EnsEMBL::Map::DBSQL::QtlAdaptor $adaptor + Arg [3] : Bio::EnsEMBL::Map::Marker $flank_marker_1 + Arg [4] : Bio::EnsEMBL::Map::Marker $peak_marker + Arg [5] : Bio::EnsEMBL::Map::Marker $flank_marker_2 + Arg [6] : string $trait + Arg [7] : float $lod_score + Arg [8] : hashref $synonyms + A hashref with source keys and identifier values + Example : none + Description: Creates a new Qtl object. Usually done by Adaptor + Returntype : Bio::EnsEMBL::Map::Qtl + Exceptions : none + Caller : general, DBSQL::QtlAdaptor, DBSQL::QtlFeatureAdaptor + Status : stable + +=cut + +sub new { + my ( $class, $dbID, $adaptor, $flank_marker_1, $peak_marker, + $flank_marker_2, $trait, $lod_score, + $synonyms ) = @_; + + $class = ref( $class ) ||$class; + my $self = bless( { + 'dbID' => $dbID, + 'flank_marker_1' => $flank_marker_1, + 'flank_marker_2' => $flank_marker_2, + 'peak_marker' => $peak_marker, + 'trait' => $trait, + 'lod_score' => $lod_score, + 'synonyms' => $synonyms + }, $class ); + $self->adaptor($adaptor); + return $self; +} + + +=head2 add_synonym + + Arg [1] : string $source + The source of the synonym + Arg [2] : string $identifier + The identifier from this source + Example : $qtl->add_synonym('rat genome database', '65516'); + Description: Adds a synonym to this qtl + Returntype : none + Exceptions : thrown if arguments are not provided + Caller : general + Status : stable + +=cut + +sub add_synonym { + my $self = shift; + my $source = shift; + my $identifier = shift; + + unless($source && $identifier) { + throw('Source and identifier arguments are required'); + } + + $self->{'synonyms'}->{$source} = $identifier; +} + + +=head2 get_synonyms + + Arg [1] : none + Example : + foreach my $source ($keys %{$qtl->get_synonyms}) { + print $source . ':'. $qtl->get_synonyms->{$source}; + } + Description: Returns a hashref of synonyms keyed on their source name + Returntype : hashref of synonyms keyed on their source name + Exceptions : none + Caller : general + Status : stable + +=cut + +sub get_synonyms { + my $self = shift; + + return $self->{'synonyms'} || {}; +} + + + +=head2 trait + + Arg [1] : string $trait + Phenotype of this Qtl + Example : none + Description: Getter/Setter for the trait attribute + Returntype : string + Exceptions : none + Caller : general + Status : stable + +=cut + +sub trait { + my $self = shift; + + if(@_) { + $self->{'trait'} = shift; + } + + return $self->{'trait'}; +} + + +=head2 lod_score + + Arg [1] : float $lod_score + A score for the Qtl + Example : none + Description: Getter/Setter for attribute lod_score + Returntype : float + Exceptions : none + Caller : general + Status : stable + +=cut + +sub lod_score { + my $self = shift; + + if(@_) { + $self->{'lod_score'} = shift; + } + + return $self->{'lod_score'}; +} + + +=head2 peak_marker + + Arg [1] : Bio::EnsEMBL::Map::Marker $peak_marker + an optional Marker which has the peak probablitity + for this traits occurence + Example : none + Description: Getter/Setter for attribute peak_marker + Returntype : Bio::EnsEMBL::Map::Marker + Exceptions : none + Caller : general + Status : stable + +=cut + +sub peak_marker { + my $self = shift; + + if(@_) { + $self->{'peak_marker'} = shift; + } + + return $self->{'peak_marker'}; +} + + +=head2 flank_marker_1 + + Arg [1] : Bio::EnsEMBL::Map::Marker $flank_marker_1 + One flanking marker of the interest region, the two flanking + markers define the region + Example : none + Description: Getter/Setter attribute flanking_marker_1 + Returntype : Bio::EnsEMBL::Map::Marker + Exceptions : none + Caller : general + Status : stable + +=cut + +sub flank_marker_1 { + my $self = shift; + + if(@_) { + $self->{'flank_marker_1'} = shift; + } + + return $self->{'flank_marker_1'}; +} + + + +=head2 flank_marker_2 + + Arg [1] : Bio::EnsEMBL::Map::Marker $flank_marker_2 + One flanking marker of the interest region, the two flanking + markers define the region + Example : none + Description: Getter/Setter attribute flanking_marker_2 + Returntype : Bio::EnsEMBL::Map::Marker + Exceptions : none + Caller : general + Status : stable + +=cut + + +sub flank_marker_2 { + my $self = shift; + + if(@_) { + $self->{'flank_marker_2'} = shift; + } + + return $self->{'flank_marker_2'}; +} + + + +=head2 get_QtlFeatures + + Args : none + Example : none + Description: return the qtl feature which is associated with this + Qtl. It comes in chromosomal slice coordinates. There can + only be one. + Returntype : Bio::EnsEMBL::Map::QtlFeature + Exceptions : only works with adaptored Qtls + Caller : general + Status : stable + +=cut + +sub get_QtlFeature { + my $self = shift; + + my $adaptor = $self->adaptor(); + return undef unless $adaptor; + my $result = $adaptor->db()->get_QtlFeatureAdaptor()-> + fetch_all_by_Qtl( $self ); + + if( @$result ) { + return $result->[0]; + } else { + return; + } +} + + + + + +=head2 source_database + +This method is deprecated. Use get_synonyms or add_synonym instead. + +=cut + +sub source_database { + my $self = shift; + + deprecate('Use get_synonyms or add_synonym instead'); + + my $syns = $self->get_synonyms; + my ($source) = keys %$syns; + + return $source || ''; +} + + +=head2 source_primary_id + +This method is deprecated. Use get_synonyms or add_synonym instead. + +=cut + +sub source_primary_id { + my $self = shift; + + deprecate('Use get_synonyms or add_synonym instead'); + + my $syns = $self->get_synonyms; + my ($source) = keys %$syns; + + if($source) { + return $syns->{$source}; + } + + return ''; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Map/QtlFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Map/QtlFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Map::QtlFeature + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Represents a QtlFeature in the EnsEMBL database. QtlFeatures are +generally very long and its not clear wether a representation in Contig +coordinates actually makes sense. In the database they will have +chromosomal coordinates. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Map::QtlFeature; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::Feature; + +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [1] : Bio::EnsEMBL::Map::DBSQL::QtlFeatureAdaptor $adaptor + Example : none + Description: Create a QtlFeature + Returntype : Bio::EnsEMBL::Map::QtlFeature + Exceptions : none + Caller : general, DBSQL::QtlFeatureAdaptor + Status : Stable + +=cut + +sub new { + my ( $class, $adaptor, $slice, $start, $end, $qtl, $analysis ) = @_; + + $class = ref( $class ) ||$class; + my $self = bless( { + 'slice' => $slice, + 'start' => $start, + 'end' => $end, + 'qtl' => $qtl, + 'analysis' => $analysis, + 'strand' => 0 + }, $class ); + + $self->adaptor($adaptor); + return $self; +} + + +=head2 qtl + + Arg [1] : Bio::EnsEMBL::Map::Qtl $qtl + the qtl object for this feature + Example : none + Description: return the Qtl object associated with this location + Returntype : Bio::EnsEMBL::Map::Qtl + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub qtl { + my $self = shift; + + if(@_) { + $self->{'qtl'} = shift; + } + + return $self->{'qtl'}; +} + + + +=head2 strand + + Arg [1] : none + Example : $strand = $qtl_feat->strand(); + Description: Overrides the Feature strand method to always return a + value of 0 for qtl features (they are unstranded features) + Returntype : int (always 0) + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub strand { + my $self = shift; + return 0; +} + + + +=head2 move + + Arg [1] : $start - The new end of this qtl feature + Arg [2] : $end - The new start of this qtl feature + Arg [3] : $strand - ignored always set to 0 + Example : $qtl_feat->move(1, 10_000); + Description: Overrides superclass move() method to ensure strand is always 0. + See Bio::EnsEMBL::Feature::move + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub move { + my ($self, $start, $end, $strand) = @_; + + #maintain a strandedness of 0 + return $self->SUPER::move($start,$end,0); +} + + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/MappedSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/MappedSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,668 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::MappedSlice - an object representing a mapped slice + +=head1 SYNOPSIS + + # get a reference slice + my $slice = + $slice_adaptor->fetch_by_region( 'chromosome', 14, 900000, 950000 ); + + # create MappedSliceContainer based on the reference slice + my $msc = Bio::EnsEMBL::MappedSliceContainer->new( -SLICE => $slice ); + + # set the adaptor for fetching AssemblySlices + my $asa = $slice->adaptor->db->get_AssemblySliceAdaptor; + $msc->set_AssemblySliceAdaptor($asa); + + # add an AssemblySlice to your MappedSliceContainer + $msc->attach_AssemblySlice('NCBIM36'); + + foreach my $mapped_slice ( @{ $msc->get_all_MappedSlices } ) { + print $mapped_slice->name, "\n"; + + foreach my $sf ( @{ $mapped_slice->get_all_SimpleFeatures } ) { + print " ", &to_string($sf), "\n"; + } + } + +=head1 DESCRIPTION + +NOTE: this code is under development and not fully functional nor tested +yet. Use only for development. + +This object represents a mapped slice, i.e. a slice that's attached +to a reference slice and a mapper to convert coordinates to/from the +reference. The attachment is done via a MappedSliceContainer which +has the reference slice and the "container slice" defining the common +coordinate system for all MappedSlices. + +A MappedSlice is supposed to behave as close to a Bio::EnsEMBL::Slice +as possible. Most Slice methods are implemented in MappedSlice and will +return an equivalent value to what Slice does. There are some exceptions +of unimplemented methods, either because there is no useful equivalent +for a MappedSlice to do, or they are too complicated. + +Not supported Bio::EnsEMBL::Slice methods: + + All deprecated methods + All Bio::PrimarySeqI compliance methods + expand + get_generic_features + get_seq_region_id + seq_region_Slice + +Not currently supported but maybe should/could: + + calculate_pi + calculate_theta + get_base_count + get_by_Individual + get_by_strain + invert + +Internally, a MappedSlice is a collection of Bio::EnsEMBL::Slices and +associated Bio::EnsEMBL::Mappers which map the slices to the common +container coordinate system. + +MappedSlices are usually created and attached to a MappedSliceContainer +by an adaptor/factory. + +=head1 METHODS + + new + add_Slice_Mapper_pair + get_all_Slice_Mapper_pairs + adaptor + container + name + seq_region_name + start + end + strand + length + seq_region_length + centrepoint + coord_system + coord_system_name + is_toplevel + seq (not implemented yet) + subseq (not implemented yet) + get_repeatmasked_seq (not implemented yet) + sub_MappedSlice (not implemented yet) + project (not implemented yet) + +=head1 RELATED MODULES + + Bio::EnsEMBL::MappedSlice + Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor + Bio::EnsEMBL::Compara::AlignSlice + Bio::EnsEMBL::Compara::AlignSlice::Slice + Bio::EnsEMBL::AlignStrainSlice + Bio::EnsEMBL::StrainSlice + +=cut + +package Bio::EnsEMBL::MappedSlice; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Mapper; +use Scalar::Util qw(weaken); + +use vars qw($AUTOLOAD); + + +=head2 new + + Arg [ADAPTOR] : Adaptor $adaptor - an adaptor of the appropriate type + Arg [CONTAINER] : Bio::EnsEMBL::MappedSliceContainer $container - the + container this MappedSlice is attached to + Arg [NAME] : String $name - name + Example : my $mapped_slice = Bio::EnsEMBL::MappedSlice->new( + -ADAPTOR => $adaptor, + -CONTAINER => $container, + -NAME => $name, + ); + Description : Constructor. Usually you won't call this method manually, but + the MappedSlice will be constructed by an adaptor/factory. + Return type : Bio::EnsEMBL::MappedSlice + Exceptions : thrown on wrong or missing arguments + Caller : general, MappedSlice adaptors + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($adaptor, $container, $name) = + rearrange([qw(ADAPTOR CONTAINER NAME)], @_); + + # arguement check + unless ($container and ref($container) and + $container->isa('Bio::EnsEMBL::MappedSliceContainer')) { + throw("Need a MappedSliceContainer."); + } + + my $self = {}; + bless ($self, $class); + + # + # initialise object + # + + # need to weaken reference to prevent circular reference + weaken($self->{'container'} = $container); + + $self->adaptor($adaptor) if (defined($adaptor)); + $self->{'name'} = $name if (defined($name)); + + $self->{'slice_mapper_pairs'} = []; + + return $self; +} + + +=head2 add_Slice_Mapper_pair + + Arg[1] : Bio::EnsEMBL::Slice $slice - slice to add + Arg[2] : Bio::EnsEMBL::Mapper $mapper - the mapper for this slice + Example : $mapped_slice->add_Slice_Mapper_pair($slice, $mapper); + Description : Adds a native slice and a corresponding mapper to map to/from + the artificial container coord system. + Return type : listref of Bio::EnsEMBL::MappedSlice + Exceptions : thrown on wrong or missing arguments + Caller : general, MappedSlice adaptors + Status : At Risk + : under development + +=cut + +sub add_Slice_Mapper_pair { + my $self = shift; + my $slice = shift; + my $mapper = shift; + + # argument check + unless ($slice and ref($slice) and ($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("You must provide a slice."); + } + + unless ($mapper and ref($mapper) and $mapper->isa('Bio::EnsEMBL::Mapper')) { + throw("You must provide a mapper."); + } + + push @{ $self->{'slice_mapper_pairs'} }, [ $slice, $mapper ]; + + return $self->{'slice_mapper_pairs'}; +} + + +=head2 get_all_Slice_Mapper_pairs + + Example : foreach my $pair (@{ $self->get_all_Slice_Mapper_pairs }) { + my ($slice, $mapper) = @$pair; + + # get container coordinates + my @coords = $mapper->map_coordinates( + $slice->seq_region_name, + $slice->start, + $slice->end, + $slice->strand, + 'mapped_slice' + ); + + # .... + } + Description : Gets all Slice/Mapper pairs this MappedSlice is composed of. + Each slice (and features on it) can be mapped onto the + artificial container coord system using the mapper. + Return type : listref of listref of a Bio::EnsEMBL::Slice and + Bio::EnsEMBL::Mapper pair + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_Slice_Mapper_pairs { + my $self = shift; + return $self->{'slice_mapper_pairs'}; +} + + +=head2 adaptor + + Arg[1] : (optional) Adaptor $adaptor - the adaptor/factory for this + object + Example : $mapped_slice->adaptor($assembly_slice_adaptor); + Description : Getter/setter for the adaptor/factory for this object. + Return type : Adaptor of appropriate type + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub adaptor { + my $self = shift; + weaken($self->{'adaptor'} = shift) if (@_); + return $self->{'adaptor'}; +} + + +=head2 container + + Arg[1] : (optional) Bio::EnsEMBL::MappedSliceContainer - the container + this object is attached to + Example : my $container = $mapped_slice->container; + print $container->ref_slice->name, "\n"; + Description : Getter/setter for the container this object is attached to. The + container will give you access to the reference slice, a common + artificial container slice, and a mapper to map to it from the + container coord system. + + The implementation uses a weak reference to attach the container + since the container holds a list of MappedSlices itself. + Return type : Bio::EnsEMBL::MappedSliceContainer + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub container { + my $self = shift; + weaken($self->{'container'} = shift) if (@_); + return $self->{'container'}; +} + + +=head2 name + + Arg[1] : String - the name of this object + Example : my $name = $mapped_slice->container->ref_slice->name . + ":mapped_" . $ident_string; + $mapped_slice->name($name); + Description : Getter/setter for this object's name + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if (@_); + return $self->{'name'}; +} + + +=head2 seq_region_name + + Example : my $sr_name = $mapped_slice->seq_region_name; + Description : Returns the seq_region name of the reference slice. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq_region_name { + my $self = shift; + return $self->container->ref_slice->seq_region_name; +} + + +=head2 start + + Example : my $start = $mapped_slice->start; + Description : Returns the start of the container slice. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub start { + my $self = shift; + return $self->container->container_slice->start; +} + + +=head2 end + + Example : my $end = $mapped_slice->end; + Description : Returns the end of the container slice. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub end { + my $self = shift; + return $self->container->container_slice->end; +} + + +=head2 strand + + Example : my $strand = $mapped_slice->strand; + Description : Returns the strand of the container slice. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub strand { + my $self = shift; + return $self->container->container_slice->strand; +} + + +=head2 length + + Example : my $length = $mapped_slice->length; + Description : Returns the length of the container slice + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub length { + my $self = shift; + return $self->container->container_slice->length; +} + + +=head2 seq_region_length + + Example : my $sr_length = $mapped_slice->seq_region_length; + Description : Returns the seq_region length of the reference slice. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq_region_length { + my $self = shift; + return $self->container->ref_slice->seq_region_length; +} + + +=head2 centrepoint + + Example : my $centrepoint = $mapped_slice->centrepoint; + Description : Returns the centrepoint of the container slice. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub centrepoint { + my $self = shift; + return $self->container->container_slice->centrepoint; +} + + +=head2 coord_system + + Example : my $cs = $mapped_slice->coord_system; + Description : Returns the coord system of the container slice. + Return type : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub coord_system { + my $self = shift; + return $self->container->container_slice->coord_system; +} + +=head2 coord_system_name + + Example : my $cs_name = $mapped_slice->coord_system_name; + Description : Returns the coord system name of the container slice. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub coord_system_name { + my $self = shift; + return $self->container->container_slice->coord_system_name; +} + +=head2 is_toplevel + + Example : my $toplevel_flag = $mapped_slice->is_toplevel; + Description : Returns weather the container slice is toplevel. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_toplevel { + my $self = shift; + return $self->container->container_slice->is_toplevel; +} + + +=head2 seq + + Example : my $seq = $mapped_slice->seq() + Description : Retrieves the expanded sequence of this mapped slice, + including "-" characters where there are inserts in any other + mapped slices. This will align with the sequence returned by + the container's seq() method. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq { + my $self = shift; + + # create an empty string + my $ms_seq = ''; + + # this coord represents the current position in the MS sequence + my $start = 0; + + # get slice/mapper pairs from mapped slice (usually only one anyway) + foreach my $pair(@{$self->get_all_Slice_Mapper_pairs()}) { + my ($s, $m) = @$pair; + + # make sure to send extra args + # eg strain slices might need read coverage filtering + my $seq = $s->seq(@_); + + # project from mapped slice to reference slice using the mapper + foreach my $ref_coord($m->map_coordinates('mapped_slice', 1, CORE::length($seq), $s->strand, 'mapped_slice')) { + + # normal coord + if(!$ref_coord->isa('Bio::EnsEMBL::Mapper::IndelCoordinate') && !$ref_coord->isa('Bio::EnsEMBL::Mapper::Gap')) { + + # project from reference slice to container slice using the container's mapper + foreach my $ms_coord($self->container->mapper->map_coordinates($self->container->ref_slice->seq_region_name, $ref_coord->start, $ref_coord->end, $ref_coord->strand, 'ref_slice')) { + + # normal coord + if(!$ms_coord->isa('Bio::EnsEMBL::Mapper::IndelCoordinate') && !$ms_coord->isa('Bio::EnsEMBL::Mapper::Gap')) { + $ms_seq .= substr($seq, $start, $ms_coord->length); + + $start += $ms_coord->length(); + } + + # indel coord + else { + $ms_seq .= '-' x $ms_coord->length(); + } + } + } + + # indel / gap + else { + + # if there's a gap here aswell, add corresponding sequence + if($ref_coord->gap_length > 0) { + $ms_seq .= substr($seq, $start, $ref_coord->gap_length); + $start += $ref_coord->gap_length; + } + + # add "-" to the sequence + $ms_seq .= '-' x ($ref_coord->length() - $ref_coord->gap_length()); + } + } + } + + return $ms_seq; +} + +sub subseq { +} + +sub get_repeatmasked_seq { +} + +sub sub_MappedSlice { +} + +sub project { +} + + +=head2 AUTOLOAD + + Arg[1..N] : Arguments passed on to the calls on the underlying slices. + Example : my @simple_features = @{ $mapped_slice->get_all_SimpleFeatures }; + Description : Aggregate data gathered from composing Slices. + This will call Slice->get_all_* and combine the results. + Coordinates will be transformed to be on the container slice + coordinate system. + + Calls involving DAS features are skipped since the DAS adaptor + handles coordinate conversions natively. + Return type : listref of features (same type as corresponding Slice method) + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub AUTOLOAD { + my $self = shift; + + my $method = $AUTOLOAD; + $method =~ s/.*:://; + + # AUTOLOAD should only deal with get_all_* methods + return unless ($method =~ /^get_all_/); + + # skip DAS methods + return if ($method =~ /DAS/); + + my @mapped_features = (); + + foreach my $pair (@{ $self->get_all_Slice_Mapper_pairs }) { + my ($slice, $mapper) = @$pair; + #warn $slice->name; + + # call $method on each native slice composing the MappedSlice + my @features = @{ $slice->$method(@_) }; + + # map features onto the artificial container coordinate system + foreach my $f (@features) { + + my @coords = $mapper->map_coordinates( + $f->slice->seq_region_name, + $f->start, + $f->end, + $f->strand, + 'mapped_slice' + ); + + # sanity check + if (scalar(@coords) > 1) { + warning("Got more than one Coordinate returned, expected only one!\n"); + } + + $f->start($coords[0]->start); + $f->end($coords[0]->end); + $f->strand($coords[0]->strand); + $f->slice($self->container->container_slice); + + push @mapped_features, $f; + } + + } + + return \@mapped_features; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/MappedSliceContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/MappedSliceContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,638 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::MappedSliceContainer - container for mapped slices + +=head1 SYNOPSIS + + # get a reference slice + my $slice = + $slice_adaptor->fetch_by_region( 'chromosome', 14, 900000, 950000 ); + + # create MappedSliceContainer based on the reference slice + my $msc = Bio::EnsEMBL::MappedSliceContainer->new( -SLICE => $slice ); + + # set the adaptor for fetching AssemblySlices + my $asa = $slice->adaptor->db->get_AssemblySliceAdaptor; + $msc->set_AssemblySliceAdaptor($asa); + + # add an AssemblySlice to your MappedSliceContainer + $msc->attach_AssemblySlice('NCBIM36'); + + foreach my $mapped_slice ( @{ $msc->get_all_MappedSlices } ) { + print $mapped_slice->name, "\n"; + + foreach my $sf ( @{ $mapped_slice->get_all_SimpleFeatures } ) { + print " ", &to_string($sf), "\n"; + } + } + +=head1 DESCRIPTION + +NOTE: this code is under development and not fully functional nor tested +yet. Use only for development. + +A MappedSliceContainer holds a collection of one or more +Bio::EnsEMBL::MappedSlices. It is based on a real reference slice and +contains an artificial "container slice" which defines the common +coordinate system used by all attached MappedSlices. There is also a +mapper to convert coordinates between the reference and the container +slice. + +Attaching MappedSlices to the container is delegated to adaptors +(which act more as object factories than as traditional Ensembl db +adaptors). The adaptors will also modify the container slice and +associated mapper if required. This design allows us to keep the +MappedSliceContainer generic and encapsulate the data source specific +code in the adaptor/factory module. + +In the simplest use case, all required MappedSlices are attached to the +MappedSliceContainer at once (by a single call to the adaptor). This +object should also allow "hot-plugging" of MappedSlices (e.g. attach a +MappedSlice representing a strain to a container that already contains a +multi-species alignment). The methods for attaching new MappedSlice will +be responsable to perform the necessary adjustments to coordinates and +mapper on the existing MappedSlices. + +=head1 METHODS + + new + set_adaptor + get_adaptor + set_AssemblySliceAdaptor + get_AssemblySliceAdaptor + set_AlignSliceAdaptor (not implemented yet) + get_AlignSliceAdaptor (not implemented yet) + set_StrainSliceAdaptor (not implemented yet) + get_StrainSliceAdaptor (not implemented yet) + attach_AssemblySlice + attach_AlignSlice (not implemented yet) + attach_StrainSlice (not implemented yet) + get_all_MappedSlices + sub_MappedSliceContainer (not implemented yet) + ref_slice + container_slice + mapper + expanded + +=head1 RELATED MODULES + + Bio::EnsEMBL::MappedSlice + Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor + Bio::EnsEMBL::Compara::AlignSlice + Bio::EnsEMBL::Compara::AlignSlice::Slice + Bio::EnsEMBL::AlignStrainSlice + Bio::EnsEMBL::StrainSlice + +=cut + +package Bio::EnsEMBL::MappedSliceContainer; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::CoordSystem; +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Mapper; + + +# define avalable adaptormajs to use with this container +my %adaptors = map { $_ => 1 } qw(assembly align strain); + + +=head2 new + + Arg [SLICE] : Bio::EnsEMBL::Slice $slice - the reference slice for this + container + Arg [EXPANDED] : (optional) Boolean $expanded - set expanded mode (default: + collapsed) + Example : my $slice = $slice_adaptor->fetch_by_region('chromosome', 1, + 9000000, 9500000); + my $msc = Bio::EnsEMBL::MappedSliceContainer->new( + -SLICE => $slice, + -EXPANDED => 1, + ); + Description : Constructor. See the general documentation of this module for + details about this object. Note that the constructor creates an + empty container, so you'll have to attach MappedSlices to it to + be useful (this is usually done by an adaptor/factory). + Return type : Bio::EnsEMBL::MappedSliceContainer + Exceptions : thrown on wrong or missing argument + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($ref_slice, $expanded) = rearrange([qw(SLICE EXPANDED)], @_); + + # argument check + unless ($ref_slice and ref($ref_slice) and + ($ref_slice->isa('Bio::EnsEMBL::Slice') or $ref_slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("You must provide a reference slice."); + } + + my $self = {}; + bless ($self, $class); + + # initialise object + $self->{'ref_slice'} = $ref_slice; + $self->{'expanded'} = $expanded || 0; + + $self->{'mapped_slices'} = []; + + # create the container slice + $self->_create_container_slice($ref_slice); + + return $self; +} + + +# +# Create an artificial slice which represents the common coordinate system used +# for this MappedSliceContainer +# +sub _create_container_slice { + my $self = shift; + my $ref_slice = shift; + + # argument check + unless ($ref_slice and ref($ref_slice) and + ($ref_slice->isa('Bio::EnsEMBL::Slice') or $ref_slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("You must provide a reference slice."); + } + + # create an artificial coordinate system for the container slice + my $cs = Bio::EnsEMBL::CoordSystem->new( + -NAME => 'container', + -RANK => 1, + ); + + # Create a new artificial slice spanning your container. Initially this will + # simply span your reference slice + my $container_slice = Bio::EnsEMBL::Slice->new( + -COORD_SYSTEM => $cs, + -START => 1, + -END => $ref_slice->length, + -STRAND => 1, + -SEQ_REGION_NAME => 'container', + ); + + $self->{'container_slice'} = $container_slice; + + # Create an Mapper to map to/from the reference slice to the container coord + # system. + my $mapper = Bio::EnsEMBL::Mapper->new('ref_slice', 'container'); + + $mapper->add_map_coordinates( + $ref_slice->seq_region_name, + $ref_slice->start, + $ref_slice->end, + 1, + $container_slice->seq_region_name, + $container_slice->start, + $container_slice->end, + ); + + $self->{'mapper'} = $mapper; +} + + +=head2 set_adaptor + + Arg[1] : String $type - the type of adaptor to set + Arg[2] : Adaptor $adaptor - the adaptor to set + Example : my $adaptor = Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor->new; + $msc->set_adaptor('assembly', $adaptor); + Description : Parameterisable wrapper for all methods that set adaptors (see + below). + Return type : same as Arg 2 + Exceptions : thrown on missing type + Caller : general + Status : At Risk + : under development + +=cut + +sub set_adaptor { + my $self = shift; + my $type = shift; + my $adaptor = shift; + + # argument check + unless ($type and $adaptors{$type}) { + throw("Missing or unknown adaptor type."); + } + + $type = ucfirst($type); + my $method = "set_${type}SliceAdaptor"; + + return $self->$method($adaptor); +} + + +=head2 get_adaptor + + Arg[1] : String $type - the type of adaptor to get + Example : my $assembly_slice_adaptor = $msc->get_adaptor('assembly'); + Description : Parameterisable wrapper for all methods that get adaptors (see + below). + Return type : An adaptor for the requested type of MappedSlice. + Exceptions : thrown on missing type + Caller : general + Status : At Risk + : under development + +=cut + +sub get_adaptor { + my $self = shift; + my $type = shift; + + # argument check + unless ($type and $adaptors{$type}) { + throw("Missing or unknown adaptor type."); + } + + $type = ucfirst($type); + my $method = "get_${type}SliceAdaptor"; + + return $self->$method; +} + + +=head2 set_AssemblySliceAdaptor + + Arg[1] : Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor - the adaptor to set + Example : my $adaptor = Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor->new; + $msc->set_AssemblySliceAdaptor($adaptor); + Description : Sets an AssemblySliceAdaptor for this container. The adaptor can + be used to attach MappedSlice for alternative assemblies. + Return type : Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor + Exceptions : thrown on wrong or missing argument + Caller : general, $self->get_adaptor + Status : At Risk + : under development + +=cut + +sub set_AssemblySliceAdaptor { + my $self = shift; + my $assembly_slice_adaptor = shift; + + unless ($assembly_slice_adaptor and ref($assembly_slice_adaptor) and + $assembly_slice_adaptor->isa('Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor')) { + throw("Need a Bio::EnsEMBL::AssemblySliceAdaptor."); + } + + $self->{'adaptors'}->{'AssemblySlice'} = $assembly_slice_adaptor; +} + + +=head2 get_AssemblySliceAdaptor + + Example : my $assembly_slice_adaptor = $msc->get_AssemblySliceAdaptor; + Description : Gets a AssemblySliceAdaptor from this container. The adaptor can + be used to attach MappedSlice for alternative assemblies. + Return type : Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor + Exceptions : thrown on wrong or missing argument + Caller : general, $self->get_adaptor + Status : At Risk + : under development + +=cut + +sub get_AssemblySliceAdaptor { + my $self = shift; + + unless ($self->{'adaptors'}->{'AssemblySlice'}) { + warning("No AssemblySliceAdaptor attached to MappedSliceContainer."); + } + + return $self->{'adaptors'}->{'AssemblySlice'}; +} + + +# [todo] +sub set_AlignSliceAdaptor { + throw("Not implemented yet!"); +} + + +# [todo] +sub get_AlignSliceAdaptor { + throw("Not implemented yet!"); +} + + +# [todo] +sub set_StrainSliceAdaptor { + my $self = shift; + my $strain_slice_adaptor = shift; + + unless ($strain_slice_adaptor and ref($strain_slice_adaptor) and + $strain_slice_adaptor->isa('Bio::EnsEMBL::DBSQL::StrainSliceAdaptor')) { + throw("Need a Bio::EnsEMBL::StrainSliceAdaptor."); + } + + $self->{'adaptors'}->{'StrainSlice'} = $strain_slice_adaptor; +} + + +# [todo] +sub get_StrainSliceAdaptor { + my $self = shift; + + unless ($self->{'adaptors'}->{'StrainSlice'}) { + warning("No StrainSliceAdaptor attached to MappedSliceContainer."); + } + + return $self->{'adaptors'}->{'StrainSlice'}; +} + + +=head2 attach_AssemblySlice + + Arg[1] : String $version - assembly version to attach + Example : $msc->attach_AssemblySlice('NCBIM36'); + Description : Attaches a MappedSlice for an alternative assembly to this + container. + Return type : none + Exceptions : thrown on missing argument + Caller : general, Bio::EnsEMBL::DBSQL::AssemblySliceAdaptor + Status : At Risk + : under development + +=cut + +sub attach_AssemblySlice { + my $self = shift; + my $version = shift; + + throw("Need a version.") unless ($version); + + my $asa = $self->get_AssemblySliceAdaptor; + return unless ($asa); + + my @mapped_slices = @{ $asa->fetch_by_version($self, $version) }; + + push @{ $self->{'mapped_slices'} }, @mapped_slices; +} + + +=head2 attach_StrainSlice + + Arg[1] : String $strain - name of strain to attach + Example : $msc->attach_StrainSlice('Watson'); + Description : Attaches a MappedSlice for an alternative strain to this + container. + Return type : none + Exceptions : thrown on missing argument + Caller : general, Bio::EnsEMBL::DBSQL::StrainSliceAdaptor + Status : At Risk + : under development + +=cut + +sub attach_StrainSlice { + my $self = shift; + my $strain = shift; + + throw("Need a strain.") unless ($strain); + + my $ssa = $self->get_StrainSliceAdaptor; + return unless ($ssa); + + my @mapped_slices = @{ $ssa->fetch_by_name($self, $strain) }; + + push @{ $self->{'mapped_slices'} }, @mapped_slices; +} + + + +=head2 get_all_MappedSlices + + Example : foreach my $mapped_slice (@{ $msc->get_all_MappedSlices }) { + print $mapped_slice->name, "\n"; + } + Description : Returns all MappedSlices attached to this container. + Return type : listref of Bio::EnsEMBL::MappedSlice + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_MappedSlices { + my $self = shift; + return $self->{'mapped_slices'}; +} + + +# [todo] +sub sub_MappedSliceContainer { + throw("Not implemented yet!"); +} + + +=head2 ref_slice + + Arg[1] : (optional) Bio::EnsEMBL::Slice - the reference slice to set + Example : my $ref_slice = $mapped_slice_container->ref_slice; + print "This MappedSliceContainer is based on the reference + slice ", $ref_slice->name, "\n"; + Description : Getter/setter for the reference slice. + Return type : Bio::EnsEMBL::Slice + Exceptions : thrown on wrong argument type + Caller : general + Status : At Risk + : under development + +=cut + +sub ref_slice { + my $self = shift; + + if (@_) { + my $slice = shift; + + unless (ref($slice) and ($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw("Need a Bio::EnsEMBL::Slice."); + } + + $self->{'ref_slice'} = $slice; + } + + return $self->{'ref_slice'}; +} + + +=head2 container_slice + + Arg[1] : (optional) Bio::EnsEMBL::Slice - the container slice to set + Example : my $container_slice = $mapped_slice_container->container_slice; + print "The common slice used by this MappedSliceContainer is ", + $container_slice->name, "\n"; + Description : Getter/setter for the container slice. This is an artificial + slice which defines the common coordinate system used by the + MappedSlices attached to this container. + Return type : Bio::EnsEMBL::Slice + Exceptions : thrown on wrong argument type + Caller : general + Status : At Risk + : under development + +=cut + +sub container_slice { + my $self = shift; + + if (@_) { + my $slice = shift; + + unless (ref($slice) and ($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("Need a Bio::EnsEMBL::Slice."); + } + + $self->{'container_slice'} = $slice; + } + + return $self->{'container_slice'}; +} + + +=head2 mapper + + Arg[1] : (optional) Bio::EnsEMBL::Mapper - the mapper to set + Example : my $mapper = Bio::EnsEMBL::Mapper->new('ref', 'mapped'); + $mapped_slice_container->mapper($mapper); + Description : Getter/setter for the mapper to map between reference slice and + the artificial container coord system. + Return type : Bio::EnsEMBL::Mapper + Exceptions : thrown on wrong argument type + Caller : internal, Bio::EnsEMBL::MappedSlice->AUTOLOAD + Status : At Risk + : under development + +=cut + +sub mapper { + my $self = shift; + + if (@_) { + my $mapper = shift; + + unless (ref($mapper) and $mapper->isa('Bio::EnsEMBL::Mapper')) { + throw("Need a Bio::EnsEMBL::Mapper."); + } + + $self->{'mapper'} = $mapper; + } + + return $self->{'mapper'}; +} + + +=head2 expanded + + Arg[1] : (optional) Boolean - expanded mode to set + Example : if ($mapped_slice_container->expanded) { + # do more elaborate mapping than in collapsed mode + [...] + } + Description : Getter/setter for expanded mode. + + By default, MappedSliceContainer use collapsed mode, which + means that no inserts in the reference sequence are allowed + when constructing the MappedSlices. in this mode, the + mapped_slice artificial coord system will be identical with the + ref_slice coord system. + + By setting expanded mode, you allow inserts in the reference + sequence. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub expanded { + my $self = shift; + $self->{'expanded'} = shift if (@_); + return $self->{'expanded'}; +} + +=head2 seq + + Example : my $seq = $container->seq() + Description : Retrieves the expanded sequence of the artificial container + slice, including "-" characters where there are inserts in any + of the attached mapped slices. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub seq { + my $self = shift; + + my $container_seq = ''; + + # check there's a mapper + if(defined($self->mapper)) { + my $start = 0; + my $slice = $self->ref_slice(); + my $seq = $slice->seq(); + + foreach my $coord($self->mapper->map_coordinates($slice->seq_region_name, $slice->start, $slice->end, $slice->strand, 'ref_slice')) { + # if it is a normal coordinate insert sequence + if(!$coord->isa('Bio::EnsEMBL::Mapper::IndelCoordinate')) { + $container_seq .= substr($seq, $start, $coord->length()); + $start += $coord->length; + } + + # if it is a gap or indel insert "-" + else { + $container_seq .= '-' x $coord->length(); + } + } + } + + return $container_seq; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1041 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper + +=head1 SYNOPSIS + + $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' ); + + # add a coodinate mapping - supply two pairs or coordinates + $map->add_map_coordinates( + $contig_id, $contig_start, $contig_end, $contig_ori, + $chr_name, chr_start, $chr_end + ); + + # map from one coordinate system to another + my @coordlist = + $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" ); + +=head1 DESCRIPTION + +Generic mapper to provide coordinate transforms between two disjoint +coordinate systems. This mapper is intended to be 'context neutral' - in +that it does not contain any code relating to any particular coordinate +system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper; +use strict; +use integer; + +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +use Bio::EnsEMBL::Mapper::Pair; +use Bio::EnsEMBL::Mapper::IndelPair; +use Bio::EnsEMBL::Mapper::Unit; +use Bio::EnsEMBL::Mapper::Coordinate; +use Bio::EnsEMBL::Mapper::IndelCoordinate; +use Bio::EnsEMBL::Mapper::Gap; + +use Bio::EnsEMBL::Utils::Exception qw(throw); + +# use Data::Dumper; + +=head2 new + + Arg [1] : string $from + The name of the 'from' coordinate system + Arg [2] : string $to + The name of the 'to' coordinate system + Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs + The 'from' coordinate system + Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs + Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO'); + Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object. + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : general + +=cut + +sub new { + my ( $proto, $from, $to, $from_cs, $to_cs ) = @_; + + if ( !defined($to) || !defined($from) ) { + throw("Must supply 'to' and 'from' tags"); + } + + my $class = ref($proto) || $proto; + + my $self = bless( { "_pair_$from" => {}, + "_pair_$to" => {}, + 'pair_count' => 0, + 'to' => $to, + 'from' => $from, + 'to_cs' => $to_cs, + 'from_cs' => $from_cs + }, + $class ); + + # do sql to get any componente with muliple assemblys. + + return $self; +} + +=head2 flush + + Args : none + Example : none + Description: removes all cached information out of this mapper + Returntype : none + Exceptions : none + Caller : AssemblyMapper, ChainedAssemblyMapper + +=cut + +sub flush { + my $self = shift; + my $from = $self->from(); + my $to = $self->to(); + + $self->{"_pair_$from"} = {}; + $self->{"_pair_$to"} = {}; + + $self->{'pair_count'} = 0; +} + + + +=head2 map_coordinates + + Arg 1 string $id + id of 'source' sequence + Arg 2 int $start + start coordinate of 'source' sequence + Arg 3 int $end + end coordinate of 'source' sequence + Arg 4 int $strand + raw contig orientation (+/- 1) + Arg 5 string $type + nature of transform - gives the type of + coordinates to be transformed *from* + Function generic map method + Returntype array of Bio::EnsEMBL::Mapper::Coordinate + and/or Bio::EnsEMBL::Mapper::Gap + Exceptions none + Caller Bio::EnsEMBL::Mapper + +=cut + +sub map_coordinates { + my ( $self, $id, $start, $end, $strand, $type ) = @_; + + unless ( defined($id) + && defined($start) + && defined($end) + && defined($strand) + && defined($type) ) + { + throw("Expecting 5 arguments"); + } + + # special case for handling inserts: + if ( $start == $end + 1 ) { + return $self->map_insert( $id, $start, $end, $strand, $type ); + } + + if ( !$self->{'_is_sorted'} ) { $self->_sort() } + + my $hash = $self->{"_pair_$type"}; + + my ( $from, $to, $cs ); + + if ( $type eq $self->{'to'} ) { + $from = 'to'; + $to = 'from'; + $cs = $self->{'from_cs'}; + } else { + $from = 'from'; + $to = 'to'; + $cs = $self->{'to_cs'}; + } + + unless ( defined $hash ) { + throw("Type $type is neither to or from coordinate systems"); + } + + if ( !defined $hash->{ uc($id) } ) { + # one big gap! + my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end ); + return $gap; + } + + my $last_used_pair; + my @result; + + my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord ); + my $lr = $hash->{ uc($id) }; + + $start_idx = 0; + $end_idx = $#{$lr}; + + # binary search the relevant pairs + # helps if the list is big + while ( ( $end_idx - $start_idx ) > 1 ) { + $mid_idx = ( $start_idx + $end_idx ) >> 1; + $pair = $lr->[$mid_idx]; + $self_coord = $pair->{$from}; + if ( $self_coord->{'end'} < $start ) { + $start_idx = $mid_idx; + } else { + $end_idx = $mid_idx; + } + } + + my $rank = 0; + my $orig_start = $start; + my $last_target_coord = undef; + for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) { + $pair = $lr->[$i]; + my $self_coord = $pair->{$from}; + my $target_coord = $pair->{$to}; + + # + # But not the case for haplotypes!! need to test for this case??? + # so removing this till a better solution is found + # + # + # if($self_coord->{'start'} < $start){ + # $start = $orig_start; + # $rank++; + # } + + if ( defined($last_target_coord) + and $target_coord->{'id'} ne $last_target_coord ) + { + if ( $self_coord->{'start'} < $start ) + { # i.e. the same bit is being mapped to another assembled bit + $start = $orig_start; + } + } else { + $last_target_coord = $target_coord->{'id'}; + } + + # if we haven't even reached the start, move on + if ( $self_coord->{'end'} < $orig_start ) { + next; + } + + # if we have over run, break + if ( $self_coord->{'start'} > $end ) { + last; + } + + if ( $start < $self_coord->{'start'} ) { + # gap detected + my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, + $self_coord->{'start'} - 1, $rank ); + + push( @result, $gap ); + $start = $gap->{'end'} + 1; + } + my ( $target_start, $target_end, $target_ori ); + my $res; + if ( exists $pair->{'indel'} ) { + # When next pair is an IndelPair and not a Coordinate, create the + # new mapping Coordinate, the IndelCoordinate. + $target_start = $target_coord->{'start'}; + $target_end = $target_coord->{'end'}; + + #create a Gap object + my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, + ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) ); + #create the Coordinate object + my $coord = + Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'}, + $target_start, $target_end, $pair->{'ori'}*$strand, $cs ); + #and finally, the IndelCoordinate object with + $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord ); + } else { + # start is somewhere inside the region + if ( $pair->{'ori'} == 1 ) { + $target_start = + $target_coord->{'start'} + + ( $start - $self_coord->{'start'} ); + } else { + $target_end = + $target_coord->{'end'} - ( $start - $self_coord->{'start'} ); + } + + # Either we are enveloping this map or not. If yes, then end + # point (self perspective) is determined solely by target. If + # not we need to adjust. + + if ( $end > $self_coord->{'end'} ) { + # enveloped + if ( $pair->{'ori'} == 1 ) { + $target_end = $target_coord->{'end'}; + } else { + $target_start = $target_coord->{'start'}; + } + } else { + # need to adjust end + if ( $pair->{'ori'} == 1 ) { + $target_end = + $target_coord->{'start'} + + ( $end - $self_coord->{'start'} ); + } else { + $target_start = + $target_coord->{'end'} - ( $end - $self_coord->{'start'} ); + } + } + + $res = + Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'}, + $target_start, $target_end, $pair->{'ori'}*$strand, + $cs, $rank ); + + } ## end else [ if ( exists $pair->{'indel'...})] + + push( @result, $res ); + + $last_used_pair = $pair; + $start = $self_coord->{'end'} + 1; + + } ## end for ( my $i = $start_idx...) + + if ( !defined $last_used_pair ) { + my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end ); + push( @result, $gap ); + + } elsif ( $last_used_pair->{$from}->{'end'} < $end ) { + # gap at the end + my $gap = + Bio::EnsEMBL::Mapper::Gap->new( + $last_used_pair->{$from}->{'end'} + 1, + $end, $rank ); + push( @result, $gap ); + } + + if ( $strand == -1 ) { + @result = reverse(@result); + } + + return @result; +} ## end sub map_coordinates + + + +=head2 map_insert + + Arg [1] : string $id + Arg [2] : int $start - start coord. Since this is an insert should always + be one greater than end. + Arg [3] : int $end - end coord. Since this is an insert should always + be one less than start. + Arg [4] : int $strand (0, 1, -1) + Arg [5] : string $type - the coordinate system name the coords are from. + Arg [6] : boolean $fastmap - if specified, this is being called from + the fastmap call. The mapping done is not any faster for + inserts, but the return value is different. + Example : + Description: This is in internal function which handles the special mapping + case for inserts (start = end +1). This function will be called + automatically by the map function so there is no reason to + call it directly. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects + Exceptions : none + Caller : map_coordinates() + +=cut + +sub map_insert { + my ($self, $id, $start, $end, $strand, $type, $fastmap) = @_; + + # swap start/end and map the resultant 2bp coordinate + ($start, $end) =($end,$start); + + my @coords = $self->map_coordinates($id, $start, $end, $strand, $type); + + if(@coords == 1) { + my $c = $coords[0]; + # swap start and end to convert back into insert + ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'}); + } else { + throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2); + + # adjust coordinates, remove gaps + my ($c1, $c2); + if($strand == -1) { + ($c2,$c1) = @coords; + } else { + ($c1, $c2) = @coords; + } + @coords = (); + + if(ref($c1) eq 'Bio::EnsEMBL::Mapper::Coordinate') { + # insert is after first coord + if($c1->{'strand'} * $strand == -1) { + $c1->{'end'}--; + } else { + $c1->{'start'}++; + } + @coords = ($c1); + } + if(ref($c2) eq 'Bio::EnsEMBL::Mapper::Coordinate') { + # insert is before second coord + if($c2->{'strand'} * $strand == -1) { + $c2->{'start'}++; + } else { + $c2->{'end'}--; + } + if($strand == -1) { + unshift @coords, $c2; + } else { + push @coords, $c2; + } + } + } + + if($fastmap) { + return undef if(@coords != 1); + my $c = $coords[0]; + return ($c->{'id'}, $c->{'start'}, $c->{'end'}, + $c->{'strand'}, $c->{'coord_system'}); + } + + return @coords; +} + + + + + + +=head2 fastmap + + Arg 1 string $id + id of 'source' sequence + Arg 2 int $start + start coordinate of 'source' sequence + Arg 3 int $end + end coordinate of 'source' sequence + Arg 4 int $strand + raw contig orientation (+/- 1) + Arg 5 int $type + nature of transform - gives the type of + coordinates to be transformed *from* + Function inferior map method. Will only do ungapped unsplit mapping. + Will return id, start, end strand in a list. + Returntype list of results + Exceptions none + Caller Bio::EnsEMBL::AssemblyMapper + +=cut + +sub fastmap { + my ($self, $id, $start, $end, $strand, $type) = @_; + + my ($from, $to, $cs); + + if($end+1 == $start) { + return $self->map_insert($id, $start, $end, $strand, $type, 1); + } + + if( ! $self->{'_is_sorted'} ) { $self->_sort() } + + if($type eq $self->{'to'}) { + $from = 'to'; + $to = 'from'; + $cs = $self->{'from_cs'}; + } else { + $from = 'from'; + $to = 'to'; + $cs = $self->{'to_cs'}; + } + + my $hash = $self->{"_pair_$type"} or + throw("Type $type is neither to or from coordinate systems"); + + + my $pairs = $hash->{uc($id)}; + + foreach my $pair (@$pairs) { + my $self_coord = $pair->{$from}; + my $target_coord = $pair->{$to}; + + # only super easy mapping is done + if( $start < $self_coord->{'start'} || + $end > $self_coord->{'end'} ) { + next; + } + + if( $pair->{'ori'} == 1 ) { + return ( $target_coord->{'id'}, + $target_coord->{'start'}+$start-$self_coord->{'start'}, + $target_coord->{'start'}+$end-$self_coord->{'start'}, + $strand, $cs ); + } else { + return ( $target_coord->{'id'}, + $target_coord->{'end'} - ($end - $self_coord->{'start'}), + $target_coord->{'end'} - ($start - $self_coord->{'start'}), + -$strand, $cs ); + } + } + + return (); +} + + + +=head2 add_map_coordinates + + Arg 1 int $id + id of 'source' sequence + Arg 2 int $start + start coordinate of 'source' sequence + Arg 3 int $end + end coordinate of 'source' sequence + Arg 4 int $strand + relative orientation of source and target (+/- 1) + Arg 5 int $id + id of 'target' sequence + Arg 6 int $start + start coordinate of 'target' sequence + Arg 7 int $end + end coordinate of 'target' sequence + Function Stores details of mapping between + 'source' and 'target' regions. + Returntype none + Exceptions none + Caller Bio::EnsEMBL::Mapper + +=cut + +sub add_map_coordinates { + my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori, + $chr_name, $chr_start, $chr_end ) + = @_; + + unless ( defined($contig_id) + && defined($contig_start) + && defined($contig_end) + && defined($contig_ori) + && defined($chr_name) + && defined($chr_start) + && defined($chr_end) ) + { + throw("7 arguments expected"); + } + + if ( ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) { + throw("Cannot deal with mis-lengthed mappings so far"); + } + + my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start, + $contig_end ); + my $to = + Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end ); + + my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori ); + + # place into hash on both ids + my $map_to = $self->{'to'}; + my $map_from = $self->{'from'}; + + push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair ); + push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair ); + + $self->{'pair_count'}++; + $self->{'_is_sorted'} = 0; +} ## end sub add_map_coordinates + + +=head2 add_indel_coordinates + + Arg 1 int $id + id of 'source' sequence + Arg 2 int $start + start coordinate of 'source' sequence + Arg 3 int $end + end coordinate of 'source' sequence + Arg 4 int $strand + relative orientation of source and target (+/- 1) + Arg 5 int $id + id of 'targe' sequence + Arg 6 int $start + start coordinate of 'targe' sequence + Arg 7 int $end + end coordinate of 'targe' sequence + Function stores details of mapping between two regions: + 'source' and 'target'. Returns 1 if the pair was added, 0 if it + was already in. Used when adding an indel + Returntype int 0,1 + Exceptions none + Caller Bio::EnsEMBL::Mapper + +=cut + +sub add_indel_coordinates{ + my ($self, $contig_id, $contig_start, $contig_end, + $contig_ori, $chr_name, $chr_start, $chr_end) = @_; + + unless(defined($contig_id) && defined($contig_start) && defined($contig_end) + && defined($contig_ori) && defined($chr_name) && defined($chr_start) + && defined($chr_end)) { + throw("7 arguments expected"); + } + + #we need to create the IndelPair object to add to both lists, to and from + my $from = + Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end); + my $to = + Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end); + + my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori); + + # place into hash on both ids + my $map_to = $self->{'to'}; + my $map_from = $self->{'from'}; + + push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair ); + push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair ); + + $self->{'pair_count'}++; + + $self->{'_is_sorted'} = 0; + return 1; +} + + +=head2 map_indel + + Arg [1] : string $id + Arg [2] : int $start - start coord. Since this is an indel should always + be one greater than end. + Arg [3] : int $end - end coord. Since this is an indel should always + be one less than start. + Arg [4] : int $strand (0, 1, -1) + Arg [5] : string $type - the coordinate system name the coords are from. + Example : @coords = $mapper->map_indel(); + Description: This is in internal function which handles the special mapping + case for indels (start = end +1). It will be used to map from + a coordinate system with a gap to another that contains an + insertion. It will be mainly used by the Variation API. + Returntype : Bio::EnsEMBL::Mapper::Unit objects + Exceptions : none + Caller : general + +=cut + +sub map_indel { + my ( $self, $id, $start, $end, $strand, $type ) = @_; + + # swap start/end and map the resultant 2bp coordinate + ( $start, $end ) = ( $end, $start ); + + if ( !$self->{'_is_sorted'} ) { $self->_sort() } + + my $hash = $self->{"_pair_$type"}; + + my ( $from, $to, $cs ); + + if ( $type eq $self->{'to'} ) { + $from = 'to'; + $to = 'from'; + $cs = $self->{'from_cs'}; + } else { + $from = 'from'; + $to = 'to'; + $cs = $self->{'to_cs'}; + } + + unless ( defined $hash ) { + throw("Type $type is neither to or from coordinate systems"); + } + my @indel_coordinates; + + my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord ); + my $lr = $hash->{ uc($id) }; + + $start_idx = 0; + $end_idx = $#{$lr}; + + # binary search the relevant pairs + # helps if the list is big + while ( ( $end_idx - $start_idx ) > 1 ) { + $mid_idx = ( $start_idx + $end_idx ) >> 1; + $pair = $lr->[$mid_idx]; + $self_coord = $pair->{$from}; + if ( $self_coord->{'end'} <= $start ) { + $start_idx = $mid_idx; + } else { + $end_idx = $mid_idx; + } + } + + for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) { + $pair = $lr->[$i]; + my $self_coord = $pair->{$from}; + my $target_coord = $pair->{$to}; + + if ( exists $pair->{'indel'} ) { + #need to return unit coordinate + my $to = + Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'}, + $target_coord->{'start'}, + $target_coord->{'end'}, ); + push @indel_coordinates, $to; + last; + } + } + + return @indel_coordinates; +} ## end sub map_indel + + +=head2 add_Mapper + + Arg 1 Bio::EnsEMBL::Mapper $mapper2 + Example $mapper->add_Mapper($mapper2) + Function add all the map coordinates from $mapper to this mapper. + This object will contain mapping pairs from both the old + object and $mapper2. + Returntype int 0,1 + Exceptions throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers + are incompatible + Caller $mapper->methodname() + +=cut + +sub add_Mapper{ + my ($self, $mapper) = @_; + + my $mapper_to = $mapper->{'to'}; + my $mapper_from = $mapper->{'from'}; + if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) { + throw("Trying to add an incompatible Mapper"); + } + + my $count_a = 0; + foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) { + push(@{$self->{"_pair_$mapper_to"}->{$seq_name}}, + @{$mapper->{"_pair_$mapper_to"}->{$seq_name}}); + $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}}); + } + my $count_b = 0; + foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) { + push(@{$self->{"_pair_$mapper_from"}->{$seq_name}}, + @{$mapper->{"_pair_$mapper_from"}->{$seq_name}}); + $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}}); + } + + if ($count_a == $count_b) { + $self->{'pair_count'} += $count_a; + } else { + throw("Trying to add a funny Mapper"); + } + + $self->{'_is_sorted'} = 0; + return 1; +} + + + +=head2 list_pairs + + Arg 1 int $id + id of 'source' sequence + Arg 2 int $start + start coordinate of 'source' sequence + Arg 3 int $end + end coordinate of 'source' sequence + Arg 4 string $type + nature of transform - gives the type of + coordinates to be transformed *from* + Function list all pairs of mappings in a region + Returntype list of Bio::EnsEMBL::Mapper::Pair + Exceptions none + Caller Bio::EnsEMBL::Mapper + +=cut + +sub list_pairs { + my ( $self, $id, $start, $end, $type ) = @_; + + if ( !$self->{'_is_sorted'} ) { $self->_sort() } + + if ( !defined $type ) { + throw("Expected 4 arguments"); + } + + if ( $start > $end ) { + throw( "Start is greater than end " + . "for id $id, start $start, end $end\n" ); + } + + my $hash = $self->{"_pair_$type"}; + + my ( $from, $to ); + + if ( $type eq $self->{'to'} ) { + $from = 'to'; + $to = 'from'; + } else { + $from = 'from'; + $to = 'to'; + } + + unless ( defined $hash ) { + throw("Type $type is neither to or from coordinate systems"); + } + + my @list; + + unless ( exists $hash->{ uc($id) } ) { + return (); + } + + @list = @{ $hash->{ uc($id) } }; + + my @output; + if ( $start == -1 && $end == -1 ) { + return @list; + } else { + + foreach my $p (@list) { + + if ( $p->{$from}->{'end'} < $start ) { + next; + } + if ( $p->{$from}->{'start'} > $end ) { + last; + } + push( @output, $p ); + } + return @output; + } +} ## end sub list_pairs + + +=head2 to + + Arg 1 Bio::EnsEMBL::Mapper::Unit $id + id of 'source' sequence + Function accessor method form the 'source' + and 'target' in a Mapper::Pair + Returntype Bio::EnsEMBL::Mapper::Unit + Exceptions none + Caller Bio::EnsEMBL::Mapper + +=cut + +sub to { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'to'} = $value; + } + + return $self->{'to'}; +} + +=head2 from + + Arg 1 Bio::EnsEMBL::Mapper::Unit $id + id of 'source' sequence + Function accessor method form the 'source' + and 'target' in a Mapper::Pair + Returntype Bio::EnsEMBL::Mapper::Unit + Exceptions none + Caller Bio::EnsEMBL::Mapper + +=cut +sub from { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'from'} = $value; + } + + return $self->{'from'}; +} + + +# _dump +# +# Arg 1 *FileHandle $fh +# Function convenience dump function +# possibly useful for debugging +# Returntype none +# Exceptions none +# Caller internal +# + +sub _dump{ + my ($self,$fh) = @_; + + if( !defined $fh ) { + $fh = \*STDERR; + } + + foreach my $id ( keys %{$self->{'_pair_hash_from'}} ) { + print $fh "From Hash $id\n"; + foreach my $pair ( @{$self->{'_pair_hash_from'}->{uc($id)}} ) { + print $fh " ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n"; + } + } + +} + + +# _sort +# +# Function sort function so that all +# mappings are sorted by +# chromosome start +# Returntype none +# Exceptions none +# Caller internal +# + +sub _sort { + my ($self) = @_; + + my $to = $self->{'to'}; + my $from = $self->{'from'}; + + foreach my $id ( keys %{ $self->{"_pair_$from"} } ) { + @{ $self->{"_pair_$from"}->{$id} } = + sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} } + @{ $self->{"_pair_$from"}->{$id} }; + } + + foreach my $id ( keys %{ $self->{"_pair_$to"} } ) { + @{ $self->{"_pair_$to"}->{$id} } = + sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} } + @{ $self->{"_pair_$to"}->{$id} }; + } + + $self->_merge_pairs(); + $self->_is_sorted(1); +} + +# this function merges pairs that are adjacent into one +sub _merge_pairs { + my $self = shift; + + my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair ); + + my $map_to = $self->{'to'}; + my $map_from = $self->{'from'}; + $self->{'pair_count'} = 0; + + for my $key ( keys %{$self->{"_pair_$map_to"}} ) { + $lr = $self->{"_pair_$map_to"}->{$key}; + + my $i = 0; + my $next = 1; + my $length = $#{$lr}; + while( $next <= $length ) { + $current_pair = $lr->[$i]; + $next_pair = $lr->[$next]; + $del_pair = undef; + + if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){ + #necessary to modify the merge function to not merge indels + $next++; + $i++; + } + else{ + # duplicate filter + if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'} + and $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'} ) { + $del_pair = $next_pair; + } elsif(( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) && + ( $next_pair->{'ori'} == $current_pair->{'ori'} ) && + ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) { + + if( $current_pair->{'ori'} == 1 ) { + # check forward strand merge + if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) { + # normal merge with previous element + $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'}; + $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'}; + $del_pair = $next_pair; + } + } else { + # check backward strand merge + if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) { + # yes its a merge + $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'}; + $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'}; + $del_pair = $next_pair; + } + } + } + + if( defined $del_pair ) { + splice( @$lr, $next, 1 ); + $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})}; + for( my $j=0; $j <= $#{$lr_from}; $j++ ) { + if( $lr_from->[$j] == $del_pair ) { + splice( @$lr_from, $j, 1 ); + last; + } + } + $length--; + if( $length < $next ) { last; } + } + else { + $next++; + $i++; + } + } + + } + $self->{'pair_count'} += scalar( @$lr ); + } +} + + +# _is_sorted +# +# Arg 1 int $sorted +# Function toggle for whether the (internal) +# map data are sorted +# Returntype int +# Exceptions none +# Caller internal +# + +sub _is_sorted { + my ($self, $value) = @_; + $self->{'_is_sorted'} = $value if (defined($value)); + return $self->{'_is_sorted'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/Coordinate.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/Coordinate.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,208 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::Coordinate + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Representation of a mapped region in a sequence; returned from Mapper.pm +when the target region maps on to valid sequence. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::Coordinate; + +use strict; + +=head2 new + + Arg [1] char|int id of object in mapped region + Arg [2] int start of region + Arg [3] int end of region + Arg [4] int strand if region + Arg [5] Bio::EnsEMBL::CoordSystem coordsytem of the region + Function creates a new Coordinate object + Returntype Bio::EnsEMBL::Mapper::Coordinate + Exceptions none + Status Stable + +=cut + +sub new { + my ( $proto, $id, $start, $end, $strand, $coord_system, $rank ) = @_; + + my $class = ref($proto) || $proto; + + return + bless( { 'id' => $id, + 'start' => $start, + 'end' => $end, + 'strand' => $strand, + 'coord_system' => $coord_system, + 'rank' => $rank || 0 + }, + $class ); +} + + +=head2 start + + Arg 1 int $start + start coordinate of object in mapped region + Function getter/setter method + Returntype int + Exceptions none + Caller Bio::EnsEMBL::Mapper::Coordinate + Status Stable + +=cut + +sub start { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'start'} = $value; + } + + return $self->{'start'}; +} + + +=head2 end + + Arg 1 int $end + end coordinate of object in mapped region + Function getter/setter method + Returntype int + Exceptions none + Caller Bio::EnsEMBL::Mapper::Coordinate + Status Stable + +=cut + +sub end { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'end'} = $value; + } + + return $self->{'end'}; +} + +=head2 strand + + Arg 1 int $strand + strand of object in mapped region + Function getter/setter method + Returntype int + Exceptions none + Caller Bio::EnsEMBL::Mapper::Coordinate + Status Stable + +=cut + +sub strand { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'strand'} = $value; + } + + return $self->{'strand'}; +} + +=head2 id + + Arg 1 char|int $id + id of object in mapped region + e.g. seq_region_id / seq_region_name + Function getter/setter method + Returntype char|int + Exceptions none + Caller Bio::EnsEMBL::Mapper::Coordinate + Status Stable + +=cut + +sub id { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'id'} = $value; + } + + return $self->{'id'}; +} + +=head2 coord_system + + Arg 1 Bio::EnsEMBL::CoordSystem + Function getter/setter method + Returntype Bio::EnsEMBL::CoordSystem + Exceptions none + Caller Bio::EnsEMBL::Mapper::Coordinate + Status Stable + +=cut + +sub coord_system { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'coord_system'} = $value; + } + + return $self->{'coord_system'}; +} + +=head2 length + + Function getter method + Returntype int + Exceptions none + Caller ? + Status Stable + +=cut + +sub length { + my ($self) = @_; + + return $self->{'end'} - $self->{'start'} + 1; +} + +sub rank { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'rank'} = $value; + } + + return $self->{'rank'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/Gap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/Gap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,135 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::Gap + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Representation of a gap in a sequence; returned from Mapper.pm when the +target region is in a gap. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::Gap; + +use strict; + +=head2 new + + Arg [1] : int $start + Arg [2] : int $end + Example : $gap = Bio::EnsEMBL::Mapper::Gap($start, $end); + Description: Creates a new Gap object. + Returntype : Bio::EnsEMBL::Mapper::Gap + Exceptions : none + Caller : Bio::EnsEMBL::Mapper + Status : Stable + +=cut + +sub new { + my ( $proto, $start, $end, $rank ) = @_; + + my $class = ref($proto) || $proto; + + return bless( { 'start' => $start, 'end' => $end, 'rank' => $rank || 0 }, $class ); +} + +=head2 start + + Arg [1] : (optional) int $start + start coordinate of gap region + Example : $start = $gap->start(); + Description: Getter/Setter for the start attribute + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub start { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'start'} = $value; + } + + return $self->{'start'}; +} + +=head2 end + + Arg [1] : (optional) int $newval + The new value to set the end coordinate to + Example : $end = $gap->end() + Description: Getter/Setter for the end coordinate of the gap region + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub end { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'end'} = $value; + } + + return $self->{'end'}; +} + +=head2 length + + Arg [1] : none + Example : $len = $gap->length(); + Description: Getter for the length of this gap region + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length { + my ($self) = @_; + + return $self->{'end'} - $self->{'start'} + 1; +} + +sub rank { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'rank'} = $value; + } + + return $self->{'rank'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/IndelCoordinate.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/IndelCoordinate.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,135 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::IndelCoordinate + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Representation of a indel in a sequence; returned from Mapper.pm when +the target region is in a deletion. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::IndelCoordinate; + +use Bio::EnsEMBL::Mapper::Gap; +use Bio::EnsEMBL::Mapper::Coordinate; + +use vars qw(@ISA); +use strict; + +@ISA = qw(Bio::EnsEMBL::Mapper::Coordinate Bio::EnsEMBL::Mapper::Gap); + + +=head2 new + + Arg [1] : Bio::EnsEMBL::Mapper::Gap $gap + Arg [2] : Bio::EnsEMBL::Mapper::Coordinate $coordinate + Example : $indelCoord = Bio::EnsEMBL::Mapper::IndelCoordinate($gap, $coordinate); + Description: Creates a new IndelCoordinate object. + Returntype : Bio::EnsEMBL::Mapper::IndelCoordinate + Exceptions : none + Caller : Bio::EnsEMBL::Mapper + +=cut + +sub new { + my ( $proto, $gap, $coordinate ) = @_; + + my $class = ref($proto) || $proto; + + return + bless( { 'start' => $coordinate->start(), + 'end' => $coordinate->end(), + 'strand' => $coordinate->strand(), + 'id' => $coordinate->id(), + 'coord_system' => $coordinate->coord_system(), + 'gap_start' => $gap->start(), + 'gap_end' => $gap->end() + }, + $class ); +} + +=head2 gap_start + + Arg[1] : (optional) int $gap_start + Example : $gap_start = $ic->gap_start() + Description : Getter/Setter for the start of the Gap region + ReturnType : int + Exceptions : none + Caller : general + +=cut + +sub gap_start { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'gap_start'} = $value; + } + + return $self->{'gap_start'}; +} + +=head2 gap_end + + Arg[1] : (optional) int $gap_end + Example : $gap_end = $ic->gap_end() + Description : Getter/Setter for the end of the Gap region + ReturnType : int + Exceptions : none + Caller : general + +=cut + +sub gap_end { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'gap_end'} = $value; + } + + return $self->{'gap_end'}; +} + +=head2 gap_length + + Args : None + Example : $gap_length = $ic->gap_length() + Description : Getter for the length of the Gap region + ReturnType : int + Exceptions : none + Caller : general + +=cut + +sub gap_length { + my ($self) = @_; + + return $self->{'gap_end'} - $self->{'gap_start'} + 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/IndelPair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/IndelPair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,55 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::IndelPair + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Two regions mapped between different coordinate systems are each +represented by a Bio::EnsEMBL::Mapper::Unit and joined together as a +Bio::EnsEMBL::Mapper::Pair, when one of the regions is an indel. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::IndelPair; + +use vars qw(@ISA); +use strict; + +@ISA = qw(Bio::EnsEMBL::Mapper::Pair); + +sub new { + my ($proto, @args) = @_; + + my $class = ref($proto) || $proto; + + my $self = $class->SUPER::new(@args); # create the Pair object + $self->{'indel'} = 1; # and add the Indel flag + + return $self; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/Pair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/Pair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,118 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::Pair + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Two regions mapped between different coordinate systems are each +represented by a Bio::EnsEMBL::Mapper::Unit and joined together as a +Bio::EnsEMBL::Mapper::Pair. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::Pair; + +use strict; + +sub new { + my ( $proto, $from, $to, $ori ) = @_; + + my $class = ref($proto) || $proto; + + return + bless( { 'from' => $from, 'to' => $to, 'ori' => $ori }, $class ); +} + +=head2 to + + Arg 1 Bio::EnsEMBL::Mapper::Unit $seqobj + from and to represent the two regions + which are mapped to each other + Function accessor method + Returntype Bio::EnsEMBL::Mapper::Unit + Exceptions none + Caller Bio::EnsEMBL::Mapper::Pair + Status : Stable + +=cut + +sub to { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'to'} = $value; + } + + return $self->{'to'}; +} + +=head2 from + + Arg 1 Bio::EnsEMBL::Mapper::Unit $seqobj + from and to represent the two regions + which are mapped to each other + Function accessor method + Returntype Bio::EnsEMBL::Mapper::Unit + Exceptions none + Caller Bio::EnsEMBL::Mapper::Pair + Status : Stable + +=cut +sub from { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'from'} = $value; + } + + return $self->{'from'}; +} + +=head2 ori + + Arg 1 Bio::EnsEMBL::Mapper::Unit $ori + Function accessor method + relative orientation of the the + two mapped regions + Returntype Bio::EnsEMBL::Mapper::Unit + Exceptions none + Caller Bio::EnsEMBL::Mapper::Pair + Status : Stable + +=cut + +sub ori { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'ori'} = $value; + } + + return $self->{'ori'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/RangeRegistry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/RangeRegistry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,376 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::RangeRegistry + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Mapper::RangeRegistry; + + $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + # Get a fixed width chunk around the range of intereset. This + # will be used if any registration is actually necessary. + $chunk_start = ( $start >> 20 ) << 20 + 1; + $chunk_end = ( ( $end >> 20 ) + 1 ) << 20; + + # Check if any registration is necessary for the range. If it is + # register a large chunked area instead and return a listref of + # unregistered areas that need to be loaded. + if ( + $pairs = $rr->check_and_register( + $id, $start, $end, $chunk_start, $chunk_end + ) ) + { + foreach my $pair (@$pairs) { + my ( $pair_start, $pair_end ) = @$pair; + # Fetch mappings for these regions from the assembly table and + # load them into the mapper. + ...; + } + } else { + # The range ($start - $end) is already registered + ...; + } + + # Check if any registration is necessary. If it is register the + # region and return a listref of pairs that need to be loaded. + if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) { + ...; + } + +=head1 DESCRIPTION + +This module maintains an internal list of registered regions and is +used to quickly ascertain if and what regions of a provided range need +registration. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::RangeRegistry; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use integer; + +=head2 new + + Arg [1] : none + Example : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + Description: Creates a new RangeRegistry object + Returntype : Bio::EnsEMBL::Mapper::RangeRegistry + Exceptions : none + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +sub new { + my ($proto) = @_; + + my $class = ref($proto) || $proto; + + return bless( { 'registry' => {} }, $class ); +} + +sub flush { + my ($self) = @_; + $self->{'registry'} = {}; +} + +=head2 check_and_register + + Arg [1] : string $id + The id of the range to be checked/registered (e.g. a sequenceid) + Arg [2] : int $start + The start of the range to be checked + Arg [3] : int $end + The end of the range to be checked + Arg [4] : (optional) int $rstart + The start of the range to be registered + if the checked range was not fully registered + Arg [5] : (optional) int $rend + The end of the range to be registerd + if the checked range was not fully registered + Example : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000)); + Description: Checks the range registry to see if the entire range + denoted by ($id : $start-$end) is already registered. If + it already is, then undef is returned. If it is not then + the range specified by $rstart and $rend is registered, and + a list of regions that were required to completely register + $rstart-$rend is returned. If $rstart and $rend are not + defined they default to $start and $end respectively. + + The reason there is a single call to do both the checking and + registering is to reduce the overhead. Much of the work to + check if a range is registered is the same as registering a + region around that range. + Returntype : undef or listref of [start,end] range pairs + Exceptions : throw if rstart is greater than start + throw if rend is less than end + throw if end is less than start + throw if id, start, or end are not defined + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +#"constants" +my $START = 0; +my $END = 1; + +sub check_and_register { + my ( $self, $id, $start, $end, $rstart, $rend ) = @_; + + $rstart = $start if ( !defined($rstart) ); + $rend = $end if ( !defined($rend) ); + + # + # Sanity checks + # + if ( !defined($id) || !defined($start) || !defined($end) ) { + throw("ID, start, end arguments are required"); + } + + # The following was commented out due to Ensembl Genomes requirements + # for bacterial genomes. + # + ## if ( $start > $end ) { + ## throw( "start argument [$start] must be less than " + ## . "(or equal to) end argument [$end]" ); + ## } + ## + ## if ( $rstart > $rend ) { + ## throw( "rstart argument [$rstart] must be less than " + ## . "(or equal to) rend argument [$rend] argument" ); + ## } + + if ( $rstart > $start ) { + throw("rstart [$rstart] must be less than or equal to start [$start]"); + } + + if ( $rend < $end ) { + throw("rend [$rend] must be greater than or equal to end [$end]"); + } + + my $reg = $self->{'registry'}; + my $list = $reg->{$id} ||= []; + + my @gap_pairs; + + my $len = scalar(@$list); + + if ( $len == 0 ) { + # this is the first request for this id, return a gap pair for the + # entire range and register it as seen + $list->[0] = [ $rstart, $rend ]; + return [ [ $rstart, $rend ] ]; + } + + ##### + # loop through the list of existing ranges recording any "gaps" where + # the existing range does not cover part of the requested range + # + + my $start_idx = 0; + my $end_idx = $#$list; + my ( $mid_idx, $range ); + + # binary search the relevant pairs + # helps if the list is big + while ( ( $end_idx - $start_idx ) > 1 ) { + $mid_idx = ( $start_idx + $end_idx ) >> 1; + $range = $list->[$mid_idx]; + + if ( $range->[1] < $rstart ) { + $start_idx = $mid_idx; + } else { + $end_idx = $mid_idx; + } + } + + my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx ); + $gap_start = $rstart; + + for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) { + my ( $pstart, $pend ) = @{ $list->[$CUR] }; + + # no work needs to be done at all if we find a range pair that + # entirely overlaps the requested region + if ( $pstart <= $start && $pend >= $end ) { + return undef; + } + + # find adjacent or overlapping regions already registered + if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) { + if ( !defined($rstart_idx) ) { + $rstart_idx = $CUR; + } + $rend_idx = $CUR; + } + + if ( $pstart > $rstart ) { + $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1; + push @gap_pairs, [ $gap_start, $gap_end ]; + } + + $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1; + + # if($pstart > $rend && !defined($r_idx)) { + if ( $pend >= $rend && !defined($r_idx) ) { + $r_idx = $CUR; + last; + } + } ## end for ( my $CUR = $start_idx... + + # do we have to make another gap? + if ( $gap_start <= $rend ) { + push @gap_pairs, [ $gap_start, $rend ]; + } + + # + # Merge the new range into the registered list + # + if ( defined($rstart_idx) ) { + my ( $new_start, $new_end ); + + if ( $rstart < $list->[$rstart_idx]->[0] ) { + $new_start = $rstart; + } else { + $new_start = $list->[$rstart_idx]->[0]; + } + + if ( $rend > $list->[$rend_idx]->[1] ) { + $new_end = $rend; + } else { + $new_end = $list->[$rend_idx]->[1]; + } + + splice( @$list, $rstart_idx, + $rend_idx - $rstart_idx + 1, + [ $new_start, $new_end ] ); + + } elsif ( defined($r_idx) ) { + splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); + } else { + push( @$list, [ $rstart, $rend ] ); + } + + return \@gap_pairs; +} ## end sub check_and_register + +# overlap size is just added to make RangeRegistry generally more useful + +=head2 overlap_size + + Arg [1] : string $id + Arg [2] : int $start + Arg [3] : int $end + Example : my $overlap_size = $rr->( "chr1", 1, 100_000_000 ) + Description: finds out how many bases of the given range are registered + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub overlap_size { + my ( $self, $id, $start, $end ) = @_; + + my $overlap = 0; + + if ( $start > $end ) { return 0 } + + my $reg = $self->{'registry'}; + my $list = $reg->{$id} ||= []; + + my $len = scalar(@$list); + + if ( $len == 0 ) { + # this is the first request for this id, return a gap pair for the + # entire range and register it as seen + return 0; + } + + ##### + # loop through the list of existing ranges recording any "gaps" where + # the existing range does not cover part of the requested range + # + + my $start_idx = 0; + my $end_idx = $#$list; + my ( $mid_idx, $range ); + + # binary search the relevant pairs + # helps if the list is big + while ( ( $end_idx - $start_idx ) > 1 ) { + $mid_idx = ( $start_idx + $end_idx ) >> 1; + $range = $list->[$mid_idx]; + if ( $range->[1] < $start ) { + $start_idx = $mid_idx; + } else { + $end_idx = $mid_idx; + } + } + + for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) { + my ( $pstart, $pend ) = @{ $list->[$CUR] }; + + if ( $pstart > $end ) { + # No more interesting ranges here. + last; + } + + # no work needs to be done at all if we find a range pair that + # entirely overlaps the requested region + if ( $pstart <= $start && $pend >= $end ) { + $overlap = $end - $start + 1; + last; + } + + my $mstart = ( $start < $pstart ? $pstart : $start ); + my $mend = ( $end < $pend ? $end : $pend ); + + if ( $mend - $mstart >= 0 ) { + $overlap += ( $mend - $mstart + 1 ); + } + } + + return $overlap; +} ## end sub overlap_size + + +# low level function to access the ranges +# only use for read access + +sub get_ranges { + my ( $self, $id ) = @_; + + return $self->{'registry'}->{$id}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Mapper/Unit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/Unit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,119 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::Unit - One side of a map pair + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Two regions mapped between different coordinate systems are each +represented by a Bio::EnsEMBL::Mapper::Unit and joined together as a +Bio::EnsEMBL::Mapper::Pair. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Mapper::Unit; + +use strict; + +sub new { + my ( $proto, $id, $start, $end ) = @_; + + my $class = ref($proto) || $proto; + + return + bless( { 'id' => $id, 'start' => $start, 'end' => $end }, $class ); +} + +=head2 id + + Arg 1 int|char $id + the id of the object (e.g. seq_region_name) which is mapped + Function accessor method + Returntype int|char + Exceptions none + Caller Bio::EnsEMBL::Mapper::Unit + Status Stable + +=cut + +sub id { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'id'} = $value; + } + + return $self->{'id'}; +} + +=head2 start + + Arg 1 int $start + the start coordinate of the mapped + region which this object represents + Function accessor method + Returntype int + Exceptions none + Caller Bio::EnsEMBL::Mapper::Unit + Status Stable + +=cut + +sub start { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'start'} = $value; + } + + return $self->{'start'}; +} + +=head2 end + + Arg 1 int $end + the end coordinate of the mapped + region which this object represents + Function accessor method + Returntype int + Exceptions none + Caller Bio::EnsEMBL::Mapper::Unit + Status Stable + +=cut + +sub end { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'end'} = $value; + } + + return $self->{'end'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/MiscFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/MiscFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,382 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::MiscFeature - A miscelaneous feature with arbitrary features and +associations. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::MiscFeature; + use Bio::EnsEMBL::MiscSet; + use Bio::EnsEMBL::Attribute; + + my $mfeat = Bio::EnsEMBL::MiscFeature->new( + -START => 1200, + -END => 100_000, + -STRAND => 1, + -SLICE => $slice + ); + + # Can add attributes to the misc feature and associate with various + # sets + my $clone_set = Bio::EnsEMBL::MiscSet->new( + -CODE => 'clone', + -NAME => '1MB clone set', + -DESCRIPTION => '1MB CloneSet' + ); + + my $tiling_path_set = Bio::EnsEMBL::MiscSet->new( + -CODE => 'tilingpath', + -NAME => 'tiling path set' + ); + + my $attrib1 = Bio::EnsEMBL::Attribute->new( + -VALUE => 'RLX12451', + -CODE => 'name', + -NAME => 'name' + ); + + my $attrib2 = Bio::EnsEMBL::Attribute->new( + -VALUE => '4', + -CODE => 'version', + -NAME => 'version' + ); + + my $attrib3 = Bio::EnsEMBL::Attribute->new( + -VALUE => 'AL42131.4', + -CODE => 'synonym', + -NAME => 'synonym' + ); + + # can associate a misc feature with any number of sets + + $mfeat->add_MiscSet($clone_set); + $mfeat->add_MiscSet($tiling_path_set); + + # can add arbitrary attributes to a misc feature + + $mfeat->add_Attribute($attrib1); + $mfeat->add_Attribute($attrib2); + $mfeat->add_Attribute($attrib3); + + my ($name_attrib) = @{ $mfeat->get_all_Attributes('name') }; + my @all_attribs = @{ $mfeat->get_all_Attributes() }; + + my @all_sets = @{ $mfeat->get_all_MiscSets() }; + my ($clone_set) = @{ $mfeat->get_all_CloneSets('clone') }; + + + # Can do normal feature operations as well + $mfeat = $mfeat->transform('supercontig'); + print $mfeat->slice->seq_region_name, ' ', $mfeat->start, '-', + $mfeat->end; + + +=head1 DESCRIPTION + +MiscFeatures are extremely general features with a location, and an +arbitrary group of attributes. They are grouped with other features of +the same 'type' through the use of MiscSets (see Bio::EnsEMBL::MiscSet). +Attributes are attached in the fom of Bio::EnsEMBL::Attribute objects. +See Bio::EnsEMBL::DBSQL::MiscFeatureAdaptor for ways to fetch or store +MiscFeatures. + +=cut + + +package Bio::EnsEMBL::MiscFeature; + +use strict; +use warnings; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Scalar::Util qw(weaken isweak); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Feature); + +=head2 new + + Arg [-SLICE]: Bio::EnsEMBL::SLice - Represents the sequence that this + feature is on. The coordinates of the created feature are + relative to the start of the slice. + Arg [-START]: The start coordinate of this feature relative to the start + of the slice it is sitting on. Coordinates start at 1 and + are inclusive. + Arg [-END] : The end coordinate of this feature relative to the start of + the slice it is sitting on. Coordinates start at 1 and are + inclusive. + Arg [-STRAND]: The orientation of this feature. Valid values are 1,-1,0. + Arg [-SEQNAME] : A seqname to be used instead of the default name of the + of the slice. Useful for features that do not have an + attached slice such as protein features. + Arg [-dbID] : (optional) internal database id + Arg [-ADAPTOR]: (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor + Example : $feature = Bio::EnsEMBL::MiscFeature->new(-start => 1, + -end => 100, + -strand => 1, + -slice => $slice, + -analysis => $analysis); + Description: Constructs a new Bio::EnsEMBL::Feature. Generally subclasses + of this method are instantiated, rather than this class itself. + Returntype : Bio::EnsEMBL::MiscFeature + Exceptions : Thrown on invalid -SLICE, -ANALYSIS, -STRAND ,-ADAPTOR arguments + Caller : general, subclass constructors + Status : Stable + +=cut + + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + $self->{'attributes'} = []; + + return $self; +} + + +=head2 new_fast + + Arg [...] : hashref to bless as new MiscFeature + Example : $miscfeature = Bio::EnsEMBL::MiscFeature->new_fast(); + Description: Creates a new Miscfeature. + Returntype : Bio::EnsEMBL::MiscFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub new_fast { + my $class = shift; + my $hashref = shift; + + $hashref->{'attributes'} ||= []; + + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + + +=head2 add_Attribute + + Arg [1] : Bio::EnsEMBL::Attribute $attribute + Example : $misc_feature->add_attribute($attribute); + Description: Adds an attribute to this misc. feature + Returntype : none + Exceptions : throw on wrong argument type + Caller : general + Status : Stable + +=cut + +sub add_Attribute { + my ($self, $attrib) = @_; + + if( ! defined $attrib || ! $attrib->isa( "Bio::EnsEMBL::Attribute" )) { + throw( "You have to provide a Bio::EnsEMBL::Attribute, not a [$attrib]" ); + } + + $self->{'attributes'} ||= []; + push @{$self->{'attributes'}}, $attrib +} + + + +=head2 add_MiscSet + + Arg [1] : Bio::EnsEMBL::MiscSet $set + The set to add + Example : $misc_feature->add_MiscSet(Bio::EnsEMBL::MiscSet->new(...)); + Description: Associates this MiscFeature with a given Set. + Returntype : none + Exceptions : throw if the set arg is not provided, + throw if the set to be added does not have a code + Caller : general + Status : Stable + +=cut + + +sub add_MiscSet { + my $self = shift; + my $miscSet = shift; + + if(!$miscSet || !ref($miscSet) || !$miscSet->isa('Bio::EnsEMBL::MiscSet')) { + throw('Set argument must be a Bio::EnsEMBL::MiscSet'); + } + + $self->{'miscSets'} ||= []; + + push( @{$self->{'miscSets'}}, $miscSet ); +} + + + +=head2 get_all_MiscSets + + Arg [1] : optional string $code + The code of the set to retrieve + Example : $set = $misc_feature->get_all_MiscSets($code); + Description: Retrieves a set that this feature is associated with via its + code. Can return empty lists. Usually returns about one elements lists. + Returntype : listref of Bio::EnsEMBL::MiscSet + Exceptions : throw if the code arg is not provided + Caller : general + Status : Stable + +=cut + + +sub get_all_MiscSets { + my $self = shift; + my $code = shift; + + $self->{'miscSets'} ||= []; + if( defined $code ) { + my @results = grep { uc($_->code())eq uc( $code ) } @{$self->{'miscSets'}}; + return \@results; + } else { + return $self->{'miscSets'}; + } +} + + +=head2 get_all_Attributes + + Arg [1] : optional string $code + The code of the Attribute objects to retrieve + Example : @attributes = @{ $misc_feature->get_all_Attributes('name') }; + Description: Retrieves a list of Attribute objects for given code or all + of the associated Attributes. + Returntype : listref of Bio::EnsEMBL::Attribute + Exceptions : + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $code = shift; + + my @results; + my $result; + + if( defined $code ) { + @results = grep { uc( $_->code() ) eq uc( $code )} @{$self->{'attributes'}}; + return \@results; + } else { + return $self->{'attributes'}; + } +} + +=head2 get_all_attribute_values + + Arg [1] : string $code + The code of the Attribute object values to retrieve + Example : @attributes_vals = @{$misc_feature->get_all_attribute_values('name')}; + Description: Retrieves a list of Attribute object values for given code or all + of the associated Attributes. + Returntype : listref of values + Exceptions : + Caller : general + Status : Stable + +=cut + +sub get_all_attribute_values { + my $self = shift; + my $code = shift; + my @results = map { uc( $_->code() ) eq uc( $code ) ? $_->value : () } + @{$self->{'attributes'}}; + return \@results; +} + +=head2 get_scalar_attribute + + Arg [1] : string $code + The code of the Attribute object values to retrieve + Example : $vals = $misc_feature->get_scalar_attribute('name'); + Description: Retrieves a value for given code or all + of the associated Attributes. + Returntype : scalar value + Exceptions : + Caller : general + Status : Stable + + +=cut + + +sub get_scalar_attribute { + my $self = shift; + my $code = shift; + my @results = grep { uc( $_->code() ) eq uc( $code )} @{$self->{'attributes'}}; + return @results ? $results[0]->value() : ''; +} + +sub get_first_scalar_attribute { + my $self = shift; + foreach my $code ( @_ ) { + my @results = grep { uc( $_->code() ) eq uc( $code )} @{$self->{'attributes'}}; + return $results[0]->value() if @results; + } + return ''; +} +=head2 display_id + + Arg [1] : none + Example : print $kb->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For misc_features this is the first + name or synonym attribute or '' if neither are defined. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + my ($attrib) = @{$self->get_all_Attributes('name')}; + ($attrib) = @{$self->get_all_Attributes('synonym')} if(!$attrib); + if( defined $attrib ) { + return $attrib->value(); + } else { + return ''; + } +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/MiscSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/MiscSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,187 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::MiscSet - This is a set representing a classification of +a group of miscellaneuos features. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::MiscSet; + + my $misc_set = Bio::EnsEMBL::MiscSet->new( + 1234, $adaptor, 'tilepath', + 'Assembly Tiling Path', + 'The tiling path of clones', 1e6 + ); + + my $misc_feature->add_set($misc_set); + +=head1 DESCRIPTION + +MiscSets represent classsifications or groupings of MiscFeatures. +Features are classified into sets essentially to define what they are +and how they may be used. Generally MiscFeatures are retrieved on +the basis of their associated sets. See Bio::EnsEMBL::MiscFeature, +Bio::EnsEMBL::DBSQL::MiscFeatureAdaptor. + +Note that MiscSets and MiscFeatures were formerly known as MapSets and +MapFrags + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::MiscSet; + +use strict; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [1] : int $misc_set_id + The internal identifier for this misc set + Arg [2] : string $code + The unique code which identifies this set type + Arg [3] : string $name + The human readable name of this set + Arg [4] : string $desc + The description of this set + Arg [5] : int $max_len + The maximum length of features of this mapset + Example : $set = new Bio::EnsEMBL::MiscSet(1234, 'tilepath', + 'Assembly Tiling Path', + 'The tiling path of clones', + 1e6); + Description: Instantiates a Bio::EnsEMBL::MiscSet + Returntype : Bio::EnsEMBL::MiscSet + Exceptions : none + Caller : MiscFeatureAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my($code, $name, $desc, $max_len) = + rearrange([qw(CODE NAME DESCRIPTION LONGEST_FEATURE)], @_); + + $self->{'code'} = $code; + $self->{'name'} = $name; + $self->{'description'} = $desc; + $self->{'longest_feature'} = $max_len; + + return $self; +} + +=head2 code + + Arg [1] : string $newval (optional) + The new value to set the code attribute to + Example : $code = $obj->code() + Description: Getter/Setter for the code attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub code{ + my $self = shift; + $self->{'code'} = shift if(@_); + return $self->{'code'}; +} + + +=head2 name + + Arg [1] : string $newval (optional) + The new value to set the code attribute to + Example : $name = $obj->name() + Description: Getter/Setter for the name attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + +=head2 description + + Arg [1] : string $newval (optional) + The new value to set the description attribute to + Example : $description = $obj->description() + Description: Getter/Setter for the description attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description{ + my $self = shift; + $self->{'description'} = shift if(@_); + return $self->{'description'}; +} + + +=head2 longest_feature + + Arg [1] : int $newval (optional) + The new value to set the longest_feature attribute to + Example : $longest_feature = $obj->longest_feature() + Description: Getter/Setter for the longest_feature attribute + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub longest_feature{ + my $self = shift; + $self->{'longest_feature'} = shift if(@_); + return $self->{'longest_feature'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/OntologyTerm.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/OntologyTerm.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,347 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::OntologyTerm + +=head1 DESCRIPTION + +An ontology term object, (most often) created by +Bio::EnsEMBL::DBSQL::GOTermAdaptor and used in querying for +transcripts, genes, and translations using the relevant adaptors and +methods. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::OntologyTerm; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +use base qw( Bio::EnsEMBL::Storable ); + +=head2 new + + Arg [-ACCESSION] : String + The accession of the ontology term. + + Arg [-ONTOLOGY] : String + The ontology that the term belongs to. + + Arg [-NAMESPACE] : String + The namespace of the ontology term. + + Arg [-NAME] : String + The name of the ontology term. + + Arg [-SUBSETS] : (optional) Listref of strings + The subsets within the ontology to which this + term belongs. + + Arg [-DEFINITION] : (optional) String + The definition of the ontology term. + + Arg [-SYNONYMS] : (optional) Listref of strings + The synonyms of this term. + + Arg : Further arguments required for parent class + Bio::EnsEMBL::Storable. + + Description : Creates an ontology term object. + + Example : + + my $term = Bio::EnsEMBL::OntologyTerm->new( + '-accession' => 'GO:0021785', + '-ontology' => 'GO', + '-namespace' => 'biological_process', + '-name' => 'branchiomotor neuron axon guidance', + '-definition' => 'The process in which a branchiomotor ' + . 'neuron growth cone is directed to a specific target site. ' + . 'Branchiomotor neurons are located in the hindbrain and ' + . 'innervate branchial arch-derived muscles that control jaw ' + . 'movements, facial expression, the larynx, and the pharynx.', + '-synonyms' => [ 'BMN axon guidance', + 'branchial motor axon guidance', + 'special visceral motor neuron axon guidance' ] + + # ... other arguments required by Bio::EnsEMBL::Storable. + ); + + Return type : Bio::EnsEMBL::OntologyTerm + +=cut + +sub new { + my $proto = shift(@_); + + my $this = $proto->SUPER::new(@_); + + my ( $accession, $ontology, $namespace, $name, $definition, $subsets ) + = rearrange( [ 'ACCESSION', 'ONTOLOGY', 'NAMESPACE', 'NAME', + 'DEFINITION', 'SUBSETS' ], + @_ ); + + $this->{'accession'} = $accession; + $this->{'ontology'} = $ontology; + $this->{'namespace'} = $namespace; + $this->{'name'} = $name; + $this->{'definition'} = $definition; + $this->{'subsets'} = [ @{$subsets} ]; + + $this->{'child_terms_fetched'} = 0; + $this->{'parent_terms_fetched'} = 0; + + return $this; +} + +=head2 accession + + Arg : None + + Description : Returns the accession for the ontology term in question. + + Example : my $accession = $term->accession(); + + Return type : String + +=cut + +sub accession { + my ($this) = @_; + return $this->{'accession'}; +} + +=head2 ontology + + Arg : None + + Description : Returns the ontology for the ontology term in question. + + Example : my $ontology = $term->ontology(); + + Return type : String + +=cut + +sub ontology { + my ($this) = @_; + return $this->{'ontology'}; +} + +=head2 namespace + + Arg : None + + Description : Returns the namespace for the ontology term in question. + + Example : my $acc = $term->namespace(); + + Return type : String + +=cut + +sub namespace { + my ($this) = @_; + return $this->{'namespace'}; +} + +=head2 name + + Arg : None + + Description : Returns the name for the ontology term in question. + + Example : my $name = $term->name(); + + Return type : String + +=cut + +sub name { + my ($this) = @_; + return $this->{'name'}; +} + +=head2 definition + + Arg : None + + Description : Returns the definition for the ontology term in question. + + Example : my $definition = $term->definition(); + + Return type : String + +=cut + +sub definition { + my ($this) = @_; + return $this->{'definition'}; +} + +=head2 synonyms + + Arg : None + + Description : Returns the list of synonyms defined for this term + (if any). + + Example : my @synonyms = @{ $term->synonyms() }; + + Return type : Listref of strings + +=cut + +sub synonyms { + my ($this) = @_; + + if ( !exists( $this->{'synonyms'} ) ) { + $this->{'synonyms'} = + $this->adaptor()->_fetch_synonyms_by_dbID( $this->dbID() ); + } + + return $this->{'synonyms'}; +} + +=head2 subsets + + Arg : None + + Description : Returns a list of subsets that this term is part + of. The list might be empty. + + Example : my @subsets = @{ $term->subsets() }; + + Return type : listref of strings + +=cut + +sub subsets { + my ($this) = @_; + return $this->{'subsets'}; +} + +=head2 children + + Arg : (optional) List of strings + The type of relations to retrieve children for. + + Description : Returns the children terms of this ontology term. + + Example : my @children = + @{ $term->children( 'is_a', 'part_of' ) }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub children { + my ( $this, @relations ) = @_; + + my @terms = @{ $this->adaptor()->fetch_all_by_parent_term($this) }; + + if (@relations) { + @terms = (); + foreach my $relation (@relations) { + if ( exists( $this->{'children'}{$relation} ) ) { + push( @terms, @{ $this->{'children'}{$relation} } ); + } + } + } + + return \@terms; +} + +=head2 descendants + + Arg : None + + Description : Returns the complete set of 'is_a' and 'part_of' + descendant terms of this ontology term, down to + and including any leaf terms. + + Example : my @descendants = @{ $term->descendants() }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub descendants { + my ($this) = @_; + return $this->adaptor()->fetch_all_by_ancestor_term($this); +} + +=head2 parents + + Arg : (optional) List of strings + The type of relations to retrieve parents for. + + Description : Returns the parent terms of this ontology term. + + Example : my @parents = + @{ $term->parents( 'is_a', 'part_of' ) }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub parents { + my ( $this, @relations ) = @_; + + my @terms = @{ $this->adaptor()->fetch_all_by_child_term($this) }; + + if (@relations) { + @terms = (); + foreach my $relation (@relations) { + if ( exists( $this->{'parents'}{$relation} ) ) { + push( @terms, @{ $this->{'parents'}{$relation} } ); + } + } + } + + return \@terms; +} + +=head2 ancestors + + Arg : None + + Description : Returns the complete set of 'is_a' and 'part_of' + ancestor terms of this ontology term, up to and + including the root term. + + Example : my @ancestors = @{ $term->ancestors() }; + + Return type : listref of Bio::EnsEMBL::OntologyTerm + +=cut + +sub ancestors { + my ($this) = @_; + return $this->adaptor()->fetch_all_by_descendant_term($this); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/OntologyXref.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/OntologyXref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,179 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::OntologyXref + +=head1 DESCRIPTION + +This class extends the DBEntry in order to associate Evidence Tags +to the relationship between EnsEMBL objects and ontology accessions +(primarily GO accessions). + +The relationship to GO that is stored in the database is actually +derived through the relationship of EnsEMBL peptides to SwissProt +peptides, i.e. the relationship is derived like this: + + ENSP -> SWISSPROT -> GO + +And the evidence tag describes the relationship between the SwissProt +Peptide and the GO entry. + +In reality, however, we store this in the database like this: + + ENSP -> SWISSPROT + ENSP -> GO + +and the evidence tag hangs off of the relationship between the ENSP and +the GO identifier. Some ENSPs are associated with multiple closely +related Swissprot entries which may both be associated with the same GO +identifier but with different evidence tags. For this reason a single +'OntologyXref' can have multiple evidence tags. + +=head1 SYNOPSIS + + my $ontology_xref = Bio::EnsEMBL::OntologyXref->new(); + $ontology_xref->add_linkage_type('IEA'); + + foreach my $evtag ( @{ $ontology_xref->get_all_linkage_types() } ) { + print "$evtag\n"; + } + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::OntologyXref; + +use strict; + +use base qw( Bio::EnsEMBL::DBEntry ); + +=head2 add_linkage_type + + Arg [1] : string $value + allowed values: + 'IC', 'IDA', 'IEA', 'IEP', 'IGI', 'IMP', 'IPI', + 'ISS', NAS', 'ND', 'TAS', 'NR', 'RCA' + Arg [2] : (optional) Bio::EnsEMBL::DBEntry $source + Example : $ontology_xref->add_linkage_type('IGI'); + Description: Associates a linkage type and source DBEntry with + this ontology_xref + Returntype : integer; number of linkages + Exceptions : thrown if $linkage_type argument not supplied or + the optional DBEntry is not a DBEntry object. + Caller : DBEntryAdaptor + Status : Experimantal + +=cut + +sub add_linkage_type { + my ( $self, $lt, $source_dbentry ) = @_; + + if ( !defined($lt) ) { + $self->throw("linkage type argument required"); + } + + if ( defined($source_dbentry) + && !$source_dbentry->isa('Bio::EnsEMBL::DBEntry') ) + { + $self->throw("source_dbentry must be a Bio::EnsEMBL::DBEntry"); + } + + $self->{'linkage_types'} ||= []; + + push @{ $self->{'linkage_types'} }, + [ $lt, ( $source_dbentry || () ) ]; +} + + +=head2 get_all_linkage_info + + Arg [1] : none + Example : + + foreach ( @{ $ontology_xref->get_all_linkage_info() } ) { + print "evidence: $_->[0] via $_->[1]->display_id"; + } + + Description: Retrieves a list of evidence-tag/source-DBEntry pairs + associated with this ontology_xref + Returntype : listref of listrefs + Exceptions : none + Caller : geneview? general. + Status : Experimental + +=cut + +sub get_all_linkage_info { + my ($self) = @_; + + return $self->{'linkage_types'} || []; +} + + +=head2 get_all_linkage_types + + Arg [1] : none + Example : + + print( join( ' ', @{ $ontology_xref->get_all_linkage_types() } ), + "\n" ); + + Description: Retrieves a unique list of evidence tags associated with + this ontology_xref + Returntype : none + Exceptions : none + Caller : geneview? general + Status : Stable + +=cut + +sub get_all_linkage_types { + my ($self) = @_; + + my %seen; + return [ grep { !$seen{$_}++ } + map { $_->[0] } @{ $self->{'linkage_types'} } ]; + + #return [ map{ $_->[0]} @{ $self->{'linkage_types'} || [] } ]; +} + + +=head2 flush_linkage_types + + Arg [1] : none + Example : $ontology_xref->flush_linkage_types(); + Description: Removes any associated evidence tags + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flush_linkage_types { + my ($self) = @_; + + $self->{'linkage_types'} = []; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Operon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Operon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,367 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Operon - Object representing an operon + +=head1 SYNOPSIS + + my $operon = Bio::EnsEMBL::Operon->new( + -START => 123, + -END => 1045, + -STRAND => 1, + -SLICE => $slice, + -DISPLAY_LABEL => $name + ); + + # print operon information + print("operon start:end:strand is " + . join( ":", map { $operon->$_ } qw(start end strand) ) + . "\n" ); + +=head1 DESCRIPTION + +A representation of an Operon within the Ensembl system. +An operon is a collection of one or more polycistronic transcripts which contain one or more genes. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Operon; + +use strict; +use warnings; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + +=head2 new + + Arg [-START] : + int - start postion of the operon + Arg [-END] : + int - end position of the operon + Arg [-STRAND] : + int - 1,-1 the strand the operon is on + Arg [-SLICE] : + Bio::EnsEMBL::Slice - the slice the operon is on + Arg [-STABLE_ID] : + string - the stable identifier of this operon + Arg [-VERSION] : + int - the version of the stable identifier of this operon + Arg [-DISPLAY_LABEL]: + A name/label for this operon + Arg [-CREATED_DATE]: + string - the date the operon was created + Arg [-MODIFIED_DATE]: + string - the date the operon was last modified + + Example : $gene = Bio::EnsEMBL::Operon->new(...); + Description: Creates a new operon object + Returntype : Bio::EnsEMBL::Operon + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + my ( $stable_id, $version, $created_date, $modified_date,$display_label) = + rearrange( [ 'STABLE_ID', 'VERSION', + 'CREATED_DATE', 'MODIFIED_DATE', + 'DISPLAY_LABEL' ], + @_ ); + + $self->stable_id($stable_id); + $self->version($version); + $self->{'created_date'} = $created_date; + $self->{'modified_date'} = $modified_date; + $self->display_label($display_label); + + return $self; +} + +=head2 created_date + + Arg [1] : (optional) String - created date to set (as a UNIX time int) + Example : $gene->created_date('1141948800'); + Description: Getter/setter for attribute created_date + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created_date { + my $self = shift; + $self->{'created_date'} = shift if ( @_ ); + return $self->{'created_date'}; +} + + +=head2 modified_date + + Arg [1] : (optional) String - modified date to set (as a UNIX time int) + Example : $gene->modified_date('1141948800'); + Description: Getter/setter for attribute modified_date + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub modified_date { + my $self = shift; + $self->{'modified_date'} = shift if ( @_ ); + return $self->{'modified_date'}; +} + + +=head2 display_label + + Arg [1] : (optional) String - the name/label to set + Example : $operon->name('accBCD'); + Description: Getter/setter for attribute name. + Returntype : String or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_label { + my $self = shift; + $self->{'display_label'} = shift if (@_); + return $self->{'display_label'}; +} + +=head2 stable_id + + Arg [1] : (optional) String - the stable ID to set + Example : $operon->stable_id("accR2"); + Description: Getter/setter for stable id for this operon. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if (@_); + return $self->{'stable_id'}; +} + +=head2 version + + Arg [1] : (optional) Int - the stable ID version to set + Example : $operon->version(1); + Description: Getter/setter for stable id version for this operon. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut +sub version { + my $self = shift; + $self->{'version'} = shift if(@_); + return $self->{'version'}; +} + +=head2 get_all_OperonTranscripts + + Example : my $ots = $operon->get_all_OperonTranscripts(); + Description: Retrieve all operon transcripts belonging to this operon + Returntype : Arrayref of Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut +sub get_all_OperonTranscripts { + my $self = shift; + if ( !exists $self->{'_operon_transcript_array'} ) { + if ( defined $self->adaptor() ) { + my $ta = $self->adaptor()->db()->get_OperonTranscriptAdaptor(); + my $transcripts = $ta->fetch_all_by_Operon($self); + $self->{'_operon_transcript_array'} = $transcripts; + } + } + return $self->{'_operon_transcript_array'}; +} + +=head2 add_OperonTranscript + + Arg [1] : Bio::EnsEMBL::OperonTranscript - operon transcript to attach to this operon + Example : $operon->add_OperonTranscript($ot); + Description: Attach a polycistronic operon transcript to this operon + Exceptions : if argument is not Bio::EnsEMBL::OperonTranscript + Caller : general + Status : Stable + +=cut +sub add_OperonTranscript { + my ( $self, $trans ) = @_; + + assert_ref($trans,"Bio::EnsEMBL::OperonTranscript"); + + $self->{'_operon_transcript_array'} ||= []; + push( @{ $self->{'_operon_transcript_array'} }, $trans ); + + #$self->recalculate_coordinates(); + return; +} + +=head2 add_DBEntry + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + The dbEntry to be added + Example : my $dbe = Bio::EnsEMBL::DBEntery->new(...); + $operon->add_DBEntry($dbe); + Description: Associates a DBEntry with this operon. Note that adding DBEntries + will prevent future lazy-loading of DBEntries for this operon + (see get_all_DBEntries). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_DBEntry { + my $self = shift; + my $dbe = shift; + + unless($dbe && ref($dbe) && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw('Expected DBEntry argument'); + } + + $self->{'dbentries'} ||= []; + push @{$self->{'dbentries'}}, $dbe; +} + + +=head2 get_all_Attributes + + Arg [1] : (optional) String $attrib_code + The code of the attribute type to retrieve values for + Example : my ($author) = @{ $operon->get_all_Attributes('author') }; + my @operon_attributes = @{ $operon->get_all_Attributes }; + Description: Gets a list of Attributes of this operon. + Optionally just get Attributes for given code. + Returntype : Listref of Bio::EnsEMBL::Attribute + Exceptions : warning if gene does not have attached adaptor and attempts lazy + load. + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + if ( !exists $self->{'attributes'} ) { + if ( !$self->adaptor() ) { + return []; + } + + my $attribute_adaptor = $self->adaptor->db->get_AttributeAdaptor(); + $self->{'attributes'} = $attribute_adaptor->fetch_all_by_Operon($self); + } + + if ( defined $attrib_code ) { + my @results = + grep { uc( $_->code() ) eq uc($attrib_code) } + @{ $self->{'attributes'} }; + return \@results; + } else { + return $self->{'attributes'}; + } +} + +=head2 get_all_DBEntries + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @dbentries = @{ $gene->get_all_DBEntries() }; + + Description: Retrieves DBEntries (xrefs) for this operon. This does + *not* include DBEntries that are associated with the + transcripts and corresponding translations of this + gene (see get_all_DBLinks()). + + This method will attempt to lazy-load DBEntries + from a database if an adaptor is available and no + DBEntries are present on the gene (i.e. they have not + already been added or loaded). + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : get_all_DBLinks, OperonAdaptor::store + Status : Stable + +=cut + +sub get_all_DBEntries { + my ( $self, $db_name_exp, $ex_db_type ) = @_; + + my $cache_name = 'dbentries'; + + if ( defined($db_name_exp) ) { + $cache_name .= $db_name_exp; + } + + if ( defined($ex_db_type) ) { + $cache_name .= $ex_db_type; + } + + # if not cached, retrieve all of the xrefs for this gene + if ( !defined( $self->{$cache_name} ) && defined( $self->adaptor() ) ) { + $self->{$cache_name} = + $self->adaptor()->db()->get_DBEntryAdaptor() + ->fetch_all_by_Operon( $self, $db_name_exp, $ex_db_type ); + } + + $self->{$cache_name} ||= []; + + return $self->{$cache_name}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/OperonTranscript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/OperonTranscript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,375 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::OperonTranscript - Object representing a polycistronic transcript that is part of an operon + +=head1 SYNOPSIS + +my $operon_transcript = + Bio::EnsEMBL::OperonTranscript->new( -START => $start, + -END => $end, + -STRAND => $strand, + -SLICE => $slice ); +$operon->add_OperonTranscript($operon_transcript); + +=head1 DESCRIPTION + +A representation of a polycistronic transcript from an operon within the Ensembl system. An operon is a collection of one or more polycistronic transcripts, which contain one or more genes. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::OperonTranscript; + +use strict; +use warnings; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + +=head2 new + + Arg [-START] : + int - start postion of the operon transcript + Arg [-END] : + int - end position of the operon transcript + Arg [-STRAND] : + int - 1,-1 tehe strand the operon transcript is on + Arg [-SLICE] : + Bio::EnsEMBL::Slice - the slice the operon transcript is on + Arg [-STABLE_ID] : + string - the stable identifier of this operon transcript + Arg [-VERSION] : + int - the version of the stable identifier of this operon transcript + Arg [-CREATED_DATE]: + string - the date the operon transcript was created + Arg [-MODIFIED_DATE]: + string - the date the operon transcript was last modified + + Example : $gene = Bio::EnsEMBL::OperonTranscript->new(...); + Description: Creates a new operon transcript object + Returntype : Bio::EnsEMBL::OperonTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + my ( $stable_id, $version, $created_date, $modified_date, $display_label ) = + rearrange( [ 'STABLE_ID', 'VERSION', + 'CREATED_DATE', 'MODIFIED_DATE', 'DISPLAY_LABEL' ], + @_ ); + + $self->stable_id($stable_id); + $self->version($version); + $self->{'created_date'} = $created_date; + $self->{'modified_date'} = $modified_date; + $self->display_label($display_label); + return $self; +} + +=head2 created_date + + Arg [1] : (optional) String - created date to set (as a UNIX time int) + Example : $gene->created_date('1141948800'); + Description: Getter/setter for attribute created_date + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created_date { + my $self = shift; + $self->{'created_date'} = shift if ( @_ ); + return $self->{'created_date'}; +} + + +=head2 modified_date + + Arg [1] : (optional) String - modified date to set (as a UNIX time int) + Example : $gene->modified_date('1141948800'); + Description: Getter/setter for attribute modified_date + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub modified_date { + my $self = shift; + $self->{'modified_date'} = shift if ( @_ ); + return $self->{'modified_date'}; +} + + +=head2 display_label + + Arg [1] : (optional) String - the name/label to set + Example : $operon->name('accBCD'); + Description: Getter/setter for attribute name. + Returntype : String or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_label { + my $self = shift; + $self->{'display_label'} = shift if (@_); + return $self->{'display_label'}; +} + +=head2 stable_id + + Arg [1] : (optional) String - the stable ID to set + Example : $operon->stable_id("accR2A"); + Description: Getter/setter for stable id for this operon transcript. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if (@_); + return $self->{'stable_id'}; +} + +=head2 version + + Arg [1] : (optional) Int - the stable ID version to set + Example : $operon->version(1); + Description: Getter/setter for stable id version for this operon transcript. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut +sub version { + my $self = shift; + $self->{'version'} = shift if(@_); + return $self->{'version'}; +} + +=head2 operon + + Example : $operon = $ot->operon(); + Description: getter for the operon to which this transcript belongs + Returntype : Bio::EnsEMBL::Operon + Exceptions : none + Caller : general + Status : Stable + +=cut +sub operon { + my $self = shift; + if ( !exists $self->{'operon'} ) { + if ( defined $self->adaptor() ) { + my $ta = $self->adaptor()->db()->get_OperonAdaptor(); + my $operon = $ta->fetch_by_operon_transcript($self); + $self->{'operon'} = $operon; + } + } + return $self->{'operon'}; +} + +=head2 get_all_Genes + + Example : $genes = $ot->get_all_Genes(); + Description: get all the genes that are attached to this operon transcript + Returntype : Arrayref of Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut +sub get_all_Genes { + my $self = shift; + if(! defined $self->{_gene_array}) { + if(defined $self->dbID() && defined $self->adaptor()) { + my $ta = $self->adaptor()->db()->get_OperonTranscriptAdaptor(); + my $transcripts = $ta->fetch_genes_by_operon_transcript($self); + $self->{_gene_array} = $transcripts; + } + else { + $self->{_gene_array} = []; + } + } + return $self->{_gene_array}; +} +=head2 add_gene + + Arg [1] : Bio::EnsEMBL::Gene - gene to attach to this polycistronic transcript + Example : $operon->add_gene($gene); + Description: Attach a gene to this polycistronic transcript + Exceptions : if argument is not Bio::EnsEMBL::Gene + Caller : general + Status : Stable + +=cut +sub add_gene { + my ($self,$gene) = @_; + assert_ref($gene,'Bio::EnsEMBL::Gene'); + push @{$self->get_all_Genes()},$gene; + return; +} + +=head2 add_DBEntry + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + The dbEntry to be added + Example : my $dbe = Bio::EnsEMBL::DBEntery->new(...); + $operon->add_DBEntry($dbe); + Description: Associates a DBEntry with this operon. Note that adding DBEntries + will prevent future lazy-loading of DBEntries for this operon + (see get_all_DBEntries). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_DBEntry { + my $self = shift; + my $dbe = shift; + + unless($dbe && ref($dbe) && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw('Expected DBEntry argument'); + } + + $self->{'dbentries'} ||= []; + push @{$self->{'dbentries'}}, $dbe; +} + + +=head2 get_all_Attributes + + Arg [1] : (optional) String $attrib_code + The code of the attribute type to retrieve values for + Example : my ($author) = @{ $ot->get_all_Attributes('author') }; + my @ot_attributes = @{ $ot->get_all_Attributes }; + Description: Gets a list of Attributes of this operon transcript. + Optionally just get Attributes for given code. + Returntype : Listref of Bio::EnsEMBL::Attribute + Exceptions : warning if gene does not have attached adaptor and attempts lazy + load. + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + if ( !exists $self->{'attributes'} ) { + if ( !$self->adaptor() ) { + return []; + } + + my $attribute_adaptor = $self->adaptor->db->get_AttributeAdaptor(); + $self->{'attributes'} = $attribute_adaptor->fetch_all_by_OperonTranscript($self); + } + + if ( defined $attrib_code ) { + my @results = + grep { uc( $_->code() ) eq uc($attrib_code) } + @{ $self->{'attributes'} }; + return \@results; + } else { + return $self->{'attributes'}; + } +} + +=head2 get_all_DBEntries + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @dbentries = @{ $gene->get_all_DBEntries() }; + + Description: Retrieves DBEntries (xrefs) for this operon transcript. This does + *not* include DBEntries that are associated with the + transcripts and corresponding translations of this + gene (see get_all_DBLinks()). + + This method will attempt to lazy-load DBEntries + from a database if an adaptor is available and no + DBEntries are present on the gene (i.e. they have not + already been added or loaded). + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : get_all_DBLinks, OperontTranscriptAdaptor::store + Status : Stable + +=cut + +sub get_all_DBEntries { + my ( $self, $db_name_exp, $ex_db_type ) = @_; + + my $cache_name = 'dbentries'; + + if ( defined($db_name_exp) ) { + $cache_name .= $db_name_exp; + } + + if ( defined($ex_db_type) ) { + $cache_name .= $ex_db_type; + } + + # if not cached, retrieve all of the xrefs for this gene + if ( !defined( $self->{$cache_name} ) && defined( $self->adaptor() ) ) { + $self->{$cache_name} = + $self->adaptor()->db()->get_DBEntryAdaptor() + ->fetch_all_by_Operon( $self->operon(), $db_name_exp, $ex_db_type ); + } + + $self->{$cache_name} ||= []; + + return $self->{$cache_name}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/PaddedSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/PaddedSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,228 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::PaddedSlice + +=head1 DESCRIPTION + +Used when dumping Slices which represet a portion of the sequence region +they map to e.g. the first section of human Y. The code will return N +as sequence if an attempt is made to retrieve sequence not covered by the +Slice given. This makes the code very memory efficient if sequence dumping +is carried out using C calls. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::PaddedSlice; + +use Bio::EnsEMBL::Utils::Argument qw/rearrange/; +use Bio::EnsEMBL::Utils::Scalar qw/assert_ref assert_strand/; +use base qw/Bio::EnsEMBL::Utils::Proxy/; + +=head2 new() + + Arg [SLICE] : The Slice to proxy + Example : my $newobj = Bio::EnsEMBL::PaddedSlice->new($myobj); + Description : Provides a new instance of a padded slice + Returntype : Bio::EnsEMBL::PaddedSlice + Exceptions : None + Caller : public + Status : - + +=cut + +sub new { + my ($class, @args) = @_; + my ($slice) = rearrange([qw/slice/], @args); + return $class->SUPER::new($slice); +} + +=head2 start() + + Example : $slice->start(); + Description : Always returns 1 since all padded slices start at 1 + Returntype : Int + Exceptions : None + Caller : public + Status : - + +=cut + +sub start { + my ($self) = @_; + return 1; +} + +=head2 end() + + Example : $slice->end(); + Description : Always returns the backing slice sequence region length + Returntype : Int + Exceptions : None + Caller : public + Status : - + +=cut + +sub end { + my ($self) = @_; + return $self->seq_region_length(); +} + +=head2 length() + + Example : $slice->length(); + Description : Delegates to C + Returntype : Int + Exceptions : None + Caller : public + Status : - + +=cut + +sub length { + my ($self) = @_; + return $self->end(); +} + +=head2 seq() + + Example : my $seq = $slice->seq() + Description : Returns the entire sequence of the backing slice but padded + with N's at the beginning and the end of the slice where + applicable + Returntype : Scalar string + Exceptions : None + Caller : public + Status : - + +=cut + +sub seq { + my ($self) = @_; + my $parent_slice = $self->__proxy(); + my $pad_start = 'N' x ( $parent_slice->start() - 1 ); + my $pad_end = 'N' x ( $parent_slice->seq_region_length() - $parent_slice->end() ); + my $seq = $parent_slice->seq(); + return $pad_start . $seq . $pad_end; +} + +=head2 subseq() + + Arg [1] : Int; start position of the subslice + Arg [2] : Int; end position of the subslice + Arg [3] : Int; strand of the subslice + Example : my $subseq = $slice->subseq(1, 1_000_000); + Description : Returns a portion of the sequence padded with N's if required + Returntype : Scalar string + Exceptions : None + Caller : public + Status : - + +=cut + +sub subseq { + my ( $self, $start, $end, $strand ) = @_; + + if ( $end+1 < $start ) { + throw("End coord + 1 is less than start coord"); + } + + return '' if( $start == $end + 1); + + $strand = 1 unless(defined $strand); + assert_strand($strand, 'strand'); + + my $parent_slice = $self->__proxy(); + + #Coords relative to the SeqRegion i.e. huge + my $parent_start = $parent_slice->start(); + my $parent_end = $parent_slice->end(); + + #Return if we were upstream of overlap + if($start < $parent_start && $end < $parent_start) { + return N x (( $end - $start )+1); + } + #Return if we were downstream of overlap + if($start > $parent_end && $end > $parent_end) { + return N x (( $end - $start )+1); + } + + my $prefix = ''; + my $suffix = ''; + my $subslice_start = ($start - $parent_start)+1; + my $subslice_end = ($end - $parent_start) + 1; + if($start < $parent_start) { + $prefix = N x ($parent_start - $start); + $subslice_start = 1; + } + if($end > $parent_end) { + $suffix = N x ($end - $parent_end); + $subslice_end = (($parent_end - $parent_start)+1); + } + + my $subseq = $parent_slice->subseq($subslice_start, $subslice_end, $strand); + + return $prefix . $subseq . $suffix; +} + +=head2 subseq() + + Arg [1] : Int; start position of the subslice + Arg [2] : Int; end position of the subslice + Arg [3] : Int; strand of the subslice + Example : my $subseq = $slice->subseq(1, 1_000_000); + Description : Returns a portion of the sequence padded with N's if required + Returntype : Scalar string + Exceptions : None + Caller : public + Status : - + +=cut + +sub sub_Slice { + die "Unsupported"; +} + + +=head2 __resolver() + + Description : Delegates all non-overriden actions onto the backing slice + Returntype : CodeRef + Exceptions : None + Caller : public + Status : - + +=cut + +sub __resolver { + my ($self, $package_name, $method) = @_; + return sub { + my ($local_self, @args) = @_; + return $local_self->__proxy()->$method(@args); + }; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/PepDnaAlignFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/PepDnaAlignFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,96 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::PepDnaAlignFeature - Ensembl specific pep-dna pairwise +alignment feature + +=head1 SYNOPSIS + + See BaseAlignFeature + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::PepDnaAlignFeature; + +use Bio::EnsEMBL::BaseAlignFeature; + +use vars qw(@ISA); +use strict; + +@ISA = qw( Bio::EnsEMBL::BaseAlignFeature ); + +=head2 transform + + Arg [1] : none + Example : none + Description: Overwrites Bio:EnsEMBL:Feature->transform as + to give error message + Status : Stable + +=cut + +sub transform { + my $self = shift; + + $self->throw( "PepDnaAlignFeatures cant be transformed as". + " they are not on EnsEMBL coord system" ); +} + +=head2 _hit_unit + + Arg [1] : none + Example : none + Description: PRIVATE implementation of abstract superclass method. Returns + 3 as the 'unit' used for the hit sequence. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::BaseAlignFeature + Status : Stable + + +=cut + +sub _hit_unit { + return 3; +} + +=head2 _query_unit + + Arg [1] : none + Example : none + Description: PRIVATE implementation of abstract superclass method. Returns + 1 as the 'unit' used for the query sequence. + Returntype : int + Exceptions : none + Caller : Bio::EnsEMBL::BaseAlignFeature + Status : Stable + + +=cut + +sub _query_unit { + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/Base.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/Base.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,244 @@ +package Bio::EnsEMBL::Pipeline::Base; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Hive::Process/; + +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Utils::IO qw/work_with_file/; +use Bio::EnsEMBL::Utils::Scalar qw/check_ref/; +use File::Find; +use File::Spec; +use File::Path qw/mkpath/; +use POSIX qw/strftime/; + +# Takes in a key, checks if the current $self->param() was an empty array +# and replaces it with the value from $self->param_defaults() +sub reset_empty_array_param { + my ($self, $key) = @_; + my $param_defaults = $self->param_defaults(); + my $current = $self->param($key); + my $replacement = $self->param_defaults()->{$key}; + if(check_ref($current, 'ARRAY') && check_ref($replacement, 'ARRAY')) { + if(! @{$current}) { + $self->fine('Restting param %s because the given array was empty', $key); + $self->param($key, $replacement); + } + } + return; +} + +=head2 get_Slices + + Arg[1] : String type of DB to use (defaults to core) + Arg[2] : Boolean should we filter the slices if it is human + Example : my $slices = $self->get_Slices('core', 1); + Description : Basic get_Slices() method to return all distinct slices + for a species but also optionally filters for the + first portion of Human Y which is a non-informative region + (composed solely of N's). The code will only filter for + GRCh37 forcing the developer to update the test for other + regions. + Returntype : ArrayRef[Bio::EnsEMBL::Slice] + Exceptions : Thrown if you are filtering Human but also are not on GRCh37 + +=cut + +sub get_Slices { + my ($self, $type, $filter_human) = @_; + my $dba = $self->get_DBAdaptor($type); + throw "Cannot get a DB adaptor" unless $dba; + + my $sa = $dba->get_SliceAdaptor(); + my @slices = @{$sa->fetch_all('toplevel', undef, 1, undef, undef)}; + + if($filter_human) { + my $production_name = $self->production_name(); + if($production_name eq 'homo_sapiens') { + my ($cs) = @{$dba->get_CoordSystem()->fetch_all()}; + my $expected = 'GRCh37'; + if($cs->version() ne $expected) { + throw sprintf(q{Cannot continue as %s's coordinate system %s is not the expected %s }, $production_name, $cs->version(), $expected); + } + @slices = grep { + if($_->seq_region_name() eq 'Y' && $_->end() < 2649521) { + $self->info('Filtering small Y slice'); + 0; + } + else { + 1; + } + } @slices; + } + } + + return [ sort { $a->length() <=> $b->length() } @slices ]; +} + +# Registry is loaded by Hive (see beekeeper_extra_cmdline_options() in conf) +sub get_DBAdaptor { + my ($self, $type) = @_; + my $species = $self->param('species'); + $type ||= 'core'; + return Bio::EnsEMBL::Registry->get_DBAdaptor($species, $type); +} + +sub cleanup_DBAdaptor { + my ($self, $type) = @_; + my $dba = $self->get_DBAdaptor($type); + $dba->clear_caches; + $dba->dbc->disconnect_if_idle; + return; +} + +sub get_dir { + my ($self, @extras) = @_; + my $base_dir = $self->param('base_path'); + my $dir = File::Spec->catdir($base_dir, @extras); + mkpath($dir); + return $dir; +} + +sub web_name { + my ($self) = @_; +# my $mc = $self->get_DBAdaptor()->get_MetaContainer(); +# my $name = $mc->single_value_by_key('species.url'); # change back + my $name = ucfirst($self->production_name()); + return $name; +} + +sub scientific_name { + my ($self) = @_; + my $dba = $self->get_DBAdaptor(); + my $mc = $dba->get_MetaContainer(); + my $name = $mc->get_scientific_name(); + $dba->dbc()->disconnect_if_idle(); + return $name; +} + +sub assembly { + my ($self) = @_; + my $dba = $self->get_DBAdaptor(); + return $dba->get_CoordSystemAdaptor()->fetch_all()->[0]->version(); +} + +sub production_name { + my ($self, $name) = @_; + my $dba; + if($name) { + $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core'); + } + else { + $dba = $self->get_DBAdaptor(); + } + my $mc = $dba->get_MetaContainer(); + my $prod = $mc->get_production_name(); + $dba->dbc()->disconnect_if_idle(); + return $prod; +} + +# Closes file handle, and deletes the file stub if no data was written to +# the file handle (using tell). We can also only close a file handle and unlink +# the data if it was open otherwise we just ignore it +# Returns success if we managed to delete the file + +sub tidy_file_handle { + my ($self, $fh, $path) = @_; + if($fh->opened()) { + my $unlink = ($fh->tell() == 0) ? 1 : 0; + $fh->close(); + if($unlink && -f $path) { + unlink($path); + return 1; + } + } + return 0; +} + +sub info { + my ($self, $msg, @params) = @_; + if ($self->debug() > 1) { + my $formatted_msg; + if(scalar(@params)) { + $formatted_msg = sprintf($msg, @params); + } + else { + $formatted_msg = $msg; + } + printf STDERR "INFO [%s]: %s %s\n", $self->_memory_consumption(), strftime('%c',localtime()), $formatted_msg; + } + return +} + +sub fine { + my ($self, $msg, @params) = @_; + if ($self->debug() > 2) { + my $formatted_msg; + if(scalar(@params)) { + $formatted_msg = sprintf($msg, @params); + } + else { + $formatted_msg = $msg; + } + printf STDERR "FINE [%s]: %s %s\n", $self->_memory_consumption(), strftime('%c',localtime()), $formatted_msg; + } + return +} + +sub _memory_consumption { + my ($self) = @_; + my $content = `ps -o rss $$ | grep -v RSS`; + return q{?MB} if $? >> 8 != 0; + $content =~ s/\s+//g; + my $mem = $content/1024; + return sprintf('%.2fMB', $mem); +} + +sub find_files { + my ($self, $dir, $boolean_callback) = @_; + $self->throw("Cannot find path $dir") unless -d $dir; + my @files; + find(sub { + my $path = $File::Find::name; + if($boolean_callback->($_)) { + push(@files, $path); + } + }, $dir); + return \@files; +} + +sub unlink_all_files { + my ($self, $dir) = @_; + $self->info('Removing files from the directory %s', $dir); + #Delete anything which is a file & not the current or higher directory + my $boolean_callback = sub { + return ( $_[0] =~ /^\.\.?$/) ? 0 : 1; + }; + my $files = $self->find_files($dir, $boolean_callback); + foreach my $file (@{$files}) { + $self->fine('Unlinking %s', $file); + unlink $file; + } + $self->info('Removed %d file(s)', scalar(@{$files})); + return; +} + +sub assert_executable { + my ($self, $exe) = @_; + if(! -x $exe) { + my $output = `which $exe 2>&1`; + chomp $output; + my $rc = $? >> 8; + if($rc != 0) { + my $possible_location = `locate -l 1 $exe 2>&1`; + my $loc_rc = $? >> 8; + if($loc_rc != 0) { + my $msg = 'Cannot find the executable "%s" after trying "which" and "locate -l 1". Please ensure it is on your PATH or use an absolute location and try again'; + $self->throw(sprintf($msg, $exe)); + } + } + } + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/ChecksumGenerator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/ChecksumGenerator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::ChecksumGenerator + +=head1 DESCRIPTION + +Creates a CHECKSUMS file in the given directory which is produced from running +the sum command over every file in the directory. This excludes the CHECKSUMS +file, parent directory or any hidden files. + +Allowed parameters are: + +=over 8 + +=item dir - The directory to generate checksums for + +=item gzip - If the resulting file should be gzipped. Defaults to false + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::ChecksumGenerator; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::Base/; + +use File::Spec; +use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/; + +sub param_defaults { + my ($self) = @_; + return { + gzip => 0 + }; +} + +sub fetch_input { + my ($self) = @_; + my $dir = $self->param('dir'); + $self->throw("No 'dir' parameter specified") unless $dir; + $self->throw("Dir $dir does not exist") unless -d $dir; + return; +} + +sub run { + my ($self) = @_; + my @checksums; + + my $dir = $self->param('dir'); + $self->info('Checksumming directory %s', $dir); + + opendir(my $dh, $dir) or die "Cannot open directory $dir"; + my @files = sort { $a cmp $b } readdir($dh); + closedir($dh) or die "Cannot close directory $dir"; + + foreach my $file (@files) { + next if $file =~ /^\./; #hidden file or up/current dir + next if $file =~ /^CHECKSUM/; + my $path = File::Spec->catfile($dir, $file); + my $checksum = $self->checksum($path); + push(@checksums, [$checksum, $file]) + } + + $self->param('checksums', \@checksums); + return; +} + +sub write_output { + my ($self) = @_; + my $dir = $self->param('dir'); + my $checksum = File::Spec->catfile($dir, 'CHECKSUMS'); + $checksum .= '.gz' if $self->param('gzip'); + if(-f $checksum) { + $self->info('Checksum file already exists. Removing'); + unlink $checksum; + } + + my @checksums = @{$self->param('checksums')}; + + return unless @checksums; + + my $writer = sub { + my ($fh) = @_; + foreach my $entry (@checksums) { + my $line = join(qq{\t}, @{$entry}); + print $fh $line; + print $fh "\n"; + } + return; + }; + my @params = ($checksum, 'w', $writer); + + + if($self->param('gzip')) { + gz_work_with_file(@params); + } + else { + work_with_file(@params); + } + + $self->permissions($checksum); + return; +} + +sub checksum { + my ($self, $path) = @_; + my $checksum = `sum $path`; + $checksum =~ s/\s* $path//xms; + chomp($checksum); + return $checksum; +} + +sub permissions { + my ($self, $file) = @_; + my $mode = 0666; + chmod($mode, $file) or $self->throw("Cannot perform the chmod to mode $mode for file $file"); + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/Base.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/Base.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,22 @@ +package Bio::EnsEMBL::Pipeline::FASTA::Base; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::Base/; + +use File::Spec; + +sub fasta_path { + my ( $self, @extras ) = @_; + return $self->get_dir('fasta', $self->param('species'), @extras); +} + +sub old_path { + my ($self, $species) = @_; + my $base = $self->param('ftp_dir'); + my $prod = $self->production_name($species); + my $release = $self->param('previous_release'); + my $dir = File::Spec->catdir($base, "release-$release", 'fasta', $prod, 'dna'); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,236 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer + +=head1 DESCRIPTION + +Creates 2bit file of the given GZipped file. The resulting index +is created under the parameter location I in blat/index. The filename +is prefixed with the port number of the blat server this file should be +run on. + +The module also performs filtering of non-reference sequence regions +and can filter the redundant Y chromosome piece for human (as 2bit does +not like repeated sequence region names). + +Allowed parameters are: + +=over 8 + +=item file - The file to index + +=item program - The location of the faToTwoBit program + +=item port_offset - Value to add onto the species_id from the website DB + to name the file correctly + +=item base_path - The base of the dumps + +=item index - The type of file to index; supported values are empty, + I, I or I. If specified we will look for this + string in the filename surrounded by '.' e.g. .dna. + +=back + +The registry should also have a DBAdaptor for the website schema +registered under the species B and the group B for species_id to +Blat port number. + +=cut + + +package Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; + +use File::Spec; +use File::stat; +use Bio::EnsEMBL::Utils::IO qw/work_with_file/; +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Registry; + +sub param_defaults { + my ($self) = @_; + return { + program => 'faToTwoBit', + port_offset => 30000, + 'index' => 'dna', #or dna_rm and dna_sm + }; +} + +sub fetch_input { + my ($self) = @_; + $self->assert_executable($self->param('program')); + $self->assert_executable('zcat'); + $self->assert_executable('gunzip'); + return; +} + +sub run { + my ($self) = @_; + if($self->run_indexing()) { + $self->SUPER::run(); + } + return; +} + +sub run_indexing { + my ($self) = @_; + my $index = $self->param('index'); + if($index) { + my $file = $self->param('file'); + return (index($file, ".${index}.") > -1) ? 1 : 0; + } + return 1; +} + +sub index_file { + my ($self, $file) = @_; + + my $target_file = $self->target_file(); + my $cmd = sprintf(q{%s %s %s}, + $self->param('program'), $file, $target_file); + + $self->info('About to run "%s"', $cmd); + my $output = `$cmd 2>&1`; + my $rc = $? >> 8; + throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; + unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; + + #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit + my $filesize = stat($target_file)->size(); + if($filesize <= 16) { + unlink $file; + my $msg = sprintf( + 'The file %s produced a 2bit file %d byte(s). Lower than 17 bytes therefore empty 2 bit file', + $file, $filesize + ); + $self->throw($msg); + } + + return; +} + +sub decompress { + my ($self) = @_; + + #If we have no non-reference seq regions then use normal decompress + if(! $self->has_non_refs()) { + return $self->SUPER::decompress(); + } + + #Filter for non-refs + my $source = $self->param('file'); + my $target_dir = $self->target_dir(); + my ($vol, $dir, $file) = File::Spec->splitpath($source); + $file =~ s/.gz$//; + my $target = File::Spec->catdir($target_dir, $file); + + my $allowed_regions = $self->allowed_regions(); + + $self->info('Decompressing %s to %s', $source, $target); + + open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target"; + work_with_file($target, 'w', sub { + my ($trg_fh) = @_; + my $transfer = 0; + while(my $line = <$src_fh>) { + #HEADER + if(index($line, '>') == 0) { + #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name + my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; + $transfer = ($allowed_regions->{$name}) ? 1 : 0; + if($transfer) { + $self->info('%s was an allowed Slice', $name); + } + else { + $self->info('%s will be skipped; not a reference Slice', $name); + } + } + print $trg_fh $line if $transfer; + } + }); + close($src_fh); + + return $target; +} + +sub allowed_regions { + my ($self) = @_; + my $filter_human = 1; + my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)}; + my %hash = map { $_->name() => 1 } @slices; + return \%hash; +} + +#Filename like 30061.Homo_sapiens.GRCh37.2bit +sub target_filename { + my ($self) = @_; + my $port = $self->blat_port(); + my $name = $self->web_name(); + my $assembly = $self->assembly(); + return join(q{.}, $port, $name, $assembly, '2bit'); +} + +sub target_file { + my ($self) = @_; + my $target_dir = $self->target_dir(); + my $target_filename = $self->target_filename(); + return File::Spec->catfile($target_dir, $target_filename); + return; +} + +sub target_dir { + my ($self) = @_; + return $self->get_dir('blat', $self->param('index')); +} + +sub blat_port { + my ($self) = @_; + my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'web'); + my $id = $dba->dbc()->sql_helper()->execute_single_result( + -SQL => 'select species_id from species where name =?', + -PARAMS => [$self->web_name()] + ); + return $id + $self->param('port_offset'); +} + +sub has_non_refs { + my ($self) = @_; + my $sql = <<'SQL'; +select count(*) +from attrib_type at +join seq_region_attrib sra using (attrib_type_id) +join seq_region sr using (seq_region_id) +join coord_system cs using (coord_system_id) +where cs.species_id =? +and at.code =? +SQL + my $dba = $self->get_DBAdaptor(); + return $dba->dbc()->sql_helper()->execute_single_result( + -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/ConcatFiles.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/ConcatFiles.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,166 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles + +=head1 DESCRIPTION + +Performs a find in the DNA dumps directory for the given species and then +concats files which match a specified name pattern. We only allow +two types of concats; DNA and RM DNA. The concat file is a series +of cat command calls from all other Gzipped FASTA dumps (allowed under +the GZip specification). + +Allowed parameters are: + +=over 8 + +=item release - Needed to build the target path + +=item species - Required to indicate which species we are working with + +=item data_type - The type of data to work with. Can be I, I or I + +=item base_path - The base of the dumps + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/; + +use File::Spec; +use File::stat; + +sub param_defaults { + my ($self) = @_; + return { + dna => { + regex => qr/.+\.dna\..+\.fa\.gz$/, + }, + dna_rm => { + regex => qr/.+\.dna_rm\..+\.fa\.gz$/, + }, + dna_sm => { + regex => qr/.+\.dna_sm\..+\.fa\.gz$/, + }, + }; +} + +sub fetch_input { + my ($self) = @_; + foreach my $key (qw/data_type species release base_path/) { + $self->throw("Cannot find the required parameter $key") unless $self->param($key); + } + return; +} + +# sticks ends of files together into one big file. +sub run { + my ($self) = @_; + + my @file_list = @{$self->get_dna_files()}; + my $count = scalar(@file_list); + my $running_total_size = 0; + + if($count) { + my $target_file = $self->target_file(); + $self->info("Concatting type %s with %d file(s) into %s", $self->param('data_type'), $count, $target_file); + + if(-f $target_file) { + $self->info("Target already exists. Removing"); + unlink $target_file or $self->throw("Could not remove $target_file: $!"); + } + + $self->info('Running concat'); + foreach my $file (@file_list) { + $self->fine('Processing %s', $file); + $running_total_size += stat($file)->size; + system("cat $file >> $target_file") + and $self->throw( sprintf('Cannot concat %s into %s. RC %d', $file, $target_file, ($?>>8))); + } + + $self->info("Catted files together"); + + my $catted_size = stat($target_file)->size; + + if($running_total_size != $catted_size) { + $self->throw(sprintf('The total size of the files catted together should be %d but was in fact %d. Failing as we expect the catted size to be the same', $running_total_size, $catted_size)); + } + + $self->param('target_file', $target_file); + } + else { + $self->throw("Cannot continue as we found no files to concat"); + } + return; +} + +sub write_output { + my ($self) = @_; + my $file = $self->param('target_file'); + if($file) { + $self->dataflow_output_id({ file => $file, species => $self->param('species') }, 1); + } + return; +} + +sub get_dna_files { + my ($self) = @_; + my $path = $self->fasta_path('dna'); + my $data_type = $self->param('data_type'); + my $regex_hash = $self->param($data_type); + if(! $regex_hash ) { + $self->throw("We do not have an entry for the data_type $data_type in our regex lookup hash. Edit this module"); + } + my $regex = $regex_hash->{regex}; + my $filter = sub { + my ($filename) = @_; + return ($filename =~ $regex && $filename !~ /\.toplevel\./) ? 1 : 0; + }; + my $files = $self->find_files($path, $filter); + return [ sort @{$files} ]; +} + + +sub target_file { + my ($self) = @_; + # File name format looks like: + # ......fa.gz + # e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz + my @name_bits; + push @name_bits, $self->web_name(); + push @name_bits, $self->assembly(); + push @name_bits, $self->param('release'); + push @name_bits, $self->param('data_type'); + push @name_bits, 'toplevel'; + push @name_bits, 'fa', 'gz'; + my $file_name = join( '.', @name_bits ); + my $dir = $self->fasta_path('dna'); + return File::Spec->catfile( $dir, $file_name ); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/CopyDNA.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/CopyDNA.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,117 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::CopyDNA + +=head1 DESCRIPTION + +Performs a find in the DNA dumps directory, for the given species, in the +previous release FTP dump directory. Any files matching the normal gzipped +fasta extension will be copied over to this release's directory. + +Previous release is defined as V; override this class if your +definition of the previous release is different. + +Allowed parameters are: + +=over 8 + +=item release - Needed to build the target path + +=item previous_release - Needed to build the source path + +=item ftp_dir - Current location of the FTP directory for the previous + release. Should be the root i.e. the level I is in + +=item species - Species to work with + +=item base_path - The base of the dumps; reused files will be copied to here + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::CopyDNA; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/; + +use File::Copy; +use File::Find; +use File::Spec; + +sub fetch_input { + my ($self) = @_; + my @required = qw/release ftp_dir species/; + foreach my $key (@required) { + $self->throw("Need to define a $key parameter") unless $self->param($key); + } + return; +} + +sub run { + my ($self) = @_; + + my $new_path = $self->new_path(); + #Remove all files from the new path + $self->unlink_all_files($new_path); + + my $files = $self->get_dna_files(); + foreach my $old_file (@{$files}) { + my $new_file = $self->new_filename($old_file); + $self->fine('copy %s %s', $old_file, $new_file); + copy($old_file, $new_file) or $self->throw("Cannot copy $old_file to $new_file: $!"); + } + + return; +} + +sub new_filename { + my ($self, $old_filename) = @_; + my ($old_volume, $old_dir, $old_file) = File::Spec->splitpath($old_filename); + my $old_release = $self->param('previous_release'); + my $release = $self->param('release'); + my $new_file = $old_file; + $new_file =~ s/\.$old_release\./.$release./; + my $new_path = $self->new_path(); + return File::Spec->catfile($new_path, $new_file); +} + +sub new_path { + my ($self) = @_; + return $self->fasta_path('dna'); +} + +sub get_dna_files { + my ($self) = @_; + my $old_path = $self->old_path(); + my $filter = sub { + my ($filename) = @_; + return ($filename =~ /\.fa\.gz$/ || $filename eq 'README') ? 1 : 0; + }; + my $files = $self->find_files($old_path, $filter); + return $files; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,976 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::DumpFile + +=head1 DESCRIPTION + +The main workhorse of the FASTA dumping pipeline. This module has two +functions + +=over 8 + +=item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats + +=item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included) + +=back + +The script is responsible for creating the filenames of these target +files, taking data from the database and the formatting of the FASTA +headers. It is also responsible for the creation of README files pertaining +to the type of dumps produced. The final files are all Gzipped at normal +levels of compression. + +B This code will remove any files already found in the target directory +on its first run as it assumes all data will be dumped in the one process. It +is selective of its directory meaning a rerun of DNA dumps will not cause +the protein/cdna files to be removed. + +Allowed parameters are: + +=over 8 + +=item species - The species to dump + +=item sequence_type_list - The data to dump. I, I and I are allowed + +=item release - A required parameter for the version of Ensembl we are dumping for + +=item db_types - Array reference of the database groups to use. Defaults to core + +=item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). Applied before skip_logic_names + +=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these) + + +=item base_path - The base of the dumps + +=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and + process when formatting FASTA files. Normally do not + touch + +=item allow_appending - If the same file name is generated we will + append into that file rather than overwriting + +=item overwrite_files - If the same file name is generated we will overwrite + the into that file rather than appending + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::DumpFile; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base); + +use File::Spec; +use IO::Compress::Gzip qw/gzip $GzipError/; +use IO::File; +use Bio::EnsEMBL::PaddedSlice; +use Bio::EnsEMBL::Utils::BiotypeMapper; +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Utils::Scalar qw/check_ref/; +use Bio::EnsEMBL::Utils::IO::FASTASerializer; +use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/; + +my $DNA_INDEXING_FLOW = 1; +my $PEPTIDE_INDEXING_FLOW = 2; +my $GENE_INDEXING_FLOW = 3; + +sub param_defaults { + my ($self) = @_; + return { + #user configurable + allow_appending => 1, + overwrite_files => 0, + + dna_chunk_size => 17000, + + skip_logic_names => [], + process_logic_names => [], + + #DON'T MESS + #used to track if we need to reopen a file in append mode or not + generated_files => {}, + remove_files_from_dir => {}, + dataflows => [] + }; +} + +sub fetch_input { + my ($self) = @_; + + my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') }; + $self->param('sequence_types', \%sequence_types); + + my $dba = $self->get_DBAdaptor(); + my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis'); + $self->param('analyses', $analyses); + + my $types = $self->param('db_types'); + $types = ['core'] unless $types; + $self->param('db_types', $types); + + my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')}; + $self->param('skip_logic', \%skip_logic_names); + $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')}; + my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')}; + $self->param('process_logic', \%process_logic_names); + $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')}; + + return; +} + +sub run { + my ($self) = @_; + my $types = $self->param('db_types'); + foreach my $type (@{$types}) { + my $dba = $self->get_DBAdaptor($type); + if(! $dba) { + $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type); + next; + } + $self->run_type($type); + } + return; +} + +sub write_output { + my ($self) = @_; + my $dataflows = $self->param('dataflows'); + foreach my $flow (@{$dataflows}) { + $self->dataflow_output_id(@{$flow}); + } + return; +} + +sub run_type { + my ($self, $type) = @_; + + my $species = $self->param('species'); + my $sequence_types = $self->param('sequence_types'); + + # dump file for each type on a per slice basis + # types are dna,cDNA, peptide, ncRNA + + #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the Stringified ref + my $dba = $self->get_DBAdaptor($type); + if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) { + $self->info( "Starting dna dump for " . $species ); + $self->_dump_dna($type); + $self->_create_README('dna'); + } + + if ( $sequence_types->{cdna} ) { #includes peptides whether you like it or not + $self->info( "Starting cdna dump for " . $species ); + my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type); + + $self->info( "Starting prediction transcript dumps for " . $species ); + my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type); + + $self->_create_README('cdna') if $transcripts || $pred_transcripts; + $self->_create_README('pep') if $peptide || $pred_proteins; + } + if ( $sequence_types->{ncrna} ) { + $self->info( "Starting ncRNA dump for " . $species ); + my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type); + $self->_create_README('ncrna') if $ncrna_transcripts; + } + + $self->cleanup_DBAdaptor($type); +} + +# Dump entire sequence, also dump data into chromosome files as appropriate +sub _dump_dna { + my ($self,$type) = @_; + + my @chromosomes; + my @non_chromosomes; + my $filter_human = 1; + foreach my $s (@{$self->get_Slices($type, $filter_human)}) { + my $chr = $s->is_chromosome(); + push(@chromosomes, $s) if $chr; + push(@non_chromosomes, $s) if ! $chr; + } + + ############ NON CHROMOSOME WORK + $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes)); + if(@non_chromosomes) { + my ( $non_specific_file, $non_specific_fh, $other_serializer ) = + $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' ); + my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) = + $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' ); + foreach my $s (@non_chromosomes) { + $self->_dump_slice($s, $other_serializer, $other_rm_serializer); + } + #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call + $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1); + my ($hard_mask_fh, $hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh); + + $self->tidy_file_handle( $non_specific_fh, $non_specific_file ); + $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file ); + $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file); + $self->info('Dumped non-chromosomes'); + } + + ############ CHROMOSOME WORK + $self->info('Processing %d chromosome(s)', scalar(@chromosomes)); + foreach my $s (@chromosomes) { + my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) = + $self->_generate_fasta_serializer( 'dna', 'chromosome', + $s->seq_region_name(), undef); + # repeat masked data too + my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) = + $self->_generate_fasta_serializer( 'dna_sm', 'chromosome', + $s->seq_region_name(), undef); + + $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer); + + #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call + $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1); + my ($chromo_hard_mask_fh, $chromo_hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh); + + $self->tidy_file_handle($chromo_fh, $chromo_file_name); + $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name); + $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file); + } + $self->info("Dumped chromosomes"); + + #input_id + push(@{$self->param('dataflows')}, [{ data_type => 'dna', species => $self->param('species') }, $DNA_INDEXING_FLOW]); + push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]); + push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]); + + return; +} + +sub _dump_slice { + my ($self, $s, $serialiser, $rm_serialiser) = @_; + + my $analyses = $self->param('analyses'); + + my $chr = $s->is_chromosome(); + $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end()); + $self->info(' Slice is a chromosome') if $chr; + $self->info(' Slice is non-chromosomal') if ! $chr; + + # Make a padded slice (to automatically pad with N's outside of known regions) + # and make a repeat-masked slice and then pad that too. + my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s); + $serialiser->print_Seq($padded_slice); + + my $soft_mask = 1; + my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask); + my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice); + $rm_serialiser->print_Seq($padded_masked_slice); + + return; +} + +#Assumes we are working with un-compressed files +sub _convert_softmask_to_hardmask { + my ($self, $soft_mask_file, $soft_mask_fh) = @_; + if(! -f $soft_mask_file) { + $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file); + return; + } + my $hard_mask_file = $soft_mask_file; + $hard_mask_file =~ s/\.dna_sm\./.dna_rm./; + my $hm_fh = IO::File->new($hard_mask_file, 'w'); + $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file); + work_with_file($soft_mask_file, 'r', sub { + my ($sm_fh) = @_; + while(my $line = <$sm_fh>) { + if(index($line, '>') == 0) { + $line =~ s/dna_sm/dna_rm/; + } + else { + $line =~ tr/[acgtn]/N/; + } + print $hm_fh $line; + }; + return; + }); + return ($hm_fh, $hard_mask_file); +} + +sub _dump_transcripts { + my ($self, $transcript_type, $type) = @_; + + my $has_transcript_data = 0; + my $has_protein_data = 0; + + my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef; + my ( $filename, $fh, $transcript_serializer ) = + $self->_generate_fasta_serializer( $transcript_type, $transcript_level ); + + my ( $peptide_filename, $pep_fh, $peptide_serializer ); + + # some cDNAs are translated, make a file to receive them. + if ( $transcript_type eq 'cdna') { + ( $peptide_filename, $pep_fh, $peptide_serializer ) = + $self->_generate_fasta_serializer( 'pep', 'all' ); + } + + # work out what biotypes correspond to $transcript_type + my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new(); + my $biotypes_list = $biotype_mapper->group_members($transcript_type); + + my $dba = $self->get_DBAdaptor($type); + my $gene_adaptor = $dba->get_GeneAdaptor(); + + # get all the transcripts that are $transcript_type e.g. cdna, ncrna, + foreach my $biotype ( @{$biotypes_list} ) { + my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype); + $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} )); + while ( my $gene = shift @{$gene_list} ) { + $self->fine( 'Gene %s', $gene->display_id ); + my $transcript_list = $gene->get_all_Transcripts(); + foreach my $transcript ( @{$transcript_list} ) { + $self->fine( 'Transcript %s', $transcript->display_id ); + next unless $self->ok_to_process_logic_name($transcript); + + # foreach transcripts of all genes with biotypes classed as cdna + my $transcript_seq = $transcript->seq(); + $self->_create_display_id($transcript, $transcript_seq, $transcript_type); + $transcript_serializer->print_Seq($transcript_seq); + if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) { + my $translation = $transcript->translation(); + if ($translation) { + my $translation_seq = $transcript->translate(); + $self->_create_display_id($translation, $translation_seq, $transcript_type); + $peptide_serializer->print_Seq($translation_seq); + + $has_protein_data = 1; + } + } + + $has_transcript_data = 1; + } + } + } + + $self->tidy_file_handle( $fh, $filename ); + if ( $transcript_type eq 'cdna' ) { + $self->tidy_file_handle( $pep_fh, $peptide_filename ); + } + + if($has_protein_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]); + } + if($has_transcript_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]); + } + + return ($has_transcript_data, $has_protein_data); +} + +# Dump prediction transcripts and peptides. All predicted transcripts have translations +sub _dump_prediction_transcripts { + my ($self, $type) = @_; + my $dba = $self->get_DBAdaptor($type); + + my $has_transcript_data = 0; + my $has_protein_data = 0; + + my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor(); + my $transcript_list = $prediction_transcript_adaptor->fetch_all(); + my $count = scalar(@{$transcript_list}); + $self->info('Found %d prediction transcript(s)', $count); + if($count) { + my ( $abinitio_filename, $fh, $abinitio_serializer ) = + $self->_generate_fasta_serializer( 'cdna', 'abinitio' ); + my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) = + $self->_generate_fasta_serializer( 'pep', 'abinitio' ); + + while ( my $transcript = shift @{$transcript_list} ) { + next unless $self->ok_to_process_logic_name($transcript); + + $has_transcript_data = 1; + my $transcript_seq = $transcript->seq(); + $self->_create_display_id( $transcript, $transcript_seq, 'cdna' ); + $abinitio_serializer->print_Seq($transcript_seq); + + my $translation_seq = $transcript->translate(); + if ( $transcript->translation() ) { + $has_protein_data = 1; + $self->_create_display_id( $transcript, $translation_seq, 'pep' ); + $abinitio_peptide_serializer->print_Seq($translation_seq); + } + } + + $self->tidy_file_handle( $fh, $abinitio_filename ); + $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename ); + + if($has_protein_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]); + } + if($has_transcript_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]); + } + } + + + return ($has_transcript_data, $has_protein_data); +} + +# We can optionally skip the Gzip process & just delegate to the super class +# for it's cleanup routines which only work with an open file handle. Therefore +# only pass it onto the super implementation *if* the handle was open. +# Also only Gzip if the source file exists (it could have been unlinked from +# an earlier call) + +sub tidy_file_handle { + my ($self, $fh, $path, $no_gzip) = @_; + if($fh->opened()) { + my $tidy = $self->SUPER::tidy_file_handle($fh, $path); + return 1 if $tidy; + } + + return if $no_gzip; #don't gzip if we were told to skip + return if ! -f $path; #don't gzip if we had no file + + my $target = $path.".gz"; + $self->info('Gzipping "%s"', $path); + my %args; + if($self->param('generated_files')->{$target}) { + if($self->param('allow_appending')) { + $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target); + $args{Append} = 1; + } + elsif($self->param('overwrite_files')) { + $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target); + } + else { + $self->throw("Cannot continue. The file %s has already been created this session. Fail!"); + } + } + gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError"; + $self->info(' Removing original file from filesystem'); + unlink $path or throw "Could not delete $path: $!"; + $self->info(' Finished'); + $self->param('generated_files')->{$target} = 1; + return 0; +} + +#We assume a transcript is ok to process unless proven otherwise +sub ok_to_process_logic_name { + my ($self, $transcript) = @_; + my $ok = 1; + my $logic_name = $transcript->analysis()->logic_name(); + if($self->param('process_logic_active')) { + if(! $self->param('process_logic')->{$logic_name}) { + $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name); + $ok = 0; + } + } + if($self->param('skip_logic_active')) { + if($self->param('skip_logic')->{$logic_name}) { + $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name); + $ok = 0; + } + } + return $ok; +} + +#Generates a FASTA serializer but returns the (filename, handle & instance) +sub _generate_fasta_serializer { + my ( $self, $datatype, $level, $section, $header_formatter ) = @_; + $header_formatter ||= $self->_custom_header(); + my $chunk = $self->param('dna_chunk_size'); + my $filename = $self->_generate_file_name( $datatype, $level, $section ); + my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!"; + my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk); + return ( $filename, $fh, $ser ); +} + +# +# _generate_file_name(data type, level, section ) +# dna toplevel undef +# dna chromosome 6 + +sub _generate_file_name { + my ( $self, $data_type, $level, $section ) = @_; #level & section is optional + + # File name format looks like: + # ......fa + # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa + # Homo_sapiens.GRCh37.64.dna.chromosome.20.fa + # Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa + my @name_bits; + push @name_bits, $self->web_name(); + push @name_bits, $self->assembly(); + push @name_bits, $self->param('release'); + push @name_bits, lc($data_type); + push @name_bits, $level if $level; + push @name_bits, $section if $section; + push @name_bits, 'fa'; + + my $file_name = join( '.', @name_bits ); + + $data_type =~ s/_[rs]m$//; # remove repeatmask or softmask designation from path component + my $data_type_dir = $self->fasta_path($data_type); + $self->_remove_files_from_dir($data_type_dir); + return File::Spec->catfile( $data_type_dir, $file_name ); +} + +# Attempts to remove any generated files previously present for the instance +# of the Process +sub _remove_files_from_dir { + my ($self, $dir) = @_; + if(! $self->param('remove_files_from_dir')->{$dir}) { + $self->unlink_all_files($dir); + $self->param('remove_files_from_dir')->{$dir} = 1; + } + return; +} + +##Logic used to generate the expected format for a FASTA header +sub _create_display_id { + my ($self, $object, $seq, $type) = @_; + + my $stable_id; + my $location; + my $decoded_type; + my $decoded_status; + my %attributes; + + if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) { + $attributes{transcript_biotype} = $object->biotype(); + + #If pred transcript then no gene but type & status are different + if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) { + $stable_id = $object->stable_id(); + $location = $object->feature_Slice()->name(); + $decoded_type = $type; + $decoded_status = lc($object->analysis()->logic_name()); + if($type eq 'pep') { + $attributes{transcript} = $stable_id; + } + } + #Must be a real "transcript" + else { + $stable_id = $object->stable_id(); + $location = $object->feature_Slice()->name(); + my $gene = $object->get_Gene(); + $attributes{gene} = $gene->stable_id(); + $attributes{gene_biotype} = $gene->biotype(); + + #If ncRNA then we set type to the logic name and status to gene's biotype (taken from original script) + if($type eq 'ncrna') { + $decoded_type = lc($object->analysis()->logic_name()); + $decoded_status = $gene->biotype(); + } + elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) { + $decoded_type = $type; + $decoded_status = 'pseudogene'; + } + #Otherwise use type & object's transcript's status + else { + $decoded_type = $type; + $decoded_status = lc($object->status()); + } + } + } + #If it's a translation then grab the transcript and gene then set accordingly + elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) { + my $transcript = $object->transcript(); + my $gene = $transcript->get_Gene(); + $stable_id = $object->stable_id(); + $location = $transcript->feature_Slice()->name(); + %attributes = ( + gene => $gene->stable_id(), + gene_biotype => $gene->biotype(), + transcript => $transcript->stable_id(), + transcript_biotype => $transcript->biotype() + ); + $decoded_type = 'pep'; + $decoded_status = lc($transcript->status()); + } + else { + throw sprintf( 'Do not understand how to format a display_id for type "%s"', + ref($object) ); + } + + my $attr_str = join(q{ }, + map { $_.':'.$attributes{$_} } + grep { exists $attributes{$_} } + qw/gene transcript gene_biotype transcript_biotype/); + + my $format = '%s %s:%s %s %s'; + + my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str); + $seq->display_id($id); + + return; +} + +sub _custom_header { + my ($self) = @_; + return sub { + my $slice = shift; + if ( !$slice->isa('Bio::EnsEMBL::Slice') ) { + return $slice->display_id(); + } + + #RMS means masked data. soft_mask() true means it was softmasked + my $dna_type = 'dna'; + if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) { + $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm'; + } + + my $id = $slice->seq_region_name; + my $idtype = $slice->coord_system->name; + my $location = $slice->name; + my $ref = $slice->assembly_exception_type(); + my $header = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref); + return $header; + }; +} + +sub _final_filename { + my ($self, $filename) = @_; + return $filename if $filename =~ /\.gz$/; + return $filename.'.gz'; +} + +sub assembly_accession { + my ($self) = @_; + my $mc = $self->get_DBAdaptor()->get_MetaContainer(); + return $mc->single_value_by_key('assembly.accession'); +} + +sub assembly_accession_type { + my ($self) = @_; + my $mc = $self->get_DBAdaptor()->get_MetaContainer(); + return $mc->single_value_by_key('assembly.web_accession_type'); +} + +sub _create_README { + + #Text for readme files + + my %text = ( + dna => <<'README', +####################### +Fasta DNA dumps +####################### + +----------- +FILE NAMES +------------ +The files are consistently named following this pattern: + ......fa.gz + +: The systematic name of the species. +: The assembly build name. +: The release number. +: + * 'dna' - unmasked genomic DNA sequences. + * 'dna_rm' - masked genomic DNA. Interspersed repeats and low + complexity regions are detected with the RepeatMasker tool and masked + by replacing repeats with 'N's. + * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions + have been replaced with lowercased versions of their nucleic base + One of the following: + * 'chromosome'a - The top-level coordinate system in most species in Ensembl + * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome + * 'seqlevel' - This is usually sequence scaffolds, chunks or clones. + -- 'scaffold' - Larger sequence contigs from the assembly of shorter + sequencing reads (often from whole genome shotgun, WGS) which could + not yet be assembled into chromosomes. Often more genome sequencing + is needed to narrow gaps and establish a tiling path. + -- 'chunk' - While contig sequences can be assembled into large entities, + they sometimes have to be artificially broken down into smaller entities + called 'chunks'. This is due to limitations in the annotation + pipeline and the finite record size imposed by MySQL which stores the + sequence and annotation information. + -- 'clone' - In general this is the smallest sequence entity. It is often + identical to the sequence of one BAC clone, or sequence region + of one BAC clone which forms the tiling path. +: The actual sequence identifier. Depending on the the + could represent the name of a chromosome, a scaffold, a contig, a clone .. + Field is empty for seqlevel files +fa: All files in these directories represent FASTA database files +gz: All files are compacted with GNU Zip for storage efficiency. + +----------- +TOPLEVEL +---------- +These files contain the full sequence of the assembly in fasta format. +They contain one chromosome per file. + +EXAMPLES + The genomic sequence of human chromosome 1: + Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz + + The masked version of the genome sequence on human chromosome 1 + (contains '_rm' or '_sm' in the name): + Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz + Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz + + Non-chromosomal assembly sequences: + e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes + Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz + Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz + Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz + + +-------------- +SPECIAL CASES +-------------- +Some chromosomes have alternate haplotypes which are presented in files with +the haplotype sequence only: + Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz + Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz + + +Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR) +on the Y is annotated. By definition the PAR region is identical on the +X and Y chromosome. We provide this sequence in the following way. +-- The Y chromosome file contains the complete sequence of the PAR: + Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz +-- The top level file includes only the unique portion of Y (i.e. the PAR + (region is N-masked): + Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz + +README + + pep => <<'README', +#################### +Fasta Peptide dumps +#################### +These files hold the protein translations of Ensembl gene predictions. + +----------- +FILE NAMES +------------ +The files are consistently named following this pattern: + .....fa.gz + +: The systematic name of the species. +: The assembly build name. +: The release number. +: pep for peptide sequences + + * 'pep.all' - the super-set of all translations resulting from Ensembl known + or novel gene predictions. + * 'pep.abinitio' translations resulting from 'ab initio' gene + prediction algorithms such as SNAP and GENSCAN. In general, all + 'ab initio' predictions are based solely on the genomic sequence and + not any other experimental evidence. Therefore, not all GENSCAN + or SNAP predictions represent biologically real proteins. +fa : All files in these directories represent FASTA database files +gz : All files are compacted with GNU Zip for storage efficiency. + +EXAMPLES (Note: Most species do not sequences for each different ) + for Human: + Homo_sapiens.NCBI36.40.pep.all.fa.gz + contains all known and novel peptides + Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz + contains all abinitio predicted peptide + +Difference between known and novel +---------------------------------- +Protein models that can be mapped to species-specific entries in +Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as +known genes. Those that cannot be mapped are called novel +(e.g. genes predicted on the basis of evidence from closely related species). + +For models annotated by HAVANA the status is set manually. Models that have +an HGNC name are referred to as known and the remaining models are referred to +as novel. + +------------------------------- +FASTA Sequence Header Lines +------------------------------ +The FASTA sequence header lines are designed to be consistent across +all types of Ensembl FASTA sequences. This gives enough information +for the sequence to be identified outside the context of the FASTA +database file. + +General format: + +>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT + +Example of Ensembl Peptide header: + +>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding + ^ ^ ^ ^ ^ ^ ^ ^ + ID | | LOCATION GENE:stable gene ID | GENE: gene biotype TRANSCRIPT: transcript biotype + | STATUS TRANSCRIPT: stable transcript ID + SEQTYPE + +README + + cdna => <<'README', +################## +Fasta cDNA dumps +################# + +These files hold the cDNA sequences corresponding to Ensembl gene predictions. + +------------ +FILE NAMES +------------ +The files are consistently named following this pattern: +.....fa.gz + +: The systematic name of the species. +: The assembly build name. +: The release number. +: cdna for cDNA sequences + + * 'cdna.all' - the super-set of all transcripts resulting from + Ensembl known, novel and pseudo gene predictions (see more below). + * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction + algorithms such as SNAP and GENSCAN. In general all 'ab initio' + predictions are solely based on the genomic sequence and do not + use other experimental evidence. Therefore, not all GENSCAN or SNAP + cDNA predictions represent biologically real cDNAs. + Consequently, these predictions should be used with care. + +EXAMPLES (Note: Most species do not sequences for each different ) + for Human: + Homo_sapiens.NCBI36.40.cdna.all.fa.gz + cDNA sequences for all transcripts: known, novel and pseudo + Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz + cDNA sequences for 'ab-initio' prediction transcripts. + +Difference between known and novel transcripts +----------------------------------------------- +Transcript or protein models that can be mapped to species-specific entries +in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl. +Those that cannot be mapped are called novel genes (e.g. genes predicted on +the basis of evidence from closely related species). + +For models annotated by HAVANA the status is set manually. Models that have +an HGNC name are referred to as known and the remaining models are referred to +as novel. + +------------------------------- +FASTA Sequence Header Lines +------------------------------ +The FASTA sequence header lines are designed to be consistent across +all types of Ensembl FASTA sequences. This gives enough information +for the sequence to be identified outside the context of the FASTA file. + +General format: + +>ID SEQTYPE:STATUS LOCATION GENE + +Example of an Ensembl cDNA header: + +>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding + ^ ^ ^ ^ ^ ^ ^ + ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype + | STATUS + SEQTYPE + + +README + + ncrna => <<'README', +################## +Fasta RNA dumps +################# + +These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA). + +------------ +FILE NAMES +------------ +The files are consistently named following this pattern: +....fa.gz + +: The systematic name of the species. +: The assembly build name. +: The release number. +: ncrna for non-coding RNA sequences + +EXAMPLES + for Human: + Homo_sapiens.NCBI36.40.ncrna.fa.gz + Transcript sequences for all ncRNA gene types. + + +------------------------------- +FASTA Sequence Header Lines +------------------------------ +The FASTA sequence header lines are designed to be consistent across +all types of Ensembl FASTA sequences. This gives enough information +for the sequence to be identified outside the context of the FASTA file. + +General format: + +>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA + ^ ^ ^ ^ ^ ^ ^ + ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype + | STATUS + SEQTYPE + + +README + ); + + my $warning = <<'README'; +#### README #### + +IMPORTANT: Please note you can download correlation data tables, +supported by Ensembl, via the highly customisable BioMart and +EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or +http://www.ebi.ac.uk/biomart/ for more information. + +README + + my ( $self, $data_type ) = @_; + my $base_path = $self->fasta_path(); + my $path = File::Spec->catfile( $base_path, $data_type, 'README' ); + my $accession = $self->assembly_accession(); + my $txt = $text{$data_type}; + throw "Cannot find README text for type $data_type" unless $txt; + + #Add accession information if it is available + if($data_type eq 'dna' && $accession) { + my $type = $self->assembly_accession_type(); + $warning .= <assert_executable('sendmail'); + + my $dump_dna = $self->jobs('DumpDNA'); + my $copy_dna = $self->jobs('CopyDNA'); + my $dump_genes = $self->jobs('DumpGenes'); + my $blast_dna = $self->jobs('BlastDNAIndex'); + my $blast_gene = $self->jobs('BlastGeneIndex'); + my $blast_pep = $self->jobs('BlastPepIndex'); + my $blat = $self->jobs('BlatDNAIndex', 100); + my $blat_sm = $self->jobs('BlatSmDNAIndex', 100); + + my @args = ( + $dump_dna->{successful_jobs}, + $dump_dna->{failed_jobs}, + $copy_dna->{successful_jobs}, + $copy_dna->{failed_jobs}, + $dump_genes->{successful_jobs}, + $dump_genes->{failed_jobs}, + $blast_dna->{successful_jobs}, + $blast_dna->{failed_jobs}, + $blast_gene->{successful_jobs}, + $blast_gene->{failed_jobs}, + $blast_pep->{successful_jobs}, + $blast_pep->{failed_jobs}, + $blat->{successful_jobs}, + $blat->{failed_jobs}, + $blat_sm->{successful_jobs}, + $blat_sm->{failed_jobs}, + $self->failed(), + $self->summary($dump_dna), + $self->summary($copy_dna), + $self->summary($dump_genes), + $self->summary($blast_dna), + $self->summary($blast_gene), + $self->summary($blast_pep), + $self->summary($blat), + $self->summary($blat_sm), + ); + + my $msg = sprintf(<<'MSG', @args); +Your FASTA Pipeline has finished. We have: + + * %d species with new DNA Dumped (%d failed) + * %d species with copied DNA (%d failed) + * %d species with genes dumped (%d failed) + * %d species with BLAST DNA indexes generated (%d failed) + * %d species with BLAST GENE indexes generated (%d failed) + * %d species with BLAST PEPTIDE indexes generated (%d failed) + * %d species with BLAT DNA generated (%d failed) + * %d species with BLAT soft-masked DNA generated (%d failed) + +%s + +=============================================================================== + +Full breakdown follows ... + +%s + +%s + +%s + +%s + +%s + +%s + +%s + +%s + +MSG + $self->param('text', $msg); + return; +} + +sub jobs { + my ($self, $logic_name, $minimum_runtime) = @_; + my $aa = $self->db->get_AnalysisAdaptor(); + my $aja = $self->db->get_AnalysisJobAdaptor(); + my $analysis = $aa->fetch_by_logic_name($logic_name); + my $id = $analysis->dbID(); + my @jobs = @{$aja->generic_fetch("j.analysis_id =$id")}; + $_->{input} = destringify($_->input_id()) for @jobs; + @jobs = + sort { $a->{input}->{species} cmp $b->{input}->{species} } + grep { + if($minimum_runtime) { + if($minimum_runtime > $_->runtime_msec()) { + 1; + } + else { + 0; + } + } + else { + 1; + } + } + @jobs; + my %passed_species = map { $_->{input}->{species}, 1 } grep { $_->status() eq 'DONE' } @jobs; + my %failed_species = map { $_->{input}->{species}, 1 } grep { $_->status() eq 'FAILED' } @jobs; + return { + analysis => $analysis, + name => $logic_name, + jobs => \@jobs, + successful_jobs => scalar(keys %passed_species), + failed_jobs => scalar(keys %failed_species), + }; +} + + +sub failed { + my ($self) = @_; + my $failed = $self->db()->get_AnalysisJobAdaptor()->fetch_all_failed_jobs(); + if(! @{$failed}) { + return 'No jobs failed. Congratulations!'; + } + my $output = <<'MSG'; +The following jobs have failed during this run. Please check your hive's error msg table for the following jobs: + +MSG + foreach my $job (@{$failed}) { + my $analysis = $self->db()->get_AnalysisAdaptor()->fetch_by_dbID($job->analysis_id()); + my $line = sprintf(q{ * job_id=%d %s(%5d) input_id='%s'}, $job->dbID(), $analysis->logic_name(), $analysis->dbID(), $job->input_id()); + $output .= $line; + $output .= "\n"; + } + return $output; +} + +my $sorter = sub { + my $status_to_int = sub { + my ($v) = @_; + return ($v->status() eq 'FAILED') ? 0 : 1; + }; + my $status_sort = $status_to_int->($a) <=> $status_to_int->($b); + return $status_sort if $status_sort != 0; + return $a->{input}->{species} cmp $b->{input}->{species}; +}; + +sub summary { + my ($self, $data) = @_; + my $name = $data->{name}; + my $underline = '~'x(length($name)); + my $output = "$name\n$underline\n\n"; + my @jobs = @{$data->{jobs}}; + if(@jobs) { + foreach my $job (sort $sorter @{$data->{jobs}}) { + my $species = $job->{input}->{species}; + $output .= sprintf(" * %s - job_id=%d %s\n", $species, $job->dbID(), $job->status()); + } + } + else { + $output .= "No jobs run for this analysis\n"; + } + $output .= "\n"; + return $output; +} + +1; \ No newline at end of file diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/FindDirs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/FindDirs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,59 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::FindDirs + +=head1 DESCRIPTION + +Finds all directories under the given species directory. This is used to +flow any further processing only dependent on the directory and so +inherits from JobFactory and Bio::EnsEMBL::Pipeline::FASTA::Base to bring in methods which +know about the FTP structure. + +Allowed parameters are: + +=over 8 + +=item species - The species to work with + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::FindDirs; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::FindDirs Bio::EnsEMBL::Pipeline::FASTA::Base/; + +use File::Spec; + +sub fetch_input { + my ($self) = @_; + $self->throw("No 'species' parameter specified") unless $self->param('species'); + $self->param('path', $self->fasta_path()); + $self->SUPER::fetch_input(); + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/Indexer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/Indexer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,58 @@ +package Bio::EnsEMBL::Pipeline::FASTA::Indexer; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/; + +use File::Copy qw/copy/; +use File::Spec; +use Bio::EnsEMBL::Utils::Exception qw/throw/; + +sub decompress { + my ($self) = @_; + my $source = $self->param('file'); + my $target_dir = $self->target_dir(); + my ($vol, $dir, $file) = File::Spec->splitpath($source); + my $target = File::Spec->catdir($target_dir, $file); + my $gunzipped_target = $target; + $gunzipped_target =~ s/.gz$//; + $self->info('Copying from %s to %s', $source, $target); + copy($source, $target) or throw "Cannot copy $source to $target: $!"; + $self->info('Decompressing %s to %s', $source, $gunzipped_target); + system("gunzip -f $target") and throw sprintf('Could not gunzip. Exited with code %d', ($? >>8)); + return $gunzipped_target; +} + +sub repeat_mask_date { + my ($self) = @_; + my $res = $self->get_DBAdaptor()->dbc()->sql_helper()->execute_simple( + -SQL => <<'SQL', +select max(date_format( created, "%Y%m%d")) +from analysis a join meta m on (a.logic_name = lower(m.meta_value)) +where meta_key =? +SQL + -PARAMS => ['repeat.analysis'] + ); + return $res->[0] if @$res; + return q{}; +} + +sub run { + my ($self) = @_; + my $decompressed = $self->decompress(); + $self->index_file($decompressed); + $self->cleanup_DBAdaptor(); + return; +} + +sub index_file { + die "Implement"; +} + +sub target_dir { + die "Implement"; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/ReuseSpeciesFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/ReuseSpeciesFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,158 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::ReuseSpeciesFactory + +=head1 DESCRIPTION + +An extension of the SpeciesFactory code. This uses the ensembl production +database to decide if a species has had an update to its DNA or not. An update +means any change to the assembly or repeat masking. + +Allowed parameters are: + +=over 8 + +=item release - Needed to query production with + +=item ftp_dir - If not specified then we cannot reuse + +=item force_species - Specify species we want to redump even though + our queries of production could say otherwise + +=back + +The registry should also have a DBAdaptor for the production schema +registered under the species B and the group B. + +The code adds an additional flow output: + +=over 8 + +=item 4 - Perform DNA reuse + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::ReuseSpeciesFactory; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::FASTA::SpeciesFactory/; + +use Bio::EnsEMBL::Registry; +use File::Spec; + +sub param_defaults { + my ($self) = @_; + my $p = { + %{$self->SUPER::param_defaults()}, + + force_species => [], + }; + return $p; +} + +sub fetch_input { + my ($self) = @_; + $self->SUPER::fetch_input(); + $self->throw("Need to define a release parameter") unless $self->param('release'); + + #Calculate quick lookup + my %force_species_lookup; + foreach my $s (@{$self->param('force_species')}) { + my $new = Bio::EnsEMBL::Registry->get_alias($s); + $force_species_lookup{$new} = 1; + } + $self->param('force_species_lookup', \%force_species_lookup); + + return; +} + +sub dna_flow { + my ($self, $dba) = @_; + my $parent_flow = $self->SUPER::dna_flow($dba); + return 0 if ! $parent_flow; #return if parent decided to skip + if(! $self->param('ftp_dir')) { + $self->fine('No ftp_dir param defined so will flow %s to %d', $dba->species(), $parent_flow); + return $parent_flow; + } + my $requires_new_dna = $self->requires_new_dna($dba); + return $parent_flow if $requires_new_dna; + return 4; #nominated flow for copying +} + +sub requires_new_dna { + my ($self, $dba) = @_; + + if($self->force_run($dba)) { + $self->fine('Automatically forcing DNA rerun for %s', $dba->species()); + return 1; + } + + if(!$self->has_source_dir($dba)) { + $self->fine('Source directory is missing; forcing DNA rerun for %s', $dba->species()); + return 1; + } + + my $sql = <<'SQL'; +select count(*) +from changelog c +join changelog_species cs using (changelog_id) +join species s using (species_id) +where c.release_id = ? +and (c.assembly = ? or c.repeat_masking = ?) +and c.team = ? +and c.status = ? +and production_name = ? +SQL + my $production_name = $dba->get_MetaContainer()->get_production_name(); + $dba->dbc()->disconnect_if_idle(); + my $release = $self->param('release'); + my $params = [ $release, 'Y', 'Y', 'Genebuild', 'handed_over', $production_name ]; + my $prod_dba = $self->get_production_DBAdaptor(); + my $result = $prod_dba->dbc()->sql_helper()->execute_single_result(-SQL => $sql, -PARAMS => $params); + $prod_dba->dbc()->disconnect_if_idle(); + return $result; +} + +sub force_run { + my ($self, $dba) = @_; + my $new = Bio::EnsEMBL::Registry->get_alias($dba->species()); + return ($self->param('force_species_lookup')->{$new}) ? 1 : 0; +} + +sub has_source_dir { + my ($self, $dba) = @_; + my $dir = $self->old_path($dba->species()); + $dba->dbc()->disconnect_if_idle(); + return (-d $dir) ? 1 : 0; +} + +sub get_production_DBAdaptor { + my ($self) = @_; + return Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'production'); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/SCPBlast.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/SCPBlast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,209 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::SCPBlast + +=head1 DESCRIPTION + +Performs a find in the Blast index directory, for the given species and copies +them to the specified target servers. + +Allowed parameters are: + +=over 8 + +=item no_scp - If true then we will not run SCP but still finish cleanly without error + +=item type - The type of dump to copy. Required parameter + +=item genomic_dir - Needed if you are copying DNA genomic files + +=item genes_dir - Needed if you are copying DNA gene files + +=item target_servers - The servers to copy to. Expects to be an array + +=item species - Species to work with + +=item scp_user - The user to scp as. Defaults to the current user + +=item scp_identity - Give an identity file to use during ssh commands + (useful when you are not scping as yourself) + +=item base_path - The base of the dumps. The source blast directory is + constructed from this path + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::SCPBlast; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/; + +use Bio::EnsEMBL::Utils::Scalar qw/check_ref/; +use File::Spec; + +sub param_defaults { + my ($self) = @_; + return { + no_scp => 0, +# genomic_dir => '', +# genes_dir => '', +# target_servers => ['srv1', 'srv2'], + + scp_user => $ENV{USER}, #defaults to the current user +# scp_identity => '', + +# type => 'genes'/'genomic', +# species => '', + }; +} + +sub fetch_input { + my ($self) = @_; + if($self->param('no_scp')) { + $self->info('Skipping as no_scp has been specified'); + return; + } + + my $servers = $self->param('target_servers'); + + if(!check_ref($servers, 'ARRAY') || ! @{$servers}) { + my $msg = 'Will not perform copy as we have no servers'; + my $is_error = 0; + $self->db()->get_JobMessageAdaptor()->register_message( + $self->input_job()->dbID(), $msg, $is_error + ); + $self->info($msg); + return; + } + + foreach my $key (qw/type species/) { + $self->throw("Key $key is required") unless $self->param($key); + } + my $type = $self->param('type'); + if($type ne 'genomic' && $type ne 'genes') { + $self->throw("param 'type' must be set to 'genomic' or 'genes'"); + } + $self->target_dir(); #prodding for fetch's sake + return; +} + +sub run { + my ($self) = @_; + if($self->param('no_scp')) { + $self->info('Skipping as no_scp has been specified'); + return; + } + my $servers = $self->param('target_servers'); + return unless @{$servers}; + my $files = $self->get_files(); + foreach my $server (@{$servers}) { + $self->info('Copying files to %s for species %s', $server, $self->param('species')); + $self->copy_to_server($files, $server); + } + return; +} + +sub write_output { + my ($self) = @_; + $self->cleanup_DBAdaptor(); + return; +} + +sub copy_to_server { + my ($self, $files, $server) = @_; + my $target_dir = $self->target_dir(); + $self->check_remote_dir($target_dir, $server); + my $user = $self->param('scp_user'); + my $identity = $self->identity_param(); + foreach my $file (@{$files}) { + my ($volume, $directory, $filename) = File::Spec->splitpath($file); + my $target_path = File::Spec->catfile($target_dir, $filename); + my $cmd = sprintf('scp %s %s %s@%s:%s', $identity, $file, $user, $server, $target_path); + $self->fine('Executing %s', $cmd); + system($cmd) and $self->throw(sprintf("Cannot run command '%s'. RC %d", $cmd, ($?>>8))); + } + return; +} + +sub get_files { + my ($self) = @_; + my $species = $self->web_name(); + my $filter = sub { + my ($filename) = @_; + return ($filename =~ /^$species.+fa.+$/) ? 1 : 0; + }; + my $files = $self->find_files($self->blast_dir(), $filter); + $self->info('Found %d file(s) to copy', scalar(@{$files})); + return $files; +} + +sub blast_dir { + my ($self) = @_; + return $self->get_dir('blast', $self->param('type')); +} + +sub target_dir { + my ($self) = @_; + my $t = $self->param('type'); + my $key = "${t}_dir"; + my $dir = $self->param($key); + $self->throw("Cannot locate the parameter $key. We expect to do so") unless $dir; + return $dir; +} + +sub check_remote_dir { + my ($self, $remote_dir, $server) = @_; + my ($echo_rc) = $self->ssh_cmd($server, "echo -n 1"); + $self->throw("Cannot connect to $server") if $echo_rc; #1 means fail + my ($exists_rc) = $self->ssh_cmd($server, "test -d $remote_dir"); + if($exists_rc == 1) { + $self->info('Directory %s does not exist on %s. Will create it'); + my ($mkdir_rc, $mkdir_out) = $self->ssh_cmd($server, "mkdir -p $remote_dir"); + if($mkdir_rc == 1) { + $self->throw("Cannot create the directory $remote_dir on $server. Output from cmd was $mkdir_out. Check and rerun"); + } + } + return; +} + +sub ssh_cmd { + my ($self, $server, $cmd) = @_; + my $user = $self->param('scp_user'); + my $identity = $self->identity_param(); + $self->fine("Running command '%s' on '%s' as user '%s'", $cmd, $server, $user); + my $ssh_cmd = sprintf('ssh %s %s@%s "%s"', $identity, $user, $server, $cmd); + my $output = `$ssh_cmd`; + my $rc = $? >> 8; + return ($rc, $output); +} + +sub identity_param { + my ($self) = @_; + return ($self->param('scp_identity')) ? '-i '.$self->param('scp_identity') : q{}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/SpeciesFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/SpeciesFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,166 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::SpeciesFactory + +=head1 DESCRIPTION + +A module which generates dump jobs for each species it finds in the Ensembl +Registry. The type of dump performed is controlled by the I +parameter. The species we run the code on can be controlled by specifying +the I parameter or by reducing the number of DBAdaptors loaded into +the registry. + +Allowed parameters are: + +=over 8 + +=item sequence_type_list - The type of dump to perform. Should be an array and + can contain I, I and I. Defaults + to all of these. + +=item species - Can be an array of species to perform dumps for or a single + species name. If specified only jobs will be created for + those species. Defaults to nothing so all species are processed + +item db_types - Specify the types of database to dump. Defaults to core and + should be an array. + +=back + +The code flows to two outputs. Please take note if you are reusing this module + +=over 8 + +=item 2 - Perform DNA dumps + +=item 3 - Perform Gene dumps + +=back + +Multiple types of DB can be specifed with the I method call but +be aware that this is flowed as 1 job per species for all types. + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::SpeciesFactory; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::SpeciesFactory/; + +use Bio::EnsEMBL::Registry; + +sub param_defaults { + my ($self) = @_; + return { + %{$self->SUPER::param_defaults()}, + sequence_type_list => [qw/dna cdna ncrna/], + }; +} + +sub fetch_input { + my ($self) = @_; + $self->SUPER::fetch_input(); + $self->reset_empty_array_param('sequence_type_list'); + my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') }; + $self->param('sequence_types', \%sequence_types); + + return; +} + +sub run { + my ($self) = @_; + my @dna; + my @genes; + my @species; + foreach my $dba (@{$self->param('dbas')}) { + if(!$self->process_dba($dba)) { + $self->fine('Skipping %s', $dba->species()); + next; + } + + my $dna_flow = $self->dna_flow($dba); + if($dna_flow) { + push(@dna, [$self->input_id($dba, 'dna'), $dna_flow]); + } + + my $genes_flow = $self->genes_flow($dba); + if($genes_flow) { + push(@genes, [$self->input_id($dba, 'genes'), $genes_flow]); + } + + push(@species, [ { species => $dba->species() }, 5 ]); + } + $self->param('dna', \@dna); + $self->param('genes', \@genes); + $self->param('species', \@species); + return; +} + +sub write_output { + my ($self) = @_; + $self->do_flow('dna'); + $self->do_flow('genes'); + $self->do_flow('species'); + return; +} + +# return 0 if we do not want to do any flowing otherwise return 2 + +sub dna_flow { + my ($self, $dba) = @_; + return 0 unless $self->param('sequence_types')->{dna}; + return 2; +} + +# return 0 if we do not want to do any flowing otherwise return 3 + +sub genes_flow { + my ($self, $dba) = @_; + my $types = $self->param('sequence_types'); + return 0 if ! $types->{cdna} && ! $types->{ncrna}; + return 3; +} + +sub input_id { + my ($self, $dba, $type) = @_; + my $mc = $dba->get_MetaContainer(); + my $input_id = { + db_types => $self->db_types($dba), + species => $mc->get_production_name(), + }; + if($type eq 'dna') { + $input_id->{sequence_type_list} = ['dna']; + } + else { + my $types = $self->param('sequence_types'); + my @types; + push(@types, 'cdna') if $types->{cdna}; + push(@types, 'ncrna') if $types->{ncrna}; + $input_id->{sequence_type_list} = \@types; + } + return $input_id; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,160 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer + +=head1 DESCRIPTION + +Creates WUBlast indexes of the given GZipped file. The resulting index +is created under the parameter location I in blast and then in a +directory defined by the type of dump. The type of dump also changes the file +name generated. Genomic dumps have their release number replaced with the +last repeat masked date. + +Allowed parameters are: + +=over 8 + +=item file - The file to index + +=item program - The location of the xdformat program + +=item molecule - The type of molecule to index. I and I are allowed + +=item type - Type of index we are creating. I and I are allowed + +=item base_path - The base of the dumps + +=item release - Required for correct DB naming + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; + +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use File::Copy qw/copy/; +use File::Spec; +use POSIX qw/strftime/; + +sub param_defaults { + my ($self) = @_; + return { + program => 'xdformat', +# molecule => 'pep', #pep or dna +# type => 'genes' #genes or genomic + }; +} + +sub fetch_input { + my ($self) = @_; + my $mol = $self->param('molecule'); + if($mol ne 'dna' && $mol ne 'pep') { + throw "param 'molecule' must be set to 'dna' or 'pep'"; + } + my $type = $self->param('type'); + if($type ne 'genomic' && $type ne 'genes') { + throw "param 'type' must be set to 'genomic' or 'genes'"; + } + $self->assert_executable($self->param('program')); + $self->assert_executable('gunzip'); +} + +sub write_output { + my ($self) = @_; + $self->dataflow_output_id({ + species => $self->param('species'), + type => $self->param('type'), + molecule => $self->param('molecule'), + index_base => $self->param('index_base') + }, 1); + return; +} + +sub index_file { + my ($self, $file) = @_; + my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ; + my $silence = ($self->debug()) ? 0 : 1; + my $target_dir = $self->target_dir(); + my $target_file = $self->target_file($file); + my $db_title = $self->db_title($file); + my $date = $self->db_date(); + + my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s }, + $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file); + + $self->info('About to run "%s"', $cmd); + my $output = `$cmd 2>&1`; + my $rc = $? >> 8; + throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; + unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; + $self->param('index_base', $target_file); + return; +} + +sub target_file { + my ($self, $file) = @_; + my $target_dir = $self->target_dir(); + my $target_filename = $self->target_filename($file); + return File::Spec->catfile($target_dir, $target_filename); +} + +# Produce a dir like /nfs/path/to/blast/genes/XXX && /nfs/path/to/blast/dna/XXX +sub target_dir { + my ($self) = @_; + return $self->get_dir('blast', $self->param('type')); +} + +sub db_title { + my ($self, $source_file) = @_; + my ($vol, $dir, $file) = File::Spec->splitpath($source_file); + my $release = $self->param('release'); + my $title = $file; + $title =~ s/$release\.//; + return $title; +} + +sub db_date { + my ($self) = @_; + return strftime('%d-%m-%Y', gmtime()); +} + +#Source like Homo_sapiens.GRCh37.68.dna.toplevel.fa +#Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa +sub target_filename { + my ($self, $source_file) = @_; + my ($vol, $dir, $file) = File::Spec->splitpath($source_file); + if($self->param('type') eq 'genomic') { + my @split = split(/\./, $file); + my $rm_date = $self->repeat_mask_date(); + $split[-4] = $rm_date; + return join(q{.}, @split); + } + return $file; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/FindDirs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FindDirs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,80 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FindDirs + +=head1 DESCRIPTION + +Finds all directories under the given path. + +Allowed parameters are: + +=over 8 + +=item path - The path to search + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FindDirs; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Hive::RunnableDB::JobFactory/; + +use File::Spec; + +sub fetch_input { + my ($self) = @_; + $self->throw("No 'path' parameter specified") unless $self->param('path'); + my $dirs = $self->dirs(); + $self->param('inputlist', $dirs); + return; +} + +sub dirs { + my ($self) = @_; + + my @dirs; + + my $dir = $self->param('path'); + $self->info('Searching directory %s', $dir); + + opendir(my $dh, $dir) or die "Cannot open directory $dir"; + my @files = sort { $a cmp $b } readdir($dh); + closedir($dh) or die "Cannot close directory $dir"; + + foreach my $file (@files) { + next if $file =~ /^\./; #hidden file or up/current dir + my $path = File::Spec->catdir($dir, $file); + if(-d $path) { + $self->fine('Adding %s to the list of found dirs', $path); + push(@dirs, $path); + } + } + + return \@dirs; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/Base.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/Base.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,12 @@ +package Bio::EnsEMBL::Pipeline::Flatfile::Base; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::Base/; + +sub data_path { + my ($self) = @_; + return $self->get_dir($self->param('type'), $self->param('species')); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/ChecksumGenerator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/ChecksumGenerator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,60 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator + +=head1 DESCRIPTION + +Creates a CHECKSUMS file in the given directory which is produced from running +the sum command over every file in the directory. This excludes the CHECKSUMS +file, parent directory or any hidden files. + +Allowed parameters are: + +=over 8 + +=item species - Species to work with + +=item type - Type of data to work with + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::ChecksumGenerator Bio::EnsEMBL::Pipeline::Flatfile::Base/; + +sub fetch_input { + my ($self) = @_; + $self->throw("No 'species' parameter specified") unless $self->param('species'); + $self->throw("No 'type' parameter specified") unless $self->param('type'); + my $dir = $self->data_path(); + $self->param('dir', $dir); + $self->SUPER::fetch_input(); + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,253 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::Flatfile::DumpFile + +=head1 DESCRIPTION + +The main workhorse of the Flatfile dumping pipeline. + +The script is responsible for creating the filenames of these target +files, taking data from the database and the formatting of the flat files +headers. The final files are all Gzipped at normal levels of compression. + +Allowed parameters are: + +=over 8 + +=item species - The species to dump + +=item base_path - The base of the dumps + +=item release - The current release we are emitting + +=item type - The type of data we are emitting. Should be embl or genbank + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base); + +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Utils::SeqDumper; +use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file work_with_file/; +use File::Path qw/rmtree/; + +sub param_defaults { + my ($self) = @_; + return { + supported_types => {embl => 1, genbank => 1}, + }; +} + +sub fetch_input { + my ($self) = @_; + + my $type = $self->param('type'); + throw "No type specified" unless $type; + throw "Unsupported type '$type' specified" unless $self->param('supported_types')->{$type}; + + throw "Need a species" unless $self->param('species'); + throw "Need a release" unless $self->param('release'); + throw "Need a base_path" unless $self->param('base_path'); + + return; +} + +sub run { + my ($self) = @_; + + my $root = $self->data_path(); + if(-d $root) { + $self->info('Directory "%s" already exists; removing', $root); + rmtree($root); + } + + my $type = $self->param('type'); + my $target = "dump_${type}"; + my $seq_dumper = $self->_seq_dumper(); + + my @chromosomes; + my @non_chromosomes; + foreach my $s (@{$self->get_Slices()}) { + my $chr = $s->is_chromosome(); + push(@chromosomes, $s) if $chr; + push(@non_chromosomes, $s) if ! $chr; + } + + if(@non_chromosomes) { + my $path = $self->_generate_file_name('nonchromosomal'); + $self->info('Dumping non-chromosomal data to %s', $path); + gz_work_with_file($path, 'w', sub { + my ($fh) = @_; + foreach my $slice (@non_chromosomes) { + $self->fine('Dumping non-chromosomal %s', $slice->name()); + $seq_dumper->$target($slice, $fh); + } + return; + }); + } + else { + $self->info('Did not find any non-chromosomal data'); + } + + foreach my $slice (@chromosomes) { + $self->fine('Dumping chromosome %s', $slice->name()); + my $path = $self->_generate_file_name($slice->coord_system_name(), $slice->seq_region_name()); + my $args = {}; + if(-f $path) { + $self->fine('Path "%s" already exists; appending', $path); + $args->{Append} = 1; + } + gz_work_with_file($path, 'w', sub { + my ($fh) = @_; + $seq_dumper->$target($slice, $fh); + return; + }, $args); + } + + $self->_create_README(); + + return; +} + +sub _seq_dumper { + my ($self) = @_; + my $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new(); + $seq_dumper->disable_feature_type('similarity'); + $seq_dumper->disable_feature_type('genscan'); + $seq_dumper->disable_feature_type('variation'); + $seq_dumper->disable_feature_type('repeat'); + return $seq_dumper; +} + +sub _generate_file_name { + my ($self, $section, $name) = @_; + + # File name format looks like: + # ....dat.gz + # e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz + # Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz + my @name_bits; + push @name_bits, $self->web_name(); + push @name_bits, $self->assembly(); + push @name_bits, $self->param('release'); + push @name_bits, $section if $section; + push @name_bits, $name if $name; + push @name_bits, 'dat', 'gz'; + + my $file_name = join( '.', @name_bits ); + my $path = $self->data_path(); + return File::Spec->catfile($path, $file_name); +} + +sub _create_README { + my ($self) = @_; + my $species = $self->scientific_name(); + my $format = uc($self->param('type')); + + my $readme = <catfile($self->data_path(), 'README'); + work_with_file($path, 'w', sub { + my ($fh) = @_; + print $fh $readme; + return; + }); + return; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/FindDirs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/FindDirs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,57 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::Flatfile::FindDirs + +=head1 DESCRIPTION + +Finds all directories under the given species directory. This is used to +flow any further processing only dependent on the directory + +Allowed parameters are: + +=over 8 + +=item species - The species to work with + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::FindDirs; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::FindDirs Bio::EnsEMBL::Pipeline::Flatfile::Base/; + +use File::Spec; + +sub fetch_input { + my ($self) = @_; + $self->throw("No 'species' parameter specified") unless $self->param('species'); + $self->param('path', $self->data_path()); + $self->SUPER::fetch_input(); + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,296 @@ +package Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +use Bio::EnsEMBL::ApiVersion qw/software_version/; + +sub default_options { + my ($self) = @_; + + return { + # inherit other stuff from the base class + %{ $self->SUPER::default_options() }, + + ### OVERRIDE + + #'registry' => 'Reg.pm', # default option to refer to Reg.pm, should be full path + #'base_path' => '', #where do you want your files + + ### Optional overrides + ftp_dir => '', + + species => [], + + dump_types => [], + + db_types => [], + + force_species => [], + + process_logic_names => [], + + skip_logic_names => [], + + release => software_version(), + + previous_release => (software_version() - 1), + + ### SCP code + + blast_servers => [], + blast_genomic_dir => '', + blast_genes_dir => '', + + scp_user => $self->o('ENV', 'USER'), + scp_identity => '', + no_scp => 0, + + ### Defaults + + pipeline_name => 'fasta_dump_'.$self->o('release'), + + wublast_exe => 'xdformat', + blat_exe => 'faToTwoBit', + port_offset => 30000, + + email => $self->o('ENV', 'USER').'@sanger.ac.uk', + }; +} + +sub pipeline_create_commands { + my ($self) = @_; + return [ + # inheriting database and hive tables' creation + @{$self->SUPER::pipeline_create_commands}, + ]; +} + +## See diagram for pipeline structure +sub pipeline_analyses { + my ($self) = @_; + + return [ + + { + -logic_name => 'ScheduleSpecies', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::ReuseSpeciesFactory', + -parameters => { + species => $self->o('species'), + sequence_type_list => $self->o('dump_types'), + ftp_dir => $self->o('ftp_dir'), + force_species => $self->o('force_species'), + }, + -input_ids => [ {} ], + -flow_into => { + 1 => 'Notify', + 2 => 'DumpDNA', + 3 => 'DumpGenes', + 4 => 'CopyDNA', + 5 => 'ChecksumGeneratorFactory' + }, + }, + + ######### DUMPING DATA + + { + -logic_name => 'DumpDNA', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::DumpFile', + -parameters => { + process_logic_names => $self->o('process_logic_names'), + skip_logic_names => $self->o('skip_logic_names'), + }, + -can_be_empty => 1, + -flow_into => { + 1 => 'ConcatFiles' + }, + -can_be_empty => 1, + -max_retry_count => 1, + -hive_capacity => 10, + -rc_name => 'dump', + }, + + { + -logic_name => 'DumpGenes', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::DumpFile', + -flow_into => { + 2 => ['BlastPepIndex'], + 3 => ['BlastGeneIndex'] + }, + -max_retry_count => 1, + -hive_capacity => 10, + -can_be_empty => 1, + -rc_name => 'dump', + -wait_for => 'DumpDNA' #block until DNA is done + }, + + { + -logic_name => 'ConcatFiles', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles', + -can_be_empty => 1, + -max_retry_count => 5, + -flow_into => { + 1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex/] + }, + }, + + ######## COPY DATA + + { + -logic_name => 'CopyDNA', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::CopyDNA', + -can_be_empty => 1, + -hive_capacity => 5, + -parameters => { + ftp_dir => $self->o('ftp_dir') + }, + }, + + ######## INDEXING + + { + -logic_name => 'BlastDNAIndex', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer', + -parameters => { + molecule => 'dna', type => 'genomic', program => $self->o('wublast_exe') + }, + -hive_capacity => 10, + -can_be_empty => 1, + -rc_name => 'indexing', + }, + + { + -logic_name => 'BlastPepIndex', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer', + -parameters => { + molecule => 'pep', type => 'genes', program => $self->o('wublast_exe') + }, + -hive_capacity => 5, + -can_be_empty => 1, + -flow_into => { + 1 => [qw/SCPBlast/], + }, + }, + + { + -logic_name => 'BlastGeneIndex', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer', + -parameters => { + molecule => 'dna', type => 'genes', program => $self->o('wublast_exe') + }, + -hive_capacity => 5, + -can_be_empty => 1, + -flow_into => { + 1 => [qw/SCPBlast/], + }, + }, + + { + -logic_name => 'BlatDNAIndex', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer', + -parameters => { + port_offset => $self->o('port_offset'), + program => $self->o('blat_exe'), + 'index' => 'dna' + }, + -can_be_empty => 1, + -hive_capacity => 5, + -rc_name => 'indexing', + }, + + { + -logic_name => 'BlatSmDNAIndex', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer', + -parameters => { + port_offset => $self->o('port_offset'), + program => $self->o('blat_exe'), + 'index' => 'dna_sm' + }, + -can_be_empty => 1, + -hive_capacity => 5, + -rc_name => 'indexing', + }, + + ######## COPYING + { + -logic_name => 'SCPBlast', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::SCPBlast', + -parameters => { + target_servers => $self->o('blast_servers'), + genomic_dir => $self->o('blast_genomic_dir'), + genes_dir => $self->o('blast_genes_dir'), + + scp_user => $self->o('scp_user'), + scp_identity => $self->o('scp_identity'), + + no_scp => $self->o('no_scp'), + }, + -hive_capacity => 3, + -can_be_empty => 1, + -wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/] + }, + + ####### CHECKSUMMING + + { + -logic_name => 'ChecksumGeneratorFactory', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::FindDirs', + -parameters => { + column_names => [qw/dir/], + input_id => { 'dir' => '#dir#' }, + fan_branch_code => 2, + }, + -wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/], + -flow_into => { 2 => ['ChecksumGenerator'] } + }, + + { + -logic_name => 'ChecksumGenerator', + -module => 'Bio::EnsEMBL::Pipeline::ChecksumGenerator', + -hive_capacity => 10, + }, + + ####### NOTIFICATION + + { + -logic_name => 'Notify', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::EmailSummary', + -parameters => { + email => $self->o('email'), + subject => $self->o('pipeline_name').' has finished', + }, + -wait_for => ['SCPBlast', 'ChecksumGenerator'], + } + + ]; +} + +sub pipeline_wide_parameters { + my ($self) = @_; + + return { + %{ $self->SUPER::pipeline_wide_parameters() }, # inherit other stuff from the base class + base_path => $self->o('base_path'), + db_types => $self->o('db_types'), + release => $self->o('release'), + previous_release => $self->o('previous_release'), + }; +} + +# override the default method, to force an automatic loading of the registry in all workers +sub beekeeper_extra_cmdline_options { + my $self = shift; + return "-reg_conf ".$self->o("registry"); +} + +sub resource_classes { + my $self = shift; + return { + 'dump' => { LSF => '-q long -M1000000 -R"select[mem>1000] rusage[mem=1000]"' }, + 'indexing' => { LSF => '-q normal -M2000000 -R"select[mem>2000] rusage[mem=2000]"' }, + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/PipeConfig/Flatfile_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/PipeConfig/Flatfile_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,137 @@ +package Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +use Bio::EnsEMBL::ApiVersion qw/software_version/; + +sub default_options { + my ($self) = @_; + + return { + # inherit other stuff from the base class + %{ $self->SUPER::default_options() }, + + ### OVERRIDE + + #'registry' => 'Reg.pm', # default option to refer to Reg.pm, should be full path + #'base_path' => '', #where do you want your files + + ### Optional overrides + species => [], + + release => software_version(), + + types => [qw/embl genbank/], + + ### Defaults + + pipeline_name => 'flatfile_dump_'.$self->o('release'), + + email => $self->o('ENV', 'USER').'@sanger.ac.uk', + + }; +} + +sub pipeline_create_commands { + my ($self) = @_; + return [ + # inheriting database and hive tables' creation + @{$self->SUPER::pipeline_create_commands}, + ]; +} + +## See diagram for pipeline structure +sub pipeline_analyses { + my ($self) = @_; + + return [ + + { + -logic_name => 'ScheduleSpecies', + -module => 'Bio::EnsEMBL::Pipeline::SpeciesFactory', + -parameters => { + species => $self->o('species'), + randomize => 1, + }, + -input_ids => [ {} ], + -flow_into => { + 1 => 'Notify', + 2 => ['DumpTypeFactory'], + }, + }, + + ######### DUMPING DATA + + { + -logic_name => 'DumpTypeFactory', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', + -parameters => { + column_names => ['type'], + inputlist => $self->o('types'), + input_id => { species => '#species#', type => '#type#' }, + fan_branch_code => 2 + }, + -flow_into => { 2 => ['DumpFlatfile', 'ChecksumGenerator'] }, + }, + + { + -logic_name => 'DumpFlatfile', + -module => 'Bio::EnsEMBL::Pipeline::Flatfile::DumpFile', + -max_retry_count => 1, + -hive_capacity => 10, + -rc_name => 'dump', + }, + + ####### CHECKSUMMING + + { + -logic_name => 'ChecksumGenerator', + -module => 'Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator', + -wait_for => [qw/DumpFlatfile/], + -hive_capacity => 10, + }, + + ####### NOTIFICATION + + { + -logic_name => 'Notify', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail', + -parameters => { + email => $self->o('email'), + subject => $self->o('pipeline_name').' has finished', + text => 'Your pipeline has finished. Please consult the hive output' + }, + -wait_for => ['ChecksumGenerator'], + } + + ]; +} + +sub pipeline_wide_parameters { + my ($self) = @_; + + return { + %{ $self->SUPER::pipeline_wide_parameters() }, # inherit other stuff from the base class + base_path => $self->o('base_path'), + release => $self->o('release'), + }; +} + +# override the default method, to force an automatic loading of the registry in all workers +sub beekeeper_extra_cmdline_options { + my $self = shift; + return "-reg_conf ".$self->o("registry"); +} + +sub resource_classes { + my $self = shift; + return { + #Max memory consumed in a previous run was 1354MB. This gives us some breathing room + 1 => { -desc => 'dump', 'LSF' => '-q normal -M1600000 -R"select[mem>1600] rusage[mem=1600]"'}, + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Pipeline/SpeciesFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/SpeciesFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,164 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 NAME + +Bio::EnsEMBL::Pipeline::SpeciesFactory + +=head1 DESCRIPTION + +A module which generates dump jobs for each species it finds in the Ensembl +Registry. The species we run the code on can be controlled by specifying +the I parameter or by reducing the number of DBAdaptors loaded into +the registry. + +Allowed parameters are: + +=over 8 + +=item species - Can be an array of species to perform dumps for or a single + species name. If specified only jobs will be created for + those species. Defaults to nothing so all species are processed + +item db_types - Specify the types of database to dump. Defaults to core and + should be an array. + +=back + +The code flows once per species to branch 2. + +=cut + +package Bio::EnsEMBL::Pipeline::SpeciesFactory; + +use strict; +use warnings; + +use base qw/Bio::EnsEMBL::Pipeline::Base/; + +use Bio::EnsEMBL::Registry; + +sub param_defaults { + my ($self) = @_; + return { + db_types => [qw/core/], + species => [] + }; +} + +sub fetch_input { + my ($self) = @_; + + $self->reset_empty_array_param('db_types'); + + my $core_dbas = $self->get_DBAdaptors(); + $self->info('Found %d core DBAdaptor(s) to process', scalar(@{$core_dbas})); + $self->param('dbas', $core_dbas); + + my %species_lookup = + map { $_ => 1 } + map { Bio::EnsEMBL::Registry->get_alias($_) } + @{$self->param('species')}; + $self->param('species_lookup', \%species_lookup); + + return; +} + +sub run { + my ($self) = @_; + my @dna; + my @genes; + my @species; + foreach my $dba (@{$self->param('dbas')}) { + if(!$self->process_dba($dba)) { + $self->fine('Skipping %s', $dba->species()); + next; + } + my $input_id = $self->input_id($dba); + push(@species, [ $input_id, 2 ]); + } + $self->param('species', \@species); + return; +} + +sub write_output { + my ($self) = @_; + $self->do_flow('species'); + return; +} + +sub get_DBAdaptors { + my ($self) = @_; + return Bio::EnsEMBL::Registry->get_all_DBAdaptors(-GROUP => 'core'); +} + +sub do_flow { + my ($self, $key) = @_; + my $targets = $self->param($key); + foreach my $entry (@{$targets}) { + my ($input_id, $flow) = @{$entry}; + $self->fine('Flowing %s to %d for %s', $input_id->{species}, $flow, $key); + $self->dataflow_output_id($input_id, $flow); + } + return; +} + +sub process_dba { + my ($self, $dba) = @_; + + #Reject if DB was ancestral sequences + return 0 if $dba->species() =~ /ancestral/i; + + #If species is defined then make sure we only allow those species through + if(@{$self->param('species')}) { + my $lookup = $self->param('species_lookup'); + my $name = $dba->species(); + my $aliases = Bio::EnsEMBL::Registry->get_all_aliases($name); + push(@{$aliases}, $name); + my $found = 0; + foreach my $alias (@{$aliases}) { + if($lookup->{$alias}) { + $found = 1; + last; + } + } + return $found; + } + + #Otherwise just accept + return 1; +} + +sub input_id { + my ($self, $dba, $type) = @_; + my $mc = $dba->get_MetaContainer(); + my $input_id = { + db_types => $self->db_types($dba), + species => $mc->get_production_name(), + }; + return $input_id; +} + +sub db_types { + my ($self, $dba) = @_; + return $self->param('db_types'); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/PredictionExon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/PredictionExon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,266 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::PredictionExon - A class representing an Exon from an ab +initio prediction method + +=head1 SYNOPSIS + + $exon = new Bio::EnsEMBL::PredictionExon( + -START => 100, + -END => 200, + -STRAND => 1, + -SLICE => $slice, + -DBID => $dbID, + -P_VALUE => 23.5, + -SCORE => 99 + ); + + # seq() returns a Bio::Seq + my $seq = $exon->seq->seq(); + + # peptide() only makes sense within transcript context + my $pep = $exon->peptide($transcript)->seq(); + + # Normal feature operations can be performed: + $exon = $exon->transform('clone'); + $exon->move( $new_start, $new_end, $new_strand ); + print $exon->slice->seq_region_name(); + +=head1 DESCRIPTION + +This is a class which represents an prediction exon which is part of a +predcition transcript. See Bio::EnsEMBL:PredictionTranscript + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::PredictionExon; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + + +@ISA = qw(Bio::EnsEMBL::Exon); + + +=head2 new + + Args : see SUPERCLASS Bio::EnsEMBL::Exon + Example : none + Description: create an Exon object + Returntype : Bio::EnsEMBL::PredictionExon + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $class = shift; + + $class = ref $class || $class; + + my $self = $class->SUPER::new( @_ ); + + my ( $p_value, $score ) = + rearrange( [ "P_VALUE", "SCORE" ], @_ ); + + $self->{'p_value'} = $p_value; + $self->{'score'} = $score; + + return $self; +} + + +=head2 score + + Arg [1] : string $newval (optional) + The new value to set the score attribute to + Example : $score = $obj->score() + Description: Getter/Setter for the score attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub score{ + my $self = shift; + $self->{'score'} = shift if(@_); + return $self->{'score'}; +} + + + +=head2 p_value + + Arg [1] : string $newval (optional) + The new value to set the p_value attribute to + Example : $p_value = $obj->p_value() + Description: Getter/Setter for the p_value attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub p_value{ + my $self = shift; + $self->{'p_value'} = shift if(@_); + return $self->{'p_value'}; +} + + +=head2 end_phase + + Arg [1] : (optional) int $end_phase + Example : $end_phase = $feat->end_phase; + Description: Gets/Sets the end phase of the exon. + end_phase = number of bases from the last incomplete codon of + this exon. + Usually, end_phase = (phase + exon_length)%3 + but end_phase could be -1 if the exon is half-coding and its 3 + prime end is UTR. + Returntype : int + Exceptions : warning if end_phase is called without an argument and the + value is not set. + Caller : general + Status : Stable + +=cut + + + +sub end_phase { + my $self = shift; + if( @_ ) { + throw( "End_phase setting not supported" ); + } + return ($self->phase() + $self->length()) % 3; +} + + +=head2 transform + + Arg 1 : String $coordinate_system_name + Arg [2] : String $coordinate_system_version + Description: moves this exon to the given coordinate system. If this exon has + attached supporting evidence, they move as well. + Returntype : Bio::EnsEMBL::Exon + Exceptions : wrong parameters + Caller : general + Status : Stable + +=cut + + +sub transform { + my $self = shift; + + # catch for old style transform calls + if( !@_ || ( ref $_[0] && ($_[0]->isa( "Bio::EnsEMBL::Slice" ) or $_[0]->isa( "Bio::EnsEMBL::LRGSlice" )))) { + throw( "transform needs coordinate systems details now," . + "please use transfer" ); + } + + my $new_exon = Bio::EnsEMBL::Feature::transform( $self, @_ ); + return undef unless $new_exon; + + #dont want to share the same sequence cache + delete $new_exon->{'_seq_cache'}; + + return $new_exon; +} + + + +=head2 transfer + + Arg [1] : Bio::EnsEMBL::Slice $destination_slice + Example : none + Description: Moves this Exon to given target slice coordinates. If Features + are attached they are moved as well. Returns a new exon. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub transfer { + my $self = shift; + + my $new_exon = Bio::EnsEMBL::Feature::transfer( $self, @_ ); + return undef unless $new_exon; + + #dont want to share the same sequence cache + delete $new_exon->{'_seq_cache'}; + + return $new_exon; +} + + +=head2 add_supporting_features + + Description: For compatibility with Bio::EnsEMBL::Exon + Does nothing. + Returntype : none + Status : Stable + +=cut + +sub add_supporting_features { } + + +=head2 get_all_supporting_features + + Description: For compatibility with Bio::EnsEMBL::Exon + Does nothing and returns empty list + Returntype : empty list. + Status : Stable + +=cut + +sub get_all_supporting_features { return []; } + + +=head2 find_supporting_evidence + + Description: For compatibility with Bio::EnsEMBL::Exon + Does nothing. + Returntype : empty list. + Status : Stable + +=cut + +sub find_supporting_evidence { return []; } + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/PredictionTranscript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/PredictionTranscript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,571 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +PredictionTranscript + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Container for single transcript ab initio gene prediction such as +GenScan or SNAP. Is directly storable/retrievable in Ensembl using +PredictionTranscriptAdaptor. + +Creation: + + my $tran = new Bio::EnsEMBL::PredictionTranscript(); + $tran->add_Exon($pred_exon); + + my $tran = + new Bio::EnsEMBL::PredictionTranscript( -EXONS => @pred_exons ); + +Manipulation: + + # Returns an array of PredictionExon objects + my @pred_exons = @{ $tran->get_all_Exons }; + + # Returns the peptide translation as string + my $pep = $tran->translate()->seq(); + + # Get the exon cdna sequence. + my $cdna = $trans->spliced_seq(); + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::PredictionTranscript; + +use vars qw(@ISA); +use strict; + +use Bio::Seq; +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Transcript; +use Bio::EnsEMBL::Translation; + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +@ISA = qw(Bio::EnsEMBL::Transcript); + + +=head2 new + + Arg [-DISPLAY_LABEL] + string - a displayable identifier for this prediction + Arg [...] : See Bio::EnsEMBL::Transcript superclass constructor + Example : $pt = Bio::EnsEMBL::PredictionTranscript->new + ( '-start' => $seq_region_start, + '-end' => $seq_region_end, + '-strand' => $seq_region_strand, + '-adaptor' => $self, + '-slice' => $slice, + '-analysis' => $analysis, + '-dbID' => $prediction_transcript_id, + '-display_label' => $display_label); + Description: Constructor. Creates a new Bio::EnsEMBL::PredictionTranscript + object + Returntype : Bio::EnsEMBL::PredictionTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + my ($display_label) = rearrange(['DISPLAY_LABEL'], @_); + + $self->{'display_label'} = $display_label; + + return $self; +} + + +=head2 coding_region_start + + Arg [1] : none + Example : $coding_region_start = $pt->coding_region_start + Description: Retrieves the start of the coding region of this transcript in + slice coordinates. For prediction transcripts this + is always the start of the transcript (i.e. there is no UTR). + By convention, the coding_region_start is always lower than + the value returned by the coding_end method. + The value returned by this function is NOT the biological + coding start since on the reverse strand the biological coding + start would be the higher genomic value. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coding_region_start { + my $self = shift; + return $self->start(); +} + + +=head2 coding_region_end + + Arg [1] : none + Example : $coding_region_end = $transcript->coding_region_end + Description: Retrieves the start of the coding region of this prediction + transcript. For prediction transcripts this is always the same + as the end since no UTRs are stored. + By convention, the coding_region_end is always higher than the + value returned by the coding_region_start method. + The value returned by this function is NOT the biological + coding start since on the reverse strand the biological coding + end would be the lower genomic value. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coding_region_end { + my $self = shift; + return $self->end(); +} + + + +=head2 get_all_translateable_Exons + + Arg [1] : none + Example : $exons = $self->get_all_translateable_Exons + Description: Retrieves the translateable portion of all exons in this + transcript. For prediction transcripts this means all exons + since no UTRs are stored for them. + Returntype : listref of Bio::EnsEMBL::PredictionExons + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_translateable_Exons { + my $self = shift; + return $self->get_all_Exons(); +} + + +=head2 display_label + + Arg [1] : string $newval (optional) + The new value to set the display_label attribute to + Example : $display_label = $pt->display_label() + Description: Getter/Setter for a displayable identifier for this + prediction transcript. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_label{ + my $self = shift; + return $self->{'display_label'} = shift if(@_); + return $self->{'display_label'}; +} + + + +=head2 stable_id + + Arg [1] : none + Example : print $pt->stable_id(); + Description: Gets a 'stable' identifier for this prediction transcript. Note + that prediction transcripts do not have true *stable* + identifiers (i.e. identifiers maintained between releases). + This method chains to the display_label method and is intended + to provide polymorphism with the Transcript class. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stable_id { return display_label(@_); } + +sub get_all_DBEntries { return []; } + +sub get_all_DBLinks { return []; } + +sub add_DBEntry {} + +sub external_db { return undef; } + +sub external_status { return undef; } + +sub external_name { return undef; } + +sub is_known { return 0;} + + +=head2 translation + + Arg [1] : none + Example : $translation = $pt->translation(); + Description: Retrieves a Bio::EnsEMBL::Translation object for this prediction + transcript. Note that this translation is generated on the fly + and is not stored in the database. The translation always + spans the entire transcript (no UTRs; all CDS) and does not + have an associated dbID, stable_id or adaptor. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub translation { + my $self = shift; + + #calculate translation on the fly + my $strand = $self->strand(); + + my $start_exon; + my $end_exon; + + my @exons = @{$self->get_all_Exons()}; + + return undef if(!@exons); + + $start_exon = $exons[0]; + $end_exon = $exons[-1]; + + my $pta; + + if($self->adaptor()) { + $pta = $self->adaptor()->db()->get_TranslationAdaptor(); + } else { + #warning("PredictionTranscript has no adaptor, may not be able to obtain " . + # "translation"); + } + + my $Xseq = $self->spliced_seq(); + my $start_phase = $start_exon->phase; + if( $start_phase > 0 ) { + $Xseq = "N"x$start_phase . $Xseq; + } + + my $tmpSeq = new Bio::Seq( -id => $self->display_id, + -seq => $Xseq, + -moltype => 'dna', + -alphabet => 'dna' ); + + return Bio::EnsEMBL::Translation->new + (-ADAPTOR => $pta, + -START_EXON => $start_exon, + -END_EXON => $end_exon, + -SEQ_START => 1, + -SEQ_END => $end_exon->length(), + -SEQ => $tmpSeq->translate()->seq()); +} + + + +=head2 translate + + Args : none + Function : Give a peptide translation of all exons currently in + the PT. Gives empty string when none is in. + Returntype: a Bio::Seq as in transcript->translate() + Exceptions: none + Caller : general + Status : Stable + +=cut + + +sub translate { + my ($self) = @_; + + my $dna = $self->translateable_seq(); + + my $codon_table_id; + if ( defined( $self->slice() ) ) { + my $attrib; + + ($attrib) = @{ $self->slice()->get_all_Attributes('codon_table') }; + if ( defined($attrib) ) { + $codon_table_id = $attrib->value(); + } + } + $codon_table_id ||= 1; #default will be vertebrates + + if( CORE::length( $dna ) % 3 == 0 ) { + # $dna =~ s/TAG$|TGA$|TAA$//i; + my $codon_table = Bio::Tools::CodonTable->new( -id => $codon_table_id ); + + if ( $codon_table->is_ter_codon( substr( $dna, -3, 3 ) ) ) { + substr( $dna, -3, 3, '' ); + } + } + # the above line will remove the final stop codon from the mrna + # sequence produced if it is present, this is so any peptide produced + # won't have a terminal stop codon + # if you want to have a terminal stop codon either comment this line out + # or call translatable seq directly and produce a translation from it + + my $bioseq = new Bio::Seq( -id => $self->display_id, + -seq => $dna, + -moltype => 'dna', + -alphabet => 'dna' ); + + my $translation = $bioseq->translate(undef,undef,undef,$codon_table_id); + + return $translation; +} + + +=head2 cdna_coding_start + + Arg [1] : none + Example : $relative_coding_start = $transcript->cdna_coding_start(); + Description: Retrieves the position of the coding start of this transcript + in cdna coordinates (relative to the start of the 5prime end of + the transcript, excluding introns, including utrs). This is + always 1 for prediction transcripts because they have no UTRs. + Returntype : int + Exceptions : none + Caller : five_prime_utr, get_all_snps, general + Status : Stable + +=cut + +sub cdna_coding_start { return 1 } + + + +=head2 cdna_coding_end + + Arg [1] : none + Example : $relative_coding_start = $transcript->cdna_coding_end(); + Description: Retrieves the position of the coding end of this transcript + in cdna coordinates (relative to the start of the 5prime end of + the transcript, excluding introns, including utrs). This is + always te length of the cdna for prediction transcripts because + they have no UTRs. + Returntype : int + Exceptions : none + Caller : five_prime_utr, get_all_snps, general + Status : Stable + +=cut + +sub cdna_coding_end { + my ($self) = @_; + return length( $self->spliced_seq() ); +} + + +=head2 transform + + Arg 1 : String $coordinate_system_name + Arg [2] : String $coordinate_system_version + Example : $ptrans = $ptrans->transform('chromosome', 'NCBI33'); + $ptrans = $ptrans->transform('clone'); + Description: Moves this PredictionTranscript to the given coordinate system. + If this Transcript has Exons attached, they move as well. + A new Transcript is returned or undefined if this PT is not + defined in the new coordinate system. + Returntype : Bio::EnsEMBL::PredictionTranscript + Exceptions : wrong parameters + Caller : general + Status : Stable + +=cut + +sub transform { + my $self = shift; + + # catch for old style transform calls + if( ref $_[0] && ($_[0]->isa( "Bio::EnsEMBL::Slice" ) or $_[0]->isa( "Bio::EnsEMBL::LRGSlice" ))) { + throw("transform needs coordinate systems details now," . + "please use transfer"); + } + + my $new_transcript = Bio::EnsEMBL::Feature::transform($self, @_ ); + return undef unless $new_transcript; + + #go through the _trans_exon_array so as not to prompt lazy-loading + if(exists($self->{'_trans_exon_array'})) { + my @new_exons; + foreach my $old_exon ( @{$self->{'_trans_exon_array'}} ) { + my $new_exon = $old_exon->transform(@_); + push(@new_exons, $new_exon); + } + $new_transcript->{'_trans_exon_array'} = \@new_exons; + } + + return $new_transcript; +} + + + +=head2 transfer + + Arg 1 : Bio::EnsEMBL::Slice $destination_slice + Example : $ptrans = $ptrans->transfer($slice); + Description: Moves this PredictionTranscript to the given slice. + If this Transcripts has Exons attached, they move as well. + If this transcript cannot be moved then undef is returned + instead. + Returntype : Bio::EnsEMBL::PredictionTranscript + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub transfer { + my $self = shift; + + my $new_transcript = $self->SUPER::transfer( @_ ); + return undef unless $new_transcript; + + if( exists $self->{'_trans_exon_array'} ) { + my @new_exons; + for my $old_exon ( @{$self->{'_trans_exon_array'}} ) { + my $new_exon = $old_exon->transfer( @_ ); + push( @new_exons, $new_exon ); + } + + $new_transcript->{'_trans_exon_array'} = \@new_exons; + } + + return $new_transcript; +} + +=head2 get_all_Exons + + Arg [1] : none + Example : my @exons = @{$transcript->get_all_Exons()}; + Description: Returns an listref of the exons in this transcipr in order. + i.e. the first exon in the listref is the 5prime most exon in + the transcript. + Returntype : a list reference to Bio::EnsEMBL::Exon objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Exons { + my ($self) = @_; + if( ! defined $self->{'_trans_exon_array'} && defined $self->adaptor() ) { + $self->{'_trans_exon_array'} = $self->adaptor()->db()-> + get_PredictionExonAdaptor()->fetch_all_by_PredictionTranscript( $self ); + } + return $self->{'_trans_exon_array'}; +} + +=head2 display_id + + Arg [1] : none + Example : print $rf->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For prediction transcripts this is + (depending on availability and in this order) the stable Id, the + dbID or an empty string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->stable_id || $self->dbID || ''; +} + +=head2 get_all_Attributes + + Arg [1] : none + Example : + Description: DOES NOTHING, Returns empty listref. Provided here to prevent + Transcript attributes being returned for PredictionTranscripts. + Returntype : EMPTY listref Bio::EnsEMBL::Attribute + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub get_all_Attributes { + my $self = shift; + + return []; +} + + + +=head2 get_exon_count + + Description: DEPRECATED - use get_all_Exons instead + +=cut + +sub get_exon_count { + my $self = shift; + deprecate('Use scalar(@{$transcript->get_all_Exon()s}) instead'); + return scalar( @{$self->get_all_Exons} ); +} + + +=head2 set_exon_count + + Description: DEPRECATED - this method does nothing now + +=cut + +sub set_exon_count { + deprecate('This method no longer does anything.'); +} + + + +=head2 get_cdna + + Description : DEPRECATED - use spliced_seq() or translateable_seq instead + +=cut + +sub get_cdna { + my $self = shift; + deprecate('use spliced_seq instead'); + return $self->spliced_seq(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/ProjectionSegment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/ProjectionSegment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,120 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::ProjectionSegment - part of the list that is returned from +project function calls + +=head1 SYNOPSIS + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + my $projection = $slice->project("clone"); + + foreach my $projection_segment (@$projection) { + print( " from_start ", $projection_segment->from_start(), "\n" ); + print( " from_end ", $projection_segment->from_end(), "\n" ); + print( " to_Slice ", + $projection_segment->to_Slice()->name(), "\n" ); + } + +=head1 DESCRIPTION + +The ProjectionSegment is a helper object to make the arrays returned by +project more accessible. Instead of writing $segment->[0], $segment->[1] +or $segment->[2] its possible to use the more descriptive notation of +$segment->from_start(), $segement->from_end(), $segment->to_Slice(). + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::ProjectionSegment; + +# +# WARNING: THIS CLASS IS REPRESENTED BY A BLESSED ARRAY REFERENCE +# NOT A HASH REFERENCE +# + + + + + +=head2 from_start + + Args : none + Example : $coord_in_fetaure_start = $segment->from_start() + Description: First element in projects returned segment lists + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub from_start { + my $self = shift; + return $self->[0]; +} + + + +=head2 from_end + + Args : none + Example : $coord_in_feature_end = $segment->from_end() + Description: Second element in projects returned segment lists + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub from_end { + my $self = shift; + return $self->[1]; +} + + + + +=head2 to_Slice + + Args : none + Example : $target_slice = $segment->to_Slice() + Description: Third element in projects returned segment lists + Returntype : Bio::EnsEMBL::Slice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub to_Slice { + my $self = shift; + return $self->[2]; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/ProteinFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/ProteinFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,175 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::ProteinFeature + +=head1 SYNOPSIS + + my $feature = Bio::EnsEMBL::ProteinFeature->new( + -start => $start, + -end => $end, + -hstart => $hit_start, + -hend => $hit_end, + -hseqname => $hit_name + ); + +=head1 DESCRIPTION + +ProteinFeature objects represent domains or other features of interest +on a peptide sequence. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::ProteinFeature; + +use strict; + +use Bio::EnsEMBL::FeaturePair; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::FeaturePair); + + + +=head2 new + + Arg [IDESC] : (optional) string An interpro description + Arg [INTERPRO_AC] : (optional) string An interpro accession + Arg [TRANSLATION_ID] : (optional) integer A translation dbID + Arg [...] : named arguments to FeaturePair superclass + Example : + + $pf = + Bio::EnsEMBL::ProteinFeature->new( -IDESC => $idesc, + -INTERPRO_AC => $iac, + @fp_args ); + + Description: Instantiates a Bio::EnsEMBL::ProteinFeature + Returntype : Bio::EnsEMBL::FeaturePair + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $proto = shift; + + my $class = ref($proto) || $proto; + + my ( $idesc, $interpro_ac, $translation_id ) = + rearrange( [ 'IDESC', 'INTERPRO_AC', 'TRANSLATION_ID' ], @_ ); + + my $self = $class->SUPER::new(@_); + + # the strand of protein features is always 0 + $self->{'strand'} = 0; + $self->{'idesc'} = $idesc || ''; + $self->{'interpro_ac'} = $interpro_ac || ''; + $self->{'translation_id'} = $translation_id || ''; + + return $self; +} + + +=head2 strand + + Arg [1] : Ignored + Description: Overwrites Bio::EnsEMBL::Feature->strand to not allow + : the strand to be set. + Returntype : int + Status : Stable + +=cut + +#do not allow the strand to be set +sub strand { + my $self = shift; + return $self->{'strand'}; +} + + + +=head2 idesc + + Arg [1] : (optional) string The interpro description + Example : print $protein_feature->idesc(); + Description: Getter/Setter for the interpro description of this protein + feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub idesc{ + my $self = shift; + $self->{'idesc'} = shift if(@_); + return $self->{'idesc'}; +} + + + +=head2 interpro_ac + + Arg [1] : (optional) string The interpro accession + Example : print $protein_feature->interpro_ac(); + Description: Getter/Setter for the interpro accession of this protein + feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub interpro_ac{ + my $self = shift; + $self->{'interpro_ac'} = shift if(@_); + return $self->{'interpro_ac'}; +} + + +=head2 translation_id + + Arg [1] : (optional) integer The dbID of the translation + Example : print $protein_feature->translation_id(); + Description: Getter/Setter for the translation dbID of this protein + feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub translation_id { + my $self = shift; + $self->{'translation_id'} = shift if (@_); + return $self->{'translation_id'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Registry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Registry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2832 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Registry + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + my $registry = 'Bio::EnsEMBL::Registry'; + + $registry->load_all("configuration_file"); + + $gene_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Gene' ); + +=head1 DESCRIPTION + +All Adaptors are stored/registered using this module. This module should +then be used to get the adaptors needed. + +The registry can be loaded from a configuration file using the load_all +method. + +If a filename is passed to load_all then this is used. Else if the +environment variable ENSEMBL_REGISTRY is set to the name on an existing +configuration file, then this is used. Else if the file .ensembl_init +in your home directory exist, it is used. + +For the Web server ENSEMBL_REGISTRY should be set in SiteDefs.pm. This +will then be passed on to load_all. + + +The registry can also be loaded via the method load_registry_from_db +which given a database host will load the latest versions of the Ensembl +databases from it. + +The four types of registries are for db adaptors, dba adaptors, dna +adaptors and the standard type. + +=head2 db + +These are registries for backwards compatibility and enable the +subroutines to add other adaptors to connections. + +e.g. get_all_db_adaptors, get_db_adaptor, add_db_adaptor, +remove_db_adaptor are the old DBAdaptor subroutines which are now +redirected to the Registry. + +So if before we had + + my $sfa = $self->adaptor()->db()->get_db_adaptor('blast'); + +We now want to change this to + + my $sfa = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "blast" ); + + +=head2 DBA + +These are the stores for the DBAdaptors + +The Registry will create all the DBConnections needed now if you set up +the configuration correctly. So instead of the old commands like + + my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + my $exon_adaptor = $db->get_ExonAdaptor; + +we should now have just + + my $exon_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "exon" ); + + +=head2 DNA + +This is an internal Registry and allows the configuration of a dnadb. +An example here is to set the est database to get its dna data from the +core database. + + ## set the est db to use the core for getting dna data. + # Bio::EnsEMBL::Utils::ConfigRegistry->dnadb_add( "Homo Sapiens", + # "core", "Homo Sapiens", "est" ); + + +=head2 adaptors + +This is the registry for all the general types of adaptors like +GeneAdaptor, ExonAdaptor, Slice Adaptor etc. + +These are accessed by the get_adaptor subroutine i.e. + + my $exon_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "exon" ); + +=head1 METHODS + +=cut + + + +package Bio::EnsEMBL::Registry; +use strict; +use warnings; + +our $NEW_EVAL = 0; + +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::ConfigRegistry; +use Bio::EnsEMBL::ApiVersion; +use Bio::EnsEMBL::Utils::URI qw/parse_uri/; + +use DBI qw(:sql_types); + +use vars qw(%registry_register); + +# This is a map from group names to Ensembl DB adaptors. Used by +# load_all() and reset_DBAdaptor(). +my %group2adaptor = ( + 'blast' => 'Bio::EnsEMBL::External::BlastAdaptor', + 'compara' => 'Bio::EnsEMBL::Compara::DBSQL::DBAdaptor', + 'core' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + 'estgene' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + 'funcgen' => 'Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor', + 'regulation' => 'Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor', + 'haplotype' => 'Bio::EnsEMBL::ExternalData::Haplotype::DBAdaptor', + 'hive' => 'Bio::EnsEMBL::Hive::DBSQL::DBAdaptor', + 'ontology' => 'Bio::EnsEMBL::DBSQL::OntologyDBAdaptor', + 'otherfeatures' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + 'pipeline' => 'Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor', + 'snp' => 'Bio::EnsEMBL::ExternalData::SNPSQL::DBAdaptor', + 'stable_ids' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + 'variation' => 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor', + 'vega' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + 'vega_update' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', +); + + +=head2 load_all + + Will load the registry with the configuration file which is + obtained from the first in the following and in that order. + + 1) If an argument is passed to this method, this is used as the + name of the configuration file to read. + + 2) If the environment variable ENSEMBL_REGISTRY is set, this is + used as the name of the configuration file to read. + + 3) If the file .ensembl_init exist in the home directory, it is + used as the configuration file. + + Arg [1] : (optional) string + Name of file to load the registry from. + + Arg [2] : (optional) integer + If not 0, will print out all information. + + Arg [3] : (optional) integer + If not 0, the database connection will not be + cleared, if 0 or if not set the database connections + will be cleared (this is the default). + + Arg [4]: (optional) boolean + This option will turn off caching for slice features, + so, every time a set of features is retrieved, + they will come from the database instead of the + cache. This option is only recommended for advanced + users, specially if you need to store and retrieve + features. It might reduce performance when querying + the database if not used properly. If in doubt, do + not use it or ask in the developer mailing list. + + Example : Bio::EnsEMBL::Registry->load_all(); + Returntype : Int count of the DBAdaptor instances which can be found in the + registry due to this method being called. Will never be negative + Exceptions : none + Status : Stable + +=cut + +sub load_all { + my ($class, $config_file, $verbose, $no_clear, $no_cache ) = @_; + + if ( !defined($config_file) ) { + if ( defined( $ENV{ENSEMBL_REGISTRY} ) ) { + $config_file = $ENV{ENSEMBL_REGISTRY}; + } elsif ( defined( $ENV{HOME} ) ) { + $config_file = $ENV{HOME} . "/.ensembl_init"; + } + } + + $verbose ||= 0; + $no_clear ||= 0; + $no_cache ||= 0; + + my $original_count = $class->get_DBAdaptor_count(); + + if ( !defined($config_file) ) { + if ($verbose) { + print( STDERR + "No default registry configuration to load.\n" ); + } + } elsif ( !-e $config_file ) { + if ($verbose) { + printf( STDERR "Configuration file '%s' does not exist. " + . "Registry configuration not loaded.\n", + $config_file ); + } + } else { + if ( defined( $registry_register{'seen'} ) ) { + if ( !$no_clear ) { + if ($verbose) { + print( STDERR "Clearing previously loaded " + . "registry configuration\n" ); + } + $class->clear(); + } + } + $registry_register{'seen'} = 1; + + if ($verbose) { + printf( STDERR + "Loading registry configuration from '%s'.\n", + $config_file ); + } + + my $cfg; + + my $test_eval = eval { require Config::IniFiles }; + + if ($@ or (!$test_eval)) { + # The user does not have the 'Config::IniFiles' module. + if ($verbose) { + print( STDERR "No Config::IniFiles module found, " + . "assuming this is not an ini-file\n" ); + } + # If the configuration file *is* an ini-file, we can expect a + # load of compilation errors from the next eval... + } else { + # The user has the 'Config::IniFiles' module installed. See + # if this is an ini-file or not... + $cfg = Config::IniFiles->new( -file => $config_file ); + } + + if ( defined $cfg ) { + my %default_adaptor_args = (); + + if ( $cfg->SectionExists('default') ) { + # The 'default' section is special. It contain default + # values that should be implicit to all other section in + # this configuration file. Aliases are added if there + # is also a 'species' setting. + + my $alias = $cfg->val( 'default', 'alias' ); + $cfg->delval( 'default', 'alias' ); + + my $species = $cfg->val( 'default', 'species' ); + + if ( defined($alias) && defined($species) ) { + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => $species, + -alias => [ split( /\n/, $alias ) ] + ); + } + + %default_adaptor_args = + map { '-' . $_ => $cfg->val( 'default', $_ ) } + $cfg->Parameters('default'); + } + + foreach my $section ( $cfg->Sections() ) { + if ( $section eq 'default' ) + { # We have already done the 'default' section. + next; + } + + my $group = $cfg->val( $section, 'group' ) + || $cfg->val( 'default', 'group' ); + + if ( !defined($group) ) { + printf( STDERR "Key 'group' is undefined " + . "for configuration section '%s', " + . "skipping this section.\n", + $section ); + next; + } + + my $adaptor = $group2adaptor{ lc($group) }; + if ( !defined($adaptor) ) { + printf( STDERR "Unknown group '%s' " + . "for configuration section '%s', " + . "skipping this section.\n", + $group, $section ); + next; + } + + # Handle aliases. A section must have both an 'alias' + # setting and a 'species' setting for aliases to be + # added. The 'species' setting might be inherited from + # the 'default' section. + + my $alias = $cfg->val( $section, 'alias' ); + $cfg->delval( $section, 'alias' ); + + my $species = $cfg->val( $section, 'species' ) + || $cfg->val( 'default', 'species' ); + + if ( defined($alias) && defined($species) ) { + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => $species, + -alias => [ split( /\n/, $alias ) ] + ); + } + + # Fill in the adaptor initialization arguments. + # We trust the user to provide sensible key-value pairs. + my %adaptor_args = %default_adaptor_args; + foreach my $parameter ( $cfg->Parameters($section) ) { + $adaptor_args{ '-' . $parameter } = + $cfg->val( $section, $parameter ); + + # when set, do not use the feature cache in the + # different adaptors + if ($no_cache) { + $adaptor_args{'-no_cache'} = 1; + } + } + + if ($verbose) { + printf( "Configuring adaptor '%s' " + . "for configuration section '%s'...\n", + $adaptor, $section ); + } + + my $test_eval = eval "require $adaptor"; + if ($@ or (!$test_eval)) { die($@) } + + $adaptor->new(%adaptor_args); + + } ## end foreach my $section ( $cfg->Sections... + } else { + # This is probably no ini-file but an old style piece + # of configuration written in Perl. We need to try to + # require() it. + + my $test_eval; + if($NEW_EVAL) { + require Bio::EnsEMBL::Utils::IO; + my $contents = Bio::EnsEMBL::Utils::IO::slurp($config_file); + $test_eval = eval $contents; + } + else { + $test_eval = eval { require($config_file) }; + # To make the web code avoid doing this again we delete first + delete $INC{$config_file}; + } + + #Now raise the exception just in case something above is + #catching this + if ($@ or (!$test_eval)) { die($@) } + + } + } ## end else [ if ( !defined($config_file... + + my $count = $class->get_DBAdaptor_count() - $original_count; + return $count >= 0 ? $count : 0; +} ## end sub load_all + +=head2 clear + + Will clear the registry and disconnect from all databases. + + Example : Bio::EnsEMBL::Registry->clear(); + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub clear{ + my ($self); + + foreach my $dba (@{$registry_register{'_DBA'}}){ + if($dba->dbc->connected){ + $dba->dbc->db_handle->disconnect(); + } + } + %registry_register = (); + return; +} + +# +# db adaptors. (for backwards compatibility) +# + +=head2 add_db + + Arg [1] : db (DBAdaptor) to add adaptor to. + Arg [2] : name of the name to add the adaptor to in the registry. + Arg [3] : The adaptor to be added to the registry. + Example : Bio::EnsEMBL::Registry->add_db($db, "lite", $dba); + Returntype : none + Exceptions : none + Status : At Risk. + : This is here for backwards compatibility only and may + : be removed eventually. Solution is to make sure the + : db and the adaptor have the same species and the call + : is then no longer needed. + +=cut + +sub add_db { + my ( $class, $db, $name, $adap ) = @_; + + if ( lc( $db->species() ) ne lc( $adap->species ) ) { + $registry_register{_SPECIES}{ lc( $db->species() ) } + { lc( $db->group() ) }{'_special'}{ lc($name) } = $adap; + } + return; +} + +=head2 remove_db + + Arg [1] : db (DBAdaptor) to remove adaptor from. + Arg [2] : name to remove the adaptor from in the registry. + Example : my $db = Bio::EnsEMBL::Registry->remove_db($db, "lite"); + Returntype : adaptor + Exceptions : none + Status : At Risk. + : This is here for backwards compatibility only and may + : be removed eventually. Solution is to make sure the + : db and the adaptor have the same species and the call + : is then no longer needed. + +=cut + +sub remove_db { + my ( $class, $db, $name ) = @_; + + my $ret = + $registry_register{_SPECIES}{ lc( $db->species() ) } + { lc( $db->group() ) }{'_special'}{ lc($name) }; + + $registry_register{_SPECIES}{ lc( $db->species() ) } + { lc( $db->group() ) }{'_special'}{ lc($name) } = undef; + + return $ret; +} + +=head2 get_db + + Arg [1] : db (DBAdaptor) to get adaptor from. + Arg [2] : name to get the adaptor for in the registry. + Example : my $db = Bio::EnsEMBL::Registry->get_db("Human", "core", "lite"); + Returntype : adaptor + Exceptions : See get_DBAdaptor() + Status : At Risk. + : This is here for backwards compatibility only and may + : be removed eventually. Solution is to make sure the + : db and the adaptor have the same species then call + : get_DBAdaptor instead. + +=cut + +sub get_db { + my ( $class, $db, $name ) = @_; + + my $ret = Bio::EnsEMBL::Registry->get_DBAdaptor( lc( $db->species ), + lc($name) ); + + if ( defined($ret) ) { return $ret } + + return $registry_register{_SPECIES}{ lc( $db->species() ) } + { lc( $db->group() ) }{'_special'}{ lc($name) }; +} + +=head2 get_all_db_adaptors + + Arg [1] : db (DBAdaptor) to get all the adaptors from. + Example : my $db = Bio::EnsEMBL::Registry->get_all_db_adaptors($db); + Returntype : adaptor + Exceptions : none + Status : At Risk. + : This is here for backwards compatibility only and + : may be removed eventually. Solution is to make + : sure the dbs all have the same species then call + : get_all_DBAdaptors(-species => "human"); + + +=cut + +sub get_all_db_adaptors { + my ( $class, $db ) = @_; + my %ret = (); + + # we now also want to add all the DBAdaptors for the same species. + # as add_db_adaptor does not add if it is from the same species. + + foreach my $dba ( @{ $registry_register{'_DBA'} } ) { + if ( lc( $dba->species() ) eq lc( $db->species() ) ) { + $ret{ $dba->group() } = $dba; + } + } + + foreach my $key ( + keys %{ + $registry_register{_SPECIES} + { $class->get_alias( $db->species() ) }{ lc( $db->group() ) } + {'_special'} } ) + { + $ret{$key} = + $registry_register{_SPECIES} + { $class->get_alias( $db->species() ) }{ lc( $db->group() ) } + {'_special'}{$key}; + } + + return \%ret; +} ## end sub get_all_db_adaptors + + +# +# DBAdaptors +# + +=head2 add_DBAdaptor + + Arg [1] : name of the species to add the adaptor to in the registry. + Arg [2] : name of the group to add the adaptor to in the registry. + Arg [3] : DBAdaptor to be added to the registry. + Example : Bio::EnsEMBL::Registry->add_DBAdaptor("Human", "core", $dba); + Returntype : none + Exceptions : none + caller : internal + Status : Stable + +=cut + +sub add_DBAdaptor { + my ( $class, $species, $group, $adap ) = @_; + + if ( !( $class->alias_exists($species) ) ) { + $class->add_alias( $species, $species ); + } + + $species = $class->get_alias($species); + + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DB'} = $adap; + + if ( !defined( $registry_register{'_DBA'} ) ) { + $registry_register{'_DBA'} = [$adap]; + } else { + push( @{ $registry_register{'_DBA'} }, $adap ); + } + return; +} + + + +=head2 get_DBAdaptor + + Arg [1] : name of the species to get the adaptor for in the registry. + Arg [2] : name of the group to get the adaptor for in the registry. + Arg [3] : if set will not give warnings when looking for alias. + Example : $dba = Bio::EnsEMBL::Registry->get_DBAdaptor("Human", "core"); + Returntype : DBAdaptor + Exceptions : If $species is not defined and if no valid internal name + could be found for $species. If thrown check your API and DB + version + Status : Stable + +=cut + +sub get_DBAdaptor { + my ( $class, $species, $group, $no_alias_check ) = @_; + + if ( !defined($species) ) { + throw('Species not defined.'); + } + + my $ispecies = $class->get_alias( $species, $no_alias_check ); + + if ( !defined($ispecies) ) { + if(! $no_alias_check) { + throw("Can not find internal name for species '$species'"); + } + } + else { $species = $ispecies } + + return $registry_register{_SPECIES}{$species}{ lc($group) }{'_DB'}; +} + +=head2 get_all_DBAdaptors + + Arg [SPECIES]: (optional) string + species name to get adaptors for + Arg [GROUP] : (optional) string + group name to get adaptors for + Example : + @dba = + @{ Bio::EnsEMBL::Registry->get_all_DBAdaptors() }; + + @human_dbas = + @{ Bio::EnsEMBL::Registry->get_all_DBAdaptors( + -species => 'human' + ) }; + + Returntype : list of DBAdaptors + Exceptions : none + Status : Stable + +=cut + +sub get_all_DBAdaptors { + my ( $class, @args ) = @_; + + my ( $species, $group ) = rearrange( [qw(SPECIES GROUP)], @args ); + + if ( defined($species) ) { $species = $class->get_alias($species) } + + my @ret; + foreach my $dba ( @{ $registry_register{'_DBA'} } ) { + if ( ( !defined($species) || lc($species) eq lc( $dba->species() ) ) + && ( !defined($group) || lc($group) eq lc( $dba->group() ) ) ) + { + push( @ret, $dba ); + } + } + + return \@ret; +} + +=head2 get_all_DBAdaptors_by_connection + + Arg [1] : DBConnection used to find DBAdaptors + Returntype : reference to list of DBAdaptors + Exceptions : none + Example : @dba = @{ Bio::EnsEMBL::Registry + ->get_all_DBAdaptors_by_connection($dbc) }; + Status : Stable + +=cut + +sub get_all_DBAdaptors_by_connection { + my ( $self, $dbc_orig ) = @_; + + my @return; + + foreach my $dba ( @{ $registry_register{'_DBA'} } ) { + my $dbc = $dba->dbc(); + + if ( defined($dbc) + && $dbc->can('equals') + && $dbc->equals($dbc_orig) ) + { + push( @return, $dba ); + } + } + + return \@return; +} + +=head2 get_all_DBAdaptors_by_dbname + + Arg [1] : string, name of database + Returntype : reference to list of DBAdaptors + Exceptions : none + Example : @dba = @{ Bio::EnsEMBL::Registry + ->get_all_DBAdaptors_by_dbname($dbname) }; + Status : Stable + +=cut + +sub get_all_DBAdaptors_by_dbname { + my ( $self, $dbname ) = @_; + + my @return; + + foreach my $dba ( @{ $registry_register{'_DBA'} } ) { + my $dbc = $dba->dbc(); + + if ( defined($dbc) && $dbc->dbname() eq $dbname ) { + push( @return, $dba ); + } + } + + return \@return; +} + +=head2 remove_DBAdaptor + + Arg [1] : name of the species to get the adaptor for in the registry. + Arg [2] : name of the group to get the adaptor for in the registry. + Example : $dba = Bio::EnsEMBL::Registry->remove_DBAdaptor("Human", "core"); + Returntype : none + Exceptions : none + Status : At risk + +=cut + +sub remove_DBAdaptor { + my ( $class, $species, $group ) = @_; + + $species = $class->get_alias($species); + + delete $registry_register{_SPECIES}{$species}{$group}; + # This will remove the DBAdaptor and all the other adaptors + + # Now remove if from the _DBA array + my $index; + + foreach my $i ( 0 .. $#{ $registry_register{'_DBA'} } ) { + my $dba = $registry_register{'_DBA'}->[$i]; + + if ( ( $dba->species eq $species ) + && $dba->group eq $group ) + { + $index = $i; + last; + } + } + + # Now remove from _DBA cache + if ( defined($index) ) { + splice( @{ $registry_register{'_DBA'} }, $index, 1 ); + } + + return; +} ## end sub remove_DBAdaptor + + + +=head2 reset_DBAdaptor + + Arg [1]: string - species e.g. homo_sapiens + Arg [2]: string - DB group e.g. core + Arg [3]: string - new dbname + Args [4-7]: string - optional DB parameters, defaults to current db params if omitted + Arg [8]: hashref - Hash ref of additional parameters e.g. eFG dnadb params for auto selecting dnadb + Usage : $reg->reset_registry_db( 'homo_sapiens', 'core', + 'homo_sapiens_core_37_35j' ); + Description: Resets a DB within the registry. + Exceptions: Throws if mandatory params not supplied + Throws if species name is not already seen by the registry + Throws if no current DB for species/group available + Status : At risk + +=cut + +sub reset_DBAdaptor { + my ( + $self, $species, $group, $dbname, $host, + $port, $user, $pass, $params + ) = @_; + + # Check mandatory params + if ( !( defined $species && defined $group && defined $dbname ) ) { + throw( + 'Must provide at least a species, group, and dbname parameter ' + . 'to redefine a DB in the registry' ); + } + + # Validate species here + my $alias = $self->get_alias($species); + throw("Could not find registry alias for species:\t$species") + if ( !defined $alias ); + + # Get all current defaults if not defined + + my $db = $self->get_DBAdaptor( $alias, $group ); + my $class; + + if ($db) { + $class = ref($db); + $host ||= $db->dbc->host; + $port ||= $db->dbc->port; + $user ||= $db->dbc->username; + $pass ||= $db->dbc->password; + } else { + #Now we need to test mandatory params + $class = $group2adaptor{ lc($group) }; + + if ( !( $host && $user ) ) { + throw("No comparable $alias $group DB present in Registry. " + . "You must pass at least a dbhost and dbuser" ); + } + } + + $self->remove_DBAdaptor( $alias, $group ); + + # ConfigRegistry should automatically add this to the Registry + $db = $class->new( + -user => $user, + -host => $host, + -port => $port, + -pass => $pass, + -dbname => $dbname, + -species => $alias, + -group => $group, + %{$params} ); + + return $db; +} ## end sub reset_DBAdaptor + + +# +# DNA Adaptors +# + +=head2 add_DNAAdaptor + + Arg [1] : name of the species to add the adaptor to in the registry. + Arg [2] : name of the group to add the adaptor to in the registry. + Arg [3] : name of the species to get the dna from + Arg [4] : name of the group to get the dna from + Example : Bio::EnsEMBL::Registry->add_DNAAdaptor("Human", "estgene", "Human", "core"); + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub add_DNAAdaptor { + my ( $class, $species, $group, $dnadb_species, $dnadb_group ) = @_; + + $species = $class->get_alias($species); + $dnadb_species = $class->get_alias($dnadb_species); + if ( $dnadb_group->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') ) { + deprecated(""); + } else { + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DNA'} = + $dnadb_group; + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DNA2'} = + $dnadb_species; + } + return; +} + +=head2 get_DNAAdaptor + + Arg [1] : name of the species to get the adaptor for in the registry. + Arg [2] : name of the group to get the adaptor for in the registry. + Example : $dnaAdap = Bio::EnsEMBL::Registry->get_DNAAdaptor("Human", "core"); + Returntype : adaptor + Exceptions : none + Status : Stable + +=cut + +sub get_DNAAdaptor { + my ( $class, $species, $group ) = @_; + + $species = $class->get_alias($species); + my $new_group = + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DNA'}; + my $new_species = + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DNA2'}; + + if ( defined $new_group ) { + return $class->get_DBAdaptor( $new_species, $new_group ); + } + + return; +} + +# +# General Adaptors +# + +=head2 add_adaptor + + Arg [1] : name of the species to add the adaptor to in the registry. + Arg [2] : name of the group to add the adaptor to in the registry. + Arg [3] : name of the type to add the adaptor to in the registry. + Arg [4] : The DBAdaptor to be added to the registry. + Arg [5] : (optional) Set to allow overwrites of existing adaptors. + Example : Bio::EnsEMBL::Registry->add_adaptor("Human", "core", "Gene", $adap); + Returntype : none + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub add_adaptor { + my ( $class, $species, $group, $type, $adap, $reset ) = @_; + + $species = $class->get_alias($species); + + # Since the adaptors are not stored initially, only their class paths + # when the adaptors are obtained, we need to store these instead. It + # is not necessarily an error if the registry is overwritten without + # the reset set but it is an indication that we are overwriting a + # database which should be a warning for now + + if ( defined($reset) ) + { # JUST RESET THE HASH VALUE NO MORE PROCESSING NEEDED + $registry_register{_SPECIES}{$species}{ lc($group) }{ lc($type) } = + $adap; + return; + } + + if ( + defined( + $registry_register{_SPECIES}{$species}{ lc($group) }{ lc($type) } + ) ) + { + # print STDERR ( + # "Overwriting Adaptor in Registry for $species $group $type\n"); + $registry_register{_SPECIES}{$species}{ lc($group) }{ lc($type) } = + $adap; + return; + } + $registry_register{_SPECIES}{$species}{ lc($group) }{ lc($type) } = + $adap; + + if ( !defined( $registry_register{_SPECIES}{$species}{'list'} ) ) { + $registry_register{_SPECIES}{$species}{'list'} = [$type]; + } else { + push( @{ $registry_register{_SPECIES}{$species}{'list'} }, $type ); + } + + if ( !defined( $registry_register{_TYPE}{ lc($type) }{$species} ) ) { + $registry_register{_TYPE}{ lc($type) }{$species} = [$type]; + } else { + push( @{ $registry_register{_TYPE}{ lc($type) }{$species} }, + $adap ); + } + return; +} ## end sub add_adaptor + + +=head2 get_adaptor + + Arg [1] : name of the species to add the adaptor to in the registry. + Arg [2] : name of the group to add the adaptor to in the registry. + Arg [3] : name of the type to add the adaptor to in the registry. + Example : $adap = Bio::EnsEMBL::Registry->get_adaptor("Human", "core", "Gene"); + Returntype : adaptor + Exceptions : Thrown if a valid internal name cannot be found for the given + name. If thrown check your API and DB version. Also thrown if + no type or group was given + Status : Stable + +=cut + +sub get_adaptor { + my ( $class, $species, $group, $type ) = @_; + + my $ispecies = $class->get_alias($species); + + if ( !defined($ispecies) ) { + throw("Can not find internal name for species '$species'"); + } + else { $species = $ispecies } + + throw 'No adaptor group given' if ! defined $group; + throw 'No adaptor type given' if ! defined $type; + + + if($type =~ /Adaptor$/i) { + warning("Detected additional Adaptor string in given the type '$type'. Removing it to avoid possible issues. Alter your type to stop this message"); + $type =~ s/Adaptor$//i; + } + + my %dnadb_adaptors = ( + 'sequence' => 1, + 'assemblymapper' => 1, + 'karyotypeband' => 1, + 'repeatfeature' => 1, + 'coordsystem' => 1, + 'assemblyexceptionfeature' => 1 + ); + + # warn "$species, $group, $type"; + + $type = lc($type); + + # For historical reasons, allow use of group 'regulation' to refer to + # group 'funcgen'. + if ( lc($group) eq 'regulation' ) { $group = 'funcgen' } + + my $dnadb_group = + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DNA'}; + + if ( defined($dnadb_group) + && defined( $dnadb_adaptors{ lc($type) } ) ) + { + $species = + $registry_register{_SPECIES}{$species}{ lc($group) }{'_DNA2'}; + $group = $dnadb_group; + } + + my $ret = + $registry_register{_SPECIES}{$species}{ lc($group) }{ lc($type) }; + + if ( !defined($ret) ) { return } + if ( ref($ret) ) { return $ret } + + # Not instantiated yet + + my $dba = $registry_register{_SPECIES}{$species}{ lc($group) }{'_DB'}; + my $module = $ret; + + my $test_eval = eval "require $module"; + if ($@ or (!$test_eval)) { + warning("'$module' cannot be found.\nException $@\n"); + return; + } + + if ( + !defined( + $registry_register{_SPECIES}{$species}{ lc($group) }{'CHECKED'} ) + ) + { + $registry_register{_SPECIES}{$species}{ lc($group) }{'CHECKED'} = 1; + $class->version_check($dba); + } + + my $adap = "$module"->new($dba); + Bio::EnsEMBL::Registry->add_adaptor( $species, $group, $type, $adap, + 'reset' ); + $ret = $adap; + + return $ret; +} ## end sub get_adaptor + +=head2 get_all_adaptors + + Arg [SPECIES] : (optional) string + species name to get adaptors for + Arg [GROUP] : (optional) string + group name to get adaptors for + Arg [TYPE] : (optional) string + type to get adaptors for + Example : @adaps = @{Bio::EnsEMBL::Registry->get_all_adaptors()}; + Returntype : ref to list of adaptors + Exceptions : none + Status : Stable + +=cut + +sub get_all_adaptors{ + my ($class,@args)= @_; + my ($species, $group, $type); + my @ret=(); + my (%species_hash, %group_hash, %type_hash); + + + if(@args == 1){ # Old species only one parameter + warn("-SPECIES argument should now be used to get species adaptors"); + $species = $args[0]; + } + else{ + # new style -SPECIES, -GROUP, -TYPE + ($species, $group, $type) = + rearrange([qw(SPECIES GROUP TYPE)], @args); + } + + if(defined($species)){ + $species_hash{$species} = 1; + } + else{ + # get list of species + foreach my $dba (@{$registry_register{'_DBA'}}){ + $species_hash{lc($dba->species())} = 1; + } + } + if(defined($group)){ + $group_hash{$group} = 1; + } + else{ + foreach my $dba (@{$registry_register{'_DBA'}}){ + $group_hash{lc($dba->group())} = 1; + } + } + + if ( defined($type) ) { + $type_hash{$type} = 1; + } else { + foreach my $dba ( @{ $registry_register{'_DBA'} } ) { + foreach my $ty ( + @{ $registry_register{_SPECIES}{ lc( $dba->species ) }{'list'} } + ) + { + $type_hash{ lc($ty) } = 1; + } + } + } + + ### NOW NEED TO INSTANTIATE BY CALLING get_adaptor + foreach my $sp ( keys %species_hash ) { + foreach my $gr ( keys %group_hash ) { + foreach my $ty ( keys %type_hash ) { + my $temp = $class->get_adaptor( $sp, $gr, $ty ); + if ( defined($temp) ) { + push @ret, $temp; + } + } + } + } + + return (\@ret); +} + + +=head2 add_alias + + Arg [1] : name of the species to add alias for + Arg [2] : name of the alias + Example : Bio::EnsEMBL::Registry->add_alias("Homo Sapiens","Human"); + Description: add alternative name for the species. + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub add_alias{ + my ($class, $species,$key) = @_; + + $registry_register{'_ALIAS'}{lc($key)} = lc($species); + return; +} + +=head2 remove_alias + + Arg [1] : name of the species to remove alias for + Arg [2] : name of the alias + Example : Bio::EnsEMBL::Registry->remove_alias("Homo Sapiens","Human"); + Description: remove alternative name for the species. + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub remove_alias{ + my ($class, $species,$key) = @_; + + delete $registry_register{'_ALIAS'}{lc($key)}; + return; +} + + + +=head2 get_alias + + Arg [1] : name of the possible alias to get species for + Example : Bio::EnsEMBL::Registry->get_alias("Human"); + Description: get proper species name. + Returntype : species name + Exceptions : none + Status : Stable + +=cut + +sub get_alias { + my ( $class, $key, $no_warn ) = @_; + + if ( !defined( $registry_register{'_ALIAS'}{ lc($key) } ) ) { + if ( ( !defined( $registry_register{_SPECIES}{ lc($key) } ) ) and + ( !defined( $registry_register{_ALIAS}{ lc($key) } ) ) ) + { + if ( ( !defined($no_warn) ) or ( !$no_warn ) ) { + warning( "$key is not a valid species name " . + "(check DB and API version)" ); + } + return; + } + else { return $key } + } + + return $registry_register{'_ALIAS'}{ lc($key) }; +} + +=head2 get_all_aliases + + Arg [1] : Species name to retrieve aliases for + (may be an alias as well). + Example : Bio::EnsEMBL::Registry->get_all_aliases('Homo sapiens'); + Description: Returns all known aliases for a given species (but not the + species name/alias that was given). + Returntype : ArrayRef of all known aliases + Exceptions : none + Status : Development + +=cut + +sub get_all_aliases { + my ( $class, $key ) = @_; + + my $species = $registry_register{_ALIAS}{ lc($key) }; + + my @aliases; + if ( defined($species) ) { + foreach my $alias ( keys( %{ $registry_register{_ALIAS} } ) ) { + if ( $species ne $alias + && $species eq $registry_register{_ALIAS}{ lc($alias) } ) + { + push( @aliases, $alias ); + } + } + } + + return \@aliases; +} + +=head2 alias_exists + + Arg [1] : name of the possible alias to get species for + Example : Bio::EnsEMBL::Registry->alias_exists("Human"); + Description: does the species name exist. + Returntype : 1 if exists else 0 + Exceptions : none + Status : Stable + +=cut + +sub alias_exists { + my ( $class, $key ) = @_; + + return defined( $registry_register{'_ALIAS'}{ lc($key) } ); +} + +=head2 set_disconnect_when_inactive + + Example : Bio::EnsEMBL::Registry->set_disconnect_when_inactive(); + Description: Set the flag to make sure that the database connection is dropped if + not being used on each database. + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub set_disconnect_when_inactive{ + foreach my $dba ( @{get_all_DBAdaptors()}){ + my $dbc = $dba->dbc; + # Disconnect if connected + $dbc->disconnect_if_idle() if $dbc->connected(); + $dbc->disconnect_when_inactive(1); + } + return; +} + +=head2 set_reconnect_when_lost + + Example : Bio::EnsEMBL::Registry->set_reconnect_when_lost(); + Description: Set the flag to make sure that the database connection is not lost before it's used. + This is useful for long running jobs (over 8hrs). + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub set_reconnect_when_lost{ + foreach my $dba ( @{get_all_DBAdaptors()}){ + my $dbc = $dba->dbc; + $dbc->reconnect_when_lost(1); + } + return; +} + +=head2 disconnect_all + + Example : Bio::EnsEMBL::Registry->disconnect_all(); + Description: disconnect from all the databases. + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub disconnect_all { + foreach my $dba ( @{get_all_DBAdaptors()||[]} ){ + my $dbc = $dba->dbc; + next unless $dbc; + # Disconnect if connected + $dbc->disconnect_if_idle() if $dbc->connected(); + } + return; +} + +=head get_DBAdaptor_count + + Example : Bio::EnsEMBL::Registry->get_DBAdaptor_count(); + Description : Returns the count of database adaptors currently held by + the registry + Returntype : Int count of database adaptors currently known + Exceptions : None + +=cut + +sub get_DBAdaptor_count { + return scalar(@{$registry_register{'_DBA'}}) if(defined $registry_register{'_DBA'}); + return 0; +} + +=head2 change_access + + Will change the username and password for a set of databases. + if host,user or database names are missing then these are not checked. + So for example if you do not specify a database then ALL databases on + the specified host and port will be changed. + + Arg [1] : name of the host to change access on + Arg [2] : port number to change access on + Arg [3] : name of the user to change access on + Arg [4] : name of the database to change access on + Arg [5] : name of the new user + Arg [6] : new password + + Example : Bio::EnsEMBL::Registry->get_alias("Human"); + Description: change username and password on one or more databases + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub change_access{ + my ($self, $host,$port,$user,$dbname,$new_user,$new_pass) = @_; + foreach my $dba ( @{$registry_register{'_DBA'}}){ + my $dbc = $dba->dbc; + if((((!defined($host)) or ($host eq $dbc->host))) and + (((!defined($port)) or ($port eq $dbc->port))) and + (((!defined($user)) or ($user eq $dbc->username))) and + ((!defined($dbname)) or ($dbname eq $dbc->dbname))){ + if($dbc->connected()){ + $dbc->db_handle->disconnect(); + $dbc->connected(undef); + } + # over write the username and password + $dbc->username($new_user); + $dbc->password($new_pass); + } + } + return; +} + + + +=head2 load_registry_from_url + + Arg [1] : string $url + Arg [2] : (optional) integer + If not 0, will print out all information. + Arg [3] : (optional) integer + This option will turn off caching for slice features, so, + every time a set of features is retrieved, they will come + from the database instead of the cache. This option is only + recommended for advanced users, specially if you need to + store and retrieve features. It might reduce performance when + querying the database if not used properly. If in doubt, do + not use it or ask in the developer mailing list. + + Example : load_registry_from_url( + 'mysql://anonymous@ensembldb.ensembl.org:3306'); + + load_registry_from_url( + 'mysql://anonymous@ensembldb.ensembl.org:3306/homo_sapiens_core_65_37?group=core&species=homo_sapiens' + ); + + load_registry_from_url( + 'mysql://anonymous@ensembldb.ensembl.org:3306/homo_sapiens_core_65_37?group=core' + ); + + + Description: Will load the correct versions of the ensembl + databases for the software release it can find on + a database instance into the registry. Also adds + a set of standard aliases. The url format is: + mysql://[[username][:password]@]hostname[:port]. You + can also request a specific version for the databases + by adding a slash and the version number but your + script may crash as the API version won't match the + DB version. + + You can also specify a database name which will cause the + loading of a single DBAdaptor instance. Parameters are + mapped from a normal URL parameter set to their DBAdaptor + equivalent. Group must be defined. + + Returntype : Int count of the DBAdaptor instances which can be found in the + registry + + Exceptions : Thrown if the given URL does not parse according to the above + scheme and if the specified database cannot be connected to + (see L for more information) + Status : Stable + +=cut + +sub load_registry_from_url { + my ( $self, $url, $verbose, $no_cache ) = @_; + + if ( $url =~ /^mysql\:\/\/([^\@]+\@)?([^\:\/]+)(\:\d+)?(\/\d+)?\/?$/x ) { + my $user_pass = $1; + my $host = $2; + my $port = $3; + my $version = $4; + + $user_pass =~ s/\@$//; + my ( $user, $pass ) = $user_pass =~ m/([^\:]+)(\:.+)?/x; + $pass =~ s/^\://x if ($pass); + $port =~ s/^\://x if ($port); + $version =~ s/^\///x if ($version); + + return $self->load_registry_from_db( + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -db_version => $version, + -verbose => $verbose, + -no_cache => $no_cache + ); + } + my $uri = parse_uri($url); + if($uri) { + if($uri->scheme() eq 'mysql') { + my %params = $uri->generate_dbsql_params(); + if($params{-DBNAME}) { + $params{-SPECIES} = $params{-DBNAME} unless $params{-SPECIES}; + $params{-NO_CACHE} = 1 if $no_cache; + my $group = $params{-GROUP}; + my $class = $self->_group_to_adaptor_class($group); + if($verbose) { + printf("Loading database '%s' from group '%s' with DBAdaptor class '%s' from url %s\n", $params{-DBNAME}, $group, $class, $url); + } + $class->new(%params); + return 1; + } + } + } + throw("Only MySQL URLs are accepted. Given URL was '${url}'"); +} ## end sub load_registry_from_url + + +=head2 load_registry_from_db + + Arg [HOST] : string + The domain name of the database host to connect to. + + Arg [USER] : string + The name of the database user to connect with. + + Arg [PASS] : (optional) string + The password to be used to connect to the database. + + Arg [PORT] : (optional) integer + The port to use when connecting to the database. + + Arg [VERBOSE]: (optional) boolean + Whether to print database messages. + + Arg [SPECIES]: (optional) string + By default, all databases that are found on the + server and that corresponds to the correct release + are probed for aliases etc. For some people, + depending on where they are in the world, this might + be a slow operation. With the '-species' argument, + one may reduce the startup time by restricting the + set of databases that are probed to those of a + particular species. + + Note that the latin name of the species is required, + e.g., 'homo sapiens', 'gallus gallus', 'callithrix + jacchus' etc. It may be the whole species name, + or only the first part of the name, e.g. 'homo', + 'gallus', or 'callithrix'. This will be used in + matching against the name of the databases. + + Arg [DB_VERSION]: (optional) integer + By default, only databases corresponding to the + current API version are loaded. This argument + allows the script to use databases from another + version although it might not work properly. This + argument should only be used for production or + testing purposes and if you really know what you are + doing. + + Arg [WAIT_TIMEOUT]: (optional) integer + Time in seconds for the wait timeout to happen. + Time after which the connection is deleted if not + used. By default this is 28800 (8 hours), so set + this to greater than this if your connection are + getting deleted. Only set this if you are having + problems and know what you are doing. + + Arg [-NO_CACHE]: (optional) boolean + This option will turn off caching for slice features, + so, every time a set of features is retrieved, they + will come from the database instead of the cache. This + option is only recommended for advanced users, specially + if you need to store and retrieve features. It might + reduce performance when querying the database if not + used properly. If in doubt, do not use it or ask in the + developer mailing list. + + Arg [SPECIES_SUFFIX]: (optional) string + This option will append the string to the species name + in the registry for all databases found on this server. + + Example : + + $registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous', + -verbose => '1' + ); + + Description: Will load the correct versions of the Ensembl + databases for the software release it can find on a + database instance into the registry. Also adds a set + of standard aliases. + + Returntype : Int count of the DBAdaptor instances which can be found in the + registry due to this method call. + + Exceptions : Thrown if the given MySQL database cannot be connected to + or there is any error whilst querying the database. + Status : Stable + +=cut + +sub load_registry_from_db { + my ( $self, @args ) = @_; + + my ( $host, $port, $user, + $pass, $verbose, $db_version, + $wait_timeout, $no_cache, $species, $species_suffix ) + = rearrange( [ 'HOST', 'PORT', + 'USER', 'PASS', + 'VERBOSE', 'DB_VERSION', + 'WAIT_TIMEOUT', 'NO_CACHE', + 'SPECIES', 'SPECIES_SUFFIX' ], + @args ); + + if ( defined($species) ) { + $species = lc($species); + $species =~ tr/ -/__/; + } + if (!defined($species_suffix)) { + $species_suffix = ""; + } + + my $ontology_db; + my $ontology_version; + + my $stable_ids_db; + my $stable_ids_version; + + $user ||= "ensro"; + if ( !defined($port) ) { + $port = 3306; + if ( $host eq "ensembldb.ensembl.org" ) { + if ( (!defined($db_version)) or ($db_version >= 48) ) { + $port = 5306; + } + } + } + + $wait_timeout ||= 0; + + my $original_count = $self->get_DBAdaptor_count(); + + my $err_pattern = 'Cannot %s to the Ensembl MySQL server at %s:%d; check your settings & DBI error message: %s'; + + my $dbh = DBI->connect( "DBI:mysql:host=$host;port=$port", $user, $pass ) or + throw(sprintf($err_pattern, 'connect', $host, $port, $DBI::errstr)); + $dbh->ping() or + throw(sprintf($err_pattern, 'ping', $host, $port, $DBI::errstr)); + + my $res = $dbh->selectall_arrayref('SHOW DATABASES'); + my @dbnames = map { $_->[0] } @$res; + + my %temp; + my $software_version = software_version(); + + if ( defined($db_version) ) { + $software_version = $db_version; + } + + if ($verbose) { + printf( "Will only load v%d databases\n", $software_version ); + } + + # From the list of all the databses create a tempory hash of those we + # are interested in + + for my $db (@dbnames) { + if ( $db =~ /^(\w+_collection_\w+(?:_\d+)?)_((\d+)_\w+)/ ) + { # NEEDS TO BE FIRST TO PICK UP COLLECTION DBS + if ( $3 eq $software_version ) { + $temp{$1} = $2; + } + } elsif ( $db =~ /^(.+)_(userdata)$/x ) { + $temp{$1} = $2; + } elsif ( + $db =~ /^(ensembl_compara # compara database + (?:_\w+)*?) # optional ensembl genomes bit + _ + (\d+)$/x ) + { # db version + if ( $2 eq $software_version ) { + $temp{$1} = $2; + } + } elsif ( $db =~ /^(ensembl_ancestral(?:_\w+?)*?)_(\d+)$/x ) { + if ( $2 eq $software_version ) { + $temp{$1} = $2; + } + } elsif ( $db =~ /^ensembl(?:genomes)?_ontology_(?:\d+_)?(\d+)/x ) { + if ( $1 eq $software_version ) { + $ontology_db = $db; + $ontology_version = $1; + } + } elsif ( $db =~ /^ensembl(?:genomes)?_stable_ids_(?:\d+_)?(\d+)/x ) { + if ( $1 eq $software_version ) { + $stable_ids_db = $db; + $stable_ids_version = $1; + } + + } elsif ( + $db =~ /^([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)? # species name e.g. homo_sapiens or canis_lupus_familiaris + _ + [a-z]+ # db type + (?:_\d+)?) # optional end bit for ensembl genomes databases + _ + (\d+) # database release + _ + (\w+)$ # assembly number can have letters too e.g 37c + /x + ) + { + + # Species specific databases (core, cdna, vega etc.) + + my ( $sp_name, $db_rel, $assem ) = ( $1, $2, $3 ); + + if ( !defined($species) || $sp_name =~ /^$species/ ) { + if ( $db_rel eq $software_version ) { + $temp{$sp_name} = $db_rel . "_" . $assem; + } + } + + } else { + # warn( sprintf( "Skipping database '%s'\n", $db ) ); + } + } ## end for my $db (@dbnames) + + @dbnames = (); + + foreach my $key ( keys %temp ) { + push @dbnames, $key . "_" . $temp{$key}; + } + + # Register Core like databases + my $core_like_dbs_found = 0; + foreach my $type (qw(core cdna vega vega_update otherfeatures rnaseq)) { + + my @dbs = grep { /^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)? # species name + _ + $type # the database type + _ + (?:\d+_)? # optional end bit for ensembl genomes + \d+ # database release + _ + /x } @dbnames; + + if(@dbs) { + $core_like_dbs_found = 1; + } + + foreach my $database (@dbs) { + if ( index( $database, 'collection' ) != -1 ) { + # Skip multi-species databases. + next; + } + + + my ( $species, $num ) = + ( $database =~ /(^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?) # species name + _ + $type # type + _ + (?:\d+_)? # optional endbit for ensembl genomes + (\d+) # databases release + _ + /x ); + + if(!defined($species)){ + warn "Cannot extract species name from database '$database'"; + } + + my $dba = + Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -group => $type, + -species => $species.$species_suffix, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -dbname => $database, + -wait_timeout => $wait_timeout, + -no_cache => $no_cache ); + + if ($verbose) { + printf( "Species '%s' loaded from database '%s'\n", + $species, $database ); + } + } + } + + # Register multi-species databases + + my @multi_dbs = grep { /^\w+_collection_core_\w+$/ } @dbnames; + + foreach my $multidb (@multi_dbs) { + my $sth = $dbh->prepare( + sprintf( + "SELECT species_id, meta_value FROM %s.meta " + . "WHERE meta_key = 'species.db_name'", + $dbh->quote_identifier($multidb) ) ); + + $sth->execute(); + + my ( $species_id, $species ); + $sth->bind_columns( \( $species_id, $species ) ); + + while ( $sth->fetch() ) { + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -group => "core", + -species => $species.$species_suffix, + -species_id => $species_id, + -multispecies_db => 1, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -dbname => $multidb, + -wait_timeout => $wait_timeout, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "Species '%s' (id:%d) loaded from database '%s'\n", + $species, $species_id, $multidb ); + } + } + } ## end foreach my $multidb (@multi_dbs) + + if(!$core_like_dbs_found && $verbose) { + print("No core-like databases found. Check your DB_VERSION (used '$software_version')\n"); + } + + # User upload DBs + + my @userupload_dbs = grep { /_userdata$/ } @dbnames; + for my $userupload_db (@userupload_dbs) { + if ( index( $userupload_db, 'collection' ) != -1 ) { + # Skip multi-species databases. + next; + } + + my ($species) = ( $userupload_db =~ /(^.+)_userdata$/ ); + my $dba = + Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -group => "userupload", + -species => $species.$species_suffix, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -wait_timeout => $wait_timeout, + -dbname => $userupload_db, + -no_cache => $no_cache ); + + if ($verbose) { + printf( "%s loaded\n", $userupload_db ); + } + } + + # Register multi-species userupload databases. + my @userdata_multidbs = grep { /^.+_collection_userdata$/ } @dbnames; + + foreach my $multidb (@userdata_multidbs) { + my $sth = $dbh->prepare( + sprintf( + "SELECT species_id, meta_value FROM %s.meta " + . "WHERE meta_key = 'species.db_name'", + $dbh->quote_identifier($multidb) ) ); + + $sth->execute(); + + my ( $species_id, $species ); + $sth->bind_columns( \( $species_id, $species ) ); + + while ( $sth->fetch() ) { + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -group => "userupload", + -species => $species.$species_suffix, + -species_id => $species_id, + -multispecies_db => 1, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -dbname => $multidb, + -wait_timeout => $wait_timeout, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "Species '%s' (id:%d) loaded from database '%s'\n", + $species, $species_id, $multidb ); + } + } + } ## end foreach my $multidb (@userdata_multidbs) + + # Variation + + my $test_eval = eval "require Bio::EnsEMBL::Variation::DBSQL::DBAdaptor"; + if ($@or (!$test_eval)) { + # Ignore variations as code required not there for this + if ($verbose) { + print( + "Bio::EnsEMBL::Variation::DBSQL::DBAdaptor module not found " + . "so variation databases will be ignored if found\n" ); + } + } + else { + my @variation_dbs = + grep { /^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?_variation_(?:\d+_)?\d+_/ } @dbnames; + + if(! @variation_dbs && $verbose) { + print("No variation databases found\n"); + } + + for my $variation_db (@variation_dbs) { + + if ( index( $variation_db, 'collection' ) != -1 ) { + # Skip multi-species databases. + next; + } + + my ( $species, $num ) = + ( $variation_db =~ /(^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_variation_(?:\d+_)?(\d+)_/ ); + my $dba = + Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new( + -group => "variation", + -species => $species.$species_suffix, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -wait_timeout => $wait_timeout, + -dbname => $variation_db, + -no_cache => $no_cache ); + + if ($verbose) { + printf( "%s loaded\n", $variation_db ); + } + } + + # Register variation multispecies databases + my @variation_multidbs = + grep { /^\w+_collection_variation_\w+$/ } @dbnames; + + foreach my $multidb (@variation_multidbs) { + my $sth = $dbh->prepare( + sprintf( 'SELECT species_id, meta_value FROM %s.meta ', + $dbh->quote_identifier($multidb) ) + . "WHERE meta_key = 'species.db_name'" + ); + + $sth->execute(); + + my ( $species_id, $species ); + $sth->bind_columns( \( $species_id, $species ) ); + + while ( $sth->fetch() ) { + my $dba = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new( + -group => 'variation', + -species => $species.$species_suffix, + -species_id => $species_id, + -multispecies_db => 1, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -dbname => $multidb, + -wait_timeout => $wait_timeout, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "Species '%s' (id:%d) loaded from database '%s'\n", + $species, $species_id, $multidb ); + } + } + } ## end foreach my $multidb (@variation_multidbs) + } + + my $func_eval = eval "require Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor"; + if ($@ or (!$func_eval)) { + if ($verbose) { + # Ignore funcgen DBs as code required not there for this + print("Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor module not found " + . "so functional genomics databases will be ignored if found\n" + ); + } + } else { + my @funcgen_dbs = + grep { /^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?_funcgen_(?:\d+_)?\d+_/ } @dbnames; + + if(! @funcgen_dbs && $verbose) { + print("No funcgen databases found\n"); + } + + for my $funcgen_db (@funcgen_dbs) { + if ( index( $funcgen_db, 'collection' ) != -1 ) { + # Skip multi-species databases. + next; + } + + my ( $species, $num ) = + ( $funcgen_db =~ /(^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_funcgen_(?:\d+_)?(\d+)_/ ); + my $dba = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + -group => "funcgen", + -species => $species.$species_suffix, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -wait_timeout => $wait_timeout, + -dbname => $funcgen_db, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "%s loaded\n", $funcgen_db ); + } + } + + # Register functional genomics multispecies databases + my @funcgen_multidbs = + grep { /^\w+_collection_funcgen_\w+$/ } @dbnames; + + foreach my $multidb (@funcgen_multidbs) { + my $sth = $dbh->prepare( + sprintf( 'SELECT species_id, meta_value FROM %s.meta ', + $dbh->quote_identifier($multidb) ) + . "WHERE meta_key = 'species.db_name'" + ); + + $sth->execute(); + + my ( $species_id, $species ); + $sth->bind_columns( \( $species_id, $species ) ); + + while ( $sth->fetch() ) { + my $dba = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + -group => 'funcgen', + -species => $species.$species_suffix, + -species_id => $species_id, + -multispecies_db => 1, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -dbname => $multidb, + -wait_timeout => $wait_timeout, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "Species '%s' (id:%d) loaded from database '%s'\n", + $species, $species_id, $multidb ); + } + } + } ## end foreach my $multidb (@funcgen_multidbs) + } ## end else [ if ($@) ] + + # Compara + + my @compara_dbs = grep { /^ensembl_compara/ } @dbnames; + + if (@compara_dbs) { + my $comp_eval = eval "require Bio::EnsEMBL::Compara::DBSQL::DBAdaptor"; + if ($@ or (!$comp_eval)) { + # Ignore Compara as code required not there for this + if ($verbose) { + printf( + "Bio::EnsEMBL::Compara::DBSQL::DBAdaptor " + . "not found so the following compara " + . "databases will be ignored: %s\n", + join( ', ', @compara_dbs ) ); + } + } else { + foreach my $compara_db (@compara_dbs) { + # Looking for EnsEMBL Genomes Comparas. + # ensembl_compara_bacteria_2_53 is registered as + # 'bacteria', ensembl_compara_pan_homology_2_53 is + # registered as 'pan_homology', ensembl_compara_53 is + # registered as 'multi', and the alias 'compara' still + # operates. + + my ($species) = + $compara_db =~ /^ensembl_compara_(\w+)(?:_\d+){2}$/xm; + + $species ||= 'multi'; + + my $dba = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new( + -group => 'compara', + -species => $species.$species_suffix, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -wait_timeout => $wait_timeout, + -dbname => $compara_db, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "%s loaded\n", $compara_db ); + } + } ## end foreach my $compara_db (@compara_dbs) + } ## end else [ if ($@) + } elsif ($verbose) { + print("No Compara databases found\n"); + } + + # Ancestral sequences + + my @ancestral_dbs = + sort grep { /^ensembl_ancestral/ } @dbnames; + + if (@ancestral_dbs) { + my $ancestral_db = shift @ancestral_dbs; + + my $dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -group => 'core', + -species => 'Ancestral sequences'.$species_suffix, + -host => $host, + -user => $user, + -pass => $pass, + -port => $port, + -wait_timeout => $wait_timeout, + -dbname => $ancestral_db, + -no_cache => $no_cache + ); + + if ($verbose) { + printf( "%s loaded\n", $ancestral_db ); + + if (@ancestral_dbs) { + # If we still had some more then report the problem. + printf( + "Multiple ancestral databases found.\n" + . "Ignoring the following: %s\n", + join( ', ', @ancestral_dbs ) ); + } + } + } elsif ($verbose) { + print("No ancestral database found\n"); + } + + # Ontology + + if ( defined($ontology_version) && $ontology_version != 0 ) { + require Bio::EnsEMBL::DBSQL::OntologyDBAdaptor; + + my $dba = + Bio::EnsEMBL::DBSQL::OntologyDBAdaptor->new( + '-species' => 'multi' . $species_suffix, + '-group' => 'ontology', + '-host' => $host, + '-port' => $port, + '-user' => $user, + '-pass' => $pass, + '-dbname' => $ontology_db, ); + + if ($verbose) { + printf( "%s loaded\n", $ontology_db ); + } + } + elsif ($verbose) { + print("No ontology database found\n"); + } + + + if ( defined($stable_ids_db) && $stable_ids_version != 0 ) { + + my $dba = + Bio::EnsEMBL::DBSQL::DBAdaptor->new( + '-species' => 'multi' . $species_suffix, + '-group' => 'stable_ids', + '-host' => $host, + '-port' => $port, + '-user' => $user, + '-pass' => $pass, + '-dbname' => $stable_ids_db, ); + + if ($verbose) { + printf( "%s loaded\n", $stable_ids_db ); + } + + } + + + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => 'multi'.$species_suffix, + -alias => ['compara'.$species_suffix] ); + + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => 'multi'.$species_suffix, + -alias => ['ontology'.$species_suffix] ); + + + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => 'multi'.$species_suffix, + -alias => ['stable_ids'.$species_suffix] ); + + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => 'Ancestral sequences'.$species_suffix, + -alias => ['ancestral_sequences'.$species_suffix] ); + + # Register aliases as found in adaptor meta tables. + + $self->find_and_add_aliases( '-handle' => $dbh, + '-species_suffix' => $species_suffix ); + + $self->_additional_aliases($species_suffix); + + $dbh->disconnect(); + + my $count = $self->get_DBAdaptor_count() - $original_count; + return $count >= 0 ? $count : 0; + +} ## end sub load_registry_from_db + + +# Used as a place to push "hack" aliases +sub _additional_aliases { + my ($self, $species_suffix) = @_; + + #Adding branch-68 thirteen-lined ground squirrel "old" aliases + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => 'ictidomys_tridecemlineatus'.$species_suffix, + -alias => ['spermophilus_tridecemlineatus'.$species_suffix] ); + Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + -species => 'ictidomys_tridecemlineatus'.$species_suffix, + -alias => ['spermophilus tridecemlineatus'.$species_suffix] ); + + return; +} # end sub _additional_aliases + +=head2 _group_to_adaptor_class + + Arg [1] : The group you wish to decode to an adaptor class + Example : Bio::EnsEMBL::Registry->_group_to_adaptor_class('core'); + Description : Has an internal lookup of groups to their adaptor classes + Returntype : String + Exceptions : Thrown if the group is unknown + Status : Stable + +=cut + +sub _group_to_adaptor_class { + my ($self, $group) = @_; + my $class = { + core => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + cdna => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + otherfeatures => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + rnaseq => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + vega => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + variation => 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor', + funcgen => 'Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor', + compara => 'Bio::EnsEMBL::Compara::DBSQL::DBAdaptor', + }->{$group}; + throw "Group '${group}' is unknown" if ! $class; + return $class; +} + + +=head2 find_and_add_aliases + + Arg [ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::DBAdaptor + The adaptor to use to retrieve aliases from. + + Arg [GROUP] : (optional) string + The group you want to find aliases for. If not + given assumes all types. + + Arg [HANDLE] : (optional) DBI database handle + A connected database handle to use instead of + the database handles stored in the DBAdaptors. + Bypasses the use of MetaContainer. + + Arg [SPECIES_SUFFIX]: (optional) string + This option will append the string to the species + name in the registry for all databases. + + Example : Bio::EnsEMBL::Registry->find_and_add_aliases( + -ADAPTOR => $dba, + -GROUP => 'core' + ); + + Description : Looks in the meta container for each database for + an entry called "species.alias". If any are found + then the species adaptor is registered to that + set of aliases. This can work across any adaptor + which has a MetaContainer. If no MetaContainer + can be returned from a given adaptor then no alias + searching is performed. + + Return type : none + Exceptions : Throws if an alias is found in more than one species. + Status : Stable + +=cut + +sub find_and_add_aliases { + my $class = shift ; + + my ($adaptor, $group, $dbh, $species_suffix ) = + rearrange( [ 'ADAPTOR', 'GROUP', 'HANDLE', 'SPECIES_SUFFIX' ], @_ ); + + #Can be undef; needs to be something to avoid warnings + $species_suffix ||= q{}; + + my @dbas; + if ( defined($adaptor) ) { + @dbas = ($adaptor); + } elsif ( defined($dbh) ) { + + if ( length($species_suffix) > 0 ) { + my @full = @{ $class->get_all_DBAdaptors( '-GROUP' => $group ) }; + + foreach my $db (@full) { + if ( $db->species =~ /$species_suffix/ ) { + push( @dbas, $db ); + } + } + + } else { + @dbas = @{ $class->get_all_DBAdaptors( '-GROUP' => $group ) }; + } + + } else { + @dbas = @{ $class->get_all_DBAdaptors( '-GROUP' => $group ) }; + } + + foreach my $dba (@dbas) { + my @aliases; + my $species = $dba->species(); + + if ( defined($dbh) ) { + my $dbname = $dba->dbc()->dbname(); + my $sth = $dbh->prepare( sprintf( + "SELECT meta_value FROM %s.meta " + . "WHERE meta_key = 'species.alias' " + . "AND species_id = ?", + $dbh->quote_identifier($dbname) ) ); + + # Execute, and don't care about errors (there will be errors for + # databases without a 'meta' table. + $sth->{'PrintError'} = 0; + $sth->{'RaiseError'} = 0; + if ( !$sth->execute( $dba->species_id() ) ) { next } + $sth->{'PrintError'} = $dbh->{'PrintError'}; + $sth->{'RaiseError'} = $dbh->{'RaiseError'}; + + my $alias; + $sth->bind_columns( \$alias ); + while ( $sth->fetch() ) { + push( @aliases, $alias ); + } + } else { + my $meta_container = eval { $dba->get_MetaContainer() }; + + if ( defined($meta_container) ) { + push( @aliases, + @{ $meta_container->list_value_by_key('species.alias') } + ); + } + + # Need to disconnect so we do not spam the MySQL servers trying to + # get aliases. Can only call disonnect if dbc was defined. + if ( defined( $dba->dbc() ) ) { + $dba->dbc()->disconnect_if_idle(); + } + } + + foreach my $alias (@aliases) { + my $alias_suffix = $alias.$species_suffix; + #Lowercase because stored aliases are lowercased + my $lc_species = lc($species); + my $lc_alias_suffix = lc($alias_suffix); + if ( !$class->alias_exists( $alias_suffix ) + && $lc_species ne $lc_alias_suffix ) + { + $class->add_alias( $species, $alias_suffix ); + } elsif ( + $lc_species ne $class->get_alias( $alias_suffix ) ) + { + $class->remove_alias( $species, $alias_suffix ); + } + } + + } ## end foreach my $dba (@dbas) + return; +} ## end sub find_and_add_aliases + + +=head2 load_registry_from_multiple_dbs + + Arg [1] : Array of hashes, each hash being a set of arguments to + load_registry_from_db() (see above). + + Example : + + $registry->load_registry_from_multiple_dbs( { + '-host' => 'ensembldb.ensembl.org', + '-user' => 'anonymous', + '-verbose' => '1' + }, + { + '-host' => 'server.example.com', + '-user' => 'anonymouse', + '-password' => 'cheese', + '-verbose' => '1' + } ); + + Description: Will call load_registry_from_db() (see above) + multiple times and merge the resulting registries + into one, effectively allowing a user to connect to + databases on multiple database servers from within + one program. + + If a database is found on more than one server, the + first found instance of that database will be used. + + Returntype : Int count of the DBAdaptor instances which can be found in the + registry + +=cut + +sub load_registry_from_multiple_dbs { + my ( $self, @args ) = @_; + + my $original_count = $self->get_DBAdaptor_count(); + + my %merged_register = %registry_register; + + foreach my $arg (@args) { + local %registry_register = (); + + my $verbose; + + ($verbose) = rearrange( ['VERBOSE'], %{$arg} ); + + $self->load_registry_from_db( %{$arg} ); + + # + # Merge the localized %registry_register into %merged_register. + # + + # Merge the _SPECIES and _ALIAS sections of %registry_register. + foreach my $section ( 'Species', 'Alias' ) { + my $section_key = '_' . uc($section); + + while ( my ( $key, $value ) = + each( %{ $registry_register{$section_key} } ) ) + { + if ( !exists( $merged_register{$section_key}{$key} ) ) { + $merged_register{$section_key}{$key} = $value; + } elsif ($verbose) { + printf( "%s '%s' found on multiple servers, " + . "using first found\n", + $section, $key ); + } + } + } + } ## end foreach my $arg (@args) + + # Add the DBAs from the _SPECIES section into the _DBA section. + foreach my $species_hash ( values( %{ $merged_register{_SPECIES} } ) ) + { + foreach my $group_hash ( values( %{$species_hash} ) ) { + if ( ref($group_hash) eq 'HASH' && exists( $group_hash->{_DB} ) ) + { + push( @{ $merged_register{_DBA} }, $group_hash->{_DB} ); + } + } + } + + %registry_register = %merged_register; + + my $count = $self->get_DBAdaptor_count() - $original_count; + return $count >= 0 ? $count : 0; +} ## end sub load_registry_from_multiple_dbs + +# +# Web specific routines +# + +=head2 DEPRECATED load_registry_with_web_adaptors + + DEPRECATED: Use load_registry_from_db instead. + +=cut + +sub load_registry_with_web_adaptors{ + my $class = shift; + + deprecate('Use the load_registry_from_db instead'); + my $site_eval = eval{ require SiteDefs }; + if ($@ or (!defined($site_eval))){ die "Can't use SiteDefs.pm - $@\n"; } + SiteDefs->import(qw(:ALL)); + + my $species_eval = eval{ require SpeciesDefs }; + if ($@ or (!defined($species_eval))){ die "Can't use SpeciesDefs.pm - $@\n"; } + my $conf = new SpeciesDefs(); + + my %species_alias = %{$SiteDefs::ENSEMBL_SPECIES_ALIASES}; + + foreach my $spec (keys %species_alias){ + Bio::EnsEMBL::Registry->add_alias($species_alias{$spec},$spec); + } + return; +} + +=head2 set_default_track + + Sets a flag to say that that this species/group are a default track and do not + need to be added as another web track. + + Arg [1] : name of the species to get the adaptors for in the registry. + Arg [2] : name of the type to get the adaptors for in the registry. + Example : $merged = Bio::EnsEMBL::Registry->set_default_track("Human","core"); + Returntype : none + Exceptions : none + Status : At Risk. + +=cut + +sub set_default_track { + my ( $class, $species, $group ) = @_; + + $species = get_alias($species); + $registry_register{'def_track'}{$species}{ lc($group) } = 1; + return; +} + +=head2 default_track + + Check flag to see if this is a default track + + Arg [1] : name of the species to get the adaptors for in the registry. + Arg [2] : name of the type to get the adaptors for in the registry. + Example : $merged = Bio::EnsEMBL::Registry->set_default_track("Human","core"); + Returntype : int + Exceptions : none + Status : At Risk. + +=cut + +sub default_track { + my ( $class, $species, $group ) = @_; + + $species = get_alias($species); + if ( + defined( $registry_register{'def_track'}{$species}{ lc($group) } ) ) + { + return 1; + } + + return 0; +} + + +=head2 add_new_tracks + + Will add new gene tracks to the configuration of the WEB server if they are + not of the type default and the configuration already has genes in the display. + + Arg [1] : hash of the default configuration of the web page + Returntype : none + Exceptions : none + Called by : UserConfig.pm + Status : At Risk. + +=cut + +sub add_new_tracks{ + my($class, $conf, $pos) = @_; + + my $start = 0; + my $reg = $class; + my $species_reg = $reg->get_alias($conf->{'species'},"nothrow"); + my %pars; +# print STDERR "Species $species_reg check for default tracks\n"; + if(defined($species_reg)){ + foreach my $dba (@{$reg->get_all_DBAdaptors()}){ + if(!$reg->default_track($dba->species,$dba->group)){ + $pars{'available'} = "species ".$reg->get_alias($dba->species()); + $pars{'db_alias'} = $dba->group(); +# print STDERR "Adding new track for ".$dba->species."\t".$dba->group."\n"; + $conf->add_new_track_generictranscript('',$dba->group(), "black",$pos,%pars); + $pos++; + } + } + } + return $pos; + +} + +=head2 no_version_check + + getter/setter for whether to run the version checking + + Arg[0] : (optional) int + Returntype : int or undef if not set + Exceptions : none + Status : At Risk. + +=cut + +sub no_version_check { + my ( $self, $arg ) = @_; + ( defined $arg ) + && ( $registry_register{'_no_version_check'} = $arg ); + + return $registry_register{'_no_version_check'}; +} + +=head2 no_cache_warnings + + Arg[0] : boolean for turning the flag on and off + Description : Turns off any warnings about not using caching in all available + adaptors. + Returntype : boolean Current status + Exceptions : None + +=cut + +sub no_cache_warnings { + my ($self, $arg) = @_; + if(defined $arg) { + $Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor::SILENCE_CACHE_WARNINGS = $arg; + } + return $Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor::SILENCE_CACHE_WARNINGS; +} + + +=head2 version_check + + run the database/API code version check for a DBAdaptor + + Arg[0] : DBAdaptor to check + Returntype : int 1 if okay, 0 if not the same + Exceptions : none + Status : At Risk. + +=cut + + +sub version_check { + my ( $self, $dba ) = @_; + + # Check the datbase and versions match + # give warning if they do not. + my $check = no_version_check(); + + if ( ( + defined( $ENV{HOME} ) + and ( -e $ENV{HOME} . "/.ensemblapi_no_version_check" ) ) + or ( defined($check) and ( $check != 0 ) ) ) + { + return 1; + } + + my $mca = + $self->get_adaptor( $dba->species(), $dba->group(), + "MetaContainer" ); + + my $database_version = 0; + if ( defined($mca) ) { + $database_version = $mca->get_schema_version(); + } + + if ( $database_version == 0 ) { + # Try to work out the version + if ( $dba->dbc()->dbname() =~ /^_test_db_/x ) { + return 1; + } + if ( $dba->dbc()->dbname() =~ /(\d+)_\S+$/x ) { + $database_version = $1; + } elsif ( $dba->dbc()->dbname() =~ /ensembl_compara_(\d+)/x ) { + $database_version = $1; + } elsif ( $dba->dbc()->dbname() =~ /ensembl_help_(\d+)/x ) { + $database_version = $1; + } elsif ( $dba->dbc()->dbname() =~ /ensembl_ontology_(\d+)/x ) { + $database_version = $1; + } elsif ( $dba->dbc()->dbname() =~ /ensembl_stable_ids_(\d+)/x ) { + $database_version = $1; + } else { + warn( + sprintf( + "No database version for database %s " + . ". You must be using a post version 34 database " + . "with version 34 or later code.\n" + . "You need to update your database " + . "or use the appropriate Ensembl software release " + . "to ensure your script does not crash\n", + $dba->dbc()->dbname() ) ); + } + } ## end if ( $database_version... + + if ( $database_version != software_version() ) { + warn( + sprintf( + "For %s there is a difference in the software release (%s) " + . "and the database release (%s). " + . "You should update one of these to ensure that your script " + . "does not crash.\n", + $dba->dbc()->dbname(), + software_version(), $database_version + ) ); + return 0; + } + + return 1; # Ok +} ## end sub version_check + + +=head2 get_species_and_object_type + + Description: Get the species name, object type (gene, transcript, + translation, or exon etc.), and database type for a + stable ID. + + Arg [1] : String stable_id + The stable ID to find species and object type for. + + Arg [2] : String known_type (optional) + The type of the stable ID, if it is known. + + Arg [3] : String known_species (optional) + The species, if known + + Arg [4] : String known_db_type (optional) + The database type, if known + + Example : my $stable_id = 'ENST00000326632'; + + my ( $species, $object_type, $db_type ) = + $registry->get_species_and_object_type($stable_id); + + my $adaptor = + $registry->get_adaptor( $species, $db_type, + $object_type ); + + my $object = $adaptor->fetch_by_stable_id($stable_id); + + Return type: Array consisting of the species name, object type, + and database type. The array may be empty if no + match is found. + + Exceptions : none + Status : At Risk. + +=cut + +my %stable_id_stmts = ( + gene => 'SELECT m.meta_value ' + . 'FROM %1$s.gene ' + . 'JOIN %1$s.seq_region USING (seq_region_id) ' + . 'JOIN %1$s.coord_system USING (coord_system_id) ' + . 'JOIN %1$s.meta m USING (species_id) ' + . 'WHERE stable_id = ? ' + . 'AND m.meta_key = "species.production_name"', + transcript => 'SELECT m.meta_value ' + . 'FROM %1$s.transcript ' + . 'JOIN %1$s.seq_region USING (seq_region_id) ' + . 'JOIN %1$s.coord_system USING (coord_system_id) ' + . 'JOIN %1$s.meta m USING (species_id) ' + . 'WHERE stable_id = ? ' + . 'AND m.meta_key = "species.production_name"', + exon => 'SELECT m.meta_value ' + . 'FROM %1$s.exon ' + . 'JOIN %1$s.seq_region USING (seq_region_id) ' + . 'JOIN %1$s.coord_system USING (coord_system_id) ' + . 'JOIN %1$s.meta m USING (species_id) ' + . 'WHERE stable_id = ? ' + . 'AND m.meta_key = "species.production_name"', + translation => 'SELECT m.meta_value ' + . 'FROM %1$s.translation tl ' + . 'JOIN %1$s.transcript USING (transcript_id) ' + . 'JOIN %1$s.seq_region USING (seq_region_id) ' + . 'JOIN %1$s.coord_system USING (coord_system_id) ' + . 'JOIN %1$s.meta m USING (species_id) ' + . 'WHERE tl.stable_id = ? ' + . 'AND m.meta_key = "species.production_name"', + operon => 'SELECT m.meta_value ' + . 'FROM %1$s.operon ' + . 'JOIN %1$s.seq_region USING (seq_region_id) ' + . 'JOIN %1$s.coord_system USING (coord_system_id) ' + . 'JOIN %1$s.meta m USING (species_id) ' + . 'WHERE stable_id = ? ' + . 'AND m.meta_key = "species.production_name"', + operontranscript => 'SELECT m.meta_value ' + . 'FROM %1$s.operon_transcript ' + . 'JOIN %1$s.seq_region USING (seq_region_id) ' + . 'JOIN %1$s.coord_system USING (coord_system_id) ' + . 'JOIN %1$s.meta m USING (species_id) ' + . 'WHERE stable_id = ? ' + . 'AND m.meta_key = "species.production_name"', + +); + + +sub get_species_and_object_type { + my ($self, $stable_id, $known_type, $known_species, $known_db_type, $force_long_lookup) = @_; + + #get the stable_id lookup database adaptor + my $stable_ids_dba = $self->get_DBAdaptor("multi", "stable_ids", 1); + + if ($stable_ids_dba && ! $force_long_lookup) { + my $statement = 'SELECT name, object_type, db_type FROM stable_id_lookup join species using(species_id) WHERE stable_id = ?'; + + if ($known_species) { + $statement .= ' AND name = ?'; + } + if ($known_db_type) { + $statement .= ' AND db_type = ?'; + } + if ($known_type) { + $statement .= ' AND object_type = ?'; + } + + my $sth = $stable_ids_dba->dbc()->prepare($statement); + $sth->bind_param(1, $stable_id, SQL_VARCHAR); + my $param_count = 1; + if ($known_species) { + $known_species = $self->get_alias($known_species); + $param_count++; + $sth->bind_param($param_count, $known_species, SQL_VARCHAR); + } + if ($known_db_type) { + $param_count++; + $sth->bind_param($param_count, $known_db_type, SQL_VARCHAR); + } + if ($known_type) { + $param_count++; + $sth->bind_param($param_count, $known_type, SQL_VARCHAR); + } + $sth->execute(); + my ($species, $type, $db_type) = $sth->fetchrow_array(); + $sth->finish(); + return ($species ,$type, $db_type); + + } else { + if (defined $known_type && !exists $stable_id_stmts{lc $known_type}) { + return; + } + + my @types = defined $known_type ? ($known_type) : ('Gene', 'Transcript', 'Translation', 'Exon', 'Operon', 'OperonTranscript'); + + if(! $known_db_type) { + $known_db_type = 'core'; + } + + my %get_adaptors_args; + $get_adaptors_args{'-group'} = $known_db_type; + if ($known_species) { + $get_adaptors_args{'-species'} = $known_species; + } + + my @dbas = sort { $a->dbc->host cmp $b->dbc->host || $a->dbc->port <=> $b->dbc->port } + @{$self->get_all_DBAdaptors(%get_adaptors_args)}; + foreach my $dba (@dbas) { + + foreach my $type (@types) { + my $statement = sprintf $stable_id_stmts{lc $type}, $dba->dbc->dbname; + + my $sth = $dba->dbc()->prepare($statement); + $sth->bind_param(1, $stable_id, SQL_VARCHAR); + $sth->execute; + + my $species = $sth->fetchall_arrayref->[0][0]; + + $sth->finish; + + return ($species, $type, $known_db_type) if defined $species; + } + + } ## end foreach my $dba ( sort { $a...}) + + } + + return; +} ## end sub get_species_and_object_type + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/RepeatConsensus.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/RepeatConsensus.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::RepeatConsensus; + +use strict; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Scalar::Util qw(weaken isweak); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [NAME] : string (optional) + The name of this repeat consensus + Arg [LENGTH]: int (optional) + The length of the repeat consensus sequence + Arg [REPEAT_CLASS]: string (optional) + The type of repeat consensus + Arg [REPEAT_CONSENSUS]: string (optional) + The sequence of this repeat consensus + Arg [REPEAT_TYPE]: string + Its like class only more general + Arg [...]: Named arguments to superclass constructor + (see Bio::EnsEMBL::Storable) + Example : $rc = Bio::EnsEMBL::RepeatConsensus->new + (-REPEAT_CONSENSUS => 'AATG' + -NAME => '(AATG)n', + -REPEAT_CLASS => 'Simple_repeat', + -LENGTH => '4', + -DBID => 1023, + -ADAPTOR => $rc_adaptor); + Description: Creates a new Bio::EnsEMBL::RepeatConsensus object + Returntype : Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : RepeatFeatureAdaptors + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($name, $length, $repeat_class, $repeat_consensus, $repeat_type ) = + rearrange(['NAME', 'LENGTH', 'REPEAT_CLASS', 'REPEAT_CONSENSUS', 'REPEAT_TYPE'], @_); + + $self->{'name'} = $name; + $self->{'length'} = $length; + $self->{'repeat_class'} = $repeat_class; + $self->{'repeat_consensus'} = $repeat_consensus; + $self->{'repeat_type'} = $repeat_type; + + return $self; +} + + +=head2 new_fast + + Arg [1] : hashref to bless as a new RepeatConsensus + + Description: Creates a new Bio::EnsEMBL::RepeatConsensus object + Returntype : Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 name + + Arg [1] : string $name (optional) + Example : $name = $repeat_consensus->name() + Description: Getter/Setter for the name of this repeat_consensus + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + +=head2 length + + Arg [1] : int $length (optional) + Example : $length = $repeat_consensus->length() + Description: Getter/Setter for the length of this repeat_consensus + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length { + my $self = shift; + $self->{'length'} = shift if(@_); + return $self->{'length'}; +} + + +=head2 repeat_class + + Arg [1] : string $class (optional) + The class of + Example : $class = $repeat_consensus->repeat_class() + Description: Getter/Setter for the class of this repeat_consensus + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub repeat_class { + my $self = shift; + $self->{'repeat_class'} = shift if(@_); + return $self->{'repeat_class'}; +} + +=head2 repeat_type + + Arg [1] : string $type (optional) + The type of the consensus + Example : $type = $repeat_consensus->repeat_type() + Description: Getter/Setter for the type of this repeat_consensus + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub repeat_type { + my $self = shift; + $self->{'repeat_type'} = shift if(@_); + return $self->{'repeat_type'}; +} + + +=head2 desc + + Arg [1] : none + Example : $desc = $repeat_consensus->desc() + Description: Getter for the description of this repeat consensus as extracted + from the repeat_class. This method is probably useless. + Returntype : string + Exceptions : none + Caller : general + Status : Medium risk + +=cut + +sub desc { + my $self = shift; + my $class = $self->repeat_class or return; + return "class=$class"; +} + + + +=head2 repeat_consensus + + Arg [1] : string $consensus_seq (optional) + The sequence of this repeat consensus + Example : $consensus = $repeat_consensus->repeat_consensus(); + Description: Getter/Setter for the sequence of this repeat_consensus. + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub repeat_consensus { + my $self = shift; + $self->{'repeat_consensus'} = shift if(@_); + return $self->{'repeat_consensus'}; +} + + + +=head2 seq + + Arg [1] : none + Example : none + Description: Returns the repeat consensus. This method is useless - Use + repeat_consensus() instead. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq { + my( $self ) = @_; + return $self->repeat_consensus; +} + +1; + +__END__ + +=head1 NAME - Bio::EnsEMBL::RepeatConsensus + +=head1 DESCRIPTION + +This object represents an entry in the +repeat_consensus table. + +It can contain the consensus sequence for a +repeat such as a particular Alu, or "cag" for a +simple triplet repeat. + +=head1 AUTHOR + +James Gilbert B jgrg@sanger.ac.uk + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/RepeatFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/RepeatFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,270 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::RepeatFeature - A feature representing a repeat on a piece of +sequence. + +=head1 SYNOPSIS + + my $rf = new Bio::EnsEMBL::Feature( + -start => 100, + -end => 220, + -strand => -1, + -slice => $slice, + -analysis => $analysis, + -repeat_consensus => $rc, + -hstart => 10, + -hend => 100, + -hstrand => 1, + -score => 83.2 + ); + + my $hstart = $feat->hstart; + my $hend = $feat->hend; + + # move the feature to the chromosomal coordinate system + $feature = $feature->transform('chromosome'); + + # move the feature to a different slice + # (possibly on another coord system) + $feature = $feature->transfer($new_slice); + + # project the feature onto another coordinate system possibly across + # boundaries: + @projection = @{ $feature->project('contig') }; + + # change the start, end, and strand of the feature in place + $feature->move( $new_start, $new_end, $new_strand ); + +=head1 DESCRIPTION + +This a feature representing a repeat region on a sequence + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::RepeatFeature; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use base qw/Bio::EnsEMBL::Feature/; + +=head2 new + + Arg [REPEAT_CONSENSUS] : Bio::EnsEMBL::RepeatConsensus (optional) + The repeat consensus for this repeat feature + Arg [HSTART] : int (optional) + The hit start on the consensus sequence + Arg [HEND] : int (optional) + The hit end on the consensus sequence + Arg [SCORE] : float (optional) + The score + Arg [...] : Named arguments to superclass constructor + (see Bio::EnsEMBL::Feaure) + Example : $rf = Bio::EnsEMBL::RepeatFeature->new(-REPEAT_CONSENSUS => $rc, + -HSTART => 10, + -HEND => 100, + -SCORE => 58.0, + -START => 1_000_100, + -END => 1_000_190, + -STRAND => 1, + -ANALYSIS => $an, + -SLICE => $chr_slice); + Description: Creates a new Bio::EnsEMBL::RepeatFeature object + Returntype : Bio::EnsEMBL::RepeatFeature + Exceptions : none + Caller : RepeatFeatureAdaptors + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($repeat_consensus, $hstart, $hend, $score) = + rearrange(['REPEAT_CONSENSUS','HSTART','HEND','SCORE'], @_); + + $self->repeat_consensus($repeat_consensus); + $self->{'hstart'} = $hstart; + $self->{'hend'} = $hend; + $self->{'score'} = $score; + + return $self; +} + + +=head2 repeat_consensus + + Arg [1] : (optional) Bio::EnsEMBL::RepeatConsensus + Example : $repeat_consensus = $repeat->repeat_consensus; + Description: Getter/Setter for the repeat consensus of this repeat + Returntype : Bio::EnsEMBL::RepeatConsensus + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub repeat_consensus { + my $self = shift; + + if(@_) { + my $rc = shift; + if(defined($rc)) { + if(!ref($rc) || !$rc->isa('Bio::EnsEMBL::RepeatConsensus')) { + throw('RepeatConsensus arg must be a Bio::EnsEMBL::RepeatConsensus'); + } + } + $self->{'repeat_consensus'} = $rc; + } + + return $self->{'repeat_consensus'}; +} + + + +=head2 hstart + + Arg [1] : (optional) int $hstart + Example : $hit_start = $repeat->hstart; + Description: Getter/Setter for the start bp of this repeat match on the + consensus sequence. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hstart { + my $self = shift; + $self->{'hstart'} = shift if(@_); + return $self->{'hstart'}; +} + + +=head2 score + + Arg [1] : (optional) float $score + Example : $score = $repeat->score(); + Description: Getter/Setter for the score of this repeat feature + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub score { + my $self = shift; + $self->{'score'} = shift if(@_); + return $self->{'score'}; +} + + + +=head2 hend + + Arg [1] : (optional) int $hend + Example : $hit_end = $repeat->hend; + Description: Getter/Setter for the end bp of this repeat match on the + consensus sequence. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hend { + my $self = shift; + $self->{'hend'} = shift if(@_); + return $self->{'hend'}; +} + + + +=head2 hstrand + + Arg [1] : none + Example : none + Description: always returns 1. method exists for consistancy with other + features. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub hstrand { + return 1; +} + + +=head2 display_id + + Arg [1] : none + Example : print $rf->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For repeat_features this is the + name of the repeat consensus if it is available otherwise it is + an empty string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + + my $id = ''; + + my $rc = $self->{'repeat_consensus'}; + if($rc) { + $id = $rc->name(); + } + + return $id; +} + + +1; + +__END__ + +=head1 NAME - Bio::EnsEMBL::RepeatFeature + +=head1 AUTHOR + +James Gilbert B jgrg@sanger.ac.uk + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/RepeatMaskedSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,308 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::RepeatMaskedSlice - Arbitary Slice of a genome + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor(); + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + $repeat_masked_slice = $slice->get_repeatmasked_seq(); + + # get repeat masked sequence: + my $dna = $repeat_masked_slice->seq(); + $dna = $repeat_masked_slice->subseq( 1, 1000 ); + +=head1 DESCRIPTION + +This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve +repeat masked genomic sequence rather than normal genomic sequence. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::RepeatMaskedSlice; + +use strict; +use warnings; + +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Utils::Exception; + +use vars qw(@ISA); + +@ISA = ('Bio::EnsEMBL::Slice'); + +# The BLOCK_PWR is the lob_bin of the chunksize where you want your repeat features +# to be retreived. This will create repeat feature retrieval calls that are likely +# to be on the same slice and hopefully create cache hits and less database traffic +my $BLOCK_PWR = 18; + + + +=head2 new + + Arg [-REPEAT_MASK] : The logic name of the repeats to be used for masking. + If not provided, all repeats in the database are used. + Arg [...] : Named superclass arguments. See B. + Example : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new + (-START => $start, + -END => $end, + -STRAND => $strand, + -SEQ_REGION_NAME => $seq_region, + -SEQ_REGION_LENGTH => $seq_region_length, + -COORD_SYSTEM => $cs, + -ADAPTOR => $adaptor, + -REPEAT_MASK => ['repeat_masker'], + -SOFT_MASK => 1, + -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0}); + Description: Creates a Slice which behaves exactly as a normal slice but + that returns repeat masked sequence from the seq method. + Returntype : Bio::EnsEMBL::RepeatMaskedSlice + Exceptions : none + Caller : RawComputes (PredictionTranscript creation code). + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logic_names, $soft_mask, $not_default_masking_cases) = rearrange(['REPEAT_MASK', + 'SOFT_MASK', + 'NOT_DEFAULT_MASKING_CASES'], @_); + + my $self = $class->SUPER::new(@_); + + + $logic_names ||= ['']; + if(ref($logic_names) ne 'ARRAY') { + throw("Reference to list of logic names argument expected."); + } + + $self->{'repeat_mask_logic_names'} = $logic_names; + $self->{'soft_mask'} = $soft_mask; + $self->{'not_default_masking_cases'} = $not_default_masking_cases; + $self->{'not_default_masking_cases'} ||= {}; + + return $self; +} + + +=head2 repeat_mask_logic_names + + Arg [1] : reference to list of strings $logic_names (optional) + Example : $rm_slice->repeat_mask_logic_name(['repeat_masker']); + Description: Getter/Setter for the logic_names of the repeats that are used + to mask this slices sequence. + Returntype : reference to list of strings + Exceptions : none + Caller : seq() method + Status : Stable + +=cut + +sub repeat_mask_logic_names { + my $self = shift; + + if(@_) { + my $array = shift; + if(ref($array) ne 'ARRAY') { + throw('Reference to list of logic names argument expected.'); + } + } + + return $self->{'repeat_mask_logic_names'}; +} + + +=head2 soft_mask + + Arg [1] : boolean $soft_mask (optional) + Example : $rm_slice->soft_mask(0); + Description: Getter/Setter which is used to turn on/off softmasking of the + sequence returned by seq. + Returntype : boolean + Exceptions : none + Caller : seq() method + Status : Stable + +=cut + +sub soft_mask { + my $self = shift; + $self->{'soft_mask'} = shift if(@_); + return $self->{'soft_mask'} || 0; +} + +=head2 not_default_masking_cases + + Arg [1] : hash reference $not_default_masking_cases (optional, default is {}) + The values are 0 or 1 for hard and soft masking respectively + The keys of the hash should be of 2 forms + "repeat_class_" . $repeat_consensus->repeat_class, + e.g. "repeat_class_SINE/MIR" + "repeat_name_" . $repeat_consensus->name + e.g. "repeat_name_MIR" + depending on which base you want to apply the not default masking either + the repeat_class or repeat_name. Both can be specified in the same hash + at the same time, but in that case, repeat_name setting has priority over + repeat_class. For example, you may have hard masking as default, and + you may want soft masking of all repeat_class SINE/MIR, + but repeat_name AluSp (which are also from repeat_class SINE/MIR) + Example : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0}); + Description: Getter/Setter which is used to escape some repeat class or name from the default + masking in place. + Returntype : hash reference + Exceptions : none + Caller : seq() and subseq() methods + Status : Stable + +=cut + +sub not_default_masking_cases { + my $self = shift; + $self->{'not_default_masking_cases'} = shift if (@_); + return $self->{'not_default_masking_cases'}; +} + +=head2 seq + + Arg [1] : none + Example : print $rmslice->seq(), "\n"; + Description: Retrieves the entire repeat masked sequence for this slice. + See also the B implementation of this + method. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq { + my $self = shift; + + # + # get all the features + # + my $repeats = $self->_get_repeat_features($self); + my $soft_mask = $self->soft_mask(); + my $not_default_masking_cases = $self->not_default_masking_cases(); + + # + # get the dna + # + my $dna = $self->SUPER::seq(@_); + + # + # mask the dna + # + $self->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases); + return $dna; +} + +=head2 subseq + + Arg [1] : none + Example : print $rmslice->subseq(1, 1000); + Description: Retrieves a repeat masked sequence from a specified subregion + of this slice. See also the B + implementation of this method. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub subseq { + my $self = shift; + my $start = shift; + my $end = shift; + my $strand = shift; + + my $subsequence_slice = $self->sub_Slice($start, $end, $strand); + + # If frequent subseqs happen on repeatMasked sequence this results in + # a lot of feature retrieval from the database. To avoid this, features + # are only retrieved from subslices with fixed space boundaries. + # The access happens in block to make cache hits more likely + # ONLY DO IF WE ARE CACHING + + my $subslice; + if(! $self->adaptor()->db()->no_cache()) { + + my $seq_region_slice = $self->seq_region_Slice(); + # The blocksize can be defined on the top of this module. + my $block_min = ($subsequence_slice->start()-1) >> $BLOCK_PWR; + my $block_max = ($subsequence_slice->end()-1) >> $BLOCK_PWR; + + my $sub_start = ($block_min << $BLOCK_PWR)+1; + my $sub_end = ($block_max+1)<<$BLOCK_PWR; + if ($sub_end > $seq_region_slice->length) { + $sub_end = $seq_region_slice->length ; + } + $subslice = $seq_region_slice->sub_Slice($sub_start, $sub_end); + } + else { + $subslice = $subsequence_slice; + } + + my $repeats = $self->_get_repeat_features($subslice); + my $soft_mask = $self->soft_mask(); + my $not_default_masking_cases = $self->not_default_masking_cases(); + my $dna = $subsequence_slice->SUPER::seq(); + $subsequence_slice->_mask_features(\$dna,$repeats,$soft_mask,$not_default_masking_cases); + return $dna; +} + +=head2 _get_repeat_features + + Args [1] : Bio::EnsEMBL::Slice to fetch features for + Description : Gets repeat features for the given slice + Returntype : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats + +=cut + + + +sub _get_repeat_features { + my ($self, $slice) = @_; + my $logic_names = $self->repeat_mask_logic_names(); + my @repeats; + foreach my $l (@$logic_names) { + push @repeats, @{$slice->get_all_RepeatFeatures($l)}; + } + return \@repeats; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Root.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Root.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,200 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + + Originally from Steve Chervitz. Refactored by Ewan Birney. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Root + +=head1 DESCRIPTION + +Do not use Bio::EnsEMBL::Root anymore. It is included for backwards +compatibility (every object in EnsEMBL used to inherit from this class) +but will eventually be phased out. The replacement for the _rearrage +method is the rearrange method which can be imported in the following +way: + + use Bio::EnsEMBL::Utils::Argument qw(rearrange); + + # can now call rearrange as a class method (instead as object method) + my ( $start, $end ) = rearrange( [ 'START', 'END' ], @args ); + +If you want to use the throw or warn methods the replacement use the +class methods throw and warning from the Bio::EnsEMBL::Utils::Exception +class: + + use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + # can now call throw or warning even without blessed reference + warning('This is a warning'); + throw('This is an exception'); + +This module was stolen from BioPerl to avoid problems with moving to +BioPerl 1 from 0.7 + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Root; + +use strict; +use vars qw($VERBOSITY); +use Bio::EnsEMBL::Utils::Exception qw( ); +use Bio::EnsEMBL::Utils::Argument qw( ); + + +$VERBOSITY = 0; + +sub new{ + my($caller,@args) = @_; + + my $class = ref($caller) || $caller; + return bless({}, $class); +} + + +=head2 throw + + DEPRECATED + +=cut + +sub throw{ + my ($self,$string) = @_; + + Bio::EnsEMBL::Utils::Exception->warning("\n------------------ DEPRECATED ---------------------\n". + "Bio::EnsEMBL::Root::throw has been deprecated\n". + "use Bio::EnsEMBL::Utils::Exception qw(throw); \n". + "throw('message'); #instead\n". + "\n---------------------------------------------------\n"); + + Bio::EnsEMBL::Utils::Exception->throw($string); + + +} + +=head2 warn + + DEPRECATED + +=cut + +sub warn{ + my ($self,$string) = @_; + + + + Bio::EnsEMBL::Utils::Exception->warning("\n------------------ DEPRECATED ---------------------\n". + "Bio::EnsEMBL::Root::warn has been deprecated\n". + "use Bio::EnsEMBL::Utils::Exception qw(warning); \n". + "warning('message'); #instead\n". + "\n---------------------------------------------------\n"); + + Bio::EnsEMBL::Utils::Exception->warning($string); + +} + + + + +=head2 verbose + + DEPRECATED + +=cut + +sub verbose{ + my ($self,$value) = @_; + + Bio::EnsEMBL::Utils::Exception->warning("\n------------------ DEPRECATED ---------------------\n". + "Bio::EnsEMBL::Root::verbose has been deprecated\n". + "use Bio::EnsEMBL::Utils::Exception qw(verbose); \n". + "verbose(value); #instead\n". + "\n---------------------------------------------------\n"); + + Bio::EnsEMBL::Utils::Exception->verbose($value); + + } + +=head2 stack_trace_dump + + DEPRECATED + +=cut + +sub stack_trace_dump{ + my ($self) = @_; + + Bio::EnsEMBL::Utils::Exception->warning("\n------------------ DEPRECATED ---------------------\n". + "Bio::EnsEMBL::Root::stack_trace_dump has been deprecated\n". + "use Bio::EnsEMBL::Utils::Exception qw(stack_trace_dump); \n". + "stack_trace_dump(); #instead\n". + "\n---------------------------------------------------\n"); + + Bio::EnsEMBL::Utils::Exception->stack_trace_dump(); + +} + + +=head2 stack_trace + + DEPRECATED + +=cut + +sub stack_trace{ + my ($self) = @_; + + Bio::EnsEMBL::Utils::Exception->warning("\n------------------ DEPRECATED ---------------------\n". + "Bio::EnsEMBL::Root::stack_trace has been deprecated\n". + "use Bio::EnsEMBL::Utils::Exception qw(stack_trace); \n". + "stack_trace(); #instead\n". + "\n---------------------------------------------------\n"); + + Bio::EnsEMBL::Utils::Exception->stack_trace(); + +} + + +=head2 _rearrange + + DEPRECATED + +=cut + +#----------------' +sub _rearrange { +#---------------- + my($self,$order,@param) = @_; + + my $mess = "use Bio::EnsEMBL::Utils::Argument qw(rearrange); \n"; + $mess .= "rearrange(order, list); #instead\n"; + + Bio::EnsEMBL::Utils::Exception->deprecate($mess); + + return Bio::EnsEMBL::Utils::Argument->rearrange($order,@param); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SNP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SNP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,177 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SNP + +=head1 SYNOPSIS + + $snp = new Bio::EnsEMBL::SNP( + -start => 10, + -end => 10, + -strand => 1, + -source => 'The SNP Consortium', + -score => 99, # new meaning + -status => 'suspected', # new + -alleles => 't|c' # new + ); + + # add it to an annotated sequence + + $annseq->add_SeqFeature($feat); + +=head1 DESCRIPTION + +This class was written because the EnsEMBL::ExternalData::Variation +object is way too slow. There was simply too much chaining to bioperl +methods many, many layers deep. This object behaves like a Variation +but has a much faster constructor, and faster accessors for the relevant +methods needed by the web. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::SNP; +use vars qw(@ISA); +use strict; + + +use Bio::EnsEMBL::ExternalData::Variation; +use Scalar::Util qw(weaken isweak); + +@ISA = qw( Bio::EnsEMBL::ExternalData::Variation ); + + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +sub dbID { + my $self = shift; + + if(@_) { + $self->{'dbID'} = shift; + } + + return $self->{'dbID'}; +} + +sub position { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_gsf_start} = $arg; + $self->{_gsf_end} = $arg; + } + + return $self->{_gsf_start}; +} + +sub start { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_gsf_start} = $arg; + } + + return $self->{_gsf_start}; +} + +sub end { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_gsf_end} = $arg; + } + + return $self->{_gsf_end}; +} + + +sub source { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_source} = $arg; + } + + return $self->{_source}; + } + +sub score { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_gsf_score} = $arg; + } + + return $self->{_gsf_score}; +} + + +sub source_tag { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_source_tag} = $arg; + } + + return $self->{_source_tag}; +} + +sub source_version { + my ($self, $arg) = @_; + + if(defined $arg) { + $self->{_source_version} = $arg; + } + + return $self->{_source_version}; +} + + +=head2 display_name + + Arg [1] : none + Example : print $snp->display_name(); + Description: This method returns a string that is considered to be + the 'display' identifier. For snps this is the + returns the same thing as the id method. + Returntype : string + Exceptions : none + Caller : web drawing code + +=cut + +sub display_name { + my $self = shift; + return $self->id(); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SeqEdit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SeqEdit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,431 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL:SeqEdit - A class representing a post transcriptional edit to a +sequence. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::SeqEdit; + use Bio::EnsEMBL::Attribute; + + # construct a SeqEdit object using a Transcript attribute + + ($attribute) = @{ $translation->get_all_Attributes('_rna_edit') }; + + $seq_edit = Bio::EnsEMBL::SeqEdit( -ATTRIB => $attribute ); + + print $seq_edit->start(), "\n"; + print $seq_edit->end(), "\n"; + print $seq_edit->alt_seq(), "\n"; + + # apply the edit to some sequence + $seq = $transcript->spliced_seq(); + print "Before modifiction: $seq\n"; + + $seq_edit->apply_edit( \$seq ); + print "After modification: $seq\n"; + + # construct an attribute object from a SeqEdit and add it to a + # translation + + $seq_edit = Bio::EnsEMBL::SeqEdit->new( + -CODE => '_selenocysteine', + -NAME => 'Selenocysteine', + -DESC => 'Selenocysteine', + -START => 10, + -END => 10, + -ALT_SEQ => 'U' + ); + + $attribute = $seq_edit->get_Attribute(); + $translation->add_Attributes($attribute); + +=head1 DESCRIPTION + +This is a class used to represent post transcriptional +modifications to sequences. SeqEdit objects are stored as ordinary +Bio::EnsEMBL::Attributes with a parseable value and can be used to +represent RNA editing, selenocysteines etc. + +Also see B + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::SeqEdit; + +use strict; +use warnings; + +use Bio::EnsEMBL::Attribute; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + + +=head2 new + + Arg [-ATTRIB] : Bio::EnsEMBL::Attribute + Constructs a new SeqEdit from an Attribute. + Can only be provided if no other constructor arguments + are provided. + Arg [-START] : The start position of the edit. + Arg [-END] : The end position of the edit. + Arg [-ALT_SEQ] : The alternate sequence + Arg [-CODE] : A code for this SeqEdit + Arg [-NAME] : A name for this SeqEdit + Arg [-DESCRIPTION] : Arg passed to superclass constructor + Example : my $sea = Bio::EnsEMBL::SeqEdit->new(-ATTRIB => $attrib); + my $sea = Bio::EnsEMBL::SeqEdit->new + (-START => 10, + -END => 12, + -ALT_SEQ => 'ACG', + -CODE => '_rna_edit', + -NAME => 'RNA Edit', + -DESCRIPTION => 'RNA edit'); + Description: Constructs a SeqEdit representing a single edit to a + sequence, such as an rna modification or a selenocysteine. + Returntype : Bio::EnsEMBL::SeqEdit + Exceptions : throws if attribute set and other args aswell + throws if start and end not set correctly of attribure not set + Caller : general + Status : Stable + +=cut + +sub new { + my $class = shift; + + my ($attrib, $start, $end, $alt_seq, $name, $desc, $code) = + rearrange([qw(ATTRIB START END ALT_SEQ NAME DESCRIPTION CODE)], @_); + + my $self; + + if($attrib) { + if(defined($start) || defined($end) || defined($alt_seq) || + defined($name) || defined($desc) || defined($code)) { + throw("Cannot specify -ATTRIB argument with additional arguments."); + } + + if(!ref($attrib) || !$attrib->isa('Bio::EnsEMBL::Attribute')) { + throw('Bio::EnsEMBL::Attribute argument expected.'); + } + + ($start, $end, $alt_seq) = split(/\s+/, $attrib->value()); + + if($start !~ /\d+/ || $end !~ /\d+/) { + throw('Could not parse value of attribute: '.$attrib->value()); + } + + $name = $attrib->name(); + $code = $attrib->code(); + $desc = $attrib->description(); + + + } + + if(defined($end) && defined($start) && $start > $end+1) { + throw("start must be less than or equal to end + 1"); + } + + if(defined($start) && $start < 1) { + throw("start must be greater than or equal to 1"); + } + + if(defined($end) && $end < 0) { + throw("end must be greater than or equal to 0"); + } + + $alt_seq ||= ''; + + return bless {'start' => $start, + 'end' => $end, + 'alt_seq' => $alt_seq, + 'description' => $desc, + 'name' => $name, + 'code' => $code}, $class; +} + + + +=head2 start + + Arg [1] : (optional) int $start - the new start position + Example : $start = $se_attrib->start(); + Description: Getter/Setter for the start position of the region replaced + by the alt_seq. + + Coordinates are inclusive and one-based, which means that + inserts are unusually represented by a start 1bp higher than + the end. + + E.g. start = 1, end = 1 is a replacement of the first base but + start = 1, end = 0 is an insert BEFORE the first base. + Returntype : int + Exceptions : none + Caller : Transcript, Translation + Status : Stable + +=cut + +sub start { + my $self = shift; + + if(@_) { + my $start = shift; + if(defined($start) && $start < 1) { + throw("start must be greater than or equal to 1"); + } + $self->{'start'} = $start; + } + + return $self->{'start'}; +} + + +=head2 end + + Arg [1] : (optional) int $end - the new end position + Example : $end = $se_attrib->end(); + Description: Getter/Setter for the end position of the region replaced + by the alt_seq. + + Coordinates are inclusive and one-based, which means that + inserts are unusually represented by a start 1bp higher than + the end. + + E.g. start = 1, end = 1 is a replacement of the first base but + start = 1, end = 0 is an insert BEFORE the first base. + Returntype : int + Exceptions : throws if end <= 0 + Caller : Transcript, Translation + Status : Stable + +=cut + +sub end { + my $self = shift; + + if(@_) { + my $end = shift; + if(defined($end) && $end < 0) { + throw("end must be greater than or equal to 0"); + } + $self->{'end'} = $end; + } + + return $self->{'end'}; +} + + +=head2 alt_seq + + Arg [1] : (optional) string $alt_seq + Example : my $alt_seq = $se_attrib->alt_seq(); + Description: Getter/Setter for the replacement sequence used by this edit. + The sequence may either be a string of amino acids or + nucleotides depending on the context in which this edit is + used. + + In the case of a deletion the replacement sequence is an empty + string. + Returntype : string + Exceptions : none + Caller : Transcript, Translation + Status : Stable + +=cut + +sub alt_seq { + my $self = shift; + $self->{'alt_seq'} = shift || '' if(@_); + return $self->{'alt_seq'}; +} + + +=head2 length_diff + + Arg [1] : none + Example : my $diff = $sea->length_diff(); + Description: Returns the difference in length caused by applying this + edit to a sequence. This may be be negative (deletion), + positive (insertion) or 0 (replacement). + + If either start or end are not defined 0 is returned. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length_diff { + my $self = shift; + + return 0 if(!defined($self->{'end'}) || !defined($self->{'start'})); + + return length($self->{'alt_seq'}) - ($self->{'end'} - $self->{'start'} + 1); +} + + + +=head2 name + + Arg [1] : (optional) string $name + Example : my $name = $seqedit->name(); + Description: Getter/Setter for the name of this SeqEdit + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + + + +=head2 code + + Arg [1] : (optional) string $code + Example : my $code = $seqedit->code(); + Description: Getter/Setter for the code of this SeqEdit + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub code { + my $self = shift; + $self->{'code'} = shift if(@_); + return $self->{'code'}; +} + + + +=head2 description + + Arg [1] : (optional) string $desc + Example : my $desc = $seqedit->description(); + Description: Getter/Setter for the description of this SeqEdit + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if(@_); + return $self->{'description'}; +} + + + +=head2 get_Attribute + + Arg [1] : none + Example : my $attrib = $seqedit->get_Attribute(); + $transcript->add_Attributes($attrib); + Description: Converts a SeqEdit object into an Attribute object. This + allows the SeqEdit to be stored as any other attribute in the + ensembl database. The start/end and alt_seq properties + should be set before calling this method. + Returntype : Bio::EnsEMBL::Attribute + Exceptions : warning if start/end or alt_seq properties are not defined + Caller : general + Status : Stable + +=cut + +sub get_Attribute { + my $self = shift; + + my $start = $self->start(); + my $end = $self->end(); + my $alt_seq = $self->alt_seq(); + + my $value; + + if(defined($start) && defined($end) && defined($alt_seq)) { + $value = join(' ', $start, $end, $alt_seq); + } else { + warning('Attribute value cannot be created unless start, end and alt_seq' . + 'properties are defined'); + $value = ''; + } + + return Bio::EnsEMBL::Attribute->new(-CODE => $self->code(), + -VALUE => $value, + -NAME => $self->name(), + -DESCRIPTION => $self->description()); +} + + +=head2 apply_edit + + Arg [1] : reference to string $seqref + Example : $sequence = 'ACTGAATATTTAAGGCA'; + $seqedit->apply_edit(\$sequence); + print $sequence, "\n"; + Description: Applies this edit directly to a sequence which is + passed by reference. The coordinates of this SeqEdit + are assumed to be relative to the start of the sequence + argument. + If either the start or end of this SeqEdit are not defined + this function will not do anything to the passed sequence. + Returntype : reference to the same sequence that was passed in + Exceptions : none + Caller : Transcript, Translation + Status : Stable + +=cut + +sub apply_edit { + my $self = shift; + my $seqref = shift; + + if(ref($seqref) ne 'SCALAR') { + throw("Reference to scalar argument expected"); + } + + if(!defined($self->{'start'}) || !defined($self->{'end'})) { + return $seqref; + } + + my $len = $self->{'end'} - $self->{'start'} + 1; + substr($$seqref, $self->{'start'} - 1, $len) = $self->{'alt_seq'}; + + return $seqref; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SeqFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SeqFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1255 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SeqFeature - Ensembl specific sequence feature. + +=head1 DESCRIPTION + +Do not use this module if you can avoid it. It has been replaced by +Bio::EnsEMBL::Feature. This module has a long history of usage but has +become very bloated, and quite unweildy. It was decided to replace +it completely with a smaller, light-weight feature class rather than +attempting to refactor this class, and maintain strict backwards +compatibility. + +Part of the complexity of this class was in its extensive +inheritance. As an example the following is a simplified inheritance +heirarchy that was present for Bio::EnsEMBL::DnaAlignFeature: + + Bio::EnsEMBL::DnaAlignFeature + Bio::EnsEMBL::BaseAlignFeature + Bio::EnsEMBL::FeaturePair + Bio::EnsEMBL::SeqFeature + Bio::EnsEMBL::SeqFeatureI + Bio::SeqFeatureI + Bio::RangeI + Bio::Root::RootI + +The new Bio::EnsEMBL::Feature class is much shorter, and hopefully much +easier to understand and maintain. + +=head1 METHODS + +=cut + + +# Let the code begin... + + +package Bio::EnsEMBL::SeqFeature; + +use vars qw(@ISA); +use strict; + + +use Bio::EnsEMBL::SeqFeatureI; +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::Root; + +@ISA = qw(Bio::EnsEMBL::Root Bio::EnsEMBL::SeqFeatureI); + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +sub new { + my($caller,@args) = @_; + + my $self = {}; + + if(ref $caller) { + bless $self, ref $caller; + } else { + bless $self, $caller; + } + + $self->{'_gsf_tag_hash'} = {}; + $self->{'_gsf_sub_array'} = []; + $self->{'_parse_h'} = {}; + $self->{'_is_splittable'} = 0; + + my ($start,$end,$strand,$frame,$score,$analysis,$seqname, $source_tag, + $primary_tag, $percent_id, $p_value, $phase, $end_phase) = + + &rearrange([qw(START + END + STRAND + FRAME + SCORE + ANALYSIS + SEQNAME + SOURCE_TAG + PRIMARY_TAG + PERCENT_ID + P_VALUE + PHASE + END_PHASE + )],@args); + + # $gff_string && $self->_from_gff_string($gff_string); + + if ( defined $analysis && $analysis ne "") { $self->analysis($analysis)}; + if ( defined ($start) && $start ne "" ) { $self->start($start)}; + if ( defined ($end ) && $end ne "" ) { $self->end($end)} + if ( defined $strand && $strand ne "") { $self->strand($strand)} + if ( defined $frame && $frame ne "") { $self->frame($frame)} + if ( defined $score && $score ne "") { $self->score($score)} + if ( defined $seqname && $seqname ne "") { $self->seqname($seqname)}; + if ( defined $percent_id && $percent_id ne ""){ $self->percent_id($percent_id)}; + if ( defined $p_value && $p_value ne "") { $self->p_value($p_value)}; + if ( defined $phase && $phase ne "") { $self->phase($phase)}; + if ( defined $end_phase && $end_phase ne "") { $self->end_phase($end_phase)}; + + return $self; # success - we hope! + +} + + + + + + +=head2 start + + Title : start + Usage : $start = $feat->start + $feat->start(20) + Function: Get/set on the start coordinate of the feature + Returns : integer + Args : none + + +=cut + +sub start{ + my ($self,$value) = @_; + + if (defined($value)) { + if ($value !~ /^\-?\d+/ ) { + $self->throw("$value is not a valid start"); + } + + $self->{'_gsf_start'} = $value + } + + return $self->{'_gsf_start'}; + +} + +=head2 end + + Title : end + Usage : $end = $feat->end + $feat->end($end) + Function: get/set on the end coordinate of the feature + Returns : integer + Args : none + + +=cut + +sub end{ + my ($self,$value) = @_; + + if (defined($value)) { + if( $value !~ /^\-?\d+/ ) { + $self->throw("[$value] is not a valid end"); + } + + $self->{'_gsf_end'} = $value; + } + + return $self->{'_gsf_end'}; +} + +=head2 length + + Title : length + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub length{ + my ($self,@args) = @_; + + return $self->end - $self->start +1; +} + + +=head2 strand + + Title : strand + Usage : $strand = $feat->strand() + $feat->strand($strand) + Function: get/set on strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : none + + +=cut + +sub strand { + my ($self,$value) = @_; + + if (defined($value)) { + if( $value eq '+' ) { $value = 1; } + if( $value eq '-' ) { $value = -1; } + if( $value eq '.' ) { $value = 0; } + + if( $value != -1 && $value != 1 && $value != 0 ) { + $self->throw("$value is not a valid strand info"); + } + $self->{'_gsf_strand'} = $value; + } + + return $self->{'_gsf_strand'}; +} + + +=head2 move + + Arg [1] : int $start + Arg [2] : int $end + Arg [3] : (optional) int $strand + Example : $feature->move(100, 200, -1); + Description: Moves a feature to a different location. This is faster + then calling 3 seperate accesors in a large loop. + Returntype : none + Exceptions : none + Caller : BaseFeatureAdaptor + +=cut + +sub move { + my ($self, $start, $end, $strand) = @_; + + $self->{'_gsf_start'} = $start; + $self->{'_gsf_end'} = $end; + if(defined $strand) { + $self->{'_gsf_strand'} = $strand; + } +} + + +=head2 score + + Title : score + Usage : $score = $feat->score() + $feat->score($score) + Function: get/set on score information + Returns : float + Args : none if get, the new value if set + + +=cut + +sub score { + my ($self,$value) = @_; + + if(defined ($value) ) { + if( $value !~ /^[+-]?\d+\.?\d*(e-\d+)?/ ) { + $self->throw("'$value' is not a valid score"); + } + $self->{'_gsf_score'} = $value; + } + + return $self->{'_gsf_score'}; +} + +=head2 frame + + Title : frame + Usage : $frame = $feat->frame() + $feat->frame($frame) + Function: get/set on frame information + Returns : 0,1,2 + Args : none if get, the new value if set + + +=cut + +sub frame { + my ($self,$value) = @_; + + if (defined($value)) { + if( $value != 1 && $value != 2 && $value != 3 ) { + $self->throw("'$value' is not a valid frame"); + } + $self->{'_gsf_frame'} = $value; + } + + return $self->{'_gsf_frame'}; +} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + $feat->primary_tag('exon') + Function: get/set on the primary tag for a feature, + eg 'exon' + Returns : a string + Args : none + + +=cut + +sub primary_tag{ + my ($self,$arg) = @_; + + if (defined($arg)) { + # throw warnings about setting primary tag + my ($p,$f,$l) = caller; + $self->warn("$f:$l setting primary_tag now deprecated." . + "Primary tag is delegated to analysis object"); + } + + unless($self->analysis) { + return ''; + } + + return $self->analysis->gff_feature(); +} + +=head2 source_tag + + Title : source_tag + Usage : $tag = $feat->source_tag() + $feat->source_tag('genscan'); + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub source_tag{ + my ($self,$arg) = @_; + + if (defined($arg)) { + # throw warnings about setting primary tag + my ($p,$f,$l) = caller; + $self->warn("$f:$l setting source_tag now deprecated. " . + "Source tag is delegated to analysis object"); + } + + unless($self->analysis) { + return ""; + } + + return $self->analysis->gff_source(); +} + + +=head2 analysis + + Title : analysis + Usage : $sf->analysis(); + Function: Store details of the program/database + and versions used to create this feature. + + Example : + Returns : + Args : + + +=cut + +sub analysis { + my ($self,$value) = @_; + + if (defined($value)) { + unless(ref($value) && $value->isa('Bio::EnsEMBL::Analysis')) { + $self->throw("Analysis is not a Bio::EnsEMBL::Analysis object " + . "but a $value object"); + } + + $self->{_analysis} = $value; + } else { + #if _analysis is not defined, create a new analysis object + unless(defined $self->{_analysis}) { + $self->{_analysis} = new Bio::EnsEMBL::Analysis(); + } + } + + return $self->{_analysis}; +} + +=head2 validate + + Title : validate + Usage : $sf->validate; + Function: Checks whether all the data is present in the + object. + Example : + Returns : + Args : + + +=cut + +sub validate { + my ($self) = @_; + + $self->vthrow("Seqname not defined in feature") unless defined($self->seqname); + $self->vthrow("start not defined in feature") unless defined($self->start); + $self->vthrow("end not defined in feature") unless defined($self->end); + $self->vthrow("strand not defined in feature") unless defined($self->strand); + $self->vthrow("score not defined in feature") unless defined($self->score); + $self->vthrow("analysis not defined in feature") unless defined($self->analysis); + + if ($self->end < $self->start) { + $self->vthrow("End coordinate < start coordinate"); + } + +} + + + +sub vthrow { + my ($self,$message) = @_; + + print(STDERR "Error validating feature [$message]\n"); + print(STDERR " Seqname : [" . $self->{_seqname} . "]\n"); + print(STDERR " Start : [" . $self->{_gsf_start} . "]\n"); + print(STDERR " End : [" . $self->{_gsf_end} . "]\n"); + print(STDERR " Strand : [" . + ((defined ($self->{_gsf_strand})) ? $self->{_gsf_strand} : "undefined") . "]\n"); + + print(STDERR " Score : [" . $self->{_gsf_score} . "]\n"); + + print(STDERR " Analysis : [" . $self->{_analysis}->dbID . "]\n"); + + $self->throw("Invalid feature - see dump on STDERR"); +} + + +=head2 validate_prot_feature + + Title : validate_prot_feature + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +# Shouldn't this go as "validate" into Pro_SeqFeature? +sub validate_prot_feature{ + my ($self,$num) = @_; + $self->throw("Seqname not defined in feature") unless defined($self->seqname); + $self->throw("start not defined in feature") unless defined($self->start); + $self->throw("end not defined in feature") unless defined($self->end); + if ($num == 1) { + $self->throw("score not defined in feature") unless defined($self->score); + $self->throw("percent_id not defined in feature") unless defined($self->percent_id); + $self->throw("evalue not defined in feature") unless defined($self->p_value); + } + $self->throw("analysis not defined in feature") unless defined($self->analysis); +} + + + +# These methods are specified in the SeqFeatureI interface but we don't want +# people to store data in them. These are just here in order to keep +# existing code working + + +=head2 has_tag + + Title : has_tag + Usage : $value = $self->has_tag('some_tag') + Function: Returns the value of the tag (undef if + none) + Returns : + Args : + + +=cut + +sub has_tag{ + my ($self,$tag) = (shift, shift); + + return exists $self->{'_gsf_tag_hash'}->{$tag}; +} + +=head2 add_tag_value + + Title : add_tag_value + Usage : $self->add_tag_value('note',"this is a note"); + Returns : nothing + Args : tag (string) and value (any scalar) + + +=cut + +sub add_tag_value{ + my ($self,$tag,$value) = @_; + + if( !defined $self->{'_gsf_tag_hash'}->{$tag} ) { + $self->{'_gsf_tag_hash'}->{$tag} = []; + } + + push(@{$self->{'_gsf_tag_hash'}->{$tag}},$value); +} + +=head2 each_tag_value + + Title : each_tag_value + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub each_tag_value { + my ($self,$tag) = @_; + if( ! exists $self->{'_gsf_tag_hash'}->{$tag} ) { + $self->throw("asking for tag value that does not exist $tag"); + } + + return @{$self->{'_gsf_tag_hash'}->{$tag}}; +} + + +=head2 all_tags + + Title : all_tags + Usage : @tags = $feat->all_tags() + Function: gives all tags for this feature + Returns : an array of strings + Args : none + + +=cut + +sub all_tags{ + my ($self,@args) = @_; + + return keys %{$self->{'_gsf_tag_hash'}}; +} + + + +=head2 seqname + + Arg [1] : string $seqname + Example : $seqname = $self->seqname(); + Description: Obtains the seqname of this features sequence. This is set + automatically when a sequence with a name is attached, or may + be set manually. + Returntype : string + Exceptions : none + Caller : general, attach_seq + +=cut + +sub seqname{ + my ($self,$seqname) = @_; + + my $seq = $self->contig(); + + if(defined $seqname) { + $self->{_seqname} = $seqname; + } else { + if($seq && ref $seq && $seq->can('name')) { + $self->{_seqname} = $seq->name(); + } + } + + return $self->{_seqname}; +} + + +=head2 attach_seq + + Title : attach_seq + Usage : $sf->attach_seq($seq) + Function: Attaches a Bio::PrimarySeqI object to this feature. This + Bio::PrimarySeqI object is for the *entire* sequence: ie + from 1 to 10000 + Example : + Returns : + Args : + + +=cut + +sub attach_seq{ + my ($self, $seq) = @_; + + $self->contig($seq); +} + +=head2 seq + + Example : $tseq = $sf->seq() + Function: returns the sequence (if any ) for this feature truncated to the range spanning the feature + Returns : a Bio::PrimarySeq object (I reckon) + +=cut + +sub seq{ + my ($self,$arg) = @_; + + if( defined $arg ) { + $self->throw("Calling SeqFeature::Generic->seq with an argument. " . + "You probably want attach_seq"); + } + + if( ! exists $self->{'_gsf_seq'} ) { + return undef; + } + + # assumming our seq object is sensible, it should not have to yank + # the entire sequence out here. + + my $seq = $self->{'_gsf_seq'}->trunc($self->start(),$self->end()); + + + if( $self->strand == -1 ) { + + # ok. this does not work well (?) + #print STDERR "Before revcom", $seq->str, "\n"; + $seq = $seq->revcom; + #print STDERR "After revcom", $seq->str, "\n"; + } + + return $seq; +} + +=head2 entire_seq + + Title : entire_seq + Usage : $whole_seq = $sf->entire_seq() + Function: gives the entire sequence that this seqfeature is attached to + Example : + Returns : + Args : + + +=cut + +sub entire_seq{ + my ($self) = @_; + + return $self->contig; +} + + +=head2 sub_SeqFeature + + Title : sub_SeqFeature + Usage : @feats = $feat->sub_SeqFeature(); + Function: Returns an array of sub Sequence Features + Returns : An array + Args : none + + +=cut + +sub sub_SeqFeature { + my ($self) = @_; + + if ( $self->{'_gsf_sub_array'} ) { + return @{ $self->{'_gsf_sub_array'} }; + } else { + return (); + } +} + +=head2 add_sub_SeqFeature + + Title : add_sub_SeqFeature + Usage : $feat->add_sub_SeqFeature($subfeat); + $feat->add_sub_SeqFeature($subfeat,'EXPAND') + Function: adds a SeqFeature into the subSeqFeature array. + with no 'EXPAND' qualifer, subfeat will be tested + as to whether it lies inside the parent, and throw + an exception if not. + + If EXPAND is used, the parents start/end/strand will + be adjusted so that it grows to accommodate the new + subFeature + Returns : nothing + Args : An object which has the SeqFeatureI interface + + +=cut + +sub add_sub_SeqFeature{ + my ($self,$feat,$expand) = @_; + + if( !$feat->isa('Bio::SeqFeatureI') ) { + $self->warn("$feat does not implement Bio::SeqFeatureI. Will add it anyway, but beware..."); + } + + if( $expand eq 'EXPAND' ) { + # if this doesn't have start/end set - forget it! + if( !defined $self->start && !defined $self->end ) { + $self->start($feat->start()); + $self->end($feat->end()); + $self->strand($feat->strand); + } else { + my ($start,$end); + if( $feat->start < $self->start ) { + $start = $feat->start; + } + + if( $feat->end > $self->end ) { + $end = $feat->end; + } + + $self->start($start); + $self->end($end); + + } + } else { + if( !defined($feat->start()) || !defined($feat->end()) || + !defined($self->start()) || !defined($self->end())) { + $self->throw( "This SeqFeature and the sub_SeqFeature must define". + " start and end."); + } + if($feat->start() > $feat->end() || $self->start() > $self->end()) { + $self->throw("This SeqFeature and the sub_SeqFeature must have " . + "start that is less than or equal to end."); + } + if($feat->start() < $self->start() || $feat->end() > $self->end() ) { + $self->throw("$feat is not contained within parent feature, " . + "and expansion is not valid"); + } + } + + push(@{$self->{'_gsf_sub_array'}},$feat); + +} + +=head2 flush_sub_SeqFeature + + Title : flush_sub_SeqFeature + Usage : $sf->flush_sub_SeqFeature + Function: Removes all sub SeqFeature + (if you want to remove only a subset, take + an array of them all, flush them, and add + back only the guys you want) + Example : + Returns : none + Args : none + + +=cut + +sub flush_sub_SeqFeature { + my ($self) = @_; + + $self->{'_gsf_sub_array'} = []; # zap the array implicitly. +} + + +sub id { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{_id} = $value; + } + + return $self->{_id}; + +} + +=head2 percent_id + + Title : percent_id + Usage : $pid = $feat->percent_id() + $feat->percent_id($pid) + Function: get/set on percentage identity information + Returns : float + Args : none if get, the new value if set + +=cut + +sub percent_id { + my ($self,$value) = @_; + + if (defined($value)) + { + $self->{_percent_id} = $value; + } + + return $self->{_percent_id}; +} + +=head2 p_value + + Title : p_value + Usage : $p_val = $feat->p_value() + $feat->p_value($p_val) + Function: get/set on p value information + Returns : float + Args : none if get, the new value if set + +=cut + +sub p_value { + my ($self,$value) = @_; + + if (defined($value)) + { + $self->{_p_value} = $value; + } + + return $self->{_p_value}; +} + +=head2 phase + + Title : phase + Usage : $phase = $feat->phase() + $feat->phase($phase) + Function: get/set on start phase of predicted exon feature + Returns : [0,1,2] + Args : none if get, 0,1 or 2 if set. + +=cut + +sub phase { + my ($self, $value) = @_; + + if (defined($value) ) + { + $self->throw("Valid values for Phase are [0,1,2]") if ($value < 0 || $value > 2); + $self->{_phase} = $value; + } + + return $self->{_phase}; +} + +=head2 end_phase + + Title : end_phase + Usage : $end_phase = $feat->end_phase() + $feat->end_phase($end_phase) + Function: returns end_phase based on phase and length of feature + Returns : [0,1,2] + Args : none if get, 0,1 or 2 if set. + +=cut + +sub end_phase { + my ($self, $value) = @_; + + if (defined($value)) + { + $self->throw("Valid values for Phase are [0,1,2]") if ($value < 0 || $value > 2); + $self->{_end_phase} = $value; + } + + return $self->{_end_phase}; +} + + +sub gffstring { + my ($self) = @_; + + my $str; + + my $strand = "+"; + + if ((defined $self->strand)&&($self->strand == -1)) { + $strand = "-"; + } + + $str .= (defined $self->seqname) ? $self->seqname."\t" : "\t"; + $str .= (defined $self->source_tag) ? $self->source_tag."\t" : "\t"; + $str .= (defined $self->primary_tag) ? $self->primary_tag."\t" : "\t"; + $str .= (defined $self->start) ? $self->start."\t" : "\t"; + $str .= (defined $self->end) ? $self->end."\t" : "\t"; + $str .= (defined $self->score) ? $self->score."\t" : "\t"; + $str .= (defined $self->strand) ? $strand."\t" : ".\t"; + $str .= (defined $self->phase) ? $self->phase."\t" : ".\t"; + eval{ + $str .= (defined $self->end_phase) ? $self->end_phase."\t" : ".\t"; + }; + + return $str; +} + + +=head2 external_db + + Title : external_db + Usage : $pid = $feat->external_db() + $feat->external_db($dbid) + Function: get/set for an external db accession number (e.g.: Interpro) + Returns : + Args : none if get, the new value if set + +=cut + +sub external_db { + my ($self,$value) = @_; + + if (defined($value)) + { + $self->{'_external_db'} = $value; + } + + return $self->{'_external_db'}; +} + + + +=head2 contig + + Arg [1] : Bio::PrimarySeqI $seq + Example : $seq = $self->contig; + Description: Accessor to attach/retrieve a sequence to/from a feature + Returntype : Bio::PrimarySeqI + Exceptions : none + Caller : general + +=cut + +sub contig { + my ($self, $arg) = @_; + + if ($arg) { + unless (defined $arg && ref $arg && $arg->isa("Bio::PrimarySeqI")) { + $self->throw("Must attach Bio::PrimarySeqI objects to SeqFeatures"); + } + + $self->{'_gsf_seq'} = $arg; + + # attach to sub features if they want it + + foreach my $sf ($self->sub_SeqFeature) { + if ($sf->can("attach_seq")) { + $sf->attach_seq($arg); + } + } + } + #print STDERR "contig is ".$self->{'_gsf_seq'}." with name ".$self->{'_gsf_seq'}->name."\n" unless(!$self->{'_gsf_seq'}); +# my ($p, $f, $l) = caller; +# print STDERR "Caller = ".$f." ".$l."\n"; + return $self->{'_gsf_seq'}; +} + + + +sub is_splittable { + my ($self, $arg) = @_; + + if (defined $arg) { + $self->{'_is_splittable'} = $arg; + } + return $self->{'_is_splittable'}; +} + + +sub transform { + my ($self, $slice) = @_; + + unless (defined $slice) { + + if ((defined $self->contig) && + ($self->contig->isa("Bio::EnsEMBL::RawContig"))) { + + # we are already in rawcontig coords, nothing needs to be done + return $self; + + } + else { + # transform to raw_contig coords from Slice coords + return $self->_transform_to_RawContig(); + } + } + + if (defined $self->contig) { + + if ($self->contig->isa("Bio::EnsEMBL::RawContig")) { + + # transform to slice coords from raw contig coords + return $self->_transform_to_Slice($slice); + } + elsif ($self->contig->isa( "Bio::EnsEMBL::Slice" ) or $self->contig->isa( "Bio::EnsEMBL::LRGSlice" )) { + + # transform to slice coords from other slice coords + return $self->_transform_between_Slices($slice); + } + else { + + # Unknown contig type + $self->throw("Cannot transform unknown contig type @{[$self->contig]}"); + } + } + else { + + #Can't convert to slice coords without a contig to work with + return $self->throw("Object's contig is not defined - cannot transform"); + } + +} + + +sub _transform_to_Slice { + my ($self, $slice) = @_; + + $self->throw("can't transform coordinates of $self without a contig defined") + unless $self->contig; + + unless($self->contig->adaptor) { + $self->throw("cannot transform coordinates of $self without adaptor " . + "attached to contig"); + } + + my $dbh = $self->contig->adaptor->db; + + my $mapper = + $dbh->get_AssemblyMapperAdaptor->fetch_by_type($slice->assembly_type); + my $rca = $dbh->get_RawContigAdaptor; + + my @mapped = $mapper->map_coordinates_to_assembly( + $self->contig->dbID, + $self->start, + $self->end, + $self->strand + ); + + unless (@mapped) { + $self->throw("couldn't map $self to Slice"); + } + + unless (@mapped == 1) { + $self->throw("$self should only map to one chromosome - " . + "something bad has happened ..."); + } + + if ($mapped[0]->isa("Bio::EnsEMBL::Mapper::Gap")) { + $self->warn("feature lies on gap\n"); + return; + } + + if( ! defined $slice->chr_name() ) { + my $slice_adaptor = $slice->adaptor(); + %$slice = %{$slice_adaptor->fetch_by_chr_name( $mapped[0]->id() )}; + } + + # mapped coords are on chromosome - need to convert to slice + if($slice->strand == 1) { + $self->start ($mapped[0]->start - $slice->chr_start + 1); + $self->end ($mapped[0]->end - $slice->chr_start + 1); + $self->strand ($mapped[0]->strand); + } else { + $self->start ($slice->chr_end - $mapped[0]->end + 1); + $self->end ($slice->chr_end - $mapped[0]->start + 1); + $self->strand ($mapped[0]->strand * -1); + } + + $self->seqname($mapped[0]->id); + + #set the contig to the slice + $self->contig($slice); + + return $self; +} + + +sub _transform_between_Slices { + my ($self, $to_slice) = @_; + + my $from_slice = $self->contig; + + $self->throw("New contig [$to_slice] is not a Bio::EnsEMBL::Slice") + unless ($to_slice->isa("Bio::EnsEMBL::Slice") or $to_slice->isa("Bio::EnsEMBL::LRGSlice") ); + + if ((my $c1 = $from_slice->chr_name) ne (my $c2 = $to_slice->chr_name)) { + $self->warn("Can't transform between chromosomes: $c1 and $c2"); + return; + } + + my($start, $end, $strand); + + #first convert to assembly coords + if($from_slice->strand == 1) { + $start = $from_slice->chr_start + $self->start - 1; + $end = $from_slice->chr_start + $self->end - 1; + $strand = $self->strand; + } else { + $start = $from_slice->chr_end - $self->end + 1; + $end = $from_slice->chr_end - $self->start + 1; + $strand = $self->strand; + } + + #now convert to the other slice's coords + if($to_slice->strand == 1) { + $self->start ($start - $to_slice->chr_start + 1); + $self->end ($end - $to_slice->chr_start + 1); + $self->strand($strand); + } else { + $self->start ($to_slice->chr_end - $end + 1); + $self->end ($to_slice->chr_end - $start + 1); + $self->strand($strand * -1); + } + + $self->contig($to_slice); + + return $self; +} + + +sub _transform_to_RawContig { + my($self) = @_; + + #print STDERR "transforming ".$self." to raw contig coords\n"; + $self->throw("can't transform coordinates of $self without a contig defined") + unless $self->contig; + + my $slice = $self->contig; + + unless($slice->adaptor) { + $self->throw("can't transform coordinates of $self without an adaptor " . + "attached to the feature's slice"); + } + + my $dbh = $slice->adaptor->db; + + my $mapper = + $dbh->get_AssemblyMapperAdaptor->fetch_by_type($slice->assembly_type); + my $rca = $dbh->get_RawContigAdaptor; + + #first convert the features coordinates to assembly coordinates + my($start, $end, $strand); + if($slice->strand == 1) { + $start = $slice->chr_start + $self->start - 1; + $end = $slice->chr_start + $self->end - 1; + $strand = $self->strand; + } else { + $start = $slice->chr_end - $self->end + 1; + $end = $slice->chr_end - $self->start + 1; + $strand = $self->strand * -1; + } + + #convert the assembly coordinates to RawContig coordinates + my @mapped = $mapper->map_coordinates_to_rawcontig( + $slice->chr_name, + $start, + $end, + $strand + ); + + unless (@mapped) { + $self->throw("couldn't map $self"); + return $self; + } + + if (@mapped == 1) { + + if ($mapped[0]->isa("Bio::EnsEMBL::Mapper::Gap")) { + $self->warn("feature lies on gap\n"); + return; + } + + my $rc = $rca->fetch_by_dbID($mapped[0]->id); + + $self->start ($mapped[0]->start); + $self->end ($mapped[0]->end); + $self->strand ($mapped[0]->strand); + $self->seqname ($mapped[0]->id); + #print STDERR "setting contig to be ".$mapped[0]->id."\n"; + $self->contig($rca->fetch_by_dbID($mapped[0]->id)); + + return $self; + } + else { + + # more than one object returned from mapper + # possibly more than one RawContig in region + + my (@gaps, @coords); + + foreach my $m (@mapped) { + + if ($m->isa("Bio::EnsEMBL::Mapper::Gap")) { + push @gaps, $m; + } + elsif ($m->isa("Bio::EnsEMBL::Mapper::Coordinate")) { + push @coords, $m; + } + } + + # case where only one RawContig maps + if (@coords == 1) { + + $self->start ($coords[0]->start); + $self->end ($coords[0]->end); + $self->strand ($coords[0]->strand); + $self->seqname($coords[0]->id); + #print STDERR "2 setting contig to be ".$coords[0]->id."\n"; + $self->contig ($rca->fetch_by_dbID($coords[0]->id)); + + $self->warn("Feature [$self] truncated as lies partially on a gap"); + return $self; + } + + unless ($self->is_splittable) { + $self->warn("Feature spans >1 raw contig - can't split\n"); + return; + } + + my @out; + my $obj = ref $self; + + SPLIT: foreach my $map (@mapped) { + + if ($map->isa("Bio::EnsEMBL::Mapper::Gap")) { + $self->warn("piece of evidence lies on gap\n"); + next SPLIT; + } + + my $feat = $obj->new; + + $feat->start ($map->start); + $feat->end ($map->end); + $feat->strand ($map->strand); + #print STDERR "3 setting contig to be ".$mapped[0]->id."\n"; + $feat->contig ($rca->fetch_by_dbID($map->id)); + $feat->adaptor($self->adaptor) if $self->adaptor(); + $feat->display_label($self->display_label) if($self->can('display_label')); + $feat->analysis($self->analysis); + push @out, $feat; + } + + return @out; + } +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SeqFeatureI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SeqFeatureI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,178 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SeqFeatureI + +=head1 DESCRIPTION + +Do not use this class. It is deprecated and has been replaced by +Bio::EnsEMBL::Feature. + +=head1 METHODS + +=cut + + +# Let the code begin... + + +package Bio::EnsEMBL::SeqFeatureI; + +use vars qw(@ISA); +use strict; +use Carp; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqFeatureI; +use Bio::EnsEMBL::Root; + +@ISA = qw(Bio::EnsEMBL::Root Bio::SeqFeatureI); + + +=head1 Abstract methods + +These methods must be implemented in all subclasses. + + +=head2 analysis + + Title : analysis + Usage : $sf->analysis(); + Function: Store details of the program/database + and versions used to create this feature. + + Example : + Returns : + Args : + + +=cut + +sub analysis { + my ($self,$value) = @_; + + $self->throw("Have not implemeneted analysis"); + +} + +=head2 validate + + Title : validate + Usage : $sf->validate; + Function: Checks whether all the data is present in the + object. + Example : + Returns : + Args : + + +=cut + +sub validate { + my ($self,$value) = @_; + + $self->throw("Have not implemeneted validate"); + + +} + + +sub id { + my ($self,$value) = @_; + + $self->throw("Have not implemented id"); +} + +=head2 percent_id + + Title : percent_id + Usage : $pid = $feat->percent_id() + $feat->percent_id($pid) + Function: get/set on percentage identity information + Returns : float + Args : none if get, the new value if set + +=cut + +sub percent_id { + my ($self) = @_; + $self->throw("percent_id() not yet implemented"); +} + +=head2 e_value + + Title : p_value + Usage : $p_val = $feat->p_value() + $feat->p_value($p_val) + Function: get/set on p value information + Returns : float + Args : none if get, the new value if set + +=cut + +sub e_value { + my ($self) = @_; + $self->throw("e value() not yet implemented"); +} + +=head2 phase + + Title : phase + Usage : $phase = $feat->phase() + $feat->phase($phase) + Function: get/set on start phase of predicted exon feature + Returns : [0,1,2] + Args : none if get, 0,1 or 2 if set. + +=cut + +sub phase { + my ($self) = @_; + $self->throw("phase() not yet implemented"); +} + +=head2 end_phase + + Title : end_phase + Usage : $end_phase = $feat->end_phase() + $feat->end_phase($end_phase) + Function: get/set on end phase of predicted exon feature + Returns : [0,1,2] + Args : none if get, 0,1 or 2 if set. + +=cut + +sub end_phase { + my ($self) = @_; + $self->throw("end_phase() not yet implemented"); +} + + +# this is a bit too sneaky. +sub location { + my ($self)= @_; + return $self; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SeqRegionSynonym.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SeqRegionSynonym.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,108 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SeqRegionSynonym - +Object representing an alternatice name. + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +This object holds information about alternative name to +Ensembl seq regions. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::SeqRegionSynonym; + +use strict; +use warnings; +no warnings qw(uninitialized); + +use Bio::EnsEMBL::Storable; +use Bio::Annotation::DBLink; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(deprecate); + +our @ISA = qw(Bio::EnsEMBL::Storable); + +=head2 new + + Args [...] : list of named parameters + Example : my $srs = new Bio::EnsEMBL::SeqRegionSynonym( + -adaptor => $adaptor, + -synonym => $alt_name, + -external_db_id => 1234 + -seq_region_id => 12); + Description: Creates a new SeqRegionSynonym object + Returntype : Bio::EnsEMBL::SeqRegionSynonym + Exceptions : none + Caller : Bio::EnsEMBL::SeqRegionSynonymAdaptor + Status : At Risk +=cut + +sub new { + my ($class, @args) = @_; + + my $self = bless {},$class; + + my ( $adaptor, $synonym, $ex_db, $seq_region_id, $dbid) = + rearrange ( ['ADAPTOR','SYNONYM','EXTERNAL_DB_ID','SEQ_REGION_ID','DBID'], @args ); + + $self->adaptor($adaptor); + + if( defined $ex_db ) { $self->external_db_id( $ex_db ) } + if( defined $seq_region_id ) { $self->seq_region_id( $seq_region_id ) } + if (defined $dbid) { $self->{'dbID'} = $dbid} + + if( defined $synonym ) { + $self->name( $synonym ) ; + } else { + warn "No alternative name given\n"; + return undef; + } + + return $self; +} + +sub name{ + my $self = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + +sub external_db_id{ + my $self = shift; + $self->{'ex_db'} = shift if(@_); + return $self->{'ex_db'}; +} + +sub seq_region_id{ + my $self = shift; + $self->{'seq_region_id'} = shift if(@_); + return $self->{'seq_region_id'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SimpleFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SimpleFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,187 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SimpleFeature - A simple feature with a location and label + +=head1 SYNOPSIS + + use Bio::EnsEMBL::SimpleFeature; + + $feature = Bio::EnsEMBL::SimpleFeature->new( + -start => 100, + -end => 220, + -strand => -1, + -slice => $slice, + -analysis => $analysis, + -score => 58, + -display_label => 'EponineTSS', + -dbID => 1230, + -adaptor => $adaptor + ); + +=head1 DESCRIPTION + +This is a simple feature which extends the Feature class to add +display_label and score attributes. + +=head1 METHODS + +=cut + +use strict; + +package Bio::EnsEMBL::SimpleFeature; + +use vars qw(@ISA); + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Scalar::Util qw(weaken isweak); + +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [DISPLAY_LABEL]: The label assigned to this simple feature + Arg [...] : Named arguments passed to superclass + Example : $feature = Bio::EnsEMBL::SimpleFeature->new + (-start => 1, + -end => 100, + -strand => 1, + -slice => $slice, + -analysis => $analysis, + -adaptor => $adaptor, + -dbID => 10, + -display_label => 'EponineTSS', + -score => 100); + Description: Constructs a new Bio::EnsEMBL::Feature. Generally subclasses + of this method are instantiated, rather than this class itself. + Returntype : Bio::EnsEMBL::Feature + Exceptions : Thrown on invalid -SLICE, -ANALYSIS, -STRAND arguments + Caller : general, subclass constructors + Status : Stable + +=cut + +sub new { + my $caller = shift; + + #allow this to be called as class or object method + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($display_label, $score) = rearrange(['DISPLAY_LABEL','SCORE'],@_); + + $self->{'display_label'} = $display_label; + $self->{'score'} = $score; + + return $self; +} + + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Feature using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Feature + Caller : general, subclass constructors + Status : Stable + +=cut + + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + + +=head2 display_label + + Arg [1] : (optional) string $value + Example : $label = $simple_feature->display_label(); + Description: Getter/Setter for the display label associated with this + feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_label{ + my $self = shift; + + $self->{'display_label'} = shift if(@_); + + return $self->{'display_label'}; +} + + +=head2 display_id + + Arg [1] : none + Example : print $rf->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For simple features this is the + display_label if it is available otherwise it is an empty + string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'display_label'} || ''; +} + + + +=head2 score + + Arg [1] : (optional) string $value + Example : $score = $simple_feature->score(); + Description: Getter/Setter for the score associated with this + feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub score { + my $self = shift; + $self->{'score'} = shift if(@_); + return $self->{'score'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Slice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Slice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,3970 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Slice - Arbitary Slice of a genome + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + # get some attributes of the slice + my $seqname = $slice->seq_region_name(); + my $start = $slice->start(); + my $end = $slice->end(); + + # get the sequence from the slice + my $seq = $slice->seq(); + + # get some features from the slice + foreach $gene ( @{ $slice->get_all_Genes } ) { + # do something with a gene + } + + foreach my $feature ( @{ $slice->get_all_DnaAlignFeatures } ) { + # do something with dna-dna alignments + } + +=head1 DESCRIPTION + +A Slice object represents a region of a genome. It can be used to retrieve +sequence or features from an area of interest. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Slice; +use vars qw(@ISA); +use strict; + +use Bio::PrimarySeqI; + + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +use Bio::EnsEMBL::RepeatMaskedSlice; +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::ProjectionSegment; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Utils::Iterator; +use Bio::EnsEMBL::DBSQL::MergedAdaptor; + +use Bio::EnsEMBL::StrainSlice; +#use Bio::EnsEMBL::IndividualSlice; +#use Bio::EnsEMBL::IndividualSliceFactory; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use Bio::EnsEMBL::SeqRegionSynonym; +use Scalar::Util qw(weaken isweak); + +# use Data::Dumper; + +my $reg = "Bio::EnsEMBL::Registry"; + +@ISA = qw(Bio::PrimarySeqI); + + +=head2 new + + Arg [...] : List of named arguments + Bio::EnsEMBL::CoordSystem COORD_SYSTEM + string SEQ_REGION_NAME, + int START, + int END, + int SEQ_REGION_LENGTH, (optional) + string SEQ (optional) + int STRAND, (optional, defaults to 1) + Bio::EnsEMBL::DBSQL::SliceAdaptor ADAPTOR (optional) + Example : $slice = Bio::EnsEMBL::Slice->new(-coord_system => $cs, + -start => 1, + -end => 10000, + -strand => 1, + -seq_region_name => 'X', + -seq_region_length => 12e6, + -adaptor => $slice_adaptor); + Description: Creates a new slice object. A slice represents a region + of sequence in a particular coordinate system. Slices can be + used to retrieve sequence and features from an area of + interest in a genome. + + Coordinates start at 1 and are inclusive. Negative + coordinates or coordinates exceeding the length of the + seq_region are permitted. Start must be less than or equal. + to end regardless of the strand. + + Slice objects are immutable. Once instantiated their attributes + (with the exception of the adaptor) may not be altered. To + change the attributes a new slice must be created. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throws if start, end, coordsystem or seq_region_name not specified or not of the correct type + Caller : general, Bio::EnsEMBL::SliceAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + + #new can be called as a class or object method + my $class = ref($caller) || $caller; + + my ($seq, $coord_system, $seq_region_name, $seq_region_length, + $start, $end, $strand, $adaptor, $empty) = + rearrange([qw(SEQ COORD_SYSTEM SEQ_REGION_NAME SEQ_REGION_LENGTH + START END STRAND ADAPTOR EMPTY)], @_); + + #empty is only for backwards compatibility + if ($empty) { + deprecate( "Creation of empty slices is no longer needed" + . "and is deprecated" ); + my $self = bless( { 'empty' => 1 }, $class ); + $self->adaptor($adaptor); + return $self; + } + + if ( !defined($seq_region_name) ) { + throw('SEQ_REGION_NAME argument is required'); + } + if ( !defined($start) ) { throw('START argument is required') } + if ( !defined($end) ) { throw('END argument is required') } + + ## if ( $start > $end + 1 ) { + ## throw('start must be less than or equal to end+1'); + ## } + + if ( !defined($seq_region_length) ) { $seq_region_length = $end } + + if ( $seq_region_length <= 0 ) { + throw('SEQ_REGION_LENGTH must be > 0'); + } + + if ( defined($seq) && CORE::length($seq) != ( $end - $start + 1 ) ) { + throw('SEQ must be the same length as the defined LENGTH not ' + . CORE::length($seq) + . ' compared to ' + . ( $end - $start + 1 ) ); + } + + if(defined($coord_system)) { + if(!ref($coord_system) || !$coord_system->isa('Bio::EnsEMBL::CoordSystem')){ + throw('COORD_SYSTEM argument must be a Bio::EnsEMBL::CoordSystem'); + } + if($coord_system->is_top_level()) { + throw('Cannot create slice on toplevel CoordSystem.'); + } + } else { + warning("Slice without coordinate system"); + #warn(stack_trace_dump()); + } + + $strand ||= 1; + + if($strand != 1 && $strand != -1) { + throw('STRAND argument must be -1 or 1'); + } + + if(defined($adaptor)) { + if(!ref($adaptor) || !$adaptor->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) { + throw('ADAPTOR argument must be a Bio::EnsEMBL::DBSQL::SliceAdaptor'); + } + } + + my $self = bless {'coord_system' => $coord_system, + 'seq' => $seq, + 'seq_region_name' => $seq_region_name, + 'seq_region_length' => $seq_region_length, + 'start' => int($start), + 'end' => int($end), + 'strand' => $strand}, $class; + + $self->adaptor($adaptor); + + return $self; + +} + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Slice using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Slice + Caller : general + Status : Stable + +=cut + + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +=head2 adaptor + + Arg [1] : (optional) Bio::EnsEMBL::DBSQL::SliceAdaptor $adaptor + Example : $adaptor = $slice->adaptor(); + Description: Getter/Setter for the slice object adaptor used + by this slice for database interaction. + Returntype : Bio::EnsEMBL::DBSQL::SliceAdaptor + Exceptions : thorws if argument passed is not a SliceAdaptor + Caller : general + Status : Stable + +=cut + +sub adaptor{ + my $self = shift; + + if(@_) { + my $ad = shift; + if(defined($ad)) { + if(!ref($ad) || !$ad->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) { + throw('Argument must be a Bio::EnsEMBL::DBSQL::SliceAdaptor'); + } + } + weaken($self->{'adaptor'} = $ad); + } + + return $self->{'adaptor'}; +} + + + +=head2 seq_region_name + + Arg [1] : none + Example : $seq_region = $slice->seq_region_name(); + Description: Returns the name of the seq_region that this slice is on. For + example if this slice is in chromosomal coordinates the + seq_region_name might be 'X' or '10'. + + This function was formerly named chr_name, but since slices can + now be on coordinate systems other than chromosomal it has been + changed. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq_region_name { + my $self = shift; + return $self->{'seq_region_name'}; +} + + + +=head2 seq_region_length + + Arg [1] : none + Example : $seq_region_length = $slice->seq_region_length(); + Description: Returns the length of the entire seq_region that this slice is + on. For example if this slice is on a chromosome this will be + the length (in basepairs) of the entire chromosome. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq_region_length { + my $self = shift; + return $self->{'seq_region_length'}; +} + + +=head2 coord_system + + Arg [1] : none + Example : print $slice->coord_system->name(); + Description: Returns the coordinate system that this slice is on. + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coord_system { + my $self = shift; + return $self->{'coord_system'}; +} + +=head2 coord_system_name + + Arg [1] : none + Example : print $slice->coord_system_name() + Description: Convenience method. Gets the name of the coord_system which + this slice is on. + Returns undef if this Slice does not have an attached + CoordSystem. + Returntype: string or undef + Exceptions: none + Caller : general + Status : Stable + +=cut + +sub coord_system_name { + my $self = shift; + my $csystem = $self->{'coord_system'}; + return ($csystem) ? $csystem->name() : undef; +} + + +=head2 centrepoint + + Arg [1] : none + Example : $cp = $slice->centrepoint(); + Description: Returns the mid position of this slice relative to the + start of the sequence region that it was created on. + Coordinates are inclusive and start at 1. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub centrepoint { + my $self = shift; + return ($self->{'start'}+$self->{'end'})/2; +} + +=head2 start + + Arg [1] : none + Example : $start = $slice->start(); + Description: Returns the start position of this slice relative to the + start of the sequence region that it was created on. + Coordinates are inclusive and start at 1. Negative coordinates + or coordinates exceeding the length of the sequence region are + permitted. Start is always less than or equal to end + regardless of the orientation of the slice. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub start { + my $self = shift; + return $self->{'start'}; +} + + + +=head2 end + + Arg [1] : none + Example : $end = $slice->end(); + Description: Returns the end position of this slice relative to the + start of the sequence region that it was created on. + Coordinates are inclusive and start at 1. Negative coordinates + or coordinates exceeding the length of the sequence region are + permitted. End is always greater than or equal to start + regardless of the orientation of the slice. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub end { + my $self = shift; + return $self->{'end'}; +} + + + +=head2 strand + + Arg [1] : none + Example : $strand = $slice->strand(); + Description: Returns the orientation of this slice on the seq_region it has + been created on + Returntype : int (either 1 or -1) + Exceptions : none + Caller : general, invert + Status : Stable + +=cut + +sub strand{ + my $self = shift; + return $self->{'strand'}; +} + + + + + +=head2 name + + Arg [1] : none + Example : my $results = $cache{$slice->name()}; + Description: Returns the name of this slice. The name is formatted as a colon + delimited string with the following attributes: + coord_system:version:seq_region_name:start:end:strand + + Slices with the same name are equivalent and thus the name can + act as a hash key. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name { + my $self = shift; + + my $cs = $self->{'coord_system'}; + + return join(':', + ($cs) ? $cs->name() : '', + ($cs) ? $cs->version() : '', + $self->{'seq_region_name'}, + $self->{'start'}, + $self->{'end'}, + $self->{'strand'}); +} + + + +=head2 length + + Arg [1] : none + Example : $length = $slice->length(); + Description: Returns the length of this slice in basepairs + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length { + my ($self) = @_; + + my $length = $self->{'end'} - $self->{'start'} + 1; + + if ( $self->{'start'} > $self->{'end'} && $self->is_circular() ) { + $length += $self->{'seq_region_length'}; + } + + return $length; +} + +=head2 is_reference + Arg : none + Example : my $reference = $slice->is_reference() + Description: Returns 1 if slice is a reference slice else 0 + Returntype : int + Caller : general + Status : At Risk + +=cut + +sub is_reference { + my ($self) = @_; + + if ( !defined( $self->{'is_reference'} ) ) { + $self->{'is_reference'} = + $self->adaptor()->is_reference( $self->get_seq_region_id() ); + } + + return $self->{'is_reference'}; +} + +=head2 is_toplevel + Arg : none + Example : my $top = $slice->is_toplevel() + Description: Returns 1 if slice is a toplevel slice else 0 + Returntype : int + Caller : general + Status : At Risk + +=cut + +sub is_toplevel { + my ($self) = @_; + + if ( !defined( $self->{'toplevel'} ) ) { + $self->{'toplevel'} = + $self->adaptor()->is_toplevel( $self->get_seq_region_id() ); + } + + return $self->{'toplevel'}; +} + +=head2 is_circular + Arg : none + Example : my $circ = $slice->is_circular() + Description: Returns 1 if slice is a circular slice else 0 + Returntype : int + Caller : general + Status : Stable + +=cut + +sub is_circular { + my ($self) = @_; + my $adaptor = $self->adaptor(); + return 0 if ! defined $adaptor; + if (! exists $self->{'circular'}) { + my $id = $adaptor->get_seq_region_id($self); + $self->{circular} = $adaptor->is_circular($id); + } + return $self->{circular}; +} + +=head2 invert + + Arg [1] : none + Example : $inverted_slice = $slice->invert; + Description: Creates a copy of this slice on the opposite strand and + returns it. + Returntype : Bio::EnsEMBL::Slice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub invert { + my $self = shift; + + # make a shallow copy of the slice via a hash copy and flip the strand + my %s = %$self; + $s{'strand'} = $self->{'strand'} * -1; + + # reverse compliment any attached sequence + reverse_comp(\$s{'seq'}) if($s{'seq'}); + + # bless and return the copy + return bless \%s, ref $self; +} + + + +=head2 seq + + Arg [1] : none + Example : print "SEQUENCE = ", $slice->seq(); + Description: Returns the sequence of the region represented by this + slice formatted as a string. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq { + my $self = shift; + + # special case for in-between (insert) coordinates + return '' if($self->start() == $self->end() + 1); + + return $self->{'seq'} if($self->{'seq'}); + + if($self->adaptor()) { + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + return ${$seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1)}; + } + + # no attached sequence, and no db, so just return Ns + return 'N' x $self->length(); +} + + + +=head2 subseq + + Arg [1] : int $startBasePair + relative to start of slice, which is 1. + Arg [2] : int $endBasePair + relative to start of slice. + Arg [3] : (optional) int $strand + The strand of the slice to obtain sequence from. Default + value is 1. + Description: returns string of dna sequence + Returntype : txt + Exceptions : end should be at least as big as start + strand must be set + Caller : general + Status : Stable + +=cut + +sub subseq { + my ( $self, $start, $end, $strand ) = @_; + + if ( $end+1 < $start ) { + throw("End coord + 1 is less than start coord"); + } + + # handle 'between' case for insertions + return '' if( $start == $end + 1); + + $strand = 1 unless(defined $strand); + + if ( $strand != -1 && $strand != 1 ) { + throw("Invalid strand [$strand] in call to Slice::subseq."); + } + my $subseq; + if($self->adaptor){ + my $seqAdaptor = $self->adaptor->db->get_SequenceAdaptor(); + $subseq = ${$seqAdaptor->fetch_by_Slice_start_end_strand + ( $self, $start, + $end, $strand )}; + } else { + ## check for gap at the beginning and pad it with Ns + if ($start < 1) { + $subseq = "N" x (1 - $start); + $start = 1; + } + $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); + ## check for gap at the end and pad it with Ns + if ($end > $self->length()) { + $subseq .= "N" x ($end - $self->length()); + } + reverse_comp(\$subseq) if($strand == -1); + } + return $subseq; +} + +=head2 sub_Slice_Iterator + + Arg[1] : int The chunk size to request + Example : my $i = $slice->sub_Slice_Iterator(60000); + while($i->has_next()) { warn $i->next()->name(); } + Description : Returns an iterator which batches subslices of this Slice + in the requested chunk size + Returntype : Bio::EnsEMBL::Utils::Iterator next() will return the next + chunk of Slice + Exceptions : None + +=cut + +sub sub_Slice_Iterator { + my ($self, $chunk_size) = @_; + throw "Need a chunk size to divide the slice by" if ! $chunk_size; + my $here = 1; + my $end = $self->length(); + my $iterator_sub = sub { + while($here <= $end) { + my $there = $here + $chunk_size - 1; + $there = $end if($there > $end); + my $slice = $self->sub_Slice($here, $there); + $here = $there + 1; + return $slice; + } + return; + }; + return Bio::EnsEMBL::Utils::Iterator->new($iterator_sub); +} + +=head2 assembly_exception_type + + Example : $self->assembly_exception_type(); + Description : Returns the type of slice this is. If it is reference then you + will get 'REF' back. Otherwise you will get the first + element from C. If no + assembly exception exists you will get an empty string back. + Returntype : String + Exceptions : None + Caller : Public + Status : Beta + +=cut + +sub assembly_exception_type { + my ($self) = @_; + my $type = q{}; + if($self->is_reference()) { + $type = 'REF'; + } + else { + my $assembly_exceptions = $self->get_all_AssemblyExceptionFeatures(); + if(@{$assembly_exceptions}) { + $type = $assembly_exceptions->[0]->type(); + } + } + return $type; +} + +=head2 is_chromosome + + Example : print ($slice->is_chromosome()) ? 'I am a chromosome' : 'Not one'; + Description : Uses a number of rules known to indicate a chromosome region + other and takes into account those regions which can be + placed on a Chromsome coordinate system but in fact are not + assembled into one. + Returntype : Boolean indicates if the current object is a chromosome + Exceptions : None + +=cut + +sub is_chromosome { + my ($self) = @_; + my $coord_system = $self->coord_system->name; + my $seq_name = $self->seq_region_name; + + if (($seq_name =~ /random + |^Un\d{4}$ + |^Un\.\d{3}\.\d*$ + |E\d\d\w*$ + |_NT_ + |scaffold_ + |cutchr + |unplaced + |chunk + |clone + |contig + |genescaffold + |group + |reftig + |supercontig + |ultracontig + /x) or ( $coord_system !~ /^chromosome$/i )) { + return 0; + } + + return 1; +} + + +=head2 get_base_count + + Arg [1] : none + Example : $c_count = $slice->get_base_count->{'c'}; + Description: Retrieves a hashref containing the counts of each bases in the + sequence spanned by this slice. The format of the hash is : + { 'a' => num, + 'c' => num, + 't' => num, + 'g' => num, + 'n' => num, + '%gc' => num } + + All bases which are not in the set [A,a,C,c,T,t,G,g] are + included in the 'n' count. The 'n' count could therefore be + inclusive of ambiguity codes such as 'y'. + The %gc is the ratio of GC to AT content as in: + total(GC)/total(ACTG) * 100 + This function is conservative in its memory usage and scales to + work for entire chromosomes. + Returntype : hashref + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_base_count { + my $self = shift; + + my $a = 0; + my $c = 0; + my $t = 0; + my $g = 0; + + my $start = 1; + my $end; + + my $RANGE = 100_000; + my $len = $self->length(); + + my $seq; + + while ( $start <= $len ) { + $end = $start + $RANGE - 1; + $end = $len if ( $end > $len ); + + $seq = $self->subseq( $start, $end ); + + $a += $seq =~ tr/Aa//; + $c += $seq =~ tr/Cc//; + $t += $seq =~ tr/Tt//; + $g += $seq =~ tr/Gg//; + + $start = $end + 1; + } + + my $actg = $a + $c + $t + $g; + + my $gc_content = 0; + if ( $actg > 0 ) { # Avoid dividing by 0 + $gc_content = sprintf( "%1.2f", ( ( $g + $c )/$actg )*100 ); + } + + return { 'a' => $a, + 'c' => $c, + 't' => $t, + 'g' => $g, + 'n' => $len - $actg, + '%gc' => $gc_content }; +} + + + +=head2 project + + Arg [1] : string $name + The name of the coordinate system to project this slice onto + Arg [2] : string $version + The version of the coordinate system (such as 'NCBI34') to + project this slice onto + Example : + my $clone_projection = $slice->project('clone'); + + foreach my $seg (@$clone_projection) { + my $clone = $segment->to_Slice(); + print $slice->seq_region_name(), ':', $seg->from_start(), '-', + $seg->from_end(), ' -> ', + $clone->seq_region_name(), ':', $clone->start(), '-',$clone->end(), + $clone->strand(), "\n"; + } + Description: Returns the results of 'projecting' this slice onto another + coordinate system. Projecting to a coordinate system that + the slice is assembled from is analagous to retrieving a tiling + path. This method may also be used to 'project up' to a higher + level coordinate system, however. + + This method returns a listref of triplets [start,end,slice] + which represents the projection. The start and end defined the + region of this slice which is made up of the third value of + the triplet: a slice in the requested coordinate system. + Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which + can also be used as [$start,$end,$slice] triplets + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub project { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift; + + throw('Coord_system name argument is required') if(!$cs_name); + + my $slice_adaptor = $self->adaptor(); + + if(!$slice_adaptor) { + warning("Cannot project without attached adaptor."); + return []; + } + + if(!$self->coord_system()) { + warning("Cannot project without attached coord system."); + return []; + } + + + my $db = $slice_adaptor->db(); + my $csa = $db->get_CoordSystemAdaptor(); + my $cs = $csa->fetch_by_name($cs_name, $cs_version); + my $slice_cs = $self->coord_system(); + + if(!$cs) { + throw("Cannot project to unknown coordinate system " . + "[$cs_name $cs_version]"); + } + + # no mapping is needed if the requested coord system is the one we are in + # but we do need to check if some of the slice is outside of defined regions + if($slice_cs->equals($cs)) { + return $self->_constrain_to_region(); + } + + my @projection; + my $current_start = 1; + + # decompose this slice into its symlinked components. + # this allows us to handle haplotypes and PARs + my $normal_slice_proj = + $slice_adaptor->fetch_normalized_slice_projection($self); + foreach my $segment (@$normal_slice_proj) { + my $normal_slice = $segment->[2]; + + $slice_cs = $normal_slice->coord_system(); + + my $asma = $db->get_AssemblyMapperAdaptor(); + my $asm_mapper = $asma->fetch_by_CoordSystems($slice_cs, $cs); + + # perform the mapping between this slice and the requested system + my @coords; + + if( defined $asm_mapper ) { + @coords = $asm_mapper->map($normal_slice->seq_region_name(), + $normal_slice->start(), + $normal_slice->end(), + $normal_slice->strand(), + $slice_cs); + } else { + $coords[0] = Bio::EnsEMBL::Mapper::Gap->new( $normal_slice->start(), + $normal_slice->end()); + } + + + # my $last_rank = 0; + #construct a projection from the mapping results and return it + foreach my $coord (@coords) { + my $coord_start = $coord->start(); + my $coord_end = $coord->end(); + my $length = $coord_end - $coord_start + 1; + + if ( $coord_start > $coord_end ) { + $length = + $normal_slice->seq_region_length() - + $coord_start + + $coord_end + 1; + } + +# if( $last_rank != $coord->rank){ +# $current_start = 1; +# print "LAST rank has changed to ".$coord->rank."from $last_rank \n"; +# } +# $last_rank = $coord->rank; + + #skip gaps + if($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + + my $coord_cs = $coord->coord_system(); + + # If the normalised projection just ended up mapping to the + # same coordinate system we were already in then we should just + # return the original region. This can happen for example, if we + # were on a PAR region on Y which refered to X and a projection to + # 'toplevel' was requested. + if($coord_cs->equals($slice_cs)) { + # trim off regions which are not defined + return $self->_constrain_to_region(); + } + #create slices for the mapped-to coord system + my $slice = $slice_adaptor->fetch_by_seq_region_id( + $coord->id(), + $coord_start, + $coord_end, + $coord->strand()); + + my $current_end = $current_start + $length - 1; + + if ($current_end > $slice->seq_region_length() && $slice->is_circular ) { + $current_end -= $slice->seq_region_length(); + } + + push @projection, bless([$current_start, $current_end, $slice], + "Bio::EnsEMBL::ProjectionSegment"); + } + + $current_start += $length; + } + } + + return \@projection; +} + + +sub _constrain_to_region { + my $self = shift; + + my $entire_len = $self->seq_region_length(); + + #if the slice has negative coordinates or coordinates exceeding the + #exceeding length of the sequence region we want to shrink the slice to + #the defined region + + if($self->{'start'} > $entire_len || $self->{'end'} < 1) { + #none of this slice is in a defined region + return []; + } + + my $right_contract = 0; + my $left_contract = 0; + if($self->{'end'} > $entire_len) { + $right_contract = $entire_len - $self->{'end'}; + } + if($self->{'start'} < 1) { + $left_contract = $self->{'start'} - 1; + } + + my $new_slice; + if($left_contract || $right_contract) { + #if slice in negative strand, need to swap contracts + if ($self->strand == 1) { + $new_slice = $self->expand($left_contract, $right_contract); + } + elsif ($self->strand == -1) { + $new_slice = $self->expand($right_contract, $left_contract); + } + } else { + $new_slice = $self; + } + + return [bless [1-$left_contract, $self->length()+$right_contract, + $new_slice], "Bio::EnsEMBL::ProjectionSegment" ]; +} + + +=head2 expand + + Arg [1] : (optional) int $five_prime_expand + The number of basepairs to shift this slices five_prime + coordinate by. Positive values make the slice larger, + negative make the slice smaller. + coordinate left. + Default = 0. + Arg [2] : (optional) int $three_prime_expand + The number of basepairs to shift this slices three_prime + coordinate by. Positive values make the slice larger, + negative make the slice smaller. + Default = 0. + Arg [3] : (optional) bool $force_expand + if set to 1, then the slice will be contracted even in the case + when shifts $five_prime_expand and $three_prime_expand overlap. + In that case $five_prime_expand and $three_prime_expand will be set + to a maximum possible number and that will result in the slice + which would have only 2pbs. + Default = 0. + Arg [4] : (optional) int* $fpref + The reference to a number of basepairs to shift this slices five_prime + coordinate by. Normally it would be set to $five_prime_expand. + But in case when $five_prime_expand shift can not be applied and + $force_expand is set to 1, then $$fpref will contain the maximum possible + shift + Arg [5] : (optional) int* $tpref + The reference to a number of basepairs to shift this slices three_prime + coordinate by. Normally it would be set to $three_prime_expand. + But in case when $five_prime_expand shift can not be applied and + $force_expand is set to 1, then $$tpref will contain the maximum possible + shift + Example : my $expanded_slice = $slice->expand( 1000, 1000); + my $contracted_slice = $slice->expand(-1000,-1000); + my $shifted_right_slice = $slice->expand(-1000, 1000); + my $shifted_left_slice = $slice->expand( 1000,-1000); + my $forced_contracted_slice = $slice->expand(-1000,-1000, 1, \$five_prime_shift, \$three_prime_shift); + + Description: Returns a slice which is a resized copy of this slice. The + start and end are moved outwards from the center of the slice + if positive values are provided and moved inwards if negative + values are provided. This slice remains unchanged. A slice + may not be contracted below 1bp but may grow to be arbitrarily + large. + Returntype : Bio::EnsEMBL::Slice + Exceptions : warning if an attempt is made to contract the slice below 1bp + Caller : general + Status : Stable + +=cut + +sub expand { + my $self = shift; + my $five_prime_shift = shift || 0; + my $three_prime_shift = shift || 0; + my $force_expand = shift || 0; + my $fpref = shift; + my $tpref = shift; + + if ( $self->{'seq'} ) { + warning( + "Cannot expand a slice which has a manually attached sequence "); + return undef; + } + + my $sshift = $five_prime_shift; + my $eshift = $three_prime_shift; + + if ( $self->{'strand'} != 1 ) { + $eshift = $five_prime_shift; + $sshift = $three_prime_shift; + } + + my $new_start = $self->{'start'} - $sshift; + my $new_end = $self->{'end'} + $eshift; + + if (( $new_start <= 0 || $new_start > $self->seq_region_length() || $new_end <= 0 || $new_end > $self->seq_region_length() ) && ( $self->is_circular() ) ) { + + if ( $new_start <= 0 ) { + $new_start = $self->seq_region_length() + $new_start; + } + if ( $new_start > $self->seq_region_length() ) { + $new_start -= $self->seq_region_length(); + } + + if ( $new_end <= 0 ) { + $new_end = $self->seq_region_length() + $new_end; + } + if ( $new_end > $self->seq_region_length() ) { + $new_end -= $self->seq_region_length(); + } + + } + + if ( $new_start > $new_end && (not $self->is_circular() ) ) { + + if ($force_expand) { + # Apply max possible shift, if force_expand is set + if ( $sshift < 0 ) { + # if we are contracting the slice from the start - move the + # start just before the end + $new_start = $new_end - 1; + $sshift = $self->{start} - $new_start; + } + + if ( $new_start > $new_end ) { + # if the slice still has a negative length - try to move the + # end + if ( $eshift < 0 ) { + $new_end = $new_start + 1; + $eshift = $new_end - $self->{end}; + } + } + # return the values by which the primes were actually shifted + $$tpref = $self->{strand} == 1 ? $eshift : $sshift; + $$fpref = $self->{strand} == 1 ? $sshift : $eshift; + } + if ( $new_start > $new_end ) { + throw('Slice start cannot be greater than slice end'); + } + } + + #fastest way to copy a slice is to do a shallow hash copy + my %new_slice = %$self; + $new_slice{'start'} = int($new_start); + $new_slice{'end'} = int($new_end); + + return bless \%new_slice, ref($self); +} ## end sub expand + + + +=head2 sub_Slice + + Arg 1 : int $start + Arg 2 : int $end + Arge [3] : int $strand + Example : none + Description: Makes another Slice that covers only part of this slice + If a slice is requested which lies outside of the boundaries + of this function will return undef. This means that + behaviour will be consistant whether or not the slice is + attached to the database (i.e. if there is attached sequence + to the slice). Alternatively the expand() method or the + SliceAdaptor::fetch_by_region method can be used instead. + Returntype : Bio::EnsEMBL::Slice or undef if arguments are wrong + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub sub_Slice { + my ( $self, $start, $end, $strand ) = @_; + + if( $start < 1 || $start > $self->{'end'} ) { + # throw( "start argument not valid" ); + return undef; + } + + if( $end < $start || $end > $self->{'end'} ) { + # throw( "end argument not valid" ) + return undef; + } + + my ( $new_start, $new_end, $new_strand, $new_seq ); + if( ! defined $strand ) { + $strand = 1; + } + + if( $self->{'strand'} == 1 ) { + $new_start = $self->{'start'} + $start - 1; + $new_end = $self->{'start'} + $end - 1; + $new_strand = $strand; + } else { + $new_start = $self->{'end'} - $end + 1;; + $new_end = $self->{'end'} - $start + 1; + $new_strand = -$strand; + } + + if( defined $self->{'seq'} ) { + $new_seq = $self->subseq( $start, $end, $strand ); + } + + #fastest way to copy a slice is to do a shallow hash copy + my $new_slice = {%{$self}}; + $new_slice->{'start'} = int($new_start); + $new_slice->{'end'} = int($new_end); + $new_slice->{'strand'} = $new_strand; + if( $new_seq ) { + $new_slice->{'seq'} = $new_seq; + } + weaken($new_slice->{adaptor}); + + return bless $new_slice, ref($self); +} + + + +=head2 seq_region_Slice + + Arg [1] : none + Example : $slice = $slice->seq_region_Slice(); + Description: Returns a slice which spans the whole seq_region which this slice + is on. For example if this is a slice which spans a small region + of chromosome X, this method will return a slice which covers the + entire chromosome X. The returned slice will always have strand + of 1 and start of 1. This method cannot be used if the sequence + of the slice has been set manually. + Returntype : Bio::EnsEMBL::Slice + Exceptions : warning if called when sequence of Slice has been set manually. + Caller : general + Status : Stable + +=cut + +sub seq_region_Slice { + my $self = shift; + + if($self->{'seq'}){ + warning("Cannot get a seq_region_Slice of a slice which has manually ". + "attached sequence "); + return undef; + } + + # quick shallow copy + my $slice; + %{$slice} = %{$self}; + bless $slice, ref($self); + weaken($slice->{adaptor}); + + $slice->{'start'} = 1; + $slice->{'end'} = $slice->{'seq_region_length'}; + $slice->{'strand'} = 1; + + return $slice; +} + + +=head2 get_seq_region_id + + Arg [1] : none + Example : my $seq_region_id = $slice->get_seq_region_id(); + Description: Gets the internal identifier of the seq_region that this slice + is on. Note that this function will not work correctly if this + slice does not have an attached adaptor. Also note that it may + be better to go through the SliceAdaptor::get_seq_region_id + method if you are working with multiple databases since is + possible to work with slices from databases with different + internal seq_region identifiers. + Returntype : int or undef if slices does not have attached adaptor + Exceptions : warning if slice is not associated with a SliceAdaptor + Caller : assembly loading scripts, general + Status : Stable + +=cut + +sub get_seq_region_id { + my ($self) = @_; + + if($self->adaptor) { + return $self->adaptor->get_seq_region_id($self); + } else { + warning('Cannot retrieve seq_region_id without attached adaptor.'); + return undef; + } +} + + +=head2 get_all_Attributes + + Arg [1] : optional string $attrib_code + The code of the attribute type to retrieve values for. + Example : ($htg_phase) = @{$slice->get_all_Attributes('htg_phase')}; + @slice_attributes = @{$slice->get_all_Attributes()}; + Description: Gets a list of Attributes of this slice''s seq_region. + Optionally just get Attrubutes for given code. + Returntype : listref Bio::EnsEMBL::Attribute + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + my $result; + my @results; + + if(!$self->adaptor()) { + warning('Cannot get attributes without an adaptor.'); + return []; + } + + my $attribute_adaptor = $self->adaptor->db->get_AttributeAdaptor(); + + if( defined $attrib_code ) { + @results = grep { uc($_->code()) eq uc($attrib_code) } + @{$attribute_adaptor->fetch_all_by_Slice( $self )}; + $result = \@results; + } else { + $result = $attribute_adaptor->fetch_all_by_Slice( $self ); + } + + return $result; +} + + +=head2 get_all_PredictionTranscripts + + Arg [1] : (optional) string $logic_name + The name of the analysis used to generate the prediction + transcripts obtained. + Arg [2] : (optional) boolean $load_exons + If set to true will force loading of all PredictionExons + immediately rather than loading them on demand later. This + is faster if there are a large number of PredictionTranscripts + and the exons will be used. + Example : @transcripts = @{$slice->get_all_PredictionTranscripts}; + Description: Retrieves the list of prediction transcripts which overlap + this slice with logic_name $logic_name. If logic_name is + not defined then all prediction transcripts are retrieved. + Returntype : listref of Bio::EnsEMBL::PredictionTranscript + Exceptions : warning if slice does not have attached adaptor + Caller : none + Status : Stable + +=cut + +sub get_all_PredictionTranscripts { + my ($self,$logic_name, $load_exons) = @_; + + if(!$self->adaptor()) { + warning('Cannot get PredictionTranscripts without attached adaptor'); + return []; + } + my $pta = $self->adaptor()->db()->get_PredictionTranscriptAdaptor(); + return $pta->fetch_all_by_Slice($self, $logic_name, $load_exons); +} + + + +=head2 get_all_DnaAlignFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the dna align features + to obtain. + Arg [2] : (optional) float $score + The mimimum score of the features to retrieve + Arg [3] : (optional) string $dbtype + The name of an attached database to retrieve the features from + instead, e.g. 'otherfeatures'. + Arg [4] : (optional) float hcoverage + The minimum hcoverage od the featurs to retrieve + Example : @dna_dna_align_feats = @{$slice->get_all_DnaAlignFeatures}; + Description: Retrieves the DnaDnaAlignFeatures which overlap this slice with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::DnaDnaAlignFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_DnaAlignFeatures { + my ($self, $logic_name, $score, $dbtype, $hcoverage) = @_; + + if(!$self->adaptor()) { + warning('Cannot get DnaAlignFeatures without attached adaptor'); + return []; + } + + my $db; + + if($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if(!$db) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $dafa = $db->get_DnaAlignFeatureAdaptor(); + + if(defined($score) and defined ($hcoverage)){ + warning "cannot specify score and hcoverage. Using score only"; + } + if(defined($score)){ + return $dafa->fetch_all_by_Slice_and_score($self,$score, $logic_name); + } + return $dafa->fetch_all_by_Slice_and_hcoverage($self,$hcoverage, $logic_name); +} + + + +=head2 get_all_ProteinAlignFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the protein align features + to obtain. + Arg [2] : (optional) float $score + The mimimum score of the features to retrieve + Arg [3] : (optional) string $dbtype + The name of an attached database to retrieve features from + instead. + Arg [4] : (optional) float hcoverage + The minimum hcoverage od the featurs to retrieve + Example : @dna_pep_align_feats = @{$slice->get_all_ProteinAlignFeatures}; + Description: Retrieves the DnaPepAlignFeatures which overlap this slice with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::DnaPepAlignFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_ProteinAlignFeatures { + my ($self, $logic_name, $score, $dbtype, $hcoverage) = @_; + + if(!$self->adaptor()) { + warning('Cannot get ProteinAlignFeatures without attached adaptor'); + return []; + } + + my $db; + + if($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if(!$db) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $pafa = $db->get_ProteinAlignFeatureAdaptor(); + + if(defined($score) and defined ($hcoverage)){ + warning "cannot specify score and hcoverage. Using score only"; + } + if(defined($score)){ + return $pafa->fetch_all_by_Slice_and_score($self,$score, $logic_name); + } + return $pafa->fetch_all_by_Slice_and_hcoverage($self,$hcoverage, $logic_name); + +} + + + +=head2 get_all_SimilarityFeatures + + Arg [1] : (optional) string $logic_name + the name of the analysis performed on the features to retrieve + Arg [2] : (optional) float $score + the lower bound of the score of the features to be retrieved + Example : @feats = @{$slice->get_all_SimilarityFeatures}; + Description: Retrieves all dna_align_features and protein_align_features + with analysis named $logic_name and with score above $score. + It is probably faster to use get_all_ProteinAlignFeatures or + get_all_DnaAlignFeatures if a sepcific feature type is desired. + If $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::BaseAlignFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_SimilarityFeatures { + my ($self, $logic_name, $score) = @_; + + my @out = (); + + push @out, @{$self->get_all_ProteinAlignFeatures($logic_name, $score) }; + push @out, @{$self->get_all_DnaAlignFeatures($logic_name, $score) }; + + return \@out; +} + + + +=head2 get_all_SimpleFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the simple features + to obtain. + Arg [2] : (optional) float $score + The mimimum score of the features to retrieve + Example : @simple_feats = @{$slice->get_all_SimpleFeatures}; + Description: Retrieves the SimpleFeatures which overlap this slice with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. If $score is not defined features of all scores are + retrieved. + Returntype : listref of Bio::EnsEMBL::SimpleFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_SimpleFeatures { + my ($self, $logic_name, $score, $dbtype) = @_; + + if(!$self->adaptor()) { + warning('Cannot get SimpleFeatures without attached adaptor'); + return []; + } + + my $db; + if($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if(!$db) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $sfa = $db->get_SimpleFeatureAdaptor(); + + return $sfa->fetch_all_by_Slice_and_score($self, $score, $logic_name); +} + + + +=head2 get_all_RepeatFeatures + + Arg [1] : (optional) string $logic_name + The name of the analysis performed on the repeat features + to obtain. + Arg [2] : (optional) string/array $repeat_type + Limits features returned to those of the specified + repeat_type. Can specify a single value or an array reference + to limit by more than one + Arg [3] : (optional) string $db + Key for database e.g. core/vega/cdna/.... + Example : @repeat_feats = @{$slice->get_all_RepeatFeatures(undef,'Type II Transposons')}; + Description: Retrieves the RepeatFeatures which overlap with + logic name $logic_name and with score above $score. If + $logic_name is not defined features of all logic names are + retrieved. + Returntype : listref of Bio::EnsEMBL::RepeatFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_RepeatFeatures { + my ($self, $logic_name, $repeat_type, $dbtype) = @_; + + if(!$self->adaptor()) { + warning('Cannot get RepeatFeatures without attached adaptor'); + return []; + } + + my $db; + if($dbtype) { + $db = $self->adaptor->db->get_db_adaptor($dbtype); + if(!$db) { + warning("Don't have db $dbtype returning empty list\n"); + return []; + } + } else { + $db = $self->adaptor->db; + } + + my $rpfa = $db->get_RepeatFeatureAdaptor(); + + return $rpfa->fetch_all_by_Slice($self, $logic_name, $repeat_type); +} + +=head2 get_all_LD_values + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Population $population + Description : returns all LD values on this slice. This function will only work correctly if the variation + database has been attached to the core database. If the argument is passed, will return the LD information + in that population + ReturnType : Bio::EnsEMBL::Variation::LDFeatureContainer + Exceptions : none + Caller : contigview, snpview + Status : Stable + +=cut + +sub get_all_LD_values{ + my $self = shift; + my $population = shift; + + + if(!$self->adaptor()) { + warning('Cannot get LDFeatureContainer without attached adaptor'); + return []; + } + + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return []; + } + + my $ld_adaptor = $variation_db->get_LDFeatureContainerAdaptor; + + if( $ld_adaptor ) { + return $ld_adaptor->fetch_by_Slice($self,$population); + } else { + return []; + + } + +# my $ld_adaptor = Bio::EnsEMBL::DBSQL::MergedAdaptor->new(-species => $self->adaptor()->db()->species, -type => "LDFeatureContainer"); + +# if( $ld_adaptor ) { +# my $ld_values = $ld_adaptor->fetch_by_Slice($self,$population); +# if (@{$ld_values} > 1){ +# warning("More than 1 variation database attached. Trying to merge LD results"); +# my $ld_value_merged = shift @{$ld_values}; +# #with more than 1 variation database attached, will try to merge in one single LDContainer object. +# foreach my $ld (@{$ld_values}){ +# #copy the ld values to the result hash +# foreach my $key (keys %{$ld->{'ldContainer'}}){ +# $ld_value_merged->{'ldContainer'}->{$key} = $ld->{'ldContainer'}->{$key}; +# } +# #and copy the variationFeatures as well +# foreach my $key (keys %{$ld->{'variationFeatures'}}){ +# $ld_value_merged->{'variationFeatures'}->{$key} = $ld->{'variationFeatures'}->{$key}; +# } + +# } +# return $ld_value_merged; +# } +# else{ +# return shift @{$ld_values}; +# } +# } else { +# warning("Variation database must be attached to core database to " . +# "retrieve variation information" ); +# return []; +# } +} + +sub _get_VariationFeatureAdaptor { + + my $self = shift; + + if(!$self->adaptor()) { + warning('Cannot get variation features without attached adaptor'); + return undef; + } + + my $vf_adaptor = Bio::EnsEMBL::DBSQL::MergedAdaptor->new( + -species => $self->adaptor()->db()->species, + -type => "VariationFeature" + ); + + if( $vf_adaptor ) { + return $vf_adaptor; + } + else { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + + return undef; + } +} + +=head2 get_all_VariationFeatures + Args : $so_terms [optional] - list of so_terms to limit the fetch to + Description : Returns all germline variation features on this slice. This function will + only work correctly if the variation database has been attached to the core + database. + If $so_terms is specified, only variation features with a consequence type + that matches or is an ontological child of any of the supplied terms will + be returned + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : contigview, snpview + Status : Stable + +=cut + +sub get_all_VariationFeatures{ + my $self = shift; + + if (my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_by_Slice_SO_terms($self, @_); + } + else { + return []; + } +} + +=head2 get_all_somatic_VariationFeatures + + Args : $filter [optional] + Description : Returns all somatic variation features on this slice. This function will only + work correctly if the variation database has been attached to the core database. + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Status : Stable + +=cut + +sub get_all_somatic_VariationFeatures { + my $self = shift; + + if (my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_somatic_by_Slice($self); + } + else{ + return []; + } +} + +=head2 get_all_VariationFeatures_with_annotation + + Arg [1] : $variation_feature_source [optional] + Arg [2] : $annotation_source [optional] + Arg [3] : $annotation_name [optional] + Description : returns all germline variation features on this slice associated with a phenotype. + This function will only work correctly if the variation database has been + attached to the core database. + If $variation_feature_source is set only variations from that source + are retrieved. + If $annotation_source is set only variations whose annotations come from + $annotation_source will be retrieved. + If $annotation_name is set only variations with that annotation will be retrieved. + $annotation_name can be a phenotype's internal dbID. + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : contigview, snpview + Status : Stable + +=cut + +sub get_all_VariationFeatures_with_annotation{ + my $self = shift; + my $source = shift; + my $p_source = shift; + my $annotation = shift; + + if (my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_with_annotation_by_Slice($self, $source, $p_source, $annotation); + } + else { + return []; + } +} + +=head2 get_all_somatic_VariationFeatures_with_annotation + + Arg [1] : $variation_feature_source [optional] + Arg [2] : $annotation_source [optional] + Arg [3] : $annotation_name [optional] + Description : returns all somatic variation features on this slice associated with a phenotype. + (see get_all_VariationFeatures_with_annotation for further documentation) + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Status : Stable + +=cut + +sub get_all_somatic_VariationFeatures_with_annotation{ + my $self = shift; + my $source = shift; + my $p_source = shift; + my $annotation = shift; + + if (my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_somatic_with_annotation_by_Slice($self, $source, $p_source, $annotation); + } + else { + return [] unless $vf_adaptor; + } +} + +=head2 get_all_VariationFeatures_by_VariationSet + + Arg [1] : Bio::EnsEMBL:Variation::VariationSet $set + Description :returns all variation features on this slice associated with a given set. + This function will only work correctly if the variation database has been + attached to the core database. + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : contigview, snpview + Status : Stable + +=cut + +sub get_all_VariationFeatures_by_VariationSet { + my $self = shift; + my $set = shift; + + if (my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_by_Slice_VariationSet($self, $set); + } + else { + return []; + } +} + +=head2 get_all_StructuralVariations + + Description: DEPRECATED. Use get_all_StructuralVariationFeatures instead + +=cut + +sub get_all_StructuralVariations{ + my $self = shift; + my $source = shift; + my $study = shift; + my $sv_class = shift; + + deprecate('Use get_all_StructuralVariationFeatures() instead.'); + + return $self->get_all_StructuralVariationFeatures($source,$sv_class); +} + + +=head2 get_all_CopyNumberVariantProbes + + Description: DEPRECATED. Use get_all_CopyNumberVariantProbeFeatures instead + +=cut + +sub get_all_CopyNumberVariantProbes { + my $self = shift; + my $source = shift; + my $study = shift; + + deprecate('Use get_all_CopyNumberVariantProbeFeatures() instead.'); + + return $self->get_all_CopyNumberVariantProbeFeatures($source); +} + + +sub _get_StructuralVariationFeatureAdaptor { + + my $self = shift; + + if(!$self->adaptor()) { + warning('Cannot get structural variation features without attached adaptor'); + return undef; + } + + my $svf_adaptor = Bio::EnsEMBL::DBSQL::MergedAdaptor->new( + -species => $self->adaptor()->db()->species, + -type => "StructuralVariationFeature" + ); + + if( $svf_adaptor ) { + return $svf_adaptor; + } + else { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + + return undef; + } +} + + +=head2 get_all_StructuralVariationFeatures + + Arg[1] : string $source [optional] + Arg[2] : int $include_evidence [optional] + Arg[3] : string $sv_class (SO term) [optional] + Description : returns all structural variation features on this slice. This function will only work + correctly if the variation database has been attached to the core database. + If $source is set, only structural variation features with that source name will be + returned. By default, it only returns structural variant features which are not labelled + as "CNV_PROBE". + If $include_evidence is set (i.e. $include_evidence=1), structural variation features from + both structural variation (SV) and their supporting structural variations (SSV) will be + returned. By default, it only returns features from structural variations (SV). + ReturnType : listref of Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : none + Caller : contigview, snpview, structural_variation_features + Status : Stable + +=cut + +sub get_all_StructuralVariationFeatures { + my $self = shift; + my $source = shift; + my $include_evidence = shift; + my $somatic = shift; + my $sv_class = shift; + + my $operator = ''; + + if (!defined($sv_class)) { + $sv_class = 'SO:0000051'; # CNV_PROBE + $operator = '!'; # All but CNV_PROBE + } + + $somatic = (!defined($somatic) || !$somatic) ? 0 : 1; + + my $svf_adaptor = $self->_get_StructuralVariationFeatureAdaptor; + + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + # Get the attrib_id + my $at_adaptor = $variation_db->get_AttributeAdaptor; + my $SO_term = $at_adaptor->SO_term_for_SO_accession($sv_class); + my $attrib_id = $at_adaptor->attrib_id_for_type_value('SO_term',$SO_term); + + if (!$attrib_id) { + warning("The Sequence Ontology accession number is not found in the database"); + return []; + } + + # Get the structural variations features + if( $svf_adaptor ) { + + my $constraint = qq{ svf.somatic=$somatic AND svf.class_attrib_id $operator=$attrib_id }; + $constraint .= qq{ AND svf.is_evidence=0 } if (!$include_evidence); + + if($source) { + return $svf_adaptor->fetch_all_by_Slice_constraint($self, qq{$constraint AND s.name = '$source'}); + }else { + return $svf_adaptor->fetch_all_by_Slice_constraint($self, $constraint); + } + } + else { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return []; + } +} + + +=head2 get_all_StructuralVariationFeatures_by_VariationSet + + Arg [1] : Bio::EnsEMBL:Variation::VariationSet $set + Description :returns all structural variation features on this slice associated with a + given set. + This function will only work correctly if the variation database has been + attached to the core database. + ReturnType : listref of Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : none + Caller : contigview, snpview + Status : Stable + +=cut + +sub get_all_StructuralVariationFeatures_by_VariationSet { + my $self = shift; + my $set = shift; + + if (my $svf_adaptor = $self->_get_StructuralVariationFeatureAdaptor) { + return $svf_adaptor->fetch_all_by_Slice_VariationSet($self, $set); + } + else { + return []; + } +} + + +=head2 get_all_somatic_StructuralVariationFeatures + + Arg[1] : string $source [optional] + Arg[2] : int $include_evidence [optional] + Arg[3] : string $sv_class (SO term) [optional] + Description : returns all somatic structural variation features on this slice. This function will only work + correctly if the variation database has been attached to the core database. + If $source is set, only somatic structural variation features with that source name will be + returned. By default, it only returns somatic structural variant features which are not labelled + as "CNV_PROBE". + If $include_evidence is set (i.e. $include_evidence=1), structural variation features from + both structural variation (SV) and their supporting structural variations (SSV) will be + returned. By default, it only returns features from structural variations (SV). + ReturnType : listref of Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : none + Caller : contigview, snpview, structural_variation_features + Status : Stable + +=cut + +sub get_all_somatic_StructuralVariationFeatures { + my $self = shift; + my $source = shift; + my $include_evidence = shift; + my $sv_class = shift; + + return $self->get_all_StructuralVariationFeatures($source,$include_evidence,1,$sv_class); +} + + +=head2 get_all_CopyNumberVariantProbeFeatures + + Arg[1] : string $source [optional] + Description : returns all copy number variant probes on this slice. This function will only work + correctly if the variation database has been attached to the core database. + If $source is set, only CNV probes with that source name will be returned. + If $study is set, only CNV probes of that study will be returned. + ReturnType : listref of Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : none + Caller : contigview, snpview, structural_variation_feature + Status : At Risk + +=cut + +sub get_all_CopyNumberVariantProbeFeatures { + my $self = shift; + my $source = shift; + + return $self->get_all_StructuralVariationFeatures($source,0,0,'SO:0000051'); +} + + +=head2 get_all_VariationFeatures_by_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population + Arg [2] : $minimum_frequency (optional) + Example : $pop = $pop_adaptor->fetch_by_dbID(659); + @vfs = @{$slice->get_all_VariationFeatures_by_Population( + $pop,$slice)}; + Description: Retrieves all variation features in a slice which are stored for + a specified population. If $minimum_frequency is supplied, only + variations with a minor allele frequency (MAF) greater than + $minimum_frequency will be returned. + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub get_all_VariationFeatures_by_Population { + my $self = shift; + + if (my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_by_Slice_Population($self, @_); + } + else { + return []; + } +} + + + + +=head2 get_all_IndividualSlice + + Args : none + Example : my $individualSlice = $slice->get_by_Population($population); + Description : Gets the specific Slice for all the individuls in the population + ReturnType : listref of Bio::EnsEMB::IndividualSlice + Exceptions : none + Caller : general + +=cut + +sub get_all_IndividualSlice{ + my $self = shift; + + my $individualSliceFactory = Bio::EnsEMBL::IndividualSliceFactory->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + ); + return $individualSliceFactory->get_all_IndividualSlice(); +} + +=head2 get_by_Individual + + Arg[1] : Bio::EnsEMBL::Variation::Individual $individual + Example : my $individualSlice = $slice->get_by_Individual($individual); + Description : Gets the specific Slice for the individual + ReturnType : Bio::EnsEMB::IndividualSlice + Exceptions : none + Caller : general + +=cut + +sub get_by_Individual{ + my $self = shift; + my $individual = shift; + + return Bio::EnsEMBL::IndividualSlice->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), +# -SEQ => $self->{'seq'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + -INDIVIDUAL => $individual); + +} + + + +=head2 get_by_strain + + Arg[1] : string $strain + Example : my $strainSlice = $slice->get_by_strain($strain); + Description : Gets the specific Slice for the strain + ReturnType : Bio::EnsEMB::StrainSlice + Exceptions : none + Caller : general + +=cut + +sub get_by_strain{ + my $self = shift; + my $strain_name = shift; + + return Bio::EnsEMBL::StrainSlice->new( + -START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + -SEQ => $self->{'seq'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + -STRAIN_NAME => $strain_name); + +} + +sub calculate_theta{ + my $self = shift; + my $strains = shift; + my $feature = shift; #optional parameter. Name of the feature in the Slice you want to calculate + + if(!$self->adaptor()) { + warning('Cannot get variation features without attached adaptor'); + return 0; + } + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return 0; + } + + #need to get coverage regions for the slice in the different strains + my $coverage_adaptor = $variation_db->get_ReadCoverageAdaptor; + my $strain; + my $differences = []; + my $slices = []; + if ($coverage_adaptor){ + my $num_strains = scalar(@{$strains}) +1; + if (!defined $feature){ + #we want to calculate for the whole slice + push @{$slices}, $self; #add the slice as the slice to calculate the theta value + } + else{ + #we have features, get the slices for the different features + my $features = $self->get_all_Exons(); + map {push @{$slices},$_->feature_Slice} @{$features}; #add the slices of the features + } + my $length_regions = 0; + my $snps = 0; + my $theta = 0; + my $last_position = 0; + #get all the differences in the slice coordinates + foreach my $strain_name (@{$strains}){ + my $strain = $self->get_by_strain($strain_name); #get the strainSlice for the strain + + my $results = $strain->get_all_differences_Slice; + push @{$differences}, @{$results} if (defined $results); + } + #when we finish, we have, in max_level, the regions covered by all the sample + #sort the differences by the genomic position + my @differences_sorted = sort {$a->start <=> $b->start} @{$differences}; + foreach my $slice (@{$slices}){ + my $regions_covered = $coverage_adaptor->fetch_all_regions_covered($slice,$strains); + if (defined $regions_covered){ + foreach my $range (@{$regions_covered}){ + $length_regions += ($range->[1] - $range->[0]) + 1; #add the length of the genomic region + for (my $i = $last_position;$i<@differences_sorted;$i++){ + if ($differences_sorted[$i]->start >= $range->[0] && $differences_sorted[$i]->end <= $range->[1]){ + $snps++; #count differences in the region + } + elsif ($differences_sorted[$i]->end > $range->[1]){ + $last_position = $i; + last; + } + } + } + #when all the ranges have been iterated, calculate rho + #this is an intermediate variable called a in the formula + # a = sum i=2..strains 1/i-1 + } + } + my $a = _calculate_a($num_strains); + $theta = $snps / ($a * $length_regions); + return $theta; + } + else{ + return 0; + } +} + + + + +sub _calculate_a{ + my $max_level = shift; + + my $a = 0; + for (my $i = 2; $i <= $max_level+1;$i++){ + $a += 1/($i-1); + } + return $a; +} + +sub calculate_pi{ + my $self = shift; + my $strains = shift; + my $feature = shift; + + if(!$self->adaptor()) { + warning('Cannot get variation features without attached adaptor'); + return 0; + } + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return 0; + } + + #need to get coverage regions for the slice in the different strains + my $coverage_adaptor = $variation_db->get_ReadCoverageAdaptor; + my $differences = []; + my $slices = []; + if ($coverage_adaptor){ + my $num_strains = scalar(@{$strains}) +1; + if (!defined $feature){ + #we want to calculate for the whole slice + push @{$slices}, $self; #add the slice as the slice to calculate the theta value + } + else{ + #we have features, get the slices for the different features + my $features = $self->get_all_Exons(); + map {push @{$slices},$_->feature_Slice} @{$features}; #add the slices of the features + } + my @range_differences = (); + my $pi = 0; + my $regions = 0; + my $last_position = 0; #last position visited in the sorted list of differences + my $triallelic = 0; + my $is_triallelic = 0; + foreach my $slice (@{$slices}){ + foreach my $strain_name (@{$strains}){ + my $strain = $slice->get_by_strain($strain_name); #get the strainSlice for the strain + my $results = $strain->get_all_differences_Slice; + push @{$differences}, @{$results} if (defined $results); + } + my @differences_sorted = sort {$a->start <=> $b->start} @{$differences}; + + my $regions_covered = $coverage_adaptor->fetch_all_regions_covered($slice,$strains); + #when we finish, we have, in max_level, the regions covered by all the sample + #sort the differences + if (defined $regions_covered){ + foreach my $range (@{$regions_covered}){ + for (my $i = $last_position;$i<@differences_sorted;$i++){ + if ($differences_sorted[$i]->start >= $range->[0] && $differences_sorted[$i]->end <= $range->[1]){ + #check wether it is the same region or different + if (!defined $range_differences[0] || ($differences_sorted[$i]->start == $range_differences[0]->start)){ + if (defined $range_differences[0] && ($differences_sorted[$i]->allele_string ne $range_differences[0]->allele_string)){ + $is_triallelic = 1; + } + push @range_differences, $differences_sorted[$i]; + } + else{ + #new site, calc pi for the previous one + $pi += 2 * (@range_differences/($num_strains)) * ( 1 - (@range_differences/$num_strains)); + if ($is_triallelic) { + $triallelic++; + $is_triallelic = 0; + } + $regions++; + @range_differences = (); + #and start a new range + push @range_differences, $differences_sorted[$i]; + } + } + elsif ($differences_sorted[$i]->end > $range->[1]){ + $last_position = $i; + last; + } + } + #calculate pi for last site, if any + if (defined $range_differences[0]){ + $pi += 2 * (@range_differences/$num_strains) * ( 1 - (@range_differences/$num_strains)); + $regions++; + } + } + } + $pi = $pi / $regions; #calculate average pi + print "Regions with variations in region $regions and triallelic $triallelic\n\n"; + } + return $pi; + } + else{ + return 0; + } + +} + + + + + +=head2 get_all_genotyped_VariationFeatures + + Args : none + Function : returns all variation features on this slice that have been genotyped. This function will only work + correctly if the variation database has been attached to the core database. + ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : contigview, snpview, ldview + Status : At Risk + : Variation database is under development. + +=cut + +sub get_all_genotyped_VariationFeatures{ + my $self = shift; + + if( my $vf_adaptor = $self->_get_VariationFeatureAdaptor) { + return $vf_adaptor->fetch_all_genotyped_by_Slice($self); + } + else { + return []; + } +} + + +=head2 get_all_SNPs + + Description: DEPRECATED. Use get_all_VariationFeatures insted + +=cut + +sub get_all_SNPs { + my $self = shift; + + deprecate('Use get_all_VariationFeatures() instead.'); + + my $snps; + my $vf = $self->get_all_genotyped_VariationFeatures(); + if( $vf->[0] ) { + #necessary to convert the VariationFeatures into SNP objects + foreach my $variation_feature (@{$vf}){ + push @{$snps},$variation_feature->convert_to_SNP(); + } + return $snps; + } else { + return []; + } +} + +=head2 get_all_genotyped_SNPs + + Description : DEPRECATED. Use get_all_genotyped_VariationFeatures insted + +=cut + +sub get_all_genotyped_SNPs { + my $self = shift; + + deprecate("Use get_all_genotyped_VariationFeatures instead"); + my $vf = $self->get_all_genotyped_VariationFeatures; + my $snps; + if ($vf->[0]){ + foreach my $variation_feature (@{$vf}){ + push @{$snps},$variation_feature->convert_to_SNP(); + } + return $snps; + } else { + return []; + } +} + +sub get_all_SNPs_transcripts { + my $self = shift; + + deprecate("DEPRECATED"); + + return []; + +} + + + +=head2 get_all_Genes + + Arg [1] : (optional) string $logic_name + The name of the analysis used to generate the genes to retrieve + Arg [2] : (optional) string $dbtype + The dbtype of genes to obtain. This assumes that the db has + been added to the DBAdaptor under this name (using the + DBConnection::add_db_adaptor method). + Arg [3] : (optional) boolean $load_transcripts + If set to true, transcripts will be loaded immediately rather + than being lazy-loaded on request. This will result in a + significant speed up if the Transcripts and Exons are going to + be used (but a slow down if they are not). + Arg [4] : (optional) string $source + The source of the genes to retrieve. + Arg [5] : (optional) string $biotype + The biotype of the genes to retrieve. + Example : @genes = @{$slice->get_all_Genes}; + Description: Retrieves all genes that overlap this slice. + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : none + Status : Stable + +=cut + +sub get_all_Genes{ + my ($self, $logic_name, $dbtype, $load_transcripts, $source, $biotype) = @_; + + if(!$self->adaptor()) { + warning('Cannot get Genes without attached adaptor'); + return []; + } + + my $ga; + if($dbtype) { + my $db = $reg->get_db($self->adaptor()->db(), $dbtype); + if(defined($db)){ + $ga = $reg->get_adaptor( $db->species(), $db->group(), "Gene" ); + } + else{ + $ga = $reg->get_adaptor( $self->adaptor()->db()->species(), $dbtype, "Gene" ); + } + if(!defined $ga) { + warning( "$dbtype genes not available" ); + return []; + } + } else { + $ga = $self->adaptor->db->get_GeneAdaptor(); + } + + return $ga->fetch_all_by_Slice( $self, $logic_name, $load_transcripts, $source, $biotype); +} + +=head2 get_all_Genes_by_type + + Arg [1] : string $type + The biotype of genes wanted. + Arg [2] : (optional) string $logic_name + Arg [3] : (optional) boolean $load_transcripts + If set to true, transcripts will be loaded immediately rather + than being lazy-loaded on request. This will result in a + significant speed up if the Transcripts and Exons are going to + be used (but a slow down if they are not). + Example : @genes = @{$slice->get_all_Genes_by_type('protein_coding', + 'ensembl')}; + Description: Retrieves genes that overlap this slice of biotype $type. + This is primarily used by the genebuilding code when several + biotypes of genes are used. + + The logic name is the analysis of the genes that are retrieved. + If not provided all genes will be retrieved instead. + + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : genebuilder, general + Status : Stable + +=cut + +sub get_all_Genes_by_type{ + my ($self, $type, $logic_name, $load_transcripts) = @_; + + if(!$self->adaptor()) { + warning('Cannot get Genes without attached adaptor'); + return []; + } + + return $self->get_all_Genes($logic_name, undef, $load_transcripts, undef, $type); +} + + +=head2 get_all_Genes_by_source + + Arg [1] : string source + Arg [2] : (optional) boolean $load_transcripts + If set to true, transcripts will be loaded immediately rather + than being lazy-loaded on request. This will result in a + significant speed up if the Transcripts and Exons are going to + be used (but a slow down if they are not). + Example : @genes = @{$slice->get_all_Genes_by_source('ensembl')}; + Description: Retrieves genes that overlap this slice of source $source. + + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Genes_by_source { + my ($self, $source, $load_transcripts) = @_; + + if(!$self->adaptor()) { + warning('Cannot get Genes without attached adaptor'); + return []; + } + + return $self->get_all_Genes(undef, undef, $load_transcripts, $source); +} + +=head2 get_all_Transcripts + + Arg [1] : (optional) boolean $load_exons + If set to true exons will not be lazy-loaded but will instead + be loaded right away. This is faster if the exons are + actually going to be used right away. + Arg [2] : (optional) string $logic_name + the logic name of the type of features to obtain + Arg [3] : (optional) string $db_type + Example : @transcripts = @{$slice->get_all_Transcripts)_}; + Description: Gets all transcripts which overlap this slice. If you want to + specify a particular analysis or type, then you are better off + using get_all_Genes or get_all_Genes_by_type and iterating + through the transcripts of each gene. + Returntype : reference to a list of Bio::EnsEMBL::Transcripts + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Transcripts { + my $self = shift; + my $load_exons = shift; + my $logic_name = shift; + my $dbtype = shift; + if(!$self->adaptor()) { + warning('Cannot get Transcripts without attached adaptor'); + return []; + } + + + my $ta; + if($dbtype) { + my $db = $reg->get_db($self->adaptor()->db(), $dbtype); + if(defined($db)){ + $ta = $reg->get_adaptor( $db->species(), $db->group(), "Transcript" ); + } else{ + $ta = $reg->get_adaptor( $self->adaptor()->db()->species(), $dbtype, "Transcript" ); + } + if(!defined $ta) { + warning( "$dbtype genes not available" ); + return []; + } + } else { + $ta = $self->adaptor->db->get_TranscriptAdaptor(); + } + return $ta->fetch_all_by_Slice($self, $load_exons, $logic_name); +} + + +=head2 get_all_Exons + + Arg [1] : none + Example : @exons = @{$slice->get_all_Exons}; + Description: Gets all exons which overlap this slice. Note that these exons + will not be associated with any transcripts, so this may not + be terribly useful. + Returntype : reference to a list of Bio::EnsEMBL::Exons + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Exons { + my $self = shift; + + if(!$self->adaptor()) { + warning('Cannot get Exons without attached adaptor'); + return []; + } + + return $self->adaptor->db->get_ExonAdaptor->fetch_all_by_Slice($self); +} + + + +=head2 get_all_QtlFeatures + + Args : none + Example : none + Description: returns overlapping QtlFeatures + Returntype : listref Bio::EnsEMBL::Map::QtlFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_QtlFeatures { + my $self = shift; + + if(!$self->adaptor()) { + warning('Cannot get QtlFeatures without attached adaptor'); + return []; + } + + my $qfAdaptor; + if( $self->adaptor()) { + $qfAdaptor = $self->adaptor()->db()->get_QtlFeatureAdaptor(); + } else { + return []; + } + + return $qfAdaptor->fetch_all_by_Slice_constraint( $self ); +} + + + + +=head2 get_all_KaryotypeBands + + Arg [1] : none + Example : @kary_bands = @{$slice->get_all_KaryotypeBands}; + Description: Retrieves the karyotype bands which this slice overlaps. + Returntype : listref oif Bio::EnsEMBL::KaryotypeBands + Exceptions : none + Caller : general, contigview + Status : Stable + +=cut + +sub get_all_KaryotypeBands { + my ($self) = @_; + + if(!$self->adaptor()) { + warning('Cannot get KaryotypeBands without attached adaptor'); + return []; + } + + my $kadp = $self->adaptor->db->get_KaryotypeBandAdaptor(); + return $kadp->fetch_all_by_Slice($self); +} + + + + +=head2 get_repeatmasked_seq + + Arg [1] : listref of strings $logic_names (optional) + Arg [2] : int $soft_masking_enable (optional) + Arg [3] : hash reference $not_default_masking_cases (optional, default is {}) + The values are 0 or 1 for hard and soft masking respectively + The keys of the hash should be of 2 forms + "repeat_class_" . $repeat_consensus->repeat_class, + e.g. "repeat_class_SINE/MIR" + "repeat_name_" . $repeat_consensus->name + e.g. "repeat_name_MIR" + depending on which base you want to apply the not default + masking either the repeat_class or repeat_name. Both can be + specified in the same hash at the same time, but in that case, + repeat_name setting has priority over repeat_class. For example, + you may have hard masking as default, and you may want soft + masking of all repeat_class SINE/MIR, but repeat_name AluSp + (which are also from repeat_class SINE/MIR). + Your hash will be something like {"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0} + Example : $rm_slice = $slice->get_repeatmasked_seq(); + $softrm_slice = $slice->get_repeatmasked_seq(['RepeatMask'],1); + Description: Returns Bio::EnsEMBL::Slice that can be used to create repeat + masked sequence instead of the regular sequence. + Sequence returned by this new slice will have repeat regions + hardmasked by default (sequence replaced by N) or + or soft-masked when arg[2] = 1 (sequence in lowercase) + Will only work with database connection to get repeat features. + Returntype : Bio::EnsEMBL::RepeatMaskedSlice + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_repeatmasked_seq { + my ($self,$logic_names,$soft_mask,$not_default_masking_cases) = @_; + + return Bio::EnsEMBL::RepeatMaskedSlice->new + (-START => $self->{'start'}, + -END => $self->{'end'}, + -STRAND => $self->{'strand'}, + -ADAPTOR => $self->adaptor(), + -SEQ => $self->{'seq'}, + -SEQ_REGION_NAME => $self->{'seq_region_name'}, + -SEQ_REGION_LENGTH => $self->{'seq_region_length'}, + -COORD_SYSTEM => $self->{'coord_system'}, + -REPEAT_MASK => $logic_names, + -SOFT_MASK => $soft_mask, + -NOT_DEFAULT_MASKING_CASES => $not_default_masking_cases); +} + + + +=head2 _mask_features + + Arg [1] : reference to a string $dnaref + Arg [2] : array_ref $repeats + reference to a list Bio::EnsEMBL::RepeatFeature + give the list of coordinates to replace with N or with + lower case + Arg [3] : int $soft_masking_enable (optional) + Arg [4] : hash reference $not_default_masking_cases (optional, default is {}) + The values are 0 or 1 for hard and soft masking respectively + The keys of the hash should be of 2 forms + "repeat_class_" . $repeat_consensus->repeat_class, + e.g. "repeat_class_SINE/MIR" + "repeat_name_" . $repeat_consensus->name + e.g. "repeat_name_MIR" + depending on which base you want to apply the not default masking either + the repeat_class or repeat_name. Both can be specified in the same hash + at the same time, but in that case, repeat_name setting has priority over + repeat_class. For example, you may have hard masking as default, and + you may want soft masking of all repeat_class SINE/MIR, + but repeat_name AluSp (which are also from repeat_class SINE/MIR). + Your hash will be something like {"repeat_class_SINE/MIR" => 1, + "repeat_name_AluSp" => 0} + Example : none + Description: replaces string positions described in the RepeatFeatures + with Ns (default setting), or with the lower case equivalent + (soft masking). The reference to a dna string which is passed + is changed in place. + Returntype : none + Exceptions : none + Caller : seq + Status : Stable + +=cut + +sub _mask_features { + my ($self,$dnaref,$repeats,$soft_mask,$not_default_masking_cases) = @_; + + $soft_mask = 0 unless (defined $soft_mask); + $not_default_masking_cases = {} unless (defined $not_default_masking_cases); + + # explicit CORE::length call, to avoid any confusion with the Slice + # length method + my $dnalen = CORE::length($$dnaref); + + REP:foreach my $old_f (@{$repeats}) { + my $f = $old_f->transfer( $self ); + my $start = $f->start; + my $end = $f->end; + my $length = ($end - $start) + 1; + + # check if we get repeat completely outside of expected slice range + if ($end < 1 || $start > $dnalen) { + # warning("Unexpected: Repeat completely outside slice coordinates."); + next REP; + } + + # repeat partly outside slice range, so correct + # the repeat start and length to the slice size if needed + if ($start < 1) { + $start = 1; + $length = ($end - $start) + 1; + } + + # repeat partly outside slice range, so correct + # the repeat end and length to the slice size if needed + if ($end > $dnalen) { + $end = $dnalen; + $length = ($end - $start) + 1; + } + + $start--; + + my $padstr; + # if we decide to define masking on the base of the repeat_type, we'll need + # to add the following, and the other commented line few lines below. + # my $rc_type = "repeat_type_" . $f->repeat_consensus->repeat_type; + my $rc_class = "repeat_class_" . $f->repeat_consensus->repeat_class; + my $rc_name = "repeat_name_" . $f->repeat_consensus->name; + + my $masking_type; + # $masking_type = $not_default_masking_cases->{$rc_type} if (defined $not_default_masking_cases->{$rc_type}); + $masking_type = $not_default_masking_cases->{$rc_class} if (defined $not_default_masking_cases->{$rc_class}); + $masking_type = $not_default_masking_cases->{$rc_name} if (defined $not_default_masking_cases->{$rc_name}); + + $masking_type = $soft_mask unless (defined $masking_type); + + if ($masking_type) { + $padstr = lc substr ($$dnaref,$start,$length); + } else { + $padstr = 'N' x $length; + } + substr ($$dnaref,$start,$length) = $padstr; + } +} + + +=head2 get_all_SearchFeatures + + Arg [1] : scalar $ticket_ids + Example : $slice->get_all_SearchFeatures('BLA_KpUwwWi5gY'); + Description: Retreives all search features for stored blast + results for the ticket that overlap this slice + Returntype : listref of Bio::EnsEMBL::SeqFeatures + Exceptions : none + Caller : general (webby!) + Status : Stable + +=cut + +sub get_all_SearchFeatures { + my $self = shift; + my $ticket = shift; + local $_; + unless($ticket) { + throw("ticket argument is required"); + } + + if(!$self->adaptor()) { + warning("Cannot get SearchFeatures without an attached adaptor"); + return []; + } + + my $sfa = $self->adaptor()->db()->get_db_adaptor('blast'); + + my $offset = $self->start-1; + + my $features = $sfa ? $sfa->get_all_SearchFeatures($ticket, $self->seq_region_name, $self->start, $self->end) : []; + + foreach( @$features ) { + $_->start( $_->start - $offset ); + $_->end( $_->end - $offset ); + }; + return $features; + +} + +=head2 get_all_AssemblyExceptionFeatures + + Arg [1] : string $set (optional) + Example : $slice->get_all_AssemblyExceptionFeatures(); + Description: Retreives all misc features which overlap this slice. If + a set code is provided only features which are members of + the requested set are returned. + Returntype : listref of Bio::EnsEMBL::AssemblyExceptionFeatures + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_AssemblyExceptionFeatures { + my $self = shift; + my $misc_set = shift; + + my $adaptor = $self->adaptor(); + + if(!$adaptor) { + warning('Cannot retrieve features without attached adaptor.'); + return []; + } + + my $aefa = $adaptor->db->get_AssemblyExceptionFeatureAdaptor(); + + return $aefa->fetch_all_by_Slice($self); +} + + + +=head2 get_all_MiscFeatures + + Arg [1] : string $set (optional) + Arg [2] : string $database (optional) + Example : $slice->get_all_MiscFeatures('cloneset'); + Description: Retreives all misc features which overlap this slice. If + a set code is provided only features which are members of + the requested set are returned. + Returntype : listref of Bio::EnsEMBL::MiscFeatures + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_MiscFeatures { + my $self = shift; + my $misc_set = shift; + my $dbtype = shift; + my $msa; + + my $adaptor = $self->adaptor(); + if(!$adaptor) { + warning('Cannot retrieve features without attached adaptor.'); + return []; + } + + my $mfa; + if($dbtype) { + my $db = $reg->get_db($adaptor->db(), $dbtype); + if(defined($db)){ + $mfa = $reg->get_adaptor( lc($db->species()), $db->group(), "miscfeature" ); + } else{ + $mfa = $reg->get_adaptor( $adaptor->db()->species(), $dbtype, "miscfeature" ); + } + if(!defined $mfa) { + warning( "$dbtype misc features not available" ); + return []; + } + } else { + $mfa = $adaptor->db->get_MiscFeatureAdaptor(); + } + + if($misc_set) { + return $mfa->fetch_all_by_Slice_and_set_code($self,$misc_set); + } + + return $mfa->fetch_all_by_Slice($self); +} + +=head2 get_all_MarkerFeatures + + Arg [1] : (optional) string logic_name + The logic name of the marker features to retrieve + Arg [2] : (optional) int $priority + Lower (exclusive) priority bound of the markers to retrieve + Arg [3] : (optional) int $map_weight + Upper (exclusive) priority bound of the markers to retrieve + Example : my @markers = @{$slice->get_all_MarkerFeatures(undef,50, 2)}; + Description: Retrieves all markers which lie on this slice fulfilling the + specified map_weight and priority parameters (if supplied). + Returntype : reference to a list of Bio::EnsEMBL::MarkerFeatures + Exceptions : none + Caller : contigview, general + Status : Stable + +=cut + +sub get_all_MarkerFeatures { + my ($self, $logic_name, $priority, $map_weight) = @_; + + if(!$self->adaptor()) { + warning('Cannot retrieve MarkerFeatures without attached adaptor.'); + return []; + } + + my $ma = $self->adaptor->db->get_MarkerFeatureAdaptor; + + my $feats = $ma->fetch_all_by_Slice_and_priority($self, + $priority, + $map_weight, + $logic_name); + return $feats; +} + + +=head2 get_MarkerFeatures_by_Name + + Arg [1] : string marker Name + The name (synonym) of the marker feature(s) to retrieve + Example : my @markers = @{$slice->get_MarkerFeatures_by_Name('z1705')}; + Description: Retrieves all markers with this ID + Returntype : reference to a list of Bio::EnsEMBL::MarkerFeatures + Exceptions : none + Caller : contigview, general + Status : Stable + +=cut + +sub get_MarkerFeatures_by_Name { + my ($self, $name) = @_; + + if(!$self->adaptor()) { + warning('Cannot retrieve MarkerFeatures without attached adaptor.'); + return []; + } + + my $ma = $self->adaptor->db->get_MarkerFeatureAdaptor; + + my $feats = $ma->fetch_all_by_Slice_and_MarkerName($self, $name); + return $feats; +} + + +=head2 get_all_compara_DnaAlignFeatures + + Arg [1] : string $qy_species + The name of the species to retrieve similarity features from + Arg [2] : string $qy_assembly + The name of the assembly to retrieve similarity features from + Arg [3] : string $type + The type of the alignment to retrieve similarity features from + Arg [4] : compara dbadptor to use. + Example : $fs = $slc->get_all_compara_DnaAlignFeatures('Mus musculus', + 'MGSC3', + 'WGA'); + Description: Retrieves a list of DNA-DNA Alignments to the species specified + by the $qy_species argument. + The compara database must be attached to the core database + for this call to work correctly. As well the compara database + must have the core dbadaptors for both this species, and the + query species added to function correctly. + Returntype : reference to a list of Bio::EnsEMBL::DnaDnaAlignFeatures + Exceptions : warning if compara database is not available + Caller : contigview + Status : Stable + +=cut + +sub get_all_compara_DnaAlignFeatures { + my ($self, $qy_species, $qy_assembly, $alignment_type, $compara_db) = @_; + + if(!$self->adaptor()) { + warning("Cannot retrieve DnaAlignFeatures without attached adaptor"); + return []; + } + + unless($qy_species && $alignment_type # && $qy_assembly + ) { + throw("Query species and assembly and alignmemt type arguments are required"); + } + + if(!defined($compara_db)){ + $compara_db = Bio::EnsEMBL::Registry->get_DBAdaptor("compara", "compara"); + } + unless($compara_db) { + warning("Compara database must be attached to core database or passed ". + "as an argument to " . + "retrieve compara information"); + return []; + } + + my $dafa = $compara_db->get_DnaAlignFeatureAdaptor; + return $dafa->fetch_all_by_Slice($self, $qy_species, $qy_assembly, $alignment_type); +} + +=head2 get_all_compara_Syntenies + + Arg [1] : string $query_species e.g. "Mus_musculus" or "Mus musculus" + Arg [2] : string $method_link_type, default is "SYNTENY" + Arg [3] : compara dbadaptor to use. + Description: gets all the compara syntenyies for a specfic species + Returns : arrayref of Bio::EnsEMBL::Compara::SyntenyRegion + Status : Stable + +=cut + +sub get_all_compara_Syntenies { + my ($self, $qy_species, $method_link_type, $compara_db) = @_; + + if(!$self->adaptor()) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + unless($qy_species) { + throw("Query species and assembly arguments are required"); + } + + unless (defined $method_link_type) { + $method_link_type = "SYNTENY"; + } + + if(!defined($compara_db)){ + $compara_db = Bio::EnsEMBL::Registry->get_DBAdaptor("compara", "compara"); + } + unless($compara_db) { + warning("Compara database must be attached to core database or passed ". + "as an argument to " . + "retrieve compara information"); + return []; + } + my $gdba = $compara_db->get_GenomeDBAdaptor(); + my $mlssa = $compara_db->get_MethodLinkSpeciesSetAdaptor(); + my $dfa = $compara_db->get_DnaFragAdaptor(); + my $sra = $compara_db->get_SyntenyRegionAdaptor(); + + my $this_gdb = $gdba->fetch_by_core_DBAdaptor($self->adaptor()->db()); + my $query_gdb = $gdba->fetch_by_registry_name($qy_species); + my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs($method_link_type, [$this_gdb, $query_gdb]); + + my $cs = $self->coord_system()->name(); + my $sr = $self->seq_region_name(); + my ($dnafrag) = @{$dfa->fetch_all_by_GenomeDB_region($this_gdb, $cs, $sr)}; + return $sra->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $dnafrag, $self->start, $self->end); +} + +=head2 get_all_Haplotypes + + Arg [1] : (optional) boolean $lite_flag + if true lightweight haplotype objects are used + Example : @haplotypes = $slice->get_all_Haplotypes; + Description: Retrieves all of the haplotypes on this slice. Only works + if the haplotype adaptor has been attached to the core adaptor + via $dba->add_db_adaptor('haplotype', $hdba); + Returntype : listref of Bio::EnsEMBL::External::Haplotype::Haplotypes + Exceptions : warning is Haplotype database is not available + Caller : contigview, general + Status : Stable + +=cut + +sub get_all_Haplotypes { + my($self, $lite_flag) = @_; + + if(!$self->adaptor()) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my $haplo_db = $self->adaptor->db->get_db_adaptor('haplotype'); + + unless($haplo_db) { + warning("Haplotype database must be attached to core database to " . + "retrieve haplotype information" ); + return []; + } + + my $haplo_adaptor = $haplo_db->get_HaplotypeAdaptor; + + my $haplotypes = $haplo_adaptor->fetch_all_by_Slice($self, $lite_flag); + + return $haplotypes; +} + + +sub get_all_DASFactories { + my $self = shift; + return [ $self->adaptor()->db()->_each_DASFeatureFactory ]; +} + +sub get_all_DASFeatures_dsn { + my ($self, $source_type, $dsn) = @_; + + if(!$self->adaptor()) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + my @X = grep { $_->adaptor->dsn eq $dsn } $self->adaptor()->db()->_each_DASFeatureFactory; + + return [ $X[0]->fetch_all_Features( $self, $source_type ) ]; +} + +=head2 get_all_DAS_Features + + Arg [1] : none + Example : $features = $slice->get_all_DASFeatures; + Description: Retreives a hash reference to a hash of DAS feature + sets, keyed by the DNS, NOTE the values of this hash + are an anonymous array containing: + (1) a pointer to an array of features; + (2) a pointer to the DAS stylesheet + Returntype : hashref of Bio::SeqFeatures + Exceptions : ? + Caller : webcode + Status : Stable + +=cut +sub get_all_DAS_Features{ + my ($self) = @_; + + $self->{_das_features} ||= {}; # Cache + $self->{_das_styles} ||= {}; # Cache + $self->{_das_segments} ||= {}; # Cache + my %das_features; + my %das_styles; + my %das_segments; + my $slice = $self; + + foreach my $dasfact( @{$self->get_all_DASFactories} ){ + my $dsn = $dasfact->adaptor->dsn; + my $name = $dasfact->adaptor->name; +# my $type = $dasfact->adaptor->type; + my $url = $dasfact->adaptor->url; + + my ($type) = $dasfact->adaptor->mapping; + if (ref $type eq 'ARRAY') { + $type = shift @$type; + } + $type ||= $dasfact->adaptor->type; + # Construct a cache key : SOURCE_URL/TYPE + # Need the type to handle sources that serve multiple types of features + + my $key = join('/', $name, $type); + if( $self->{_das_features}->{$key} ){ # Use cached + $das_features{$name} = $self->{_das_features}->{$key}; + $das_styles{$name} = $self->{_das_styles}->{$key}; + $das_segments{$name} = $self->{_das_segments}->{$key}; + } else { # Get fresh data + my ($featref, $styleref, $segref) = $dasfact->fetch_all_Features( $slice, $type ); + $self->{_das_features}->{$key} = $featref; + $self->{_das_styles}->{$key} = $styleref; + $self->{_das_segments}->{$key} = $segref; + $das_features{$name} = $featref; + $das_styles{$name} = $styleref; + $das_segments{$name} = $segref; + } + } + + return (\%das_features, \%das_styles, \%das_segments); +} + +sub get_all_DASFeatures{ + my ($self, $source_type) = @_; + + + if(!$self->adaptor()) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my %genomic_features = map { ( $_->adaptor->dsn => [ $_->fetch_all_Features($self, $source_type) ] ) } $self->adaptor()->db()->_each_DASFeatureFactory; + return \%genomic_features; + +} + +sub old_get_all_DASFeatures{ + my ($self,@args) = @_; + + if(!$self->adaptor()) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my %genomic_features = + map { ( $_->adaptor->dsn => [ $_->fetch_all_by_Slice($self) ] ) } + $self->adaptor()->db()->_each_DASFeatureFactory; + return \%genomic_features; + +} + + +=head2 get_all_ExternalFeatures + + Arg [1] : (optional) string $track_name + If specified only features from ExternalFeatureAdaptors with + the track name $track_name are retrieved. + If not set, all features from every ExternalFeatureAdaptor are + retrieved. + Example : @x_features = @{$slice->get_all_ExternalFeatures} + Description: Retrieves features on this slice from external feature adaptors + Returntype : listref of Bio::SeqFeatureI implementing objects in slice + coordinates + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_ExternalFeatures { + my ($self, $track_name) = @_; + + if(!$self->adaptor()) { + warning("Cannot retrieve features without attached adaptor"); + return []; + } + + my $features = []; + + my $xfa_hash = $self->adaptor->db->get_ExternalFeatureAdaptors; + my @xf_adaptors = (); + + if($track_name) { + #use a specific adaptor + if(exists $xfa_hash->{$track_name}) { + push @xf_adaptors, $xfa_hash->{$track_name}; + } + } else { + #use all of the adaptors + push @xf_adaptors, values %$xfa_hash; + } + + + foreach my $xfa (@xf_adaptors) { + push @$features, @{$xfa->fetch_all_by_Slice($self)}; + } + + return $features; +} + + +=head2 get_all_DitagFeatures + + Arg [1] : (optional) string ditag type + Arg [1] : (optional) string logic_name + Example : @dna_dna_align_feats = @{$slice->get_all_DitagFeatures}; + Description: Retrieves the DitagFeatures of a specific type which overlap + this slice with. If type is not defined, all features are + retrieved. + Returntype : listref of Bio::EnsEMBL::DitagFeatures + Exceptions : warning if slice does not have attached adaptor + Caller : general + Status : Stable + +=cut + +sub get_all_DitagFeatures { + my ($self, $type, $logic_name) = @_; + + if(!$self->adaptor()) { + warning('Cannot get DitagFeatures without attached adaptor'); + return []; + } + + my $dfa = $self->adaptor->db->get_DitagFeatureAdaptor(); + + return $dfa->fetch_all_by_Slice($self, $type, $logic_name); +} + + + + +# GENERIC FEATURES (See DBAdaptor.pm) + +=head2 get_generic_features + + Arg [1] : (optional) List of names of generic feature types to return. + If no feature names are given, all generic features are + returned. + Example : my %features = %{$slice->get_generic_features()}; + Description: Gets generic features via the generic feature adaptors that + have been added via DBAdaptor->add_GenricFeatureAdaptor (if + any) + Returntype : Hash of named features. + Exceptions : none + Caller : none + Status : Stable + +=cut + +sub get_generic_features { + + my ($self, @names) = @_; + + if(!$self->adaptor()) { + warning('Cannot retrieve features without attached adaptor'); + return []; + } + + my $db = $self->adaptor()->db(); + + my %features = (); # this will hold the results + + # get the adaptors for each feature + my %adaptors = %{$db->get_GenericFeatureAdaptors(@names)}; + + foreach my $adaptor_name (keys(%adaptors)) { + + my $adaptor_obj = $adaptors{$adaptor_name}; + # get the features and add them to the hash + my $features_ref = $adaptor_obj->fetch_all_by_Slice($self); + # add each feature to the hash to be returned + foreach my $feature (@$features_ref) { + $features{$adaptor_name} = $feature; + } + } + + return \%features; + +} + +=head2 project_to_slice + + Arg [1] : Slice to project to. + Example : my $chr_projection = $clone_slice->project_to_slice($chrom_slice); + foreach my $segment ( @$chr_projection ){ + $chr_slice = $segment->to_Slice(); + print $clone_slice->seq_region_name(). ':'. $segment->from_start(). '-'. + $segment->from_end(). ' -> '.$chr_slice->seq_region_name(). ':'. $chr_slice->start(). + '-'.$chr_slice->end(). + $chr_slice->strand(). " length: ".($chr_slice->end()-$chr_slice->start()+1). "\n"; + } + Description: Projection of slice to another specific slice. Needed for where we have multiple mappings + and we want to state which one to project to. + Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which + can also be used as [$start,$end,$slice] triplets. + Exceptions : none + Caller : none + Status : At Risk + +=cut + +sub project_to_slice { + my $self = shift; + my $to_slice = shift; + + throw('Slice argument is required') if(!$to_slice); + + my $slice_adaptor = $self->adaptor(); + + if(!$slice_adaptor) { + warning("Cannot project without attached adaptor."); + return []; + } + + + my $mapper_aptr = $slice_adaptor->db->get_AssemblyMapperAdaptor(); + + my $cs = $to_slice->coord_system(); + my $slice_cs = $self->coord_system(); + my $to_slice_id = $to_slice->get_seq_region_id; + + my @projection; + my $current_start = 1; + + # decompose this slice into its symlinked components. + # this allows us to handle haplotypes and PARs + my $normal_slice_proj = + $slice_adaptor->fetch_normalized_slice_projection($self); + foreach my $segment (@$normal_slice_proj) { + my $normal_slice = $segment->[2]; + + $slice_cs = $normal_slice->coord_system(); + + my $asma = $self->adaptor->db->get_AssemblyMapperAdaptor(); + my $asm_mapper = $asma->fetch_by_CoordSystems($slice_cs, $cs); + + # perform the mapping between this slice and the requested system + my @coords; + + if( defined $asm_mapper ) { + @coords = $asm_mapper->map($normal_slice->seq_region_name(), + $normal_slice->start(), + $normal_slice->end(), + $normal_slice->strand(), + $slice_cs, undef, $to_slice); + } else { + $coords[0] = Bio::EnsEMBL::Mapper::Gap->new( $normal_slice->start(), + $normal_slice->end()); + } + + my $last_rank =0; + #construct a projection from the mapping results and return it + foreach my $coord (@coords) { + my $coord_start = $coord->start(); + my $coord_end = $coord->end(); + my $length = $coord_end - $coord_start + 1; + + + if( $last_rank != $coord->rank){ + $current_start = 1; + } + $last_rank = $coord->rank; + + #skip gaps + if($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + if($coord->id != $to_slice_id){ # for multiple mappings only get the correct one + $current_start += $length; + next; + } + my $coord_cs = $coord->coord_system(); + + # If the normalised projection just ended up mapping to the + # same coordinate system we were already in then we should just + # return the original region. This can happen for example, if we + # were on a PAR region on Y which refered to X and a projection to + # 'toplevel' was requested. +# if($coord_cs->equals($slice_cs)) { +# # trim off regions which are not defined +# return $self->_constrain_to_region(); +# } + + #create slices for the mapped-to coord system + my $slice = $slice_adaptor->fetch_by_seq_region_id( + $coord->id(), + $coord_start, + $coord_end, + $coord->strand()); + + my $current_end = $current_start + $length - 1; + + push @projection, bless([$current_start, $current_end, $slice], + "Bio::EnsEMBL::ProjectionSegment"); + } + + $current_start += $length; + } + } + + + # delete the cache as we may want to map to different set next time and old + # results will be cached. + + $mapper_aptr->delete_cache; + + return \@projection; +} + + +=head2 get_all_synonyms + + Args : none. + Example : my @alternative_names = @{$slice->get_all_synonyms()}; + Description: get a list of alternative names for this slice + Returntype : reference to list of SeqRegionSynonym objects. + Exception : none + Caller : general + Status : At Risk + +=cut + +sub get_all_synonyms{ + my $self = shift; + my $external_db_id =shift; + + if ( !defined( $self->{'synonym'} ) ) { + my $adap = $self->adaptor->db->get_SeqRegionSynonymAdaptor(); + $self->{'synonym'} = + $adap->get_synonyms( $self->get_seq_region_id($self) ); + } + + return $self->{'synonym'}; +} + +=head2 add_synonym + + Args[0] : synonym. + Example : $slice->add_synonym("alt_name"); + Description: add an alternative name for this slice + Returntype : none + Exception : none + Caller : general + Status : At Risk + +=cut + +sub add_synonym{ + my $self = shift; + my $syn = shift; + my $external_db_id = shift; + + my $adap = $self->adaptor->db->get_SeqRegionSynonymAdaptor(); + if ( !defined( $self->{'synonym'} ) ) { + $self->{'synonym'} = $self->get_all_synonyms(); + } + my $new_syn = Bio::EnsEMBL::SeqRegionSynonym->new( #-adaptor => $adap, + -synonym => $syn, + -external_db_id => $external_db_id, + -seq_region_id => $self->get_seq_region_id($self)); + + push (@{$self->{'synonym'}}, $new_syn); + + return; +} + +=head2 summary_as_hash + + Example : $slice_summary = $slice->summary_as_hash(); + Description : Retrieves a textual summary of this slice. + Returns : hashref of descriptive strings +=cut + +sub summary_as_hash { + my $self = shift; + my %summary; + $summary{'display_id'} = $self->display_id; + $summary{'start'} = $self->start; + $summary{'end'} = $self->end; + $summary{'strand'} = $self->strand; + $summary{'Is_circular'} = $self->is_circular ? "true" : "false"; + $summary{'region_name'} = $self->seq_region_name(); + return \%summary; +} + +# +# Bioperl Bio::PrimarySeqI methods: +# + +=head2 id + + Description: Included for Bio::PrimarySeqI interface compliance (0.7) + +=cut + +sub id { name(@_); } + + +=head2 display_id + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub display_id { name(@_); } + + +=head2 primary_id + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub primary_id { name(@_); } + + +=head2 desc + +Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub desc{ return $_[0]->coord_system->name().' '.$_[0]->seq_region_name(); } + + +=head2 moltype + +Description: Included for Bio::PrimarySeqI interface compliance (0.7) + +=cut + +sub moltype { return 'dna'; } + +=head2 alphabet + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub alphabet { return 'dna'; } + + +=head2 accession_number + + Description: Included for Bio::PrimarySeqI interface compliance (1.2) + +=cut + +sub accession_number { name(@_); } + + +# sub DEPRECATED METHODS # +############################################################################### + +=head1 DEPRECATED METHODS + +=head2 get_all_AffyFeatures + + Description: DEPRECATED, use functionality provided by the Ensembl + Functional Genomics API instead. + +=cut + +sub get_all_AffyFeatures { + deprecate( 'Use functionality provided by the ' + . 'Ensembl Functional Genomics API instead.' ); + throw('Can not delegate deprecated functionality.'); + + # Old code: + +# my $self = shift; +# my @arraynames = @_; +# +# my $sa = $self->adaptor(); +# if ( ! $sa ) { +# warning( "Cannot retrieve features without attached adaptor." ); +# } +# my $fa = $sa->db()->get_AffyFeatureAdaptor(); +# my $features; +# +# if ( @arraynames ) { +# $features = $fa->fetch_all_by_Slice_arrayname( $self, @arraynames ); +# } else { +# $features = $fa->fetch_all_by_Slice( $self ); +# } +# return $features; +} + +=head2 get_all_OligoFeatures + + Description: DEPRECATED, use functionality provided by the Ensembl + Functional Genomics API instead. + +=cut + +sub get_all_OligoFeatures { + + deprecate( 'Use functionality provided by the ' + . 'Ensembl Functional Genomics API instead.' ); + throw('Can not delegate deprecated functionality.'); + + # Old code: + +# my $self = shift; +# my @arraynames = @_; +# +# my $sa = $self->adaptor(); +# if ( ! $sa ) { +# warning( "Cannot retrieve features without attached adaptor." ); +# } +# my $fa = $sa->db()->get_OligoFeatureAdaptor(); +# my $features; +# +# if ( @arraynames ) { +# $features = $fa->fetch_all_by_Slice_arrayname( $self, @arraynames ); +# } else { +# $features = $fa->fetch_all_by_Slice( $self ); +# } +# return $features; +} + +=head2 get_all_OligoFeatures_by_type + + Description: DEPRECATED, use functionality provided by the Ensembl + Functional Genomics API instead. + +=cut + +sub get_all_OligoFeatures_by_type { + + deprecate( 'Use functionality provided by the ' + . 'Ensembl Functional Genomics API instead.' ); + throw('Can not delegate deprecated functionality.'); + + # Old code: + +# my ($self, $type, $logic_name) = @_; +# +# throw('Need type as parameter') if !$type; +# +# my $sa = $self->adaptor(); +# if ( ! $sa ) { +# warning( "Cannot retrieve features without attached adaptor." ); +# } +# my $fa = $sa->db()->get_OligoFeatureAdaptor(); +# +# my $features = $fa->fetch_all_by_Slice_type( $self, $type, $logic_name ); +# +# return $features; +} + +=head2 get_all_supercontig_Slices + + Description: DEPRECATED use get_tiling_path("NTcontig") instead + +=cut + + +sub get_all_supercontig_Slices { + my $self = shift; + + deprecate("Use get_tiling_path('NTcontig') instead"); + + my $result = []; + + if( $self->adaptor() ) { + my $superctg_names = + $self->adaptor()->list_overlapping_supercontigs( $self ); + + for my $name ( @$superctg_names ) { + my $slice; + $slice = $self->adaptor()->fetch_by_supercontig_name( $name ); + $slice->name( $name ); + push( @$result, $slice ); + } + } else { + warning( "Slice needs to be attached to a database to get supercontigs" ); + } + + return $result; +} + + + + + +=head2 get_Chromosome + + Description: DEPRECATED use this instead: + $slice_adp->fetch_by_region('chromosome', + $slice->seq_region_name) + +=cut + +sub get_Chromosome { + my $self = shift @_; + + deprecate("Use SliceAdaptor::fetch_by_region('chromosome'," . + '$slice->seq_region_name) instead'); + + my $csa = $self->adaptor->db->get_CoordSystemAdaptor(); + my ($top_cs) = @{$csa->fetch_all()}; + + return $self->adaptor->fetch_by_region($top_cs->name(), + $self->seq_region_name(), + undef,undef,undef, + $top_cs->version()); +} + + + +=head2 chr_name + + Description: DEPRECATED use seq_region_name() instead + +=cut + +sub chr_name{ + deprecate("Use seq_region_name() instead"); + seq_region_name(@_); +} + + + +=head2 chr_start + + Description: DEPRECATED use start() instead + +=cut + +sub chr_start{ + deprecate('Use start() instead'); + start(@_); +} + + + +=head2 chr_end + + Description: DEPRECATED use end() instead + Returntype : int + Exceptions : none + Caller : SliceAdaptor, general + +=cut + +sub chr_end{ + deprecate('Use end() instead'); + end(@_); +} + + +=head2 assembly_type + + Description: DEPRECATED use version instead + +=cut + +sub assembly_type{ + my $self = shift; + deprecate('Use $slice->coord_system()->version() instead.'); + return $self->coord_system->version(); +} + + +=head2 get_tiling_path + + Description: DEPRECATED use project instead + +=cut + +sub get_tiling_path { + my $self = shift; + deprecate('Use $slice->project("seqlevel") instead.'); + return []; +} + + +=head2 dbID + + Description: DEPRECATED use SliceAdaptor::get_seq_region_id instead + +=cut + +sub dbID { + my $self = shift; + deprecate('Use SliceAdaptor::get_seq_region_id instead.'); + if(!$self->adaptor) { + warning('Cannot retrieve seq_region_id without attached adaptor.'); + return 0; + } + return $self->adaptor->get_seq_region_id($self); +} + + +=head2 get_all_MapFrags + + Description: DEPRECATED use get_all_MiscFeatures instead + +=cut + +sub get_all_MapFrags { + my $self = shift; + deprecate('Use get_all_MiscFeatures instead'); + return $self->get_all_MiscFeatures(@_); +} + +=head2 has_MapSet + + Description: DEPRECATED use get_all_MiscFeatures instead + +=cut + +sub has_MapSet { + my( $self, $mapset_name ) = @_; + deprecate('Use get_all_MiscFeatures instead'); + my $mfs = $self->get_all_MiscFeatures($mapset_name); + return (@$mfs > 0); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SplicingEvent.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SplicingEvent.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SlicingEvent - Object representing an alternative splicing event + +=head1 SYNOPSIS + + my $ase = + Bio::EnsEMBL::SplicingEvent->new( -START => 123, + -END => 1045, + -STRAND => 1, + -GENE_ID => $gene->dbID, + -SLICE => $slice ); + + # set some additional attributes + $ase->name('ENSG00000000003-CNE-3'); + $ase->type('CNE'); + +=head1 DESCRIPTION + +A representation of an Alternative Splicing Event within the Ensembl system. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::SplicingEvent; + +use strict; + +use POSIX; +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + +## Add to gene get_all_splicing_events + +sub gene_id { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'gene_id'} = $value; + } + + return $self->{'gene_id'}; +} + +sub name { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'name'} = $value; + } + + return $self->{'name'}; +} + +sub type { + my ( $self, $value ) = @_; + + if ( defined($value) ) { + $self->{'type'} = $value; + } + + return $self->{'type'}; +} + +sub add_Feature { + my ( $self, $feature ) = @_; + + if ( !ref($feature) + || !$feature->isa("Bio::EnsEMBL::SplicingEventFeature") ) + { + throw("$feature is not a Bio::EnsEMBL::SplicingEventFeature!"); + } + + $self->{'_feature_array'} ||= []; + + push( @{ $self->{'_feature_array'} }, $feature ); +} + +sub get_all_Features { + my ($self) = @_; + + if ( !exists( $self->{'_feature_array'} ) ) { + if ( defined( $self->adaptor() ) ) { + my $fta = + $self->adaptor()->db()->get_SplicingEventFeatureAdaptor(); + my $features = $fta->fetch_all_by_SplicingEvent($self); + $self->{'_feature_array'} = $features; + } + } + + return $self->{'_feature_array'}; +} + +sub add_Pair { + my ( $self, $feature ) = @_; + + if ( !ref($feature) + || !$feature->isa("Bio::EnsEMBL::SplicingEventPair") ) + { + throw("$feature is not a Bio::EnsEMBL::SplicingEventPair!"); + } + + $self->{'_pair_array'} ||= []; + + push( @{ $self->{'_pair_array'} }, $feature ); +} + +sub get_all_Pairs { + my ($self) = @_; + + if ( !exists( $self->{'_pair_array'} ) ) { + if ( defined( $self->adaptor() ) ) { + my $pa = + $self->adaptor()->db()->get_SplicingTranscriptPairAdaptor(); + my $pairs = $pa->fetch_all_by_SplicingEvent($self); + $self->{'_pair_array'} = $pairs; + } + } + + return $self->{'_pair_array'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SplicingEventFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SplicingEventFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SplicingEventFeature - Object representing an alternative splicing event + +=head1 SYNOPSIS + + my $ase = Bio::EnsEMBL::SplicingEventFeature->new( + -START => 123, + -END => 1045, + -EXON_ID => $exon->dbID + ); + + # set some additional attributes + $ase->type('flanking_exon'); + +=head1 DESCRIPTION + +A representation of an Alternative Splicing Event Feature within the Ensembl system. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::SplicingEventFeature; + +use strict; + +use POSIX; +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + + + +## Add to gene get_all_splicing_events + + + +sub exon_id{ + my $self = shift; + $self->{'exon_id'} = shift if (@_); + + if (defined $self->{'exon_id'}) { + return $self->{'exon_id'}; + } + + return undef; +} + +sub transcript_id{ + my $self = shift; + $self->{'transcript_id'} = shift if (@_); + + if (defined $self->{'transcript_id'}) { + return $self->{'transcript_id'}; + } + + return undef; +} + +sub feature_order{ + my $self = shift; + $self->{'feature_order'} = shift if (@_); + + if (defined $self->{'feature_order'}) { + return $self->{'feature_order'}; + } + + return undef; +} + +sub type{ + my $self = shift; + $self->{'type'} = shift if (@_); + + if (defined $self->{'type'}) { + return $self->{'type'}; + } + + return undef; +} + +sub start{ + my $self = shift; + $self->{'start'} = shift if (@_); + + if (defined $self->{'start'}) { + return $self->{'start'}; + } + + return undef; +} + +sub end{ + my $self = shift; + $self->{'end'} = shift if (@_); + + if (defined $self->{'end'}) { + return $self->{'end'}; + } + + return undef; +} + + +sub transcript_association{ + my $self = shift; + $self->{'transcript_association'} = shift if (@_); + + if (defined $self->{'transcript_association'}) { + return $self->{'transcript_association'}; + } + + return undef; +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/SplicingTranscriptPair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/SplicingTranscriptPair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,106 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::SplicingTranscriptPair - Object representing an alternative splicing transcript pair + +=head1 SYNOPSIS + + my $ase = Bio::EnsEMBL::SplicingTranscriptPair->new( + -START => 123, + -END => 1045, + -TRANSCRIPT_ID_1 => $tran1->dbID, + -TRANSCRIPT_ID_2 => %tran2->dbID + ); + +=head1 DESCRIPTION + +A representation of an Alternative Splicing Transcrript Pair within the Ensembl system. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::SplicingTranscriptPair; + +use strict; + +use POSIX; +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + + + + +sub transcript_id_1{ + my $self = shift; + $self->{'transcript_id_1'} = shift if (@_); + + if (defined $self->{'transcript_id_1'}) { + return $self->{'transcript_id_1'}; + } + + return undef; +} + +sub transcript_id_2{ + my $self = shift; + $self->{'transcript_id_2'} = shift if (@_); + + if (defined $self->{'transcript_id_2'}) { + return $self->{'transcript_id_2'}; + } + + return undef; +} + + +sub start{ + my $self = shift; + $self->{'start'} = shift if (@_); + + if (defined $self->{'start'}) { + return $self->{'start'}; + } + + return undef; +} + +sub end{ + my $self = shift; + $self->{'end'} = shift if (@_); + + if (defined $self->{'end'}) { + return $self->{'end'}; + } + + return undef; +} + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/StableIdEvent.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/StableIdEvent.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,300 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::StableIdEvent- object representing a stable ID mapping event + +=head1 SYNOPSIS + + my $old_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => 'ENSG001', + -version => 1, + -type => 'Gene', + ); + + my $new_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => 'ENSG001', + -version => 2, + -type => 'Gene', + ); + + my $event = Bio::EnsEMBL::StableIdEvent->new( + -old_id => $old_id, + -new_id => $new_id, + -score => 0.997 + ); + + # directly access attributes in old and new ArchiveStableId + my $old_stable_id = $event->get_attribute( 'old', 'stable_id' ); + +=head1 DESCRIPTION + +This object represents a stable ID mapping event. Such an event links two +ArchiveStableIds with a mapping score. + +=head1 METHODS + + new + old_ArchiveStableId + new_ArchiveStableId + score + get_attribute + ident_string + +=head1 RELATED MODULES + + Bio::EnsEMBL::ArchiveStableId + Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor + Bio::EnsEMBL::StableIdHistoryTree + +=cut + +package Bio::EnsEMBL::StableIdEvent; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + + +=head2 new + + Arg[1] : Bio::EnsEMBL::ArchiveStableId $old_id + The old ArchiveStableId in the mapping event + Arg[2] : Bio::EnsEMBL::ArchiveStableId $new_id + The new ArchiveStableId in the mapping event + Arg[3] : (optional) float $score - score of this mapping event + Example : my $event = Bio::EnsEMBL::StableIdEvent->new( + $arch_id1, $arch_id2, 0.977); + Description : object constructor + Return type : Bio::EnsEMBL::StableIdEvent + Exceptions : thrown on wrong argument types + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::fetch_history_tree_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($old_id, $new_id, $score) = rearrange([qw(OLD_ID NEW_ID SCORE)], @_); + + throw("Need old or new Bio::EnsEMBL::ArchiveStableId to create StableIdEvent") + unless ($old_id || $new_id); + + my $self = {}; + bless $self, $class; + + # initialise object + $self->old_ArchiveStableId($old_id); + $self->new_ArchiveStableId($new_id); + $self->score($score); + + return $self; +} + + +=head2 old_ArchiveStableId + + Arg[1] : (optional) Bio::EnsEMBL::ArchiveStableId $archive_id, or undef + The old ArchiveStableId to set for this mapping event + Example : # getter + my $archive_id = $event->old_ArchiveStableId; + + # setter + $event->old_ArchiveStableId($archive_id); + Description : Getter/setter for old ArchiveStableId in this mapping event. + Return type : Bio::EnsEMBL::ArchiveStableId + Exceptions : thrown on wrong argument type + Caller : general + Status : At Risk + : under development + +=cut + +sub old_ArchiveStableId { + my $self = shift; + + # setter + if (@_) { + my $archive_id = shift; + + # if argument is defined, check type. undef is also legal as an argument. + if (defined($archive_id)) { + throw("Need a Bio::EnsEMBL::ArchiveStableId.") unless + (ref($archive_id) && $archive_id->isa('Bio::EnsEMBL::ArchiveStableId')); + } + + $self->{'old_id'} = $archive_id; + } + + # getter + return $self->{'old_id'}; +} + + +=head2 new_ArchiveStableId + + Arg[1] : (optional) Bio::EnsEMBL::ArchiveStableId $archive_id, or undef + The new ArchiveStableId to set for this mapping event + Example : # getter + my $archive_id = $event->new_ArchiveStableId; + + # setter + $event->new_ArchiveStableId($archive_id); + Description : Getter/setter for new ArchiveStableId in this mapping event. + Return type : Bio::EnsEMBL::ArchiveStableId + Exceptions : thrown on wrong argument type + Caller : general + Status : At Risk + : under development + +=cut + +sub new_ArchiveStableId { + my $self = shift; + + # setter + if (@_) { + my $archive_id = shift; + + # if argument is defined, check type. undef is also legal as an argument. + if (defined($archive_id)) { + throw("Need a Bio::EnsEMBL::ArchiveStableId.") unless + (ref($archive_id) && $archive_id->isa('Bio::EnsEMBL::ArchiveStableId')); + } + + $self->{'new_id'} = $archive_id; + } + + # getter + return $self->{'new_id'}; +} + + +=head2 score + + Arg[1] : (optional) float $score - the score to set + Example : my $score = $event->score; + Description : Getter/setter for mapping event score. + Return type : float or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub score { + my $self = shift; + $self->{'score'} = shift if (@_); + return $self->{'score'}; +} + + +=head2 get_attribute + + Arg[1] : String $type - determines whether to get attribute from 'old' + or 'new' ArchiveStableId + Arg[2] : String $attr - ArchiveStableId attribute to fetch + Example : my $old_stable_id = $event->get_attribute('old', 'stable_id'); + Description : Accessor to attributes of the ArchiveStableIds attached to this + event. Convenience method that does the check for undef old + and/or new ArchiveStableId for you. + Return type : same as respective method in Bio::EnsEMBL::ArchiveStableId, or + undef + Exceptions : thrown on wrong arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub get_attribute { + my ($self, $type, $attr) = @_; + + throw("First argument passed to this function has to be 'old' or 'new'.") + unless ($type eq 'old' or $type eq 'new'); + + my %allowed_attribs = map { $_ => 1 } + qw(stable_id version db_name release assembly); + + throw("Attribute $attr not allowed.") unless $allowed_attribs{$attr}; + + my $call = $type.'_ArchiveStableId'; + + if (my $id = $self->$call) { + return $id->$attr; + } else { + return undef; + } +} + + +=head2 ident_string + + Example : print $event->ident_string, "\n"; + Description : Returns a string that can be used to identify your StableIdEvent. + Useful in debug warnings. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub ident_string { + my $self = shift; + + my $old_id = $self->old_ArchiveStableId; + my $new_id = $self->new_ArchiveStableId; + + my $str; + + if ($old_id) { + $str = $old_id->stable_id.'.'.$old_id->version.' ('. + $old_id->release.')'; + } else { + $str = 'null'; + } + + $str .= ' -> '; + + if ($new_id) { + $str .= $new_id->stable_id.'.'.$new_id->version.' ('. + $new_id->release.')'; + } else { + $str .= 'null'; + } + + $str .= ' ['.$self->score.']'; + + return $str; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/StableIdHistoryTree.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/StableIdHistoryTree.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1202 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::StableIdHistoryTree - object representing a stable ID history tree + +=head1 SYNOPSIS + + my $reg = "Bio::EnsEMBL::Registry"; + my $archiveStableIdAdaptor = + $reg->get_adaptor( 'human', 'core', 'ArchiveStableId' ); + + my $stable_id = 'ENSG00000068990'; + my $history = + $archiveStableIdAdaptor->fetch_history_tree_by_stable_id('ENSG01'); + + print "Unique stable IDs in this tree:\n"; + print join( ", ", @{ $history->get_unique_stable_ids } ), "\n"; + + print "\nReleases in this tree:\n"; + print join( ", ", @{ $history->get_release_display_names } ), "\n"; + + print "\nCoordinates of nodes in the tree:\n\n"; + foreach my $a ( @{ $history->get_all_ArchiveStableIds } ) { + print " Stable ID: " . $a->stable_id . "." . $a->version . "\n"; + print " Release: " + . $a->release . " (" + . $a->assembly . ", " + . $a->db_name . ")\n"; + print " coords: " + . join( ', ', @{ $history->coords_by_ArchiveStableId($a) } ) + . "\n\n"; + } + +=head1 DESCRIPTION + +This object represents a stable ID history tree graph. + +The graph is implemented as a collection of nodes (ArchiveStableId +objects) and links (StableIdEvent objects) which have positions +on an (x,y) grid. The x axis is used for releases, the y axis for +stable_ids. The idea is to create a plot similar to this (the numbers +shown on the nodes are the stable ID versions): + + ENSG001 1-------------- 2-- + \ + ENSG003 1-----1 + / + ENSG002 1-------2---------- + + 38 39 40 41 42 + +The grid coordinates of the ArchiveStableId objects in this example +would be (note that coordinates are zero-based): + + ENSG001.1 (0, 0) + ENSG001.2 (2, 0) + ENSG003.1 (release 41) (3, 1) + ENSG003.1 (release 42) (4, 1) + ENSG002.1 (0, 2) + ENSG002.2 (1, 2) + +The tree will only contain those nodes which had a change in the stable +ID version. Therefore, in the above example, in release 39 ENSG001 was +present and had version 1 (but will not be drawn there, to unclutter the +output). + +The grid positions will be calculated by the API and will try to +untangle the tree (i.e. try to avoid overlapping lines). + +=head1 METHODS + + new + add_ArchiveStableIds + add_ArchiveStableIds_for_events + remove_ArchiveStableId + flush_ArchiveStableIds + add_StableIdEvents + remove_StableIdEvent + flush_StableIdEvents + get_all_ArchiveStableIds + get_all_StableIdEvents + get_latest_StableIdEvent + get_release_display_names + get_release_db_names + get_unique_stable_ids + optimise_tree + coords_by_ArchiveStableId + calculate_coords + consolidate_tree + reset_tree + current_dbname + current_release + current_assembly + is_incomplete + +=head1 RELATED MODULES + + Bio::EnsEMBL::ArchiveStableId + Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor + Bio::EnsEMBL::StableIdEvent + +=cut + +package Bio::EnsEMBL::StableIdHistoryTree; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 new + + Arg [CURRENT_DBNAME] : (optional) name of current db + Arg [CURRENT_RELEASE] : (optional) current release number + Arg [CURRENT_ASSEMBLY] : (optional) current assembly name + Example : my $history = Bio::EnsEMBL::StableIdHistoryTree->new; + Description : object constructor + Return type : Bio::EnsEMBL::StableIdHistoryTree + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = {}; + bless $self, $class; + + my ($current_dbname, $current_release, $current_assembly) = + rearrange([qw( CURRENT_DBNAME CURRENT_RELEASE CURRENT_ASSEMBLY )], @_ ); + + # initialise + $self->{'current_dbname'} = $current_dbname; + $self->{'current_release'} = $current_release; + $self->{'current_assembly'} = $current_assembly; + + return $self; +} + + +=head2 add_ArchiveStableIds + + Arg[1..n] : Bio::EnsEMBL::ArchiveStableId's @archive_ids + The ArchiveStableIds to add to the the history tree + Example : my $archive_id = $archiveStableIdAdaptor->fetch_by_stable_id( + 'ENSG00024808'); + $history->add_ArchiveStableId($archive_id); + Description : Adds ArchiveStableIds (nodes) to the history tree. No + calculation of grid coordinates is done at this point, you need + to initiate this manually with calculate_coords(). + ArchiveStableIds are only added once for each release (to avoid + duplicates). + Return type : none + Exceptions : thrown on invalid or missing argument + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::fetch_history_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub add_ArchiveStableIds { + my ($self, @archive_ids) = @_; + + throw("You must provide one or more Bio::EnsEMBL::ArchiveStableIds to add.") + unless (@archive_ids); + + foreach my $archive_id (@archive_ids) { + throw("Bio::EnsEMBL::ArchiveStableId object expected.") + unless (ref($archive_id) && + $archive_id->isa('Bio::EnsEMBL::ArchiveStableId')); + + $self->{'nodes'}->{$self->_node_id($archive_id)} = $archive_id; + } +} + + +=head2 add_ArchiveStableIds_for_events + + Example : my $history = Bio::EnsEMBL::StableIdHistoryTree->new; + $history->add_StableIdEvents($event1, $event2); + $history->add_ArchiveStableIds_for_events; + Description : Convenience method that adds all ArchiveStableIds for all + StableIdEvents attached to this object to the tree. + Return type : none + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::fetch_history_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub add_ArchiveStableIds_for_events { + my $self = shift; + + foreach my $event (@{ $self->get_all_StableIdEvents }) { + if ($event->old_ArchiveStableId) { + $self->add_ArchiveStableIds($event->old_ArchiveStableId); + } + if ($event->new_ArchiveStableId) { + $self->add_ArchiveStableIds($event->new_ArchiveStableId); + } + } +} + + +=head2 remove_ArchiveStableId + + Arg[1] : Bio::EnsEMBL::ArchiveStableId $archive_id + the ArchiveStableId to remove from the tree + Example : $history->remove_ArchiveStableId($archive_id); + Description : Removes an ArchiveStableId from the tree. + Return type : none + Exceptions : thrown on missing or invalid argument + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::fetch_history_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub remove_ArchiveStableId { + my ($self, $archive_id) = @_; + + throw("Bio::EnsEMBL::ArchiveStableId object expected.") + unless ($archive_id && ref($archive_id) && + $archive_id->isa('Bio::EnsEMBL::ArchiveStableId')); + + my %nodes = %{ $self->{'nodes'} }; + delete $nodes{$self->_node_id($archive_id)}; + $self->{'nodes'} = \%nodes; +} + + +=head2 flush_ArchiveStableIds + + Example : $history->flush_ArchiveStableIds; + Description : Remove all ArchiveStableIds from the tree. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub flush_ArchiveStableIds { + my $self = shift; + $self->{'nodes'} = undef; +} + + +# +# generate a unique node identifier +# +sub _node_id { + my ($self, $archive_id) = @_; + return $archive_id->stable_id . ':' . $archive_id->db_name; +} + + +=head2 add_StableIdEvents + + Arg[1..n] : Bio::EnsEMBL::StableIdEvent's @events + The StableIdEvents to add to the the history tree + Example : $history->add_StableIdEvents($event); + Description : Adds StableIdEvents (links) to the history tree. Note that + ArchiveStableIds attached to the StableIdEvent aren't added to + the tree automatically, you'll need to call + add_ArchiveStableIds_for_events later. + Return type : none + Exceptions : thrown on invalid or missing argument + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::fetch_history_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub add_StableIdEvents { + my ($self, @events) = @_; + + throw("You must provide one or more Bio::EnsEMBL::StableIdsEvents to add.") + unless (@events); + + foreach my $event (@events) { + throw("Bio::EnsEMBL::StableIdEvent object expected.") + unless ($event->isa('Bio::EnsEMBL::StableIdEvent')); + + $self->{'links'}->{$self->_link_id($event)} = $event; + } +} + + +=head2 remove_StableIdEvent + + Arg[1] : Bio::EnsEMBL::StableIdEvent $event + the StableIdEvent to remove from the tree + Example : $history->remove_StableIdEvent($event); + Description : Removes a StableIdEvent from the tree. + Return type : none + Exceptions : thrown on missing or invalid arguments + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::fetch_history_by_stable_id, general + Status : At Risk + : under development + +=cut + +sub remove_StableIdEvent { + my ($self, $event) = @_; + + throw("Bio::EnsEMBL::StableIdEvent object expected.") unless + ($event && ref($event) && $event->isa('Bio::EnsEMBL::StableIdEvent')); + + my %links = %{ $self->{'links'} }; + delete $links{$self->_link_id($event)}; + $self->{'links'} = \%links; +} + + +=head2 flush_StableIdEvents + + Example : $history->flush_StableIdEvents; + Description : Removes all StableIdEvents from the tree. + Return type : none + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub flush_StableIdEvents { + my $self = shift; + $self->{'links'} = undef; +} + + +# +# generate a unique link identifier +# +sub _link_id { + my ($self, $event) = @_; + + my ($old_id, $old_db_name, $new_id, $new_db_name); + if ($event->old_ArchiveStableId) { + $old_id = $event->old_ArchiveStableId->stable_id; + $old_db_name = $event->old_ArchiveStableId->db_name; + } + if ($event->new_ArchiveStableId) { + $new_id = $event->new_ArchiveStableId->stable_id; + $new_db_name = $event->new_ArchiveStableId->db_name; + } + + return join(':', $old_id, $old_db_name, $new_id, $new_db_name); +} + + +=head2 get_all_ArchiveStableIds + + Example : foreach my $arch_id (@{ $history->get_all_ArchiveStableIds }) { + print $arch_id->stable_id, '.', $arch_id->version, "\n"; + } + Description : Gets all ArchiveStableIds (nodes) in this tree. + Return type : Arrayref of Bio::EnsEMBL::ArchiveStableId objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_ArchiveStableIds { + my $self = shift; + return [ values %{ $self->{'nodes'} } ]; +} + + +=head2 get_all_current_ArchiveStableIds + + Example : foreach my $arch_id (@{ $history->get_all_current_ArchiveStableIds }) { + print $arch_id->stable_id, '.', $arch_id->version, "\n"; + } + Description : Convenience method to get all current ArchiveStableIds in this + tree. + + Note that no lazy loading of "current" status is done at that + stage; as long as you retrieve your StableIdHistoryTree object + from ArchiveStableIdAdaptor, you'll get the right answer. In + other use cases, if you want to make sure you really get all + current stable IDs, loop over the result of + get_all_ArchiveStableIds() and call + ArchiveStableId->current_version() on all of them. + Return type : Arrayref of Bio::EnsEMBL::ArchiveStableId objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_current_ArchiveStableIds { + my $self = shift; + + my @current = (); + + foreach my $arch_id (@{ $self->get_all_ArchiveStableIds }) { + push @current, $arch_id if ($arch_id->is_current); + } + + return \@current; +} + + +=head2 get_all_StableIdEvents + + Example : foreach my $event (@{ $history->get_all_StableIdsEvents }) { + print "Old stable ID: ", + ($event->get_attribute('old', 'stable_id') or 'none'), "\n"; + print "New stable ID: ", + ($event->get_attribute('new', 'stable_id') or 'none'), "\n"; + print "Mapping score: ", $event->score, "\n"; + } + Description : Gets all StableIdsEvents (links) in this tree. + Return type : Arrayref of Bio::EnsEMBL::StableIdEvent objects + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_all_StableIdEvents { + my $self = shift; + return [ values %{ $self->{'links'} } ]; +} + + +=head2 get_latest_StableIdEvent + + Arg[1] : Bio::EnsEMBL::ArchiveStableId $arch_id - the stable ID to get + the latest Event for + Example : my $arch_id = Bio::EnsEMBL::ArchiveStableId->new( + -stable_id => 'ENSG00001' + ); + my $event = $history->get_latest_Event($arch_id); + Description : Returns the latest StableIdEvent found in the tree where a given + stable ID is the new stable ID. If more than one is found (e.g. + in a merge scenario in the latest mapping), preference is given + to self-events. + Return type : Bio::EnsEMBL::StableIdEvent + Exceptions : thrown on missing or wrong argument + Caller : Bio::EnsEMBL::DBSQL::ArchiveStableIdAdaptor::add_all_current_to_history, general + Status : At Risk + : under development + +=cut + +sub get_latest_StableIdEvent { + my $self = shift; + my $arch_id = shift; + + unless ($arch_id and $arch_id->isa('Bio::EnsEMBL::ArchiveStableId')) { + throw("Need a Bio::EnsEMBL::ArchiveStableId."); + } + + my @all_events = @{ $self->get_all_StableIdEvents }; + my @self_events = (); + + while (my $event = shift(@all_events)) { + if ($event->new_ArchiveStableId and + $event->new_ArchiveStableId->stable_id eq $arch_id->stable_id) { + push @self_events, $event; + } + } + + my @sorted = sort { $b->new_ArchiveStableId->release <=> + $a->new_ArchiveStableId->release } @self_events; + + # give priority to self events + my $latest; + while ($latest = shift @sorted) { + last if (($latest->old_ArchiveStableId and + $latest->old_ArchiveStableId->stable_id eq $arch_id->stable_id) + or !$latest->old_ArchiveStableId); + } + + return $latest; +} + + +=head2 get_release_display_names + + Example : print "Unique release display_names in this tree:\n" + foreach my $name (@{ $history->get_release_display_names }) { + print " $name\n"; + } + Description : Returns a chronologically sorted list of unique release + display_names in this tree. + + This method can be used to determine the number of columns when + plotting the history tree. + Return type : Arrayref of strings. + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_release_display_names { + my $self = shift; + + my @display_names = map { $_->[1] } @{ $self->_sort_releases }; + + return \@display_names; +} + + +=head2 get_release_db_names + + Example : print "Unique release db_names in this tree:\n" + foreach my $name (@{ $history->get_release_db_names }) { + print " $name\n"; + } + Description : Returns a chronologically sorted list of unique release + db_names in this tree. + Return type : Arrayref of strings. + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_release_db_names { + my $self = shift; + + my @db_names = map { $_->[0] } @{ $self->_sort_releases }; + + return \@db_names; +} + + +# +# Create a chronologically sorted list of releases. +# +# Return type : Arrayref of arrayrefs (db_name, release) +# +sub _sort_releases { + my $self = shift; + + unless ($self->{'sorted_tree'}->{'releases'}) { + + my %unique = (); + + foreach my $archive_id (@{ $self->get_all_ArchiveStableIds }) { + $unique{join(':', $archive_id->db_name, $archive_id->release)} = 1; + } + + # sort releases by release number, then db_name; this should get them into + # chronological order + my @releases = sort { $a->[1] <=> $b->[1] || $a->[0] cmp $b->[0] } + map { [ split(/:/, $_) ] } keys(%unique); + + $self->{'sorted_tree'}->{'releases'} = \@releases; + + } + + return $self->{'sorted_tree'}->{'releases'}; +} + + +=head2 get_unique_stable_ids + + Example : print "Unique stable IDs in this tree:\n" + foreach my $id (@{ $history->get_unique_stable_ids }) { + print " $id\n"; + } + Description : Returns a list of unique stable IDs in this tree. Version is not + taken into account here. This method can be used to determine + the number of rows when plotting the history with each stable ID + occupying one line. + + Sort algorithm will depend on what was chosen when the sorted + tree was generated. This ranges from a simple alphanumeric sort + to algorithms trying to untangle the history tree. If no + pre-sorted data is found, an alphanumerically sorted list will + be returned by default. + Return type : Arrayref of strings. + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub get_unique_stable_ids { + my $self = shift; + + unless ($self->{'sorted_tree'}->{'stable_ids'}) { + $self->{'sorted_tree'}->{'stable_ids'} = $self->_sort_stable_ids; + } + + return $self->{'sorted_tree'}->{'stable_ids'}; +} + + +# +# Returns a list of stable IDs in this history tree, sorted alphabetically. +# This is the simplest sort function used and doesn't try to untangle the tree. +# +# Return type : Arrayref +# +sub _sort_stable_ids { + my $self = shift; + my %unique = map { $_->stable_id => 1 } @{ $self->get_all_ArchiveStableIds }; + return [sort keys %unique]; +} + + +=head2 optimise_tree + + Example : $history->optimise_tree; + Description : This method sorts the history tree so that the number of + overlapping branches is minimised (thus "untangling" the tree). + + It uses a clustering algorithm for this which iteratively moves + the nodes with the largest vertical distance next to each other + and looking for a mininum in total branch length. This might not + produce the overall optimum but usually converges on a local + optimum very quickly. + Return type : none + Exceptions : none + Caller : calculate_coords + Status : At Risk + : under development + +=cut + +sub optimise_tree { + my $self = shift; + + # get all non-self events + my @links; + foreach my $event (@{ $self->get_all_StableIdEvents }) { + next unless ($event->old_ArchiveStableId and $event->new_ArchiveStableId); + my $old_id = $event->old_ArchiveStableId->stable_id; + my $new_id = $event->new_ArchiveStableId->stable_id; + push @links, [$old_id, $new_id] if ($old_id ne $new_id); + } + + # get initial list of sorted unique stable IDs and put them into a position + # lookup hash + my $i = 0; + my %pos = map { $_ => $i++ } @{ $self->_sort_stable_ids }; + + my $opt_length; + my $successive_fails = 0; + my $k = 0; + my %seen; + + # for debug purposes: + # find the number of permutations for the given number of stable IDs + my $fact = $self->_factorial(scalar(keys %pos)); + + OPT: + while ($successive_fails < 100) { + + # sort links by vertical distance + #warn "sorting\n"; + $self->_sort_links(\@links, \%pos); + + # loop over sorted links + SORTED: + foreach my $link (@links) { + + #warn " trying ".join('-', @$link)."\n"; + + $k++; + + # remember last sort order + my %last = %pos; + + #my $this_order = join(':', sort { $pos{$a} <=> $pos{$b} } keys %pos); + #warn " before $this_order\n"; + + # try both to move bottom node next to top node's current position and + # top node next to bottom node's position - one of the methods might give + # you better results + DIRECT: + foreach my $direction (qw(up down)) { + + # move the nodes next to each other + $self->_move_nodes($link, \%pos, $direction); + + # next if we've seen this sort order before + my $new_order = join(':', sort { $pos{$a} <=> $pos{$b} } keys %pos); + #warn " after ($direction) $new_order\n"; + if ($seen{$new_order}) { + #warn " seen\n"; + %pos = %last; + next DIRECT; + } + $seen{$new_order} = 1; + + # calculate total link length for this sort order + my $total_length = $self->_total_link_length(\@links, \%pos); + + if (!$opt_length or $total_length < $opt_length) { + #warn " better ($total_length/$opt_length)\n"; + $opt_length = $total_length; + $successive_fails = 0; + next OPT; + } else { + #warn " worse ($total_length/$opt_length)\n"; + %pos = %last; + $successive_fails++; + } + } + + } + + last OPT; + + } + + #warn "Needed $k tries (of $fact) to find optimal tree.\n"; + + my @best = sort { $pos{$a} <=> $pos{$b} } keys %pos; + $self->{'sorted_tree'}->{'stable_ids'} = \@best; +} + + +# +# find the number of permutations for a give array size. +# used for debugging code (compare implemented algorithm to looping over all +# possible permutations). +# +sub _factorial { + my ($self, $n) = @_; + my $s = 1; + $s *= $n-- while $n > 0; + return $s; +} + + +# +# sort links by vertical distance +# +sub _sort_links { + my ($self, $links, $pos) = @_; + + my @lookup; + + foreach my $link (@$links) { + my $dist = $pos->{$link->[0]} - $pos->{$link->[1]}; + $dist = -$dist if ($dist < 0); + push @lookup, [$dist, $link]; + #warn " $dist ".join(' ', @$link)."\n"; + } + + @$links = map { $_->[1] } sort { $b->[0] <=> $a->[0] } @lookup; +} + + +# +# make two nodes adjacent by moving the second node next to the first node +# all other node coordinates are adjusted accordingly +# +sub _move_nodes { + my ($self, $link, $pos, $direction) = @_; + + my $first_pos = $pos->{$link->[0]}; + my $second_pos = $pos->{$link->[1]}; + + # swap positions if necessary + if ($first_pos > $second_pos) { + my $tmp = $second_pos; + $second_pos = $first_pos; + $first_pos = $tmp; + } + #warn " $first_pos:$second_pos\n"; + + foreach my $p (keys %$pos) { + + my $val = $pos->{$p}; + + #warn " $p $val\n"; + if ($direction eq 'up') { + if ($val > $first_pos and $val < $second_pos) { + $val++; + } elsif ($val == $second_pos) { + $val = $first_pos + 1; + } + } else { + if ($val > $first_pos and $val < $second_pos) { + $val--; + } elsif ($val == $first_pos) { + $val = $second_pos - 1; + } + } + + #warn " $p $val\n"; + $pos->{$p} = $val; + #warn "\n"; + } +} + + +# +# calculate the total link (vertical distance) length based on this sort order +# +sub _total_link_length { + my ($self, $links, $pos) = @_; + + my $total_length; + + foreach my $link (@$links) { + my $length = $pos->{$link->[0]} - $pos->{$link->[1]}; + $length = -$length if ($length < 0); + $total_length += $length; + } + + return $total_length; +} + + +=head2 coords_by_ArchiveStableId + + Arg[1] : Bio::EnsEMBL::ArchiveStableId $archive_id + The ArchiveStableId to get tree grid coordinates for + Example : my ($x, $y) = + @{ $history->coords_by_ArchiveStableId($archive_id) }; + print $archive_id->stable_id, " coords: $x, $y\n"; + Description : Returns the coordinates of an ArchiveStableId in the history + tree grid. If the ArchiveStableId isn't found in this tree, an + empty list is returned. + + Coordinates are zero-based (i.e. the top leftmost element in + the grid has coordinates [0, 0], not [1, 1]). This is to + facilitate using them to create a matrix as a two-dimensional + array of arrays. + Return type : Arrayref (x coordinate, y coordinate) + Exceptions : thrown on wrong argument type + Caller : general + Status : At Risk + : under development + +=cut + +sub coords_by_ArchiveStableId { + my ($self, $archive_id) = @_; + + throw("Bio::EnsEMBL::ArchiveStableId object expected.") + unless ($archive_id and ref($archive_id) and + $archive_id->isa('Bio::EnsEMBL::ArchiveStableId')); + + return $self->{'sorted_tree'}->{'coords'}->{$self->_node_id($archive_id)} + || []; +} + + +=head2 calculate_coords + + Example : $history->calculate_coords; + Description : Pre-calculates the grid coordinates of all nodes in the tree. + Return type : none + Exceptions : none + Caller : ArchiveStableIdAdaptor::fetch_history_by_stable_id + Status : At Risk + : under development + +=cut + +sub calculate_coords { + my $self = shift; + + # reset any previous tree cordinate calculations + $self->reset_tree; + + # the "master" information for the sorted tree is stored as the sorted lists + # of releases (x) and stable IDs (y). Sort them now. + my $db_names = $self->get_release_db_names; + + # untangle tree by sorting stable IDs appropriately + $self->optimise_tree; + my $stable_ids = $self->get_unique_stable_ids; + + # for performance reasons, additionally store coordinates in a lookup hash + foreach my $archive_id (@{ $self->get_all_ArchiveStableIds }) { + + # coordinates are positions in the sorted lists + my $x = $self->_index_of($archive_id->db_name, $db_names); + my $y = $self->_index_of($archive_id->stable_id, $stable_ids); + + $self->{'sorted_tree'}->{'coords'}->{$self->_node_id($archive_id)} = + [ $x, $y ]; + } +} + +# +# Description : Returns the index of an element in an array +# Example : my @array = (a, b, c); +# my $i = _index_of('b', \@array); # will return 1 +# Return type : Int (or undef if element is not found in array) +# +sub _index_of { + my ($self, $element, $arrayref) = @_; + + throw("Expecting arrayref argument.") unless (ref($arrayref) eq 'ARRAY'); + + my @array = @$arrayref; + + while (my $e = pop(@array)) { + return scalar(@array) if ($e eq $element); + } + + return undef; +} + + +=head2 consolidate_tree + + Example : $history->consolidate_tree; + Description : Consolidate the history tree. This means removing nodes where + there wasn't a change and bridging gaps in the history. The end + result will be a sparse tree which only contains the necessary + information. + Return type : none + Exceptions : none + Caller : ArchiveStableIdAdaptor->fetch_history_tree_by_stable_id + Status : At Risk + : under development + +=cut + +sub consolidate_tree { + my $self = shift; + + # + # get all self-events and creations/deletions and sort them (by stable ID and + # chronologically) + # + my @event_lookup; + + foreach my $event (@{ $self->get_all_StableIdEvents }) { + + my $old_id = $event->old_ArchiveStableId; + my $new_id = $event->new_ArchiveStableId; + + if (!$old_id or !$new_id or ($old_id->stable_id eq $new_id->stable_id)) { + if ($old_id) { + push @event_lookup, [$old_id->stable_id, $old_id->release, + $old_id->db_name, $event]; + } else { + push @event_lookup, [$new_id->stable_id, $new_id->release - 1, + $new_id->db_name, $event]; + } + } + } + + my @self_events = map { $_->[3] } + sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] || $a->[2] cmp $b->[2] } + @event_lookup; + + # + # consolidate tree + # + my $last = shift(@self_events); + + while (my $event = shift(@self_events)) { + + my $lo = $last->old_ArchiveStableId; + my $ln = $last->new_ArchiveStableId; + my $eo = $event->old_ArchiveStableId; + my $en = $event->new_ArchiveStableId; + + if ($lo and $eo and $en and $lo->stable_id eq $eo->stable_id + and $lo->version eq $eo->version) { + + # this removes redundant nodes and connects the remaining nodes: + # + # o--o--o -> o-----o + # 1 1 1 1 1 + + #warn 'A: '.$last->ident_string.' | '.$event->ident_string."\n"; + + $self->remove_StableIdEvent($last); + $self->remove_StableIdEvent($event); + + $event->old_ArchiveStableId($lo); + + $self->add_StableIdEvents($event); + + } elsif ($ln and $eo and $ln->db_name ne $eo->db_name + and $ln->stable_id eq $eo->stable_id and $ln->version eq $eo->version) { + + # try to brigde gaps + + if ($en) { + + # o--o o--o -> o--o-----o + # 1 2 2 2 1 2 2 + # + # o o--o -> o-----o + # 1 1 1 1 1 + + #warn 'X: '.$last->ident_string.' | '.$event->ident_string."\n"; + + $self->remove_StableIdEvent($event); + $event->old_ArchiveStableId($ln); + $self->add_StableIdEvents($event); + + } elsif ($lo) { + + # there's a deletion event, deal with it differently + + if ($lo->version eq $ln->version) { + + # o--o o -> o-----o + # 1 1 1 1 1 + + #warn 'Y: '.$last->ident_string.' | '.$event->ident_string."\n"; + + $self->remove_StableIdEvent($last); + $last->new_ArchiveStableId($eo); + $self->add_StableIdEvents($last); + + } else { + + # o--o o -> o--o--o + # 1 2 2 1 2 2 + + #warn 'Z: '.$last->ident_string.' | '.$event->ident_string."\n"; + + $self->remove_StableIdEvent($event); + $event->old_ArchiveStableId($ln); + $event->new_ArchiveStableId($eo); + $self->add_StableIdEvents($event); + + } + + } else { + + # creation followed by deletion in next mapping + # + # o o -> o--o + # 1 1 1 1 + + #warn 'Q: '.$last->ident_string.' | '.$event->ident_string."\n"; + + $self->remove_StableIdEvent($last); + $self->remove_StableIdEvent($event); + $event->old_ArchiveStableId($ln); + $event->new_ArchiveStableId($eo); + $self->add_StableIdEvents($event); + + } + + } else { + #warn 'C: '.$last->ident_string.' | '.$event->ident_string."\n"; + } + + $last = $event; + } + + # now add ArchiveStableIds of the remaining events to the tree + $self->add_ArchiveStableIds_for_events; +} + + +=head2 reset_tree + + Example : $history->reset_tree; + Description : Resets all pre-calculated tree grid data. Mostly used internally + by methods that modify the tree. + Return type : none + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub reset_tree { + my $self = shift; + $self->{'sorted_tree'} = undef; +} + + +=head2 current_dbname + + Arg[1] : (optional) String $dbname - the dbname to set + Example : my $dbname = $history->current_dbname; + Description : Getter/setter for current dbname. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub current_dbname { + my $self = shift; + $self->{'current_dbname'} = shift if (@_); + return $self->{'current_dbname'}; +} + + +=head2 current_release + + Arg[1] : (optional) Int $release - the release to set + Example : my $release = $history->current_release; + Description : Getter/setter for current release. + Return type : Int + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub current_release { + my $self = shift; + $self->{'current_release'} = shift if (@_); + return $self->{'current_release'}; +} + + +=head2 current_assembly + + Arg[1] : (optional) String $assembly - the assembly to set + Example : my $assembly = $history->current_assembly; + Description : Getter/setter for current assembly. + Return type : String + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub current_assembly { + my $self = shift; + $self->{'current_assembly'} = shift if (@_); + return $self->{'current_assembly'}; +} + + +=head2 is_incomplete + + Arg[1] : (optional) Boolean $incomplete + Example : if ($history->is_incomplete) { + print "Returned tree is incomplete due to too many mappings + in the database.\n"; + } + Description : Getter/setter for incomplete flag. This is used by + ArchiveStableIdAdaptor to indicate that it finished building + the tree prematurely due to too many mappins in the db and can + be used by applications to print warning messages. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub is_incomplete { + my $self = shift; + $self->{'incomplete'} = shift if (@_); + return $self->{'incomplete'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Storable.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Storable.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,240 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Storable + +=head1 SYNOPSIS + + my $dbID = $storable_object->dbID(); + my $adaptor = $storable_object->adaptor(); + if ( $storable_object->is_stored($db_adaptor) ) { ... } + +=head1 DESCRIPTION + +This is a storable base class. All objects which are storable +in the database should inherit from this class. It provides two +getter/setters: dbID() adaptor(). And a is_stored() method that can be +used to determine if an object is already stored in a database. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Storable; + + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Scalar::Util qw(weaken); + +=head2 new + + Arg [-ADAPTOR] : Bio::EnsEMBL::DBSQL::BaseAdaptor + Arg [-dbID] : database internal id + Caller : internal calls + Description : create a new Storable object + Returntype : Bio::EnsEMBL::Storable + Exceptions : Adaptor not a Bio::EnsEMBL::DBSQL::BaseAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($adaptor, $dbID) = rearrange(['ADAPTOR', 'dbID'],@_); + + if($adaptor) { + if(!ref($adaptor) || !$adaptor->isa('Bio::EnsEMBL::DBSQL::BaseAdaptor')) { + throw('-ADAPTOR argument must be a Bio::EnsEMBL::DBSQL::BaseAdaptor'); + } + } + + my $self = bless({'dbID' => $dbID}, $class); + $self->adaptor($adaptor); + return $self; +} + + +=head2 dbID + + Arg [1] : int $dbID + Description: getter/setter for the database internal id + Returntype : int + Exceptions : none + Caller : general, set from adaptor on store + Status : Stable + +=cut + +sub dbID { + my $self = shift; + $self->{'dbID'} = shift if(@_); + return $self->{'dbID'}; +} + + + +=head2 adaptor + + Arg [1] : Bio::EnsEMBL::DBSQL::BaseAdaptor $adaptor + Description: get/set for this objects Adaptor + Returntype : Bio::EnsEMBL::DBSQL::BaseAdaptor + Exceptions : none + Caller : general, set from adaptor on store + Status : Stable + +=cut + +sub adaptor { + my ($self, $adaptor) = @_; + if(scalar(@_) > 1) { + if(defined $adaptor) { + assert_ref($adaptor, 'Bio::EnsEMBL::DBSQL::BaseAdaptor', 'adaptor'); + $self->{adaptor} = $adaptor; + weaken($self->{adaptor}); + } + else { + $self->{adaptor} = undef; + } + } + return $self->{adaptor} +} + + + +=head2 is_stored + + Arg [1] : Bio::EnsEMBL::DBSQL::DBConnection + : or Bio::EnsEMBL::DBSQL::DBAdaptor + Example : do_something if($object->is_stored($db)); + Description: Returns true if this object is stored in the provided database. + This works under the assumption that if the adaptor and dbID are + set and the database of the adaptor shares the port, dbname and + hostname with the provided database, this object is stored in + that database. + Returntype : 1 or 0 + Exceptions : throw if dbID is set but adaptor is not + throw if adaptor is set but dbID is not + throw if incorrect argument is passed + Caller : store methods + Status : Stable + +=cut + +my $message_only_once =1; + +sub is_stored { + my $self = shift; + my $db = shift; + + if($db and $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { + $db = $db->dbc(); + } + if(!$db || !ref($db) || !$db->isa('Bio::EnsEMBL::DBSQL::DBConnection')) { + throw('db argument must be a Bio::EnsEMBL::DBSQL::DBConnection'); + } + + my $adaptor = $self->{'adaptor'}; + my $dbID = $self->{'dbID'}; + + if($dbID && !$adaptor) { + if($message_only_once){ + warning("Storable object has a dbID but not an adaptor.\n" . + 'Storable objects must have neither OR both.'); + $message_only_once = 0; + } + return 0; + } + + if($adaptor && !$dbID) { + if($message_only_once){ + warning("Storable object has an adaptor but not a dbID.\n". + "Storable objects must have neither OR both."); + $message_only_once = 0; + } + return 0; + } + + return 0 if (!$adaptor && !$dbID); + + my $cur_db = $adaptor->dbc(); + + # + # Databases are the same if they share the same port, host and username + # + if ( $db->port() eq $cur_db->port() + && $db->host() eq $cur_db->host() + && $db->dbname() eq $cur_db->dbname() ) + { + return 1; + } + + return 0; +} + +sub get_all_DAS_Features{ + my ($self, $slice) = @_; + + $self->{_das_features} ||= {}; # Cache + $self->{_das_styles} ||= {}; # Cache + $self->{_das_segments} ||= {}; # Cache + my %das_features; + my %das_styles; + my %das_segments; + + foreach my $dasfact( @{$self->get_all_DASFactories} ){ + my $dsn = $dasfact->adaptor->dsn; + my $name = $dasfact->adaptor->name; + my $url = $dasfact->adaptor->url; + + # Construct a cache key : SOURCE_URL/TYPE + # Need the type to handle sources that serve multiple types of features + + my ($type) = ref($dasfact->adaptor->mapping) eq 'ARRAY' ? @{$dasfact->adaptor->mapping} : $dasfact->adaptor->mapping; + $type ||=$dasfact->adaptor->type; + my $key = join('/', $name, $type); + + if( $self->{_das_features}->{$key} ){ # Use cached + $das_features{$name} = $self->{_das_features}->{$key}; + $das_styles{$name} = $self->{_das_styles}->{$key}; + $das_segments{$name} = $self->{_das_segments}->{$key}; + } else { # Get fresh data + + my ($featref, $styleref, $segref) = ($type =~ /^ensembl_location/) ? ($dasfact->fetch_all_Features( $slice, $type )) : $dasfact->fetch_all_by_ID( $self ); + + $self->{_das_features}->{$key} = $featref; + $self->{_das_styles}->{$key} = $styleref; + $self->{_das_segments}->{$key} = $segref; + $das_features{$name} = $featref; + $das_styles{$name} = $styleref; + $das_segments{$name} = $segref; + } + } + + return (\%das_features, \%das_styles, \%das_segments); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/StrainSlice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/StrainSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,839 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::StrainSlice - SubClass of the Slice. Represents the slice +of the genome for a certain strain (applying the variations) + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + $strainSlice = $slice->get_by_strain($strain_name); + + # get the sequence from the Strain Slice + my $seq = $strainSlice->seq(); + print $seq; + + # get allele features between this StrainSlice and the reference + my $afs = $strainSlice->get_all_AlleleFeatures_Slice(); + foreach my $af ( @{$afs} ) { + print "AlleleFeature in position ", $af->start, "-", $af->end, + " in strain with allele ", $af->allele_string, "\n"; + } + + # compare a strain against another strain + my $strainSlice_2 = $slice->get_by_strain($strain_name_2); + my $differences = + $strainSlice->get_all_differences_StrainSlice($strainSlice_2); + + foreach my $difference ( @{$differences} ) { + print "Difference in position ", $difference->start, "-", + $difference->end(), " in strain with allele ", + $difference->allele_string(), "\n"; + } + +=head1 DESCRIPTION + +A StrainSlice object represents a region of a genome for a certain +strain. It can be used to retrieve sequence or features from a strain. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::StrainSlice; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +@ISA = qw(Bio::EnsEMBL::Slice); + + +=head2 new + + Arg[1] : Bio::EnsEMBL::Slice $slice + Arg[2] : string $strain_name + Example : $strainSlice = Bio::EnsEMBL::StrainSlice->new(-.... => , + -strain_name => $strain_name); + Description : Creates a new Bio::EnsEMBL::StrainSlice object that will contain a shallow copy of the + Slice object, plus additional information such as the Strain this Slice refers to + and listref of Bio::EnsEMBL::Variation::AlleleFeatures of differences with the + reference sequence + ReturnType : Bio::EnsEMBL::StrainSlice + Exceptions : none + Caller : general + +=cut + +sub new{ + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($strain_name) = rearrange(['STRAIN_NAME'],@_); + + my $self = $class->SUPER::new(@_); + + $self->{'strain_name'} = $strain_name; + + if(!$self->adaptor()) { + warning('Cannot get new StrainSlice features without attached adaptor'); + return ''; + } + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return ''; + } + + my $af_adaptor = $variation_db->get_AlleleFeatureAdaptor; + + if( $af_adaptor ) { + #get the Individual for the given strain + my $ind_adaptor = $variation_db->get_IndividualAdaptor; + + if ($ind_adaptor){ + my $individual = shift @{$ind_adaptor->fetch_all_by_name($self->{'strain_name'})}; #the name should be unique for a strain + #check that the individua returned isin the database + + if (defined $individual){ + my $allele_features = $af_adaptor->fetch_all_by_Slice($self,$individual); + #warning("No strain genotype data available for Slice ".$self->name." and Strain ".$individual->name) if ! defined $allele_features->[0]; + my $vf_ids = {}; #hash containing the relation vf_id->af + $self->{'_strain'} = $individual; + map {defined $_->{'_variation_feature_id'} ? $vf_ids->{$_->{'_variation_feature_id'}} = $_ : '' +} @{$allele_features}; +# my $new_allele_features = $self->_filter_af_by_coverage($allele_features); +# $self->{'alleleFeatures'} = $new_allele_features; + $self->{'alleleFeatures'} = $allele_features || []; + $self->{'_vf_ids'} = $vf_ids; + return $self; + } + else{ + warning("Strain ($self->{strain_name}) not in the database"); + return $self; + } + } + else{ + warning("Not possible to retrieve IndividualAdaptor from the variation database"); + return ''; + } + } else { + warning("Not possible to retrieve VariationFeatureAdaptor from variation database"); + return ''; + } +} + +=head2 _filter_af_by_coverage + + Arg [1] : listref to Bio::EnsEMBL::Variation::AlleleFeatures $allele_features + Example : my $new_list_allele_features = $strainSlice->_filter_af_by_coverage($allele_features); + Description : For a list of allele features, gets a new list where they are filter depending on coverage + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : internal function + +=cut + +sub _filter_af_by_coverage{ + my $self = shift; + my $allele_features = shift; + + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return ''; + } + + my $rc_adaptor = $variation_db->get_ReadCoverageAdaptor(); + #this is ugly, but ReadCoverage is always defined in the positive strand + +### EK : - it looks like the arguments to fetch_all_by_Slice_Sample_depth have changed +### passing 1 will only get you the coverage of level 1 +### by omitting the parameter we take into account all coverage regions +# my $rcs = $rc_adaptor->fetch_all_by_Slice_Sample_depth($self,$self->{'_strain'},1); + my $rcs = $rc_adaptor->fetch_all_by_Slice_Sample_depth($self,$self->{'_strain'}); + my $new_af; + foreach my $af (@{$allele_features}){ + foreach my $rc (@{$rcs}){ + if ($af->start <= $rc->end and $af->start >= $rc->start){ + push @{$new_af}, $af; + last; + } + } + } + + return $new_af; +} + + +=head2 strain_name + + Arg [1] : (optional) string $strain_name + Example : my $strain_name = $strainSlice->strain_name(); + Description : Getter/Setter for the name of the strain + ReturnType : string + Exceptions : none + Caller : general + +=cut + +sub strain_name{ + my $self = shift; + if (@_){ + $self->{'strain_name'} = shift @_; + } + return $self->{'strain_name'}; +} + + +=head2 display_Slice_name + + Args : none + Example : my $strain_name = $strainSlice->display_Slice_name(); + Description : Getter for the name of the strain + ReturnType : string + Exceptions : none + Caller : webteam + +=cut + +sub display_Slice_name{ + my $self = shift; + + return $self->strain_name; +} + +=head2 seq + + Arg [1] : int $with_coverage (optional) + Example : print "SEQUENCE = ", $strainSlice->seq(); + Description: Returns the sequence of the region represented by this + slice formatted as a string in the strain. If flag with_coverage + is set to 1, returns sequence if there is coverage in the region + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub seq { + my $self = shift; + my $with_coverage = shift; + + $with_coverage ||= 0; + + # special case for in-between (insert) coordinates + return '' if($self->start() == $self->end() + 1); + + return $self->{'seq'} if($self->{'seq'}); + + if($self->adaptor()) { + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + my $reference_sequence = $seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1); #get the reference sequence for that slice + + #apply all differences to the reference sequence + #first, in case there are any indels, create the new sequence (containing the '-' bases) + # sort edits in reverse order to remove complication of + # adjusting downstream edits + my @indels_ordered = sort {$b->start() <=> $a->start()} @{$self->{'alignIndels'}} if (defined $self->{'alignIndels'}); + + foreach my $vf (@indels_ordered){ + $vf->apply_edit($reference_sequence); #change, in the reference sequence, the vf + } + + #need to find coverage information if diffe + # sort edits in reverse order to remove complication of + # adjusting downstream edits + my @variation_features_ordered = sort {$b->start() <=> $a->start()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + + foreach my $vf (@variation_features_ordered){ + $vf->apply_edit($reference_sequence); #change, in the reference sequence, the vf + } + + #need to find coverage information if different from reference + my $indAdaptor = $self->adaptor->db->get_db_adaptor('variation')->get_IndividualAdaptor; + my $ref_strain = $indAdaptor->get_reference_strain_name; + $self->_add_coverage_information($reference_sequence) if ($with_coverage == 1 && $self->strain_name ne $ref_strain); + return substr(${$reference_sequence},0,1) if ($self->length == 1); + return substr(${$reference_sequence},0,$self->expanded_length); #returns the reference sequence, applying the variationFeatures. Remove additional bases added due to indels + } + + # no attached sequence, and no db, so just return Ns + return 'N' x $self->length(); +} + +sub expanded_length() { + my $self = shift; + + my $length = $self->SUPER::length(); + + foreach my $af(@{$self->{'alleleFeatures'}}) { + $length += $af->length_diff() if $af->length_diff > 0; + } + + return $length; +} + + + +sub _add_coverage_information{ + my $self = shift; + my $reference_sequence = shift; + + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return ''; + } + + my $rc_adaptor = $variation_db->get_ReadCoverageAdaptor(); +### EK : - it looks like the arguments to fetch_all_by_Slice_Sample_depth have changed +### passing 1 will only get you the coverage of level 1 +### by omitting the parameter we take into account all coverage regions +# my $rcs = $rc_adaptor->fetch_all_by_Slice_Sample_depth($self,$self->{'_strain'},1); + my $rcs = $rc_adaptor->fetch_all_by_Slice_Sample_depth($self,$self->{'_strain'}); + my $rcs_sorted; + @{$rcs_sorted} = sort {$a->start <=> $b->start} @{$rcs} if ($self->strand == -1); + $rcs = $rcs_sorted if ($self->strand == -1); + my $start = 1; + + + # wm2 - new code to mask sequence, instead starts with masked string + # and unmasks seq where there is read coverage + + # get all length-changing vars + my @indels_ordered = sort {$a->start() <=> $b->start()} @{$self->{'alignIndels'}} if (defined $self->{'alignIndels'}); + + my $masked_seq = '~' x length($$reference_sequence); + + foreach my $rc(@{$rcs}) { + my ($start, $end) = ($rc->start, $rc->end); + + # adjust region for indels + foreach my $indel(@indels_ordered) { + next if $rc->start > $end; + + # if within RC region, only need adjust the end + $start += $indel->length_diff unless $indel->start > $start; + $end += $indel->length_diff; + } + + # adjust coords for seq boundaries + $start = 1 if $start < 1; + $end = CORE::length($masked_seq) if $end > CORE::length($masked_seq); + + # now unmask the sequence using $$reference_sequence + substr($masked_seq, $start - 1, $end - $start + 1) = substr($$reference_sequence, $start - 1, $end - $start + 1); + } + + # wm2 - old code, starts with sequence and masks regions between read coverage - BUGGY +# foreach my $rc (@{$rcs}){ +# $rc->start(1) if ($rc->start < 0); #if the region lies outside the boundaries of the slice +# $rc->end($self->end - $self->start + 1) if ($rc->end + $self->start > $self->end); +# +# warn "Adjusted: ", $rc->start, "-", $rc->end; +# +# warn "Covering from ", $start, " over ", ($rc->start - $start - 1), " bases"; +# +# substr($$reference_sequence, $start-1,($rc->start - $start - 1),'~' x ($rc->start - $start - 1)) if ($rc->start - 1 > $start); +# $start = $rc->end; +# +# } +# substr($$reference_sequence, $start, ($self->length - $start) ,'~' x ($self->length - $start)) if ($self->length -1 > $start); + + # copy the masked sequence to the reference sequence + $$reference_sequence = $masked_seq; +} + + +=head2 get_AlleleFeature + + Arg[1] : Bio::EnsEMBL::Variation::VariationFeature $vf + Example : my $af = $strainSlice->get_AlleleFeature($vf); + Description : Returns the AlleleFeature object associated with the VariationFeature (if any) + ReturnType : Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : general + +=cut + +sub get_AlleleFeature{ + my $self = shift; + my $vf = shift; + + my $af; + #look at the hash containing the relation vf_id->alleleFeature, if present, return object, otherwise, undef + $af = $self->{'_vf_ids'}->{$vf->dbID} if (defined $self->{'_vf_ids'}->{$vf->dbID}); + return $af; +} + + +=head2 get_all_AlleleFeatures_Slice + + Arg[1] : int $with_coverage (optional) + Example : my $af = $strainSlice->get_all_AlleleFeatures_Slice() + Description : Gets all AlleleFeatures between the StrainSlice object and the Slice is defined. + If argument $with_coverage set to 1, returns only AF if they have coverage information + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : general + +=cut + +sub get_all_AlleleFeatures_Slice{ + my $self = shift; + my $with_coverage = shift; + + my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); + + unless($variation_db) { + warning("Variation database must be attached to core database to " . + "retrieve variation information" ); + return ''; + } + my $indAdaptor = $variation_db->get_IndividualAdaptor(); + my $ref_name = $indAdaptor->get_reference_strain_name; + return [] if ($self->strain_name eq $ref_name); + $with_coverage ||= 0; #by default, get all AlleleFeatures + if ($with_coverage == 1){ + my $new_allele_features = $self->_filter_af_by_coverage($self->{'alleleFeatures'}); + return $new_allele_features || []; + } + + return $self->{'alleleFeatures'} || []; +} + +=head2 get_all_differences_StrainSlice + + Arg[1] : Bio::EnsEMBL::StrainSlice $ss + Example : my $differences = $strainSlice->get_all_differences_StrainSlice($ss) + Description : Gets differences between 2 StrainSlice objects + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : thrown on bad argument + Caller : general + +=cut + +sub get_all_differences_StrainSlice{ + my $self = shift; + my $strainSlice = shift; + + if (!ref($strainSlice) || !$strainSlice->isa('Bio::EnsEMBL::StrainSlice')){ + throw('Bio::EnsEMBL::StrainSlice arg expected'); + } + if ( @{$self->{'alleleFeatures'}} == 0 && @{$strainSlice->{'alleleFeatures'}} == 0){ + return undef; #there are no differences in any of the Strains + + } + my $differences; #differences between strains + if (@{$strainSlice->{'alleleFeatures'}} == 0){ + #need to create a copy of VariationFeature + foreach my $difference (@{$self->{'alleleFeatures'}}){ + my %vf = %$difference; + push @{$differences},bless \%vf,ref($difference); + } + } + elsif (@{$self->{'alleleFeatures'}} == 0){ + #need to create a copy of VariationFeature, but changing the allele by the allele in the reference + foreach my $difference (@{$strainSlice->{'alleleFeatures'}}){ + push @{$differences}, $strainSlice->_convert_difference($difference); + } + } + else{ + #both strains have differences + #create a hash with the differences in the self strain slice + my %variation_features_self = map {$_->start => $_} @{$self->{'alleleFeatures'}}; + foreach my $difference (@{$strainSlice->{'alleleFeatures'}}){ + #there is no difference in the other strain slice, convert the allele + if (!defined $variation_features_self{$difference->start}){ + push @{$differences},$strainSlice->_convert_difference($difference); + } + else{ + #if it is defined and have the same allele, delete from the hash + if ($variation_features_self{$difference->start}->allele_string eq $difference->allele_string){ + delete $variation_features_self{$difference->start}; + } + } + } + #and copy the differences that in the self + foreach my $difference (values %variation_features_self){ + my %vf = %$difference; + push @{$differences},bless \%vf,ref($difference); + } + + } + #need to map differences to the self + my $mapper = $self->mapper(); #now that we have the differences, map them in the StrainSlice +# print Dumper($mapper); + my @results; + foreach my $difference (@{$differences}){ + @results = $mapper->map_coordinates('Slice',$difference->start,$difference->end,$difference->strand,'Slice'); + #we can have 3 possibilities: + #the difference is an insertion and when mapping returns the boundaries of the insertion in the StrainSlice + if (@results == 2){ + #the first position in the result is the beginning of the insertion + if($results[0]->start < $results[1]->start){ + $difference->start($results[0]->end+1); + $difference->end($results[1]->start-1); + } + else{ + $difference->start($results[1]->end+1); + $difference->end($results[0]->start-1); + } + $difference->strand($results[0]->strand); + } + else{ + #it can be either a SNP or a deletion, and we have the coordinates in the result, etither a Bio::EnsEMBL::Mapper::Coordinate + # or a Bio::EnsEMBL::Mapper::IndelCoordinate +# print "Difference: ",$difference->start, "-", $difference->end,"strand ",$difference->strand,"\n"; + $difference->start($results[0]->start); + $difference->end($results[0]->end); + $difference->strand($results[0]->strand); + } + } + + return $differences; +} + +#for a given VariationFeatures, converts the allele into the reference allele and returns a new list with +#the converted VariationFeatures +sub _convert_difference{ + my $self = shift; + my $difference = shift; + my %new_vf = %$difference; #make a copy of the variationFeature + #and change the allele with the one from the reference Slice + $new_vf{'allele_string'} = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand); + return bless \%new_vf,ref($difference); +} + +=head2 sub_Slice + + Arg 1 : int $start + Arg 2 : int $end + Arge [3] : int $strand + Example : none + Description: Makes another StrainSlice that covers only part of this slice + with the appropriate differences to the reference Slice + If a slice is requested which lies outside of the boundaries + of this function will return undef. This means that + behaviour will be consistant whether or not the slice is + attached to the database (i.e. if there is attached sequence + to the slice). Alternatively the expand() method or the + SliceAdaptor::fetch_by_region method can be used instead. + Returntype : Bio::EnsEMBL::StrainSlice or undef if arguments are wrong + Exceptions : thrown when trying to get the subSlice in the middle of a + insertion + Caller : general + +=cut + +sub sub_Slice { + my ( $self, $start, $end, $strand ) = @_; + my $mapper = $self->mapper(); + #finally map from the Slice to the Strain + my @results = $mapper->map_coordinates('StrainSlice',$start,$end,$strand,'StrainSlice'); + my $new_start; + my $new_end; + my $new_strand; + my $new_seq; + + #Get need start and end for the subSlice of the StrainSlice + my @results_ordered = sort {$a->start <=> $b->start} @results; + $new_start = $results_ordered[0]->start(); + $new_strand = $results_ordered[0]->strand() if (ref($results_ordered[0]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); + $new_strand = $results_ordered[-1]->strand() if (ref($results_ordered[-1]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); + $new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice + + my $subSlice = $self->SUPER::sub_Slice($new_start,$new_end,$new_strand); + $subSlice->{'strain_name'} = $self->{'strain_name'}; + + my $new_variations; #reference to an array that will contain the variationFeatures in the new subSlice + #update the VariationFeatures in the sub_Slice of the Strain + my $vf_start; + my $vf_end; + my $offset = $subSlice->start - $self->start; + + foreach my $variationFeature (@{$self->{'alleleFeatures'}}){ + #calculate the new position of the variation_feature in the subSlice + $vf_start = $variationFeature->start - $offset; + $vf_end = $variationFeature->end - $offset; + if ($vf_start >= 1 and $vf_end <= $subSlice->length){ + #copy the variationFeature + my %new_vf; + %new_vf = %$variationFeature; + #and shift to the new coordinates + $new_vf{'start'} = $vf_start; + $new_vf{'end'} = $vf_end; + my $test = bless \%new_vf, ref($variationFeature); + push @{$new_variations}, $test; + } + } + $subSlice->{'alleleFeatures'} = $new_variations; + return $subSlice; + +} + +=head2 ref_subseq + + Arg [1] : int $startBasePair + relative to start of slice, which is 1. + Arg [2] : int $endBasePair + relative to start of slice. + Arg [3] : (optional) int $strand + The strand of the slice to obtain sequence from. Default + value is 1. + Description: returns string of dna from reference sequence + Returntype : txt + Exceptions : end should be at least as big as start + strand must be set + Caller : general + +=cut + +sub ref_subseq{ + my $self = shift; + my $start = shift; + my $end = shift; + my $strand = shift; + # special case for in-between (insert) coordinates + return '' if($start == $end + 1); + + my $subseq; + if($self->adaptor){ + my $seqAdaptor = $self->adaptor->db->get_SequenceAdaptor(); + $subseq = ${$seqAdaptor->fetch_by_Slice_start_end_strand + ( $self, $start, + $end, $strand )}; + } else { + ## check for gap at the beginning and pad it with Ns + if ($start < 1) { + $subseq = "N" x (1 - $start); + $start = 1; + } + $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); + ## check for gap at the end and pad it with Ns + if ($end > $self->length()) { + $subseq .= "N" x ($end - $self->length()); + } + reverse_comp(\$subseq) if($strand == -1); + } + return $subseq; +} + +=head2 subseq + + Arg [1] : int $startBasePair + relative to start of slice, which is 1. + Arg [2] : int $endBasePair + relative to start of slice. + Arg [3] : (optional) int $strand + The strand of the slice to obtain sequence from. Default + value is 1. + Description: returns string of dna sequence + Returntype : txt + Exceptions : end should be at least as big as start + strand must be set + Caller : general + +=cut + +sub subseq { + my ( $self, $start, $end, $strand ) = @_; + + if ( $end+1 < $start ) { + throw("End coord + 1 is less than start coord"); + } + + # handle 'between' case for insertions + return '' if( $start == $end + 1); + + $strand = 1 unless(defined $strand); + + if ( $strand != -1 && $strand != 1 ) { + throw("Invalid strand [$strand] in call to Slice::subseq."); + } + + my $subseq; + my $seq; + if($self->adaptor){ + + + $seq = $self->seq; + reverse_comp(\$seq) if ($strand == -1); + $subseq = substr($seq,$start-1,$end - $start + 1); + } + else { + ## check for gap at the beginning and pad it with Ns + if ($start < 1) { + $subseq = "N" x (1 - $start); + $start = 1; + } + $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); + ## check for gap at the end and pad it with Ns + if ($end > $self->length()) { + $subseq .= "N" x ($end - $self->length()); + } + reverse_comp(\$subseq) if($strand == -1); + } + return $subseq; + +} + + +sub mapper{ + my $self = shift; + + if (@_) { + delete $self->{'mapper'}; + } + if(!defined $self->{'mapper'}){ + #create the mapper between the Slice and StrainSlice + my $mapper = Bio::EnsEMBL::Mapper->new('Slice','StrainSlice'); + #align with Slice + #get all the VariationFeatures in the strain Slice, from start to end in the Slice + my @variation_features_ordered = sort {$a->start() <=> $b->start()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + + my $start_slice = 1; + my $end_slice; + my $start_strain = 1; + my $end_strain; + my $length_allele; + foreach my $variation_feature (@variation_features_ordered){ + #we have a insertion/deletion: marks the beginning of new slice move coordinates + if ($variation_feature->length_diff != 0){ + $length_allele = $variation_feature->length + $variation_feature->length_diff(); + $end_slice = $variation_feature->start() - 1; + + if ($end_slice >= $start_slice){ + $end_strain = $end_slice - $start_slice + $start_strain; + #add the sequence that maps + $mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'StrainSlice',$start_strain,$end_strain); + #add the indel + $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $variation_feature->length,1,'StrainSlice',$end_strain+1,$end_strain + $length_allele); + $start_strain = $end_strain + $length_allele + 1; + } + else{ + #add the indel + $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $variation_feature->length,1,'StrainSlice',$end_strain+1,$end_strain + $length_allele); + $start_strain += $length_allele; + } + $start_slice = $end_slice + $variation_feature->length+ 1; + } + } + if ($start_slice <= $self->length){ + $mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'StrainSlice',$start_strain,$start_strain + $self->length - $start_slice); + } + $self->{'mapper'} = $mapper; + } + return $self->{'mapper'}; +} + +=head2 get_all_differences_Slice + + Description : DEPRECATED use get_all_AlleleFeatures instead + +=cut + +sub get_all_differences_Slice{ + my $self = shift; + + deprecate('Use get_all_differences_Slice instead'); + return $self->get_all_AlleleFeatures_Slice(@_); +} + +=head2 get_all_VariationFeatures + + Arg[1] : int $with_coverage (optional) + Description :returns all alleleFeatures features on this slice. + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : contigview, snpview + +=cut + +sub get_all_VariationFeatures { + my $self = shift; + my $with_coverage = shift; + $with_coverage ||= 0; + return $self->get_all_AlleleFeatures_Slice($with_coverage); +} + +=head2 get_original_seq_region_position + + Arg [1] : int $position + relative to start of slice, which is 1. + Description: Placeholder method - this method has no explicit use beyond + providiing compatibility with AlignSlice. To map positions + between the StrainSlice and the reference slice, use the + mapper and its methods. + Returntype : ($strainSlice, $seq_region_position), an array where the first + element is a Bio::EnsEMBL::StrainSlice and the second one is the + requested seq_region_position. + Exceptions : none + Caller : general + +=cut + +sub get_original_seq_region_position { + my $self = shift; + my $position = shift; + #coordinates in the AlignSlice and Slice are the same, so far will return the same Slice + #and coordinate + return ($self,$position); +} + + +=head2 remove_indels + + Args : none + Example : $strainSlice->remove_indels(); + Description : Removes insertions and deletions from the allele features + of this object + ReturnType : none + Exceptions : none + Caller : webteam + +=cut + +sub remove_indels { + my $self = shift; + + my @new_afs = grep { $_->variation->var_class ne 'in-del' } @{$self->{'alleleFeatures'}}; + + $self->{'alleleFeatures'} = \@new_afs; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/TopLevelAssemblyMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/TopLevelAssemblyMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,436 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::TopLevelAssemblyMapper - +Handles mapping between a given coordinate system and the toplevel +pseudo coordinate system. + +=head1 SYNOPSIS + + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + $asma = $db->get_AssemblyMapperAdaptor(); + $csa = $db->get_CoordSystemAdaptor(); + + my $toplevel = $cs_adaptor->fetch_by_name('toplevel'); + my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); + + $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); + + # map to toplevel coord system for this region + @chr_coords = + $asm_mapper->map( 'AL30421.1.200.92341', 100, 10000, -1, $ctg_cs ); + + # list toplevel seq_region_ids for this region + @chr_ids = + $asm_mapper->list_ids( 'AL30421.1.200.92341', 1, 1000, -1, + $ctg_cs ); + +=head1 DESCRIPTION + +The TopLevelAssemblyMapper performs mapping between a provided +coordinate system and the toplevel pseudo cooordinate system. The +toplevel coordinate system is not a real coordinate system, but +represents the highest coordinate system that can be mapped to in a +given region. It is only possible to perform unidirectional mapping +using this mapper, because it does not make sense to map from the +toplevel coordinate system to another coordinate system. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::TopLevelAssemblyMapper; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::CoordSystem; +use Scalar::Util qw(weaken); + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for + the database this mapper is using. + Arg [2] : Toplevel CoordSystem + Arg [3] : Other CoordSystem + Description: Creates a new TopLevelAssemblyMapper object + Returntype : Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper + Exceptions : throws if any of the 3 arguments are missing/ not + : of the correct type. + Caller : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Status : Stable + +=cut + + +sub new { + my ($caller, $adaptor, $toplevel_cs, $other_cs) = @_; + + my $class = ref($caller) || $caller; + + if(!ref($toplevel_cs)) { + throw('Toplevel CoordSystem argument expected.'); + } + if(!ref($other_cs)) { + throw('Other CoordSystem argument expected.'); + } + + if(!$toplevel_cs->is_top_level()) { + throw($toplevel_cs->name() . " is not the toplevel CoordSystem."); + } + if($other_cs->is_top_level()) { + throw("Other CoordSystem argument should not be toplevel CoordSystem."); + } + + my $cs_adaptor = $adaptor->db()->get_CoordSystemAdaptor(); + my $coord_systems = $cs_adaptor->fetch_all(); + + my $self = bless {'coord_systems' => $coord_systems, + 'toplevel_cs' => $toplevel_cs, + 'other_cs' => $other_cs}, $class; + + $self->adaptor($adaptor); + return $self; +} + +sub adaptor { + my $self = shift; + + weaken($self->{'adaptor'} = shift) if(@_); + + return $self->{'adaptor'}; +} + +=head2 map + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM + Arg [2] : int $frm_start + The start of the region to transform FROM + Arg [3] : int $frm_end + The end of the region to transform FROM + Arg [4] : int $strand + The strand of the region to transform FROM + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM + Arg [6] : if set will do a fastmap + Example : @coords = $mapper->map('X', 1_000_000, 2_000_000, + 1, $chr_cs); + Description: Transforms coordinates from one coordinate system + to another. + Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects + Exceptions : thrown if if the specified TO coordinate system is not one + of the coordinate systems associated with this mapper + Caller : general + Status : Stable + +=cut + + +sub map { + throw('Incorrect number of arguments.') if(@_ != 6 && @_ != 7); + + my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs, + $fastmap) = @_; + + if($frm_cs->is_top_level()) { + throw("The toplevel CoordSystem can only be mapped TO, not FROM."); + } + + my @tmp; + push @tmp, $frm_seq_region_name; + my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; + + my $mapper = $self->{'mapper'}; + my $toplevel_cs = $self->{'toplevel_cs'}; + my $other_cs = $self->{'other_cs'}; + my $adaptor = $self->adaptor; + + if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the assembled nor the component coordinate system " . + " of this AssemblyMapper"); + } + + my $coord_systems = $self->{'coord_systems'}; + + my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); + + # + # TBD try to make this more efficient + # + my $from_rank = $other_cs->rank(); + foreach my $cs (@$coord_systems) { + last if($cs->rank >= $from_rank); + + #check if a mapping path even exists to this coordinate system + my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; + + if(@mapping_path) { + + # Try to map to this coord system. If we get back any coordinates then + # it is our 'toplevel' that we were looking for + my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); + + if($fastmap) { + my @result = $mapper->fastmap($frm_seq_region_name, $frm_start, $frm_end, + $frm_strand, $frm_cs); + return @result if(@result); + } else { + my @coords = $mapper->map($frm_seq_region_name, $frm_start, $frm_end, + $frm_strand, $frm_cs); + + if(@coords > 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + return @coords; + } + } + } + } + + # the toplevel coordinate system for the region requested *is* the + # requested region. + if($fastmap) { + return ($seq_region_id,$frm_start, $frm_end, $frm_strand, $other_cs); + } + return Bio::EnsEMBL::Mapper::Coordinate->new + ($seq_region_id,$frm_start,$frm_end, $frm_strand, $other_cs); +} + +# +# for polymorphism with AssemblyMapper +# +=head2 flush + + Args : none + Example : none + Description: polymorphism with AssemblyMapper, does nothing + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub flush {} + +=head2 fastmap + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM + Arg [2] : int $frm_start + The start of the region to transform FROM + Arg [3] : int $frm_end + The end of the region to transform FROM + Arg [4] : int $strand + The strand of the region to transform FROM + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM + Example : @coords = $mapper->fastmap('X', 1_000_000, 2_000_000, + 1, $chr_cs); + Description: Transforms coordinates from one coordinate system + to another. + Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects + Exceptions : thrown if if the specified TO coordinate system is not one + of the coordinate systems associated with this mapper + Caller : general + Status : Stable + +=cut + +sub fastmap { + my $self = shift; + return $self->map(@_,1); +} + +=head2 assembled_CoordSystem + + Arg [1] : none + Example : $cs = $mapper->assembled_CoordSystem + Description: Retrieves the assembled CoordSystem from this mapper + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub assembled_CoordSystem { + my $self = shift; + return $self->{'toplevel_cs'}; +} + +=head2 component_CoordSystem + + Arg [1] : none + Example : $cs = $mapper->component_CoordSystem + Description: Retrieves the component CoordSystem from this mapper + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub component_CoordSystem { + my $self = shift; + return $self->{'other_cs'}; +} + + +sub _list { + my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs, $seq_regions) = @_; + + my $mapper = $self->{'mapper'}; + my $toplevel_cs = $self->{'toplevel_cs'}; + my $other_cs = $self->{'other_cs'}; + my $adaptor = $self->adaptor; + + if($frm_cs->is_top_level()) { + throw("The toplevel CoordSystem can only be mapped TO, not FROM."); + } + if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the assembled nor the component coordinate system " . + " of this AssemblyMapper"); + } + + my $coord_systems = $self->{'coord_systems'}; + my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); + + # + # TBD try to make this more efficient + # + my $from_rank = $other_cs->rank(); + foreach my $cs (@$coord_systems) { + last if($cs->rank >= $from_rank); + + #check if a mapping path even exists to this coordinate system + my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; + + if(@mapping_path) { + + # Try to map to this coord system. If we get back any coordinates then + # it is our 'toplevel' that we were looking for + my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); + + my @result; + + my @tmp; + push @tmp, $frm_seq_region_name; + my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; + + if($seq_regions) { + @result = $mapper->list_seq_regions($frm_seq_region_name, $frm_start, + $frm_end, $frm_cs); + } else { + @result = $mapper->list_ids($frm_seq_region_name, $frm_start, + $frm_end, $frm_cs); + } + + return @result if(@result); + } + } + + # the toplevel coordinate system for the region requested *is* the + return ($frm_seq_region_name); + + + # requested region. + if($seq_regions) { + return ($frm_seq_region_name); + } + + #this seems a bit silly and inefficient, but it is probably never + #called anyway. + my $slice_adaptor = $adaptor->db()->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region($other_cs->name(), + $frm_seq_region_name, + undef,undef,undef,$other_cs); + return ($slice_adaptor->get_seq_region_id($slice)); +} + + + +=head2 list_seq_regions + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} + Description: Retrieves a list of overlapping seq_region names + of another coordinate system. This is the same as the + list_ids method but uses seq_region names rather internal ids + Returntype : List of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_seq_regions { + throw('Incorrect number of arguments.') if(@_ != 5); + return _list(@_,1); +} + + +=head2 list_ids + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest. + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} + Description: Retrieves a list of overlapping seq_region internal identifiers + of another coordinate system. This is the same as the + list_seq_regions method but uses internal identfiers rather + than seq_region strings + Returntype : List of ints + Exceptions : thrown if the from CoordSystem is the toplevel coord system + thrown if the from CoordSystem is not the one used in the mapper + Caller : general + Status : Stable + +=cut + +sub list_ids { + throw('Incorrect number of arguments.') if(@_ != 5); + return _list(@_,0); +} + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2993 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Transcript - object representing an Ensembl transcript + +=head1 SYNOPSIS + +Creation: + + my $tran = new Bio::EnsEMBL::Transcript(); + my $tran = new Bio::EnsEMBL::Transcript( -EXONS => \@exons ); + +Manipulation: + + # Returns an array of Exon objects + my @exons = @{ $tran->get_all_Exons() }; + + # Returns the peptide translation of the exons as a Bio::Seq + if ( $tran->translation() ) { + my $pep = $tran->translate(); + } else { + print "Transcript ", $tran->stable_id(), " is non-coding\n"; + } + +=head1 DESCRIPTION + +A representation of a transcript within the Ensembl system. A transcript +consists of a set of Exons and (possibly) a Translation which defines the +coding and non-coding regions of the exons. + +=cut + +package Bio::EnsEMBL::Transcript; + +use strict; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Intron; +use Bio::EnsEMBL::TranscriptMapper; +use Bio::EnsEMBL::Utils::TranscriptSNPs; +use Bio::EnsEMBL::SeqEdit; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( deprecate warning throw ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [-EXONS] : + reference to list of Bio::EnsEMBL::Exon objects - exons which make up + this transcript + Arg [-STABLE_ID] : + string - the stable identifier of this transcript + Arg [-VERSION] : + int - the version of the stable identifier of this transcript + Arg [-EXTERNAL_NAME] : + string - the external database name associated with this transcript + Arg [-EXTERNAL_DB] : + string - the name of the database the external name is from + Arg [-EXTERNAL_STATUS]: + string - the status of the external identifier + Arg [-DISPLAY_XREF]: + Bio::EnsEMBL::DBEntry - The external database entry that is used + to label this transcript when it is displayed. + Arg [-CREATED_DATE]: + string - the date the transcript was created + Arg [-MODIFIED_DATE]: + string - the date the transcript was last modified + Arg [-DESCRIPTION]: + string - the transcipts description + Arg [-BIOTYPE]: + string - the biotype e.g. "protein_coding" + Arg [-STATUS]: + string - the transcripts status i.e. "KNOWN","NOVEL" + Arg [-IS_CURRENT]: + Boolean - specifies if this is the current version of the transcript + Example : $tran = new Bio::EnsEMBL::Transcript(-EXONS => \@exons); + Description: Constructor. Instantiates a Transcript object. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : throw on bad arguments + Caller : general + Status : Stable + +=cut + +sub new { + my $proto = shift; + + my $class = ref($proto) || $proto; + + my $self = $class->SUPER::new(@_); + + my ( + $exons, $stable_id, $version, + $external_name, $external_db, $external_status, + $display_xref, $created_date, $modified_date, + $description, $biotype, $confidence, + $external_db_name, $status, $is_current + ); + + # Catch for old style constructor calling: + if ( ( @_ > 0 ) && ref( $_[0] ) ) { + $exons = [@_]; + deprecate( "Transcript constructor should use named arguments.\n" + . "Use Bio::EnsEMBL::Transcript->new(-EXONS => \@exons);\n" + . "instead of Bio::EnsEMBL::Transcript->new(\@exons);" ); + } else { + ( + $exons, $stable_id, $version, + $external_name, $external_db, $external_status, + $display_xref, $created_date, $modified_date, + $description, $biotype, $confidence, + $external_db_name, $status, $is_current + ) + = rearrange( [ + 'EXONS', 'STABLE_ID', + 'VERSION', 'EXTERNAL_NAME', + 'EXTERNAL_DB', 'EXTERNAL_STATUS', + 'DISPLAY_XREF', 'CREATED_DATE', + 'MODIFIED_DATE', 'DESCRIPTION', + 'BIOTYPE', 'CONFIDENCE', + 'EXTERNAL_DB_NAME', 'STATUS', + 'IS_CURRENT' + ], + @_ + ); + } + + if ($exons) { + $self->{'_trans_exon_array'} = $exons; + $self->recalculate_coordinates(); + } + + $self->stable_id($stable_id); + $self->version($version); + $self->{'created_date'} = $created_date; + $self->{'modified_date'} = $modified_date; + $self->external_name($external_name) if ( defined $external_name ); + $self->external_db($external_db) if ( defined $external_db ); + $self->external_status($external_status) + if ( defined $external_status ); + $self->display_xref($display_xref) if ( defined $display_xref ); + $self->edits_enabled(1); + + $self->description($description); + $self->status($confidence); # old style name + $self->status($status); # new style name + $self->biotype($biotype); + + # default is_current + $is_current = 1 unless ( defined($is_current) ); + $self->{'is_current'} = $is_current; + + return $self; +} ## end sub new + +=head2 get_all_DBLinks + + Arg [1] : String database name (optional) + SQL wildcard characters (_ and %) can be used to + specify patterns. + + Example : my @dblinks = @{ $transcript->get_all_DBLinks() }; + my @dblinks = @{ $transcript->get_all_DBLinks('Uniprot%') }; + + Description: Retrieves *all* related DBEntries for this + transcript. This includes all DBEntries that are + associated with the corresponding translation. + + If you only want to retrieve the DBEntries associated + with the transcript (and not the translation) then + you should use the get_all_DBEntries() call instead. + + Note: Each entry may be listed more than once. No + uniqueness checks are done. Also if you put in an + incorrect external database name no checks are done + to see if this exists, you will just get an empty + list. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_DBLinks { + my ( $self, $db_name_exp, $ex_db_type ) = @_; + + my @links = + @{ $self->get_all_DBEntries( $db_name_exp, $ex_db_type ) }; + + # Add all of the transcript and translation xrefs to the return list. + my $translation = $self->translation(); + if ( defined($translation) ) { + push( @links, + @{$translation->get_all_DBEntries( $db_name_exp, $ex_db_type ) } + ); + } + + @links = sort { _compare_xrefs() } @links; + + return \@links; +} + +=head2 get_all_xrefs + + Arg [1] : String database name (optional) + SQL wildcard characters (_ and %) can be used to + specify patterns. + + Example : @xrefs = @{ $transcript->get_all_xrefs() }; + @xrefs = @{ $transcript->get_all_xrefs('Uniprot%') }; + + Description: Retrieves *all* related xrefs for this transcript. + This includes all xrefs that are associated with the + corresponding translation of this transcript. + + If you want to retrieve the xrefs associated with + only the transcript (and not the translation) then + you should use the get_all_object_xrefs() method + instead. + + Note: Each entry may be listed more than once. No + uniqueness checks are done. Also if you put in an + incorrect external database name no checks are done + to see if this exists, you will just get an empty + list. + + NB: This method is an alias for the + get_all_DBLinks() method. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + + Status : Stable + +=cut + +sub get_all_xrefs { + my $self = shift; + return $self->get_all_DBLinks(@_); +} + +=head2 get_all_DBEntries + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external database type + + Example : my @dbentries = @{ $transcript->get_all_DBEntries() }; + + Description: Retrieves DBEntries (xrefs) for this transcript. + This does *not* include the corresponding + translations DBEntries (see get_all_DBLinks()). + + This method will attempt to lazy-load DBEntries + from a database if an adaptor is available and no + DBEntries are present on the transcript (i.e. they + have not already been added or loaded). + + Returntype : Listref of Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : get_all_DBLinks, TranscriptAdaptor::store + Status : Stable + +=cut + +sub get_all_DBEntries { + my ( $self, $ex_db_exp, $ex_db_type ) = @_; + + my $cache_name = 'dbentries'; + + if ( defined($ex_db_exp) ) { + $cache_name .= $ex_db_exp; + } + + if ( defined($ex_db_type) ) { + $cache_name .= $ex_db_type; + } + + # if not cached, retrieve all of the xrefs for this transcript + if ( !defined( $self->{$cache_name} ) && defined( $self->adaptor() ) ) + { + $self->{$cache_name} = + $self->adaptor()->db()->get_DBEntryAdaptor() + ->fetch_all_by_Transcript( $self, $ex_db_exp, $ex_db_type ); + } + + $self->{$cache_name} ||= []; + + return $self->{$cache_name}; +} ## end sub get_all_DBEntries + +=head2 get_all_object_xrefs + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @oxrefs = @{ $transcript->get_all_object_xrefs() }; + + Description: Retrieves xrefs for this transcript. This does + *not* include xrefs that are associated with the + corresponding translations of this transcript (see + get_all_xrefs()). + + This method will attempt to lazy-load xrefs from a + database if an adaptor is available and no xrefs are + present on the transcript (i.e. they have not already + been added or loaded). + + NB: This method is an alias for the + get_all_DBentries() method. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + + Status : Stable + +=cut + +sub get_all_object_xrefs { + my $self = shift; + return $self->get_all_DBEntries(@_); +} + +=head2 add_DBEntry + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + The dbEntry to be added + Example : my $dbe = Bio::EnsEMBL::DBEntery->new(...); + $transcript->add_DBEntry($dbe); + Description: Associates a DBEntry with this transcript. Note that adding + DBEntries will prevent future lazy-loading of DBEntries for this + gene (see get_all_DBEntries). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_DBEntry { + my $self = shift; + my $dbe = shift; + + unless($dbe && ref($dbe) && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw('Expected DBEntry argument'); + } + + $self->{'dbentries'} ||= []; + push @{$self->{'dbentries'}}, $dbe; +} + + +=head2 get_all_supporting_features + + Example : my @evidence = @{ $transcript->get_all_supporting_features }; + Description: Retreives any supporting features added manually by + calls to add_supporting_features. + Returntype : Listref of Bio::EnsEMBL::FeaturePair objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_supporting_features { + my $self = shift; + + if( !exists $self->{_supporting_evidence} ) { + if($self->adaptor) { + my $tsfa = $self->adaptor->db->get_TranscriptSupportingFeatureAdaptor(); + $self->{_supporting_evidence} = $tsfa->fetch_all_by_Transcript($self); + } + } + + return $self->{_supporting_evidence} || []; +} + + +=head2 add_supporting_features + + Arg [1-N] : Bio::EnsEMBL::FeaturePair $feature + The supporting features to add + Example : $transcript->add_supporting_features(@features); + Description: Adds a list of supporting features to this Transcript. + The added features can be retieved by + get_all_supporting_features(). + Returntype : none + Exceptions : throw if any of the features are not FeaturePairs + throw if any of the features are not in the same coordinate + system as the Transcript + Caller : general + Status : Stable + +=cut + +sub add_supporting_features { + my ($self, @features) = @_; + + return unless @features; + + $self->{_supporting_evidence} ||= []; + + # check whether this feature object has been added already + FEATURE: foreach my $feature (@features) { + + if (!defined($feature) || ref($feature) eq "ARRAY") { + throw("Element in transcript supporting features array is undefined or is an ARRAY for " . $self->dbID); + } + if (!$feature || !$feature->isa("Bio::EnsEMBL::FeaturePair")) { + print "feature = " . $feature . "\n"; + throw("Supporting feat [$feature] not a " . + "Bio::EnsEMBL::FeaturePair"); + } + + if ((defined $self->slice() && defined $feature->slice())&& + ( $self->slice()->name() ne $feature->slice()->name())){ + throw("Supporting feat not in same coord system as exon\n" . + "exon is attached to [".$self->slice()->name()."]\n" . + "feat is attached to [".$feature->slice()->name()."]"); + } + + foreach my $added_feature ( @{ $self->{_supporting_evidence} } ){ + # compare objects + if ( $feature == $added_feature ){ + #this feature has already been added + next FEATURE; + } + } + + #no duplicate was found, add the feature + push(@{$self->{_supporting_evidence}}, $feature); + } +} + + +=head2 flush_supporting_features + + Example : $transcript->flush_supporting_features; + Description : Removes all supporting evidence from the transcript. + Return type : (Empty) listref + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flush_supporting_features { + my $self = shift; + $self->{'_supporting_evidence'} = []; +} + + +=head2 external_db + + Arg [1] : (optional) String - name of external db to set + Example : $transcript->external_db('HGNC'); + Description: Getter/setter for attribute external_db. The db is the one that + belongs to the external_name. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub external_db { + my ( $self, $ext_dbname ) = @_; + + if(defined $ext_dbname) { + return ( $self->{'external_db'} = $ext_dbname ); + } + + if( exists $self->{'external_db'} ) { + return $self->{'external_db'}; + } + + my $display_xref = $self->display_xref(); + + if( defined $display_xref ) { + return $display_xref->dbname() + } else { + return undef; + } +} + + +=head2 external_status + + Arg [1] : (optional) String - status of the external db + Example : $transcript->external_status('KNOWNXREF'); + Description: Getter/setter for attribute external_status. The status of + the external db of the one that belongs to the external_name. + Returntype : String + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub external_status { + my ( $self, $ext_status ) = @_; + + if(defined $ext_status) { + return ( $self->{'external_status'} = $ext_status ); + } + + if( exists $self->{'external_status'} ) { + return $self->{'external_status'}; + } + + my $display_xref = $self->display_xref(); + + if( defined $display_xref ) { + return $display_xref->status() + } else { + return undef; + } +} + + +=head2 external_name + + Arg [1] : (optional) String - the external name to set + Example : $transcript->external_name('BRCA2-001'); + Description: Getter/setter for attribute external_name. + Returntype : String or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub external_name { + my ($self, $ext_name) = @_; + + if(defined $ext_name) { + return ( $self->{'external_name'} = $ext_name ); + } + + if( exists $self->{'external_name'} ) { + return $self->{'external_name'}; + } + + my $display_xref = $self->display_xref(); + + if( defined $display_xref ) { + return $display_xref->display_id() + } else { + return undef; + } +} + + +=head2 is_known + + Example : print "Transcript ".$transcript->stable_id." is KNOWN\n" if + $transcript->is_known; + Description: Returns TRUE if this gene has a status of 'KNOWN' + Returntype : TRUE if known, FALSE otherwise + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_known { + my $self = shift; + return ( $self->{'status'} eq "KNOWN" || $self->{'status'} eq "KNOWN_BY_PROJECTION" ); +} + + +=head2 status + + Arg [1] : string $status + Example : none + Description: get/set for attribute status + Returntype : string + Exceptions : none + Caller : general + Status : Medium Risk + +=cut + +sub status { + my $self = shift; + $self->{'status'} = shift if( @_ ); + return $self->{'status'}; +} + +=head2 biotype + + Arg [1] : string $biotype + Example : none + Description: get/set for attribute biotype + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub biotype { + my $self = shift; + $self->{'biotype'} = shift if( @_ ); + return ( $self->{'biotype'} || "protein_coding" ); +} + + +=head2 display_xref + + Arg [1] : (optional) Bio::EnsEMBL::DBEntry - the display xref to set + Example : $transcript->display_xref($db_entry); + Description: Getter/setter for display_xref for this transcript. + Returntype : Bio::EnsEMBL::DBEntry + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub display_xref { + my $self = shift; + $self->{'display_xref'} = shift if(@_); + return $self->{'display_xref'}; +} + +=head2 is_canonical + + Args [1] : (optional) Boolean is_canonical + + Example : if ($transcript->is_canonical()) { ... } + + Description : Returns true (non-zero) if the transcript is the + canonical transcript of its gene, false (0) if not. If the code + returns an undefined it is because its state is not currently + known. Internally the code will consult the database for this + value if it is unknown and the transcript has a dbID and an + attached adaptor + + Return type : Boolean + + Status : Stable + +=cut + +sub is_canonical { + my ( $self, $value ) = @_; + + #Shortcut call + return $self->{is_canonical} if defined $self->{is_canonical}; + + if ( defined($value) ) { + $self->{is_canonical} = ( $value ? 1 : 0 ); + } + else { + if(! defined $self->{is_canonical} && $self->dbID() && $self->adaptor()) { + $self->{is_canonical} = $self->adaptor()->is_Transcript_canonical($self); + } + } + + return $self->{is_canonical}; +} + +=head2 translation + + Args : None + Example : if ( $transcript->translation() ) { + print( $transcript->translation()->stable_id(), "\n" ); + } else { + print("Pseudogene\n"); + } + Description: Getter/setter for the Translation object which + defines the CDS (and as a result the peptide encoded + by) this transcript. This function will return + undef if this transcript is a pseudogene, i.e. a + non-translating transcript such as an ncRNA. This + is the accepted method of determining whether a + transcript is a pseudogene or not. + Returntype : Bio::EnsEMBL::Translation + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub translation { + my ( $self, $translation ) = @_; + + if ( defined($translation) ) { + assert_ref( $translation, 'Bio::EnsEMBL::Translation' ); + + $self->{'translation'} = $translation; + $translation->transcript($self); + + $self->{'cdna_coding_start'} = undef; + $self->{'cdna_coding_end'} = undef; + + $self->{'coding_region_start'} = undef; + $self->{'coding_region_end'} = undef; + + $self->{'transcript_mapper'} = undef; + + } elsif ( @_ > 1 ) { + if ( defined( $self->{'translation'} ) ) { + # Removing existing translation + + $self->{'translation'}->transcript(undef); + delete( $self->{'translation'} ); + + $self->{'cdna_coding_start'} = undef; + $self->{'cdna_coding_end'} = undef; + + $self->{'coding_region_start'} = undef; + $self->{'coding_region_end'} = undef; + + $self->{'transcript_mapper'} = undef; + } + } elsif ( !exists( $self->{'translation'} ) + && defined( $self->adaptor() ) ) + { + $self->{'translation'} = + $self->adaptor()->db()->get_TranslationAdaptor() + ->fetch_by_Transcript($self); + } + + return $self->{'translation'}; +} ## end sub translation + +=head2 get_all_alternative_translations + + Args : None + Example : + + my @alt_translations = + @{ $transcript->get_all_alternative_translations() }; + + Description: Fetches all alternative translations defined for this + transcript. The canonical translation is not returned. + + Returntype : Arrayref to Bio::EnsEMBL::Translation + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub get_all_alternative_translations { + my ($self) = @_; + + if ( !exists( $self->{'alternative_translations'} ) + && defined( $self->adaptor() ) ) + { + my $pa = $self->adaptor()->db()->get_TranslationAdaptor(); + my @translations = + @{ $pa->fetch_all_alternative_by_Transcript($self) }; + + $self->{'alternative_translations'} = \@translations; + } + + return $self->{'alternative_translations'}; +} + +=head2 add_alternative_translation + + Args : Bio::EnsEMBL::Translation $translation + Example : + + $transcript->add_alternative_translation($translation); + + Description: Adds an alternative translation to this transcript. + Returntype : None + Exceptions : None + Caller : General + Status : Stable + +=cut + +sub add_alternative_translation { + my ( $self, $translation ) = @_; + + if ( !( defined($translation) + && ref($translation) + && $translation->isa('Bio::EnsEMBL::Translation') ) ) + { + throw("Bio::EnsEMBL::Translation argument expected."); + } + + # Load the existsing alternative translations from the database if + # they haven't already been loaded. + $self->get_all_alternative_translations(); + + push( @{ $self->{'alternative_translations'} }, $translation ); +} + +=head2 spliced_seq + + Args : none + Example : none + Description: Retrieves all Exon sequences and concats them together. + No phase padding magic is done, even if phases do not align. + Returntype : Text + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub spliced_seq { + my ( $self ) = @_; + + my $seq_string = ""; + for my $ex ( @{$self->get_all_Exons()} ) { + my $seq = $ex->seq(); + + if(!$seq) { + warning("Could not obtain seq for exon. Transcript sequence may not " . + "be correct."); + $seq_string .= 'N' x $ex->length(); + } else { + $seq_string .= $seq->seq(); + } + } + + # apply post transcriptional edits + if($self->edits_enabled()) { + my @seqeds = @{$self->get_all_SeqEdits()}; + + # sort edits in reverse order to remove complication of + # adjusting downstream edits + @seqeds = sort {$b->start() <=> $a->start()} @seqeds; + + foreach my $se (@seqeds) { + $se->apply_edit(\$seq_string); + } + } + + return $seq_string; +} + + +=head2 translateable_seq + + Args : none + Example : print $transcript->translateable_seq(), "\n"; + Description: Returns a sequence string which is the the translateable part + of the transcripts sequence. This is formed by splicing all + Exon sequences together and apply all defined RNA edits. + Then the coding part of the sequence is extracted and returned. + The code will not support monkey exons any more. If you want to + have non phase matching exons, defined appropriate _rna_edit + attributes! + + An empty string is returned if this transcript is a pseudogene + (i.e. is non-translateable). + Returntype : Text + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub translateable_seq { + my ( $self ) = @_; + + if ( !$self->translation() ) { + return ''; + } + + my $mrna = $self->spliced_seq(); + + my $start = $self->cdna_coding_start(); + my $end = $self->cdna_coding_end(); + + $mrna = substr( $mrna, $start - 1, $end - $start + 1 ); + + my $start_phase = $self->translation->start_Exon->phase(); + if( $start_phase > 0 ) { + $mrna = "N"x$start_phase . $mrna; + } + if( ! $start || ! $end ) { + return ""; + } + + return $mrna; +} + + +=head2 cdna_coding_start + + Arg [1] : (optional) $value + Example : $relative_coding_start = $transcript->cdna_coding_start; + Description: Retrieves the position of the coding start of this transcript + in cdna coordinates (relative to the start of the 5prime end of + the transcript, excluding introns, including utrs). + + This will return undef if this is a pseudogene (i.e. a + transcript with no translation). + Returntype : int + Exceptions : none + Caller : five_prime_utr, get_all_snps, general + Status : Stable + +=cut + +sub cdna_coding_start { + my $self = shift; + + if( @_ ) { + $self->{'cdna_coding_start'} = shift; + } + + if(!defined $self->{'cdna_coding_start'} && defined $self->translation){ + # calc coding start relative from the start of translation (in cdna coords) + my $start = 0; + + my @exons = @{$self->get_all_Exons}; + my $exon; + + while($exon = shift @exons) { + if($exon == $self->translation->start_Exon) { + #add the utr portion of the start exon + $start += $self->translation->start; + last; + } else { + #add the entire length of this non-coding exon + $start += $exon->length; + } + } + + # adjust cdna coords if sequence edits are enabled + if($self->edits_enabled()) { + my @seqeds = @{$self->get_all_SeqEdits()}; + # sort in reverse order to avoid adjustment of downstream edits + @seqeds = sort {$b->start() <=> $a->start()} @seqeds; + + foreach my $se (@seqeds) { + # use less than start so that start of CDS can be extended + if($se->start() < $start) { + $start += $se->length_diff(); + } + } + } + + $self->{'cdna_coding_start'} = $start; + } + + return $self->{'cdna_coding_start'}; +} + + +=head2 cdna_coding_end + + Arg [1] : (optional) $value + Example : $cdna_coding_end = $transcript->cdna_coding_end; + Description: Retrieves the end of the coding region of this transcript in + cdna coordinates (relative to the five prime end of the + transcript, excluding introns, including utrs). + + This will return undef if this transcript is a pseudogene + (i.e. a transcript with no translation and therefor no CDS). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub cdna_coding_end { + my $self = shift; + + if( @_ ) { + $self->{'cdna_coding_end'} = shift; + } + + if(!defined $self->{'cdna_coding_end'} && defined $self->translation) { + my @exons = @{$self->get_all_Exons}; + + my $end = 0; + while(my $exon = shift @exons) { + if($exon == $self->translation->end_Exon) { + # add coding portion of the final coding exon + $end += $self->translation->end; + last; + } else { + # add entire exon + $end += $exon->length; + } + } + + # adjust cdna coords if sequence edits are enabled + if($self->edits_enabled()) { + my @seqeds = @{$self->get_all_SeqEdits()}; + # sort in reverse order to avoid adjustment of downstream edits + @seqeds = sort {$b->start() <=> $a->start()} @seqeds; + + foreach my $se (@seqeds) { + # use less than or equal to end+1 so end of the CDS can be extended + if($se->start() <= $end + 1) { + $end += $se->length_diff(); + } + } + } + + $self->{'cdna_coding_end'} = $end; + } + + return $self->{'cdna_coding_end'}; +} + + +=head2 coding_region_start + + Arg [1] : (optional) $value + Example : $coding_region_start = $transcript->coding_region_start + Description: Retrieves the start of the coding region of this transcript + in genomic coordinates (i.e. in either slice or contig coords). + By convention, the coding_region_start is always lower than + the value returned by the coding_end method. + The value returned by this function is NOT the biological + coding start since on the reverse strand the biological coding + start would be the higher genomic value. + + This function will return undef if this is a pseudogene + (a non-translated transcript). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coding_region_start { + my ($self, $value) = @_; + + if( defined $value ) { + $self->{'coding_region_start'} = $value; + } elsif(!defined $self->{'coding_region_start'} && + defined $self->translation) { + #calculate the coding start from the translation + my $start; + my $strand = $self->translation()->start_Exon->strand(); + if( $strand == 1 ) { + $start = $self->translation()->start_Exon->start(); + $start += ( $self->translation()->start() - 1 ); + } else { + $start = $self->translation()->end_Exon->end(); + $start -= ( $self->translation()->end() - 1 ); + } + $self->{'coding_region_start'} = $start; + } + + return $self->{'coding_region_start'}; +} + + +=head2 coding_region_end + + Arg [1] : (optional) $value + Example : $coding_region_end = $transcript->coding_region_end + Description: Retrieves the end of the coding region of this transcript + in genomic coordinates (i.e. in either slice or contig coords). + By convention, the coding_region_end is always higher than the + value returned by the coding_region_start method. + The value returned by this function is NOT the biological + coding end since on the reverse strand the biological coding + end would be the lower genomic value. + + This function will return undef if this is a pseudogene + (a non-translated transcript). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub coding_region_end { + my ($self, $value ) = @_; + + my $strand; + my $end; + + if( defined $value ) { + $self->{'coding_region_end'} = $value; + } elsif( ! defined $self->{'coding_region_end'} + && defined $self->translation() ) { + $strand = $self->translation()->start_Exon->strand(); + if( $strand == 1 ) { + $end = $self->translation()->end_Exon->start(); + $end += ( $self->translation()->end() - 1 ); + } else { + $end = $self->translation()->start_Exon->end(); + $end -= ( $self->translation()->start() - 1 ); + } + $self->{'coding_region_end'} = $end; + } + + return $self->{'coding_region_end'}; +} + + +=head2 edits_enabled + + Arg [1] : (optional) boolean $newval + Example : $transcript->edits_enabled(1); + Description: Enables/Disables the application of SeqEdits to this transcript. + Edits are enabled by default, and affect the cdna/mrna + sequences coordinates and the resultant translation. + Returntype : boolean - the current value of the edits + Exceptions : none + Caller : general, cdna_coding_start, cdna_coding_end, length + Status : Stable + +=cut + +sub edits_enabled { + my ( $self, $boolean ) = @_; + + if ( defined($boolean) ) { + $self->{'edits_enabled'} = $boolean; + + # flush cached values that will be different with/without edits + $self->{'cdna_coding_start'} = undef; + $self->{'cdna_coding_end'} = undef; + $self->{'transcript_mapper'} = undef; + } + + return $self->{'edits_enabled'}; +} + + +=head2 get_all_SeqEdits + + Arg [1] : none + Example : my @seqeds = @{$transcript->get_all_SeqEdits()}; + Description: Retrieves all post transcriptional sequence modifications for + this transcript. + Returntype : Bio::EnsEMBL::SeqEdit + Exceptions : none + Caller : spliced_seq() + Status : Stable + +=cut + +sub get_all_SeqEdits { + my $self = shift; + + my @seqeds; + + my $attribs = $self->get_all_Attributes('_rna_edit'); + + # convert attributes to SeqEdit objects + foreach my $a (@$attribs) { + push @seqeds, Bio::EnsEMBL::SeqEdit->new(-ATTRIB => $a); + } + + return \@seqeds; +} + + +=head2 get_all_Attributes + + Arg [1] : optional string $attrib_code + The code of the attribute type to retrieve values for. + Example : ($rna_edits) = @{$transcript->get_all_Attributes('_rna_edit')}; + @transc_attributes = @{$transcript->get_all_Attributes()}; + Description: Gets a list of Attributes of this transcript. + Optionally just get Attrubutes for given code. + Returntype : listref Bio::EnsEMBL::Attribute + Exceptions : warning if transcript does not have attached adaptor and + attempts lazy load. + Caller : general + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + if( ! exists $self->{'attributes' } ) { + if(!$self->adaptor() ) { + return []; + } + + my $attribute_adaptor = $self->adaptor->db->get_AttributeAdaptor(); + $self->{'attributes'} = $attribute_adaptor->fetch_all_by_Transcript($self); + } + + if( defined $attrib_code) { + my @results = grep { uc($_->code()) eq uc($attrib_code) } + @{$self->{'attributes'}}; + return \@results; + } else { + return $self->{'attributes'}; + } +} + + +=head2 add_Attributes + + Arg [1...] : Bio::EnsEMBL::Attribute $attribute + You can have more Attributes as arguments, all will be added. + Example : $transcript->add_Attributes($rna_edit_attribute); + Description: Adds an Attribute to the Transcript. Usefull to do _rna_edits. + If you add an attribute before you retrieve any from database, + lazy load will be disabled. + Returntype : none + Exceptions : throw on incorrect arguments + Caller : general + Status : Stable + +=cut + +sub add_Attributes { + my ( $self, @attribs ) = @_; + + if ( !exists( $self->{'attributes'} ) ) { + $self->{'attributes'} = []; + } + + my $seq_change = 0; + foreach my $attrib (@attribs) { + assert_ref( $attrib, 'Bio::EnsEMBL::Attribute' ); + + push( @{ $self->{'attributes'} }, $attrib ); + + if ( $attrib->code() eq "_rna_edit" ) { + $seq_change = 1; + } + } + + if ($seq_change) { + my $translation = $self->translation(); + if ( defined($translation) ) { + delete( $translation->{'seq'} ); + } + } + + # flush cdna coord cache b/c we may have added a SeqEdit + delete( $self->{'cdna_coding_start'} ); + delete( $self->{'cdna_coding_end'} ); + delete( $self->{'transcript_mapper'} ); +} ## end sub add_Attributes + + +=head2 add_Exon + + Title : add_Exon + Usage : $trans->add_Exon($exon) + Returns : None + Args [1]: Bio::EnsEMBL::Exon object to add + Args [2]: rank + Exceptions: throws if not a valid Bio::EnsEMBL::Exon + : or exon clashes with another one + Status : Stable + +=cut + +sub add_Exon { + my ( $self, $exon, $rank ) = @_; + + assert_ref( $exon, 'Bio::EnsEMBL::Exon' ); + + $self->{'_trans_exon_array'} ||= []; + + if ( defined($rank) ) { + $self->{'_trans_exon_array'}->[ $rank - 1 ] = $exon; + return; + } + + my $was_added = 0; + + my $ea = $self->{'_trans_exon_array'}; + + if ( @{$ea} ) { + if ( $exon->strand() == 1 ) { + + my $exon_start = $exon->start(); + + if ( $exon_start > $ea->[-1]->end() ) { + push( @{$ea}, $exon ); + $was_added = 1; + } else { + # Insert it at correct place + + my $i = 0; + foreach my $e ( @{$ea} ) { + if ( $exon_start < $e->start() ) { + if ( $exon->end() >= $e->start() ) { + # Overlap + last; + } + if ( $i and $exon_start <= $ea->[$i-1]->end() ) { + # Overlap + last; + } + splice( @{$ea}, $i, 0, $exon ); + $was_added = 1; + last; + } + ++$i; + } + + } + + } else { + + my $exon_end = $exon->end(); + + if ( $exon_end < $ea->[-1]->start() ) { + push( @{$ea}, $exon ); + $was_added = 1; + } else { + # Insert it at correct place + + my $i = 0; + foreach my $e ( @{$ea} ) { + if ( $exon_end > $e->end() ) { + if ( $exon->start() <= $e->end() ) { + # Overlap + last; + } + if ( $i and $exon_end >= $ea->[$i-1]->start() ) { + # Overlap + last; + } + splice( @{$ea}, $i, 0, $exon ); + $was_added = 1; + last; + } + ++$i; + } + + } + + } ## end else [ if ( $exon->strand() ==...)] + } else { + push( @{$ea}, $exon ); + $was_added = 1; + } + + # sanity check: + if ( !$was_added ) { + # The exon was not added because it was overloapping with an + # existing exon. + my $all_str = ''; + + foreach my $e ( @{$ea} ) { + $all_str .= ' ' + . $e->start() . '-' + . $e->end() . ' (' + . $e->strand() . ') ' + . ( $e->stable_id() || '' ) . "\n"; + } + + my $cur_str = ' ' + . $exon->start() . '-' + . $exon->end() . ' (' + . $exon->strand() . ') ' + . ( $exon->stable_id() || '' ) . "\n"; + + throw( "Exon overlaps with other exon in same transcript.\n" + . "Transcript Exons:\n$all_str\n" + . "This Exon:\n$cur_str" ); + } + + # recalculate start, end, slice, strand + $self->recalculate_coordinates(); +} ## end sub add_Exon + + +=head2 get_all_Exons + + Arg [CONSTITUTIVE] : Boolean + Only return constitutive exons if true (non-zero) + + Examples : my @exons = @{ $transcript->get_all_Exons() }; + + my @exons = + @{ $transcript->get_all_Exons( -constitutive => 1 ) }; + + Description: Returns an listref of the exons in this transcript + in order, i.e. the first exon in the listref is the + 5prime most exon in the transcript. Only returns + constitutive exons if the CONSTITUTIVE argument is + true. + + Returntype : listref to Bio::EnsEMBL::Exon objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Exons { + my ( $self, @args ) = @_; + + my $constitutive; + if (@args) { + $constitutive = rearrange( ['CONSTITUTIVE'], @args ); + } + + if (!defined( $self->{'_trans_exon_array'} ) + && defined( $self->adaptor() ) ) + { + $self->{'_trans_exon_array'} = + $self->adaptor()->db()->get_ExonAdaptor() + ->fetch_all_by_Transcript($self); + } + + my @result; + if ( defined($constitutive) && $constitutive != 0 ) { + foreach my $exon ( @{ $self->{'_trans_exon_array'} } ) { + if ( $exon->is_constitutive() ) { + push( @result, $exon ); + } + } + } else { + @result = @{ $self->{'_trans_exon_array'} }; + } + + return \@result; +} ## end sub get_all_Exons + +=head2 get_all_constitutive_Exons + + Arg : None + + Examples : my @exons = @{ $transcript->get_all_constitutive_Exons() }; + + Description: Returns an listref of the constitutive exons in this + transcript in order, i.e. the first exon in the + listref is the 5prime most exon in the transcript. + + Returntype : listref to Bio::EnsEMBL::Exon objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_constitutive_Exons { + my ($self) = @_; + return $self->get_all_Exons( '-constitutive' => 1 ); +} + +=head2 get_all_IntronSupportingEvidence + + Example : $ise->get_all_IntronSupportingEvidence(); + Description : Fetches all ISE instances linked to this Transript + Returntype : ArrayRef[Bio::EnsEMBL::IntronSupportEvidence] retrieved from + the DB or from those added via C + Exceptions : None + +=cut + +sub get_all_IntronSupportingEvidence { + my ($self) = @_; + if(! defined $self->{_ise_array} && defined $self->adaptor()) { + my $isea = $self->adaptor()->db()->get_IntronSupportingEvidenceAdaptor(); + $self->{_ise_array} = $isea->fetch_all_by_Transcript($self); + } + return $self->{_ise_array}; +} + + +=head2 add_IntronSupportingEvidence + + Arg [1] : Bio::EnsEMBL::IntronSupportEvidence Object to add + Example : $ise->add_IntronSupportingEvidence($ise); + Description : Adds the IntronSupportEvidence instance to this Transcript. The + code checks to see if it is a unique ISE instance + Returntype : Boolean; true means it was added. False means it was not + as this ISE was already attached + Exceptions : None + +=cut + +sub add_IntronSupportingEvidence { + my ($self, $ise) = @_; + assert_ref($ise, 'Bio::EnsEMBL::IntronSupportingEvidence', 'IntronSupportingEvidence'); + my $unique = 1; + foreach my $other_ise (@{$self->{_ise_array}}) { + if($ise->equals($other_ise)) { + $unique = 0; + last; + } + } + if($unique) { + push(@{$self->{_ise_array}}, $ise); + return 1; + } + return 0; +} + +=head2 get_all_Introns + + Arg [1] : none + Example : my @introns = @{$transcript->get_all_Introns()}; + Description: Returns an listref of the introns in this transcript in order. + i.e. the first intron in the listref is the 5prime most exon in + the transcript. + Returntype : listref to Bio::EnsEMBL::Intron objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Introns { + my ($self) = @_; + if( ! defined $self->{'_trans_exon_array'} && defined $self->adaptor() ) { + $self->{'_trans_exon_array'} = $self->adaptor()->db()-> + get_ExonAdaptor()->fetch_all_by_Transcript( $self ); + } + + my @introns=(); + my @exons = @{$self->{'_trans_exon_array'}}; + for(my $i=0; $i < scalar(@exons)-1; $i++){ + my $intron = new Bio::EnsEMBL::Intron($exons[$i],$exons[$i+1]); + push(@introns, $intron) + } + return \@introns; +} + + +=head2 length + + Args : none + Example : my $t_length = $transcript->length + Description: Returns the sum of the length of all the exons in the transcript. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub length { + my( $self ) = @_; + + my $length = 0; + foreach my $ex (@{$self->get_all_Exons}) { + $length += $ex->length; + } + + # adjust the length if post transcriptional edits are enabled + if($self->edits_enabled()) { + foreach my $se (@{$self->get_all_SeqEdits()}) { + $length += $se->length_diff(); + } + } + + return $length; +} + + +=head2 flush_Exons + + Arg [1] : none + Example : $transcript->flush_Exons(); + Description: Removes all Exons from this transcript and flushes related + internal caches. + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flush_Exons { + my ($self) = @_; + + $self->{'transcript_mapper'} = undef; + $self->{'coding_region_start'} = undef; + $self->{'coding_region_end'} = undef; + $self->{'cdna_coding_start'} = undef; + $self->{'cdna_coding_end'} = undef; + $self->{'start'} = undef; + $self->{'end'} = undef; + $self->{'strand'} = undef; + + $self->{'_trans_exon_array'} = []; +} + +=head2 flush_IntronSupportingEvidence + + Example : $transcript->flush_IntronSupportingEvidence(); + Description: Removes all IntronSupportingEvidence from this transcript + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flush_IntronSupportingEvidence { + my ($self) = @_; + $self->{_ise_array} = []; + return; +} + +=head2 five_prime_utr + + Arg [1] : none + Example : my $five_prime = $transcrpt->five_prime_utr + or warn "No five prime UTR"; + Description: Obtains a Bio::Seq object of the five prime UTR of this + transcript. If this transcript is a pseudogene + (i.e. non-translating) or has no five prime UTR undef is + returned instead. + Returntype : Bio::Seq or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub five_prime_utr { + my $self = shift; + + my $cdna_coding_start = $self->cdna_coding_start(); + + return undef if(!$cdna_coding_start); + + my $seq = substr($self->spliced_seq, 0, $cdna_coding_start - 1); + + return undef if(!$seq); + + return + Bio::Seq->new( -id => $self->display_id, + -moltype => 'dna', + -alphabet => 'dna', + -seq => $seq ); +} + + +=head2 three_prime_utr + + Arg [1] : none + Example : my $three_prime = $transcrpt->three_prime_utr + or warn "No three prime UTR"; + Description: Obtains a Bio::Seq object of the three prime UTR of this + transcript. If this transcript is a pseudogene + (i.e. non-translating) or has no three prime UTR, + undef is returned instead. + Returntype : Bio::Seq or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub three_prime_utr { + my $self = shift; + + my $cdna_coding_end = $self->cdna_coding_end(); + + return undef if(!$cdna_coding_end); + + my $seq = substr($self->spliced_seq, $cdna_coding_end); + + return undef if(!$seq); + + return + Bio::Seq->new( -id => $self->display_id, + -moltype => 'dna', + -alphabet => 'dna', + -seq => $seq ); +} + +=head2 five_prime_utr_Feature + + Example : my $five_prime = $transcrpt->five_prime_utr_Feature + or warn "No five prime UTR"; + Description: Returns the genomic coordinates of the start and end of the + 5' UTR of this transcript. Note that if you want the sequence + of the 5' UTR use C as this will return the + sequence from the spliced transcript. + Returntype : Bio::EnsEMBL::Feature or undef if there is no UTR + Exceptions : none + +=cut + +sub five_prime_utr_Feature { + my ($self) = @_; + my ($start, $end); + my $cdna_coding = $self->cdna_coding_start(); + my ($genomic_pos) = $self->cdna2genomic($cdna_coding, $cdna_coding); + if($self->strand() == 1) { + $start = $self->seq_region_start(); + if($start == $genomic_pos->start()) { + return; # just return as we have no UTR + } + $end = $genomic_pos->start() - 1; + } + else { + $end = $self->seq_region_end(); + if($end == $genomic_pos->start()) { + return; # just return as we have no UTR + } + $start = $genomic_pos->start() + 1; + } + + my $feature = Bio::EnsEMBL::Feature->new( + -START => $start, + -END => $end, + -STRAND => $self->strand(), + -SLICE => $self->slice(), + ); + return $feature; +} + +=head2 three_prime_utr_Feature + + Example : my $five_prime = $transcrpt->three_prime_utr_Feature + or warn "No three prime UTR"; + Description: Returns the genomic coordinates of the start and end of the + 3' UTR of this transcript. Note that if you want the sequence + of the 3' UTR use C as this will return the + sequence from the spliced transcript. + Returntype : Bio::EnsEMBL::Feature or undef if there is no UTR + Exceptions : none + +=cut + +sub three_prime_utr_Feature { + my ($self) = @_; + my ($start, $end); + my $cdna_coding = $self->cdna_coding_end(); + my ($genomic_pos) = $self->cdna2genomic($cdna_coding, $cdna_coding); + if($self->strand() == 1) { + $end = $self->seq_region_end(); + if($end == $genomic_pos->start()) { + return; # just return as we have no UTR + } + $start = $genomic_pos->start() + 1; + } + else { + $start = $self->seq_region_start(); + if($start == $genomic_pos->start()) { + return; # just return as we have no UTR + } + $end = $genomic_pos->start() - 1; + } + my $feature = Bio::EnsEMBL::Feature->new( + -START => $start, + -END => $end, + -STRAND => $self->strand(), + -SLICE => $self->slice(), + ); + return $feature; +} + + +=head2 get_all_translateable_Exons + + Args : none + Example : none + Description: Returns a list of exons that translate with the + start and end exons truncated to the CDS regions. + This function does not take into account any SeqEdits + (post transcriptional RNA modifictions) when constructing the + the 'translateable' exons, and it does not update the phase + information of the created 'translateable' exons. + + If this transcript is a pseudogene (i.e. non-translateable) + a reference to an empty list is returned. + + Returntype : listref Bio::EnsEMBL::Exon + Exceptions : throw if translation has invalid information + Caller : Genebuild + Status : Stable + +=cut + + +sub get_all_translateable_Exons { + my ( $self ) = @_; + + #return an empty list if there is no translation (i.e. pseudogene) + my $translation = $self->translation or return []; + my $start_exon = $translation->start_Exon; + my $end_exon = $translation->end_Exon; + my $t_start = $translation->start; + my $t_end = $translation->end; + + my( @translateable ); + + foreach my $ex (@{$self->get_all_Exons}) { + + if ($ex ne $start_exon and ! @translateable) { + next; # Not yet in translated region + } + + my $length = $ex->length; + + my $adjust_start = 0; + my $adjust_end = 0; + # Adjust to translation start if this is the start exon + if ($ex == $start_exon ) { + if ($t_start < 1 or $t_start > $length) { + warning("WARN: Translation start '$t_start' is outside exon $ex length=$length"); + return []; + } + $adjust_start = $t_start - 1; + } + + # Adjust to translation end if this is the end exon + if ($ex == $end_exon) { +# if ($t_end < 1 or $t_end > $length) { +# throw("Translation end '$t_end' is outside exon $ex length=$length"); +# } + $adjust_end = $t_end - $length; + } + + # Make a truncated exon if the translation start or + # end causes the coordinates to be altered. + if ($adjust_end || $adjust_start) { + my $newex = $ex->adjust_start_end( $adjust_start, $adjust_end ); + + push( @translateable, $newex ); + } else { + push(@translateable, $ex); + } + + # Exit the loop when we've found the last exon + last if $ex eq $end_exon; + } + return \@translateable; +} + + +=head2 translate + + Args : none + Example : none + Description: Return the peptide (plus eventual stop codon) for + this transcript. Does N-padding of non-phase + matching exons. It uses translateable_seq + internally. Returns undef if this Transcript does + not have a translation (i.e. pseudogene). + Returntype : Bio::Seq or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub translate { + my ($self) = @_; + + if ( !defined( $self->translation() ) ) { return undef } + + my $mrna = $self->translateable_seq(); + + # Alternative codon tables (such as the mitochondrial codon table) + # can be specified for a sequence region via the seq_region_attrib + # table. A list of codon tables and their codes is at: + # http://www.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c + + my $codon_table_id; + my ( $complete5, $complete3 ); + if ( defined( $self->slice() ) ) { + my $attrib; + + ($attrib) = @{ $self->slice()->get_all_Attributes('codon_table') }; + if ( defined($attrib) ) { + $codon_table_id = $attrib->value(); + } + + ($attrib) = @{ $self->slice()->get_all_Attributes('complete5') }; + if ( defined($attrib) ) { + $complete5 = $attrib->value(); + } + + ($attrib) = @{ $self->slice()->get_all_Attributes('complete3') }; + if ( defined($attrib) ) { + $complete3 = $attrib->value(); + } + } + $codon_table_id ||= 1; # default vertebrate codon table + + # Remove final stop codon from the mrna if it is present. Produced + # peptides will not have '*' at end. If terminal stop codon is + # desired call translatable_seq directly and produce a translation + # from it. + + if ( CORE::length($mrna) % 3 == 0 ) { + my $codon_table = + Bio::Tools::CodonTable->new( -id => $codon_table_id ); + + if ( $codon_table->is_ter_codon( substr( $mrna, -3, 3 ) ) ) { + substr( $mrna, -3, 3, '' ); + } + } + + if ( CORE::length($mrna) < 1 ) { return undef } + + my $display_id = $self->translation->display_id() + || scalar( $self->translation() ); + + my $peptide = Bio::Seq->new( -seq => $mrna, + -moltype => 'dna', + -alphabet => 'dna', + -id => $display_id ); + + my $translation = + $peptide->translate( undef, undef, undef, $codon_table_id, undef, + undef, $complete5, $complete3 ); + + if ( $self->edits_enabled() ) { + $self->translation()->modify_translation($translation); + } + + return $translation; +} ## end sub translate + + +=head2 seq + + Description: Returns a Bio::Seq object which consists of just + : the sequence of the exons concatenated together, + : without messing about with padding with N\'s from + : Exon phases like B does. + Args : none + Example : none + Returntype : Bio::Seq + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub seq { + my ($self) = @_; + + return + Bio::Seq->new( -id => $self->display_id, + -moltype => 'dna', + -alphabet => 'dna', + -seq => $self->spliced_seq ); +} + + +=head2 pep2genomic + + Description: See Bio::EnsEMBL::TranscriptMapper::pep2genomic + +=cut + +sub pep2genomic { + my $self = shift; + return $self->get_TranscriptMapper()->pep2genomic(@_); +} + + +=head2 genomic2pep + + Description: See Bio::EnsEMBL::TranscriptMapper::genomic2pep + +=cut + +sub genomic2pep { + my $self = shift; + return $self->get_TranscriptMapper()->genomic2pep(@_); +} + + +=head2 cdna2genomic + + Description: See Bio::EnsEMBL::TranscriptMapper::cdna2genomic + +=cut + +sub cdna2genomic { + my $self = shift; + return $self->get_TranscriptMapper()->cdna2genomic(@_); +} + + +=head2 genomic2cdna + + Description: See Bio::EnsEMBL::TranscriptMapper::genomic2cdna + +=cut + +sub genomic2cdna { + my $self = shift; + return $self->get_TranscriptMapper->genomic2cdna(@_); +} + + +=head2 get_TranscriptMapper + + Args : none + Example : my $trans_mapper = $transcript->get_TranscriptMapper(); + Description: Gets a TranscriptMapper object which can be used to perform + a variety of coordinate conversions relating this transcript, + genomic sequence and peptide resulting from this transcripts + translation. + Returntype : Bio::EnsEMBL::TranscriptMapper + Exceptions : none + Caller : cdna2genomic, pep2genomic, genomic2cdna, cdna2genomic + Status : Stable + +=cut + +sub get_TranscriptMapper { + my ( $self ) = @_; + return $self->{'transcript_mapper'} ||= + Bio::EnsEMBL::TranscriptMapper->new($self); +} + + +=head2 start_Exon + + Title : start_Exon + Usage : $start_exon = $transcript->start_Exon; + Returntype : Bio::EnsEMBL::Exon + Description : The first exon in the transcript. + Args : NONE + Status : Stable + +=cut + +sub start_Exon { + my $self = shift; + return $self->get_all_Exons()->[0]; +} + + +=head2 end_Exon + + Title : end_exon + Usage : $end_exon = $transcript->end_Exon; + Description : The last exon in the transcript. + Returntype : Bio::EnsEMBL::Exon + Args : NONE + Status : Stable + +=cut + +sub end_Exon { + my $self = shift; + return $self->get_all_Exons()->[-1]; +} + + +=head2 description + + Title : description + Usage : $obj->description($newval) + Function: + Returns : String + Args : newvalue (optional) + Status : Stable + +=cut + +sub description { + my $self = shift; + $self->{'description'} = shift if( @_ ); + return $self->{'description'}; +} + + +=head2 version + + Title : version + Usage : $obj->version() + Function: + Returns : String + Args : + Status : Stable + +=cut + +sub version { + my $self = shift; + $self->{'version'} = shift if( @_ ); + return $self->{'version'}; +} + + +=head2 stable_id + + Title : stable_id + Usage : $obj->stable_id + Function: + Returns : String + Args : + Status : Stable + +=cut + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if( @_ ); + return $self->{'stable_id'}; +} + + +=head2 is_current + + Arg [1] : Boolean $is_current + Example : $transcript->is_current(1) + Description: Getter/setter for is_current state of this transcript. + Returntype : Int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_current { + my $self = shift; + $self->{'is_current'} = shift if (@_); + return $self->{'is_current'}; +} + + +=head2 created_date + + Arg [1] : (optional) string to be used for the created date + Example : none + Description: get/set for attribute created date + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created_date { + my $self = shift; + $self->{'created_date'} = shift if ( @_ ); + return $self->{'created_date'}; +} + + +=head2 modified_date + + Arg [1] : (optional) string to be used for the modified date + Example : none + Description: get/set for attribute modified date + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub modified_date { + my $self = shift; + $self->{'modified_date'} = shift if ( @_ ); + return $self->{'modified_date'}; +} + + +=head2 swap_exons + + Arg [1] : Bio::EnsEMBL::Exon $old_Exon + An exon that should be replaced + Arg [2] : Bio::EnsEMBL::Exon $new_Exon + The replacement Exon + Example : none + Description: exchange an exon in the current Exon list with a given one. + Usually done before storing of Gene, so the Exons can + be shared between Transcripts. + Returntype : none + Exceptions : none + Caller : GeneAdaptor->store() + Status : Stable + +=cut + +sub swap_exons { + my ( $self, $old_exon, $new_exon ) = @_; + + my $arref = $self->{'_trans_exon_array'}; + for(my $i = 0; $i < @$arref; $i++) { + if($arref->[$i] == $old_exon) { + $arref->[$i] = $new_exon; + last; + } + } + + if( defined $self->{'translation'} ) { + if( $self->translation()->start_Exon() == $old_exon ) { + $self->translation()->start_Exon( $new_exon ); + } + if( $self->translation()->end_Exon() == $old_exon ) { + $self->translation()->end_Exon( $new_exon ); + } + } +} + + +=head2 equals + + Arg [1] : Bio::EnsEMBL::Transcript transcript + Example : if ($transcriptA->equals($transcriptB)) { ... } + Description : Compares two transcripts for equality. + The test for eqality goes through the following list + and terminates at the first true match: + + 1. If Bio::EnsEMBL::Feature::equals() returns false, + then the transcripts are *not* equal. + 2. If the biotypes differ, then the transcripts are + *not* equal. + 3. If both transcripts have stable IDs: if these are + the same, the transcripts are equal, otherwise not. + 4. If both transcripts have the same number of exons + and if these are (when compared pair-wise sorted by + start-position and length) the same, then they are + equal, otherwise not. + + Return type : Boolean (0, 1) + + Exceptions : Thrown if a non-transcript is passed as the argument. + +=cut + +sub equals { + my ( $self, $transcript ) = @_; + + if ( !defined($transcript) ) { return 0 } + if ( $self eq $transcript ) { return 1 } + + assert_ref( $transcript, 'Bio::EnsEMBL::Transcript' ); + + my $feature_equals = $self->SUPER::equals($transcript); + if ( defined($feature_equals) && $feature_equals == 0 ) { + return 0; + } + + if ( $self->biotype() ne $transcript->biotype() ) { + return 0; + } + + if ( defined( $self->stable_id() ) && + defined( $transcript->stable_id() ) ) + { + if ( $self->stable_id() eq $transcript->stable_id() ) { + return 1; + } + else { + return 0; + } + } + + my @self_exons = sort { + $a->start() <=> $b->start() || + $a->length() <=> $b->length() + } @{ $self->get_all_Exons() }; + my @transcript_exons = sort { + $a->start() <=> $b->start() || + $a->length() <=> $b->length() + } @{ $transcript->get_all_Exons() }; + + if ( scalar(@self_exons) != scalar(@transcript_exons) ) { + return 0; + } + + while (@self_exons) { + my $self_exon = shift(@self_exons); + my $transcript_exon = shift(@transcript_exons); + + if ( !$self_exon->equals($transcript_exon) ) { + return 0; + } + } + + return 1; +} ## end sub equals + +=head2 transform + + Arg 1 : String $coordinate_system_name + Arg [2] : String $coordinate_system_version + Example : $transcript = $transcript->transform('contig'); + $transcript = $transcript->transform('chromosome', 'NCBI33'); + Description: Moves this Transcript to the given coordinate system. + If this Transcript has Exons attached, they move as well. + A new Transcript is returned. If the transcript cannot be + transformed to the destination coordinate system undef is + returned instead. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : wrong parameters + Caller : general + Status : Medium Risk + : deprecation needs to be removed at some time + +=cut + + +sub transform { + my $self = shift; + + # catch for old style transform calls + if( ref $_[0] eq 'HASH') { + deprecate("Calling transform with a hashref is deprecate.\n" . + 'Use $trans->transfer($slice) or ' . + '$trans->transform("coordsysname") instead.'); + my (undef, $new_ex) = each(%{$_[0]}); + return $self->transfer($new_ex->slice); + } + + my $new_transcript = $self->SUPER::transform(@_); + if ( !defined($new_transcript) ) { + my @segments = @{ $self->project(@_) }; + # if it projects, maybe the exons transform well? + # lazy load them here + if ( !@segments ) { + return undef; + } + $self->get_all_Exons(); + } + + + if( exists $self->{'_trans_exon_array'} ) { + my @new_exons; + my ( $low_start, $hi_end, $slice ); + # we want to check whether the transform preserved 5prime 3prime + # ordering. This assumes 5->3 order. No complaints on transsplicing. + + my ( $last_new_start, $last_old_strand, + $last_new_strand, $start_exon, $end_exon, + $last_seq_region_name ); + my $first = 1; + my $ignore_order = 0; + my $order_broken = 0; + + for my $old_exon ( @{$self->{'_trans_exon_array'}} ) { + my $new_exon = $old_exon->transform( @_ ); + return undef if( !defined $new_exon ); + if( ! defined $new_transcript ) { + if( !$first ) { + if( $old_exon->strand() != $last_old_strand ) { + # transsplicing, ignore ordering + $ignore_order = 1; + } + + if( $new_exon->slice()->seq_region_name() ne + $last_seq_region_name ) { + return undef; + } + + if( $last_new_strand == 1 and + $new_exon->start() < $last_new_start ) { + $order_broken = 1; + } + + if( $last_new_strand == -1 and + $new_exon->start() > $last_new_start ) { + $order_broken = 1; + } + + #additional check that if exons were on same strand previously, they should be again + if(($last_old_strand == $old_exon->strand()) and !($last_new_strand == $new_exon->strand())){ + return undef; + } + + if( $new_exon->start() < $low_start ) { + $low_start = $new_exon->start(); + } + if( $new_exon->end() > $hi_end ) { + $hi_end = $new_exon->end(); + } + } else { + $first = 0; + $low_start = $new_exon->start(); + $hi_end = $new_exon->end(); + } + + $last_seq_region_name = $new_exon->slice()->seq_region_name(); + $last_old_strand = $old_exon->strand(); + $last_new_start = $new_exon->start(); + $last_new_strand = $new_exon->strand(); + } + + if( defined $self->{'translation'} ) { + if( $self->translation()->start_Exon() == $old_exon ) { + $start_exon = $new_exon; + } + if( $self->translation()->end_Exon() == $old_exon ) { + $end_exon = $new_exon; + } + } + push( @new_exons, $new_exon ); + } + + if( $order_broken && !$ignore_order ) { + warning( "Order of exons broken in transform of ".$self->dbID() ); + return undef; + } + + if( !defined $new_transcript ) { + %$new_transcript = %$self; + bless $new_transcript, ref( $self ); + $new_transcript->start( $low_start ); + $new_transcript->end( $hi_end ); + $new_transcript->slice( $new_exons[0]->slice() ); + $new_transcript->strand( $new_exons[0]->strand() ); + } + + $new_transcript->{'_trans_exon_array'} = \@new_exons; + + # should be ok to do inside exon array loop + # translations only exist together with the exons ... + + if( defined $self->{'translation'} ) { + my $new_translation; + %$new_translation = %{$self->{'translation'}};; + bless $new_translation, ref( $self->{'translation'} ); + $new_transcript->{'translation'} = $new_translation; + $new_translation->start_Exon( $start_exon ); + $new_translation->end_Exon( $end_exon ); + } + } + + if( exists $self->{'_supporting_evidence'} ) { + my @new_features; + for my $old_feature ( @{$self->{'_supporting_evidence'}} ) { + my $new_feature = $old_feature->transform( @_ ); + if (defined $new_feature) { + push @new_features, $new_feature; + } + } + $new_transcript->{'_supporting_evidence'} = \@new_features; + } + + if(exists $self->{_ise_array}) { + my @new_features; + foreach my $old_feature ( @{$self->{_ise_array}} ) { + my $new_feature = $old_feature->transform(@_); + push( @new_features, $new_feature ); + } + $new_transcript->{_ise_array} = \@new_features; + } + + + # flush cached internal values that depend on the exon coords + $new_transcript->{'transcript_mapper'} = undef; + $new_transcript->{'coding_region_start'} = undef; + $new_transcript->{'coding_region_end'} = undef; + $new_transcript->{'cdna_coding_start'} = undef; + $new_transcript->{'cdna_coding_end'} = undef; + + return $new_transcript; +} + + +=head2 transfer + + Arg 1 : Bio::EnsEMBL::Slice $destination_slice + Example : $transcript = $transcript->transfer($slice); + Description: Moves this transcript to the given slice. + If this Transcripts has Exons attached, they move as well. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub transfer { + my $self = shift; + + my $new_transcript = $self->SUPER::transfer( @_ ); + return undef unless $new_transcript; + + if( defined $self->{'translation'} ) { + my $new_translation; + %$new_translation = %{$self->{'translation'}};; + bless $new_translation, ref( $self->{'translation'} ); + $new_transcript->{'translation'} = $new_translation; + } + + if( exists $self->{'_trans_exon_array'} ) { + my @new_exons; + for my $old_exon ( @{$self->{'_trans_exon_array'}} ) { + my $new_exon = $old_exon->transfer( @_ ); + if( defined $new_transcript->{'translation'} ) { + if( $new_transcript->translation()->start_Exon() == $old_exon ) { + $new_transcript->translation()->start_Exon( $new_exon ); + } + if( $new_transcript->translation()->end_Exon() == $old_exon ) { + $new_transcript->translation()->end_Exon( $new_exon ); + } + } + push( @new_exons, $new_exon ); + } + + $new_transcript->{'_trans_exon_array'} = \@new_exons; + } + + if( exists $self->{'_supporting_evidence'} ) { + my @new_features; + for my $old_feature ( @{$self->{'_supporting_evidence'}} ) { + my $new_feature = $old_feature->transfer( @_ ); + push( @new_features, $new_feature ); + } + $new_transcript->{'_supporting_evidence'} = \@new_features; + } + + if(exists $self->{_ise_array}) { + my @new_features; + foreach my $old_feature ( @{$self->{_ise_array}} ) { + my $new_feature = $old_feature->transfer(@_); + push( @new_features, $new_feature ); + } + $new_transcript->{_ise_array} = \@new_features; + } + + + # flush cached internal values that depend on the exon coords + $new_transcript->{'transcript_mapper'} = undef; + $new_transcript->{'coding_region_start'} = undef; + $new_transcript->{'coding_region_end'} = undef; + $new_transcript->{'cdna_coding_start'} = undef; + $new_transcript->{'cdna_coding_end'} = undef; + + return $new_transcript; +} + + +=head recalculate_coordinates + + Args : none + Example : none + Description: called when exon coordinate change happened to recalculate the + coords of the transcript. This method should be called if one + of the exons has been changed. + Returntype : none + Exceptions : none + Caller : internal + Status : Stable + +=cut + +sub recalculate_coordinates { + my ($self) = @_; + + my $exons = $self->get_all_Exons(); + + if ( !$exons || !@{$exons} ) { return } + + my ( $slice, $start, $end, $strand ); + + my $e_index; + for ( $e_index = 0; $e_index < @{$exons}; $e_index++ ) { + my $e = $exons->[$e_index]; + + # Skip missing or unmapped exons! + if ( defined($e) && defined( $e->start() ) ) { + $slice = $e->slice(); + $strand = $e->strand(); + $start = $e->start(); + $end = $e->end(); + + last; + } + } + + my $transsplicing = 0; + + # Start loop after first exon with coordinates + for ( ; $e_index < @{$exons}; $e_index++ ) { + my $e = $exons->[$e_index]; + + # Skip missing or unmapped exons! + if ( !defined($e) || !defined( $e->start() ) ) { next } + + if ( $e->start() < $start ) { + $start = $e->start(); + } + + if ( $e->end() > $end ) { + $end = $e->end(); + } + + if ( defined($slice) + && $e->slice() + && $e->slice()->name() ne $slice->name() ) + { + throw( "Exons with different slices " + . "are not allowed on one Transcript" ); + } + + if ( $e->strand() != $strand ) { + $transsplicing = 1; + } + } ## end for ( ; $e_index < @{$exons...}) + if ($transsplicing) { + warning("Transcript contained trans splicing event"); + } + + $self->start($start); + $self->end($end); + $self->strand($strand); + $self->slice($slice); + + # flush cached internal values that depend on the exon coords + $self->{'transcript_mapper'} = undef; + $self->{'coding_region_start'} = undef; + $self->{'coding_region_end'} = undef; + $self->{'cdna_coding_start'} = undef; + $self->{'cdna_coding_end'} = undef; +} ## end sub recalculate_coordinates + + +=head2 display_id + + Arg [1] : none + Example : print $transcript->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For transcripts this is (depending on + availability and in this order) the stable Id, the dbID or an + empty string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'stable_id'} || $self->dbID || ''; +} + + +=head2 get_all_peptide_variations + + Description: See Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations + Status : At Risk + : Will be replaced with modules from the ensembl-variation package + + +=cut + +sub get_all_peptide_variations { + my ($self, $source, $snps) = @_; + + if(!$snps) { + my $shash = Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs($self, $source); + $snps = $shash->{'coding'}; + } + + return Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations($self, + $snps); +} + + +=head2 get_all_SNPs + + Description: See Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs + + Status : At Risk + : Will be replaced with modules from the ensembl-variation package + +=cut + +sub get_all_SNPs { + return Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs(@_); +} + + +=head2 get_all_cdna_SNPs + + Description: See Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs + + Status : At Risk + : Will be replaced with modules from the ensembl-variation package + +=cut + +sub get_all_cdna_SNPs { + return Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(@_); +} + + +=head2 get_all_DASFactories + + Arg [1] : none + Function : Retrieves a listref of registered DAS objects + Returntype: [ DAS_objects ] + Exceptions: + Caller : + Example : $dasref = $prot->get_all_DASFactories + Status : Stable + +=cut + +sub get_all_DASFactories { + my $self = shift; + return [ $self->adaptor()->db()->_each_DASFeatureFactory ]; +} + + +=head2 get_all_DAS_Features + + Arg [1] : none + Example : $features = $prot->get_all_DAS_Features; + Description: Retreives a hash reference to a hash of DAS feature + sets, keyed by the DNS, NOTE the values of this hash + are an anonymous array containing: + (1) a pointer to an array of features; + (2) a pointer to the DAS stylesheet + Returntype : hashref of Bio::SeqFeatures + Exceptions : ? + Caller : webcode + Status : Stable + + +=cut + +sub get_all_DAS_Features { + my ($self,@args) = @_; + + my $db = $self->adaptor->db; + my $GeneAdaptor = $db->get_GeneAdaptor; + my $Gene = $GeneAdaptor->fetch_by_transcript_stable_id($self->stable_id); + my $slice = $Gene->feature_Slice; + return $self->SUPER::get_all_DAS_Features($slice); +} + + + +=head2 _compare_xrefs + + Description: compare xrefs based on priority (descending), then + name (ascending), then display_label (ascending) + +=cut + +sub _compare_xrefs { + # compare on priority first (descending) + if ( $a->priority() != $b->priority() ) { + return $b->priority() <=> $a->priority(); + } else { + # equal priorities, compare on external_db name + if ( $a->dbname() ne $b->dbname() ) { + return $a->dbname() cmp $b->dbname(); + } else { + # equal priorities and names, compare on display_label + return $a->display_id() cmp $b->display_id(); + } + } +} + + +=head2 load + + Arg [1] : Boolean $load_xrefs + Load (or don't load) xrefs. Default is to load xrefs. + Example : $transcript->load(); + Description : The Ensembl API makes extensive use of + lazy-loading. Under some circumstances (e.g., + when copying genes between databases), all data of + an object needs to be fully loaded. This method + loads the parts of the object that are usually + lazy-loaded. It will also call the equivalent + method on any translation and on all exons of the + transcript. + Returntype : None + +=cut + +sub load { + my ( $self, $load_xrefs ) = @_; + + if ( !defined($load_xrefs) ) { $load_xrefs = 1 } + + my $translation = $self->translation(); + if ( defined($translation) ) { + $translation->load($load_xrefs); + + my $alt_translations = $self->get_all_alternative_translations(); + + if ( defined($alt_translations) ) { + foreach my $alt_translation ( @{$alt_translations} ) { + $alt_translation->load($load_xrefs); + } + } + } + + foreach my $exon ( @{ $self->get_all_Exons() } ) { + $exon->load(); + } + + $self->stable_id(); + $self->analysis(); + $self->get_all_Attributes(); + $self->get_all_supporting_features(); + + if ($load_xrefs) { + $self->get_all_DBEntries(); + } + +} ## end sub load + +=head2 summary_as_hash + + Example : $transcript_summary = $transcript->summary_as_hash(); + Description : Extends Feature::summary_as_hash + Retrieves a summary of this Transcript. + Returns : hashref of descriptive strings + Status : Intended for internal use +=cut + +sub summary_as_hash { + my $self = shift; + my $summary_ref = $self->SUPER::summary_as_hash; + $summary_ref->{'description'} = $self->description; + $summary_ref->{'biotype'} = $self->biotype; + my $parent_gene = $self->get_Gene(); + $summary_ref->{'Parent'} = $parent_gene->display_id; + return $summary_ref; +} + +=head2 get_Gene + + Example : $gene = $transcript->get_Gene; + Description : Locates the parent Gene using a transcript dbID + Returns : Bio::EnsEMBL::Gene + +=cut + +sub get_Gene { + my $self = shift; + my $gene_adaptor = $self->adaptor->db->get_GeneAdaptor(); + my $parent_gene = $gene_adaptor->fetch_by_transcript_id($self->dbID); + return $parent_gene; +} + +########################### +# DEPRECATED METHODS FOLLOW +########################### + +=head2 sort + + Description: DEPRECATED. This method is no longer needed. Exons are sorted + automatically when added to the transcript. + +=cut + +sub sort { + my $self = shift; + + deprecate( "Exons are kept sorted, you dont have to call sort any more" ); + # Fetch all the features + my @exons = @{$self->get_all_Exons()}; + + # Empty the feature table + $self->flush_Exons(); + + # Now sort the exons and put back in the feature table + my $strand = $exons[0]->strand; + + if ($strand == 1) { + @exons = sort { $a->start <=> $b->start } @exons; + } elsif ($strand == -1) { + @exons = sort { $b->start <=> $a->start } @exons; + } + + foreach my $e (@exons) { + $self->add_Exon($e); + } +} + + +# _translation_id +# Usage : DEPRECATED - not needed anymore + +sub _translation_id { + my $self = shift; + deprecate( "This method shouldnt be necessary any more" ); + if( @_ ) { + my $value = shift; + $self->{'_translation_id'} = $value; + } + return $self->{'_translation_id'}; + +} + + +=head2 created + + Description: DEPRECATED - this attribute is not part of transcript anymore + +=cut + +sub created{ + my $obj = shift; + deprecate( "This attribute is no longer supported" ); + if( @_ ) { + my $value = shift; + $obj->{'created'} = $value; + } + return $obj->{'created'}; +} + + +=head2 modified + + Description: DEPRECATED - this attribute is not part of transcript anymore + +=cut + +sub modified{ + my $obj = shift; + deprecate( "This attribute is no longer supported" ); + if( @_ ) { + my $value = shift; + $obj->{'modified'} = $value; + } + return $obj->{'modified'}; +} + + +=head2 temporary_id + + Function: DEPRECATED: Use dbID or stable_id or something else instead + +=cut + +sub temporary_id{ + my ($obj,$value) = @_; + deprecate( "I cant see what a temporary_id is good for, please use dbID" . + "or stableID or\ntry without an id." ); + if( defined $value) { + $obj->{'temporary_id'} = $value; + } + return $obj->{'temporary_id'}; +} + + +=head2 type + + Description: DEPRECATED. Use biotype() instead. + +=cut + +sub type { + deprecate("Use biotype() instead"); + biotype(@_); +} + + +=head2 confidence + + Description: DEPRECATED. Use status() instead. + +=cut + +sub confidence { + deprecate("Use status() instead"); + status(@_); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/TranscriptFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/TranscriptFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,325 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::TranscriptFactory - Module having the fset2transcript* +subroutines + +=head1 SYNOPSIS + + use Bio::EnsEMBL::TranscriptFactory; + + &Bio::EnsEMBL::TranscriptFactory::fset2transcript($fset_id); + +=head1 DESCRIPTION + +Module containing the subroutines fset2transcript*, +which create transcripts from features (formally housed in +Bio::EnsEMBL::DBSQL::Utils). + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::TranscriptFactory; + +use strict; + +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Transcript; + +sub fset2transcript { + my ($genscan,$contig)=@_; + + + unless ($genscan->isa ("Bio::EnsEMBL::SeqFeatureI")) + {print "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";} + + my $transcript = new Bio::EnsEMBL::Transcript; + $transcript->temporary_id($contig->id . "." . $genscan->seqname); + + my @exons; + my $count= 1; + + foreach my $f ($genscan->sub_SeqFeature) { + + my $exon = new Bio::EnsEMBL::Exon; + $transcript->add_Exon($exon); + $exon->contig ($contig); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + $exon->phase ($f->phase); + $exon->end_phase( ($exon->phase + $exon->length)%3 ); + #$exon->score($f->score); +# $exon->p_value($f->p_value); + $exon->slice($contig->primary_seq); + + push(@exons,$exon); + $count++; + + } + + if( $count == 1 ) { + $genscan->throw("Got a 0 exon genscan"); + } + + my $translation = new Bio::EnsEMBL::Translation; + # + # This code got changed due to Translation convention changing. Should work... + # + + if ($exons[0]->strand == 1) { + @exons = sort {$a->start <=> $b->start} @exons; + } else { + @exons = sort {$b->start <=> $a->start} @exons; + } + + $translation->start(1); + $translation->end($exons[scalar(@exons)-1]->length); + + $translation->start_Exon($exons[0]); + $translation->end_Exon($exons[$#exons]); + + my $endphase = $exons[0]->end_phase; + + foreach my $exon (@exons) { + + if ( $exon == $exons[0] ){ + next; + } + $exon->phase ($endphase); + $endphase = $exon->end_phase; + } + + $transcript->translation($translation); + + return $transcript; +} + +sub fset2transcript_guess_phases { + my ($fset,$contig) = @_; + + my $transcript = new Bio::EnsEMBL::Transcript; + + $transcript->temporary_id($contig->id . "." . $fset->id); + + + my @exons; + my $count = 1; + + foreach my $f ($fset->sub_SeqFeature) { + + my $exon = new Bio::EnsEMBL::Exon; + $exon->contig ($contig); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + #$exon->score($f->score); +# $exon->p_value($f->p_value); + $exon->slice($contig); + $exon->phase($f->phase); + push(@exons,$exon); + $count++; + + } + + my $translation = new Bio::EnsEMBL::Translation; + + if ($exons[0]->strand == 1) { + @exons = sort {$a->start <=> $b->start} @exons; + } else { + @exons = sort {$b->start <=> $a->start} @exons; + } + + $translation->start (1); + $translation->end ($exons[$#exons]->end - $exons[$#exons]->start + 1); + $translation->start_Exon($exons[0]); + $translation->end_Exon($exons[$#exons]); + $transcript->translation($translation); + + my $endphase = 0; + + foreach my $exon (@exons) { + + $exon ->phase ($endphase); + $transcript->add_Exon($exon); + + $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); + + } + + + if ($transcript->translate->seq !~ /\*/) { + return $transcript; + } + + $endphase = 1; + + foreach my $exon (@exons) { + $exon->phase($endphase); + $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); + } + + if ($transcript->translate->seq !~ /\*/) { + return $transcript; + } + + $endphase = 2; + + foreach my $exon (@exons) { + $exon->phase($endphase); + $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); + } + + if ($transcript->translate->seq !~ /\*/) { + return $transcript; + } +} + +sub fset2transcript_3frame { + my ($fset,$contig) = @_; + + my @f = $fset->sub_SeqFeature; + + if ($f[0]->strand == 1) { + @f = sort {$a->start <=> $b->start} @f; + } else { + @f = sort {$b->start <=> $a->start} @f; + } + + my @transcripts; + + my $startphase = 0; + + while ($startphase < 3) { + my $endphase = $startphase; + + my $transcript = new Bio::EnsEMBL::Transcript; + + push(@transcripts,$transcript); + + $transcript->temporary_id($contig->id . "." . $endphase); + + my $count = 1; + my @exons; + + + foreach my $f (@f) { + #print "exon seqname = ".$f->seqname."\n"; + my $exon = new Bio::EnsEMBL::Exon; + #print STDERR "exon ".$f->gffstring."\n"; + push(@exons,$exon); + $exon->seqname($f->seqname); + $exon->temporary_id ($contig->id . ".$count"); + $exon->contig ($contig); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + $exon->slice($contig); + $exon->phase ($endphase); + $exon->end_phase( ($exon->phase + $exon->length)%3 ); + #$exon->score ($f->score); +# $exon->p_value ($f->p_value); + $endphase = $exon->end_phase; + + $transcript->add_Exon($exon); + $count++; + + #print STDERR "Added exon start " . $exon->start . " end " . $exon->end . " strand " . $exon->strand . " score " . $exon->score . " pvalue " . $exon->p_value . "\n"; + } + + my $translation = new Bio::EnsEMBL::Translation; + + my $contig_id = ""; + my $fset_id = ""; + + if (defined($contig->id)) { + $contig_id = $contig->id; + } + if (defined($fset->id)) { + $fset_id = $fset->id; + } + + $translation->temporary_id($contig_id . "." . $fset_id); + $translation->start (1); + $translation->end ($exons[$#exons]->end - $exons[$#exons]->start + 1); + $translation->start_Exon($exons[0]); + $translation->end_Exon ($exons[$#exons]); + $transcript->translation($translation); + + # print STDERR "Phase $startphase " . $transcript->translate->seq . "\n"; + + $startphase++; + } + #print "finshed fset2transcript_3frame\n"; + return @transcripts; +} + + +sub fset2transcript_with_seq { + my ($genscan,$seq)=@_; + + + unless ($genscan->isa ("Bio::EnsEMBL::SeqFeatureI")) + {print "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";} + unless ($seq->isa ("Bio::PrimarySeqI") || $seq->isa ("Bio::SeqI")) + {print "$seq must be Bio::SeqI or a Bio::PrimarySeqI\n";} + + #print STDERR "running fset2transcript\n"; + my $transcript = new Bio::EnsEMBL::Transcript; + $transcript->temporary_id($seq->id . "." . $genscan->seqname); + + my @exons; + my $count= 1; + + foreach my $f ($genscan->sub_SeqFeature) { + + my $exon = new Bio::EnsEMBL::Exon; + $exon->contig ($seq); + $exon->start ($f->start); + $exon->end ($f->end ); + $exon->strand ($f->strand); + $exon->phase ($f->phase); + $exon->end_phase( ($exon->phase + $exon->length)%3 ); + #$exon->score ($f->score); + #print STDERR "contig is a = ".$seq."\n"; + $exon->slice($seq); + + push(@exons,$exon); + $count++; + + } + + foreach my $exon (@exons) { + + $transcript->add_Exon($exon); + + + } + return $transcript; + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/TranscriptMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,512 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +TranscriptMapper - A utility class used to perform coordinate conversions +between a number of coordinate systems relating to transcripts + +=head1 SYNOPSIS + + my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript); + + @coords = $trmapper->cdna2genomic( 123, 554 ); + + @coords = $trmapper->genomic2cdna( 141, 500, -1 ); + + @coords = $trmapper->genomic2cds( 141, 500, -1 ); + + @coords = $trmapper->pep2genomic( 10, 60 ); + + @coords = $trmapper->genomic2pep( 123, 400, 1 ); + +=head1 DESCRIPTION + +This is a utility class which can be used to perform coordinate conversions +between a number of coordinate systems relating to transcripts. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::TranscriptMapper; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw); + +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Mapper::Gap; +use Bio::EnsEMBL::Mapper::Coordinate; + + + +=head2 new + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which a TranscriptMapper should be created. + Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript) + Description: Creates a TranscriptMapper object which can be used to perform + various coordinate transformations relating to transcripts. + Note that the TranscriptMapper uses the transcript state at the + time of creation to perform the conversions, and that a new + TranscriptMapper must be created if the Transcript is altered. + 'Genomic' coordinates are coordinates which are relative to the + slice that the Transcript is on. + Returntype : Bio::EnsEMBL::TranscriptMapper + Exceptions : throws if a transcript is not an argument + Caller : Transcript::get_TranscriptMapper + Status : Stable + +=cut + +sub new { + my $caller = shift; + my $transcript = shift; + + my $class = ref($caller) || $caller; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw("Transcript argument is required."); + } + + + my $exons = $transcript->get_all_Exons(); + my $start_phase; + if(@$exons) { + $start_phase = $exons->[0]->phase; + } else { + $start_phase = -1; + } + + # Create a cdna <-> genomic mapper and load it with exon coords + my $mapper = _load_mapper($transcript,$start_phase); + + my $self = bless({'exon_coord_mapper' => $mapper, + 'start_phase' => $start_phase, + 'cdna_coding_start' => $transcript->cdna_coding_start(), + 'cdna_coding_end' => $transcript->cdna_coding_end()}, + $class); +} + + +=head2 _load_mapper + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + The transcript for which a mapper should be created. + Example : my $mapper = _load_mapper($transcript); + Description: loads the mapper + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : Internal + Status : Stable + +=cut + +sub _load_mapper { + my $transcript = shift; + my $start_phase = shift; + + my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic'); + + my $edits_on = $transcript->edits_enabled(); + my @edits; + + if($edits_on) { + @edits = @{$transcript->get_all_SeqEdits()}; + @edits = sort {$a->start() <=> $b->start()} @edits; + } + + my $edit_shift = 0; + + my $cdna_start = undef; + + my $cdna_end = 0; + + + foreach my $ex (@{$transcript->get_all_Exons}) { + my $gen_start = $ex->start(); + my $gen_end = $ex->end(); + + $cdna_start = $cdna_end + 1; + $cdna_end = $cdna_start + $ex->length() - 1; + + my $strand = $ex->strand(); + + # add deletions and insertions into pairs when SeqEdits turned on + # ignore mismatches (i.e. treat as matches) + if($edits_on) { + while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) { + + my $edit = shift(@edits); + my $len_diff = $edit->length_diff(); + + if($len_diff) { + # break pair into two parts, finish first pair just before edit + + my $prev_cdna_end = $edit->start() + $edit_shift - 1; + my $prev_cdna_start = $cdna_start; + my $prev_len = $prev_cdna_end - $prev_cdna_start + 1; + + my $prev_gen_end; + my $prev_gen_start; + if($strand == 1) { + $prev_gen_start = $gen_start; + $prev_gen_end = $gen_start + $prev_len - 1; + } else { + $prev_gen_start = $gen_end - $prev_len + 1; + $prev_gen_end = $gen_end; + } + + if($prev_len > 0) { # only create map pair if not boundary case + $mapper->add_map_coordinates + ('cdna', $prev_cdna_start, $prev_cdna_end, $strand, + 'genome', $prev_gen_start,$prev_gen_end); + } + + $cdna_start = $prev_cdna_end + 1; + + if($strand == 1) { + $gen_start = $prev_gen_end + 1; + } else { + $gen_end = $prev_gen_start - 1; + } + + $cdna_end += $len_diff; + + if($len_diff > 0) { + # insert in cdna, shift cdna coords along + $cdna_start += $len_diff; + } else { + # delete in cdna (insert in genomic), shift genomic coords along + + if($strand == 1) { + $gen_start -= $len_diff; + } else { + $gen_end += $len_diff; + } + } + + $edit_shift += $len_diff; + } + } + } + + my $pair_len = $cdna_end - $cdna_start + 1; + + if($pair_len > 0) { + $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand, + 'genome', $gen_start, $gen_end); + } + } + + return $mapper; +} + + +=head2 cdna2genomic + + Arg [1] : $start + The start position in cdna coordinates + Arg [2] : $end + The end position in cdna coordinates + Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end); + Description: Converts cdna coordinates to genomic coordinates. The + return value is a list of coordinates and gaps. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throws if no start or end + Caller : general + Status : Stable + +=cut + + +sub cdna2genomic { + my ($self,$start,$end) = @_; + + if( !defined $end ) { + throw("Must call with start/end"); + } + + my $mapper = $self->{'exon_coord_mapper'}; + + return $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" ); + +} + + +=head2 genomic2cdna + + Arg [1] : $start + The start position in genomic coordinates + Arg [2] : $end + The end position in genomic coordinates + Arg [3] : $strand + The strand of the genomic coordinates (default value 1) + Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd); + Description: Converts genomic coordinates to cdna coordinates. The + return value is a list of coordinates and gaps. Gaps + represent intronic or upstream/downstream regions which do + not comprise this transcripts cdna. Coordinate objects + represent genomic regions which map to exons (utrs included). + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throws if start, end or strand not defined + Caller : general + Status : Stable + +=cut + +sub genomic2cdna { + my ($self, $start, $end, $strand) = @_; + + unless(defined $start && defined $end && defined $strand) { + throw("start, end and strand arguments are required\n"); + } + + my $mapper = $self->{'exon_coord_mapper'}; + + return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic"); + +} + + +=head2 cds2genomic + + Arg [1] : int $start + start position in cds coords + Arg [2] : int $end + end position in cds coords + Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306); + Description: Converts cds coordinates into genomic coordinates. The + coordinates returned are relative to the same slice that the + transcript used to construct this TranscriptMapper was on. + Returntype : list of Bio::EnsEMBL::Mapper::Gap and + Bio::EnsEMBL::Mapper::Coordinate objects + Exceptions : throws if no end + Caller : general + Status : at risk + +=cut + +sub cds2genomic { + my ( $self, $start, $end ) = @_; + + if ( !( defined($start) && defined($end) ) ) { + throw("Must call with start and end"); + } + + # Move start end into translate cDNA coordinates now. + $start = $start +( $self->{'cdna_coding_start'} - 1 ) ; + $end = $end + ( $self->{'cdna_coding_start'} - 1 ); + + return $self->cdna2genomic( $start, $end ); +} + +=head2 pep2genomic + + Arg [1] : int $start + start position in peptide coords + Arg [2] : int $end + end position in peptide coords + Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102); + Description: Converts peptide coordinates into genomic coordinates. The + coordinates returned are relative to the same slice that the + transcript used to construct this TranscriptMapper was on. + Returntype : list of Bio::EnsEMBL::Mapper::Gap and + Bio::EnsEMBL::Mapper::Coordinate objects + Exceptions : throws if no end + Caller : general + Status : Stable + +=cut + +sub pep2genomic { + my ( $self, $start, $end ) = @_; + + if ( !( defined($start) && defined($end) ) ) { + throw("Must call with start and end"); + } + + # Take possible N-padding at beginning of CDS into account. + my $start_phase = $self->{'start_phase'}; + my $shift = ( $start_phase > 0 ) ? $start_phase : 0; + + # Move start end into translate cDNA coordinates now. + $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift; + $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift; + + return $self->cdna2genomic( $start, $end ); +} + + +=head2 genomic2cds + + Arg [1] : int $start + The genomic start position + Arg [2] : int $end + The genomic end position + Arg [3] : int $strand + The genomic strand + Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand); + Description: Converts genomic coordinates into CDS coordinates of the + transcript that was used to create this transcript mapper. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throw if start, end or strand not defined + Caller : general + Status : Stable + +=cut + +sub genomic2cds { + my ($self, $start, $end, $strand) = @_; + + if(!defined($start) || !defined($end) || !defined($strand)) { + throw("start, end and strand arguments are required"); + } + + if($start > $end + 1) { + throw("start arg must be less than or equal to end arg + 1"); + } + + my $cdna_cstart = $self->{'cdna_coding_start'}; + my $cdna_cend = $self->{'cdna_coding_end'}; + + #this is a pseudogene if there is no coding region + if(!defined($cdna_cstart)) { + #return a gap of the entire requested region, there is no CDS + return Bio::EnsEMBL::Mapper::Gap->new($start,$end); + } + + my @coords = $self->genomic2cdna($start, $end, $strand); + + my @out; + + foreach my $coord (@coords) { + if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { + push @out, $coord; + } else { + my $start = $coord->start; + my $end = $coord->end; + + if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) { + #is all gap - does not map to peptide + push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end); + } else { + #we know area is at least partially overlapping CDS + + my $cds_start = $start - $cdna_cstart + 1; + my $cds_end = $end - $cdna_cstart + 1; + + if($start < $cdna_cstart) { + #start of coordinates are in the 5prime UTR + push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1); + + #start is now relative to start of CDS + $cds_start = 1; + } + + my $end_gap = undef; + if($end > $cdna_cend) { + #end of coordinates are in the 3prime UTR + $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end); + #adjust end to relative to CDS start + $cds_end = $cdna_cend - $cdna_cstart + 1; + } + + #start and end are now entirely in CDS and relative to CDS start + $coord->start($cds_start); + $coord->end($cds_end); + + push @out, $coord; + + if($end_gap) { + #push out the region which was in the 3prime utr + push @out, $end_gap; + } + } + } + } + + return @out; + +} + + +=head2 genomic2pep + + Arg [1] : $start + The start position in genomic coordinates + Arg [2] : $end + The end position in genomic coordinates + Arg [3] : $strand + The strand of the genomic coordinates + Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand); + Description: Converts genomic coordinates to peptide coordinates. The + return value is a list of coordinates and gaps. + Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and + Bio::EnsEMBL::Mapper::Gap objects + Exceptions : throw if start, end or strand not defined + Caller : general + Status : Stable + +=cut + +sub genomic2pep { + my ($self, $start, $end, $strand) = @_; + + unless(defined $start && defined $end && defined $strand) { + throw("start, end and strand arguments are required"); + } + + my @coords = $self->genomic2cds($start, $end, $strand); + + my @out; + + my $start_phase = $self->{'start_phase'}; + + #take into account possible N padding at beginning of CDS + my $shift = ($start_phase > 0) ? $start_phase : 0; + + foreach my $coord (@coords) { + if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) { + push @out, $coord; + } else { + + #start and end are now entirely in CDS and relative to CDS start + + #convert to peptide coordinates + my $pep_start = int(($coord->start + $shift + 2) / 3); + my $pep_end = int(($coord->end + $shift + 2) / 3); + $coord->start($pep_start); + $coord->end($pep_end); + + push @out, $coord; + } + } + + return @out; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Translation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Translation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1227 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Translation - A class representing the translation of a +transcript + +=head1 SYNOPSIS + + my $translation = Bio::EnsEMBL::Translation->new( + -START_EXON => $exon1, + -END_EXON => $exon2, + -SEQ_START => 98, + -SEQ_END => 39 + ); + + # stable ID setter + $translation->stable_id('ENSP00053458'); + + # get start and end position in start/end exons + my $start = $translation->start; + my $end = $translation->end; + +=head1 DESCRIPTION + +A Translation object defines the CDS and UTR regions of a Transcript +through the use of start_Exon/end_Exon, and start/end attributes. + +=cut + + +package Bio::EnsEMBL::Translation; + +use vars qw($AUTOLOAD @ISA); +use strict; + +use Scalar::Util qw(weaken isweak); + +use Bio::EnsEMBL::Utils::Exception qw( deprecate throw warning ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); + +use Bio::EnsEMBL::Storable; + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [-START_EXON] : The Exon object in which the translation (CDS) starts + Arg [-END_EXON] : The Exon object in which the translation (CDS) ends + Arg [-SEQ_START] : The offset in the start_Exon indicating the start + position of the CDS. + Arg [-SEQ_END] : The offset in the end_Exon indicating the end + position of the CDS. + Arg [-STABLE_ID] : The stable identifier for this Translation + Arg [-VERSION] : The version of the stable identifier + Arg [-DBID] : The internal identifier of this Translation + Arg [-ADAPTOR] : The TranslationAdaptor for this Translation + Arg [-SEQ] : Manually sets the peptide sequence of this translation. + May be useful if this translation is not stored in + a database. + Arg [-CREATED_DATE]: the date the translation was created + Arg [-MODIFIED_DATE]: the date the translation was modified + Example : my $tl = Bio::EnsEMBL::Translation->new + (-START_EXON => $ex1, + -END_EXON => $ex2, + -SEQ_START => 98, + -SEQ_END => 39); + Description: Constructor. Creates a new Translation object + Returntype : Bio::EnsEMBL::Translation + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my ( $start_exon, $end_exon, $seq_start, $seq_end, + $stable_id, $version, $dbID, $adaptor, $seq, + $created_date, $modified_date ) = + rearrange( [ "START_EXON", "END_EXON", "SEQ_START", "SEQ_END", + "STABLE_ID", "VERSION", "DBID", "ADAPTOR", + "SEQ", "CREATED_DATE", "MODIFIED_DATE" ], @_ ); + + my $self = bless { + 'start_exon' => $start_exon, + 'end_exon' => $end_exon, + 'dbID' => $dbID, + 'start' => $seq_start, + 'end' => $seq_end, + 'stable_id' => $stable_id, + 'version' => $version, + 'created_date' => $created_date, + 'modified_date' => $modified_date, + 'seq' => $seq + }, $class; + + $self->adaptor($adaptor); + + return $self; +} + +=head2 new_fast + + Arg [1] : hashref to be blessed + Description: Construct a new Bio::EnsEMBL::Translation using the hashref. + Exceptions : none + Returntype : Bio::EnsEMBL::Translation + Caller : general, subclass constructors + Status : Stable + +=cut + + +sub new_fast { + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); + return $self; +} + +=head2 transcript + + Arg [1] : Transcript object (optional) + Description : Sets or retrieves the transcript object associated + with this translation object. + Exceptions : Throws if there is no adaptor or no dbID defined for + the translation object. + Returntype : Bio::EnsEMBL::Transcript +=cut + +sub transcript { + my ( $self, $transcript ) = @_; + + if ( defined($transcript) ) { + assert_ref( $transcript, 'Bio::EnsEMBL::Transcript' ); + + $self->{'transcript'} = $transcript; + + weaken( $self->{'transcript'} ); # Avoid circular references. + + } elsif ( @_ > 1 ) { + # Break connection to transcript. + delete( $self->{'transcript'} ); + } elsif ( !defined( $self->{'transcript'} ) ) { + my $adaptor = $self->adaptor; + if ( !defined($adaptor) ) { + throw( "Adaptor is not set for translation, " + . "can not fetch its transcript." ); + } + + my $dbID = $self->{'dbID'}; + if ( !defined($dbID) ) { + throw( "dbID is not set for translation, " + . " can not fetch its transcript." ); + } + + $self->{'transcript'} = + $adaptor->db()->get_TranscriptAdaptor() + ->fetch_by_translation_id($dbID); + + # Do not weaken the reference if we had to get the transcript from the + # database. The user is probably working on translations directly, + # not going through transcripts. + #weaken( $self->{'transcript'} ); # Avoid circular references. + } + + return $self->{'transcript'}; +} ## end sub transcript + + +=head2 start + + Arg [1] : (optional) int $start - start position to set + Example : $translation->start(17); + Description: Getter/setter for the value of start, which is a position within + the exon given by start_Exon. + + If you need genomic coordinates, use the genomic_start() + method. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub start{ + my $obj = shift; + if( @_ ) { + my $value = shift; + + $obj->{'start'} = $value; + } + return $obj->{'start'}; + +} + + +=head2 end + + Arg [1] : (optional) int $end - end position to set + Example : $translation->end(8); + Description: Getter/setter for the value of end, which is a position within + the exon given by end_Exon. + + If you need genomic coordinates, use the genomic_end() + method. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub end { + my $self = shift; + if( @_ ) { + my $value = shift; + + $self->{'end'} = $value; + } + return $self->{'end'}; + +} + + +=head2 start_Exon + + Arg [1] : (optional) Bio::EnsEMBL::Exon - start exon to assign + Example : $translation->start_Exon($exon1); + Description: Getter/setter for the value of start_Exon, which denotes the + exon at which translation starts (and within this exon, at the + position indicated by start, see above). + Returntype : Bio::EnsEMBL::Exon + Exceptions : thrown on wrong argument type + Caller : general + Status : Stable + +=cut + +sub start_Exon { + my $self = shift; + + if( @_ ) { + my $value = shift; + if( !ref $value || !$value->isa('Bio::EnsEMBL::Exon') ) { + throw("Got to have an Exon object, not a $value"); + } + $self->{'start_exon'} = $value; + } + return $self->{'start_exon'}; +} + + +=head2 end_Exon + + Arg [1] : (optional) Bio::EnsEMBL::Exon - start exon to assign + Example : $translation->start_Exon($exon1); + Description: Getter/setter for the value of end_Exon, which denotes the + exon at which translation ends (and within this exon, at the + position indicated by end, see above). + Returntype : Bio::EnsEMBL::Exon + Exceptions : thrown on wrong argument type + Caller : general + Status : Stable + +=cut + +sub end_Exon { + my $self = shift; + if( @_ ) { + my $value = shift; + if( !ref $value || !$value->isa('Bio::EnsEMBL::Exon') ) { + throw("Got to have an Exon object, not a $value"); + } + $self->{'end_exon'} = $value; + } + + return $self->{'end_exon'}; +} + +=head2 cdna_start + + Arg [1] : (optional) Bio::EnsEMBL::Transcript $transcript + The transcript which this is a translation of. + Example : $translation_cdna_start = $translation->cdna_start(); + Description : Returns the start position of the translation in cDNA + coordinates. + If no transcript is given, the method will use + TranscriptAdaptor->fetch_by_translation_id() to locate + the correct transcript. + Return type : Integer + Exceptions : Throws if the given (optional) argument is not a + transcript. + Caller : General + Status : At Risk (Under Development) + +=cut + +sub cdna_start { + my ( $self, $transcript ) = @_; + + if ( defined($transcript) + && ( !ref($transcript) + || !$transcript->isa('Bio::EnsEMBL::Transcript') ) ) + { + throw("Argument is not a transcript"); + } + + if ( !exists( $self->{'cdna_start'} ) ) { + if ( !defined($transcript) ) { + # We were not given a transcript, get the transcript out of + # the database. + $transcript = $self->transcript(); + } + + $self->{'cdna_start'} = + $self->start_Exon()->cdna_coding_start($transcript); + } + + return $self->{'cdna_start'}; +} + +=head2 cdna_end + + Arg [1] : (optional) Bio::EnsEMBL::Transcript $transcript + The transcript which this is a translation of. + Example : $translation_cdna_end = $translation->cdna_end(); + Description : Returns the end position of the translation in cDNA + coordinates. + If no transcript is given, the method will use + TranscriptAdaptor->fetch_by_translation_id() to locate + the correct transcript. + Return type : Integer + Exceptions : Throws if the given (optional) argument is not a + transcript. + Caller : General + Status : At Risk (Under Development) + +=cut + +sub cdna_end { + my ( $self, $transcript ) = @_; + + if ( defined($transcript) + && ( !ref($transcript) + || !$transcript->isa('Bio::EnsEMBL::Transcript') ) ) + { + throw("Argument is not a transcript"); + } + + if ( !exists( $self->{'cdna_end'} ) ) { + if ( !defined($transcript) ) { + # We were not given a transcript, get the transcript out of + # the database. + $transcript = $self->transcript(); + } + + $self->{'cdna_end'} = + $self->end_Exon()->cdna_coding_end($transcript); + } + + return $self->{'cdna_end'}; +} + +=head2 genomic_start + + Args : None + Example : $translation_genomic_start = + $translation->genomic_start(); + Description : Returns the start position of the translation in + genomic coordinates on the forward strand. + Return type : Integer + Exceptions : None + Caller : General + Status : At Risk (Under Development) + +=cut + +sub genomic_start { + my $self = shift; + + if ( !exists $self->{'genomic_start'} ) { + if ( $self->start_Exon()->strand() >= 0 ) { + $self->{'genomic_start'} = + $self->start_Exon()->start() + ( $self->start() - 1 ); + } else { + $self->{'genomic_start'} = + $self->end_Exon()->end() - ( $self->end() - 1 ); + } + } + + return $self->{'genomic_start'}; +} + +=head2 genomic_end + + Args : None + Example : $translation_genomic_end = $translation->genomic_end(); + Description : Returns the end position of the translation in genomic + coordinates on the forward strand. + Return type : Integer + Exceptions : None + Caller : General + Status : At Risk (Under Development) + +=cut + +sub genomic_end { + my $self = shift; + + if ( !exists $self->{'genomic_end'} ) { + if ( $self->end_Exon()->strand() >= 0 ) { + $self->{'genomic_end'} = + $self->end_Exon()->start() + ( $self->end() - 1 ); + } else { + $self->{'genomic_end'} = + $self->start_Exon()->end() - ( $self->start() - 1 ); + } + } + + return $self->{'genomic_end'}; +} + +=head2 version + + Arg [1] : (optional) string $version - version to set + Example : $translation->version(2); + Description: Getter/setter for attribute version + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub version { + my $self = shift; + $self->{'version'} = shift if( @_ ); + return $self->{'version'}; +} + + +=head2 stable_id + + Arg [1] : (optional) string $stable_id - stable ID to set + Example : $translation->stable_id('ENSP0059890'); + Description: Getter/setter for attribute stable_id + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub stable_id { + my $self = shift; + $self->{'stable_id'} = shift if( @_ ); + return $self->{'stable_id'}; +} + +=head2 created_date + + Arg [1] : (optional) string $created_date - created date to set + Example : $translation->created_date('2007-01-10 20:52:00'); + Description: Getter/setter for attribute created date + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub created_date { + my $self = shift; + $self->{'created_date'} = shift if ( @_ ); + return $self->{'created_date'}; +} + + +=head2 modified_date + + Arg [1] : (optional) string $modified_date - modification date to set + Example : $translation->modified_date('2007-01-10 20:52:00'); + Description: Getter/setter for attribute modified date + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub modified_date { + my $self = shift; + $self->{'modified_date'} = shift if ( @_ ); + return $self->{'modified_date'}; +} + + + +=head2 transform + + Arg [1] : hashref $old_new_exon_map + a hash that maps old to new exons for a whole gene + Description: maps start end end exon according to mapping table. + If an exon is not mapped, just keep the old one. + Returntype : none + Exceptions : none + Caller : Transcript->transform() + Status : Stable + +=cut + +sub transform { + my $self = shift; + my $href_exons = shift; + + my $start_exon = $self->start_Exon(); + my $end_exon = $self->end_Exon(); + + if ( exists $href_exons->{$start_exon} ) { + $self->start_Exon($href_exons->{$start_exon}); + } else { + # do nothing, the start exon wasnt mapped + } + + if ( exists $href_exons->{$end_exon} ) { + $self->end_Exon($href_exons->{$end_exon}); + } else { + # do nothing, the end exon wasnt mapped + } +} + + +=head2 get_all_DBEntries + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @dbentries = @{ $translation->get_all_DBEntries() }; + + Description: Retrieves DBEntries (xrefs) for this translation. + + This method will attempt to lazy-load DBEntries + from a database if an adaptor is available and no + DBEntries are present on the translation (i.e. they + have not already been added or loaded). + + Returntype : Listref to Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : TranslationAdaptor::store + Status : Stable + +=cut + +sub get_all_DBEntries { + my ( $self, $ex_db_exp, $ex_db_type ) = @_; + + my $cache_name = 'dbentries'; + + if ( defined($ex_db_exp) ) { + $cache_name .= $ex_db_exp; + } + + if ( defined($ex_db_type) ) { + $cache_name .= $ex_db_type; + } + + # if not cached, retrieve all of the xrefs for this translation + if ( !defined( $self->{$cache_name} ) && defined( $self->adaptor() ) ) + { + $self->{$cache_name} = + $self->adaptor()->db()->get_DBEntryAdaptor() + ->fetch_all_by_Translation( $self, $ex_db_exp, $ex_db_type ); + } + + $self->{$cache_name} ||= []; + + return $self->{$cache_name}; +} ## end sub get_all_DBEntries + +=head2 get_all_object_xrefs + + Arg [1] : (optional) String, external database name + + Arg [2] : (optional) String, external_db type + + Example : @oxrefs = @{ $translation->get_all_object_xrefs() }; + + Description: Retrieves xrefs for this translation. + + This method will attempt to lazy-load xrefs from a + database if an adaptor is available and no xrefs + are present on the translation (i.e. they have not + already been added or loaded). + + NB: This method is an alias for the + get_all_DBentries() method. + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + + Status : Stable + +=cut + +sub get_all_object_xrefs { + my $self = shift; + return $self->get_all_DBEntries(@_); +} + +=head2 add_DBEntry + + Arg [1] : Bio::EnsEMBL::DBEntry $dbe + The dbEntry to be added + Example : $translation->add_DBEntry($xref); + Description: Associates a DBEntry with this translation. Note that adding + DBEntries will prevent future lazy-loading of DBEntries for this + translation (see get_all_DBEntries). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_DBEntry { + my $self = shift; + my $dbe = shift; + + unless($dbe && ref($dbe) && $dbe->isa('Bio::EnsEMBL::DBEntry')) { + throw('Expected DBEntry argument'); + } + + $self->{'dbentries'} ||= []; + push @{$self->{'dbentries'}}, $dbe; +} + + +=head2 get_all_DBLinks + + Arg [1] : String database name (optional) + SQL wildcard characters (_ and %) can be used to + specify patterns. + + Example : my @dblinks = @{ $translation->get_all_DBLinks() }; + my @dblinks = @{ $translation->get_all_DBLinks('Uniprot%') }; + + Description: This is here for consistancy with the Transcript + and Gene classes. It is a synonym for the + get_all_DBEntries() method. + + Return type: Listref to Bio::EnsEMBL::DBEntry objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_DBLinks { + my $self = shift; + return $self->get_all_DBEntries(@_); +} + +=head2 get_all_xrefs + + Arg [1] : String database name (optional) + SQL wildcard characters (_ and %) can be used to + specify patterns. + + Example : @xrefs = @{ $translation->get_all_xrefs() }; + @xrefs = @{ $translation->get_all_xrefs('Uniprot%') }; + + Description: This method is here for consistancy with the Gene + and Transcript classes. It is an alias for the + get_all_DBLinks() method, which in turn directly + calls get_all_DBEntries(). + + Return type: Listref of Bio::EnsEMBL::DBEntry objects + + Status : Stable + +=cut + +sub get_all_xrefs { + my $self = shift; + return $self->get_all_DBLinks(@_); +} + +=head2 get_all_ProteinFeatures + + Arg [1] : (optional) string $logic_name + The analysis logic_name of the features to retrieve. If not + specified, all features are retrieved instead. + Example : $features = $self->get_all_ProteinFeatures('PFam'); + Description: Retrieves all ProteinFeatures associated with this + Translation. If a logic_name is specified, only features with + that logic_name are returned. If no logic_name is provided all + associated protein_features are returned. + + ProteinFeatures are lazy-loaded from the database unless they + added manually to the Translation or had already been loaded. + Returntype : Bio::EnsEMBL::ProteinFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_ProteinFeatures { + my $self = shift; + my $logic_name = shift; + + if(!$self->{'protein_features'}) { + my $adaptor = $self->adaptor(); + my $dbID = $self->dbID(); + + return [] if (!$adaptor || !$dbID); + + my %hash; + $self->{'protein_features'} = \%hash; + + my $pfa = $adaptor->db()->get_ProteinFeatureAdaptor(); + my $name; + foreach my $f (@{$pfa->fetch_all_by_translation_id($dbID)}) { + my $analysis = $f->analysis(); + if($analysis) { + $name = lc($f->analysis->logic_name()); + } else { + warning("ProteinFeature has no attached analysis\n"); + $name = ''; + } + $hash{$name} ||= []; + push @{$hash{$name}}, $f; + } + } + + # a specific type of protein feature was requested + if(defined($logic_name)) { + $logic_name = lc($logic_name); + return $self->{'protein_features'}->{$logic_name} || []; + } + + my @features = (); + + # all protein features were requested + foreach my $type (keys %{$self->{'protein_features'}}) { + push @features, @{$self->{'protein_features'}->{$type}}; + } + + return \@features; +} + + +=head2 get_all_DomainFeatures + + Example : @domain_feats = @{$translation->get_all_DomainFeatures}; + Description: A convenience method which retrieves all protein features + that are considered to be 'Domain' features. Features which + are 'domain' features are those with analysis logic names: + 'pfscan', 'scanprosite', 'superfamily', 'pfam', 'prints', + 'smart', 'pirsf', 'tigrfam'. + Returntype : listref of Bio::EnsEMBL::ProteinFeatures + Exceptions : none + Caller : webcode (protview) + Status : Stable + +=cut + +sub get_all_DomainFeatures{ + my ($self) = @_; + + my @features; + + my @types = ('pfscan', #profile (prosite or pfam motifs) + 'scanprosite', #prosite + 'superfamily', + 'pfam', + 'smart', + 'tigrfam', + 'pirsf', + 'prints'); + + foreach my $type (@types) { + push @features, @{$self->get_all_ProteinFeatures($type)}; + } + + return \@features; +} + + +=head2 add_ProteinFeature + + Arg [1] : Bio::EnsEMBL::ProteinFeature $pf + The ProteinFeature to be added + Example : $translation->add_ProteinFeature($pf); + Description: Associates a ProteinFeature with this translation. Note that + adding ProteinFeatures will prevent future lazy-loading of + ProteinFeatures for this translation (see + get_all_ProteinFeatures). + Returntype : none + Exceptions : thrown on incorrect argument type + Caller : general + Status : Stable + +=cut + +sub add_ProteinFeature { + my $self = shift; + my $pf = shift; + + unless ($pf && ref($pf) && $pf->isa('Bio::EnsEMBL::ProteinFeature')) { + throw('Expected ProteinFeature argument'); + } + + my $analysis = $pf->analysis; + throw("ProteinFeature has no attached Analysis.") unless $analysis; + + push @{ $self->{'protein_features'}->{$analysis->logic_name} }, $pf; +} + + +=head2 display_id + + Example : print $translation->display_id(); + Description: This method returns a string that is considered to be + the 'display' identifier. For translations this is (depending on + availability and in this order) the stable Id, the dbID or an + empty string. + Returntype : string + Exceptions : none + Caller : web drawing code + Status : Stable + +=cut + +sub display_id { + my $self = shift; + return $self->{'stable_id'} || $self->dbID || ''; +} + + +=head2 length + + Example : print "Peptide length =", $translation->length(); + Description: Retrieves the length of the peptide sequence (i.e. number of + amino acids) represented by this Translation object. + Returntype : int + Exceptions : none + Caller : webcode (protview etc.) + Status : Stable + +=cut + +sub length { + my $self = shift; + my $seq = $self->seq(); + + return ($seq) ? CORE::length($seq) : 0; +} + + +=head2 seq + + Example : print $translation->seq(); + Description: Retrieves a string representation of the peptide sequence + of this Translation. This retrieves the transcript from the + database and gets its sequence, or retrieves the sequence which + was set via the constructor. + Returntype : string + Exceptions : warning if the sequence is not set and cannot be retrieved from + the database. + Caller : webcode (protview etc.) + Status : Stable + +=cut + +sub seq { + my ( $self, $sequence ) = @_; + + if ( defined($sequence) ) { + + $self->{'seq'} = $sequence; + + } elsif ( !defined( $self->{'seq'} ) ) { + + my $transcript = $self->transcript(); + + my $canonical_translation = $transcript->translation(); + my $is_alternative; + if(!$canonical_translation) { + throw "Transcript does not have a canonical translation"; + } + if ( defined( $canonical_translation->stable_id() ) + && defined( $self->stable_id() ) ) + { + # Try stable ID. + $is_alternative = + ( $canonical_translation->stable_id() ne $self->stable_id() ); + } elsif ( defined( $canonical_translation->dbID() ) + && defined( $self->dbID() ) ) + { + # Try dbID. + $is_alternative = + ( $canonical_translation->dbID() != $self->dbID() ); + } else { + # Resort to using geomic start/end coordinates. + $is_alternative = ( ($canonical_translation->genomic_start() != + $self->genomic_start() ) + || ( $canonical_translation->genomic_end() != + $self->genomic_end() ) ); + } + + if ($is_alternative) { + # To deal with non-canonical (alternative) translations, subsitute + # the canonical translation in the transcript with $self for a + # while. + + $transcript->translation($self); + } + + my $seq = $transcript->translate(); + if ( defined($seq) ) { + $self->{'seq'} = $seq->seq(); + } + + if ($is_alternative) { + # Reinstate the real canonical translation. + + $transcript->translation($canonical_translation); + } + + } ## end elsif ( !defined( $self->...)) + + if ( !defined( $self->{'seq'} ) ) { + return ''; # Empty string + } + + return $self->{'seq'}; + +} ## end sub seq + + +=head2 get_all_Attributes + + Arg [1] : optional string $attrib_code + The code of the attribute type to retrieve values for. + Example : ($sc_attr) = @{$tl->get_all_Attributes('_selenocysteine')}; + @tl_attributes = @{$translation->get_all_Attributes()}; + Description: Gets a list of Attributes of this translation. + Optionally just get Attrubutes for given code. + Recognized attribute "_selenocysteine" + Returntype : listref Bio::EnsEMBL::Attribute + Exceptions : warning if translation does not have attached adaptor and + attempts lazy load. + Caller : general, modify_translation + Status : Stable + +=cut + +sub get_all_Attributes { + my $self = shift; + my $attrib_code = shift; + + if( ! exists $self->{'attributes' } ) { + if(!$self->adaptor() ) { +# warning('Cannot get attributes without an adaptor.'); + return []; + } + + my $aa = $self->adaptor->db->get_AttributeAdaptor(); + $self->{'attributes'} = $aa->fetch_all_by_Translation( $self ); + } + + if( defined $attrib_code ) { + my @results = grep { uc($_->code()) eq uc($attrib_code) } + @{$self->{'attributes'}}; + return \@results; + } else { + return $self->{'attributes'}; + } +} + + +=head2 add_Attributes + + Arg [1..N] : Bio::EnsEMBL::Attribute $attribute + Attributes to add. + Example : $translation->add_Attributes($selenocysteine_attribute); + Description: Adds an Attribute to the Translation. Usefull to + do _selenocysteine. + If you add an attribute before you retrieve any from database, + lazy load will be disabled. + Returntype : none + Exceptions : throw on incorrect arguments + Caller : general + Status : Stable + +=cut + +sub add_Attributes { + my $self = shift; + my @attribs = @_; + + if( ! exists $self->{'attributes'} ) { + $self->{'attributes'} = []; + } + + for my $attrib ( @attribs ) { + if( ! $attrib->isa( "Bio::EnsEMBL::Attribute" )) { + throw( "Argument to add_Attribute must be a Bio::EnsEMBL::Attribute" ); + } + push( @{$self->{'attributes'}}, $attrib ); + $self->{seq}=undef; + } +} + + +=head2 get_all_SeqEdits + + Example : my @seqeds = @{$transcript->get_all_SeqEdits()}; + Description: Retrieves all post transcriptional sequence modifications for + this transcript. + Returntype : Bio::EnsEMBL::SeqEdit + Exceptions : none + Caller : spliced_seq() + Status : Stable + +=cut + +sub get_all_SeqEdits { + my $self = shift; + + my @seqeds; + + my $attribs; + + my @edits = ('initial_met', '_selenocysteine', 'amino_acid_sub'); + + + foreach my $edit(@edits){ + $attribs = $self->get_all_Attributes($edit); + + # convert attributes to SeqEdit objects + foreach my $a (@$attribs) { + push @seqeds, Bio::EnsEMBL::SeqEdit->new(-ATTRIB => $a); + } + } + return \@seqeds; +} + + +=head2 modify_translation + + Arg [1] : Bio::Seq $peptide + Example : my $seq = Bio::Seq->new(-SEQ => $dna)->translate(); + $translation->modify_translation($seq); + Description: Applies sequence edits such as selenocysteines to the Bio::Seq + peptide thats passed in + Returntype : Bio::Seq + Exceptions : none + Caller : Bio::EnsEMBL::Transcript->translate + Status : Stable + +=cut + +sub modify_translation { + my ( $self, $seq ) = @_; + + my @seqeds = @{ $self->get_all_SeqEdits() }; + + # Sort in reverse order to avoid complication of adjusting + # downstream edits. + # HACK: The translation ENSP00000420939 somehow makes the next line + # bomb out ($a or $b becomes undef) if the start() method + # is used. I haven't been able to find out why. It has 10 + # Selenocysteine seqedits that looks correct. + # /Andreas (release 59) + @seqeds = sort { $b->{'start'} <=> $a->{'start'} } @seqeds; + + # Apply all edits. + my $peptide = $seq->seq(); + foreach my $se (@seqeds) { + $se->apply_edit( \$peptide ); + } + + $seq->seq($peptide); + + return $seq; +} + +=head2 load + + Arg [1] : Boolean $load_xrefs + Load (or don't load) xrefs. Default is to load xrefs. + Example : $translation->load(); + Description : The Ensembl API makes extensive use of + lazy-loading. Under some circumstances (e.g., + when copying genes between databases), all data of + an object needs to be fully loaded. This method + loads the parts of the object that are usually + lazy-loaded. + Returns : none + +=cut + +sub load { + my ( $self, $load_xrefs ) = @_; + + if ( !defined($load_xrefs) ) { $load_xrefs = 1 } + + $self->seq(); + + $self->stable_id(); + $self->get_all_Attributes(); + $self->get_all_ProteinFeatures(); + + if ($load_xrefs) { + $self->get_all_DBEntries(); + } +} + +=head2 temporary_id + + Description: DEPRECATED This method should not be needed. Use dbID, + stable_id or something else. + +=cut + +sub temporary_id { + my $self = shift; + deprecate( "I cant see what a temporary_id is good for, please use " . + "dbID or stableID or\n try without an id." ); + $self->{'temporary_id'} = shift if( @_ ); + return $self->{'temporary_id'}; +} + + +=head2 get_all_DASFactories + + Function : Retrieves a listref of registered DAS objects + Returntype: Listref of DAS Objects + Exceptions: none + Caller : webcode + Example : $dasref = $prot->get_all_DASFactories; + Status : Stable + +=cut + +sub get_all_DASFactories { + my $self = shift; + return [ $self->adaptor()->db()->_each_DASFeatureFactory ]; +} + + +=head2 get_all_DAS_Features + + Example : $features = $prot->get_all_DAS_Features; + Description: Retreives a hash reference to a hash of DAS feature + sets, keyed by the DNS, NOTE the values of this hash + are an anonymous array containing: + (1) a pointer to an array of features; + (2) a pointer to the DAS stylesheet + Returntype : hashref of Bio::SeqFeatures + Exceptions : none + Caller : webcode + Status : Stable + +=cut + +sub get_all_DAS_Features{ + my $self = shift; + + my $db = $self->adaptor->db; + my $GeneAdaptor = $db->get_GeneAdaptor; + my $Gene = $GeneAdaptor->fetch_by_translation_stable_id($self->stable_id) || return; + my $slice = $Gene->feature_Slice; + + return $self->SUPER::get_all_DAS_Features($slice); +} + +=head2 summary_as_hash + + Example : $translation_summary = $translation->summary_as_hash(); + Description : Retrieves a textual summary of this Translation. + Not inherited from Feature. + Returns : hashref of arrays of descriptive strings + Status : Intended for internal use +=cut + +sub summary_as_hash { + my $self = shift; + my %summary; + $summary{'ID'} = $self->display_id; + $summary{'genomic_start'} = $self->genomic_start; + $summary{'genomic_end'} = $self->genomic_end; + my $transcript = $self->transcript; + $summary{'Parent'} = $transcript->display_id; + return \%summary; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/UnconventionalTranscriptAssociation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/UnconventionalTranscriptAssociation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,144 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::UnconventionalTranscriptAssociation - A class representing +an some sort of unconventional association between a gene and a +transcript. + +=head1 SYNOPSIS + + $ex = new Bio::EnsEMBL::UnconventionalTranscriptAssociation( $gene, + $transcript, $type ); + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::UnconventionalTranscriptAssociation; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Utils::Exception qw( warning throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +=head2 new + + Args [1] : Bio::EnsEMBL::Gene - the gene which is associated. + Args [2] : Bio::EnsEMBL::Transcript - the transcript which is associated. + Args [3] : String type - the type of assocation, e.g. "antisense", + "sense_intronic","sense_overlaping_exonic","chimeric_sense_exonic". + Example : $uta = new Bio::EnsEMBL::UnconventionalTranscriptAssociation($gene, $transcript, "antisense") + Description: create an UnconventionalTranscriptAssociation object. + Returntype : Bio::EnsEMBL::UnconventionalTranscriptAssociation. + Exceptions : Wrong argument types + Caller : general + Status : At risk + +=cut + +sub new { + + my ($class, $transcript, $gene, $type) = @_; + + $class = ref $class || $class; + + my $self = {}; + + if( !ref $gene || ! $gene->isa("Bio::EnsEMBL::Gene") ) { + throw("$gene is not a Bio::EnsEMBL::Gene!"); + } + + if( !ref $transcript || ! $transcript->isa("Bio::EnsEMBL::Transcript") ) { + throw("$transcript is not a Bio::EnsEMBL::Transcript!"); + } + + $self->{'gene'} = $gene; + $self->{'transcript'} = $transcript; + $self->{'type'} = $type; + + return bless $self, $class; +} + + +=head2 gene + + Args : none + Example : $gene = $uta->gene() + Description: Getter/setter for the gene part of this association. + Returntype : Bio::EnsEMBL::Gene + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub gene { + my ($self) = shift; + + $self->{'gene'} = shift if (@_); + return $self->{'gene'}; + +} + +=head2 transcript + + Args : none + Example : $transcript = $uta->transcript() + Description: Getter/setter for the transcript part of this association. + Returntype : Bio::EnsEMBL::Transcript + Exceptions : none + Caller : General + Status : At risk + +=cut + +sub transcript { + my ($self) = shift; + + $self->{'transcript'} = shift if (@_); + return $self->{'transcript'}; + +} + +=head2 interaction_type + + Args : none + Example : $type = $uta->interaction_type() + Description: Getter/setter for the interaction_type of this association. + Returntype : String + Exceptions : none + Caller : General + Status : At risk + +=cut + +sub interaction_type { + my ($self) = shift; + + $self->{'interaction_type'} = shift if (@_); + return $self->{'interaction_type'}; + +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/UnmappedObject.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/UnmappedObject.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,485 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio:EnsEMBL::UnmappedObject - A object representing why a particular entity +was NOT mapped to the ensembl. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::UnmappedObject; + + my $uo = Bio::EnsEMBL::UnmappedObject->new( + -type => 'xref', + -analysis => $analysis, + -external_db_id => 4100, + -identifier => "Q12345", + -query_score => 45.5, + -target_score => 29.2, + -ensembl_id => 122346, + -ensembl_type => "Translation", + -summary => "match failed for exonerate", + -full_desc => "match failed for the xref exonerate run " + . "as match was below threshold of 90" + ); + +=head1 DESCRIPTION + +UnmappedObjects represent entities NOT mapped to ensembl. Therefore this +should help users to find out why certain accessions etc can not be +found. + +=head1 METHODS + +=cut + + + +package Bio::EnsEMBL::UnmappedObject; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Storable; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [TYPE] : the type of mapping i.e. 'xref','cDNA' + Arg [ANALYSIS] : Analysis object. + Arg [EXTERNAL_DB_ID] : id for the external db id this identifier is from + Arg [IDENTIFIER] : name of the identifier i.e. accession + Arg [QUERY_SCORE] : (optional) The query score + Arg [TARGET_SCORE] : (optional) The target score + Arg [SUMMARY] : The summary reason for not mapping. + Arg [FULL_DESC] : The Full description of why it did not map. + Arg [ENSEMBL_ID] : (optional) internal ensembl id for the best match + Arg [ENSEMBL_OBJECT_TYPE] : (optional) the type of object for the best match + Example : see SYNOPSIS + Returntype : Bio::EnsEMBL::UnmappedObject + Exceptions : If any of the none optional args are missing + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + + #allow constructor to be called as class or object method + my $class = ref($caller) || $caller; + + + + my ($dbID, $unmapped_reason_id, $type, $analysis, $ex_db_id, $identifier, + $query_score, $target_score, $summary, $full_desc, + $ensembl_id, $ensembl_object_type, $adaptor ) = + rearrange([qw(UNMAPPED_OBJECT_ID UNMAPPED_REASON_ID TYPE ANALYSIS + EXTERNAL_DB_ID IDENTIFIER QUERY_SCORE TARGET_SCORE + SUMMARY FULL_DESC ENSEMBL_ID ENSEMBL_OBJECT_TYPE ADAPTOR)], @_); + + my $self = $class->SUPER::new(@_); + if(defined($analysis)) { + if(!ref($analysis) || !$analysis->isa('Bio::EnsEMBL::Analysis')) { + throw('-ANALYSIS argument must be a Bio::EnsEMBL::Analysis not '. + $analysis); + } + } + else{ + throw('-ANALYSIS argument must be given'); + } + $self->{'analysis'} = $analysis; + $self->{'dbID'} = $dbID if (defined($dbID)); + $self->{'description'} = $full_desc || throw('FULL_DESC must be given'); + $self->{'summary'} = $summary || throw('SUMMARY must be given'); + $self->{'type'} = $type || throw('TYPE must be given'); + $self->{'external_db_id'} = $ex_db_id; + + if (lc($type) eq "xref") { + throw('EXTERNAL_DB_ID must be given') if ! defined $ex_db_id; + } + + $self->{'identifier'} = $identifier || throw('IDENTIFIER must be given'); + $self->{'query_score'} = $query_score if(defined($query_score)); + $self->{'target_score'} = $target_score if(defined($target_score)); + $self->{'ensembl_id'} = $ensembl_id if(defined($ensembl_id)); + $self->{'ensembl_object_type'} = $ensembl_object_type + if(defined($ensembl_object_type)); + $self->{'unmapped_reason_id'} = $unmapped_reason_id + if(defined($unmapped_reason_id)); + $self->adaptor($adaptor) if(defined($adaptor)); + return $self; +} + +=head2 new_fast + + Arg [...] : none + Example : $feature = Bio::EnsEMBL::UnmappedObject->new_fast(); + Description: Creates a new Unmapped Object. + Returntype : Bio::EnsEMBL::UnmappedObject + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new_fast{ + my $caller = shift; + + #allow constructor to be called as class or object method + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + return $self; +} + +=head2 description + + Arg [1] : (optional) * to be set to + Example : print $unmappedObject->description."\n"; + Description : Basic getter/setter for description + ReturnType : String + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub description{ + my $self = shift; + + if(@_) { + my $des = shift; + $self->{'description'} = $des; + } + + return $self->{'description'}; +} + +=head2 summary + + Arg [1] : (optional) summary to be set to + Example : print $unmappedObject->summary."\n"; + Description : Basic getter/setter for summary + ReturnType : String + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub summary{ + my $self = shift; + + if(@_) { + my $des = shift; + $self->{'summary'} = $des; + } + + return $self->{'summary'}; +} + +=head2 type + + Arg [1] : (optional) type to be set to + Example : print $unmappedObject->type."\n"; + Description : Basic getter/setter for type + ReturnType : String + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub type{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'type'} = $arg; + } + + return $self->{'type'}; +} + +=head2 ensembl_object_type + + Arg [1] : (optional) ensembl object type to be set to + Example : print $unmappedObject->ensembl_object_type."\n"; + Description : Basic getter/setter for ensembl object type + ReturnType : String + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ensembl_object_type{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'ensembl_object_type'} = $arg; + } + + return $self->{'ensembl_object_type'}; +} + +=head2 ensembl_id + + Arg [1] : (optional) ensembl id to be set to + Example : print $unmappedObject->ensembl_id."\n"; + Description : Basic getter/setter for ensembl id + ReturnType : String + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ensembl_id{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'ensembl_id'} = $arg; + } + + return $self->{'ensembl_id'}; +} + +=head2 external_db_id + + Arg [1] : (optional) external_db_id to be set to + Example : print $unmappedObject->external_db_id."\n"; + Description : Basic getter/setter for external_db_id + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub external_db_id{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'external_db_id'} = $arg; + } + + return $self->{'external_db_id'}; +} + +=head2 external_db_name + + Example : print $unmappedObject->external_db_name()."\n"; + Description : Basic getter for external_db_name + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub external_db_name{ + my $self = shift; + + my $handle = $self->adaptor; + if(defined($handle) and defined($self->{'external_db_id'})){ + my $sth = $handle->prepare("select db_name from external_db where external_db_id = ".$self->{'external_db_id'}); + $sth->execute(); + my $name; + $sth->bind_columns(\$name); + $sth->fetch(); + return $name; + } + return ""; +} + + +sub stable_id{ + my ($self) = shift; + + my $handle = $self->adaptor; + if(defined($handle)){ + my $sql = "select stable_id from ".lc($self->{'ensembl_object_type'})." where ". + lc($self->{'ensembl_object_type'})."_id = ". + $self->{'ensembl_id'}; + my $sth = $handle->prepare($sql); + $sth->execute(); + my $name; + $sth->bind_columns(\$name); + $sth->fetch(); + return $name; + } + return ""; +} + + # my $adaptor; +# if($self->{'ensembl_object_type'} eq "Transcript"){ +# $adaptor= $self->adaptor->db->get_TranscriptAdaptor(); +# } +# elsif($self->{'ensembl_object_type'} eq "Translation"){ +# $adaptor= $self->adaptor->db->get_TranslationAdaptor(); +# } +# elsif($self->{'ensembl_object_type'} eq "Gene"){ +# $adaptor= $self->adaptor->db->get_GeneAdaptor(); +# } +# else{ +# return undef; +# } +# my $object = $adaptor->fetch_by_dbID($self->{'ensembl_id'}); +# if(defined($object)){ +# return $object->stable_id; +# } +# else{ +# return undef; +# } +#} + + +=head2 identifier + + Arg [1] : (optional) identifier to be set to + Example : print $unmappedObject->identifier."\n"; + Description : Basic getter/setter for identifier + ReturnType : String + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub identifier{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'identifier'} = $arg; + } + + return $self->{'identifier'}; +} + +=head2 query_score + + Arg [1] : (optional) query_score to be set to + Example : print $unmappedObject->query_score."\n"; + Description : Basic getter/setter for query_score + ReturnType : float + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub query_score{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'query_score'} = $arg; + } + + return $self->{'query_score'}; +} + +=head2 target_score + + Arg [1] : (optional) target_score to be set to + Example : print $unmappedObject->target_score."\n"; + Description : Basic getter/setter for target_score + ReturnType : float + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub target_score{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'target_score'} = $arg; + } + + return $self->{'target_score'}; +} + +=head2 unmapped_reason_id + + Arg [1] : (optional) unmapped_reason_id to be set to + Example : print $unmappedObject->unmapped_reason_id."\n"; + Description : Basic getter/setter for unmapped_reason_id + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub unmapped_reason_id{ + my $self = shift; + + if(@_) { + my $arg = shift; + $self->{'unmapped_reason_id'} = $arg; + } + + return $self->{'unmapped_reason_id'}; +} + +=head2 analysis + + Arg [1] : (optional) analysis to be set to + Example : print $unmappedObject->analysis->logic_name."\n"; + Description : Basic getter/setter for analysis + ReturnType : Bio::EnsEMBL::Analysis + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub analysis { + my $self = shift; + + if(@_) { + my $an = shift; + if(defined($an) && (!ref($an) || !$an->isa('Bio::EnsEMBL::Analysis'))) { + throw('analysis argument must be a Bio::EnsEMBL::Analysis'); + } + $self->{'analysis'} = $an; + } + + return $self->{'analysis'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Upstream.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Upstream.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,582 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Upstream - Object that defines an upstream region + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Upstream; + + my $upstream = Bio::EnsEMBL::Upstream->new( + -transcript => $transcript, + -length => 2000 # bp + ); + + # Retrieve coordinates of upstream region + my $upstream_region_start = $upstream->upstart; + my $upstream_region_end = $upstream->upend; + + # Retrieve coordinates in 'downstream' first intron + my $intron_region_start = $upstream->downstart; + my $intron_region_end = $upstream->downend; + + # Coordinates are returned in the same scheme as the input transcript. + # However, the coordinates of an upstream region can be transformed to + # any other scheme using a slice + + $upstream->transform($slice); + + # Coordinates can be retrieved in scheme in the same manner as the + # above. + +=head1 DESCRIPTION + +An object that determines the upstream region of a transcript. Such a +region is non-coding and ensures that other genes or transcripts are +not present. Ultimately, these objects can be used to looking for +promoter elements. To this end, it is also possible to derive a region +downstream of the first exon, within the first intron and where promoter +elements sometimes are found. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Upstream; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +@ISA = qw(Bio::EnsEMBL::Feature); + + +=head2 new + + Arg [transcript] : (optional) Bio::EnsEMBL::Transcript + Arg [length] : (optional) int $length + Example : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript, + -length => 2000); + Description: Creates a new upstream object + Returntype : Bio::EnsEMBL::Upstream + Exceptions : none + Caller : Bio::EnsEMBL::Transcript, general + Status : Stable + +=cut + +sub new { + my ($class, @args) = @_; + my $self = {}; + + bless $self, $class; + + my ($transcript, + $length) = rearrange([qw(TRANSCRIPT + LENGTH + )],@args); + + $self->transcript($transcript) if defined $transcript; + $self->length($length) if $length; + + return $self +} + +=head2 transcript + + Arg : (optional) Bio::EnsEMBL::Transcript + Example : $self->transcript($transcript); + Description: Getter/setter for transcript object + Returntype : Bio::EnsEMBL::Transcript + Exceptions : Throws if argument is not undefined + or a Bio::EnsEMBL::Transcript + Caller : $self->new, $self->_derive_coords, + $self->_first_coding_Exon + Status : Stable + +=cut + + +sub transcript { + my $self = shift; + + if (@_){ + $self->{_transcript} = shift; + + if (defined $self->{_transcript}) { + throw("Transcript is not a Bio::EnsEMBL::Transcript") + if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript")); + $self->_flush_cache; + } + } + + return $self->{_transcript} +} + +=head2 length + + Arg : (optional) int $length + Example : $self->length(2000); # bp + Description: Getter/setter for upstream region length. + Returntype : int + Exceptions : Throws if length is requested before it has been set. + Caller : $self->new, $self->_derive_coords + Status : Stable + +=cut + +sub length { + my $self = shift; + + if (@_){ + $self->{_length} = shift; + $self->_flush_cache; + } + + throw("Region length has not been set.") + unless $self->{_length}; + + return $self->{_length} +} + +=head2 _flush_cache + + Arg : none + Example : $self->_flush_cache; + Description: Empties cached coordinates (called when + coordinate scheme or region length has changed). + Returntype : none + Exceptions : none + Caller : $self->length, $self->transform + Status : Stable + +=cut + +sub _flush_cache { + my $self = shift; + + $self->upstart(undef); + $self->upend(undef); + $self->downstart(undef); + $self->downend(undef); +} + +=head2 upstart + + Arg : none + Example : $self->upstart; + Description: Returns the start coordinate of the region + upstream of the transcript. This coordinate + is always the furthest from the translation + initiation codon, whereas upend always abutts + the translation initiation codon. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub upstart { + my $self = shift; + + if (@_) { + $self->{_upstart} = shift @_; + return + } + + if (! defined $self->{_upstart}) { + $self->_derive_coords('up'); + } + + return $self->{_upstart} +} + +=head2 upend + + Arg : none + Example : $self->upend; + Description: Returns the end coordinate of the region + upstream of the transcript. This coordinate + always always abutts the translation + initiation codon, whereas upstart always + returns the coorindate furthest from the + translation initiation codon. + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub upend { + my $self = shift; + + if (@_) { + $self->{_upend} = shift @_; + return + } + + if (! defined $self->{_upend}) { + $self->_derive_coords('up'); + } + + return $self->{_upend} +} + +=head2 downstart + + Arg : none + Example : $self->downstart; + Description: Returns the start coordinate of the region + in the first intron of the transcript. This + coordinate is always closest to the first + exon (irregardless of strand). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub downstart { + my $self = shift; + + if (@_) { + $self->{_downstart} = shift @_; + return + } + + if (! defined $self->{_downstart}) { + $self->_derive_coords('down'); + } + + return $self->{_downstart} +} + +=head2 downend + + Arg : none + Example : $self->downend; + Description: Returns the end coordinate of the region + in the first intron of the transcript. This + coordinate is always furthest from the first + exon (irregardless of strand). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub downend { + my $self = shift; + + if (@_) { + $self->{_downend} = shift @_; + return + } + + if (! defined $self->{_downend}) { + $self->_derive_coords('down'); + } + + return $self->{_downend} +} + +=head2 transform + + Arg : + Example : + Description: Not yet implemented + Returntype : + Exceptions : + Caller : + Status : At Risk + +=cut + + +# Over-riding inherited class. As yet unimplemented. + +sub transform { + my $self = shift; + + throw("No transform method implemented for " . $self); +} + +=head2 derive_upstream_coords + + Arg : none + Example : my ($upstart, $upend) + = $self->derive_upstream_coords; + Description: Derives upstream coordinates (for + compatability with older scripts). + Returntype : arrayref + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub derive_upstream_coords { + my $self = shift; + + return [$self->upstart, $self->upend] +} + +=head2 derive_downstream_coords + + Arg : none + Example : my ($downstart, $downend) + = $self->derive_downstream_coords; + Description: Derives downstream coordinates (for + compatability with older scripts). + Returntype : arrayref + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub derive_downstream_coords { + my $self = shift; + + return [$self->downstart, $self->downend] +} + +=head2 _derive_coords + + Arg : string $direction (either 'up' or 'down'). + Example : $self->_derive_coords('up'); + Description: Determines the coordinates of either upstream + or downstream region. + Returntype : none + Exceptions : Throws if argument is not either 'up' or 'down' + Caller : $self->upstart, $self->upend, $self->downstart, + $self->downend + Status : Stable + +=cut + +sub _derive_coords { + my ($self, $direction) = @_; + + # Check direction + throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.") + unless (($direction eq 'up')||($direction eq 'down')); + + # Put things in easily accessible places. + my $core_db_slice_adaptor = $self->transcript->slice->adaptor; + my $region_length = $self->length; + + # Whatever coord system the gene is currently is, transform to the toplevel. + my $transcript = $self->transcript->transform('toplevel'); + + # Use our transformed transcript to determine the upstream region coords. + # End should always be just before the coding start (like ATG), including 3' UTR. + # Start is the outer limit of the region upstream (furthest from ATG). + + my $region_start; + my $region_end; + + if ($transcript->strand == 1){ + if ($direction eq 'up'){ + $region_end = $transcript->coding_region_start - 1; + $region_start = $region_end - $region_length; + } elsif ($direction eq 'down'){ + $region_end = $self->_first_coding_Exon->end + 1; + $region_start = $region_end + $region_length; + } + } elsif ($transcript->strand == -1) { + if ($direction eq 'up'){ + $region_end = $transcript->coding_region_end + 1; + $region_start = $region_end + $region_length; + + } elsif ($direction eq 'down'){ + $region_end = $self->_first_coding_Exon->start - 1; + $region_start = $region_end - $region_length; + } + } + + # Trim the upstream/downstream region to remove extraneous coding sequences + # from other genes and/or transcripts. + + my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end); + + my $region_slice + = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name, + $transcript->slice->seq_region_name, + $slice_low_coord, + $slice_high_coord); + + if ($transcript->strand == 1) { + if ($direction eq 'up') { + $region_start += $self->_bases_to_trim('left_end', $region_slice); + } elsif ($direction eq 'down') { + $region_start -= $self->_bases_to_trim('right_end', $region_slice); + } + } elsif ($transcript->strand == -1) { + if ($direction eq 'up') { + $region_start -= $self->_bases_to_trim('right_end', $region_slice); + } elsif ($direction eq 'down') { + $region_start += $self->_bases_to_trim('left_end', $region_slice); + } + } + + # Always return start < end + + ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end); + + if ($direction eq 'up') { + $self->upstart($region_start); + $self->upend($region_end); + } elsif ($direction eq 'down') { + $self->downstart($region_start); + $self->downend($region_end); + } +} + +=head2 _bases_to_trim + + Arg : string $end_to_trim (either 'right_end' or + 'left_end'). + Arg : Bio::EnsEMBL::Slice + Example : $self->_derive_coords('right_end', $slice); + Description: Finds exons from other genes/transcripts that + invade our upstream/downstream slice and + returns the number of bases that should be + truncated from the appropriate end of the + upstream/downstream region. + Returntype : in + Exceptions : Throws if argument is not either 'right_end' + or 'left_end' + Caller : $self->_derive_coords + Status : Stable + +=cut + +# Method to look for coding regions that invade the upstream region. For +# now, this method returns the number of bases to trim. I doesn't yet +# do anything special if an exon is completely swallowed (truncates at +# the end of the overlapping exon and discards any non-coding sequence +# further upstream) or overlaps the 'wrong' end of the region (cases where +# two alternate exons share one end of sequence - does this happen?). + +# The input argument 'end' defines the end of the slice that should be +# truncated. + +sub _bases_to_trim { + my ($self, $end_to_trim, $slice) = @_; + + throw "Slice end argument must be either left_end or right_end" + unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end'); + + my @overlap_coords; + my $slice_length = $slice->length; + my $right_trim = 0; + my $left_trim = 0; + + foreach my $exon (@{$slice->get_all_Exons}){ + next if $exon->stable_id eq $self->_first_coding_Exon->stable_id; + + my $start = $exon->start; + my $end = $exon->end; + + # Choose from four possible exon arrangements + + # -----|********************|----- Slice + # --|=========================|--- Exon arrangement 1 + # ----------|======|-------------- Exon arrangement 2 + # --|=======|--------------------- Exon arrangement 3 + # -------------------|=========|-- Exon arrangement 4 + + + if ($start <= 0 && $end >= $slice_length) { # exon arrangement 1 + $right_trim = $slice_length - 1; + $left_trim = $slice_length - 1; + last; + + } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2 + my $this_right_trim = ($slice_length - $start) + 1; + + $right_trim = $this_right_trim + if $this_right_trim > $right_trim; + + $left_trim = $end + if $end > $left_trim; + + } elsif ($start <= 0 && $end < $slice_length) { # exon arrangement 3 + $right_trim = $slice_length; # a bit draconian + $left_trim = $end + if $end > $left_trim; + + } elsif ($start > 0 && $end >= $slice_length) { # exon arrangement 4 + my $this_right_trim = ($slice_length - $start) + 1; + + $right_trim = $this_right_trim + if $this_right_trim > $right_trim; + + $left_trim = $slice_length; # also a bit draconian + } + + } + + return $right_trim if $end_to_trim eq 'right_end'; + return $left_trim if $end_to_trim eq 'left_end'; +} + +=head2 _first_coding_Exon + + Arg : none + Example : $self->_first_coding_Exon; + Description: Finds the first exon of our transcript that + contains coding bases. + Returntype : Bio::EnsEMBL::Exon + Exceptions : none + Caller : $self->_derive_coords, $self->_bases_to_trim + Status : Stable + +=cut + +sub _first_coding_Exon { + my $self = shift; + + unless ($self->{_first_coding_exon}){ + + my $exons = $self->transcript->get_all_translateable_Exons; + + $self->{_first_coding_exon} = $exons->[0] + if $self->transcript->strand == 1; + $self->{_first_coding_exon} = $exons->[-1] + if $self->transcript->strand == -1; + } + + return $self->{_first_coding_exon} +} + + +return 1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Argument.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Argument.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,161 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Argument - Utility functions for argument handling + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Argument qw(rearrange) + + package Feature; + + sub new { + my $class = shift; + my ( $start, $end, $strand ) = + rearrange( [ 'START', 'END', 'STRAND' ], @_ ); + + return + bless( { 'start' => $start, 'end' => $end, 'strand' => $strand }, + $class ); + } + +=head1 DESCRIPTION + +This is derived from the Bio::Root module in BioPerl. The _rearrange +object method taken from BioPerl has been renamed rearrange and is now +a static class method. This method was originally written by Lincoln +Stein, and has since been refactored several times by various people (as +described below). + +It is recommended that this package be used instead of inheriting +unnecessarily from the Bio::EnsEMBL::Root or Bio::Root object. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Argument; + +use strict; +use warnings; + +use Exporter; + +use vars qw(@ISA @EXPORT); + +@ISA = qw(Exporter); +@EXPORT = qw(rearrange); + + + +=head2 rearrange + + Usage : rearrange( array_ref, list_of_arguments) + Purpose : Rearranges named parameters to requested order. + Example : use Bio::EnsEMBL::Utils::Argument qw(rearrange); + : rearrange([qw(SEQUENCE ID DESC)],@param); + : Where @param = (-sequence => $s, + : -id => $i, + : -desc => $d); + Returns : @params - an array of parameters in the requested order. + : The above example would return ($s, $i, $d) + Argument : $order : a reference to an array which describes the desired + : order of the named parameters. + : @param : an array of parameters, either as a list (in + : which case the function simply returns the list), + : or as an associative array with hyphenated tags + : (in which case the function sorts the values + : according to @{$order} and returns that new array.) + : The tags can be upper, lower, or mixed case + : but they must start with a hyphen (at least the + : first one should be hyphenated.) + Source : This function was taken from CGI.pm, written by Dr. Lincoln + : Stein, and adapted for use in Bio::Seq by Richard Resnick and + : then adapted for use in Bio::Root::Object.pm by Steve A. Chervitz. + : This has since been adapted as an exported static method in this + class Bio::EnsEMBL::Utils::Argument + Comments : (SAC) + : This method may not be appropriate for method calls that are + : within in an inner loop if efficiency is a concern. + : + : Parameters can be specified using any of these formats: + : @param = (-name=>'me', -color=>'blue'); + : @param = (-NAME=>'me', -COLOR=>'blue'); + : @param = (-Name=>'me', -Color=>'blue'); + : A leading hyphenated argument is used by this function to + : indicate that named parameters are being used. + : Therefore, a ('me', 'blue') list will be returned as-is. + : + : Note that Perl will confuse unquoted, hyphenated tags as + : function calls if there is a function of the same name + : in the current namespace: + : -name => 'foo' is interpreted as -&name => 'foo' + : + : For ultimate safety, put single quotes around the tag: + : ('-name'=>'me', '-color' =>'blue'); + : This can be a bit cumbersome and I find not as readable + : as using all uppercase, which is also fairly safe: + : (-NAME=>'me', -COLOR =>'blue'); + : + : Personal note (SAC): I have found all uppercase tags to + : be more managable: it involves less single-quoting, + : the code is more readable, and there are no method naming + : conlicts. + : Regardless of the style, it greatly helps to line + : the parameters up vertically for long/complex lists. + +=cut + + +sub rearrange { + my $order = shift; + + if ( $order eq "Bio::EnsEMBL::Utils::Argument" ) { + # skip object if one provided + $order = shift; + } + + # If we've got parameters, we need to check to see whether + # they are named or simply listed. If they are listed, we + # can just return them. + unless ( @_ && $_[0] && substr( $_[0], 0, 1 ) eq '-' ) { + return @_; + } + + # Push undef onto the end if % 2 != 0 to stop warnings + push @_,undef unless $#_ %2; + my %param; + while( @_ ) { + #deletes all dashes & uppercases at the same time + (my $key = shift) =~ tr/a-z\055/A-Z/d; + $param{$key} = shift; + } + + # What we intend to do is loop through the @{$order} variable, + # and for each value, we use that as a key into our associative + # array, pushing the value at that key onto our return array. + return map { $param{uc($_)} } @$order; +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,459 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::AssemblyProjector - +utility class to post-process projections from one assembly to another + +=head1 SYNOPSIS + + # connect to an old database + my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => 'ensembldb.ensembl.org', + -port => 3306, + -user => ensro, + -dbname => 'mus_musculus_core_46_36g', + -group => 'core_old', + ); + + # connect to the new database containing the mapping between old and + # new assembly + my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => 'ensembldb.ensembl.org', + -port => 3306, + -user => ensro, + -dbname => 'mus_musculus_core_47_37', + -group => 'core_new', + ); + + my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new( + -OLD_ASSEMBLY => 'NCBIM36', + -NEW_ASSEMBLY => 'NCBIM37', + -ADAPTOR => $dba_new, + -EXTERNAL_SOURCE => 1, + -MERGE_FRAGMENTS => 1, + -CHECK_LENGTH => 0, + ); + + # fetch a slice on the old assembly + my $slice_adaptor = $dba_old->get_SliceAdaptor; + my $slice = + $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef, + undef, 'NCBIM36' ); + + my $new_slice = $assembly_projector->old_to_new($slice); + + print $new_slice->name, " (", $assembly_projector->last_status, ")\n"; + +=head1 DESCRIPTION + +This class implements some utility functions for converting coordinates +between assemblies. A mapping between the two assemblies has to present +the database for this to work, see the 'Related Modules' section below +on how to generate the mapping. + +In addition to the "raw" projecting of features and slices, the methods +in this module also apply some sensible rules to the results of the +projection (like discarding unwanted results or merging fragmented +projections). These are the rules (depending on configuration): + +Discard the projected feature/slice if: + + 1. it doesn't project at all (no segments returned) + 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more + than one segment) + 3. [if CHECK_LENGTH is set] the projection doesn't have the same + length as the original feature/slice + 4. all segments are on same chromosome and strand + +If a projection fails any of these rules, undef is returned instead of +a projected feature/slice. You can use the last_status() method to find +out about the results of the rules tests. + +Also note that when projecting features, only a shallow projection is +performed, i.e. other features attached to your features (e.g. the +transcripts of a gene) are not projected automatically, so it will be +the responsability of the user code project all levels of features +involved. + +=head1 METHODS + + new + project + old_to_new + new_to_old + adaptor + external_source + old_assembly + new_assembly + merge_fragments + check_length + +=head1 RELATED MODULES + +The process of creating a whole genome alignment between two assemblies +(which is the basis for the use of the methods in this class) is done by +a series of scripts. Please see + + ensembl/misc-scripts/assembly/README + +for a high-level description of this process, and POD in the individual +scripts for the details. + +=cut + +package Bio::EnsEMBL::Utils::AssemblyProjector; + +use strict; +use warnings; +no warnings qw(uninitialized); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Slice; +use Scalar::Util qw(weaken); + +=head2 new + + Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor + for a database containing the assembly mapping + Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if + source is from a different database + Arg [OLD_ASSEMBLY] : name of the old assembly + Arg [OLD_ASSEMBLY] : name of the new assembly + Arg [OBJECT_TYPE] : (optional) object type ('slice' or 'feature') + Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged + to return a single object spanning all segments + (default: true) + Arg [CHECK_LENGTH] : (optional) Boolean - determines if projected objects + have to have same length as original (default: false) + Example : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new( + -DBADAPTOR => $dba, + -OLD_ASSEMBLY => NCBIM36, + -NEW_ASSEMBLY => NCBIM37, + ); + Description : Constructor. + Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object + Exceptions : thrown on missing arguments + thrown on invalid OBJECT_TYPE + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($adaptor, $external_source, $old_assembly, $new_assembly, + $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE + OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_); + + unless ($adaptor and ref($adaptor) and + $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { + throw("You must provide a DBAdaptor to a database containing the assembly mapping."); + } + + unless ($old_assembly and $new_assembly) { + throw("You must provide an old and new assembly name."); + } + + my $self = {}; + bless ($self, $class); + + # initialise + $self->adaptor($adaptor); + $self->{'old_assembly'} = $old_assembly; + $self->{'new_assembly'} = $new_assembly; + + # by default, merge fragments + $self->{'merge_fragments'} = $merge_fragments || 1; + + # by default, do not check length + $self->{'check_length'} = $check_length || 0; + + # by default, features and slices are expected in same database as the + # assembly mapping + $self->{'external_source'} = $external_source || 0; + + return $self; +} + + +=head2 project + + Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - + the object to project + Arg[2] : String $to_assembly - assembly to project to + Example : my $new_slice = $assembly_projector->project($old_slice, + 'NCBIM37'); + Description : Projects a Slice or Feature to the specified assembly. + + Several tests are performed on the result to discard unwanted + results. All projection segments have to be on the same + seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be + bridged by creating a single object from first_segment_start to + last_segment_end. If -CHECK_LENGTH is set, the projected object + will have to have the same length as the original. You can use + the last_status() method to find out what the result of some of + these rule tests were. Please see the comments in the code for + more details about these rules. + + The return value of this method will always be a single object, + or undef if the projection fails any of the rules. + + Note that when projecting features, only a "shallow" projection + is performed, i.e. attached features aren't projected + automatically! (e.g. if you project a gene, its transcripts will + have to be projected manually before storing the new gene) + Return type : same a Arg 1, or undef if projection fails any of the rules + Exceptions : thrown on invalid arguments + Caller : general, $self->old_to_new, $self->new_to_old + Status : At Risk + : under development + +=cut + +sub project { + my ($self, $object, $to_assembly) = @_; + + throw("Need an assembly version to project to.") unless ($to_assembly); + throw("Need an object to project.") unless ($object and ref($object)); + + my ($slice, $object_type); + + if ($object->isa('Bio::EnsEMBL::Feature')) { + $object_type = 'feature'; + } elsif ($object->isa('Bio::EnsEMBL::Slice')) { + $object_type = 'slice'; + } else { + throw("Need a Feature or Slice to project."); + } + + # if the feature or slice is sourced from another db, we have to "transfer" + # it to the db that contains the assembly mapping. the transfer is very + # shallow but that should do for our purposes + if ($self->external_source) { + my $slice_adaptor = $self->adaptor->get_SliceAdaptor; + + if ($object_type eq 'feature') { + + # createa a new slice from the target db + my $f_slice = $object->slice; + my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name); + + # now change the feature so that it appears it's from the target db + $object->slice($target_slice); + + } else { + + # createa a new slice from the target db + $object = $slice_adaptor->fetch_by_name($object->name); + + } + } + + if ($object_type eq 'feature') { + $slice = $object->feature_Slice; + } else { + $slice = $object; + } + + # warn if trying to project to assembly version the object already is on + if ($slice->coord_system->version eq $to_assembly) { + warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.")."); + } + + # now project the slice + my $cs_name = $slice->coord_system_name; + my @segments = @{ $slice->project($cs_name, $to_assembly) }; + + # we need to reverse the projection segment list if the orignial + if ($slice->strand == -1) { + @segments = reverse(@segments); + } + + # apply rules to projection results + # + # discard the projected feature/slice if + # 1. it doesn't project at all (no segments returned) + # 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more + # than one segment) + # 3. [if CHECK_LENGTH is set] the projection doesn't have the same length + # as the original feature/slice + # 4. all segments are on same chromosome and strand + + # keep track of the status of applied rules + my @status = (); + + # test (1) + return undef unless (@segments); + #warn "DEBUG: passed test 1\n"; + + # test (2) + return undef if (!($self->merge_fragments) and scalar(@segments) > 1); + push @status, 'fragmented' if (scalar(@segments) > 1); + #warn "DEBUG: passed test 2\n"; + + # test (3) + my $first_slice = $segments[0]->to_Slice; + my $last_slice = $segments[-1]->to_Slice; + my $length_mismatch = (($last_slice->end - $first_slice->start + 1) != + $object->length); + return undef if ($self->check_length and $length_mismatch); + push @status, 'length_mismatch' if ($length_mismatch); + #warn "DEBUG: passed test 3\n"; + + # test (4) + my %sr_names = (); + my %strands = (); + foreach my $seg (@segments) { + my $sl = $seg->to_Slice; + $sr_names{$sl->seq_region_name}++; + $strands{$sl->strand}++; + } + return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1); + #warn "DEBUG: passed test 4\n"; + + # remember rule status + $self->last_status(join('|', @status)); + + # everything looks fine, so adjust the coords of your feature/slice + my $new_slice = $first_slice; + $new_slice->{'end'} = $last_slice->end; + + if ($object_type eq 'slice') { + return $new_slice; + } else { + + $object->start($new_slice->start); + $object->end($new_slice->end); + $object->strand($new_slice->strand); + $object->slice($new_slice->seq_region_Slice); + + # undef dbID and adaptor so you can store the feature in the target db + $object->dbID(undef); + $object->adaptor(undef); + + return $object; + } + +} + + +=head2 old_to_new + + Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - + the object to project + Example : my $new_slice = $assembly_projector->old_to_new($old_slice); + Description : Projects a Slice or Feature from old to new assembly. + This method is just a convenience wrapper for $self->project. + Return type : same a Arg 1, or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub old_to_new { + my ($self, $object) = @_; + return $self->project($object, $self->new_assembly); +} + + +=head2 new_to_old + + Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - + the object to project + Example : my $old_slice = $assembly_projector->new_to_old($new_slice, 1); + Description : Projects a Slice or Feature from new to old assembly. + This method is just a convenience wrapper for $self->project. + Return type : same a Arg 1, or undef + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub new_to_old { + my ($self, $object) = @_; + return $self->project($object, $self->old_assembly); +} + + +# +# accessors +# +sub adaptor { + my $self = shift; + weaken($self->{'adaptor'} = shift) if (@_); + return $self->{'adaptor'}; +} + + +sub external_source { + my $self = shift; + $self->{'external_source'} = shift if (@_); + return $self->{'external_source'}; +} + + +sub old_assembly { + my $self = shift; + $self->{'old_assembly'} = shift if (@_); + return $self->{'old_assembly'}; +} + + +sub new_assembly { + my $self = shift; + $self->{'new_assembly'} = shift if (@_); + return $self->{'new_assembly'}; +} + + +sub merge_fragments { + my $self = shift; + $self->{'merge_fragments'} = shift if (@_); + return $self->{'merge_fragments'}; +} + + +sub check_length { + my $self = shift; + $self->{'check_length'} = shift if (@_); + return $self->{'check_length'}; +} + + +sub last_status { + my $self = shift; + $self->{'last_status'} = shift if (@_); + return $self->{'last_status'}; +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/BiotypeMapper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/BiotypeMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,319 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +BiotypeMapper - Translates EnsEMBL biotypes into Sequence Ontology terms and back + +=head1 AUTHOR + +Kieron Taylor, 2011 - ktaylor@ebi.ac.uk + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Utils::BiotypeMapper + +my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' ); +my $biotype_mapper = new BiotypeMapper($ontology_adaptor); + +print $biotype_mapper->translate_feature_to_SO_term($feature); + +my $list_of_biotypes = $biotype_mapper->biotypes_belonging_to_group('protein-coding'); + +=head1 DESCRIPTION + +BiotypeMapper provides a series of nearest matches between EnsEMBL biotypes and +the Sequence Ontology (http://www.sequenceontology.org). In addition, biotypes +are members of groupings, such as "short non-coding". This allows one to +conveniently select all the biotypes of a certain kind. + +SO Mappings are imperfect due to the inexact correspondance of biotypes to +several SO terms. The a best guess has been chosen in each case. + +Reverse mappings from SO to biotype are vague, due to many-to-one relationships. +In this case a list of possible terms is given. + +=cut + +package Bio::EnsEMBL::Utils::BiotypeMapper; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception; + +my %gene_so_mapping = ( + 'protein_coding' => 'SO:0001217', # protein_coding_gene + 'pseudogene' => 'SO:0000336', # pseudogene + 'processed_transcript' => 'SO:0001503', # processed_transcript + 'lincRNA' => 'SO:0001641', # lincRNA_gene + 'polymorphic_pseudogene'=> 'SO:0000336', # pseudogene + 'Mt_tRNA' => 'SO:0000088', # mt_gene + 'IG_D_gene' => 'SO:0000510', # D_gene + 'snoRNA' => 'SO:0001267', #snoRNA_gene + 'misc_RNA' => 'SO:0000356', #RNA + 'miRNA' => 'SO:0001265', #miRNA_gene + 'rRNA' => 'SO:0001637', #rRNA_gene + 'snRNA' => 'SO:0001268', #snRNA_gene + 'snRNA_pseudogene' => 'SO:0000336', # pseudogene + 'tRNA_pseudogene' => 'SO:0000778', # pseudogenic_tRNA + 'rRNA_pseudogene' => 'SO:0000777', # pseudogenic_rRNA + 'TR_J_gene' => 'SO:0000470', # J_gene + 'TR_V_gene' => 'SO:0000466', # V_gene + 'TR_C_gene' => 'SO:0000478', # C_gene + 'ncRNA' => 'SO:0001263', # ncRNA_gene + 'tRNA' => 'SO:0001272', # tRNA_gene + 'retrotransposed' => 'SO:0000569', # retrotransposed +## heavily abbreviated + ); + +my %transcript_so_mapping = ( + 'processed_transcript' => 'SO:0001503', # processed_transcript + 'nonsense_mediated_decay' => 'SO:0001621', # NMD_transcript_variant + 'retained_intron' => 'SO:0000681', # aberrant_processed_transcript + 'transcribed_unprocessed_pseudogene'=> 'SO:0000516', # pseudogenic_transcript + 'processed_pseudogene' => 'SO:0000043', # processed_pseudogene + 'unprocessed_pseudogene' => 'SO:0000336', # pseudogene + 'unitary_pseudogene' => 'SO:0000336', + 'pseudogene' => 'SO:0000336', # pseudogene + 'transcribed_processed_pseudogene' => 'SO:0000043', + 'retrotransposed' => 'SO:0000569', #retrotransposed + 'ncrna_host' => 'SO:0000483', + 'polymorphic_pseudogene' => 'SO:0000336', + 'lincRNA' => 'SO:0001463', + 'ncrna_host' => 'SO:0000483', + '3prime_overlapping_ncrna' => 'SO:0000483', + 'TR_V_gene' => 'SO:0000466', + 'TR_V_pseudogene' => 'SO:0000336', + + 'TR_J_gene' => 'SO:0000470', + 'IG_C_gene' => 'SO:0000478', + 'IG_C_pseudogene' => 'SO:0000336', + 'TR_C_gene' => 'SO:0000478', + 'IG_J_pseudogene' => 'SO:0000336', + 'miRNA' => 'SO:0000276', #miRNA + 'miRNA_pseudogene' => 'SO:0000336', + 'disrupted_domain' => 'SO:0000681', # aberrant_processed_transcript + 'rRNA' => 'SO:0000252', #rRNA + 'rRNA_pseudogene' => 'SO:0000777', + 'scRNA_pseudogene' => 'SO:0000336', + 'snoRNA' => 'SO:0000275', # snoRNA + 'snoRNA_pseudogene' => 'SO:0000336', + 'snRNA' => 'SO:0000274', # snRNA + 'snRNA_pseudogene' => 'SO:0000336', + + ); + +my %feature_so_mapping = ( + 'Bio::EnsEMBL::Gene' => 'SO:0000704', # gene + 'Bio::EnsEMBL::Transcript' => 'SO:0000673', # transcript + 'Bio::EnsEMBL::Slice' => 'SO:0000001', # region + 'Bio::EnsEMBL::Variation::VariationFeature' => 'SO:0001060', # sequence variant + 'Bio::EnsEMBL::Variation::StructuralVariationFeature' => 'SO:0001537', # structural variant + 'Bio::EnsEMBL::Compara::ConstrainedElement' => 'SO:0001009', #DNA_constraint_sequence ???? + 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => 'SO:0001679', # transcription_regulatory_region +); + +my %grouping_of_biotypes = ( + # Genebuilder/Havana categorisation + 'protein_coding' => [qw( protein_coding polymorphic_pseudogene )], + 'pseudogene' => [qw( pseudogene retrotransposed snRNA_pseudogene tRNA_pseudogene + miRNA_pseudogene Mt_tRNA_pseudogene rRNA_pseudogene + scRNA_pseudogene misc_RNA_pseudogene snoRNA_pseudogene + )], + 'long_noncoding' => [qw( 3prime_overlapping_ncrna antisense lincRNA ncrna_host non_coding + processed_transcript sense_intronic sense_overlapping + )], + 'short_noncoding'=> [qw( miRNA misc_RNA Mt_tRNA + rRNA snoRNA snRNA + )], + # practical Ensembl core categories for fasta dumping + 'cdna' => [qw( protein_coding polymorphic_pseudogene IG_V_gene TR_V_gene + IG_J_gene TR_J_gene IG_D_gene IG_C_gene TR_C_gene pseudogene + retrotransposed IG_V_pseudogene TR_V_pseudogene + IG_J_pseudogene IG_C_pseudogene processed_transcript + antisense ambiguous_orf transcribed_processed_pseudogene + disrupted_domain + )], + 'peptide_producing' => [qw( protein_coding polymorphic_pseudogene IG_V_gene TR_V_gene + IG_J_gene TR_J_gene IG_D_gene IG_C_gene TR_C_gene + nonsense_mediated_decay + )], + 'ncrna' => [qw( ncRNA miRNA miRNA_pseudogene misc_RNA misc_RNA_pseudogene Mt_tRNA + Mt_tRNA_pseudogene Mt_rRNA rRNA rRNA_pseudogene scRNA_pseudogene + snoRNA snoRNA_pseudogene snRNA snRNA_pseudogene tRNA_pseudogene + 3prime_overlapping_ncrna lincRNA ncrna_host non_coding + sense_intronic sense_overlapping tRNA + )], +); + +=head2 new + + Constructor + Arg [1] : OntologyAdaptor from the EnsEMBL registry + Returntype : Bio::EnsEMBL::BiotypeMapper + +=cut + +sub new { + my $class = shift; + my $self = { + ontology_adaptor => shift, + }; + + bless $self, $class; + return $self; +} + +=head2 translate_feature_to_SO_term + + Arg [0] : Bio::EnsEMBL::Feature, subclass or related Storable + Description: Translates a Feature type into an SO term. If the Feature is a + Gene or Transcript, then a further refinement of the type is made + via Biotype + Returntype : String + +=cut + +sub translate_feature_to_SO_term { + my $self = shift; + my $feature = shift; + my $so_accession; + my $so_term; + if (ref($feature) eq "Bio::EnsEMBL::Gene" and exists $gene_so_mapping{$feature->biotype}) { + $so_accession = $gene_so_mapping{$feature->biotype}; + } + elsif (ref($feature) eq "Bio::EnsEMBL::Transcription" and exists $transcript_so_mapping{$feature->biotype}) { + $so_accession = $transcript_so_mapping{$feature->biotype}; + } + else { + $so_accession = $feature_so_mapping{ref($feature)}; + } + if (defined($so_accession)) { + $so_term = $self->{'ontology_adaptor'}->fetch_by_accession($so_accession); + } + else { + throw ("Ontology mapping not found for ".ref($feature)); + return "????????"; + } + + return $so_term->name; +} + + +=head2 translate_SO_to_biotype + + Arg [0] : Sequence Ontology term, either in name or URI format + Description: Returns the closest corresponding Ensembl biotypes to a given SO term + Returntype : Listref Array of Strings containing possible biotypes +=cut + +sub translate_SO_to_biotype { + my $self = shift; + my $translate_me = shift; + + my @so_names; +# look up text in ontology database + if ($translate_me !~ /^SO:/) { + my $so_terms = $self->{'ontology_adaptor'}->fetch_all_by_name($translate_me); + @so_names = []; + foreach my $term (@{$so_terms}) { + push @so_names,$term->accession(); + } + } + else { + push @so_names,$translate_me; + } +# convert list of accessions into biotypes + my @biotypes; + foreach my $accession (@so_names) { + foreach my $key (keys %gene_so_mapping) { + if ($gene_so_mapping{$key} eq $accession) { + push @biotypes,$key; + } + } + foreach my $key (keys %transcript_so_mapping) { + if ($transcript_so_mapping{$key} eq $accession) { + push @biotypes,$key; + } + } + foreach my $key (keys %feature_so_mapping) { + if ($feature_so_mapping{$key} eq $accession) { + push @biotypes,$key; + } + } + } + + return \@biotypes; +} + +=head2 belongs_to_groups + + Arg [0] : Biotype (string) + Description: Returns the group names that include the given biotype + Returntype : Listref of strings +=cut + +sub belongs_to_groups { + my $self = shift; + my $member = shift; + my @belongs_to; + foreach my $group (keys %grouping_of_biotypes) { + $group = lc($group); + foreach my $biotype ( @{ $grouping_of_biotypes{$group} }) { + if ($biotype eq $member) {push @belongs_to,$group;} + } + } + return \@belongs_to; +} + +=head2 group_members + + Arg [0] : Biotype group name (string) + Description: Returns a list of biotypes that belong in the group. + Returntype : Listref of strings +=cut + +sub group_members { + my $self = shift; + my $group = lc(shift); + if (exists($grouping_of_biotypes{$group})) { + my @biotypes = @{ $grouping_of_biotypes{$group} }; + return \@biotypes; + } + else { + throw ("$group is not a valid group name for biotypes"); + } +} + +=head2 member_of_group + + Arg [0] : Biotype (string) + Arg [1] : Group to check (string) + Description: Returns true if a biotype is present in a group + Returntype : Boolean +=cut + +sub member_of_group { + my $self = shift; + my $biotype = shift; + my $query_group = lc(shift); + my @groups = @{ $self->belongs_to_groups($biotype) }; + while (my $group = lc(shift @groups)) { + if ($group eq $query_group) { + return 1; + } + } + return 0; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/BitString.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/BitString.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,87 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::BitString - bitstring object implementation + +=head1 DESCRIPTION + +This is an implementation of a bitstring object, taken from Damian +Convey's book "Object Oriented Perl". + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::BitString; + +use strict; +use warnings; +no warnings 'uninitialized'; + + +sub new { + my $class = ref($_[0])||$_[0]; + my $initbits = join '', map {$_?1:0} @_[1..$#_]; + my $bs = pack 'b*', $initbits; + bless \$bs, $class; +} + + +sub get { + my ($self, $bitnum) = @_; + return vec($$self,$bitnum,1); +} + + +sub set { + my ($self, $bitnum, $newval) = @_; + vec($$self,$bitnum,1) = $newval?1:0; +} + + +sub bitcount { + 8 * length ${$_[0]}; +} + + +sub complement { + my ($self) = @_; + my $complement = ~$$self; + bless \$complement, ref($self); +} + + +sub print_me { + my ($self) = @_; + for (my $i=0; $i < $self->bitcount(); $i++) + { + print $self->get($i); + print ' ' unless ($i+1)%8; + print "\n" unless ($i+1)%64; + } + print "\n"; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Cache.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Cache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,703 @@ +# This package, originally distributed by CPAN, has been modified from +# its original version in order to be used by the ensembl project. +# +# 8 July 2002 - changed package name +# + +#package Tie::Cache; # old package +package Bio::EnsEMBL::Utils::Cache; + +use strict; +use vars qw( + $VERSION $Debug $STRUCT_SIZE $REF_SIZE + $BEFORE $AFTER $KEY $VALUE $BYTES $DIRTY +); + +$VERSION = .17; +$Debug = 0; # set to 1 for summary, 2 for debug output +$STRUCT_SIZE = 240; # per cached elem bytes overhead, approximate +$REF_SIZE = 16; + +# NODE ARRAY STRUCT +$KEY = 0; +$VALUE = 1; +$BYTES = 2; +$BEFORE = 3; +$AFTER = 4; +$DIRTY = 5; + +=pod + +=head1 NAME + +Tie::Cache - LRU Cache in Memory + +=head1 SYNOPSIS + + use Tie::Cache; + tie %cache, 'Tie::Cache', 100, { Debug => 1 }; + tie %cache2, 'Tie::Cache', { MaxCount => 100, MaxBytes => 50000 }; + tie %cache3, 'Tie::Cache', 100, { Debug => 1 , WriteSync => 0}; + + # Options ################################################################## + # + # Debug => 0 - DEFAULT, no debugging output + # 1 - prints cache statistics upon destroying + # 2 - prints detailed debugging info + # + # MaxCount => Maximum entries in cache. + # + # MaxBytes => Maximum bytes taken in memory for cache based on approximate + # size of total cache structure in memory + # + # There is approximately 240 bytes used per key/value pair in the cache for + # the cache data structures, so a cache of 5000 entries would take + # at approximately 1.2M plus the size of the data being cached. + # + # MaxSize => Maximum size of each cache entry. Larger entries are not cached. + # This helps prevent much of the cache being flushed when + # you set an exceptionally large entry. Defaults to MaxBytes/10 + # + # WriteSync => 1 - DEFAULT, write() when data is dirtied for + # TRUE CACHE (see below) + # 0 - write() dirty data as late as possible, when leaving + # cache, or when cache is being DESTROY'd + # + ############################################################################ + + # cache supports normal tied hash functions + $cache{1} = 2; # STORE + print "$cache{1}\n"; # FETCH + + # FIRSTKEY, NEXTKEY + while(($k, $v) = each %cache) { print "$k: $v\n"; } + + delete $cache{1}; # DELETE + %cache = (); # CLEAR + +=head1 DESCRIPTION + +This module implements a least recently used (LRU) cache in memory +through a tie interface. Any time data is stored in the tied hash, +that key/value pair has an entry time associated with it, and +as the cache fills up, those members of the cache that are +the oldest are removed to make room for new entries. + +So, the cache only "remembers" the last written entries, up to the +size of the cache. This can be especially useful if you access +great amounts of data, but only access a minority of the data a +majority of the time. + +The implementation is a hash, for quick lookups, +overlaying a doubly linked list for quick insertion and deletion. +On a WinNT PII 300, writes to the hash were done at a rate +3100 per second, and reads from the hash at 6300 per second. +Work has been done to optimize refreshing cache entries that are +frequently read from, code like $cache{entry}, which moves the +entry to the end of the linked list internally. + +=cut Documentation continues at the end of the module. + +sub TIEHASH { + my($class, $max_count, $options) = @_; + + if(ref($max_count)) { + $options = $max_count; + $max_count = $options->{MaxCount}; + } + + unless($max_count || $options->{MaxBytes}) { + die('you must specify cache size with either MaxBytes or MaxCount'); + } + + my $sync = exists($options->{WriteSync}) ? $options->{WriteSync} : 1; + + my $self = bless + { + # how many items to cache + max_count=> $max_count, + + # max bytes to cache + max_bytes => $options->{MaxBytes}, + + # max size (in bytes) of an individual cache entry + max_size => $options->{MaxSize} || ($options->{MaxBytes} ? (int($options->{MaxBytes}/10) + 1) : 0), + + # class track, so know if overridden subs should be used + 'class' => $class, + 'subclass' => $class ne 'Tie::Cache' ? 1 : 0, + + # current sizes + count=>0, + bytes=>0, + + # inner structures + head=>0, + tail=>0, + nodes=>{}, + 'keys'=>[], + + # statistics + hit => 0, + miss => 0, + + # config + sync => $sync, + dbg => $options->{Debug} || $Debug + + + }, $class; + + if (($self->{max_bytes} && ! $self->{max_size})) { + die("MaxSize must be defined when MaxBytes is"); + } + + if($self->{max_bytes} and $self->{max_bytes} < 1000) { + die("cannot set MaxBytes to under 1000, each raw entry takes $STRUCT_SIZE bytes alone"); + } + + if($self->{max_size} && $self->{max_size} < 3) { + die("cannot set MaxSize to under 3 bytes, assuming error in config"); + } + + $self; +} + +# override to write data leaving cache +sub write { undef; } +# commented this section out for speed +# my($self, $key, $value) = @_; +# 1; +#} + +# override to get data if not in cache, should return $value +# associated with $key +sub read { undef; } +# commented this section out for speed +# my($self, $key) = @_; +# undef; +#} + +sub FETCH { + my($self, $key) = @_; + + my $node = $self->{nodes}{$key}; + if($node) { + # refresh node's entry + $self->{hit}++; # if $self->{dbg}; + + # we used to call delete then insert, but we streamlined code + if(my $after = $node->[$AFTER]) { + $self->{dbg} > 1 and $self->print("update() node $node to tail of list"); + # reconnect the nodes + my $before = $after->[$BEFORE] = $node->[$BEFORE]; + if($before) { + $before->[$AFTER] = $after; + } else { + $self->{head} = $after; + } + + # place at the end + $self->{tail}[$AFTER] = $node; + $node->[$BEFORE] = $self->{tail}; + $node->[$AFTER] = undef; + $self->{tail} = $node; # always true after this + } else { + # if there is nothing after node, then we are at the end already + # so don't do anything to move the nodes around + die("this node is the tail, so something's wrong") + unless($self->{tail} eq $node); + } + + $self->print("FETCH [$key, $node->[$VALUE]]") if ($self->{dbg} > 1); + $node->[$VALUE]; + } else { + # we have a cache miss here + $self->{miss}++; # if $self->{dbg}; + + # its fine to always insert a node, even when we have an undef, + # because even if we aren't a sub-class, we should assume use + # that would then set the entry. This model works well with + # sub-classing and reads() that might want to return undef as + # a valid value. + my $value; + if ($self->{subclass}) { + $self->print("read() for key $key") if $self->{dbg} > 1; + $value = $self->read($key); + } + + if(defined $value) { + my $length; + if($self->{max_size}) { + # check max size of entry, that it not exceed max size + $length = &_get_data_length(\$key, \$value); + if($length > $self->{max_size}) { + $self->print("direct read() [$key, $value]") if ($self->{dbg} > 1); + return $value; + } + } + # if we get here, we should insert the new node + $node = &create_node($self, \$key, \$value, $length); + &insert($self, $node); + $value; + } else { + undef; + } + } +} + +sub STORE { + my($self, $key, $value) = @_; + my $node; + + $self->print("STORE [$key,$value]") if ($self->{dbg} > 1); + + # do not cache undefined values + defined($value) || return(undef); + + # check max size of entry, that it not exceed max size + my $length; + if($self->{max_size}) { + $length = &_get_data_length(\$key, \$value); + if($length > $self->{max_size}) { + if ($self->{subclass}) { + $self->print("direct write() [$key, $value]") if ($self->{dbg} > 1); + $self->write($key, $value); + } + return $value; + } + } + + # do we have node already ? + if($self->{nodes}{$key}) { + $node = &delete($self, $key); +# $node = &delete($self, $key); +# $node->[$VALUE] = $value; +# $node->[$BYTES] = $length || &_get_data_length(\$key, \$value); + } + + # insert new node + $node = &create_node($self, \$key, \$value, $length); +# $node ||= &create_node($self, \$key, \$value, $length); + &insert($self, $node); + + # if the data is sync'd call write now, otherwise defer the data + # writing, but mark it dirty so it can be cleanup up at the end + if ($self->{subclass}) { + if($self->{sync}) { + $self->print("sync write() [$key, $value]") if $self->{dbg} > 1; + $self->write($key, $value); + } else { + $node->[$DIRTY] = 1; + } + } + + $value; +} + +sub DELETE { + my($self, $key) = @_; + + $self->print("DELETE $key") if ($self->{dbg} > 1); + my $node = $self->delete($key); + $node ? $node->[$VALUE] : undef; +} + +sub CLEAR { + my($self) = @_; + + $self->print("CLEAR CACHE") if ($self->{dbg} > 1); + + if($self->{subclass}) { + my $flushed = $self->flush(); + $self->print("FLUSH COUNT $flushed") if ($self->{dbg} > 1); + } + + my $node; + while($node = $self->{head}) { + $self->delete($self->{head}[$KEY]); + } + + 1; +} + +sub EXISTS { + my($self, $key) = @_; + exists $self->{nodes}{$key}; +} + +# firstkey / nextkey emulate keys() and each() behavior by +# taking a snapshot of all the nodes at firstkey, and +# iterating through the keys with nextkey +# +# this method therefore will only supports one each() / keys() +# happening during any given time. +# +sub FIRSTKEY { + my($self) = @_; + + $self->{'keys'} = []; + my $node = $self->{head}; + while($node) { + push(@{$self->{'keys'}}, $node->[$KEY]); + $node = $node->[$AFTER]; + } + + shift @{$self->{'keys'}}; +} + +sub NEXTKEY { + my($self, $lastkey) = @_; + shift @{$self->{'keys'}}; +} + +sub DESTROY { + my($self) = @_; + + # if debugging, snapshot cache before clearing + if($self->{dbg}) { + if($self->{hit} || $self->{miss}) { + $self->{hit_ratio} = + sprintf("%4.3f", $self->{hit} / ($self->{hit} + $self->{miss})); + } + $self->print($self->pretty_self()); + if($self->{dbg} > 1) { + $self->print($self->pretty_chains()); + } + } + + $self->print("DESTROYING") if $self->{dbg} > 1; + $self->CLEAR(); + + 1; +} + +####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE +## Helper Routines +####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE + +# we use scalar_refs for the data for speed +sub create_node { + my($self, $key, $value, $length) = @_; + (defined($$key) && defined($$value)) + || die("need more localized data than $$key and $$value"); + + # max_size always defined when max_bytes is + if (($self->{max_size})) { + $length = defined $length ? $length : &_get_data_length($key, $value) + } else { + $length = 0; + } + + # ORDER SPECIFIC, see top for NODE ARRAY STRUCT + my $node = [ $$key, $$value, $length ]; +} + +sub _get_data_length { + my($key, $value) = @_; + my $length = 0; + my %refs; + + my @data = ($$key, $$value); + while(my $elem = shift @data) { + next if $refs{$elem}; + $refs{$elem} = 1; + if(ref $elem && $elem =~ /(SCALAR|HASH|ARRAY)/) { + my $type = $1; + $length += $REF_SIZE; # guess, 16 bytes per ref, probably more + if (($type eq 'SCALAR')) { + $length += length($$elem); + } elsif (($type eq 'HASH')) { + while (my($k,$v) = each %$elem) { + for my $kv($k,$v) { + if ((ref $kv)) { + push(@data, $kv); + } else { + $length += length($kv); + } + } + } + } elsif (($type eq 'ARRAY')) { + for my $val (@$elem){ + if ((ref $val)) { + push(@data, $val); + } else { + $length += length($val); + } + } + } + } else { + $length += length($elem); + } + } + + $length; +} + +sub insert { + my($self, $new_node) = @_; + + $new_node->[$AFTER] = 0; + $new_node->[$BEFORE] = $self->{tail}; + $self->print("insert() [$new_node->[$KEY], $new_node->[$VALUE]]") if ($self->{dbg} > 1); + + $self->{nodes}{$new_node->[$KEY]} = $new_node; + + # current sizes + $self->{count}++; + $self->{bytes} += $new_node->[$BYTES] + $STRUCT_SIZE; + + if($self->{tail}) { + $self->{tail}[$AFTER] = $new_node; + } else { + $self->{head} = $new_node; + } + $self->{tail} = $new_node; + + ## if we are too big now, remove head + while(($self->{max_count} && ($self->{count} > $self->{max_count})) || + ($self->{max_bytes} && ($self->{bytes} > $self->{max_bytes}))) + { + if($self->{dbg} > 1) { + $self->print("current/max: ". + "bytes ($self->{bytes}/$self->{max_bytes}) ". + "count ($self->{count}/$self->{max_count}) " + ); + } + my $old_node = $self->delete($self->{head}[$KEY]); + if ($self->{subclass}) { + if($old_node->[$DIRTY]) { + $self->print("dirty write() [$old_node->[$KEY], $old_node->[$VALUE]]") + if ($self->{dbg} > 1); + $self->write($old_node->[$KEY], $old_node->[$VALUE]); + } + } +# if($self->{dbg} > 1) { +# $self->print("after delete - bytes $self->{bytes}; count $self->{count}"); +# } + } + + 1; +} + +sub delete { + my($self, $key) = @_; + my $node = $self->{nodes}{$key} || return; +# return unless $node; + + $self->print("delete() [$key, $node->[$VALUE]]") if ($self->{dbg} > 1); + + my $before = $node->[$BEFORE]; + my $after = $node->[$AFTER]; + + # my($before, $after) = $node->{before,after}; + if($before) { + ($before->[$AFTER] = $after); + } else { + $self->{head} = $after; + } + + if($after) { + ($after->[$BEFORE] = $before); + } else { + $self->{tail} = $before; + } + + delete $self->{nodes}{$key}; + $self->{bytes} -= ($node->[$BYTES] + $STRUCT_SIZE); + $self->{count}--; + + $node; +} + +sub flush { + my $self = shift; + + $self->print("FLUSH CACHE") if ($self->{dbg} > 1); + + my $node = $self->{head}; + my $flush_count = 0; + while($node) { + if($node->[$DIRTY]) { + $self->print("flush dirty write() [$node->[$KEY], $node->[$VALUE]]") + if ($self->{dbg} > 1); + $self->write($node->[$KEY], $node->[$VALUE]); + $node->[$DIRTY] = 0; + $flush_count++; + } + $node = $node->[$AFTER]; + } + + $flush_count; +} + +sub print { + my($self, $msg) = @_; + print "$self: $msg\n"; +} + +sub pretty_self { + my($self) = @_; + + my(@prints); + for(sort keys %{$self}) { + next unless defined $self->{$_}; + push(@prints, "$_=>$self->{$_}"); + } + + "{ " . join(", ", @prints) . " }"; +} + +sub pretty_chains { + my($self) = @_; + my($str); + my $k = $self->FIRSTKEY(); + + $str .= "[head]->"; + my($curr_node) = $self->{head}; + while($curr_node) { + $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->"; + $curr_node = $curr_node->[$AFTER]; + } + $str .= "[tail]->"; + + $curr_node = $self->{tail}; + while($curr_node) { + $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->"; + $curr_node = $curr_node->[$BEFORE]; + } + $str .= "[head]"; + + $str; +} + +1; + +__END__ + +=head1 INSTALLATION + +Tie::Cache installs easily using the make or nmake commands as +shown below. Otherwise, just copy Cache.pm to $PERLLIB/site/Tie + + > perl Makefile.PL + > make + > make test + > make install + + * use nmake for win32 + ** you can also just copy Cache.pm to $perllib/Tie + +=head1 BENCMARKS + +There is another simpler LRU cache implementation in CPAN, +Tie::Cache::LRU, which has the same basic size limiting +functionality, and for this functionality, the exact same +interface. + +Through healthy competition, Michael G Schwern got +Tie::Cache::LRU mostly faster than Tie::Cache on reads & writes: + + Cache Size 5000 Tie::Cache 0.17 Tie::Cache::LRU 0.21 + 10000 Writes 1.55 CPU sec 1.10 CPU sec + 40000 Reads 1.82 CPU sec 1.58 CPU sec + 10000 Deletes 0.55 CPU sec 0.59 CPU sec + +Unless you are using TRUE CACHE or MaxBytes functionality, +using Tie::Cache::LRU should be an easy replacement for Tie::Cache. + +=head1 TRUE CACHE + +To use class as a true cache, which acts as the sole interface +for some data set, subclass the real cache off Tie::Cache, +with @ISA = qw( 'Tie::Cache' ) notation. Then override +the read() method for behavior when there is a cache miss, +and the write() method for behavior when the cache's data +changes. + +When WriteSync is 1 or TRUE (DEFAULT), write() is called immediately +when data in the cache is modified. If set to 0, data that has +been modified in the cache gets written out when the entries are deleted or +during the DESTROY phase of the cache object, usually at the end of +a script. + +To have the dirty data write() periodically while WriteSync is set to 0, +there is a flush() cache API call that will flush the dirty writes +in this way. Just call the flush() API like: + + my $write_flush_count = tied(%cache)->flush(); + +The flush() API was added in the .17 release thanks to Rob Bloodgood. + +=head1 TRUE CACHE EXAMPLE + + use Tie::Cache; + + # personalize the Tie::Cache object, by inheriting from it + package My::Cache; + @ISA = qw(Tie::Cache); + + # override the read() and write() member functions + # these tell the cache what to do with a cache miss or flush + sub read { + my($self, $key) = @_; + print "cache miss for $key, read() data\n"; + rand() * $key; + } + sub write { + my($self, $key, $value) = @_; + print "flushing [$key, $value] from cache, write() data\n"; + } + + my $cache_size = $ARGV[0] || 2; + my $num_to_cache = $ARGV[1] || 4; + my $Debug = $ARGV[2] || 1; + + tie %cache, 'My::Cache', $cache_size, {Debug => $Debug}; + + # load the cache with new data, each through its contents, + # and then reload in reverse order. + for(1..$num_to_cache) { print "read data $_: $cache{$_}\n" } + while(my($k, $v) = each %cache) { print "each data $k: $v\n"; } + for(my $i=$num_to_cache; $i>0; $i--) { print "read data $i: $cache{$i}\n"; } + + # flush writes now, trivial use since will happen in DESTROY() anyway + tied(%cache)->flush(); + + # clear cache in 2 ways, write will flush out to disk + %cache = (); + undef %cache; + +=head1 NOTES + +Many thanks to all those who helped me make this module a reality, +including: + + :) Tom Hukins who provided me insight and motivation for + finishing this module. + :) Jamie McCarthy, for trying to make Tie::Cache be all + that it can be. + :) Rob Fugina who knows how to "TRULY CACHE". + :) Rob Bloodgood, for the TRUE CACHE flush() API + +=head1 AUTHOR + +Please send any questions or comments to Joshua Chamas +at chamas@alumni.stanford.org + +=head1 COPYRIGHT + +Copyright (c) 1999-2002 Joshua Chamas, Chamas Enterprises Inc. +Sponsored by development on NodeWorks http://www.nodeworks.com + +All rights reserved. This program is free software; +you can redistribute it and/or modify it under the same +terms as Perl itself. + +=cut + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/CigarString.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/CigarString.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,350 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar +strings + +=head1 DESCRIPTION + +Sequence alignment hits were previously stored within the core database +as ungapped alignments. This imposed 2 major constraints on alignments: + +a) alignments for a single hit record would require multiple rows in the + database, and +b) it was not possible to accurately retrieve the exact original + alignment. + +Therefore, in the new branch sequence alignments are now stored as +ungapped alignments in the cigar line format (where CIGAR stands for +Concise Idiosyncratic Gapped Alignment Report). + +In the cigar line format alignments are sotred as follows: + + M: Match + D: Deletino + I: Insertion + +An example of an alignment for a hypthetical protein match is shown +below: + + + Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL... + PG P G GP R PLGP + Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD... + +protein_align_feature table as the following cigar line: + + 7M4D12M2I2MD7M + +=cut + +package Bio::EnsEMBL::Utils::CigarString; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Root; + +@ISA = qw(Bio::EnsEMBL::Root); + +=head2 split_hsp + + Name : split_hsp (this name is derived from the original sub in BlastWorn) + Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object. +my $factory = new Bio::EnsEMBL::Utils::CigarString; +my $cigar_string = $factory->split_hsp($hsp); + + Function: generate cigar string. + Argument: a HSP object. + Returns : a text string. + +=cut + +sub split_hsp { + my ($self, $hsp) = @_; + + $self->throw("a defined object needed") unless($hsp && defined($hsp)); + unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ + $self->throw("a HSP object needed"); + } + + my ($qtype, $htype) = $self->_findTypes($hsp); + my ($qstrand, $hstrand) = $self->_findStrands($hsp); + my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype); + + my @gaps = (); + my @qchars = split(//, $hsp->query_string); + my @hchars = split(//, $hsp->hit_string); + my $qstart; + if($qstrand == 1){ + $qstart = $hsp->query->start; + }elsif($qstart == -1){ + $qstart = $hsp->query->end; + }else{ + $self->warn("[$qstart], invalid strand value on query"); + $qstart = $hsp->query->start; + # Is this a SearchIO's bug??? + } + + my $hstart; + if($hstrand == 1){ + $hstart = $hsp->subject->start; + }elsif($hstrand != -1){ + $hstart = $hsp->subject->end; + }else{ + $self->throw("[$hstart], invalid strand value on subject"); + } + + my $qend = $qstart; + my $hend = $hstart; + my $count = 0; + my $found = 0; + + my @align_coordinates = (); + while($count <= $#qchars){ + if($qchars[$count] ne '-' && $hchars[$count] ne '-') { + $qend += $qinc; + $hend += $hinc; + $found = 1; + }else{ # gapped region + push(@align_coordinates, [$qstart, $hstart]) if($found == 1); + + $qstart = $qend; + $qstart += $qinc if($qchars[$count] ne '-'); + + $hstart = $hend; + $hstart += $hinc if($hchars[$count] ne '-'); + + $qend = $qstart; + $hend = $hstart; + $found = 0; + } + $count++; + } + + if($found){ + push(@align_coordinates, [$qstart, $hstart]); + } + + my $cigar_string = ""; + my $last = $#align_coordinates; + if($last >= 0){ + for(my $i=0; $i<$last; $i++){ + my $q_this_start = $align_coordinates[$i]->[0]; + my $q_next_start = $align_coordinates[$i+1]->[0]; + my $q_length = ($q_next_start-$q_this_start-1)*$qinc; + $q_length = abs($q_length); + my $h_this_start = $align_coordinates[$i]->[1]; + my $h_next_start = $align_coordinates[$i+1]->[1]; + my $h_length = ($h_next_start-$h_this_start-1)*$hinc; + $h_length = abs($h_length); + + my $diff = $q_length - $h_length; + if($diff > 0){ # Insertion + $cigar_string .= $diff unless($diff == 1); + $cigar_string .= 'I'; + }elsif($diff < 0){ # Deletion + $cigar_string .= -$diff unless($diff == -1); + $cigar_string .= 'D'; + }else{ # e.g $diff == 0, Match + $cigar_string .= $q_length unless($q_length == 1); + $cigar_string .= 'M'; + } + + } # for + } # if + + return $cigar_string; +} + + +sub _findStrands { + my ($self,$hsp) = @_; + + my $qstrand = $hsp->query->strand; + unless($qstrand == 1 || $qstrand == -1){ + $self->warn("query's strand value is neither 1 or -1"); + $qstrand = 1; + } + + my $hstrand = $hsp->subject->strand; + unless($hstrand == 1 || $hstrand == -1){ + $self->warn("subject's strand value is neither 1 or -1"); + $hstrand = 1; + } + + return ( $qstrand, $hstrand); +} + +sub _findTypes { + my ($self,$hsp) = @_; + + my $type1; + my $type2; + my $len1 = $hsp->query->length(); + my $len2 = $hsp->subject->length(); + + if ($len1/$len2 > 2) { + $type1 = 'dna'; + $type2 = 'pep'; + } elsif ($len2/$len1 > 2) { + $type1 = 'pep'; + $type2 = 'dna'; + } else { + $type1 = 'dna'; + $type2 = 'dna'; + } + + return ($type1,$type2); +} + +sub _findIncrements { + my ($self,$qstrand,$hstrand,$qtype,$htype) = @_; + + my $qinc = 1 * $qstrand; + my $hinc = 1 * $hstrand; + + if ($qtype eq 'dna' && $htype eq 'pep') { + $qinc = 3 * $qinc; + } + if ($qtype eq 'pep' && $htype eq 'dna') { + $hinc = 3 * $hinc; + } + + return ($qinc,$hinc); +} + +# This is a core logic of cigar string. The finite state machine theory is +# apply. See the below table, x-axis represents the input, with 3 options: +# (+/+) -- Both current query and subject bases are non-gap. Match +# (-/+) -- The current query base is gap, but subject not. Deletion +# (+/-) -- The current subject base is gap, but query not. Insertion +# While the y-axis means the current state with letter 'M', 'D', 'I' +# +# The content of this table is the action taken in response of the input and +# the current state. +# R remain the state, counter increment. +# G;X generate the cigar line based on the current state and counter; +# clear the counter to zero and change to the state X +# +# || +/+ | -/+ | +/- | +# -------+----------------------+ +# M || R | G;D | G;I | +# ------------------------------+ +# D || G;M | R | G;I | +# ------------------------------+ +# I || G;M | G;D | R | +# + +=head2 generate_cigar_string + + Name : generate_cigar_string + Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars); + Function: generate the cigar string for a piece of alignment. + Args: 2 array references. The lengths of 2 arrays are the same + Return: a cigar string + +=cut + +# Developer's Note: The method is originally abstracted from the concept of +# cigar string. It only asks the essential information of 2 sequence characters +# of the alignment, while the BlastWorn::split_HSP asks more unused information +# for cigar string, which is useful to form align_coordinates. - Juguang + +my ($count, $state); # strictly only used in the following 2 subs + +sub generate_cigar_string { + +# my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_; + + my ($self, $qchars_ref, $hchars_ref) = @_; + + my @qchars = @{$qchars_ref}; + my @hchars = @{$hchars_ref}; + + unless(scalar(@qchars) == scalar(@hchars)){ + $self->throw("two sequences are not equal in lengths"); + } + + $count = 0; + $state = 'M'; # the current state of gaps, (M, D, I) + + my $cigar_string = ''; + for(my $i=0; $i <= $#qchars; $i++){ + my $qchar = $qchars[$i]; + my $hchar = $hchars[$i]; + if($qchar ne '-' && $hchar ne '-'){ # Match + $cigar_string .= $self->_sub_cigar_string('M'); + }elsif($qchar eq '-'){ # Deletion + $cigar_string .= $self->_sub_cigar_string('D'); + }elsif($hchar eq '-'){ # Insertion + $cigar_string .= $self->_sub_cigar_string('I'); + }else{ + $self->throw("Impossible state that 2 gaps on each seq aligned"); + } + } + $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail. + return $cigar_string; +} + +sub _sub_cigar_string { + my ($self, $new_state) = @_; + my $sub_cigar_string = ''; + if($state eq $new_state){ + $count++; # Remain the state and increase the counter + }else{ + $sub_cigar_string .= $count unless $count == 1; + $sub_cigar_string .= $state; + $count = 1; + $state = $new_state; + } + return $sub_cigar_string; +} + +=head2 generate_cigar_string_by_hsp + + Name : generate_cigar_string_by_hsp + Usage : my $hsp; # a ready GenericHSP object +my $cigar_string = $self->generate_cigar_string_by_hsp($hsp); + Function: generate a cigar string by given HSP object. + Args : a GenericHSP object + Returns: a text string of cigar string + +=cut + +sub generate_cigar_string_by_hsp { + my ($self, $hsp) = @_; + + unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ + $self->throw("A GenericHSP object needed"); + } + + my @qchars = split(//, $hsp->query_string); + my @hchars = split(//, $hsp->hit_string); + + return $self->generate_cigar_string(\@qchars, \@hchars); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/CliHelper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/CliHelper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::CliHelper + +=head1 VERSION + +$Revision: 1.6 $ + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::CliHelper; + + my $cli = Bio::EnsEMBL::Utils::CliHelper->new(); + + # get the basic options for connecting to a database server + my $optsd = $cli->get_dba_opts(); + + # add the print option + push(@$optsd,"print|p"); + + # process the command line with the supplied options plus a reference to a help subroutine + my $opts = $cli->process_args($optsd,\&usage); + + # use the command line options to get an array of database details + for my $db_args (@{$cli->get_dba_args_for_opts($opts)}) { + # use the args to create a DBA + my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{$db_args}); + ... + } + + For adding secondary databases, a prefix can be supplied. For instance, to add a second set of + db params prefixed with dna (-dnahost -dbport etc.) use the prefix argument with get_dba_opts and + get_dba_args_for_opts: + # get the basic options for connecting to a database server + my $optsd = + [ @{ $cli_helper->get_dba_opts() }, @{ $cli_helper->get_dba_opts('gc') } ]; + # process the command line with the supplied options plus a help subroutine + my $opts = $cli_helper->process_args( $optsd, \&usage ); + # get the dna details + my ($dna_dba_details) = + @{ $cli_helper->get_dba_args_for_opts( $opts, 1, 'dna' ) }; + my $dna_db = + Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$dna_dba_details} ) ); + +=head1 DESCRIPTION + +Utilities for a more consistent approach to parsing and handling EnsEMBL script command lines + +=head1 METHODS + +See subroutines. + +=cut + +package Bio::EnsEMBL::Utils::CliHelper; + +use warnings; +use strict; + +use Carp; +use Data::Dumper; +use Getopt::Long qw(:config auto_version no_ignore_case); + +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +my $dba_opts = + [ { args => [ 'host', 'dbhost', 'h' ], type => '=s' }, + { args => [ 'port', 'dbport', 'P' ], type => ':i' }, + { args => [ 'user', 'dbuser', 'u' ], type => '=s' }, + { args => [ 'pass', 'dbpass', 'p' ], type => ':s' }, + { args => [ 'dbname', 'D' ], type => ':s' }, + { args => [ 'pattern', 'dbpattern' ], type => ':s' }, + { args => [ 'driver' ], type => ':s' }, + { args => [ 'species_id' ], type => ':i' }, + { args => [ 'species' ], type => ':i' }, + ]; + +=head2 new() + + Description : Construct a new instance of a CliHelper object + Returntype : Bio::EnsEMBL::Utils:CliHelper + Status : Under development + +=cut + +sub new { + my ( $class, @args ) = @_; + my $self = bless( {}, ref($class) || $class ); + return $self; +} + +=head2 get_dba_opts() + + Arg [1] : Optional prefix for dbnames e.g. dna + Description : Retrieves the standard options for connecting to one or more Ensembl databases + Returntype : Arrayref of option definitions + Status : Under development + +=cut + +sub get_dba_opts { + my ( $self, $prefix ) = @_; + $prefix ||= ''; + my @dba_opts = map { + my $opt = join '|', map { $prefix . $_ } @{ $_->{args} }; + $opt . $_->{type}; + } @{$dba_opts}; + return \@dba_opts; +} + +=head2 process_args() + + Arg [1] : Arrayref of supported command line options (e.g. from get_dba_opts) + Arg [2] : Ref to subroutine to be invoked when -help or -? is supplied + Description : Retrieves the standard options for connecting to one or more Ensembl databases + Returntype : Hashref of parsed options + Status : Under development + +=cut + +sub process_args { + my ( $self, $opts_def, $usage_sub ) = @_; + my $opts = {}; + push @{$opts_def}, q/help|?/ => $usage_sub; + GetOptions( $opts, @{$opts_def} ) + || croak 'Could not parse command line arguments'; + return $opts; +} + +=head2 get_dba_args_for_opts() + + Arg [1] : Hash of options (e.g. parsed from command line options by process_args()) + Arg [2] : If set to 1, the databases are assumed to have a single species only. Default is 0. + Arg [3] : Optional prefix to use when parsing e.g. dna + Description : Uses the parsed command line options to generate an array of DBAdaptor arguments + : (e.g. expands dbpattern, finds all species_ids for multispecies databases) + : These can then be passed directly to Bio::EnsEMBL::DBSQL::DBAdaptor->new() + Returntype : Arrayref of DBA argument hash refs + Status : Under development + +=cut + +sub get_dba_args_for_opts { + my ( $self, $opts, $single_species, $prefix ) = @_; + $prefix ||= ''; + $single_species ||= 0; + my ( $host, $port, $user, $pass, $dbname, $pattern, $driver ) = + map { $prefix . $_ } qw(host port user pass dbname pattern driver); + my @db_args; + if ( defined $opts->{$host} ) { + my $dbc = + Bio::EnsEMBL::DBSQL::DBConnection->new( -USER => $opts->{$user}, + -PASS => $opts->{$pass}, + -HOST => $opts->{$host}, + -PORT => $opts->{$port}, + -DRIVER => $opts->{$driver} ); + my @dbnames; + if ( defined $opts->{$dbname} ) { + push @dbnames, $opts->{$dbname}; + } elsif ( defined $opts->{$pattern} ) { + # get a basic DBConnection and use to find out which dbs are involved + @dbnames = + grep { m/$opts->{pattern}/smx } + @{ $dbc->sql_helper()->execute_simple(q/SHOW DATABASES/) }; + } else { + print Dumper($opts); + croak 'dbname or dbpattern arguments required'; + } + for my $dbname (@dbnames) { + + #Decipher group of DBAdaptor by capturing the name_name(_name?)_core_ code. Otherwise we don't know + my ($group) = $dbname =~ /^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?_([a-z]+)(?:_\d+)?_\d+/; + + my $multi = 0; + my $species_ids = [ [ 1, undef ] ]; + if ( !$single_species ) { + $species_ids = + $dbc->sql_helper() + ->execute( +"SELECT species_id,meta_value FROM $dbname.meta WHERE meta_key='species.production_name'" + ); + if ( scalar( @{$species_ids} ) > 1 ) { + $multi = 1; + } + if ( defined $opts->{species_id} ) { + $species_ids = + [ [ $opts->{species_id}, $opts->{species} ] ]; + } + } + for my $species_id ( @{$species_ids} ) { + my $args = { + -HOST => $opts->{$host}, + -USER => $opts->{$user}, + -PORT => $opts->{$port}, + -PASS => $opts->{$pass}, + -DBNAME => $dbname, + -DRIVER => $opts->{$driver}, + -SPECIES_ID => $species_id->[0], + -SPECIES => $species_id->[1], + -MULTISPECIES_DB => $multi }; + $args->{-GROUP} = $group if $group; + push(@db_args, $args); + } + } + } ## end if ( defined $opts->{$host...}) + else { + croak '(db)host arguments required'; + } + return \@db_args; +} ## end sub get_dba_args_for_opts + +=head2 get_dba_args_for_opts() + + Arg [1] : Hash of options (e.g. parsed from command line options by process_args()) + Arg [2] : If set to 1, the databases are assumed to have a single species only. Default is 0. + Arg [3] : Optional prefix to use when parsing e.g. dna + Description : Uses the parsed command line options to generate an array DBAdaptors. + : Note this can overload connections on a server + Returntype : Arrayref of Bio::EnsEMBL::DBSQL::DBAdaptor + Status : Under development + +=cut + +sub get_dbas_for_opts { + my ( $self, $opts, $single_species, $prefix ) = @_; + +# get all the DBA details that we want to work with and create DBAs for each in turn + my $dbas; + for my $args ( + @{ $self->get_dba_args_for_opts( $opts, $single_species, $prefix ) } ) + { + push @{$dbas}, Bio::EnsEMBL::DBSQL::DBAdaptor->new( %{$args} ); + } + return $dbas; +} + +=head2 load_registry_for_opts + + Arg [1] : Hash of options (e.g. parsed from command line options by process_args()) + Arg [2] : Optional prefix to use when parsing e.g. dna or master + Description : Loads a Registry from the given options hash. If a C + option is given then the code will call C. Otherwise + we use the database parameters given to call + C. + Returntype : Integer of the number of DBAdaptors loaded + Status : Under development + +=cut + +sub load_registry_for_opts { + my ($self, $opts, $prefix) = @_; + $prefix ||= q{}; + if($opts->{registry}) { + my $location = $opts->{registry}; + return Bio::EnsEMBL::Registry->load_all($location); + } + my ( $host, $port, $user, $pass ) = map { $prefix . $_ } qw(host port user pass); + my %args = ( + -HOST => $opts->{$host}, + -PORT => $opts->{$port}, + -USER => $opts->{$user}, + ); + $args{-PASS} = $opts->{$pass}; + return Bio::EnsEMBL::Registry->load_registry_from_db(%args); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Collector.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Collector.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1749 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Collector + +=head1 SYNOPSIS + + # Inherit this base module in your feature specific Collector + # instance: + + package Bio::EnsEMBL::Funcgen::Collector::ResultFeature; + use base('Bio::EnsEMBL::Utils::Collector'); + + # ... and define package config variables + $Bio::EnsEMBL::Funcgen::Collector::bin_model = 'SIMPLE'; + $Bio::EnsEMBL::Funcgen::Collector::window_sizes = + [ 30, 65, 130, 260, 450, 648, 950, 1296 ]; + # Could replace 30 with 0 here for low density data at natural resolution + + $Bio::EnsEMBL::Utils::Collector::bin_method = + 'count'; # only used by collector + + $Bio::EnsEMBL::Utils::Collector::packed_size = 2; + + # ... or simply use this module in a script either defining package + # config variables, or passing as parameters to the constructor: + + my $collector = + Bio::EnsEMBL::Utils::BaseCollector->new( -pack_template => 'v' ); + + $Bio::EnsEMBL::Funcgen::Collector::pack_template = 'v'; + + # Config variables can also be over-ridden by passing a config hash to + # the store_window_bins_by_Slice() method: + + $collector->store_window_bins_by_Slice( $slice, ( + -pack_template => 'v', + -packed_size => 2 ) ); + + # NOTE: Over-riding default config variables can cause problems when + # storing or fetching data. e.g. Fetch may revert to using defaults or + # table partitions may not match window sizes. + +=head1 DESCRIPTION + +This package is the base Collector class which contains generic +getter/setter methods along with the main 'collecting' methods which +perform the majority of the work in generating compressed data +collections optimised for web display. The bins produced are aimed at +storage in a BLOB representing an entire seq_region i.e. even bins with +no features/null data are encoded as a 0 score. Non-BLOB collections +are currently not supported. + +If your Collection class defines a Bio::EnsEMBL::Feature, then its +adaptor should inherit from the relevant Collection class. + +The minimum prerequisites of the input features/data are that they have +a start() and end() method. For instance a Bio::EnsEMBL::Features +generated from a database or parsed from a flat file. + +NOTE: This Collector does not have a lightweight mode previously used +for dynamic/on the fly collecting i.e. it does not take advantage of +bypassing object creation via the related BaseFeatureAdaptor method. + +=cut + +package Bio::EnsEMBL::Utils::Collector; + +use Bio::EnsEMBL::Utils::Argument ('rearrange'); +use Bio::EnsEMBL::Utils::Exception ('throw'); +use strict; +use warnings; + +### Global package config vars + +# Defaults +our $max_view_width = 1000000; # Max bp width in location/detailed view +our $max_data_type_size = 16777216; # Default is 16MB for long BLOB +# This is really a guide value as this should be set in the inheriting +# Collector class by deducting the rest of the row size from this value. +# Is is upto the inheritor to handle checking whether this size has been +# exceeded. + +# NOTE: Theoretically the min window size is: slice_length/(16777216/2) +# So for human chr1: 249,250,621/(16777216/2) = 29.7 => 30. However, +# this size does not seem to directly translate to the MySQL +# max_allowed_packet_size. Increasing max_allowed_packet_size to 64MB +# solves this issue, and substr operation doesn't appear to incur any of +# the potential memory(4*) usage issues. + +# Others global package variables which are set in the inheriting +# Collector class. +our ( $bin_model, $bin_method, $pack_template, + $packed_size, $window_sizes ); + +=head2 new + + Args : None + Example : + + my $collector = Bio::EnsEMBL::XXX::Collector::FEATURE->new(); + $collector->store_windows_by_Slice($slice); + + # Where XXX is, e.g. Compara, FuncGen etc. + + Description: Simple new method to enable use of collector + when not inherited by a descendant of + Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor + + Returntype : Bio::EnsEMBL::XXX::Collector + Exceptions : None + Caller : Collector script + Status : At Risk + +=cut + +sub new { + return bless {}, $_[0]; # Simply blesses this class as an empty hash. + + # Do not set anything here, as will not be first in ISA for + # FeatureAdaptors. Hence, not guaranteed to be called. +} + +=head2 new_assembly + + Args : optional - string assembly version e.g. GRCh37 + Example : $collector->new_assembly('GRCh37'); + Description: Getter/Setter for new assembly version which should be + used to project only 0 wsize Collections. + Returntype : string + Exceptions : None + Caller : store_window_bins_by_Slice() or + write_collection() in inheriting Collector class. + Status : At Risk + +=cut + +sub new_assembly { + my ( $self, $new_asm ) = @_; + + if ( defined($new_asm) ) { + $self->{'new_assembly'} = $new_asm; + } + + return $self->{'new_assembly'}; +} + +### Setter/Getter methods for basic/mandatory config +# Can also be set using package variables in the inheriting +# Collector/Adaptor or run script. Allows over-riding of defaults set +# in Adaptor/Collector. + +# Package variables used here instead of attrs to enable easy +# default config in inheriting class/script method. Provided +# for easy/standardised fetch access outside of this package +# i.e. Collectors/Adaptors + +=head2 max_data_type_size + + Args : optional - int Maximum size of collection in bytes + Example : $collector->max_data_type_size($new_max_size); + Description: Getter/Setter for max_data_type_size, default is + currently set at in this class as 16777216 (16MB), for + long BLOB. This is used by the write_collection() + method to determine when to build and store a compressed + collection. + Returntype : int + Exceptions : None + Caller : bins_per_record() and + write_collection() in inheriting Collector class. + Status : At Risk + +=cut + +sub max_data_type_size { + my ( $self, $size ) = @_; + + # Validate is sensible integer + + if ( defined($size) ) { + if ( $size !~ /^\d+$/ ) { + throw("max_data_type_size must be a integer of bytes, not $size"); + } + $max_data_type_size = $size; + } elsif ( !defined($max_data_type_size) ) { + # This should never happen as we have defaults in this module. + throw( 'You must define a ' + . '$Bio::EnsEMBL::Utils::Collector::max_data_type_size ' + . 'or pass -max_data_type_size config' ); + } + + return $max_data_type_size; +} + +=head2 max_view_width + + Args : optional - int Maximum width of view + Example : $collector->max_view_width($new_max_width); + Description: Getter/Setter for max_view_width, default is currently + set at in this class as 500000bp, for maximum level of + zoom permitted by location view. + Returntype : int + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub max_view_width { + my ( $self, $size ) = @_; + + # Validate is sensible integer + + if ( defined($size) ) { + if ( $size !~ /^\d+$/ ) { + throw("max_view_width must be a integer, not $size"); + } + $max_view_width = $size; + } elsif ( !defined $max_view_width ) { + # This should never happen as we have defaults in this module. + throw( 'You must define a ' + . '$Bio::EnsEMBL::Utils::Collector::max_view_width ' + . 'or pass -max_view_width config' ); + } + + return $max_view_width; +} + +=head2 bin_method + + Args[0] : optional - string name of bin method e.g. 'max_magnitude' + Args[1] : optional - Bio::EnsEMBL::Funcgen::Parsers::InputSet + Example : my $bin_method = $self->bin_method(); + Description: Getter/Setter for bin_method, default is normally set in + the inheriting Collector class either by package variable + or by passing a config hash via the store methods. + Returntype : string + Exceptions : Throws if cannot set by package variable + Caller : general + Status : At Risk + +=cut + +sub bin_method { + my ( $self, $bmethod, $config ) = @_; + + if ( defined($bmethod) ) { + $bin_method = $bmethod; + } + + if ( !defined($bin_method) ) { + throw( 'You must define a ' + . '$Bio::EnsEMBL::Utils::Collector::bin_method ' + . 'or pass -bin_method config' ); + } + + if ( !$self->can( "_calculate_" . $bin_method ) ) { + throw("$bin_method is not a valid/available binning method"); + } + + my $set_up_method = "_set_up_" . $bin_method; + if ( $self->can($set_up_method) ) { + $self->$set_up_method($config); + } + + return $bin_method; +} + +=head2 bin_model + + Args : optional - string bin model e.g. SIMPLE or COMPLEX + Example : my $bin_model = $self->bin_model; + Description: Getter/Setter for bin_model, default should be set in + inheriting Collector class. Currently only supports + 'SIMPLE' bin model. + Returntype : string + Exceptions : Throws if bin_model is not SIMPLE + Caller : general + Status : At Risk + +=cut + +sub bin_model { + my ( $self, $bmodel ) = @_; + + if ( defined($bmodel) ) { + $bin_model = $bmodel; + } + + if ( !defined($bin_model) ) { + throw( 'You must define a ' + . '$Bio::EnsEMBL::Utils::Collector::bin_model ' + . 'or pass -bin_model config' ); + } + + if ( $bin_model ne 'SIMPLE' ) { + throw( 'Bio::EnsEMBL::Utils::Collector does not yet support ' + . 'non-SIMPLE bin models' ); + } + + return $bin_model; +} + + +=head2 window_sizes + + Args : optional - arrayref of window sizes + Example : + + foreach my $wsize ( @{ $collector->window_sizes } ) + { # Do some collecting + } + + Description: Getter/Setter for window_sizes. Default should be set + in inheriting Collector (if the config is dynamic), + FeatureAdaptor class or script using package variable or + this method. + NOTE: Redefining these may cause a mismatch with the + table partition definition. + Returntype : arrayref of ints + Exceptions : Throws if cannot set a valid array of int window sizes + Caller : general + Status : At Risk - rename bin_sizes? + +=cut + +sub window_sizes { + my ( $self, $sizes ) = @_; + + if ( defined($sizes) ) { + $window_sizes = $sizes; + } + + if ( !( ref($window_sizes) + && ( ref($window_sizes) eq 'ARRAY' ) + && ( scalar(@$window_sizes) > 0 ) ) ) + { + throw('Must pass -windows_sizes in the config ' + . 'or define $Bio::EnsEMBL::Utils::Collector::window_sizes ' + . 'in your Collector as an array ref of integer window_sizes' ); + } + + return $window_sizes; +} + + + + +=head2 has_window_size + + Args : int - window size to validate + Example : if( $collector->has_window_size('30') ){ + #Do something wrt to 30bp window size + } + + Description: Simple utility method to validate whether this Collector + has a given window_size + Returntype : Boolean + Exceptions : Throws if window size not specified + Caller : general + Status : At Risk + +=cut + + +sub has_window_size{ + my ( $self, $size ) = @_; + + if(! defined $size){ + throw('You must pass a window size to validate'); + } + + return grep(/$size/, @$window_sizes); +} + + + + +### Getter/Setters for BLOB collection config +# NOTE: Overriding the defaults here may cause a mismatch when the data +# is retrieved. + +=head2 pack_template + + Args : optional - string perl 'pack' template + Example : $self->pack_template('v'); + Description: Getter/Setter for pack_template. Default should be set + in inheriting Collector (if the config is dynamic), + FeatureAdaptor class or script using package variable or + this method. + Returntype : string + Exceptions : Throws if cannot set pack_template from package variable + Caller : FeatureAdaptor::_obj_from_sth + Status : At Risk + +=cut + +sub pack_template { + my ( $self, $template ) = @_; + + if ( defined($template) ) { + $pack_template = $template; + } + + if ( !defined($pack_template) ) { + throw( 'Must pass a per score ' + . '-pack_template in the config ' + . 'or define $Bio::EnsEMBL::Utils::Collector::pack_template ' + . 'in your Collector' ); + } + + return $pack_template; +} + +=head2 packed_size + + Args : optional - int size of perl 'pack' template in bytes + Example : $self->packed_size(2); + Description: Getter/Setter for packed_size. Default should be set + in inheriting Collector (if the config is dynamic), + FeatureAdaptor class or script using package variable or + this method. + Returntype : string + Exceptions : Throws if cannot set pack_template from pacakge variable + Caller : current_packed_size() and + FeatureAdaptor::_obj_from_sth() + Status : At Risk + +=cut + +sub packed_size { + my ( $self, $size ) = @_; + + if ( defined($size) ) { + $packed_size = $size; + } + + if ( !defined($packed_size) ) { + throw( 'Must pass -packed_size(wrt to pack_template) config ' + . 'or define $Bio::EnsEMBL::Utils::Collector::packed_size ' + . 'in your Collector' ); + } + + if ( $packed_size !~ /^\d+$/ ) { + throw( "$packed_size is not an integer, " + . "must pass a size integer for packed_size " + . "which specifies size of pack_template:\t" + . $pack_template ); + } + + return $packed_size; +} + +=head2 bins_per_record + + Example : my $bin_per_records = $self->bin_per_record + Description: Simple method to calculate the max number of bins + allowed per record given the current config. + Returntype : int + Exceptions : None + Caller : + Status : At Risk + +=cut + +sub bins_per_record { + return int( $max_data_type_size/$packed_size ); +} + + +=head2 current_packed_size + + Arg[0] : int - window size + Example : my $cps = $self->current_packed_size($wsize); + Description: Simple method to calculate the max number of bins + allowed per record given the current config. + Returntype : int + Exceptions : None + Caller : + Status : At Risk + +=cut + +sub current_packed_size { + my ( $self, $wsize ) = @_; + return ( scalar( @{ $self->score_cache($wsize) } )*$packed_size ); +} + + +=head2 score_cache + + Arg[0] : int - window size + Example : my $cps = $self->current_packed_size($wsize); + Description: Handles caching of bin scores for each window size + Returntype : arrayref + Exceptions : Throws if no window size defined + Caller : current_packed_size() and store_collection() + methods + Status : At Risk + +=cut + +sub score_cache { + my ( $self, $wsize, $scores ) = @_; + + if ( !defined($wsize) ) { + throw('Must pass a window size argument'); + } + + $self->{'score_cache'}{$wsize} ||= []; + + if ( defined($scores) ) { + push( @{ $self->{'score_cache'}{$wsize} }, @{$scores} ); + } + + return $self->{'score_cache'}{$wsize}; +} + + +=head2 collection_start + + Arg[0] : int - window_size + Arg[1] : optional int - seq_region_start + Example : my $coll_start->(150); + Description: Getter/Setter collection seq_region_start + Returntype : int + Exceptions : Throws if no window size defined + Caller : store_window_bin_by_Slice() and write_collection() + Status : At Risk + +=cut + +sub collection_start { + my ( $self, $wsize, $sr_start ) = @_; + + if ( !defined($wsize) ) { + throw('Must pass a window size argument'); + } + + if ( defined($sr_start) ) { + $self->{'collection_start'}{$wsize} = $sr_start; + } + + return $self->{'collection_start'}{$wsize}; +} + + +=head2 collection_end + + Arg[0] : int - window_size + Arg[1] : optional int - seq_region_end + Example : my $coll_end->(150); + Description: Getter/Setter collection seq_region_end + Returntype : int + Exceptions : Throws if no window size defined + Caller : inheriting Collector write_collection method + Status : At Risk + +=cut + +sub collection_end{ + my ($self, $wsize, $sr_end) = @_; + throw('Must pass a window size argument') if ! defined $wsize; + + if(defined $sr_end){ + $self->{'collection_end'}{$wsize} = $sr_end; + } + else{ + return $self->{'collection_end'}{$wsize}; + } +} + + +=head2 collection_strand + + Arg[0] : int - window_size + Arg[1] : optional int - seq_region_strand + Example : my $coll_start->(0); + Description: Getter/Setter collection seq_region_strand + Returntype : int + Exceptions : Throws if no window size defined + Caller : inheriting Collector write_collection method + Status : At Risk - Collections are currently strandless + +=cut + +sub collection_strand { + my ( $self, $wsize, $strand ) = @_; + + if ( !defined($wsize) ) { + throw('Must pass a window size argument'); + } + + if ( defined $strand ) { + $self->{'collection_strand'}{$wsize} = $strand; + } + + return $self->{'collection_strand'}{$wsize}; +} + + +### Here follows the actual working methods + +=head2 _get_Slice_chunks + + Description: Defines the optimal set of slice chunks to use for + generating collections such that redundant fetches + are minimized. + Returntype : hashref of window_size chunk size pairs + Exceptions : Throws if no window sizes or max_view_width defined + Caller : store_window_bin_by_Slice() + Status : At Risk + +=cut + +sub _get_Slice_chunks { + my $self = shift; + + if ( !defined($window_sizes) || !defined($max_view_width) ) { + throw( 'You must pass both a window_size array ref ' + . 'and max_view_width arguments' ); + } + + if ( !defined( $self->{'_slice_chunks'} ) ) { + # Calulate sensible slice length based on window sizes + my @wsizes = sort { $a <=> $b } @$window_sizes; + + # Handle calculating only 0 wsize + if ( scalar(@wsizes) == 1 + && $wsizes[0] == 0 ) + { + return { $max_view_width => [0] }; + } + + my $multiplier = int( $max_view_width/$wsizes[$#wsizes] ); + + my $chunk_length = $multiplier*$wsizes[$#wsizes]; + my $not_divisible = 1; + + my %chunk_windows; # Registry of chunk lengths to run with windows + my %workable_chunks = map { $_ => {} } @wsizes; + + # get rid of natural resolution as this will always work + delete $workable_chunks{'0'}; + + while ( $not_divisible && $chunk_length != 0 ) { + $not_divisible = 0; + + foreach my $wsize (@wsizes) { + if ( $wsize == 0 ) { + # Special wsize for normal data + next; + } + + # Set not divisible if modulus is true + if ( $chunk_length % $wsize ) { + $not_divisible = 1; + } else { + $workable_chunks{$wsize}{$chunk_length} = []; + } + } + + # Gradually shrink the length until we find a workable slice + # length for all windows. + if ($not_divisible) { + $chunk_length -= $wsizes[$#wsizes]; + } + } + + my %chunk_sets; + + if ( $chunk_length == 0 ) { + print "Could not find chunk length " + . "for all window sizes, " + . "attempting to subset windows " + . "using alternate slice length\n"; + + foreach my $wsize ( keys(%workable_chunks) ) { + # Loop through windows, seeing if they are workable in the other + # windows. + + foreach my $chunk ( keys( %{ $workable_chunks{$wsize} } ) ) { + + foreach my $other_wsize ( keys %workable_chunks ) { + next if $wsize == $other_wsize; + + if ( exists( $workable_chunks{$other_wsize}{$chunk} ) ) { + # only push it onto the other wsize, as we will do the + # reverse later + $chunk_sets{$chunk}{$wsize} = undef; + } + } + } + } + + # %chunk_sets represents co-occurence of wsizes with repect to + # chunks. Take the set which has the most windows and the longest + # chunk. Then get the largest which handles the rest. + + # define possible set lengths + my $i = 0; + my %set_lengths; + + map { $set_lengths{$i} = []; $i++ } @wsizes; + + # get rid of natural resolution as this will always work + delete $set_lengths{'0'}; + + # Store chunks lengths for each set size + foreach my $chunk ( keys(%chunk_sets) ) { + my $set_size = scalar( values( %{ $chunk_sets{$chunk} } ) ); + push( @{ $set_lengths{$set_size} }, $chunk ); + } + + # Get the biggest set with the longest length; + + # Scalar here as we are disregarding natural resolution of 0 in + # loop. + my $largest_size = scalar(@wsizes); + my $found_largest_set = 0; + + while ( !$found_largest_set ) { + $largest_size--; + + if ( scalar( @{ $set_lengths{$largest_size} } ) > 0 ) { + $found_largest_set = 1; + } + } + + my ($largest_chunk) = + sort { $b <=> $a } @{ $set_lengths{$largest_size} }; + + my @largest_windows = keys %{ $chunk_sets{$largest_chunk} }; + @{ $chunk_windows{$largest_chunk} } = @largest_windows; + + print "Largest chunk $largest_chunk($largest_size) " + . "contains windows: @largest_windows\n"; + + my %remaining_windows = map { $_ => {} } @wsizes; + + # get rid of natural resolution as this will always work + delete $remaining_windows{'0'}; + + map { delete $remaining_windows{$_} } @largest_windows; + my $remaining_set_size = scalar( keys(%remaining_windows) ); + + # Use array here for practicality, would need to maintain hash if + # we need to iterate. + my @rwindows = keys(%remaining_windows); + + # Could be one window, but this will not be in the co-occurence + # hash %chunk_sets. + my $next_chunk; + + if ( scalar(@rwindows) == 1 ) { + my ($last_window) = @rwindows; + # Find a suitably large chunk for this one window. + $multiplier = int( 500000/$last_window ); + $next_chunk = $multiplier*$last_window; + } else { + + foreach my $chunk ( sort { $b <=> $a } + @{ $set_lengths{$remaining_set_size} } ) + { + my $seen_count = 0; + + foreach my $rwindow (@rwindows) { + if ( grep /$rwindow/, + ( values( %{ $chunk_sets{$chunk} } ) ) ) + { + $seen_count++; + } + } + + if ( $seen_count == $remaining_set_size ) { + $next_chunk = $chunk; + last; + } + } + + } + + @{ $chunk_windows{$next_chunk} } = @rwindows; + + if ( defined($next_chunk) ) { + print "Found next chunk length $next_chunk " + . "contains remaining windows:\t@rwindows\n"; + } else { + warn "Need to write iterative method for set definition"; + throw( 'Could not find workable slice length ' + . 'for remaining windows: ' + . join( ', ', @rwindows ) ); + } + } else { + @{ $chunk_windows{$chunk_length} } = keys(%workable_chunks); + print "Found workable chunk length $chunk_length " + . "for all window sizes:\t" + . join( ' ', @{ $chunk_windows{$chunk_length} } ) . "\n"; + } + + $self->{'_slice_chunks'} = \%chunk_windows; + } ## end if ( !defined( $self->...)) + + return $self->{'_slice_chunks'}; +} ## end sub _get_Slice_chunks + + + + +=head2 set_config + + Arg[0] : optional hash - parameter hash(see above methods for more info): + + WINDOW_SIZES => array ref - subset of defined window + sizes + BIN_METHOD => string + MAX_VIEW_WIDTH => int + MAX_DATA_TYPE_SIZE => int + PACK_TEMPLATE => string + PACKED_SIZE => int + BIN_MODEL => string + NEW_ASSEMBLY => string + METHOD_CONFIG => hash of method specific config params + SKIP_ZERO_WINDOW => boolean - skips generation of 0 wsize + this is used if already generated + from an assembly projection. + + NOTE: Over-riding any of the default config may cause + problems when storing or retrieving Collection data, + except sub sets of default window sizes. + + Description: This method replaces the constructor as new will not be + called for Adaptor based Collectors. + Separating this from the store method is currently + redundant as jobs are normally submitetd in Slice based + jobs. However, this will be required if the store method + is further seaprated into fetch/generate and store methods + Returntype : None + Exceptions : Throws if no window sizes or max_view_width defined + Caller : Inheritor Collector e.g. Bio::EnsEMBL::Funcgen:Collector::ResultFeature + or script. + Status : At Risk + +=cut + +sub set_config { + my ( $self, %config ) = @_; + + my ( $wsizes, $bmethod, $mv_width, + $md_type_size, $template, $psize, + $bmodel, $new_assm, $skip_zero_window, + $method_config ) + = rearrange( [ 'WINDOW_SIZES', 'BIN_METHOD', + 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', + 'PACK_TEMPLATE', 'PACKED_SIZE', + 'BIN_MODEL', 'NEW_ASSEMBLY', + 'SKIP_ZERO_WINDOW', 'METHOD_CONFIG' ], + %config ); + + ### VAILDATE/SET VARS/CONFIG + + # Attrs used in this method + $self->bin_method( $bmethod, $method_config ); + $self->bin_model($bmodel); + $self->window_sizes($wsizes); + + # Set to undef if we have empty array? To change this we need to + # pass the config hash -window_sizes conditionally + # This currently overwrite the defaults! + # if ( ref($window_sizes) eq 'ARRAY' + # && scalar( @{$window_sizes} ) == 0 ) + # { + # $window_sizes = undef; + # } + + # Attrs used in other (store) methods + $self->pack_template($template); + $self->packed_size($psize); + $self->max_data_type_size($md_type_size); + $self->max_view_width($mv_width); + + # Other vars + $self->new_assembly($new_assm); + $self->{'_only_natural'} = 0; + $self->{'_store_natural'} = grep /^0$/, @$window_sizes; + + ### Set window_sizes + + if ( $self->new_assembly() ) { + print "Assembly projection may cause problems " + . "for large Collections, " + . "defaulting to window_sizes = (0)\n"; + + + if ( $skip_zero_window ) { + throw( "You cannot -skip_zero_window or " + . "omit 0 from -window_sizes " + . "when projecting to a new assembly($new_assm) " + . "which should only be generated using window_size=0" ); + } + + + + + # Then build the bins on the projected 0 level single Features + + # Test we haven't explicity set window_sizes to be something else + if ( defined($wsizes) + && !( scalar(@$wsizes) == 1 && $wsizes->[0] == 0 ) ) + { + throw( "You have set window_sizes config " + . "which are not safe when projecting to " + . "a new assembly($new_assm), " + . "please omit window_sizes config or set to 0" ); + } + + $self->window_sizes( [0] ); + } else { + + if ( $wsizes && $skip_zero_window && + ( grep /^0$/, @$wsizes )) { + #Only test passed params not default config + + throw( "You have specied skip_zero_window " + . "and window_size 0 in your parameters, " + . "please remove one of these" ); + } + elsif ( defined($window_sizes) && !grep /^0$/, @$window_sizes ) { + $skip_zero_window = 1; + # re-add 0 window as we need this to build the collections + # see ... + unshift( @{$window_sizes}, 0 ); + } + } + + + if ( $self->{'_store_natural'} && scalar( @{$window_sizes} ) == 1 ) { + $self->{'_only_natural'} = 1; + } + if ($skip_zero_window) { + $self->{'_store_natural'} = 0; + } + + return; +} ## end sub set_config + +=head2 store_window_bins_by_Slice + + Arg[0] : Bio::EnsEMBL:Slice + Example : $collector->store_window_bins_by_Slice($slice); + Description: This is the main run method, it loops through + optimal slice chunks from _define_window_chunks, + calls _bin_features_by_Slice as appropriate and + calls write_collection in the inheriting Collector + class/script. + Returntype : None + Exceptions : Throws if Bio::EnsEMBL::Slice is not defined + Caller : store methods in inheriting Collector class/script + Status : At Risk + +=cut + +sub store_window_bins_by_Slice { + my ( $self, $slice ) = @_; + + warn "Need to be careful here " + . "about cleaning start end strand caches between " + . "serially run slices"; + + if ( !( defined($slice) + && ref($slice) + && $slice->isa('Bio::EnsEMBL::Slice') ) ) + { + throw('You must pass a valid Bio::EnsEMBL::Slice'); + } + + # Rollback previously stored features. + # Change 'can' to empty method stubb with pod ??? + if ( $self->can('rollback_Features_by_Slice') ) { + $self->rollback_Features_by_Slice($slice); + } else { + warn ref($self) + . " cannot rollback_Features_by_Slice. " + . "This may result in storage failure " + . "or duplicate Collections if there is pre-existing data"; + } + + ### PROCESS CHUNKS + my %chunk_windows = %{ $self->_get_Slice_chunks }; + my (%counts); + my $store_natural = $self->{'_store_natural'}; + my $only_natural = $self->{'_only_natural'}; + $counts{0} = 0; # Set natural res count to 0 + my $slice_end = $slice->end; + my $orig_start = $slice->start; + my $region = $slice->coord_system_name; + my $version = $slice->coord_system->version; + my $seq_region_name = $slice->seq_region_name; + my $strand = $slice->strand; + + # Warn if this is not a full slice. Version needed in case we are + # projecting from a non-default version slice + my $full_slice = + $slice->adaptor->fetch_by_region( $region, $seq_region_name, undef, + undef, undef, $version ); + + if ( ( $full_slice->start() != $orig_start ) + || ( $full_slice->end() != $slice_end ) ) + { + warn "Generating collections using sub-Slices " + . "can result in data issues/artifacts"; + # Last chunk might not be the correct window length. Test + # slices less than chunk length can cause failures in + # _bin_features_by_window_sizes others? + } + + # Set the initial collection_start to orig_start. This is not the + # case for 0 wsize where it must always be the true feature start. + for my $wsize (@$window_sizes) { + if ( $wsize == 0 ) { next } + $self->collection_start( $wsize, $orig_start ); + + # Also reset collection end and score cache in case we are running + # serially. + $self->{collection_end}{$wsize} = undef; + $self->{'score_cache'}{$wsize} = []; + } + + my $first_chunk_length = 1; + + foreach my $chunk_length ( sort keys %chunk_windows ) { + print "Processing windows " + . join( ', ', @{ $chunk_windows{$chunk_length} } ) + . " with chunk length $chunk_length\n"; + + # Set window counts to 0 + map $counts{$_} = 0, @{ $chunk_windows{$chunk_length} }; + + # May need to reset flat file parser handle or other caches via + # inheriting Collector + if ( !$first_chunk_length ) { + # Change 'can' to empty method stubb with pod??? + if ( $self->can('reinitialise_input') ) { + $self->reinitialise_input(); + } + } + + $first_chunk_length = 0; + + # Now walk through slice using slice length chunks and build all + # windows in each chunk. + my $in_slice = 1; + my $start_adj = 0; + my ( $sub_slice, $sub_end, $features, $bins ); + my $sub_start = 1; + my $slice_length = $slice->length(); + + # Always create in local coords for fetch + # Then change to seq_region coords for store if required + + while ($in_slice) { + $sub_start += $start_adj; + $sub_end = $sub_start + $chunk_length - 1; + + if ( $sub_end >= $slice_length ) { + # Surplus bins are removed in store/write_collection in caller + $in_slice = 0; + } + + $sub_slice = + $slice->adaptor->fetch_by_region( $region, $seq_region_name, + $sub_start + $orig_start - 1, + $sub_end + $orig_start - 1, + $strand, $version ); + + # Can't subslice as this will not clip if we go over the length of + # the slice, unlike normal slice fetching. Will clipping the end + # to the slice end cause any problems here? How will this affect + # bin clipping? + + ### Grab features and shift chunk coords + $features = $self->get_Features_by_Slice($sub_slice); + + # warn "Binning " + # . scalar(@$features) + # . " Features for chunk length $chunk_length, on Slice " + # . $sub_slice->name; + + if ( ( @{$features} ) + && ref( $features->[0] ) =~ /Bio::EnsEMBL::Utils::Collection/ ) + { + # Would need to create base module with generic methods: + # window_size, ... + + # Check that the returned feature/collections support window_size. + # All Collections should be able to + + if ( $features->[0]->can('window_size') ) { + if ( $features->[0]->window_size != 0 ) { + throw( "You are trying to generated Collections from " + . "a non-zero window sized Collection:\t" + . $features->[1]->{'window_size'} ); + } + + # This should never happen + # if ( !$skip_zero_window ) { + # throw( 'You have retrieved data from a Collection ' + # . 'which without using -skip_zero_window ' + # . 'i.e. you are trying to generate overwrite ' + # . 'the data you are generating the Collections from' ); + # } + + } else { + throw( 'Something is wrong, ' + . 'the Collection you have retrieved ' + . 'does not support the method window_size' ); + } + } ## end if ( ( @{$features} ) ...) + + # Set collection start here for 0 window_size + if ( @{$features} + && $store_natural + && !defined( $self->collection_start(0) ) ) + { + $self->collection_start( 0, + $features->[0]->start + $sub_start ); + } + + if ($in_slice) { + $start_adj = $chunk_length; + } + + # Collect features into wsize bins + if ( !$only_natural ) { + # Get hashref of wsize=>bin array pairs + $bins = + $self->_bin_features_by_Slice_window_sizes( + -slice => $sub_slice, + -window_sizes => $chunk_windows{$chunk_length}, + -features => $features, ); + } + + # Handle 0 wsize + if ($store_natural) { + foreach my $feature ( @{$features} ) { + $counts{0}++; + + if ( $bin_model eq 'SIMPLE' ) { + $self->collection_start( 0, $feature->start + $sub_start ); + + $self->write_collection( + 0, + $slice, # Pass Slice to sub-slice when storing + $feature->end + $sub_start, + $feature->strand, # Need to pass strand for 0 resolution + $feature->scores, ); + } + } + + print "Window size 0 (natural resolution) has " + . scalar( @{$features} ) + . " feature bins for:\t" + . $sub_slice->name . "\n"; + } + + # Now store collections for wsizes >0 + my $num_bins; + + foreach my $wsize ( sort keys( %{$bins} ) ) { + $num_bins = scalar( @{ $bins->{$wsize} } ); + $counts{$wsize} += $num_bins; + + if ( $bin_model eq 'SIMPLE' ) { + $self->write_collection( + $wsize, + $slice, + #$sub_start, + $sub_end, + $slice->strand, # This is most likely 1! + # Override this woth 0 in descendant Collector if required. + $bins->{$wsize}, ); + + } else { + throw( 'Bio::EnsEMBL::Utils::Collector ' + . 'does not yet support non-SIMPLE bin models' ); + # i.e. More than one score + } + } + } ## end while ($in_slice) + + # Turn off storing of natural resolution for next chunk length sets + $store_natural = 0; + } ## end foreach my $chunk_length ( ...) + + # Write last collections for each wsize + + foreach my $wsize (@$window_sizes) { + + if ( ( $wsize == 0 && !$store_natural ) + || ( $wsize != 0 && $only_natural ) ) + { + next; + } + + print "Writing final $wsize window_size collection, " + . "this may result in slightly different " + . "bin numbers from counts due to removing " + . "overhanging bins past end of slice\n"; + $self->write_collection( $wsize, $slice ); + } + + # Print some counts + foreach my $wsize ( sort ( keys %counts ) ) { + print "Generated " + . $counts{$wsize} + . " bins for window size $wsize for " + . $slice->name . "\n"; + # Some may have failed to store if we are projecting to a new + # assembly. + } + + return; +} ## end sub store_window_bins_by_Slice + +=head2 _bin_features_by_Slice_window_sizes + + Args[0] : Bio::EnsEMBL::Slice + Args[1] : ARRAYREF of window sizes + Args[2] : ARRAYREF of features with start and end method + e.g. Bio::EnsEMBL::Features + Example : + + $bins = + $self->_bin_features_by_window_sizes( + -slice => $slice, + -window_sizes => $chunk_windows{$chunk_length}, + -features => $features, ); + + Description: Bins feature scores for a given list of window sizes and + predefined method. + Returntype : HASHREF of scores per bin per window size + Exceptions : None + Caller : store_window_bins_by_Slice + Status : At Risk + +=cut + +sub _bin_features_by_Slice_window_sizes { + my ( $self, @args ) = @_; + + my ( $slice, $wsizes, $features ) = + rearrange( [ 'SLICE', 'WINDOW_SIZES', 'FEATURES' ], @args ); + + # Generate these once in caller? + my $calc_method = '_calculate_' . $bin_method; + my $post_method = '_post_process_' . $bin_method; + + # Do this conditional on the Collection type i.e. is + # collection seq_region blob then no else yes Would need + # $Bio::EnsEMBL::Utils::Collector::collection_format=BLOB|STANDARD + # if ( !defined($features) || !@{$features} ) { return {} } + + # Set up some hashes to store data by window_size + my ( %bins, %nbins, %bin_counts ); + my $slice_start = $slice->start(); + my $slice_length = $slice->length(); + + # Set up some bin data for the windows + foreach my $wsize (@$wsizes) { + $nbins{$wsize} = int( $slice_length/$wsize ); # int rounds down + # nbins is index of the bin not the 'number' + # Unless $slice_length is a multiple! + if ( !( $slice_length % $wsize ) ) { $nbins{$wsize}-- } + + # Create default bins with 0 + $bins{$wsize} = []; + map { $bins{$wsize}->[$_] = 0 } ( 0 .. $nbins{$wsize} ); + + # Set bin counts to 0 for each bin + $bin_counts{$wsize} = []; + + # This is adding an undef to the start of the array!? + map { $bin_counts{$wsize}->[ ($_) ] = 0 } @{ $bins{$wsize} }; + + foreach my $bin ( @{ $bins{$wsize} } ) { + $bin_counts{$wsize}->[$bin] = 0; + } + } + + my $feature_index = 0; + my ( $bin_index, @bin_masks ); + + foreach my $feature ( @{$features} ) { + # Set up the bins for each window size + + foreach my $wsize (@$wsizes) { + my $start_bin = int( ( $feature->start )/$wsize ); + my $end_bin = int( ( $feature->end )/$wsize ); + + if ( $end_bin > $nbins{$wsize} ) { + $end_bin = $nbins{$wsize}; + } + + $self->$calc_method( $feature, $start_bin, $end_bin, + $wsize, \%bins, \%bin_counts ); + } + + } + + # Now do post processing of bins if required + if ( $self->can($post_method) ) { + $self->$post_method( \%bins, \%bin_counts ); + } + + return \%bins; +} ## end sub _bin_features_by_Slice_window_sizes +# end sub _bin_features_by_Slice + + +### Here follows the bin methods +# These may also be defined in the inheriting Collector class. No tests +# as these are internal and require speed. + + +=head2 _calculate_count + + Args[0] : feature e.g. Bio::EnsEMBL::Feature + Args[1] : int - start bin + Args[2] : int - end bin + Args[3] : int - window_size + Args[4] : hashref - score bins + Example : $self->$calc_method + Description: Adds count to bins which this feature overlaps + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + +sub _calculate_count { + my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_; + + my $bin_index; + + for ( $bin_index = $start_bin; $bin_index <= $end_bin; ++$bin_index ) + { + $bins_ref->{$wsize}->[$bin_index]++; + } + + return; +} + + +=head2 _calculate_average_score + + Args[0] : feature e.g. Bio::EnsEMBL::Feature + Args[1] : int - start bin + Args[2] : int - end bin + Args[3] : int - window_size + Args[4] : hashref - score bins + Example : $self->$calc_method + Description: Adds score to bins which this feature overlaps + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + + +sub _calculate_average_score { + my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref, + $bin_counts_ref ) + = @_; + + # This is simple an average of all the scores for features which + # overlap this bin. No weighting with respect to the bin or the + # feature. + + my $score = $self->get_score_by_Feature($feature); + + for ( my $bin_index = $start_bin; + $bin_index <= $end_bin; + ++$bin_index ) + { + # We should really push onto array here so we can have median or + # mean. + + $bins_ref->{$wsize}->[$bin_index] += $score; + $bin_counts_ref->{$wsize}->[$bin_index]++; + } + + return; +} + + +=head2 _post_process_average_score + + Args[0] : hashref - score bins + Args[1] : hashref - count bins + Example : $self->$post_method + Description: Post processes bins to calculate average score + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + +sub _post_process_average_score { + my ( $self, $bins_ref, $bin_counts_ref ) = @_; + + foreach my $wsize ( keys %{$bins_ref} ) { + foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) { + + if ( $bin_counts_ref->{$wsize}->[$bin_index] ) { + $bins_ref->{$wsize}->[$bin_index] /= + $bin_counts_ref->{$wsize}->[$bin_index]; + } + + } + } + + return; +} + + +=head2 _calculate_max_magnitude + + Args[0] : feature e.g. Bio::EnsEMBL::Feature + Args[1] : int - start bin + Args[2] : int - end bin + Args[3] : int - window_size + Args[4] : hashref - score bins + Example : $self->$calc_method + Description: Sets max +/-ve scores for bins which this feature overlaps + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + +sub _calculate_max_magnitude { + my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_; + + my $score = $self->get_score_by_Feature($feature); + + # Max magnitude + # Take the highest value +ve or -ve score + for ( my $bin_index = $start_bin; + $bin_index <= $end_bin; + ++$bin_index ) + { + + # We really need to capture the lowest -ve and higest +ve scores + # here and post process to pick between them. + + $bins_ref->{$wsize}->[$bin_index] ||= [ 0, 0 ]; #-ve, +ve + + if ( $score < $bins_ref->{$wsize}->[$bin_index]->[0] ) { + $bins_ref->{$wsize}->[$bin_index]->[0] = $score; + } elsif ( $score > $bins_ref->{$wsize}->[$bin_index][1] ) { + $bins_ref->{$wsize}->[$bin_index]->[1] = $score; + } + } + + return; +} ## end sub _calculate_max_magnitude + + +=head2 _post_process_max_magnitude + + Args[0] : hashref - score bins + Args[1] : hashref - count bins + Example : $self->$post_method + Description: Post processes bins to pick largest +ve or -ve score + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + +sub _post_process_max_magnitude { + my ( $self, $bins_ref ) = @_; + + # Take the highest value +ve or -ve score + + foreach my $wsize ( keys %{$bins_ref} ) { + foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) { + + # Have potential for no listref in a given bin + + # default value if we haven't seen anything is 0 + # Actually want an array of -ve +ve values + + if ( $bins_ref->{$wsize}->[$bin_index] ) { + my $tmp_minus = -$bins_ref->{$wsize}->[$bin_index]->[0]; + + if ( $tmp_minus > $bins_ref->{$wsize}->[$bin_index]->[1] ) { + $bins_ref->{$wsize}->[$bin_index] = + $bins_ref->{$wsize}->[$bin_index]->[0]; + } else { + $bins_ref->{$wsize}->[$bin_index] = + $bins_ref->{$wsize}->[$bin_index]->[1]; + } + + } + + } + } + return; +} ## end sub _post_process_max_magnitude + + +=head2 _calculate_RPKM + + Args[0] : feature e.g. Bio::EnsEMBL::Feature + Args[1] : int - start bin + Args[2] : int - end bin + Args[3] : int - window_size + Args[4] : hashref - score bins + Example : $self->$calc_method + Description: Stores counts to calculate Read Per Kb per Million(RPKM) + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + +sub _calculate_RPKM { + my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_; + + $self->_calculate_count( $feature, $start_bin, $end_bin, + $wsize, $bins_ref ); + + return; +} + + +=head2 _post_process_RPKM + + Args[0] : hashref - score bins + Args[1] : hashref - count bins + Example : $self->$post_method + Description: Post processes bins to calculate average score + Returntype : None + Exceptions : None + Caller : _bin_features_by_window_sizes + Status : At Risk + +=cut + +sub _post_process_RPKM { + my ( $self, $bins_ref ) = @_; + + #10^9 x C / NGB + #C = Reads overlapping bin + #N = Total reads in the experiment + #G = Length of bin in bps + #(don't really have to account for non-ref/HAPs or gender here + #as should be close enough, CellTypes/gender differences will be miniscule) + #B = length of each bin + + foreach my $wsize(keys %{$bins_ref}){ + + foreach my $bin_index(0..$#{$bins_ref->{$wsize}}){ + $bins_ref->{$wsize}->[$bin_index] = + ((10**9) * + $bins_ref->{$wsize}->[$bin_index])/(($self->_RPKM_factor($wsize)) * $wsize); + } + } + + return; + +} + + +=head2 _set_up_RPKM + + Args[0] : hashref - method config e.g + { + DNADB => Bio::EnsEMBL::DBSQL::DBAdaptor, + TOTAL_FEATURE => $total_feature_count, + } + + Example : $self->$set_up_method($config); + Description: Sets the RPKM factor + Returntype : None + Exceptions : Throws is required config params are not set + Caller : bin_method + Status : At Risk + +=cut + + +sub _set_up_RPKM{ + my ($self, $config) = @_; + + my ($dnadb, $total_features) = rearrange([ 'DNADB', 'TOTAL_FEATURES'], %{$config}); + + #Test specifically here to notify about config hash + if(! $total_features){ + throw("For RPKM you must pass a valid 'total_features' ". + "as part of the method config hash."); + } + + if(! $dnadb){ + throw("For RPKM you must pass 'dnadb' as part of the method config hash."); + } + + foreach my $wsize(@{$self->window_sizes}){ + #Should never have 0 here + $self->_RPKM_factor($wsize, ($wsize * $total_features)); #N*G + + warn "setting $wsize RPKM factor($wsize * $total_features) to ". + $self->_RPKM_factor($wsize); + } + + return; +} ## end sub _set_up_RPKM + + +=head2 _RPKM_factor + + Args[0] : int - RPKM factor i.e. (Total reads in the experiment * + Genome length) + Example : $self->_RPKM_factor($wsize, $factor); + Description: Gets/Sets the RPKM factor + Returntype : int + Exceptions : None + Caller : _set_up_RPKM, _post_process_RPKM + Status : At Risk + +=cut + +sub _RPKM_factor{ + my ($self, $wsize, $factor) = @_; + + if (! defined $wsize){ + throw("You must pass at least window_size to get or set the RPKM factor"); + } + + if(defined $factor){ + $self->{'RPKM_factor'}{$wsize} = $factor; + } + elsif(! exists $self->{'RPKM_factor'}{$wsize}){ + #This should never happen unless the window sizes + #are redefined after initialisation + throw("You have requested an RPKM factor for a window_size". + " which has not been set:\t$wsize"); + } + + return $self->{'RPKM_factor'}{$wsize}; +} + +=head2 get_diploid_genome_length_by_gender + + Args[0] : string - RPKM factor i.e. (Total reads in the experiment * + Genome length) + Args[1] : string - gender e.g. male or female + Example : + + my $glength = + $self->get_diploid_genome_length_by_gender( $dnadb, $gender ); + + Description: Gets the gender specific diploid genome length, + including non-ref but not including haplotypes. Only + handles species with X/Y sex chromosomes. + Returntype : int + Exceptions : None + Caller : _set_up_RPKM, _post_process_RPKM + Status : At Risk - Move to and export from generic Utils Slice module??? + +=cut + +sub get_diploid_genome_length_by_gender { + my ( $dnadb, $gender ) = @_; + + my %sex_chrs = ( 'Y' => 'male', + 'X' => 'female', ); + + my $dip_length = 0; + + if (!(( ref($dnadb) && $dnadb->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') ) + && $dnadb->grou() eq 'core' + && ( defined $gender && $gender =~ /(male|female)/ ) ) ) + { + throw( "Must provide valid " + . "Bio::EnsEMBL::DBSQL::DBAdaptor($dnadb) and " + . "gender ($gender) arguments" ); + } + + my @ref_slices = $dnadb->get_SliceAdaptor->fetch_all('toplevel'); + + # Include non-ref(unassembled), but omit haps/lrgs(i.e. redundant) + + foreach my $slice ( + @{ $dnadb->get_SliceAdaptor->fetch_all( 'toplevel', undef, 1, 1 ) } + ) + { + # Include duplicated region for true diploid length + + # Skip haps/lrgs + if ( ( $slice->coord_system->name() eq 'chromosome' + && !$slice->is_reference() ) + || $slice->coord_system->name() eq 'lrg' ) + { + next; + } + + if ( exists( $sex_chrs{ $slice->seq_region_name() } ) ) { + if ( $gender eq 'male' ) { + $dip_length += $slice->length; + } elsif ( $sex_chrs{ $slice->seq_region_name } eq 'male' ) { + next; + } + } + + $dip_length += 2*$slice->length; + } + + return $dip_length; +} ## end sub get_diploid_genome_length_by_gender + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/ConfParser.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/ConfParser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,618 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::ConfParser - configuration parser for perl scripts + +=head1 SYNOPSIS + + my $conf = new Bio::EnsEMBL::Utils::ConfParser( + -SERVERROOT => "/path/to/ensembl", + -DEFAULT_CONF => "my.default.conf" + ); + + # parse options from configuration file and commandline + $conf->parse_options( + 'mandatory_string_opt=s' => 1, + 'optional_numeric_opt=n' => 0, + ); + + # get a paramter value + my $val = $conf->param('manadatory_string_op'); + +=head1 DESCRIPTION + +This module parses a configuration file and the commandline options +passed to a script (the latter superseed the former). Configuration +files contain ini-file style name-value pairs, and the commandline +options are passed to Getopt::Long for parsing. + +The parameter values are consequently accessible via the param() +method. You can also create a commandline string of all current +parameters and their values to pass to another script. + +=cut + +package Bio::EnsEMBL::Utils::ConfParser; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Getopt::Long; +use Text::Wrap; +use Cwd qw(abs_path); +use Pod::Usage qw(pod2usage); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(user_proceed); + + +=head2 new + + Arg [SERVERROOT] : + String $serverroot - root directory of your ensembl code + Arg [DEFAULT_CONF] : + String $default_conf - default configuration file + Example : my $conf = new Bio::EnsEMBL::Utils::ConfParser( + -SERVERROOT => '/path/to/ensembl', + -DEFAULT_CONF => 'my.default.conf' + ); + Description : object constructor + Return type : Bio::EnsEMBL::Utils::ConfParser object + Exceptions : thrown if no serverroot is provided + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($serverroot, $default_conf) = + rearrange([qw(SERVERROOT DEFAULT_CONF)], @_); + + throw("You must supply a serverroot.") unless ($serverroot); + + my $self = {}; + bless ($self, $class); + + $self->serverroot($serverroot); + $self->default_conf($default_conf || "$ENV{HOME}/.ensembl_script.conf"); + + return $self; +} + + +=head2 parse_options + + Arg[1..n] : pairs of option definitions and mandatory flag (see below for + details) + Example : $conf->parse_options( + 'mandatory_string_opt=s' => 1, + 'optional_numeric_opt=n' => 0, + ); + Description : This method reads options from an (optional) configuration file + and parses the commandline options supplied by the user. + Commandline options will superseed config file settings. The + string "$SERVERROOT" in the configuration entries will be + replaced by the appropriate value. + + The arguments passed to this method are pairs of a Getopt::Long + style option definition (in fact it will be passed to + GetOptions() directly) and a flag indicating whether this + option is mandatory (1) or optional (0). + + In addition to these user-defined options, a set of common + options is always parsed. See _common_options() for details. + + If you run your script with --interactive the user will be + asked to confirm the parameters after parsing. + + All parameters will then be accessible via $self->param('name'). + Return type : true on success + Exceptions : thrown if configuration file can't be opened + thrown on missing mandatory parameters + Caller : general + Status : At Risk + : under development + +=cut + +sub parse_options { + my ($self, @params) = @_; + + # add common options to user supplied list + push @params, $self->_common_options; + + # read common commandline options + my %h; + my %params = @params; + + Getopt::Long::Configure('pass_through'); + &GetOptions(\%h, keys %params); + + # reads config file + my $conffile = $h{'conffile'} || $self->default_conf; + $conffile = abs_path($conffile); + + if (-e $conffile) { + open(CONF, $conffile) or throw( + "Unable to open configuration file $conffile for reading: $!"); + + my $serverroot = $self->serverroot; + my $last; + + while (my $line = ) { + chomp $line; + + # remove leading and trailing whitespace + $line =~ s/^\s*//; + $line =~ s/\s*$//; + + # join with next line if terminated with backslash (this is to allow + # multiline configuration settings + $line = $last . $line; + if ($line =~ /\\$/) { + $line =~ s/\\$//; + $last = $line; + next; + } else { + $last = undef; + } + + # remove comments + $line =~ s/^[#;].*//; + $line =~ s/\s+[;].*$//; + + # read options into internal parameter datastructure + next unless ($line =~ /(\w\S*)\s*=\s*(.*)/); + my $name = $1; + my $val = $2; + + # strip optional quotes from parameter values + $val =~ s/^["'](.*)["']/$1/; + + # replace $SERVERROOT with value + if ($val =~ /\$SERVERROOT/) { + $val =~ s/\$SERVERROOT/$serverroot/g; + $val = abs_path($val); + } + $self->param($name, $val); + } + + $self->param('conffile', $conffile); + } + + # override configured parameter with commandline options + map { $self->param($_, $h{$_}) } keys %h; + + # check for required params, convert comma to list, maintain an ordered + # list of parameters and list of 'flag' type params + my @missing = (); + my $i = 0; + + foreach my $param (@params) { + next if ($i++ % 2); + + my $required = $params{$param}; + my ($list, $flag); + $list = 1 if ($param =~ /\@$/); + $flag = 1 if ($param =~ /!$/); + $param =~ s/(^\w+).*/$1/; + + $self->comma_to_list($param) if ($list); + + push @missing, $param if ($required and !$self->param($param)); + push @{ $self->{'_ordered_params'} }, $param; + $self->{'_flag_params'}->{$param} = 1 if ($flag); + } + + if (@missing) { + throw("Missing parameters: @missing.\nYou must specify them on the commandline or in your conffile.\n"); + } + + # error handling and --help + pod2usage(1) if ($self->param('help')); + + # ask user to confirm parameters to proceed + $self->confirm_params; + + return(1); +} + + +# +# Commonly used options. These are parsed by default even if they are not +# passed to parse_options() explicitely. +# +sub _common_options { + my $self = shift; + return ( + 'conffile|conf=s' => 0, + 'logfile|log=s' => 0, + 'logauto!' => 0, + 'logautobase=s' => 0, + 'logautoid=s' => 0, + 'logpath=s' => 0, + 'logappend|log_append|log-append!' => 0, + 'loglevel=s' => 0, + 'is_component|is-component!' => 0, + 'interactive|i!' => 0, + 'dry_run|dry-run|dry|n!' => 0, + 'help|h|?' => 0, + ); +} + + +=head2 confirm_params + + Example : $conf->confirm_params; + Description : If the script is run with the --interactive switch, this method + prints a table of all parameters and their values and asks user + to confirm if he wants to proceed. + Return type : true on success + Exceptions : none + Caller : parse_options() + Status : At Risk + : under development + +=cut + +sub confirm_params { + my $self = shift; + + if ($self->param('interactive')) { + # print parameter table + print "Running script with these parameters:\n\n"; + print $self->list_param_values; + + # ask user if he wants to proceed + exit unless user_proceed("Continue?", 1, 'n'); + } + + return(1); +} + + +=head2 param + + Arg[1] : Parameter name + Arg[2..n] : (optional) List of values to set + Example : # getter + my $dbname = $conf->param('dbname'); + + # setter + $conf->param('port', 3306); + $conf->param('chromosomes', 1, 6, 'X'); + Description : Getter/setter for parameters. Accepts single-value params and + list params. + Return type : Scalar value for single-value parameters, array of values for + list parameters + Exceptions : thrown if no parameter name is supplied + Caller : general + Status : At Risk + : under development + +=cut + +sub param { + my $self = shift; + my $name = shift or throw("You must supply a parameter name"); + + # setter + if (@_) { + if (scalar(@_) == 1) { + # single value + $self->{'_param'}->{$name} = shift; + } else { + # list of values + undef $self->{'_param'}->{$name}; + @{ $self->{'_param'}->{$name} } = @_; + } + } + + # getter + if (ref($self->{'_param'}->{$name}) eq 'ARRAY') { + # list parameter + return @{ $self->{'_param'}->{$name} }; + } elsif (defined($self->{'_param'}->{$name})) { + # single-value parameter + return $self->{'_param'}->{$name}; + } else { + return undef; + } +} + + +=head2 is_true + + Arg[1] : Parameter name + Example : unless ($conf->is_true('upload')) { + print "Won't upload data.\n"; + next; + } + Description : Checks whether a param value is set to 'true', which is defined + here as TRUE (in the Perl sense) but not the string 'no'. + Return type : Boolean + Exceptions : thrown if no parameter name is supplied + Caller : general + Status : At Risk + : under development + +=cut + +sub is_true { + my $self = shift; + my $name = shift or throw("You must supply a parameter name"); + + my $param = $self->param($name); + + if ($param and !($param =~ /^no$/i)) { + return(1); + } else { + return(0); + } +} + + +=head2 list_params + + Example : print "Current parameter names:\n"; + foreach my $param (@{ $conf->list_params }) { + print " $param\n"; + } + Description : Returns a list of the currently available parameter names. The + list will be in the same order as option definitions were + passed to the new() method. + Return type : Arrayref of parameter names + Exceptions : none + Caller : list_param_values(), create_commandline_options() + Status : At Risk + : under development + +=cut + +sub list_params { + my $self = shift; + return $self->{'_ordered_params'} || []; +} + + +=head2 list_param_values + + Example : print LOG $conf->list_param_values; + Description : prints a table of the parameters used in the script + Return type : String - the table to print + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub list_param_values { + my $self = shift; + + $Text::Wrap::colums = 72; + + my $txt = sprintf " %-20s%-40s\n", qw(PARAMETER VALUE); + $txt .= " " . "-"x70 . "\n"; + + foreach my $key (@{ $self->list_params }) { + my $val; + if (defined($self->param($key))) { + $txt .= Text::Wrap::wrap(sprintf(' %-19s ', $key), ' 'x24, + join(", ", $self->param($key)))."\n"; + } + } + + $txt .= "\n"; + + return $txt; +} + + +=head2 create_commandline_options + + Arg[1..n] : param/value pairs which should be added to or override the + currently defined parameters + Example : $conf->create_commandline_options( + 'dbname' => 'homo_sapiens_vega_33_35e', + 'interactive' => 0 + ); + Description : Creates a commandline options string of all current paramters + that can be passed to another script. + Return type : String - commandline options string + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub create_commandline_options { + my ($self, %replace) = @_; + + my %param_hash; + + # deal with list values + foreach my $param (@{ $self->list_params }) { + my ($first, @rest) = $self->param($param); + next unless (defined($first)); + + if (@rest) { + $first = join(",", $first, @rest); + } + $param_hash{$param} = $first; + } + + # replace values + foreach my $key (keys %replace) { + $param_hash{$key} = $replace{$key}; + } + + # create the commandline options string + my $options_string; + foreach my $param (keys %param_hash) { + + my $val = $param_hash{$param}; + + # deal with 'flag' type params correctly + if ($self->{'_flag_params'}->{$param}) { + # change 'myparam' to 'nomyparam' if no value set + $param = 'no'.$param unless ($val); + + # unset value (this is how flags behave) + $val = undef; + } else { + # don't add the param if it's not a flag param and no value is set + next unless (defined($val)); + + # quote the value if it contains blanks + if ($val =~ /\s+/) { + # use an appropriate quoting style + ($val =~ /'/) ? ($val = qq("$val")) : ($val = qq('$val')); + } + } + + $options_string .= sprintf(qq(--%s %s ), $param, $val); + } + + return $options_string; +} + + +=head2 comma_to_list + + Arg[1..n] : list of parameter names to parse + Example : $conf->comma_to_list('chromosomes'); + Description : Transparently converts comma-separated lists into arrays (to + allow different styles of commandline options, see perldoc + Getopt::Long for details). Parameters are converted in place + (accessible through $self->param('name')). + Return type : true on success + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub comma_to_list { + my $self = shift; + + foreach my $param (@_) { + $self->param($param, split (/,/, join (',', $self->param($param)))); + } + + return(1); +} + + +=head2 list_or_file + + Arg[1] : Name of parameter to parse + Example : $conf->list_or_file('gene'); + Description : Determines whether a parameter holds a list or it is a filename + to read the list entries from. + Return type : true on success + Exceptions : thrown if list file can't be opened + Caller : general + Status : At Risk + : under development + +=cut + +sub list_or_file { + my ($self, $param) = @_; + + my @vals = $self->param($param); + return unless (@vals); + + my $firstval = $vals[0]; + + if (scalar(@vals) == 1 && -e $firstval) { + # we didn't get a list of values, but a file to read values from + @vals = (); + + open(IN, $firstval) or throw("Cannot open $firstval for reading: $!"); + + while(){ + chomp; + push(@vals, $_); + } + + close(IN); + + $self->param($param, @vals); + } + + $self->comma_to_list($param); + + return(1); +} + + +=head2 serverroot + + Arg[1] : (optional) String - root directory of your ensembl checkout + Example : my $serverroot = $conf->serverroot; + Description : Getter/setter for the root directory of your ensembl checkout. + Return type : String + Exceptions : none + Caller : new(), general + Status : At Risk + : under development + +=cut + +sub serverroot { + my $self = shift; + $self->{'_serverroot'} = shift if (@_); + return $self->{'_serverroot'}; +} + + +=head2 default_conf + + Arg[1] : (optional) String - default configuration file + Example : $conf->default_conf('my.default.conf'); + Description : Getter/setter for the default configuration file. + Return type : String + Exceptions : none + Caller : new(), general + Status : At Risk + : under development + +=cut + +sub default_conf { + my $self = shift; + $self->{'_default_conf'} = shift if (@_); + return $self->{'_default_conf'}; +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/ConfigRegistry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/ConfigRegistry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,337 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::ConfigRegistry; + +=head1 SYNOPSIS + + + Bio::EnsEMBL::Utils::ConfigRegistry->load_core($dba); + + +=head1 DESCRIPTION + +The ConfigRegistry will "Register" a set of adaptors based on the type +of database that is being loaded. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::ConfigRegistry; + +use strict; +use warnings; + +use Bio::EnsEMBL::Registry; +my $reg = "Bio::EnsEMBL::Registry"; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::DBSQL::DBConnection; +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump); + + + +sub gen_load { + my ($dba) = @_; + my $config_sub; + + # At some point we hope to set the group in the DBadaptor, hence this + # long check etc. should be simpler. + + if ( $dba->isa('Bio::EnsEMBL::Compara::DBSQL::DBAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('compara'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_compara; + } elsif ( $dba->isa('Bio::EnsEMBL::Lite::DBAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('lite'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_lite; + } elsif ( $dba->isa('Bio::EnsEMBL::External::BlastAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('blast'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_blast; + } elsif ( $dba->isa('Bio::EnsEMBL::ExternalData::SNPSQL::DBAdaptor') ) + { + if ( !defined( $dba->group() ) ) { + $dba->group('SNP'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_SNP; + } elsif ( $dba->isa('Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('pipeline'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_pipeline; + } elsif ( $dba->isa('Bio::EnsEMBL::Hive::DBSQL::DBAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('hive'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_hive; + } elsif ( + $dba->isa('Bio::EnsEMBL::ExternalData::Haplotype::DBAdaptor') ) + { + if ( !defined( $dba->group() ) ) { + $dba->group('haplotype'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_haplotype; + } elsif ( $dba->isa('Bio::EnsEMBL::Variation::DBSQL::DBAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('variation'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_variation; + } elsif ( $dba->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('funcgen'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_funcgen; + } elsif ( $dba->isa('Bio::Ensembl::DBSQL::OntologyTermAdaptor') ) { + if ( !defined( $dba->group() ) ) { + $dba->group('ontology'); + } + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_ontology; + } elsif ( $dba->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') ) { + #vega uses the core DBAdaptor so test if vega is in the dbname + if ( !defined( $dba->group() ) ) { + $dba->group('core'); + } + + if ( $dba->group eq "estgene" ) { + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_estgene; + } elsif ( $dba->group eq "otherfeatures" ) { + $config_sub = + \&Bio::EnsEMBL::Utils::ConfigRegistry::load_otherfeatures; + } elsif ( $dba->group eq 'vega' || $dba->group eq 'vega_update' ) { + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_vega; + } else { + $config_sub = \&Bio::EnsEMBL::Utils::ConfigRegistry::load_core; + } + + } else { + # none standard DBA adaptor + if ( !defined( $dba->group() ) ) { + $dba->group('none_standard'); + } + $config_sub = + \&Bio::EnsEMBL::Utils::ConfigRegistry::load_and_attach_dnadb_to_core; + # throw("Unknown DBAdaptor type $dba\n"); + } + + # return if the connection and species, group are the same + + if ( defined( $dba->species ) ) { + my $db_reg = $reg->get_DBAdaptor( $dba->species, $dba->group, 1 ); + if ( defined($db_reg) ) { + if ( $dba->dbc->equals( $db_reg->dbc ) ) { return $db_reg } + else { + my $msg = + sprintf( 'WARN: Species (%s) and group (%s) ' + . 'same for two seperate databases', + $dba->species(), $dba->group() ); + + warn "${msg}\nModify species name for one of these\n"; + $dba->species( + find_unique_species( $dba->species, $dba->group ) ); + } + } + } else { # no species + + my @db_reg = + @{ $reg->get_all_DBAdaptors_by_connection( $dba->dbc ) }; + + foreach my $db_adaptor (@db_reg) { + if ( $db_adaptor->group eq $dba->group ) { + # found same db connection and group + return $db_adaptor; + } + } + + $dba->species( find_unique_species( "DEFAULT", $dba->group ) ); + if ( $dba->species ne "DEFAULT" ) { + warn "WARN: For multiple species " + . "use species attribute in DBAdaptor->new()\n"; + } + } + + Bio::EnsEMBL::Registry->add_DBAdaptor( $dba->species(), $dba->group(), + $dba ); + + #call the loading subroutine. (add the adaptors to the DBAdaptor) + &{$config_sub}($dba); + + return $dba; +} ## end sub gen_load + + + +sub find_unique_species { + my ( $species, $group ) = @_; + + $reg->add_alias( $species, $species ); + + my $i = 0; + my $free = 0; + + while ( !$free ) { + if ( $i == 0 ) { + if ( !defined( $reg->get_DBAdaptor( $species, $group ) ) ) { + $free = 1; + $i = ""; + } else { + $i = 1; + } + } else { + # set needed self alias + $reg->add_alias( $species . $i, $species . $i ); + + if ( !defined( $reg->get_DBAdaptor( $species . $i, $group ) ) ) { + $free = 1; + } else { + $i++; + } + } + } + + $species .= $i; + return ($species); +} ## end sub find_unique_species + + + +sub load_adaptors { + my ($dba) = @_; + + my %pairs = %{ $dba->get_available_adaptors() }; + + while ( my ( $key, $value ) = each(%pairs) ) { + Bio::EnsEMBL::Registry->add_adaptor( $dba->species(), $dba->group(), + $key, $value ); + } +} + +sub load_and_attach_dnadb_to_core { + my ($dba) = @_; + + load_adaptors($dba); + $reg->add_DNAAdaptor( $dba->species(), $dba->group(), $dba->species(), + 'core' ); +} + + +=head2 load_core + Arg [1] : DBAdaptor with DBConnection already attached + Returntype : DBAdaptor + Exceptions : none +=cut +sub load_core { load_adaptors(@_) } + + +# +# 1) core. no need to add dnadb +# 2) not core add dnadb +# 3) +# + +=head2 load_compara + Arg [1] : DBAdaptor with DBConnection already attached + Returntype : DBAdaptor + Exceptions : none +=cut +sub load_compara { load_adaptors(@_) } + +=head2 load_hive + Arg [1] : DBAdaptor with DBConnection already attached + Returntype : DBAdaptor + Exceptions : none +=cut +sub load_hive { load_adaptors(@_) } + +=head2 load_pipeline + Arg [1] : DBAdaptor with DBConnection already attached + Returntype : DBAdaptor + Exceptions : none +=cut +sub load_pipeline { load_adaptors(@_) } + +=head2 load_SNP + Arg [1] : DBAdaptor with DBConnection already attached + Returntype : DBAdaptor + Exceptions : none +=cut +sub load_SNP { load_adaptors(@_) } + +sub load_haplotype { load_adaptors(@_) } + +sub load_ontology { load_adaptors(@_) } + + +# these that need to attach to the core to get the sequence data + +sub load_estgene { load_and_attach_dnadb_to_core(@_) } + +sub load_variation { load_and_attach_dnadb_to_core(@_) } + +sub load_funcgen { load_and_attach_dnadb_to_core(@_) } + +=head2 load_otherfeatures + Arg [1] : DBAdaptor with DBConnection alredy attached + Returntype : DBAdaptor + Exceptions : none + +=cut +sub load_otherfeatures { load_and_attach_dnadb_to_core(@_) } + +=head2 load_vega + Arg [1] : DBAdaptor with DBConnection already attached + Returntype : DBAdaptor + Exceptions : none +=cut +sub load_vega { load_and_attach_dnadb_to_core(@_) } + + +sub add_alias { + my ( $class, @args ) = @_; + + my ( $species, $aliases ) = rearrange( [qw(SPECIES ALIAS)], @args ); + + # Make sure it exists itself + Bio::EnsEMBL::Registry->add_alias( $species, $species ); + + if ( defined($aliases) ) { + foreach my $ali (@$aliases) { + Bio::EnsEMBL::Registry->add_alias( $species, $ali ); + } + } +} + +# +# overwrite/load new types. Done this way to enable no changes to CVS for +# external users. External users should add there own "GROUPS" in the file +# User_defined_load. +# + +eval{ require Bio::EnsEMBL::Utils::User_defined_load }; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/ConversionSupport.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/ConversionSupport.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2018 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::ConversionSupport - Utility module for Vega release and +schema conversion scripts + +=head1 SYNOPSIS + + my $serverroot = '/path/to/ensembl'; + my $support = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot); + + # parse common options + $support->parse_common_options; + + # parse extra options for your script + $support->parse_extra_options( 'string_opt=s', 'numeric_opt=n' ); + + # ask user if he wants to run script with these parameters + $support->confirm_params; + + # see individual method documentation for more stuff + +=head1 DESCRIPTION + +This module is a collection of common methods and provides helper +functions for the Vega release and schema conversion scripts. Amongst +others, it reads options from a config file, parses commandline options +and does logging. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::ConversionSupport; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Getopt::Long; +use Text::Wrap; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use FindBin qw($Bin $Script); +use POSIX qw(strftime); +use Cwd qw(abs_path); +use DBI; +use Data::Dumper; +use Fcntl qw(:flock SEEK_END); + +my $species_c = 1; #counter to be used for each database connection made + +=head2 new + + Arg[1] : String $serverroot - root directory of your ensembl sandbox + Example : my $support = new Bio::EnsEMBL::Utils::ConversionSupport( + '/path/to/ensembl'); + Description : constructor + Return type : Bio::EnsEMBL::Utils::ConversionSupport object + Exceptions : thrown if no serverroot is provided + Caller : general + +=cut + +sub new { + my $class = shift; + (my $serverroot = shift) or throw("You must supply a serverroot."); + my $self = { + '_serverroot' => $serverroot, + '_param' => { interactive => 1 }, + '_warnings' => 0, + }; + bless ($self, $class); + return $self; +} + +=head2 parse_common_options + + Example : $support->parse_common_options; + Description : This method reads options from a configuration file and parses + some commandline options that are common to all scripts (like + db connection settings, help, dry-run). Commandline options + will override config file settings. + + All options will be accessible via $self->param('name'). + Return type : true on success + Exceptions : thrown if configuration file can't be opened + Caller : general + +=cut + +sub parse_common_options { + my $self = shift; + + # read commandline options + my %h; + Getopt::Long::Configure("pass_through"); + &GetOptions( \%h, + 'dbname|db_name=s', + 'host|dbhost|db_host=s', + 'port|dbport|db_port=n', + 'user|dbuser|db_user=s', + 'pass|dbpass|db_pass=s', + 'conffile|conf=s', + 'logfile|log=s', + 'nolog', + 'logpath=s', + 'log_base_path=s', + 'logappend|log_append', + 'verbose|v', + 'interactive|i=s', + 'dry_run|dry|n', + 'help|h|?', + ); + + # reads config file + my $conffile = $h{'conffile'} || $self->serverroot . "/sanger-plugins/vega/conf/ini-files/Conversion.ini"; + $conffile = abs_path($conffile); + if (-e $conffile) { + open(CONF, $conffile) or throw( + "Unable to open configuration file $conffile for reading: $!"); + my $serverroot = $self->serverroot; + while () { + chomp; + + # remove comments + s/^[#;].*//; + s/\s+[;].*$//; + + # read options into internal parameter datastructure, removing whitespace + next unless (/(\w\S*)\s*=\s*(\S*)\s*/); + my $name = $1; + my $val = $2; + if ($val =~ /\$SERVERROOT/) { + $val =~ s/\$SERVERROOT/$serverroot/g; + $val = abs_path($val); + } + $self->param($name, $val); + } + $self->param('conffile', $conffile); + } + elsif ($conffile) { + warning("Unable to open configuration file $conffile for reading: $!"); + } + +# override configured parameter with commandline options + map { $self->param($_, $h{$_}) } keys %h; + + return (1) if $self->param('nolog'); + + # if logpath & logfile are not set, set them here to /ensemblweb/vega_dev/shared/logs/conversion/DBNAME/SCRIPNAME_NN.log + if (! defined($self->param('log_base_path'))) { + $self->param('log_base_path','/ensemblweb/shared/logs/conversion/'); + } + my $dbname = $self->param('dbname'); + $dbname =~ s/^vega_//; + if (not (defined($self->param('logpath')) )){ + $self->param('logpath', $self->param('log_base_path')."/".$dbname."/" ); + } + if ( not defined $self->param('logfile') ){ + my $log = $Script; + $log =~ s/.pl$//g; + my $counter; + for ($counter=1 ; (-e $self->param('logpath')."/".$log."_".sprintf("%03d", $counter).".log"); $counter++){ + # warn $self->param('logpath')."/".$log."_".$counter.".log"; + } + $self->param('logfile', $log."_".sprintf("%03d", $counter).".log"); + } + return(1); +} + +=head2 parse_extra_options + + Arg[1-N] : option descriptors that will be passed on to Getopt::Long + Example : $support->parse_extra_options('string_opt=s', 'numeric_opt=n'); + Description : Parse extra commandline options by passing them on to + Getopt::Long and storing parameters in $self->param('name). + Return type : true on success + Exceptions : none (caugth by $self->error) + Caller : general + +=cut + +sub parse_extra_options { + my ($self, @params) = @_; + Getopt::Long::Configure("no_pass_through"); + eval { + # catch warnings to pass to $self->error + local $SIG{__WARN__} = sub { die @_; }; + &GetOptions(\%{ $self->{'_param'} }, @params); + }; + $self->error($@) if $@; + return(1); +} + +=head2 allowed_params + + Arg[1-N] : (optional) List of allowed parameters to set + Example : my @allowed = $self->allowed_params(qw(param1 param2)); + Description : Getter/setter for allowed parameters. This is used by + $self->confirm_params() to avoid cluttering of output with + conffile entries not relevant for a given script. You can use + $self->get_common_params() as a shortcut to set them. + Return type : Array - list of allowed parameters + Exceptions : none + Caller : general + +=cut + +sub allowed_params { + my $self = shift; + + # setter + if (@_) { + @{ $self->{'_allowed_params'} } = @_; + } + + # getter + if (ref($self->{'_allowed_params'}) eq 'ARRAY') { + return @{ $self->{'_allowed_params'} }; + } else { + return (); + } +} + +=head2 get_common_params + + Example : my @allowed_params = $self->get_common_params, 'extra_param'; + Description : Returns a list of commonly used parameters in the conversion + scripts. Shortcut for setting allowed parameters with + $self->allowed_params(). + Return type : Array - list of common parameters + Exceptions : none + Caller : general + +=cut + +sub get_common_params { + return qw( + conffile + dbname + host + port + user + pass + nolog + logpath + log_base_path + logfile + logappend + verbose + interactive + dry_run + ); +} + +=head2 get_loutre_params + + Arg : (optional) return a list to parse or not + Example : $support->parse_extra_options($support->get_loutre_params('parse')) + Description : Returns a list of commonly used loutre db parameters - parse option is + simply used to distinguish between reporting and parsing parameters + Return type : Array - list of common parameters + Exceptions : none + Caller : general + +=cut + +sub get_loutre_params { + my ($self,$p) = @_; + if ($p) { + return qw( + loutrehost=s + loutreport=s + loutreuser=s + loutrepass:s + loutredbname=s + ); + } + else { + return qw( + loutrehost + loutreport + loutreuser + loutrepass + loutredbname + ); + } +} + +=head2 remove_vega_params + + Example : $support->remove_vega_params; + Description : Removes Vega db conection parameters. Usefull to avoid clutter in log files when + working exclusively with loutre + Return type : none + Exceptions : none + Caller : general + +=cut + +sub remove_vega_params { + my $self = shift; + foreach my $param (qw(dbname host port user pass)) { + $self->{'_param'}{$param} = undef; + } +} + +=head2 confirm_params + + Example : $support->confirm_params; + Description : Prints a table of parameters that were collected from config + file and commandline and asks user to confirm if he wants + to proceed. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub confirm_params { + my $self = shift; + + # print parameter table + print "Running script with these parameters:\n\n"; + print $self->list_all_params; + + if ($self->param('host') eq 'ensweb-1-10') { + # ask user if he wants to proceed + exit unless $self->user_proceed("**************\n\n You're working on ensdb-1-10! Is that correct and you want to continue ?\n\n**************"); + } + else { + # ask user if he wants to proceed + exit unless $self->user_proceed("Continue?"); + } + return(1); +} + +=head2 list_all_params + + Example : print LOG $support->list_all_params; + Description : prints a table of the parameters used in the script + Return type : String - the table to print + Exceptions : none + Caller : general + +=cut + +sub list_all_params { + my $self = shift; + my $txt = sprintf " %-21s%-90s\n", qw(PARAMETER VALUE); + $txt .= " " . "-"x121 . "\n"; + $Text::Wrap::colums = 130; + my @params = $self->allowed_params; + foreach my $key (@params) { + my @vals = $self->param($key); + if (@vals) { + $txt .= Text::Wrap::wrap( sprintf(' %-21s', $key), + ' 'x24, + join(", ", @vals) + ) . "\n"; + } + } + $txt .= "\n"; + return $txt; +} + +=head2 create_commandline_options + + Arg[1] : Hashref $settings - hashref describing what to do + Allowed keys: + allowed_params => 0|1 # use all allowed parameters + exclude => [] # listref of parameters to exclude + replace => {param => newval} # replace value of param with + # newval + Example : $support->create_commandline_options({ + allowed_params => 1, + exclude => ['verbose'], + replace => { 'dbname' => 'homo_sapiens_vega_33_35e' } + }); + Description : Creates a commandline options string that can be passed to any + other script using ConversionSupport. + Return type : String - commandline options string + Exceptions : none + Caller : general + +=cut + +sub create_commandline_options { + my ($self, $settings, $param_hash) = @_; + my %param_hash = $param_hash ? %$param_hash : (); + + # get all allowed parameters + if ($settings->{'allowed_params'}) { + # exclude params explicitly stated + my %exclude = map { $_ => 1 } @{ $settings->{'exclude'} || [] }; + foreach my $param ($self->allowed_params) { + unless ($exclude{$param}) { + my ($first, @rest) = $self->param($param); + next unless (defined($first)); + + if (@rest) { + $first = join(",", $first, @rest); + } + $param_hash{$param} = $first; + } + } + } + + # replace values + foreach my $key (keys %{ $settings->{'replace'} || {} }) { + $param_hash{$key} = $settings->{'replace'}->{$key}; + } + + # create the commandline options string + my $options_string; + foreach my $param (keys %param_hash) { + $options_string .= sprintf("--%s %s ", $param, $param_hash{$param}); + } + return $options_string; +} + +=head2 check_required_params + + Arg[1-N] : List @params - parameters to check + Example : $self->check_required_params(qw(dbname host port)); + Description : Checks $self->param to make sure the requested parameters + have been set. Dies if parameters are missing. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub check_required_params { + my ($self, @params) = @_; + my @missing = (); + foreach my $param (@params) { + push @missing, $param unless defined $self->param($param); + } + if (@missing) { + throw("Missing parameters: @missing.\nYou must specify them on the commandline or in your conffile.\n"); + } + return(1); +} + +=head2 user_proceed + + Arg[1] : (optional) String $text - notification text to present to user + Example : # run a code snipped conditionally + if ($support->user_proceed("Run the next code snipped?")) { + # run some code + } + + # exit if requested by user + exit unless ($support->user_proceed("Want to continue?")); + Description : If running interactively, the user is asked if he wants to + perform a script action. If he doesn't, this section is skipped + and the script proceeds with the code. When running + non-interactively, the section is run by default. + Return type : TRUE to proceed, FALSE to skip. + Exceptions : none + Caller : general + +=cut + +sub user_proceed { + my ($self, $text) = @_; + + if ($self->param('interactive')) { + print "$text\n" if $text; + print "[y/N] "; + my $input = lc(<>); + chomp $input; + unless ($input eq 'y') { + print "Skipping.\n"; + return(0); + } + } + + return(1); +} + +=head2 user_confirm + + Description : DEPRECATED - please use user_proceed() instead + +=cut + +sub user_confirm { + my $self = shift; + exit unless $self->user_proceed("Continue?"); +} + +=head2 read_user_input + + Arg[1] : (optional) String $text - notification text to present to user + Example : my $ret = $support->read_user_input("Choose a number [1/2/3]"); + if ($ret == 1) { + # do something + } elsif ($ret == 2) { + # do something else + } + Description : If running interactively, the user is asked for input. + Return type : String - user's input + Exceptions : none + Caller : general + +=cut + +sub read_user_input { + my ($self, $text) = @_; + + if ($self->param('interactive')) { + print "$text\n" if $text; + my $input = <>; + chomp $input; + return $input; + } +} + +=head2 comma_to_list + + Arg[1-N] : list of parameter names to parse + Example : $support->comma_to_list('chromosomes'); + Description : Transparently converts comma-separated lists into arrays (to + allow different styles of commandline options, see perldoc + Getopt::Long for details). Parameters are converted in place + (accessible through $self->param('name')). + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub comma_to_list { + my $self = shift; + foreach my $param (@_) { + $self->param($param, + split (/,/, join (',', $self->param($param)))); + } + return(1); +} + +=head2 list_or_file + + Arg[1] : Name of parameter to parse + Example : $support->list_or_file('gene'); + Description : Determines whether a parameter holds a list or it is a filename + to read the list entries from. + Return type : true on success + Exceptions : thrown if list file can't be opened + Caller : general + +=cut + +sub list_or_file { + my ($self, $param) = @_; + my @vals = $self->param($param); + return unless (@vals); + + my $firstval = $vals[0]; + if (scalar(@vals) == 1 && -e $firstval) { + # we didn't get a list of values, but a file to read values from + @vals = (); + open(IN, $firstval) or throw("Cannot open $firstval for reading: $!"); + while(){ + chomp; + push(@vals, $_); + } + close(IN); + $self->param($param, @vals); + } + $self->comma_to_list($param); + return(1); +} + +=head2 param + + Arg[1] : Parameter name + Arg[2-N] : (optional) List of values to set + Example : my $dbname = $support->param('dbname'); + $support->param('port', 3306); + $support->param('chromosomes', 1, 6, 'X'); + Description : Getter/setter for parameters. Accepts single-value params and + list params. + Return type : Scalar value for single-value parameters, array of values for + list parameters + Exceptions : thrown if no parameter name is supplied + Caller : general + +=cut + +sub param { + my $self = shift; + my $name = shift or throw("You must supply a parameter name"); + + # setter + if (@_) { + if (scalar(@_) == 1) { + # single value + $self->{'_param'}->{$name} = shift; + } else { + # list of values + undef $self->{'_param'}->{$name}; + @{ $self->{'_param'}->{$name} } = @_; + } + } + + # getter + if (ref($self->{'_param'}->{$name}) eq 'ARRAY') { + # list parameter + return @{ $self->{'_param'}->{$name} }; + } elsif (defined($self->{'_param'}->{$name})) { + # single-value parameter + return $self->{'_param'}->{$name}; + } else { + return (); + } +} + +=head2 error + + Arg[1] : (optional) String - error message + Example : $support->error("An error occurred: $@"); + exit(0) if $support->error; + Description : Getter/setter for error messages + Return type : String - error message + Exceptions : none + Caller : general + +=cut + +sub error { + my $self = shift; + $self->{'_error'} = shift if (@_); + return $self->{'_error'}; +} + +=head2 warnings + + Example : print LOG "There were ".$support->warnings." warnings.\n"; + Description : Returns the number of warnings encountered while running the + script (the warning counter is increased by $self->log_warning). + Return type : Int - number of warnings + Exceptions : none + Caller : general + +=cut + +sub warnings { + my $self = shift; + return $self->{'_warnings'}; +} + +=head2 serverroot + + Arg[1] : (optional) String - root directory of your ensembl sandbox + Example : my $serverroot = $support->serverroot; + Description : Getter/setter for the root directory of your ensembl sandbox. + This is set when ConversionSupport object is created, so + usually only used as a getter. + Return type : String - the server root directory + Exceptions : none + Caller : general + +=cut + +sub serverroot { + my $self = shift; + $self->{'_serverroot'} = shift if (@_); + return $self->{'_serverroot'}; +} + +=head2 get_database + + Arg[1] : String $database - the type of database to connect to + (eg core, otter) + Arg[2] : (optional) String $prefix - the prefix used for retrieving the + connection settings from the configuration + Example : my $db = $support->get_database('core'); + Description : Connects to the database specified. + Return type : DBAdaptor of the appropriate type + Exceptions : thrown if asking for unknown database + Caller : general + +=cut + +sub get_database { + my $self = shift; + my $database = shift or throw("You must provide a database"); + my $prefix = shift || ''; + $self->check_required_params( + "${prefix}host", + "${prefix}port", + "${prefix}user", + "${prefix}dbname", + ); + my %adaptors = ( + core => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + ensembl => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + evega => 'Bio::EnsEMBL::DBSQL::DBAdaptor', + otter => 'Bio::Otter::DBSQL::DBAdaptor', + vega => 'Bio::Otter::DBSQL::DBAdaptor', + compara => 'Bio::EnsEMBL::Compara::DBSQL::DBAdaptor', + loutre => 'Bio::Vega::DBSQL::DBAdaptor', + ); + throw("Unknown database: $database") unless $adaptors{$database}; + + $self->dynamic_use($adaptors{$database}); + my $species = 'species' . $species_c; + my $dba = $adaptors{$database}->new( + -host => $self->param("${prefix}host"), + -port => $self->param("${prefix}port"), + -user => $self->param("${prefix}user"), + -pass => $self->param("${prefix}pass") || '', + -dbname => $self->param("${prefix}dbname"), + -group => $database, + -species => $species, + ); + #can use this approach to get dna from another db +# my $dna_db = $adaptors{$database}->new( +# -host => 'otterlive', +# -port => '3301', +# -user => $self->param("${prefix}user"), +# -pass => $self->param("${prefix}pass"), +# -dbname => 'loutre_human', +# ); +# $dba->dnadb($dna_db); + + # otherwise explicitely set the dnadb to itself - by default the Registry assumes + # a group 'core' for this now + $dba->dnadb($dba); + + $species_c++; + + $self->{'_dba'}->{$database} = $dba; + $self->{'_dba'}->{'default'} = $dba unless $self->{'_dba'}->{'default'}; + return $self->{'_dba'}->{$database}; +} + + +=head2 get_dbconnection + + Arg[1] : (optional) String $prefix - the prefix used for retrieving the + connection settings from the configuration + Example : my $dbh = $self->get_dbconnection; + Description : Connects to the database server specified. You don't have to + specify a database name (this is useful for running commands + like $dbh->do('show databases')). + Return type : DBI database handle + Exceptions : thrown if connection fails + Caller : general + Status : At Risk + +=cut + +sub get_dbconnection { + my $self = shift; + my $prefix = shift; + + $self->check_required_params( + "${prefix}host", + "${prefix}port", + "${prefix}user", + ); + + my $dsn = "DBI:" . ($self->param('driver')||'mysql') . + ":host=" . $self->param("${prefix}host") . + ";port=" . $self->param("${prefix}port"); + + if ($self->param("${prefix}dbname")) { + $dsn .= ";dbname=".$self->param("${prefix}dbname"); + } + +# warn $dsn; + + my $dbh; + eval{ + $dbh = DBI->connect($dsn, $self->param("${prefix}user"), + $self->param("${prefix}pass"), {'RaiseError' => 1, 'PrintError' => 0}); + }; + + if (!$dbh || $@ || !$dbh->ping) { + $self->log_error("Could not connect to db server as user ". + $self->param("${prefix}user") . + " using [$dsn] as a locator:\n" . $DBI::errstr . $@); + } + + $self->{'_dbh'} = $dbh; + return $self->{'_dbh'}; + +} + + +=head2 dba + + Arg[1] : (optional) String $database - type of db apaptor to retrieve + Example : my $dba = $support->dba; + Description : Getter for database adaptor. Returns default (i.e. created + first) db adaptor if no argument is provided. + Return type : Bio::EnsEMBL::DBSQL::DBAdaptor or Bio::Otter::DBSQL::DBAdaptor + Exceptions : none + Caller : general + +=cut + +sub dba { + my ($self, $database) = shift; + return $self->{'_dba'}->{$database} || $self->{'_dba'}->{'default'}; +} + +=head2 dynamic_use + + Arg [1] : String $classname - The name of the class to require/import + Example : $self->dynamic_use('Bio::EnsEMBL::DBSQL::DBAdaptor'); + Description: Requires and imports the methods for the classname provided, + checks the symbol table so that it doesnot re-require modules + that have already been required. + Returntype : true on success + Exceptions : Warns to standard error if module fails to compile + Caller : internal + +=cut + +sub dynamic_use { + my ($self, $classname) = @_; + my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ? ($1,$2) : ('::', $classname); + + no strict 'refs'; + # return if module has already been imported + return 1 if $parent_namespace->{$module.'::'} && %{ $parent_namespace->{$module.'::'}||{} }; + + eval "require $classname"; + throw("Failed to require $classname: $@") if ($@); + $classname->import(); + + return 1; +} + +=head2 get_chrlength + + Arg[1] : (optional) Bio::EnsEMBL::DBSQL::DBAdaptor $dba + Arg[2] : (optional) String $version - coord_system version + Arg[3] : (optional) String $type - type of region eg chromsome (defaults to 'toplevel') + Arg[4] : (optional) Boolean - return non reference slies as well (required for haplotypes eq 6-COX) + Arg[5] : (optional) Override chromosome parameter filtering with this array reference. Empty denotes all. + Example : my $chr_length = $support->get_chrlength($dba); + Description : Get all chromosomes and their length from the database. Return + chr_name/length for the chromosomes the user requested (or all + chromosomes by default) + Return type : Hashref - chromosome_name => length + Exceptions : thrown if not passing a Bio::EnsEMBL::DBSQL::DBAdaptor + Caller : general + +=cut + +sub get_chrlength { + my ($self, $dba, $version,$type,$include_non_reference,$chroms) = @_; + $dba ||= $self->dba; + $type ||= 'toplevel'; + throw("get_chrlength should be passed a Bio::EnsEMBL::DBSQL::DBAdaptor\n") + unless ($dba->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')); + + my $sa = $dba->get_SliceAdaptor; + + my @chromosomes = map { $_->seq_region_name } + @{ $sa->fetch_all($type,$version,$include_non_reference) }; + my %chr = map { $_ => $sa->fetch_by_region($type, $_, undef, undef, undef, $version)->length } @chromosomes; + + my @wanted = $self->param('chromosomes'); + @wanted = @$chroms if defined $chroms and ref($chroms) eq 'ARRAY'; + + if (@wanted) { + # check if user supplied invalid chromosome names + foreach my $chr (@wanted) { + my $found = 0; + foreach my $chr_from_db (keys %chr) { + if ($chr_from_db eq $chr) { + $found = 1; + last; + } + } + unless ($found) { + warning("Didn't find chromosome $chr in database " . + $self->param('dbname')); + } + } + + # filter to requested chromosomes only + HASH: + foreach my $chr_from_db (keys %chr) { + foreach my $chr (@wanted) { + if ($chr_from_db eq $chr) { + next HASH; + } + } + delete($chr{$chr_from_db}); + } + } + + return \%chr; +} + +=head2 get_ensembl_chr_mapping + + Arg[1] : (optional) Bio::EnsEMBL::DBSQL::DBAdaptor $dba + Arg[2] : (optional) String $version - coord_system version + Example : my $ensembl_mapping = $support->get_ensembl_chr_mapping($dba); + Description : Gets a mapping between Vega chromosome names and their + equivalent Ensembl chromosomes. Works with non-reference chromosomes + Return type : Hashref - Vega name => Ensembl name + Exceptions : thrown if not passing a Bio::EnsEMBL::DBSQL::DBAdaptor + Caller : general + +=cut + +sub get_ensembl_chr_mapping { + my ($self, $dba, $version) = @_; + $dba ||= $self->dba; + throw("get_ensembl_chr_mapping should be passed a Bio::EnsEMBL::DBSQL::DBAdaptor\n") unless ($dba->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')); + + my $sa = $dba->get_SliceAdaptor; + my @chromosomes = map { $_->seq_region_name } + @{ $sa->fetch_all('chromosome', $version, 1) }; + + my %chrs; + foreach my $chr (@chromosomes) { + my $sr = $sa->fetch_by_region('chromosome', $chr, undef, undef, undef, $version); + my ($ensembl_name_attr) = @{ $sr->get_all_Attributes('ensembl_name') }; + if ($ensembl_name_attr) { + $chrs{$chr} = $ensembl_name_attr->value; + } else { + $chrs{$chr} = $chr; + } + } + return \%chrs; +} + +=head2 get_taxonomy_id + + Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dba + Example : my $sid = $support->get_taxonony_id($dba); + Description : Retrieves the taxononmy ID from the meta table + Return type : Int - the taxonomy ID + Exceptions : thrown if no taxonomy ID is found in the database + Caller : general + +=cut + +sub get_taxonomy_id { + my ($self, $dba) = @_; + $dba ||= $self->dba; + my $sql = 'SELECT meta_value FROM meta WHERE meta_key = "species.taxonomy_id"'; + my $sth = $dba->dbc->db_handle->prepare($sql); + $sth->execute; + my ($tid) = $sth->fetchrow_array; + $sth->finish; + $self->throw("Could not determine taxonomy_id from database.") unless $tid; + return $tid; +} + +=head2 get_species_scientific_name + + Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dba + Example : my $species = $support->get_species_scientific_name($dba); + Description : Retrieves the species scientific name (Genus species) from the + meta table + Return type : String - species scientific name + Exceptions : thrown if species name can not be determined from db + Caller : general + +=cut + +sub get_species_scientific_name { + my ($self, $dba) = @_; + $dba ||= $self->dba; + my $sql = "SELECT meta_value FROM meta WHERE meta_key = \'species.scientific_name\'"; + my $sth = $dba->dbc->db_handle->prepare($sql); + $sth->execute; + my @sp; + while (my @row = $sth->fetchrow_array) { + push @sp, $row[0]; + } + if (! @sp || @sp > 1) { + $self->throw("Could not retrieve a single species scientific name from database."); + } + $sth->finish; + my $species = $sp[0]; + $species =~ s/ /_/g; + return $species; +} + +=head2 species + + Arg[1] : (optional) String $species - species name to set + Example : my $species = $support->species; + my $url = "http://vega.sanger.ac.uk/$species/"; + Description : Getter/setter for species name (Genus_species). If not set, it's + determined from database's meta table + Return type : String - species name + Exceptions : none + Caller : general + +=cut + +sub species { + my $self = shift; + $self->{'_species'} = shift if (@_); + # get species name from database if not set + unless ($self->{'_species'}) { + $self->{'_species'} = $self->get_species_scientific_name; + } + return $self->{'_species'}; +} + +=head2 sort_chromosomes + + Arg[1] : (optional) Hashref $chr_hashref - Hashref with chr_name as keys + Example : my $chr = { '6-COX' => 1, '1' => 1, 'X' => 1 }; + my @sorted = $support->sort_chromosomes($chr); + Description : Sorts chromosomes in an intuitive way (numerically, then + alphabetically). If no chromosome hashref is passed, it's + retrieve by calling $self->get_chrlength() + Return type : List - sorted chromosome names + Exceptions : thrown if no hashref is provided + Caller : general + +=cut + +sub sort_chromosomes { + my ($self, $chr_hashref) = @_; + $chr_hashref = $self->get_chrlength unless ($chr_hashref); + throw("You have to pass a hashref of your chromosomes") + unless ($chr_hashref and ref($chr_hashref) eq 'HASH'); + return (sort _by_chr_num keys %$chr_hashref); +} + +=head2 _by_chr_num + + Example : my @sorted = sort _by_chr_num qw(X, 6-COX, 14, 7); + Description : Subroutine to use in sort for sorting chromosomes. Sorts + numerically, then alphabetically + Return type : values to be used by sort + Exceptions : none + Caller : internal ($self->sort_chromosomes) + +=cut + +sub _by_chr_num { + my @awords = split /-/, $a; + my @bwords = split /-/, $b; + + my $anum = $awords[0]; + my $bnum = $bwords[0]; + + if ($anum !~ /^[0-9]*$/) { + if ($bnum !~ /^[0-9]*$/) { + return $anum cmp $bnum; + } else { + return 1; + } + } + if ($bnum !~ /^[0-9]*$/) { + return -1; + } + + if ($anum <=> $bnum) { + return $anum <=> $bnum; + } else { + if ($#awords == 0) { + return -1; + } elsif ($#bwords == 0) { + return 1; + } else { + return $awords[1] cmp $bwords[1]; + } + } +} + +=head2 split_chromosomes_by_size + + Arg[1] : (optional) Int $cutoff - the cutoff in bp between small and + large chromosomes + Arg[2] : (optional) Boolean to include duplicate regions, ie PAR or not + (default is no) + Arg[3] : (optional) Coordsystem version to retrieve + + Example : my $chr_slices = $support->split_chromosomes_by_size; + foreach my $block_size (keys %{ $chr_slices }) { + print "Chromosomes with blocksize $block_size: "; + print join(", ", map { $_->seq_region_name } + @{ $chr_slices->{$block_size} }); + } + Description : Determines block sizes for storing DensityFeatures on + chromosomes, and return slices for each chromosome. The block + size is determined so that you have 150 bins for the smallest + chromosome over 5 Mb in length. For chromosomes smaller than 5 + Mb, an additional smaller block size is used to yield 150 bins + for the overall smallest chromosome. This will result in + reasonable resolution for small chromosomes and high + performance for big ones. Does not return non-reference seq_regions + Return type : Hashref (key: block size; value: Arrayref of chromosome + Bio::EnsEMBL::Slices) + Exceptions : none + Caller : density scripts + +=cut + +sub split_chromosomes_by_size { + my $self = shift; + my $cutoff = shift || 5000000; + my $dup = shift || 0; + my $cs_version = shift; + my $include_non_reference = 1; #get non reference slices + my $slice_adaptor = $self->dba->get_SliceAdaptor; + my $top_slices; + if ($self->param('chromosomes')) { + foreach my $chr ($self->param('chromosomes')) { + push @{ $top_slices }, $slice_adaptor->fetch_by_region('chromosome', $chr); + } + } else { + $top_slices = $slice_adaptor->fetch_all('chromosome',$cs_version,$include_non_reference,$dup); + } + + # filter out patches, if present + $top_slices = [ grep { $_->is_reference or $self->is_haplotype($_,$self->dba) } @$top_slices ]; + + my ($big_chr, $small_chr, $min_big_chr, $min_small_chr); + foreach my $slice (@{ $top_slices }) { + next if ($slice->length eq 10000); #hack for chrY pseudoslice + if ($slice->length < $cutoff) { + if (! $min_small_chr or ($min_small_chr > $slice->length)) { + $min_small_chr = $slice->length; + } + # push small chromosomes onto $small_chr + push @{ $small_chr }, $slice; + } + elsif (! $min_big_chr or ($min_big_chr > $slice->length) ){ + $min_big_chr = $slice->length; + } + # push _all_ chromosomes onto $big_chr + push @{ $big_chr }, $slice; + } + my $chr_slices; + $chr_slices->{int($min_big_chr/150)} = $big_chr if $min_big_chr; + $chr_slices->{int($min_small_chr/150)} = $small_chr if $min_small_chr; + return $chr_slices; +} + +=head2 log + + Arg[1] : String $txt - the text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log('Log foo.\n', 1); + Description : Logs a message to the filehandle initialised by calling + $self->log_filehandle(). You can supply an indentation level + to get nice hierarchical log messages. + Return type : true on success + Exceptions : thrown when no filehandle can be obtained + Caller : general + +=cut + +sub log { + my ($self, $txt, $indent) = @_; + $indent ||= 0; + + # strip off leading linebreaks so that indenting doesn't break + $txt =~ s/^(\n*)//; + + $txt = $1." "x$indent . $txt; + my $fh = $self->{'_log_filehandle'}; + throw("Unable to obtain log filehandle") unless $fh; + print $fh "$txt"; + return(1); +} + +=head2 lock_log + + Description : Use flock-style locks to lock log and fastforward to end. + Useful if log is being written to by multiple processes. +=cut + +sub lock_log { + my ($self) = @_; + + my $fh = $self->{'_log_filehandle'}; + return if -t $fh or -p $fh; # Shouldn't lock such things + flock($self->{'_log_filehandle'},LOCK_EX) || return 0; + seek($self->{'_log_filehandle'},0,SEEK_END); # fail ok, prob not reg file + return 1; +} + +=head2 unlock_log + + Description : Unlock log previously locked by lock_log. + +=cut + +sub unlock_log { + my ($self) = @_; + + my $fh = $self->{'_log_filehandle'}; + return if -t $fh or -p $fh; # We don't lock such things + # flush is implicit in flock + flock($self->{'_log_filehandle'},LOCK_UN) || return 0; + return 1; +} + +=head2 log_warning + + Arg[1] : String $txt - the warning text to log + Arg[2] : Int $indent - indentation level for log message + Arg[3] : Bool - add a line break before warning if true + Example : my $log = $support->log_filehandle; + $support->log_warning('Log foo.\n', 1); + Description : Logs a message via $self->log and increases the warning counter. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub log_warning { + my ($self, $txt, $indent, $break) = @_; + $txt = "WARNING: " . $txt; + $txt = "\n$txt" if ($break); + $self->log($txt, $indent); + $self->{'_warnings'}++; + return(1); +} + +=head2 log_error + + Arg[1] : String $txt - the error text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log_error('Log foo.\n', 1); + Description : Logs a message via $self->log and exits the script. + Return type : none + Exceptions : none + Caller : general + +=cut + +sub log_error { + my ($self, $txt, $indent) = @_; + $txt = "ERROR: ".$txt; + $self->log($txt, $indent); + $self->log("Exiting.\n"); + exit; +} + +=head2 log_verbose + + Arg[1] : String $txt - the warning text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log_verbose('Log this verbose message.\n', 1); + Description : Logs a message via $self->log if --verbose option was used + Return type : TRUE on success, FALSE if not verbose + Exceptions : none + Caller : general + +=cut + +sub log_verbose { + my ($self, $txt, $indent) = @_; + return(0) unless $self->param('verbose'); + $self->log($txt, $indent); + return(1); +} + +=head2 log_stamped + + Arg[1] : String $txt - the warning text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log_stamped('Log this stamped message.\n', 1); + Description : Appends timestamp and memory usage to a message and logs it via + $self->log + Return type : TRUE on success + Exceptions : none + Caller : general + +=cut + +sub log_stamped { + my ($self, $txt, $indent) = @_; + # append timestamp and memory usage to log text + $txt =~ s/(\n*)$//; + $txt .= " ".$self->date_and_mem.$1; + $self->log($txt, $indent); + return(1); +} + +=head2 log_filehandle + + Arg[1] : (optional) String $mode - file access mode + Example : my $log = $support->log_filehandle; + # print to the filehandle + print $log 'Lets start logging...\n'; + # log via the wrapper $self->log() + $support->log('Another log message.\n'); + Description : Returns a filehandle for logging (STDERR by default, logfile if + set from config or commandline). You can use the filehandle + directly to print to, or use the smart wrapper $self->log(). + Logging mode (truncate or append) can be set by passing the + mode as an argument to log_filehandle(), or with the + --logappend commandline option (default: truncate) + Return type : Filehandle - the filehandle to log to + Exceptions : thrown if logfile can't be opened + Caller : general + +=cut + +sub log_filehandle { + my ($self, $mode, $date) = @_; + $mode ||= '>'; + $mode = '>>' if ($self->param('logappend')); + my $fh = \*STDERR; + if (my $logfile = $self->param('logfile')) { + $logfile .= "_$date" if $date; + if (my $logpath = $self->param('logpath')) { + unless (-e $logpath) { + system("mkdir $logpath") == 0 or + $self->log_error("Can't create log dir $logpath: $!\n"); + } + $logfile = "$logpath/$logfile"; + } + open($fh, "$mode", $logfile) or throw( + "Unable to open $logfile for writing: $!"); + } + $self->{'_log_filehandle'} = $fh; + return $self->{'_log_filehandle'}; +} + +=head2 filehandle + + Arg[1] : String $mode - file access mode + Arg[2] : String $file - input or output file + Example : my $fh = $support->filehandle('>>', '/path/to/file'); + # print to the filehandle + print $fh 'Your text goes here...\n'; + Description : Returns a filehandle (*STDOUT for writing, *STDIN for reading + by default) to print to or read from. + Return type : Filehandle - the filehandle + Exceptions : thrown if file can't be opened + Caller : general + +=cut + +sub filehandle { + my ($self, $mode, $file) = @_; + $mode ||= ">"; + my $fh; + if ($file) { + open($fh, "$mode", $file) or throw( + "Unable to open $file for writing: $!"); + } elsif ($mode =~ />/) { + $fh = \*STDOUT; + } elsif ($mode =~ /init_log_date; + Description : Opens a filehandle to a logfile with the date in the file name + Return type : Filehandle - the log filehandle + Exceptions : none + Caller : general + +=cut + +sub init_log_date { + my $self = shift; + my $date = $self->date; + return $self->init_log($date); +} + +=head2 init_log + + Example : $support->init_log; + Description : Opens a filehandle to the logfile and prints some header + information to this file. This includes script name, date, user + running the script and parameters the script will be running + with. + Return type : Filehandle - the log filehandle + Exceptions : none + Caller : general + +=cut + +sub init_log { + my $self = shift; + my $date = shift; + + # get a log filehandle + my $log = $self->log_filehandle(undef,$date); + + # print script name, date, user who is running it + my $hostname = `hostname`; + chomp $hostname; + my $script = "$hostname:$Bin/$Script"; + my $user = `whoami`; + chomp $user; + $self->log("Script: $script\nDate: ".$self->date_and_time."\nUser: $user\n"); + + # print parameters the script is running with + $self->log("Parameters:\n\n"); + $self->log($self->list_all_params); + + # remember start time + $self->{'_start_time'} = time; + + return $log; +} + +=head2 finish_log + + Example : $support->finish_log; + Description : Writes footer information to a logfile. This includes the + number of logged warnings, timestamp and memory footprint. + Return type : TRUE on success + Exceptions : none + Caller : general + +=cut + +sub finish_log { + my $self = shift; + $self->log("\nAll done. ".$self->warnings." warnings. "); + if ($self->{'_start_time'}) { + $self->log("Runtime "); + my $diff = time - $self->{'_start_time'}; + my $sec = $diff % 60; + $diff = ($diff - $sec) / 60; + my $min = $diff % 60; + my $hours = ($diff - $min) / 60; + $self->log("${hours}h ${min}min ${sec}sec "); + } + $self->log($self->date_and_mem."\n\n"); + return(1); +} + +=head2 date_and_mem + + Example : print LOG "Time, memory usage: ".$support->date_and_mem."\n"; + Description : Prints a timestamp and the memory usage of your script. + Return type : String - timestamp and memory usage + Exceptions : none + Caller : general + +=cut + +sub date_and_mem { + my $date = strftime "%Y-%m-%d %T", localtime; + my $mem = `ps -p $$ -o vsz |tail -1`; + chomp $mem; + return "[$date, mem $mem]"; +} + +=head2 date + + Example : print "Date: " . $support->date . "\n"; + Description : Prints a nicely formatted datetamp (YYYY-DD-MM) + Return type : String - the timestamp + Exceptions : none + Caller : general + +=cut + +sub date { + return strftime "%Y-%m-%d", localtime; +} + +=head2 date_and_time + + Example : print "Date: " . $support->date . "\n"; + Description : Prints a nicely formatted timestamp (YYYY-DD-MM hh:mm:ss) + Return type : String - the timestamp + Exceptions : none + Caller : general + +=cut + +sub date_and_time { + return strftime "%Y-%m-%d %T", localtime; +} + +=head2 format_time + + Example : print $support->format_time($gene->modifed_date) . "\n"; + Description : Prints timestamps from the database + Return type : String - nicely formatted time stamp + Exceptions : none + Caller : general + +=cut + + +sub date_format { + my( $self, $time, $format ) = @_; + my( $d,$m,$y) = (localtime($time))[3,4,5]; + my %S = ('d'=>sprintf('%02d',$d),'m'=>sprintf('%02d',$m+1),'y'=>$y+1900); + (my $res = $format ) =~s/%(\w)/$S{$1}/ge; + return $res; +} + + +=head2 mem + + Example : print "Memory usage: " . $support->mem . "\n"; + Description : Prints the memory used by your script. Not sure about platform + dependence of this call ... + Return type : String - memory usage + Exceptions : none + Caller : general + +=cut + +sub mem { + my $mem = `ps -p $$ -o vsz |tail -1`; + chomp $mem; + return $mem; +} + +=head2 commify + + Arg[1] : Int $num - a number to commify + Example : print "An easy to read number: ".$self->commify(100000000); + # will print 100,000,000 + Description : put commas into a number to make it easier to read + Return type : a string representing the commified number + Exceptions : none + Caller : general + Status : stable + +=cut + +sub commify { + my $self = shift; + my $num = shift; + + $num = reverse($num); + $num =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + + return scalar reverse $num; +} + +=head2 fetch_non_hidden_slices + + Arg[1] : B::E::SliceAdaptor + Arg[2] : B::E::AttributeAdaptor + Arg[3] : string $coord_system_name (optional) - 'chromosome' by default + Arg[4] : string $coord_system_version (optional) - 'otter' by default + Example : $chroms = $support->fetch_non_hidden_slice($sa,$aa); + Description : retrieve all slices from a loutre database that don't have a hidden attribute. + Doesn't retrieve non-reference slices + Return type : arrayref + Caller : general + Status : stable + +=cut + +sub fetch_non_hidden_slices { + my $self = shift; + my $aa = shift or throw("You must supply an attribute adaptor"); + my $sa = shift or throw("You must supply a slice adaptor"); + my $cs = shift || 'chromosome'; + my $cv = shift || 'Otter'; + my $visible_chroms; + foreach my $chrom ( @{$sa->fetch_all($cs,$cv)} ) { + my $chrom_name = $chrom->name; + my $attribs = $aa->fetch_all_by_Slice($chrom,'hidden'); + if ( scalar(@$attribs) > 1 ) { + $self->log_warning("More than one hidden attribute for chromosome $chrom_name\n"); + } + elsif ($attribs->[0]->value == 0) { + push @$visible_chroms, $chrom; + } + elsif ($attribs->[0]->value == 1) { + $self->log_verbose("chromosome $chrom_name is hidden\n"); + } + else { + $self->log_warning("No hidden attribute for chromosome $chrom_name\n"); + } + } + return $visible_chroms; +} + +=head2 get_non_hidden_slice_names + + Arg[1] : B::E::SliceAdaptor + Arg[2] : B::E::AttributeAdaptor + Arg[3] : string $coord_system_name (optional) - 'chromosome' by default + Arg[4] : string $coord_system_version (optional) - 'otter' by default + Example : $chrom_names = $support->get_non_hidden_slice_names($sa,$aa); + Description : retrieve names of all slices from a loutre database that don't have a hidden attribute. + Doesn't retrieve non-reference slices + Return type : arrayref of names of all non-hidden slices + Caller : general + Status : stable + +=cut + +sub get_non_hidden_slice_names { + my $self = shift; + my $aa = shift or throw("You must supply an attribute adaptor"); + my $sa = shift or throw("You must supply a slice adaptor"); + my $cs = shift || 'chromosome'; + my $cv = shift || 'Otter'; + my $visible_chrom_names; + foreach my $chrom ( @{$sa->fetch_all($cs,$cv)} ) { + my $chrom_name = $chrom->seq_region_name; + my $attribs = $aa->fetch_all_by_Slice($chrom,'hidden'); + if ( scalar(@$attribs) > 1 ) { + $self->log_warning("More than one hidden attribute for chromosome $chrom_name\n"); + } + elsif ($attribs->[0]->value == 0) { + push @$visible_chrom_names, $chrom_name; + } + elsif ($attribs->[0]->value == 1) { + $self->log_verbose("chromosome $chrom_name is hidden\n"); + } + else { + $self->log_warning("No hidden attribute for chromosome $chrom_name\n"); + } + } + return $visible_chrom_names; +} + + +=head2 get_wanted_chromosomes + + Arg[1] : B::E::SliceAdaptor + Arg[2] : B::E::AttributeAdaptor + Arg[3] : string $coord_system_name (optional) - 'chromosome' by default + Arg[4] : string $coord_system_version (optional) - 'otter' by default + Example : $chr_names = $support->get_wanted_chromosomes($laa,$lsa); + Description : retrieve names of slices from a lutra database that are ready for dumping to Vega. + Deals with list of names to ignore (ignore_chr = LIST) + Return type : arrayref of slices + Caller : general + Status : stable + +=cut + +sub get_wanted_chromosomes { + my $self = shift; + my $aa = shift or throw("You must supply an attribute adaptor"); + my $sa = shift or throw("You must supply a slice adaptor"); + my $cs = shift || 'chromosome'; + my $cv = shift || 'Otter'; + my $export_mode = $self->param('release_type'); + my $release = $self->param('vega_release'); + my $names; + my $chroms = $self->fetch_non_hidden_slices($aa,$sa,$cs,$cv); + CHROM: + foreach my $chrom (@$chroms) { + my $attribs = $aa->fetch_all_by_Slice($chrom); + my $vals = $self->get_attrib_values($attribs,'vega_export_mod'); + if (scalar(@$vals > 1)) { + $self->log_warning ("Multiple attribs for \'vega_export_mod\', please fix before continuing"); + exit; + } + next CHROM if (! grep { $_ eq $export_mode} @$vals); + $vals = $self->get_attrib_values($attribs,'vega_release',$release); + if (scalar(@$vals > 1)) { + $self->log_warning ("Multiple attribs for \'vega_release\' value = $release , please fix before continuing"); + exit; + } + next CHROM if (! grep { $_ eq $release} @$vals); + my $name = $chrom->seq_region_name; + if (my @ignored = $self->param('ignore_chr')) { + next CHROM if (grep {$_ eq $name} @ignored); + } + push @{$names}, $name; + } + return $names; +} + +=head2 is_haplotype + + Arg[1] : B::E::Slice + Arg[2]: : B::E::DBAdaptor (optional, if you don't supply one then the *first* one you generated is returned, which may or may not be what you want!) + Description : Is the slice a Vega haplotype? At the moment this is + implemented by testing for presence of vega_ref_chrom but non_ref + which is correct in practice, but really misses the prupose of + vega_ref_chrom, so this might bite us if that changes. + Return type : boolean + +=cut + +sub is_haplotype { + my ($self,$slice,$dba) = @_; + + $dba ||= $self->dba; + my $aa = $dba->get_adaptor('Attribute'); + + my $attribs = $aa->fetch_all_by_Slice($slice); + return (@{$self->get_attrib_values($attribs,'vega_ref_chrom')} and + @{$self->get_attrib_values($attribs,'non_ref',1)}); +} + +=head2 get_unique_genes + + Arg[1] : B::E::Slice + Arg[2] : B::E::DBAdaptor (optional, if you don't supply one then the *first* one you generated is returned, which may or may not be what you want!) + Example : $genes = $support->get_unique_genes($slice,$dba); + Description : Retrieve genes that are only on the slice itself - used for human where assembly patches + are in the assembly_exception table. Needs the PATCHes to have 'non_ref' seq_region_attributes. + Return type : arrayref of genes + Caller : general + Status : stable + +=cut + +sub get_unique_genes { + my $self = shift; + my ($slice,$dba) = @_; + $slice or throw("You must supply a slice"); + $dba ||= $self->dba; + + my $sa = $dba->get_adaptor('Slice'); + my $ga = $dba->get_adaptor('Gene'); + my $patch = 0; + my $genes = []; + if ( ! $slice->is_reference() and ! $self->is_haplotype($slice,$dba) ) { +# if ( 0 ) { + $patch = 1; + my $slices = $sa->fetch_by_region_unique( $slice->coord_system_name(),$slice->seq_region_name(),undef,undef,undef,$slice->coord_system()->version() ); + foreach my $slice ( @{$slices} ) { + push @$genes,@{$ga->fetch_all_by_Slice($slice)}; + # my $start = $slice->start; + } + } + else { + $genes = $ga->fetch_all_by_Slice($slice); + } + return ($genes, $patch); +} + + + +=head2 get_attrib_values + + Arg[1] : Arrayref of B::E::Attributes + Arg[2] : 'code' to search for + Arg[3] : 'value' to search for (optional) + Example : my $c = $self->get_attrib_values($attribs,'name')); + Description : (i) In the absence of an attribute value argument, examines an arrayref + of B::E::Attributes for a particular attribute type, returning the values + for each attribute of that type. Can therefore be used to test for the + number of attributes of that type. + (ii) In the presence of the optional value argument it returns all + attributes with that value ie can be used to test for the presence of an + attribute with that particular value. + Return type : arrayref of values for that attribute + Caller : general + Status : stable + +=cut + +sub get_attrib_values { + my $self = shift; + my $attribs = shift; + my $code = shift; + my $value = shift; + if (my @atts = grep {$_->code eq $code } @$attribs) { + my $r = []; + if ($value) { + if (my @values = grep {$_->value eq $value} @atts) { + foreach (@values) { + push @$r, $_->value; + } + return $r; + } + else { + return []; + } + } + else { + foreach (@atts) { + push @$r, $_->value; + } + return $r; + } + } + else { + return []; + } +} + +=head2 fix_attrib_value + + Arg[1] : Arrayref of existing B::E::Attributes + Arg[2] : dbID of object + Arg[3] : name of object (just for reporting) + Arg[4] : attrib_type.code + Arg[5] : attrib_type.value + Arg[6] : interactive ? (0 by default) + Arg[7] : table + Example : $support->fix_attrib_value($attribs,$chr_id,$chr_name,'vega_export_mod','N',1); + Description : adds a new attribute to an object, or updates an existing attribute with a new value + Can be run in interactive or non-interactive mode (default) + Return type : arrayref of results + Caller : general + Status : only ever tested with seq_region_attributes to date + +=cut + +sub fix_attrib_value { + my $self = shift; + my $attribs = shift; + my $id = shift; + my $name = shift; + my $code = shift; + my $value = shift; + my $interact = shift || 0; + my $table = shift || 'seq_region_attrib'; + + #transiently set interactive parameter to zero + my $int_before; + if (! $interact) { + $int_before = $self->param('interactive'); + $self->param('interactive',0); + } + + #get any existing value(s) for this attribute + my $existings = $self->get_attrib_values($attribs,$code); + + #add a new attribute if there is none... + if (! @$existings ) { + if ($self->user_proceed("Do you want to set $name attrib (code = $code) to value $value ?")) { + my $r = $self->store_new_attribute($id,$code,$value); + + #reset interactive parameter + $self->param('interactive',$int_before) if (! $interact); + return $r; + } + } + #...warn and exit if you're trying to update more than one value for the same attribute... + elsif (scalar @$existings > 1) { + $self->log_warning("You shouldn't be trying to update multiple attributes with the same code at once ($name:$code,$value), looks like you have duplicate entries in the (seq_region_)attrib table\n"); + exit; + } + + #...or update an attribute with new values... + else { + my $existing = $existings->[0]; + if ($existing ne $value) { + if ($self->user_proceed("Do you want to reset $name attrib (code = $code) from $existing to $value ?")) { + my $r = $self->update_attribute($id,$code,$value); + $self->param('interactive',$int_before) if (! $interact); + push @$r, $existing; + return $r; + } + } + #...or make no change + else { + $self->param('interactive',$int_before) if (! $interact); + return []; + } + } +} + +=head2 _get_attrib_id + + Arg[1] : attrib_type.code + Arg[2] : database handle + Example : $self->_get_attrib_id('name',$dbh) + Description : get attrib_type.attrib_type_id from a attrib_type.code + Return type : attrib_type.attrib_type_id + Caller : internal + Status : stable + +=cut + +sub _get_attrib_id { + my $self = shift; + my $attrib_code = shift; + my $dbh = shift; + my ($attrib_id) = $dbh->selectrow_array( + qq(select attrib_type_id + from attrib_type + where code = ?), + {}, + ($attrib_code) + ); + if (! $attrib_id) { + $self->log_warning("There is no attrib_type_id for code $attrib_code, please patch the attrib_table\n"); + exit; + } + else { + return $attrib_id; + } +} + +=head2 store_new_attribute + + Arg[1] : seq_region.seq_region_id + Arg[2] : attrib_type.code + Arg[3] : attrib_type.value + ARG[4] : table to update (seq_region_attribute by default) + Example : $support->store_new_attribute(23,name,5); + Description : uses MySQL to store an entry (code and value) in an attribute table + (seq_region_attrib by default) + Return type : array_ref + Caller : general + Status : stable + +=cut + +sub store_new_attribute { + my $self = shift; + my $sr_id = shift; + my $attrib_code = shift; + my $attrib_value = shift || ''; + my $table = shift || 'seq_region_attrib'; + + #get database handle + my $dbh = $self->get_dbconnection('loutre'); + #get attrib_type_id for this particular attribute + my $attrib_id = $self->_get_attrib_id($attrib_code,$dbh); + #store + my $r = $dbh->do( + qq(insert into $table + values (?,?,?)), + {}, + ($sr_id,$attrib_id,$attrib_value) + ); + return ['Stored',$r]; +} + +=head2 update_attribute + + Arg[1] : seq_region.seq_region_id + Arg[2] : attrib_type.code + Arg[3] : attrib_type.value + ARG[4] : table to update (seq_region_attribute by default) + Example : $support->update_attribute(23,name,5); + Description : uses MySQL to update an attribute table (seq_region_attrib by default) + Return type : array_ref + Caller : general + Status : stable + +=cut + +sub update_attribute { + my $self = shift; + my $sr_id = shift; + my $attrib_code = shift; + my $attrib_value = shift; + my $table = shift || 'seq_region_attrib'; + my $dbh = $self->get_dbconnection('loutre'); + my $attrib_id = $self->_get_attrib_id($attrib_code,$dbh); + #update + my $r = $dbh->do( + qq(update $table + set value = ? + where seq_region_id = $sr_id + and attrib_type_id = $attrib_id), + {}, + ($attrib_value) + ); + return ['Updated',$r]; +} + + +=head2 remove_duplicate_attribs + + Arg[1] : db handle + Arg[2] : table + Example : $support->remove_duplicate_attribs($dbh,'gene'); + Description : uses MySQL to remove duplicate entries from an attribute table + Return type : none + Caller : general + Status : stable + +=cut + +sub remove_duplicate_attribs { + my $self = shift; + my $dbh = shift; + my $table = shift; + $dbh->do(qq(create table nondup_${table}_attrib like ${table}_attrib)); + $dbh->do(qq(insert into nondup_${table}_attrib (select ${table}_id, attrib_type_id, value from ${table}_attrib group by ${table}_id, attrib_type_id, value))); + $dbh->do(qq(delete from ${table}_attrib)); + $dbh->do(qq(insert into ${table}_attrib (select ${table}_id, attrib_type_id, value from nondup_${table}_attrib))); + $dbh->do(qq(drop table nondup_${table}_attrib)); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,213 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter, a converter factory + +=head1 SYNOPSIS + + my $converter = Bio::EnsEMBL::Utils::Converter->new( + -in => 'Bio::SeqFeature::Generic', + -out => 'Bio::EnsEMBL::SimpleFeature' + ); + + my ( $fearture1, $feature2 ); + my $ens_simple_features = + $converter->convert( [ $feature1, $feature2 ] ); + my @ens_simple_features = @{$ens_simple_features}; + +=head1 DESCRIPTION + +Module to converter the business objects between EnsEMBL and any other +projects, currently BioPerl. + +What the ready conversions are, + + Bio::SeqFeature::Generic <-> Bio::EnsEMBL::SeqFeature, Bio::EnsEMBL::SimpleFeature + Bio::SeqFeature::FeaturePair <-> Bio::EnsEMBL::SeqFeature, Bio::EnsEMBL::RepeatFeature + Bio::Search::HSP::GenericHSP -> Bio::EnsEMBL::BaseAlignFeature's submodules + Bio::Tools::Prediction::Gene -> Bio::EnsEMBL::PredictionTranscript + Bio::Tools::Prediction::Exon -> Bio::EnsEMBL::Exon + Bio::Pipeline::Analysis -> Bio::EnsEMBL::Analysis + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::Converter; + +use strict; +use Bio::EnsEMBL::Root; +our @ISA =qw(Bio::EnsEMBL::Root); + +=head2 new + + Title : new + Usage : + my $converter = Bio::EnsEMBL::Utils::Converter->new( + -in => 'Bio::SeqFeature::Generic', + -out => 'Bio::EnsEMBL::SimpleFeature' + ); + + Function: constructor for converter object + Returns : L + Args : + in - the module name of the input. + out - the module name of the output. + analysis - a Bio::EnsEMBL::Analysis object, if converting other objects to EnsEMBL features. + contig - a Bio::EnsEMBL::RawContig object, if converting other objects to EnsEMBL features. + +=cut + +sub new { + my ($caller, @args) = @_; + my $class = ref($caller) || $caller; + + if($class =~ /Bio::EnsEMBL::Utils::Converter::(\S+)/){ + my $self = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + }else{ + my %params = @args; + @params{map {lc $_} keys %params} = values %params; + my $module = $class->_guess_module($params{-in}, $params{-out}); + + return undef unless($class->_load_module($module)); + return "$module"->new(@args); + } +} + +# This would be invoked by sub-module's _initialize. + +sub _initialize { + my ($self, @args) = @_; + + my ($in, $out) = $self->_rearrange([qw(IN OUT)], @args); + + $self->in($in); + $self->out($out); +} + +=head2 _guess_module + + Usage : $module = $class->_guess_module( + 'Bio::EnsEMBL::SimpleFeature', + 'Bio::EnsEMBL::Generic' + ); + +=cut + +sub _guess_module { + my ($self, $in, $out) = @_; + if($in =~ /^Bio::EnsEMBL::(\S+)/ and $out =~ /^Bio::EnsEMBL::(\S+)/){ + $self->throw("Cannot convert between EnsEMBL objects.\n[$in] to [$out]"); + }elsif($in =~ /^Bio::EnsEMBL::(\S+)/){ + return 'Bio::EnsEMBL::Utils::Converter::ens_bio'; + }elsif($out =~ /^Bio::EnsEMBL::(\S+)/){ + return 'Bio::EnsEMBL::Utils::Converter::bio_ens'; + }else{ + $self->throw("Cannot convert between non-EnsEMBL objects.\n[$in] to [$out]"); + } +} + +=head2 convert + + Title : convert + Usage : my $array_ref = $converter->convert(\@input); + Function: does the actual conversion + Returns : an array ref of converted objects + Args : an array ref of converting objects + +=cut + +sub convert{ + my ($self, $input) = @_; + + $input || $self->throw("Need a ref of array of input objects to convert"); + + my $output_module = $self->out; + $self->throw("Cannot load [$output_module] perl module") + unless $self->_load_module($output_module); + + unless(ref($input) eq 'ARRAY'){ + $self->warn("The input is supposed to be an array ref"); + return $self->_convert_single($input); + } + + my @output = (); + foreach(@{$input}){ + push(@output, $self->_convert_single($_)); + } + + return \@output; +} + +sub _convert_single{ + shift->throw("Not implemented. Please check the instance subclass"); +} + +foreach my $field (qw(in out)){ + my $slot=__PACKAGE__ ."::$field"; + no strict 'refs'; + *$field=sub{ + my $self=shift; + $self->{$slot}=shift if @_; + return $self->{$slot}; + }; +} + +=head2 _load_module + + This method is copied from Bio::Root::Root + +=cut + +sub _load_module { + my ($self, $name) = @_; + my ($module, $load, $m); + $module = "_<$name.pm"; + return 1 if $main::{$module}; + + # untaint operation for safe web-based running (modified after a fix + # a fix by Lincoln) HL + if ($name !~ /^([\w:]+)$/) { + $self->throw("$name is an illegal perl package name"); + } + + $load = "$name.pm"; + my $io = Bio::Root::IO->new(); + # catfile comes from IO + $load = $io->catfile((split(/::/,$load))); + eval { + require $load; + }; + if ( $@ ) { + $self->throw("Failed to load module $name. ".$@); + } + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,407 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens + +=head1 SYNOPISIS + +You should not use this module directly. Please check out the +Bio::EnsEMBL::Utils::Converter module. + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Utils::Converter; +use Scalar::Util qw(weaken); +@ISA = qw(Bio::EnsEMBL::Utils::Converter); + +=head2 new + +Please see Bio::EnsEMBL::Utils::Converter::new + +=cut + +sub new { + my ($caller, @args) = @_; + my $class = ref($caller) || $caller; + + if($class eq 'Bio::EnsEMBL::Utils::Converter::bio_ens'){ + my %params = @args; + @params{map{lc $_} keys %params} = values %params; + my $module = $class->_guess_module($params{-in}, $params{-out}); + return undef unless ($class->_load_module($module)); + return "$module"->new(@args); + }else{ + my $self = $class->SUPER::new(@args); +# $self->_initialize(@args); + return $self; + } +} + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + + my ($dbadaptor, + $dbdriver, $dbhost, $dbport, $dbuser, $dbpass, $dbname, + $analysis, $analysis_dbid, $analysis_logic_name, + $contig, $contig_dbid, $contig_name, + $translation_id) = + + $self->_rearrange([qw(DBADAPTOR + DBDRIVER DBHOST DBPORT DBUSER DBPASS DBNAME + ANALYSIS ANALYSIS_DBID ANALYSIS_LOGIC_NAME + CONTIG CONTIG_DBID CONTIG_NAME + TRANSLATION_ID)], @args); + + + if(defined $dbadaptor){ + $self->dbadaptor($dbadaptor); + }elsif(defined $dbname){ + $self->ensembl_db(@args); + }else{ + # No db information. + } + + if(defined $analysis){ + $self->analysis($analysis); + # then ignore the analysis_dbid and analysis_logic_name + }elsif(defined $analysis_dbid){ + $self->analysis_dbID($analysis_dbid); + }elsif(defined $analysis_logic_name){ + $self->analysis_logic_name($analysis_logic_name); + }else{ + # No analysis information offered + } + + if(defined $contig){ + ($contig) = ref($contig) eq 'ARRAY' ? @{$contig} : $contig; + $self->contig($contig); + }elsif(defined $contig_dbid){ + $self->contig_dbID($contig_dbid); + }elsif(defined $contig_name){ + $self->contig_name($contig_name); + }else{ + # No contig information + } + + if(defined $translation_id){ + $self->translation_id($translation_id); + } +} + + +sub _guess_module { + my ($self, $in, $out) = @_; + my $tail; + if($in eq 'Bio::Search::HSP::GenericHSP'){ + $tail = 'bio_ens_hsp'; + }elsif($in eq 'Bio::SeqFeature::Generic'){ + $tail = 'bio_ens_seqFeature'; + }elsif($in eq 'Bio::SeqFeature::FeaturePair'){ + $tail = 'bio_ens_featurePair'; + }elsif($in eq 'Bio::Pipeline::Analysis'){ + $tail = 'bio_ens_analysis'; + }elsif($in eq 'Bio::Tools::Prediction::Gene'){ + $tail = 'bio_ens_predictionGene'; + }elsif($in eq 'Bio::Tools::Prediction::Exon'){ + $tail = 'bio_ens_predictionExon'; + }elsif($in eq 'Bio::SeqFeature::Gene::GeneStructure'){ + $tail = 'bio_ens_gene'; + }elsif($in eq 'Bio::SeqFeature::Gene::Transcript'){ + $tail = 'bio_ens_transcript'; + }elsif($in eq 'Bio::SeqFeature::Gene::Exon'){ + $tail = 'bio_ens_exon'; + }else{ + $self->throw("[$in] to [$out], not supported"); + } + return "Bio::EnsEMBL::Utils::Converter::$tail"; +} + + +=head2 analysis + + Title : analysis + Usage : $self->analysis + Function: get and set for analysis + Return : L + Args : L + +=cut + +sub analysis { + my ($self, $arg) = @_; + if(defined($arg)){ + # convert the analysis, if it's not Bio::Pipeline::Analysis + if($arg->isa('Bio::Pipeline::Analysis')){ + my $converter_for_analysis = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::Pipeline::Analysis', + -out => 'Bio::EnsEMBL::Analysis' + ); + ($arg) = @{ $converter_for_analysis->convert([$arg]) }; + } + + $self->throws("A Bio::EnsEMBL::Analysis object expected.") + unless($arg->isa('Bio::EnsEMBL::Analysis')); + $self->{_analysis} = $arg; + $self->{_analysis_dbid} = $arg->dbID; + $self->{_analysis_logic_name} = $arg->logic_name; + } + return $self->{_analysis}; +} + + +=head2 contig + + Title : contig + Usage : $self->contig + Function: get and set for contig + Return : + Args : + +=cut + +sub contig { + my ($self, $arg) = @_; + if(defined($arg)){ + if($arg->isa('Bio::EnsEMBL::RawContig')){ + $self->{_contig_dbid} = $arg->dbID; + $self->{_contig_name} = $arg->name; + }elsif($arg->isa('Bio::EnsEMBL::Slice')){ + $self->{_slice_dbid} = $arg->dbID; + }elsif($arg->isa('Bio::PrimarySeqI')){ + ; + }else{ + $self->throw("a Bio::EnsEMBL::RawContig needed"); + } + $self->{_contig} = $arg; + + } + return $self->{_contig}; +} + +=head2 dbadaptor + + Title : dbadaptor + Usage : $self->dbadaptor + Function: get and set for dbadaptor + Return : L + Args : L + +=cut + +sub dbadaptor { + my ($self, $arg) = @_; + if(defined($arg)){ + $self->throws("A Bio::EnsEMBL::DBSQL::DBAdaptor object expected.") unless(defined $arg); + weaken($self->{_dbadaptor} = $arg); + } + return $self->{_dbadaptor}; +} + +=head2 ensembl_db + + Title : ensembl_db + Usage : + Function: + Return : + Args : + +=cut + +sub ensembl_db { + my ($self, @args) = @_; + + my ($dbdriver, $dbhost, $dbport, $dbuser, $dbpass, $dbname) = $self->_rearrange( + [qw(DBDRIVER DBHOST DBPORT DBUSER DBPASS DBNAME)], @args); + + my $dbadaptor = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -driver => $dbdriver, + -host => $dbhost, + -port => $dbport, + -user => $dbuser, + -pass => $dbpass, + -dbname => $dbname + ); + $self->dbadaptor($dbadaptor); +} + +=head2 analysis_dbID + + Title : analysis_dbID + Usage : + Function: + Return : + Args : + +=cut + +sub analysis_dbID { + my ($self, $arg) = @_; + + if(defined $arg){ + my $analysis; + eval{ + $analysis = $self->dbadaptor->get_AnalysisAdaptor->fetch_by_dbID($arg); + }; + $self->throw("Failed during fetching analysis by dbID\n$@") if($@); + $self->analysis($analysis); + } + $self->{_analysis_dbid}; +} + + +=head2 analysis_logic_name + + Title : analysis_logic_name + Usage : + Function: + Return : + Args : + +=cut + +sub analysis_logic_name { + my ($self, $arg) = @_; + + return $self->{_analysis_logic_name} unless(defined $arg); + my $analysis; + eval{ + $analysis = + $self->dbadaptor->get_AnalysisAdaptor->fetch_by_logic_name($arg); + }; + $self->throw("Not found analysis with logic name as \[$arg\]\n$@") if($@); + + $self->analysis($analysis); + return $self->{_analysis_logic_name}; +} + +=head2 contig_dbID + + Title : contig_dbID + Usage : $self->contig_dbID + Function: get and set for contig_dbID + Return : + Args : + +=cut + +sub contig_dbID { + my ($self, $arg) = @_; + if(defined($arg)){ + my $contig; + eval{ + $contig = + $self->dbadaptor->get_RawContigAdaptor->fetch_by_dbID($arg); + }; + $self->throw("Failed during fetching contig by dbID\n$@") if($@); + $self->contig($contig); + } + return $self->{_contig_dbid}; +} + +=head2 contig_name + Title : contig_name + Usage : $self->contig_name + Function: get and set for contig_name + Return : + Args : +=cut + +sub contig_name { + my ($self, $arg) = @_; + if(defined($arg)){ + my $contig; + eval{ + $contig = + $self->dbadaptor->get_RawContigAdaptor->fetch_by_name($arg); + }; + $self->throw("Failed during fetching contig by dbID\n$@") if($@); + $self->contig($contig); + } + return $self->{_contig_name}; +} + +=head2 slice_dbID + Title : slice + Usage : $self->slice + Function: get and set for slice + Return : L + Args : L +=cut + +sub slice_dbID { + my ($self, $arg) = @_; + if(defined($arg)){ + my $slice; + $self->throw("undefined dbadpator") unless defined $self->dbadpaotr; + + eval{ + my $sliceAdaptor = $self->dbadaptor->get_SliceAdaptor; + $slice = $sliceAdaptor->fetch_by_dbID($arg); + }; + + $self->throw("Failed to fetch slice by dbID\n$@") if($@); + $self->contig($slice); + } +} + +=head2 slice_chr_start_end + Title : slice_chr_start_end + Usage : my $slice = $self->slice_chr_start_end($chr, $start, $end); + Function: get and set for slice_chr_start_end + Return : + Args : +=cut + +sub slice_chr_start_end { + my ($self, $chr, $start, $end) = @_; + if(defined($chr) && defined($start) && defined($end)){ + my $slice; + eval{ + my $sliceAdaptor = $self->dbadaptor->get_SliceAdaptor; + $slice = $sliceAdaptor->fetch_by_chr_start_end($chr, $start, $end); + }; + $self->throw("Failed to fetch slice by chr start end\n$@") if($@); + $self->contig($slice); + } +} + +sub translation_id { + my ($self, $arg) = @_; + return $self->{_translation_id} = $arg if(defined($arg)); + return $self->{_translation_id}; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_analysis.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_analysis.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,78 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_analysis, a converter instance +specific for Analysis. + +=head1 SYNOPISIS + + my $converter = Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::Pipeline::Analysis', + -out => 'Bio::EnsEMBL::Analysis' + ); + my $biopipe_analysis; + my ($ens_analysis) = @{ $converter->convert( [$biopipe_analysis] ) }; + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::Converter::bio_ens_analysis; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter::bio_ens; +use Bio::EnsEMBL::Analysis; + +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _convert_single { + my ($self, $input) = @_; + + $self->throw("a Bio::Pipeline::Analysis object needed") + unless(ref($input) && $input->isa('Bio::Pipeline::Analysis')); + + my $ens_analysis = Bio::EnsEMBL::Analysis->new( + -logic_name => $input->logic_name, + -db => $input->db, + -db_version => $input->db_version, + -db_file => $input->db_file, + -program => $input->program, + -program_version => $input->program_version, + -program_file => $input->program_file, + -parameters => $input->analysis_parameters, + -module => $input->runnable, + -gff_source => $input->gff_source, + -gff_feature => $input->gff_feature, + -id =>$input->dbID + ); + + return $ens_analysis; +} +; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_exon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_exon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,122 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_exon + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_exon; + +use strict; +use vars qw(@ISA %GTF_ENS_PHASE); +use Bio::EnsEMBL::Utils::Converter; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::DnaPepAlignFeature; +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +BEGIN { + %GTF_ENS_PHASE = ( + 0 => 0, + 1 => 2, + 2 => 1, + '.' => -1 + ); +} + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + + $self->{_bio_ens_seqFeature} = new Bio::EnsEMBL::Utils::Converter ( + -in => 'Bio::SeqFeature::Generic', + -out => 'Bio::EnsEMBL::SeqFeature', + ); + + $self->{_bio_ens_featurePair} = new Bio::EnsEMBL::Utils::Converter ( + -in => 'Bio::SeqFeature::FeaturePair', + -out => 'Bio::EnsEMBL::FeaturePair' + ); +} + +sub _attach_supporting_feature { + my ($self, $exon, $ens_exon) = @_; + unless($exon->has_tag('supporting_feature')){ + return; + } + my ($sf) = $exon->each_tag_value('supporting_feature'); + unless(defined $sf){ + $self->warn("no supporting feature is attached in exon"); + return; + } +# $self->{_bio_ens_seqFeature}->contig($self->contig); + $self->{_bio_ens_seqFeature}->analysis($self->analysis); + my $ens_f1 = $self->{_bio_ens_seqFeature}->_convert_single($sf->feature1); + my $ens_f2 = $self->{_bio_ens_seqFeature}->_convert_single($sf->feature2); + $self->{_bio_ens_featurePair}->contig($self->contig); + $self->{_bio_ens_featurePair}->analysis($self->analysis); + my $ens_sf = $self->{_bio_ens_featurePair}->_convert_single($sf); + my @align_feautre_args = ( + -feature1 => $ens_f1, + -feature2 => $ens_f2, + -features => [$ens_sf] + ); + + my $ens_supporting_feature = + Bio::EnsEMBL::DnaPepAlignFeature->new(@align_feautre_args); + + $ens_exon->add_supporting_features($ens_supporting_feature); +} + +sub _convert_single { + my ($self, $arg) = @_; + unless($arg && $arg->isa('Bio::SeqFeature::Gene::Exon')){ + $self->throw("a Bio::SeqFeature::Gene::Exon object needed"); + } + my $exon = $arg; + my $ens_exon = Bio::EnsEMBL::Exon->new_fast( + $self->contig, $exon->start, $exon->end, $exon->strand); + + my ($phase) = $exon->each_tag_value('phase'); + $ens_exon->phase($GTF_ENS_PHASE{$phase}); + my $ens_end_phase = 3 - ($exon->length - $phase) % 3; + $ens_end_phase = 0 if $ens_end_phase == 3; + $ens_exon->end_phase($ens_end_phase); + if($self->contig->isa('Bio::EnsEMBL::RawContig')){ + $ens_exon->sticky_rank(1); + } + $self->_attach_supporting_feature($exon, $ens_exon); + return $ens_exon; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_featurePair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_featurePair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,154 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_featurePair + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_featurePair; + +use strict; +use vars qw(@ISA); + +use Bio::EnsEMBL::FeaturePair; +use Bio::EnsEMBL::RepeatConsensus; +use Bio::EnsEMBL::ProteinFeature; +use Bio::EnsEMBL::Utils::Converter; +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + my ($translation_id) = $self->_rearrange([qw(TRANSLATION_ID)], @args); + $self->translation_id($translation_id); + + # internal converter for seqFeature + $self->{_bio_ens_seqFeature} = new Bio::EnsEMBL::Utils::Converter ( + -in => 'Bio::SeqFeature::Generic', + -out => 'Bio::EnsEMBL::SeqFeature', + ); +} + +sub _convert_single { + my ($self, $pair) = @_; + unless($pair && $pair->isa('Bio::SeqFeature::FeaturePair')){ + $self->throw('a Bio::SeqFeature::FeaturePair object needed'); + } + + if($self->out eq 'Bio::EnsEMBL::RepeatFeature'){ + return $self->_convert_single_to_repeatFeature($pair); + }elsif($self->out eq 'Bio::EnsEMBL::FeaturePair'){ + return $self->_convert_single_to_featurePair($pair); + }elsif($self->out eq 'Bio::EnsEMBL::ProteinFeature'){ + return $self->_convert_single_to_proteinFeature($pair); + }else{ + my $output_module = $self->out; + $self->throw("Cannot covert to [$output_module]"); + } +} + +sub _convert_single_to_featurePair { + my ($self, $pair) = @_; + my $feature1 = $pair->feature1; + my $feature2 = $pair->feature2; + $self->{_bio_ens_seqFeature}->contig($self->contig); + $self->{_bio_ens_seqFeature}->analysis($self->analysis); + my $ens_f1 = $self->{_bio_ens_seqFeature}->_convert_single($feature1); + my $ens_f2 = $self->{_bio_ens_seqFeature}->_convert_single($feature2); + my $ens_fp = Bio::EnsEMBL::FeaturePair->new( + -feature1 => $ens_f1, + -feature2 => $ens_f2 + ); + return $ens_fp; +} + +sub _convert_single_to_proteinFeature { + my ($self, $pair) = @_; + my $featurePair = $self->_convert_single_to_featurePair($pair); + my $proteinFeature = Bio::EnsEMBL::ProteinFeature->new( + -feature1 => $featurePair->feature1, + -feature2 => $featurePair->feature2 + ); + $proteinFeature->seqname($self->translation_id); + return $proteinFeature; +} + +sub _convert_single_to_repeatFeature { + my ($self, $pair) = @_; + my $feature1 = $pair->feature1; + my $feature2 = $pair->feature2; + my $ens_repeatfeature = new Bio::EnsEMBL::RepeatFeature( + -seqname => $feature1->seq_id, + -start => $feature1->start, + -end => $feature1->end, + -strand => $feature1->strand, + -source_tag => $feature1->source_tag, + ); + + my ($h_start, $h_end); + if($feature1->strand == 1){ + $h_start = $feature2->start; + $h_end = $feature2->end; + }elsif($feature1->strand == -1){ + $h_start = $feature2->end; + $h_end = $feature2->start; + }else{ + $self->throw("strand cannot be outside of (1, -1)"); + } + + $ens_repeatfeature->hstart($h_start); + $ens_repeatfeature->hend($h_end); + my $repeat_name = $feature2->seq_id; + my $repeat_class = $feature1->primary_tag; + $repeat_class ||= $feature2->primary_tag; + $repeat_class ||= "not sure"; + my $ens_repeat_consensus = + $self->_create_consensus($repeat_name, $repeat_class); + $ens_repeatfeature->repeat_consensus($ens_repeat_consensus); + + my($contig) = ref ($self->contig) eq 'ARRAY' ? @{$self->contig} : $self->contig; + + $ens_repeatfeature->attach_seq($contig); + $ens_repeatfeature->analysis($self->analysis); + return $ens_repeatfeature; +} + +sub _create_consensus{ + my ($self, $repeat_name, $repeat_class) = @_; + my $consensus = new Bio::EnsEMBL::RepeatConsensus; + $consensus->name($repeat_name); + $consensus->repeat_class($repeat_class); + return $consensus; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_gene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_gene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,82 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_gene + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +This module is to convert from objects of +Bio::SeqFeature::Gene::GeneStructure to those of Bio::EnsEMBL::Gene + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_gene; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + + $self->{_converter_for_transcripts} = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::SeqFeature::Gene::Transcript', + -out => 'Bio::EnsEMBL::Transcript' + ); + +} + +sub _convert_single { + my ($self, $input) = @_; + + unless($input->isa('Bio::SeqFeature::Gene::GeneStructure')){ + $self->throw("a Bio::SeqFeature::Gene::GeneStructure object needed"); + } + my $gene = $input; + my $ens_gene = Bio::EnsEMBL::Gene->new(); + $ens_gene->analysis($self->analysis); + my @transcripts = $gene->transcripts; + + # contig is needed by exon and Supporting Feature; S.F. needs an analysis. + $self->{_converter_for_transcripts}->contig($self->contig); + $self->{_converter_for_transcripts}->analysis($self->analysis); + + my $ens_transcripts = $self->{_converter_for_transcripts}->convert( + \@transcripts); + + foreach(@{$ens_transcripts}){ + $ens_gene->add_Transcript($_); + } + return $ens_gene; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_hit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_hit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,158 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 DESCRIPTION + +Sequence alignment hits were previously stored within the core database +as ungapped alignments. This imposed 2 major constraints on alignments: + + a) alignments for a single hit record would require multiple rows in + the database, and + b) it was not possible to accurately retrieve the exact original alignment. + +Therefore, in the new branch sequence alignments are now stored as +ungapped alignments in the cigar line format (where CIGAR stands for +Concise Idiosyncratic Gapped Alignment Report). + +In the cigar line format alignments are sotred as follows: + + M: Match + D: Deletino + I: Insertion + +An example of an alignment for a hypthetical protein match is shown +below: + + + Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL... + PG P G GP R PLGP + Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD... + +protein_align_feature table as the following cigar line: + + 7M4D12M2I2MD7M + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_hit; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter::bio_ens; +use Bio::EnsEMBL::DnaDnaAlignFeature; +use Bio::EnsEMBL::DnaPepAlignFeature; +use Bio::EnsEMBL::PepDnaAlignFeature; +use Bio::EnsEMBL::ProteinFeature; + +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + + # After super initialized, analysis and contig are ready. + my $bio_ens_seqFeature_converter = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::SeqFeature::Generic', + -out => 'Bio::EnsEMBL::SeqFeature', + -analysis => $self->analysis, + -contig => $self->contig + ); + $self->_bio_ens_seqFeature_converter($bio_ens_seqFeature_converter); + +} + +sub _convert_single { + my ($self, $input) = @_; + + my $in = $self->in; + my $out = $self->out; + + if($in =~ /Bio::Search::Hit::GenericHit/){ + return $self->_convert_single_hit($input); + }elsif($in =~ /Bio::Search::HSP::GenericHSP/){ + return $self->_convert_single_hsp($input); + }else{ + $self->throw("[$in]->[$out], not implemented"); + } +} + +sub _convert_single_hit { + + +} + +sub _convert_single_hsp { + my ($self, $hsp) = @_; + + unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ + $self->throw("a GenericHSP object needed"); + } + + my $bio_ens_seqFeature_converter = $self->_bio_ens_seqFeature_converter; + my $ens_feature1 = $bio_ens_seqFeature_converter->_convert_single( + $hsp->feature1); + my $ens_feature2 = $bio_ens_seqFeature_converter->_convert_single( + $hsp->feature2); + + $ens_feature1->p_value($hsp->evalue); + $ens_feature1->score($hsp->score); + $ens_feature1->percent_id($hsp->percent_identity); + $ens_feature2->p_value($hsp->evalue); + $ens_feature2->score($hsp->score); + $ens_feature2->percent_id($hsp->percent_identity); + + my $cigar_string = $hsp->cigar_string; + my @args = ( + -feature1 => $ens_feature1, + -feature2 => $ens_feature2, + -cigar_string => $cigar_string + ); + + my $contig = $self->contig; + # choose the AlignFeature based on the blast program + my $program = $hsp->algorithm; + + $self->throw("HSP does not have algorithm value") unless(defined($program)); + my $align_feature; + + if($program =~ /blastn/i){ + $align_feature = new Bio::EnsEMBL::DnaDnaAlignFeature(@args); + $align_feature->attach_seq($contig); + }elsif($program =~ /blastx/i){ + $align_feature = new Bio::EnsEMBL::DnaPepAlignFeature(@args); + $align_feature->attach_seq($contig); + }else{ + $self->throw("$program is not supported yet"); + } + + return $align_feature; +} + +# an internal getter/setter for a converter used for seq feature conversion. + +sub _bio_ens_seqFeature_converter { + my ($self, $arg) = @_; + if(defined $arg){ + $self->{_bio_ens_seqFeature_converter} = $arg; + } + return $self->{_bio_ens_seqFeature_converter}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_hsp.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_hsp.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,133 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +package Bio::EnsEMBL::Utils::Converter::bio_ens_hsp; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter::bio_ens; +use Bio::EnsEMBL::ProteinFeature; + +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + + # After super initialized, analysis and contig are ready. + my $bio_ens_seqFeature_converter = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::SeqFeature::Generic', + -out => 'Bio::EnsEMBL::SeqFeature', + -analysis => $self->analysis, + -contig => $self->contig + ); + $self->_bio_ens_seqFeature_converter($bio_ens_seqFeature_converter); + +} + +sub _convert_single { + my ($self, $hsp) = @_; + + unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ + $self->throw("a GenericHSP object needed"); + } + + my $in = $self->in; + my $out = $self->out; + + if($out =~ /^Bio::EnsEMBL::ProteinFeature$/){ + return $self->_convert_single_to_proteinFeature($hsp); + }elsif($out =~/^Bio::EnsEMBL::(DnaDna|DnaPep|PepDna)AlignFeature/){ + return $self->_convert_single_to_alignFeature($hsp); + }else{ + $self->throw("[$in]->[$out], not implemented"); + } +} + +sub _convert_single_to_featurePair { + my ($self, $hsp) = @_; + + my $bio_ens_seqFeature_converter = $self->_bio_ens_seqFeature_converter; + my $ens_feature1 = $bio_ens_seqFeature_converter->_convert_single( + $hsp->feature1); + my $ens_feature2 = $bio_ens_seqFeature_converter->_convert_single( + $hsp->feature2); + + $ens_feature1->p_value($hsp->evalue); + $ens_feature1->score($hsp->score); + $ens_feature1->percent_id($hsp->percent_identity); + $ens_feature2->p_value($hsp->evalue); + $ens_feature2->score($hsp->score); + $ens_feature2->percent_id($hsp->percent_identity); + + my $featurePair = Bio::EnsEMBL::FeaturePair->new( + -feature1 => $ens_feature1, + -feature2 => $ens_feature2 + ); + + return $featurePair; +} + +sub _convert_single_to_proteinFeature { + my ($self, $hsp) = @_; + + my $ens_featurePair = $self->_convert_single_to_featurePair($hsp); + my $ens_proteinFeature = Bio::EnsEMBL::ProteinFeature->new( + -feature1 => $ens_featurePair->feature1, + -feature2 => $ens_featurePair->feature2 + ); + $ens_proteinFeature->seqname($self->translation_id); + return $ens_proteinFeature; +} + +sub _convert_single_to_alignFeature { + my ($self, $hsp) = @_; + my $ens_featurePair = $self->_convert_single_to_featurePair($hsp); + my $cigar_string = $hsp->cigar_string; + my @args = ( + -feature1 => $ens_featurePair->feature1, + -feature2 => $ens_featurePair->feature2, + -cigar_string => $hsp->cigar_string + ); + my $contig = $self->contig; + # choose the AlignFeature based on the blast program + my $program = $hsp->algorithm; + + $self->throw("HSP does not have algorithm value") unless(defined($program)); + my $align_feature; + if($program =~ /blastn/i){ + $align_feature = new Bio::EnsEMBL::DnaDnaAlignFeature(@args); +# $align_feature->attach_seq($contig); + }elsif($program =~ /blastx/i){ + $align_feature = new Bio::EnsEMBL::DnaPepAlignFeature(@args); +# $align_feature->attach_seq($contig); + }else{ + $self->throw("\[$program\] is not supported yet"); + } + return $align_feature; +} + +sub _bio_ens_seqFeature_converter { + my ($self) = shift ; + return $self->{_bio_ens_seqFeature_converter} = shift if(@_); + return $self->{_bio_ens_seqFeature_converter}; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_predictionExon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_predictionExon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,78 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_predictionExon + +=head1 SYNOPISIS + + my $converter = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::Tools::Prediction::Exon', + -out => 'Bio::EnsEMBL::Exon', + -contig => $ens_contig + ); + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_predictionExon; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _convert_single { + my ($self, $input) = @_; + + + $input || $self->throw("a input object needed"); + $self->throw("a Bio::Tools::Prediction::Exon object needed") + unless($input->isa("Bio::Tools::Prediction::Exon")); + + my $output = Bio::EnsEMBL::Exon->new( + -start => $input->start, + -end => $input->end, + -strand => $input->strand + ); + + $output->score($input->score); + $output->p_value($input->significance); + + $output->phase($input->get_tag_values("phase")); # only first element is used + $output->end_phase($input->get_tag_values("end_phase")); + + $output->contig($self->contig); + + return $output; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_predictionGene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_predictionGene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,109 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_predictionGene + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_predictionGene; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter; +use Bio::EnsEMBL::PredictionTranscript; +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _initialize { + my ($self, @args) = @_; + + $self->{_predictionExonConverter} = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::SeqFeature::Gene::Exon', + -out => 'Bio::EnsEMBL::Exon', + ); + $self->SUPER::_initialize(@args); + + $self->{_predictionExonConverter}->contig($self->contig); + $self->{_predictionExonConverter}->analysis($self->analysis); +} + +sub _convert_single { + my ($self, $input) = @_; + + $self->throw("one argument needed") unless($input and defined($input)); + $self->throw("a Bio::Tools::Prediction::Gene object needed") + unless(ref($input) && $input->isa('Bio::Tools::Prediction::Gene')); + + my $output = Bio::EnsEMBL::PredictionTranscript->new; + $output->analysis($self->analysis); + + my @exons = sort {$a->start <=> $b->start} $input->exons; + + # Not sure on the correctivity of phase calculation. + my $previous_end_phase = -1; + foreach(@exons){ + my $length = $_->length; + my $frame = $_->frame; + my $phase = ($previous_end_phase+1) %3; + my $end_phase = ($length-$frame) %1; + $previous_end_phase = $end_phase; + $_->add_tag_value("phase", $phase); + $_->add_tag_value("end_phase", $end_phase); + } + + my @ens_exons = @{$self->{_predictionExonConverter}->convert(\@exons)}; + + $output->add_Exon($_) foreach(@ens_exons); + + return $output; + +} + +=head2 contig + Title : contig + Usage : $self->contig + Function: get and set for contig + Return : L + Args : L +=cut + +sub contig { + my ($self, $arg) = @_; + if(defined($arg)){ + $self->throws("A Bio::EnsEMBL::RawContig object expected.") unless(defined $arg); + $self->{_contig} = $arg; + # assign it to the sub converter which converts exons + $self->{_predictionExonConverter}->contig($arg); + } + return $self->{_contig}; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_seqFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_seqFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,133 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_seqFeature + +=head1 SYNOPISIS + +Please read Bio::EnsEMBL::Utils::Converter + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_seqFeature; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::SeqFeature; +use Bio::EnsEMBL::SimpleFeature; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _convert_single { + my ($self, $in) = @_; + + unless($in && defined($in) && $in->isa('Bio::SeqFeature::Generic')){ + $self->throw("a Bio::SeqFeature::Generic object needed"); + } + + my $seqFeature = $in; + my $seq_id = $seqFeature->seq_id; + unless(defined($seq_id) && $seq_id){ + $self->warn("No seq_id value. EnsEMBL SeqFeature will validate it"); + $seq_id = 'Unknown'; + } + + # Debated issue here. There are p_value and percent_id in EnsEMBL API and DB + # schema, but not in bioperl. If in bioperl there are tags called p_value or + # percent_id, then the values are passed, otherwise set the default 1. + # + # the problem arise when I try to converter the seqfeature for tmhmm to + # EnsEMBL seqFeature. + # -- Juguang, 11 July '03 + my $score = $in->score || 0; + my $percent_id; + if($in->has_tag('percent_id')){ + ($percent_id) = $in->get_tag_values('percent_id'); + }else{ + $percent_id ||= 0; + } + my $p_value; + if($in->has_tag('p_value')){ + ($p_value) = $in->get_tag_values('p_value'); + }elsif($in->has_tag('evalue')){ + ($p_value) = $in->get_tag_values('evalue'); + }else{ + $p_value ||= 1; + } + my $ens_seqFeature; + my %args = ( + -start => $in->start, + -end => $in->end, + -strand => $in->strand, + -score => $score, + -analysis => $self->analysis, + -source_tag => $in->source_tag, + -seqname => $seq_id, + -percent_id => $percent_id, + -p_value => $p_value + ); + + my $output_module = $self->out; + + if($output_module eq 'Bio::EnsEMBL::SeqFeature'){ + + $ens_seqFeature = new Bio::EnsEMBL::SeqFeature(%args); + }elsif($self->out eq 'Bio::EnsEMBL::SimpleFeature'){ + $ens_seqFeature = new Bio::EnsEMBL::SimpleFeature(%args); + # The field that there is in SimpleFeature, but not in SeqFeature. + $ens_seqFeature->display_label('__NONE__'); + }elsif($self->out eq 'Bio::EnsEMBL::Exon'){ + $ens_seqFeature = Bio::EnsEMBL::Exon->new_fast( + $self->contig, $seqFeature->start, $seqFeature->end, + $seqFeature->strand); + }elsif($self->out eq 'Bio::EnsEMBL::ProteinFeature'){ + my $seq_id2 = $self->analysis->logic_name; + unless(defined $self->translation_id){ + $self->throw('translation_id unset, in ProteinFeature conversion'); + } + $args{'-seqname'} = $self->translation_id; + $ens_seqFeature = Bio::EnsEMBL::ProteinFeature->new( + -feature1 => Bio::EnsEMBL::SeqFeature->new(%args), + -feature2 => Bio::EnsEMBL::SeqFeature->new( + -start => 0, + -end => 0, + -seqname => $seq_id2 + ) + ); + }else{ + $self->throw("[$output_module] as -out, not supported"); + } + + $ens_seqFeature->attach_seq($self->contig); + return $ens_seqFeature; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/bio_ens_transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,76 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::bio_ens_transcript - the instance converter + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::bio_ens_transcript; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Transcript; +use Bio::EnsEMBL::Utils::Converter::bio_ens; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::bio_ens); + +sub _convert_single { + my ($self, $arg) = @_; + unless($arg->isa('Bio::SeqFeature::Gene::Transcript')){ + $self->throw("A Bio::SeqFeature::Gene::Transcript object needed"); + } + my $transcript = $arg; + + my @exons = $transcript->exons_ordered; + $self->{_converter_for_exons}->contig($self->contig); + $self->{_converter_for_exons}->analysis($self->analysis); + + my $ens_exons = $self->{_converter_for_exons}->convert(\@exons); + + my $ens_transcript = Bio::EnsEMBL::Transcript->new(@{$ens_exons}); + $ens_transcript->start($transcript->start); + $ens_transcript->end($transcript->end); +# $ens_transcript->strand($transcript->strand); + return $ens_transcript; +} + + +sub _initialize { + my ($self, @args) = @_; + $self->SUPER::_initialize(@args); + + $self->{_converter_for_exons} = new Bio::EnsEMBL::Utils::Converter( + -in => 'Bio::SeqFeature::Gene::Exon', + -out => 'Bio::EnsEMBL::Exon' + ); + +} + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/ens_bio.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/ens_bio.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,90 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::ens_bio + +=head1 SYNOPISIS + +You are not supposed to use this module directly. Please read +Bio::EnsEMBL::Utils::Converter + +=head1 DESCRIPTION + +This is a helper module to assist Bio::EnsEMBL::Utils::Converter find +which converter instance should be used, based on the -in and -out +parameters. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::ens_bio; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter; +@ISA = qw(Bio::EnsEMBL::Utils::Converter); + +=head2 new + Please see Bio::EnsEMBL::Utils::Converter::new +=cut + +sub new { + my ($caller, @args) = @_; + my $class = ref($caller) || $caller; + + if($class eq 'Bio::EnsEMBL::Utils::Converter::ens_bio'){ + my %params = @args; + @params{map{lc $_} keys %params} = values %params; + my $module = $class->_guess_module($params{-in}, $params{-out}); + return undef unless ($class->_load_module($module)); + return "$module"->new(@args); + }else{ + my $self = $class->SUPER::new(@args); +# $self->_initialize(@args); + return $self; + } + +} + +# Unlike bio_ens, ens_bio does not need _initialize method for analysis and +# contig information. +# + +sub _guess_module { + my ($self, $in, $out) = @_; + my $tail; + if($in eq 'Bio::EnsEMBL::SeqFeature'){ + $tail = 'ens_bio_seqFeature'; + }elsif($in eq 'Bio::Ens::EMBL::FeaturePair'){ + $tail = 'ens_bio_featurePair'; + }else{ + $self->throw("[$in] to [$out], not supported"); + } + return "Bio::EnsEMBL::Utils::Converter::$tail"; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/ens_bio_featurePair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/ens_bio_featurePair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,93 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::ens_bio_featurePair + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::ens_bio_featurePair; + +use strict; +use vars qw(@ISA); +use Bio::SeqFeature::Generic; +use Bio::SeqFeature::FeaturePair; +use Bio::EnsEMBL::Utils::Converter::ens_bio; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::ens_bio); + +sub _convert_single { + my ($self, @args) = @_; + +} + +# convert object from Bio::EnsEMBL::RepeatFeature +# to Bio::SeqFeature::FeaturePair + +sub _convert_single_repeatFeature { + my ($self, $ens_repeat) = @_; + + my $feature1 = new Bio::SeqFeature::Generic( + -start => $ens_repeat->start, + -end => $ens_repeat->end, + -strand => $ens_repeat->strand, + -source_tag => $ens_repeat->source_tag + -primary_tag => $ens_repeat->repeat_class, + -seq_id => $ens_repeat->seqname + ); + + my ($start2, $end2); + if($ens_repeat->strand == 1){ + $start2 = $ens_repeat->hstart; + $end2 = $ens_repeat->hend; + }elsif($ens_repeat->strand == -1){ + $start2 = $ens_repeat->hend; + $end2 = $ens_repeat->hstart; + }else{ + $self->throw("strand cannot be out of range (1, -1)"); + } + + my $feature2 = new Bio::SeqFeature::Generic( + -start => $start2, + -end => $end2, + -source_tag => $ens_repeat->source_tag, + -primary_tag => $ens_repeat->repeat_class, + -seq_id => $ens_repeat->repeat_name + ); + + my $output_module = $self->out; + require "$output_module"; + return new Bio::SeqFeature::FeaturePair( + -feature1 => $feature1, + -feature2 => $feature2 + ); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/ens_bio_seqFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Converter/ens_bio_seqFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,66 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Converter::ens_bio_seqFeature + +=head1 SYNOPISIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Converter::ens_bio_seqFeature; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Utils::Converter::ens_bio; +@ISA = qw(Bio::EnsEMBL::Utils::Converter::ens_bio); + +sub _convert_single { + my ($self, $in) = @_; + + $self->throw("Input not defined") unless($in && defined($in)); + unless(ref($in) && $in->isa('Bio::EnsEMBL::SeqFeature')){ + $self->throw('A Bio::EnsEMBL::SeqFeature object needed'); + } + + my @args = ( + -start => $in->start, + -end => $in->end, + -strand => $in->strand, + -score => $in->score, + -source_tag => $in->source_tag, + -seq_id => $in->seqname + ); + + my $seqFeature = new Bio::SeqFeature::Generic(@args); + + return $seqFeature; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/EasyArgv.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/EasyArgv.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,137 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=head1 AUTHOR + +Juguang Xiao + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::EasyArgv + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::EasyArgv; + + my $db = get_ens_db_from_argv; # this method is exported. + + use Getopt::Long; + + my $others; + &GetOptions( 'others=s' => \$others ); + +=head1 DESCRIPTION + +This is a lazy but easy way to get the db-related arguments. All you +need to do is to invoke get_ens_db_from_argv before using standard +Getopt. The below options will be absorbed and removed from @ARGV. + +db_file, host, db_host, dbhost, user, db_user, dbuser, pass, db_pass, +dbpass, dbname, db_name. + +Now you can take advantage of Perl's do method to execute a file as perl +script and get returned the last line of it. For your most accessed db +setting, you can have a file named, say, ensdb_homo_core_18.perlobj, +with the content like + + use strict; # The ceiling line + + use Bio::EnsEMBL::DBSQL::DBAdaptor; + + my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous', + -dbname => 'homo_sapiens_core_18_34' + ); + + $db; # The floor line + +In the your command line, you just need to write like + + perl my_script.pl -db_file ensdb_homo_core_18.perlobj + +rather than the verbose + + -host ensembldb.ensembl.org -user anonymous \ + -dbname homo_sapiens_core_18_34 + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::EasyArgv; + +use strict; +use vars qw($debug); +use Exporter (); +our @ISA= qw(Exporter); +our @EXPORT = qw(get_ens_db_from_argv +); +use Bio::Root::Root; # For _load_module +use Getopt::Long; + +sub _debug_print; + +sub get_ens_db_from_argv { + my ($db_file, $host, $user, $pass, $dbname, $driver, $db_module); + $host = 'localhost'; + $driver ='mysql'; + $db_module = 'Bio::EnsEMBL::SQL::DBAdaptor'; + Getopt::Long::config('pass_through'); + &GetOptions( + 'db_file=s' => \$db_file, + 'driver|dbdriver|db_driver=s' => \$driver, + 'host|dbhost|db_host=s' => \$host, + 'user|dbuser|db_user=s' => \$user, + 'pass|dbpass|db_pass=s' => \$pass, + 'dbname|db_name=s' => \$dbname, + 'db_module=s' => \$db_module + ); + + my $db; + if(defined $db_file){ + -e $db_file or die "'$db_file' is defined but does not exist\n"; + eval { $db = do($db_file) }; + $@ and die "'$db_file' is not a perlobj file\n"; + $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') + or die "'$db_file' is not EnsEMBL DBAdaptor\n"; + _debug_print "I get a db from file\n"; + + }elsif(defined $host and defined $user and defined $dbname){ + Bio::Root::Root::_load_module($db_module); + $db = $db_module->new( + -host => $host, + -user => $user, + -pass => $pass, + -dbname => $dbname, + -driver => $driver + ); + }else{ + die "Cannot get the db, due to the insufficient information\n"; + } + return $db; +} + +sub _debug_print { + print STDERR @_ if $debug; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Eprof.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Eprof.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,254 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Eprof - Bespoke Ensembl profiler + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Eprof( 'eprof_start', 'eprof_end', + 'eprof_dump' ); + + &eprof_start('function-a'); + # ... do something + &eprof_end('function-a'); + + &eprof_dump( \*STDERR ); + + # there is an object based set for above as well, for running + # multiple concurrent profilers + +=head1 DESCRIPTION + +This is an Ensembl profiler as we broke the Perl profilers. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Eprof; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception ('throw'); +use Bio::EnsEMBL::Utils::EprofStack; + +use base qw( Exporter ); + +our @EXPORT_OK = + ( 'eprof_start', 'eprof_end', 'eprof_dump', 'eprof_reset' ); + +my $global; + +sub new { + my ($proto) = @_; + + my $class = ref($proto) || $proto; + my $self = bless( { '_tags' => {} }, $class ); + + return $self; +} + +=head2 eprof_start + + Title : eprof_start + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub eprof_start { + my ($tag) = @_; + + if ( !defined($global) ) { + $global = Bio::EnsEMBL::Utils::Eprof->new(); + } + + $global->start($tag); +} + +=head2 eprof_end + + Title : eprof_end + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub eprof_end { + my ($tag) = @_; + + if ( !defined($global) ) { + $global = Bio::EnsEMBL::Utils::Eprof->new(); + } + + $global->end($tag); +} + +sub eprof_dump { + my ($fh) = @_; + + if ( !defined($global) ) { return } + + $global->dump($fh); +} + +=head2 eprof_reset + + Title : eprof_reset + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub eprof_reset { undef($global) } + +=head2 dump + + Title : dump + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub dump { + my ( $self, $fh ) = @_; + + my @tags = sort { + $self->_tags()->{$a}->total_time() + <=> $self->_tags()->{$b}->total_time() + } keys %{ $self->_tags() }; + + foreach my $tag (@tags) { + my $st = $self->_tags->{$tag}; + + if ( $st->number() == 0 ) { next } + + my $STD = '---'; + + if ( $st->number() > 1 ) { + my $SS = + $st->total_time_time() - + $st->total_time()*$st->total_time()/$st->number(); + + if ( $SS > 0 ) { + $STD = sprintf( "%6f", + sqrt( $SS/$st->number()/( $st->number() - 1 ) ) + ); + } + } + + print( $fh sprintf( "Eprof: %20s %6f %6f %d %s [%6f,%6f]\n", + $st->tag(), $st->total_time(), + $st->total_time()/$st->number(), $st->number(), + $STD, $st->min_time(), + $st->max_time() ) ); + } ## end foreach my $tag (@tags) +} ## end sub dump + +=head2 start + + Title : start + Usage : $eprof->start('this_tag'); + Function: + Example : + Returns : + Args : + + +=cut + +sub start { + my ( $self, $tag ) = @_; + + if ( !defined($tag) ) { + $self->throw("No tag, can't start."); + } + + if ( !defined( $self->_tags()->{$tag} ) ) { + $self->_tags()->{$tag} = Bio::EnsEMBL::Utils::EprofStack->new($tag); + } + + $self->_tags()->{$tag}->push_stack(); +} + +=head2 end + + Title : end + Usage : $eprof->end('this_tag'); + Function: + Example : + Returns : + Args : + + +=cut + +sub end { + my ( $self, $tag ) = @_; + + if ( !defined($tag) ) { + $self->throw("No tag, can't end."); + } + + if ( !defined( $self->_tags()->{$tag} ) ) { + $self->throw( + sprintf( "Ending with a nonexistant tag '%s'", $tag ) ); + } + + $self->_tags->{$tag}->pop_stack(); +} + +=head2 _tags + + Title : _tags + Usage : $obj->_tags($newval) + Function: + Returns : value of _tags + Args : newvalue (optional) + + +=cut + +sub _tags { + my ($obj) = @_; + return $obj->{'_tags'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/EprofStack.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/EprofStack.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Util::EprofStack - DESCRIPTION of Object + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::EprofStack; + +use strict; +use warnings; + +use POSIX; + +use Bio::EnsEMBL::Utils::Exception ('warning'); + +BEGIN { + eval { + require Time::HiRes; + Time::HiRes->import('time'); + }; +} + +sub new { + my ( $proto, $name ) = @_; + + my $class = ref($proto) || $proto; + + my $self = bless( { 'is_active' => 0, + 'total_time' => 0, + 'total_time_time' => 0, + 'max_time' => 0, + 'min_time' => 999999999, + 'number' => 0, + 'tag' => $name + }, + $class ); + + return $self; +} + +=head2 push_stack + + Title : push_stack + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub push_stack { + my ( $self, @args ) = @_; + + if ( $self->{'is_active'} == 1 ) { + warning( + sprintf( "Attempting to push stack on tag '%s' " + . "when active. Discarding previous push." + . $self->tag() ) ); + } + + # my ( $user, $sys ) = times(); + # $self->{'current_start'} = (POSIX::times)[0]; + + $self->{'current_start'} = time(); + $self->{'is_active'} = 1; +} + +=head2 pop_stack + + Title : pop_stack + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub pop_stack { + my ( $self, @args ) = @_; + + if ( $self->{'is_active'} == 0 ) { + warning( + sprintf( "Attempting to pop stack on tag '%s' " + . "when not active. Ignoring.", + $self->tag() ) ); + } + + # my ( $user, $sys ) = times(); + # my $clocktime = + # ( (POSIX::times)[0] - $self->{'current_start'} )/ + # POSIX::sysconf(&POSIX::_SC_CLK_TCK); + + my $clocktime = time() - $self->{'current_start'}; + + if ( $self->{'max_time'} < $clocktime ) { + $self->{'max_time'} = $clocktime; + } + if ( $self->{'min_time'} > $clocktime ) { + $self->{'min_time'} = $clocktime; + } + + $self->{'total_time'} += $clocktime; + $self->{'total_time_time'} += $clocktime*$clocktime; + $self->{'number'}++; + $self->{'is_active'} = 0; +} ## end sub pop_stack + +=head2 total_time_time + + Title : total_time_time + Usage : $obj->total_time_time($newval) + Function: + Returns : value of total_time_time + Args : newvalue (optional) + + +=cut + +sub total_time_time { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'total_time_time'} = $value } + + return $self->{'total_time_time'}; +} + +=head2 max_time + + Title : max_time + Usage : $obj->max_time($newval) + Function: + Returns : value of max_time + Args : newvalue (optional) + + +=cut + +sub max_time { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'max_time'} = $value } + + return $self->{'max_time'}; +} + +=head2 min_time + + Title : min_time + Usage : $obj->min_time($newval) + Function: + Returns : value of min_time + Args : newvalue (optional) + + +=cut + +sub min_time { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'min_time'} = $value } + + return $self->{'min_time'}; +} + +=head2 total_time + + Title : total_time + Usage : $obj->total_time($newval) + Function: + Returns : value of total_time + Args : newvalue (optional) + + +=cut + +sub total_time { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'total_time'} = $value } + + return $self->{'total_time'}; +} + +=head2 number + + Title : number + Usage : $obj->number($newval) + Function: + Returns : value of number + Args : newvalue (optional) + + +=cut + +sub number { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'number'} = $value } + + return $self->{'number'}; +} + +=head2 is_active + + Title : is_active + Usage : $obj->is_active($newval) + Function: + Returns : value of is_active + Args : newvalue (optional) + + +=cut + +sub is_active { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'is_active'} = $value } + + return $self->{'is_active'}; +} + +=head2 current_start + + Title : current_start + Usage : $obj->current_start($newval) + Function: + Returns : value of current_start + Args : newvalue (optional) + + +=cut + +sub current_start { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'current_start'} = $value } + + return $self->{'current_start'}; +} + +=head2 tag + + Title : tag + Usage : $obj->tag($newval) + Function: + Returns : value of tag + Args : newvalue (optional) + + +=cut + +sub tag { + my ( $self, $value ) = @_; + + if ( defined($value) ) { $self->{'tag'} = $value } + + return $self->{'tag'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Exception.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Exception.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,536 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Exception - Utility functions for error handling + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Exception + qw(throw warning deprecate verbose try catch); + + or to get all methods just + + use Bio::EnsEMBL::Utils::Exception; + + eval { throw("this is an exception with a stack trace") }; + if ($@) { + print "Caught exception:\n$@"; + } + + # Or you can us the try/catch confortable syntax instead to deal with + # throw or die. Don't forget the ";" after the catch block. With + # this syntax, the original $@ is in $_ in the catch subroutine. + + try { + throw("this is an exception with a stack trace"); + } + catch { print "Caught exception:\n$_" }; + + # silence warnings + verbose('OFF'); + + warning('this is a silent warning'); + + #show deprecated and warning messages but not info + verbose('DEPRECATE'); + + warning('this is a warning'); + + # show all messages + verbose('ALL'); + + info('this is an informational message'); + + sub my_sub { deprecate('use other_sub() instead') } + + verbose('EXCEPTION'); + info( 'This is a high priority info message.', 1000 ); + +=head1 DESCRIPTION + +This is derived from the Bio::Root module in BioPerl. Some formatting +has been changed and the deprecate function has been added. Most +notably the object methods are now static class methods that can be +called without inheriting from Bio::Root or Bio::EnsEMBL::Root. This is +especially useful for throwing exceptions with stack traces outside of a +blessed context. + +The originaly implementations of these methods were by Steve Chervitz +and refactored by Ewan Birney. + +It is recommended that these functions be used instead of inheriting +unnecessarily from the Bio::EnsEMBL::Root or Bio::Root object. The +functions exported by this package provide a set of useful error +handling methods. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Exception; + +use strict; +use warnings; + +use Bio::EnsEMBL::ApiVersion; + +use Exporter; + +use vars qw(@ISA @EXPORT); + +@ISA = qw(Exporter); +@EXPORT = qw(throw warning stack_trace_dump + stack_trace verbose deprecate info try catch); + +my $VERBOSITY = 3000; +my $DEFAULT_INFO = 4000; +my $DEFAULT_DEPRECATE = 3000; +my $DEFAULT_WARNING = 2000; +my $DEFAULT_EXCEPTION = 1000; + + +=head2 throw + + Arg [1] : string $msg + Arg [2] : (optional) int $level + override the default level of exception throwing + Example : use Bio::EnsEMBL::Utils::Exception qw(throw); + throw('We have a problem'); + Description: Throws an exception which if not caught by an eval will + provide a stack trace to STDERR and die. If the verbosity level + is lower than the level of the throw, then no error message is + displayed but the program will still die (unless the exception + is caught). + Returntype : none + Exceptions : thrown every time + Caller : generally on error + +=cut + +sub throw { + my $string = shift; + + # For backwards compatibility with Bio::EnsEMBL::Root::throw: Allow + # to be called as an object method as well as class method. Root + # function now deprecated so call will have the string instead. + + $string = shift if ( ref($string) ); # Skip object if one provided. + $string = shift if ( $string eq "Bio::EnsEMBL::Utils::Exception" ); + + my $level = shift; + $level = $DEFAULT_EXCEPTION if ( !defined($level) ); + + if ( $VERBOSITY < $level ) { + die("\n"); # still die, but silently + } + + my $std = stack_trace_dump(3); + + my $out = sprintf( + "\n" . + "-------------------- EXCEPTION --------------------\n" . + "MSG: %s\n" . + "%s" . + "Date (localtime) = %s\n" . + "Ensembl API version = %s\n" . + "---------------------------------------------------\n", + $string, $std, scalar( localtime() ), software_version() ); + + die($out); +} ## end sub throw + + + +=head2 warning + + Arg [1] : string warning(message); + Arg [2] : (optional) int level + Override the default level of this warning changning the level + of verbosity at which it is displayed. + Example : use Bio::EnsEMBL::Utils::Exception qw(warning) + warning('This is a warning'); + Description: If the verbosity level is higher or equal to the level of this + warning then a warning message is printed to STDERR. If the + verbosity lower then nothing is done. Under the default + levels of warning and verbosity warnings will be displayed. + Returntype : none + Exceptions : warning every time + Caller : general + +=cut + +sub warning { + my $string = shift; + + # See throw() for this: + $string = shift if ( ref($string) ); # Skip object if one provided. + $string = shift if ( $string eq "Bio::EnsEMBL::Utils::Exception" ); + + my $level = shift; + $level = $DEFAULT_WARNING if ( !defined($level) ); + + return if ( $VERBOSITY < $level ); + + my @caller = caller; + my $line = $caller[2] || ''; + + # Use only two sub-dirs for brevity when reporting the file name. + my $file; + my @path = split( /\//, $caller[1] ); + $file = pop(@path); + my $i = 0; + while ( @path && $i < 2 ) { + $i++; + $file = pop(@path) . "/$file"; + } + + @caller = caller(1); + my $caller_line; + my $caller_file; + $i = 0; + if (@caller) { + @path = split( /\//, $caller[1] ); + $caller_line = $caller[2]; + $caller_file = pop(@path); + while ( @path && $i < 2 ) { + $i++; + $caller_file = pop(@path) . "/$caller_file"; + } + } + + my $out = + sprintf( "\n" . + "-------------------- WARNING ----------------------\n" . + "MSG: %s\n" . + "FILE: %s LINE: %d\n", + $string, $file, $line ); + + if ( defined($caller_file) ) { + $out .= sprintf( "CALLED BY: %s LINE: %d\n", $caller_file, + $caller_line ); + } + $out .= sprintf( + "Date (localtime) = %s\n" . + "Ensembl API version = %s\n" . + "---------------------------------------------------\n", + scalar( localtime() ), software_version() ); + + warn($out); + +} ## end sub warning + + + +=head2 info + + Arg [1] : string $string + The message to be displayed + Arg [2] : (optional) int $level + Override the default level of this message so it is displayed at + a different level of verbosity than it normally would be. + Example : use Bio::EnsEMBL::Utils::Exception qw(verbose info) + Description: This prints an info message to STDERR if verbosity is higher + than the level of the message. By default info messages are not + displayed. + Returntype : none + Exceptions : none + Caller : general + +=cut + +sub info { + my $string = shift; + $string = shift if($string eq "Bio::EnsEMBL::Utils::Exception"); + my $level = shift; + + $level = $DEFAULT_INFO if(!defined($level)); + + return if($VERBOSITY < $level); + + print STDERR "INFO: $string\n"; +} + + + +=head2 verbose + + Arg [1] : (optional) int + Example : use Bio::EnsEMBL::Utils::Exception qw(verbose warning); + #turn warnings and everything more important on (e.g. exception) + verbose('WARNING'); + warning("Warning displayed"); + info("This won't be displayed"); + deprecate("This won't be diplayed"); + + #turn exception messages on + verbose('EXCEPTION'); + warning("This won't do anything"); + throw("Die with a message"); + + #turn everying off + verbose('OFF'); #same as verbose(0); + warning("This won't do anything"); + throw("Die silently without a message"); + + #turn on all messages + verbose('ALL'); + info("All messages are now displayed"); + + if(verbose() > 3000) { + print "Verbosity is pretty high"; + } + + Description: Gets/Sets verbosity level which defines which messages are + to be displayed. An integer value may be passed or one of the + following strings: + 'OFF' (= 0) + 'EXCEPTION' (= 1000) + 'WARNING' (= 2000) + 'DEPRECATE' (= 3000) + 'INFO' (= 4000) + 'ALL' (= 1000000) + + Returntype : int + Exceptions : none + Caller : general + +=cut + + +sub verbose { + if(@_) { + my $verbosity = shift; + $verbosity = shift if($verbosity eq "Bio::EnsEMBL::Utils::Exception"); + if($verbosity =~ /\d+/) { #check if verbosity is an integer + $VERBOSITY = $verbosity; + } else { + $verbosity = uc($verbosity); + if($verbosity eq 'OFF' || $verbosity eq 'NOTHING' || + $verbosity eq 'NONE') { + $VERBOSITY = 0; + } elsif($verbosity eq 'EXCEPTION' || $verbosity eq 'THROW') { + $VERBOSITY = $DEFAULT_EXCEPTION; + } elsif($verbosity eq 'WARNING' || $verbosity eq 'WARN') { + $VERBOSITY = $DEFAULT_WARNING; + } elsif($verbosity eq 'DEPRECATE' || $verbosity eq 'DEPRECATED') { + $VERBOSITY = $DEFAULT_DEPRECATE; + } elsif($verbosity eq 'INFO') { + $VERBOSITY = $DEFAULT_INFO; + } elsif($verbosity eq 'ON' || $verbosity eq 'ALL') { + $VERBOSITY = 1e6; + } else { + $VERBOSITY = $DEFAULT_WARNING; + warning("Unknown level of verbosity: $verbosity"); + } + } + } + + return $VERBOSITY; +} + + + +=head2 stack_trace_dump + + Arg [1] : (optional) int $levels + The number of levels to ignore from the top of the stack when + creating the dump. This is useful when this is called internally + from a warning or throw function when the immediate caller and + stack_trace_dump function calls are themselves uninteresting. + Example : use Bio::EnsEMBL::Utils::Exception qw(stack_trace_dump); + print STDERR stack_trace_dump(); + Description: Returns a stack trace formatted as a string + Returntype : string + Exceptions : none + Caller : general, throw, warning + +=cut + +sub stack_trace_dump{ + my @stack = stack_trace(); + + my $levels = 2; #default is 2 levels so stack_trace_dump call is not present + $levels = shift if(@_); + $levels = shift if($levels eq "Bio::EnsEMBL::Utils::Exception"); + $levels = 1 if($levels < 1); + + while($levels) { + $levels--; + shift @stack; + } + + my $out; + my ($module,$function,$file,$position); + + + foreach my $stack ( @stack) { + ($module,$file,$position,$function) = @{$stack}; + $out .= "STACK $function $file:$position\n"; + } + + return $out; +} + + + +=head2 stack_trace + + Arg [1] : none + Example : use Bio::EnsEMBL::Utils::Exception qw(stack_trace) + Description: Gives an array to a reference of arrays with stack trace info + each coming from the caller(stack_number) call + Returntype : array of listrefs of strings + Exceptions : none + Caller : general, stack_trace_dump() + +=cut + +sub stack_trace { + my $i = 0; + my @out; + my $prev; + while ( my @call = caller($i++)) { + + # major annoyance that caller puts caller context as + # function name. Hence some monkeying around... + $prev->[3] = $call[3]; + push(@out,$prev); + $prev = \@call; + } + $prev->[3] = 'toplevel'; + push(@out,$prev); + return @out; +} + + +=head2 deprecate + + Arg [1] : string $mesg + A message describing why a method is deprecated + Example : use Bio::EnsEMBL::Utils::Exception qw(deprecate) + sub old_sub { + deprecate('Please use new_sub() instead'); + } + Description: Prints a warning to STDERR that the method which called + deprecate() is deprecated. Also prints the line number and + file from which the deprecated method was called. Deprecated + warnings only appear once for each location the method was + called from. No message is displayed if the level of verbosity + is lower than the level of the warning. + Returntype : none + Exceptions : warning every time + Caller : deprecated methods + +=cut + +my %DEPRECATED; + +sub deprecate { + my $mesg = shift; + $mesg = shift if($mesg eq "Bio::EnsEMBL::Utils::Exception"); #skip object if one provided + + my $level = shift; + + $level = $DEFAULT_DEPRECATE if(!defined($level)); + + return if($VERBOSITY < $level); + + my @caller = caller(1); + my $subname = $caller[3] ; + my $line = $caller[2]; + + #use only 2 subdirs for brevity when reporting the filename + my $file; + my @path = $caller[1]; + $file = pop(@path); + my $i = 0; + while(@path && $i < 2) { + $i++; + $file .= pop(@path); + } + + #keep track of who called this method so that the warning is only displayed + #once per deprecated call + return if $DEPRECATED{"$line:$file:$subname"}; + + if ( $VERBOSITY > -1 ) { + print STDERR + "\n------------------ DEPRECATED ---------------------\n" + . "Deprecated method call in file $file line $line.\n" + . "Method $subname is deprecated.\n" + . "$mesg\n" + . "Ensembl API version = " + . software_version() . "\n" + . "---------------------------------------------------\n"; + } + + $DEPRECATED{"$line:$file:$subname"} = 1; +} + +=head2 try/catch + + Arg [1] : anonymous subroutine + the block to be tried + Arg [2] : return value of the catch function + Example : use Bio::EnsEMBL::Utils::Exception qw(throw try catch) + The syntax is: + try { block1 } catch { block2 }; + { block1 } is the 1st argument + catch { block2 } is the 2nd argument + e.g. + try { + throw("this is an exception with a stack trace"); + } catch { + print "Caught exception:\n$_"; + }; + In block2, $_ is assigned the value of the first + throw or die statement executed in block 1. + + Description: Replaces the classical syntax + eval { block1 }; + if ($@) { block2 } + by a more confortable one. + In the try/catch syntax, the original $@ is in $_ in the catch subroutine. + This try/catch implementation is a copy and paste from + "Programming Perl" 3rd Edition, July 2000, by L.Wall, T. Christiansen + & J. Orwant. p227, and is only possible because of subroutine prototypes. + Returntype : depend on what is implemented the try or catch block + Exceptions : none + Caller : general + +=cut + +sub try (&$) { + my ($try, $catch) = @_; + eval { &$try }; + if ($@) { + chop $@; + local $_ = $@; + &$catch; + } +} + +sub catch (&) { + shift; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,474 @@ +package Bio::EnsEMBL::Utils::IO; + +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=pod + +=head1 NAME + +Bio::EnsEMBL::Utils::IO + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::IO qw/slurp work_with_file slurp_to_array fh_to_array/; + #or + # use Bio::EnsEMBL::Utils::IO qw/:slurp/; #brings in any method starting with slurp + # use Bio::EnsEMBL::Utils::IO qw/:array/; #brings in any method which ends with _array + # use Bio::EnsEMBL::Utils::IO qw/:gz/; #brings all methods which start with gz_ + # use Bio::EnsEMBL::Utils::IO qw/:all/; #brings all methods in + + #As a scalar + my $file_contents = slurp('/my/file/location.txt'); + print length($file_contents); + + #As a ref + my $file_contents_ref = slurp('/my/file/location.txt', 1); + print length($$file_contents_ref); + + #Sending it to an array + my $array = slurp_to_array('/my/location'); + work_with_file('/my/location', 'r', sub { + $array = process_to_array($_[0], sub { + #Gives us input line by line + return "INPUT: $_"; + }); + }); + + #Simplified vesion but without the post processing + $array = fh_to_array($fh); + + #Sending this back out to another file + work_with_file('/my/file/newlocation.txt', 'w', sub { + my ($fh) = @_; + print $fh $$file_contents_ref; + return; + }); + + #Gzipping the data to another file + gz_work_with_file('/my/file.gz', 'w', sub { + my ($fh) = @_; + print $fh $$file_contents_ref; + return; + }); + + #Working with a set of lines manually + work_with_file('/my/file', 'r', sub { + my ($fh) = @_; + iterate_lines($fh, sub { + my ($line) = @_; + print $line; #Send the line in the file back out + return; + }); + return; + }); + + #Doing the same in one go + iterate_file('/my/file', sub { + my ($line) = @_; + print $line; #Send the line in the file back out + return; + }); + + #Move all data from one file handle to another. Bit like a copy + move_data($src_fh, $trg_fh); + +=head1 DESCRIPTION + +A collection of subroutines aimed to helping IO based operations + +=head1 METHODS + +See subroutines. + +=head1 MAINTAINER + +$Author: ady $ + +=head1 VERSION + +$Revision: 1.10 $ + +=cut + +use strict; +use warnings; + +use base qw(Exporter); + +our $GZIP_OK = 0; +our @EXPORT_OK = qw/slurp slurp_to_array fh_to_array process_to_array work_with_file gz_slurp gz_slurp_to_array gz_work_with_file iterate_file iterate_lines move_data/; +our %EXPORT_TAGS = ( + all => [@EXPORT_OK], + slurp => [qw/slurp slurp_to_array gz_slurp gz_slurp_to_array/], + array => [qw/fh_to_array process_to_array slurp_to_array gz_slurp_to_array/], + gz => [qw/gz_slurp gz_slurp_to_array gz_work_with_file/], + iterate => [qw/iterate_file iterate_lines/], +); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Scalar qw(:assert); +use IO::File; +eval { + require IO::Compress::Gzip; + require IO::Uncompress::Gunzip; + $GZIP_OK = 1; +}; + +=head2 slurp() + + Arg [1] : string $file + Arg [2] : boolean; $want_ref + Arg [3] : boolean; $binary + Indicates if we want to return a scalar reference + Description : Forces the contents of a file into a scalar. This is the + fastest way to get a file into memory in Perl. You can also + get a scalar reference back to avoid copying the file contents + in Scalar references. If the input file is binary then specify + with the binary flag + Returntype : Scalar or reference of the file contents depending on arg 2 + Example : my $contents = slurp('/tmp/file.txt'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub slurp { + my ($file, $want_ref, $binary) = @_; + my $contents = q{}; + work_with_file($file, 'r', sub { + my ($fh) = @_; + binmode($fh) if $binary; + my $size_left = -s $file; + while( $size_left > 0 ) { + my $read_cnt = sysread($fh, $contents, $size_left, length($contents)); + unless( $read_cnt ) { + throw "read error in file $file: $!" ; + last; + } + $size_left -= $read_cnt ; + } + return; + }); + return ($want_ref) ? \$contents : $contents; +} + +=head2 gz_slurp() + + Arg [1] : string $file + Arg [2] : boolean; $want_ref Indicates if we want to return a scalar reference + Arg [3] : boolean; $binary + Arg [4] : HashRef arguments to pass into IO compression layers + Description : Forces the contents of a file into a scalar. This is the + fastest way to get a file into memory in Perl. You can also + get a scalar reference back to avoid copying the file contents + in Scalar references. If the input file is binary then specify + with the binary flag + Returntype : Scalar or reference of the file contents depending on arg 2 + Example : my $contents = slurp('/tmp/file.txt.gz'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub gz_slurp { + my ($file, $want_ref, $binary, $args) = @_; + my $contents; + gz_work_with_file($file, 'r', sub { + my ($fh) = @_; + local $/ = undef; + binmode($fh) if $binary; + $contents = <$fh>; + return; + }, $args); + return ($want_ref) ? \$contents : $contents; +} + +=head2 slurp_to_array() + + Arg [1] : string $file + Arg [2] : boolean $chomp + Description : Sends the contents of the given file into an ArrayRef + Returntype : ArrayRef + Example : my $contents_array = slurp_to_array('/tmp/file.txt'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub slurp_to_array { + my ($file, $chomp) = @_; + my $contents; + work_with_file($file, 'r', sub { + my ($fh) = @_; + $contents = fh_to_array($fh, $chomp); + return; + }); + return $contents; +} + +=head2 gz_slurp_to_array() + + Arg [1] : string $file + Arg [2] : boolean $chomp + Arg [3] : HashRef arguments to pass into IO compression layers + Description : Sends the contents of the given gzipped file into an ArrayRef + Returntype : ArrayRef + Example : my $contents_array = slurp_to_array('/tmp/file.txt.gz'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub gz_slurp_to_array { + my ($file, $chomp, $args) = @_; + my $contents; + gz_work_with_file($file, 'r', sub { + my ($fh) = @_; + $contents = fh_to_array($fh, $chomp); + return; + }, $args); + return $contents; +} + +=head2 fh_to_array() + + Arg [1] : Glob/IO::Handle $fh + Arg [2] : boolean $chomp + Description : Sends the contents of the given filehandle into an ArrayRef. + Will perform chomp on each line if specified. If you require + any more advanced line based processing then see + L. + Returntype : ArrayRef + Example : my $contents_array = fh_to_array($fh); + Exceptions : None + Status : Stable + +=cut + +sub fh_to_array { + my ($fh, $chomp) = @_; + if($chomp) { + return process_to_array($fh, sub { + my ($line) = @_; + chomp($line); + return $line; + }); + } + my @contents = <$fh>; + return \@contents; +} + +=head2 process_to_array + + Arg [1] : Glob/IO::Handle $fh + Arg [2] : CodeRef $callback + Description : Sends the contents of the given file handle into an ArrayRef + via the processing callback. Assumes line based input. + Returntype : ArrayRef + Example : my $array = process_to_array($fh, sub { return "INPUT: $_"; }); + Exceptions : If the fh did not exist or if a callback was not given. + Status : Stable + +=cut + +sub process_to_array { + my ($fh, $callback) = @_; + assert_file_handle($fh, 'FileHandle'); + assert_ref($callback, 'CODE', 'callback'); + my @contents; + iterate_lines($fh, sub { + my ($line) = @_; + push(@contents, $callback->($line)); + return; + }); + return \@contents; +} + +=head2 iterate_lines + + Arg [1] : Glob/IO::Handle $fh + Arg [2] : CodeRef $callback + Description : Iterates through each line from the given file handle and + hands them to the callback one by one + Returntype : None + Example : iterate_lines($fh, sub { print "INPUT: $_"; }); + Exceptions : If the fh did not exist or if a callback was not given. + Status : Stable + +=cut + +sub iterate_lines { + my ($fh, $callback) = @_; + assert_file_handle($fh, 'FileHandle'); + assert_ref($callback, 'CODE', 'callback'); + while( my $line = <$fh> ) { + $callback->($line); + } + return; +} + +=head2 iterate_file + + Arg [1] : string $file + Arg [3] : CodeRef the callback which is used to iterate the lines in + the file + Description : Iterates through each line from the given file and + hands them to the callback one by one + Returntype : None + Example : iterate_file('/my/file', sub { print "INPUT: $_"; }); + Exceptions : If the file did not exist or if a callback was not given. + Status : Stable + +=cut + + +sub iterate_file { + my ($file, $callback) = @_; + work_with_file($file, 'r', sub { + my ($fh) = @_; + iterate_lines($fh, $callback); + return; + }); + return; +} + + + +=head2 work_with_file() + + Arg [1] : string $file + Arg [2] : string; $mode + Supports all modes specified by the C function as well as those + supported by IO::File + Arg [3] : CodeRef the callback which is given the open file handle as + its only argument + Description : Performs the nitty gritty of checking if a file handle is open + and closing the resulting filehandle down. + Returntype : None + Example : work_with_file('/tmp/out.txt', 'w', sub { + my ($fh) = @_; + print $fh 'hello'; + return; + }); + Exceptions : If we could not work with the file due to permissions + Status : Stable + +=cut + +sub work_with_file { + my ($file, $mode, $callback) = @_; + throw "We need a file name to open" if ! $file; + throw "We need a mode to open the requested file with" if ! $mode; + assert_ref($callback, 'CODE', 'callback'); + my $fh = IO::File->new($file, $mode) or + throw "Cannot open '${file}' in mode '${mode}': $!"; + $callback->($fh); + close($fh) or throw "Cannot close FH from ${file}: $!"; + return; +} + +=head2 gz_work_with_file() + + Arg [1] : string $file + Arg [2] : string; $mode + Supports modes like C, C, C<\>> and C<\<> + Arg [3] : CodeRef the callback which is given the open file handle as + its only argument + Arg [4] : HashRef used to pass options into the IO + compression/uncompression modules + Description : Performs the nitty gritty of checking if a file handle is open + and closing the resulting filehandle down. + Returntype : None + Example : work_with_file('/tmp/out.txt.gz', 'w', sub { + my ($fh) = @_; + print $fh 'hello'; + return; + }); + Exceptions : If we could not work with the file due to permissions + Status : Stable + +=cut + +sub gz_work_with_file { + my ($file, $mode, $callback, $args) = @_; + throw "IO::Compress was not available"if ! $GZIP_OK; + throw "We need a file name to open" if ! $file; + throw "We need a mode to open the requested file with" if ! $mode; + assert_ref($callback, 'CODE', 'callback'); + $args ||= {}; + + my $fh; + { + no warnings qw/once/; + if($mode =~ '>$' || $mode eq 'w') { + $args->{Append} = 1 if $mode =~ />>$/; + $fh = IO::Compress::Gzip->new($file, %$args) or throw "Cannot open '$file' for writing: $IO::Compress::Gzip::GzipError"; + } + elsif($mode eq '<' || $mode eq 'r') { + $fh = IO::Uncompress::Gunzip->new($file, %$args) or throw "Cannot open '$file' for writing: $IO::Uncompress::Gunzip::GunzipError"; + } + else { + throw "Could not decipher a mode from '$mode'"; + } + }; + $callback->($fh); + close($fh) or throw "Cannot close FH from ${file}: $!"; + return; +} + +=head2 move_data + + Arg [1] : FileHandle $src_fh + Arg [2] : FileHandle $trg_fh + Arg [3] : int $buffer. Defaults to 8KB + Description : Moves data from the given source filehandle to the target one + using a 8KB buffer or user specified buffer + Returntype : None + Example : move_data($src_fh, $trg_fh, 16*1024); # copy in 16KB chunks + Exceptions : If inputs were not as expected + +=cut + +sub move_data { + my ($src_fh, $trg_fh, $buffer_size) = @_; + assert_file_handle($src_fh, 'SourceFileHandle'); + assert_file_handle($trg_fh, 'TargetFileHandle'); + + $buffer_size ||= 8192; #Default 8KB + my $buffer; + while(1) { + my $read = sysread($src_fh, $buffer, $buffer_size); + if(! defined $read) { + throw "Error whilst reading from filehandle: $!"; + } + if($read == 0) { + last; + } + my $written = syswrite($trg_fh, $buffer); + if(!defined $written) { + throw "Error whilst writing to filehandle: $!"; + } + } + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FASTASerializer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FASTASerializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,248 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::IO::FASTASerializer + +=head1 SYNOPSIS + + my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle); + $serializer->chunk_factor(1000); + $serializer->line_width(60); + $serializer->print_Seq($slice); + + $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle, + sub { + my $slice = shift; + return "Custom header"; + } + ); + +=head1 DESCRIPTION + + Replacement for SeqDumper, making better use of shared code. Outputs FASTA + format with optional custom header and formatting parameters. Set line_width + and chunk_factor to dictate buffer size depending on application. A 60kb + buffer is used by default with a line width of 60 characters. + + Custom headers are set by supplying an anonymous subroutine to new(). Custom + header code must accept a Slice or Bio::PrimarySeqI compliant object as + argument and return a string. + + The custom header method can be overridden later through set_custom_header() + but this is not normally necessary. + +=cut + +package Bio::EnsEMBL::Utils::IO::FASTASerializer; + +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use Bio::EnsEMBL::Utils::Scalar qw/assert_ref check_ref/; + +use base qw(Bio::EnsEMBL::Utils::IO::Serializer); + +=head2 new + + Arg [1] : Filehandle (optional) + Arg [2] : CODEREF subroutine for writing custom headers + Arg [3] : [optional] Chunking size (integer) + Arg [4] : [optional] Line width (integer) + Example : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60); + Description: Constructor + Allows the specification of a custom function for rendering + header lines. + Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer; + Exceptions : none + Caller : general + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $filehandle = shift; + my $header_function = shift; + my $chunk_factor = shift; + my $line_width = shift; + + my $self = $class->SUPER::new($filehandle); + + $self->{'header_function'} = $header_function; + $self->{'line_width'} = ($line_width)? $line_width : 60; + $self->{'chunk_factor'} = ($chunk_factor)? $chunk_factor : 1000; + # gives a 60kb buffer by default, increase for higher database and disk efficiency. + + # TODO: Check this error trap works as intended + if ( defined($self->{'header_function'}) ) { + if (ref($self->{'header_function'}) ne "CODE") { + throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer");} + } + else { + $self->{'header_function'} = sub { + my $slice = shift; + + if(check_ref($slice, 'Bio::EnsEMBL::Slice')) { + my $id = $slice->seq_region_name; + my $seqtype = 'dna'; + my $idtype = $slice->coord_system->name; + my $location = $slice->name; + + return "$id $seqtype:$idtype $location"; + } + else { + # must be a Bio::Seq , or we're doomed + + return $slice->display_id; + } + }; + + } + + return $self; +} + +=head2 print_metadata + + Arg [1] : Bio::EnsEMBL::Slice + Description: Printing header lines into FASTA files. Usually handled + internally to the serializer. + Returntype : None + Caller : print_Seq +=cut + +sub print_metadata { + my $self = shift; + my $slice = shift; + my $fh = $self->{'filehandle'}; + my $function = $self->header_function(); + my $metadata = $function->($slice); + print $fh '>'.$metadata."\n"; +} + +=head2 print_Seq + + Arg [1] : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object + + Description: Serializes the slice into FASTA format. Buffering is used + While other Bioperl PrimarySeqI implementations can be used, + a custom header function will be required to accommodate it. + + Returntype : None + +=cut + +sub print_Seq { + my $self = shift; + my $slice = shift; + my $fh = $self->{'filehandle'}; + + $self->print_metadata($slice); + my $width = $self->{line_width}; + + # set buffer size + my $chunk_size = $self->{'chunk_factor'} * $width; + + my $start = 1; + my $end = $slice->length(); + + #chunk the sequence to conserve memory, and print + + my $here = $start; + + while($here <= $end) { + my $there = $here + $chunk_size - 1; + $there = $end if($there > $end); + my $seq = $slice->subseq($here, $there); + $seq =~ s/(.{1,$width})/$1\n/g; + print $fh $seq or die "Error writing to file handle"; + $here = $there + 1; + } + + if ($slice->length > 0) {$self->{'achieved_something'} = 1;} + +} + +=head2 line_width + + Arg [1] : Integer e.g. 60 or 80 + Description: Set and get FASTA format line width. Default is 60 + Returntype : Integer + +=cut + +sub line_width { + my $self = shift; + my $line_width = shift; + if ($line_width) { $self->{'line_width'} = $line_width }; + return $self->{'line_width'} +} + +=head2 chunk_factor + Arg [1] : Integer e.g. 1000 + Description: Set and get the multiplier used to dictate buffer size + Chunk factor x line width = buffer size in bases. + Returntype : Integer +=cut + +sub chunk_factor { + my $self = shift; + my $chunk_factor = shift; + if ($chunk_factor) { $self->{'chunk_factor'} = $chunk_factor}; + return $self->{'chunk_factor'} +} + +=head2 set_custom_header + + Arg [1] : CODE reference + Description: Set the custom header function. Normally this is done at + construction time, but can be overridden here. + Example : $serializer->set_custom_header( sub { return 'New header'}); + Returntype : + +=cut + +sub set_custom_header { + my ($self, $new_header_function) = @_; + $self->header_function($new_header_function); + return; +} + +=head2 header_function + + Arg [1] : CODE reference (optional) + Description: Getter/setter for the custom header code + Example : $serializer->header_function( sub { return 'New header'}); + Returntype : CODE + +=cut + +sub header_function { + my ($self, $header_function) = @_; + if($header_function) { + assert_ref($header_function, 'CODE', 'header_function'); + $self->{header_function} = $header_function; + } + return $self->{header_function}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FeatureSerializer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FeatureSerializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,91 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +FeatureSerializer - An abstract serializer for turning EnsEMBL Features into other formats + +=head1 AUTHOR + +Kieron Taylor, 2012 - ktaylor@ebi.ac.uk + +=head1 SYNOPSIS + +my $serializer = new Bio::EnsEMBL::Utils::IO::FeatureSerializer( $filehandle ); +$serializer->print_feature_list( \@list_of_features ); + +=head1 DESCRIPTION + +Generic class for serializing features Subclass this class to create a +format-specific serializer. Be sure to implement print_feature at the +bare minimum + +=cut + +package Bio::EnsEMBL::Utils::IO::FeatureSerializer; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Utils::IO::Serializer); + +# Majority of methods inherited from Serializer + +sub print_feature { + throw( "print_feature method not implemented."); +} + +=head2 print_feature_list + + Arg [1] : Listref of Features + Description: Run print_feature on every feature in the list + +=cut + +sub print_feature_list { + my $self = shift; + my $feature_list = shift; + if (ref($feature_list) eq 'ARRAY') { + if (scalar(@$feature_list) > 0) {$self->{'achieved_something'} = 1;} + foreach my $feature (@{$feature_list}) { + $self->print_feature($feature); + } + } + else { + throw( "print_feature_list requires a listref as argument" ); + } +} + +=head2 print_feature_Iterator + + Arg [1] : Bio::EnsEMBL::Utils::Iterator + Description: Automatically spools through an iterator for convenience + Returntype : None +=cut + +sub print_feature_Iterator { + my $self = shift; + my $iterator = shift; + if ($iterator->can('has_next')) { + $iterator->each( + sub { + $self->print_feature($_); + $self->{'achieved_something'} = 1; + } + ); + } + else { + throw("Supplied iterator does not look like Bio::EnsEMBL::Utils::Iterator"); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,328 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +GFFParser - simple gff3 parser. + + +=head1 AUTHOR + +Monika Komorowska, 2012 - monika@ebi.ac.uk + +=head1 SYNOPSIS + +use strict; +use Bio::EnsEMBL::Utils::IO::GFFParser; +use IO::File; + +my $file_name = "features.gff"; +my $fh = IO::File->new($file_name, 'r'); +my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh); + +my @header_lines = @{$parser->parse_header()}; +#do something with the header lines array, e.g. print array elements + +foreach my $header_line (@header_lines) { + print $header_line . "\n"; +} +print "\n\n"; +my $feature = $parser->parse_next_feature(); + +while (defined($feature) ) { + + my %feature = %{$feature}; + + #do something with the feature, e.g. print hash keys and values + foreach my $key (keys %feature) { + if ($key ne 'attribute') { + print $key . " " . $feature{$key} ."\n"; + } else { + print $key . "\n"; + my %attribs = %{$feature{$key}}; + foreach my $attrib_key (keys %attribs) { + printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)})); + + } + } + } + print "\n\n"; + $feature = $parser->parse_next_feature(); +} + +my $sequence = $parser->parse_next_sequence(); + +while (defined($sequence)) { + my %sequence = %{$sequence}; + + foreach my $key (keys %sequence) { + print $key . " " . $sequence{$key} ."\n"; + } + print "\n\n"; + + $sequence = $parser->parse_next_sequence(); +} + +$parser->close(); + +$fh->close(); + + + +=head1 DESCRIPTION + +GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml + +Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file. + +This class can be extended to convert a feature hash into a feature object reversing +the processing done by GFFSerializer. + +=cut + +package Bio::EnsEMBL::Utils::IO::GFFParser; +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use IO::File; +use URI::Escape; +use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/; + + +my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1'); + +=head2 new + + Constructor + Arg [1] : File handle + + Returntype : Bio::EnsEMBL::Utils::IO::GFFParser + +=cut + +sub new { + my $class = shift; + my $self = { + filehandle => shift, + }; + bless $self, $class; + if (!defined($self->{'filehandle'})) { + throw("GFFParser requires a valid filehandle to a GFF3 formatted file"); + } + return $self; + +} + +=head2 parse_header + + Arg [1] : File handle + Description: Returns a arrayref with each header line stored in array element + Returntype : Arrayref of GFF3 file header lines + +=cut + +sub parse_header { + + my $self = shift; + + my $next_line; + my @header_lines; + + while (($next_line = $self->_read_line()) && ($next_line =~ /^[\#|\s]/) ) { + + #stop parsing features if ##FASTA directive encountered + last if ($next_line =~ /\#\#FASTA/ ); + + #header lines start with ## (except for the ##FASTA directive indicating sequence section) + if ($next_line =~ /^[\#]{2}/ ) { + push @header_lines, $next_line; + if ($next_line =~ /gff-version\s+(\d+)/) { + if ($1 != 3) { + warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files."); + } + } + } + } + + if (defined($next_line)) { + $self->{'first_non_header_line'} = $next_line; + } + return \@header_lines; + +} + +=head2 parse_next_feature + + Arg [1] : File handle + Description: Returns a hashref in the format - + { + seqid => scalar, + source => scalar, + type => scalar, + start => scalar, + end => scalar, + score => scalar, + strand => scalar, + phase => scalar, + attribute => hashref, + + } + Returntype : Hashref of a GFF3 feature line + +=cut + +sub parse_next_feature { + + my $self = shift; + + my $next_line; + my $feature_line; + + while (($next_line = $self->_read_line() ) && defined($next_line) ) { + + #stop parsing features if ##FASTA directive + last if ($next_line =~ /\#\#FASTA/); + + + next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ || + $next_line =~ /^\/\//); + + $feature_line = $next_line; + last; + } + + return undef unless $feature_line; + + my %feature; + my %attribute; + + + #strip off trailing comments + $feature_line =~ s/\#.*//; + + my @chunks = split(/\t/, $feature_line); + + %feature = ( + 'seqid' => uri_unescape($chunks[0]), + 'source' => uri_unescape($chunks[1]), + 'type' => uri_unescape($chunks[2]), + 'start' => $chunks[3], + 'end' => $chunks[4], + 'score' => $chunks[5], + 'strand' => $strand_conversion{$chunks[6]}, + 'phase' => $chunks[7] + ); + + if ($chunks[8]) { + my @attributes = split( /;/, $chunks[8] ); + my %attributes; + foreach my $attribute (@attributes) { + my ( $name, $value ) = split( /=/, $attribute ); + $name = uri_unescape($name); + my @split_values = map { uri_unescape($_) } split(/,/, $value); + if(scalar(@split_values) > 1) { + $attributes{$name} = \@split_values; + } + else { + $attributes{$name} = $split_values[0]; + } + } + $feature{'attribute'} = \%attributes; + } + + return \%feature; +} + +=head2 parse_next_sequence + + Arg [1] : File handle + Description: Returns a hashref in the format - + { + header => scalar, + sequence => scalar, + + } + Returntype : Hashref of a GFF3 sequence line + +=cut + +sub parse_next_sequence { + + my $self = shift; + + my $next_line; + my $sequence; + my $header; + + while (($next_line = $self->_read_line() ) && defined($next_line) ) { + + next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ || + $next_line =~ /^\/\//); + + if ($next_line =~ /^>/) { + if ($header) { + #next fasta header encountered + $self->{'next_fasta_header'} = $next_line; + last; + + } else { + $header = $next_line; + } + } else { + $sequence .= $next_line; + } + } + + return undef unless ($sequence || $header); + + my %sequence = (header => $header , sequence => $sequence ); + + return \%sequence; +} + + +sub _read_line { + + my $self = shift; + my $fh = $self->{'filehandle'}; + + my $line; + + if (defined($self->{'first_non_header_line'})) { + $line = $self->{'first_non_header_line'}; + $self->{'first_non_header_line'} = undef; + } elsif ( defined($self->{'next_fasta_header'} )) { + $line = $self->{'next_fasta_header'}; + $self->{'next_fasta_header'} = undef; + } + else { + $line = <$fh>; + if (defined($line)) { + chomp $line; + if (!$line) { + #parse next line if current line is empty + $line = $self->_read_line(); + } + } + } + + return $line; +} + +sub close { + + my $self = shift; + $self->{"filehandle"} = undef; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,217 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +GFFSerializer - Feature to GFF converter + +=head1 AUTHOR + +Kieron Taylor, 2011 - ktaylor@ebi.ac.uk + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Utils::IO::GFFSerializer; +use Bio::EnsEMBL::Utils::BiotypeMapper; + +my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' ); +my $biotype_mapper = new BiotypeMapper($ontology_adaptor); +my $serializer = new GFFSerializer($biotype_mapper,$output_fh); + +my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' ); +$serializer->print_metadata("Variation Features:"); +my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000); +$serializer->print_feature_Iterator($iterator); + +=head1 DESCRIPTION + +Subclass of Serializer that can turn a feature into a line for the GFF3 format. Requires +a BiotypeMapper in order to translate biotypes to SO terms. + +=cut + +package Bio::EnsEMBL::Utils::IO::GFFSerializer; +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use Bio::EnsEMBL::Utils::BiotypeMapper; +use URI::Escape; +use Bio::EnsEMBL::Utils::IO::FeatureSerializer; + +use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer); + +my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-'); + +=head2 new + + Constructor + Arg [1] : Ontology Adaptor + Arg [2] : Optional File handle + + Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer + +=cut + +sub new { + my $class = shift; + my $self = { + ontology_adaptor => shift, + filehandle => shift, + }; + bless $self, $class; + if ( ref($self->{'ontology_adaptor'}) ne "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" ) { + throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function. See also Bio::EnsEMBL::Utils::BiotypeMapper"); + } + $self->{'mapper'} = new Bio::EnsEMBL::Utils::BiotypeMapper($self->{'ontology_adaptor'}); + + if (!defined ($self->{'filehandle'})) { + # no file handle, let the handle point to a copy of STDOUT instead + open $self->{'filehandle'}, ">&STDOUT"; + $self->{'stdout'} = 1; + } + return $self; +} + +=head2 print_feature + + Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature + Example : $reporter->print_feature($feature,$slice_start_coordinate,"X") + Description: Asks a feature for its summary, and generates a GFF3 + compliant entry to hand back again + Additional attributes are handed through to column 9 of the + output using exact spelling and capitalisation of the + feature-supplied hash. + Returntype : none +=cut + +sub print_feature { + my $self = shift; + my $feature = shift; + my $biotype_mapper = $self->{'mapper'}; + + my $text_buffer = ""; + if ($feature->can('summary_as_hash') ) { + my %summary = %{$feature->summary_as_hash}; + my $row = ""; +# Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below + if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";} + $row .= $summary{'seq_region_name'}."\t"; + +# Column 2 - source, complicated with Ensembl not being the originator of all data + $row .= "EnsEMBL\t"; + +# Column 3 - feature, the ontology term for the kind of feature this row is + my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature); + $row .= $so_term."\t"; + +# Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates +# Start and end must be in ascending order for GFF. Circular genomes require the length of +# the circuit to be added on. + if ($summary{'start'} > $summary{'end'}) { + #assumes this is not a Compara circular sequence and can treat is as a Feature + if ($feature->slice() && $feature->slice()->is_circular() ) { + $summary{'end'} = $summary{'end'} + $feature->seq_region_length; + } + # non-circular, but end still before start + else {$summary{'end'} = $summary{'start'};} + } + $row .= $summary{'start'} . "\t"; + +# Column 5 - end, coordinates (absolute) for the end of this feature + $row .= $summary{'end'} . "\t"; + +# Column 6 - score, for variations only. + if (exists($summary{'score'})) { + $row .= $summary{'score'}."\t"; + } + else { + $row .= ".\t"; + } + +# Column 7 - strand, up or down + if (exists($summary{'strand'})) { + $row .= $strand_conversion{$summary{'strand'}}."\t"; + } + else { + $row .= ".\t"; + } + +# Column 8 - reading frame, necessary only for Exons + $row .= ".\t"; + +# Column 9 - the 'other' section for all GFF and GVF compliant attributes +# We include Stable ID and biotype where possible to supplement the information in the other columns + delete $summary{'seq_region_start'}; + delete $summary{'seq_region_name'}; + delete $summary{'start'}; + delete $summary{'end'}; + delete $summary{'strand'}; + delete $summary{'score'}; +# Slice the hash for specific keys in GFF-friendly order + my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular); + my @ordered_values = @summary{@ordered_keys}; + while (my $key = shift @ordered_keys) { + my $value = shift @ordered_values; + if ($value) { + $row .= $key."=".uri_escape($value,'\t\n\r;=%&,').";"; + } + delete $summary{$key}; + } +# Catch the remaining keys, containing whatever else the Feature provided + foreach my $attribute ( keys(%summary)) { + if (ref $summary{$attribute} eq "ARRAY") { + $row .= $attribute."=".join (',',@{$summary{$attribute}}) . ";" + } + else { + if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,') . ";"; } + } + } +# trim off any trailing commas left by the ordered keys stage above: + $text_buffer .= $row."\n"; + } + else { + warning("Feature failed to self-summarise"); + } + #filehandle is inherited + my $fh = $self->{'filehandle'}; + print $fh $text_buffer; +} + +=head2 print_main_header + + Arg [1] : Arrayref of slices going into the file. + Description: Printing the header text or metadata required for GFF, + using a list of slices to be written + Returntype : None +=cut + +sub print_main_header { + my $self = shift; + my $arrayref_of_slices = shift; + my $fh = $self->{'filehandle'}; + + print $fh "##gff-version 3\n"; + foreach my $slice (@{$arrayref_of_slices}) { + if (not defined($slice)) { warning("Slice not defined.\n"); return;} + print $fh "##sequence-region ",$slice->seq_region_name," ",$slice->start," ",$slice->end,"\n"; + } +} + +sub print_metadata { + my $self = shift; + my $text = shift; + my $fh = $self->{'filehandle'}; + print $fh "\n#".$text."\n"; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO/ReportSerializer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/ReportSerializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,214 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +Report Serializer - generating textual summary reports + +=head1 AUTHOR + +Kieron Taylor, 2011 - ktaylor@ebi.ac.uk + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + use Bio::EnsEMBL::Utils::IO::ReportSerializer; + use IO::File; + + my $registry = 'Bio::EnsEMBL::Registry'; + $output_fh = IO::File->new($config{'output'},'w') or die; + $serializer = new ReportSerializer($output_fh); + my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' ); + my $slice = $slice_adaptor->fetch_by_toplevel_location("6:1000000..1500000"); + + $serializer->print_section_header($slice); + $serializer->print_feature_list($slice->get_all_Genes); + +=head1 DESCRIPTION + +Subclass of Serializer that can turn a feature into a text block +Unsuited to very large slices, because it requires a select-all approach for features. + +=cut + +package Bio::EnsEMBL::Utils::IO::ReportSerializer; +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use URI::Escape; +use Bio::EnsEMBL::Utils::IO::FeatureSerializer; + +use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer); + +my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-'); + +my %feature_conversion = ( 'Bio::EnsEMBL::Gene' => 'Gene', + 'Bio::EnsEMBL::Transcript' => 'Transcript', + 'Bio::EnsEMBL::Translation' => 'Translation', + 'Bio::EnsEMBL::Variation::StructuralVariationFeature' => 'Structural Variation', + 'Bio::EnsEMBL::Variation::VariationFeature' => 'Variation', + 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => 'Regulatory Feature', + 'Bio::EnsEMBL::Compara::ConstrainedElement' => 'Constrained Element', + 'Feature' => 'Feature', +); + +# Hash for selecting the correct attributes of unseen features for crude summary. This hash is +# for fallback behaviour, slicing summary hashes for a limited set of values. +my %printables = ( + 'Bio::EnsEMBL::Gene' => ['ID','biotype','start','end'], + 'Bio::EnsEMBL::Transcript' => ['ID','start','end'], + 'Bio::EnsEMBL::Translation' => ['ID'], + 'Bio::EnsEMBL::Variation::VariationFeature' => ['ID','start','end','strand','seq_region_name'], + 'Bio::EnsEMBL::Variation::StructuralVariationFeature' => ['ID','start','end','strand','seq_region_name'], + 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => ['ID','start','end','strand'], + 'Bio::EnsEMBL::Compara::ConstrainedElement' => ['ID','start','end','strand','seq_region_name'], + ); + +=head2 print_feature + + Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature + Example : $reporter->print_feature($feature,$slice_start_coordinate,"X") +=cut + +sub print_feature { + my $self = shift; + my $feature = shift; + my $fh = $self->{'filehandle'}; + my $feature_type = ref($feature); + + if ($feature->can('summary_as_hash') ) { + my %summary = %{$feature->summary_as_hash}; + my @values = @summary{ @{$printables{$feature_type}} }; + print $fh join(',',@values)."\n"; + } + else { + warning("Feature failed to self-summarise"); + } +} + +=head2 print_feature_list + + Arg [1] : Listref of Bio::EnsEMBL::Feature, subclass or related pseudo-feature + Description: Relies on a list of similar features to print in a block together. + Overrides superclass method + Results are truncated after the first 100 features for brevity. + Example : $reporter->print_feature_list(\@features); +=cut + +sub print_feature_list { + my $self = shift; + my $feature_list = shift; + if (scalar(@$feature_list) > 0) {$self->{'achieved_something'} = 1;} #from superclass + my $fh = $self->{'filehandle'}; + + my $example_feature = $feature_list->[0]; + my $feature_type = ref($example_feature); + my $feature_count = 0; + unless (defined $feature_type) {$feature_type = "Feature"}; + if (scalar(@$feature_list) > 0) { + print $fh "There are ",scalar(@$feature_list)," ",$feature_conversion{$feature_type},(scalar(@$feature_list) != 1) ? "s":""," in this region\n"; + } + if (scalar(@$feature_list) > 100 ) { print $fh "Too many to display, results truncated to the first 100\n";} + print $fh "\n"; + foreach my $feature (@$feature_list) { + $feature_count++; + my %attributes = %{$feature->summary_as_hash}; + + if ($feature_count == 100) {last;} + # Begin the feature-specific formatting code + if ($feature_type eq "Bio::EnsEMBL::Gene") { + print $fh "\tGene ".$feature_count.": ".$attributes{'external_name'}.",".$attributes{'ID'}."\n"; + print $fh "\tBiotype: ".$attributes{'biotype'}."\n"; + print $fh "\tLocation: ".$attributes{'start'}."-".$attributes{'end'}." bp\n\n"; + + print $fh "\tTranscripts and proteins\n"; + foreach my $transcript (@{$feature->get_all_Transcripts}) { + my %tr_summary = %{$transcript->summary_as_hash}; + print $fh "\t\t ".$tr_summary{'ID'}; + my $translation = $transcript->translation; + if (defined $translation) { + my %pr_summary = %{$translation->summary_as_hash}; + print $fh " - ".$pr_summary{'ID'}."\n\n"; + } + else {print $fh " - no protein\n\n";} + } + print $fh "\n"; + } + elsif ($feature_type eq "Bio::EnsEMBL::Funcgen::RegulatoryFeature") { + print $fh "\t".$attributes{'ID'}."\n"; + } + elsif ($feature_type eq "Bio::EnsEMBL::Compara::ConstrainedElement") { + print $fh "\t".$attributes{'start'}."-".$attributes{'end'}."\n"; + } + elsif ( $feature_type eq "Bio::EnsEMBL::Variation::StructuralVariationFeature" + or $feature_type eq "Bio::EnsEMBL::Variation::VariationFeature") { + print $fh "\tID: ".$attributes{'ID'}." Position: ". + $attributes{'start'}."-".$attributes{'end'}." on strand ".$attributes{'strand'}." \n"; + } + else { + # slice favourite values out unformatted. + my @values = @attributes{ @{$printables{$feature_type}} }; + print $fh $feature_type.join(',',@values)."\n"; + + } + } +} + +# Just print individuals without awareness of list size and position. +sub print_feature_iterator { + my $self = shift; + my $feature_iterator = shift; + while ($feature_iterator->has_next) { + my $feature = $feature_iterator->next; + $self->print_feature($feature); + } + $self->{'achieved_something'} = 1; +} + +=head2 print_main_header + + Arg [1] : Arrayref of slices going into the file. + Description: Printing the header text for this report + Requires a slice list in order to report how many will be printed + Returntype : None +=cut + +sub print_main_header { + my $self = shift; + my $arrayref_of_slices = shift; + my $fh = $self->{'filehandle'}; + + my $regions = scalar @{$arrayref_of_slices}; + print $fh "Report for $regions region"; + if ($regions > 1) { print $fh "s";} + print $fh "\n\n"; +} + + +=head2 print_section_header + + Arg [1] : Bio::EnsEMBL::Slice + Description: Prints a summary of the slice + Intended to be used prior to print_feature_list() + Returntype : None + +=cut + +sub print_section_header { + my $self = shift; + my $slice = shift; + my $fh = $self->{'filehandle'}; + + print $fh " Region: ",$slice->seq_region_name," ",$slice->start,"-",$slice->end," bp\n\n"; + +} + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/IO/Serializer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/Serializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +Serializer - An abstract serializer for turning EnsEMBL data into other formats + +=head1 AUTHOR + +Kieron Taylor, 2011 - ktaylor@ebi.ac.uk + +=head1 SYNOPSIS + +my $serializer = new Serializer( $filehandle ); +$serializer->print_feature_list( \@list_of_features ); + +=head1 DESCRIPTION + +Subclass this class to create a format-specific serializer. +Be sure to implement print_feature at the bare minimum + +=cut + +package Bio::EnsEMBL::Utils::IO::Serializer; +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use Bio::EnsEMBL::Utils::SeqDumper; + + +=head2 new + + Constructor + Arg [1] : Optional File handle + Returntype : Bio::EnsEMBL::Utils::IO::Serializer + +=cut + +sub new { + my $class = shift; + my $self = { + 'filehandle' => shift, + 'achieved_something' => 0, + }; + bless $self, $class; + if (!defined ($self->{'filehandle'})) { + # no file handle, let the handle point to a copy of STDOUT instead + open $self->{'filehandle'}, ">&STDOUT"; + $self->{'stdout'} = 1; + } + return $self; +} + +=head2 DESTROY + + Destructor + Description: Restores default state of the STDOUT filehandle as it is a copy + and may not flush correctly. +=cut + +sub DESTROY { + my $self = shift; + if ($self->{'stdout'}) { + close $self->{'filehandle'}; + } +} + +=head2 print_metadata + + Arg [1] : String + Description: Pipes a custom string into the filehandle that the serializer is using + +=cut + +sub print_metadata { + my $self = shift; + my $text = shift; + my $fh = $self->{'filehandle'}; + print $fh "\n".$text."\n"; +} + +=head2 print_main_header + + Arg [1] : Data for header, depends on serializer + Description: Printing the header text or metadata required for this file format, + Re-implement in the serializer. + Returntype : None +=cut + +sub print_main_header { + my $self = shift; + warning("No writer for headers in this format. Nothing done" ); +} + +=head2 printed_something + Description: Check if serializer has printed any useful data. Not accurate with FASTA + due to non-reporting dumper. + Returntype : Boolean +=cut + +sub printed_something { + my $self = shift; + if ($self->{'achieved_something'}) { return 1;} + else {return 0;} +} + +=head2 formatted_write + + Arg [1] : Line format, see Perldoc of formline() + Arg [2] : Array of arguments to suit the line format in Arg [1] + Description: Writes data to the filehandle and rigidly formats it. + Refer to Perldoc on formline() to specify valid formats. + Useful for fixed-width file formats. + Suicides in the event of file system issues. + Example : my $FORMAT = '^<<<<<<<<<<<<<<<<<<<|<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n'; + $serializer->formatted_write($FORMAT,@text_fields); + Returntype : None +=cut + +sub formatted_write { + my ($self, $FORMAT, @values) = @_; + my $fh = $self->{'filehandle'}; + + #while the last value still contains something + while(defined($values[-1]) and $values[-1] ne '') { + formline($FORMAT, @values); + print $fh $^A or die "Failed write to filehandle"; + $^A = ''; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Iterator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Iterator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,428 @@ +package Bio::EnsEMBL::Utils::Iterator; + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + + Bio::EnsEMBL::Utils::Iterator + +=head1 SYNOPSIS + + my $variation_iterator = + $variation_adaptor->fetch_Iterator_by_VariationSet($1kg_set); + + while ( my $variation = $variation_iterator->next ) { + # operate on variation object + print $variation->name, "\n"; + } + +=head1 DESCRIPTION + + Some adaptor methods may return more objects than can fit in memory at once, in these cases + you can fetch an iterator object instead of the usual array reference. The iterator object + allows you to iterate over the set of objects (using the next() method) without loading the + entire set into memory at once. You can tell if an iterator is exhausted with the has_next() + method. The peek() method allows you to fetch the next object from the iterator without + advancing the iterator - this is useful if you want to check some property of en element in + the set while leaving the iterator unchanged. + + You can filter and transform an iterator in an analogous way to using map and grep on arrays + using the provided map() and grep() methods. These methods return another iterator, and only + perform the filtering and transformation on each element as it is requested, so again these + can be used without loading the entire set into memory. + + Iterators can be combined together with the append() method which merges together the + iterator it is called on with the list of iterators passed in as arguments. This is + somewhat analogous to concatenating arrays with the push function. append() returns a new + iterator which iterates over each component iterator until it is exhausted before moving + on to the next iterator, in the order in which they are supplied to the method. + + An iterator can be converted to an array (reference) containing all the elements in the + set with the to_arrayref() method, but note that this array may consume a lot of memory if + the set the iterator is iterating over is large and it is recommended that you do not call + this method unless there is no way of working with each element at a time. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw); + +=head2 new + + Argument : either a coderef representing the iterator, in which case this + anonymous subroutine is assumed to return the next object in the + set when called and to return undef when the set is exhausted, + or an arrayref, in which case we return an iterator over this + array. If the argument is not defined then we return an 'empty' + iterator that immediately returns undef + + Example : + + my @dbIDs = fetch_relevant_dbIDs(); + + my $iterator = Bio::EnsEMBL::Utils::Iterator->new( + sub { return $self->fetch_by_dbID(shift @dbIDs) } + ); + + NB: this is a very simple example showing how to call the constructor + that would be rather inefficient in practice, real examples should + probably be smarter about batching up queries to minimise trips to + the database. See examples in the Variation API. + + Description: Constructor, creates a new iterator object + Returntype : Bio::EnsEMBL::Utils::Iterator instance + Exceptions : thrown if the supplied argument is not the expected + Caller : general + Status : Experimental + +=cut + +sub new { + my $class = shift; + + my $arg = shift; + + my $coderef; + + if (not defined $arg) { + # if the user doesn't supply an argument, we create a + # simple 'empty' iterator that immediately returns undef + + $coderef = sub { return undef }; + } + elsif (ref $arg eq 'ARRAY') { + # if the user supplies an arrayref as an argument, we + # create an iterator over this array + + $coderef = sub { return shift @$arg }; + } + elsif (ref $arg eq 'CODE'){ + $coderef = $arg; + } + else { + throw("The supplied argument does not look like an arrayref or a coderef ".(ref $arg)) + } + + my $self = {sub => $coderef}; + + return bless $self, $class; +} + + +=head2 next + + Example : $obj = $iterator->next + Description: returns the next object from this iterator, or undef if the iterator is exhausted + Returntype : Object type will depend on what this iterator is iterating over + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub next { + my $self = shift; + + $self->{next} = $self->{sub}->() unless defined $self->{next}; + + return delete $self->{next}; +} + +=head2 has_next + + Example : if ($iterator->has_next) { my $obj = $iterator->next } + Description: Boolean - true if this iterator has more elements to fetch, false when + it is exhausted + Returntype : boolean + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub has_next { + my $self = shift; + + $self->{next} = $self->{sub}->() unless defined $self->{next}; + + return defined $self->{next}; +} + +=head2 peek + + Example : $obj = $iterator->peek + Description: returns the next object from this iterator, or undef if the iterator is exhausted, + much like next but does not advance the iterator (so the same object will be + returned on the following call to next or peek) + Returntype : Object type will depend on what this iterator is iterating over + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub peek { + my $self = shift; + + $self->{next} = $self->{sub}->() unless defined $self->{next}; + + return $self->{next}; +} + +=head2 grep + + Example : my $filtered_iterator = $original_iterator->grep(sub {$_->name =~ /^rs/}); + Description: filter this iterator, returning another iterator + Argument : a coderef which returns true if the element should be included in the + filtered set, or false if the element should be filtered out. $_ will be + set locally to each element in turn so you should be able to write a block + in a similar way as for the perl grep function (although it will need to be + preceded with the sub keyword). Otherwise you can pass in a reference to a + subroutine which expects a single argument with the same behaviour. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : thrown if the argument is not a coderef + Caller : general + Status : Experimental + +=cut + +sub grep { + my ($self, $coderef) = @_; + + throw('Argument should be a coderef') unless ref $coderef eq 'CODE'; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + while ($self->has_next) { + local $_ = $self->next; + return $_ if $coderef->($_); + } + return undef; + }); +} + +=head2 map + + Example : my $transformed_iterator = $original_iterator->map(sub {$_->name}); + Description: transform the elements of this iterator, returning another iterator + Argument : a coderef which returns the desired transformation of each element. + $_ will be set locally set to each original element in turn so you + should be able to write a block in a similar way as for the perl map + function (although it will need to be preceded with the sub keyword). + Otherwise you can pass in a reference to a subroutine which expects a + single argument with the same behaviour. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : thrown if the argument is not a coderef + Caller : general + Status : Experimental + +=cut + + +sub map { + my ($self, $coderef) = @_; + + throw('Argument should be a coderef') unless ref $coderef eq 'CODE'; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + local $_ = $self->next; + return defined $_ ? $coderef->($_) : undef; + }); +} + + +=head2 each + + Example : $iterator->each(sub { print $_->name, "\n"; }); + Description: Performs a full iteration of the current iterator instance. + Argument : a coderef which returns the desired transformation of each element. + $_ will be set locally set to each element. + Returntype : None + Exceptions : thrown if the argument is not a coderef + Caller : general + Status : Experimental + +=cut + + +sub each { + my ($self, $coderef) = @_; + throw('Argument should be a coderef') unless ref $coderef eq 'CODE'; + while($self->has_next()) { + local $_ = $self->next(); + $coderef->($_); + } + return; +} + + +=head2 to_arrayref + + Example : my $arrayref = $iterator->to_arrayref; + Description: return a reference to an array containing all elements from the + iterator. This is created by simply iterating over the iterator + until it is exhausted and adding each element in turn to an array. + Note that this may consume a lot of memory for iterators over + large collections + Returntype : arrayref + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub to_arrayref { + my ($self) = @_; + + my @array; + + while ($self->has_next) { + push @array, $self->next; + } + + return \@array; +} + +=head2 append + + Example : my $combined_iterator = $iterator1->append($iterator2, $iterator3); + Description: return a new iterator that combines this iterator with the others + passed as arguments, this new iterator will iterate over each + component iterator (in the order supplied here) until it is + exhausted and then move on to the next iterator until all are + exhausted + Argument : an array of Bio::EnsEMBL::Utils::Iterator objects + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : thrown if any of the arguments are not iterators + Caller : general + Status : Experimental + +=cut + +sub append { + my ($self, @queue) = @_; + + for my $iterator (@queue) { + throw("Argument to append doesn't look like an iterator") + unless UNIVERSAL::can($iterator, 'has_next') && UNIVERSAL::can($iterator, 'next'); + } + + # push ourselves onto the front of the queue + unshift @queue, $self; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + # shift off any exhausted iterators + while (@queue && not $queue[0]->has_next) { + shift @queue; + } + + # and return the next object from the iterator at the + # head of the queue, or undef if the queue is empty + return @queue ? $queue[0]->next : undef; + }); +} + +=head2 take + + Example : my $limited_iterator = $iterator->take(5); + Description: return a new iterator that only iterates over the + first n elements of this iterator + Argument : a positive integer + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : thrown if the argument is negative + Caller : general + Status : Experimental + +=cut + +sub take { + my ($self, $n) = @_; + + throw("Argument cannot be negative") if $n < 0; + + my $cnt = 0; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + return $cnt++ >= $n ? undef : $self->next; + }); +} + +=head2 skip + + Example : my $limited_iterator = $iterator->skip(5); + Description: skip over the first n elements of this iterator (and then return + the same iterator for your method chaining convenience) + Argument : a positive integer + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : thrown if the argument is negative + Caller : general + Status : Experimental + +=cut + +sub skip { + my ($self, $n) = @_; + + throw("Argument cannot be negative") if $n < 0; + + $self->next for (0 .. $n-1); + + return $self; +} + +=head2 reduce + + Example : my $tot_length = $iterator->reduce(sub { $_[0] + $_[1]->length }, 0); + Description: reduce this iterator with the provided coderef, using the (optional) + second argument as the initial value of the accumulator + Argument[1]: a coderef that expects 2 arguments, the current accumulator + value and the next element in the set, and returns the next + accumulator value. Unless the optional second argument is + provided the first accumulator value passed in will be the + first element in the set + Argument[2]: (optional) an initial value to use for the accumulator instead + of the first value of the set + Returntype : returntype of the coderef + Exceptions : thrown if the argument is not a coderef + Caller : general + Status : Experimental + +=cut + +sub reduce { + my ($self, $coderef, $init_val) = @_; + + throw('Argument should be a coderef') unless ref $coderef eq 'CODE'; + + my $result = defined $init_val ? $init_val : $self->next; + + while ($self->has_next) { + $result = $coderef->($result, $self->next); + } + + return $result; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Logger.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Logger.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,739 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::ConversionSupport - Utility module for Vega release and +schema conversion scripts + +=head1 SYNOPSIS + + my $serverroot = '/path/to/ensembl'; + my $suport = new Bio::EnsEMBL::Utils::ConversionSupport($serverroot); + + # parse common options + $support->parse_common_options; + + # parse extra options for your script + $support->parse_extra_options( 'string_opt=s', 'numeric_opt=n' ); + + # ask user if he wants to run script with these parameters + $support->confirm_params; + + # see individual method documentation for more stuff + +=head1 DESCRIPTION + +This module is a collection of common methods and provides helper +functions for the Vega release and schema conversion scripts. Amongst +others, it reads options from a config file, parses commandline options +and does logging. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Logger; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin $Script); +use POSIX qw(strftime); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes); + +my %level_defs = ( + 'error' => 1, + 'warn' => 2, + 'warning' => 2, + 'info' => 3, + 'debug' => 4, + 'verbose' => 4, +); + +my @reverse_level_defs = (undef, qw(error warning info debug)); + +=head2 new + + Arg[1] : String $serverroot - root directory of your ensembl sandbox + Example : my $support = new Bio::EnsEMBL::Utils::ConversionSupport( + '/path/to/ensembl'); + Description : constructor + Return type : Bio::EnsEMBL::Utils::ConversionSupport object + Exceptions : thrown on invalid loglevel + Caller : general + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logfile, $logauto, $logautobase, $logautoid, $logpath, $logappend, + $loglevel, $is_component) = rearrange( + ['LOGFILE', 'LOGAUTO', 'LOGAUTOBASE', 'LOGAUTOID', 'LOGPATH', 'LOGAPPEND', + 'LOGLEVEL', 'IS_COMPONENT'], @_); + + my $self = { '_warnings' => 0, }; + bless ($self, $class); + + # initialise + $self->logfile($logfile); + $self->logpath($logpath); + $self->logappend($logappend); + $self->is_component($is_component); + + # automatic logfile creation + $self->logauto($logauto); + $logautoid ||= strftime("%Y%m%d-%H%M%S", localtime); + $self->log_auto_id($logautoid); + $self->create_auto_logfile($logautobase); + + $loglevel ||= 'info'; + if ($loglevel =~ /^\d+$/ and $loglevel > 0 and $loglevel < 5) { + $self->{'loglevel'} = $loglevel; + } elsif ($level_defs{lc($loglevel)}) { + $self->{'loglevel'} = $level_defs{lc($loglevel)}; + } else { + throw('Unknown loglevel: $loglevel.'); + } + + return $self; +} + + +=head2 log_generic + + Arg[1] : String $txt - the text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log('Log foo.\n', 1); + Description : Logs a message to the filehandle initialised by calling + $self->log_filehandle(). You can supply an indentation level + to get nice hierarchical log messages. + Return type : true on success + Exceptions : thrown when no filehandle can be obtained + Caller : general + +=cut + +sub log_generic { + my ($self, $txt, $indent, $stamped) = @_; + + $indent ||= 0; + my $fh = $self->log_filehandle; + + # append timestamp and memory usage to log text if requested + if ($stamped) { + $txt =~ s/^(\n*)(.*)(\n*)$/$2/; + $txt = sprintf("%-60s%20s", $txt, $self->time_and_mem); + $txt = $1.$txt.$3; + } + + # strip off leading linebreaks so that indenting doesn't break + $txt =~ s/^(\n*)//; + + # indent + $txt = $1." "x$indent . $txt; + + print $fh "$txt"; + + return(1); +} + + +=head2 error + + Arg[1] : String $txt - the error text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log_error('Log foo.\n', 1); + Description : Logs a message via $self->log and exits the script. + Return type : none + Exceptions : none + Caller : general + +=cut + +sub error { + my ($self, $txt, $indent, $stamped) = @_; + + return(0) unless ($self->{'loglevel'} >= 1); + + $txt = "ERROR: ".$txt; + $self->log_generic($txt, $indent, $stamped); + + $self->log_generic("\nExiting prematurely.\n\n"); + $self->log_generic("Runtime: ".$self->runtime." ".$self->date_and_mem."\n\n"); + + exit(1); +} + + +=head2 warning + + Arg[1] : String $txt - the warning text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log_warning('Log foo.\n', 1); + Description : Logs a message via $self->log and increases the warning counter. + Return type : true on success + Exceptions : none + Caller : general + +=cut + +sub warning { + my ($self, $txt, $indent, $stamped) = @_; + + return(0) unless ($self->{'loglevel'} >= 2); + + $txt = "WARNING: " . $txt; + $self->log_generic($txt, $indent, $stamped); + + $self->{'_warnings'}++; + + return(1); +} + + +sub info { + my ($self, $txt, $indent, $stamped) = @_; + + return(0) unless ($self->{'loglevel'} >= 3); + + $self->log_generic($txt, $indent, $stamped); + return(1); +} + + +=head2 debug + + Arg[1] : String $txt - the warning text to log + Arg[2] : Int $indent - indentation level for log message + Example : my $log = $support->log_filehandle; + $support->log_verbose('Log this verbose message.\n', 1); + Description : Logs a message via $self->log if --verbose option was used + Return type : TRUE on success, FALSE if not verbose + Exceptions : none + Caller : general + +=cut + +sub debug { + my ($self, $txt, $indent, $stamped) = @_; + + return(0) unless ($self->{'loglevel'} >= 4); + + $self->log_generic($txt, $indent, $stamped); + return(1); +} + + +sub log_progress { + my $self = shift; + my $name = shift; + my $curr = shift; + my $indent = shift; + + throw("You must provide a name and the current value for your progress bar") + unless ($name and $curr); + + # return if we haven't reached the next increment + return if ($curr < int($self->{'_progress'}->{$name}->{'next'})); + + my $index = $self->{'_progress'}->{$name}->{'index'}; + my $num_bins = $self->{'_progress'}->{$name}->{'numbins'}; + my $percent = $index/$num_bins*100; + + my $log_str; + $log_str .= ' 'x$indent if ($index == 0); + $log_str .= "\b"x4; + $log_str .= sprintf("%3s%%", $percent); + $log_str .= "\n" if ($curr == $self->{'_progress'}->{$name}->{'max_val'}); + + $self->info($log_str); + + # increment counters + $self->{'_progress'}->{$name}->{'index'}++; + $self->{'_progress'}->{$name}->{'next'} += $self->{'_progress'}->{$name}->{'binsize'}; +} + + +sub log_progressbar { + my $self = shift; + my $name = shift; + my $curr = shift; + my $indent = shift; + + throw("You must provide a name and the current value for your progress bar") + unless ($name and $curr); + + # return if we haven't reached the next increment + return if ($curr < int($self->{'_progress'}->{$name}->{'next'})); + + my $index = $self->{'_progress'}->{$name}->{'index'}; + my $num_bins = $self->{'_progress'}->{$name}->{'numbins'}; + my $percent = $index/$num_bins*100; + + my $log_str = "\r".(' 'x$indent)."[".('='x$index).(' 'x($num_bins-$index))."] ${percent}\%"; + $log_str .= "\n" if ($curr == $self->{'_progress'}->{$name}->{'max_val'}); + + $self->info($log_str); + + # increment counters + $self->{'_progress'}->{$name}->{'index'}++; + $self->{'_progress'}->{$name}->{'next'} += $self->{'_progress'}->{$name}->{'binsize'}; +} + + +sub init_progress { + my $self = shift; + my $max = shift; + my $num_bins = shift || 50; + + throw("You must provide the maximum value for your progress bar") + unless (defined($max)); + + # auto-generate a unique name for your progressbar + my $name = time . '_' . int(rand(1000)); + + # calculate bin size; we will use 50 bins (2% increments) + my $binsize = $max/$num_bins; + + $self->{'_progress'}->{$name}->{'max_val'} = $max; + $self->{'_progress'}->{$name}->{'binsize'} = $binsize; + $self->{'_progress'}->{$name}->{'numbins'} = $num_bins; + $self->{'_progress'}->{$name}->{'next'} = 0; + $self->{'_progress'}->{$name}->{'index'} = 0; + + return $name; +} + + +=head2 log_filehandle + + Arg[1] : (optional) String $mode - file access mode + Example : my $log = $support->log_filehandle; + # print to the filehandle + print $log 'Lets start logging...\n'; + # log via the wrapper $self->log() + $support->log('Another log message.\n'); + Description : Returns a filehandle for logging (STDERR by default, logfile if + set from config or commandline). You can use the filehandle + directly to print to, or use the smart wrapper $self->log(). + Logging mode (truncate or append) can be set by passing the + mode as an argument to log_filehandle(), or with the + --logappend commandline option (default: truncate) + Return type : Filehandle - the filehandle to log to + Exceptions : thrown if logfile can't be opened + Caller : general + +=cut + +sub log_filehandle { + my ($self, $mode) = @_; + + unless ($self->{'_log_filehandle'}) { + $mode ||= '>'; + $mode = '>>' if ($self->logappend); + + my $fh = \*STDERR; + + if (my $logfile = $self->logfile) { + if (my $logpath = $self->logpath) { + unless (-e $logpath) { + system("mkdir -p $logpath") == 0 or + throw("Can't create log dir $logpath: $!\n"); + } + + $logfile = "$logpath/".$self->logfile; + } + + open($fh, "$mode", $logfile) or + throw("Unable to open $logfile for writing: $!"); + } + + $self->{'_log_filehandle'} = $fh; + } + + return $self->{'_log_filehandle'}; +} + + +=head2 extract_log_identifier + + Arg[1] : + Example : + Description : + Return type : + Exceptions : + Caller : + Status : + +=cut + +sub extract_log_identifier { + my $self = shift; + + if (my $logfile = $self->logfile) { + $logfile =~ /.+\.([^\.]+)\.log/; + return $1; + } else { + return undef; + } +} + + +=head2 init_log + + Example : $support->init_log; + Description : Opens a filehandle to the logfile and prints some header + information to this file. This includes script name, date, user + running the script and parameters the script will be running + with. + Return type : Filehandle - the log filehandle + Exceptions : none + Caller : general + +=cut + +sub init_log { + my $self = shift; + my $params = shift; + + # get a log filehandle + my $log = $self->log_filehandle; + + # remember start time + $self->{'_start_time'} = time; + + # don't log parameters if this script is run by another one + if ($self->logauto or ! $self->is_component) { + # print script name, date, user who is running it + my $hostname = `hostname`; + chomp $hostname; + my $script = "$hostname:$Bin/$Script"; + my $user = `whoami`; + chomp $user; + $self->info("Script: $script\nDate: ".$self->date."\nUser: $user\n"); + + # print parameters the script is running with + if ($params) { + $self->info("Parameters:\n\n"); + $self->info($params); + } + } + + return $log; +} + + +=head2 finish_log + + Example : $support->finish_log; + Description : Writes footer information to a logfile. This includes the + number of logged warnings, timestamp and memory footprint. + Return type : TRUE on success + Exceptions : none + Caller : general + +=cut + +sub finish_log { + my $self = shift; + + $self->info("\nAll done for $Script.\n"); + $self->info($self->warning_count." warnings. "); + $self->info("Runtime: ".$self->runtime." ".$self->date_and_mem."\n\n"); + + return(1); +} + + +sub runtime { + my $self = shift; + + my $runtime = "n/a"; + + if ($self->{'_start_time'}) { + my $diff = time - $self->{'_start_time'}; + my $sec = $diff % 60; + $diff = ($diff - $sec) / 60; + my $min = $diff % 60; + my $hours = ($diff - $min) / 60; + + $runtime = "${hours}h ${min}min ${sec}sec"; + } + + return $runtime; +} + + +=head2 date_and_mem + + Example : print LOG "Time, memory usage: ".$support->date_and_mem."\n"; + Description : Prints a timestamp and the memory usage of your script. + Return type : String - timestamp and memory usage + Exceptions : none + Caller : general + +=cut + +sub date_and_mem { + my $date = strftime "%Y-%m-%d %T", localtime; + my $mem = `ps -p $$ -o vsz |tail -1`; + chomp $mem; + $mem = parse_bytes($mem*1000); + return "[$date, mem $mem]"; +} + + +sub time_and_mem { + my $date = strftime "%T", localtime; + my $mem = `ps -p $$ -o vsz |tail -1`; + chomp $mem; + $mem = parse_bytes($mem*1000); + $mem =~ s/ //; + return "[$date|$mem]"; +} + + +=head2 date + + Example : print "Date: " . $support->date . "\n"; + Description : Prints a nicely formatted timestamp (YYYY-DD-MM hh:mm:ss) + Return type : String - the timestamp + Exceptions : none + Caller : general + +=cut + +sub date { + return strftime "%Y-%m-%d %T", localtime; +} + + +=head2 mem + + Example : print "Memory usage: " . $support->mem . "\n"; + Description : Prints the memory used by your script. Not sure about platform + dependence of this call ... + Return type : String - memory usage + Exceptions : none + Caller : general + +=cut + +sub mem { + my $mem = `ps -p $$ -o vsz |tail -1`; + chomp $mem; + return $mem; +} + + +=head2 warning_count + + Example : print LOG "There were ".$support->warnings." warnings.\n"; + Description : Returns the number of warnings encountered while running the + script (the warning counter is increased by $self->log_warning). + Return type : Int - number of warnings + Exceptions : none + Caller : general + +=cut + +sub warning_count { + my $self = shift; + return $self->{'_warnings'}; +} + + +=head2 logfile + + Arg[1] : + Example : + Description : Getter and setter for the logfile + Return type : + Exceptions : + Caller : + Status : + +=cut + +sub logfile { + my $self = shift; + $self->{'_logfile'} = shift if (@_); + return $self->{'_logfile'}; +} + + +=head2 log_auto_id + + Arg[1] : + Example : + Description : + Return type : + Exceptions : + Caller : + Status : + +=cut + +sub log_auto_id { + my $self = shift; + $self->{'_log_auto_id'} = shift if (@_); + return $self->{'_log_auto_id'}; +} + + +sub logauto { + my $self = shift; + $self->{'_log_auto'} = shift if (@_); + return $self->{'_log_auto'}; +} + + +=head2 create_auto_logfile + + Arg[1] : + Example : + Description : + Return type : + Exceptions : + Caller : + Status : At Risk + : under development + +=cut + +sub create_auto_logfile { + my $self = shift; + my $logautobase = shift; + + # do nothing if automatic logfile generation isn't set + return unless ($self->logauto); + + # an explicit logfile name overrides LOGAUTO + return if ($self->logfile); + + # argument check + unless ($logautobase) { + throw('Need a base logfile name for auto-generating logfile.'); + } + + # create a logfile name + $self->logfile("${logautobase}_".$self->log_auto_id.".log"); +} + + +=head2 logpath + + Arg[1] : + Example : + Description : + Return type : + Exceptions : + Caller : + Status : + +=cut + +sub logpath { + my $self = shift; + $self->{'_logpath'} = shift if (@_); + return $self->{'_logpath'}; +} + + +=head2 logappend + + Arg[1] : + Example : + Description : + Return type : + Exceptions : + Caller : + Status : + +=cut + +sub logappend { + my $self = shift; + $self->{'_logappend'} = shift if (@_); + return $self->{'_logappend'}; +} + + +=head2 is_component + + Arg[1] : + Example : + Description : + Return type : + Exceptions : + Caller : + Status : + +=cut + +sub is_component { + my $self = shift; + $self->{'_is_component'} = shift if (@_); + return $self->{'_is_component'}; +} + + +sub loglevel { + my $self = shift; + return $reverse_level_defs[$self->{'loglevel'}]; +} + + +# +# deprecated methods (left here for backwards compatibility +# +sub log_error { + return $_[0]->error(@_); +} + +sub log_warning { + return $_[0]->warning(@_); +} + +sub log { + return $_[0]->info(@_); +} + +sub log_verbose { + return $_[0]->debug(@_); +} + +sub log_stamped { + return $_[0]->log(@_, 1); +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/PolyA.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,322 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::PolyA + +=head1 SYNOPSIS + + my $seq; # a Bio::Seq object + my $polyA = Bio::EnsEMBL::Utils::PolyA->new(); + + # returns a new Bio::Seq object with the trimmed sequence + my $trimmed_seq = $polyA->clip($seq); + + # cat put Ns in the place of the polyA/polyT tail + my $masked_seq = $polyA->mask($seq); + + # can put in lower case the polyA/polyT using any flag: + my $softmasked_seq = $poly->mask( $seq, 'soft' ); + +=head1 DESCRIPTION + + It reads a Bio::Seq object, it first finds out whether it has a + polyA or a polyT and then performs one operation in the seq string: + clipping, masking or softmasking. It then returns a new Bio::Seq + object with the new sequence. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::PolyA; + +use Bio::EnsEMBL::Root; +use Bio::Seq; +use vars qw(@ISA); + +use strict; + +@ISA = qw(Bio::EnsEMBL::Root); + + +=head2 new + +=cut + +sub new{ + my ($class) = @_; + if (ref($class)){ + $class = ref($class); + } + my $self = {}; + bless($self,$class); + + return $self; +} + + +############################################################ + +sub clip{ + my ($self, $bioseq) = @_; + + # print STDERR "past a $bioseq\n"; + my $seq = $bioseq->seq; + $self->_clip(1); + $self->_mask(0); + $self->_softmask(0); + my $new_seq = $self->_find_polyA($seq); + my $cdna = Bio::Seq->new(); + if (length($new_seq) > 0){ + $cdna->seq($new_seq); + } + else { + print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n"; + print "Returning undef\n"; + return undef; + } + $cdna->display_id( $bioseq->display_id ); + $cdna->desc( $bioseq->desc ); + + return $cdna; +} + +############################################################ + +sub mask{ + my ($self, $bioseq, $flag ) = @_; + + my $seq = $bioseq->seq; + $self->_clip(0); + if ( $flag ){ + $self->_mask(0); + $self->_softmask(1); + } + else{ + $self->_mask(1); + $self->_softmask(0); + } + my $new_seq = $self->_find_polyA($seq); + my $cdna = new Bio::Seq; + $cdna->seq($new_seq); + $cdna->display_id( $bioseq->display_id ); + $cdna->desc( $bioseq->desc ); + + return $cdna; +} + +############################################################ + + + + +############################################################ + +sub _find_polyA{ + my ($self, $seq) = @_; + my $new_seq; + my $length = length($seq); + + # is it a polyA or polyT? + my $check_polyT = substr( $seq, 0, 6 ); + + my $check_polyA = substr( $seq, -6 ); + + my $t_count = $check_polyT =~ tr/Tt//; + my $a_count = $check_polyA =~ tr/Aa//; + + #### polyA #### + if ( $a_count >= 5 && $a_count > $t_count ){ + + # we calculate the number of bases we want to chop + my $length_to_mask = 0; + + # we start with 3 bases + my ($piece, $count ) = (3,0); + + # count also the number of Ns, consider the Ns as potential As + my $n_count = 0; + + # take 3 by 3 bases from the end + while( $length_to_mask < $length ){ + my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece); + $count = $chunk =~ tr/Aa//; + $n_count = $chunk =~ tr/Nn//; + if ( ($count + $n_count) >= 2*( $piece )/3 ){ + $length_to_mask += 3; + } + else{ + last; + } + } + + if ( $length_to_mask > 0 ){ + # do not mask the last base if it is not an A: + my $last_base = substr( $seq, ( $length - $length_to_mask ), 1); + my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1); + if ( !( $last_base eq 'A' || $last_base eq 'a') ){ + $length_to_mask--; + } + elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){ + $length_to_mask++; + } + my $clipped_seq = substr( $seq, 0, $length - $length_to_mask ); + my $mask; + if ( $self->_clip ){ + $mask = ""; + } + elsif( $self->_mask ){ + $mask = "N" x ($length_to_mask); + } + elsif ( $self->_softmask ){ + $mask = lc substr( $seq, ( $length - $length_to_mask ) ); + } + $new_seq = $clipped_seq . $mask; + } + else{ + $new_seq = $seq; + } + } + #### polyT #### + elsif( $t_count >=5 && $t_count > $a_count ){ + + # calculate the number of bases to chop + my $length_to_mask = -3; + + # we start with 3 bases: + my ($piece, $count) = (3,3); + + # count also the number of Ns, consider the Ns as potential As + my $n_count = 0; + + # take 3 by 3 bases from the beginning + while ( $length_to_mask < $length ){ + my $chunk = substr( $seq, $length_to_mask + 3, $piece ); + #print STDERR "length to mask: $length_to_mask\n"; + #print "chunk: $chunk\n"; + $count = $chunk =~ tr/Tt//; + $n_count = $chunk =~ tr/Nn//; + if ( ($count+$n_count) >= 2*( $piece )/3 ){ + $length_to_mask +=3; + } + else{ + last; + + } + } + if ( $length_to_mask >= 0 ){ + # do not chop the last base if it is not a A: + #print STDERR "clipping sequence $seq\n"; + my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 ); + my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 ); + if ( !( $last_base eq 'T' || $last_base eq 't' ) ){ + $length_to_mask--; + } + elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){ + $length_to_mask++; + } + my $clipped_seq = substr( $seq, $length_to_mask + 3); + my $mask; + if ( $self->_clip ){ + $mask = ""; + } + elsif( $self->_mask ){ + $mask = "N" x ($length_to_mask+3); + } + elsif ($self->_softmask){ + $mask = lc substr( $seq, 0, ($length_to_mask + 3) ); + } + $new_seq = $mask.$clipped_seq; + } + else{ + $new_seq = $seq; + } + } + else{ + # we cannot be sure of what it is + # do not clip + $new_seq = $seq; + } + + return $new_seq; +} + +############################################################ + +sub _mask{ + my ($self,$mask) = @_; + if (defined($mask)){ + $self->{_mask} = $mask; + } + return $self->{_mask}; +} + +############################################################ + +sub _clip{ + my ($self,$clip) = @_; + if (defined($clip)){ + $self->{_clip} = $clip; + } + return $self->{_clip}; +} + +############################################################ + +sub _softmask{ + my ($self,$softmask) = @_; + if (defined($softmask)){ + $self->{_softmask} = $softmask; + } + return $self->{_softmask}; +} + +############################################################ + + +sub has_polyA_track{ + my ($self, $seq) = @_; + my $new_seq; + my $length = length($seq); + + # is it a polyA or polyT? + my $check_polyT = substr( $seq, 0, 10 ); + + my $check_polyA = substr( $seq, -10 ); + + print STDERR "polyA: $check_polyA\n"; + + my $t_count = $check_polyT =~ tr/Tt//; + my $a_count = $check_polyA =~ tr/Aa//; + + ## testing with this short cut + if ( $a_count >=7 || $t_count >=7 ){ + return 1; + } + else{ + return 0; + } +} + + +################ +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Proxy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Proxy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,192 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Proxy + +=head1 SYNOPSIS + + #Simple arounds logging proxy + package myproxy; + use base qw/Bio::EnsEMBL::Utils::Proxy/; + sub __resolver { + my ($invoker, $package, $method) = @_; + return sub { + my ($self, @args); + warn "Entering into ${package}::${method}"; + my @capture = $self->$method(@args); + warn "Exiting from ${package}::${method}"; + return @capture; + }; + } + + 1; + +=head1 DESCRIPTION + +This class offers Proxy objects similar to those found in Java's +C object. This class should be overriden and +then implement C<__resolver()>. The C<__resolver()> method returns a +subroutine to the intended action which the proxy object installs into +the calling class' scope. + +All methods internal to the proxy are prefixed with a double underscore +to avoid corruption/intrusion into the normal public and private scope of +most classes. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::Proxy; + +use Bio::EnsEMBL::Utils::Exception qw/throw/; + +use vars '$AUTOLOAD'; + +=head2 new + + Arg [1] : The object to proxy + Example : my $newobj = Bio::EnsEMBL::Utils::Proxy->new($myobj); + Description : Provides a new instance of a proxy + Returntype : Bio::EnsEMBL::Utils::Proxy the new instance + Exceptions : None + Caller : public + Status : - + +=cut + +sub new { + my ($class, $proxy) = @_; + my $self = bless({}, ref($class)||$class); + $self->{__proxy} = $proxy; + return $self; +} + +=head2 __proxy + + Example : - + Description : The proxy accessor + Returntype : Any the proxied object + Exceptions : None + Caller : - + Status : - + +=cut + +sub __proxy { + my ($self) = @_; + return $_[0]->{__proxy}; +} + +=head2 isa + + Args : Object type to test + Example : $obj->isa('Bio::EnsEMBL::Utils::Proxy'); + Description : Overriden to provide C support for proxies. Will return + true if this object is assignable to the given type or the + proxied object is + Returntype : Boolean; performs same as a normal can + Exceptions : None + Caller : caller + Status : status + +=cut + + +sub isa { + my ($self, $class) = @_; + return 1 if $self->SUPER::isa($class); + return 1 if $self->__proxy()->isa($class); + return 0; +} + +=head2 can + + Args : Method name to test + Example : $obj->can('__proxy'); + Description : Overriden to provide C support for proxies. Will return + true if this object implements the given method or the + proxied object can + Returntype : Code; performs same as a normal can + Exceptions : None + Caller : caller + Status : status + +=cut + +sub can { + my ($self, $method) = @_; + return 1 if $self->SUPER::can($method); + return 1 if $self->__proxy()->can($method); + return 0; +} + +=head2 DESTROY + + Example : - + Description : Provided because of AutoLoad + Returntype : None + Exceptions : None + Caller : - + Status : - + +=cut + + + +sub DESTROY { + # left blank +} + +=head2 AUTOLOAD + + Example : - + Description : Performs calls to C<__resolver()> and installs the subroutine + into the current package scope. + Returntype : None + Exceptions : Thrown if C<__resolver()> could not return a subroutine + Caller : - + Status : - + +=cut + +sub AUTOLOAD { + my ($self, @args) = @_; + my ($package_name, $method_name) = $AUTOLOAD =~ m/ (.*) :: (.*) /xms; + my $sub = $self->__resolver($package_name, $method_name, @args); + if(! $sub) { + my $type = ref $self ? 'object' : 'class'; + throw qq{Can't locate $type method "$method_name" via package "$package_name". No subroutine was generated}; + } + *$AUTOLOAD = $sub; + goto &$sub; +} + +sub __resolver { + my ($self, $package_name, $method, @args) = @_; + #override to provide the subroutine to install + throw "Unimplemented __resolver() in $package_name. Please implement"; +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Scalar.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Scalar.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,458 @@ +package Bio::EnsEMBL::Utils::Scalar; + +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=pod + +=head1 NAME + +Bio::EnsEMBL::Utils::Scalar + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Scalar qw(check_ref assert_ref wrap_array check_ref_can assert_ref_can assert_numeric assert_integer scope_guard); + + check_ref([], 'ARRAY'); # Will return true + check_ref({}, 'ARRAY'); # Will return false + check_ref($dba, 'Bio::EnsEMBL::DBSQL::DBAdaptor'); #Returns true if $dba is a DBAdaptor + + assert_ref([], 'ARRAY'); #Returns true + assert_ref({}, 'ARRAY'); #throws an exception + assert_ref($dba, 'Bio::EnsEMBL::Gene'); #throws an exception if $dba is not a Gene + + wrap_array([]); #Returns the same reference + wrap_array($a); #Returns [$a] if $a was not an array + wrap_array(undef); #Returns [] since incoming was undefined + wrap_array(); #Returns [] since incoming was empty (therefore undefined) + + check_ref_can([], 'dbID'); #returns false as ArrayRef is not blessed + check_ref_can($gene, 'dbID'); #returns true as Gene should implement dbID() + check_ref_can(undef); #Throws an exception as we gave no method to test + + assert_ref_can([], 'dbID'); #throws an exception since ArrayRef is not blessed + assert_ref_can($gene, 'dbID'); #returns true if gene implements dbID() + assert_ref_can(undef); #Throws an exception as we gave no method to test + + asssert_integer(1, 'dbID'); #Passes + asssert_integer(1.1, 'dbID'); #Fails + asssert_numeric(1E-11, 'dbID'); #Passes + asssert_numeric({}, 'dbID'); #Fails + + #Scope guards + my $v = 'wibble'; + { + #Build a guard to reset $v to wibble + my $guard = scope_guard(sub { $v = 'wibble'}); + $v = 'wobble'; + warn $v; # prints wobble + } + # $guard is out of scope; sub is triggered and $v is reset + warn $v; # prints wibble + + #Tags are also available for exporting + use Bio::EnsEMBL::Utils::Scalar qw(:assert); # brings in all assert methods + use Bio::EnsEMBL::Utils::Scalar qw(:check); #brings in all check methods + use Bio::EnsEMBL::Utils::Scalar qw(:array); #brings in wrap_array + use Bio::EnsEMBL::Utils::Scalar qw(:all); #import all methods + +=head1 DESCRIPTION + +A collection of subroutines aimed to helping Scalar based operations + +=head1 METHODS + +See subroutines. + +=head1 MAINTAINER + +$Author: ady $ + +=head1 VERSION + +$Revision: 1.12.14.1 $ + +=cut + +use strict; +use warnings; + +use base qw(Exporter); + +our %EXPORT_TAGS; +our @EXPORT_OK; + +@EXPORT_OK = qw( + check_ref check_ref_can + assert_ref assert_ref_can assert_numeric assert_integer assert_boolean assert_strand assert_file_handle + wrap_array + scope_guard +); +%EXPORT_TAGS = ( + assert => [qw(assert_ref assert_ref_can assert_integer assert_numeric assert_boolean assert_strand assert_file_handle)], + check => [qw(check_ref check_ref_can)], + array => [qw/wrap_array/], + all => [@EXPORT_OK] +); + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Scalar::Util qw(blessed looks_like_number openhandle); + +=head2 check_ref() + + Arg [1] : The reference to check + Arg [2] : The type we expect + Description : A subroutine which checks to see if the given object/ref is + what you expect. If you give it a blessed reference then it + will perform an isa() call on the object after the defined + tests. If it is a plain reference then it will use ref(). + + An undefined value will return a false. + Returntype : Boolean indicating if the reference was the type we + expect + Example : my $ok = check_ref([], 'ARRAY'); + Exceptions : If the expected type was not set + Status : Stable + +=cut + +sub check_ref { + my ($ref, $expected) = @_; + throw('No expected type given') if ! defined $expected; + if(defined $ref) { + if(blessed($ref)) { + return 1 if $ref->isa($expected); + } + else { + my $ref_ref_type = ref($ref); + return 1 if defined $ref_ref_type && $ref_ref_type eq $expected; + } + } + return 0; +} + +=head2 assert_ref() + + Arg [1] : The reference to check + Arg [2] : The type we expect + Arg [3] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given object/ref is + what you expect. This behaves in an identical manner as + C does except this will raise exceptions when + the values do not match rather than returning a boolean + indicating the situation. + + Undefs cause exception circumstances. + Returntype : Boolean; true if we managed to get to the return + Example : assert_ref([], 'ARRAY'); + Exceptions : If the expected type was not set and if the given reference + was not assignable to the expected value + Status : Stable + +=cut + +sub assert_ref { + my ($ref, $expected, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + throw('No expected type given') if ! defined $expected; + my $class = ref($ref); + throw("The given reference for attribute $attribute_name was undef") unless defined $ref; + throw("Asking for the type of the attribute $attribute_name produced no type; check it is a reference") unless $class; + if(blessed($ref)) { + throw("${attribute_name}'s type '${class}' is not an ISA of '${expected}'") if ! $ref->isa($expected); + } + else { + throw("$attribute_name was expected to be '${expected}' but was '${class}'") if $expected ne $class; + } + return 1; +} + +=head2 wrap_array() + + Arg : The reference we want to wrap in an array + Description : Takes in a reference and returns either the reference if it + was already an array, the reference wrapped in an array or + an empty array (if the given value was undefined). + Returntype : Array Reference + Example : my $a = wrap_array($input); + Exceptions : None + Status : Stable + +=cut + +sub wrap_array { + my ($incoming_reference) = @_; + if(defined $incoming_reference) { + if(check_ref($incoming_reference, 'ARRAY')) { + return $incoming_reference; + } + else { + return [$incoming_reference]; + } + } + return []; +} + +=head2 check_ref_can + + Arg [1] : The reference to check + Arg [2] : The method we expect to run + Description : A subroutine which checks to see if the given object/ref is + implements the given method. This is very similar to the + functionality given by C but works + by executing C on the object meaning we consult the + object's potentially overriden version rather than Perl's + default mechanism. + Returntype : CodeRef + Example : check_ref_can($gene, 'dbID'); + Exceptions : If the expected type was not set. + Status : Stable + +=cut + +sub check_ref_can { + my ($ref, $method) = @_; + throw('No method given') if ! defined $method; + return unless defined $ref && blessed($ref); + return $ref->can($method); +} + +=head2 assert_ref_can + + Arg [1] : The reference to check + Arg [2] : The method we expect to run + Arg [3] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given object/ref is + implements the given method. Will throw exceptions. + Returntype : Boolean; true if we managed to get to the return + Example : assert_ref_can($gene, 'dbID'); + Exceptions : If the reference is not defined, if the object does not + implement the given method and if no method was given to check + Status : Stable + +=cut + +sub assert_ref_can { + my ($ref, $method, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + throw('No method given') if ! defined $method; + throw "The given reference $attribute_name is not defined" unless defined $ref; + throw "The given reference $attribute_name is not blessed" unless blessed($ref); + if(! $ref->can($method)) { + my $str_ref = ref($ref); + throw sprintf(q{The given blessed reference '%s' for attribute '%s' does not implement the method '%s'}, $str_ref, $attribute_name, $method); + } + return 1; +} + +=head2 assert_numeric + + Arg [1] : The Scalar to check + Arg [2] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given scalar is + number or not. If not then we raise exceptions detailing why + Returntype : Boolean; true if we had a numeric otherwise we signal failure + via exceptions + Example : assert_numeric(1, 'dbID'); + Exceptions : If the Scalar is not defined, if the Scalar was blessed and + if the value was not a number + Status : Stable + +=cut + +sub assert_numeric { + my ($integer, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + throw "$attribute_name attribute is undefined" if ! defined $integer; + throw "The given attribute $attribute_name is blessed; cannot work with blessed values" if blessed($integer); + if(! looks_like_number($integer)) { + throw "Attribute $attribute_name was not a number"; + } + return 1; +} + +=head2 assert_integer + + Arg [1] : The Scalar to check + Arg [2] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given scalar is + a whole integer; we delegate to L for number + checking. + Returntype : Boolean; true if we had a numeric otherwise we signal failure + via exceptions + Example : assert_integer(1, 'dbID'); + Exceptions : See L and we raise exceptions if the value + was not a whole integer + Status : Stable + +=cut + +sub assert_integer { + my ($integer, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + assert_numeric($integer, $attribute_name); + if($integer != int($integer)) { + throw "Attribute $attribute_name was a number but not an Integer"; + } + return 1; +} + +=head2 assert_boolean + + Arg [1] : The Scalar to check + Arg [2] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given scalar is + a boolean i.e. value is set to C<1> or C<0> + Returntype : Boolean; true if we were given a boolean otherwise we signal + failure via exceptions + Example : assert_boolean(1, 'is_circular'); + Exceptions : See L and we raise exceptions if the value + was not equal to the 2 valid states + Status : Stable + +=cut + +sub assert_boolean { + my ($boolean, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + assert_numeric($boolean, $attribute_name); + if($boolean != 0 && $boolean != 1) { + throw "Attribute $attribute_name was an invalid boolean. Expected: 1 or 0. Got: $boolean"; + } + return 1; +} + +=head2 assert_strand + + Arg [1] : The Scalar to check + Arg [2] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given scalar is + a whole integer and if the value is set to C<1>, C<0> or C<-1> + Returntype : Boolean; true if we had a strand integer otherwise we signal + failure via exceptions + Example : assert_strand(1, 'strand'); + Exceptions : See L and we raise exceptions if the value + was not equal to the 3 valid states + Status : Stable + +=cut + +sub assert_strand { + my ($strand, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + assert_numeric($strand, $attribute_name); + if($strand != -1 && $strand != 0 && $strand ne 1) { + throw "Attribute $attribute_name was an invalid strand. Expected: 1, 0 or -1. Got: $strand"; + } + return 1; +} + + +=head2 assert_file_handle + + Arg [1] : The Scalar to check + Arg [2] : The attribute name you are asserting; not required but allows + for more useful error messages to be generated. Defaults to + C<-Unknown->. + Description : A subroutine which checks to see if the given scalar is + actually a file handle. This will handle those which are Glob + references and those which inherit from C. It will + also cope with a blessed Glob reference. + Returntype : Boolean; + Example : assert_file_handle($fh, '-FILE_HANDLE'); + Exceptions : Raised if not defined, not a reference and was not a + GLOB or did not inherit from IO::Handle + Status : Stable + +=cut + +sub assert_file_handle { + my ($file_handle, $attribute_name) = @_; + $attribute_name ||= '-Unknown-'; + throw "Attribute $attribute_name was undefined" if ! defined $file_handle; + my $ref = ref($file_handle); + throw "Attribute $attribute_name was not a reference. Got: $file_handle" if ! $ref; + if(!openhandle($file_handle)) { + if(blessed($file_handle)) { + if(! $file_handle->isa('IO::Handle')) { + throw "Attribute $attribute_name was blessed but did not inherit from IO::Handle. Ref was: $ref"; + } + } + else { + throw "Attribute $attribute_name was not a file handle. Ref was: $ref"; + } + } + return 1; +} + +=head2 scope_guard + + Arg [1] : CodeRef The block of code to exit once it escapes out of scope + Description : Simple subroutine which blesses your given code reference into + a L object. This has + a DESTROY implemented which will cause the code reference + to execute once the object goes out of scope and its reference + count hits 0. + Returntype : Bio::EnsEMBL::Utils::Scalar::ScopeGuard + Example : my $v = 'wibble'; + { + #Build a guard to reset $v to wibble + my $guard = scope_guard(sub { $v = 'wibble'}); + $v = 'wobble'; + warn $v; + } + # $guard is out of scope; sub is triggered and $v is reset + warn $v; + Exceptions : Raised if argument was not a CodeRef + Status : Stable + +=cut + +sub scope_guard { + my ($callback) = @_; + assert_ref($callback, 'CODE', 'callback'); + return bless($callback, 'Bio::EnsEMBL::Utils::Scalar::ScopeGuard'); +} + +1; + +#### SUPER SECRET PACKAGE. IGNORE ME +package Bio::EnsEMBL::Utils::Scalar::ScopeGuard; +sub DESTROY { + my ($self) = @_; + $self->(); + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/SchemaConversion.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/SchemaConversion.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,347 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::SchemaConversion - Utility module for Vega schema conversion script + +=head1 SYNOPSIS + + my $serverroot = '/path/to/ensembl'; + my $conversion = + Bio::EnsEMBL::Utils::ConversionSupport->new($serverroot); + + # parse common options + $conversion->conv_usage->parse_common_options; + + # convert from schema 19 to 20+ + $conversion->do_conversion() + +=head1 DESCRIPTION + +This module is a helper module for database conversion, for +both vega-vega and ensembl-vega schemas. It provides a wrapper +around SeqStoreConverter::BasicConverter and the species specific +methods therein. Also provides access to helper functions in +Bio::EnsEMBL::Utils::ConversionSupport + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::SchemaConversion; + +use Bio::EnsEMBL::Utils::ConversionSupport; +use strict; +use warnings; + +use Data::Dumper; + +=head2 new + + Example : $conversion->Bio::EnsEMBL::Utils::SchemaConversion->new($serverroot); + Description : Constructor, including an instance of a Bio::EnsEMBL::Utils::ConversionSupport + object. Parses input file and checks input with user + Return type : Bio::EnsEMBL::Utils::SchemaConversion object + Exceptions : thrown if $Siteroot not passed over + Caller : $Siteroot/utils/vega_schema_conversion + +=cut + +sub new { + my $class = shift; + my $support = shift; + my $self = {}; + bless ($self,$class); + $self->{config} = Bio::EnsEMBL::Utils::ConversionSupport->new($support); + $self->conv_support->parse_common_options; + $self->conv_support->parse_extra_options('do_vega_sc=s', + 'do_ens_sc=s', + 'source_db=s', + 'core_sql=s', + 'vega_sql=s', + 'patch_sql=s', + 'force=s', + 'do_features=s'); + + #check input and show help + $self->conv_usage() if ($self->conv_support->param("help")); + $self->conv_usage("configuration file needed") unless ($self->conv_support->param("conffile")); + $self->conv_usage("password for database access needed") unless ($self->conv_support->param("pass")); + $self->conv_usage("can only do conversion to ensembl OR Vega, not both") if ($self->conv_support->param('do_vega_sc') && $self->conv_support->param('do_ens_sc')); + $self->conv_usage("You need to do vega->veg or ensembl->vega conversion") unless ($self->conv_support->param('do_vega_sc') || $self->conv_support->param('do_ens_sc')); + + # ask user to confirm parameters to proceed + $self->conv_support->allowed_params('conffile', + 'do_vega_sc', + 'do_ens_sc', + 'host', + 'port', + 'user', + 'pass', + 'source_db', + 'dbname', + 'force', + 'do_features', + 'verbose', + 'logpath', + 'logfile', + 'core_sql', + 'vega_sql', + 'patch_sql'); + $self->conv_support->confirm_params; + + return $self; +} + +=head2 conv_support + + Example : $conversion->conv_support; + Description : Provides access to Bio::EnsEMBL::Utils::ConversionSupport methods + Return type : Bio::EnsEMBL::Utils::ConversionSuppor object + Exceptions : none + Caller : general + +=cut + +sub conv_support { + my $self = shift; + return $self->{config}; +} + +=head2 conv_obj + + Example : $conversion->conv_obj; + Description : Provides access to SeqStoreConverter::BasicConverter methods + Return type : SeqStoreConverter::BasicConverter object + Exceptions : none + Caller : general + +=cut + +sub conv_obj { + my $self = shift; + return $self->{'converter_object'}; +} + + +=head2 species_alias + + Example : $self->species_alias + Description : examines name of source database to determine which conversion module to use + Return type : string + Exceptions : die if wrong species name used + Caller : $self + +=cut + +sub species_alias { + my $self=shift; + my $name = shift; + return 'CanisFamiliaris' if $name =~ /canis/; + return 'HomoSapiens' if $name =~ /homo/; + return 'MusMusculus' if $name =~ /mus/; + return 'DanioRerio' if $name =~ /danio/; + ##hack - should use own modules + return 'HomoSapiens' if $name =~ /sus/; + die "invalid name of source database, please check configuration file"; +} + +=head2 choose_conversion_type + + Example : $conversion->choose_conversion_type + Description : compares conversion type (ensembl or vega) and species type with + available modules and chooses that to use for the conversion. Stores + a converter object within the caller + Return type : none + Exceptions : none + Caller : $Siteroot/utils/vega_schema_conversion + +=cut + +sub choose_conversion_type { + my $self = shift; + my $converter; + my $species; + + $species = $self->species_alias($self->conv_support->param('source_db')); + if ($self->conv_support->param('do_vega_sc')) { + $species = "vega::".$species; + eval "require SeqStoreConverter::$species"; + if($@) { + warn("Could not require conversion module SeqStoreConverter::$species\ for vega conversion\n" . + "Using SeqStoreConverter::BasicConverter instead:\n$@"); + require SeqStoreConverter::BasicConverter; + $species = "BasicConverter"; + } + else { + warn "Using conversion module SeqStoreConverter::$species for vega conversion\n"; + } + } + else { + eval "require SeqStoreConverter::$species"; + if($@) { + warn("Could not require conversion module SeqStoreConverter::$species for Ensembl conversion\n" . + "Using SeqStoreConverter::BasicConverter instead:\n$@"); + require SeqStoreConverter::BasicConverter; + $species = "BasicConverter"; + } + else { + warn "Using conversion module SeqStoreConverter::$species for Ensembl conversion\n"; + } + $self->conv_support->param('vega_sql',0); + } + $converter = "SeqStoreConverter::$species"->new + ( $self->conv_support->param('user'), + $self->conv_support->param('pass'), + $self->conv_support->param('host').':'.$self->conv_support->param('port'), + $self->conv_support->param('source_db'), + $self->conv_support->param('dbname'), + $self->conv_support->param('core_sql'), + $self->conv_support->param('vega_sql'), + $self->conv_support->param('force'), + $self->conv_support->param('verbose'), + '', + ); + + $self->{'converter_object'} = $converter; +} + +=head2 do_conversion + + Example : $conversion->do_conversion + Description : does the database conversion + Return type : none + Exceptions : none + Caller : $Siteroot/utils/vega_schema_conversion + +=cut + + +sub do_conversion { + my $self= shift; + $self->conv_obj->debug( "\n\n*** converting " . $self->conv_obj->source . " to " . + $self->conv_obj->target() . " ***"); + $self->conv_obj->transfer_meta(); + $self->conv_obj->create_coord_systems(); + $self->conv_obj->create_seq_regions(); + $self->conv_obj->create_assembly(); + $self->conv_obj->create_attribs(); + $self->conv_obj->set_top_level(); + $self->conv_obj->transfer_dna(); + $self->conv_obj->back_patch_schema(); + $self->conv_obj->transfer_genes(); + $self->conv_obj->transfer_prediction_transcripts(); + + if ($self->conv_support->param('do_features')) { + $self->conv_obj->transfer_features(); + } +#use this for both ensembl and vega for now, +#but might need changing when vega gets eg transcript modified dates + $self->conv_obj->transfer_vega_stable_ids(); + $self->conv_obj->copy_other_tables(); + $self->conv_obj->copy_repeat_consensus(); + $self->conv_obj->create_meta_coord(); + if ($self->conv_support->param('do_vega_sc')) { + $self->conv_obj->copy_other_vega_tables(); + $self->conv_obj->update_clone_info(); + $self->conv_obj->remove_supercontigs(); + $self->conv_obj->copy_internal_clone_names(); + $self->conv_obj->copy_assembly_exception; + } +} + +=head2 make_schema_up_to_date + + Example : $conversion->make_schema_up_to_date + Description : patches schema to latest version + Return type : none + Exceptions : none + Caller : $conversion + +=cut + +sub make_schema_up_to_date { + my $self = shift; + $self->conv_obj->debug ("\nPatching schema to latest version\n"); + my $user = $self->conv_obj->user; + my $pass = $self->conv_obj->password; + my $port = $self->conv_obj->port; + my $host = $self->conv_obj->host; + my $target = $self->conv_obj->target; + my $patch_schema = $self->conv_support->param('patch_sql'); + my $cmd = "/usr/local/mysql/bin/mysql -u $user -p$pass -P $port -h $host $target < $patch_schema"; + system ($cmd); +} + + + +=head2 conv_usage + + Example : $conversion->conv_usage("message") + Description : prints usage information and exits + Return type : none + Exceptions : none + Caller : $Siteroot/utils/vega_schema_conversion + +=cut + +sub conv_usage { + my $self = shift; + my $msg = shift; + + print STDERR "\nMSG: $msg\n" if($msg); + + print STDERR < + +options: --conf configuration file (uses conf/Conversion.ini by default): + + fields: + do_vega_sc (do vega conversion: 0 or 1) + do_ens_sc (do ensembl conversion: 0 or 1) + user (a mysql db user with read/write priveleges) + host (eg ecs3f) + port (eg 3310) + source_db (schema 19 source database) + dbname (schema 20+ target database) + force (overwrite existing target database: 0 or 1) + verbose (print out debug statements: 0 or 1) + logpath (location of log file) + do_features (transfer dna- and protein-align features, for debugging: 0 or 1) + core_sql (location of ensembl schema creation script: ensembl/sql/table.sql) + vega_sql (location of creation script for additional vega tables: ensembl/sql/vega_specific_tables.sql) + patch_sql (location of schema patching script: ensembl/sql/vega_latest_schema.sql) + + --log name of log_file + --help display this message + +EOF + exit; + +} + +1; + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/ScriptUtils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/ScriptUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,265 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::ScriptUtils; + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::ScriptUtils; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Exporter; +our @ISA = qw(Exporter); + +our @EXPORT_OK = qw( + user_proceed + commify + sort_chromosomes + parse_bytes + directory_hash + path_append + dynamic_use + inject +); + + +=head2 user_proceed + + Arg[1] : (optional) String $text - notification text to present to user + Example : # run a code snipped conditionally + if ($support->user_proceed("Run the next code snipped?")) { + # run some code + } + + # exit if requested by user + exit unless ($support->user_proceed("Want to continue?")); + Description : If running interactively, the user is asked if he wants to + perform a script action. If he doesn't, this section is skipped + and the script proceeds with the code. When running + non-interactively, the section is run by default. + Return type : TRUE to proceed, FALSE to skip. + Exceptions : none + Caller : general + +=cut + +sub user_proceed { + my ($text, $interactive, $default) = @_; + + unless (defined($default)) { + die("Need a default answer for non-interactive runs."); + } + + my $input; + + if ($interactive) { + print "$text\n" if $text; + print "[y/N] "; + + $input = lc(<>); + chomp $input; + } else { + $input = $default; + } + + if ($input eq 'y') { + return(1); + } else { + print "Skipping.\n" if ($interactive); + return(0); + } +} + + +=head2 sort_chromosomes + + Arg[1] : (optional) Hashref $chr_hashref - Hashref with chr_name as keys + Example : my $chr = { '6-COX' => 1, '1' => 1, 'X' => 1 }; + my @sorted = $support->sort_chromosomes($chr); + Description : Sorts chromosomes in an intuitive way (numerically, then + alphabetically). If no chromosome hashref is passed, it's + retrieve by calling $self->get_chrlength() + Return type : List - sorted chromosome names + Exceptions : thrown if no hashref is provided + Caller : general + +=cut + +sub sort_chromosomes { + my @chromosomes = @_; + + return (sort _by_chr_num @chromosomes); +} + + +=head2 _by_chr_num + + Example : my @sorted = sort _by_chr_num qw(X, 6-COX, 14, 7); + Description : Subroutine to use in sort for sorting chromosomes. Sorts + numerically, then alphabetically + Return type : values to be used by sort + Exceptions : none + Caller : internal ($self->sort_chromosomes) + +=cut + +sub _by_chr_num { + my @awords = split /-/, $a; + my @bwords = split /-/, $b; + + my $anum = $awords[0]; + my $bnum = $bwords[0]; + + if ($anum !~ /^[0-9]*$/) { + if ($bnum !~ /^[0-9]*$/) { + return $anum cmp $bnum; + } else { + return 1; + } + } + if ($bnum !~ /^[0-9]*$/) { + return -1; + } + + if ($anum <=> $bnum) { + return $anum <=> $bnum; + } else { + if ($#awords == 0) { + return -1; + } elsif ($#bwords == 0) { + return 1; + } else { + return $awords[1] cmp $bwords[1]; + } + } +} + + +=head2 commify + + Arg[1] : Int $num - a number to commify + Example : print "An easy to read number: ".$self->commify(100000000); + # will print 100,000,000 + Description : put commas into a number to make it easier to read + Return type : a string representing the commified number + Exceptions : none + Caller : general + Status : stable + +=cut + +sub commify { + my $num = shift; + + $num = reverse($num); + $num =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + + return scalar reverse $num; +} + + +sub parse_bytes { + my $bytes = shift; + + my @suffixes = qw(bytes kb Mb Gb Tb); + + my $length = length($bytes); + my $order = int(($length-1)/3); + + my $parsed = sprintf('%.1f', $bytes/10**(3*$order)); + + return "$parsed ".$suffixes[$order]; +} + + +sub directory_hash { + my $filename = shift; + + my (@md5) = md5_hex($filename) =~ /\G(..)/g; + return join('/', @md5[0..2]); +} + + +sub path_append { + my $path1 = shift; + my $path2 = shift; + + # default to current directory + $path1 = '.' unless (defined($path1)); + + my $return_path = "$path1/$path2"; + + unless (-d $return_path) { + system("mkdir -p $return_path") == 0 or + die("Unable to create directory $return_path: $!\n"); + } + + return $return_path; +} + + +=head2 inject + + Arg [1] : String $classname - The name of the class to require/import + Example : $self->inject('Bio::EnsEMBL::DBSQL::DBAdaptor'); + Description: Requires and imports the methods for the classname provided, + checks the symbol table so that it doesnot re-require modules + that have already been required. + Returntype : true on success + Exceptions : Warns to standard error if module fails to compile + Caller : internal + +=cut + +sub inject { + my $classname = shift; + my ($parent_namespace, $module) = $classname =~/^(.*::)(.*)$/ ? + ($1,$2) : ('::', $classname); + no strict 'refs'; + + # return if module has already been imported + return 1 if $parent_namespace->{$module.'::'}; + + eval "require $classname"; + die("Failed to require $classname: $@") if ($@); + + $classname->import(); + + return 1; +} + + +sub dynamic_use { + return inject(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/SeqDumper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/SeqDumper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1121 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::SeqDumper + +=head1 SYNOPSIS + + $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new(); + + # don't dump snps or repeats + $seq_dumper->disable_feature_type('repeat'); + $seq_dumper->disable_feature_type('variation'); + + # dump EMBL format to STDOUT + $seq_dumper->dump( $slice, 'EMBL' ); + + # dump GENBANK format to a file + $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' ); + + # dump FASTA format to a file + $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' ); + +=head1 DESCRIPTION + +A relatively simple and lite-weight flat file dumper for Ensembl slices. +The memory efficiency could be improved and this is currently not very +good for dumping very large sequences such as whole chromosomes. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::SeqDumper; + +use strict; + +use IO::File; +use vars qw(@ISA); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +#keys must be uppercase +my $DUMP_HANDLERS = + { 'FASTA' => \&dump_fasta, + 'EMBL' => \&dump_embl, + 'GENBANK' => \&dump_genbank }; + +my @COMMENTS = + ('This sequence was annotated by the Ensembl system. Please visit ' . + 'the Ensembl web site, http://www.ensembl.org/ for more information.', + + 'All feature locations are relative to the first (5\') base ' . + 'of the sequence in this file. The sequence presented is '. + 'always the forward strand of the assembly. Features ' . + 'that lie outside of the sequence contained in this file ' . + 'have clonal location coordinates in the format: ' . + '.:..', + + 'The /gene indicates a unique id for a gene, /note="transcript_id=..."' . + ' a unique id for a transcript, /protein_id a unique id for a peptide ' . + 'and note="exon_id=..." a unique id for an exon. These ids are ' . + 'maintained wherever possible between versions.', + + 'All the exons and transcripts in Ensembl are confirmed by ' . + 'similarity to either protein or cDNA sequences.'); + + +=head2 new + + Arg [1] : none + Example : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new; + Description: Creates a new SeqDumper + Returntype : Bio::EnsEMBL::Utils::SeqDumper + Exceptions : none + Caller : general + +=cut + +sub new { + my ($caller, $slice, $params) = @_; + + my $class = ref($caller) || $caller; + + my $feature_types = {'gene' => 1, + 'genscan' => 1, + 'repeat' => 1, + 'similarity' => 1, + 'variation' => 1, + 'contig' => 1, + 'marker' => 1, + 'estgene' => 0, + 'vegagene' => 0}; + + my $self = bless {'feature_types' => $feature_types}, $class; + + foreach my $p (sort keys %{$params || {}}) { + $self->{$p} = $params->{$p}; + } + + return $self; +} + + + +=head2 enable_feature_type + + Arg [1] : string $type + Example : $seq_dumper->enable_feature_type('similarity'); + Description: Enables the dumping of a specific type of feature + Returntype : none + Exceptions : warn if invalid feature type is passed, + thrown if no feature type is passed + Caller : general + +=cut + +sub enable_feature_type { + my ($self, $type) = @_; + + $type || throw("type arg is required"); + + if(exists($self->{'feature_types'}->{$type})) { + $self->{'feature_types'}->{$type} = 1; + } else { + warning("unknown feature type '$type'\n" . + "valid types are: " . join(',', keys %{$self->{'feature_types'}})); + } +} + + + +=head2 attach_database + + Arg [1] : string name + Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor + Example : $seq_dumper->attach_database('estgene', $estgene_db); + Description: Attaches a database to the seqdumper that can be used to + dump data which is external to the ensembl core database. + Currently this is necessary to dump est genes and vega genes + Returntype : none + Exceptions : thrown if incorrect argument is supplied + Caller : general + +=cut + +sub attach_database { + my ($self, $name, $db) = @_; + + $name || throw("name arg is required"); + unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { + throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]"); + } + + $self->{'attached_dbs'}->{$name} = $db; +} + + + +=head2 get_database + + Arg [1] : string $name + Example : $db = $seq_dumper->get_database('vega'); + Description: Retrieves a database that has been attached to the + seqdumper via the attach database call. + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : thrown if incorrect argument is supplied + Caller : dump_feature_table + +=cut + +sub get_database { + my ($self, $name) = @_; + + $name || throw("name arg is required"); + + return $self->{'attached_dbs'}->{$name}; +} + + + +=head2 remove_database + + Arg [1] : string $name + Example : $db = $seq_dumper->remove_database('estgene'); + Description: Removes a database that has been attached to the seqdumper + via the attach database call. The database that is removed + is returned (or undef if it did not exist). + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : thrown if incorrect argument is supplied + Caller : general + +=cut + +sub remove_database { + my ($self, $name) = @_; + + $name || throw("name arg is required"); + + if(exists $self->{'attached_dbs'}->{$name}) { + return delete $self->{'attached_dbs'}->{$name}; + } + + return undef; +} + + +=head2 disable_feature_type + + Arg [1] : string $type + Example : $seq_dumper->disable_feature_type('genes'); + Description: Disables the dumping of a specific type of feature + Returntype : none + Exceptions : warn if an invalid feature type is passed, + thrown if no feature type is passed + Caller : general + +=cut + +sub disable_feature_type { + my ($self, $type) = @_; + + $type || throw("type arg is required"); + + if(exists($self->{'feature_types'}->{$type})) { + $self->{'feature_types'}->{$type} = 0; + } else { + warning("unknown feature type '$type'\n" . + "valid types are: " . join(',', keys %{$self->{'feature_types'}})); + } +} + + + +=head2 is_enabled + + Arg [1] : string $type + Example : do_something() if($seq_dumper->is_enabled('gene')); + Description: checks if a specific feature type is enabled + Returntype : none + Exceptions : warning if invalid type is passed, + thrown if no type is passed + Caller : general + +=cut + +sub is_enabled { + my ($self, $type) = @_; + + $type || throw("type arg is required"); + + if(exists($self->{'feature_types'}->{$type})) { + return $self->{'feature_types'}->{$type}; + } else { + warning("unknown feature type '$type'\n" . + "valid types are: " . join(',', keys %{$self->{'feature_types'}})); + } +} + + +=head2 dump + + Arg [1] : Bio::EnsEMBL::Slice slice + The slice to dump + Arg [2] : string $format + The name of the format to dump + Arg [3] : (optional) $outfile + The name of the file to dump to. If no file is specified STDOUT + is used + Arg [4] : (optional) $seq + Sequence to dump + Arg [4] : (optional) $no_append + Default action is to open the file in append mode. This will + turn that mode off + Example : $seq_dumper->dump($slice, 'EMBL'); + Description: Dumps a region of a genome specified by the slice argument into + an outfile of the format $format + Returntype : none + Exceptions : thrown if slice or format args are not supplied + Caller : general + +=cut + + +sub dump { + my ($self, $slice, $format, $outfile, $seq, $no_append) = @_; + + $format || throw("format arg is required"); + $slice || throw("slice arg is required"); + + my $dump_handler = $DUMP_HANDLERS->{uc($format)}; + + unless($dump_handler) { + throw("No dump handler is defined for format $format\n"); + } + + + my $FH = IO::File->new;; + if($outfile) { + my $mode = ($no_append) ? '>' : '>>'; + $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!"); + } else { + $FH = \*STDOUT; + #mod_perl did not like the following + #$FH->fdopen(fileno(STDOUT), "w") + # or throw("Could not open currently selected output filehandle " . + # "for writing"); + } + + &$dump_handler($self, $slice, $FH, $seq); + + $FH->close if ($outfile); #close if we were writing to a file +} + +=head2 dump_embl + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : IO::File $FH + Arg [3] : optional sequence string + Example : $seq_dumper->dump_embl($slice, $FH); + Description: Dumps an EMBL flat file to an open file handle + Returntype : none + Exceptions : none + Caller : dump + +=cut + +sub dump_embl { + my $self = shift; + my $slice = shift; + my $FH = shift; + my $SEQ = shift; + + my $len = $slice->length; + + my $version; + my $acc; + + my $cs = $slice->coord_system(); + my $name_str = $cs->name() . ' ' . $slice->seq_region_name(); + $name_str .= ' ' . $cs->version if($cs->version); + + my $start = $slice->start; + my $end = $slice->end; + + #determine if this slice is the entire seq region + #if it is then we just use the name as the id + my $slice_adaptor = $slice->adaptor(); + my $full_slice = + $slice->adaptor->fetch_by_region($cs->name, + $slice->seq_region_name, + undef,undef,undef, + $cs->version); + + + my $entry_name = $slice->seq_region_name(); + + + + if($full_slice->name eq $slice->name) { + $name_str .= ' full sequence'; + $acc = $slice->seq_region_name(); + my @acc_ver = split(/\./, $acc); + if(@acc_ver == 2) { + $acc = $acc_ver[0]; + $version = $acc_ver[0] . '.'. $acc_ver[1]; + } elsif(@acc_ver == 1 && $cs->version()) { + $version = $acc . '.'. $cs->version(); + } else { + $version = $acc; + } + } else { + $name_str .= ' partial sequence'; + $acc = $slice->name(); + $version = $acc; + } + + $acc = $slice->name(); + + + + #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' + $: = (" \t\n-,"); + + ############# + # dump header + ############# + + my $EMBL_HEADER = +'@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ +'; + + #ID and moltype + # HTG = High Throughput Genome division, probably most suitable + # and it would be hard to come up with another appropriate division + # that worked for all organisms (e.g. plants are in PLN but human is + # in HUM). + my $VALUE = "$entry_name standard; DNA; HTG; $len BP."; + $self->write($FH, $EMBL_HEADER, 'ID', $VALUE); + $self->print( $FH, "XX\n" ); + + #Accession + $self->write($FH, $EMBL_HEADER, 'AC', $acc); + $self->print( $FH, "XX\n" ); + + #Version + $self->write($FH, $EMBL_HEADER, 'SV', $version); + $self->print( $FH, "XX\n" ); + + #Date + $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string); + $self->print( $FH, "XX\n" ); + + my $meta_container = $slice->adaptor()->db()->get_MetaContainer(); + + #Description + $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() . + " $name_str $start..$end annotated by Ensembl"); + $self->print( $FH, "XX\n" ); + + #key words + $self->write($FH, $EMBL_HEADER, 'KW', '.'); + $self->print( $FH, "XX\n" ); + + #Species + my $species_name = $meta_container->get_scientific_name(); + if(my $cn = $meta_container->get_common_name()) { + $species_name .= " ($cn)"; + } + + $self->write($FH, $EMBL_HEADER, 'OS', $species_name); + + #Classification + my $cls = $meta_container->get_classification(); + $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@{$cls})) . '.'); + $self->print( $FH, "XX\n" ); + + #References (we are not dumping refereneces) + + #Database References (we are not dumping these) + + #comments + foreach my $comment (@COMMENTS) { + $self->write($FH, $EMBL_HEADER, 'CC', $comment); + $self->print( $FH, "XX\n" ); + } + + #################### + #DUMP FEATURE TABLE + #################### + $self->print( $FH, "FH Key Location/Qualifiers\n" ); + + my $FEATURE_TABLE = +'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ +'; + $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE); + + #write an XX after the feature tables + $self->print( $FH, "XX\n" ); + + ################### + #DUMP SEQUENCE + ################### + + if(!defined($SEQ)){ + $SEQ = $slice->seq(); + } +# my $SEQ = $slice->seq(); + my $length = length($SEQ); + my $a_count = $SEQ =~ tr/aA/aA/; + my $c_count = $SEQ =~ tr/cC/cC/; + my $t_count = $SEQ =~ tr/tT/tT/; + my $g_count = $SEQ =~ tr/gG/gG/; + my $other_count = $length - $a_count - $c_count - $t_count - $g_count; + + my $value = "Sequence $length BP; $a_count A; $c_count C; " . + "$g_count G; $t_count T; $other_count other;"; + $self->print($FH, 'SQ '.$value."\n"); + + $self->write_embl_seq($FH, \$SEQ); + + + $self->print( $FH, "//\n" ); + + # Set formatting back to normal + $: = " \n-"; +} + + + + +=head2 dump_genbank + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : IO::File $FH + Example : $seq_dumper->dump_genbank($slice, $FH); + Description: Dumps a GENBANK flat file to an open file handle + Returntype : none + Exceptions : none + Caller : dump + +=cut + +sub dump_genbank { + my ($self, $slice, $FH, $SEQ) = @_; + + #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' + $: = " \t\n-,"; + + my $GENBANK_HEADER = +'^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ +'; + + my $GENBANK_SUBHEADER = +' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ +'; + + my $GENBANK_FT = +' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ +'; + + my $version; + my $acc; + + my $cs = $slice->coord_system(); + + my $name_str = $cs->name() . ' ' . $slice->seq_region_name(); + + $name_str .= ' ' . $cs->version if($cs->version); + + #determine if this slice is the entire seq region + #if it is then we just use the name as the id + my $slice_adaptor = $slice->adaptor(); + my $full_slice = + $slice->adaptor->fetch_by_region($cs->name, + $slice->seq_region_name, + undef,undef,undef, + $cs->version); + + + my $entry_name = $slice->seq_region_name(); + + if($full_slice->name eq $slice->name) { + $name_str .= ' full sequence'; + $acc = $slice->seq_region_name(); + my @acc_ver = split(/\./, $acc); + if(@acc_ver == 2) { + $acc = $acc_ver[0]; + $version = $acc_ver[0] . $acc_ver[1]; + } elsif(@acc_ver == 1 && $cs->version()) { + $version = $acc . $cs->version(); + } else { + $version = $acc; + } + } else { + $name_str .= ' partial sequence'; + $acc = $slice->name(); + $version = $acc; + } + + $acc = $slice->name(); # to keep format consistent for all + + my $length = $slice->length; + my $start = $slice->start(); + my $end = $slice->end(); + + my $date = $self->_date_string; + + my $meta_container = $slice->adaptor()->db()->get_MetaContainer(); + + #LOCUS + my $tag = 'LOCUS'; + my $value = "$entry_name $length bp DNA HTG $date"; + $self->write($FH, $GENBANK_HEADER, $tag, $value); + + #DEFINITION + $tag = "DEFINITION"; + $value = $meta_container->get_scientific_name() . + " $name_str $start..$end reannotated via EnsEMBL"; + $self->write($FH, $GENBANK_HEADER, $tag, $value); + + #ACCESSION + $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc); + + #VERSION + $self->write($FH, $GENBANK_HEADER, 'VERSION', $version); + + # KEYWORDS + $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.'); + + # SOURCE + my $common_name = $meta_container->get_common_name(); + $common_name = $meta_container->get_scientific_name() unless $common_name; + $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name); + + #organism + my @cls = $meta_container->get_classification(); + shift @cls; + $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name()); + $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . "."); + + #refereneces + + #comments + foreach my $comment (@COMMENTS) { + $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment); + } + + #################### + # DUMP FEATURE TABLE + #################### + $self->print( $FH, "FEATURES Location/Qualifiers\n" ); + $self->_dump_feature_table($slice, $FH, $GENBANK_FT); + + #################### + # DUMP SEQUENCE + #################### + + if(!defined($SEQ)){ + $SEQ = $slice->seq(); + } +# my $SEQ = $slice->seq(); + my $a_count = $SEQ =~ tr/aA/aA/; + my $c_count = $SEQ =~ tr/cC/cC/; + my $t_count = $SEQ =~ tr/tT/tT/; + my $g_count = $SEQ =~ tr/gG/gG/; + my $bp_length = length($SEQ); + my $other_count = $bp_length - $a_count - $c_count - $t_count - $g_count; + + $tag = 'BASE COUNT'; + $value = "$a_count a $c_count c $g_count g $t_count t"; + $value .= " $other_count n" if($other_count); + $self->print($FH, qq{$tag $value\n}); + $self->print( $FH, "ORIGIN\n" ); + + $self->write_genbank_seq($FH, \$SEQ); + + $self->print( $FH, "//\n" ); + + # Set formatting back to normal + $: = " \n-"; +} + + + +=head2 _dump_feature_table + + Arg [1] : Bio::EnsEMBL::Slice slice + Example : none + Description: Helper method used to dump feature tables used in EMBL, FASTA, + GENBANK. Assumes formating of file handle has been setup + already to use $FEAT and $VALUE values. + Returntype : none + Exceptions : none + Caller : internal + +=cut + +sub _dump_feature_table { + my $self = shift; + my $slice = shift; + my $FH = shift; + my $FORMAT = shift; + + #use only the core database to dump features (except for bloody snps) + my $lite = $slice->adaptor->db->remove_db_adaptor('lite'); + + my $meta = $slice->adaptor->db->get_MetaContainer; + + #lump file handle and format string together for simpler method calls + my @ff = ($FH, $FORMAT); + my $value; + + #source + my $classification = join(', ', $meta->get_classification()); + $self->write(@ff,'source', "1.." . $slice->length()); + $self->write(@ff,'' , '/organism="'.$meta->get_scientific_name(). '"'); + $self->write(@ff,'' , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"'); + + # + # Transcripts & Genes + # + my @gene_slices; + if($self->is_enabled('gene')) { + push @gene_slices, $slice; + } + + # Retrieve slices of other database where we need to pull genes from + + my $gene_dbs = {'vegagene' => 'vega', + 'estgene' => 'estgene'}; + + foreach my $gene_type (keys %$gene_dbs) { + if($self->is_enabled($gene_type)) { + my $db = $self->get_database($gene_dbs->{$gene_type}); + if($db) { + my $sa = $db->get_SliceAdaptor(); + push @gene_slices, $sa->fetch_by_name($slice->name()); + } else { + warning("A [". $gene_dbs->{$gene_type} ."] database must be " . + "attached to this SeqDumper\n(via a call to " . + "attach_database) to retrieve genes of type [$gene_type]"); + } + } + } + + foreach my $gene_slice (@gene_slices) { + my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)}; + while(my $gene = shift @genes) { + $value = $self->features2location( [$gene] ); + $self->write( @ff, 'gene', $value ); + $self->write( @ff, "", '/gene='.$gene->stable_id() ); + + + if(defined($gene->display_xref)){ + $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"'); + } + my $desc = $gene->description; + if(defined($desc) and $desc ne ""){ + $desc =~ s/\"//; + $self->write( @ff, "", '/note="'.$gene->description.'"'); + } + + + + foreach my $transcript (@{$gene->get_all_Transcripts}) { + my $translation = $transcript->translation; + + # normal transcripts get dumped differently than pseudogenes + if($translation) { + #normal transcript + $value = $self->features2location($transcript->get_all_Exons); + $self->write(@ff, 'mRNA', $value); + $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); + $self->write(@ff,'' + ,'/note="transcript_id='.$transcript->stable_id().'"'); + + # ...and a CDS section + $value = + $self->features2location($transcript->get_all_translateable_Exons); + $self->write(@ff,'CDS', $value); + my $codon_start = $self->transcript_to_codon_start($transcript); + $self->write(@ff,'' , qq{/codon_start="${codon_start}"}) if $codon_start > 1; + $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); + $self->write(@ff,'' , '/protein_id="'.$translation->stable_id().'"'); + $self->write(@ff,'' ,'/note="transcript_id='.$transcript->stable_id().'"'); + + foreach my $dbl (@{$transcript->get_all_DBLinks}) { + $value = '/db_xref="'.$dbl->dbname().':'.$dbl->display_id().'"'; + $self->write(@ff, '', $value); + } + + $value = '/translation="'.$transcript->translate()->seq().'"'; + $self->write(@ff, '', $value); + } else { + #pseudogene + $value = $self->features2location($transcript->get_all_Exons); + $self->write(@ff, 'misc_RNA', $value); + $self->write(@ff,'' , '/gene="'.$gene->stable_id().'"'); + foreach my $dbl (@{$transcript->get_all_DBLinks}) { + $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"'; + $self->write(@ff, '', $value); + } + $self->write(@ff,'' , '/note="'.$transcript->biotype().'"'); + $self->write(@ff,'' + ,'/note="transcript_id='.$transcript->stable_id().'"'); + } + } + } + + # exons + foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) { + foreach my $exon (@{$gene->get_all_Exons}) { + $self->write(@ff,'exon', $self->features2location([$exon])); + $self->write(@ff,'' , '/note="exon_id='.$exon->stable_id().'"'); + } + } + } + + # + # genscans + # + if($self->is_enabled('genscan')) { + my @genscan_exons; + my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)}; + while(my $transcript = shift @transcripts) { + my $exons = $transcript->get_all_Exons(); + push @genscan_exons, @$exons; + $self->write(@ff, 'mRNA', $self->features2location($exons)); + $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"'); + $self->write(@ff, '', '/note="identifier='.$transcript->stable_id.'"'); + $self->write(@ff, '', '/note="Derived by automated computational' . + ' analysis using gene prediction method:' . + $transcript->analysis->logic_name . '"'); + } + } + + # + # snps + # + if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) { +# $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite; + + my @variations = @{$slice->get_all_VariationFeatures()}; + while(my $snp = shift @variations) { + my $ss = $snp->start; + my $se = $snp->end; + #skip snps that hang off edge of slice + next if($ss < 1 || $se > $slice->length); + + $self->write(@ff, 'variation', "$ss..$se"); + $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"'); + #$self->write(@ff, '' , '/evidence="'.$snp->status.'"'); + my $rs_id = $snp->variation_name(); + my $db = $snp->source(); +# foreach my $link ($snp->each_DBLink) { +# my $id = $link->primary_id; +# my $db = $link->database; + $self->write(@ff, '', "/db_xref=\"$db:$rs_id\""); +# } + } + +# $slice->adaptor->db->remove_db_adaptor('lite') if $lite; + } + + # + # similarity features + # + if($self->is_enabled('similarity')) { + foreach my $sim (@{$slice->get_all_SimilarityFeatures}) { + $self->write(@ff, 'misc_feature', $self->features2location([$sim])); + $self->write(@ff, '' , '/note="match: '.$sim->hseqname. + ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"'); + } + } + + # + # repeats + # + if($self->is_enabled('repeat')) { + my @rfs = @{$slice->get_all_RepeatFeatures()}; + + while(my $repeat = shift @rfs) { + $self->write(@ff, 'repeat_region', $self->features2location([$repeat])); + $self->write(@ff, '' , '/note="' . $repeat->repeat_consensus->name. + ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend . + '('.$repeat->hstrand.') of consensus"'); + } + + } + + # + # markers + # + if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) { + my @markers = @{$slice->get_all_MarkerFeatures()}; + while(my $mf = shift @markers) { + $self->write(@ff, 'STS', $self->features2location([$mf])); + if($mf->marker->display_MarkerSynonym) { + $self->write(@ff, '' , '/standard_name="' . + $mf->marker->display_MarkerSynonym->name . '"'); + } + + + #grep out synonyms without a source + my @synonyms = @{$mf->marker->get_all_MarkerSynonyms}; + @synonyms = grep {$_->source } @synonyms; + foreach my $synonym (@synonyms) { + $self->write(@ff, '', '/db_xref="'.$synonym->source. + ':'.$synonym->name.'"'); + } + $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"'); + } + } + + # + # contigs + # + if($self->is_enabled('contig')) { + foreach my $segment (@{$slice->project('seqlevel')}) { + my ($start, $end, $slice) = @$segment; + $self->write(@ff, 'misc_feature', + $start .'..'. $end); + $self->write(@ff, '', '/note="contig '.$slice->seq_region_name . + ' ' . $slice->start . '..' . $slice->end . + '(' . $slice->strand . ')"'); + } + } + + $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite; + +} + +# /codon_start= is the first base to start translating from. This maps +# Ensembl start exon phase to this concept. Here we present the usage +# of phase in this concept where each row shows the sequence the +# spliced_seq() method will return + +# 123456789 +# ATTATGACG +# Phase == 0 ...+++### codon_start=0 // start from 1st A +# Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence) +# Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence) +# +# In the case of the final 2 phases we will generate a X codon +# + +sub transcript_to_codon_start { + my ($self, $transcript) = @_; + my $start_phase = $transcript->start_Exon()->phase(); + return ( $start_phase == 1 ) ? 3 : + ( $start_phase == 2 ) ? 2 : + ( $start_phase == 0 ) ? 1 : + -1; +} + + +=head2 dump_fasta + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : IO::File $FH + Example : $seq_dumper->dump_fasta($slice, $FH); + Description: Dumps an FASTA flat file to an open file handle + Returntype : none + Exceptions : none + Caller : dump + +=cut + +sub dump_fasta { + my $self = shift; + my $slice = shift; + my $FH = shift; + + my $id = $slice->seq_region_name; + my $seqtype = 'dna'; + my $idtype = $slice->coord_system->name; + my $location = $slice->name; + my $start = 1; + my $end = $slice->length(); + + my $header = ">$id $seqtype:$idtype $location\n"; + $self->print( $FH, $header ); + + #set the formatting to FASTA + my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +'; + + #chunk the sequence in 60kb chunks to use less memory + my $cur = $start; + while($cur <= $end) { + my $to = $cur + 59_999; + $to = $end if($to > $end); + my $seq = $slice->subseq($cur, $to); + $cur = $to + 1; + $self->write($FH, $FORMAT, $seq); + } +} + + + +=head2 features2location + + Arg [1] : listref of Bio::EnsEMBL::SeqFeatures + Example : $location = $self->features2location(\@features); + Description: Constructs an EMBL location string from a list of features + Returntype : string + Exceptions : none + Caller : internal + +=cut + +sub features2location { + my $self = shift; + my $features = shift; + + my @join = (); + + foreach my $f (@$features) { + my $slice = $f->slice; + my $start = $f->start(); + my $end = $f->end(); + my $strand = $f->strand(); + + if($start >= 1 && $end <= $slice->length) { + #this feature in on a slice and doesn't lie outside the boundary + + if($strand == 1) { + push @join, "$start..$end"; + } else { + push @join, "complement($start..$end)"; + } + } else { + my @fs = (); + #this feature is outside the boundary of the dump, + # yet implemented and 'seqlevel' is guaranteed to be 1step + my $projection = $f->project('seqlevel'); + foreach my $segment (@$projection) { + my $slice = $segment->[2]; + my $slc_start = $slice->start(); + my $slc_end = $slice->end(); + my $seq_reg = $slice->seq_region_name(); + if($slice->strand == 1) { + push @join, "$seq_reg:$slc_start..$slc_end"; + } else { + push @join, "complement($seq_reg:$slc_start..$slc_end)"; + } + } + } + } + + my $out = join ',', @join; + + if(scalar @join > 1) { + $out = "join($out)"; + } + + return $out; +} + + +sub _date_string { + my $self = shift; + + my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time()); + + my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon]; + $year += 1900; + + return "$mday-$month-$year"; +} + + +sub write { + my ($self, $FH, $FORMAT, @values) = @_; + + #while the last value still contains something + while(defined($values[-1]) and $values[-1] ne '') { + formline($FORMAT, @values); + $self->print( $FH, $^A ); + $^A = ''; + } +} + +sub write_genbank_seq { + my $self = shift; + my $FH = shift; + my $seq = shift; + my $base_total = shift; + + $base_total ||= 0; + + my $GENBANK_SEQ = +'@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~ +'; + + my $total = -59 + $base_total; + #keep track of total and print lines of 60 bases with spaces every 10bp + while($$seq) { + $total += 60; + formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq); + $self->print( $FH, $^A ); + $^A = ''; + } +} + +sub write_embl_seq { + my $self = shift; + my $FH = shift; + my $seq = shift; + my $base_total = shift; + + $base_total ||= 0; + + my $EMBL_SEQ = +' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<@>>>>>>>>>~ +'; + #keep track of total and print lines of 60 bases with spaces every 10bp + my $length = length($$seq); + my $total = $length - $base_total; + while($$seq) { + $total -= 60; + $total = 0 if($total < 0); + formline($EMBL_SEQ, + $$seq, $$seq, $$seq, $$seq, $$seq, $$seq, + $length - $total); + $self->print( $FH, $^A ); + $^A = ''; + } +} + +sub print { + my( $self, $FH, $string ) = @_; + if(!print $FH $string){ + print STDERR "Problem writing to disk\n"; + print STDERR "the string is $string\n"; + die "Could not write to file handle"; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/SeqRegionCache.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/SeqRegionCache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,97 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::SeqRegionCache - A shared LRU cache of information about +seq_regions + +=head1 SYNOPSIS + + use Bio::EnsEMBL::DBSQL::DBAdaptor; + + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + + $seq_region_cache = $db->get_SeqRegionCache(); + + $key = "$seq_region_name:$coord_system_id"; + + $array = $seq_region_cache->{$key}; + + if ($array) { + $name = $array->[1]; + $length = $array->[3]; + } else { + # cache miss, get the info from the database + # ... + + # cache the retrieved information + $seq_region_cache->{$key} = [ + $seq_region_id, $seq_region_name, + $coord_system_id, $seq_region_length + ]; + } + +=head1 DESCRIPTION + +This module is simply a convenient place to put a cache of sequence +region information which is shared by several adaptors for a given +database. + +=head1 METHODS + +=cut + +use strict; +use Bio::EnsEMBL::Utils::Cache; + +package Bio::EnsEMBL::Utils::SeqRegionCache; + +our $SEQ_REGION_CACHE_SIZE = 40000; + + + +sub new { + my $class = shift; + + my %id_cache; + my %name_cache; + + # + # the items to cache should be listrefs to + # [ sr_id, sr_name, cs_id, sr_length ] + # + # The name cache key is "sr_name:cs_id" + # The id cache is keyed on "sr_id" + # + + tie(%name_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE); + tie(%id_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE); + + return bless {'name_cache' => \%name_cache, + 'id_cache' => \%id_cache}, $class; +} + + +1; + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Sequence.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Sequence.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,113 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Sequence - Utility functions for sequences + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp expand); + + my $seq = 'ACTTTAAAGGCTATCCCAATATG'; + + print "my sequence = $seq\n"; + + reverse_comp( \$seq ); + + print "my reverse comp = $seq\n"; + + my $compressed_seq = '(AC)3'; + + print "my expanded seq is = expand($compressed_seq)"; + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::Sequence; + +use strict; +use warnings; + +use Exporter; + +use vars qw(@ISA @EXPORT_OK); + +@ISA = qw(Exporter); + +@EXPORT_OK = qw(&reverse_comp &expand); + + +=head2 reverse_comp + + Arg [1] : reference to a string $seqref + Example : use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); + + $seq = 'ACCTGAA'; + reverse_comp(\$seq); + print $seq; + + Description: Does an in place reverse compliment of a passed in string + reference. The string is passed by reference + rather than by value for memory efficiency. + Returntype : none + Exceptions : none + Caller : SequenceAdaptor, SliceAdaptor + +=cut + +sub reverse_comp { + my $seqref = shift; + + $$seqref = reverse( $$seqref ); + $$seqref =~ + tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + + return; +} + +=head2 expand + + Arg [1] : reference to a string $seqref + Example : use Bio::EnsEMBL::Utils::Sequence qw(expand); + + $seq = '(AC)3'; + expand(\$seq); + print $seq; + + + Description: Expands a genomic sequence. The string is passed by reference + rather than by value for memory efficiency. + Returntype : none + Exceptions : none + Caller : SequenceAdaptor, SliceAdaptor + +=cut + +sub expand { + my $seq_ref = shift; + $$seq_ref =~ s/(\w*)\((\w+)\)(\d+)/$1.$2 x $3/eg if ($$seq_ref =~ /\(/);#expressions with parenthesis, expand the alleles + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/Slice.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Slice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,137 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Slice - Utility functions for slices + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Slice qw(split_Slices); + + # ... + + # get all chromosomes in the database + my $slices = $slice_adaptor->fetch_all('chromosome'); + + # split the chromosomes into equal chunks of size less than 1MB + # with an overlap of 1kb + $slices = split_Slices( $slices, 1e6, 1e3 ); + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::Slice; + +use strict; +use warnings; + +use Exporter; + +use vars qw(@ISA @EXPORT_OK); + +@ISA = qw(Exporter); + +@EXPORT_OK = qw(&split_Slices); + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use POSIX; + +=head2 split_Slices + + Arg [1] : ref to list of slices + Arg [2] : int maxlength of sub slices + Arg [3] : int overlap length (optional) + Example : my $sub_slices = split_Slices($slices,$maxlen,$overlap) + Description: splits a slice into smaller slices + Returntype : ref to list of slices + Exceptions : maxlen <1 or overlap < 0 + +=cut + +sub split_Slices{ + my ($slice_big,$max_length,$overlap)=@_; + + if(!defined($max_length) or $max_length < 1){ + throw("maxlength needs to be set and > 0"); + } + + if(!defined($overlap)){ + $overlap = 0; + } + elsif($overlap < 0){ + throw("negative overlaps not allowed"); + } + + my @out=(); + + foreach my $slice (@$slice_big){ + + my $start = $slice->start; + my $end; + my $multiple; + my $number; + my $length = $slice->length; + + if($max_length && ($length > $overlap)) { + #No seq region may be longer than max_length but we want to make + #them all similar size so that the last one isn't much shorter. + #Divide the seq_region into the largest equal pieces that are shorter + #than max_length + + #calculate number of slices to create + $number = ($length-$overlap) / ($max_length-$overlap); + $number = ceil($number); #round up to int + + #calculate length of created slices + $multiple = $length / $number; + $multiple = floor($multiple); #round down to int + } else { + #just one slice of the whole seq_region + $number = 1; + $multiple = $length; + } + + my $i; + for(my $i=0; $i < $number; $i++) { + $end = $start + $multiple + $overlap; + + #any remainder gets added to the last slice of the seq_region + $end = $slice->end if($i == $number-1); + push @out, Bio::EnsEMBL::Slice->new + (-START => $start, + -END => $end, + -STRAND => 1, + -SEQ_REGION_NAME => $slice->seq_region_name, + -SEQ_REGION_LENGTH => $slice->seq_region_length, + -COORD_SYSTEM => $slice->coord_system, + -ADAPTOR => $slice->adaptor); + $start += $multiple + 1; + } + } + + return \@out; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/SqlHelper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/SqlHelper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1080 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::SqlHelper + +=head1 VERSION + +$Revision: 1.25 $ + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::SqlHelper; + + my $helper = + Bio::EnsEMBL::Utils::SqlHelper->new( -DB_CONNECTION => $dbc ); + + my $arr_ref = $helper->execute( + -SQL => 'select name, age from tab where col =?', + -CALLBACK => sub { + my @row = @{ shift @_ }; + return { name => $row[0], age => $row[1] }; + }, + -PARAMS => ['A'] ); + + use Data::Dumper; + print Dumper($arr_ref), "\n"; + # Prints out [name=>'name', age=>1] maybe .... + + + # For transactional work; only works if your MySQL table + # engine/database supports transactional work (such as InnoDB) + + $helper->transaction( + -CALLBACK => sub { + if ( $helper->execute_single_result( + -SQL => 'select count(*) from tab' + ) ) + { + return $helper->execute_update('delete from tab'); + } else { + return + $helper->batch( -SQL => 'insert into tab (?,?)', + -DATA => [ [ 1, 2 ], [ 1, 3 ], [ 1, 4 ] ] ); + } + } ); + +=head1 DESCRIPTION + +Easier database interaction + +=head1 METHODS + +See subrotuines. + +=cut + +package Bio::EnsEMBL::Utils::SqlHelper; + +use warnings; +use strict; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Iterator; +use English qw( -no_match_vars ); #Used for $PROCESS_ID +use Scalar::Util qw(weaken); #Used to not hold a strong ref to DBConnection + +=pod + +=head2 new() + + Arg [DB_CONNECTION] : Bio::EnsEMBL::DBSQL::DBConnection $db_connection + Returntype : Instance of helper + Exceptions : If the object given as a DBConnection is not one or it + was undefined + Status : Stable + +Creates a new instance of this object. + + my $dba = get_dba('mydb'); # New DBAdaptor from somewhere + my $helper = Bio::EnsEMBL::Utils::SqlHelper->new( + -DB_CONNECTION => $dba->dbc() ); + + $helper->execute_update( -SQL => 'update tab set flag=?', + -PARAMS => [1] ); + +=cut + +sub new { + my ( $class, @args ) = @_; + + my ($db_connection) = rearrange([qw(db_connection)], @args); + + my $self = bless( {}, ref($class) || $class ); + throw('-DB_CONNECTION construction parameter was undefined.') + unless defined $db_connection; + $self->db_connection($db_connection); + + return $self; +} + +=pod + +=head2 db_connection() + + Arg [1] : Bio::EnsEMBL::DBSQL::DBConnection $db_connection + Description : Sets and retrieves the DBConnection + Returntype : Bio::EnsEMBL::DBSQL::DBConnection + Exceptions : If the object given as a DBConnection is not one or if an + attempt is made to set the value more than once + Status : Stable + +=cut + +sub db_connection { + my ($self, $db_connection) = @_; + if(defined $db_connection) { + if(exists $self->{db_connection}) { + throw('Cannot reset the DBConnection object; already defined '); + } + assert_ref($db_connection, 'Bio::EnsEMBL::DBSQL::DBConnection', 'db_connection'); + $self->{db_connection} = $db_connection; + weaken $self->{db_connection}; + } + return $self->{db_connection}; +} + +# --------- SQL Methods + +=pod + +=head2 execute() - Execute a SQL statement with a custom row handler + + Arg [SQL] : string SQL to execute + Arg [CALLBACK] : CodeRef; The callback to use for mapping a row to a data + point; leave blank for a default mapping to a 2D array + Arg [USE_HASHREFS] : boolean If set to true will cause HashRefs to be returned + to the callback & not ArrayRefs + Arg [PARAMS] : ArrayRef The binding parameters to the SQL statement + Arg [PREPARE_PARAMS] : boolean Parameters to be passed onto the Statement Handle + prepare call + Arg [ITERATOR] : boolean Request a L + rather than a 2D array + Returntype : ArrayRef/L + Exceptions : If errors occur in the execution of the SQL + Status : Stable + + my $arr_ref = $helper->execute( + -SQL => 'select a,b,c from tab where col =?', + -CALLBACK => sub { + my @row = @{ shift @_ }; + return { A => $row[0], B => $row[1], C => $row[2] }; + }, + -PARAMS => ['A'] ); + + #Or with hashrefs + my $arr_ref = $helper->execute( + -SQL => 'select a,b,c from tab where col =?', + -USE_HASHREFS => 1, + -CALLBACK => sub { + my $row = shift @_; + return { A => $row->{a}, B => $row->{b}, C => $row->{c} }; + }, + -PARAMS => ['A'] ); + +Uses a callback defined by the C decalaration. Here we specify how +the calling code will deal with each row of a database's result set. The +sub can return any type of Object/hash/data structure you require. + +Should you not specify a callback then a basic one will be assigned to +you which will return a 2D array structure e.g. + + my $arr_ref = $helper->execute( + -SQL => 'select a,b,c from tab where col =?', + -PARAMS => ['A'] ); + +This is equivalent to DBI's c subroutine. + +As an extension to this method you can write a closure subroutine which +takes in two parameters. The first is the array/hash reference & the +second is the statement handle used to execute. 99% of the time you will +not need it but there are occasions where you do need it. An example of +usage would be: + + my $conn = get_conn(); #From somwewhere + my $arr_ref = $conn->execute( + -SQL => 'select a,b,c from tab where col =?', + -USE_HASHREFS => 1, + -CALLBACK => sub { + my ( $row, $sth ) = @_; + #Then do something with sth + return { A => $row->[0], B => $row->[1], C => $row->[2] }; + }, + -PARAMS => ['A'] ); + +Any arguments to bind to the incoming statement. This can be a set of scalars +or a 2D array if you need to specify any kind of types of sql objects i.e. + + use DBI qw(:sql_types); + + my $conn = get_conn(); + my $arr_ref = $conn->execute( + -SQL => + 'select a,b,c from tab where col =? and num_col=? and other=?', + -USE_HASHREFS => 1, + -CALLBACK => sub { + my @row = @{ shift @_ }; + return { A => $row[0], B => $row[1], C => $row[2] }; + }, + -PARAMS => [ '1', SQL_VARCHAR ], + [ 2, SQL_INTEGER ], + 'hello' ); + +Here we import DBI's sql types into our package and then pass in +multiple anonymous array references as parameters. Each param is +tested in the input and if it is detected to be an ARRAY reference we +dereference the array and run DBI's bind_param method. In fact you can +see each part of the incoming paramaters array as the contents to call +C with. The only difference is the package tracks the bind +position for you. + +We can get back a L object which can be used +to iterate over the results set without first materializing the data into +memory. An example would be: + + my $iterator = $helper->execute( + -SQL => 'select a,b,c from tab where col =?', + -PARAMS => ['A'] + -ITERATOR => 1); + while($iterator->has_next()) { + my $row = $iterator->next(); + #Do something + } + +This is very useful for very large datasets. + +=cut + +sub execute { + my ( $self, @args ) = @_; + my ($sql, $callback, $use_hashrefs, $params, $prepare_params, $iterator) = + rearrange([qw(sql callback use_hashrefs params prepare_params iterator)], @args); + my $has_return = 1; + + #If no callback then we execute using a default one which returns a 2D array + if(!defined $callback) { + throw('Cannot use fetchrow_hashref() with default mappers. Turn off this option') if $use_hashrefs; + $callback = $self->_mappers()->{array_ref}; + } + + return $self->_execute( $sql, $callback, $has_return, $use_hashrefs, $params, $prepare_params, $iterator ); +} + +=pod + +=head2 execute_simple() + + Arg [SQL] : string $sql + Arg [PARAMS] : ArrayRef $params + Arg [CALLBACK] : CodeRef $callback + Returntype : ArrayRef of 1D elements + Exceptions : If errors occur in the execution of the SQL + Status : Stable + + my $classification = + $helper->execute_simple( + -SQL => + 'select meta_val from meta where meta_key =? order by meta_id', + -PARAMS => ['species.classification'] ); + +Identical to C except you do not specify a sub-routine reference. +Using this code assumes you want an array of single scalar values as returned +by the given SQL statement. + +=cut + +sub execute_simple { + my ( $self, @args ) = @_; + my ($sql, $params, $callback) = rearrange([qw(sql params callback)], @args); + my $has_return = 1; + my $use_hashrefs = 0; + $callback ||= $self->_mappers()->{first_element}; + return $self->_execute($sql, $callback, $has_return, $use_hashrefs, $params); +} + +=pod + +=head2 execute_no_return() + + Arg [SQL] : string sql + Arg [CALLBACK] : CodeRef The callback to use for mapping a row to a data point; + we assume you are assigning into a data structure which + has requirements other than simple translation into an + array + Arg [USE_HASHREFS] : boolean If set to true will cause HashRefs to be returned + to the callback & not ArrayRefs + Arg [PARAMS] : ArrayRef The binding parameters to the SQL statement + Returntype : None + Exceptions : If errors occur in the execution of the SQL + Status : Stable + +Whilst all other execute methods will return something; this assumes that the +given mapper subroutine will be performing the business of placing values +somewhere or doing something with them. + +There is a huge temptation to nest queries using this method; do not! Execute +the values into an array using one of the other methods then run your subqueries +on them; or make a better first query. SQL is flexible; so use it. + +=cut + +sub execute_no_return { + my ( $self, @args ) = @_; + my ($sql, $callback, $use_hashrefs, $params) = rearrange([qw(sql callback use_hashrefs params)], @args); + throw('No callback defined but this is a required parameter for execute_no_return()') if ! $callback; + my $has_return = 0; + my $prepare_params = []; + $self->_execute( $sql, $callback, $has_return, $use_hashrefs, $params); + return; +} + +=pod + +=head2 execute_into_hash() + + Arg [SQL] : string $sql + Arg [CALLBACK] : CodeRef The callback to use for mapping to a value in a hash + keyed by the first element in your result set; + leave blank for a default mapping to a scalar value + of the second element + Arg [PARAMS] : The binding parameters to the SQL statement + Returntype : HashRef keyed by column 1 & value is the return of callback + Exceptions : If errors occur in the execution of the SQL + Status : Stable + +A variant of the execute methods but rather than returning a list of +mapped results this will assume the first column of a returning map & +the calling subroutine will map the remainder of your return as the +hash's key. + +B. + +For example: + + my $sql = 'select key, one, two from table where something =?'; + my $mapper = sub { + my ( $row, $value ) = @_; + #Ignore field 0 as that is being used for the key + my $obj = Some::Obj->new( one => $row->[1], two => $row->[2] ); + return $obj; + }; + + my $hash = + $helper->execute_into_hash( -SQL => $sql, + -CALLBACK => $mapper, + -PARAMS => ['val'] ); + + #Or for a more simple usage + my $sql = 'select biotype, count(gene_id) from gene group by biotype'; + my $biotype_hash = $conn->execute_into_hash( -SQL => $sql ); + print $biotype_hash->{protein_coding} || 0, "\n"; + +The basic pattern assumes a scenario where you are mapping in a one +key to one value. For more advanced mapping techniques you can use the +second value passed to the subroutine paramater set. This is shown as +C<$value> in the above examples. This value is what is found in the HASH +being populated in the background. So on the first time you encounter it +for the given key it will be undefined. For future invocations it will +be set to the value you gave it. This allows us to setup code like the +following + + my %args = ( -SQL => 'select meta_key, meta_value from meta ' + . 'where meta_key =? order by meta_id', + -PARAMS => ['species.classification'] ); + + my $hash = $helper->execute_into_hash( + %args, + -CALLBACK => sub { + my ( $row, $value ) = @_; + $value = [] if !defined $value; + push( @{$value}, $row->[1] ); + return $value; + } ); + + #OR + + $hash = $helper->execute_into_hash( + %args, + -CALLBACK => sub { + my ( $row, $value ) = @_; + if ( defined $value ) { + push( @{$value}, $row->[1] ); + return; + } + my $new_value = [ $row->[1] ]; + return $new_value; + } ); + +The code understands that returning a defined value means to push this +value into the background hash. In example one we keep on re-inserting +the Array of classifications into the hash. Example two shows an early +return from the callback which indicates to the code we do not have any +value to re-insert into the hash. Of the two methods example one is +clearer but is possibliy slower. + +B Therefore indexing for the data you are concerned +with begins at position 1. + +=cut + +sub execute_into_hash { + my ( $self, @args ) = @_; + my ($sql, $callback, $params) = rearrange([qw(sql callback params)], @args); + my $hash = {}; + + #If no callback then we execute using a default one which sets value to 2nd element + if(!defined $callback) { + $callback = $self->_mappers()->{second_element}; + } + + #Default mapper uses the 1st key + something else from the mapper + my $mapper = sub { + my $row = shift @_; + my $key = $row->[0]; + my $value = $hash->{$key}; + my $new_value = $callback->($row, $value); + if(defined $new_value) { + $hash->{ $key } = $new_value; + } + return; + }; + + $self->execute_no_return( + -SQL => $sql, + -CALLBACK => $mapper, + -PARAMS => $params + ); + + return $hash; +} + +=pod + +=head2 execute_single_result() + + Arg [SQL] : string $sql + Arg [CALLBACK] : CodeRef The callback to use for mapping a row to a data point; + leave blank for a default scalar mapping + Arg [USE_HASHREFS] : boolean If set to true will cause HashRefs to be returned + to the callback & not ArrayRefs + Arg [PARAMS] : ArrayRef The binding parameters to the SQL statement + Returntype : Scalar + Exceptions : If errors occur in the execution of the SQL, if the query + returned more than 1 row and if we found no rows. + Status : Stable + + my $meta_count = + $helper->execute_single_result( + -SQL => 'select count(*) from meta where species_id =?', + -PARAMS => [1] ); + +Very similar to C except it will raise an exception if we have more +or less than one row returned + +=cut + +sub execute_single_result { + my ( $self, @args ) = @_; + my ($sql, $callback, $use_hashrefs, $params) = rearrange( + [qw(sql callback use_hashrefs params)], @args); + + my $results = $self->execute_simple( + -SQL => $sql, + -CALLBACK => $callback, + -USE_HASHREFS => $use_hashrefs, + -PARAMS => $params + ); + + my $result_count = scalar(@{$results}); + if($result_count != 1) { + $params = [] if ! $params; + my $type = ($result_count == 0) ? 'No' : 'Too many'; + my $msg = "${type} results returned. Expected 1 but got $result_count for query '${sql}' with params ["; + $msg .= join( ',', map {(defined $_) ? $_ : '-undef-';} @{$params} ); + $msg .= ']'; + throw($msg); + } + return $results->[0]; +} + +=pod + +=head2 transaction() + + Arg [CALLBACK] : CodeRef The callback used for transaction isolation; once + the subroutine exists the code will decide on rollback + or commit. Required + Arg [RETRY] : integer the number of retries to attempt with this + transactional block. Defaults to 0. + Arg [PAUSE] : integer the time in seconds to pause in-between retries. + Defaults to 1. + Arg [CONDITION] : CodeRef allows you to inspect the exception raised + and should your callback return true then the + retry will be attempted. If not given then all + exceptions mean attempt a retry (if specified) + Returntype : Return of the callback + Exceptions : If errors occur in the execution of the SQL + Status : Stable + + my $val = $helper->transaction( + -CALLBACK => sub { + my ($dbc) = @_; + #Do something + return 1; + } ); + + #Or because of the arguments method we use + my $val = $helper->transaction( + sub { + my ($dbc) = @_; + #Do something + return 1; + } ); + +Creates a transactional block which will ensure that the connection is +committed when your submmited subroutine has finished or will rollback +in the event of an error occuring in your block. + +The code will always force AutoCommit off but will restore it to its +previous setting. If your DBI/DBD driver does not support manual +commits then this code will break. The code will turn off the +C method to allow transactions to work as +expected. + +An effect of using REPEATABLE READ transaction isolation (InnoDB's +default) is that your data is as fresh as when you started your current +transaction. To ensure the freshest data use C if you are +going to issue updates. + +Creating a transaction within a transaction results in the commit +rollback statements occuring in the top level transaction. That way any +block of code which is meant to to be transaction can be wrapped in +this block ( assuming the same instance of SQLHelper is passed around & +used). + +You can also request the retry of a transactional block of code which is +causing problems. This is not a perfect solution as it indicates your +programming model is broken. This mode can be specified as such: + + my $val = $helper->transaction( + -RETRY => 3, -PAUSE => 2, + -CALLBACK => sub { + my ($dbc) = @_; + #Do something + return 1; + } ); + +The C<-RETRY> argument indicates the number of times we attempt the transaction +and C<-PAUSE> indicates the time in-between attempts. These retries will +only occur in the root transaction block i.e. you cannot influence the +retry system in a sub transaction. You can influence if the retry is done with +the C<-CONDITION> argument which accepts a Code reference (same as the +C<-CALLBACK> parameter). This allows you to inspect the error thrown to +retry only in some situations e.g. + + my $val = $helper->transaction( + -RETRY => 3, -PAUSE => 2, + -CALLBACK => sub { + my ($dbc) = @_; + #Do something + return 1; + }, + -CONDITION => sub { + my ($error) = @_; + return ( $error =~ /deadlock/ ) ? 1 : 0; + } + ); + +Here we attempt a transaction and will B retry when we have an error +with the phrase deadlock. + +=cut + +sub transaction { + my ($self, @args) = @_; + + my ($callback, $retry, $pause, $condition) = rearrange([qw(callback retry pause condition)], @args); + + throw('-CALLBACK was not a CodeRef. Got a reference of type ['.ref($callback).']. Check your parameters') + unless check_ref($callback, 'CODE'); + + #Setup defaults + $retry = 0 unless defined $retry; + $pause = 1 unless defined $pause; + if(! defined $condition) { + $condition = sub { + return 1; + }; + } + + assert_ref($condition, 'CODE', '-CONDITION'); + + my $dbc = $self->db_connection(); + my $original_dwi; + my $ac; + + my $error; + my $result; + + #If we were already in a transaction then we do not do any management of the + #session & wait for the parent transaction(s) to finish + my $perform_transaction = $self->_perform_transaction_code(); + if($perform_transaction) { + ($original_dwi, $ac) = $self->_enable_transaction(); + } + else { + #If we were in a transaction then ignore any attempts at retry here + $retry = 0; + } + + for(my $iteration = 0; $iteration <= $retry; $iteration++) { + eval { + $result = $callback->($dbc); + $dbc->db_handle()->commit() if $perform_transaction; + }; + $error = $@; + #If we were allowed to deal with the error then we apply rollbacks & then + #retry or leave to the remainder of the code to throw + if($perform_transaction && $error) { + eval { $dbc->db_handle()->rollback(); }; + #If we were not on our last iteration then warn & allow the retry + if($iteration != $retry) { + if($condition->($error)) { + warn("Encountered error on attempt ${iteration} of ${retry} and have issued a rollback. Will retry after sleeping for $pause second(s): $error"); + sleep $pause; + } + else { + last; #break early if condition of error was not matched + } + } + } + + #Always break the loop if we had a successful attempt + last if ! $error; + } + + if($perform_transaction) { + $self->_disable_transaction($original_dwi, $ac); + } + + throw("ABORT: Transaction aborted because of error: ${error}") if $error; + + return $result; +} + +=pod + +=head2 execute_update() + + Arg [SQL] : string $sql + Arg [CALLBACK] : CodeRef The callback to use for calling methods on the + DBI statement handle or DBConnection object after an + update command + Arg [PARAMS] : ArrayRef The binding parameters to the SQL statement + Arg [PREPARE_PARAMS] : ArrayRef Parameters to bind to the prepare() StatementHandle call + Returntype : Number of rows affected + Exceptions : If errors occur in the execution of the SQL + Status : Stable + +Used for performing updates but conforms to the normal execute statement +subroutines. + + use DBI qw(:sql_types); + $helper->execute_update(-SQL => 'update tab set name = ? where id =?', + -PARAMS => [ 'andy', [ 1, SQL_INTEGER ] ] ); + +If you need to do something a bit more advanced with your DML then you can +give the method a closure and this will be called after the execute has been +issued i.e. + + my $obj; + $helper->execute_update( + -SQL => 'insert into tab (name) values(?)', + -CALLBACK => sub { + my ( $sth, $dbh ) = @_; + $obj->{id} = $dbh->{mysql_insertid}; + }, + -PARAMS => [ $obj->name() ] ); + +This lets us access the statement handle & database handle to access other +properties such as the last identifier inserted. + +=cut + +sub execute_update { + my ($self, @args) = @_; + my ($sql, $callback, $params, $prepare_params) = rearrange([qw(sql callback params prepare_params)], @args); + my $rv = 0; + my $sth; + eval { + my @prepare_params; + @prepare_params = @{$prepare_params} if check_ref($prepare_params, 'ARRAY'); + $sth = $self->db_connection()->prepare($sql, @prepare_params); + $self->_bind_params($sth, $params); + $rv = $sth->execute(); + $callback->($sth, $self->db_connection()->db_handle()) if $callback; + }; + my $error = $@; + $self->_finish_sth($sth); + if($error) { + my $params = join ' ', map { (defined $_) ? $_ : q{undef} } @{$params}; + throw("Cannot apply sql '${sql}' with params '${params}': ${error}"); + } + return $rv; +} + +=head2 execute_with_sth() + + Arg [SQL] : string $sql + Arg [CALLBACK] : CodeRef The callback to use for working with the statement + handle once returned. This is B a mapper. + Arg [PARAMS] : ArrayRef The binding parameters to the SQL statement + Arg [PREPARE_PARAMS] : ArrayRef Used to pass parameters to the statement handle + prepare method + Description : A subrotuine which abstracts resource handling and statement + preparing leaving the developer to define how to handle + and process the statement. + Returntype : Anything you wish to return from the callback + Exceptions : If errors occur in the execution of the SQL + Status : Stable + + my $meta_count = $helper->execute_with_sth( + -SQL => 'select count(*) from meta where species_id =?', + -PARAMS => [1], + -CALLBACK => sub { + my ($sth) = @_; + my $count; + $sth->bind_columns( \$count ); + while ( $sth->fetch ) { + print $count, "\n"; + } + return $count; + } ); + +Very similar to C except this gives you full control over the +lifecycle of the statement handle & how you wish to proceed with working +with a statement handle. This is for situations where you believe going through +the mappers causes too much of a slow-down (since we have to execute a +subroutine for every row in order to map it correctly). + +However please benchmark before adopting this method as it increases the +complexity of your code and the mapper slow down only becomes apparent when +working with very large numbers of rows. + +=cut + +sub execute_with_sth { + my ($self, @args) = @_; + my ($sql, $callback, $params, $prepare_params) = rearrange([qw(sql callback params prepare_params)], @args); + my $sth = $self->_base_execute( $sql, $params, $prepare_params, $callback ); + my $result = eval {$callback->($sth)}; + my $error = $@; + $self->_finish_sth($sth); + die $error if $error; + return $result; +} + +=pod + +=head2 batch() + + Arg [SQL] : string $sql + Arg [CALLBACK] : CodeRef The callback to use for working with the statement + handle once returned; specify this or -DATA + Arg [DATA] : ArrayRef The data to insert; specify this or -CALLBACK + Arg [COMMIT_EVERY] : Integer defines the rate at which to issue commits to + the DB handle. This is important when working with + InnoDB databases since it affects the speed of rollback + (larger gaps inbetween commits means more to rollback). + + Ignored if using the callback version. + Arg [PREPARE_PARAMS] : ArrayRef Used to pass parameters to the statement handle + prepare method + Returntype : integer rows updated + Exceptions : If errors occur in the execution of the SQL + Status : Stable + + my $alotofdata = getitfromsomewhere(); + $helper->batch( + -SQL => 'insert into table (one,two) values(?,?)', + -CALLBACk => sub { + my ( $sth, $dbc ) = @_; + foreach my $data (@alotofdata) { + $sth->execute( @{$data} ); + } + } ); + + #Or for a 2D array data driven approach + $helper->batch( -SQL => 'insert into table (one,two) values(?,?)', + -DATA => $alotofdata ); + +Takes in a sql statement & a code reference. Your SQL is converted into a +prepared statement & then given as the first parameter to the closure. The +second parameter is the DBH which created the statement. This is intended +to let you do mass insertion into a database without the need to +re-preparing the same statement. + +This can be combined with the transaction() code to provide a construct +which does batch insertion & is transactionally aware. + +We can also use data based batch insertions i.e. + + #Needs to be like: + # [ [1,2], [3,4] ] + #Or if using the DBI types: + # [ [ [ 1, SQL_INTEGER ], [ 2, SQL_INTEGER ] ], + # [ [ 3, SQL_INTEGER ], [ 4, SQL_INTEGER ] ] ]; + + my $alotofdata = getitfromsomewhere(); + $helper->batch( -SQL => 'insert into table (one,two) values(?,?)', + -DATA => $alotofdata ); + +This does exactly what the previous example. + +All batch statements will return the value the callback computes. If you are +using the previous example with a data array then the code will return the +number affected rows by the query. + +=cut + +sub batch { + my ($self, @args) = @_; + my ($sql, $callback, $data, $commit_every, $prepare_params) = + rearrange([qw(sql callback data commit_every prepare_params)], @args); + + if(! defined $callback && ! defined $data) { + throw('You need to define a callback for insertion work or the 2D data array'); + } + + my $result; + if(defined $callback) { + $result = $self->_callback_batch($sql, $callback, $prepare_params); + } + else { + $result = $self->_data_batch($sql, $data, $commit_every, $prepare_params); + } + return $result if defined $result; + return; +} + +#------- Internal methods + +my $default_mappers = { + first_element => sub { + my ($row) = @_; + return $row->[0]; + }, + second_element => sub { + my ($row) = @_; + return $row->[1]; + }, + array_ref => sub { + my $row = shift @_; + return [@{$row}]; + } +}; + +sub _mappers { + my ($self) = @_; + return $default_mappers; +} + +sub _perform_transaction_code { + my ($self) = @_; + return $self->{_transaction_active}->{$PROCESS_ID} ? 0 : 1; +} + +sub _enable_transaction { + my ($self) = @_; + my $dbc = $self->db_connection(); + my $original_dwi = $dbc->disconnect_when_inactive(); + $dbc->disconnect_when_inactive(0); + my $ac = $dbc->db_handle()->{'AutoCommit'}; + $dbc->db_handle()->{'AutoCommit'} = 0; + $self->{_transaction_active}->{$PROCESS_ID} = 1; + return ($original_dwi, $ac); +} + +sub _disable_transaction { + my ($self, $original_dwi, $ac) = @_; + my $dbc = $self->db_connection(); + $dbc->db_handle()->{'AutoCommit'} = $ac; + $dbc->disconnect_when_inactive($original_dwi); + delete $self->{_transaction_active}->{$PROCESS_ID}; + return; +} + +sub _bind_params { + my ( $self, $sth, $params ) = @_; + + return if ! defined $params; #Return quickly if we had no data + + if(! check_ref($params, 'ARRAY')) { + throw(qq{The given parameters reference '${params}' is not an ARRAY; wrap in an ArrayRef}); + } + + my $count = 1; + foreach my $param (@{$params}) { + if ( check_ref($param, 'ARRAY') ) { + $sth->bind_param( $count, @{$param} ); + } + else { + $sth->bind_param( $count, $param ); + } + $count++; + } + return; +} + +sub _execute { + my ( $self, $sql, $callback, $has_return, $use_hashrefs, $params, $prepare_params, $iterator ) = @_; + + throw('Not given a mapper. _execute() must always been given a CodeRef') unless check_ref($callback, 'CODE'); + + my $sth = $self->_base_execute($sql, $params, $prepare_params); + + my $sth_processor; + if($use_hashrefs) { + $sth_processor = sub { + while( my $row = $sth->fetchrow_hashref() ) { + my $v = $callback->($row, $sth); + return $v if $has_return; + } + $self->_finish_sth($sth); + return undef; + }; + } + else { + $sth_processor = sub { + while( my $row = $sth->fetchrow_arrayref() ) { + my $v = $callback->($row, $sth); + return $v if $has_return; + } + $self->_finish_sth($sth); + return undef; + }; + } + + my $iter = Bio::EnsEMBL::Utils::Iterator->new($sth_processor); + if($has_return) { + return $iter if $iterator; + return $iter->to_arrayref(); + } + else { + #Force iteration if we had no return since the caller is expecting this + $iter->each(sub {}); + } + return; +} + +sub _base_execute { + my ( $self, $sql, $params, $prepare_params) = @_; + + $params = [] unless $params; + + my $conn = $self->db_connection; + + my $sth; + eval { + my @prepare_params; + @prepare_params = @{$prepare_params} if check_ref($prepare_params, 'ARRAY'); + $sth = $conn->prepare($sql, @prepare_params); + throw("Cannot continue as prepare() did not return a handle with prepare params '@prepare_params'") + unless $sth; + $self->_bind_params( $sth, $params ); + $sth->execute(); + }; + + my $error = $@; + if($error) { + throw("Cannot run '${sql}' with params '@{$params}' due to error: $error") if $error; + } + + return $sth; +} + +sub _finish_sth { + my ($self, $sth) = @_; + eval { $sth->finish() if defined $sth; }; + warn('Cannot finish() the statement handle: $@') if $@; + return; +} + +sub _callback_batch { + my ($self, $sql, $callback, $prepare_params) = @_; + my $error; + my $sth; + my $closure_return; + eval { + my @prepare_params; + @prepare_params = @{$prepare_params} if check_ref($prepare_params, 'ARRAY'); + $sth = $self->db_connection()->prepare($sql, @prepare_params); + $closure_return = $callback->($sth, $self->db_connection()); + }; + $error = $@; + $self->_finish_sth($sth); + throw("Problem detected during batch work: $error") if $error; + + return $closure_return if defined $closure_return; + return; +} + +sub _data_batch { + my ($self, $sql, $data, $commit_every, $prepare_params) = @_; + + #Input checks + assert_ref($data, 'ARRAY', '-DATA'); + my $data_length = scalar(@{$data}); + return 0 unless $data_length > 0; + my $first_row = $data->[0]; + throw('I expect to work with a 2D ArrayRef but this is not one') unless check_ref($first_row, 'ARRAY'); + + my $callback = sub { + my ($sth, $dbc) = @_; + my $total_affected = 0; + #Iterate over each data point + for(my $data_index = 0; $data_index < $data_length; $data_index++) { + my $row = $data->[$data_index]; + $self->_bind_params($sth, $row); + my $affected = eval {$sth->execute()}; + if($@) { + throw("Problem working with $sql with params @{$row}: $@"); + } + my $num_affected = ($affected) ? $affected : 0; #Get around DBI's 0E0 + $total_affected += $num_affected; + + #Lets us do a commit once every x rows apart from 0. We also finish + #off with a commit if the code told us we were doing it + if($commit_every) { + if( ($data_index % $commit_every == 0) && $data_index != 0) { + $dbc->db_handle()->commit(); + } + } + } + + #finish off with a commit if the code told us we were doing it + if($commit_every) { + $dbc->db_handle()->commit(); + } + + return $total_affected || 0; + }; + + return $self->_callback_batch($sql, $callback, $prepare_params) +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptAlleles.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptAlleles.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,703 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +TranscriptAlleles - A utility class used to obtain information about the +relationships between a transcript and Alleles + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::TranscriptAlleles; + + # get the peptide variations caused by a set of Alleles + + %variations = %{ + Bio::EnsEMBL::Utils::TranscriptAlleles::get_all_peptide_variations( + $transcript, $alleles ) }; + +=head1 DESCRIPTION + +This is a utility class which can be used to find consequence type of an +AlleleFeature in a transcript, and to determine the amino acid changes +caused by the AlleleFeature in the Transcript + + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::TranscriptAlleles; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Variation::ConsequenceType; +use vars qw(@ISA @EXPORT_OK); + +use Data::Dumper; + +@ISA = qw(Exporter); + +@EXPORT_OK = qw(&get_all_ConsequenceType &type_variation); + + +=head2 get_all_ConsequenceType + + Arg [1] : $transcript the transcript to obtain the peptide variations for + Arg [2] : $alleles listref of AlleleFeatures + Example : $consequence_types = get_all_ConsequenceType($transcript, \@alleles); + foreach my $ct (@{$consequence_types}){ + print "Allele : ", $ct->allele_string, " has a consequence type of :",$ct->type; + print " and is affecting the transcript with ",@{$ct->aa_alleles}, "in position ", + $ct->aa_start,"-", $ct->aa_end if (defined $ct->aa_alleles); + print "\n"; + } + Description: Takes a list of AlleleFeatures and a Transcritpt, and return a list + of ConsequenceType of the alleles in the given Transcript + Returntype : listref of Bio::EnsEMBL::Variation::ConsequenceType + Exceptions : none + Caller : general + +=cut + +sub get_all_ConsequenceType { + my $transcript = shift; + my $alleles = shift; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw('Bio::EnsEMBL::Transcript argument is required.'); + } + + if(!ref($alleles) || (ref($alleles) ne 'ARRAY')) { + throw('Reference to a list of Bio::EnsEMBL::Variation::AlleleFeature objects is required'); + } + + + my @alleles_ordered = sort { $a->start <=> $b->start} @$alleles; #sort the alleles by the genomic position + my @same_codon; #contains up to 3 allele features, that are in the same codon, but each position can contain more than 1 allele + my @out; #array containing the consequence types of the alleles in the transcript + foreach my $allele (@alleles_ordered) { +# foreach my $allele (@{$alleles}) { + #get consequence type of the AlleleFeature + # my $new_allele = $allele->transform('chromosome'); + #my $consequence_type = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript->dbID(),'',$allele->start,$allele->end,$allele->strand,[$allele->allele_string]); + ### REAL HACK BY js5 because something is borked in TranscriptMapper + ### This relies on the Allele being of the form i.e. a SNP! [ACGT-](/[ACGT-])+ + ### The rest don't work anyway until we have a AlignStrainSlice + ### MUST BE SORTED.... + + #we have to consider het alleles + my $allele_string; + if ($allele->allele_string =~ /[\|\\\/]/){ + my @alleles = split /[\|\\\/]/,$allele->allele_string; + if ($alleles[0] ne $allele->ref_allele_string){ + $allele_string = $alleles[0]; + } + else{ + $allele_string = $alleles[1]; + } + } + else{ + $allele_string = $allele->allele_string; + } + my $opposite_strand = 0; #to indicate wether transcript and allele and in different strands + my $transcript_allele = $allele_string; + if( $transcript->strand != $allele->strand ) { + $transcript_allele =~tr/ACGT/TGCA/; + $opposite_strand = 1; + } + + my $consequence_type = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript->dbID(),'',$allele->start, $allele->end, $transcript->strand, [$transcript_allele]); + #calculate the consequence type of the Allele if different from the reference Allele + #if (($opposite_strand && $allele->ref_allele_string eq $allele_string) || (!$opposite_strand && $allele->ref_allele_string eq $allele_string)){ #same allele as reference, there is no consequence, called SARA + if ($allele->ref_allele_string eq $allele_string) { #same allele as reference, there is no consequence, called SARA + #same allele as reference, there is no consequence, called SARA + #we have to calculate if there are more than 2 in the same codon + empty_codon(\@out,\@same_codon); + $consequence_type->type('SARA'); + push @out, $consequence_type; + next; + } + + my $ref_consequences = type_variation($transcript,"",$consequence_type); + if ($allele->start != $allele->end){ + empty_codon(\@out,\@same_codon); + #do not calculate for indels effects of 2 or more in same codon + push @out, @{$ref_consequences}; + next; + } + + my $new_consequence = shift @{$ref_consequences}; + if (! defined $new_consequence ) { + empty_codon(\@out,\@same_codon); + push @out, $consequence_type; # should be empty + next; + } + + if ( !defined $new_consequence->aa_start){ + empty_codon(\@out,\@same_codon); + push @out, $new_consequence; + next; + } + #first element of the codon + if (!defined $same_codon[0]){ + push @{$same_codon[0]}, $new_consequence; #goes to the first position + next; + } + #for alleles with aa effect, find out if they are in the same codon + if ($same_codon[-1]->[0]->aa_start == $new_consequence->aa_start){ + #they are in the same codon, find out if it is the same position + if ($same_codon[-1]->[0]->start == $new_consequence->start){ + #it is the same position + push @{$same_codon[-1]},$new_consequence; #push in the last + } + else{ + push @{$same_codon[$#same_codon + 1]},$new_consequence; #this is a new element in the codon + } + + } + else{ + #if there is more than one element in the same_codon array, calculate the effect of the codon + if (@same_codon > 1){ + calculate_same_codon(\@same_codon); + } + map {push @out, @{$_}} @same_codon; + @same_codon = (); + push @{$same_codon[0]}, $new_consequence; #push the element not in the same codon + } + } + #add last consequence_type + empty_codon(\@out,\@same_codon); + + return \@out; +} + +sub empty_codon{ + my $out = shift; + my $same_codon = shift; + + if (@{$same_codon} == 1){ + map {push @{$out}, @{$_}} @{$same_codon}; + } + elsif (@{$same_codon} > 1){ + calculate_same_codon($same_codon); + map {push @{$out}, @{$_}} @{$same_codon}; + } + @{$same_codon} = (); +} + +# recalculates the effect of 2 or 3 SNPs in the same codon +sub calculate_same_codon{ + my $same_codon = shift; + my $new_codon; + my $old_aa; + my $codon_table = Bio::Tools::CodonTable->new; + if (@{$same_codon} == 3){ + #if there are 3 alleles in the same codon + map {$new_codon .= @{$_->[0]->alleles};$old_aa = $_->[0]->aa_alleles()->[0]} @{$same_codon}; + } + else{ + #if there are 2 alleles affecting the same codon + my $first_pos = ($same_codon->[0]->[0]->cdna_start -1) % 3; #position of the first allele in the codon + my $second_pos = ($same_codon->[1]->[0]->cdna_start -1)% 3; #position of the second allele in the codon + if ($first_pos == 0){ + #codon starts with first allele + $new_codon = $same_codon->[0]->[0]->alleles->[0]; #first base in the codon + if ($second_pos == 1){ + $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #second base in the codon + $new_codon .= substr($same_codon->[1]->[0]->codon,2,1); #third base in the codon + } + else{ + $new_codon .= substr($same_codon->[1]->[0]->codon,1,1); #second base in the codon + $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #third base in the codon + } + } + else{ + #alleles are in position 1 and 2 in the codon + $new_codon = substr($same_codon->[1]->[0]->codon,0,1); #first base in the codon + $new_codon .= $same_codon->[0]->[0]->alleles->[0]; #second base in the codon + $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #third base in the codon + } + $old_aa = $same_codon->[0]->[0]->aa_alleles->[0]; + } + #calculate the new_aa + my $new_aa = $codon_table->translate($new_codon); + #and update the aa_alleles field in all the codons + foreach my $codon (@{$same_codon}){ + map {$_->aa_alleles([$old_aa,$new_aa])} @{$codon}; + } + +} +# +# Classifies a variation which is in the vicinity of a transcript +# +sub type_variation { + my $tr = shift; + my $g = shift; + my $var = shift; + + my $UPSTREAM = 5000; + my $DOWNSTREAM = 5000; + + #empty type first in the case of recursive call + $var->empty_type if defined $var->type; + + if (!$var->isa('Bio::EnsEMBL::Variation::ConsequenceType')) { + throw("Not possible to calculate the consequence type for ",ref($var)," : Bio::EnsEMBL::Variation::ConsequenceType object expected"); + } + + if (($var->start < $tr->start - $UPSTREAM) || ($var->end > $tr->end + $DOWNSTREAM)){ + #since the variation is more than UPSTREAM and DOWNSTREAM of the transcript, there is no effect in the transcript + return []; + } + + + # check the cache + my $tran_features = $tr->{_variation_effect_feature_cache}; + + # populate it if not found + unless ($tran_features) { + $tran_features = { + mapper => $tr->get_TranscriptMapper, + }; + + my ($attrib) = @{$tr->slice()->get_all_Attributes('codon_table')}; #for mithocondrial dna it is necessary to change the table + + my $codon_table; + $codon_table = $attrib->value() if($attrib); + $codon_table ||= 1; # default vertebrate codon table + + if ($tran_features->{translation} = $tr->translate(undef, undef, undef, $codon_table)) { + $tran_features->{translateable_seq} = $tr->translateable_seq; + + # to include the stop codon we need to translate the Bio::Seq sequence, not just + # $tr->translation, this is the source of the missing STOP_LOSTs + my $mrna_seqobj = Bio::Seq->new( + -seq => $tran_features->{translateable_seq}, + -moltype => 'dna', + -alphabet => 'dna' + ); + + $tran_features->{peptide} = $mrna_seqobj->translate(undef, undef, undef, $codon_table)->seq; + } + + $tr->{_variation_effect_feature_cache} = $tran_features; + } + + if ( !defined( $tran_features->{translation} ) ) + { # for other biotype rather than coding/IG genes + # check if the variation is completely outside the transcript: + + if ( $var->end() < $tr->start() ) { + $var->type( ( $tr->strand() == 1 ) ? 'UPSTREAM' : 'DOWNSTREAM' ); + return [$var]; + } + if ( $var->start() > $tr->end() ) { + $var->type( ( $tr->strand() == 1 ) ? 'DOWNSTREAM' : 'UPSTREAM' ); + return [$var]; + } + + if ( $var->start() >= $tr->start() and $var->end() <= $tr->end() ) + { # within the transcript + if ( $tr->biotype() eq "miRNA" ) { + my ($attribute) = @{ $tr->get_all_Attributes('miRNA') }; + + # the value is the mature miRNA coordinate within miRNA + # transcript + if ( defined($attribute) + && $attribute->value() =~ /(\d+)-(\d+)/ ) + { + # transfer cdna value to genomic coordinates + my @mapper_objs = $tr->cdna2genomic( $1, $2, $tr->strand() ); + + foreach my $obj (@mapper_objs) + { #Note you can get more than one mature seq per miRNA + if ( $obj->isa("Bio::EnsEMBL::Mapper::Coordinate") ) { + if ( $var->start() >= $obj->start() + and $var->end() <= $obj->end() ) + { + $var->type("WITHIN_MATURE_miRNA"); + return [$var]; + } + } + } + } + } + + $var->type("WITHIN_NON_CODING_GENE"); + return [$var]; + + } ## end if ( $var->start() >= ...) + } ## end if ( !defined( $tr->translation...)) + + # get a transcript mapper object + my $tm = $tran_features->{mapper}; + + # map to CDNA coords + my @cdna_coords = $tm->genomic2cdna($var->start,$var->end,$var->strand); + + # map to CDS cooords + my @cds_coords = $tm->genomic2cds($var->start, $var->end,$var->strand); + + # map to peptide coords + my @pep_coords = $tm->genomic2pep($var->start, $var->end, $var->strand); + + # get the phase of the first exon + my $exon_phase = $tr->start_Exon->phase; + + # check for partial codon consequence + if( + @pep_coords == 1 + && @cds_coords == 1 + && !($cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) + && !($pep_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) + ) { + + # get the CDS sequence + my $cds = $tran_features->{translateable_seq}; + + my $start = $pep_coords[0]->start(); + my $codon_cds_start = ($start * 3) - 2; + + my $last_codon_length = length($cds) - ($codon_cds_start - 1); + + if($last_codon_length < 3 && $last_codon_length > 0) { + $var->type("PARTIAL_CODON"); + + # add the CDS coords + $var->cds_start($cds_coords[0]->start + ($exon_phase > 0 ? $exon_phase : 0)); + $var->cds_end($cds_coords[0]->end + ($exon_phase > 0 ? $exon_phase : 0)); + + # add the cDNA coords + $var->cdna_start($cdna_coords[0]->start); + $var->cdna_end($cdna_coords[0]->end); + + return [$var]; + } + } + + + # Handle simple cases where the variation is not split into parts. + # Call method recursively with component parts in complicated case. + # E.g. a single multi-base variation may be both intronic and coding + + if(@cdna_coords > 1) { + my @out; + #this will be a new type, complex_indel + $var->type('COMPLEX_INDEL'); + return [$var]; +# foreach my $c (@coords) { +# my %new_var = %{$var}; +# $new_var{'end'} = $var->start + $c->length() - 1; +# $var->start( $new_var{'end'} + 1); +# #empty the type before re-run +# $var->empty_type ; +# push @out, @{type_variation($tr, $g, bless \%new_var, ref($var))}; +# } +# return \@out; + + + } + + # look at different splice distances + my @coords_splice_2 = $tm->genomic2cdna($var->start -2, $var->end +2, $var->strand); + my @coords_splice_3 = $tm->genomic2cdna($var->start -3, $var->end +3, $var->strand); + my @coords_splice_8 = $tm->genomic2cdna($var->start -8, $var->end +8, $var->strand); + + my ($splice_site_2, $splice_site_3, $splice_site_8); + + if (scalar @coords_splice_2 >1) { + $splice_site_2=1; + } + elsif (scalar @coords_splice_3 >1) { + $splice_site_3=1; + } + elsif (scalar @coords_splice_8 >1) { + $splice_site_8=1; + } + + + my $c = $cdna_coords[0]; + if($c->isa('Bio::EnsEMBL::Mapper::Gap')) { + + # check if the variation is completely outside the transcript: + + if($var->end < $tr->start()) { + $var->type( ($tr->strand() == 1) ? 'UPSTREAM' : 'DOWNSTREAM' ); + return [$var]; + } + if($var->start > $tr->end()) { + $var->type( ($tr->strand() == 1) ? 'DOWNSTREAM' : 'UPSTREAM' ); + return [$var]; + } + + # nonsense-mediated decay transcript + if($tr->biotype() eq 'nonsense_mediated_decay') { + $var->type("NMD_TRANSCRIPT"); + #return [$var]; + } + + # variation must be intronic since mapped to cdna gap, but is within + # transcript, note that ESSENTIAL_SPLICE_SITE only consider first (AG) and last (GT) 2 bases inside the intron. + # if variation is in intron, we need to check the lenth of intron, if it's shoter than 6, we call it SYNONYMOUS_CODING rather then INTRONIC + + foreach my $intron (@{$tran_features->{introns}}) { + if ($intron->length <=5) {#the length of frameshift intron could be 1,2,4,5 bases + if ($var->start>=$intron->start and $var->end<=$intron->end) { + #this is a type of SYNONYMOUS_CODING since changes happen in frameshift intron, which don't change exon structure + $var->type('SYNONYMOUS_CODING'); + return [$var]; + } + } + } + #if it's not in frameshift intron, then it's in normal intron + $var->type('INTRONIC'); + + if ($splice_site_2) { + $var->type('ESSENTIAL_SPLICE_SITE'); + } + elsif ($splice_site_3 or $splice_site_8) { + $var->type('SPLICE_SITE'); + } + return [$var]; + } + + # nonsense-mediated decay transcript + if($tr->biotype() eq 'nonsense_mediated_decay') { + $var->type("NMD_TRANSCRIPT"); + #return [$var]; + } + + #now variation must be in exons, the first 3 bs into exon could be splice_site + + if ($splice_site_2 or $splice_site_3) { + + my ($se_s, $se_e, $ee_s, $ee_e) = ($tr->start_Exon->start, $tr->start_Exon->end, $tr->end_Exon->start, $tr->end_Exon->end); + ($se_s, $se_e, $ee_s, $ee_e) = ($se_e, $se_s, $ee_e, $ee_s) if $tr->strand < 0; + + # check coord relative to first exon + # near beginning of first exon is obv not a splice site + if($var->start <= $se_e) { + if(abs($se_e - $var->start) <= 3) { + $var->type('SPLICE_SITE'); + } + } + + # also check relative to last exon + # near end of last exon is also not a splice site + elsif($var->start >= $ee_s) { + if(abs($ee_s - $var->start) <= 3) { + $var->type('SPLICE_SITE'); + } + } + + # if not near either end of transcript, then it is definitely a splice site + else { + $var->type('SPLICE_SITE'); + } + } + + $var->cdna_start( $c->start() ); + $var->cdna_end( $c->end() ); + + if(@cds_coords > 1) { +# my @out; + #this is a new type, complex_indel + $var->type('COMPLEX_INDEL'); + return [$var]; +# foreach my $c (@coords) { +# my %new_var = %{$var}; +# $new_var{'end'} = $var->start + $c->length() - 1; +# $var->start( $new_var{'end'} + 1); +# #empty the type before re-run +# $var->empty_type ; +# push @out, @{type_variation($tr, $g, bless \%new_var, ref($var))}; +# } +# return \@out; + } + + $c = $cds_coords[0]; + + if($c->isa('Bio::EnsEMBL::Mapper::Gap')) { + # mapped successfully to CDNA but not to CDS, must be UTR + + if($var->end < $tr->coding_region_start()) { + $var->type( ($tr->strand() == 1) ? '5PRIME_UTR' : '3PRIME_UTR' ); + } + elsif($var->start > $tr->coding_region_end()) { + $var->type( ($tr->strand() == 1) ? '3PRIME_UTR' : '5PRIME_UTR'); + } + else { + throw('Unexpected: CDNA variation which is not in CDS is not in UTR'); + } + return [$var]; + } + + # we need to add the exon phase on in case of weird transcripts + # where the first exon is not in normal phase + $var->cds_start( $c->start() + ($exon_phase > 0 ? $exon_phase : 0)); + $var->cds_end( $c->end() + ($exon_phase > 0 ? $exon_phase : 0)); + + + if(@pep_coords != 1 || $pep_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + throw("Unexpected: Could map to CDS but not to peptide coordinates."); + } + + $c = $pep_coords[0]; + + $var->aa_start( $c->start()); + $var->aa_end( $c->end()); + + apply_aa_change($tr, $var); + + return [$var]; +} + +# +# Determines the effect of a coding variation on the peptide sequence +# + +sub apply_aa_change { + my $tr = shift; + my $var = shift; + + my ($attrib) = @{$tr->slice()->get_all_Attributes('codon_table')}; #for mithocondrial dna it is necessary to change the table + + my $codon_table; + $codon_table = $attrib->value() if($attrib); + $codon_table ||= 1; # default vertebrate codon table + + # check the cache + my $tran_features = $tr->{_variation_effect_feature_cache}; + + # populate it if not found + unless ($tran_features) { + $tran_features = { + mapper => $tr->get_TranscriptMapper, + }; + + if ($tran_features->{translation} = $tr->translate(undef, undef, undef, $codon_table)) { + $tran_features->{translateable_seq} = $tr->translateable_seq; + + # to include the stop codon we need to translate the Bio::Seq sequence, not just + # $tr->translation, this is the source of the missing STOP_LOSTs + my $mrna_seqobj = Bio::Seq->new( + -seq => $tran_features->{translateable_seq}, + -moltype => 'dna', + -alphabet => 'dna' + ); + + $tran_features->{peptide} = $mrna_seqobj->translate(undef, undef, undef, $codon_table)->seq; + } + + $tr->{_variation_effect_feature_cache} = $tran_features; + } + + my $mrna = $tran_features->{translateable_seq}; # get from cache + + my $peptide = $tran_features->{peptide}; # get from cache + + my $len = $var->aa_end - $var->aa_start + 1; + my $old_aa = substr($peptide, $var->aa_start -1 , $len); + + my $codon_cds_start = $var->aa_start * 3 - 2; + my $codon_cds_end = $var->aa_end * 3; + my $codon_len = $codon_cds_end - $codon_cds_start + 1; + + my @alleles = @{$var->alleles}; + + my $var_len = $var->cds_end - $var->cds_start + 1; + + my @aa_alleles = ($old_aa); + + my $ref_codon = substr($mrna, $codon_cds_start-1, $codon_len); + my @codons; + push @codons, $ref_codon; + + #here could generate multi type if have multi-allele change: "ACTAGT/-/T" + foreach my $a (@alleles) { + $a =~ s/\-//; + my $cds = $mrna; + + if($var_len != length($a)) { + if(abs(length($a) - $var_len) % 3) { + # frameshifting variation, do not set peptide_allele string + # since too complicated and could be very long + + $var->type('FRAMESHIFT_CODING'); + return [$var]; + } + + if($codon_len == 0) { # insertion + $aa_alleles[0] = '-'; + $old_aa = '-'; + } + } + + my $new_aa; + + # change sequence + substr($cds, $var->cds_start-1, $var_len) = $a; + + # get the new codon + my $codon_str = substr($cds, $codon_cds_start-1, $codon_len + length($a)-$var_len); + + push @codons, $codon_str; + $var->codon($codon_str); #add the codon to the ConsequenceType object + my $codon_seq = Bio::Seq->new(-seq => $codon_str, + -moltype => 'dna', + -alphabet => 'dna'); + + $new_aa = $codon_seq->translate(undef,undef,undef,$codon_table)->seq(); + + if(length($new_aa)<1){ + $new_aa='-'; + } + + if(uc($new_aa) ne uc($old_aa)) { + push @aa_alleles, $new_aa; + if ($new_aa =~ /\*/) { + $var->type('STOP_GAINED'); + } + elsif ($old_aa =~ /\*/) { + $var->type('STOP_LOST'); + } + } + } + + #note if type is already defined as SOTP_GAINED OR STOP_LOST, then even @aa_alleles > 1, we are not given type + # of 'NON_SYNONYMOUS_CODING' + if(@aa_alleles > 1) { + if (!$var->type or (join ' ',@{$var->type}) !~ /STOP/) { + $var->type('NON_SYNONYMOUS_CODING'); + } + } + else { + $var->type('SYNONYMOUS_CODING'); + } + + #$var->codons(\@codons); + $var->aa_alleles(\@aa_alleles); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,444 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +TranscriptSNPs - A utility class used to obtain information about the +relationships between a transcript and SNPs + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::TranscriptSNPs; + + # get and type all snps in the region of the transcript + + %snps = %{ + Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript, + $flanking ) }; + + # get all snps overlapping the transcript in cdna coordinates + + %snps = + %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs( + $transcript) }; + + # get the peptide variations caused by a set of SNPs + + %variations = %{ + Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations( + $transcript, $snps ) }; + +=head1 DESCRIPTION + +This is a utility class which can be used to get snps associated with a +transcript, and to determine the amino acid changes caused by the SNPs + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::TranscriptSNPs; + +use strict; +use warnings; +no warnings 'uninitialized'; + + + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 get_all_peptide_variations + + Arg [1] : $transcript the transcript to obtain the peptide variations for + Arg [2] : $snps listref of coding snps in cdna coordinates + Example : $pep_hash = get_all_peptide_variations($transcript, \@snps); + Description: Takes a list of coding snps on this transcript in + which are in cdna coordinates and returns a hash with peptide + coordinate keys and listrefs of alternative amino acids as + values. The SNPs must additionally have a strand of 1 for the + sake of simplicity. Normally these could be generated using the + get_all_cdna_SNPs method. + + Note that the peptide encoded by the reference sequence is + also present in the results and that duplicate peptides + (e.g. resulting from synonomous mutations) are discarded. + It is possible to have greated than two peptides variations + at a given location given adjacent or overlapping snps. + Insertion/deletion variations are ignored by this method. + Example of a data structure that could be returned: + { 1 => ['I', 'M'], + 10 => ['I', 'T'], + 37 => ['N', 'D'], + 56 => ['G', 'E'], + 118 => ['R', 'K'], + 159 => ['D', 'E'], + 167 => ['Q', 'R'], + 173 => ['H', 'Q'] } + Returntype : hashref + Exceptions : none + Caller : general + +=cut + +sub get_all_peptide_variations { + my $transcript = shift; + my $snps = shift; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw('Bio::EnsEMBL::Transcript argument is required.'); + } + + if(!ref($snps) eq 'ARRAY') { + throw('Reference to a list of Bio::EnsEMBL::SNP objects is required'); + } + + my $codon_table = Bio::Tools::CodonTable->new; + my $codon_length = 3; + my $cdna = $transcript->spliced_seq; + + my $variant_alleles; + my $translation_start = $transcript->cdna_coding_start; + foreach my $snp (@$snps) { + #skip variations not on a single base + next if ($snp->start != $snp->end); + + my $start = $snp->start; + my $strand = $snp->strand; + + #calculate offset of the nucleotide from codon start (0|1|2) + my $codon_pos = ($start - $translation_start) % $codon_length; + + #calculate the peptide coordinate of the snp + my $peptide = ($start - $translation_start + + ($codon_length - $codon_pos)) / $codon_length; + + # skip this SNP if it falls in a partial codon + next if $start - $codon_pos + $codon_length > length($cdna); + + #retrieve the codon + my $codon = substr($cdna, $start - $codon_pos-1, $codon_length); + + #store each alternative allele by its location in the peptide + my @alleles = split(/\/|\|/, lc($snp->allele_string)); + #my @alleles = split(/\/|\|/, lc($snp->alleles)); + + foreach my $allele (@alleles) { + next if $allele eq '-'; #skip deletions + next if CORE::length($allele) != 1; #skip insertions + + #get_all_cdna_SNPs always gives strand of 1 now + #if($strand == -1) { + # #complement the allele if the snp is on the reverse strand + # $allele =~ + # tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + #} + + #create a data structure of variant alleles sorted by both their + #peptide position and their position within the peptides codon + $variant_alleles ||= {}; + if(exists $variant_alleles->{$peptide}) { + my $alleles_arr = $variant_alleles->{$peptide}->[1]; + push @{$alleles_arr->[$codon_pos]}, $allele; + } else { + #create a list of 3 lists (one list for each codon position) + my $alleles_arr = [[],[],[]]; + push @{$alleles_arr->[$codon_pos]}, $allele; + $variant_alleles->{$peptide} = [$codon, $alleles_arr]; + } + } + } + + my %out; + #now generate all possible codons for each peptide and translate them + foreach my $peptide (keys %$variant_alleles) { + my ($codon, $alleles) = @{$variant_alleles->{$peptide}}; + + #need to push original nucleotides onto each position + #so that all possible combinations can be generated + push @{$alleles->[0]}, substr($codon,0,1); + push @{$alleles->[1]}, substr($codon,1,1); + push @{$alleles->[2]}, substr($codon,2,1); + + my %alt_amino_acids; + foreach my $a1 (@{$alleles->[0]}) { + substr($codon, 0, 1) = $a1; + foreach my $a2 (@{$alleles->[1]}) { + substr($codon, 1, 1) = $a2; + foreach my $a3 (@{$alleles->[2]}) { + substr($codon, 2, 1) = $a3; + my $aa = $codon_table->translate($codon); + #print "$codon translation is $aa\n"; + $alt_amino_acids{$aa} = 1; + } + } + } + + my @aas = keys %alt_amino_acids; + $out{$peptide} = \@aas; + } + + return \%out; +} + + + +=head2 get_all_SNPs + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + + Arg [2] : (optional) int $flanking + The number of basepairs of transcript flanking sequence to + retrieve snps from (default 0) + Arg [3] : $source type of database source (dbSNP, Glovar) + Example : $snp_hashref = get_all_transcript_SNPs($transcript) + Description: Retrieves all snps found within the region of the + provided transcript + The snps are returned in a hash with keys corresponding + to the region the snp was found in. Possible keys are: + 'three prime UTR', 'five prime UTR', 'coding', 'intronic', + 'three prime flanking', 'five prime flanking' + If no flanking argument is provided no flanking snps will be + obtained. + The listrefs which are the values of the returned hash + contain snps in coordinates of the transcript region + (i.e. first base = first base of the first exon on the + postive strand - flanking bases + 1) + + Multiple base variations and inserts/deletes are discarded + by this method and not used. + + Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for + values + Exceptions : none + Caller : general + +=cut + +sub get_all_SNPs { + my $transcript = shift; + my $flanking = shift || 0; + my $source = shift; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw('Bio::EnsEMBL::Transcript argument required.'); + } + + my $slice = $transcript->slice(); + + if(!$slice) { + warning("Cannot obtain SNPs for transcript without attached Slice."); + return {}; + } + + my $sa = $slice->adaptor(); + + if(!$sa) { + warning('Cannot obtain SNPs for transcript unless attached slice ' . + 'has attached adaptor'); + return {}; + } + + my %snp_hash; + + # retrieve slice in the region of the transcript + $slice = $sa->fetch_by_Feature($transcript, $flanking ); + + # copy transcript, to work in coord system we are interested in + $transcript = $transcript->transfer( $slice ); + + # get all snps in the transcript region + my $snps; + if ($source eq 'glovar') { + $snps = $slice->get_all_ExternalFeatures('GlovarSNP'); + } + elsif ($source eq 'variation') { + $snps = $slice->get_all_VariationFeatures; + } + else { + $snps = $slice->get_all_SNPs; # dont need once use new snp api (i think) + } + + my $trans_start = $flanking + 1; + my $trans_end = $slice->length - $flanking; + my $trans_strand = $transcript->get_all_Exons->[0]->strand; + + # classify each snp + foreach my $snp (@$snps) { + my $key; + + if(($trans_strand == 1 && $snp->end < $trans_start) || + ($trans_strand == -1 && $snp->start > $trans_end)) { + # this snp is upstream from the transcript + $key = 'five prime flanking'; + } + + elsif(($trans_strand == 1 && $snp->start > $trans_end) || + ($trans_strand == -1 && $snp->start < $trans_start)) { + # this snp is downstream from the transcript + $key = 'three prime flanking'; + } + + else { + #snp is inside transcript region check if it overlaps an exon + foreach my $e (@{$transcript->get_all_Exons}) { + if($snp->end >= $e->start && $snp->start <= $e->end) { + # this snp is in an exon + + if(($trans_strand == 1 && + $snp->end < $transcript->coding_region_start) || + ($trans_strand == -1 && + $snp->start > $transcript->coding_region_end)) { + # this snp is in the 5' UTR + $key = 'five prime UTR'; + } + + elsif(($trans_strand == 1 && + $snp->start > $transcript->coding_region_end)|| + ($trans_strand == -1 && + $snp->end < $transcript->coding_region_start)) { + # this snp is in the 3' UTR + $key = 'three prime UTR'; + } + + else { + # snp is coding + $key = 'coding'; + } + last; + } + } + unless($key) { + # snp was not in an exon and is therefore intronic + $key = 'intronic'; + } + } + + unless($key) { + #warning('SNP could not be mapped. In/Dels not supported yet...'); + next; + } + + if(exists $snp_hash{$key}) { + push @{$snp_hash{$key}}, $snp; + } + else { + $snp_hash{$key} = [$snp]; + } + } + + return \%snp_hash; +} + + + +=head2 get_all_cdna_SNPs + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Arg [2] : $source type of database source (dbSNP, Glovar) + Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs; + Description: Retrieves all snps found within exons of the provided + transcript. + The snps are returned in a hash with three keys corresponding + to the region the snp was found in. Valid keys are: + 'three prime UTR', 'five prime UTR', 'coding' + The listrefs which are the values of the returned hash + contain snps in CDNA coordinates. + + Multiple base variations and insertions/deletions are not + used by this function and are discarded. + Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for + values + Exceptions : none + Caller : general + +=cut + +sub get_all_cdna_SNPs { + my ($transcript, $source) = @_; + + #retrieve all of the snps from this transcript + my $all_snps = get_all_SNPs($transcript, 0, $source); + my %snp_hash; + + my @cdna_types = ('three prime UTR', 'five prime UTR','coding'); + + my $slice = $transcript->slice(); + my $sa = $slice->adaptor(); + + $slice = $sa->fetch_by_Feature($transcript); + + # copy transcript in order to work in coord system of interest + $transcript = $transcript->transfer($slice); + + foreach my $type (@cdna_types) { + $snp_hash{$type} = []; + foreach my $snp (@{$all_snps->{$type}}) { + my @coords = $transcript->genomic2cdna($snp->start, $snp->end, + $snp->strand); + + #skip snps that don't map cleanly (possibly an indel...) + if(scalar(@coords) != 1) { + #warning("snp of type $type does not map cleanly\n"); + next; + } + + my ($coord) = @coords; + + unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + #warning("snp of type $type maps to gap\n"); + next; + } + + my $alleles; + my $ambicode; + + # get alleles and ambig_code (with fallback to old snp API) + $alleles = $snp->allele_string || $snp->{'alleles'}; + $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'}; + + #we arbitrarily put the SNP on the +ve strand because it is easier to + #work with in the webcode + if($coord->strand == -1) { + $alleles =~ + tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//; + $ambicode =~ + tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//; + } + #copy the snp and convert to cdna coords... + my $new_snp; + %$new_snp = %$snp; + bless $new_snp, ref $snp; + $new_snp->start($coord->start); + $new_snp->end($coord->end); + $new_snp->strand(1); + $new_snp->allele_string($alleles); + $new_snp->ambig_code($ambicode); + push @{$snp_hash{$type}}, $new_snp; + } + } + + return \%snp_hash; +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/URI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/URI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,617 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::URI + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::URI qw/parse_uri is_uri/; + # or use Bio::EnsEMBL::Utils::URI qw/:all/; # to bring everything in + + my $db_uri = parse_uri('mysql://user@host:3157/db'); + my $http_uri = parse_uri('http://www.google.co.uk:80/search?q=t'); + + is_uri('mysql://user@host'); # returns 1 + is_uri('file:///my/path'); # returns 1 + is_uri('/my/path'); # returns 0 + +=head1 DESCRIPTION + +This object is a generic URI parser which is primarily used in the +parsing of database URIs into a more managable data structure. We also provide +the resulting URI object + +=head1 DEPENDENCIES + +L is an optional dependency but if available the code will attempt +to perform URI encoding/decoding on parameters. If you do not want this +functionality then modify the global C<$Bio::EnsEMBL::Utils::URI::URI_ESCAPE> +to false; + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::URI; + +use strict; +use warnings; + +use Scalar::Util qw/looks_like_number/; +use Bio::EnsEMBL::Utils::Exception qw(throw); +use File::Spec; + +our $URI_ESCAPE; +$URI_ESCAPE = 0; +eval { + require URI::Escape; + URI::Escape->import(); + $URI_ESCAPE = 1; +}; + +use base qw/Exporter/; +our @EXPORT_OK; +our %EXPORT_TAGS; +@EXPORT_OK = qw/parse_uri is_uri/; +%EXPORT_TAGS = ( all => [@EXPORT_OK] ); + +####URI Parsing + +=head2 is_uri + + Arg[1] : Scalar; URI to parse + Example : is_uri('mysql://user:pass@host:415/db'); + Description : Looks for the existence of a URI scheme to decide if this + is a classical URI. Whilst non-scheme based URIs can still be + interprited it is useful to use when you need to know if + you are going to work with a URI or not + Returntype : Boolean + Caller : General + Status : Beta + +=cut + +sub is_uri { + my ($uri) = @_; + return 0 if ! $uri; + my $SCHEME = qr{ ([^:]*) :// }xms; + return ($uri =~ $SCHEME) ? 1 : 0; +} + +=head2 parse_uri + + Arg[1] : Scalar; URI to parse + Example : my $uri = parse_uri('mysql://user:pass@host:415/db'); + Description : A URL parser which attempts to convert many different types + of URL into a common data structure. + Returntype : Bio::EnsEMBL::Utils::URI + Caller : General + Status : Beta + +=cut + +sub parse_uri { + my ($url) = @_; + + my $SCHEME = qr{ ([^:]*) :// }xms; + my $USER = qr{ ([^/:\@]+)? :? ([^/\@]+)? \@ }xms; + my $HOST = qr{ ([^/:]+)? :? ([^/]+)? }xms; + my $DB = qr{ / ([^/?]+)? /? ([^/?]+)? }xms; + my $PARAMS = qr{ \? (.+)}xms; + + my $p; + + if($url =~ qr{ $SCHEME ([^?]+) (?:$PARAMS)? }xms) { + my $scheme = $1; + $scheme = ($URI_ESCAPE) ? uri_unescape($scheme) : $scheme; + $p = Bio::EnsEMBL::Utils::URI->new($scheme); + my ($locator, $params) = ($2, $3); + + if($scheme eq 'file') { + $p->path($locator); + } + elsif($scheme eq 'sqlite') { + $p->path($locator); + } + else { + if($locator =~ s/^$USER//) { + $p->user($1); + $p->pass($2); + } + if($locator =~ s/^$HOST//) { + $p->host(($URI_ESCAPE) ? uri_unescape($1) : $1); + $p->port(($URI_ESCAPE) ? uri_unescape($2) : $2); + } + + if($p->is_db_scheme() || $scheme eq q{}) { + if($locator =~ $DB) { + $p->db_params()->{dbname} = ($URI_ESCAPE) ? uri_unescape($1) : $1; + $p->db_params()->{table} = ($URI_ESCAPE) ? uri_unescape($2) : $2; + } + } + else { + $p->path($locator); + } + } + + if(defined $params) { + my @kv_pairs = split(/;|&/, $params); + foreach my $kv_string (@kv_pairs) { + my ($key, $value) = map { ($URI_ESCAPE) ? uri_unescape($_) : $_ } split(/=/, $kv_string); + $p->add_param($key, $value); + } + } + } + + return $p; +} + +####URI Object + +=pod + +=head2 new() + + Arg[1] : String; scheme the URI will confrom to + Description : New object call + Returntype : Bio::EnsEMBL::Utils::URIParser::URI + Exceptions : Thrown if scheme is undefined. + Status : Stable + +=cut + +sub new { + my ($class, $scheme) = @_; + $class = ref($class) || $class; + throw "Scheme cannot be undefined. Empty string is allowed" if ! defined $scheme; + + my $self = bless ({ + params => {}, + param_keys => [], + db_params => {}, + scheme => $scheme, + }, $class); + + return $self; +} + +=head2 db_schemes() + + Description: Returns a hash of scheme names known to be databases + Returntype : HashRef + Exceptions : None + Status : Stable + +=cut + +sub db_schemes { + my ($self) = @_; + return {map { $_ => 1 } qw/mysql ODBC sqlite Oracle Sybase/}; +} + + +=head2 is_db_scheme() + + Description: Returns true if the code believes the scheme to be a Database + Returntype : Boolean + Exceptions : None + Status : Stable + +=cut + +sub is_db_scheme { + my ($self) = @_; + return ( exists $self->db_schemes()->{$self->scheme()} ) ? 1 : 0; +} + +=head2 scheme() + + Description : Getter for the scheme attribute + Returntype : String + Exceptions : None + Status : Stable + +=cut + +sub scheme { + my ($self) = @_; + return $self->{scheme}; +} + +=head2 path() + + Arg[1] : Setter argument + Description : Getter/setter for the path attribute + Returntype : String + Exceptions : None + Status : Stable + +=cut + +sub path { + my ($self, $path) = @_; + $self->{path} = $path if defined $path; + return $self->{path}; +} + +=head2 user() + + Arg[1] : Setter argument + Description : Getter/setter for the user attribute + Returntype : String + Exceptions : None + Status : Stable + +=cut + +sub user { + my ($self, $user) = @_; + $self->{user} = $user if defined $user; + return $self->{user}; +} + +=head2 pass() + + Arg[1] : Setter argument + Description : Getter/setter for the password attribute + Returntype : String + Exceptions : None + Status : Stable + +=cut + +sub pass { + my ($self, $pass) = @_; + $self->{pass} = $pass if defined $pass; + return $self->{pass}; +} + +=head2 host() + + Arg[1] : Setter argument + Description : Getter/setter for the host attribute + Returntype : String + Exceptions : None + Status : Stable + +=cut + +sub host { + my ($self, $host) = @_; + $self->{host} = $host if defined $host; + return $self->{host}; +} + +=head2 port() + + Arg[1] : Setter argument + Description : Getter/setter for the port attribute + Returntype : Integer + Exceptions : If port is not a number, less than 1 or not a whole integer + Status : Stable + +=cut + +sub port { + my ($self, $port) = @_; + if(defined $port) { + if(! looks_like_number($port) || $port < 1 || int($port) != $port) { + throw "Port $port is not a number, less than 1 or not a whole integer"; + } + $self->{port} = $port if defined $port; + } + return $self->{port}; +} + +=head2 param_keys() + + Description : Getter for the paramater map keys in the order they were first + seen. Keys should only appear once in this array + Returntype : ArrayRef + Exceptions : None + Status : Stable + +=cut + +sub param_keys { + my ($self) = @_; + return [@{$self->{param_keys}}]; +} + +=head2 param_exists_ci() + + Arg[1] : String; Key + Description : Performs a case-insensitive search for the given key + Returntype : Boolean; returns true if your given key was seen + Exceptions : None + Status : Stable + +=cut + +sub param_exists_ci { + my ($self, $key) = @_; + my %keys = map { uc($_) => 1 } @{$self->param_keys()}; + return ($keys{uc($key)}) ? 1 : 0; +} + +=head2 add_param() + + Arg[1] : String; key + Arg[1] : Scalar; value + Description : Add a key/value to the params map. Multiple inserts of the same + key is allowed + Returntype : None + Exceptions : None + Status : Stable + +=cut + +sub add_param { + my ($self, $key, $value) = @_; + if(!exists $self->{params}->{$key}) { + $self->{params}->{$key} = []; + push(@{$self->{param_keys}}, $key); + } + push(@{$self->{params}->{$key}}, $value); + return; +} + +=head2 get_params() + + Arg[1] : String; key + Description : Returns the values which were found to be linked to the given + key. Arrays are returned because one key can have many + values in a URI + Returntype : ArrayRef[Scalar] + Exceptions : None + Status : Stable + +=cut + +sub get_params { + my ($self, $key) = @_; + return [] if ! exists $self->{params}->{$key}; + return [@{$self->{params}->{$key}}]; +} + +=head2 db_params() + + Description : Storage of parameters used only for database URIs since + they require + Returntype : HashRef; Database name is keyed under C and the + table is keyed under C + Exceptions : None + Status : Stable + +=cut + +sub db_params { + my ($self) = @_; + return $self->{db_params}; +} + +=head2 generate_dbsql_params() + + Arg[1] : boolean $no_table alows you to avoid pushing -TABLE as an + output value + Description : Generates a Hash of Ensembl compatible parameters to be used + to construct a DB object. We combine those parameters + which are deemed to be part of the C method + under C<-DBNAME> and C<-TABLE>. We also search for a number + of optional parameters which are lowercased equivalents + of the construction parameters available from a + L, + L as well as C + being supported. + + We also convert the scheme type into the driver attribute + + Returntype : Hash (not a reference). Output can be put into a C + constructor. + Exceptions : None + Status : Stable + +=cut + +sub generate_dbsql_params { + my ($self, $no_table) = @_; + my %db_params; + + $db_params{-DRIVER} = $self->scheme(); + $db_params{-HOST} = $self->host() if $self->host(); + $db_params{-PORT} = $self->port() if $self->port(); + $db_params{-USER} = $self->user() if $self->user(); + $db_params{-PASS} = $self->pass() if $self->pass(); + + my $dbname; + my $table; + if($self->scheme() eq 'sqlite') { + ($dbname, $table) = $self->_decode_sqlite(); + } + else { + $dbname = $self->db_params()->{dbname}; + $table = $self->db_params()->{table}; + } + + $db_params{-DBNAME} = $dbname if $dbname; + $db_params{-TABLE} = $table if ! $no_table && $table; + + foreach my $boolean_param (qw/disconnect_when_inactive reconnect_when_connection_lost is_multispecies no_cache verbose/) { + if($self->param_exists_ci($boolean_param)) { + $db_params{q{-}.uc($boolean_param)} = 1; + } + } + foreach my $value_param (qw/species group species_id wait_timeout/) { + if($self->param_exists_ci($value_param)) { + $db_params{q{-}.uc($value_param)} = $self->get_params($value_param)->[0]; + } + } + + return %db_params; +} + +=head2 _decode_sqlite + + Description : Performs path gymnastics to decode into a number of possible + options. The issue with SQLite is that the normal URI scheme + looks like sqlite:///my/path.sqlite/table but how do we know + that the DB name is C and the table is + C
? + + The code takes a path, looks for the full path & if it cannot + be found looks for the file a directory back. In the above + example it would have looked for C, + found it to be non-existant, looked for C + and found it. + + If the path splitting procdure resulted in just 1 file after + the first existence check e.g. C it assumes + that should be the name. If no file can be found we default to + the full length path. + Caller : internal + +=cut + +sub _decode_sqlite { + my ($self) = @_; + my $dbname; + my $table; + my $path = $self->path(); + if(-f $path) { + $dbname = $path; + } + else { + my ($volume, $directories, $file) = File::Spec->splitpath($path); + my @splitdirs = File::Spec->splitdir($directories); + if(@splitdirs == 1) { + $dbname = $path; + } + else { + my $new_file = pop(@splitdirs); + $new_file ||= q{}; + my $new_path = File::Spec->catpath($volume, File::Spec->catdir(@splitdirs), $new_file); + if($new_path ne File::Spec->rootdir() && -f $new_path) { + $dbname = $new_path; + $table = $file; + } + else { + $dbname = $path; + } + } + } + + $self->db_params()->{dbname} = $dbname if $dbname; + $self->db_params()->{table} = $table if $table; + + return ($dbname, $table); +} + +=head2 generate_uri() + + Description : Generates a URI string from the paramaters in this object + Returntype : String + Exceptions : None + Status : Stable + +=cut + +sub generate_uri { + my ($self) = @_; + my $scheme = sprintf('%s://', ($URI_ESCAPE) ? uri_escape($self->scheme()) : $self->scheme()); + my $user_credentials = q{}; + my $host_credentials = q{}; + my $location = q{}; + + if($self->user() || $self->pass()) { + my $user = $self->user(); + my $pass = $self->pass(); + if($URI_ESCAPE) { + $user = uri_escape($user) if $user; + $pass = uri_escape($pass) if $pass; + } + $user_credentials = sprintf('%s%s@', + ( $user ? $user : q{} ), + ( $pass ? q{:}.$pass : q{} ) + ); + } + + if($self->host() || $self->port()) { + my $host = $self->host(); + my $port = $self->port(); + if($URI_ESCAPE) { + $host = uri_escape($host) if $host; + $port = uri_escape($port) if $port; + } + $host_credentials = sprintf('%s%s', + ( $host ? $host : q{} ), + ( $port ? q{:}.$port : q{} ) + ); + } + + if($self->is_db_scheme() || $self->scheme() eq '') { + if($self->scheme() eq 'sqlite') { + if(! $self->path()) { + my $tmp_loc = $self->db_params()->{dbname}; + throw "There is no dbname available" unless $tmp_loc; + $tmp_loc .= q{/}.$self->db_params()->{table} if $self->db_params()->{table}; + $self->path($tmp_loc); + } + $location = $self->path(); + } + else { + my $dbname = $self->db_params()->{dbname}; + my $table = $self->db_params()->{table}; + if($dbname || $table) { + if($URI_ESCAPE) { + $dbname = uri_escape($dbname) if $dbname; + $table = uri_escape($table) if $table; + } + $location = sprintf('/%s%s', + ($dbname ? $dbname : q{}), + ($table ? q{/}.$table : q{}) + ); + } + } + } + else { + $location = $self->path() if $self->path(); + } + + my $param_string = q{}; + if(@{$self->param_keys()}) { + $param_string = q{?}; + my @params; + foreach my $key (@{$self->param_keys}) { + my $values_array = $self->get_params($key); + foreach my $value (@{$values_array}) { + my $encoded_key = ($URI_ESCAPE) ? uri_escape($key) : $key; + my $encoded_value = ($URI_ESCAPE) ? uri_escape($value) : $value; + push(@params, ($encoded_value) ? "$encoded_key=$encoded_value" : $encoded_key); + } + } + $param_string .= join(q{;}, @params); + } + + return join(q{}, $scheme, $user_credentials, $host_credentials, $location, $param_string); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Gene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Gene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,73 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::VegaCuration::Gene; + +use strict; +use warnings; +use vars qw(@ISA); + +use Bio::EnsEMBL::Utils::ConversionSupport; + +@ISA = qw(Bio::EnsEMBL::Utils::ConversionSupport); + + +=head2 find_gaps + + Args : arrayref of B::E::Transcripts + Example : my $gaps = find_gaps($all_transcripts) + Description: identifies regions of a gene that are not covered by any transcript + Returntype : int + Exceptions : none + Caller : internal + +=cut + +sub find_gaps { + my $self = shift; + my ($all_transcripts) = @_; + my $gaps = 0; + my @sorted_transcripts = sort {$a->start <=> $b->start || $b->end <=> $a->end} @{$all_transcripts}; + if ( my $first_transcript = shift @sorted_transcripts ) { + my $pos = $first_transcript->end; + foreach my $transcript (@sorted_transcripts) { + next if ($transcript->end < $pos ); + if ($transcript->start < $pos && $transcript->end > $pos ) { + $pos = $transcript->end; + next; + } + elsif ($transcript->end > $pos) { + $gaps++; + $pos = $transcript->end; + } + } + } + return $gaps; +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,307 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::VegaCuration::Transcript; + +use strict; +use warnings; +no warnings 'uninitialized'; +use vars qw(@ISA); + +use Bio::EnsEMBL::Utils::VegaCuration::Gene; +use Data::Dumper; + +@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene); + + +=head2 find_non_overlaps + + Args : arrayref of B::E::Transcripts + Example : find_non_overlaps($all_transcripts) + Description: identifies any non-overlapping transcripts + Returntype : array refs of stable IDs + Exceptions : none + +=cut + +sub find_non_overlaps { + my $self = shift; + my ($all_transcripts) = @_; + my $non_overlaps = []; + foreach my $transcript1 (@{$all_transcripts}) { + foreach my $transcript2 (@{$all_transcripts}) { + if ($transcript1->end < $transcript2->start) { + push @{$non_overlaps}, $transcript1->stable_id; + push @{$non_overlaps}, $transcript2->stable_id; + } + } + } + return $non_overlaps; +} + +=head2 check_remarks_and_update_names + + Arg[1] : B::E::Gene (with potentially duplicated transcript names) + Arg[2] : counter 1 (no. of patched genes) + Arg[3] : counter 2 (no. of patched transcripts) + Example : $support->update_names($gene,\$c1,\$c2) + Description: - checks remarks and patches transcripts with identical names according to + CDS and length + Returntype : true | false (depending on whether patched or not), counter1, counter2 + +=cut + +sub check_remarks_and_update_names { + my $self = shift; + my ($gene,$gene_c,$trans_c) = @_; + my $action = ($self->param('dry_run')) ? 'Would add' : 'Added'; + my $aa = $gene->adaptor->db->get_AttributeAdaptor; + my $dbh = $gene->adaptor->db->dbc->db_handle; + + #get list of IDs that have previously been sent to annotators + my $seen_genes = $self->get_havana_fragmented_loci_comments; + + my $gsi = $gene->stable_id; + my $gid = $gene->dbID; + my $g_name; + my $study_more = 1; + eval { + $g_name = $gene->display_xref->display_id; + }; + if ($@) { + $g_name = $gene->get_all_Attributes('name')->[0]->value; + } + + #get existing gene remarks + my $remarks = [ map {$_->value} @{$gene->get_all_Attributes('remark')} ]; + + #shout if there is no remark to identify this as being fragmented + if ( grep {$_ eq 'fragmented locus' } @$remarks) { + $study_more = 0; + } + else { + $self->log_warning("Gene $gsi should have a fragmented locus remark\n"); + } + + ##patch transcript names according to length and CDS + $gene_c++; + + #separate coding and non_coding transcripts + my $coding_trans = []; + my $noncoding_trans = []; + foreach my $trans ( @{$gene->get_all_Transcripts()} ) { + if ($trans->translate) { + push @$coding_trans, $trans; + } + else { + push @$noncoding_trans, $trans; + } + } + + #sort transcripts coding > non-coding, then on length + my $c = 0; + $self->log("\nPatching names according to CDS and length:\n",1); + foreach my $array_ref ($coding_trans,$noncoding_trans) { + foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) { + $trans_c++; + my $tsi = $trans->stable_id; + my $t_name; + eval { + $t_name = $trans->display_xref->display_id; + }; + if ($@) { + $t_name = $trans->get_all_Attributes('name')->[0]->value; + } + $c++; + my $ext = sprintf("%03d", $c); + my $new_name = $g_name.'-'.$ext; + $self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1); + if (! $self->param('dry_run')) { + + # update transcript display xref + $dbh->do(qq(UPDATE xref x, external_db edb + SET x.display_label = "$new_name" + WHERE x.external_db_id = edb.external_db_id + AND x.dbprimary_acc = "$tsi" + AND edb.db_name = "Vega_transcript")); + } + } + } + return ($study_more,$gene_c,$trans_c); +} + +=head2 check_names_and_overlap + + Arg[1] : arayref of arrayrefs of duplicated names + Arg[2] : B::E::Gene (with potentially duplicated transcript names) + Arg[3] : FH (to log new duplicates) + Example : $support->check_names_and_overlap($transcripts,$gene,$fh) + Description: checks pairs of transcripts identified as having duplicate Vega names: + - to see if they have identical names in loutre (shouldn't have) + - distinguish between overlapping and non overlapping transcripts + Returntype : none + +=cut + +sub check_names_and_overlap { + my $self = shift; + my ($transcript_info,$gene,$n_flist_fh) = @_; + my $ta = $gene->adaptor->db->get_TranscriptAdaptor; + my $gsi = $gene->stable_id; + my $g_name = $gene->get_all_Attributes('name')->[0]->value; + foreach my $set (values %{$transcript_info} ) { + next if (scalar @{$set} == 1); + my $transcripts = []; + my $all_t_names; + my %ids_to_names; + foreach my $id1 (@{$set}) { + my ($name1,$tsi1) = split /\|/, $id1; + $ids_to_names{$tsi1} = $name1; + $all_t_names .= "$tsi1 [$name1] "; + my $t = $ta->fetch_by_stable_id($tsi1); + push @{$transcripts}, $t; + } + + my $non_overlaps; + eval { + $non_overlaps = $self->find_non_overlaps($transcripts); + }; + if ($@) { + $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n"); + } + + #if the transcripts don't overlap + elsif (@{$non_overlaps}) { + my $tsi_string; + foreach my $id (@{$non_overlaps}) { + my $string = " $id [ $ids_to_names{$id} ] "; + $tsi_string .= $string; + } + + $self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'fragmented locus\' gene remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n"); + #log gsi (to be sent to Havana) + print $n_flist_fh "$gsi\n"; + } + #...otherwise if the transcripts do overlap + else { + $self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with duplicated Vega names and it has no \'fragmented locus\' gene_remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n"); + print $n_flist_fh "$gsi\n"; + } + } +} + +=head2 get_havana_fragmented_loci_comments + + Args : none + Example : my $results = $support->get_havana_fragmented_loci_comments + Description: parses the HEREDOC containing Havana comments in this module + Returntype : hashref + +=cut + +sub get_havana_fragmented_loci_comments { + my $seen_genes; + while () { + next if /^\s+$/ or /#+/; + my ($obj,$comment) = split /=/; + $obj =~ s/^\s+|\s+$//g; + $comment =~ s/^\s+|\s+$//g; + $seen_genes->{$obj} = $comment; + } + return $seen_genes; +} + + + +#details of genes with duplicated transcript names that have already been reported to Havana +#identified as either fragmented or as being OK to patch +__DATA__ + +OTTMUSG00000005478 = fragmented +OTTMUSG00000001936 = fragmented +OTTMUSG00000017081 = fragmented +OTTMUSG00000011441 = fragmented +OTTMUSG00000013335 = fragmented +OTTMUSG00000011654 = fragmented +OTTMUSG00000001835 = fragmented +OTTHUMG00000035221 = fragmented +OTTHUMG00000037378 = fragmented +OTTHUMG00000060732 = fragmented +OTTHUMG00000132441 = fragmented +OTTHUMG00000031383 = fragmented +OTTHUMG00000012716 = fragmented +OTTHUMG00000031102 = fragmented +OTTHUMG00000148816 = fragmented +OTTHUMG00000149059 = fragmented +OTTHUMG00000149221 = fragmented +OTTHUMG00000149326 = fragmented +OTTHUMG00000149644 = fragmented +OTTHUMG00000149574 = fragmented +OTTHUMG00000058101 = fragmented + +OTTHUMG00000150119 = OK +OTTHUMG00000149850 = OK +OTTHUMG00000058101 = OK +OTTHUMG00000058907 = OK + +OTTMUSG00000011654 = fragmented +OTTMUSG00000019369 = fragmented +OTTMUSG00000017081 = fragmented +OTTMUSG00000001835 = fragmented +OTTMUSG00000011499 = fragmented +OTTMUSG00000013335 = fragmented +OTTMUSG00000008023 = fragmented +OTTMUSG00000019369 = fragmented + + +OTTMUSG00000022266 +OTTMUSG00000006697 + + + + + +OTTMUSG00000012302 = +OTTMUSG00000013368 = +OTTMUSG00000015766 = +OTTMUSG00000016025 = +OTTMUSG00000001066 = +OTTMUSG00000016331 = +OTTMUSG00000006935 = +OTTMUSG00000007263 = +OTTMUSG00000000304 = +OTTMUSG00000009150 = +OTTMUSG00000008023 = +OTTMUSG00000017077 = +OTTMUSG00000003440 = +OTTMUSG00000016310 = +OTTMUSG00000026199 = +OTTMUSG00000028423 = +OTTMUSG00000007427 = diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Translation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + This software is distributed under a modified Apache license. + For license details, please see + http://www.ensembl.org/info/about/code_licence.html +=head1 CONTACT + Please email comments or questions to the public Ensembl + developers list at . + Questions may also be sent to the Ensembl help desk at + . + +=cut +=head1 NAME +=head1 SYNOPSIS +=head1 DESCRIPTION +=head1 METHODS +=cut + +package Bio::EnsEMBL::Utils::VegaCuration::Translation; +use strict; +use warnings; +use vars qw(@ISA); +use Data::Dumper; +use Bio::EnsEMBL::Utils::VegaCuration::Transcript; +@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Transcript); + +=head2 check_CDS_start_end_remarks + + Args : B::E::Transcript + Example : my $results = $support->check_CDS_end_remarks($transcript) + Description: identifies incorrect 'CDS end...' transcript remarks in a + otter-derived Vega database + Returntype : hashref + +=cut + +sub check_CDS_start_end_remarks { + my $self = shift; + my $trans = shift; + # info for checking + my @remarks = @{$trans->get_all_Attributes('remark')}; + my $coding_end = $trans->cdna_coding_end; + my $coding_start = $trans->cdna_coding_start; + my $trans_end = $trans->length; + my $trans_seq = $trans->seq->seq; + my $stop_codon = substr($trans_seq, $coding_end-3, 3); + my $start_codon = substr($trans_seq, $coding_start-1, 3); + #hashref to return results + my $results; + + #extra CDS end not found remarks + if (grep {$_->value eq 'CDS end not found'} @remarks) { + if ( ($coding_end != $trans_end) + && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) { + $results->{'END_EXTRA'} = 1; + } + } + #missing CDS end not found remark + if ( $coding_end == $trans_end ) { + if (! grep {$_->value eq 'CDS end not found'} @remarks) { + if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) { + $results->{'END_MISSING_2'} = 1; + } + else { + $results->{'END_MISSING_1'} = $stop_codon; + } + } + } + #extra CDS start not found remark + if (grep {$_->value eq 'CDS start not found'} @remarks) { + if ( ($coding_start != 1) + && ($start_codon eq 'ATG') ) { + $results->{'START_EXTRA'} = 1; + } + } + #missing CDS start not found remark + if ( $coding_start == 1) { + if ( ! grep {$_->value eq 'CDS start not found'} @remarks) { + if ($start_codon eq 'ATG') { + $results->{'START_MISSING_2'} = 1; + } else { + $results->{'START_MISSING_1'} = $start_codon; + } + } + } + return $results; +} + +=head2 check_CDS_start_end_remarks_loutre + + Args : B::E::Transcript + Example : my $results = $support->check_CDS_end_remarks($transcript) + Description: identifies incorrect 'CDS end...' transcript attribs in a loutre + of a loutre-derived Vega database. + Returntype : hashref + +=cut + +sub check_CDS_start_end_remarks_loutre { + my $self = shift; + my $trans = shift; + + # info for checking + my @stops = qw(TGA TAA TAG); + my %attributes; + foreach my $attribute (@{$trans->get_all_Attributes()}) { + push @{$attributes{$attribute->code}}, $attribute; + } +# warn $trans->stable_id; +# warn Data::Dumper::Dumper(\%attributes); + my $coding_end = $trans->cdna_coding_end; + my $coding_start = $trans->cdna_coding_start; + my $trans_end = $trans->length; + my $trans_seq = $trans->seq->seq; + my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase; + my $initial_exon_phase = $trans->translation->start_Exon->phase; + my $stop_codon = substr($trans_seq, $coding_end-3, 3); + my $start_codon = substr($trans_seq, $coding_start-1, 3); + + my $start_codon_incorrect = 1; + if ($start_codon eq 'ATG' ) { + $start_codon_incorrect = 0; + } + elsif ($start_codon eq 'CTG') { + foreach my $attrib (@{$attributes{'remark'}}) { + if ($attrib->value =~ /non[- ]ATG start/) { + $start_codon_incorrect = 0; + } + } + } + +# warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect"; + + #hashref to return results + my $results; + + #extra CDS end not found remarks + if ($attributes{'cds_end_NF'}) { + if ( ($attributes{'cds_end_NF'}->[0]->value == 1) + && ($coding_end != $trans_end) + && ( grep {$_ eq $stop_codon} @stops) ) { +# warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon"; +# warn $trans->translation->end_Exon->end_phase; + $results->{'END_EXTRA'} = $stop_codon; + } + } + #missing CDS end not found remark + if ( $coding_end == $trans_end ) { + if ($attributes{'cds_end_NF'}) { + if ($attributes{'cds_end_NF'}->[0]->value == 0 ) { + if (! grep {$_ eq $stop_codon} @stops) { +# warn $trans->translation->end_Exon->end_phase; + $results->{'END_MISSING'}{'WRONG'} = $stop_codon; + } + } + } + elsif (! grep {$_ eq $stop_codon} @stops) { + $results->{'END_MISSING'}{'ABSENT'} = $stop_codon; + } + } + #extra CDS start not found remark + if ( $attributes{'cds_start_NF'}) { + if ( ($attributes{'cds_start_NF'}->[0]->value == 1 ) + && (! $start_codon_incorrect)) { + unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) { + $results->{'START_EXTRA'} = $start_codon; + } + } + } + #missing CDS start not found remark + if ( $coding_start == 1) { + if ( $attributes{'cds_start_NF'} ) { + if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) { + if ($start_codon_incorrect) { + $results->{'START_MISSING'}{'ABSENT'} = $start_codon; + } + elsif ($initial_exon_phase > 0) { + $results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase; + } + } + } + elsif ($start_codon ne 'ATG') { + $results->{'START_MISSING'}{'ABSENT'} = $start_codon; + } + + } + return $results; +} + +=head2 get_havana_seleno_comments + + Args : none + Example : my $results = $support->get_havana_seleno_comments + Description: parses the HEREDOC containing Havana comments in this module + Returntype : hashref + +=cut + +sub get_havana_seleno_comments { + my $seen_translations; + while () { + next if /^\s+$/ or /#+/; + my ($obj,$comment) = split /=/; + $obj =~ s/^\s+|\s+$//g; + $comment =~ s/^\s+|\s+$//g; + # We add the origin as now "seen" can come from a number of places, and have + # a number of consequences in different cases, not just discounted Secs from this method. -- ds23 + $seen_translations->{$obj} = [ $comment,"notsec-havana" ]; + } + return $seen_translations; +} + +sub check_for_stops { + my $support = shift; + my ($gene,$seen_transcripts,$log_object) = @_; + my $transcripts; + my $has_log_object=defined($log_object); + if($has_log_object){ + my @help = $log_object->species_params->get_trans($gene->stable_id); + $transcripts=\@help; + }else{ + $log_object=$support; + $transcripts=$gene->get_all_Transcripts; + } + + my $gname = $gene->get_all_Attributes('name')->[0]->value; + my $gsi = $gene->stable_id; + my $scodon = 'TGA'; + my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' ); + my $hidden_remak_ttributes; + TRANS: + foreach my $trans (@$transcripts) { + my $tsi = $trans->stable_id; + my $tID = $trans->dbID; + my $tname = $trans->get_all_Attributes('name')->[0]->value; + if($has_log_object){ + $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'}; + }else{ + $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark'); + } + foreach my $rem (@$hidden_remak_ttributes) { + if ($rem->value =~ /not_for_Vega/) { + #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1); + $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'"); + next TRANS; + } + } + + # go no further if there is a ribosomal framshift attribute + foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) { + if ($attrib->value) { + $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute"); + next TRANS; + } + } + + #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1); + $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)"); + my $peptide; + + # go no further if the transcript doesn't translate or if there are no stops + next TRANS unless ($peptide = $trans->translate); + my $pseq = $peptide->seq; + my $orig_seq = $pseq; + # (translate method trims stops from sequence end) + + next TRANS unless ($pseq =~ /\*/); + # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3)); + #$support->log_verbose("Stops found in $tsi ($tname)\n",1); + $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)"); + + # find out where and how many stops there are + my @found_stops; + my $mrna = $trans->translateable_seq; + my $offset = 0; + my $tstop; + while ($pseq =~ /^([^\*]*)\*(.*)/) { + my $pseq1_f = $1; + $pseq = $2; + my $seq_flag = 0; + $offset += length($pseq1_f) * 3; + my $stop = substr($mrna, $offset, 3); + my $aaoffset = int($offset / 3)+1; + push(@found_stops, [ $stop, $aaoffset ]); + $tstop .= "$aaoffset "; + $offset += 3; + } + + # are all stops TGA...? + my $num_stops = scalar(@found_stops); + my $num_tga = 0; + my $positions; + foreach my $stop (@found_stops) { + $positions .= $stop->[0]."(".$stop->[1].") "; + if ($stop->[0] eq $scodon) { + $num_tga++; + } + } + my $source = $gene->source; + #...no - an internal stop codon error in the database... + if ($num_tga < $num_stops) { + if ($source eq 'havana') { + #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n"); + $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)"); + } + else { + #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n"); + $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)"); + } + } + #...yes - check remarks + else { + my $flag_remark = 0; # 1 if word seleno has been used + my $flag_remark2 = 0; # 1 if existing remark has correct numbering + my $alabel = 'Annotation_remark- selenocysteine '; + my $alabel2 = 'selenocysteine '; + my $annot_stops; + my $remarks; + my $att; + #get both hidden_remarks and remarks + foreach my $remark_type ('remark','hidden_remark') { + if($has_log_object){ + $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type}; + }else{ + $att=$trans->get_all_Attributes($remark_type) + } + foreach my $attrib ( @$att) { + push @{$remarks->{$remark_type}}, $attrib->value; + } + } + #parse remarks to check syntax for location of edits + while (my ($attrib,$remarks)= each %$remarks) { + foreach my $text (@{$remarks}) { + if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){ + #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n"); + $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]"); + $annot_stops=$1; + } + elsif ($text =~ /^$alabel2(.*)/) { + my $maybe = $1; + if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) { + $annot_stops=$maybe; + } else { + $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'". + " -- might need to investigate: '$alabel2$maybe' [$mod_date]"); + } + } + } + } + + #check the location of the annotated edits matches actual stops in the sequence + my @annotated_stops; + if ($annot_stops){ + my $i = 0; + foreach my $offset (split(/\s+/, $annot_stops)) { + #OK if it matches a known stop + if ( + defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) { + push @annotated_stops, $offset; + } + # catch old annotations where number was in DNA not peptide coordinates + elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]"); + } + # catch old annotations where number off by one + elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]"); + } + elsif (defined($offset) && ($offset=~/^\d+$/)){ + if ($offset == length($orig_seq)+1) { + if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') { + $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]"); + } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') { + $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]"); + } else { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]"); + } + } + else { + $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]"); + push @annotated_stops, $offset; + } + } + $i++; + } + } + + #check location of found stops matches annotated ones - any new ones are reported + foreach my $stop ( @found_stops ) { + my $pos = $stop->[1]; + my $seq = $stop->[0]; + unless ( grep { $pos == $_} @annotated_stops) { + if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') { + #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n"); + $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]"); + } + else { + #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n"); + $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]"); + } + } + } + } + } +} +sub _save_log{ + my $self=shift; + my $log_type = shift; + my $chrom_name=shift || ''; + my $gsi=shift || ''; + my $type=shift || ''; + my $tsi=shift || ''; + my $tag=shift || ''; + my $txt=shift || ''; + $self->$log_type($txt."\n"); +} + +#details of annotators comments +__DATA__ +OTTHUMT00000144659 = FIXED- changed to transcript +OTTHUMT00000276377 = FIXED- changed to transcript +OTTHUMT00000257741 = FIXED- changed to nmd +OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed +OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed +OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene +OTTHUMT00000285227 = FIXED- changed start site +OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon +OTTHUMT00000157999 = FIXED- changed incorrect stop +OTTHUMT00000150523 = FIXED- incorrect trimming of CDS +OTTHUMT00000150525 = FIXED- incorrect trimming of CDS +OTTHUMT00000150522 = FIXED- incorrect trimming of CDS +OTTHUMT00000150521 = FIXED- incorrect trimming of CDS +OTTHUMT00000246819 = FIXED- corrected frame +OTTHUMT00000314078 = FIXED- corrected frame +OTTHUMT00000080133 = FIXED- corrected frame +OTTHUMT00000286423 = FIXED- changed to transcript +OTTMUST00000055509 = FIXED- error +OTTMUST00000038729 = FIXED- corrected frame +OTTMUST00000021760 = FIXED- corrected frame +OTTMUST00000023057 = FIXED- corrected frame +OTTMUST00000015207 = FIXED- corrected frame +OTTMUST00000056646 = FIXED- error +OTTMUST00000059686 = FIXED- corrected frame +OTTMUST00000013426 = FIXED- corrected frame +OTTMUST00000044412 = FIXED- error diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Utils/ensembl_init.example --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/ensembl_init.example Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,83 @@ +# +# Example of configuration file used by Bio::EnsEMBL::Registry::load_all +# method to store/register all kind of Adaptors. + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::ConfigRegistry; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor; + +my @aliases; + +new Bio::EnsEMBL::DBSQL::DBAdaptor( + '-host' => 'ensembldb.ensembl.org', + '-user' => 'anonymous', + '-port' => '3306', + '-species' => 'Homo sapiens', + '-group' => 'core', + '-dbname' => 'homo_sapiens_core_23_34e' +); + +@aliases = ( 'H_Sapiens', 'homo sapiens', + 'Homo_Sapiens', 'Homo_sapiens', + 'Homo', 'homo', + 'human' ); + +Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + '-species' => 'Homo sapiens', + '-alias' => [@aliases] ); + +new Bio::EnsEMBL::DBSQL::DBAdaptor( + '-host' => 'ensembldb.ensembl.org', + '-user' => 'anonymous', + '-port' => '3306', + '-species' => 'Mus musculus', + '-group' => 'core', + '-dbname' => 'mus_musculus_core_23_32c' +); + +@aliases = ( 'M_Musculus', 'mus musculus', + 'Mus_Musculus', 'Mus_musculus', + 'Mus', 'mus', + 'mouse' ); + +Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + '-species' => 'Mus musculus', + '-alias' => [@aliases] ); + +new Bio::EnsEMBL::DBSQL::DBAdaptor( + '-host' => 'ensembldb.ensembl.org', + '-user' => 'anonymous', + '-port' => '3306', + '-species' => 'Rattus norvegicus', + '-group' => 'core', + '-dbname' => 'rattus_norvegicus_core_23_3c' +); + +@aliases = ( 'R_Norvegicus', 'rattus norvegicus', + 'Rattus_Norvegicus', 'Rattus_norvegicus', + 'Rattus', 'rattus', + 'rat' ); + +Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + '-species' => 'Rattus norvegicus', + '-alias' => [@aliases] ); + +new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor( + '-host' => 'ensembldb.ensembl.org', + '-user' => 'anonymous', + '-port' => '3306', + '-species' => 'Compara23', + '-dbname' => 'ensembl_compara_23_1' +); + +@aliases = ( 'ensembl_compara_23_1', 'compara23' ); + +Bio::EnsEMBL::Utils::ConfigRegistry->add_alias( + '-species' => 'Compara23', + '-alias' => [@aliases] +); + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Allele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Allele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,384 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Allele +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Allele - A single allele of a nucleotide variation. + +=head1 SYNOPSIS + + $allele = Bio::EnsEMBL::Variation::Allele->new + (-allele => 'A', + -frequency => 0.85, + -population => $population); + + $delete = Bio::EnsEMBL::Variation::Allele->new + (-allele => '-', + -frequency => 0.15, + -population => $population); + + ... + + $astr = $a->allele(); + $pop = $a->population(); + $freq = $a->frequency(); + + print $a->allele(); + if($a->populaton) { + print " found in population ", $allele->population->name(); + } + if(defined($a->frequency())) { + print " with frequency ", $a->frequency(); + } + print "\n"; + + + +=head1 DESCRIPTION + +This is a class representing a single allele of a variation. In addition to +the nucleotide(s) (or absence of) that representing the allele frequency +and population information may be present. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Allele; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref); +use Scalar::Util qw(weaken); +use Bio::EnsEMBL::Variation::Failable; + +our @ISA = ('Bio::EnsEMBL::Storable', 'Bio::EnsEMBL::Variation::Failable'); + + +=head2 new + + Arg [-dbID]: int - unique internal identifier for the Allele + Arg [-ADAPTOR]: Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor + Arg [-ALLELE]: string - the nucleotide string representing the allele + Arg [-FREQUENCY]: float - the frequency of the allele + Arg [-POPULATION]: Bio::EnsEMBL::Variation::Population - the population + in which the allele was recorded + Example : $allele = Bio::EnsEMBL::Variation::Allele->new + (-allele => 'A', + -frequency => 0.85, + -population => $pop); + + Description: Constructor. Instantiates a new Allele object. + Returntype : Bio::EnsEMBL::Variation::Allele + Exceptions : none + Caller : general + Status : At Risk + +=cut + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($dbID, $adaptor, $allele, $freq, $count, $pop, $ss_id, $variation_id, $population_id) = + rearrange(['dbID', 'ADAPTOR', 'ALLELE', 'FREQUENCY', 'COUNT', 'POPULATION', 'SUBSNP', 'VARIATION_ID', 'POPULATION_ID'], @_); + + # set subsnp_id to undefined if it's 0 in the DB + #$ss_id = undef if (defined $ss_id && $ss_id == 0); + + # add ss to the subsnp_id + $ss_id = 'ss'.$ss_id if defined $ss_id && $ss_id !~ /^ss/; + + # Check that we at least get a BaseAdaptor + assert_ref($adaptor,'Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + # If the adaptor is not an AlleleAdaptor, try to get it via the passed adaptor + unless (check_ref($adaptor,'Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor')) { + $adaptor = $adaptor->db->get_AlleleAdaptor(); + # Verify that we could get the AlleleAdaptor + assert_ref($adaptor,'Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor'); + } + + my $self = bless {}, $class; + + $self->dbID($dbID); + $self->adaptor($adaptor); + $self->allele($allele); + $self->frequency($freq); + $self->count($count); + $self->subsnp($ss_id); + $self->{'_variation_id'} = $variation_id; + $self->{'_population_id'} = $population_id; + $self->population($pop) if (defined($pop)); + + return $self; +} + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +# An internal method for getting a unique hash key identifier, used by the Variation module +sub _hash_key { + my $self = shift; + + # By default, return the dbID + my $dbID = $self->dbID(); + return $dbID if (defined($dbID)); + + # If no dbID is specified, e.g. if we are creating a 'custom' object, return a fake dbID. This is necessary since e.g. Variation stores + # its alleles in a hash with dbID as key. To create fake dbIDs, use the string representing the memory address. + ($dbID) = sprintf('%s',$self) =~ m/\(([0-9a-fx]+)\)/i; + return $dbID; +} + +=head2 allele + + Arg [1] : string $newval (optional) + The new value to set the allele attribute to + Example : print $a->allele(); + $a1->allele('A'); + $a2->allele('-'); + Description: Getter/Setter for the allele attribute. The allele is a string + of nucleotide sequence, or a '-' representing the absence of + sequence (deletion). + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub allele{ + my $self = shift; + return $self->{'allele'} = shift if(@_); + return $self->{'allele'}; +} + + + + +=head2 frequency + + Arg [1] : float $newval (optional) + The new value to set the frequency attribute to + Example : $frequency = $a->frequency(); + Description: Getter/Setter for the frequency attribute. The frequency is + the frequency of the occurance of the allele. If the population + attribute it is the frequency of the allele within that + population. + Returntype : float + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub frequency{ + my $self = shift; + return $self->{'frequency'} = shift if(@_); + return $self->{'frequency'}; +} + +=head2 count + + Arg [1] : int $count (optional) + The new value to set the count attribute to + Example : $frequency = $allele->count() + Description: Getter/Setter for the observed count of this allele + within its associated population. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub count{ + my $self = shift; + return $self->{'count'} = shift if(@_); + return $self->{'count'}; +} + + + +=head2 population + + Arg [1] : Bio::EnsEMBL::Variation::Population $newval (optional) + The new value to set the population attribute to + Example : $population = $a->population(); + Description: Getter/Setter for the population attribute + Returntype : Bio::EnsEMBL::Variation::Population + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub population{ + my $self = shift; + + if(@_) { + assert_ref($_[0],'Bio::EnsEMBL::Variation::Population'); + $self->{'population'} = shift; + $self->{'_population_id'} = $self->{'population'}->dbID(); + } + + # Population can be lazy-loaded, so get it from the database if we have a sample_id but no cached object + if (!defined($self->{'population'}) && defined($self->{'_population_id'})) { + + # Check that an adaptor is attached + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor'); + + # Get a population object + my $population = $self->adaptor->db->get_PopulationAdaptor()->fetch_by_dbID($self->{'_population_id'}); + + # Set the population + $self->{'population'} = $population; + } + + return $self->{'population'}; +} + + +=head2 subsnp + + Arg [1] : string $newval (optional) + The new value to set the subsnp attribute to + Example : print $a->subsnp(); + Description: Getter/Setter for the subsnp attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub subsnp{ + my $self = shift; + if(@_) { + $self->{'subsnp'} = shift; + } + + my $ssid = $self->{'subsnp'}; + if(defined($ssid)) { + $ssid = 'ss'.$ssid unless $ssid =~ /^ss/; + } + + return $ssid; +} + + +=head2 variation + + Arg [1] : Bio::EnsEMBL::Variation::Variation $newval (optional) + The new value to set the variation attribute to + Example : print $a->variation->name(); + Description: Getter/Setter for the variation attribute. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw on incorrect argument + Caller : general + +=cut + +sub variation { + my $self = shift; + my $variation = shift; + + # Set the dbID of the variation object on this allele + if(defined($variation)) { + assert_ref($variation,'Bio::EnsEMBL::Variation::Variation'); + $self->{'_variation_id'} = $variation->dbID(); + return $variation; + } + + # Load the variation from the database if we have a variation_id + if (defined($self->{'_variation_id'})) { + + # Check that an adaptor is attached + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + + # Get a variation object + $variation = $self->adaptor->db->get_VariationAdaptor()->fetch_by_dbID($self->{'_variation_id'}); + + $self->{variation} = $variation; + } + + # Return the variation object + return $self->{variation}; +} + + +=head2 subsnp_handle + + Arg [1] : string $newval (optional) + The new value to set the subsnp_handle attribute to + Example : print $a->subsnp_handle(); + Description: Getter/Setter for the subsnp_handle attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub subsnp_handle{ + my $self = shift; + my $handle = shift; + + # if changing handle + if(defined($handle)) { + $self->{'subsnp_handle'} = $handle; + } + elsif (!defined($self->{'subsnp_handle'})) { + + # Check that this allele has an adaptor attached + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor'); + + $self->{'subsnp_handle'} = $self->adaptor->get_subsnp_handle($self); + } + + return $self->{'subsnp_handle'}; +} + +sub _weaken { + my $self = shift; + + # If the variation is not defined, do nothing + return unless (defined($self->variation())); + + # Weaken the link to the variation + weaken($self->{'variation'}); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/AlleleFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/AlleleFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,646 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::AlleleFeature +# +# Copyright (c) 2005 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::AlleleFeature - A genomic position for an allele in a sample. + +=head1 SYNOPSIS + + # Allele feature representing a single nucleotide polymorphism + $af = Bio::EnsEMBL::Variation::AlleleFeature->new + (-start => 100, + -end => 100, + -strand => 1, + -slice => $slice, + -allele_string => 'A', + -variation_name => 'rs635421', + -variation => $v); + ... + + # a allele feature is like any other ensembl feature, can be + # transformed etc. + $af = $af->transform('supercontig'); + + print $af->start(), "-", $af->end(), '(', $af->strand(), ')', "\n"; + + print $af->name(), ":", $af->allele_string(); + + # Get the Variation object which this feature represents the genomic + # position of. If not already retrieved from the DB, this will be + # transparently lazy-loaded + my $v = $af->variation(); + +=head1 DESCRIPTION + +This is a class representing the genomic position of a allele in a sample +from the ensembl-variation database. The actual variation information is +represented by an associated Bio::EnsEMBL::Variation::Variation object. Some +of the information has been denormalized and is available on the feature for +speed purposes. A AlleleFeature behaves as any other Ensembl feature. +See B and B. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::AlleleFeature; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::ConsequenceType; + +our @ISA = ('Bio::EnsEMBL::Feature'); + +=head2 new + + Arg [-ADAPTOR] : + see superclass constructor + + Arg [-START] : + see superclass constructor + Arg [-END] : + see superclass constructor + + Arg [-STRAND] : + see superclass constructor + + Arg [-SLICE] : + see superclass constructor + + Arg [-VARIATION_NAME] : + string - the name of the variation this feature is for (denormalisation + from Variation object). + + Arg [-SOURCE] : + string - the name of the source where the SNP comes from + + Arg [-VARIATION] : + int - the variation object associated with this feature. + + Arg [-VARIATION_ID] : + int - the internal id of the variation object associated with this + identifier. This may be provided instead of a variation object so that + the variation may be lazy-loaded from the database on demand. + + Arg [-SAMPLE_ID] : + int - the internal id of the sample object associated with this + identifier. This may be provided instead of the object so that + the population/individual may be lazy-loaded from the database on demand. + + Arg [-ALLELE_STRING] : + string - the allele for this AlleleFeature object. + + Arg [-OVERLAP_CONSEQUENCES] : + listref of Bio::EnsEMBL::Variation::OverlapConsequence objects. + + Example : + $af = Bio::EnsEMBL::Variation::AlleleFeature->new + (-start => 100, + -end => 100, + -strand => 1, + -slice => $slice, + -allele_string => 'A', + -consequence_type => 'NON_SYNONYMOUS_CODING', + -variation_name => 'rs635421', + -source => 'Celera', + -sample_id => $sample_id, + -variation => $v); + + Description: Constructor. Instantiates a new AlleleFeature object. + Returntype : Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + my ($allele, $overlap_consequences, $var_name, $variation, $variation_id, $population, $sample_id, $source) = + rearrange([qw(ALLELE_STRING OVERLAP_CONSEQUENCES VARIATION_NAME + VARIATION VARIATION_ID SAMPLE_ID SOURCE)], @_); + + $self->{'allele_string'} = $allele; + $self->{'overlap_consequences'} = $overlap_consequences; + $self->{'variation_name'} = $var_name; + $self->{'variation'} = $variation; + $self->{'_variation_id'} = $variation_id; + $self->{'_sample_id'} = $sample_id; + $self->{'source'} = $source; + + return $self; +} + + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +=head2 allele_string + + Arg [1] : string $newval (optional) + The new value to set the allele attribute to + Example : $allele = $obj->allele_string() + Description: Getter/Setter for the allele attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub allele_string{ + my $self = shift; + return $self->{'allele_string'} = shift if(@_); + + return $self->{'allele_string'} if ($self->{'_half_genotype'}); #for half genotypes + return join('|',split (//,unambiguity_code($self->{'allele_string'}))); #for heterozygous alleles +} + + + +=head2 consequence_type + + Arg [1] : (optional) String $term_type + Description: Get a list of all the unique consequence terms of this + AlleleFeature. By default returns Ensembl display terms + (e.g. 'NON_SYNONYMOUS_CODING'). $term_type can also be 'label' + (e.g. 'Non-synonymous coding'), 'SO' (Sequence Ontology, e.g. + 'non_synonymous_codon') or 'NCBI' (e.g. 'missense') + Returntype : listref of strings + Exceptions : none + Status : At Risk + +=cut + +sub consequence_type { + + my $self = shift; + my $term_type = shift; + + if($self->_is_sara) { + return ['SARA']; + } + else { + delete $self->{consequence_type} if defined($term_type); + + unless ($self->{consequence_type}) { + + $term_type ||= 'SO'; + my $method_name = $term_type.($term_type eq 'label' ? '' : '_term'); + $method_name = 'SO_term' unless $self->most_severe_OverlapConsequence->can($method_name); + + # work out the terms from the OverlapConsequence objects + $self->{consequence_type} = + [ $self->_is_sara ? 'SARA' : map { $_->$method_name } @{ $self->get_all_OverlapConsequences } ]; + + return $self->{consequence_type}; + } + } +} + + +=head2 get_all_OverlapConsequences + + Description: Get a list of all the unique OverlapConsequences of this AlleleFeature + Returntype : listref of Bio::EnsEMBL::Variation::OverlapConsequence objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_OverlapConsequences { + my $self = shift; + return $self->{overlap_consequences} +} + + +=head2 most_severe_OverlapConsequence + + Description: Get the OverlapConsequence considered (by Ensembl) to be the most severe + consequence of all the alleles of this AlleleFeature + Returntype : Bio::EnsEMBL::Variation::OverlapConsequence + Exceptions : none + Status : At Risk + +=cut + +sub most_severe_OverlapConsequence { + my $self = shift; + + unless ($self->{_most_severe_consequence}) { + + my $highest; + + for my $cons (@{ $self->get_all_OverlapConsequences }) { + $highest ||= $cons; + if ($cons->rank < $highest->rank) { + $highest = $cons; + } + } + + $self->{_most_severe_consequence} = $highest; + } + + return $self->{_most_severe_consequence}; +} + +=head2 display_consequence + + Arg [1] : (optional) String $term_type + Description: Get the term for the most severe consequence of this + AlleleFeature. By default returns Ensembl display terms + (e.g. 'NON_SYNONYMOUS_CODING'). $term_type can also be 'label' + (e.g. 'Non-synonymous coding'), 'SO' (Sequence Ontology, e.g. + 'non_synonymous_codon') or 'NCBI' (e.g. 'missense') + Returntype : string + Exceptions : none + Caller : webteam + Status : At Risk + +=cut + +sub display_consequence { + my $self = shift; + my $term_type = shift; + + if($self->_is_sara) { + return 'SARA'; + } + else { + $term_type ||= 'SO'; + my $method_name = $term_type.($term_type eq 'label' ? '' : '_term'); + $method_name = 'SO_term' unless $self->most_severe_OverlapConsequence->can($method_name); + + return $self->most_severe_OverlapConsequence->$method_name; + } +} + + +=head2 get_all_TranscriptVariations + + Arg [1] : (optional) listref of Bio::EnsEMBL::Transcript objects + Example : $af->get_all_TranscriptVariations; + Description : Get all the TranscriptVariations associated with this AlleleFeature. + If the optional list of Transcripts is supplied, get only TranscriptVariations + associated with those Transcripts. + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation objects + Exceptions : Thrown on wrong argument type + Caller : general + Status : At Risk + +=cut + +sub get_all_TranscriptVariations { + my $self = shift; + my $trs = shift; + + my $cons = $self->variation_feature->get_all_TranscriptVariations($trs); + + # convert the TV to a SARA one if this is a SARA genotype + if($self->_is_sara) { + $_->_convert_to_sara foreach @$cons; + } + + # otherwise we need to rearrange the TranscriptVariationAlleles based + # on the alleles of this genotype + else { + my %alleles; + $alleles{$_} = 1 foreach split /\||\/|\\/, $self->allele_string; + + $_->_rearrange_alleles(\%alleles) foreach @$cons; + } + + return $cons; +} + +=head2 variation_name + + Arg [1] : string $newval (optional) + The new value to set the variation_name attribute to + Example : $variation_name = $obj->variation_name() + Description: Getter/Setter for the variation_name attribute. This is the + name of the variation associated with this feature. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub variation_name{ + my $self = shift; + return $self->{'variation_name'} = shift if(@_); + return $self->{'variation_name'}; +} + +=head2 variation + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Variation $variation + Example : $v = $af->variation(); + Description: Getter/Setter for the variation associated with this feature. + If not set, and this AlleleFeature has an associated adaptor + an attempt will be made to lazy-load the variation from the + database. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub variation { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::Variation')) { + throw("Bio::EnsEMBL::Variation::Variation argument expected"); + } + $self->{'variation'} = shift; + } + elsif(!defined($self->{'variation'}) && $self->{'adaptor'} && + defined($self->{'_variation_id'})) { + # lazy-load from database on demand + my $va = $self->{'adaptor'}->db()->get_VariationAdaptor(); + $self->{'variation'} = $va->fetch_by_dbID($self->{'_variation_id'}); + } + + return $self->{'variation'}; +} + +=head2 variation_feature + + Arg [1] : (optional) Bio::EnsEMBL::Variation::VariationFeature $vf + Example : $vf = $af->variation_feature(); + Description: Getter/Setter for the variation feature associated with this feature. + If not set, and this AlleleFeature has an associated adaptor + an attempt will be made to lazy-load the variation from the + database. + Returntype : Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub variation_feature { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + throw("Bio::EnsEMBL::Variation::VariationFeature argument expected"); + } + $self->{'variation_feature'} = shift; + } + elsif(!defined($self->{'variation_feature'}) && $self->{'adaptor'} && + defined($self->{'_variation_feature_id'})) { + # lazy-load from database on demand + my $va = $self->{'adaptor'}->db()->get_VariationFeatureAdaptor(); + $self->{'variation_feature'} = $va->fetch_by_dbID($self->{'_variation_feature_id'}); + } + + return $self->{'variation_feature'}; +} + +=head2 individual + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Individual $individual + Example : $p = $af->individual(); + Description: Getter/Setter for the individual associated with this feature. + If not set, and this AlleleFeature has an associated adaptor + an attempt will be made to lazy-load the individual from the + database. + Returntype : Bio::EnsEMBL::Variation::Individual + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub individual { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::Individual')) { + throw("Bio::EnsEMBL::Variation::Individual argument expected"); + } + $self->{'individual'} = shift; + } + elsif(!defined($self->{'individual'}) && $self->{'adaptor'} && + defined($self->{'_sample_id'})) { + # lazy-load from database on demand + my $ia = $self->{'adaptor'}->db()->get_IndividualAdaptor(); + $self->{'individual'} = $ia->fetch_by_dbID($self->{'_sample_id'}); + if (!defined $self->{'individual'}){ + warning("AlleleFeature attached to Strain, not Individual"); + } + } + + return $self->{'individual'}; +} + + +=head2 apply_edit + + Arg [1] : reference to string $seqref + Arg [2] : int $start of the seq_ref + Example : $sequence = 'ACTGAATATTTAAGGCA'; + $af->apply_edit(\$sequence,$start); + print $sequence, "\n"; + Description: Applies this AlleleFeature directly to a sequence which is + passed by reference. + If either the start or end of this AlleleFeature are not defined + this function will not do anything to the passed sequence. + Returntype : reference to the same sequence that was passed in + Exceptions : none + Caller : Slice + Status : At Risk + +=cut + +sub apply_edit { + + my $self = shift; + my $seqref = shift; + + if(ref($seqref) ne 'SCALAR') { + throw("Reference to scalar argument expected"); + } + + if(!defined($self->{'start'}) || !defined($self->{'end'})) { + return $seqref; + } + + + my $len = $self->length; + my $as = $self->{'allele_string'}; + $as =~ s/\-//g; + + substr($$seqref, $self->{'start'}-1, $len) = $as; + + return $seqref; + +} + +=head2 length_diff + + Arg [1] : none + Example : my $diff = $af->length_diff(); + Description: Returns the difference in length caused by applying this + AlleleFeature to a sequence. This may be be negative (deletion), + positive (insertion) or 0 (replacement). + If either start or end are not defined 0 is returned. + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub length_diff { + + my $self = shift; + + return 0 if(!defined($self->{'end'}) || !defined($self->{'start'})); + + return length($self->{'allele_string'}) - ($self->{'end'} - $self->{'start'} + 1) if ($self->{'allele_string'} ne '-'); + return 0 - ($self->{'end'} - $self->{'start'} +1) if ($self->{'allele_string'} eq '-'); #do we need the +1 in the distance ?? + +} + + +sub length { + my $self = shift; + return $self->{'end'} - $self->{'start'} + 1; +} + +=head2 source + + Arg [1] : string $source (optional) + The new value to set the source attribute to + Example : $source = $vf->source() + Description: Getter/Setter for the source attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source{ + my $self = shift; + return $self->{'source'} = shift if(@_); + return $self->{'source'}; +} + +=head2 ref_allele_string + + Args : None + Example : $allele = $obj->ref_allele_string() + Description: Getter for the reference allele. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + + sub ref_allele_string{ + my $self = shift; + + my $reference_allele; + if ( ref ($self->slice) eq 'Bio::EnsEMBL::Slice' ){ + #we already have the reference slice, so just get the sequence + $reference_allele = $self->seq; + } + else{ + #we have a Strain or IndividualSlice, get the reference sequence from the method + $reference_allele = $self->slice->ref_subseq($self->start,$self->end,$self->strand) || '-'; + } + + return $reference_allele; + } + +=head2 get_all_sources + + Args : none + Example : my @sources = @{$af->get_all_sources()}; + Description : returns a list of all the sources for this + AlleleFeature + ReturnType : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + : Variation database is under development. +=cut + +sub get_all_sources{ + my $self = shift; + + my @sources; + my %sources; + if ($self->{'adaptor'}){ + map {$sources{$_}++} @{$self->{'adaptor'}->get_all_synonym_sources($self)}; + $sources{$self->source}++; + @sources = keys %sources; + return \@sources; + } + return \@sources; +} + +sub _is_sara{ + my $self = shift; + + if(!defined($self->{_is_sara})) { + my $allele_string = $self->allele_string; + my $ref = $self->ref_allele_string; + + my $is_sara = 1; + + foreach my $a(split /\/|\||\\/, $allele_string) { + $is_sara = 0 if $ref !~ /$a/i; + } + + $self->{_is_sara} = $is_sara; + } + + return $self->{_is_sara}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/BaseStructuralVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseStructuralVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,568 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::BaseStructuralVariation +# +# Copyright (c) 2011 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseStructuralVariation - Ensembl representation of a structural variant. + +=head1 SYNOPSIS + + # Structural variation representing a CNV + $sv = Bio::EnsEMBL::Variation::StructuralVariation->new + (-variation_name => 'esv25480', + -class_so_term => 'structural_variant', + -source => 'DGVa', + -source_description => 'Database of Genomic Variants Archive', + -study_name => 'estd20', + -study_description => 'Conrad 2009 "Origins and functional impact of copy number variation in the human genome." PMID:19812545 [remapped from build NCBI36]', + -study_url => 'ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd20_Conrad_et_al_2009', + -external_reference => 'pubmed/19812545'); + + ... + + print $sv->name(), ":", $sv->var_class(); + +=head1 DESCRIPTION + +This is a class representing a structural variation from the +ensembl-variation database. A structural variant may have a copy number variation, a tandem duplication, +an inversion of the sequence or others structural variations. + +The position of a StructuralVariation object on the Genome is represented +by the B class. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::BaseStructuralVariation; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Variation::Utils::Constants qw(%VARIATION_CLASSES); +use Bio::EnsEMBL::Variation::Failable; +our @ISA = ('Bio::EnsEMBL::Storable','Bio::EnsEMBL::Variation::Failable'); + +=head2 new + + Arg [-dbID] : + see superclass constructor + + Arg [-ADAPTOR] : + see superclass constructor + + Arg [-VARIATION_NAME] : + string - the name of the structural variant. + + Arg [-CLASS_SO_TERM] : + string - the sequence ontology term defining the class of the structural variant. + + Arg [-SOURCE] : + string - the name of the source where the structural variant comes from + + Arg [-SOURCE_DESCRIPTION] : + string - description of the source + + Arg [-TYPE] : + string - the class of structural variant e.g. 'copy_number_variation' + + Arg [-STUDY] : + object ref - the study object describing where the structural variant comes from. + + Arg [-VALIDATION_STATUS] : + string - the status of the structural variant (e.g. validated, not validated, ...) + + Arg [-IS_EVIDENCE] : + int - flag to inform whether the structural variant is a supporting evidence (1) or not (0). + + Arg [-IS_SOMATIC] : + int - flag to inform whether the structural variant is a somatic (1) or germline (0). + + Example for a structural variation: + $sv = Bio::EnsEMBL::Variation::StructuralVariation->new + (-variation_name => 'esv25480', + -class_so_term => 'copy_number_variation', + -source => 'DGVa', + -source_description => 'Database of Genomic Variants Archive', + + Description: Constructor. Instantiates a new structural variant object. + Returntype : Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ( + $dbID, + $adaptor, + $var_name, + $source, + $source_version, + $source_description, + $class_so_term, + $study, + $validation_status, + $is_evidence, + $is_somatic + ) = rearrange([qw( + dbID + ADAPTOR + VARIATION_NAME + SOURCE + SOURCE_VERSION + SOURCE_DESCRIPTION + CLASS_SO_TERM + STUDY + VALIDATION_STATES + IS_EVIDENCE + IS_SOMATIC + )], @_); + + my $self = bless { + 'dbID' => $dbID, + 'adaptor' => $adaptor, + 'variation_name' => $var_name, + 'source' => $source, + 'source_version' => $source_version, + 'source_description' => $source_description, + 'class_SO_term' => $class_so_term, + 'study' => $study, + 'validation_status' => $validation_status, + 'is_evidence' => $is_evidence || 0, + 'is_somatic' => $is_somatic || 0, + }; + return $self; +} + + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +=head2 display_id + + Arg [1] : none + Example : print $sv->display_id(), "\n"; + Description: Returns the 'display' identifier for this structural variant. + Returntype : string + Exceptions : none + Caller : webcode + Status : At Risk + +=cut + +sub display_id { + my $self = shift; + return $self->{'variation_name'} || ''; +} + + + +=head2 variation_name + + Arg [1] : string $newval (optional) + The new value to set the variation_name attribute to + Example : $variation_name = $obj->variation_name() + Description: Getter/Setter for the variation_name attribute. This is the + name of the variation associated with this feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub variation_name{ + my $self = shift; + return $self->{'variation_name'} = shift if(@_); + return $self->{'variation_name'}; +} + +=head2 var_class + + Args : None + Example : my $sv_class = $sv->var_class() + Description : Getter/setter for the class of structural variant + ReturnType : String + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub var_class { + my $self = shift; + + unless ($self->{class_display_term}) { + my $display_term = $VARIATION_CLASSES{$self->{class_SO_term}}->{display_term}; + + warn "No display term for SO term: ".$self->{class_SO_term} unless $display_term; + + $self->{class_display_term} = $display_term || $self->{class_SO_term}; + } + + return $self->{class_display_term}; +} + + +=head2 class_SO_term + + Args : None + Example : my $sv_so_term = $svf->class_SO_term() + Description : Getter for the class of structural variant, returning the SO term + ReturnType : String + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub class_SO_term { + my $self = shift; + + return $self->{class_SO_term}; +} + + +=head2 source + + Arg [1] : string $source (optional) + The new value to set the source attribute to + Example : $source = $svf->source() + Description: Getter/Setter for the source attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source { + my $self = shift; + return $self->{'source'} = shift if(@_); + return $self->{'source'}; +} + + +=head2 source_version + + Arg [1] : string $source_version (optional) + The new value to set the source_version attribute to + Example : $source_version = $svf->source_version() + Description: Getter/Setter for the source_version attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source_version { + my $self = shift; + return $self->{'source_version'} = shift if(@_); + return $self->{'source_version'}; +} + + +=head2 source_description + + Arg [1] : string $source_description (optional) + The new value to set the source_description attribute to + Example : $source_description = $svf->source_description() + Description: Getter/Setter for the source_description attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source_description { + my $self = shift; + return $self->{'source_description'} = shift if(@_); + return $self->{'source_description'}; +} + + +=head2 get_all_validation_states + + Arg [1] : none + Example : my @vstates = @{$v->get_all_validation_states()}; + Description: Retrieves all validation states for this structural variation. Current + possible validation statuses are 'validated','not validated', + 'high quality' + Returntype : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_validation_states { + my $self = shift; + + return $self->{'validation_status'} || []; +} + + +=head2 is_evidence + + Arg [1] : int $flag (optional) + Example : $is_evidence = $obj->is_evidence() + Description: Getter/Setter of a flag to inform whether the structural variant is a + supporting evidence (1) or not (0). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_evidence{ + my $self = shift; + return $self->{'is_evidence'} = shift if(@_); + return $self->{'is_evidence'}; +} + +=head2 is_somatic + + Arg [1] : int $flag (optional) + Example : $is_somatic = $obj->is_somatic() + Description: Getter/Setter of a flag to inform whether the structural variant is somatic (1) or germline (0). + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_somatic{ + my $self = shift; + return $self->{'is_somatic'} = shift if(@_); + return $self->{'is_somatic'}; +} + + +=head2 study + + Arg [1] : Bio::EnsEMBL::Variation::Study (optional) + Example : $study = $sv->study() + Description: Getter/Setter for the study object + Returntype : Bio::EnsEMBL::Variation::Study + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study { + my $self = shift; + return $self->{'study'} = shift if(@_); + return $self->{'study'}; +} + + +=head2 study_name + + Arg [1] : string $study (optional) + The new value to set the study attribute to + Example : $study = $sv->study() + Description: Getter/Setter for the study attribute + Returntype : string + Exceptions : none + Caller : general + Status : Deprecated + +=cut + +sub study_name { + my $self = shift; + deprecate('Use the method "study" instead (returns a Bio::EnsEMBL::Variation::Study object).'); + return undef if (!$self->study); + return $self->study->name = shift if(@_); + return $self->study->name; +} + + + +=head2 study_description + + Arg [1] : string $study_description (optional) + The new value to set the study_description attribute to + Example : $study_description = $sv->study_description() + Description: Getter/Setter for the study_description attribute + Returntype : string + Exceptions : none + Caller : general + Status : Deprecated + +=cut + +sub study_description { + my $self = shift; + deprecate('Use the method "study" instead (returns a Bio::EnsEMBL::Variation::Study object).'); + return undef if (!$self->study); + return $self->study->description = shift if(@_); + return $self->study->description; +} + +=head2 study_url + + Arg [1] : string $newval (optional) + The new value to set the study_url attribute to + Example : $paper = $obj->study_url() + Description: Getter/Setter for the study_url attribute.This is the link to the website where the data are stored. + Returntype : string + Exceptions : none + Caller : general + Status : Deprecated + +=cut + +sub study_url{ + my $self = shift; + deprecate('Use the method "study" instead (returns a Bio::EnsEMBL::Variation::Study object).'); + return undef if (!$self->study); + return $self->study->url = shift if(@_); + return $self->study->url; +} + + +=head2 external_reference + + Arg [1] : string $newval (optional) + The new value to set the external reference attribute to + Example : $paper = $obj->external_reference() + Description: Getter/Setter for the external reference attribute. This is the + pubmed/id or project name associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : Deprecated + +=cut + +sub external_reference{ + my $self = shift; + deprecate('Use the method "study" instead (returns a Bio::EnsEMBL::Variation::Study object).'); + return undef if (!$self->study); + return $self->study->external_reference = shift if(@_); + return $self->study->external_reference; +} + + +=head2 get_all_StructuralVariationFeatures + + Args : None + Example : $svfs = $sv->get_all_StructuralVariationFeatures(); + Description : Retrieves all StructuralVariationFeature for this structural variant + ReturnType : reference to list of Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_all_StructuralVariationFeatures{ + my $self = shift; + + if(defined $self->{'adaptor'}) { + + # get structural variation feature adaptor + my $svf_adaptor = $self->{'adaptor'}->db()->get_StructuralVariationFeatureAdaptor(); + + return $svf_adaptor->fetch_all_by_StructuralVariation($self); + } + else { + warn("No variation database attached"); + return []; + } +} + + +=head2 get_all_StructuralVariationAnnotations + + Args : None + Example : $svas = $sv->get_all_StructuralVariationAnnotations(); + Description : Retrieves all get_all_StructuralVariationAnnotation for this structural variant + ReturnType : reference to list of Bio::EnsEMBL::Variation::StructuralVariationAnnotation + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_all_StructuralVariationAnnotations{ + my $self = shift; + + if(defined $self->{'adaptor'}) { + + # get structural variation annotation adaptor + my $sva_adaptor = $self->{'adaptor'}->db()->get_StructuralVariationAnnotationAdaptor(); + + return $sva_adaptor->fetch_all_by_StructuralVariation($self); + } + else { + warn("No variation database attached"); + return []; + } +} + + +=head2 summary_as_hash + + Example : $sv_summary = $sv->summary_as_hash(); + Description : Retrieves a textual summary of this StructuralVariation object. + Returns : hashref of descriptive strings + +=cut + +sub summary_as_hash { + my $self = shift; + my %summary; + $summary{'display_id'} = $self->display_id; + $summary{'study_name'} = $self->study_name; + $summary{'study_description'} = $self->study_description; + $summary{'class'} = $self->var_class; + return \%summary; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,680 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseTranscriptVariation + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::BaseTranscriptVariation; + +=head1 DESCRIPTION + +A helper class for representing an overlap of a Transcript and a +Variation (either sequence or structural). Should not be invoked directly. + +=cut + +package Bio::EnsEMBL::Variation::BaseTranscriptVariation; + +use strict; +use warnings; + +use Digest::MD5 qw(md5_hex); + +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap within_cds); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlap); + +=head2 transcript_stable_id + + Description: Returns the stable_id of the associated Transcript + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub transcript_stable_id { + my $self = shift; + return $self->SUPER::_feature_stable_id(@_); +} + +=head2 transcript + + Arg [1] : (optional) Bio::EnsEMBL::Transcript + Description: Get/set the associated Transcript + Returntype : Bio::EnsEMBL::Transcript + Exceptions : throws if argument is wrong type + Status : At Risk + +=cut + +sub transcript { + my ($self, $transcript) = @_; + assert_ref($transcript, 'Bio::EnsEMBL::Transcript') if $transcript; + return $self->SUPER::feature($transcript, 'Transcript'); +} + +=head2 feature + + Arg [1] : (optional) Bio::EnsEMBL::Transcript + Description: Get/set the associated Transcript (overriding the superclass feature method) + Returntype : Bio::EnsEMBL::Transcript + Exceptions : throws if argument is wrong type + Status : At Risk + +=cut + +sub feature { + my $self = shift; + return $self->transcript(@_); +} + +=head2 cdna_start + + Arg [1] : (optional) int $start + Example : $cdna_start = $tv->cdna_start; + Description: Getter/Setter for the start position of this variation on the + transcript in cDNA coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_start { + my ($self, $cdna_start) = @_; + + $self->{cdna_start} = $cdna_start if defined $cdna_start; + + unless (exists $self->{cdna_start}) { + my $cdna_coords = $self->cdna_coords; + + my ($first, $last) = ($cdna_coords->[0], $cdna_coords->[-1]); + + $self->{cdna_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start; + $self->{cdna_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end; + } + + return $self->{cdna_start}; +} + +=head2 cdna_end + + Arg [1] : (optional) int $end + Example : $cdna_end = $tv->cdna_end; + Description: Getter/Setter for the end position of this variation on the + transcript in cDNA coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_end { + my ($self, $cdna_end) = @_; + + $self->{cdna_end} = $cdna_end if defined $cdna_end; + + # call cdna_start to calculate the start and end + $self->cdna_start unless exists $self->{cdna_end}; + + return $self->{cdna_end}; +} + +=head2 cds_start + + Arg [1] : (optional) int $start + Example : $cds_start = $tv->cds_start; + Description: Getter/Setter for the start position of this variation on the + transcript in CDS coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cds_start { + my ($self, $cds_start) = @_; + + $self->{cds_start} = $cds_start if defined $cds_start; + + unless (exists $self->{cds_start}) { + my $cds_coords = $self->cds_coords; + + my ($first, $last) = ($cds_coords->[0], $cds_coords->[-1]); + my $exon_phase = $self->transcript->start_Exon->phase; + + $self->{cds_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start + ($exon_phase > 0 ? $exon_phase : 0); + $self->{cds_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end + ($exon_phase > 0 ? $exon_phase : 0); + } + + return $self->{cds_start}; +} + +=head2 cds_end + + Arg [1] : (optional) int $end + Example : $cds_end = $tv->cds_end; + Description: Getter/Setter for the end position of this variation on the + transcript in CDS coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cds_end { + my ($self, $cds_end) = @_; + + $self->{cds_end} = $cds_end if defined $cds_end; + + # call cds_start to calculate the start and end + $self->cds_start unless exists $self->{cds_end}; + + return $self->{cds_end}; +} + +=head2 translation_start + + Arg [1] : (optional) int $start + Example : $translation_start = $tv->translation_start; + Description: Getter/Setter for the start position of this variation on the + transcript in peptide coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub translation_start { + my ($self, $translation_start) = @_; + + $self->{translation_start} = $translation_start if defined $translation_start; + + unless (exists $self->{translation_start}) { + my $translation_coords = $self->translation_coords; + + my ($first, $last) = ($translation_coords->[0], $translation_coords->[-1]); + + $self->{translation_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start; + $self->{translation_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end; + } + + return $self->{translation_start}; +} + + +=head2 translation_end + + Arg [1] : (optional) int $end + Example : $transaltion_end = $tv->translation_end; + Description: Getter/Setter for the end position of this variation on the + transcript in peptide coordinates. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub translation_end { + my ($self, $translation_end) = @_; + + $self->{translation_end} = $translation_end if defined $translation_end; + + # call translation_start to calculate the start and end + $self->translation_start unless exists $self->{translation_end}; + + return $self->{translation_end}; +} + +=head2 cdna_coords + + Description: Use the TranscriptMapper to calculate the cDNA + coordinates of this variation + Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_coords { + my $self = shift; + + unless ($self->{_cdna_coords}) { + my $vf = $self->base_variation_feature; + my $tran = $self->transcript; + $self->{_cdna_coords} = [ $self->_mapper->genomic2cdna($vf->start, $vf->end, $tran->strand) ]; + } + + return $self->{_cdna_coords}; +} + +=head2 cds_coords + + Description: Use the TranscriptMapper to calculate the CDS + coordinates of this variation + Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cds_coords { + my $self = shift; + + unless ($self->{_cds_coords}) { + my $vf = $self->base_variation_feature; + my $tran = $self->transcript; + $self->{_cds_coords} = [ $self->_mapper->genomic2cds($vf->start, $vf->end, $tran->strand) ]; + } + + return $self->{_cds_coords}; +} + +=head2 translation_coords + + Description: Use the TranscriptMapper to calculate the peptide + coordinates of this variation + Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub translation_coords { + my $self = shift; + + unless ($self->{_translation_coords}) { + my $vf = $self->base_variation_feature; + my $tran = $self->transcript; + $self->{_translation_coords} = [ $self->_mapper->genomic2pep($vf->start, $vf->end, $tran->strand) ]; + } + + return $self->{_translation_coords}; +} + + +=head2 distance_to_transcript + + Arg [1] : (optional) int $distance + Example : $distance = $tv->distance_to_transcript; + Description: Getter/Setter for the distance of this variant to the transcript. + This is the shortest distance between variant start/end and + transcript start/end, so if a variant falls 5' of a transcript + on the forward strand this distance will be that between the + variant end and the transcript start; if it falls 3' it will be + the distance between the variant start and the transcript end. + Returntype : int + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub distance_to_transcript { + my ($self, $distance) = @_; + + $self->{distance_to_transcript} = $distance if defined $distance; + + unless (exists $self->{distance_to_transcript}) { + my $vf = $self->base_variation_feature; + my $tr = $self->transcript; + + my @dists = ( + $vf->start - $tr->start, + $vf->start - $tr->end, + $vf->end - $tr->start, + $vf->end - $tr->end + ); + + # make positive if <0 and sort + @dists = sort {$a <=> $b} map {$_ < 0 ? 0 - $_ : $_} @dists; + + $self->{distance_to_transcript} = $dists[0]; + } + + return $self->{distance_to_transcript}; +} + +=head2 get_overlapping_ProteinFeatures + + Description: Find any ProteinFeatures (e.g. pfam or interpro domains etc.) that + the associated variation feature lies in + Returntype : listref of Bio::EnsEMBL::ProteinFeatures (possibly empty) + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_overlapping_ProteinFeatures { + my $self = shift; + + unless (exists $self->{_protein_features}) { + + $self->{_protein_features } = []; + + my $tl = $self->transcript->translation; + + if (defined $tl) { + + my $tl_start = $self->translation_start; + my $tl_end = $self->translation_end; + + if (defined $tl_start && defined $tl_end) { + for my $feat (@{ $tl->get_all_ProteinFeatures }) { + if (overlap($feat->start, $feat->end, $tl_start, $tl_end)) { + push @{ $self->{_protein_features} }, $feat; + } + } + } + } + } + + return $self->{_protein_features}; +} + +=head2 exon_number + + Description: Identify which exon(s) this variant falls in + Returntype : '/'-separated string containing the exon number(s) and the total + number of exons in this transcript, or undef if this variant + does not fall in any exons + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub exon_number { + my $self = shift; + $self->_exon_intron_number unless exists $self->{exon_number}; + return $self->{exon_number}; +} + +=head2 intron_number + + Description: Identify which intron(s) this variant falls in + Returntype : '/'-separated string containing the intron number(s) and the total + number of introns in this transcript, or undef if this variant + does not fall in any introns + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub intron_number { + my $self = shift; + $self->_exon_intron_number unless exists $self->{intron_number}; + return $self->{intron_number}; +} + +sub _exon_intron_number { + my $self = shift; + + # work out which exon or intron this variant falls in + + # ensure the keys exist so even if we don't fall in an exon + # or intron we'll only call this method once + + $self->{exon_number} = $self->{intron_number} = undef; + + my $vf = $self->base_variation_feature; + + my $vf_start = $vf->start; + my $vf_end = $vf->end; + + my $strand = $self->transcript->strand; + + my $exons = $self->_exons; + + my $tot_exons = scalar(@$exons); + + my $exon_count = 0; + + my $prev_exon; + + my (@overlapped_exons, @overlapped_introns); + + for my $exon (@$exons) { + + $exon_count++; + + if (overlap($vf_start, $vf_end, $exon->start, $exon->end)) { + push @overlapped_exons, $exon_count; + #$self->{exon_number} = defined($self->{exon_number}) ? $self->{exon_number}.",".$exon_count : $exon_count; + } + + if ($prev_exon) { + my $intron_start = $strand == 1 ? $prev_exon->end + 1 : $exon->end + 1; + my $intron_end = $strand == 1 ? $exon->start - 1 : $prev_exon->start - 1; + + if ($prev_exon && overlap($vf_start, $vf_end, $intron_start, $intron_end)) { + push @overlapped_introns, $exon_count - 1; + #$self->{intron_number} = defined($self->{intron_number}) ? $self->{intron_number}.",".($exon_count - 1) : ($exon_count - 1); + } + } + + $prev_exon = $exon; + } + + if(@overlapped_exons) { + $self->{exon_number} = (scalar @overlapped_exons > 1 ? $overlapped_exons[0].'-'.$overlapped_exons[-1] : $overlapped_exons[0]).'/'.$tot_exons; + } + if(@overlapped_introns) { + $self->{intron_number} = (scalar @overlapped_introns > 1 ? $overlapped_introns[0].'-'.$overlapped_introns[-1] : $overlapped_introns[0]).'/'.($tot_exons - 1); + } +} + +sub _intron_effects { + my $self = shift; + + # internal method used by Bio::EnsEMBL::Variation::Utils::VariationEffect + # when calculating various consequence types + + # this method is a major bottle neck in the effect calculation code so + # we cache results and use local variables instead of method calls where + # possible to speed things up - caveat bug-fixer! + + unless ($self->{_intron_effects}) { + + my $vf = $self->base_variation_feature; + + my $intron_effects = {}; + + my $found_effect = 0; + + my $vf_start = $vf->start; + my $vf_end = $vf->end; + + my $insertion = $vf_start == $vf_end+1; + + for my $intron (@{ $self->_introns }) { + + my $intron_start = $intron->start; + my $intron_end = $intron->end; + + # under various circumstances the genebuild process can introduce + # artificial short (<= 12 nucleotide) introns into transcripts + # (e.g. to deal with errors in the reference sequence etc.), we + # don't want to categorise variations that fall in these introns + # as intronic, or as any kind of splice variant + + my $frameshift_intron = ( abs($intron_end - $intron_start) <= 12 ); + + if ($frameshift_intron) { + if (overlap($vf_start, $vf_end, $intron_start, $intron_end)) { + $intron_effects->{within_frameshift_intron} = 1; + next; + } + } + + if (overlap($vf_start, $vf_end, $intron_start, $intron_start+1)) { + $intron_effects->{start_splice_site} = 1; + } + + if (overlap($vf_start, $vf_end, $intron_end-1, $intron_end)) { + $intron_effects->{end_splice_site} = 1; + } + + # we need to special case insertions between the donor and acceptor sites + + if (overlap($vf_start, $vf_end, $intron_start+2, $intron_end-2) or + ($insertion && ($vf_start == $intron_start+2 || $vf_end == $intron_end-2)) ) { + $intron_effects->{intronic} = 1; + } + + # the definition of splice_region (SO:0001630) is "within 1-3 bases + # of the exon or 3-8 bases of the intron", the intron start is the + # first base of the intron so we only need to add or subtract 7 from + # it to get the correct coordinate. We also need to special case + # insertions between the edge of an exon and a donor or acceptor site + # and between a donor or acceptor site and the intron + + if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or + overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or + overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or + overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or + ($insertion && ( + $vf_start == $intron_start || + $vf_end == $intron_end || + $vf_start == $intron_start+2 || + $vf_end == $intron_end-2 + ) )) { + + $intron_effects->{splice_region} = 1; + } + } + + $self->{_intron_effects} = $intron_effects; + } + + return $self->{_intron_effects}; +} + +# NB: the methods below all cache their data in the associated transcript itself, this +# gives a significant speed up when you are calculating the effect of all variations +# on a transcript, and means that the cache will be freed when the transcript itself +# is garbage collected rather than us having to maintain a transcript feature cache +# ourselves + +sub _introns { + my $self = shift; + + my $tran = $self->transcript; + + my $introns = $tran->{_variation_effect_feature_cache}->{introns} ||= $tran->get_all_Introns; + + return $introns; +} + +sub _exons { + my $self = shift; + + my $tran = $self->transcript; + + my $exons = $tran->{_variation_effect_feature_cache}->{exons} ||= $tran->get_all_Exons; + + return $exons; +} + +sub _mapper { + my $self = shift; + + my $tran = $self->transcript; + + my $mapper = $tran->{_variation_effect_feature_cache}->{mapper} ||= $tran->get_TranscriptMapper; + + return $mapper; +} +sub _translateable_seq { + my $self = shift; + + my $tran = $self->transcript; + + my $tran_seq = $tran->{_variation_effect_feature_cache}->{translateable_seq} ||= $tran->translateable_seq; + + return $tran_seq; +} + +sub _peptide { + my $self = shift; + + my $tran = $self->transcript; + + my $peptide = $tran->{_variation_effect_feature_cache}->{peptide}; + + unless ($peptide) { + my $translation = $tran->translate; + $peptide = $translation ? $translation->seq : undef; + $tran->{_variation_effect_feature_cache}->{peptide} = $peptide; + } + + return $peptide; +} + +sub _translation_md5 { + my $self = shift; + + my $tran = $self->transcript; + + unless (exists $tran->{_variation_effect_feature_cache}->{translation_md5}) { + $tran->{_variation_effect_feature_cache}->{translation_md5} = + $self->_peptide ? md5_hex($self->_peptide) : undef; + } + + return $tran->{_variation_effect_feature_cache}->{translation_md5}; +} + +sub _codon_table { + my $self = shift; + + my $tran = $self->transcript; + + my $codon_table = $tran->{_variation_effect_feature_cache}->{codon_table}; + + unless ($codon_table) { + # for mithocondrial dna we need to to use a different codon table + my $attrib = $tran->slice->get_all_Attributes('codon_table')->[0]; + + # default to the vertebrate codon table which is denoted as 1 + $codon_table = $attrib ? $attrib->value : 1; + + $tran->{_variation_effect_feature_cache}->{codon_table} = $codon_table + } + + return $codon_table; +} + +1; \ No newline at end of file diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,87 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele; + +=head1 DESCRIPTION + +An helper class for representing an overlap of a Transcript and a +Variation allele (either sequence or structural). Should not be invoked +directly. + +=cut + +package Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele); + +=head2 base_transcript_variation + + Description: Get/set the associated BaseTranscriptVariation + Returntype : Bio::EnsEMBL::Variation::BaseTranscriptVariation + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub base_transcript_variation { + my ($self, $btv) = @_; + assert_ref($btv, 'Bio::EnsEMBL::Variation::BaseTranscriptVariation') if $btv; + return $self->variation_feature_overlap($btv); +} + +=head2 transcript + + Description: Get the associated Transcript + Returntype : Bio::EnsEMBL::Transcript + Exceptions : none + Status : At Risk + +=cut + +sub transcript { + my $self = shift; + return $self->base_transcript_variation->transcript; +} + +=head2 base_variation_feature + + Description: Get the associated BaseVariationFeature + Returntype : Bio::EnsEMBL::Variation::BaseVariationFeature + Exceptions : none + Status : At Risk + +=cut + +sub base_variation_feature { + my $self = shift; + return $self->base_transcript_variation->base_variation_feature; +} + +1; \ No newline at end of file diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/BaseVariationFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseVariationFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,53 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::VariationFeature +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseVariationFeature - Abstract base class for variation features + +=head1 SYNOPSIS + +None + +=head1 DESCRIPTION + +Abstract base class representing variation features. Should not be instantiated +directly. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::BaseVariationFeature; + +use Bio::EnsEMBL::Feature; + +our @ISA = ('Bio::EnsEMBL::Feature'); + +1; \ No newline at end of file diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/BaseVariationFeatureOverlap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseVariationFeatureOverlap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,444 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap; + + my $bvfo = Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap->new( + -feature => $feature, + -base_variation_feature => $var_feat + ); + + print "consequence type: ", (join ",", @{ $bvfo->consequence_type }), "\n"; + print "most severe consequence: ", $bvfo->display_consequence, "\n"; + +=head1 DESCRIPTION + +A BaseVariationFeatureOverlap represents a BaseVariationFeature which is in close +proximity to another Ensembl Feature. It is the superclass of variation feature +specific classes such as VariationFeatureOverlap and StructuralVariationOverlap +and has methods common to all such objects. You will not normally instantiate this +class directly, instead instantiating one of the more specific subclasses. + +=cut + +package Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap within_cds); + +sub new { + my $class = shift; + + my ( + $adaptor, + $base_variation_feature, + $feature, + $no_transfer + ) = rearrange([qw( + ADAPTOR + BASE_VARIATION_FEATURE + FEATURE + NO_TRANSFER + )], @_); + + assert_ref($base_variation_feature, 'Bio::EnsEMBL::Variation::BaseVariationFeature'); + + if (defined $feature) { + + assert_ref($feature, 'Bio::EnsEMBL::Feature'); + + # we need to ensure the Feature and the BaseVariationFeature live on the same slice + # so we explicitly transfer the Feature here + unless($no_transfer && $no_transfer == 1) { + $feature = $feature->transfer($base_variation_feature->slice) + or throw("Unable to transfer the supplied feature to the same slice as the base variation feature"); + } + } + + my $self = bless { + base_variation_feature => $base_variation_feature, + feature => $feature, + adaptor => $adaptor, + }, $class; + + return $self; +} + +sub new_fast { + my ($class, $hashref) = @_; + return bless $hashref, $class; +} + +=head2 feature + + Arg [1] : (optional) A Bio::EnsEMBL::Feature + Description: Get/set the associated Feature, lazy-loading it if required + Returntype : Bio::EnsEMBL::Feature + Exceptions : throws isf the argument is the wrong type + Status : At Risk + +=cut + +sub feature { + my ($self, $feature, $type) = @_; + + if ($feature) { + assert_ref($feature, 'Bio::EnsEMBL::Feature'); + $self->{feature} = $feature; + } + + if ($type && !$self->{feature}) { + + # try to lazy load the feature + + if (my $adap = $self->{adaptor}) { + + my $get_method = 'get_'.$type.'Adaptor'; + + # XXX: this can doesn't work because the method is AUTOLOADed, need to rethink this... + #if ($adap->db->dnadb->can($get_method)) { + if (my $fa = $adap->db->dnadb->$get_method) { + + # if we have a stable id for the feature use that + if (my $feature_stable_id = $self->{_feature_stable_id}) { + if (my $f = $fa->fetch_by_stable_id($feature_stable_id)) { + $self->{feature} = $f; + delete $self->{_feature_stable_id}; + } + } + elsif (my $feature_label = $self->{_feature_label}) { + # get a slice covering the vf + + #for my $f ($fa->fetch_all_by_Slice_constraint) + } + } + #} + else { + warn "Cannot get an adaptor for type: $type"; + } + } + } + + return $self->{feature}; +} + +sub _fetch_feature_for_stable_id { + + # we shouldn't actually need this method as there will apparently + # soon be core support for fetching any feature by its stable id, + # but I'm waiting for core to add this... + + my ($self, $feature_stable_id) = @_; + + my $type_lookup = { + G => { type => 'Gene', group => 'core' }, + T => { type => 'Transcript', group => 'core' }, + R => { type => 'RegulatoryFeature', group => 'funcgen' }, + }; + + if ($feature_stable_id =~ /^ENS[A-Z]*([G|R|T])\d+$/) { + + my $type = $type_lookup->{$1}->{type}; + my $group = $type_lookup->{$1}->{group}; + + if (my $adap = $self->{adaptor}) { + + my $get_method = 'get_'.$type.'Adaptor'; + + if ($adap->db->dnadb->can($get_method)) { + if (my $fa = $adap->db->dnadb->$get_method) { + + # if we have a stable id for the feature use that + if (my $feature_stable_id = $self->{_feature_stable_id}) { + if (my $f = $fa->fetch_by_stable_id($feature_stable_id)) { + $self->{feature} = $f; + delete $self->{_feature_stable_id}; + } + } + elsif (my $feature_label = $self->{_feature_label}) { + # get a slice covering the vf + + + #for my $f ($fa->fetch_all_by_Slice_constraint) + } + } + } + else { + warn "Cannot get an adaptor for type: $type"; + } + } + } +} + +sub _fetch_adaptor_for_group { + my ($self, $group) = @_; + +} + +sub _feature_stable_id { + my $self = shift; + if ($self->feature && $self->feature->can('stable_id')) { + return $self->feature->stable_id; + } + elsif (my $id = $self->{_feature_stable_id}) { + return $id; + } + else { + return undef; + } +} + +=head2 base_variation_feature + + Arg [1] : (optional) A Bio::EnsEMBL::Variation::BaseVariationFeature + Description: Get/set the associated BaseVariationFeature + Returntype : Bio::EnsEMBL::Variation::BaseVariationFeature + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub base_variation_feature { + my ($self, $bvf) = @_; + + if ($bvf) { + assert_ref($bvf, 'Bio::EnsEMBL::Variation::BaseVariationFeature'); + $self->{base_variation_feature} = $bvf; + } + + return $self->{base_variation_feature}; +} + +=head2 add_BaseVariationFeatureOverlapAllele + + Arg [1] : A Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele instance + Description: Add an allele to this BaseVariationFeatureOverlap + Returntype : none + Exceptions : throws if the argument is not the expected type + Status : At Risk + +=cut + +sub add_BaseVariationFeatureOverlapAllele { + my ($self, $bvfoa) = @_; + + assert_ref($bvfoa, 'Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele'); + + if ($bvfoa->is_reference) { + $self->{reference_allele} = $bvfoa; + } + else { + my $alt_alleles = $self->{alt_alleles} ||= []; + push @$alt_alleles, $bvfoa; + } +} + +=head2 get_reference_BaseVariationFeatureOverlapAllele + + Description: Get the object representing the reference allele of this BaseVariationFeatureOverlapAllele + Returntype : Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele instance + Exceptions : none + Status : At Risk + +=cut + +sub get_reference_BaseVariationFeatureOverlapAllele { + my $self = shift; + return $self->{reference_allele}; +} + +=head2 get_all_alternate_BaseVariationFeatureOverlapAlleles + + Description: Get a list of the alternate alleles of this BaseVariationFeatureOverlapAllele + Returntype : listref of Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_alternate_BaseVariationFeatureOverlapAlleles { + my $self = shift; + + $self->{alt_alleles} ||= []; + + return $self->{alt_alleles}; +} + +=head2 get_all_BaseVariationFeatureOverlapAlleles + + Description: Get a list of the all the alleles, both reference and alternate, of this + BaseVariationFeatureOverlap + Returntype : listref of Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_BaseVariationFeatureOverlapAlleles { + my $self = shift; + + my @alleles = @{ $self->get_all_alternate_BaseVariationFeatureOverlapAlleles }; + + my $ref = $self->get_reference_BaseVariationFeatureOverlapAllele; + + unshift @alleles, $ref if defined $ref; + + return \@alleles; +} + +=head2 consequence_type + + Arg [1] : (optional) String $term_type + Description: Get a list of all the unique consequence terms of the alleles of this + BaseVariationFeatureOverlap. By default returns Ensembl display terms + (e.g. 'NON_SYNONYMOUS_CODING'). $term_type can also be 'label' + (e.g. 'Non-synonymous coding'), 'SO' (Sequence Ontology, e.g. + 'non_synonymous_codon') or 'NCBI' (e.g. 'missense') + Returntype : listref of strings + Exceptions : none + Status : At Risk + +=cut + +sub consequence_type { + my $self = shift; + my $term_type = shift; + + my $method_name; + + # delete cached term + if(defined($term_type)) { + delete $self->{_consequence_type}; + $method_name = $term_type.($term_type eq 'label' ? '' : '_term'); + $method_name = 'SO_term' unless defined $self->most_severe_OverlapConsequence && $self->most_severe_OverlapConsequence->can($method_name); + } + + $method_name ||= 'SO_term'; + + unless ($self->{_consequence_type}) { + + # use a hash to ensure we don't include redundant terms (because more than one + # allele may have the same consequence SO_term) + + my %cons_types; + + for my $allele (@{ $self->get_all_alternate_BaseVariationFeatureOverlapAlleles }) { + for my $cons (@{ $allele->get_all_OverlapConsequences }) { + $cons_types{$cons->$method_name} = $cons->rank; + } + } + + # sort the consequence types by rank such that the more severe terms are earlier in the list + + $self->{_consequence_type} = [ sort { $cons_types{$a} <=> $cons_types{$b} } keys %cons_types ]; + } + + return $self->{_consequence_type}; +} + +=head2 most_severe_OverlapConsequence + + Description: Get the OverlapConsequence considered (by Ensembl) to be the most severe + consequence of all the alleles of this VariationFeatureOverlap + Returntype : Bio::EnsEMBL::Variation::OverlapConsequence + Exceptions : none + Status : At Risk + +=cut + +sub most_severe_OverlapConsequence { + my $self = shift; + + unless ($self->{_most_severe_consequence}) { + + my $highest; + + for my $allele (@{ $self->get_all_alternate_BaseVariationFeatureOverlapAlleles }) { + for my $cons (@{ $allele->get_all_OverlapConsequences }) { + $highest ||= $cons; + if ($cons->rank < $highest->rank) { + $highest = $cons; + } + } + } + + $self->{_most_severe_consequence} = $highest; + } + + return $self->{_most_severe_consequence}; +} + +=head2 display_consequence + + Arg [1] : (optional) String $term_type + Description: Get the term for the most severe OverlapConsequence of this + VariationFeatureOverlap. By default returns Ensembl display terms + (e.g. 'NON_SYNONYMOUS_CODING'). $term_type can also be 'label' + (e.g. 'Non-synonymous coding'), 'SO' (Sequence Ontology, e.g. + 'non_synonymous_codon') or 'NCBI' (e.g. 'missense') + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub display_consequence { + my $self = shift; + my $term_type = shift; + + my $method_name; + + # delete cached term + if(defined($term_type)) { + $method_name = $term_type.($term_type eq 'label' ? '' : '_term'); + $method_name = 'SO_term' unless @{$self->get_all_OverlapConsequences} && $self->get_all_OverlapConsequences->[0]->can($method_name); + } + + $method_name ||= 'SO_term'; + + my $worst_conseq = $self->most_severe_OverlapConsequence; + + return $worst_conseq ? $worst_conseq->$method_name : ''; +} + +sub adaptor { + my $self = shift; + $self->{adaptor} = shift if @_; + + # make adaptor an anonymous hash in its absence + # this allows the VEP to cache OverlapConsequences in offline mode + $self->{adaptor} ||= {}; + + return $self->{adaptor}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/BaseVariationFeatureOverlapAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/BaseVariationFeatureOverlapAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,279 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele; + + my $bvfoa = Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele->new( + -base_variation_feature_overlap => $bvfo, + -is_reference => 0, + ); + + print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $bvfoa->get_all_OverlapConsequences }), "\n"; + +=head1 DESCRIPTION + +A BaseVariationFeatureOverlapAllele object represents a single allele of a +BaseVariationFeatureOverlap. It is the super-class of variation feature specific +classes such as VariationFeatureOverlapAllele and StructuralVariationOverlapAllele +and contains methods not specific to any particular variation feature type. +Ordinarily you will not create these objects yourself, but instead you would +create one of the more specific subclasses. + +=cut + +package Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); +use Scalar::Util qw(weaken); + +=head2 new + + Arg [-BASE_VARIATION_FEATURE_OVERLAP] : + The Bio::EnsEMBL::BaseVariationFeatureOverlap with which this allele is + associated + + Arg [-IS_REFERENCE] : + A flag indicating if this allele is the reference allele or not + + Example : + my $bvfoa = Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele->new( + -base_variation_feature_overlap => $bvfo, + -is_reference => 0 + ); + + Description: Constructs a new BaseVariationFeatureOverlapAllele instance given a + BaseVariationFeatureOverlap and a flag indicating if this is the + reference allele + Returntype : A new Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele instance + Exceptions : throws unlessBASE_VARIATION_FEATURE_OVERLAP is supplied + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my ( + $base_variation_feature_overlap, + $is_reference + ) = rearrange([qw( + BASE_VARIATION_FEATURE_OVERLAP + IS_REFERENCE + )], @_); + + assert_ref($base_variation_feature_overlap, 'Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap'); + + my $self = bless { + base_variation_feature_overlap => $base_variation_feature_overlap, + is_reference => $is_reference, + }, $class; + + # avoid a memory leak, because the bvfo also has a reference to us + weaken $self->{base_variation_feature_overlap}; + + return $self; +} + +sub new_fast { + my ($class, $hashref) = @_; + my $self = bless $hashref, $class; + # avoid a memory leak, because the bvfo also has a reference to us + weaken $self->{base_variation_feature_overlap} if $self->{base_variation_feature_overlap}; + return $self; +} + +=head2 base_variation_feature_overlap + + Description: Get/set the associated BaseVariationFeatureOverlap + Returntype : Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub base_variation_feature_overlap { + my ($self, $bvfo) = @_; + + if ($bvfo) { + assert_ref($bvfo, 'Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap'); + $self->{base_variation_feature_overlap} = $bvfo; + # avoid a memory leak, because the bvfo also has a reference to us + weaken $self->{base_variation_feature_overlap}; + } + + return $self->{base_variation_feature_overlap}; +} + +=head2 base_variation_feature + + Description: Get the associated BaseVariationFeature + Returntype : Bio::EnsEMBL::Variation::BaseVariationFeature + Exceptions : none + Status : At Risk + +=cut + +sub base_variation_feature { + my $self = shift; + return $self->base_variation_feature_overlap->base_variation_feature(@_); +} + +=head2 feature + + Description: Get the associated Feature + Returntype : Bio::EnsEMBL::Feature (or relevant subclass) + Exceptions : none + Status : At Risk + +=cut + +sub feature { + my $self = shift; + return $self->base_variation_feature_overlap->feature(@_); +} + +=head2 is_reference + + Args [1] : A boolean value + Description: Get/set a flag indicating if this allele is the reference allele + Returntype : bool + Exceptions : none + Status : At Risk + +=cut + +sub is_reference { + my ($self, $is_reference) = @_; + $self->{is_reference} = $is_reference if defined $is_reference; + return $self->{is_reference}; +} + +=head2 get_all_OverlapConsequences + + Description: Get a list of all the OverlapConsequences of this allele, calculating them + on the fly if necessary + Returntype : listref of Bio::EnsEMBL::Variation::OverlapConsequence objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_OverlapConsequences { + my $self = shift; + + unless ($self->{overlap_consequences}) { + + # calculate consequences on the fly + + my $cons = []; + + my $assigned_tier; + + # loop over all the possible consequences + for my $oc (@{$self->get_sorted_OverlapConsequences}) { + + last if defined($assigned_tier) and $oc->tier > $assigned_tier; + + # check that this consequence applies to this type of variation feature + + if ($oc->variant_feature_class && $self->base_variation_feature->isa($oc->variant_feature_class)) { + + # check that this consequence applies to this type of feature + + if ($self->feature->isa($oc->feature_class)) { + + # if so, check if the predicate of this consequence holds for this bvfoa + my $check = $oc->predicate->($self); + + #print STDERR $self->base_variation_feature->variation_name." ".$oc->{SO_term}." ".$self->feature->stable_id. " $check\n"; + + if ($check) { + push @$cons, $oc; + $assigned_tier = $oc->tier; + } + } + } + } + + $self->{overlap_consequences} = $cons; + } + + return $self->{overlap_consequences}; +} + +=head2 add_OverlapConsequence + + Arg [1] : Bio::EnsEMBL::Variation::OverlapConsequence instance + Description: Add an OverlapConsequence to this allele's list + Returntype : none + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub add_OverlapConsequence { + my ($self, $oc) = @_; + assert_ref($oc, 'Bio::EnsEMBL::Variation::OverlapConsequence'); + $self->{overlap_consequences} ||= []; + push @{ $self->{overlap_consequences} }, $oc; +} + +sub SO_isa { + my ($self, $query) = @_; + + if (my $adap = $self->base_variation_feature_overlap->{adaptor}) { + if (my $ota = $adap->db->dnadb->get_OntologyTermAdaptor) { + my $term = $ota->fetch_by_accession(); + my @parents = $ota->fetch_by_child_term($term); + } + } + + for my $cons (@{ $self->get_all_OverlapConsequences }) { + if ($cons->SO_term eq $query) { + return 1; + } + } +} + +sub get_sorted_OverlapConsequences { + my $self = shift; + + if(!defined($self->base_variation_feature_overlap->adaptor->{sorted_cons})) { + my @sorted = sort {$a->tier <=> $b->tier} values %OVERLAP_CONSEQUENCES; + $self->base_variation_feature_overlap->adaptor->{sorted_cons} = \@sorted; + } + + return $self->base_variation_feature_overlap->adaptor->{sorted_cons}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/ConsequenceType.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ConsequenceType.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,578 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# EnsEMBL module for ConsequenceType +# Copyright EMBL-EBI/Sanger center 2005 +# +# +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::EnsEMBL::Variation::ConsequenceType + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + +Represents the effect of a Variation in a Transcript + +=cut + +package Bio::EnsEMBL::Variation::ConsequenceType; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(warning); + +#conversion of consequence type to bit value + +#there is a special type, SARA, that only applies to the effect of the Alleles, not Variations, and is equivalent +#to Same As Reference Allele, meaning that the Allele is the same as in reference sequence, so has no effect +#but it is not stored anywhere in the database and need no conversion at all +#when creating the VariationFeature object, thus the absence in the hash +our %CONSEQUENCE_TYPES = ( + 'ESSENTIAL_SPLICE_SITE' => 1, + 'STOP_GAINED' => 2, + 'STOP_LOST' => 4, + 'COMPLEX_INDEL' => 8, + 'FRAMESHIFT_CODING' => 16, + 'NON_SYNONYMOUS_CODING' => 32, + 'SPLICE_SITE' => 64, + 'PARTIAL_CODON' => 128, + 'SYNONYMOUS_CODING' => 256, + 'REGULATORY_REGION' => 512, + 'WITHIN_MATURE_miRNA' => 1024, + '5PRIME_UTR' => 2048, + '3PRIME_UTR' => 2094, + 'UTR' => 4096, + 'INTRONIC' => 8192, + 'NMD_TRANSCRIPT' => 16384, + 'WITHIN_NON_CODING_GENE' => 32768, + 'UPSTREAM' => 65536, + 'DOWNSTREAM' => 131072, + 'HGMD_MUTATION' => 262144, + 'NO_CONSEQUENCE' => 524288, + 'INTERGENIC' => 1048576, + '_' => 2097152, +); + + +our %CONSEQUENCE_DESCRIPTIONS = ( + 'ESSENTIAL_SPLICE_SITE' => 'In the first 2 or the last 2 basepairs of an intron', + 'STOP_GAINED' => 'In coding sequence, resulting in the gain of a stop codon', + 'STOP_LOST' => 'In coding sequence, resulting in the loss of a stop codon', + 'COMPLEX_INDEL' => 'Insertion or deletion that spans an exon/intron or coding sequence/UTR border', + 'FRAMESHIFT_CODING' => 'In coding sequence, resulting in a frameshift', + 'NON_SYNONYMOUS_CODING' => 'In coding sequence and results in an amino acid change in the encoded peptide sequence', + 'SPLICE_SITE' => '1-3 bps into an exon or 3-8 bps into an intron', + 'PARTIAL_CODON' => 'Located within the final, incomplete codon of a transcript whose end coordinate is unknown', + 'SYNONYMOUS_CODING' => 'In coding sequence, not resulting in an amino acid change (silent mutation)', + 'REGULATORY_REGION' => 'In regulatory region annotated by Ensembl', + 'WITHIN_MATURE_miRNA' => 'Located within a microRNA', + '5PRIME_UTR' => 'In 5 prime untranslated region', + '3PRIME_UTR' => 'In 3 prime untranslated region', + 'INTRONIC' => 'In intron', + 'NMD_TRANSCRIPT' => 'Located within a transcript predicted to undergo nonsense-mediated decay', + 'WITHIN_NON_CODING_GENE' => 'Located within a gene that does not code for a protein', + 'UPSTREAM' => 'Within 5 kb upstream of the 5 prime end of a transcript', + 'DOWNSTREAM' => 'Within 5 kb downstream of the 3 prime end of a transcript', + 'HGMD_MUTATION' => 'Mutation from the HGMD database - consequence unknown', + 'INTERGENIC' => 'More than 5 kb either upstream or downstream of a transcript', +); + + +our %CONSEQUENCE_LABELS = ( + 'ESSENTIAL_SPLICE_SITE' => 'Essential splice site', + 'STOP_GAINED' => 'Stop gained', + 'STOP_LOST' => 'Stop lost', + 'COMPLEX_INDEL' => 'Complex in/del', + 'FRAMESHIFT_CODING' => 'Frameshift coding', + 'NON_SYNONYMOUS_CODING' => 'Non-synonymous coding', + 'SPLICE_SITE' => 'Splice site', + 'PARTIAL_CODON' => 'Partial codon', + 'SYNONYMOUS_CODING' => 'Synonymous coding', + 'REGULATORY_REGION' => 'Regulatory region', + 'WITHIN_MATURE_miRNA' => 'Within mature miRNA', + '5PRIME_UTR' => '5 prime UTR', + '3PRIME_UTR' => '3 prime UTR', + 'INTRONIC' => 'Intronic', + 'NMD_TRANSCRIPT' => 'NMD transcript', + 'WITHIN_NON_CODING_GENE' => 'Within non-coding gene', + 'UPSTREAM' => 'Upstream', + 'DOWNSTREAM' => 'Downstream', + 'HGMD_MUTATION' => 'HGMD mutation', + 'INTERGENIC' => 'Intergenic', +); + +# hash storing whether consequence affects peptide sequence +our %AFFECTS_PEPTIDE = ( + 'STOP_GAINED' => 1, + 'STOP_LOST' => 1, + 'COMPLEX_INDEL' => 1, + 'FRAMESHIFT_CODING' => 1, + 'NON_SYNONYMOUS_CODING' => 1, + 'PARTIAL_CODON' => 1, +); + + +=head2 new + + Arg [1] : (optional) int $transcript_id + Arg [2] : (optional) int $variation_feature_id + Arg [2] : (optional) int $start + Arg [3] : (optional) int $end + Arg [4] : (optional) int $strand + Arg [5] : (optional) refarray $alleles + Example : $synonym = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript_id,$variation_feature_id,$start,$end,$strand,['A','C']); + Description: Creates a new ConsequenceType + Returntype : Bio::EnsEMBL::Variation::ConsequenceType + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my ($caller, $transcript_id, $variation_feature_id, $start, $end, $strand, $alleles) = @_; + + my $class = ref($caller) || $caller; + + return bless( {'transcript_id' => $transcript_id, + 'variation_feature_id' => $variation_feature_id, + 'alleles' => $alleles, + 'start' => $start, + 'end' => $end, + 'strand' => $strand}, $class ); +} + + +=head2 transcript_id + + Arg [1] : (optional) int $transcript_id + Example : $transcript_id = $consequence_type->transcript_id; + Description: Getter/Setter for the internal id of the transcript_id calculated + the effect of the Variation + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub transcript_id { + my $self = shift; + + if(@_) { + $self->{'transcript_id'} = shift; + } + + return $self->{'transcript_id'}; +} + + +=head2 variation_feature_id + + Arg [1] : (optional) int $variation_feature_id + Example : $variation_feature_id = $consequence_type->variation_feature_id; + Description: Getter/Setter for the variation_feature affecting the transcript + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub variation_feature_id { + my $self = shift; + + if(@_) { + $self->{'variation_feature_id'} = shift; + } + + return $self->{'variation_feature_id'}; +} + +=head2 alleles + + Arg [1] : (optional) array ref $alleles + Example : @alleles = @{$consequence_type->alleles}; + Description: Getter/Setter for the alleles for the variation + Returntype : reference to array + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub alleles { + my $self = shift; + + if(@_) { + $self->{'alleles'} = shift; + } + + return $self->{'alleles'}; +} + +=head2 start + + Arg [1] : (optional) int $start + Example : $start = $consequence_type->start + Description: Getter/Setter for the start of the variation in the sequence + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub start { + my $self = shift; + + if(@_) { + $self->{'start'} = shift; + } + + return $self->{'start'} +} + + +=head2 end + + Arg [1] : (optional) int $end + Example : $end = $consequence_type->end + Description: Getter/Setter for the end of the variation in the sequence + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub end { + my $self = shift; + + if(@_) { + $self->{'end'} = shift; + } + + return $self->{'end'} +} + + +=head2 strand + + Arg [1] : (optional) int $strand + Example : $strand = $consequence_type->strand + Description: Getter/Setter for the strand of the variation in the sequence + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub strand { + my $self = shift; + + if(@_) { + $self->{'strand'} = shift; + } + + return $self->{'strand'} +} + + +=head2 aa_start + + Arg [1] : (optional) int $aa_start + Example : $aa_start = $consequence_type->aa_start + Description: Getter/Setter for the start of the aa in peptide coordinates + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub aa_start { + my $self = shift; + + if(@_) { + $self->{'aa_start'} = shift; + } + + return $self->{'aa_start'} +} + +=head2 aa_end + + Arg [1] : (optional) int $aa_end + Example : $aa_end = $consequence_type->aa_end + Description: Getter/Setter for the end of the aa in peptide coordinates + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub aa_end { + my $self = shift; + + if(@_) { + $self->{'aa_end'} = shift; + } + + return $self->{'aa_end'} +} + +=head2 cdna_start + + Arg [1] : (optional) int $cdna_start + Example : $cdna_start = $consequence_type->cdna_start + Description: Getter/Setter for the start of the variation in the cdna coordinates + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub cdna_start { + my $self = shift; + + if(@_) { + $self->{'cdna_start'} = shift; + } + + return $self->{'cdna_start'} +} + +=head2 cdna_end + + Arg [1] : (optional) int $cdna_end + Example : $cdna_end = $consequence_type->cdna_end + Description: Getter/Setter for the end of the variation in cdna coordinates + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub cdna_end { + my $self = shift; + + if(@_) { + $self->{'cdna_end'} = shift; + } + + return $self->{'cdna_end'} +} + + +=head2 type + + Arg [1] : string $type + (possible types 'FRAMESHIFT_CODING','STOP_GAINED','STOP_LOST','NON_SYNONYMOUS_CODING', + 'SYNONYMOUS_CODING','REGULATORY_REGION','WITHIN_MATURE_miRNA','5PRIME_UTR','3PRIME_UTR','INTRONIC','UPSTREAM','DOWNSTREAM','WITHIN_NON_CODING_GENE','INTERGENIC', 'SARA') + Example : $consequence_type = $consequence_type->type + Description: Getter/Setter for consequence type of the variation in the transcript + Returntype : none + Exceptions : warning if the consequence type is not recognised + Caller : general + Status : At Risk + +=cut + +sub type { + my $self = shift; + + if(@_) { + my $type = shift; + #there is a special type, SARA, that only applies to the effect of the Alleles, and is equivalent + #to Same As Reference Allele, which is not stored anywhere in the database and need no conversion at all + #when creating the VariationFeature object, thus the absence in the hash + if (defined $CONSEQUENCE_TYPES{$type} || $type eq 'SARA'){ + push @{$self->{'type'}}, $type; + } + else{ + warning("Trying to set the consequence type to a not valid value. Possible values: ",keys %CONSEQUENCE_TYPES,"\n"); + } + } + return $self->{'type'} +} + + +=head2 aa_alleles + + Arg [1] : (optional) string $aa_alleles + Example : $aa_alleles = $consequence_type->aa_alleles + Description: Getter/Setter for the aa that changes in the transcript + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub aa_alleles { + my $self = shift; + + if(@_) { + $self->{'aa_alleles'} = shift; + } + + return $self->{'aa_alleles'} +} + + +=head2 codon + + Arg [1] : (optional) string $codon + Example : $codon = $consequence_type->codon + Description: Getter/Setter for the codon affected by that Allele in the transcript + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub codon { + my $self = shift; + + if(@_) { + $self->{'codon'} = shift; + } + + return $self->{'codon'} +} + + +=head2 codons + + Arg [1] : (optional) string $codons + Example : $codons = $consequence_type->codons + Description: Getter/Setter for the possible codons + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub codons { + my $self = shift; + + if(@_) { + $self->{'codons'} = shift; + } + + return $self->{'codons'} +} + +=head2 cds_start + + Arg [1] : (optional) int $cds_start + Example : $cds_start = $consequence_type->cds_start + Description: Getter/Setter for the start of the variation in the coding sequence + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub cds_start { + my $self = shift; + + if(@_) { + $self->{'cds_start'} = shift; + } + + return $self->{'cds_start'} +} + +=head2 cds_end + + Arg [1] : (optional) int $cds_end + Example : $cds_end = $consequence_type->cds_end + Description: Getter/Setter for the end of the variation in the coding sequence + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub cds_end { + my $self = shift; + + if(@_) { + $self->{'cds_end'} = shift; + } + + return $self->{'cds_end'} +} + +=head2 display_consequence + + Arg [1] : (optional) string $consequence_type + Example : $display_consequence = $ct->display_consequence(); + Description: Getter for the consequence type to display, + when more than one + Returntype : string + Exceptions : throw on incorrect argument + Caller : webteam + Status : At Risk + +=cut + +sub display_consequence{ + my $self = shift; + + my $highest_priority; + #get the value to display from the consequence_type attribute + $highest_priority = 'INTERGENIC'; + foreach my $ct (@{$self->type}){ + if ($CONSEQUENCE_TYPES{$ct} < $CONSEQUENCE_TYPES{$highest_priority}){ + $highest_priority = $ct; + } + } + + return $highest_priority; +} + +sub empty_type{ + my $self = shift; + + $self->{'type'} = (); + return $self->type; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/AlleleAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/AlleleAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,492 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $va = $reg->get_adaptor("human","variation","allele"); + + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for Allele objects. +Alleles may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); + +use Bio::EnsEMBL::Variation::Allele; +use Bio::EnsEMBL::Utils::Iterator; + +use DBI qw(:sql_types); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + +my $DEFAULT_ITERATOR_CACHE_SIZE = 1000; + + +sub store { + my ($self, $allele) = @_; + + my $dbh = $self->dbc->db_handle; + + # look up or store allele code + my $allele_code = $self->_allele_code($allele->allele); + + my $sth = $dbh->prepare_cached(q{ + INSERT DELAYED INTO allele ( + variation_id, + subsnp_id, + allele_code_id, + sample_id, + frequency, + count + ) VALUES (?,?,?,?,?,?) + }); + + $sth->execute( + $allele->{_variation_id} || $allele->variation->dbID, + $allele->{subsnp}, + $allele_code, + $allele->population ? $allele->population->dbID : undef, + $allele->frequency, + $allele->count + ); + + $sth->finish; +} + +sub store_multiple { + my ($self, $alleles) = @_; + + my $dbh = $self->dbc->db_handle; + + my $q_string = join ",", map {'(?,?,?,?,?,?)'} @$alleles; + + my @args = map { + $_->{_variation_id} || $_->variation->dbID, + $_->{subsnp}, + $self->_allele_code($_->allele), + $_->population ? $_->population->dbID : undef, + $_->frequency, + $_->count + } @$alleles; + + my $sth = $dbh->prepare_cached(qq{ + INSERT INTO allele ( + variation_id, + subsnp_id, + allele_code_id, + sample_id, + frequency, + count + ) VALUES $q_string + }); + + $sth->execute(@args); + + $sth->finish; +} + +sub store_to_file_handle { + my ($self, $allele, $file_handle) = @_; + + my $dbh = $self->dbc->db_handle; + + print $file_handle join("\t", + $allele->{_variation_id} || $allele->variation->dbID || '\N', + $allele->{subsnp} || '\N', + $self->_allele_code($allele->allele), + $allele->population ? $allele->population->dbID : '\N', + defined($allele->frequency) ? $allele->frequency : '\N', + defined($allele->count) ? $allele->count : '\N', + )."\n"; +} + +=head2 fetch_all + + Description: fetch_all should not be used for Alleles. + Exceptions : thrown on invocation + Status : At risk + +=cut + +sub fetch_all { + my $self = shift; + + throw("fetch_all cannot be used for Allele objects"); +} + +=head2 fetch_all_by_subsnp_id + + Arg [1] : string $subsnp_id + Example : $alleles = $allele_adaptor->fetch_all_by_subsnp_id('ss123'); + Description: Retrieves all allele objects via a component subsnp ID + Returntype : listref of Bio::EnsEMBL::Variation::Allele objects + Exceptions : throw if name argument is not defined + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_subsnp_id { + my $self = shift; + my $name = shift; + + $name =~ s/^ss//gi; + + throw('name argument expected') if(!defined($name)); + + # Add the constraint on the subsnp_id column and pass to generic_fetch + my $constraint = qq{ a.subsnp_id = $name }; + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation::Variation + Arg [2] : Bio::EnsEMBL::Variation::Population (optional) + Example : @alleles = @{$allele_adaptor->fetch_all_by_Variation($var)}; + Description: Retrieves all alleles which are associated with a specified + variation. If the optional population argument is specified, only retrieve + alleles for that population. + Returntype : listref of Bio::EnsEMBL::Variation::Allele + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Variation { + my $self = shift; + my $variation = shift; + my $population = shift; + + # Make sure that we are passed a Variation object + assert_ref($variation,'Bio::EnsEMBL::Variation::Variation'); + + # If we got a population argument, make sure that it is a Population object + assert_ref($population,'Bio::EnsEMBL::Variation::Population') if (defined($population)); + + # Add a constraint on the variation_id column and pass to generic fetch + my $variation_id = $variation->dbID(); + my $constraint = qq{ a.variation_id = $variation_id }; + + # If required, add a constraint on the sample id + if (defined($population)) { + my $sample_id = $population->dbID(); + $constraint .= qq{ AND a.sample_id = $sample_id }; + } + + # Add the constraint for failed alleles + $constraint .= " AND " . $self->db->_exclude_failed_alleles_constraint(); + + my $alleles = $self->generic_fetch($constraint); + + # If a population was specified, attach the population to the allele object + map {$_->population($population)} @{$alleles} if (defined($population)); + + # Return the alleles + return $alleles; +} + +=head2 get_all_failed_descriptions + + Arg[1] : Bio::EnsEMBL::Variation::Allele + The allele object to get the failed descriptions for + Example : + my $failed_descriptions = $adaptor->get_all_failed_descriptions($allele); + if (scalar(@{$failed_descriptions})) { + print "The allele '" . $allele->allele() . "' has been flagged as failed because '" . join("' and '",@{$failed_descriptions}) . "'\n"; + } + + Description : Gets the unique descriptions for the reasons why the supplied allele has failed. + ReturnType : reference to a list of strings + Exceptions : thrown on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub get_all_failed_descriptions { + my $self = shift; + my $allele = shift; + + # Call the internal get method without any constraints + my $description = $self->_internal_get_failed_descriptions($allele) || []; + + return $description; +} + +=head2 get_subsnp_handle + + Arg[1] : Bio::EnsEMBL::Variation::Allele + The allele object to get the subsnp handle for + Example : + my $handle = $adaptor->get_subsnp_handle($allele); + print "The allele '" . $allele->allele() . "' of subsnp 'ss" . $allele->subsnp_id() . "' was submitted by '$handle'\n"; + + Description : Gets the submitter handle for the specified allele + ReturnType : string + Exceptions : thrown on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub get_subsnp_handle { + my $self = shift; + my $allele = shift; + + # Assert that the object passed is an Allele + assert_ref($allele,'Bio::EnsEMBL::Variation::Allele'); + + # Get the subsnp id and get rid of any 'ss' prefix + my $ssid = $allele->subsnp() || ""; + $ssid =~ s/^ss//; + + my $stmt = qq{ + SELECT + handle + FROM + subsnp_handle + WHERE + subsnp_id = ? + LIMIT 1 + }; + my $sth = $self->prepare($stmt); + $sth->execute($ssid); + my $handle; + $sth->bind_columns(\$handle); + $sth->fetch(); + + return $handle; +} + + +# API-internal method for getting failed descriptions for an Allele +sub _internal_get_failed_descriptions { + my $self = shift; + my $allele = shift; + my $constraint = shift; + + # Assert that the object passed is an Allele + assert_ref($allele,'Bio::EnsEMBL::Variation::Allele'); + + my $stmt = qq{ + SELECT DISTINCT + fd.description + FROM + failed_allele fa JOIN + failed_description fd ON ( + fd.failed_description_id = fa.failed_description_id + ) + WHERE + fa.allele_id = ? + }; + $stmt .= qq{ AND $constraint } if (defined($constraint)); + + my $sth = $self->prepare($stmt); + $sth->execute($allele->dbID()); + my @descriptions; + my $description; + $sth->bind_columns(\$description); + while ($sth->fetch()) { + push(@descriptions,$description); + } + return \@descriptions; +} + +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my ($allele_id, $variation_id, $subsnp_id, $allele, $frequency, $sample_id, $count, $last_allele_id); + my @alleles; + + $sth->bind_columns(\$allele_id, \$variation_id, \$subsnp_id, \$allele, \$frequency, \$sample_id, \$count); + + while($sth->fetch()) { + + # The left join to failed allele can create duplicate rows, so check that we've got a new Allele before creating the object + unless (defined($last_allele_id) && $last_allele_id == $allele_id) { + + my $obj = Bio::EnsEMBL::Variation::Allele->new( + -dbID => $allele_id, + -VARIATION_ID => $variation_id, + -SUBSNP => $subsnp_id, + -ALLELE => $allele, + -FREQUENCY => $frequency, + -POPULATION_ID => $sample_id, + -COUNT => $count, + -ADAPTOR => $self + ); + + push(@alleles,$obj); + $last_allele_id = $allele_id; + } + + } + + return \@alleles; +} + +# method used by superclass to construct SQL +sub _tables { + my $self = shift; + + my @tables = ( + ['allele', 'a'], ['allele_code', 'ac'] + ); + + # If we are excluding failed_alleles, add that table + push(@tables,['failed_allele', 'fa']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +# Add a left join to the failed_variation table +sub _left_join { + my $self = shift; + + # If we are including failed variations, skip the left join + return () if ($self->db->include_failed_variations()); + return ([ 'failed_allele', 'fa.allele_id = a.allele_id']); +} + +sub _columns { + return qw( a.allele_id a.variation_id a.subsnp_id ac.allele a.frequency a.sample_id a.count ); +} + +sub _write_columns { + return qw(variation_id subsnp_id allele_code_id sample_id frequency count); +} + +sub _default_where_clause { + return 'a.allele_code_id = ac.allele_code_id'; +} + +sub _cache_allele_codes { + my $self = shift; + + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare(qq{SELECT allele_code_id, allele FROM allele_code}); + $sth->execute; + + my ($code, $allele); + $sth->bind_columns(\$code, \$allele); + my %allele_codes; + $allele_codes{defined($allele) ? $allele : ''} = $code while $sth->fetch; + $sth->finish(); + + $self->db->{_allele_codes} = \%allele_codes; + + return $self->db->{_allele_codes}; +} + +sub _allele_code { + my ($self, $allele) = @_; + + # check if cache is loaded + my $just_loaded = 0; + + if(!exists($self->db->{_allele_codes})) { + $self->_cache_allele_codes; + $just_loaded = 1; + } + + if(!exists($self->db->{_allele_codes}->{$allele})) { + + # check another process hasn't created it by reloading the cache + $self->_cache_allele_codes unless $just_loaded; + + # if it still doesn't exist + if(!exists($self->db->{_allele_codes}->{$allele})) { + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare(q{ + INSERT INTO allele_code ( + allele + ) + VALUES (?) + }); + eval { + $sth->execute($allele); + }; + $sth->finish; + + my $allele_code; + + # insert failed, another process did it maybe? + if($@) { + my $sth2 = $dbh->prepare(qq{ + SELECT allele_code_id + FROM allele_code + WHERE allele = ? + }); + $sth2->execute($allele); + $sth2->bind_columns(\$allele_code); + $sth2->fetch(); + + throw("ERROR: Failed to insert allele '$allele' into allele_code") unless defined($allele_code); + + $sth2->finish(); + } + else { + $allele_code = $dbh->last_insert_id(undef, undef, 'allele_code', 'allele_code_id'); + } + + $self->db->{_allele_codes}->{$allele} = $allele_code; + } + } + + return $self->db->{_allele_codes}->{$allele}; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/AlleleFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/AlleleFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,371 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $afa = $reg->get_adaptor("human","variation","allelefeature"); + $sa = $reg->get_adaptor("human","core","slice"); + + # Get a VariationFeature by its internal identifier + $af = $afa->fetch_by_dbID(145); + + # get all AlleleFeatures in a region + $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); + foreach $af (@{$afa->fetch_all_by_Slice($slice)}) { + print $af->start(), '-', $af->end(), ' ', $af->allele(), "\n"; + } + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for AlleleFeature objects. +Genomic locations of alleles in samples can be obtained from the +database using this adaptor. See the base class BaseFeatureAdaptor for more information. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor; + +use Bio::EnsEMBL::Variation::AlleleFeature; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Sequence qw(expand); +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor', 'Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); + + +=head2 fetch_all_by_Slice + + Arg[0] : Bio::EnsEMBL::Slice $slice + Arg[1] : (optional) Bio::EnsEMBL::Variation::Individual $individual + Example : my $vf = $vfa->fetch_all_by_Slice($slice,$individual); + Description : Gets all the VariationFeatures in a certain Slice for a given + Individual (optional). Individual must be a designated strain. + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : thrown on bad arguments + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice{ + my $self = shift; + my $slice = shift; + my $individual = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + if (defined $individual){ + if(!ref($individual) || !$individual->isa('Bio::EnsEMBL::Variation::Individual')) { + throw('Bio::EnsEMBL::Variation::Individual arg expected'); + } + if(!defined($individual->dbID())) { + throw("Individual arg must have defined dbID"); + } + } + + %{$self->{'_slice_feature_cache'}} = (); #clean the cache to avoid caching problems + + my $genotype_adaptor = $self->db->get_IndividualGenotypeFeatureAdaptor; #get genotype adaptor + my $genotypes = $genotype_adaptor->fetch_all_by_Slice($slice, $individual); #and get all genotype data + my $afs = $self->SUPER::fetch_all_by_Slice_constraint($slice, $self->db->_exclude_failed_variations_constraint()); #get all AlleleFeatures within the Slice + my @new_afs = (); + + # merge AlleleFeatures with genotypes + foreach my $af (@{$afs}){ + + # get the variation ID of this AF + my $af_variation_id = $af->{_variation_id} || $af->variation->dbID; + + # get all genotypes that have this var id + foreach my $gt(grep {$_->{_variation_id} == $af_variation_id} @$genotypes) { + + # create a clone of the AF + my $new_af = { %$af }; + bless $new_af, ref $af; + + # add the genotype + $new_af->allele_string($gt->ambiguity_code); + + # add the individual + $new_af->individual($gt->individual); + + push @new_afs, $new_af; + } + } + + return \@new_afs; +} + +sub _tables{ + my $self = shift; + + my @tables = ( + ['variation_feature', 'vf'], + ['source', 's FORCE INDEX(PRIMARY)'] + ); + + # if we are including failed_variations, add that table + push(@tables,['failed_variation', 'fv']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +#ÊAdd a left join to the failed_variation table +sub _left_join { + my $self = shift; + + # If we are including failed variations, skip the left join + return () if ($self->db->include_failed_variations()); + return ([ 'failed_variation', 'fv.variation_id = vf.variation_id']); +} + +sub _columns{ + my $self = shift; + + return qw(vf.variation_id + vf.seq_region_id vf.seq_region_start vf.seq_region_end + vf.seq_region_strand vf.variation_name s.name vf.variation_feature_id vf.allele_string vf.consequence_types); +} + +sub _default_where_clause{ + my $self = shift; + return "vf.source_id = s.source_id"; +} + +sub _objs_from_sth{ + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->dnadb()->get_SliceAdaptor(); + + my @features; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ( + $variation_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $variation_name, $source_name, + $variation_feature_id, $allele_string, $cons, $last_vf_id + ); + + $sth->bind_columns( + \$variation_id, \$seq_region_id, \$seq_region_start, \$seq_region_end, + \$seq_region_strand, \$variation_name, \$source_name, + \$variation_feature_id, \$allele_string, \$cons + ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + } + + FEATURE: while($sth->fetch()) { + + next if (defined($last_vf_id) && $last_vf_id == $variation_feature_id); + $last_vf_id = $variation_feature_id; + + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + ($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) = + $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs); + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($sr_name)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||= + $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef,$cmp_cs_vers); + } + else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= + $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } + + else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { + next FEATURE; + } + } + $slice = $dest_slice; + } + + my $overlap_consequences = [ map { $OVERLAP_CONSEQUENCES{$_} } split /,/, $cons ]; + + push @features, Bio::EnsEMBL::Variation::AlleleFeature->new_fast({ + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'allele_string' => '', + 'overlap_consequences' => $overlap_consequences, + 'variation_name' => $variation_name, + 'adaptor' => $self, + 'source' => $source_name, + '_variation_id' => $variation_id, + '_variation_feature_id' => $variation_feature_id, + '_vf_allele_string' => $allele_string, + '_sample_id' => '' + }); + } + + return\@features; +} + +=head2 get_all_synonym_sources + + Args[1] : Bio::EnsEMBL::Variation::AlleleFeature vf + Example : my @sources = @{$af_adaptor->get_all_synonym_sources($af)}; + Description : returns a list of all the sources for synonyms of this + AlleleFeature + ReturnType : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + : Variation database is under development. +=cut + +sub get_all_synonym_sources{ + my $self = shift; + my $af = shift; + my %sources; + my @sources; + + if(!ref($af) || !$af->isa('Bio::EnsEMBL::Variation::AlleleFeature')) { + throw("Bio::EnsEMBL::Variation::AlleleFeature argument expected"); + } + + if (!defined($af->{'_variation_id'}) && !defined($af->{'variation'})){ + warning("Not possible to get synonym sources for the AlleleFeature: you need to attach a Variation first"); + return \@sources; + } + #get the variation_id + my $variation_id; + if (defined ($af->{'_variation_id'})){ + $variation_id = $af->{'_variation_id'}; + } + else{ + $variation_id = $af->variation->dbID(); + } + #and go to the varyation_synonym table to get the extra sources + my $source_name; + my $sth = $self->prepare(qq{SELECT s.name + FROM variation_synonym vs, source s + WHERE s.source_id = vs.source_id + AND vs.variation_id = ? + }); + $sth->bind_param(1,$variation_id,SQL_INTEGER); + $sth->execute(); + $sth->bind_columns(\$source_name); + while ($sth->fetch){ + $sources{$source_name}++; + } + @sources = keys(%sources); + return \@sources; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/AttributeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/AttributeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,150 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::AttributeAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use Bio::EnsEMBL::Variation::OverlapConsequence; + +use base qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +sub attrib_value_for_id { + my ($self, $attrib_id) = @_; + + unless ($self->{attribs}) { + + my $attribs; + my $attrib_ids; + + my $sql = qq{ + SELECT a.attrib_id, t.code, a.value + FROM attrib a, attrib_type t + WHERE a.attrib_type_id = t.attrib_type_id + }; + + my $sth = $self->prepare($sql); + + $sth->execute; + + while (my ($attrib_id, $type, $value) = $sth->fetchrow_array) { + $attribs->{$attrib_id}->{type} = $type; + $attribs->{$attrib_id}->{value} = $value; + $attrib_ids->{$type}->{$value} = $attrib_id; + } + + $self->{attribs} = $attribs; + $self->{attrib_ids} = $attrib_ids; + } + + return defined $attrib_id ? + $self->{attribs}->{$attrib_id}->{value} : + undef; +} + +sub attrib_id_for_type_value { + my ($self, $type, $value) = @_; + + unless ($self->{attrib_ids}) { + # call this method to populate the attrib hash + $self->attrib_value_for_id; + } + + return $self->{attrib_ids}->{$type}->{$value}; +} + +sub display_term_for_SO_term { + my ($self, $SO_term) = @_; + return $self->_SO_mappings->{SO_terms}->{$SO_term}->{display_term}; +} + +sub SO_accession_for_SO_term { + my ($self, $SO_term) = @_; + return $self->_SO_mappings->{SO_terms}->{$SO_term}->{SO_accession}; +} + +sub SO_term_for_SO_accession { + my ($self, $SO_accession) = @_; + return $self->_SO_mappings->{SO_accessions}->{$SO_accession}->{SO_term}; +} + +sub display_term_for_SO_accession { + my ($self, $SO_accession) = @_; + return $self->_SO_mappings->{SO_accessions}->{$SO_accession}->{display_term}; +} + +sub _SO_mappings { + my ($self) = @_; + + unless ($self->{SO_mappings}) { + my $mapping; + + for my $set (@{ $self->_fetch_sets_by_type('SO_term') }) { + + my $term_map = $mapping->{SO_terms}->{$set->{SO_term}} ||= {}; + $term_map->{display_term} = $set->{display_term} || $set->{SO_term}; + $term_map->{SO_accession} = $set->{SO_accession}; + + my $acc_map = $mapping->{SO_accessions}->{$set->{SO_accession}} ||= {}; + $acc_map->{SO_term} = $set->{SO_term}; + $acc_map->{display_term} = $set->{display_term} || $set->{SO_term}; + } + + $self->{SO_mappings} = $mapping + + } + + return $self->{SO_mappings}; +} + +sub _fetch_sets_by_type { + my ($self, $type) = @_; + + my $sql = qq{ + SELECT s.attrib_set_id, t.code, a.value + FROM attrib a, attrib_type t, attrib_set s + WHERE t.attrib_type_id = a.attrib_type_id + AND a.attrib_id = s.attrib_id + AND s.attrib_set_id IN ( + SELECT s.attrib_set_id + FROM attrib a, attrib_type t, attrib_set s + WHERE a.attrib_type_id = t.attrib_type_id + AND a.attrib_id = s.attrib_id + AND t.code = ? + ) + }; + + my $sth = $self->prepare($sql); + + $sth->execute($type); + + my $sets; + + while (my ($set_id, $type, $value) = $sth->fetchrow_array) { + $sets->{$set_id}->{$type} = $value; + } + + return [values %$sets]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/BaseAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/BaseAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,333 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor +# +# Copyright (c) 2010 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor + +=head1 SYNOPSIS + +Abstract class - should not be instantiated. Implementation of +abstract methods must be performed by subclasses. + +=head1 DESCRIPTION + +This adaptor provides generic database connectivity for various Variation objects. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); + +sub AttributeAdaptor { + my $self = shift; + + unless ($self->{_attribute_adaptor}) { + $self->{_attribute_adaptor} = $self->db->get_AttributeAdaptor if $self->db; + } + + return $self->{_attribute_adaptor}; +} + +sub _consequence_type_map { + + # return a hash mapping between the string terms of a mysql set and + # the equivalent numerical values + + my ($self, $table, $column) = @_; + + my $key = $table.'_'.$column.'_map'; + + unless ($self->{$key}) { + + my $map_sth = $self->prepare(qq{SHOW COLUMNS FROM $table LIKE '$column'}); + + $map_sth->execute; + + my $types = $map_sth->fetchrow_hashref->{Type}; + + # Type will look like set('type1','type2'), so tidy it up a bit before splitting + + $types =~ s/set\(//; + $types =~ s/\)//; + $types =~ s/'//g; + + my $map; + + my $pow = 0; + + # mysql assigns the set values in consecutive powers of 2, so so shall we + + for my $type (split /,/, $types) { + $map->{$type} = 2**$pow++; + } + + $self->{$key} = $map; + } + + return $self->{$key}; +} + +sub _get_consequence_constraint { + + my ($self, $table, $query_terms, $without_children, $term_subset) = @_; + + # we build up the numerical value for our query by ORing together all the children of all the terms + my $query = 0; + + # get a hash mapping consequence terms to numerical values (specifically powers of 2) + my $cons_map = $self->_consequence_type_map($table, 'consequence_types'); + + for my $query_term (@$query_terms) { + + # we allow either an ontology term object, or just a string + $query_term = UNIVERSAL::can($query_term, 'name') ? $query_term->name : $query_term; + + # we store only the most specific consequence term, so we need to get all children of + # each query term + my $terms = $without_children ? [ ($self->_get_term_object($query_term)) ] : $self->_get_child_terms($query_term); + + # and then we OR together all relevant terms + + for my $term (@$terms) { + next unless $cons_map->{$term->name}; + $query |= $cons_map->{$term->name}; + } + } + + my $subset_mask; + if ($term_subset) { + for my $query_term (@$term_subset) { + + # we allow either an ontology term object, or just a string + $query_term = UNIVERSAL::can($query_term, 'name') ? $query_term->name : $query_term; + + my $terms = [ ($self->_get_term_object($query_term)) ]; + + # and then we OR together all relevant terms + + for my $term (@$terms) { + next unless $cons_map->{$term->name}; + $subset_mask |= $cons_map->{$term->name}; + } + } + } + + unless ($self->{_possible_consequences}) { + + # we need a list of the numerical values of all possible + # consequence term combinations we have actually observed + + my $sth = $self->dbc->prepare(qq{ + SELECT DISTINCT(consequence_types) + FROM $table + }); + + $sth->execute; + + my $cons; + + $sth->bind_columns(\$cons); + + my @poss_cons; + + while ($sth->fetch) { + # construct the numerical value by ORing together each combination + # (this is much quicker than SELECTing consequence_types+0 above which + # is what I used to do, but this seems to mean the db can't use the index) + + my $bit_val = 0; + + for my $term (split /,/, $cons) { + $bit_val |= $cons_map->{$term}; + } + + push @poss_cons, $bit_val; + } + + $self->{_possible_consequences} = \@poss_cons; + } + + # we now find all combinations that include our query by ANDing + # the query with all possible combinations and combine these into + # our query string + + #my $id_str = join ',', grep { $_ & $query } @{ $self->{_possible_consequences} }; + my @cons_vals = grep { $_ & $query } @{ $self->{_possible_consequences} }; + + if ($subset_mask) { + # When only including a subset of types, filter combinations to ones which + # include at least one of the the subset types. + @cons_vals = grep { $_ & $subset_mask } @cons_vals; + } + + if (!scalar(@cons_vals)) { + return undef; + } + + my $id_str = join ',', @cons_vals; + + my $constraint = "consequence_types IN ($id_str)"; + + return $constraint; +} + +sub _consequences_for_set_number { + my ($self, $set_number, $map) = @_; + + my @consequences; + + for my $term (keys %$map) { + if ($set_number & $map->{$term}) { + push @consequences, $OVERLAP_CONSEQUENCES{$term}; + } + } + + return \@consequences; +} + +sub _get_term_object { + my ($self, $term) = @_; + + my $oa = $self->{_ontology_adaptor} ||= + Bio::EnsEMBL::Registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' ); + + my $terms = $oa->fetch_all_by_name($term, 'SO'); + + if (@$terms > 1) { + warn "Ambiguous term '$term', just using first result"; + } + elsif (@$terms == 0) { + warn "Didn't find an ontology term for '$term'"; + } + + return $terms->[0]; +} + +sub _get_child_terms { + my ($self, $parent_term) = @_; + + my $parent_obj = $self->_get_term_object($parent_term); + + my $all_terms = $parent_obj->descendants; + + unshift @$all_terms, $parent_obj; + + return $all_terms; +} + +sub _get_parent_terms { + my ($self, $child_term) = @_; + + my $child_obj = $self->_get_term_object($child_term); + + my $all_terms = $child_obj->ancestors; + + unshift @$all_terms, $child_obj; + + return $all_terms; +} + +sub _set_number_for_consequences { + my ($self, $cons_list, $map) = @_; + + my $val = 0; + + for my $cons (@$cons_list) { + $val |= $map->{$cons->SO_term}; + } + + return $val; +} + +sub _transcript_variation_consequences_for_set_number { + my ($self, $set_number) = @_; + my $map = $self->_consequence_type_map('transcript_variation', 'consequence_types'); + return $self->_consequences_for_set_number($set_number, $map); +} + +sub _variation_feature_consequences_for_set_number { + my ($self, $set_number) = @_; + my $map = $self->_consequence_type_map('variation_feature', 'consequence_types'); + return $self->_consequences_for_set_number($set_number, $map); +} + +sub _transcript_variation_set_number_for_consequences { + my ($self, $cons) = @_; + my $map = $self->_consequence_type_map('transcript_variation', 'consequence_types'); + return $self->_set_number_for_consequences($cons, $map); +} + +sub _variation_feature_set_number_for_consequences { + my ($self, $cons) = @_; + my $map = $self->_consequence_type_map('variation_feature', 'consequence_types'); + return $self->_set_number_for_consequences($cons, $map); +} + +=head2 ploidy + + Arg[1] : int $ploidy + Example : my $ploidy = $adaptor->ploidy(); + Description : Gets/sets the ploidy for this database + ReturnType : int + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub ploidy { + my $self = shift; + my $ploidy = shift; + + if(defined($ploidy)) { + $self->{ploidy} = $ploidy; + } + elsif(!defined($self->{ploidy})) { + my $mc = $self->db->get_MetaContainer; + throw("Could not retrieve MetaContainer") unless defined($mc); + + $self->{ploidy} = $mc->ploidy; + } + + return $self->{ploidy}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/BaseGenotypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/BaseGenotypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,178 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::PopulationGenotypeAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor + +=head1 DESCRIPTION + +Abstract adaptor class for fetching genotypes. Should not be invoked directly. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); + + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + +=head2 get_subsnp_handle + + Arg[1] : Bio::EnsEMBL::Variation::Allele + The allele object to get the subsnp handle for + Example : my $handle = $adaptor->get_subsnp_handle($allele); + print "The allele '" . $allele->allele() . "' of subsnp 'ss" . $allele->subsnp_id() . "' was submitted by '$handle'\n"; + + Description : Gets the submitter handle for the specified genotype + ReturnType : string + Exceptions : thrown on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub get_subsnp_handle { + my $self = shift; + my $gt = shift; + + # Assert that the object passed is a Genotype + assert_ref($gt,'Bio::EnsEMBL::Variation::Genotype'); + + # Get the subsnp id and get rid of any 'ss' prefix + my $ssid = $gt->subsnp() || ""; + $ssid =~ s/^ss//; + + my $stmt = qq{ + SELECT + handle + FROM + subsnp_handle + WHERE + subsnp_id = ? + LIMIT 1 + }; + my $sth = $self->prepare($stmt); + $sth->execute($ssid); + my $handle; + $sth->bind_columns(\$handle); + $sth->fetch(); + + return $handle; +} + +# caches genotype codes in a hash on the adaptor object +sub _cache_genotype_codes { + my $self = shift; + + if(!defined($self->{_genotype_code_adaptor})) { + $self->{_genotype_code_adaptor} = $self->db->get_GenotypeCodeAdaptor; + } + + my %gt_codes = map {(join "|", @{$_->genotype}) => $_->dbID} @{$self->{_genotype_code_adaptor}->fetch_all()}; + + $self->db->{_genotype_codes} = \%gt_codes; + + return $self->db->{_genotype_codes}; +} + +# get or (if not yet in DB) add a new GT code +sub _genotype_code { + my ($self, $genotype) = @_; + + # check if cache is loaded + my $just_loaded = 0; + + if(!exists($self->db->{_genotype_codes})) { + $self->_cache_genotype_codes; + $just_loaded = 1; + } + + my $gt_string = join "|", @$genotype; + + if(!exists($self->db->{_genotype_codes}->{$gt_string})) { + + # check another process hasn't created it by reloading the cache + $self->_cache_genotype_codes unless $just_loaded; + + # if it still doesn't exist + if(!exists($self->db->{_genotype_codes}->{$gt_string})) { + + # get allele codes + if(!defined($self->{_allele_adaptor})) { + $self->{_allele_adaptor} = $self->db->get_AlleleAdaptor; + } + + my %allele_codes = map {$_ => $self->{_allele_adaptor}->_allele_code($_)} @$genotype; + + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare(q{ + SELECT max(genotype_code_id) FROM genotype_code + }); + $sth->execute(); + + my $max_id; + $sth->bind_columns(\$max_id); + $sth->fetch; + $sth->finish; + $max_id ||= 0; + + my $gt_code = $max_id + 1; + + $sth = $dbh->prepare(q{ + INSERT INTO genotype_code ( + genotype_code_id, allele_code_id, haplotype_id + ) + VALUES (?,?,?) + }); + + for my $hap_id(1..(scalar @$genotype)) { + $sth->execute($gt_code, $allele_codes{$genotype->[$hap_id-1]}, $hap_id); + } + + $sth->finish; + + $self->db->{_genotype_codes}->{$gt_string} = $gt_code; + } + } + + return $self->db->{_genotype_codes}->{$gt_string}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/BaseStructuralVariationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/BaseStructuralVariationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,434 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor + +=head1 DESCRIPTION + +Abstract adaptor class for fetching structural variants. Should not be invoked directly. + +By default, the 'fetch_all_by_...'-methods will not return variations +that have been flagged as failed in the Ensembl QC. This behaviour can be modified +by setting the include_failed_variations flag in Bio::EnsEMBL::Variation::DBSQL::DBAdaptor. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor; + +use Bio::EnsEMBL::Variation::BaseStructuralVariation; +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use DBI qw(:sql_types); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + +my $DEFAULT_ITERATOR_CACHE_SIZE = 10000; + +# method used by superclass to construct SQL +sub _tables { + my $self = shift; + my @tables = (['structural_variation', 'sv'], ['source', 's']); + + # If we are excluding failed_structural_variations, add that table + push(@tables,['failed_structural_variation', 'fsv']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +sub _columns { + return qw( sv.structural_variation_id sv.variation_name sv.validation_status s.name s.version + s.description sv.class_attrib_id sv.study_id sv.is_evidence sv.somatic ); +} + +# Add a left join to the failed_structural_variation table +sub _left_join { + my $self = shift; + + # If we are including failed structural variations, skip the left join + return () if ($self->db->include_failed_variations()); + return (['failed_structural_variation', 'fsv.structural_variation_id=sv.structural_variation_id']); +} + + +sub _default_where_clause { + my $self = shift; + return 'sv.source_id=s.source_id'; +} + + +=head2 fetch_all + + Description: Returns a listref of all germline structural variants + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Status : At risk + +=cut + +sub fetch_all { + my $self = shift; + my $constraint = 'sv.somatic = 0'; + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_somatic + + Description: Returns a listref of all somatic structural variants + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Status : At risk + +=cut + +sub fetch_all_somatic { + my $self = shift; + my $constraint = 'sv.somatic = 1'; + return $self->generic_fetch($constraint); +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$simple_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all simple features in + the current db + Returntype : list of ints + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub list_dbIDs { + my $self = shift; + return $self->_list_dbIDs('structural_variation'); +} + + +=head2 fetch_by_name + + Args[1] : string $name + Example : my $structural_variation = $sv_adaptor->fetch_by_name('esv263'); + Description : returns the structural variation with the given variation name (or undef if one isn't found). + If the name argument is undef this will be converted to NULL in the SQL statement generated. + ReturnType : Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation object + Exceptions : thrown if there are multiple objects found with the same variation name + Caller : general + Status : Stable + +=cut + +sub fetch_by_name { + my ($self, $name) = @_; + + my $constraint = $self->_internal_exclude_failed_constraint("sv.variation_name='$name'"); + + my $objs = $self->generic_fetch($constraint); + throw("Multiple structural variations found with the same name: '$name'") if @$objs > 1; + return $objs->[0] if @$objs == 1; +} + + +=head2 fetch_all_by_Study + + Arg [1] : Bio::EnsEMBL::Variation::Study $study_id + Example : my $study = $study_adaptor->fetch_by_name('estd1'); + foreach my $sv (@{$sv_adaptor->fetch_all_by_Study($study)}){ + print $sv->variation_name,"\n"; + } + Description : Retrieves all structural variations from a specified study + ReturnType : reference to list of Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Exceptions : throw if incorrect argument is passed + warning if provided study does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Study { + my $self = shift; + my $study = shift; + + if(!ref($study) || !$study->isa('Bio::EnsEMBL::Variation::Study')) { + throw("Bio::EnsEMBL::Variation::Study arg expected"); + } + + if(!$study->dbID()) { + warning("Study does not have dbID, cannot retrieve structural variants"); + return []; + } + + my $constraint = $self->_internal_exclude_failed_constraint('sv.study_id = '.$study->dbID); + + my $result = $self->generic_fetch($constraint); + + return $result; +} + + +=head2 fetch_all_by_dbID_list + + Arg [1] : listref $list + Example : $ssv = $sv_adaptor->fetch_all_by_dbID_list([907,1132]); + Description: Retrieves a listref of structural variant objects via a list of internal + dbID identifiers + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Exceptions : throw if list argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_dbID_list { + my $self = shift; + my $list = shift; + + if(!defined($list) || ref($list) ne 'ARRAY') { + throw("list reference argument is required"); + } + + my $id_str = (@$list > 1) ? " IN (".join(',',@$list).")" : ' = \''.$list->[0].'\''; + + my $constraint = $self->_internal_exclude_failed_constraint("sv.structural_variation_id $id_str"); + + my $result = $self->generic_fetch($constraint); + + return $result; +} + + +=head2 fetch_Iterator_by_dbID_list + + Arg [1] : reference to list of ints $list + Example : $variation_iterator = $va->fetch_Iterator_by_dbID_list([124, 56, 90]); + Description: Retrieves an iterator over a set of structural variations via their internal identifiers. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : throw on bad argument + Caller : general + Status : Experimental + +=cut + +sub fetch_Iterator_by_dbID_list { + my ($self, $dbid_list, $cache_size) = @_; + + unless ((defined $dbid_list) && (ref $dbid_list eq 'ARRAY')) { + throw("list reference argument is required"); + } + + $cache_size ||= $DEFAULT_ITERATOR_CACHE_SIZE; + + # create an iterator that fetches structural variations in blocks of + # $cache_size and returns them in turn + + my @object_cache; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + + if (@object_cache == 0 && @$dbid_list > 0 ) { + my @dbids = splice @$dbid_list, 0, $cache_size; + + # Create a constraint on the dbIDs + my $id_str = "(" . join(",",@dbids) . ")"; + my $constraint = qq{sv.structural_variation_id IN $id_str}; + + @object_cache = @{ $self->generic_fetch($constraint) }; + } + + return shift @object_cache; + } + ); +} + + +# Exclude the constraint for failed structural variant +sub _internal_exclude_failed_constraint { + my $self = shift; + my $constraint = shift; + $constraint .= " AND " . $self->db->_exclude_failed_structural_variations_constraint(); + + return $constraint; +} + + +# API-internal method for getting failed descriptions for an Allele +sub _internal_get_failed_descriptions { + my $self = shift; + my $allele = shift; + my $constraint = shift; + + # Assert that the object passed is an Allele + assert_ref($allele,'Bio::EnsEMBL::Variation::BaseStructuralVariation'); + + my $stmt = qq{ + SELECT DISTINCT + fd.description + FROM + failed_structural_variation fsv JOIN + failed_description fd ON ( + fd.failed_description_id = fsv.failed_description_id + ) + WHERE + fsv.structural_variation_id = ? + }; + $stmt .= qq{ AND $constraint } if (defined($constraint)); + + my $sth = $self->prepare($stmt); + $sth->execute($allele->dbID()); + my @descriptions; + my $description; + $sth->bind_columns(\$description); + while ($sth->fetch()) { + push(@descriptions,$description); + } + return \@descriptions; +} + + +=head2 get_all_failed_descriptions + + Arg[1] : Bio::EnsEMBL::Variation::BaseStructuralVariation $sv + The structural variant object to get the failed descriptions for + Example : + my $failed_descriptions = $adaptor->get_all_failed_descriptions($sv); + if (scalar(@{$failed_descriptions})) { + print "The structural variant'" . $sv->variation_name . "' has been flagged as failed because '" . join("' and '",@{$failed_descriptions}) . "'\n"; + } + + Description : Gets the unique descriptions for the reasons why the supplied structural variant has failed. + ReturnType : reference to a list of strings + Exceptions : thrown on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub get_all_failed_descriptions { + my $self = shift; + my $sv = shift; + + # Call the internal get method without any constraints + my $description = $self->_internal_get_failed_descriptions($sv) || []; + + return $description; +} + + +sub store { + my ($self, $sv) = @_; + + my $dbh = $self->dbc->db_handle; + + # look up source_id + if(!defined($sv->{source_id})) { + my $sth = $dbh->prepare(q{ + SELECT source_id FROM source WHERE name = ? + }); + $sth->execute($sv->{source}); + + my $source_id; + $sth->bind_columns(\$source_id); + $sth->fetch(); + $sth->finish(); + $sv->{source_id} = $source_id; + } + throw("No source ID found for source name ", $sv->{source}) unless defined($sv->{source_id}); + + # look up study_id + if(!defined($sv->{study_id}) && defined($sv->{study_name})) { + my $sth = $dbh->prepare(q{ + SELECT study_id FROM study WHERE name = ? + }); + $sth->execute($sv->{study_name}); + + my $study_id; + $sth->bind_columns(\$study_id); + $sth->fetch(); + $sth->finish(); + $sv->{study_id} = $study_id; + } + + # look up class_attrib_id + my $class_attrib_id; + if(defined($sv->{class_SO_term})) { + my $sth = $dbh->prepare(q{ + SELECT attrib_id FROM attrib WHERE value = ? + }); + $sth->execute($sv->{class_SO_term}); + + $sth->bind_columns(\$class_attrib_id); + $sth->fetch(); + $sth->finish(); + } + throw("No class ID found for the class name ", $sv->{class_SO_term}) unless defined($class_attrib_id); + + + my $sth = $dbh->prepare(q{ + INSERT INTO structural_variation ( + source_id, + study_id, + variation_name, + validation_status, + class_attrib_id, + is_evidence, + somatic + ) VALUES (?,?,?,?,?,?,?) + }); + + $sth->execute( + $sv->{source_id}, + $sv->{study_id} || undef, + $sv->variation_name, + (join ",", @{$sv->get_all_validation_states}) || undef, + $class_attrib_id || 0, + $sv->is_evidence || 0, + $sv->somatic || 0 + ); + + $sth->finish; + + # get dbID + my $dbID = $dbh->last_insert_id(undef, undef, 'structural_variation', 'structural_variation_id'); + $sv->{dbID} = $dbID; + $sv->{adaptor} = $self; + + $sth->finish; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/DBAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/DBAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,225 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::DBAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::DBSQL::DBAdaptor + +=head1 SYNOPSIS + + + +=head1 DESCRIPTION + +This module provides a connection to an Ensembl variation database and +provides a means to obtain ObjectAdaptors. + +=head1 METHODS + +=cut + +use strict; +use warnings; + + +package Bio::EnsEMBL::Variation::DBSQL::DBAdaptor; + + +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +our @ISA = ('Bio::EnsEMBL::DBSQL::DBAdaptor'); + +our $DEFAULT_INCLUDE_FAILED_VARIATIONS = 0; + +sub get_available_adaptors{ + my %pairs = ( + 'Population' => 'Bio::EnsEMBL::Variation::DBSQL::PopulationAdaptor', + 'Individual' => 'Bio::EnsEMBL::Variation::DBSQL::IndividualAdaptor', + 'Variation' => 'Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor', + 'VariationFeature' => 'Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor', + 'StructuralVariation' => 'Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAdaptor', + 'SupportingStructuralVariation' => 'Bio::EnsEMBL::Variation::DBSQL::SupportingStructuralVariationAdaptor', + 'StructuralVariationFeature' => 'Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor', + 'StructuralVariationAnnotation' => 'Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAnnotationAdaptor', + 'Study' => 'Bio::EnsEMBL::Variation::DBSQL::StudyAdaptor', + 'VariationAnnotation' => 'Bio::EnsEMBL::Variation::DBSQL::VariationAnnotationAdaptor', + 'AlleleFeature' => 'Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor', + 'LDFeatureContainer' => 'Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor', + 'IndividualGenotype' => 'Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor', + 'IndividualGenotypeFeature' => 'Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeFeatureAdaptor', + 'PopulationGenotype' => 'Bio::EnsEMBL::Variation::DBSQL::PopulationGenotypeAdaptor', + 'TranscriptVariation' => 'Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor', + 'MetaCoordContainer' => 'Bio::EnsEMBL::DBSQL::MetaCoordContainer', + 'MetaContainer' => 'Bio::EnsEMBL::Variation::DBSQL::MetaContainer', + 'ReadCoverage' => 'Bio::EnsEMBL::Variation::DBSQL::ReadCoverageAdaptor', + 'GenotypeCode' => 'Bio::EnsEMBL::Variation::DBSQL::GenotypeCodeAdaptor', + 'VariationAnnotation' => 'Bio::EnsEMBL::Variation::DBSQL::VariationAnnotationAdaptor', + 'ReadCoverageCollection' => 'Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor', + 'VariationSet' => 'Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor', + 'OverlapConsequence' => 'Bio::EnsEMBL::Variation::DBSQL::OverlapConsequenceAdaptor', + 'Attribute' => 'Bio::EnsEMBL::Variation::DBSQL::AttributeAdaptor', + 'Allele' => 'Bio::EnsEMBL::Variation::DBSQL::AlleleAdaptor', + 'ProteinFunctionPredictionMatrix' => 'Bio::EnsEMBL::Variation::DBSQL::ProteinFunctionPredictionMatrixAdaptor', + 'Phenotype' => 'Bio::EnsEMBL::Variation::DBSQL::PhenotypeAdaptor', + ); + + return (\%pairs); +} + + +=head2 include_failed_variations + + Arg [1] : int $newval (optional) + Example : + #ÊGet a DBAdaptor for the human variation database + my $dba = $registry->get_DBAdaptor('human','variation'); + + #ÊConfigure the DBAdaptor to return failed variations when using + #Êfetch methods in the various object adaptors + $dba->include_failed_variations(1); + + #ÊGet a variation set adaptor + my $vs_adaptor = $dba->get_VariationSetAdaptor(); + + #ÊGet a variation set for the 1000 genomes high coverage Yoruba trio data + my $vs = $vs_adaptor->fetch_by_name('1000 genomes - High coverage - Trios - YRI'); + + # Get the iterator for the variations belonging to this variation set. + #ÊThis will now include variations that has been flagged as being failed. + #ÊThe default behaviour is not to return these. + my $it = $vs->get_Variation_Iterator(); + + # Iterate over the variations + while ($it->has_next()) { + + # Get the next variation object in the iterator + my $v = $it->next(); + + # Check if the variation is flagged as failed + if ($v->is_failed()) { + # Do something... + } + # If not, check if any of its subsnps have been flagged as failed + elsif ($v->has_failed_subsnps()) { + #ÊDo something else... + } + else { + #ÊDo something else... + } + } + + Description: Getter/Setter for the behaviour of the adaptors connected through this + DBAdaptor when it comes to variations and alleles that have been flagged as failed. + The default behaviour is not to return these variations or alleles in e.g. the + 'fetch_all_by...'-type methods. If this flag is set, those methods will + instead also return failed variations and alleles. Note that a variation is considered + failed when the variation itself is failed. If only some alleles belonging + to the variation are failed, the entire variation will not be considered + to be failed. + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub include_failed_variations { + my $self = shift; + my $include = shift; + + #ÊIf the flag should be modified, do that + if (defined($include)) {$self->{'include_failed_variations'} = $include;} + + #ÊIn case the flag has not been set at all, set it to the default value + unless (exists($self->{'include_failed_variations'})) {$self->{'include_failed_variations'} = $DEFAULT_INCLUDE_FAILED_VARIATIONS;} + + return $self->{'include_failed_variations'}; +} + +# API-internal method for getting the constraint to filter out failed variations. Assumes that the +# failed_variation table has been (left) joined to the query and that the table alias is either supplied +# or equals 'fv' +sub _exclude_failed_variations_constraint { + my $self = shift; + my $table_alias = shift; + + # If not specified, assume that the failed_variation table alias is 'fv' + $table_alias ||= 'fv'; + + return $self->_exclude_failed_constraint('variation_id',$table_alias); +} + +# API-internal method for getting the constraint to filter out failed structural variations. Assumes that the +# failed_structural_variation table has been (left) joined to the query and that the table alias is either supplied +# or equals 'fsv' +sub _exclude_failed_structural_variations_constraint { + my $self = shift; + my $table_alias = shift; + + # If not specified, assume that the failed_structural_variation table alias is 'fsv' + $table_alias ||= 'fsv'; + + return $self->_exclude_failed_constraint('structural_variation_id',$table_alias); +} + +# API-internal method for getting the constraint to filter out failed alleles. Assumes that the +# failed_allele table has been (left) joined to the query and that the table alias is either supplied +# or equals 'fa' +sub _exclude_failed_alleles_constraint { + my $self = shift; + my $table_alias = shift; + + # If not specified, assume that the failed_variation table alias is 'fv' + $table_alias ||= 'fa'; + + return $self->_exclude_failed_constraint('allele_id',$table_alias); +} + +sub _exclude_failed_constraint { + my $self = shift; + my $key_column = shift; + my $table_alias = shift; + + #ÊIf we should include failed objects, no extra condition is needed + return qq{ 1 } if ($self->include_failed_variations()); + + # Otherwise, add a constraint on the alias table to have the key_column NULL + my $constraint = qq{ + ( + $table_alias.$key_column IS NULL + ) + }; + + return $constraint; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/GenotypeCodeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/GenotypeCodeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,160 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::GenotypeCodeAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::GenotypeCodeAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $va = $reg->get_adaptor("human","variation","genotypecode"); + + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for GenotypeCode objects. +GenotypeCodes may be retrieved from the Ensembl variation database by +several means using this module. + +GenotypeCode objects are internal objects utilised by the CompressedGenotype +adaptor - they are not intended for external use. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::GenotypeCodeAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; + +use Bio::EnsEMBL::Variation::GenotypeCode; + +use DBI qw(:sql_types); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + + +sub fetch_all_by_dbID_list { + my ($self, $id_list_ref) = @_; + + if(!defined($id_list_ref) || ref($id_list_ref) ne 'ARRAY') { + throw("id_list list reference argument is required"); + } + + return [] if(!@$id_list_ref); + + my @out; + + # mysql is faster and we ensure that we do not exceed the max query size by + # splitting large queries into smaller queries of 200 ids + my $max_size = 200; + my @id_list = @$id_list_ref; + + while(@id_list) { + my @ids; + if(@id_list > $max_size) { + @ids = splice(@id_list, 0, $max_size); + } else { + @ids = splice(@id_list, 0); + } + + my $id_str; + if(@ids > 1) { + $id_str = " IN (" . join(',', @ids). ")"; + } else { + $id_str = " = ?"; + $self->bind_param_generic_fetch($ids[0],SQL_INTEGER); + } + + my $constraint = "gc.genotype_code_id $id_str"; + + push @out, @{$self->generic_fetch($constraint)}; + } + + return \@out; +} + +# +sub fetch_all_single_bp { + my $self = shift; + + my $constraint = "length(ac.allele) = 1"; + + return $self->generic_fetch($constraint); +} + +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my $ploidy = $self->ploidy; + + my ($gt_code_id, $haplotype_id, $allele); + my (@result, %gts); + + $sth->bind_columns(\$gt_code_id, \$haplotype_id, \$allele); + + $gts{$gt_code_id}{$haplotype_id} = $allele while $sth->fetch; + + foreach $gt_code_id(keys %gts) { + my @gt = map {$gts{$gt_code_id}{$_}} sort {$a <=> $b} keys %{$gts{$gt_code_id}}; + + # splice it down to ploidy size + @gt = splice @gt, 0, $ploidy; + + push @result, Bio::EnsEMBL::Variation::GenotypeCode->new_fast({ + dbID => $gt_code_id, + genotype => \@gt, + }); + } + + return \@result; +} + +# method used by superclass to construct SQL +sub _tables { + return (['genotype_code','gc'],['allele_code','ac']); +} + +sub _columns { + return qw( gc.genotype_code_id gc.haplotype_id ac.allele ); +} + +sub _default_where_clause { + return 'gc.allele_code_id = ac.allele_code_id'; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/IndividualAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/IndividualAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,632 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::IndividualAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::DBSQL::IndividualAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $ia = $reg->get_adaptor("human","variation","individual"); + $pa = $reg->get_adaptor("human","variation","population"); + + # Get an individual by its internal identifier + my $ind = $ia->fetch_by_dbID(52); + + # Get all individuals with a particular name + foreach my $ind (@{$ia->fetch_all_by_name('PKH053(M)')}) { + print "Individual ", $ind->name(), "\n"; + } + + # get all individuals from a population + my $pop = $pa->fetch_by_name('PACIFIC'); + foreach my $ind (@{$ia->fetch_all_by_Population($pop)}) { + print $ind->name(), "\n"; + } + + # get all children of an individual + foreach my $child (@{$ia->fetch_all_by_parent($ind)}) { + print $child->name(), " is a child of ", $ind->name(), "\n"; + } + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for Individual objects. +Individuals may be retrieved from the ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::IndividualAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use Bio::EnsEMBL::Variation::Individual; +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor'); + +sub store { + my ($self, $ind) = @_; + + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare(q{ + INSERT INTO sample ( + name, + description, + display + ) VALUES (?,?,?) + }); + + $sth->execute( + $ind->name, + $ind->description, + $ind->display + ); + $sth->finish; + + # get the sample_id inserted + my $dbID = $dbh->last_insert_id(undef, undef, 'sample', 'sample_id'); + + $ind->{dbID} = $dbID; + $ind->{adaptor} = $self; + + # retrieve individual type ID - default to 3 (outbred) + my $individual_type_id = 3; + + if(defined($ind->type_individual)) { + my $sth = $dbh->prepare(q{ + SELECT individual_type_id + FROM individual_type + WHERE name = ? + }); + $sth->execute($ind->type_individual); + + $sth->bind_columns(\$individual_type_id); + $sth->fetch(); + $sth->finish(); + } + + # add entry to individual table also + $sth = $dbh->prepare(q{ + INSERT INTO individual ( + sample_id, + gender, + father_individual_sample_id, + mother_individual_sample_id, + individual_type_id + ) VALUES (?,?,?,?,?) + }); + $sth->execute( + $ind->dbID, + $ind->gender || 'Unknown', + $ind->father_Individual ? $ind->father_Individual->dbID : undef, + $ind->mother_Individual ? $ind->mother_Individual->dbID : undef, + $individual_type_id + ); + $sth->finish; + + # store individual/population relationships + $sth = $dbh->prepare(q{ + INSERT INTO individual_population ( + individual_sample_id, + population_sample_id + ) VALUES (?,?) + }); + + foreach my $pop(@{$ind->{populations}}) { + next unless defined($pop->dbID); + + $sth->execute( + $ind->dbID, + $pop->dbID + ); + } + + $sth->finish; +} + +=head2 fetch_individual_by_synonym + + Arg [1] : $individual_synonym + Example : my $ind = $ind_adaptor->fetch_individual_by_synonym($individual_synonym,$source); + Description : Retrieves individual for the synonym given in the source. If no source is provided, retrieves all the synonyms + Returntype : list of Bio::EnsEMBL::Variation::Individual + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_individual_by_synonym{ + my $self = shift; + my $synonym_name = shift; + my $source = shift; + my $individuals; + my $ind; + #return all sample_id from the database + my $samples = $self->SUPER::fetch_sample_by_synonym($synonym_name, $source); + foreach my $sample_id (@{$samples}){ + #get the ones that are individuals + $ind = $self->fetch_by_dbID($sample_id); + push @{$individuals}, $ind if (defined $ind); + } + return $individuals; +} + +=head2 fetch_all_by_name + + Arg [1] : string $name the name of the individuals to retrieve + Example : my @inds = @{$ind_adaptor->fetch_all_by_name('CEPH1332.05')}; + Description: Retrieves all individuals with a specified name. Individual + names may be non-unique which is why this method returns a + reference to a list. + Returntype : reference to a list of Individual ids + Exceptions : throw if no argument passed + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_name { + my $self = shift; + my $name = shift; + + defined($name) || throw("name argument expected"); + + my $sth = $self->prepare + (q{SELECT i.sample_id, s.name, s.description, s.display, + i.gender, i.father_individual_sample_id, i.mother_individual_sample_id, it.name, it.description + FROM individual i, sample s, individual_type it + WHERE s.name = ? + AND it.individual_type_id = i.individual_type_id + AND s.sample_id = i.sample_id}); + + $sth->bind_param(1,$name,SQL_VARCHAR); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $result; +} + + + + +=head2 fetch_all_by_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population $pop + Example : my $pop = $pop_adaptor->fetch_by_name('PACIFIC'); + foreach my $ind (@{$ia->fetch_all_by_Population($pop)}) { + print $ind->name(), "\n"; + } + Description: Retrieves all individuals from a specified population + Returntype : reference to list of Bio::EnsEMBL::Variation::Individual objects + Exceptions : throw if incorrect argument is passed + warning if provided Population does not have an dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Population { + my $self = shift; + my $pop = shift; + + if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { + throw("Bio::EnsEMBL::Variation::Population arg expected"); + } + + if(!$pop->dbID()) { + warning("Population does not have dbID, cannot retrieve Individuals"); + return []; + } + + my $sth = $self->prepare + (q{SELECT i.sample_id, s.name, s.description, s.display, + i.gender, i.father_individual_sample_id, i.mother_individual_sample_id, it.name, it.description + FROM individual i, individual_population ip, sample s, individual_type it + WHERE i.sample_id = ip.individual_sample_id + AND i.sample_id = s.sample_id + AND i.individual_type_id = it.individual_type_id + AND ip.population_sample_id = ?}); + + $sth->bind_param(1,$pop->dbID,SQL_INTEGER); + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + + + + +=head2 fetch_all_by_parent_Individual + + Arg [1] : Bio::EnsEMBL::Variation::Individual + Example : my @children = @{$ia->fetch_all_by_parent_Individual($ind)}; + Description: Retrieves all individuals which are children of a provided + parent individual. This function operates under the assumptions + that Male individuals can only be fathers, Female individuals + can only be mothers and Unknown individuals can only be one + or the other - not both. + Returntype : reference to list of Bio::EnsEMBL::Variation::Individuals + Exceptions : throw if incorrect argument passed + warning if provided individual has no dbID + Caller : general, Individual::get_all_child_Individuals + Status : At Risk + +=cut + +sub fetch_all_by_parent_Individual { + my $self = shift; + my $parent = shift; + + if(!ref($parent) || !$parent->isa('Bio::EnsEMBL::Variation::Individual')) { + throw("Bio::EnsEMBL::Variation::Individual argument expected"); + } + + if(!defined($parent->dbID())) { + warning("Cannot fetch child Individuals for parent without dbID"); + return []; + } + + my $gender = $parent->gender() || ''; + + my $father_sql = + q{SELECT i.sample_id, s.name, s.description, s.display, + i.gender, i.father_individual_sample_id, i.mother_individual_sample_id, it.name, it.description + FROM individual i, sample s, individual_type it + WHERE i.father_individual_sample_id = ? + AND i.individual_type_id = it.individual_type_id + AND s.sample_id = i.sample_id}; + my $mother_sql = + q{SELECT i.sample_id, s.name, s.description, s.display, + i.gender, i.father_individual_sample_id, i.mother_individual_sample_id, it.name, it.description + FROM individual i, sample s, individual_type it + WHERE i.mother_individual_sample_id = ? + AND i.individual_type_id = it.individual_type_id + AND i.sample_id = s.sample_id}; + + if($gender eq 'Male') { + my $sth = $self->prepare($father_sql); + $sth->bind_param(1,$parent->dbID,SQL_INTEGER); + $sth->execute(); + my $result = $self->_objs_from_sth($sth); + $sth->finish(); + return $result; + } + elsif($gender eq 'Female') { + my $sth = $self->prepare($mother_sql); + $sth->bind_param(1,$parent->dbID,SQL_INTEGER); + $sth->execute(); + my $result = $self->_objs_from_sth($sth); + $sth->finish(); + return $result; + } + + # unknown gender + + my $sth = $self->prepare($mother_sql); + $sth->bind_param(1,$parent->dbID,SQL_INTEGER); + $sth->execute(); + my $result = $self->_objs_from_sth($sth); + $sth->finish(); + + # if this parent was a mother, finish now and return results + return if(@$result); + + # otherwise assume was a father (or nothing) + $sth = $self->prepare($father_sql); + $sth->bind_param(1,$parent->dbID,SQL_INTEGER); + $sth->execute(); + $result = $self->_objs_from_sth($sth); + $sth->finish(); + + return $result; +} + +=head2 fetch_all_strains + + Args : none + Example : my $strains = $ind_adaptor->fetch_all_strains(); + Description: Retrieves Individuals that should be considered as strain (fully inbred) in the specie. + Returntype : list of Bio::EnsEMBL::Variation::Individual + Exceptions : none + Caller : Bio:EnsEMBL:Variation::Individual + Status : At Risk + +=cut + +sub fetch_all_strains{ + my $self = shift; + + return $self->generic_fetch("it.name = 'fully_inbred'"); + +} + +=head2 get_display_strains + + Args : none + Example : my $strains = $ind_adaptor->get_display_strains(); + Description: Retrieves strain_names that are going to be displayed in the web (reference + default + others) + Returntype : list of strings + Exceptions : none + Caller : web + Status : At Risk + +=cut + +sub get_display_strains{ + my $self = shift; + my @strain_names; + my $name; + #first, get the reference strain + $name = $self->get_reference_strain_name(); + push @strain_names, $name; + #then, get the default ones + my $default_strains = $self->get_default_strains(); + push @strain_names, @{$default_strains}; + + #and finally, get the others + my $sth = $self->prepare(qq{SELECT name FROM sample WHERE display = ?}); + + $sth->bind_param(1, 'DISPLAYABLE'); + $sth->execute; + $sth->bind_columns(\$name); +# my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ? +# }); +# $sth->bind_param(1,'individual.display_strain',SQL_VARCHAR); +# $sth->execute(); +# $sth->bind_columns(\$name); + while ($sth->fetch()){ + push @strain_names, $name; + } + $sth->finish; + return \@strain_names; + +} + + +=head2 get_default_strains + + Args : none + Example : my $strains = $ind_adaptor->get_default_strains(); + Description: Retrieves strain_names that are defined as default in the database(mainly, for web purposes) + Returntype : list of strings + Exceptions : none + Caller : web + Status : At Risk + +=cut + +sub get_default_strains{ + my $self = shift; + my @strain_names; + my $name; + + my $sth = $self->prepare(qq{SELECT name FROM sample WHERE display = ?}); + + $sth->bind_param(1, 'DEFAULT'); + $sth->execute; + $sth->bind_columns(\$name); + +# my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ? +# }); +# $sth->bind_param(1,'individual.default_strain',SQL_VARCHAR); +# $sth->execute(); +# $sth->bind_columns(\$name); + while ($sth->fetch()){ + push @strain_names, $name; + } + $sth->finish; + return \@strain_names; + +} + + +=head2 get_reference_strain_name + + Args : none + Example : my $reference_strain = $ind_adaptor->get_reference_strain_name(); + Description: Retrieves the reference strain_name that is defined as default in the database(mainly, for web purposes) + Returntype : string + Exceptions : none + Caller : web + Status : At Risk + +=cut + +sub get_reference_strain_name{ + my $self = shift; + + my $name; + + my $sth = $self->prepare(qq{SELECT name FROM sample WHERE display = ?}); + + $sth->bind_param(1, 'REFERENCE'); + $sth->execute; + $sth->bind_columns(\$name); + $sth->fetch(); + $sth->finish; + + return $name; +# +# my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ? +# }); +# $sth->bind_param(1,'individual.reference_strain',SQL_VARCHAR); +# $sth->execute(); +# $sth->bind_columns(\$name); +# $sth->fetch(); +# $sth->finish; +# +# return $name; + +} + + +=head2 fetch_all_strains_with_coverage + + Args : none + Example : my $strains = $ind_adaptor->fetch_all_strains_with_coverage(); + Description: Retrieves strain that have coverage information + Returntype : list of Bio::EnsEMBL::Variation::Individual + Exceptions : none + Caller : web + Status : At Risk + +=cut + +sub fetch_all_strains_with_coverage{ + + my $self = shift; + + my $sample_id; + my @strains; + my $sth = $self->prepare(qq{SELECT DISTINCT sample_id from read_coverage + }); + $sth->execute(); + $sth->bind_columns(\$sample_id); + while ($sth->fetch()){ + push @strains, $self->fetch_by_dbID($sample_id) + } + $sth->finish; + return \@strains; +} +# +# private method, constructs Individuals from an executed statement handle +# ordering of columns must be consistant +# +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my ($dbID, $name, $desc, $gender, $display_flag, $father_id, $mother_id,$it_name,$it_desc); + + $sth->bind_columns(\$dbID, \$name, \$desc, \$display_flag, \$gender, + \$father_id, \$mother_id, \$it_name, \$it_desc); + + my %seen; + my %wanted_fathers; + my %wanted_mothers; + + my @inds; + + + while($sth->fetch()) { + # get objects for mother and father if they were already constructed + # otherwise may have to be lazy-loaded later + my $father; + if(defined($father_id)) { + $father = $seen{$father_id}; + if(!$father) { + $wanted_fathers{$dbID} ||= []; + push @{$wanted_fathers{$father_id}}, $dbID; + } + } + my $mother; + if(defined($mother_id)) { + $mother = $seen{$mother_id}; + if(!$mother) { + $wanted_mothers{$mother_id} ||= []; + push @{$wanted_mothers{$mother_id}}, $dbID; + } + } + + + my $ind = $seen{$dbID} ||= Bio::EnsEMBL::Variation::Individual->new + (-dbID => $dbID, + -adaptor => $self, + -description => $desc, + -display => $display_flag, + -gender => $gender, + -name => $name, + -father_individual => $father, + -mother_individual => $mother, + -father_individual_sample_id => $father_id, + -mother_individual_sample_id => $mother_id, + -type_individual => $it_name, + -type_description => $it_desc); + + $seen{$dbID} = $ind; + + push @inds, $ind; + } + + # load any of the 'wanted' parent individuals that we did not have at the + # of creation, but which we have now + + foreach my $wanted_id (keys %wanted_fathers) { + if($seen{$wanted_id}) { + # add father to every child that wanted it + foreach my $ind_id (@{$wanted_fathers{$wanted_id}}) { + $seen{$ind_id}->father_Individual($seen{$wanted_id}); + } + } + } + foreach my $wanted_id (keys %wanted_mothers) { + if($seen{$wanted_id}) { + # add mother to every child that wanted it + foreach my $ind_id (@{$wanted_mothers{$wanted_id}}) { + $seen{$ind_id}->mother_Individual($seen{$wanted_id}); + } + } + } + + return \@inds; + +} + +sub _tables{return (['individual','i'], + ['sample','s'], + ['individual_type','it'])} + +sub _columns{ + return qw(s.sample_id s.name s.description s.display i.gender i.father_individual_sample_id i.mother_individual_sample_id it.name it.description); +} + +sub _default_where_clause{ + return 's.sample_id = i.sample_id AND i.individual_type_id = it.individual_type_id'; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/IndividualGenotypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/IndividualGenotypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor +# +# Copyright (c) 2005 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor + +=head1 SYNOPSIS + +Adaptor for IndividualGenotype objects. + +=head1 DESCRIPTION + +This adaptor provides database connectivity for IndividualGenotype objects. +IndividualGenotypes may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut +package Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor; + +use strict; +use warnings; + +use vars qw(@ISA); + +use Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor; +use Bio::EnsEMBL::Variation::IndividualGenotype; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); + +@ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor'); + +# stores a listref of individual genotype objects +sub store { + my ($self, $gts, $merge) = @_; + + throw("First argument to store is not a listref") unless ref($gts) eq 'ARRAY'; + + # sort genotypes into rows by variation + my %by_var; + push @{$by_var{$_->variation->dbID.'_'.($_->{subsnp} ? $_->{subsnp} : '')}}, $_ for @$gts; + + # get unique genotypes and codes + my %unique_gts = map {$_->genotype_string() => 1} @$gts; + $unique_gts{$_} = $self->_genotype_code([split /\|/, $_]) for keys %unique_gts; + + # get variation objects + my %var_objs = map {$_->variation->dbID => $_->variation} @$gts; + + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare_cached(qq{ + INSERT INTO compressed_genotype_var( + variation_id, + subsnp_id, + genotypes + ) VALUES (?,?,?) + }); + + my $update_sth = $dbh->prepare(qq{ + UPDATE compressed_genotype_var + SET genotypes = ? + WHERE variation_id = ? + AND subsnp_id = ? + }); + + my $rows_added = 0; + + foreach my $combo_id(keys %by_var) { + my $genotype_string = ''; + + my ($var_id, $subsnp_id) = split /\_/, $combo_id; + $subsnp_id = undef if $subsnp_id eq ''; + + # check for existing genotypes + my @existing_gts = ($merge ? + grep { + (defined($_->{subsnp}) && defined($subsnp_id) && $_->{subsnp} eq $subsnp_id) || + (!defined($_->{subsnp}) && !defined($subsnp_id)) + } @{$self->fetch_all_by_Variation($var_objs{$var_id})} : ()); + + # update if existing + if(@existing_gts) { + + # refresh unique_gts + %unique_gts = map {$_->genotype_string() => 1} (@existing_gts, @$gts); + $unique_gts{$_} = $self->_genotype_code([split /\|/, $_]) for keys %unique_gts; + + # make sure we don't put in duplicates + my %by_ind = map {$_->individual->dbID => $_} (@existing_gts, @$gts); + + $genotype_string .= pack("ww", $_->individual->dbID, $unique_gts{$_->genotype_string}) for values %by_ind; + + $update_sth->execute( + $genotype_string, + $var_id, + $subsnp_id + ); + } + + else { + $genotype_string .= pack("ww", $_->individual->dbID, $unique_gts{$_->genotype_string}) for @$gts; + + $sth->execute( + $var_id, + $subsnp_id, + $genotype_string + ); + + $rows_added++; + } + + } + + $sth->finish; + $update_sth->finish; + + return $rows_added; +} + + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation $variation + Example : my $var = $variation_adaptor->fetch_by_name( "rs1121" ) + $igtypes = $igtype_adaptor->fetch_all_by_Variation( $var ) + Description: Retrieves a list of individual genotypes for the given Variation. + If none are available an empty listref is returned. + Returntype : listref Bio::EnsEMBL::Variation::IndividualGenotype + Exceptions : none + Caller : general + Status : At Risk + +=cut + + +sub fetch_all_by_Variation { + my $self = shift; + my $variation = shift; + my $individual = shift; + + if(!ref($variation) || !$variation->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation argument expected'); + } + + if(!defined($variation->dbID())) { + warning("Cannot retrieve genotypes for variation without dbID"); + return []; + } + + my $results = $self->generic_fetch("g.variation_id = " . $variation->dbID()); + + # individual can be an individual or a population + if (defined $individual && defined $individual->dbID){ + if($individual->isa('Bio::EnsEMBL::Variation::Individual')) { + @$results = grep {$_->individual->dbID == $individual->dbID} @$results; + } + elsif($individual->isa('Bio::EnsEMBL::Variation::Population')) { + my %include = map {$_->dbID => 1} @{$individual->get_all_Individuals}; + @$results = grep {$include{$_->individual->dbID}} @$results; + } + else { + throw("Argument supplied is not of type Bio::EnsEMBL::Variation::Sample"); + } + } + + $_->variation($variation) for @$results; + + # flip genotypes for flipped variations + #if(defined $variation->flipped && $variation->flipped == 1) { + # foreach my $gt(@$results) { + # my @new_gt; + # + # foreach my $allele(@{$gt->{genotype}}) { + # reverse_comp(\$allele); + # push @new_gt, $allele; + # } + # $gt->{genotype} = \@new_gt; + # } + #} + + return $results; +} + +sub fetch_all_by_Slice { + my $self = shift; + + my $cga = $self->db->get_IndividualGenotypeFeatureAdaptor(); + + return $cga->fetch_all_by_Slice(@_); +} + +sub _tables{ + my $self = shift; + + return (['compressed_genotype_var','g'],['failed_variation','fv']); +} + +#ÊAdd a left join to the failed_variation table +sub _left_join { return ([ 'failed_variation', 'fv.variation_id = g.variation_id']); } + +sub _columns{ + return qw(g.variation_id g.subsnp_id g.genotypes); +} + +sub _objs_from_sth{ + my $self = shift; + my $sth = shift; + + my ($variation_id, $subsnp_id, $genotypes); + + $sth->bind_columns(\$variation_id, \$subsnp_id, \$genotypes); + + my (%individual_hash, %gt_code_hash, @results); + + while($sth->fetch) { + my @genotypes = unpack("(ww)*", $genotypes); + + while(@genotypes) { + my $sample_id = shift @genotypes; + my $gt_code = shift @genotypes; + + my $igtype = Bio::EnsEMBL::Variation::IndividualGenotype->new_fast({ + _variation_id => $variation_id, + subsnp => $subsnp_id, + adaptor => $self, + }); + + $individual_hash{$sample_id} ||= []; + push @{$individual_hash{$sample_id}}, $igtype; + + $gt_code_hash{$gt_code} ||= []; + push @{$gt_code_hash{$gt_code}}, $igtype; + + push @results, $igtype; + } + } + + # fetch individuals + my $ia = $self->db()->get_IndividualAdaptor(); + my $inds = $ia->fetch_all_by_dbID_list([keys %individual_hash]); + + foreach my $i (@$inds) { + foreach my $igty (@{$individual_hash{$i->dbID()}}) { + $igty->{individual} = $i; + } + } + + # get all genotypes from codes + my $gtca = $self->db->get_GenotypeCodeAdaptor(); + my $gtcs = $gtca->fetch_all_by_dbID_list([keys %gt_code_hash]); + + foreach my $gtc(@$gtcs) { + foreach my $igty(@{$gt_code_hash{$gtc->dbID}}) { + $igty->{genotype} = $gtc->genotype; + } + } + + return \@results; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/IndividualGenotypeFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/IndividualGenotypeFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,378 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeFeatureAdaptor +# +# Copyright (c) 2005 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeFeatureAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $iga = $reg->get_adaptor("human","variation","individualgenotype"); + + #returns all genotypes in a certain Slice + + $genotypes = $iga->fetch_by_Slice($slice); + + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for IndividualGenotypeFeature objects. +IndividualGenotypeFeatures may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeFeatureAdaptor; + +use strict; +use warnings; + +use vars qw(@ISA); + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use Bio::EnsEMBL::Variation::IndividualGenotypeFeature; + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation $variation + Example : my $var = $variation_adaptor->fetch_by_name( "rs1121" ) + $igtypes = $igtype_adaptor->fetch_all_by_Variation( $var ) + Description: Retrieves a list of individual genotypes for the given Variation. + If none are available an empty listref is returned. + Returntype : listref Bio::EnsEMBL::Variation::IndividualGenotype + Exceptions : none + Caller : general + Status : At Risk + +=cut + + +sub fetch_all_by_Variation { + my $self = shift; + my $variation = shift; + my $individual = shift; + + my $res; + if(!ref($variation) || !$variation->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation argument expected'); + } + + if(!defined($variation->dbID())) { + warning("Cannot retrieve genotypes for variation without set dbID"); + return []; + } + + $self->{_variation_id} = $variation->dbID; + + # foreach of the hitting variation Features, get the Genotype information + foreach my $vf (@{$variation->get_all_VariationFeatures}){ + + # get the feature slice for this VF + my $fs = $vf->feature_Slice(); + + # if the feature slice is start > end + if($fs->start > $fs->end) { + + # get a new slice with the start and end the right way round + # otherwise the call won't pick any variations up + my $new_fs = $fs->{'adaptor'}->fetch_by_region($fs->coord_system->name,$fs->seq_region_name,$fs->end,$fs->start); + $fs = $new_fs; + } + + map {$_->variation($variation); push @{$res}, $_} @{$self->fetch_all_by_Slice($fs, $individual)}; + } + + delete $self->{_variation_id}; + + return $res; +} + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL:Slice $slice + Arg [2] : (optional) Bio::EnsEMBL::Variation::Individual $individual + Example : my @IndividualGenotypesFeatures = @{$ca->fetch_all_by_Slice($slice)}; + Description: Retrieves all IndividualGenotypeFeature features for a given slice for + a certain individual (if provided). + Returntype : reference to list Bio::EnsEMBL::Variation::IndividualGenotype + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice{ + my $self = shift; + my $slice = shift; + my $individual = shift; + + my @results; + my $features; + my $constraint; + + #if passed inividual or population, add constraint + if (defined $individual && defined $individual->dbID){ + my $instr; + + if($individual->isa("Bio::EnsEMBL::Variation::Population")) { + my $inds = $individual->get_all_Individuals; + my @list; + push @list, $_->dbID foreach @$inds; + $instr = (@list > 1) ? " IN (".join(',',@list).")" : ' = \''.$list[0].'\''; + $constraint = " c.sample_id $instr"; + } + else { + $constraint = ' c.sample_id = ' . $individual->dbID; + } + } + + $features = $self->SUPER::fetch_all_by_Slice_constraint($slice,$constraint); + + + my $seq_region_slice = $slice->seq_region_Slice; + + foreach my $indFeature (@{$features}){ + if ($indFeature->start > 0 && ($slice->end-$slice->start +1) >= $indFeature->end){ + if ($indFeature->slice->strand == -1){ #ignore the different strand transformation + + # Position will change if the strand is negative so change the strand to 1 temporarily + $indFeature->slice->{'strand'} = 1; + my $newFeature = $indFeature->transfer($seq_region_slice); + $indFeature->slice->{'strand'} = -1; + $newFeature->slice->{'strand'} = -1; + $newFeature->variation($indFeature->variation); + push @results, $newFeature; + } + else{ + push @results,$indFeature->transfer($seq_region_slice); + } + } + } + + return \@results; + +} + +sub _tables{ + return (['compressed_genotype_region','c']); +} + +sub _columns{ + return qw(sample_id seq_region_id seq_region_start seq_region_end seq_region_strand genotypes); +} + +sub _write_columns{ + return $_[0]->_columns; +} + +sub _objs_from_sth{ + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->dnadb()->get_SliceAdaptor(); + + my (@results, %slice_hash, %sr_name_hash, %sr_cs_hash, %individual_hash, %gt_code_hash); + + my ($sample_id, $seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand, $genotypes); + + $sth->bind_columns( + \$sample_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, \$genotypes + ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + } + + FEATURE: while($sth->fetch()) { + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $orig_start = $seq_region_start; + + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + ($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) = + $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs); + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($sr_name)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||= + $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef, + $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= + $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, + $asm_cs_vers); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + # $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + # $seq_region_strand *= -1; + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { + next FEATURE; + } + } + $slice = $dest_slice; + } + + #my @genotypes = unpack '(ww)*', $genotypes; + my @genotypes = unpack '(www)*', $genotypes; + my $snp_start = $seq_region_start; + + #while( my( $gt_code, $gap ) = splice @genotypes, 0, 2 ) { + while( my( $variation_id, $gt_code, $gap ) = splice @genotypes, 0, 3 ) { + + if(defined($self->{_variation_id})) { + if($variation_id != $self->{_variation_id}) { + $snp_start += $gap + 1 if defined $gap; + next; + } + } + + my $igtype = Bio::EnsEMBL::Variation::IndividualGenotypeFeature->new_fast({ + 'start' => $snp_start, + 'end' => $snp_start, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'gt_code' => $gt_code, + 'adaptor' => $self, + '_variation_id' => $variation_id + }); + + $individual_hash{$sample_id} ||= []; + push @{$individual_hash{$sample_id}}, $igtype; + + $gt_code_hash{$gt_code} ||= []; + push @{$gt_code_hash{$gt_code}}, $igtype; + + push @results, $igtype; + $snp_start += $gap + 1 if defined $gap; + } + } + + # get all individual in one query (faster) + # and add to already created genotypes + my $ia = $self->db()->get_IndividualAdaptor(); + my $inds = $ia->fetch_all_by_dbID_list([keys %individual_hash]); + + foreach my $i (@$inds) { + foreach my $igty (@{$individual_hash{$i->dbID()}}) { + $igty->{individual} = $i; + } + } + + # get all genotypes from codes + my $gtca = $self->db->get_GenotypeCodeAdaptor(); + my $gtcs = $gtca->fetch_all_by_dbID_list([keys %gt_code_hash]); + + foreach my $gtc(@$gtcs) { + foreach my $igty(@{$gt_code_hash{$gtc->dbID}}) { + $igty->{genotype} = $gtc->genotype; + } + } + + # unique sort the results on individual and position (we don't care if GTs disagree) + my %tmp_hash = map {$_->{individual}."_".$_->{start}."_".$_->{end} => $_} @results; + + return [values %tmp_hash]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/LDFeatureContainerAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/LDFeatureContainerAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,637 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $sa = $reg->get_adaptor("human","core","slice"); + $lda = $reg->get_adaptor("human","variation","ldfeaturecontainer"); + $vfa = $reg->get_adaptor("human","variation","variationfeature"); + + # Get a LDFeatureContainer in a region + $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); + + $ldContainer = $lda->fetch_by_Slice($slice); + + print "Name of the ldContainer is: ", $ldContainer->name(); + + # fetch ld featureContainer for a particular variation feature + + $vf = $vfa->fetch_by_dbID(145); + + $ldContainer = $lda->fetch_by_VariationFeature($vf); + + print "Name of the ldContainer: ", $ldContainer->name(); + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for LDFeature objects. +LD Features may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Variation::LDFeatureContainer; +use vars qw(@ISA); +use Data::Dumper; + +use POSIX; +use FileHandle; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use constant MAX_SNP_DISTANCE => 100_000; + +use base qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +our $MAX_SNP_DISTANCE = 100000; +our $BINARY_FILE = ''; +our $TMP_PATH = ''; + +sub executable { + my $self = shift; + $BINARY_FILE = shift if @_; + unless( $BINARY_FILE ) { + my $binary_name = 'calc_genotypes'; + ($BINARY_FILE) = grep {-e $_} map {"$_/calc_genotypes"} split /:/,$ENV{'PATH'}; + } + return $BINARY_FILE; +} + +sub temp_path { + my $self = shift; + $TMP_PATH = shift if @_; + $TMP_PATH ||= '/tmp'; + return $TMP_PATH; +} + +=head2 fetch_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch genes on. Assuming it is always correct (in the top level) + Arg [2] : (optional) Bio::EnsEMBL::Variation::Population $population. Population where + we want to select the LD information + Example : $ldFeatureContainer = $ldfeaturecontainer_adaptor->fetch_by_Slice($slice); + Description: Overwrites superclass method to add the name of the slice to the LDFeatureContainer. + Returntype : Bio::EnsEMBL::Variation::LDFeatureContainer + Exceptions : thrown on bad argument + Caller : general + Status : At Risk + +=cut +sub fetch_by_Slice { + my $self = shift; + my $slice = shift; + my $population = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + my $sth; + my $in_str; + my $siblings = {}; + #when there is no population selected, return LD in the HapMap and PerlEgen populations + $in_str = $self->_get_LD_populations($siblings); + #if a population is passed as an argument, select the LD in the region with the population + if ($population){ + if(!ref($population) || !$population->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population arg expected'); + } + my $population_id = $population->dbID; + $in_str = " = $population_id"; +# if ($in_str =~ /$population_id/){ +# $in_str = "IN ($population_id)"; +#' } +# else{ +# warning("Not possible to calculate LD for a non HapMap or PerlEgen population: $population_id"); +# return {}; +# } + } + + if ($in_str eq ''){ + #there is no population, not a human specie or not passed as an argument, return the empy container + my $t = Bio::EnsEMBL::Variation::LDFeatureContainer->new( + '-ldContainer'=> {}, + '-name' => $slice->name, + '-variationFeatures' => {} + ); + return $t + } + + $sth = $self->prepare(qq{SELECT c.sample_id,c.seq_region_id,c.seq_region_start,c.seq_region_end,c.genotypes,ip.population_sample_id + FROM compressed_genotype_region c, individual_population ip + WHERE ip.individual_sample_id = c.sample_id + AND ip.population_sample_id $in_str + AND c.seq_region_id = ? + AND c.seq_region_start >= ? and c.seq_region_start <= ? + AND c.seq_region_end >= ? + ORDER BY c.seq_region_id, c.seq_region_start},{mysql_use_result => 1}); + + $sth->bind_param(1,$slice->get_seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$slice->start - MAX_SNP_DISTANCE,SQL_INTEGER) if ($slice->start - MAX_SNP_DISTANCE >= 1); + $sth->bind_param(2,1,SQL_INTEGER) if ($slice->start - MAX_SNP_DISTANCE < 1); + $sth->bind_param(3,$slice->end,SQL_INTEGER); + $sth->bind_param(4,($slice->start < 1 ? 1 : $slice->start),SQL_INTEGER); + + $sth->execute(); + + my $ldFeatureContainer = $self->_objs_from_sth($sth,$slice,$siblings); + + $sth->finish(); + #and store the name of the slice in the Container + $ldFeatureContainer->name($slice->name()); + return $ldFeatureContainer; +} + +=head2 fetch_by_VariationFeature + + Arg [1] : Bio::EnsEMBL:Variation::VariationFeature $vf + Arg [2] : (optional) Bio::EnsEMBL::Variation::Population $pop + Example : my $ldFeatureContainer = $ldFetureContainerAdaptor->fetch_by_VariationFeature($vf); Description: Retrieves LDFeatureContainer for a given variation feature. If optional population is supplied, values are only returned for that population. + Returntype : reference to Bio::EnsEMBL::Variation::LDFeatureContainer + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_by_VariationFeature { + my $self = shift; + my $vf = shift; + my $pop = shift; + + if(!ref($vf) || !$vf->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + throw('Bio::EnsEMBL::Variation::VariationFeature arg expected'); + } + + if(!defined($vf->dbID())) { + throw("VariationFeature arg must have defined dbID"); + } + + # cache the position so objs_from_sth picks it up later to filter + $self->{_vf_pos} = $vf->seq_region_start; + + # fetch by slice using expanded feature slice + my $ldFeatureContainer = $self->fetch_by_Slice($vf->feature_Slice->expand(MAX_SNP_DISTANCE,MAX_SNP_DISTANCE),$pop); + + # delete the cached pos + delete $self->{_vf_pos}; + + $ldFeatureContainer->name($vf->dbID); + + return $ldFeatureContainer; + + +} + + +sub get_populations_by_Slice{ + my $self = shift; + + my $population_hash = $self->get_populations_hash_by_Slice(@_); + return [values(%{$population_hash})]; +} + +sub get_populations_hash_by_Slice{ + my $self = shift; + my $slice = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + my $pop_list = $self->_get_LD_populations(); + + my ($sr, $slice_start, $slice_end) = ($slice->get_seq_region_id, $slice->start, $slice->end); + + my %results; + my $pop_threshold = 20; # number of individuals required + my $gen_threshold = 3; # number of genotypes per individual required + + # just get the population list if it's really too long + if($slice->length > 10000000) { + + my $sth = $self->prepare(qq{SELECT sample_id, name FROM sample WHERE sample_id $pop_list;}); + $sth->execute; + + + %results = map {$_->[0] => $_->[1]} @{$sth->fetchall_arrayref()}; + } + + # do a guesstimate for long slices, otherwise it takes too long + elsif($slice->length > 5000000) { + + my $sth = $self->prepare(qq{ + SELECT distinct(c.sample_id), s.name + FROM compressed_genotype_region c, individual_population ip, sample s, individual i + WHERE c.sample_id = ip.individual_sample_id + AND ip.population_sample_id = s.sample_id + AND c.sample_id = i.sample_id + AND c.seq_region_id = ? + AND c.seq_region_start >= ? and c.seq_region_start <= ? + AND c.seq_region_end >= ? + AND i.father_individual_sample_id is NULL AND i.mother_individual_sample_id is NULL + AND (s.sample_id $pop_list) + }); + + $sth->execute($slice->get_seq_region_id, $slice->start, $slice->end, $slice->start); + + my %counts = (); + + while(my $row = $sth->fetchrow_arrayref()) { + my ($ind_id, $pop_id, $pop_name) = @$row; + $results{$pop_id} = $pop_name; + $counts{$pop_id}++; + } + + #ÊDelete the populations that doesn't have enough individuals + delete @results{ grep {$counts{$_} <= $pop_threshold} keys(%counts)}; + } + + else { + + my $sth = $self->prepare(qq{ + SELECT s.sample_id, s.name, c.sample_id, c.seq_region_start, c.seq_region_end, c.genotypes + FROM compressed_genotype_region, individual_population ip, sample s, individual i + WHERE c.sample_id = ip.individual_sample_id + AND ip.population_sample_id = s.sample_id + AND c.sample_id = i.sample_id + AND c.seq_region_id = ? + AND c.seq_region_start >= ? and c.seq_region_start <= ? + AND c.seq_region_end >= ? + AND i.father_individual_sample_id is NULL AND i.mother_individual_sample_id is NULL + AND (s.sample_id $pop_list) + }); + + $sth->execute($sr, $slice_start, $slice_end, $slice_start); + + my (%enough, %counts, %sample_pop, %counts_pop); + + my $row_count = 0; + + while(my $row = $sth->fetchrow_arrayref()) { + my ($population_id, $population_name, $sample_id, $start, $end, $genotypes) = @$row; + + $row_count++; + + next if $enough{$sample_id}; + + + $results{$population_id} = $population_name; + $sample_pop{$sample_id} = $population_name; + + # if the row is only partially within the slice + if($start < $slice_start || $end > $slice_end) { + + my @genotypes = unpack("(www)*", $genotypes); + my $snp_start = $start; + + while( my( $variation_id, $gt_code, $gap ) = splice @genotypes, 0, 3 ) { + if( + ($snp_start >= $slice_start) && + ($snp_start <= $slice_end) + ) { + $counts{$sample_id}++; + } + + $snp_start += $gap + 1 if defined $gap; + last if $snp_start > $slice_end; + } + } + + # if the row is fully within the slice + else { + $counts{$sample_id} += (((length($genotypes) - 2) / 4) + 1); + } + + $enough{$sample_id} = 1 if $counts{$sample_id} >= $gen_threshold; + $counts_pop{$population_id}++ if $counts{$sample_id} >= $gen_threshold; + } + + delete @results{grep {$counts_pop{$_} <= $pop_threshold} keys %counts_pop}; + } + + return \%results; +} + + +#for a given population, gets all individuals that are children (have father or mother) +sub _get_siblings{ + my $self = shift; + my $population_id = shift; + my $siblings = shift; + + my $sth_individual = $self->db->dbc->prepare(qq{SELECT i.sample_id + FROM individual i, individual_population ip + WHERE ip.individual_sample_id = i.sample_id + AND ip.population_sample_id = ? + AND i.father_individual_sample_id IS NOT NULL + AND i.mother_individual_sample_id IS NOT NULL + }); + my ($individual_id); + $sth_individual->execute($population_id); + $sth_individual->bind_columns(\$individual_id); + while ($sth_individual->fetch){ + $siblings->{$population_id.'-'.$individual_id}++; #necessary to have in the key the population, since some individuals are shared between + #populations + } +} + +#reads one line from the compress_genotypes table, uncompress the data, and writes it to the different hashes: one containing the number of bases for the variation and the other with the actual genotype information we need to print in the file +sub _store_genotype{ + my $self = shift; + my $genotype_codes = shift; + my $individual_information = shift; + my $alleles_variation = shift; + my $individual_id = shift; + my $seq_region_start = shift; + my $genotype = shift; + my $population_id = shift; + my $slice = shift; + + my $snp_start = $seq_region_start; + my ($slice_start, $slice_end) = ($slice->start, $slice->end); + + my @genotypes = unpack("(www)*", $genotype); + while( my( $variation_id, $gt_code, $gap ) = splice @genotypes, 0, 3 ) { + my $gt = $genotype_codes->{$gt_code}; + + if( + defined $gt && + ($snp_start >= $slice_start) && + ($snp_start <= $slice_end) && + $gt->[0] =~ /^[ACGT]$/ && + $gt->[1] =~ /[ACGT]$/ + ) { + $alleles_variation->{$snp_start}->{$population_id}->{$gt->[0]}++; + $alleles_variation->{$snp_start}->{$population_id}->{$gt->[1]}++; + + $individual_information->{$population_id}->{$snp_start}->{$individual_id}->{allele_1} = $gt->[0]; + $individual_information->{$population_id}->{$snp_start}->{$individual_id}->{allele_2} = $gt->[1]; + } + + $snp_start += $gap + 1 if defined $gap; + } +} + +# +# Converts the genotype into the required format for the calculation of the pairwise_ld value: AA, Aa or aa +# From the Allele table, will select the alleles and compare to the alleles in the genotype +# + +sub _convert_genotype{ + my $self = shift; + my $alleles_variation = shift; #reference to the hash containing the alleles for the variation present in the genotypes + my $individual_information = shift; #reference to a hash containing the values to be written to the file + my @alleles_ordered; #the array will contain the alleles ordered by apparitions in the genotypes (only 2 values possible) + + @alleles_ordered = sort({$alleles_variation->{$b} <=> $alleles_variation->{$a}} keys %{$alleles_variation}); + + #let's convert the allele_1 allele_2 to a genotype in the AA, Aa or aa format, where A corresponds to the major allele and a to the minor + foreach my $individual_id (keys %{$individual_information}){ + #if both alleles are different, this is the Aa genotype + if ($individual_information->{$individual_id}{allele_1} ne $individual_information->{$individual_id}{allele_2}){ + $individual_information->{$individual_id}{genotype} = 'Aa'; + } + #when they are the same, must find out which is the major + else{ + if ($alleles_ordered[0] eq $individual_information->{$individual_id}{allele_1}){ + #it is the major allele + $individual_information->{$individual_id}{genotype} = 'AA'; + } + else{ + $individual_information->{$individual_id}{genotype} = 'aa'; + } + + } + } +} + +sub _get_LD_populations{ + my $self = shift; + my $siblings = shift; + my ($pop_id,$population_name); +# my $sth = $self->db->dbc->prepare(qq{SELECT s.sample_id, s.name +# FROM population p, sample s +# WHERE (s.name like 'PERLEGEN:AFD%' +# OR s.name like 'CSHL-HAPMAP%') +# AND s.sample_id = p.sample_id}); + my $sth = $self->db->dbc->prepare(qq{SELECT sample_id, name FROM sample WHERE display = 'LD'}); + + $sth->execute(); + $sth->bind_columns(\$pop_id,\$population_name); + #get all the children that we do not want in the genotypes + my @pops; + while($sth->fetch){ + if($population_name =~ /CEU|YRI|MEX/){ + $self->_get_siblings($pop_id,$siblings); + } + push @pops, $pop_id; + } + + my $in_str = " IN (" . join(',', @pops). ")"; + + return $in_str if (defined $pops[0]); + return '' if (!defined $pops[0]); + +} + + +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + my $slice = shift; + my $siblings = shift; + + my ($sample_id,$ld_region_id,$ld_region_start,$ld_region_end,$d_prime,$r2,$sample_count); + my ($vf_id1,$vf_id2); + + my %feature_container = (); + my %vf_objects = (); + + #get all Variation Features in Slice + my $vfa = $self->db->get_VariationFeatureAdaptor(); + my $variations = $vfa->fetch_all_by_Slice($slice); #retrieve all variation features + #create a hash that maps the position->vf_id + my %pos_vf = (); + my $region_Slice = $slice->seq_region_Slice(); + map {$pos_vf{$_->seq_region_start} = $_->transfer($region_Slice)} @{$variations}; + + my %alleles_variation = (); #will contain a record of the alleles in the variation. A will be the major, and a the minor. When more than 2 alleles + #, the genotypes for that variation will be discarded + my %individual_information = (); #to store the relation of snps->individuals + my %regions; #will contain all the regions in the population and the number of genotypes in each one + my $previous_seq_region_id = 0; + + my %_pop_ids; + + my ($individual_id, $seq_region_id, $seq_region_start,$seq_region_end,$genotypes, $population_id); + my @cmd = qw(calc_genotypes); + + #open the pipe between processes if the binary file exists in the PATH + my $bin = $self->executable; + #open the pipe between processes if the binary file exists in the PATH + if( ! $bin ) { + warning("Binary file calc_genotypes not found. Please, read the ensembl-variation/C_code/README.txt file if you want to use LD calculation\n"); + goto OUT; + } + + # fetch all genotype codes as a hash + my $gtca = $self->db->get_GenotypeCodeAdaptor; + my %genotype_codes; + $genotype_codes{$_->dbID} = $_->genotype for @{$gtca->fetch_all_single_bp()}; + + #my $file= $self->temp_path."/".sprintf( "ld%08x%08x%08x", $$, time, rand( 0x7fffffff) ); + #open IN, ">$file.in"; + $sth->bind_columns(\$individual_id, \$seq_region_id, \$seq_region_start, \$seq_region_end, \$genotypes, \$population_id); + while($sth->fetch()) { + #only print genotypes without parents genotyped + if (!exists $siblings->{$population_id . '-' . $individual_id}){ #necessary to use the population_id + $self->_store_genotype(\%genotype_codes, \%individual_information,\%alleles_variation, $individual_id, $seq_region_start, $genotypes, $population_id, $slice); + $previous_seq_region_id = $seq_region_id; + } + } + $sth->finish(); + + #we have to print the variations + my (%in_files, %in_file_names); + + foreach my $snp_start (sort{$a<=>$b} keys %alleles_variation){ + foreach my $population (keys %{$alleles_variation{$snp_start}}){ + + my $fh; + + # create file handles in hash + if(!defined($in_files{$population})) { + $fh = FileHandle->new; + my $f_name = $self->temp_path."/".sprintf( "ld%08x%08x%08x%08x", $population, $$, time, rand( 0x7fffffff)); + $in_file_names{$population} = $f_name; + $fh->open(">".$f_name.".in"); + $in_files{$population} = $fh; + } + + else { + $fh = $in_files{$population}; + } + + #if the variation has 2 alleles, print all the genotypes to the file + if (keys %{$alleles_variation{$snp_start}{$population}} == 2){ + $self->_convert_genotype($alleles_variation{$snp_start}{$population},$individual_information{$population}{$snp_start}); + foreach my $individual_id (keys %{$individual_information{$population}{$snp_start}}){ + print $fh join("\t",$previous_seq_region_id,$snp_start, $snp_start, + $population, $individual_id, + $individual_information{$population}{$snp_start}{$individual_id}{genotype})."\n" || warn $!; + } + } + } + } + + # close file handles and check file sizes + foreach my $key(keys %in_files) { + my $f = $in_files{$key}; + + my @stats = stat $f; + $f->close; + if($stats[7] == 0) { + unlink($in_file_names{$key}.'.in'); + delete $in_file_names{$key}; + delete $in_files{$key}; + } + } + + # run LD binary + `$bin <$_\.in >$_\.out` for values %in_file_names; + + + foreach my $file(values %in_file_names) { + open OUT, "$file.out"; + while(){ + my %ld_values = (); + + # 936 965891 164284 166818 0.628094 0.999996 120 + #get the ouput into the hashes + chomp; + + ($sample_id,$ld_region_id,$ld_region_start,$ld_region_end,$r2,$d_prime,$sample_count) = split /\s/; + + # skip entries unrelated to selected vf if doing fetch_all_by_VariationFeature + if(defined($self->{_vf_pos})) { + next unless $ld_region_start == $self->{_vf_pos} || $ld_region_end == $self->{_vf_pos}; + } + + $ld_values{'d_prime'} = $d_prime; + $ld_values{'r2'} = $r2; + $ld_values{'sample_count'} = $sample_count; + + if (!defined $pos_vf{$ld_region_start} || !defined $pos_vf{$ld_region_end}){ + next; #problem to fix in the compressed genotype table: some of the positions seem to be wrong + } + $vf_id1 = $pos_vf{$ld_region_start}->dbID(); + $vf_id2 = $pos_vf{$ld_region_end}->dbID(); + + $feature_container{$vf_id1 . '-' . $vf_id2}->{$sample_id} = \%ld_values; + $vf_objects{$vf_id1} = $pos_vf{$ld_region_start}; + $vf_objects{$vf_id2} = $pos_vf{$ld_region_end}; + + $_pop_ids{$sample_id} = 1; + } + close OUT || die "Could not close filehandle: $!\n"; + + unlink( "$file.in" ); + unlink( "$file.out" ); + } + +OUT: + my $t = Bio::EnsEMBL::Variation::LDFeatureContainer->new( + '-ldContainer'=> \%feature_container, + '-name' => '', + '-variationFeatures' => \%vf_objects + ); + + $t->{'_pop_ids'} =\%_pop_ids; + + return $t; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/MetaContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/MetaContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,89 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# EnsEMBL module for Bio::EnsEMBL::Variation::DBSQL::MetaContainer +# +# Cared for by Daniel Rios +# +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + + Bio::EnsEMBL::Variation::DBSQL::MetaContainer - + Encapsulates all access to variation database meta information + +=head1 SYNOPSIS + + my $meta_container = $db_adaptor->get_MetaContainer(); + + my $default_population = $meta_container->get_default_LDPopulation(); + +=head1 DESCRIPTION + + An object that encapsulates specific access to variation db meta data + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Variation::DBSQL::MetaContainer; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseMetaContainer; +use Bio::EnsEMBL::Utils::Exception; + + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseMetaContainer); + + +sub get_schema_version { + my $self = shift; + + my $arrRef = $self->list_value_by_key( 'schema_version' ); + + if( @$arrRef ) { + my ($ver) = ($arrRef->[0] =~ /^\s*(\d+)\s*$/); + if(!defined($ver)){ # old style format + return 0; + } + return $ver; + } else { + warning("Please insert meta_key 'schema_version' " . + "in meta table at variation db.\n"); + } + return 0; +} + +sub ploidy { + my $self = shift; + + my $values = $self->list_value_by_key('ploidy'); + + # default to 2 + return scalar @$values ? $values->[0] : 2; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/PhenotypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/PhenotypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,88 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::PhenotypeAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::PhenotypeAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $pa = $reg->get_adaptor("human","variation","phenotype"); + + # Get a list of all phenotypes. + $phenotypes = $pa->fetch_all(); + +=head1 DESCRIPTION + +This adaptor provides database connectivity for Phenotype objects. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::PhenotypeAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Variation::Phenotype; + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); + +=head2 fetch_all + + Example : my $phenotypes = $phenotype_adaptor->fetch_all(); + Description : Retrieves an array of all phenotyes. + Returntype : listref of Bio::EnsEMBL::Variation::Phenotype. + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all { + my $self = shift; + my @phenotypes; + + my $sth = $self->prepare(qq{SELECT phenotype_id, description from phenotype}); + $sth->execute(); + my ($dbID, $phenotype_description); + $sth->bind_columns(\$dbID, \$phenotype_description); + while ($sth->fetch) { + push @phenotypes, Bio::EnsEMBL::Variation::Phenotype->new( + -dbID => $dbID, + -DESCRIPTION => $phenotype_description,); + } + + return \@phenotypes; +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/PopulationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/PopulationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,704 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::PopulationAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::PopulationAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $pa = $reg->get_adaptor("human","variation","population"); + + # Get a Population by its internal identifier + $pop = $pa->fetch_by_dbID(145); + + # fetch a population by its name + $pop = $pa->fetch_by_name('PACIFIC'); + + # fetch all sub populations of a population + foreach $sub_pop (@{$pa->fetch_all_by_super_Population($pop)}) { + print $sub_pop->name(), " is a sub population of ", $pop->name(), "\n"; + } + + # fetch all super populations + foreach $super_pop (@{$pa->fetch_all_by_sub_Population($pop)}) { + print $pop->name(), " is a sub population of ", $super_pop->name(), "\n"; + } + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for Population objects. +Populations may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::PopulationAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(wrap_array); + +use Bio::EnsEMBL::Variation::Population; + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor'); + +sub store { + my ($self, $pop) = @_; + + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare(q{ + INSERT INTO sample ( + name, + size, + description + ) VALUES (?,?,?) + }); + + $sth->execute( + $pop->name, + $pop->size, + $pop->description + ); + $sth->finish; + + # get the sample_id inserted + my $dbID = $dbh->last_insert_id(undef, undef, 'sample', 'sample_id'); + + $pop->{dbID} = $dbID; + $pop->{adaptor} = $self; + + # add entry to population table also + $sth = $dbh->prepare(q{ + INSERT INTO population (sample_id) VALUES (?) + }); + $sth->execute($dbID); + $sth->finish; +} + +=head2 fetch_population_by_synonym + + Arg [1] : $population_synonym + Example : my $pop = $pop_adaptor->fetch_population_by_synonym($population_synonym,$source); + Description : Retrieves populations for the synonym given in the source. If no source is provided, retrieves all the synonyms + Returntype : list of Bio::EnsEMBL::Variation::Population + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_population_by_synonym{ + my $self = shift; + my $synonym_name = shift; + my $source = shift; + my $pops; + my $pop; + #return all sample_id from the database + my $samples = $self->SUPER::fetch_sample_by_synonym($synonym_name, $source); + foreach my $sample_id (@{$samples}){ + #get the ones that are individuals + $pop = $self->fetch_by_dbID($sample_id); + push @{$pops}, $pop if (defined $pop); + } + return $pops; +} + + +=head2 fetch_by_name + + Arg [1] : string $name + Example : $pop = $pop_adaptor->fetch_by_name('NUSPAE:Singapore_HDL'); + Description: Retrieves a population object via its name + Returntype : Bio::EnsEMBL::Variation::Population + Exceptions : throw if name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + + throw('name argument expected') if(!defined($name)); + + my $sth = $self->prepare(q{SELECT p.sample_id, s.name, s.size, s.description + FROM population p, sample s + WHERE s.name = ? + AND s.sample_id = p.sample_id}); + + $sth->bind_param(1,$name,SQL_VARCHAR); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + + $sth->finish(); + + return undef if(!@$result); + + return $result->[0]; +} + +=head2 fetch_all_by_dbID_list + + Arg [1] : listref $list + Example : $pops = $pop_adaptor->fetch_all_by_dbID_list([907,1132]); + Description: Retrieves a listref of population objects via a list of internal + dbID identifiers + Returntype : listref of Bio::EnsEMBL::Variation::Population objects + Exceptions : throw if list argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_dbID_list { + my $self = shift; + my $list = shift; + + if(!defined($list) || ref($list) ne 'ARRAY') { + throw("list reference argument is required"); + } + + return [] unless scalar @$list >= 1; + + my $id_str = (@$list > 1) ? " IN (".join(',',@$list).")" : ' = \''.$list->[0].'\''; + + my $sth = $self->prepare(qq{SELECT p.sample_id, s.name, s.size, s.description + FROM population p, sample s + WHERE s.sample_id $id_str + AND s.sample_id = p.sample_id}); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + + $sth->finish(); + + return undef if(!@$result); + + return $result; +} + + +=head2 fetch_all_by_name_search + + Arg [1] : string $name + Example : $pop = $pop_adaptor->fetch_all_by_name_search('CEU'); + Description: Retrieves a list of population objects whose name matches the + search term. + Returntype : Listref of Bio::EnsEMBL::Variation::Population objects + Exceptions : throw if name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_name_search { + my $self = shift; + my $name = shift; + + throw('name argument expected') if(!defined($name)); + + my $sth = $self->prepare(q{SELECT p.sample_id, s.name, s.size, s.description + FROM population p, sample s + WHERE s.name like concat('%', ?, '%') + AND s.sample_id = p.sample_id}); + + $sth->bind_param(1,$name,SQL_VARCHAR); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $result; +} + + +=head2 fetch_all_by_super_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population $pop + Example : foreach $sub_pop (@{$pa->fetch_all_by_super_Population($pop)}) { + print $sub_pop->name(), "\n"; + } + Description: Retrieves all sub populations of a provided population. + Returntype : Bio::EnsEMBL::Population + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_super_Population { + my $self = shift; + my $pop = shift; + + if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population argument expected'); + } + + if(!$pop->dbID()) { + warning("Cannot retrieve sub populations for population without dbID"); + return []; + } + + my $sth = $self->prepare(q{SELECT p.sample_id, s.name, s.size, + s.description + FROM population p, population_structure ps, sample s + WHERE p.sample_id = ps.sub_population_sample_id + AND ps.super_population_sample_id = ? + AND p.sample_id = s.sample_id}); + + $sth->bind_param(1,$pop->dbID,SQL_INTEGER); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $result; +} + + + +=head2 fetch_all_by_sub_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population $pop + Example : foreach $super_pop (@{$pa->fetch_all_by_sub_Population($pop)}) { + print $super_pop->name(), "\n"; + } + Description: Retrieves all super populations for a provided population + Returntype : Bio::EnsEMBL::Population + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_sub_Population { + my $self = shift; + my $pop = shift; + + if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population argument expected'); + } + + if(!$pop->dbID()) { + warning("Cannot retrieve super populations for population without dbID"); + return []; + } + + my $sth = $self->prepare(q{SELECT p.sample_id, s.name, s.size, + s.description + FROM population p, population_structure ps, sample s + WHERE p.sample_id = ps.super_population_sample_id + AND ps.sub_population_sample_id = ? + AND p.sample_id = s.sample_id}); + + $sth->bind_param(1,$pop->dbID,SQL_INTEGER); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $result; +} + + +=head2 fetch_default_LDPopulation + + Args : none + Example : $population = $pop_adaptor->fetch_default_LDPopulation(); + Description : Obtains the population it is used as a default in the LD display of the pairwise LD data + ReturnType : Bio::EnsEMBL::Variation::Population + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_default_LDPopulation{ + my $self = shift; + my $population_id; + + my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ?}); + + $sth->bind_param(1,'pairwise_ld.default_population',SQL_VARCHAR); + $sth->execute(); + $sth->bind_columns(\$population_id); + $sth->fetch(); + $sth->finish; + + if (defined $population_id){ + return $self->fetch_by_dbID($population_id); + } + else{ + return undef; + } +} + + +=head2 fetch_all_LD_Populations + + Example : @populations = @{$pop_adaptor->fetch_all_LD Populations(); + Description : Gets all populations that can be used in the LD display + ReturnType : listref of Bio::EnsEMBL::Variation::Population objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_LD_Populations{ + my $self = shift; + + return [grep {$_->name !~ /ALL|AFR|AMR|ASN|EUR/} @{$self->generic_fetch(qq{ s.display = 'LD' })}]; +} + + +=head2 fetch_all_HapMap_Populations + + Example : @populations = @{$pop_adaptor->fetch_all_HapMap_populations(); + Description : Gets all populations from the HapMap project (human only!) + ReturnType : listref of Bio::EnsEMBL::Variation::Population objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_HapMap_Populations { + my $self = shift; + + return $self->generic_fetch(qq{ s.name like 'cshl-hapmap%' }); +} + + +=head2 fetch_all_1KG_Populations + + Example : @populations = @{$pop_adaptor->fetch_all_1KG_populations(); + Description : Gets all populations from the 1000 genomes project (human only!) + ReturnType : listref of Bio::EnsEMBL::Variation::Population objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_1KG_Populations{ + my $self = shift; + + return $self->generic_fetch(qq{ s.name like '1000GENOMES%' }); +} + + +=head2 fetch_all_by_Individual + + Arg [1] : Bio::EnsEMBL::Variation::Individual $ind + Example : my $ind = $ind_adaptor->fetch_by_name('NA12004'); + foreach my $pop (@{$pop_adaptor->fetch_all_by_Individual($ind)}){ + print $pop->name,"\n"; + } + Description : Retrieves all populations from a specified individual + ReturnType : reference to list of Bio::EnsEMBL::Variation::Population objects + Exceptions : throw if incorrect argument is passed + warning if provided individual does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Individual{ + my $self = shift; + my $ind = shift; + + if(!ref($ind) || !$ind->isa('Bio::EnsEMBL::Variation::Individual')) { + throw("Bio::EnsEMBL::Variation::Individual arg expected"); + } + + if(!$ind->dbID()) { + warning("Individual does not have dbID, cannot retrieve Individuals"); + return []; + } + + my $sth = $self->prepare(qq{SELECT p.sample_id, s.name, s.size, s.description + FROM population p, individual_population ip, sample s + WHERE s.sample_id = ip.population_sample_id + AND s.sample_id = p.sample_id + AND ip.individual_sample_id = ? + }); + $sth->bind_param(1,$ind->dbID,SQL_INTEGER); + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + + +=head2 fetch_all_by_Individual_list + + Arg [1] : reference to list of of Bio::EnsEMBL::Variation::Individual objects + Example : foreach my $pop (@{$pop_adaptor->fetch_all_by_Individual_list($inds)}){ + print $pop->name,"\n"; + } + Description : Retrieves all populations from a specified individual + ReturnType : reference to list of Bio::EnsEMBL::Variation::Population objects + Exceptions : throw if incorrect argument is passed + warning if provided individual does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Individual_list{ + my $self = shift; + my $list = shift; + + if(!ref($list) || !$list->[0]->isa('Bio::EnsEMBL::Variation::Individual')) { + throw("Listref of Bio::EnsEMBL::Variation::Individual arg expected"); + } + + if(!$list->[0]->dbID()) { + warning("First Individual does not have dbID, cannot retrieve Populations"); + return []; + } + + my $id_str = " IN (" . join(',', map {$_->dbID} @$list). ")"; + + my $sth = $self->prepare(qq{ + SELECT p.sample_id, s.name, s.size, s.description + FROM population p, individual_population ip, sample s + WHERE s.sample_id = ip.population_sample_id + AND s.sample_id = p.sample_id + AND ip.individual_sample_id $id_str + }); + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + + +=head2 fetch_tagged_Population + + Arg [1] : Bio::EnsEMBL::Variation::VariationFeature $vf + Example : my $vf = $vf_adaptor->fetch_by_name('rs205621'); + my $populations_tagged = $vf->is_tagged(); + foreach my $pop (@{$vf_adaptor->is_tagged}){ + print $pop->name," has been tagged using a 0.99 r2 criteria\n"; + } + Description : Retrieves all populations from a specified variation feature that have been tagged + ReturnType : reference to list of Bio::EnsEMBL::Variation::Population objects + Exceptions : throw if incorrect argument is passed + warning if provided variation feature does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_tagged_Population{ + my $self = shift; + my $variation_feature = shift; + + if(!ref($variation_feature) || !$variation_feature->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + throw("Bio::EnsEMBL::Variation::VariationFeature arg expected"); + } + + if(!$variation_feature->dbID()) { + warning("Variation feature does not have dbID, cannot retrieve tagged populations"); + return []; + } + + my $sth = $self->prepare(qq{ + SELECT p.sample_id, s.name, s.size, s.description + FROM population p, tagged_variation_feature tvf, sample s + WHERE p.sample_id = tvf.sample_id + AND s.sample_id = p.sample_id + AND tvf.tagged_variation_feature_id = ? + }); + $sth->bind_param(1,$variation_feature->dbID,SQL_INTEGER); + $sth->execute(); + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + +=head2 fetch_tag_Population + + Arg [1] : Bio::EnsEMBL::Variation::VariationFeature $vf + Example : my $vf = $vf_adaptor->fetch_by_name('rs205621'); + my $populations_is_tag = $vf->is_tag(); + foreach my $pop (@{$vf_adaptor->is_tag}){ + print $pop->name," has been tagged using a 0.99 r2 criteria\n"; + } + Description : Retrieves all populations in which the specified variation feature is a tag + ReturnType : reference to list of Bio::EnsEMBL::Variation::Population objects + Exceptions : throw if incorrect argument is passed + warning if provided variation feature does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_tag_Population{ + my $self = shift; + my $variation_feature = shift; + + if(!ref($variation_feature) || !$variation_feature->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + throw("Bio::EnsEMBL::Variation::VariationFeature arg expected"); + } + + if(!$variation_feature->dbID()) { + warning("Variation feature does not have dbID, cannot retrieve tag populations"); + return []; + } + + my $sth = $self->prepare(qq{ + SELECT p.sample_id, s.name, s.size, s.description + FROM population p, tagged_variation_feature tvf, sample s + WHERE p.sample_id = tvf.sample_id + AND s.sample_id = p.sample_id + AND tvf.variation_feature_id = ? + }); + $sth->bind_param(1,$variation_feature->dbID,SQL_INTEGER); + $sth->execute(); + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + + +=head2 get_sample_id_for_population_names + + Arg [1] : $population_names reference to list of population names + Example : my $ids = $pop_adaptor->get_sample_id_for_population_names(['CSHL-HAPMAP:HAPMAP-MEX','1000GENOMES:pilot_1_CHB+JPT_low_coverage_panel']); + map {printf("Population: \%s has sample_id \%d\n",$ids->{$_},$_)} keys(%{$ids}); + Description : Retrieve the sample_ids for a list of population names + ReturnType : reference to hash with sample_ids as keys and population names as values + Caller : web + Status : At Risk + +=cut + +sub get_sample_id_for_population_names { + my $self = shift; + my $population_names = shift; + + # Wrap the argument into an arrayref + $population_names = wrap_array($population_names); + + # Define a statement handle for the lookup query + my $stmt = qq{ + SELECT + s.sample_id + s.name + FROM + sample s + WHERE + s.name = ? + LIMIT 1 + }; + my $sth = $self->prepare($stmt); + + # Loop over the population names and query the db + my %sample_ids; + foreach my $name (@{$population_names}) { + $sth->execute($name); + + my ($sid,$sname); + $sth->bind_columns(\$sid,\$sname); + $sth->execute(); + + $sample_ids{$sid} = $sname if (defined($sid)); + } + + return \%sample_ids; +} + +# +# private method, creates population objects from an executed statement handle +# ordering of columns must be consistant +# +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my @pops; + + my ($pop_id, $name, $size, $desc); + + $sth->bind_columns(\$pop_id, \$name, \$size, \$desc); + + while($sth->fetch()) { + + push @pops, Bio::EnsEMBL::Variation::Population->new + (-dbID => $pop_id, + -ADAPTOR => $self, + -NAME => $name, + -DESCRIPTION => $desc, + -SIZE => $size); + } + + return \@pops; +} + +sub _tables{return (['population','p'], + ['sample','s']);} + +sub _columns{ + return qw(s.sample_id s.name s.size s.description); +} + +sub _default_where_clause{ + return 's.sample_id = p.sample_id'; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/PopulationGenotypeAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/PopulationGenotypeAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,349 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::PopulationGenotypeAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::PopulationGenotypeAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $pga = $reg->get_adaptor("human","variation","populationgenotype"); + $pa = $reg->get_adaptor("human","variation","population"); + + # Get a PopulationGenotype by its internal identifier + $pgtype = $ia->fetch_by_dbID(145); + + print $pgtype->population->name(), " ", + $pgtype->allele1(), ' ', $pgtype->allele2(), ' ', $pgtype->frequency(); + + # Get all population genotypes for an population + $pop = $pa->fetch_by_dbID(1219); + + foreach $pgtype (@{$pga->fetch_all_by_Population($pop)}) { + print $pgtype->variation()->name(), ' ', + $pgtype->frequency(); + $pgtype->allele1(), '/', $pgtype->allele2(), "\n"; + } + + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for PopulationGenotype objects. +PopulationGenotypes may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::PopulationGenotypeAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use Bio::EnsEMBL::Variation::PopulationGenotype; + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor'); + + + + +sub store { + my ($self, $popgt) = @_; + + my $dbh = $self->dbc->db_handle; + + # get genotype code + my $gt_code = $self->_genotype_code($popgt->genotype); + + my $sth = $dbh->prepare_cached(q{ + INSERT DELAYED INTO population_genotype ( + variation_id, + subsnp_id, + genotype_code_id, + frequency, + sample_id, + count + ) VALUES (?,?,?,?,?,?) + }); + + $sth->execute( + $popgt->{_variation_id} || $popgt->variation->dbID, + $popgt->{subsnp}, + $gt_code, + $popgt->frequency, + $popgt->population ? $popgt->population->dbID : undef, + $popgt->count + ); + + $sth->finish; +} + + + +sub store_multiple { + my ($self, $popgts) = @_; + + my $dbh = $self->dbc->db_handle; + + my $q_string = join ",", map {'(?,?,?,?,?,?)'} @$popgts; + + my @args = map { + $_->{_variation_id} || $_->variation->dbID, + $_->{subsnp}, + $self->_genotype_code($_->genotype), + $_->frequency, + $_->population ? $_->population->dbID : undef, + $_->count + } @$popgts; + + my $sth = $dbh->prepare_cached(qq{ + INSERT INTO population_genotype ( + variation_id, + subsnp_id, + genotype_code_id, + frequency, + sample_id, + count + ) VALUES $q_string + }); + + $sth->execute(@args); + + $sth->finish; +} + +sub store_to_file_handle { + my ($self, $popgt, $file_handle) = @_; + + my $dbh = $self->dbc->db_handle; + + print $file_handle join("\t", + $popgt->{_variation_id} || $popgt->variation->dbID || '\N', + $popgt->{subsnp} || '\N', + $self->_genotype_code($popgt->genotype), + defined($popgt->frequency) ? $popgt->frequency : '\N', + $popgt->population ? $popgt->population->dbID : '\N', + defined($popgt->count) ? $popgt->count : '\N', + )."\n"; +} + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + Example : $pgtype = $pgtype_adaptor->fetch_by_dbID(15767); + Description: Retrieves a population genotype via its unique internal + identifier. undef is returned if no such population genotype + exists. + Returntype : Bio::EnsEMBL::Variation::Variation::PopulationGenotype or undef + Exceptions : throw if no dbID argument is provided + Caller : general + Status : At Risk + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + if (! $dbID){ + throw('no dbID argument provided'); + } + return shift @{$self->generic_fetch("pg.population_genotype_id = " . $dbID)}; + +} + + + + +=head2 fetch_all_by_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population + Example : $pop = $pop_adaptor->fetch_by_dbID(1345); + @gtys = $pgty_adaptor->fetch_all_by_Population($pop); + Description: Retrieves all genotypes which are stored for a specified + population. + Returntype : Bio::EnsEMBL::Variation::PopulationGenotype + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Population { + my $self = shift; + my $pop = shift; + + if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population argument expected'); + } + + if(!defined($pop->dbID())) { + warning("Cannot retrieve genotypes for population without set dbID"); + return []; + } + + my $constraint = "pg.sample_id = " . $pop->dbID(); + + # Add the constraint for failed variations + $constraint .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch($constraint); +} + + + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation $variation + Example : my $var = $variation_adaptor->fetch_by_name( "rs1121" ) + $poptypes = $poptype_adaptor->fetch_all_by_Variation( $var ) + Description: Retrieves a list of population genotypes for the given Variation. + If none are available an empty listref is returned. + Returntype : listref Bio::EnsEMBL::Variation::PopulationGenotype + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub fetch_all_by_Variation { + my $self = shift; + my $variation = shift; + + if(!ref($variation) || !$variation->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation argument expected'); + } + + if(!defined($variation->dbID())) { + warning("Cannot retrieve genotypes for variation without set dbID"); + return []; + } + + return $self->generic_fetch("pg.variation_id = " . $variation->dbID()); +} + +=head2 fetch_all + + Description: Retrieves a list of all population genotypes. + Returntype : listref Bio::EnsEMBL::Variation::PopulationGenotype + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub fetch_all { + my $self = shift; + + # Add the constraint for failed variations + my $constraint = $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch($constraint); +} + +sub _tables{return ( + ['population_genotype','pg'], + ['failed_variation','fv'] +)} + +#ÊAdd a left join to the failed_variation table +sub _left_join { return ([ 'failed_variation', 'fv.variation_id = pg.variation_id']); } + +sub _columns{ + return qw(pg.population_genotype_id pg.variation_id pg.subsnp_id pg.sample_id pg.genotype_code_id pg.frequency pg.count) +} + +sub _write_columns { + return qw(variation_id subsnp_id genotype_code_id frequency sample_id count); +} + +sub _objs_from_sth{ + + my $self = shift; + my $sth = shift; + + my ($dbID, $variation_id, $subsnp_id, $sample_id, $gt_code, $freq, $count); + + $sth->bind_columns(\$dbID, \$variation_id, \$subsnp_id, \$sample_id, \$gt_code, \$freq, \$count); + + my (%pop_hash, %gt_code_hash, @results); + + while($sth->fetch) { + + my $pgtype = Bio::EnsEMBL::Variation::PopulationGenotype->new_fast({ + _variation_id => $variation_id, + subsnp => $subsnp_id, + adaptor => $self, + frequency => $freq, + count => $count + }); + + $pop_hash{$sample_id} ||= []; + push @{$pop_hash{$sample_id}}, $pgtype; + + $gt_code_hash{$gt_code} ||= []; + push @{$gt_code_hash{$gt_code}}, $pgtype; + + push @results, $pgtype; + } + + # fetch populations + my $pa = $self->db()->get_PopulationAdaptor(); + my $pops = $pa->fetch_all_by_dbID_list([keys %pop_hash]); + + foreach my $p (@$pops) { + foreach my $pgty (@{$pop_hash{$p->dbID()}}) { + $pgty->{population} = $p; + } + } + + # get all genotypes from codes + my $gtca = $self->db->get_GenotypeCodeAdaptor(); + my $gtcs = $gtca->fetch_all_by_dbID_list([keys %gt_code_hash]); + + foreach my $gtc(@$gtcs) { + foreach my $pgty(@{$gt_code_hash{$gtc->dbID}}) { + $pgty->{genotype} = $gtc->genotype; + } + } + + return \@results; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/ProteinFunctionPredictionMatrixAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/ProteinFunctionPredictionMatrixAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,209 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::ProteinFunctionPredictionMatrixAdaptor + +=head1 DESCRIPTION + +This adaptor lets you store and fetch compressed binary formatted protein +function prediction matrices from the variation databases. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::ProteinFunctionPredictionMatrixAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix; + +use DBI qw(:sql_types); + +use base qw(Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor); + +sub new_fake { + my $class = shift; + my $species = shift; + + my $self = bless {}, $class; + + return $self; +} + +=head2 store + + Arg [1] : ProteinFunctionPredictionMatrix $matrix - the matrix you want to store + Description: Store the given matrix in the database + Status : At risk + +=cut + +sub store { + my ($self, $matrix) = @_; + + # get our analysis attrib ID from the attrib table + + throw("You need to supply a translation MD5 to store a matrix in the database") + unless $matrix->translation_md5; + + my $analysis = $matrix->analysis; + + $analysis .= '_'.$matrix->sub_analysis if defined $matrix->sub_analysis; + + my $analysis_attrib_id = $self->db->get_AttributeAdaptor->attrib_id_for_type_value( + 'prot_func_analysis', + $analysis + ); + + throw("No attrib_id for analysis $analysis?") unless defined $analysis_attrib_id; + + my $dbh = $self->dbc->db_handle; + + # first add the MD5 to the translation_md5 table if necessary + + my $md5_sth = $dbh->prepare(qq{INSERT IGNORE INTO translation_md5 (translation_md5) VALUES (?)}); + + $md5_sth->execute($matrix->translation_md5); + + # then add the matrix + + my $matrix_sth = $dbh->prepare(qq{ + INSERT INTO protein_function_predictions (translation_md5_id, analysis_attrib_id, prediction_matrix) + VALUES ((SELECT translation_md5_id FROM translation_md5 WHERE translation_md5 = ?),?,?) + }); + + $matrix_sth->execute($matrix->translation_md5, $analysis_attrib_id, $matrix->serialize); +} + +=head2 fetch_by_analysis_translation_md5 + + Arg [1] : string $analysis - the name of the prediction tool + Arg [2] : string $translation_md5 - the hex MD5 hash of the translation sequence + Description: Fetch the prediction matrix for the given tool and peptide sequence MD5 + Returntype : Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance or undef + Status : At risk + +=cut + +sub fetch_by_analysis_translation_md5 { + my ($self, $analysis, $translation_md5) = @_; + + my $constraint = "t.translation_md5 = ? AND a.value = ?"; + + $self->bind_param_generic_fetch($translation_md5, SQL_VARCHAR); + $self->bind_param_generic_fetch($analysis, SQL_VARCHAR); + + my ($matrix) = @{ $self->generic_fetch($constraint) }; + + return $matrix; +} + +=head2 fetch_polyphen_predictions_by_translation_md5 + + Arg [1] : string $translation_md5 - the hex MD5 hash of the translation sequence + Arg [2] : string $model - the desired classifier model, either 'humvar' or 'humdiv', + the default is 'humvar' + Description: Fetch the polyphen prediction matrix for the given translation sequence MD5 + and classifier model + Returntype : Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance or undef + Status : At risk + +=cut + +sub fetch_polyphen_predictions_by_translation_md5 { + my ($self, $translation_md5, $model) = @_; + + $model ||= 'humvar'; + + $model = lc($model); + + throw("Unrecognised model for PolyPhen: '$model'") + unless (($model eq 'humvar') || ($model eq 'humdiv')); + + return $self->fetch_by_analysis_translation_md5('polyphen_'.$model, $translation_md5); +} + +=head2 fetch_sift_predictions_by_translation_md5 + + Arg [1] : string $translation_md5 - the hex MD5 hash of the translation sequence + Description: Fetch the sift prediction matrix for the given translation sequence MD5 + Returntype : Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance or undef + Status : At risk + +=cut + +sub fetch_sift_predictions_by_translation_md5 { + my ($self, $translation_md5) = @_; + return $self->fetch_by_analysis_translation_md5('sift', $translation_md5); +} + +sub _columns { + return qw(t.translation_md5 a.value p.prediction_matrix); +} + +sub _tables { + return ( + ['protein_function_predictions', 'p'], + ['translation_md5', 't'], + ['attrib', 'a'] + ); +} + +sub _default_where_clause { + return join ' AND ', ( + 'p.translation_md5_id = t.translation_md5_id', + 'p.analysis_attrib_id = a.attrib_id' + ); +} + +sub _objs_from_sth { + + my ($self, $sth) = @_; + + my $md5; + my $analysis; + my $matrix; + + $sth->bind_columns(\$md5, \$analysis, \$matrix); + + my @matrices; + + while ($sth->fetch) { + if ($matrix) { + my ($super_analysis, $sub_analysis) = split /_/, $analysis; + + push @matrices, Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -translation_md5 => $md5, + -analysis => $super_analysis, + -sub_analysis => $sub_analysis, + -matrix => $matrix, + ); + } + } + + return \@matrices; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/ReadCoverageAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/ReadCoverageAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,388 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::ReadCoverageAdaptor +# +# Copyright (c) 2005 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::ReadCoverageAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $rca = $reg->get_adaptor("human","variation","readcoverage"); + $sa = $reg->get_adaptor("human","core","slice"); + $pa = $reg->get_adaptor("human","variation","population"); + + # get read coverage in a region for a certain population in in a certain level + $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); + $population = $pa->fetch_by_name("UNKNOWN"); + $level = 1; + foreach $rc (@{$vfa->fetch_all_by_Slice_Sample_depth($slice,$population,$level)}) { + print $rc->start(), '-', $rc->end(), "\n"; + } + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for ReadCoverage objects. +Coverage information for reads can be obtained from the database using this +adaptor. See the base class BaseFeatureAdaptor for more information. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::ReadCoverageAdaptor; + +use Bio::EnsEMBL::Variation::ReadCoverage; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); + +=head2 fetch_all_by_Slice_Sample_depth + + Arg[0] : Bio::EnsEMBL::Slice $slice + Arg[1] : (optional) Bio::EnsEMBL::Variation::Sample $sample + Arg[2] : (optional) int $level + Example : my $features = $rca->fetch_all_by_Slice_Sample_depth($slice,$sample,$level); or + my $features = $rca->fetch_all_by_Slice_Sample_depth($slice, $sample); or + my $features = $rca->fetch_all_by_Slice_Sample_depth($slice, $level); or + my $features = $rca->fetch_all_by_Slice_Sample_depth($slice); + Description : Gets all the read coverage features for a given sample(strain or individual) in a certain level + in the provided slice + ReturnType : listref of Bio::EnsEMBL::Variation::ReadCoverage + Exceptions : thrown on bad arguments + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice_Sample_depth{ + my $self = shift; + my $slice = shift; + my @args = @_; #can contain individual and/or level + my $rcs; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + if (defined $args[0]){ #contains, at least, 1 parameter, either a Individual or the level + my $constraint; + my $levels = $self->get_coverage_levels(); + if (defined $args[1]){ #contains both parameters, first the Individual and second the level + if(!ref($args[0]) || !$args[0]->isa('Bio::EnsEMBL::Variation::Sample')) { + throw('Bio::EnsEMBL::Variation::Sample arg expected'); + } + if(!defined($args[0]->dbID())) { + throw("Sample arg must have defined dbID"); + } + if ((grep {$args[1] == $_} @{$levels}) > 0){ + $constraint = "rc.sample_id = " . $args[0]->dbID . " AND rc.level = " . $args[1]; + # $constraint = "rc.sample_id = ? AND rc.level = ?"; + # $self->bind_param_generic_fetch($args[0]->dbID,SQL_INTEGER); + # $self->bind_param_generic_fetch($args[1],SQL_INTEGER); + } + else{ + warning("Level must be a number of: " . join(",", @{$levels})); + return []; + } + } + else{ #there is just 1 argument, can either be the Individual or the level + if (!ref($args[0])){ + #it should just contain the level + if ((grep {$args[0] == $_} @{$levels}) > 0){ + $constraint = "rc.level = " . $args[0]; + #$constraint = "rc.level = ? "; + #$self->bind_param_generic_fetch($args[0],SQL_INTEGER); + } + else{ + warning("Level must be a number of: " . join(",", @{$levels})); + return []; + } + } + else{ + #it should contain the Individual + if (!$args[0]->isa('Bio::EnsEMBL::Variation::Sample')){ + throw('Bio::EnsEMBL::Variation::Sample arg expected'); + } + $constraint = "rc.sample_id = " . $args[0]->dbID; + #$constraint = "rc.sample_id = ?"; + #$self->bind_param_generic_fetch($args[0]->dbID,SQL_INTEGER); + } + } + $rcs = $self->fetch_all_by_Slice_constraint($slice,$constraint); + return $rcs; + } + #call the method fetch_all_by_Slice + $rcs = $self->fetch_all_by_Slice($slice); + return $rcs; +} + +#returns a list of regions that are covered by all the sample given +sub fetch_all_regions_covered{ + my $self = shift; + my $slice = shift; + my $samples = shift; #listref of sample names to get the coverage from + + my $ind_adaptor = $self->db->get_IndividualAdaptor; + my $range_registry = []; + my $max_level = scalar(@{$samples}); + _initialize_range_registry($range_registry,$max_level); + foreach my $sample_name (@{$samples}){ + my $sample = shift@{$ind_adaptor->fetch_all_by_name($sample_name)}; + my $coverage = $self->fetch_all_by_Slice_Sample_depth($slice,$sample,1); #get coverage information + foreach my $cv_feature (@{$coverage}){ + my $range = [$cv_feature->seq_region_start,$cv_feature->seq_region_end]; #store toplevel coordinates + _register_range_level($range_registry,$range,1,$max_level); + } + } + return $range_registry->[$max_level]->get_ranges(1); +} + +=head2 get_coverage_levels + + Args : none + Example : my @coverage_levels = @{$rca->fetch_coverage_levels()}; + Description : Gets the read coverage depths calculated in the database + ReturnType : listref of integer + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_coverage_levels{ + my $self = shift; + my @levels; + my $level_coverage; + my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = 'read_coverage.coverage_level' + }); + $sth->execute(); + $sth->bind_columns(\$level_coverage); + while ($sth->fetch()){ + push @levels, $level_coverage; + } + $sth->finish(); + + return \@levels; +} + +sub _initialize_range_registry{ + my $range_registry = shift; + my $max_level = shift; + + foreach my $level (1..$max_level){ + $range_registry->[$level] = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + } + + return; +} + + +sub _register_range_level{ + my $range_registry = shift; + my $range = shift; + my $level = shift; + my $max_level = shift; + + return if ($level > $max_level); + my $rr = $range_registry->[$level]; + my $pair = $rr->check_and_register(1,$range->[0],$range->[1]); + my $pair_inverted = _invert_pair($range,$pair); + return if (!defined $pair_inverted); + foreach my $inverted_range (@{$pair_inverted}){ + _register_range_level($range_registry,$inverted_range,$level+1, $max_level); + } +} + +#for a given range and the one covered, returns the inverted +sub _invert_pair{ + my $range = shift; #initial range of the array + my $pairs = shift; #listref with the pairs that have been added to the range + + my @inverted_pairs; + my $inverted; + + my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + foreach my $pair (@{$pairs}){ + $rr->check_and_register(1,$pair->[0],$pair->[1]); + } + return $rr->check_and_register(1,$range->[0],$range->[1]); #register again the range +} + +sub _tables{ return (['read_coverage','rc'] + )} + + +sub _columns{ + return qw (rc.seq_region_id rc.seq_region_start rc.seq_region_end rc.level rc.sample_id); +} + + +sub _objs_from_sth{ + my ($self, $sth, $mapper, $dest_slice) = @_; + + my $sa = $self->db()->dnadb()->get_SliceAdaptor(); + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my ($seq_region_id, $seq_region_start, $seq_region_end, $level, $sample_id); + my $seq_region_strand = 1; #assume all reads are in the + strand + + $sth->bind_columns(\$seq_region_id, \$seq_region_start, \$seq_region_end, \$level, \$sample_id); + + my @features; + my %seen_pops; #hash containing the populations seen + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + } + + my $read_coverage; + FEATURE: while ($sth->fetch()){ + + #get the population_adaptor object + my $pa = $self->db()->get_PopulationAdaptor(); + my $ia = $self->db()->get_IndividualAdaptor(); + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + ($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) = + $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs); + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($sr_name)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||= + $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef, + $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= + $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, + $asm_cs_vers); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { + next FEATURE; + } + } + $slice = $dest_slice; + } + + my $sample; + if($sample_id){ + $sample = $seen_pops{$sample_id} ||= $pa->fetch_by_dbID($sample_id); + $sample = $seen_pops{$sample_id} ||= $ia->fetch_by_dbID($sample_id); + } + $read_coverage = Bio::EnsEMBL::Variation::ReadCoverage->new(-start => $seq_region_start, + -end => $seq_region_end, + -slice => $slice, + -adaptor => $self, + -level => $level, + -sample => $sample, + -strand => 1 + ); + push @features, $read_coverage; + } + $sth->finish(); + return \@features; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/ReadCoverageCollectionAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/ReadCoverageCollectionAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,321 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor +# +# Copyright (c) 2005 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $rca = $reg->get_adaptor("human","variation","readcoveragecollection"); + $sa = $reg->get_adaptor("human","core","slice"); + $pa = $reg->get_adaptor("human","variation","population"); + + # get read coverage in a region for a certain population in in a certain level + $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); + $population = $pa->fetch_by_name("UNKNOWN"); + $level = 1; + foreach $rc (@{$vfa->fetch_all_by_Slice_Sample($slice,$population,$level)}) { + print $rc->start(), '-', $rc->end(), "\n"; + } + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for ReadCoverageCollection objects. +Coverage information for reads can be obtained from the database using this +adaptor. See the base class BaseFeatureAdaptor for more information. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor; + +use Bio::EnsEMBL::Variation::ReadCoverageCollection; +use Bio::EnsEMBL::Mapper::RangeRegistry; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); +my %Printable = ( "\\"=>'\\', "\r"=>'r', "\n"=>'n', "\t"=>'t', "\""=>'"' ); + +#my $_pack_type = "n"; + +=head2 fetch_all_by_Slice_SampleId + + Arg[0] : Bio::EnsEMBL::Slice $slice + Arg[1] : (optional) $sample_id + Arg[2] : (optional) int $display_size (default 700) + Arg[3] : (optional) int $display_type (one of "AVERAGE" or "MAX","MIN") (default "AVERAGE") + Arg[4] : (optional) int $window_size + Example : my $reads_coverages = $rcca->fetch_all_by_Slice_SampleId($slice,$sample_id,$display_size,$display_type,$window_size); + Window_size defines which set of pre-averaged scores to use. + Valid values are 50, 500 or 5000. There is no need to define + the window_size because the program will select the most + appropriate window_size to use based on the slice_length and the + display_size. + Description : Gets all the read coverage collections for a given sample(strain or individual) in a given slice + ReturnType : listref of Bio::EnsEMBL::Variation::ReadCoverageCollection + Exceptions : thrown on bad arguments + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice_SampleId{ + my $self = shift; + my ($slice,$sample_id,$display_size,$display_type,$window_size) = @_; + + my $rcc = []; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } +# if (defined $sample) { +# if (!ref($sample) || !$sample->isa('Bio::EnsEMBL::Variation::Sample')) { +# throw('Bio::EnsEMBL::Variation::Sample arg expected'); +# } +# if (! defined($sample->dbID())) { +# throw("Sample arg must have defined dbID"); +# } +# } + + #default display_size is 700 + if (!defined $display_size) { + $display_size = 700; + } + + #default display_mode is AVERAGE + if (!defined $display_type) { + $display_type = "AVERAGE"; + } + + #set up bucket object for storing bucket_size number of scores + my $bucket_size = ($slice->length)/$display_size; + + #default window size is the largest bucket that gives at least + #display_size values ie get speed but reasonable resolution + my @window_sizes = (50,500,5000); + #my $num_windows = 100; + + #check if valid window_size + my $found = 0; + if (defined $window_size) { + foreach my $win_size (@window_sizes) { + if ($win_size == $window_size) { + $found = 1; + last; + } + } + if (!$found) { + warning("Invalid window_size $window_size"); + return $rcc; + } + } + + if (!defined $window_size) { + #set window_size to be the largest for when for loop fails + $window_size = $window_sizes[scalar(@window_sizes)-1]; + for (my $i = 1; $i < scalar(@window_sizes); $i++) { + if ($bucket_size < $window_sizes[$i]) { + $window_size = $window_sizes[$i-1]; + last; + } + } + } + + #print "window_size is $window_size\n"; + + $rcc = $self->_fetch_all_by_Slice_WindowSize_SampleId($slice,$window_size,$sample_id) if defined $sample_id; + $rcc = $self->_fetch_all_by_Slice_WindowSize_SampleId($slice,$window_size) if !$sample_id; + + + if (scalar(@$rcc ==0)) { + return $rcc; + } + + #Find the min and max scores for y axis scaling. Save in first + #read_coverage_collection object + my ($min_y_axis, $max_y_axis) = _find_min_max_score($rcc); + + #add min and max scores to the first read_coverage_collection object + if ((scalar @$rcc) > 0) { + $rcc->[0]->y_axis_min($min_y_axis); + $rcc->[0]->y_axis_max($max_y_axis); + } + return ($rcc); + +} + +sub _fetch_all_by_Slice_WindowSize_SampleId { + + my ($self,$slice,$window_size,$sample_id) = @_; + + my $rcc = []; + my $rcs = []; + my $reads_string_avg=[]; + my $reads_string_min=[]; + my $reads_string_max=[]; + + my ($window_start,$window_end); + + my $last_sql = (defined $sample_id) ? "AND sample_id = ?" : ''; + my $sql = qq{SELECT window_size,window_start,window_end,read_coverage_string_avg,read_coverage_string_min,read_coverage_string_max,sample_id + FROM read_coverage_collection + WHERE seq_region_id = ? + AND window_size = ? + AND window_end > ? + AND window_start < ? + $last_sql + }; + + my $sth = $self->prepare($sql); + $sth->execute($slice->get_seq_region_id,$window_size,$slice->start,$slice->end,$sample_id) if defined $sample_id; + $sth->execute($slice->get_seq_region_id,$window_size,$slice->start,$slice->end) if !defined $sample_id; + #these bind_param are not working here + #$sth->bind_param(1,$slice->get_seq_region_id,SQL_INTEGER); + #$sth->bind_param(2,$window_size,SQL_INTEGER); + #$sth->bind_param(3,$slice->start,SQL_INTEGER); + #$sth->bind_param(4,$slice->end,SQL_INTEGER); + + while (my @arrays = $sth->fetchrow_array()) { + $window_size = $arrays[0]; + $window_start = $arrays[1]; + $window_end = $arrays[2]; + $reads_string_avg = _unpack_strings($arrays[3]); + $reads_string_min = _unpack_strings($arrays[4]); + $reads_string_max = _unpack_strings($arrays[5]); + $sample_id = $arrays[6] if $arrays[6]; + #need to find the start_offset for slice_start and end_offset for slice_end + #in the read coverage collection row + + #print "window_size is $window_size,window_start is $window_start,window_end is $window_end,avg is @$reads_string_avg,min is @$reads_string_min,max is @$reads_string_max\n"; + for (my $j = 0; $j < @$reads_string_avg; $j++) { + my $rc = {}; + my $seq_region_start = $window_start + $window_size * $j ; + my $seq_region_end = $seq_region_start + $window_size -1; + next if ($seq_region_start > $slice->end or $seq_region_end < $slice->start); + #covert seq_region_start and seq_region_end tp a start and end relative to the slice + my ($start,$end); + if ($slice->strand == -1) { + $start = $slice->end - $seq_region_end + 1; + $end = $slice->end - $seq_region_start + 1; + } + else { + $start = $seq_region_start - $slice->start + 1; + $end = $seq_region_end - $slice->start +1; + } + $rc->{'start'} = $start; + $rc->{'end'} = $end; + $rc->{'seq_region_start'} = $seq_region_start; + $rc->{'seq_region_end'} = $seq_region_end; + $rc->{'strand'} = $slice->strand; + $rc->{'read_coverage_avg'} = $reads_string_avg->[$j]; + $rc->{'read_coverage_min'} = $reads_string_min->[$j]; + $rc->{'read_coverage_max'} = $reads_string_max->[$j]; + push @{$rcs}, $rc; + } + }#end while loop + + foreach my $rc (@$rcs) { + push @$rcc,$self-> _create_feature_fast('Bio::EnsEMBL::Variation::ReadCoverageCollection', + {'adaptor' => $self, + 'slice' => $slice, + 'start' => $rc->{'start'}, + 'end' => $rc->{'end'}, + 'strand' => $rc->{'strand'}, + 'window_size' => $window_size, + 'window_start' => $window_start, + 'window_end' => $window_end, + 'seq_region_start' => $rc->{'seq_region_start'}, + 'seq_region_end' => $rc->{'seq_region_end'}, + 'seq_region_strand' => 1, + 'sample_id' => (defined $sample_id) ? $sample_id : '', + 'read_coverage_avg' => $rc->{'read_coverage_avg'}, + 'read_coverage_min' => $rc->{'read_coverage_min'}, + 'read_coverage_max' => $rc->{'read_coverage_max'}, + }); + } + + #sort into numerical order based on position + my @sorted_features = sort {$a->{start} <=> $b->{start}} @$rcc; + return\@sorted_features; +} + +sub _unpack_strings { + + my $string = shift; + + my $pack_type = "n" x (length($string)/2); + my @arrays = unpack($pack_type,$string); + return \@arrays; +} + +sub escape ($) { + local $_ = ( defined $_[0] ? $_[0] : '' ); + s/([\r\n\t\\\"])/\\$Printable{$1}/sg; + return $_; +} + + +sub _find_min_max_score { + + my ($scores) = @_; + my $min; + my $max; + + foreach my $score (@$scores) { + #find min and max of diff scores + if (defined $score->read_coverage_min) { + #if min hasn't been defined yet, then define min and max + unless (defined $min and defined $max) { + $min = $score->read_coverage_min; + $max = $score->read_coverage_max; + } + if ($min > $score->read_coverage_min) { + $min = $score->read_coverage_min; + } + if ($max < $score->read_coverage_max) { + $max = $score->read_coverage_max; + } + } + } + + return ($min, $max); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/SampleAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/SampleAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,325 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor +# +# Copyright (c) 2005 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor + +=head1 SYNOPSIS + +Abstract class - should not be instantiated. Implementation of +abstract methods must be performed by subclasses. + +Base adaptors provides: + +#using the adaptor of the subclass, given the id of the sample, returns all synonyms in database +$synonyms = $sample_adaptor->fetch_synonyms($sample_id); + +#using the adaptor of the subclass and given the name of the synonym, returns the sample +$sample = $sample_adaptor->fetch_sample_by_synonym($sample_synonym_id); + +=head1 DESCRIPTION + +This is a base class implementing common methods in population, individual and strain. This base +class is simply a way of merging similar concepts that should have the same ID + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::SampleAdaptor; +use vars qw(@ISA @EXPORT); +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +=head2 fetch_synonyms + + Arg [1] : $sample_id + Arg [2] (optional) : $source + Example : my $dbSNP_synonyms = $pop_adaptor->fetch_synonyms($sample_id,$dbSNP); + my $all_synonyms = $pop_adaptor->fetch_synonyms($sample_id); + Description: Retrieves synonyms for the source provided. Otherwise, return all the synonyms for the sample + Returntype : list of strings + Exceptions : none + Caller : Bio:EnsEMBL:Variation::Sample + Status : At Risk + +=cut + +sub fetch_synonyms{ + my $self = shift; + my $dbID = shift; + my $source = shift; + my $sample_synonym; + my $synonyms; + + my $sql; + if (defined $source){ + $sql = qq{SELECT ss.name FROM sample_synonym ss, source s WHERE ss.sample_id = ? AND ss.source_id = s.source_id AND s.name = "$source"} + } + else{ + $sql = qq{SELECT name FROM sample_synonym WHERE sample_id = ?}; + } + my $sth = $self->prepare($sql); + $sth->bind_param(1,$dbID,SQL_INTEGER); + $sth->execute(); + $sth->bind_columns(\$sample_synonym); + while ($sth->fetch){ + push @{$synonyms},$sample_synonym; + } + return $synonyms; +} + +=head2 fetch_sample_by_synonym + + Arg [1] : $sample_synonym + Example : my $pop = $pop_adaptor->fetch_sample_by_synonym($sample_synonym,$source); + Description : Retrieves sample for the synonym given in the source. If no source is provided, retrieves all the synonyms + Returntype : list of Bio::EnsEMBL::Variation::Sample + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_sample_by_synonym{ + my $self = shift; + my $synonym_name = shift; + my $source = shift; + my $sql; + my $sample; + my $sample_array; + if (defined $source){ + $sql = qq{SELECT sample_id FROM sample_synonym ss, source s WHERE ss.name = ? and ss.source_id = s.source_id AND s.name = "$source"}; + } + else{ + $sql = qq{SELECT sample_id FROM sample_synonym WHERE name = ?}; + } + my $sample_id; + my $sth = $self->prepare($sql); + $sth->bind_param(1,$synonym_name,SQL_VARCHAR); + $sth->execute(); + $sth->bind_columns(\$sample_id); + while ($sth->fetch()){ + push @{$sample_array}, $sample_id; + } + return $sample_array; + +} + +=head2 fetch_by_dbID + + Arg [1] : int $id + The unique sample identifier for the sample to be obtained + Example : $population = $population_adaptor->fetch_by_dbID(1234); + Description: Returns the feature sample from the database defined by the + the id $id. + Returntype : Bio::EnsEMBL::Variation::Sample + Exceptions : thrown if $id arg is not provided + does not exist + Caller : general + Status : At Risk + +=cut + +sub fetch_by_dbID{ + my ($self,$id) = @_; + + throw("id argument is required") if(!defined $id); + + #construct a constraint like 't1.table1_id = 123' + my @tabs = $self->_tables; + my ($name, $syn) = @{$tabs[0]}; + #the constraint must contain the sample_id, that it is used either for individuals or populations + my $constraint = "${syn}.sample_id = ?"; + $self->bind_param_generic_fetch($id,SQL_INTEGER); + + #Should only be one + my ($feat) = @{$self->generic_fetch($constraint)}; + + return undef if(!$feat); + + return $feat; +} + + +=head2 fetch_all_by_dbID_list + + Arg [1] : listref of ints $id_list + The unique database identifiers for the samples to be obtained + Example : @individuals = @{$individual_adaptor->fetch_by_dbID_list([1234, 2131, 982]))}; + Description: Returns the samples created from the database defined by the + the ids in contained in the id list $id_list. If none of the + samples are found in the database a reference to an empty + list is returned. + Returntype : listref of Bio::EnsEMBL::Variation::Sample + Exceptions : thrown if $id arg is not provided + does not exist + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_dbID_list { + my ($self,$id_list_ref) = @_; + + if(!defined($id_list_ref) || ref($id_list_ref) ne 'ARRAY') { + throw("id_list list reference argument is required"); + } + + return [] if(!@$id_list_ref); + + my @out; + #construct a constraint like 't1.table1_id = 123' + my @tabs = $self->_tables; + my ($name, $syn) = @{$tabs[0]}; + + # mysql is faster and we ensure that we do not exceed the max query size by + # splitting large queries into smaller queries of 200 ids + my $max_size = 200; + my @id_list = @$id_list_ref; + + while(@id_list) { + my @ids; + if(@id_list > $max_size) { + @ids = splice(@id_list, 0, $max_size); + } else { + @ids = splice(@id_list, 0); + } + + my $id_str; + if(@ids > 1) { + $id_str = " IN (" . join(',', @ids). ")"; + } else { + $id_str = " = ?"; + $self->bind_param_generic_fetch($ids[0],SQL_INTEGER); + } + + my $constraint = "${syn}.sample_id $id_str"; + + push @out, @{$self->generic_fetch($constraint)}; + } + + return \@out; +} + + +sub _get_individual_population_hash { + my $self = shift; + my $id_list_ref = shift; + + if(!defined($id_list_ref) || ref($id_list_ref) ne 'ARRAY') { + throw("id_list list reference argument is required"); + } + + return [] if (!@$id_list_ref); + + my %ip_hash; + my $max_size = 200; + my @id_list = @$id_list_ref; + + while(@id_list) { + my @ids; + if(@id_list > $max_size) { + @ids = splice(@id_list, 0, $max_size); + } else { + @ids = splice(@id_list, 0); + } + + my $id_str; + if(@ids > 1) { + $id_str = " IN (" . join(',', @ids). ")"; + } else { + $id_str = " = ".$ids[0]; + } + + my $sth = $self->prepare(qq/ + SELECT individual_sample_id, population_sample_id + FROM individual_population + WHERE individual_sample_id $id_str + /); + + $sth->execute(); + + my ($ind, $pop); + $sth->bind_columns(\$ind, \$pop); + $ip_hash{$pop}{$ind} = 1 while $sth->fetch; + $sth->finish(); + } + + # NB COMMENTED OUT FOR NOW AS IT DOESN'T SEEM TO WORK PROPERLY + # now get super-populations + #my @pops = keys %ip_hash; + #my %new_pops; + # + ## need to iterate in case there's multiple levels + #while(scalar @pops) { + # + # my $id_str; + # if(scalar @pops) { + # $id_str = " IN (" . join(',', @pops). ")"; + # } else { + # $id_str = " = ".$pops[0]; + # } + # + # @pops = (); + # + # my $sth = $self->prepare(qq{ + # SELECT sub_population_sample_id, super_population_sample_id + # FROM population_structure + # WHERE sub_population_sample_id $id_str + # }); + # $sth->execute(); + # + # my ($sub, $super); + # $sth->bind_columns(\$sub, \$super); + # while($sth->fetch) { + # push @{$new_pops{$sub}}, $super; + # push @pops, $super; + # } + # $sth->finish(); + #} + # + #foreach my $sub(keys %new_pops) { + # foreach my $super(@{$new_pops{$sub}}) { + # $ip_hash{$super}{$_} = 1 for keys %{$ip_hash{$sub}}; + # } + #} + # + return \%ip_hash; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StructuralVariationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StructuralVariationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,393 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $sva = $reg->get_adaptor("human","variation","structuralvariation"); + $sta = $reg->get_adaptor("human","variation","study"); + + # Get a StructuralVariation by its internal identifier + $sv = $sva->fetch_by_dbID(145); + + # Get a StructuralVariation by its name + $sv = $sva->fetch_by_name('esv1285'); + + # Get all StructuralVariation by a study + $study = $sta->fetch_by_name('estd1'); + foreach my $sv (@{$sva->fetch_all_by_Study($study)}){ + print $sv->variation_name,"\n"; + } + + # Modify the include_failed_variations flag in DBAdaptor to also return structural variations that have been flagged as failed + $va->db->include_failed_variations(1); + +=head1 DESCRIPTION + +This adaptor provides database connectivity for StructuralVariation objects. +Genomic locations of structural variations can be obtained from the database using this +adaptor. See the base class BaseFeatureAdaptor for more information. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAdaptor; + +use Bio::EnsEMBL::Variation::StructuralVariation; +use Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use DBI qw(:sql_types); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor'); + +my $DEFAULT_ITERATOR_CACHE_SIZE = 10000; + +sub _default_where_clause { + my $self = shift; + return $self->SUPER::_default_where_clause().' AND is_evidence=0'; +} + +sub _objs_from_sth { + my ($self, $sth) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + my @svs; + + my ($struct_variation_id, $variation_name, $validation_status, $source_name, $source_version, + $source_description, $class_attrib_id, $study_id, $is_evidence, $is_somatic); + + $sth->bind_columns(\$struct_variation_id, \$variation_name, \$validation_status, \$source_name, + \$source_version, \$source_description, \$class_attrib_id, \$study_id, \$is_evidence, + \$is_somatic); + + my $aa = $self->db->get_AttributeAdaptor; + my $sta = $self->db->get_StudyAdaptor(); + + while($sth->fetch()) { + + my $study; + $study = $sta->fetch_by_dbID($study_id) if (defined($study_id)); + + # Get the validation status + $validation_status ||= 0; + my @states = split(/,/,$validation_status); + + push @svs, Bio::EnsEMBL::Variation::StructuralVariation->new( + -dbID => $struct_variation_id, + -VARIATION_NAME => $variation_name, + -VALIDATION_STATES => \@states, + -ADAPTOR => $self, + -SOURCE => $source_name, + -SOURCE_VERSION => $source_version, + -SOURCE_DESCRIPTION => $source_description, + -CLASS_SO_TERM => $aa->attrib_value_for_id($class_attrib_id), + -STUDY => $study, + -IS_EVIDENCE => $is_evidence || 0, + -IS_SOMATIC => $is_somatic || 0 + ); + } + return \@svs; +} + + +=head2 fetch_all_by_supporting_evidence + + Arg [1] : Bio::EnsEMBL::Variation::SupportingStructuralVariation or + Bio::EnsEMBL::Variation::StructuralVariation $se + Example : my $se = $ssv_adaptor->fetch_by_name('essv2585133'); + foreach my $sv (@{$sv_adaptor->fetch_all_by_supporting_evidence($se)}){ + print $sv->variation_name,"\n"; + } + Description : Retrieves all structural variations from a specified supporting evidence + ReturnType : reference to list of Bio::EnsEMBL::Variation::StructuralVariation objects + Exceptions : throw if incorrect argument is passed + warning if provided supporting evidence does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_supporting_evidence { + my $self = shift; + my $se = shift; + + if(!ref($se) || (!$se->isa('Bio::EnsEMBL::Variation::SupportingStructuralVariation') && + !$se->isa('Bio::EnsEMBL::Variation::StructuralVariation')) + ) { + throw("Bio::EnsEMBL::Variation::SupportingStructuralVariation or Bio::EnsEMBL::Variation::StructuralVariation arg expected"); + } + + if(!$se->dbID()) { + warning("The supporting evidence does not have dbID, cannot retrieve supporting evidence"); + return []; + } + + my $cols = join ",", $self->_columns(); + + my $tables; + foreach my $t ($self->_tables()) { + next if ($t->[0] eq 'failed_structural_variation' and !$self->db->include_failed_variations()); + $tables .= ',' if ($tables); + $tables .= join(' ',@$t); + # Adds a left join to the failed_structural_variation table + if ($t->[0] eq 'structural_variation' and !$self->db->include_failed_variations()) { + $tables .= qq{ LEFT JOIN failed_structural_variation fsv + ON (fsv.structural_variation_id=sv.structural_variation_id)}; + } + } + + my $constraint = $self->_default_where_clause(); + + # Add the constraint for failed structural variant + $constraint .= " AND " . $self->db->_exclude_failed_structural_variations_constraint(); + + my $sth = $self->prepare(qq{ + SELECT $cols + FROM $tables, structural_variation_association sa + WHERE $constraint + AND sa.structural_variation_id=sv.structural_variation_id + AND sa.supporting_structural_variation_id = ?}); + $sth->bind_param(1,$se->dbID,SQL_INTEGER); + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + + +sub _generic_fetch_by_VariationSet { + my $self = shift; + my $want_iterator = shift; + my $set = shift; + + assert_ref($set,'Bio::EnsEMBL::Variation::VariationSet'); + + if(!defined($set->dbID())) { + warning("Cannot retrieve structural variations for variation set without a dbID"); + return []; + } + + # Get the unique dbIDs for all variations in this set and all of its subsets + my $dbid_list = $self->fetch_all_dbIDs_by_VariationSet($set); + + my $num_vars = @$dbid_list; + + if ($num_vars > 100_000 && !$want_iterator) { + warn "This set contains a large number ($num_vars) of structural variations, these may not fit". + "into memory at once, considering using fetch_Iterator_by_VariationSet instead"; + } + + # Use the dbIDs to get all variations and return them + return $want_iterator ? + $self->fetch_Iterator_by_dbID_list($dbid_list) : + $self->fetch_all_by_dbID_list($dbid_list); +} + + +=head2 fetch_all_dbIDs_by_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet + Example : @sv_ids = @{$sva_adaptor->fetch_all_dbIDs_by_VariationSet($vs)}; + Description: Gets an array of internal ids of all structural variations which are present + in a specified variation set and its subsets. + Returntype : listref of integers + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_dbIDs_by_VariationSet { + my $self = shift; + my $set = shift; + + # First, get ids for all subsets, + + my @var_set_ids = ($set->dbID); + + foreach my $var_set (@{$set->adaptor->fetch_all_by_super_VariationSet($set)}) { + push @var_set_ids, $var_set->dbID; + } + + my $set_str = "(" . join(",",@var_set_ids) .")"; + + # Add the constraint for failed structural variations + my $constraint = $self->_internal_exclude_failed_constraint; + + # Then get the dbIDs for all these sets + my $stmt = qq{ + SELECT DISTINCT + vssv.structural_variation_id + FROM + variation_set_structural_variation vssv LEFT JOIN + failed_structural_variation fsv ON ( + fsv.structural_variation_id = vssv.structural_variation_id + ) + WHERE + vssv.variation_set_id in $set_str + $constraint + }; + + my $sth = $self->prepare($stmt); + + $sth->execute(); + + my @result; + my $dbID; + + $sth->bind_columns(\$dbID); + + while ($sth->fetch()) { + push @result, $dbID; + } + + return \@result; +} + + +=head2 fetch_all_by_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet + Example : @svs = @{$sva_adaptor->fetch_all_by_VariationSet($vs)}; + Description: Retrieves all structural variations which are present in a specified + variation set and its subsets. + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariation + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_VariationSet { + my $self = shift; + return $self->_generic_fetch_by_VariationSet(0, @_); +} + + +=head2 fetch_Iterator_by_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet + Example : $sv_iterator = $sva_adaptor->fetch_Iterator_by_VariationSet($vs); + Description: Retrieves an iterator for all structural variations which are present + in a specified variation set and its subsets. + Returntype : Bio::EnsEMBL::Utils::Iterator object + Exceptions : throw on incorrect argument + Caller : general + Status : Experimental + +=cut + +sub fetch_Iterator_by_VariationSet { + my $self = shift; + my $set = shift; + my $cache_size = shift || $DEFAULT_ITERATOR_CACHE_SIZE; + + # First, get ids for all subsets, + my @var_set_ids = ($set->dbID); + map {push(@var_set_ids,$_->dbID())} @{$set->adaptor->fetch_all_by_super_VariationSet($set)}; + my $var_set_id = join(",",@var_set_ids); + + # Prepare a query for getting the span of variation_ids + my $stmt = qq{ + FROM + variation_set_structural_variation vssv LEFT JOIN + failed_structural_variation fsv ON ( + fsv.structural_variation_id = vssv.structural_variation_id + ) + WHERE + vssv.variation_set_id IN ($var_set_id) + }; + + # Add the constraint for failed structural variations + my $constraint = $self->_internal_exclude_failed_constraint; + + my $sth = $self->prepare(qq{SELECT MIN(vssv.structural_variation_id), MAX(vssv.structural_variation_id) + $stmt $constraint}); + $sth->execute(); + my ($min_sv_id,$max_sv_id); + $sth->bind_columns(\$min_sv_id,\$max_sv_id); + $sth->fetch(); + $max_sv_id ||= 0; + $min_sv_id ||= 1; + + # Prepare a statement for getting the ids in a range + $sth = $self->prepare(qq{SELECT vssv.structural_variation_id $stmt + AND vssv.structural_variation_id BETWEEN ? AND ? $constraint}); + + # Internally, we keep an Iterator that works on the dbID span we're at + my $iterator; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + + # If the iterator is empty, get a new chunk of dbIDs, unless we've fetched all dbIDs + unless (defined($iterator) && $iterator->has_next() && $min_sv_id <= $max_sv_id) { + + # Get the next chunk of dbIDs + $sth->execute($min_sv_id,$min_sv_id+$cache_size); + $min_sv_id += ($cache_size + 1); + + # Use a hash to keep track of the seen dbIDs + my %seen; + + # Loop over the dbIDs and avoid duplicates + my $dbID; + my @dbIDs; + $sth->bind_columns(\$dbID); + while ($sth->fetch()) { + push (@dbIDs,$dbID) unless ($seen{$dbID}++); + } + + # Get a new Iterator based on the new dbID span + $iterator = $self->fetch_Iterator_by_dbID_list(\@dbIDs); + + } + + return $iterator->next(); + }); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StructuralVariationAnnotationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StructuralVariationAnnotationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,334 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAnnotationAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAnnotationAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $svaa = $reg->get_adaptor("human","variation","structuralvariationannotation"); + $ssva = $reg->get_adaptor("human","variation","supportingstructuralvariation"); + + my $ssv = $ssva->fetch_by_name('nssv706165'); + my $sva = $svaa->fetch_all_by_StructuralVariation($ssv); + + +=head1 DESCRIPTION + +This adaptor provides database connectivity between StructuralVariation/SupportingStructuralVariation +and StructuralVariationAnnotation objects. +By default, the 'fetch_all_by_...'-methods will not return structural variants +that have been flagged as failed in the Ensembl QC. This behaviour can be modified +by setting the include_failed_variations flag in Bio::EnsEMBL::Variation::DBSQL::DBAdaptor. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAnnotationAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Variation::BaseStructuralVariation; +use Bio::EnsEMBL::Variation::StructuralVariationAnnotation; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use base qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +=head2 fetch_all_by_StructuralVariation + + Arg [1] : Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation $svar + Example : my @vas = @{$vaa->fetch_all_by_StructuralVariation($svar)}; + Description: Retrieves all variation annotations for a given variation. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationAnnotation + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_StructuralVariation { + my $self = shift; + my $svar = shift; + + if(!ref($svar) || (!$svar->isa('Bio::EnsEMBL::Variation::StructuralVariation') && + !$svar->isa('Bio::EnsEMBL::Variation::SupportingStructuralVariation')) + ) { + throw('Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation arg expected'); + } + + if(!defined($svar->dbID())) { + throw("StructuralVariation arg must have defined dbID"); + } + + my $constraint = $self->_internal_exclude_failed_constraint("sva.structural_variation_id = ".$svar->dbID()); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_StructuralVariation_list + + Arg [1] : reference to a list of Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Example : my @svas = @{$svaa->fetch_all_by_StructuralVariation_list($svars)}; + Description: Retrieves all variation annotations for a given list of structural variants + Returntype : reference to a list of Bio::EnsEMBL::Variation::StructuralVariationAnnotation objects + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_StructuralVariation_list { + my $self = shift; + my $svars = shift; + + if(!ref($svars) || (!$svars->[0]->isa('Bio::EnsEMBL::Variation::StructuralVariation') && + !$svars->[0]->isa('Bio::EnsEMBL::Variation::SupportingStructuralVariation')) + ) { + throw('Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation arg expected'); + } + + if(!defined($svars->[0]->dbID())) { + throw("StructuralVariation arg must have defined dbID"); + } + + my $in_str = join ',', map {$_->dbID()} @$svars; + + my $constraint = $self->_internal_exclude_failed_constraint("sva.structural_variation_id in (".$in_str.")"); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_StructuralVariationFeature_list + + Arg [1] : reference to a list of Bio::EnsEMBL::Variation::StructuralVariationFeature objects + Example : my @svas = @{$svaa->fetch_all_by_StructuralVariationFeature_list($svfs)}; + Description: Retrieves all variation annotations for a given list of structural variation features + Returntype : reference to a list Bio::EnsEMBL::Variation::StructuralVariationAnnotation objects + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_StructuralVariationFeature_list { + my $self = shift; + my $svfs = shift; + + if(!ref($svfs) || !$svfs->[0]->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + throw('Listref of Bio::EnsEMBL::Variation::StructuralVariationFeature arg expected'); + } + + if(!defined($svfs->[0]->dbID())) { + throw("VariationFeatures in list must have defined dbIDs"); + } + + my $in_str = join ',', map {$_->{'structural_variation_id'}} @$svfs; + + my $constraint = $self->_internal_exclude_failed_constraint("sva.structural_variation_id in (".$in_str.")"); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_Study + + Arg [1] : Bio::EnsEMBL:Variation::Study $study + Example : my @studies = @{$studya->fetch_all_by_Study($study)}; + Description: Retrieves all structural variation annotations for a given study. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationAnnotation + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Study { + my $self = shift; + my $study = shift; + + if(!ref($study) || !$study->isa('Bio::EnsEMBL::Variation::Study')) { + throw('Bio::EnsEMBL::Variation::Study arg expected'); + } + + if(!defined($study->dbID())) { + throw("Study arg must have defined dbID"); + } + + my $constraint = $self->_internal_exclude_failed_constraint('sv.study_id = '.$study->dbID()); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_sample_name + + Arg [1] : string $sample_name + Example : my $sva = @{$svaa->fetch_all_by_sample_name($sample_name)}; + Description: Retrieves all structural variation annotations for a given sample. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationAnnotation + Exceptions : throw if sample_name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_sample_name { + my $self = shift; + my $sample_name = shift; + + throw('sample_name argument expected') if(!defined($sample_name)); + + my $constraint = $self->_internal_exclude_failed_constraint("s1.name = '$sample_name'"); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_all_by_strain_name + + Arg [1] : string $strain_name + Example : my $sva = @{$svaa->fetch_all_by_strain_name($strain_name)}; + Description: Retrieves all structural variation annotations for a given strain. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationAnnotation + Exceptions : throw if strain_name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_strain_name { + my $self = shift; + my $strain_name = shift; + + throw('strain_name argument expected') if(!defined($strain_name)); + + my $constraint = $self->_internal_exclude_failed_constraint("s2.name = '$strain_name'"); + + return $self->generic_fetch($constraint); +} + + +# method used by superclass to construct SQL +sub _tables { + my $self = shift; + my @tables = ([ 'structural_variation_annotation', 'sva'], + [ 'study', 'st'], + [ 'phenotype', 'p'], + [ 'sample s1', ''], + [ 'sample s2', ''], + [ 'structural_variation', 'sv'] + ); + + # If we are excluding failed_structural_variations, add that table + push(@tables,['failed_structural_variation', 'fsv']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +# Add a left join to the failed_variation table +sub _left_join { + my $self = shift; + my @tables = ([ 'phenotype', 'p.phenotype_id = sva.phenotype_id'], + [ 'sample s1', 's1.sample_id = sva.sample_id'], + [ 'sample s2', 's2.sample_id = sva.strain_id'] + ); + + # If we are excluding failed_structural_variations, add that table + push(@tables,['failed_structural_variation', 'fsv.structural_variation_id=sv.structural_variation_id']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +sub _default_where_clause { + my $self = shift; + + return 'sva.structural_variation_id = sv.structural_variation_id AND sv.study_id=st.study_id'; +} + +sub _columns { + return qw( sva.structural_variation_annotation_id sva.structural_variation_id p.phenotype_id p.description + sv.study_id sva.clinical_attrib_id s1.name s2.name + ); +} + + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my @svas; + my ($structural_variation_annotation_id,$svar_id,$phenotype_id,$phenotype_description, + $study_id,$clinical_attrib_id,$sample_name,$strain_name,$study); + $sth->bind_columns(\$structural_variation_annotation_id,\$svar_id,\$phenotype_id,\$phenotype_description, + \$study_id,\$clinical_attrib_id,\$sample_name,\$strain_name); + + my $aa = $self->db()->get_AttributeAdaptor(); + my $sta = $self->db()->get_StudyAdaptor(); + + while($sth->fetch()) { + + $study = $sta->fetch_by_dbID($study_id); + + push @svas, Bio::EnsEMBL::Variation::StructuralVariationAnnotation->new( + -dbID => $structural_variation_annotation_id, + -_STRUCTURAL_VARIATION_ID => $svar_id, + -_PHENOTYPE_ID => $phenotype_id, + -PHENOTYPE_DESCRIPTION => $phenotype_description, + -SAMPLE_NAME => $sample_name, + -STRAIN_NAME => $strain_name, + -CLINICAL_SIGNIFICANCE => $aa->attrib_value_for_id($clinical_attrib_id), + -ADAPTOR => $self, + -STUDY => $study, + ); + } + + return \@svas; +} + + +# Exclude the constraint for failed structural variant +sub _internal_exclude_failed_constraint { + my $self = shift; + my $constraint = shift; + $constraint .= " AND " . $self->db->_exclude_failed_structural_variations_constraint(); + + return $constraint; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StructuralVariationFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StructuralVariationFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,661 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $sa = $reg->get_adaptor("human","core","slice"); + $svfa = $reg->get_adaptor("human","variation","structuralvariationfeature"); + $sva = $reg->get_adaptor("human","variation","structuralvariation"); + + # Get a StructuralVariationFeature by its internal identifier + $svf = $svfa->fetch_by_dbID(145); + + # get all StructuralVariationFeatures in a region + $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); + foreach $svf (@{$svfa->fetch_all_by_Slice($slice)}) { + print $svf->start(), '-', $svf->end(), ' ', $svf->allele_string(), "\n"; + } + + # fetch all genome hits for a particular structural variation + $sv = $sva->fetch_by_name('esv1285'); + + foreach $svf (@{$svfa->fetch_all_by_StructuralVariation($sv)}) { + print $svf->seq_region_name(), $svf->seq_region_start(), '-', + $svf->seq_region_end(),"\n"; + } + +=head1 DESCRIPTION + +This adaptor provides database connectivity for StructuralVariationFeature objects. +Genomic locations of structural variations can be obtained from the database using this +adaptor. See the base class BaseFeatureAdaptor for more information. +By default, the 'fetch_all_by_...'-methods will not return structural variants +that have been flagged as failed in the Ensembl QC. This behaviour can be modified +by setting the include_failed_variations flag in Bio::EnsEMBL::Variation::DBSQL::DBAdaptor. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor; + +use Bio::EnsEMBL::Variation::StructuralVariationFeature; +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Iterator; + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor', 'Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice + $slice the slice from which to obtain features + Arg [2] : int $include_evidence [optional] + Example : my $svfs = $svfa->fetch_all_by_Slice($slice); + Description: Retrieves all germline structural variation features on the given Slice. + If $include_evidence is set (i.e. $include_evidence=1), structural variation features from + both structural variation (SV) and their supporting structural variations (SSV) will be + returned. By default, it only returns features from structural variations (SV). + Returntype : reference to list Bio::EnsEMBL::StructuralVariationFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice, $include_evidence) = @_; + + my $constraint = $self->_internal_exclude_failed_constraint('',1); + $constraint .= ' AND ' if ($constraint); + $constraint .= ' somatic=0 '; + $constraint .= ' AND is_evidence=0 ' if (!$include_evidence); + + return $self->fetch_all_by_Slice_constraint($slice, $constraint); +} + + +=head2 fetch_all_somatic_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice + $slice the slice from which to obtain features + Arg [2] : int $include_evidence [optional] + Example : my $svfs = $svfa->fetch_all_somatic_by_Slice($slice); + Description: Retrieves all somatic structural variation features on the given Slice. + If $include_evidence is set (i.e. $include_evidence=1), structural variation features from + both structural variation (SV) and their supporting structural variations (SSV) will be + returned. By default, it only returns features from structural variations (SV). + Returntype : reference to list Bio::EnsEMBL::StructuralVariationFeature + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all_somatic_by_Slice { + my ($self, $slice, $include_evidence) = @_; + + my $constraint = $self->_internal_exclude_failed_constraint('',1); + $constraint .= ' AND ' if ($constraint); + $constraint .= ' somatic=1 '; + $constraint .= ' AND is_evidence=0 ' if (!$include_evidence); + + return $self->fetch_all_by_Slice_constraint($slice, $constraint); +} + + +=head2 fetch_all_by_StructuralVariation + + Arg [1] : Bio::EnsEMBL:Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation $var + Example : my @svfs = @{$svfa->fetch_all_by_StructuralVariation($var)}; + Description: Retrieves all structural variation features for a given structural variation. Most + structural variations should only hit the genome once and only a return + a single structural variation feature. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + + +sub fetch_all_by_StructuralVariation { + my $self = shift; + my $var = shift; + + if(!ref($var) || (!$var->isa('Bio::EnsEMBL::Variation::StructuralVariation') && + !$var->isa('Bio::EnsEMBL::Variation::SupportingStructuralVariation')) + ) { + throw('Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation arg expected'); + } + + if(!defined($var->dbID())) { + throw("StructuralVariation arg must have defined dbID"); + } + + my $constraint = $self->_internal_exclude_failed_constraint("svf.structural_variation_id = ".$var->dbID()); + + return $self->generic_fetch($constraint); +} + + +=head2 fetch_Iterator_by_Slice_constraint + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Description: Returns a listref of structural variation features created + from the database which are on the Slice defined by $slice + and fulfill the SQL constraint defined by $constraint, using the iterator method. + Returntype : reference to list StructuralVariationFeature + Exceptions : thrown if $slice is not defined + Caller : general + Status : Stable + +=cut + +sub fetch_Iterator_by_Slice_constraint { + my ($self, $slice, $constraint) = @_; + + $self->{_iterator} = 1; + + $constraint = $self->_internal_exclude_failed_constraint($constraint,1); + + my $iterator = $self->fetch_all_by_Slice_constraint($slice, $constraint); + + $self->{_iterator} = 0; + + return $iterator; +} + + +=head2 fetch_all_by_Slice_SO_term + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : SO term (string) + Example : $slice = $slice_adaptor->fetch_by_region("chromosome", 1, 100000, 200000); + $SO_term = 'copy_number_variation'; + @svfs = @{$svf_adaptor->fetch_all_by_Slice_SO_term($slice,$SO_term)}; + Description: Retrieves all structural variation features in a slice with a variant type + (structural variation class) or an allele type (supporting structural variation class) + corresponding to the SO term. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice_SO_term { + my $self = shift; + my $slice = shift; + my $SO_term = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + my $aa = $self->db->get_AttributeAdaptor; + my $sv_class_id = $aa->attrib_id_for_type_value('SO_term',$SO_term); + + if (!defined($sv_class_id)) { + warn "The SO term '$SO_term' has not been found"; + return []; + } + + my $cols = join ",", $self->_columns(); + + my $constraint = $self->_internal_exclude_failed_constraint(); + + my $from = 'structural_variation_feature svf'; + if (!$self->db->include_failed_variations()) { + $from .= qq{ LEFT JOIN failed_structural_variation fsv + ON (fsv.structural_variation_id=svf.structural_variation_id) }; + } + + my $sth = $self->prepare(qq{ + SELECT DISTINCT $cols + FROM $from, source s + WHERE svf.source_id = s.source_id + AND svf.seq_region_id = ? + AND svf.seq_region_end > ? + AND svf.seq_region_start < ? + AND svf.class_attrib_id = ? + $constraint + }); + $sth->execute($slice->get_seq_region_id, $slice->start, $slice->end, $sv_class_id); + + my $result = $self->_objs_from_sth($sth); + $sth->finish; + + return $result; +} + + +=head2 fetch_all_by_Slice_VariationSet + + Arg [1] : Bio::EnsEMBL:Variation::Slice $slice + Arg [2] : Bio::EnsEMBL:Variation::VariationSet $set + Example : my @vsfs = @{$svfa->fetch_all_by_Slice_VariationSet($slice, $set)}; + Description: Retrieves all structural variation features in a slice that belong to a + given variation set. + Returntype : reference to list Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice_VariationSet { + + my $self = shift; + my $slice = shift; + my $set = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + if(!ref($set) || !$set->isa('Bio::EnsEMBL::Variation::VariationSet')) { + throw('Bio::EnsEMBL::Variation::VariationSet arg expected'); + } + + # Get the bitvalue for this set and its subsets + my $bitvalue = $set->_get_bitvalue(); + + # Add a constraint to only return StructuralVariationFeatures having the + # primary keys of the supplied VariationSet or its subsets in the variation_set_id column + my $constraint = " svf.variation_set_id & $bitvalue "; + + # Get the VariationFeatures by calling fetch_all_by_Slice_constraint + my $svfs = $self->fetch_all_by_Slice_constraint($slice,$constraint); + + return $svfs; +} + + +# method used by superclass to construct SQL +sub _tables { + my $self = shift; + + my @tables = ( ['structural_variation_feature', 'svf'], [ 'source', 's'] ); + + # If we are excluding failed_structural_variations, add that table + push(@tables,['failed_structural_variation', 'fsv']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +# Add a left join to the failed_structural_variation table +sub _left_join { + my $self = shift; + + # If we are including failed structural variations, skip the left join + return () if ($self->db->include_failed_variations()); + return (['failed_structural_variation', 'fsv.structural_variation_id=svf.structural_variation_id']); +} + +sub _default_where_clause { + my $self = shift; + + return 'svf.source_id = s.source_id'; +} + +sub _columns { + return qw( svf.structural_variation_feature_id svf.seq_region_id svf.outer_start svf.seq_region_start + svf.inner_start svf.inner_end svf.seq_region_end svf.outer_end svf.seq_region_strand + svf.structural_variation_id svf.variation_name s.name s.version svf.class_attrib_id + svf.allele_string svf.somatic svf.breakpoint_order); +} + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->dnadb()->get_SliceAdaptor(); + + my $aa = $self->db->get_AttributeAdaptor; + + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ($structural_variation_feature_id, $seq_region_id, $outer_start, $seq_region_start, $inner_start, + $inner_end, $seq_region_end, $outer_end, $seq_region_strand, $structural_variation_id, $variation_name, + $source_name, $source_version, $class_attrib_id, $allele_string, $is_somatic, $bp_order, $last_svf_id); + + $sth->bind_columns(\$structural_variation_feature_id, \$seq_region_id, \$outer_start, \$seq_region_start, + \$inner_start, \$inner_end, \$seq_region_end, \$outer_end, \$seq_region_strand, + \$structural_variation_id, \$variation_name, \$source_name, \$source_version, + \$class_attrib_id, \$allele_string, \$is_somatic, \$bp_order); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + } + + my $finished = 0; + + my $iterator = Bio::EnsEMBL::Utils::Iterator->new(sub{ + + return undef if $finished; + + FEATURE: while( $sth->fetch ) { + + # Skip if we are getting multiple rows because of the left join to failed variation + next if (defined($last_svf_id) && $last_svf_id == $structural_variation_feature_id); + $last_svf_id = $structural_variation_feature_id; + + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + # remap the feature coordinates to another coord system + # if a mapper was provided + + if($mapper) { + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + ($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) = + $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs); + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($sr_name)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||= + $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef, + $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= + $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, + $asm_cs_vers); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { + next FEATURE; + } + } + $slice = $dest_slice; + } + return $self->_create_feature_fast('Bio::EnsEMBL::Variation::StructuralVariationFeature', + + {'outer_start' => $outer_start, + 'start' => $seq_region_start, + 'inner_start' => $inner_start, + 'inner_end' => $inner_end, + 'end' => $seq_region_end, + 'outer_end' => $outer_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'variation_name' => $variation_name, + 'adaptor' => $self, + 'dbID' => $structural_variation_feature_id, + 'source' => $source_name, + 'source_version' => $source_version, + 'structural_variation_id' => $structural_variation_id, + 'class_SO_term' => $aa->attrib_value_for_id($class_attrib_id), + 'class_attrib_id' => $class_attrib_id, + 'allele_string' => $allele_string, + 'is_somatic' => $is_somatic, + 'breakpoint_order' => $bp_order, + } + ); + } + + unless ($finished) { + $sth->finish; + $finished = 1; + } + + return undef; + }); + + if ($self->{_iterator}) { + return $iterator; + } + else { + my $svfs = $iterator->to_arrayref; + #warn "Got ".scalar(@$vfs). "VFs"; + return $svfs; + } +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @svfs = @{$svfa->list_dbIDs()}; + Description: Gets an array of internal ids for all simple features in + the current db + Returntype : list of integers + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub list_dbIDs { + my $self = shift; + return $self->_list_dbIDs('structural_variation_feature'); +} + + +# Exclude the constraint for failed structural variant +sub _internal_exclude_failed_constraint { + my $self = shift; + my $constraint = shift; + my $no_and = shift; + $constraint .= " AND " if (!$no_and or $constraint); + $constraint .= $self->db->_exclude_failed_structural_variations_constraint(); + + return $constraint; +} + +=head2 new_fake + + Arg [1] : string $species + Example : + $vfa = Bio::EnsEMBL::Variation::StructuralVariationFeatureAdaptor->new_fake('human'); + Description: Creates a StructuralVariationFeatureAdaptor with no underlying database + attached. Should be used only when getting consequence types for + species with no variation database available. + Returntype : Bio::EnsEMBL::Variation::StructuralVariationFeatureAdaptor + Exceptions : throw if no species given + Caller : called from webcode for species where no variation database present + Status : Stable + +=cut + +sub new_fake { + my $class = shift; + my $species = shift; + + throw("No species defined") unless defined $species; + + my $self = bless {}, $class; + + $self->{'species'} = $species; + + return $self; +} + + +sub store { + my ($self, $svf) = @_; + + my $dbh = $self->dbc->db_handle; + + # look up source_id + if(!defined($svf->{source_id})) { + my $sth = $dbh->prepare(q{ + SELECT source_id FROM source WHERE name = ? + }); + $sth->execute($svf->{source}); + + my $source_id; + $sth->bind_columns(\$source_id); + $sth->fetch(); + $sth->finish(); + $svf->{source_id} = $source_id; + } + throw("No source ID found for source name ", $svf->{source}) unless defined($svf->{source_id}); + + # look up class_attrib_id + my $class_attrib_id; + if(defined($svf->{class_SO_term})) { + my $sth = $dbh->prepare(q{ + SELECT attrib_id FROM attrib WHERE value = ? + }); + $sth->execute($svf->{class_SO_term}); + + $sth->bind_columns(\$class_attrib_id); + $sth->fetch(); + $sth->finish(); + } + throw("No class ID found for the class name ", $svf->{class_SO_term}) unless defined($class_attrib_id); + + my $sth = $dbh->prepare(q{ + INSERT INTO structural_variation_feature ( + seq_region_id, + outer_start, + seq_region_start, + inner_start, + inner_end, + seq_region_end, + outer_end, + seq_region_strand, + structural_variation_id, + allele_string, + variation_name, + source_id, + class_attrib_id, + is_evidence, + somatic, + breakpoint_order + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) + }); + + $sth->execute( + $svf->{slice} ? $svf->{slice}->get_seq_region_id : $svf->{seq_region_id}, + $svf->{outer_start} || undef, + $svf->{slice} ? $svf->seq_region_start : $svf->{start}, + $svf->{inner_start} || undef, + $svf->{inner_end} || undef, + $svf->{slice} ? $svf->seq_region_end : $svf->{end}, + $svf->{outer_end} || undef, + $svf->strand, + $svf->structural_variation ? $svf->structural_variation->dbID : $svf->{structural_variation_id}, + $svf->allele_string, + $svf->variation_name, + $svf->{source_id}, + $class_attrib_id || 0, + $svf->structural_variation ? $svf->structural_variation->is_evidence : 0, + $svf->structural_variation ? $svf->structural_variation->is_somatic : 0, + $svf->{breakpoint_order} || undef + ); + + $sth->finish; + + # get dbID + my $dbID = $dbh->last_insert_id(undef, undef, 'structural_variation_feature', 'structural_variation_feature_id'); + $svf->{dbID} = $dbID; + $svf->{adaptor} = $self; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StudyAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/StudyAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,250 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::StudyAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::StudyAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $sta = $reg->get_adaptor("human","variation","study"); + + # fetch a study by its name + $study = $sta->fetch_by_name('estd1'); + + # fetch all study for a source + $sta = $reg->get_adaptor("human","variation","study"); + $st = $sta->fetch_all_by_source('NHGRI_GWAS_catalog'); + foreach $study (@{$sta->fetch_all_by_source('NHGRI_GWAS_catalog')}){ + print $study->dbID, " - ", $study->external_reference ,"\n"; + } + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for Study objects. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::StudyAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Variation::Study; + +use base qw{Bio::EnsEMBL::DBSQL::BaseAdaptor}; + +my %cache; + +=head2 fetch_by_name + + Arg [1] : string $name + Example : $study = $study_adaptor->fetch_by_name('estd1'); + Description: Retrieves a study object via its name + Returntype : Bio::EnsEMBL::Variation::Study + Exceptions : throw if name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + + throw('name argument expected') if(!defined($name)); + + my $result = $self->generic_fetch("st.name='$name'"); + + return ($result ? $result->[0] : undef); +} + + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + Example : $study = $study_adaptor->fetch_by_dbID(254); + Description: Retrieves a Study object via its internal identifier. + If no such study exists undef is returned. + Returntype : Bio::EnsEMBL::Variation::Study + Exceptions : throw if dbID arg is not defined + Caller : general + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + throw('dbID argument expected') if(!defined($dbID)); + + if (exists($cache{$dbID})) { + return $cache{$dbID}; + } + my $result = $self->generic_fetch("st.study_id=$dbID"); + + if ($result) { + $cache{$dbID} = $result->[0]; + } + + return ($result ? $result->[0] : undef); +} + + +=head2 fetch_all_by_dbID_list + + Arg [1] : listref $list + Example : $study = $study_adaptor->fetch_all_by_dbID_list([907,1132]); + Description: Retrieves a listref of study objects via a list of internal + dbID identifiers + Returntype : listref of Bio::EnsEMBL::Variation::Study objects + Exceptions : throw if list argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_dbID_list { + my $self = shift; + my $list = shift; + + if(!defined($list) || ref($list) ne 'ARRAY') { + throw("list reference argument is required"); + } + + my $id_str = (@$list > 1) ? " IN (".join(',',@$list).")" : ' = \''.$list->[0].'\''; + + my $result = $self->generic_fetch("st.study_id $id_str"); + + return ($result ? $result : undef); +} + + +=head2 fetch_all_by_source + + Arg [1] : string $source_name + Example : my $study = $study_adaptor->fetch_by_name('EGAS00000000001'); + Description : Retrieves all Study objects associated with a source. + Returntype : listref of Bio::EnsEMBL::Variation::Study + Exceptions : thrown if source_name not provided + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_source{ + my $self = shift; + my $source_name = shift; + + throw('source_name argument expected') if(!defined($source_name)); + + my $result = $self->generic_fetch("s.name='$source_name'"); + + return ($result ? $result : undef); +} + + +sub _fetch_all_associate_study_id { + my $self = shift; + my $study_id = shift; + + my $a_study; + my @study_list; + + my $sth = $self->prepare(qq{(SELECT DISTINCT study1_id FROM associate_study WHERE study2_id=?) + UNION + (SELECT DISTINCT study2_id FROM associate_study WHERE study1_id=?)}); + $sth->bind_param(1,$study_id,SQL_INTEGER); + $sth->bind_param(2,$study_id,SQL_INTEGER); + $sth->execute(); + $sth->bind_columns(\$a_study); + + while($sth->fetch()) { + push(@study_list,$a_study); + } + return \@study_list; +} + + +sub _columns { + return qw(st.study_id st.name st.description st.url st.external_reference st.study_type s.name); +} + +sub _tables { return (['study', 'st'],['source', 's']); } + +sub _default_where_clause { + my $self = shift; + return 'st.source_id = s.source_id'; +} + +# +# private method, creates study objects from an executed statement handle +# ordering of columns must be consistant +# +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my @study; + + my ($study_id,$study_name,$study_description,$study_url,$external_reference,$study_type,$source_name,$associate); + + $sth->bind_columns(\$study_id, \$study_name, \$study_description, \$study_url, + \$external_reference, \$study_type, \$source_name); + + while($sth->fetch()) { + + $associate = $self->_fetch_all_associate_study_id($study_id); + + push @study, Bio::EnsEMBL::Variation::Study->new + (-dbID => $study_id, + -ADAPTOR => $self, + -NAME => $study_name, + -DESCRIPTION => $study_description, + -URL => $study_url, + -EXTERNAL_REFERENCE => $external_reference, + -TYPE => $study_type, + -SOURCE => $source_name, + -ASSOCIATE => $associate); + } + + return \@study; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/SupportingStructuralVariationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/SupportingStructuralVariationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,194 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::SupportingStructuralVariationAdaptor +# +# Copyright (c) 2011 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::SupportingStructuralVariationAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $ssva = $reg->get_adaptor("human","variation","supportingstructuralvariation"); + + # fetch a supporting structural variation by its name + $ssv = $ssva->fetch_by_name('nssv133'); + + # fetch all supporting evidences for a structural variation + $sva = $reg->get_adaptor("human","variation","structuralvariation"); + $sv = $sva->fetch_by_dbID(145); + foreach $ssv (@{$ssva->fetch_all_by_StructuralVariation($sv)}){ + print $ssv->dbID, " - ", $ssv->name ,"\n"; + } + + # Modify the include_failed_variations flag in DBAdaptor to also return supporting evidences that have been flagged as failed + $va->db->include_failed_variations(1); + +=head1 DESCRIPTION + +This adaptor provides database connectivity for SupportingStructuralVariation objects. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::SupportingStructuralVariationAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Variation::SupportingStructuralVariation; +use DBI qw(:sql_types); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseStructuralVariationAdaptor'); + + +sub _default_where_clause { + my $self = shift; + return $self->SUPER::_default_where_clause().' AND is_evidence=1'; +} + + +sub _objs_from_sth { + my ($self, $sth) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + my @svs; + + my ($struct_variation_id, $variation_name, $validation_status, $source_name, $source_version, + $source_description, $class_attrib_id, $study_id, $is_evidence, $is_somatic); + + $sth->bind_columns(\$struct_variation_id, \$variation_name, \$validation_status, \$source_name, + \$source_version, \$source_description, \$class_attrib_id, \$study_id, \$is_evidence, + \$is_somatic); + + my $aa = $self->db->get_AttributeAdaptor; + my $sta = $self->db->get_StudyAdaptor(); + + while($sth->fetch()) { + + my $study; + $study = $sta->fetch_by_dbID($study_id) if (defined($study_id)); + + # Get the validation status + $validation_status ||= 0; + my @states = split(/,/,$validation_status); + + push @svs, Bio::EnsEMBL::Variation::SupportingStructuralVariation->new( + -dbID => $struct_variation_id, + -VARIATION_NAME => $variation_name, + -VALIDATION_STATES => \@states, + -ADAPTOR => $self, + -SOURCE => $source_name, + -SOURCE_VERSION => $source_version, + -SOURCE_DESCRIPTION => $source_description, + -CLASS_SO_TERM => $aa->attrib_value_for_id($class_attrib_id), + -STUDY => $study, + -IS_EVIDENCE => $is_evidence || 0, + -IS_SOMATIC => $is_somatic || 0 + ); + } + return \@svs; +} + + +=head2 fetch_all_by_StructuralVariation + + Arg [1] : Bio::EnsEMBL::Variation::StructuralVariation $sv + Example : my $sv = $sv_adaptor->fetch_by_name('esv9549'); + foreach my $ssv (@{$ssv_adaptor->fetch_all_by_StructuralVariation($sv)}){ + print $ssv->variation_name,"\n"; + } + Description : Retrieves all supporting evidences from a specified structural variant + ReturnType : reference to list of Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Exceptions : throw if incorrect argument is passed + warning if provided structural variant does not have a dbID + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_StructuralVariation { + my $self = shift; + my $sv = shift; + + if(!ref($sv) || !$sv->isa('Bio::EnsEMBL::Variation::StructuralVariation')) { + throw("Bio::EnsEMBL::Variation::StructuralVariation arg expected"); + } + + if(!$sv->dbID()) { + warning("StructuralVariation does not have dbID, cannot retrieve structural variants"); + return []; + } + + my $cols = qq{ sa.supporting_structural_variation_id, sv.variation_name, sv.validation_status, s.name, + s.version, s.description, sv.class_attrib_id, sv.study_id, sv.is_evidence, sv.somatic }; + + my $tables; + foreach my $t ($self->_tables()) { + next if ($t->[0] eq 'failed_structural_variation' and !$self->db->include_failed_variations()); + $tables .= ',' if ($tables); + $tables .= join(' ',@$t); + # Adds a left join to the failed_structural_variation table + if ($t->[0] eq 'structural_variation' and !$self->db->include_failed_variations()) { + $tables .= qq{ LEFT JOIN failed_structural_variation fsv + ON (fsv.structural_variation_id=sv.structural_variation_id)}; + } + } + + # Special case for one human study where SV can be a supporting evidence of an other SV + my $constraint = $self->SUPER::_default_where_clause(); + + # Add the constraint for failed structural variant + $constraint .= " AND " . $self->db->_exclude_failed_structural_variations_constraint(); + + my $sth = $self->prepare(qq{SELECT $cols + FROM $tables, structural_variation_association sa + WHERE $constraint + AND sa.supporting_structural_variation_id=sv.structural_variation_id + AND sa.structural_variation_id = ?}); + $sth->bind_param(1,$sv->dbID,SQL_INTEGER); + $sth->execute(); + + my $results = $self->_objs_from_sth($sth); + + $sth->finish(); + + return $results; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/TranscriptVariationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/TranscriptVariationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,554 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor + +=head1 SYNOPSIS + my $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + my $tva = $reg->get_adaptor('human','variation','TranscriptVariation'); + + my $ta = $reg->get_adaptor('human','core','Transcript'); + my $va = $reg->get_adaptor('human','variation','Variation'); + my $vfa = $reg->get_adaptor('human','variation','VariationFeature'); + + # fetch all TranscriptVariations related to a Transcript + my $tran = $ta->fetch_by_stable_id('ENST00000380152'); + + for my $tv (@{ $tva->fetch_all_by_Transcripts([$tran]) }) { + print $tv->consequence_type, "\n"; + print $tv->cdna_start, '-', $tv->cdna_end, "\n"; + } + + # fetch all TranscriptVariations related to a VariationFeature + my $vf = $vfa->fetch_all_by_Variation($va->fetch_by_name('rs669'))->[0]; + + for my $tv (@{ $tva->fetch_all_by_VariationFeatures([$vf]) }) { + print $tv->transcript->stable_id, "\n"; + print $tv->translation_start, '-', $tv->translation_end, "\n"; + } + +=head1 DESCRIPTION + +This adaptor allows you to fetch TranscriptVariation objects either by the Transcripts +the associated VariationFeature falls in, or by VariationFeature directly. Storing +TranscriptVariation objects in a variation schema database is also supported. In the +database there will a separate row for each alternative allele of a TranscriptVariation, +but the methods here will fetch all alleles associated with the TranscriptVariation +at once. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor; + +use Bio::EnsEMBL::Variation::TranscriptVariation; +use Bio::EnsEMBL::Variation::TranscriptVariationAllele; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT); +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix; + +use base qw(Bio::EnsEMBL::Variation::DBSQL::VariationFeatureOverlapAdaptor); + +=head2 store + + Arg [1] : Bio::EnsEMBL::Variation::TranscriptVariation $tv + Description: Store the TranscriptVariation in the database + Status : At risk + +=cut + +sub store { + my ($self, $tv) = @_; + + my $dbh = $self->dbc->db_handle; + + my $sth = $dbh->prepare_cached(q{ + INSERT DELAYED INTO transcript_variation ( + variation_feature_id, + feature_stable_id, + allele_string, + somatic, + consequence_types, + cds_start, + cds_end, + cdna_start, + cdna_end, + translation_start, + translation_end, + distance_to_transcript, + codon_allele_string, + pep_allele_string, + hgvs_genomic, + hgvs_transcript, + hgvs_protein, + polyphen_prediction, + polyphen_score, + sift_prediction, + sift_score + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) + }); + + for my $allele (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) { + + $sth->execute( + $tv->variation_feature->dbID, + $tv->feature->stable_id, + $allele->allele_string, + $tv->variation_feature->is_somatic, + (join ',', map { $_->SO_term } @{ $allele->get_all_OverlapConsequences }), + $tv->cds_start, + $tv->cds_end, + $tv->cdna_start, + $tv->cdna_end, + $tv->translation_start, + $tv->translation_end, + $tv->distance_to_transcript, + $allele->codon_allele_string, + $allele->pep_allele_string, + $allele->hgvs_genomic, + $allele->hgvs_transcript, + $allele->hgvs_protein, + $allele->polyphen_prediction, + $allele->polyphen_score, + $allele->sift_prediction, + $allele->sift_score + ); + } +} + +=head2 fetch_all_by_Transcripts_SO_terms + + Arg [1] : listref of Bio::EnsEMBL::Transcripts + Arg [2] : listref of SO terms + Description: Fetch all germline TranscriptVariations associated with the + given list of Transcripts with consequences with given SO terms + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariations + Status : At risk + +=cut + +sub fetch_all_by_Transcripts_SO_terms { + my ($self, $transcripts, $terms) = @_; + my $constraint = $self->_get_consequence_constraint($terms); + return $self->fetch_all_by_Transcripts_with_constraint($transcripts, $constraint.' AND somatic = 0'); +} + +=head2 fetch_all_somatic_by_Transcripts_SO_terms + + Arg [1] : listref of Bio::EnsEMBL::Transcripts + Arg [2] : listref of SO terms + Description: Fetch all somatic TranscriptVariations associated with the + given list of Transcripts with consequences with given SO terms + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariations + Status : At risk + +=cut + +sub fetch_all_somatic_by_Transcripts_SO_terms { + my ($self, $transcripts, $terms) = @_; + my $constraint = $self->_get_consequence_constraint($terms); + return $self->fetch_all_by_Transcripts_with_constraint($transcripts, $constraint.' AND somatic = 1'); +} + +=head2 fetch_all_by_VariationFeatures_SO_terms + + Arg [1] : listref of Bio::EnsEMBL::Variation::VariationFeatures + Arg [2] : listref of SO terms + Description: Fetch all germline TranscriptVariations associated with the + given list of VariationFeatures with consequences with given + SO terms + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariations + Status : At risk + +=cut + +sub fetch_all_by_VariationFeatures_SO_terms { + my ($self, $vfs, $transcripts, $terms, $without_children, $included_so) = @_; + my $constraint = $self->_get_consequence_constraint($terms, $without_children, $included_so); + if (!$constraint) { + return []; + } + return $self->SUPER::fetch_all_by_VariationFeatures_with_constraint($vfs, $transcripts, $constraint); +} + +=head2 count_all_by_VariationFeatures_SO_terms + + Arg [1] : listref of Bio::EnsEMBL::Variation::VariationFeatures + Arg [2] : listref of SO terms + Description: Count TranscriptVariations associated with given + VariationFeatures with consequences with given SO terms + Returntype : int + Status : At risk + +=cut + +sub count_all_by_VariationFeatures_SO_terms { + my ($self, $vfs, $transcripts, $terms, $included_so) = @_; + my $constraint = $self->_get_consequence_constraint($terms, 1, $included_so); + if (!$constraint) { + return 0; + } + return $self->SUPER::count_all_by_VariationFeatures_with_constraint($vfs, $transcripts, $constraint); +} + +=head2 fetch_all_by_Transcripts + + Arg [1] : listref of Bio::EnsEMBL::Transcripts + Description: Fetch all germline TranscriptVariations associated with the + given list of Transcripts + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariations + Status : At risk + +=cut + +sub fetch_all_by_Transcripts { + my ($self, $transcripts) = @_; + return $self->fetch_all_by_Transcripts_with_constraint($transcripts, 'somatic = 0'); +} + +=head2 fetch_all_somatic_by_Transcripts + + Arg [1] : listref of Bio::EnsEMBL::Transcripts + Description: Fetch all somatic TranscriptVariations associated with the + given list of Transcripts + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariations + Status : At risk + +=cut + +sub fetch_all_somatic_by_Transcripts { + my ($self, $transcripts) = @_; + return $self->fetch_all_by_Transcripts_with_constraint($transcripts, 'somatic = 1'); +} + +=head2 fetch_all_by_translation_id + + Arg[1] : String $translation_id + The stable identifier of the translation + Description: Fetch all germline TranscriptVariations associated with the given Translation + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation + Status : At Risk +=cut + +sub fetch_all_by_translation_id { + my ($self, $translation_id) = @_; + my $transcript = $self->_transcript($translation_id); + my $all_tvs = $self->fetch_all_by_Transcripts([$transcript]); + return $self->_transcript_variations_on_protein($all_tvs); +} + +=head2 fetch_all_somatic_by_translation_id + + Arg[1] : String $translation_id + The stable identifier of the translation. + Description: Fetch all somatic TranscriptVariations associated with the given Translation + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation + Status : At Risk +=cut + +sub fetch_all_somatic_by_translation_id { + my ($self, $translation_id) = @_; + my $transcript = $self->_transcript($translation_id); + my $all_tvs = $self->fetch_all_somatic_by_Transcripts([$transcript]); + return $self->_transcript_variations_on_protein($all_tvs); +} + +=head2 fetch_all_by_translation_id_SO_terms + + Arg[1] : String $translation_id + The stable identifier of the translation + Arg[2] : listref of SO terms + Description : Fetch all germline TranscriptVariations associated with the given Translation + and having consequence types as given in the input list of SO terms + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation + Status : At Risk +=cut + +sub fetch_all_by_translation_id_SO_terms { + my ($self, $translation_id, $terms) = @_; + my $transcript = $self->_transcript($translation_id); + my $all_tvs = $self->fetch_all_by_Transcripts_SO_terms([$transcript], $terms); + return $self->_transcript_variations_on_protein($all_tvs); +} + +=head2 fetch_all_somatic_by_translation_id_SO_terms + + Arg[1] : String $translation_id + The stable identifier of the translation + Arg[2] : listref of SO terms + Description: Fetch all somatic TranscriptVariations associated with the given Translation + and having consequence types as given in the input list of SO terms + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation + Status : At Risk +=cut + +sub fetch_all_somatic_by_translation_id_SO_terms { + my ($self, $translation_id, $terms) = @_; + my $transcript = $self->_transcript($translation_id); + my $all_tvs = $self->fetch_all_somatic_by_Transcripts_SO_terms([$transcript], $terms); + return $self->_transcript_variations_on_protein($all_tvs); +} + +# Returns the associated Transcript for a given translation id +sub _transcript { + my ($self, $translation_id) = @_; + my $transcript_adaptor = $self->db()->dnadb()->get_TranscriptAdaptor(); + my $transcript = $transcript_adaptor->fetch_by_translation_stable_id($translation_id); + return $transcript; +} + +# Returns listref of TranscriptVariations whose coordinates can be mapped to the protein sequence +sub _transcript_variations_on_protein { + my ($self, $all_tvs) = @_; + my @tvs; + foreach my $tv (@$all_tvs) { + if ($tv->translation_start && $tv->translation_end) { + push(@tvs, $tv); + } + } + return \@tvs; +} + +=head2 fetch_all_by_Transcripts_with_constraint + + Arg [1] : listref of Bio::EnsEMBL::Transcripts + Arg [2] : extra SQL constraint for the query + Description: Fetch all TranscriptVariations associated with the + given list of Transcripts + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariations + Status : At risk + +=cut + +sub fetch_all_by_Transcripts_with_constraint { + my ($self, $transcripts, $constraint) = @_; + + return $self->SUPER::fetch_all_by_Features_with_constraint($transcripts, $constraint); +} + +sub _objs_from_sth { + my ($self, $sth) = @_; + + #warn $sth->sql; + + my ( + $transcript_variation_id, + $variation_feature_id, + $feature_stable_id, + $allele_string, + $consequence_types, + $cds_start, + $cds_end, + $cdna_start, + $cdna_end, + $translation_start, + $translation_end, + $distance_to_transcript, + $codon_allele_string, + $pep_allele_string, + $hgvs_genomic, + $hgvs_transcript, + $hgvs_protein, + $polyphen_prediction, + $polyphen_score, + $sift_prediction, + $sift_score, + ); + + $sth->bind_columns( + \$transcript_variation_id, + \$variation_feature_id, + \$feature_stable_id, + \$allele_string, + \$consequence_types, + \$cds_start, + \$cds_end, + \$cdna_start, + \$cdna_end, + \$translation_start, + \$translation_end, + \$distance_to_transcript, + \$codon_allele_string, + \$pep_allele_string, + \$hgvs_genomic, + \$hgvs_transcript, + \$hgvs_protein, + \$polyphen_prediction, + \$polyphen_score, + \$sift_prediction, + \$sift_score, + ); + + my %tvs; + + while ($sth->fetch) { + + my ($ref_allele, $alt_allele) = split /\//, $allele_string; + my ($ref_codon, $alt_codon) = split /\//, $codon_allele_string || ''; + my ($ref_pep, $alt_pep) = split /\//, $pep_allele_string || ''; + + # for HGMD mutations etc. just set the alt allele to the ref allele + $alt_allele ||= $ref_allele; + + # for synonymous mutations the peptides are the same and + # there is no / in the string + $alt_pep ||= $ref_pep; + + # for TranscriptVariations with multiple alternative alleles + # there will be multiple rows in the database, so we construct + # the TV object and the reference allele object when we see + # the first row, but then only add extra allele objects when + # we see further rows, we track existing TVs in the %tvs hash, + # keyed by variation_feature_id and feature_stable_id + + my $key = $variation_feature_id.'_'.$feature_stable_id; + + my $tv = $tvs{$key}; + + unless ($tv) { + $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new_fast({ + _variation_feature_id => $variation_feature_id, + _feature_stable_id => $feature_stable_id, + cds_start => $cds_start, + cds_end => $cds_end, + cdna_start => $cdna_start, + cdna_end => $cdna_end, + translation_start => $translation_start, + translation_end => $translation_end, + distance_to_transcript => $distance_to_transcript, + adaptor => $self, + }); + + $tvs{$key} = $tv; + + my $ref_allele = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new_fast({ + is_reference => 1, + variation_feature_seq => $ref_allele, + transcript_variation => $tv, + codon => $ref_codon, + peptide => $ref_pep, + dbID => $transcript_variation_id, + }); + + $tv->add_TranscriptVariationAllele($ref_allele); + } + + #my $overlap_consequences = $self->_transcript_variation_consequences_for_set_number($consequence_types); + + my $overlap_consequences = [ map { $OVERLAP_CONSEQUENCES{$_} } split /,/, $consequence_types ]; + + my $allele = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new_fast({ + is_reference => 0, + variation_feature_seq => $alt_allele, + transcript_variation => $tv, + codon => $alt_codon, + peptide => $alt_pep, + hgvs_genomic => $hgvs_genomic, + hgvs_transcript => $hgvs_transcript, + hgvs_protein => $hgvs_protein, + overlap_consequences => $overlap_consequences, + polyphen_prediction => $polyphen_prediction, + polyphen_score => $polyphen_score, + sift_prediction => $sift_prediction, + sift_score => $sift_score, + dbID => $transcript_variation_id, + }); + + $tv->add_TranscriptVariationAllele($allele); + } + + return [values %tvs]; +} + +sub _tables { + return ( + ['transcript_variation', 'tv'] + ); +} + +sub _columns { + return qw( + transcript_variation_id + variation_feature_id + feature_stable_id + allele_string + consequence_types + cds_start + cds_end + cdna_start + cdna_end + translation_start + translation_end + distance_to_transcript + codon_allele_string + pep_allele_string + hgvs_genomic + hgvs_transcript + hgvs_protein + polyphen_prediction + polyphen_score + sift_prediction + sift_score + ); +} + +#sub _get_prediction_matrix { +# my ($self, $analysis, $transcript_stable_id) = @_; +# +# # look in the protein function prediction table to see if there is +# # a prediction string for this transcript +# +# return undef unless ($analysis eq 'polyphen' || $analysis eq 'sift'); +# +# my $dbh = $self->dbc->db_handle; +# +# my $col = $analysis.'_predictions'; +# +# my $sth = $dbh->prepare_cached(qq{ +# SELECT $col +# FROM protein_function_predictions +# WHERE transcript_stable_id = ? +# }); +# +# $sth->execute($transcript_stable_id); +# +# my ($raw_matrix) = $sth->fetchrow_array; +# +# $sth->finish; +# +# return undef unless $raw_matrix; +# +# my $matrix = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( +# -analysis => $analysis, +# -matrix => $raw_matrix, +# ); +# +# return $matrix; +#} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1510 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $va = $reg->get_adaptor("human","variation","variation"); + $pa = $reg->get_adaptor("human","variation","population"); + + $va = $db->get_VariationAdaptor(); + $pa = $db->get_PopulationAdaptor(); + + # Get a Variation by its internal identifier + $var = $va->fetch_by_dbID(145); + + # fetch a variation by its name + $var = $va->fetch_by_name('rs100'); + + # check if the variation is failed and if so, get the failed description + if ($var->is_failed()) { + $desc = $var->failed_description(); + } + + # fetch all variations from a population + $pop = $pa->fetch_by_name('PACIFIC'); + @vars = {$va->fetch_all_by_Population($pop)}; + + # Modify the include_failed_variations flag in DBAdaptor to also return variations that have been flagged as failed + $va->db->include_failed_variations(1); + + # Fetch all variations from a population, including variations flagged as failed + @vars = {$va->fetch_all_by_Population($pop)}; + +=head1 DESCRIPTION + +This adaptor provides database connectivity for Variation objects. +Variations (SNPs, etc.) may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor; + +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref wrap_array); + +use Bio::EnsEMBL::Variation::Variation; +use Bio::EnsEMBL::Variation::Allele; +use Bio::EnsEMBL::Utils::Iterator; + +use DBI qw(:sql_types); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + +my $DEFAULT_ITERATOR_CACHE_SIZE = 10000; + +sub store { + my ($self, $var) = @_; + + my $dbh = $self->dbc->db_handle; + + # look up source_id + if(!defined($var->{source_id})) { + my $sth = $dbh->prepare(q{ + SELECT source_id FROM source WHERE name = ? + }); + $sth->execute($var->{source}); + + my $source_id; + $sth->bind_columns(\$source_id); + $sth->fetch(); + $sth->finish(); + $var->{source_id} = $source_id; + } + + throw("No source ID found for source name ", $var->{source}) unless defined($var->{source_id}); + + my $sth = $dbh->prepare(q{ + INSERT INTO variation ( + source_id, + name, + validation_status, + ancestral_allele, + flipped, + class_attrib_id, + somatic, + minor_allele, + minor_allele_freq, + minor_allele_count, + clinical_significance_attrib_id + ) VALUES (?,?,?,?,?,?,?,?,?,?,?) + }); + + $sth->execute( + $var->{source_id}, + $var->name, + (join ",", @{$var->get_all_validation_states}) || undef, + $var->ancestral_allele, + $var->{flipped}, + $var->{class_attrib_id} || $var->adaptor->db->get_AttributeAdaptor->attrib_id_for_type_value('SO_term', $var->{class_SO_term}) || 18, + $var->is_somatic, + $var->minor_allele, + $var->minor_allele_frequency, + $var->minor_allele_count, + $var->{clinical_significance_attrib_id} + ); + + $sth->finish; + + # get dbID + my $dbID = $dbh->last_insert_id(undef, undef, 'variation', 'variation_id'); + $var->{dbID} = $dbID; + $var->{adaptor} = $self; + + # flanking sequence + $sth = $dbh->prepare(q{ + INSERT INTO flanking_sequence ( + variation_id, + up_seq, + down_seq, + up_seq_region_start, + up_seq_region_end, + down_seq_region_start, + down_seq_region_end, + seq_region_id, + seq_region_strand + ) VALUES (?,?,?,?,?,?,?,?,?) + }); + + $sth->execute( + $var->dbID, + $var->{five_prime_flanking_seq}, + $var->{three_prime_flanking_seq}, + $var->{up_seq_region_start}, + $var->{up_seq_region_end}, + $var->{down_seq_region_start}, + $var->{down_seq_region_end}, + $var->{seq_region_id}, + $var->{seq_region_strand} + ); + + $sth->finish; +} + +sub update { + my ($self, $var) = @_; + + my $dbh = $self->dbc->db_handle; + + # look up source_id + if(!defined($var->{source_id})) { + my $sth = $dbh->prepare(q{ + SELECT source_id FROM source WHERE name = ? + }); + $sth->execute($var->{source}); + + my $source_id; + $sth->bind_columns(\$source_id); + $sth->fetch(); + $sth->finish(); + $var->{source_id} = $source_id; + } + + throw("No source ID found for source name ", $var->{source}) unless defined($var->{source_id}); + + my $sth = $dbh->prepare(q{ + UPDATE variation + SET source_id = ?, + name = ?, + validation_status = ?, + ancestral_allele = ?, + flipped = ?, + class_attrib_id = ?, + somatic = ?, + minor_allele = ?, + minor_allele_freq = ?, + minor_allele_count = ?, + clinical_significance_attrib_id = ? + WHERE variation_id = ? + }); + + $sth->execute( + $var->{source_id}, + $var->name, + (join ",", @{$var->get_all_validation_states}) || undef, + $var->ancestral_allele, + $var->{flipped}, + $var->{class_attrib_id} || $var->adaptor->db->get_AttributeAdaptor->attrib_id_for_type_value('SO_term', $var->{class_SO_term}) || 18, + $var->is_somatic, + $var->minor_allele, + $var->minor_allele_frequency, + $var->minor_allele_count, + $var->{clinical_significance_attrib_id}, + $var->dbID + ); + + $sth->finish; +} + +=head2 fetch_all + + Description: Returns a listref of all germline variations + Returntype : listref of Variations + Status : At risk + +=cut + +sub fetch_all { + my $self = shift; + my $constraint = 'v.somatic = 0'; + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_somatic + + Description: Returns a listref of all somatic variations + Returntype : listref of Variations + Status : At risk + +=cut + +sub fetch_all_somatic { + my $self = shift; + my $constraint = 'v.somatic = 1'; + return $self->generic_fetch($constraint); +} + +=head2 fetch_Iterator + + Arg [1] : int $cache_size (optional) + Example : $var_iterator = $var_adaptor->fetch_Iterator; + Description: returns an iterator over all germline variations in the database + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub fetch_Iterator { + my ($self, $cache_size) = @_; + return $self->_generic_fetch_Iterator($cache_size, 'v.somatic = 0'); +} + +=head2 fetch_Iterator_somatic + + Arg [1] : int $cache_size (optional) + Example : $var_iterator = $var_adaptor->fetch_Iterator; + Description: returns an iterator over all somatic variations in the database + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub fetch_Iterator_somatic { + my ($self, $cache_size) = @_; + return $self->_generic_fetch_Iterator($cache_size, 'v.somatic = 1'); +} + +sub _generic_fetch_Iterator { + + my ($self, $cache_size, $constraint) = @_; + + my $full_constraint = $constraint ? + $constraint.' AND '.$self->db->_exclude_failed_variations_constraint : + $self->db->_exclude_failed_variations_constraint; + + # prepare and execute a query to fetch all dbIDs + + my $sth = $self->prepare(qq{ + SELECT v.variation_id + FROM (variation v, source s) + LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id + WHERE v.source_id = s.source_id + AND $full_constraint + }); + + $sth->execute; + + my $var_id; + + $sth->bind_columns(\$var_id); + + # we probably can't fit all of these into memory at once though, + # so create an iterator that fetches $cache_size dbIDs from the + # statement handle at a time and then fetches these objects, + # storing them in a cache. We then return variation objects + # from this cache one by one, before filling it again if + # necessary + + $cache_size ||= $DEFAULT_ITERATOR_CACHE_SIZE; + + my @cache; + + my $items_to_fetch = 1; + + return Bio::EnsEMBL::Utils::Iterator->new(sub{ + + if (@cache == 0 && $items_to_fetch) { + + # our cache is empty, and there are still items to fetch, so + # fetch the next chunk of dbIDs and create objects from them + + my @dbIDs; + + my $item_count = 0; + + while( $sth->fetch ) { + + push @dbIDs, $var_id; + + if (++$item_count == $cache_size) { + # we have fetched a cache's worth of dbIDs, so flag that + # there are still items to fetch and last out of the loop + $items_to_fetch = 1; + last; + } + + # if this is the last row, this flag will be 0 outside the loop + $items_to_fetch = 0; + } + + $sth->finish unless $items_to_fetch; + + @cache = @{ $self->fetch_all_by_dbID_list(\@dbIDs) } if @dbIDs; + } + + return shift @cache; + }); +} + +=head2 fetch_by_dbID + + Arg [1] : int $dbID + Example : $var = $var_adaptor->fetch_by_dbID(5526); + Description: Retrieves a Variation object via its internal identifier. + If no such variation exists undef is returned. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw if dbID arg is not defined + Caller : general, IndividualAdaptor + Status : Stable + +=cut + +sub fetch_by_dbID { + my $self = shift; + my $dbID = shift; + + # This method uses the flanking_sequence table + $self->{'_check_flanking'} = 1; + + # Now, it should be fine to just use the superclass method + my $result = $self->SUPER::fetch_by_dbID($dbID); + + # Unset the flanking flag + delete($self->{'_check_flanking'}); + + return $result; + +} + +sub _columns { + my $self = shift; + + my @cols = ( + "v.variation_id", + "v.name AS v_name", + "v.validation_status AS v_validation_status", + "v.class_attrib_id AS v_class_attrib_id", + "s1.name AS v_source_name", + "s1.description AS v_source_description", + "s1.url AS v_source_url", + "s1.type AS v_source_type", + "v.somatic AS v_somatic", + "v.flipped AS v_flipped", + "v.ancestral_allele AS v_ancestral_allele", + "vs.moltype AS vs_moltype", + "vs.name AS vs_name", + "s2.name AS vs_source_name", + "v.minor_allele", + "v.minor_allele_freq", + "v.minor_allele_count", + "v.clinical_significance_attrib_id", + ); + if ($self->{'_check_flanking'}) { + push(@cols,qq{(fs.up_seq is not null OR fs.down_seq is not null) AS fs_flank_flag}); + } + else { + push(@cols,qq{0 AS fs_flank_flag}); + } + + return @cols; +} + +sub _tables { + my $self = shift; + + my @tables = ( + ['variation', 'v'], + ['source', 's1'], + ['variation_synonym', 'vs'], + ['source', 's2'] + ); + + # If we are constraining on sample_id, add the allele table + push(@tables,['allele', 'a']) if ($self->{'_constrain_sample'}); + # Add the flanking_sequence table if we are checking that + push(@tables,['flanking_sequence', 'fs']) if ($self->{'_check_flanking'}); + # Add the failed_variation table if we are filtering on those + push(@tables,['failed_variation', 'fv']) unless ($self->db->include_failed_variations()); + + return @tables; +} + +sub _left_join { + my $self = shift; + + my @left_join = ( + ['variation_synonym', 'v.variation_id = vs.variation_id'], + ['source s2', 'vs.source_id = s2.source_id'] + ); + + # If we are checking flanking_sequences, left join to that table + push(@left_join,['flanking_sequence', 'v.variation_id = fs.variation_id']) if ($self->{'_check_flanking'}); + # If we are filtering on failed variations, left join + push(@left_join,['failed_variation', 'v.variation_id = fv.variation_id']) unless ($self->db->include_failed_variations()); + + return @left_join; +} + +sub _default_where_clause { + my $self = shift; + + my $constraint = qq{ + s1.source_id = v.source_id + }; + + # If we are constraining on sample_id, we should have a constraint on the allele tables as well + $constraint .= qq{ AND a.variation_id = v.variation_id } if ($self->{'_constrain_sample'}); + + return $constraint; +} + +=head2 fetch_by_name + + Arg [1] : string $name + Arg [2] : string $source (optional) + Example : $var = $var_adaptor->fetch_by_name('rs1453'); + Description: Retrieves a variation object via its name + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw if name argument is not defined + Caller : general + Status : Stable + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + my $source = shift; + + throw('name argument expected') if(!defined($name)); + + # This method will need to left join to the flanking sequence table so set that flag + $self->{'_check_flanking'} = 1; + + # Add a constraint on the name + my $constraint = qq{v.name = ?}; + + # If a source is given, add a constraint on the source + $constraint .= qq{ AND s1.name = ?} if ( defined $source ); + + # Bind the parameters + $self->bind_param_generic_fetch($name,SQL_VARCHAR); + $self->bind_param_generic_fetch($source,SQL_VARCHAR) if ( defined $source ); + + # Get the results from generic fetch method + my $result = $self->generic_fetch($constraint); + + # Unset the check flanking flag again + delete($self->{'_check_flanking'}); + + # We need to check the synonym table in case the name was not found in the variation table + unless (scalar(@{$result})) { + + # Call the fetch by synonym method + $result = wrap_array($self->fetch_by_synonym($name,$source)); + + } + + # If we still can't find any result and the name looks like an ssId, try fetching by subsnp_id instead + unless (scalar(@{$result}) || $name !~ m/^ss\d+$/) { + + $result = wrap_array($self->fetch_by_subsnp_id($name)); + + } + + # Return the result + return undef unless (scalar(@{$result})); + return $result->[0]; +} + +# Internal method for getting the internal dbIDs for a list of names. Will also query the variation_synonym and allele (for subsnp_ids) tables +sub _name_to_dbID { + my $self = shift; + my $name_list = shift; + my $synonym = shift; + my $subsnp = shift; + + $name_list = wrap_array($name_list); + throw ("A list of names is required") unless (scalar(@{$name_list})); + + # Use a hash to store the name to dbID mapping + my %dbIDs; + + # Determine which columns we are returning + my $cols = ($synonym && $subsnp ? "CONCAT('ss',v.subsnp_id) AS name, v.variation_id" : "v.name, v.variation_id"); + + # Determine which table we are querying + my $table = ($synonym ? ($subsnp ? 'allele' : 'variation_synonym') : 'variation'); + + # Statement to get the dbIDs from variation or variation_synonym + my $stmt = qq{ + SELECT + $cols + FROM + $table v + WHERE + }; + my $sth; + + # Work on batches of $batch_size; + my $batch_size = 200; + # Make a local copy of the list to work on + my $local_list = [@{$name_list}]; + while (scalar(@{$local_list})) { + + # Get the next batch and construct the constraint + my @names = splice(@{$local_list},0,$batch_size); + my $constraint = "('" . join("','",@names) . "')"; + $constraint = ($synonym && $subsnp ? "v.subsnp_id" : "v.name") . qq{ IN $constraint}; + + # Prepare a statement + $sth = $self->prepare($stmt . qq{ $constraint }); + $sth->execute(); + + # Fetch the results and populate the hash + my ($name,$dbID); + $sth->bind_columns(\$name,\$dbID); + while ($sth->fetch()) { + $dbIDs{$name} = $dbID; + } + + } + + # If we are querying variation and have unmapped names, also query variation_synonym and allele + if (!$synonym || !$subsnp) { + + # Get the unmapped names + my @unmapped = grep {!exists($dbIDs{$_})} @{$name_list}; + + # If we are going to query for subsnp_ids, get the names that look like ssIds and strip the ss prefix + if ($synonym) { + @unmapped = grep {$_ =~ s/^ss(\d+)$/$1/} @unmapped; + } + + # Get the dbID mapping from the synonym table and add them to the hash if there are unmapped + if (scalar(@unmapped)) { + my $names = $self->_name_to_dbID(\@unmapped,1,$synonym); + map {$dbIDs{$_} = $names->{$_}} keys(%{$names}); + } + } + + # Return a hashref with the name -> dbID mapping + return \%dbIDs; +} + +sub fetch_by_synonym { + my $self = shift; + my $name = shift; + my $source = shift; + + # Do a query to get the current variation for the synonym and call a fetch method on this variation + my $constraint = qq{vs.name = ?}; + $constraint .= qq{ AND s.name = ?} if (defined($source)); + + # This statement will only return 1 row which is consistent with the behaviour of fetch_by_name. + # However, the synonym name is only guaranteed to be unique in combination with the source + my $stmt = qq{ + SELECT + vs.variation_id + FROM + variation_synonym vs JOIN + source s ON ( + s.source_id = vs.source_id + ) + WHERE + $constraint + LIMIT 1 + }; + my $sth = $self->prepare($stmt); + $sth->bind_param(1,$name,SQL_VARCHAR); + $sth->bind_param(2,$source,SQL_VARCHAR) if (defined($source)); + $sth->execute(); + + # Bind the results + my $dbID; + $sth->bind_columns(\$dbID); + # Fetch the results + $sth->fetch(); + + # Return undef in case no data could be found + return undef unless (defined($dbID)); + + # Call the fetch_by_name method using the updated name for the synonym + return $self->fetch_by_dbID($dbID); +} + + +=head2 fetch_by_subsnp_id + + Arg [1] : string $subsnp_id + Example : $var = $var_adaptor->fetch_by_subsnp_id('ss123'); + Description: Retrieves a variation object via a component subsnp ID + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw if name argument is not defined + Caller : general + Status : Stable + +=cut + +sub fetch_by_subsnp_id { + my $self = shift; + my $name = shift; + + throw('name argument expected') if(!defined($name)); + + # Strip away any ss prefix + $name =~ s/^ss//gi; + + # This method will need to left join to the flanking sequence table so set that flag + $self->{'_check_flanking'} = 1; + $self->{'_constrain_sample'} = 1; + + # Add a constraint on the subsnp_id + my $constraint = qq{a.subsnp_id = ?}; + + # Bind the parameters + $self->bind_param_generic_fetch($name,SQL_INTEGER); + + # Get the results from generic fetch method + my $result = $self->generic_fetch($constraint); + + # Unset the check flanking flag again + delete($self->{'_check_flanking'}); + delete($self->{'_constrain_sample'}); + + # Return the result + return undef unless (scalar(@{$result})); + return $result->[0]; + +} + + + +=head2 fetch_all_by_source + + Arg [1] : string $source_name + Arg [2] : int $primary + Example : $var = $var_adaptor->fetch_all_by_source(); + Description: Retrieves all Variation objects associated with a source. By default ($primary=0) + returns variations that have the source or variation_synonym that have the source. + If primary set to 1, it returns only variations where the primary name is associated + with the source + Returntype : listref of Bio::EnsEMBL::Variation::Variation + Exceptions : thrown if source_name not provided + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_source { + my $self = shift; + my $source_name = shift; + my $primary = shift; + + throw('name argument expected') unless (defined($source_name)); + + # By default, returns ALL variation and variation_synonyms where source = $name + $primary ||= 0; + + # Add the constraint on the source name. If primary is true, only the variation source will be queried, otherwise the variation_synonym source will be as well + my $constraint = qq{s1.name = ?}; + $constraint = qq{($constraint OR s2.name = ?)} unless ($primary); + + # If necessary, add the constraint for filtering failed variations + $constraint .= qq{ AND } . $self->db->_exclude_failed_variations_constraint(); + + # Bind the source name parameter + $self->bind_param_generic_fetch($source_name,SQL_VARCHAR); + $self->bind_param_generic_fetch($source_name,SQL_VARCHAR); + + # Execute the superclass generic fetch + return $self->generic_fetch($constraint); + +} + + +=head2 fetch_all_by_source_type + + Arg [1] : string $source_type + Arg [2] : int $primary + Example : $var = $var_adaptor->fetch_all_by_source_type('chip'); + Description: Retrieves all Variation objects associated with a type of source. By default ($primary=0) + returns variations that have the source or variation_synonym that have the source. + If primary set to 1, it returns only variations where the primary name is associated + with the source + Returntype : listref of Bio::EnsEMBL::Variation::Variation + Exceptions : thrown if source_type not provided + Caller : general + Status : At risk + +=cut + +sub fetch_all_by_source_type { + my $self = shift; + my $source_type = shift; + my $primary = shift; + + throw('name argument expected') unless (defined($source_type)); + + # Get the source names that match the type + my $stmt = qq{ + SELECT + name + FROM + source + WHERE + type = ? + }; + my $sth = $self->prepare($stmt); + $sth->bind_param(1,$source_type,SQL_VARCHAR); + $sth->execute(); + + # Fetch the results from one source at a time. This should probably be implemented in a more efficient manner. + my $name; + $sth->bind_columns(\$name); + # Store in a hash to avoid duplicates + my %out; + while ($sth->fetch()) { + + my $result = $self->fetch_all_by_source($name,$primary); + next unless (scalar(@{$result})); + + # Add the fetched results to the total hash + map {$out{$_->dbID()} = $_} @{$result}; + } + + # Return the reference to the list of variations + return [values(%out)]; +} + + +=head2 fetch_all_by_dbID_list + + Arg [1] : reference to list of ints $list + Example : @vars = @{$va->fetch_all_by_dbID_list([124, 56, 90])}; + Description: Retrieves a set of variations via their internal identifiers. + This is faster than repeatedly calling fetch_by_dbID if there + are a large number of variations to retrieve + Returntype : reference to list of Bio::EnsEMBL::Variation::Variation objects + Exceptions : throw on bad argument + Caller : general, IndividualGenotypeAdaptor, PopulationGenotypeAdaptor + Status : At Risk + +=cut + +sub fetch_all_by_dbID_list { + my $self = shift; + my $list = shift; + + # Get an Iterator for the list + my $iterator = $self->fetch_Iterator_by_dbID_list($list); + + # Get an arrayref representing the contents of the Iterator + my $result = $iterator->to_arrayref(); + + return $result; +} + +=head2 fetch_Iterator_by_dbID_list + + Arg [1] : reference to list of ints $list + Example : $variation_iterator = $va->fetch_Iterator_by_dbID_list([124, 56, 90]); + Description: Retrieves an iterator over a set of variations via their internal identifiers. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : throw on bad argument + Caller : general + Status : Experimental + +=cut + +sub fetch_Iterator_by_dbID_list { + my ($self, $dbid_list, $cache_size) = @_; + + unless ((defined $dbid_list) && (ref $dbid_list eq 'ARRAY')) { + throw("list reference argument is required"); + } + + $cache_size ||= $DEFAULT_ITERATOR_CACHE_SIZE; + + # create an iterator that fetches variations in blocks of + # $cache_size and returns them in turn + + my @object_cache; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + + if (@object_cache == 0 && @$dbid_list > 0 ) { + my @dbids = splice @$dbid_list, 0, $cache_size; + + # Create a constraint on the dbIDs + my $id_str = "(" . join(",",@dbids) . ")"; + my $constraint = qq{v.variation_id IN $id_str}; + + @object_cache = @{ $self->generic_fetch($constraint) }; + } + + return shift @object_cache; + } + ); +} + +=head2 fetch_all_by_name_list + + Arg [1] : reference to list of names $list + Example : @vars = @{$va->fetch_all_by_name_list(["rs3", "rs1333049"])}; + Description: Retrieves a set of variations via their names. This is faster + than repeatedly calling fetch_by_name if there are a large number + of variations to retrieve + Returntype : reference to list of Bio::EnsEMBL::Variation::Variation objects + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub fetch_all_by_name_list { + my $self = shift; + my $list = shift; + + # Get a list of dbIDs for the names + my $dbIDs = $self->_name_to_dbID($list); + + # Then fetch the variations by dbID list instead + my @dbID_list = values(%{$dbIDs}); + return $self->fetch_all_by_dbID_list(\@dbID_list); +} + +=head2 get_all_sources + + Args : none + Example : $sources = $va->get_all_sources(); + Description : Retrieves from the database all sources in the Source table + ReturnType : array ref of string + Exceptions : none + Caller : web + Status : At Risk + +=cut + +sub get_all_sources{ + my $self = shift; + my @sources; + + my $source_name; + my $sth = $self->prepare(qq{SELECT name from source + }); + $sth->execute(); + $sth->bind_columns(\$source_name); + + while ($sth->fetch()){ + push @sources, $source_name + } + $sth->finish(); + + return \@sources; +} + + +=head2 get_default_source + + Args : none + Example : $default_source = $va->get_default_source(); + Description : Retrieves from the database the default source used for display purposes + ReturnType : string + Exceptions : none + Caller : web + Status : At Risk + +=cut + +sub get_default_source{ + my $self = shift; + + my $source_name; + my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ? + }); + $sth->bind_param(1,'source.default_source',SQL_VARCHAR); + $sth->execute(); + $sth->bind_columns(\$source_name); + $sth->fetch(); + $sth->finish(); + + return $source_name; +} + + +=head2 get_source_version + + Arg[1] : string $name + Example : $version = $va->get_source_version('dbSNP'); + Description : Retrieves from the database the version for the source given as an argument + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_source_version{ + my $self = shift; + my $name = shift; + my $version; + my $sth = $self->prepare(qq{SELECT version from source where name = ? + }); + + $sth->bind_param(1,$name,SQL_VARCHAR); + + $sth->execute(); + $sth->bind_columns(\$version); + $sth->fetch(); + $sth->finish(); + + return $version; +} + +=head2 get_flanking_sequence + + Arg[1] : int $variationID + Example : $flankinq_sequence = $va->get_flanking_sequence('652'); + Description : Retrieves from the database the appropriate flanking sequence (five,three) for the variation. If the flanking sequence is not in + the Flankinq_sequence table, access the core database with the coordinates + ReturnType : reference to a list containing (three_flank,five_flank) + Exceptions : throw when not possible to obtain sequence + Caller : general, Variation + Status : Stable + +=cut + +sub get_flanking_sequence{ + my $self = shift; + my $variationID = shift; + + my $flanking_sequence; #reference to an array for the three_prime and five_prime seqs + my ($seq_region_id, $seq_region_strand, $up_seq, $down_seq, $up_seq_region_start, $up_seq_region_end, $down_seq_region_start, $down_seq_region_end); + + my $sth = $self->prepare(qq{ + SELECT seq_region_id, seq_region_strand, up_seq, + down_seq, up_seq_region_start, up_seq_region_end, + down_seq_region_start, down_seq_region_end + FROM flanking_sequence + WHERE variation_id = ? + }); + + $sth->bind_param(1,$variationID,SQL_INTEGER); + $sth->execute(); #retrieve the flank from the variation database + $sth->bind_columns(\($seq_region_id, $seq_region_strand, $up_seq, $down_seq, $up_seq_region_start, $up_seq_region_end, $down_seq_region_start, $down_seq_region_end)); + $sth->fetch(); + $sth->finish(); + + if (!defined $down_seq){ + if( $seq_region_id){ + my ($s, $e) = ($down_seq_region_start, $down_seq_region_end); + ($s, $e) = ($e, $s) if $e < $s; + $down_seq = $self->_get_flank_from_core($seq_region_id, + $s, + $e, + $seq_region_strand); + } else { + warn( "*****[ERROR]: No seq_region_id for SNP with dbID: $variationID. ". + "Cannot retrieve flanking region******\n" ); + } + } + if (!defined $up_seq){ + if( $seq_region_id){ + my ($s, $e) = ($up_seq_region_start, $up_seq_region_end); + ($s, $e) = ($e, $s) if $e < $s; + $up_seq = $self->_get_flank_from_core($seq_region_id, + $s, + $e, + $seq_region_strand); + } else { + warn( "*****[ERROR]: No seq_region_id for SNP with dbID: $variationID. ". + "Cannot retrieve flanking region******\n" ); + } + } + + push @{$flanking_sequence},$down_seq,$up_seq; #add to the array the 3 and 5 prime sequences + + return $flanking_sequence; +} + +=head2 fetch_all_by_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population + Arg [2] : $minimum_frequency (optional) + Example : $pop = $pop_adaptor->fetch_by_dbID(1345); + @vars = @{$va_adaptor->fetch_all_by_Population($pop)}; + Description: Retrieves all variations which are stored for a specified + population. If $minimum_frequency is supplied, only variations + with a minor allele frequency (MAF) greater than + $minimum_frequency will be returned. + Returntype : listref of Bio::EnsEMBL::Variation::Variation + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Population { + my $self = shift; + my $pop = shift; + my $freq = shift; + + assert_ref($pop,'Bio::EnsEMBL::Variation::Population'); + + if(!defined($pop->dbID())) { + warning("Cannot retrieve genotypes for population without set dbID"); + return []; + } + + # Constraint the query using the sample_id for the population + my $constraint = qq{a.sample_id = ?}; + + # adjust frequency if given a percentage + if (defined($freq)) { + + $freq /= 100 if $freq > 1; + $constraint .= qq{ AND (IF(a.frequency > 0.5, 1-a.frequency, a.frequency) > ?) }; + } + + # Add the constraint for failed variations + $constraint .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + # Bind the parameters + $self->bind_param_generic_fetch($pop->dbID(),SQL_INTEGER); + $self->bind_param_generic_fetch($freq,SQL_DOUBLE) if (defined($freq)); + + # Set the flag to indicate that we are constraining on population and should not left join to allele + $self->{'_constrain_sample'} = 1; + + # Execute the generic fetch + my $result = $self->generic_fetch($constraint); + + # Unset the flag + delete($self->{'_constrain_sample'}); + + return $result; +} + +=head2 fetch_all_by_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet + Example : @vars = @{$va_adaptor->fetch_all_by_VariationSet($vs)}; + Description: Retrieves all variations which are present in a specified + variation set and its subsets. + Returntype : listref of Bio::EnsEMBL::Variation::Variation + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_VariationSet { + my $self = shift; + return $self->_generic_fetch_by_VariationSet(0, @_); +} + +=head2 fetch_Iterator_by_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet + Example : $var_iterator = $va_adaptor->fetch_Iterator_by_VariationSet($vs); + Description: Retrieves an iterator for all variations which are present in a specified + variation set and its subsets. + Returntype : Bio::EnsEMBL::Utils::Iterator object + Exceptions : throw on incorrect argument + Caller : general + Status : Experimental + +=cut + +sub fetch_Iterator_by_VariationSet { + my $self = shift; + my $set = shift; + my $cache_size = shift || $DEFAULT_ITERATOR_CACHE_SIZE; + + # First, get ids for all subsets, + my @var_set_ids = ($set->dbID); + map {push(@var_set_ids,$_->dbID())} @{$set->adaptor->fetch_all_by_super_VariationSet($set)}; + my $var_set_id = join(",",@var_set_ids); + + # Prepare a query for getting the span of variation_ids + my $stmt = qq{ + FROM + variation_set_variation vsv LEFT JOIN + failed_variation fv ON ( + fv.variation_id = vsv.variation_id + ) + WHERE + vsv.variation_set_id IN ($var_set_id) + }; + + # Add the constraint for failed variations + my $constraint = " AND " . $self->db->_exclude_failed_variations_constraint(); + + my $sth = $self->prepare(qq{SELECT MIN(vsv.variation_id), MAX(vsv.variation_id) $stmt $constraint}); + $sth->execute(); + my ($min_variation_id,$max_variation_id); + $sth->bind_columns(\$min_variation_id,\$max_variation_id); + $sth->fetch(); + $max_variation_id ||= 0; + $min_variation_id ||= 1; + + # Prepare a statement for getting the ids in a range + $sth = $self->prepare(qq{SELECT vsv.variation_id $stmt AND vsv.variation_id BETWEEN ? AND ? $constraint}); + + # Internally, we keep an Iterator that works on the dbID span we're at + my $iterator; + + return Bio::EnsEMBL::Utils::Iterator->new(sub { + + # If the iterator is empty, get a new chunk of dbIDs, unless we've fetched all dbIDs + unless (defined($iterator) && $iterator->has_next() && $min_variation_id <= $max_variation_id) { + + # Get the next chunk of dbIDs + $sth->execute($min_variation_id,$min_variation_id+$cache_size); + $min_variation_id += ($cache_size + 1); + + # Use a hash to keep track of the seen dbIDs + my %seen; + + # Loop over the dbIDs and avoid duplicates + my $dbID; + my @dbIDs; + $sth->bind_columns(\$dbID); + while ($sth->fetch()) { + push (@dbIDs,$dbID) unless ($seen{$dbID}++); + } + + # Get a new Iterator based on the new dbID span + $iterator = $self->fetch_Iterator_by_dbID_list(\@dbIDs); + + } + + return $iterator->next(); + }); +} + +sub _generic_fetch_by_VariationSet { + my $self = shift; + my $want_iterator = shift; + my $set = shift; + + assert_ref($set,'Bio::EnsEMBL::Variation::VariationSet'); + + if(!defined($set->dbID())) { + warning("Cannot retrieve variations for variation set without a dbID"); + return []; + } + + # Get the unique dbIDs for all variations in this set and all of its subsets + my $dbid_list = $self->_fetch_all_dbIDs_by_VariationSet($set); + + my $num_vars = @$dbid_list; + + if ($num_vars > 100_000 && !$want_iterator) { + warn "This set contains a large number ($num_vars) of variations, these may not fit". + "into memory at once, considering using fetch_Iterator_by_VariationSet instead"; + } + + # Use the dbIDs to get all variations and return them + return $want_iterator ? + $self->fetch_Iterator_by_dbID_list($dbid_list) : + $self->fetch_all_by_dbID_list($dbid_list); +} + +sub _fetch_all_dbIDs_by_VariationSet { + my $self = shift; + my $set = shift; + + # First, get ids for all subsets, + + my @var_set_ids = ($set->dbID); + + foreach my $var_set (@{$set->adaptor->fetch_all_by_super_VariationSet($set)}) { + push @var_set_ids, $var_set->dbID; + } + + my $set_str = "(" . join(",",@var_set_ids) .")"; + + # Add the constraint for failed variations + my $constraint = " AND " . $self->db->_exclude_failed_variations_constraint(); + + # Then get the dbIDs for all these sets + my $stmt = qq{ + SELECT DISTINCT + vsv.variation_id + FROM + variation_set_variation vsv LEFT JOIN + failed_variation fv ON ( + fv.variation_id = vsv.variation_id + ) + WHERE + vsv.variation_set_id in $set_str + $constraint + }; + + my $sth = $self->prepare($stmt); + + $sth->execute(); + + my @result; + my $dbID; + + $sth->bind_columns(\$dbID); + + while ($sth->fetch()) { + push @result, $dbID; + } + + return \@result; +} + +=head2 is_failed + + Description : DEPRECATED. The appropriate subroutine on the Variation/Allele object should be used instead. + Exceptions : Thrown on invocation. + Status : DEPRECATED + +=cut + +sub is_failed { + my $self = shift; + + throw("The is_failed subroutine in VariationAdaptor is deprecated. Use the appropriate subroutine on the Variation/Allele object instead"); +} + +=head2 has_failed_subsnps + + Description : DEPRECATED. Use has_failed_alleles on the Variation object instead. + Exceptions : Thrown on invocation. + Status : DEPRECATED + +=cut + +sub has_failed_subsnps { + my $self = shift; + + throw("The has_failed_subsnps subroutine in VariationAdaptor is deprecated. Use has_failed_alleles on the Variation object instead"); +} + +=head2 get_all_failed_descriptions + + Arg[1] : Bio::EnsEMBL::Variation::Variation $variation + The variation object to get the failed descriptions for + Example : + my $failed_descriptions = $adaptor->get_all_failed_descriptions($var); + if (scalar(@{$failed_descriptions})) { + print "The variation '" . $var->name() . "' has been flagged as failed because '" . join("' and '",@{$failed_descriptions}) . "'\n"; + } + + Description : Gets the unique descriptions for the reasons why the supplied variation has failed. + ReturnType : reference to a list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_failed_descriptions { + my $self = shift; + my $variation = shift; + + # Call the internal get method without any constraints + my $description = $self->_internal_get_failed_descriptions($variation) || []; + + return $description; +} + + +=head2 get_failed_description + + Description : DEPRECATED. The appropriate subroutine on the Variation/Allele object should be used instead. + Exceptions : Thrown on invocation. + Status : DEPRECATED + +=cut + +sub get_failed_description { + my $self = shift; + + throw("The get_failed_description subroutine in VariationAdaptor is deprecated. Use the appropriate subroutine on the Variation/Allele object instead"); +} + +# API-internal method for getting failed descriptions for a variation +sub _internal_get_failed_descriptions { + my $self = shift; + my $variation = shift; + my $constraint = shift; + + # Assert that the object passed is a Variation + assert_ref($variation,'Bio::EnsEMBL::Variation::Variation'); + + my $stmt = qq{ + SELECT DISTINCT + fd.description + FROM + failed_variation fv JOIN + failed_description fd ON ( + fd.failed_description_id = fv.failed_description_id + ) + WHERE + fv.variation_id = ? + }; + $stmt .= qq{ AND $constraint } if (defined($constraint)); + + my $sth = $self->prepare($stmt); + $sth->execute($variation->dbID()); + + return [map {$_->[0]} @{$sth->fetchall_arrayref([0])}]; +} + +sub _get_flank_from_core{ + my $self = shift; + my $seq_region_id = shift; + my $seq_region_start = shift; + my $seq_region_end = shift; + my $seq_region_strand = shift; + + my $flanking_sequence; + if (defined $self->db()->dnadb()){ + my $slice_adaptor = $self->db()->dnadb()->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_seq_region_id($seq_region_id); + if (!$slice){ + throw("Not possible to obtain slice for seq_region_id \"$seq_region_id\"\n"); + } + my $flank = $slice->subseq($seq_region_start,$seq_region_end,$seq_region_strand); + return $slice->subseq($seq_region_start,$seq_region_end,$seq_region_strand); + } + return ''; +} + + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my %row; + + # Create the row hash using column names as keys + $sth->bind_columns( \( @row{ @{$sth->{NAME_lc} } } )); + + while ($sth->fetch) { + + # we don't actually store the returned object because + # the _obj_from_row method stores them in a temporary + # hash _temp_objs in $self + + $self->_obj_from_row(\%row); + } + + # Get the created objects from the temporary hash + my @objs = values %{ $self->{_temp_objs} }; + delete $self->{_temp_objs}; + + # If the flag for attaching allele objects was set, trigger loading of them by calling the getter method on the variation object + if ($self->{_load_alleles}) { + map {$_->get_all_Alleles()} @objs; + } + + # Return the created objects + return \@objs; +} + + +sub _obj_from_row { + + my ($self, $row) = @_; + + return undef unless $row->{variation_id}; + + # If the variation for this variation_id hasn't already been created, do that + my $obj = $self->{_temp_objs}{$row->{variation_id}}; + + unless (defined($obj)) { + + # Get the validation status + my @states; + if (defined($row->{v_validation_status})) { + @states = split(/,/,$row->{v_validation_status}); + } + + # Create the variation object + $obj = Bio::EnsEMBL::Variation::Variation->new( + -dbID => $row->{variation_id}, + -ADAPTOR => $self, + -NAME => $row->{v_name}, + -SOURCE => $row->{v_source_name}, + -SOURCE_DESCRIPTION => $row->{v_source_description}, + -SOURCE_URL => $row->{v_source_url}, + -SOURCE_TYPE => $row->{v_source_type}, + -IS_SOMATIC => $row->{v_somatic}, + -FLIPPED => $row->{v_flipped}, + -ANCESTRAL_ALLELE => $row->{v_ancestral_allele}, + -MOLTYPE => $row->{vs_moltype}, + -VALIDATION_STATES => \@states, + -FLANK_FLAG => $row->{fs_flank_flag}, + -CLASS_SO_TERM => $self->AttributeAdaptor()->attrib_value_for_id($row->{v_class_attrib_id}), + -CLINICAL_SIGNIFICANCE => $self->AttributeAdaptor->attrib_value_for_id($row->{clinical_significance_attrib_id}), + -MINOR_ALLELE => $row->{minor_allele}, + -MINOR_ALLELE_FREQUENCY => $row->{minor_allele_freq}, + -MINOR_ALLELE_COUNT => $row->{minor_allele_count}, + ); + + $self->{_temp_objs}{$row->{variation_id}} = $obj; + } + + # Add a synonym if available + if (defined($row->{vs_source_name}) && defined($row->{vs_name})) { + $obj->add_synonym($row->{vs_source_name},$row->{vs_name}); + } + +} + +=head2 load_alleles + + Arg[1] : boolean $load + A flag determining whether to load alleles at the same time as Variations are created (1) or to lazy-load them (0 = default) + Example : # Tell the VariationAdaptor to load alleles when creating variations + my $variation_adaptor->load_alleles(1); + my $variation = $variation_adaptor->fetch_by_dbID(1); + + Description : Sets the behaviour when it comes to fetching alleles at the time of object creation or on demand. This setting will be in effect for + all fetch methods for the lifespan of the adaptor or until reset. + ReturnType : none + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub load_alleles { + my $self = shift; + + $self->{_load_alleles} = defined(shift); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationAnnotationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationAnnotationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,481 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::VariationAnnotationAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::VariationAnnotationAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $vaa = $reg->get_adaptor("human","variation","variationannotation"); + $va = $reg->get_adaptor("human","variation","variation"); + + # Get a VariationAnotation by its internal identifier + $va = $vaa->fetch_by_dbID(45); + + # fetch all annotation for a particular variation + $v = $va->fetch_by_name('rs56'); + + foreach $va (@{$vaa->fetch_all_by_Variation($v)}) { + print $va->phenotype_name(), $va->phenotype_description(), $va->source_name(), $va->study_type(),"\n"; + } + +=head1 DESCRIPTION + +This adaptor provides database connectivity between Variation and VariationAnnotation objects. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::VariationAnnotationAdaptor; + +use Bio::EnsEMBL::Variation::Variation; +use Bio::EnsEMBL::Variation::VariationAnnotation; +use Bio::EnsEMBL::Variation::DBSQL::StudyAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation::Variation $var + Example : my @vas = @{$vaa->fetch_all_by_Variation($var)}; + Description: Retrieves all variation annotations for a given variation. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub fetch_all_by_Variation { + my $self = shift; + my $var = shift; + + if(!ref($var) || !$var->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation arg expected'); + } + + if(!defined($var->dbID())) { + throw("Variation arg must have defined dbID"); + } + + return $self->generic_fetch("va.variation_id = ".$var->dbID()); +} + +=head2 fetch_all_by_Variation_list + + Arg [1] : reference to a list of Bio::EnsEMBL::Variation::Variation objects + Example : my @vas = @{$vaa->fetch_all_by_Variation_list($vars)}; + Description: Retrieves all variation annotations for a given list of variations + Returntype : reference to a list of Bio::EnsEMBL::Variation::VariationAnnotation objects + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub fetch_all_by_Variation_list { + my $self = shift; + my $vars = shift; + + if(!ref($vars) || !$vars->[0]->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation arg expected'); + } + + if(!defined($vars->[0]->dbID())) { + throw("Variation arg must have defined dbID"); + } + + my $in_str = join ',', map {$_->dbID()} @$vars; + + return $self->generic_fetch("va.variation_id in (".$in_str.")"); +} + +=head2 fetch_all_by_VariationFeature_list + + Arg [1] : reference to a list of Bio::EnsEMBL::Variation::VariationFeature objects + Example : my @vas = @{$vaa->fetch_all_by_VariationFeature_list($vfs)}; + Description: Retrieves all variation annotations for a given list of variation features + Returntype : reference to a list Bio::EnsEMBL::Variation::VariationAnnotation objects + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_VariationFeature_list { + my $self = shift; + my $vfs = shift; + + if(!ref($vfs) || !$vfs->[0]->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + throw('Listref of Bio::EnsEMBL::Variation::VariationFeature arg expected'); + } + + if(!defined($vfs->[0]->dbID())) { + throw("VariationFeatures in list must have defined dbIDs"); + } + + my $in_str = join ',', map {$_->{'_variation_id'}} @$vfs; + + return $self->generic_fetch("va.variation_id in (".$in_str.")"); +} + + +=head2 fetch_all_by_Study + + Arg [1] : Bio::EnsEMBL:Variation::Study $study + Example : my @studies = @{$studya->fetch_all_by_Study($study)}; + Description: Retrieves all variation annotations for a given study. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Study { + my $self = shift; + my $study = shift; + + if(!ref($study) || !$study->isa('Bio::EnsEMBL::Variation::Study')) { + throw('Bio::EnsEMBL::Variation::Study arg expected'); + } + + if(!defined($study->dbID())) { + throw("Study arg must have defined dbID"); + } + return $self->generic_fetch("va.study_id = ".$study->dbID()); +} + + +=head2 fetch_all_by_phenotype_name_source_name + + Arg [1] : string $phenotype_name + Arg [2] : string $source_name (optional) + Example : $va = $va_adaptor->fetch_all_by_phenotype_source_name('BD','EGA'); + Description: Retrieves a variation annotation object via its phenotype/source name + Returntype : list of ref of Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : throw if phenotype name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_phenotype_name_source_name { + + my $self = shift; + my $phenotype_name = shift; + my $source_name = shift; + + throw('phenotype_name argument expected') if(!defined($phenotype_name)); + + my $extra_sql = " p.name = $phenotype_name "; + if (defined $source_name ) { + $extra_sql .= qq( AND s.name = '$source_name' ); + } + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch("$extra_sql"); + +} + +=head2 fetch_all_by_phenotype_description_source_name + + Arg [1] : string $phenotype_description + Arg [2] : string $source_name (optional) + Example : $va = $va_adaptor->fetch_all_by_phenotype_description_source_name('diabetes','EGA'); + Description: Retrieves a variation annotation object via its phenotype description/source name + Returntype : list of ref of Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : throw if phenotype name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_phenotype_description_source_name { + + my $self = shift; + my $phenotype_description = shift; + my $source_name = shift; + + throw('phenotype_description argument expected') if(!defined($phenotype_description)); + + my $extra_sql = qq( p.description like '%$phenotype_description%' ); + if (defined $source_name ) { + $extra_sql .= qq( AND s.name = '$source_name' ); + } + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch("$extra_sql"); + +} + +=head2 fetch_all_by_phenotype_id_source_name + + Arg [1] : integer $phenotype_id + Arg [2] : string $source_name (optional) + Example : $va = $va_adaptor->fetch_all_by_phenotype_id_source_name(999,'EGA'); + Description: Retrieves a variation annotation object via its phenotype id/source name + Returntype : list of ref of Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : throw if phenotype id argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_phenotype_id_source_name { + + my $self = shift; + my $phenotype_id = shift; + my $source_name = shift; + + throw('phenotype_id argument expected') if(!defined($phenotype_id)); + + my $extra_sql = sprintf('p.phenotype_id = %s', $self->dbc->db_handle->quote( $phenotype_id, SQL_INTEGER ) ); + + if (defined $source_name ) { + $extra_sql .= sprintf(" AND s.name = '%s'", $self->dbc->db_handle->quote( $source_name, SQL_VARCHAR ) ); + } + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch("$extra_sql"); + +} + + +=head2 fetch_all_by_associated_gene + + Arg [1] : string $gene_name + Example : $va = $va_adaptor->fetch_all_by_associated_gene('CAV3'); + Description: Retrieves the variation annotation objects via which are associated with the gene. + Returntype : list of ref of Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : throw if the gene_name argument is not defined + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_associated_gene { + + my $self = shift; + my $gene_name = shift; + + throw('gene_name argument expected') if(!defined($gene_name)); + + my $extra_sql = " va.associated_gene REGEXP '^(.+,)?[. .]*$gene_name(,.+)?\$'"; + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch($extra_sql); + +} + +=head2 count_all_by_associated_gene + + Description: Retrieves count of variation_annotation objects associated with a + given gene + Returntype : integer + Exceptions : none + Caller : general + +=cut + +sub count_all_by_associated_gene { + + my $self = shift; + my $gene_name = shift; + + throw('gene_name argument expected') if(!defined($gene_name)); + + my $extra_sql = " va.associated_gene REGEXP '^(.+,)?[. .]*$gene_name(,.+)?\$'"; + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_count($extra_sql); +} + + +=head2 fetch_all + + Description: Retrieves all available variation annotation objects + Returntype : list of ref of Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : none + Caller : general + +=cut + +sub fetch_all { + + my $self = shift; + + # Add the constraint for failed variations + my $extra_sql = $self->db->_exclude_failed_variations_constraint(); + + return $self->generic_fetch("$extra_sql"); + +} + +# method used by superclass to construct SQL +sub _tables { return (['variation_annotation', 'va'], + [ 'failed_variation', 'fv'], + [ 'phenotype', 'p'], + [ 'study', 'st'], + [ 'source', 's']); +} + +# Add a left join to the failed_variation table +sub _left_join { return ([ 'failed_variation', 'fv.variation_id = va.variation_id']); } + +sub _default_where_clause { + my $self = shift; + + return 'va.phenotype_id = p.phenotype_id AND st.source_id = s.source_id AND va.study_id=st.study_id'; +} + +sub _columns { + return qw( va.variation_annotation_id va.variation_id p.phenotype_id p.name p.description + va.study_id va.associated_gene va.associated_variant_risk_allele + va.variation_names va.risk_allele_freq_in_controls va.p_value + ); +} + + + +sub _objs_from_sth { + my ($self, $sth) = @_; + + my @features; + + my ($variation_annotation_id,$var_id,$phenotype_id,$phenotype_name,$phenotype_description, + $study_id,$associated_gene,$associated_variant_risk_allele,$variation_names, + $risk_allele_freq_in_controls,$p_value,$last_va_id,$study); + $sth->bind_columns(\$variation_annotation_id,\$var_id,\$phenotype_id,\$phenotype_name, + \$phenotype_description,\$study_id, + \$associated_gene,\$associated_variant_risk_allele,\$variation_names, + \$risk_allele_freq_in_controls,\$p_value); + + my $sta = $self->db()->get_StudyAdaptor(); + + while($sth->fetch()) { + + next if (defined($last_va_id) && $last_va_id == $variation_annotation_id); + $last_va_id = $variation_annotation_id; + + $study = $sta->fetch_by_dbID($study_id); + + push @features, $self->_create_feature_fast('Bio::EnsEMBL::Variation::VariationAnnotation', + + {'dbID' => $variation_annotation_id, + '_variation_id' => $var_id, + '_phenotype_id' => $phenotype_id, + 'phenotype_name' => $phenotype_name, + 'phenotype_description' => $phenotype_description, + 'associated_gene' => $associated_gene, + 'associated_variant_risk_allele' => $associated_variant_risk_allele, + 'variation_names' => $variation_names, + 'risk_allele_freq_in_controls' => $risk_allele_freq_in_controls, + 'p_value' => $p_value, + 'adaptor' => $self, + 'study' => $study, + }); + } + + return \@features; + +} + + +=head2 fetch_phenotype_description_by_id + + Arg [1] : int $phenotype_id + Example : $phenotype = $va_adaptor->fetch_phenotype_description(10); + Description: Retrieves the phenotype description from the phenotype ID + Returntype : string + Exceptions : throw if the phenotype_id argument is not defined + Caller : general + +=cut + +sub fetch_phenotype_description_by_id { + my $self = shift; + my $phenotype_id = shift; + + throw('phenotype_id argument expected') if(!defined($phenotype_id)); + + my $sth = $self->prepare(qq{SELECT description FROM phenotype WHERE phenotype_id = ?}); + $sth->bind_param(1,$phenotype_id,SQL_INTEGER); + $sth->execute(); + + return ($sth->fetchrow_array)[0]; +} + + +=head2 fetch_annotation_number_by_phenotype_id + + Arg [1] : int $phenotype_id + Example : $phenotype = $va_adaptor->fetch_annotation_number_by_phenotype_id(10); + Description: Retrieves the number of variation annotation with the given phenotype ID + Returntype : integer + Exceptions : throw if the phenotype_id argument is not defined + Caller : general + +=cut + +sub fetch_annotation_number_by_phenotype_id { + my $self = shift; + my $phenotype_id = shift; + + throw('phenotype_id argument expected') if(!defined($phenotype_id)); + + my $sth = $self->prepare(qq{SELECT count(variation_annotation_id) FROM variation_annotation WHERE phenotype_id = ?}); + $sth->bind_param(1,$phenotype_id,SQL_INTEGER); + $sth->execute(); + + return ($sth->fetchrow_array)[0]; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationFeatureAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1738 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor +# +# Copyright (c) 2004 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor + +=head1 SYNOPSIS + $reg = 'Bio::EnsEMBL::Registry'; + + $reg->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous'); + + $vfa = $reg->get_adaptor("human","variation","variationfeature"); + $sa = $reg->get_adaptor("human","core","slice"); + $va = $reg->get_adaptor("human","variation","variation"); + + # Get a VariationFeature by its internal identifier + $vf = $va->fetch_by_dbID(145); + + # Include the variations that have been flagged as failed in the fetch + $vfa->db->include_failed_variations(1); + + # get all VariationFeatures in a region + $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); + foreach $vf (@{$vfa->fetch_all_by_Slice($slice)}) { + print $vf->start(), '-', $vf->end(), ' ', $vf->allele_string(), "\n"; + } + + # fetch all genome hits for a particular variation + $v = $va->fetch_by_name('rs56'); + + foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) { + print $vf->seq_region_name(), $vf->seq_region_start(), '-', + $vf->seq_region_end(),"\n"; + } + +=head1 DESCRIPTION + +This adaptor provides database connectivity for VariationFeature objects. +Genomic locations of variations can be obtained from the database using this +adaptor. See the base class BaseFeatureAdaptor for more information. +By default, the 'fetch_all_by_...'-methods will not return variations +that have been flagged as failed in the Ensembl QC. This behaviour can be modified +by setting the include_failed_variations flag in Bio::EnsEMBL::Variation::DBSQL::DBAdaptor. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor; + +use Bio::EnsEMBL::Variation::Allele; +use Bio::EnsEMBL::Variation::VariationFeature; +use Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Iterator; +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_validation_code); + +our @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor', 'Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); +our $MAX_VARIATION_SET_ID = 64; +our $DEBUG =0; +sub store { + my ($self, $vf) = @_; + + my $dbh = $self->dbc->db_handle; + + # look up source_id + if(!defined($vf->{source_id})) { + my $sth = $dbh->prepare(q{ + SELECT source_id FROM source WHERE name = ? + }); + $sth->execute($vf->{source}); + + my $source_id; + $sth->bind_columns(\$source_id); + $sth->fetch(); + $sth->finish(); + $vf->{source_id} = $source_id; + } + + my $sth = $dbh->prepare(q{ + INSERT INTO variation_feature ( + seq_region_id, + seq_region_start, + seq_region_end, + seq_region_strand, + variation_id, + allele_string, + variation_name, + map_weight, + flags, + source_id, + validation_status, + consequence_types, + variation_set_id, + class_attrib_id, + somatic, + minor_allele, + minor_allele_freq, + minor_allele_count + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) + }); + + $sth->execute( + defined($vf->{seq_region_id}) ? $vf->{seq_region_id} : $vf->slice->get_seq_region_id, + $vf->{slice} ? $vf->seq_region_start : $vf->{start}, + $vf->{slice} ? $vf->seq_region_end : $vf->{end}, + $vf->strand, + $vf->variation ? $vf->variation->dbID : $vf->{_variation_id}, + $vf->allele_string, + $vf->variation_name, + $vf->map_weight || 1, + $vf->{flags}, + $vf->{source_id}, + (join ",", @{$vf->get_all_validation_states}) || undef, + $vf->{slice} ? (join ",", @{$vf->consequence_type('SO')}) : 'intergenic_variant', + $vf->{variation_set_id} || '', + $vf->{class_attrib_id} || $vf->adaptor->db->get_AttributeAdaptor->attrib_id_for_type_value('SO_term', $vf->{class_SO_term}) || 18, + $vf->is_somatic, + $vf->minor_allele, + $vf->minor_allele_frequency, + $vf->minor_allele_count, + ); + + $sth->finish; + + # get dbID + my $dbID = $dbh->last_insert_id(undef, undef, 'variation_feature', 'variation_feature_id'); + $vf->{dbID} = $dbID; + $vf->{adaptor} = $self; +} + +=head2 fetch_all + + Description: Returns a listref of all germline variation features + Returntype : listref of VariationFeatures + Status : At risk + +=cut + +sub fetch_all { + my $self = shift; + my $constraint = 'vf.somatic = 0'; + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_somatic + + Description: Returns a listref of all somatic variation features + Returntype : listref of VariationFeatures + Status : At risk + +=cut + +sub fetch_all_somatic { + my $self = shift; + my $constraint = 'vf.somatic = 1'; + return $self->generic_fetch($constraint); +} + +=head2 fetch_all_by_Slice_constraint + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Description: Returns a listref of germline variation features created + from the database which are on the Slice defined by $slice + and fulfill the SQL constraint defined by $constraint. + Returntype : listref of VariationFeatures + Exceptions : thrown if $slice is not defined + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_by_Slice_constraint { + my ($self, $slice, $constraint) = @_; + + # by default, filter outsomatic mutations + my $somatic_constraint = 'vf.somatic = 0'; + + if ($constraint) { + $constraint .= " AND $somatic_constraint"; + } + else { + $constraint = $somatic_constraint; + } + + # Add the constraint for failed variations + $constraint .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); +} + +sub fetch_all_by_Slice_constraint_with_Variations { + my $self = shift; + $self->{_get_variations} = 1; + my $vfs = $self->fetch_all_by_Slice_constraint(@_); + $self->{_get_variations} = 0; + return $vfs; +} + +sub fetch_all_by_Slice_constraint_with_TranscriptVariations { + my $self = shift; + $self->{_get_transcript_variations} = 1; + my $vfs = $self->fetch_all_by_Slice_constraint(@_); + $self->{_get_transcript_variations} = 0; + return $vfs; +} + +sub fetch_all_somatic_by_Slice_constraint_with_TranscriptVariations { + my $self = shift; + $self->{_get_transcript_variations} = 1; + my $vfs = $self->fetch_all_somatic_by_Slice_constraint(@_); + $self->{_get_transcript_variations} = 0; + return $vfs; +} + +=head2 fetch_all_somatic_by_Slice_constraint + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Arg [2] : (optional) string $constraint + An SQL query constraint (i.e. part of the WHERE clause) + Description: Returns a listref of somatic variation features created + from the database which are on the Slice defined by $slice + and fulfill the SQL constraint defined by $constraint. + Returntype : listref of VariationFeatures + Exceptions : thrown if $slice is not defined + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_somatic_by_Slice_constraint { + my ($self, $slice, $constraint) = @_; + + my $somatic_constraint = 'vf.somatic = 1'; + + if ($constraint) { + $constraint .= " AND $somatic_constraint"; + } + else { + $constraint = $somatic_constraint; + } + + # Add the constraint for failed variations + $constraint .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + return $self->SUPER::fetch_all_by_Slice_constraint($slice, $constraint); +} + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Example : my $vfs = $vfa->fetch_all_by_Slice($slice); + Description: Retrieves all variation features on the given Slice. + NOTE: only germline variations will be returned, if you want + somatic mutations use the fetch_all_somatic_by_Slice method. + Returntype : listref of Bio::EnsEMBL::VariationFeatures + Exceptions : none + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice) = @_; + return $self->fetch_all_by_Slice_constraint($slice, ''); +} + +=head2 fetch_all_somatic_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + the slice from which to obtain features + Example : my $svfs = $vfa->fetch_all_somatic_by_Slice($slice); + Description: Retrieves a list of variation features representing somatic mutations on the given Slice. + Returntype : listref of Bio::EnsEMBL::VariationFeatures + Exceptions : none + Caller : Bio::EnsEMBL::Slice + Status : Stable + +=cut + +sub fetch_all_somatic_by_Slice { + my ($self, $slice) = @_; + return $self->fetch_all_somatic_by_Slice_constraint($slice, ''); +} + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL:Variation::Variation $var + Example : my @vfs = @{$vfa->fetch_all_by_Variation($var)}; + Description: Retrieves all variation features for a given variation. Most + variations should only hit the genome once and only a return + a single variation feature. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + + +sub fetch_all_by_Variation { + my $self = shift; + my $var = shift; + + if(!ref($var) || !$var->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation arg expected'); + } + + if(!defined($var->dbID())) { + throw("Variation arg must have defined dbID"); + } + + return $self->generic_fetch("vf.variation_id = ".$var->dbID()); +} + +=head2 fetch_all_genotyped_by_Slice + + Arg [1] : Bio::EnsEMBL:Variation::Slice $slice + Example : my @vfs = @{$vfa->fetch_all_genotyped_by_Slice($slice)}; + Description: Retrieves all variation features that have been gentoyped for a given slice. + Most variations should only hit the genome once and only a return + a single variation feature. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + +sub fetch_all_genotyped_by_Slice{ + my $self = shift; + my $slice = shift; + + my $constraint = "vf.flags & 1 AND vf.somatic = 0"; + #call the method fetch_all_by_Slice_constraint with the genotyped constraint + return $self->fetch_all_by_Slice_constraint($slice,$constraint); +} + +sub _internal_fetch_all_with_annotation_by_Slice{ + + my $self = shift; + my $slice = shift; + my $v_source = shift; + my $p_source = shift; + my $annotation = shift; + my $constraint = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + my $extra_sql = ''; + + if(defined $v_source) { + $extra_sql .= qq{ AND s.name = '$v_source' }; + } + + if(defined $p_source) { + $extra_sql .= qq{ AND ps.name = '$p_source' }; + } + + if(defined $annotation) { + if($annotation =~ /^[0-9]+$/) { + $extra_sql .= qq{ AND p.phenotype_id = $annotation }; + } + else { + $extra_sql .= qq{ AND (p.name = '$annotation' OR p.description LIKE '%$annotation%') }; + } + } + + if ($constraint) { + $extra_sql .= qq{ AND $constraint }; + } + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + my $cols = join ",", $self->_columns(); + + my $sth = $self->prepare(qq{ + SELECT $cols + FROM (variation_feature vf, variation_annotation va, + phenotype p, source s, source ps, study st) # need to link twice to source + LEFT JOIN failed_variation fv ON (fv.variation_id = vf.variation_id) + WHERE va.study_id = st.study_id + AND st.source_id = ps.source_id + AND vf.source_id = s.source_id + AND vf.variation_id = va.variation_id + AND va.phenotype_id = p.phenotype_id + $extra_sql + AND vf.seq_region_id = ? + AND vf.seq_region_end > ? + AND vf.seq_region_start < ? + GROUP BY vf.variation_feature_id + }); + + $sth->execute($slice->get_seq_region_id, $slice->start, $slice->end); + + return $self->_objs_from_sth($sth, undef, $slice); +} + +=head2 fetch_all_with_annotation_by_Slice + + Arg [1] : Bio::EnsEMBL:Variation::Slice $slice + Arg [2] : $variation_feature_source [optional] + Arg [3] : $annotation_source [optional] + Arg [4] : $annotation_name [optional] + Example : my @vfs = @{$vfa->fetch_all_with_annotation_by_Slice($slice)}; + Description: Retrieves all germline variation features associated with annotations for + a given slice. + The optional $variation_feature_source argument can be used to + retrieve only variation features from a paricular source. + The optional $annotation source argument can be used to + retrieve only variation features with annotations provided by + a particular source. + The optional $annotation_name argument can + be used to retrieve only variation features associated with + that annotation - this can also be a phenotype's dbID. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + +sub fetch_all_with_annotation_by_Slice { + my $self = shift; + my ($slice, $v_source, $p_source, $annotation) = @_; + my $constraint = 'vf.somatic = 0'; + return $self->_internal_fetch_all_with_annotation_by_Slice($slice, $v_source, $p_source, $annotation, $constraint); +} + +=head2 fetch_all_somatic_with_annotation_by_Slice + + Arg [1] : Bio::EnsEMBL:Variation::Slice $slice + Arg [2] : $variation_feature_source [optional] + Arg [3] : $annotation_source [optional] + Arg [4] : $annotation_name [optional] + Example : my @vfs = @{$vfa->fetch_all_somatic_with_annotation_by_Slice($slice)}; + Description: Retrieves all somatic variation features associated with annotations for + a given slice. + (see fetch_all_with_annotation_by_Slice documentation for description of + the other parameters) + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + +sub fetch_all_somatic_with_annotation_by_Slice { + my $self = shift; + my ($slice, $v_source, $p_source, $annotation) = @_; + my $constraint = 'vf.somatic = 1'; + return $self->_internal_fetch_all_with_annotation_by_Slice($slice, $v_source, $p_source, $annotation, $constraint); +} + +=head2 fetch_all_by_Slice_VariationSet + + Arg [1] : Bio::EnsEMBL:Variation::Slice $slice + Arg [2] : Bio::EnsEMBL:Variation::VariationSet $set + Example : my @vfs = +@{$vfa->fetch_all_by_Slice_VariationSet($slice, $set)}; + Description: Retrieves all variation features in a slice that belong to a + given variation set. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice_VariationSet{ + + my $self = shift; + my $slice = shift; + my $set = shift; + + #$self->{_get_variations} = 1; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + if(!ref($set) || !$set->isa('Bio::EnsEMBL::Variation::VariationSet')) { + throw('Bio::EnsEMBL::Variation::VariationSet arg expected'); + } + + # fix for failed sets + my $failed = $self->db->include_failed_variations; + $self->db->include_failed_variations(1) if $failed == 0 && $set->name =~ /fail/; + + #ÊGet the bitvalue for this set and its subsets + my $bitvalue = $set->_get_bitvalue(); + + # Add a constraint to only return VariationFeatures having the primary keys of the supplied VariationSet or its subsets in the variation_set_id column + my $constraint = " vf.variation_set_id & $bitvalue "; + + #ÊGet the VariationFeatures by calling fetch_all_by_Slice_constraint + my $vfs = $self->fetch_all_by_Slice_constraint($slice,$constraint); + + # restore failed fetch flag + $self->db->include_failed_variations($failed); + + $self->{_get_variations} = 0; + + return $vfs; +} + + +=head2 fetch_all_by_Slice_Population + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : Bio::EnsEMBL::Variation::Population + Arg [3] : $minimum_frequency (optional) + Example : $pop = $pop_adaptor->fetch_by_dbID(659); + $slice = $slice_adaptor->fetch_by_region("chromosome", 1, 1, 1e6); + @vfs = @{$vf_adaptor->fetch_all_by_Slice_Population($pop,$slice)}; + Description: Retrieves all variation features in a slice which are stored for + a specified population. If $minimum_frequency is supplied, only + variations with a minor allele frequency (MAF) greater than + $minimum_frequency will be returned. + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Slice_Population { + my $self = shift; + + my $slice = shift; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + my $pop = shift; + + if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population arg expected'); + } + + # default to 5% frequency + my $freq = shift; + my $extra_sql = ''; + + if(defined $freq) { + + # adjust frequency if given a percentage + $freq /= 100 if $freq > 1; + $extra_sql = qq{ AND (IF(a.frequency > 0.5, 1-a.frequency, a.frequency) > $freq) } + } + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + my $cols = join ",", $self->_columns(); + + my $sth = $self->prepare(qq{ + SELECT $cols + FROM (variation_feature vf, source s, allele a) + LEFT JOIN failed_variation fv ON (fv.variation_id = vf.variation_id) + WHERE vf.source_id = s.source_id + AND vf.variation_id = a.variation_id + AND a.sample_id = ? + $extra_sql + AND vf.seq_region_id = ? + AND vf.seq_region_end >= ? + AND vf.seq_region_start <= ? + GROUP BY a.variation_id + }); + + $sth->execute($pop->dbID, $slice->get_seq_region_id, $slice->start, $slice->end); + +##### ALTERNATIVE USING SUB-QUERY - WORKS FOR MULTIALLELIC BUT TAKES FOREVER!!! +# my $cols = join ",", $self->_columns(); +# +# my $sth = $self->prepare(qq{ +# SELECT $cols +# FROM variation_feature vf, source s, allele a_outer +# WHERE +# vf.source_id = s.source_id AND +# vf.variation_id = a_outer.variation_id AND +# a_outer.sample_id = ? AND +# a_outer.frequency > ? AND +# a_outer.frequency < 0.5 AND +# vf.seq_region_id = ? AND +# vf.seq_region_end >= ? AND +# vf.seq_region_start <= ? AND +# NOT EXISTS ( +# SELECT +# * +# FROM +# allele a_inner +# WHERE +# a_inner.variation_id = a_outer.variation_id AND +# a_inner.sample_id = a_outer.sample_id AND +# a_inner.frequency < a_outer.frequency +# ); +# }); + + #$sth->execute($pop->dbID, $freq, $slice->get_seq_region_id, $slice->start, $slice->end); + + return $self->_objs_from_sth($sth, undef, $slice); +} + +sub _internal_fetch_all_with_annotation { + + my ($self, $v_source, $p_source, $annotation, $constraint) = @_; + + my $extra_sql = ''; + + if(defined $v_source) { + $extra_sql .= qq{ AND s.name = '$v_source' }; + } + + if(defined $p_source) { + $extra_sql .= qq{ AND ps.name = '$p_source' }; + } + + if(defined $annotation) { + if($annotation =~ /^[0-9]+$/) { + $extra_sql .= qq{ AND p.phenotype_id = $annotation }; + } + else { + $extra_sql .= qq{ AND (p.name = '$annotation' OR p.description LIKE '%$annotation%') }; + } + } + + if ($constraint) { + $extra_sql .= qq{ AND $constraint }; + } + + # Add the constraint for failed variations + $extra_sql .= " AND " . $self->db->_exclude_failed_variations_constraint(); + + my $cols = join ",", $self->_columns(); + + my $sth = $self->prepare(qq{ + SELECT $cols + FROM (variation_feature vf, variation_annotation va, + phenotype p, source s, source ps, study st) # need to link twice to source + LEFT JOIN failed_variation fv ON (fv.variation_id = vf.variation_id) + WHERE va.study_id = st.study_id + AND st.source_id = ps.source_id + AND vf.source_id = s.source_id + AND vf.variation_id = va.variation_id + AND va.phenotype_id = p.phenotype_id + $extra_sql + GROUP BY vf.variation_feature_id + }); + + $sth->execute; + + return $self->_objs_from_sth($sth); +} + +=head2 fetch_all_with_annotation + + Arg [1] : $variation_feature_source [optional] + Arg [2] : $annotation_source [optional] + Arg [3] : $annotation_name [optional] + Example : my @vfs = @{$vfa->fetch_all_with_annotation('EGA', undef, 123)}; + Description: Retrieves all germline variation features associated with the given annotation + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Caller : webcode + Status : Experimental + +=cut + +sub fetch_all_with_annotation { + + my ($self, $v_source, $p_source, $annotation, $constraint) = @_; + + my $somatic_constraint = 'vf.somatic = 0'; + + if ($constraint) { + $constraint .= " AND $somatic_constraint"; + } + else { + $constraint = $somatic_constraint; + } + + return $self->_internal_fetch_all_with_annotation($v_source, $p_source, $annotation, $constraint); +} + +=head2 fetch_all_somatic_with_annotation + + Arg [1] : $variation_feature_source [optional] + Arg [2] : $annotation_source [optional] + Arg [3] : $annotation_name [optional] + Example : my @vfs = @{$vfa->fetch_all_somatic_with_annotation('COSMIC', undef, 807)}; + Description: Retrieves all somatic variation features associated with the given annotation + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Caller : webcode + Status : Experimental + +=cut + +sub fetch_all_somatic_with_annotation { + + my ($self, $v_source, $p_source, $annotation, $constraint) = @_; + + my $somatic_constraint = 'vf.somatic = 1'; + + if ($constraint) { + $constraint .= " AND $somatic_constraint"; + } + else { + $constraint = $somatic_constraint; + } + + return $self->_internal_fetch_all_with_annotation($v_source, $p_source, $annotation, $constraint); +} + + +=head2 fetch_all_tagged_by_VariationFeature + + Args : Bio::EnsEMBL::Variation::Population $pop (optional) + Example : my $vfs = $vfa->fetch_all_tagged_by_VariationFeature(); + Description : Returns an arrayref of variation features that are tagged by this + variation feature, in the population $pop if specified. + ReturnType : list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_tagged_by_VariationFeature { + my ($self, $vf, $pop) = @_; + return $self->_tag_fetch($vf, $pop, 'tagged'); +} + + +=head2 fetch_all_tags_by_VariationFeature + + Args : Bio::EnsEMBL::Variation::Population $pop (optional) + Example : my $vfs = $vfa->fetch_all_tags_by_VariationFeature(); + Description : Returns an arrayref of variation features that tag this + variation feature, in the population $pop if specified. + ReturnType : list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_tags_by_VariationFeature { + my ($self, $vf, $pop) = @_; + return $self->_tag_fetch($vf, $pop, 'tag'); +} + + +=head2 fetch_all_tags_and_tagged_by_VariationFeature + + Args : Bio::EnsEMBL::Variation::Population $pop (optional) + Example : my $vfs = $vfa->fetch_all_tags_and_tagged_by_VariationFeature(); + Description : Returns an arrayref of variation features that either tag or are + tagged by this variation feature, in the population $pop if + specified. + ReturnType : list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_tags_and_tagged_by_VariationFeature { + my ($self, $vf, $pop) = @_; + my $return = $self->_tag_fetch($vf, $pop, 'tag'); + push @$return, @{$self->_tag_fetch($vf, $pop, 'tagged')}; + return $return; +} + +sub _tag_fetch { + my ($self, $vf, $pop, $type) = @_; + + assert_ref($vf, 'Bio::EnsEMBL::Variation::VariationFeature'); + assert_ref($pop, 'Bio::EnsEMBL::Variation::Population') if defined $pop; + + # set a flag to tell the query construction methods to include the tagged_variation_feature table + $self->{tag} = $type; + + # construct a constraint + my $opp_type = $type eq 'tag' ? 'tagged_' : ''; + my $constraint = "tvf.".$opp_type."variation_feature_id = ".$vf->dbID; + $constraint .= ' AND tvf.sample_id = '.$pop->dbID if defined $pop; + + # fetch features here so we can reset the tag flag + my $features = $self->generic_fetch($constraint); + + delete $self->{tag}; + + return $features; +} + + + +=head2 fetch_all_by_Slice_SO_terms + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : listref of SO terms + Description: Fetch all germline VariationFeatures on the given slice with + consequences with given SO terms + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatures + Status : At risk + +=cut + +sub fetch_all_by_Slice_SO_terms { + my ($self, $slice, $terms, $without_children, $included_so) = @_; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + if(!defined($terms) || scalar @$terms == 0) { + return $self->fetch_all_by_Slice($slice); + } + + my $constraint = $self->_get_consequence_constraint($terms, $without_children, $included_so); + if (!$constraint) { + return []; + } + + my $vfs = $self->fetch_all_by_Slice_constraint($slice,$constraint); + + return $vfs; +} + +=head2 fetch_all_somatic_by_Slice_SO_terms + + Arg [1] : Bio::EnsEMBL::Slice + Arg [2] : listref of SO terms + Description: Fetch all somatic VariationFeatures on the given slice with + consequences with given SO terms + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatures + Status : At risk + +=cut + +sub fetch_all_somatic_by_Slice_SO_terms { + my ($self, $slice, $terms, $without_children, $included_so) = @_; + + if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { + throw('Bio::EnsEMBL::Slice arg expected'); + } + + if(!defined($terms) || scalar @$terms == 0) { + return $self->fetch_all_somatic_by_Slice($slice); + } + + my $constraint = $self->_get_consequence_constraint($terms, $without_children, $included_so); + if (!$constraint) { + return []; + } + + my $vfs = $self->fetch_all_somatic_by_Slice_constraint($slice,$constraint); + + return $vfs; +} + +# call to method in BaseAdaptor +sub _get_consequence_constraint { + my $self = shift; + return $self->SUPER::_get_consequence_constraint('variation_feature', @_); +} + +sub fetch_Iterator_by_Slice_constraint { + my ($self, $slice, $constraint) = @_; + + $self->{_iterator} = 1; + + my $iterator = $self->fetch_all_by_Slice_constraint($slice, $constraint); + + $self->{_iterator} = 0; + + return $iterator; +} + +# method used by import VCF script +sub _fetch_all_by_coords { + my ($self, $seq_region_id, $start, $end, $somatic) = @_; + + $somatic ||= 0; + + return $self->generic_fetch(qq{ + vf.seq_region_id = $seq_region_id AND + vf.seq_region_start = $start AND + vf.seq_region_end = $end AND + vf.somatic = $somatic + }); +} + +# method used by superclass to construct SQL +sub _tables { + my $self = shift; + + my @tables = ( + [ 'variation_feature', 'vf', 'IGNORE INDEX(consequence_type_idx)'], + [ 'source', 's'] + ); + + # If we are including failed_variations, add that table + push(@tables,['failed_variation', 'fv']) unless ($self->db->include_failed_variations()); + + # add bits for tagged variation feature + push @tables, ['tagged_variation_feature', 'tvf'] if defined $self->{tag}; + + return @tables; +} + +#ÊAdd a left join to the failed_variation table +sub _left_join { + my $self = shift; + + # If we are including failed variations, skip the left join + return () if ($self->db->include_failed_variations()); + return ([ 'failed_variation', 'fv.variation_id = vf.variation_id']); +} + +sub _default_where_clause { + my $self = shift; + + my $clause = 'vf.source_id = s.source_id'; + + # add bits for tagged variation feature + $clause .= + ' AND tvf.'. + ($self->{tag} eq 'tagged' ? $self->{tag}.'_' : ''). + 'variation_feature_id = vf.variation_feature_id' + if defined $self->{tag}; + + return $clause; +} + +sub _columns { + return qw( vf.variation_feature_id vf.seq_region_id vf.seq_region_start + vf.seq_region_end vf.seq_region_strand vf.variation_id + vf.allele_string vf.variation_name vf.map_weight s.name s.version vf.somatic + vf.validation_status vf.consequence_types vf.class_attrib_id + vf.minor_allele vf.minor_allele_freq vf.minor_allele_count); +} + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + #warn $sth->sql; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->dnadb()->get_SliceAdaptor(); + + my $aa = $self->db->get_AttributeAdaptor; + + my @features; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my ($variation_feature_id, $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand, $variation_id, + $allele_string, $variation_name, $map_weight, $source_name, $source_version, + $is_somatic, $validation_status, $consequence_types, $class_attrib_id, + $minor_allele, $minor_allele_freq, $minor_allele_count, $last_vf_id); + + $sth->bind_columns(\$variation_feature_id, \$seq_region_id, + \$seq_region_start, \$seq_region_end, \$seq_region_strand, + \$variation_id, \$allele_string, \$variation_name, + \$map_weight, \$source_name, \$source_version, \$is_somatic, \$validation_status, + \$consequence_types, \$class_attrib_id, + \$minor_allele, \$minor_allele_freq, \$minor_allele_count); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + } + + my $finished = 0; + + my $iterator = Bio::EnsEMBL::Utils::Iterator->new(sub{ + + return undef if $finished; + + FEATURE: while( $sth->fetch ) { + + # Skip if we are getting multiple rows because of the left join to failed variation + next if (defined($last_vf_id) && $last_vf_id == $variation_feature_id); + $last_vf_id = $variation_feature_id; + + #get the slice object + my $slice = $slice_hash{"ID:".$seq_region_id}; + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + # remap the feature coordinates to another coord system + # if a mapper was provided + + if($mapper) { + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + + ($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) = + $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs); + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($sr_name)); + + #get a slice in the coord system we just mapped to + if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||= + $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef, + $cmp_cs_vers); + } else { + $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= + $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, + $asm_cs_vers); + } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { + next FEATURE; + } + } + $slice = $dest_slice; + } + my $validation_code; + if (defined($validation_status)) { + $validation_code = get_validation_code([split(',',$validation_status)]); + } + + #my $overlap_consequences = $self->_variation_feature_consequences_for_set_number($consequence_types); + + my $overlap_consequences = [ map { $OVERLAP_CONSEQUENCES{$_} } split /,/, $consequence_types ]; + + # consequence_types + return $self->_create_feature_fast('Bio::EnsEMBL::Variation::VariationFeature', + #push @features, Bio::EnsEMBL::Variation::VariationFeature->new_fast( + #if use new_fast, then do not need "-" infront of key, i.e 'start' => $seq_region_start, + + {'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'allele_string' => $allele_string, + 'variation_name' => $variation_name, + 'adaptor' => $self, + 'dbID' => $variation_feature_id, + 'map_weight' => $map_weight, + 'source' => $source_name, + 'source_version' => $source_version, + 'is_somatic' => $is_somatic, + 'validation_code' => $validation_code, + 'overlap_consequences' => $overlap_consequences, + '_variation_id' => $variation_id, + 'class_SO_term' => $aa->attrib_value_for_id($class_attrib_id), + 'minor_allele' => $minor_allele, + 'minor_allele_frequency' => $minor_allele_freq, + 'minor_allele_count' => $minor_allele_count + } + ); + } + + unless ($finished) { + $sth->finish; + $finished = 1; + } + + return undef; + }); + + if ($self->{_iterator}) { + return $iterator; + } + else { + if ($self->{_get_variations}) { + my $vfs = $iterator->to_arrayref; + my @v_ids = map { $_->{_variation_id} } @$vfs; + my $vs = $self->db->get_VariationAdaptor->fetch_all_by_dbID_list(\@v_ids); + my %vs_by_id = map { $_->dbID => $_ } @$vs; + #warn "Got variations"; + map { $_->variation( $vs_by_id{ $_->{_variation_id} }) } @$vfs; + return $vfs; + } + if ($self->{_get_transcript_variations}) { + my $vfs = $iterator->to_arrayref; + return $vfs unless @$vfs; + #warn "getting transcript variations"; + my $tvs = $self->db->get_TranscriptVariationAdaptor->fetch_all_by_VariationFeatures($vfs); + for my $tv (@$tvs) { + $tv->variation_feature->add_TranscriptVariation($tv); + #$tv->variation_feature->{transcript_variations}->{$tv->transcript_stable_id} = $tv; + } + return $vfs; + } + else { + my $vfs = $iterator->to_arrayref; + #warn "Got ".scalar(@$vfs). "VFs"; + return $vfs; + } + } +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$simple_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all simple features in + the current db + Returntype : list of ints + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub list_dbIDs { + my $self = shift; + return $self->_list_dbIDs('variation_feature'); +} + + +=head2 get_all_synonym_sources + + Args[1] : Bio::EnsEMBL::Variation::VariationFeature vf + Example : my @sources = @{$vf_adaptor->get_all_synonym_sources($vf)}; + Description : returns a list of all the sources for synonyms of this + VariationFeature + ReturnType : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + : Variation database is under development. + +=cut + +sub get_all_synonym_sources{ + my $self = shift; + my $vf = shift; + my %sources; + my @sources; + + if(!ref($vf) || !$vf->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + throw("Bio::EnsEMBL::Variation::VariationFeature argument expected"); + } + + if (!defined($vf->{'_variation_id'}) && !defined($vf->{'variation'})){ + warning("Not possible to get synonym sources for the VariationFeature: you need to attach a Variation first"); + return \@sources; + } + #get the variation_id + my $variation_id; + if (defined ($vf->{'_variation_id'})){ + $variation_id = $vf->{'_variation_id'}; + } + else{ + $variation_id = $vf->variation->dbID(); + } + #and go to the varyation_synonym table to get the extra sources + my $source_name; + my $sth = $self->prepare(qq{SELECT s.name + FROM variation_synonym vs, source s + WHERE s.source_id = vs.source_id + AND vs.variation_id = ? + }); + $sth->bind_param(1,$variation_id,SQL_INTEGER); + $sth->execute(); + $sth->bind_columns(\$source_name); + while ($sth->fetch){ + $sources{$source_name}++; + } + @sources = keys(%sources); + return \@sources; +} + +=head2 new_fake + + Arg [1] : string $species + Example : + $vfa = Bio::EnsEMBL::Variation::VariationFeatureAdaptor->new_fake('human'); + Description: Creates a VariationFeatureAdaptor with no underlying database + attached. Should be used only when getting consequence types for + species with no variation database available. + Returntype : Bio::EnsEMBL::Variation::VariationFeatureAdaptor + Exceptions : throw if no species given + Caller : called from webcode for species where no variation database present + Status : Stable + +=cut + +sub new_fake { + my $class = shift; + my $species = shift; + + throw("No species defined") unless defined $species; + + my $self = bless {}, $class; + + $self->{'species'} = $species; + + return $self; +} + +sub _parse_hgvs_genomic_position { + + my $description = shift; + + my ($start, $start_offset, $end, $end_offset) = $description=~ m/^([\-\*]?\d+)((?:[\+\-]\d+)?)(?:_([\-\*]?\d+)((?:[\+\-]\d+)?))?/; + ## end information needed even if same as start + unless ($end){$end = $start;} + unless ($end_offset){$end_offset = $start_offset;} + if($description =~ /dup/){ + ## handle as insertion for ensembl object purposes + $start = $end; + $start++; + } + if( $start_offset || $end_offset){ warn "ERROR: not expecting offsets for genomic location [$description]\n";} + + return ($start, $end); +} + + +sub _parse_hgvs_transcript_position { + ### Work out genomic coordinates from hgvs coding or non coding annotation + + ### Non-exonic notation - + ### P+n => n bases after listed exonic base + ### P-n => n bases before listed exonic base + ### -P+n => n bases after listed 5'UTR base + ### *P-n => n bases before listed 3'UTR base + + my $description = shift; + my $transcript = shift; + + my ($start,$start_offset, $end, $end_offset) = $description =~ m/^([\-\*]?\d+)((?:[\+\-]\d+)?)(?:_([\-\*]?\d+)((?:[\+\-]\d+)?))?/; + my ($start_direction, $end_direction); ## go back or forward into intron + + if($start_offset ){ + ### extract + or - for intronic positions in coding nomenclature + if (substr($start_offset,0,1) eq '+' || substr($start_offset,0,1) eq '-'){ + $start_direction = substr($start_offset,0,1); + $start_offset = substr($start_offset,1) ; + $start = $start; + } + } + else{ ### exonic + $start_offset = 0 ; + } + + if($end_offset ){ + ### this is needed for long intronic events eg. ENST00000299272.5:c.98-354_98-351dupGAAA + if (substr($end_offset,0,1) eq '+' || substr($end_offset,0,1) eq '-'){ + $end_direction = substr($end_offset,0,1); + $end_offset = substr($end_offset,1) ; + $end = $end + } + } + else{ + $end_offset = $start_offset; + } + ### add missing values if single-location variant - needed for refseq check later + unless (defined $end) { + $end = $start; + $end_direction= $start_direction ; + } + + ### Variant in the 3' UTR => convert the coordinates by setting them to be the stop codon position + the UTR offset + if (substr($start,0,1) eq '*'){ + $start = ($transcript->cdna_coding_end() - $transcript->cdna_coding_start() + 1) + int(substr($start,1)) ; + } + if (substr($end,0,1) eq '*'){ + $end = ($transcript->cdna_coding_end() - $transcript->cdna_coding_start() + 1) + int(substr($end,1)); + } + + ### Variant in the 5' UTR => convert the coordinates by setting them to be the start codon position(0) - the UTR offset + if (substr($start,0,1) eq '-'){ + $start = 0 - int(substr($start,1)) ; + } + if (substr($end,0,1) eq '-'){ + $end = 0 - int(substr($end,1)); + } + + # Get the TranscriptMapper to convert to genomic coords + my $tr_mapper = $transcript->get_TranscriptMapper(); + + if($DEBUG ==1){print "About to convert to genomic $start $end, ccs:". $transcript->cdna_coding_start() ."\n";} + #ÊThe mapper can only convert cDNA coordinates, but we have CDS (relative to the start codon), so we need to convert them + my ($cds_start, $cds_end) ; + if( defined $transcript->cdna_coding_start()){ + ($cds_start, $cds_end) = (($start + $transcript->cdna_coding_start() - ($start > 0)),($end + $transcript->cdna_coding_start() - ($end > 0))); + } + else{ + #### non coding transcript + ($cds_start, $cds_end) = ($start, $end); + } + # Convert the cDNA coordinates to genomic coordinates. + my @coords = $tr_mapper->cdna2genomic($cds_start,$cds_end); + if($DEBUG ==1){ + print "In parser: cdna2genomic coords: ". $coords[0]->start() . "-". $coords[0]->end() . " and strand ". $coords[0]->strand()." from $cds_start,$cds_end\n";} + + #ÊThrow an error if we didn't get an unambiguous coordinate back + throw ("Unable to map the cDNA coordinates $start\-$end to genomic coordinates for Transcript " .$transcript->stable_id()) if (scalar(@coords) != 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')); + + my $strand = $coords[0]->strand(); + + ### overwrite exonic location with genomic coordinates + $start = $coords[0]->start(); + $end = $coords[0]->end(); + + #### intronic variants are described as after or before the nearest exon + #### - add this offset to genomic start & end positions + if(defined $start_direction ){ + if($strand == 1){ + if($start_direction eq "+"){ $start = $start + $start_offset; } + if($end_direction eq "+"){ $end = $end + $end_offset; } + + if($start_direction eq "-"){ $start = $start - $start_offset; } + if($end_direction eq "-"){ $end = $end - $end_offset; } + } + elsif($strand == -1 ){ + if($start_direction eq "+"){ $start = $start - $start_offset;} + if($end_direction eq "+"){ $end = $end - $end_offset; } + + if($start_direction eq "-"){ $start = $start + $start_offset;} + if($end_direction eq "-"){ $end = $end + $end_offset; } + } + } + if($description =~ /dup/){ + ## special case: handle as insertion for ensembl object purposes + $start = $end ; + if($strand == 1){ $start++; } + else{ $end--; } + } + + return ($start, $end, $strand); +} + +=head2 fetch_by_hgvs_notation + + Arg[1] : String $hgvs + Example : my $hgvs = 'LRG_8t1:c.3891A>T'; + $vf = $vf_adaptor->fetch_by_hgvs_notation($hgvs); + Description : Parses an HGVS notation and tries to create a VariationFeature object + based on the notation. The object will have a Variation and Alleles attached. + ReturnType : Bio::EnsEMBL::Variation::VariationFeature, undef on failure + Exceptions : thrown on error + Caller : general + Status : Stable + +=cut + +sub fetch_by_hgvs_notation { + + + my $self = shift; + my $hgvs = shift; + my $user_slice_adaptor = shift; + my $user_transcript_adaptor = shift; + if($DEBUG ==1){print "\nStarting fetch_by_hgvs_notation for $hgvs\n";} + + + ########################### Check & split input ########################### + + #ÊSplit the HGVS notation into the reference, notation type and variation description + my ($reference,$type,$description) = $hgvs =~ m/^([^\:]+)\:.*?([cgmnrp]?)\.?(.*?[\*\-0-9]+.*)$/i; + + #ÊIf any of the fields are unknown, return undef + throw ("Could not parse the HGVS notation $hgvs") + unless (defined($reference) && defined($type) && defined($description)); + + + my $extra; + if($description =~ m/\(.+\)/) { + ($description, $extra) = $description=~ /(.+?)(\(.+\))/; + throw ("Could not parse the HGVS notation $hgvs - can't interpret \'$extra\'") unless $extra eq '(p.=)'; + } + + # strip version number from reference + if ($reference=~ /^ENS|^LRG_\d+/){ + $reference =~ s/\.\d+//g; + warn ("The position specified by HGVS notation '$hgvs' refers to a nucleotide that may not have a specific reference sequence. The current Ensembl genome reference sequence will be used.") ; + } + #ÊA small fix in case the reference is a LRG and there is no underscore between name and transcript + $reference =~ s/^(LRG_[0-9]+)_?(t[0-9]+)$/$1\_$2/i; + + $description =~ s/\s+//; + + ####################### extract genomic coordinates and reference seq allele ####################### + my ($start, $end, $strand, $ref_allele, $alt_allele, $refseq_allele); + + #ÊGet a slice adaptor to enable check of supplied reference allele + my $slice_adaptor = $user_slice_adaptor || $self->db()->dnadb()->get_SliceAdaptor(); + my $slice ; + + if($type =~ m/c|n/i) { + + #ÊGet the Transcript object to convert coordinates + my $transcript_adaptor = $user_transcript_adaptor || $self->db()->dnadb()->get_TranscriptAdaptor(); + my $transcript = $transcript_adaptor->fetch_by_stable_id($reference) or throw ("Could not get a Transcript object for '$reference'"); + + ($start, $end, $strand) = _parse_hgvs_transcript_position($description, $transcript) ; + $slice = $slice_adaptor->fetch_by_region($transcript->coord_system_name(),$transcript->seq_region_name()); + ($ref_allele, $alt_allele) = _get_hgvs_alleles($description, $hgvs); + + } + + elsif($type =~ m/g/i) { + + ($start, $end) = _parse_hgvs_genomic_position($description) ; + ## grab reference allele + $slice = $slice_adaptor->fetch_by_region('chromosome', $reference ); + $strand =1; ## strand should be genome strand for HGVS genomic notation + ($ref_allele, $alt_allele) = _get_hgvs_alleles($description, $hgvs); + } + + elsif($type =~ m/p/i) { + + #ÊGet the Transcript object to convert coordinates + my $transcript_adaptor = $user_transcript_adaptor || $self->db()->dnadb()->get_TranscriptAdaptor(); + my $transcript = $transcript_adaptor->fetch_by_translation_stable_id($reference) or throw ("Could not get a Transcript object for '$reference'"); + ($ref_allele, $alt_allele, $start, $end, $strand) = _parse_hgvs_protein_position($description, $reference, $transcript) ; + $slice = $slice_adaptor->fetch_by_region($transcript->coord_system_name(),$transcript->seq_region_name()); + + } + + else{ + throw ("HGVS type $type not recognised for $hgvs"); + } + + ####################### check alleles ####################### + + #ÊGet the reference allele based on the coordinates - need to supply lowest coordinate first to slice->subseq() + + if($start > $end){ $refseq_allele = $slice->subseq($end, $start, $strand);} + else{ $refseq_allele = $slice->subseq($start, $end, $strand);} + + + # If the reference allele was omitted, set it to undef + $ref_allele = undef unless (defined($ref_allele) && length($ref_allele)); + + if ($description =~ m/ins|dup/i && $description !~ m/del/i) { + # insertion: the start & end positions are inverted by convention + if($end > $start){ ($start, $end ) = ( $end , $start); } + } + else{ + # If the reference from the sequence does not correspond to the reference given in the HGVS notation, throw an exception + if (defined($ref_allele) && $ref_allele ne $refseq_allele){ + throw ("Reference allele extracted from $reference:$start-$end ($refseq_allele) does not match reference allele given by HGVS notation $hgvs ($ref_allele)"); + } + if($DEBUG==1){print "Reference allele: $refseq_allele expected allele: $ref_allele\n";} + } + if (defined($ref_allele) && $ref_allele eq $alt_allele){ + throw ("Reference allele extracted from $reference:$start-$end ($refseq_allele) matches alt allele given by HGVS notation $hgvs ($alt_allele)"); + } + + # Use the reference allele from the sequence if none was specified in the notation + $ref_allele ||= $refseq_allele; + + + ####################### Create objects ####################### + + #ÊCreate Allele objects + my @allele_objs; + foreach my $allele ($ref_allele,$alt_allele) { + push(@allele_objs,Bio::EnsEMBL::Variation::Allele->new('-adaptor' => $self, '-allele' => $allele)); + } + + #ÊCreate a variation object. Use the HGVS string as its name + my $variation = Bio::EnsEMBL::Variation::Variation->new( + '-adaptor' => $self->db()->get_VariationAdaptor(), + '-name' => $hgvs, + '-source' => 'Parsed from HGVS notation', + '-alleles' => \@allele_objs + ); + + #ÊCreate a variation feature object + my $variation_feature = Bio::EnsEMBL::Variation::VariationFeature->new( + '-adaptor' => $self, + '-start' => $start, + '-end' => $end, + '-strand' => $strand, + '-slice' => $slice, + '-map_weight' => 1, + '-variation' => $variation, + '-allele_string' => "$ref_allele/$alt_allele" + ); + if($DEBUG==1){print "Created object $hgvs allele_string: $ref_allele/$alt_allele, start:$start, end:$end\n";} + + return $variation_feature; + +} + + +sub _get_hgvs_alleles{ + + #### extract ref and alt alleles where possible from HGVS g/c/n string + + my ($description, $hgvs) = shift; + my ($ref_allele, $alt_allele) ; + + ### A single nt substitution, reference and alternative alleles are required + if ($description =~ m/>/) { + ($ref_allele,$alt_allele) = $description =~ m/([A-Z]+)>([A-Z]+)$/i; + } + + #ÊA delins, the reference allele is optional + elsif ($description =~ m/del.*ins/i) { + ($ref_allele,$alt_allele) = $description =~ m/del(.*?)ins([A-Z]+)$/i; + } + + # A deletion, the reference allele is optional + elsif ($description =~ m/del/i) { + ($ref_allele) = $description =~ m/del([A-Z]*)$/i; + $alt_allele = '-'; + } + + # A duplication, the reference allele is optional + elsif ($description =~ m/dup/i) { + $ref_allele ="-"; + ($alt_allele) = $description =~ m/dup([A-Z]*)$/i; + } + + # An inversion, the reference allele is optional + elsif ($description =~ m/inv/i) { + ($ref_allele) = $description =~ m/inv([A-Z]*)$/i; + $alt_allele = $ref_allele; + reverse_comp(\$alt_allele); + } + + # An insertion, + elsif ($description =~ m/ins/i) { + ($alt_allele) = $description =~ m/ins([A-Z]*)$/i; + $ref_allele = '-'; + } + ## A simple repeat (eg. ENST00000522587.1:c.-310+750[13]A => alt AAAAAAAAAAAAA) + elsif ($description =~ m/\[/i) { + + my ($number, $string) = $description =~ m/\[(\d+)\]([A-Z]*)$/i; + foreach my $n(1..$number){ $alt_allele .= $string;} + $ref_allele = $string; + } + else { + throw ("The variant class for HGVS notation '$hgvs' is unknown or could not be correctly recognized"); + } + return ($ref_allele, $alt_allele) ; +} + +## Extract enough information to make a variation_feature from HGVS protein nomenclature +## Only attempts substitutions +## - assumes protein change results from minimum number of nucleotide changes +## - returns VF information only if one minimal solution found +sub _parse_hgvs_protein_position{ + + my ($description, $reference, $transcript ) = @_; + + ## only supporting the parsing of hgvs substitutions [eg. Met213Ile] + my ($from, $pos, $to) = $description =~ /^(\w+?)(\d+)(\w+?)$/; + + throw("Could not parse HGVS protein notation " . $reference . ":p.". $description ) unless $from and $pos and $to; + + + # get genomic position + my $tr_mapper = $transcript->get_TranscriptMapper(); + + my @coords = $tr_mapper->pep2genomic($pos, $pos); + + throw ("Unable to map the peptide coordinate $pos to genomic coordinates for protein $reference") if (scalar(@coords) != 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')); + + my $strand = $coords[0]->strand(); + my $start = $strand > 0 ? $coords[0]->start() : $coords[0]->end(); + my $end = $strand > 0 ? $coords[0]->start() : $coords[0]->end(); + + + # get correct codon table + my $attrib = $transcript->slice->get_all_Attributes('codon_table')->[0]; + + # default to the vertebrate codon table which is denoted as 1 + my $codon_table = Bio::Tools::CodonTable->new( -id => ($attrib ? $attrib->value : 1)); + + # rev-translate + my @from_codons = $codon_table->revtranslate($from); + my @to_codons = $codon_table->revtranslate($to); + + # now iterate over all possible mutation paths + my %paths; + + foreach my $f(@from_codons) { + foreach my $t(@to_codons) { + my $key = $f.'_'.$t; + + for my $i(0..2) { + my ($a, $b) = (substr($f, $i, 1), substr($t, $i, 1)); + next if $a eq $b; + push @{$paths{$key}}, $i.'_'.uc($a).'/'.uc($b); + } + + # non consecutive paths + if(scalar @{$paths{$key}} == 2 and $paths{$key}->[0] =~ /^0/ and $paths{$key}->[1] =~ /^2/) { + splice(@{$paths{$key}}, 1, 0, '1_'.substr($f, 1, 1).'/'.substr($f, 1, 1)); + } + + $paths{$key} = join ",", @{$paths{$key}}; + } + } + + # get shortest dist and best paths with that dist + my $shortest_dist = length((sort {length($a) <=> length($b)} values %paths)[0]); + my %best_paths = map {$_ => 1} grep {length($_) eq $shortest_dist} values %paths; + + my ($ref_allele, $alt_allele); + + # nice and easy if we only have path + if(scalar keys %best_paths == 1) { + my @path = map {split /\,/, $_} keys %best_paths; + + # coords + if($strand > 0) { + $start += (split /\_/, $path[0])[0]; + $end += (split /\_/, $path[-1])[0]; + } + else { + $start -= (split /\_/, $path[0])[0]; + $end -= (split /\_/, $path[-1])[0]; + } + + # alleles + $ref_allele .= (split /\_|\//, $path[$_])[1] for 0..$#path; + $alt_allele .= (split /\_|\//, $path[$_])[2] for 0..$#path; + + } + + else { + throw("Could not uniquely determine nucleotide change from peptide change $from \-\> $to"); + } + + return ($ref_allele, $alt_allele, $start, $end, $strand); + # + #use Data::Dumper; + #$Data::Dumper::Maxdepth = 3; + #warn Dumper \@from_codons; + #warn Dumper \@to_codons; + #warn Dumper \%paths; + #warn Dumper \%best_paths; + #exit(0); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationFeatureOverlapAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationFeatureOverlapAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,281 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::VariationFeatureOverlapAdaptor + +=head1 DESCRIPTION + +This is the superclass of all Adaptors that fetch VariationFeatureOverlap +objects and their various subclasses, and it provides methods common to +all such adaptors, such as fetching by VariationFeature. You should not +generally use this class directly, but instead use one of the feature +specific adaptors such as the TranscriptVariationAdaptor. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::VariationFeatureOverlapAdaptor; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use base qw(Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor); + +sub new_fake { + my $class = shift; + my $species = shift; + + my $self = bless {}, $class; + + return $self; +} + +=head2 fetch_all_by_Features + + Arg [1] : listref of Bio::EnsEMBL::Features, or subclasses + Description: Fetch all germline VariationFeatureOverlap objects associated + with the given list of Features + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlap objects + Status : At risk + +=cut + +sub fetch_all_by_Features { + my ($self, $features) = @_; + return $self->fetch_all_by_Features_with_constraint($features,'somatic = 0'); +} + +=head2 fetch_all_somatic_by_Features + + Arg [1] : listref of Bio::EnsEMBL::Features, or subclasses + Description: Fetch all somatic VariationFeatureOverlap objects associated + with the given list of Features + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlap objects + Status : At risk + +=cut + +sub fetch_all_somatic_by_Features { + my ($self, $features) = @_; + return $self->fetch_all_by_Features_with_constraint($features,'somatic = 1'); +} + +=head2 fetch_all_by_Features_with_constraint + + Arg [1] : listref of Bio::EnsEMBL::Features, or subclasses + Arg [2] : extra SQL constraint for the query + Description: Fetch all VariationFeatureOverlap objects associated + with the given list of Features + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlap objects + Status : At risk + +=cut + +sub fetch_all_by_Features_with_constraint { + my $self = shift; + + my ($features, $constraint) = @_; + + my $vfos = $self->_func_all_by_Features_with_constraint(@_, 'fetch'); + + # Note duplicated code + my %feats_by_id = map { $_->stable_id => $_ } @$features; + + for my $vfo (@$vfos) { + if ($vfo->{_feature_stable_id}) { + my $feat_id = delete $vfo->{_feature_stable_id}; + $vfo->{feature} = $feats_by_id{$feat_id}; + } + } + + return $vfos; +} + +sub _func_all_by_Features_with_constraint { + my ($self, $features, $constraint, $func) = @_; + + my %feats_by_id = map { $_->stable_id => $_ } @$features; + + my $id_str = join ',', map {"'$_'"} keys %feats_by_id; + + my $full_constraint = "feature_stable_id in ( $id_str )"; + $full_constraint .= " AND $constraint" if $constraint; + + my $method = "generic_" . $func; + my $data = $self->$method($full_constraint); + + return $data; +} + +sub count_all_by_Features_with_constraint { + my $self = shift; + my ($features, $constraint) = @_; + + my $count = $self->_func_all_by_Features_with_constraint(@_, 'count'); + + if (!defined($count)) { $count = 0; } + + return $count; +} + +=head2 fetch_all_by_VariationFeatures + + Arg [1] : listref of Bio::EnsEMBL::Variation::VariationFeatures + Arg [2] : (optional) listref of Bio::EnsEMBL::Features to further limit the query + Description: Fetch all VariationFeatureOverlap objects associated + with the given list of VariationFeatures + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlap objects + Status : At risk + +=cut +sub fetch_all_by_VariationFeatures { + my ($self, $vfs, $features) = @_; + return $self->fetch_all_by_VariationFeatures_with_constraint($vfs,$features,undef); +} + +sub count_all_by_VariationFeatures { + my ($self, $vfs, $features) = @_; + return $self->count_all_by_VariationFeatures_with_constraint($vfs,$features,undef); +} + +sub count_all_by_VariationFeatures_with_constraint { + my $self = shift; + my ($vfs, $features, $constraint) = @_; + + my $allcounts = $self->_func_all_by_VariationFeatures_with_constraint(@_ , 'count'); + + my $total = 0; + for my $count (@$allcounts) { + $total += $count; + } + + return $total; +} + +sub _func_all_by_VariationFeatures_with_constraint { + my ($self, $vfs, $features, $constraint, $func) = @_; + + my %vfs_by_id = map { $_->dbID => $_ } @$vfs; + + my @vfids = keys %vfs_by_id; + + if (!scalar(@vfids)) { + return []; + } + + my @alldata; + + while (@vfids) { + + my $fullconstraint = $constraint; + + my @vfid_subset = splice(@vfids,0,50000); + + my $id_str = join ',', @vfid_subset; + + if ($id_str eq '') { + last; + } + + if ($fullconstraint) { + $fullconstraint .= " AND "; + } + $fullconstraint .= "variation_feature_id in ( $id_str )"; + + + my $data; + + if ($features) { + # if we're passed some features, fetch/count by features with the VF ids as an + # extra constraint + my $method = $func . "_all_by_Features_with_constraint"; + $data = $self->$method($features, $fullconstraint); + } + else { + # otherwise just fetch/count the VFs directly + my $method = "generic_" . $func; + $data = $self->$method($fullconstraint); + } + push @alldata,ref($data) eq 'ARRAY' ? @$data : $data; + } + + return \@alldata; +} + +sub fetch_all_by_VariationFeatures_with_constraint { + my $self = shift; + my ($vfs, $features, $constraint) = @_; + + my $allvfos = $self->_func_all_by_VariationFeatures_with_constraint(@_ , 'fetch'); + + + my %vfs_by_id = map { $_->dbID => $_ } @$vfs; + + # attach the VariationFeatures to the VariationFeatureOverlaps because we have them already + + for my $vfo (@$allvfos) { + if ($vfo->{_variation_feature_id}) { + $vfo->variation_feature($vfs_by_id{delete $vfo->{_variation_feature_id}}); + } + } + + return $allvfos; +} + +sub _get_VariationFeatureOverlapAlleles_under_SO_term { + my ($self, $term, $vfoas) = @_; + + my $terms = $self->_get_child_terms($term); + + my @found; + + ALLELES : for my $vfoa (@$vfoas) { + for my $cons (@{ $vfoa->get_all_OverlapConsequences }) { + for my $term (@$terms) { + if ($cons->SO_term eq $term->name) { + push @found, $vfoa; + next ALLELES; + } + } + } + } + + return \@found; +} + +# call to method in BaseAdaptor +sub _get_consequence_constraint { + my $self = shift; + return $self->SUPER::_get_consequence_constraint('transcript_variation', @_); +} + +sub fetch_all_by_SO_terms { + my ($self, $terms) = @_; + + my $constraint = $self->_get_consequence_constraint($terms); + + return $self->generic_fetch($constraint); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationSetAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/DBSQL/VariationSetAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,464 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# +# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor +# +# Copyright (c) 2010 Ensembl +# +# You may distribute this module under the same terms as perl itself +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor + +=head1 SYNOPSIS + + $db = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(...); + + $vsa = $db->get_VariationSetAdaptor(); + + # retrieve a variation set by its name + $vs = $vsa->fetch_by_name('Phenotype-associated variations'); + + # retrieve a variation set by its internal identifier + $vs = $vsa->fetch_by_dbID(12); + + # retrieve all variation sets which a variation is a part of + @vs = @{$vsa->fetch_all_by_Variation($var)}; + + +=head1 DESCRIPTION + +This adaptor provides database connectivity for VariationSet objects. +VariationSets may be retrieved from the Ensembl variation database by +several means using this module. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref wrap_array); + +use Bio::EnsEMBL::Variation::VariationSet; + +our @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); + +our $MAX_VARIATION_SET_ID = 64; + +=head2 fetch_all_top_VariationSets + + Example : $vs = $vs_adaptor->fetch_all_top_VariationSets(); + Description: Retrieves all VariationSet objects that are 'toplevel', + i.e. they are not subsets of any other variation set. + Returntype : istref of Bio::EnsEMBL::Variation::VariationSet + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub fetch_all_top_VariationSets { + my $self = shift; + + #ÊAdd a constraint to only get the sets that don't have any parent sets + my $constraint = qq{ + NOT EXISTS ( + SELECT + * + FROM + variation_set_structure vss + WHERE + vss.variation_set_sub = vs.variation_set_id + ) + }; + + #ÊGet the results from generic fetch method + return $self->generic_fetch($constraint); + +} + +=head2 fetch_all_by_sub_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet $sub + Arg [2] : (optional) boolean $only_immediate + If true, only the direct supersets of this variation set will be fetched. The default behaviour is + to recursively fetch all supersets. + Example : @vs_supersets = @{$vs_adaptor->fetch_all_by_sub_VariationSet($vs)}; + Description: Retrieves all VariationSets that are direct supersets of the specified VariationSet. + Returntype : listref of Bio::EnsEMBL::Variation::VariationSet + Exceptions : throw if sub arg is not valid + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_sub_VariationSet { + my $self = shift; + my $set = shift; + my $only_immediate = shift; + + #ÊCheck the input set + assert_ref($set,'Bio::EnsEMBL::Variation::VariationSet'); + +# First, get all VariationSets that are direct supersets of this one + + my $dbID = $set->dbID(); + + my $stmt = qq{ + SELECT + vss.variation_set_super + FROM + variation_set_structure vss + WHERE + vss.variation_set_sub = ? + }; + my $sth = $self->prepare($stmt); + $sth->execute($dbID); + + my %vs; + while (my $result = $sth->fetchrow_arrayref()) { +# For each superset, fetch all of its supersets, unless specifically told not to + my $vs_sup = $self->fetch_by_dbID($result->[0]); + $vs{$vs_sup->dbID()} = $vs_sup; + if (!defined($only_immediate)) { + foreach my $v (@{$self->fetch_all_by_sub_VariationSet($vs_sup)}) { + $vs{$v->dbID()} = $v; + } + } + } + + my @res = values(%vs); + + return \@res; +} + +=head2 fetch_all_by_super_VariationSet + + Arg [1] : Bio::EnsEMBL::Variation::VariationSet $super + Arg [2] : (optional) boolean $only_immediate + If true, only the direct subsets of this variation set will be fetched. The default behaviour is + to recursively fetch all subsets. + Example : @vs_subsets = @{$vs_adaptor->fetch_all_by_super_VariationSet($vs)}; + Description: Retrieves all VariationSets that are subsets of the specified VariationSet. + Returntype : listref of Bio::EnsEMBL::Variation::VariationSet + Exceptions : throw if super arg is not valid + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_super_VariationSet { + my $self = shift; + my $set = shift; + my $only_immediate = shift; + + #ÊCheck the input set + assert_ref($set,'Bio::EnsEMBL::Variation::VariationSet'); + +# First, get all VariationSets that are direct subsets of this one + + my $dbID = $set->dbID(); + + my $stmt = qq{ + SELECT + vss.variation_set_sub + FROM + variation_set_structure vss + WHERE + vss.variation_set_super = ? + }; + my $sth = $self->prepare($stmt); + $sth->execute($dbID); + + my %vs; + while (my $result = $sth->fetchrow_arrayref()) { +# For each subset, fetch all of its subsets unless specifically told not to + my $vs_sub = $self->fetch_by_dbID($result->[0]); + $vs{$vs_sub->dbID()} = $vs_sub; + if (!defined($only_immediate)) { + foreach my $v (@{$self->fetch_all_by_super_VariationSet($vs_sub)}) { + $vs{$v->dbID()} = $v; + } + } + } + + my @res = values(%vs); + + return \@res; +} + + +=head2 fetch_by_name + + Arg [1] : string $name + Example : $vg = $vga->fetch_by_name('Phenotype-associated variations'); + Description: Retrieves a variation set by its name. + Returntype : Bio::EnsEMBL::Variation::VariationSet + Exceptions : throw if name argument is not provided + Caller : general + Status : At Risk + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + + throw('name argument expected') unless (defined($name)); + + # Add a constraint on the name column and bind the name to it + my $constraint = qq{ vs.name LIKE ? }; + $self->bind_param_generic_fetch($name,SQL_VARCHAR); + + #ÊCall the generic fetch method + my $result = wrap_array($self->generic_fetch($constraint)); + + # Return the result + return undef unless (scalar(@{$result})); + return $result->[0]; +} + +=head2 fetch_by_short_name + + Arg [1] : string $name + Example : $vg = $vga->fetch_by_short_name('ph_variants'); + Description: Retrieves a variation set by its short name. + Returntype : Bio::EnsEMBL::Variation::VariationSet + Exceptions : throw if short name argument is not provided + Caller : general + +=cut + +sub fetch_by_short_name { + my $self = shift; + my $name = shift; + + throw('short name argument expected') unless (defined($name)); + + #ÊGet the attrib_id corresponding to the 'short_name' type and specified name + my $aa = $self->db->get_AttributeAdaptor(); + my $attrib_id = $aa->attrib_id_for_type_value($self->_short_name_attrib_type_code(),$name); + return undef unless (defined($attrib_id)); + + # Add a constraint on the short_name_attrib_id column and bind the name to it + my $constraint = qq{ vs.short_name_attrib_id = ? }; + $self->bind_param_generic_fetch($attrib_id,SQL_INTEGER); + + #ÊCall the generic fetch method + my $result = wrap_array($self->generic_fetch($constraint)); + + # Return the result + return undef unless (scalar(@{$result})); + return $result->[0]; +} + + +=head2 fetch_all_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation::Variation + Example : my $vgs = $vga->fetch_all_by_Variation($var); + Description: Retrieves all variation sets which a particular variation + is present in. + Returntype : reference to list of Bio::EnsEMBL::Variation::VariationSets + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_Variation { + my $self = shift; + my $var = shift; + + assert_ref($var,'Bio::EnsEMBL::Variation::Variation'); + + my $cols = join(',',$self->_columns()); + my $stmt = qq{ + SELECT + $cols + FROM + variation_set vs, + variation_set_variation vsv + WHERE + vs.variation_set_id = vsv.variation_set_id AND + vsv.variation_id = ? + }; + + my $sth = $self->prepare($stmt); + $sth->bind_param(1,$var->dbID,SQL_INTEGER); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + $sth->finish(); + +# Fetch all supersets of the returned sets as well. Since a variation may occur at several places in a hierarchy +# which will cause duplicated data, store variation sets in a hash with dbID as key. + my %sets; + foreach my $set (@{$result}) { + $sets{$set->dbID()} = $set; + foreach my $sup (@{$self->fetch_all_by_sub_VariationSet($set)}) { + $sets{$sup->dbID()} = $sup; + } + } + + my @res = values %sets; + return \@res; +} + + +=head2 fetch_all_by_StructuralVariation + + Arg [1] : Bio::EnsEMBL::Variation::StructuralVariation + Example : my $vss = $vsa->fetch_all_by_StructuralVariation($sv); + Description: Retrieves all variation sets which a particular structural variation + is present in. + Returntype : reference to list of Bio::EnsEMBL::Variation::VariationSets + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub fetch_all_by_StructuralVariation { + my $self = shift; + my $var = shift; + + assert_ref($var,'Bio::EnsEMBL::Variation::StructuralVariation'); + + my $cols = join(',',$self->_columns()); + my $stmt = qq{ + SELECT + $cols + FROM + variation_set vs, + variation_set_structural_variation vssv + WHERE + vs.variation_set_id = vssv.variation_set_id AND + vssv.structural_variation_id = ? + }; + + my $sth = $self->prepare($stmt); + $sth->bind_param(1,$var->dbID,SQL_INTEGER); + $sth->execute(); + + my $result = $self->_objs_from_sth($sth); + $sth->finish(); + +# Fetch all supersets of the returned sets as well. Since a variation may occur at several places in a hierarchy +# which will cause duplicated data, store variation sets in a hash with dbID as key. + my %sets; + foreach my $set (@{$result}) { + $sets{$set->dbID()} = $set; + foreach my $sup (@{$self->fetch_all_by_sub_VariationSet($set)}) { + $sets{$sup->dbID()} = $sup; + } + } + + my @res = values %sets; + return \@res; +} + + +# An API-internal subroutine for getting the bitvalue of the specified variation_set and (unless specifically indicated) its subsets +sub _get_bitvalue { + my $self = shift; + my $set = shift; + my $no_subsets = shift; + + #ÊCheck the input set + assert_ref($set,'Bio::EnsEMBL::Variation::VariationSet'); + + #ÊStore the dbIDs of the set and its subsets in an array + my @dbIDs = ($set->dbID()); + unless ($no_subsets) { + map {push(@dbIDs,$_->dbID())} @{$set->adaptor->fetch_all_by_super_VariationSet($set)}; + } + + #ÊDo a quick check that none of the dbIDs are too large for being stored in the set construct. In that case, warn about this. + my @non_compatible = grep {$_ > $MAX_VARIATION_SET_ID} @dbIDs; + if (scalar(@non_compatible)) { + warn ("Variation set(s) with dbID " . join(", ",@non_compatible) . " cannot be stored in the variation_set_id SET construct. Entries for these sets won't be returned"); + } + + #ÊAdd the bitvalues of the dbIDs in the set together to get the bitvalue, use only the ones that fit within the $MAX_VARIATION_SET_ID limit + my $bitvalue = 0; + map {$bitvalue += (2 ** ($_ - 1))} grep {$_ <= $MAX_VARIATION_SET_ID} @dbIDs; + + return $bitvalue; +} + +# API-internal method for getting the attrib_type code used for short names +sub _short_name_attrib_type_code { + return q{short_name}; +} + +sub _columns { + return qw( vs.variation_set_id vs.name vs.description vs.short_name_attrib_id ); +} +sub _tables { + return ( ['variation_set','vs'] ); +} +sub _default_where_clause { + return '1'; +} + +sub _objs_from_sth { + my $self = shift; + my $sth = shift; + + my ($vs_id, $name, $description, $short_name_attrib_id); + $sth->bind_columns(\$vs_id, \$name, \$description, \$short_name_attrib_id); + + my @results; + my ($cur_vs, $cur_vs_id); + my $aa = $self->db->get_AttributeAdaptor(); + +# Construct all variation sets + + while($sth->fetch()) { + if (!defined($cur_vs) || $vs_id != $cur_vs_id) { + $cur_vs = Bio::EnsEMBL::Variation::VariationSet->new + ( + -dbID => $vs_id, + -adaptor => $self, + -name => $name, + -description => $description, + -short_name => $aa->attrib_value_for_id($short_name_attrib_id) + ); + $cur_vs_id = $vs_id; + push(@results,$cur_vs); + } + } + + return \@results; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/ExternalFeatureVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ExternalFeatureVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,115 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::ExternalFeatureVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::ExternalFeatureVariationAllele; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); + +use base qw(Bio::EnsEMBL::Variation::RegulationVariation); + +sub new { + my $class = shift; + + my %args = @_; + + # swap a '-external_feature' argument for a '-feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-external_feature') { + $args{'-feature'} = delete $args{$arg}; + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to efvas + map { bless $_, 'Bio::EnsEMBL::Variation::ExternalFeatureVariationAllele' } + @{ $self->get_all_ExternalFeatureVariationAlleles }; + + return $self; +} + +sub external_feature_stable_id { + my $self = shift; + return $self->SUPER::feature_stable_id(@_); +} + +sub external_feature { + my ($self, $ef) = @_; + return $self->SUPER::feature($ef, 'ExternalFeature'); +} + +sub add_ExternalFeatureVariationAllele { + my $self = shift; + return $self->SUPER::add_VariationFeatureOverlapAllele(@_); +} + +sub get_reference_ExternalFeatureVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_VariationFeatureOverlapAllele(@_); +} + +sub get_all_alternate_ExternalFeatureVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_VariationFeatureOverlapAlleles(@_); +} + +sub get_all_ExternalFeatureVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_VariationFeatureOverlapAlleles(@_); +} + +sub target_feature_stable_id { + my ($self, $target_feature_stable_id) = @_; + + $self->{target_feature_stable_id} = $target_feature_stable_id if $target_feature_stable_id; + + unless ($self->{target_feature_stable_id}) { + + # try to fetch it from the funcgen database + + if (my $species = $self->{adaptor}->db->species) { + for my $entry ( + @{ $self->external_feature->get_all_DBEntries($species.'_core_Transcript') }, + @{ $self->external_feature->get_all_DBEntries($species.'_core_Gene') } ) { + if (my $id = $entry->primary_id) { + + $self->{target_feature_stable_id} = $id; + + # there should never be more than one, so we last out of the loop + + last; + } + } + } + else { + warn "Failed to get species from adaptor"; + } + } + + return $self->{target_feature_stable_id}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/ExternalFeatureVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ExternalFeatureVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,52 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::ExternalFeatureVariationAllele; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele); + +sub new_fast { + my ($self, $hashref) = @_; + + # swap an external_feature_variation argument for a variation_feature_overlap one + + if ($hashref->{external_feature_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{external_feature_variation}; + } + + # and call the superclass + + return $self->SUPER::new_fast($hashref); +} + +sub external_feature_variation { + my $self = shift; + return $self->variation_feature_overlap(@_); +} + +sub external_feature { + my $self = shift; + return $self->external_feature_variation->external_feature; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Failable.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Failable.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,140 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::BaseStructuralVariation +# +# Copyright (c) 2011 Ensembl +# + +=head1 NAME + +Bio::EnsEMBL::Variation::Failable + +=head1 DESCRIPTION + +This object returns failed information. + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Failable; + + +=head2 failed_description + + Arg [1] : $failed_description (optional) + The new value to set the failed_description attribute to. Should + be a reference to a list of strings, alternatively a string can + be passed. If multiple failed descriptions are specified, they should + be separated with semi-colons. + Example : $failed_str = $sv->failed_description(); + Description: Get/Sets the failed description for this structural variant. The failed + descriptions are lazy-loaded from the database. + Returntype : Semi-colon separated string + Exceptions : Thrown on illegal argument. + Caller : general + Status : At risk + +=cut + +sub failed_description { + my $self = shift; + my $description = shift; + + # Update the description if necessary + if (defined($description)) { + + # If the description is a string, split it by semi-colon and take the reference + if (check_ref($description,'STRING')) { + my @pcs = split(/;/,$description); + $description = \@pcs; + } + # Throw an error if the description is not an arrayref + assert_ref($description.'ARRAY'); + + # Update the cached failed_description + $self->{'failed_description'} = $description; + } + # Else, fetch it from the db if it's not cached + elsif (!defined($self->{'failed_description'})) { + $self->{'failed_description'} = $self->get_all_failed_descriptions(); + } + + # Return a semi-colon separated string of descriptions + return join(";",@{$self->{'failed_description'}}); +} + + +=head2 get_all_failed_descriptions + + Example : + if ($sv->is_failed()) { + my $descriptions = $sv->get_all_failed_descriptions(); + print "Structural variant " . $sv->variation_name() . " has been flagged as failed because '"; + print join("' and '",@{$descriptions}) . "'\n"; + } + + Description: Gets all failed descriptions associated with this Structural variant. + Returntype : Reference to a list of strings + Exceptions : Thrown if an adaptor is not attached to this object. + Caller : general + Status : At risk + +=cut + +sub get_all_failed_descriptions { + my $self = shift; + + # If the failed descriptions haven't been cached yet, load them from db + unless (defined($self->{'failed_description'})) { + + # Check that this allele has an adaptor attached + unless (defined($self->adaptor())) { + throw('An adaptor must be attached to the ' . ref($self) . ' object'); + } + + $self->{'failed_description'} = $self->adaptor->get_all_failed_descriptions($self); + } + + return $self->{'failed_description'}; +} + + +=head2 is_failed + + Example : print "Structural variant '" . $sv->variation_name() . "' has " . ($sv->is_failed() ? "" : "not ") . "been flagged as failed\n"; + Description: Gets the failed attribute for this structural variant. The failed attribute + is lazy-loaded from the database. + Returntype : int + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub is_failed { + my $self = shift; + + return (length($self->failed_description()) > 0); +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Genotype.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Genotype.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,261 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Genotype +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Genotype - Abstract base class representing a genotype + +=head1 SYNOPSIS + + print $genotype->variation()->name(), "\n"; + print $genotype->allele1(), '/', $genotype->allele2(), "\n"; + +=head1 DESCRIPTION + +This is an abstract base class representing a genotype. Specific types of +genotype are represented by subclasses such as IndividualGenotype and +PopulationGenotype. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Genotype; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code); + +use vars qw(@ISA $AUTOLOAD); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +=head2 allele + Args : int $index + string $new_allele (optional) + Examples : $allele1 = $genotype->allele(1); + $allele2 = $genotype->allele2(); + Description: Getter/Setter for one of the alleles that compose this genotype. + Can be called as $genotype->allele(1), or via AUTOLOAD as + $genotype->allele1() + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub allele { + my $self = shift; + my $index = shift; + my $allele = shift; + $index = 1 unless defined($index) && $index >= 1; + + $index--; + + $self->{genotype}->[$index] = $allele if defined($allele); + + return defined($self->{genotype}->[$index]) ? $self->{genotype}->[$index] : undef; +} + + + +=head2 genotype + Examples : @alleles = @{$genotype->genotype}; + Description: Getter for the genotype as an arrayref of alleles + Returntype : arrayref of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub genotype { + return $_[0]->{genotype} +} + + + +=head2 genotype_string + + Examples : $genotype_string = $genotype->genotype_string; + Description: Gets the genotype as a '|'-separated string. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub genotype_string { + return join '|', @{$_[0]->genotype || []}; +} + +=head2 variation + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Variation $var + Example : $var = $genotype->variation(); + Description: Getter/Setter for the Variation as + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub variation { + my $self = shift; + + if(@_) { + my $v = shift; + if(defined($v) && (!ref($v) || !$v->isa('Bio::EnsEMBL::Variation::Variation'))) { + throw('Bio::EnsEMBL::Variation::Variation argument expected'); + } + return $self->{variation} = $v; + } + + if(!defined($self->{variation}) && defined($self->{_variation_id})) { + my $va = $self->adaptor->db->get_VariationAdaptor; + + if(defined($va)) { + my $v = $va->fetch_by_dbID($self->{_variation_id}); + + if(defined($v)) { + $self->{variation} = $v; + } + } + } + + return $self->{'variation'}; +} + + +=head2 subsnp + + Arg [1] : string $newval (optional) + The new value to set the subsnp attribute to + Example : print $genotype->subsnp(); + Description: Getter/Setter for the subsnp attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub subsnp{ + my $self = shift; + if(@_) { + $self->{'subsnp'} = shift; + } + + my $ssid = $self->{'subsnp'}; + if(defined($ssid)) { + $ssid = 'ss'.$ssid unless $ssid =~ /^ss/; + } + + return $ssid; +} + +=head2 subsnp_handle + + Arg [1] : string $newval (optional) + The new value to set the subsnp_handle attribute to + Example : print $genotype->subsnp_handle(); + Description: Getter/Setter for the subsnp_handle attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub subsnp_handle{ + my $self = shift; + my $handle = shift; + + # if changing handle + if(defined($handle)) { + $self->{'subsnp_handle'} = $handle; + } + elsif (!defined($self->{'subsnp_handle'})) { + + # Check that this allele has an adaptor attached + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor'); + + $self->{'subsnp_handle'} = $self->adaptor->get_subsnp_handle($self); + } + + return $self->{'subsnp_handle'}; +} + + +=head2 ambiguity_code + + Example : print $genotype->ambiguity_code(); + Description: Get the ambiguity code for this genotype + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ambiguity_code { + return strain_ambiguity_code($_[0]->genotype_string); +} + +sub AUTOLOAD { + my $self = shift; + + my $method = $AUTOLOAD; + $method =~ s/.*:://; + + if($method =~ /(allele)\_?(\d+)/) { + $method = $1; + unshift @_, $2; + } + + else { + return; + } + + return $self->$method(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/GenotypeCode.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/GenotypeCode.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,77 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::GenotypeCode +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::GenotypeCode - Class for a genotype code object + +=head1 SYNOPSIS + + print (join "|", $genotypecode->genotype), "\n"; + +=head1 DESCRIPTION + +This is class representing a GenotypeCode. It is intended for internal use only. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::GenotypeCode; + +use Bio::EnsEMBL::Storable; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + +=head2 genotype + + Example : $gt = $genotypecode->genotype() + Description: Getter for the genotype represented by this GenotypeCode object. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub genotype { + return $_[0]->{genotype}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Individual.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Individual.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,402 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Individual +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Individual - A single member of a population. + +=head1 SYNOPSIS + + $individual = Bio::EnsEMBL::Variation::Individual->new + (-name => 'WI530.07', + -description => 'african', + -gender => 'Male', + -father_individual => $father_ind, + -mother_individual => $mother_ind); + + ... + + print $individual->name(), ' - ', $individual->description(), "\n"; + print "Gender: ", $individual->gender(), "\n"; + print $individual->mother_Individual->name() + if($individual->mother_Individual()); + print $individual->father_Individual->name() + if($individual->father_Individual()); + + + +=head1 DESCRIPTION + +This is a class representing a single individual. An individual may be part +of one population or several. A pedigree may be constructed using the father_Individual +and mother_Individual attributes. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Individual; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Variation::Sample; + +our @ISA = ('Bio::EnsEMBL::Variation::Sample'); + +=head2 new + + Arg [-dbID] : + int - unique internal identifier + Arg [-ADAPTOR] : + Bio::EnsEMBL::Variation::DBSQL::IndividualAdaptor + Arg [-NAME] : + string - name of this individual + Arg [-DESCRIPTION] : + string description - description of this individual + Arg [-GENDER] : + string - must be one of 'Male', 'Female', 'Unknown' + Arg [-FATHER_INDIVIDUAL] : + Bio::EnsEMBL::Variation::Individual - the father of this individual + Arg [-MOTHER_INDIVIDUAL] : + Bio::EnsEMBL::Variation::Individual - the mother of this individual + Arg [-MOTHER_INDIVIDUAL_SAMPLE_ID] : + int - set the internal id of the mother individual so that the actual + mother Individual object can be retrieved on demand. + Arg [-FATHER_INDIVIDUAL_SAMPLE_ID]: + int - set the internal id of the mother individual so that the actual + mother Individual object can be retrieved on demand. + Arg [-TYPE_INDIVIDUAL]: + int - name for the type of the individual (fully or partly inbred, outbred or mutant + Arg [-TYPE_DESCRIPTION]: + string - description of the type of individual + Example : $individual = Bio::EnsEMBL::Variation::Individual->new + (-name => 'WI530.07', + -description => 'african', + -gender => 'Male', + -father_individual => $father_ind, + -mother_individual => $mother_ind, + -type_individual => 'outbred', + -type_description => 'a single organism which breeds freely'); + Description: Constructor Instantiates an Individual object. + Returntype : Bio::EnsEMBL::Variation::Individual + Exceptions : throw if gender arg is provided but not valid + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($dbID, $adaptor, $name, $desc, $display_flag, $gender, $father, $mother, $type_name, $type_desc, + $father_id, $mother_id) = + rearrange([qw(dbID adaptor name description display gender + father_individual mother_individual + type_individual type_description + father_individual_sample_id mother_individual_sample_id)], @_); + + if(defined($gender)) { + $gender = ucfirst(lc($gender)); + if($gender ne 'Male' && $gender ne 'Female' && $gender ne 'Unknown') { + throw('Gender must be one of "Male","Female","Unknown"'); + } + } + + if (defined($type_name)){ + $type_name = ucfirst(lc($type_name)); + if ($type_name ne 'Fully_inbred' && $type_name ne 'Partly_inbred' && $type_name ne 'Outbred' && $type_name ne 'Mutant'){ + throw('Type of individual must of one of: "fully_inbred", "partly_inbred", "outbred", "mutant"'); + } + } + + return bless { + 'dbID' => $dbID, + 'adaptor' => $adaptor, + 'name' => $name, + 'description' => $desc, + 'display' => $display_flag, + 'gender' => $gender, + 'father_individual' => $father, + 'mother_individual' => $mother, + 'type_individual' => $type_name, + 'type_description' => $type_desc, + '_mother_individual_sample_id' => $mother_id, + '_father_individual_sample_id' => $father_id, + }, $class; +} + + +=head2 type_individual + + Arg [1] : int $newval (optional) + The new value to set the type_individual attribute to + Example : $type_individual = $obj->type_individual(); + Description : Getter/Setter for the type_individual attribute + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub type_individual{ + my $self = shift; + if (@_){ + my $new_name = shift; + return $self->{'type_individual'} = $new_name; + } + return $self->{'type_individual'}; +} + +=head2 type_description + + Arg [1] : int $newval (optional) + The new value to set the type_description attribute to + Example : $type_description = $obj->type_description(); + Description : Getter/Setter for the type_description attribute + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub type_description{ + my $self = shift; + if (@_){ + my $new_desc = shift; + return $self->{'type_description'} = $new_desc; + } + return $self->{'type_description'}; +} + +=head2 gender + + Arg [1] : string $newval (optional) + The new value to set the gender attribute to + Example : $gender = $obj->gender() + Description: Getter/Setter for the gender attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub gender{ + my $self = shift; + + if(@_) { + my $gender = ucfirst(lc(shift)); + + if($gender ne 'Male' && $gender ne 'Female' && $gender ne 'Unknown') { + throw('Gender must be one of "Male","Female","Unknown"'); + } + $self->{'gender'} = $gender; + } + + return $self->{'gender'}; +} + + +=head2 display + + Arg [1] : string $newval (optional) + The new value to set the display attribute to + Example : $display = $obj->display() + Description: Getter/Setter for the display attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub display{ + my $self = shift; + + if(@_) { + my $display = uc(shift); + + if($display ne 'UNDISPLAYABLE' && $display ne 'REFERENCE' && $display ne 'DISPLAYABLE' && $display ne 'DEFAULT') { + throw('Display flag must be one of "REFERENCE", "DEFAULT", "DISPLAYABLE", "UNDISPLAYABLE"'); + } + + $self->{'display'} = $display; + } + + return $self->{'display'}; +} + + +=head2 get_all_Populations + + Args : none + Example : $pops = $ind->get_all_Populations(); + Description : Getter for the Populations for this Individual. Returns + empty list if there are none. + ReturnType : listref of Bio::EnsEMBL::Population + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_Populations{ + my $self = shift; + + if(!defined($self->{populations})) { + if (defined ($self->{'adaptor'})){ + my $pop_adaptor = $self->{'adaptor'}->db()->get_PopulationAdaptor(); + $self->{populations} = $pop_adaptor->fetch_all_by_Individual($self); + } + } + + return $self->{populations}; +} + + + +=head2 father_Individual + + Arg [1] : string $newval (optional) + The new value to set the father_Individual attribute to + Example : $father_Individual = $obj->father_Individual() + Description: Getter/Setter for the father of this Individual. If this + has not been set manually and this Individual has an attached + adaptor, an attempt will be made to lazy-load it from the + database. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub father_Individual{ + my $self = shift; + + if(@_) { + my $ind = shift; + if(defined($ind) && (!ref($ind) || + !$ind->isa('Bio::EnsEMBL::Variation::Individual'))) { + throw('Bio::EnsEMBL::Variation::Individual arg expected'); + } + if($ind->gender() eq 'Female') { + throw("Father individual may not have gender of Female"); + } + return $self->{'father_individual'} = $ind; + } + + # lazy-load mother if we can + if(!defined($self->{'father_individual'}) && $self->adaptor() && + defined($self->{'_father_individual_sample_id'})) { + $self->{'father_individual'} = + $self->adaptor->fetch_by_dbID($self->{'_father_individual_sample_id'}); + } + + return $self->{'father_individual'}; +} + + + +=head2 mother_Individual + + Arg [1] : string $newval (optional) + The new value to set the mother_Individual attribute to + Example : $mother_Individual = $obj->mother_Individual() + Description: Getter/Setter for the mother of this individual. If this + has not been set manually and this Individual has an attached + adaptor, an attempt will be made to lazy-load it from the + database. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub mother_Individual{ + my $self = shift; + + if(@_) { + my $ind = shift; + if(defined($ind) && (!ref($ind) || + !$ind->isa('Bio::EnsEMBL::Variation::Individual'))) { + throw('Bio::EnsEMBL::Variation::Individual arg expected'); + } + if($ind->gender() eq 'Male') { + throw("Mother individual may not have gender of Male"); + } + return $self->{'mother_individual'} = $ind; + } + + # lazy-load mother if we can + if(!defined($self->{'mother_individual'}) && $self->adaptor() && + defined($self->{'_mother_individual_sample_id'})) { + $self->{'mother_individual'} = + $self->adaptor->fetch_by_dbID($self->{'_mother_individual_sample_id'}); + } + + return $self->{'mother_individual'}; +} + + + +=head2 get_all_child_Individuals + + Arg [1] : none + Example : foreach my $c (@{$ind->get_all_child_Individuals}) { + print "Child: " $c->name(), "\n"; + } + Description: Retrieves all individuals from the database which are children + of this individual. This will only work if this Individual + object has been stored in the database and has an attached + adaptor. + Returntype : reference to list of Bio::EnsEMBL::Variation::Individual objects + Exceptions : warning if this object does not have an attached adaptor + Caller : general + Status : At Risk + +=cut + +sub get_all_child_Individuals { + my $self = shift; + + if(!$self->adaptor()) { + warning("Cannot retrieve child individuals without attached adaptor."); + } + + return $self->adaptor()->fetch_all_by_parent_Individual($self); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/IndividualGenotype.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/IndividualGenotype.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,163 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::IndividualGenotype +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::IndividualGenotype- Module representing the genotype +of a single individual at a single position + +=head1 SYNOPSIS + + print $genotype->variation()->name(), "\n"; + print $genotype->allele1(), '/', $genotype->allele2(), "\n"; + print $genotype->individual()->name(), "\n"; + +=head1 DESCRIPTION + +This is a class representing the genotype of a single diploid individual at +a specific position + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::IndividualGenotype; + +use Bio::EnsEMBL::Variation::Genotype; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Variation::Genotype); + + + +=head2 new + + Arg [-adaptor] : + Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor + Arg [-genotype] : + arrayref - arrayref of alleles making up this genotype (in haplotype order) + Arg [-allele2] : + string - One of the two alleles defining this genotype + Arg [-variation] : + Bio::EnsEMBL::Variation::Variation - The variation associated with this + genotype + Arg [-individual] : + Bio::EnsEMBL::Individual - The individual this genotype is for. + Example : $ind_genotype = Bio:EnsEMBL::Variation::IndividualGenotype->new( + -start => 100, + -end => 100, + -strand => 1, + -slice => $slice, + -genotype => ['A','T'], + -variation => $variation, + -individual => $ind + ); + Description: Constructor. Instantiates an IndividualGenotype object. + Returntype : Bio::EnsEMBL::Variation::IndividualGenotype + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($adaptor, $genotype, $var, $varid, $ssid, $ind) = + rearrange([qw(adaptor genotype variation _variation_id subsnp individual)],@_); + + if(defined($var) && + (!ref($var) || !$var->isa('Bio::EnsEMBL::Variation::Variation'))) { + throw("Bio::EnsEMBL::Variation::Variation argument expected"); + } + + if(defined($ind) && + (!ref($ind) || !$ind->isa('Bio::EnsEMBL::Variation::Individual'))) { + throw("Bio::EnsEMBL::Variation::Individual argument expected"); + } + + $self->{'adaptor'} = $adaptor; + $self->{'genotype'} = $genotype; + $self->{'individual'} = $ind; + $self->{'variation'} = $var; + $self->{'subsnp'} = $ssid; + $self->{'_variation_id'} = $varid unless defined $var; + + return $self; +} + + +=head2 individual + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Individual $ind + Example : $ind = $ind_genotype->individual(); + Description: Getter/Setter for the individual associated with this genotype + Returntype : Bio::EnsEMBL::Variation::Individual + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub individual { + my $self = shift; + if(@_) { + my $ind = shift; + if(defined($ind) && + (!ref($ind) || !$ind->isa('Bio::EnsEMBL::Variation::Individual'))) { + throw('Bio::EnsEMBL::Variation::Individual argument expected'); + } + return $self->{'individual'} = $ind; + } + + if(!defined($self->{individual}) && defined($self->{sample_id})) { + my $ia = $self->adaptor->db->get_IndividualAdaptor; + + if(defined($ia)) { + my $i = $ia->fetch_by_dbID($self->{sample_id}); + + if(defined($i)) { + $self->{individual} = $i; + delete $self->{sample_id}; + } + } + } + + return $self->{'individual'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/IndividualGenotypeFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/IndividualGenotypeFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,129 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::IndividualGenotype +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::IndividualGenotype- Module representing the genotype +of a single individual at a single position + +=head1 SYNOPSIS + + print $genotype->variation()->name(), "\n"; + print $genotype->allele1(), '/', $genotype->allele2(), "\n"; + print $genotype->individual()->name(), "\n"; + +=head1 DESCRIPTION + +This is a class representing the genotype of a single diploid individual at +a specific position + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::IndividualGenotypeFeature; + +use Bio::EnsEMBL::Variation::IndividualGenotype; +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Variation::IndividualGenotype Bio::EnsEMBL::Feature); + + + +=head2 new + + Arg [-adaptor] : + Bio::EnsEMBL::Variation::DBSQL::IndividualAdaptor + Arg [-START] : + see superclass constructor + Arg [-END] : + see superclass constructor + Arg [-STRAND] : + see superclass constructor + Arg [-SLICE] : + see superclass constructor + Arg [-allele1] : + string - One of the two alleles defining this genotype + Arg [-allele2] : + string - One of the two alleles defining this genotype + Arg [-variation] : + Bio::EnsEMBL::Variation::Variation - The variation associated with this + genotype + Arg [-individual] : + Bio::EnsEMBL::Individual - The individual this genotype is for. + Example : $ind_genotype = Bio:EnsEMBL::Variation::IndividualGenotype->new( + -start => 100, + -end => 100, + -strand => 1, + -slice => $slice, + -genotype => ['A','T'], + -variation => $variation, + -individual => $ind + ); + Description: Constructor. Instantiates an IndividualGenotype object. + Returntype : Bio::EnsEMBL::Variation::IndividualGenotype + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($adaptor, $allele1, $allele2, $var, $var_id, $ind) = + rearrange([qw(adaptor genotype variation _variation_id individual)],@_); + + if(defined($var) && (!ref($var) || !$var->isa('Bio::EnsEMBL::Variation::Variation'))) { + throw("Bio::EnsEMBL::Variation::Variation argument expected"); + } + + if(defined($ind) && (!ref($ind) || !$ind->isa('Bio::EnsEMBL::Variation::Individual'))) { + throw("Bio::EnsEMBL::Variation::Individual argument expected"); + } + + $self->{'adaptor'} = $adaptor; + $self->{'allele1'} = $allele1; + $self->{'allele2'} = $allele2; + $self->{'individual'} = $ind; + $self->{'variation'} = $var; + $self->{'_variation_id'} = $var_id; + + return $self; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicStructuralVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicStructuralVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,79 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::IntergenicStructuralVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::IntergenicStructuralVariationAllele; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use base qw(Bio::EnsEMBL::Variation::StructuralVariationOverlap); + +sub new { + my $class = shift; + + my %args = @_; + + for my $arg (keys %args) { + if (lc($arg) eq '-feature') { + throw("Intergenic variations do not have an associated feature!"); + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to ivas + map { bless $_, 'Bio::EnsEMBL::Variation::IntergenicStructuralVariationAllele' } + @{ $self->get_all_IntergenicStructuralVariationAlleles }; + + return $self; +} + +sub feature { + my $self = shift; + warning("Intergenic variants do not have an associated feature!") if @_; + return undef; +} + +sub add_IntergenicStructuralVariationAllele { + my $self = shift; + return $self->SUPER::add_StructuralVariationOverlapAllele(@_); +} + +sub get_reference_IntergenicStructuralVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_StructuralVariationOverlapAllele(@_); +} + +sub get_all_alternate_IntergenicStructuralVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_StructuralVariationOverlapAlleles(@_); +} + +sub get_all_IntergenicStructuralVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_StructuralVariationOverlapAlleles(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicStructuralVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicStructuralVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,53 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::IntergenicStructuralVariationAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); + +use base qw(Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele); + +sub new_fast { + my ($self, $hashref) = @_; + + # swap an intergenic_structural_variation argument for a variation_feature_overlap one + + if ($hashref->{intergenic_structural_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{intergenic_structural_variation}; + } + + # and call the superclass + + return $self->SUPER::new_fast($hashref); +} + +sub intergenic_structural_variation { + my $self = shift; + return $self->variation_feature_overlap(@_); +} + +sub get_all_OverlapConsequences { + return [ $OVERLAP_CONSEQUENCES{intergenic_variant} ]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,79 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::IntergenicVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::IntergenicVariationAllele; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlap); + +sub new { + my $class = shift; + + my %args = @_; + + for my $arg (keys %args) { + if (lc($arg) eq '-feature') { + throw("Intergenic variations do not have an associated feature!"); + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to ivas + map { bless $_, 'Bio::EnsEMBL::Variation::IntergenicVariationAllele' } + @{ $self->get_all_IntergenicVariationAlleles }; + + return $self; +} + +sub feature { + my $self = shift; + warning("Intergenic variants do not have an associated feature!") if @_; + return undef; +} + +sub add_IntergenicVariationAllele { + my $self = shift; + return $self->SUPER::add_VariationFeatureOverlapAllele(@_); +} + +sub get_reference_IntergenicVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_VariationFeatureOverlapAllele(@_); +} + +sub get_all_alternate_IntergenicVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_VariationFeatureOverlapAlleles(@_); +} + +sub get_all_IntergenicVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_VariationFeatureOverlapAlleles(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/IntergenicVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,53 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::IntergenicVariationAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::Utils::Constants qw(%OVERLAP_CONSEQUENCES); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele); + +sub new_fast { + my ($self, $hashref) = @_; + + # swap an intergenic_variation argument for a variation_feature_overlap one + + if ($hashref->{intergenic_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{intergenic_variation}; + } + + # and call the superclass + + return $self->SUPER::new_fast($hashref); +} + +sub intergenic_variation { + my $self = shift; + return $self->variation_feature_overlap(@_); +} + +sub get_all_OverlapConsequences { + return [ $OVERLAP_CONSEQUENCES{intergenic_variant} ]; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/LDFeatureContainer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/LDFeatureContainer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,482 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::LDFeatureContainer +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::LDFeatureContainer - A container with all the ld values for quick access + +=head1 SYNOPSIS + + # LDFeature Container representing all the LD values for a certain contig + $ldContainer = Bio::EnsEMBL::Variation::LDFeatureContainer->new + (-name => NT_085213, + -ldContainer => $ldhash, + -variation_features => $vfhash); + + ... + + #get the d_prime values for a certain pair of variation_features + d_prime = $ldContainer->get_d_prime($variation_feature_1,$variation_feature_2); + #get the list of variation in the container + $variations = $ldContainer->get_variations(); + + ... + +=head1 DESCRIPTION + +This is a class representing the LD information for a certain region +from the ensembl-variation database. +See B. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::LDFeatureContainer; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + +=head2 new + + Arg [-NAME] : + string - name of the feature object that the LD container refers to. (chr1,NT_08542,...) + + Arg [-LDCONTAINER] : + reference - hash containing all the LD information present, with the key + (variation_feature_1-variation_feature_2) to access the information + + Arg [-VARIATIONFEATURES] : + reference - hash containing all the Bio::EnsEMBL::Variation::VariationFeature objects that are present in the Container + + Example : + $ldContainer = Bio::EnsEMBL::Variation::LDFeatureContainer->new + (-name => 'chr1' + -ldContainer => {'variation_feature_1-variation_feature_2' => { 'sample_id_1' => + { 'd_prime' => 0.5, + 'r2' => 0.421, + 'sample_count' => 120 + }, + 'sample_id_2' => + { 'd_prime' => 0.3, + 'r2' => 0.321, + 'sample_count' => 35 + } + } + + } + -variationFeatures => hash of Bio::EnsEMBL::Variation::VariationFeature + ); + + + Description: Constructor. Instantiates a new LDFeatureContainer object. + Returntype : Bio::EnsEMBL::Variation::LDFeatureContainer + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($ldContainer,$name,$variationFeatures) = rearrange([qw(LDCONTAINER NAME VARIATIONFEATURES)], @_); + if (defined($ldContainer) && ref($ldContainer ne 'HASH')){ + throw("Reference to a hash object expected as a LDContainer"); + } + $ldContainer ||= {}; + + return bless {'name' => $name, + 'ldContainer' => $ldContainer, + 'variationFeatures' => $variationFeatures}, $class; +} + +=head2 name + + Arg [1] : string $newval (optional) + The new value to set the name attribute to + Example : $name = $obj->name() + Description: Getter/Setter for the name attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub name{ + my $self = shift; + return $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + +=head2 get_variations + + Example : $variations = $obj->get_variations() + Description : Gets all the variation objects contained in the LDFeatureContainer + ReturnType : list of Bio::EnsEMBL::Variation::Variation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_variations{ + my $self = shift; + my @variations; + + foreach my $variation_feature (keys %{$self->{'variationFeatures'}}){ + push @variations,$self->{'variationFeatures'}->{$variation_feature}->variation(); + } + return \@variations; +} + +=head2 get_r_square + + Arg [1] : Bio::EnsEMBL::Variation::VariationFeature $variationFeature + Arg [2] : Bio::EnsEMBL::Variation::VariationFeature $variationFeature + Arg [3] : (optional) int - sample_id of population you want to get the r_square value + Example : $r_square = $obj->get_r_square($vf1,$vf2,$sample_id); + Description : Get the r_square value for a pair of variation features in the given population. If no population is provided, + return the r_square for the default population with more sample counts (in case more than 1) + ReturnType : float + Exceptions : throw on incorrect arguments + Caller : general + Status : At Risk + +=cut + +sub get_r_square{ + my $self = shift; + my $variation_feature_1 = shift; + my $variation_feature_2 = shift; + my $sample_id = shift; + + $sample_id ||= 0; #in case no population provided, to avoid warning in the hash + my $r_square; + my $key; + + #check if the default poppulation has been calculated, otherwise, find it + if (! defined $self->{'_default_population'}){ + $self->{'_default_population'} = $self->_get_major_population; + } + #first of all, check that both arguments have been properly provided + if (!ref($variation_feature_1) || !$variation_feature_1->isa('Bio::EnsEMBL::Variation::VariationFeature') || !ref($variation_feature_2) || !$variation_feature_2->isa('Bio::EnsEMBL::Variation::VariationFeature')){ + throw("Bio::EnsEMBL::Variation::VariationFeature arguments expected"); + } + else{ + #check if the ldContainer does not contain pairwise information for the variation features provided + if (!exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()} && !exists $self->{'ldContainer'}->{$variation_feature_2->dbID() . '-' . $variation_feature_1->dbID()}){ + warning("variation features have no pairwise ld information"); + } + else{ + #find out the key in the ld Hash: vf1 - vf2 or vf2 - vf1 + if (exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()}){ + $key = $variation_feature_1->dbID() . '-' . $variation_feature_2->dbID(); + } + else{ + $key = $variation_feature_2->dbID() . '-' . $variation_feature_1->dbID(); + } + #and finally, if population provided or the only population + if (exists $self->{'ldContainer'}->{$key}->{$sample_id}){ + $r_square = $self->{'ldContainer'}->{$key}->{$sample_id}->{'r2'} + } + else{ + if (exists $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}){ + #there was no population provided, return the r_square for the default population + $r_square = $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}->{'r2'}; + } + else{ + warning("variation features have no pairwise ld information for default population ", $self->{'_default_population'}); + } + } + } + + } + return $r_square; +} + +=head2 get_d_prime + + Arg [1] : Bio::EnsEMBL::Variation::VariationFeature $variationFeature + Arg [2] : Bio::EnsEMBL::Variation::VariationFeature $variationFeature + Arg [3] : (optional) int - sample_id of population you want to get the d_prime value + Example : $d_prime = $obj->get_d_prime($vf1,$vf2,$sample_id); + Description : Get the d_prime value for a pair of variation features for a known or unknown population. In case of an unknown population, the default +poulation is used + ReturnType : float + Exceptions : throw on incorrect arguments + Caller : general + Status : At Risk + +=cut + +sub get_d_prime{ + my $self = shift; + my $variation_feature_1 = shift; + my $variation_feature_2 = shift; + my $sample_id = shift; + + $sample_id ||= 0; #in case no population provided, to avoid warning in the hash + my $d_prime; + my $key; + + if (! defined $self->{'_default_population'}){ + $self->{'_default_population'} = $self->_get_major_population; + } + #first of all, check that both arguments have been properly provided + if (!ref($variation_feature_1) || !$variation_feature_1->isa('Bio::EnsEMBL::Variation::VariationFeature') || !ref($variation_feature_2) || !$variation_feature_2->isa('Bio::EnsEMBL::Variation::VariationFeature')){ + throw("Bio::EnsEMBL::Variation::VariationFeature arguments expected"); + } + else{ + #check if the ldContainer does not contain pairwise information for the variation features provided + if (!exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()} && !exists $self->{'ldContainer'}->{$variation_feature_2->dbID() . '-' . $variation_feature_1->dbID()}){ + warning("variation features have no pairwise ld information"); + } + else{ + #find out the key in the ld Hash: vf1 - vf2 or vf2 - vf1 + if (exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()}){ + $key = $variation_feature_1->dbID() . '-' . $variation_feature_2->dbID(); + } + else{ + $key = $variation_feature_2->dbID() . '-' . $variation_feature_1->dbID(); + } + #and finally, if population provided or the only population + if (exists $self->{'ldContainer'}->{$key}->{$sample_id}){ + $d_prime = $self->{'ldContainer'}->{$key}->{$sample_id}->{'d_prime'}; + } + else{ + if (exists $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}){ + #there was no population provided, return the r_square for the default population + $d_prime = $self->{'ldContainer'}->{$key}->{$self->{'_default_population'}}->{'d_prime'}; + } + else{ + warning("variation features have no pairwise ld information for default population ", $self->{'_default_population'}); + } + } + } + } + return $d_prime; +} + + +=head2 get_all_ld_values + + Example : $ld_values = $obj->get_all_ld_values(); + Description : Get all the information contained in the LDFeatureContainer object + ReturnType : reference to list of hashes [{variation1 => Bio::EnsEMBL::Variation::VariationFeature, variation2=>Bio::EnsEMBL::Variation::VariationFeature, d_prime=>d_prime, r2=>r2, sample_count=>sample_count, sample_id=>population_sample_id}] + Exceptions : no exceptions + Caller : general + Status : At Risk + +=cut + + +sub get_all_ld_values{ + my $self = shift; + my @ld_values; #contains ALL the ld values in the container + + #the keys in the ldContainer hash + my $variation_feature_id_1; + my $variation_feature_id_2; + + if (! defined $self->{'_default_population'}){ + $self->{'_default_population'} = $self->_get_major_population; + } + foreach my $key_ld (keys %{$self->{'ldContainer'}}){ + my %ld_value; #contains a single ld value in the container {variation_feature variation_feature d_prime r2 snp_distance_count} + ($variation_feature_id_1, $variation_feature_id_2) = split /-/,$key_ld; #get the variation_features ids + if (exists $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}){ + #add the information to the ld_value hash + $ld_value{'variation1'} = $self->{'variationFeatures'}->{$variation_feature_id_1}; + $ld_value{'variation2'} = $self->{'variationFeatures'}->{$variation_feature_id_2}; + $ld_value{'d_prime'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'d_prime'}; + $ld_value{'r2'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'r2'}; + $ld_value{'sample_count'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'sample_count'}; + $ld_value{'sample_id'} = $self->{'_default_population'}; + push @ld_values, \%ld_value; + } + } + return \@ld_values; +} + + +=head2 get_all_r_square_values + + Example : $r_square_values = $obj->get_all_r_square_values(); + Description : Get all r_square values contained in the LDFeatureContainer object + ReturnType : reference to list of [{variation1=>Bio::EnsEMBL::Variation::VariationFeature, variation2=>Bio::EnsEMBL::Variation::VariationFeature, r2=>r2, sample_id=>population_sample_id}] + Exceptions : no exceptions + Caller : general + Status : At Risk + +=cut + + +sub get_all_r_square_values{ + my $self = shift; + my @r_squares; #contains ALL the r2 values in the container + + #the keys in the ldContainer hash + my $variation_feature_id_1; + my $variation_feature_id_2; + + if (! defined $self->{'_default_population'}){ + $self->{'_default_population'} = $self->_get_major_population; + } + foreach my $key_ld (keys %{$self->{'ldContainer'}}){ + my %r_square; #contains a single r2 value in the container {variation_feature r2 population_id} + ($variation_feature_id_1, $variation_feature_id_2) = split /-/,$key_ld; #get the variation_features ids + if (exists $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}){ + $r_square{'variation1'} = $self->{'variationFeatures'}->{$variation_feature_id_1}; + $r_square{'variation2'} = $self->{'variationFeatures'}->{$variation_feature_id_2}; + $r_square{'r2'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'r2'}; + $r_square{'sample_id'} = $self->{'_default_population'}; + #and add all the ld information to the final list + push @r_squares, \%r_square; + } + } + return \@r_squares; +} + +=head2 get_all_d_prime_values + + Example : $d_prime_values = $obj->get_all_d_prime_values(); + Description : Get all d_prime values contained in the LDFeatureContainer object + ReturnType : reference to list of [{variation1=>Bio::EnsEMBL::Variation::VariationFeature, variation2=>Bio::EnsEMBL::Variation::VariationFeature, d_prime=>d_prime, sample_id=>population_sample_id}] + Exceptions : no exceptions + Caller : general + Status : At Risk + +=cut + + +sub get_all_d_prime_values{ + my $self = shift; + my @d_primes; #contains ALL the d_prime values in the container + + #the keys in the ldContainer hash + my $variation_feature_id_1; + my $variation_feature_id_2; + + if (! defined $self->{'_default_population'}){ + $self->{'_default_population'} = $self->_get_major_population; + } + foreach my $key_ld (keys %{$self->{'ldContainer'}}){ + my %d_prime; #contains a single d_prime value in the container {variation_feature d_prime population_id} + ($variation_feature_id_1, $variation_feature_id_2) = split /-/,$key_ld; #get the variation_features ids + #add the information to the ld_value array + if (exists $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}){ + $d_prime{'variation1'} = $self->{'variationFeatures'}->{$variation_feature_id_1}; + $d_prime{'variation2'} = $self->{'variationFeatures'}->{$variation_feature_id_2}; + $d_prime{'d_prime'} = $self->{'ldContainer'}->{$key_ld}->{$self->{'_default_population'}}->{'d_prime'}; + $d_prime{'sample_id'} = $self->{'_default_population'}; + #and add all the ld information to the final list if exists the value + push @d_primes, \%d_prime; + } + + } + return \@d_primes; +} + +=head2 get_all_populations + + Arg [1] : (optional) Bio::EnsEMBL::Variation::VariationFeature $variationFeature + Arg [2] : (optional) Bio::EnsEMBL::Variation::VariationFeature $variationFeature + Example : $populations = $obj->get_all_populations($vf1,$vf2); + Description : If no arguments provided, returns ALL the populations present in the container. When 2 variation features provided, returns the + population/populations where these variation features occurs + ReturnType : ref to an array of int + Exceptions : throw on incorrect arguments + Caller : general + Status : At Risk + +=cut + +sub get_all_populations{ + my $self = shift; + my $variation_feature_1 = shift; + my $variation_feature_2 = shift; + my %populations; + my @populations; + my $key; + + #if no variation provided, return ALL the populations in the container + if (! defined($variation_feature_1) && ! defined($variation_feature_2)){ + foreach my $key (keys %{$self->{'ldContainer'}}){ + map {$populations{$_}++} keys %{$self->{'ldContainer'}->{$key}}; + } + @populations = keys %populations; + } + else{ + #first, check if both arguments have been properly provided + if (!ref($variation_feature_1) || !$variation_feature_1->isa('Bio::EnsEMBL::Variation::VariationFeature') || !ref($variation_feature_2) || !$variation_feature_2->isa('Bio::EnsEMBL::Variation::VariationFeature')){ + throw("Bio::EnsEMBL::Variation::VariationFeature arguments expected"); + } + #if the variation_features are correct, return the list of populations + else{ + #find out the key in the ld Hash: vf1 - vf2 or vf2 - vf1 + if (exists $self->{'ldContainer'}->{$variation_feature_1->dbID() . '-' . $variation_feature_2->dbID()}){ + $key = $variation_feature_1->dbID() . '-' . $variation_feature_2->dbID(); + } + else{ + $key = $variation_feature_2->dbID() . '-' . $variation_feature_1->dbID(); + } + @populations = keys %{$self->{'ldContainer'}->{$key}}; + } + } + + return \@populations; +} + +#returns from the container the population_id with the maximum number of pairwise_ld +sub _get_populations { + my $self = shift; + my %populations; + + foreach my $key (keys %{$self->{'ldContainer'}}){ + map {$populations{$_}++} keys %{$self->{'ldContainer'}->{$key}}; + } + my @sorted_populations = sort{$populations{$b} <=> $populations{$a}} keys %populations; + return @sorted_populations; +} + +sub _get_major_population { + my( $pop ) = $_[0]->_get_populations; + return $pop; +} +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/MotifFeatureVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/MotifFeatureVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,94 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::MotifFeatureVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::MotifFeatureVariationAllele; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); + +use base qw(Bio::EnsEMBL::Variation::RegulationVariation); + +sub new { + my $class = shift; + + my %args = @_; + + # swap a '-motif_feature' argument for a '-feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-motif_feature') { + $args{'-feature'} = delete $args{$arg}; + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to mfvas + map { bless $_, 'Bio::EnsEMBL::Variation::MotifFeatureVariationAllele' } + @{ $self->get_all_MotifFeatureVariationAlleles }; + + return $self; +} + +sub motif_feature_stable_id { + my $self = shift; + return $self->SUPER::feature_stable_id(@_); +} + +sub motif_feature { + my ($self, $mf) = @_; + return $self->SUPER::feature($mf, 'MotifFeature'); +} + +sub add_MotifFeatureVariationAllele { + my $self = shift; + return $self->SUPER::add_VariationFeatureOverlapAllele(@_); +} + +sub get_reference_MotifFeatureVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_VariationFeatureOverlapAllele(@_); +} + +sub get_all_alternate_MotifFeatureVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_VariationFeatureOverlapAlleles(@_); +} + +sub get_all_MotifFeatureVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_VariationFeatureOverlapAlleles(@_); +} + +sub _motif_feature_seq { + my $self = shift; + + my $mf = $self->motif_feature; + + my $mf_seq = $mf->{_variation_effect_feature_cache}->{seq} ||= $mf->seq; + + return $mf_seq; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/MotifFeatureVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/MotifFeatureVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,230 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::MotifFeatureVariationAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele); + +sub new_fast { + my ($self, $hashref) = @_; + + # swap a motif_feature_variation argument for a variation_feature_overlap one + + if ($hashref->{motif_feature_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{motif_feature_variation}; + } + + # and call the superclass + + return $self->SUPER::new_fast($hashref); +} + +=head2 motif_feature_variation + + Description: Get/set the associated MotifFeatureVariation + Returntype : Bio::EnsEMBL::Variation::MotifFeatureVariation + Status : At Risk + +=cut + +sub motif_feature_variation { + my ($self, $mfv) = shift; + assert_ref($mfv, 'Bio::EnsEMBL::Variation::MotifFeatureVariation') if $mfv; + return $self->variation_feature_overlap($mfv); +} + +=head2 motif_feature + + Description: Get/set the associated MotifFeature + Returntype : Bio::EnsEMBL::Funcgen::MotifFeature + Status : At Risk + +=cut + +sub motif_feature { + my $self = shift; + return $self->motif_feature_variation->motif_feature; +} + +=head2 motif_start + + Description: Get the (1-based) relative position of the variation feature start in the motif + Returntype : integer + Status : At Risk + +=cut + +sub motif_start { + + my $self = shift; + + my $mf = $self->motif_feature; + my $vf = $self->variation_feature; + + return undef unless defined $vf->seq_region_start && defined $mf->seq_region_start; + + my $mf_start = $vf->seq_region_start - $mf->seq_region_start + 1; + + # adjust if the motif is on the reverse strand + + $mf_start = $mf->binding_matrix->length - $mf_start + 1 if $mf->strand < 0; + + # check that we're in bounds + + return undef if $mf_start > $mf->length; + + return $mf_start; +} + +=head2 motif_end + + Description: Get the (1-based) relative position of the variation feature end in the motif + Returntype : integer + Status : At Risk + +=cut + +sub motif_end { + + my $self = shift; + + my $mf = $self->motif_feature; + my $vf = $self->variation_feature; + + return undef unless defined $vf->seq_region_end && defined $mf->seq_region_start; + + my $mf_end = $vf->seq_region_end - $mf->seq_region_start + 1; + + # adjust if the motif is on the reverse strand + + $mf_end = $mf->binding_matrix->length - $mf_end + 1 if $mf->strand < 0; + + # check that we're in bounds + + return undef if $mf_end < 1; + + return $mf_end; +} + +=head2 in_informative_position + + Description: Identify if the variation feature falls in a high information position of the motif + Returntype : boolean + Status : At Risk + +=cut + +sub in_informative_position { + my $self = shift; + + # we can only call this for true SNPs + + my $vf = $self->variation_feature; + + unless (($vf->start == $vf->end) && ($self->variation_feature_seq ne '-')) { + return undef; + } + + # get the 1-based position + + my $start = $self->motif_start; + + return undef unless defined $start && $start >= 1 && $start <= $self->motif_feature->length; + + return $self->motif_feature->binding_matrix->is_position_informative($start); +} + +=head2 motif_score_delta + + Description: Calculate the difference in motif score between the reference and variant sequences + Returntype : float + Status : At Risk + +=cut + +sub motif_score_delta { + + my $self = shift; + my $linear = shift; + + unless ($self->{motif_score_delta}) { + + my $vf = $self->motif_feature_variation->variation_feature; + my $mf = $self->motif_feature; + + my $allele_seq = $self->feature_seq; + my $ref_allele_seq = $self->motif_feature_variation->get_reference_MotifFeatureVariationAllele->feature_seq; + + if ($allele_seq eq '-' || + $ref_allele_seq eq '-' || + length($allele_seq) != length($ref_allele_seq)) { + # we can't call a score because the sequence will change length + return undef; + } + + my ($mf_start, $mf_end) = ($self->motif_start, $self->motif_end); + + return undef unless defined $mf_start && defined $mf_end; + + my $mf_seq = $self->motif_feature_variation->_motif_feature_seq; + my $mf_seq_length = length($mf_seq); + + # trim allele_seq + if($mf_start < 1) { + $allele_seq = substr($allele_seq, 1 - $mf_start); + $mf_start = 1; + } + + if($mf_end > $mf_seq_length) { + $allele_seq = substr($allele_seq, 0, $mf_seq_length - $mf_start + 1); + $mf_end = $mf_seq_length; + } + + my $var_len = length($allele_seq); + + return undef if $var_len > $mf->length; + + my $matrix = $mf->binding_matrix; + + # get the binding affinity of the reference sequence + my $ref_affinity = $matrix->relative_affinity($mf_seq, $linear); + + # splice in the variant sequence (0-based) + substr($mf_seq, $mf_start - 1, $var_len) = $allele_seq; + + # check length hasn't changed + return undef if length($mf_seq) != $mf_seq_length; + + # and get the affinity of the variant sequence + my $var_affinity = $matrix->relative_affinity($mf_seq, $linear); + + $self->{motif_score_delta} = ($var_affinity - $ref_affinity); + } + + return $self->{motif_score_delta}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/OverlapConsequence.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/OverlapConsequence.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,436 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::OverlapConsequence + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::OverlapConsequence; + + my $oc = Bio::EnsEMBL::Variation::OverlapConsequence->new( + -display_term => 'NON_SYNONYMOUS_CODING', + -SO_term => 'non_synonymous_codon', + -SO_accession => 'SO:0001583', + -NCBI_term => 'missense', + -feature_SO_term => 'mRNA', + -description => 'In coding sequence and results in an amino acid change in the encoded peptide sequence', + -predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::non_synonymous_codon', + -label => 'Non-synonymous coding', + -rank => 7, + -feature_class => 'Bio::EnsEMBL::Transcript', + ); + + if ($oc->predicate($transcript_variation_allele)) { + print "This allele is: ", $oc->display_term, "\n"; + } + +=head1 DESCRIPTION + +An OverlapConsequence represents the consequence of an allele of a VariationFeature overlapping +some other Ensembl Feature (and therefore applies to VariationFeatureOverlapAllele objects as these +represent just such an event). It contains various values that represent the consequence type, such +as the Sequence Ontology (SO) term and accession (which should always be unique), the Ensembl +display_term (which will not always be unique), the relative rank of this consequence when compared +to other consequences etc. It also contains a reference to a subroutine, referred to as the +'predicate', which if a called with a VariationFeatureOverlapAllele (or a subclass) as the first and +only argument, will return a true or false value if this consequence type applies to this allele. + +The list of OverlapConsequences used by Ensembl is defined in the Bio::EnsEMBL::Variation::Utils::Constants +module, and can be imported from there. + +=cut + +package Bio::EnsEMBL::Variation::OverlapConsequence; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Variation::Utils::VariationEffect; + +=head2 new + + Arg [-SO_ACCESSION] : + The Sequence Ontology accession for this consequence type + + Arg [-SO_TERM] : + The Sequence Ontology term for this consequence type + + Arg [-FEATURE_SO_TERM] : + The Sequence Ontology term for the feature affected by this consequence type + + Arg [-FEATURE_CLASS] : + The Ensembl class that represents the feature affected by this consequence type + + Arg [-VARIANT_FEATURE_CLASS] : + The Ensembl class that represents the variation feature this consequence applies to + + Arg [-PREDICATE] : + A reference to a subroutine that checks if this consequence type holds for + a given VariationFeatureOverlapAllele (or the name of such a subroutine) + + Arg [-RANK] : + The relative rank of this consequence type when compred to other OverlapConsequence + objects + + Arg [-DISPLAY_TERM] : + The Ensembl display term for this consequence type (used by default on the website) + + Arg [-NCBI_TERM] : + The NCBI term for this consequence type + + Arg [-DESCRIPTION] : + A freetext description of this consequence type (used on the website) + + Arg [-LABEL] : + A freetext label briefly describing this consequence type (used on the website) + + Arg [-IS_DEFAULT] : + A flag indicating if this is the default consequence type used when none other applies + (in Ensembl this currently set on the intergenic OverlapConsequence) + + Example : + my $oc = Bio::EnsEMBL::Variation::OverlapConsequence->new( + -display_term => 'NON_SYNONYMOUS_CODING', + -SO_term => 'non_synonymous_codon', + -SO_accession => 'SO:0001583', + -NCBI_term => 'missense', + -feature_SO_term => 'mRNA', + -description => 'In coding sequence and results in an amino acid change in the encoded peptide sequence', + -predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::non_synonymous_codon', + -label => 'Non-synonymous coding', + -rank => 7, + -tier => 1, + -feature_class => 'Bio::EnsEMBL::Transcript', + ); + + Description: Constructs a new OverlapConsequence instance + Returntype : A new Bio::EnsEMBL::Variation::OverlapConsequence instance + Exceptions : none + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my ( + $SO_accession, + $SO_term, + $feature_SO_term, + $feature_class, + $variant_feature_class, + $predicate, + $rank, + $tier, + $display_term, + $NCBI_term, + $description, + $label, + $is_default, + ) = rearrange([qw( + SO_ACCESSION + SO_TERM + FEATURE_SO_TERM + FEATURE_CLASS + VARIANT_FEATURE_CLASS + PREDICATE + RANK + TIER + DISPLAY_TERM + NCBI_TERM + DESCRIPTION + LABEL + IS_DEFAULT + )], @_); + + my $self = bless { + SO_accession => $SO_accession, + SO_term => $SO_term, + feature_SO_term => $feature_SO_term, + feature_class => $feature_class, + variant_feature_class => $variant_feature_class, + predicate => $predicate, + rank => $rank, + tier => $tier, + display_term => $display_term, + NCBI_term => $NCBI_term, + description => $description, + label => $label, + is_default => $is_default, + }, $class; + + return $self; +} + +sub new_fast { + my ($class, $hashref) = @_; + return bless $hashref, $class; +} + +=head2 SO_accession + + Arg [1] : (optional) accession to set + Description: Get/set the Sequence Ontology accession for this consequence type + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub SO_accession { + my ($self, $SO_accession) = @_; + $self->{SO_accession} = $SO_accession if $SO_accession; + return $self->{SO_accession}; +} + +=head2 SO_term + + Arg [1] : (optional) term to set + Description: Get/set the Sequence Ontology term for this consequence type + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub SO_term { + my ($self, $SO_term) = @_; + $self->{SO_term} = $SO_term if $SO_term; + return $self->{SO_term}; +} + +=head2 feature_SO_term + + Arg [1] : (optional) term to set + Description: Get/set the Sequence Ontology term for the feature affected by this consequence type + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub feature_SO_term { + my ($self, $feature_SO_term) = @_; + $self->{feature_SO_term} = $feature_SO_term if $feature_SO_term; + return $self->{feature_SO_term}; +} + +=head2 feature_class + + Arg [1] : (optional) class to set + Description: Get/set the Ensembl class representing the feature affected by this consequence type + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub feature_class { + my ($self, $feature_class) = @_; + $self->{feature_class} = $feature_class if $feature_class; + return $self->{feature_class} || ''; +} + +=head2 predicate + + Arg [1] : (optional) reference to subroutine (or the name of a subroutine) + Description: Get/set the predicate used to check if this consequence type applies + to a given VariationFeatureOverlapAllele. Currently, if you supply + a name (rather than a coderef), this subroutine must be found in the + Bio::EnsEMBL::Variation::Utils::VariationEffect module. + Returntype : coderef + Exceptions : throws if a name is supplied and the subroutine cannot be found in + the expected module + Status : At Risk + +=cut + +sub predicate { + my ($self, $predicate) = @_; + + $self->{predicate} = $predicate if $predicate; + + if ($self->{predicate} && ref $self->{predicate} ne 'CODE') { + my $name = $self->{predicate}; + + if (defined &$name && $name =~ /^Bio::EnsEMBL::Variation::Utils::VariationEffect/) { + $self->{predicate} = \&$name; + } + else { + throw("Can't find a subroutine called $name in the VariationEffect module?"); + } + } + + return $self->{predicate}; +} + +=head2 rank + + Arg [1] : (optional) rank to set + Description: Get/set the relative rank of this OverlapConsequence when compared to other + OverlapConsequence objects. This is used, for example, to determine the most + severe consequence of a VariationFeature. + Returntype : integer + Exceptions : none + Status : At Risk + +=cut + +sub rank { + my ($self, $rank) = @_; + $self->{rank} = $rank if $rank; + return $self->{rank}; +} + +=head2 tier + + Arg [1] : (optional) tier to set + Description: Get/set the tier this OverlapConsequence belongs to. Variations will be + assigned consequences in tier order; if a tier 1 consequence is assigned, + no tier 2 consequences will be checked/assigned. + Returntype : integer + Exceptions : none + Status : At Risk + +=cut + +sub tier { + my ($self, $tier) = @_; + $self->{tier} = $tier if $tier; + return $self->{tier}; +} + + +=head2 display_term + + Arg [1] : (optional) term to set + Description: Get/set the Ensembl display term for this consequence type. This is + used by default on the website. + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub display_term { + my ($self, $display_term) = @_; + $self->{display_term} = $display_term if $display_term; + return $self->{display_term} || $self->SO_term; +} + +=head2 NCBI_term + + Arg [1] : (optional) term to set + Description: Get/set the NCBI term for this consequence type + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub NCBI_term { + my ($self, $NCBI_term) = @_; + $self->{NCBI_term} = $NCBI_term if $NCBI_term; + return $self->{NCBI_term} || $self->SO_term; +} + +=head2 description + + Arg [1] : (optional) description to set + Description: Get/set the description for this consequence type. This is used on the + website and is intended to be a freetext description of this consequence. + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub description { + my ($self, $description) = @_; + $self->{description} = $description if $description; + return $self->{description}; +} + +=head2 label + + Arg [1] : (optional) label to set + Description: Get/set the label for this consequence type. This is used on the + website and is intended to be a short description of this consequence. + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub label { + my ($self, $label) = @_; + $self->{label} = $label if $label; + return $self->{label}; +} + +=head2 variant_feature_class + + Arg [1] : (optional) class as a atring + Description: Get/set the class of variant features that this consequence can apply to + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub variant_feature_class { + my ($self, $class) = @_; + $self->{variant_feature_class} = $class if $class; + return $self->{variant_feature_class}; +} + +=head2 is_default + + Arg [1] : (optional) flag + Description: Get/set a flag indicating if this is the default consequence type. + There should only be one default OverlapConsequence, in Ensembl this + flag is only set on the INTERGENIC OverlapConsequence object. + Returntype : bool + Exceptions : none + Status : At Risk + +=cut + +sub is_default { + my ($self, $is_default) = @_; + $self->{is_default} = $is_default if defined $is_default; + return $self->{is_default}; +} + +sub get_all_parent_SO_terms { + my ($self) = @_; + + if (my $adap = $self->{adaptor}) { + if (my $goa = $adap->db->get_SOTermAdaptor) { + + } + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Phenotype.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Phenotype.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,116 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Phenotype +# +# Copyright (c) 2011 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Phenotype - Ensembl representation of a phenotype. + +=head1 SYNOPSIS + + $study = Bio::EnsEMBL::Variation::Study->new(-DESCRIPTION => 'Hemostatic factors and hematological phenotypes'); + +=head1 DESCRIPTION + +This is a class representing a phenotype from the ensembl-variation database. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Phenotype; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +our @ISA = ('Bio::EnsEMBL::Storable'); + + +=head2 new + Arg [-DESCRIPTION] : + phenotype description + + Example : + + $phenotype = Bio::EnsEMBL::Variation::Phenotype->new(-DESCRIPTION => 'Hemostatic factors and hematological phenotypes'); + + Description: Constructor. Instantiates a new Phenotype object. + Returntype : Bio::EnsEMBL::Variation::Phenotype + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + my ($dbID, $description) = rearrange([qw(dbID DESCRIPTION)], @_); + $self = { + 'dbID' => $dbID, + 'description' => $description, + }; + return bless $self, $class; +} + +=head2 id + + Example : $name = $obj->id() + Description: Getter/Setter for the id attribute + Returntype : integer + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub id { + my $self = shift; + return $self->{'dbID'} = shift if(@_); + return $self->{'dbID'}; +} + +=head2 description + + Example : $name = $obj->description() + Description: Getter/Setter for the description attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub description { + my $self = shift; + return $self->{'description'} = shift if(@_); + return $self->{'description'}; +} + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/BaseVariationProcess.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/BaseVariationProcess.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,108 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::Pipeline::TranscriptFileAdaptor; + +use Bio::EnsEMBL::Hive::AnalysisJob; + +use base qw(Bio::EnsEMBL::Hive::Process); + +sub param { + my $self = shift; + + unless ($self->input_job) { + # if we don't have an input job, add a dummy one (used when we're not + # running as part of a pipeline proper) + $self->input_job(Bio::EnsEMBL::Hive::AnalysisJob->new); + } + + return $self->SUPER::param(@_); +} + +sub required_param { + my $self = shift; + my $param_name = shift; + + my $param_value = $self->param($param_name, @_); + + die "$param_name is a required parameter" unless defined $param_value; + + return $param_value; +} + +sub get_transcript_file_adaptor { + my $self = shift; + my $transcripts = shift; + + unless ($self->{tfa}) { + $self->{tfa} = Bio::EnsEMBL::Variation::Pipeline::TranscriptFileAdaptor->new( + fasta_file => $self->param('fasta_file'), + transcripts => $transcripts, + ); + } + + return $self->{tfa}; +} + +sub get_species_adaptor { + my ($self, $group) = @_; + + my $species = $self->required_param('species'); + + return $self->get_adaptor($species, $group); +} + +sub get_adaptor { + my ($self, $species, $group) = @_; + + my $dba; + + eval { + $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, $group); + }; + + unless (defined $dba) { + $self->_load_registry(); + $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, $group); + } + + unless (defined $dba) { + die "Failed to a get DBA for $species and group $group"; + } + + return $dba; +} + +sub _load_registry { + my ($self) = @_; + + my $reg_file = $self->required_param('ensembl_registry'); + + Bio::EnsEMBL::Registry->load_all($reg_file, 0, 1); + + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/FinishVariationClass.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/FinishVariationClass.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,43 @@ +package Bio::EnsEMBL::Variation::Pipeline::FinishVariationClass; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +my $DEBUG = 0; + +sub run { + my $self = shift; + + my $temp_var_table = $self->required_param('temp_var_table'); + + my $temp_var_feat_table = $self->required_param('temp_var_feat_table'); + + my $var_dba = $self->get_species_adaptor('variation'); + + my $dbc = $var_dba->dbc; + + # copy the attribs across to the real tables + + $dbc->do(qq{ + UPDATE variation_feature vf, $temp_var_feat_table tvf + SET vf.class_attrib_id = tvf.class_attrib_id + WHERE vf.variation_feature_id = tvf.variation_feature_id + }); + + $dbc->do(qq{ + UPDATE variation v, $temp_var_table tv + SET v.class_attrib_id = tv.class_attrib_id + WHERE v.variation_id = tv.variation_id + }); + + # and get rid of the temp tables + $dbc->do(qq{DROP TABLE $temp_var_table}); + $dbc->do(qq{DROP TABLE $temp_var_feat_table}); + + return; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/InitTranscriptEffect.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/InitTranscriptEffect.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,113 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::InitTranscriptEffect; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +my $DEBUG = 0; + +sub fetch_input { + + my $self = shift; + + my $include_lrg = $self->param('include_lrg'); + + my $core_dba = $self->get_species_adaptor('core'); + my $var_dba = $self->get_species_adaptor('variation'); + + my $dbc = $var_dba->dbc(); + + my $ga = $core_dba->get_GeneAdaptor or die "Failed to get gene adaptor"; + + my @transcript_output_ids; + + my $gene_count = 0; + + # fetch all the regular genes + + my @genes = @{ $ga->fetch_all }; + + if ($include_lrg) { + # fetch the LRG genes as well + + push @genes, @{ $ga->fetch_all_by_biotype('LRG_gene') } + } + + for my $gene (@genes) { + $gene_count++; + + for my $transcript (@{ $gene->get_all_Transcripts }) { + + push @transcript_output_ids, { + transcript_stable_id => $transcript->stable_id, + }; + } + if ($DEBUG) { + last if $gene_count >= 100; + } + } + + if (@transcript_output_ids) { + + # check we actually found some transcripts + + # truncate the table because we don't want duplicates + + $dbc->do("TRUNCATE TABLE transcript_variation"); + + # disable the indexes on the table we're going to insert into as + # this significantly speeds up the TranscriptEffect process + + $dbc->do("ALTER TABLE transcript_variation DISABLE KEYS"); + + $self->param('transcript_output_ids', \@transcript_output_ids); + + $self->param( + 'rebuild_indexes', [{ + tables => ['transcript_variation'], + }] + ); + + # we need to kick off the update_vf analysis as well, + # but it doesn't have any parameters we need to set here + + $self->param( + 'update_vf', [{}] + ); + } +} + +sub write_output { + my $self = shift; + + if (my $transcript_output_ids = $self->param('transcript_output_ids')) { + $self->dataflow_output_id($self->param('rebuild_indexes'), 2); + $self->dataflow_output_id($self->param('update_vf'), 3); + $self->dataflow_output_id($transcript_output_ids, 4); + } + + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/InitVariationClass.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/InitVariationClass.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,109 @@ +package Bio::EnsEMBL::Variation::Pipeline::InitVariationClass; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +use POSIX qw(ceil); + +my $DEBUG = 0; + +sub fetch_input { + + my $self = shift; + + my $num_chunks = $self->required_param('num_chunks'); + + my $var_dba = $self->get_species_adaptor('variation'); + + my $aa = $var_dba->get_AttributeAdaptor; + + my $dbc = $var_dba->dbc(); + + # first set everything in variation (except HGMDs) to 'sequence_alteration' by default + # because sometimes we miss them because there is no variation_feature + # or any alleles (though this should become unnecessary as we move to the + # new approach to failing for all species) + + my $default_attrib_id = $aa->attrib_id_for_type_value('SO_term', 'sequence_alteration'); + + die "No attrib_id for 'sequence_alteration'" unless defined $default_attrib_id; + + $dbc->do(qq{ + UPDATE variation v, source s + SET v.class_attrib_id = $default_attrib_id + WHERE v.source_id = s.source_id + AND s.name != 'HGMD-PUBLIC' + }); + + # now create some temp tables to store the class attribs + + my $temp_var_table = 'temp_variation_class'; + my $temp_var_feat_table = 'temp_variation_feature_class'; + + $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_table}); + $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_feat_table}); + + $dbc->do(qq{CREATE TABLE $temp_var_table LIKE variation}); + $dbc->do(qq{CREATE TABLE $temp_var_feat_table LIKE variation_feature}); + + $dbc->do(qq{ALTER TABLE $temp_var_table DISABLE KEYS}); + $dbc->do(qq{ALTER TABLE $temp_var_feat_table DISABLE KEYS}); + + # now get an ordered list of all the variation_ids + + my $get_var_ids_sth = $dbc->prepare(qq{ + SELECT variation_id FROM variation ORDER BY variation_id + }); + + $get_var_ids_sth->execute; + + my @var_ids; + + while (my ($var_id) = $get_var_ids_sth->fetchrow_array) { + push @var_ids, $var_id; + } + + # and split them up into as many chunks as requested + + my $num_vars = scalar @var_ids; + + my $chunk_size = ceil($num_vars / $num_chunks); + + my @output_ids; + + while (@var_ids) { + + my $start = $var_ids[0]; + my $stop = $chunk_size <= $#var_ids ? $var_ids[$chunk_size - 1] : $var_ids[$#var_ids]; + + splice(@var_ids, 0, $chunk_size); + + push @output_ids, { + variation_id_start => $start, + variation_id_stop => $stop, + temp_var_table => $temp_var_table, + temp_var_feat_table => $temp_var_feat_table, + }; + } + + $self->param('chunk_output_ids', \@output_ids); + + $self->param( + 'finish_var_class', [{ + temp_var_table => $temp_var_table, + temp_var_feat_table => $temp_var_feat_table, + }] + ); +} + +sub write_output { + my $self = shift; + + $self->dataflow_output_id($self->param('finish_var_class'), 1); + $self->dataflow_output_id($self->param('chunk_output_ids'), 2); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/BaseProteinFunction.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/BaseProteinFunction.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,56 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::BaseProteinFunction; + +use strict; +use warnings; + +use Bio::SimpleAlign; +use Bio::LocatableSeq; + +use base ('Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess'); + +sub get_protein_sequence { + my ($self, $md5) = @_; + + my $tfa = $self->get_transcript_file_adaptor; + + my $fasta = $tfa->get_translation_fasta($md5); + + unless (length($fasta) > 34) { + # sleep in case it's some race condition and then try again + sleep(rand(5)); + $fasta = $tfa->get_translation_fasta($md5); + + unless (length($fasta) > 34) { + die "$md5 looks wierdly short!"; + } + } + + # strip out fasta header etc. + + $fasta =~ s/>.*\n//m; + $fasta =~ s/\s//mg; + + die "No peptide for $md5?" unless length($fasta) > 0; + + return $fasta; +} + +sub get_stable_id_for_md5 { + my ($self, $md5) = @_; + + my $var_dba = $self->get_species_adaptor('variation'); + + my $get_stable_id_sth = $var_dba->prepare(qq{ + SELECT stable_id + FROM translation_mapping + WHERE md5 = ? + }); + + $get_stable_id_sth->execute($md5); + + my ($stable_id) = $get_stable_id_sth->fetchrow_array; + + return $stable_id; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/Constants.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/Constants.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,12 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::Constants; + +use base qw(Exporter); + +our @EXPORT_OK = qw(FULL UPDATE NONE); + +use constant FULL => 1; +use constant UPDATE => 2; +use constant NONE => 3; + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/InitJobs.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/InitJobs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,176 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::InitJobs; + +use strict; + +use Bio::EnsEMBL::Registry; + +use Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::Constants qw(FULL UPDATE NONE); + +use Digest::MD5 qw(md5_hex); + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +sub fetch_input { + + my $self = shift; + + my $sift_run_type = $self->required_param('sift_run_type'); + my $pph_run_type = $self->required_param('pph_run_type'); + my $include_lrg = $self->param('include_lrg'); + + my $core_dba = $self->get_species_adaptor('core'); + my $var_dba = $self->get_species_adaptor('variation'); + + # fetch all the transcripts from the core DB + + my @transcripts; + + if ($self->param('debug_mode')) { + my $ga = $core_dba->get_GeneAdaptor or die "Failed to get gene adaptor"; + + @transcripts = grep { $_->translation } @{ $ga->fetch_all_by_external_name('BRCA1')->[0]->get_all_Transcripts }; + } + else { + my $sa = $core_dba->get_SliceAdaptor or die "Failed to get slice adaptor"; + + for my $slice (@{ $sa->fetch_all('toplevel', undef, 1, undef, ($include_lrg ? 1 : undef)) }) { + for my $gene (@{ $slice->get_all_Genes(undef, undef, 1) }) { + for my $transcript (@{ $gene->get_all_Transcripts }) { + if (my $translation = $transcript->translation) { + push @transcripts, $transcript; + } + } + } + } + } + + # store a table mapping each translation stable ID to its corresponding MD5 + + $var_dba->dbc->do(qq{DROP TABLE IF EXISTS translation_mapping}); + + $var_dba->dbc->do(qq{ + CREATE TABLE translation_mapping ( + stable_id VARCHAR(255), + md5 CHAR(32), + PRIMARY KEY (stable_id), + KEY md5_idx (md5) + ) + }); + + my $add_mapping_sth = $var_dba->prepare(qq{ + INSERT IGNORE INTO translation_mapping (stable_id, md5) VALUES (?,?) + }); + + my $add_md5_sth = $var_dba->prepare(qq{ + INSERT IGNORE INTO translation_md5 (translation_md5) VALUES (?) + }); + + # build a hash mapping MD5s for our set of translations to their peptide sequences + # and also write each stable ID - MD5 mapping to the database + + my %unique_translations; + + for my $tran (@transcripts) { + + my $tl = $tran->translation; + + my $seq = $tl->seq; + + my $md5 = md5_hex($seq); + + $unique_translations{$md5} = $seq; + + $add_mapping_sth->execute($tl->stable_id, $md5); + } + + # work out which translations we need to run for which analysis + + my @translation_md5s = keys %unique_translations; + + my @sift_md5s; + my @pph_md5s; + + # if we're doing full runs, then we just use all the translations + + @sift_md5s = @translation_md5s if $sift_run_type == FULL; + @pph_md5s = @translation_md5s if $pph_run_type == FULL; + + # if we're updating we need to check which translations already have predictions + + if ($sift_run_type == UPDATE || $pph_run_type == UPDATE) { + + my $var_dbh = $var_dba->dbc->db_handle; + + my $get_existing_sth = $var_dbh->prepare(qq{ + SELECT t.translation_md5, a.value, p.prediction_matrix + FROM translation_md5 t, attrib a, protein_function_predictions p + WHERE t.translation_md5_id = p.translation_md5_id + AND a.attrib_id = p.analysis_attrib_id + }); + + # store the set of existing MD5s in a hash + + $get_existing_sth->execute; + + my $existing_md5s; + + while ( my ($md5, $analysis, $preds) = $get_existing_sth->fetchrow_array ) { + # there are 2 polyphen analyses, but we only want to track one + $analysis = 'pph' if $analysis =~ /polyphen/; + # just record true if we already have predictions for each translation + $existing_md5s->{$md5}->{$analysis} = length($preds) > 0; + } + + # exclude any translations MD5s we find in the database + + @sift_md5s = grep { ! $existing_md5s->{$_}->{sift} } @translation_md5s if $sift_run_type == UPDATE; + @pph_md5s = grep { ! $existing_md5s->{$_}->{pph} } @translation_md5s if $pph_run_type == UPDATE; + } + + # create a FASTA dump of all the necessary translation sequences + + # work out the set of all unique MD5s for sift and polyphen + + my %required_md5s = map { $_ => 1 } (@sift_md5s, @pph_md5s); + + my $fasta = $self->required_param('fasta_file'); + + open my $FASTA, ">$fasta" or die "Failed to open $fasta for writing"; + + # get rid of any existing index file + + if (-e "$fasta.fai") { + unlink "$fasta.fai" or die "Failed to delete fasta index file"; + } + + # dump out each peptide sequence + + for my $md5 (keys %required_md5s) { + my $seq = $unique_translations{$md5}; + $seq =~ s/(.{80})/$1\n/g; + # get rid of any trailing newline + chomp $seq; + print $FASTA ">$md5\n$seq\n"; + } + + close $FASTA; + + # set up our list of output ids + + $self->param('pph_output_ids', [ map { {translation_md5 => $_} } @pph_md5s ]); + $self->param('sift_output_ids', [ map { {translation_md5 => $_} } @sift_md5s ]); +} + +sub write_output { + my $self = shift; + + unless ($self->param('pph_run_type') == NONE) { + $self->dataflow_output_id($self->param('pph_output_ids'), 2); + } + + unless ($self->param('sift_run_type') == NONE) { + $self->dataflow_output_id($self->param('sift_output_ids'), 3); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/ProteinFunction_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/ProteinFunction_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,243 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::ProteinFunction_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +use Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::Constants qw(FULL UPDATE NONE); + +sub default_options { + my ($self) = @_; + + return { + + # NB: You can find some documentation on this pipeline on confluence here: + # + # http://www.ebi.ac.uk/seqdb/confluence/display/EV/Protein+function+pipeline + + # Pipeline wide settings + + # If the debug_mode flag is set to 1 then we will only run the pipeline on a single gene + # (currently set to BRCA1 in InitJobs.pm), this is useful when testing new installations + # of sift and polyphen, alterations to the pipeline etc. When set to 0 the full pipeline + # will be run + + debug_mode => 0, + + species => 'Homo_sapiens', + + # the location of your ensembl checkout, the hive looks here for SQL files etc. + + ensembl_cvs_root_dir => $ENV{'HOME'}.'/ensembl-branches/HEAD/', + + pipeline_name => 'protein_function', + + pipeline_dir => '/lustre/scratch101/ensembl/'.$ENV{USER}.'/'.$self->o('pipeline_name'), + + species_dir => $self->o('pipeline_dir').'/'.$self->o('species'), + + # directory used for the hive's own output files + + output_dir => $self->o('species_dir').'/hive_output', + + # this registry file should contain connection details for the core, variation + # and compara databases (if you are using compara alignments). If you are + # doing an UPDATE run for either sift or polyphen, the variation database + # should have existing predictions in the protein_function_predictions table + + ensembl_registry => $self->o('pipeline_dir').'/ensembl.registry', + + # peptide sequences for all unique translations for this species will be dumped to this file + + fasta_file => $self->o('species_dir').'/'.$self->o('species').'_translations.fa', + + # set this flag to include LRG translations in the analysis + + include_lrg => 1, + + # connection details for the hive's own database + + pipeline_db => { + -host => 'ens-variation', + -port => 3306, + -user => 'ensadmin', + -pass => $self->o('password'), + -dbname => $ENV{USER}.'_'.$self->o('pipeline_name').'_hive', + }, + + hive_use_triggers => 0, + + # configuration for the various resource options used in the pipeline + + default_lsf_options => '-R"select[mem>2000] rusage[mem=2000]" -M2000000', + urgent_lsf_options => '-q yesterday -R"select[mem>2000] rusage[mem=2000]" -M2000000', + highmem_lsf_options => '-q long -R"select[mem>8000] rusage[mem=8000]" -M8000000', + medmem_lsf_options => '-R"select[mem>4000] rusage[mem=4000]" -M4000000', + long_lsf_options => '-q long -R"select[mem>2000] rusage[mem=2000]" -M2000000', + + # Polyphen specific parameters + + # location of the software + + pph_dir => '/software/ensembl/variation/polyphen-2.2.2', + + # where we will keep polyphen's working files etc. as the pipeline runs + + pph_working => $self->o('species_dir').'/polyphen_working', + + # specify the Weka classifier models here, if you don't want predictions from + # one of the classifier models set the value to the empty string + + humdiv_model => $self->o('pph_dir').'/models/HumDiv.UniRef100.NBd.f11.model', + + humvar_model => $self->o('pph_dir').'/models/HumVar.UniRef100.NBd.f11.model', + + # the run type for polyphen (& sift) can be one of FULL to run predictions for + # all translations regardless of whether we already have predictions in the + # database, NONE to exclude this analysis, or UPDATE to run predictions for any + # new or changed translations in the database. The variation database specified + # in the registry above is used to identify translations we already have + # predictions for. + + pph_run_type => UPDATE, + + # set this flag to use compara protein families as the alignments rather than + # polyphen's own alignment pipeline + + pph_use_compara => 0, + + # the maximum number of workers to run in parallel for polyphen and weka. Weka + # runs much faster then polyphen so you don't need as many workers. + + pph_max_workers => 500, + + weka_max_workers => 20, + + # Sift specific parameters + + # location of the software + + sift_dir => '/software/ensembl/variation/sift4.0.5', + + sift_working => $self->o('species_dir').'/sift_working', + + # the location of blastpgp etc. + + ncbi_dir => '/software/ncbiblast/bin', + + # the protein database used to build alignments if you're not using compara + + blastdb => '/data/blastdb/Ensembl/variation/sift4.0.5/uniprot/swiss_trembl.uni', + + # the following parameters mean the same as for polyphen + + sift_run_type => UPDATE, + + sift_use_compara => 0, + + sift_max_workers => 500, + }; +} + +sub pipeline_create_commands { + my ($self) = @_; + return [ + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 0).q{-e 'DROP DATABASE IF EXISTS }.$self->o('pipeline_db', '-dbname').q{'}, + @{$self->SUPER::pipeline_create_commands}, + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).q{-e 'INSERT INTO meta (meta_key, meta_value) VALUES ("hive_output_dir", "}.$self->o('output_dir').q{")'}, + ]; +} + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => $self->o('default_lsf_options') }, + 1 => { -desc => 'urgent', 'LSF' => $self->o('urgent_lsf_options') }, + 2 => { -desc => 'highmem', 'LSF' => $self->o('highmem_lsf_options') }, + 3 => { -desc => 'long', 'LSF' => $self->o('long_lsf_options') }, + 4 => { -desc => 'medmem', 'LSF' => $self->o('medmem_lsf_options') }, + }; +} + +sub pipeline_analyses { + my ($self) = @_; + + my @common_params = ( + fasta_file => $self->o('fasta_file'), + ensembl_registry => $self->o('ensembl_registry'), + species => $self->o('species'), + debug_mode => $self->o('debug_mode'), + ); + + return [ + { -logic_name => 'init_jobs', + -module => 'Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::InitJobs', + -parameters => { + sift_run_type => $self->o('sift_run_type'), + pph_run_type => $self->o('pph_run_type'), + include_lrg => $self->o('include_lrg'), + @common_params, + }, + -input_ids => [{}], + -rc_id => 2, + -flow_into => { + 2 => [ 'run_polyphen' ], + 3 => [ 'run_sift' ], + }, + }, + + { -logic_name => 'run_polyphen', + -module => 'Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::RunPolyPhen', + -parameters => { + pph_dir => $self->o('pph_dir'), + pph_working => $self->o('pph_working'), + use_compara => $self->o('pph_use_compara'), + @common_params, + }, + -max_retry_count => 0, + -input_ids => [], + -hive_capacity => $self->o('pph_max_workers'), + -rc_id => 2, + -flow_into => { + 2 => [ 'run_weka' ], + }, + }, + + { -logic_name => 'run_weka', + -module => 'Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::RunWeka', + -parameters => { + pph_dir => $self->o('pph_dir'), + humdiv_model => $self->o('humdiv_model'), + humvar_model => $self->o('humvar_model'), + @common_params, + }, + -max_retry_count => 0, + -input_ids => [], + -hive_capacity => $self->o('weka_max_workers'), + -rc_id => 0, + -flow_into => {}, + }, + + { -logic_name => 'run_sift', + -module => 'Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::RunSift', + -parameters => { + sift_dir => $self->o('sift_dir'), + sift_working => $self->o('sift_working'), + ncbi_dir => $self->o('ncbi_dir'), + blastdb => $self->o('blastdb'), + use_compara => $self->o('sift_use_compara'), + @common_params, + }, + -failed_job_tolerance => 10, + -max_retry_count => 0, + -input_ids => [], + -hive_capacity => $self->o('sift_max_workers'), + -rc_id => 4, + -flow_into => {}, + }, + ]; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/RunPolyPhen.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/RunPolyPhen.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,185 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::RunPolyPhen; + +use strict; + +use Digest::MD5 qw(md5_hex); +use File::Path qw(make_path remove_tree); +use Data::Dumper; + +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw(@ALL_AAS $AA_LOOKUP); +use Bio::EnsEMBL::Variation::Utils::ComparaUtils qw(dump_alignment_for_polyphen); + +use base ('Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::BaseProteinFunction'); + +my $PERSIST = 1; +my $MAX_PSIC_SEQS = 8190; +my $MAX_PSIC_SEQLEN = 409650; + +sub run { + my $self = shift; + + my $translation_md5 = $self->required_param('translation_md5'); + + my $pph_dir = $self->required_param('pph_dir'); + my $working_dir = $self->required_param('pph_working'); + + my $dir = substr($translation_md5, 0, 2); + my $output_dir = "$working_dir/$dir/$translation_md5"; + + my @to_delete; + + my @pph_dirs = qw{alignments blastfiles profiles structures lock source_files}; + my @pipe_dirs = qw{features errors}; + + my $tarball = 'scratch.tgz'; + + if ( (-e $output_dir) && (!$PERSIST) ) { + # if we're not being persistent delete the old directory first + chdir $output_dir or die "Failed to chdir to $output_dir"; + my $err; + remove_tree(@pph_dirs, @pipe_dirs, {error => \$err}); + die "remove_tree failed: ".Dumper($err) if $err && @$err; + } + + unless (-d $output_dir) { + my $err; + make_path($output_dir, {error => \$err}); + die "make_path failed: ".Dumper($err) if $err && @$err; + } + + chdir $output_dir or die "Failed to chdir to $output_dir"; + + if (-e "$output_dir/$tarball") { + system("tar zxvf $tarball > /dev/null") == 0 + or die "Failed to untar $output_dir/$tarball: $!"; + } + else { + my $err; + make_path(@pph_dirs, @pipe_dirs, {error => \$err}); + die "make_path failed: ".Dumper($err) if $err && @$err; + } + + my $subs_file = "${output_dir}/source_files/subs.txt"; + my $protein_file = "${output_dir}/source_files/protein.fa"; + my $aln_file = "${output_dir}/alignments/${translation_md5}.aln"; + my $output_file = "${output_dir}/features/features.txt"; + my $error_file = "${output_dir}/errors/polyphen.err"; + + # dump the protein sequence, + + my $peptide = $self->get_protein_sequence($translation_md5); + + open (PROTEIN, ">$protein_file") or die "Failed to open file for protein $protein_file: $!"; + + my $pep_copy = $peptide; + $pep_copy =~ s/(.{80})/$1\n/g; + chomp $pep_copy; + print PROTEIN ">$translation_md5\n$pep_copy\n"; + + close PROTEIN; + + # and the substitutions. + + open (SUBS, ">$subs_file") or die "Failed to open file for SUBS $subs_file: $!"; + + #push @to_delete, $subs_file, $protein_file; + + my @aas = split //, $peptide; + + my $idx = 0; + + for my $ref (@aas) { + $idx++; + + # ignore any non standard amino acids, e.g. X + + next unless defined $AA_LOOKUP->{$ref}; + + for my $alt (@ALL_AAS) { + unless ($ref eq $alt) { + print SUBS join ("\t", + $translation_md5, + $idx, + $ref, + $alt + ), "\n"; + } + } + } + + close SUBS; + close PROTEIN; + + if ($self->param('use_compara')) { + + my $stable_id = $self->get_stable_id_for_md5($translation_md5); + + # if we're using compara alignments then dump the alignment + # to the alignment file, PolyPhen will check if it exists + # and use it in place of its own alignment if so + + eval { + dump_alignment_for_polyphen($stable_id, $aln_file); + }; + + if ($@) { + die "Failed to fetch a compara alignment for $stable_id: $@"; + } + } + + # now run polyphen itself, disconnecting for the duration + + $self->dbc->disconnect_when_inactive(1); + + # use -A option to disable polyphen's own LSF support (which conflicts with the hive) + my $cmd = "$pph_dir/bin/run_pph.pl -A -d $output_dir -s $protein_file $subs_file 1> $output_file 2> $error_file"; + + system($cmd) == 0 or die "Failed to run $cmd: $?"; + + $self->dbc->disconnect_when_inactive(0); + + my $scratch_dirs = join ' ', @pph_dirs; + + system("tar czvf $tarball $scratch_dirs > /dev/null") == 0 + or die "tar command failed: $?"; + + my $err; + remove_tree(@pph_dirs, {error => \$err}); + die "remove_tree failed: ".Dumper($err) if $err && @$err; + + my $exit_code = system("gzip -f $output_file"); + + if ($exit_code == 0) { + $output_file .= '.gz'; + } + else { + warn "Failed to gzip $output_file: $?"; + } + + $self->param('feature_file', $output_file); + + if (-s $error_file) { + warn "run_pph.pl STDERR output in $error_file\n"; + } + else { + push @to_delete, $error_file; + } + + # delete unnecesary files + + unlink @to_delete; +} + +sub write_output { + my $self = shift; + + if (my $feature_file = $self->param('feature_file')) { + $self->dataflow_output_id( [{ + translation_md5 => $self->param('translation_md5'), + feature_file => $feature_file, + }], 2); + } +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/RunSift.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/RunSift.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,197 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::RunSift; + +use strict; + +use File::Path qw(make_path remove_tree); +use Data::Dumper; + +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw(@ALL_AAS); +use Bio::EnsEMBL::Variation::Utils::ComparaUtils qw(dump_alignment_for_sift); + +use base ('Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::BaseProteinFunction'); + +my $MEDIAN_CUTOFF = 2.75; # as per README + +sub run { + my $self = shift; + + my $translation_md5 = $self->required_param('translation_md5'); + my $sift_dir = $self->required_param('sift_dir'); + my $working_dir = $self->required_param('sift_working'); + my $ncbi_dir = $self->required_param('ncbi_dir'); + my $blastdb = $self->required_param('blastdb'); + + my $dir = substr($translation_md5, 0, 2); + my $output_dir = "$working_dir/$dir/$translation_md5"; + + my $tarball = 'scratch.tgz'; + + unless (-d $output_dir) { + my $err; + make_path($output_dir, {error => \$err}); + die "make_path failed: ".Dumper($err) if $err && @$err; + } + + chdir $output_dir or die "Failed to chdir to $output_dir"; + + my $fasta_file = "protein.fa"; + my $aln_file = "protein.alignedfasta"; + my $res_file = "protein.SIFTprediction"; + my $subs_file = "subs.txt"; + + if (-e "$output_dir/$tarball") { + system("tar zxvf $tarball > /dev/null") == 0 + or die "Failed to untar $output_dir/$tarball: $!"; + } + + # set necessary environment variables for sift + + $ENV{NCBI} = $ncbi_dir; + $ENV{BLIMPS_DIR} = $sift_dir.'/blimps'; + $ENV{SIFT_DIR} = $sift_dir; + $ENV{tmpdir} = $output_dir; + + # fetch our protein + + my $peptide = $self->get_protein_sequence($translation_md5); + + my $alignment_ok = 1; + + unless (-e $aln_file) { + + # we need to get the multiple alignment + + if ($self->param('use_compara')) { + + my $stable_id = $self->get_stable_id_for_md5($translation_md5); + + eval { + dump_alignment_for_sift($stable_id, $aln_file); + }; + + if ($@) { + warn "Failed to get a compara alignment for $stable_id: $@"; + $alignment_ok = 0; + } + } + else { + + # do the alignment ourselves + + # first create a fasta file for the protein sequence + + open (FASTA_FILE, ">$fasta_file"); + + my $pep_copy = $peptide; + $pep_copy =~ s/(.{80})/$1\n/g; + chomp $pep_copy; + print FASTA_FILE ">$translation_md5\n$pep_copy\n"; + + close FASTA_FILE; + + # and run the alignment program + + $self->dbc->disconnect_when_inactive(1); + + my $cmd = "$sift_dir/bin/ensembl_seqs_chosen_via_median_info.csh $fasta_file $blastdb $MEDIAN_CUTOFF"; + + #die `env`."\n".$cmd; + + $self->dbc->disconnect_when_inactive(0); + + my $exit_code = system($cmd); + + if ($exit_code == 0) { + $alignment_ok = 1; + } + else { + # the alignment failed for some reason, what to do? + die "Alignment for $translation_md5 failed - cmd: $cmd"; + $alignment_ok = 0; + } + } + } + + if ($alignment_ok) { + + # work out the sift score for each possible amino acid substitution + + unless (-e $subs_file) { + + # create our substitution file + + my $pos = 0; + + open SUBS, ">$subs_file" or die "Failed to open $subs_file: $!"; + + my @aas = split //, $peptide; + + for my $ref (@aas) { + $pos++; + for my $alt (@ALL_AAS) { + unless ($ref eq $alt) { + print SUBS $ref.$pos.$alt."\n"; + } + } + } + + close SUBS; + } + + # and run sift on it + + $self->dbc->disconnect_when_inactive(1); + + my $cmd = "$sift_dir/bin/info_on_seqs $aln_file $subs_file $res_file"; + + system($cmd) == 0 or die "Failed to run $cmd: $?"; + + $self->dbc->disconnect_when_inactive(0); + + # parse and store the results + + open (RESULTS, "<$res_file") or die "Failed to open $res_file: $!"; + + # parse the results file + + my $pred_matrix = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -analysis => 'sift', + -peptide_length => length($peptide), + -translation_md5 => $translation_md5, + ); + + while () { + + chomp; + + next if /WARNING/; + next if /NOT SCORED/; + + my ($subst, $prediction, $score, $median_cons, $num_seqs, $blocks) = split; + + my ($ref_aa, $pos, $alt_aa) = $subst =~ /([A-Z])(\d+)([A-Z])/; + + next unless $ref_aa && $alt_aa && defined $pos; + + $pred_matrix->add_prediction( + $pos, + $alt_aa, + $prediction, + $score, + ); + } + + my $var_dba = $self->get_species_adaptor('variation'); + + my $pfpma = $var_dba->get_ProteinFunctionPredictionMatrixAdaptor + or die "Failed to get matrix adaptor"; + + $pfpma->store($pred_matrix); + } + + # tar up the files + + system("tar --remove-files --exclude *.tgz -czvf $tarball * > /dev/null") == 0 + or die "Failed to create $output_dir/$tarball: $!"; +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/RunWeka.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/RunWeka.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,131 @@ +package Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::RunWeka; + +use strict; + +use File::Copy; +use File::Path qw(make_path remove_tree); + +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix; + +use base ('Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::BaseProteinFunction'); + +sub run { + my $self = shift; + + my $translation_md5 = $self->required_param('translation_md5'); + + my $feature_file = $self->required_param('feature_file'); + + my $pph_dir = $self->required_param('pph_dir'); + + my $humdiv_model = $self->required_param('humdiv_model'); + my $humvar_model = $self->required_param('humvar_model'); + + # copy stuff to /tmp to avoid lustre slowness + + my ($output_dir, $feature_filename) = $feature_file =~ /(.+)\/([^\/]+)$/; + + my $tmp_dir = "/tmp/weka_${translation_md5}"; + + make_path($tmp_dir); + + copy($feature_file, $tmp_dir); + + my $input_file = "${tmp_dir}/${feature_filename}"; + + # unzip the file if necessary + + if ($input_file =~ /\.gz$/ && -e $input_file) { + system("gunzip -f $input_file") == 0 or die "Failed to gunzip input file: $input_file"; + } + + $input_file =~ s/.gz$//; + + chdir $output_dir or die "Failed to chdir to $output_dir"; + + my @to_delete; + + my $var_dba = $self->get_species_adaptor('variation'); + + my $pfpma = $var_dba->get_ProteinFunctionPredictionMatrixAdaptor + or die "Failed to get matrix adaptor"; + + for my $model ($humdiv_model, $humvar_model) { + + next unless $model; + + my $model_name = $model eq $humdiv_model ? 'humdiv' : 'humvar'; + + my $output_file = "${tmp_dir}/${model_name}.txt"; + + my $error_file = "${model_name}.err"; + + my $cmd = "$pph_dir/bin/run_weka.pl -l $model $input_file 1> $output_file 2> $error_file"; + + system($cmd) == 0 or die "Failed to run $cmd: $?"; + + if (-s $error_file) { + warn "run_weka.pl STDERR output in $error_file\n"; + } + else { + push @to_delete, $error_file; + } + + open (RESULT, "<$output_file") or die "Failed to open output file: $!"; + + my @fields; + + my $peptide = $self->get_transcript_file_adaptor->get_translation_seq($translation_md5); + + my $pred_matrix = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -analysis => 'polyphen', + -sub_analysis => $model_name, + -peptide_length => length($peptide), + -translation_md5 => $translation_md5, + ); + + while () { + if (/^#/) { + s/#//g; + @fields = split /\s+/; + next; + } + + die "No header line in result file $output_file?" unless @fields; + + my @values = split /\t/; + + # trim whitespace + + map { $_ =~ s/^\s+//; $_ =~ s/\s+$// } @values; + + # parse the results into a hash + + my %results = map { $fields[$_] => $values[$_] } (0 .. @fields-1); + + my $alt_aa = $results{o_aa2}; + my $prediction = $results{prediction}; + my $prob = $results{pph2_prob}; + my $position = $results{o_pos}; + + next unless $position && $alt_aa; + + $pred_matrix->add_prediction( + $position, + $alt_aa, + $prediction, + $prob, + ); + } + + # save the predictions to the database + + $pfpma->store($pred_matrix); + } + + remove_tree($tmp_dir); + + unlink @to_delete; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/RebuildIndexes.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/RebuildIndexes.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,25 @@ +package Bio::EnsEMBL::Variation::Pipeline::RebuildIndexes; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +sub run { + + my $self = shift; + + my $tables = $self->param('tables'); + + my $var_dba = $self->get_species_adaptor('variation'); + + my $dbc = $var_dba->dbc; + + for my $table (@$tables) { + $dbc->do("ALTER TABLE $table ENABLE KEYS") + or warn "Failed to enable keys on $table"; + } +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/SetVariationClass.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/SetVariationClass.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,169 @@ +package Bio::EnsEMBL::Variation::Pipeline::SetVariationClass; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +use Bio::EnsEMBL::Variation::Utils::Sequence qw(SO_variation_class); + + +sub run { + + my $self = shift; + + my $var_id_start = $self->required_param('variation_id_start'); + + my $var_id_stop = $self->required_param('variation_id_stop'); + + my $temp_var_table = $self->param('temp_var_table'); + + my $temp_var_feat_table = $self->param('temp_var_feat_table'); + + my $var_dba = $self->get_species_adaptor('variation'); + + my $aa = $var_dba->get_AttributeAdaptor; + + my $dbc = $var_dba->dbc; + + # fetch the failed_descriptions to avoid a join + + my $fds_sth = $dbc->prepare(qq{ + SELECT failed_description_id, description + FROM failed_description + }); + + $fds_sth->execute; + + my %fds; + + while (my ($fd_id, $desc) = $fds_sth->fetchrow_array) { + $fds{$fd_id} = $desc; + } + + $fds_sth->finish(); + + my $all_sth = $dbc->prepare(qq{ + SELECT v.variation_id, vf.variation_feature_id, vf.allele_string, fv.failed_description_id + FROM (variation v LEFT JOIN variation_feature vf ON v.variation_id = vf.variation_id) + LEFT JOIN failed_variation fv ON v.variation_id = fv.variation_id, source s + WHERE v.variation_id >= ? + AND v.variation_id <= ? + AND v.source_id = s.source_id + AND s.name != 'HGMD-PUBLIC' + }); + + my $vf_insert_sth; + my $v_insert_sth; + + if (defined $temp_var_feat_table) { + $vf_insert_sth = $dbc->prepare(qq{ + INSERT IGNORE INTO $temp_var_feat_table (class_attrib_id, variation_feature_id) + VALUES (?,?) + }); + } + else { + $vf_insert_sth = $dbc->prepare(qq{ + UPDATE variation_feature SET class_attrib_id = ? WHERE variation_feature_id = ? + }); + } + + if (defined $temp_var_table) { + $v_insert_sth = $dbc->prepare(qq{ + INSERT IGNORE INTO $temp_var_table (class_attrib_id, variation_id) + VALUES (?,?) + }); + } + else { + $v_insert_sth = $dbc->prepare(qq{ + UPDATE variation SET class_attrib_id = ? WHERE variation_id = ? + }); + } + + $all_sth->execute($var_id_start, $var_id_stop); + + my @unmapped_v_ids; + + while (my ($v_id, $vf_id, $allele_string, $fd_id) = $all_sth->fetchrow_array) { + + unless ($vf_id) { + # this variation doesn't have a corresponding variation_feature + push @unmapped_v_ids, $v_id; + next; + } + + my $ref_correct = 1; + + # check to see if this variation_feature is known not to match the reference allele, + # as this tells us if we can call insertions or deletions, or have to resort to indel + + if (defined $fd_id) { + my $fail_reason = $fds{$fd_id}; + + if ($fail_reason eq 'None of the variant alleles match the reference allele') { + $ref_correct = 0; + } + } + + my $so_term = SO_variation_class($allele_string, $ref_correct); + + my $attrib_id = $aa->attrib_id_for_type_value('SO_term', $so_term); + + die "No attrib_id for $so_term" unless defined $attrib_id; + + $vf_insert_sth->execute($attrib_id, $vf_id); + + $v_insert_sth->execute($attrib_id, $v_id); + } + + $all_sth->finish(); + + # now we need to fetch the alleles for any variations that are not mapped + # and work out their class + + if (@unmapped_v_ids) { + + my $id_str = join ',', @unmapped_v_ids; + + my $unmapped_sth = $dbc->prepare(qq{ + SELECT a.variation_id, ac.allele + FROM allele a, allele_code ac + WHERE a.variation_id IN ($id_str) + AND a.allele_code_id = ac.allele_code_id + GROUP BY ac.allele + }); + + $unmapped_sth->execute or die "Failed to fetch unmapped variation alleles for variation ids: $id_str"; + + my $unmapped_alleles; + + while (my ($v_id, $allele) = $unmapped_sth->fetchrow_array) { + push @{ $unmapped_alleles->{$v_id} ||= [] }, $allele; + } + + $unmapped_sth->finish(); + + for my $v_id (keys %$unmapped_alleles) { + + my $allele_string = join '/', @{ $unmapped_alleles->{$v_id} }; + + # we don't know what the reference is here + + my $ref_correct = 0; + + my $so_term = SO_variation_class($allele_string, $ref_correct); + + my $attrib_id = $aa->attrib_id_for_type_value('SO_term', $so_term); + + die "No attrib_id for $so_term" unless defined $attrib_id; + + $v_insert_sth->execute($attrib_id, $v_id); + } + } + + $vf_insert_sth->finish(); + $v_insert_sth->finish(); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/TranscriptEffect.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/TranscriptEffect.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,96 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::TranscriptEffect; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::TranscriptVariation; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT); + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +my $DEBUG = 0; + +sub run { + my $self = shift; + + my $transcript_id = $self->required_param('transcript_stable_id'); + + my $disambiguate_sn_alleles = + $self->param('disambiguate_single_nucleotide_alleles'); + + my $variations_to_include; + + if (my $vars = $self->param('variations_to_include')) { + # turn the list of variation names into a hash to speed up checking + $variations_to_include = { map { $_ => 1 } @$vars }; + } + + my $core_dba = $self->get_species_adaptor('core'); + my $var_dba = $self->get_species_adaptor('variation'); + + my $ta = $core_dba->get_TranscriptAdaptor; + my $sa = $core_dba->get_SliceAdaptor; + + my $tva = $var_dba->get_TranscriptVariationAdaptor; + + my $transcript = $ta->fetch_by_stable_id($transcript_id) + or die "failed to fetch transcript for stable id: $transcript_id"; + + # we need to include failed variations + + $tva->db->include_failed_variations(1); + + my $slice = $sa->fetch_by_transcript_stable_id( + $transcript->stable_id, + MAX_DISTANCE_FROM_TRANSCRIPT + ) or die "failed to get slice around transcript: ".$transcript->stable_id; + + for my $vf ( @{ $slice->get_all_VariationFeatures }, + @{ $slice->get_all_somatic_VariationFeatures } ) { + + if (defined $variations_to_include) { + next unless $variations_to_include->{$vf->variation_name}; + } + + my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( + -transcript => $transcript, + -variation_feature => $vf, + -adaptor => $tva, + -disambiguate_single_nucleotide_alleles => $disambiguate_sn_alleles, + ); + + # if the variation has no effect on the transcript $tv will be undef + + if ($tv && ( scalar(@{ $tv->consequence_type }) > 0) ) { + $tva->store($tv); + } + } + + return; +} + +sub write_output { + my $self = shift; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/TranscriptFileAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/TranscriptFileAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,102 @@ +package Bio::EnsEMBL::Variation::Pipeline::TranscriptFileAdaptor; + +use strict; + +use Digest::MD5 qw(md5_hex); + +sub new { + my $class = shift; + + my %args = @_; + + my $self = bless {}, $class; + + if ($args{fasta_file}) { + $self->{fasta_file} = $args{fasta_file}; + } + + if ($args{transcripts}) { + $self->_dump_translations($args{transcripts}); + } + + return $self; +} + +sub get_translation_seq { + my ($self, $translation_md5) = @_; + + my $fasta = $self->get_translation_fasta($translation_md5); + + $fasta =~ s/>.*\n//m; + $fasta =~ s/\s//mg; + + return $fasta; +} + +sub get_translation_fasta { + my ($self, $translation_md5) = @_; + + my $file = $self->{fasta_file}; + + my $fasta = `samtools faidx $file $translation_md5`; + + return $fasta; +} + +sub get_all_translation_md5s { + my $self = shift; + + my $fasta = $self->{fasta_file}; + + my @ids = map {/>(.+)\n/; $1} `grep '>' $fasta`; + + return \@ids; +} + +sub _dump_translations { + + my ($self, $transcripts) = @_; + + # dump the translations out to the FASTA file + + my $fasta = $self->{fasta_file}; + + open my $FASTA, ">$fasta" or die "Failed to open $fasta for writing"; + + # get rid of any existing index file + + if (-e "$fasta.fai") { + unlink "$fasta.fai" or die "Failed to delete fasta index file"; + } + + my %seen_md5; + + for my $transcript (@$transcripts) { + + my $tl = $transcript->translation; + + next unless $tl; + + my $protein = $tl->seq; + + my $md5 = md5_hex($protein); + + next if $seen_md5{$md5}++; + + $protein =~ s/(.{80})/$1\n/g; + + # get rid of any trailing newline + chomp $protein; + + print $FASTA ">$md5\n$protein\n"; + } + + close $FASTA; + + # index the file + + `samtools faidx $fasta`; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/UpdateVariationFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/UpdateVariationFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,58 @@ +package Bio::EnsEMBL::Variation::Pipeline::UpdateVariationFeature; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +sub run { + + my $self = shift; + + my $var_dba = $self->get_species_adaptor('variation'); + + my $dbc = $var_dba->dbc; + + # first set the default consequence type + + $dbc->do(qq{ + UPDATE variation_feature + SET consequence_types = 'intergenic_variant' + }) or die "Failed to reset consequence_types on variation_feature"; + + # create a temp table (dropping it if it exists) + + my $temp_table = 'variation_feature_with_tv'; + + $dbc->do(qq{DROP TABLE IF EXISTS $temp_table}) + or die "Failed to drop pre-existing temp table"; + + $dbc->do(qq{CREATE TABLE $temp_table LIKE variation_feature}) + or die "Failed to create temp table"; + + # concatenate the consequence types from transcript_variation + + $dbc->do(qq{ + INSERT INTO $temp_table (variation_feature_id, consequence_types) + SELECT variation_feature_id, GROUP_CONCAT(DISTINCT(consequence_types)) + FROM transcript_variation + GROUP BY variation_feature_id + }) or die "Populating temp table failed"; + + # update variation_feature + + $dbc->do(qq{ + UPDATE variation_feature vf, $temp_table tvf + SET vf.consequence_types = tvf.consequence_types + WHERE vf.variation_feature_id = tvf.variation_feature_id + }) or die "Failed to update vf table"; + + # and get rid of our temp table + + $dbc->do(qq{DROP TABLE $temp_table}) + or die "Failed to drop temp table"; + +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/FinishVariantQC.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/FinishVariantQC.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,204 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariantQC::FinishVariantQC; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +my $DEBUG = 0; + +## This runs some basic checks on new tables to produce a report +## swap tables names if they look correct [TO DO] + +sub run { + + my $self = shift; + + my $dir = $self->required_param('pipeline_dir'); + + open my $report, ">", "$dir/QC_report.txt" || die "Failed to open QC_report.txt : $!\n"; + print $report "\n\tChecking post-QC tables \n\n"; + + my $var_dba = $self->get_species_adaptor('variation'); + + + ## Have all rows been processed + my ($allele_number, $all_ok) = check_all_processed($var_dba, $report); + + + ## What are the failure rates for alleles and variants + get_failure_rates($var_dba, $report, $allele_number ); + + + ## what are failure reasons for alleles and variants + check_failure_rates($var_dba, $report); + + + ## rename tables + if( $all_ok ==2 ){ + print $report "\n\tOK to rename post-QC tables \n\n"; + # rename_tables($var_dba); + } + +} + + + + ## Have all rows been processed + sub check_all_processed{ + + my $var_dba = shift; + my $report = shift; + + my $old_varfeat_ext_sth = $var_dba->dbc->prepare(qq[ select count(*) from variation_feature]); + my $new_varfeat_ext_sth = $var_dba->dbc->prepare(qq[ select count(*) from variation_feature_working]); + my $old_allele_ext_sth = $var_dba->dbc->prepare(qq[ select count(*) from allele]); + my $new_allele_ext_sth = $var_dba->dbc->prepare(qq[ select count(*) from allele_working]); + + + $old_varfeat_ext_sth->execute() || die "Failed to extract old_varfeat count \n"; + $new_varfeat_ext_sth->execute() || die "Failed to extract new_varfeat count\n"; + $old_allele_ext_sth->execute() || die "Failed to extract old_allele count\n"; + $new_allele_ext_sth->execute() || die "Failed to extract new_allele count\n"; + + + my $old_varfeat = $old_varfeat_ext_sth->fetchall_arrayref(); + my $new_varfeat = $new_varfeat_ext_sth->fetchall_arrayref(); + my $old_allele = $old_allele_ext_sth->fetchall_arrayref(); + my $new_allele = $new_allele_ext_sth->fetchall_arrayref(); + + my $all_ok = 0; ## can we proceed to rename tables? + + if($old_varfeat->[0]->[0] == $new_varfeat->[0]->[0]){ + print $report "Variation_Feature: Correct number of entries seen in variation_feature: $new_varfeat->[0]->[0]\n\n"; + $all_ok++; + } + else{ + print $report "Variation_Feature: ERROR old table has :$old_varfeat rows, new table has $new_varfeat->[0]->[0]\n\n"; + } + + if($old_allele->[0]->[0] == $new_allele->[0]->[0]){ + print $report "Allele: Correct number of entries seen : $new_allele->[0]->[0]\n"; + $all_ok++; + } + else{ + print $report "Allele: ERROR old table has : $old_allele->[0]->[0] rows, new table has: $new_allele->[0]->[0]\n"; + } + + return ( $new_allele->[0]->[0], $all_ok ); + +} + + + sub get_failure_rates{ + + my ($var_dba, $report, $allele_number) = @_; + + my $variation_ext_sth = $var_dba->dbc->prepare(qq[ select count(*) from variation]); + my $varfail_ext_sth = $var_dba->dbc->prepare(qq[ select count(distinct variation_id) from failed_variation_working]); + my $allelefail_ext_sth = $var_dba->dbc->prepare(qq[ select count(*) from failed_allele_working]); + + + $variation_ext_sth->execute() || die "Failed to extract variation count\n"; + $varfail_ext_sth->execute() || die "Failed to extract varfail count\n"; + $allelefail_ext_sth->execute() || die "Failed to extract allelefail count\n"; + + + my $variation = $variation_ext_sth->fetchall_arrayref(); + my $varfail = $varfail_ext_sth->fetchall_arrayref(); + my $allelefail = $allelefail_ext_sth->fetchall_arrayref(); + + + my $var_fail_rate = substr((100 * $varfail->[0]->[0] / $variation->[0]->[0] ), 0,5 ); + + print $report "\nVariation failure rate: $var_fail_rate % [$varfail->[0]->[0] / $variation->[0]->[0] ]\n"; + + + my $allele_fail_rate = substr((100 * $allelefail->[0]->[0] / $allele_number ), 0,5 ); + + print $report "\nAllele failure rate: $allele_fail_rate % [$allelefail->[0]->[0] /$allele_number ] \n\n"; + + +} + +## Breakdown of fails by reason +sub check_failure_rates{ + + my ($var_dba, $report) = @_; + + my $vardesc_ext_sth = $var_dba->dbc->prepare(qq[ select fd.description,count(*) + from failed_description fd, failed_variation_working fv + where fv.failed_description_id = fd.failed_description_id + group by fd.description]); + + my $alleledesc_ext_sth = $var_dba->dbc->prepare(qq[ select fd.description,count(*) + from failed_description fd, failed_allele_working fa + where fa.failed_description_id = fd.failed_description_id + group by fd.description]); + + + $vardesc_ext_sth->execute() || die "Failed to extract variation fail reasons\n"; + $alleledesc_ext_sth->execute() || die "Failed to extract allele fail reasons\n"; + + + my $vardesc = $vardesc_ext_sth->fetchall_arrayref(); + my $alleledesc = $alleledesc_ext_sth->fetchall_arrayref(); + + + + print $report "\nVariation Failure reasons:\n"; + foreach my $l (@{$vardesc}){ + print $report "\t$l->[1]\t$l->[0]\n"; + } + + print $report "\nAllele Failure reasons:\n"; + foreach my $l (@{$alleledesc}){ + print $report "\t$l->[1]\t$l->[0]\n"; + } +} + + +sub rename_tables{ + + my ($var_dba) = shift; + + ## Keep orignal tables in short term + #$var_dba->dbc->do(qq[ rename table allele to allele_before_pp ]); + #$var_dba->dbc->do(qq[ rename table variation_feature to variation_feature_before_pp ]); + #$var_dba->dbc->do(qq[ rename table failed_allele to failed_allele_before_pp ]); ## Not needed post dev phase + #$var_dba->dbc->do(qq[ rename table failed_variation to failed_variation_before_pp ]); ## Not needed post dev phase + + + ## Rename working tables + #$var_dba->dbc->do(qq[ rename table allele_working to allele ]); + #$var_dba->dbc->do(qq[ rename table variation_feature_working to variation_feature ]); + #$var_dba->dbc->do(qq[ rename table failed_allele_working to failed_allele ]); ## Not needed post dev phase + #$var_dba->dbc->do(qq[ rename table failed_variation_working to failed_variation ]); ## Not needed post dev phase + + ## does this need binning? + #$var_dba->dbc->do(qq[ update variation set flipped = 0 ]); + #$var_dba->dbc->do(qq[ update variation set flipped = 1 where variation_id in (select variation_id from variation_to_reverse_working) ]); + +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/InitVariantQC.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/InitVariantQC.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,212 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + + + +=head1 NAME + +Bio::EnsEMBL::Variation::Pipeline::VariantQC::InitVariantQC + +=head1 DESCRIPTION + +Initiation module for variant QC eHive process + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariantQC::InitVariantQC; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + +my $DEBUG = 0; + +sub fetch_input { + + my $self = shift; + + + my $core_dba = $self->get_species_adaptor('core'); + my $var_dba = $self->get_species_adaptor('variation'); + + my $start_at_variation_id = $self->required_param('start_at_variation_id'); + + ### new variation feature tables to write to + unless($self->required_param('create_working_tables') == 0){ + create_working_tables( $var_dba); + } + if($self->required_param('create_map_table') ==1){ + ### create temp table to hold number of mapping to reference genome + create_map_weight_table($core_dba,$var_dba); + } + + + ### get max variation_id to set up batches for processing + my $data_ext_sth = $var_dba->dbc->prepare(qq[ SELECT MAX(variation_id) + FROM variation ]); + + $data_ext_sth->execute() || die "Failed to extract variation_id\n"; + my $max_id = $data_ext_sth->fetchall_arrayref(); + + + ### bin detailed qc + + my @qc_start_id; + + my $start_from = int( $start_at_variation_id / $self->param('qc_batch_size')); + my $qc_var_jobs = int( $max_id->[0]->[0] / $self->param('qc_batch_size') ); + + $self->warning( 'Defining jobs, $start_from - $qc_var_jobs batch size: ' . $self->param('qc_batch_size') ); + + for my $n ( $start_from .. $qc_var_jobs){ + my $start = $n * $self->param('qc_batch_size'); + push @qc_start_id, {start_id => $start}; + } + $self->param('qc_start_ids', \@qc_start_id); + + + + ## bin unmapped var check in larger chunks + + $self->warning( 'Running $jobs jobs, batch size: ' . $self->param('unmapped_batch_size') ); + my @unmapped_start_id; + + my $start_unmapped_from = int( $start_at_variation_id / $self->param('unmapped_batch_size') ); + my $unmapped_var_jobs = int( $max_id->[0]->[0] / $self->param('unmapped_batch_size') ); + + for my $n ( $start_unmapped_from .. $unmapped_var_jobs){ + + my $start = $n * $self->param('unmapped_batch_size'); + push @unmapped_start_id, {start_id => $start}; + } + $self->param('unmapped_start_ids', \@unmapped_start_id); + + + +} + +sub create_working_tables{ + + my $var_dba = shift; + + ## table to hold variation feature info after fliping & ref allele aissignment + $var_dba->dbc->do(qq{CREATE TABLE variation_feature_working like variation_feature}); + + ## table to hold allele info after fliping + $var_dba->dbc->do(qq{CREATE TABLE allele_working like allele}); + + ## table to hold failed variations ## TEMP FOR DEBUG + $var_dba->dbc->do(qq{CREATE TABLE failed_variation_working like failed_variation}); + + ## table to hold failed alleles ## TEMP FOR DEBUG + $var_dba->dbc->do(qq{CREATE TABLE failed_allele_working like failed_allele}); + + ## table to hold list of flipped variation_ids ## TEMP FOR DEBUG + $var_dba->dbc->do(qq{CREATE TABLE variation_to_reverse_working (variation_id int(10) unsigned not null) }); + + + +} + +## create a look-up table for the number of times a variant maps to the reference +sub create_map_weight_table{ + + my $core_dba = shift; + my $var_dba = shift; + + #### is it better to use not in () or temp columns?? + my $ref_ext_sth = $core_dba->dbc->prepare(qq [select sra.seq_region_id + from seq_region_attrib sra, attrib_type at + where sra.attrib_type_id=at.attrib_type_id + and at.name="Non Reference"]); + + $ref_ext_sth->execute()|| die "Failed to extract ref/non ref status for seq_regions\n"; + my $non_ref = $ref_ext_sth->fetchall_arrayref(); + + $var_dba->dbc->do(qq{alter table seq_region add column is_reference Tinyint(1) default 1}); + my $sr_status_sth = $var_dba->dbc->prepare(qq[update seq_region set is_reference = 0 where seq_region_id =?]); + + foreach my $srid (@{$non_ref}){ + $sr_status_sth->execute($srid->[0]) || die "ERROR updating seq regions\n"; + } + + + ### Is this slow? - 13mins for human; could chunk it.. + #create a temporary table to store the map_weight, that will be deleted by the last process + $var_dba->dbc->do(qq[ CREATE TABLE tmp_map_weight_working + SELECT variation_id, count(*) as count + FROM variation_feature,seq_region + WHERE variation_feature.seq_region_id = seq_region.seq_region_id + AND seq_region.is_reference =1 + GROUP BY variation_id] + ); + + $var_dba->dbc->do(qq{ALTER TABLE tmp_map_weight_working + ADD UNIQUE INDEX variation_idx(variation_id)}); + + #add additional variation_ids only appear in haplotype chromosomes #removed IGNORE - test!! + $var_dba->dbc->do(qq{INSERT INTO tmp_map_weight_working + SELECT variation_id, count(*) as count + FROM variation_feature,seq_region + WHERE variation_feature.seq_region_id = seq_region.seq_region_id + AND seq_region.is_reference =0 + GROUP BY variation_id}); + + ## clean up seq_region table + $var_dba->dbc->do(qq{alter table seq_region drop column is_reference}); + + ### test above & not in (tmp_map_weight) approach + +} + +sub write_output { + + my $self = shift; + + ## No map fails - larger bins used as check is very quick + + unless ($self->param('run_unmapped_var') == 0){ + my $unmapped_start_ids = $self->param('unmapped_start_ids'); + + $self->warning(scalar @{$unmapped_start_ids} .' unmapped_variant_qc jobs to do'); + + $self->dataflow_output_id($unmapped_start_ids, 3); + } + + ## Variant QC - bin start positions supplied + + unless ($self->param('run_variant_qc') == 0){ + my $qc_start_ids = $self->param('qc_start_ids'); + + $self->warning(scalar @{$qc_start_ids} .' variant_qc jobs to do'); + + $self->dataflow_output_id($qc_start_ids, 2); + } + + + ## run basic checks when everything is updated + + $self->dataflow_output_id($self->param('finish_variation_qc'), 4); + + + return; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/UnmappedVariant.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/UnmappedVariant.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,81 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariantQC::UnmappedVariant; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + + +=head1 NAME + + Bio::EnsEMBL::Variation::Pipeline::VariantQC::UnmappedVariant + +=head1 DESCRIPTION + +This module extracts a list of ids for variations without a mapped location on the currents genomic sequence and enters them in the failed variation table. +It is run as a seperate independant process alongside the main variant QC + +=cut + +sub run { + + my $self = shift; + + my $first = $self->required_param('start_id'); + my $last = $first + $self->required_param('batch_size') -1; + if($first ==1){$last--;} + + my $var_dba = $self->get_species_adaptor('variation'); + + my $fail_ins_sth = $var_dba->dbc->prepare(qq[insert into failed_variation_working + (variation_id, failed_description_id) + values (?,?) + ]); + + + + my $mapfail_extr_sth = $var_dba->dbc->prepare(qq[select variation.variation_id + from variation + where variation.variation_id between $first and $last + and variation.variation_id not in( select variation_feature.variation_id from variation_feature) + ]); + + + ## export current data + $mapfail_extr_sth->execute()||die; + + my $fails = $mapfail_extr_sth->fetchall_arrayref(); + + return unless(defined $fails >[0]->[0]); + + + ## write to fail table + foreach my $row( @{$fails}){ + + $fail_ins_sth->execute($row->[0], 5)|| die "ERROR inserting variation fails info\n"; + } +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/UpdatePopulationGenotype.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/UpdatePopulationGenotype.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,81 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariantQC::UpdatePopulationGenotype; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + + +=head1 NAME + + Bio::EnsEMBL::Variation::Pipeline::VariantQC::UpdatePopulationGenotype + +=head1 DESCRIPTION + +This module migrates the population genotype table from old to new schema version +It is run as a seperate independant process after the main variant QC + +=cut + +sub run { + + my $self = shift; + + + + my $var_dba = $self->get_species_adaptor('variation'); + + + ## populate temp table with genotypes from individual_genotype_multiple_bp + $var_dba->dbc->do(qq[INSERT IGNORE INTO genotype_code_tmp(allele_1, allele_2) + SELECT distinct allele_1, allele_2 FROM individual_genotype_multiple_bp]); + + ## populate temp table with genotypes from population_genotype + $var_dba->dbc->do(qq[INSERT IGNORE INTO genotype_code_tmp(allele_1, allele_2) + SELECT distinct allele_1, allele_2 FROM population_genotype]); + + ## populate genotype code with both alleles + $var_dba->dbc->do(qq[INSERT INTO genotype_code + SELECT t.genotype_code_id, ac.allele_code_id, 1 + FROM genotype_code_tmp t, allele_code ac + WHERE t.allele_1 = ac.allele ]); + + $var_dba->dbc->do(qq[INSERT INTO genotype_code + SELECT t.genotype_code_id, ac.allele_code_id, 2 + FROM genotype_code_tmp t, allele_code ac + WHERE t.allele_2 = ac.allele ]); + + $var_dba->dbc->do(qq[ALTER TABLE genotype_code ORDER BY genotype_code_id, haplotype_id ASC]); + + + $var_dba->dbc->do(qq[insert into population_genotype_working + select pg.population_genotype_id, pg.variation_id, pg.subsnp_id, gc.genotype_code_id, + pg.frequency, pg.sample_id, pg.count + from population_genotype pg, genotype_code_tmp gc + where pg.allele_1 = gc.allele_1 and pg.allele_2 = gc.allele_2 ]); + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/VariantQC.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/VariantQC.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1065 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + + +=head1 NAME + +Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC + +=head1 DESCRIPTION + +Runs basic quality control on variant and allele data imported from extrenal sources + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC; + + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp expand); +use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess); + + +our $DEBUG = 1; + +our %AMBIG_REGEXP_HASH = ( + 'M' => '[AC]', + 'R' => '[AG]', + 'W' => '[AT]', + 'S' => '[CG]', + 'Y' => '[CT]', + 'K' => '[GT]', + 'V' => '[ACG]', + 'H' => '[ACT]', + 'D' => '[AGT]', + 'B' => '[CGT]', + 'X' => '[ACGT]', + 'N' => '[ACGT]' + ); + + + +=head2 run + + Run checks on variant data; submit failed_variation and strand-corrected variation feature database updates then launch further allele checking + +=cut +sub run { + + my $self = shift; + + ### start and end variation_id supplied + my $first = $self->required_param('start_id'); + my $last = $first + $self->required_param('batch_size') -1; + if($first ==1){$last--;} + + if( $DEBUG == 1){$self->warning("Starting to run variantQC with $first & $last " );} + + + #ÊGet a string containing the possible ambiguity nucleotides + my $AMBIGUITIES = join("",keys(%AMBIG_REGEXP_HASH)); + + # Add the code for uracil in case some allele should have that + %AMBIG_REGEXP_HASH = (%AMBIG_REGEXP_HASH,('U' => 'T')); + + + + my %fail_variant; # hash of arrays of failed variation_ids + my %fail_allele; # hash of arrays of arrays failed variation_ids & alleles + my %flip; # hash of variation ids which have their strand changed in this process + + + my $var_dba = $self->get_species_adaptor('variation'); + + ## slice needed for ref check + my $core_dba = $self->get_species_adaptor('core'); + my $slice_ad = $core_dba->get_SliceAdaptor; + + + ## export current variation_feature data, adding allele_string if required + my $to_check; + if($self->required_param('do_allele_string') == 1){ + $to_check = export_data_adding_allele_string($var_dba, $first, $last); + } + else{ + $to_check = export_data_with_allele_string($var_dba, $first, $last); + } + + + foreach my $var (@{$to_check}){ + + + ## Type 1 - fail variant if >1 mapping seen + + if($var->{map} >1){ + push @{$fail_variant{1}}, $var->{v_id}; + } + + + ## Type 4 - novariation fails - flag variants & alleles as fails & don't run further checks + + if($var->{allele} =~ /NOVARIATION/){ + push @{$fail_variant{4}}, $var->{v_id}; + push @{$fail_allele{4}}, [$var->{v_id}, "NOVARIATION"]; ## unflipped allele failed throughout + next; + } + + if($var->{allele} =~ /HGMD_MUTATION/){ + ## only locations are available, so no further checking is possible. + next; + } + + + # expand alleles if they contain brackets and numbers before other checks + my $expanded = $var->{allele}; + expand(\$expanded); + + + + ## Type 13 - non-nucleotide chars seen - flag variants & alleles as fails & don't run further checks + + my $illegal_alleles = check_illegal_characters($expanded,$AMBIGUITIES); + if(defined $illegal_alleles->[0] ) { + push @{$fail_variant{13}}, $var->{v_id}; + + foreach my $ill_al( @{$illegal_alleles} ){ + push @{$fail_allele{13}}, [$var->{v_id}, $ill_al]; ## unflipped allele failed throughout + } + next; + } + + + ## Type 3 flag variation as fail if it has [A/T/G/C] allele string + + my $all_possible_check = check_four_bases($expanded); + if ($all_possible_check ==1){ + push @{$fail_variant{3}}, $var->{v_id} + } + + ## Type 14 resolve ambiguities before reference check - flag variants & alleles as fails + + if($expanded =~ m/[$AMBIGUITIES]/){ + $expanded = remove_ambiguous_alleles(\$expanded,$AMBIGUITIES); + push @{$fail_variant{14}}, $var->{v_id}; + + ## identify specific alleles to fail + my $ambiguous_alleles = find_ambiguous_alleles($var->{allele},$AMBIGUITIES); + foreach my $amb_al( @{$ambiguous_alleles} ){ + push @{$fail_allele{14}}, [$var->{v_id}, $amb_al]; ## unflipped allele failed throughout + } + + } + ## Further checks only run for variants with <3 map locations [Why not single] + + next if $var->{map} > 1; + + + ## flip allele string if on reverse strand and single mapping + + if( $var->{strand} eq "-1" ){ + reverse_comp(\$expanded ); ## for ref check + if( $var->{allele}=~ /\(/){ + $var->{allele} = rev_tandem($var->{allele}); + } + else{ + reverse_comp(\$var->{allele} ); ## for database storage + } + $var->{strand} = 1; + + ### store variation_id to use when flipping alleles + $flip{$var->{v_id}} = 1; + } + + + + # Extract reference sequence to run ref checks [ compliments for reverse strand multi-mappers] + + my $ref_seq = get_reference_base($var, $slice_ad) ; + + unless(defined $ref_seq){ + ## don't check further if obvious coordinate error + push @{$fail_variant{15}}, $var->{v_id}; + next; + } + + + my $match_coord_length = 0; ## is either allele of compatible length with given coordinates? + + my @alleles = split/\//, $var->{allele} ; + foreach my $al(@alleles){ + my $ch = $al; + expand(\$ch); ## run ref check against ATATAT not (AT)3 + if($ch eq $ref_seq){ + $var->{ref} = $al; + $match_coord_length = 1; + } + else{ + $var->{alt} .= "/" . $al; + ## if one of these suggests an insertion and the other a substitution, the sizes are not compatible + if(length($ref_seq ) == length($ch) && $ch ne "-" && $ref_seq ne "-"){$match_coord_length = 1;} + } + } + + unless ($match_coord_length == 1){ + push @{$fail_variant{15}}, $var->{v_id}; + } + + ## lengths ok - is actual base in agreement? + if( defined $var->{ref}){ + ## re-order the allele_string field with the reference allele first + $var->{allele} = $var->{ref} . $var->{alt}; + } + else{ + ## Type 2 - flag variants as fails if neither allele matches the reference + push @{$fail_variant{2}}, $var->{v_id} ; + } + } + + + ## Database updates + + ## write to new variation featues + insert_variation_features($var_dba, $to_check); + + ## write to new failed_variation table + write_variant_fails($var_dba, \%fail_variant); + + ## update variation table + write_variant_flips($var_dba, \%flip); + + + ## allele-specific checks - run on updated variation_feature table + run_allele_checks($self, \%flip, \%fail_allele); + +} + + + +=head2 run_allele_checks + + Run checks on allele table; submit submit failed_allele and strand-corrected allele updates + +=cut +sub run_allele_checks { + + my $self = shift; + my $flip = shift; + my $fail = shift; + + ### start and end variation_id supplied + my $first = $self->required_param('start_id'); + my $last = $first + $self->required_param('batch_size') - 1; + if($first ==1){$last--;} + + my %fail_all; + + my $var_dba = $self->get_species_adaptor('variation'); + + + my ($var_data, $allele_codes); + ## export current allele & variation_feature data - alleles flipped where needed + if( $self->required_param('schema') eq 'old'){ + $var_data = export_allele_data($var_dba, $first, $last, $flip); + } + else{ + $var_data = export_allele_data_new_schema($var_dba, $first, $last, $flip); + } + + + foreach my $var( keys %$var_data){ + + ## cope with unmapped variants with alleles or named alleles, but don't check them + next unless (defined $var_data->{$var}->{vf_allele} && $var_data->{$var}->{vf_allele} =~/\w+/); ## neaten + + ## isolate expected alleles + my %expected_alleles; + my @expected_alleles = split/\//, $var_data->{$var}->{vf_allele}; + foreach my $exp(@expected_alleles){ + $expected_alleles{$exp} = 1; + } + + + ## check through allele submissions + foreach my $submitted_data (@{$var_data->{$var}->{allele_data}}){ + + my $check_allele = $submitted_data->[6]; + + ## $submitted_data content: [ al.allele_id, al.subsnp_id, al.allele[code_id], al.frequency, al.sample_id, al.count, al.allele ] + + unless( exists $expected_alleles{$check_allele} && $expected_alleles{$check_allele} ==1 ){ ## check expected on allele not allele_code + + if(defined $submitted_data->[4]){ + ###fail whole experimental set + $fail_all{$submitted_data->[1]}{$submitted_data->[4]} = 1; ## subsnp_id, sample_id + } + else{ + push @{$fail->{11}}, [$var, $submitted_data->[6] ]; ## var_id, allele + } + } + } + } + ## write allele records to new table flipping strands as needed + write_allele($var_dba, $var_data, $self->required_param('schema')); + + ### fail full experimental result for sample & subsnp_id + write_allele_fails($var_dba , $fail, \%fail_all, $var_data,$self->required_param('schema') ); +} + + +=head2 export_data_with_allele_string + + Extract variant data for checking taking allele_string from variation_feature.allele_string table + +=cut +sub export_data_with_allele_string{ + + my ($var_dba, $first, $last) = @_; + + my @to_check; + + my $data_ext_sth = $var_dba->dbc->prepare(qq[SELECT v.variation_id, + v.name, + vf.variation_feature_id, + vf.seq_region_id, + vf.seq_region_start, + vf.seq_region_end, + vf.seq_region_strand, + vf.allele_string, + tmw.count, + vf.source_id, + vf.consequence_types, + vf.variation_set_id, + vf.somatic, + vf.class_attrib_id, + sr.name + FROM + variation v, + variation_feature vf, + seq_region sr, + tmp_map_weight_working tmw + WHERE + v.variation_id between ? and ? + and vf.variation_id = v.variation_id + and vf.variation_id = tmw.variation_id + and vf.seq_region_id = sr.seq_region_id + ]); + + $data_ext_sth->execute($first, $last)|| die "ERROR extracting variation feature info\n"; + + my $data = $data_ext_sth->fetchall_arrayref(); + foreach my $l(@{$data}){ + my %save; + + $save{v_id} = $l->[0]; + $save{name} = $l->[1]; + $save{vf_id} = $l->[2]; + $save{seqreg_id} = $l->[3]; + $save{start} = $l->[4]; + $save{end} = $l->[5]; + $save{strand} = $l->[6]; + $save{allele} = $l->[7]; + $save{map} = $l->[8]; + $save{source_id} = $l->[9]; + $save{consequence_types} = $l->[10]; + $save{variation_set_id} = $l->[11]; + $save{somatic} = $l->[12]; + $save{class_attrib_id} = $l->[13]; + $save{seqreg_name} = $l->[14]; + + push @to_check,\%save;; + } + + return (\@to_check); + +} + +=head2 export_data_adding_allele_string + + Extract variant data for checking taking allele_string from allele_string table + if required (dbSNP import pipeline does not populate variation_feature.allele_string) + +=cut +sub export_data_adding_allele_string{ + + my ($var_dba, $first, $last) = @_; + + my @to_check; + + my $variant_ext_sth = $var_dba->dbc->prepare(qq[SELECT v.variation_id, + v.name, + vf.variation_feature_id, + vf.seq_region_id, + vf.seq_region_start, + vf.seq_region_end, + vf.seq_region_strand, + tmw.count, + vf.source_id, + vf.consequence_types, + vf.variation_set_id, + vf.somatic, + vf.class_attrib_id, + sr.name + FROM + variation v, + variation_feature vf, + seq_region sr, + tmp_map_weight_working tmw + WHERE + v.variation_id between ? and ? + and vf.variation_id = v.variation_id + and vf.variation_id = tmw.variation_id + and vf.seq_region_id = sr.seq_region_id + ]); + + + my $allele_ext_sth = $var_dba->dbc->prepare(qq[SELECT a.variation_id, + a.allele + FROM allele_string a + WHERE a.variation_id between ? and ? + ]); + + + ### get allele information + + my %alleles; + + $allele_ext_sth->execute($first, $last)|| die "ERROR extracting allele feature info\n"; + + my $allele_data = $allele_ext_sth->fetchall_arrayref(); + + foreach my $l(@{$allele_data}){ + push @{$alleles{$l->[0]}}, $l->[1]; + } + + ### get variant information + + $variant_ext_sth->execute($first, $last)|| die "ERROR extracting variation feature info\n"; + + my $variant_data = $variant_ext_sth->fetchall_arrayref(); + foreach my $l(@{$variant_data}){ + my %save; + + $save{v_id} = $l->[0]; + $save{name} = $l->[1]; + $save{vf_id} = $l->[2]; + $save{seqreg_id} = $l->[3]; + $save{start} = $l->[4]; + $save{end} = $l->[5]; + $save{strand} = $l->[6]; + + + $save{map} = $l->[7]; + $save{source_id} = $l->[8]; + $save{consequence_types}= $l->[9]; + $save{variation_set_id} = $l->[10]; + $save{somatic} = $l->[11]; + + $save{seqreg_name} = $l->[13]; + + if( defined $alleles{$l->[0]}->[0]){ + $save{allele} = join '/', @{$alleles{$l->[0]}}; + } + else{ + $save{allele} = ""; + warn "No alleles available for variant $l->[1]"; + } + + push @to_check,\%save;; + } + + return (\@to_check); + +} + +=head2 export_allele_data + + Extract allele data and variation_feature allele string where available + Compliments alleles if variant has unique map location and is on reverse strand + + Uses old variation schema for compatibility with import pipeline + +=cut +sub export_allele_data{ + + my ($var_dba, $first, $last, $flip) = @_; + + my %save; + + my %done; + ### Look up expected allele string from new variation_feature table with flipped allele strings + my $data_ext_sth = $var_dba->dbc->prepare(qq[SELECT v.variation_id, + v.name, + vf.allele_string, + al.allele_id, + al.subsnp_id, + al.allele, + al.frequency, + al.sample_id, + al.count, + al.allele, + vf.variation_feature_id + FROM variation v join allele al on(v.variation_id = al.variation_id ) + left outer join variation_feature_working vf on (vf.variation_id = v.variation_id ) + WHERE + v.variation_id between ? and ? + ]); + + $data_ext_sth->execute($first, $last)|| die "ERROR extracting variation feature info\n"; + + my $data = $data_ext_sth->fetchall_arrayref(); + + foreach my $l(@{$data}){ + ## distinct on query causes database tmp to fill - handling hackily instead [only extracting variation_feature_id for this purpose] + next if $done{"$l->[3]\_$l->[10]"} ; + $done{"$l->[3]\_$l->[10]"} = 1; + $save{$l->[0]}{name} = $l->[1]; + $save{$l->[0]}{vf_allele} = $l->[2]; + + if($flip->{$l->[0]}){ + + ## update allele for flips + reverse_comp(\$l->[5]); + reverse_comp(\$l->[9]); + } + + push @{$save{$l->[0]}{allele_data}}, [$l->[3], $l->[4], $l->[5], $l->[6], $l->[7], $l->[8], $l->[9] ]; + + } + + return (\%save); + +} + +=head2 export_allele_data_new_schema + + Extract allele data and variation_feature allele string where available + Compliments alleles if variant has unique map location and is on reverse strand + + Uses new variation schema + +=cut +sub export_allele_data_new_schema{ + + my ($var_dba, $first, $last, $flip, $schema) = @_; + + my %save; + + + # hack to use with old schema + my %allele_codes; ## save as looked up - should be quicker than holding all unusual alleles + + #my $allele_code_ext_sth = $var_dba->dbc->prepare(qq[ select allele_code_id from allele_code where allele = ? ]); + + #my $allele_code_ins_sth = $var_dba->dbc->prepare(qq[ insert into allele_code (allele) values (?) ]); + + my %done; + ### Look up expected allele string from new variation_feature table with flipped allele strings + my $data_ext_sth = $var_dba->dbc->prepare(qq[SELECT v.variation_id, + v.name, + vf.allele_string, + al.allele_id, + al.subsnp_id, + al.allele_code_id, + al.frequency, + al.sample_id, + al.count, + alc.allele, + vf.variation_feature_id + FROM variation v join allele al on(v.variation_id = al.variation_id ) + join allele_code alc on (al.allele_code_id = alc.allele_code_id ) + left outer join variation_feature_working vf on (vf.variation_id = v.variation_id ) + WHERE + v.variation_id between ? and ? + ]); + + $data_ext_sth->execute($first, $last)|| die "ERROR extracting allele info\n"; + + my $data = $data_ext_sth->fetchall_arrayref(); + + foreach my $l(@{$data}){ + ## distinct on query causes database tmp to fill - handling hackily instead [only extracting variation_feature_id for this purpose] + next if $done{"$l->[3]\_$l->[10]"} ; + $done{"$l->[3]\_$l->[10]"} = 1; + $save{$l->[0]}{name} = $l->[1]; + $save{$l->[0]}{vf_allele} = $l->[2]; + + if($flip->{$l->[0]}){ + + ## update allele for flips + reverse_comp(\$l->[9]); + unless ($allele_codes{$l->[9]}){ + $allele_codes{$l->[9]} = get_allele_code($var_dba, $l->[9] ); + } + ## updated allele code for flip + $l->[5] = $allele_codes{$l->[9]} ; + } + + push @{$save{$l->[0]}{allele_data}}, [$l->[3], $l->[4], $l->[5], $l->[6], $l->[7], $l->[8], $l->[9] ]; + + } + + return (\%save); + +} + + +=head2 get_reference_base + + Extract sequence from genomic reference at variant coordinates + to check against supplied coordinates + +=cut +sub get_reference_base{ + + my ($var, $slice_ad) = @_; + + my $ref_seq; + + if( ($var->{end} +1) == $var->{start}){ ## convention for insertions to reference + $ref_seq = "-"; + } + + elsif( $var->{end} < $var->{start}){ ## coordinate error + warn "Incorrect coords $var->{start} - $var->{end} for $var->{name} \n"; + } + + else{ + + # retrieve the reference sequence at that mapping for deletion or substitution + + my $slice = $slice_ad->fetch_by_region('toplevel', $var->{seqreg_name}, $var->{start}, $var->{end}); + + unless (defined $slice){ die "ERROR Getting slice for $var->{seqreg_name}, $var->{start}, $var->{end}";} + $ref_seq = $slice->seq(); + + # correct for multi-mapping variants which may be on negative strand + if($var->{strand} eq "-1"){ reverse_comp(\$ref_seq);} + } + + return $ref_seq; + +} +=head2 check_illegal_characters + + Checks for non ambiguity code/ATGC character to fail + +=cut +sub check_illegal_characters{ + + my $allele = shift; + my $AMBIGUITIES = shift; + + ## HGMD_MUTATION is permitted as an allele string + $allele =~ s/\/|HGMD_MUTATION//g; + if ($allele =~ m /[^ACGTU\-$AMBIGUITIES]/i){ + ## identify specfic alleles to flag + my @fail; + my @al = split/\//, $allele; + foreach my $al(@al){ + if ($al =~ m /[^ACGTU\-$AMBIGUITIES]/i){ + push @fail, $al; + } + } + return \@fail; + } + else{ + return undef; + } +} +=head2 remove_ambiguous_alleles + + Expand ambiguous alleles to A/T/C/G + Returntype : expanded allele string + +=cut +sub remove_ambiguous_alleles{ + + my $allele_string = shift; + my $AMBIGUITIES = shift; + + $allele_string =~ s/([U$AMBIGUITIES])/$AMBIG_REGEXP_HASH{$1}/ig; + + return $allele_string; +} + +=head2 find_ambiguous_alleles + + Checks if any of the bases in an allele string are ambiguity codes + Returntype : reference to array of ambiguous alleles + +=cut +sub find_ambiguous_alleles{ + + my $allele_string = shift; + my $AMBIGUITIES = shift; + + my @fail; + my @al = split/\//, $allele_string; + foreach my $al(@al){ + if ($al =~ m /$AMBIGUITIES/i){ + push @fail, $al; + } + } + return \@fail; + +} +=head2 check_four_bases + + Checks if all 4 bases are present in allele string + Returntype : 1 if present, 0 if not + +=cut +sub check_four_bases{ + + my $allele_string =shift; + + my @alleles = split /\//, $allele_string ; + + return 0 if scalar(@alleles) < 4; + + + my %allele; + foreach my $al(@alleles){ + $allele{$al} = 1; + } + + if( exists $allele{A} && + exists $allele{T} && + exists $allele{C} && + exists $allele{G} ){ + return 1; + } + else{ + return 0; + } +} + + +=head2 write_variant_fails + + Update failed_variation_working with all variation_id/reason combinations + +=cut +sub write_variant_fails{ + + + my $var_dba = shift; + my $fail_list = shift; + + my $fail_ins_sth = $var_dba->dbc->prepare(qq[insert into failed_variation_working + (variation_id, failed_description_id) + values (?,?) + ]); + + + foreach my $reason (keys %{$fail_list}){ + + ## duplicates arise due to running on variation_features not variations + my @fails = unique(@{$fail_list->{$reason}}); + + foreach my $var( @fails ){ + $fail_ins_sth->execute($var, $reason)|| die "ERROR inserting variation fails info\n"; + } + } +} + + + +=head2 write_allele_fails + + Update failed_allele_working with all variation_id/reason combinations + + +=cut +sub write_allele_fails{ + + + my $var_dba = shift; + my $fail_list = shift; + my $fail_all = shift; + my $var_data = shift; + my $schema = shift; + + + my $allele_id_ext_sth ; + ## find allele id to fail - cope with different schemas + if($schema =~ /old/){ + $allele_id_ext_sth = $var_dba->dbc->prepare(qq[select allele_id + from allele_working + where allele =? + and allele_working.variation_id =? + ]); + } + else{ + $allele_id_ext_sth = $var_dba->dbc->prepare(qq[select allele_id + from allele_working, allele_code + where allele_code.allele =? + and allele_code.allele_code_id = allele_working.allele_code_id + and allele_working.variation_id =? + ]); + } + + + + ## those failing as a submitted set with frequencies + my $allele_set_id_ext_sth = $var_dba->dbc->prepare(qq[select allele_id + from allele_working + where allele_working.variation_id =? + and allele_working.subsnp_id =? + and allele_working.sample_id = ? + ]); + + + + my $fail_ins_sth = $var_dba->dbc->prepare(qq[insert into failed_allele_working + (allele_id, failed_description_id) + values (?,?) + ]); + + my %done; ## save on old pk to drop out eariler + my %done_new; ## save on new pk too + + foreach my $reason (keys %{$fail_list}){ + + foreach my $var( @{$fail_list->{$reason}} ){ + ## if the failure is a property of the reported allele, fail all entries with this allele + + if ($done{$reason}{$var->[0]}{$var->[1]}){ next; } ## duplicates arise due to running on multiple variation_feature.allele_string's + + $done{$reason}{$var->[0]}{$var->[1]} = 1; + + $allele_id_ext_sth->execute($var->[1], $var->[0])|| die "ERROR extracting allele id info\n"; + my $allele_id = $allele_id_ext_sth->fetchall_arrayref(); + + if ($allele_id->[0]->[0]){ + $done_new{$allele_id->[0]->[0]}{$reason} = 1; ## flag by reason and new id for screening later + $fail_ins_sth->execute($allele_id->[0]->[0], $reason)|| die "ERROR inserting allele fails info\n"; + } + else{ + warn "Error finding allele id for var : $var->[0] & allele $var->[1] - not failing \n"; + } + } + + + foreach my $var( keys %$var_data){ + + ## check through allele submissions & fail set of results for same subsnp & sample + foreach my $submitted_data (@{$var_data->{$var}->{allele_data}}){ + + next unless defined $submitted_data->[4]; ## only fails with sample info need be checked + + next unless defined $$fail_all{$submitted_data->[1]}{$submitted_data->[4]}; ## failed by ssid & sample + + ## look up new pk for submission set on variation_id, subsnp_is, sample_id [quicker in bulk, but only update once] + $allele_set_id_ext_sth->execute($var, $submitted_data->[1], $submitted_data->[4] )|| die "ERROR extracting allele id info\n"; + + my $allele_set_ids = $allele_set_id_ext_sth->fetchall_arrayref(); + foreach my $allele_id (@{$allele_set_ids}){ + + next if (exists $done_new{$allele_id->[0]}{11} && $done_new{$allele_id->[0]}{11} ==1); + $done_new{$allele_id->[0]}{11} = 1; + + $fail_ins_sth->execute($allele_id->[0], 11 )|| die "ERROR inserting allele fails info\n"; + } + undef $$fail_all{$submitted_data->[1]}{$submitted_data->[4]}; + } + } + } +} +=head2 insert_variation_features + + Create new version of variation feature tables with alleles ordered ref/alt where possible and complimented where necessary + +=cut +sub insert_variation_features{ + + my $var_dba = shift; + my $to_check = shift; + + my $varfeat_ins_sth = $var_dba->dbc->prepare(qq[insert into variation_feature_working + (variation_id, variation_name, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, + allele_string, map_weight, source_id, consequence_types variation_set_id, somatic, class_attrib_id) + values (?,?,?,?,?,?,?,?,?,?,?,?,?) + ]); + + + foreach my $data (@{$to_check}){ + + $varfeat_ins_sth->execute($data->{v_id}, + $data->{name}, + $data->{seqreg_id}, + $data->{start}, + $data->{end}, + $data->{strand}, + $data->{allele}, + $data->{map}, + $data->{source_id}, + $data->{consequence_types}, + $data->{variation_set_id}, + $data->{somatic}, + $data->{class_attrib_id})|| die "ERROR importing variation feature info\n"; + + + } +} + + + +sub write_variant_flips{ + + my $var_dba = shift; + my $flip = shift; + + #my $flip_update_sth = $var_dba->dbc->prepare(qq[ update variation + # set flipped = 1 + # where variation_id = ? + # ]); + + my $flip_ins_sth = $var_dba->dbc->prepare(qq[ insert into variation_to_reverse_working + (variation_id) values ( ?) + ]); + + + foreach my $var (keys %{$flip} ){ + + #### updating VARIATION table at end - no change to imported data unless all OK + #$flip_update_sth->execute($var)|| die "ERROR updating variation flip status\n"; ## doing this at the end in bulk + $flip_ins_sth->execute($var)|| die "ERROR adding variation flip status\n"; + + } +} + +=head2 insert_variation_features + + Create new version of allele table with alleles complimented where neccessary + +=cut +sub write_allele{ + ### write data to new allele table flipping where needed; best way to recover partial updates caused by fails + + my ($var_dba, $var_data, $schema) = @_; + my %done; + + my $allele_ins_sth ; + if($schema =~ /old/){ + $allele_ins_sth = $var_dba->dbc->prepare(qq[ insert into allele_working + (variation_id, subsnp_id, allele, frequency, sample_id, count) + values (?, ?, ?, ?, ?,? ) + ]); + } + else{ + $allele_ins_sth = $var_dba->dbc->prepare(qq[ insert into allele_working + (variation_id, subsnp_id, allele_code_id, frequency, sample_id, count) + values (?, ?, ?, ?, ?,? ) + ]); + } + + foreach my $var (keys %$var_data){ + + foreach my $allele (@{$var_data->{$var}->{allele_data}}){ ## array of data from 1 row in table + next if exists $done{$allele->[0]} ; #&& $done{$allele->[0]} ==1; ## filtering on old pk + $done{$allele->[0]} = 1; + + $allele_ins_sth->execute( $var, $allele->[1], $allele->[2], $allele->[3], $allele->[4], $allele->[5]) || die "ERROR inserting allele info\n"; + } + } +} + + +sub get_allele_code{ + + my ($var_dba, $current_al) = @_; + + my $allele_code; + + my $allele_code_ext_sth = $var_dba->dbc->prepare(qq [select allele_code_id from allele_code where allele =?]); + my $allele_code_ins_sth = $var_dba->dbc->prepare(qq [insert into allele_code (allele) values (?) ]); + + + $allele_code_ext_sth->execute($current_al) || die "Failed to look up allele code for $current_al\n";; + + my $al_code = $allele_code_ext_sth->fetchall_arrayref(); + if(defined $al_code->[0]->[0]){ + $allele_code = $al_code->[0]->[0]; + } + else{ + ## insert new allele code - may be needed due to complimenting long odd things + warn "Entering allele $current_al\n"; + + $allele_code_ins_sth->execute($current_al) || die "Failed to enter allele code for $current_al\n"; + + $allele_code_ext_sth->execute($current_al) || die "Failed to look up allele code for $current_al\n";; + + my $al_code = $allele_code_ext_sth->fetchall_arrayref(); + + if(defined $al_code->[0]->[0]){ + $allele_code = $al_code->[0]->[0]; + } + else{ + die "New allele code insertion failed oddly\n"; + } + } + return $allele_code; +} + + + + +##reverse_comp (AC)17/(AC)19 => 91)GT(/71)GT( +sub rev_tandem{ + + + my $allele_string = shift; + + my $new_allele_string; + + my @parts = split/\//, $allele_string; + + foreach my $part (@parts){ + + if( $part =~/\d+$/){ + my $num = $&; + $part =~ s/\d+$|\(|\)//g; + + reverse_comp(\$part); + print "doing $part\n"; + $new_allele_string .= "(" . $part .")" . $num . "/"; + } + else{ + reverse_comp(\$part); + $new_allele_string .= $part. "/";; + } + } + $new_allele_string =~ s/\/$//; + + return $new_allele_string ; +} + +### put somewhere sensible +sub unique { + my %a; + map { $a{$_} = 1; } @_; + return sort keys %a; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/VariantQC_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariantQC/VariantQC_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,265 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + + + +=head1 NAME + +Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC_conf + +=head1 DESCRIPTION + +Configuration module for variant QC eHive process + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +sub default_options { + my ($self) = @_; + + # the hash returned from this function is used to configure the pipeline, you can supply + # any of these options on the command line to override these default values + + # you shouldn't need to edit anything in this file other than these values, if you + # find you do need to then we should probably make it an option here, contact + # the variation team to discuss this - patches are welcome! + + my $login = `whoami`; + chomp $login; + + return { + + # general pipeline options that you should change to suit your environment + + # the location of your checkout of the ensembl API (the hive looks for SQL files here) + + ensembl_cvs_root_dir => $ENV{'HOME'}.'/EBI/bin/HEAD', + + # a name for your pipeline (will also be used in the name of the hive database) + + pipeline_name => 'variation_qc', + + # a directory to keep hive output files and your registry file, you should + # create this if it doesn't exist + + pipeline_dir => '/lustre/scratch110/ensembl/' . $login . '/'.$self->o('pipeline_name') . '/'. $self->o('species'), + + # a directory where hive workers will dump STDOUT and STDERR for their jobs + # if you use lots of workers this directory can get quite big, so it's + # a good idea to keep it on lustre, or some other place where you have a + # healthy quota! + + output_dir => $self->o('pipeline_dir').'/hive_output', + + # a standard ensembl registry file containing connection parameters + # for your target database(s) (and also possibly aliases for your species + # of interest that you can then supply to init_pipeline.pl with the -species + # option) + + reg_file => $self->o('pipeline_dir').'/ensembl.registry', + + # flip varation_features on reverse strand with map weight 1 unless this set to 0 + + #flip_variants => 1, TO BE IMPLEMENTED on by default + + ## The current dbSNP importer does not create the variation_feature.allele_string + ## Switch this to 0 if QC'ing external data imported with variation_feature.allele_string's + + do_allele_string => 0, + + # number of *variants* handled per batch + + qc_batch_size => 1000, + unmapped_batch_size => 100000, ## quicker check can be binned in bigger chunks + + + # Options to change for failure recovery + + # only data with variation_id >= start_at_variation_id will be imported + + start_at_variation_id => 1, + + # this can be changed for failure recovery + # working tables will not be created when create_working_table is set to 0 + + create_working_tables => 1, + + # create tmp_map_weight table unless this set to 0 + + create_map_table => 1, + + + + # configuration for the various resource options used in the pipeline + # EBI farm users should either change these here, or override them on the + # command line to suit the EBI farm. The names of each option hopefully + # reflect their usage, but you may want to change the details (memory + # requirements, queue parameters etc.) to suit your own data + + default_lsf_options => '-R"select[mem>2000] rusage[mem=2000]" -M2000000', + #urgent_lsf_options => '-q yesterday -R"select[mem>2000] rusage[mem=2000]" -M2000000', ## limit on processes + urgent_lsf_options => '-R"select[mem>2000] rusage[mem=2000]" -M2000000', + highmem_lsf_options => '-R"select[mem>15000] rusage[mem=15000]" -M15000000', # this is Sanger LSF speak for "give me 15GB of memory" + long_lsf_options => '-q long -R"select[mem>2000] rusage[mem=2000]" -M2000000', + + # options controlling the number of workers used for the parallelisable analyses + # NEED TO ESTABLISH GOOD default values for most species + + variant_qc_capacity => 30, + unmapped_var_capacity => 10, + + + # these flags control which parts of the pipeline are run + + run_variant_qc => 1, + run_unmapped_var => 1, + + # connection parameters for the hive database, you should supply the hive_db_password + # option on the command line to init_pipeline.pl (parameters for the target database + # should be set in the registry file defined above) + + # init_pipeline.pl will create the hive database on this machine, naming it + # _, and will drop any existing database with this + # name + + hive_db_host => 'ens-variation', + hive_db_port => 3306, + hive_db_user => 'ensadmin', + + pipeline_db => { + -host => $self->o('hive_db_host'), + -port => $self->o('hive_db_port'), + -user => $self->o('hive_db_user'), + -pass => $self->o('hive_db_password'), + -dbname => $ENV{'USER'}.'_'.$self->o('pipeline_name') . '_' . $self->o('species'), + }, + }; +} + +sub pipeline_create_commands { + my ($self) = @_; + return [ + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 0).q{-e 'DROP DATABASE IF EXISTS }.$self->o('pipeline_db', '-dbname').q{'}, + @{$self->SUPER::pipeline_create_commands}, + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).q{-e 'INSERT INTO meta (meta_key, meta_value) VALUES ("hive_output_dir", "}.$self->o('output_dir').q{")'}, + ]; +} + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => $self->o('default_lsf_options') }, + 1 => { -desc => 'urgent', 'LSF' => $self->o('urgent_lsf_options') }, + 2 => { -desc => 'highmem', 'LSF' => $self->o('highmem_lsf_options') }, + 3 => { -desc => 'long', 'LSF' => $self->o('long_lsf_options') }, + }; +} + +sub pipeline_analyses { + my ($self) = @_; + + my @common_params = ( + ensembl_registry => $self->o('reg_file'), + species => $self->o('species'), + ); + + my @analyses; + + + + push @analyses, ( + + { -logic_name => 'init_run_variant_qc', + -module => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::InitVariantQC', + -parameters => { + qc_batch_size => $self->o('qc_batch_size'), + unmapped_batch_size => $self->o('unmapped_batch_size'), + + run_variant_qc => $self->o('run_variant_qc'), + run_unmapped_var => $self->o('run_unmapped_var'), + + start_at_variation_id => $self->o('start_at_variation_id'), + create_working_tables => $self->o('create_working_tables'), + create_map_table => $self->o('create_map_table'), + @common_params, + }, + -input_ids => [{}], + -rc_id => 1, + -flow_into => { + 2 => [ 'variant_qc' ], + 3 => [ 'unmapped_var' ], + 4 => [ 'finish_variation_qc' ], + + }, + }, + + { -logic_name => 'unmapped_var', + -module => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::UnmappedVariant', + -parameters => { + batch_size => $self->o('unmapped_batch_size'), + @common_params, + }, + -input_ids => [], + -hive_capacity => $self->o('unmapped_var_capacity'), + -rc_id => 0, + -flow_into => {}, + }, + + { -logic_name => 'variant_qc', + -module => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::VariantQC', + -parameters => { + batch_size => $self->o('qc_batch_size'), + do_allele_string => $self->o('do_allele_string'), + pipeline_dir => $self->o('pipeline_dir'), ### temp - write temp log files + @common_params, + }, + -input_ids => [], + -hive_capacity => $self->o('variant_qc_capacity'), + -rc_id => 0, + -flow_into => {}, + }, + + + { -logic_name => 'finish_variation_qc', + -module => 'Bio::EnsEMBL::Variation::Pipeline::VariantQC::FinishVariantQC', + -parameters => { + pipeline_dir => $self->o('pipeline_dir'), + @common_params, + }, + -input_ids => [], + -hive_capacity => 1, + -rc_id => 1, + -wait_for => [ 'variant_qc','unmapped_var' ], + -flow_into => {}, + }, + + ); + + + + return \@analyses; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariationConsequence_conf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/VariationConsequence_conf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,261 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::Pipeline::VariationConsequence_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +sub default_options { + my ($self) = @_; + + # the hash returned from this function is used to configure the pipeline, you can supply + # any of these options on the command line to override these default values + + # you shouldn't need to edit anything in this file other than these values, if you + # find you do need to then we should probably make it an option here, contact + # the variation team to discuss this - patches are welcome! + + return { + + # general pipeline options that you should change to suit your environment + + # the location of your checkout of the ensembl API (the hive looks for SQL files here) + + ensembl_cvs_root_dir => $ENV{'HOME'}.'/ensembl-branches/HEAD', + + # a name for your pipeline (will also be used in the name of the hive database) + + pipeline_name => 'variation_consequence', + + # a directory to keep hive output files and your registry file, you should + # create this if it doesn't exist + + pipeline_dir => '/lustre/scratch103/ensembl/gr5/'.$self->o('pipeline_name'), + + # a directory where hive workers will dump STDOUT and STDERR for their jobs + # if you use lots of workers this directory can get quite big, so it's + # a good idea to keep it on lustre, or some other place where you have a + # healthy quota! + + output_dir => $self->o('pipeline_dir').'/hive_output', + + # a standard ensembl registry file containing connection parameters + # for your target database(s) (and also possibly aliases for your species + # of interest that you can then supply to init_pipeline.pl with the -species + # option) + + reg_file => $self->o('pipeline_dir').'/ensembl.registry', + + # if set to 1 this option tells the transcript_effect analysis to disambiguate + # ambiguity codes in single nucleotide alleles, so e.g. an allele string like + # 'T/M' will be treated as if it were 'T/A/C' (this was a request from ensembl + # genomes and we don't use it by default in the ensembl variation pipeline) + + disambiguate_single_nucleotide_alleles => 0, + + # configuration for the various resource options used in the pipeline + # EBI farm users should either change these here, or override them on the + # command line to suit the EBI farm. The names of each option hopefully + # reflect their usage, but you may want to change the details (memory + # requirements, queue parameters etc.) to suit your own data + + default_lsf_options => '-R"select[mem>2000] rusage[mem=2000]" -M2000000', + urgent_lsf_options => '-q yesterday -R"select[mem>2000] rusage[mem=2000]" -M2000000', + highmem_lsf_options => '-R"select[mem>15000] rusage[mem=15000]" -M15000000', # this is Sanger LSF speak for "give me 15GB of memory" + long_lsf_options => '-q long -R"select[mem>2000] rusage[mem=2000]" -M2000000', + + # options controlling the number of workers used for the parallelisable analyses + # these default values seem to work for most species + + transcript_effect_capacity => 50, + set_variation_class_capacity => 10, + + # set this flag to 1 to include LRG transcripts in the transcript effect analysis + + include_lrg => 1, + + # these flags control which parts of the pipeline are run + + run_transcript_effect => 1, + run_variation_class => 1, + + # connection parameters for the hive database, you should supply the hive_db_password + # option on the command line to init_pipeline.pl (parameters for the target database + # should be set in the registry file defined above) + + # init_pipeline.pl will create the hive database on this machine, naming it + # _, and will drop any existing database with this + # name + + hive_db_host => 'ens-variation', + hive_db_port => 3306, + hive_db_user => 'ensadmin', + + pipeline_db => { + -host => $self->o('hive_db_host'), + -port => $self->o('hive_db_port'), + -user => $self->o('hive_db_user'), + -pass => $self->o('hive_db_password'), + -dbname => $ENV{'USER'}.'_'.$self->o('pipeline_name'), + }, + }; +} + +sub pipeline_create_commands { + my ($self) = @_; + return [ + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 0).q{-e 'DROP DATABASE IF EXISTS }.$self->o('pipeline_db', '-dbname').q{'}, + @{$self->SUPER::pipeline_create_commands}, + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).q{-e 'INSERT INTO meta (meta_key, meta_value) VALUES ("hive_output_dir", "}.$self->o('output_dir').q{")'}, + ]; +} + +sub resource_classes { + my ($self) = @_; + return { + 0 => { -desc => 'default', 'LSF' => $self->o('default_lsf_options') }, + 1 => { -desc => 'urgent', 'LSF' => $self->o('urgent_lsf_options') }, + 2 => { -desc => 'highmem', 'LSF' => $self->o('highmem_lsf_options') }, + 3 => { -desc => 'long', 'LSF' => $self->o('long_lsf_options') }, + }; +} + +sub pipeline_analyses { + my ($self) = @_; + + my @common_params = ( + ensembl_registry => $self->o('reg_file'), + species => $self->o('species'), + ); + + my @analyses; + + if ($self->o('run_transcript_effect')) { + + push @analyses, ( + + { -logic_name => 'init_transcript_effect', + -module => 'Bio::EnsEMBL::Variation::Pipeline::InitTranscriptEffect', + -parameters => { + include_lrg => $self->o('include_lrg'), + @common_params, + }, + -input_ids => [{}], + -rc_id => 1, + -flow_into => { + 2 => [ 'rebuild_tv_indexes' ], + 3 => [ 'update_variation_feature' ], + 4 => [ 'transcript_effect' ], + }, + }, + + { -logic_name => 'transcript_effect', + -module => 'Bio::EnsEMBL::Variation::Pipeline::TranscriptEffect', + -parameters => { + disambiguate_single_nucleotide_alleles => $self->o('disambiguate_single_nucleotide_alleles'), + @common_params, + }, + -input_ids => [], + -hive_capacity => $self->o('transcript_effect_capacity'), + -rc_id => 0, + -flow_into => {}, + }, + + { -logic_name => 'rebuild_tv_indexes', + -module => 'Bio::EnsEMBL::Variation::Pipeline::RebuildIndexes', + -parameters => { + @common_params, + }, + -input_ids => [], + -hive_capacity => 1, + -rc_id => 1, + -wait_for => [ 'transcript_effect' ], + -flow_into => {}, + }, + + { -logic_name => 'update_variation_feature', + -module => 'Bio::EnsEMBL::Variation::Pipeline::UpdateVariationFeature', + -parameters => { + @common_params, + }, + -input_ids => [], + -hive_capacity => 1, + -rc_id => 1, + -wait_for => [ 'rebuild_tv_indexes' ], + -flow_into => {}, + }, + + ); + } + + if ($self->o('run_variation_class')) { + + push @analyses, ( + + { -logic_name => 'init_variation_class', + -module => 'Bio::EnsEMBL::Variation::Pipeline::InitVariationClass', + -parameters => { + num_chunks => 50, + @common_params, + }, + -input_ids => [{}], + -hive_capacity => 1, + -rc_id => 2, + -wait_for => ( $self->o('run_transcript_effect') ? [ 'update_variation_feature' ] : [] ), + -flow_into => { + 1 => [ 'finish_variation_class' ], + 2 => [ 'set_variation_class' ], + }, + }, + + { -logic_name => 'set_variation_class', + -module => 'Bio::EnsEMBL::Variation::Pipeline::SetVariationClass', + -parameters => { + @common_params, + }, + -input_ids => [], + -hive_capacity => $self->o('set_variation_class_capacity'), + -rc_id => 0, + -flow_into => {}, + }, + + { -logic_name => 'finish_variation_class', + -module => 'Bio::EnsEMBL::Variation::Pipeline::FinishVariationClass', + -parameters => { + @common_params, + }, + -input_ids => [], + -hive_capacity => 1, + -rc_id => 1, + -wait_for => [ 'set_variation_class' ], + -flow_into => {}, + }, + + ); + } + + return \@analyses; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Population.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Population.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,266 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Population +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Population - A population represents a phenotypic +group, ethnic group, set of individuals used in an assay, etc. + +=head1 SYNOPSIS + + # Population + $pop = Bio::EnsEMBL::Variation::Population->new + (-name => 'WEST AFRICA', + -description => 'Sub-Saharan Nations bordering Atlantic north' . + ' of Congo River, and Central/Southern Atlantic' . + ' Island Nations.'); + + ... + + # print out all sub populations of a population + # same could work for super populations + + print_sub_pops($pop); + + sub print_sub_pops { + my $pop = shift; + my $level = shift || 0; + + my $sub_pops = $pop->get_all_sub_Populations(); + + foreach my $sp (@$sub_pops) { + print ' ' x $level++, + 'name: ', $sp->name(), + 'desc: ', $sp->description(), + 'size: ', $sp->size(),"\n"; + print_sub_pops($sp, $level); + } + } + + + +=head1 DESCRIPTION + +This is a class representing a population. A population may consist of any +grouping of individuals, including phenotypic groups (e.g. people with +diabetes), ethnic groups (e.g. caucasians), individuals used in an assay +(e.g. subjects in experiment X), etc. + +Populations may be arranged into an arbitrary hierarchy of sub and super +populations. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Population; + +use Bio::EnsEMBL::Variation::Sample; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +our @ISA = ('Bio::EnsEMBL::Variation::Sample'); + + +=head2 new + + Arg [-dbID]: int - unique internal identifier of the sample + Arg [-ADAPTOR]: Bio::EnsEMBL::PopulationAdaptor + Arg [-NAME]: string - name of the population + Arg [-DESCRIPTION]: string - description of the population + Arg [-SIZE]: int - the size of the population + Arg [-SUB_POPULATIONS]: listref of Bio::EnsEMBL::Population objects + Example : $pop = Bio::EnsEMBL::Variation::Population->new + (-name => 'WEST AFRICA', + -description => 'Sub-Saharan Nations bordering Atlantic north' . + ' of Congo River, and Central/Southern Atlantic' . + ' Island Nations.' + -sub_populations => \@sub_pops); + Description: Constructor. Instantiates a new Population object + Returntype : Bio::EnsEMBL::Variation::Population + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my ($dbID, $adaptor, $name, $desc, $size, $is_strain, $sub_pops) = + rearrange(['DBID','ADAPTOR','NAME', 'DESCRIPTION', 'SIZE', + 'SUB_POPULATIONS'], @_); + + return bless {'dbID' => $dbID, + 'adaptor' => $adaptor, + 'name' => $name, + 'description' => $desc, + 'size' => $size, + 'sub_populations' => $sub_pops}, $class; +} + + + +=head2 get_all_sub_Populations + + Arg [1] : none + Example : foreach my $sub_pop (@{$pop->get_all_sub_Populations}) { + my $sub_sub_pops = $sub_pop->get_all_sub_Populations(); + } + Description: Retrieves all populations which are conceptually a sub set + of this population. + Returntype : reference to list of Bio::EnsEMBL::Variation::Population objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_sub_Populations { + my $self = shift; + + if(!defined($self->{'sub_populations'}) && $self->{'adaptor'}) { + # lazy-load from database + $self->{'sub_populations'} = + $self->{'adaptor'}->fetch_all_by_super_Population($self); + } + return $self->{'sub_populations'} || []; +} + + + +=head2 get_all_super_Populations + + Arg [1] : none + Example : foreach my $sup_pop (@{$pop->get_all_super_Populations}) { + my $sup_sup_pops = $sup_pop->get_all_super_Populations(); + } + Description: Retrieves all populations which this population is a part of + from the database. + Super populations may not be directly added in order to avoid + circular references and memory leaks. You must add + sub_Populations instead and store this in the database. + Returntype : reference to list of Bio::EnsEMBL::Variation::Population objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_super_Populations { + my $self = shift; + + return [] if(!$self->{'adaptor'}); + + # load from database - do not cache to avoid circular references (mem leak)! + return $self->{'adaptor'}->fetch_all_by_sub_Population($self); +} + + + +=head2 add_sub_Population + + Arg [1] : Bio::EnsEMBL::Variation::Population $pop + Example : $pop->add_sub_Population($sub_pop); + $sub_pop->add_super_Population($pop); + Description: Adds a sub population to this population. + Returntype : none + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub add_sub_Population { + my $self = shift; + my $pop = shift; + + if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population argument expected.'); + } + + if($pop == $self) { + throw("Cannot add self as sub population."); + } + + $self->{'sub_populations'} ||= []; + push @{$self->{'sub_populations'}}, $pop; + + return $pop; +} + +=head2 get_all_synonyms + + Arg [1] : (optional) string $source - the source of the synonyms to + return. + Example : @dbsnp_syns = @{$p->get_all_synonyms('dbSNP')}; + @all_syns = @{$p->get_all_synonyms()}; + Description: Retrieves synonyms for this Population. If a source argument + is provided all synonyms from that source are returned, + otherwise all synonyms are returned. + Returntype : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_synonyms { + my $self = shift; + my $source = shift; + + return [] if(!$self->adaptor()); #if there is no adaptor, return empty string + + return $self->adaptor()->fetch_synonyms($self->dbID(),$source); + +} + +=head2 get_all_Individuals + + Arg [1] : none + Example : @individuals = @{$p->get_all_individuals()}; + Description: Retrieves all Individuals belonging to this Population. + Returntype : reference to list of Bio::EnsEMBL::Variation::Individual objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_Individuals { + my $self = shift; + + my $ia = $self->adaptor->db->get_IndividualAdaptor; + + return (defined $ia ? $ia->fetch_all_by_Population($self) : []); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/PopulationGenotype.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/PopulationGenotype.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,201 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::PopulationGenotype +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::PopulationGenotype - Module for a genotype +represented in a population. + +=head1 SYNOPSIS + + print $genotype->variation()->name(), "\n"; + print $genotype->allele1(), '/', $genotype->allele2(), "\n"; + print $genotype->frequency(), "\n"; + print $genotype->population()->name(), "\n"; + +=head1 DESCRIPTION + +This class represents a genotype which is present in a population. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::PopulationGenotype; + +use Bio::EnsEMBL::Variation::Genotype; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Variation::Genotype); + + + +=head2 new + + Arg [-dbID] : + int - unique internal identifier + Arg [-adaptor] : + Bio::EnsEMBL::Variation::DBSQL::PopulationAdaptor + Arg [-allele1] : + string - One of the two alleles defining this genotype + Arg [-allele2] : + string - One of the two alleles defining this genotype + Arg [-variation] : + Bio::EnsEMBL::Variation::Variation - The variation associated with this + genotype + Arg [-population] : + Bio::EnsEMBL::Population - The population this genotype is for. + Arg [-frequency] : + int - the frequency this genotype occurs in this population + Example : $pop_genotype = Bio:EnsEMBL::Variation::PopulationGenotype->new + (-allele1 => 'A', + -allele2 => 'T', + -variation => $variation, + -population => $pop + -frequency => 0.87); + Description: Constructor. Instantiates a PopulationGenotype object. + Returntype : Bio::EnsEMBL::Variation::PopulationGenotype + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my ($dbID, $adaptor, $genotype, $var, $pop, $freq, $count, $var_id, $ss_id) = + rearrange([qw(dbID adaptor genotype variation population frequency count _variation_id subsnp)],@_); + + if(defined($var) && + (!ref($var) || !$var->isa('Bio::EnsEMBL::Variation::Variation'))) { + throw("Bio::EnsEMBL::Variation::Variation argument expected"); + } + + if(defined($pop) && + (!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population'))) { + throw("Bio::EnsEMBL::Variation::Population argument expected"); + } + + # set subsnp_id to undefined if it's 0 in the DB + $ss_id = undef if defined($ss_id) && $ss_id == 0; + + # add ss to the subsnp_id + $ss_id = 'ss'.$ss_id if defined $ss_id && $ss_id !~ /^ss/; + + return bless { + 'dbID' => $dbID, + 'adaptor' => $adaptor, + 'genotype' => $genotype, + 'variation' => $var, + '_variation_id' => defined($var) ? undef : $var_id, + 'population' => $pop, + 'frequency' => $freq, + 'count' => $count, + 'subsnp' => $ss_id + }, $class; +} + + + + +=head2 population + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Population $pop + Example : $pop = $pop_genotype->population(); + Description: Getter/Setter for the population associated with this genotype + Returntype : Bio::EnsEMBL::Variation::Population + Exceptions : throw on bad argument + Caller : general + Status : At Risk + +=cut + + +sub population { + my $self = shift; + if(@_) { + my $pop = shift; + if(defined($pop) && + (!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population'))) { + throw('Bio::EnsEMBL::Variation::Population argument expected'); + } + return $self->{'population'} = $pop; + } + return $self->{'population'}; +} + + + + +=head2 frequency + + Arg [1] : string $freq (optional) + The new value to set the frequency attribute to + Example : $frequency = $pop_gtype->frequency() + Description: Getter/Setter for the frequency of occurance of this genotype + within its associated population. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub frequency{ + my $self = shift; + return $self->{'frequency'} = shift if(@_); + return $self->{'frequency'}; +} + +=head2 count + + Arg [1] : int $count (optional) + The new value to set the count attribute to + Example : $frequency = $pop_gtype->count() + Description: Getter/Setter for the observed count of this genotype + within its associated population. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub count{ + my $self = shift; + return $self->{'count'} = shift if(@_); + return $self->{'count'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,682 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix + +=head1 SYNOPSIS + + # create a new matrix for polyphen predictions + + my $orig_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -analysis => 'polyphen', + -peptide_length => 134, + ); + + # add some predictions + + $orig_pfpm->add_prediction(1, 'A', 'probably damaging', 0.967); + $orig_pfpm->add_prediction(2, 'C', 'benign', 0.09); + + # serialize the matrix to a compressed binary string + + my $binary_string = $pfpm->serialize; + + # store the string somewhere, fetch it later, and then create a new matrix using it + + my $new_pfpm = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( + -analysis => 'polyphen', + -matrix => $binary_string + ); + + # retrieve predictions + + my ($prediction, $score) = $new_pfpm->get_prediction(2, 'C'); + + print "A mutation to 'C' at position 2 is predicted to be $prediction\n"; + +=head1 DESCRIPTION + +This module defines a class representing a matrix of protein +function predictions, and provides method to access and set +predictions for a given position and amino acid, and also to +serialize the matrix for storage in a database, and deserialize +a matrix from the compressed format into a perl hash. + +=cut + +package Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix; + +use strict; +use warnings; + +use POSIX qw(ceil); +use List::Util qw(max); +use Compress::Zlib; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use base qw(Exporter); + +our @EXPORT_OK = qw($AA_LOOKUP @ALL_AAS); + +my $DEBUG = 0; + +# user-defined constants + +# a header which lets us identify prediction matrices and +# to check if they may have been corrupted + +our $HEADER = 'VEP'; + +# the format we use when 'pack'ing our predictions + +my $PACK_FORMAT = 'v'; + +# a hash mapping qualitative predictions to numerical values +# for all the analyses we use + +my $PREDICTION_TO_VAL = { + polyphen => { + 'probably damaging' => 0, + 'possibly damaging' => 1, + 'benign' => 2, + 'unknown' => 3, + }, + + sift => { + 'tolerated' => 0, + 'deleterious' => 1, + }, +}; + +# all valid amino acids + +our @ALL_AAS = qw(A C D E F G H I K L M N P Q R S T V W Y); + +# we use a short with all bits set to represent the lack of a prediction in +# an (uncompressed) prediction matrix, we will never observe this value +# as a real prediction even if we set all the (6) prediction bits because we +# limit the max score to 1000 so the 10 score bits will never all be set + +our $NO_PREDICTION = pack($PACK_FORMAT, 0xFFFF); + +# the number of bytes in a short + +my $BYTES_PER_PREDICTION = 2; + + +# constants derived from the the user-defined constants + +# the maximum number of distinct qualitative predictions used by any tool + +my $MAX_NUM_PREDS = max( map { scalar keys %$_ } values %$PREDICTION_TO_VAL ); + +# the number of bits used to encode the qualitative prediction + +my $NUM_PRED_BITS = ceil( log($MAX_NUM_PREDS) / log(2) ); + +throw("Cannot represent more than ".(2**6-1)." predictions") if $NUM_PRED_BITS > 6; + +# a hash mapping back from a numerical value to a qualitative prediction + +my $VAL_TO_PREDICTION = { + map { + my $tool = $_; + $tool => { + map { + $PREDICTION_TO_VAL->{$tool}->{$_} => $_ + } keys %{ $PREDICTION_TO_VAL->{$tool} } + } + } keys %$PREDICTION_TO_VAL +}; + +# a hash from amino acid single letter code to a numerical value + +our $AA_LOOKUP = { map {$ALL_AAS[$_] => $_} 0 .. $#ALL_AAS }; + +# the number of valid amino acids + +our $NUM_AAS = scalar(@ALL_AAS); + +=head2 new + + Arg [-ANALYSIS] : + The name of the analysis tool that made these predictions, + currently must be one of 'sift' or 'polyphen' + + Arg [-MATRIX] : + A gzip compressed binary string encoding the predictions, + typically created using this class and fetched from the + variation database (optional) + + Arg [-PEPTIDE_LENGTH] : + The length of the associated peptide, only required if + you want to serialize this matrix (optional) + + Arg [-TRANSLATION_MD5] : + The hex MD5 hash of the associated peptide sequence + + Description: Constructs a new ProteinFunctionPredictionMatrix object + Returntype : A new Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix instance + Exceptions : throws unless ANALYSIS is supplied and recognised + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my ( + $analysis, + $sub_analysis, + $matrix, + $peptide_length, + $translation_md5, + ) = rearrange([qw( + ANALYSIS + SUB_ANALYSIS + MATRIX + PEPTIDE_LENGTH + TRANSLATION_MD5 + )], @_); + + throw("analysis argument required") unless defined $analysis; + + throw("Unrecognised analysis '$analysis'") + unless defined $PREDICTION_TO_VAL->{$analysis}; + + my $self = bless { + analysis => $analysis, + sub_analysis => $sub_analysis, + matrix => $matrix, + peptide_length => $peptide_length, + translation_md5 => $translation_md5, + }, $class; + + $self->{matrix_compressed} = defined $matrix ? 1 : 0; + + return $self; +} + +=head2 analysis + + Arg[1] : string $analysis - the name of the analysis tool (optional) + Description : Get/set the analysis name + Returntype : string + Exceptions : throws if the name is not recognised + Status : At Risk + +=cut + +sub analysis { + my ($self, $analysis) = @_; + + if ($analysis) { + throw("Unrecognised analysis '$analysis'") + unless defined $PREDICTION_TO_VAL->{$analysis}; + + $self->{analysis} = $analysis; + } + + return $self->{analysis}; +} + +=head2 sub_analysis + + Arg[1] : string $sub_analysis - the name of the sub analysis (optional) + Description : Get/set the sub analysis name + Returntype : string + Exceptions : None + Status : At Risk + +=cut + +sub sub_analysis { + my ($self, $sub_analysis) = @_; + + $self->{sub_analysis} = $sub_analysis if $sub_analysis; + + return $self->{sub_analysis}; +} + +=head2 peptide_length + + Arg[1] : int $peptide_length - the length of the peptide (optional) + Description : Get/set the length of the peptide - required when you want to + serialize a matrix, as we need to know how many rows the matrix has + Returntype : int + Exceptions : none + Status : At Risk + +=cut + +sub peptide_length { + my ($self, $peptide_length) = @_; + $self->{peptide_length} = $peptide_length if defined $peptide_length; + return $self->{peptide_length}; +} + +=head2 translation_md5 + + Arg[1] : string $translation_md5 - the hex MD5 hash of the peptide sequence (optional) + Description : Get/set the MD5 hash of the peptide sequence + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub translation_md5 { + my ($self, $translation_md5) = @_; + $self->{translation_md5} = $translation_md5 if defined $translation_md5; + return $self->{translation_md5}; +} + +=head2 get_prediction + + Arg[1] : int $pos - the desired position + Arg[2] : string $aa - the mutant amino acid + Description : get the prediction and score for the given position and amino acid + Returntype : a list containing 2 values, the prediction and the score + Exceptions : throws if either the position or amino acid are invalid + Status : At Risk + +=cut + +sub get_prediction { + my ($self, $pos, $aa) = @_; + + # if we have it in our uncompressed hash then just return it + + if (defined $self->{preds}->{$pos}->{$aa}) { + return @{ $self->{preds}->{$pos}->{$aa} }; + } + + # otherwise we look in the serialized matrix string + + return $self->prediction_from_matrix($pos, $aa); +} + +=head2 add_prediction + + Arg[1] : int $pos - the peptide position + Arg[2] : string $aa - the mutant amino acid + Arg[3] : string $prediction - the prediction to store + Arg[4] : float $score - the score to store + Description : add a prediction to the matrix for the specified position and amino acid, + note that this just adds the prediction to a perl hash. If you want to + encode the matrix in the binary format you should call serialize on the + matrix object after you have added all the predictions. + Exceptions : none + Status : At Risk + +=cut + +sub add_prediction { + my ($self, $pos, $aa, $prediction, $score) = @_; + + $self->{preds}->{$pos}->{$aa} = [$prediction, $score]; +} + +=head2 serialize + + Arg[1] : int $peptide_length - the length of the associated peptide (optional) + Description : serialize the matrix into a compressed binary format suitable for + storage in a database, file etc. The same string can later be used + to create a new matrix object and the predictions can be retrieved + Exceptions : throws if the peptide length has not been specified + Status : At Risk + +=cut + +sub serialize { + my ($self, $peptide_length) = @_; + + $self->{peptide_length} = $peptide_length if defined $peptide_length; + + throw("peptide_length required to serialize predictions") + unless defined $self->{peptide_length}; + + # convert predictions to the binary format, and concatenate them all + # together in the correct order, inserting our dummy $NO_PREDICTION + # value to fill in any gaps + + if ($self->{preds}) { + + $self->{matrix_compressed} = 0; + + $self->{matrix} = $HEADER; + + for my $pos (1 .. $self->{peptide_length}) { + + for my $aa (@ALL_AAS) { + + my $short; + + if ($self->{preds}->{$pos}->{$aa}) { + my ($prediction, $score) = @{ $self->{preds}->{$pos}->{$aa} }; + + $short = $self->prediction_to_short($prediction, $score); + } + + $self->{matrix} .= defined $short ? $short : $NO_PREDICTION; + } + } + + # delete the hash copy, so things don't get out of sync + + $self->{preds} = undef; + } + else { + warning("There don't seem to be any predictions in the matrix to serialize!"); + } + + # and return the compressed string for storage + + return $self->compress_matrix; +} + +=head2 deserialize + + Arg [1] : coderef $coderef - an anonymous subroutine that will be called + as each prediction is decoded in order. The subroutine will be + called with 4 arguments: the peptide position, the amino acid, + the prediction and the score. This can be used, for example to + dump out the prediction matrix to a file. (optional) + Description : deserialize a binary formatted matrix into a perl hash reference + containing all the uncompressed predictions. This hash has an + entry for each position in the peptide, which is itself a hashref + with an entry for each possible alternate amino acid which is a + listref containing the prediction and score. For example, to retrieve + the prediction for a substitution of 'C' at position 23 from this + data structure, you could use code like: + + my $prediction_hash = $pfpm->deserialize; + my ($prediction, $score) = @{ $prediction_hash->{23}->{'C'} }; + + Note that if you don't explicitly deserialize a matrix this + class will keep it in the memory-efficient encoded format, and + you can access individual predictions with the get_prediction() + method. You should only use this method if you want to decode + all predictions (for example to perform some large-scale + analysis, or to reformat the predictions somehow) + + Returntype : hashref containing decoded predictions + Exceptions : throws if the binary matrix isn't in the expected format + Status : At Risk + +=cut + +sub deserialize { + my ($self, $coderef) = @_; + + if ($self->{matrix_compressed}) { + $self->expand_matrix; + } + + throw("Matrix looks corrupted") unless $self->header_ok; + + # we can work out the length of the peptide by counting the rows in the matrix + + my $length = ((length($self->{matrix}) - length($HEADER)) / $BYTES_PER_PREDICTION) / $NUM_AAS; + + for my $pos (1 .. $length) { + + for my $aa (@ALL_AAS) { + + # we call prediction_from_short directly to avoid doing all the + # checks performed in prediction_from_string + + my ($prediction, $score) = $self->prediction_from_short(substr($self->{matrix}, $self->compute_offset($pos, $aa), $BYTES_PER_PREDICTION)); + + $self->{preds}->{$pos}->{$aa} = [$prediction, $score]; + + if ($coderef) { + $coderef->($pos, $aa, $prediction, $score); + } + } + } + + return $self->{preds}; +} + +=head2 prediction_to_short + + Arg[1] : string $pred - one of the possible qualitative predictions of the tool + Arg[2] : float $prob - the prediction score (with 3 d.p.s of precision) + Description : converts a prediction and corresponding score into a 2-byte short value + Returntype : the packed short value + Exceptions : throws if the prediction argument is invalid + Status : At Risk + +=cut + +sub prediction_to_short { + my ($self, $pred, $prob) = @_; + + # we only store 3 d.p. so multiply by 1000 to turn our + # probability into a number between 0 and 1000. + # 2^10 == 1024 so we need 10 bits of our short to store + # this value + + my $val = $prob * 1000; + + # we store the prediction in the top $NUM_PRED_BITS bits + # so look up the numerical value for the prediction, + # shift this $NUM_PRED_BITS bits to the left and then set + # the appropriate bits of our short value + + $pred = lc($pred); + + my $pred_val = $PREDICTION_TO_VAL->{$self->{analysis}}->{$pred}; + + throw("No value defined for prediction: '$pred'?") + unless defined $pred_val; + + $val |= ($pred_val << (16 - $NUM_PRED_BITS)); + + printf("p2s: $pred ($prob) => 0x%04x\n", $val) if $DEBUG; + + $val = pack $PACK_FORMAT, $val; + + return $val; +} + +=head2 prediction_from_short + + Arg[1] : string $pred - the packed short value + Description : converts a 2-byte short value back into a prediction and a score + Exceptions : none + Returntype : a list containing 2 values, the prediction and the score + Status : At Risk + +=cut + +sub prediction_from_short { + my ($self, $val) = @_; + + # check it isn't our special null prediction + + if ($val eq $NO_PREDICTION) { + print "no pred\n" if $DEBUG; + return undef; + } + + # unpack the value as a short + + $val = unpack $PACK_FORMAT, $val; + + # shift the prediction bits down and look up the prediction string + + my $pred = $VAL_TO_PREDICTION->{$self->{analysis}}->{$val >> (16 - $NUM_PRED_BITS)}; + + # mask off the top 6 bits reserved for the prediction and convert back to a 3 d.p. float + + my $prob = ($val & (2**10 - 1)) / 1000; + + printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG; + + return ($pred, $prob); +} + +=head2 compress_matrix + + Description : compresses a prediction matrix with gzip + Returntype : the compressed matrix + Exceptions : throws if the matrix is an unexpected length, or if the compression fails + Status : At Risk + +=cut + +sub compress_matrix { + my ($self) = @_; + + my $matrix = $self->{matrix}; + + return undef unless $matrix; + + return $matrix if $self->{matrix_compressed}; + + # prepend a header, so we can tell if our matrix has been mangled, or + # is compressed etc. + + unless ($self->header_ok) { + $matrix = $HEADER.$matrix; + } + + throw("Prediction matrix is an unexpected length") + unless ( (length($matrix) - length($HEADER)) % $NUM_AAS) == 0; + + $self->{matrix} = Compress::Zlib::memGzip($matrix) or throw("Failed to gzip: $gzerrno"); + + $self->{matrix_compressed} = 1; + + return $self->{matrix}; +} + +=head2 header_ok + + Description : checks if the binary matrix has the expected header + Returntype : boolean + Exceptions : none + Status : At Risk + +=cut + +sub header_ok { + my ($self) = @_; + return undef unless ($self->{matrix} && !$self->{matrix_compressed}); + return substr($self->{matrix},0,length($HEADER)) eq $HEADER; +} + +=head2 expand_matrix + + Description : uncompresses a compressed prediction matrix + Returntype : the uncompressed binary matrix string + Exceptions : throws if the header is incorrect, or if the decompression fails + Status : At Risk + +=cut + +sub expand_matrix { + my ($self) = @_; + + return undef unless $self->{matrix}; + + return $self->{matrix} unless $self->{matrix_compressed}; + + $self->{matrix} = Compress::Zlib::memGunzip($self->{matrix}) + or throw("Failed to gunzip: $gzerrno"); + + $self->{matrix_compressed} = 0; + + throw("Malformed prediction matrix") unless $self->header_ok; + + return $self->{matrix}; +} + +=head2 compute_offset + + Arg[1] : int $pos - the desired position in the peptide + Arg[2] : string $aa - the desired mutant amino acid + Description : computes the correct offset into a prediction matrix for a given + peptide position and mutant amino acid + Returntype : the integer offset + Exceptions : none + Status : At Risk + +=cut + +sub compute_offset { + my ($self, $pos, $aa) = @_; + + my $offset = length($HEADER) + ( ( (($pos-1) * $NUM_AAS) + $AA_LOOKUP->{$aa} ) * $BYTES_PER_PREDICTION ); + + return $offset; +} + +=head2 prediction_from_matrix + + Arg[1] : int $pos - the desired position in the peptide + Arg[2] : string $aa - the desired mutant amino acid + Description : returns the prediction and score for the requested + position and mutant amino acid in the matrix + Returntype : a list containing 2 values, the prediction and the score + Exceptions : throws if either the position or amino acid are invalid, + or if the prediction matrix looks to be malformed + Status : At Risk + +=cut + +sub prediction_from_matrix { + my ($self, $pos, $aa) = @_; + + if ($self->{matrix_compressed}) { + # the matrix is still compressed so we try to uncompress it, + $self->expand_matrix; + } + + $aa = uc($aa) if defined $aa; + + throw("Invalid position: $pos") unless (defined $pos && $pos > 0); + + throw("Invalid amino acid: $aa") unless (defined $aa && defined $AA_LOOKUP->{$aa}); + + # compute our offset into the prediction matrix + + my $offset = $self->compute_offset($pos, $aa); + + print "offset: $offset\n" if $DEBUG; + + if ($offset + 1 > length($self->{matrix})) { + warning("Offset outside of prediction matrix for position $pos and amino acid $aa?"); + return undef; + } + + my $pred = substr($self->{matrix}, $offset, $BYTES_PER_PREDICTION); + + return $self->prediction_from_short($pred); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/ReadCoverage.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ReadCoverage.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,167 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::ReadCoverage +# +# Copyright (c) 2005 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::ReadCoverage - A coverage reagion for a read. + +=head1 SYNOPSIS + + # Read coverage feature representing a genomic region covered by 1 read + + $rc = Bio::EnsEMBL::Variation::ReadCoverage->new + (-start => 100, + -end => 200, + -slice => $slice, + -level => 1. + -sample => $individual); + + $rc = $rc->transform('supercontig'); + + print $rc->start(), "-", $rc->end(), "\n"; + + +=head1 DESCRIPTION + +This is a class representing the read coverage information +from the ensembl-variation database. A ReadCoverage behaves as any other Ensembl feature. + +See B. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::ReadCoverage; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +our @ISA = ('Bio::EnsEMBL::Feature'); + + +=head2 new + + Arg [-ADAPTOR] : + see superclass constructor + + Arg [-START] : + see superclass constructor + Arg [-END] : + see superclass constructor + Arg [-SLICE] : + see superclass constructor + + Arg [-LEVEL] : + int - the number of times the region represented by start and end has been seen + + Arg [-SAMPLE] : + Bio::EnsEMBL::Variation::Individual - the individual + in which the allele was recorded + + Example : + $rc = Bio::EnsEMBL::Variation::ReadCoverage->new + (-start => 100, + -end => 100, + -slice => $slice, + -level => 1, + -sample => $individual); + + Description: Constructor. Instantiates a new ReadCoverage object. + Returntype : Bio::EnsEMBL::Variation::ReadCoverage + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + my ($level, $individual) = + rearrange([qw(LEVEL SAMPLE)], @_); + + $self->{'level'} = $level; + $self->{'sample'} = $individual; + + return $self; +} + + +=head2 level + + Arg[1] : int $newval (optional) + The new value to set the level attribute to + Example : $depth = $obj->level(); + Description : Getter/Setter for the level attribute. The level is + the number of times this feature has been seen in the genome + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub level{ + my $self = shift; + return $self->{'level'} = shift if (@_); + return $self->{'level'}; +} + + +=head2 sample + + Arg [1] : Bio::EnsEMBL::Variation::Individual $newval (optional) + The new value to set the sample attribute to + Example : $individual = $rc->sample(); + Description: Getter/Setter for the individual attribute + Returntype : Bio::EnsEMBL::Variation::Individual + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub sample{ + my $self = shift; + + if(@_) { + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::Individual')) { + throw('Bio::EnsEMBL::Variation::Individual argument expected.'); + } + $self->{'sample'} = shift; + } + + return $self->{'sample'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/ReadCoverageCollection.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/ReadCoverageCollection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,338 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::ReadCoverage +# +# Copyright (c) 2008 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::ReadCoverageCollection - A collection of coverage reagion for a read. + +=head1 SYNOPSIS + + # Read coverage collection feature representing a genomic region covered by a read + + $rcc = Bio::EnsEMBL::Variation::ReadCoverageCollection->new + (-start => 100, + -end => 200, + -strand => 1, + -slice => $slice, + -window_start => 1, + -window_end => 1000, + -window_size => 50, + -read_coverage_avg => 100, + -read_coverage_min => 0, + -read_coverage_max => 200, + -sample => $individual); + + + print $rcc->start(), "-", $rcc->end(), "\n"; + + +=head1 DESCRIPTION + +This is a class representing the read coverage collection information +from the ensembl-variation database. A ReadCoverageCollection behaves as any other Ensembl feature collection. +Object for storing read_coverage scores. The scores are averaged (also the minimum and maximum scores) over different window sizes to speed up drawing over large regions. The scores are packed as integer and stored in a string +See B. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::ReadCoverageCollection; + +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +our @ISA = ('Bio::EnsEMBL::Feature'); + + +=head2 new + + Arg [-ADAPTOR] : + see superclass constructor + Arg [-START] : + start relative to the slice,see superclass constructor + Arg [-END] : + end relative to the slice,see superclass constructor + Arg [-STRAND] : + 1 or -1, same as strand for the slice,see superclass constructor + Arg [-SLICE] : + see superclass constructor + Arg [-SEQ_REGION_START] : + start relative to chromosome,see superclass constructor + Arg [-SEQ_REGION_END] : + end relative to chromosome,see superclass constructor + Arg [-SEQ_REGION_STRAND] : + always 1, see superclass constructor + int Arg [-SCORE_MIN] : + average read_coverage for a window + Arg [-SCORE_AVG] : + minimum read_coverage for a window + Arg [-SCORE_MAX] : + maximum read_coverage for a window + Arg [-SAMPLE_ID] : + int - the individual in which the read_covarage is recorded + + Example : + $rc = Bio::EnsEMBL::Variation::ReadCoverage->new + (-start => 100, + -end => 100, + -slice => $slice, + -window_size => 50, + -sample_id => 1); + + Description: Constructor. Instantiates a new ReadCoverageCollection object. + Returntype : Bio::EnsEMBL::Variation::ReadCoverageCollection + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + my ($window_size,$window_start,$window_end,$sample_id,$read_coverage_avg,$read_coverage_min,$read_coverage_max,$y_axis_min,$y_axis_max) = + rearrange([qw(WINDOW_SIZE WINDOW_START WINDOW_END SAMPLE_ID READ_COVERAGE_AVG READ_COVERAGE_MIN READ_COVERAGE_MAX Y_AXIS_MIN Y_AXIS_MAX)], @_); + + $self->{'window_size'} = $window_size; + $self->{'window_start'} = $window_start; + $self->{'window_end'} = $window_end; + $self->{'sample_id'} = $sample_id; + $self->{'read_coverage_avg'} = $read_coverage_avg; + $self->{'read_coverage_min'} = $read_coverage_min; + $self->{'read_coverage_max'} = $read_coverage_max; + $self->{'y_axis_min'} = $y_axis_min; + $self->{'y_axis_max'} = $y_axis_max; + return $self; +} + +=head new_fast + + Arg [1] : hash reference $hashref + Example : none + Description: This is an ultra fast constructor which requires knowledge of + the objects internals to be used. + Returntype : + Exceptions : none + Caller : + +sub new_fast { + + my ($class, $hashref) = @_; + + return bless $hashref, $class; + +} + +=head2 window_size + + Arg[1] : int $newval (optional) + The new value to set the window_size attribute to + Example : $window_size = $obj->window_size(); + Description : Getter/Setter for the window_size attribute. + the window size this feature has been seen in the genome + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub window_size{ + my $self = shift; + return $self->{'window_size'} = shift if (@_); + return $self->{'window_size'}; +} + +=head2 window_start + + Arg[1] : int $newval (optional) + The new value to set the window_start attribute to + Example : $window_start = $obj->window_start(); + Description : Getter/Setter for the window_start attribute. + the window start this feature has been seen in the genome + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub window_start{ + my $self = shift; + return $self->{'window_start'} = shift if (@_); + return $self->{'window_start'}; +} + +=head2 window_end + + Arg[1] : int $newval (optional) + The new value to set the window_end attribute to + Example : $depth = $obj->window_end(); + Description : Getter/Setter for the window_end attribute. + the window end this feature has been seen in the genome + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub window_end{ + my $self = shift; + return $self->{'window_end'} = shift if (@_); + return $self->{'window_end'}; +} + +=head2 read_coverage_avg + + Arg[1] : int $newval (optional) + The new value to set the read_coverage_avg attribute to + Example : $avg = $obj->read_coverage_avg(); + Description : Getter/Setter for the score_avg attribute. + the average read_coverage this feature has + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub read_coverage_avg{ + my $self = shift; + return $self->{'read_coverage_avg'} = shift if (@_); + return $self->{'read_coverage_avg'}; +} + +=head2 read_coverage_min + + Arg[1] : int $newval (optional) + The new value to set the read_coverage_min attribute to + Example : $min = $obj->read_coverage_min(); + Description : Getter/Setter for the score_min attribute. + the minimum read_coverage this feature has + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub read_coverage_min{ + my $self = shift; + return $self->{'read_coverage_min'} = shift if (@_); + return $self->{'read_coverage_min'}; +} + +=head2 read_coverage_max + + Arg[1] : int $newval (optional) + The new value to set the read_coverage_max attribute to + Example : $max = $obj->read_coverage_max(); + Description : Getter/Setter for the read_coverage_max attribute. + the maximum read_coverage this feature has + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub read_coverage_max{ + my $self = shift; + return $self->{'read_coverage_max'} = shift if (@_); + return $self->{'read_coverage_max'}; +} + + +=head2 sample_id + + Arg[1] : int $newval (optional) + The new value to set the sample_id attribute to + Example : $sample_id = $obj->sample_id(); + Description : Getter/Setter for the individual dbId attribute. + the individual dbId this feature has been seen in the genome + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub sample_id{ + my $self = shift; + return $self->{'sample_id'} = shift if (@_); + return $self->{'sample_id'}; +} + +=head2 y_axis_min + + Arg[1] : int $newval (optional) + The new value to set the y_axiss_min attribute to + Example : $y_axis_min = $obj->y_axis_min(); + Description : Getter/Setter for the minimum of read_coverage for the collection of feature. + + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub y_axis_min{ + my $self = shift; + return $self->{'y_axis_min'} = shift if (@_); + return $self->{'y_axis_min'}; +} + +=head2 y_axis_max + + Arg[1] : int $newval (optional) + The new value to set the y_axiss_max attribute to + Example : $y_axis_max = $obj->y_axis_max(); + Description : Getter/Setter for the maximum of read_coverage for the collection of feature. + + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub y_axis_max{ + my $self = shift; + return $self->{'y_axis_max'} = shift if (@_); + return $self->{'y_axis_max'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/RegulationVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/RegulationVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,44 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::RegulationVariation; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlap); + +sub feature_label { + my ($self, $feature_label) = @_; + $self->{feature_label} = $feature_label if $feature_label; + return $self->{feature_label}; +} + +sub target_feature { + # XXX: fetch the target feature +} + +sub target_feature_stable_id { + my ($self, $target_feature_stable_id) = @_; + $self->{target_feature_stable_id} = $target_feature_stable_id if $target_feature_stable_id; + return $self->{target_feature_stable_id}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/RegulatoryFeatureVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/RegulatoryFeatureVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,84 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::RegulatoryFeatureVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::RegulatoryFeatureVariationAllele; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); + +use base qw(Bio::EnsEMBL::Variation::RegulationVariation); + +sub new { + my $class = shift; + + my %args = @_; + + # swap a '-regulatory_feature' argument for a '-feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-regulatory_feature') { + $args{'-feature'} = delete $args{$arg}; + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to rfvas + map { bless $_, 'Bio::EnsEMBL::Variation::RegulatoryFeatureVariationAllele' } + @{ $self->get_all_RegulatoryFeatureVariationAlleles }; + + return $self; +} + +sub regulatory_feature_stable_id { + my $self = shift; + return $self->SUPER::feature_stable_id(@_); +} + +sub regulatory_feature { + my ($self, $rf) = @_; + return $self->SUPER::feature($rf, 'RegulatoryFeature'); +} + +sub add_RegulatoryFeatureVariationAllele { + my $self = shift; + return $self->SUPER::add_VariationFeatureOverlapAllele(@_); +} + +sub get_reference_RegulatoryFeatureVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_VariationFeatureOverlapAllele(@_); +} + +sub get_all_alternate_RegulatoryFeatureVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_VariationFeatureOverlapAlleles(@_); +} + +sub get_all_RegulatoryFeatureVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_VariationFeatureOverlapAlleles(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/RegulatoryFeatureVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/RegulatoryFeatureVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,52 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +package Bio::EnsEMBL::Variation::RegulatoryFeatureVariationAllele; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele); + +sub new_fast { + my ($self, $hashref) = @_; + + # swap a regulatory_variation argument for a variation_feature_overlap one + + if ($hashref->{regulatory_feature_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{regulatory_feature_variation}; + } + + # and call the superclass + + return $self->SUPER::new_fast($hashref); +} + +sub regulatory_feature_variation { + my $self = shift; + return $self->variation_feature_overlap(@_); +} + +sub regulatory_feature { + my $self = shift; + return $self->regulatory_feature_variation->regulatory_feature; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Sample.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Sample.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,122 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Sample +# +# Copyright (c) 2005 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Sample - An abstract base class to represent +Population, Individual or Strain + + +=head1 SYNOPSIS + +Abstract class - should not be instantiated. Implementation of +abstract methods must be performed by subclasses. + +=head1 DESCRIPTION + +This is a base class representing population, individual and strain. This base +class is simply a way of merging similar concepts that should have the same ID + + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Sample; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw); + +our @ISA = ('Bio::EnsEMBL::Storable'); + + +=head2 name + + Arg [1] : string $newval (optional) + The new value to set the name attribute to + Example : $name = $obj->name() + Description: Getter/Setter for the name attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub name{ + my $self = shift; + return $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + + +=head2 description + + Arg [1] : string $newval (optional) + The new value to set the description attribute to + Example : $description = $obj->description() + Description: Getter/Setter for the description attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub description{ + my $self = shift; + return $self->{'description'} = shift if(@_); + return $self->{'description'}; +} + + + +=head2 size + + Arg [1] : int $newval (optional) + The new value to set the size attribute to + Example : $size = $obj->size() + Description: Getter/Setter for the size attribute + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub size{ + my $self = shift; + return $self->{'size'} = shift if(@_); + return $self->{'size'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,122 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::StructuralVariation +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::StructuralVariation - Ensembl representation of a structural variation. + +=head1 SYNOPSIS + + # Structural variation representing a CNV + $sv = Bio::EnsEMBL::Variation::StructuralVariation->new + (-variation_name => 'esv25480', + -class_so_term => 'structural_variant', + -source => 'DGVa', + -source_description => 'Database of Genomic Variants Archive', + -study_name => 'estd20', + -study_description => 'Conrad 2009 "Origins and functional impact of copy number variation in the human genome." PMID:19812545 [remapped from build NCBI36]', + -study_url => 'ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd20_Conrad_et_al_2009', + -external_reference => 'pubmed/19812545'); + + ... + + print $sv->name(), ":", $sv->var_class(); + +=head1 DESCRIPTION + +This is a class representing a structural variation from the +ensembl-variation database. A structural variant may have a copy number variation, a tandem duplication, +an inversion of the sequence or others structural variations. + +The position of a StructuralVariation object on the Genome is represented +by the class. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::StructuralVariation; + +use Bio::EnsEMBL::Variation::BaseStructuralVariation; + +our @ISA = ('Bio::EnsEMBL::Variation::BaseStructuralVariation'); + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = Bio::EnsEMBL::Variation::BaseStructuralVariation->new(@_); + return(bless($self, $class)); +} + +=head2 get_all_SupportingStructuralVariants + + Example : $sv->get_all_SupportingStructuralVariants(); + Description : Retrieves all SupportingStructuralVariation associated with this structural variation. + Return empty list if there are none. + Returntype : reference to list of Bio::EnsEMBL::Variation::SupportingStructuralVariation objects + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_all_SupportingStructuralVariants { + my $self = shift; + + if (defined ($self->{'adaptor'})){ + my $ssv_adaptor = $self->{'adaptor'}->db()->get_SupportingStructuralVariationAdaptor(); + return $ssv_adaptor->fetch_all_by_StructuralVariation($self); + } + warn("No variation database attached"); + return []; +} + + +=head2 summary_as_hash + + Example : $sv_summary = $sv->summary_as_hash(); + Description : Retrieves a textual summary of this StructuralVariation object. + Returns : hashref of descriptive strings + +=cut + +sub summary_as_hash { + my $self = shift; + my %summary; + $summary{'display_id'} = $self->display_id; + $summary{'study_name'} = $self->study_name; + $summary{'study_description'} = $self->study_description; + $summary{'class'} = $self->var_class; + return \%summary; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationAnnotation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationAnnotation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,389 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::StructuralVariationAnnotation +# +# Copyright (c) 2011 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::StructuralVariationAnnotation - Annotations for a structural variant (sample and phenotype annotations). + +=head1 SYNOPSIS + + $study = $study_adaptor->fetch_by_name('nstd37'); + + $sva = Bio::EnsEMBL::Variation::StructuralVariationAnnotation->new + (-sample_name => 'ISCA_ID_5554', + -clinical_significance => 'Not tested', + -study => $study); + ... + + $sva->structural_variation->variation_name(),":", $sva->sample_name(); + +=head1 DESCRIPTION + +This is a class representing the annotation of a structural variant +from the ensembl-variation database. The actual structural variant information is +represented by an associated Bio::EnsEMBL::Variation::StructuralVariation object. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::StructuralVariationAnnotation; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Variation::BaseStructuralVariation; +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(deprecate); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + +=head2 new + + Arg [-dbID] : + int - unique internal identifier for variation_annotation + + Arg [-ADAPTOR] : + Bio::EnsEMBL::Variation::DBSQL::StructuralVariationAnnotationAdaptor + Adaptor which provides database connectivity for this StructuralVariationAnnotation object + + Arg [-_PHENOTYPE_ID] : + int _ the internal id of the phenotype + + Arg [-PHENOTYPE_DESCRIPTION] : + string - description of the phenotype + + Arg [-SAMPLE_NAME] : + string - name of the associated sample + + Arg [-STRAIN_NAME] : + string - name of the associated strain + + Arg [-CLINICAL_SIGNIFICANCE] : + string - clinical annotation for this structural variant. + + Arg [-STUDY] : + object ref - the study object describing where the annotated variation comes from. + + Arg [_STRUCTURAL_VARIATION_ID] : + int _ the internal id of the structural variant object associated with this + identifier. TUsing this identifier the structural variant may be lazy-loaded from + the database on demand. + + Example : + $study = $study_adaptor->fetch_by_name('nstd37'); + + $sva = Bio::EnsEMBL::Variation::StructuralVariationAnnotation->new + (-sample_name => 'ISCA_ID_5554', + -strain_name => 'ISCA', + -clinical_significance => 'Not tested', + -study => $study); + + Description: Constructor. Instantiates a new StructuralVariationAnnotation object. + Returntype : Bio::EnsEMBL::Variation::StructuralVariationAnnotation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($dbID,$adaptor,$phenotype_id,$phenotype_description,$structural_variation_id,$sample_name, + $strain_name,$clinical_significance,$study) = + rearrange([qw(dbID ADAPTOR _PHENOTYPE_ID PHENOTYPE_DESCRIPTION _STRUCTURAL_VARIATION_ID + SAMPLE_NAME STRAIN_NAME CLINICAL_SIGNIFICANCE STUDY)],@_); + + $self->{'dbID'} = $dbID; + $self->{'adaptor'} = $adaptor; + $self->{'_phenotype_id'} = $phenotype_id; + $self->{'phenotype_description'} = $phenotype_description; + $self->{'_structural_variation_id'} = $structural_variation_id; + $self->{'sample_name'} = $sample_name; + $self->{'strain_name'} = $strain_name; + $self->{'clinical_significance'} = $clinical_significance; + $self->{'study'} = $study; + return $self; +} + + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +=head2 structural_variation + + Arg [1] : (optional) Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation $structural_variation + Example : $sv = $svf->structural_variation(); + Description: Getter/Setter for the structural variant associated with this feature. + If not set, and this StructuralVariationFeature has an associated adaptor + an attempt will be made to lazy-load the structural variation from the + database. + Returntype : Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation + Exceptions : throw on incorrect argument + Caller : general + Status : Stable + +=cut + +sub structural_variation { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || (!$_[0]->isa('Bio::EnsEMBL::Variation::StructuralVariation') && + !$_[0]->isa('Bio::EnsEMBL::Variation::SupportingStructuralVariation') + )) { + throw("Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation argument expected"); + } + $self->{'_structural_variation_id'} = shift; + } + elsif(!defined($self->{'structural_variation'}) && $self->{'adaptor'} && + defined($self->{'_structural_variation_id'})) { + # lazy-load from database on demand + my $sva = $self->{'adaptor'}->db()->get_StructuralVariationAdaptor(); + $self->{'structural_variation'} = $sva->fetch_by_dbID($self->{'_structural_variation_id'}); + if (!defined($self->{'structural_variation'})) { + $sva = $self->{'adaptor'}->db()->get_SupportingStructuralVariationAdaptor(); + $self->{'structural_variation'} = $sva->fetch_by_dbID($self->{'_structural_variation_id'}); + } + } + + return $self->{'structural_variation'}; +} + + +=head2 study + + Arg [1] : Bio::EnsEMBL::Variation::Study (optional) + Example : $study = $sv->study() + Description: Getter/Setter for the study object + Returntype : Bio::EnsEMBL::Variation::Study + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study { + my $self = shift; + return $self->{'study'} = shift if(@_); + return $self->{'study'}; +} + + +=head2 study_type + + Arg [1] : string study_type (optional) + The new value to set the study_type attribute to + Example : $study_type = $obj->study_type() + Description: Getter/Setter for the study_type attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_type{ + my $self = shift; + return $self->{'study'}->type = shift if(@_); + return $self->{'study'}->type; +} + + +=head2 study_name + + Arg [1] : string $study_name (optional) + The new value to set the study_name attribute to + Example : $study = $sva->study_name() + Description: Getter/Setter for the study_name attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_name{ + my $self = shift; + return $self->{'study'}->name = shift if(@_); + return $self->{'study'}->name; +} + + +=head2 study_description + + Arg [1] : string $study_description (optional) + The new value to set the study_description attribute to + Example : $study_description = $obj->study_description() + Description: Getter/Setter for the study_description attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_description{ + my $self = shift; + return $self->{'study'}->description = shift if(@_); + return $self->{'study'}->description; +} + + +=head2 external_reference + + Arg [1] : string $newval (optional) + The new value to set the external reference attribute to + Example : $external_reference = $obj->external_reference() + Description: Getter/Setter for the external reference attribute. This is the + pubmed/id or project name associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub external_reference{ + my $self = shift; + return $self->{'study'}->external_reference = shift if(@_); + return $self->{'study'}->external_reference; +} + + +=head2 study_url + + Arg [1] : string $newval (optional) + The new value to set the study_url attribute to + Example : $url = $obj->study_url() + Description: Getter/Setter for the study_url attribute. This is the link to the website where the data are stored. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_url{ + my $self = shift; + return $self->{'study'}->url = shift if(@_); + return $self->{'study'}->url; +} + + +=head2 sample_name + + Arg [1] : string sample_name (optional) + The new value to set the sample attribute to + Example : $sample_name = $obj->sample_name() + Description: Getter/Setter for the sample attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub sample_name { + my $self = shift; + return $self->{'sample_name'} = shift if(@_); + return $self->{'sample_name'}; +} + + +=head2 strain_name + + Arg [1] : string strain_name (optional) + The new value to set the strain attribute to + Example : $strain_name = $obj->strain_name() + Description: Getter/Setter for the strain attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub strain_name { + my $self = shift; + return $self->{'strain_name'} = shift if(@_); + return $self->{'strain_name'}; +} + + +=head2 phenotype_description + + Arg [1] : string phenotype_description (optional) + The new value to set the phenotype_description attribute to + Example : $phenotype_description = $obj->phenotype_description() + Description: Getter/Setter for the phenotype_description attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub phenotype_description{ + my $self = shift; + return $self->{'phenotype_description'} = shift if(@_); + return $self->{'phenotype_description'}; +} + + +=head2 clinical_significance + + Arg [1] : string clinical_significance (optional) + The new value to set the clinical significance attribute to + Example : $clinical_significance = $obj->clinical_significance() + Description: Getter/Setter for the clinical significance attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub clinical_significance { + my $self = shift; + return $self->{'clinical_significance'} = shift if(@_); + return $self->{'clinical_significance'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,940 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::StructuralVariationFeature +# +# Copyright (c) 2011 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::StructuralVariationFeature - A genomic position for a structural variation. + +=head1 SYNOPSIS + + # Structural variation feature representing a CNV + $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new + (-start => 100, + -end => 200, + -strand => 1, + -slice => $slice, + -variation_name => 'esv1001', + -class_so_term => 'copy_number_variation', + -source => 'DGVa', + -source_description => 'Database of Genomic Variants Archive', + ); + + ... + + print $svf->start(), "-", $svf->end(), '(', $svf->strand(), ')', "\n"; + + print $svf->variation_name(), ":", $svf->var_class(); + +=head1 DESCRIPTION + +This is a class representing the genomic position of a structural variant +from the ensembl-variation database. A StructuralVariationFeature behaves as any other +Ensembl feature. See B and +B. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::StructuralVariationFeature; + +use Scalar::Util qw(weaken isweak); + +use Bio::EnsEMBL::Variation::BaseVariationFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Variation::Utils::Constants qw(%VARIATION_CLASSES); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT); +use Bio::EnsEMBL::Variation::StructuralVariationOverlap; +use Bio::EnsEMBL::Variation::TranscriptStructuralVariation; +use Bio::EnsEMBL::Variation::IntergenicStructuralVariation; + +our @ISA = ('Bio::EnsEMBL::Variation::BaseVariationFeature'); + +=head2 new + + Arg [-dbID] : + see superclass constructor + + Arg [-ADAPTOR] : + see superclass constructor + + Arg [-START] : + see superclass constructor + Arg [-END] : + see superclass constructor + + Arg [-STRAND] : + see superclass constructor + + Arg [-SLICE] : + see superclass constructor + + Arg [-INNER_START] : + int - the 5'-greater coordinate of the underlying structural variation + + Arg [-INNER_END] : + int - the 3'-less coordinate of the underlying structural variation + + Arg [-OUTER_START] : + int - the 5'-less coordinate of the underlying structural variation + + Arg [-OUTER_END] : + int - the 3'-greater coordinate of the underlying structural variation + + Arg [-VARIATION_NAME] : + string - the name of the variation this feature is for (denormalisation + from Variation object). + + Arg [-CLASS_SO_TERM] : + string - the sequence ontology term defining the class of the structural variation. + + Arg [-ALLELE_STRING] : + string - allele sequence of the structural variation. + + Arg [-SOURCE] : + string - the name of the source where the variation comes from + + Arg [-SOURCE_VERSION]: + string - version number of the source + + Arg [-IS_SOMATIC] : + int - flag to inform whether the structural variant is a somatic (1) or germline (0). + + Arg [-BREAKPOINT_ORDER] : + int - For a structural variant with multiple breakpoints, this gives the predicted order of the breakpoint event. + + Example : + $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new + (-start => 100, + -end => 200, + -strand => 1, + -slice => $slice, + -variation_name => 'esv25480', + -class_so_term => 'structural_variant', + -source => 'DGVa'); + + Description: Constructor. Instantiates a new StructuralVariationFeature object. + Returntype : Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ( + $var_name, + $source, + $source_version, + $class_so_term, + $inner_start, + $inner_end, + $outer_start, + $outer_end, + $allele_string, + $is_somatic, + $breakpoint_order + ) = rearrange([qw( + VARIATION_NAME + SOURCE + SOURCE_VERSION + CLASS_SO_TERM + INNER_START + INNER_END + OUTER_START + INNER_START + ALLELE_STRING + IS_SOMATIC + BREAKPOINT_ORDER + )], @_); + + + $self->{'variation_name'} = $var_name; + $self->{'source'} = $source; + $self->{'source_version'} = $source_version; + $self->{'class_SO_term'} = $class_so_term; + $self->{'inner_start'} = $inner_start; + $self->{'inner_end'} = $inner_end; + $self->{'outer_start'} = $outer_start; + $self->{'outer_end'} = $outer_end; + $self->{'allele_string'} = $allele_string; + $self->{'is_somatic'} = $is_somatic || 0; + $self->{'breakpoint_order'} = $breakpoint_order; + + return $self; +} + + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +=head2 display_id + + Arg [1] : none + Example : print $svf->display_id(), "\n"; + Description: Returns the 'display' identifier for this feature. For + StructuralVariationFeatures this is simply the name of the structural variation + it is associated with. + Returntype : string + Exceptions : none + Caller : webcode + Status : At Risk + +=cut + +sub display_id { + my $self = shift; + return $self->{'variation_name'} || ''; +} + + + +=head2 variation_name + + Arg [1] : string $newval (optional) + The new value to set the variation_name attribute to + Example : $variation_name = $obj->variation_name() + Description: Getter/Setter for the variation_name attribute. This is the + name of the structural variant associated with this feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub variation_name{ + my $self = shift; + return $self->{'variation_name'} = shift if(@_); + return $self->{'variation_name'}; +} + +=head2 allele_string + + Arg [1] : string $newval (optional) + The new value to set the allele_string attribute to + Example : $allele_string = $obj->allele_string() + Description: Getter/Setter for the allele_string attribute. This is the + genomic sequence represented by this feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub allele_string{ + my $self = shift; + return $self->{'allele_string'} = shift if(@_); + return $self->{'allele_string'}; +} + + + +=head2 structural_variation + + Arg [1] : (optional) Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation $structural_variation + Example : $sv = $svf->structural_variation(); + Description: Getter/Setter for the structural variant associated with this feature. + If not set, and this StructuralVariationFeature has an associated adaptor + an attempt will be made to lazy-load the structural variation from the + database. + Returntype : Bio::EnsEMBL::Variation::StructuralVariation or + Bio::EnsEMBL::Variation::SupportingStructuralVariation + Exceptions : throw on incorrect argument + Caller : general + Status : Stable + +=cut + +sub structural_variation { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || (!$_[0]->isa('Bio::EnsEMBL::Variation::StructuralVariation') && + !$_[0]->isa('Bio::EnsEMBL::Variation::SupportingStructuralVariation') + )) { + throw("Bio::EnsEMBL::Variation::StructuralVariation or Bio::EnsEMBL::Variation::SupportingStructuralVariation argument expected"); + } + $self->{'structural_variation'} = shift; + } + elsif(!defined($self->{'structural_variation'}) && $self->{'adaptor'} && + defined($self->{'structural_variation_id'})) { + # lazy-load from database on demand + my $sva = $self->{'adaptor'}->db()->get_StructuralVariationAdaptor(); + $self->{'structural_variation'} = $sva->fetch_by_dbID($self->{'structural_variation_id'}); + if (!defined($self->{'structural_variation'})) { + $sva = $self->{'adaptor'}->db()->get_SupportingStructuralVariationAdaptor(); + $self->{'structural_variation'} = $sva->fetch_by_dbID($self->{'structural_variation_id'}); + } + } + + return $self->{'structural_variation'}; +} + + + + + +=head2 get_all_VariationSets + + Args : none + Example : my @vs = @{$svf->get_all_VariationSets()}; + Description : returns a reference to a list of all the VariationSets this + StructuralVariationFeature is a member of + ReturnType : reference to list of Bio::EnsEMBL::Variation::VariationSets + Exceptions : if no adaptor is attached to this object + Caller : general + Status : At Risk +=cut + +sub get_all_VariationSets { + my $self = shift; + + if (!$self->adaptor()) { + throw('An adaptor must be attached in order to get all variation sets'); + } + my $vs_adaptor = $self->adaptor()->db()->get_VariationSetAdaptor(); + my $variation_sets = $vs_adaptor->fetch_all_by_StructuralVariation($self->structural_variation()); + + return $variation_sets; +} + + +=head2 get_nearest_Gene + + Example : $svf->get_nearest_Gene($flanking_size); + Description : Getter a Gene which is associated to or nearest to the StructuralVariationFeature + Returntype : Listref of objects of Bio::EnsEMBL::Gene + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_nearest_Gene{ + + my $self = shift; + my $flanking_size = shift; #flanking size is optional + $flanking_size ||= 0; + my $sa = $self->{'adaptor'}->db()->dnadb->get_SliceAdaptor(); + my $slice = $sa->fetch_by_Feature($self,$flanking_size); + my @genes = @{$slice->get_all_Genes}; + return \@genes if @genes; #$svf is on the gene + + if (! @genes) { #if $svf is not on the gene, increase flanking size + warning("flanking_size $flanking_size is not big enough to overlap a gene, increase it by 1,000,000"); + $flanking_size += 1000000; + $slice = $sa->fetch_by_Feature($self,$flanking_size); + @genes = @{$slice->get_all_Genes}; + } + if (@genes) { + my %distances = (); + foreach my $g (@genes) { + if ($g->seq_region_start > $self->start) { + $distances{$g->seq_region_start-$self->start}=$g; + } + else { + $distances{$self->start-$g->seq_region_end}=$g; + } + } + my @distances = sort {$a<=>$b} keys %distances; + my $shortest_distance = $distances[0]; + if ($shortest_distance) { + my $nearest_gene = $distances{$shortest_distance}; + return [$nearest_gene]; + } + } + else { + throw("variation_feature with flanking_size $flanking_size is not overlap with a gene, try a bigger flanking_size"); + } +} + + +=head2 is_somatic + + Arg [1] : boolean $is_somatic (optional) + The new value to set the is_somatic flag to + Example : $is_somatic = $svf->is_somatic + Description: Getter/Setter for the is_somatic flag, which identifies this structural variation feature as either somatic or germline + Returntype : boolean + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_somatic { + my ($self, $is_somatic) = @_; + $self->{'is_somatic'} = $is_somatic if defined $is_somatic; + return $self->{'is_somatic'}; +} + + +=head2 breakpoint_order + + Arg [1] : string $bp_order (optional) + The new value to set the breakpoint order to + Example : $bp_order = $svf->breakpoint_order() + Description: Getter/Setter for the breakpoint_order attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub breakpoint_order { + my $self = shift; + return $self->{'breakpoint_order'} = shift if(@_); + return $self->{'breakpoint_order'}; +} + +=head2 get_all_StructuralVariationOverlaps + + Description : Get all the StructuralVariationOverlaps associated with this StructuralVariation, this + includes TranscriptStructuralVariations and regulatory feature overlap object. + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariationOverlap objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_StructuralVariationOverlaps { + my $self = shift; + + my $vfos = [ + @{ $self->get_all_TranscriptStructuralVariations }, + @{ $self->get_all_RegulatoryFeatureStructuralVariations }, + @{ $self->get_all_MotifFeatureStructuralVariations }, + ]; + + if (my $iv = $self->get_IntergenicStructuralVariation) { + push @$vfos, $iv; + } + + return $vfos; +} + +=head2 get_all_TranscriptStructuralVariations + + Arg [1] : (optional) listref of Bio::EnsEMBL::Transcript objects + Example : $svf->get_all_TranscriptStructuralVariations; + Description : Get all the TranscriptStructuralVariations associated with this + StructuralVariationFeature. If the optional list of Transcripts + is supplied, get only TranscriptStructuralVariations + associated with those Transcripts. + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation objects + Exceptions : Thrown on wrong argument type + Caller : general + Status : At Risk + +=cut + +sub get_all_TranscriptStructuralVariations { + my ($self, $transcripts) = @_; + + if ($transcripts) { + assert_ref($transcripts, 'ARRAY'); + map { assert_ref($_, 'Bio::EnsEMBL::Transcript') } @$transcripts; + } + + elsif (not defined $self->{transcript_structural_variations}) { + # this VariationFeature is not in the database so we have to build the + # TranscriptVariations ourselves + + unless ($transcripts) { + # if the caller didn't supply some transcripts fetch those around this VariationFeature + # get a slice around this transcript including the maximum distance up and down-stream + # that we still call consequences for + my $slice = $self->feature_Slice->expand( + MAX_DISTANCE_FROM_TRANSCRIPT, + MAX_DISTANCE_FROM_TRANSCRIPT + ); + + # fetch all transcripts on this slice + $transcripts = $slice->get_all_Transcripts(1); + } + + my @unfetched_transcripts = grep { + not exists $self->{transcript_structural_variations}->{$_->stable_id} + } @$transcripts; + + for my $transcript (@unfetched_transcripts) { + $self->add_TranscriptStructuralVariation( + Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new( + -structural_variation_feature => $self, + -transcript => $transcript, + -adaptor => undef, + ) + ); + } + } + + if ($transcripts) { + # just return TranscriptVariations for the requested Transcripts + return [ map { $self->{transcript_structural_variations}->{$_->stable_id} } @$transcripts ]; + } + else { + # return all TranscriptVariations + return [ values %{ $self->{transcript_structural_variations} } ]; + } +} + +=head2 get_all_RegulatoryFeatureStructuralVariations + + Description : Get all the RegulatoryFeatureStructuralVariations associated with this VariationFeature. + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariationOverlap objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_RegulatoryFeatureStructuralVariations { + my $self = shift; + return $self->_get_all_RegulationStructuralVariations('RegulatoryFeature', @_); +} + +=head2 get_all_MotifFeatureStructuralVariations + + Description : Get all the MotifFeatureStructuralVariations associated with this VariationFeature. + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariationOverlap objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_MotifFeatureStructuralVariations { + my $self = shift; + return $self->_get_all_RegulationStructuralVariations('MotifFeature', @_); +} + +=head2 get_all_ExternalFeatureStructuralVariations + + Description : Get all the ExternalFeatureStructuralVariations associated with this VariationFeature. + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariationOverlap objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_ExternalFeatureStructuralVariations { + my $self = shift; + return $self->_get_all_RegulationStructuralVariations('ExternalFeature', @_); +} + +sub _get_all_RegulationStructuralVariations { + my ($self, $type) = @_; + + unless ($type && ($type eq 'RegulatoryFeature' || $type eq 'MotifFeature' || $type eq 'ExternalFeature')) { + throw("Invalid Ensembl Regulation type '$type'"); + } + + unless ($self->{regulation_structural_variations}->{$type}) { + my $fg_adaptor; + + if (my $adap = $self->adaptor) { + if(my $db = $adap->db) { + $fg_adaptor = Bio::EnsEMBL::DBSQL::MergedAdaptor->new( + -species => $adap->db->species, + -type => $type, + ); + } + + unless ($fg_adaptor) { + warning("Failed to get adaptor for $type"); + return []; + } + } + else { + warning('Cannot get variation features without attached adaptor'); + return []; + } + + my $slice = $self->feature_Slice; + + my $constructor = 'Bio::EnsEMBL::Variation::StructuralVariationOverlap'; + + eval { + $self->{regulation_structural_variations}->{$type} = [ + map { + $constructor->new( + -structural_variation_feature => $self, + -feature => $_, + ); + } map { $_->transfer($self->slice) } @{ $fg_adaptor->fetch_all_by_Slice($slice) } + ]; + }; + + $self->{regulation_structural_variations}->{$type} ||= []; + } + + return $self->{regulation_structural_variations}->{$type}; +} + + +sub get_IntergenicStructuralVariation { + my $self = shift; + my $no_ref_check = shift; + + unless (exists $self->{intergenic_structural_variation}) { + if (scalar(@{ $self->get_all_TranscriptStructuralVariations }) == 0) { + $self->{intergenic_structural_variation} = Bio::EnsEMBL::Variation::IntergenicStructuralVariation->new( + -structural_variation_feature => $self, + -no_ref_check => $no_ref_check, + ); + } + else { + $self->{intergenic_structural_variation} = undef; + } + } + + return $self->{intergenic_structural_variation}; +} + + + +=head2 TranscriptStructuralVariation + + Arg [1] : Bio::EnsEMBL::Variation::TranscriptStructuralVariation + Example : $vf->add_TranscriptStructuralVariation($tsv); + Description : Adds a TranscriptStructuralVariation to the structural variation + feature object. + Exceptions : thrown on bad argument + Caller : Bio::EnsEMBL::Variation::StructuralVariationFeature, + Bio::EnsEMBL::Varaition::Utils::VEP + Status : Stable + +=cut + +sub add_TranscriptStructuralVariation { + my ($self, $tsv) = @_; + assert_ref($tsv, 'Bio::EnsEMBL::Variation::TranscriptStructuralVariation'); + # we need to weaken the reference back to us to avoid a circular reference + weaken($tsv->{base_variation_feature}); + $self->{transcript_structural_variations}->{$tsv->transcript_stable_id} = $tsv; +} + + +=head2 var_class + + Args : None + Example : my $sv_class = $svf->var_class() + Description : Getter for the class of structural variation + ReturnType : String + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub var_class { + my $self = shift; + + unless ($self->{class_display_term}) { + my $display_term = $VARIATION_CLASSES{$self->{class_SO_term}}->{display_term}; + + warn "No display term for SO term: ".$self->{class_SO_term} unless $display_term; + + $self->{class_display_term} = $display_term || $self->{class_SO_term}; + } + + return $self->{class_display_term}; +} + + +=head2 class_SO_term + + Args : None + Example : my $sv_so_term = $svf->class_SO_term() + Description : Getter for the class of structural variation, returning the SO term + ReturnType : String + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub class_SO_term { + my $self = shift; + + return $self->{class_SO_term}; +} + + +=head2 source + + Arg [1] : string $source (optional) + The new value to set the source attribute to + Example : $source = $svf->source() + Description: Getter/Setter for the source attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source{ + my $self = shift; + return $self->{'source'} = shift if(@_); + return $self->{'source'}; +} + +=head2 source_version + + Arg [1] : string $source_version (optional) + The new value to set the source_version attribute to + Example : $source_version = $svf->source_version() + Description: Getter/Setter for the source_version attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source_version { + my $self = shift; + return $self->{'source_version'} = shift if(@_); + return $self->{'source_version'}; +} + +=head2 bound_start + + Args : None + Example : my $bound_start = $svf->bound_start(); + Description : Getter/setter for the 5'-most coordinate defined for this StructuralVariationFeature (outer_start or start) + ReturnType : int + Exceptions : none + Caller : general + Status : At risk +=cut + +sub bound_start{ + my $self = shift; + return $self->{'outer_start'} if (defined($self->{'outer_start'})); + return $self->{'start'}; +} + + +=head2 bound_end + + Args : None + Example : my $bound_end = $svf->bound_end(); + Description : Getter/setter for the 3'-most coordinate defined for this StructuralVariationFeature (outer_end or end) + ReturnType : int + Exceptions : none + Caller : general + Status : At risk +=cut + +sub bound_end{ + my $self = shift; + return $self->{'outer_end'} if (defined($self->{'outer_end'})); + return $self->{'end'}; +} + + +=head2 outer_start + + Arg [1] : int $outer_start (optional) + The new value to set the outer_start attribute to + Example : my $outer_start = $svf->outer_start(); + Description : Getter/setter for the 5'-most coordinate defined for this StructuralVariationFeature + ReturnType : int + Exceptions : none + Caller : general + Status : At risk +=cut + +sub outer_start{ + my $self = shift; + return $self->{'outer_start'} = shift if(@_); + return $self->{'outer_start'}; +} + + +=head2 outer_end + + Arg [1] : int $outer_end (optional) + The new value to set the outer_end attribute to + Example : my $outer_end = $svf->outer_end(); + Description : Getter/setter for the 3'-most coordinate defined for this StructuralVariationFeature + ReturnType : int + Exceptions : none + Caller : general + Status : At risk +=cut + +sub outer_end{ + my $self = shift; + return $self->{'outer_end'} = shift if(@_); + return $self->{'outer_end'}; +} + + +=head2 inner_start + + Arg [1] : int $inner_start (optional) + The new value to set the inner_start attribute to + Example : my $inner_start = $svf->inner_start(); + Description : Getter/setter for the 5'-less coordinate defined for this StructuralVariationFeature + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk +=cut + +sub inner_start{ + my $self = shift; + return $self->{'inner_start'} = shift if(@_); + return $self->{'inner_start'}; +} + + +=head2 inner_end + + Arg [1] : int $inner_end (optional) + The new value to set the inner_end attribute to + Example : my $inner_end = $svf->inner_end(); + Description : Getter/setter for the 3'-less coordinate defined for this StructuralVariationFeature + ReturnType : int + Exceptions : none + Caller : general + Status : At Risk +=cut + +sub inner_end{ + my $self = shift; + return $self->{'inner_end'} = shift if(@_); + return $self->{'inner_end'}; +} + + +=head2 get_reference_sequence + + Args : none + Example : my $seq = $svf->get_reference_sequence + Description : returns a string containing the reference sequence for the region + covered by this StructuralVariationFeature + ReturnType : string + Exceptions : none + Caller : general + Status : At Risk +=cut + +sub get_reference_sequence{ + my $self = shift; + + return $self->feature_Slice->seq(); +} + + +sub transform { + my $self = shift; + + # run the transform method from the parent class + my $transformed = $self->SUPER::transform(@_); + + if(defined $transformed) { + # fit the start and end coords to the new coords + $transformed->_fix_bounds($self); + } + + return $transformed; +} + + +sub transfer { + my $self = shift; + + # run the transfer method from the parent class + my $transferred = $self->SUPER::transfer(@_); + + if(defined $transferred) { + # fit the start and end coords to the new coords + $transferred->_fix_bounds($self); + } + + return $transferred; +} + + +sub _fix_bounds { + my $self = shift; + my $old = shift; + + if(defined $old->{'outer_start'}) { + $self->{'outer_start'} = $self->start - ($old->start - $old->{'outer_start'}); + } + + if(defined $old->{'outer_end'}) { + $self->{'outer_end'} = $self->end + ($old->{'outer_end'} - $old->end); + } +} + +sub _sort_svos { + my $self = shift; + + return unless defined $self->{structural_variation_overlaps}; + + my @svos = @{$self->{structural_variation_overlaps}}; + + # define a feature order for sorting + my %feature_order = ( + 'Bio::EnsEMBL::Gene' => 1, + 'Bio::EnsEMBL::Transcript' => 2, + 'Bio::EnsEMBL::Exon' => 3, + ); + + # sort them nicely by feature type and position + @svos = sort { + $feature_order{ref($a->feature)} <=> $feature_order{ref($b->feature)} || + $a->feature->start <=> $b->feature->start + } @svos; + + $self->{structural_variation_overlaps} = \@svos; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationOverlap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationOverlap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,144 @@ +package Bio::EnsEMBL::Variation::StructuralVariationOverlap; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele; + +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +use base qw(Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap); + +sub new { + + my $class = shift; + + my %args = @_; + + # swap a '-structural_variation_feature' argument for a '-base_variation_feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-structural_variation_feature') { + $args{'-base_variation_feature'} = delete $args{$arg}; + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args); + + # construct a fake 'allele' + + $self->add_StructuralVariationOverlapAllele( + Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele->new_fast({ + structural_variation_overlap => $self, + }) + ); + + return $self; +} + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a 'structural_variation_feature' argument for a 'base_variation_feature' one for the superclass + + if ($hashref->{structural_variation_feature}) { + $hashref->{base_variation_feature} = delete $hashref->{structural_variation_feature}; + } + + # and call the superclass + + my $self = $class->SUPER::new_fast($hashref); + +# for my $ssv (@{ $self->structural_variation_feature->structural_variation->get_all_SupportingStructuralVariants }) { +# for my $ssvf (@{ $ssv->get_all_StructuralVariationFeatures }) { +# $self->add_StructuralVariationOverlapAllele( +# Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele->new_fast({ +# structural_variation_overlap => $self, +# }) +# ); +# } +# } + + unless (@{ $self->get_all_alternate_StructuralVariationOverlapAlleles }) { + + # construct a fake 'allele' + + $self->add_StructuralVariationOverlapAllele( + Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele->new_fast({ + structural_variation_overlap => $self, + }) + ); + } + + return $self; +} + +sub structural_variation_feature { + my $self = shift; + return $self->base_variation_feature(@_); +} + +=head2 add_StructuralVariationOverlapAllele + + Arg [1] : A Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele instance + Description: Add an allele to this StructuralVariationOverlap + Returntype : none + Exceptions : throws if the argument is not the expected type + Status : At Risk + +=cut + +sub add_StructuralVariationOverlapAllele { + my ($self, $svoa) = @_; + assert_ref($svoa, 'Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele'); + return $self->SUPER::add_BaseVariationFeatureOverlapAllele($svoa); +} + +=head2 get_reference_StructuralVariationOverlapAllele + + Description: Get the object representing the reference allele of this StructuralVariationOverlap + Returntype : Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele instance + Exceptions : none + Status : At Risk + +=cut + +sub get_reference_StructuralVariationOverlapAllele { + my $self = shift; + return $self->SUPER::get_reference_BaseVariationFeatureOverlapAllele(@_); +} + +=head2 get_all_alternate_StructuralVariationOverlapAlleles + + Description: Get a list of the alternate alleles of this StructuralVariationOverlap + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_alternate_StructuralVariationOverlapAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_BaseVariationFeatureOverlapAlleles(@_); +} + +=head2 get_all_StructuralVariationOverlapAlleles + + Description: Get a list of the all the alleles, both reference and alternate, of + this StructuralVariationOverlap + Returntype : listref of Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_StructuralVariationOverlapAlleles { + my $self = shift; + return $self->SUPER::get_all_BaseVariationFeatureOverlapAlleles(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationOverlapAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/StructuralVariationOverlapAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,56 @@ +package Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele); + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a transcript_variation argument for a variation_feature_overlap one + + if ($hashref->{structural_variation_overlap}) { + $hashref->{base_variation_feature_overlap} = + delete $hashref->{structural_variation_overlap}; + } + + # and call the superclass + + return $class->SUPER::new_fast($hashref); +} + +=head2 structural_variation_overlap + + Description: Get the associated StructuralVariationOverlap + Returntype : Bio::EnsEMBL::Variation::StructuralVariationOverlap + Exceptions : none + Status : At Risk + +=cut + +sub structural_variation_overlap { + my ($self, $svo) = @_; + if ($svo) { + assert_ref($svo, 'Bio::EnsEMBL::Variation::StructuralVariationOverlap'); + } + return $self->base_variation_feature_overlap($svo); +} + + +=head2 structural_variation_feature + + Description: Get the associated StructuralVariationFeature + Returntype : Bio::EnsEMBL::Variation::StructuralVariationFeature + Exceptions : none + Status : At Risk + +=cut + +sub structural_variation_feature { + my $self = shift; + return $self->structural_variation_overlap->structural_variation_feature; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Study.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Study.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,266 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Study +# +# Copyright (c) 2011 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Study - Ensembl representation of a study. + +=head1 SYNOPSIS + + # Study + $study = Bio::EnsEMBL::Variation::Study->new + (-name => 'EGAS00000000001', + -external_reference => 'pubmed/17554300', + -url => 'http://www.ebi.ac.uk/ega/page.php?page=study&study=EGAS00000000001&cat=www.wtccc.studies.xml.ega&subcat=BD' + ); + ... + + +=head1 DESCRIPTION + +This is a class representing a study from the ensembl-variation database. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Study; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); + +our @ISA = ('Bio::EnsEMBL::Storable'); + + +=head2 new + + Arg [-dbID] : + see superclass constructor + Arg [-ADAPTOR] : + see superclass constructor + Arg [-NAME] : + name of the study + Arg [-DESCRIPTION] : + study description + Arg [-URL] : + string - url of the database/file where the data are stored + Arg [-EXTERNAL_REFERENCE] : + string - the pubmed/ids or project/study names + Arg [-TYPE] : + string - type of the study (e.g. GWAS) + Arg [-SOURCE] : + string - name of the source + Arg [-ASSOCIATE] : + array ref - list of the study objects associated with the current study + + Example : + + $study = Bio::EnsEMBL::Variation::Study->new + (-name => 'EGAS00000000001', + -external_reference => 'pubmed/17554300', + -url => 'http://www.ebi.ac.uk/ega/page.php?page=study&study=EGAS00000000001&cat=www.wtccc.studies.xml.ega&subcat=BD' + ); + + Description: Constructor. Instantiates a new Study object. + Returntype : Bio::EnsEMBL::Variation::Study + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + my ($dbID,$adaptor,$study_name,$study_description,$study_url,$external_reference, + $study_type,$source_name,$associate) = + rearrange([qw(dbID ADAPTOR NAME DESCRIPTION URL EXTERNAL_REFERENCE TYPE SOURCE ASSOCIATE)], @_); + + $self = { + 'dbID' => $dbID, + 'adaptor' => $adaptor, + 'name' => $study_name, + 'description' => $study_description, + 'url' => $study_url, + 'external_reference' => $external_reference, + 'type' => $study_type, + 'source' => $source_name, + 'associate' => $associate + }; + + return bless $self, $class; +} + + +=head2 name + + Arg [1] : string $newval (optional) + The new value to set the name attribute to + Example : $name = $obj->name() + Description: Getter/Setter for the name attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub name{ + my $self = shift; + return $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + +=head2 description + + Arg [1] : string $newval (optional) + The new value to set the description attribute to + Example : $name = $obj->description() + Description: Getter/Setter for the description attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub description{ + my $self = shift; + return $self->{'description'} = shift if(@_); + return $self->{'description'}; +} + + +=head2 url + + Arg [1] : string $newval (optional) + The new value to set the url attribute to + Example : $name = $obj->url() + Description: Getter/Setter for the url attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub url{ + my $self = shift; + return $self->{'url'} = shift if(@_); + return $self->{'url'}; +} + + +=head2 external_reference + + Arg [1] : string $newval (optional) + The new value to set the external_reference attribute to + Example : $name = $obj->external_reference() + Description: Getter/Setter for the external_reference attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub external_reference{ + my $self = shift; + return $self->{'external_reference'} = shift if(@_); + return $self->{'external_reference'}; +} + + +=head2 type + + Arg [1] : string $newval (optional) + The new value to set the type attribute to + Example : $name = $obj->type() + Description: Getter/Setter for the type attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub type{ + my $self = shift; + return $self->{'type'} = shift if(@_); + return $self->{'type'}; +} + + +=head2 source + + Arg [1] : string $newval (optional) + The new value to set the source attribute to + Example : $name = $obj->source() + Description: Getter/Setter for the source attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source{ + my $self = shift; + return $self->{'source'} = shift if(@_); + return $self->{'source'}; +} + + +=head2 associated_studies + Example : $name = $obj->associate_studies() + Description: Getter/Setter for the associated_studies attribute + Returntype : reference to list of Bio::EnsEMBL::Variation::Study + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub associated_studies{ + my $self = shift; + + my $results; + + if (defined($self->{'associate'}) && defined($self->{'adaptor'})) { + my $studya = $self->{'adaptor'}->db()->get_StudyAdaptor(); + return $studya->fetch_all_by_dbID_list($self->{'associate'}); + } + else { + return []; + } +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/SupportingStructuralVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/SupportingStructuralVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,94 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::SupportingStructuralVariation +# +# Copyright (c) 2011 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::SupportingStructuralVariation - A supporting evidence for a structural variation. + +=head1 SYNOPSIS + + # Supporting evidence of a structural variation + $ssv = Bio::EnsEMBL::Variation::SupportingStructuralVariation->new + (-supporting_structural_evidence => 'ssv001' + -structural_variation => $structural_variation); + + ... + + +=head1 DESCRIPTION + +This is a class representing the supporting evidence of a structural variation +from the ensembl-variation database. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::SupportingStructuralVariation; + +use Bio::EnsEMBL::Variation::BaseStructuralVariation; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +our @ISA = ('Bio::EnsEMBL::Variation::BaseStructuralVariation'); + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = Bio::EnsEMBL::Variation::BaseStructuralVariation->new(@_); + return(bless($self, $class)); +} + + +=head2 get_all_StructuralVariations + Example : $ssv = $obj->get_all_StructuralVariations() + Description: Getter of the structural variations supported by the supporting evidence. + Returntype : reference to list of Bio::EnsEMBL::Variation::StructuralVariation objects + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_StructuralVariations { + my $self = shift; + + if(defined $self->{'adaptor'}) { + my $sva = $self->{'adaptor'}->db()->get_StructuralVariationAdaptor(); + return $sva->fetch_all_by_supporting_evidence($self); + } + else { + warn("No variation database attached"); + return []; + } +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptStructuralVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptStructuralVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,136 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::TranscriptStructuralVariation + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::TranscriptStructuralVariation; + + my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( + -transcript => $transcript, + -structural_variation_feature => $svf + ); + +=head1 DESCRIPTION + +A TranscriptStructuralVariation object represents a structural variation feature +which is in close proximity to an Ensembl transcript. A +TranscriptStructuralVariation object has several attributes which define the +relationship of the variation to the transcript. + +=cut + +package Bio::EnsEMBL::Variation::TranscriptStructuralVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele; + +use base qw(Bio::EnsEMBL::Variation::StructuralVariationOverlap Bio::EnsEMBL::Variation::BaseTranscriptVariation); + +sub new { + + my $class = shift; + + my %args = @_; + + # swap a '-transcript' argument for a '-feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-transcript') { + $args{'-feature'} = delete $args{$arg}; + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to tvas + map { bless $_, 'Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele' } + @{ $self->get_all_TranscriptStructuralVariationAlleles }; + + return $self; +} + +=head2 add_TranscriptStructuralVariationAllele + + Arg [1] : A Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele instance + Description: Add an allele to this TranscriptStructuralVariation + Returntype : none + Exceptions : throws if the argument is not the expected type + Status : At Risk + +=cut + +sub add_TranscriptStructuralVariationAllele { + my ($self, $svoa) = @_; + assert_ref($svoa, 'Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele'); + return $self->SUPER::add_BaseVariationFeatureOverlapAllele($svoa); +} + +=head2 get_reference_TranscriptStructuralVariationAllele + + Description: Get the object representing the reference allele of this TranscriptStructuralVariation + Returntype : Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele instance + Exceptions : none + Status : At Risk + +=cut + +sub get_reference_TranscriptStructuralVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_BaseVariationFeatureOverlapAllele(@_); +} + +=head2 get_all_alternate_TranscriptStructuralVariationAlleles + + Description: Get a list of the alternate alleles of this TranscriptStructuralVariation + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_alternate_TranscriptStructuralVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_BaseVariationFeatureOverlapAlleles(@_); +} + +=head2 get_all_TranscriptStructuralVariationAlleles + + Description: Get a list of the all the alleles, both reference and alternate, of + this TranscriptStructuralVariation + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_TranscriptStructuralVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_BaseVariationFeatureOverlapAlleles(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptStructuralVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptStructuralVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,76 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele; + +=head1 DESCRIPTION + +A TranscriptStructuralVariationAllele object represents a single allele of a +TranscriptStructuralVariation. + +=cut + +package Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele); + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a transcript_structural_variation argument for a structural_variation_overlap one + + if ($hashref->{transcript_structural_variation}) { + $hashref->{structural_variation_overlap} = delete $hashref->{transcript_structural_variation}; + } + + # and call the superclass + + return $class->SUPER::new_fast($hashref); +} + +=head2 transcript_structural_variation + + Description: Get the associated TranscriptStructuralVariation + Returntype : Bio::EnsEMBL::Variation::TranscriptStructuralVariation + Exceptions : none + Status : At Risk + +=cut + +sub transcript_structural_variation { + my ($self, $svo) = @_; + if ($svo) { + assert_ref($svo, 'Bio::EnsEMBL::Variation::TranscriptStructuralVariation'); + } + return $self->base_variation_feature_overlap($svo); +} + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::TranscriptVariation + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::TranscriptVariation; + + my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( + -transcript => $transcript, + -variation_feature => $var_feat + ); + + print "consequence type: ", (join ",", @{$tv->consequence_type}), "\n"; + print "cdna coords: ", $tv->cdna_start, '-', $tv->cdna_end, "\n"; + print "cds coords: ", $tv->cds_start, '-', $tv->cds_end, "\n"; + print "pep coords: ", $tv->translation_start, '-',$tv->translation_end, "\n"; + print "amino acid change: ", $tv->pep_allele_string, "\n"; + print "codon change: ", $tv->codons, "\n"; + print "allele sequences: ", (join ",", map { $_->variation_feature_seq } + @{ $tv->get_all_TranscriptVariationAlleles }, "\n"; + +=head1 DESCRIPTION + +A TranscriptVariation object represents a variation feature which is in close +proximity to an Ensembl transcript. A TranscriptVariation object has several +attributes which define the relationship of the variation to the transcript. + +=cut + +package Bio::EnsEMBL::Variation::TranscriptVariation; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref); +use Bio::EnsEMBL::Variation::TranscriptVariationAllele; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap within_cds); +use Bio::EnsEMBL::Variation::BaseTranscriptVariation; + +use base qw(Bio::EnsEMBL::Variation::BaseTranscriptVariation Bio::EnsEMBL::Variation::VariationFeatureOverlap); + +=head2 new + + Arg [-TRANSCRIPT] : + The Bio::EnsEMBL::Transcript associated with the given VariationFeature + + Arg [-VARIATION_FEATURE] : + The Bio::EnsEMBL::VariationFeature associated with the given Transcript + + Arg [-ADAPTOR] : + A Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor + + Arg [-DISAMBIGUATE_SINGLE_NUCLEOTIDE_ALLELES] : + A flag indiciating if ambiguous single nucleotide alleles should be disambiguated + when constructing the TranscriptVariationAllele objects, e.g. a Variationfeature + with an allele string like 'T/M' would be treated as if it were 'T/A/C'. We limit + ourselves to single nucleotide alleles to avoid the combinatorial explosion if we + allowed longer alleles with potentially many ambiguous bases. + + Example : + my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( + -transcript => $transcript, + -variation_feature => $var_feat + ); + + Description: Constructs a new TranscriptVariation instance given a VariationFeature + and a Transcript, most of the work is done in the VariationFeatureOverlap + superclass - see there for more documentation. + Returntype : A new Bio::EnsEMBL::Variation::TranscriptVariation instance + Exceptions : throws unless both VARIATION_FEATURE and TRANSCRIPT are supplied + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my %args = @_; + + # swap a '-transcript' argument for a '-feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-transcript') { + $args{'-feature'} = delete $args{$arg}; + } + } + + # call the superclass constructor + my $self = $class->SUPER::new(%args) || return undef; + + # rebless the alleles from vfoas to tvas + map { bless $_, 'Bio::EnsEMBL::Variation::TranscriptVariationAllele' } + @{ $self->get_all_TranscriptVariationAlleles }; + + return $self; +} + +sub get_TranscriptVariationAllele_for_allele_seq { + my ($self, $allele_seq) = @_; + return $self->SUPER::get_VariationFeatureOverlapAllele_for_allele_seq($allele_seq); +} + +=head2 add_TranscriptVariationAllele + + Arg [1] : A Bio::EnsEMBL::Variation::TranscriptVariationAllele instance + Description: Add an allele to this TranscriptVariation + Returntype : none + Exceptions : throws if the argument is not the expected type + Status : At Risk + +=cut + +sub add_TranscriptVariationAllele { + my ($self, $tva) = @_; + assert_ref($tva, 'Bio::EnsEMBL::Variation::TranscriptVariationAllele'); + return $self->SUPER::add_VariationFeatureOverlapAllele($tva); +} + +=head2 get_reference_TranscriptVariationAllele + + Description: Get the object representing the reference allele of this TranscriptVariation + Returntype : Bio::EnsEMBL::Variation::TranscriptVariationAllele instance + Exceptions : none + Status : At Risk + +=cut + +sub get_reference_TranscriptVariationAllele { + my $self = shift; + return $self->SUPER::get_reference_VariationFeatureOverlapAllele(@_); +} + +=head2 get_all_alternate_TranscriptVariationAlleles + + Description: Get a list of the alternate alleles of this TranscriptVariation + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariationAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_alternate_TranscriptVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_alternate_VariationFeatureOverlapAlleles(@_); +} + +=head2 get_all_TranscriptVariationAlleles + + Description: Get a list of the all the alleles, both reference and alternate, of + this TranscriptVariation + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariationAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_TranscriptVariationAlleles { + my $self = shift; + return $self->SUPER::get_all_VariationFeatureOverlapAlleles(@_); +} + + +=head2 cdna_allele_string + + Description: Return a '/' delimited string of the alleles of this variation with respect + to the associated transcript + Returntype : string + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub cdna_allele_string { + my $self = shift; + + unless ($self->{_cdna_allele_string}) { + $self->{_cdna_allele_string} = join '/', map { $_->feature_seq } @{ $self->get_all_TranscriptVariationAlleles }; + } + + return $self->{_cdna_allele_string}; +} + +=head2 pep_allele_string + + Description: Return a '/' delimited string of amino acid codes representing + all the possible changes made to the peptide by this variation + Returntype : string + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub pep_allele_string { + my $self = shift; + + unless ($self->{_pep_allele_string}) { + + my @peptides = grep { defined } map { $_->peptide } @{ $self->get_all_TranscriptVariationAlleles }; + + $self->{_pep_allele_string} = join '/', @peptides; + } + + return $self->{_pep_allele_string}; +} + +=head2 codons + + Description: Return a '/' delimited string of all possible codon sequences. + The variant sequence within the codon will be capitalised while + the rest of the codon sequence will be in lower case + Returntype : string + Exceptions : None + Caller : general + Status : Stable + +=cut + +sub codons { + my $self = shift; + + unless ($self->{_display_codon_allele_string}) { + + my @display_codons = grep { defined } map { $_->display_codon } @{ $self->get_all_TranscriptVariationAlleles }; + + $self->{_display_codon_allele_string} = join '/', @display_codons; + } + + return $self->{_display_codon_allele_string}; +} + +=head2 codon_position + + Description: For variations that fall in the CDS, returns the base in the + codon that this variation falls in + Returntype : int - 1, 2 or 3 + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub codon_position { + my $self = shift; + + unless ($self->{_codon_position}) { + + my $cdna_start = $self->cdna_start; + + my $tran_cdna_start = $self->transcript->cdna_coding_start; + + # we need to take into account the exon phase + my $exon_phase = $self->transcript->start_Exon->phase; + + my $phase_offset = $exon_phase > 0 ? $exon_phase : 0; + + if (defined $cdna_start && defined $tran_cdna_start) { + $self->{_codon_position} = (($cdna_start - $tran_cdna_start + $phase_offset) % 3) + 1; + } + } + + return $self->{_codon_position}; +} + +=head2 affects_cds + + Description: Check if any of this TranscriptVariation's alleles lie within + the CDS of the Transcript + Returntype : boolean + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub affects_cds { + my $self = shift; + return scalar grep { within_cds($_) } @{ $self->get_all_alternate_TranscriptVariationAlleles }; +} + +=head2 affects_peptide + + Description: Check if any of this TranscriptVariation's alleles change the + resultant peptide sequence + Returntype : boolean + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub affects_peptide { + my $self = shift; + return scalar grep { $_->SO_term =~ /stop|non_syn|frameshift|inframe|initiator/ } map {@{$_->get_all_OverlapConsequences}} @{ $self->get_all_alternate_TranscriptVariationAlleles }; +} + + +sub _protein_function_predictions { + + my ($self, $analysis) = @_; + + my $tran = $self->transcript; + + my $matrix = $tran->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis}; + + unless ($matrix || exists($tran->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis})) { + my $pfpma = $self->{adaptor}->db->get_ProteinFunctionPredictionMatrixAdaptor; + + $matrix = $pfpma->fetch_by_analysis_translation_md5($analysis, $self->_translation_md5); + + $tran->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $matrix; + } + + return $matrix; +} + +=head2 hgvs_genomic + + Description: Return the strings representing the genomic-level effect of each of the alleles + of this variation in HGVS format + Returntype : hashref where the key is the allele sequence and then value is the HGVS string + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_genomic { + return _hgvs_generic(@_,'genomic'); +} + +=head2 hgvs_coding + + Description: Return the strings representing the CDS-level effect of each of the alleles + of this variation in HGVS format + Returntype : hashref where the key is the allele sequence and then value is the HGVS string + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_coding { + deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.'); + return _hgvs_generic(@_,'transcript'); +} + +=head2 hgvs_transcript + + Description: Return the strings representing the CDS-level effect of each of the alleles + of this variation in HGVS format + Returntype : hashref where the key is the allele sequence and then value is the HGVS string + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_transcript { + return _hgvs_generic(@_,'transcript'); +} + +=head2 hgvs_protein + + Description: Return the strings representing the protein-level effect of each of the alleles + of this variation in HGVS format + Returntype : hashref where the key is the allele sequence and then value is the HGVS string + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_protein { + return _hgvs_generic(@_,'protein'); +} + +=head + +# We haven't implemented support for these methods yet + +sub hgvs_rna { + return _hgvs_generic(@_,'rna'); +} + +sub hgvs_mitochondrial { + return _hgvs_generic(@_,'mitochondrial'); +} +=cut + +sub _hgvs_generic { + my $self = shift; + my $reference = pop; + my $hgvs = shift; + + #ÊThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these + return undef if ($reference =~ m/rna|mitochondrial/); + + # The HGVS subroutine + my $sub = qq{hgvs_$reference}; + + # Loop over the TranscriptVariationAllele objects associated with this TranscriptVariation + foreach my $tv_allele (@{ $self->get_all_alternate_TranscriptVariationAlleles }) { + + #ÊIf an HGVS hash was supplied and the allele exists as key, set the HGVS notation for this allele + if (defined($hgvs)) { + my $notation = $hgvs->{$tv_allele->variation_feature_seq()}; + $tv_allele->$sub($notation) if defined $notation; + } + # Else, add the HGVS notation for this allele to the HGVS hash + else { + $hgvs->{$tv_allele->variation_feature_seq()} = $tv_allele->$sub(); + } + } + + return $hgvs; +} + +sub _prefetch_for_vep { + my $self = shift; + + $self->cdna_coords; + $self->cds_coords; + $self->translation_coords; + $self->pep_allele_string; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1406 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::TranscriptVariationAllele + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::TranscriptVariationAllele; + + my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new( + -transcript_variation => $tv, + -variation_feature_seq => 'A', + -is_reference => 0, + ); + + print "sequence with respect to the transcript: ", $tva->feature_seq, "\n"; + print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n"; + print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n"; + print "amino acid change: ", $tva->peptide_allele_string, "\n"; + print "resulting codon: ", $tva->codon, "\n"; + print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n"; + print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n"; + print "SIFT prediction: ", $tva->sift_prediction, "\n"; + +=head1 DESCRIPTION + +A TranscriptVariationAllele object represents a single allele of a TranscriptVariation. +It provides methods that are specific to the sequence of the allele, such as codon, +peptide etc. Methods that depend only on position (e.g. CDS start) will be found in +the associated TranscriptVariation. Ordinarily you will not create these objects +yourself, but instead you would create a TranscriptVariation object which will then +construct TranscriptVariationAlleles based on the allele string of the associated +VariationFeature. + +Note that any methods that are not specific to Transcripts will be found in the +VariationFeatureOverlapAllele superclass. + +=cut + +package Bio::EnsEMBL::Variation::TranscriptVariationAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele); + + +our $DEBUG = 0; + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a transcript_variation argument for a variation_feature_overlap one + + if ($hashref->{transcript_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation}; + } + + # and call the superclass + + return $class->SUPER::new_fast($hashref); +} + +=head2 transcript_variation + + Description: Get/set the associated TranscriptVariation + Returntype : Bio::EnsEMBL::Variation::TranscriptVariation + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub transcript_variation { + my ($self, $tv) = @_; + assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv; + return $self->variation_feature_overlap($tv); +} + +=head2 variation_feature + + Description: Get the associated VariationFeature + Returntype : Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Status : At Risk + +=cut + +sub variation_feature { + my $self = shift; + return $self->transcript_variation->variation_feature; +} + +=head2 pep_allele_string + + Description: Return a '/' delimited string of the reference peptide and the + peptide resulting from this allele, or a single peptide if this + allele does not change the peptide (e.g. because it is synonymous) + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub pep_allele_string { + my ($self) = @_; + + my $pep = $self->peptide; + + return undef unless $pep; + + my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; + + return undef unless $ref_pep; + + return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep; +} + +=head2 codon_allele_string + + Description: Return a '/' delimited string of the reference codon and the + codon resulting from this allele + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub codon_allele_string { + my ($self) = @_; + + my $codon = $self->codon; + + return undef unless $codon; + + my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon; + + return $ref_codon.'/'.$codon; +} + +=head2 display_codon_allele_string + + Description: Return a '/' delimited string of the reference display_codon and the + display_codon resulting from this allele. The display_codon identifies + the nucleotides affected by this variant in UPPER CASE and other + nucleotides in lower case + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub display_codon_allele_string { + my ($self) = @_; + + my $display_codon = $self->display_codon; + + return undef unless $display_codon; + + my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon; + + return undef unless $ref_display_codon; + + return $ref_display_codon.'/'.$display_codon; +} + +=head2 peptide + + Description: Return the amino acid sequence that this allele is predicted to result in + Returntype : string or undef if this allele is not in the CDS or is a frameshift + Exceptions : none + Status : At Risk + +=cut + +sub peptide { + my ($self, $peptide) = @_; + + $self->{peptide} = $peptide if $peptide; + + unless ($self->{peptide}) { + + return undef unless $self->seq_is_unambiguous_dna; + + if (my $codon = $self->codon) { + + # the codon method can set the peptide in some circumstances + # so check here before we try an (expensive) translation + return $self->{peptide} if $self->{peptide}; + + # translate the codon sequence to establish the peptide allele + + # allow for partial codons - split the sequence into whole and partial + # e.g. AAAGG split into AAA and GG + my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3); + my $partial_codon = substr($codon, int(length($codon) / 3) * 3); + + my $pep = ''; + + if($whole_codon) { + # for mithocondrial dna we need to to use a different codon table + my $codon_table = $self->transcript_variation->_codon_table; + + my $codon_seq = Bio::Seq->new( + -seq => $whole_codon, + -moltype => 'dna', + -alphabet => 'dna', + ); + + $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq; + } + + if($partial_codon) { + $pep .= 'X'; + } + + $pep ||= '-'; + + $self->{peptide} = $pep; + } + } + + return $self->{peptide}; +} + +=head2 codon + + Description: Return the codon sequence that this allele is predicted to result in + Returntype : string or undef if this allele is not in the CDS or is a frameshift + Exceptions : none + Status : At Risk + +=cut + +sub codon { + my ($self, $codon) = @_; + + $self->{codon} = $codon if defined $codon; + + my $tv = $self->transcript_variation; + + return undef unless $tv->translation_start && $tv->translation_end; + + return undef unless $self->seq_is_dna; + + unless ($self->{codon}) { + + # try to calculate the codon sequence + + my $seq = $self->feature_seq; + + $seq = '' if $seq eq '-'; + + # calculate necessary coords and lengths + + my $codon_cds_start = $tv->translation_start * 3 - 2; + my $codon_cds_end = $tv->translation_end * 3; + my $codon_len = $codon_cds_end - $codon_cds_start + 1; + my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1; + my $allele_len = $self->seq_length; + + if ($allele_len != $vf_nt_len) { + if (abs($allele_len - $vf_nt_len) % 3) { + # this is a frameshift variation, we don't attempt to + # calculate the resulting codon or peptide change as this + # could get quite complicated + return undef; + } + } + + # splice the allele sequence into the CDS + + my $cds = $tv->_translateable_seq; + + substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq; + + # and extract the codon sequence + + my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len)); + + if (length($codon) < 1) { + $self->{codon} = '-'; + $self->{peptide} = '-'; + } + else { + $self->{codon} = $codon; + } + } + + return $self->{codon}; +} + +=head2 display_codon + + Description: Return the codon sequence that this allele is predicted to result in + with the affected nucleotides identified in UPPER CASE and other + nucleotides in lower case + Returntype : string or undef if this allele is not in the CDS or is a frameshift + Exceptions : none + Status : At Risk + +=cut + +sub display_codon { + my $self = shift; + + unless ($self->{_display_codon}) { + + if ($self->codon && defined $self->transcript_variation->codon_position) { + + my $display_codon = lc $self->codon; + + # if this allele is an indel then just return all lowercase + + if ($self->feature_seq ne '-') { + + # codon_position is 1-based, while substr assumes the string starts at 0 + + my $pos = $self->transcript_variation->codon_position - 1; + + my $len = length $self->feature_seq; + + substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len); + } + + $self->{_display_codon} = $display_codon; + } + } + + return $self->{_display_codon}; +} + +=head2 polyphen_prediction + + Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele. + (Note that we currently only have PolyPhen predictions for variants that + result in single amino acid substitutions in human) + Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown') + if this is a non-synonymous mutation and a prediction is available, undef + otherwise + Exceptions : none + Status : At Risk + +=cut + +sub polyphen_prediction { + my ($self, $classifier, $polyphen_prediction) = @_; + + $classifier ||= 'humvar'; + + my $analysis = "polyphen_${classifier}"; + + $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction; + + unless ($self->{$analysis}->{prediction}) { + my ($prediction, $score) = $self->_protein_function_prediction($analysis); + $self->{$analysis}->{score} = $score; + $self->{$analysis}->{prediction} = $prediction; + } + + return $self->{$analysis}->{prediction}; +} + +=head2 polyphen_score + + Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we + currently only have PolyPhen predictions for variants that result in single + amino acid substitutions in human) + Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is + available, undef otherwise + Exceptions : none + Status : At Risk + +=cut + +sub polyphen_score { + my ($self, $classifier, $polyphen_score) = @_; + + $classifier ||= 'humvar'; + + my $analysis = "polyphen_${classifier}"; + + $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score; + + unless ($self->{$analysis}->{score}) { + my ($prediction, $score) = $self->_protein_function_prediction($analysis); + $self->{$analysis}->{score} = $score; + $self->{$analysis}->{prediction} = $prediction; + } + + return $self->{$analysis}->{score}; +} + +=head2 sift_prediction + + Description: Return the qualitative SIFT prediction for the effect of this allele. + (Note that we currently only have SIFT predictions for variants that + result in single amino acid substitutions in human) + Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous + mutation and a prediction is available, undef otherwise + Exceptions : none + Status : At Risk + +=cut + +sub sift_prediction { + my ($self, $sift_prediction) = @_; + + $self->{sift_prediction} = $sift_prediction if $sift_prediction; + + unless ($self->{sift_prediction}) { + my ($prediction, $score) = $self->_protein_function_prediction('sift'); + $self->{sift_score} = $score; + $self->{sift_prediction} = $prediction unless $self->{sift_prediction}; + } + + return $self->{sift_prediction}; +} + +=head2 sift_score + + Description: Return the SIFT score for this allele (Note that we currently only have SIFT + predictions for variants that result in single amino acid substitutions in human) + Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is + available, undef otherwise + Exceptions : none + Status : At Risk + +=cut + +sub sift_score { + my ($self, $sift_score) = @_; + + $self->{sift_score} = $sift_score if defined $sift_score; + + unless ($self->{sift_score}) { + my ($prediction, $score) = $self->_protein_function_prediction('sift'); + $self->{sift_score} = $score; + $self->{sift_prediction} = $prediction; + } + + return $self->{sift_score}; +} + + +sub _protein_function_prediction { + my ($self, $analysis) = @_; + + # we can only get results for variants that cause a single amino acid substitution, + # so check the peptide allele string first + + if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) { + + if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) { + + my ($prediction, $score) = $matrix->get_prediction( + $self->transcript_variation->translation_start, + $self->peptide, + ); + + return wantarray ? ($prediction, $score) : $prediction; + } + } + + return undef; +} + +=head2 hgvs_genomic + + Description: Return a string representing the genomic-level effect of this allele in HGVS format + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_genomic { + return _hgvs_generic(@_,'genomic'); +} + +=head2 hgvs_coding + + Description: Return a string representing the CDS-level effect of this allele in HGVS format + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_coding { + + deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.'); + return hgvs_transcript(@_); +} + + +=head2 hgvs_transcript + + Description: Return a string representing the CDS-level effect of this allele in HGVS format + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + + +sub hgvs_transcript { + + + my $self = shift; + my $notation = shift; + + ##### set if string supplied + $self->{hgvs_transcript} = $notation if defined $notation; + ##### return if held + return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ; + return $self->{hgvs_coding} if defined $self->{hgvs_coding} ; + + my $variation_feature_sequence = $self->variation_feature_seq() ; + + ### don't try to handle odd characters + return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig; + + ### no result for reference allele + return undef if $self->is_reference ==1; + + ### else evaluate + + ### get reference sequence strand + my $refseq_strand = $self->transcript_variation->transcript->strand(); + + if($DEBUG ==1){ + my $var_name = $self->transcript_variation->variation_feature->variation_name(); + print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n"; + } + + my $hgvs_notation ; ### store components of HGVS string in hash + + ### vf strand is relative to transcript or transcript slice + if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 || + $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0 + ){ + reverse_comp(\$variation_feature_sequence); + } + + ### need to get ref seq from slice transcript is on for intron labelling + my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords(); + + + unless($slice_start){ + #warn "TVA: VF not within transcript - no HGVS\n"; + return undef; + } + + ### decide event type from HGVS nomenclature + $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele, + $slice->seq(), ### using this to extract ref allele + $slice_start, + $slice_end + ); + + ### This should not happen + unless($hgvs_notation->{'type'}){ + #warn "Error - not continuing; no HGVS annotation\n"; + return undef; + } + + ### create reference name - transcript name & seq version + $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version(); + + + ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups] + $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} ); + $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} ); + + + # Make sure that start is always less than end + my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; + my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; + $intron_start_offset ||= 0; + $intron_end_offset ||= 0; + + ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if + (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset)); + + + + if($self->transcript->cdna_coding_start()){ + $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding + } + else{ + $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding + } + ### generic formatting + $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation); + + if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";} + + return $self->{hgvs_transcript}; + +} + + +=head2 hgvs_protein + + Description: Return a string representing the protein-level effect of this allele in HGVS format + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_protein { + + my $self = shift; + my $notation = shift; + my $hgvs_notation; + + if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; } + + ### set if string supplied + $self->{hgvs_protein} = $notation if defined $notation; + + ### return if set + return $self->{hgvs_protein} if defined $self->{hgvs_protein} ; + + ### don't try to handle odd characters + return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig; + + #### else, check viable input and create notation + return if $self->is_reference(); + + unless ($self->transcript_variation->translation_start() && $self->transcript_variation->translation_end()){ + return undef ; + } + + #### for debug + #my $var_name = $self->transcript_variation->variation_feature->variation_name(); + + ### get reference sequence [add seq version to transcript name] + $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version(); + + $hgvs_notation->{'numbering'} = 'p'; + + ### get default reference location [changed later in some cases eg. duplication] + $hgvs_notation->{start} = $self->transcript_variation->translation_start(); + $hgvs_notation->{end} = $self->transcript_variation->translation_end(); + + + ## get default reference & alt peptides [changed later to hgvs format] + $hgvs_notation->{alt} = $self->peptide; + $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; + + if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){ + $hgvs_notation = _clip_alleles( $hgvs_notation); + } + + #### define type - types are different for protein numbering + $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation); + + ##### Convert ref & alt peptides taking into account HGVS rules + $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation); + + ##### String formatting + $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation); + return $self->{hgvs_protein} ; +} + +### HGVS: format protein string +sub _get_hgvs_protein_format{ + + my $self = shift; + my $hgvs_notation = shift; + + if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} && + $hgvs_notation->{ref} eq $hgvs_notation->{alt}) || + (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){ + + ### no protein change - return transcript nomenclature with flag for neutral protein consequence + $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)"; + return $hgvs_notation->{'hgvs'} ; + } + + ### all start with refseq name & numbering type + $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; + + ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop + if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt + $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3); + if($hgvs_notation->{type} eq "del"){ + my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del"); + if(defined $aa_til_stop){ + $hgvs_notation->{alt} .= "extX" . $aa_til_stop; + } + } + elsif($hgvs_notation->{type} eq ">"){ + my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs"); + if(defined $aa_til_stop){ + $hgvs_notation->{alt} .= "extX" . $aa_til_stop; + } + } + else{ + # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n"; + } + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; + } + + elsif( $hgvs_notation->{type} eq "dup"){ + + if($hgvs_notation->{start} < $hgvs_notation->{end}){ + ### list only first and last peptides in long duplicated string + my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3); + my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3); + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup"; + + } + else{ + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ; + } + + print "formating a dup $hgvs_notation->{hgvs} \n" if $DEBUG==1; + } + + elsif($hgvs_notation->{type} eq ">"){ + #### substitution + + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; + } + + elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){ + + #### list first and last aa in reference only + my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); + my $ref_pep_last; + if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){ + $ref_pep_last ="X"; + } + else{ + $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); + } + if($hgvs_notation->{ref} =~ /X$/){ + ### For stops & add extX & distance to next stop to alt pep + my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss"); + if(defined $aa_til_stop){ + $hgvs_notation->{alt} .="extX" . $aa_til_stop; + } + } + if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){ + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; + } + else{ + ### correct ordering if needed + if($hgvs_notation->{start} > $hgvs_notation->{end}){ + ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} ); + } + + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; + } + } + + elsif($hgvs_notation->{type} eq "fs"){ + if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; + + } + else{ ## not immediate stop - count aa until next + + my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs"); + if($aa_til_stop ){ + ### use long form if new stop found + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ; + } + else{ + ### otherwise use short form + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs"; + } + } + } + + elsif( $hgvs_notation->{type} eq "del"){ + $hgvs_notation->{alt} = "del"; + if( length($hgvs_notation->{ref}) >3 ){ + my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); + my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ; + } + else{ + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; + } + } + + elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){ + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ; + } + + else{ + #### default to substitution + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; + } + + if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";} + return $hgvs_notation->{'hgvs'}; +} + +### HGVS: get type of variation event in protein terms +sub _get_hgvs_protein_type{ + + my $self = shift; + my $hgvs_notation = shift; + + ### get allele length + my ($ref_length, $alt_length ) = $self->_get_allele_length(); + + + if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){ + ### Run type checks on peptides if available + + if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){ + #### synonymous indicated by = + $hgvs_notation->{type} = "="; + } + elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) { + + $hgvs_notation->{type} = ">"; + } + elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") { + + $hgvs_notation->{type} = "ins"; + } + elsif($hgvs_notation->{alt} eq "" ) { + + $hgvs_notation->{type} = "del"; + } + elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) && + $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) { + ### capture duplication event described as TTT/TTTTT + $hgvs_notation->{type} = "dup"; + } + + elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) || + (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) { + $hgvs_notation->{type} = "delins"; + } + else{ + $hgvs_notation->{type} = ">"; + } + } + + + elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){ + $hgvs_notation->{type} = "fs"; + } + + elsif(length($self->variation_feature_seq()) >1 ){ + if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){ + ### convention for insertions - end one less than start + $hgvs_notation->{type} = "ins"; + } + elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){ + $hgvs_notation->{type} = "delins"; + } + else{ + $hgvs_notation->{type} = ">"; + } + } + else{ + #print STDERR "DEBUG ".$self->variation_feature->start."\n"; + #warn "Cannot define protein variant type [$ref_length - $alt_length]\n"; + } + return $hgvs_notation ; + +} + +### HGVS: get reference & alternative peptide +sub _get_hgvs_peptides{ + + my $self = shift; + my $hgvs_notation = shift; + + if($hgvs_notation->{type} eq "fs"){ + ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately + $hgvs_notation = $self->_get_fs_peptides($hgvs_notation); + } + elsif($hgvs_notation->{type} eq "ins" ){ + + ### HGVS ref are peptides flanking insertion + $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start}); + + if( $hgvs_notation->{alt} =~/\*/){ + ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW + #$hgvs_notation->{alt} =~ s/\*\w+$/\*/; + } + else{ + ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so] + $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation); + } + } + + + ### Convert peptide to 3 letter code as used in HGVS + $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || ""; + if( $hgvs_notation->{alt} eq "-"){ + $hgvs_notation->{alt} = "del"; + } + else{ + $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || ""; + } + + ### handle special cases + if( affects_start_codon($self) ){ + #### handle initiator loss -> probably no translation => alt allele is '?' + $hgvs_notation->{alt} = "?"; + $hgvs_notation->{type} = ""; + } + + elsif( $hgvs_notation->{type} eq "del"){ + #### partial last codon + $hgvs_notation->{alt} = "del"; + } + elsif($hgvs_notation->{type} eq "fs"){ + ### only quote first ref peptide for frameshift + $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3); + } + + ### set stop to HGVS prefered "X" + if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; } + if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; } + + return ($hgvs_notation); + +} + +### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly +sub _clip_alleles{ + + my $hgvs_notation = shift; + + my $check_alt = $hgvs_notation->{alt} ; + my $check_ref = $hgvs_notation->{ref} ; + my $check_start = $hgvs_notation->{start}; + + ### strip same bases from start of string + for (my $p =0; $p {ref}); $p++){ + my $check_next_ref = substr( $check_ref, 0, 1); + my $check_next_alt = substr( $check_alt, 0, 1); + + if($check_next_ref eq "*" && $check_next_alt eq "*"){ + ### stop re-created by variant - no protein change + $hgvs_notation->{type} = "="; + + return($hgvs_notation); + } + + if($check_next_ref eq $check_next_alt){ + $check_start++; + $check_ref = substr( $check_ref, 1); + $check_alt = substr( $check_alt, 1); + } + else{ + last; + } + } + #### strip same bases from end of string + for (my $q =0; $q < length ($check_ref); $q++){ + my $check_next_ref = substr( $check_ref, -1, 1); + my $check_next_alt = substr( $check_alt, -1, 1); + if($check_next_ref eq $check_next_alt){ + chop $check_ref; + chop $check_alt; + } + else{ + last; + } + } + + $hgvs_notation->{alt} = $check_alt; + $hgvs_notation->{ref} = $check_ref; + + ### check if clipping force type change & adjust location + if(length ($check_ref) == 0 && length ($check_alt) >= 1){ + ### re-set as ins not delins + $hgvs_notation->{type} ="ins"; + ### insertion between ref base and next => adjust next + if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; } +# $hgvs_notation->{start} = $check_start; + } + elsif(length ($check_ref) >=1 && length ($check_alt) == 0){ + ### re-set as del not delins + $hgvs_notation->{type} ="del"; + $hgvs_notation->{start} = $check_start; + } + else{ + #### save trimmed peptide string & increased start position + $hgvs_notation->{start} = $check_start; + + } + + return $hgvs_notation; +} + + + + + +#### HGVS: check allele lengths to look for frameshifts +sub _get_allele_length{ + + my $self = shift; + my $ref_length = 0; + my $alt_length = 0; + + my $al_string = $self->allele_string(); + my $ref_allele = (split/\//, $al_string)[0]; + $ref_allele =~ s/\-//; + $ref_length = length $ref_allele; + + my $alt_allele = $self->variation_feature_seq(); + $alt_allele =~ s/\-//; + $alt_length = length $alt_allele; + + return ($ref_length, $alt_length ); + +} + +### HGVS: list first different peptide [may not be first changed codon] +sub _get_fs_peptides{ + + my $self = shift; + my $hgvs_notation = shift; + + ### get CDS with alt variant + my $alt_cds = $self->_get_alternate_cds(); + return undef unless defined($alt_cds); + + #### get new translation + my $alt_trans = $alt_cds->translate()->seq(); + + ### get changed end (currently in single letter AA coding) + my $ref_trans = $self->transcript->translate()->seq(); + $ref_trans .= "*"; ## appending ref stop for checking purposes + + $hgvs_notation->{start} = $self->transcript_variation->translation_start() ; + + if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq + $hgvs_notation->{alt} = "del"; + $hgvs_notation->{type} = "del"; + return $hgvs_notation; + } + + while ($hgvs_notation->{start} <= length $alt_trans){ + ### frame shift may result in the same AA# + + $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1); + $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1); + + if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){ + ### variation at stop codon, but maintains stop codon + return ($hgvs_notation); + } + last if $hgvs_notation->{ref} ne $hgvs_notation->{alt}; + $hgvs_notation->{start}++; + } + return ($hgvs_notation); + +} + +#### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion +sub _get_surrounding_peptides{ + + my $self = shift; + my $ref_pos = shift; + my $ref_trans = $self->transcript->translate()->seq(); + + my $end = substr($ref_trans, $ref_pos-1); + my $ref_string = substr($ref_trans, $ref_pos-1, 2); + + return ($ref_string); + +} + + +#### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop +sub _get_alternate_cds{ + + my $self = shift; + + ### get reference sequence + my $reference_cds_seq = $self->transcript->translateable_seq(); + + return undef unless defined($self->transcript_variation->cds_start) && defined($self->transcript_variation->cds_end()); + + ### get sequences upstream and downstream of variant + my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); + my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) ); + + ### fix alternate allele if deletion or on opposite strand + my $alt_allele = $self->variation_feature_seq(); + $alt_allele =~ s/\-//; + if( $self->transcript_variation->variation_feature->strand() <0 && $self->transcript_variation->transcript->strand() >0 || + $self->transcript_variation->variation_feature->strand() >0 && $self->transcript_variation->transcript->strand() < 0 + ){ + reverse_comp(\$alt_allele) ; + } + ### build alternate seq + my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ; + + ### create seq obj with alternative allele in the CDS sequence + my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna'); + + ### append UTR if available as stop may be disrupted + my $utr = $self->transcript_variation->transcript->three_prime_utr(); + + if (defined $utr) { + ### append the UTR to the alternative CDS + $alt_cds->seq($alt_cds->seq() . $utr->seq()); + } + else{ + ##warn "No UTR available for alternate CDS\n"; + } + + return $alt_cds; +} + +### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication +sub _check_for_peptide_duplication{ + + my $self = shift; + my $hgvs_notation= shift; + + ##### get reference sequence + my $reference_cds_seq = $self->transcript->translateable_seq(); + + ##### get sequence upstream of variant + my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); + + ##### create translation to check against inserted peptides + my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna'); + my $upstream_trans = $upstream_cds->translate()->seq(); + + ## Test whether alt peptide matches the reference sequence just before the variant + my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ; + my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'})); + + if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) { + + $hgvs_notation->{type} = 'dup'; + $hgvs_notation->{end} = $hgvs_notation->{start} -1; + $hgvs_notation->{start} -= length($hgvs_notation->{alt}); + + } + + return $hgvs_notation; + +} + +#### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's +sub _stop_loss_extra_AA{ + + my $self = shift; + my $ref_var_pos = shift; ### first effected AA - supply for frameshifts + my $test = shift; + + return undef unless $ref_var_pos; + + my $extra_aa; + + ### get the sequence with variant added + my $alt_cds = $self->_get_alternate_cds(); + return undef unless defined($alt_cds); + + ### get new translation + my $alt_trans = $alt_cds->translate(); + + my $ref_temp = $self->transcript->translate()->seq; + my $ref_len = length($ref_temp); + + if($DEBUG==1){ + print "alt translated:\n" . $alt_trans->seq() . "\n"; + print "ref translated:\n$ref_temp\n";; + } + + #### Find the number of residues that are translated until a termination codon is encountered + if ($alt_trans->seq() =~ m/\*/) { + if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";} + + if($test eq "fs" ){ + ### frame shift - count from first AA effected by variant to stop + $extra_aa = $+[0] - $ref_var_pos; + if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";} + } + + else{ + $extra_aa = $+[0] - 1 - $ref_len; + if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";} + } + } + + # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon + if(defined $extra_aa && $extra_aa >0){ + return $extra_aa ; + } + else{ + #warn "No stop found in alternate sequence\n"; + return undef; + } + +} + +=head +# We haven't implemented support for these methods yet + +sub hgvs_rna { + return _hgvs_generic(@_,'rna'); +} + +sub hgvs_mitochondrial { + return _hgvs_generic(@_,'mitochondrial'); +} + +=cut + +sub _hgvs_generic { + + my $self = shift; + my $reference = pop; + my $notation = shift; + + #ÊThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these + return undef if ($reference =~ m/rna|mitochondrial/); + + my $sub = qq{hgvs_$reference}; + + $self->{$sub} = $notation if defined $notation; + + unless ($self->{$sub}) { + + # Use the transcript this VF is on as the reference feature + my $reference_feature = $self->transcript; + # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region + $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic'); + + # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles + $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation)); + } + + return $self->{$sub}; +} + + +### HGVS: move variant to transcript slice +sub _var2transcript_slice_coords{ + + my $self = shift; + + my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature + my $tr_vf = $self->variation_feature->transfer($ref_slice); + + # Return undef if this VariationFeature does not fall within the supplied feature. + return undef if (!defined $tr_vf || + $tr_vf->start < 1 || + $tr_vf->end < 1 || + $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) || + $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1)); + + return( $tr_vf->start() , $tr_vf->end(), $ref_slice); +} + + + +### HGVS: get variant position in transcript + +# intron: +# If the position is in an intron, the boundary position of the closest exon and +# a + or - offset into the intron is returned. +# Ordered by genome forward not 5' -> 3' + +# upstream: +# If the position is 5' of the start codon, it is reported relative to the start codon +# (-1 being the last nucleotide before the 'A' of ATG). + +#downstream: +# If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset +# from the start codon (*1 being the first nucleotide after the last position of the stop codon) + +sub _get_cDNA_position { + + my $self = shift; + my $position = shift; ### start or end of variation + + my $transcript = $self->transcript(); + my $strand = $transcript->strand(); + + #### TranscriptVariation start/stop coord relative to transcript + #### Switch to chromosome coordinates taking into account strand + $position = ( $strand > 0 ? + ( $self->transcript->start() + $position - 1 ) : + ( $self->transcript->end() - $position + 1)); + + + + # Get all exons and sort them in positional order + my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()}; + my $n_exons = scalar(@exons); + + my $cdna_position; + # Loop over the exons and get the coordinates of the variation in exon+intron notation + for (my $i=0; $i<$n_exons; $i++) { + + # Skip if the start point is beyond this exon + next if ($position > $exons[$i]->end()); + + + # EXONIC: If the start coordinate is within this exon + if ($position >= $exons[$i]->start()) { + # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation + # If the transcript is in the opposite direction, count from the end instead + $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ? + ( $position - $exons[$i]->start ) : + ( $exons[$i]->end() - $position ) + ); + last; #### last exon checked + } + ## INTRONIC + # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one + else { + + my $updist = ($position - $exons[$i-1]->end()); + my $downdist = ($exons[$i]->start() - $position); + + # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that + if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) { + + # If the orientation is reversed, we should use the cDNA start and a '-' offset + $cdna_position = ($strand >= 0 ? + $exons[$i-1]->cdna_end($transcript) . '+' : + $exons[$i-1]->cdna_start($transcript) . '-') + . $updist; + } + # Else if downstream is shortest... + else { + # If the orientation is reversed, we should use the cDNA end and a '+' offset + $cdna_position = ($strand >= 0 ? + $exons[$i]->cdna_start($transcript) . '-' : + $exons[$i]->cdna_end($transcript) . '+') + . $downdist; + } + last; ## last exon checked + } + } + + # Shift the position to make it relative to the start & stop codons + my $start_codon = $transcript->cdna_coding_start(); + my $stop_codon = $transcript->cdna_coding_end(); + + # Disassemble the cDNA coordinate into the exon and intron parts + ### just built this now taking it appart again + my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/; + + + # Start by correcting for the stop codon + if (defined($stop_codon) && $cdna_coord > $stop_codon) { + # Get the offset from the stop codon + $cdna_coord -= $stop_codon; + # Prepend a * to indicate the position is in the 3' UTR + $cdna_coord = '*' . $cdna_coord; + } + elsif (defined($start_codon)) { + # If the position is beyond the start codon, add 1 to get the correct offset + $cdna_coord += ($cdna_coord >= $start_codon); + # Subtract the position of the start codon + $cdna_coord -= $start_codon; + } + + # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363] + $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : ''); + return $cdna_position; +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/BaseVepFilterPlugin.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/BaseVepFilterPlugin.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,112 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::BaseVepFilterPlugin + +=head1 SYNOPSIS + + # a simple example filter plugin that excludes all + # lines that are not non-synonymous changes (defined + # as those which have alternate peptides in their + # pep_allele_string) + + package NonSynonymousFilter; + + use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepFilterPlugin); + + sub feature_types { + return ['Transcript']; + } + + sub include_line { + my ($self, $tva) = @_; + + if (my $pep_alleles = $tva->pep_allele_string) { + return $pep_alleles =~ /\//; + } + + return 0; + } + + 1; + +=head1 DESCRIPTION + +This is a subclass of BaseVepPlugin aimed to make plugins that act like +filters very straightforward to write. Users should subclass this module +and then need only override the include_line method which should return +a true value if the current line should be included, or a false value +to filter the line out. A filter can then be used with the VEP just as +any other plugin by using the --plugin command line option. + +=cut + +package Bio::EnsEMBL::Variation::Utils::BaseVepFilterPlugin; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +=head2 include_line + + Arg[1] : An instance of a subclass of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele + Arg[2] : A hashref containing all the data that will be printed on this line, keyed by column name + Description: This method should return true if the plugin wants to filter out this line and + false otherwise. + Returntype : boolean + Status : Experimental + +=cut + +sub include_line { + my ($self, $vfoa, $line_hash) = @_; + warn "VEP filter plugins should implement the 'include_line' method\n"; + return 1; +} + +# the following methods just override the BaseVepPlugin methods +# providing default configuration for filter plugins + +sub run { + my ($self, $vfoa, $line_hash) = @_; + + # all run does for filters is check if the plugin wants to + # include this line and then returns the appropriate type: + # an empty hashref to include the line, or undef to filter + # out the line + + return $self->include_line($vfoa, $line_hash) ? {} : undef; +} + +sub get_header_info { + # we don't want to include anything in the header + return {}; +} + +sub feature_types { + # default to running for all feature types + return ['Transcript', 'RegulatoryFeature', 'MotifFeature']; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/BaseVepPlugin.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/BaseVepPlugin.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,332 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::BaseVepPlugin + +=head1 SYNOPSIS + + package FunkyPlugin; + + use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + + sub feature_types { + return ['Transcript']; + } + + sub get_header_info { + return { + FUNKY_PLUGIN => "Description of funky plugin" + }; + } + + sub run { + my ($self, $transcript_variation_allele) = @_; + + my $results = ... # do analysis + + return { + FUNKY_PLUGIN => $results + }; + } + + 1; + +=head1 DESCRIPTION + +To make writing plugin modules for the VEP easier, get +your plugin to inherit from this class, override (at least) +the feature_types, get_header_info and run methods to behave +according to the documentation below, and then run the VEP +with your plugin using the --plugin command +line option. + +=cut + +package Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; + +use strict; +use warnings; + +=head2 new + + Arg [1] : a VEP configuration hashref + Arg [>1] : any parameters passed on the VEP command line, will be stored as a listref in $self->{params} + Description: Creates and returns a new instance of this plugin + Returntype : Bio::EnsEMBL::Variation::Utils::BaseVepPlugin instance (most likely a user-defined subclass) + Status : Experimental + +=cut + +sub new { + my ($class, $config, @params) = @_; + + # default to the current VEP version, and analysing VariationFeatures and + # Transcripts (which we expect to be the most common usage, this means that + # the run method will be called with a TranscriptVariationAllele as the + # first argument) + + return bless { + version => '2.3', + feature_types => ['Transcript'], + variant_feature_types => ['VariationFeature'], + config => $config, + params => \@params, + }, $class; +} + +=head2 version + + Arg [1] : (optional) a version number string in the form N.N.N + Description: Get/set the version of this plugin. The version should + match the version of the VEP that this plugin works with + (at least in the major version number). This is used to + detect compatibility between the VEP and plugins. + Returntype : string + Status : Experimental + +=cut + +sub version { + my ($self, $version) = @_; + $self->{version} = $version if $version; + return $self->{version}; +} + +=head2 config + + Arg [1] : a VEP configuration hashref + Description: Get/set the VEP configuration hashref + Returntype : hashref + Status : Experimental + +=cut + +sub config { + my ($self, $config) = @_; + $self->{config} = $config if $config; + return $self->{config}; +} + +=head2 params + + Arg [1] : (optional) a listref of plugin parameters + Description: Get/set the parameters of this plugin, typically as passed on the VEP command line. + Returntype : listref + Status : Experimental + +=cut + +sub params { + my ($self, $params) = @_; + $self->{params} = $params if $params; + return $self->{params} || []; +} + +=head2 get_header_info + + Description: Return a hashref with any Extra column keys as keys and a description + of the data as a value, this will be included in the VEP output file + header to help explain the results of this plugin. Plugins that do + not want to include anything in the header should return undef. + Returntype : hashref or undef + Status : Experimental + +=cut + +sub get_header_info { + my ($self) = @_; + return undef; +} + +=head2 variant_feature_types + + Description: To indicate which types of variation features a plugin is interested + in, plugins should return a listref of the types of variation feature + they can deal with. Currently this list should include one or more of: + 'VariationFeature' or 'StructuralVariationFeature' + Returntype : listref + Status : Experimental + +=cut + +sub variant_feature_types { + my ($self, $types) = @_; + $self->{variant_feature_types} = $types if $types; + return $self->{variant_feature_types}; +} + +=head2 feature_types + + Description: To indicate which types of genomic features a plugin is interested + in, plugins should return a listref of the types of feature they can deal + with. Currently this list should include one or more of: 'Transcript', + 'RegulatoryFeature' and 'MotifFeature' + Returntype : listref + Status : Experimental + +=cut + +sub feature_types { + my ($self, $types) = @_; + $self->{feature_types} = $types if $types; + return $self->{feature_types}; +} + +sub _check_types { + my ($self, $type_type, $type) = @_; + + # if we're passed an object instead of a type string + # get the type of reference and use that + if (ref $type) { + $type = ref $type; + } + + # $type_type will either be 'variant_feature' or 'feature' + # so construct the method to call and the hash key to + # store the cached results under + + my $method = $type_type.'_types'; + my $hash_key = $method.'_wanted'; + + unless (defined $self->{$hash_key}->{$type}) { + + # cache the result so we don't have to loop each time + + $self->{$hash_key}->{$type} = 0; + + for my $wanted (@{ $self->$method }) { + + # special case the intergenic class + + if ($wanted eq 'Intergenic') { + if ($wanted eq $type) { + $self->{$hash_key}->{$type} = 1; + last; + } + } + else { + + # we are fairly relaxed about how the user describes features, it can + # be the fully qualified class name, or just the specific module name, + # (i.e. the text after the last '::') in which case we automatically + # fully qualify it + + if ($wanted !~ /::/) { + + if ($type_type eq 'feature') { + + if ($wanted eq 'RegulatoryFeature' || $wanted eq 'MotifFeature') { + $wanted = "Bio::EnsEMBL::Funcgen::$wanted"; + } + else { + $wanted = "Bio::EnsEMBL::$wanted"; + } + } + elsif ($type_type eq 'variant_feature') { + $wanted = "Bio::EnsEMBL::Variation::$wanted"; + } + } + + if ($type->isa($wanted)) { + $self->{$hash_key}->{$type} = 1; + last; + } + } + } + } + + return $self->{$hash_key}->{$type}; +} + +=head2 check_feature_type + + Arg[1] : the feature type as a string (or a reference to an object of the desired type) + Description: Check if this plugin is interested in a particular feature type + Returntype : boolean + Status : Experimental + +=cut + +sub check_feature_type { + my ($self, $type) = @_; + return $self->_check_types('feature', $type); +} + +=head2 check_variant_feature_type + + Arg[1] : the variant feature type as a string (or a reference to an object of the desired type) + Description: Check if this plugin is interested in a particular variant feature type + Returntype : boolean + Status : Experimental + +=cut + +sub check_variant_feature_type { + my ($self, $type) = @_; + return $self->_check_types('variant_feature', $type); +} + +=head2 run + + Arg[1] : An instance of a subclass of Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele + Arg[2] : A hashref containing all the data that will be printed on this line, keyed by column name + Description: Run this plugin, this is where most of the plugin logic should reside. + When the VEP is about to finish one line of output (for a given variation-allele-feature + combination) it will call this method, passing it a relevant subclass of a + Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele object according to + feature types it is interested in, as specified by the plugin's feature_types method: + + feature type argument type + + 'Transcript' Bio::EnsEMBL::Variation::TranscriptVariationAllele + 'RegulatoryFeature' Bio::EnsEMBL::Variation::RegulatoryFeatureVariationAllele + 'MotifFeature' Bio::EnsEMBL::Variation::MotifFeatureVariationAllele + + Once the plugin has done its analysis it should return the results as a hashref + with a key for each type of data (which should match the keys described in + get_header_info) and a corresponding value for this particular object, or an empty + hash (*not* undef) if this plugin does not produce any annotation for this object. + Any edata will then be included in the Extra column in the VEP output file. + Please refer to the variation API documentation to see what methods are available + on each of the possible classes, bearing in mind that common functionality can be + found in the BaseVariationFeatureOverlapAllele superclass. + + If the plugin wants to filter this line out of the VEP output it can indicate + this by returning undef rather than a hashref. Using this mechanism a plugin + can act as a filter on the VEP output to limit lines to particular events + of interest. If you are writing a plugin to act as filter, consider subclassing + Bio::EnsEMBL::Variation::Utils::BaseVepFilterPlugin, a subclass of this class + which provides some convenient functionality for filter plugins. + + Returntype : hashref or undef if the plugin wants to filter out this line + Status : Experimental + +=cut + +sub run { + my ($self, $bvfoa, $line_hash) = @_; + warn "VEP plugins should implement the 'run' method\n"; + return {}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,337 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::ComparaUtils + +=head1 DESCRIPTION + +This module exports two subroutines, dump_alignment_for_polyphen and +dump_alignment_for_sift that write Compara alignments to files in the +formats expected by PolyPhen and SIFT. This allows you to use +Compara alignments in place of both these tools' alignment pipelines + +=cut + +package Bio::EnsEMBL::Variation::Utils::ComparaUtils; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Registry; +use Bio::SimpleAlign; +use Bio::AlignIO; +use Bio::LocatableSeq; + +use base qw(Exporter); + +our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift); + +my $MAX_PSIC_SEQS = 8190; +my $MAX_PSIC_SEQLEN = 409650; + +sub _ungap_alignment { + + # turn a gapped alignment into an ungapped alignment + # with respect to the given query_id, + # + # e.g. if our query sequence is no. 1 below, we want to + # turn this: + # + # 1: -ABC--DEFG-H--- + # 2: ---IJKLMN--OPQR + # 3: ST------UVWXYZ- + # + # into: + # + # 1: ABCDEFGH + # 2: --ILMN-O + # 3: T----UVX + + my ($gapped_alignment, $query_id, $include_query) = @_; + + my @seqs = $gapped_alignment->each_seq; + + # find the Seq object corresponding to the query id + + my $query_seq; + + for my $seq (@seqs) { + $query_seq = $seq; + last if $seq->display_id eq $query_id; + } + + throw("Could not find query sequence '$query_id' in the alignment") + unless $query_seq->display_id eq $query_id; + + my $qseq = $query_seq->seq; + + #print $qseq, " (".length($qseq).")\n"; + + # first we split the query sequence into its component parts and gaps, + # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---') + + # we grep because we don't want any empty strings that otherwise seem to + # make it through + + my @parts = grep {$_} split /-+/, $qseq; + my @gaps = grep {$_} split /[^-]+/, $qseq; + + my @chunks; + + # then we compute the coordinates of each sequence chunk, + # taking into account that we might start with a gap + + my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0; + + # we build up a list of [start, length] pairs for each chunk + # of sequence, incrementing our offset by the length of each + # chunk and the gap that follows it, for our example above this + # will result in ([1,3],[6,4],[11,1]) + + for my $part (@parts) { + + my $l = length($part); + + push @chunks, [$offset, $l]; + + #print "$part: $offset - $l\n"; + + $offset += $l + length(shift @gaps || ''); + } + + # we then use this list of chunks to obtain the aligned portions + # of all other members of the alignment and create a new alignment, + + my @seq_objs; + + for my $seq (@seqs) { + + my $old_seq = $seq->seq; + my $new_seq; + + for my $chunk (@chunks) { + $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]); + } + + next if $new_seq =~ /^-*$/; + + #my $gap_count = ($new_seq =~ tr/-//); + + #next if $gap_count / length($new_seq) > 0.1; + + if ($seq->display_id eq $query_id) { + + $query_seq = Bio::LocatableSeq->new( + -SEQ => $new_seq, + -START => 1, + -END => length($new_seq), + -ID => 'QUERY', + -STRAND => 0 + ); + + unshift @seq_objs, $query_seq if $include_query; + } + else { + push @seq_objs, Bio::LocatableSeq->new( + -SEQ => $new_seq, + -START => 1, + -END => length($new_seq), + -ID => $seq->display_id, + -STRAND => 0 + ) + } + } + + my $new_align = Bio::SimpleAlign->new; + + $new_align->add_seq($_) for @seq_objs; + + # sometimes we want the query sequence back as well as the new alignment + + return wantarray ? ($query_seq, $new_align) : $new_align; +} + +sub _get_ungapped_alignment { + + # get an ungapped Bio::SimpleAlign for the given query translation + + my ($translation_stable_id, $include_query) = @_; + + my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara') + or throw("Failed to get compara DBAdaptor"); + + my $ma = $compara_dba->get_MemberAdaptor + or throw("Failed to get member adaptor"); + + my $fa = $compara_dba->get_FamilyAdaptor + or throw("Failed to get family adaptor"); + + my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id) + or throw("Didn't find family member for $translation_stable_id"); + + my $fams = $fa->fetch_all_by_Member($member) + or throw("Didn't find a family for $translation_stable_id"); + + throw("$translation_stable_id is in more than one family") if @$fams > 1; + + my $orig_align = $fams->[0]->get_SimpleAlign; + + $compara_dba->dbc->disconnect_if_idle; + + return _ungap_alignment( + $orig_align, + $translation_stable_id, + $include_query + ); +} + +sub _percent_id { + my ($q, $a) = @_; + + my @q = split //, $q->seq; + my @a = split //, $a->seq; + + my $num = scalar(@q); + + my $tot = 0.0; + + for (my $i = 0; $i < $num; $i++ ) { + $tot++ if $q[$i] eq $a[$i]; + } + + return ($tot / $num); +} + +=head2 dump_alignment_for_polyphen + + Arg[1] : string $translation_stable_id - the stable of the Ensembl translation + you want to run PolyPhen on + Arg[2] : string $file - the name of the file you want to write the alignment to + Description : Fetches the Compara protein family containing the specified translation + (if available), ungaps the alignment with respect to the translation, and + writes the alignment to the specified file in the format expected by PolyPhen + Returntype : none + Exceptions : throws if an alignment cannot be found, or if the file cannot be written + Status : At Risk + +=cut + +sub dump_alignment_for_polyphen { + + my ($translation_stable_id, $file) = @_; + + # polyphen does not want the query included in the alignment + + my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0); + + my @seqs = $alignment->each_seq; + + unless (scalar(@seqs)) { + throw("No sequences in the alignment for $translation_stable_id"); + } + + if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) { + throw("$translation_stable_id sequence too long for PSIC"); + } + + # polyphen expects the alignment to be sorted descending by % id + + my $percent_id; + + for my $seq (@seqs) { + $percent_id->{$seq->id} = _percent_id($query_seq, $seq); + } + + my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs; + + # the format is similar to a clustalw .aln file, but it *must* have a + # 70 character description beginning each line, and then the alignment string + # on the rest of the line with no line breaks between sequences, PSIC + # can also only deal with a fixed maximum sequence length and number + # of sequences on the alignment, these are set in the constants at the + # top of this file + + my $num_seqs = scalar(@seqs); + + open my $ALN, ">$file" or throw("Failed to open $file for writing"); + + print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n"; + + my $count = 0; + + for my $seq (@sorted) { + if ($percent_id->{$seq->id} == 1) { + #warn "Ignoring identical seq ".$seq->id."\n"; + #next; + } + + my $id = $seq->id; + my $extra = 70 - length($id); + + print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n"; + + last if ++$count >= $MAX_PSIC_SEQS; + } + + close $ALN; +} + +=head2 dump_alignment_for_sift + + Arg[1] : string $translation_stable_id - the stable id of the Ensembl translation + you want to run SIFT on + Arg[2] : string $file - the name of the file you want to write the alignment to + Description : Fetches the Compara protein family containing the specified translation + (if available), ungaps the alignment with respect to the translation, and + writes the alignment to the specified file in the format expected by SIFT + Returntype : none + Exceptions : throws if an alignment cannot be found, or if the file cannot be written + Status : At Risk + +=cut + +sub dump_alignment_for_sift { + + my ($translation_stable_id, $file) = @_; + + # SIFT is easy, we just fetch an ungapped alignment including the query + # and dump it in FASTA format + + my $aln = _get_ungapped_alignment($translation_stable_id, 1); + + open my $ALN, ">$file" or throw("Failed to open $file for writing"); + + my $alignIO = Bio::AlignIO->newFh( + -interleaved => 0, + -fh => $ALN, + -format => 'fasta', + -idlength => 20, + ); + + print $alignIO $aln; + + close $ALN; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Config.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Config.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,872 @@ +package Bio::EnsEMBL::Variation::Utils::Config; + +use base qw(Exporter); + +our @EXPORT_OK = qw( + @ATTRIB_TYPES + %ATTRIBS + @ATTRIB_SETS + @VARIATION_CLASSES + @OVERLAP_CONSEQUENCES + @FEATURE_TYPES + $OVERLAP_CONSEQUENCE_CLASS + $MAX_ATTRIB_CODE_LENGTH +); + +our $OVERLAP_CONSEQUENCE_CLASS = 'Bio::EnsEMBL::Variation::OverlapConsequence'; + +our $MAX_ATTRIB_CODE_LENGTH = 20; + +our @short_names = qw(1kg_hct 1kg_hct_ceu 1kg_hct_yri 1kg_hce 1kg_hce_ceu 1kg_hce_chb + 1kg_hce_chd 1kg_hce_jpt 1kg_hce_lwk 1kg_hce_tsi 1kg_hce_yri 1kg_lc + 1kg_lc_ceu 1kg_lc_chb_jpt 1kg_lc_yri hapmap + 1kg 1kg_afr 1kg_amr 1kg_asn 1kg_eur 1kg_afr 1kg_amr_com 1kg_asn_com 1kg_eur_com + ind_venter ind_watson ind_gill ind_ak1 ind_irish ind_angrist + ind_gates_jr ind_gates_sr ind_kriek ind_quake ind_saqqaq ind_saqqaq_hc ind_sjk ind_yh + fail_all fail_nonref fail_ambig fail_gt_fq fail_incons_map fail_mult_map + fail_no_alleles fail_no_gt fail_no_map fail_no_seq fail_non_nt fail_mult_alleles fail_dbsnp_suspect + ph_hgmd_pub ph_johnson_et_al ph_nhgri ph_omim ph_variants ph_uniprot + ph_cosmic ph_ega precious hapmap_ceu hapmap_hcb hapmap_jpt hapmap_yri + Affy_500K Affy_SNP6 Cardio-Metabo_Chip HumanOmni1-Quad Illumina_1M-duo Illumina_660Q + ); + +our @dbsnp_clinical_significance_types = qw( + unknown + untested + non-pathogenic + probable-non-pathogenic + probable-pathogenic + pathogenic + drug-response + histocompatibility + other +); + +our @dgva_clinical_significance_types = ( + 'Not tested', + 'Benign', + 'Pathogenic', + 'Uncertain Significance', + 'Uncertain Significance: likely benign', + 'Uncertain Significance: likely pathogenic' +); + +our @VARIATION_CLASSES = ( + { + SO_accession => 'SO:0001483', + SO_term => 'SNV', + display_term => 'SNP', + somatic_display_term => 'somatic_SNV', + }, + { + SO_accession => 'SO:1000002', + SO_term => 'substitution', + }, + { + SO_accession => 'SO:0001019', + SO_term => 'copy_number_variation', + display_term => 'CNV', + }, + { + SO_accession => 'SO:0000667', + SO_term => 'insertion', + }, + { + SO_accession => 'SO:0000159', + SO_term => 'deletion', + }, + { + SO_accession => 'SO:1000032', + SO_term => 'indel', + }, + { + SO_accession => 'SO:0000705', + SO_term => 'tandem_repeat', + }, + { + SO_accession => 'SO:0001059', + SO_term => 'sequence_alteration', + }, + # Structural variation classes + { + SO_accession => 'SO:0001537', + SO_term => 'structural_variant', + display_term => 'SV', + }, + { + SO_accession => 'SO:0000051', + SO_term => 'probe', + display_term => 'CNV_PROBE', + }, + { + SO_accession => 'SO:0001742', + SO_term => 'copy_number_gain', + display_term => 'Gain', + }, + { + SO_accession => 'SO:0001743', + SO_term => 'copy_number_loss', + display_term => 'Loss', + }, + { + SO_accession => 'SO:1000036', + SO_term => 'inversion', + }, + { + SO_accession => 'SO:0001784', + SO_term => 'complex_structural_alteration', + display_term => 'Complex', + }, + { + SO_accession => 'SO:1000173', + SO_term => 'tandem_duplication', + display_term => 'Tandem duplication', + }, + { + SO_accession => 'SO:0001837', + SO_term => 'mobile_element_insertion', + display_term => 'Mobile element insertion', + }, + { + SO_accession => 'SO:0001873', + SO_term => 'interchromosomal_breakpoint', + display_term => 'Interchromosomal breakpoint', + }, + { + SO_accession => 'SO:0001874', + SO_term => 'intrachromosomal_breakpoint', + display_term => 'Intrachromosomal breakpoint', + }, + { + SO_accession => 'SO:0000199', + SO_term => 'translocation', + }, + { + SO_accession => 'SO:1000035', + SO_term => 'duplication', + display_term => 'Duplication', + }, +); + +our @OVERLAP_CONSEQUENCES = ( + { + SO_accession => 'SO:0001628', + SO_term => 'intergenic_variant', + display_term => 'INTERGENIC', + rank => '38', + tier => '4', + description => 'A sequence variant located in the intergenic region, between genes', + label => 'Intergenic variant', + is_default => 1, + }, + { + SO_accession => 'SO:0001631', + SO_term => 'upstream_gene_variant', + display_term => 'UPSTREAM', + feature_SO_term => 'transcript', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '24', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::upstream', + description => 'A sequence variant located 5\' of a gene', + label => 'Upstream gene variant', + }, + { + SO_accession => 'SO:0001632', + SO_term => 'downstream_gene_variant', + display_term => 'DOWNSTREAM', + feature_SO_term => 'transcript', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '25', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::downstream', + description => 'A sequence variant located 3\' of a gene', + label => 'Downstream gene variant', + }, + { + SO_accession => 'SO:0001575', + SO_term => 'splice_donor_variant', + display_term => 'ESSENTIAL_SPLICE_SITE', + NCBI_term => 'splice-5', + feature_SO_term => 'primary_transcript', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '3', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::donor_splice_site', + description => 'A splice variant that changes the 2 base region at the 5\' end of an intron', + label => 'Splice donor variant', + }, + { + SO_accession => 'SO:0001574', + SO_term => 'splice_acceptor_variant', + display_term => 'ESSENTIAL_SPLICE_SITE', + NCBI_term => 'splice-3', + feature_SO_term => 'primary_transcript', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '3', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::acceptor_splice_site', + description => 'A splice variant that changes the 2 base region at the 3\' end of an intron', + label => 'Splice acceptor variant', + }, + { + SO_accession => 'SO:0001630', + SO_term => 'splice_region_variant', + display_term => 'SPLICE_SITE', + feature_SO_term => 'primary_transcript', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '13', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::splice_region', + description => 'A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron', + label => 'Splice region variant', + }, + { + SO_accession => 'SO:0001627', + SO_term => 'intron_variant', + display_term => 'INTRONIC', + NCBI_term => 'intron', + feature_SO_term => 'primary_transcript', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '20', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_intron', + description => 'A transcript variant occurring within an intron', + label => 'Intron variant', + }, + { + SO_accession => 'SO:0001623', + SO_term => '5_prime_UTR_variant', + display_term => '5PRIME_UTR', + NCBI_term => 'untranslated_5', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '18', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_5_prime_utr', + description => 'A UTR variant of the 5\' UTR', + label => '5 prime UTR variant', + }, + { + SO_accession => 'SO:0001624', + SO_term => '3_prime_UTR_variant', + display_term => '3PRIME_UTR', + NCBI_term => 'untranslated_3', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '19', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_3_prime_utr', + description => 'A UTR variant of the 3\' UTR', + label => '3 prime UTR variant', + }, +# { +# SO_accession => 'SO:0001577', +# SO_term => 'complex_change_in_transcript', +# display_term => 'COMPLEX_INDEL', +# feature_SO_term => 'primary_transcript', +# feature_class => 'Bio::EnsEMBL::Transcript', +# variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', +# rank => '4', +# tier => '3', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::complex_indel', +# description => 'Insertion or deletion that spans an exon/intron or coding sequence/UTR border', +# label => 'Complex change in transcript', +# }, + { + SO_accession => 'SO:0001819', + SO_term => 'synonymous_variant', + display_term => 'SYNONYMOUS_CODING', + NCBI_term => 'cds-synon', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '15', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::synonymous_variant', + description => 'A sequence variant where there is no resulting change to the encoded amino acid', + label => 'Synonymous variant', + }, + { + SO_accession => 'SO:0001583', + SO_term => 'missense_variant', + display_term => 'NON_SYNONYMOUS_CODING', + NCBI_term => 'missense', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '12', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::missense_variant', + description => 'A sequence variant, where the change may be longer than 3 bases, and at least one base of a codon is changed resulting in a codon that encodes for a different amino acid', + label => 'Missense variant', + }, + { + SO_accession => 'SO:0001821', + SO_term => 'inframe_insertion', + display_term => 'NON_SYNONYMOUS_CODING', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '10', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::inframe_insertion', + description => 'An inframe non synonymous variant that inserts bases into in the coding sequence', + label => 'Inframe insertion', + }, + { + SO_accession => 'SO:0001822', + SO_term => 'inframe_deletion', + display_term => 'NON_SYNONYMOUS_CODING', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '11', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::inframe_deletion', + description => 'An inframe non synonymous variant that deletes bases from the coding sequence', + label => 'Inframe deletion', + }, + { + SO_accession => 'SO:0001587', + SO_term => 'stop_gained', + display_term => 'STOP_GAINED', + NCBI_term => 'nonsense', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '4', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::stop_gained', + description => 'A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript', + label => 'Stop gained', + }, + { + SO_accession => 'SO:0001578', + SO_term => 'stop_lost', + display_term => 'STOP_LOST', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '6', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::stop_lost', + description => 'A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript', + label => 'Stop lost', + }, + { + SO_accession => 'SO:0001567', + SO_term => 'stop_retained_variant', + display_term => 'SYNONYMOUS_CODING', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '15', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::stop_retained', + description => 'A sequence variant where at least one base in the terminator codon is changed, but the terminator remains', + label => 'Stop retained variant', + }, + { + SO_accession => 'SO:0001582', + SO_term => 'initiator_codon_variant', + display_term => 'NON_SYNONYMOUS_CODING', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '7', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::affects_start_codon', + description => 'A codon variant that changes at least one base of the first codon of a transcript', + label => 'Initiator codon variant', + }, + { + SO_accession => 'SO:0001589', + SO_term => 'frameshift_variant', + display_term => 'FRAMESHIFT_CODING', + NCBI_term => 'frameshift', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '5', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::frameshift', + description => 'A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three', + label => 'Frameshift variant', + }, + { + SO_accession => 'SO:0001626', + SO_term => 'incomplete_terminal_codon_variant', + display_term => 'PARTIAL_CODON', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::VariationFeature', + rank => '14', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::partial_codon', + description => 'A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed', + label => 'Incomplete terminal codon variant', + }, + { + SO_accession => 'SO:0001621', + SO_term => 'NMD_transcript_variant', + display_term => 'NMD_TRANSCRIPT', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '21', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_nmd_transcript', + description => 'A variant in a transcript that is the target of NMD', + label => 'NMD transcript variant', + }, + { + SO_accession => 'SO:0001619', + SO_term => 'nc_transcript_variant', + display_term => 'WITHIN_NON_CODING_GENE', + feature_SO_term => 'ncRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '23', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_non_coding_gene', + description => 'A transcript variant of a non coding RNA', + label => 'NC transcript variant', + }, + { + SO_accession => 'SO:0001792', + SO_term => 'non_coding_exon_variant', + display_term => 'WITHIN_NON_CODING_GENE', + feature_SO_term => 'ncRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '22', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::non_coding_exon_variant', + description => 'A sequence variant that changes non-coding exon sequence', + label => 'Non coding exon variant', + }, + { + SO_accession => 'SO:0001620', + SO_term => 'mature_miRNA_variant', + display_term => 'WITHIN_MATURE_miRNA', + feature_SO_term => 'miRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '17', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_mature_miRNA', + description => 'A transcript variant located with the sequence of the mature miRNA', + label => 'Mature miRNA variant', + }, + { + SO_accession => 'SO:0001580', + SO_term => 'coding_sequence_variant', + display_term => 'CODING_UNKNOWN', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '16', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::coding_unknown', + description => 'A sequence variant that changes the coding sequence', + label => 'Coding sequence variant', + }, + { + SO_accession => 'SO:0001566', + SO_term => 'regulatory_region_variant', + display_term => 'REGULATORY_REGION', + feature_SO_term => 'regulatory_region', + feature_class => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '36', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_regulatory_feature', + description => 'A sequence variant located within a regulatory region', + label => 'Regulatory region variant', + }, +# { +# SO_accession => 'SO:X000005', +# SO_term => 'pre_miRNA_variant', +# display_term => 'WITHIN_NON_CODING_GENE', +# feature_SO_term => 'miRNA', +# feature_class => 'Bio::EnsEMBL::Transcript', +# rank => '13', +# tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_miRNA', +# }, +# { +# SO_accession => 'SO:X000004', +# SO_term => 'miRNA_target_site_variant', +# display_term => 'REGULATORY_REGION', +# feature_SO_term => 'binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::ExternalFeature', +# rank => '13', +# tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_miRNA_target_site', +# description => 'In regulatory region annotated by Ensembl', +# label => 'Regulatory region', +# }, + { + SO_accession => 'SO:0001782', + SO_term => 'TF_binding_site_variant', + display_term => 'REGULATORY_REGION', + feature_SO_term => 'TF_binding_site', + feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '30', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_motif_feature', + description => 'In regulatory region annotated by Ensembl', + label => 'A sequence variant located within a transcription factor binding site', + }, + +# { +# SO_accession => 'SO:X000002', +# SO_term => 'decreased_binding_affinity', +# display_term => 'REGULATORY_REGION', +# feature_SO_term => 'binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', +# rank => '47', +# tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::decreased_binding_affinity', +# }, +# { +# SO_accession => 'SO:X000001', +# SO_term => 'increased_binding_affinity', +# display_term => 'REGULATORY_REGION', +# feature_SO_term => 'binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', +# rank => '48', +# tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::increased_binding_affinity', +# }, + + + ## NEW FOR 68 + ############# + + { + SO_accession => 'SO:0001893', + SO_term => 'transcript_ablation', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '1', + tier => '1', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_ablation', + description => 'A feature ablation whereby the deleted region includes a transcript feature', + label => 'Transcript ablation', + }, +# { +# SO_accession => 'SO:0001886', +# SO_term => 'transcript_fusion', +# feature_SO_term => 'mRNA', +# feature_class => 'Bio::EnsEMBL::Transcript', +# variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', +# rank => '2', +# tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::transcript_fusion', +# description => 'A feature fusion where the deletion brings together transcript regions', +# label => 'Transcript fusion', +# }, + { + SO_accession => 'SO:0001889', + SO_term => 'transcript_amplification', + feature_SO_term => 'mRNA', + feature_class => 'Bio::EnsEMBL::Transcript', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '8', + tier => '1', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_amplification', + description => 'A feature amplification of a region containing a transcript', + label => 'Transcript amplification', + }, +# { +# SO_accession => 'SO:0001883', +# SO_term => 'transcript_translocation', +# feature_SO_term => 'mRNA', +# feature_class => 'Bio::EnsEMBL::Transcript', +# variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', +# rank => '9', + #tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::transcript_translocation', +# description => 'A feature translocation where the region contains a transcript', +# label => 'Transcript translocation', +# }, + { + SO_accession => 'SO:0001895', + SO_term => 'TFBS_ablation', + feature_SO_term => 'TF_binding_site', + feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '26', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_ablation', + description => 'A feature ablation whereby the deleted region includes a transcription factor binding site', + label => 'TFBS ablation', + }, +# { +# SO_accession => 'SO:0001888', +# SO_term => 'TFBS_fusion', +# feature_SO_term => 'TF_binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', +# variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', +# rank => '27', + #tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::motif_feature_fusion', +# description => 'A fusion where the deletion brings together transcription factor binding sites', +# label => 'TFBS fusion', +# }, + { + SO_accession => 'SO:0001892', + SO_term => 'TFBS_amplification', + feature_SO_term => 'TF_binding_site', + feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '28', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_amplification', + description => 'A feature amplification of a region containing a transcription factor binding site', + label => 'TFBS amplification', + }, +# { +# SO_accession => 'SO:0001885', +# SO_term => 'TFBS_translocation', +# feature_SO_term => 'TF_binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', +# variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', +# rank => '29', + #tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::motif_feature_translocation', +# description => 'A feature translocation where the region contains a transcription factor binding site', +# label => 'TFBS translocation', +# }, + { + SO_accession => 'SO:0001894', + SO_term => 'regulatory_region_ablation', + feature_SO_term => 'TF_binding_site', + feature_class => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '31', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_ablation', + description => 'A feature ablation whereby the deleted region includes a regulatory region', + label => 'Regulatory region ablation', + }, +# { +# SO_accession => 'SO:0001887', +# SO_term => 'regulatory_region_fusion', +# feature_SO_term => 'TF_binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature', +# variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', +# rank => '32', + #tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::regulatory_feature_fusion', +# description => 'A fusion where the deletion brings together regulatory regions', +# label => 'Regulatory region fusion', +# }, + { + SO_accession => 'SO:0001891', + SO_term => 'regulatory_region_amplification', + feature_SO_term => 'TF_binding_site', + feature_class => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '33', + tier => '2', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_amplification', + description => 'A feature amplification of a region containing a regulatory region', + label => 'Regulatory region amplification', + }, +# { +# SO_accession => 'SO:0001884', +# SO_term => 'regulatory_region_translocation', +# feature_SO_term => 'TF_binding_site', +# feature_class => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature', +# variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', +# rank => '34', + #tier => '2', +# predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::regulatory_feature_translocation', +# description => 'A feature translocation where the region contains a regulatory region', +# label => 'Regulatory region translocation', +# }, + { + SO_accession => 'SO:0001907', + SO_term => 'feature_elongation', + feature_class => 'Bio::EnsEMBL::Feature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '36', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_elongation', + description => 'A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence', + label => 'Feature elongation', + }, + { + SO_accession => 'SO:0001906', + SO_term => 'feature_truncation', + feature_class => 'Bio::EnsEMBL::Feature', + variant_feature_class => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + rank => '37', + tier => '3', + predicate => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_truncation', + description => 'A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence', + label => 'Feature truncation', + }, +); + +our @FEATURE_TYPES = ( + { + SO_accession => 'SO:0000234', + SO_term => 'mRNA', + ens_feature_class => 'Bio::EnsEMBL::Transcript', + ens_feature_subtype => 'protein_coding', + ens_variant_class => 'Bio::EnsEMBL::Variation::TranscriptVariation', + }, + { + SO_accession => 'SO:0000673', + SO_term => 'transcript', + ens_feature_class => 'Bio::EnsEMBL::Transcript', + ens_variant_class => 'Bio::EnsEMBL::Variation::TranscriptVariation', + }, + { + SO_accession => 'SO:0000185', + SO_term => 'primary_transcript', + ens_feature_class => 'Bio::EnsEMBL::Transcript', + ens_variant_class => 'Bio::EnsEMBL::Variation::TranscriptVariation', + }, + { + SO_accession => 'SO:0000655', + SO_term => 'ncRNA', + ens_feature_class => 'Bio::EnsEMBL::Transcript', + ens_variant_class => 'Bio::EnsEMBL::Variation::TranscriptVariation', + }, + { + SO_accession => 'SO:0000276', + SO_term => 'miRNA', + ens_feature_class => 'Bio::EnsEMBL::Transcript', + ens_variant_class => 'Bio::EnsEMBL::Variation::TranscriptVariation', + }, + { + SO_accession => 'SO:0005836', + SO_term => 'regulatory_region', + ens_feature_class => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature', + ens_variant_class => 'Bio::EnsEMBL::Variation::RegulatoryFeatureVariation', + }, + { + SO_accession => 'SO:0000409', + SO_term => 'binding_site', + ens_feature_class => 'Bio::EnsEMBL::Funcgen::MotifFeature', + ens_variant_class => 'Bio::EnsEMBL::Variation::MotifFeatureVariation', + }, + { + SO_accession => 'SO:0005836', + SO_term => 'regulatory_region', + ens_feature_class => 'Bio::EnsEMBL::Funcgen::ExternalFeature', + ens_variant_class => 'Bio::EnsEMBL::Variation::ExternalFeatureVariation', + ens_feature_subtype => 'VISTA enhancer set', + }, + { + SO_accession => 'SO:0000409', + SO_term => 'binding_site', + ens_feature_class => 'Bio::EnsEMBL::Funcgen::ExternalFeature', + ens_variant_class => 'Bio::EnsEMBL::Variation::ExternalFeatureVariation', + ens_feature_subtype => 'cisRED motif', + }, + { + SO_accession => 'SO:0005836', + SO_term => 'regulatory_region', + ens_feature_class => 'Bio::EnsEMBL::Funcgen::ExternalFeature', + ens_variant_class => 'Bio::EnsEMBL::Variation::ExternalFeatureVariation', + ens_feature_subtype => 'miRanda miRNA target', + }, + { + SO_accession => 'SO:0000110', + SO_term => 'sequence_feature', + ens_feature_class => 'Bio::EnsEMBL::Feature', + ens_variant_class => 'Bio::EnsEMBL::Variation::StructuralVariationFeatureOverlap', + }, +); + +# attrib_types are specified as hashrefs in the @ATTRIB_TYPES array. Each hashref should have a value for the key 'code' and optionally values for the keys 'name' and 'description' +our @ATTRIB_TYPES = ( + { + code => 'SO_accession', + description => 'Sequence Ontology accession', + }, + { + code => 'SO_term', + description => 'Sequence Ontology term', + }, + { + code => 'display_term', + description => 'Ensembl display term', + }, + { + code => 'NCBI_term', + description => 'NCBI term', + }, + { + code => 'feature_SO_term', + description => 'Sequence Ontology term for the associated feature', + }, + { + code => 'rank', + description => 'Relative severity of this variation consequence', + }, + { + code => 'polyphen_prediction', + description => 'PolyPhen-2 prediction', + }, + { + code => 'sift_prediction', + description => 'SIFT prediction', + }, + { + code => 'short_name', + name => 'Short name', + description => 'A shorter name for an instance, e.g. a VariationSet', + }, + { + code => 'dbsnp_clin_sig', + name => 'dbSNP clinical significance', + description => 'The clinical significance of a variant as reported by dbSNP', + }, + { + code => 'dgva_clin_sig', + name => 'DGVa clinical significance', + description => 'The clinical significance of a structural variant as reported by DGVa', + }, + { + code => 'prot_func_analysis', + name => 'Protein function analysis ', + description => 'The program used to make protein function predictions', + }, + +); + +# attribs are specified in the %ATTRIBS hash, having the attrib_type code as hash key and a listref containing the attribs that will be loaded as value +our %ATTRIBS = ( + 'short_name' => \@short_names, + 'dbsnp_clin_sig' => \@dbsnp_clinical_significance_types, + 'dgva_clin_sig' => \@dgva_clinical_significance_types, + 'polyphen_prediction' => ['probably damaging', 'possibly damaging', 'benign', 'unknown'], + 'sift_prediction' => [qw(tolerated deleterious)], + 'prot_func_analysis' => [qw(sift polyphen_humvar polyphen_humdiv)], +); + +# attrib sets are specified by putting a hashref in the @ATTRIB_SETS array having the attrib_type code as key and the attrib as value. new attrib entries will be inserted as necessary +our @ATTRIB_SETS = ( + @VARIATION_CLASSES, + @OVERLAP_CONSEQUENCES, + @FEATURE_TYPES +); + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Constants.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Constants.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,687 @@ +package Bio::EnsEMBL::Variation::Utils::Constants; + +##################################################################### +# NB: THIS FILE HAS BEEN AUTOMATICALLY GENERATED, EDIT WITH CAUTION # +##################################################################### + +use strict; +use warnings; + +use base qw(Exporter); + +our @EXPORT_OK = qw(%OVERLAP_CONSEQUENCES %VARIATION_CLASSES $DEFAULT_OVERLAP_CONSEQUENCE SO_TERM_INDEL SO_TERM_CODING_SEQUENCE_VARIANT SO_TERM_STOP_RETAINED_VARIANT SO_TERM_COMPLEX_STRUCTURAL_ALTERATION SO_TERM_SPLICE_ACCEPTOR_VARIANT SO_TERM_TANDEM_REPEAT SO_TERM_INTERCHROMOSOMAL_BREAKPOINT SO_TERM_FEATURE_ELONGATION SO_TERM_TRANSCRIPT_ABLATION SO_TERM_TF_BINDING_SITE_VARIANT ATTRIB_TYPE_DISPLAY_TERM SO_TERM_INTRON_VARIANT SO_TERM_STOP_GAINED SO_TERM_SPLICE_DONOR_VARIANT ATTRIB_TYPE_SIFT_PREDICTION SO_TERM_FEATURE_TRUNCATION SO_TERM_INTRACHROMOSOMAL_BREAKPOINT SO_TERM_SPLICE_REGION_VARIANT SO_TERM_TFBS_ABLATION SO_TERM_REGULATORY_REGION_AMPLIFICATION SO_TERM_MISSENSE_VARIANT SO_TERM_SNV SO_TERM_STRUCTURAL_VARIANT SO_TERM_PROBE ATTRIB_TYPE_DBSNP_CLIN_SIG SO_TERM_SEQUENCE_ALTERATION SO_TERM_REGULATORY_REGION_VARIANT SO_TERM_SUBSTITUTION ATTRIB_TYPE_DGVA_CLIN_SIG SO_TERM_INCOMPLETE_TERMINAL_CODON_VARIANT SO_TERM_DUPLICATION SO_TERM_TFBS_AMPLIFICATION SO_TERM_5_PRIME_UTR_VARIANT ATTRIB_TYPE_PROT_FUNC_ANALYSIS SO_TERM_TANDEM_DUPLICATION SO_TERM_MOBILE_ELEMENT_INSERTION SO_TERM_MATURE_MIRNA_VARIANT SO_TERM_NC_TRANSCRIPT_VARIANT SO_TERM_DOWNSTREAM_GENE_VARIANT SO_TERM_INFRAME_INSERTION SO_TERM_INSERTION SO_TERM_NMD_TRANSCRIPT_VARIANT ATTRIB_TYPE_SO_TERM SO_TERM_INTERGENIC_VARIANT SO_TERM_NON_CODING_EXON_VARIANT SO_TERM_SYNONYMOUS_VARIANT ATTRIB_TYPE_SHORT_NAME SO_TERM_TRANSLOCATION SO_TERM_COPY_NUMBER_VARIATION ATTRIB_TYPE_POLYPHEN_PREDICTION SO_TERM_REGULATORY_REGION_ABLATION SO_TERM_INVERSION SO_TERM_STOP_LOST ATTRIB_TYPE_RANK SO_TERM_INITIATOR_CODON_VARIANT SO_TERM_3_PRIME_UTR_VARIANT SO_TERM_UPSTREAM_GENE_VARIANT ATTRIB_TYPE_SO_ACCESSION SO_TERM_COPY_NUMBER_LOSS ATTRIB_TYPE_NCBI_TERM SO_TERM_DELETION ATTRIB_TYPE_FEATURE_SO_TERM SO_TERM_FRAMESHIFT_VARIANT SO_TERM_COPY_NUMBER_GAIN SO_TERM_INFRAME_DELETION SO_TERM_TRANSCRIPT_AMPLIFICATION); + +our %EXPORT_TAGS = ( attrib_types => [qw(ATTRIB_TYPE_RANK ATTRIB_TYPE_NCBI_TERM ATTRIB_TYPE_SHORT_NAME ATTRIB_TYPE_FEATURE_SO_TERM ATTRIB_TYPE_DISPLAY_TERM ATTRIB_TYPE_SO_ACCESSION ATTRIB_TYPE_DBSNP_CLIN_SIG ATTRIB_TYPE_PROT_FUNC_ANALYSIS ATTRIB_TYPE_SO_TERM ATTRIB_TYPE_SIFT_PREDICTION ATTRIB_TYPE_POLYPHEN_PREDICTION ATTRIB_TYPE_DGVA_CLIN_SIG)], SO_consequence_terms => [qw(SO_TERM_NC_TRANSCRIPT_VARIANT SO_TERM_DOWNSTREAM_GENE_VARIANT SO_TERM_CODING_SEQUENCE_VARIANT SO_TERM_STOP_RETAINED_VARIANT SO_TERM_INFRAME_INSERTION SO_TERM_NMD_TRANSCRIPT_VARIANT SO_TERM_SPLICE_ACCEPTOR_VARIANT SO_TERM_INTERGENIC_VARIANT SO_TERM_FEATURE_ELONGATION SO_TERM_NON_CODING_EXON_VARIANT SO_TERM_TRANSCRIPT_ABLATION SO_TERM_SYNONYMOUS_VARIANT SO_TERM_TF_BINDING_SITE_VARIANT SO_TERM_INTRON_VARIANT SO_TERM_STOP_GAINED SO_TERM_SPLICE_DONOR_VARIANT SO_TERM_FEATURE_TRUNCATION SO_TERM_REGULATORY_REGION_ABLATION SO_TERM_TFBS_ABLATION SO_TERM_SPLICE_REGION_VARIANT SO_TERM_STOP_LOST SO_TERM_INITIATOR_CODON_VARIANT SO_TERM_3_PRIME_UTR_VARIANT SO_TERM_REGULATORY_REGION_AMPLIFICATION SO_TERM_UPSTREAM_GENE_VARIANT SO_TERM_MISSENSE_VARIANT SO_TERM_REGULATORY_REGION_VARIANT SO_TERM_INCOMPLETE_TERMINAL_CODON_VARIANT SO_TERM_FRAMESHIFT_VARIANT SO_TERM_TFBS_AMPLIFICATION SO_TERM_5_PRIME_UTR_VARIANT SO_TERM_INFRAME_DELETION SO_TERM_TRANSCRIPT_AMPLIFICATION SO_TERM_MATURE_MIRNA_VARIANT)], SO_class_terms => [qw(SO_TERM_INDEL SO_TERM_SNV SO_TERM_STRUCTURAL_VARIANT SO_TERM_COMPLEX_STRUCTURAL_ALTERATION SO_TERM_INSERTION SO_TERM_PROBE SO_TERM_COPY_NUMBER_LOSS SO_TERM_SEQUENCE_ALTERATION SO_TERM_TANDEM_REPEAT SO_TERM_INTERCHROMOSOMAL_BREAKPOINT SO_TERM_SUBSTITUTION SO_TERM_DELETION SO_TERM_COPY_NUMBER_GAIN SO_TERM_DUPLICATION SO_TERM_TRANSLOCATION SO_TERM_COPY_NUMBER_VARIATION SO_TERM_TANDEM_DUPLICATION SO_TERM_INVERSION SO_TERM_INTRACHROMOSOMAL_BREAKPOINT SO_TERM_MOBILE_ELEMENT_INSERTION)], ); + +use Bio::EnsEMBL::Variation::OverlapConsequence; + +use constant ATTRIB_TYPE_SO_ACCESSION => 'SO_accession'; +use constant ATTRIB_TYPE_SO_TERM => 'SO_term'; +use constant ATTRIB_TYPE_DISPLAY_TERM => 'display_term'; +use constant ATTRIB_TYPE_NCBI_TERM => 'NCBI_term'; +use constant ATTRIB_TYPE_FEATURE_SO_TERM => 'feature_SO_term'; +use constant ATTRIB_TYPE_RANK => 'rank'; +use constant ATTRIB_TYPE_POLYPHEN_PREDICTION => 'polyphen_prediction'; +use constant ATTRIB_TYPE_SIFT_PREDICTION => 'sift_prediction'; +use constant ATTRIB_TYPE_SHORT_NAME => 'short_name'; +use constant ATTRIB_TYPE_DBSNP_CLIN_SIG => 'dbsnp_clin_sig'; +use constant ATTRIB_TYPE_DGVA_CLIN_SIG => 'dgva_clin_sig'; +use constant ATTRIB_TYPE_PROT_FUNC_ANALYSIS => 'prot_func_analysis'; + +use constant SO_TERM_SNV => 'SNV'; +use constant SO_TERM_SUBSTITUTION => 'substitution'; +use constant SO_TERM_COPY_NUMBER_VARIATION => 'copy_number_variation'; +use constant SO_TERM_INSERTION => 'insertion'; +use constant SO_TERM_DELETION => 'deletion'; +use constant SO_TERM_INDEL => 'indel'; +use constant SO_TERM_TANDEM_REPEAT => 'tandem_repeat'; +use constant SO_TERM_SEQUENCE_ALTERATION => 'sequence_alteration'; +use constant SO_TERM_STRUCTURAL_VARIANT => 'structural_variant'; +use constant SO_TERM_PROBE => 'probe'; +use constant SO_TERM_COPY_NUMBER_GAIN => 'copy_number_gain'; +use constant SO_TERM_COPY_NUMBER_LOSS => 'copy_number_loss'; +use constant SO_TERM_INVERSION => 'inversion'; +use constant SO_TERM_COMPLEX_STRUCTURAL_ALTERATION => 'complex_structural_alteration'; +use constant SO_TERM_TANDEM_DUPLICATION => 'tandem_duplication'; +use constant SO_TERM_MOBILE_ELEMENT_INSERTION => 'mobile_element_insertion'; +use constant SO_TERM_INTERCHROMOSOMAL_BREAKPOINT => 'interchromosomal_breakpoint'; +use constant SO_TERM_INTRACHROMOSOMAL_BREAKPOINT => 'intrachromosomal_breakpoint'; +use constant SO_TERM_TRANSLOCATION => 'translocation'; +use constant SO_TERM_DUPLICATION => 'duplication'; +use constant SO_TERM_INTERGENIC_VARIANT => 'intergenic_variant'; +use constant SO_TERM_UPSTREAM_GENE_VARIANT => 'upstream_gene_variant'; +use constant SO_TERM_DOWNSTREAM_GENE_VARIANT => 'downstream_gene_variant'; +use constant SO_TERM_SPLICE_DONOR_VARIANT => 'splice_donor_variant'; +use constant SO_TERM_SPLICE_ACCEPTOR_VARIANT => 'splice_acceptor_variant'; +use constant SO_TERM_SPLICE_REGION_VARIANT => 'splice_region_variant'; +use constant SO_TERM_INTRON_VARIANT => 'intron_variant'; +use constant SO_TERM_5_PRIME_UTR_VARIANT => '5_prime_UTR_variant'; +use constant SO_TERM_3_PRIME_UTR_VARIANT => '3_prime_UTR_variant'; +use constant SO_TERM_SYNONYMOUS_VARIANT => 'synonymous_variant'; +use constant SO_TERM_MISSENSE_VARIANT => 'missense_variant'; +use constant SO_TERM_INFRAME_INSERTION => 'inframe_insertion'; +use constant SO_TERM_INFRAME_DELETION => 'inframe_deletion'; +use constant SO_TERM_STOP_GAINED => 'stop_gained'; +use constant SO_TERM_STOP_LOST => 'stop_lost'; +use constant SO_TERM_STOP_RETAINED_VARIANT => 'stop_retained_variant'; +use constant SO_TERM_INITIATOR_CODON_VARIANT => 'initiator_codon_variant'; +use constant SO_TERM_FRAMESHIFT_VARIANT => 'frameshift_variant'; +use constant SO_TERM_INCOMPLETE_TERMINAL_CODON_VARIANT => 'incomplete_terminal_codon_variant'; +use constant SO_TERM_NMD_TRANSCRIPT_VARIANT => 'NMD_transcript_variant'; +use constant SO_TERM_NC_TRANSCRIPT_VARIANT => 'nc_transcript_variant'; +use constant SO_TERM_NON_CODING_EXON_VARIANT => 'non_coding_exon_variant'; +use constant SO_TERM_MATURE_MIRNA_VARIANT => 'mature_miRNA_variant'; +use constant SO_TERM_CODING_SEQUENCE_VARIANT => 'coding_sequence_variant'; +use constant SO_TERM_REGULATORY_REGION_VARIANT => 'regulatory_region_variant'; +use constant SO_TERM_TF_BINDING_SITE_VARIANT => 'TF_binding_site_variant'; +use constant SO_TERM_TRANSCRIPT_ABLATION => 'transcript_ablation'; +use constant SO_TERM_TRANSCRIPT_AMPLIFICATION => 'transcript_amplification'; +use constant SO_TERM_TFBS_ABLATION => 'TFBS_ablation'; +use constant SO_TERM_TFBS_AMPLIFICATION => 'TFBS_amplification'; +use constant SO_TERM_REGULATORY_REGION_ABLATION => 'regulatory_region_ablation'; +use constant SO_TERM_REGULATORY_REGION_AMPLIFICATION => 'regulatory_region_amplification'; +use constant SO_TERM_FEATURE_ELONGATION => 'feature_elongation'; +use constant SO_TERM_FEATURE_TRUNCATION => 'feature_truncation'; + +our %VARIATION_CLASSES = ( +'SNV' => { + 'somatic_display_term' => 'somatic_SNV', + 'SO_accession' => 'SO:0001483', + 'display_term' => 'SNP' +} +, +'substitution' => { + 'somatic_display_term' => 'somatic_substitution', + 'SO_accession' => 'SO:1000002', + 'display_term' => 'substitution' +} +, +'copy_number_variation' => { + 'somatic_display_term' => 'somatic_CNV', + 'SO_accession' => 'SO:0001019', + 'display_term' => 'CNV' +} +, +'insertion' => { + 'somatic_display_term' => 'somatic_insertion', + 'SO_accession' => 'SO:0000667', + 'display_term' => 'insertion' +} +, +'deletion' => { + 'somatic_display_term' => 'somatic_deletion', + 'SO_accession' => 'SO:0000159', + 'display_term' => 'deletion' +} +, +'indel' => { + 'somatic_display_term' => 'somatic_indel', + 'SO_accession' => 'SO:1000032', + 'display_term' => 'indel' +} +, +'tandem_repeat' => { + 'somatic_display_term' => 'somatic_tandem_repeat', + 'SO_accession' => 'SO:0000705', + 'display_term' => 'tandem_repeat' +} +, +'sequence_alteration' => { + 'somatic_display_term' => 'somatic_sequence_alteration', + 'SO_accession' => 'SO:0001059', + 'display_term' => 'sequence_alteration' +} +, +'structural_variant' => { + 'somatic_display_term' => 'somatic_SV', + 'SO_accession' => 'SO:0001537', + 'display_term' => 'SV' +} +, +'probe' => { + 'somatic_display_term' => 'somatic_CNV_PROBE', + 'SO_accession' => 'SO:0000051', + 'display_term' => 'CNV_PROBE' +} +, +'copy_number_gain' => { + 'somatic_display_term' => 'somatic_Gain', + 'SO_accession' => 'SO:0001742', + 'display_term' => 'Gain' +} +, +'copy_number_loss' => { + 'somatic_display_term' => 'somatic_Loss', + 'SO_accession' => 'SO:0001743', + 'display_term' => 'Loss' +} +, +'inversion' => { + 'somatic_display_term' => 'somatic_inversion', + 'SO_accession' => 'SO:1000036', + 'display_term' => 'inversion' +} +, +'complex_structural_alteration' => { + 'somatic_display_term' => 'somatic_Complex', + 'SO_accession' => 'SO:0001784', + 'display_term' => 'Complex' +} +, +'tandem_duplication' => { + 'somatic_display_term' => 'somatic_Tandem duplication', + 'SO_accession' => 'SO:1000173', + 'display_term' => 'Tandem duplication' +} +, +'mobile_element_insertion' => { + 'somatic_display_term' => 'somatic_Mobile element insertion', + 'SO_accession' => 'SO:0001837', + 'display_term' => 'Mobile element insertion' +} +, +'interchromosomal_breakpoint' => { + 'somatic_display_term' => 'somatic_Interchromosomal breakpoint', + 'SO_accession' => 'SO:0001873', + 'display_term' => 'Interchromosomal breakpoint' +} +, +'intrachromosomal_breakpoint' => { + 'somatic_display_term' => 'somatic_Intrachromosomal breakpoint', + 'SO_accession' => 'SO:0001874', + 'display_term' => 'Intrachromosomal breakpoint' +} +, +'translocation' => { + 'somatic_display_term' => 'somatic_translocation', + 'SO_accession' => 'SO:0000199', + 'display_term' => 'translocation' +} +, +'duplication' => { + 'somatic_display_term' => 'somatic_Duplication', + 'SO_accession' => 'SO:1000035', + 'display_term' => 'Duplication' +} +, +); + +our $DEFAULT_OVERLAP_CONSEQUENCE = Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'tier' => '4', + 'SO_term' => 'intergenic_variant', + 'is_default' => 1, + 'label' => 'Intergenic variant', + 'description' => 'A sequence variant located in the intergenic region, between genes', + 'rank' => '38', + 'SO_accession' => 'SO:0001628', + 'display_term' => 'INTERGENIC' +} +); + + +our %OVERLAP_CONSEQUENCES = ( +'intergenic_variant' => $DEFAULT_OVERLAP_CONSEQUENCE, +'upstream_gene_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'transcript', + 'description' => 'A sequence variant located 5\' of a gene', + 'SO_accession' => 'SO:0001631', + 'SO_term' => 'upstream_gene_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::upstream', + 'label' => 'Upstream gene variant', + 'rank' => '24', + 'display_term' => 'UPSTREAM', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'downstream_gene_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'transcript', + 'description' => 'A sequence variant located 3\' of a gene', + 'SO_accession' => 'SO:0001632', + 'SO_term' => 'downstream_gene_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::downstream', + 'label' => 'Downstream gene variant', + 'rank' => '25', + 'display_term' => 'DOWNSTREAM', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'splice_donor_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'NCBI_term' => 'splice-5', + 'feature_SO_term' => 'primary_transcript', + 'description' => 'A splice variant that changes the 2 base region at the 5\' end of an intron', + 'SO_accession' => 'SO:0001575', + 'tier' => '3', + 'SO_term' => 'splice_donor_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::donor_splice_site', + 'label' => 'Splice donor variant', + 'rank' => '3', + 'display_term' => 'ESSENTIAL_SPLICE_SITE', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'splice_acceptor_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'NCBI_term' => 'splice-3', + 'feature_SO_term' => 'primary_transcript', + 'description' => 'A splice variant that changes the 2 base region at the 3\' end of an intron', + 'SO_accession' => 'SO:0001574', + 'tier' => '3', + 'SO_term' => 'splice_acceptor_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::acceptor_splice_site', + 'label' => 'Splice acceptor variant', + 'rank' => '3', + 'display_term' => 'ESSENTIAL_SPLICE_SITE', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'splice_region_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'feature_SO_term' => 'primary_transcript', + 'description' => 'A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron', + 'SO_accession' => 'SO:0001630', + 'SO_term' => 'splice_region_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::splice_region', + 'label' => 'Splice region variant', + 'rank' => '13', + 'display_term' => 'SPLICE_SITE', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'intron_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'NCBI_term' => 'intron', + 'feature_SO_term' => 'primary_transcript', + 'description' => 'A transcript variant occurring within an intron', + 'SO_accession' => 'SO:0001627', + 'tier' => '3', + 'SO_term' => 'intron_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_intron', + 'label' => 'Intron variant', + 'rank' => '20', + 'display_term' => 'INTRONIC', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'5_prime_UTR_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'NCBI_term' => 'untranslated_5', + 'feature_SO_term' => 'mRNA', + 'description' => 'A UTR variant of the 5\' UTR', + 'SO_accession' => 'SO:0001623', + 'tier' => '3', + 'SO_term' => '5_prime_UTR_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_5_prime_utr', + 'label' => '5 prime UTR variant', + 'rank' => '18', + 'display_term' => '5PRIME_UTR', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'3_prime_UTR_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'NCBI_term' => 'untranslated_3', + 'feature_SO_term' => 'mRNA', + 'description' => 'A UTR variant of the 3\' UTR', + 'SO_accession' => 'SO:0001624', + 'tier' => '3', + 'SO_term' => '3_prime_UTR_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_3_prime_utr', + 'label' => '3 prime UTR variant', + 'rank' => '19', + 'display_term' => '3PRIME_UTR', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'synonymous_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'NCBI_term' => 'cds-synon', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant where there is no resulting change to the encoded amino acid', + 'SO_accession' => 'SO:0001819', + 'tier' => '3', + 'SO_term' => 'synonymous_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::synonymous_variant', + 'label' => 'Synonymous variant', + 'rank' => '15', + 'display_term' => 'SYNONYMOUS_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'missense_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'NCBI_term' => 'missense', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant, where the change may be longer than 3 bases, and at least one base of a codon is changed resulting in a codon that encodes for a different amino acid', + 'SO_accession' => 'SO:0001583', + 'tier' => '3', + 'SO_term' => 'missense_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::missense_variant', + 'label' => 'Missense variant', + 'rank' => '12', + 'display_term' => 'NON_SYNONYMOUS_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'inframe_insertion' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'An inframe non synonymous variant that inserts bases into in the coding sequence', + 'SO_accession' => 'SO:0001821', + 'SO_term' => 'inframe_insertion', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::inframe_insertion', + 'label' => 'Inframe insertion', + 'rank' => '10', + 'display_term' => 'NON_SYNONYMOUS_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'inframe_deletion' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'An inframe non synonymous variant that deletes bases from the coding sequence', + 'SO_accession' => 'SO:0001822', + 'SO_term' => 'inframe_deletion', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::inframe_deletion', + 'label' => 'Inframe deletion', + 'rank' => '11', + 'display_term' => 'NON_SYNONYMOUS_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'stop_gained' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'NCBI_term' => 'nonsense', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript', + 'SO_accession' => 'SO:0001587', + 'tier' => '3', + 'SO_term' => 'stop_gained', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::stop_gained', + 'label' => 'Stop gained', + 'rank' => '4', + 'display_term' => 'STOP_GAINED', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'stop_lost' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript', + 'SO_accession' => 'SO:0001578', + 'SO_term' => 'stop_lost', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::stop_lost', + 'label' => 'Stop lost', + 'rank' => '6', + 'display_term' => 'STOP_LOST', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'stop_retained_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant where at least one base in the terminator codon is changed, but the terminator remains', + 'SO_accession' => 'SO:0001567', + 'SO_term' => 'stop_retained_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::stop_retained', + 'label' => 'Stop retained variant', + 'rank' => '15', + 'display_term' => 'SYNONYMOUS_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'initiator_codon_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A codon variant that changes at least one base of the first codon of a transcript', + 'SO_accession' => 'SO:0001582', + 'SO_term' => 'initiator_codon_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::affects_start_codon', + 'label' => 'Initiator codon variant', + 'rank' => '7', + 'display_term' => 'NON_SYNONYMOUS_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'frameshift_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'NCBI_term' => 'frameshift', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three', + 'SO_accession' => 'SO:0001589', + 'tier' => '3', + 'SO_term' => 'frameshift_variant', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::frameshift', + 'label' => 'Frameshift variant', + 'rank' => '5', + 'display_term' => 'FRAMESHIFT_CODING', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'incomplete_terminal_codon_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::VariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed', + 'SO_accession' => 'SO:0001626', + 'SO_term' => 'incomplete_terminal_codon_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::partial_codon', + 'label' => 'Incomplete terminal codon variant', + 'rank' => '14', + 'display_term' => 'PARTIAL_CODON', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'NMD_transcript_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A variant in a transcript that is the target of NMD', + 'SO_accession' => 'SO:0001621', + 'SO_term' => 'NMD_transcript_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_nmd_transcript', + 'label' => 'NMD transcript variant', + 'rank' => '21', + 'display_term' => 'NMD_TRANSCRIPT', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'nc_transcript_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'ncRNA', + 'description' => 'A transcript variant of a non coding RNA', + 'SO_accession' => 'SO:0001619', + 'SO_term' => 'nc_transcript_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_non_coding_gene', + 'label' => 'NC transcript variant', + 'rank' => '23', + 'display_term' => 'WITHIN_NON_CODING_GENE', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'non_coding_exon_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'ncRNA', + 'description' => 'A sequence variant that changes non-coding exon sequence', + 'SO_accession' => 'SO:0001792', + 'SO_term' => 'non_coding_exon_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::non_coding_exon_variant', + 'label' => 'Non coding exon variant', + 'rank' => '22', + 'display_term' => 'WITHIN_NON_CODING_GENE', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'mature_miRNA_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'miRNA', + 'description' => 'A transcript variant located with the sequence of the mature miRNA', + 'SO_accession' => 'SO:0001620', + 'SO_term' => 'mature_miRNA_variant', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_mature_miRNA', + 'label' => 'Mature miRNA variant', + 'rank' => '17', + 'display_term' => 'WITHIN_MATURE_miRNA', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'coding_sequence_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A sequence variant that changes the coding sequence', + 'SO_accession' => 'SO:0001580', + 'SO_term' => 'coding_sequence_variant', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::coding_unknown', + 'label' => 'Coding sequence variant', + 'rank' => '16', + 'display_term' => 'CODING_UNKNOWN', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'regulatory_region_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'regulatory_region', + 'description' => 'A sequence variant located within a regulatory region', + 'SO_accession' => 'SO:0001566', + 'SO_term' => 'regulatory_region_variant', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_regulatory_feature', + 'label' => 'Regulatory region variant', + 'rank' => '36', + 'display_term' => 'REGULATORY_REGION', + 'feature_class' => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' +} +), +'TF_binding_site_variant' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'TF_binding_site', + 'description' => 'In regulatory region annotated by Ensembl', + 'SO_accession' => 'SO:0001782', + 'SO_term' => 'TF_binding_site_variant', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::within_motif_feature', + 'label' => 'A sequence variant located within a transcription factor binding site', + 'rank' => '30', + 'display_term' => 'REGULATORY_REGION', + 'feature_class' => 'Bio::EnsEMBL::Funcgen::MotifFeature' +} +), +'transcript_ablation' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A feature ablation whereby the deleted region includes a transcript feature', + 'SO_accession' => 'SO:0001893', + 'SO_term' => 'transcript_ablation', + 'tier' => '1', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_ablation', + 'label' => 'Transcript ablation', + 'rank' => '1', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'transcript_amplification' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'mRNA', + 'description' => 'A feature amplification of a region containing a transcript', + 'SO_accession' => 'SO:0001889', + 'SO_term' => 'transcript_amplification', + 'tier' => '1', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_amplification', + 'label' => 'Transcript amplification', + 'rank' => '8', + 'feature_class' => 'Bio::EnsEMBL::Transcript' +} +), +'TFBS_ablation' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'TF_binding_site', + 'description' => 'A feature ablation whereby the deleted region includes a transcription factor binding site', + 'SO_accession' => 'SO:0001895', + 'SO_term' => 'TFBS_ablation', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_ablation', + 'label' => 'TFBS ablation', + 'rank' => '26', + 'feature_class' => 'Bio::EnsEMBL::Funcgen::MotifFeature' +} +), +'TFBS_amplification' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'TF_binding_site', + 'description' => 'A feature amplification of a region containing a transcription factor binding site', + 'SO_accession' => 'SO:0001892', + 'SO_term' => 'TFBS_amplification', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_amplification', + 'label' => 'TFBS amplification', + 'rank' => '28', + 'feature_class' => 'Bio::EnsEMBL::Funcgen::MotifFeature' +} +), +'regulatory_region_ablation' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'TF_binding_site', + 'description' => 'A feature ablation whereby the deleted region includes a regulatory region', + 'SO_accession' => 'SO:0001894', + 'SO_term' => 'regulatory_region_ablation', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_ablation', + 'label' => 'Regulatory region ablation', + 'rank' => '31', + 'feature_class' => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' +} +), +'regulatory_region_amplification' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'feature_SO_term' => 'TF_binding_site', + 'description' => 'A feature amplification of a region containing a regulatory region', + 'SO_accession' => 'SO:0001891', + 'SO_term' => 'regulatory_region_amplification', + 'tier' => '2', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_amplification', + 'label' => 'Regulatory region amplification', + 'rank' => '33', + 'feature_class' => 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' +} +), +'feature_elongation' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'description' => 'A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence', + 'SO_accession' => 'SO:0001907', + 'SO_term' => 'feature_elongation', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_elongation', + 'label' => 'Feature elongation', + 'rank' => '36', + 'feature_class' => 'Bio::EnsEMBL::Feature' +} +), +'feature_truncation' => Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'variant_feature_class' => 'Bio::EnsEMBL::Variation::BaseVariationFeature', + 'description' => 'A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence', + 'SO_accession' => 'SO:0001906', + 'SO_term' => 'feature_truncation', + 'tier' => '3', + 'predicate' => 'Bio::EnsEMBL::Variation::Utils::VariationEffect::feature_truncation', + 'label' => 'Feature truncation', + 'rank' => '37', + 'feature_class' => 'Bio::EnsEMBL::Feature' +} +), +); + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,511 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + + +package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; + +use strict; +use warnings; + +# This module allows conversion of ensembl objects to GFF3 and GVF by inserting +# to_gff (and supporting _gff_hash) methods into the necessary feature classes + +{ + package Bio::EnsEMBL::Slice; + + sub gff_version { + return "##gff-version 3\n"; + } + + sub gff_header { + my $self = shift; + + my %args = @_; + + # build up a date string in the format specified by the GFF spec + + my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime; + $year += 1900; # correct the year + $mon++; # correct the month + + my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday; + + my $region = $self->seq_region_name; + my $start = $self->start; + my $end = $self->end; + my $assembly = $self->coord_system->version; + + my $mca = $self->adaptor->db->get_MetaContainerAdaptor; + my $tax_id = $mca->get_taxonomy_id; + + my $hdr = + "##file-date $date\n" + . "##genome-build ensembl $assembly\n" + . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n"; + + $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region}; + + return $hdr; + } + + sub gvf_version { + return "##gvf-version 1.06\n"; + } + + sub gvf_header { + my $self = shift; + + my %args = @_; + my $hdr = $self->gff_version; + $hdr .= $self->gvf_version; + $hdr .= $self->gff_header(@_); + + my $mca = $self->adaptor->db->get_MetaContainerAdaptor; + my $schema_version = $mca->get_schema_version; + my $species_name = $mca->get_scientific_name; + $species_name =~ s/ /_/g; + my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name; + + $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n"; + $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n"; + $hdr .= "##file-version $schema_version\n"; + + if (my $individual = $args{individual}) { + $hdr .= "##individual-id ".$individual->to_gvf."\n"; + } + + if (my $population = $args{population}) { + $hdr .= "##attribute-method ".$population->to_gvf."\n"; + } + + return $hdr; + } +} + +{ + + package Bio::EnsEMBL::Feature; + + sub to_gff { + + my $self = shift; + + my %args = @_; + + # This parameter is assumed to be a hashref which includes extra attributes you'd + # like to have appended onto the gff line for the feature + my $extra_attrs = $args{extra_attrs}; + + my $gff = $self->_gff_hash(@_); + + return undef unless defined $gff; + + # default optional columns, and check that all required columns are present + + $gff->{score} = '.' unless defined $gff->{score}; + $gff->{strand} = '.' unless defined $gff->{strand}; + $gff->{phase} = '.' unless defined $gff->{phase}; + + for my $req (qw(source type start end)) { + die "'$req' attribute required for GFF" unless $gff->{$req}; + } + + # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml + + my $gff_str = join( "\t", + $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start}, + $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase}, + ); + + if ($extra_attrs) { + + # combine the extra attributes with any existing ones (duplicate keys will get squashed, + # so attributes specified in the extra_attrs hash will override existing ones) + + $gff->{attributes} = {} unless defined $gff->{attributes}; + + @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs; + } + + if ( $gff->{attributes} ) { + + my @attrs; + + for my $key (keys %{ $gff->{attributes} }) { + + my $val = $gff->{attributes}->{$key}; + + if (ref $val eq 'ARRAY') { + push @attrs, map { $key . '='. $_ } @$val; + } + else { + push @attrs, $key . '=' . $val; + } + } + + $gff_str .= "\t" . join( ';', @attrs ); + } + + return $gff_str; + } + + sub _gff_hash { + my $self = shift; + + my %args = @_; + + my $rebase = $args{rebase}; # use absolute or slice-relative coordinates + + my $gff_seqid = $args{gff_seqid} || $self->slice->seq_region_name; + my $gff_source = $args{gff_source} || $self->_gff_source; + + my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid; + my $start = $rebase ? $self->start : $self->seq_region_start; + my $end = $rebase ? $self->end : $self->seq_region_end; + + # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions) + # start = end and the implied insertion site is to the right of the specified base, so we use the + # smaller of the two values + + if ($start > $end) { + $start = $end; + } + + my $gff = { + seqid => $gff_seqid, + source => $gff_source, + type => $self->_gff_type, + start => $start, + end => $end, + strand => ( + $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' ) + ) + }; + + return $gff; + } + + sub _gff_source { + my $self = shift; + + if ($self->analysis) { + return + $self->analysis->gff_source + || $self->analysis->logic_name; + } + else { + return ref($self); + } + } + + sub _gff_type { + my $self = shift; + + return + ( $self->analysis && $self->analysis->gff_feature ) + || 'misc_feature'; + } +} + +{ + package Bio::EnsEMBL::Variation::VariationFeature; + + use Bio::EnsEMBL::Utils::Sequence qw(expand); + + my $REFERENCE_ALLELE_IDENTIFIER = '@'; + + sub to_gvf { + my $self = shift; + return $self->to_gff(@_); + } + + sub _gff_hash { + + my $self = shift; + + my $gff = $self->SUPER::_gff_hash(@_); + + my %args = @_; + + my $include_consequences = $args{include_consequences}; + my $include_coding_details = $args{include_coding_details}; + my $include_global_maf = $args{include_global_maf}; + + $gff->{source} = $self->source; + + $gff->{type} = $self->class_SO_term; + + my $source = $self->source; + + $source .= '_'.$self->source_version if defined $self->source_version; + + $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name; + + $gff->{attributes}->{ID} = $self->dbID; + + # the Variant_seq attribute requires a comma separated list of alleles + + my @alleles = split '/', $self->allele_string; + my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele + + $gff->{attributes}->{Variant_seq} = join ',', @alleles; + + my $index = 0; + + # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence + + map { expand(\$_) } @alleles; + + # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-' + + my %allele_index = map { ($_ || '-') => $index++ } @alleles; + + if ($include_global_maf) { + + my $var = $self->variation; + + if (defined $var->minor_allele_frequency) { + + my $allele_idx; + + if ($var->minor_allele eq $ref_seq) { + $allele_idx = $REFERENCE_ALLELE_IDENTIFIER; + } + else { + $allele_idx = $allele_index{$var->minor_allele}; + } + + if (defined $allele_idx) { + $gff->{attributes}->{global_minor_allele_frequency} = + join (' ', + $allele_idx, + $var->minor_allele_frequency, + $var->minor_allele_count + ); + } + } + } + + # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides + + $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50); + $gff->{attributes}->{Reference_seq} = $ref_seq; + + # Hack for HGMD mutations + + if ($self->allele_string eq 'HGMD_MUTATION') { + $gff->{attributes}->{Reference_seq} = '~'; + $gff->{attributes}->{Variant_seq} = '~'; + $allele_index{$self->allele_string} = 0; + } + + if ($include_consequences || $include_coding_details) { + + for my $tv (@{ $self->get_all_TranscriptVariations }) { + + unless ($tv->get_all_alternate_TranscriptVariationAlleles) { + warn $self->variation_name." has no alternate alleles?"; + next; + } + + if ($include_coding_details) { + my $ref_tva = $tv->get_reference_TranscriptVariationAllele; + + if (my $pep = $ref_tva->peptide) { + $gff->{attributes}->{reference_peptide} = $pep; + } + } + + for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) { + + my $allele_idx = $allele_index{$tva->variation_feature_seq}; + + if (defined $allele_idx) { + + if ($include_consequences) { + for my $oc (@{ $tva->get_all_OverlapConsequences }) { + + push @{ $gff->{attributes}->{Variant_effect} ||= [] }, + join(' ', + $oc->SO_term, + $allele_idx, + $oc->feature_SO_term, + $tv->transcript_stable_id, + ); + } + } + + if ($include_coding_details) { + if ($tva->pep_allele_string) { + + push @{ $gff->{attributes}->{variant_peptide} ||= [] }, + join(' ', + $allele_idx, + $tva->peptide, + $tv->transcript_stable_id, + ); + + for my $tool (qw(sift polyphen)) { + my $pred_meth = $tool.'_prediction'; + my $score_meth = $tool.'_score'; + if (my $pred = $tva->$pred_meth) { + $pred =~ s/\s/_/g; + push @{ $gff->{attributes}->{polyphen_prediction} ||= [] }, + join(' ', + $allele_idx, + $pred, + $tva->$score_meth, + $tv->transcript_stable_id + ); + } + } + } + } + } + else { + warn "No allele_index entry for allele: ".$tva->variation_feature_seq. + " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n"; + } + } + } + } + + return $gff; + } +} + +{ + package Bio::EnsEMBL::Variation::StructuralVariationFeature; + + sub _gff_hash { + + my $self = shift; + + my $gff = $self->SUPER::_gff_hash(@_); + + $gff->{attributes}->{ID} = $self->variation_name; + + $gff->{source} = $self->source; + + my $sv = $self->structural_variation; + + $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name; + + $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name; + + $gff->{type} = $self->class_SO_term; + + #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence; + + if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) { + $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start; + } + + if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) { + $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end; + } + + if (my $sv = $self->structural_variation) { + if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') { + if (my $parents = $sv->get_all_StructuralVariations) { + $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents; + } + } + } + + return $gff; + } + + sub to_gvf { + my $self = shift; + return $self->to_gff(@_); + } +} + +{ + package Bio::EnsEMBL::Variation::Individual; + + sub _gff_hash { + + my $self = shift; + + my $gff; + + $gff->{Gender} = $self->gender; + + $gff->{Display_name} = $self->name; + + $gff->{ensembl_description} = $self->description; + + $gff->{Type} = $self->type_description; + + $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations }; + + return $gff; + } + + sub to_gvf { + my $self = shift; + + my $attrs = $self->_gff_hash(@_); + + # get rid of any empty attributes + map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs; + + return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; + } + +} + +{ + package Bio::EnsEMBL::Variation::Population; + + sub _gff_hash { + + my $self = shift; + + my $gff; + + $gff->{Attribute} = 'Variant_freq'; + + $gff->{population} = $self->name; + + $gff->{population_size} = $self->size; + + $gff->{Comment} = $self->description; + + return $gff; + } + + sub to_gvf { + my $self = shift; + + my $attrs = $self->_gff_hash(@_); + + # get rid of any empty attributes + map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs; + + return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; + } + +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,882 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class); + + my $alleles = 'A|C'; + + print "my alleles = $alleles\n"; + + my $ambig_code = ambiguity_code($alleles); + + print "my ambiguity code is $ambig_code\n"; + + print "my SNP class is = variation_class($alleles)"; + + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Utils::Sequence; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Constants qw(:SO_class_terms); +use Bio::EnsEMBL::Utils::Scalar qw(wrap_array); +use Exporter; + +use vars qw(@ISA @EXPORT_OK); + +@ISA = qw(Exporter); + +@EXPORT_OK = qw( + &ambiguity_code + &variation_class + &unambiguity_code + &sequence_with_ambiguity + &hgvs_variant_notation + &format_hgvs_string + &SO_variation_class + &align_seqs + &strain_ambiguity_code + &get_all_validation_states + &get_validation_code + &add_validation_state +); + +# List of validation states. Order must match that of set in database +our @VALIDATION_STATES = qw(cluster freq submitter doublehit hapmap 1000Genome failed precious); + +=head2 ambiguity_code + + Arg[1] : string $alleles + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code) + my $alleles = 'A|C'; + my $ambig_code = ambiguity_code($alleles); + print "the ambiguity code for $alleles is: ",$ambig_code; + Description : returns the ambiguity code for a SNP allele + ReturnType : String + The ambiguity code + Exceptions : None + Caller : Variation, VariationFeature + +=cut + +sub ambiguity_code { + my $alleles = shift; + my %duplicates; #hash containing all alleles to remove duplicates + + foreach my $a(split /[\|\/\\]/, $alleles) { + # convert Ns + my @a = ($a eq 'N' ? qw(A C G T) : ($a)); + map {$duplicates{$_}++} @a; + } + $alleles = uc( join '', sort keys %duplicates ); + #my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y +#GT K C C A A T T G G - - -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-' + my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -); + return $ambig{$alleles}; +} + +=head2 strain_ambiguity_code + + Arg[1] : string $alleles (separated by "/", "\" or "|") + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code) + my $alleles = 'A|C'; + my $ambig_code = strain_ambiguity_code($alleles); + print "the ambiguity code for $alleles is: ",$ambig_code; + Description : returns the ambiguity code for a strain genotype + ReturnType : String + Exceptions : None + Caller : AlleleFeatureAdaptor + +=cut + +sub strain_ambiguity_code { + my $alleles = shift; + + # return normal ambiguity code for a SNP + return ambiguity_code($alleles) if($alleles =~ /^[ACGT][\|\/\\][ACGT]$/); + + # get alleles + my ($a1, $a2) = split /[\|\/\\]/, $alleles; + + # pad + if(length($a1) > length($a2)) { + $a2 .= '-' x (length($a1) - length($a2)); + } + else { + $a1 .= '-' x (length($a2) - length($a1)); + } + + # build ambiguity code base by base + my $ambig = ''; + + for my $i(0..(length($a1) - 1)) { + my $b1 = substr($a1, $i, 1); + my $b2 = substr($a2, $i, 1); + + # -/- = - + if($b1 eq '-' && $b2 eq '-') { + $ambig .= '-'; + } + + # G/- = g + elsif($b1 eq '-') { + $ambig .= lc($b2); + } + + # -/G = g + elsif($b2 eq '-') { + $ambig .= lc($b1); + } + + # A/G = R + else { + $ambig .= ambiguity_code($b1.'|'.$b2); + } + } + + return $ambig; +} + +=head2 unambiguity_code + + Arg[1] : string $alleles + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code) + my $ambiguity_code = 'M'; + my $alleles = unambiguity_code($ambiguity_code); + print "the alleles for ambiguity code $ambiguity_code is: ",$alleles; + Description : returns the alleles for an ambiguity code + ReturnType : String + The Alleles, alphabetically sorted and in capital + Exceptions : None + Caller : Variation, VariationFeature + +=cut + +sub unambiguity_code { + my $ambiguity_code = shift; + + #my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K +#GT C CC A AA T TT G GG - -- -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-' + my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --); + return $unambig{$ambiguity_code}; +} + + +=head2 variation_class + + Arg[1] : string $alleles + Arg[2] : boolean $is_somatic - flag that this variation is somatic + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class) + my $alleles = 'A|C'; + my $variation_class = variation_class($alleles); + print "the variation class for the alleles $alleles is: ",$variation_class; + Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,substitution...) + ReturnType : String. The class of the alleles + Exceptions : none + Caller : Variation, VariationFeature + +=cut + +sub variation_class{ + + my ($alleles, $is_somatic) = @_; + + my $class; + + if ($alleles =~ /^[ACGTN]([\|\\\/][ACGTN])+$/i) { + $class = 'snp'; + } + elsif (($alleles eq 'cnv') || ($alleles eq 'CNV')) { + $class = 'cnv'; + } + elsif ($alleles =~ /CNV\_PROBE/i) { + $class = 'cnv probe'; + } + elsif ($alleles =~ /HGMD\_MUTATION/i) { + $class = 'hgmd_mutation'; + } + else { + my @alleles = split /[\|\/\\]/, $alleles; + + if (@alleles == 1) { + #(HETEROZYGOUS) 1 allele + $class = 'het'; + } + elsif(@alleles == 2) { + if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) || + (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1])) ){ + #A/- 2 alleles + $class = 'in-del' + } + elsif (($alleles[0] =~ /LARGE|INS|DEL/) || ($alleles[1] =~ /LARGE|INS|DEL/)){ + #(LARGEDELETION) 2 alleles + $class = 'named' + } + elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){ + #AA/GC 2 alleles + $class = 'substitution' + } + else { + warning("not possible to determine class for @alleles"); + $class = ''; + } + } + elsif (@alleles > 2) { + + if ($alleles[0] =~ /\d+/) { + #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele + $class = 'microsat' + } + + elsif ((grep {/-/} @alleles) > 0) { + #-/A/T/TTA > 2 alleles + $class = 'mixed' + } + else { + # warning("not possible to determine class of alleles " . @alleles); + $class = ''; + } + } + else{ + warning("no alleles available "); + $class = ''; + } + } + + if ($is_somatic) { + if ($class eq '') { + # for undetermined classes just call it somatic + $class = 'somatic'; + } + else { + # somatic mutations aren't polymorphisms, so change SNPs to SNVs + $class = 'snv' if $class eq 'snp'; + + # and prefix the class with 'somatic' + $class = 'somatic_'.$class; + } + } + + return $class; +} + +=head2 SO_variation_class + + Arg[1] : string $alleles - a slash ()'/') separated list of alleles, the first allele is + assumed to be the reference unless the $ref_correct argument is false + Arg[2] : boolean $ref_correct - flags that the first allele is not known to be the + reference sequence (so we can't call insertions or deletions and have to + resort to 'indel') + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (SO_variation_class) + my $alleles = 'A/C'; + my $SO_term = SO_variation_class($alleles); + print "the SO term for the alleles $alleles is: ",$SO_term; + Description : return the SO term for the class of the alleles + ReturnType : String. The SO term for the class of the alleles + Exceptions : none + Caller : Variation, VariationFeature + +=cut + +sub SO_variation_class { + + my $alleles = shift; + my $ref_correct = shift; + + $ref_correct = 1 unless defined $ref_correct; + + my $allele_class = '[A-Z]'; + + # default to sequence_alteration + my $class = SO_TERM_SEQUENCE_ALTERATION; + + if ($alleles =~ /^$allele_class(\/$allele_class)+$/) { + # A/T, A/T/G + $class = SO_TERM_SNV; + } + elsif ($alleles =~ /^$allele_class+(\/$allele_class+)+$/) { + # AA/TTT + $class = SO_TERM_SUBSTITUTION; + } + elsif ($alleles =~ /\)\d+/) { + # (CAG)8/(CAG)9 + $class = SO_TERM_TANDEM_REPEAT; + } + else { + my @alleles = split /\//, $alleles; + + if (@alleles > 1) { + + my $ref = shift @alleles; + + if ($ref eq '-') { + + if (@alleles == 1 && $alleles[0] =~ /DEL/) { + # -/(LARGEDELETION) (rather oddly!) + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + + unless (grep { $_ !~ /^$allele_class+$|INS/ } @alleles) { + # -/ATT, -/(LARGEINSERTION) + $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL; + } + + # else must be mixed insertion and deletion, so just called sequence_alteration + } + elsif ($ref =~ /^$allele_class+$/) { + unless (grep { $_ !~ /-|DEL/ } @alleles) { + # A/-, A/(LARGEDELETION) + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + } + elsif ($ref =~ /DEL/) { + unless (grep { $_ !~ /-/ } @alleles) { + # (LARGEDELETION)/-, (2345 BP DELETION)/- + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + } + } + elsif (@alleles == 1) { + if ($alleles[0] =~ /INS/) { + # (LARGEINSERTION) + $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL; + } + elsif($alleles[0] =~ /DEL/) { + # (308 BP DELETION) + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + } + } + + return $class; +} + +=head2 sequence_with_ambiguity + + Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore + Arg[2] : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar + Arg[3] : string $chr + Arg[4] : int $start + Arg[5] : int $end + Arg[6] : int $strand + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity) + my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200); + print "the sequence with ambiguity code for your region is: ",$slice->seq() + Description : given a region, returns a Bio::EnsEMBL::Slice object with + the sequence set with ambiguity codes + ReturnType : Bio::EnsEMBL::Slice object + Exceptions : none + Caller : general + +=cut + +sub sequence_with_ambiguity{ + my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_; + + my $slice; + if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){ + warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument'); + return $slice; + } + if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){ + warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument'); + return $slice; + } + my $slice_adaptor = $dbCore->get_SliceAdaptor(); + my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor; + $slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice + my $seq = $slice->seq; + foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){ + substr($seq,$vf->start-1,1,$vf->ambig_code); + } + $slice->{'seq'} = $seq; + return $slice; +} + +=head2 hgvs_variant_notation + + Arg[1] : string $alt_allele + Arg[2] : string $ref_sequence + Arg[3] : int $ref_start + Arg[4] : int $ref_end + Arg[5] : int $display_start (optional) + Arg[6] : int $display_end (optional) + + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (hgvs_variant_notation) + my $alt_allele = 'A'; + my $ref_sequence = 'CCGTGATGTGC'; + my $ref_start = 4; + my $ref_end = 4; + my $ref_name = 'test_seq'; + my $ref_type = 'g'; + my $notation = hgvs_variant_notation($alt_allele,$ref_sequence,$ref_start,$ref_end); + print "HGVS notation of your variant: $ref_name\:$ref_type\." . $notation->{'hgvs'}; + + Description : Given an allele, a reference sequence and position of variant, returns a reference to a hash containing metadata and a + string with HGVS notation of the variant. Returns undef if reference and variant alleles are identical. + The optional display_start and display_end, if specified, will be used in the notation instead of the ref_start and ref_end. + This can be useful, e.g. if we want coordinates relative to chromosome but don't want to pass the entire chromosome sequence + into the subroutine. + The data fields in the returned hash are: + 'start' -> Displayed start position of variant + 'end' -> Displayed end position of variant + 'ref' -> Reference allele + 'alt' -> Alternative allele + 'type' -> The variant class, e.g. ins, inv, >, delins + 'hgvs' -> A string with HGVS notation + ReturnType : reference to a hash + Exceptions : If the length of the interval to be displayed is different from the length of the reference allele + Caller : general + +=cut +sub hgvs_variant_notation { + my $alt_allele = shift; + my $ref_sequence = shift; + my $ref_start = shift; + my $ref_end = shift; + my $display_start = shift; + my $display_end = shift; + + # If display_start and display_end were not specified, use ref_start and ref_end + $display_start ||= $ref_start; + $display_end ||= $ref_end; + + #ÊThrow an exception if the lengths of the display interval and reference interval are different + throw("The coordinate interval for display is of different length than for the reference allele") if (($display_end - $display_start) != ($ref_end - $ref_start)); + + # Length of the reference allele. Negative lengths make no sense + my $ref_length = ($ref_end - $ref_start + 1); + if ($ref_length < 0) { + $ref_length = 0; + } + + # Remove any gap characters in the alt allele + $alt_allele =~ s/\-//g; + + # Length of alternative allele + my $alt_length = length($alt_allele); + + # Get the reference allele + my $ref_allele = substr($ref_sequence,($ref_start-1),$ref_length); + + # Check that the alleles are different, otherwise return undef + return undef unless ($ref_allele ne $alt_allele); + + # Store the notation in a hash that will be returned + my %notation; + $notation{'start'} = $display_start; + $notation{'end'} = $display_end; + $notation{'ref'} = $ref_allele; + $notation{'alt'} = $alt_allele; + + # The simplest case is a deletion + if (!$alt_length) { + $notation{'type'} = 'del'; + + # Return the notation + return \%notation; + } + + # Another case is if the allele lengths are equal + if ($ref_length == $alt_length) { + + # If length is 1 it's a single substitution + if ($ref_length == 1) { + $notation{'type'} = '>'; + return \%notation; + } + + # Check if it's an inversion + my $rev_ref = $ref_allele; + reverse_comp(\$rev_ref); + if ($alt_allele eq $rev_ref) { + $notation{'type'} = 'inv'; + return \%notation; + } + + $notation{'type'} = 'delins'; + + return \%notation; + } + + # If this is an insertion, we should check if the preceeding reference nucleotides match the insertion. In that case it should be annotated as a multiplication. + if (!$ref_length) { + + # Get the same number of nucleotides preceding the insertion as the length of the insertion + my $prev_str = substr($ref_sequence,($ref_end-$alt_length),$alt_length); + + # If they match, this is a duplication + if ($prev_str eq $alt_allele) { + + $notation{'start'} = ($display_end - $alt_length + 1); + $notation{'type'} = 'dup'; + $notation{'ref'} = $prev_str; + # Return the notation + return \%notation; + } + + # If they didn't match it's a plain insertion + $notation{'start'} = $display_end; + $notation{'end'} = $display_start; + $notation{'type'} = 'ins'; + + return \%notation; + } + + # Otherwise, the reference and allele are of different lengths. By default, this is a delins but + # we need to check if the alt allele is a multiplication of the reference + # Check if the length of the alt allele is a multiple of the reference allele + if ($alt_length%$ref_length == 0) { + my $multiple = ($alt_length / $ref_length); + if ($alt_allele eq ($ref_allele x $multiple)) { + if ($multiple == 2) { + $notation{'type'} = 'dup'; + } + else { + $notation{'type'} = '[' . $multiple . ']'; + } + return \%notation; + } + } + + # Else, it's gotta be a delins + $notation{'type'} = 'delins'; + + return \%notation; +} + + +=head2 format_hgvs_string + + Arg[1] : string reference sequence name + Arg[2] : string strand + Arg[3] : hash of hgvs information + Example : + Description : Creates HGVS formatted string from input hash + ReturnType : string in HGVS format + Exceptions : + Caller : + +=cut + +sub format_hgvs_string{ + ##### generic formatting routine for genomic and coding HGVS names + + my $hgvs_notation = shift; + + ### all start with refseq name & numbering type + $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; + + my $coordinates; + #### if single base event, list position only once + if($hgvs_notation->{'start'} eq $hgvs_notation->{'end'}){ + $coordinates = $hgvs_notation->{'start'}; + } + else{ + $coordinates = $hgvs_notation->{'start'} . "_" . $hgvs_notation->{'end'}; + } + + ##### format rest of string according to type + + if($hgvs_notation->{'type'} eq 'del' || $hgvs_notation->{'type'} eq 'inv' || $hgvs_notation->{'type'} eq 'dup'){ + ### inversion of reference bases => list ref not alt + ### deletion of reference bases => list ref lost + ### duplication of reference bases (eg ref = GAAA alt = GAAAGAAA) => list duplicated ref (dupGAAA) + $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'}; + } + + elsif( $hgvs_notation->{'type'} eq '>'){ + ### substitution - list both alleles + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{'start'} . $hgvs_notation->{'ref'} . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'}; + } + + elsif( $hgvs_notation->{'type'} eq 'delins'){ + $hgvs_notation->{'hgvs'} .= $coordinates . 'del' . $hgvs_notation->{'ref'} . 'ins' . $hgvs_notation->{'alt'}; + } + + elsif($hgvs_notation->{'type'} eq 'ins'){ + ## reference not listed + $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'}; + } + + elsif($hgvs_notation->{'type'} =~ /\[\d+\]/){ + #### insertion described by string and number + $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'}; + } + + else{ + warn "PROBLEM with generic HGVS formatter - type = ". $hgvs_notation->{'type'} ."\n"; + } + + return $hgvs_notation->{'hgvs'}; + +} + +=head2 align_seqs + + Arg[1] : string $seq1 + Arg[2] : string $seq2 + Example : my $aligned_seqs = align_seqs($seq1, $seq2); + Description : Does a simple NW align of two sequence strings. Best used on + short (<1000bp) sequences, otherwise runtime will be long + ReturnType : arrayref to a pair of strings + Exceptions : none + Caller : web flanking sequence display + +=cut + +sub align_seqs { + my $seq1 = shift; + my $seq2 = shift; + + # align parameters + my $match = 10; + my $mismatch = -10; + my $gep = -10; + + # split sequences into arrays + my @split1 = split //, $seq1; + my @split2 = split //, $seq2; + + # evaluate substitutions + my $len1 = length($seq1); + my $len2 = length($seq2); + + my (@smat, @tb); + + for (my $i=0; $i<=$len1; $i++) { + $smat[$i][0] = $i * $gep; + $tb[$i][0] = 1; + } + for (my $j=0; $j<=$len2; $j++) { + $smat[0][$j] = $j * $gep; + $tb[0][$j] = -1; + } + + my ($s, $sub, $del, $ins); + + for (my $i=1; $i<=$len1; $i++) { + for (my $j=1; $j<=$len2; $j++) { + + # calculate score + if($split1[$i-1] eq $split2[$j-1]) { + $s = $match; + } + else { + $s = $mismatch; + } + + $sub = $smat[$i-1][$j-1] + $s; + $del = $smat[$i][$j-1] + $gep; + $ins = $smat[$i-1][$j] + $gep; + + if($sub > $del && $sub > $ins) { + $smat[$i][$j] = $sub; + $tb[$i][$j] = 0; + } + elsif($del > $ins) { + $smat[$i][$j] = $del; + $tb[$i][$j] = -1; + } + else { + $smat[$i][$j] = $ins; + $tb[$i][$j] = 1; + } + } + } + + + my $i = $len1; + my $j = $len2; + my $aln_len = 0; + my (@aln1, @aln2); + + while(!($i == 0 && $j == 0)) { + if($tb[$i][$j] == 0) { + $aln1[$aln_len] = $split1[--$i]; + $aln2[$aln_len] = $split2[--$j]; + } + elsif($tb[$i][$j] == -1) { + $aln1[$aln_len] = '-'; + $aln2[$aln_len] = $split2[--$j]; + } + elsif($tb[$i][$j] == 1) { + $aln1[$aln_len] = $split1[--$i]; + $aln2[$aln_len] = '-'; + } + + $aln_len++; + } + + return [(join "", reverse @aln1), (join "", reverse @aln2)]; +} + + +=head2 array_to_bitval + + Arg[1] : arrayref $arr + Arg[2] : arrayref $ref + Example : my $bitval = array_to_bitval(['hapmap','precious'],['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']); + Description : Takes a reference to an array as input and return a bit value representing the + combination of elements from a reference array. c.f. the SET datatype in MySQL + ReturnType : bitvalue that represents the combination of elements in the reference array specified in the given array + Exceptions : none + Caller : get_validation_code + +=cut + +sub array_to_bitval { + my $arr = shift; + my $ref = shift; + + #ÊEnsure that we have array references + $arr = wrap_array($arr); + $ref = wrap_array($ref); + + #ÊTurn the reference array into a hash, the values will correspond to 2 raised to the power of the position in the array + my $i=0; + my %ref_hash = map {lc($_) => $i++;} @{$ref}; + + #ÊSet the bitval + my $bitval = 0; + foreach my $a (@{$arr}) { + + my $pos = $ref_hash{lc($a)}; + if (defined($pos)) { + $bitval |= 2**$pos; + } + # Warn if the element is not present in the reference array + else { + warning("$a is not a recognised element. Recognised elements are: " . join(",",@{$ref})); + } + } + + return $bitval; +} + +=head2 bitval_to_array + + Arg [1] : int $bitval + Arg [2] : arrayref $ref + Example : my $arr = bitval_to_array(6,['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']); + : print join(",",@{$arr}); #ÊWill print 'freq,submitter' + Description: Returns an array with the combination of elements from the reference array specified by the supplied bitvalue. + c.f. the SET datatype in MySQL + Returntype : reference to list of strings + Exceptions : none + Caller : get_all_validation_states + +=cut + +sub bitval_to_array { + my $bitval = shift || 0; + my $ref = shift; + + #ÊEnsure that we have array references + $ref = wrap_array($ref); + + # convert the bit value into an ordered array + my @arr; + for (my $i = 0; $i < @{$ref}; $i++) { + push(@arr,$ref->[$i]) if ((1 << $i) & $bitval); + } + + return \@arr; +} + + +=head2 add_validation_state + + Arg [1] : string $state + Example : add_validation_state('cluster'); + Description: Adds a validation state to this variation. + Returntype : none + Exceptions : warning if validation state is not a recognised type + Caller : general + Status : At Risk + +=cut + +sub add_validation_state { + my $obj = shift; + my $state = shift; + + #ÊGet the bitvalue for the new state + my $newbit = get_validation_code($state) || 0; + + #ÊBit-add it to the current validation_code + my $oldbit = $obj->{'validation_code'} || 0; + $newbit |= $oldbit; + + # Set the validation_code + $obj->{'validation_code'} = $newbit; + + return; +} + +=head2 get_all_validation_states + + Arg [1] : int $bitval + Example : my @vstates = @{get_all_validation_states($var->{'validation_code'})}; + Description: Retrieves all validation states for a specified bit value. + Returntype : reference to list of strings + Exceptions : none + Caller : general + +=cut + +sub get_all_validation_states { + return bitval_to_array(shift,\@VALIDATION_STATES); +} + +=head2 get_validation_code + + Arg [1] : arrayref $validation_status + Example : $var->{'validation_code'} = get_validation_code(['submitter','precious']); + Description: Retrieves the bit value for a combination of validation statuses. + Returntype : int + Exceptions : none + Caller : Variation::new + +=cut + +sub get_validation_code { + return array_to_bitval(shift,\@VALIDATION_STATES); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VEP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VEP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,4713 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::VEP - Methods used by the Variant Effect Predictor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::Utils::VEP qw(configure); + + my $config = configure(); + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Utils::VEP; + +# module list +use Getopt::Long; +use FileHandle; +use File::Path qw(make_path); +use Storable qw(nstore_fd fd_retrieve freeze thaw); +use Scalar::Util qw(weaken); +use Digest::MD5 qw(md5_hex); + +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Variation::VariationFeature; +use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; +use Bio::EnsEMBL::Variation::StructuralVariationFeature; +use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor; +use Bio::EnsEMBL::Variation::TranscriptStructuralVariation; + +# we need to manually include all these modules for caching to work +use Bio::EnsEMBL::CoordSystem; +use Bio::EnsEMBL::Transcript; +use Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::ProteinFeature; +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::DBSQL::GeneAdaptor; +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::TranslationAdaptor; +use Bio::EnsEMBL::DBSQL::TranscriptAdaptor; +use Bio::EnsEMBL::DBSQL::MetaContainer; +use Bio::EnsEMBL::DBSQL::CoordSystemAdaptor; + +use Exporter; +use vars qw(@ISA @EXPORT_OK); +@ISA = qw(Exporter); + +# open socket pairs for cross-process comms +use Socket; +socketpair(CHILD, PARENT, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "ERROR: Failed to open socketpair: $!"; +CHILD->autoflush(1); +PARENT->autoflush(1); + +@EXPORT_OK = qw( + &parse_line + &vf_to_consequences + &validate_vf + &read_cache_info + &dump_adaptor_cache + &load_dumped_adaptor_cache + &load_dumped_variation_cache + &get_all_consequences + &get_slice + &build_slice_cache + &build_full_cache + ®ions_from_hash + &get_time + &debug + &convert_to_vcf + &progress + &end_progress + @REG_FEAT_TYPES + @OUTPUT_COLS + @VEP_WEB_CONFIG + %FILTER_SHORTCUTS +); + +our @OUTPUT_COLS = qw( + Uploaded_variation + Location + Allele + Gene + Feature + Feature_type + Consequence + cDNA_position + CDS_position + Protein_position + Amino_acids + Codons + Existing_variation + Extra +); + +our @REG_FEAT_TYPES = qw( + RegulatoryFeature + MotifFeature +); + +our @VEP_WEB_CONFIG = qw( + format + check_existing + coding_only + core_type + hgnc + protein + hgvs + terms + check_frequency + freq_filter + freq_gt_lt + freq_freq + freq_pop + sift + polyphen + regulatory +); + +our %FILTER_SHORTCUTS = ( + upstream => { + '5KB_upstream_variant' => 1, + '2KB_upstream_variant' => 1, + }, + downstream => { + '5KB_downstream_variant' => 1, + '2KB_downstream_variant' => 1, + '500B_downstream_variant' => 1, + }, + utr => { + '5_prime_UTR_variant' => 1, + '3_prime_UTR_variant' => 1, + }, + splice => { + splice_donor_variant => 1, + splice_acceptor_variant => 1, + splice_region_variant => 1, + }, + coding_change => { + stop_lost => 1, + stop_gained => 1, + missense_variant => 1, + frameshift_variant => 1, + inframe_insertion => 1, + inframe_deletion => 1, + }, + regulatory => { + regulatory_region_variant => 1, + TF_binding_site_variant => 1, + }, +); + +# parses a line of input, returns VF object(s) +sub parse_line { + my $config = shift; + my $line = shift; + + # find out file format - will only do this on first line + if(!defined($config->{format}) || (defined($config->{format}) && $config->{format} eq 'guess')) { + $config->{format} = &detect_format($line); + debug("Detected format of input file as ", $config->{format}) unless defined($config->{quiet}); + + # HGVS and ID formats need DB + die("ERROR: Can't use ".uc($config->{format})." format in offline mode") if $config->{format} =~ /id|hgvs/ && defined($config->{offline}); + + # force certain options if format is VEP output + if($config->{format} eq 'vep') { + $config->{no_consequence} = 1; + delete $config->{regulatory}; + debug("Forcing no consequence calculation") unless defined($config->{quiet}); + } + } + + # check that format is vcf when using --individual + die("ERROR: --individual only compatible with VCF input files\n") if defined($config->{individual}) && $config->{format} ne 'vcf'; + + my $parse_method = 'parse_'.$config->{format}; + $parse_method =~ s/vep_//; + my $method_ref = \&$parse_method; + + my $vfs = &$method_ref($config, $line); + + $vfs = add_lrg_mappings($config, $vfs) if defined($config->{lrg}); + + return $vfs; +} + +# sub-routine to detect format of input +sub detect_format { + my $line = shift; + my @data = split /\s+/, $line; + + # HGVS: ENST00000285667.3:c.1047_1048insC + if ( + scalar @data == 1 && + $data[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i + ) { + return 'hgvs'; + } + + # variant identifier: rs123456 + elsif ( + scalar @data == 1 + ) { + return 'id'; + } + + # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ + elsif ( + $data[0] =~ /(chr)?\w+/ && + $data[1] =~ /^\d+$/ && + $data[3] =~ /^[ACGTN-]+$/i && + $data[4] =~ /^([\.ACGTN-]+\,?)+$/i + ) { + return 'vcf'; + } + + # pileup: chr1 60 T A + elsif ( + $data[0] =~ /(chr)?\w+/ && + $data[1] =~ /^\d+$/ && + $data[2] =~ /^[\*ACGTN-]+$/i && + $data[3] =~ /^[\*ACGTNRYSWKM\+\/-]+$/i + ) { + return 'pileup'; + } + + # ensembl: 20 14370 14370 A/G + + elsif ( + $data[0] =~ /\w+/ && + $data[1] =~ /^\d+$/ && + $data[2] =~ /^\d+$/ && + $data[3] =~ /[ACGTN-]+\/[ACGTN-]+/i + ) { + return 'ensembl'; + } + + # vep output: ID 1:142849179 - - - - INTERGENIC + elsif ( + $data[0] =~ /\w+/ && + $data[1] =~ /^\w+?\:\d+(\-\d+)*$/ && + scalar @data == 14 + ) { + return 'vep'; + } + + else { + die("ERROR: Could not detect input file format\n"); + } +} + +# parse a line of Ensembl format input into a variation feature object +sub parse_ensembl { + my $config = shift; + my $line = shift; + + my ($chr, $start, $end, $allele_string, $strand, $var_name) = split /\s+/, $line; + + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $allele_string, + strand => $strand, + map_weight => 1, + adaptor => $config->{vfa}, + variation_name => $var_name, + chr => $chr, + }); + + return [$vf]; +} + +# parse a line of VCF input into a variation feature object +sub parse_vcf { + my $config = shift; + my $line = shift; + + my @data = split /\s+/, $line; + + # non-variant + my $non_variant = 0; + + if($data[4] eq '.') { + if(defined($config->{allow_non_variant})) { + $non_variant = 1; + } + else { + return []; + } + } + + # get relevant data + my ($chr, $start, $end, $ref, $alt) = ($data[0], $data[1], $data[1], $data[3], $data[4]); + + # some VCF files have a GRCh37 pos defined in GP flag in INFO column + # if user has requested, we can use that as the position instead + if(defined $config->{gp}) { + $chr = undef; + $start = undef; + + foreach my $pair(split /\;/, $data[7]) { + my ($key, $value) = split /\=/, $pair; + if($key eq 'GP') { + ($chr, $start) = split /\:/, $value; + $end = $start; + } + } + + unless(defined($chr) and defined($start)) { + warn "No GP flag found in INFO column" unless defined $config->{quiet}; + return []; + } + } + + # adjust end coord + $end += (length($ref) - 1); + + # structural variation + if((defined($data[7]) && $data[7] =~ /SVTYPE/) || $alt =~ /\<|\[|\]|\>/) { + + # parse INFO field + my %info = (); + + foreach my $bit(split /\;/, $data[7]) { + my ($key, $value) = split /\=/, $bit; + $info{$key} = $value; + } + + # like indels, SVs have the base before included for reference + $start++; + + # work out the end coord + if(defined($info{END})) { + $end = $info{END}; + } + elsif(defined($info{SVLEN})) { + $end = $start + abs($info{SVLEN}) - 1; + } + + # check for imprecise breakpoints + my ($min_start, $max_start, $min_end, $max_end); + + if(defined($info{CIPOS})) { + my ($low, $high) = split /\,/, $info{CIPOS}; + $min_start = $start + $low; + $max_start = $start + $high; + } + + if(defined($info{CIEND})) { + my ($low, $high) = split /\,/, $info{CIEND}; + $min_end = $end + $low; + $max_end = $end + $high; + } + + # get type + my $type = $info{SVTYPE}; + my $so_term; + + if(defined($type)) { + # convert to SO term + my %terms = ( + INS => 'insertion', + DEL => 'deletion', + TDUP => 'tandem_duplication', + DUP => 'duplication' + ); + + $so_term = defined $terms{$type} ? $terms{$type} : $type; + } + + my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({ + start => $start, + inner_start => $max_start, + outer_start => $min_start, + end => $end, + inner_end => $min_end, + outer_end => $max_end, + strand => 1, + adaptor => $config->{svfa}, + variation_name => $data[2] eq '.' ? undef : $data[2], + chr => $chr, + class_SO_term => $so_term, + }); + + return [$svf]; + } + + # normal variation + else { + # find out if any of the alt alleles make this an insertion or a deletion + my ($is_indel, $is_sub, $ins_count, $total_count); + foreach my $alt_allele(split /\,/, $alt) { + $is_indel = 1 if $alt_allele =~ /D|I/; + $is_indel = 1 if length($alt_allele) != length($ref); + $is_sub = 1 if length($alt_allele) == length($ref); + $ins_count++ if length($alt_allele) > length($ref); + $total_count++; + } + + # multiple alt alleles? + if($alt =~ /\,/) { + if($is_indel) { + my @alts; + + if($alt =~ /D|I/) { + foreach my $alt_allele(split /\,/, $alt) { + # deletion (VCF <4) + if($alt_allele =~ /D/) { + push @alts, '-'; + } + + elsif($alt_allele =~ /I/) { + $alt_allele =~ s/^I//g; + push @alts, $alt_allele; + } + } + } + + else { + $ref = substr($ref, 1) || '-'; + $start++; + + foreach my $alt_allele(split /\,/, $alt) { + $alt_allele = substr($alt_allele, 1); + $alt_allele = '-' if $alt_allele eq ''; + push @alts, $alt_allele; + } + } + + $alt = join "/", @alts; + } + + else { + # for substitutions we just need to replace ',' with '/' in $alt + $alt =~ s/\,/\//g; + } + } + + elsif($is_indel) { + # deletion (VCF <4) + if($alt =~ /D/) { + my $num_deleted = $alt; + $num_deleted =~ s/\D+//g; + $end += $num_deleted - 1; + $alt = "-"; + $ref .= ("N" x ($num_deleted - 1)) unless length($ref) > 1; + } + + # insertion (VCF <4) + elsif($alt =~ /I/) { + $ref = '-'; + $alt =~ s/^I//g; + $start++; + } + + # insertion or deletion (VCF 4+) + elsif(substr($ref, 0, 1) eq substr($alt, 0, 1)) { + + # chop off first base + $ref = substr($ref, 1) || '-'; + $alt = substr($alt, 1) || '-'; + + $start++; + } + } + + # create VF object + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $non_variant ? $ref : $ref.'/'.$alt, + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + variation_name => $data[2] eq '.' ? undef : $data[2], + chr => $chr, + }); + + # flag as non-variant + $vf->{non_variant} = 1 if $non_variant; + + # individuals? + if(defined($config->{individual})) { + my @alleles = split /\//, $ref.'/'.$alt; + + my @return; + + foreach my $ind(keys %{$config->{ind_cols}}) { + + # get alleles present in this individual + my @bits; + my $gt = (split /\:/, $data[$config->{ind_cols}->{$ind}])[0]; + + my $phased = ($gt =~ /\|/ ? 1 : 0); + + foreach my $bit(split /\||\/|\\/, $gt) { + push @bits, $alleles[$bit] unless $bit eq '.'; + } + + # shallow copy VF + my $vf_copy = { %$vf }; + bless $vf_copy, ref($vf); + + # get non-refs + my %non_ref = map {$_ => 1} grep {$_ ne $ref} @bits; + + # construct allele_string + if(scalar keys %non_ref) { + $vf_copy->{allele_string} = $ref."/".(join "/", keys %non_ref); + } + else { + $vf_copy->{allele_string} = $ref; + $vf_copy->{non_variant} = 1; + } + + # store phasing info + $vf_copy->{phased} = defined($config->{phased} ? 1 : $phased); + + # store GT + $vf_copy->{genotype} = \@bits; + + # store individual name + $vf_copy->{individual} = $ind; + + push @return, $vf_copy; + } + + return \@return; + } + else { + return [$vf]; + } + } +} + +# parse a line of pileup input into variation feature objects +sub parse_pileup { + my $config = shift; + my $line = shift; + + my @data = split /\s+/, $line; + + # pileup can produce more than one VF per line + my @return; + + # normal variant + if($data[2] ne "*"){ + my $var; + + if($data[3] =~ /^[A|C|G|T]$/) { + $var = $data[3]; + } + else { + ($var = unambiguity_code($data[3])) =~ s/$data[2]//ig; + } + + for my $alt(split //, $var){ + push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $data[1], + end => $data[1], + allele_string => $data[2].'/'.$alt, + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $data[0], + }); + } + } + + # in/del + else { + my %tmp_hash = map {$_ => 1} split /\//, $data[3]; + my @genotype = keys %tmp_hash; + + foreach my $allele(@genotype){ + if(substr($allele,0,1) eq "+") { #ins + push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $data[1] + 1, + end => $data[1], + allele_string => '-/'.substr($allele, 1), + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $data[0], + }); + } + elsif(substr($allele,0,1) eq "-"){ #del + push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $data[1] + 1, + end => $data[1] + length(substr($allele, 1)), + allele_string => substr($allele, 1).'/-', + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $data[0], + }); + } + elsif($allele ne "*"){ + warn("WARNING: invalid pileup indel genotype: $line\n") unless defined $config->{quiet}; + } + } + } + + return \@return; +} + +# parse a line of HGVS input into a variation feature object +sub parse_hgvs { + my $config = shift; + my $line = shift; + + my $vf; + + # not all hgvs notations are supported yet, so we have to wrap it in an eval + eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{sa}, $config->{ta}) }; + + if((!defined($vf) || (defined $@ && length($@) > 1)) && defined($config->{coordinator})) { + eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{ofsa}, $config->{ofta}) }; + } + + if(!defined($vf) || (defined $@ && length($@) > 1)) { + warn("WARNING: Unable to parse HGVS notation \'$line\'\n$@") unless defined $config->{quiet}; + return []; + } + + # get whole chromosome slice + my $slice = $vf->slice->adaptor->fetch_by_region($vf->slice->coord_system->name, $vf->slice->seq_region_name); + + $vf = $vf->transfer($slice); + + # name it after the HGVS + $vf->{variation_name} = $line; + + # add chr attrib + $vf->{chr} = $vf->slice->seq_region_name; + + return [$vf]; +} + +# parse a variation identifier e.g. a dbSNP rsID +sub parse_id { + my $config = shift; + my $line = shift; + + my $v_obj = $config->{va}->fetch_by_name($line); + + return [] unless defined $v_obj; + + my @vfs = @{$v_obj->get_all_VariationFeatures}; + delete $_->{dbID} for @vfs; + delete $_->{overlap_consequences} for @vfs; + $_->{chr} = $_->seq_region_name for @vfs; + + return \@vfs; +} + +# parse a line of VEP output +sub parse_vep { + my $config = shift; + my $line = shift; + + my @data = split /\t/, $line; + + my ($chr, $start, $end) = split /\:|\-/, $data[1]; + $end ||= $start; + + # might get allele string from ID + my $allele_string; + + if($data[0] =~ /^\w\_\w\_\w$/) { + my @split = split /\_/, $data[0]; + $allele_string = $split[-1] if $split[-1] =~ /[ACGTN-]+\/[ACGTN-]+/; + } + + $allele_string ||= 'N/'.($data[6] =~ /intergenic/ ? 'N' : $data[2]); + + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $allele_string, + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $chr, + variation_name => $data[0], + }); + + return [$vf]; +} + + + +# converts to VCF format +sub convert_to_vcf { + my $config = shift; + my $vf = shift; + + # look for imbalance in the allele string + my %allele_lengths; + my @alleles = split /\//, $vf->allele_string; + + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $allele_lengths{length($allele)} = 1; + } + + # in/del/unbalanced + if(scalar keys %allele_lengths > 1) { + + # we need the ref base before the variation + # default to N in case we can't get it + my $prev_base = 'N'; + + unless(defined($config->{cache})) { + my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); + $prev_base = $slice->seq if defined($slice); + } + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = $prev_base.$alleles[$i]; + } + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start - 1, + $vf->variation_name, + shift @alleles, + (join ",", @alleles), + '.', '.', '.' + ]; + + } + + # balanced sub + else { + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + $vf->variation_name, + shift @alleles, + (join ",", @alleles), + '.', '.', '.' + ]; + } +} + + +# tries to map a VF to the LRG coordinate system +sub add_lrg_mappings { + my $config = shift; + my $vfs = shift; + + my @new_vfs; + + foreach my $vf(@$vfs) { + + # add the unmapped VF to the array + push @new_vfs, $vf; + + # make sure the VF has an attached slice + $vf->{slice} ||= get_slice($config, $vf->{chr}); + next unless defined($vf->{slice}); + + # transform LRG <-> chromosome + my $new_vf = $vf->transform($vf->{slice}->coord_system->name eq 'lrg' ? 'chromosome' : 'lrg'); + + # add it to the array if transformation worked + if(defined($new_vf)) { + + # update new VF's chr entry + $new_vf->{chr} = $new_vf->seq_region_name; + push @new_vfs, $new_vf; + } + } + + return \@new_vfs; +} + + +# wrapper for whole_genome_fetch and vf_to_consequences +# takes config and a listref of VFs, returns listref of line hashes for printing +sub get_all_consequences { + my $config = shift; + my $listref = shift; + + if ($config->{extra}) { + eval "use Plugin qw($config);" + } + + # initialize caches + $config->{$_.'_cache'} ||= {} for qw(tr rf slice); + + # build hash + my %vf_hash; + push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref; + + my @non_variant = grep {defined($_->{non_variant})} @$listref; + debug("Skipping ".(scalar @non_variant)." non-variant loci\n") unless defined($config->{quiet}); + + # get regions + my $regions = ®ions_from_hash($config, \%vf_hash); + my $trim_regions = $regions; + + # get trimmed regions - allows us to discard out-of-range transcripts + # when using cache + #if(defined($config->{cache})) { + # my $tmp = $config->{cache}; + # delete $config->{cache}; + # $trim_regions = regions_from_hash($config, \%vf_hash); + # $config->{cache} = $tmp; + #} + + # prune caches + prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); + prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); + + # get chr list + my %chrs = map {$_->{chr} => 1} @$listref; + + my $fetched_tr_count = 0; + $fetched_tr_count = fetch_transcripts($config, $regions, $trim_regions) + unless defined($config->{no_consequences}); + + my $fetched_rf_count = 0; + $fetched_rf_count = fetch_regfeats($config, $regions, $trim_regions) + if defined($config->{regulatory}) + && !defined($config->{no_consequences}); + + # check we can use MIME::Base64 + if(defined($config->{fork})) { + eval q{ use MIME::Base64; }; + + if($@) { + debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); + delete $config->{fork}; + } + } + + my (@temp_array, @return, @pids); + if(defined($config->{fork})) { + my $size = scalar @$listref; + + while(my $tmp_vf = shift @$listref) { + push @temp_array, $tmp_vf; + + # fork + if(scalar @temp_array >= ($size / $config->{fork}) || scalar @$listref == 0) { + + my $pid = fork; + + if(!defined($pid)) { + debug("WARNING: Failed to fork - will attempt to continue without forking") unless defined($config->{quiet}); + push @temp_array, @$listref; + push @return, @{vf_list_to_cons($config, \@temp_array, $regions)}; + last; + } + elsif($pid) { + push @pids, $pid; + @temp_array = (); + } + elsif($pid == 0) { + + $config->{forked} = $$; + $config->{quiet} = 1; + + # redirect STDERR to PARENT so we can catch errors + *STDERR = *PARENT; + + my $cons = vf_list_to_cons($config, \@temp_array, $regions); + + # what we're doing here is sending a serialised hash of the + # results through to the parent process through the socket. + # This is then thawed by the parent process. + # $$, or the PID, is added so that the input can be sorted + # back into the correct order for output + print PARENT $$." ".encode_base64(freeze($_), "\t")."\n" for @$cons; + + # some plugins may cache stuff, check for this and try and + # reconstitute it into parent's plugin cache + foreach my $plugin(@{$config->{plugins}}) { + + next unless defined($plugin->{has_cache}); + + # delete unnecessary stuff and stuff that can't be serialised + delete $plugin->{$_} for qw(config feature_types variant_feature_types version feature_types_wanted variant_feature_types_wanted params); + print PARENT $$." PLUGIN ".ref($plugin)." ".encode_base64(freeze($plugin), "\t")."\n"; + } + + # we need to tell the parent this child is finished + # otherwise it keeps listening + print PARENT "DONE $$\n"; + close PARENT; + + exit(0); + } + } + } + + debug("Calculating consequences") unless defined($config->{quiet}); + + my $fh = $config->{out_file_handle}; + my $done_processes = 0; + my $done_vars = 0; + my $total_size = $size; + my $pruned_count; + + # create a hash to store the returned data by PID + # this means we can sort it correctly on output + my %by_pid; + + # read child input + while() { + + # child finished + if(/^DONE/) { + $done_processes++; + last if $done_processes == scalar @pids; + } + + # variant finished / progress indicator + elsif(/^BUMP/) { + progress($config, ++$done_vars, $total_size);# if $pruned_count == scalar @pids; + } + + # output + elsif(/^\-?\d+ /) { + + # plugin + if(/^\-?\d+ PLUGIN/) { + + m/^(\-?\d+) PLUGIN (\w+) /; + my ($pid, $plugin) = ($1, $2); + + # remove the PID + s/^\-?\d+ PLUGIN \w+ //; + chomp; + + my $tmp = thaw(decode_base64($_)); + + next unless defined($plugin); + + # copy data to parent plugin + my ($parent_plugin) = grep {ref($_) eq $plugin} @{$config->{plugins}}; + + next unless defined($parent_plugin); + + merge_hashes($parent_plugin, $tmp); + } + + else { + # grab the PID + m/^(\-?\d+)\s/; + my $pid = $1; + die "ERROR: Could not parse forked PID from line $_" unless defined($pid); + + # remove the PID + s/^\-?\d+\s//; + chomp; + + # decode and thaw "output" from forked process + push @{$by_pid{$pid}}, thaw(decode_base64($_)); + } + } + + elsif(/^PRUNED/) { + s/PRUNED //g; + chomp; + $pruned_count++; + $total_size += $_; + } + + elsif(/^DEBUG/) { + print STDERR; + } + + # something's wrong + else { + # kill the other pids + kill(15, $_) for @pids; + die("\nERROR: Forked process failed\n$_\n"); + } + } + + end_progress($config); + + debug("Writing output") unless defined($config->{quiet}); + + waitpid($_, 0) for @pids; + + # add the sorted data to the return array + push @return, @{$by_pid{$_} || []} for @pids; + } + + # no forking + else { + push @return, @{vf_list_to_cons($config, $listref, $regions)}; + } + + if(defined($config->{debug})) { + eval q{use Devel::Size qw(total_size)}; + my $mem = memory(); + my $tot; + $tot += $_ for @$mem; + + if($tot > 1000000) { + $tot = sprintf("%.2fGB", $tot / (1024 * 1024)); + } + + elsif($tot > 1000) { + $tot = sprintf("%.2fMB", $tot / 1024); + } + + my $mem_diff = mem_diff($config); + debug( + "LINES ", $config->{line_number}, + "\tMEMORY $tot ", (join " ", @$mem), + "\tDIFF ", (join " ", @$mem_diff), + "\tCACHE ", total_size($config->{tr_cache}). + "\tRF ", total_size($config->{rf_cache}), + "\tVF ", total_size(\%vf_hash), + ); + #exit(0) if grep {$_ < 0} @$mem_diff; + } + + return \@return; +} + +sub vf_list_to_cons { + my $config = shift; + my $listref = shift; + my $regions = shift; + + # get non-variants + my @non_variants = grep {$_->{non_variant}} @$listref; + + my %vf_hash = (); + push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref; + + # check existing VFs + &check_existing_hash($config, \%vf_hash) if defined($config->{check_existing});# && scalar @$listref > 10; + + # get overlapping SVs + &check_svs_hash($config, \%vf_hash) if defined($config->{check_svs}); + + # if we are forked, we can trim off some stuff + if(defined($config->{forked})) { + my $tmp = $config->{cache}; + delete $config->{cache}; + $regions = regions_from_hash($config, \%vf_hash); + $config->{cache} = $tmp; + + # prune caches + my $new_count = 0; + + $new_count += prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); + $new_count += prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); + + print PARENT "PRUNED $new_count\n"; + } + + my @return; + + foreach my $chr(sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %vf_hash) { + my $finished_vfs = whole_genome_fetch($config, $chr, \%vf_hash); + + # non-variants? + if(scalar @non_variants) { + push @$finished_vfs, grep {$_->{chr} eq $chr} @non_variants; + + # need to re-sort + @$finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @$finished_vfs; + } + + debug("Calculating consequences") unless defined($config->{quiet}); + + my $vf_count = scalar @$finished_vfs; + my $vf_counter = 0; + + while(my $vf = shift @$finished_vfs) { + progress($config, $vf_counter++, $vf_count) unless $vf_count == 1; + + my $filter_ok = 1; + + # filtered output + if(defined($config->{filter})) { + $filter_ok = filter_by_consequence($config, $vf); + $config->{filter_count} += $filter_ok; + } + + # skip filtered lines + next unless $filter_ok; + + # original output + if(defined($config->{original})) { + push @return, $vf->{_line}; + } + + # GVF output + elsif(defined($config->{gvf})) { + $vf->source("User"); + + $config->{gvf_id} ||= 1; + + # get custom annotation + my $custom_annotation = defined($config->{custom}) ? get_custom_annotation($config, $vf) : {}; + $custom_annotation->{ID} = $config->{gvf_id}++; + + + my $tmp = $vf->to_gvf( + include_consequences => defined($config->{no_consequences}) ? 0 : 1, + extra_attrs => $custom_annotation, + ); + push @return, \$tmp; + } + + # VCF output + elsif(defined($config->{vcf})) { + + # convert to VCF, otherwise get line + my $line = $config->{format} eq 'vcf' ? [split /\s+/, $vf->{_line}] : convert_to_vcf($config, $vf); + + if(!defined($line->[7]) || $line->[7] eq '.') { + $line->[7] = ''; + } + + # get all the lines the normal way + # and process them into VCF-compatible string + my $string = 'CSQ='; + + foreach my $line(@{vf_to_consequences($config, $vf)}) { + + # use the field list (can be user-defined by setting --fields) + for my $col(@{$config->{fields}}) { + + # skip fields already represented in the VCF + next if $col eq 'Uploaded_variation' or $col eq 'Location' or $col eq 'Extra'; + + # search for data in main line hash as well as extra field + my $data = defined $line->{$col} ? $line->{$col} : $line->{Extra}->{$col}; + + # "-" means null for everything except the Allele field (confusing...) + $data = undef if defined($data) and $data eq '-' and $col ne 'Allele'; + $data =~ s/\,/\&/g if defined $data; + $string .= defined($data) ? $data : ''; + $string .= '|'; + } + + $string =~ s/\|$//; + $string .= ','; + } + + $string =~ s/\,$//; + + if(!defined($config->{no_consequences}) && $string ne 'CSQ=') { + $line->[7] .= ($line->[7] ? ';' : '').$string; + } + + # get custom annotation + if(defined($config->{custom}) && scalar @{$config->{custom}}) { + my $custom_annotation = get_custom_annotation($config, $vf); + foreach my $key(keys %{$custom_annotation}) { + $line->[7] .= ($line->[7] ? ';' : '').$key.'='.$custom_annotation->{$key}; + } + } + + my $tmp = join "\t", @$line; + push @return, \$tmp; + } + + # no consequence output from vep input + elsif(defined($config->{no_consequences}) && $config->{format} eq 'vep') { + + my $line = [split /\s+/, $vf->{_line}]; + + if($line->[13] eq '-') { + $line->[13] = ''; + } + + # get custom annotation + if(defined($config->{custom})) { + my $custom_annotation = get_custom_annotation($config, $vf); + foreach my $key(keys %{$custom_annotation}) { + $line->[13] .= ($line->[13] ? ';' : '').$key.'='.$custom_annotation->{$key}; + } + } + + my $tmp = join "\t", @$line; + push @return, \$tmp; + } + + # normal output + else { + push @return, @{vf_to_consequences($config, $vf)}; + } + + print PARENT "BUMP\n" if defined($config->{forked}); + } + + end_progress($config) unless scalar @$listref == 1; + } + + return \@return; +} + +# takes a variation feature and returns ready to print consequence information +sub vf_to_consequences { + my $config = shift; + my $vf = shift; + + # use a different method for SVs + return svf_to_consequences($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); + + my @return = (); + + # method name for consequence terms + my $term_method = $config->{terms}.'_term'; + + # find any co-located existing VFs + $vf->{existing} ||= find_existing($config, $vf) if defined $config->{check_existing}; + + # skip based on frequency checks? + if(defined($config->{check_frequency}) && defined($vf->{existing}) && $vf->{existing} ne '-' && defined($config->{va})) { + return [] unless grep {$_} map {check_frequencies($config, $_)} reverse split(/\,/, $vf->{existing}); + $vf->{freqs} = $config->{filtered_freqs}; + } + + # force empty hash into object's transcript_variations if undefined from whole_genome_fetch + # this will stop the API trying to go off and fill it again + $vf->{transcript_variations} ||= {} if defined $config->{whole_genome}; + + # regulatory stuff + if(!defined $config->{coding_only} && defined $config->{regulatory}) { + + for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { + + my $rf = $rfv->regulatory_feature; + + my $base_line = { + Feature_type => 'RegulatoryFeature', + Feature => $rf->stable_id, + }; + + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + $base_line->{Extra}->{CELL_TYPE} = join ",", + map {$_.':'.$rf->{cell_types}->{$_}} + grep {$rf->{cell_types}->{$_}} + @{$config->{cell_type}}; + + $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; + } + + # this currently always returns 'RegulatoryFeature', so we ignore it for now + #$base_line->{Extra}->{REG_FEAT_TYPE} = $rf->feature_type->name; + + for my $rfva (@{ $rfv->get_all_alternate_RegulatoryFeatureVariationAlleles }) { + + my $line = init_line($config, $vf, $base_line); + + $line->{Allele} = $rfva->variation_feature_seq; + $line->{Consequence} = join ',', + map { $_->$term_method || $_->SO_term } + @{ $rfva->get_all_OverlapConsequences }; + + $line = run_plugins($rfva, $line, $config); + + push @return, $line; + } + } + + for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { + + my $mf = $mfv->motif_feature; + + # check that the motif has a binding matrix, if not there's not + # much we can do so don't return anything + + next unless defined $mf->binding_matrix; + + my $matrix = $mf->binding_matrix->description.' '.$mf->display_label; + $matrix =~ s/\s+/\_/g; + + my $base_line = { + Feature_type => 'MotifFeature', + Feature => $mf->binding_matrix->name, + Extra => { + MOTIF_NAME => $matrix, + } + }; + + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + $base_line->{Extra}->{CELL_TYPE} = join ",", + map {$_.':'.$mf->{cell_types}->{$_}} + grep {$mf->{cell_types}->{$_}} + @{$config->{cell_type}}; + + $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; + } + + for my $mfva (@{ $mfv->get_all_alternate_MotifFeatureVariationAlleles }) { + + my $line = init_line($config, $vf, $base_line); + + $line->{Extra}->{MOTIF_POS} = $mfva->motif_start if defined $mfva->motif_start; + $line->{Extra}->{HIGH_INF_POS} = ($mfva->in_informative_position ? 'Y' : 'N'); + + my $delta = $mfva->motif_score_delta if $mfva->variation_feature_seq =~ /^[ACGT]+$/; + + $line->{Extra}->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta; + + $line->{Allele} = $mfva->variation_feature_seq; + $line->{Consequence} = join ',', + map { $_->$term_method || $_->SO_term } + @{ $mfva->get_all_OverlapConsequences }; + + $line = run_plugins($mfva, $line, $config); + + push @return, $line; + } + } + } + + # get TVs + my $tvs = $vf->get_all_TranscriptVariations; + + # only most severe + if(defined($config->{most_severe}) || defined($config->{summary})) { + + my $line = init_line($config, $vf); + + if(defined($config->{summary})) { + $line->{Consequence} = join ",", @{$vf->consequence_type($config->{terms}) || $vf->consequence_type}; + } + else { + $line->{Consequence} = $vf->display_consequence($config->{terms}) || $vf->display_consequence; + } + + push @return, $line; + } + + # pass a true argument to get_IntergenicVariation to stop it doing a reference allele check + # (to stay consistent with the rest of the VEP) + elsif ((my $iv = $vf->get_IntergenicVariation(1)) && !defined($config->{no_intergenic})) { + + for my $iva (@{ $iv->get_all_alternate_IntergenicVariationAlleles }) { + + my $line = init_line($config, $vf); + + $line->{Allele} = $iva->variation_feature_seq; + + my $cons = $iva->get_all_OverlapConsequences->[0]; + + $line->{Consequence} = $cons->$term_method || $cons->SO_term; + + $line = run_plugins($iva, $line, $config); + + push @return, $line; + } + } + + # user wants only one conseqeunce per gene + elsif(defined($config->{per_gene})) { + + # sort the TVA objects into a hash by gene + my %by_gene; + + foreach my $tv(@$tvs) { + next if(defined $config->{coding_only} && !($tv->affects_cds)); + + my $gene = $tv->transcript->{_gene_stable_id} || $config->{ga}->fetch_by_transcript_stable_id($tv->transcript->stable_id)->stable_id; + + push @{$by_gene{$gene}}, @{$tv->get_all_alternate_TranscriptVariationAlleles}; + } + + foreach my $gene(keys %by_gene) { + my ($lowest, $lowest_tva); + + # at the moment this means the one that comes out last will be picked + # if there is more than one TVA with the same rank of consequence + foreach my $tva(@{$by_gene{$gene}}) { + foreach my $oc(@{$tva->get_all_OverlapConsequences}) { + if(!defined($lowest) || $oc->rank < $lowest) { + $lowest = $oc->rank; + $lowest_tva = $tva; + } + } + } + + push @return, tva_to_line($config, $lowest_tva); + } + } + + else { + foreach my $tv(@$tvs) { + next if(defined $config->{coding_only} && !($tv->affects_cds)); + + push @return, map {tva_to_line($config, $_)} @{$tv->get_all_alternate_TranscriptVariationAlleles}; + + undef $tv->{$_} for keys %$tv; + } + } + + return \@return; +} + +# get consequences for a structural variation feature +sub svf_to_consequences { + my $config = shift; + my $svf = shift; + + my @return = (); + + my $term_method = $config->{terms}.'_term'; + + if(defined $config->{whole_genome}) { + $svf->{transcript_structural_variations} ||= []; + $svf->{regulation_structural_variations}->{$_} ||= [] for @REG_FEAT_TYPES; + } + + if ((my $iv = $svf->get_IntergenicStructuralVariation(1)) && !defined($config->{no_intergenic})) { + + for my $iva (@{ $iv->get_all_alternate_IntergenicStructuralVariationAlleles }) { + + my $line = init_line($config, $svf); + + $line->{Allele} = '-'; + + my $cons = $iva->get_all_OverlapConsequences->[0]; + + $line->{Consequence} = $cons->$term_method || $cons->SO_term; + + $line = run_plugins($iva, $line, $config); + + push @return, $line; + } + } + + foreach my $svo(@{$svf->get_all_StructuralVariationOverlaps}) { + + next if $svo->isa('Bio::EnsEMBL::Variation::IntergenicStructuralVariation'); + + my $feature = $svo->feature; + + # get feature type + my $feature_type = (split '::', ref($feature))[-1]; + + my $base_line = { + Feature_type => $feature_type, + Feature => $feature->stable_id, + Allele => $svf->class_SO_term, + }; + + if($svo->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariation')) { + $base_line->{cDNA_position} = format_coords($svo->cdna_start, $svo->cdna_end); + $base_line->{CDS_position} = format_coords($svo->cds_start, $svo->cds_end); + $base_line->{Protein_position} = format_coords($svo->translation_start, $svo->translation_end); + } + + foreach my $svoa(@{$svo->get_all_StructuralVariationOverlapAlleles}) { + my $line = init_line($config, $svf, $base_line); + + $line->{Consequence} = join ",", + #map {s/feature/$feature_type/e; $_} + map {$_->$term_method} + sort {$a->rank <=> $b->rank} + @{$svoa->get_all_OverlapConsequences}; + + # work out overlap amounts + my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1]; + my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0]; + my $overlap_length = ($overlap_end - $overlap_start) + 1; + my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1)); + + $line->{Extra}->{OverlapBP} = $overlap_length if $overlap_length > 0; + $line->{Extra}->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0; + + add_extra_fields($config, $line, $svoa); + + push @return, $line; + } + } + + return \@return; +} + +# run all of the configured plugins on a VariationFeatureOverlapAllele instance +# and store any results in the provided line hash +sub run_plugins { + + my ($bvfoa, $line_hash, $config) = @_; + + my $skip_line = 0; + + for my $plugin (@{ $config->{plugins} || [] }) { + + # check that this plugin is interested in this type of variation feature + + if ($plugin->check_variant_feature_type(ref $bvfoa->base_variation_feature)) { + + # check that this plugin is interested in this type of feature + + if ($plugin->check_feature_type(ref $bvfoa->feature || 'Intergenic')) { + + eval { + my $plugin_results = $plugin->run($bvfoa, $line_hash); + + if (defined $plugin_results) { + if (ref $plugin_results eq 'HASH') { + for my $key (keys %$plugin_results) { + $line_hash->{Extra}->{$key} = $plugin_results->{$key}; + } + } + else { + warn "Plugin '".(ref $plugin)."' did not return a hashref, output ignored!\n"; + } + } + else { + # if a plugin returns undef, that means it want to filter out this line + $skip_line = 1; + } + }; + if ($@) { + warn "Plugin '".(ref $plugin)."' went wrong: $@"; + } + + # there's no point running any other plugins if we're filtering this line, + # because the first plugin to skip the line wins, so we might as well last + # out of the loop now and avoid any unnecessary computation + + last if $skip_line; + } + } + } + + return $skip_line ? undef : $line_hash; +} + +# turn a TranscriptVariationAllele into a line hash +sub tva_to_line { + my $config = shift; + my $tva = shift; + + my $tv = $tva->transcript_variation; + my $t = $tv->transcript; + + # method name for consequence terms + my $term_method = $config->{terms}.'_term'; + + my $base_line = { + Feature_type => 'Transcript', + Feature => (defined $t ? $t->stable_id : undef), + cDNA_position => format_coords($tv->cdna_start, $tv->cdna_end), + CDS_position => format_coords($tv->cds_start, $tv->cds_end), + Protein_position => format_coords($tv->translation_start, $tv->translation_end), + Allele => $tva->variation_feature_seq, + Amino_acids => $tva->pep_allele_string, + Codons => $tva->display_codon_allele_string, + Consequence => join ",", map {$_->$term_method || $_->SO_term} sort {$a->rank <=> $b->rank} @{$tva->get_all_OverlapConsequences}, + }; + + my $line = init_line($config, $tva->variation_feature, $base_line); + + # HGVS + if(defined $config->{hgvs}) { + my $hgvs_t = $tva->hgvs_transcript; + my $hgvs_p = $tva->hgvs_protein; + + $line->{Extra}->{HGVSc} = $hgvs_t if $hgvs_t; + $line->{Extra}->{HGVSp} = $hgvs_p if $hgvs_p; + } + + foreach my $tool (qw(SIFT PolyPhen)) { + my $lc_tool = lc($tool); + + if (my $opt = $config->{$lc_tool}) { + my $want_pred = $opt =~ /^p/i; + my $want_score = $opt =~ /^s/i; + my $want_both = $opt =~ /^b/i; + + if ($want_both) { + $want_pred = 1; + $want_score = 1; + } + + next unless $want_pred || $want_score; + + my $pred_meth = $lc_tool.'_prediction'; + my $score_meth = $lc_tool.'_score'; + + my $pred = $tva->$pred_meth; + + if($pred) { + + if ($want_pred) { + $pred =~ s/\s+/\_/; + $line->{Extra}->{$tool} = $pred; + } + + if ($want_score) { + my $score = $tva->$score_meth; + + if(defined $score) { + if($want_pred) { + $line->{Extra}->{$tool} .= "($score)"; + } + else { + $line->{Extra}->{$tool} = $score; + } + } + } + } + } + } + + $line = add_extra_fields($config, $line, $tva); + + return $line; +} + +sub add_extra_fields { + my $config = shift; + my $line = shift; + my $bvfoa = shift; + + # overlapping SVs + if(defined $config->{check_svs} && defined $bvfoa->base_variation_feature->{overlapping_svs}) { + $line->{Extra}->{SV} = $bvfoa->base_variation_feature->{overlapping_svs}; + } + + # add transcript-specific fields + $line = add_extra_fields_transcript($config, $line, $bvfoa) if $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele'); + + # run plugins + $line = run_plugins($bvfoa, $line, $config); + + return $line; +} + +sub add_extra_fields_transcript { + my $config = shift; + my $line = shift; + my $tva = shift; + + my $tv = $tva->base_variation_feature_overlap; + my $tr = $tva->transcript; + + # get gene + my $gene; + + $line->{Gene} = $tr->{_gene_stable_id}; + + if(!defined($line->{Gene})) { + $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); + $line->{Gene} = $gene ? $gene->stable_id : '-'; + } + + # exon/intron numbers + if ($config->{numbers}) { + $line->{Extra}->{EXON} = $tv->exon_number if defined $tv->exon_number; + $line->{Extra}->{INTRON} = $tv->intron_number if defined $tv->intron_number; + } + + if ($config->{domains}) { + my $feats = $tv->get_overlapping_ProteinFeatures; + + my @strings; + + for my $feat (@$feats) { + my $label = $feat->analysis->display_label.':'.$feat->hseqname; + + # replace any special characters + $label =~ s/[\s;=]/_/g; + + push @strings, $label; + } + + $line->{Extra}->{DOMAINS} = join ',', @strings if @strings; + } + + # distance to transcript + if($line->{Consequence} =~ /(up|down)stream/i) { + $line->{Extra}->{DISTANCE} = $tv->distance_to_transcript; + } + + # HGNC + if(defined $config->{hgnc}) { + my $hgnc; + $hgnc = $tr->{_gene_hgnc}; + + if(!defined($hgnc)) { + if(!defined($gene)) { + $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); + } + + my @entries = grep {$_->database eq 'HGNC'} @{$gene->get_all_DBEntries()}; + if(scalar @entries) { + $hgnc = $entries[0]->display_id; + } + } + + $hgnc = undef if defined($hgnc) && $hgnc eq '-'; + + $line->{Extra}->{HGNC} = $hgnc if defined($hgnc); + } + + # CCDS + if(defined($config->{ccds})) { + my $ccds = $tr->{_ccds}; + + if(!defined($ccds)) { + my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; + $ccds = $entries[0]->display_id if scalar @entries; + } + + $ccds = undef if defined($ccds) && $ccds eq '-'; + + $line->{Extra}->{CCDS} = $ccds if defined($ccds); + } + + # refseq xref + if(defined($config->{xref_refseq})) { + my $refseq = $tr->{_refseq}; + + if(!defined($refseq)) { + my @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries}; + if(scalar @entries) { + $refseq = join ",", map {$_->display_id."-".$_->database} @entries; + } + } + + $refseq = undef if defined($refseq) && $refseq eq '-'; + + $line->{Extra}->{RefSeq} = $refseq if defined($refseq); + } + + # protein ID + if(defined $config->{protein}) { + my $protein = $tr->{_protein}; + + if(!defined($protein)) { + $protein = $tr->translation->stable_id if defined($tr->translation); + } + + $protein = undef if defined($protein) && $protein eq '-'; + + $line->{Extra}->{ENSP} = $protein if defined($protein); + } + + # canonical transcript + if(defined $config->{canonical}) { + $line->{Extra}->{CANONICAL} = 'YES' if $tr->is_canonical; + } + + return $line; +} + +# initialize a line hash +sub init_line { + my $config = shift; + my $vf = shift; + my $base_line = shift; + + my $line = { + Uploaded_variation => $vf->variation_name, + Location => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->start, $vf->end), + Existing_variation => $vf->{existing}, + Extra => {}, + }; + + # add custom info + if(defined($config->{custom}) && scalar @{$config->{custom}}) { + # merge the custom hash with the extra hash + my $custom = get_custom_annotation($config, $vf); + + for my $key (keys %$custom) { + $line->{Extra}->{$key} = $custom->{$key}; + } + } + + # individual? + $line->{Extra}->{IND} = $vf->{individual} if defined($vf->{individual}); + + # frequencies? + $line->{Extra}->{FREQS} = join ",", @{$vf->{freqs}} if defined($vf->{freqs}); + + # gmaf? + $line->{Extra}->{GMAF} = $vf->{gmaf} if defined($config->{gmaf}) && defined($vf->{gmaf}); + + # copy entries from base_line + if(defined($base_line)) { + $line->{$_} = $base_line->{$_} for keys %$base_line; + } + + return $line; +} + + +# get custom annotation for a single VF +sub get_custom_annotation { + my $config = shift; + my $vf = shift; + my $cache = shift; + + return $vf->{custom} if defined($vf->{custom}); + + my $annotation = {}; + + my $chr = $vf->{chr}; + + if(!defined($cache)) { + # spoof regions + my $regions; + $regions->{$chr} = [$vf->{start}.'-'.$vf->{end}]; + $cache = cache_custom_annotation($config, $regions, $chr); + } + + foreach my $custom(@{$config->{custom}}) { + next unless defined($cache->{$chr}->{$custom->{name}}); + + # exact type must match coords of variant exactly + if($custom->{type} eq 'exact') { + foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$vf->{start}}}) { + + next unless + $feature->{chr} eq $chr && + $feature->{start} eq $vf->{start} && + $feature->{end} eq $vf->{end}; + + $annotation->{$custom->{name}} .= $feature->{name}.','; + } + } + + # overlap type only needs to overlap, but we need to search the whole range + elsif($custom->{type} eq 'overlap') { + foreach my $pos(keys %{$cache->{$chr}->{$custom->{name}}}) { + foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$pos}}) { + + next unless + $feature->{chr} eq $chr && + $feature->{end} >= $vf->{start} && + $feature->{start} <= $vf->{end}; + + $annotation->{$custom->{name}} .= $feature->{name}.','; + } + } + } + + # trim off trailing commas + $annotation->{$custom->{name}} =~ s/\,$//g if defined($annotation->{$custom->{name}}); + } + + return $annotation; +} + +# decides whether to print a VF based on user defined consequences +sub filter_by_consequence { + my $config = shift; + my $vf = shift; + my $filters = $config->{filter}; + + # find it if we only have "no"s + my $only_nos = 0; + $only_nos = 1 if (sort {$a <=> $b} values %$filters)[-1] == 0; + + my ($yes, $no) = (0, 0); + + # get all consequences across all term types + my @types = ('SO', 'display'); + + my @cons; + push @cons, @{$vf->consequence_type($_)} for @types; + + # add regulatory consequences + if(defined($config->{regulatory})) { + foreach my $term_type(@types) { + my $term_method = $term_type.'_term'; + + for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { + for my $rfva(@{$rfv->get_all_alternate_RegulatoryFeatureVariationAlleles}) { + push @cons, map {$_->$term_method} @{ $rfva->get_all_OverlapConsequences }; + } + } + for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { + for my $mfva(@{$mfv->get_all_alternate_MotifFeatureVariationAlleles}) { + push @cons, map {$_->$term_method} @{ $mfva->get_all_OverlapConsequences }; + } + } + } + } + + foreach my $con(grep {defined($_) && defined($filters->{$_})} @cons) { + if($filters->{$con} == 1) { + $yes = 1; + } + else { + $no = 1; + } + } + + # check special case, coding + if(defined($filters->{coding})) { + if(grep {$_->affects_cds} @{$vf->get_all_TranscriptVariations}) { + if($filters->{coding} == 1) { + $yes = 1; + } + else { + $no = 1; + } + } + } + + my $ok = 0; + if($only_nos) { + $ok = 1 if !$no; + } + else { + $ok = 1 if $yes && !$no; + } + + return $ok; +} + + +# takes VFs created from input, fixes and checks various things +sub validate_vf { + my $config = shift; + my $vf = shift; + + # user specified chr skip list + return 0 if defined($config->{chr}) && !$config->{chr}->{$vf->{chr}}; + + # fix inputs + $vf->{chr} =~ s/chr//ig unless $vf->{chr} =~ /^chromosome$/i; + $vf->{chr} = 'MT' if $vf->{chr} eq 'M'; + $vf->{strand} ||= 1; + $vf->{strand} = ($vf->{strand} =~ /\-/ ? "-1" : "1"); + + # sanity checks + unless($vf->{start} =~ /^\d+$/ && $vf->{end} =~ /^\d+$/) { + warn("WARNING: Start ".$vf->{start}." or end ".$vf->{end}." coordinate invalid on line ".$config->{line_number}."\n") unless defined $config->{quiet}; + return 0; + } + + # structural variation? + return validate_svf($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); + + # uppercase allele string + $vf->{allele_string} =~ tr/[a-z]/[A-Z]/; + + unless($vf->{allele_string} =~ /([ACGT-]+\/*)+/) { + warn("WARNING: Invalid allele string ".$vf->{allele_string}." on line ".$config->{line_number}." or possible parsing error\n") unless defined $config->{quiet}; + return 0; + } + + # insertion should have start = end + 1 + if($vf->{allele_string} =~ /^\-\// && $vf->{start} != $vf->{end} + 1) { + warn( + "WARNING: Alleles look like an insertion (". + $vf->{allele_string}. + ") but coordinates are not start = end + 1 (START=". + $vf->{start}.", END=".$vf->{end}. + ") on line ".$config->{line_number}."\n" + ) unless defined($config->{quiet}); + return 0; + } + + # check length of reference matches seq length spanned + my @alleles = split /\//, $vf->{allele_string}; + my $ref_allele = shift @alleles; + my $tmp_ref_allele = $ref_allele; + $tmp_ref_allele =~ s/\-//g; + + #if(($vf->{end} - $vf->{start}) + 1 != length($tmp_ref_allele)) { + # warn( + # "WARNING: Length of reference allele (".$ref_allele. + # " length ".length($tmp_ref_allele).") does not match co-ordinates ".$vf->{start}."-".$vf->{end}. + # " on line ".$config->{line_number} + # ) unless defined($config->{quiet}); + # return 0; + #} + + # flag as unbalanced + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $vf->{indel} = 1 unless length($allele) == length($tmp_ref_allele); + } + + # check reference allele if requested + if(defined $config->{check_ref}) { + my $ok = 0; + my $slice_ref_allele; + + # insertion, therefore no ref allele to check + if($ref_allele eq '-') { + $ok = 1; + } + else { + my $slice_ref = $vf->{slice}->sub_Slice($vf->{start}, $vf->{end}, $vf->{strand}); + + if(!defined($slice_ref)) { + warn "WARNING: Could not fetch sub-slice from ".$vf->{start}."\-".$vf->{end}."\(".$vf->{strand}."\) on line ".$config->{line_number} unless defined $config->{quiet}; + } + + else { + $slice_ref_allele = $slice_ref->seq; + $ok = ($slice_ref_allele eq $ref_allele ? 1 : 0); + } + } + + if(!$ok) { + warn + "WARNING: Specified reference allele $ref_allele ", + "does not match Ensembl reference allele", + ($slice_ref_allele ? " $slice_ref_allele" : ""), + " on line ".$config->{line_number} unless defined $config->{quiet}; + return 0; + } + } + + return 1; +} + + +# validate a structural variation +sub validate_svf { + my $config = shift; + my $svf = shift; + + return 1; +} + + +# takes a hash of VFs and fetches consequences by pre-fetching overlapping transcripts +# from database and/or cache +sub whole_genome_fetch { + my $config = shift; + my $chr = shift; + my $vf_hash = shift; + + my (%vf_done, @finished_vfs, %seen_rfs); + + if(defined($config->{offline}) && !-e $config->{dir}.'/'.$chr) { + debug("No cache found for chromsome $chr") unless defined($config->{quiet}); + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}}; + } + } + + return \@finished_vfs; + } + + my $slice_cache = $config->{slice_cache}; + build_slice_cache($config, $config->{tr_cache}) unless defined($slice_cache->{$chr}); + + debug("Analyzing chromosome $chr") unless defined($config->{quiet}); + + # custom annotations + whole_genome_fetch_custom($config, $vf_hash, $chr) if defined($config->{custom}); + + # split up normal variations from SVs + my ($tmp_vf_hash, @svfs); + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { + if($vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + push @svfs, $vf; + } + else { + push @{$tmp_vf_hash->{$chr}{$chunk}{$pos}}, $vf; + } + } + } + } + + $vf_hash = $tmp_vf_hash; + + # transcript annotations + whole_genome_fetch_transcript($config, $vf_hash, $chr) + unless defined($config->{no_consequences}); + + # regulatory annotations + whole_genome_fetch_reg($config, $vf_hash, $chr) + if defined($config->{regulatory}) + && !defined($config->{no_consequences}); + + # structural variations + @finished_vfs = @{whole_genome_fetch_sv($config, \@svfs, $chr)} + if scalar @svfs; + + # sort results into @finished_vfs array + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + + # pinch slice from slice cache if we don't already have it + $_->{slice} ||= $slice_cache->{$chr} for @{$vf_hash->{$chr}{$chunk}{$pos}}; + + if(defined($config->{regulatory})) { + foreach my $type(@REG_FEAT_TYPES) { + $_->{regulation_variations}->{$type} ||= [] for @{$vf_hash->{$chr}{$chunk}{$pos}}; + } + } + + if(defined($config->{custom})) { + $_->{custom} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}}; + } + + $_->{transcript_variations} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}}; + + # add to final array + push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}}; + } + } + + # sort + @finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @finished_vfs; + + # clean hash + delete $vf_hash->{$chr}; + + return \@finished_vfs; +} + +sub whole_genome_fetch_custom { + my $config = shift; + my $vf_hash = shift; + my $chr = shift; + + return unless scalar @{$config->{custom}}; + + # create regions based on VFs instead of chunks + my $tmp_regions; + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { + push @{$tmp_regions->{$chr}}, ($vf->{start}-1).'-'.($vf->{end}+1); + } + } + } + + return unless defined($tmp_regions->{$chr}); + + # cache annotations + my $annotation_cache = cache_custom_annotation($config, $tmp_regions, $chr); + + # count and report + my $total_annotations = 0; + $total_annotations += scalar keys %{$annotation_cache->{$chr}->{$_}} for keys %{$annotation_cache->{$chr}}; + debug("Retrieved $total_annotations custom annotations (", (join ", ", map {(scalar keys %{$annotation_cache->{$chr}->{$_}}).' '.$_} keys %{$annotation_cache->{$chr}}), ")"); + + # compare annotations to variations in hash + debug("Analyzing custom annotations") unless defined($config->{quiet}); + my $total = scalar keys %{$vf_hash->{$chr}}; + my $i = 0; + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + progress($config, $i++, $total); + + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { + + $vf->{custom} = get_custom_annotation($config, $vf, $annotation_cache); + } + } + } + + end_progress($config); +} + +sub whole_genome_fetch_transcript { + my $config = shift; + my $vf_hash = shift; + my $chr = shift; + + my $tr_cache = $config->{tr_cache}; + my $slice_cache = $config->{slice_cache}; + + my $up_down_size = MAX_DISTANCE_FROM_TRANSCRIPT; + + # check we have defined regions + return unless defined($vf_hash->{$chr}) && defined($tr_cache->{$chr}); + + # copy slice from transcript to slice cache + $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); + + debug("Analyzing variants") unless defined($config->{quiet}); + + my $tr_counter = 0; + my $tr_count = scalar @{$tr_cache->{$chr}}; + + while($tr_counter < $tr_count) { + + progress($config, $tr_counter, $tr_count); + print PARENT "BUMP\n" if defined($config->{forked}); + + my $tr = $tr_cache->{$chr}->[$tr_counter++]; + + # do each overlapping VF + my $s = $tr->start - $up_down_size; + my $e = $tr->end + $up_down_size; + + # get the chunks this transcript overlaps + my %chunks; + $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size})); + map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks; + + # pointer to previous VF + # used to tell plugins this is the last variant analysed in this transcript + my $previous_vf; + + foreach my $chunk(keys %chunks) { + foreach my $vf( + grep {$_->{start} <= $e && $_->{end} >= $s} + map {@{$vf_hash->{$chr}{$chunk}{$_}}} + keys %{$vf_hash->{$chr}{$chunk}} + ) { + # pinch slice from slice cache if we don't already have it + $vf->{slice} ||= $slice_cache->{$chr}; + + my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( + -transcript => $tr, + -variation_feature => $vf, + -adaptor => $config->{tva}, + -no_ref_check => 1, + -no_transfer => 1 + ); + + # prefetching stuff here prevents doing loads at the + # end and makes progress reporting more useful + $tv->_prefetch_for_vep; + + $vf->add_TranscriptVariation($tv); + + # cache VF on the transcript if it is an unbalanced sub + push @{$tr->{indels}}, $vf if defined($vf->{indel}); + + if(defined($config->{individual})) { + + # store VF on transcript, weaken reference to avoid circularity + push @{$tr->{vfs}}, $vf; + weaken($tr->{vfs}->[-1]); + + delete $previous_vf->{last_in_transcript}->{$tr->stable_id}; + $vf->{last_in_transcript}->{$tr->stable_id} = 1; + } + + $previous_vf = $vf; + } + } + } + + end_progress($config); +} + +sub whole_genome_fetch_reg { + my $config = shift; + my $vf_hash = shift; + my $chr = shift; + + my $rf_cache = $config->{rf_cache}; + my $slice_cache = $config->{slice_cache}; + + foreach my $type(keys %{$rf_cache->{$chr}}) { + debug("Analyzing ".$type."s") unless defined($config->{quiet}); + + my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation'; + + my $rf_counter = 0; + my $rf_count = scalar @{$rf_cache->{$chr}->{$type}}; + + while($rf_counter < $rf_count) { + + progress($config, $rf_counter, $rf_count); + print PARENT "BUMP\n" if defined($config->{forked}); + + my $rf = $rf_cache->{$chr}->{$type}->[$rf_counter++]; + + # do each overlapping VF + my $s = $rf->{start}; + my $e = $rf->{end}; + + # get the chunks this transcript overlaps + my %chunks; + $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size})); + map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks; + + foreach my $chunk(keys %chunks) { + foreach my $vf( + grep {$_->{start} <= $e && $_->{end} >= $s} + map {@{$vf_hash->{$chr}{$chunk}{$_}}} + keys %{$vf_hash->{$chr}{$chunk}} + ) { + push @{$vf->{regulation_variations}->{$type}}, $constructor->new( + -variation_feature => $vf, + -feature => $rf, + -no_ref_check => 1, + -no_transfer => 1 + ); + } + } + } + + end_progress($config); + } +} + +sub whole_genome_fetch_sv { + my $config = shift; + my $svfs = shift; + my $chr = shift; + + my $tr_cache = $config->{tr_cache}; + my $rf_cache = $config->{rf_cache}; + my $slice_cache = $config->{slice_cache}; + + debug("Analyzing structural variations") unless defined($config->{quiet}); + + my($i, $total) = (0, scalar @$svfs); + + my @finished_vfs; + + foreach my $svf(@$svfs) { + progress($config, $i++, $total); + + my %done_genes = (); + + if(defined($tr_cache->{$chr})) { + foreach my $tr(grep {overlap($_->{start} - MAX_DISTANCE_FROM_TRANSCRIPT, $_->{end} + MAX_DISTANCE_FROM_TRANSCRIPT, $svf->{start}, $svf->{end})} @{$tr_cache->{$chr}}) { + my $svo = Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new( + -transcript => $tr, + -structural_variation_feature => $svf, + -no_transfer => 1 + ); + + $svf->add_TranscriptStructuralVariation($svo); + } + } + + $svf->{transcript_structural_variations} ||= {}; + + # do regulatory features + if(defined($config->{regulatory}) && defined($rf_cache->{$chr})) { + foreach my $rf_type(qw/RegulatoryFeature/) {#keys %{$rf_cache->{$chr}}) { + foreach my $rf(grep {$_->{start} <= $svf->{end} && $_->end >= $svf->{end}} @{$rf_cache->{$chr}->{$rf_type}}) { + my $svo = Bio::EnsEMBL::Variation::StructuralVariationOverlap->new( + -feature => $rf, + -structural_variation_feature => $svf, + -no_transfer => 1 + ); + + push @{$svf->{regulation_structural_variations}->{$rf_type}}, $svo; + } + + $svf->{regulation_structural_variations}->{$rf_type} ||= []; + } + } + + # sort them + #$svf->_sort_svos; + push @finished_vfs, $svf; + } + + end_progress($config); + + return \@finished_vfs; +} + +# retrieves transcripts given region list +sub fetch_transcripts { + my $config = shift; + my $regions = shift; + my $trim_regions = shift; + + my $tr_cache = $config->{tr_cache}; + my $slice_cache = $config->{slice_cache}; + + my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); + + my %seen_trs; + + $count_from_mem = 0; + my $region_count = 0; + foreach my $chr(keys %{$regions}) { + $count_from_mem += scalar @{$tr_cache->{$chr}} if defined($tr_cache->{$chr}) && ref($tr_cache->{$chr}) eq 'ARRAY'; + $region_count += scalar @{$regions->{$chr}}; + } + + my $counter; + + debug("Reading transcript data from cache and/or database") unless defined($config->{quiet}); + + foreach my $chr(keys %{$regions}) { + foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[1]} @{$regions->{$chr}}) { + progress($config, $counter++, $region_count); + + # skip regions beyond the end of the chr + next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length; + + next if defined($config->{loaded_tr}->{$chr}->{$region}); + + # force quiet so other methods don't mess up the progress bar + my $quiet = $config->{quiet}; + $config->{quiet} = 1; + + # try and load cache from disk if using cache + my $tmp_cache; + if(defined($config->{cache})) { + #$tmp_cache = ( + # defined($config->{tabix}) ? + # load_dumped_transcript_cache_tabix($config, $chr, $region, $trim_regions) : + # load_dumped_transcript_cache($config, $chr, $region) + #); + $tmp_cache = load_dumped_transcript_cache($config, $chr, $region); + $count_from_cache += scalar @{$tmp_cache->{$chr}} if defined($tmp_cache->{$chr}); + $config->{loaded_tr}->{$chr}->{$region} = 1; + } + + # no cache found on disk or not using cache + if(!defined($tmp_cache->{$chr})) { + + if(defined($config->{offline})) { + # restore quiet status + $config->{quiet} = $quiet; + + debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet}); + next; + } + + # spoof temporary region hash + my $tmp_hash; + push @{$tmp_hash->{$chr}}, $region; + + $tmp_cache = cache_transcripts($config, $tmp_hash); + + # make it an empty arrayref that gets cached + # so we don't get confused and reload next time round + $tmp_cache->{$chr} ||= []; + + $count_from_db += scalar @{$tmp_cache->{$chr}}; + + # dump to disk if writing to cache + (defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $region) : dump_transcript_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache}); + + $config->{loaded_tr}->{$chr}->{$region} = 1; + } + + # add loaded transcripts to main cache + if(defined($tmp_cache->{$chr})) { + while(my $tr = shift @{$tmp_cache->{$chr}}) { + + # track already added transcripts by dbID + my $dbID = $tr->dbID; + if($seen_trs{$dbID}) { + $count_duplicates++; + next; + } + + # trim out? + #if(defined($trim_regions) && defined($trim_regions->{$chr})) { + # my $tmp_count = scalar grep { + # overlap( + # (split /\-/, $_)[0], (split /\-/, $_)[1], + # $tr->{start}, $tr->{end} + # ) + # } @{$trim_regions->{$chr}}; + # + # if(!$tmp_count) { + # $count_trimmed++; + # next; + # } + #} + + $seen_trs{$dbID} = 1; + + push @{$tr_cache->{$chr}}, $tr; + } + } + + $tr_cache->{$chr} ||= []; + + undef $tmp_cache; + + # restore quiet status + $config->{quiet} = $quiet; + + # build slice cache + $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); + } + } + + end_progress($config); + + my $tr_count = 0; + $tr_count += scalar @{$tr_cache->{$_}} for keys %$tr_cache; + + debug("Retrieved $tr_count transcripts ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); + + return $tr_count; +} + +sub fetch_regfeats { + my $config = shift; + my $regions = shift; + my $trim_regions = shift; + + my $rf_cache = $config->{rf_cache}; + my $slice_cache = $config->{slice_cache}; + + my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); + + my %seen_rfs; + + $count_from_mem = 0; + my $region_count = 0; + + foreach my $chr(keys %$regions) { + if(defined($rf_cache->{$chr}) && ref($rf_cache->{$chr}) eq 'HASH') { + $count_from_mem += scalar @{$rf_cache->{$chr}->{$_}} for keys %{$rf_cache->{$chr}}; + } + $region_count += scalar @{$regions->{$chr}}; + } + + my $counter = 0; + + debug("Reading regulatory data from cache and/or database") unless defined($config->{quiet}); + + foreach my $chr(keys %$regions) { + foreach my $region(sort {(split /\-/, $a)[0] cmp (split /\-/, $b)[1]} @{$regions->{$chr}}) { + progress($config, $counter++, $region_count); + + next if defined($config->{loaded_rf}->{$chr}->{$region}); + + # skip regions beyond the end of the chr + next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length; + + # force quiet so other methods don't mess up the progress bar + my $quiet = $config->{quiet}; + $config->{quiet} = 1; + + # try and load cache from disk if using cache + my $tmp_cache; + if(defined($config->{cache})) { + $tmp_cache = load_dumped_reg_feat_cache($config, $chr, $region); + + #$tmp_cache = + # defined($config->{tabix}) ? + # load_dumped_reg_feat_cache_tabix($config, $chr, $region, $trim_regions) : + # load_dumped_reg_feat_cache($config, $chr, $region); + + + if(defined($tmp_cache->{$chr})) { + $count_from_cache += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}}; + } + + # flag as loaded + $config->{loaded_rf}->{$chr}->{$region} = 1; + } + + # no cache found on disk or not using cache + if(!defined($tmp_cache->{$chr})) { + + if(defined($config->{offline})) { + + # restore quiet status + $config->{quiet} = $quiet; + + debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet}); + next; + } + + # spoof temporary region hash + my $tmp_hash; + push @{$tmp_hash->{$chr}}, $region; + + $tmp_cache = cache_reg_feats($config, $tmp_hash); + + # make it an empty arrayref that gets cached + # so we don't get confused and reload next time round + $tmp_cache->{$chr} ||= {}; + + $count_from_db += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}}; + + # dump to disk if writing to cache + #dump_reg_feat_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache}); + (defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $tmp_cache, $chr, $region) : dump_reg_feat_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache}); + + # restore deleted coord_system adaptor + foreach my $type(keys %{$tmp_cache->{$chr}}) { + $_->{slice}->{coord_system}->{adaptor} = $config->{csa} for @{$tmp_cache->{$chr}->{$type}}; + } + + # flag as loaded + $config->{loaded_rf}->{$chr}->{$region} = 1; + } + + # add loaded reg_feats to main cache + if(defined($tmp_cache->{$chr})) { + foreach my $type(keys %{$tmp_cache->{$chr}}) { + while(my $rf = shift @{$tmp_cache->{$chr}->{$type}}) { + + # filter on cell type + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + next unless grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}}; + } + + # trim out? + #if(defined($trim_regions) && defined($trim_regions->{$chr})) { + # my $tmp_count = scalar grep { + # overlap( + # (split /\-/, $_)[0], (split /\-/, $_)[1], + # $rf->{start}, $rf->{end} + # ) + # } @{$trim_regions->{$chr}}; + # + # if(!$tmp_count) { + # $count_trimmed++; + # next; + # } + #} + + # track already added reg_feats by dbID + my $dbID = $rf->{dbID}; + if($seen_rfs{$dbID}) { + $count_duplicates++; + next; + } + $seen_rfs{$dbID} = 1; + + push @{$rf_cache->{$chr}->{$type}}, $rf; + } + } + } + + undef $tmp_cache; + + # restore quiet status + $config->{quiet} = $quiet; + } + } + + end_progress($config); + + my $rf_count = 0; + + foreach my $chr(keys %$rf_cache) { + foreach my $type(keys %{$rf_cache->{$chr}}) { + $rf_count += scalar @{$rf_cache->{$chr}->{$type}}; + } + } + + debug("Retrieved $rf_count regulatory features ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); + + return $rf_count; +} + +# gets existing VFs for a vf_hash +sub check_existing_hash { + my $config = shift; + my $vf_hash = shift; + my $variation_cache; + + # we only care about non-SVs here + my %new_hash; + + foreach my $chr(keys %{$vf_hash}) { + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { + foreach my $var(grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @{$vf_hash->{$chr}->{$chunk}->{$pos}}) { + push @{$new_hash{$chr}->{$chunk}->{$pos}}, $var; + } + } + } + } + + $vf_hash = \%new_hash; + + debug("Checking for existing variations") unless defined($config->{quiet}); + + my ($chunk_count, $counter); + $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash}; + + foreach my $chr(keys %{$vf_hash}) { + + my %loaded_regions; + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + progress($config, $counter++, $chunk_count); + + # get the VFs for this chunk + my ($start, $end); + + # work out start and end using chunk_size + $start = $config->{chunk_size} * $chunk; + $end = $config->{chunk_size} * ($chunk + 1); + + # using cache? + if(defined($config->{cache})) { + my $tmp_regions; + push @{$tmp_regions->{$chr}}, $start.'-'.$end; + + my $converted_regions = convert_regions($config, $tmp_regions); + + foreach my $region(@{$converted_regions->{$chr}}) { + + unless($loaded_regions{$region}) { + my $tmp_cache = load_dumped_variation_cache($config, $chr, $region); + + # load from DB if not found in cache + if(!defined($tmp_cache->{$chr})) { + if(defined($config->{offline})) { + debug("WARNING: Could not find variation cache for $chr\:$region") unless defined($config->{quiet}); + next; + } + + $tmp_cache->{$chr} = get_variations_in_region($config, $chr, $region); + dump_variation_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache}); + } + + # merge tmp_cache with the main cache + foreach my $key(keys %{$tmp_cache->{$chr}}) { + $variation_cache->{$chr}->{$key} = $tmp_cache->{$chr}->{$key}; + delete $tmp_cache->{$chr}->{$key}; + } + + # clear memory + undef $tmp_cache; + + # record this region as fetched + $loaded_regions{$region} = 1; + } + } + } + + # no cache, get all variations in region from DB + else { + + my ($min, $max); + + # we can fetch smaller region when using DB + foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { + foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { + foreach my $coord(qw(start end)) { + $min = $var->{$coord} if !defined($min) || $var->{$coord} < $min; + $max = $var->{$coord} if !defined($max) || $var->{$coord} > $max; + } + } + } + + $variation_cache->{$chr} = get_variations_in_region($config, $chr, $min.'-'.$max); + } + + # now compare retrieved vars with vf_hash + foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { + foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { + my @found; + my @gmafs; + + if(defined($variation_cache->{$chr})) { + if(my $existing_vars = $variation_cache->{$chr}->{$pos}) { + foreach my $existing_var(grep {$_->[1] <= $config->{failed}} @$existing_vars) { + unless(is_var_novel($config, $existing_var, $var)) { + push @found, $existing_var->[0]; + push @gmafs, $existing_var->[6].":".$existing_var->[7] if defined($existing_var->[6]) && defined($existing_var->[7]); + } + } + } + } + + $var->{existing} = join ",", @found; + $var->{existing} ||= '-'; + + $var->{gmaf} = join ",", @gmafs; + $var->{gmaf} ||= undef; + } + } + } + + delete $variation_cache->{$chr}; + } + + end_progress($config); +} + +# gets overlapping SVs for a vf_hash +sub check_svs_hash { + my $config = shift; + my $vf_hash = shift; + + debug("Checking for overlapping structural variations") unless defined($config->{quiet}); + + my ($chunk_count, $counter); + $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash}; + + foreach my $chr(keys %{$vf_hash}) { + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + + progress($config, $counter++, $chunk_count); + + # work out start and end using chunk_size + my ($start, $end); + $start = $config->{chunk_size} * $chunk; + $end = $config->{chunk_size} * ($chunk + 1); + + # check for structural variations + if(defined($config->{sa})) { + my $slice = $config->{sa}->fetch_by_region('chromosome', $chr, $start, $end); + + if(defined($slice)) { + my $svs = $config->{svfa}->fetch_all_by_Slice($slice); + + foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { + foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { + my $string = join ",", + map {$_->variation_name} + grep {$_->seq_region_start <= $var->end && $_->seq_region_end >= $var->start} + @$svs; + + $var->{overlapping_svs} = $string if $string; + } + } + } + } + } + } + + end_progress($config); +} + +# gets a slice from the slice adaptor +sub get_slice { + my $config = shift; + my $chr = shift; + my $otherfeatures = shift; + $otherfeatures ||= ''; + + return undef unless defined($config->{sa}) && defined($chr); + + my $slice; + + # first try to get a chromosome + eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region('chromosome', $chr); }; + + # if failed, try to get any seq region + if(!defined($slice)) { + $slice = $config->{$otherfeatures.'sa'}->fetch_by_region(undef, $chr); + } + + return $slice; +} + + + + +# METHODS THAT DEAL WITH "REGIONS" +################################## + +# gets regions from VF hash +sub regions_from_hash { + my $config = shift; + my $vf_hash = shift; + + my %include_regions; + + # if using cache we just want the regions of cache_region_size + # since that's what we'll get from the cache (or DB if no cache found) + if(defined($config->{cache})) { + + my $region_size = $config->{cache_region_size}; + + foreach my $chr(keys %$vf_hash) { + $include_regions{$chr} = []; + my %temp_regions; + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + my ($s, $e) = ($pos - MAX_DISTANCE_FROM_TRANSCRIPT, $pos + MAX_DISTANCE_FROM_TRANSCRIPT); + + my $low = int ($s / $region_size); + my $high = int ($e / $region_size) + 1; + + for my $i($low..($high - 1)) { + $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1; + } + } + } + + @{$include_regions{$chr}} = keys %temp_regions; + } + } + + # if no cache we don't want to fetch more than is necessary, so find the + # minimum covered region of the variations in the hash + else { + foreach my $chr(keys %$vf_hash) { + $include_regions{$chr} = []; + + foreach my $chunk(keys %{$vf_hash->{$chr}}) { + foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { + add_region($_->start, $_->end, $include_regions{$chr}) for @{$vf_hash->{$chr}{$chunk}{$pos}}; + } + } + } + + # merge regions + merge_regions(\%include_regions, $config); + } + + return \%include_regions; +} + +# adds a region to region list, expanding existing one if overlaps +sub add_region { + my $start = shift; + my $end = shift; + my $region_list = shift; + + # fix end for insertions + $end = $start if $end < $start; + + my $added = 0; + my $i = 0; + + while ($i < scalar @$region_list) { + my ($region_start, $region_end) = split /\-/, $region_list->[$i]; + + if($start <= $region_end && $end >= $region_start) { + my $new_region_start = ($start < $end ? $start : $end) - MAX_DISTANCE_FROM_TRANSCRIPT; + my $new_region_end = ($start > $end ? $start : $end) + MAX_DISTANCE_FROM_TRANSCRIPT; + + $new_region_start = 1 if $new_region_start < 1; + + $region_start = $new_region_start if $new_region_start < $region_start; + $region_end = $new_region_end if $new_region_end > $region_end; + + $region_list->[$i] = $region_start.'-'.$region_end; + $added = 1; + } + + $i++; + } + + unless($added) { + my $s = $start - MAX_DISTANCE_FROM_TRANSCRIPT; + $s = 1 if $s < 1; + + push @{$region_list}, $s.'-'.($end + MAX_DISTANCE_FROM_TRANSCRIPT); + } +} + +# merges overlapping regions from scans +sub merge_regions { + my $include_regions = shift; + my $config = shift; + my $consecutive = shift; + $consecutive ||= 0; + + # now merge overlapping regions + foreach my $chr(keys %$include_regions) { + my $max_index = $#{$include_regions->{$chr}}; + my (@new_regions, %skip); + + for my $i(0..$max_index) { + next if $skip{$i}; + my ($s, $e) = split /\-/, $include_regions->{$chr}[$i]; + + for my $j(($i+1)..$max_index) { + next if $skip{$j}; + my ($ns, $ne) = split /\-/, $include_regions->{$chr}[$j]; + + if($s <= ($ne + $consecutive) && $e >= ($ns - $consecutive)) { + $s = $ns if $ns < $s; + $e = $ne if $ne > $e; + + $skip{$j} = 1; + } + } + + push @new_regions, $s.'-'.$e; + } + + # replace original + $include_regions->{$chr} = \@new_regions; + + $config->{region_count} += scalar @new_regions; + } + + return $include_regions; +} + +# converts regions as determined by scan_file to regions loadable from cache +sub convert_regions { + my $config = shift; + my $regions = shift; + + return undef unless defined $regions; + + my $region_size = $config->{cache_region_size}; + + my %new_regions; + + foreach my $chr(keys %$regions) { + my %temp_regions; + + foreach my $region(@{$regions->{$chr}}) { + my ($s, $e) = split /\-/, $region; + + my $low = int ($s / $region_size); + my $high = int ($e / $region_size) + 1; + + for my $i($low..($high - 1)) { + $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1; + } + } + + @{$new_regions{$chr}} = keys %temp_regions; + } + + return \%new_regions; +} + + + + + +# CACHE METHODS +############### + +# prunes a cache to get rid of features not in regions in use +sub prune_cache { + my $config = shift; + my $cache = shift; + my $regions = shift; + my $loaded = shift; + + # delete no longer in use chroms + foreach my $chr(keys %$cache) { + delete $cache->{$chr} unless defined $regions->{$chr} && scalar @{$regions->{$chr}}; + } + + my $new_count = 0; + + foreach my $chr(keys %$cache) { + + # get total area spanned by regions + my ($min, $max); + foreach my $region(@{$regions->{$chr}}) { + my ($s, $e) = split /\-/, $region; + $min = $s if !defined($min) or $s < $min; + $max = $e if !defined($max) or $e > $max; + } + + # transcript cache + if(ref($cache->{$chr}) eq 'ARRAY') { + $cache->{$chr} = prune_min_max($cache->{$chr}, $min, $max); + $new_count += scalar @{$cache->{$chr}}; + } + # regfeat cache + elsif(ref($cache->{$chr}) eq 'HASH') { + for(keys %{$cache->{$chr}}) { + $cache->{$chr}->{$_} = prune_min_max($cache->{$chr}->{$_}, $min, $max); + $new_count += scalar @{$cache->{$chr}->{$_}}; + } + } + + # update loaded regions + my %have_regions = map {$_ => 1} @{$regions->{$chr}}; + + foreach my $region(keys %{$loaded->{$chr}}) { + delete $loaded->{$chr}->{$region} unless defined $have_regions{$region}; + } + } + + return $new_count; +} + +# does the actual pruning +sub prune_min_max { + my $array = shift; + my $min = shift; + my $max = shift; + + # splice out features not in area spanned by min/max + my $i = 0; + my $f_count = scalar @{$array}; + my @new_cache; + + while($i < $f_count) { + my $f = $array->[$i]; + + $i++; + + if($max - $f->start() > 0 && $f->end - $min > 0) { + push @new_cache, $f; + } + + # do some cleaning for transcripts + elsif(defined $f->{translation}) { + delete $f->{translation}->{transcript}; + delete $f->{translation}; + } + } + + undef $array; + return \@new_cache; +} + +# get transcripts for slices +sub cache_transcripts { + my $config = shift; + my $include_regions = shift; + + my $tr_cache; + my $i; + + debug("Caching transcripts") unless defined($config->{quiet}); + + foreach my $chr(keys %$include_regions) { + + my $slice = get_slice($config, $chr); + + next unless defined $slice; + + # prefetch some things + $slice->is_circular; + + # trim bumf off the slice + delete $slice->{coord_system}->{adaptor} if defined($config->{write_cache}); + + # no regions? + if(!scalar @{$include_regions->{$chr}}) { + my $start = 1; + my $end = $config->{cache_region_size}; + + while($start < $slice->end) { + push @{$include_regions->{$chr}}, $start.'-'.$end; + $start += $config->{cache_region_size}; + $end += $config->{cache_region_size}; + } + } + + my $region_count; + + if(scalar keys %$include_regions == 1) { + my ($chr) = keys %$include_regions; + $region_count = scalar @{$include_regions->{$chr}}; + debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet}); + } + + foreach my $region(@{$include_regions->{$chr}}) { + progress($config, $i++, $region_count || $config->{region_count}); + + my ($s, $e) = split /\-/, $region; + + # sanity check start and end + $s = 1 if $s < 1; + $e = $slice->end if $e > $slice->end; + + # get sub-slice + my $sub_slice = $slice->sub_Slice($s, $e); + + # for some reason unless seq is called here the sequence becomes Ns later + $sub_slice->seq; + + # add transcripts to the cache, via a transfer to the chrom's slice + if(defined($sub_slice)) { + foreach my $gene(map {$_->transfer($slice)} @{$sub_slice->get_all_Genes(undef, undef, 1)}) { + my $gene_stable_id = $gene->stable_id; + my $canonical_tr_id = $gene->{canonical_transcript_id}; + + my @trs; + + foreach my $tr(@{$gene->get_all_Transcripts}) { + $tr->{_gene_stable_id} = $gene_stable_id; + $tr->{_gene} = $gene; + + # indicate if canonical + $tr->{is_canonical} = 1 if defined $canonical_tr_id and $tr->dbID eq $canonical_tr_id; + + if(defined($config->{prefetch})) { + prefetch_transcript_data($config, $tr); + } + + # CCDS + elsif(defined($config->{ccds})) { + my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; + $tr->{_ccds} = $entries[0]->display_id if scalar @entries; + } + + # strip some unnecessary data from the transcript object + clean_transcript($tr) if defined($config->{write_cache}); + + push @trs, $tr; + } + + # sort the transcripts by translation so we can share sift/polyphen stuff + # between transcripts and save cache space + if(defined($config->{write_cache}) && (defined($config->{sift}) || defined($config->{polyphen}))) { + + my $prev_tr; + + # sort them by peptide seqeuence as transcripts with identical peptides + # will have identical SIFT/PolyPhen prediction strings + foreach my $tr(sort {$a->{_variation_effect_feature_cache}->{peptide} cmp $b->{_variation_effect_feature_cache}->{peptide}} grep {$_->{_variation_effect_feature_cache}->{peptide}} @trs) { + + if( + defined($prev_tr) && + $prev_tr->{_variation_effect_feature_cache}->{peptide} + eq $tr->{_variation_effect_feature_cache}->{peptide} + ) { + + foreach my $analysis(qw(sift polyphen)) { + next unless defined($config->{$analysis}); + $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $prev_tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis}; + } + } + + $prev_tr = $tr; + } + } + + # clean the gene + clean_gene($gene); + + push @{$tr_cache->{$chr}}, @trs; + } + } + } + } + + end_progress($config); + + return $tr_cache; +} + +# gets rid of extra bits of info attached to the transcript that we don't need +sub clean_transcript { + my $tr = shift; + + foreach my $key(qw(display_xref external_db external_display_name external_name external_status created_date status description edits_enabled modified_date dbentries is_current analysis transcript_mapper)) { + delete $tr->{$key} if defined($tr->{$key}); + } + + # clean all attributes but miRNA + if(defined($tr->{attributes})) { + my @new_atts; + foreach my $att(@{$tr->{attributes}}) { + push @new_atts, $att if $att->{code} eq 'miRNA'; + } + $tr->{attributes} = \@new_atts; + } + + # clean the translation + if(defined($tr->translation)) { + + # sometimes the translation points to a different transcript? + $tr->{translation}->{transcript} = $tr; + weaken($tr->{translation}->{transcript}); + + for my $key(qw(attributes protein_features created_date modified_date)) { + delete $tr->translation->{$key}; + } + } +} + +# gets rid of extra bits of info attached to genes. At the moment this is almost +# everything as genes are only used for their locations when looking at +# structural variations +sub clean_gene { + my $gene = shift; + + # delete almost everything in the gene + map {delete $gene->{$_}} + grep { + $_ ne 'start' && + $_ ne 'end' && + $_ ne 'strand' && + $_ ne 'stable_id' + } + keys %{$gene}; +} + +# build slice cache from transcript cache +sub build_slice_cache { + my $config = shift; + my $tr_cache = shift; + + my %slice_cache; + + foreach my $chr(keys %$tr_cache) { + $slice_cache{$chr} = scalar @{$tr_cache->{$chr}} ? $tr_cache->{$chr}[0]->slice : &get_slice($config, $chr); + + if(!defined($slice_cache{$chr})) { + delete $slice_cache{$chr} + } + + else { + # reattach adaptor to the coord system + $slice_cache{$chr}->{coord_system}->{adaptor} ||= $config->{csa}; + } + } + + return \%slice_cache; +} + +# pre-fetches per-transcript data +sub prefetch_transcript_data { + my $config = shift; + my $tr = shift; + + # introns + my $introns = $tr->get_all_Introns; + + if(defined($introns)) { + foreach my $intron(@$introns) { + foreach my $key(qw(adaptor analysis dbID next prev seqname)) { + delete $intron->{$key}; + } + } + } + + $tr->{_variation_effect_feature_cache}->{introns} ||= $introns; + + # translateable_seq, mapper + $tr->{_variation_effect_feature_cache}->{translateable_seq} ||= $tr->translateable_seq; + $tr->{_variation_effect_feature_cache}->{mapper} ||= $tr->get_TranscriptMapper; + + # peptide + unless ($tr->{_variation_effect_feature_cache}->{peptide}) { + my $translation = $tr->translate; + $tr->{_variation_effect_feature_cache}->{peptide} = $translation ? $translation->seq : undef; + } + + # protein features + if(defined($config->{domains}) || defined($config->{write_cache})) { + my $pfs = $tr->translation ? $tr->translation->get_all_ProteinFeatures : []; + + # clean them to save cache space + foreach my $pf(@$pfs) { + + # remove everything but the coord, analysis and ID fields + foreach my $key(keys %$pf) { + delete $pf->{$key} unless + $key eq 'start' || + $key eq 'end' || + $key eq 'analysis' || + $key eq 'hseqname'; + } + + # remove everything from the analysis but the display label + foreach my $key(keys %{$pf->{analysis}}) { + delete $pf->{analysis}->{$key} unless $key eq '_display_label'; + } + } + + $tr->{_variation_effect_feature_cache}->{protein_features} = $pfs; + } + + # codon table + unless ($tr->{_variation_effect_feature_cache}->{codon_table}) { + # for mithocondrial dna we need to to use a different codon table + my $attrib = $tr->slice->get_all_Attributes('codon_table')->[0]; + + $tr->{_variation_effect_feature_cache}->{codon_table} = $attrib ? $attrib->value : 1; + } + + # sift/polyphen + if(defined($config->{pfpma}) && defined($tr->{_variation_effect_feature_cache}->{peptide})) { + foreach my $analysis(qw(sift polyphen)) { + next unless defined($config->{$analysis}); + my $a = $analysis; + $a .= '_humvar' if $a eq 'polyphen'; + $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$a} ||= $config->{pfpma}->fetch_by_analysis_translation_md5($a, md5_hex($tr->{_variation_effect_feature_cache}->{peptide})) + } + } + + # gene + $tr->{_gene} ||= $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); + + # gene HGNC + if(defined $config->{hgnc}) { + + # get from gene cache if found already + if(defined($tr->{_gene}->{_hgnc})) { + $tr->{_gene_hgnc} = $tr->{_gene}->{_hgnc}; + } + else { + my @entries = grep {$_->database eq 'HGNC'} @{$tr->{_gene}->get_all_DBEntries()}; + if(scalar @entries) { + $tr->{_gene_hgnc} = $entries[0]->display_id; + } + + $tr->{_gene_hgnc} ||= '-'; + + # cache it on the gene object too + $tr->{_gene}->{_hgnc} = $tr->{_gene_hgnc}; + } + } + + # CCDS + my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; + $tr->{_ccds} = $entries[0]->display_id if scalar @entries; + $tr->{_ccds} ||= '-'; + + # refseq + @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries}; + if(scalar @entries) { + $tr->{_refseq} = join ",", map {$_->display_id} @entries; + } + else { + $tr->{_refseq} = '-'; + } + + # protein stable ID + $tr->{_protein} = $tr->translation ? $tr->translation->stable_id : '-'; + + return $tr; +} + +sub get_dump_file_name { + my $config = shift; + my $chr = shift; + my $region = shift; + my $type = shift; + + $type ||= 'transcript'; + + if($type eq 'transcript') { + $type = ''; + } + else { + $type = '_'.$type; + } + + #my ($s, $e) = split /\-/, $region; + #my $subdir = int($s / 1e6); + # + #my $dir = $config->{dir}.'/'.$chr.'/'.$subdir; + + my $dir = $config->{dir}.'/'.$chr; + my $dump_file = $dir.'/'.$region.$type.(defined($config->{tabix}) ? '_tabix' : '').'.gz'; + + # make directory if it doesn't exist + if(!(-e $dir) && defined($config->{write_cache})) { + make_path($dir); + } + + return $dump_file; +} + +# dumps out transcript cache to file +sub dump_transcript_cache { + my $config = shift; + my $tr_cache = shift; + my $chr = shift; + my $region = shift; + + debug("Dumping cached transcript data") unless defined($config->{quiet}); + + # clean the slice adaptor before storing + clean_slice_adaptor($config); + + strip_transcript_cache($config, $tr_cache); + + $config->{reg}->disconnect_all; + delete $config->{sa}->{dbc}->{_sql_helper}; + + my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript'); + + debug("Writing to $dump_file") unless defined($config->{quiet}); + + # storable + open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; + nstore_fd($tr_cache, $fh); + close $fh; +} + +#sub dump_transcript_cache_tabix { +# my $config = shift; +# my $tr_cache = shift; +# my $chr = shift; +# my $region = shift; +# +# debug("Dumping cached transcript data") unless defined($config->{quiet}); +# +# # clean the slice adaptor before storing +# clean_slice_adaptor($config); +# +# strip_transcript_cache($config, $tr_cache); +# +# $config->{reg}->disconnect_all; +# +# my $dir = $config->{dir}.'/'.$chr; +# my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz'; +# +# # make directory if it doesn't exist +# if(!(-e $dir)) { +# make_path($dir); +# } +# +# debug("Writing to $dump_file") unless defined($config->{quiet}); +# +# use Storable qw(nfreeze); +# use MIME::Base64 qw(encode_base64); +# #open NEW, "| bgzip -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; +# # +# #foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) { +# # print NEW join "\t", ( +# # $chr, +# # $tr->start, +# # $tr->end, +# # encode_base64(freeze($tr), "") +# # ); +# # print NEW "\n"; +# #} +# #close NEW; +# # +# ## tabix it +# #my $output = `tabix -s 1 -b 2 -e 3 -f $dump_file 2>&1`; +# #die("ERROR: Failed during tabix indexing\n$output\n") if $output; +# open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; +# +# foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) { +# print NEW join "\t", ( +# $chr, +# $tr->start, +# $tr->end, +# encode_base64(freeze($tr), "") +# ); +# print NEW "\n"; +# } +# close NEW; +#} + +# loads in dumped transcript cache to memory +sub load_dumped_transcript_cache { + my $config = shift; + my $chr = shift; + my $region = shift; + + my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript'); + + return undef unless -e $dump_file; + + debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); + + open my $fh, $config->{compress}." ".$dump_file." |" or return undef; + my $tr_cache; + $tr_cache = fd_retrieve($fh); + close $fh; + + # reattach adaptors + foreach my $t(@{$tr_cache->{$chr}}) { + if(defined($t->{translation})) { + $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor}; + $t->{translation}->{transcript} = $t; + weaken($t->{translation}->{transcript}); + } + + $t->{slice}->{adaptor} = $config->{sa}; + } + + return $tr_cache; +} + +#sub load_dumped_transcript_cache_tabix { +# my $config = shift; +# my $chr = shift; +# my $region = shift; +# my $trim_regions = shift; +# +# my $dir = $config->{dir}.'/'.$chr; +# my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz'; +# +# #print STDERR "Reading from $dump_file\n"; +# +# return undef unless -e $dump_file; +# +# debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); +# +# my $tr_cache; +# +# use MIME::Base64 qw(decode_base64); +# use Storable qw(thaw); +# +# my ($s, $e) = split /\-/, $region; +# my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}}; +# my $regions = ""; +# $regions .= " $chr\:$_" for @regions; +# +# #print STDERR "tabix $dump_file $regions |\n"; +# #open IN, "tabix $dump_file $regions |"; +# open IN, "gzip -dc $dump_file |"; +# while() { +# +# #$DB::single = 1; +# my ($chr, $start, $end, $blob) = split /\t/, $_; +# next unless grep {overlap($start, $end, (split /\-/, $_))} @regions; +# my $tr = thaw(decode_base64($blob)); +# push @{$tr_cache->{$chr}}, $tr; +# } +# close IN; +# +# # reattach adaptors +# foreach my $t(@{$tr_cache->{$chr}}) { +# if(defined($t->{translation})) { +# $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor}; +# $t->{translation}->{transcript} = $t; +# weaken($t->{translation}->{transcript}); +# } +# +# $t->{slice}->{adaptor} = $config->{sa}; +# } +# +# # add empty array ref so code doesn't try and fetch from DB too +# $tr_cache->{$chr} ||= []; +# +# return $tr_cache; +#} + +# strips cache before writing to disk +sub strip_transcript_cache { + my $config = shift; + my $cache = shift; + + foreach my $chr(keys %$cache) { + foreach my $tr(@{$cache->{$chr}}) { + foreach my $exon(@{$tr->{_trans_exon_array}}) { + delete $exon->{slice}->{adaptor}; + + for(qw(adaptor created_date modified_date is_current version is_constitutive _seq_cache dbID slice)) { + delete $exon->{$_}; + } + } + + delete $tr->{adaptor}; + delete $tr->{slice}->{adaptor}; + } + } +} + +# cleans slice adaptor before storing in cache +sub clean_slice_adaptor{ + my $config = shift; + + # clean some stuff off the slice adaptor + delete $config->{sa}->{asm_exc_cache}; + $config->{sa}->{sr_name_cache} = {}; + $config->{sa}->{sr_id_cache} = {}; + delete $config->{sa}->{db}->{seq_region_cache}; + delete $config->{sa}->{db}->{name_cache}; +} + + +# dump adaptors to cache +sub dump_adaptor_cache { + my $config = shift; + + $config->{reg}->disconnect_all; + delete $config->{sa}->{dbc}->{_sql_helper}; + + my $dir = $config->{dir}; + my $dump_file = $dir.'/adaptors.gz'; + + # make directory if it doesn't exist + if(!(-e $dir)) { + make_path($dir); + } + + open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; + nstore_fd($config, $fh); + close $fh; +} + +# load dumped adaptors +sub load_dumped_adaptor_cache { + my $config = shift; + + my $dir = $config->{dir}; + my $dump_file = $dir.'/adaptors.gz'; + + return undef unless -e $dump_file; + + debug("Reading cached adaptor data") unless defined($config->{quiet}); + + open my $fh, $config->{compress}." ".$dump_file." |" or return undef; + my $cached_config; + $cached_config = fd_retrieve($fh); + close $fh; + + $config->{$_} = $cached_config->{$_} for qw(sa ga ta vfa svfa tva pfpma mca csa RegulatoryFeature_adaptor MotifFeature_adaptor); + + return 1; +} + +# dumps cached variations to disk +sub dump_variation_cache { + my $config = shift; + my $v_cache = shift; + my $chr = shift; + my $region = shift; + + my $dump_file = get_dump_file_name($config, $chr, $region, 'var'); + + open DUMP, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to adaptor dump file $dump_file"; + + foreach my $pos(keys %{$v_cache->{$chr}}) { + foreach my $v(@{$v_cache->{$chr}->{$pos}}) { + my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = @$v; + + print DUMP join " ", ( + $name, + $failed == 0 ? '' : $failed, + $start, + $end == $start ? '' : $end, + $as, + $strand == 1 ? '' : $strand, + $ma || '', + defined($maf) ? sprintf("%.2f", $maf) : '', + ); + print DUMP "\n"; + } + } + + close DUMP; +} + +# loads dumped variation cache +sub load_dumped_variation_cache { + my $config = shift; + my $chr = shift; + my $region = shift; + + my $dump_file = get_dump_file_name($config, $chr, $region, 'var'); + + return undef unless -e $dump_file; + + open DUMP, $config->{compress}." ".$dump_file." |" or return undef; + + my $v_cache; + + while() { + chomp; + my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = split / /, $_; + $failed ||= 0; + $end ||= $start; + $strand ||= 1; + $ma ||= undef; + $maf ||= undef; + + my @v = ($name, $failed, $start, $end, $as, $strand, $ma, $maf); + push @{$v_cache->{$chr}->{$start}}, \@v; + } + + close DUMP; + + return $v_cache; +} + +# caches regulatory features +sub cache_reg_feats { + my $config = shift; + my $include_regions = shift; + + my $rf_cache; + my $i; + + debug("Caching regulatory features") unless defined($config->{quiet}); + + foreach my $chr(keys %$include_regions) { + + my $slice = get_slice($config, $chr); + + next unless defined $slice; + + # prefetch some things + $slice->is_circular; + + # no regions? + if(!scalar @{$include_regions->{$chr}}) { + my $start = 1; + my $end = $config->{cache_region_size}; + + while($start < $slice->end) { + push @{$include_regions->{$chr}}, $start.'-'.$end; + $start += $config->{cache_region_size}; + $end += $config->{cache_region_size}; + } + } + + my $region_count; + + if(scalar keys %$include_regions == 1) { + my ($chr) = keys %$include_regions; + $region_count = scalar @{$include_regions->{$chr}}; + debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet}); + } + + foreach my $region(@{$include_regions->{$chr}}) { + progress($config, $i++, $region_count || $config->{region_count}); + + my ($s, $e) = split /\-/, $region; + + # sanity check start and end + $s = 1 if $s < 1; + $e = $slice->end if $e > $slice->end; + + # get sub-slice + my $sub_slice = $slice->sub_Slice($s, $e); + $sub_slice->{coord_system}->{adaptor} = $config->{csa}; + + next unless defined($sub_slice); + + foreach my $type(@REG_FEAT_TYPES) { + my $features = $config->{$type.'_adaptor'}->fetch_all_by_Slice($sub_slice); + next unless defined($features); + + # cell types + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + foreach my $rf(@$features) { + + my %cl; + + # get cell type by fetching all from stable ID + if($type eq 'RegulatoryFeature') { + %cl = map { + $_->feature_set->cell_type->name => $_->feature_type->name + } @{$rf->adaptor->fetch_all_by_stable_ID($rf->stable_id)}; + } + + # get cell type by fetching regfeats that contain this MotifFeature + elsif($type eq 'MotifFeature') { + %cl = map { + $_->feature_set->cell_type->name => $_->feature_type->name + } @{$config->{'RegulatoryFeature_adaptor'}->fetch_all_by_attribute_feature($rf)}; + } + + $rf->{cell_types} = \%cl; + } + } + + push @{$rf_cache->{$chr}->{$type}}, + map { clean_reg_feat($_) } + map { $_->transfer($slice) } + @{$features}; + } + } + } + + end_progress($config); + + return $rf_cache; +} + + +# cleans reg feats for caching +sub clean_reg_feat { + my $rf = shift; + + foreach my $key(qw/adaptor binary_string bound_start bound_end attribute_cache feature_type feature_set analysis/) { + delete $rf->{$key}; + } + + if(defined($rf->{binding_matrix})) { + $rf->{_variation_effect_feature_cache}->{seq} = $rf->seq; + + foreach my $key(qw/adaptor feature_type analysis dbID/) { + delete $rf->{binding_matrix}->{$key}; + } + } + + return $rf; +} + + +# dumps out reg feat cache to file +sub dump_reg_feat_cache { + my $config = shift; + my $rf_cache = shift; + my $chr = shift; + my $region = shift; + + debug("Dumping cached reg feat data for $chr:$region") unless defined($config->{quiet}); + + # clean the slice adaptor before storing + clean_slice_adaptor($config); + + $config->{reg}->disconnect_all; + delete $config->{sa}->{dbc}->{_sql_helper}; + + foreach my $chr(keys %{$rf_cache}) { + foreach my $type(keys %{$rf_cache->{$chr}}) { + delete $_->{slice}->{coord_system}->{adaptor} for @{$rf_cache->{$chr}->{$type}}; + } + } + + my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); + + debug("Writing to $dump_file") unless defined($config->{quiet}); + + # storable + open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; + nstore_fd($rf_cache, $fh); + close $fh; +} + +#sub dump_reg_feat_cache_tabix { +# my $config = shift; +# my $rf_cache = shift; +# my $chr = shift; +# my $region = shift; +# +# debug("Dumping cached reg feat data") unless defined($config->{quiet}); +# +# # clean the slice adaptor before storing +# clean_slice_adaptor($config); +# +# $config->{reg}->disconnect_all; +# delete $config->{sa}->{dbc}->{_sql_helper}; +# +# $config->{reg}->disconnect_all; +# +# my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); +# +# debug("Writing to $dump_file") unless defined($config->{quiet}); +# +# use Storable qw(nfreeze); +# use MIME::Base64 qw(encode_base64); +# open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; +# +# foreach my $type(keys %{$rf_cache->{$chr}}) { +# foreach my $rf(sort {$a->start <=> $b->start} @{$rf_cache->{$chr}->{$type}}) { +# print NEW join "\t", ( +# $chr, +# $rf->start, +# $rf->end, +# $type, +# encode_base64(freeze($rf), "") +# ); +# print NEW "\n"; +# } +# } +# close NEW; +#} + +# loads in dumped transcript cache to memory +sub load_dumped_reg_feat_cache { + my $config = shift; + my $chr = shift; + my $region = shift; + + my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); + + return undef unless -e $dump_file; + + debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); + + open my $fh, $config->{compress}." ".$dump_file." |" or return undef; + my $rf_cache; + $rf_cache = fd_retrieve($fh); + close $fh; + + return $rf_cache; +} + + + +#sub load_dumped_reg_feat_cache_tabix { +# my $config = shift; +# my $chr = shift; +# my $region = shift; +# my $trim_regions = shift; +# +# my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); +# +# #print STDERR "Reading from $dump_file\n"; +# +# return undef unless -e $dump_file; +# +# debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); +# +# my $rf_cache; +# +# use MIME::Base64 qw(decode_base64); +# use Storable qw(thaw); +# +# my ($s, $e) = split /\-/, $region; +# my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}}; +# my $regions = ""; +# $regions .= " $chr\:$_" for @regions; +# +# #print STDERR "tabix $dump_file $regions |\n"; +# #open IN, "tabix $dump_file $regions |"; +# open IN, "gzip -dc $dump_file |"; +# while() { +# my ($chr, $start, $end, $type, $blob) = split /\t/, $_; +# next unless grep {overlap($start, $end, (split /\-/, $_))} @regions; +# my $rf = thaw(decode_base64($blob)); +# push @{$rf_cache->{$chr}->{$type}}, $rf; +# } +# close IN; +# +# $rf_cache->{$chr}->{$_} ||= [] for @REG_FEAT_TYPES; +# +# return $rf_cache; +#} + + +# get custom annotation for a region +sub cache_custom_annotation { + my $config = shift; + my $include_regions = shift; + my $chr = shift; + + #$include_regions = merge_regions($include_regions, $config, 1); + + my $annotation = {}; + + my $total = scalar @{$config->{custom}} * scalar @{$include_regions->{$chr}}; + my $counter = 0; + + my $max_regions_per_tabix = 1000; + + debug("Caching custom annotations") unless defined($config->{quiet}); + + foreach my $custom(@{$config->{custom}}) { + + my @regions = @{$include_regions->{$chr}}; + + while(scalar @regions) { + my $got_features = 0; + + my @tmp_regions = splice @regions, 0, $max_regions_per_tabix; + + progress($config, $counter, $total); + $counter += scalar @tmp_regions; + + # some files may have e.g. chr10 instead of 10 + for my $tmp_chr($chr, 'chr'.$chr) { + + # bigwig needs to use bigWigToWig utility + if($custom->{format} eq 'bigwig') { + foreach my $region(@tmp_regions) { + my ($s, $e) = split /\-/, $region; + my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$.'_'.$tmp_chr.'_'.$s.'_'.$e; + my $bigwig_file = $custom->{file}; + my $bigwig_output = `bigWigToWig -chrom=$tmp_chr -start=$s -end=$e $bigwig_file $tmp_file 2>&1`; + + die "\nERROR: Problem using bigwig file $bigwig_file\n$bigwig_output" if $bigwig_output; + } + + # concatenate all the files together + my $string = $config->{tmpdir}.'/vep_tmp_'.$$.'_*'; + my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$; + `cat $string > $tmp_file; rm $string`; + open CUSTOM, $tmp_file + or die "\nERROR: Could not read from temporary WIG file $tmp_file\n"; + } + + # otherwise use tabix + else { + # tabix can fetch multiple regions, so construct a string + my $region_string = join " ", map {$tmp_chr.':'.$_} @tmp_regions; + + open CUSTOM, "tabix ".$custom->{file}." $region_string 2>&1 |" + or die "\nERROR: Could not open tabix pipe for ".$custom->{file}."\n"; + } + + # set an error flag so we don't have to check every line + my $error_flag = 1; + + while() { + chomp; + + # check for errors + if($error_flag) { + die "\nERROR: Problem using annotation file ".$custom->{file}."\n$_\n" if /invalid pointer|tabix|get_intv/; + $error_flag = 0; + } + + my @data = split /\t/, $_; + + my $feature; + + if($custom->{format} eq 'bed') { + $feature = { + chr => $chr, + start => $data[1], + end => $data[2], + name => $data[3], + }; + } + + elsif($custom->{format} eq 'vcf') { + my $tmp_vf = parse_vcf($config, $_)->[0]; + + $feature = { + chr => $chr, + start => $tmp_vf->{start}, + end => $tmp_vf->{end}, + name => $tmp_vf->{variation_name}, + }; + } + + elsif($custom->{format} eq 'gff' || $custom->{format} eq 'gtf') { + + my $name; + + # try and get a feature name from the attributes column + foreach my $attrib(split /\s*\;\s*/, $data[8]) { + my ($key, $value) = split /\=/, $attrib; + $name = $value if $key eq 'ID'; + } + + $name ||= $data[2]."_".$data[0].":".$data[3]."-".$data[4]; + + $feature = { + chr => $chr, + start => $data[3], + end => $data[4], + name => $name, + }; + } + + elsif($custom->{format} eq 'bigwig') { + $feature = { + chr => $chr, + start => $data[0], + end => $data[0], + name => $data[1], + }; + } + + if(defined($feature)) { + $got_features = 1; + + if(!defined($feature->{name}) || $custom->{coords}) { + $feature->{name} = $feature->{chr}.":".$feature->{start}."-".$feature->{end}; + } + + # add the feature to the cache + $annotation->{$chr}->{$custom->{name}}->{$feature->{start}}->{$feature->{name}} = $feature; + } + } + close CUSTOM; + + # unlink temporary wig files + unlink($config->{tmpdir}.'/vep_tmp_'.$$) if $custom->{format} eq 'bigwig'; + + # no need to fetch e.g. "chr21" features if just "21" worked + last if $got_features; + } + } + } + + end_progress($config); + + return $annotation; +} + +# builds a full cache for this species +sub build_full_cache { + my $config = shift; + + my @slices; + + if($config->{build} =~ /all/i) { + @slices = @{$config->{sa}->fetch_all('toplevel')}; + push @slices, @{$config->{sa}->fetch_all('lrg', undef, 1, undef, 1)} if defined($config->{lrg}); + } + else { + foreach my $val(split /\,/, $config->{build}) { + my @nnn = split /\-/, $val; + + foreach my $chr($nnn[0]..$nnn[-1]) { + my $slice = get_slice($config, $chr); + push @slices, $slice if defined($slice); + } + } + } + + foreach my $slice(@slices) { + my $chr = $slice->seq_region_name; + + # check for features, we don't want a load of effectively empty dirs + my $dbc = $config->{sa}->db->dbc; + my $sth = $dbc->prepare("SELECT COUNT(*) FROM transcript WHERE seq_region_id = ?"); + $sth->execute($slice->get_seq_region_id); + + my $count; + $sth->bind_columns(\$count); + $sth->fetch; + $sth->finish; + + next unless $count > 0; + + my $regions; + + # for progress + my $region_count = int($slice->end / $config->{cache_region_size}) + 1; + my $counter = 0; + + # initial region + my ($start, $end) = (1, $config->{cache_region_size}); + + debug((defined($config->{rebuild}) ? "Rebuild" : "Creat")."ing cache for chromosome $chr") unless defined($config->{quiet}); + + while($start < $slice->end) { + + progress($config, $counter++, $region_count); + + # store quiet status + my $quiet = $config->{quiet}; + $config->{quiet} = 1; + + # spoof regions + $regions->{$chr} = [$start.'-'.$end]; + + # store transcripts + my $tmp_cache = (defined($config->{rebuild}) ? load_dumped_transcript_cache($config, $chr, $start.'-'.$end) : cache_transcripts($config, $regions)); + $tmp_cache->{$chr} ||= []; + + #(defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $start.'-'.$end) : dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end)); + dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end); + undef $tmp_cache; + + # store reg feats + if(defined($config->{regulatory})) { + my $rf_cache = cache_reg_feats($config, $regions); + $rf_cache->{$chr} ||= {}; + + dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end); + #(defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $rf_cache, $chr, $start.'-'.$end) : dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end)); + undef $rf_cache; + + # this gets cleaned off but needs to be there for the next loop + $slice->{coord_system}->{adaptor} = $config->{csa}; + } + + # store variations + my $variation_cache; + $variation_cache->{$chr} = get_variations_in_region($config, $chr, $start.'-'.$end); + $variation_cache->{$chr} ||= {}; + + dump_variation_cache($config, $variation_cache, $chr, $start.'-'.$end); + undef $variation_cache; + + # restore quiet status + $config->{quiet} = $quiet; + + # increment by cache_region_size to get next region + $start += $config->{cache_region_size}; + $end += $config->{cache_region_size}; + } + + end_progress($config); + + undef $regions; + } + + write_cache_info($config); +} + +# write an info file that defines what is in the cache +sub write_cache_info { + my $config = shift; + + my $info_file = $config->{dir}.'/info.txt'; + + open OUT, ">>$info_file" or die "ERROR: Could not write to cache info file $info_file\n"; + + print OUT "# CACHE UPDATED ".get_time()."\n"; + + foreach my $param(qw( + host + port + user + build + regulatory + sift + polyphen + )) { + print OUT "$param\t".(defined $config->{$param} ? $config->{$param} : '-')."\n"; + } + + # cell types + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); + print OUT "cell_types\t".(join ",", map {$_->name} @{$cta->fetch_all}); + print OUT "\n"; + } + + close OUT; +} + +# reads in cache info file +sub read_cache_info { + my $config = shift; + + my $info_file = $config->{dir}.'/info.txt'; + + open IN, $info_file or return 0; + + while() { + next if /^#/; + chomp; + my ($param, $value) = split /\t/; + $config->{'cache_'.$param} = $value unless defined $value && $value eq '-'; + } + + close IN; + + return 1; +} + +# format coords for printing +sub format_coords { + my ($start, $end) = @_; + + if(defined($start)) { + if(defined($end)) { + if($start > $end) { + return $end.'-'.$start; + } + elsif($start == $end) { + return $start; + } + else { + return $start.'-'.$end; + } + } + else { + return $start.'-?'; + } + } + elsif(defined($end)) { + return '?-'.$end; + } + else { + return '-'; + } +} + + + + +# METHODS TO FIND CO-LOCATED / EXISTING VARIATIONS +################################################## + +# finds an existing VF in the db +sub find_existing { + my $config = shift; + my $new_vf = shift; + + if(defined($config->{vfa}->db)) { + + my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL'; + + my $sth = $config->{vfa}->db->dbc->prepare(qq{ + SELECT variation_name, IF(fv.variation_id IS NULL, 0, 1), seq_region_start, seq_region_end, allele_string, seq_region_strand, $maf_cols + FROM variation_feature vf LEFT JOIN failed_variation fv + ON vf.variation_id = fv.variation_id + WHERE vf.seq_region_id = ? + AND vf.seq_region_start = ? + AND vf.seq_region_end = ? + ORDER BY vf.source_id ASC + }); + + $sth->execute($new_vf->slice->get_seq_region_id, $new_vf->start, $new_vf->end); + + my @v; + for my $i(0..7) { + $v[$i] = undef; + } + + $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]); + + my @found; + + while($sth->fetch) { + push @found, $v[0] unless is_var_novel($config, \@v, $new_vf) || $v[1] > $config->{failed}; + } + + $sth->finish(); + + return (scalar @found ? join ",", @found : undef); + } + + return undef; +} + +# compare a new vf to one from the cache / DB +sub is_var_novel { + my $config = shift; + my $existing_var = shift; + my $new_var = shift; + + my $is_novel = 1; + + $is_novel = 0 if $existing_var->[2] == $new_var->start && $existing_var->[3] == $new_var->end; + + if(defined($config->{check_alleles})) { + my %existing_alleles; + + $existing_alleles{$_} = 1 for split /\//, $existing_var->[4]; + + my $seen_new = 0; + foreach my $a(split /\//, ($new_var->allele_string || "")) { + reverse_comp(\$a) if $new_var->strand ne $existing_var->[5]; + $seen_new = 1 unless defined $existing_alleles{$a}; + } + + $is_novel = 1 if $seen_new; + } + + return $is_novel; +} + +# check frequencies of existing var against requested params +sub check_frequencies { + my $config = shift; + my $var_name = shift; + + my $v = $config->{va}->fetch_by_name($var_name); + + my $pass = 0; + + my $freq_pop = $config->{freq_pop}; + my $freq_freq = $config->{freq_freq}; + my $freq_gt_lt = $config->{freq_gt_lt}; + + my $freq_pop_name = (split /\_/, $freq_pop)[-1]; + $freq_pop_name = undef if $freq_pop_name =~ /1kg|hap/; + + delete $config->{filtered_freqs}; + + foreach my $a(@{$v->get_all_Alleles}) { + next unless defined $a->{population} || defined $a->{'_population_id'}; + next unless defined $a->frequency; + next if $a->frequency > 0.5; + + my $pop_name = $a->population->name; + + if($freq_pop =~ /1kg/) { next unless $pop_name =~ /^1000.+(low|phase).+/i; } + if($freq_pop =~ /hap/) { next unless $pop_name =~ /^CSHL-HAPMAP/i; } + if($freq_pop =~ /any/) { next unless $pop_name =~ /^(CSHL-HAPMAP)|(1000.+(low|phase).+)/i; } + if(defined $freq_pop_name) { next unless $pop_name =~ /$freq_pop_name/i; } + + $pass = 1 if $a->frequency >= $freq_freq and $freq_gt_lt eq 'gt'; + $pass = 1 if $a->frequency <= $freq_freq and $freq_gt_lt eq 'lt'; + + $pop_name =~ s/\:/\_/g; + push @{$config->{filtered_freqs}}, $pop_name.':'.$a->frequency; + + #warn "Comparing allele ", $a->allele, " ", $a->frequency, " for $var_name in population ", $a->population->name, " PASS $pass"; + } + + return 0 if $config->{freq_filter} eq 'exclude' and $pass == 1; + return 0 if $config->{freq_filter} eq 'include' and $pass == 0; + return 1; +} + +# gets all variations in a region +sub get_variations_in_region { + my $config = shift; + my $chr = shift; + my $region = shift; + + my ($start, $end) = split /\-/, $region; + + my %variations; + + if(defined($config->{vfa}->db)) { + my $sr_cache = $config->{seq_region_cache}; + + if(!defined($sr_cache)) { + $sr_cache = cache_seq_region_ids($config); + $config->{seq_region_cache} = $sr_cache; + } + + # no seq_region_id? + return {} unless defined($sr_cache) && defined($sr_cache->{$chr}); + + my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL'; + + my $sth = $config->{vfa}->db->dbc->prepare(qq{ + SELECT vf.variation_name, IF(fv.variation_id IS NULL, 0, 1), vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand, $maf_cols + FROM variation_feature vf + LEFT JOIN failed_variation fv ON fv.variation_id = vf.variation_id + WHERE vf.seq_region_id = ? + AND vf.seq_region_start >= ? + AND vf.seq_region_start <= ? + }); + + $sth->execute($sr_cache->{$chr}, $start, $end); + + my @v; + for my $i(0..7) { + $v[$i] = undef; + } + + $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]); + + while($sth->fetch) { + my @v_copy = @v; + push @{$variations{$v[2]}}, \@v_copy; + } + + $sth->finish(); + } + + return \%variations; +} + +sub cache_seq_region_ids { + my $config = shift; + + my (%cache, $chr, $id); + + my $sth = $config->{vfa}->db->dbc->prepare(qq{ + SELECT seq_region_id, name FROM seq_region + }); + + $sth->execute(); + $sth->bind_columns(\$id, \$chr); + $cache{$chr} = $id while $sth->fetch(); + $sth->finish; + + return \%cache; +} + +sub have_maf_cols { + my $config = shift; + + if(!defined($config->{have_maf_cols})) { + my $sth = $config->{vfa}->db->dbc->prepare(qq{ + DESCRIBE variation_feature + }); + + $sth->execute(); + my @cols = map {$_->[0]} @{$sth->fetchall_arrayref}; + $sth->finish(); + + $config->{have_maf_cols} = 0; + $config->{have_maf_cols} = 1 if grep {$_ eq 'minor_allele'} @cols; + } + + return $config->{have_maf_cols}; +} + +sub merge_hashes { + my ($x, $y) = @_; + + foreach my $k (keys %$y) { + if (!defined($x->{$k})) { + $x->{$k} = $y->{$k}; + } else { + if(ref($x->{$k}) eq 'ARRAY') { + $x->{$k} = merge_arrays($x->{$k}, $y->{$k}); + } + elsif(ref($x->{$k}) eq 'HASH') { + $x->{$k} = merge_hashes($x->{$k}, $y->{$k}); + } + else { + $x->{$k} = $y->{$k}; + } + } + } + return $x; +} + +sub merge_arrays { + my ($x, $y) = @_; + + my %tmp = map {$_ => 1} (@$x, @$y); + + return [keys %tmp]; +} + + + + +# DEBUG AND STATUS METHODS +########################## + +# gets time +sub get_time() { + my @time = localtime(time()); + + # increment the month (Jan = 0) + $time[4]++; + + # add leading zeroes as required + for my $i(0..4) { + $time[$i] = "0".$time[$i] if $time[$i] < 10; + } + + # put the components together in a string + my $time = + ($time[5] + 1900)."-". + $time[4]."-". + $time[3]." ". + $time[2].":". + $time[1].":". + $time[0]; + + return $time; +} + +# prints debug output with time +sub debug { + my $text = (@_ ? (join "", @_) : "No message"); + my $time = get_time; + + print $time." - ".$text.($text =~ /\n$/ ? "" : "\n"); +} + +# finds out memory usage +sub memory { + my @mem; + + open IN, "ps -o rss,vsz $$ |"; + while() { + next if $_ =~ /rss/i; + chomp; + @mem = split; + } + close IN; + + return \@mem; +} + +sub mem_diff { + my $config = shift; + my $mem = memory(); + my @diffs; + + if(defined($config->{memory})) { + for my $i(0..(scalar @{$config->{memory}} - 1)) { + push @diffs, $mem->[$i] - $config->{memory}->[$i]; + } + } + else { + @diffs = @$mem; + } + + $config->{memory} = $mem; + + return \@diffs; +} + +# update or initiate progress bar +sub progress { + my ($config, $i, $total) = @_; + + return if defined($config->{quiet}) || defined($config->{no_progress}); + + my $width = $config->{terminal_width} || 60; + my $percent = int(($i/$total) * 100); + my $numblobs = int((($i/$total) * $width) - 2); + + # this ensures we're not writing to the terminal too much + return if(defined($config->{prev_prog})) && $numblobs.'-'.$percent eq $config->{prev_prog}; + $config->{prev_prog} = $numblobs.'-'.$percent; + + printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]"); +} + +# end progress bar +sub end_progress { + my $config = shift; + return if defined($config->{quiet}) || defined($config->{no_progress}); + progress($config, 1,1); + print "\n"; + delete $config->{prev_prog}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VariationEffect.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VariationEffect.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,974 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::VariationEffect + +=head1 DESCRIPTION + +This module defines a set of predicate subroutines that check the effect of a +Bio::EnsEMBL::Variation::VariationFeature on some other Bio::EnsEMBL::Feature. +All of these predicates take a VariationFeatureOverlapAllele as their first and +only argument and return a true or false value depending on whether the effect +being checked for holds or not. The link between these predicates and the +specific effect is configured in the Bio::EnsEMBL::Variation::Utils::Config +module and a list of OverlapConsequence objects that represent a link between, +for example, a Sequence Ontology consequence term, and the predicate that +checks for it is provided in the Bio::EnsEMBL::Variation::Utils::Constants +module. If you want to add a new consequence you should write a predicate in +this module and then add an entry in the configuration file. + +=cut + +package Bio::EnsEMBL::Variation::Utils::VariationEffect; + +use strict; +use warnings; + +use base qw(Exporter); + +our @EXPORT_OK = qw(overlap within_cds MAX_DISTANCE_FROM_TRANSCRIPT within_intron stop_lost affects_start_codon $UPSTREAM_DISTANCE $DOWNSTREAM_DISTANCE); + +use constant MAX_DISTANCE_FROM_TRANSCRIPT => 5000; + +our $UPSTREAM_DISTANCE = MAX_DISTANCE_FROM_TRANSCRIPT; +our $DOWNSTREAM_DISTANCE = MAX_DISTANCE_FROM_TRANSCRIPT; + +#package Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele; + +sub overlap { + my ( $f1_start, $f1_end, $f2_start, $f2_end ) = @_; + + return ( ($f1_end >= $f2_start) and ($f1_start <= $f2_end) ); +} + +sub within_feature { + my ($bvfoa, $feat) = @_; + my $bvf = $bvfoa->base_variation_feature; + $feat = $bvfoa->feature unless defined($feat); + + return overlap( + $bvf->start, + $bvf->end, + $feat->start, + $feat->end + ); +} + +sub partial_overlap_feature { + my ($bvfoa, $feat) = @_; + my $bvf = $bvfoa->base_variation_feature; + $feat = $bvfoa->feature unless defined($feat); + + return ( + within_feature(@_) and + (not complete_overlap_feature(@_)) and + (($bvf->end > $feat->end) or ($bvf->start < $feat->start)) + ); +} + +sub complete_within_feature { + my ($bvfoa, $feat) = @_; + my $bvf = $bvfoa->base_variation_feature; + $feat = $bvfoa->feature unless defined($feat); + + return ( + ($bvf->start >= $feat->start) and + ($bvf->end <= $feat->end) + ); +} + +sub complete_overlap_feature { + my ($bvfoa, $feat) = @_; + my $bvf = $bvfoa->base_variation_feature; + $feat = $bvfoa->feature unless defined($feat); + + return ( + ($bvf->start <= $feat->start) and + ($bvf->end >= $feat->end) + ); +} + +sub deletion { + my $bvfoa = shift; + + my $bvf = $bvfoa->base_variation_feature; + + # sequence variant will have alleles + if($bvf->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + my ($ref_allele, $alt_allele) = _get_alleles($bvfoa); + return ( + (defined($ref_allele) && ($alt_allele eq '') and $ref_allele) or + $bvf->allele_string =~ /deletion/i + ); + } + + # structural variant depends on class + if($bvf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + return ( + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) eq 'deletion') or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /deletion/i) or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /loss/i) + ); + } + + else { return 0; } +} + +sub insertion { + my $bvfoa = shift; + + my $bvf = $bvfoa->base_variation_feature; + + # sequence variant will have alleles + if($bvf->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + my ($ref_allele, $alt_allele) = _get_alleles($bvfoa); + return ( + (defined($ref_allele) && ($ref_allele eq '') and $alt_allele) or + $bvf->allele_string =~ /insertion/i + ); + } + + # structural variant depends on class + if($bvf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + return ( + duplication($bvfoa) or + tandem_duplication($bvfoa) or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) eq 'insertion') or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /insertion/i) or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /gain/i) + ); + } + + else { return 0; } +} + +sub copy_number_gain { + my $bvfoa = shift; + + return (duplication($bvfoa) or tandem_duplication($bvfoa) or $bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /gain/i); +} + +sub copy_number_loss { + my $bvfoa = shift; + + return $bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /loss/i; +} + +sub duplication { + my $bvfoa = shift; + + return ( + ( + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) eq 'duplication') or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /duplication/i) + ) and + (not tandem_duplication($bvfoa)) + ); +} + +sub tandem_duplication { + my $bvfoa = shift; + + my $bvf = $bvfoa->base_variation_feature; + + # for sequence variants, check sequence vs ref + if($bvf->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + my ($ref_allele, $alt_allele) = _get_alleles($bvfoa); + + return 0 unless $ref_allele and $alt_allele; + return 0 unless + length($alt_allele) > length($ref_allele) and + length($alt_allele) % length($ref_allele) == 0; + + my $copies = length($alt_allele) / length($ref_allele); + + return $alt_allele eq $ref_allele x $copies; + } + + # structural variant depends on class + if($bvf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + return ( + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) eq 'tandem_duplication') or + ($bvfoa->base_variation_feature->class_SO_term(undef, 1) =~ /tandem_duplication/i) + ); + } +} + +sub feature_ablation { + my $bvfoa = shift; + + return (deletion($bvfoa) and complete_overlap_feature($bvfoa)); +} + +sub feature_amplification { + my $bvfoa = shift; + + return (copy_number_gain($bvfoa) && complete_overlap_feature($bvfoa)); +} + +sub feature_elongation { + my $bvfoa = shift; + + return ( + complete_within_feature($bvfoa) and + (copy_number_gain($bvfoa) or insertion($bvfoa)) and + not( + $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele') and + (inframe_insertion($bvfoa) or stop_lost($bvfoa)) + ) + ); +} + +sub feature_truncation { + my $bvfoa = shift; + + return ( + (partial_overlap_feature($bvfoa) or complete_within_feature($bvfoa)) and + (copy_number_loss($bvfoa) or deletion($bvfoa)) and + not( + $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele') and + (inframe_deletion($bvfoa) or stop_gained($bvfoa)) + ) + ); +} + +#sub transcript_fusion { +# #my $bvfoa = shift; +# #my $bvf = $bvfoa->base_variation_feature; +# +# return 0; +# +# #my $transcripts = $bvf->_get_overlapping_Transcripts(); +#} + +sub _before_start { + my ($bvf, $feat, $dist) = @_; + + return ( ($bvf->end >= ($feat->start - $dist)) and + ($bvf->end < $feat->start) ); +} + +sub _after_end { + my ($bvf, $feat, $dist) = @_; + return ( ($bvf->start <= ($feat->end + $dist)) + and ($bvf->start > $feat->end) ); +} + +sub _upstream { + my ($bvf, $feat, $dist) = @_; + return $feat->strand == 1 ? + _before_start($bvf, $feat, $dist) : + _after_end($bvf, $feat, $dist); +} + +sub _downstream { + my ($bvf, $feat, $dist) = @_; + return $feat->strand == 1 ? + _after_end($bvf, $feat, $dist) : + _before_start($bvf, $feat, $dist); +} + +#package Bio::EnsEMBL::Variation::TranscriptVariationAllele; + +sub upstream { + my $vfoa = shift; + my $bvf = $vfoa->base_variation_feature; + my $feat = $vfoa->feature; + + return _upstream($bvf, $feat, $UPSTREAM_DISTANCE); +} + +sub downstream { + my $vfoa = shift; + my $bvf = $vfoa->base_variation_feature; + my $feat = $vfoa->feature; + + return _downstream($bvf, $feat, $DOWNSTREAM_DISTANCE); +} + +sub affects_transcript { + my ($bvf, $tran) = @_; + + return 0 unless $tran->isa('Bio::EnsEMBL::Transcript'); + + return overlap( + $bvf->start, + $bvf->end, + $tran->start - 5000, + $tran->end + 5000 + ); +} + +sub within_transcript { + my $bvfoa = shift; + return within_feature($bvfoa); +} + +sub within_nmd_transcript { + my $bvfoa = shift; + my $tran = $bvfoa->transcript; + + return ( within_transcript($bvfoa) and ($tran->biotype eq 'nonsense_mediated_decay') ); +} + +sub within_non_coding_gene { + my $bvfoa = shift; + my $tran = $bvfoa->transcript; + + return ( within_transcript($bvfoa) and (not $tran->translation) and (not within_mature_miRNA($bvfoa))); +} + +sub non_coding_exon_variant { + my $bvfoa = shift; + + return 0 unless within_non_coding_gene($bvfoa); + + my $bvf = $bvfoa->base_variation_feature; + my $exons = $bvfoa->base_variation_feature_overlap->_exons; + + if(scalar grep {overlap($bvf->start, $bvf->end, $_->start, $_->end)} @$exons) { + return 1; + } + else { + return 0; + } +} + +sub within_miRNA { + my $bvfoa = shift; + my $tran = $bvfoa->transcript; + + # don't call this for now + + return 0; + + return ( within_transcript($bvfoa) and ($tran->biotype eq 'miRNA') ); +} + +sub within_mature_miRNA { + my $bvfoa = shift; + + my $bvfo = $bvfoa->base_variation_feature_overlap; + my $bvf = $bvfoa->base_variation_feature; + my $tran = $bvfoa->transcript; + + return 0 unless ( within_transcript($bvfoa) and ($tran->biotype eq 'miRNA') ); + + my ($attribute) = @{ $tran->get_all_Attributes('miRNA') }; + + if (defined $attribute && $attribute->value =~ /(\d+)-(\d+)/) { + for my $coord ($bvfo->_mapper->cdna2genomic($1, $2, $tran->strand)) { + if ($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + if (overlap( + $bvf->start, + $bvf->end, + $coord->start, + $coord->end) ) { + return 1; + } + } + } + } + + return 0; +} + +sub donor_splice_site { + my $bvfoa = shift; + my $tran = $bvfoa->transcript; + + return $tran->strand == 1 ? + $bvfoa->base_variation_feature_overlap->_intron_effects->{start_splice_site} : + $bvfoa->base_variation_feature_overlap->_intron_effects->{end_splice_site}; +} + +sub acceptor_splice_site { + my $bvfoa = shift; + my $tran = $bvfoa->transcript; + + return $tran->strand == 1 ? + $bvfoa->base_variation_feature_overlap->_intron_effects->{end_splice_site} : + $bvfoa->base_variation_feature_overlap->_intron_effects->{start_splice_site}; +} + +sub essential_splice_site { + my $bvfoa = shift; + + return ( acceptor_splice_site($bvfoa) or donor_splice_site($bvfoa) ); +} + +sub splice_region { + my $bvfoa = shift; + + return 0 if donor_splice_site($bvfoa); + return 0 if acceptor_splice_site($bvfoa); + return 0 if essential_splice_site($bvfoa); + + return $bvfoa->base_variation_feature_overlap->_intron_effects->{splice_region}; +} + +sub within_intron { + my $bvfoa = shift; + + return $bvfoa->base_variation_feature_overlap->_intron_effects->{intronic}; +} + +sub within_cds { + my $bvfoa = shift; + my $bvf = $bvfoa->base_variation_feature; + my $tran = $bvfoa->transcript; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + my $cds_coords = $bvfoa->base_variation_feature_overlap->cds_coords; + + if (@$cds_coords > 0) { + for my $coord (@$cds_coords) { + if ($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + if ($coord->end > 0 && $coord->start <= length($bvfo->_translateable_seq)) { + return 1; + } + } + } + } + + # we also need to check if the vf is in a frameshift intron within the CDS + + if (defined $tran->translation && + $bvfoa->base_variation_feature_overlap->_intron_effects->{within_frameshift_intron}) { + + return overlap( + $bvf->start, + $bvf->end, + $tran->coding_region_start, + $tran->coding_region_end, + ); + } + + return 0; +} + +sub within_cdna { + my $bvfoa = shift; + my $bvf = $bvfoa->base_variation_feature; + my $tran = $bvfoa->transcript; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + my $cdna_coords = $bvfo->cdna_coords; + + if (@$cdna_coords > 0) { + for my $coord (@$cdna_coords) { + if ($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + if ($coord->end > 0 && $coord->start <= $tran->length) { + return 1; + } + } + } + } + + # we also need to check if the vf is in a frameshift intron within the cDNA + + if ($bvfoa->base_variation_feature_overlap->_intron_effects->{within_frameshift_intron}) { + return within_transcript($bvfoa); + } + + return 0; +} + +sub _before_coding { + my ($bvf, $tran) = @_; + return 0 unless defined $tran->translation; + + my $bvf_s = $bvf->start; + my $bvf_e = $bvf->end; + my $t_s = $tran->start; + my $cds_s = $tran->coding_region_start; + + # we need to special case insertions just before the CDS start + if ($bvf_s == $bvf_e+1 && $bvf_s == $cds_s) { + return 1; + } + + return overlap($bvf_s, $bvf_e, $t_s, $cds_s-1); +} + +sub _after_coding { + my ($bvf, $tran) = @_; + return 0 unless defined $tran->translation; + + my $bvf_s = $bvf->start; + my $bvf_e = $bvf->end; + my $t_e = $tran->end; + my $cds_e = $tran->coding_region_end; + + # we need to special case insertions just after the CDS end + if ($bvf_s == $bvf_e+1 && $bvf_e == $cds_e) { + return 1; + } + + return overlap($bvf_s, $bvf_e, $cds_e+1, $t_e); +} + +sub within_5_prime_utr { + my $bvfoa = shift; + my $bvf = $bvfoa->base_variation_feature; + my $tran = $bvfoa->transcript; + + my $five_prime_of_coding = + $tran->strand == 1 ? + _before_coding($bvf, $tran) : + _after_coding($bvf, $tran); + + return ( $five_prime_of_coding and within_cdna($bvfoa) ); +} + +sub within_3_prime_utr { + my $bvfoa = shift; + my $bvf = $bvfoa->base_variation_feature; + my $tran = $bvfoa->transcript; + + my $three_prime_of_coding = + $tran->strand == 1 ? + _after_coding($bvf, $tran) : + _before_coding($bvf, $tran); + + return ( $three_prime_of_coding and within_cdna($bvfoa) ); +} + +sub complex_indel { + my $bvfoa = shift; + my $bvf = $bvfoa->base_variation_feature; + + # pass the no_db flag to var_class to ensure we don't rely on the database for it + # as it may not have been set at this stage in the pipeline + my $class = $bvf->var_class(1); + + return 0 unless $class =~ /insertion|deletion|indel/; + + return @{ $bvfoa->base_variation_feature_overlap->cds_coords } > 1; +} + +sub _get_peptide_alleles { + my $bvfoa = shift; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + return () if frameshift($bvfoa); + + my $alt_pep = $bvfoa->peptide; + + return () unless defined $alt_pep; + + my $ref_pep = $bvfo->get_reference_TranscriptVariationAllele->peptide; + + return () unless defined $ref_pep; + + $ref_pep = '' if $ref_pep eq '-'; + $alt_pep = '' if $alt_pep eq '-'; + + return ($ref_pep, $alt_pep); +} + +sub _get_codon_alleles { + my $bvfoa = shift; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + return () if frameshift($bvfoa); + + my $alt_codon = $bvfoa->codon; + + return () unless defined $alt_codon; + + my $ref_codon = $bvfo->get_reference_TranscriptVariationAllele->codon; + + return () unless defined $ref_codon; + + $ref_codon = '' if $ref_codon eq '-'; + $alt_codon = '' if $alt_codon eq '-'; + + return ($ref_codon, $alt_codon); +} + +sub _get_alleles { + my $bvfoa = shift; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + my $ref_tva = $bvfo->get_reference_VariationFeatureOverlapAllele; + + return () unless defined ($ref_tva); + + my $ref_allele = $ref_tva->variation_feature_seq; + my $alt_allele = $bvfoa->variation_feature_seq; + + return () unless defined($ref_allele) and defined($alt_allele); + + $ref_allele = '' if $ref_allele eq '-'; + $alt_allele = '' if $alt_allele eq '-'; + + return ($ref_allele, $alt_allele); +} + +sub stop_retained { + my $bvfoa = shift; + + my ($ref_pep, $alt_pep) = _get_peptide_alleles($bvfoa); + + return 0 unless $ref_pep; + + return ( $alt_pep =~ /\*/ && $ref_pep =~ /\*/ ); +} + +sub affects_start_codon { + my $bvfoa = shift; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + # sequence variant + if($bvfo->isa('Bio::EnsEMBL::Variation::TranscriptVariation')) { + my ($ref_pep, $alt_pep) = _get_peptide_alleles($bvfoa); + + return 0 unless $ref_pep; + + return ( ($bvfo->translation_start == 1) and (substr($ref_pep,0,1) ne substr($alt_pep,0,1)) ); + } + + # structural variant + elsif($bvfo->isa('Bio::EnsEMBL::Variation::TranscriptStructuralVariation')) { + my $tr = $bvfo->transcript; + my $bvf = $bvfo->base_variation_feature; + + my ($tr_crs, $tr_cre) = ($tr->coding_region_start, $tr->coding_region_end); + return 0 unless defined($tr_crs) && defined($tr_cre); + + if($tr->strand == 1) { + return overlap($tr_crs, $tr_crs + 2, $bvf->start, $bvf->end); + } + else { + return overlap($tr_cre-2, $tr_cre, $bvf->start, $bvf->end); + } + } + + else { + return 0; + } +} + +sub synonymous_variant { + my $bvfoa = shift; + + my ($ref_pep, $alt_pep) = _get_peptide_alleles($bvfoa); + + return 0 unless $ref_pep; + + return ( ($alt_pep eq $ref_pep) and (not stop_retained($bvfoa) and ($alt_pep !~ /X/) and ($ref_pep !~ /X/)) ); +} + +sub missense_variant { + my $bvfoa = shift; + + my ($ref_pep, $alt_pep) = _get_peptide_alleles($bvfoa); + + return 0 unless defined $ref_pep; + + return 0 if affects_start_codon($bvfoa); + return 0 if stop_lost($bvfoa); + return 0 if stop_gained($bvfoa); + return 0 if partial_codon($bvfoa); + + return 0 if inframe_deletion($bvfoa); + return 0 if inframe_insertion($bvfoa); + + return ( $ref_pep ne $alt_pep ); +} + +sub inframe_insertion { + my $bvfoa = shift; + + # sequence variant + if($bvfoa->base_variation_feature->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + my ($ref_codon, $alt_codon) = _get_codon_alleles($bvfoa); + + return 0 unless defined $ref_codon; + + return ( + (length($alt_codon) > length ($ref_codon)) && + ( ($alt_codon =~ /^\Q$ref_codon\E/) || ($alt_codon =~ /\Q$ref_codon\E$/) ) + ); + } + + # structural variant + elsif($bvfoa->base_variation_feature->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + + # TO BE DONE, NO WAY OF KNOWING WHAT SEQUENCE IS INSERTED YET + return 0; + + # must be an insertion + return 0 unless insertion($bvfoa); + + my $bvfo = $bvfoa->base_variation_feature_overlap; + + my $cds_coords = $bvfo->cds_coords; + + if(scalar @$cds_coords == 1) { + + # wholly within exon + if($cds_coords->[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + 1; + } + } + + # variant partially overlaps + else { + return 0; + } + } + + else { + return 0; + } +} + +sub inframe_deletion { + my $bvfoa = shift; + + # sequence variant + if($bvfoa->base_variation_feature->isa('Bio::EnsEMBL::Variation::VariationFeature')) { + my ($ref_codon, $alt_codon) = _get_codon_alleles($bvfoa); + + return 0 unless defined $ref_codon; + + return ( + (length($alt_codon) < length ($ref_codon)) && + ( ($ref_codon =~ /^\Q$alt_codon\E/) || ($ref_codon =~ /\Q$alt_codon\E$/) ) + ); + } + + # structural variant + elsif($bvfoa->base_variation_feature->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { + + # must be a deletion + return 0 unless deletion($bvfoa); + + my $bvfo = $bvfoa->base_variation_feature_overlap; + my $cds_coords = $bvfo->cds_coords; + my $exons = $bvfo->_exons; + my $bvf = $bvfo->base_variation_feature; + + # in exon + return ( + scalar @$cds_coords == 1 and + $cds_coords->[0]->isa('Bio::EnsEMBL::Mapper::Coordinate') and + scalar grep {complete_within_feature($bvfoa, $_)} @$exons and + $bvf->length() % 3 == 0 + ); + } + + else { + return 0; + } +} + +sub stop_gained { + my $bvfoa = shift; + my $bvfo = $bvfoa->base_variation_feature_overlap; + + return 0 unless $bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptVariationAllele'); + + my ($ref_pep, $alt_pep) = _get_peptide_alleles($bvfoa); + + return 0 unless defined $ref_pep; + + return ( ($alt_pep =~ /\*/) and ($ref_pep !~ /\*/) ); +} + +sub stop_lost { + my $bvfoa = shift; + + # sequence variant + if($bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptVariationAllele')) { + my ($ref_pep, $alt_pep) = _get_peptide_alleles($bvfoa); + + return 0 unless defined $ref_pep; + + return ( ($alt_pep !~ /\*/) and ($ref_pep =~ /\*/) ); + } + + # structural variant + elsif($bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele')) { + return 0 unless deletion($bvfoa); + + my $tr = $bvfoa->transcript; + my $bvf = $bvfoa->base_variation_feature; + + my ($tr_crs, $tr_cre) = ($tr->coding_region_start, $tr->coding_region_end); + return 0 unless defined($tr_crs) && defined($tr_cre); + + if($tr->strand == 1) { + return overlap($tr_cre-2, $tr_cre, $bvf->start, $bvf->end); + } + else { + return overlap($tr_crs, $tr_crs + 2, $bvf->start, $bvf->end); + } + } + + else { + return 0; + } +} + +sub frameshift { + my $bvfoa = shift; + + # sequence variant + if($bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptVariationAllele')) { + + return 0 if partial_codon($bvfoa); + + my $bvfo = $bvfoa->base_variation_feature_overlap; + + return 0 unless defined $bvfo->cds_start && defined $bvfo->cds_end; + + my $var_len = $bvfo->cds_end - $bvfo->cds_start + 1; + + my $allele_len = $bvfoa->seq_length; + + # if the allele length is undefined then we can't call a frameshift + + return 0 unless defined $allele_len; + + return abs( $allele_len - $var_len ) % 3; + } + + # structural variant + elsif($bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele')) { + my $bvf = $bvfoa->base_variation_feature; + my $exons = $bvfoa->base_variation_feature_overlap->_exons; + + return ( + ( + deletion($bvfoa) or + copy_number_loss($bvfoa) + ) and + scalar grep {complete_within_feature($bvfoa, $_)} @$exons and + $bvf->length % 3 != 0 + ); + + # TODO INSERTIONS + } + + else { + return 0; + } +} + +sub partial_codon { + my $bvfoa = shift; + + my $bvfo = $bvfoa->base_variation_feature_overlap; + + return 0 unless defined $bvfo->translation_start; + + my $cds_length = length $bvfo->_translateable_seq; + + my $codon_cds_start = ($bvfo->translation_start * 3) - 2; + + my $last_codon_length = $cds_length - ($codon_cds_start - 1); + + return ( $last_codon_length < 3 and $last_codon_length > 0 ); +} + +sub coding_unknown { + my $bvfoa = shift; + + # sequence variant + if($bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptVariationAllele')) { + return (within_cds($bvfoa) and ((not $bvfoa->peptide) or (not _get_peptide_alleles($bvfoa)) or ($bvfoa->peptide =~ /X/)) and (not (frameshift($bvfoa) or inframe_deletion($bvfoa)))); + } + + # structural variant + elsif($bvfoa->isa('Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele')) { + return (within_cds($bvfoa) and not(inframe_insertion($bvfoa) or inframe_deletion($bvfoa) or frameshift($bvfoa))); + } + + else { + return 0; + } +} + +#package Bio::EnsEMBL::Variation::RegulatoryFeatureVariationAllele; + +sub within_regulatory_feature { + my $rfva = shift; + return within_feature($rfva); +} + +#package Bio::EnsEMBL::Variation::ExternalFeatureVariationAllele; + +sub within_external_feature { + my $efva = shift; + return (within_feature($efva) and (not within_miRNA_target_site($efva))); +} + +#sub within_miRNA_target_site { +# my $efva = shift; +# +# my $fset = $efva->variation_feature_overlap->feature->feature_set; +# +# return ($fset && $fset->name eq 'miRanda miRNA targets'); +#} + +#package Bio::EnsEMBL::Variation::MotifFeatureVariationAllele; + +#sub within_motif_feature { +# my $mfva = shift; +# return ( +# within_feature($mfva) and +# !increased_binding_affinity($mfva) and +# !decreased_binding_affinity($mfva) +# ); +#} + +sub within_motif_feature { + my $mfva = shift; + return within_feature($mfva); +} + +#sub increased_binding_affinity { +# my $mfva = shift; +# my $change = $mfva->binding_affinity_change; +# return (within_feature($mfva) and (defined $change) and ($change > 0)); +#} +# +#sub decreased_binding_affinity { +# my $mfva = shift; +# my $change = $mfva->binding_affinity_change; +# return (within_feature($mfva) and (defined $change) and ($change < 0)); +#} + +sub contains_entire_feature { + my $vfo = shift; + + my $bvf = $vfo->base_variation_feature; + my $feat = $vfo->feature; + + return ( ($bvf->start <= $feat->start) && ($bvf->end >= $feat->end) ); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/dbSNP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/dbSNP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::dbSNP + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::Utils::dbSNP qw(decode_bitfield); + + my $hashref = decode_bitfield('050160000a01050512110101'); + + print "variant is precious\n" if $hashref->{precious}; + +=head1 DESCRIPTION + +This module provides a single subroutine decode_bitfield which decodes +a dbSNP bitfield from their VCF files into a hash reference with values +for each value specified in the field. + +The encoding is taken from the following NCBI document: + +ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf + +=cut + +package Bio::EnsEMBL::Variation::Utils::dbSNP; + +use strict; +use warnings; + +use base qw(Exporter); + +our @EXPORT_OK = qw(decode_bitfield); + +use constant ENCODING_VERSION => 5; + +# an example string, with the fields and offsets + +# F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 +# 05 0160 000a 01 05 05 12 11 01 01 +# 0 2 4 6 8 10 12 14 16 18 20 22 + +# offsets into the string for each field + +my %offsets = ( + F0 => 0, + F1_1 => 2, + F1_2 => 4, + F2_1 => 6, + F2_2 => 8, + F3 => 10, + F4 => 12, + F5 => 14, + F6 => 16, + F7 => 18, + F8 => 20, + F9 => 22, +); + +# a hash mapping the values encoded in each field to the bits used encode them +# if multiple bits are used (e.g. for version) then the values should be a +# listref of all the bits, this will be used to construct a bit mask to pick +# out the necessary information + +my %fields= ( + + F0 => { + version => [3,2,1], + }, + + F1_1 => { + trace_archive => 8, + assembly_archive => 7, + entrez_geo => 6, + probe_db => 5, + entrez_gene => 4, + entrez_sts => 3, + has_structure => 2, + submitter_link_out => 1, + }, + + F1_2 => { + clinical => 7, + precious => 6, + provisional_tpa => 5, + pubmed => 4, + sra => 3, + organism_db_link => 2, + mgc_clone => 1, + }, + + F2_1 => { + utr_3 => 8, + utr_5 => 7, + acceptor_ss => 6, + donor_ss => 5, + intron => 4, + region_3 => 3, + region_5 => 2, + in_gene => 1, + }, + + F2_2 => { + stop_loss => 6, + frameshift => 5, + missense => 4, + stop_gain => 3, + has_ref => 2, + has_syn => 1, + }, + + F3 => { + has_other_snp => 5, + has_assembly_conflict => 4, + is_assembly_specific => 3, + weight => [1,2], + }, + + F4 => { + is_mutation => 4, + is_validated => 3, + maf_all_pops => 2, + maf_some_pops => 1, + }, + + F5 => { + marker_high_density => 3, + in_haplotype_tagging_set => 2, + genotypes_available => 1, + }, + + F6 => { + tgp_2010_production => 7, + tgp_validated => 6, + tgp_2010_pilot => 5, + tgp_2009_pilot => 4, + hm_phase_3_genotyped => 3, + hm_phase_2_genotyped => 2, + hm_phase_1_genotyped => 1, + }, + + F7 => { + has_mesh => 8, + clinical_assay => 7, + has_tf => 6, + lsdb => 5, + dbgap_significant => 4, + dbgap_lod_score => 3, + third_party_annot => 2, + omim => 1, + }, + + F8 => { + var_class => [4,3,2,1], + }, + + F9 => { + is_suspect => 7, + is_somatic => 6, + contig_allele_not_present => 5, + withdrawn => 4, + cluster_no_overlap => 3, + strain_specific => 2, + genotype_conflict => 1, + }, +); + +# a lookup table for the variation class + +my %var_class = ( + 0b0001 => 'snp', + 0b0010 => 'dips', + 0b0011 => 'heterozygous', + 0b0100 => 'microsatellite', + 0b0101 => 'named', + 0b0110 => 'no_variation', + 0b0111 => 'mixed', + 0b1000 => 'multi_base', +); + +=head2 decode_bitfield + + Arg[1] : string $bitfield + Example : my $hashref = decode_bitfield('050160000a01050512110101'); + Description : Decodes a dbSNP bitfield string which encodes various attributes of a variation + Returntype : A hash reference with a key for each attribute set in the field, if the field + is boolean (e.g. precious, suspect etc.) then the value should be treated as a + true or false value, otherwise (e.g. var_class, weight) the value is the actual + value of the attribute + +=cut + +sub decode_bitfield { + + my $bitfield = shift; + + my %res; + + for my $field (keys %fields) { + + for my $value (keys %{ $fields{$field} }) { + + my $bits = $fields{$field}->{$value}; + + # if bits isn't an array, put the single bit into an array + $bits = [$bits] unless ref $bits eq 'ARRAY'; + + # OR together all the bits to give us our mask + my $mask; + + for my $bit (@$bits) { + $mask |= 2**($bit-1); + } + + # extract the relevant characters from the bitfield string, + # convert them to an integer, and apply our mask + $res{$value} = hex(substr($bitfield, $offsets{$field}, 2)) & $mask; + + # check that the version matches what we expect + if ($value eq 'version' && $res{$value} != ENCODING_VERSION) { + warn "Version field does not match the expected version (".$res{$value}." vs ".ENCODING_VERSION.")"; + return undef; + } + + # lookup the class description + $res{$value} = $var_class{$res{$value}} if $value eq 'var_class'; + + # get rid of anything set to 0 + delete $res{$value} unless $res{$value}; + } + } + + return \%res; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/Variation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Variation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1163 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::Variation +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::Variation - Ensembl representation of a nucleotide variation. + +=head1 SYNOPSIS + + $v = Bio::EnsEMBL::Variation::Variation->new(-name => 'rs123', + -source => 'dbSNP'); + + # add additional synonyms for the same SNP + $v->add_synonym('dbSNP', 'ss3242'); + $v->add_synonym('TSC', '53253'); + + # add some validation states for this SNP + $v->add_validation_status('freq'); + $v->add_validation_status('cluster'); + + # add alleles associated with this SNP + $a1 = Bio::EnsEMBL::Allele->new(...); + $a2 = Bio::EnsEMBL::Allele->new(...); + $v->add_Allele($a1); + $v->add_Allele($a2); + + # set the flanking sequences + $v->five_prime_flanking_seq($seq); + $v->three_prime_flanking_seq($seq); + + + ... + + # print out the default name and source of the variation and the version + print $v->source(), ':',$v->name(), ".",$v->version,"\n"; + + # print out every synonym associated with this variation + @synonyms = @{$v->get_all_synonyms()}; + print "@synonyms\n"; + + # print out synonyms and their database associations + my $sources = $v->get_all_synonym_sources(); + foreach my $src (@$sources) { + @synonyms = $v->get_all_synonyms($src); + print "$src: @synonyms\n"; + } + + + # print out validation states + my @vstates = @{$v->get_all_validation_states()}; + print "@validation_states\n"; + + # print out flanking sequences + print "5' flanking: ", $v->five_prime_flanking_seq(), "\n"; + print "3' flanking: ", $v->three_prime_flanking_seq(), "\n"; + + +=head1 DESCRIPTION + +This is a class representing a nucleotide variation from the +ensembl-variation database. A variation may be a SNP a multi-base substitution +or an insertion/deletion. The objects Alleles associated with a Variation +object describe the nucleotide change that Variation represents. + +A Variation object has an associated identifier and 0 or more additional +synonyms. The position of a Variation object on the Genome is represented +by the B class. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Variation; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref wrap_array); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code SO_variation_class); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); +use Bio::EnsEMBL::Variation::Utils::Sequence; +use Bio::EnsEMBL::Variation::Utils::Constants qw(%VARIATION_CLASSES); +use Bio::EnsEMBL::Variation::Failable; +use vars qw(@ISA); +use Scalar::Util qw(weaken); + +@ISA = qw(Bio::EnsEMBL::Storable Bio::EnsEMBL::Variation::Failable); + +=head2 new + + Arg [-dbID] : + int - unique internal identifier for snp + + Arg [-ADAPTOR] : + Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor + Adaptor which provides database connectivity for this Variation object + + Arg [-NAME] : + string - the name of this SNP + + Arg [-SOURCE] : + string - the source of this SNP + + Arg [-SOURCE_DESCRIPTION] : + string - description of the SNP source + + Arg [-SOURCE_TYPE] : + string - the source type of this variant + + Arg [-SYNONYMS] : + reference to hash with list reference values - keys are source + names and values are lists of identifiers from that db. + e.g.: {'dbSNP' => ['ss1231', '1231'], 'TSC' => ['1452']} + + Arg [-ANCESTRAL_ALLELES] : + string - the ancestral allele of this SNP + + Arg [-ALLELES] : + reference to list of Bio::EnsEMBL::Variation::Allele objects + + Arg [-VALIDATION_STATES] : + reference to list of strings + + Arg [-MOLTYPE] : + string - the moltype of this SNP + + Arg [-FIVE_PRIME_FLANKING_SEQ] : + string - the five prime flanking nucleotide sequence + + Arg [-THREE_PRIME_FLANKING_SEQ] : + string - the three prime flanking nucleotide sequence + + Example : $v = Bio::EnsEMBL::Variation::Variation->new + (-name => 'rs123', + -source => 'dbSNP'); + + Description: Constructor. Instantiates a new Variation object. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : none + Caller : general + Status : At Risk + +=cut + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($dbID, $adaptor, $name, $class_so_term, $src, $src_desc, $src_url, $src_type, $is_somatic, $flipped, $syns, $ancestral_allele, + $alleles, $valid_states, $moltype, $five_seq, $three_seq, $flank_flag, $minor_allele, $minor_allele_frequency, $minor_allele_count, + $clinical_significance) = + rearrange([qw(dbID ADAPTOR NAME CLASS_SO_TERM SOURCE SOURCE_DESCRIPTION SOURCE_URL SOURCE_TYPE IS_SOMATIC + FLIPPED SYNONYMS ANCESTRAL_ALLELE ALLELES VALIDATION_STATES MOLTYPE FIVE_PRIME_FLANKING_SEQ + THREE_PRIME_FLANKING_SEQ FLANK_FLAG MINOR_ALLELE MINOR_ALLELE_FREQUENCY MINOR_ALLELE_COUNT + CLINICAL_SIGNIFICANCE)],@_); + + # convert the validation state strings into a bit field + # this preserves the same order and representation as in the database + # and filters out invalid states + my $vcode = Bio::EnsEMBL::Variation::Utils::Sequence::get_validation_code($valid_states); + + my $self = bless { + 'dbID' => $dbID, + 'adaptor' => $adaptor, + 'name' => $name, + 'class_SO_term' => $class_so_term, + 'source' => $src, + 'source_description' => $src_desc, + 'source_url' => $src_url, + 'source_type'=> $src_type, + 'is_somatic' => $is_somatic, + 'flipped' => $flipped, + 'synonyms' => $syns || {}, + 'ancestral_allele' => $ancestral_allele, + 'validation_code' => $vcode, + 'moltype' => $moltype, + 'five_prime_flanking_seq' => $five_seq, + 'three_prime_flanking_seq' => $three_seq, + 'flank_flag' => $flank_flag, + 'minor_allele' => $minor_allele, + 'minor_allele_frequency' => $minor_allele_frequency, + 'minor_allele_count' => $minor_allele_count, + 'clinical_significance' => $clinical_significance, + }, $class; + + $self->add_Allele($alleles) if defined($alleles); + + return $self; +} + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + +=head2 has_failed_subsnps + + Description: DEPRECATED: Use has_failed_alleles instead. + Status : DEPRECATED + +=cut + +sub has_failed_subsnps { + my $self = shift; + + deprecate("has_failed_subsnps should no longer be used, use has_failed_alleles instead\n"); + return $self->has_failed_alleles(); +} + +=head2 has_failed_alleles + + Example : print "Variation '" . $var->name() . "' has " . ($var->has_failed_alleles() ? "" : "no ") . " failed alleles\n"; + Description: Returns true if this variation has alleles that are flagged as failed + Returntype : int + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub has_failed_alleles { + my $self = shift; + + map {return 1 if ($_->is_failed())} @{$self->get_all_Alleles()}; + return 0; +} + + +=head2 add_Allele + + Arg [1] : Bio::EnsEMBL::Variation::Allele $allele + Example : $v->add_allele(Bio::EnsEMBL::Variation::Allele->new(...)); + Description: Add an Allele to this variation. + Returntype : none + Exceptions : throw on incorrect argument + Caller : general + Status : At Risk + +=cut + +sub add_Allele { + my $self = shift; + my $allele = shift; + + # This method also accepts a list of alleles so wrap the argument in an array and treat as such + $allele = wrap_array($allele); + map {assert_ref($_,'Bio::EnsEMBL::Variation::Allele')} @{$allele}; + + # Store the allele in the private hash + $self->{alleles} = [] unless (exists($self->{alleles})); + push(@{$self->{alleles}},@{$allele}); + +} + +=head2 name + + Arg [1] : string $newval (optional) + The new value to set the name attribute to + Example : $name = $obj->name() + Description: Getter/Setter for the name attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub name{ + my $self = shift; + return $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + + +=head2 get_all_Genes + + Args : None + Example : $genes = $v->get_all_genes(); + Description : Retrieves all the genes where this Variation + has a consequence. + ReturnType : reference to list of Bio::EnsEMBL::Gene + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_all_Genes{ + my $self = shift; + my $genes; + if (defined $self->{'adaptor'}){ + my $UPSTREAM = 5000; + my $DOWNSTREAM = 5000; + my $vf_adaptor = $self->adaptor()->db()->get_VariationFeatureAdaptor(); + my $vf_list = $vf_adaptor->fetch_all_by_Variation($self); + #foreach vf, get the slice is on, us ethe USTREAM and DOWNSTREAM limits to get all the genes, and see if SNP is within the gene + my $new_slice; + my $gene_list; + my $gene_hash; + + foreach my $vf (@{$vf_list}){ + #expand the slice UPSTREAM and DOWNSTREAM + $new_slice = $vf->feature_Slice()->expand($UPSTREAM,$DOWNSTREAM); + #get the genes in the new slice + $gene_list = $new_slice->get_all_Genes(); + foreach my $gene (@{$gene_list}){ + if (($vf->start >= $gene->seq_region_start - $UPSTREAM) && ($vf->start <= $gene->seq_region_end + $DOWNSTREAM) && ($vf->end <= $gene->seq_region_end + $DOWNSTREAM)){ + #the vf is affecting the gene, add to the hash if not present already + if (!exists $gene_hash->{$gene->dbID}){ + $gene_hash->{$gene->dbID} = $gene; + } + } + } + } + #and return all the genes + push @{$genes}, values %{$gene_hash}; + } + return $genes; +} + + + + +=head2 get_all_VariationFeatures + + Args : None + Example : $vfs = $v->get_all_VariationFeatures(); + Description : Retrieves all VariationFeatures for this Variation + ReturnType : reference to list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_all_VariationFeatures{ + my $self = shift; + + if(defined $self->adaptor) { + + # get variation feature adaptor + my $vf_adaptor = $self->adaptor()->db()->get_VariationFeatureAdaptor(); + + return $vf_adaptor->fetch_all_by_Variation($self); + } + + else { + warn("No variation database attached"); + return []; + } +} + +=head2 get_VariationFeature_by_dbID + + Args : None + Example : $vf = $v->get_VariationFeature_by_dbID(); + Description : Retrieves a VariationFeature for this Variation by it's internal + database identifier + ReturnType : Bio::EnsEMBL::Variation::VariationFeature + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub get_VariationFeature_by_dbID{ + my $self = shift; + my $dbID = shift; + + throw("No dbID defined") unless defined $dbID; + + if(defined $self->adaptor) { + + # get variation feature adaptor + my $vf_adaptor = $self->adaptor()->db()->get_VariationFeatureAdaptor(); + + my $vf = $vf_adaptor->fetch_by_dbID($dbID); + + # check defined + if(defined($vf)) { + + # check it is the same variation ID + if($vf->{_variation_id} == $self->dbID) { + return $vf; + } + + else { + warn("Variation dbID for Variation Feature does not match this Variation's dbID"); + return undef; + } + } + + else { + return undef; + } + } + + else { + warn("No variation database attached"); + return undef; + } +} + + + +=head2 get_all_synonyms + + Arg [1] : (optional) string $source - the source of the synonyms to + return. + Example : @dbsnp_syns = @{$v->get_all_synonyms('dbSNP')}; + @all_syns = @{$v->get_all_synonyms()}; + Description: Retrieves synonyms for this Variation. If a source argument + is provided all synonyms from that source are returned, + otherwise all synonyms are returned. + Returntype : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_synonyms { + my $self = shift; + my $source = shift; + + if ($source) { + $source = [$source]; + } + else { + $source = $self->get_all_synonym_sources(); + } + + my @synonyms; + map {push(@synonyms,keys(%{$self->{synonyms}{$_}}))} @{$source}; + + return \@synonyms; +} + + + +=head2 get_all_synonym_sources + + Arg [1] : none + Example : my @sources = @{$v->get_all_synonym_sources()}; + Description: Retrieves a list of all the sources for synonyms of this + Variation. + Returntype : reference to a list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_synonym_sources { + my $self = shift; + my @sources = keys %{$self->{'synonyms'}}; + return \@sources; +} + + + +=head2 add_synonym + + Arg [1] : string $source + Arg [2] : string $syn + Example : $v->add_synonym('dbSNP', 'ss55331'); + Description: Adds a synonym to this variation. + Returntype : none + Exceptions : throw if $source argument is not provided + throw if $syn argument is not provided + Caller : general + Status : At Risk + +=cut + +sub add_synonym { + my $self = shift; + my $source = shift; + my $syn = shift; + + throw("source argument is required") if(!$source); + throw("syn argument is required") if(!$syn); + + $self->{'synonyms'}{$source}{$syn}++; + + return; +} + + + +=head2 get_all_validation_states + + Arg [1] : none + Example : my @vstates = @{$v->get_all_validation_states()}; + Description: Retrieves all validation states for this variation. Current + possible validation statuses are 'cluster','freq','submitter', + 'doublehit', 'hapmap' + Returntype : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_validation_states { + my $self = shift; + + return Bio::EnsEMBL::Variation::Utils::Sequence::get_all_validation_states($self->{'validation_code'}); +} + + + + +=head2 add_validation_state + + Arg [1] : string $state + Example : $v->add_validation_state('cluster'); + Description: Adds a validation state to this variation. + Returntype : none + Exceptions : warning if validation state is not a recognised type + Caller : general + Status : At Risk + +=cut + +sub add_validation_state { + Bio::EnsEMBL::Variation::Utils::Sequence::add_validation_state(@_); +} + + + +=head2 source + + Arg [1] : string $source (optional) + The new value to set the source attribute to + Example : $source = $v->source() + Description: Getter/Setter for the source attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub source{ + my $self = shift; + return $self->{'source'} = shift if(@_); + return $self->{'source'}; +} + + +=head2 source_type + + Arg [1] : string $source_type (optional) + The new value to set the source type attribute to + Example : $source_type = $v->source_type() + Description: Getter/Setter for the source type attribute + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub source_type{ + my $self = shift; + return $self->{'source_type'} = shift if(@_); + return $self->{'source_type'}; +} + + +=head2 source_description + + Arg [1] : string $source_description (optional) + The new value to set the source description attribute to + Example : $source_description = $v->source_description() + Description: Getter/Setter for the source description attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub source_description{ + my $self = shift; + return $self->{'source_description'} = shift if(@_); + return $self->{'source_description'}; +} + + + +=head2 source_url + + Arg [1] : string $source_url (optional) + The new value to set the source URL attribute to + Example : $source_url = $v->source_url() + Description: Getter/Setter for the source URL attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub source_url{ + my $self = shift; + return $self->{'source_url'} = shift if(@_); + return $self->{'source_url'}; +} + +=head2 is_somatic + + Arg [1] : boolean $is_somatic (optional) + The new value to set the is_somatic flag to + Example : $is_somatic = $v->is_somatic + Description: Getter/Setter for the is_somatic flag, which identifies this variation as either somatic or germline + Returntype : boolean + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_somatic { + my ($self, $is_somatic) = @_; + $self->{is_somatic} = $is_somatic if defined $is_somatic; + return $self->{is_somatic}; +} + +=head2 flipped + + Arg [1] : boolean $flipped (optional) + The new value to set the flipped flag to + Example : $flipped = $v->flipped + Description: Getter/Setter for the flipped flag, which identifies if this + variation's strand has been flipped during the import process + Returntype : boolean + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub flipped { + my ($self, $flipped) = @_; + $self->{flipped} = $flipped if defined $flipped; + return $self->{flipped}; +} + +=head2 get_all_Alleles + + Arg [1] : none + Example : @alleles = @{$v->get_all_Alleles()}; + Description: Retrieves all Alleles associated with this variation + Returntype : reference to list of Bio::EnsEMBL::Variation::Allele objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Alleles { + my $self = shift; + + # If the private hash key 'alleles' does not exist, no attempt has been made to load them, so do that + unless (exists($self->{alleles})) { + + # Get an AlleleAdaptor + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + my $allele_adaptor = $self->adaptor->db->get_AlleleAdaptor(); + + $self->add_Allele($allele_adaptor->fetch_all_by_Variation($self)); + } + + return $self->{alleles}; +} + + + +=head2 ancestral_allele + + Arg [1] : string $ancestral_allele (optional) + Example : $ancestral_allele = v->ancestral_allele(); + Description: Getter/Setter ancestral allele associated with this variation + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub ancestral_allele { + my $self = shift; + return $self->{'ancestral_allele'} = shift if(@_); + return $self->{'ancestral_allele'}; +} + +=head2 moltype + + Arg [1] : string $moltype (optional) + The new value to set the moltype attribute to + Example : $moltype = v->moltype(); + Description: Getter/Setter moltype associated with this variation + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub moltype { + my $self = shift; + return $self->{'moltype'} = shift if(@_); + return $self->{'moltype'}; +} + + +=head2 five_prime_flanking_seq + + Arg [1] : string $newval (optional) + The new value to set the five_prime_flanking_seq attribute to + Example : $five_prime_flanking_seq = $obj->five_prime_flanking_seq() + Description: Getter/Setter for the five_prime_flanking_seq attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub five_prime_flanking_seq{ + my $self = shift; + + #setter of the flanking sequence + return $self->{'five_prime_flanking_seq'} = shift if(@_); + #lazy-load the flanking sequence from the database + if (!defined $self->{'five_prime_flanking_seq'} && $self->{'adaptor'}){ + my $variation_adaptor = $self->adaptor()->db()->get_VariationAdaptor(); + ($self->{'three_prime_flanking_seq'},$self->{'five_prime_flanking_seq'}) = @{$variation_adaptor->get_flanking_sequence($self->{'dbID'})}; + } + return $self->{'five_prime_flanking_seq'}; +} + + + + +=head2 three_prime_flanking_seq + + Arg [1] : string $newval (optional) + The new value to set the three_prime_flanking_seq attribute to + Example : $three_prime_flanking_seq = $obj->three_prime_flanking_seq() + Description: Getter/Setter for the three_prime_flanking_seq attribute + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub three_prime_flanking_seq{ + my $self = shift; + + #setter of the flanking sequence + return $self->{'three_prime_flanking_seq'} = shift if(@_); + #lazy-load the flanking sequence from the database + if (!defined $self->{'three_prime_flanking_seq'} && $self->{'adaptor'}){ + my $variation_adaptor = $self->adaptor()->db()->get_VariationAdaptor(); + ($self->{'three_prime_flanking_seq'},$self->{'five_prime_flanking_seq'}) = @{$variation_adaptor->get_flanking_sequence($self->{'dbID'})}; + } + return $self->{'three_prime_flanking_seq'}; +} + + +=head2 get_all_IndividualGenotypes + + Args : none + Example : $ind_genotypes = $var->get_all_IndividualGenotypes() + Description: Getter for IndividualGenotypes for this Variation, returns empty list if + there are none + Returntype : listref of IndividualGenotypes + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_IndividualGenotypes { + my $self = shift; + my $individual = shift; + if (defined ($self->{'adaptor'})){ + my $igtya = $self->{'adaptor'}->db()->get_IndividualGenotypeAdaptor(); + + return $igtya->fetch_all_by_Variation($self, $individual); + } + return []; +} + +=head2 get_all_PopulationGenotypes + + Args : none + Example : $pop_genotypes = $var->get_all_PopulationGenotypes() + Description: Getter for PopulationGenotypes for this Variation, returns empty list if + there are none. + Returntype : listref of PopulationGenotypes + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_PopulationGenotypes { + my $self = shift; + + #simulate a lazy-load on demand situation, used by the Glovar team + if (!defined($self->{'populationGenotypes'}) && defined ($self->{'adaptor'})){ + my $pgtya = $self->{'adaptor'}->db()->get_PopulationGenotypeAdaptor(); + + return $pgtya->fetch_all_by_Variation($self); + } + return $self->{'populationGenotypes'}; + +} + + +=head2 add_PopulationGenotype + + Arg [1] : Bio::EnsEMBL::Variation::PopulationGenotype + Example : $v->add_PopulationGenotype($pop_genotype) + Description : Adds another PopulationGenotype to the Variation object + Exceptions : thrown on bad argument + Caller : general + Status : At Risk + +=cut + +sub add_PopulationGenotype{ + my $self = shift; + + if (@_){ + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::PopulationGenotype')) { + throw("Bio::EnsEMBL::Variation::PopulationGenotype argument expected"); + } + #a variation can have multiple PopulationGenotypes + push @{$self->{'populationGenotypes'}},shift; + } + +} + + +=head2 ambig_code + + Args : None + Example : my $ambiguity_code = $v->ambig_code() + Description : Returns the ambigutiy code for the alleles in the Variation + ReturnType : String $ambiguity_code + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub ambig_code{ + my $self = shift; + + my $code; + + # first try via VF + if(my @vfs = @{$self->get_all_VariationFeatures}) { + if(scalar @vfs) { + $code = $vfs[0]->ambig_code; + } + } + + # otherwise get it via alleles attatched to this object already + if(!defined($code)) { + my $alleles = $self->get_all_Alleles(); #get all Allele objects + my %alleles; #to get all the different alleles in the Variation + map {$alleles{$_->allele}++} @{$alleles}; + my $allele_string = join "|",keys %alleles; + $code = &ambiguity_code($allele_string); + } + + return $code; +} + +=head2 var_class + + Args : None + Example : my $variation_class = $vf->var_class() + Description : returns the class for the variation, according to dbSNP classification + ReturnType : String $variation_class + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub var_class{ + my $self = shift; + + unless ($self->{class_display_term}) { + + unless ($self->{class_SO_term}) { + # work out the term from the alleles + + my $alleles = $self->get_all_Alleles(); #get all Allele objects + my %alleles; #to get all the different alleles in the Variation + map {$alleles{$_->allele}++} @{$alleles}; + my $allele_string = join '/',keys %alleles; + + $self->{class_SO_term} = SO_variation_class($allele_string); + } + + # convert the SO term to the ensembl display term + + $self->{class_display_term} = $self->is_somatic ? + $VARIATION_CLASSES{$self->{class_SO_term}}->{somatic_display_term} : + $VARIATION_CLASSES{$self->{class_SO_term}}->{display_term}; + } + + return $self->{class_display_term}; +} + +=head2 derived_allele_frequency + + Arg[1] : Bio::EnsEMBL::Variation::Population $population + Example : $daf = $variation->derived_allele_frequency($population); + Description: Gets the derived allele frequency for the population. + The DAF is the frequency of the reference allele that is + different from the allele in Chimp. If none of the alleles + is the same as the ancestral, will return reference allele + frequency + Returntype : float + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub derived_allele_frequency{ + my $self = shift; + my $population = shift; + my $daf; + + if(!ref($population) || !$population->isa('Bio::EnsEMBL::Variation::Population')) { + throw('Bio::EnsEMBL::Variation::Population argument expected.'); + } + my $ancestral_allele = $self->ancestral_allele(); + if (defined $ancestral_allele){ + #get reference allele + my $vf_adaptor = $self->adaptor->db->get_VariationFeatureAdaptor(); + my $vf = shift @{$vf_adaptor->fetch_all_by_Variation($self)}; + my $ref_freq; + #get allele in population + my $alleles = $self->get_all_Alleles(); + + foreach my $allele (@{$alleles}){ + next unless defined $allele->population; + + if (($allele->allele eq $vf->ref_allele_string) and ($allele->population->name eq $population->name)){ + $ref_freq = $allele->frequency; + } + } + + if(defined $ref_freq) { + if ($ancestral_allele eq $vf->ref_allele_string){ + $daf = 1 - $ref_freq + } + elsif ($ancestral_allele ne $vf->ref_allele_string){ + $daf = $ref_freq; + } + } + } + + return $daf; +} + +=head2 derived_allele + + Arg[1] : Bio::EnsEMBL::Variation::Population $population + Example : $da = $variation->derived_allele($population); + Description: Gets the derived allele for the population. + Returntype : float + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub derived_allele { + my $self = shift(); + my $population = shift(); + + my $population_dbID = $population->dbID(); + my $ancestral_allele_str = $self->ancestral_allele(); + + if (not defined($ancestral_allele_str)) { + return; + } + + my $alleles = $self->get_all_Alleles(); + + my $derived_allele_str; + + foreach my $allele (@{$alleles}) { + my $allele_population = $allele->population(); + + if (defined($allele_population) and + $allele_population->dbID() == $population_dbID) + { + my $allele_str = $allele->allele(); + + if ($ancestral_allele_str ne $allele_str) { + if (defined($derived_allele_str)) { + return; + } else { + $derived_allele_str = $allele_str; + } + } + } + } + return $derived_allele_str; +} + +=head2 minor_allele + + Arg [1] : string $minor_allele (optional) + The new minor allele string + Example : $ma = $obj->minor_allele() + Description: Get/set the minor allele of this variation, as reported by dbSNP + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub minor_allele { + my ($self, $minor_allele) = @_; + $self->{minor_allele} = $minor_allele if defined $minor_allele; + return $self->{minor_allele} +} + +=head2 minor_allele_frequency + + Arg [1] : float $minor_allele_frequency (optional) + The new minor allele frequency + Example : $maf = $obj->minor_allele_frequency() + Description: Get/set the frequency of the minor allele of this variation, as reported by dbSNP + Returntype : float + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub minor_allele_frequency { + my ($self, $minor_allele_frequency) = @_; + $self->{minor_allele_frequency} = $minor_allele_frequency if defined $minor_allele_frequency; + return $self->{minor_allele_frequency} +} + +=head2 minor_allele_count + + Arg [1] : int $minor_allele_count (optional) + The new minor allele count + Example : $maf_count = $obj->minor_allele_count() + Description: Get/set the sample count of the minor allele of this variation, as reported by dbSNP + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub minor_allele_count { + my ($self, $minor_allele_count) = @_; + $self->{minor_allele_count} = $minor_allele_count if defined $minor_allele_count; + return $self->{minor_allele_count} +} + +=head2 clinical_significance + + Arg [1] : string $clinical_significance (optional) + The new clinical significance string + Example : $ma = $obj->clinical_significance() + Description: Get/set the clinical significance of this variation, as reported by dbSNP. + When available, this will be one of the following strings: + unknown + untested + non-pathogenic + probable-non-pathogenic + probable-pathogenic + pathogenic + drug-response + histocompatibility + other + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub clinical_significance { + my ($self, $clinical_significance) = @_; + $self->{clinical_significance} = $clinical_significance if defined $clinical_significance; + return $self->{clinical_significance} +} + +=head2 get_all_VariationAnnotations + + Args : none + Example : my $annotations = $var->get_all_VariationAnnotations() + Description: Getter for VariationAnnotations for this Variation, returns empty list if + there are none. + Returntype : listref of VariationAnnotations + Exceptions : none + Caller : general + +=cut + +sub get_all_VariationAnnotations { + my $self = shift; + + #ÊAssert the adaptor reference + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + + # Get the annotations from the database + return $self->adaptor->db->get_VariationAnnotationAdaptor()->fetch_all_by_Variation($self); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/VariationAnnotation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/VariationAnnotation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,476 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::VariationAnnotation +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::VariationAnnotation - A genotype phenotype annotation for a nucleotide variation. + +=head1 SYNOPSIS + + # Variation Annotation is associated with a variation object + $va = Bio::EnsEMBL::Variation::VariationAnnotation->new + (_variation_id => 8, + -phenotype_name => 'BD', + -phenotype_description => 'Bipolar Disorder',, + -associated_gene => 'HHEX', + -associated_variant_risk_allele => 'rs13266634-C', + -variation_names => 'rs13266634', + -risk_allele_freq_in_controls => '0.3', + -p_value => '6.00E-08', + -variation => $v); + + ... + + print $va->phenotype_name(),'-',$va->phenotype_description,"\n"; + print "From source ",$va->source_name,'-',$va->study_name,"\n"; + print " With study_type ", $va->study_type(),"\n"; + + ... + # Get the Variation object which this annotation represents + # If not already retrieved from the DB, this will be + # transparently lazy-loaded + my $v = $va->variation(); + +=head1 DESCRIPTION + +This is a class representing the genotype-phenotype annotation of a variation +from the ensembl-variation database. The actual variation information is +represented by an associated Bio::EnsEMBL::Variation::Variation object. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::VariationAnnotation; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Variation::Variation; +use Bio::EnsEMBL::Storable; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + +=head2 new + + Arg [-dbID] : + int - unique internal identifier for variation_annotation + Arg [-ADAPTOR] : + Bio::EnsEMBL::Variation::DBSQL::VariationAnnotationAdaptor + Arg [-PHENOTYPE_NAME] : + string - name of the phenotype + Arg [-PHENOTYPE_DESCRIPTION] : + string - description of the phenotype + Arg [-VARIATION_NAMES] : + string - name of the associated variations + Arg [-VARIATION] : + int - the variation object associated with this annotation. + Arg [_VARIATION_ID] : + int _ the internal id of the variation object associated with this + identifier. This may be provided instead of a variation object so that + the variation may be lazy-loaded from the database on demand. + Arg [-ASSOCIATED_GENE] : + string - the gene names associated with this annotation/variant. + Arg [-ASSOCIATED_VARIANT_RISK_ALLELE] : + string - the variants-risk alleles associated with this annotation. + Arg [-RISK_ALLELE_FREQ_IN_CONTROLS] : + string - the risk allele frequency in controls associated with this annotation. + Arg [-P_VALUE] : + string - the p_value associated with this annotation. + Arg [-STUDY] : + object ref - the study object describing where the annotated variation comes from + + Example : + $va = Bio::EnsEMBL::Variation::VariationAnnotation->new + (-phenotype_name => 'BD', + -phenotype_description => 'Bipolar Disorder', + -variation_names => 'rs123', + _variation_id => 10, + -associated_gene => 'HHEX', + -associated_variant_risk_allele => 'rs13266634-C', + -risk_allele_freq_in_controls => '0.3', + -p_value => '6.00E-08', + -variation => $v); + + Description: Constructor. Instantiates a new VariationAnnotation object. + Returntype : Bio::EnsEMBL::Variation::VariationAnnotation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + my ($dbID,$adaptor,$phenotype_id,$phenotype_name,$phenotype_description,$variation_id,$variation_names, + $variation,$associated_gene,$associated_variant_risk_allele,$risk_allele_freq_in_controls,$p_value, + $study) = + rearrange([qw(dbID ADAPTOR _PHENOTYPE_ID PHENOTYPE_NAME PHENOTYPE_DESCRIPTION + VARIATION_ID VARIATION_NAMES VARIATION ASSOCIATED_GENE ASSOCIATED_VARIANT_RISK_ALLELE + RISK_ALLELE_FREQ_IN_CONTROLS P_VALUE STUDY)],@_); + + $self->{'dbID'} = $dbID; + $self->{'adaptor'} = $adaptor; + $self->{'_phenotype_id'} = $phenotype_id; + $self->{'phenotype_name'} = $phenotype_name; + $self->{'phenotype_description'} = $phenotype_description; + $self->{'variation'} = $variation; + $self->{'_variation_id'} = $variation_id; + $self->{'variation_names'} = $variation_names; + $self->{'associated_gene'} = $associated_gene; + $self->{'associated_variant_risk_allele'} = $associated_variant_risk_allele; + $self->{'risk_allele_freq_in_controls'} = $risk_allele_freq_in_controls; + $self->{'p_value'} = $p_value; + $self->{'study'} => $study, + return $self; +} + + + +sub new_fast { + my $class = shift; + my $hashref = shift; + return bless $hashref, $class; +} + + +=head2 phenotype_name + + Arg [1] : string phenotype_name (optional) + The new value to set the phenotype_name attribute to + Example : $phenotype_name = $obj->phenotype_name() + Description: Getter/Setter for the phenotype_name attribute. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub phenotype_name{ + my $self = shift; + return $self->{'phenotype_name'} = shift if(@_); + return $self->{'phenotype_name'}; +} + +=head2 phenotype_description + + Arg [1] : string phenotype_description (optional) + The new value to set the phenotype_description attribute to + Example : $phenotype_description = $obj->phenotype_description() + Description: Getter/Setter for the phenotype_description attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub phenotype_description{ + my $self = shift; + return $self->{'phenotype_description'} = shift if(@_); + return $self->{'phenotype_description'}; +} + +=head2 source_name + + Arg [1] : string source_name (optional) + The new value to set the source_name attribute to + Example : $source_name = $obj->source_name() + Description: Getter/Setter for the source_name attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source_name{ + my $self = shift; + return $self->{'study'}->source = shift if(@_); + return $self->{'study'}->source; +} + +=head2 study_type + + Arg [1] : string study_type (optional) + The new value to set the study_type attribute to + Example : $study_type = $obj->study_type() + Description: Getter/Setter for the study_type attribute. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_type{ + my $self = shift; + return $self->{'study'}->type = shift if(@_); + return $self->{'study'}->type; +} + + +=head2 variation + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Variation $variation + Example : $v = $va->variation(); + Description: Getter/Setter for the variation associated with this annotation. + If not set, and this VariationAnnotation has an associated adaptor + an attempt will be made to lazy-load the variation from the + database. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw on incorrect argument + Caller : general + Status : Stable + +=cut + +sub variation { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::Variation')) { + throw("Bio::EnsEMBL::Variation::Variation argument expected"); + } + $self->{'variation'} = shift; + } + elsif(!defined($self->{'variation'}) && $self->{'adaptor'} && + defined($self->{'_variation_id'})) { + # lazy-load from database on demand + my $va = $self->{'adaptor'}->db()->get_VariationAdaptor(); + $self->{'variation'} = $va->fetch_by_dbID($self->{'_variation_id'}); + } + + return $self->{'variation'}; +} + +=head2 variation_names + + Arg [1] : string $newval (optional) + The new value to set the variation_names attribute to + Example : $variation_names = $obj->variation_names() + Description: Getter/Setter for the variation_names attribute. This is the + names of the variation associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub variation_names{ + my $self = shift; + return $self->{'variation_names'} = shift if(@_); + return $self->{'variation_names'}; +} + + +=head2 study_name + + Arg [1] : string $study_name (optional) + The new value to set the study_name attribute to + Example : $study = $sva->study_name() + Description: Getter/Setter for the study_name attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_name{ + my $self = shift; + return $self->{'study'}->name = shift if(@_); + return $self->{'study'}->name; +} + + +=head2 study_description + + Arg [1] : string $study_description (optional) + The new value to set the study_description attribute to + Example : $study_description = $obj->study_description() + Description: Getter/Setter for the study_description attribute + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_description{ + my $self = shift; + return $self->{'study'}->description = shift if(@_); + return $self->{'study'}->description; +} + + +=head2 external_reference + + Arg [1] : string $newval (optional) + The new value to set the external reference attribute to + Example : $external_reference = $obj->external_reference() + Description: Getter/Setter for the external reference attribute. This is the + pubmed/id or project name associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub external_reference{ + my $self = shift; + return $self->{'study'}->external_reference = shift if(@_); + return $self->{'study'}->external_reference; +} + + +=head2 study_url + + Arg [1] : string $newval (optional) + The new value to set the study_url attribute to + Example : $url = $obj->study_url() + Description: Getter/Setter for the study_url attribute. This is the link to the website where the data are stored. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub study_url{ + my $self = shift; + return $self->{'study'}->url = shift if(@_); + return $self->{'study'}->url; +} + + +=head2 associated_studies + Example : $name = $obj->associate_studies() + Description: Getter/Setter for the associated_studies attribute + (e.g. EGA studies can be associated to NHGRI studies). + Returntype : reference to list of Bio::EnsEMBL::Variation::Study + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub associated_studies{ + my $self = shift; + return $self->{'study'}->associated_studies; +} + + +=head2 associated_gene + + Arg [1] : string $newval (optional) + The new value to set the associated_gene attribute to + Example : $associated_gene = $obj->associated_gene() + Description: Getter/Setter for the associated_gene attribute. This is the + gene names associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub associated_gene{ + my $self = shift; + return $self->{'associated_gene'} = shift if(@_); + return $self->{'associated_gene'}; +} + +=head2 associated_variant_risk_allele + + Arg [1] : string $newval (optional) + The new value to set the associated_variant_risk_allele attribute to + Example : $associated_variant_risk_allele = $obj->associated_variant_risk_allele() + Description: Getter/Setter for the associated_variant_risk_allele attribute. This is the + associated_variant_risk_allele associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub associated_variant_risk_allele{ + my $self = shift; + return $self->{'associated_variant_risk_allele'} = shift if(@_); + return $self->{'associated_variant_risk_allele'}; +} + +=head2 risk_allele_freq_in_controls + + Arg [1] : string $newval (optional) + The new value to set the risk_allele_freq_in_controls attribute to + Example : $risk_allele_freq_in_controls = $obj->risk_allele_freq_in_controls() + Description: Getter/Setter for the risk_allele_freq_in_controls attribute. This is the + risk_allele_freq_in_controls associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub risk_allele_freq_in_controls{ + my $self = shift; + return $self->{'risk_allele_freq_in_controls'} = shift if(@_); + return $self->{'risk_allele_freq_in_controls'}; +} + +=head2 p_value + + Arg [1] : string $newval (optional) + The new value to set the p_value attribute to + Example : $p_value = $obj->p_value() + Description: Getter/Setter for the p_value attribute. This is the + p_value associated with this study. + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub p_value{ + my $self = shift; + return $self->{'p_value'} = shift if(@_); + return $self->{'p_value'}; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/VariationFeature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/VariationFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1735 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::VariationFeature +# +# Copyright (c) 2004 Ensembl +# + + +=head1 NAME + +Bio::EnsEMBL::Variation::VariationFeature - A genomic position for a nucleotide variation. + +=head1 SYNOPSIS + + # Variation feature representing a single nucleotide polymorphism + $vf = Bio::EnsEMBL::Variation::VariationFeature->new + (-start => 100, + -end => 100, + -strand => 1, + -slice => $slice, + -allele_string => 'A/T', + -variation_name => 'rs635421', + -map_weight => 1, + -variation => $v); + + # Variation feature representing a 2bp insertion + $vf = Bio::EnsEMBL::Variation::VariationFeature->new + (-start => 1522, + -end => 1521, # end = start-1 for insert + -strand => -1, + -slice => $slice, + -allele_string => '-/AA', + -variation_name => 'rs12111', + -map_weight => 1, + -variation => $v2); + + ... + + # a variation feature is like any other ensembl feature, can be + # transformed etc. + $vf = $vf->transform('supercontig'); + + print $vf->start(), "-", $vf->end(), '(', $vf->strand(), ')', "\n"; + + print $vf->name(), ":", $vf->allele_string(); + + # Get the Variation object which this feature represents the genomic + # position of. If not already retrieved from the DB, this will be + # transparently lazy-loaded + my $v = $vf->variation(); + +=head1 DESCRIPTION + +This is a class representing the genomic position of a nucleotide variation +from the ensembl-variation database. The actual variation information is +represented by an associated Bio::EnsEMBL::Variation::Variation object. Some +of the information has been denormalized and is available on the feature for +speed purposes. A VariationFeature behaves as any other Ensembl feature. +See B and B. + +=head1 METHODS + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::VariationFeature; + +use Scalar::Util qw(weaken isweak); + +use Bio::EnsEMBL::Variation::BaseVariationFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code hgvs_variant_notation SO_variation_class format_hgvs_string); +use Bio::EnsEMBL::Variation::Utils::Sequence; +use Bio::EnsEMBL::Variation::Variation; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT); +use Bio::EnsEMBL::Variation::Utils::Constants qw($DEFAULT_OVERLAP_CONSEQUENCE %VARIATION_CLASSES); +use Bio::EnsEMBL::Variation::RegulatoryFeatureVariation; +use Bio::EnsEMBL::Variation::MotifFeatureVariation; +use Bio::EnsEMBL::Variation::ExternalFeatureVariation; +use Bio::EnsEMBL::Variation::IntergenicVariation; +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor; +use Bio::PrimarySeq; +use Bio::SeqUtils; + +our @ISA = ('Bio::EnsEMBL::Variation::BaseVariationFeature'); + +our $DEBUG = 0; +=head2 new + + Arg [-dbID] : + see superclass constructor + + Arg [-ADAPTOR] : + see superclass constructor + + Arg [-START] : + see superclass constructor + Arg [-END] : + see superclass constructor + + Arg [-STRAND] : + see superclass constructor + + Arg [-SLICE] : + see superclass constructor + + Arg [-VARIATION_NAME] : + string - the name of the variation this feature is for (denormalisation + from Variation object). + + Arg [-MAP_WEIGHT] : + int - the number of times that the variation associated with this feature + has hit the genome. If this was the only feature associated with this + variation_feature the map_weight would be 1. + + Arg [-VARIATION] : + int - the variation object associated with this feature. + + Arg [-SOURCE] : + string - the name of the source where the SNP comes from + + Arg [-SOURCE_VERSION] : + number - the version of the source where the SNP comes from + + Arg [-VALIDATION_CODE] : + reference to list of strings + + Arg [-OVERLAP_CONSEQUENCES] : + listref of Bio::EnsEMBL::Variation::OverlapConsequences - all the consequences of this VariationFeature + + Arg [-VARIATION_ID] : + int - the internal id of the variation object associated with this + identifier. This may be provided instead of a variation object so that + the variation may be lazy-loaded from the database on demand. + + Example : + $vf = Bio::EnsEMBL::Variation::VariationFeature->new( + -start => 100, + -end => 100, + -strand => 1, + -slice => $slice, + -allele_string => 'A/T', + -variation_name => 'rs635421', + -map_weight => 1, + -source => 'dbSNP', + -validation_code => ['cluster','doublehit'], + -variation => $v + ); + + Description: Constructor. Instantiates a new VariationFeature object. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ( + $allele_str, + $var_name, + $map_weight, + $variation, + $variation_id, + $source, + $source_version, + $is_somatic, + $validation_code, + $overlap_consequences, + $class_so_term, + $minor_allele, + $minor_allele_freq, + $minor_allele_count + ) = rearrange([qw( + ALLELE_STRING + VARIATION_NAME + MAP_WEIGHT + VARIATION + _VARIATION_ID + SOURCE + SOURCE_VERSION + IS_SOMATIC + VALIDATION_CODE + OVERLAP_CONSEQUENCES + CLASS_SO_TERM + MINOR_ALLELE + MINOR_ALLELE_FREQUENCY + MINOR_ALLELE_COUNT + )], @_); + + $self->{'allele_string'} = $allele_str; + $self->{'variation_name'} = $var_name; + $self->{'map_weight'} = $map_weight; + $self->{'variation'} = $variation; + $self->{'_variation_id'} = $variation_id; + $self->{'source'} = $source; + $self->{'source_version'} = $source_version; + $self->{'is_somatic'} = $is_somatic; + $self->{'validation_code'} = $validation_code; + $self->{'overlap_consequences'} = $overlap_consequences; + $self->{'class_SO_term'} = $class_so_term; + $self->{'minor_allele'} = $minor_allele; + $self->{'minor_allele_frequency'} = $minor_allele_freq; + $self->{'minor_allele_count'} = $minor_allele_count; + + return $self; +} + + + +sub new_fast { + + my $class = shift; + my $hashref = shift; + my $self = bless $hashref, $class; + weaken($self->{'adaptor'}) if ( ! isweak($self->{'adaptor'}) ); + return $self; + +} + + +=head2 allele_string + + Arg [1] : string $newval (optional) + The new value to set the allele_string attribute to + Example : $allele_string = $obj->allele_string() + Description: Getter/Setter for the allele_string attribute. + The allele_string is a '/' demimited string representing the + alleles associated with this features variation. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub allele_string{ + my $self = shift; + return $self->{'allele_string'} = shift if(@_); + return $self->{'allele_string'}; +} + +=head2 display_id + + Arg [1] : none + Example : print $vf->display_id(), "\n"; + Description: Returns the 'display' identifier for this feature. For + VariationFeatures this is simply the name of the variation + it is associated with. + Returntype : string + Exceptions : none + Caller : webcode + Status : At Risk + +=cut + +sub display_id { + my $self = shift; + return $self->{'variation_name'} || ''; +} + + + +=head2 variation_name + + Arg [1] : string $newval (optional) + The new value to set the variation_name attribute to + Example : $variation_name = $obj->variation_name() + Description: Getter/Setter for the variation_name attribute. This is the + name of the variation associated with this feature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub variation_name{ + my $self = shift; + return $self->{'variation_name'} = shift if(@_); + return $self->{'variation_name'}; +} + + + +=head2 map_weight + + Arg [1] : int $newval (optional) + The new value to set the map_weight attribute to + Example : $map_weight = $obj->map_weight() + Description: Getter/Setter for the map_weight attribute. The map_weight + is the number of times this features variation was mapped to + the genome. + Returntype : int + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub map_weight{ + my $self = shift; + return $self->{'map_weight'} = shift if(@_); + return $self->{'map_weight'}; +} + +=head2 minor_allele + + Arg [1] : string $minor_allele (optional) + The new minor allele string + Example : $ma = $obj->minor_allele() + Description: Get/set the minor allele of this variation, as reported by dbSNP + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub minor_allele { + my ($self, $minor_allele) = @_; + $self->{minor_allele} = $minor_allele if defined $minor_allele; + return $self->{minor_allele} +} + +=head2 minor_allele_frequency + + Arg [1] : float $minor_allele_frequency (optional) + The new minor allele frequency + Example : $maf = $obj->minor_allele_frequency() + Description: Get/set the frequency of the minor allele of this variation, as reported by dbSNP + Returntype : float + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub minor_allele_frequency { + my ($self, $minor_allele_frequency) = @_; + $self->{minor_allele_frequency} = $minor_allele_frequency if defined $minor_allele_frequency; + return $self->{minor_allele_frequency} +} + +=head2 minor_allele_count + + Arg [1] : int $minor_allele_count (optional) + The new minor allele count + Example : $maf_count = $obj->minor_allele_count() + Description: Get/set the sample count of the minor allele of this variation, as reported by dbSNP + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub minor_allele_count { + my ($self, $minor_allele_count) = @_; + $self->{minor_allele_count} = $minor_allele_count if defined $minor_allele_count; + return $self->{minor_allele_count} +} + + + +=head2 get_all_TranscriptVariations + + Arg [1] : (optional) listref of Bio::EnsEMBL::Transcript objects + Example : $vf->get_all_TranscriptVariations; + Description : Get all the TranscriptVariations associated with this VariationFeature. + If the optional list of Transcripts is supplied, get only TranscriptVariations + associated with those Transcripts. + Returntype : listref of Bio::EnsEMBL::Variation::TranscriptVariation objects + Exceptions : Thrown on wrong argument type + Caller : general + Status : At Risk + +=cut + +sub get_all_TranscriptVariations { + + my ($self, $transcripts) = @_; + + if ($transcripts) { + assert_ref($transcripts, 'ARRAY'); + map { assert_ref($_, 'Bio::EnsEMBL::Transcript') } @$transcripts; + } + + #die unless $self->{transcript_variations}; + + if ($self->dbID && not defined $self->{transcript_variations}) { + # this VariationFeature is from the database, so we can just fetch the + # TranscriptVariations from the database as well + + if (my $db = $self->adaptor->db) { + my $tva = $db->get_TranscriptVariationAdaptor; + + # just fetch TVs for all Transcripts because that's more efficient, + # we'll only return those asked for later on + + my $tvs = $tva->fetch_all_by_VariationFeatures([$self]); + + map { $self->add_TranscriptVariation($_) } @$tvs; + } + } + elsif (not defined $self->{transcript_variations}) { + # this VariationFeature is not in the database so we have to build the + # TranscriptVariations ourselves + + unless ($transcripts) { + # if the caller didn't supply some transcripts fetch those around this VariationFeature + + # get a slice around this transcript including the maximum distance up and down-stream + # that we still call consequences for + + my $slice = $self->feature_Slice->expand( + MAX_DISTANCE_FROM_TRANSCRIPT, + MAX_DISTANCE_FROM_TRANSCRIPT + ); + + # fetch all transcripts on this slice + + $transcripts = $slice->get_all_Transcripts(1); + } + + my @unfetched_transcripts = grep { + not exists $self->{transcript_variations}->{$_->stable_id} + } @$transcripts; + + for my $transcript (@unfetched_transcripts) { + $self->add_TranscriptVariation( + Bio::EnsEMBL::Variation::TranscriptVariation->new( + -variation_feature => $self, + -transcript => $transcript, + -adaptor => ($self->adaptor->db ? $self->adaptor->db->get_TranscriptVariationAdaptor : undef), + ) + ); + } + } + + if ($transcripts) { + # just return TranscriptVariations for the requested Transcripts + return [ map { $self->{transcript_variations}->{$_->stable_id} } @$transcripts ]; + } + else { + # return all TranscriptVariations + return [ values %{ $self->{transcript_variations} } ]; + } +} + +=head2 get_all_RegulatoryFeatureVariations + + Description : Get all the RegulatoryFeatureVariations associated with this VariationFeature. + Returntype : listref of Bio::EnsEMBL::Variation::RegulatoryFeatureVariation objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_RegulatoryFeatureVariations { + my $self = shift; + return $self->_get_all_RegulationVariations('RegulatoryFeature', @_); +} + +=head2 get_all_MotifFeatureVariations + + Description : Get all the MotifFeatureVariations associated with this VariationFeature. + Returntype : listref of Bio::EnsEMBL::Variation::MotifFeatureVariation objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_MotifFeatureVariations { + my $self = shift; + return $self->_get_all_RegulationVariations('MotifFeature', @_); +} + +=head2 get_all_ExternalFeatureVariations + + Description : Get all the ExternalFeatureVariations associated with this VariationFeature. + Returntype : listref of Bio::EnsEMBL::Variation::ExternalFeatureVariation objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_ExternalFeatureVariations { + my $self = shift; + return $self->_get_all_RegulationVariations('ExternalFeature', @_); +} + +sub _get_all_RegulationVariations { + my ($self, $type) = @_; + + unless ($type && ($type eq 'RegulatoryFeature' || $type eq 'MotifFeature' || $type eq 'ExternalFeature')) { + throw("Invalid Ensembl Regulation type '$type'"); + } + + unless ($self->{regulation_variations}->{$type}) { + + my $fg_adaptor; + + if (my $adap = $self->adaptor) { + if(my $db = $adap->db) { + $fg_adaptor = Bio::EnsEMBL::DBSQL::MergedAdaptor->new( + -species => $adap->db->species, + -type => $type, + ); + } + + unless ($fg_adaptor) { + warning("Failed to get adaptor for $type"); + return []; + } + } + else { + warning('Cannot get variation features without attached adaptor'); + return []; + } + + my $slice = $self->feature_Slice; + + my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation'; + + eval { + $self->{regulation_variations}->{$type} = [ + map { + $constructor->new( + -variation_feature => $self, + -feature => $_, + ); + } map { $_->transfer($self->slice) } @{ $fg_adaptor->fetch_all_by_Slice($slice) } + ]; + }; + + $self->{regulation_variations}->{$type} ||= []; + } + + return $self->{regulation_variations}->{$type}; +} + +sub get_IntergenicVariation { + my $self = shift; + my $no_ref_check = shift; + + unless (exists $self->{intergenic_variation}) { + if (scalar(@{ $self->get_all_TranscriptVariations }) == 0) { + $self->{intergenic_variation} = Bio::EnsEMBL::Variation::IntergenicVariation->new( + -variation_feature => $self, + -no_ref_check => $no_ref_check, + ); + } + else { + $self->{intergenic_variation} = undef; + } + } + + return $self->{intergenic_variation}; +} + +=head2 get_all_VariationFeatureOverlaps + + Description : Get all the VariationFeatureOverlaps associated with this VariationFeature, this + includes TranscriptVariations and regulatory feature overlap object. + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlap objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_VariationFeatureOverlaps { + my $self = shift; + + my $vfos = [ + @{ $self->get_all_TranscriptVariations }, + @{ $self->get_all_RegulatoryFeatureVariations }, + @{ $self->get_all_MotifFeatureVariations }, + @{ $self->get_all_ExternalFeatureVariations }, + ]; + + if (my $iv = $self->get_IntergenicVariation) { + push @$vfos, $iv; + } + + return $vfos; +} + +=head2 add_TranscriptVariation + + Arg [1] : Bio::EnsEMBL::Variation::TranscriptVariation + Example : $vf->add_TranscriptVariation($tv); + Description : Adds a TranscriptVariation to the variation feature object. + Exceptions : thrown on bad argument + Caller : Bio::EnsEMBL::Variation::TranscriptVariationAdaptor + Status : At Risk + +=cut + +sub add_TranscriptVariation { + my ($self, $tv) = @_; + assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation'); + # we need to weaken the reference back to us to avoid a circular reference + weaken($tv->{base_variation_feature}); + $self->{transcript_variations}->{$tv->transcript_stable_id} = $tv; +} + +=head2 variation + + Arg [1] : (optional) Bio::EnsEMBL::Variation::Variation $variation + Example : $v = $vf->variation(); + Description: Getter/Setter for the variation associated with this feature. + If not set, and this VariationFeature has an associated adaptor + an attempt will be made to lazy-load the variation from the + database. + Returntype : Bio::EnsEMBL::Variation::Variation + Exceptions : throw on incorrect argument + Caller : general + Status : Stable + +=cut + +sub variation { + my $self = shift; + + if(@_) { + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::Variation')) { + throw("Bio::EnsEMBL::Variation::Variation argument expected"); + } + $self->{'variation'} = shift; + } + elsif(!defined($self->{'variation'}) && $self->adaptor() && + defined($self->{'_variation_id'})) { + # lazy-load from database on demand + my $va = $self->adaptor->db()->get_VariationAdaptor(); + $self->{'variation'} = $va->fetch_by_dbID($self->{'_variation_id'}); + } + + return $self->{'variation'}; +} + +=head2 consequence_type + + Arg [1] : (optional) String $term_type + Description: Get a list of all the unique consequence terms of this + VariationFeature. By default returns Ensembl display terms + (e.g. 'NON_SYNONYMOUS_CODING'). $term_type can also be 'label' + (e.g. 'Non-synonymous coding'), 'SO' (Sequence Ontology, e.g. + 'non_synonymous_codon') or 'NCBI' (e.g. 'missense') + Returntype : listref of strings + Exceptions : none + Status : At Risk + +=cut + +sub consequence_type { + + my $self = shift; + my $term_type = shift; + + my $method_name; + + # delete cached term + if(defined($term_type)) { + delete $self->{consequence_types}; + $method_name = $term_type.($term_type eq 'label' ? '' : '_term'); + $method_name = 'SO_term' unless @{$self->get_all_OverlapConsequences} && $self->get_all_OverlapConsequences->[0]->can($method_name); + } + + $method_name ||= 'SO_term'; + + if (exists($self->{current_consequence_method}) && $self->{current_consequence_method} ne $method_name) { + delete $self->{consequence_type}; + } + + unless ($self->{consequence_types}) { + + # work out the terms from the OverlapConsequence objects + + $self->{consequence_types} = + [ map { $_->$method_name } @{ $self->get_all_OverlapConsequences } ]; + } + + $self->{current_consequence_method} = $method_name; + + return $self->{consequence_types}; +} + +=head2 get_all_OverlapConsequences + + Description: Get a list of all the unique OverlapConsequences of this VariationFeature, + calculating them on the fly from the TranscriptVariations if necessary + Returntype : listref of Bio::EnsEMBL::Variation::OverlapConsequence objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_OverlapConsequences { + my $self = shift; + + unless ($self->{overlap_consequences}) { + + # work them out and store them in a hash keyed by SO_term as we don't + # want duplicates from different VFOs + + my %overlap_cons; + + for my $vfo (@{ $self->get_all_TranscriptVariations }) { + for my $allele (@{ $vfo->get_all_alternate_VariationFeatureOverlapAlleles }) { + for my $cons (@{ $allele->get_all_OverlapConsequences }) { + $overlap_cons{$cons->SO_term} = $cons; + } + } + } + + # if we don't have any consequences we use a default from Constants.pm + # (currently set to the intergenic consequence) + + $self->{overlap_consequences} = [ + %overlap_cons ? values %overlap_cons : $DEFAULT_OVERLAP_CONSEQUENCE + ]; + } + + return $self->{overlap_consequences}; +} + +=head2 add_OverlapConsequence + + Arg [1] : Bio::EnsEMBL::Variation::OverlapConsequence instance + Description: Add an OverlapConsequence to this VariationFeature's list + Returntype : none + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub add_OverlapConsequence { + my ($self, $oc) = @_; + assert_ref($oc, 'Bio::EnsEMBL::Variation::OverlapConsequence'); + push @{ $self->{overlap_consequences} ||= [] }, $oc; +} + +=head2 most_severe_OverlapConsequence + + Description: Get the OverlapConsequence considered (by Ensembl) to be the most severe + consequence of all the alleles of this VariationFeature + Returntype : Bio::EnsEMBL::Variation::OverlapConsequence + Exceptions : none + Status : At Risk + +=cut + +sub most_severe_OverlapConsequence { + my $self = shift; + + unless ($self->{_most_severe_consequence}) { + + my $highest; + + for my $cons (@{ $self->get_all_OverlapConsequences }) { + $highest ||= $cons; + if ($cons->rank < $highest->rank) { + $highest = $cons; + } + } + + $self->{_most_severe_consequence} = $highest; + } + + return $self->{_most_severe_consequence}; +} + +=head2 display_consequence + + Arg [1] : (optional) String $term_type + Description: Get the term for the most severe consequence of this + VariationFeature. By default returns Ensembl display terms + (e.g. 'NON_SYNONYMOUS_CODING'). $term_type can also be 'label' + (e.g. 'Non-synonymous coding'), 'SO' (Sequence Ontology, e.g. + 'non_synonymous_codon') or 'NCBI' (e.g. 'missense') + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub display_consequence { + my $self = shift; + my $term_type = shift; + + my $method_name; + + # delete cached term + if(defined($term_type)) { + $method_name = $term_type.($term_type eq 'label' ? '' : '_term'); + $method_name = 'SO_term' unless @{$self->get_all_OverlapConsequences} && $self->get_all_OverlapConsequences->[0]->can($method_name); + } + + $method_name ||= 'SO_term'; + + return $self->most_severe_OverlapConsequence->$method_name; +} + +=head2 add_consequence_type + + Status : Deprecated, use add_OverlapConsequence instead + +=cut + +sub add_consequence_type{ + my $self = shift; + warning('Deprecated method, use add_OverlapConsequence instead'); + return $self->add_OverlapConsequence(@_); +} + +=head2 get_consequence_type + + Status : Deprecated, use consequence_type instead + +=cut + +sub get_consequence_type { + my $self = shift; + warning('Deprecated method, use consequence_type instead'); + return $self->consequence_type; +} + +=head2 ambig_code + + Args : None + Example : my $ambiguity_code = $vf->ambig_code() + Description : Returns the ambigutiy code for the alleles in the VariationFeature + ReturnType : String $ambiguity_code + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub ambig_code{ + my $self = shift; + + return &ambiguity_code($self->allele_string()); +} + +=head2 var_class + + Args[1] : (optional) no_db - don't use the term from the database, always calculate it from the allele string + (used by the ensembl variation pipeline) + Example : my $variation_class = $vf->var_class + Description : returns the Ensembl term for the class of this variation + ReturnType : string + Exceptions : throws if we can't find a corresponding display term for an SO term + Caller : General + Status : At Risk + +=cut + +sub var_class { + + my $self = shift; + my $no_db = shift; + + unless ($self->{class_display_term}) { + + my $so_term = $self->class_SO_term(undef, $no_db); + + # convert the SO term to the ensembl display term + + $self->{class_display_term} = $self->is_somatic ? + $VARIATION_CLASSES{$so_term}->{somatic_display_term} : + $VARIATION_CLASSES{$so_term}->{display_term}; + } + + return $self->{class_display_term}; +} + +=head2 class_SO_term + + Args[1] : (optional) class_SO_term - the SO term for the class of this variation feature + Args[2] : (optional) no_db - don't use the term from the database, always calculate it from the allele string + (used by the ensembl variation pipeline) + Example : my $SO_variation_class = $vf->class_SO_term() + Description : Get/set the SO term for the class of this variation + ReturnType : string + Exceptions : none + Caller : General + Status : At Risk + +=cut + +sub class_SO_term { + my ($self, $class_SO_term, $no_db) = @_; + + $self->{class_SO_term} = $class_SO_term if $class_SO_term; + + if ($no_db || !$self->{class_SO_term}) { + $self->{class_SO_term} = SO_variation_class($self->allele_string); + } + + return $self->{class_SO_term}; +} + +=head2 get_all_validation_states + + Arg [1] : none + Example : my @vstates = @{$vf->get_all_validation_states()}; + Description: Retrieves all validation states for this variationFeature. Current + possible validation statuses are 'cluster','freq','submitter', + 'doublehit', 'hapmap' + Returntype : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_validation_states { + my $self = shift; + return Bio::EnsEMBL::Variation::Utils::Sequence::get_all_validation_states($self->{'validation_code'}); +} + + +=head2 add_validation_state + + Arg [1] : string $state + Example : $vf->add_validation_state('cluster'); + Description: Adds a validation state to this variation. + Returntype : none + Exceptions : warning if validation state is not a recognised type + Caller : general + Status : At Risk + +=cut + +sub add_validation_state { + Bio::EnsEMBL::Variation::Utils::Sequence::add_validation_state(@_); +} + +=head2 source + + Arg [1] : string $source_name (optional) - the new value to set the source attribute to + Example : $source = $vf->source; + Description: Getter/Setter for the source attribute + Returntype : the source name as a string, + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source { + my ($self, $source) = @_; + $self->{source} = $source if $source; + return $self->{source}; +} + +=head2 source_version + + Arg [1] : number $source_version (optional) - the new value to set the source version attribute to + Example : $source_version = $vf->source_version; + Description: Getter/Setter for the source version attribute + Returntype : the source version as a number + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub source_version { + my ($self, $source_version) = @_; + $self->{source_version} = $source_version if $source_version; + return $self->{source_version}; +} + +=head2 is_somatic + + Arg [1] : boolean $is_somatic (optional) + The new value to set the is_somatic flag to + Example : $is_somatic = $vf->is_somatic + Description: Getter/Setter for the is_somatic flag, which identifies this variation feature as either somatic or germline + Returntype : boolean + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub is_somatic { + my ($self, $is_somatic) = @_; + $self->{'is_somatic'} = $is_somatic if defined $is_somatic; + return $self->{'is_somatic'}; +} + +=head2 is_tagged + + Args : None + Example : my $populations = $vf->is_tagged(); + Description : If the variation is tagged in any population, returns an array with the populations where the variation_feature + is tagged (using a criteria of r2 > 0.99). Otherwise, returns null + ReturnType : list of Bio::EnsEMBL::Variation::Population + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub is_tagged{ + my $self = shift; + + if ($self->adaptor()){ + my $population_adaptor = $self->adaptor()->db()->get_PopulationAdaptor(); + return $population_adaptor->fetch_tagged_Population($self); + } +} + +=head2 is_tag + + Args : None + Example : my $populations = $vf->is_tag(); + Description : Returns an array of populations in which this variation feature + is a tag SNP. + ReturnType : list of Bio::EnsEMBL::Variation::Population + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub is_tag{ + my $self = shift; + + if ($self->adaptor()){ + my $population_adaptor = $self->adaptor()->db()->get_PopulationAdaptor(); + return $population_adaptor->fetch_tag_Population($self); + } +} + +=head2 get_all_tagged_VariationFeatures + + Args : Bio::EnsEMBL::Variation::Population $pop (optional) + Example : my $vfs = $vf->get_all_tagged_VariationFeatures(); + Description : Returns an arrayref of variation features that are tagged by + this variation feature, in the population $pop if specified. + ReturnType : list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_tagged_VariationFeatures { + return $_[0]->adaptor->fetch_all_tagged_by_VariationFeature(@_); +} + +=head2 get_all_tag_VariationFeatures + + Args : Bio::EnsEMBL::Variation::Population $pop (optional) + Example : my $vfs = $vf->get_all_tag_VariationFeatures(); + Description : Returns an arrayref of variation features that tag this + variation feature, in the population $pop if specified. + ReturnType : list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_tag_VariationFeatures { + return $_[0]->adaptor->fetch_all_tags_by_VariationFeature(@_); +} + +=head2 get_all_tag_and_tagged_VariationFeatures + + Args : Bio::EnsEMBL::Variation::Population $pop (optional) + Example : my $vfs = $vf->get_all_tag_and_tagged_VariationFeatures(); + Description : Returns an arrayref of variation features that either tag or are + tagged by this variation feature, in the population $pop if + specified. + ReturnType : list of Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_tag_and_tagged_VariationFeatures { + return $_[0]->adaptor->fetch_all_tags_and_tagged_by_VariationFeature(@_); +} + + + +=head2 is_reference + Arg : none + Example : my $reference = $vf->is_reference() + Description: Returns 1 if VF's slice is a reference slice else 0 + Returntype : int + Caller : general + Status : At Risk + +=cut + +sub is_reference { + my ($self) = @_; + my $slice = $self->slice; + + if ( !defined( $self->{'is_reference'} ) ) { + $self->{'is_reference'} = $slice->is_reference(); + } + + return $self->{'is_reference'}; +} + +=head2 convert_to_SNP + + Args : None + Example : my $snp = $vf->convert_to_SNP() + Description : Creates a Bio::EnsEMBL::SNP object from Bio::EnsEMBL::VariationFeature. Mainly used for + backwards compatibility + ReturnType : Bio::EnsEMBL::SNP + Exceptions : None + Caller : general + Status : At Risk + +=cut + +sub convert_to_SNP{ + my $self = shift; + + require Bio::EnsEMBL::SNP; #for backwards compatibility. It will only be loaded if the function is called + + my $snp = Bio::EnsEMBL::SNP->new_fast({ + 'dbID' => $self->variation()->dbID(), + '_gsf_start' => $self->start, + '_gsf_end' => $self->end, + '_snp_strand' => $self->strand, + '_gsf_score' => 1, + '_type' => $self->var_class, + '_validated' => $self->get_all_validation_states(), + 'alleles' => $self->allele_string, + '_ambiguity_code' => $self->ambig_code, + '_mapweight' => $self->map_weight, + '_source' => $self->source + }); + return $snp; +} + +=head2 get_all_LD_values + + Args : none + Description : returns all LD values for this variation feature. This function will only work correctly if the variation + database has been attached to the core database. + ReturnType : Bio::EnsEMBL::Variation::LDFeatureContainer + Exceptions : none + Caller : snpview + Status : At Risk + : Variation database is under development. + +=cut + +sub get_all_LD_values{ + my $self = shift; + + if ($self->adaptor()){ + my $ld_adaptor = $self->adaptor()->db()->get_LDFeatureContainerAdaptor(); + return $ld_adaptor->fetch_by_VariationFeature($self); + } + return {}; +} + +=head2 get_all_LD_Populations + + Args : none + Description : returns a list of populations that could produces LD values + for this VariationFeature + ReturnType : listref of Bio::EnsEMBL::Variation::Population objects + Exceptions : none + Caller : snpview + Status : At Risk + +=cut + +sub get_all_LD_Populations{ + my $self = shift; + + my $pa = $self->adaptor->db->get_PopulationAdaptor; + return [] unless $pa; + + my $ld_pops = $pa->fetch_all_LD_Populations; + return [] unless $ld_pops; + + my $sth = $self->adaptor->db->prepare(qq{ + SELECT ip.population_sample_id, c.seq_region_start, c.genotypes + FROM compressed_genotype_region c, individual_population ip + WHERE c.sample_id = ip.individual_sample_id + AND c.seq_region_id = ? + AND c.seq_region_start < ? + AND c.seq_region_end > ? + }); + + my $this_vf_start = $self->seq_region_start; + + $sth->bind_param(1, $self->feature_Slice->get_seq_region_id); + $sth->bind_param(2, $self->seq_region_end); + $sth->bind_param(3, $this_vf_start); + + $sth->execute; + + my ($sample_id, $seq_region_start, $genotypes); + $sth->bind_columns(\$sample_id, \$seq_region_start, \$genotypes); + + my %have_genotypes = (); + + while($sth->fetch()) { + + next if $have_genotypes{$sample_id}; + + if($seq_region_start == $this_vf_start) { + $have_genotypes{$sample_id} = 1; + next; + } + + my @genotypes = unpack '(www)*', $genotypes; + my $gt_start = $seq_region_start; + + while(my( $var_id, $gt_code, $gap ) = splice @genotypes, 0, 3 ) { + if($gt_start == $this_vf_start) { + $have_genotypes{$sample_id} = 1; + last; + } + $gt_start += $gap + 1 if defined $gap; + } + } + + my @final_list = grep {$have_genotypes{$_->dbID}} @$ld_pops; + + return \@final_list; +} + +=head2 get_all_sources + + Args : none + Example : my @sources = @{$vf->get_all_sources()}; + Description : returns a list of all the sources for this + VariationFeature + ReturnType : reference to list of strings + Exceptions : none + Caller : general + Status : At Risk + : Variation database is under development. +=cut + +sub get_all_sources{ + my $self = shift; + + my @sources; + my %sources; + if ($self->adaptor()){ + map {$sources{$_}++} @{$self->adaptor()->get_all_synonym_sources($self)}; + $sources{$self->source}++; + @sources = keys %sources; + return \@sources; + } + return \@sources; +} + +=head2 ref_allele_string + + Args : none + Example : $reference_allele_string = $self->ref_allele_string() + Description: Getter for the reference allele_string, always the first. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub ref_allele_string{ + my $self = shift; + + my @alleles = split /[\|\\\/]/,$self->allele_string; + return $alleles[0]; +} + + +=head2 get_all_VariationSets + + Args : none + Example : my @vs = @{$vf->get_all_VariationSets()}; + Description : returns a reference to a list of all the VariationSets this + VariationFeature is a member of + ReturnType : reference to list of Bio::EnsEMBL::Variation::VariationSets + Exceptions : if no adaptor is attached to this object + Caller : general + Status : At Risk +=cut + +sub get_all_VariationSets { + my $self = shift; + + if (!$self->adaptor()) { + throw('An adaptor must be attached in order to get all variation sets'); + } + my $vs_adaptor = $self->adaptor()->db()->get_VariationSetAdaptor(); + my $variation_sets = $vs_adaptor->fetch_all_by_Variation($self->variation()); + + return $variation_sets; +} + + +=head2 get_all_Alleles + + Args : none + Example : @alleles = @{$vf->get_all_Alleles} + Description: Gets all Allele objects from the underlying variation object, + with reference alleles first. + Returntype : listref of Bio::EnsEMBL::Variation::Allele objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_Alleles{ + my $self = shift; + + my @alleles = @{$self->variation->get_all_Alleles}; + + # put all alleles in a hash + my %order = (); + foreach my $allele(@alleles) { + $order{$allele->allele} = 1; + } + + $order{$self->ref_allele_string} = 2; + + # now sort them by population, submitter, allele + my @new_alleles = sort { + ($a->population ? $a->population->name : "") cmp ($b->population ? $b->population->name : "") || + ($a->subsnp ? $a->subsnp : "") cmp ($b->subsnp ? $b->subsnp : "") || + $order{$b->allele} <=> $order{$a->allele} + } @alleles; + + return \@new_alleles; +} + + +=head2 get_all_PopulationGenotypes + + Args : none + Example : @pop_gens = @{$vf->get_all_PopulationGenotypes} + Description: Gets all PopulationGenotype objects from the underlying variation + object, with reference genotypes first. + Returntype : listref of Bio::EnsEMBL::Variation::PopulationGenotype objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_PopulationGenotypes{ + my $self = shift; + + my @gens = @{$self->variation->get_all_PopulationGenotypes}; + + # put all alleles in a hash + my %order = (); + foreach my $gen(@gens) { + # homs low priority, hets higher + $order{$gen->allele1.$gen->allele2} = ($gen->allele1 eq $gen->allele2 ? 1 : 2); + } + + # ref hom highest priority + $order{$self->ref_allele_string x 2} = 3; + + # now sort them by population, submitter, genotype + my @new_gens = sort { + ($a->population ? $a->population->name : "") cmp ($b->population ? $b->population->name : "") || + ($a->subsnp ? $a->subsnp : "") cmp ($b->subsnp ? $b->subsnp : "") || + $order{$b->allele1.$b->allele2} <=> $order{$a->allele1.$a->allele2} + } @gens; + + return \@new_gens; +} + +=head2 get_all_hgvs_notations + + Arg [1] : Bio::EnsEMBL::Feature $ref_feature (optional) + Get the HGVS notation of this VariationFeature relative to the slice it is on. If an optional reference feature is supplied, returns the coordinates + relative to this feature. + Arg [2] : string (Optional) + Indicate whether the HGVS notation should be reported in genomic coordinates or cDNA coordinates. + 'g' -> Genomic position numbering + 'c' -> cDNA position numbering + 'p' -> protein position numbering + Arg [3] : string (Optional) + A name to use for the reference can be supplied. By default the name returned by the display_id() method of the reference feature will be used. + Arg [4] : string (Optional) + Return just the HGVS notation corresponding to this allele + + Example : my $vf = $variation_feature_adaptor->fetch_by_dbID(565770); + my $tr = $transcript_adaptor->fetch_by_stable_id('ENST00000335295'); + my $hgvs = $vf->get_all_hgvs_notations($tr,'p'); + while (my ($allele,$hgvs_str) = each(%{$hgvs})) { + print "Allele $allele :\t$hgvs_str\n"; # Will print 'Allele - : ENSP00000333994.3:p.Val34_Tyr36delinsAsp' + } + + Description: Returns a reference to a hash with the allele as key and a string with the HGVS notation of this VariationFeature as value. By default uses the + slice it is plcaed on as reference but a different reference feature can be supplied. + Returntype : Hash reference + Exceptions : Throws exception if VariationFeature can not be described relative to the feature_Slice of the supplied reference feature + Caller : general + Status : Experimental + +=cut +sub get_all_hgvs_notations { + + my $self = shift; + my $ref_feature = shift; + my $numbering = shift; ## HGVS system g=genomic, c=coding, p=protein + my $reference_name = shift; ## If the ref_feature is a slice, this is over-written + my $use_allele = shift; ## optional single allele to check + my $transcript_variation = shift; ## optional transcript variation - looked up for c|p if not supplied + + my %hgvs; + + ##### don't get them for HGMD mutations or CNV probes + return {} if ($self->allele_string =~ /INS|DEL|HGMD|CNV/ig || $self->var_class() =~ /microsat/i); + ##### By default, use genomic position numbering + $numbering ||= 'g'; + + # If no reference feature is supplied, set it to the slice underlying this VariationFeature + $ref_feature ||= $self->slice(); + + # Special parsing for LRG + if (defined $reference_name && $reference_name =~ /^LRG_/) { + # Remove version + if ($reference_name =~ /(.+)\.\d+$/) { + $reference_name = $1; + } + } + + ### Check/get transcript variation available for protein & coding + if ($ref_feature->isa('Bio::EnsEMBL::Transcript')) { + + # Get a TranscriptVariation object for this VariationFeature and the supplied Transcript if it wasn't passed in the call + $transcript_variation = $self->get_all_TranscriptVariations([$ref_feature])->[0] if (!defined($transcript_variation)); + + ##### call new TranscriptVariationAllele method for each allele + } + + + if ($numbering eq 'p') { + + #### If there is no transcript variation supplied and the variant + #### is not in the translated region there is no protein change + return {} if (!defined($transcript_variation) || + !defined($transcript_variation->translation_start()) || + !defined($transcript_variation->translation_end())); + + ##### call TranscriptVariationAllele method for each allele + foreach my $transcriptVariationAllele (@{$transcript_variation->get_all_alternate_TranscriptVariationAlleles()} ){ + + my $allele_string = $transcriptVariationAllele->feature_seq(); + my $hgvs_full_string = $transcriptVariationAllele->hgvs_protein(); + + if($allele_string ne (split/\//,$self->allele_string())[1] ){ + reverse_comp(\$allele_string); ### hash returned relative to input variation feature strand regardless of transcript strand + } + $hgvs{$allele_string} = $hgvs_full_string ; + } + return \%hgvs; + } + + elsif ( $numbering =~ m/c|n/) { ### coding or non- coding transcript + + return {} if (!defined $transcript_variation); + + foreach my $transcriptVariationAllele (@{$transcript_variation->get_all_alternate_TranscriptVariationAlleles()} ){ + + my $allele_string = $transcriptVariationAllele->feature_seq(); + my $hgvs_full_string = $transcriptVariationAllele->hgvs_transcript(); + + if($allele_string ne (split/\//,$self->allele_string())[1] ){ + reverse_comp(\$allele_string); ### hash returned relative to input variation feature strand regardless of transcript strand + } + $hgvs{$allele_string} = $hgvs_full_string ; + } + return \%hgvs; + } + + elsif( $numbering =~ m/g/ ) { + #### handling both alleles together locally for genomic class + my $hgvs = $self->hgvs_genomic($ref_feature, $reference_name, $use_allele ); + return $hgvs; + } + else{ + warn("HGVS notation is not available for coordinate system: $numbering"); + return {}; + } +} + +### HGVS short flanking sequence required to check if HGVS variant class should be dup rather than ins +sub _get_flank_seq{ + + my $self = shift; + + # Get the underlying slice and sequence + my $ref_slice = $self->slice(); + + my @allele = split(/\//,$self->allele_string()); + #### add flank at least as long as longest allele to allow checking + my $add_length = 0; + + foreach my $al(@allele){ ## may have >2 alleles + if(length($al) > $add_length){ + $add_length = length $al ; + } + } + $add_length++; + + my $ref_start = $add_length ; + my $ref_end = $add_length + ($self->end() - $self->start()); + my $seq_start = ($self->start() - $ref_start); + + # Should we be at the beginning of the sequence, adjust the coordinates to not cause an exception + if ($seq_start < 0) { + $ref_start += $seq_start; + $ref_end += $seq_start; + $seq_start = 0; + } + + my $flank_seq = $ref_slice->subseq($seq_start +1 , $seq_start + $ref_end, 1); + + + return ($flank_seq, $ref_start, $ref_end ); +} + +#### format HGVS hash for genomic reference sequence +### Input: Variation feature +### Output: $hgvs_notation hash + + + +=head2 hgvs_genomic + + Arg [1] : Bio::EnsEMBL::Feature $ref_feature (optional) + Get the HGVS notation of this VariationFeature relative to the slice it is on. If an optional reference feature is supplied, returns the coordinates + relative to this feature. + Arg [2] : string (Optional) + A name to use for the reference can be supplied. By default the name returned by the display_id() method of the reference feature will be used. + Arg [4] : string (Optional) + Return just the HGVS notation corresponding to this allele + + + + Description: Returns a reference to a hash with the allele as key and a string with the genomic HGVS notation of this VariationFeature as value. By default uses the + slice it is placed on as reference but a different reference feature can be supplied. + Returntype : Hash reference + Exceptions : Throws exception if VariationFeature can not be described relative to the feature_Slice of the supplied reference feature + Caller : general + Status : Experimental + +=cut +sub hgvs_genomic{ + + my $self = shift; + my $ref_feature = shift; ## can be a transcript + my $reference_name = shift; ## If the ref_feature is a slice, this is over-written + my $use_allele = shift; ## optional single allele to check + + my %hgvs; + ########set up sequence reference + my $ref_slice; #### need this for genomic notation + + if ($ref_feature->isa('Bio::EnsEMBL::Slice')) { + $ref_slice = $ref_feature; + } + else { + # get slice if not supplied + $ref_slice = $ref_feature->feature_Slice; + } + + # Create new VariationFeature on the slice of the reference feature (unless the reference feature is the slice the VF is on) + my $tr_vf; + if ( $self->slice->coord_system->name() eq "chromosome") { + $tr_vf = $self; + } + else { + $tr_vf = $self->transfer($ref_slice); + } + # Return undef if this VariationFeature could not be transferred + return {} if (!defined($tr_vf)); + + + # Return undef if this VariationFeature does not fall within the supplied feature. + return {} if ($tr_vf->start < 1 || + $tr_vf->end < 1 || + $tr_vf->start > ($ref_feature->end - $ref_feature->start + 1) || + $tr_vf->end > ($ref_feature->end - $ref_feature->start + 1)); + + ######### define reference sequence name ################################### + + # If the reference is a slice, use the seq_region_name as identifier + $reference_name ||= $ref_feature->seq_region_name if ($ref_feature->isa('Bio::EnsEMBL::Slice')); + + # Use the feature's display id as reference name unless specified otherwise. + # If the feature is a transcript or translation, append the version number as well + $reference_name ||= $ref_feature->display_id() . ($ref_feature->isa('Bio::EnsEMBL::Transcript') && + $ref_feature->display_id !~ /\.\d+$/ ? '.' . $ref_feature->version() : ''); + + + ##### get short flank sequence for duplication checking & adjusted variation coordinates + my ($ref_seq, $ref_start, $ref_end) = _get_flank_seq($tr_vf);; + + foreach my $allele ( split(/\//,$tr_vf->allele_string()) ) { + + ## If a particular allele was requested, ignore others + next if (defined($use_allele) && $allele ne $use_allele); + + # Skip if the allele contains weird characters + next if $allele =~ m/[^ACGT\-]/ig; + + ##### vf strand is relative to slice - if transcript feature slice, may need complimenting + my $check_allele = $allele; + if( $self->strand() <0 && $ref_slice->strand >0 || + $self->strand() >0 && $ref_slice->strand < 0 + ){ + reverse_comp(\$check_allele); + if($DEBUG ==1){print "***************Flipping alt allele $allele => $check_allele to match coding strand\n";} + } + + ## work out chrom coord for hgvs string if transcript slice supplied + my ($chr_start,$chr_end); + if ( $tr_vf->slice->is_toplevel() == 1) { + $chr_start = $tr_vf->seq_region_start(); + $chr_end = $tr_vf->seq_region_end(); + } + else{ + ## add feature start to start of var-in-feature + if( $tr_vf->seq_region_start() < $tr_vf->seq_region_end()){ + $chr_start = $tr_vf->start() + $tr_vf->seq_region_start() ; + $chr_end = $tr_vf->end() + $tr_vf->seq_region_start() ; + } + else{ + $chr_start = $tr_vf->seq_region_start() - $tr_vf->start() ; + $chr_end = $tr_vf->seq_region_start() - $tr_vf->end() ; + } + } + + + my $hgvs_notation = hgvs_variant_notation( $check_allele, ## alt allele in refseq strand orientation + $ref_seq, ## substring of slice for ref allele extraction + $ref_start, ## start on substring of slice for ref allele extraction + $ref_end, + $chr_start, ## start wrt seq region slice is on (eg. chrom) + $chr_end, + ""); + + + # Skip if e.g. allele is identical to the reference slice + next if (!defined($hgvs_notation)); + + # Add the name of the reference + $hgvs_notation->{'ref_name'} = $reference_name; + # Add the position_numbering scheme + $hgvs_notation->{'numbering'} = 'g'; + + # Construct the HGVS notation from the data in the hash + $hgvs_notation->{'hgvs'} = format_hgvs_string( $hgvs_notation); + + $hgvs{$allele} = $hgvs_notation->{'hgvs'}; + } + return \%hgvs; + +} + + + +sub length { + my $self = shift; + return $self->{'end'} - $self->{'start'} + 1; +} + +=head2 summary_as_hash + + Example : $feature_summary = $feature->summary_as_hash(); + Description : Extends Feature::summary_as_hash + Retrieves a summary of this VariationFeature object. + + Returns : hashref of descriptive strings + +=cut + +sub summary_as_hash { + my $self = shift; + my $summary_ref = $self->SUPER::summary_as_hash; + $summary_ref->{'consequence_type'} = $self->display_consequence; + my @allele_list = split(/\//,$self->allele_string); + $summary_ref->{'alt_alleles'} = \@allele_list; + return $summary_ref; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/VariationFeatureOverlap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/VariationFeatureOverlap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,384 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::VariationFeatureOverlap + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::VariationFeatureOverlap; + + my $vfo = Bio::EnsEMBL::Variation::VariationFeatureOverlap->new( + -feature => $feature, + -variation_feature => $var_feat + ); + + print "consequence type: ", (join ",", @{ $vfo->consequence_type }), "\n"; + print "most severe consequence: ", $vfo->display_consequence, "\n"; + +=head1 DESCRIPTION + +A VariationFeatureOverlap represents a VariationFeature which is in close +proximity to another Ensembl Feature. It is the superclass of feature-specific +objects such as TranscriptVariation and RegulatoryFeatureVariation, and has +methods common to all such objects. You will not normally instantiate this +class directly, instead instantiating one of the feature-specific subclasses. + +=cut + +package Bio::EnsEMBL::Variation::VariationFeatureOverlap; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(expand); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele; + +use base qw(Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap); + +=head2 new + + Arg [-FEATURE] : + The Bio::EnsEMBL::Feature associated with the given VariationFeature + + Arg [-VARIATION_FEATURE] : + The Bio::EnsEMBL::VariationFeature associated with the given Feature + + Arg [-ADAPTOR] : + A Bio::EnsEMBL::Variation::DBSQL::VariationFeatureOverlapAdaptor + + Arg [-DISAMBIGUATE_SINGLE_NUCLEOTIDE_ALLELES] : + A flag indiciating if ambiguous single nucleotide alleles should be disambiguated + when constructing the VariationFeatureOverlapAllele objects, e.g. a Variationfeature + with an allele string like 'T/M' would be treated as if it were 'T/A/C'. We limit + ourselves to single nucleotide alleles to avoid the combinatorial explosion if we + allowed longer alleles with potentially many ambiguous bases. + + Example : + my $vfo = Bio::EnsEMBL::Variation::VariationFeatureOverlap->new( + -feature => $feature, + -variation_feature => $var_feat + ); + + Description: Constructs a new VariationFeatureOverlap instance given a VariationFeature + and a Feature + Returntype : A new Bio::EnsEMBL::Variation::VariationFeatureOverlap instance + Exceptions : throws unless both VARIATION_FEATURE and FEATURE are supplied, or if the + supplied ADAPTOR is the wrong class + Status : At Risk + +=cut + +sub new { + + my $class = shift; + + my %args = @_; + + # swap a '-variation_feature' argument for a '-base_variation_feature' one for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-variation_feature') { + $args{'-base_variation_feature'} = delete $args{$arg}; + } + } + + my $self = $class->SUPER::new(%args); + + my ( + $adaptor, + $ref_feature, + $disambiguate_sn_alleles, + $no_ref_check, + ) = rearrange([qw( + ADAPTOR + REF_FEATURE + DISAMBIGUATE_SINGLE_NUCLEOTIDE_ALLELES + NO_REF_CHECK + )], %args); + + my $variation_feature = $self->base_variation_feature; + + assert_ref($variation_feature, 'Bio::EnsEMBL::Variation::VariationFeature'); + assert_ref($adaptor, 'Bio::EnsEMBL::Variation::DBSQL::VariationFeatureOverlapAdaptor') if $adaptor; + + $ref_feature ||= $variation_feature->slice; + + $self->{adaptor} = $adaptor; + $self->{ref_feature} = $ref_feature; + + my $ref_allele; + + # we take the reference allele sequence from the reference sequence, not from the allele string + unless($no_ref_check) { + $ref_allele = $ref_feature->subseq( + $variation_feature->start, + $variation_feature->end, + $variation_feature->strand + ); + } + + # get the variation feature allele string, expand it, and split it into separate alleles + + my $allele_string = $variation_feature->allele_string; + + expand(\$allele_string); + + my @alleles = split /\//, $allele_string; + + $ref_allele = $alleles[0] if $no_ref_check; + $ref_allele = '-' unless $ref_allele; + + if ($disambiguate_sn_alleles) { + + # if this flag is set, disambiguate any ambiguous single nucleotide alleles, so + # e.g. an allele string like T/M would be equivalent to an allele string of T/A/C + # we only do this for single nucleotide alleles to avoid the combinatorial explosion + # of long allele strings with potentially many ambiguous bases (because ensembl + # genomes want this functionality) + + my @possible_alleles; + + for my $allele (@alleles) { + + if ($allele !~ /^[ACGT-]+$/ && length($allele) == 1) { + for my $possible ( split //, unambiguity_code($allele) ) { + push @possible_alleles, $possible; + } + } + else { + # the allele is either unambiguous or longer than 1 nucleotide, so add it unaltered + push @possible_alleles, $allele; + } + } + + @alleles = @possible_alleles; + } + + # make sure the alleles are unique + + # we also want to deal with alleles like (T)0 which expand into + # an empty string and we want to treat this as a deletion, so + # we replace + # any empty strings with '-' + + @alleles = keys %{ { map { ($_ || '-') => 1 } @alleles } }; + + # create an object representing the reference allele + + my $ref_vfoa = Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele->new( + -variation_feature_overlap => $self, + -variation_feature_seq => $ref_allele, + -is_reference => 1, + ); + + $self->add_VariationFeatureOverlapAllele($ref_vfoa); + + # create objects representing the alternate alleles + + for my $allele (@alleles) { + + next if $allele eq $ref_allele; + + my $vfoa = Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele->new( + -variation_feature_overlap => $self, + -variation_feature_seq => $allele, + -is_reference => 0, + ); + + $self->add_VariationFeatureOverlapAllele($vfoa); + } + + return $self; +} + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a variation_feature argument for a base_variation_feature one + + if ($hashref->{variation_feature}) { + $hashref->{base_variation_feature} = delete $hashref->{variation_feature}; + } + + return $class->SUPER::new_fast($hashref); +} + +sub dbID { + my $self = shift; + + unless ($self->{dbID}) { + # we don't really have a dbID, so concatenate all the dbIDs of our alleles + + $self->{dbID} = join '_', map { $_->dbID } @{ $self->get_all_alternate_VariationFeatureOverlapAlleles }; + } + + return $self->{dbID}; +} + +=head2 variation_feature + + Arg [1] : (optional) A Bio::EnsEMBL::Variation::VariationFeature + Description: Get/set the associated VariationFeature, lazy-loading it if required + Returntype : Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub variation_feature { + my ($self, $variation_feature) = @_; + + if ($variation_feature) { + assert_ref($variation_feature, 'Bio::EnsEMBL::Variation::VariationFeature'); + $self->base_variation_feature($variation_feature); + } + + if (my $vf_id = $self->{_variation_feature_id}) { + + # lazy-load the VariationFeature + + if (my $adap = $self->{adaptor}) { + if (my $vfa = $adap->db->get_VariationFeatureAdaptor) { + if (my $vf = $vfa->fetch_by_dbID($vf_id)) { + $self->base_variation_feature($vf); + delete $self->{_variation_feature_id}; + } + } + } + } + + return $self->base_variation_feature; +} + +sub _variation_feature_id { + + # get the dbID of the variation feature, using the VariationFeature object + # if we have one, or the internal hash value if we don't + + my $self = shift; + + if (my $vf = $self->{variation_feature}) { + return $vf->dbID; + } + elsif (my $id = $self->{_variation_feature_id}) { + return $id; + } + else { + return undef; + } +} + +sub get_VariationFeatureOverlapAllele_for_allele_seq { + my ($self, $allele_seq) = @_; + return $self->{_alleles_by_seq}->{$allele_seq}; +} + +=head2 add_VariationFeatureOverlapAllele + + Arg [1] : A Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele instance + Description: Add an allele to this VariationFeatureOverlap + Returntype : none + Exceptions : throws if the argument is not the expected type + Status : At Risk + +=cut + +sub add_VariationFeatureOverlapAllele { + my ($self, $vfoa) = @_; + + assert_ref($vfoa, 'Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele'); + + $self->add_BaseVariationFeatureOverlapAllele($vfoa); + + $self->{_alleles_by_seq}->{ $vfoa->variation_feature_seq } = $vfoa; +} + +=head2 get_reference_VariationFeatureOverlapAllele + + Description: Get the object representing the reference allele of this VariationFeatureOverlapAllele + Returntype : Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele instance + Exceptions : none + Status : At Risk + +=cut + +sub get_reference_VariationFeatureOverlapAllele { + my $self = shift; + return $self->get_reference_BaseVariationFeatureOverlapAllele(@_); +} + +=head2 get_all_alternate_VariationFeatureOverlapAlleles + + Description: Get a list of the alternate alleles of this VariationFeatureOverlapAllele + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_alternate_VariationFeatureOverlapAlleles { + my $self = shift; + return $self->get_all_alternate_BaseVariationFeatureOverlapAlleles(@_); +} + +=head2 get_all_VariationFeatureOverlapAlleles + + Description: Get a list of the all the alleles, both reference and alternate, of this + VariationFeatureOverlap + Returntype : listref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele objects + Exceptions : none + Status : At Risk + +=cut + +sub get_all_VariationFeatureOverlapAlleles { + my $self = shift; + return $self->get_all_BaseVariationFeatureOverlapAlleles(@_); +} + +sub _convert_to_sara { + my $self = shift; + + my $ref_allele = $self->{reference_allele}; + $ref_allele->_convert_to_sara; + + $self->{alt_alleles} = [$ref_allele]; +} + +sub _rearrange_alleles { + my $self = shift; + my $keep_alleles = shift; + + # fix alt alleles + my $alt_alleles = $self->{alt_alleles}; + my @new_alleles = grep {$keep_alleles->{$_->variation_feature_seq}} @$alt_alleles; + $self->{alt_alleles} = scalar @new_alleles ? \@new_alleles : $alt_alleles; + + # copy to ref allele if homozygous non-ref + $self->{reference_allele} = $self->{alt_alleles}->[0] if scalar keys %$keep_alleles == 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/VariationFeatureOverlapAllele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/VariationFeatureOverlapAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,360 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele; + + my $vfoa = Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele->new( + -variation_feature_overlap => $vfo, + -variation_feature_seq => 'A', + -is_reference => 0, + ); + + print "sequence with respect to the feature: ", $vfoa->feature_seq, "\n"; + print "sequence with respect to the variation feature: ", $vfoa->variation_feature_seq, "\n"; + print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $vfoa->get_all_OverlapConsequences }), "\n"; + +=head1 DESCRIPTION + +A VariationFeatureOverlapAllele object represents a single allele of a +VariationFeatureOverlap. It is the super-class of various feature-specific allele +classes such as TranscriptVariationAllele and RegulatoryFeatureVariationAllele and +contains methods not specific to any particular feature type. Ordinarily you will +not create these objects yourself, but instead you would create e.g. a +TranscriptVariation object which will then create VariationFeatureOverlapAlleles +based on the allele string of the associated VariationFeature. + +=cut + +package Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); + +use base qw(Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele); + +our $UNAMBIGUOUS_NUCLEOTIDES = qr/^[ACGT-]+$/i; + +our $ALL_NUCLEOTIDES = qr/^[ACGTUMRWSYKVHDBXN-]+$/i; + +our $SPECIFIED_LENGTH = qr /(\d+) BP (INSERTION|DELETION)/i; + +=head2 new + + Arg [-VARIATION_FEATURE_OVERLAP] : + The Bio::EnsEMBL::VariationFeatureOverlap with which this allele is + associated + + Arg [-VARIATION_FEATURE_SEQ] : + The allele sequence with respect to the associated VariationFeature + + Arg [-IS_REFERENCE] : + A flag indicating if this allele is the reference allele or not + + Example : + my $vfoa = Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele->new( + -variation_feature_ovelap => $vfo, + -variation_feature_seq => 'A', + -is_reference => 0 + ); + + Description: Constructs a new VariationFeatureOverlapAllele instance given a + VariationFeatureOverlap and the sequence of the allele + Returntype : A new Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele instance + Exceptions : throws unless both VARIATION_FEATURE_OVERLAP and VARIATION_FEATURE_SEQ + are supplied + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my %args = @_; + + # swap a '-variation_feature_overlap' argument for a '-base_variation_feature_overlap' + # and a '-variation_feature' for a '-base_variation_feature' for the superclass + + for my $arg (keys %args) { + if (lc($arg) eq '-variation_feature_overlap') { + $args{'-base_variation_feature_overlap'} = delete $args{$arg}; + } + } + + my $self = $class->SUPER::new(%args); + + assert_ref($self->base_variation_feature_overlap, 'Bio::EnsEMBL::Variation::VariationFeatureOverlap'); + + my ( + $variation_feature_seq, + ) = rearrange([qw( + VARIATION_FEATURE_SEQ + )], %args); + + + throw("Allele sequence required (variation "+$self->variation_feature->variation_name+")") + unless $variation_feature_seq; + + $self->{variation_feature_seq} = $variation_feature_seq; + + return $self; +} + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a variation_feature_overlap argument for a base_variation_feature_overlap one + + if ($hashref->{variation_feature_overlap}) { + $hashref->{base_variation_feature_overlap} = delete $hashref->{variation_feature_overlap}; + } + + # and call the superclass + + return $class->SUPER::new_fast($hashref); +} + +=head2 dbID + + Description: Get/set the dbID of this VariationFeatureOverlapAllele + Returntype : integer + Exceptions : none + Status : At Risk + +=cut + +sub dbID { + my ($self, $dbID) = @_; + $self->{dbID} = $dbID if defined $dbID; + return $self->{dbID}; +} + +=head2 variation_feature_overlap + + Description: Get/set the associated VariationFeatureOverlap + Returntype : Bio::EnsEMBL::Variation::VariationFeatureOverlap + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub variation_feature_overlap { + my ($self, $variation_feature_overlap) = @_; + + if ($variation_feature_overlap) { + assert_ref($variation_feature_overlap, 'Bio::EnsEMBL::Variation::VariationFeatureOverlap'); + } + + return $self->base_variation_feature_overlap($variation_feature_overlap); +} + +=head2 variation_feature + + Description: Get the associated VariationFeature + Returntype : Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Status : At Risk + +=cut + +sub variation_feature { + my $self = shift; + return $self->variation_feature_overlap->variation_feature; +} + +=head2 feature_seq + + Description: Get the sequence of this allele relative to the associated Feature. + This will be the same as the variation_feature_seq when the associated + VariationFeature is on the same strand as the Feature, or the reverse + complement when the strands differ. + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub feature_seq { + my $self = shift; + + unless ($self->{feature_seq}) { + + # check if we need to reverse complement the variation_feature_seq + + if (($self->variation_feature->strand != $self->feature->strand) && $self->seq_is_dna) { + my $vf_seq = $self->variation_feature_seq; + reverse_comp(\$vf_seq); + $self->{feature_seq} = $vf_seq; + } + else { + $self->{feature_seq} = $self->{variation_feature_seq}; + } + } + + return $self->{feature_seq}; +} + +=head2 variation_feature_seq + + Args [1] : The allele sequence relative to the VariationFeature + Description: Get/set the sequence of this allele relative to the associated VariationFeature. + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub variation_feature_seq { + # the sequence of this allele relative to the variation feature + my ($self, $variation_feature_seq) = @_; + $self->{variation_feature_seq} = $variation_feature_seq if $variation_feature_seq; + return $self->{variation_feature_seq}; +} + +=head2 seq_is_unambiguous_dna + + Description: identify if the sequence of this allele is unambiguous DNA + i.e. if we can meaningfully translate it + Returntype : bool + Exceptions : none + Status : At Risk + +=cut + +sub seq_is_unambiguous_dna { + my $self = shift; + + unless (defined $self->{seq_is_unambiguous_dna}) { + $self->{seq_is_unambiguous_dna} = + $self->{variation_feature_seq} =~ /$UNAMBIGUOUS_NUCLEOTIDES/ ? 1 : 0; + } + + return $self->{seq_is_unambiguous_dna}; +} + +=head2 seq_is_dna + + Description: identify if the sequence of this allele is DNA including ambiguity + codes, use seq_is_unambiguous_dna to check for alleles that do not + include ambiguity codes + Returntype : bool + Exceptions : none + Status : At Risk + +=cut + +sub seq_is_dna { + my $self = shift; + + unless (defined $self->{seq_is_dna}) { + $self->{seq_is_dna} = + $self->{variation_feature_seq} =~ /$ALL_NUCLEOTIDES/ ? 1 : 0; + } + + return $self->{seq_is_dna}; +} + +=head2 seq_length + + Description: return the length of this allele sequence, this is better than + just using length($vfoa->feature_seq) because we check if the + sequence is valid DNA, and also look for allele strings like + "(3 BP INSERTION)" to determine the length + Returntype : int or undef if we cannot determine the length + Exceptions : none + Status : At Risk + +=cut + +sub seq_length { + my $self = shift; + + my $seq = $self->variation_feature_seq; + + if ($self->seq_is_dna) { + if ($seq eq '-') { + return 0; + } + else { + return length($seq); + } + } + elsif ($seq =~ /$SPECIFIED_LENGTH/) { + return $1; + } + + return undef; +} + +=head2 allele_string + + Description: Return a '/' delimited string of the reference allele variation_feature_seq + and the variation_feature_seq of this allele + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub allele_string { + my $self = shift; + + my $ref = $self->variation_feature_overlap->get_reference_VariationFeatureOverlapAllele->variation_feature_seq; + + # for the HGMDs and CNV probes where the alleles are artificially set to be + # the same, just return the reference sequence + + if ($ref eq $self->variation_feature_seq) { + return $ref; + } + else { + return $ref.'/'.$self->variation_feature_seq; + } +} + + +sub _convert_to_sara { + my $self = shift; + + my $oc = Bio::EnsEMBL::Variation::OverlapConsequence->new_fast({ + 'label' => 'SARA', + 'description' => 'Same as reference allele', + 'rank' => '99', + 'display_term' => 'SARA', + 'SO_term' => 'SARA', + }); + + $self->add_OverlapConsequence($oc); + + return $self; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/EnsEMBL/Variation/VariationSet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/VariationSet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,410 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +# Ensembl module for Bio::EnsEMBL::Variation::VariationSet +# +# Copyright (c) 2010 Ensembl +# + +=head1 NAME + +Bio::EnsEMBL::Variation::VariationSet - Ensembl representation of a set of +variations. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::VariationSet; + + ... + + + +=head1 DESCRIPTION + +This is a class representing a set of variations that are grouped by e.g. +study, method, quality measure etc. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::VariationSet; + +use Bio::EnsEMBL::Storable; +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); +use Bio::EnsEMBL::Utils::Iterator; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::Storable); + + +=head2 new + + Arg [dbID] : + int - unique internal identifier for this allele group + + Arg [ADAPTOR] : + Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor + + Arg [NAME] : + string - the name of this variation set + + Arg [DESCRIPTION] : + string - A description explaining the charcteristics of this variation set + + Arg [SHORT_NAME] : + string - the short name of this variation set + + Example : + $ag = Bio::EnsEMBL::Variation::VariationSet->new + ( + -dbID => 12, + -adaptor => $var_set_adaptor, + -name => 'Phenotype-associated variations', + -description => 'Variations that have been associated with a phenotype', + -short_name => 'ph_variants' + ); + Description: Constructor. Instantiates a new VariationSet + Returntype : Bio::EnsEMBL::Variation::VariationSet + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub new { + my $class = shift; + + my ($dbID, $adaptor, $name, $description, $short_name) = + rearrange([qw(DBID ADAPTOR NAME DESCRIPTION SHORT_NAME)], @_); + + # Check that the dbID does not exceed the maximum dbID that can be stored in the variation_set_id SET construct in variation_set_variation + warn("Primary key variation_set_id $dbID for variation set '$name' exceeds " . $Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor::MAX_VARIATION_SET_ID . ". Therefore, this variation set cannot be properly referenced in variation_set_variation") if ($dbID > $Bio::EnsEMBL::Variation::DBSQL::VariationSetAdaptor::MAX_VARIATION_SET_ID); + + return bless {'dbID' => $dbID, + 'adaptor' => $adaptor, + 'name' => $name, + 'description' => $description, + 'short_name' => $short_name}, $class; +} + +=head2 description + + Arg [1] : string $description + Example : print $vs->description(); + Description: Getter/Setter for the description of this VariationSet + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub description { + my $self = shift; + my $desc = shift; + + $self->{'description'} = $desc if (defined($desc)); + + return $self->{'description'}; +} + +=head2 get_all_sub_VariationSets + Arg [1] : (optional) boolean $only_immediate + If true, will only get the direct subsets of this variation. The default behaviour is + to recursively get all subsets. + Example : print $vs->get_all_sub_VariationSets(); + Description: Recursively gets all variation sets that are subsets of this variation set. + Returntype : reference to list of Bio::EnsEMBL::Variation::VariationSet + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_sub_VariationSets { + my $self = shift; + my $only_immediate = shift; + + # A database adaptor must be attached to this object + if(!$self->adaptor()) { + warning('Cannot get sub variation sets without attached adaptor'); + return []; + } + + return $self->adaptor->fetch_all_by_super_VariationSet($self,$only_immediate); +} + +=head2 get_all_super_VariationSets + Arg [1] : (optional) boolean $only_immediate + If true, will only get the direct supersets of this variation. The default behaviour is + to recursively get all supersets. + Example : print $vs->get_all_super_VariationSets(); + Description: Recursively gets all variation sets that are above this variation set. + Returntype : reference to list of Bio::EnsEMBL::Variation::VariationSet + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_super_VariationSets { + my $self = shift; + my $only_immediate = shift; + + # A database adaptor must be attached to this object + if(!$self->adaptor()) { + warning('Cannot get super variation sets without attached adaptor'); + return []; + } + + return $self->adaptor->fetch_all_by_sub_VariationSet($self,$only_immediate); +} + +=head2 get_all_Variations + + Example : print $vs->get_all_Variations(); + Description: Gets all variations belonging to this variation set and all of its subsets. + Returntype : reference to list of Bio::EnsEMBL::Variation::Variation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_Variations { + my $self = shift; + + # A database adaptor must be attached to this object + if(!$self->adaptor()) { + warning('Cannot get variations without attached adaptor'); + return []; + } + + # Call the method in VariationAdaptor that will handle this + my $variation_adaptor = $self->adaptor->db->get_VariationAdaptor(); + if(!$variation_adaptor) { + warning('Could not get variation adaptor from database'); + return []; + } + + # Get all variations from this set (and its subsets) + return $variation_adaptor->fetch_all_by_VariationSet($self); +} + + +=head2 get_all_StructuralVariations + + Example : print $vs->get_all_StructuralVariations(); + Description: Gets all structural variations belonging to this variation set and all of its subsets. + Returntype : reference to list of Bio::EnsEMBL::Variation::StructuralVariation + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub get_all_StructuralVariations { + my $self = shift; + + # A database adaptor must be attached to this object + if(!$self->adaptor()) { + warning('Cannot get structural variations without attached adaptor'); + return []; + } + + # Call the method in StructuralVariationAdaptor that will handle this + my $sv_adaptor = $self->adaptor->db->get_StructuralVariationAdaptor(); + if(!$sv_adaptor) { + warning('Could not get structural variation adaptor from database'); + return []; + } + + # Get all variations from this set (and its subsets) + return $sv_adaptor->fetch_all_by_VariationSet($self); +} + + +=head2 get_Variation_Iterator + + Example : my $var_iterator = $vs->get_Variation_Iterator; + Description: Gets an iterator over all variations belonging to this + variation set and all of its subsets. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub get_Variation_Iterator { + my $self = shift; + + # A database adaptor must be attached to this object + unless ($self->adaptor) { + warning('Cannot get variations without attached adaptor'); + return Bio::EnsEMBL::Utils::Iterator->new; + } + + # Call the method in VariationAdaptor that will handle this + my $variation_adaptor = $self->adaptor->db->get_VariationAdaptor(); + + unless ($variation_adaptor) { + warning('Could not get variation adaptor from database'); + return Bio::EnsEMBL::Utils::Iterator->new; + } + + # Get an iterator over variations from this set (and its subsets) + return $variation_adaptor->fetch_Iterator_by_VariationSet($self); +} + + +=head2 get_StructuralVariation_Iterator + + Example : my $sv_iterator = $vs->get_StructuralVariation_Iterator; + Description: Gets an iterator over all structural variations belonging to this + variation set and all of its subsets. + Returntype : Bio::EnsEMBL::Utils::Iterator + Exceptions : none + Caller : general + Status : Experimental + +=cut + +sub get_StructuralVariation_Iterator { + my $self = shift; + + # A database adaptor must be attached to this object + unless ($self->adaptor) { + warning('Cannot get structural variations without attached adaptor'); + return Bio::EnsEMBL::Utils::Iterator->new; + } + + # Call the method in StructuralVariationAdaptor that will handle this + my $sv_adaptor = $self->adaptor->db->get_StructuralVariationAdaptor(); + + unless ($sv_adaptor) { + warning('Could not get dructural variation adaptor from database'); + return Bio::EnsEMBL::Utils::Iterator->new; + } + + # Get an iterator over Structural variations from this set (and its subsets) + return $sv_adaptor->fetch_Iterator_by_VariationSet($self); +} + + +=head2 get_all_VariationFeatures_by_Slice + + Arg [1] : Bio::EnsEMBL:Variation::Slice $slice + Example : my @vfs = +@{$vs->get_all_VariationFeatures_by_Slice($slice)}; + Description: Retrieves all variation features in a slice that belong to + this set. + Returntype : reference to list Bio::EnsEMBL::Variation::VariationFeature + Exceptions : throw on bad argument + Caller : general + Status : Stable + +=cut + +sub get_all_VariationFeatures_by_Slice { + my $self = shift; + my $slice = shift; + + if(!$self->adaptor()) { + warning('Cannot get variation features without attached adaptor'); + return []; + } + + my $vfa = $self->adaptor->db->get_VariationFeatureAdaptor(); + if(!$vfa) { + warning('Could not get variation feature adaptor from database'); + return []; + } + return $vfa->fetch_all_by_Slice_VariationSet($slice,$self); +} + +=head2 name + + Arg [1] : string $name + Example : print $vs->name(); + Description: Getter/Setter for the name of this VariationSet + Returntype : string + Exceptions : none + Caller : general + Status : At Risk + +=cut + +sub name { + my $self = shift; + my $name = shift; + + $self->{'name'} = $name if (defined($name)); + + return $self->{'name'}; +} + +=head2 short_name + + Arg [1] : string $short_name + Example : print $vs->short_name(); + Description: Getter/Setter for the short name of this VariationSet + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub short_name { + my $self = shift; + my $short_name = shift; + + $self->{'short_name'} = $short_name if (defined($short_name)); + + return $self->{'short_name'}; +} + +# API-internal subroutine to get the bitvalue of this set's id and all of its subsets (unless specifically indicated not to) +sub _get_bitvalue { + my $self = shift; + my @args = @_; + + # If the subsets should be exluded, call the subroutine in the adaptor and return the result. No caching. + if (@args) { + return $self->adaptor->_get_bitvalue($self,@args); + } + + # Check if we have cached the bitvalue (including subsets), otherwise get it and cache it + unless (exists($self->{'_bitvalue'})) { + $self->{'_bitvalue'} = $self->adaptor->_get_bitvalue($self); + } + + # Return the cached value + return $self->{'_bitvalue'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Event/EventGeneratorI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Event/EventGeneratorI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,89 @@ +# $Id: EventGeneratorI.pm,v 1.7 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Event::EventGeneratorI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Event::EventGeneratorI - This interface describes the basic event generator class. + +=head1 SYNOPSIS + + # Do not use this object directly + # This object has the basic methods for describing an event generator + +=head1 DESCRIPTION + +This object describes the basic event generator system. It basically allows one to attach one or many event handlers. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Event::EventGeneratorI; +use vars qw(@ISA); +use strict; + +@ISA = qw( Bio::Root::RootI ); + +=head2 attach_EventHandler + + Title : attach_EventHandler + Usage : $parser->attatch_EventHandler($handler) + Function: Adds an event handler to listen for events + Returns : none + Args : Bio::Event::EventHandlerI + +=cut + +sub attach_EventHandler{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Event/EventHandlerI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Event/EventHandlerI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,215 @@ +# $Id: EventHandlerI.pm,v 1.5 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Event::EventHandlerI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Event::EventHandlerI - An Event Handler Interface + +=head1 SYNOPSIS + + # do not use this module directly + # See Bio::SearchIO::SearchResultEventHandler for an example of + # implementation. + +=head1 DESCRIPTION + +This interface describes the basic methods required for a +EventHandlers. These are essentially SAX methods. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Event::EventHandlerI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; +use Carp; + +@ISA = qw(Bio::Root::RootI); + +=head2 will_handle + + Title : will_handle + Usage : if( $handler->will_handle($event_type) ) { ... } + Function: Tests if this event builder knows how to process a specific event + Returns : boolean + Args : event type name + + +=cut + +sub will_handle{ + my ($self,$type) = @_; + $self->throw_not_implemented(); +} + +=head2 SAX methods + +=cut + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document(); + Function: Handle a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document(); + Function: Handle an end document event + Returns : none + Args : none + + +=cut + +sub end_document{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'within' because 'in' tests only + if one has reached a specific element. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,@args) = @_; + $self->throw_not_implemented; + +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Expression/FeatureGroup.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Expression/FeatureGroup.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,287 @@ +# $Id: FeatureGroup.pm,v 1.1.2.2 2003/09/17 09:19:21 allenday Exp $ +# BioPerl module for Bio::Expression::FeatureGroup +# +# Copyright Allen Day , Stanley Nelson +# Human Genetics, UCLA Medical School, University of California, Los Angeles + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Expression::FeatureGroup - a set of DNA/RNA features. ISA +Bio::Expression::FeatureI + +=head1 SYNOPSIS + +# + +=head1 DESCRIPTION + +A set of DNA/RNA features. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... +package Bio::Expression::FeatureGroup; + +use strict; + +use base qw(Bio::Root::Root Bio::Expression::FeatureI); +use vars qw($DEBUG); + +=head2 new + + Title : new + Usage : $featuregroup = Bio::Expression::FeatureGroup->new(%args); + Function: create a new featuregroup object + Returns : a Bio::Expression::FeatureGroup object + Args : an optional hash of parameters to be used in initialization: + -id -- the featuregroup ID + -type -- the featuregroup type + +=cut + +sub new { + my($class,@args) = @_; + my $self = bless {}, $class; + $self->_initialize(@args); + return $self; +} + +=head2 _initialize + + Title : _initialize + Usage : $featuregroup->_initialize(@args); + Function: initialize the featuregroup object + Returns : nothing + Args : @args + +=cut + +sub _initialize{ + my ($self,@args) = @_; + my %param = @args; + + $self->type($param{-type}); + $self->id($param{-id} ); + + $self->SUPER::_initialize(@args); + $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0); +} + +=head2 type + + Title : type + Usage : $featuregroup->type($optional_arg); + Function: get/set the type of the featuregroup + Comments: this is probably going to be a string like + "quality control", "mismatch blah blah", etc. + Returns : the featuregroup type + Args : a new value for the featuregroup type + +=cut + +sub type { + my $self = shift; + $self->{type} = shift if @_; + return $self->{type}; +} + +=head2 id + + Title : id + Usage : $featuregroup->id($optional_arg); + Function: get/set the id of the featuregroup + Returns : the featuregroup id + Args : a new value for the featuregroup id + +=cut + +sub id { + my $self = shift; + $self->{id} = shift if @_; + return $self->{id}; +} + + +=head2 standard_deviation + + Title : standard_deviation + Usage : $featuregroup->standard_deviation($optional_arg); + Function: get/set the standard deviation of the featuregroup value + Returns : the featuregroup standard deviation + Args : a new value for the featuregroup standard deviation + Notes : this method does no calculation, it merely holds a value + +=cut + +sub standard_deviation { + my $self = shift; + $self->{standard_deviation} = shift if @_; + return $self->{standard_deviation}; +} + +=head2 quantitation + + Title : quantitation + Usage : $featuregroup->quantitation($optional_arg); + Function: get/set the quantitation of the featuregroup + Returns : the featuregroup's quantitated value + Args : a new value for the featuregroup's quantitated value + Notes : this method does no calculation, it merely holds a value + +=cut + +sub quantitation { + my $self = shift; + $self->{quantitation} = shift if @_; + return $self->{quantitation}; +} + +=head2 quantitation_units + + Title : quantitation_units + Usage : $featuregroup->quantitation_units($optional_arg); + Function: get/set the quantitation units of the featuregroup + Returns : the featuregroup's quantitated value units + Args : a new value for the featuregroup's quantitated value units + +=cut + +sub quantitation_units { + my $self = shift; + $self->{quantitation_units} = shift if @_; + return $self->{quantitation_units}; +} + +=head2 presence + + Title : presence + Usage : $featuregroup->presence($optional_arg); + Function: get/set the presence call of the featuregroup + Returns : the featuregroup's presence call + Args : a new value for the featuregroup's presence call + +=cut + +sub presence { + my $self = shift; + $self->{presence} = shift if @_; + return $self->{presence}; +} + +=head2 add_feature + + Title : add_feature + Usage : $feature_copy = $featuregroup->add_feature($feature); + Function: add a feature to the featuregroup + Returns : see this_feature() + Args : a Bio::Expression::FeatureI compliant object + +=cut + +sub add_feature { + my($self,@args) = @_; + foreach my $feature (@args){ + $self->throw('Features must be Bio::Expression::FeatureI compliant') unless $feature->isa('Bio::Expression::FeatureI'); + push @{$self->{features}}, $feature; + } + + return $self->{features} ? $self->{features}->[-1] : undef; +} + +=head2 this_feature + + Title : this_feature + Usage : $feature = $featuregroup->this_feature + Function: access the last feature added to the featuregroup + Returns : the last feature added to the featuregroup + Args : none + +=cut + +sub this_feature { + my $self = shift; + return $self->{features} ? $self->{features}->[-1] : undef; +} + +=head2 each_feature + + Title : each_feature + Usage : @features = $featuregroup->each_feature + Function: returns a list of Bio::Expression::FeatureI compliant + objects + Returns : a list of objects + Args : none + +=cut + +sub each_feature { + my $self = shift; + return @{$self->{features}} if defined($self->{features}); + return (); +} + +=head2 each_feature_quantitation + + Title : each_feature_quantitation + Usage : @featurequantitions = $featuregroup->each_feature_quantitation; + Function: returns an list of quantitations of the features in the featuregroup + Returns : a list of numeric values + Args : none + +=cut + +sub each_feature_quantitation { + my $self = shift; + my @values = (); + push @values, $_->value foreach $self->each_feature; + return @values; +} + +=head2 is_qc + + Title : is_qc + Usage : $is_quality_control = $featuregroup->is_qc + Function: get/set whether or not the featuregroup is used for quality control purposes + Returns : a boolean (equivalent) + Args : a new value + +=cut + +sub is_qc { + my $self = shift; + $self->{is_qc} = shift if defined @_; + return $self->{is_qc}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Expression/FeatureGroup/FeatureGroupMas50.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Expression/FeatureGroup/FeatureGroupMas50.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,29 @@ +# Let the code begin... +package Bio::Expression::FeatureGroup::FeatureGroupMas50; + +use strict; + +use base qw(Bio::Expression::FeatureGroup); +use vars qw($DEBUG); + +use Class::MakeMethods::Emulator::MethodMaker + get_set => [qw( + + probe_set_name stat_pairs stat_pairs_used + signal detection detection_p_value + stat_common_pairs signal_log_ratio + signal_log_ratio_low + signal_log_ratio_high change change_p_value + positive negative pairs pairs_used + pairs_inavg pos_fraction log_avg + pos_neg avg_diff abs_call inc dec + inc_ratio dec_ratio pos_change + neg_change inc_dec dpos_dneg_ratio + log_avg_ratio_change diff_call + avg_diff_change b_a fold_change + sort_score + + )], +; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Expression/FeatureI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Expression/FeatureI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,121 @@ +# $Id: FeatureI.pm,v 1.3 2002/10/30 23:15:23 allenday Exp $ +# BioPerl module for Bio::Expression::FeatureI +# +# Copyright Allen Day , Stan Nelson +# Human Genetics, UCLA Medical School, University of California, Los Angeles + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Expression::FeatureI - an interface class for DNA/RNA features + +=head1 SYNOPSIS + +Do not use this module directly + +=head1 DESCRIPTION + +This provides a standard bioperl interface class for representing +DNA and RNA features. It cannot be instantiated directly, but serves +as an abstract base class for implementors. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... +package Bio::Expression::FeatureI; + +use strict; +use Bio::Root::Root; + +use base qw(Bio::Root::Root Bio::PrimarySeqI); +use vars qw($DEBUG); + +=head2 quantitation() + + Title : value + Usage : $val = $ftr->quantitation() + Function: get/set the feature's quantitation + Returns : A numeric value + Args : a new numeric value (optional) + +=cut + +sub quantitation { + shift->throw_not_implemented(); +} + +=head2 quantitation_units() + + Title : quantitation_units + Usage : $units = $ftr->quantitation_units() + Function: get/set the units of the feature's quantitation + Returns : A string or undef + Args : a new string (optional) + +=cut + +sub quantitation_units { + shift->throw_not_implemented(); +} + +=head2 standard_deviation() + + Title : standard_deviation + Usage : $std_dev = $ftr->standard_deviation() + Function: get/set the feature's standard deviation of quantitation() + Returns : A numeric value + Args : a new numeric value (optional) + Comments: no calculation is done here + +=cut + +sub standard_deviation { + shift->throw_not_implemented(); +} + +=head2 sample_count() + + Title : sample_count + Usage : $sample_count = $ftr->sample_count() + Function: get/set the number of samples used to calculate + quantitation() + Returns : An integer + Args : a new integer (optional) + +=cut + +sub sample_count { + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Expression/ProbeI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Expression/ProbeI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,61 @@ +# $Id: ProbeI.pm,v 1.8.2.1 2003/09/15 21:55:41 allenday Exp $ +# BioPerl module for Bio::Expression::ProbeI +# +# Copyright Allen Day , Stan Nelson +# Human Genetics, UCLA Medical School, University of California, Los Angeles + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Expression::ProbeI - an interface class for DNA/RNA probes + +=head1 SYNOPSIS + +Do not use this module directly + +=head1 DESCRIPTION + +This class ISA Bio::Expression::FeatureI, nothing more. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... +package Bio::Expression::ProbeI; + +use strict; +use Bio::Root::Root; + +use base qw(Bio::Expression::FeatureI); +use vars qw($DEBUG); + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/AnalysisI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/AnalysisI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,156 @@ +# $Id: AnalysisI.pm,v 1.2.2.1 2003/07/04 02:40:29 shawnh Exp $ +# +# BioPerl module for Bio::Factory::AnalysisI +# +# Cared for by Martin Senger +# For copyright and disclaimer see below. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::AnalysisI - An interface to analysis tool factory + +=head1 SYNOPSIS + +This is an interface module - you do not instantiate it. +Use I module: + + use Bio::Tools::Run::AnalysisFactory; + my $list = new Bio::Tools::Run::AnalysisFactory->available_analyses; + +=head1 DESCRIPTION + +This interface contains all public methods for showing available +analyses and for creating objects representing them. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2003, Martin Senger and EMBL-EBI. +All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 SEE ALSO + +=over + +=item * + +http://industry.ebi.ac.uk/soaplab/Perl_Client.html + +=back + +=head1 APPENDIX + +This is actually the main documentation... + +If you try to call any of these methods directly on this +C object you will get a I +error message. You need to call them on a +C object instead. + +=cut + + +# Let the code begin... + +package Bio::Factory::AnalysisI; +use vars qw(@ISA $Revision); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +BEGIN { + $Revision = q$Id: AnalysisI.pm,v 1.2.2.1 2003/07/04 02:40:29 shawnh Exp $; +} + + +# ----------------------------------------------------------------------------- + +=head2 available_categories + + Usage : $factory->available_categories; + Returns : an array reference with the names of + available categories + Args : none + +The analysis tools may be grouped into categories by their functional +similarity, or by the similar data types they deal with. This method +shows all available such categories. + +=cut + +sub available_categories { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 available_analyses + + Usage : $factory->available_analyses; + $factory->available_analyses ($category); + Returns : an array reference with the names of + all available analyses, or the analyses + available in the given '$category' + Args : none || category_name + +Show available analyses. Their names usually consist of category +analysis names, separated by C<::>. + +=cut + +sub available_analyses { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + +=head2 create_analysis + + Usage : $factory->create_analysis ($name); + Returns : a Bio::Tools::Run::Analyis object + Args : analysis name + +A real I method creating an analysis object. The created +object gets all access and location information from the factory +object. + +=cut + +sub create_analysis { shift->throw_not_implemented(); } + +# ----------------------------------------------------------------------------- + + +1; +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/ApplicationFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/ApplicationFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,85 @@ +# $Id: ApplicationFactoryI.pm,v 1.5 2002/10/22 07:38:32 lapp Exp $ +# +# BioPerl module for Bio::Factory::ApplicationFactoryI +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::ApplicationFactoryI - Interface class for Application Factories + +=head1 SYNOPSIS + +You wont be using this as an object, but using a derived class. + +=head1 DESCRIPTION + +Holds common Application Factory attributes in place. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Factory::ApplicationFactoryI; +use vars qw(@ISA); +use strict; + +use Bio::Root::RootI; +@ISA = qw(Bio::Root::RootI); + +=head2 version + + Title : version + Usage : exit if $prog->version() < 1.8 + Function: Determine the version number of the program + Example : + Returns : float or undef + Args : none + +=cut + +sub version { + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/BlastHitFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/BlastHitFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,249 @@ +#----------------------------------------------------------------- +# $Id: BlastHitFactory.pm,v 1.7 2002/10/22 09:38:09 sac Exp $ +# +# BioPerl module for Bio::Factory::BlastHitFactory +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::BlastHitFactory - Factory for Bio::Search::Hit::BlastHit objects + +=head1 SYNOPSIS + + use Bio::Factory::BlastHitFactory; + + my $hit_fact = Bio::Factory::BlastHitFactory->new(); + + my $hit = $hit_fact->create_hit( %parameters ); + +See documentation for create_hit() for information about C<%parameters>. + +=head1 DESCRIPTION + +This module encapsulates code for creating Bio::Search::Hit::BlastHit +and Bio::Search::HSP::BlastHSP objects from traditional BLAST report +data (i.e., non-XML formatted). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +#' + +package Bio::Factory::BlastHitFactory; + +use strict; +use Bio::Root::Root; +use Bio::Factory::HitFactoryI; +use Bio::Search::Hit::BlastHit; + +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root Bio::Factory::HitFactoryI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + +=head2 create_hit + + Title : create_hit + Usage : $hit = $factory->create_hit( %params ); + Function: Creates a new Bio::Search::Hit::BlastHit object given + raw BLAST report data, formatted in traditional BLAST report format. + Returns : A single Bio::Search::Hit::BlastHit object + Args : Named parameters to be passed to the BlastHit object. + Parameter keys are case-insensitive. + See Bio::Search::Hit::BlastHit::new() documentation for + details about these parameters. + The only additional parameter required is: + -RESULT => a Bio::Search::Result::BlastResult object. + From this result object, the program, query length, + and iteration are obtained and passed on to the BlastHit. + +=cut + +sub create_hit { + my ($self, @args) = @_; + + my ($blast, $raw_data, $shallow_parse) = + $self->_rearrange( [qw(RESULT + RAW_DATA + SHALLOW_PARSE)], @args); + + my %args = @args; + $args{'-PROGRAM'} = $blast->analysis_method; + $args{'-QUERY_LEN'} = $blast->query_length; + $args{'-ITERATION'} = $blast->iterations; + + my $hit = Bio::Search::Hit::BlastHit->new( %args ); + + unless( $shallow_parse ) { + $self->_add_hsps( $hit, + $args{'-PROGRAM'}, + $args{'-QUERY_LEN'}, + $blast->query_name, + @{$raw_data} ); + } + + return $hit; +} + +#=head2 _add_hsps +# +# Usage : Private method; called automatically by create_hit(). +# Purpose : Creates BlastHSP.pm objects for each HSP in a BLAST hit alignment. +# : Also collects the full description of the hit from the +# : HSP alignment section. +# Returns : n/a +# Argument : (<$BlastHit_object>, <$program_name>, <$query_length>, <$query_name>, <@raw_data> +# 'raw data list' consists of traditional BLAST report +# format for a single HSP, supplied as a list of strings. +# Throws : Warnings for each BlastHSP.pm object that fails to be constructed. +# : Exception if no BlastHSP.pm objects can be constructed. +# : Exception if can't parse length data for hit sequence. +# Comments : Requires Bio::Search::HSP::BlastHSP.pm. +# : Sets the description using the full string present in +# : the alignment data. +#=cut + +#-------------- +sub _add_hsps { +#-------------- + my( $self, $hit, $prog, $qlen, $qname, @data ) = @_; + my $start = 0; + my $hspCount = 0; + + require Bio::Search::HSP::BlastHSP; + +# printf STDERR "\nBlastHit \"$hit\" _process_hsps(). \nDATA (%d lines) =\n@data\n", scalar(@data); + + my( @hspData, @hspList, @errs, @bad_names ); + my($line, $set_desc, @desc); + $set_desc = 0; + my $hname = $hit->name; + my $hlen; + + hit_loop: + foreach $line( @data ) { + + if( $line =~ /^\s*Length = ([\d,]+)/ ) { + $hit->_set_description(@desc); + $set_desc = 1; + $hit->_set_length($1); + $hlen = $hit->length; + next hit_loop; + } elsif( !$set_desc) { + $line =~ s/^\s+|\s+$//g; + push @desc, $line; + next hit_loop; + } elsif( $line =~ /^\s*Score/ ) { + ## This block is for setting multiple HSPs. + + if( not scalar @hspData ) { + $start = 1; + push @hspData, $line; + next hit_loop; + + } elsif( scalar @hspData) { + $hspCount++; + $self->verbose and do{ print STDERR +( $hspCount % 10 ? "+" : "+\n" ); }; + +# print STDERR "\nBlastHit: setting HSP #$hspCount \n@hspData\n"; + my $hspObj = Bio::Search::HSP::BlastHSP->new + (-RAW_DATA => \@hspData, + -RANK => $hspCount, + -PROGRAM => $prog, + -QUERY_NAME => $qname, + -HIT_NAME => $hname, + ); + push @hspList, $hspObj; + @hspData = (); + push @hspData, $line; + next; + } else { + push @hspData, $line; + } + } elsif( $start ) { + ## This block is for setting the last HSP (which may be the first as well!). + if( $line =~ /^(end|>|Parameters|CPU|Database:)/ ) { + $hspCount++; + $self->verbose and do{ print STDERR +( $hspCount % 10 ? "+" : "+\n" ); }; + +# print STDERR "\nBlastHit: setting HSP #$hspCount \n@hspData"; + + my $hspObj = Bio::Search::HSP::BlastHSP->new + (-RAW_DATA => \@hspData, + -RANK => $hspCount, + -PROGRAM => $prog, + -QUERY_NAME => $qname, + -HIT_NAME => $hname, + ); + push @hspList, $hspObj; + } else { + push @hspData, $line; + } + } + } + + $hit->{'_length'} or $self->throw( "Can't determine hit sequence length."); + + # Adjust logical length based on BLAST flavor. + if($prog =~ /TBLAST[NX]/) { + $hit->{'_logical_length'} = $hit->{'_length'} / 3; + } + + $hit->{'_hsps'} = [ @hspList ]; + +# print STDERR "\n--------> Done building HSPs for $hit (total HSPS: ${\$hit->num_hsps})\n"; + +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/BlastResultFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/BlastResultFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,113 @@ +#----------------------------------------------------------------- +# $Id: BlastResultFactory.pm,v 1.5 2002/10/22 07:38:32 lapp Exp $ +# +# BioPerl module for Bio::Factory::BlastResultFactory +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::BlastResultFactory - Factory for Bio::Search::Result::BlastResult objects + +=head1 SYNOPSIS + + use Bio::Factory::BlastResultFactory; + + my $result_fact = Bio::Factory::BlastResultFactory->new(); + + my $result = $result_fact->create_result( %parameters ); + +See documentation for create_result() for information about C<%parameters>. + +=head1 DESCRIPTION + +This module encapsulates code for creating Bio::Search::Result::BlastResult +and Bio::Search::HSP::BlastHSP objects from traditional BLAST report +data (i.e., non-XML formatted). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +#' + +package Bio::Factory::BlastResultFactory; + +use strict; +use Bio::Root::Root; +use Bio::Factory::ResultFactoryI; +use Bio::Search::Result::BlastResult; + +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root Bio::Factory::ResultFactoryI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + +=head2 create_result + + Title : create_result + Usage : $result = $factory->create_result( %params ); + Function: Creates a new Bio::Search::Result::BlastResult object. + Returns : A single Bio::Search::Result::BlastResult object + Args : none + +=cut + +sub create_result { + my ($self, @args) = @_; + + my $result = Bio::Search::Result::BlastResult->new( @args ); + + return $result; +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/DriverFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/DriverFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,186 @@ +# $Id: DriverFactory.pm,v 1.10 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Factory::DriverFactory +# +# Cared for by Jason Stajich and +# Hilmar Lapp +# +# Copyright Jason Stajich, Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::DriverFactory - Base class for factory classes loading drivers + +=head1 SYNOPSIS + + #this class is not instantiable + +=head1 DESCRIPTION + +This a base class for factory classes that load drivers. Normally, you don't +instantiate this class directly. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email Jason Stajich Ejason@bioperl.orgE + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +#' +package Bio::Factory::DriverFactory; +use strict; +use Bio::Root::Root; +use Bio::Root::IO; + +use vars qw(@ISA %DRIVERS); + +@ISA = qw(Bio::Root::Root); + +BEGIN { + %DRIVERS = (); +} + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + +=head2 register_driver + + Title : register_driver + Usage : $factory->register_driver("genscan", "Bio::Tools::Genscan"); + Function: Registers a driver a factory class should be able to instantiate. + + This method can be called both as an instance and as a class + method. + + Returns : + Args : Key of the driver (string) and the module implementing the driver + (string). + +=cut + +sub register_driver { + my ($self, @args) = @_; + my %drivers = @args; + + foreach my $drv (keys(%drivers)) { + # note that this doesn't care whether $self is the class or the object + $self->driver_table()->{$drv} = $drivers{$drv}; + } +} + +=head2 driver_table + + Title : driver_table + Usage : $table = $factory->driver_table(); + Function: Returns a reference to the hash table storing associations of + methods with drivers. + + You use this table to look up registered methods (keys) and + drivers (values). + + In this implementation the table is class-specific and therefore + shared by all instances. You can override this in a derived class, + but note that this method can be called both as an instance and a + class method. + + This will be the table used by the object internally. You should + definitely know what you're doing if you modify the table's + contents. Modifications are shared by _all_ instances, those present + and those yet to be created. + + Returns : A reference to a hash table. + Args : + + +=cut + +sub driver_table { + my ($self, @args) = @_; + + return \%DRIVERS; +} + +=head2 get_driver + + Title : get_driver + Usage : $module = $factory->get_driver("genscan"); + Function: Returns the module implementing a driver registered under the + given key. + Example : + Returns : A string. + Args : Key of the driver (string). + +=cut + +sub get_driver { + my ($self, $key) = @_; + + if(exists($self->driver_table()->{$key})) { + return $self->driver_table()->{$key}; + } + return undef; +} + +=head2 _load_module + + Title : _load_module + Usage : $self->_load_module("Bio::Tools::Genscan"); + Function: Loads up (like use) a module at run time on demand. + Example : + Returns : TRUE on success + Args : + +=cut + +sub _load_module { + my ($self, $name) = @_; + my ($module, $load, $m); + $module = "_<$name.pm"; + return 1 if $main::{$module}; + $load = "$name.pm"; + + my $io = new Bio::Root::IO(); + # catfile comes from IO + $load = $io->catfile((split(/::/,$load))); + eval { + require $load; + }; + if ( $@ ) { + $self->throw("$load: $name cannot be found: ".$@); + } + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/EMBOSS.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/EMBOSS.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,284 @@ +# $Id: EMBOSS.pm,v 1.10 2002/10/22 07:38:32 lapp Exp $ +# +# BioPerl module for Bio::Factory::EMBOSS +# +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::EMBOSS - EMBOSS appliaction factory class + +=head1 SYNOPSIS + + # get an EMBOSS factory + use Bio::Factory::EMBOSS; + $f = Bio::Factory::EMBOSS -> new(); + # get an EMBOSS application object from the factory + $water = $f->program('water'); + + # here is an example of running the application + # water can compare 1 seq against 1->many sequences + # in a database using Smith-Waterman + my $seq_to_test; # this would have a seq here + my @seqs_to_check; # this would be a list of seqs to compare + # (could be just 1) + my $wateroutfile = 'out.water'; + $water->run({ '-sequencea' => $seq_to_test, + '-seqall' => \@seqs_to_check, + '-gapopen' => '10.0', + '-gapextend' => '0.5', + '-outfile' => $wateroutfile}); + # now you might want to get the alignment + use Bio::AlignIO; + my $alnin = new Bio::AlignIO(-format => 'emboss', + -file => $wateroutfile); + + while( my $aln = $alnin->next_aln ) { + # process the alignment -- these will be Bio::SimpleAlign objects + } + +=head1 DESCRIPTION + +The EMBOSS factory class encapsulates access to EMBOSS programs. A +factory object allows creation of only known applications. + +If you want to check command line options before sending them to the +program set $factory-Everbose to positive integer. The value is +passed on to programs objects and the ADC description of the available +command line options is parsed and compared to input. + +See also L and +L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Factory::EMBOSS; +use vars qw(@ISA $EMBOSSVERSION); +use strict; + +use Bio::Root::Root; +use Bio::Tools::Run::EMBOSSApplication; +use Bio::Factory::ApplicationFactoryI; +@ISA = qw(Bio::Root::Root Bio::Factory::ApplicationFactoryI ); + +$EMBOSSVERSION = "2.0.0"; + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + # set up defaults + + my($location) = + $self->_rearrange([qw(LOCATION )], + @args); + + $self->{ '_programs' } = {}; + $self->{ '_programgroup' } = {}; + $self->{ '_groups' } = {}; + + $self->location($location) if $location; + + $self->_program_list; # retrieve info about available programs + + return $self; + +} + +=head2 location + + Title : location + Usage : $embossfactory->location + Function: get/set the location of EMBOSS programs. + Valid values are 'local' and 'novella'. + Returns : string, defaults to 'local' + Args : string + +=cut + +sub location { + my ($self, $value) = @_; + my %location = ('local' => '1', + 'novella' => '1' + ); + if (defined $value) { + $value = lc $value; + if ($location{$value}) { + $self->{'_location'} = $value; + } else { + $self->warn("Value [$value] not a valid value for ". + "location(). Defaulting to [local]"); + $self->{'_location'} = 'local'; + } + } + $self->{'_location'} ||= 'local'; + return $self->{'_location'}; +} + + +=head2 program + + Title : program + Usage : $embossfactory->program('program_name') + Function: Creates a representation of a single EMBOSS program + Returns : Bio::Tools::Run::EMBOSSApplication object + Args : string, program name + +=cut + +sub program { + my ($self, $value) = @_; + + unless( $self->{'_programs'}->{$value} ) { + $self->warn("Application [$value] is not available!"); + return undef; + } + my $attr = {}; + $attr->{name} = $value; + $attr->{verbose} = $self->verbose; + + my $appl = Bio::Tools::Run::EMBOSSApplication ->new($attr); + return $appl; +} + +=head2 version + + Title : $self->version + Usage : $embossfactory->version() + Function: gets the version of EMBOSS programs + Throws : if EMBOSS suite is not accessible + Returns : version value + Args : None + +=cut + +sub version { + my ($self) = @_; + my ($version); + eval { + $version = `embossversion -auto`; + }; + $self->throw("EMBOSS suite of programs is not available \n\n$@") + if $@; + chop $version; + + # compare versions + my ($thisv, $embossv); + $version =~ /(\d+)\.(\d+)\.(\d+)/; + $thisv = "$1.$2$3"; + $EMBOSSVERSION =~ /(\d+)\.(\d+)\.(\d+)/; + $embossv = "$1.$2$3"; + $self->throw("EMBOSS has to be at least version $EMBOSSVERSION\n") + if $thisv < $embossv; + + return $version; +} + + +=head2 Programs + +These methods allow the programmer to query the EMBOSS suite and find +out which program names can be used and what arguments can be used. + +=head2 program_info + + Title : program_info + Usage : $embossfactory->program_info('emma') + Function: Finds out if the program is available. + Returns : definition string of the program, undef if program name not known + Args : string, prgramname + +=cut + +sub program_info { + my ($self, $value) = @_; + return $self->{'_programs'}->{$value}; +} + + +=head2 Internal methods + +Do not call these methods directly + +=head2 _program_list + + Title : _program_list + Usage : $embossfactory->_program_list() + Function: Finds out what programs are available. + Writes the names into an internal hash. + Returns : true if successful + Args : None + +=cut + +sub _program_list { + my ($self) = @_; + if( $^O =~ /MSWIN/i || + $^O =~ /Mac/i ) { return; } + { + local * SAVERR; + open SAVERR, ">&STDERR"; + open STDERR, ">/dev/null"; + open(WOSSOUT, "wossname -auto |") || return; + open STDERR, ">&SAVERR"; + + } + local $/ = "\n\n"; + while( ) { + my ($groupname) = (/^([A-Z][A-Z0-9 ]+)$/m); + #print $groupname, "\n" if $groupname; + $self->{'_groups'}->{$groupname} = [] if $groupname; + while ( /^([a-z]\w+) +(.+)$/mg ) { + #print "$1\t$2 \n" if $1; + $self->{'_programs'}->{$1} = $2 if $1; + $self->{'_programgroup'}->{$1} = $groupname if $1; + push @{$self->{'_groups'}->{$groupname}}, $1 if $1; + } + } + close(WOSSOUT); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/FTLocationFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/FTLocationFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,235 @@ +# $Id: FTLocationFactory.pm,v 1.9.2.4 2003/09/14 19:15:39 jason Exp $ +# +# BioPerl module for Bio::Factory::FTLocationFactory +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::FTLocationFactory - A FeatureTable Location Parser + +=head1 SYNOPSIS + + # parse a string into a location object + $loc = Bio::Factory::FTLocationFactory->from_string("join(100..200, 400..500"); + +=head1 DESCRIPTION + +Implementation of string-encoded location parsing for the Genbank feature table +encoding of locations. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::FTLocationFactory; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Factory::LocationFactoryI; +use Bio::Location::Simple; +use Bio::Location::Split; +use Bio::Location::Fuzzy; + + +@ISA = qw(Bio::Root::Root Bio::Factory::LocationFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Factory::FTLocationFactory(); + Function: Builds a new Bio::Factory::FTLocationFactory object + Returns : an instance of Bio::Factory::FTLocationFactory + Args : + + +=cut + +=head2 from_string + + Title : from_string + Usage : $loc = $locfactory->from_string("100..200"); + Function: Parses the given string and returns a Bio::LocationI implementing + object representing the location encoded by the string. + + This implementation parses the Genbank feature table + encoding of locations. + Example : + Returns : A Bio::LocationI implementing object. + Args : A string. + + +=cut + +sub from_string{ + # the third parameter is purely optional and indicates a recursive + # call if set + my ($self,$locstr,$is_rec) = @_; + my $loc; + + # there is no place in FT-formatted location strings where whitespace + # carries meaning, so strip it off entirely upfront + $locstr =~ s/\s+//g if ! $is_rec; + + # does it contain an operator? + if($locstr =~ /^([A-Za-z]+)\((.*)\)$/) { + # yes: + my $op = $1; + my $oparg = $2; + if($op eq "complement") { + # parse the argument recursively, then set the strand to -1 + $loc = $self->from_string($oparg, 1); + $loc->strand(-1); + } elsif(($op eq "join") || ($op eq "order") || ($op eq "bond")) { + # This is a split location. Split into components and parse each + # one recursively, then gather into a SplitLocationI instance. + # + # Note: The following code will /not/ work with nested + # joins (you want to have grammar-based parsing for that). + $loc = Bio::Location::Split->new(-verbose => $self->verbose, + -splittype => $op); + foreach my $substr (split(/,/, $oparg)) { + $loc->add_sub_Location($self->from_string($substr, 1)); + } + } else { + $self->throw("operator \"$op\" unrecognized by parser"); + } + } else { + # no operator, parse away + $loc = $self->_parse_location($locstr); + } + return $loc; +} + +=head2 _parse_location + + Title : _parse_location + Usage : $loc = $locfactory->_parse_location( $loc_string) + + Function: Parses the given location string and returns a location object + with start() and end() and strand() set appropriately. + Note that this method is private. + Returns : A Bio::LocationI implementing object or undef on failure + Args : location string + +=cut + +sub _parse_location { + my ($self, $locstr) = @_; + my ($loc, $seqid); + + $self->debug( "Location parse, processing $locstr\n"); + + # 'remote' location? + if($locstr =~ /^(\S+):(.*)$/) { + # yes; memorize remote ID and strip from location string + $seqid = $1; + $locstr = $2; + } + + # split into start and end + my ($start, $end) = split(/\.\./, $locstr); + # remove enclosing parentheses if any; note that because of parentheses + # possibly surrounding the entire location the parentheses around start + # and/or may be asymmetrical + $start =~ s/^\(+//; + $start =~ s/\)+$//; + $end =~ s/^\(+// if $end; + $end =~ s/\)+$// if $end; + + # Is this a simple (exact) or a fuzzy location? Simples have exact start + # and end, or is between two adjacent bases. Everything else is fuzzy. + my $loctype = ".."; # exact with start and end as default + my $locclass = "Bio::Location::Simple"; + if(! defined($end)) { + if($locstr =~ /(\d+)([\.\^])(\d+)/) { + $start = $1; + $end = $3; + $loctype = $2; + $locclass = "Bio::Location::Fuzzy" + unless (abs($end - $start) <= 1) && ($loctype eq "^"); + + } else { + $end = $start; + } + } + if ( ($start =~ /[\>\<\?\.\^]/) || ($end =~ /[\>\<\?\.\^]/) ) { + $locclass = 'Bio::Location::Fuzzy'; + } + + # instantiate location and initialize + $loc = $locclass->new(-verbose => $self->verbose, + -start => $start, + -end => $end, + -strand => 1, + -location_type => $loctype); + # set remote ID if remote location + if($seqid) { + $loc->is_remote(1); + $loc->seq_id($seqid); + } + + # done (hopefully) + return $loc; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/HitFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/HitFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,93 @@ +#----------------------------------------------------------------- +# $Id: HitFactoryI.pm,v 1.6 2002/10/22 07:38:32 lapp Exp $ +# +# BioPerl module for Bio::Factory::HitFactoryI +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::HitFactoryI - Interface for an object that builds Bio::Search::Hit::HitI objects + +=head1 SYNOPSIS + +To be completed. + +=head1 DESCRIPTION + +To be completed. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. + +=cut + +#' + +package Bio::Factory::HitFactoryI; + +use strict; +use Bio::Root::RootI; + +use vars qw(@ISA); + +@ISA = qw(Bio::Root::RootI); + +=head2 create_hit + + Title : create_hit + Usage : $hit = $factory->create_hit( %params ); + Function: Creates a new Bio::Search::Hit::HitI object. + Returns : An object that implements the Bio::Search::Hit::HitI interface + Args : Named parameters (to be defined) + +=cut + +sub create_hit { + my ($self, @args) = @_; + $self->throw_not_implemented; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/LocationFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/LocationFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +# $Id: LocationFactoryI.pm,v 1.2 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::LocationFactoryI +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::LocationFactoryI - DESCRIPTION of Interface + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the interface here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::LocationFactoryI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 from_string + + Title : from_string + Usage : $loc = $locfactory->from_string("100..200"); + Function: Parses the given string and returns a Bio::LocationI implementing + object representing the location encoded by the string. + + Different implementations may support different encodings. An + example of a commonly used encoding is the Genbank feature table + encoding of locations. + Example : + Returns : A Bio::LocationI implementing object. + Args : A string. + + +=cut + +sub from_string{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/MapFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/MapFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,111 @@ +# $Id: MapFactoryI.pm,v 1.5 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::MapFactoryI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::MapFactoryI - A Factory for getting markers + +=head1 SYNOPSIS + + # get a Map Factory somehow likely from Bio::MapIO system + + while( my $map = $mapin->next_map ) { + print "map name is ", $map->name, " length is ", + $map->length, " ", $map->units, "\n"; + $mapout->write_map($map); + } + +=head1 DESCRIPTION + +This interface describes the necessary minimum methods for getting +Maps from a data stream. It also supports writing Map data back to a +stream. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::MapFactoryI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head2 next_map + + Title : next_map + Usage : my $map = $factory->next_map; + Function: Get a map from the factory + Returns : L + Args : none + +=cut + +sub next_map{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 write_map + + Title : write_map + Usage : $factory->write_map($map); + Function: Write a map out through the factory + Returns : none + Args : L + +=cut + +sub write_map{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/ObjectBuilderI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/ObjectBuilderI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,201 @@ +# $Id: ObjectBuilderI.pm,v 1.2 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::ObjectBuilderI +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::ObjectBuilderI - Interface for an object builder + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +An object builder is different from an object factory in that it +accumulates information for the object and finally, or constantly, +depending on the implementation, builds the object. It also allows for +implementations that can tell the information feed in which kind of +information the builder is interested in which not. In addition, the +implementation may choose to filter, transform, or completely ignore +certain content it is fed for certain slots. + +Implementations will hence be mostly used by stream-based parsers to +parse only desired content, and/or skip over undesired entries. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::ObjectBuilderI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 want_slot + + Title : want_slot + Usage : + Function: Whether or not the object builder wants to populate the + specified slot of the object to be built. + + The slot can be specified either as the name of the + respective method, or the initialization parameter that + would be otherwise passed to new() of the object to be + built. + + Example : + Returns : TRUE if the object builder wants to populate the slot, and + FALSE otherwise. + Args : the name of the slot (a string) + + +=cut + +sub want_slot{ + shift->throw_not_implemented(); +} + +=head2 add_slot_value + + Title : add_slot_value + Usage : + Function: Adds one or more values to the specified slot of the object + to be built. + + Naming the slot is the same as for want_slot(). + + The object builder may further filter the content to be + set, or even completely ignore the request. + + If this method reports failure, the caller should not add + more values to the same slot. In addition, the caller may + find it appropriate to abandon the object being built + altogether. + + Example : + Returns : TRUE on success, and FALSE otherwise + Args : the name of the slot (a string) + parameters determining the value to be set + + +=cut + +sub add_slot_value{ + shift->throw_not_implemented(); +} + +=head2 want_object + + Title : want_object + Usage : + Function: Whether or not the object builder is still interested in + continuing with the object being built. + + If this method returns FALSE, the caller should not add any + more values to slots, or otherwise risks that the builder + throws an exception. In addition, make_object() is likely + to return undef after this method returned FALSE. + + Example : + Returns : TRUE if the object builder wants to continue building + the present object, and FALSE otherwise. + Args : none + + +=cut + +sub want_object{ + shift->throw_not_implemented(); +} + +=head2 make_object + + Title : make_object + Usage : + Function: Get the built object. + + This method is allowed to return undef if no value has ever + been added since the last call to make_object(), or if + want_object() returned FALSE (or would have returned FALSE) + before calling this method. + + For an implementation that allows consecutive building of + objects, a caller must call this method once, and only + once, between subsequent objects to be built. I.e., a call + to make_object implies 'end_object.' + + Example : + Returns : the object that was built + Args : none + + +=cut + +sub make_object{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/ObjectFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/ObjectFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,252 @@ +# $Id: ObjectFactory.pm,v 1.1.2.1 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Bio::Factory::ObjectFactory +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::ObjectFactory - Instantiates a new Bio::Root::RootI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Factory::ObjectFactory; + + my $factory = new Bio::Factory::ObjectFactory(-type => 'Bio::Ontology::GOterm'); + my $term = $factory->create_object(-name => 'peroxisome', + -ontology => 'Gene Factory', + -identifier => 'GO:0005777'); + + +=head1 DESCRIPTION + +This object will build L objects generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + + +=head1 CONTRIBUTORS + +This is mostly copy-and-paste with subsequent adaptation from +Bio::Seq::SeqFactory by Jason Stajich. Most credits should in fact go +to him. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::ObjectFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Factory::ObjectFactory(); + Function: Builds a new Bio::Factory::ObjectFactory object + Returns : Bio::Factory::ObjectFactory + Args : -type => string, name of a L derived class. + There is no default. + -interface => string, name of the interface or class any type + specified needs to at least implement. + The default is Bio::Root::RootI. + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($type,$interface) = $self->_rearrange([qw(TYPE INTERFACE)], @args); + + $self->{'_loaded_types'} = {}; + $self->interface($interface || "Bio::Root::RootI"); + $self->type($type) if $type; + + return $self; +} + + +=head2 create_object + + Title : create_object + Usage : my $seq = $factory->create_object(); + Function: Instantiates a new object of the previously set type. + + This object allows us to genericize the instantiation of + objects. + + You must have provided -type at instantiation, or have + called type($mytype) before you can call this method. + + Returns : an object of the type returned by type() + + The return type is configurable using new(-type =>"..."), + or by calling $self->type("My::Fancy::Class"). + Args : Initialization parameters specific to the type of + object we want. Check the POD of the class you set as type. + +=cut + +sub create_object { + my ($self,@args) = @_; + + my $type = $self->type(); # type has already been loaded upon set + return $type->new(-verbose => $self->verbose, @args); +} + +=head2 type + + Title : type + Usage : $obj->type($newval) + Function: Get/set the type of object to be created. + + This may be changed at any time during the lifetime of this + factory. + + Returns : value of type (a string) + Args : newvalue (optional, a string) + + +=cut + +sub type{ + my $self = shift; + + if(@_) { + my $type = shift; + if($type && (! $self->{'_loaded_types'}->{$type})) { + eval { + $self->_load_module($type); + }; + if( $@ ) { + $self->throw("module for '$type' failed to load: ". + $@); + } + my $o = bless {},$type; + if(!$self->_validate_type($o)) { # this may throw an exception + $self->throw("'$type' is not valid for factory ".ref($self)); + } + $self->{'_loaded_types'}->{$type} = 1; + } + return $self->{'type'} = $type; + } + return $self->{'type'}; +} + +=head2 interface + + Title : interface + Usage : $obj->interface($newval) + Function: Get/set the interface or base class that supplied types + must at least implement (inherit from). + Example : + Returns : value of interface (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub interface{ + my $self = shift; + my $interface = shift; + + if($interface) { + return $self->{'interface'} = $interface; + } + return $self->{'interface'}; +} + +=head2 _validate_type + + Title : _validate_type + Usage : + Function: Called to let derived factories validate the type set + via type(). + + The default implementation here checks whether the supplied + object skeleton implements the interface set via -interface + upon factory instantiation. + + Example : + Returns : TRUE if the type is to be considered valid, and FALSE otherwise. + Instead of returning FALSE this method may also just throw + an informative exception. + Args : A hash reference blessed into the specified type, allowing + queries like isa(). + + +=cut + +sub _validate_type{ + my ($self,$obj) = @_; + + if(! $obj->isa($self->interface())) { + $self->throw("invalid type: '".ref($obj). + "' does not implement '".$self->interface()."'"); + } + return 1; +} + +##################################################################### +# aliases for naming consistency or other reasons # +##################################################################### + +*create = \&create_object; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/ObjectFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/ObjectFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,114 @@ +# $Id: ObjectFactoryI.pm,v 1.3 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::ObjectFactoryI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::ObjectFactoryI - A General object creator factory + +=head1 SYNOPSIS + +# see the implementations of this interface for details but +# basically + + my $obj = $factory->create(%args); + +=head1 DESCRIPTION + +This interface is the basic structure for a factory which creates new +objects. In this case it is up to the implementer to check arguments +and initialize whatever new object the implementing class is designed for. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::ObjectFactoryI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 create + + Title : create + Usage : $factory->create(%args) + Function: Create a new object + Returns : a new object + Args : hash of initialization parameters + + +=cut + +sub create{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 create_object + + Title : create_object + Usage : $obj = $factory->create_object(%args) + Function: Create a new object. + + This is supposed to supercede create(). Right now it only delegates + to create(). + Returns : a new object + Args : hash of initialization parameters + + +=cut + +sub create_object{ + my ($self,@args) = @_; + return $self->create(@args); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/ResultFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/ResultFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,93 @@ +#----------------------------------------------------------------- +# $Id: ResultFactoryI.pm,v 1.6 2002/10/22 07:38:32 lapp Exp $ +# +# BioPerl module Bio::Factory::ResultFactoryI +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::ResultFactoryI - Interface for an object that builds Bio::Search::Result::ResultI objects + +=head1 SYNOPSIS + +To be completed. + +=head1 DESCRIPTION + +To be completed. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. + +=cut + +#' + +package Bio::Factory::ResultFactoryI; + +use strict; +use Bio::Root::RootI; + +use vars qw(@ISA); + +@ISA = qw(Bio::Root::RootI); + +=head2 create_result + + Title : create_result + Usage : $result = $factory->create_result( %params ); + Function: Creates a new Bio::Search::Result::ResultI object. + Returns : An object that implements the Bio::Search::Result::ResultI interface + Args : Named parameters (to be defined) + +=cut + +sub create_result { + my ($self, @args) = @_; + $self->throw_not_implemented; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/SeqAnalysisParserFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/SeqAnalysisParserFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,167 @@ +# $Id: SeqAnalysisParserFactory.pm,v 1.9 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Factory::SeqAnalysisParserFactory +# +# Cared for by Jason Stajich , +# and Hilmar Lapp +# +# Copyright Jason Stajich, Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::SeqAnalysisParserFactory - class capable of + creating SeqAnalysisParserI compliant parsers + +=head1 SYNOPSIS + + # initialize an object implementing this interface, e.g. + $factory = Bio::Factory::SeqAnalysisParserFactory->new(); + # find out the methods it knows about + print "registered methods: ", + join(', ', keys($factory->driver_table())), "\n"; + # obtain a parser object + $parser = $factory->get_parser(-input=>$inputobj, + -params=>[@params], + -method => $method); + # $parser is an object implementing Bio::SeqAnalysisParserI + # annotate sequence with features produced by parser + while(my $feat = $parser->next_feature()) { + $seq->add_SeqFeature($feat); + } + +=head1 DESCRIPTION + +This is a factory class capable of instantiating SeqAnalysisParserI +implementing parsers. + +The concept behind this class and the interface it implements +(Bio::Factory::SeqAnalysisParserFactoryI) is a generic analysis result parsing +in high-throughput automated sequence annotation pipelines. See +Bio::SeqAnalysisParserI for more documentation of this concept. + +You can always find out the methods an instance of this class knows +about by the way given in the SYNOPSIS section. By default, and +assuming that the documentation is up-to-date, this will comprise of +genscan, mzef, estscan, blast, hmmer, gff, and sim4 (all case-insensitive). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email Hilmar Lapp Ehlapp@gmx.netE, Jason Stajich Ejason@bioperl.orgE + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Factory::SeqAnalysisParserFactory; +use strict; + +use Bio::Factory::SeqAnalysisParserFactoryI; +use Bio::Factory::DriverFactory; + +use vars qw(@ISA); +@ISA = qw(Bio::Factory::DriverFactory Bio::Factory::SeqAnalysisParserFactoryI); + +BEGIN { + Bio::Factory::DriverFactory->register_driver + ( + "genscan" => "Bio::Tools::Genscan", + "mzef" => "Bio::Tools::MZEF", + "estscan" => "Bio::Tools::ESTScan", + "bplite" => "Bio::Tools::BPlite", + "blast" => "Bio::Tools::BPlite", + "hmmer" => "Bio::Tools::HMMER::Result", + "gff" => "Bio::Tools::GFF", + "sim4" => "Bio::Tools::Sim4::Results", + "epcr" => "Bio::Tools::EPCR", + "exonerate" => "Bio::Tools::Exonerate", + ); +} + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + # no per-object initialization right now - registration of default drivers + # is only done once when the module is loaded + return $self; +} + +=head2 get_parser + + Title : get_parser + Usage : $factory->get_parser(-input=>$inputobj, + [ -params=>[@params] ], + -method => $method) + Function: Creates and returns a parser object for the given input and method. + Both file names and streams (filehandles) are allowed. + + Parameters (-params argument) are passed on to the parser object + and therefore are specific to the parser to be created. + Example : + Returns : A Bio::SeqAnalysisParserI implementing object. Exception if + creation of the parser object fails. + Args : B - object/file where analysis results are coming from, + B - parameter to use when parsing/running analysis + B - method of analysis + +=cut + +sub get_parser { + my ($self, @args) = @_; + my $parser; + my $module; + + my ($input, $params, $method) = + $self->_rearrange([qw(INPUT PARAMS METHOD)], @args); + + # retrieve module name for requested method + $method = lc $method; # method is case-insensitive + $module = $self->get_driver($method); + if(! defined($module)) { + $self->throw("Analysis parser driver for method $method not registered."); + } + # load module + $self->_load_module($module); # throws an exception on failure to load + # make sure parameters is not undef + $params = [] if( !defined $params ); + # figure out input method (file or stream) + my $inputmethod = '-file'; + if( ref($input) =~ /GLOB/i ) { + $inputmethod = '-fh'; + } + # instantiate parser and return the result + $parser = $module->new($inputmethod => $input, @$params); + if(! $parser->isa('Bio::SeqAnalysisParserI')) { + $self->throw("Driver $module registered for method $method does not implement Bio::SeqAnalyisParserI. How come?"); + } + return $parser; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/SeqAnalysisParserFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/SeqAnalysisParserFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,113 @@ +# $Id: SeqAnalysisParserFactoryI.pm,v 1.8 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Factory::SeqAnalysisParserFactoryI +# +# Cared for by Jason Stajich , +# and Hilmar Lapp +# +# Copyright Jason Stajich, Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::SeqAnalysisParserFactoryI - interface describing objects capable + of creating SeqAnalysisParserI compliant parsers + +=head1 SYNOPSIS + + # initialize an object implementing this interface, e.g. + $factory = Bio::Factory::SeqAnalysisParserFactory->new(); + # obtain a parser object + $parser = $factory->get_parser(-input=>$inputobj, + -params=>[@params], + -method => $method); + # $parser is an object implementing Bio::SeqAnalysisParserI + # annotate sequence with features produced by parser + while(my $feat = $parser->next_feature()) { + $seq->add_SeqFeature($feat); + } + +=head1 DESCRIPTION + +This is an interface for factory classes capable of instantiating +SeqAnalysisParserI implementing parsers. + +The concept behind the interface is a generic analysis result parsing +in high-throughput automated sequence annotation pipelines. See +L for more documentation of this concept. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email Hilmar Lapp Ehlapp@gmx.netE, Jason Stajich Ejason@bioperl.orgE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Factory::SeqAnalysisParserFactoryI; +use strict; + +use Bio::Root::RootI; +use Carp; + +use vars qw(@ISA ); +@ISA = qw(Bio::Root::RootI); + +=head2 get_parser + + Title : get_parser + Usage : $factory->get_parser(-input=>$inputobj, + [ -params=>[@params] ], + -method => $method) + Function: Creates and returns a parser object for the given input and method. + The type of input which is suitable depends on the implementation, + but a good-style implementation should allow both file names and + streams (filehandles). + + A particular implementation may not be able to create a parser for + the requested method. In this case it shall return undef. + + Parameters (-params argument) are passed on to the parser object + and therefore are specific to the parser to be created. An + implementation of this interface should make this argument optional. + Example : + Returns : A Bio::SeqAnalysisParserI implementing object. + Args : B - object/file where analysis results are coming from, + B - parameter to use when parsing/running analysis + B - method of analysis + +=cut + +sub get_parser { + my ( $self, @args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/SequenceFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/SequenceFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,99 @@ +# $Id: SequenceFactoryI.pm,v 1.6 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::SequenceFactoryI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::SequenceFactoryI - This interface allows for generic building of sequences in factories which create sequences (like SeqIO) + +=head1 SYNOPSIS + +# do not use this object directly it is an interface +# get a Bio::Factory::SequenceFactoryI object like + + use Bio::Seq::SeqFactory; + my $seqbuilder = new Bio::Seq::SeqFactory('type' => 'Bio::PrimarySeq'); + + my $seq = $seqbuilder->create(-seq => 'ACTGAT', + -display_id => 'exampleseq'); + + print "seq is a ", ref($seq), "\n"; + +=head1 DESCRIPTION + +Describe the interface here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::SequenceFactoryI; + +use vars qw(@ISA); +use strict; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Factory::ObjectFactoryI); + +=head2 create + + Title : create + Usage : my $seq = $seqbuilder->create(-seq => 'CAGT', + -id => 'name'); + Function: Instantiates new Bio::PrimarySeqI (or one of its child classes) + This object allows us to genericize the instantiation of sequence + objects. + Returns : Bio::PrimarySeqI + Args : initialization parameters specific to the type of sequence + object we want. Typically + -seq => $str, + -display_id => $name + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/SequenceProcessorI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/SequenceProcessorI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,205 @@ +# $Id: SequenceProcessorI.pm,v 1.1 2002/10/24 18:35:46 lapp Exp $ +# +# BioPerl module for Bio::Factory::SequenceProcessorI +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::SequenceProcessorI - Interface for chained sequence + processing algorithms + +=head1 SYNOPSIS + + use Bio::SeqIO; + use MySeqProcessor; # is-a Bio::Factory::SequenceProcessorI + + # obtain your source stream, e.g., an EMBL file + my $seqin = Bio::SeqIO->new(-fh => \*STDIN, -format => 'embl'); + # create your processor (it must implement this interface) + my $seqalgo = MySeqProcessor->new(); + # chain together + $seqalgo->source_stream($seqin); + # you could create more processors and chain them one after another + # ... + # finally, the last link in the chain is your SeqIO stream + my $seqpipe = $seqalgo; + + # once you've established the pipeline, proceed as if you had a + # single SeqIO stream + while(my $seq = $seqpipe->next_seq()) { + # ... do something ... + } + +=head1 DESCRIPTION + +This defines an interface that allows seamless chaining of sequence +processing algorithms encapsulated in modules while retaining the +overall Bio::SeqIO interface at the end of the pipeline. + +This is especially useful if you want an easily configurable +processing pipeline of re-usable algorithms as building blocks instead +of (hard-)coding the whole algorithm in a single script. + +There are literally no restrictions as to what an individual module +can do with a sequence object it obtains from the source stream before +it makes it available through its own next_seq() method. It can +manipulate the sequence object, but otherwise keep it intact, but it +can also create any number of new sequence objects from it, or it can +discard some, or any combination thereof. The only requirement is that +its next_seq() method return Bio::PrimarySeqI compliant objects. In +order to play nice, if a processor creates new objects it should try +to use the same sequence factory that the source stream uses, but this +is not strongly mandated. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::SequenceProcessorI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; +use Bio::Factory::SequenceStreamI; + +@ISA = qw( Bio::Factory::SequenceStreamI ); + +=head2 source_stream + + Title : source_stream + Usage : $obj->source_stream($newval) + Function: Get/set the source sequence stream for this sequence + processor. + + An implementation is not required to allow set, but will + usually do so. + + Example : + Returns : A Bio::Factory::SequenceStreamI compliant object + Args : on set, new value (a Bio::Factory::SequenceStreamI compliant + object) + + +=cut + +sub source_stream{ + shift->throw_not_implemented(); +} + +=head1 Bio::Factory::SequenceStreamI methods + + The requirement to implement these methods is inherited from + L. An implementation may not + necessarily have to implement all methods in a meaningful way. Which + methods will be necessary very much depends on the context in which + an implementation of this interface is used. E.g., if it is only used + for post-processing sequences read from a SeqIO stream, write_seq() + will not be used and hence does not need to be implemented in a + meaningful way (it may in fact even throw an exception). + + Also, since an implementor will already receive built objects from a + sequence stream, sequence_factory() may or may not be relevant, + depending on whether the processing method does or does not involve + creating new objects. + +=cut + +=head2 next_seq + + Title : next_seq + Usage : $seq = stream->next_seq + Function: Reads the next sequence object from the stream and returns it. + + In the case of a non-recoverable situation an exception + will be thrown. Do not assume that you can resume parsing + the same stream after catching the exception. Note that you + can always turn recoverable errors into exceptions by + calling $stream->verbose(2). + + Returns : a Bio::Seq sequence object + Args : none + +See L + +=cut + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + +=cut + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($seqfactory) + Function: Get the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : none + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/SequenceStreamI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/SequenceStreamI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,134 @@ +# $Id: SequenceStreamI.pm,v 1.3 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::SequenceStreamI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::SequenceStreamI - Interface describing the basics of a Sequence Stream. + +=head1 SYNOPSIS + + # get a SequenceStreamI object somehow like with SeqIO + use Bio::SeqIO; + my $in = new Bio::SeqIO(-file => '< fastafile'); + while( my $seq = $in->next_seq ) { + } + +=head1 DESCRIPTION + +This interface is for describing objects which produces +Bio::PrimarySeqI objects or processes Bio::PrimarySeqI objects to a +data stream. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::SequenceStreamI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; + +@ISA= qw(Bio::Root::RootI); + +=head2 next_seq + + Title : next_seq + Usage : $seq = stream->next_seq + Function: Reads the next sequence object from the stream and returns it. + + Certain driver modules may encounter entries in the stream that + are either misformatted or that use syntax not yet understood + by the driver. If such an incident is recoverable, e.g., by + dismissing a feature of a feature table or some other non-mandatory + part of an entry, the driver will issue a warning. In the case + of a non-recoverable situation an exception will be thrown. + Do not assume that you can resume parsing the same stream after + catching the exception. Note that you can always turn recoverable + errors into exceptions by calling $stream->verbose(2). + Returns : a Bio::Seq sequence object + Args : none + +See L + +=cut + +sub next_seq { + shift->throw_not_implemented(); +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + +=cut + +sub write_seq { + shift->throw_not_implemented(); +} + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($seqfactory) + Function: Get the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : none + + +=cut + +sub sequence_factory{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Factory/TreeFactoryI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Factory/TreeFactoryI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,115 @@ +# $Id: TreeFactoryI.pm,v 1.6 2002/10/22 07:45:14 lapp Exp $ +# +# BioPerl module for Bio::Factory::TreeFactoryI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Factory::TreeFactoryI - Factory Interface for getting and writing trees + from/to a data stream + +=head1 SYNOPSIS + + # get a $factory from somewhere Bio::TreeIO likely + my $treeio = new Bio::TreeIO(-format => 'newick', #this is phylip/newick format + -file => 'file.tre'); + my $treeout = new Bio::TreeIO(-format => 'nexus', + -file => ">file.nexus"); + + # convert tree formats from newick/phylip to nexus + while(my $tree = $treeio->next_tree) { + $treeout->write_tree($treeout); + } + +=head1 DESCRIPTION + +This interface describes the minimal functions needed to get and write +trees from a data stream. It is implemented by the L factory. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Factory::TreeFactoryI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $factory->next_tree; + Function: Get a tree from the factory + Returns : L + Args : none + +=cut + +sub next_tree{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 write_tree + + Title : write_tree + Usage : $treeio->write_tree($tree); + Function: Writes a tree onto the stream + Returns : none + Args : L + + +=cut + +sub write_tree{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/FeatureHolderI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/FeatureHolderI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,194 @@ +# $Id: FeatureHolderI.pm,v 1.2 2002/11/19 07:04:22 lapp Exp $ +# +# BioPerl module for Bio::FeatureHolderI +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::FeatureHolderI - the base interface an object with features must implement + +=head1 SYNOPSIS + + use Bio::SeqIO; + # get a feature-holding object somehow: for example, Bio::SeqI objects + # have features + my $seqio = Bio::SeqIO->new(-fh => \*STDIN, -format => 'genbank); + while (my $seq = $seqio->next_seq()) { + # $seq is-a Bio::FeatureHolderI, hence: + my @feas = $seq->get_SeqFeatures(); + # each element is-a Bio::SeqFeatureI + foreach my $fea (@feas) { + # do something with the feature objects + } + } + +=head1 DESCRIPTION + +This is the base interface that all feature-holding objects must +implement. + +Popular feature-holders are for instance L objects. Since +L defines a sub_SeqFeature() method, most +Bio::SeqFeatureI implementations like L will +implement the feature holder interface as well. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::FeatureHolderI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 get_SeqFeatures + + Title : get_SeqFeatures + Usage : + Function: Get the feature objects held by this feature holder. + Example : + Returns : an array of Bio::SeqFeatureI implementing objects + Args : none + +At some day we may want to expand this method to allow for a feature +filter to be passed in. + +=cut + +sub get_SeqFeatures{ + shift->throw_not_implemented(); +} + +=head2 feature_count + + Title : feature_count + Usage : $obj->feature_count() + Function: Return the number of SeqFeatures attached to a feature holder. + + This is before flattening a possible sub-feature tree. + + We provide a default implementation here that just counts + the number of objects returned by get_SeqFeatures(). + Implementors may want to override this with a more + efficient implementation. + + Returns : integer representing the number of SeqFeatures + Args : None + +At some day we may want to expand this method to allow for a feature +filter to be passed in. + +Our default implementation allows for any number of additional +arguments and will pass them on to get_SeqFeatures(). I.e., in order to +support filter arguments, just support them in get_SeqFeatures(). + +=cut + +sub feature_count { + return scalar(shift->get_SeqFeatures(@_)); +} + +=head2 get_all_SeqFeatures + + Title : get_all_SeqFeatures + Usage : + Function: Get the flattened tree of feature objects held by this + feature holder. The difference to get_SeqFeatures is that + the entire tree of sub-features will be flattened out. + + We provide a default implementation here, so implementors + don''t necessarily need to implement this method. + + Example : + Returns : an array of Bio::SeqFeatureI implementing objects + Args : none + +At some day we may want to expand this method to allow for a feature +filter to be passed in. + +Our default implementation allows for any number of additional +arguments and will pass them on to any invocation of +get_SeqFeatures(), wherever a component of the tree implements +FeatureHolderI. I.e., in order to support filter arguments, just +support them in get_SeqFeatures(). + +=cut + +sub get_all_SeqFeatures{ + my $self = shift; + my @flatarr; + + foreach my $feat ( $self->get_SeqFeatures(@_) ){ + push(@flatarr,$feat); + &_add_flattened_SeqFeatures(\@flatarr,$feat,@_); + } + return @flatarr; +} + +sub _add_flattened_SeqFeatures { + my ($arrayref,$feat,@args) = @_; + my @subs = (); + + if($feat->isa("Bio::FeatureHolderI")) { + @subs = $feat->get_SeqFeatures(@args); + } elsif($feat->isa("Bio::SeqFeatureI")) { + @subs = $feat->sub_SeqFeature(); + } else { + confess ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ". + "Don't know how to flatten."; + } + foreach my $sub (@subs) { + push(@$arrayref,$sub); + &_add_flattened_SeqFeatures($arrayref,$sub); + } + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,112 @@ +package Bio::Graphics; + +use Bio::Graphics::Panel; +use strict; + +use vars '$VERSION'; +$VERSION = 1.2003; + +1; + +=head1 NAME + +Bio::Graphics - Generate GD images of Bio::Seq objects + +=head1 SYNOPSIS + + # This script generates a PNG picture of a 10K region containing a + # set of red features and a set of blue features. Call it like this: + # red_and_blue.pl > redblue.png + # you can now view the picture with your favorite image application + + + # This script parses a GenBank or EMBL file named on the command + # line and produces a PNG rendering of it. Call it like this: + # render.pl my_file.embl | display - + + use strict; + use Bio::Graphics; + use Bio::SeqIO; + + my $file = shift or die "provide a sequence file as the argument"; + my $io = Bio::SeqIO->new(-file=>$file) or die "could not create Bio::SeqIO"; + my $seq = $io->next_seq or die "could not find a sequence in the file"; + + my @features = $seq->all_SeqFeatures; + + # sort features by their primary tags + my %sorted_features; + for my $f (@features) { + my $tag = $f->primary_tag; + push @{$sorted_features{$tag}},$f; + } + + my $wholeseq = Bio::SeqFeature::Generic->new(-start=>1,-end=>$seq->length); + + my $panel = Bio::Graphics::Panel->new( + -length => $seq->length, + -key_style => 'between', + -width => 800, + -pad_left => 10, + -pad_right => 10, + ); + $panel->add_track($wholeseq, + -glyph => 'arrow', + -bump => 0, + -double=>1, + -tick => 2); + + $panel->add_track($seq, + -glyph => 'generic', + -bgcolor => 'blue', + -label => 1, + ); + + # general case + my @colors = qw(cyan orange blue purple green chartreuse magenta yellow aqua); + my $idx = 0; + for my $tag (sort keys %sorted_features) { + my $features = $sorted_features{$tag}; + $panel->add_track($features, + -glyph => 'generic', + -bgcolor => $colors[$idx++ % @colors], + -fgcolor => 'black', + -font2color => 'red', + -key => "${tag}s", + -bump => +1, + -height => 8, + -label => 1, + -description => 1, + ); + } + + print $panel->png; + exit 0; + +=head1 DESCRIPTION + +Please see L for the full interface. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Feature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Feature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,557 @@ +package Bio::Graphics::Feature; + +=head1 NAME + +Bio::Graphics::Feature - A simple feature object for use with Bio::Graphics::Panel + +=head1 SYNOPSIS + + use Bio::Graphics::Feature; + + # create a simple feature with no internal structure + $f = Bio::Graphics::Feature->new(-start => 1000, + -stop => 2000, + -type => 'transcript', + -name => 'alpha-1 antitrypsin', + -desc => 'an enzyme inhibitor', + ); + + # create a feature composed of multiple segments, all of type "similarity" + $f = Bio::Graphics::Feature->new(-segments => [[1000,1100],[1500,1550],[1800,2000]], + -name => 'ABC-3', + -type => 'gapped_alignment', + -subtype => 'similarity'); + + # build up a gene exon by exon + $e1 = Bio::Graphics::Feature->new(-start=>1,-stop=>100,-type=>'exon'); + $e2 = Bio::Graphics::Feature->new(-start=>150,-stop=>200,-type=>'exon'); + $e3 = Bio::Graphics::Feature->new(-start=>300,-stop=>500,-type=>'exon'); + $f = Bio::Graphics::Feature->new(-segments=>[$e1,$e2,$e3],-type=>'gene'); + +=head1 DESCRIPTION + +This is a simple Bio::SeqFeatureI-compliant object that is compatible +with Bio::Graphics::Panel. With it you can create lightweight feature +objects for drawing. + +All methods are as described in L with the following additions: + +=head2 The new() Constructor + + $feature = Bio::Graphics::Feature->new(@args); + +This method creates a new feature object. You can create a simple +feature that contains no subfeatures, or a hierarchically nested object. + +Arguments are as follows: + + -start the start position of the feature + -end the stop position of the feature + -stop an alias for end + -name the feature name (returned by seqname()) + -type the feature type (returned by primary_tag()) + -source the source tag + -desc a description of the feature + -segments a list of subfeatures (see below) + -subtype the type to use when creating subfeatures + -strand the strand of the feature (one of -1, 0 or +1) + -id an alias for -name + -seqname an alias for -name + -primary_id an alias for -name + -display_id an alias for -name + -display_name an alias for -name (do you get the idea the API has changed?) + -attributes a hashref of tag value attributes, in which the key is the tag + and the value is an array reference of values + -factory a reference to a feature factory, used for compatibility with + more obscure parts of Bio::DB::GFF + +The subfeatures passed in -segments may be an array of +Bio::Graphics::Feature objects, or an array of [$start,$stop] +pairs. Each pair should be a two-element array reference. In the +latter case, the feature type passed in -subtype will be used when +creating the subfeatures. + +If no feature type is passed, then it defaults to "feature". + +=head2 Non-SeqFeatureI methods + +A number of new methods are provided for compatibility with +Ace::Sequence, which has a slightly different API from SeqFeatureI: + +=over 4 + +=item add_segment(@segments) + +Add one or more segments (a subfeature). Segments can either be +Feature objects, or [start,stop] arrays, as in the -segments argument +to new(). The feature endpoints are automatically adjusted. + +=item segments() + +An alias for sub_SeqFeature(). + +=item merged_segments() + +Another alias for sub_SeqFeature(). + +=item stop() + +An alias for end(). + +=item name() + +An alias for seqname(). + +=item exons() + +An alias for sub_SeqFeature() (you don't want to know why!) + +=back + +=cut + +use strict; +use Bio::Root::Root; +use Bio::SeqFeatureI; +use Bio::SeqI; +use Bio::LocationI; + +use vars '@ISA'; +@ISA = qw(Bio::Root::Root Bio::SeqFeatureI Bio::LocationI Bio::SeqI); + +*stop = \&end; +*info = \&name; +*seqname = \&name; +*type = \&primary_tag; +*exons = *sub_SeqFeature = *merged_segments = \&segments; +*method = \&type; +*source = \&source_tag; + +sub target { return; } +sub hit { return; } + +# usage: +# Bio::Graphics::Feature->new( +# -start => 1, +# -end => 100, +# -name => 'fred feature', +# -strand => +1); +# +# Alternatively, use -segments => [ [start,stop],[start,stop]...] +# to create a multisegmented feature. +sub new { + my $class= shift; + $class = ref($class) if ref $class; + my %arg = @_; + + my $self = bless {},$class; + + $arg{-strand} ||= 0; + $self->{strand} = $arg{-strand} ? ($arg{-strand} >= 0 ? +1 : -1) : 0; + $self->{name} = $arg{-name} || $arg{-seqname} || $arg{-display_id} + || $arg{-display_name} || $arg{-id} || $arg{-primary_id}; + $self->{type} = $arg{-type} || 'feature'; + $self->{subtype} = $arg{-subtype} if exists $arg{-subtype}; + $self->{source} = $arg{-source} || $arg{-source_tag} || ''; + $self->{score} = $arg{-score} if exists $arg{-score}; + $self->{start} = $arg{-start}; + $self->{stop} = $arg{-end} || $arg{-stop}; + $self->{ref} = $arg{-ref}; + $self->{class} = $arg{-class} if exists $arg{-class}; + $self->{url} = $arg{-url} if exists $arg{-url}; + $self->{seq} = $arg{-seq} if exists $arg{-seq}; + $self->{phase} = $arg{-phase} if exists $arg{-phase}; + $self->{desc} = $arg{-desc} if exists $arg{-desc}; + $self->{attrib} = $arg{-attributes} if exists $arg{-attributes}; + $self->{factory} = $arg{-factory} if exists $arg{-factory}; + + # fix start, stop + if (defined $self->{stop} && defined $self->{start} + && $self->{stop} < $self->{start}) { + @{$self}{'start','stop'} = @{$self}{'stop','start'}; + $self->{strand} *= -1; + } + + my @segments; + if (my $s = $arg{-segments}) { + $self->add_segment(@$s); + } + $self; +} + +sub add_segment { + my $self = shift; + my $type = $self->{subtype} || $self->{type}; + $self->{segments} ||= []; + + my @segments = @{$self->{segments}}; + + for my $seg (@_) { + if (ref($seg) eq 'ARRAY') { + my ($start,$stop) = @{$seg}; + next unless defined $start && defined $stop; # fixes an obscure bug somewhere above us + my $strand = $self->{strand}; + + if ($start > $stop) { + ($start,$stop) = ($stop,$start); +# $strand *= -1; + $strand = -1; + } + push @segments,$self->new(-start => $start, + -stop => $stop, + -strand => $strand, + -type => $type); + } else { + push @segments,$seg; + } + } + if (@segments) { + local $^W = 0; # some warning of an uninitialized variable... + $self->{segments} = [ sort {$a->start <=> $b->start } @segments ]; + $self->{start} = $self->{segments}[0]->start; + ($self->{stop}) = sort { $b <=> $a } map { $_->end } @segments; + } +} + +sub segments { + my $self = shift; + my $s = $self->{segments} or return wantarray ? () : 0; + @$s; +} +sub score { + my $self = shift; + my $d = $self->{score}; + $self->{score} = shift if @_; + $d; +} +sub primary_tag { shift->{type} } +sub name { + my $self = shift; + my $d = $self->{name}; + $self->{name} = shift if @_; + $d; +} +sub seq_id { shift->ref() } +sub ref { + my $self = shift; + my $d = $self->{ref}; + $self->{ref} = shift if @_; + $d; +} +sub start { + my $self = shift; + my $d = $self->{start}; + $self->{start} = shift if @_; + $d; +} +sub end { + my $self = shift; + my $d = $self->{stop}; + $self->{stop} = shift if @_; + $d; +} +sub strand { + my $self = shift; + my $d = $self->{strand}; + $self->{strand} = shift if @_; + $d; +} +sub length { + my $self = shift; + return $self->end - $self->start + 1; +} + +sub seq { + my $self = shift; + my $dna = exists $self->{seq} ? $self->{seq} : ''; + # $dna .= 'n' x ($self->length - CORE::length($dna)); + return $dna; +} +*dna = \&seq; + +=head2 factory + + Title : factory + Usage : $factory = $obj->factory([$new_factory]) + Function: Returns the feature factory from which this feature was generated. + Mostly for compatibility with weird dependencies in gbrowse. + Returns : A feature factory + Args : None + +=cut + +sub factory { + my $self = shift; + my $d = $self->{factory}; + $self->{factory} = shift if @_; + $d; +} + +=head2 display_name + + Title : display_name + Usage : $id = $obj->display_name or $obj->display_name($newid); + Function: Gets or sets the display id, also known as the common name of + the Seq object. + + The semantics of this is that it is the most likely string + to be used as an identifier of the sequence, and likely to + have "human" readability. The id is equivalent to the LOCUS + field of the GenBank/EMBL databanks and the ID field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id + to embed other information. Bioperl does not use any + embedded information in the ID field, and people are + encouraged to use other mechanisms (accession field for + example, or extending the sequence object) to solve this. + + Notice that $seq->id() maps to this function, mainly for + legacy/convenience issues. + Returns : A string + Args : None or a new id + + +=cut + +sub display_name { shift->name } + +*display_id = \&display_name; + +=head2 accession_number + + Title : accession_number + Usage : $unique_biological_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should return + "unknown". + Returns : A string + Args : None + + +=cut + +sub accession_number { + return 'unknown'; +} + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : a string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : none + Status : Virtual + + +=cut + +sub alphabet{ + return 'dna'; # no way this will be anything other than dna! +} + + + +=head2 desc + + Title : desc + Usage : $seqobj->desc($string) or $seqobj->desc() + Function: Sets or gets the description of the sequence + Example : + Returns : The description + Args : The description or none + + +=cut + +sub desc { + my $self = shift; + my $d = $self->{desc}; + $self->{desc} = shift if @_; + $d; +} + +sub notes { + return shift->desc; +} + +sub low { + my $self = shift; + return $self->start < $self->end ? $self->start : $self->end; +} + +sub high { + my $self = shift; + return $self->start > $self->end ? $self->start : $self->end; +} + +=head2 location + + Title : location + Usage : my $location = $seqfeature->location() + Function: returns a location object suitable for identifying location + of feature on sequence or parent feature + Returns : Bio::LocationI object + Args : none + +=cut + +sub location { + my $self = shift; + require Bio::Location::Split unless Bio::Location::Split->can('new'); + my $location; + if (my @segments = $self->segments) { + $location = Bio::Location::Split->new(); + foreach (@segments) { + $location->add_sub_Location($_); + } + } else { + $location = $self; + } + $location; +} + +sub coordinate_policy { + require Bio::Location::WidestCoordPolicy unless Bio::Location::WidestCoordPolicy->can('new'); + return Bio::Location::WidestCoordPolicy->new(); +} + +sub min_start { shift->low } +sub max_start { shift->low } +sub min_end { shift->high } +sub max_end { shift->high} +sub start_pos_type { 'EXACT' } +sub end_pos_type { 'EXACT' } +sub to_FTstring { + my $self = shift; + my $low = $self->min_start; + my $high = $self->max_end; + return "$low..$high"; +} +sub phase { shift->{phase} } +sub class { + my $self = shift; + my $d = $self->{class}; + $self->{class} = shift if @_; + return defined($d) ? $d : ucfirst $self->method; +} + +sub gff_string { + my $self = shift; + my $name = $self->name; + my $class = $self->class; + my $group = "$class $name" if $name; + my $string; + $string .= join("\t",$self->ref,$self->source||'.',$self->method||'.', + $self->start,$self->stop, + $self->score||'.',$self->strand||'.',$self->phase||'.', + $group); + $string .= "\n"; + foreach ($self->sub_SeqFeature) { + # add missing data if we need it + $_->ref($self->ref) unless defined $_->ref; + $_->name($self->name); + $_->class($self->class); + $string .= $_->gff_string; + } + $string; +} + + +sub db { return } + +sub source_tag { + my $self = shift; + my $d = $self->{source}; + $self->{source} = shift if @_; + $d; +} + +# This probably should be deleted. Not sure why it's here, but might +# have been added for Ace::Sequence::Feature-compliance. +sub introns { + my $self = shift; + return; +} + +sub has_tag { } + +# get/set the configurator (Bio::Graphics::FeatureFile) for this feature +sub configurator { + my $self = shift; + my $d = $self->{configurator}; + $self->{configurator} = shift if @_; + $d; +} + +# get/set the url for this feature +sub url { + my $self = shift; + my $d = $self->{url}; + $self->{url} = shift if @_; + $d; +} + +# make a link +sub make_link { + my $self = shift; + if (my $url = $self->url) { + return $url; + } + + elsif (my $configurator = $self->configurator) { + return $configurator->make_link($self); + } + + else { + return; + } +} + +sub all_tags { + my $self = shift; + return keys %{$self->{attrib}}; +} +sub each_tag_value { + my $self = shift; + my $tag = shift; + my $value = $self->{attrib}{$tag} or return; + return CORE::ref $value ? @{$self->{attrib}{$tag}} + : $self->{attrib}{$tag}; +} + +sub DESTROY { } + +1; + +__END__ + +=head1 SEE ALSO + +L,L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/FeatureFile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/FeatureFile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1343 @@ +package Bio::Graphics::FeatureFile; + +# $Id: FeatureFile.pm,v 1.20.2.2 2003/08/29 19:30:10 lstein Exp $ +# This package parses and renders a simple tab-delimited format for features. +# It is simpler than GFF, but still has a lot of expressive power. +# See __END__ for the file format + +=head1 NAME + +Bio::Graphics::FeatureFile -- A set of Bio::Graphics features, stored in a file + +=head1 SYNOPSIS + + use Bio::Graphics::FeatureFile; + my $data = Bio::Graphics::FeatureFile->new(-file => 'features.txt'); + + + # create a new panel and render contents of the file onto it + my $panel = $data->new_panel; + my $tracks_rendered = $data->render($panel); + + # or do it all in one step + my ($tracks_rendered,$panel) = $data->render; + + # for more control, render tracks individually + my @feature_types = $data->types; + for my $type (@feature_types) { + my $features = $data->features($type); + my %options = $data->style($type); + $panel->add_track($features,%options); # assuming we have a Bio::Graphics::Panel + } + + # get individual settings + my $est_fg_color = $data->setting(EST => 'fgcolor'); + + # or create the FeatureFile by hand + + # add a type + $data->add_type(EST => {fgcolor=>'blue',height=>12}); + + # add a feature + my $feature = Bio::Graphics::Feature->new( + # params + ); # or some other SeqI + $data->add_feature($feature=>'EST'); + +=head1 DESCRIPTION + +The Bio::Graphics::FeatureFile module reads and parses files that +describe sequence features and their renderings. It accepts both GFF +format and a more human-friendly file format described below. Once a +FeatureFile object has been initialized, you can interrogate it for +its consistuent features and their settings, or render the entire file +onto a Bio::Graphics::Panel. + +This moduel is a precursor of Jason Stajich's +Bio::Annotation::Collection class, and fulfills a similar function of +storing a collection of sequence features. However, it also stores +rendering information about the features, and does not currently +follow the CollectionI interface. + +=head2 The File Format + +There are two types of entry in the file format: feature entries, and +formatting entries. They can occur in any order. See the Appendix +for a full example. + +Feature entries can take several forms. At their simplest, they look +like this: + + Gene B0511.1 516-11208 + +This means that a feature of type "Gene" and name "B0511.1" occupies +the range between bases 516 and 11208. A range can be specified +equally well using a hyphen, or two dots as in 516..11208. Negative +coordinates are allowed, such as -187..1000. + +A discontinuous range ("split location") uses commas to separate the +ranges. For example: + + Gene B0511.1 516-619,3185-3294,10946-11208 + +Alternatively, the locations can be split by repeating the features +type and name on multiple adjacent lines: + + Gene B0511.1 516-619 + Gene B0511.1 3185-3294 + Gene B0511.1 10946-11208 + +A comment can be added to features by adding a fourth column. These +comments will be rendered as under-the-glyph descriptions by those +glyphs that honor descriptions: + + Gene B0511.1 516-619,3185-3294,10946-11208 "Putative primase" + +Columns are separated using whitespace, not (necessarily) tabs. +Embedded whitespace can be escaped using quote marks or backslashes in +the same way as in the shell: + + 'Putative Gene' my\ favorite\ gene 516-11208 + +Features can be grouped so that they are rendered by the "group" glyph +(so far this has only been used to relate 5' and 3' ESTs). To start a +group, create a two-column feature entry showing the group type and a +name for the group. Follow this with a list of feature entries with a +blank type. For example: + + EST yk53c10 + yk53c10.3 15000-15500,15700-15800 + yk53c10.5 18892-19154 + +This example is declaring that the ESTs named yk53c10.3 and yk53c10.5 +belong to the same group named yk53c10. + +=cut + +use strict; +use Bio::Graphics::Feature; +use Bio::DB::GFF::Util::Rearrange; +use Carp; +use IO::File; +use Text::Shellwords; + +# default colors for unconfigured features +my @COLORS = qw(cyan blue red yellow green wheat turquoise orange); +use constant WIDTH => 600; + +=head2 METHODS + +=over 4 + +=item $features = Bio::Graphics::FeatureFile-Enew(@args) + +Create a new Bio::Graphics::FeatureFile using @args to initialize the +object. Arguments are -name=Evalue pairs: + + Argument Value + -------- ----- + + -file Read data from a file path or filehandle. Use + "-" to read from standard input. + + -text Read data from a text scalar. + + -map_coords Coderef containing a subroutine to use for remapping + all coordinates. + + -smart_features Flag indicating that the features created by this + module should be made aware of the FeatureFile + object by calling their configurator() method. + + -safe Indicates that the contents of this file is trusted. + Any option value that begins with the string "sub {" + or \&subname will be evaluated as a code reference. + +The -file and -text arguments are mutually exclusive, and -file will +supersede the other if both are present. + +-map_coords points to a coderef with the following signature: + + ($newref,[$start1,$end1],[$start2,$end2]....) + = coderef($ref,[$start1,$end1],[$start2,$end2]...) + +See the Bio::Graphics::Browser (part of the generic genome browser +package) for an illustration of how to use this to do wonderful stuff. + +The -smart_features flag is used by the generic genome browser to +provide features with a way to access the link-generation code. See +gbrowse for how this works. + +If the file is trusted, and there is an option named "init_code" in +the [GENERAL] section of the file, it will be evaluated as perl code +immediately after parsing. You can use this to declare global +variables and subroutines for use in option values. + +=back + +=cut + +# args array: +# -file => parse from a file (- allowed for ARGV) +# -text => parse from a text scalar +# -map_coords => code ref to do coordinate mapping +# called with ($ref,[$start1,$stop1],[$start2,$stop2]...) +# returns ($newref,$new_coord1,$new_coord2...) + +sub new { + my $class = shift; + my %args = @_; + my $self = bless { + config => {}, + features => {}, + seenit => {}, + types => [], + max => undef, + min => undef, + stat => [], + refs => {}, + safe => undef, + },$class; + $self->{coordinate_mapper} = $args{-map_coords} + if exists $args{-map_coords} && ref($args{-map_coords}) eq 'CODE'; + $self->{smart_features} = $args{-smart_features} if exists $args{-smart_features}; + $self->{safe} = $args{-safe} if exists $args{-safe}; + + # call with + # -file + # -text + my $fh; + if (my $file = $args{-file}) { + no strict 'refs'; + if (defined fileno($file)) { + $fh = $file; + } elsif ($file eq '-') { + $self->parse_argv(); + } else { + $fh = IO::File->new($file) or croak("Can't open $file: $!\n"); + } + $self->parse_file($fh); + } elsif (my $text = $args{-text}) { + $self->parse_text($text); + } + close($fh) or warn "Error closing file: $!" if $fh; + $self; +} + +# render our features onto a panel using configuration data +# return the number of tracks inserted + +=over 4 + +=item ($rendered,$panel) = $features-Erender([$panel]) + +Render features in the data set onto the indicated +Bio::Graphics::Panel. If no panel is specified, creates one. + +In a scalar context returns the number of tracks rendered. In a list +context, returns a two-element list containing the number of features +rendered and the panel. Use this form if you want the panel created +for you. + +=back + +=cut + +#" + +sub render { + my $self = shift; + my $panel = shift; + my ($position_to_insert,$options,$max_bump,$max_label) = @_; + + $panel ||= $self->new_panel; + + # count up number of tracks inserted + my $tracks = 0; + my $color; + my %types = map {$_=>1} $self->configured_types; + + my @configured_types = grep {exists $self->{features}{$_}} $self->configured_types; + my @unconfigured_types = sort grep {!exists $types{$_}} $self->types; + + my @base_config = $self->style('general'); + + my @override = (); + if ($options && ref $options eq 'HASH') { + @override = %$options; + } else { + $options ||= 0; + if ($options == 1) { # compact + push @override,(-bump => 0,-label=>0); + } elsif ($options == 2) { #expanded + push @override,(-bump=>1); + } elsif ($options == 3) { #expand and label + push @override,(-bump=>1,-label=>1); + } elsif ($options == 4) { #hyperexpand + push @override,(-bump => 2); + } elsif ($options == 5) { #hyperexpand and label + push @override,(-bump => 2,-label=>1); + } + } + + for my $type (@configured_types,@unconfigured_types) { + my $features = $self->features($type); + my @auto_bump; + push @auto_bump,(-bump => @$features < $max_bump) if defined $max_bump; + push @auto_bump,(-label => @$features < $max_label) if defined $max_label; + + my @config = ( -glyph => 'segments', # really generic + -bgcolor => $COLORS[$color++ % @COLORS], + -label => 1, + -key => $type, + @auto_bump, + @base_config, # global + $self->style($type), # feature-specific + @override, + ); + if (defined($position_to_insert)) { + $panel->insert_track($position_to_insert++,$features,@config); + } else { + $panel->add_track($features,@config); + } + $tracks++; + } + return wantarray ? ($tracks,$panel) : $tracks; +} + +sub _stat { + my $self = shift; + my $fh = shift; + $self->{stat} = [stat($fh)]; +} + +=over 4 + +=item $error = $features-Eerror([$error]) + +Get/set the current error message. + +=back + +=cut + +sub error { + my $self = shift; + my $d = $self->{error}; + $self->{error} = shift if @_; + $d; +} + +=over 4 + +=item $smart_features = $features-Esmart_features([$flag] + +Get/set the "smart_features" flag. If this is set, then any features +added to the featurefile object will have their configurator() method +called using the featurefile object as the argument. + +=back + +=cut + +sub smart_features { + my $self = shift; + my $d = $self->{smart_features}; + $self->{smart_features} = shift if @_; + $d; +} + +sub parse_argv { + my $self = shift; + + $self->init_parse; + while (<>) { + chomp; + $self->parse_line($_); + } + $self->finish_parse; +} + +sub parse_file { + my $self = shift; + my $fh = shift or return; + $self->_stat($fh); + + $self->init_parse; + while (<$fh>) { + chomp; + $self->parse_line($_); + } + $self->finish_parse; +} + +sub parse_text { + my $self = shift; + my $text = shift; + + $self->init_parse; + foreach (split /\015?\012|\015\012?/,$text) { + $self->parse_line($_); + } + $self->finish_parse; +} + +sub parse_line { + my $self = shift; + local $_ = shift; + + s/\015//g; # get rid of carriage returns left over by MS-DOS/Windows systems + + return if /^\s*[\#]/; + + if (/^\s+(.+)/ && $self->{current_tag}) { # continuation line + my $value = $1; + my $cc = $self->{current_config} ||= 'general'; # in case no configuration named + $self->{config}{$cc}{$self->{current_tag}} .= ' ' . $value; + # respect newlines in code subs + $self->{config}{$cc}{$self->{current_tag}} .= "\n" + if $self->{config}{$cc}{$self->{current_tag}}=~ /^sub\s*\{/; + return; + } + + if (/^\s*\[([^\]]+)\]/) { # beginning of a configuration section + my $label = $1; + my $cc = $label =~ /^(general|default)$/i ? 'general' : $label; # normalize + push @{$self->{types}},$cc unless $cc eq 'general'; + $self->{current_config} = $cc; + return; + } + + if (/^([\w: -]+?)\s*=\s*(.*)/) { # key value pair within a configuration section + my $tag = lc $1; + my $cc = $self->{current_config} ||= 'general'; # in case no configuration named + my $value = defined $2 ? $2 : ''; + $self->{config}{$cc}{$tag} = $value; + $self->{current_tag} = $tag; + return; + } + + + if (/^$/) { # empty line + undef $self->{current_tag}; + return; + } + + # parse data lines + my @tokens = eval { shellwords($_||'') }; + unshift @tokens,'' if /^\s+/; + + # close any open group + if (length $tokens[0] > 0 && $self->{group}) { + push @{$self->{features}{$self->{grouptype}}},$self->{group}; + undef $self->{group}; + undef $self->{grouptype}; + } + + if (@tokens < 3) { # short line; assume a group identifier + my $type = shift @tokens; + my $name = shift @tokens; + $self->{group} = Bio::Graphics::Feature->new(-name => $name, + -type => 'group'); + $self->{grouptype} = $type; + return; + } + + my($ref,$type,$name,$strand,$bounds,$description,$url); + + if (@tokens >= 8) { # conventional GFF file + my ($r,$source,$method,$start,$stop,$score,$s,$phase,@rest) = @tokens; + my $group = join ' ',@rest; + $type = join(':',$method,$source); + $bounds = join '..',$start,$stop; + $strand = $s; + if ($group) { + my ($notes,@notes); + (undef,$self->{groupname},undef,undef,$notes) = split_group($group); + foreach (@$notes) { + if (m!^(http|ftp)://!) { $url = $_ } else { push @notes,$_ } + } + $description = join '; ',@notes if @notes; + } + $name ||= $self->{group}->display_id if $self->{group}; + $ref = $r; + } + + elsif ($tokens[2] =~ /^([+-.]|[+-]?[01])$/) { # old simplified version + ($type,$name,$strand,$bounds,$description,$url) = @tokens; + } else { # new simplified version + ($type,$name,$bounds,$description,$url) = @tokens; + } + + $type ||= $self->{grouptype} || ''; + $type =~ s/\s+$//; # get rid of excess whitespace + + # the reference is specified by the GFF reference line first, + # the last reference line we saw second, + # or the reference line in the "general" section. + { + local $^W = 0; + $ref ||= $self->{config}{$self->{current_config}}{'reference'} + || $self->{config}{general}{reference}; + } + $self->{refs}{$ref}++ if defined $ref; + + my @parts = map { [/(-?\d+)(?:-|\.\.)(-?\d+)/]} split /(?:,| )\s*/,$bounds; + + foreach (@parts) { # max and min calculation, sigh... + $self->{min} = $_->[0] if !defined $self->{min} || $_->[0] < $self->{min}; + $self->{max} = $_->[1] if !defined $self->{max} || $_->[1] > $self->{max}; + } + + if ($self->{coordinate_mapper} && $ref) { + ($ref,@parts) = $self->{coordinate_mapper}->($ref,@parts); + return unless $ref; + } + + $type = '' unless defined $type; + $name = '' unless defined $name; + + # attribute handling + my %attributes; + my $score; + if (defined $description && $description =~ /\w+=\w+/) { # attribute line + my @attributes = split /;\s*/,$description; + foreach (@attributes) { + my ($name,$value) = split /=/,$_,2; + Bio::Root::Root->throw(qq("$_" is not a valid attribute=value pair)) unless defined $value; + _unescape($name); + my @values = split /,/,$value; + _unescape(@values); + if ($name =~ /^(note|description)/) { + $description = "@values"; + } elsif ($name eq 'url') { + $url = $value; + } elsif ($name eq 'score') { + $score = $value; + } else { + push @{$attributes{$name}},@values; + } + } + } + + # either create a new feature or add a segment to it + if (my $feature = $self->{seenit}{$type,$name}) { + + # create a new first part + if (!$feature->segments) { + $feature->add_segment(Bio::Graphics::Feature->new(-type => $feature->type, + -strand => $feature->strand, + -start => $feature->start, + -end => $feature->end)); + } + $feature->add_segment(@parts); + } + + else { + my @coordinates = @parts > 1 ? (-segments => \@parts) : (-start=>$parts[0][0],-end=>$parts[0][1]); + $feature = $self->{seenit}{$type,$name} = + Bio::Graphics::Feature->new(-name => $name, + -type => $type, + $strand ? (-strand => make_strand($strand)) : (), + defined $score ? (-score=>$score) : (), + -desc => $description, + -ref => $ref, + -attributes => \%attributes, + defined($url) ? (-url => $url) : (), + @coordinates, + ); + $feature->configurator($self) if $self->smart_features; + if ($self->{group}) { + $self->{group}->add_segment($feature); + } else { + push @{$self->{features}{$type}},$feature; # for speed; should use add_feature() instead + } + } +} + +sub _unescape { + foreach (@_) { + tr/+/ /; # pluses become spaces + s/%([0-9a-fA-F]{2})/chr hex($1)/g; + } + @_; +} + +=over 4 + +=item $features-Eadd_feature($feature [=E$type]) + +Add a new Bio::FeatureI object to the set. If $type is specified, the +object will be added with the indicated type. Otherwise, the +feature's primary_tag() method will be invoked to get the type. + +=back + +=cut + +# add a feature of given type to our list +# we use the primary_tag() method +sub add_feature { + my $self = shift; + my ($feature,$type) = @_; + $type = $feature->primary_tag unless defined $type; + push @{$self->{features}{$type}},$feature; +} + + +=over 4 + +=item $features-Eadd_type($type=E$hashref) + +Add a new feature type to the set. The type is a string, such as +"EST". The hashref is a set of key=Evalue pairs indicating options to +set on the type. Example: + + $features->add_type(EST => { glyph => 'generic', fgcolor => 'blue'}) + +When a feature of type "EST" is rendered, it will use the generic +glyph and have a foreground color of blue. + +=back + +=cut + +# Add a type to the list. Hash values are used for key/value pairs +# in the configuration. Call as add_type($type,$configuration) where +# $configuration is a hashref. +sub add_type { + my $self = shift; + my ($type,$type_configuration) = @_; + my $cc = $type =~ /^(general|default)$/i ? 'general' : $type; # normalize + push @{$self->{types}},$cc unless $cc eq 'general' or $self->{config}{$cc}; + if (defined $type_configuration) { + for my $tag (keys %$type_configuration) { + $self->{config}{$cc}{lc $tag} = $type_configuration->{$tag}; + } + } +} + + + +=over 4 + +=item $features-Eset($type,$tag,$value) + +Change an individual option for a particular type. For example, this +will change the foreground color of EST features to my favorite color: + + $features->set('EST',fgcolor=>'chartreuse') + +=back + +=cut + +# change configuration of a type. Call as set($type,$tag,$value) +# $type will be added if not already there. +sub set { + my $self = shift; + croak("Usage: \$featurefile->set(\$type,\$tag,\$value\n") + unless @_ == 3; + my ($type,$tag,$value) = @_; + unless ($self->{config}{$type}) { + return $self->add_type($type,{$tag=>$value}); + } else { + $self->{config}{$type}{lc $tag} = $value; + } +} + +# break circular references +sub destroy { + my $self = shift; + delete $self->{features}; +} + +sub DESTROY { shift->destroy(@_) } + +=over 4 + +=item $value = $features-Esetting($stanza =E $option) + +In the two-element form, the setting() method returns the value of an +option in the configuration stanza indicated by $stanza. For example: + + $value = $features->setting(general => 'height') + +will return the value of the "height" option in the [general] stanza. + +Call with one element to retrieve all the option names in a stanza: + + @options = $features->setting('general'); + +Call with no elements to retrieve all stanza names: + + @stanzas = $features->setting; + +=back + +=cut + +sub setting { + my $self = shift; + if ($self->safe) { + $self->code_setting(@_); + } else { + $self->_setting(@_); + } +} + +# return configuration information +# arguments are ($type) => returns tags for type +# ($type=>$tag) => returns values of tag on type +sub _setting { + my $self = shift; + my $config = $self->{config} or return; + return keys %{$config} unless @_; + return keys %{$config->{$_[0]}} if @_ == 1; + return $config->{$_[0]}{$_[1]} if @_ > 1; +} + + +=over 4 + +=item $value = $features-Ecode_setting($stanza=E$option); + +This works like setting() except that it is also able to evaluate code +references. These are options whose values begin with the characters +"sub {". In this case the value will be passed to an eval() and the +resulting codereference returned. Use this with care! + +=back + +=cut + +sub code_setting { + my $self = shift; + my $section = shift; + my $option = shift; + + my $setting = $self->_setting($section=>$option); + return unless defined $setting; + return $setting if ref($setting) eq 'CODE'; + if ($setting =~ /^\\&(\w+)/) { # coderef in string form + my $subroutine_name = $1; + my $package = $self->base2package; + my $codestring = "\\&${package}\:\:${subroutine_name}"; + my $coderef = eval $codestring; + warn $@ if $@; + $self->set($section,$option,$coderef); + return $coderef; + } + elsif ($setting =~ /^sub\s*\{/) { + my $coderef = eval $setting; + warn $@ if $@; + $self->set($section,$option,$coderef); + return $coderef; + } else { + return $setting; + } +} + +=over 4 + +=item $flag = $features-Esafe([$flag]); + +This gets or sets and "safe" flag. If the safe flag is set, then +calls to setting() will invoke code_setting(), allowing values that +begin with the string "sub {" to be interpreted as anonymous +subroutines. This is a potential security risk when used with +untrusted files of features, so use it with care. + +=back + +=cut + +sub safe { + my $self = shift; + my $d = $self->{safe}; + $self->{safe} = shift if @_; + $self->evaluate_coderefs if $self->{safe} && !$d; + $d; +} + + +=over 4 + +=item @args = $features-Estyle($type) + +Given a feature type, returns a list of track configuration arguments +suitable for suitable for passing to the +Bio::Graphics::Panel-Eadd_track() method. + +=back + +=cut + +# turn configuration into a set of -name=>value pairs suitable for add_track() +sub style { + my $self = shift; + my $type = shift; + + my $config = $self->{config} or return; + my $hashref = $config->{$type} or return; + + return map {("-$_" => $hashref->{$_})} keys %$hashref; +} + + +=over 4 + +=item $glyph = $features-Eglyph($type); + +Return the name of the glyph corresponding to the given type (same as +$features-Esetting($type=E'glyph')). + +=back + +=cut + +# retrieve just the glyph part of the configuration +sub glyph { + my $self = shift; + my $type = shift; + my $config = $self->{config} or return; + my $hashref = $config->{$type} or return; + return $hashref->{glyph}; +} + + +=over 4 + +=item @types = $features-Econfigured_types() + +Return a list of all the feature types currently known to the feature +file set. Roughly equivalent to: + + @types = grep {$_ ne 'general'} $features->setting; + +=back + +=cut + +# return list of configured types, in proper order +sub configured_types { + my $self = shift; + my $types = $self->{types} or return; + return @{$types}; +} + +=over 4 + +=item @types = $features-Etypes() + +This is similar to the previous method, but will return *all* feature +types, including those that are not configured with a stanza. + +=back + +=cut + +sub types { + my $self = shift; + my $features = $self->{features} or return; + return keys %{$features}; +} + + +=over 4 + +=item $features = $features-Efeatures($type) + +Return a list of all the feature types of type "$type". If the +featurefile object was created by parsing a file or text scalar, then +the features will be of type Bio::Graphics::Feature (which follow the +Bio::FeatureI interface). Otherwise the list will contain objects of +whatever type you added with calls to add_feature(). + +Two APIs: + + 1) original API: + + # Reference to an array of all features of type "$type" + $features = $features-Efeatures($type) + + # Reference to an array of all features of all types + $features = $features-Efeatures() + + # A list when called in a list context + @features = $features-Efeatures() + + 2) Bio::Das::SegmentI API: + + @features = $features-Efeatures(-type=>['list','of','types']); + + # variants + $features = $features-Efeatures(-type=>['list','of','types']); + $features = $features-Efeatures(-type=>'a type'); + $iterator = $features-Efeatures(-type=>'a type',-iterator=>1); + +=back + +=cut + +# return features +sub features { + my $self = shift; + my ($types,$iterator,@rest) = $_[0]=~/^-/ ? rearrange([['TYPE','TYPES']],@_) : (\@_); + $types = [$types] if $types && !ref($types); + my @types = ($types && @$types) ? @$types : $self->types; + my @features = map {@{$self->{features}{$_}}} @types; + if ($iterator) { + require Bio::Graphics::FeatureFile::Iterator; + return Bio::Graphics::FeatureFile::Iterator->new(\@features); + } + return wantarray ? @features : \@features; +} + +=over 4 + +=item @features = $features-Efeatures($type) + +Return a list of all the feature types of type "$type". If the +featurefile object was created by parsing a file or text scalar, then +the features will be of type Bio::Graphics::Feature (which follow the +Bio::FeatureI interface). Otherwise the list will contain objects of +whatever type you added with calls to add_feature(). + +=back + +=cut + +sub make_strand { + local $^W = 0; + return +1 if $_[0] =~ /^\+/ || $_[0] > 0; + return -1 if $_[0] =~ /^\-/ || $_[0] < 0; + return 0; +} + +=head2 get_seq_stream + + Title : get_seq_stream + Usage : $stream = $s->get_seq_stream(@args) + Function: get a stream of features that overlap this segment + Returns : a Bio::SeqIO::Stream-compliant stream + Args : see below + Status : Public + +This is the same as feature_stream(), and is provided for Bioperl +compatibility. Use like this: + + $stream = $s->get_seq_stream('exon'); + while (my $exon = $stream->next_seq) { + print $exon->start,"\n"; + } + +=cut + +sub get_seq_stream { + my $self = shift; + local $^W = 0; + my @args = $_[0] =~ /^-/ ? (@_,-iterator=>1) : (-types=>\@_,-iterator=>1); + $self->features(@args); +} + +=head2 get_feature_stream(), top_SeqFeatures(), all_SeqFeatures() + +Provided for compatibility with older BioPerl and/or Bio::DB::GFF +APIs. + +=cut + +*get_feature_stream = \&get_seq_stream; +*top_SeqFeatures = *all_SeqFeatures = \&features; + + +=over 4 + +=item @refs = $features-Erefs + +Return the list of reference sequences referred to by this data file. + +=back + +=cut + +sub refs { + my $self = shift; + my $refs = $self->{refs} or return; + keys %$refs; +} + +=over 4 + +=item $min = $features-Emin + +Return the minimum coordinate of the leftmost feature in the data set. + +=back + +=cut + +sub min { shift->{min} } + +=over 4 + +=item $max = $features-Emax + +Return the maximum coordinate of the rightmost feature in the data set. + +=back + +=cut + +sub max { shift->{max} } + +sub init_parse { + my $s = shift; + + $s->{seenit} = {}; + $s->{max} = $s->{min} = undef; + $s->{types} = []; + $s->{features} = {}; + $s->{config} = {} +} + +sub finish_parse { + my $s = shift; + $s->evaluate_coderefs if $s->safe; + $s->{seenit} = {}; +} + +sub evaluate_coderefs { + my $self = shift; + $self->initialize_code(); + for my $s ($self->_setting) { + for my $o ($self->_setting($s)) { + $self->code_setting($s,$o); + } + } +} + +sub initialize_code { + my $self = shift; + my $package = $self->base2package; + my $init_code = $self->_setting(general => 'init_code') or return; + my $code = "package $package; $init_code; 1;"; + eval $code; + warn $@ if $@; +} + +sub base2package { + my $self = shift; + (my $package = overload::StrVal($self)) =~ s/[^a-z0-9A-Z_]/_/g; + $package =~ s/^[^a-zA-Z_]/_/g; + $package; +} + +sub split_group { + my $group = shift; + + $group =~ s/\\;/$;/g; # protect embedded semicolons in the group + $group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/g; + my @groups = split(/\s*;\s*/,$group); + foreach (@groups) { s/$;/;/g } + + my ($gclass,$gname,$tstart,$tstop,@notes); + + foreach (@groups) { + + my ($tag,$value) = /^(\S+)\s*(.*)/; + $value =~ s/\\t/\t/g; + $value =~ s/\\r/\r/g; + $value =~ s/^"//; + $value =~ s/"$//; + + # if the tag is "Note", then we add this to the + # notes array + if ($tag eq 'Note') { # just a note, not a group! + push @notes,$value; + } + + # if the tag eq 'Target' then the class name is embedded in the ID + # (the GFF format is obviously screwed up here) + elsif ($tag eq 'Target' && $value =~ /([^:\"]+):([^\"]+)/) { + ($gclass,$gname) = ($1,$2); + ($tstart,$tstop) = /(\d+) (\d+)/; + } + + elsif (!$value) { + push @notes,$tag; # e.g. "Confirmed_by_EST" + } + + # otherwise, the tag and value correspond to the + # group class and name + else { + ($gclass,$gname) = ($tag,$value); + } + } + + return ($gclass,$gname,$tstart,$tstop,\@notes); +} + +# create a panel if needed +sub new_panel { + my $self = shift; + + require Bio::Graphics::Panel; + + # general configuration of the image here + my $width = $self->setting(general => 'pixels') + || $self->setting(general => 'width') + || WIDTH; + + my ($start,$stop); + my $range_expr = '(-?\d+)(?:-|\.\.)(-?\d+)'; + + if (my $bases = $self->setting(general => 'bases')) { + ($start,$stop) = $bases =~ /([\d-]+)(?:-|\.\.)([\d-]+)/; + } + + if (!defined $start || !defined $stop) { + $start = $self->min unless defined $start; + $stop = $self->max unless defined $stop; + } + + my $new_segment = Bio::Graphics::Feature->new(-start=>$start,-stop=>$stop); + my $panel = Bio::Graphics::Panel->new(-segment => $new_segment, + -width => $width, + -key_style => 'between'); + $panel; +} + +=over 4 + +=item $mtime = $features-Emtime + +=item $atime = $features-Eatime + +=item $ctime = $features-Ectime + +=item $size = $features-Esize + +Returns stat() information about the data file, for featurefile +objects created using the -file option. Size is in bytes. mtime, +atime, and ctime are in seconds since the epoch. + +=back + +=cut + +sub mtime { + my $self = shift; + my $d = $self->{m_time} || $self->{stat}->[9]; + $self->{m_time} = shift if @_; + $d; +} +sub atime { shift->{stat}->[8]; } +sub ctime { shift->{stat}->[10]; } +sub size { shift->{stat}->[7]; } + +=over 4 + +=item $label = $features-Efeature2label($feature) + +Given a feature, determines the configuration stanza that bests +describes it. Uses the feature's type() method if it has it (DasI +interface) or its primary_tag() method otherwise. + +=back + +=cut + +sub feature2label { + my $self = shift; + my $feature = shift; + my $type = eval {$feature->type} || $feature->primary_tag or return; + (my $basetype = $type) =~ s/:.+$//; + my @labels = $self->type2label($type); + @labels = $self->type2label($basetype) unless @labels; + @labels = ($type) unless @labels;; + wantarray ? @labels : $labels[0]; +} + +=over 4 + +=item $link = $features-Emake_link($feature) + +Given a feature, tries to generate a URL to link out from it. This +uses the 'link' option, if one is present. This method is a +convenience for the generic genome browser. + +=back + +=cut + +sub make_link { + my $self = shift; + my $feature = shift; + for my $label ($self->feature2label($feature)) { + my $link = $self->setting($label,'link'); + $link = $self->setting(general=>'link') unless defined $link; + next unless $link; + return $self->link_pattern($link,$feature); + } + return; +} + +sub link_pattern { + my $self = shift; + my ($pattern,$feature,$panel) = @_; + require CGI unless defined &CGI::escape; + my $n; + $pattern =~ s/\$(\w+)/ + CGI::escape( + $1 eq 'ref' ? ($n = $feature->location->seq_id) && "$n" + : $1 eq 'name' ? ($n = $feature->display_name) && "$n" # workaround broken CGI.pm + : $1 eq 'class' ? eval {$feature->class} || '' + : $1 eq 'type' ? eval {$feature->method} || $feature->primary_tag + : $1 eq 'method' ? eval {$feature->method} || $feature->primary_tag + : $1 eq 'source' ? eval {$feature->source} || $feature->source_tag + : $1 eq 'start' ? $feature->start + : $1 eq 'end' ? $feature->end + : $1 eq 'stop' ? $feature->end + : $1 eq 'segstart' ? $panel->start + : $1 eq 'segend' ? $panel->end + : $1 eq 'description' ? eval {join '',$feature->notes} || '' + : $1 + ) + /exg; + return $pattern; +} + +# given a feature type, return its label(s) +sub type2label { + my $self = shift; + my $type = shift; + $self->{_type2label} ||= $self->invert_types; + my @labels = keys %{$self->{_type2label}{$type}}; + wantarray ? @labels : $labels[0] +} + +sub invert_types { + my $self = shift; + my $config = $self->{config} or return; + my %inverted; + for my $label (keys %{$config}) { + my $feature = $config->{$label}{feature} or next; + foreach (shellwords($feature||'')) { + $inverted{$_}{$label}++; + } + } + \%inverted; +} + +=over 4 + +=item $citation = $features-Ecitation($feature) + +Given a feature, tries to generate a citation for it, using the +"citation" option if one is present. This method is a convenience for +the generic genome browser. + +=back + +=cut + +# This routine returns the "citation" field. It is here in order to simplify the logic +# a bit in the generic browser +sub citation { + my $self = shift; + my $feature = shift || 'general'; + return $self->setting($feature=>'citation'); +} + +=over 4 + +=item $name = $features-Ename([$feature]) + +Get/set the name of this feature set. This is a convenience method +useful for keeping track of multiple feature sets. + +=back + +=cut + +# give this feature file a nickname +sub name { + my $self = shift; + my $d = $self->{name}; + $self->{name} = shift if @_; + $d; +} + +1; + +__END__ + +=head1 Appendix -- Sample Feature File + + # file begins + [general] + pixels = 1024 + bases = 1-20000 + reference = Contig41 + height = 12 + + [Cosmid] + glyph = segments + fgcolor = blue + key = C. elegans conserved regions + + [EST] + glyph = segments + bgcolor= yellow + connector = dashed + height = 5; + + [FGENESH] + glyph = transcript2 + bgcolor = green + description = 1 + + Cosmid B0511 516-619 + Cosmid B0511 3185-3294 + Cosmid B0511 10946-11208 + Cosmid B0511 13126-13511 + Cosmid B0511 11394-11539 + EST yk260e10.5 15569-15724 + EST yk672a12.5 537-618,3187-3294 + EST yk595e6.5 552-618 + EST yk595e6.5 3187-3294 + EST yk846e07.3 11015-11208 + EST yk53c10 + yk53c10.3 15000-15500,15700-15800 + yk53c10.5 18892-19154 + EST yk53c10.5 16032-16105 + SwissProt PECANEX 13153-13656 Swedish fish + FGENESH Predicted gene 1 1-205,518-616,661-735,3187-3365,3436-3846 Pfam domain + FGENESH Predicted gene 2 5513-6497,7968-8136,8278-8383,8651-8839,9462-9515,10032-10705,10949-11340,11387-11524,11765-12067,12876-13577,13882-14121,14169-14535,15006-15209,15259-15462,15513-15753,15853-16219 Mysterious + FGENESH Predicted gene 3 16626-17396,17451-17597 + FGENESH Predicted gene 4 18459-18722,18882-19176,19221-19513,19572-19835 Transmembrane protein + # file ends + +=head1 SEE ALSO + +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1474 @@ +package Bio::Graphics::Glyph; +use GD; + +use strict; +use Carp 'croak'; +use constant BUMP_SPACING => 2; # vertical distance between bumped glyphs + + +my %LAYOUT_COUNT; + +# the CM1 and CM2 constants control the size of the hash used to +# detect collisions. +use constant CM1 => 200; # big bin, x axis +use constant CM2 => 50; # big bin, y axis +use constant CM3 => 50; # small bin, x axis +use constant CM4 => 50; # small bin, y axis + +use constant QUILL_INTERVAL => 8; # number of pixels between Jim Kent style intron "quills" + +# a bumpable graphical object that has bumpable graphical subparts + +# args: -feature => $feature_object (may contain subsequences) +# -factory => $factory_object (called to create glyphs for subsequences) +# In this scheme, the factory decides based on stylesheet information what glyph to +# draw and what configurations options to us. This allows for heterogeneous tracks. +sub new { + my $class = shift; + my %arg = @_; + + my $feature = $arg{-feature} or die "No feature"; + my $factory = $arg{-factory} || $class->default_factory; + my $level = $arg{-level} || 0; + my $flip = $arg{-flip}; + + my $self = bless {},$class; + $self->{feature} = $feature; + $self->{factory} = $factory; + $self->{level} = $level; + $self->{flip}++ if $flip; + $self->{top} = 0; + + my @subglyphs; + my @subfeatures = $self->subseq($feature); + + if (@subfeatures) { + + # dynamic glyph resolution + @subglyphs = map { $_->[0] } + sort { $a->[1] <=> $b->[1] } + map { [$_, $_->left ] } + $factory->make_glyph($level+1,@subfeatures); + + $self->{parts} = \@subglyphs; + } + + my ($start,$stop) = ($self->start, $self->stop); + if (defined $start && defined $stop) { + ($start,$stop) = ($stop,$start) if $start > $stop; # sheer paranoia + # the +1 here is critical for allowing features to meet nicely at nucleotide resolution + my ($left,$right) = $factory->map_pt($start,$stop+1); + $self->{left} = $left; + $self->{width} = $right - $left + 1; + } + if (@subglyphs) { + my $l = $subglyphs[0]->left; + $self->{left} = $l if !defined($self->{left}) || $l < $self->{left}; + my $right = ( + sort { $b<=>$a } + map {$_->right} @subglyphs)[0]; + my $w = $right - $self->{left} + 1; + $self->{width} = $w if !defined($self->{width}) || $w > $self->{width}; + } + + $self->{point} = $arg{-point} ? $self->height : undef; + #Handle glyphs that don't actually fill their space, but merely mark a point. + #They need to have their collision bounds altered. We will (for now) + #hard code them to be in the center of their feature. +# note: this didn't actually seem to work properly, all features were aligned on +# their right edges. It works to do it in individual point-like glyphs such as triangle. +# if($self->option('point')){ +# my ($left,$right) = $factory->map_pt($self->start,$self->stop); +# my $center = int(($left+$right)/2 + 0.5); + +# $self->{width} = $self->height; +# $self->{left} = $center - ($self->{width}); +# $self->{right} = $center + ($self->{width}); +# } + + return $self; +} + +sub parts { + my $self = shift; + return unless $self->{parts}; + return wantarray ? @{$self->{parts}} : $self->{parts}; +} + +sub feature { shift->{feature} } +sub factory { shift->{factory} } +sub panel { shift->factory->panel } +sub point { shift->{point} } +sub scale { shift->factory->scale } +sub start { + my $self = shift; + return $self->{start} if exists $self->{start}; + $self->{start} = $self->{flip} ? $self->panel->end + 1 - $self->{feature}->end : $self->{feature}->start; + + # handle the case of features whose endpoints are undef + # (this happens with wormbase clones where one or more clone end is not defined) + # in this case, we set the start to one minus the beginning of the panel + $self->{start} = $self->panel->offset - 1 unless defined $self->{start}; + + return $self->{start}; +} +sub stop { + my $self = shift; + return $self->{stop} if exists $self->{stop}; + $self->{stop} = $self->{flip} ? $self->panel->end + 1 - $self->{feature}->start : $self->{feature}->end; + + # handle the case of features whose endpoints are undef + # (this happens with wormbase clones where one or more clone end is not defined) + # in this case, we set the start to one plus the end of the panel + $self->{stop} = $self->panel->offset + $self->panel->length + 1 unless defined $self->{stop}; + + return $self->{stop} +} +sub end { shift->stop } +sub length { my $self = shift; $self->stop - $self->start }; +sub score { + my $self = shift; + return $self->{score} if exists $self->{score}; + return $self->{score} = ($self->{feature}->score || 0); +} +sub strand { + my $self = shift; + return $self->{strand} if exists $self->{strand}; + return $self->{strand} = ($self->{feature}->strand || 0); +} +sub map_pt { shift->{factory}->map_pt(@_) } +sub map_no_trunc { shift->{factory}->map_no_trunc(@_) } + +# add a feature (or array ref of features) to the list +sub add_feature { + my $self = shift; + my $factory = $self->factory; + for my $feature (@_) { + if (ref $feature eq 'ARRAY') { + $self->add_group(@$feature); + } else { + push @{$self->{parts}},$factory->make_glyph(0,$feature); + } + } +} + +# link a set of features together so that they bump as a group +sub add_group { + my $self = shift; + my @features = ref($_[0]) eq 'ARRAY' ? @{$_[0]} : @_; + my $f = Bio::Graphics::Feature->new( + -segments=>\@features, + -type => 'group' + ); + $self->add_feature($f); +} + +sub top { + my $self = shift; + my $g = $self->{top}; + $self->{top} = shift if @_; + $g; +} +sub left { + my $self = shift; + return $self->{left} - $self->pad_left; +} +sub right { + my $self = shift; + return $self->left + $self->layout_width - 1; +} +sub bottom { + my $self = shift; + $self->top + $self->layout_height - 1; +} +sub height { + my $self = shift; + return $self->{height} if exists $self->{height}; + my $baseheight = $self->option('height'); # what the factory says + return $self->{height} = $baseheight; +} +sub width { + my $self = shift; + my $g = $self->{width}; + $self->{width} = shift if @_; + $g; +} +sub layout_height { + my $self = shift; + return $self->layout; +} +sub layout_width { + my $self = shift; + return $self->width + $self->pad_left + $self->pad_right; +} + +# returns the rectangle that surrounds the physical part of the +# glyph, excluding labels and other "extra" stuff +sub calculate_boundaries {return shift->bounds(@_);} + +sub bounds { + my $self = shift; + my ($dx,$dy) = @_; + $dx += 0; $dy += 0; + ($dx + $self->{left}, + $dy + $self->top + $self->pad_top, + $dx + $self->{left} + $self->{width} - 1, + $dy + $self->bottom - $self->pad_bottom); +} + + +sub box { + my $self = shift; + return ($self->left,$self->top,$self->right,$self->bottom); +} + + +sub unfilled_box { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2,$fg,$bg) = @_; + + my $linewidth = $self->option('linewidth') || 1; + + unless ($fg) { + $fg ||= $self->fgcolor; + $fg = $self->set_pen($linewidth,$fg) if $linewidth > 1; + } + + unless ($bg) { + $bg ||= $self->bgcolor; + $bg = $self->set_pen($linewidth,$bg) if $linewidth > 1; + } + + # draw a box + $gd->rectangle($x1,$y1,$x2,$y2,$fg); + + # if the left end is off the end, then cover over + # the leftmost line + my ($width) = $gd->getBounds; + + $gd->line($x1,$y1+$linewidth,$x1,$y2-$linewidth,$bg) + if $x1 < $self->panel->pad_left; + + $gd->line($x2,$y1+$linewidth,$x2,$y2-$linewidth,$bg) + if $x2 > $width - $self->panel->pad_right; +} + + +# return boxes surrounding each part +sub boxes { + my $self = shift; + my ($left,$top) = @_; + $top += 0; $left += 0; + my @result; + + $self->layout; + my @parts = $self->parts; + @parts = $self if !@parts && $self->option('box_subparts') && $self->level>0; + + for my $part ($self->parts) { + if (eval{$part->feature->primary_tag} eq 'group' or + ($part->level == 0 && $self->option('box_subparts'))) { + push @result,$part->boxes($left+$self->left+$self->pad_left,$top+$self->top+$self->pad_top); + } else { + my ($x1,$y1,$x2,$y2) = $part->box; + push @result,[$part->feature,$x1,$top+$self->top+$self->pad_top+$y1, + $x2,$top+$self->top+$self->pad_top+$y2]; + } + } + return wantarray ? @result : \@result; +} + +# this should be overridden for labels, etc. +# allows glyph to make itself thicker or thinner depending on +# domain-specific knowledge +sub pad_top { + my $self = shift; + return 0; +} +sub pad_bottom { + my $self = shift; + return 0; +} +sub pad_left { + my $self = shift; + return 0; +} +sub pad_right { + my $self = shift; +# this shouldn't be necessary + my @parts = $self->parts or return 0; + my $max = 0; + foreach (@parts) { + my $pr = $_->pad_right; + $max = $pr if $max < $pr; + } + $max; +} + +# move relative to parent +sub move { + my $self = shift; + my ($dx,$dy) = @_; + $self->{left} += $dx; + $self->{top} += $dy; + + # because the feature parts use *absolute* not relative addressing + # we need to move each of the parts horizontally, but not vertically + $_->move($dx,0) foreach $self->parts; +} + +# get an option +sub option { + my $self = shift; + my $option_name = shift; + my $factory = $self->factory; + return unless $factory; + $factory->option($self,$option_name,@{$self}{qw(partno total_parts)}); +} + +# set an option globally +sub configure { + my $self = shift; + my $factory = $self->factory; + my $option_map = $factory->option_map; + while (@_) { + my $option_name = shift; + my $option_value = shift; + ($option_name = lc $option_name) =~ s/^-//; + $option_map->{$option_name} = $option_value; + } +} + +# some common options +sub color { + my $self = shift; + my $color = shift; + my $index = $self->option($color); + # turn into a color index + return $self->factory->translate_color($index) if defined $index; + return 0; +} + +sub connector { + return shift->option('connector',@_); +} + +# return value: +# 0 no bumping +# +1 bump down +# -1 bump up +sub bump { + my $self = shift; + return $self->option('bump'); +} + +# we also look for the "color" option for Ace::Graphics compatibility +sub fgcolor { + my $self = shift; + my $color = $self->option('fgcolor'); + my $index = defined $color ? $color : $self->option('color'); + $index = 'black' unless defined $index; + $self->factory->translate_color($index); +} + +#add for compatibility +sub fillcolor { + my $self = shift; + return $self->bgcolor; +} + +# we also look for the "background-color" option for Ace::Graphics compatibility +sub bgcolor { + my $self = shift; + my $bgcolor = $self->option('bgcolor'); + my $index = defined $bgcolor ? $bgcolor : $self->option('fillcolor'); + $index = 'white' unless defined $index; + $self->factory->translate_color($index); +} +sub font { + my $self = shift; + my $font = $self->option('font'); + unless (UNIVERSAL::isa($font,'GD::Font')) { + my $ref = { + gdTinyFont => gdTinyFont, + gdSmallFont => gdSmallFont, + gdMediumBoldFont => gdMediumBoldFont, + gdLargeFont => gdLargeFont, + gdGiantFont => gdGiantFont}; + my $gdfont = $ref->{$font} || $font; + $self->configure(font=>$gdfont); + return $gdfont; + } + return $font; +} +sub fontcolor { + my $self = shift; + my $fontcolor = $self->color('fontcolor'); + return defined $fontcolor ? $fontcolor : $self->fgcolor; +} +sub font2color { + my $self = shift; + my $font2color = $self->color('font2color'); + return defined $font2color ? $font2color : $self->fgcolor; +} +sub tkcolor { # "track color" + my $self = shift; + $self->option('tkcolor') or return; + return $self->color('tkcolor') +} +sub connector_color { + my $self = shift; + $self->color('connector_color') || $self->fgcolor; +} + +sub layout_sort { + + my $self = shift; + my $sortfunc; + + my $opt = $self->option("sort_order"); + if (!$opt) { + $sortfunc = eval 'sub { $a->left <=> $b->left }'; + } elsif (ref $opt eq 'CODE') { + $sortfunc = $opt; + } elsif ($opt =~ /^sub\s+\{/o) { + $sortfunc = eval $opt; + } else { + # build $sortfunc for ourselves: + my @sortbys = split(/\s*\|\s*/o, $opt); + $sortfunc = 'sub { '; + my $sawleft = 0; + + # not sure I can make this schwartzian transfored + for my $sortby (@sortbys) { + if ($sortby eq "left" || $sortby eq "default") { + $sortfunc .= '($a->left <=> $b->left) || '; + $sawleft++; + } elsif ($sortby eq "right") { + $sortfunc .= '($a->right <=> $b->right) || '; + } elsif ($sortby eq "low_score") { + $sortfunc .= '($a->score <=> $b->score) || '; + } elsif ($sortby eq "high_score") { + $sortfunc .= '($b->score <=> $a->score) || '; + } elsif ($sortby eq "longest") { + $sortfunc .= '(($b->length) <=> ($a->length)) || '; + } elsif ($sortby eq "shortest") { + $sortfunc .= '(($a->length) <=> ($b->length)) || '; + } elsif ($sortby eq "strand") { + $sortfunc .= '($b->strand <=> $a->strand) || '; + } elsif ($sortby eq "name") { + $sortfunc .= '($a->feature->display_name cmp $b->feature->display_name) || '; + } + } + unless ($sawleft) { + $sortfunc .= ' ($a->left <=> $b->left) '; + } else { + $sortfunc .= ' 0'; + } + $sortfunc .= '}'; + $sortfunc = eval $sortfunc; + } + + # cache this + # $self->factory->set_option(sort_order => $sortfunc); + + return sort $sortfunc @_; +} + +# handle collision detection +sub layout { + my $self = shift; + return $self->{layout_height} if exists $self->{layout_height}; + + my @parts = $self->parts; + return $self->{layout_height} + = $self->height + $self->pad_top + $self->pad_bottom unless @parts; + + my $bump_direction = $self->bump; + my $bump_limit = $self->option('bump_limit') || -1; + + $_->layout foreach @parts; # recursively lay out + + # no bumping requested, or only one part here + if (@parts == 1 || !$bump_direction) { + my $highest = 0; + foreach (@parts) { + my $height = $_->layout_height; + $highest = $height > $highest ? $height : $highest; + } + return $self->{layout_height} = $highest + $self->pad_top + $self->pad_bottom; + } + + my (%bin1,%bin2); + for my $g ($self->layout_sort(@parts)) { + + my $pos = 0; + my $bumplevel = 0; + my $left = $g->left; + my $right = $g->right; + my $height = $g->{layout_height}; + + while (1) { + + # stop bumping if we've gone too far down + if ($bump_limit > 0 && $bumplevel++ >= $bump_limit) { + $g->{overbumped}++; # this flag can be used to suppress label and description + foreach ($g->parts) { + $_->{overbumped}++; + } + last; + } + + # look for collisions + my $bottom = $pos + $height; + $self->collides(\%bin1,CM1,CM2,$left,$pos,$right,$bottom) or last; + my $collision = $self->collides(\%bin2,CM3,CM4,$left,$pos,$right,$bottom) or last; + + if ($bump_direction > 0) { + $pos += $collision->[3]-$collision->[1] + BUMP_SPACING; # collision, so bump + + } else { + $pos -= BUMP_SPACING; + } + + } + + $g->move(0,$pos); + $self->add_collision(\%bin1,CM1,CM2,$left,$g->top,$right,$g->bottom); + $self->add_collision(\%bin2,CM3,CM4,$left,$g->top,$right,$g->bottom); + } + + # If -1 bumping was allowed, then normalize so that the top glyph is at zero + if ($bump_direction < 0) { + my $topmost; + foreach (@parts) { + my $top = $_->top; + $topmost = $top if !defined($topmost) or $top < $topmost; + } + my $offset = - $topmost; + $_->move(0,$offset) foreach @parts; + } + + # find new height + my $bottom = 0; + foreach (@parts) { + $bottom = $_->bottom if $_->bottom > $bottom; + } + return $self->{layout_height} = $self->pad_bottom + $self->pad_top + $bottom - $self->top + 1; +} + +# the $%occupied structure is a hash of {left,top} = [left,top,right,bottom] +sub collides { + my $self = shift; + my ($occupied,$cm1,$cm2,$left,$top,$right,$bottom) = @_; + my @keys = $self->_collision_keys($cm1,$cm2,$left,$top,$right,$bottom); + my $collides = 0; + for my $k (@keys) { + next unless exists $occupied->{$k}; + for my $bounds (@{$occupied->{$k}}) { + my ($l,$t,$r,$b) = @$bounds; + next unless $right >= $l and $left <= $r and $bottom >= $t and $top <= $b; + $collides = $bounds; + last; + } + } + $collides; +} + +sub add_collision { + my $self = shift; + my ($occupied,$cm1,$cm2,$left,$top,$right,$bottom) = @_; + my $value = [$left,$top,$right+2,$bottom]; + my @keys = $self->_collision_keys($cm1,$cm2,@$value); + push @{$occupied->{$_}},$value foreach @keys; +} + +sub _collision_keys { + my $self = shift; + my ($binx,$biny,$left,$top,$right,$bottom) = @_; + my @keys; + my $bin_left = int($left/$binx); + my $bin_right = int($right/$binx); + my $bin_top = int($top/$biny); + my $bin_bottom = int($bottom/$biny); + for (my $x=$bin_left;$x<=$bin_right; $x++) { + for (my $y=$bin_top;$y<=$bin_bottom; $y++) { + push @keys,join(',',$x,$y); + } + } + @keys; +} + +sub draw { + my $self = shift; + my $gd = shift; + my ($left,$top,$partno,$total_parts) = @_; + + local($self->{partno},$self->{total_parts}); + @{$self}{qw(partno total_parts)} = ($partno,$total_parts); + + my $connector = $self->connector; + if (my @parts = $self->parts) { + + # invoke sorter if use wants to sort always and we haven't already sorted + # during bumping. + @parts = $self->layout_sort(@parts) if !$self->bump && $self->option('always_sort'); + + my $x = $left; + my $y = $top + $self->top + $self->pad_top; + $self->draw_connectors($gd,$x,$y) if $connector && $connector ne 'none'; + + my $last_x; + for (my $i=0; $i<@parts; $i++) { + # lie just a little bit to avoid lines overlapping and + # make the picture prettier + my $fake_x = $x; + $fake_x-- if defined $last_x && $parts[$i]->left - $last_x == 1; + $parts[$i]->draw($gd,$fake_x,$y,$i,scalar(@parts)); + $last_x = $parts[$i]->right; + } + } + + else { # no part + $self->draw_connectors($gd,$left,$top) + if $connector && $connector ne 'none' && $self->{level} == 0; + $self->draw_component($gd,$left,$top); + } +} + +# the "level" is the level of testing of the glyph +# groups are level -1, top level glyphs are level 0, subcomponents are level 1 and so forth. +sub level { + shift->{level}; +} + +sub draw_connectors { + my $self = shift; + return if $self->{overbumped}; + my $gd = shift; + my ($dx,$dy) = @_; + my @parts = sort { $a->left <=> $b->left } $self->parts; + for (my $i = 0; $i < @parts-1; $i++) { + $self->_connector($gd,$dx,$dy,$parts[$i]->bounds,$parts[$i+1]->bounds); + } + + # extra connectors going off ends + if (@parts) { + my($x1,$y1,$x2,$y2) = $self->bounds(0,0); + my($xl,$xt,$xr,$xb) = $parts[0]->bounds; + $self->_connector($gd,$dx,$dy,$x1,$xt,$x1,$xb,$xl,$xt,$xr,$xb) if $x1 < $xl; + my ($xl2,$xt2,$xr2,$xb2) = $parts[-1]->bounds; + $self->_connector($gd,$dx,$dy,$parts[-1]->bounds,$x2,$xt2,$x2,$xb2) if $x2 > $xr; + } + +} + +sub _connector { + my $self = shift; + my ($gd, + $dx,$dy, + $xl,$xt,$xr,$xb, + $yl,$yt,$yr,$yb) = @_; + my $left = $dx + $xr; + my $right = $dx + $yl; + my $top1 = $dy + $xt; + my $bottom1 = $dy + $xb; + my $top2 = $dy + $yt; + my $bottom2 = $dy + $yb; + # restore this comment if you don't like the group dash working + # its way backwards. + return if $right-$left < 1 && !$self->isa('Bio::Graphics::Glyph::group'); + + $self->draw_connector($gd, + $top1,$bottom1,$left, + $top2,$bottom2,$right, + ); +} + +sub draw_connector { + my $self = shift; + my $gd = shift; + + my $color = $self->connector_color; + my $connector_type = $self->connector or return; + + if ($connector_type eq 'hat') { + $self->draw_hat_connector($gd,$color,@_); + } elsif ($connector_type eq 'solid') { + $self->draw_solid_connector($gd,$color,@_); + } elsif ($connector_type eq 'dashed') { + $self->draw_dashed_connector($gd,$color,@_); + } elsif ($connector_type eq 'quill') { + $self->draw_quill_connector($gd,$color,@_); + } else { + ; # draw nothing + } +} + +sub draw_hat_connector { + my $self = shift; + my $gd = shift; + my $color = shift; + my ($top1,$bottom1,$left,$top2,$bottom2,$right) = @_; + + my $center1 = ($top1 + $bottom1)/2; + my $quarter1 = $top1 + ($bottom1-$top1)/4; + my $center2 = ($top2 + $bottom2)/2; + my $quarter2 = $top2 + ($bottom2-$top2)/4; + + if ($center1 != $center2) { + $self->draw_solid_connector($gd,$color,@_); + return; + } + + if ($right - $left > 4) { # room for the inverted "V" + my $middle = $left + int(($right - $left)/2); + $gd->line($left,$center1,$middle,$top1,$color); + $gd->line($middle,$top1,$right-1,$center1,$color); + } elsif ($right-$left > 1) { # no room, just connect + $gd->line($left,$quarter1,$right-1,$quarter1,$color); + } + +} + +sub draw_solid_connector { + my $self = shift; + my $gd = shift; + my $color = shift; + my ($top1,$bottom1,$left,$top2,$bottom2,$right) = @_; + + my $center1 = ($top1 + $bottom1)/2; + my $center2 = ($top2 + $bottom2)/2; + + $gd->line($left,$center1,$right,$center2,$color); +} + +sub draw_dashed_connector { + my $self = shift; + my $gd = shift; + my $color = shift; + my ($top1,$bottom1,$left,$top2,$bottom2,$right) = @_; + + my $center1 = ($top1 + $bottom1)/2; + my $center2 = ($top2 + $bottom2)/2; + + $gd->setStyle($color,$color,gdTransparent,gdTransparent,); + $gd->line($left,$center1,$right,$center2,gdStyled); +} + +sub draw_quill_connector { + my $self = shift; + my $gd = shift; + my $color = shift; + my ($top1,$bottom1,$left,$top2,$bottom2,$right) = @_; + + my $center1 = ($top1 + $bottom1)/2; + my $center2 = ($top2 + $bottom2)/2; + + $gd->line($left,$center1,$right,$center2,$color); + my $direction = $self->feature->strand; + return unless $direction; + + if ($direction > 0) { + my $start = $left+4; + my $end = $right-1; + for (my $position=$start; $position <= $end; $position += QUILL_INTERVAL) { + $gd->line($position,$center1,$position-2,$center1-2,$color); + $gd->line($position,$center1,$position-2,$center1+2,$color); + } + } else { + my $start = $left+1; + my $end = $right-4; + for (my $position=$start; $position <= $end; $position += QUILL_INTERVAL) { + $gd->line($position,$center1,$position+2,$center1-2,$color); + $gd->line($position,$center1,$position+2,$center1+2,$color); + } + } +} + +sub filled_box { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2,$bg,$fg) = @_; + + $bg ||= $self->bgcolor; + $fg ||= $self->fgcolor; + my $linewidth = $self->option('linewidth') || 1; + + $gd->filledRectangle($x1,$y1,$x2,$y2,$bg); + + $fg = $self->set_pen($linewidth,$fg) if $linewidth > 1; + + # draw a box + $gd->rectangle($x1,$y1,$x2,$y2,$fg); + + # if the left end is off the end, then cover over + # the leftmost line + my ($width) = $gd->getBounds; + + $bg = $self->set_pen($linewidth,$bg) if $linewidth > 1; + + $gd->line($x1,$y1+$linewidth,$x1,$y2-$linewidth,$bg) + if $x1 < $self->panel->pad_left; + + $gd->line($x2,$y1+$linewidth,$x2,$y2-$linewidth,$bg) + if $x2 > $width - $self->panel->pad_right; +} + +sub filled_oval { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2,$bg,$fg) = @_; + my $cx = ($x1+$x2)/2; + my $cy = ($y1+$y2)/2; + + $fg ||= $self->fgcolor; + $bg ||= $self->bgcolor; + my $linewidth = $self->linewidth; + + $fg = $self->set_pen($linewidth) if $linewidth > 1; + $gd->arc($cx,$cy,$x2-$x1,$y2-$y1,0,360,$fg); + + # and fill it + $gd->fill($cx,$cy,$bg); +} + +sub oval { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = @_; + my $cx = ($x1+$x2)/2; + my $cy = ($y1+$y2)/2; + + my $fg = $self->fgcolor; + my $linewidth = $self->linewidth; + + $fg = $self->set_pen($linewidth) if $linewidth > 1; + $gd->arc($cx,$cy,$x2-$x1,$y2-$y1,0,360,$fg); +} + +sub filled_arrow { + my $self = shift; + my $gd = shift; + my $orientation = shift; + $orientation *= -1 if $self->{flip}; + + my ($x1,$y1,$x2,$y2) = @_; + + my ($width) = $gd->getBounds; + my $indent = $y2-$y1 < $x2-$x1 ? $y2-$y1 : ($x2-$x1)/2; + + return $self->filled_box($gd,@_) + if ($orientation == 0) + or ($x1 < 0 && $orientation < 0) + or ($x2 > $width && $orientation > 0) + or ($indent <= 0) + or ($x2 - $x1 < 3); + + my $fg = $self->fgcolor; + if ($orientation >= 0) { + $gd->line($x1,$y1,$x2-$indent,$y1,$fg); + $gd->line($x2-$indent,$y1,$x2,($y2+$y1)/2,$fg); + $gd->line($x2,($y2+$y1)/2,$x2-$indent,$y2,$fg); + $gd->line($x2-$indent,$y2,$x1,$y2,$fg); + $gd->line($x1,$y2,$x1,$y1,$fg); + my $left = $self->panel->left > $x1 ? $self->panel->left : $x1; + $gd->fillToBorder($left+1,($y1+$y2)/2,$fg,$self->bgcolor); + } else { + $gd->line($x1,($y2+$y1)/2,$x1+$indent,$y1,$fg); + $gd->line($x1+$indent,$y1,$x2,$y1,$fg); + $gd->line($x2,$y2,$x1+$indent,$y2,$fg); + $gd->line($x1+$indent,$y2,$x1,($y1+$y2)/2,$fg); + $gd->line($x2,$y1,$x2,$y2,$fg); + my $right = $self->panel->right < $x2 ? $self->panel->right : $x2; + $gd->fillToBorder($right-1,($y1+$y2)/2,$fg,$self->bgcolor); + } +} + +sub linewidth { + shift->option('linewidth') || 1; +} + +sub fill { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = @_; + if ( ($x2-$x1) >= 2 && ($y2-$y1) >= 2 ) { + $gd->fill($x1+1,$y1+1,$self->bgcolor); + } +} +sub set_pen { + my $self = shift; + my ($linewidth,$color) = @_; + $linewidth ||= $self->linewidth; + $color ||= $self->fgcolor; + return $color unless $linewidth > 1; + $self->panel->set_pen($linewidth,$color); +} + +sub draw_component { + my $self = shift; + my $gd = shift; + my($x1,$y1,$x2,$y2) = $self->bounds(@_); + + # clipping + my $panel = $self->panel; + return unless $x2 >= $panel->left and $x1 <= $panel->right; + + if ($self->option('strand_arrow') || $self->option('stranded')) { + $self->filled_arrow($gd,$self->feature->strand, + $x1, $y1, + $x2, $y2) + } else { + $self->filled_box($gd, + $x1, $y1, + $x2, $y2) + } +} + +# memoize _subseq -- it's a bottleneck with segments +sub subseq { + my $self = shift; + my $feature = shift; + return $self->_subseq($feature) unless ref $self; + return @{$self->{cached_subseq}{$feature}} if $self->{cached_subseq}{$feature}; + my @ss = $self->_subseq($feature); + $self->{cached_subseq}{$feature} = \@ss; + @ss; +} + +sub _subseq { + my $class = shift; + my $feature = shift; + return $feature->merged_segments if $feature->can('merged_segments'); + return $feature->segments if $feature->can('segments'); + my @split = eval { my $id = $feature->location->seq_id; + my @subs = $feature->location->sub_Location; + grep {$id eq $_->seq_id} @subs}; + return @split if @split; + return $feature->sub_SeqFeature if $feature->can('sub_SeqFeature'); + return; +} + +# synthesize a key glyph +sub keyglyph { + my $self = shift; + my $feature = $self->make_key_feature; + my $factory = $self->factory->clone; + $factory->set_option(label => 1); + $factory->set_option(description => 0); + $factory->set_option(bump => 0); + $factory->set_option(connector => 'solid'); + return $factory->make_glyph(0,$feature); +} + +# synthesize a key glyph +sub make_key_feature { + my $self = shift; + + my $scale = 1/$self->scale; # base pairs/pixel + + # one segments, at pixels 0->80 + my $offset = $self->panel->offset; + + + my $feature = + Bio::Graphics::Feature->new(-start =>0 * $scale +$offset, + -end =>80*$scale+$offset, + -name => $self->option('key'), + -strand => '+1'); + return $feature; +} + +sub all_callbacks { + my $self = shift; + my $track_level = $self->option('all_callbacks'); + return $track_level if defined $track_level; + return $self->panel->all_callbacks; +} + +sub default_factory { + croak "no default factory implemented"; +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph - Base class for Bio::Graphics::Glyph objects + +=head1 SYNOPSIS + +See L. + +=head1 DESCRIPTION + +Bio::Graphics::Glyph is the base class for all glyph objects. Each +glyph is a wrapper around an Bio:SeqFeatureI object, knows how to +render itself on an Bio::Graphics::Panel, and has a variety of +configuration variables. + +End developers will not ordinarily work directly with +Bio::Graphics::Glyph objects, but with Bio::Graphics::Glyph::generic +and its subclasses. Similarly, most glyph developers will want to +subclass from Bio::Graphics::Glyph::generic because the latter +provides labeling and arrow-drawing facilities. + +=head1 METHODS + +This section describes the class and object methods for +Bio::Graphics::Glyph. + +=head2 CONSTRUCTORS + +Bio::Graphics::Glyph objects are constructed automatically by an +Bio::Graphics::Glyph::Factory, and are not usually created by +end-developer code. + +=over 4 + +=item $glyph = Bio::Graphics::Glyph-Enew(-feature=E$feature,-factory=E$factory) + +Given a sequence feature, creates an Bio::Graphics::Glyph object to +display it. The B<-feature> argument points to the Bio:SeqFeatureI +object to display, and B<-factory> indicates an +Bio::Graphics::Glyph::Factory object from which the glyph will fetch +all its run-time configuration information. Factories are created and +manipulated by the Bio::Graphics::Panel object. + +A standard set of options are recognized. See L. + +=back + +=head2 OBJECT METHODS + +Once a glyph is created, it responds to a large number of methods. In +this section, these methods are grouped into related categories. + +Retrieving glyph context: + +=over 4 + +=item $factory = $glyph-Efactory + +Get the Bio::Graphics::Glyph::Factory associated with this object. +This cannot be changed once it is set. + +=item $panel = $glyph-Epanel + +Get the Bio::Graphics::Panel associated with this object. This cannot +be changed once it is set. + +=item $feature = $glyph-Efeature + +Get the sequence feature associated with this object. This cannot be +changed once it is set. + +=item $feature = $glyph-Eadd_feature(@features) + +Add the list of features to the glyph, creating subparts. This is +most common done with the track glyph returned by +Ace::Graphics::Panel-Eadd_track(). + +=item $feature = $glyph-Eadd_group(@features) + +This is similar to add_feature(), but the list of features is treated +as a group and can be configured as a set. + +=back + +Retrieving glyph options: + +=over 4 + +=item $fgcolor = $glyph-Efgcolor + +=item $bgcolor = $glyph-Ebgcolor + +=item $fontcolor = $glyph-Efontcolor + +=item $fontcolor = $glyph-Efont2color + +=item $fillcolor = $glyph-Efillcolor + +These methods return the configured foreground, background, font, +alternative font, and fill colors for the glyph in the form of a +GD::Image color index. + +=item $color = $glyph-Etkcolor + +This method returns a color to be used to flood-fill the entire glyph +before drawing (currently used by the "track" glyph). + +=item $width = $glyph-Ewidth([$newwidth]) + +Return the width of the glyph, not including left or right padding. +This is ordinarily set internally based on the size of the feature and +the scale of the panel. + +=item $width = $glyph-Elayout_width + +Returns the width of the glyph including left and right padding. + +=item $width = $glyph-Eheight + +Returns the height of the glyph, not including the top or bottom +padding. This is calculated from the "height" option and cannot be +changed. + + +=item $font = $glyph-Efont + +Return the font for the glyph. + +=item $option = $glyph-Eoption($option) + +Return the value of the indicated option. + +=item $index = $glyph-Ecolor($color) + +Given a symbolic or #RRGGBB-form color name, returns its GD index. + +=item $level = $glyph-Elevel + +The "level" is the nesting level of the glyph. +Groups are level -1, top level glyphs are level 0, +subparts (e.g. exons) are level 1 and so forth. + +=back + +Setting an option: + +=over 4 + +=item $glyph-Econfigure(-name=E$value) + +You may change a glyph option after it is created using set_option(). +This is most commonly used to configure track glyphs. + +=back + +Retrieving information about the sequence: + +=over 4 + +=item $start = $glyph-Estart + +=item $end = $glyph-Eend + +These methods return the start and end of the glyph in base pair +units. + +=item $offset = $glyph-Eoffset + +Returns the offset of the segment (the base pair at the far left of +the image). + +=item $length = $glyph-Elength + +Returns the length of the sequence segment. + +=back + + +Retrieving formatting information: + +=over 4 + +=item $top = $glyph-Etop + +=item $left = $glyph-Eleft + +=item $bottom = $glyph-Ebottom + +=item $right = $glyph-Eright + +These methods return the top, left, bottom and right of the glyph in +pixel coordinates. + +=item $height = $glyph-Eheight + +Returns the height of the glyph. This may be somewhat larger or +smaller than the height suggested by the GlyphFactory, depending on +the type of the glyph. + +=item $scale = $glyph-Escale + +Get the scale for the glyph in pixels/bp. + +=item $height = $glyph-Elabelheight + +Return the height of the label, if any. + +=item $label = $glyph-Elabel + +Return a human-readable label for the glyph. + +=back + +These methods are called by Bio::Graphics::Track during the layout +process: + +=over 4 + +=item $glyph-Emove($dx,$dy) + +Move the glyph in pixel coordinates by the indicated delta-x and +delta-y values. + +=item ($x1,$y1,$x2,$y2) = $glyph-Ebox + +Return the current position of the glyph. + +=back + +These methods are intended to be overridden in subclasses: + +=over 4 + +=item $glyph-Ecalculate_height + +Calculate the height of the glyph. + +=item $glyph-Ecalculate_left + +Calculate the left side of the glyph. + +=item $glyph-Ecalculate_right + +Calculate the right side of the glyph. + +=item $glyph-Edraw($gd,$left,$top) + +Optionally offset the glyph by the indicated amount and draw it onto +the GD::Image object. + + +=item $glyph-Edraw_label($gd,$left,$top) + +Draw the label for the glyph onto the provided GD::Image object, +optionally offsetting by the amounts indicated in $left and $right. + +=back + +These methods are useful utility routines: + +=over 4 + +=item $pixels = $glyph-Emap_pt($bases); + +Map the indicated base position, given in base pair units, into +pixels, using the current scale and glyph position. + +=item $glyph-Efilled_box($gd,$x1,$y1,$x2,$y2) + +Draw a filled rectangle with the appropriate foreground and fill +colors, and pen width onto the GD::Image object given by $gd, using +the provided rectangle coordinates. + +=item $glyph-Efilled_oval($gd,$x1,$y1,$x2,$y2) + +As above, but draws an oval inscribed on the rectangle. + +=back + +=head2 OPTIONS + +The following options are standard among all Glyphs. See individual +glyph pages for more options. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type undef (false) + + -connector_color + Connector color black + + -strand_arrow Whether to indicate undef (false) + strandedness + + -label Whether to draw a label undef (false) + + -description Whether to draw a description undef (false) + + -sort_order Specify layout sort order "default" + + -always_sort Sort even when bumping is off undef (false) + + -bump_limit Maximum number of levels to bump undef (unlimited) + +For glyphs that consist of multiple segments, the B<-connector> option +controls what's drawn between the segments. The default is undef (no +connector). Options include: + + "hat" an upward-angling conector + "solid" a straight horizontal connector + "quill" a decorated line with small arrows indicating strandedness + (like the UCSC Genome Browser uses) + "dashed" a horizontal dashed line. + +The B<-connector_color> option controls the color of the connector, if +any. + +The label is printed above the glyph. You may pass an anonymous +subroutine to B<-label>, in which case the subroutine will be invoked +with the feature as its single argument. and is expected to return +the string to use as the description. If you provide the numeric +value "1" to B<-description>, the description will be read off the +feature's seqname(), info() and primary_tag() methods will be called +until a suitable name is found. To create a label with the +text "1", pass the string "1 ". (A 1 followed by a space). + +The description is printed below the glyph. You may pass an anonymous +subroutine to B<-description>, in which case the subroutine will be +invoked with the feature as its single argument and is expected to +return the string to use as the description. If you provide the +numeric value "1" to B<-description>, the description will be read off +the feature's source_tag() method. To create a description with the +text "1", pass the string "1 ". (A 1 followed by a space). + +In the case of ACEDB Ace::Sequence feature objects, the feature's +info(), Brief_identification() and Locus() methods will be called to +create a suitable description. + +The B<-strand_arrow> option, if true, requests that the glyph indicate +which strand it is on, usually by drawing an arrowhead. Not all +glyphs will respond to this request. For historical reasons, +B<-stranded> is a synonym for this option. + +By default, features are drawn with a layout based only on the +position of the feature, assuring a maximal "packing" of the glyphs +when bumped. In some cases, however, it makes sense to display the +glyphs sorted by score or some other comparison, e.g. such that more +"important" features are nearer the top of the display, stacked above +less important features. The -sort_order option allows a few +different built-in values for changing the default sort order (which +is by "left" position): "low_score" (or "high_score") will cause +features to be sorted from lowest to highest score (or vice versa). +"left" (or "default") and "right" values will cause features to be +sorted by their position in the sequence. "longer" (or "shorter") +will cause the longest (or shortest) features to be sorted first, and +"strand" will cause the features to be sorted by strand: "+1" +(forward) then "0" (unknown, or NA) then "-1" (reverse). Lastly, +"name" will sort features alphabetically by their display_name() +attribute. + +In all cases, the "left" position will be used to break any ties. To +break ties using another field, options may be strung together using a +"|" character; e.g. "strand|low_score|right" would cause the features +to be sorted first by strand, then score (lowest to highest), then by +"right" position in the sequence. Finally, a subroutine coderef can +be provided, which should expect to receive two feature objects (via +the special sort variables $a and $b), and should return -1, 0 or 1 +(see Perl's sort() function for more information); this subroutine +will be used without further modification for sorting. For example, +to sort a set of database search hits by bits (stored in the features' +"score" fields), scaled by the log of the alignment length (with +"left" position breaking any ties): + + sort_order = sub { ( $b->score/log($b->length) + <=> + $a->score/log($a->length) ) + || + ( $a->start <=> $b->start ) + } + +The -always_sort option, if true, will sort features even if bumping +is turned off. This is useful if you would like overlapping features +to stack in a particular order. Features towards the end of the list +will overlay those towards the beginning of the sort order. + +=head1 SUBCLASSING Bio::Graphics::Glyph + +By convention, subclasses are all lower-case. Begin each subclass +with a preamble like this one: + + package Bio::Graphics::Glyph::crossbox; + + use strict; + use vars '@ISA'; + @ISA = 'Bio::Graphics::Glyph'; + +Then override the methods you need to. Typically, just the draw() +method will need to be overridden. However, if you need additional +room in the glyph, you may override calculate_height(), +calculate_left() and calculate_right(). Do not directly override +height(), left() and right(), as their purpose is to cache the values +returned by their calculating cousins in order to avoid time-consuming +recalculation. + +A simple draw() method looks like this: + + sub draw { + my $self = shift; + $self->SUPER::draw(@_); + my $gd = shift; + + # and draw a cross through the box + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + my $fg = $self->fgcolor; + $gd->line($x1,$y1,$x2,$y2,$fg); + $gd->line($x1,$y2,$x2,$y1,$fg); + } + +This subclass draws a simple box with two lines criss-crossed through +it. We first call our inherited draw() method to generate the filled +box and label. We then call calculate_boundaries() to return the +coordinates of the glyph, disregarding any extra space taken by +labels. We call fgcolor() to return the desired foreground color, and +then call $gd-Eline() twice to generate the criss-cross. + +For more complex draw() methods, see Bio::Graphics::Glyph::transcript +and Bio::Graphics::Glyph::segments. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/Factory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/Factory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,431 @@ +=head1 NAME + +Bio::Graphics::Glyph::Factory - Factory for Bio::Graphics::Glyph objects + +=head1 SYNOPSIS + +See L. + +=head1 DESCRIPTION + +This class is used internally by Bio::Graphics to generate new Glyph +objects by combining a list of features with the user's desired +configuration. It is intended to be used internally by Bio::Graphics. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email - lstein@cshl.org + +=head1 SEE ALSO + +L + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with an "_" +(underscore). + +=cut + +package Bio::Graphics::Glyph::Factory; + +use strict; +use Carp qw(:DEFAULT cluck); +use GD; + +my %LOADED_GLYPHS = (); +my %GENERIC_OPTIONS = ( + bgcolor => 'turquoise', + fgcolor => 'black', + fontcolor => 'black', + font2color => 'turquoise', + height => 8, + font => gdSmallFont, + bump => +1, # bump by default (perhaps a mistake?) + ); + +=head2 new + + Title : new + Usage : $f = Bio::Graphics::Glyph::Factory->new( + -stylesheet => $stylesheet, + -glyph_map => $glyph_map, + -options => $options); + Function : create a new Bio::Graphics::Glyph::Factory object + Returns : the new object + Args : $stylesheet is a Bio::Das::Stylesheet object that can + convert Bio::Das feature objects into glyph names and + associated options. + $glyph_map is a hash that maps primary tags to glyph names. + $options is a hash that maps option names to their values. + Status : Internal to Bio::Graphics + +=cut + +sub new { + my $class = shift; + my $panel = shift; + my %args = @_; + my $stylesheet = $args{-stylesheet}; # optional, for Bio::Das compatibility + my $map = $args{-map}; # map type name to glyph name + my $options = $args{-options}; # map type name to glyph options + return bless { + stylesheet => $stylesheet, + glyph_map => $map, + options => $options, + panel => $panel, + },$class; +} + +=head2 clone + + Title : clone + Usage : $f2 = $f->clone + Function : Deep copy of a factory object + Returns : a deep copy of the factory object + Args : None + Status : Internal to Bio::Graphics + +=cut + +sub clone { + my $self = shift; + my %new = %$self; + my $new = bless \%new,ref($self); + $new; +} + +=head2 stylesheet + + Title : stylesheet + Usage : $stylesheet = $f->stylesheet + Function : accessor for stylesheet + Returns : a Bio::Das::Stylesheet object + Args : None + Status : Internal to Bio::Graphics + +=cut + +sub stylesheet { shift->{stylesheet} } + +=head2 glyph_map + + Title : glyph_map + Usage : $map = $f->glyph_map + Function : accessor for the glyph map + Returns : a hash mapping primary tags to glyphs + Args : None + Status : Internal to Bio::Graphics + +=cut + +sub glyph_map { shift->{glyph_map} } + +=head2 option_map + + Title : option_map + Usage : $map = $f->option_map + Function : accessor for the option map + Returns : a hash mapping option names to values + Args : None + Status : Internal to Bio::Graphics + +=cut + +sub option_map { shift->{options} } + +=head2 global_opts + + Title : global_opts + Usage : $map = $f->global_opts + Function : accessor for global options + Returns : a hash mapping option names to values + Args : None + Status : Internal to Bio::Graphics + +This returns a set of defaults for option values. + +=cut + +sub global_opts{ shift->{global_opts} } + +=head2 panel + + Title : panel + Usage : $panel = $f->panel + Function : accessor for Bio::Graphics::Panel + Returns : a Bio::Graphics::Panel + Args : None + Status : Internal to Bio::Graphics + +This returns the panel with which the factory is associated. + +=cut + +sub panel { shift->{panel} } + +=head2 scale + + Title : scale + Usage : $scale = $f->scale + Function : accessor for the scale + Returns : a floating point number + Args : None + Status : Internal to Bio::Graphics + +This returns the scale, in pixels/bp for glyphs constructed by this +factory. + +=cut + +sub scale { shift->{panel}->scale } + +=head2 font + + Title : font + Usage : $font = $f->font + Function : accessor for the font + Returns : a font name + Args : None + Status : Internal to Bio::Graphics + +This returns a GD font name. + +=cut + +sub font { + my $self = shift; + my $glyph = shift; + $self->option($glyph,'font') || $self->{font}; +} + +=head2 map_pt + + Title : map_pt + Usage : @pixel_positions = $f->map_pt(@bp_positions) + Function : map bp positions to pixel positions + Returns : a list of pixel positions + Args : a list of bp positions + Status : Internal to Bio::Graphics + +The real work is done by the panel, but factory subclasses can +override if desired. + +=cut + +sub map_pt { + my $self = shift; + my @result = $self->panel->map_pt(@_); + return wantarray ? @result : $result[0]; +} + +=head2 map_no_trunc + + Title : map_no_trunc + Usage : @pixel_positions = $f->map_no_trunc(@bp_positions) + Function : map bp positions to pixel positions + Returns : a list of pixel positions + Args : a list of bp positions + Status : Internal to Bio::Graphics + +Same as map_pt(), but it will NOT clip pixel positions to be within +the drawing frame. + +=cut + +sub map_no_trunc { + my $self = shift; + my @result = $self->panel->map_no_trunc(@_); + return wantarray ? @result : $result[0]; +} + +=head2 translate_color + + Title : translate_color + Usage : $index = $f->translate_color($color_name) + Function : translate symbolic color names into GD indexes + Returns : an integer + Args : a color name in format "green" or "#00FF00" + Status : Internal to Bio::Graphics + +The real work is done by the panel, but factory subclasses can +override if desired. + +=cut + +sub translate_color { + my $self = shift; + my $color_name = shift; + $self->panel->translate_color($color_name); +} + +=head2 glyph + + Title : glyph + Usage : @glyphs = $f->glyph($level,$feature1,$feature2...) + Function : transform features into glyphs. + Returns : a list of Bio::Graphics::Glyph objects + Args : a feature "level", followed by a list of FeatureI objects. + Status : Internal to Bio::Graphics + +The level is used to track the level of nesting of features that have +subfeatures. + +=cut + +# create a glyph +sub make_glyph { + my $self = shift; + my $level = shift; + my @result; + my $panel = $self->panel; + my ($leftmost,$rightmost) = ($panel->left,$panel->right); + my $flip = $panel->flip; + + for my $f (@_) { + + my $type = $self->feature_to_glyph($f); + my $glyphclass = 'Bio::Graphics::Glyph'; + $type ||= 'generic'; + $glyphclass .= "\:\:\L$type"; + + unless ($LOADED_GLYPHS{$glyphclass}++) { + carp("the requested glyph class, ``$type'' is not available: $@") + unless (eval "require $glyphclass"); + } + my $glyph = $glyphclass->new(-feature => $f, + -factory => $self, + -flip => $flip, + -level => $level); + + # this is removing glyphs that are not onscreen at all. + # But never remove tracks! + push @result,$glyph if $type eq 'track' + || ($glyph->{left} + $glyph->{width} > $leftmost && $glyph->{left} < $rightmost); + + } + return wantarray ? @result : $result[0]; +} + +=head2 feature_to_glyph + + Title : feature_to_glyph + Usage : $glyph_name = $f->feature_to_glyph($feature) + Function : choose the glyph name given a feature + Returns : a glyph name + Args : a Bio::Seq::FeatureI object + Status : Internal to Bio::Graphics + +=cut + +sub feature_to_glyph { + my $self = shift; + my $feature = shift; + + return scalar $self->{stylesheet}->glyph($feature) if $self->{stylesheet}; + my $map = $self->glyph_map or return 'generic'; + if (ref($map) eq 'CODE') { + my $val = eval {$map->($feature)}; + warn $@ if $@; + return $val || 'generic'; + } + return $map->{$feature->primary_tag} || 'generic'; +} + + +=head2 set_option + + Title : set_option + Usage : $f->set_option($option_name=>$option_value) + Function : set or change an option + Returns : nothing + Args : a name/value pair + Status : Internal to Bio::Graphics + +=cut + +sub set_option { + my $self = shift; + my ($option_name,$option_value) = @_; + $self->{overriding_options}{lc $option_name} = $option_value; +} + +# options: +# the overriding_options hash has precedence +# ...followed by the option_map +# ...followed by the stylesheet +# ...followed by generic options +sub option { + my $self = shift; + my ($glyph,$option_name,$partno,$total_parts) = @_; + return unless defined $option_name; + $option_name = lc $option_name; # canonicalize + + return $self->{overriding_options}{$option_name} + if exists $self->{overriding_options} && exists $self->{overriding_options}{$option_name}; + + if (my $map = $self->option_map) { + if (defined(my $value = $map->{$option_name})) { + my $feature = $glyph->feature; + return $value unless ref $value eq 'CODE'; + return unless $feature->isa('Bio::SeqFeatureI'); + my $val = eval { $value->($feature,$option_name,$partno,$total_parts,$glyph)}; + warn $@ if $@; + return defined $val && $val eq '*default*' ? $GENERIC_OPTIONS{$option_name} : $val; + } + } + + if (my $ss = $self->stylesheet) { + my($glyph,%options) = $ss->glyph($glyph->feature); + my $value = $options{$option_name}; + return $value if defined $value; + } + + return $GENERIC_OPTIONS{$option_name}; +} + + +=head2 options + + Title : options + Usage : @option_names = $f->options + Function : return all configured option names + Returns : a list of option names + Args : none + Status : Internal to Bio::Graphics + +=cut + +# return names of all the options in the option hashes +sub options { + my $self = shift; + my %options; + if (my $map = $self->option_map) { + $options{lc($_)}++ foreach keys %$map; + } + $options{lc($_)}++ foreach keys %GENERIC_OPTIONS; + return keys %options; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/alignment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/alignment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,124 @@ +package Bio::Graphics::Glyph::alignment; + +use strict; + +use Bio::Graphics::Glyph::graded_segments; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::graded_segments'; + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::alignment - The "alignment" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This is identical to the "graded_segments" glyph, and is used for +drawing features that consist of discontinuous segments. The +color intensity of each segment is proportionate to the score. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -max_score Maximum value of the Calculated + feature's "score" attribute + + -min_score Minimum value of the Calculated + feature's "score" attribute + +If max_score and min_score are not specified, then the glyph will +calculate the local maximum and minimum scores at run time. + + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/anchored_arrow.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/anchored_arrow.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,190 @@ +package Bio::Graphics::Glyph::anchored_arrow; +# package to use for drawing an arrow + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::arrow; +@ISA = 'Bio::Graphics::Glyph::arrow'; + +sub draw_label { + my $self = shift; + my ($gd,$left,$top,$partno,$total_parts) = @_; + my $label = $self->label or return; + my $label_align = $self->option('label_align'); + if ($label_align && ($label_align eq 'center' || $label_align eq 'right')) { + my $x = $self->left + $left; + my $font = $self->option('labelfont') || $self->font; + my $middle = $self->left + $left + ($self->right - $self->left) / 2; + my $label_width = $font->width * length($label); + if ($label_align eq 'center') { + my $new_x = $middle - $label_width / 2; + $x = $new_x if ($new_x > $x);; + } + else { + my $new_x = $left + $self->right - $label_width; + $x = $new_x if ($new_x > $x); + } + $x = $self->panel->left + 1 if $x <= $self->panel->left; + #detect collision (most likely no bump when want centering label) + #lay down all features on one line e.g. cyto bands + return if (!$self->option('bump') && ($label_width + $x) > $self->right); + $gd->string($font, + $x, + $self->top + $top, + $label, + $self->fontcolor); + } + else { + $self->SUPER::draw_label(@_); + } +} + +sub arrowheads { + my $self = shift; + my ($ne,$sw,$base_e,$base_w); + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + + my $gstart = $x1; + my $gend = $x2; + my $pstart = $self->panel->left; + my $pend = $self->panel->right-1; + + if ($gstart <= $pstart) { # off left end + $sw = 1; + } + if ($gend >= $pend) { # off right end + $ne = 1; + } + return ($sw,$ne,!$sw,!$ne); +} + +sub no_trunc { + !shift->option('no_arrows'); +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::anchored_arrow - The "anchored_arrow" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws an arrowhead which is anchored at one or both ends +(has a vertical base) or has one or more arrowheads. The arrowheads +indicate that the feature does not end at the edge of the picture, but +continues. For example: + + |-----------------------------| both ends in picture + <----------------------| left end off picture + |----------------------------> right end off picture + <------------------------------------> both ends off picture + +You can also set the glyph so that the end is just truncated at the +end of the picture. + + |----------------------------- + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition to the standard options, this glyph recognizes the following: + + Option Description Default + + -tick draw a scale 0 (false) + + -rel_coords use relative coordinates 0 (false) + for scale + + -no_arrows don't draw an arrow when 0 (false) + glyph is partly offscreen + +The argument for B<-tick> is an integer between 0 and 2 and has the same +interpretation as the B<-tick> option in Bio::Graphics::Glyph::arrow. + +If B<-rel_coords> is set to a true value, then the scale drawn on the +glyph will be in relative (1-based) coordinates relative to the beginning +of the glyph. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/arrow.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/arrow.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,388 @@ +package Bio::Graphics::Glyph::arrow; +# package to use for drawing an arrow + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::generic; +@ISA = 'Bio::Graphics::Glyph::generic'; + +my %UNITS = (n => 1e-12, + n => 1e-9, + u => 1e-6, + m => 0.001, + c => 0.01, + k => 1000, + M => 1_000_000, + G => 1_000_000_000); + +sub pad_bottom { + my $self = shift; + my $val = $self->SUPER::pad_bottom(@_); + $val += $self->font->height if $self->option('tick'); + $val; +} + +# override draw method +sub draw { + my $self = shift; + my $parallel = $self->option('parallel'); + $parallel = 1 unless defined $parallel; + $self->draw_parallel(@_) if $parallel; + $self->draw_perpendicular(@_) unless $parallel; +} + +sub draw_perpendicular { + my $self = shift; + my $gd = shift; + my ($dx,$dy) = @_; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $ne = $self->option('northeast'); + my $sw = $self->option('southwest'); + $ne = $sw = 1 unless defined($ne) || defined($sw); + + # draw a perpendicular arrow at position indicated by $x1 + my $fg = $self->set_pen; + my $a2 = ($y2-$y1)/4; + + my @positions = $x1 == $x2 ? ($x1) : ($x1,$x2); + for my $x (@positions) { + if ($ne) { + $gd->line($x,$y1,$x,$y2,$fg); + $gd->line($x-$a2,$y1+$a2,$x,$y1,$fg); + $gd->line($x+$a2,$y1+$a2,$x,$y1,$fg); + } + if ($sw) { + $gd->line($x,$y1,$x,$y2,$fg); + $gd->line($x-$a2,$y2-$a2,$x,$y2,$fg); + $gd->line($x+$a2,$y2-$a2,$x,$y2,$fg); + } + } + + # add a label if requested + $self->draw_label($gd,$dx,$dy) if $self->option('label'); # this draws the label aligned to the left +} + +sub draw_parallel { + my $self = shift; + my $gd = shift; + my ($dx,$dy) = @_; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $fg = $self->set_pen; + my $a2 = ($self->height)/2; + my $center = $y1+$a2; + + my $trunc_left = $x1 < $self->panel->left; + my $trunc_right = $x2 > $self->panel->right; + + $x1 = $self->panel->left if $trunc_left; + $x2 = $self->panel->right if $trunc_right; + + $trunc_left = 0 if $self->no_trunc; + $trunc_right = 0 if $self->no_trunc; + + my ($sw,$ne,$base_w,$base_e) = $self->arrowheads; + $gd->line($x1,$center,$x2,$center,$fg); + $self->arrowhead($gd,$x1,$center,$a2,-1) if $sw && !$trunc_left; # west arrow + $self->arrowhead($gd,$x2,$center,$a2,+1) if $ne && !$trunc_right; # east arrow + $gd->line($x1,$center-$a2,$x1,$center+$a2,$fg) if $base_w && !$trunc_left; #west base + $gd->line($x2,$center-$a2,$x2,$center+$a2,$fg) if $base_e && !$trunc_right; #east base + + # turn on ticks + if ($self->option('tick')) { + local $^W = 0; # dumb uninitialized variable warning + my $font = $self->font; + my $width = $font->width; + my $font_color = $self->fontcolor; + my $height = $self->height; + + my $relative = $self->option('relative_coords'); + my $relative_coords_offset = $self->option('relative_coords_offset'); + $relative_coords_offset = 1 unless ($relative_coords_offset); + + my $start = $relative ? $relative_coords_offset : $self->feature->start-1; + my $stop = $start + $self->feature->length - 1; + + my $offset = $relative ? ($self->feature->start - $relative_coords_offset) : 0; + my $reversed = exists $self->{flip} || ($relative && $self->feature->strand < 0); + + my $unit_label = $self->option('units') || ''; + my $unit_divider = $self->option('unit_divider') || 1; + + my $units = $self->calculate_units($start/$unit_divider,$self->feature->length/$unit_divider); + my $divisor = $UNITS{$units} || 1; + + $divisor *= $unit_divider; + + my $format = min($self->feature->length,$self->panel->length)/$divisor > 10 + ? "%d$units%s" : "%.6g$units%s"; + + my $scale = $self->option('scale') || 1; ## Does the user want to override the internal scale? + + my $model = sprintf("$format ",$stop/($divisor*$scale),$unit_label); + my $minlen = $width * length($model); + + my ($major_interval,$minor_interval) = $self->panel->ticks(($stop-$start+1)/$unit_divider,$minlen); + + my $left = $sw ? $x1+$height : $x1; + my $right = $ne ? $x2-$height : $x2; + + # adjust for portions of arrow that are outside panel + $start += $self->panel->start - $self->feature->start + if $self->feature->start < $self->panel->start; + $stop -= $self->feature->end - $self->panel->end + if $self->feature->end > $self->panel->end; + + my $first_tick = $major_interval * int(0.5 + $start/$major_interval); + my $last_tick = $major_interval * int(0.5 + $stop/$major_interval); + + for (my $i = $first_tick; $i <= $last_tick; $i += $major_interval) { + + my $tickpos = $dx + ($reversed ? $self->map_pt($stop - $i + $offset) + : $self->map_pt($i + $offset)); + next if $tickpos < $left or $tickpos > $right; + + $gd->line($tickpos,$center-$a2,$tickpos,$center+$a2,$fg); + my $label = $scale ? $i / $scale : $i; + my $scaled = $label/$divisor; + $label = sprintf($format,$scaled,$unit_label); + + my $middle = $tickpos - (length($label) * $width)/2; + next if $middle < $left or $middle > $right; + + $gd->string($font,$middle,$center+$a2-1,$label,$font_color) + unless ($self->option('no_tick_label')); + } + + if ($self->option('tick') >= 2) { + + $first_tick = $minor_interval * int(0.5 + $start/$minor_interval); + $last_tick = $minor_interval * int(0.5 + $stop/$minor_interval); + + my $a4 = $self->height/4; + for (my $i = $first_tick; $i <= $last_tick; $i += $minor_interval) { + my $tickpos = $dx + ($reversed ? $self->map_pt($stop - $i + $offset) + : $self->map_pt($i + $offset)); + next if $tickpos < $left or $tickpos > $right; + + $gd->line($tickpos,$center-$a4,$tickpos,$center+$a4,$fg); + } + } + } + + # add a label if requested + $self->draw_label($gd,$dx,$dy) if $self->option('label'); + $self->draw_description($gd,$dx,$dy) if $self->option('description'); +} + +sub arrowheads { + my $self = shift; + my ($ne,$sw,$base_e,$base_w); + if ($self->option('double')) { + $ne = $sw = 1; + } else { + $ne = $self->option('northeast') || $self->option('east'); + $sw = $self->option('southwest') || $self->option('west'); + } + # otherwise use strandedness to define the arrow + unless (defined($ne) || defined($sw)) { + # turn on both if neither specified + $ne = 1 if $self->feature->strand > 0; + $sw = 1 if $self->feature->strand < 0; + ($ne,$sw) = ($sw,$ne) if $self->{flip}; + } + return ($sw,$ne,0,0) unless $self->option('base'); + return ($sw,$ne, + (!$sw && $self->feature->start>= $self->panel->start), + (!$ne && $self->feature->end <= $self->panel->end)); +} + +sub no_trunc { 0; } + +sub calculate_units { + my $self = shift; + my ($start,$length) = @_; + return 'G' if $length >= 1e9; + return 'M' if $length >= 1e6; + return 'k' if $length >= 1e3; + return '' if $length >= 1; + return 'c' if $length >= 1e-2; + return 'm' if $length >= 1e-3; + return 'u' if $length >= 1e-6; + return 'n' if $length >= 1e-9; + return 'p'; +} + +sub min { $_[0]<$_[1] ? $_[0] : $_[1] } + +1; + +__END__ + +=head1 NAME + +Ace::Graphics::Glyph::arrow - The "arrow" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws arrows. Depending on options, the arrows can be +labeled, be oriented vertically or horizontally, or can contain major +and minor ticks suitable for use as a scale. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -tick Whether to draw major 0 + and minor ticks. + 0 = no ticks + 1 = major ticks + 2 = minor ticks + + -parallel Whether to draw the arrow 1 (true) + parallel to the sequence + or perpendicular to it. + + -northeast Force a north or east 1 (true) + arrowhead(depending + on orientation) + + -east synonym of above + + -southwest Force a south or west 1 (true) + arrowhead(depending + on orientation) + + -west synonym of above + + -double force-doubleheaded arrow 0 (false) + + -base Draw a vertical base at the 0 (false) + non-arrowhead side + + -scale Reset the labels on the arrow 0 (false) + to reflect an externally + established scale. + + -arrowstyle "regular" to create a simple regular + arrowhead. "filled" to create + a thick filled arrowhead + + -units add units to the tick labels none + e.g. bp + + -unit_divider 1 + divide tick labels by the + indicated amount prior to + displaying (use, for example + if you want to display in + cR units) + +Set -parallel to 0 (false) to display a point-like feature such as a +polymorphism, or to indicate an important location. If the feature +start == end, then the glyph will draw a single arrow at the +designated location: + + ^ + | + +Otherwise, there will be two arrows at the start and end: + + ^ ^ + | | + +Scale: Pass in a externally established scale to reset the labels on +the arrow. This is particularly useful for manually constructed +images where the founding parameters of the panel are not 1-based. +For example, a genetic map interval ranging from 0.1 - 0.3 can be +constructed by first multiplying every value by 100. Passing + + arrow(-scale=>100); + +will draw tick marks labelled appropriately to your external scale. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/box.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/box.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,112 @@ +package Bio::Graphics::Glyph::box; +# DAS-compatible package to use for drawing a box + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::generic; +@ISA = 'Bio::Graphics::Glyph::generic'; + +sub subseq { + return (); +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::box - The "box" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This is the most basic glyph. It draws a filled box and optionally a +label. It does *NOT* draw subparts, and so is useful for semantic +zooming when one is zoomed out to far to see substructure. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/cds.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/cds.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,363 @@ +package Bio::Graphics::Glyph::cds; + +use strict; +use Bio::Graphics::Glyph::segments; +use Bio::Graphics::Util qw(frame_and_offset); +use Bio::Tools::CodonTable; +use Bio::Graphics::Glyph::translation; +use vars '@ISA'; +@ISA = qw(Bio::Graphics::Glyph::segmented_keyglyph Bio::Graphics::Glyph::translation); + +my %default_colors = qw( + frame0f cadetblue + frame1f blue + frame2f darkblue + frame0r darkred + frame1r red + frame2r crimson + ); + +sub connector { 0 }; +sub description { + my $self = shift; + return if $self->level; + return $self->SUPER::description; +}; + +sub default_color { + my ($self,$key) = @_; + return $self->factory->translate_color($default_colors{$key}); +} + +sub sixframe { + my $self = shift; + $self->{sixframe} = $self->option('sixframe') + unless exists $self->{sixframe}; + return $self->{sixframe}; +} + +sub require_subparts { + my $self = shift; + my $rs = $self->option('require_subparts'); + $rs = $self->feature->type eq 'coding' if !defined $rs; # shortcut for the "coding" aggregator + $rs; +} + +# figure out (in advance) the color of each component +sub draw { + my $self = shift; + my ($gd,$left,$top) = @_; + + my @parts = $self->parts; + @parts = $self if !@parts && $self->level == 0 && !$self->require_subparts; + + my $fits = $self->protein_fits; + + # draw the staff (musically speaking) + my ($x1,$y1,$x2,$y2) = $self->bounds($left,$top); + my $line_count = $self->sixframe ? 6 : 3; + my $height = ($y2-$y1)/$line_count; + my $grid = $self->gridcolor; + for (0..$line_count-1) { + my $offset = $y1+$height*$_+1; + $gd->line($x1,$offset,$x2,$offset,$grid); + } + + $self->{cds_part2color} ||= {}; + my $fill = $self->bgcolor; + my $strand = $self->feature->strand; + + # figure out the colors of each part + # sort minus strand features backward + @parts = map { $_->[0] } + sort { $b->[1] <=> $a->[1] } + map { [$_, $_->left ] } @parts if $strand < 0; + my $translate_table = Bio::Tools::CodonTable->new; + + for (my $i=0; $i < @parts; $i++) { + my $part = $parts[$i]; + my $feature = $part->feature; + my $pos = $strand > 0 ? $feature->start : $feature->end; + my $phase = eval {$feature->phase} || 0; + my ($frame,$offset) = frame_and_offset($pos, + $feature->strand, + -$phase); + my $suffix = $strand < 0 ? 'r' : 'f'; + my $key = "frame$frame$suffix"; + $self->{cds_frame2color}{$key} ||= $self->color($key) || $self->default_color($key) || $fill; + $part->{cds_partcolor} = $self->{cds_frame2color}{$key}; + $part->{cds_frame} = $frame; + $part->{cds_offset} = $offset; + + if ($fits && $part->feature->seq) { + + # do in silico splicing in order to find the codon that + # arises from the splice + my $protein = $part->feature->translate(undef,undef,$phase)->seq; + $part->{cds_translation} = $protein; + + BLOCK: { + length $protein >= $feature->length/3 and last BLOCK; + ($feature->length - $phase) % 3 == 0 and last BLOCK; + + my $next_part = $parts[$i+1] + or do { + $part->{cds_splice_residue} = '?'; + last BLOCK; }; + + my $next_feature = $next_part->feature or last BLOCK; + my $next_phase = eval {$next_feature->phase} or last BLOCK; + my $splice_codon = ''; + my $left_of_splice = substr($feature->seq,-$next_phase,$next_phase); + my $right_of_splice = substr($next_feature->seq,0,3-$next_phase); + $splice_codon = $left_of_splice . $right_of_splice; + length $splice_codon == 3 or last BLOCK; + my $amino_acid = $translate_table->translate($splice_codon); + $part->{cds_splice_residue} = $amino_acid; + } + } + } + + $self->Bio::Graphics::Glyph::generic::draw($gd,$left,$top); +} + + +# draw the notes on the staff +sub draw_component { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $color = $self->{cds_partcolor} or return; + my $feature = $self->feature; + my $frame = $self->{cds_frame}; + my $linecount = $self->sixframe ? 6 : 3; + + unless ($self->protein_fits) { + my $height = ($y2-$y1)/$linecount; + my $offset = $y1 + $height*$frame; + $offset += ($y2-$y1)/2 if $self->sixframe && $self->strand < 0; + $gd->filledRectangle($x1,$offset,$x2,$offset+2,$color); + return; + } + + # we get here if there's room to draw the primary sequence + my $font = $self->font; + my $pixels_per_residue = $self->pixels_per_residue; + my $strand = $feature->strand; + my $y = $y1-1; + + $strand *= -1 if $self->{flip}; + + # have to remap feature start and end into pixel coords in order to: + # 1) correctly align the amino acids with the nucleotide seq + # 2) correct for the phase offset + my $start = $self->map_no_trunc($feature->start + $self->{cds_offset}); + my $stop = $self->map_no_trunc($feature->end + $self->{cds_offset}); + ($start,$stop) = ($stop,$start) if $self->{flip}; + + my @residues = split '',$self->{cds_translation}; + + push @residues,$self->{cds_splice_residue} if $self->{cds_splice_residue}; + for (my $i=0;$i<@residues;$i++) { + my $x = $strand > 0 ? $start + $i * $pixels_per_residue + : $stop - $i * $pixels_per_residue; + next unless ($x >= $x1 && $x <= $x2); + $gd->char($font,$x+1,$y,$residues[$i],$color); + } +} + +sub make_key_feature { + my $self = shift; + my @gatc = qw(g a t c); + my $offset = $self->panel->offset; + my $scale = 1/$self->scale; # base pairs/pixel + my $start = $offset; + my $stop = $offset + 100 * $scale; + my $seq = join('',map{$gatc[rand 4]} (1..1500)); + my $feature = + Bio::Graphics::Feature->new(-start=> $start, + -end => $stop, + -seq => $seq, + -name => $self->option('key'), + -strand=> +1, + ); + $feature->add_segment(Bio::Graphics::Feature->new( + -start=> $start, + -end => $start + ($stop - $start)/2, + -seq => $seq, + -name => $self->option('key'), + -strand=> +1, + ), + Bio::Graphics::Feature->new( + -start=> $start + ($stop - $start)/2+1, + -end => $stop, + -seq => $seq, + -name => $self->option('key'), + -phase=> 1, + -strand=> +1, + )); + $feature; +} + +# never allow our components to bump +sub bump { + my $self = shift; + return $self->SUPER::bump(@_) if $self->all_callbacks; + return 0; +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::cds - The "cds" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws features that are associated with a protein coding +region. At high magnifications, draws a series of boxes that are +color-coded to indicate the frame in which the translation occurs. At +low magnifications, draws the amino acid sequence of the resulting +protein. Amino acids that are created by a splice are optionally +shown in a distinctive color. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -frame0f Color for first (+) frame background color + + -frame1f Color for second (+) frame background color + + -frame2f Color for third (+) frame background color + + -frame0r Color for first (-) frame background color + + -frame1r Color for second (-) frame background color + + -frame2r Color for third (-) frame background color + + -gridcolor Color for the "staff" lightslategray + + -sixframe Draw a six-frame staff 0 (false; usually draws 3 frame) + + -require_subparts + Don't draw the reading frame 0 (false) + unless it is a feature + subpart. + +The -require_subparts option is suggested when rendering spliced +transcripts which contain multiple CDS subparts. Otherwise, the glyph +will hickup when zoomed way down onto an intron between two CDSs (a +phantom reading frame will appear). For unspliced sequences, do *not* +use -require_subparts. + +=head1 SUGGESTED STANZA FOR GENOME BROWSER + +Using the "coding" aggregator, this produces a nice gbrowse display. + + [CDS] + feature = coding + glyph = cds + frame0f = cadetblue + frame1f = blue + frame2f = darkblue + frame0r = darkred + frame1r = red + frame2r = crimson + description = 0 + height = 13 + label = CDS frame + key = CDS + citation = This track shows CDS reading frames. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/crossbox.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/crossbox.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,124 @@ +package Bio::Graphics::Glyph::crossbox; + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +# override draw_component to draw a crossed box rather than empty +sub draw_component { + my $self = shift; + my $gd = shift; + my $fg = $self->fgcolor; + my ($left,$top) = @_; + my($x1,$y1,$x2,$y2) = $self->bounds(@_); + $self->unfilled_box($gd, + $x1, $y1, + $x2, $y2); + + if ($self->option('bgcolor')){ + my $c = $self->color('bgcolor'); + my $xmid = ($x2+$x1)/2; + my $ymid = ($y2+$y1)/2; + $gd->fill($xmid,$ymid,$c); + } + + $gd->line($x1,$y1,$x2,$y2,$fg); + $gd->line($x1,$y2,$x2,$y1,$fg); +} + + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::crossbox - The "crossbox" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This is a box with an 'X' inside the glyph. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/diamond.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/diamond.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +package Bio::Graphics::Glyph::diamond; +# DAS-compatible package to use for drawing a colored diamond + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::generic; +@ISA = 'Bio::Graphics::Glyph::generic'; + +sub draw_component { + my $self = shift; + my $gd = shift; + my $fg = $self->fgcolor; + + # find the center and vertices + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + my $xmid = ($x1+$x2)/2; + my $ymid = ($y1+$y2)/2; + + my $h = $self->option('height')/2; + $y1 = $ymid - $h; + $y2 = $ymid + $h; + + # if it's a point-like feature, then draw symmetrically + # around the midpoing + if ($self->option('point') || $x2 - $x1 < $h*2) { + $x1 = $xmid - $h; + $x2 = $xmid + $h; + } + + elsif ($self->option('fallback_to_rectangle')) { + return $self->SUPER::draw_component($gd,@_); + } + + $gd->line($x1,$ymid,$xmid,$y1,$fg); + $gd->line($xmid,$y1,$x2,$ymid,$fg); + $gd->line($x2,$ymid,$xmid,$y2,$fg); + $gd->line($xmid,$y2,$x1,$ymid,$fg); + + if (my $c = $self->bgcolor) { + $gd->fillToBorder($xmid,$ymid,$fg,$c); + } +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::diamond - The "diamond" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws a diamond of fixed size, positioned in the center of +the feature. The height and width of the diamond are set by the +"height" option. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +If the feature is wider than a point, then the label and description +are placed where the feature's boundary is, and not where the diamond +is drawn. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/dna.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/dna.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,286 @@ +package Bio::Graphics::Glyph::dna; + +use strict; +use Bio::Graphics::Glyph::generic; +use vars '@ISA'; +@ISA = qw(Bio::Graphics::Glyph::generic); + +my %complement = (g=>'c',a=>'t',t=>'a',c=>'g',n=>'n', + G=>'C',A=>'T',T=>'A',C=>'G',N=>'N'); + +# turn off description +sub description { 0 } + +# turn off label +# sub label { 1 } + +sub height { + my $self = shift; + my $font = $self->font; + return $self->dna_fits ? 2*$font->height + : $self->do_gc ? $self->SUPER::height + : 0; +} + +sub do_gc { + my $self = shift; + my $do_gc = $self->option('do_gc'); + return if defined($do_gc) && !$do_gc; + return 1; +} + +sub draw_component { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $dna = eval { $self->feature->seq }; + $dna = $dna->seq if ref($dna) and $dna->can('seq'); # to catch Bio::PrimarySeqI objects + $dna or return; + + # workaround for my misreading of interface -- LS + $dna = $dna->seq if ref($dna) && $dna->can('seq'); + + if ($self->dna_fits) { + $self->draw_dna($gd,$dna,$x1,$y1,$x2,$y2); + } elsif ($self->do_gc) { + $self->draw_gc_content($gd,$dna,$x1,$y1,$x2,$y2); + } +} + +sub draw_dna { + my $self = shift; + + my ($gd,$dna,$x1,$y1,$x2,$y2) = @_; + my $pixels_per_base = $self->scale; + + my $feature = $self->feature; + + my $strand = $feature->strand; + $strand *= -1 if $self->{flip}; + + my @bases = split '',$strand >= 0 ? $dna : $self->reversec($dna); + my $color = $self->fgcolor; + my $font = $self->font; + my $lineheight = $font->height; + $y1 -= $lineheight/2 - 3; + my $strands = $self->option('strand') || 'auto'; + + my ($forward,$reverse); + if ($strands eq 'auto') { + $forward = $feature->strand >= 0; + $reverse = $feature->strand <= 0; + } elsif ($strands eq 'both') { + $forward = $reverse = 1; + } elsif ($strands eq 'reverse') { + $reverse = 1; + } else { + $forward = 1; + } + + my $start = $self->map_no_trunc($feature->start); + my $end = $self->map_no_trunc($feature->end); + + my $offset = int(($x1-$start-1)/$pixels_per_base); + + for (my $i=$offset;$i<@bases;$i++) { + my $x = $start + $i * $pixels_per_base; + next if $x+1 < $x1; + last if $x > $x2; + $gd->char($font,$x+1,$y1,$bases[$i],$color) if $forward; + $gd->char($font,$x+1,$y1+($forward ? $lineheight:0),$complement{$bases[$i]}||$bases[$i],$color) if $reverse; + } + +} + +sub draw_gc_content { + my $self = shift; + my $gd = shift; + my $dna = shift; + my ($x1,$y1,$x2,$y2) = @_; + + my $bin_size = length($dna) / ($self->option('gc_bins') || 100); + $bin_size = 100 if $bin_size < 100; + + my @bins; + for (my $i = 0; $i < length($dna) - $bin_size; $i+= $bin_size) { + my $subseq = substr($dna,$i,$bin_size); + my $gc = $subseq =~ tr/gcGC/gcGC/; + my $content = $gc/$bin_size; + push @bins,$content; + } + push @bins,0.5 unless @bins; # avoid div by zero + my $bin_width = ($x2-$x1)/@bins; + my $bin_height = $y2-$y1; + my $fgcolor = $self->fgcolor; + my $bgcolor = $self->factory->translate_color($self->panel->gridcolor); + my $axiscolor = $self->color('axis_color') || $fgcolor; + + $gd->line($x1, $y1, $x1, $y2, $axiscolor); + $gd->line($x2-2,$y1, $x2-2,$y2, $axiscolor); + $gd->line($x1, $y1, $x1+3,$y1, $axiscolor); + $gd->line($x1, $y2, $x1+3,$y2, $axiscolor); + $gd->line($x1, ($y2+$y1)/2,$x1+3,($y2+$y1)/2,$axiscolor); + $gd->line($x2-4,$y1, $x2-1, $y1, $axiscolor); + $gd->line($x2-4,$y2, $x2-1, $y2, $axiscolor); + $gd->line($x2-4,($y2+$y1)/2,$x2-1,($y2+$y1)/2,$axiscolor); + $gd->line($x1+5,$y2, $x2-5,$y2, $bgcolor); + $gd->line($x1+5,($y2+$y1)/2,$x2-5,($y2+$y1)/2,$bgcolor); + $gd->line($x1+5,$y1, $x2-5,$y1, $bgcolor); + $gd->string($self->font,$x1+5,$y1,'% gc',$axiscolor) if $bin_height > $self->font->height*2; + + for (my $i = 0; $i < @bins; $i++) { + my $bin_start = $x1+$i*$bin_width; + my $bin_stop = $bin_start + $bin_width; + my $y = $y2 - ($bin_height*$bins[$i]); + $gd->line($bin_start,$y,$bin_stop,$y,$fgcolor); + $gd->line($bin_stop,$y,$bin_stop,$y2 - ($bin_height*$bins[$i+1]),$fgcolor) + if $i < @bins-1; + } +} + +sub make_key_feature { + my $self = shift; + my @gatc = qw(g a t c); + my $offset = $self->panel->offset; + my $scale = 1/$self->scale; # base pairs/pixel + + my $start = $offset+1; + my $stop = $offset+100*$scale; + my $feature = + Bio::Graphics::Feature->new(-start=> $start, + -stop => $stop, + -seq => join('',map{$gatc[rand 4]} (1..500)), + -name => $self->option('key'), + -strand => '+1', + ); + $feature; +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::dna - The "dna" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws DNA sequences. At high magnifications, this glyph +will draw the actual base pairs of the sequence (both strands). At +low magnifications, the glyph will plot the GC content. + +For this glyph to work, the feature must return a DNA sequence string +in response to the dna() method. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -do_gc Whether to draw the GC true + graph at low mags + + -gc_bins Fixed number of intervals 100 + to sample across the + panel. + + -axis_color Color of the vertical axes fgcolor + in the GC content graph + + -strand Show both forward and auto + reverse strand, one of + "forward", "reverse", + "both" or "auto". + In "auto" mode, + +1 strand features will + show the plus strand + -1 strand features will + show the reverse complement + and strandless features will + show both + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/dot.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/dot.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,150 @@ +package Bio::Graphics::Glyph::dot; +# DAS-compatible package to use for drawing a ring or filled circle + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::generic; +@ISA = 'Bio::Graphics::Glyph::generic'; +use constant PI => 3.14159; + +sub draw { + my $self = shift; +# $self->SUPER::draw(@_); + my $gd = shift; + my $fg = $self->fgcolor; + + # now draw a circle + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + my $xmid = (($x1+$x2)/2); my $width = abs($x2-$x1); + my $ymid = (($y1+$y2)/2); my $height = abs($y2-$y1); + + #only point ovals allowed now + my $r = $self->height ; + $gd->arc($xmid,$ymid,$r,$r,0,360,$fg); + + + if ($self->option('bgcolor')){ + my $c = $self->color('bgcolor'); + $gd->fill($xmid,$ymid,$c); + } + + #how about a fuse for the bomb? + #work in degrees, not radians. So we define PI above + if(defined $self->option('stem')){ + my $angle = $self->option('stem'); + + $gd->line($xmid+($r/PI*sin($angle*PI/180)), + $ymid+($r/PI*cos($angle*PI/180)), + $xmid+($r*sin($angle*PI/180)), + $ymid+($r*cos($angle*PI/180)),$fg); + } + + $self->draw_label($gd,@_) if $self->option('label'); +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::dot - The "dot" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws an ellipse the width of the scaled feature passed, +and height a possibly configured height (See Bio::Graphics::Glyph). + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -point Whether to draw an ellipse feature width + the scaled width of the + feature or with radius + point. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/ellipse.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/ellipse.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,114 @@ +package Bio::Graphics::Glyph::ellipse; + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph; +@ISA = 'Bio::Graphics::Glyph'; + +# override draw_component to draw an oval rather than a rectangle (weird) +sub draw_component { + my $self = shift; + my $gd = shift; + my ($left,$top) = @_; + my($x1,$y1,$x2,$y2) = $self->bounds(@_); + $self->filled_oval($gd, + $x1, $y1, + $x2, $y2); +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::ellipse - The "ellipse" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws an oval instead of a box; otherwise it is similar to +the "generic" or "box" glyphs. The width of the oval is determined by +the feature width, and the height by the -height option. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/ex.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/ex.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,125 @@ +package Bio::Graphics::Glyph::ex; + +use strict; +use base 'Bio::Graphics::Glyph::generic'; + +# override draw_component to draw a crossed box rather than empty +sub draw_component { + my $self = shift; + my $gd = shift; + my $fg = $self->fgcolor; + my ($left,$top) = @_; + my($x1,$y1,$x2,$y2) = $self->bounds(@_); + + #if widthless + if($self->option('point')){ + my $arm = int($self->height/2); + my $minx = $x2 > $x1 ? $x1 : $x2; + my $centerx = abs($x2 - $x1) + $minx; + my $miny = $y2 > $y1 ? $y1 : $y2; + my $centery = abs($y2 - $y1) + $miny; + $gd->line($centerx-$arm, $centery-$arm, $centerx+$arm, $centery+$arm, $fg); + $gd->line($centerx-$arm, $centery+$arm, $centerx+$arm, $centery-$arm, $fg); + return; + } else { + $gd->line($x1,$y1,$x2,$y2,$fg); + $gd->line($x1,$y2,$x2,$y1,$fg); + } +} + + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::crossbox - The "crossbox" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This is a box with an 'X' inside glyph. + +=head2 OPTIONS + + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/extending_arrow.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/extending_arrow.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,77 @@ +package Bio::Graphics::Glyph::extending_arrow; + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::anchored_arrow; +@ISA = 'Bio::Graphics::Glyph::anchored_arrow'; + +=head1 NAME + +Bio::Graphics::Glyph::extending_arrow -- The "extending arrow" glyph + +=head1 SYNOPSIS + +This is deprecated. Use L +instead. + +=head1 DESCRIPTION + +This glyph was designed to show a segment that goes beyond the panel. +If the segment is contained within the panel, a vertical base is +shown. Otherwise, an arrow is shown. + +Also see the arrow glyph. + +=head2 OPTIONS + +See L. This glyph has been +deprecated. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Originally by Shengqiang Shu. Temporarily deprecated by Lincoln +Stein. + +Copyright (c) 2001 Berkeley Drosophila Genome Project + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/generic.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/generic.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,300 @@ +package Bio::Graphics::Glyph::generic; + +use strict; +use Bio::Graphics::Glyph; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph'; + +my %complement = (g=>'c',a=>'t',t=>'a',c=>'g', + G=>'C',A=>'T',T=>'A',C=>'G'); + +# new options are 'label' -- short label to print over glyph +# 'description' -- long label to print under glyph +# label and description can be flags or coderefs. +# If a flag, label will be taken from seqname, if it exists or primary_tag(). +# description will be taken from source_tag(). + +sub pad_top { + my $self = shift; + my $top = $self->option('pad_top'); + return $top if defined $top; + my $pad = $self->SUPER::pad_top; + $pad += $self->labelheight if $self->label; + $pad; +} +sub pad_bottom { + my $self = shift; + my $bottom = $self->option('pad_bottom'); + return $bottom if defined $bottom; + my $pad = $self->SUPER::pad_bottom; + $pad += $self->labelheight if $self->description; + $pad; +} +sub pad_right { + my $self = shift; + my $pad = $self->SUPER::pad_right; + my $label_width = length($self->label||'') * $self->font->width; + my $description_width = length($self->description||'') * $self->font->width; + my $max = $label_width > $description_width ? $label_width : $description_width; + my $right = $max - $self->width; + return $pad > $right ? $pad : $right; +} + +sub labelheight { + my $self = shift; + return $self->{labelheight} ||= $self->font->height; +} +sub label { + my $self = shift; + return if $self->{overbumped}; # set by the bumper when we have hit bump limit + return unless $self->{level} == 0; + return exists $self->{label} ? $self->{label} + : ($self->{label} = $self->_label); +} +sub description { + my $self = shift; + return if $self->{overbumped}; # set by the bumper when we have hit bump limit + return unless $self->{level} == 0; + return exists $self->{description} ? $self->{description} + : ($self->{description} = $self->_description); +} +sub _label { + my $self = shift; + + # allow caller to specify the label + my $label = $self->option('label'); + return unless defined $label; + return $label unless $label eq '1'; + return "1" if $label eq '1 '; # 1 with a space + + + # figure it out ourselves + my $f = $self->feature; + + return $f->display_name if $f->can('display_name'); + return $f->info if $f->can('info'); # deprecated API + return $f->seq_id if $f->can('seq_id'); + return eval{$f->primary_tag}; +} +sub _description { + my $self = shift; + + # allow caller to specify the long label + my $label = $self->option('description'); + return unless defined $label; + return $label unless $label eq '1'; + return "1" if $label eq '1 '; + + return $self->{_description} if exists $self->{_description}; + return $self->{_description} = $self->get_description($self->feature); +} + +sub get_description { + my $self = shift; + my $feature = shift; + + # common places where we can get descriptions + return join '; ',$feature->notes if $feature->can('notes'); + return $feature->desc if $feature->can('desc'); + + my $tag = $feature->source_tag; + return undef if $tag eq ''; + $tag; +} + +sub draw { + my $self = shift; + $self->SUPER::draw(@_); + $self->draw_label(@_) if $self->option('label'); + $self->draw_description(@_) if $self->option('description'); +} + +sub draw_label { + my $self = shift; + my ($gd,$left,$top,$partno,$total_parts) = @_; + my $label = $self->label or return; + my $x = $self->left + $left; + $x = $self->panel->left + 1 if $x <= $self->panel->left; + my $font = $self->option('labelfont') || $self->font; + $gd->string($font, + $x, + $self->top + $top, + $label, + $self->fontcolor); +} +sub draw_description { + my $self = shift; + my ($gd,$left,$top,$partno,$total_parts) = @_; + my $label = $self->description or return; + my $x = $self->left + $left; + $x = $self->panel->left + 1 if $x <= $self->panel->left; + $gd->string($self->font, + $x, + $self->bottom - $self->pad_bottom + $top, + $label, + $self->font2color); +} + +sub dna_fits { + my $self = shift; + + my $pixels_per_base = $self->scale; + my $font = $self->font; + my $font_width = $font->width; + + return $pixels_per_base >= $font_width; +} + +sub arrowhead { + my $self = shift; + my $gd = shift; + my ($x,$y,$height,$orientation) = @_; + + my $fg = $self->set_pen; + my $style = $self->option('arrowstyle') || 'regular'; + + if ($style eq 'filled') { + my $poly = new GD::Polygon; + if ($orientation >= 0) { + $poly->addPt($x-$height,$y-$height); + $poly->addPt($x,$y); + $poly->addPt($x-$height,$y+$height,$y); + } else { + $poly->addPt($x+$height,$y-$height); + $poly->addPt($x,$y); + $poly->addPt($x+$height,$y+$height,$y); + } + $gd->filledPolygon($poly,$fg); + } else { + if ($orientation >= 0) { + $gd->line($x-$height,$y-$height,$x,$y,$fg); + $gd->line($x,$y,$x-$height,$y+$height,$fg); + } else { + $gd->line($x+$height,$y-$height,$x,$y,$fg); + $gd->line($x,$y,$x+$height,$y+$height,$fg); + } + } +} + +sub arrow { + my $self = shift; + my $gd = shift; + my ($x1,$x2,$y) = @_; + + my $fg = $self->set_pen; + my $height = $self->height/3; + + $gd->line($x1,$y,$x2,$y,$fg); + $self->arrowhead($gd,$x2,$y,$height,+1) if $x1 < $x2; + $self->arrowhead($gd,$x2,$y,$height,-1) if $x2 < $x1; +} + +sub reversec { + $_[1]=~tr/gatcGATC/ctagCTAG/; + return scalar reverse $_[1]; +} + +1; + +=head1 NAME + +Bio::Graphics::Glyph::generic - The "generic" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This is identical to the "box" glyph. It is the default glyph used +when not otherwise specified. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -pad_top Top padding 0 + + -pad_bottom Bottom padding 0 + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + +-pad_top and -pad_bottom allow you to insert some blank space between +the glyph's boundary and its contents. This is useful if you are +changing the glyph's height dynamically based on its feature's score. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/graded_segments.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/graded_segments.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,218 @@ +package Bio::Graphics::Glyph::graded_segments; +#$Id: graded_segments.pm,v 1.12.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::segments; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::segments'; + +# override draw method to calculate the min and max values for the components +sub draw { + my $self = shift; + + # bail out if this isn't the right kind of feature + # handle both das-style and Bio::SeqFeatureI style, + # which use different names for subparts. + my @parts = $self->parts; + @parts = $self if !@parts && $self->level == 0; + return $self->SUPER::draw(@_) unless @parts; + + # figure out the colors + my $max_score = $self->option('max_score'); + my $min_score = $self->option('min_score'); + unless (defined $max_score && defined $min_score) { + for my $part (@parts) { + my $s = eval { $part->feature->score }; + next unless defined $s; + $max_score = $s if !defined $max_score or $s > $max_score; + $min_score = $s if !defined $min_score or $s < $min_score; + } + } + + return $self->SUPER::draw(@_) + unless defined($max_score) && defined($min_score) + && $min_score < $max_score; + + my $span = $max_score - $min_score; + + # allocate colors + my $fill = $self->bgcolor; + my ($red,$green,$blue) = $self->panel->rgb($fill); + + foreach my $part (@parts) { + my $s = eval { $part->feature->score }; + unless (defined $s) { + $part->{partcolor} = $fill; + next; + } + my ($r,$g,$b) = $self->calculate_color($s,[$red,$green,$blue],$min_score,$span); + my $idx = $self->panel->translate_color($r,$g,$b); + $part->{partcolor} = $idx; + } + $self->SUPER::draw(@_); +} + +sub calculate_color { + my $self = shift; + my ($s,$rgb,$min_score,$span) = @_; + return map { 255 - (255-$_) * min(max( ($s-$min_score)/$span, 0), 1) } @$rgb; +} + +sub min { $_[0] < $_[1] ? $_[0] : $_[1] } +sub max { $_[0] > $_[1] ? $_[0] : $_[1] } + +sub subseq { + my $class = shift; + my $feature = shift; + return $feature->segments if $feature->can('segments'); + return $feature->sub_SeqFeature if $feature->can('sub_SeqFeature'); + return; +} + +# synthesize a key glyph +sub keyglyph { + my $self = shift; + + my $scale = 1/$self->scale; # base pairs/pixel + + # two segments, at pixels 0->50, 60->80 + my $offset = $self->panel->offset; + + my $feature = + Bio::Graphics::Feature->new( + -segments=>[ [ 0*$scale +$offset,20*$scale+$offset], + [ 30*$scale +$offset,50*$scale+$offset], + [60*$scale+$offset, 80*$scale+$offset] + ], + -name => $self->option('key'), + -strand => '+1'); + ($feature->segments)[0]->score(10); + ($feature->segments)[1]->score(50); + ($feature->segments)[2]->score(100); + my $factory = $self->factory->clone; + $factory->set_option(label => 1); + $factory->set_option(bump => 0); + $factory->set_option(connector => 'solid'); + return $factory->make_glyph($feature); +} + +# component draws a shaded box +sub bgcolor { + my $self = shift; + return $self->{partcolor} || $self->SUPER::bgcolor; +} +sub fgcolor { + my $self = shift; + return $self->{partcolor} || $self->SUPER::fgcolor; +} + +1; + +=head1 NAME + +Bio::Graphics::Glyph::graded_segments - The "graded_segments" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This is identical to the "alignment" glyph, and is used for +drawing features that consist of discontinuous segments. The +color intensity of each segment is proportionate to the score. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -max_score Maximum value of the Calculated + feature's "score" attribute + + -min_score Minimum value of the Calculated + feature's "score" attribute + +If max_score and min_score are not specified, then the glyph will +calculate the local maximum and minimum scores at run time. + + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/group.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/group.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,80 @@ +package Bio::Graphics::Glyph::group; + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::segmented_keyglyph; +@ISA = 'Bio::Graphics::Glyph::segmented_keyglyph'; + +# group sets connector to 'dashed' +sub connector { + my $self = shift; + my $super = $self->SUPER::connector(@_); + return $super if $self->all_callbacks; + return 'dashed' unless defined($super) && ($super eq 'none' or !$super); +} + +# we don't label group (yet) +sub label { 0 } + +sub new { + my $self = shift->SUPER::new(@_); + # reset our parts to level zero + foreach (@{$self->{parts}}) { + $_->{level} = 0; + } + $self; +} + +# don't allow simple bumping in groups -- it looks terrible... +sub bump { + my $bump = shift->SUPER::bump(@_); + return unless defined $bump; + return 1 if $bump > 1; + return -1 if $bump < -1; + return $bump; +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::group - The "group" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used internally by Bio::Graphics::Panel for laying out +groups of glyphs that move in concert. It should not be used +explicitly. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, L, L, +L, L, +L, +L, +L, +L, +L, +L, + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/heterogeneous_segments.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/heterogeneous_segments.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,174 @@ +package Bio::Graphics::Glyph::heterogeneous_segments; + +# this glyph acts like graded_segments but the bgcolor of each segment is +# controlled by the source field of the feature. Use the source field name +# to set the background color: +# -waba_strong => 'blue' +# -waba_weak => 'red' +# -waba_coding => 'green' + +# $Id: heterogeneous_segments.pm,v 1.5 2002/12/23 01:33:41 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::graded_segments; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::graded_segments'; + +# override draw method to calculate the min and max values for the components +sub draw { + my $self = shift; + + # bail out if this isn't the right kind of feature + # handle both das-style and Bio::SeqFeatureI style, + # which use different names for subparts. + my @parts = $self->parts; + @parts = $self if !@parts && $self->level == 0; + return $self->SUPER::draw(@_) unless @parts; + + # figure out the colors + $self->{source2color} ||= {}; + my $fill = $self->bgcolor; + for my $part (@parts) { + my $s = eval { $part->feature->source_tag } or next; + $self->{source2color}{$s} ||= $self->color(lc($s)."_color") || $fill; + $part->{partcolor} = $self->{source2color}{$s}; + } + + $self->Bio::Graphics::Glyph::generic::draw(@_); +} + + +# synthesize a key glyph +sub keyglyph { + my $self = shift; + + my $scale = 1/$self->scale; # base pairs/pixel + + # two segments, at pixels 0->50, 60->80 + my $offset = $self->panel->offset; + + my $feature = + Bio::Graphics::Feature->new( + -segments=>[ [ 0*$scale +$offset,25*$scale+$offset], + [ 25*$scale +$offset,50*$scale+$offset], + [ 50*$scale+$offset, 75*$scale+$offset] + ], + -name => $self->option('key'), + -strand => '+1'); + my @sources = grep {/_color$/} $self->factory->options; + foreach (@sources) {s/_color$//} + ($feature->segments)[0]->source_tag($sources[1]); + ($feature->segments)[1]->source_tag($sources[0]); + ($feature->segments)[2]->source_tag($sources[2]); + my $factory = $self->factory->clone; + $factory->set_option(label => 1); + $factory->set_option(bump => 0); + $factory->set_option(connector => 'solid'); + my $glyph = $factory->make_glyph(0,$feature); + return $glyph; +} + +1; + +=head1 NAME + +Bio::Graphics::Glyph::heterogeneous_segments - The "heterogeneous_segments" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph acts like graded_segments but the bgcolor of each segment (sub-feature) +can be individually set using the source field of the feature. + +Each segment type color is specified using the following nomenclature: + + -{source}_color => $color + +For example, if the feature consists of a gene containing both +confirmed and unconfirmed exons, you can make the confirmed exons +green and the unconfirmed ones red this way: + + -confirmed_color => 'green', + -unconfirmed_color => 'red' + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/line.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/line.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,128 @@ +package Bio::Graphics::Glyph::line; +# an arrow without the arrowheads + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +sub bottom { + my $self = shift; + my $val = $self->SUPER::bottom(@_); + $val += $self->font->height if $self->option('tick'); + $val += $self->labelheight if $self->option('label'); + $val; +} + +sub draw { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + + my $fg = $self->fgcolor; + my $a2 = $self->SUPER::height/2; + my $center = $y1+$a2; + + $gd->line($x1,$center,$x2,$center,$fg); + # add a label if requested + $self->draw_label($gd,@_) if $self->option('label'); + +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::line - The "line" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws a line parallel to the sequence segment. + +=head2 OPTIONS + +This glyph takes only the standard options. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/oval.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/oval.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,102 @@ +package Bio::Graphics::Glyph::oval; + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::ellipse; +@ISA = 'Bio::Graphics::Glyph::ellipse'; + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::oval - The "oval" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws an oval instead of a box. It is an alias for the +"ellipse" glyph. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/pinsertion.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/pinsertion.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,127 @@ +package Bio::Graphics::Glyph::pinsertion; +# package to use for drawing P insertion as a triangle +# p insertion is a point (one base). + +use strict; +use GD; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +sub box { + my $self = shift; + my $half = $self->insertion_width/2; + return ($self->left-$half,$self->top,$self->right+$half,$self->bottom); +} + +sub insertion_width { + my $self = shift; + return $self->option('insertion_width') || 6; +} + +# override draw method +sub draw { + my $self = shift; + + my $gd = shift; + my ($left,$top) = @_; + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + + my $height = $self->height; + + my $half = $self->insertion_width/2; + + my $fill = $self->bgcolor; + + my $poly = GD::Polygon->new; + + if ($self->feature->strand > 0) { #plus strand + $poly->addPt($x1 - $half, $y1); + $poly->addPt($x1 + ($half), $y1); + $poly->addPt($x1, $y2); #pointer down + } else { + $poly->addPt($x1, $y1); #pointer up + $poly->addPt($x1 - $half, $y2); + $poly->addPt($x1 + ($half), $y2); + } + $gd->filledPolygon($poly, $fill); + $gd->polygon($poly, $fill); + + # add a label if requested + $self->draw_label($gd,$left,$top) if $self->option('label'); + $self->draw_description($gd,$left,$top) if $self->option('description'); +} + + +1; + +=head1 NAME + +Bio::Graphics::Glyph::pinsertion - The "Drosophila P-element Insertion" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph was designed to show P-element insertions in the Drosophila +genome, but in fact is suitable for any type of zero-width feature. +Also see the triangle glyph. + +=head2 OPTIONS + +In addition to the generic options, this glyph recognizes: + + Option Name Description Default + ----------- ----------- ------- + + -insertion_width Width of glyph in pixels 3 + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE, Shengqiang Shu Esshu@bdgp.lbl.govE + +Copyright (c) 2001 Cold Spring Harbor Laboratory, BDGP + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/primers.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/primers.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,151 @@ +package Bio::Graphics::Glyph::primers; +# package to use for drawing something that looks like +# primer pairs. + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +use constant HEIGHT => 4; + +# we do not need the default amount of room +#sub calculate_height { +# my $self = shift; +# return $self->option('label') ? HEIGHT + $self->labelheight + 2 : HEIGHT; +#} + +# override draw method +sub draw { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + + my $fg = $self->fgcolor; + my $a2 = HEIGHT/2; + my $center = $y1 + $a2; + + # just draw us as a solid line -- very simple + if ($x2-$x1 < HEIGHT*2) { + $gd->line($x1,$center,$x2,$center,$fg); + return; + } + + # otherwise draw two pairs of arrows + # --> <-- + my $trunc_left = $x1 < $self->panel->left; + my $trunc_right = $x2 > $self->panel->right; + + unless ($trunc_left) { + $gd->line($x1,$center,$x1 + HEIGHT,$center,$fg); + $gd->line($x1 + HEIGHT,$center,$x1 + HEIGHT - $a2,$center-$a2,$fg); + $gd->line($x1 + HEIGHT,$center,$x1 + HEIGHT - $a2,$center+$a2,$fg); + } + + unless ($trunc_right) { + $gd->line($x2,$center,$x2 - HEIGHT,$center,$fg); + $gd->line($x2 - HEIGHT,$center,$x2 - HEIGHT + $a2,$center+$a2,$fg); + $gd->line($x2 - HEIGHT,$center,$x2 - HEIGHT + $a2,$center-$a2,$fg); + } + + # connect the dots if requested + if ($self->connect) { + my $c = $self->color('connect_color') || $self->bgcolor; + $gd->line($x1 + ($trunc_left ? 0 : HEIGHT + 2),$center, + $x2 - ($trunc_right ? 0 : HEIGHT + 2),$center, + $c); + } + + # add a label if requested + $self->draw_label($gd,@_) if $self->option('label'); + $self->draw_description($gd,@_) if $self->option('description'); + +} + +sub connect { + my $self = shift; + return $self->option('connect') if defined $self->option('connect'); + 1; # default +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::primers - The "STS primers" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws two arrows oriented towards each other and connected +by a line of a contrasting color. The length of the arrows is +immaterial, but the length of the glyph itself corresponds to the +length of the scaled feature. + +=head2 OPTIONS + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -connect Whether to connect the true + two arrowheads by a line. + + -connect_color The color to use for the bgcolor + connecting line. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/processed_transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/processed_transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,179 @@ +package Bio::Graphics::Glyph::processed_transcript; + +# $Id: processed_transcript.pm,v 1.3.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::transcript2; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::transcript2'; +use constant DEFAULT_UTR_COLOR => '#D0D0D0'; + +sub is_utr { + my $self = shift; + return $self->feature->primary_tag =~ /UTR|untranslated_region/i; +} + +sub thin_utr { + my $self = shift; + $self->option('thin_utr'); +} + +sub utr_color { + my $self = shift; + return $self->color('utr_color') if $self->option('utr_color'); + return $self->factory->translate_color(DEFAULT_UTR_COLOR); +} + +sub height { + my $self = shift; + my $height = $self->SUPER::height; + return $height unless $self->thin_utr; + return $self->is_utr ? int($height/1.5+0.5) : $height; +} + +sub pad_top { + my $self = shift; + my $pad_top = $self->SUPER::pad_top; + return $pad_top unless $self->thin_utr && $self->is_utr; + return $pad_top + int(0.167*$self->SUPER::height + 0.5); +} + +sub bgcolor { + my $self = shift; + return $self->SUPER::bgcolor unless $self->is_utr; + return $self->utr_color; +} + +sub connector { + my $self = shift; + return 'quill' if $self->option('decorate_introns'); + return $self->SUPER::connector(@_); +} + + +1; + + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::processed_transcript - The sequence ontology transcript glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used for drawing processed transcripts that have both +CDS and UTR segments. The CDS is drawn in the background color, and +the UTRs are drawn in an alternate color selected by the utr_color +option. In addition, you can make the UTRs thinner than the CDS by +setting the "thin_utr" option. + +For this glyph to produce the desired results, you should pass it a +compound Bio::SeqFeature that has subfeatures of primary_tag "CDS" and +"UTR". In fact, you may give it more specific types of UTR, including +5'-UTR, 3'-UTR, or the Sequence Ontology terms "untranslated_region," +"five_prime_untranslated_region," and +"three_prime_untranslated_region." + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type undef (false) + + -connector_color + Connector color black + + -label Whether to draw a label undef (false) + + -description Whether to draw a description undef (false) + + -strand_arrow Whether to indicate undef (false) + strandedness + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -thin_utr Flag. If true, UTRs will undef (false) + be drawn at 2/3 of the + height of CDS segments. + + -utr_color Color of UTR segments. Gray #D0D0D0 + + -decorate_introns + Draw strand with little arrows undef (false) + on the intron. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/redgreen_box.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/redgreen_box.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,191 @@ +package Bio::Graphics::Glyph::redgreen_box; +#$Id: redgreen_box.pm,v 1.4.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::generic; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; + +sub bgcolor { + my $self = shift; + $self->{force_bgcolor}; +} + +sub fgcolor { + my $self = shift; + return $self->option('border') ? $self->SUPER::fgcolor : $self->{force_bgcolor}; +} + +sub draw { + my $self = shift; + my $val = $self->feature->score; + + # we're going to force all our parts to share the same colors + # since the + my @parts = $self->parts; + @parts = $self if !@parts && $self->level == 0; + my @rgb = map {int($_)} HSVtoRGB(120*(1-$val),1,255); + my $color = $self->panel->translate_color(@rgb); + $_->{force_bgcolor} = $color foreach @parts; + + $self->SUPER::draw(@_); +} + +sub HSVtoRGB ($$$) { + my ($h,$s,$v)=@_; + my ($r,$g,$b,$i,$f,$p,$q,$t); + + if( $s == 0 ) { + ## achromatic (grey) + return ($v,$v,$v); + } + + $h /= 60; ## sector 0 to 5 + $i = int($h); + $f = $h - $i; ## factorial part of h + $p = $v * ( 1 - $s ); + $q = $v * ( 1 - $s * $f ); + $t = $v * ( 1 - $s * ( 1 - $f ) ); + + if($i<1) { + $r = $v; + $g = $t; + $b = $p; + } elsif($i<2){ + $r = $q; + $g = $v; + $b = $p; + } elsif($i<3){ + $r = $p; + $g = $v; + $b = $t; + } elsif($i<4){ + $r = $p; + $g = $q; + $b = $v; + } elsif($i<5){ + $r = $t; + $g = $p; + $b = $v; + } else { + $r = $v; + $g = $p; + $b = $q; + } + return ($r,$g,$b); +} + +sub mMin { + my $n=10000000000000; + map { $n=($n>$_) ? $_ : $n } @_; + return($n); +} + +sub mMax { + my $n=0; + map { $n=($n<$_) ? $_ : $n } @_; + return($n); +} + + +1; + +=head1 NAME + +Bio::Graphics::Glyph::redgreen_box - The "redgreen_box" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is similar to the graded_segments glyph except that it +generates a green-Ered gradient suitable for use with microarray data. +A feature score of 0 is full green; a feature score of 1.0 is full +red; intermediate scores are shades of yellow. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +The following glyph-specific option is recognized: + + -border Draw a fgcolor border around 0 (false) + the box + + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/redgreen_segment.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/redgreen_segment.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,167 @@ +package Bio::Graphics::Glyph::redgreen_segment; +#$Id: redgreen_segment.pm,v 1.3.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::graded_segments; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::graded_segments'; + +sub calculate_color { + my $self = shift; + my $val = shift; + return (0,0,0) unless $val =~ /^[\d.]+$/; + return HSVtoRGB(120*(1-$val),1,255); +} + +sub HSVtoRGB ($$$) { + my ($h,$s,$v)=@_; + my ($r,$g,$b,$i,$f,$p,$q,$t); + + if( $s == 0 ) { + ## achromatic (grey) + return ($v,$v,$v); + } + + $h /= 60; ## sector 0 to 5 + $i = int($h); + $f = $h - $i; ## factorial part of h + $p = $v * ( 1 - $s ); + $q = $v * ( 1 - $s * $f ); + $t = $v * ( 1 - $s * ( 1 - $f ) ); + + if($i<1) { + $r = $v; + $g = $t; + $b = $p; + } elsif($i<2){ + $r = $q; + $g = $v; + $b = $p; + } elsif($i<3){ + $r = $p; + $g = $v; + $b = $t; + } elsif($i<4){ + $r = $p; + $g = $q; + $b = $v; + } elsif($i<5){ + $r = $t; + $g = $p; + $b = $v; + } else { + $r = $v; + $g = $p; + $b = $q; + } + return ($r,$g,$b); +} + +sub mMin { + my $n=10000000000000; + map { $n=($n>$_) ? $_ : $n } @_; + return($n); +} + +sub mMax { + my $n=0; + map { $n=($n<$_) ? $_ : $n } @_; + return($n); +} + + +1; + +=head1 NAME + +Bio::Graphics::Glyph::redgreen_segments - The "redgreen_segments" glyph + +=head1 SYNOPSIS + +See L and L. + +=head1 DESCRIPTION + +This glyph is similar to the graded_segments glyph except that it +generates a green-Ered gradient suitable for use with microarray data. +A feature score of 0 is full green; a feature score of 1.0 is full +red; intermediate scores are shades of yellow. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/rndrect.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/rndrect.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,120 @@ +package Bio::Graphics::Glyph::rndrect; + +use strict; +use base 'Bio::Graphics::Glyph::generic'; + +# override draw_component to draw an round edge rect rather than a rectangle +sub draw_component { + my $self = shift; + my $gd = shift; + my ($left,$top) = @_; + my($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_);#$self->bounds(@_); + require GD; + my $poly = GD::Polygon->new; + my $boxheight = $y2 - $y1; + + if (($x2-$x1) > 3) { + $poly->addPt($x1+1, $y1+1); + $poly->addPt($x1+2, $y1); + $poly->addPt($x2-2, $y1); + $poly->addPt($x2-1, $y1+1); + $poly->addPt($x2, $y1 + $boxheight / 2) + if (($y2 - $y1) > 6); + + $poly->addPt($x2-1, $y2-1); + $poly->addPt($x2-2, $y2); + $poly->addPt($x1+2, $y2); + $poly->addPt($x1+1, $y2-1); + $poly->addPt($x1, $y1 + $boxheight / 2) + if (($y2 - $y1) > 6); + } else { + $poly->addPt($x1, $y1); + $poly->addPt($x2, $y1); + + $poly->addPt($x2, $y2); + $poly->addPt($x1, $y2); + } + + $gd->filledPolygon($poly, $self->fillcolor); + + $gd->polygon($poly, $self->fgcolor); +} + +# group sets connector to 'solid' +sub connector { + my $self = shift; + return $self->SUPER::connector(@_) if $self->all_callbacks; + return 'solid'; +} + +sub bump { + my $self = shift; + return $self->SUPER::bump(@_) if $self->all_callbacks; + return 0; +} + + +1; + + +=head1 NAME + +Bio::Graphics::Glyph::rndrect - The "round rect" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph was designed to show seq features in round edge rectangles. +The glyph will be a rectangle if its width is E 4 pixels + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Shengqiang Shu Esshu@bdgp.lbl.govE + +Copyright (c) 2001 BDGP + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/ruler_arrow.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/ruler_arrow.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,351 @@ +package Bio::Graphics::Glyph::ruler_arrow; +# package to use for drawing an arrow as ruler (5' and 3' are marked as label) + +use strict; +use vars '@ISA'; +use Bio::Graphics::Glyph::generic; +@ISA = 'Bio::Graphics::Glyph::generic'; + +my %UNITS = (K => 1000, + M => 1_000_000, + G => 1_000_000_000); + +sub pad_bottom { + my $self = shift; + my $val = $self->SUPER::pad_bottom(@_); + $val += $self->font->height if $self->option('tick'); + $val; +} + +# override draw method +sub draw { + my $self = shift; + my $parallel = $self->option('parallel'); + $parallel = 1 unless defined $parallel; + $self->draw_parallel(@_) if $parallel; + $self->draw_perpendicular(@_) unless $parallel; + $self->draw_label(@_) if ($self->option('label')); +} + +sub draw_perpendicular { + my $self = shift; + my $gd = shift; + my ($dx,$dy) = @_; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $ne = $self->option('northeast'); + my $sw = $self->option('southwest'); + $ne = $sw = 1 unless defined($ne) || defined($sw); + + # draw a perpendicular arrow at position indicated by $x1 + my $fg = $self->set_pen; + my $a2 = ($y2-$y1)/4; + + my @positions = $x1 == $x2 ? ($x1) : ($x1,$x2); + for my $x (@positions) { + if ($ne) { + $gd->line($x,$y1,$x,$y2,$fg); + $gd->line($x-$a2,$y1+$a2,$x,$y1,$fg); + $gd->line($x+$a2,$y1+$a2,$x,$y1,$fg); + } + if ($sw) { + $gd->line($x,$y1,$x,$y2,$fg); + $gd->line($x-$a2,$y2-$a2,$x,$y2,$fg); + $gd->line($x+$a2,$y2-$a2,$x,$y2,$fg); + } + } + + # add a label if requested +# $self->draw_label($gd,$dx,$dy) if ($self->option('label') && !$self->option('ruler')); + # this draws the label aligned to the left +} + +sub draw_parallel { + my $self = shift; + my $gd = shift; + my ($dx,$dy) = @_; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $fg = $self->set_pen; + my $a2 = ($self->height)/2; + my $center = $y1+$a2; + + $x1 = $self->panel->left if $x1 < $self->panel->left; + $x2 = $self->panel->right if $x2 > $self->panel->right; + + my ($sw,$ne,$base_w,$base_e) = $self->arrowheads; + $gd->line($x1,$center,$x2,$center,$fg); + $self->arrowhead($gd,$x1,$center,$a2,-1) if $sw; # west arrow + $self->arrowhead($gd,$x2,$center,$a2,+1) if $ne; # east arrow + $gd->line($x2,$center-$a2,$x2,$center+$a2,$fg) if $base_e; #east base + $gd->line($x1,$center-$a2,$x1,$center+$a2,$fg) if $base_w; #west base + + # turn on ticks + if ($self->option('tick')) { + local $^W = 0; # dumb uninitialized variable warning + my $font = $self->font; + my $width = $font->width; + my $font_color = $self->fontcolor; + my $height = $self->height; + + my $relative = $self->option('relative_coords'); + my $start = $relative ? 1 : $self->feature->start; + my $stop = $start + $self->feature->length - 1; + + my $offset = $relative ? $self->feature->start-1 : 0; + my $reversed = $self->feature->strand < 0; + + my $units = $self->option('units') || ''; + my $divisor = $UNITS{$units} || 1 if $units; + + my ($major_ticks,$minor_ticks) = $self->panel->ticks($start,$stop,$font,$divisor); + + ## Does the user want to override the internal scale? + my $scale = $self->option('scale'); + + my $left = $sw ? $x1+$height : $x1; + my $right = $ne ? $x2-$height : $x2; + + my $format = ($major_ticks->[1]-$major_ticks->[0])/($divisor||1) < 1 ? "%.1f$units" : "%d$units"; + + for my $i (@$major_ticks) { + my $tickpos = $dx + ($reversed ? $self->map_pt($stop - $i + $offset) + : $self->map_pt($i + $offset)); + next if $tickpos < $left or $tickpos > $right; + $gd->line($tickpos,$center-$a2,$tickpos,$center+$a2,$fg); + my $label = $scale ? $i / $scale : $i; + if ($units) { + my $scaled = $label/$divisor; + $label = sprintf($format,$scaled); + } + my $middle = $tickpos - (length($label) * $width)/2; + $gd->string($font,$middle,$center+$a2-1,$label,$font_color) + unless ($self->option('no_tick_label')); + } + + if ($self->option('tick') >= 2) { + my $a4 = $self->height/4; + for my $i (@$minor_ticks) { + my $tickpos = $dx + ($reversed ? $self->map_pt($stop - $i + $offset) + : $self->map_pt($i + $offset)); + next if $tickpos < $left or $tickpos > $right; + $gd->line($tickpos,$center-$a4,$tickpos,$center+$a4,$fg); + } + } + } + + # add a label if requested +# $self->draw_label($gd,$dx,$dy) if ($self->option('label'); +# $self->draw_description($gd,$dx,$dy) if $self->option('description'); +} + +sub arrowheads { + my $self = shift; + my ($ne,$sw,$base_e,$base_w); + if ($self->option('double')) { + $ne = $sw = 1; + } else { + $ne = $self->option('northeast') || $self->option('east'); + $sw = $self->option('southwest') || $self->option('west'); + } + # otherwise use strandedness to define the arrow + unless (defined($ne) || defined($sw)) { + # turn on both if neither specified + $ne = 1 if $self->feature->strand > 0; + $sw = 1 if $self->feature->strand < 0; + } + return ($sw,$ne,0,0) unless $self->option('base'); + return ($sw,$ne,!$sw,!$ne); +} + +sub draw_label { + my $self = shift; + my ($gd,$left,$top) = @_; + + my $label5 = "5'"; + my $label3 = "3'"; + my $relative = $self->option('relative_coords'); + my $start = $relative ? 1 : $self->feature->start; + my $stop = $start + $self->feature->length - 1; + + my $offset = $relative ? $self->feature->start-1 : 0; + my $reversed = $self->feature->strand < 0; + + my $units = $self->option('units') || ''; + my $divisor = $UNITS{$units} || 1 if $units; + + my ($major_ticks,$minor_ticks) = $self->panel->ticks($start,$stop,$self->font,$divisor); + my $tick_scale = " ($major_ticks bp/"; + $tick_scale .= ($self->option('tick') >= 2)?"major tick)":"tick)"; + + my $top_left_label = $label5; + $top_left_label .= $tick_scale if ($self->option('no_tick_label') && $self->option('tick')); + #-1 direction mean lower end is 3' (minus strand on top) + ($label5, $label3) = ($label3, $label5) if ($self->option('direction') == -1); + my $x = $self->left + $left; + $x = $self->panel->left + 1 if $x <= $self->panel->left; + my $font = $self->option('labelfont') || $self->font; + $gd->string($font, + $x, + $self->top + $top, + $top_left_label, + $self->fontcolor); + my $x1 = $left + $self->panel->right - $font->width*length($label3); + $gd->string($font, + $x1, + $self->top + $top, + $label3, + $self->fontcolor); + if ($self->option('both')) {#minus strand as well + $gd->string($font, + $x, + $self->bottom - $self->pad_bottom + $top, + $label3, + $self->fontcolor); + my $x1 = $left + $self->panel->right - $font->width*length($label5); + $gd->string($font, + $x1, + $self->bottom - $self->pad_bottom + $top, + $label5, + $self->fontcolor); + } +} + + +1; + + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::arrow - The "ruler_arrow" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws arrows. Label, if requested, will be 5' and 3' at both ends +and tick scale is printed if no_tick_label option is set and tick option set. +Depending on options, the arrows can be labeled, be oriented vertically +or horizontally, or can contain major and minor ticks suitable for use as a scale. + +=head2 OPTIONS + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -tick Whether to draw major 0 + and minor ticks. + 0 = no ticks + 1 = major ticks + 2 = minor ticks + -label 5' at start, 3' at end 0 + above arrow + -both 5', 3' above, 0 + and 3', 5' below arrow + -direction 0 = ruler is plus strand 0 + -1 = ruler is minus strand + + -parallel Whether to draw the arrow true + parallel to the sequence + or perpendicular to it. + + -northeast Force a north or east true + arrowhead(depending + on orientation) + + -east synonym of above + + -southwest Force a south or west true + arrowhead(depending + on orientation) + + -west synonym of above + + -double force-doubleheaded arrow + + -base Draw a vertical base at the false + non-arrowhead side + + -scale Reset the labels on the arrow false + to reflect an externally + established scale. + +Set -parallel to false to display a point-like feature such as a +polymorphism, or to indicate an important location. If the feature +start == end, then the glyph will draw a single arrow at the +designated location: + + ^ + | + +Otherwise, there will be two arrows at the start and end: + + ^ ^ + | | + +Scale: Pass in a externally established scale to reset the labels on +the arrow. This is particularly useful for manually constructed +images where the founding parameters of the panel are not 1-based. +For example, a genetic map interval ranging from 0.1 - 0.3 can be +constructed by first multiplying every value by 100. Passing + + arrow(-scale=>100); + +will draw tick marks labelled appropriately to your external scale. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Shengqiang Shu Esshu@bdgp.lbl.govE +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 BDGP, Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/segmented_keyglyph.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/segmented_keyglyph.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,91 @@ +package Bio::Graphics::Glyph::segmented_keyglyph; + +# $Id: segmented_keyglyph.pm,v 1.5 2002/07/10 01:25:00 lstein Exp $ +# Don't use this package. It's just for inheriting the segmented glyph in the panel key. + +use strict; +use Bio::Graphics::Glyph::generic; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; + +sub make_key_feature { + my $self = shift; + my $scale = 1/$self->scale; # base pairs/pixel + # two segments, at pixels 0->50, 60->80 + my $offset = $self->panel->offset; + my $feature = + Bio::Graphics::Feature->new( + -segments=>[ [ 0*$scale +$offset,50*$scale+$offset], + [60*$scale+$offset, 80*$scale+$offset] + ], + -name => $self->option('key'), + -strand => '+1', + ); +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::segmented_keyglyph - The "segmented_keyglyph" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used internally by Bio::Graphics::Panel as a base calss +for drawing the keys at the bottom of the panel. It should not be +used explicitly. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/segments.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/segments.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,347 @@ +package Bio::Graphics::Glyph::segments; +#$Id: segments.pm,v 1.21.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Location::Simple; +use Bio::Graphics::Glyph::generic; +use Bio::Graphics::Glyph::segmented_keyglyph; +use vars '@ISA'; + +use constant RAGGED_START_FUZZ => 25; # will show ragged ends of alignments + # up to this many bp. +use constant DEBUG => 0; + +@ISA = qw( Bio::Graphics::Glyph::segmented_keyglyph + Bio::Graphics::Glyph::generic + ); + +my %complement = (g=>'c',a=>'t',t=>'a',c=>'g',n=>'n', + G=>'C',A=>'T',T=>'A',C=>'G',N=>'N'); + +sub pad_left { + my $self = shift; + return $self->SUPER::pad_left unless $self->option('draw_target') && $self->option('ragged_start') && $self->dna_fits; + return $self->SUPER::pad_left unless $self->level > 0; + my $target = eval {$self->feature->hit} or return $self->SUPER::pad_left; + return $self->SUPER::pad_left unless $target->start<$target->end && $target->start < RAGGED_START_FUZZ; + return ($target->start-1) * $self->scale; +} + +sub pad_right { + my $self = shift; + return $self->SUPER::pad_right unless $self->level > 0; + return $self->SUPER::pad_right unless $self->option('draw_target') && $self->option('ragged_start') && $self->dna_fits; + my $target = eval {$self->feature->hit} or return $self->SUPER::pad_right; + return $self->SUPER::pad_right unless $target->end < $target->start && $target->start < RAGGED_START_FUZZ; + return ($target->end-1) * $self->scale; +} + +# group sets connector to 'solid' +sub connector { + my $self = shift; + return $self->SUPER::connector(@_) if $self->all_callbacks; + return ($self->SUPER::connector(@_) || 'solid'); +} + +# never allow our components to bump +sub bump { + my $self = shift; + return $self->SUPER::bump(@_) if $self->all_callbacks; + return 0; +} + +sub fontcolor { + my $self = shift; + return $self->SUPER::fontcolor unless $self->option('draw_target') || $self->option('draw_dna'); + return $self->SUPER::fontcolor unless $self->dna_fits; + return $self->bgcolor; +} + +sub draw_component { + my $self = shift; + my ($draw_dna,$draw_target) = ($self->option('draw_dna'),$self->option('draw_target')); + return $self->SUPER::draw_component(@_) + unless $draw_dna || $draw_target; + return $self->SUPER::draw_component(@_) unless $self->dna_fits; + + my $dna = $draw_target ? eval {$self->feature->hit->seq} + : eval {$self->feature->seq}; + return $self->SUPER::draw_component(@_) unless length $dna > 0; # safety + + my $show_mismatch = $draw_target && $self->option('show_mismatch'); + my $genomic = eval {$self->feature->seq} if $show_mismatch; + + my $gd = shift; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + # adjust for nonaligned left end (for ESTs...) The size given here is roughly sufficient + # to show a polyA end or a C. elegans trans-spliced leader. + my $offset = 0; + eval { # protect against data structures that don't implement the target() method. + if ($draw_target && $self->option('ragged_start')){ + my $target = $self->feature->hit; + if ($target->start < $target->end && $target->start < RAGGED_START_FUZZ + && $self->{partno} == 0) { + $offset = $target->start - 1; + if ($offset > 0) { + $dna = $target->subseq(1-$offset,0)->seq . $dna; + $genomic = $self->feature->subseq(1-$offset,0)->seq . $genomic; + $x1 -= $offset * $self->scale; + } + } + elsif ($target->end < $target->start && + $target->end < RAGGED_START_FUZZ && $self->{partno} == $self->{total_parts}) { + $offset = $target->end - 1; + if ($offset > 0) { + $dna .= $target->factory->get_dna($target,$offset,1); + $genomic = $self->feature->subseq(-$offset,0)->seq . $genomic; + $x2 += $offset * $self->scale; + $offset = 0; + } + } + } + }; + + $self->draw_dna($gd,$offset,lc $dna,lc $genomic,$x1,$y1,$x2,$y2); +} + +sub draw_dna { + my $self = shift; + + my ($gd,$start_offset,$dna,$genomic,$x1,$y1,$x2,$y2) = @_; + my $pixels_per_base = $self->scale; + my $feature = $self->feature; + my $target = $feature->target; + my $strand = $feature->strand; + + my @segs; + + my $complement = $strand < 0; + + if ($self->{flip}) { + $dna = $self->reversec($dna); + $genomic = $self->reversec($genomic); + $strand *= -1; + } + + warn "strand = $strand, complement = $complement" if DEBUG; + + if ($genomic && length($genomic) != length($dna) && eval { require Bio::Graphics::Browser::Realign}) { + warn "$genomic\n$dna\n" if DEBUG; + warn "strand = $strand" if DEBUG; + @segs = Bio::Graphics::Browser::Realign::align_segs($genomic,$dna); + for my $seg (@segs) { + my $src = substr($genomic,$seg->[0],$seg->[1]-$seg->[0]+1); + my $tgt = substr($dna, $seg->[2],$seg->[3]-$seg->[2]+1); + warn "@$seg\n$src\n$tgt" if DEBUG; + } + } else { + @segs = [0,length($genomic)-1,0,length($dna)-1]; + } + + my $color = $self->fgcolor; + my $font = $self->font; + my $lineheight = $font->height; + my $fontwidth = $font->width; + $y1 -= $lineheight/2 - 3; + my $pink = $self->factory->translate_color('lightpink'); + my $panel_end = $self->panel->right; + + my $start = $self->map_no_trunc($self->feature->start- $start_offset); + my $end = $self->map_no_trunc($self->feature->end - $start_offset); + + my ($last,$tlast); + for my $seg (@segs) { + + # fill in misaligned bits with dashes and bases + if (defined $last) { + my $delta = $seg->[0] - $last - 1; + my $tdelta = $seg->[2] - $tlast - 1; + warn "src gap [$last,$seg->[0]], tgt gap [$tlast,$seg->[2]], delta = $delta, tdelta = $tdelta\n" if DEBUG; + + my $gaps = $delta - $tdelta; + my @fill_in = split '',substr($dna,$tlast+1,$tdelta) if $tdelta > 0; + unshift @fill_in,('-')x$gaps if $gaps > 0; + + warn "gaps = $gaps, fill_in = @fill_in\n" if DEBUG; + + my $distance = $pixels_per_base * ($delta+1); + my $pixels_per_target = $gaps >= 0 ? $pixels_per_base : $distance/(@fill_in+1); + + warn "pixels_per_base = $pixels_per_base, pixels_per_target=$pixels_per_target\n" if DEBUG; + my $offset = $self->{flip} ? $end + ($last-1)*$pixels_per_base : $start + $last*$pixels_per_base; + + for (my $i=0; $i<@fill_in; $i++) { + + my $x = $self->{flip} ? int($offset + ($i+1)*$pixels_per_target + 0.5) + : int($offset + ($i+1)*$pixels_per_target + 0.5); + + $self->filled_box($gd,$x,$y1+3,$x+$fontwidth,$y1+$lineheight-3,$pink,$pink) unless $gaps; + $gd->char($font,$x,$y1,$complement? $complement{$fill_in[$i]} : $fill_in[$i],$color); + } + } + + my @genomic = split '',substr($genomic,$seg->[0],$seg->[1]-$seg->[0]+1); + my @bases = split '',substr($dna, $seg->[2],$seg->[3]-$seg->[2]+1); + for (my $i = 0; $i<@bases; $i++) { + my $x = $self->{flip} ? int($end + ($seg->[0] + $i - 1)*$pixels_per_base + 0.5) + : int($start + ($seg->[0] + $i) *$pixels_per_base + 0.5); + next if $x+1 < $x1; + last if $x+1 > $x2; + if ($genomic[$i] && lc($bases[$i]) ne lc($complement ? $complement{$genomic[@genomic - $i - 1]} : $genomic[$i])) { + $self->filled_box($gd,$x,$y1+3,$x+$fontwidth,$y1+$lineheight-3,$pink,$pink); + } + $gd->char($font,$x,$y1,$complement ? $complement{$bases[$i]} || $bases[$i] : $bases[$i],$color); + } + $last = $seg->[1]; + $tlast = $seg->[3]; + } + +} + +# Override _subseq() method to make it appear that a top-level feature that +# has no subfeatures appears as a feature that has a single subfeature. +# Otherwise at high mags gaps will be drawn as components rather than +# as connectors. Because of differing representations of split features +# in Bio::DB::GFF::Feature and Bio::SeqFeature::Generic, there is +# some breakage of encapsulation here. +sub _subseq { + my $self = shift; + my $feature = shift; + my @subseq = $self->SUPER::_subseq($feature); + return @subseq if @subseq; + if ($self->level == 0 && !@subseq && !eval{$feature->compound}) { + my($start,$end) = ($feature->start,$feature->end); + ($start,$end) = ($end,$start) if $start > $end; # to keep Bio::Location::Simple from bitching + # return Bio::Location::Simple->new(-start=>$start,-end=>$end); + return $self->feature; + } else { + return; + } +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::segments - The "segments" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used for drawing features that consist of discontinuous +segments. Unlike "graded_segments" or "alignment", the segments are a +uniform color and not dependent on the score of the segment. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + + -draw_dna If true, draw the dna residues 0 (false) + when magnification level + allows. + + -draw_target If true, draw the dna residues 0 (false) + of the TARGET sequence when + magnification level allows. + SEE NOTE. + + -ragged_start When combined with -draw_target, 0 (false) + draw a few bases beyond the end + of the alignment. SEE NOTE. + + -show_mismatch When combined with -draw_target, 0 (false) + highlights mismatched bases in + pink. SEE NOTE. + +The -draw_target and -ragged_start options only work with seqfeatures +that implement the hit() method (Bio::SeqFeature::SimilarityPair). +The -ragged_start option is mostly useful for looking for polyAs and +cloning sites at the beginning of ESTs and cDNAs. Currently there is +no way of activating ragged ends. The length of the ragged starts is +hard-coded at 25 bp, and the color of mismatches is hard-coded as +light pink. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/span.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/span.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,88 @@ +package Bio::Graphics::Glyph::span; + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::anchored_arrow'; +use Bio::Graphics::Glyph::anchored_arrow; + +sub no_trunc { 0 } + +sub arrowheads {0,0,1,1} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::span - The "span" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws a span that looks like this: + + |-----------------------------| + +If one or both ends go off the edges of the panel, they are truncated: + + ----------------------| left end off picture + |---------------------------- right end off picture + ------------------------------------- both ends off picture + +=head1 OPTIONS + +The standard options are recognized. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/splice_site.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/splice_site.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,115 @@ +package Bio::Graphics::Glyph::splice_site; + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +use constant PWIDTH => 3; + +sub draw_component { + my $self = shift; + my $gd = shift; + my ($left,$top) = @_; + my($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $center = int(0.5+($x1+$x2)/2); + my $direction = $self->option('direction'); + + my $height = $y2 - $y1; + my $fraction = $self->option('height_fraction') || 1.0; + my $bottom = $y2; + $top = $y2 - $fraction * $height; + + # draw the base + my $fgcolor = $self->fgcolor; + $gd->line($center,$bottom,$center,$top,$fgcolor); + + if ($direction eq 'right') { + $gd->line($center,$top,$center + PWIDTH,$top,$fgcolor); + } elsif ($direction eq 'left') { + $gd->line($center,$top,$center - PWIDTH,$top,$fgcolor); + } + +} + +1; + + +=head1 NAME + +Bio::Graphics::Glyph::splice_site - The "splice_site" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph was designed to show an inverted "L" representing splice +donors and acceptors. The vertical part of the L points downwards and +is positioned in the center of the range (even if the range is quite +large). + +In addition to the usual glyph options, this glyph recognizes: + + Option Value Description + ------ ----- ----------- + + direction "left" or "right" direction the short part of the L + points + + height_fraction 0.0 - 1.0 fractional height of the glyph, + usually a callback that uses the + feature's score to determine its + height + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Xiaokang Pan Epan@cshl.orgE + +Copyright (c) 2001 BDGP + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/toomany.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/toomany.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,101 @@ +package Bio::Graphics::Glyph::toomany; +# DAS-compatible package to use for drawing a box + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +# draw the thing onto a canvas +# this definitely gets overridden +sub draw { + my $self = shift; + my $gd = shift; + my ($left,$top) = @_; + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries($left,$top); + +# $self->filled_oval($gd,$x1,$y1,$x2,$y2); + for (my $m = 3;$m > 0;$m--){ + my $stack = $m * $self->height / 2; + $self->unfilled_box($gd,$x1-$stack,$y1-$stack,$x2-$stack,$y2-$stack); + } + + # add a label if requested + $self->draw_label($gd,$left,($top-($self->height*1.1))) if $self->option('label'); +} + +sub label { + return "too many to display"; +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::toomany - The "too many to show" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is intended for features that are too dense to show +properly. Mostly a placeholder, it currently shows a filled oval. If +you choose a bump of 0, the ovals will overlap, to give a cloud +effect. + +=head2 OPTIONS + +There are no glyph-specific options. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/track.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/track.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,92 @@ +package Bio::Graphics::Glyph::track; + +use strict; +use Bio::Graphics::Glyph; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph'; + +# track sets connector to empty +sub connector { + my $self = shift; + return $self->SUPER::connector(@_) if $self->all_callbacks; + return 'none'; +} + +sub draw { + my $self = shift; + my ($gd,$left,$top,$partno,$total_parts) = @_; + my @parts = $self->parts; + for (my $i=0; $i<@parts; $i++) { + $parts[$i]->draw($gd,$left,$top,0,1); + } +} + +# do nothing for components +# sub draw_component { } + +1; + + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::track - The "track" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used internally by Bio::Graphics::Panel for laying out +tracks. It should not be used explicitly. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,190 @@ +package Bio::Graphics::Glyph::transcript; +# $Id: transcript.pm,v 1.12.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::segments; +use vars '@ISA'; +@ISA = qw( Bio::Graphics::Glyph::segments); + +sub pad_left { + my $self = shift; + my $pad = $self->SUPER::pad_left; + return $pad if $self->{level} > 0; + my $strand = $self->feature->strand; + return $pad unless defined $strand && $strand < 0; + return $self->arrow_length > $pad ? $self->arrow_length : $pad; +} + +sub pad_right { + my $self = shift; + my $pad = $self->SUPER::pad_right; + return $pad if $self->{level} > 0; + my $strand = $self->feature->strand; + return $pad unless defined($strand) && $strand > 0; + return $self->arrow_length > $pad ? $self->arrow_length : $pad; +} + +sub draw_component { + my $self = shift; + return unless $self->level > 0; + $self->SUPER::draw_component(@_); +} + +sub draw_connectors { + my $self = shift; + my $gd = shift; + my ($left,$top) = @_; + $self->SUPER::draw_connectors($gd,$left,$top); + my @parts = $self->parts; + + # H'mmm. No parts. Must be in an intron, so draw intron + # spanning entire range + if (!@parts) { + my($x1,$y1,$x2,$y2) = $self->bounds(0,0); + $self->_connector($gd,$left,$top,$x1,$y1,$x1,$y2,$x2,$y1,$x2,$y2); + @parts = $self; + } + + # flip argument makes this confusing + # certainly there's a simpler way to express this idea + my $strand = $self->feature->strand; + my ($first,$last) = ($parts[0],$parts[-1]); + ($first,$last) = ($last,$first) if exists $self->{flip}; + + if ($strand >= 0) { + my($x1,$y1,$x2,$y2) = $last->bounds(@_); + my $center = ($y2+$y1)/2; + $self->{flip} ? + $self->arrow($gd,$x1,$x1-$self->arrow_length,$center) + : + $self->arrow($gd,$x2,$x2+$self->arrow_length,$center); + } else { + my($x1,$y1,$x2,$y2) = $first->bounds(@_); + my $center = ($y2+$y1)/2; + $self->{flip } ? + $self->arrow($gd,$x2,$x2+$self->arrow_length,$center) + : + $self->arrow($gd,$x1,$x1 - $self->arrow_length,$center); + } +} + +sub arrow_length { + my $self = shift; + return $self->option('arrow_length') || 8; +} + +# override option() for force the "hat" type of connector +sub connector { + my $self = shift; + return $self->SUPER::connector(@_) if $self->all_callbacks; + return ($self->option('connector') || 'hat'); +} + + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::transcript - The "transcript" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used for drawing transcripts. It is essentially a +"segments" glyph in which the connecting segments are hats. The +direction of the transcript is indicated by an arrow attached to the +end of the glyph. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -arrow_length Length of the directional 8 + arrow. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/transcript2.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/transcript2.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,208 @@ +package Bio::Graphics::Glyph::transcript2; + +# $Id: transcript2.pm,v 1.15.2.1 2003/07/05 00:32:04 lstein Exp $ + +use strict; +use Bio::Graphics::Glyph::transcript; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::transcript'; + +use constant MIN_WIDTH_FOR_ARROW => 8; + +sub pad_left { + my $self = shift; + my $pad = $self->Bio::Graphics::Glyph::generic::pad_left; + return $pad unless ($self->feature->strand||0) < 0; #uninitialized var warning + my $first = ($self->parts)[0] || $self; + my @rect = $first->bounds(); + my $width = abs($rect[2] - $rect[0]); + return $self->SUPER::pad_left if $width < MIN_WIDTH_FOR_ARROW; + return $pad; +} + +sub pad_right { + my $self = shift; + my $pad = $self->Bio::Graphics::Glyph::generic::pad_right; + return $pad if $self->{level} > 0; + my $last = ($self->parts)[-1] || $self; + my @rect = $last->bounds(); + my $width = abs($rect[2] - $rect[0]); + return $self->SUPER::pad_right if $width < MIN_WIDTH_FOR_ARROW; + return $pad +} + +sub draw_component { + my $self = shift; + return unless $self->level > 0; + + my $gd = shift; + my ($left,$top) = @_; + my @rect = $self->bounds(@_); + + my $width = abs($rect[2] - $rect[0]); + my $filled = defined($self->{partno}) && $width >= MIN_WIDTH_FOR_ARROW; + + if ($filled) { + my $f = $self->feature; + my $strand = $f->strand; + my ($first,$last) = ($self->{partno} == 0 , $self->{partno} == $self->{total_parts}-1); + ($first,$last) = ($last,$first) if $self->{flip}; + + if ($strand < 0 && $first) { # first exon, minus strand transcript + $self->filled_arrow($gd,-1,@rect); + } elsif ($strand >= 0 && $last) { # last exon, plus strand + $self->filled_arrow($gd,+1,@rect); + } else { + $self->SUPER::draw_component($gd,@_); + } + } + + else { + $self->SUPER::draw_component($gd,@_); + } + +} + +sub draw_connectors { + my $self = shift; + my ($gd,$dx,$dy) = @_; + + my $part; + my $strand = $self->feature->strand; + $strand *= -1 if $self->{flip}; #sigh + if (my @parts = $self->parts) { + $part = $strand >= 0 ? $parts[-1] : $parts[0]; + } else { + # no parts -- so draw an intron spanning whole thing + my($x1,$y1,$x2,$y2) = $self->bounds(0,0); + $self->_connector($gd,$dx,$dy,$x1,$y1,$x1,$y2,$x2,$y1,$x2,$y2); + $part = $self; + } + my @rect = $part->bounds(); + my $width = abs($rect[2] - $rect[0]); + my $filled = $width >= MIN_WIDTH_FOR_ARROW; + + if ($filled) { + $self->Bio::Graphics::Glyph::generic::draw_connectors(@_); + } else { + $self->SUPER::draw_connectors(@_); + } +} + +sub bump { + my $self = shift; + return $self->SUPER::bump(@_) if $self->all_callbacks; + return 0; # never allow our components to bump +} + +1; + + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::transcript2 - The "transcript2" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used for drawing transcripts. It is like "transcript" +except that if there is sufficient room the terminal exon is shaped +like an arrow in order to indicate the direction of transcription. If +there isn't enough room, a small arrow is drawn. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + + -strand_arrow Whether to indicate 0 (false) + strandedness + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -arrow_length Length of the directional 8 + arrow. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/translation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/translation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,405 @@ +package Bio::Graphics::Glyph::translation; + +use strict; +use Bio::Graphics::Glyph::generic; +use Bio::Graphics::Util qw(frame_and_offset); +use vars '@ISA'; +@ISA = qw(Bio::Graphics::Glyph::generic); + +my %default_colors = qw( + frame0f cadetblue + frame1f blue + frame2f darkblue + frame0r darkred + frame1r red + frame2r crimson + ); + +# turn off description +sub description { 0 } + +# turn off label +# sub label { 1 } + +sub default_color { + my ($self,$key) = @_; + return $self->factory->translate_color($default_colors{$key}); +} + +sub height { + my $self = shift; + my $font = $self->font; + my $lines = $self->translation_type eq '3frame' ? 3 + : $self->translation_type eq '6frame' ? 6 + : 1; + return $self->protein_fits ? $lines*$font->height + : $self->SUPER::height; +} + +sub pixels_per_base { + my $self = shift; + return $self->scale; +} + +sub pixels_per_residue { + my $self = shift; + return $self->scale * 3; +} + +sub gridcolor { + my $self = shift; + my $color = $self->option('gridcolor') || 'lightgrey'; + $self->factory->translate_color($color); +} + +sub protein_fits { + my $self = shift; + + my $pixels_per_base = $self->pixels_per_residue; + my $font = $self->font; + my $font_width = $font->width; + + return $pixels_per_base >= $font_width; +} + +sub translation_type { + my $self = shift; + return $self->option('translation') || '1frame'; +} + +sub arrow_height { + my $self = shift; + $self->option('arrow_height') || 1; +} + +sub show_stop_codons { + my $self = shift; + my $show = $self->option('stop_codons'); + return $show if defined $show; + return 1; +} + +sub show_start_codons { + my $self = shift; + my $show = $self->option('start_codons'); + return $show if defined $show; + return 0; +} + +sub strand { + my $self = shift; + return $self->option('strand') || '+1'; +} + +sub draw_component { + my $self = shift; + my $gd = shift; + my ($x1,$y1,$x2,$y2) = $self->bounds(@_); + + my $type = $self->translation_type; + my $strand = $self->strand; + + my @strands = $type eq '6frame' ? (1,-1) + : $strand > 0 ? (1) + : -1; + my @phase = (0,2,1); # looks weird, but gives correct effect + for my $s (@strands) { + for (my $i=0; $i < @phase; $i++) { + $self->draw_frame($self->feature,$s,$i,$phase[$i],$gd,$x1,$y1,$x2,$y2); + } + } + +} + +sub draw_frame { + my $self = shift; + my ($feature,$strand,$base_offset,$phase,$gd,$x1,$y1,$x2,$y2) = @_; + return unless $feature->seq; # no sequence, arggh. + my ($seq,$pos) = $strand < 0 ? ($feature->revcom,$feature->end) + : ($feature,$feature->start); + my ($frame,$offset) = frame_and_offset($pos,$strand,$phase); + ($strand >= 0 ? $x1 : $x2) += $self->pixels_per_base * $offset; + my $lh; + if ($self->translation_type eq '6frame') { + $lh = $self->height / 6; + $y1 += $lh * $frame; + $y1 += $self->height/2 if $strand < 0; + } else { + $lh = $self->height / 3; + $y1 += $lh * $frame; + } + + $y2 = $y1; + + my $protein = $seq->translate(undef,undef,$base_offset)->seq; + my $k = $strand>=0 ? 'f' : 'r'; + my $color = $self->color("frame$frame$k") || + $self->color("frame$frame") || + $self->default_color("frame$frame$k") || $self->fgcolor; + if ($self->protein_fits) { + $self->draw_protein(\$protein,$strand,$color,$gd,$x1,$y1,$x2,$y2); + } else { + $self->draw_orfs(\$protein,$strand,$color,$gd,$x1,$y1,$x2,$y2); + } +} + +sub draw_protein { + my $self = shift; + my ($protein,$strand,$color,$gd,$x1,$y1,$x2,$y2) = @_; + my $pixels_per_base = $self->pixels_per_base; + my $font = $self->font; + + my @residues = split '',$$protein; + for (my $i=0;$i<@residues;$i++) { + my $x = $strand > 0 + ? $x1 + 3 * $i * $pixels_per_base + : $x2 - 3 * $i * $pixels_per_base; + next if $x+1 < $x1; + last if $x > $x2; + $gd->char($font,$x,$y1,$residues[$i],$color); + } +} + +sub draw_orfs { + my $self = shift; + my ($protein,$strand,$color,$gd,$x1,$y1,$x2,$y2) = @_; + my $pixels_per_base = $self->pixels_per_base * 3; + $y1++; + + my $gcolor = $self->gridcolor; + $gd->line($x1,$y1,$x2,$y1,$gcolor); + + if ($self->show_stop_codons) { + my $stops = $self->find_codons($protein,'*'); + + for my $stop (@$stops) { + my $pos = $strand > 0 + ? $x1 + $stop * $pixels_per_base + : $x2 - $stop * $pixels_per_base; + next if $pos+1 < $x1; + last if $pos > $x2; + $gd->line($pos,$y1-2,$pos,$y1+2,$color); + } + } + + my $arrowhead_height = $self->arrow_height; + + if ($self->show_start_codons) { + my $starts = $self->find_codons($protein,'M'); + + for my $start (@$starts) { + my $pos = $strand > 0 + ? $x1 + $start * $pixels_per_base + : $x2 - $start * $pixels_per_base; + next if $pos+1 < $x1; + last if $pos > $x2; + + # little arrowheads at the start codons + $strand > 0 ? $self->arrowhead($gd,$pos-$arrowhead_height,$y1, + $arrowhead_height,+1) + : $self->arrowhead($gd,$pos+$arrowhead_height,$y1, + $arrowhead_height,-1) + } + } + + $strand > 0 ? $self->arrowhead($gd,$x2-1,$y1,3,+1) + : $self->arrowhead($gd,$x1,$y1,3,-1) +} + +sub find_codons { + my $self = shift; + my $protein = shift; + my $codon = shift || '*'; + my $pos = -1; + my @stops; + while ( ($pos = index($$protein,$codon,$pos+1)) >= 0) { + push @stops,$pos; + } + \@stops; +} + +sub make_key_feature { + my $self = shift; + my @gatc = qw(g a t c); + my $offset = $self->panel->offset; + my $scale = 1/$self->scale; # base pairs/pixel + my $start = $offset; + my $stop = $offset + 100 * $scale; + my $seq = join('',map{$gatc[rand 4]} (1..500)); + my $feature = + Bio::Graphics::Feature->new(-start=> $start, + -end => $stop, + -seq => $seq, + -name => $self->option('key') + ); + $feature; +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::translation - The "6-frame translation" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws the conceptual translation of DNA sequences. At high +magnifications, it simply draws lines indicating open reading frames. +At low magnifications, it draws a conceptual protein translation. +Options can be used to set 1-frame, 3-frame or 6-frame translations. + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -connector Connector type 0 (false) + + -connector_color + Connector color black + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -translation Type of translation to 1frame + perform. One of "1frame", + "3frame", or "6frame" + + -strand Forward (+1) or reverse (-1) +1 + translation. + + -frame0 Color for the first frame fgcolor + + -frame1 Color for the second frame fgcolor + + -frame2 Color for the third frame fgcolor + + -gridcolor Color for the horizontal lightgrey + lines of the reading frames + + -start_codons Draw little arrowheads 0 (false) + indicating start codons + + -stop_codons Draw little vertical ticks 1 (true) + indicating stop codons + + -arrow_height Height of the start codon 1 + arrowheads + +=head1 SUGGESTED STANZA FOR GENOME BROWSER + +This produces a nice gbrowse display in which the DNA/GC Content glyph +is sandwiched between the forward and reverse three-frame +translations. The frames are color-coordinated with the example +configuration for the "cds" glyph. + + [TranslationF] + glyph = translation + global feature = 1 + frame0 = cadetblue + frame1 = blue + frame2 = darkblue + height = 20 + fgcolor = purple + strand = +1 + translation = 3frame + key = 3-frame translation (forward) + + [DNA/GC Content] + glyph = dna + global feature = 1 + height = 40 + do_gc = 1 + fgcolor = red + axis_color = blue + + [TranslationR] + glyph = translation + global feature = 1 + frame0 = darkred + frame1 = red + frame2 = crimson + height = 20 + fgcolor = blue + strand = -1 + translation = 3frame + key = 3-frame translation (reverse) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/triangle.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/triangle.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +package Bio::Graphics::Glyph::triangle; +# DAS-compatible package to use for drawing a triangle + +use strict; +use vars '@ISA'; +@ISA = 'Bio::Graphics::Glyph::generic'; +use Bio::Graphics::Glyph::generic; + +sub pad_left { + my $self = shift; + my $left = $self->SUPER::pad_left; + return $left unless $self->option('point'); + my $extra = $self->option('height')/3; + return $extra > $left ? $extra : $left; +} + +sub pad_right { + my $self = shift; + my $right = $self->SUPER::pad_right; + return $right unless $self->option('point'); + my $extra = $self->option('height')/3; + return $extra > $right ? $extra : $right; +} + +sub draw_component { + my $self = shift; + my $gd = shift; + my $fg = $self->fgcolor; + my $orient = $self->option('orient') || 'S'; + + # find the center and vertices + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries(@_); + my $xmid = ($x1+$x2)/2; + my $ymid = ($y1+$y2)/2; + + my ($vx1,$vy1,$vx2,$vy2,$vx3,$vy3); + + #make an equilateral + my ($p,$q) = ($self->option('height'),($x2-$x1)/2); + if ($self->option('point')){ + $q = $p/sqrt(3); #2; + $x1 = $xmid - $q; $x2 = $xmid + $q; + $y1 = $ymid - $q; $y2 = $ymid + $q; + } + + if ($orient eq 'S'){$vx1=$x1;$vy1=$y1;$vx2=$x2;$vy2=$y1;$vx3=$xmid;$vy3=$y2;} + elsif($orient eq 'N'){$vx1=$x1;$vy1=$y2;$vx2=$x2;$vy2=$y2;$vx3=$xmid;$vy3=$y1;} + elsif($orient eq 'W'){$vx1=$x2;$vy1=$y1;$vx2=$x2;$vy2=$y2;$vx3=$x2-$p;$vy3=$ymid;} + elsif($orient eq 'E'){$vx1=$x1;$vy1=$y1;$vx2=$x1;$vy2=$y2;$vx3=$x1+$p;$vy3=$ymid;} + + # now draw the triangle + $gd->line($vx1,$vy1,$vx2,$vy2,$fg); + $gd->line($vx2,$vy2,$vx3,$vy3,$fg); + $gd->line($vx3,$vy3,$vx1,$vy1,$fg); + + if (my $c = $self->bgcolor){ + $gd->fillToBorder($xmid,$ymid,$fg,$c) if $orient eq 'S' || $orient eq 'N'; + $gd->fillToBorder($x1+1,$ymid,$fg,$c) if $orient eq 'E'; + $gd->fillToBorder($x2-1,$ymid,$fg,$c) if $orient eq 'W'; + } +} + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::triangle - The "triangle" glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph draws an equilateral triangle when -point is defined. +It draws an isoceles triangle otherwise. It is possible to draw +the triangle with the base on the N, S, E, or W side. + +=head2 OPTIONS + +In addition to the common options, the following glyph-specific +options are recognized: + + Option Description Default + ------ ----------- ------- + + -point If true, the triangle 0 + will drawn at the center + of the range, and not scaled + to the feature width. + + -orient On which side shall the S + base be? (NSEW) + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Allen Day Eday@cshl.orgE. + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Glyph/xyplot.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Glyph/xyplot.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,453 @@ +package Bio::Graphics::Glyph::xyplot; + +use strict; +use Bio::Graphics::Glyph::segments; +use vars '@ISA'; +use GD 'gdTinyFont'; + +@ISA = 'Bio::Graphics::Glyph::segments'; + +use constant DEFAULT_POINT_RADIUS=>1; + +my %SYMBOLS = ( + triangle => \&draw_triangle, + square => \&draw_square, + disc => \&draw_disc, + point => \&draw_point, + ); + +sub point_radius { + shift->option('point_radius') || DEFAULT_POINT_RADIUS; +} + +sub pad_top { 0 } + +sub draw { + my $self = shift; + my ($gd,$dx,$dy) = @_; + my ($left,$top,$right,$bottom) = $self->calculate_boundaries($dx,$dy); + + my @parts = $self->parts; + return $self->SUPER::draw(@_) unless @parts > 0; + + # figure out the scale and such like + my $max_score = $self->option('max_score'); + my $min_score = $self->option('min_score'); + + unless (defined $max_score && defined $min_score) { + my $first = $parts[0]; + $max_score = $min_score = eval { $first->feature->score} || 0; + for my $part (@parts) { + my $s = eval { $part->feature->score }; + next unless defined $s; + $max_score = $s if $s > $max_score; + $min_score = $s if $s < $min_score; + } + } + + # if a scale is called for, then we adjust the max and min to be even + # multiples of a power of 10. + if ($self->option('scale')) { + $max_score = max10($max_score); + $min_score = min10($min_score); + } + + my $height = $self->option('height'); + my $scale = $max_score > $min_score ? $height/($max_score-$min_score) + : 1; + my $x = $dx; + my $y = $dy + $self->top + $self->pad_top; + + # now seed all the parts with the information they need to draw their positions + foreach (@parts) { + my $s = eval {$_->feature->score}; + next unless defined $s; + my $position = ($s-$min_score) * $scale; + $_->{_y_position} = $bottom - $position; + } + + my $type = $self->option('graph_type'); + $self->_draw_histogram($gd,$x,$y) if $type eq 'histogram'; + $self->_draw_boxes($gd,$x,$y) if $type eq 'boxes'; + $self->_draw_line ($gd,$x,$y) if $type eq 'line' + or $type eq 'linepoints'; + $self->_draw_points($gd,$x,$y) if $type eq 'points' + or $type eq 'linepoints'; + + $self->_draw_scale($gd,$scale,$min_score,$max_score,$dx,$dy) if $self->option('scale'); +} + +sub log10 { log(shift)/log(10) } +sub max10 { + my $a = shift; + $a = 1 if $a <= 0; + my $l=int(log10($a)); + $l = 10**$l; + my $r = $a/$l; + return $r*$l if int($r) == $r; + return $l*int(($a+$l)/$l); +} +sub min10 { + my $a = shift; + $a = 1 if $a <= 0; + my $l=int(log10($a)); + $l = 10**$l; + my $r = $a/$l; + return $r*$l if int($r) == $r; + return $l*int($a/$l); +} + +sub _draw_histogram { + my $self = shift; + my ($gd,$left,$top) = @_; + + my @parts = $self->parts; + my $fgcolor = $self->fgcolor; + + # draw each of the component lines of the histogram surface + for (my $i = 0; $i < @parts; $i++) { + my $part = $parts[$i]; + my $next = $parts[$i+1]; + my ($x1,$y1,$x2,$y2) = $part->calculate_boundaries($left,$top); + $gd->line($x1,$part->{_y_position},$x2,$part->{_y_position},$fgcolor); + next unless $next; + my ($x3,$y3,$x4,$y4) = $next->calculate_boundaries($left,$top); + if ($x2 == $x3) {# connect vertically to next level + $gd->line($x2,$part->{_y_position},$x2,$next->{_y_position},$fgcolor); + } else { + $gd->line($x2,$part->{_y_position},$x2,$y2,$fgcolor); # to bottom + $gd->line($x2,$y2,$x3,$y2,$fgcolor); # to right + $gd->line($x3,$y4,$x3,$next->{_y_position},$fgcolor); # up + } + } + + # end points: from bottom to first + my ($x1,$y1,$x2,$y2) = $parts[0]->calculate_boundaries($left,$top); + $gd->line($x1,$y2,$x1,$parts[0]->{_y_position},$fgcolor); + # from last to bottom + my ($x3,$y3,$x4,$y4) = $parts[-1]->calculate_boundaries($left,$top); + $gd->line($x4,$parts[-1]->{_y_position},$x4,$y4,$fgcolor); + + # from left to right -- don't like this + # $gd->line($x1,$y2,$x4,$y4,$fgcolor); + + # That's it. Not too hard. +} + +sub _draw_boxes { + my $self = shift; + my ($gd,$left,$top) = @_; + + my @parts = $self->parts; + my $fgcolor = $self->fgcolor; + my $bgcolor = $self->bgcolor; + my $height = $self->height; + + # draw each of the component lines of the histogram surface + for (my $i = 0; $i < @parts; $i++) { + my $part = $parts[$i]; + my $next = $parts[$i+1]; + my ($x1,$y1,$x2,$y2) = $part->calculate_boundaries($left,$top); + $self->filled_box($gd,$x1,$part->{_y_position},$x2,$y2,$bgcolor,$fgcolor); + next unless $next; + my ($x3,$y3,$x4,$y4) = $next->calculate_boundaries($left,$top); + $gd->line($x2,$y2,$x3,$y4,$fgcolor) if $x2 < $x3; + } + + # That's it. +} + +sub _draw_line { + my $self = shift; + my ($gd,$left,$top) = @_; + + my @parts = $self->parts; + my $fgcolor = $self->fgcolor; + my $bgcolor = $self->bgcolor; + + # connect to center positions of each interval + my $first_part = shift @parts; + my ($x1,$y1,$x2,$y2) = $first_part->calculate_boundaries($left,$top); + my $current_x = ($x1+$x2)/2; + my $current_y = $first_part->{_y_position}; + + for my $part (@parts) { + my ($x1,$y1,$x2,$y2) = $part->calculate_boundaries($left,$top); + my $next_x = ($x1+$x2)/2; + my $next_y = $part->{_y_position}; + $gd->line($current_x,$current_y,$next_x,$next_y,$fgcolor); + ($current_x,$current_y) = ($next_x,$next_y); + } + +} + +sub _draw_points { + my $self = shift; + my ($gd,$left,$top) = @_; + my $symbol_name = $self->option('point_symbol') || 'point'; + my $symbol_ref = $SYMBOLS{$symbol_name}; + + my @parts = $self->parts; + my $bgcolor = $self->bgcolor; + my $pr = $self->point_radius; + + for my $part (@parts) { + my ($x1,$y1,$x2,$y2) = $part->calculate_boundaries($left,$top); + my $x = ($x1+$x2)/2; + my $y = $part->{_y_position}; + $symbol_ref->($gd,$x,$y,$pr,$bgcolor); + } +} + +sub _draw_scale { + my $self = shift; + my ($gd,$scale,$min,$max,$dx,$dy) = @_; + my ($x1,$y1,$x2,$y2) = $self->calculate_boundaries($dx,$dy); + + my $side = $self->option('scale'); + return if $side eq 'none'; + $side ||= 'both'; + + my $fg = $self->fgcolor; + my $half = ($y1+$y2)/2; + + $gd->line($x1+1,$y1,$x1+1,$y2,$fg) if $side eq 'left' || $side eq 'both'; + $gd->line($x2-2,$y1,$x2-2,$y2,$fg) if $side eq 'right' || $side eq 'both'; + + for ([$y1,$max],[$half,int(($max-$min)/2+0.5)]) { + $gd->line($x1,$_->[0],$x1+3,$_->[0],$fg) if $side eq 'left' || $side eq 'both'; + $gd->line($x2-4,$_->[0],$x2,$_->[0],$fg) if $side eq 'right' || $side eq 'both'; + if ($side eq 'left' or $side eq 'both') { + $gd->string(gdTinyFont, + $x1 + 5,$_->[0]-(gdTinyFont->height/3), + $_->[1], + $fg); + } + if ($side eq 'right' or $side eq 'both') { + $gd->string(gdTinyFont, + $x2-5 - (length($_->[1])*gdTinyFont->width),$_->[0]-(gdTinyFont->height/3), + $_->[1], + $fg); + } + } +} + +# we are unbumpable! +sub bump { + return 0; +} + +sub connector { + my $self = shift; + my $type = $self->option('graph_type'); + return 1 if $type eq 'line' or $type eq 'linepoints'; +} + +sub height { + my $self = shift; + return $self->option('graph_height') || $self->SUPER::height; +} + +sub draw_triangle { + my ($gd,$x,$y,$pr,$color) = @_; + my ($vx1,$vy1) = ($x-$pr,$y+$pr); + my ($vx2,$vy2) = ($x, $y-$pr); + my ($vx3,$vy3) = ($x+$pr,$y+$pr); + $gd->line($vx1,$vy1,$vx2,$vy2,$color); + $gd->line($vx2,$vy2,$vx3,$vy3,$color); + $gd->line($vx3,$vy3,$vx1,$vy1,$color); +} +sub draw_square { + my ($gd,$x,$y,$pr,$color) = @_; + $gd->line($x-$pr,$y-$pr,$x+$pr,$y-$pr,$color); + $gd->line($x+$pr,$y-$pr,$x+$pr,$y+$pr,$color); + $gd->line($x+$pr,$y+$pr,$x-$pr,$y+$pr,$color); + $gd->line($x-$pr,$y+$pr,$x-$pr,$y-$pr,$color); +} +sub draw_disc { + my ($gd,$x,$y,$pr,$color) = @_; + $gd->arc($x,$y,$pr,$pr,0,360,$color); +} +sub draw_point { + my ($gd,$x,$y,$pr,$color) = @_; + $gd->setPixel($x,$y,$color); +} + +sub _subseq { + my $class = shift; + my $feature = shift; + return $feature->segments if $feature->can('segments'); + my @split = eval { my $id = $feature->location->seq_id; + my @subs = $feature->location->sub_Location; + grep {$id eq $_->seq_id} @subs}; + return @split if @split; + return $feature->sub_SeqFeature if $feature->can('sub_SeqFeature'); + return; +} + +sub keyglyph { + my $self = shift; + + my $scale = 1/$self->scale; # base pairs/pixel + + my $feature = + Bio::Graphics::Feature->new( + -segments=>[ [ 0*$scale,9*$scale], + [ 10*$scale,19*$scale], + [ 20*$scale, 29*$scale] + ], + -name => 'foo bar', + -strand => '+1'); + ($feature->segments)[0]->score(10); + ($feature->segments)[1]->score(50); + ($feature->segments)[2]->score(25); + my $factory = $self->factory->clone; + $factory->set_option(label => 1); + $factory->set_option(bump => 0); + $factory->set_option(connector => 'solid'); + my $glyph = $factory->make_glyph(0,$feature); + return $glyph; +} + + +1; + +__END__ + +=head1 NAME + +Bio::Graphics::Glyph::xyplot - The xyplot glyph + +=head1 SYNOPSIS + + See L and L. + +=head1 DESCRIPTION + +This glyph is used for drawing features that have a position on the +genome and a numeric value. It can be used to represent gene +prediction scores, motif-calling scores, percent similarity, +microarray intensities, or other features that require a line plot. + +The X axis represents the position on the genome, as per all other +glyphs. The Y axis represents the score. Options allow you to set +the height of the glyph, the maximum and minimum scores, the color of +the line and axis, and the symbol to draw. + +The plot is designed to work on a single feature group that contains +subfeatures. It is the subfeatures that carry the score +information. The best way to arrange for this is to create an +aggregator for the feature. We'll take as an example a histogram of +repeat density in which interval are spaced every megabase and the +score indicates the number of repeats in the interval; we'll assume +that the database has been loaded in in such a way that each interval +is a distinct feature with the method name "density" and the source +name "repeat". Furthermore, all the repeat features are grouped +together into a single group (the name of the group is irrelevant). +If you are using Bio::DB::GFF and Bio::Graphics directly, the sequence +of events would look like this: + + my $agg = Bio::DB::GFF::Aggregator->new(-method => 'repeat_density', + -sub_parts => 'density:repeat'); + my $db = Bio::DB::GFF->new(-dsn=>'my_database', + -aggregators => $agg); + my $segment = $db->segment('Chr1'); + my @features = $segment->features('repeat_density'); + + my $panel = Bio::Graphics::Panel->new; + $panel->add_track(\@features, + -glyph => 'xyplot'); + +If you are using Generic Genome Browser, you will add this to the +configuration file: + + aggregators = repeat_density{density:repeat} + clone alignment etc + +=head2 OPTIONS + +The following options are standard among all Glyphs. See +L for a full explanation. + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -outlinecolor Synonym for -fgcolor + + -bgcolor Background color turquoise + + -fillcolor Synonym for -bgcolor + + -linewidth Line width 1 + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -label Whether to draw a label 0 (false) + + -description Whether to draw a description 0 (false) + +In addition, the alignment glyph recognizes the following +glyph-specific options: + + Option Description Default + ------ ----------- ------- + + -max_score Maximum value of the Calculated + feature's "score" attribute + + -min_score Minimum value of the Calculated + feature's "score" attribute + + -graph_type Type of graph to generate. Histogram + Options are: "histogram", + "boxes", "line", "points", + or "linepoints". + + -point_symbol Symbol to use. Options are none + "triangle", "square", "disc", + "point", and "none". + + -point_radius Radius of the symbol, in 1 + pixels + + -scale Position where the Y axis none + scale is drawn if any. + It should be one of + "left", "right" or "none" + + -graph_height Specify height of the graph Same as the + "height" option. + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Panel.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Panel.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1831 @@ +package Bio::Graphics::Panel; + +use strict; +use Bio::Graphics::Glyph::Factory; +use Bio::Graphics::Feature; +use GD;; + + +use constant KEYLABELFONT => gdMediumBoldFont; +use constant KEYSPACING => 5; # extra space between key columns +use constant KEYPADTOP => 5; # extra padding before the key starts +use constant KEYCOLOR => 'wheat'; +use constant KEYSTYLE => 'bottom'; +use constant KEYALIGN => 'left'; +use constant GRIDCOLOR => 'lightcyan'; +use constant MISSING_TRACK_COLOR =>'gray'; + +my %COLORS; # translation table for symbolic color names to RGB triple + +# Create a new panel of a given width and height, and add lists of features +# one by one +sub new { + my $class = shift; + my %options = @_; + + $class->read_colors() unless %COLORS; + + my $length = $options{-length} || 0; + my $offset = $options{-offset} || 0; + my $spacing = $options{-spacing} || 5; + my $bgcolor = $options{-bgcolor} || 0; + my $keyfont = $options{-key_font} || KEYLABELFONT; + my $keycolor = $options{-key_color} || KEYCOLOR; + my $keyspacing = $options{-key_spacing} || KEYSPACING; + my $keystyle = $options{-key_style} || KEYSTYLE; + my $keyalign = $options{-key_align} || KEYALIGN; + my $allcallbacks = $options{-all_callbacks} || 0; + my $gridcolor = $options{-gridcolor} || GRIDCOLOR; + my $grid = $options{-grid} || 0; + my $flip = $options{-flip} || 0; + my $empty_track_style = $options{-empty_tracks} || 'key'; + my $truecolor = $options{-truecolor} || 0; + + if (my $seg = $options{-segment}) { + $offset = eval {$seg->start-1} || 0; + $length = $seg->length; + } + + $offset ||= $options{-start}-1 if defined $options{-start}; + $length ||= $options{-stop}-$options{-start}+1 + if defined $options{-start} && defined $options{-stop}; + + return bless { + tracks => [], + width => $options{-width} || 600, + pad_top => $options{-pad_top}||0, + pad_bottom => $options{-pad_bottom}||0, + pad_left => $options{-pad_left}||0, + pad_right => $options{-pad_right}||0, + length => $length, + offset => $offset, + gridcolor => $gridcolor, + grid => $grid, + bgcolor => $bgcolor, + height => 0, # AUTO + spacing => $spacing, + key_font => $keyfont, + key_color => $keycolor, + key_spacing => $keyspacing, + key_style => $keystyle, + key_align => $keyalign, + all_callbacks => $allcallbacks, + truecolor => $truecolor, + flip => $flip, + empty_track_style => $empty_track_style, + },$class; +} + +sub pad_left { + my $self = shift; + my $g = $self->{pad_left}; + $self->{pad_left} = shift if @_; + $g; +} +sub pad_right { + my $self = shift; + my $g = $self->{pad_right}; + $self->{pad_right} = shift if @_; + $g; +} +sub pad_top { + my $self = shift; + my $g = $self->{pad_top}; + $self->{pad_top} = shift if @_; + $g; +} +sub pad_bottom { + my $self = shift; + my $g = $self->{pad_bottom}; + $self->{pad_bottom} = shift if @_; + $g; +} + +sub flip { + my $self = shift; + my $g = $self->{flip}; + $self->{flip} = shift if @_; + $g; +} + +# values of empty_track_style are: +# "suppress" -- suppress empty tracks entirely (default) +# "key" -- show just the key in "between" mode +# "line" -- draw a thin grey line +# "dashed" -- draw a dashed line +sub empty_track_style { + my $self = shift; + my $g = $self->{empty_track_style}; + $self->{empty_track_style} = shift if @_; + $g; +} + +sub key_style { + my $self = shift; + my $g = $self->{key_style}; + $self->{key_style} = shift if @_; + $g; +} + +# public routine for mapping from a base pair +# location to pixel coordinates +sub location2pixel { + my $self = shift; + my $end = $self->end + 1; + my @coords = $self->{flip} ? map { $end-$_ } @_ : @_; + $self->map_pt(@coords); +} + +# numerous direct calls into array used here for performance considerations +sub map_pt { + my $self = shift; + my $offset = $self->{offset}; + my $scale = $self->{scale} || $self->scale; + my $pl = $self->{pad_left}; + my $pr = $self->{width} - $self->{pad_right}; + my $flip = $self->{flip}; + my $length = $self->{length}; + my @result; + foreach (@_) { + my $val = $flip ? int (0.5 + $pr - ($length - ($_- 1)) * $scale) : int (0.5 + $pl + ($_-$offset-1) * $scale); + $val = $pl-1 if $val < $pl; + $val = $pr+1 if $val > $pr; + push @result,$val; + } + @result; +} + +sub map_no_trunc { + my $self = shift; + my $offset = $self->{offset}; + my $scale = $self->scale; + my $pl = $self->{pad_left}; + my $pr = $self->{width} - $self->{pad_right}; + my $flip = $self->{flip}; + my $length = $self->{length}; + my $end = $offset+$length; + my @result; + foreach (@_) { + my $val = $flip ? int (0.5 + $pl + ($end - ($_- 1)) * $scale) : int (0.5 + $pl + ($_-$offset-1) * $scale); + push @result,$val; + } + @result; +} + +sub scale { + my $self = shift; + $self->{scale} ||= ($self->{width}-$self->pad_left-$self->pad_right)/($self->length); +} + +sub start { shift->{offset}+1} +sub end { $_[0]->start + $_[0]->{length}-1} + +sub offset { shift->{offset} } +sub width { + my $self = shift; + my $d = $self->{width}; + $self->{width} = shift if @_; + $d; +# $d + $self->pad_left + $self->pad_right; +} + +sub left { + my $self = shift; + $self->pad_left; +} +sub right { + my $self = shift; + $self->width - $self->pad_right; +} + +sub spacing { + my $self = shift; + my $d = $self->{spacing}; + $self->{spacing} = shift if @_; + $d; +} + +sub key_spacing { + my $self = shift; + my $d = $self->{key_spacing}; + $self->{key_spacing} = shift if @_; + $d; +} + +sub length { + my $self = shift; + my $d = $self->{length}; + if (@_) { + my $l = shift; + $l = $l->length if ref($l) && $l->can('length'); + $self->{length} = $l; + } + $d; +} + +sub gridcolor {shift->{gridcolor}} + +sub all_callbacks { shift->{all_callbacks} } + +sub add_track { + my $self = shift; + $self->_do_add_track(scalar(@{$self->{tracks}}),@_); +} + +sub unshift_track { + my $self = shift; + $self->_do_add_track(0,@_); +} + +sub insert_track { + my $self = shift; + my $position = shift; + $self->_do_add_track($position,@_); +} + + +# create a feature and factory pair +# see Factory.pm for the format of the options +# The thing returned is actually a generic Glyph +sub _do_add_track { + my $self = shift; + my $position = shift; + + # due to indecision, we accept features + # and/or glyph types in the first two arguments + my ($features,$glyph_name) = ([],undef); + while ( @_ && $_[0] !~ /^-/) { + my $arg = shift; + $features = $arg and next if ref($arg); + $glyph_name = $arg and next unless ref($arg); + } + + my %args = @_; + my ($map,$ss,%options); + + foreach (keys %args) { + (my $canonical = lc $_) =~ s/^-//; + if ($canonical eq 'glyph') { + $map = $args{$_}; + delete $args{$_}; + } elsif ($canonical eq 'stylesheet') { + $ss = $args{$_}; + delete $args{$_}; + } else { + $options{$canonical} = $args{$_}; + } + } + + $glyph_name = $map if defined $map; + $glyph_name ||= 'generic'; + + local $^W = 0; # uninitialized variable warnings under 5.00503 + + my $panel_map = + ref($map) eq 'CODE' ? sub { + my $feature = shift; + return 'track' if eval { defined $feature->primary_tag && $feature->primary_tag eq 'track' }; + return 'group' if eval { defined $feature->primary_tag && $feature->primary_tag eq 'group' }; + return $map->($feature); + } + : ref($map) eq 'HASH' ? sub { + my $feature = shift; + return 'track' if eval { defined $feature->primary_tag && $feature->primary_tag eq 'track' }; + return 'group' if eval { defined $feature->primary_tag && $feature->primary_tag eq 'group' }; + return eval {$map->{$feature->primary_tag}} || 'generic'; + } + : sub { + my $feature = shift; + return 'track' if eval { defined $feature->primary_tag && $feature->primary_tag eq 'track' }; + return 'group' if eval { defined $feature->primary_tag && $feature->primary_tag eq 'group' }; + return $glyph_name; + }; + + $self->_add_track($position,$features,-map=>$panel_map,-stylesheet=>$ss,-options=>\%options); +} + +sub _add_track { + my $self = shift; + my ($position,$features,@options) = @_; + + # build the list of features into a Bio::Graphics::Feature object + $features = [$features] unless ref $features eq 'ARRAY'; + + # optional middle-level glyph is the group + foreach my $f (grep {ref $_ eq 'ARRAY'} @$features) { + next unless ref $f eq 'ARRAY'; + $f = Bio::Graphics::Feature->new( + -segments=>$f, + -type => 'group' + ); + } + + # top-level glyph is the track + my $feature = Bio::Graphics::Feature->new( + -segments=>$features, + -start => $self->offset+1, + -stop => $self->offset+$self->length, + -type => 'track' + ); + + my $factory = Bio::Graphics::Glyph::Factory->new($self,@options); + my $track = $factory->make_glyph(-1,$feature); + + splice(@{$self->{tracks}},$position,0,$track); + return $track; +} + +sub height { + my $self = shift; + my $spacing = $self->spacing; + my $key_height = $self->format_key; + my $empty_track_style = $self->empty_track_style; + my $key_style = $self->key_style; + my $bottom_key = $key_style eq 'bottom'; + my $between_key = $key_style eq 'between'; + my $draw_empty = $empty_track_style =~ /^(line|dashed)$/; + my $keyheight = $self->{key_font}->height; + my $height = 0; + for my $track (@{$self->{tracks}}) { + my $draw_between = $between_key && $track->option('key'); + my $has_parts = $track->parts; + next if !$has_parts && ($empty_track_style eq 'suppress' + or $empty_track_style eq 'key' && $bottom_key); + $height += $keyheight if $draw_between; + $height += $self->spacing; + $height += $track->layout_height; + } + + # get rid of spacing under last track + $height -= $self->spacing unless $bottom_key; + return $height + $key_height + $self->pad_top + $self->pad_bottom; +} + +sub gd { + my $self = shift; + my $existing_gd = shift; + + local $^W = 0; # can't track down the uninitialized variable warning + + return $self->{gd} if $self->{gd}; + + my $width = $self->width; + my $height = $self->height; + + my $gd = $existing_gd || GD::Image->new($width,$height, + ($self->{truecolor} && GD::Image->can('isTrueColor') ? 1 : ()) + ); + + my %translation_table; + for my $name ('white','black',keys %COLORS) { + my $idx = $gd->colorAllocate(@{$COLORS{$name}}); + $translation_table{$name} = $idx; + } + + $self->{translations} = \%translation_table; + $self->{gd} = $gd; + if ($self->bgcolor) { + $gd->fill(0,0,$self->bgcolor); + } elsif (eval {$gd->isTrueColor}) { + $gd->fill(0,0,$translation_table{'white'}); + } + + my $pl = $self->pad_left; + my $pt = $self->pad_top; + my $offset = $pt; + my $keyheight = $self->{key_font}->height; + my $bottom_key = $self->{key_style} eq 'bottom'; + my $between_key = $self->{key_style} eq 'between'; + my $left_key = $self->{key_style} eq 'left'; + my $right_key = $self->{key_style} eq 'right'; + my $empty_track_style = $self->empty_track_style; + my $spacing = $self->spacing; + + # we draw in two steps, once for background of tracks, and once for + # the contents. This allows the grid to sit on top of the track background. + for my $track (@{$self->{tracks}}) { + my $draw_between = $between_key && $track->option('key'); + next if !$track->parts && ($empty_track_style eq 'suppress' + or $empty_track_style eq 'key' && $bottom_key); + $gd->filledRectangle($pl, + $offset, + $width-$self->pad_right, + $offset+$track->layout_height + + ($between_key ? $self->{key_font}->height : 0), + $track->tkcolor) + if defined $track->tkcolor; + $offset += $keyheight if $draw_between; + $offset += $track->layout_height + $spacing; + } + + $self->draw_grid($gd) if $self->{grid}; + + $offset = $pt; + for my $track (@{$self->{tracks}}) { + my $draw_between = $between_key && $track->option('key'); + my $has_parts = $track->parts; + next if !$has_parts && ($empty_track_style eq 'suppress' + or $empty_track_style eq 'key' && $bottom_key); + + if ($draw_between) { + $offset += $self->draw_between_key($gd,$track,$offset); + } + + elsif ($self->{key_style} =~ /^(left|right)$/) { + $self->draw_side_key($gd,$track,$offset,$self->{key_style}); + } + + $self->draw_empty($gd,$offset,$empty_track_style) + if !$has_parts && $empty_track_style=~/^(line|dashed)$/; + + $track->draw($gd,0,$offset,0,1); + $self->track_position($track,$offset); + $offset += $track->layout_height + $spacing; + } + + + $self->draw_bottom_key($gd,$pl,$offset) if $self->{key_style} eq 'bottom'; + + return $self->{gd} = $gd; +} + +sub boxes { + my $self = shift; + my @boxes; + my $offset = 0; + + my $pl = $self->pad_left; + my $pt = $self->pad_top; + my $between_key = $self->{key_style} eq 'between'; + my $bottom_key = $self->{key_style} eq 'bottom'; + my $empty_track_style = $self->empty_track_style; + my $keyheight = $self->{key_font}->height; + my $spacing = $self->spacing; + + for my $track (@{$self->{tracks}}) { + my $draw_between = $between_key && $track->option('key'); + next if !$track->parts && ($empty_track_style eq 'suppress' + or $empty_track_style eq 'key' && $bottom_key); + $offset += $keyheight if $draw_between; + my $boxes = $track->boxes(0,$offset+$pt); + $self->track_position($track,$offset); + push @boxes,@$boxes; + $offset += $track->layout_height + $self->spacing; + } + return wantarray ? @boxes : \@boxes; +} + +sub track_position { + my $self = shift; + my $track = shift; + my $d = $self->{_track_position}{$track}; + $self->{_track_position}{$track} = shift if @_; + $d; +} + +# draw the keys -- between +sub draw_between_key { + my $self = shift; + my ($gd,$track,$offset) = @_; + my $key = $track->option('key') or return 0; + my $x = $self->{key_align} eq 'center' ? $self->width - (CORE::length($key) * $self->{key_font}->width)/2 + : $self->{key_align} eq 'right' ? $self->width - CORE::length($key) + : $self->pad_left; + $gd->string($self->{key_font},$x,$offset,$key,1); + return $self->{key_font}->height; +} + +# draw the keys -- left or right side +sub draw_side_key { + my $self = shift; + my ($gd,$track,$offset,$side) = @_; + my $key = $track->option('key') or return; + my $pos = $side eq 'left' ? $self->pad_left - $self->{key_font}->width * CORE::length($key)-3 + : $self->width - $self->pad_right+3; + $gd->string($self->{key_font},$pos,$offset,$key,1); +} + +# draw the keys -- bottom +sub draw_bottom_key { + my $self = shift; + my ($gd,$left,$top) = @_; + my $key_glyphs = $self->{key_glyphs} or return; + + my $color = $self->translate_color($self->{key_color}); + $gd->filledRectangle($left,$top,$self->width - $self->pad_right,$self->height-$self->pad_bottom,$color); + $gd->string($self->{key_font},$left,KEYPADTOP+$top,"KEY:",1); + $top += $self->{key_font}->height + KEYPADTOP; + + $_->draw($gd,$left,$top) foreach @$key_glyphs; +} + +# Format the key section, and return its height +sub format_key { + my $self = shift; + return 0 unless $self->key_style eq 'bottom'; + + return $self->{key_height} if defined $self->{key_height}; + + my $suppress = $self->{empty_track_style} eq 'suppress'; + my $between = $self->{key_style} eq 'between'; + + if ($between) { + my @key_tracks = $suppress + ? grep {$_->option('key') && $_->parts} @{$self->{tracks}} + : grep {$_->option('key')} @{$self->{tracks}}; + return $self->{key_height} = @key_tracks * $self->{key_font}->height; + } + + elsif ($self->{key_style} eq 'bottom') { + + my ($height,$width) = (0,0); + my %tracks; + my @glyphs; + + # determine how many glyphs become part of the key + # and their max size + for my $track (@{$self->{tracks}}) { + + next unless $track->option('key'); + next if $suppress && !$track->parts; + + my $glyph; + if (my @parts = $track->parts) { + $glyph = $parts[0]->keyglyph; + } else { + my $t = Bio::Graphics::Feature->new(-segments=> + [Bio::Graphics::Feature->new(-start => $self->offset, + -stop => $self->offset+$self->length)]); + my $g = $track->factory->make_glyph(0,$t); + $glyph = $g->keyglyph; + } + next unless $glyph; + + + $tracks{$track} = $glyph; + my ($h,$w) = ($glyph->layout_height, + $glyph->layout_width); + $height = $h if $h > $height; + $width = $w if $w > $width; + push @glyphs,$glyph; + + } + + $width += $self->key_spacing; + + # no key glyphs, no key + return $self->{key_height} = 0 unless @glyphs; + + # now height and width hold the largest glyph, and $glyph_count + # contains the number of glyphs. We will format them into a + # box that is roughly 3 height/4 width (golden mean) + my $rows = 0; + my $cols = 0; + my $maxwidth = $self->width - $self->pad_left - $self->pad_right; + while (++$rows) { + $cols = @glyphs / $rows; + $cols = int ($cols+1) if $cols =~ /\./; # round upward for fractions + my $total_width = $cols * $width; + my $total_height = $rows * $width; + last if $total_width < $maxwidth; + } + + # move glyphs into row-major format + my $spacing = $self->key_spacing; + my $i = 0; + for (my $c = 0; $c < $cols; $c++) { + for (my $r = 0; $r < $rows; $r++) { + my $x = $c * ($width + $spacing); + my $y = $r * ($height + $spacing); + next unless defined $glyphs[$i]; + $glyphs[$i]->move($x,$y); + $i++; + } + } + + $self->{key_glyphs} = \@glyphs; # remember our key glyphs + # remember our key height + return $self->{key_height} = + ($height+$spacing) * $rows + $self->{key_font}->height +KEYPADTOP; + } + + else { # no known key style, neither "between" nor "bottom" + return $self->{key_height} = 0; + } +} + +sub draw_empty { + my $self = shift; + my ($gd,$offset,$style) = @_; + $offset += $self->spacing/2; + my $left = $self->pad_left; + my $right = $self->width-$self->pad_right; + my $color = $self->translate_color(MISSING_TRACK_COLOR); + if ($style eq 'dashed') { + $gd->setStyle($color,$color,gdTransparent,gdTransparent); + $gd->line($left,$offset,$right,$offset,gdStyled); + } else { + $gd->line($left,$offset,$right,$offset,$color); + } + $offset; +} + +# draw a grid +sub draw_grid { + my $self = shift; + my $gd = shift; + + my $gridcolor = $self->translate_color($self->{gridcolor}); + my @positions; + if (ref $self->{grid} eq 'ARRAY') { + @positions = @{$self->{grid}}; + } else { + my ($major,$minor) = $self->ticks; + my $first_tick = $minor * int(0.5 + $self->start/$minor); + for (my $i = $first_tick; $i < $self->end; $i += $minor) { + push @positions,$i; + } + } + my $pl = $self->pad_left; + my $pt = $self->pad_top; + my $pb = $self->height - $self->pad_bottom; + local $self->{flip} = 0; + for my $tick (@positions) { + my ($pos) = $self->map_pt($tick); + $gd->line($pos,$pt,$pos,$pb,$gridcolor); + } +} + +# calculate major and minor ticks, given a start position +sub ticks { + my $self = shift; + my ($length,$minwidth) = @_; + + $length = $self->{length} unless defined $length; + $minwidth = gdSmallFont->width*7 unless defined $minwidth; + + my ($major,$minor); + + # figure out tick mark scale + # we want no more than 1 major tick mark every 40 pixels + # and enough room for the labels + my $scale = $self->scale; + + my $interval = 1; + + while (1) { + my $pixels = $interval * $scale; + last if $pixels >= $minwidth; + $interval *= 10; + } + + # to make sure a major tick shows up somewhere in the first half + # + $interval *= .5 if ($interval > 0.5*$length); + + return ($interval,$interval/10); +} + +# reverse of translate(); given index, return rgb triplet +sub rgb { + my $self = shift; + my $idx = shift; + my $gd = $self->{gd} or return; + return $gd->rgb($idx); +} + +sub translate_color { + my $self = shift; + my @colors = @_; + if (@colors == 3) { + my $gd = $self->gd or return 1; + return $self->colorClosest($gd,@colors); + } + elsif ($colors[0] =~ /^\#([0-9A-F]{2})([0-9A-F]{2})([0-9A-F]{2})$/i) { + my $gd = $self->gd or return 1; + my ($r,$g,$b) = (hex($1),hex($2),hex($3)); + return $self->colorClosest($gd,$r,$g,$b); + } + else { + my $color = $colors[0]; + my $table = $self->{translations} or return 1; + return defined $table->{$color} ? $table->{$color} : 1; + } +} + +# workaround for bad GD +sub colorClosest { + my ($self,$gd,@c) = @_; + return $self->{closestcache}{"@c"} if exists $self->{closestcache}{"@c"}; + return $self->{closestcache}{"@c"} = $gd->colorClosest(@c) if $GD::VERSION < 2.04; + my ($value,$index); + for (keys %COLORS) { + my ($r,$g,$b) = @{$COLORS{$_}}; + my $dist = ($r-$c[0])**2 + ($g-$c[1])**2 + ($b-$c[2])**2; + ($value,$index) = ($dist,$_) if !defined($value) || $dist < $value; + } + return $self->{closestcache}{"@c"} = $self->{translations}{$index}; +} + +sub bgcolor { + my $self = shift; + return unless $self->{bgcolor}; + $self->translate_color($self->{bgcolor}); +} + +sub set_pen { + my $self = shift; + my ($linewidth,$color) = @_; + return $self->{pens}{$linewidth,$color} if $self->{pens}{$linewidth,$color}; + + my $pen = $self->{pens}{$linewidth} = GD::Image->new($linewidth,$linewidth); + my @rgb = $self->rgb($color); + my $bg = $pen->colorAllocate(255,255,255); + my $fg = $pen->colorAllocate(@rgb); + $pen->fill(0,0,$fg); + $self->{gd}->setBrush($pen); + return gdBrushed; +} + +sub png { + my $gd = shift->gd; + $gd->png; +} + +sub read_colors { + my $class = shift; + while () { + chomp; + last if /^__END__/; + my ($name,$r,$g,$b) = split /\s+/; + $COLORS{$name} = [hex $r,hex $g,hex $b]; + } +} + +sub color_name_to_rgb { + my $class = shift; + my $color_name = shift; + $class->read_colors() unless %COLORS; + return unless $COLORS{$color_name}; + return wantarray ? @{$COLORS{$color_name}} + : $COLORS{$color_name}; +} + +sub color_names { + my $class = shift; + $class->read_colors unless %COLORS; + return wantarray ? keys %COLORS : [keys %COLORS]; +} + +1; + +__DATA__ +white FF FF FF +black 00 00 00 +aliceblue F0 F8 FF +antiquewhite FA EB D7 +aqua 00 FF FF +aquamarine 7F FF D4 +azure F0 FF FF +beige F5 F5 DC +bisque FF E4 C4 +blanchedalmond FF EB CD +blue 00 00 FF +blueviolet 8A 2B E2 +brown A5 2A 2A +burlywood DE B8 87 +cadetblue 5F 9E A0 +chartreuse 7F FF 00 +chocolate D2 69 1E +coral FF 7F 50 +cornflowerblue 64 95 ED +cornsilk FF F8 DC +crimson DC 14 3C +cyan 00 FF FF +darkblue 00 00 8B +darkcyan 00 8B 8B +darkgoldenrod B8 86 0B +darkgray A9 A9 A9 +darkgreen 00 64 00 +darkkhaki BD B7 6B +darkmagenta 8B 00 8B +darkolivegreen 55 6B 2F +darkorange FF 8C 00 +darkorchid 99 32 CC +darkred 8B 00 00 +darksalmon E9 96 7A +darkseagreen 8F BC 8F +darkslateblue 48 3D 8B +darkslategray 2F 4F 4F +darkturquoise 00 CE D1 +darkviolet 94 00 D3 +deeppink FF 14 100 +deepskyblue 00 BF FF +dimgray 69 69 69 +dodgerblue 1E 90 FF +firebrick B2 22 22 +floralwhite FF FA F0 +forestgreen 22 8B 22 +fuchsia FF 00 FF +gainsboro DC DC DC +ghostwhite F8 F8 FF +gold FF D7 00 +goldenrod DA A5 20 +gray 80 80 80 +green 00 80 00 +greenyellow AD FF 2F +honeydew F0 FF F0 +hotpink FF 69 B4 +indianred CD 5C 5C +indigo 4B 00 82 +ivory FF FF F0 +khaki F0 E6 8C +lavender E6 E6 FA +lavenderblush FF F0 F5 +lawngreen 7C FC 00 +lemonchiffon FF FA CD +lightblue AD D8 E6 +lightcoral F0 80 80 +lightcyan E0 FF FF +lightgoldenrodyellow FA FA D2 +lightgreen 90 EE 90 +lightgrey D3 D3 D3 +lightpink FF B6 C1 +lightsalmon FF A0 7A +lightseagreen 20 B2 AA +lightskyblue 87 CE FA +lightslategray 77 88 99 +lightsteelblue B0 C4 DE +lightyellow FF FF E0 +lime 00 FF 00 +limegreen 32 CD 32 +linen FA F0 E6 +magenta FF 00 FF +maroon 80 00 00 +mediumaquamarine 66 CD AA +mediumblue 00 00 CD +mediumorchid BA 55 D3 +mediumpurple 100 70 DB +mediumseagreen 3C B3 71 +mediumslateblue 7B 68 EE +mediumspringgreen 00 FA 9A +mediumturquoise 48 D1 CC +mediumvioletred C7 15 85 +midnightblue 19 19 70 +mintcream F5 FF FA +mistyrose FF E4 E1 +moccasin FF E4 B5 +navajowhite FF DE AD +navy 00 00 80 +oldlace FD F5 E6 +olive 80 80 00 +olivedrab 6B 8E 23 +orange FF A5 00 +orangered FF 45 00 +orchid DA 70 D6 +palegoldenrod EE E8 AA +palegreen 98 FB 98 +paleturquoise AF EE EE +palevioletred DB 70 100 +papayawhip FF EF D5 +peachpuff FF DA B9 +peru CD 85 3F +pink FF C0 CB +plum DD A0 DD +powderblue B0 E0 E6 +purple 80 00 80 +red FF 00 00 +rosybrown BC 8F 8F +royalblue 41 69 E1 +saddlebrown 8B 45 13 +salmon FA 80 72 +sandybrown F4 A4 60 +seagreen 2E 8B 57 +seashell FF F5 EE +sienna A0 52 2D +silver C0 C0 C0 +skyblue 87 CE EB +slateblue 6A 5A CD +slategray 70 80 90 +snow FF FA FA +springgreen 00 FF 7F +steelblue 46 82 B4 +tan D2 B4 8C +teal 00 80 80 +thistle D8 BF D8 +tomato FF 63 47 +turquoise 40 E0 D0 +violet EE 82 EE +wheat F5 DE B3 +whitesmoke F5 F5 F5 +yellow FF FF 00 +yellowgreen 9A CD 32 +gradient1 00 ff 00 +gradient2 0a ff 00 +gradient3 14 ff 00 +gradient4 1e ff 00 +gradient5 28 ff 00 +gradient6 32 ff 00 +gradient7 3d ff 00 +gradient8 47 ff 00 +gradient9 51 ff 00 +gradient10 5b ff 00 +gradient11 65 ff 00 +gradient12 70 ff 00 +gradient13 7a ff 00 +gradient14 84 ff 00 +gradient15 8e ff 00 +gradient16 99 ff 00 +gradient17 a3 ff 00 +gradient18 ad ff 00 +gradient19 b7 ff 00 +gradient20 c1 ff 00 +gradient21 cc ff 00 +gradient22 d6 ff 00 +gradient23 e0 ff 00 +gradient24 ea ff 00 +gradient25 f4 ff 00 +gradient26 ff ff 00 +gradient27 ff f4 00 +gradient28 ff ea 00 +gradient29 ff e0 00 +gradient30 ff d6 00 +gradient31 ff cc 00 +gradient32 ff c1 00 +gradient33 ff b7 00 +gradient34 ff ad 00 +gradient35 ff a3 00 +gradient36 ff 99 00 +gradient37 ff 8e 00 +gradient38 ff 84 00 +gradient39 ff 7a 00 +gradient40 ff 70 00 +gradient41 ff 65 00 +gradient42 ff 5b 00 +gradient43 ff 51 00 +gradient44 ff 47 00 +gradient45 ff 3d 00 +gradient46 ff 32 00 +gradient47 ff 28 00 +gradient48 ff 1e 00 +gradient49 ff 14 00 +gradient50 ff 0a 00 +__END__ + +=head1 NAME + +Bio::Graphics::Panel - Generate GD images of Bio::Seq objects + +=head1 SYNOPSIS + + # This script parses a GenBank or EMBL file named on the command + # line and produces a PNG rendering of it. Call it like this: + # render.pl my_file.embl | display - + + use strict; + use Bio::Graphics; + use Bio::SeqIO; + + my $file = shift or die "provide a sequence file as the argument"; + my $io = Bio::SeqIO->new(-file=>$file) or die "could not create Bio::SeqIO"; + my $seq = $io->next_seq or die "could not find a sequence in the file"; + + my @features = $seq->all_SeqFeatures; + + # sort features by their primary tags + my %sorted_features; + for my $f (@features) { + my $tag = $f->primary_tag; + push @{$sorted_features{$tag}},$f; + } + + my $panel = Bio::Graphics::Panel->new( + -length => $seq->length, + -key_style => 'between', + -width => 800, + -pad_left => 10, + -pad_right => 10, + ); + $panel->add_track( arrow => Bio::SeqFeature::Generic->new(-start=>1, + -end=>$seq->length), + -bump => 0, + -double=>1, + -tick => 2); + $panel->add_track(generic => Bio::SeqFeature::Generic->new(-start=>1, + -end=>$seq->length), + -glyph => 'generic', + -bgcolor => 'blue', + -label => 1, + ); + + # general case + my @colors = qw(cyan orange blue purple green chartreuse magenta yellow aqua); + my $idx = 0; + for my $tag (sort keys %sorted_features) { + my $features = $sorted_features{$tag}; + $panel->add_track($features, + -glyph => 'generic', + -bgcolor => $colors[$idx++ % @colors], + -fgcolor => 'black', + -font2color => 'red', + -key => "${tag}s", + -bump => +1, + -height => 8, + -label => 1, + -description => 1, + ); + } + + print $panel->png; + exit 0; + +=head1 DESCRIPTION + +The Bio::Graphics::Panel class provides drawing and formatting +services for any object that implements the Bio::SeqFeatureI +interface, including Ace::Sequence::Feature and Das::Segment::Feature +objects. It can be used to draw sequence annotations, physical +(contig) maps, or any other type of map in which a set of discrete +ranges need to be laid out on the number line. + +The module supports a drawing style in which each type of feature +occupies a discrete "track" that spans the width of the display. Each +track will have its own distinctive "glyph", a configurable graphical +representation of the feature. + +The module also supports a more flexible style in which several +different feature types and their associated glyphs can occupy the +same track. The choice of glyph is under run-time control. + +Semantic zooming (for instance, changing the type of glyph depending +on the density of features) is supported by a callback system for +configuration variables. The module has built-in support for Bio::Das +stylesheets, and stylesheet-driven configuration can be intermixed +with semantic zooming, if desired. + +You can add a key to the generated image using either of two key +styles. One style places the key captions at the top of each track. +The other style generates a graphical key at the bottom of the image. + +Note that this modules depends on GD. + +=head1 METHODS + +This section describes the class and object methods for +Bio::Graphics::Panel. + +Typically you will begin by creating a new Bio::Graphics::Panel +object, passing it the desired width of the image to generate and an +origin and length describing the coordinate range to display. The +Bio::Graphics::Panel-Enew() method has may configuration variables +that allow you to control the appearance of the image. + +You will then call add_track() one or more times to add sets of +related features to the picture. add_track() places a new horizontal +track on the image, and is likewise highly configurable. When you +have added all the features you desire, you may call png() to convert +the image into a PNG-format image, or boxes() to return coordinate +information that can be used to create an imagemap. + +=head2 CONSTRUCTORS + +new() is the constructor for Bio::Graphics::Panel: + +=over 4 + +=item $panel = Bio::Graphics::Panel-Enew(@options) + +The new() method creates a new panel object. The options are +a set of tag/value pairs as follows: + + Option Value Default + ------ ----- ------- + + -offset Base pair to place at extreme left none + of image, in zero-based coordinates + + -length Length of sequence segment, in bp none + + -start Start of range, in 1-based none + coordinates. + + -stop Stop of range, in 1-based none + coordinates. + + -segment A Bio::SeqI or Das::Segment none + object, used to derive sequence + range if not otherwise specified. + + -width Desired width of image, in pixels 600 + + -spacing Spacing between tracks, in pixels 5 + + -pad_top Additional whitespace between top 0 + of image and contents, in pixels + + -pad_bottom Additional whitespace between top 0 + of image and bottom, in pixels + + -pad_left Additional whitespace between left 0 + of image and contents, in pixels + + -pad_right Additional whitespace between right 0 + of image and bottom, in pixels + + -bgcolor Background color for the panel as a white + whole + + -key_color Background color for the key printed wheat + at bottom of panel (if any) + + -key_spacing Spacing between key glyphs in the 10 + key printed at bottom of panel + (if any) + + -key_font Font to use in printed key gdMediumBoldFont + captions. + + -key_style Whether to print key at bottom of none + panel ("bottom"), between each + track ("between"), to the left of + each track ("left"), to the right + of each track ("right") or + not at all ("none"). + + -empty_tracks What to do when a track is empty. suppress + Options are to suppress the track + completely ("suppress"), to show just + the key in "between" mode ("key"), + to draw a thin grey line ("line"), + or to draw a dashed line ("dashed"). + + -flip flip the drawing coordinates left false + to right, so that lower coordinates + are to the right. This can be + useful for drawing (-) strand + features. + + -all_callbacks Whether to invoke callbacks on false + the automatic "track" and "group" + glyphs. + + -grid Whether to draw a vertical grid in false + the background. Pass a scalar true + value to have a grid drawn at + regular intervals (corresponding + to the minor ticks of the arrow + glyph). Pass an array reference + to draw the grid at the specified + positions. + + -gridcolor Color of the grid lightcyan + + +Typically you will pass new() an object that implements the +Bio::RangeI interface, providing a length() method, from which the +panel will derive its scale. + + $panel = Bio::Graphics::Panel->new(-segment => $sequence, + -width => 800); + +new() will return undef in case of an error. + +Note that if you use the "left" or "right" key styles, you are +responsible for allocating sufficient -pad_left or -pad_right room for +the labels to appear. The necessary width is the number of characters +in the longest key times the font width (gdMediumBoldFont by default) +plus 3 pixels of internal padding. The simplest way to calculate this +is to iterate over the possible track labels, find the largest one, +and then to compute its width using the formula: + + $width = gdMediumBoldFont->width * length($longest_key) +3; + +=back + +=head2 OBJECT METHODS + +=over 4 + +=item $track = $panel-Eadd_track($glyph,$features,@options) + +The add_track() method adds a new track to the image. + +Tracks are horizontal bands which span the entire width of the panel. +Each track contains a number of graphical elements called "glyphs", +corresponding to a sequence feature. + +There are a large number of glyph types. By default, each track will +be homogeneous on a single glyph type, but you can mix several glyph +types on the same track by providing a code reference to the -glyph +argument. Other options passed to add_track() control the color and +size of the glyphs, whether they are allowed to overlap, and other +formatting attributes. The height of a track is determined from its +contents and cannot be directly influenced. + +The first two arguments are the glyph name and an array reference +containing the list of features to display. The order of the +arguments is irrelevant, allowing either of these idioms: + + $panel->add_track(arrow => \@features); + $panel->add_track(\@features => 'arrow'); + + +The glyph name indicates how each feature is to be rendered. A +variety of glyphs are available, and the number is growing. You may +omit the glyph name entirely by providing a B<-glyph> argument among +@options, as described below. + +Currently, the following glyphs are available: + + Name Description + ---- ----------- + + anchored_arrow + a span with vertical bases |---------|. If one or + the other end of the feature is off-screen, the base + will be replaced by an arrow. + + arrow An arrow; can be unidirectional or bidirectional. + It is also capable of displaying a scale with + major and minor tickmarks, and can be oriented + horizontally or vertically. + + cds Draws CDS features, using the phase information to + show the reading frame usage. At high magnifications + draws the protein translation. + + crossbox A box with a big "X" inside it. + + diamond A diamond, useful for point features like SNPs. + + dna At high magnification draws the DNA sequence. At + low magnifications draws the GC content. + + dot A circle, useful for point features like SNPs, stop + codons, or promoter elements. + + ellipse An oval. + + extending_arrow + Similar to arrow, but a dotted line indicates when the + feature extends beyond the end of the canvas. + + generic A filled rectangle, nondirectional. + + graded_segments + Similar to segments, but the intensity of the color + is proportional to the score of the feature. This + is used for showing the intensity of blast hits or + other alignment features. + + group A group of related features connected by a dashed line. + This is used internally by Panel. + + heterogeneous_segments + Like segments, but you can use the source field of the feature + to change the color of each segment. + + line A simple line. + + pinsertion A triangle designed to look like an insertion location + (e.g. a transposon insertion). + + processed_transcript multi-purpose representation of a spliced mRNA, including + positions of UTRs + + primers Two inward pointing arrows connected by a line. + Used for STSs. + + redgreen_box A box that changes from green->yellow->red as the score + of the feature increases from 0.0 to 1.0. Useful for + representing microarray results. + + rndrect A round-cornered rectangle. + + segments A set of filled rectangles connected by solid lines. + Used for interrupted features, such as gapped + alignments. + + ruler_arrow An arrow with major and minor tick marks and interval + labels. + + toomany Tries to show many features as a cloud. Not very successful. + + track A group of related features not connected by a line. + This is used internally by Panel. + + transcript Similar to segments, but the connecting line is + a "hat" shape, and the direction of transcription + is indicated by a small arrow. + + transcript2 Similar to transcript, but the direction of + transcription is indicated by a terminal exon + in the shape of an arrow. + + translation 1, 2 and 3-frame translations. At low magnifications, + can be configured to show start and stop codon locations. + At high magnifications, shows the multi-frame protein + translation. + + triangle A triangle whose width and orientation can be altered. + + xyplot Histograms and other graphs plotted against the genome. + +If the glyph name is omitted from add_track(), the "generic" glyph +will be used by default. To get more information about a glyph, run +perldoc on "Bio::Graphics::Glyph::glyphname", replacing "glyphname" +with the name of the glyph you are interested in. + +The @options array is a list of name/value pairs that control the +attributes of the track. Some options are interpretered directly by +the track. Others are passed down to the individual glyphs (see +L<"GLYPH OPTIONS">). The following options are track-specific: + + Option Description Default + ------ ----------- ------- + + -tkcolor Track color white + + -glyph Glyph class to use. "generic" + + -stylesheet Bio::Das::Stylesheet to none + use to generate glyph + classes and options. + +B<-tkcolor> controls the background color of the track as a whole. + +B<-glyph> controls the glyph type. If present, it supersedes the +glyph name given in the first or second argument to add_track(). The +value of B<-glyph> may be a constant string, a hash reference, or a +code reference. In the case of a constant string, that string will be +used as the class name for all generated glyphs. If a hash reference +is passed, then the feature's primary_tag() will be used as the key to +the hash, and the value, if any, used to generate the glyph type. If +a code reference is passed, then this callback will be passed each +feature in turn as its single argument. The callback is expected to +examine the feature and return a glyph name as its single result. + +Example: + + $panel->add_track(\@exons, + -glyph => sub { my $feature = shift; + $feature->source_tag eq 'curated' + ? 'ellipse' : 'generic'; } + ); + +The B<-stylesheet> argument is used to pass a Bio::Das stylesheet +object to the panel. This stylesheet will be called to determine both +the glyph and the glyph options. If both a stylesheet and direct +options are provided, the latter take precedence. + +If successful, add_track() returns an Bio::Graphics::Glyph object. +You can use this object to add additional features or to control the +appearance of the track with greater detail, or just ignore it. +Tracks are added in order from the top of the image to the bottom. To +add tracks to the top of the image, use unshift_track(). + +B It is not uncommon to add a group of +features which are logically connected, such as the 5' and 3' ends of +EST reads. To group features into sets that remain on the same +horizontal position and bump together, pass the sets as an anonymous +array. For example: + + $panel->add_track(segments => [[$abc_5,$abc_3], + [$xxx_5,$xxx_3], + [$yyy_5,$yyy_3]] + ); + +Typical usage is: + + $panel->add_track( transcript => \@genes, + -fillcolor => 'green', + -fgcolor => 'black', + -bump => +1, + -height => 10, + -label => 1); + +=item $track = unshift_track($glyph,$features,@options) + +unshift_track() works like add_track(), except that the new track is +added to the top of the image rather than the bottom. + +=item $gd = $panel-Egd([$gd]) + +The gd() method lays out the image and returns a GD::Image object +containing it. You may then call the GD::Image object's png() or +jpeg() methods to get the image data. + +Optionally, you may pass gd() a preexisting GD::Image object that you +wish to draw on top of. If you do so, you should call the width() and +height() methods first to ensure that the image has sufficient +dimensions. + +=item $png = $panel-Epng + +The png() method returns the image as a PNG-format drawing, without +the intermediate step of returning a GD::Image object. + +=item $boxes = $panel-Eboxes + +=item @boxes = $panel-Eboxes + +The boxes() method returns the coordinates of each glyph, useful for +constructing an image map. In a scalar context, boxes() returns an +array ref. In an list context, the method returns the array directly. + +Each member of the list is an anonymous array of the following format: + + [ $feature, $x1, $y1, $x2, $y2 ] + +The first element is the feature object; either an +Ace::Sequence::Feature, a Das::Segment::Feature, or another Bioperl +Bio::SeqFeatureI object. The coordinates are the topleft and +bottomright corners of the glyph, including any space allocated for +labels. + +=item $position = $panel-Etrack_position($track) + +After calling gd() or boxes(), you can learn the resulting Y +coordinate of a track by calling track_position() with the value +returned by add_track() or unshift_track(). This will return undef if +called before gd() or boxes() or with an invalid track. + +=item @pixel_coords = $panel-Elocation2pixel(@feature_coords) + +Public routine to map feature coordinates (in base pairs) into pixel +coordinates relative to the left-hand edge of the picture. + +=back + +=head1 GLYPH OPTIONS + +Each glyph has its own specialized subset of options, but +some are shared by all glyphs: + + Option Description Default + ------ ----------- ------- + + -fgcolor Foreground color black + + -bgcolor Background color turquoise + + -linewidth Width of lines drawn by 1 + glyph + + -height Height of glyph 10 + + -font Glyph font gdSmallFont + + -fontcolor Primary font color black + + -font2color Secondary font color turquoise + + -label Whether to draw a label false + + -description Whether to draw a false + description + + -bump Bump direction 0 + + -sort_order Specify layout sort order "default" + + -bump_limit Maximum number of levels undef (unlimited) + to bump + + -strand_arrow Whether to indicate undef (false) + strandedness + + -stranded Synonym for -strand_arrow undef (false) + + -connector Type of connector to none + use to connect related + features. Options are + "solid," "hat", "dashed", + "quill" and "none". + + -key Description of track for undef + use in key. + + -all_callbacks Whether to invoke undef + callbacks for autogenerated + "track" and "group" glyphs + + -box_subparts Return boxes around feature false + subparts rather than around the + feature itself. + + +B Colors can be expressed in either of two ways: +as symbolic names such as "cyan" and as HTML-style #RRGGBB triples. +The symbolic names are the 140 colors defined in the Netscape/Internet +Explorer color cube, and can be retrieved using the +Bio::Graphics::Panel-Ecolor_names() method. + +B The -fgcolor option controls the foreground +color, including the edges of boxes and the like. + +B The -bgcolor option controls the background used +for filled boxes and other "solid" glyphs. The foreground color +controls the color of lines and strings. The -tkcolor argument +controls the background color of the entire track. + +B The -tkcolor option used to specify the background of +the entire track. + +B The -fontcolor option controls the color of primary +text, such as labels + +B The -font2color option controls the color of +secondary text, such as descriptions. + +B The -label argument controls whether or not the ID of the +feature should be printed next to the feature. It is accepted by all +glyphs. By default, the label is printed just above the glyph and +left aligned with it. + +-label can be a constant string or a code reference. Values can be +any of: + + -label value Description + ------------ ----------- + + 0 Don't draw a label + 1 Calculate a label based on primary tag of sequence + "a string" Use "a string" as the label + code ref Invoke the code reference to compute the label + +A known bug with this naming scheme is that you can't label a feature +with the string "1". To work around this, use "1 " (note the terminal +space). + +B The -description argument controls whether or not a +brief description of the feature should be printed next to it. By +default, the description is printed just below the glyph and +left-aligned with it. A value of 0 will suppress the description. A +value of 1 will call the source_tag() method of the feature. A code +reference will be invoked to calculate the description on the fly. +Anything else will be treated as a string and used verbatim. + +B A glyph can contain subglyphs, recursively. The top +level glyph is the track, which contains one or more groups, which +contain features, which contain subfeatures, and so forth. By +default, the "group" glyph draws dotted lines between each of its +subglyphs, the "segment" glyph draws a solid line between each of its +subglyphs, and the "transcript" and "transcript2" glyphs draw +hat-shaped lines between their subglyphs. All other glyphs do not +connect their components. You can override this behavior by providing +a -connector option, to explicitly set the type of connector. Valid +options are: + + + "hat" an upward-angling conector + "solid" a straight horizontal connector + "quill" a decorated line with small arrows indicating strandedness + (like the UCSC Genome Browser uses) + "dashed" a horizontal dashed line. + +The B<-connector_color> option controls the color of the connector, if +any. + +B The -bump argument controls what happens when +glyphs collide. By default, they will simply overlap (value 0). A +-bump value of +1 will cause overlapping glyphs to bump downwards +until there is room for them. A -bump value of -1 will cause +overlapping glyphs to bump upwards. The bump argument can also be a +code reference; see below. + +B The -key argument declares that the track is to be shown in a +key appended to the bottom of the image. The key contains a picture +of a glyph and a label describing what the glyph means. The label is +specified in the argument to -key. + +B Ordinarily, when you invoke the boxes() methods to +retrieve the rectangles surrounding the glyphs (which you need to do +to create clickable imagemaps, for example), the rectangles will +surround the top level features. If you wish for the rectangles to +surround subpieces of the glyph, such as the exons in a transcript, +set box_subparts to a true value. + +B If set to true, some glyphs will indicate their +strandedness, usually by drawing an arrow. For this to work, the +Bio::SeqFeature must have a strand of +1 or -1. The glyph will ignore +this directive if the underlying feature has a strand of zero or +undef. + +B: By default, features are drawn with a layout based only on the +position of the feature, assuring a maximal "packing" of the glyphs +when bumped. In some cases, however, it makes sense to display the +glyphs sorted by score or some other comparison, e.g. such that more +"important" features are nearer the top of the display, stacked above +less important features. The -sort_order option allows a few +different built-in values for changing the default sort order (which +is by "left" position): "low_score" (or "high_score") will cause +features to be sorted from lowest to highest score (or vice versa). +"left" (or "default") and "right" values will cause features to be +sorted by their position in the sequence. "longer" (or "shorter") +will cause the longest (or shortest) features to be sorted first, and +"strand" will cause the features to be sorted by strand: "+1" +(forward) then "0" (unknown, or NA) then "-1" (reverse). + +In all cases, the "left" position will be used to break any ties. To +break ties using another field, options may be strung together using a +"|" character; e.g. "strand|low_score|right" would cause the features +to be sorted first by strand, then score (lowest to highest), then by +"right" position in the sequence. Finally, a subroutine coderef can +be provided, which should expect to receive two feature objects (via +the special sort variables $a and $b), and should return -1, 0 or 1 +(see Perl's sort() function for more information); this subroutine +will be used without further modification for sorting. For example, +to sort a set of database search hits by bits (stored in the features' +"score" fields), scaled by the log of the alignment length (with +"left" position breaking any ties): + + sort_order = sub { ( $b->score/log($b->length) + <=> + $a->score/log($a->length) ) + || + ( $a->start <=> $b->start ) + } + +B: When bumping is chosen, colliding features will +ordinarily move upward or downward without limit. When many features +collide, this can lead to excessively high images. You can limit the +number of levels that features will bump by providing a numeric +B option. + +=head2 Options and Callbacks + +Instead of providing a constant value to an option, you may subsitute +a code reference. This code reference will be called every time the +panel needs to configure a glyph. The callback will be called with +three arguments like this: + + sub callback { + my ($feature,$option_name,$part_no,$total_parts,$glyph) = @_; + # do something which results in $option_value being set + return $option_value; + } + +The five arguments are C<$feature>, a reference to the IO::SeqFeatureI +object, C<$option_name>, the name of the option to configure, +C<$part_no>, an integer index indicating which subpart of the feature +is being drawn, C<$total_parts>, an integer indicating the total +number of subfeatures in the feature, and finally C<$glyph>, the Glyph +object itself. The latter fields are useful in the case of treating +the first or last subfeature differently, such as using a different +color for the terminal exon of a gene. Usually you will only need to +examine the first argument. This example shows a callback examining +the score() attribute of a feature (possibly a BLAST hit) and return +the color "red" for high-scoring features, and "green" for low-scoring +features: + + sub callback { + my $feature = shift; + if ($feature->score > 90) { + return 'red'; + else { + return 'green'; + } + } + +The callback should return a string indicating the desired value of +the option. To tell the panel to use the default value for this +option, return the string "*default*". + +When you install a callback for a feature that contains subparts, the +callback will be invoked first for the top-level feature, and then for +each of its subparts (recursively). You should make sure to examine +the feature's type to determine whether the option is appropriate. + +Some glyphs deliberately disable this recursive feature. The "track", +"group", "transcript", "transcript2" and "segments" glyphs selectively +disable the -bump, -label and -description options. This is to avoid, +for example, a label being attached to each exon in a transcript, or +the various segments of a gapped alignment bumping each other. You +can override this behavior and force your callback to be invoked by +providing add_track() with a true B<-all_callbacks> argument. In this +case, you must be prepared to handle configuring options for the +"group" and "track" glyphs. + +In particular, this means that in order to control the -bump option +with a callback, you should specify -all_callbacks=E1, and turn on +bumping when the callback is in the track or group glyphs. + +=head2 ACCESSORS + +The following accessor methods provide access to various attributes of +the panel object. Called with no arguments, they each return the +current value of the attribute. Called with a single argument, they +set the attribute and return its previous value. + +Note that in most cases you must change attributes prior to invoking +gd(), png() or boxes(). These three methods all invoke an internal +layout() method which places the tracks and the glyphs within them, +and then caches the result. + + Accessor Name Description + ------------- ----------- + + width() Get/set width of panel + spacing() Get/set spacing between tracks + key_spacing() Get/set spacing between keys + length() Get/set length of segment (bp) + flip() Get/set coordinate flipping + pad_top() Get/set top padding + pad_left() Get/set left padding + pad_bottom() Get/set bottom padding + pad_right() Get/set right padding + start() Get the start of the sequence (bp; read only) + end() Get the end of the sequence (bp; read only) + left() Get the left side of the drawing area (pixels; read only) + right() Get the right side of the drawing area (pixels; read only) + +=head2 COLOR METHODS + +The following methods are used internally, but may be useful for those +implementing new glyph types. + +=over 4 + +=item @names = Bio::Graphics::Panel-Ecolor_names + +Return the symbolic names of the colors recognized by the panel +object. In a scalar context, returns an array reference. + +=item ($red,$green,$blue) = Bio::Graphics::Panel-Ecolor_name_to_rgb($color) + +Given a symbolic color name, returns the red, green, blue components +of the color. In a scalar context, returns an array reference to the +rgb triplet. Returns undef for an invalid color name. + +=item @rgb = $panel-Ergb($index) + +Given a GD color index (between 0 and 140), returns the RGB triplet +corresponding to this index. This method is only useful within a +glyph's draw() routine, after the panel has allocated a GD::Image and +is populating it. + +=item $index = $panel-Etranslate_color($color) + +Given a color, returns the GD::Image index. The color may be +symbolic, such as "turquoise", or a #RRGGBB triple, as in #F0E0A8. +This method is only useful within a glyph's draw() routine, after the +panel has allocated a GD::Image and is populating it. + +=item $panel-Eset_pen($width,$color) + +Changes the width and color of the GD drawing pen to the values +indicated. This is called automatically by the GlyphFactory fgcolor() +method. It returns the GD value gdBrushed, which should be used for +drawing. + +=back + +=head1 BUGS + +Please report them. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L, +L + +=head1 AUTHOR + +Lincoln Stein Elstein@cshl.orgE + +Copyright (c) 2001 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Graphics/Util.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Graphics/Util.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,40 @@ +package Bio::Graphics::Util; + +# $Id: Util.pm,v 1.2.2.2 2003/07/06 21:53:55 heikki Exp $ +# Non object-oriented utilities used here-and-there in Bio::Graphics modules + +use strict; +require Exporter; +use vars '@ISA','@EXPORT','@EXPORT_OK'; +@ISA = 'Exporter'; +@EXPORT = 'frame_and_offset'; + + +=over 4 + +=item ($frame,$offset) = frame_and_offset($pos,$strand,$phase) + +Calculate the reading frame for a given genomic position, strand and +phase. The offset is the offset from $pos to the first nucleotide +of the reading frame. + +In a scalar context, returns the frame only. + +=back + +=cut + +sub frame_and_offset { + my ($pos,$strand,$phase) = @_; + $strand ||= +1; + $phase ||= 0; + my $frame = $strand >= 0 + ? ($pos - $phase - 1) % 3 + : (1 - $pos - $phase) % 3; + my $offset = -$phase % 3; + $offset *= -1 if $strand < 0; + return wantarray ? ($frame,$offset) : $frame; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/IdCollectionI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/IdCollectionI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,105 @@ +# $Id: IdCollectionI.pm,v 1.2 2002/10/22 07:38:24 lapp Exp $ + +# +# This module is licensed under the same terms as Perl itself. You use, +# modify, and redistribute it under the terms of the Perl Artistic License. +# + +=head1 NAME + +Bio::IdcollectionI - interface for objects with multiple identifiers + +=head1 SYNOPSIS + + + # to test this is an identifiable collection object + + $obj->isa("Bio::IdCollectionI") || + $obj->throw("$obj does not implement the Bio::IdCollectionI interface"); + + # accessors + @authorities = $obj->id_authorities(); + @ids = $obj->ids(); + $id = $obj->ids($authority); + +=head1 DESCRIPTION + +This interface describes methods expected on objects that have +multiple identifiers, each of which is controlled by a different +authority. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email lstein@cshl.org + +=cut + +package Bio::IdCollectionI; +use vars qw(@ISA ); +use strict; +use Bio::Root::RootI; + + +@ISA = qw(Bio::Root::RootI); + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 id_authorities + + Title : id_authorities + Usage : @array = $obj->id_authorities() + Function: Return the authorities which have names for this object. + The authorities can then be used to select ids. + + Returns : An array + Status : Virtual + +=cut + +sub id_authorities { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 ids + + Title : ids + Usage : @ids = $obj->ids([$authority1,$authority2...]) + Function: return a list of Bio::IdentifiableI objects, optionally + filtered by the list of authorities. + + Returns : A list of Bio::IdentifiableI objects. + Status : Virtual + +=cut + +sub ids { + my ($self) = @_; + my @authorities = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/IdentifiableI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/IdentifiableI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,218 @@ +# $Id: IdentifiableI.pm,v 1.6 2002/10/23 18:07:49 lapp Exp $ + +# +# This module is licensed under the same terms as Perl itself. You use, +# modify, and redistribute it under the terms of the Perl Artistic License. +# + +=head1 NAME + +Bio::IdentifiableI - interface for objects with identifiers + +=head1 SYNOPSIS + + + # to test this is an identifiable object + + $obj->isa("Bio::IdentifiableI") || + $obj->throw("$obj does not implement the Bio::IdentifiableI interface"); + + # accessors + + $object_id = $obj->object_id(); + $namespace = $obj->namespace(); + $authority = $obj->authority(); + $version = $obj->version(); + + # utility function + + $lsid = $obj->lsid_string(); # gives authority:namespace:object_id + $ns_string = $obj->namespace_string(); # gives namespace:object_id.version + + +=head1 DESCRIPTION + +This interface describes methods expected on identifiable objects, ie +ones which have identifiers expected to make sense across a number of +instances and/or domains. This interface is modeled after pretty much +ubiquitous ideas for names in bioinformatics being + + databasename:object_id.version + +examples being + + swissprot:P012334.2 + +or + + GO:0007048 + +We also work well with LSID proposals which adds in the concept of an +authority, being the DNS name of the organisation assigning the namespace. +Helper functions are provided to make useful strings being + + + lsid_string - string complying to the LSID standard + namespace_string - string complying to the usual convention of + namespace:object_id.version + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +=cut + +package Bio::IdentifiableI; +use vars qw(@ISA ); +use strict; +use Bio::Root::RootI; + + +@ISA = qw(Bio::Root::RootI); + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + Returns : A scalar + Status : Virtual + +=cut + +sub object_id { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Returns : A number + Status : Virtual + +=cut + +sub version { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + Status : Virtual + +=cut + +sub authority { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + Returns : A scalar + Status : Virtual + +=cut + +sub namespace { + my ($self) = @_; + $self->throw_not_implemented(); +} + + + +=head1 Implementation optional functions + +These functions are helper functions that are provided by +the interface but can be overridden if so wished + +=head2 lsid_string + + Title : lsid_string + Usage : $string = $obj->lsid_string() + Function: a string which gives the LSID standard + notation for the identifier of interest + + + Returns : A scalar + +=cut + +sub lsid_string { + my ($self) = @_; + + return $self->authority.":".$self->namespace.":".$self->object_id; +} + + + +=head2 namespace_string + + Title : namespace_string + Usage : $string = $obj->namespace_string() + Function: a string which gives the common notation of + namespace:object_id.version + + Returns : A scalar + +=cut + +sub namespace_string { + my ($self) = @_; + + return $self->namespace.":".$self->object_id . + (defined($self->version()) ? ".".$self->version : ''); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/Abstract.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/Abstract.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,798 @@ + +# +# $Id: Abstract.pm,v 1.41 2002/12/17 02:08:36 jason Exp $ +# +# BioPerl module for Bio::Index::Abstract +# +# Cared for by Ewan Birney +# and James Gilbert +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::Abstract - Abstract interface for indexing a flat file + +=head1 SYNOPSIS + +You should not be using this module directly + +=head1 USING DB_FILE + +To use DB_File and not SDBM for this index, pass the value: + + -dbm_package => 'DB_File' + +to new (see below). + +=head1 DESCRIPTION + +This object provides the basic mechanism to associate positions +in files with names. The position and filenames are stored in DBM +which can then be accessed later on. It is the equivalent of flat +file indexing (eg, SRS or efetch). + +This object is the guts to the mechanism, which will be used by the +specific objects inheriting from it. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, James Gilbert + +Email - birney@sanger.ac.uk, jgrg@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal +methods are usually preceded with an "_" (underscore). + +=cut + + +# Let the code begin... + +package Bio::Index::Abstract; + +use strict; +use Fcntl qw( O_RDWR O_CREAT O_RDONLY ); +use vars qw( $TYPE_AND_VERSION_KEY + @ISA $USE_DBM_TYPE $DB_HASH ); + +# Object preamble - inheriets from Bio::Root::Object + +use Bio::Root::Root; +use Bio::Root::IO; +use Symbol(); + +@ISA = qw(Bio::Root::Root); + +# Generate accessor methods for simple object fields +BEGIN { + foreach my $func (qw(filename write_flag)) { + no strict 'refs'; + my $field = "_$func"; + + *$func = sub { + my( $self, $value ) = @_; + + if (defined $value) { + $self->{$field} = $value; + } + return $self->{$field}; + } + } +} + +=head2 new + + Usage : $index = Bio::Index::Abstract->new( + -filename => $dbm_file, + -write_flag => 0, + -dbm_package => 'DB_File', + -verbose => 0); + Function: Returns a new index object. If filename is + specified, then open_dbm() is immediately called. + Bio::Index::Abstract->new() will usually be called + directly only when opening an existing index. + Returns : A new index object + Args : -filename The name of the dbm index file. + -write_flag TRUE if write access to the dbm file is + needed. + -dbm_package The Perl dbm module to use for the + index. + -verbose Print debugging output to STDERR if + TRUE. + +=cut + +sub new { + my($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my( $filename, $write_flag, $dbm_package, $cachesize, $ffactor ) = + $self->_rearrange([qw(FILENAME + WRITE_FLAG + DBM_PACKAGE + CACHESIZE + FFACTOR + )], @args); + + # Store any parameters passed + $self->filename($filename) if $filename; + $self->cachesize($cachesize) if $cachesize; + $self->ffactor($ffactor) if $ffactor; + $self->write_flag($write_flag) if $write_flag; + $self->dbm_package($dbm_package) if $dbm_package; + + $self->{'_filehandle'} = []; # Array in which to cache SeqIO objects + $self->{'_DB'} = {}; # Gets tied to the DBM file + + # Open database + $self->open_dbm() if $filename; + return $self; +} + +=pod + +=head2 filename + + Title : filename + Usage : $value = $self->filename(); + $self->filename($value); + Function: Gets or sets the name of the dbm index file. + Returns : The current value of filename + Args : Value of filename if setting, or none if + getting the value. + +=head2 write_flag + + Title : write_flag + Usage : $value = $self->write_flag(); + $self->write_flag($value); + Function: Gets or sets the value of write_flag, which + is wether the dbm file should be opened with + write access. + Returns : The current value of write_flag (default 0) + Args : Value of write_flag if setting, or none if + getting the value. + +=head2 dbm_package + + Usage : $value = $self->dbm_package(); + $self->dbm_package($value); + + Function: Gets or sets the name of the Perl dbm module used. + If the value is unset, then it returns the value of + the package variable $USE_DBM_TYPE or if that is + unset, then it chooses the best available dbm type, + choosing 'DB_File' in preference to 'SDBM_File'. + Bio::Abstract::Index may work with other dbm file + types. + + Returns : The current value of dbm_package + Args : Value of dbm_package if setting, or none if + getting the value. + +=cut + +sub dbm_package { + my( $self, $value ) = @_; + my $to_require = 0; + if( $value || ! $self->{'_dbm_package'} ) { + my $type = $value || $USE_DBM_TYPE || 'DB_File'; + if( $type =~ /DB_File/i ) { + eval { + require DB_File; + }; + $type = ( $@ ) ? 'SDBM_File' : 'DB_File'; + } + if( $type ne 'DB_File' ) { + eval { require "$type.pm"; }; + $self->throw($@) if( $@ ); + } + $self->{'_dbm_package'} = $type; + if( ! defined $USE_DBM_TYPE ) { + $USE_DBM_TYPE = $self->{'_dbm_package'}; + } + } + return $self->{'_dbm_package'}; +} + +=head2 db + + Title : db + Usage : $index->db + Function: Returns a ref to the hash which is tied to the dbm + file. Used internally when adding and retrieving + data from the database. + Example : $db = $index->db(); + $db->{ $some_key } = $data + $data = $index->db->{ $some_key }; + Returns : ref to HASH + Args : NONE + +=cut + +sub db { + return $_[0]->{'_DB'}; +} + + +=head2 get_stream + + Title : get_stream + Usage : $stream = $index->get_stream( $id ); + Function: Returns a file handle with the file pointer + at the approprite place + + This provides for a way to get the actual + file contents and not an object + + WARNING: you must parse the record deliminter + *yourself*. Abstract wont do this for you + So this code + + $fh = $index->get_stream($myid); + while( <$fh> ) { + # do something + } + will parse the entire file if you don't put in + a last statement in, like + + while( <$fh> ) { + /^\/\// && last; # end of record + # do something + } + + Returns : A filehandle object + Args : string represents the accession number + Notes : This method should not be used without forethought + +=cut + +#' + +sub get_stream { + my ($self,$id) = @_; + + my ($desc,$acc,$out); + my $db = $self->db(); + + if (my $rec = $db->{ $id }) { + my( @record ); + + my ($file, $begin, $end) = $self->unpack_record( $rec ); + + # Get the (possibly cached) filehandle + my $fh = $self->_file_handle( $file ); + + # move to start + seek($fh, $begin, 0); + + return $fh; + } + else { + $self->throw("Unable to find a record for $id in the flat file index"); + } +} + + +=head2 cachesize + + Usage : $index->cachesize(1000000) + Function: Sets the dbm file cache size for the index. + Needs to be set before the DBM file gets opened. + Example : $index->cachesize(1000000) + Returns : size of the curent cache + +=cut + +sub cachesize { + my( $self, $size ) = @_; + + if(defined $size){ + $self->{'_cachesize'} = $size; + } + return ( $self->{'_cachesize'} ); + +} + + +=head2 ffactor + + Usage : $index->ffactor(1000000) + Function: Sets the dbm file fill factor. + Needs to be set before the DBM file gets opened. + + Example : $index->ffactor(1000000) + Returns : size of the curent cache + +=cut + +sub ffactor { + my( $self, $size ) = @_; + + if(defined $size){ + $self->{'_ffactor'} = $size; + } + return ( $self->{'_ffactor'} ); + +} + + +=head2 open_dbm + + Usage : $index->open_dbm() + Function: Opens the dbm file associated with the index + object. Write access is only given if explicitly + asked for by calling new(-write => 1) or having set + the write_flag(1) on the index object. The type of + dbm file opened is that returned by dbm_package(). + The name of the file to be is opened is obtained by + calling the filename() method. + + Example : $index->_open_dbm() + Returns : 1 on success + +=cut + +sub open_dbm { + my( $self ) = @_; + + my $filename = $self->filename() + or $self->throw("filename() not set"); + + my $db = $self->db(); + + # Close the dbm file if already open (maybe we're getting + # or dropping write access + if (ref($db) ne 'HASH') { + untie($db); + } + + # What kind of DBM file are we going to open? + my $dbm_type = $self->dbm_package; + + # Choose mode for opening dbm file (read/write+create or read-only). + my $mode_flags = $self->write_flag ? O_RDWR|O_CREAT : O_RDONLY; + + # Open the dbm file + if ($dbm_type eq 'DB_File') { + my $hash_inf = DB_File::HASHINFO->new(); + my $cache = $self->cachesize(); + my $ffactor = $self->ffactor(); + if ($cache){ + $hash_inf->{'cachesize'} = $cache; + } + if ($ffactor){ + $hash_inf->{'ffactor'} = $ffactor; + } + tie( %$db, $dbm_type, $filename, $mode_flags, 0644, $hash_inf ) + or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!"); + } else { + tie( %$db, $dbm_type, $filename, $mode_flags, 0644 ) + or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!"); + } + + # The following methods access data in the dbm file: + + # Now, if we're a Bio::Index::Abstract caterpillar, then we + # transform ourselves into a Bio::Index:: butterfly! + if( ref($self) eq "Bio::Index::Abstract" ) { + my $pkg = $self->_code_base(); + bless $self, $pkg; + } + + # Check or set this is the right kind and version of index + $self->_type_and_version(); + + # Check files haven't changed size since they were indexed + $self->_check_file_sizes(); + + return 1; +} + +=head2 _version + + Title : _version + Usage : $type = $index->_version() + Function: Returns a string which identifes the version of an + index module. Used to permanently identify an index + file as having been created by a particular version + of the index module. Must be provided by the sub class + Example : + Returns : + Args : none + +=cut + +sub _version { + my $self = shift; + + $self->throw("In Bio::Index::Abstract, no _version method in sub class"); +} + +=head2 _code_base + + Title : _code_base + Usage : $code = $db->_code_base(); + Function: + Example : + Returns : Code package to be used with this + Args : + + +=cut + +sub _code_base { + my ($self) = @_; + my $code_key = '__TYPE_AND_VERSION'; + my $record; + + $record = $self->db->{$code_key}; + + my($code,$version) = $self->unpack_record($record); + if( wantarray ) { + return ($code,$version); + } else { + return $code; + } +} + + +=head2 _type_and_version + + Title : _type_and_version + Usage : Called by _initalize + Function: Checks that the index opened is made by the same index + module and version of that module that made it. If the + index is empty, then it adds the information to the + database. + Example : + Returns : 1 or exception + Args : none + +=cut + +sub _type_and_version { + my $self = shift; + my $key = '__TYPE_AND_VERSION'; + my $version = $self->_version(); + my $type = ref $self; + + # Run check or add type and version key if missing + if (my $rec = $self->db->{ $key }) { + my( $db_type, $db_version ) = $self->unpack_record($rec); + $self->throw("This index file is from version [$db_version] - You need to rebuild it to use module version [$version]") + unless $db_version == $version; + $self->throw("This index file is type [$db_type] - Can't access it with module for [$type]") + unless $db_type eq $type; + } else { + $self->add_record( $key, $type, $version ) + or $self->throw("Can't add Type and Version record"); + } + return 1; +} + + +=head2 _check_file_sizes + + Title : _check_file_sizes + Usage : $index->_check_file_sizes() + Function: Verifies that the files listed in the database + are the same size as when the database was built, + or throws an exception. Called by the new() + function. + Example : + Returns : 1 or exception + Args : + +=cut + +sub _check_file_sizes { + my $self = shift; + my $num = $self->_file_count() || 0; + + for (my $i = 0; $i < $num; $i++) { + my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} ); + my $size = -s $file; + unless ($size == $stored_size) { + $self->throw("file $i [ $file ] has changed size $stored_size -> $size. This probably means you need to rebuild the index."); + } + } + return 1; +} + + +=head2 make_index + + Title : make_index + Usage : $index->make_index( FILE_LIST ) + Function: Takes a list of file names, checks that they are + all fully qualified, and then calls _filename() on + each. It supplies _filename() with the name of the + file, and an integer which is stored with each record + created by _filename(). Can be called multiple times, + and can be used to add to an existing index file. + Example : $index->make_index( '/home/seqs1', '/home/seqs2', '/nfs/pub/big_db' ); + Returns : Number of files indexed + Args : LIST OF FILES + +=cut + +sub make_index { + my($self, @files) = @_; + my $count = 0; + my $recs = 0; + # blow up if write flag is not set. EB fix + + if( !defined $self->write_flag ) { + $self->throw("Attempting to make an index on a read-only database. What about a WRITE flag on opening the index?"); + } + + # We're really fussy/lazy, expecting all file names to be fully qualified + $self->throw("No files to index provided") unless @files; + for(my $i=0;$ican('rel2abs') ) { + if( ! File::Spec->file_name_is_absolute($files[$i]) ) { + $files[$i] = File::Spec->rel2abs($files[$i]); + } + } else { + if( $^O =~ /MSWin/i ) { + ($files[$i] =~ m|^[A-Za-z]:/|) || + $self->throw("Not an absolute file path '$files[$i]'"); + } else { + ($files[$i] =~ m|^/|) || + $self->throw("Not an absolute file path '$files[$i]'"); + } + } + $self->throw("File does not exist '$files[$i]'") unless -e $files[$i]; + } + + # Add each file to the index + FILE : + foreach my $file (@files) { + + my $i; # index for this file + + # Get new index for this file and increment file count + if ( defined(my $count = $self->_file_count) ) { + $i = $count; + } else { + $i = 0; $self->_file_count(0); + } + + # see whether this file has been already indexed + my ($record,$number,$size); + + if( ($record = $self->db->{"__FILENAME_$file"}) ) { + ($number,$size) = $self->unpack_record($record); + + # if it is the same size - fine. Otherwise die + if( -s $file == $size ) { + warn "File $file already indexed. Skipping...\n"; + next FILE; + } else { + $self->throw("In index, $file has changed size ($size). Indicates that the index is out of date"); + } + } + + # index this file + warn "Indexing file $file\n" if( $self->verbose > 0); + + # this is supplied by the subclass and does the serious work + $recs += $self->_index_file( $file, $i ); # Specific method for each type of index + + + # Save file name and size for this index + $self->add_record("__FILE_$i", $file, -s $file) + or $self->throw("Can't add data to file: $file"); + $self->add_record("__FILENAME_$file", $i, -s $file) + or $self->throw("Can't add data to file: $file"); + + # increment file lines + $i++; $self->_file_count($i); + my $temp; + $temp = $self->_file_count(); + + + } + return ($count, $recs); +} + +=head2 _filename + + Title : _filename + Usage : $index->_filename( FILE INT ) + Function: Indexes the file + Example : + Returns : + Args : + +=cut + +sub _index_file { + my $self = shift; + + my $pkg = ref($self); + $self->throw("Error: '$pkg' does not provide the _index_file() method"); +} + + + +=head2 _file_handle + + Title : _file_handle + Usage : $fh = $index->_file_handle( INT ) + Function: Returns an open filehandle for the file + index INT. On opening a new filehandle it + caches it in the @{$index->_filehandle} array. + If the requested filehandle is already open, + it simply returns it from the array. + Example : $fist_file_indexed = $index->_file_handle( 0 ); + Returns : ref to a filehandle + Args : INT + +=cut + +sub _file_handle { + my( $self, $i ) = @_; + + unless ($self->{'_filehandle'}[$i]) { + my $fh = Symbol::gensym(); + my @rec = $self->unpack_record($self->db->{"__FILE_$i"}) + or $self->throw("Can't get filename for index : $i"); + my $file = $rec[0]; + open $fh, $file or $self->throw("Can't read file '$file' : $!"); + $self->{'_filehandle'}[$i] = $fh; # Cache filehandle + } + return $self->{'_filehandle'}[$i]; +} + + +=head2 _file_count + + Title : _file_count + Usage : $index->_file_count( INT ) + Function: Used by the index building sub in a sub class to + track the number of files indexed. Sets or gets + the number of files indexed when called with or + without an argument. + Example : + Returns : INT + Args : INT + +=cut + +sub _file_count { + my $self = shift; + if (@_) { + $self->db->{'__FILE_COUNT'} = shift; + } + return $self->db->{'__FILE_COUNT'}; +} + + +=head2 add_record + + Title : add_record + Usage : $index->add_record( $id, @stuff ); + Function: Calls pack_record on @stuff, and adds the result + of pack_record to the index database under key $id. + If $id is a reference to an array, then a new entry + is added under a key corresponding to each element + of the array. + Example : $index->add_record( $id, $fileNumber, $begin, $end ) + Returns : TRUE on success or FALSE on failure + Args : ID LIST + +=cut + +sub add_record { + my( $self, $id, @rec ) = @_; + $self->debug( "Adding key $id\n") if( $self->verbose > 0 ); + $self->db->{$id} = $self->pack_record( @rec ); + return 1; +} + + +=head2 pack_record + + Title : pack_record + Usage : $packed_string = $index->pack_record( LIST ) + Function: Packs an array of scalars into a single string + joined by ASCII 034 (which is unlikely to be used + in any of the strings), and returns it. + Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end ) + Returns : STRING or undef + Args : LIST + +=cut + +sub pack_record { + my( $self, @args ) = @_; + return join "\034", @args; +} + +=head2 unpack_record + + Title : unpack_record + Usage : $index->unpack_record( STRING ) + Function: Splits the sting provided into an array, + splitting on ASCII 034. + Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} ) + Returns : A 3 element ARRAY + Args : STRING containing ASCII 034 + +=cut + +sub unpack_record { + my( $self, @args ) = @_; + return split /\034/, $args[0]; +} + +=head2 count_records + + Title : count_records + Usage : $recs = $seqdb->count_records() + Function: return count of all recs in the index + Example : + Returns : a scalar + Args : none + + +=cut + +sub count_records { + my ($self,@args) = @_; + my $db = $self->db; + my $c = 0; + while (my($id, $rec) = each %$db) { + if( $id =~ /^__/ ) { + # internal info + next; + } + $c++; + } + + return ($c); +} + + +=head2 DESTROY + + Title : DESTROY + Usage : Called automatically when index goes out of scope + Function: Closes connection to database and handles to + sequence files + Returns : NEVER + Args : NONE + + +=cut + +sub DESTROY { + my $self = shift; + untie($self->{'_DB'}); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/AbstractSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/AbstractSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,313 @@ +# $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $ +# +# BioPerl module for Bio::DB::AbstractSeq +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::AbstractSeq - Base class for AbstractSeq s + +=head1 SYNOPSIS + + # Make a new sequence file indexing package + + package MyShinyNewIndexer; + use Bio::Index::AbstractSeq; + + @ISA = ('Bio::Index::AbstractSeq'); + + # Now provide the necessary methods... + +=head1 DESCRIPTION + +Provides a common base class for multiple +sequence files built using the +Bio::Index::Abstract system, and provides a +Bio::DB::SeqI interface. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=head1 SEE ALSO + +Bio::Index::Abstract - Module which +Bio::Index::AbstractSeq inherits off, which +provides dbm indexing for flat files (which are +not necessarily sequence files). + +=cut + +# Let's begin the code ... + + +package Bio::Index::AbstractSeq; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO::MultiFile; +use Bio::Index::Abstract; +use Bio::DB::SeqI; + + +@ISA = qw(Bio::Index::Abstract Bio::DB::SeqI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + $self->{'_seqio_cache'} = []; + return $self; +} + +=head2 _file_format + + Title : _file_format + Usage : $self->_file_format + Function: Derived classes should override this + method (it throws an exception here) + to give the file format of the files used + Example : + Returns : + Args : + + +=cut + +sub _file_format { + my ($self,@args) = @_; + + my $pkg = ref($self); + $self->throw("Class '$pkg' must provide a file format method correctly"); +} + +=head2 fetch + + Title : fetch + Usage : $index->fetch( $id ) + Function: Returns a Bio::Seq object from the index + Example : $seq = $index->fetch( 'dJ67B12' ) + Returns : Bio::Seq object + Args : ID + +=cut + +sub fetch { + my( $self, $id ) = @_; + my $db = $self->db(); + my $seq; + + if (my $rec = $db->{ $id }) { + my ($file, $begin) = $self->unpack_record( $rec ); + + # Get the (possibly cached) SeqIO object + my $seqio = $self->_get_SeqIO_object( $file ); + my $fh = $seqio->_fh(); + + # move to start of record + $begin-- if( $^O =~ /mswin/i); # workaround for Win DB_File bug + seek($fh, $begin, 0); + + $seq = $seqio->next_seq(); + } + + # we essentially assumme that the primary_id for the database + # is the display_id + $seq->primary_id($seq->display_id()) if( defined $seq && ref($seq) && + $seq->isa('Bio::PrimarySeqI') ); + + return $seq; +} + +=head2 _get_SeqIO_object + + Title : _get_SeqIO_object + Usage : $index->_get_SeqIO_object( $file ) + Function: Returns a Bio::SeqIO object for the file + Example : $seq = $index->_get_SeqIO_object( 0 ) + Returns : Bio::SeqIO object + Args : File number (an integer) + +=cut + +sub _get_SeqIO_object { + my( $self, $i ) = @_; + + unless ($self->{'_seqio_cache'}[$i]) { + my $fh = $self->_file_handle($i); + # make a new SeqIO object + my $seqio = Bio::SeqIO->new( -Format => $self->_file_format, + -fh => $fh); + $self->{'_seqio_cache'}[$i] = $seqio; + } + return $self->{'_seqio_cache'}[$i]; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id() + Function: retrieves a sequence object, identically to + ->fetch, but here behaving as a Bio::DB::BioSeqI + Returns : new Bio::Seq object + Args : string represents the id + + +=cut + +sub get_Seq_by_id { + my ($self,$id) = @_; + + return $self->fetch($id); +} + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc() + Function: retrieves a sequence object, identically to + ->fetch, but here behaving as a Bio::DB::BioSeqI + Returns : new Bio::Seq object + Args : string represents the accession number + + +=cut + +sub get_Seq_by_acc { + my ($self,$id) = @_; + + return $self->fetch($id); +} + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : $stream = get_PrimarySeq_stream + Function: Makes a Bio::DB::SeqStreamI compliant object + which provides a single method, next_primary_seq + Returns : Bio::DB::SeqStreamI + Args : none + + +=cut + +sub get_PrimarySeq_stream { + my $self = shift; + my $num = $self->_file_count() || 0; + my @file; + + for (my $i = 0; $i < $num; $i++) { + my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} ); + push(@file,$file); + } + + my $out = Bio::SeqIO::MultiFile->new( '-format' => $self->_file_format , -files => \@file); + return $out; +} + +=head2 get_all_primary_ids + + Title : get_all_primary_ids + Usage : @ids = $seqdb->get_all_primary_ids() + Function: gives an array of all the primary_ids of the + sequence objects in the database. These + maybe ids (display style) or accession numbers + or something else completely different - they + *are not* meaningful outside of this database + implementation. + Example : + Returns : an array of strings + Args : none + + +=cut + +sub get_all_primary_ids { + my ($self,@args) = @_; + my $db = $self->db; + + # the problem is here that we have indexed things both on + # accession number and name. + + # We could take two options + # here - loop over the database, returning only one copy of each + # id that points to the same byte position, or we rely on semantics + # of accession numbers. + + # someone is going to index a database with no accession numbers. + # doh!. We have to uniquify the index... + + my( %bytepos ); + while (my($id, $rec) = each %$db) { + if( $id =~ /^__/ ) { + # internal info + next; + } + my ($file, $begin) = $self->unpack_record( $rec ); + + $bytepos{"$file:$begin"} = $id; + } + + return values %bytepos; +} + + +=head2 get_Seq_by_primary_id + + Title : get_Seq_by_primary_id + Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string); + Function: Gets a Bio::Seq object by the primary id. The primary + id in these cases has to come from $db->get_all_primary_ids. + There is no other way to get (or guess) the primary_ids + in a database. + + The other possibility is to get Bio::PrimarySeqI objects + via the get_PrimarySeq_stream and the primary_id field + on these objects are specified as the ids to use here. + Returns : A Bio::Seq object + Args : primary id (as a string) + Throws : "acc does not exist" exception + + +=cut + +sub get_Seq_by_primary_id { + my ($self,$id) = @_; + return $self->fetch($id); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/Blast.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/Blast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,468 @@ +# $Id: Blast.pm,v 1.8.2.1 2003/06/28 21:57:04 jason Exp $ +# +# BioPerl module for Bio::Index::Blast +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::Blast - Indexes Blast reports and supports retrieval based on query accession(s) + +=head1 SYNOPSIS + + use strict; + use Bio::Index::Blast; + my $index = new Bio::Index::Blast(-filename => $indexfile, + -write_flag => 1); + $index->make_index($file1, $file2); + + my $data = $index->get_stream($id); + + my $bplite_report = $index->fetch_report($id); + print "query is ", $bplite_report->query, "\n"; + while( my $sbjct = $bplite_report->nextSbjct ) { + print $sbjct->name, "\n"; + while( my $hsp = $sbjct->nextHSP ) { + print "\t e-value ", $hsp->P, + } + print "\n"; + } + +=head1 DESCRIPTION + +This object allows one to build an index on a blast file (or files) +and provide quick access to the blast report for that accession. +Note: for best results 'use strict'. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@cgt.mc.duke.edu + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Index::Blast; +use vars qw(@ISA $VERSION); +use strict; + +use Bio::Root::Root; +use Bio::Index::Abstract; +use Bio::Tools::BPlite; +use IO::String; + +@ISA = qw(Bio::Index::Abstract Bio::Root::Root ); + +BEGIN { + $VERSION = 0.1; +} + +sub _version { + return $VERSION; +} + +=head2 new + + Usage : $index = Bio::Index::Abstract->new( + -filename => $dbm_file, + -write_flag => 0, + -dbm_package => 'DB_File', + -verbose => 0); + Function: Returns a new index object. If filename is + specified, then open_dbm() is immediately called. + Bio::Index::Abstract->new() will usually be called + directly only when opening an existing index. + Returns : A new index object + Args : -filename The name of the dbm index file. + -write_flag TRUE if write access to the dbm file is + needed. + -dbm_package The Perl dbm module to use for the + index. + -verbose Print debugging output to STDERR if + TRUE. + +=cut + +sub new { + + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + +} + +=head2 Bio::Index::Blast implemented methods + +=cut + +=head2 fetch_report + + Title : fetch_report + Usage : my $blastreport = $idx->fetch_report($id); + Function: Returns a Bio::Tools::BPlite report object + for a specific blast report + Returns : Bio::Tools::BPlite + Args : valid id + +=cut + +sub fetch_report{ + my ($self,$id) = @_; + my $fh = $self->get_stream($id); + my $report = new Bio::Tools::BPlite(-fh => $fh, + -noclose => 1); + return $report; +} + + +# shamlessly stolen from Bio::Index::Fasta + +=head2 id_parser + + Title : id_parser + Usage : $index->id_parser( CODE ) + Function: Stores or returns the code used by record_id to + parse the ID for record from a string. Useful + for (for instance) specifying a different + parser for different flavours of blast dbs. + Returns \&default_id_parser (see below) if not + set. If you supply your own id_parser + subroutine, then it should expect a fasta + description line. An entry will be added to + the index for each string in the list returned. + Example : $index->id_parser( \&my_id_parser ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub id_parser { + my( $self, $code ) = @_; + + if ($code) { + $self->{'_id_parser'} = $code; + } + return $self->{'_id_parser'} || \&default_id_parser; +} + + + +=head2 default_id_parser + + Title : default_id_parser + Usage : $id = default_id_parser( $header ) + Function: The default Blast Query ID parser for Bio::Index::Blast.pm + Returns $1 from applying the regexp /^>\s*(\S+)/ + to $header. + Returns : ID string + Args : a header line string + +=cut + +sub default_id_parser { + if ($_[0] =~ /^\s*(\S+)/) { + return $1; + } else { + return; + } +} + +=head2 Require methods from Bio::Index::Abstract + +=cut + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index BLAST report file(s). + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i, # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + ); + + open(BLAST, "<$file") or die("cannot open file $file\n"); + + my (@data, @records); + my $indexpoint = 0; + my $lastline = 0; + + while( ) { + if( /(T)?BLAST[PNX]/ ) { + if( @data ) { + # if we have already read a report + # then store the data for this report + # in the CURRENT index + $self->_process_report($indexpoint, $i,join("",@data)); + + } # handle fencepost problem (beginning) + # by skipping here when empty + + # since we are at the beginning of a new report + # store this begin location for the next index + $indexpoint = $lastline; + @data = (); + } + push @data, $_; + $lastline = tell(BLAST); + } + # handle fencepost problem (end) + if( @data ) { + $self->_process_report($indexpoint,$i,join("",@data)); + } +} + +sub _process_report { + my ($self,$begin,$i,$data) = @_; + + if( ! $data ) { + $self->warn("calling _process_report without a valid data string"); + return ; + } + my $id_parser = $self->id_parser; + + my $datal = new IO::String($data); + my $report = new Bio::Tools::BPlite(-fh => $datal, + -noclose => 1); + + my $query = $report->query; + foreach my $id (&$id_parser($query)) { + print "id is $id, begin is $begin\n" if( $self->verbose > 0); + $self->add_record($id, $i, $begin); + } +} +=head2 Bio::Index::Abstract methods + +=head2 filename + + Title : filename + Usage : $value = $self->filename(); + $self->filename($value); + Function: Gets or sets the name of the dbm index file. + Returns : The current value of filename + Args : Value of filename if setting, or none if + getting the value. + +=head2 write_flag + + Title : write_flag + Usage : $value = $self->write_flag(); + $self->write_flag($value); + Function: Gets or sets the value of write_flag, which + is wether the dbm file should be opened with + write access. + Returns : The current value of write_flag (default 0) + Args : Value of write_flag if setting, or none if + getting the value. + +=head2 dbm_package + + Usage : $value = $self->dbm_package(); + $self->dbm_package($value); + + Function: Gets or sets the name of the Perl dbm module used. + If the value is unset, then it returns the value of + the package variable $USE_DBM_TYPE or if that is + unset, then it chooses the best available dbm type, + choosing 'DB_File' in preference to 'SDBM_File'. + Bio::Abstract::Index may work with other dbm file + types. + + Returns : The current value of dbm_package + Args : Value of dbm_package if setting, or none if + getting the value. + + +=head2 get_stream + + Title : get_stream + Usage : $stream = $index->get_stream( $id ); + Function: Returns a file handle with the file pointer + at the approprite place + + This provides for a way to get the actual + file contents and not an object + + WARNING: you must parse the record deliminter + *yourself*. Abstract wont do this for you + So this code + + $fh = $index->get_stream($myid); + while( <$fh> ) { + # do something + } + will parse the entire file if you don't put in + a last statement in, like + + while( <$fh> ) { + /^\/\// && last; # end of record + # do something + } + + Returns : A filehandle object + Args : string represents the accession number + Notes : This method should not be used without forethought + + +=head2 open_dbm + + Usage : $index->open_dbm() + Function: Opens the dbm file associated with the index + object. Write access is only given if explicitly + asked for by calling new(-write => 1) or having set + the write_flag(1) on the index object. The type of + dbm file opened is that returned by dbm_package(). + The name of the file to be is opened is obtained by + calling the filename() method. + + Example : $index->_open_dbm() + Returns : 1 on success + + +=head2 _version + + Title : _version + Usage : $type = $index->_version() + Function: Returns a string which identifes the version of an + index module. Used to permanently identify an index + file as having been created by a particular version + of the index module. Must be provided by the sub class + Example : + Returns : + Args : none + +=head2 _filename + + Title : _filename + Usage : $index->_filename( FILE INT ) + Function: Indexes the file + Example : + Returns : + Args : + +=head2 _file_handle + + Title : _file_handle + Usage : $fh = $index->_file_handle( INT ) + Function: Returns an open filehandle for the file + index INT. On opening a new filehandle it + caches it in the @{$index->_filehandle} array. + If the requested filehandle is already open, + it simply returns it from the array. + Example : $fist_file_indexed = $index->_file_handle( 0 ); + Returns : ref to a filehandle + Args : INT + +=head2 _file_count + + Title : _file_count + Usage : $index->_file_count( INT ) + Function: Used by the index building sub in a sub class to + track the number of files indexed. Sets or gets + the number of files indexed when called with or + without an argument. + Example : + Returns : INT + Args : INT + + +=head2 add_record + + Title : add_record + Usage : $index->add_record( $id, @stuff ); + Function: Calls pack_record on @stuff, and adds the result + of pack_record to the index database under key $id. + If $id is a reference to an array, then a new entry + is added under a key corresponding to each element + of the array. + Example : $index->add_record( $id, $fileNumber, $begin, $end ) + Returns : TRUE on success or FALSE on failure + Args : ID LIST + +=head2 pack_record + + Title : pack_record + Usage : $packed_string = $index->pack_record( LIST ) + Function: Packs an array of scalars into a single string + joined by ASCII 034 (which is unlikely to be used + in any of the strings), and returns it. + Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end ) + Returns : STRING or undef + Args : LIST + +=head2 unpack_record + + Title : unpack_record + Usage : $index->unpack_record( STRING ) + Function: Splits the sting provided into an array, + splitting on ASCII 034. + Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} ) + Returns : A 3 element ARRAY + Args : STRING containing ASCII 034 + +=head2 DESTROY + + Title : DESTROY + Usage : Called automatically when index goes out of scope + Function: Closes connection to database and handles to + sequence files + Returns : NEVER + Args : NONE + + +=cut + + +1; + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/EMBL.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/EMBL.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,213 @@ +# $Id: EMBL.pm,v 1.21.2.1 2003/03/19 16:23:08 heikki Exp $ +# +# BioPerl module for Bio::Index::EMBL +# +# Cared for by Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::EMBL - Interface for indexing (multiple) EMBL/Swissprot +.dat files (ie flat file embl/swissprot format). + +=head1 SYNOPSIS + + # Complete code for making an index for several + # EMBL files + use Bio::Index::EMBL; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::EMBL->new('-filename' => $Index_File_Name, + '-write_flag' => 'WRITE'); + $inx->make_index(@ARGV); + + # Print out several sequences present in the index + # in Fasta format + use Bio::Index::EMBL; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::EMBL->new('-filename' => $Index_File_Name); + my $out = Bio::SeqIO->new('-format' => 'Fasta','-fh' => \*STDOUT); + + foreach my $id (@ARGV) { + my $seq = $inx->fetch($id); # Returns Bio::Seq object + $out->write_seq($seq); + } + + # alternatively + + my $seq1 = $inx->get_Seq_by_id($id); + my $seq2 = $inx->get_Seq_by_acc($acc); + +=head1 DESCRIPTION + +Inherits functions for managing dbm files from Bio::Index::Abstract.pm, +and provides the basic funtionallity for indexing EMBL files, and +retrieving the sequence from them. Heavily snaffled from James Gilbert's +Fasta system. Note: for best results 'use strict'. + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + + + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email - birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let's begin the code... + + +package Bio::Index::EMBL; + +use vars qw($VERSION @ISA); +use strict; + +use Bio::Index::AbstractSeq; +use Bio::Seq; + +@ISA = qw(Bio::Index::AbstractSeq); + +sub _type_stamp { + return '__EMBL_FLAT__'; # What kind of index are we? +} + +# +# Suggested fix by Michael G Schwern to +# get around a clash with CPAN shell... +# + +BEGIN { + $VERSION = 0.1; +} + +sub _version { + return $VERSION; +} + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index EMBL format files. + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + $id, # ID of last found record. + @accs, # accession of last record. Also put into the index + ); + + $begin = 0; + + open EMBL, $file or $self->throw("Can't open file for read : $file"); + + # Main indexing loop + $id = undef; + @accs = (); + while () { + if( /^\/\// ) { + if( ! defined $id ) { + $self->throw("Got to a end of entry line for an EMBL flat file with no parsed ID. Considering this a problem!"); + next; + } + if( ! @accs ) { + $self->warn("For id [$id] in embl flat file, got no accession number. Storing id index anyway"); + } + + $self->add_record($id, $i, $begin); + + foreach my $acc (@accs) { + if( $acc ne $id ) { + $self->add_record($acc, $i, $begin); + } + } + } elsif (/^ID\s+(\S+)/) { + $id = $1; + # not sure if I like this. Assummes tell is in bytes. + # we could tell before each line and save it. + $begin = tell(EMBL) - length( $_ ); + + } elsif (/^AC\s+(.*)?/) { + push @accs , split (/[; ]+/, $1); + } else { + # do nothing + } + } + + close EMBL; + return 1; +} + +=head2 _file_format + + Title : _file_format + Usage : Internal function for indexing system + Function: Provides file format for this database + Example : + Returns : + Args : + + +=cut + +sub _file_format{ + my ($self,@args) = @_; + + return 'EMBL'; +} + + + +1; + + + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/Fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/Fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,223 @@ +# +# $Id: Fasta.pm,v 1.20 2002/10/22 07:38:33 lapp Exp $ +# +# BioPerl module for Bio::Index::Fasta +# +# Cared for by James Gilbert +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::Fasta - Interface for indexing (multiple) fasta files + +=head1 SYNOPSIS + + # Complete code for making an index for several + # fasta files + use Bio::Index::Fasta; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::Fasta->new( + '-filename' => $Index_File_Name, + '-write_flag' => 1); + $inx->make_index(@ARGV); + + # Print out several sequences present in the index + # in Fasta format + use Bio::Index::Fasta; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::Fasta->new('-filename' => $Index_File_Name); + my $out = Bio::SeqIO->new('-format' => 'Fasta','-fh' => \*STDOUT); + + foreach my $id (@ARGV) { + my $seq = $inx->fetch($id); # Returns Bio::Seq object + $out->write_seq($seq); + } + + # or, alternatively + + my $seq = $inx->get_Seq_by_id($id); #identical to fetch + +=head1 DESCRIPTION + +Inherits functions for managing dbm files from Bio::Index::Abstract.pm, +and provides the basic funtionallity for indexing fasta files, and +retrieving the sequence from them. Note: for best results 'use strict'. + +Bio::Index::Fasta supports the Bio::DB::BioSeqI interface, meaning +it can be used a a Sequence database for other parts of bioperl + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - James Gilbert + +Email - jgrg@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Index::Fasta; + +use vars qw($VERSION @ISA); +use strict; + +use Bio::Index::AbstractSeq; +use Bio::Seq; + +@ISA = qw(Bio::Index::AbstractSeq); + +# +# Suggested fix by Michael G Schwern to +# get around a clash with CPAN shell... +# + +BEGIN { + $VERSION = 0.2; +} + +sub _version { + return $VERSION; +} + +=head2 _file_format + + Title : _file_format + Function: The file format for this package, which is needed + by the SeqIO system when reading the sequence. + Returns : 'Fasta' + +=cut + +sub _file_format { + return 'Fasta'; +} + + + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index FASTA format files. + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i, # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + ); + + $begin = 0; + + my $id_parser = $self->id_parser; + + open FASTA, $file or $self->throw("Can't open file for read : $file"); + + # Main indexing loop + while () { + if (/^>/) { + # $begin is the position of the first character after the '>' + my $begin = tell(FASTA) - length( $_ ) + 1; + + foreach my $id (&$id_parser($_)) { + $self->add_record($id, $i, $begin); + } + } + } + + close FASTA; + return 1; +} + +=head2 id_parser + + Title : id_parser + Usage : $index->id_parser( CODE ) + Function: Stores or returns the code used by record_id to + parse the ID for record from a string. Useful + for (for instance) specifying a different + parser for different flavours of FASTA file. + Returns \&default_id_parser (see below) if not + set. If you supply your own id_parser + subroutine, then it should expect a fasta + description line. An entry will be added to + the index for each string in the list returned. + Example : $index->id_parser( \&my_id_parser ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub id_parser { + my( $self, $code ) = @_; + + if ($code) { + $self->{'_id_parser'} = $code; + } + return $self->{'_id_parser'} || \&default_id_parser; +} + + + +=head2 default_id_parser + + Title : default_id_parser + Usage : $id = default_id_parser( $header ) + Function: The default Fasta ID parser for Fasta.pm + Returns $1 from applying the regexp /^>\s*(\S+)/ + to $header. + Returns : ID string + Args : a fasta header line string + +=cut + +sub default_id_parser { + if ($_[0] =~ /^>\s*(\S+)/) { + return $1; + } else { + return; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/Fastq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/Fastq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,221 @@ +# +# +# BioPerl module for Bio::Index::Fastq +# +# Cared for by Tony Cox +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::Fastq - Interface for indexing (multiple) fastq files + +=head1 SYNOPSIS + + # Complete code for making an index for several + # fastq files + use Bio::Index::Fastq; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::Fastq->new( + '-filename' => $Index_File_Name, + '-write_flag' => 1); + $inx->make_index(@ARGV); + + # Print out several sequences present in the index + # in Fastq format + use Bio::Index::Fastq; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::Fastq->new('-filename' => $Index_File_Name); + my $out = Bio::SeqIO->new('-format' => 'Fastq','-fh' => \*STDOUT); + + foreach my $id (@ARGV) { + my $seq = $inx->fetch($id); # Returns Bio::Seq::SeqWithQuality object + $out->write_seq($seq); + } + + # or, alternatively + + my $seq = $inx->get_Seq_by_id($id); #identical to fetch + +=head1 DESCRIPTION + +Inherits functions for managing dbm files from Bio::Index::Abstract.pm, +and provides the basic funtionallity for indexing fastq files, and +retrieving the sequence from them. Note: for best results 'use strict'. + +Bio::Index::Fastq supports the Bio::DB::BioSeqI interface, meaning +it can be used as a Sequence database for other parts of bioperl + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Tony Cox + +Email - avc@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Index::Fastq; + +use vars qw($VERSION @ISA); +use strict; + +use Bio::Index::AbstractSeq; +use Bio::Seq; + +@ISA = qw(Bio::Index::AbstractSeq); + +# +# Suggested fix by Michael G Schwern to +# get around a clash with CPAN shell... +# + +BEGIN { + $VERSION = 0.2; +} + +sub _version { + return $VERSION; +} + +=head2 _file_format + + Title : _file_format + Function: The file format for this package, which is needed + by the SeqIO system when reading the sequence. + Returns : 'Fastq' + +=cut + +sub _file_format { + return 'Fastq'; +} + + + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index FASTQ format files. + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i, # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + ); + + $begin = 0; + + my $id_parser = $self->id_parser; + my $c = 0; + open FASTQ, $file or $self->throw("Can't open file for read : $file"); + # Main indexing loop + while () { + if (/^@/) { + # $begin is the position of the first character after the '@' + my $begin = tell(FASTQ) - length( $_ ) + 1; + foreach my $id (&$id_parser($_)) { + $self->add_record($id, $i, $begin); + $c++; + } + } + } + + close FASTQ; + return ($c); +} + +=head2 id_parser + + Title : id_parser + Usage : $index->id_parser( CODE ) + Function: Stores or returns the code used by record_id to + parse the ID for record from a string. Useful + for (for instance) specifying a different + parser for different flavours of FASTQ file. + Returns \&default_id_parser (see below) if not + set. If you supply your own id_parser + subroutine, then it should expect a fastq + description line. An entry will be added to + the index for each string in the list returned. + Example : $index->id_parser( \&my_id_parser ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub id_parser { + my( $self, $code ) = @_; + + if ($code) { + $self->{'_id_parser'} = $code; + } + return $self->{'_id_parser'} || \&default_id_parser; +} + + + +=head2 default_id_parser + + Title : default_id_parser + Usage : $id = default_id_parser( $header ) + Function: The default Fastq ID parser for Fastq.pm + Returns $1 from applying the regexp /^>\s*(\S+)/ + to $header. + Returns : ID string + Args : a fastq header line string + +=cut + +sub default_id_parser { + if ($_[0] =~ /^@\s*(\S+)/) { + return $1; + } else { + return; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/GenBank.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/GenBank.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,207 @@ +# +# $Id: GenBank.pm,v 1.10 2002/10/22 07:38:33 lapp Exp $ +# +# BioPerl module for Bio::Index::Abstract +# +# Cared for by Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::GenBank - Interface for indexing (multiple) GenBank +.seq files (ie flat file GenBank format). + +=head1 SYNOPSIS + + # Complete code for making an index for several + # GenBank files + use Bio::Index::GenBank; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::GenBank->new('-filename' => $Index_File_Name, + '-write_flag' => 'WRITE'); + $inx->make_index(@ARGV); + + # Print out several sequences present in the index + # in gcg format + use Bio::Index::GenBank; + use Bio::SeqIO; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::GenBank->new('-filename' => $Index_File_Name); + my $seqio = new Bio::SeqIO(-format => 'gcg'); + foreach my $id (@ARGV) { + my $seq = $inx->fetch($id); # Returns Bio::Seq object + $seqio->write_seq($seq); + } + + # alternatively + + my $seq1 = $inx->get_Seq_by_id($id); + my $seq2 = $inx->get_Seq_by_acc($acc); + +=head1 DESCRIPTION + +Inherits functions for managing dbm files from Bio::Index::Abstract.pm, +and provides the basic funtionallity for indexing GenBank files, and +retrieving the sequence from them. Heavily snaffled from James Gilbert's +Fasta system. Note: for best results 'use strict'. + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email - birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let's begin the code... + + +package Bio::Index::GenBank; + +use vars qw($VERSION @ISA); +use strict; + +use Bio::Index::AbstractSeq; +use Bio::Seq; + +@ISA = qw(Bio::Index::AbstractSeq); + +sub _type_stamp { + return '__GenBank_FLAT__'; # What kind of index are we? +} + +# +# Suggested fix by Michael G Schwern to +# get around a clash with CPAN shell... +# + +BEGIN { + $VERSION = 0.1; +} + +sub _version { + return $VERSION; +} + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index GenBank format files. + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + $id, # ID of last found record. + @accs, # accession of last record. Also put into the index + ); + + $begin = 0; + + open GenBank, $file or $self->throw("Can't open file for read : $file"); + + # Main indexing loop + $id = undef; + @accs = (); + while () { + if( /^\/\// ) { + if( ! defined $id ) { + $self->throw("Got to a end of entry line for an GenBank flat file with no parsed ID. Considering this a problem!"); + next; + } + if( ! @accs ) { + $self->warn("For id [$id] in GenBank flat file, got no accession number. Storing id index anyway"); + } + + $self->add_record($id, $i, $begin); + + foreach my $acc (@accs) { + if( $acc ne $id ) { + $self->add_record($acc, $i, $begin); + } + } + @accs = (); + } elsif (/^LOCUS\s+(\S+)/) { + $id = $1; + # not sure if I like this. Assummes tell is in bytes. + # we could tell before each line and save it. + $begin = tell(GenBank) - length( $_ ); + + } elsif (/^ACCESSION(.*)/) { # ignore ? if there. + @accs = ($1 =~ /\s*(\S+)/g); + } elsif( /^VERSION(.*)/) { + my $a = $1; + $a =~ s/^\s+//; + $a =~ s/\s+$//; + $a =~ s/GI\://; + push @accs, split(/\s+/,$a); + } else { + # do nothing + } + } + + close GenBank; + return 1; +} + +=head2 _file_format + + Title : _file_format + Usage : Internal function for indexing system + Function: Provides file format for this database + Example : + Returns : + Args : + + +=cut + +sub _file_format{ + my ($self,@args) = @_; + + return 'GenBank'; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/SwissPfam.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/SwissPfam.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,206 @@ + +# +# $Id: SwissPfam.pm,v 1.15 2002/10/22 07:38:33 lapp Exp $ +# +# BioPerl module for Bio::Index::SwissPfam +# +# Cared for by Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::SwissPfam - Interface for indexing swisspfam files + +=head1 SYNOPSIS + + use Bio::Index::SwissPfam; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::SwissPfam->new('-filename' => $Index_File_Name, + '-write_flag' => 'WRITE'); + $inx->make_index(@ARGV); + + use Bio::Index::SwissPfam; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::SwissPfam->new('-filename' => $Index_File_Name); + + foreach my $id (@ARGV) { + my $seq = $inx->fetch($id); # Returns stream + while( <$seq> ) { + if(/^>/) { + print; + last; + } + } + } + + +=head1 DESCRIPTION + +SwissPfam is one of the flat files released with Pfam. This modules +provides a way of indexing this module. + +Inherits functions for managing dbm files from +Bio::Index::Abstract.pm, and provides the basic funtionallity for +indexing SwissPfam files. Only retrieves FileStreams at the +moment. Once we have something better (ie, an object!), will use +that. Heavily snaffled from James Gilbert's Fasta system. Note: for +best results 'use strict'. + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email - birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let's begin the code... + + +package Bio::Index::SwissPfam; + +use vars qw($VERSION @ISA); +use strict; + +use Bio::Index::Abstract; +use Bio::Seq; + +@ISA = qw(Bio::Index::Abstract); + +# +# Suggested fix by Michael G Schwern to +# get around a clash with CPAN shell... +# + +BEGIN { + $VERSION = 0.1; +} + +sub _version { + return $VERSION; +} + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index swisspfam format files. + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + $end, # Offset from start of file of the end + # of the last found record. + $id, # ID of last found record. + $acc, # accession of last record. Also put into the index + $nid, $nacc, # new ids for the record just found + ); + + $begin = 0; + $end = 0; + + open SP, $file or $self->throw("Can't open file for read : $file"); + + # Main indexing loop + while () { + if (/^>(\S+)\s+\|=*\|\s+(\S+)/) { + $nid = $1; + $nacc = $2; + my $new_begin = tell(SP) - length( $_ ); + $end = $new_begin - 1; + + if( $id ) { + $self->add_record($id, $i, $begin, $end); + if( $acc ne $id ) { + $self->add_record($acc, $i, $begin, $end); + } + } + $begin = $new_begin; + $id = $nid; + $acc = $nacc; + } + } + # Don't forget to add the last record + $end = tell(SP); + $self->add_record($id, $i, $begin, $end) if $id; + + close SP; + return 1; +} + + +=head2 fetch + + Title : fetch + Usage : $index->fetch( $id ) + Function: Returns a Bio::Seq object from the index + Example : $seq = $index->fetch( 'dJ67B12' ) + Returns : Bio::Seq object + Args : ID + +=cut + +sub fetch { + my( $self, $id ) = @_; + my $desc; + my $db = $self->db(); + if (my $rec = $db->{ $id }) { + my( @record ); + + my ($file, $begin, $end) = $self->unpack_record( $rec ); + + # Get the (possibly cached) filehandle + my $fh = $self->_file_handle( $file ); + + # move to start + seek($fh, $begin, 0); + + return $fh; + } else { + $self->throw("Unable to find a record for $id in SwissPfam flat file index"); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Index/Swissprot.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/Swissprot.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,218 @@ + +# +# $Id: Swissprot.pm,v 1.11 2002/10/22 07:38:33 lapp Exp $ +# +# BioPerl module for Bio::Index::Abstract +# +# Cared for by Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::Swissprot - Interface for indexing (multiple) Swissprot +.dat files (ie flat file swissprot format). + +=head1 SYNOPSIS + + # Complete code for making an index for several + # Swissprot files + use Bio::Index::Swissprot; + use strict; + + my $Index_File_Name = shift; + my $inx = Bio::Index::Swissprot->new('-filename' => $Index_File_Name, + '-write_flag' => 'WRITE'); + $inx->make_index(@ARGV); + + # Print out several sequences present in the index + # in gcg format + use Bio::Index::Swissprot; + use Bio::SeqIO; + use strict; + + my $out = Bio::SeqIO->new( '-format' => 'gcg', '-fh' => \*STDOUT ); + my $Index_File_Name = shift; + my $inx = Bio::Index::Swissprot->new('-filename' => $Index_File_Name); + + foreach my $id (@ARGV) { + my $seq = $inx->fetch($id); # Returns Bio::Seq object + $out->write_seq($seq); + } + + # alternatively + + my $seq1 = $inx->get_Seq_by_id($id); + my $seq2 = $inx->get_Seq_by_acc($acc); + +=head1 DESCRIPTION + +Inherits functions for managing dbm files from Bio::Index::Abstract.pm, +and provides the basic funtionallity for indexing Swissprot files, and +retrieving the sequence from them. Heavily snaffled from James Gilbert's +Fasta system. Note: for best results 'use strict'. + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email - birney@sanger.ac.uk +(Swissprot adaption: lorenz@ist.org) + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let's begin the code... + + +package Bio::Index::Swissprot; + +use vars qw($VERSION @ISA); +use strict; + +use Bio::Index::AbstractSeq; +use Bio::Seq; + +@ISA = qw(Bio::Index::AbstractSeq); + +sub _type_stamp { + return '__Swissprot_FLAT__'; # What kind of index are we? +} + +# +# Suggested fix by Michael G Schwern to +# get around a clash with CPAN shell... +# + +BEGIN { + $VERSION = 0.1; +} + +sub _version { + return $VERSION; +} + +=head2 _index_file + + Title : _index_file + Usage : $index->_index_file( $file_name, $i ) + Function: Specialist function to index Swissprot format files. + Is provided with a filename and an integer + by make_index in its SUPER class. + Example : + Returns : + Args : + +=cut + +sub _index_file { + my( $self, + $file, # File name + $i # Index-number of file being indexed + ) = @_; + + my( $begin, # Offset from start of file of the start + # of the last found record. + $id, # ID of last found record. + @accs, # accession of last record. Also put into the index + ); + + $begin = 0; + + open SWISSPROT, $file or $self->throw("Can't open file for read : $file"); + + # Main indexing loop + $id = undef; + @accs = (); + while () { + if( /^\/\// ) { + if( ! defined $id ) { + $self->throw("Got to a end of entry line for an Swissprot flat file with no parsed ID. Considering this a problem!"); + next; + } + if( ! @accs ) { + $self->warn("For id [$id] in Swissprot flat file, got no accession number. Storing id index anyway"); + } + + $self->add_record($id, $i, $begin); + + foreach my $acc (@accs) { + if( $acc ne $id ) { + $self->add_record($acc, $i, $begin); + } + } + @accs = (); # reset acc array + $id = undef; # reset id + } elsif (/^ID\s+(\S+)/) { + $id = $1; + # not sure if I like this. Assummes tell is in bytes. + # we could tell before each line and save it. + $begin = tell(SWISSPROT) - length( $_ ); + + } elsif (/^AC(.*)/) { # ignore ? if there. + push(@accs, ($1 =~ /\s*(\S+);/g)); + } else { + # do nothing + } + } + + close SWISSPROT; + return 1; +} + +=head2 _file_format + + Title : _file_format + Usage : Internal function for indexing system + Function: Provides file format for this database + Example : + Returns : + Args : + + +=cut + +sub _file_format{ + my ($self,@args) = @_; + + return 'swiss'; +} + + + +1; + + + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/AARange.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/AARange.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,423 @@ +# $Id: AARange.pm,v 1.10 2001/10/22 08:22:49 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::AARange +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::AARange - AARange abstract class for LiveSeq + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +This is used as possible parent for aminoacid range object classes. +Or it can be used straight away to define aminoacid ranges. The idea +is that the ranges defined are attached to a Translation object and +they refer to its coordinate-system when they are first created (via +the new() method). When they are created they are anyway linked to +the underlying DNA LiveSeq by way of the LiveSeq labels. This allows +to preserve the ranges even if the numbering changes in the +Translation due to deletions or insertions. + +The protein sequence associated with the AARange can be accessed via +the usual seq() or subseq() methods. + +The start and end of the AARange in protein coordinate system can be +fetched with aa_start() and aa_end() methods. Note: the behaviour of +these methods would be influenced by the coordinate_start set in the +corresponding Translation object. This can be desirable but can also +lead to confusion if the coordinate_start had been changed and the +original position of the AARange was to be retrieved. + +start() and end() methods of the AARange will point to the labels +identifying the first nucleotide of the first and last triplet coding +for the start and end of the AminoAcidRange. + +The underlying nucleotide sequence of the AARange can be retrieved +with the labelsubseq() method. This would retrieve the whole DNA +sequence, including possible introns. This is called "DNA_sequence". + +To fetch the nucleotide sequence of the Transcript, without introns, +the labelsubseq() of the attached Transcript (the Transcript the +Translation comes from) has to be accessed. This is called +"cDNA_sequence". + +Here are the operations to retrieve these latter two kinds of +sequences: + + $startlabel=$AARange->start; + $endtripletlabel=$AARange->end; + $endlabel=$AARange->{'seq'}->label(3,$endtripletlabel,$AARange->strand); + + $dnaseq=$AARange->labelsubseq($startlabel,undef,$endlabel)); + + $cdnaseq=$AARange->get_Transcript->labelsubseq($startlabel,undef,$endlabel); + +To simplify, these operations have been included in two additional +methods: dna_seq() and cdna_seq(). + +These would return the whole sequence, as in the examples above. But +the above general scheme can be used by specifying different labels, +to retrieve hypothetical subsequences of interest. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::AARange; +$VERSION=1.8; + +# Version history: +# Wed Apr 19 15:10:29 BST 2000 v 1.0 begun +# Wed Apr 19 17:26:58 BST 2000 v 1.4 new, aa_start, aa_end, seq, length created +# Wed Apr 19 19:57:42 BST 2000 v 1.5 subseq position label added +# Thu Apr 20 16:13:58 BST 2000 v 1.7 added: documentation, dna_seq, cdna_seq +# Wed Mar 28 16:58:02 BST 2001 v 1.8 carp -> warn,throw (coded methods in SeqI) + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it +@ISA=qw(Bio::LiveSeq::SeqI); + +=head2 new + + Title : new + Usage : $aarange = Bio::LiveSeq::AARange->new(-translation => $obj_ref, + -start => $beginaa, + -end => $endaa, + -name => "ABCD", + -description => "DCBA", + -translength => $length); + + Function: generates a new AminoAcidRange LiveSeq object + Returns : reference to a new object of class AARange + Errorcode -1 + Args : two positions in AminoAcid coordinate numbering + an object reference specifying to which translation the aminoacid + ranges refer to + a name and a description (optional) + an optional "translength" argument: this can be given when + a lot of AARanges are to be created at the same time for the same + Translation object, calculating it with $translation->length + This would increase the speed, avoiding the new() function to + calculate everytime the same length again and again for every obj. + +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($obj,%range); + + $obj = \%range; + $obj = bless $obj, $class; + my $self=$obj; + + my ($translation,$start,$end,$name,$description,$translength)=($args{-translation},$args{-start},$args{-end},$args{-name},$args{-description},$args{-translength}); + + unless (($translation)&&(ref($translation) eq "Bio::LiveSeq::Translation")) { + $self->warn("No -translation or wrong type given"); + return (-1); + } + unless ($translength) { # if it's not given, fetch it + $translength=$translation->length; + } + my $seq=$translation->{'seq'}; + + if (($start < 1)&&($start > $translength)) { + $self->warn("$class not initialised because start aminoacid position not valid"); + return (-1); + } + if (($end < 1)&&($end > $translength)) { + $self->warn("$class not initialised because end aminoacid position not valid"); + return (-1); + } + if ($start > $end) { + $self->warn("$class not initialised because start position > end position!"); + return (-1); + } + + my ($starttripletlabel,$endtripletlabel); + if ($start == $end) { # trick to increase speed + $starttripletlabel=$endtripletlabel=$translation->label($start); + } else { + ($starttripletlabel,$endtripletlabel)=($translation->label($start),$translation->label($end)); + } + unless (($starttripletlabel > 0)&&($endtripletlabel > 0)) { + $self->warn("$class not initialised because of problems in retrieving start or end label!"); + return (-1); + } + + # unsure if needed: + #my $endlabel=$seq->label(3,$endtripletlabel); # to get the real end + #unless ($endlabel > 0) { + #carp "$class not initialised because of problems retrieving the last nucleotide of the triplet coding for the end aminoacid"; + #return (-1); + #} + $self->{'seq'}=$seq; + $self->{'start'}=$starttripletlabel; + $self->{'end'}=$endtripletlabel; + $self->{'strand'}=$translation->strand; + $self->{'translation'}=$translation; + $self->{'name'}=$name; + $self->{'description'}=$description; + $self->{'alphabet'}="protein"; + + return $obj; +} + +sub coordinate_start { + my $self=shift; + $self->warn("Cannot perform this operation in an AminoAcidRange object!"); + return (-1); +} + +sub all_labels { + my $self=shift; + $self->warn("Cannot perform this operation in an AminoAcidRange object!"); + return (-1); +} + +sub valid { + my $self=shift; + $self->warn("Cannot perform this operation in an AminoAcidRange object!"); + return (-1); +} + +=head2 get_Transcript + + Title : valid + Usage : $transcript = $obj->get_Transcript() + Function: retrieves the reference to the object of class Transcript (if any) + attached to a LiveSeq object + Returns : object reference + Args : none + +=cut + +sub get_Transcript { + my $self=shift; + return ($self->get_Translation->get_Transcript); +} + +=head2 get_Translation + + Title : valid + Usage : $translation = $obj->get_Translation() + Function: retrieves the reference to the object of class Translation (if any) + attached to a LiveSeq object + Returns : object reference + Args : none + +=cut + +sub get_Translation { + my $self=shift; + return ($self->{'translation'}); +} + +sub change { + my $self=shift; + $self->warn("Cannot change an AminoAcidRange object!"); + return (-1); +} +sub positionchange { + my $self=shift; + $self->warn("Cannot change an AminoAcidRange object!"); + return (-1); +} +sub labelchange { + my $self=shift; + $self->warn("Cannot change an AminoAcidRange object!"); + return (-1); +} + +sub subseq { + my ($self,$pos1,$pos2,$length) = @_; + if (defined ($length)) { + if ($length < 1) { + $self->warn("No sense asking for a subseq of length < 1"); + return (-1); + } + } + unless (defined ($pos1)) { + $pos1=1; + } elsif ($pos1 < 1) { # if position out of boundaries + $self->warn("Starting position for AARange cannot be < 1!"); return (-1); + if ((defined ($pos2))&&($pos1>$pos2)) { + $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); + } + } + my $seq=$self->seq; + my $objlength=length($seq); + unless (defined ($length)) { + $length=$objlength-$pos1+1; + } + if (defined ($pos2)) { + if ($pos2 > $objlength) { # if position out of boundaries + $self->warn("Ending position for AARange cannot be > length of AARange!"); return (-1); + } + $length=$pos2-$pos1+1; + if ((defined ($pos1))&&($pos1>$pos2)) { + $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); + } + } + my $str=substr($seq,$pos1-1,$length); + if (length($str) < $length) { + $self->warn("Attention, cannot return the length requested for subseq",1); + } + return $str; +} + +sub seq { + my $self=shift; + my ($aa_start,$aa_end)=($self->aa_start,$self->aa_end); + unless (($aa_start)&&($aa_end)) { # they must both exist + $self->warn("Not able to find start or end of the AminoAcid Range"); + return (0); + } + my $translseq=$self->get_Translation->seq; + return substr($translseq,$aa_start-1,$aa_end-$aa_start+1); + # Note: it will return "undef" if the translation stops before the start + # of the aarange (because of upstream nonsense mutation creating STOP). + # For the same reason it would return uncomplete (up to the STOP) string + # if the stop happens in between aarange's start and stop +} + +sub length { + my $self=shift; + my $seq=$self->seq; + my $length=length($seq); + return $length; +} + +sub label { + my ($self,$position)=@_; + my $translation=$self->get_Translation; + my $origstart=$translation->coordinate_start; # preserve it + $translation->coordinate_start($self->start); # change it + my $label=$translation->label($position); + $translation->coordinate_start($origstart); # restore it + return ($label); +} + +sub position { + my ($self,$label)=@_; + my $translation=$self->get_Translation; + my $origstart=$translation->coordinate_start; # preserve it + $translation->coordinate_start($self->start); # change it + my $position=$translation->position($label); + $translation->coordinate_start($origstart); # restore it + return ($position); +} + +=head2 aa_start + + Title : aa_start + Usage : $end = $aarange->aa_start() + Returns : integer (position, according to Translation coordinate system) of + the start of an AminoAcidRange object + Args : none + +=cut + +sub aa_start { + my $self=shift; + my $aastart=$self->get_Translation->position($self->{'start'}); +} + +=head2 aa_end + + Title : aa_end + Usage : $end = $aarange->aa_end() + Returns : integer (position, according to Translation coordinate system) of + the end of an AminoAcidRange object + Args : none + +=cut + +sub aa_end { + my $self=shift; + my $aastart=$self->get_Translation->position($self->{'end'}); +} + +=head2 dna_seq + + Title : dna_seq + Usage : $end = $aarange->dna_seq() + Returns : the sequence at DNA level of the entire AminoAcidRange + this would include introns (if present) + Args : none + +=cut + +sub dna_seq { + my $self=shift; + my $startlabel=$self->start; + my $endtripletlabel=$self->end; + my $endlabel=$self->{'seq'}->label(3,$endtripletlabel,$self->strand); + return ($self->labelsubseq($startlabel,undef,$endlabel)); +} + +=head2 cdna_seq + + Title : cdna_seq + Usage : $end = $aarange->cdna_seq() + Returns : the sequence at cDNA level of the entire AminoAcidRange + i.e. this is the part of the Transcript that codes for the + AminoAcidRange. It would be composed just of exonic DNA. + Args : none + +=cut + +sub cdna_seq { + my $self=shift; + my $startlabel=$self->start; + my $endtripletlabel=$self->end; + my $endlabel=$self->{'seq'}->label(3,$endtripletlabel,$self->strand); + return ($self->get_Transcript->labelsubseq($startlabel,undef,$endlabel)); +} + +# this checks if the attached Transcript has a Gene object attached +sub gene { + my ($self,$value) = @_; + if (defined $value) { + $self->{'gene'} = $value; + } + unless (exists $self->{'gene'}) { + unless (exists $self->get_Transcript->{'gene'}) { + return (0); + } else { + return ($self->get_Transcript->{'gene'}); + } + } else { + return $self->{'gene'}; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Chain.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Chain.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1378 @@ +#!/usr/bin/perl +# $Id: Chain.pm,v 1.12 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Chain +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::LiveSeq::Chain - DoubleChain DataStructure for Perl + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +This is a general purpose module (that's why it's not in object-oriented +form) that introduces a novel datastructure in PERL. It implements +the "double linked chain". The elements of the chain can contain basically +everything. From chars to strings, from object references to arrays or hashes. +It is used in the LiveSequence project to create a dynamical DNA sequence, +easier to manipulate and change. It's use is mainly for sequence variation +analysis but it could be used - for example - in e-cell projects. +The Chain module in itself doesn't have any biological bias, so can be +used for any programming purpose. + +Each element of the chain (with the exclusion of the first and the last of the +chain) is connected to other two elements (the PREVious and the NEXT one). +There is no absolute position (like in an array), hence if positions are +important, they need to be computed (methods are provided). +Otherwise it's easy to keep track of the elements with their "LABELs". +There is one LABEL (think of it as a pointer) to each ELEMENT. The labels +won't change after insertions or deletions of the chain. So it's +always possible to retrieve an element even if the chain has been +modified by successive insertions or deletions. +From this the high potential profit for bioinformatics: dealing with +sequences in a way that doesn't have to rely on positions, without +the need of constantly updating them if the sequence changes, even +dramatically. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +# DoubleChain Data Structure for PERL +# by Joseph A.L. Insana - Deathson - Filius Mortis - Fal Mortais +# insana@ebi.ac.uk, jinsana@gmx.net + +package Bio::LiveSeq::Chain; +# Version history: +# Fri Mar 10 16:46:51 GMT 2000 v1.0 begun working on chains in perl +# Sat Mar 11 05:47:21 GMT 2000 v.1.4 working on splice method +# Sun Mar 12 14:08:31 GMT 2000 v.1.5 +# Sun Mar 12 17:21:51 GMT 2000 v.2.0 splice method working, is_updownstream made +# Sun Mar 12 18:11:22 GMT 2000 v.2.04 wrapped all in package Chain.pm +# Sun Mar 12 18:49:23 GMT 2000 v.2.08 added elements() +# Sun Mar 12 21:18:04 GMT 2000 v.2.1 done array2dchain, working on *insert* +# Sun Mar 12 23:04:40 GMT 2000 v.2.16 done *insert*, up_element, create_elems +# Sun Mar 12 23:45:32 GMT 2000 v.2.17 debugged and checked +# Mon Mar 13 00:44:51 GMT 2000 v.2.2 added mutate() +# Mon Mar 13 02:00:32 GMT 2000 v 2.21 added invert_dchain() +# Mon Mar 13 03:01:21 GMT 2000 v 2.22 created updown_chain2string +# Mon Mar 13 03:45:50 GMT 2000 v.2.24 added subchain_length() +# Mon Mar 13 17:25:04 GMT 2000 v.2.26 added element_at_pos and pos_of_element +# Wed Mar 15 23:05:06 GMT 2000 v.2.27 use strict enforced +# Thu Mar 16 19:05:34 GMT 2000 v.2.3 changed dchain->chain everywhere +# Fri Mar 17 01:48:36 GMT 2000 v.2.33 mutate_element renamed, created new +# methods: set_value, get_value... +# Fri Mar 17 05:03:15 GMT 2000 v.2.4 set_value_at_pos, get_value_at_pos +# get_label_at_pos... +# Fri Mar 17 15:51:07 GMT 2000 v.2.41 renamed pos_of_element -> get_pos_of_label +# Fri Mar 17 18:10:36 GMT 2000 v.2.44 recoded subchain_length and pos_of_label +# Fri Mar 17 20:12:27 GMT 2000 v.2.5 NAMING change: index->label everywhere +# Mon Mar 20 18:33:10 GMT 2000 v.2.52 label_exists(), start(), end() +# Mon Mar 20 23:10:28 GMT 2000 v.2.6 labels() created +# Wed Mar 22 18:35:17 GMT 2000 v.2.61 chain2string() rewritten +# Tue Dec 12 14:47:58 GMT 2000 v 2.66 optimized with /use integer/ +# Tue Dec 12 16:28:45 GMT 2000 v 2.7 rewritten comments to methods in pod style + +# +$VERSION=2.7; +# +# TODO_list: +# **** cleanup code +# **** performance concerns +# *??* create hash2dchain ???? (with hashkeys used for label) +# **????** how about using array of arrays instead than hash of arrays?? +# +# further strict complaints: +# in verbose $string assignment around line 721 ??? + +# TERMINOLOGY update, naming convention: +# "chain" the datastructure +# "element" the individual units that compose a chain +# "label" the unique name of a single element +# "position" the position of an element into the chain according to a +# particular coordinate system (e.g. counting from the start) +# "value" what is stored in a single element + +use Carp qw(croak cluck carp); # as of 2.3 +use strict; # as of 2.27 +use integer; # WARNING: this is to increase performance + # a little bit of attention has to be given if float need to + # be stored as elements of the array + # the use of this "integer" affects all operations but not + # assignments. So float CAN be assigned as elements of the chain + # BUT, if you assign $z=-1.8;, $z will be equal to -1 because + # "-" counts as a unary operation! + +=head2 _updown_chain2string + + Title : chain2string + Usage : $string = Bio::LiveSeq::Chain::chain2string("down",$chain,6,9) + Function: reads the contents of the chain, outputting a string + Returns : a string + Examples: + : down_chain2string($chain) -> all the chain from begin to end + : down_chain2string($chain,6) -> from 6 to the end + : down_chain2string($chain,6,4) -> from 6, going on 4 elements + : down_chain2string($chain,6,"",10) -> from 6 to 10 + : up_chain2string($chain,10,"",6) -> from 10 to 6 upstream + Defaults: start=first element; if len undef, goes to last + if last undef, goes to end + if last defined, it overrides len (undefining it) + Error code: -1 + Args : "up"||"down" as first argument to specify the reading direction + reference (to the chain) + [first] [len] [last] optional integer arguments to specify how + much and from (and to) where to read + +=cut + +# methods rewritten 2.61 +sub up_chain2string { + _updown_chain2string("up",@_); +} +sub down_chain2string { + _updown_chain2string("down",@_); +} + +sub _updown_chain2string { + my ($direction,$chain,$first,$len,$last)=@_; + unless($chain) { cluck "no chain input"; return (-1); } + my $begin=$chain->{'begin'}; # the label of the BEGIN element + my $end=$chain->{'end'}; # the label of the END element + my $flow; + + if ($direction eq "up") { + $flow=2; # used to determine the direction of chain navigation + unless ($first) { $first=$end; } # if undef or 0, use $end + } else { # defaults to "down" + $flow=1; # used to determine the direction of chain navigation + unless ($first) { $first=$begin; } # if undef or 0, use $begin + } + + unless($chain->{$first}) { + cluck "label for first not defined"; return (-1); } + if ($last) { # if last is defined, it gets priority and len is not used + unless($chain->{$last}) { + cluck "label for last not defined"; return (-1); } + if ($len) { + warn "Warning chain2string: argument LAST:$last overriding LEN:$len!"; + undef $len; + } + } else { + if ($direction eq "up") { + $last=$begin; # if last not defined, go 'till begin (or upto len elements) + } else { + $last=$end; # if last not defined, go 'till end (or upto len elements) + } + } + + my ($string,@array); + my $label=$first; my $i=1; + my $afterlast=$chain->{$last}[$flow]; # if last=end, afterlast is undef + unless (defined $afterlast) { $afterlast=0; } # keep strict happy + + # proceed for len elements or until last, whichever comes first + # if $len undef goes till end + while (($label) && ($label != $afterlast) && ($i <= ($len || $i + 1))) { + @array=@{$chain->{$label}}; + $string .= $array[0]; + $label = $array[$flow]; + $i++; + } + return ($string); # if chain is interrupted $string won't be complete +} + +=head2 _updown_labels + + Title : labels + Usage : @labels = Bio::LiveSeq::Chain::_updown_labels("down",$chain,4,16) + Function: returns all the labels in a chain or those between two + specified ones (termed "first" and "last") + Returns : a reference to an array containing the labels + Args : "up"||"down" as first argument to specify the reading direction + reference (to the chain) + [first] [last] (integer for the starting and eneding labels) + +=cut + + +# arguments: CHAIN_REF [FIRSTLABEL] [LASTLABEL] +# returns: reference to array containing the labels +sub down_labels { + my ($chain,$first,$last)=@_; + _updown_labels("down",$chain,$first,$last); +} +sub up_labels { + my ($chain,$first,$last)=@_; + _updown_labels("up",$chain,$first,$last); +} +# arguments: "up"||"down" CHAIN_REF [FIRSTLABEL] [LASTLABEL] +# returns: reference to array containing the labels +sub _updown_labels { + my ($direction,$chain,$first,$last)=@_; + unless($chain) { cluck "no chain input"; return (0); } + my $begin=$chain->{'begin'}; # the label of the BEGIN element + my $end=$chain->{'end'}; # the label of the END element + my $flow; + if ($direction eq "up") { $flow=2; + unless ($first) { $first=$end; } + unless ($last) { $last=$begin; } + } else { $flow=1; + unless ($last) { $last=$end; } + unless ($first) { $first=$begin; } + } + unless($chain->{$first}) { warn "not existing label $first"; return (0); } + unless($chain->{$last}) { warn "not existing label $last"; return (0); } + + my $label=$first; my @labels; + my $afterlast=$chain->{$last}[$flow]; # if last=end, afterlast is undef + unless (defined $afterlast) { $afterlast=0; } # keep strict happy + + while (($label)&&($label != $afterlast)) { + push(@labels,$label); + $label=$chain->{$label}[$flow]; + } + return (\@labels); # if chain is interrupted @labels won't be complete +} + + +=head2 start + + Title : start + Usage : $start = Bio::LiveSeq::Chain::start() + Returns : the label marking the start of the chain + Errorcode: -1 + Args : none + +=cut + +sub start { + my $chain=$_[0]; + unless($chain) { cluck "no chain input"; return (-1); } + return ($chain->{'begin'}); +} + +=head2 end + + Title : end + Usage : $end = Bio::LiveSeq::Chain::end() + Returns : the label marking the end of the chain + Errorcode: -1 + Args : none + +=cut + +sub end { + my $chain=$_[0]; + unless($chain) { cluck "no chain input"; return (-1); } + return ($chain->{'end'}); +} + +=head2 label_exists + + Title : label_exists + Usage : $check = Bio::LiveSeq::Chain::label_exists($chain,$label) + Function: It checks if a label is defined, i.e. if an element is there or + is not there anymore + Returns : 1 if the label exists, 0 if it is not there, -1 error + Errorcode: -1 + Args : reference to the chain, integer + +=cut + +sub label_exists { + my ($chain,$label)=@_; + unless($chain) { cluck "no chain input"; return (-1); } + if ($label && $chain->{$label}) { return (1); } else { return (0) }; +} + + +=head2 down_get_pos_of_label + + Title : down_get_pos_of_label + Usage : $position = Bio::LiveSeq::Chain::down_get_pos_of_label($chain,$label,$first) + Function: returns the position of $label counting from $first, i.e. taking + $first as 1 of coordinate system. If $first is not specified it will + count from the start of the chain. + Returns : + Errorcode: 0 + Args : reference to the chain, integer (the label of interest) + optional: integer (a different label that will be taken as the + first one, i.e. the one to count from) + Note: It counts "downstream". To proceed backward use up_get_pos_of_label + +=cut + +sub down_get_pos_of_label { + #down_chain2string($_[0],$_[2],undef,$_[1],"counting"); + my ($chain,$label,$first)=@_; + _updown_count("down",$chain,$first,$label); +} +sub up_get_pos_of_label { + #up_chain2string($_[0],$_[2],undef,$_[1],"counting"); + my ($chain,$label,$first)=@_; + _updown_count("up",$chain,$first,$label); +} + +=head2 down_subchain_length + + Title : down_subchain_length + Usage : $length = Bio::LiveSeq::Chain::down_subchain_length($chain,$first,$last) + Function: returns the length of the chain between the labels "first" and "last", included + Returns : integer + Errorcode: 0 + Args : reference to the chain, integer, integer + Note: It counts "downstream". To proceed backward use up_subchain_length + +=cut + +# arguments: chain_ref [first] [last] +# returns the length of the chain between first and last (included) +sub down_subchain_length { + #down_chain2string($_[0],$_[1],undef,$_[2],"counting"); + my ($chain,$first,$last)=@_; + _updown_count("down",$chain,$first,$last); +} +sub up_subchain_length { + #up_chain2string($_[0],$_[1],undef,$_[2],"counting"); + my ($chain,$first,$last)=@_; + _updown_count("up",$chain,$first,$last); +} + +# arguments: DIRECTION CHAIN_REF FIRSTLABEL LASTLABEL +# errorcode 0 +sub _updown_count { + my ($direction,$chain,$first,$last)=@_; + unless($chain) { cluck "no chain input"; return (0); } + my $begin=$chain->{'begin'}; # the label of the BEGIN element + my $end=$chain->{'end'}; # the label of the END element + my $flow; + if ($direction eq "up") { $flow=2; + unless ($first) { $first=$end; } + unless ($last) { $last=$begin; } + } else { $flow=1; + unless ($last) { $last=$end; } + unless ($first) { $first=$begin; } + } + unless($chain->{$first}) { warn "not existing label $first"; return (0); } + unless($chain->{$last}) { warn "not existing label $last"; return (0); } + + my $label=$first; my $count; + my $afterlast=$chain->{$last}[$flow]; # if last=end, afterlast is undef + unless (defined $afterlast) { $afterlast=0; } # keep strict happy + + while (($label)&&($label != $afterlast)) { + $count++; + $label=$chain->{$label}[$flow]; + } + return ($count); # if chain is interrupted, $i will be up to the breaking point +} + +=head2 invert_chain + + Title : invert_chain + Usage : $errorcode=Bio::LiveSeq::Chain::invert_chain($chain) + Function: completely inverts the order of the chain elements; begin is swapped with end and all links updated (PREV&NEXT fields swapped) + Returns : 1 if all OK, 0 if errors + Errorcode: 0 + Args : reference to the chain + +=cut + +sub invert_chain { + my $chain=$_[0]; + unless($chain) { cluck "no chain input"; return (0); } + my $begin=$chain->{'begin'}; # the name of the first element + my $end=$chain->{'end'}; # the name of the last element + my ($label,@array); + $label=$begin; # starts from the beginning + while ($label) { # proceed with linked elements, swapping PREV and NEXT + @array=@{$chain->{$label}}; + ($chain->{$label}[1],$chain->{$label}[2])=($array[2],$array[1]); # swap + $label = $array[1]; # go to the next one + } + # now swap begin and end fields + ($chain->{'begin'},$chain->{'end'})=($end,$begin); + return (1); # that's it +} + +# warning that method has changed name +#sub mutate_element { + #croak "Warning: old method name. Please update code to 'set_value_at_label'\n"; + # &set_value_at_label; +#} + +=head2 down_get_value_at_pos + + Title : down_get_value_at_pos + Usage : $value = Bio::LiveSeq::Chain::down_get_value_at_pos($chain,$position,$first) + Function: used to access the value of the chain at a particular position instead than directly with a label pointer. It will count the position from the start of the chain or from the label $first, if $first is specified + Returns : whatever is stored in the element of the chain + Errorcode: 0 + Args : reference to the chain, integer, [integer] + Note: It works "downstream". To proceed backward use up_get_value_at_pos + +=cut + +#sub get_value_at_pos { + #croak "Please use instead: down_get_value_at_pos"; + ##&down_get_value_at_pos; +#} +sub down_get_value_at_pos { + my ($chain,$position,$first)=@_; + my $label=down_get_label_at_pos($chain,$position,$first); + # check place of change + if (($label eq -1)||($label eq 0)) { # complain if label doesn't exist + warn "not existing element $label"; return (0); } + return _get_value($chain,$label); +} +sub up_get_value_at_pos { + my ($chain,$position,$first)=@_; + my $label=up_get_label_at_pos($chain,$position,$first); + # check place of change + if (($label eq -1)||($label eq 0)) { # complain if label doesn't exist + warn "not existing element $label"; return (0); } + return _get_value($chain,$label); +} + +=head2 down_set_value_at_pos + + Title : down_set_value_at_pos + Usage : $errorcode = Bio::LiveSeq::Chain::down_set_value_at_pos($chain,$newvalue,$position,$first) + Function: used to store a new value inside an element of the chain at a particular position instead than directly with a label pointer. It will count the position from the start of the chain or from the label $first, if $first is specified + Returns : 1 + Errorcode: 0 + Args : reference to the chain, newvalue, integer, [integer] + (newvalue can be: integer, string, object reference, hash ref) + Note: It works "downstream". To proceed backward use up_set_value_at_pos + Note2: If the $newvalue is undef, it will delete the contents of the + element but it won't remove the element from the chain. + +=cut + +#sub set_value_at_pos { + #croak "Please use instead: down_set_value_at_pos"; + ##&down_set_value_at_pos; +#} +sub down_set_value_at_pos { + my ($chain,$value,$position,$first)=@_; + my $label=down_get_label_at_pos($chain,$position,$first); + # check place of change + if (($label eq -1)||($label eq 0)) { # complain if label doesn't exist + warn "not existing element $label"; return (0); } + _set_value($chain,$label,$value); + return (1); +} +sub up_set_value_at_pos { + my ($chain,$value,$position,$first)=@_; + my $label=up_get_label_at_pos($chain,$position,$first); + # check place of change + if (($label eq -1)||($label eq 0)) { # complain if label doesn't exist + warn "not existing element $label"; return (0); } + _set_value($chain,$label,$value); + return (1); +} + + +=head2 down_set_value_at_label + + Title : down_set_value_at_label + Usage : $errorcode = Bio::LiveSeq::Chain::down_set_value_at_label($chain,$newvalue,$label) + Function: used to store a new value inside an element of the chain defined by its label. + Returns : 1 + Errorcode: 0 + Args : reference to the chain, newvalue, integer + (newvalue can be: integer, string, object reference, hash ref) + Note: It works "downstream". To proceed backward use up_set_value_at_label + Note2: If the $newvalue is undef, it will delete the contents of the + element but it won't remove the element from the chain. + +=cut + +sub set_value_at_label { + my ($chain,$value,$label)=@_; + unless($chain) { cluck "no chain input"; return (0); } + + # check place of change + unless($chain->{$label}) { # complain if label doesn't exist + warn "not existing element $label"; return (0); } + _set_value($chain,$label,$value); + return (1); +} + +=head2 down_get_value_at_label + + Title : down_get_value_at_label + Usage : $value = Bio::LiveSeq::Chain::down_get_value_at_label($chain,$label) + Function: used to access the value of the chain from one element defined by its label. + Returns : whatever is stored in the element of the chain + Errorcode: 0 + Args : reference to the chain, integer + Note: It works "downstream". To proceed backward use up_get_value_at_label + +=cut + +sub get_value_at_label { + my $chain=$_[0]; + unless($chain) { cluck "no chain input"; return (0); } + my $label = $_[1]; # the name of the element + + # check place of change + unless($chain->{$label}) { # complain if label doesn't exist + warn "not existing label $label"; return (0); } + return _get_value($chain,$label); +} + +# arguments: CHAIN_REF LABEL VALUE +sub _set_value { + my ($chain,$label,$value)=@_; + $chain->{$label}[0]=$value; +} +# arguments: CHAIN_REF LABEL +sub _get_value { + my ($chain,$label)=@_; + return $chain->{$label}[0]; +} + +=head2 down_get_label_at_pos + + Title : down_get_label_at_pos + Usage : $label = Bio::LiveSeq::Chain::down_get_label_at_pos($chain,$position,$first) + Function: used to retrieve the label of an an element of the chain at a particular position. It will count the position from the start of the chain or from the label $first, if $first is specified + Returns : integer + Errorcode: 0 + Args : reference to the chain, integer, [integer] + Note: It works "downstream". To proceed backward use up_get_label_at_pos + +=cut + +# arguments: CHAIN_REF POSITION [FIRST] +# returns: LABEL of element found counting from FIRST +sub down_get_label_at_pos { + _updown_get_label_at_pos("down",@_); +} +sub up_get_label_at_pos { + _updown_get_label_at_pos("up",@_); +} + +# arguments: [DIRECTION] CHAIN_REF POSITION [FIRST] +# Default DIRECTION="down" +# if FIRST is undefined, FIRST=START (if DIRECTION=down) or FIRST=END (up) + +sub _updown_get_label_at_pos { + my ($direction,$chain,$position,$first)=@_; + unless($chain) { cluck "no chain input"; return (0); } + my $begin=$chain->{'begin'}; # the label of the BEGIN element + my $end=$chain->{'end'}; # the label of the END element + my $flow; + if ($direction eq "up") { $flow=2; unless ($first) { $first=$end; } + } else { $flow=1; unless ($first) { $first=$begin; } } + unless($chain->{$first}) { warn "not existing label $first"; return (0); } + + my $label=$first; + my $i=1; + while ($i < $position) { + $label=$chain->{$label}[$flow]; + $i++; + unless ($label) { return (0); } # chain ended before position reached + } + return ($label); +} + +# for english_concerned, latin_unconcerned people +sub preinsert_string { &praeinsert_string } +sub preinsert_array { &praeinsert_array } + +# praeinsert_string CHAIN_REF STRING [POSITION] +# the chars of STRING are passed to praeinsert_array +# the chars are inserted in CHAIN, before POSITION +# if POSITION is undef, default is to prepend the string to the beginning +# i.e. POSITION is START of CHAIN +sub praeinsert_string { + my @string=split(//,$_[1]); + praeinsert_array($_[0],\@string,$_[2]); +} + +# postinsert_string CHAIN_REF STRING [POSITION] +# the chars of STRING are passed to postinsert_array +# the chars are inserted in CHAIN, after POSITION +# if POSITION is undef, default is to append the string to the end +# i.e. POSITION is END of CHAIN +sub postinsert_string { + my @string=split(//,$_[1]); + postinsert_array($_[0],\@string,$_[2]); +} + +# praeinsert_array CHAIN_REF ARRAY_REF [POSITION] +# the elements of ARRAY are inserted in CHAIN, before POSITION +# if POSITION is undef, default is to prepend the elements to the beginning +# i.e. POSITION is START of CHAIN +sub praeinsert_array { + _praepostinsert_array($_[0],"prae",$_[1],$_[2]); +} + +# postinsert_array CHAIN_REF ARRAY_REF [POSITION] +# the elements of ARRAY are inserted in CHAIN, after POSITION +# if POSITION is undef, default is to append the elements to the end +# i.e. POSITION is END of CHAIN +sub postinsert_array { + _praepostinsert_array($_[0],"post",$_[1],$_[2]); +} + + +=head2 _praepostinsert_array + + Title : _praepostinsert_array + Usage : ($insbegin,$insend) = Bio::LiveSeq::Chain::_praepostinsert_array($chainref,"post",$arrayref,$position) + Function: the elements of the array specified by $arrayref are inserted (creating a new subchain) in the chain specified by $chainref, before or after (depending on the "prae"||"post" keyword passed as second argument) the specified position. + Returns : two labels: the first and the last of the inserted subchain + Defaults: if no position is specified, the new chain will be inserted after + (post) the first element of the chain + Errorcode: 0 + Args : chainref, "prae"||"post", arrayref, integer (position) + +=cut + +# returns: 0 if errors, otherwise returns references of begin and end of +# the insertion +sub _praepostinsert_array { + my $chain=$_[0]; + unless($chain) { cluck "no chain input"; return (0); } + my $praepost=$_[1] || "post"; # defaults to post + my ($prae,$post); + my $position=$_[3]; + my $begin=$chain->{'begin'}; # the name of the first element of the chain + my $end=$chain->{'end'}; # the name of the the last element of the chain + # check if prae or post insertion and prepare accordingly + if ($praepost eq "prae") { + $prae=1; + unless (($position eq 0)||($position)) { $position=$begin; } # if undef, use $begin + } else { + $post=1; + unless (($position eq 0)||($position)) { $position=$end; } # if undef, use $end + } + # check place of insertion + unless($chain->{$position}) { # complain if position doesn't exist + warn ("Warning _praepostinsert_array: not existing element $position"); + return (0); + } + + # check if there are elements to insert + my $elements=$_[2]; # reference to the array containing the new elements + my $elements_count=scalar(@{$elements}); + unless ($elements_count) { + warn ("Warning _praepostinsert_array: no elements input"); return (0); } + + # create new chainelements with offset=firstfree(chain) + my ($insertbegin,$insertend)=_create_chain_elements($chain,$elements); + + # DEBUGGING + #print "Executing ${praepost}insertion of $elements_count elements ('@{$elements}') at position: $position\n"; + + # attach the new chain to the old chain + # 4 cases: prae@begin, prae@middle, post@middle, post@end + # NOTE: in case of double joinings always join wisely so not to + # delete the PREV/NEXT attribute before it is needed + my $noerror=1; + if ($prae) { + if ($position==$begin) { # 1st case: prae@begin + $noerror=_join_chain_elements($chain,$insertend,$begin); + $chain->{'begin'}=$insertbegin; + } else { # 2nd case: prae@middle + $noerror=_join_chain_elements($chain,up_element($chain,$position),$insertbegin); + $noerror=_join_chain_elements($chain,$insertend,$position); + } + } elsif ($post) { + if ($position==$end) { # 4th case: post@end + $noerror=_join_chain_elements($chain,$end,$insertbegin); + $chain->{'end'}=$insertend; + } else { # 3rd case: post@middle # note the order of joins (important) + $noerror=_join_chain_elements($chain,$insertend,down_element($chain,$position)); + $noerror=_join_chain_elements($chain,$position,$insertbegin); + } + } else { # this should never happen + die "_praepostinsert_array: Something went very wrong"; + } + + # check for errors and return begin,end of insertion + if ($noerror) { + return ($insertbegin,$insertend); + } else { # something went wrong with the joinings + warn "Warning _praepostinsert_array: Joining of insertion failed"; + return (0); + } +} + +# create new chain elements with offset=firstfree +# arguments: CHAIN_REF ARRAY_REF +# returns: pointers to BEGIN and END of new chained elements created +# returns 0 if error(s) encountered +sub _create_chain_elements { + my $chain=$_[0]; + unless($chain) { + warn ("Warning _create_chain_elements: no chain input"); return (0); } + my $arrayref=$_[1]; + my $array_count=scalar(@{$arrayref}); + unless ($array_count) { + warn ("Warning _create_chain_elements: no elements input"); return (0); } + my $begin=$chain->{'firstfree'}; + my $i=$begin-1; + my $element; + foreach $element (@{$arrayref}) { + $i++; + $chain->{$i}=[$element,$i+1,$i-1]; + } + my $end=$i; + $chain->{'firstfree'}=$i+1; # what a new added element should be called + $chain->{'size'} += $end-$begin+1; # increase size of chain + # leave sticky edges (to be joined by whoever called this subroutine) + $chain->{$begin}[2]=undef; + $chain->{$end}[1]=undef; + return ($begin,$end); # return pointers to first and last of the newelements +} + +# argument: CHAIN_REF ELEMENT +# returns: name of DOWN/NEXT element (the downstream one) +# returns -1 if error encountered (e.g. chain or elements undefined) +# returns 0 if there's no DOWN element +sub down_element { + _updown_element("down",@_); +} +# argument: CHAIN_REF ELEMENT +# returns: name of UP/PREV element (the upstream one) +# returns -1 if error encountered (e.g. chain or elements undefined) +# returns 0 if there's no UP element +sub up_element { + _updown_element("up",@_); +} + +# used by both is_up_element and down_element +sub _updown_element { + my $direction=$_[0] || "down"; # defaults to downstream + my $flow; + if ($direction eq "up") { + $flow=2; # used to determine the direction of chain navigation + } else { + $flow=1; # used to determine the direction of chain navigation + } + my $chain=$_[1]; + unless($chain) { + warn ("Warning ${direction}_element: no chain input"); return (-1); } + my $me = $_[2]; # the name of the element + my $it = $chain->{$me}[$flow]; # the prev||next one, upstream||downstream + if ($it) { + return ($it); # return the name of prev||next element + } else { + return (0); # there is no prev||next element ($it is undef) + } +} + +# used by both is_downstream and is_upstream +sub _is_updownstream { + my $direction=$_[0] || "down"; # defaults to downstream + my $flow; + if ($direction eq "up") { + $flow=2; # used to determine the direction of chain navigation + } else { + $flow=1; # used to determine the direction of chain navigation + } + my $chain=$_[1]; + unless($chain) { + warn ("Warning is_${direction}stream: no chain input"); return (-1); } + my $first=$_[2]; # the name of the first element + my $second=$_[3]; # the name of the first element + if ($first==$second) { + warn ("Warning is_${direction}stream: first==second!!"); return (0); } + unless($chain->{$first}) { + warn ("Warning is_${direction}stream: first element not defined"); return (-1); } + unless($chain->{$second}) { + warn ("Warning is_${direction}stream: second element not defined"); return (-1); } + my ($label,@array); + $label=$first; + my $found=0; + while (($label)&&(!($found))) { # searches till the end or till found + if ($label==$second) { + $found=1; + } + @array=@{$chain->{$label}}; + $label = $array[$flow]; # go to the prev||next one, upstream||downstream + } + return $found; +} + +=head2 is_downstream + + Title : is_downstream + Usage : Bio::LiveSeq::Chain::is_downstream($chainref,$firstlabel,$secondlabel) + Function: checks if SECONDlabel follows FIRSTlabel + It runs downstream the elements of the chain from FIRST searching + for SECOND. + Returns : 1 if SECOND is found /after/ FIRST; 0 otherwise (i.e. if it + reaches the end of the chain without having found it) + Errorcode -1 + Args : two labels (integer) + +=cut + +sub is_downstream { + _is_updownstream("down",@_); +} + +=head2 is_upstream + + Title : is_upstream + Usage : Bio::LiveSeq::Chain::is_upstream($chainref,$firstlabel,$secondlabel) + Function: checks if SECONDlabel follows FIRSTlabel + It runs upstream the elements of the chain from FIRST searching + for SECOND. + Returns : 1 if SECOND is found /after/ FIRST; 0 otherwise (i.e. if it + reaches the end of the chain without having found it) + Errorcode -1 + Args : two labels (integer) + +=cut + +sub is_upstream { + _is_updownstream("up",@_); +} + +=head2 check_chain + + Title : check_chain + Usage : @errorcodes = Bio::LiveSeq::Chain::check_chain() + Function: a wraparound to a series of check for consistency of the chain + It will check for boundaries, size, backlinking and forwardlinking + Returns : array of 4 warn codes, each can be 1 (all ok) or 0 (something wrong) + Errorcode: 0 + Args : none + Note : this is slow and through. It is not really needed. It is mostly + a code-developer tool. + +=cut + +sub check_chain { + my $chain=$_[0]; + unless($chain) { + warn ("Warning check_chain: no chain input"); return (-1); } + my ($warnbound,$warnsize,$warnbacklink,$warnforlink); + $warnbound=&_boundcheck; # passes on the arguments of the subroutine + $warnsize=&_sizecheck; + $warnbacklink=&_downlinkcheck; + $warnforlink=&_uplinkcheck; + return ($warnbound,$warnsize,$warnbacklink,$warnforlink); +} + +# consistency check for forwardlinks walking upstream +# argument: a chain reference +# returns: 1 all OK 0 problems +sub _uplinkcheck { + _updownlinkcheck("up",@_); +} + +# consistency check for backlinks walking downstream +# argument: a chain reference +# returns: 1 all OK 0 problems +sub _downlinkcheck { + _updownlinkcheck("down",@_); +} + +# consistency check for links, common to _uplinkcheck and _downlinkcheck +# argument: "up"||"down", check_ref +# returns: 1 all OK 0 problems +sub _updownlinkcheck { + my $direction=$_[0] || "down"; # defaults to downstream + my ($flow,$wolf); + my $chain=$_[1]; + unless($chain) { + warn ("Warning _${direction}linkcheck: no chain input"); return (0); } + my $begin=$chain->{'begin'}; # the name of the first element + my $end=$chain->{'end'}; # the name of the last element + my ($label,@array,$me,$it,$itpoints); + if ($direction eq "up") { + $flow=2; # used to determine the direction of chain navigation + $wolf=1; + $label=$end; # start from end + } else { + $flow=1; # used to determine the direction of chain navigation + $wolf=2; + $label=$begin; # start from beginning + } + my $warncode=1; + + while ($label) { # proceed with linked elements, checking neighbours + $me=$label; + @array=@{$chain->{$label}}; + $label = $array[$flow]; # go to the next one + $it=$label; + if ($it) { # no sense in checking if next one not defined (END element) + @array=@{$chain->{$label}}; + $itpoints=$array[$wolf]; + unless ($me==$itpoints) { + warn "Warning: ${direction}LinkCheck: LINK wrong in $it, that doesn't point back to me ($me). It points to $itpoints\n"; + $warncode=0; + } + } + } + return $warncode; +} + +# consistency check for size of chain +# argument: a chain reference +# returns: 1 all OK 0 wrong size +sub _sizecheck { + my $chain=$_[0]; + unless($chain) { + warn ("Warning _sizecheck: no chain input"); return (0); } + my $begin=$chain->{'begin'}; # the name of the first element + my $warncode=1; + my ($label,@array); + my $size=$chain->{'size'}; + my $count=0; + $label=$begin; + while ($label) { # proceed with linked elements, counting + @array=@{$chain->{$label}}; + $label = $array[1]; # go to the next one + $count++; + } + if ($size != $count) { + warn "Size check reports error: assumed size: $size, real size: $count "; + $warncode=0; + } + return $warncode; +} + + +# consistency check for begin and end (boundaries) +# argument: a chain reference +# returns: 1 all OK 0 problems +sub _boundcheck { + my $chain=$_[0]; + unless($chain) { + warn ("Warning _boundcheck: no chain input"); return (0); } + my $begin=$chain->{'begin'}; # the name of the first element + my $end=$chain->{'end'}; # the name of the (supposedly) last element + my $warncode=1; + + # check SYNC of beginning + if (($begin)&&($chain->{$begin})) { # if the BEGIN points to existing element + if ($chain->{$begin}[2]) { # if BEGIN element has PREV not undef + warn "Warning: BEGIN element has PREV field defined \n"; + warn "\tWDEBUG begin: $begin\t"; + warn "\tWDEBUG begin's PREV: $chain->{$begin}[2] \n"; + $warncode=0; + } + } else { + warn "Warning: BEGIN key of chain does not point to existing element!\n"; + warn "\tWDEBUG begin: $begin\n"; + $warncode=0; + } + # check SYNC of end + if (($end)&&($chain->{$end})) { # if the END points to an existing element + if ($chain->{$end}[1]) { # if END element has NEXT not undef + warn "Warning: END element has NEXT field defined \n"; + warn "\tWDEBUG end: $end\t"; + warn "\tWDEBUG end's NEXT: $chain->{$end}[1] \n"; + $warncode=0; + } + } else { + warn "Warning: END key of chain does not point to existing element!\n"; + warn "\tWDEBUG end: $end\n"; + $warncode=0; + } + return $warncode; +} + +# arguments: chain_ref +# returns: the size of the chain (the number of elements) +# return code -1: unexistant chain, errors... +sub chain_length { + my $chain=$_[0]; + unless($chain) { + warn ("Warning chain_length: no chain input"); return (-1); } + my $size=$chain->{'size'}; + if ($size) { + return ($size); + } else { + return (-1); + } +} + +# arguments: chain ref, first element name, second element name +# returns: 1 or 0 (1 ok, 0 errors) +sub _join_chain_elements { + my $chain=$_[0]; + unless($chain) { + warn ("Warning _join_chain_elements: no chain input"); return (0); } + my $leftelem=$_[1]; + my $rightelem=$_[2]; + unless(($leftelem)&&($rightelem)) { + warn ("Warning _join_chain_elements: element arguments??"); return (0); } + if (($chain->{$leftelem})&&($chain->{$rightelem})) { # if the elements exist + $chain->{$leftelem}[1]=$rightelem; + $chain->{$rightelem}[2]=$leftelem; + return 1; + } else { + warn ("Warning _join_chain_elements: elements not defined"); + return 0; + } +} + +=head2 splice_chain + + Title : splice_chain + Usage : @errorcodes = Bio::LiveSeq::Chain::splice_chain($chainref,$first,$length,$last) + Function: removes the elements designated by FIRST and LENGTH from a chain. + The chain shrinks accordingly. If LENGTH is omitted, removes + everything from FIRST onward. + If END is specified, LENGTH is ignored and instead the removal + occurs from FIRST to LAST. + Returns : the elements removed as a string + Errorcode: -1 + Args : chainref, integer, integer, integer + +=cut + +sub splice_chain { + my $chain=$_[0]; + unless($chain) { + warn ("Warning splice_chain: no chain input"); return (-1); } + my $begin=$chain->{'begin'}; # the name of the first element + my $end=$chain->{'end'}; # the name of the (supposedly) last element + my $first=$_[1]; + unless (($first eq 0)||($first)) { $first=$begin; } # if undef, use $begin + my $len=$_[2]; + my $last=$_[3]; + my (@array, $string); + my ($beforecut,$aftercut); + + unless($chain->{$first}) { + warn ("Warning splice_chain: first element not defined"); return (-1); } + if ($last) { # if last is defined, it gets priority and len is not used + unless($chain->{$last}) { + warn ("Warning splice_chain: last element not defined"); return (-1); } + if ($len) { + warn ("Warning splice_chain: argument LAST:$last overriding LEN:$len!"); + undef $len; + } + } else { + $last=$end; # if last not defined, go 'till end (or to len, whichever 1st) + } + + $beforecut=$chain->{$first}[2]; # what's the element before 1st deleted? + # if it is undef then it means we are splicing since the beginning + + my $i=1; + my $label=$first; + my $afterlast=$chain->{$last}[1]; # if $last=$end $afterlast should be undef + unless (defined $afterlast) { $afterlast=0; } # keep strict happy + + # proceed for len elements or until the end, whichever comes first + # if len undef goes till last + while (($label)&&($label != $afterlast) && ($i <= ($len || $i + 1))) { + @array=@{$chain->{$label}}; + $string .= $array[0]; + $aftercut = $array[1]; # what's the element next last deleted? + # also used as savevar to change label posdeletion + delete $chain->{$label}; # this can be deleted now + $label=$aftercut; # label is updated using the savevar + $i++; + } + + # Now fix the chain (sticky edges, fields) + # 4 cases: cut in the middle, cut from beginning, cut till end, cut all + #print "\n\tstickyDEBUG beforecut: $beforecut "; # DEBUG + #print "\taftercut: $aftercut \n"; # DEBUG + if ($beforecut) { + if ($aftercut) { # 1st case, middle cut + _join_chain_elements($chain,$beforecut,$aftercut); + } else { # 3rd case, end cut + $chain->{'end'}=$beforecut; # update the END field + $chain->{$beforecut}[1]=undef; # since we cut till the end + } + } else { + if ($aftercut) { # 2nd case, begin cut + $chain->{'begin'}=$aftercut; # update the BEGIN field + $chain->{$aftercut}[2]=undef; # since we cut from beginning + } else { # 4th case, all has been cut + $chain->{'begin'}=undef; + $chain->{'end'}=undef; + } + } + $chain->{'size'}=($chain->{'size'}) - $i + 1; # update the SIZE field + + return $string; +} + + +# arguments: CHAIN_REF POSITION [FIRST] +# returns: element counting POSITION from FIRST or from START if FIRST undef +# i.e. returns the element at POSITION counting from FIRST +#sub element_at_pos { + #croak "Warning: old method name. Please update code to 'down_get_label_at_position'\n"; + ##&down_element_at_pos; +#} +#sub up_element_at_pos { + ## old wraparound + ##my @array=up_chain2string($_[0],$_[2],$_[1],undef,"elements"); + ##return $array[-1]; + #croak "old method name. Update code to: up_get_label_at_position"; + ##&up_get_label_at_pos; +#} +#sub down_element_at_pos { + ## old wraparound + ##my @array=down_chain2string($_[0],$_[2],$_[1],undef,"elements"); + ##return $array[-1]; + #croak "old method name. Update code to: down_get_label_at_position"; + ##&down_get_label_at_pos; +#} + +# arguments: CHAIN_REF ELEMENT [FIRST] +# returns: the position of ELEMENT counting from FIRST or from START +#i if FIRST is undef +# i.e. returns the Number of elements between FIRST and ELEMENT +# i.e. returns the position of element taking FIRST as 1 of coordinate system +#sub pos_of_element { + #croak ("Warning: old and ambiguous method name. Please update code to 'down_get_pos_of_label'\n"); + ##&down_pos_of_element; +#} +#sub up_pos_of_element { + #croak ("Warning: old method name. Please update code to 'up_get_pos_of_label'\n"); + ##up_chain2string($_[0],$_[2],undef,$_[1],"counting"); +#} +#sub down_pos_of_element { + #croak ("Warning: old method name. Please update code to 'down_get_pos_of_label'\n"); + ##down_chain2string($_[0],$_[2],undef,$_[1],"counting"); +#} + +# wraparounds to calculate length of subchain from first to last +# arguments: chain_ref [first] [last] +#sub subchain_length { + #croak "Warning: old method name. Please update code to 'down_subchain_length'\n"; + ##&down_subchain_length; +#} + +# wraparounds to have elements output +# same arguments as chain2string +# returns label|name of every element +#sub elements { + #croak ("Warning: method no more supported. Please update code to 'down_labels' (NB: now it returns ref to array and doesn't allow length argument!)\n"); + ##&down_elements; +#} +#sub up_elements { + #croak ("Warning: method no more supported. Please update code to 'up_labels' (NB: now it returns ref to array and doesn't allow length argument!)\n"); + ##up_chain2string($_[0],$_[1],$_[2],$_[3],"elements"); +#} +#sub down_elements { + #croak ("Warning: method no more supported. Please update code to 'down_labels' (NB: now it returns ref to array and doesn't allow length argument!)\n"); + ##down_chain2string($_[0],$_[1],$_[2],$_[3],"elements"); +#} + +# wraparounds to have verbose output +# same arguments as chain2string +# returns the chain in a very verbose way +sub chain2string_verbose { + carp "Warning: method no more supported.\n"; + &old_down_chain2string_verbose; +} +sub up_chain2string_verbose { + carp "Warning: method no more supported.\n"; + old_up_chain2string($_[0],$_[1],$_[2],$_[3],"verbose"); +} +sub down_chain2string_verbose { + carp "Warning: method no more supported.\n"; + old_down_chain2string($_[0],$_[1],$_[2],$_[3],"verbose"); +} + +#sub chain2string { + #croak ("Warning: old method name. Please update code to 'down_chain2string'\n"); + ##&down_chain2string; +#} +sub old_up_chain2string { + old_updown_chain2string("up",@_); +} +sub old_down_chain2string { + old_updown_chain2string("down",@_); +} + +# common to up_chain2string and down_chain2string +# arguments: "up"||"down" chain_ref [first] [len] [last] [option] +# [option] can be any of "verbose", "counting", "elements" +# error: return -1 +# defaults: start = first element; if len undef, goes to last +# if last undef, goes to end +# if last def it overrides len (that gets undef) +# returns: a string +# example usage: down_chain2string($chain) -> all the chain from begin to end +# example usage: down_chain2string($chain,6) -> from 6 to the end +# example usage: down_chain2string($chain,6,4) -> from 6, going on 4 elements +# example usage: down_chain2string($chain,6,"",10) -> from 6 to 10 +# example usage: up_chain2string($chain,10,"",6) -> from 10 to 6 upstream +sub old_updown_chain2string { + my ($direction,$chain,$first,$len,$last,$option)=@_; + unless($chain) { + warn ("Warning chain2string: no chain input"); return (-1); } + my $begin=$chain->{'begin'}; # the name of the BEGIN element + my $end=$chain->{'end'}; # the name of the END element + my $flow; + if ($direction eq "up") { + $flow=2; # used to determine the direction of chain navigation + unless ($first) { $first=$end; } # if undef or 0, use $end + } else { # defaults to "down" + $flow=1; # used to determine the direction of chain navigation + unless ($first) { $first=$begin; } # if undef or 0, use $begin + } + + unless($chain->{$first}) { + warn ("Warning chain2string: first element not defined"); return (-1); } + if ($last) { # if last is defined, it gets priority and len is not used + unless($chain->{$last}) { + warn ("Warning chain2string: last element not defined"); return (-1); } + if ($len) { + warn ("Warning chain2string: argument LAST:$last overriding LEN:$len!"); + undef $len; + } + } else { + if ($direction eq "up") { + $last=$begin; # if last not defined, go 'till begin (or upto len elements) + } else { + $last=$end; # if last not defined, go 'till end (or upto len elements) + } + } + my (@array, $string, $count); + # call for verbosity (by way of chain2string_verbose); + my $verbose=0; my $elements=0; my @elements; my $counting=0; + if ($option) { # keep strict happy + if ($option eq "verbose") { $verbose=1; } + if ($option eq "elements") { $elements=1; } + if ($option eq "counting") { $counting=1; } + } + + if ($verbose) { + print "BEGIN=$begin"; print " END=$end"; print " SIZE=$chain->{'size'}"; + print " FIRSTFREE=$chain->{'firstfree'} \n"; + } + + my $i=1; + my $label=$first; + my $afterlast=$chain->{$last}[$flow]; # if $last=$end $afterlast should be undef + unless (defined $afterlast) { $afterlast=0; } # keep strict happy + + # proceed for len elements or until last, whichever comes first + # if $len undef goes till end + while (($label)&&($label != $afterlast) && ($i <= ($len || $i + 1))) { + @array=@{$chain->{$label}}; + if ($verbose) { + $string .= "$array[2]_${label}_$array[1]=$array[0] "; + $count++; + } elsif ($elements) { + push (@elements,$label); # returning element names/references/identifiers + } elsif ($counting) { + $count++; + } else { + $string .= $array[0]; # returning element content + } + $label = $array[$flow]; # go to next||prev i.e. downstream||upstream + $i++; + } +#DEBUG#print "len: $len, first: $first, last: $last, afterlast=$afterlast \n"; + if ($verbose) { print "TOTALprinted: $count\n"; } + if ($counting) { + return $count; + } elsif ($elements) { + return @elements; + } else { + return $string; + } +} + +# sub string2schain +# --------> deleted, no more supported <-------- +# creation of a single linked list/chain from a string +# basically could be recreated by taking the *2chain methods and +# omitting to set the 3rd field (label 2) containing the back links + + +# creation of a double linked list/chain from a string +# returns reference to a hash containing the chain +# arguments: STRING [OFFSET] +# defaults: OFFSET defaults to 1 if undef +# the chain will contain as elements the single characters in the string +sub string2chain { + my @string=split(//,$_[0]); + array2chain(\@string,$_[1]); +} + +=head2 array2chain + + Title : array2chain + Usage : $chainref = Bio::LiveSeq::Chain::array2chain($arrayref,$offset) + Function: creation of a double linked chain from an array + Returns : reference to a hash containing the chain + Defaults: OFFSET defaults to 1 if undef + Error code: 0 + Args : a reference to an array containing the elements to be chainlinked + an optional integer > 0 (this will be the starting count for + the chain labels instead than having them begin from "1") + +=cut + +sub array2chain { + my $arrayref=$_[0]; + my $array_count=scalar(@{$arrayref}); + unless ($array_count) { + warn ("Warning array2chain: no elements input"); return (0); } + my $begin=$_[1]; + if (defined $begin) { + if ($begin < 1) { + warn "Warning array2chain: Zero or Negative offsets not allowed"; return (0); } + } else { + $begin=1; + } + my ($element,%hash); + $hash{'begin'}=$begin; + my $i=$begin-1; + foreach $element (@{$arrayref}) { + $i++; + # hash with keys begin..end pointing to the arrays + $hash{$i}=[$element,$i+1,$i-1]; + } + my $end=$i; + $hash{'end'}=$end; + $hash{firstfree}=$i+1; # what a new added element should be called + $hash{size}=$end-$begin+1; # how many elements in the chain + + # eliminate pointers to unexisting elements + $hash{$begin}[2]=undef; + $hash{$end}[1]=undef; + + return (\%hash); +} + +1; # returns 1 diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/ChainI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/ChainI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,288 @@ +# $Id: ChainI.pm,v 1.9 2002/10/22 07:38:34 lapp Exp $ +# +# bioperl module for Bio::LiveSeq::ChainI +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::ChainI - Double linked chain data structure + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +This class generates and manipulates generic double linked list, chain, +that can be used to manage biological sequences. + +The advantages over strings or plain arrays is the ease of tracking +changes (mutations) in the elements (sequence). The other side of the +coin is that these structures need consideraly more memory, but that +is cheap and constantly inceasing resource in computers. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::ChainI; +$VERSION=1.9; +# Version history: +# Thu Mar 16 01:38:25 GMT 2000 v.1.4 1st wraparound of methods complete +# tested with chainseq_asobj_test.pl +# Thu Mar 16 19:03:56 GMT 2000 v.1.5 decided to stick with same names as Chain +# Fri Mar 17 05:08:15 GMT 2000 v.1.6 in sync with Chain 2.4 +# Fri Mar 17 15:47:23 GMT 2000 v.1.7 added pos_of_label, enforced down_ or up_ +# Fri Mar 17 20:12:27 GMT 2000 v.1.8 NAMING change: index->label everywhere +# Mon Mar 20 19:20:17 GMT 2000 v.1.81 minor addings, Chain 2.52 +# Mon Mar 20 23:15:09 GMT 2000 v.1.82 in sync with Chain 2.6 +# Tue Mar 21 01:36:29 GMT 2000 v.1.83 added default strand if new(DNA) +# Tue Mar 21 14:19:17 GMT 2000 v.1.9 moved new(DNA) to DNA, added chain2string() + +use Carp qw(croak); +use strict; # this will be moved before when strict enforced in Chain.pm + +use Bio::LiveSeq::Chain 2.6; # package where all the subroutines are defined + + +=head2 new + + Title : new + Usage : $chain = Bio::LiveSeq::ChainI->new(-string => "thequickbrownfoxjumpsoverthelazydog", + -offset => 3 ); + OR $chain = Bio::LiveSeq::ChainI->new(-array => \@array, + -offset => 3 ); + Function: generates a new Bio::LiveSeq:ChainI + Returns : a new Chain + Args : string + OR arrayreference + AND optional offset to create element labels +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my $obj; + + if ($args{-string}) { + $obj = $thing->string2chain($args{-string}, $args{-offset}); + } elsif ($args{-array}) { + $obj = $thing->array2chain($args{-array}, $args{-offset}); + } else { + croak "$class not initialized properly"; + } + + $obj = bless $obj, $class; + return $obj; +} + +# added as of 1.9 +sub string2chain { + shift @_; # so that it doesn't pass the object reference + return Bio::LiveSeq::Chain::string2chain(@_); +} +sub array2chain { + shift @_; # so that it doesn't pass the object reference + return Bio::LiveSeq::Chain::array2chain(@_); +} +# +sub chain2string { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub down_chain2string { + return Bio::LiveSeq::Chain::down_chain2string(@_); +} +sub up_chain2string { + return Bio::LiveSeq::Chain::up_chain2string(@_); +} +sub chain2string_verbose { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub down_chain2string_verbose { + return Bio::LiveSeq::Chain::down_chain2string_verbose(@_); +} +sub up_chain2string_verbose { + return Bio::LiveSeq::Chain::up_chain2string_verbose(@_); +} +sub invert_chain { + return Bio::LiveSeq::Chain::invert_chain(@_); +} +sub mutate_element { + croak "Old method name, please update code to: set_value_at_label"; +} + +# new as of version 2.33 of Chain.pm +sub down_labels { + return Bio::LiveSeq::Chain::down_labels(@_); +} +sub up_labels { + return Bio::LiveSeq::Chain::up_labels(@_); +} + +sub start { + return Bio::LiveSeq::Chain::start(@_); +} +sub end { + return Bio::LiveSeq::Chain::end(@_); +} +sub label_exists { + return Bio::LiveSeq::Chain::label_exists(@_); +} + +sub get_value_at_pos { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub down_get_value_at_pos { + return Bio::LiveSeq::Chain::down_get_value_at_pos(@_); +} +sub up_get_value_at_pos { + return Bio::LiveSeq::Chain::up_get_value_at_pos(@_); +} +sub set_value_at_pos { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub down_set_value_at_pos { + return Bio::LiveSeq::Chain::down_set_value_at_pos(@_); +} +sub up_set_value_at_pos { + return Bio::LiveSeq::Chain::up_set_value_at_pos(@_); +} +sub get_value_at_label { + return Bio::LiveSeq::Chain::get_value_at_label(@_); +} +sub set_value_at_label { + return Bio::LiveSeq::Chain::set_value_at_label(@_); +} +sub get_label_at_pos { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub up_get_label_at_pos { + return Bio::LiveSeq::Chain::up_get_label_at_pos(@_); +} +sub down_get_label_at_pos { + return Bio::LiveSeq::Chain::down_get_label_at_pos(@_); +} +sub get_pos_of_label { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub up_get_pos_of_label { + return Bio::LiveSeq::Chain::up_get_pos_of_label(@_); +} +sub down_get_pos_of_label { + return Bio::LiveSeq::Chain::down_get_pos_of_label(@_); +} +# + +sub preinsert_string { + return Bio::LiveSeq::Chain::praeinsert_string(@_); +} +sub preinsert_array { + return Bio::LiveSeq::Chain::praeinsert_array(@_); +} +sub praeinsert_string { + return Bio::LiveSeq::Chain::praeinsert_string(@_); +} +sub postinsert_string { + return Bio::LiveSeq::Chain::postinsert_string(@_); +} +sub praeinsert_array { + return Bio::LiveSeq::Chain::praeinsert_array(@_); +} +sub postinsert_array { + return Bio::LiveSeq::Chain::postinsert_array(@_); +} +sub down_element{ + return Bio::LiveSeq::Chain::down_element(@_); +} +sub up_element { + return Bio::LiveSeq::Chain::up_element(@_); +} +sub is_downstream { + return Bio::LiveSeq::Chain::is_downstream(@_); +} +sub is_upstream { + return Bio::LiveSeq::Chain::is_upstream(@_); +} +sub check_chain { + return Bio::LiveSeq::Chain::check_chain(@_); +} +sub chain_length { + return Bio::LiveSeq::Chain::chain_length(@_); +} +sub splice_chain { + return Bio::LiveSeq::Chain::splice_chain(@_); +} +sub pos_of_element { + croak "ambiguous and old method name. use: down_pos_of_label"; +} +sub up_pos_of_element { + croak "old method name. use: down_pos_of_label"; + return Bio::LiveSeq::Chain::up_pos_of_element(@_); +} +sub down_pos_of_element { + croak "old method name. use: up_pos_of_label"; + return Bio::LiveSeq::Chain::down_pos_of_element(@_); +} +sub subchain_length { + croak "ambiguous method call. Explicit down_ or up_"; +} +sub down_subchain_length { + return Bio::LiveSeq::Chain::down_subchain_length(@_); +} +sub up_subchain_length { + return Bio::LiveSeq::Chain::up_subchain_length(@_); +} + +# these have to be deleted and changed names to conform to terminology +sub elements { + return Bio::LiveSeq::Chain::down_elements(@_); +} +sub up_elements { + return Bio::LiveSeq::Chain::up_elements(@_); +} +sub down_elements { + return Bio::LiveSeq::Chain::down_elements(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/DNA.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/DNA.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,117 @@ +# $Id: DNA.pm,v 1.9 2001/10/22 08:22:51 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::DNA +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::DNA - DNA object for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +This holds the DNA sequence (or the RNA in the case of cDNA entries) +and is accessed by exons, genes, transcripts... objects + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::DNA; +$VERSION=1.4; + +# Version history: +# Mon Mar 20 19:21:22 GMT 2000 v.1.0 begun +# Tue Mar 21 14:20:30 GMT 2000 v.1.1 new() is now here, not inherited +# Wed Mar 22 19:43:20 GMT 2000 v.1.2 length override +# Thu Jun 22 20:02:39 BST 2000 v 1.3 valid() from SeqI now moved here, as override +# Wed Mar 28 17:01:59 BST 2001 v 1.4 changed croaks into throw + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it +@ISA=qw(Bio::LiveSeq::SeqI); + +=head2 new + + Title : new + Usage : $dna = Bio::LiveSeq::DNA->new(-seq => "atcgaccaatggacctca", + -offset => 3 ); + + Function: generates a new Bio::LiveSeq::DNA + Returns : reference to a new object of class DNA + Errorcode -1 + Args : a string + AND an optional offset to create nucleotide labels (default is 1, i.e. + starting the count of labels from "1") -> do not bother using it -> + it could be used by alternative loaders !EMBL format + NOTE : strand of DNA is set to 1 by default + +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my (%empty,$obj); + + if ($args{-seq}) { + $obj = $thing->string2chain($args{-seq},$args{-offset}); # inherited from ChainI + $obj = bless $obj, $class; + } else { + $obj=\%empty; + $obj = bless $obj, $class; + $obj->throw("$class not initialized properly"); + } + + $obj->{'alphabet'}='dna'; # set alphabet default + $obj->{'strand'}=1; # set strand default = 1 + $obj->{'seq'}=$obj; # set seq field to itself + + return $obj; +} + +# START method +# it has to be redefined here because default from SeqI accesses field "start" +sub start { + my $self = shift; + return $self->{'begin'}; # the chain's start is called begin +} + +# it is overridden to provide faster output +sub length { + my $self=shift; + return $self->chain_length(); +} + +# it is overridden to provide MUCH faster output +sub valid { + my $self=shift(@_); + return $self->label_exists(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Exon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Exon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +# $Id: Exon.pm,v 1.8 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Exon +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Exon - Range abstract class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +Class for EXON objects. They consist of a beginlabel, an endlabel (both +referring to a LiveSeq DNA object) and a strand. +The strand could be 1 (forward strand, default), -1 (reverse strand). + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Exon; +$VERSION=1.1; + +# Version history: +# Mon Mar 20 22:26:13 GMT 2000 v 1.0 begun +# Wed Apr 12 12:42:56 BST 2000 v 1.1 get_Transcript added + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::Range 1.2; # uses Range, inherits from it +@ISA=qw(Bio::LiveSeq::Range); + +=head2 new + + Title : new + Usage : $exon1 = Bio::LiveSeq::Exon-> new(-seq => $objref, + -start => $startlabel, + -end => $endlabel, -strand => 1); + + Function: generates a new Bio::LiveSeq::Exon + Returns : reference to a new object of class Exon + Errorcode -1 + Args : two labels and an integer + +=cut + +=head2 get_Transcript + + Title : get_Transcript + Usage : $transcript = $obj->get_Transcript() + Function: retrieves the reference to the object of class Transcript (if any) + attached to a LiveSeq object + Returns : object reference + Args : none + Note : only Exons that compose a Transcript (i.e. those created out of + a CDS Entry-Feature) will have an attached Transcript + +=cut + +sub get_Transcript { + my $self=shift; + return ($self->{'transcript'}); # this is set on all Exons a Transcript is made of when Transcript->new is called +} + +# this checks if the attached Transcript has a Gene object attached +sub gene { + my ($self,$value) = @_; + if (defined $value) { + $self->{'gene'} = $value; + } + unless (exists $self->{'gene'}) { + unless (exists $self->get_Transcript->{'gene'}) { + return (0); + } else { + return ($self->get_Transcript->{'gene'}); + } + } else { + return $self->{'gene'}; + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Gene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Gene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,450 @@ +# $Id: Gene.pm,v 1.13 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Gene +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Gene - Range abstract class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +This is used as storage for all object references concerning a particular gene. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Gene; +$VERSION=2.3; + +# Version history: +# Tue Apr 4 15:22:41 BST 2000 v 1.0 begun +# Tue Apr 4 16:19:27 BST 2000 v 1.1 completed new() +# Tue Apr 4 19:15:41 BST 2000 v 1.2 tested. Working. Simple methods created. +# Wed Apr 5 01:26:58 BST 2000 v 1.21 multiplicity, featuresnum() created +# Wed Apr 5 02:16:01 BST 2000 v 1.22 added upbound and downbound attributes +# Fri Apr 7 02:03:39 BST 2000 v 1.3 added printfeaturesnum and _set_Gene_in_all +# Fri Apr 7 02:53:05 BST 2000 v 2.0 added maxtranscript object creation +# Wed Jun 28 18:38:45 BST 2000 v 2.1 chaged croak to carp + return(-1) +# Wed Jul 12 15:19:26 BST 2000 v 2.11 ->strand call protected by if(ref(transcript)) +# Tue Jan 30 14:15:42 EST 2001 v 2.2 delete_Obj added, to flush circular references +# Wed Apr 4 13:29:59 BST 2001 v 2.3 LiveSeq-wide warn and verbose added + +use strict; +use Carp; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::Prim_Transcript 1.0; # needed to create maxtranscript obj + +#use Bio::LiveSeq::SeqI 2.11; # uses SeqI, inherits from it +#@ISA=qw(Bio::LiveSeq::SeqI); + +=head2 new + + Title : new + Usage : $gene = Bio::LiveSeq::Gene->new(-name => "name", + -features => $hashref + -upbound => $min + -downbound => $max); + + Function: generates a new Bio::LiveSeq::Gene + Returns : reference to a new object of class Gene + Errorcode -1 + Args : one string and one hashreference containing all features defined + for the Gene and the references to the LiveSeq objects for those + features. + Two labels for defining boundaries of the gene. Usually the + boundaries will reflect max span of transcript, exon... features, + while the DNA sequence will be created with some flanking regions + (e.g. with the EMBL_SRS::gene2liveseq routine). + If these two labels are not given, they will default to the start + and end of the DNA object. + Note : the format of the hash has to be like + DNA => reference to LiveSeq::DNA object + Transcripts => reference to array of transcripts objrefs + Transclations => reference to array of transcripts objrefs + Exons => .... + Introns => .... + Prim_Transcripts => .... + Repeat_Units => .... + Repeat_Regions => .... + Only DNA and Transcripts are mandatory + +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($i,$self,%gene); + + my ($name,$inputfeatures,$upbound,$downbound)=($args{-name},$args{-features},$args{-upbound},$args{-downbound}); + + unless (ref($inputfeatures) eq "HASH") { + carp "$class not initialised because features hash not given"; + return (-1); + } + + my %features=%{$inputfeatures}; # this is done to make our own hash&ref, not + my $features=\%features; # the ones input'ed, that could get destroyed + + my $DNA=$features->{'DNA'}; + unless (ref($DNA) eq "Bio::LiveSeq::DNA") { + carp "$class not initialised because DNA feature not found"; + return (-1); + } + + my ($minstart,$maxend);# used to calculate Gene->maxtranscript from Exon, Transcript (CDS) and Prim_Transcript features + + my ($start,$end); + + my @Transcripts=@{$features->{'Transcripts'}}; + + my $strand; + unless (ref($Transcripts[0]) eq "Bio::LiveSeq::Transcript") { + $self->warn("$class not initialised: first Transcript not a LiveSeq object"); + return (-1); + } else { + $strand=$Transcripts[0]->strand; # for maxtranscript consistency check + } + + for $i (@Transcripts) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Transcript")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Transcripts feature"); + return (-1); + } else { + } + unless($minstart) { $minstart=$start; } # initialize + unless($maxend) { $maxend=$end; } # initialize + if ($i->strand != $strand) { + $self->warn("$class not initialised because exon-CDS-prim_transcript features do not share the same strand!"); + return (-1); + } + if (($strand == 1)&&($start < $minstart)||($strand == -1)&&($start > $minstart)) { $minstart=$start; } + if (($strand == 1)&&($end > $maxend)||($strand == -1)&&($end < $maxend)) { $maxend=$end; } + } + my @Translations; my @Introns; my @Repeat_Units; my @Repeat_Regions; + my @Prim_Transcripts; my @Exons; + if (defined($features->{'Translations'})) { + @Translations=@{$features->{'Translations'}}; } + if (defined($features->{'Exons'})) { + @Exons=@{$features->{'Exons'}}; } + if (defined($features->{'Introns'})) { + @Introns=@{$features->{'Introns'}}; } + if (defined($features->{'Repeat_Units'})) { + @Repeat_Units=@{$features->{'Repeat_Units'}}; } + if (defined($features->{'Repeat_Regions'})) { + @Repeat_Regions=@{$features->{'Repeat_Regions'}}; } + if (defined($features->{'Prim_Transcripts'})) { + @Prim_Transcripts=@{$features->{'Prim_Transcripts'}}; } + + + if (@Translations) { + for $i (@Translations) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Translation")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Translations feature"); + return (-1); + } + } + } + if (@Exons) { + for $i (@Exons) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Exon")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Exons feature"); + return (-1); + } + if ($i->strand != $strand) { + $self->warn("$class not initialised because exon-CDS-prim_transcript features do not share the same strand!"); + return (-1); + } + if (($strand == 1)&&($start < $minstart)||($strand == -1)&&($start > $minstart)) { $minstart=$start; } + if (($strand == 1)&&($end > $maxend)||($strand == -1)&&($end < $maxend)) { $maxend=$end; } + } + } + if (@Introns) { + for $i (@Introns) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Intron")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Introns feature"); + return (-1); + } + } + } + if (@Repeat_Units) { + for $i (@Repeat_Units) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Repeat_Unit")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Repeat_Units feature"); + return (-1); + } + } + } + if (@Repeat_Regions) { + for $i (@Repeat_Regions) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Repeat_Region")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Repeat_Regions feature"); + return (-1); + } + } + } + if (@Prim_Transcripts) { + for $i (@Prim_Transcripts) { + ($start,$end)=($i->start,$i->end); + unless ((ref($i) eq "Bio::LiveSeq::Prim_Transcript")&&($DNA->valid($start))&&($DNA->valid($end))) { + $self->warn("$class not initialised because of problems in Prim_Transcripts feature"); + return (-1); + } + if ($i->strand != $strand) { + $self->warn("$class not initialised because exon-CDS-prim_transcript features do not share the same strand!"); + return (-1); + } + if (($strand == 1)&&($start < $minstart)||($strand == -1)&&($start > $minstart)) { $minstart=$start; } + if (($strand == 1)&&($end > $maxend)||($strand == -1)&&($end < $maxend)) { $maxend=$end; } + } + } + + # create an array containing all obj references for all Gene Features + # useful for _set_Gene_in_all + my @allfeatures; + push (@allfeatures,$DNA,@Transcripts,@Translations,@Exons,@Introns,@Repeat_Units,@Repeat_Regions,@Prim_Transcripts); + + # create hash holding numbers for Gene Features + my %multiplicity; + my $key; my @array; + foreach $key (keys(%features)) { + unless ($key eq "DNA") { + @array=@{$features{$key}}; + $multiplicity{$key}=scalar(@array); + } + } + $multiplicity{DNA}=1; + + # create maxtranscript object. It's a Prim_Transcript with start as the + # minimum start and end as the maximum end. + # usually these start and end will be the same as the gene->upbound and + # gene->downbound, but maybe there could be cases when this will be false + # (e.g. with repeat_units just before the prim_transcript or first exon, + # but still labelled with the same /gene qualifier) + + my $maxtranscript=Bio::LiveSeq::Prim_Transcript->new(-start => $minstart, -end => $maxend, -strand => $strand, -seq => $DNA); + + + # check the upbound downbound parameters + if (defined($upbound)) { + unless ($DNA->valid($upbound)) { + $self->warn("$class not initialised because upbound label not valid"); + return (-1); + } + } else { + $upbound=$DNA->start; + } + if (defined($downbound)) { + unless ($DNA->valid($downbound)) { + $self->warn("$class not initialised because downbound label not valid"); + return (-1); + } + } else { + $downbound=$DNA->end; + } + + %gene = (name => $name, features => $features,multiplicity => \%multiplicity, + upbound => $upbound, downbound => $downbound, allfeatures => \@allfeatures, maxtranscript => $maxtranscript); + $self = \%gene; + $self = bless $self, $class; + _set_Gene_in_all($self,@allfeatures); + return $self; +} + +# this sets the "gene" objref in all the objects "belonging" to the Gene, +# i.e. in all its Features. +sub _set_Gene_in_all { + my $Gene=shift; + my $self; + foreach $self (@_) { + $self->gene($Gene); + } +} + +# you can get or set the name of the gene +sub name { + my ($self,$value) = @_; + if (defined $value) { + $self->{'name'} = $value; + } + unless (exists $self->{'name'}) { + return "unknown"; + } else { + return $self->{'name'}; + } +} + +# gets the features hash +sub features { + my $self=shift; + return ($self->{'features'}); +} +sub get_DNA { + my $self=shift; + return ($self->{'features'}->{'DNA'}); +} +sub get_Transcripts { + my $self=shift; + return ($self->{'features'}->{'Transcripts'}); +} +sub get_Translations { + my $self=shift; + return ($self->{'features'}->{'Translations'}); +} +sub get_Prim_Transcripts { + my $self=shift; + return ($self->{'features'}->{'Prim_Transcripts'}); +} +sub get_Repeat_Units { + my $self=shift; + return ($self->{'features'}->{'Repeat_Units'}); +} +sub get_Repeat_Regions { + my $self=shift; + return ($self->{'features'}->{'Repeat_Regions'}); +} +sub get_Introns { + my $self=shift; + return ($self->{'features'}->{'Introns'}); +} +sub get_Exons { + my $self=shift; + return ($self->{'features'}->{'Exons'}); +} +sub featuresnum { + my $self=shift; + return ($self->{'multiplicity'}); +} +sub upbound { + my $self=shift; + return ($self->{'upbound'}); +} +sub downbound { + my $self=shift; + return ($self->{'downbound'}); +} +sub printfeaturesnum { + my $self=shift; + my ($key,$value); + my %hash=%{$self->featuresnum}; + foreach $key (keys(%hash)) { + $value=$hash{$key}; + print "\t$key => $value\n"; + } +} +sub maxtranscript { + my $self=shift; + return ($self->{'maxtranscript'}); +} + +sub delete_Obj { + my $self = shift; + my @values= values %{$self}; + my @keys= keys %{$self}; + + foreach my $key ( @keys ) { + delete $self->{$key}; + } + foreach my $value ( @values ) { + if (index(ref($value),"LiveSeq") != -1) { # object case + eval { + # delete $self->{$value}; + $value->delete_Obj; + }; + } elsif (index(ref($value),"ARRAY") != -1) { # array case + my @array=@{$value}; + my $element; + foreach $element (@array) { + eval { + $element->delete_Obj; + }; + } + } elsif (index(ref($value),"HASH") != -1) { # object case + my %hash=%{$value}; + my $element; + foreach $element (%hash) { + eval { + $element->delete_Obj; + }; + } + } + } + return(1); +} + + +=head2 verbose + + Title : verbose + Usage : $self->verbose(0) + Function: Sets verbose level for how ->warn behaves + -1 = silent: no warning + 0 = reduced: minimal warnings + 1 = default: all warnings + 2 = extended: all warnings + stack trace dump + 3 = paranoid: a warning becomes a throw and the program dies + + Note: a quick way to set all LiveSeq objects at the same verbosity + level is to change the DNA level object, since they all look to + that one if their verbosity_level attribute is not set. + But the method offers fine tuning possibility by changing the + verbose level of each object in a different way. + + So for example, after $loader= and $gene= have been retrieved + by a program, the command $gene->verbose(0); would + set the default verbosity level to 0 for all objects. + + Returns : the current verbosity level + Args : -1,0,1,2 or 3 + +=cut + + +sub verbose { + my $self=shift; + my $value = shift; + return $self->{'features'}->{'DNA'}->verbose($value); +} + +sub warn { + my $self=shift; + my $value = shift; + return $self->{'features'}->{'DNA'}->warn($value); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/IO/BioPerl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/IO/BioPerl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,406 @@ +# $Id: BioPerl.pm,v 1.15 2001/12/14 16:40:15 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::IO::BioPerl +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::IO::BioPerl - Loader for LiveSeq from EMBL entries with BioPerl + +=head1 SYNOPSIS + + my $db="EMBL"; + my $file="../data/M20132"; + my $id="HSANDREC"; + + my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -file=>"$file"); + or + my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -id=>"$id"); + + my @translationobjects=$loader->entry2liveseq(); + + my $genename="AR"; + my $gene=$loader->gene2liveseq(-gene_name => "$genename", + -getswissprotinfo => 0); + + NOTE1: The only -db now supported is EMBL. Hence it defaults to EMBL. + NOTE2: -file requires a filename (and path if necessary) containing an + EMBL entry + -id will use Bio::DB::EMBL.pm to fetch the sequence from the web, + (bioperl wraparound to [w]getz from SRS) + NOTE3: To retrieve the swissprot (if possible) attached to the embl entry + (to get protein domains at dna level), only Bio::DB::EMBL.pm + is supported under BioPerl. Refer to Bio::LiveSeq::IO::SRS + otherwise. + NOTE4: NOTE3 is not implemented yet for bioperl, working on it + + +=head1 DESCRIPTION + +This package uses BioPerl (SeqIO) to fetch a sequence database entry, +analyse it and create LiveSeq objects out of it. + +A filename (or an ID that will fetch entry through the web) has to be passed +to this package which will return references to all translation objects +created from the EMBL entry. References to Transcription, DNA and Exon +objects can all be retrieved departing from these. + +Alternatively, a specific "gene" name can be specified, together with +the embl-acc ID. This will create a LiveSeq::Gene object with all +relevant gene features attached/created. + +ATTENTION: if web fetching is requested, the package HTTP::Request needs +to be installed. + + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::IO::BioPerl; +$VERSION=2.42; + +# Version history: +# Thu Apr 6 00:25:46 BST 2000 v 1.0 begun +# Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division +# Thu Apr 6 03:40:36 BST 2000 v 2.0 working +# Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names +# Tue Jul 4 14:07:52 BST 2000 v 2.11 note&number added in val_qual_names +# Fri Sep 15 15:41:02 BST 2000 v 2.22 novelaasequence2gene now works without SRS +# Mon Jan 29 17:40:06 EST 2001 v 2.3 made it work with the new split_location of BioPerl 0.7 +# Tue Apr 10 17:00:18 BST 2001 v 2.41 started work on support of DB::EMBL.pm +# Tue Apr 10 17:22:26 BST 2001 v 2.42 -id should work now + +# TODO->TOCHECK +# each_secondary_access not working +# why array from each_tag_value($qual) ? When will there be more than one +# element in such array? +# what is the annotation object? ($seqobj->annotation) +# unsatisfied by both BioPerl binomial and SRS "org" to retrieve Organism info + +use strict; +use Carp qw(cluck croak carp); +use vars qw($VERSION @ISA); +use Bio::SeqIO; # for -file entry loading + +# Note, the following requires HTTP::Request. If the modules are not installed +# uncomment the following and use only -filename and don't request swissprotinfo +use Bio::DB::EMBL; # for -id entry loading + +use Bio::LiveSeq::IO::Loader 2.0; + +@ISA=qw(Bio::LiveSeq::IO::Loader); + +# This package can in the future host other databases loading subroutines. +# e.g. ensembl2hash + +=head2 load + + Title : load + Usage : my $filename="../data/M20132"; + $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -file=>"$filename"); + or + $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -id=>"HSANDREC"); + + Function: loads an entry with BioPerl from a database into a hash + Returns : reference to a new object of class IO::BioPerl holding an entry + Errorcode 0 + Args : an filename containing an EMBL entry OR an ID or ACCESSION code + +=cut + +sub load { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($obj,%loader); + + my ($db,$filename,$id)=($args{-db},$args{-file},$args{-id}); + + if (defined($db)) { + unless ($db eq "EMBL") { + carp "Note: only EMBL now supported!"; + return(0); + } + } else { + $db="EMBL"; + } + + if (defined($id) && defined($filename)) { + carp "You can either specify a -id or a -filename!"; + return(0); + } + + unless (defined($id) || defined($filename)) { + carp "You must specify either a -id or a -filename!"; + return(0); + } + + my $hashref; + if ($db eq "EMBL") { + my $test_transl=0; # change to 0 to avoid comparison of translation + + # these can be changed for future needs + my @embl_valid_feature_names=qw(CDS CDS_span exon prim_transcript intron repeat_unit repeat_region mRNA); + my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table); + + # dunno yet how to implement test_transl again.... + # probably on a one-on-one basis with each translation? + if ($test_transl) { + push (@embl_valid_qual_names,"translation"); # needed for test_transl + } + + my $seqobj; # bioperl sequence object, to be passed to embl2hash + + if (defined($filename)) { + my $stream = Bio::SeqIO->new('-file' => $filename, '-format' => 'EMBL'); + $seqobj = $stream->next_seq(); + } else { # i.e. if -id + my $embl = new Bio::DB::EMBL; + $seqobj = $embl->get_Seq_by_id($id); # EMBL ID or ACC + } + + $hashref=&embl2hash($seqobj,\@embl_valid_feature_names,\@embl_valid_qual_names); + } + unless ($hashref) { return (0); } + + %loader = (db => $db, filename => $filename, id => $id, hash => $hashref); + $obj = \%loader; + $obj = bless $obj, $class; + return $obj; +} + +=head2 embl2hash + + Title : embl2hash + Function: retrieves with BioPerl an EMBL entry, parses it and creates + a hash that contains all the information. + Returns : a reference to a hash + Errorcode: 0 + Args : a BioPerl Sequence Object (from file or web fetching) + two array references to skip features and qualifiers (for + performance) + Example: @valid_features=qw(CDS exon prim_transcript mRNA); + @valid_qualifiers=qw(gene codon_start db_xref product rpt_family); + $hashref=&embl2hash($seqobj,\@valid_features,\@valid_qualifiers); + +=cut + +# arguments: Bioperl $seqobj +# to skip features and qualifiers (for performance), two array +# references must be passed (this can change into string arguments to +# be passed....) +# returns: a reference to a hash containing the important features requested +sub embl2hash { + my $seqobj=$_[0]; + my %valid_features; my %valid_names; + if ($_[1]) { + %valid_features = map {$_, 1} @{$_[1]}; # to skip features + } + if ($_[2]) { + %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers + } + + my $annobj = $seqobj->annotation(); # what's this? + + my $entry_Sequence = lc($seqobj->seq()); # SRS returns lowercase + + my $entry_ID = $seqobj->display_id; + my $entry_AccNumber = $seqobj->accession; # or maybe accession_number ? + my $secondary_acc; # to fetch the other acc numbers + foreach $secondary_acc ($seqobj->get_secondary_accessions) { # not working! + $entry_AccNumber .= " $secondary_acc"; + } + my $entry_Molecule = $seqobj->molecule; # this alone returns molec+division + my $entry_Division = $seqobj->division; + # fixed: now Molecule works in BioPerl, no need for next lines + #my @Molecule=split(" ",$entry_Molecule); + #my $entry_Division = pop(@Molecule); # only division + #$entry_Molecule = join(" ",@Molecule); # only molecule + my $entry_Description = $seqobj->desc; + + my $speciesobj = $seqobj->species; + my $entry_Organism = $speciesobj->binomial; + + my $entry_SeqLength = $seqobj->length; + + # put into the hash + my %entryhash; + $entryhash{ID}=$entry_ID; + $entryhash{AccNumber}=$entry_AccNumber; + $entryhash{Molecule}=$entry_Molecule; + $entryhash{Division}=$entry_Division; + $entryhash{Description}=$entry_Description; + $entryhash{Organism}=$entry_Organism; + $entryhash{Sequence}=$entry_Sequence; + $entryhash{SeqLength}=$entry_SeqLength; + + my @topfeatures=$seqobj->top_SeqFeatures(); + # create features array + my $featuresnumber= scalar(@topfeatures); + $entryhash{FeaturesNumber}=$featuresnumber; + my $feature_name; + my @feature_qual_names; my @feature_qual_value; + my ($feature_qual_name,$feature_qual_number); + my @features; + + my ($feat,$qual,$subfeat); + my @subfeat; + my $i=0; + foreach $feat (@topfeatures) { + my %feature; + $feature_name = $feat->primary_tag; + unless ($valid_features{$feature_name}) { + #print "skipping $feature_name\n"; + next; + } +# works ok with 0.6.2 +# if ($feature_name eq "CDS_span") { # case of CDS with various exons 0.6.2 +# $feature_name="CDS"; # 0.6.2 + my $featlocation=$feat->location; # 0.7 + if (($feature_name eq "CDS")&&($featlocation->isa('Bio::Location::SplitLocationI'))) { # case of CDS with various exons BioPerl 0.7 +# @subfeat=$feat->sub_SeqFeature; # 0.6.2 + @subfeat=$featlocation->sub_Location(); # 0.7 + my @transcript; + foreach $subfeat (@subfeat) { + my @range; + if ($subfeat->strand == -1) { + @range=($subfeat->end,$subfeat->start,$subfeat->strand); + } else { + @range=($subfeat->start,$subfeat->end,$subfeat->strand); + } + push (@transcript,\@range); + } + $feature{range}=\@transcript; + } else { + my @range; + ($feat->strand == -1) ? (@range = ($feat->end, $feat->start, $feat->strand) ) : + (@range = ( $feat->start,$feat->end,$feat->strand) ); +# works ok with 0.6.2 + if ($feature_name eq "CDS") { # case of single exon CDS (CDS name but not split location) + my @transcript=(\@range); + $feature{range}=\@transcript; + } else { # all other range features + $feature{range}=\@range; + } + } + $feature{location}="deprecated"; + + $feature{position}=$i; + $feature{name}=$feature_name; + + @feature_qual_names= $feat->all_tags(); + $feature_qual_number= scalar(@feature_qual_names); + + $feature{qual_number}=$feature_qual_number; + + my %feature_qualifiers; + for $qual (@feature_qual_names) { + $feature_qual_name=$qual; + unless ($valid_names{$feature_qual_name}) { + next; + } + @feature_qual_value=$feat->each_tag_value($qual); + #print "$qual => @feature_qual_value \n"; + $feature_qualifiers{$feature_qual_name}=$feature_qual_value[0]; # ? + # maybe the whole array should be entered, not just the 1st element? + # what could be the other elements? TOCHECK! + } + $feature{qualifiers}=\%feature_qualifiers; + push (@features,\%feature); # array of features + $i++; + } + $entryhash{Features}=\@features; # put this also into the hash + + my @cds; # array just of CDSs + for $i (0..$#features) { + if ($features[$i]->{'name'} eq "CDS") { + push(@cds,$features[$i]); + } + } + $entryhash{CDS}=\@cds; # put this also into the hash + return (\%entryhash); +} + +=head2 novelaasequence2gene + + Title : novelaasequence2gene + Usage : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); + : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*", + -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141"); + : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*", + -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141", + -translation_table => "2", + -gene_name => "tyr-kinase"); + + Function: creates LiveSeq objects from a novel amino acid sequence, + using codon usage information (loaded from a file) to choose + codons according to relative frequencies. + If a codon_usage information is not specified, + the default is to use Homo sapiens data (taxonomy ID 9606). + If a translation_table ID is not specified, it will default to 1 + (standard code). + Returns : reference to a Gene object containing references to LiveSeq objects + Errorcode 0 + Args : string containing an amino acid sequence + string (optional) with codon usage data (64 integer numbers) + string (optional) specifying a gene_name + integer (optional) specifying a translation_table ID + +=cut + +sub novelaasequence2gene { + my ($self, %args) = @_; + my ($gene_name,$cusg_data,$aasequence,$ttabid)=($args{-gene_name},$args{-cusg_data},$args{-aasequence},$args{-translation_table}); + + my @species_codon_usage; + unless ($aasequence) { + carp "aasequence not given"; + return (0); + } + unless ($gene_name) { + $gene_name="Novel Unknown"; + } + unless ($ttabid) { + $ttabid=1; + } + unless ($cusg_data) { + @species_codon_usage= + qw(68664 118404 126679 51100 125600 123646 75667 210903 435317 + 139009 79303 135218 128429 192616 49456 161556 211962 131222 + 162837 213626 69346 140780 182506 219428 76684 189374 173010 + 310626 82647 202329 180955 250410 180001 118798 76398 160764 + 317359 119013 262630 359627 218376 186915 130857 377006 162826 + 113684 317703 441298 287040 245435 174805 133427 134523 108740 + 225633 185619 78463 240138 174021 244236 142435 8187 5913 + 14381); # updated 21Jul2000 + } else { + @species_codon_usage=split(/ /,$cusg_data); + } + + my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name); + return ($gene); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1038 @@ +# $Id: Loader.pm,v 1.15 2001/10/22 08:22:51 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::IO::Loader +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +This package holds common methods used by BioPerl, SRS and file loaders. +It contains methods to create LiveSeq objects out of entire entries or from a +localized sequence region surrounding a particular gene. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::IO::Loader; +$VERSION=4.44; + +# Version history: +# Wed Feb 16 17:55:01 GMT 2000 0.1a was a general EMBL entry printer with SRS +# Wed Mar 29 16:53:30 BST 2000 0.2 rewrote as SRS LiveSeq Loader +# Wed Mar 29 19:11:21 BST 2000 0.2.1 used successfully by liveseq3.pl +# Fri Mar 31 02:33:43 BST 2000 v 1.0 begun wrapping into this package +# Fri Mar 31 03:58:43 BST 2000 v 1.1 finished wrapping +# Fri Mar 31 04:24:50 BST 2000 v 1.2 added test_transl +# Fri Mar 31 04:34:48 BST 2000: noticed problem with K02083, if translation is +# included as valid qualifier name -> investigate +# Fri Mar 31 16:55:07 BST 2000 v 1.21 removed chop in test_transl() +# Mon Apr 3 18:25:27 BST 2000 v 1.3 begun working at lightweight loader +# Mon Apr 3 18:42:30 BST 2000 v 1.31 started changing so that CDS is no more +# the only default feature possibly asked for +# Tue Apr 4 16:19:09 BST 2000 v 1.4 started creating hash2gene +# Tue Apr 4 16:41:56 BST 2000 v 1.42 created location2range and rewritten +# cdslocation2transcript +# Tue Apr 4 18:18:42 BST 2000 v 1.44 finished (maybe) hash2gene +# Tue Apr 4 19:14:33 BST 2000 v 1.49 temporary printgene done. All working :) +# Wed Apr 5 02:04:01 BST 2000 v 1.5 added upbound,downbound to hash2gene +# Wed Apr 5 13:06:43 BST 2000 v 2.0 started obj_oriented and inheritance +# Thu Apr 6 03:11:29 BST 2000 v 2.2 transition from $location to @range +# Thu Apr 6 04:26:04 BST 2000 v 2.3 both SRS and BP work with gene and entry! +# Fri Apr 7 01:47:51 BST 2000 v 2.4 genes() created +# Fri Apr 7 03:01:46 BST 2000 v 2.5 changed hash2gene so that if there is +# just 1 CDS in entry it will use all +# features of the entry as Gene features +# Tue Apr 18 18:14:19 BST 2000 v 3.0 printswissprot added +# Wed Apr 19 22:15:12 BST 2000 v 3.2 swisshash2liveseq created +# Thu Apr 20 00:14:09 BST 2000 v 3.4 swisshash2liveseq updated: now it correctly handles cleaved_met and conflicts/mod_res/variants recorded differences between EMBL and SWISSPROT translations sequences. Still some not-recorded conflicts are possible and in these cases the program won't create the AARange -> this could change in the future, if a better stringcomparison is introduced +# Thu Apr 20 01:14:16 BST 2000 v 3.6 changed entry2liveseq and gene2liveseq to namedargument input format; added getswissprotinfo flag/option +# Thu Apr 20 02:18:58 BST 2000 v 3.7 mRNA added as valid_feature -> it gets recorded as prim_transcript object +# Thu Apr 27 16:19:43 BST 2000 v 3.8 translation_table set added to hash2gene +# Mon May 1 22:16:18 BST 2000 v 3.9 -position option added to gene2liveseq +# Tue May 2 02:43:05 BST 2000 v 4.0 moved some code in _findgenefeatures, added the possibility of using cds_position information, created _checkfeatureproximity +# Tue May 2 03:20:20 BST 2000 v 4.01 findgenefeatures debugged +# Wed May 31 13:59:09 BST 2000 v 4.02 chopped $translated to take away STOP +# Fri Jun 2 14:49:12 BST 2000 v 4.1 prints alignment with CLUSTALW +# Wed Jun 7 02:07:54 BST 2000 v 4.2 added code for "simplifying" joinedlocation features (e.g. join() in mRNA features), changing them to plain start-end ones +# Wed Jun 7 04:20:15 BST 2000 v 4.22 added translation->{'offset'} for INIT_MET +# Tue Jun 27 14:05:19 BST 2000 v. 4.3 added if() conditions so that if new() of object creation failed, the object is not passed on +# Tue Jul 4 14:15:58 BST 2000 v 4.4 note and number qualifier added to exon and intron descriptions +# Wed Jul 12 14:06:38 BST 2000 v 4.41 added if() condition out of transcript creation in transexoncreation() +# Fri Sep 15 15:41:02 BST 2000 v 4.44 created _common_novelaasequence2gene + +# Note: test_transl has been left as deprecated and is not really supported now + +use strict; +use Carp qw(cluck croak carp); +use vars qw($VERSION @ISA); +use Bio::LiveSeq::DNA 1.2; +use Bio::LiveSeq::Exon 1.0; +use Bio::LiveSeq::Transcript 2.4; +use Bio::LiveSeq::Translation 1.4; +use Bio::LiveSeq::Gene 1.1; +use Bio::LiveSeq::Intron 1.0; +use Bio::LiveSeq::Prim_Transcript 1.0; +use Bio::LiveSeq::Repeat_Region 1.0; +use Bio::LiveSeq::Repeat_Unit 1.0; +use Bio::LiveSeq::AARange 1.4; +use Bio::Tools::CodonTable; + +#@ISA=qw(Bio::LiveSeq::); # not useful now + +=head2 entry2liveseq + + Title : entry2liveseq + Usage : @translationobjects=$loader->entry2liveseq(); + : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0); + Function: creates LiveSeq objects from an entry previously loaded + Returns : array of references to objects of class Translation + Errorcode 0 + Args : optional boolean flag to avoid the retrieval of SwissProt + informations for all Transcripts containing SwissProt x-reference + default is 1 (to retrieve those informations and create AARange + LiveSeq objects) + Note : this method can get really slow for big entries. The lightweight + gene2liveseq method is recommended + +=cut + +sub entry2liveseq { + my ($self, %args) = @_; + my ($getswissprotinfo)=($args{-getswissprotinfo}); + if (defined($getswissprotinfo)) { + if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { + carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; + $getswissprotinfo=0; + } + } else { + $getswissprotinfo=1; + } + my $hashref=$self->{'hash'}; + unless ($hashref) { return (0); } + my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo); + my $test_transl=0; + if ($test_transl) { $self->test_transl($hashref,\@translationobjects);} + return @translationobjects; +} + +=head2 novelaasequence2gene + + Title : novelaasequence2gene + Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); + : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); + -taxon => 9606, + -gene_name => "tyr-kinase"); + + Function: creates LiveSeq objects from a novel amino acid sequence, + using codon usage database to choose codons according to + relative frequencies. + If a taxon ID is not specified, the default is to use the human + one (taxonomy ID 9606). + Returns : reference to a Gene object containing references to LiveSeq objects + Errorcode 0 + Args : string containing an amino acid sequence + integer (optional) with a taxonomy ID + string specifying a gene name + +=cut + +=head2 gene2liveseq + + Title : gene2liveseq + Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name"); + : $gene=$loader->gene2liveseq(-gene_name => "gene name", + -flanking => 64); + : $gene=$loader->gene2liveseq(-gene_name => "gene name", + -getswissprotinfo => 0); + : $gene=$loader->gene2liveseq(-position => 4); + + Function: creates LiveSeq objects from an entry previously loaded + It is a "light weight" creation because it creates + a LiveSequence just for the interesting region in an entry + (instead than for the total entry, like in entry2liveseq) and for + the flanking regions up to 500 nucleotides (default) or up to + the specified amount of nucleotides (given as argument) around the + Gene. + Returns : reference to a Gene object containing possibly alternative + Transcripts. + Errorcode 0 + Args : string containing the gene name as in the EMBL feature qualifier + integer (optional) "flanking": amount of flanking bases to be kept + boolean (optional) "getswissprotinfo": if set to "0" it will avoid + trying to fetch information from a crossreference to a SwissProt + entry, avoding the process of creation of AARange objects + It is "1" (on) by default + + Alternative to a gene_name, a position can be given: an + integer (1-) containing the position of the desired CDS in the + loaded entry + +=cut + +sub gene2liveseq { + my ($self, %args) = @_; + my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position}); + my $input; + unless (($gene_name)||($cds_position)) { + carp "Gene_Name or Position not specified for gene2liveseq loading function"; + return (0); + } + if (($gene_name)&&($cds_position)) { + carp "Gene_Name and Position cannot be given together, use one"; + return (0); + } elsif ($gene_name) { + $input=$gene_name; + } else { + $input="cds-position:".$cds_position; + } + + if (defined($getswissprotinfo)) { + if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { + carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; + $getswissprotinfo=0; + } + } else { + $getswissprotinfo=1; + } + + if (defined($flanking)) { + unless ($flanking >= 0) { + carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function"; + return (0); + } + } else { + $flanking=500; # the default flanking length + } + my $hashref=$self->{'hash'}; + unless ($hashref) { return (0); } + my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo); + unless ($gene) { # if $gene == 0 it means problems in hash2gene + carp "gene2liveseq produced error"; + return (0); + } + return $gene; +} + +# TODO: update so that it will work even if CDS is not only accepted FEATURE!! +# this method is for now deprecated and not supported +sub test_transl { + my ($self,$entry)=@_; + my @features=@{$entry->{'Features'}}; + my @translationobjects=@{$_[1]}; + my ($i,$translation); + my ($obj_transl,$hash_transl); + my @cds=@{$entry->{'CDS'}}; + foreach $translation (@translationobjects) { + $obj_transl=$translation->seq; + $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'}; + #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*" + unless ($obj_transl eq $hash_transl) { + cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i"; + carp "\nEntry's transl: ",$hash_transl,"\n"; + carp "\nObject's transl: ",$obj_transl,"\n"; + exit; + } + $i++; + } +} + +# argument: hashref containing the EMBL entry datas, +# getswissprotinfo boolean flag +# creates the liveseq objects +# returns: an array of Translation object references +sub hash2liveseq { + my ($self,$entry,$getswissprotinfo)=@_; + my $i; + my @transcripts; + my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} ); + $dna->alphabet(lc($entry->{'Molecule'})); + $dna->display_id($entry->{'ID'}); + $dna->accession_number($entry->{'AccNumber'}); + $dna->desc($entry->{'Description'}); + my @cds=@{$entry->{'CDS'}}; + my ($swissacc,$swisshash); my @swisshashes; + for $i (0..$#cds) { + #my @transcript=@{$cds[$i]->{'range'}}; + #$transcript=\@transcript; + #push (@transcripts,$transcript); + push (@transcripts,$cds[$i]->{'range'}); + if ($getswissprotinfo) { + $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'}; + $swisshash=$self->get_swisshash($swissacc); + #$self->printswissprot($swisshash); # DEBUG + push (@swisshashes,$swisshash); + } + } + my @translations=($self->transexonscreation($dna,\@transcripts)); + my $translation; my $j=0; + foreach $translation (@translations) { + if ($swisshashes[$j]) { # if not 0 + $self->swisshash2liveseq($swisshashes[$j],$translation); + } + $j++; + } + return (@translations); +} + +# only features pertaining to a specified gene are created +# only the sequence of the gene and appropriate context flanking regions +# are created as chain +# arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag +# returns: reference to Gene object +# +# Note: if entry contains just one CDS, all the features get added +# this is useful because often the features in these entries do not +# carry the /gene qualifier +# +# errorcode: 0 +sub hash2gene { + my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_; + my $entryfeature; + my $genefeatureshash; + + my @cds=@{$entry->{'CDS'}}; + + # checking if a position has been given instead than a gene_name + if (index($input,"cds-position:") == 0 ) { + my $cds_position=substr($input,13); # extracting the cds position + if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) { + $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo); + } + } else { + $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo); + } + + unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found + my @genes=$self->genes($entry); + my $cds_number=scalar(@cds); + warn "Warning! Not even one genefeature found for /$input/.... + The genes present in this entry are:\n\t@genes\n + The number of CDS in this entry is:\n\t$cds_number\n"; + return(0); + } + + # get max and min, check flankings + my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries" + my $seqlength=$entry->{'SeqLength'}; + my ($mindna,$maxdna); # some flanking region next to the gene "boundaries" + if ($min-$flanking < 1) { + $mindna=1; + } else { + $mindna=$min-$flanking; + } + if ($max+$flanking > $seqlength) { + $maxdna=$seqlength; + } else { + $maxdna=$max+$flanking; + } + my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1); + + # create LiveSeq objects + + # create DNA + my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna); + $dna->alphabet(lc($entry->{'Molecule'})); + $dna->source($entry->{'Organism'}); + $dna->display_id($entry->{'ID'}); + $dna->accession_number($entry->{'AccNumber'}); + $dna->desc($entry->{'Description'}); + + my @transcripts=@{$genefeatureshash->{'transcripts'}}; + # create Translations, Transcripts, Exons out of the CDS + unless (@transcripts) { + cluck "no CDS feature found for /$input/...."; + return(0); + } + my @translationobjs=$self->transexonscreation($dna,\@transcripts); + my @transcriptobjs; + + # get the Transcript obj_refs + my $translation; + my $j=0; + my @ttables=@{$genefeatureshash->{'ttables'}}; + my @swisshashes=@{$genefeatureshash->{'swisshashes'}}; + foreach $translation (@translationobjs) { + push(@transcriptobjs,$translation->get_Transcript); + if ($ttables[$j]) { # if not undef + $translation->get_Transcript->translation_table($ttables[$j]); + #} else { # DEBUG + # print "\n\t\tno translation table information....\n"; + } + if ($swisshashes[$j]) { # if not 0 + $self->swisshash2liveseq($swisshashes[$j],$translation); + } + $j++; + } + + my %gene; # this is the hash to store created object references + $gene{DNA}=$dna; + $gene{Transcripts}=\@transcriptobjs; + $gene{Translations}=\@translationobjs; + + my @exonobjs; my @intronobjs; + my @repeatunitobjs; my @repeatregionobjs; + my @primtranscriptobjs; + + my ($object,$range,$start,$end,$strand); + + my @exons=@{$genefeatureshash->{'exons'}}; + my @exondescs=@{$genefeatureshash->{'exondescs'}}; + if (@exons) { + my $exoncount = 0; + foreach $range (@exons) { + ($start,$end,$strand)=@{$range}; + $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount]; + $exoncount++; + push (@exonobjs,$object); + } else { + $exoncount++; + } + } + $gene{Exons}=\@exonobjs; + } + my @introns=@{$genefeatureshash->{'introns'}}; + my @introndescs=@{$genefeatureshash->{'introndescs'}}; + if (@introns) { + my $introncount = 0; + foreach $range (@introns) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($introndescs[$introncount]); + $introncount++; + push (@intronobjs,$object); + } else { + $introncount++; + } + } + $gene{Introns}=\@intronobjs; + } + my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}}; + if (@prim_transcripts) { + foreach $range (@prim_transcripts) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { push (@primtranscriptobjs,$object); } + } + $gene{Prim_Transcripts}=\@primtranscriptobjs; + } + my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}}; + my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}}; + if (@repeat_regions) { + my $k=0; + foreach $range (@repeat_regions) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($repeat_regions_family[$k]); + $k++; + push (@repeatregionobjs,$object); + } else { + $k++; + } + } + $gene{Repeat_Regions}=\@repeatregionobjs; + } + my @repeat_units=@{$genefeatureshash->{'repeat_units'}}; + my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}}; + if (@repeat_units) { + my $k=0; + foreach $range (@repeat_units) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($repeat_units_family[$k]); + $k++; + push (@repeatunitobjs,$object); + } else { + $k++; + } + } + $gene{Repeat_Units}=\@repeatunitobjs; + } + + # create the Gene + my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos + return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene, + -upbound=>$min,-downbound=>$max)); +} + +# maybe this function will be moved to general utility package +# argument: array of numbers +# returns: (min,max) numbers in the array +sub rangeofarray { + my $self=shift; + my @array=@_; + #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n"; + my ($max,$min,$element); + $min=$max=shift(@array); + foreach $element (@array) { + $element = 0 unless defined $element; + if ($element < $min) { + $min=$element; + } + if ($element > $max) { + $max=$element; + } + } + #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n"; + return ($min,$max); +} + + +# argument: reference to DNA object, reference to array of transcripts +# returns: an array of Translation object references +sub transexonscreation { + my $self=shift; + my $dna=$_[0]; + my @transcripts=@{$_[1]}; + + my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj); + my $translationobj; + my @translationobjects; + foreach $transcript (@transcripts) { + foreach $exonref (@{$transcript}) { + ($start,$end,$strand)=@{$exonref}; + #print "Creating Exon: start $start end $end strand $strand\n"; + $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + #push (@exonobjects,$exonobj); + push (@transexons,$exonobj); + } + $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons ); + if ($transcriptobj != -1) { + $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj); + @transexons=(); # cleans it + #push (@transcriptobjects,$transcriptobj); + push (@translationobjects,$translationobj); + } + } + return (@translationobjects); +} + +#sub printgene { +# deleted. Some functionality placed in Gene->printfeaturesnum + +=head2 printswissprot + + Title : printswissprot + Usage : $loader->printswissprot($hashref); + Function: prints out all informations loaded from a database entry into the + loader. Mainly used for testing purposes. + Args : a hashref containing the SWISSPROT entry datas + Note : the hashref can be obtained with a call to the method + $loader->get_swisshash() (only with SRS loader or + BioPerl via Bio::DB::EMBL.pm) + that takes as argument a string like "SWISS-PROT:P10275" or + from $loader->swissprot2hash() that takes an SRS query string + as its argument (e.g. "swissprot-acc:P10275") + +=cut + +# argument: hashref containing the SWISSPROT entry datas +# prints out that hash, showing the informations loaded +sub printswissprot { + my ($self,$entry)=@_; + unless ($entry) { + return; + } + printf "ID: %s\n", + $entry->{'ID'}; + printf "ACC: %s\n", + $entry->{'AccNumber'}; + printf "GENE: %s\n", + $entry->{'Gene'}; + printf "DES: %s\n", + $entry->{'Description'}; + printf "ORG: %s\n", + $entry->{'Organism'}; + printf "SEQLN: %s\n", + $entry->{'SeqLength'}; + printf "SEQ: %s\n", + substr($entry->{'Sequence'},0,64); + if ($entry->{'Features'}) { + my @features=@{$entry->{'Features'}}; + my $i; + for $i (0..$#features) { + print "|",$features[$i]->{'name'},"|"; + print " at ",$features[$i]->{'location'},": "; + print "",$features[$i]->{'desc'},"\n"; + } + } +} + +=head2 printembl + + Title : printembl + Usage : $loader->printembl(); + Function: prints out all informations loaded from a database entry into the + loader. Mainly used for testing purposes. + Args : none + +=cut + +# argument: hashref containing the EMBL entry datas +# prints out that hash, showing the informations loaded +sub printembl { + my ($self,$entry)=@_; + unless ($entry) { + $entry=$self->{'hash'}; + } + my ($i,$featurename); + printf "ID: %s\n", + $entry->{'ID'}; + printf "ACC: %s\n", + $entry->{'AccNumber'}; + printf "MOL: %s\n", + $entry->{'Molecule'}; + printf "DIV: %s\n", + $entry->{'Division'}; + printf "DES: %s\n", + $entry->{'Description'}; + printf "ORG: %s\n", + $entry->{'Organism'}; + printf "SEQLN: %s\n", + $entry->{'SeqLength'}; + printf "SEQ: %s\n", + substr($entry->{'Sequence'},0,64); + my @features=@{$entry->{'Features'}}; + my @cds=@{$entry->{'CDS'}}; + print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n"; + my ($exonref,$transcript); + my ($qualifiernumber,$qualifiers,$key); + my ($start,$end,$strand); + my $j=0; + for $i (0..$#features) { + $featurename=$features[$i]->{'name'}; + if ($featurename eq "CDS") { + print "|CDS| number $j at feature position: $i\n"; + #print $features[$i]->{'location'},"\n"; + $transcript=$features[$i]->{'range'}; + foreach $exonref (@{$transcript}) { + ($start,$end,$strand)=@{$exonref}; + print "\tExon: start $start end $end strand $strand\n"; + } + $j++; + } else { + print "|$featurename| at feature position: $i\n"; + print "\trange: "; + print join("\t",@{$features[$i]->{'range'}}),"\n"; + #print $features[$i]->{'location'},"\n"; + } + $qualifiernumber=$features[$i]->{'qual_number'}; + $qualifiers=$features[$i]->{'qualifiers'}; # hash + foreach $key (keys (%{$qualifiers})) { + print "\t\t",$key,": "; + print $qualifiers->{$key},"\n"; + } + } +} + +=head2 genes + + Title : genes + Usage : $loader->genes(); + Function: Returns an array of gene_names (strings) contained in the loaded + entry. + Args : none + +=cut + +# argument: entryhashref +# returns: array of genenames found in the entry +sub genes { + my ($self,$entry)=@_; + unless ($entry) { + $entry=$self->{'hash'}; + } + my @entryfeatures=@{$entry->{'Features'}}; + my ($genename,$genenames,$entryfeature); + for $entryfeature (@entryfeatures) { + $genename=$entryfeature->{'qualifiers'}->{'gene'}; + if ($genename) { + if (index($genenames,$genename) == -1) { # if name is new + $genenames .= $genename . " "; # add the name + } + } + } + return (split(/ /,$genenames)); # assumes no space inbetween each genename +} + +# arguments: swisshash, translation objref +# adds information to the Translation, creates AARange objects, sets the +# aa_range attribute on the Translation, pointing to those objects +sub swisshash2liveseq { + my ($self,$entry,$translation)=@_; + my $translength=$translation->length; + $translation->desc($translation->desc . $entry->{'Description'}); + $translation->display_id("SWISSPROT:" . $entry->{'ID'}); + $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'}); + $translation->name($entry->{'Gene'}); + $translation->source($entry->{'Organism'}); + my @aarangeobjects; + my ($start,$end,$aarangeobj,$feature); + my @features; my @newfeatures; + if ($entry->{'Features'}) { + @features=@{$entry->{'Features'}}; + } + my $cleavedmet=0; + # check for cleaved Met + foreach $feature (@features) { + if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) { + $cleavedmet=1; + $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence + } else { + push(@newfeatures,$feature); + } + } + + my $swissseq=$entry->{'Sequence'}; + my $liveseqtransl=$translation->seq; + chop $liveseqtransl; # to take away the trailing STOP "*" + my $translated=substr($liveseqtransl,$cleavedmet); + + my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met + + if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed? + print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n"; + carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!"; + return; + } + + #my $i=0; # debug + @features=@newfeatures; + foreach $feature (@features) { + #print "Processing SwissProtFeature: $i\n"; # debug + ($start,$end)=split(/ /,$feature->{'location'}); + # Note: cleavedmet is taken in account for updating numbering + $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -desc => $feature->{'desc'}, -translation => $translation, -translength => $translength); + if ($aarangeobj != -1) { + push(@aarangeobjects,$aarangeobj); + } + # $i++; # debug + } + $translation->{'aa_ranges'}=\@aarangeobjects; +} + +# if there is no SRS support, the default will be to return 0 +# i.e. this function is overridden in SRS package +sub get_swisshash { + return (0); +} + +# Args: $entry hashref, gene_name OR cds_position (undef is used to +# choose between the two), getswissprotinfo boolean flag +# Returns: an hash holding various arrayref used in the hash2gene method +# Function: examines the nucleotide entry, identifying features belonging +# to the gene (defined either by its name or by the position of its CDS in +# the entry) + +sub _findgenefeatures { + my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_; + + my @entryfeatures=@{$entry->{'Features'}}; + my @exons; my @introns; my @prim_transcripts; my @transcripts; + my @repeat_units; my @repeat_regions; + my @repeat_units_family; my @repeat_regions_family; my $rpt_family; + my $entryfeature; my @genefeatures; + my $desc; my @exondescs; my @introndescs; + + # for swissprot xreference + my ($swissacc,$swisshash); my @swisshashes; + + # for translation_tables + my @ttables; + + # to create labels + my ($name,$exon); + my @range; my @cdsexons; my @labels; + + # maybe here also could be added special case when there is no CDS feature + # in the entry (e.g. tRNA entry -> TOCHECK). + # let's deal with the special case in which there is just one gene per entry + # usually without /gene qualifier + my @cds=@{$entry->{'CDS'}}; + + my $skipgenematch=0; + if (scalar(@cds) == 1) { + #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features."; + $skipgenematch=1; + } + + my ($cds_begin,$cds_end,$proximity); + if ($cds_position) { # if a position has been requested + my @cds_exons=@{$cds[$cds_position-1]->{'range'}}; + ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS + $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'}; + # DEBUG + unless ($skipgenematch) { + carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------"; + } + $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS + } + + for $entryfeature (@entryfeatures) { # get only features for the desired gene + if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) { + push(@genefeatures,$entryfeature); + + my @range=@{$entryfeature->{'range'}}; + $name=$entryfeature->{'name'}; + my %qualifierhash=%{$entryfeature->{'qualifiers'}}; + if ($name eq "CDS") { # that has range containing array of exons + + # swissprot crossindexing (if without SRS support it will fill array + # with zeros and do nothing + if ($getswissprotinfo) { + $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'}; + $swisshash=$self->get_swisshash($swissacc); + #$self->printswissprot($swisshash); # DEBUG + push (@swisshashes,$swisshash); + } + + push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified + + # create labels array + for $exon (@range) { + push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS + } + push (@transcripts,$entryfeature->{'range'}); + } else { + # "simplifying" the joinedlocation features. I.e. changing them from + # multijoined ones to simple plain start-end features, taking only + # the start of the first "exon" and the end of the last "exon" as + # start and end of the entire feature + if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location + @range=($range[0]->[0],$range[-1]->[1]); + } + push(@labels,$range[0],$range[1]); # start and end of every feature + if ($name eq "exon") { + $desc=$entryfeature->{'qualifiers'}->{'number'}; + if ($entryfeature->{'qualifiers'}->{'note'}) { + if ($desc) { + $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; + } else { + $desc = $entryfeature->{'qualifiers'}->{'note'}; + } + } + push (@exondescs,$desc || "unknown"); + push(@exons,\@range); + } + if ($name eq "intron") { + $desc=$entryfeature->{'qualifiers'}->{'number'}; + if ($desc) { + $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; + } else { + $desc = $entryfeature->{'qualifiers'}->{'note'}; + } + push (@introndescs,$desc || "unknown"); + push(@introns,\@range); + } + if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); } + if ($name eq "repeat_unit") { push(@repeat_units,\@range); + $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; + push (@repeat_units_family,$rpt_family || "unknown"); + } + if ($name eq "repeat_region") { push(@repeat_regions,\@range); + $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; + push (@repeat_regions_family,$rpt_family || "unknown"); + } + } + } + } + unless ($gene_name) { $gene_name="cds-position:".$cds_position; } + my %genefeatureshash; + $genefeatureshash{gene_name}=$gene_name; + $genefeatureshash{genefeatures}=\@genefeatures; + $genefeatureshash{labels}=\@labels; + $genefeatureshash{ttables}=\@ttables; + $genefeatureshash{swisshashes}=\@swisshashes; + $genefeatureshash{transcripts}=\@transcripts; + $genefeatureshash{exons}=\@exons; + $genefeatureshash{exondescs}=\@exondescs; + $genefeatureshash{introns}=\@introns; + $genefeatureshash{introndescs}=\@introndescs; + $genefeatureshash{prim_transcripts}=\@prim_transcripts; + $genefeatureshash{repeat_units}=\@repeat_units; + $genefeatureshash{repeat_regions}=\@repeat_regions; + $genefeatureshash{repeat_units_family}=\@repeat_units_family; + $genefeatureshash{repeat_regions_family}=\@repeat_regions_family; + return (\%genefeatureshash); +} + + +# used by _findgenefeatures, when a CDS at a certain position is requested, +# to retrieve only features quite close to the wanted CDS. +# Args: range hashref, begin and end positions of the CDS, $proximity +# $proximity holds the maximum distance between the extremes of the CDS +# and of the feature under exam. +# Returns: boolean +sub _checkfeatureproximity { + my ($self,$range,$cds_begin,$cds_end,$proximity)=@_; + my @range=@{$range}; + my ($begin,$end,$strand); + if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons + ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]); + } else { + ($begin,$end,$strand)=@range; + } + if ($cds_begin > $cds_end) { # i.e. reverse strand CDS + ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries + } + if ($strand == -1) { # reverse strand + ($begin,$end)=($end,$begin); # swap boundaries + } + if (($cds_begin-$end)>$proximity) { + carp "--DEBUG-- feature rejected: begin $begin end $end -------"; + return (0); + } + if (($begin-$cds_end)>$proximity) { + carp "--DEBUG-- feature rejected: begin $begin end $end -------"; + return (0); + } + carp "--DEBUG-- feature accepted: begin $begin end $end -------"; + return (1); # otherwise ok, feature considered next to CDS +} + + +# function that calls the external program "align" (on the fasta2 package) +# to create an alignment between two sequences, returning the aligned +# strings and the codes for the identity (:: ::::) + +sub _get_alignment { + my ($self,$seq1,$seq2)=@_; + my $fastafile1="/tmp/tmpfastafile1"; + my $fastafile2="/tmp/tmpfastafile2"; + my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut + my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>/dev/null | $grepcut"; # ALIGN + open TMPFASTAFILE1,">$fastafile1" || croak "Cannot write into $fastafile1 for aa alignment"; + open TMPFASTAFILE2,">$fastafile2" || croak "Cannot write into $fastafile1 for aa alignment"; + print TMPFASTAFILE1 ">firstseq\n$seq1\n"; + print TMPFASTAFILE2 ">secondseq\n$seq2\n"; + close TMPFASTAFILE1; + close TMPFASTAFILE2; + my $alignment=`$alignprogram`; + my @alignlines=split(/\n/,$alignment); + my ($linecount,$seq1_aligned,$seq2_aligned,$codes); + for ($linecount=0; $linecount < @alignlines; $linecount+=3) { + $seq1_aligned .= $alignlines[$linecount]; + $codes .= $alignlines[$linecount+1]; + $seq2_aligned .= $alignlines[$linecount+2]; + } + return ($seq1_aligned,$seq2_aligned,$codes); +} + +# common part of the function to create a novel liveseq gene structure +# from an amino acid sequence, using codon usage frequencies +# args: codon_usage_array transltableid aasequence gene_name +sub _common_novelaasequence2gene { + my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_; + my @species_codon_usage=@{$species_codon_usage}; + my @codon_usage_label= + qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg + tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga + ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa + gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag + tga); + my ($i,$j); + my %codon_usage_value; + my $aa_codon_total; + for ($i=0;$i<64;$i++) { + $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i]; + } + + my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid ); + my @aminoacids = split(//,uc($aasequence)); + my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon); + for $i (@aminoacids) { + @alt_codons = $CodonTable->revtranslate($i); + unless (@alt_codons) { + carp "No reverse translation possible for aminoacid \'$i\'"; + $dnasequence .= "???"; + } else { + $aa_codon_total=0; + for $j (@alt_codons) { + $aa_codon_total+=$codon_usage_value{$j}; + } + # print "aminoacid $i, codonchoice: "; # verbose + #$partial=0; + #for $j (@alt_codons) { + #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total; + #$partial+=($codon_usage_value{$j}/$aa_codon_total); + #} + #print "\n"; + $dice=rand(1); + #print "roulette: $dice\n"; # verbose + $partial=0; + $chosen_codon=""; + CODONCHOICE: + for $j (0..@alt_codons) { # last one not accounted + $thiscodon=$alt_codons[$j]; + $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total); + if ($dice < $relativeusage+$partial) { + $chosen_codon=$thiscodon; + last CODONCHOICE; + } else { + $partial += $relativeusage; + } + } + unless ($chosen_codon) { + $chosen_codon = $alt_codons[-1]; # the last one + } + # print ".....adding $chosen_codon\n"; # verbose + $dnasequence .= $chosen_codon; + } + } + + my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence); + my $min=1; + my $max=length($dnasequence); + my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1); + my @exons=($exon); + my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons); + $transcript->translation_table($ttabid); + my @transcripts=($transcript); + my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript); + my @translations=($translation); + my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations); + my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max); + + # creation of gene + unless ($gene) { # if $gene == 0 it means problems in hash2gene + carp "Error in Gene creation phase"; + return (0); + } + return $gene; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/IO/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/IO/README Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,27 @@ +# $Id: README,v 1.3 2002/03/15 12:46:43 heikki Exp $ + +README for Bio::LiveSeq::IO + +LiveSeq objects representing known gene structures and their sequences +have to be created from nucleotide sequence files. The current IO +files do it by reading in EMBL entries and parsing out sequences as +well as CDS, exon and primary_transcript features from the feature +table. + +Bio::LiveSeq::IO::Loader + + is a superclass holding methods common to other methods. + +Bio::LiveSeq::IO::BioPerl + + is the preferred method which uses Bio::DB::EMBL to retrive + sequences over the Web by accession number. + +Bio::LiveSeq::IO::SRS + + retrieves sequences from a local installation of SRS. It needs + srsperl.pm which is part of SRS. SRS is short for Sequence + Retrieval System, a comprehensive program suite for indexing + and serving biological databases. SRS is a product of Lion + BioSciences (http://www.lionbio.co.uk/). The license for + academic users is free. diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,611 @@ +# $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::IO::SRS +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS + +=head1 SYNOPSIS + + my $db="EMBL"; + my $acc_id="M20132"; + my $query="embl-acc:$acc_id"; + + my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query"); + + my @translationobjects=$loader->entry2liveseq(); + + my $gene="AR"; + my $gene=$loader->gene2liveseq("gene"); + + NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL. + +=head1 DESCRIPTION + +This package uses the SRS (Sequence Retrieval System) to fetch a sequence +database entry, analyse it and create LiveSeq objects out of it. + +An embl-acc ID has to be passed to this package which will return references +to all translation objects created from the EMBL entry. +References to Transcription, DNA and Exon objects can all be retrieved departing +from these. + +Alternatively, a specific "gene" name can be specified, together with the +embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene +features attached/created. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::IO::SRS; +$VERSION=2.4; + +# Version history: +# Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm +# Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created +# Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash +# Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range +# Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division +# Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash +# Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names +# Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features +# Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!) +# Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq +# Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names +# Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added + +use strict; +use Carp qw(cluck croak carp); +use vars qw($VERSION @ISA); +use lib $ENV{SRSEXE}; +use srsperl; +use Bio::Tools::CodonTable; # for novelaasequence2gene + +use Bio::LiveSeq::IO::Loader 2.2; + +@ISA=qw(Bio::LiveSeq::IO::Loader); + +# This package can in the future host other databases loading subroutines. +# e.g. ensembl2hash + +=head2 load + + Title : load + Usage : my $acc_id="M20132"; + my $query="embl-acc:$acc_id"; + $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query"); + + Function: loads an entry with SRS from a database into a hash + Returns : reference to a new object of class IO::SRS holding an entry + Errorcode 0 + Args : an SRS query resulting in one fetched EMBL (by default) entry + +=cut + +sub load { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($obj,%loader); + + my ($db,$query)=($args{-db},$args{-query}); + + if (defined($db)) { + unless ($db eq "EMBL") { + carp "Note: only EMBL now supported!"; + return(0); + } + } else { + $db="EMBL"; + } + + my $hashref; + if ($db eq "EMBL") { + my $test_transl=0; # change to 0 to avoid comparison of translation + + # these can be changed for future needs + my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA); + my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table); + + # dunno yet how to implement test_transl again.... + # probably on a one-on-one basis with each translation? + if ($test_transl) { + push (@embl_valid_qual_names,"translation"); # needed for test_transl + } + + # not to have the whole program die because of SRS death + eval { + $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names); + }; + my $errormsg; + if ($@) { + foreach $errormsg (split(/\n/,$@)) { + if (index($errormsg,"in cleanup") == -1) { + carp "SRS EMBL Loader: $@"; + } + } + } + } + unless ($hashref) { return (0); } + + %loader = (db => $db, query => $query, hash => $hashref); + $obj = \%loader; + $obj = bless $obj, $class; + return $obj; +} + +=head2 embl2hash + + Title : embl2hash + Function: retrieves with SRS an EMBL entry, parses it and creates + a hash that contains all the information. + Returns : a reference to a hash + Errorcode: 0 + Args : an SRS query resulting in one fetched EMBL entry + i.e. a string in a form like "embl-acc:accession_number" + two array references to skip features and qualifiers (for + performance) + Example: @valid_features=qw(CDS exon prim_transcript mRNA); + @valid_qualifiers=qw(gene codon_start db_xref product rpt_family); + $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers); + +=cut + +# this has to be defined here as it is the only thing really proper to +# the /SRS/ loader. Every loader has to sport his own "entry2hash" function +# the main functions will be in the Loader.pm +# arguments: embl SRS query resulting in one fetched EMBL entry +# to skip features and qualifiers (for performance), two array +# references must be passed (this can change into string arguments to +# be passed....) +# returns: a reference to a hash containing the important features requested +sub embl2hash { + my ($i,$j); + my $query=$_[0]; + my %valid_features; my %valid_names; + if ($_[1]) { + %valid_features = map {$_, 1} @{$_[1]}; # to skip features + } + if ($_[2]) { + %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers + } + # SRS API used to fetch all relevant fields + my $sess = new Session; + my $set = $sess->query("[$query]", ""); + my $numEntries=$set->size(); + if ($numEntries < 1) { + carp "No sequence entry found for the query $query"; + return (0); + } elsif ($numEntries > 1) { + carp "Too many entries found for the input given"; + return (0); + } else { + my $entry = $set->getEntry(0); + my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division); + # Fetch what we can fetch without the loader + $entry_ID = $entry->fieldToString("id","","",""); + $entry_AccNumber = $entry->fieldToString("acc","","",""); + $entry_Molecule = $entry->fieldToString("mol","","",""); + $entry_Division = $entry->fieldToString("div","","",""); + $entry_Description = $entry->fieldToString("des","","",""); + $entry_Description =~ s/\n/ /g; + $entry_Organism = $entry->fieldToString("org","","",""); + $entry_SeqLength = $entry->fieldToString("sl","","",""); + # Now use the loader + my $loadedentry = $entry->load("EMBL"); + # Fetch the rest via the loader + $entry_Sequence = $loadedentry->attrStr("sequence"); + $entry_Sequence =~ s/\n//g; # from plain format to raw string + + # put into the hash + my %entryhash; + $entryhash{ID}=$entry_ID; + $entryhash{AccNumber}=$entry_AccNumber; + $entryhash{Molecule}=$entry_Molecule; + $entryhash{Division}=$entry_Division; + $entryhash{Description}=$entry_Description; + $entryhash{Organism}=$entry_Organism; + $entryhash{Sequence}=$entry_Sequence; + $entryhash{SeqLength}=$entry_SeqLength; + + # create features array + my $features = $loadedentry->attrObjList("features"); + my $featuresnumber= $features->size(); + $entryhash{FeaturesNumber}=$featuresnumber; + my ($feature,$feature_name,$feature_location); + my ($feature_qual_names,$feature_qual_values); + my ($feature_qual_name,$feature_qual_value); + my ($feature_qual_number,$feature_qual_number2); + my @features; + for $i (0..$featuresnumber-1) { + my %feature; + $feature = $features->get($i); + $feature_name = $feature->attrStr("FtKey"); + + #unless ($feature_name eq "CDS") { + unless ($valid_features{$feature_name}) { + #print "not valid feature: $feature_name\n"; + next; + } + #print "now processing feature: $feature_name\n"; + $feature_location = $feature->attrStr("FtLocation"); + $feature_location =~ s/\n//g; + $feature_qual_names= $feature->attrStrList("FtQualNames"); + $feature_qual_values= $feature->attrStrList("FtQualValues"); + $feature_qual_number= $feature_qual_names->size(); + $feature_qual_number2= $feature_qual_values->size(); # paranoia check + if ($feature_qual_number > $feature_qual_number2) { + carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values."); + # this can happen, e.g. "/partial", let's move on, just issue a warning + #return (0); + } elsif ($feature_qual_number < $feature_qual_number2) { + carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped"); + return (0); + } + #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG + + # Put things inside hash + $feature{position}=$i; + $feature{name}=$feature_name; + + # new range field in hash + my @range; + if ($feature_name eq "CDS") { + @range=cdslocation2transcript($feature_location); + $feature{locationtype}="joined"; + } else { + if (index($feature_location,"join") == -1) { + @range=location2range($feature_location); + } else { # complex location in feature different than CDS + @range=joinedlocation2range($feature_location); + $feature{locationtype}="joined"; + } + } + $feature{range}=\@range; + $feature{location}="deprecated"; + my %feature_qualifiers; + for $j (0..$feature_qual_number-1) { + $feature_qual_name=$feature_qual_names->get($j); + $feature_qual_name =~ s/^\///; # eliminates heading "/" + + # skip all not interesting (for now) qualifiers + unless ($valid_names{$feature_qual_name}) { + #print "not valid name: $feature_qual_name\n"; + next; + } + #print "now processing: $feature_qual_name\n"; + $feature_qual_value=$feature_qual_values->get($j); + $feature_qual_value =~ s/^"//; # eliminates heading " + $feature_qual_value =~ s/"$//; # eliminates trailing " + $feature_qual_value =~ s/\n//g; # eliminates all new lines + $feature_qualifiers{$feature_qual_name}=$feature_qual_value; + } + $feature{qualifiers}=\%feature_qualifiers; + push (@features,\%feature); # array of features + } + $entryhash{Features}=\@features; # put this also into the hash + my @cds; # array just of CDSs + for $i (0..$#features) { + if ($features[$i]->{'name'} eq "CDS") { + push(@cds,$features[$i]); + } + } + $entryhash{CDS}=\@cds; # put this also into the hash + return (\%entryhash); + } +} + +# argument: location field of an EMBL feature +# returns: array with correct $start $end and $strand to create LiveSeq obj +sub location2range { + my ($location)=@_; + my ($start,$end,$strand); + if (index($location,"complement") == -1) { # forward strand + $strand=1; + } else { # reverse strand + $location =~ s/complement\(//g; + $location =~ s/\)//g; + $strand=-1; + } + $location =~ s/\//g; + my @range=split(/\.\./,$location); + if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features + $start=$end=$range[0]; + } else { + if ($strand == 1) { + ($start,$end)=@range; + } else { # reverse strand + ($end,$start)=@range; + } + } + return ($start,$end,$strand); +} + +# argument: location field of a CDS feature +# returns: array of exons arrayref, useful to create LiveSeq Transcript obj +sub cdslocation2transcript { + my ($location)=@_; + my @exonlocs; + my $exonloc; + my @exon; + my @transcript=(); + $location =~ s/^join\(//; + $location =~ s/\)$//; + @exonlocs = split (/\,/,$location); + for $exonloc (@exonlocs) { + my @exon=location2range($exonloc); + push (@transcript,\@exon); + } + return (@transcript); +} + +# argument: location field of a CDS feature +# returns: array of exons arrayref, useful to create LiveSeq Transcript obj +sub joinedlocation2range { + &cdslocation2transcript; +} + + +=head2 get_swisshash + + Title : get_swisshash + Usage : $loader->get_swisshash(); + Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275") + Function: retrieves with SRS a SwissProt entry, parses it and creates + a hash that contains all the information. + Returns : a reference to a hash + Errorcode: 0 + Args : the db_xref qualifier's value from an EMBL CDS Feature + i.e. a string in the form "SWISS-PROT:accession_number" + Note : this can be modified (adding a second argument) to retrieve + and parse SWTREMBL, SWALL... entries + +=cut + +# argument: db_xref qualifier's value from EMBL CDS +# errorcode: 0 +# returns hashref +sub get_swisshash { + my ($self,$query)=@_; + if (index($query,"SWISS-PROT") == -1) { + return (0); + } + $query =~ s/SWISS-PROT/swissprot-acc/; + my $hashref; + eval { + $hashref=&swissprot2hash("$query"); + }; + my $errormsg; + if ($@) { + foreach $errormsg (split(/\n/,$@)) { + if (index($errormsg,"in cleanup") == -1) { + carp "SRS Swissprot Loader: $@"; + } + } + } + unless ($hashref) { + return (0); + } else { + return ($hashref); + } +} + +=head2 swissprot2hash + + Title : swissprot2hash + Usage : $loader->swissprot2hash(); + Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275") + Function: retrieves with SRS a SwissProt entry, parses it and creates + a hash that contains all the information. + Returns : a reference to a hash + Errorcode: 0 + Args : an SRS query resulting in one entry from SwissProt database + i.e. a string in the form "swissprot-acc:accession_number" + Note : this can be modified (adding a second argument) to retrieve + and parse SWTREMBL, SWALL... entries + +=cut + +# arguments: swissprot SRS query resulting in one fetched swissprot entry +# returns: a reference to a hash containing the important features requested +sub swissprot2hash { + my ($i,$j); + my $query=$_[0]; + # SRS API used to fetch all relevant fields + my $sess = new Session; + my $set = $sess->query("[$query]", ""); + my $numEntries = $set->size(); + if ($numEntries < 1) { + carp "No sequence entry found for the query $query"; + return (0); + } elsif ($numEntries > 1) { + carp "Too many entries found for the input given"; + return (0); + } else { + my $entry = $set->getEntry(0); + my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene); + # Fetch what we can fetch without the loader + $entry_ID = $entry->fieldToString("id","","",""); + $entry_AccNumber = $entry->fieldToString("acc","","",""); + $entry_Gene = $entry->fieldToString("gen","","",""); + $entry_Gene =~ s/\n/ /g; + $entry_Description = $entry->fieldToString("des","","",""); + $entry_Description =~ s/\n/ /g; + $entry_Organism = $entry->fieldToString("org","","",""); + chop $entry_Organism; + $entry_SeqLength = $entry->fieldToString("sl","","",""); + # Now use the loader + my $loadedentry = $entry->load("Swissprot"); + # Fetch the rest via the loader + $entry_Sequence = $loadedentry->attrStr("Sequence"); + $entry_Sequence =~ s/\n//g; # from plain format to raw string + + # put into the hash + my %entryhash; + $entryhash{ID}=$entry_ID; + $entryhash{AccNumber}=$entry_AccNumber; + $entryhash{Molecule}=$entry_Molecule; + $entryhash{Gene}=$entry_Gene; + $entryhash{Description}=$entry_Description; + $entryhash{Organism}=$entry_Organism; + $entryhash{Sequence}=$entry_Sequence; + $entryhash{SeqLength}=$entry_SeqLength; + + # create features array + my $features = $loadedentry->attrObjList("Features"); + my $featuresnumber= $features->size(); + $entryhash{FeaturesNumber}=$featuresnumber; + my ($feature,$feature_name,$feature_description,$feature_location); + my @features; + for $i (0..$featuresnumber-1) { + my %feature; + $feature = $features->get($i); + $feature_name = $feature->attrStr("FtKey"); + $feature_location = $feature->attrStr("FtLocation"); + $feature_location =~ s/ +/ /g; + $feature_description = $feature->attrStr("FtDescription"); + chop $feature_description; + $feature_description =~ s/\nFT / /g; + + # Put things inside hash + $feature{position}=$i; + $feature{name}=$feature_name; + $feature{location}=$feature_location; + $feature{description}=$feature_description; + + push (@features,\%feature); # array of features + } + $entryhash{Features}=\@features; # put this also into the hash + return (\%entryhash); + } +} + +=head2 novelaasequence2gene + + Title : novelaasequence2gene + Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); + : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*", + -genome => "Homo sapiens"); + : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*", + -genome => "Mitochondrion Homo sapiens", + -gene_name => "tyr-kinase"); + + Function: creates LiveSeq objects from a novel amino acid sequence, + using codon usage database to choose codons according to + relative frequencies. + If a genome latin name is not specified, the default is to use + 'Homo sapiens' (taxonomy ID 9606). + Returns : reference to a Gene object containing references to LiveSeq objects + Errorcode 0 + Args : string containing an amino acid sequence + string (optional) with a species/genome latin name + string specifying a gene name + Note : SRS access to TAXON and CODONUSAGE databases is required + +=cut + +sub novelaasequence2gene { + my ($self, %args) = @_; + my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence}); + unless ($aasequence) { + carp "aasequence not given"; + return (0); + } + unless ($gene_name) { + $gene_name="Novel Unknown"; + } + unless ($species_name) { + $species_name="Homo sapiens"; + } + + my $sess = new Session; + my ($e,$numEntries,$set); + + # codonusage lookup + my $codonusage_usage; + my @species_codon_usage; + $set = $sess->query("[codonusage:'$species_name']", ""); + $numEntries = $set->size(); + if ($numEntries > 0) { + $e = $set->getEntry(0); + $species_name = $e->fieldToString("id","","",""); + $codonusage_usage = $e->fieldToString("usage","","",""); + @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs + if ($numEntries > 1) { + carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name"; + } + } else { + carp "Genome not found in codon usage DB."; + return (0); + } + + # taxonomy lookup + my $mito_flag = 0; + my $species_origin; + if ($species_name =~ /^Mitochondrion /) { + $mito_flag = 1; + $species_origin = $species_name; + $species_origin =~ s/^Mitochondrion //; + $set = $sess->query("[taxonomy-species:'$species_origin']", ""); + } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) { + $species_origin = $species_name; + $species_origin =~ s/^Chromoplast //; + $species_origin =~ s/^Kinetoplast //; + $species_origin =~ s/^Chloroplast //; + $set = $sess->query("[taxonomy-species:'$species_origin']", ""); + } else { + $set = $sess->query("[taxonomy-species:'$species_name']", ""); + } + $numEntries = $set->size(); + my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name); + if ($numEntries > 0) { + $e = $set->getEntry(0); + $taxonomy_id = $e->fieldToString("id","","",""); + $taxonomy_name = $e->fieldToString("species","","",""); + $taxonomy_gc = $e->fieldToString("gc","","",""); + $taxonomy_mgc = $e->fieldToString("mgc","","",""); + if ($numEntries > 1) { + carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)"; + } + } else { + carp "Genome not found in taxonomy DB."; + return (0); + } + # Lookup appropriate translation table + my $ttabid; + if ($mito_flag) { + $ttabid = $taxonomy_mgc; + } else { + $ttabid = $taxonomy_gc; + } + + my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name); + return ($gene); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Intron.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Intron.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,73 @@ +# $Id: Intron.pm,v 1.7 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Intron +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Intron - Range abstract class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +Class for INTRON objects. They consist of a beginlabel, an endlabel (both +referring to a LiveSeq DNA object) and a strand. +The strand could be 1 (forward strand, default), -1 (reverse strand). + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Intron; +$VERSION=1.0; + +# Version history: +# Mon Mar 20 22:26:13 GMT 2000 v 1.0 begun + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::Range 1.2; # uses Range, inherits from it +@ISA=qw(Bio::LiveSeq::Range); + +=head2 new + + Title : new + Usage : $intron1=Bio::LiveSeq::Intron->new(-seq => $objref, + -start => $startlabel, + -end => $endlabel, + -strand => 1 + ); + + Function: generates a new Bio::LiveSeq::Intron + Returns : reference to a new object of class Intron + Errorcode -1 + Args : two labels and an integer + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Mutation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Mutation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,414 @@ +# $Id: Mutation.pm,v 1.6 2002/10/22 07:38:34 lapp Exp $ +# +# BioPerl module for Bio::LiveSeq::Mutation +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Mutation - Mutation event descriptor class + +=head1 SYNOPSIS + + # full descrition of a point mutation + $mutation1a = Bio::LiveSeq::Mutation->new ( -seq => 'A', + -seqori => 'T', + -pos => 100, + -len => 1 # optional, defaults to length(seq) + ); + + # minimal information for a point mutation + $mutation1b = Bio::LiveSeq::Mutation->new ( -seq => 'A', + -pos => 100 + ); + # insertion + $mutation2 = Bio::LiveSeq::Mutation->new ( -seq => 'ATT', + -pos => 100, + -len => 0 + ); + # deletion + $mutation3 = Bio::LiveSeq::Mutation->new ( -seq => '', # optional + -seqori => 'TTG', # optional + -pos => 100 + -len => 3 + ); + # complex + $mutation4 = Bio::LiveSeq::Mutation->new ( -seq => 'CC', + -seqori => 'TTG', # optional + -pos => 100 + -len => 3 + ); + + +=head1 DESCRIPTION + +This class describes a local mutation event using minimalistic +description. It is not necessary to know anything about the original +sequence. You need to give the changed sequence, the position of the +mutation in the (unidentified) reference sequence, and the length of +the affected subsequence in the reference sequence. If the original +allele sequence is given, the objects applying the mutation into the +reference sequence (e.g. L) might check for its +validity. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::LiveSeq::Mutation; +use vars qw(@ISA); +use strict; + +# Object preamble - inheritance + +use Bio::Root::Root; + +@ISA = qw( Bio::Root::Root ); + +sub new { + my($class,@args) = @_; + my $self; + $self = {}; + bless $self, $class; + + my ($seq, $seqori, $pos, $len, $label) = + $self->_rearrange([qw(SEQ + SEQORI + POS + LEN + )], + @args); + + $seq && $self->seq($seq); + $seqori && $self->seqori($seqori); + $pos && $self->pos($pos); + defined($len) && $self->len($len); # defined() added otherwise won't work for len==0 + + return $self; # success - we hope! +} + + +=head2 seq + + Title : seq + Usage : $obj->seq(); + Function: + + Sets and returns the mutated sequence. No checking is done + to validate the symbols. + + Example : + Returns : string + Args : integer + +=cut + + +sub seq { + my ($self,$value) = @_; + if( defined $value) { + $self->{'seq'} = $value; + } + return $self->{'seq'}; +} + + +=head2 seqori + + Title : seqori + Usage : $obj->seqori(); + Function: + + Sets and returns the original subsequence in the reference + sequence. No checking is done to validate the symbols. + Optional value. + + Example : + Returns : string + Args : string + +=cut + + +sub seqori { + my ($self,$value) = @_; + if( defined $value) { + $self->{'seqori'} = $value; + } + return $self->{'seqori'}; +} + + +=head2 pos + + Title : pos + Usage : $obj->pos(); + Function: + + Sets and returns the position of the first element in the + sequence. + + Example : + Returns : string + Args : integer + +=cut + + +sub pos { + my ($self,$value) = @_; + if( defined $value) { + if ( $value !~ /^([+-])?\d+$/ ) { + $self->throw("[$value] for pos has to be an integer\n"); + } else { + $self->{'pos'} = $value; + } + } + return $self->{'pos'}; +} + +=head2 len + + Title : len + Usage : $obj->len(); + Function: + + Sets and returns the len of the affected original allele + sequence. If value is not set, defaults to the lenght of + the mutated sequence (seq). + + Example : + Returns : string + Args : string + +=cut + +sub len { + my ($self,$value) = @_; + if ( defined $value) { + $self->{'len'} = $value; + } + if ( ! exists $self->{'len'} ) { + return length $self->{'seq'}; + } + return $self->{'len'}; +} + +=head2 label + + Title : label + Usage : $obj->label(); + Function: + + Sets and returns the label of the affected original allele + location. Label is a stable identifier whereas location + can be changed by mutations. Label comes from + l. + + Example : + Returns : string + Args : string + +=cut + +sub label { + my ($self,$value) = @_; + if ( defined $value) { + $self->{'label'} = $value; + } + if ( ! exists $self->{'label'} ) { + return undef; + } + return $self->{'label'}; +} + + +=head2 transpos + + Title : transpos + Usage : $obj->transpos(); + Function: + + Sets and returns the transcript position of the mutation. + Set when associated with a reference sequence. Value + depends on reference molecule and the co-ordinate system + used. + + Example : + Returns : string + Args : integer + +=cut + + +sub transpos { + my ($self,$value) = @_; + if( defined $value) { + if ( $value !~ /^([+-])?\d+$/ ) { + $self->throw("[$value] for transpos has to be an integer\n"); + } else { + $self->{'transpos'} = $value; + } + } + return $self->{'transpos'}; +} + + +=head2 issue + + Title : issue + Usage : $obj->issue(); + Function: + + Sets and returns the position of the mutation in an array + of mutations to be issued. Set after the validity of the + mutation has been confirmed. + + Example : + Returns : string + Args : integer + +=cut + + +sub issue { + my ($self,$value) = @_; + if( defined $value) { + if ( $value !~ /^([+-])?\d+$/ ) { + $self->throw("[$value] for issue has to be an integer\n"); + } else { + $self->{'issue'} = $value; + } + } + return $self->{'issue'}; +} + + +=head2 prelabel + + Title : prelabel + Usage : $obj->prelabel(); + Function: + + Sets and returns the prelabel of the affected original allele + location. Prelabel is a stable identifier whereas location + can be changed by mutations. Prelabel comes from + l. + + Example : + Returns : string + Args : string + +=cut + +sub prelabel { + my ($self,$value) = @_; + if ( defined $value) { + $self->{'prelabel'} = $value; + } + if ( ! exists $self->{'prelabel'} ) { + return undef; + } + return $self->{'prelabel'}; +} + + +=head2 postlabel + + Title : postlabel + Usage : $obj->postlabel(); + Function: + + Sets and returns the postlabel of the affected original allele + location. Postlabel is a stable identifier whereas location + can be changed by mutations. Postlabel comes from + l. + + Example : + Returns : string + Args : string + +=cut + +sub postlabel { + my ($self,$value) = @_; + if ( defined $value) { + $self->{'postlabel'} = $value; + } + if ( ! exists $self->{'postlabel'} ) { + return undef; + } + return $self->{'postlabel'}; +} + + +=head2 lastlabel + + Title : lastlabel + Usage : $obj->lastlabel(); + Function: + + Sets and returns the lastlabel of the affected original allele + location. Lastlabel is a stable identifier whereas location + can be changed by mutations. Lastlabel comes from + l. + + Example : + Returns : string + Args : string + +=cut + +sub lastlabel { + my ($self,$value) = @_; + if ( defined $value) { + $self->{'lastlabel'} = $value; + } + if ( ! exists $self->{'lastlabel'} ) { + return undef; + } + return $self->{'lastlabel'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Mutator.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Mutator.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1442 @@ +# $Id: Mutator.pm,v 1.26 2002/10/22 07:38:34 lapp Exp $ +# +# bioperl module for Bio::LiveSeq::Mutator +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Mutator - Package mutating LiveSequences + +=head1 SYNOPSIS + + # $gene is a Bio::LiveSeq::Gene object + my $mutate = Bio::LiveSeq::Mutator->new('-gene' => $gene, + '-numbering' => "coding" + ); + # $mut is a Bio::LiveSeq::Mutation object + $mutate->add_Mutation($mut); + # $results is a Bio::Variation::SeqDiff object + my $results=$mutate->change_gene(); + if ($results) { + my $out = Bio::Variation::IO->new( '-format' => 'flat'); + $out->write($results); + } + +=head1 DESCRIPTION + +This class mutates Bio::LiveSeq::Gene objects and returns a +Bio::Variation::SeqDiff object. Mutations are described as +Bio::LiveSeq::Mutation objects. See L, +L, and L for details. + +=head1 FEEDBACK + + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho & Joseph A.L. Insana + + Email: heikki@ebi.ac.uk + insana@ebi.ac.uk, jinsana@gmx.net + + Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + + The rest of the documentation details each of the object + methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Mutator; +use vars qw(@ISA); +use strict; + +use vars qw($VERSION @ISA); +use Bio::Variation::SeqDiff; +use Bio::Variation::DNAMutation; +use Bio::Variation::RNAChange; +use Bio::Variation::AAChange; +use Bio::Variation::Allele; +use Bio::LiveSeq::Mutation; + +#use integer; +# Object preamble - inheritance + +use Bio::Root::Root; + +@ISA = qw( Bio::Root::Root ); + +sub new { + my($class,@args) = @_; + my $self; + $self = {}; + bless $self, $class; + + my ($gene, $numbering) = + $self->_rearrange([qw(GENE + NUMBERING + )], + @args); + + $self->{ 'mutations' } = []; + + $gene && $self->gene($gene); + $numbering && $self->numbering($numbering); + + #class constant; + $self->{'flanklen'} = 25; + return $self; # success - we hope! +} + +=head2 gene + + Title : gene + Usage : $mutobj = $obj->gene; + : $mutobj = $obj->gene($objref); + Function: + + Returns or sets the link-reference to a + Bio::LiveSeq::Gene object. If no value has ben set, it + will return undef + + Returns : an object reference or undef + Args : a Bio::LiveSeq::Gene + +See L for more information. + +=cut + +sub gene { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::LiveSeq::Gene') ) { + $self->throw("Is not a Bio::LiveSeq::Gene object but a [$value]"); + return undef; + } + else { + $self->{'gene'} = $value; + } + } + unless (exists $self->{'gene'}) { + return (undef); + } else { + return $self->{'gene'}; + } +} + + +=head2 numbering + + Title : numbering + Usage : $obj->numbering(); + Function: + + Sets and returns coordinate system used in positioning the + mutations. See L for details. + + Example : + Returns : string + Args : string (coding [transcript number] | gene | entry) + +=cut + + +sub numbering { + my ($self,$value) = @_; + if( defined $value) { + if ($value =~ /(coding)( )?(\d+)?/ or $value eq 'entry' or $value eq 'gene') { + $self->{'numbering'} = $value; + } else { # defaulting to 'coding' + $self->{'numbering'} = 'coding'; + } + } + unless (exists $self->{'numbering'}) { + return 'coding'; + } else { + return $self->{'numbering'}; + } +} + +=head2 add_Mutation + + Title : add_Mutation + Usage : $self->add_Mutation($ref) + Function: adds a Bio::LiveSeq::Mutation object + Example : + Returns : + Args : a Bio::LiveSeq::Mutation + +See L for more information. + +=cut + +sub add_Mutation{ + my ($self,$value) = @_; + if( $value->isa('Bio::Liveseq::Mutation') ) { + my $com = ref $value; + $self->throw("Is not a Mutation object but a [$com]" ); + return undef; + } + if (! $value->pos) { + $self->warn("No value for mutation position in the sequence!"); + return undef; + } + if (! $value->seq && ! $value->len) { + $self->warn("Either mutated sequence or length of the deletion must be given!"); + return undef; + } + push(@{$self->{'mutations'}},$value); +} + +=head2 each_Mutation + + Title : each_Mutation + Usage : foreach $ref ( $a->each_Mutation ) + Function: gets an array of Bio::LiveSeq::Mutation objects + Example : + Returns : array of Mutations + Args : + +See L for more information. + +=cut + +sub each_Mutation{ + my ($self) = @_; + return @{$self->{'mutations'}}; +} + + +=head2 mutation + + Title : mutation + Usage : $mutobj = $obj->mutation; + : $mutobj = $obj->mutation($objref); + Function: + + Returns or sets the link-reference to the current mutation + object. If the value is not set, it will return undef. + Internal method. + + Returns : an object reference or undef + +=cut + + +sub mutation { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::LiveSeq::Mutation') ) { + $self->throw("Is not a Bio::LiveSeq::Mutation object but a [$value]"); + return undef; + } + else { + $self->{'mutation'} = $value; + } + } + unless (exists $self->{'mutation'}) { + return (undef); + } else { + return $self->{'mutation'}; + } +} + +=head2 DNA + + Title : DNA + Usage : $mutobj = $obj->DNA; + : $mutobj = $obj->DNA($objref); + Function: + + Returns or sets the reference to the LiveSeq object holding + the reference sequence. If there is no link, it will return + undef. + Internal method. + + Returns : an object reference or undef + +=cut + +sub DNA { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::LiveSeq::DNA') and ! $value->isa('Bio::LiveSeq::Transcript') ) { + $self->throw("Is not a Bio::LiveSeq::DNA/Transcript object but a [$value]"); + return undef; + } + else { + $self->{'DNA'} = $value; + } + } + unless (exists $self->{'DNA'}) { + return (undef); + } else { + return $self->{'DNA'}; + } +} + + +=head2 RNA + + Title : RNA + Usage : $mutobj = $obj->RNA; + : $mutobj = $obj->RNA($objref); + Function: + + Returns or sets the reference to the LiveSeq object holding + the reference sequence. If the value is not set, it will return + undef. + Internal method. + + Returns : an object reference or undef + +=cut + + +sub RNA { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::LiveSeq::Transcript') ) { + $self->throw("Is not a Bio::LiveSeq::RNA/Transcript object but a [$value]"); + return undef; + } + else { + $self->{'RNA'} = $value; + } + } + unless (exists $self->{'RNA'}) { + return (undef); + } else { + return $self->{'RNA'}; + } +} + + +=head2 dnamut + + Title : dnamut + Usage : $mutobj = $obj->dnamut; + : $mutobj = $obj->dnamut($objref); + Function: + + Returns or sets the reference to the current DNAMutation object. + If the value is not set, it will return undef. + Internal method. + + Returns : a Bio::Variation::DNAMutation object or undef + +See L for more information. + +=cut + + +sub dnamut { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::DNAMutation') ) { + $self->throw("Is not a Bio::Variation::DNAMutation object but a [$value]"); + return undef; + } + else { + $self->{'dnamut'} = $value; + } + } + unless (exists $self->{'dnamut'}) { + return (undef); + } else { + return $self->{'dnamut'}; + } +} + + +=head2 rnachange + + Title : rnachange + Usage : $mutobj = $obj->rnachange; + : $mutobj = $obj->rnachange($objref); + Function: + + Returns or sets the reference to the current RNAChange object. + If the value is not set, it will return undef. + Internal method. + + Returns : a Bio::Variation::RNAChange object or undef + +See L for more information. + +=cut + + +sub rnachange { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::RNAChange') ) { + $self->throw("Is not a Bio::Variation::RNAChange object but a [$value]"); + return undef; + } + else { + $self->{'rnachange'} = $value; + } + } + unless (exists $self->{'rnachange'}) { + return (undef); + } else { + return $self->{'rnachange'}; + } +} + + +=head2 aachange + + Title : aachange + Usage : $mutobj = $obj->aachange; + : $mutobj = $obj->aachange($objref); + Function: + + Returns or sets the reference to the current AAChange object. + If the value is not set, it will return undef. + Internal method. + + Returns : a Bio::Variation::AAChange object or undef + +See L for more information. + +=cut + + +sub aachange { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::AAChange') ) { + $self->throw("Is not a Bio::Variation::AAChange object but a [$value]"); + return undef; + } + else { + $self->{'aachange'} = $value; + } + } + unless (exists $self->{'aachange'}) { + return (undef); + } else { + return $self->{'aachange'}; + } +} + + +=head2 exons + + Title : exons + Usage : $mutobj = $obj->exons; + : $mutobj = $obj->exons($objref); + Function: + + Returns or sets the reference to a current array of Exons. + If the value is not set, it will return undef. + Internal method. + + Returns : an array of Bio::LiveSeq::Exon objects or undef + +See L for more information. + +=cut + + +sub exons { + my ($self,$value) = @_; + if (defined $value) { + $self->{'exons'} = $value; + } + unless (exists $self->{'exons'}) { + return (undef); + } else { + return $self->{'exons'}; + } +} + +=head2 change_gene_with_alignment + + Title : change_gene_with_alignment + Usage : $results=$mutate->change_gene_with_alignment($aln); + + Function: + + Returns a Bio::Variation::SeqDiff object containing the + results of the changes in the alignment. The alignment has + to be pairwise and have one sequence named 'QUERY', the + other one is assumed to be a part of the sequence from + $gene. + + This method offers a shortcut to change_gene and + automates the creation of Bio::LiveSeq::Mutation objects. + Use it with almost identical sequnces, e.g. to locate a SNP. + + Args : Bio::SimpleAlign object representing a short local alignment + Returns : Bio::Variation::SeqDiff object or 0 on error + +See L, L, and +L for more information. + +=cut + +sub change_gene_with_alignment { + my ($self, $aln) = @_; + + # + # Sanity checks + # + + $self->throw("Argument is not a Bio::SimpleAlign object but a [$aln]") + unless $aln->isa('Bio::SimpleAlign'); + $self->throw("'Pairwise alignments only, please") + if $aln->no_sequences != 2; + + # find out the order the two sequences are given + my $queryseq_pos = 1; #default + my $refseq_pos = 2; + unless ($aln->get_seq_by_pos(1)->id eq 'QUERY') { + carp('Query sequence has to be named QUERY') + if $aln->get_seq_by_pos(2)->id ne 'QUERY'; + $queryseq_pos = 2; # alternative + $refseq_pos = 1; + } + + # trim the alignment + my $start = $aln->column_from_residue_number('QUERY', 1); + my $end = $aln->column_from_residue_number('QUERY', + $aln->get_seq_by_pos($queryseq_pos)->end ); + + my $aln2 = $aln->slice($start, $end); + + # + # extracting mutations + # + + my $cs = $aln2->consensus_string(51); + my $queryseq = $aln2->get_seq_by_pos($queryseq_pos); + my $refseq = $aln2->get_seq_by_pos($refseq_pos); + + while ( $cs =~ /(\?+)/g) { + # pos in local coordinates + my $pos = pos($cs) - length($1) + 1; + my $mutation = create_mutation($self, + $refseq, + $queryseq, + $pos, + CORE::length($1) + ); + # reset pos to refseq coordinates + $pos += $refseq->start - 1; + $mutation->pos($pos); + + $self->add_Mutation($mutation); + } + return $self->change_gene(); +} + +=head2 create_mutation + + Title : create_mutation + Usage : + Function: + + Formats sequence differences from two sequences into + Bio::LiveSeq::Mutation objects which can be applied to a + gene. + + To keep it generic, sequence arguments need not to be + Bio::LocatableSeq. Coordinate change to parent sequence + numbering needs to be done by the calling code. + + Called from change_gene_with_alignment + + Args : Bio::PrimarySeqI inheriting object for the reference sequence + Bio::PrimarySeqI inheriting object for the query sequence + integer for the start position of the local sequence difference + integer for the length of the sequence difference + Returns : Bio::LiveSeq::Mutation object + +=cut + +sub create_mutation { + my ($self, $refseq, $queryseq, $pos, $len) = @_; + + $self->throw("Is not a Bio::PrimarySeqI object but a [$refseq]") + unless $refseq->isa('Bio::PrimarySeqI'); + $self->throw("Is not a Bio::PrimarySeqI object but a [$queryseq]") + unless $queryseq->isa('Bio::PrimarySeqI'); + $self->throw("Position is not a positive integer but [$pos]") + unless $pos =~ /^\+?\d+$/; + $self->throw("Length is not a positive integer but [$len]") + unless $len =~ /^\+?\d+$/; + + my $mutation; + my $refstring = $refseq->subseq($pos, $pos + $len - 1); + my $varstring = $queryseq->subseq($pos, $pos + $len - 1); + + if ($len == 1 and $refstring =~ /[^\.\-\*\?]/ and + $varstring =~ /[^\.\-\*\?]/ ) { #point + $mutation = new Bio::LiveSeq::Mutation (-seq => $varstring, + -pos => $pos, + ); + } + elsif ( $refstring =~ /^[^\.\-\*\?]+$/ and + $varstring !~ /^[^\.\-\*\?]+$/ ) { # deletion + $mutation = new Bio::LiveSeq::Mutation (-pos => $pos, + -len => $len + ); + } + elsif ( $refstring !~ /^[^\.\-\*\?]+$/ and + $varstring =~ /^[^\.\-\*\?]+$/ ) { # insertion + $mutation = new Bio::LiveSeq::Mutation (-seq => $varstring, + -pos => $pos, + -len => 0 + ); + } else { # complex + $mutation = new Bio::LiveSeq::Mutation (-seq => $varstring, + -pos => $pos, + -len => $len + ); + } + + return $mutation; +} + +=head2 change_gene + + Title : change_gene + Usage : my $mutate = Bio::LiveSeq::Mutator->new(-gene => $gene, + numbering => "coding" + ); + # $mut is Bio::LiveSeq::Mutation object + $mutate->add_Mutation($mut); + my $results=$mutate->change_gene(); + + Function: + + Returns a Bio::Variation::SeqDiff object containing the + results of the changes performed according to the + instructions present in Mutation(s). The -numbering + argument decides what molecule is being changed and what + numbering scheme being used: + + -numbering => "entry" + + determines the DNA level, using the numbering from the + beginning of the sequence + + -numbering => "coding" + + determines the RNA level, using the numbering from the + beginning of the 1st transcript + + Alternative transcripts can be used by specifying + "coding 2" or "coding 3" ... + + -numbering => "gene" + + determines the DNA level, using the numbering from the + beginning of the 1st transcript and inluding introns. + The meaning equals 'coding' if the reference molecule + is cDNA. + + Args : Bio::LiveSeq::Gene object + Bio::LiveSeq::Mutation object(s) + string specifying a numbering scheme (defaults to 'coding') + Returns : Bio::Variation::SeqDiff object or 0 on error + +=cut + +sub change_gene { + my ($self) = @_; + + # + # Sanity check + # + unless ($self->gene) { + $self->warn("Input object Bio::LiveSeq::Gene is not given"); + return 0; + } + # + # Setting the reference sequence based on -numbering + # + my @transcripts=@{$self->gene->get_Transcripts}; + my $refseq; # will hold Bio::LiveSeq:Transcript object or Bio::LiveSeq::DNA + + # 'gene' eq 'coding' if reference sequence is cDNA + $self->numbering ('coding') if $self->gene->get_DNA->alphabet eq 'rna' and $self->numbering eq 'gene'; + + if ($self->numbering =~ /(coding)( )?(\d+)?/ ) { + $self->numbering($1); + my $transnumber = $3; + $transnumber-- if $3; # 1 -> 0, 2 -> 1 + if ($transnumber && $transnumber >= 0 && $transnumber <= $#transcripts) { + $refseq=$transcripts[$transnumber]; + } else { + $transnumber && $self->warn("The alternative transcript number ". $transnumber+1 . + "- does not exist. Reverting to the 1st transcript\n"); + $refseq=$transcripts[0]; + } + } else { + $refseq=$transcripts[0]->{'seq'}; + } + # + # Recording the state: SeqDiff object creation ?? transcript no.?? + # + my $seqDiff = Bio::Variation::SeqDiff->new(); + $seqDiff->alphabet($self->gene->get_DNA->alphabet); + $seqDiff->numbering($self->numbering); + my ($DNAobj, $RNAobj); + if ($refseq->isa("Bio::LiveSeq::Transcript")) { + $self->RNA($refseq); + $self->DNA($refseq->{'seq'}); + $seqDiff->rna_ori($refseq->seq ); + $seqDiff->aa_ori($refseq->get_Translation->seq); + } else { + $self->DNA($refseq); + $self->RNA($transcripts[0]); + $seqDiff->rna_ori($self->RNA->seq); + $seqDiff->aa_ori($self->RNA->get_Translation->seq); + } + $seqDiff->dna_ori($self->DNA->seq); + # put the accession number into the SeqDiff object ID + $seqDiff->id($self->DNA->accession_number); + + # the atg_offset takes in account that DNA object could be a subset of the + # whole entry (via the light_weight loader) + my $atg_label=$self->RNA->start; + my $atg_offset=$self->DNA->position($atg_label)+($self->DNA->start)-1; + $seqDiff->offset($atg_offset - 1); + $self->DNA->coordinate_start($atg_label); + + my @exons = $self->RNA->all_Exons; + $seqDiff->cds_end($exons[$#exons]->end); + + # + # Converting mutation positions to labels + # + $self->warn("no mutations"), return 0 + unless $self->_mutationpos2label($refseq, $seqDiff); + + # need to add more than one rna & aa + #foreach $transcript (@transcripts) { + # $seqDiff{"ori_transcript_${i}_seq"}=$transcript->seq; + # $seqDiff{"ori_translation_${i}_seq"}=$transcript->get_Translation->seq; + #} + + # do changes + my $k; + foreach my $mutation ($self->each_Mutation) { + next unless $mutation->label > 0; + $self->mutation($mutation); + + $mutation->issue(++$k); + # + # current position on the transcript + # + if ($self->numbering =~ /coding/) { + $mutation->transpos($mutation->pos); # transpos given by user + } else { + #transpos of label / It will be 0 if mutating an intron, negative if upstream of ATG + $mutation->transpos($self->RNA->position($mutation->label,$atg_label)); + } + # + # Calculate adjacent labels based on the position on the current sequence + # + $mutation->prelabel($self->DNA->label(-1, $mutation->label)); # 1 before label + if ($mutation->len == 0) { + $mutation->postlabel($mutation->label); + $mutation->lastlabel($mutation->label); + } elsif ($mutation->len == 1) { + $mutation->lastlabel($mutation->label); # last nucleotide affected + $mutation->postlabel($self->DNA->label(2,$mutation->lastlabel)); # $len after label + } else { + $mutation->lastlabel($self->DNA->label($mutation->len,$mutation->label)); + $mutation->postlabel($self->DNA->label(2,$mutation->lastlabel)); + } + my $dnamut = $self->_set_DNAMutation($seqDiff); + # + # + # + if ($self->_rnaAffected) { + $self->_set_effects($seqDiff, $dnamut); + } + elsif ($seqDiff->offset != 0 and $dnamut->region ne 'intron') { + $self->_untranslated ($seqDiff, $dnamut); + } else { + #$self->warn("Mutation starts outside coding region, RNAChange object not created"); + } + + ######################################################################### + # Mutations are done here! # + $refseq->labelchange($mutation->seq, $mutation->label, $mutation->len); # + ######################################################################### + + $self->_post_mutation ($seqDiff); + + $self->dnamut(undef); + $self->rnachange(undef); + $self->aachange(undef); + $self->exons(undef); + } + # record the final state of all three sequences + $seqDiff->dna_mut($self->DNA->seq); + $seqDiff->rna_mut($self->RNA->seq); + if ($refseq->isa("Bio::LiveSeq::Transcript")) { + $seqDiff->aa_mut($refseq->get_Translation->seq); + } else { + $seqDiff->aa_mut($self->RNA->get_Translation->seq); + } + + #$seqDiff{mut_dna_seq}=$gene->get_DNA->seq; + #my $i=1; + #foreach $transcript (@transcripts) { + # $seqDiff{"mut_transcript_${i}_seq"}=$transcript->seq; + # $seqDiff{"mut_translation_${i}_seq"}=$transcript->get_Translation->seq; + #} + return $seqDiff; +} + +=head2 _mutationpos2label + + Title : _mutationpos2label + Usage : + Function: converts mutation positions into labels + Example : + Returns : number of valid mutations + Args : LiveSeq sequence object + +=cut + +sub _mutationpos2label { + my ($self, $refseq, $SeqDiff) = @_; + my $count; + my @bb = @{$self->{'mutations'}}; + my $cc = scalar @bb; + foreach my $mut (@{$self->{'mutations'}}) { +# if ($self->numbering eq 'gene' and $mut->pos < 1) { +# my $tmp = $mut->pos; +# print STDERR "pos: ", "$tmp\n"; +# $tmp++ if $tmp < 1; +# $tmp += $SeqDiff->offset; +# print STDERR "pos2: ", "$tmp\n"; +# $mut->pos($tmp); +# } +# elsif ($self->numbering eq 'entry') { + if ($self->numbering eq 'entry') { + my $tmp = $mut->pos; + $tmp -= $SeqDiff->offset; + $tmp-- if $tmp < 1; + $mut->pos($tmp); + } + + my $label = $refseq->label($mut->pos); # get the label for the position + $mut->label($label), $count++ if $label > 0 ; + #print STDERR "x", $mut->pos,'|' ,$mut->label, "\n"; + } + return $count; +} + +# +# Calculate labels around mutated nucleotide +# + +=head2 _set_DNAMutation + + Title : _set_DNAMutation + Usage : + Function: + + Stores DNA level mutation attributes before mutation into + Bio::Variation::DNAMutation object. Links it to SeqDiff + object. + + Example : + Returns : Bio::Variation::DNAMutation object + Args : Bio::Variation::SeqDiff object + +See L and L. + +=cut + +sub _set_DNAMutation { + my ($self, $seqDiff) = @_; + + my $dnamut_start = $self->mutation->label - $seqDiff->offset; + # if negative DNA positions (before ATG) + $dnamut_start-- if $dnamut_start <= 0; + my $dnamut_end; + ($self->mutation->len == 0 or $self->mutation->len == 1) ? + ($dnamut_end = $dnamut_start) : + ($dnamut_end = $dnamut_start+$self->mutation->len); + #print "start:$dnamut_start, end:$dnamut_end\n"; + my $dnamut = Bio::Variation::DNAMutation->new(-start => $dnamut_start, + -end => $dnamut_end, + ); + $dnamut->mut_number($self->mutation->issue); + $dnamut->isMutation(1); + my $da_m = Bio::Variation::Allele->new; + $da_m->seq($self->mutation->seq) if $self->mutation->seq; + $dnamut->allele_mut($da_m); + $dnamut->add_Allele($da_m); + # allele_ori + my $allele_ori = $self->DNA->labelsubseq($self->mutation->prelabel, + undef, + $self->mutation->postlabel); # get seq + chop $allele_ori; # chop the postlabel nucleotide + $allele_ori=substr($allele_ori,1); # away the prelabel nucleotide + my $da_o = Bio::Variation::Allele->new; + $da_o->seq($allele_ori) if $allele_ori; + $dnamut->allele_ori($da_o); + ($self->mutation->len == 0) ? + ($dnamut->length($self->mutation->len)) : ($dnamut->length(CORE::length $allele_ori)); + #print " --------------- $dnamut_start -$len- $dnamut_end -\n"; + $seqDiff->add_Variant($dnamut); + $self->dnamut($dnamut); + $dnamut->mut_number($self->mutation->issue); + # setting proof + if ($seqDiff->numbering eq "entry" or $seqDiff->numbering eq "gene") { + $dnamut->proof('experimental'); + } else { + $dnamut->proof('computed'); + } + # how many nucleotides to store upstream and downstream of the change + my $flanklen = $self->{'flanklen'}; + #print `date`, " flanking sequences start\n"; + my $uplabel = $self->DNA->label(1-$flanklen,$self->mutation->prelabel); # this could be unavailable! + + my $upstreamseq; + if ($uplabel > 0) { + $upstreamseq = + $self->DNA->labelsubseq($uplabel, undef, $self->mutation->prelabel); + } else { # from start (less than $flanklen nucleotides) + $upstreamseq = + $self->DNA->labelsubseq($self->DNA->start, undef, $self->mutation->prelabel); + } + $dnamut->upStreamSeq($upstreamseq); + my $dnstreamseq = $self->DNA->labelsubseq($self->mutation->postlabel, $flanklen); + $dnamut->dnStreamSeq($dnstreamseq); # $flanklen or less nucleotides + return $dnamut; +} + + +# +### Check if mutation propagates to RNA (and AA) level +# +# side effect: sets intron/exon information +# returns a boolean value +# + +sub _rnaAffected { + my ($self) = @_; + my @exons=$self->RNA->all_Exons; + my $RNAstart=$self->RNA->start; + my $RNAend=$self->RNA->end; + my ($firstexon,$before,$after,$i); + my ($rnaAffected) = 0; + + # check for inserted labels (that require follows instead of <,>) + my $DNAend=$self->RNA->{'seq'}->end; + if ($self->mutation->prelabel > $DNAend or $self->mutation->postlabel > $DNAend) { + #this means one of the two labels is an inserted one + #(coming from a previous mutation. This would falsify all <,> + #checks, so the follow() has to be used + $self->warn("Attention, workaround not fully tested yet! Expect unpredictable results.\n"); + if (($self->mutation->postlabel==$RNAstart) or (follows($self->mutation->postlabel,$RNAstart))) { + $self->warn("RNA not affected because change occurs before RNAstart"); + } + elsif (($RNAend==$self->mutation->prelabel) or (follows($RNAend,$self->mutation->prelabel))) { + $self->warn("RNA not affected because change occurs after RNAend"); + } + elsif (scalar @exons == 1) { + #no introns, just one exon + $rnaAffected = 1; # then RNA is affected! + } else { + # otherwise check for change occurring inside an intron + $firstexon=shift(@exons); + $before=$firstexon->end; + + foreach $i (0..$#exons) { + $after=$exons[$i]->start; + if (follows($self->mutation->prelabel,$before) or + ($after==$self->mutation->prelabel) or + follows($after,$self->mutation->prelabel) or + follows($after,$self->mutation->postlabel)) { + + $rnaAffected = 1; + # $i is number of exon and can be used for proximity check + } + $before=$exons[$i]->end; + } + + } + } else { + my $strand = $exons[0]->strand; + if (($strand == 1 and $self->mutation->postlabel <= $RNAstart) or + ($strand != 1 and $self->mutation->postlabel >= $RNAstart)) { + #$self->warn("RNA not affected because change occurs before RNAstart"); + $rnaAffected = 0; + } + elsif (($strand == 1 and $self->mutation->prelabel >= $RNAend) or + ($strand != 1 and $self->mutation->prelabel <= $RNAend)) { + #$self->warn("RNA not affected because change occurs after RNAend"); + $rnaAffected = 0; + my $dist; + if ($strand == 1){ + $dist = $self->mutation->prelabel - $RNAend; + } else { + $dist = $RNAend - $self->mutation->prelabel; + } + $self->dnamut->region_dist($dist); + } + elsif (scalar @exons == 1) { + #if just one exon -> no introns, + $rnaAffected = 1; # then RNA is affected! + } else { + # otherwise check for mutation occurring inside an intron + $firstexon=shift(@exons); + $before=$firstexon->end; + if ( ($strand == 1 and $self->mutation->prelabel < $before) or + ($strand == -1 and $self->mutation->prelabel > $before) + ) { + $rnaAffected = 1 ; + + #print "Exon 1 : ", $firstexon->start, " - ", $firstexon->end, "
\n"; + my $afterdist = $self->mutation->prelabel - $firstexon->start; + my $beforedist = $firstexon->end - $self->mutation->postlabel; + my $exonvalue = $i + 1; + $self->dnamut->region('exon'); + $self->dnamut->region_value($exonvalue); + if ($afterdist < $beforedist) { + $afterdist++; + $afterdist++; + $self->dnamut->region_dist($afterdist); + #print "splice site $afterdist nt upstream!
"; + } else { + $self->dnamut->region_dist($beforedist); + #print "splice site $beforedist nt downstream!
"; + } + } else { + #print "first exon : ", $firstexon->start, " - ", $firstexon->end, "
\n"; + foreach $i (0..$#exons) { + $after=$exons[$i]->start; + #proximity test for intronic mutations + if ( ($strand == 1 and + $self->mutation->prelabel >= $before and + $self->mutation->postlabel <= $after) + or + ($strand == -1 and + $self->mutation->prelabel <= $before and + $self->mutation->postlabel >= $after) ) { + $self->dnamut->region('intron'); + #$self->dnamut->region_value($i); + my $afterdist = $self->mutation->prelabel - $before; + my $beforedist = $after - $self->mutation->postlabel; + my $intronvalue = $i + 1; + if ($afterdist < $beforedist) { + $afterdist++; + $self->dnamut->region_value($intronvalue); + $self->dnamut->region_dist($afterdist); + #print "splice site $afterdist nt upstream!
"; + } else { + $self->dnamut->region_value($intronvalue); + $self->dnamut->region_dist($beforedist * -1); + #print "splice site $beforedist nt downstream!
"; + } + $self->rnachange(undef); + last; + } + #proximity test for exon mutations + elsif ( ( $strand == 1 and + $exons[$i]->start <= $self->mutation->prelabel and + $exons[$i]->end >= $self->mutation->postlabel) or + ( $strand == -1 and + $exons[$i]->start >= $self->mutation->prelabel and + $exons[$i]->end <= $self->mutation->postlabel) ) { + $rnaAffected = 1; + + my $afterdist = $self->mutation->prelabel - $exons[$i]->start; + my $beforedist = $exons[$i]->end - $self->mutation->postlabel; + my $exonvalue = $i + 1; + $self->dnamut->region('exon'); + if ($afterdist < $beforedist) { + $afterdist++; + $self->dnamut->region_value($exonvalue+1); + $self->dnamut->region_dist($afterdist); + #print "splice site $afterdist nt upstream!
"; + } else { + #$beforedist; + $self->dnamut->region_value($exonvalue+1); + $self->dnamut->region_dist($beforedist * -1); + #print "splice site $beforedist nt downstream!
"; + } + last; + } + $before=$exons[$i]->end; + } + } + } + } + #$self->warn("RNA not affected because change occurs inside an intron"); + #return(0); # if still not returned, then not affected, return 0 + return $rnaAffected; +} + +# +# ### Creation of RNA and AA variation objects +# + +=head2 _set_effects + + Title : _set_effects + Usage : + Function: + + Stores RNA and AA level mutation attributes before mutation + into Bio::Variation::RNAChange and + Bio::Variation::AAChange objects. Links them to + SeqDiff object. + + Example : + Returns : + Args : Bio::Variation::SeqDiff object + Bio::Variation::DNAMutation object + +See L, L, +L, and L. + +=cut + +sub _set_effects { + my ($self, $seqDiff, $dnamut) = @_; + my ($rnapos_end, $upstreamseq, $dnstreamseq); + my $flanklen = $self->{'flanklen'}; + + ($self->mutation->len == 0) ? + ($rnapos_end = $self->mutation->transpos) : + ($rnapos_end = $self->mutation->transpos + $self->mutation->len -1); + my $rnachange = Bio::Variation::RNAChange->new(-start => $self->mutation->transpos, + -end => $rnapos_end + ); + $rnachange->isMutation(1); + + # setting proof + if ($seqDiff->numbering eq "coding") { + $rnachange->proof('experimental'); + } else { + $rnachange->proof('computed'); + } + + $seqDiff->add_Variant($rnachange); + $self->rnachange($rnachange); + $rnachange->DNAMutation($dnamut); + $dnamut->RNAChange($rnachange); + $rnachange->mut_number($self->mutation->issue); + + # setting the codon_position of the "start" nucleotide of the change + $rnachange->codon_pos(($self->RNA->frame($self->mutation->label))+1); # codon_pos=frame+1 + + my @exons=$self->RNA->all_Exons; + $self->exons(\@exons); + #print `date`, " before flank, after exons. RNAObj query\n"; + # if cannot retrieve from Transcript, Transcript::upstream_seq will be used + # before "fac7 g 65" bug discovered + # $uplabel=$self->RNA->label(1-$flanklen,$prelabel); + my $RNAprelabel=$self->RNA->label(-1,$self->mutation->label); # to fix fac7g65 bug + # for the fix, all prelabel used in the next block have been changed to RNAprelabel + my $uplabel=$self->RNA->label(1-$flanklen,$RNAprelabel); + if ($self->RNA->valid($uplabel)) { + $upstreamseq = $self->RNA->labelsubseq($uplabel, undef, $RNAprelabel); + } else { + $upstreamseq = $self->RNA->labelsubseq($self->RNA->start, undef, $RNAprelabel) + if $self->RNA->valid($RNAprelabel); + my $lacking=$flanklen-length($upstreamseq); # how many missing + my $upstream_atg=$exons[0]->subseq(-$lacking,-1); + $upstreamseq=$upstream_atg . $upstreamseq; + } + + $rnachange->upStreamSeq($upstreamseq); + + # won't work OK if postlabel NOT in Transcript + # now added RNApostlabel but this has to be /fully tested/ + # for the fix, all postlabel used in the next block have been changed to RNApostlabel + my $RNApostlabel; # to fix fac7g64 bug + if ($self->mutation->len == 0) { + $RNApostlabel=$self->mutation->label; + } else { + my $mutlen = 1 + $self->mutation->len; + $RNApostlabel=$self->RNA->label($mutlen,$self->mutation->label); + } + $dnstreamseq=$self->RNA->labelsubseq($RNApostlabel, $flanklen); + if ($dnstreamseq eq '-1') { # if out of transcript was requested + my $lastexon=$exons[-1]; + my $lastexonlength=$lastexon->length; + $dnstreamseq=$self->RNA->labelsubseq($RNApostlabel); # retrieves till RNAend + my $lacking=$flanklen-length($dnstreamseq); # how many missing + my $downstream_stop=$lastexon->subseq($lastexonlength+1,undef,$lacking); + $dnstreamseq .= $downstream_stop; + } else { + $rnachange->dnStreamSeq($dnstreamseq); + } + # AAChange creation + my $AAobj=$self->RNA->get_Translation; + # storage of prelabel here, to be used in create_mut_objs_after + my $aachange = Bio::Variation::AAChange->new(-start => $RNAprelabel + ); + $aachange->isMutation(1); + $aachange->proof('computed'); + + $seqDiff->add_Variant($aachange); + $self->aachange($aachange); + $rnachange->AAChange($aachange); + $aachange->RNAChange($rnachange); + + $aachange->mut_number($self->mutation->issue); +# $before_mutation{aachange}=$aachange; + + my $ra_o = Bio::Variation::Allele->new; + $ra_o->seq($dnamut->allele_ori->seq) if $dnamut->allele_ori->seq; + $rnachange->allele_ori($ra_o); + + $rnachange->length(CORE::length $rnachange->allele_ori->seq); + + my $ra_m = Bio::Variation::Allele->new; + $ra_m->seq($self->mutation->seq) if $self->mutation->seq; + $rnachange->allele_mut($ra_m); + $rnachange->add_Allele($ra_m); + + #$rnachange->allele_mut($seq); + $rnachange->end($rnachange->start) if $rnachange->length == 0; + + # this holds the aminoacid sequence that will be affected by the mutation + my $aa_allele_ori=$AAobj->labelsubseq($self->mutation->label,undef, + $self->mutation->lastlabel); + + my $aa_o = Bio::Variation::Allele->new; + $aa_o->seq($aa_allele_ori) if $aa_allele_ori; + $aachange->allele_ori($aa_o); + #$aachange->allele_ori($aa_allele_ori); + + my $aa_length_ori = length($aa_allele_ori); + $aachange->length($aa_length_ori); #print "==========$aa_length_ori\n"; + $aachange->end($aachange->start + $aa_length_ori - 1 ); +} + +=head2 _untranslated + + Title : _untranslated + Usage : + Function: + + Stores RNA change attributes before mutation + into Bio::Variation::RNAChange object. Links it to + SeqDiff object. + + Example : + Returns : + Args : Bio::Variation::SeqDiff object + Bio::Variation::DNAMutation object + +See L, L and +L for details. + +=cut + +sub _untranslated { + my ($self, $seqDiff, $dnamut) = @_; + my $rnapos_end; + ($self->mutation->len == 0) ? + ($rnapos_end = $self->mutation->transpos) : + ($rnapos_end = $self->mutation->transpos + $self->mutation->len -1); + my $rnachange = Bio::Variation::RNAChange->new(-start => $self->mutation->transpos, + -end => $rnapos_end + ); + #my $rnachange = Bio::Variation::RNAChange->new; + + $rnachange->isMutation(1); + my $ra_o = Bio::Variation::Allele->new; + $ra_o->seq($dnamut->allele_ori->seq) if $dnamut->allele_ori->seq; + $rnachange->allele_ori($ra_o); + my $ra_m = Bio::Variation::Allele->new; + $ra_m->seq($dnamut->allele_mut->seq) if $dnamut->allele_mut->seq; + $rnachange->allele_mut($ra_m); + $rnachange->add_Allele($ra_m); + $rnachange->upStreamSeq($dnamut->upStreamSeq); + $rnachange->dnStreamSeq($dnamut->dnStreamSeq); + $rnachange->length($dnamut->length); + $rnachange->mut_number($dnamut->mut_number); + # setting proof + if ($seqDiff->numbering eq "coding") { + $rnachange->proof('experimental'); + } else { + $rnachange->proof('computed'); + } + + my $dist; + if ($rnachange->end < 0) { + $rnachange->region('5\'UTR'); + $dnamut->region('5\'UTR'); + my $dist = $dnamut->end ; + $dnamut->region_dist($dist); + $dist = $seqDiff->offset - $self->gene->maxtranscript->start + 1 + $dist; + $rnachange->region_dist($dist); + return if $dist < 1; # if mutation is not in mRNA + } else { + $rnachange->region('3\'UTR'); + $dnamut->region('3\'UTR'); + my $dist = $dnamut->start - $seqDiff->cds_end + $seqDiff->offset; + $dnamut->region_dist($dist); + $dist = $seqDiff->cds_end - $self->gene->maxtranscript->end -1 + $dist; + $rnachange->region_dist($dist); + return if $dist > 0; # if mutation is not in mRNA + } + $seqDiff->add_Variant($rnachange); + $self->rnachange($rnachange); + $rnachange->DNAMutation($dnamut); + $dnamut->RNAChange($rnachange); +} + +# args: reference to label changearray, reference to position changearray +# Function: take care of the creation of mutation objects, with +# information AFTER the change takes place +sub _post_mutation { + my ($self, $seqDiff) = @_; + + if ($self->rnachange and $self->rnachange->region eq 'coding') { + + #$seqDiff->add_Variant($self->rnachange); + + my $aachange=$self->aachange; + my ($AAobj,$aa_start_prelabel,$aa_start,$mut_translation); + $AAobj=$self->RNA->get_Translation; + $aa_start_prelabel=$aachange->start; + $aa_start=$AAobj->position($self->RNA->label(2,$aa_start_prelabel)); + $aachange->start($aa_start); + $mut_translation=$AAobj->seq; + + # this now takes in account possible preinsertions + my $aa_m = Bio::Variation::Allele->new; + $aa_m->seq(substr($mut_translation,$aa_start-1)) if substr($mut_translation,$aa_start-1); + $aachange->allele_mut($aa_m); + $aachange->add_Allele($aa_m); + #$aachange->allele_mut(substr($mut_translation,$aa_start-1)); + #$aachange->allele_mut($mut_translation); + my ($rlenori, $rlenmut); + $rlenori = CORE::length($aachange->RNAChange->allele_ori->seq); + $rlenmut = CORE::length($aachange->RNAChange->allele_mut->seq); + #point mutation + + if ($rlenori == 1 and $rlenmut == 1 and $aachange->allele_ori->seq ne '*') { + my $alleleseq; + if ($aachange->allele_mut->seq) { + $alleleseq = substr($aachange->allele_mut->seq, 0, 1); + $aachange->allele_mut->seq($alleleseq); + } + $aachange->end($aachange->start); + $aachange->length(1); + } + elsif ( $rlenori == $rlenmut and + $aachange->allele_ori->seq ne '*' ) { #complex inframe mutation + $aachange->allele_mut->seq(substr $aachange->allele_mut->seq, + 0, + length($aachange->allele_ori->seq)); + } + #inframe mutation + elsif ((int($rlenori-$rlenmut))%3 == 0) { + if ($aachange->RNAChange->allele_mut->seq and + $aachange->RNAChange->allele_ori->seq ) { + # complex + my $rna_len = length ($aachange->RNAChange->allele_mut->seq); + my $len = $rna_len/3; + $len++ unless $rna_len%3 == 0; + $aachange->allele_mut->seq(substr $aachange->allele_mut->seq, 0, $len ); + } + elsif ($aachange->RNAChange->codon_pos == 1){ + # deletion + if ($aachange->RNAChange->allele_mut->seq eq '') { + $aachange->allele_mut->seq(''); + $aachange->end($aachange->start + $aachange->length - 1 ); + } + # insertion + elsif ($aachange->RNAChange->allele_ori->seq eq '' ) { + $aachange->allele_mut->seq(substr $aachange->allele_mut->seq, 0, + length ($aachange->RNAChange->allele_mut->seq) / 3); + $aachange->allele_ori->seq(''); + $aachange->end($aachange->start + $aachange->length - 1 ); + $aachange->length(0); + } + } else { + #elsif ($aachange->RNAChange->codon_pos == 2){ + # deletion + if (not $aachange->RNAChange->allele_mut->seq ) { + $aachange->allele_mut->seq(substr $aachange->allele_mut->seq, 0, 1); + } + # insertion + elsif (not $aachange->RNAChange->allele_ori->seq) { + $aachange->allele_mut->seq(substr $aachange->allele_mut->seq, 0, + length ($aachange->RNAChange->allele_mut->seq) / 3 +1); + } + } + } else { + #frameshift + #my $pos = index $aachange->allele_mut + #$aachange->allele_mut(substr($aachange->allele_mut, 0, 1)); + $aachange->length(CORE::length($aachange->allele_ori->seq)); + my $aaend = $aachange->start + $aachange->length -1; + $aachange->end($aachange->start); + } + + # splicing site deletion check + my @beforeexons=@{$self->exons}; + my @afterexons=$self->RNA->all_Exons; + my $i; + if (scalar(@beforeexons) ne scalar(@afterexons)) { + my $mut_number = $self->mutation->issue; + $self->warn("Exons have been modified at mutation n.$mut_number!"); + $self->rnachange->exons_modified(1); + } else { + EXONCHECK: + foreach $i (0..$#beforeexons) { + if ($beforeexons[$i] ne $afterexons[$i]) { + my $mut_number = $self->mutation->issue; + $self->warn("Exons have been modified at mutation n.$mut_number!"); + $self->rnachange->exons_modified(1); + last EXONCHECK; + } + } + } + } else { + #$seqDiff->rnachange(undef); + #print "getting here?"; + } + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Prim_Transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Prim_Transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,73 @@ +# $Id: Prim_Transcript.pm,v 1.7 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Prim_Transcript +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Prim_Transcript - Prim_Transcript class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +Class for PRIM_TRANSCRIPT objects. They consist of a beginlabel, an endlabel (both +referring to a LiveSeq DNA object) and a strand. +The strand could be 1 (forward strand, default), -1 (reverse strand). + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Prim_Transcript; +$VERSION=1.0; + +# Version history: +# Tue Apr 4 18:11:31 BST 2000 v 1.0 created + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::Range 1.2; # uses Range, inherits from it +@ISA=qw(Bio::LiveSeq::Range); + +=head2 new + + Title : new + Usage : $intron1=Bio::LiveSeq::Prim_Transcript->new(-seq => $objref, + -start => $startlabel, + -end => $endlabel, + -strand => 1 + ); + + Function: generates a new Bio::LiveSeq::Prim_Transcript + Returns : reference to a new object of class Prim_Transcript + Errorcode -1 + Args : two labels and an integer + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Range.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Range.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,138 @@ +# $Id: Range.pm,v 1.7 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Range +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Range - Range abstract class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +This is used as parent for exon and intron classes. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Range; +$VERSION=1.6; + +# Version history: +# Mon Mar 20 22:21:44 GMT 2000 v 1.0 begun +# Tue Mar 21 00:50:05 GMT 2000 v 1.1 new() added +# Tue Mar 21 02:44:45 GMT 2000 v 1.2 private start(), more checks in new() +# Thu Mar 23 19:06:03 GMT 2000 v 1.3 follows() replaces is_downstream +# Wed Apr 12 16:35:12 BST 2000 v 1.4 added valid() +# Mon Jun 26 15:25:14 BST 2000 v 1.44 ranges with start=end are now accepted / valid() removed because inherited now from SeqI +# Tue Jun 27 14:06:06 BST 2000 v 1.5 croak changed to carp and return(-1) in new() function +# Wed Mar 28 16:47:36 BST 2001 v 1.6 carp -> warn,throw (coded methods in SeqI) + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it +@ISA=qw(Bio::LiveSeq::SeqI); + +=head2 new + + Title : new + Usage : $range1 = Bio::LiveSeq::Range->new(-seq => $obj_ref, + -start => $beginlabel, + -end => $endlabel, -strand => 1); + + Function: generates a new Bio::LiveSeq::Range + Returns : reference to a new object of class Range + Errorcode -1 + Args : two labels, an obj_ref and an integer + strand 1=forward strand, strand -1=reverse strand + if strand not specified, it defaults to 1 + the -seq argument must point to the underlying DNA LiveSeq object + +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($obj,%range); + + my ($seq,$start,$end,$strand)=($args{-seq},$args{-start},$args{-end},$args{-strand}); + + $obj = \%range; + $obj = bless $obj, $class; + + unless ($seq->valid($start)) { + $obj->warn("$class not initialised because start label not valid"); + return (-1); + } + unless ($seq->valid($end)) { + $obj->warn("$class not initialised because end label not valid"); + return (-1); + } + unless (defined $strand) { + $strand = 1; + } + if (($strand != 1)&&($strand != -1)) { + $obj->warn("$class not initialised because strand identifier not valid. Use 1 (forward strand) or -1 (reverse strand)."); + return (-1); + } + if ($start eq $end) { + $obj->warn("$class reports: start and end label are the same...."); + } else { + unless ($seq->follows($start,$end,$strand)==1) { + $obj->warn("Fatal: end label $end doesn't follow start label $start for strand $strand!"); + return (-1); + } + } + #if ($strand == 1) { + # unless ($seq->is_downstream($start,$end)==1) { + # croak "Fatal: end label not downstream of start label for forward strand!"; + # } + #} else { + # unless ($seq->is_upstream($start,$end)==1) { + # croak "Fatal: end label not upstream of start label for reverse strand!"; + # } + #} + $obj->{'seq'}=$seq; + $obj->{'start'}=$start; + $obj->{'end'}=$end; + $obj->{'strand'}=$strand; + return $obj; +} + +=head2 valid + + Title : valid + Usage : $boolean = $obj->valid($label) + Function: tests if a label exists AND is part of the object + Returns : boolean + Args : label + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Repeat_Region.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Repeat_Region.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,71 @@ +# $Id: Repeat_Region.pm,v 1.7 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Repeat_Region +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Repeat_Region - Repeat_Region class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +Class for REPEAT_REGION objects. They consist of a beginlabel, an endlabel (both +referring to a LiveSeq DNA object) and a strand. +The strand could be 1 (forward strand, default), -1 (reverse strand). + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, Regioned Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Repeat_Region; +$VERSION=1.0; + +# Version history: +# Tue Apr 4 18:11:31 BST 2000 v 1.0 created + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::Range 1.2; # uses Range, inherits from it +@ISA=qw(Bio::LiveSeq::Range); + +=head2 new + + Title : new + Usage : $intron1=Bio::LiveSeq::Repeat_Region->new(-seq => $objref, + -start => $startlabel, + -end => $endlabel, -strand => 1); + + Function: generates a new Bio::LiveSeq::Repeat_Region + Returns : reference to a new object of class Repeat_Region + Errorcode -1 + Args : two labels and an integer + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Repeat_Unit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Repeat_Unit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,71 @@ +# $Id: Repeat_Unit.pm,v 1.7 2001/06/18 08:27:53 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Repeat_Unit +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Repeat_Unit - Repeat_Unit class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +Class for REPEAT_UNIT objects. They consist of a beginlabel, an endlabel (both +referring to a LiveSeq DNA object) and a strand. +The strand could be 1 (forward strand, default), -1 (reverse strand). + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Repeat_Unit; +$VERSION=1.0; + +# Version history: +# Tue Apr 4 18:11:31 BST 2000 v 1.0 created + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::Repeat_Region 1.0; # uses Repeat_Region, inherits from it +@ISA=qw(Bio::LiveSeq::Repeat_Region); + +=head2 new + + Title : new + Usage : $intron1=Bio::LiveSeq::Repeat_Unit->new(-seq => $objref, + -start => $startlabel, + -end => $endlabel, -strand => 1); + + Function: generates a new Bio::LiveSeq::Repeat_Unit + Returns : reference to a new object of class Repeat_Unit + Errorcode -1 + Args : two labels and an integer + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/SeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/SeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1231 @@ +# $Id: SeqI.pm,v 1.25 2002/10/22 07:38:34 lapp Exp $ +# +# bioperl module for Bio::LiveSeq::SeqI +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::SeqI - Abstract sequence interface class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +This class implements BioPerl PrimarySeqI interface for Live Seq objects. + +One of the main difference in LiveSequence compared to traditional +"string" sequences is that coordinate systems are flexible. Typically +gene nucleotide numbering starts from 1 at the first character of the +initiator codon (A in ATG). This means that negative positions are +possible and common! + +Secondly, the sequence manipulation methods do not return a new +sequence object but change the current object. The current status can +be written out to BioPerl sequence objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +Some note on the terminology/notation of method names: + label: a unique pointer to a single nucleotide + position: the position of a nucleotide according to a particular coordinate + system (e.g. counting downstream from a particular label taken as + number 1) + base: the one letter code for a nucleotide (i.e.: "a" "t" "c" "g") + + a base is the "value" that an "element" of a "chain" can assume + (see documentation on the Chain datastructure if interested) + +=cut + +#' +# Let the code begin... + +package Bio::LiveSeq::SeqI; +$VERSION=3.3; +# Version history: +# Thu Mar 16 18:11:18 GMT 2000 v.1.0 Started implementation, interface/inheritance from ChainI.pm +# Thu Mar 16 20:05:51 GMT 2000 v 1.2 implemented up to splice_out +# Fri Mar 17 05:37:37 GMT 2000 v 1.3 implemented lot of new methods and written their documentation / in sync with ChainI 1.6 and Chain 2.4 +# Fri Mar 17 17:17:24 GMT 2000 v 1.7 in sync with ChainI 1.7 +# Fri Mar 17 20:12:27 GMT 2000 v 1.8 NAMING change: index->label everywhere +# Mon Mar 20 19:19:21 GMT 2000 v 2.0 renamed from DNA to SeqI and begun +# working on methods defined with Heikki +# Tue Mar 21 01:37:52 GMT 2000 v 2.1 created strand(), seq() +# Tue Mar 21 02:43:21 GMT 2000 v 2.11 seq() prints correctly also for exons +# Wed Mar 22 19:41:45 GMT 2000 v 2.22 translate, alphabet, length, all_labels +# Thu Mar 23 21:03:42 GMT 2000 v 2.3 follows() label() position() +# Fri Mar 24 18:33:18 GMT 2000 v 2.33 rewritten position(), now works with diverse coordinate_starts +# Sat Mar 25 06:11:55 GMT 2000 v 2.4 started subseq +# Mon Mar 27 19:22:32 BST 2000 v 2.45 subseq should be ok but the thing about reverse strand has to be checked!! +# Tue Mar 28 01:53:31 BST 2000 v 2.46 changed strand behaviour in subseq +# Wed Mar 29 00:05:21 BST 2000 v 2.5 change() begun +# Wed Mar 29 02:06:20 BST 2000 v 2.53 _delete _mutate _praeinsert coded +# Wed Mar 29 02:29:01 BST 2000 v 2.531 _mutate changed to make it more general +# Wed Mar 29 03:38:21 BST 2000 v 2.54 tested and corrected change +# Wed Mar 29 16:23:39 BST 2000 v 2.55 change deals with complex now +# Fri Mar 31 18:26:54 BST 2000 v 2.56 translate_string added +# Sat Apr 1 19:02:28 BST 2000 v 2.57 labelchange() created +# Fri Apr 7 03:31:35 BST 2000 v 2.6 labelsubseq() created +# Sat Apr 8 13:01:09 BST 2000 v 2.61 obj_valid() created +# Wed Apr 12 16:23:21 BST 2000 v 2.7 _deletecheck call added in _delete +# Wed Apr 19 16:21:33 BST 2000 v 2.72 name() source() description() added +# Thu Apr 20 14:42:57 BST 2000 v 2.8 added or rewritten much pod documentation +# Thu Apr 27 16:18:55 BST 2000 v 2.82 translate now accounts for ttable info +# Thu Jun 22 20:02:39 BST 2000 v 2.9 valid() from Transcript now moved here, as the general for all objects inheriting from SeqI +# Thu Jun 22 20:17:32 BST 2000 v 2.91 _unsecure_labelsubseq() added +# Sat Jun 24 00:10:31 BST 2000 v 2.92 unsecure is an option of labelsubseq() now +# Thu Jun 29 16:38:45 BST 2000 v 3.0 labelchange() now calls itself again for the DNAobj if the label for the change is not valid for the object requested but valid for the DNAobj +# Tue Jan 30 14:16:22 EST 2001 v 3.1 delete_Obj added, to flush circular references +# Wed Mar 28 15:16:38 BST 2001 v 3.2 functions warn, verbose, throw, stack_trace, stack_trace_dump added +# Wed Apr 4 13:34:29 BST 2001 v 3.3 moved from carp to warn + +use strict; +use vars qw($VERSION @ISA); +use Bio::LiveSeq::ChainI 1.9; # to inherit from it +use Bio::Tools::CodonTable; # for the translate() function +use Bio::PrimarySeqI; + +@ISA=qw(Bio::Root::Root Bio::LiveSeq::ChainI Bio::PrimarySeqI ); # inherit from ChainI + +=head2 seq + + Title : seq + Usage : $string = $obj->seq() + Function: Returns the complete sequence of an object as a string of letters. + Suggested cases are upper case for proteins and lower case for + DNA sequence (IUPAC standard), + Returns : a string + + +=cut + +sub seq { + my $self = shift; + my ($start,$end) = ($self->start(),$self->end()); + if ($self->strand() == 1) { + return $self->{'seq'}->down_chain2string($start,undef,$end); + } else { # reverse strand + my $str = $self->{'seq'}->up_chain2string($start,undef,$end); + $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + return $str; + } +} + +=head2 all_labels + + Title : all_labels + Usage : @labels = $obj->all_labels() + Function: all the labels of every nucleotide an object is composed of + Returns : an array of labels + Args : none + +=cut + +sub all_labels { + my $self = shift; + my ($start,$end) = ($self->start(),$self->end()); + my $labels; + if ($self->strand() == 1) { + $labels=$self->{'seq'}->down_labels($start,$end); + } else { + $labels=$self->{'seq'}->up_labels($start,$end); + } + return (@{$labels}); +} + +=head2 labelsubseq + + Title : labelsubseq + Usage : $dna->labelsubseq(); + : $dna->labelsubseq($startlabel); + : $dna->labelsubseq($startlabel,$length); + : $dna->labelsubseq($startlabel,undef,$endlabel); + e.g. : $dna->labelsubseq(4,undef,8); + Function: prints the sequence as string. The difference between labelsubseq + and normal subseq is that it uses /labels/ as arguments, instead + than positions. This allows for faster and more efficient lookup, + skipping the (usually) lengthy conversion of positions into labels. + This is expecially useful for manipulating with high power + LiveSeq objects, knowing the labels and exploiting their + usefulness. + Returns : a string + Errorcode -1 + Args : without arguments it returns the entire sequence + with a startlabel it returns the sequence downstream that label + if a length is specified, it returns only that number of bases + if an endlabel is specified, it overrides the length argument + and prints instead up to that label (included) + Defaults: $startlabel defaults to the beginning of the entire sequence + $endlabel defaults to the end of the entire sequence + +=cut + +# NOTE: unsecuremode is to be used /ONLY/ if sure of the start and end labels, expecially that they follow each other in the correct order!!!! + +sub labelsubseq { + my ($self,$start,$length,$end,$unsecuremode) = @_; + if (defined $unsecuremode && $unsecuremode eq "unsecuremoderequested") + { # to skip security checks (faster) + unless ($start) { + $start=$self->start; + } + if ($end) { + if ($end == $start) { + $length=1; + undef $end; + } else { + undef $length; + } + } else { + unless ($length) { + $end=$self->end; + } + } + } else { + if ($start) { + unless ($self->{'seq'}->valid($start)) { + $self->warn("Start label not valid"); return (-1); + } + } + if ($end) { + if ($end == $start) { + $length=1; + undef $end; + } else { + unless ($self->{'seq'}->valid($end)) { + $self->warn("End label not valid"); return (-1); + } + unless ($self->follows($start,$end) == 1) { + $self->warn("End label does not follow Start label!"); return (-1); + } + undef $length; + } + } + } + if ($self->strand() == 1) { + return $self->{'seq'}->down_chain2string($start,$length,$end); + } else { # reverse strand + my $str = $self->{'seq'}->up_chain2string($start,$length,$end); + $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + return $str; + } +} + +=head2 subseq + + Title : subseq + Usage : $substring = $obj->subseq(10,40); + : $substring = $obj->subseq(10,undef,4); + Function: returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence + + Start cannot be larger than end but can be equal. + + Allows for negative numbers $obj->subseq(-10,-1). By + definition, there is no 0! + -5 -1 1 5 + gctagcgcccaac atggctcgctg + + This allows to retrieve sequences upstream from given position. + + The precedence is from left to right: if END is given LENGTH is + ignored. + + Examples: $obj->subseq(-10,undef,10) returns 10 elements before position 1 + $obj->subseq(4,8) returns elements from the 4th to the 8th, inclusive + + Returns : a string + Errorcode: -1 + Args : start, integer, defaults to start of the sequence + end, integer, '' or undef, defaults to end of the sequence + length, integer, '' or undef + an optional strand (1 or -1) 4th argument + if strand argument is not given, it will default to the object + argment. This argument is useful when a call is issued from a child + of a parent object containing the subseq method + +=cut + +#' +# check the fact about reverse strand! +# is it feasible? Is it correct? Should we do it? How about exons? Does it +# work when you ask subseq of an exon? +# eliminated now (Mon night) +sub subseq { + ##my ($self,$pos1,$pos2,$length,$strand) = @_; + my ($self,$pos1,$pos2,$length,$strand) = @_; + ##unless (defined ($strand)) { # if optional [strand] argument not given + ## $strand=$self->strand; + ##} + $strand=$self->strand; + my ($str,$startlabel,$endlabel); + if (defined ($length)) { + if ($length < 1) { + $self->warn("No sense asking for a subseq of length < 1"); + return (-1); + } + } + unless (defined ($pos1)) { + #print "\n##### DEBUG pos1 not defined\n"; + $startlabel=$self->start; + } else { + if ($pos1 == 0) { # if position = 0 complain + $self->warn("Position cannot be 0!"); return (-1); + } + ##if ($strand == 1) { # CHECK THIS! + if ((defined ($pos2))&&($pos1>$pos2)) { + $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); + } + ##} else { # CHECK THIS! + ## if ((defined ($pos2))&&($pos1<$pos2)) { +## $self->warn("1st position($pos1) cannot be < 2nd position($pos2) on reverse strand!)"; return (-1); + ## } + ##} + $startlabel=$self->label($pos1); + if ($startlabel < 1) { + $self->warn("position $pos1 not valid as start of subseq!"); return (-1); + } + } + unless (defined ($pos2)) { + #print "\n##### pos2 not defined\n"; + unless (defined ($length)) { + $endlabel=$self->end; + } + } else { + if ($pos2 == 0) { # if position = 0 complain + $self->warn("Position cannot be 0!"); return (-1); + } + undef $length; + ##if ($strand == 1) { # CHECK THIS! + if ((defined ($pos1))&&($pos1>$pos2)) { + $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); + } + ##} else { # CHECK THIS! + ## if ((defined ($pos1))&&($pos1<$pos2)) { +## $self->warn("1st position($pos1) cannot be < 2nd position($pos2) on reverse strand!"); return (-1); + ## } + ##} + $endlabel=$self->label($pos2); + if ($endlabel < 1) { + $self->warn("position $pos2 not valid as end of subseq!"); return (-1); + } + } + #print "\n ####DEBUG: start $startlabel end $endlabel length $length strand $strand\n"; + + if ($strand == 1) { + $str = $self->{'seq'}->down_chain2string($startlabel,$length,$endlabel); + } else { # reverse strand + $str = $self->{'seq'}->up_chain2string($startlabel,$length,$endlabel); + $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + } + return $str; +} + +=head2 length + + Title : length + Usage : $seq->length(); + Function: returns the number of nucleotides (or the number of aminoacids) + in the entire sequence + Returns : an integer + Errorcode -1 + Args : none + +=cut + +sub length { + my $self=shift; + my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); + if ($strand == 1) { + return $self->{'seq'}->down_subchain_length($start,$end); + } else { + return $self->{'seq'}->up_subchain_length($start,$end); + } +} + +=head2 display_id + + Title : display_id + Usage : $id_string = $obj->display_id(); + Function: returns the display id, alias the common name of the object + + The semantics of this is that it is the most likely string + to be used as an identifier of the sequence, and likely to + have "human" readability. The id is equivalent to the ID + field of the GenBank/EMBL databanks and the id field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id + to embed other information. + + See also: accession_number + Returns : a string + Args : none + +=cut + +sub display_id { + my ($self,$value) = @_; + if(defined $value) { + $self->{'display_id'} = $value; + } + return $self->{'display_id'}; +} + + +=head2 accession_number + + Title : accession_number + Usage : $unique_biological_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. + Notice that primary_id() provides the unique id for the + implemetation, allowing multiple objects to have the same accession + number in a particular implementation. + + For objects with no accession_number this method returns "unknown". + Returns : a string + Args : none + +=cut + +sub accession_number { + my ($self,$value) = @_; + if (defined $value) { + $self->{'accession_number'} = $value; + } + unless (exists $self->{'accession_number'}) { + return "unknown"; + } else { + return $self->{'accession_number'}; + } +} + +=head2 primary_id + + Title : primary_id + Usage : $unique_implementation_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage their own + object ids in a way the implementation can control. Clients can + expect one id to map to one object. + + For sequences with no primary_id, this method returns + a stringified memory location. + + Returns : A string + Args : None + +=cut + + +sub primary_id { + my ($self,$value) = @_; + if(defined $value) { + $self->{'primary_id'} = $value; + } + unless (exists $self->{'primary_id'}) { + return "$self"; + } else { + return $self->{'primary_id'}; + } +} + +=head2 change + + Title : change + Usage : $substring = $obj->change('AA', 10); + Function: changes, modifies, mutates the LiveSequence + Examples: + $obj->change('', 10); delete nucleotide #10 + $obj->change('', 10, 2); delete two nucleotides starting from #10 + $obj->change('G', 10); change nuc #10 to 'G' + $obj->change('GA', 10, 4); replace #10 and 3 following with 'GA' + $obj->change('GA', 10, 2)); is same as $obj->change('GA', 10); + $obj->change('GA', 10, 0 ); insert 'GA' before nucleotide at #10 + $obj->change('GA', 10, 1); GA inserted before #10, #10 deleted + $obj->change('GATC', 10, 2); GATC inserted before #10, #10 deleted + $obj->change('GATC', 10, 6); GATC inserted before #10, #10-#15 deleted + + + Returns : a string of deleted bases (if any) or 1 (everything OK) + Errorcode: -1 + Args : seq, string, or '' ('' = undef = 0 = deletion) + start, integer + length, integer (optional) + +=cut + +sub change { + &positionchange; +} + +=head2 positionchange + + Title : positionchange + Function: Exactly like change. I.e. change() defaults to positionchange() + +=cut + +sub positionchange { + my ($self,$newseq,$position,$length)=@_; + unless ($position) { + $self->warn("Position not given or position 0"); + return (-1); + } + my $label=$self->label($position); + unless ($label > 0) { # label not found or error + $self->warn("No valid label found at that position!"); + return (-1); + } + return ($self->labelchange($newseq,$label,$length)); +} + +=head2 labelchange + + Title : labelchange + Function: Exactly like change but uses a /label/ instead than a position + as second argument. This allows for multiple changes in a LiveSeq + without the burden of recomputing positions. I.e. for a multiple + change in two different points of the LiveSeq, the approach would + be the following: fetch the correct labels out of the two different + positions (method: label($position)) and then use the labelchange() + method to modify the sequence using those labels instead than + relying on the positions (that would have modified after the + first change). + +=cut + +sub labelchange { + my ($self,$newseq,$label,$length)=@_; + unless ($self->valid($label)) { + if ($self->{'seq'}->valid($label)) { + #$self->warn("Label \'$label\' not valid for executing a LiveSeq change for the object asked but it's ok for DNAlevel change, reverting to that"); + shift @_; + return($self->{'seq'}->labelchange(@_)); + } else { + $self->warn("Label \'$label\' not valid for executing a LiveSeq change"); + return (-1); + } + } + unless ($newseq) { # it means this is a simple deletion + if (defined($length)) { + unless ($length >= 0) { + $self->warn("No sense having length < 0 in a deletion"); + return (-1); + } + } else { + $self->warn("Length not defined for deletion!"); + return (-1); + } + return $self->_delete($label,$length); + } + my $newseqlength=CORE::length($newseq); + if (defined($length)) { + unless ($length >= 0) { + $self->warn("No sense having length < 0 in a change()"); + return (-1); + } + } else { + $length=$newseqlength; # defaults to pointmutation(s) + } + if ($length == 0) { # it means this is a simple insertion, length def&==0 + my ($insertbegin,$insertend)=$self->_praeinsert($label,$newseq); + if ($insertbegin == -1) { + return (-1); + } else { + return (1); + } + } + if ($newseqlength == $length) { # it means this is simple pointmutation(s) + return $self->_mutate($label,$newseq,$length); + } + # if we arrived here then change is complex mixture + my $strand=$self->strand(); + my $afterendlabel=$self->label($length+1,$label,$strand); # get the label at $length+1 positions after $label + unless ($afterendlabel > 0) { # label not found or error + $self->warn("No valid afterendlabel found for executing the complex mutation!"); + return (-1); + } + my $deleted=$self->_delete($label,$length); # first delete length nucs + if ($deleted == -1) { # if errors + return (-1); + } else { # then insert the newsequence + my ($insertbegin,$insertend)=$self->_praeinsert($afterendlabel,$newseq); + if ($insertbegin == -1) { + return (-1); + } else { + return (1); + } + } +} + +# internal methods for change() + +# arguments: label for beginning of deletion, new sequence to insert +# returns: labels of beginning and end of the inserted sequence +# errorcode: -1 +sub _praeinsert { + my ($self,$label,$newseq)=@_; + my ($insertbegin,$insertend); + my $strand=$self->strand(); + if ($strand == 1) { + ($insertbegin,$insertend)=($self->{'seq'}->praeinsert_string($newseq,$label)); + } else { # since it's reverse strand and we insert in forward direction.... + $newseq=reverse($newseq); + $newseq =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; # since it's reverse strand we get the complementary bases + ($insertend,$insertbegin)=($self->{'seq'}->postinsert_string($newseq,$label)); + } + if (($insertbegin==0)||($insertend==0)) { + $self->warn("Some error occurred while inserting!"); + return (-1); + } else { + return ($insertbegin,$insertend); + } +} + +# arguments: label for beginning of deletion, length of deletion +# returns: string of deleted bases +# errorcode: -1 +sub _delete { + my ($self,$label,$length)=@_; + my $strand=$self->strand(); + my $endlabel=$self->label($length,$label,$strand); # get the label at $length positions after $label + unless ($endlabel > 0) { # label not found or error + $self->warn("No valid endlabel found for executing the deletion!"); + return (-1); + } + # this is important in Transcript to fix exon structure + $self->_deletecheck($label,$endlabel); + my $deletedseq; + if ($strand == 1) { + $deletedseq=$self->{'seq'}->splice_chain($label,undef,$endlabel); + } else { + $deletedseq=$self->{'seq'}->splice_chain($endlabel,undef,$label); + $deletedseq=reverse($deletedseq); # because we are on reverse strand and we cut anyway + # in forward direction + $deletedseq =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; # since it's reverse strand we get the complementary bases + } + return ($deletedseq); +} + +# empty function, overridden in Transcript, not useful here +sub _deletecheck { +} + +# arguments: label for beginning of mutation, newsequence, number of mutations +# returns: 1 all OK +# errorcode: -1 +sub _mutate { + my ($self,$label,$newseq,$length)=@_; # length is equal to length(newseq) + my ($i,$base,$nextlabel); + my @labels; # array of labels + my $strand=$self->strand(); + if ($length == 1) { # special cases first + @labels=($label); + } else { + my $endlabel=$self->label($length,$label,$strand); # get the label at $length positions after $label + unless ($endlabel > 0) { # label not found or error + $self->warn("No valid endlabel found for executing the mutation!"); + return (-1); + } + if ($length == 2) { # another special case + @labels=($label,$endlabel); + } else { # more than 3 bases changed + # this wouldn't work for Transcript + #my $labelsarrayref; + #if ($strand == 1) { + #$labelsarrayref=$self->{'seq'}->down_labels($label,$endlabel); + #} else { + #$labelsarrayref=$self->{'seq'}->up_labels($label,$endlabel); + #} + #@labels=@{$labelsarrayref}; + #if ($length != scalar(@labels)) { # not enough labels returned + #$self->warn("Not enough valid labels found for executing the mutation!"); + #return (-1); + #} + + # this should be more general + @labels=($label); # put the first one + while ($label != $endlabel) { + $nextlabel=$self->label(2,$label,$strand); # retrieve the next label + push (@labels,$nextlabel); + $label=$nextlabel; # move on reference + } + } + } + if ($strand == -1) { # only for reverse strand + $newseq =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; # since it's reverse strand we get the complementary bases + } + my $errorcheck; # if not equal to $length after summing for all changes, error did occurr + $i = 0; + foreach $base (split(//,$newseq)) { + $errorcheck += $self->{'seq'}->set_value_at_label($base,$labels[$i]); + $i++; + } + if ($errorcheck != $length) { + $self->warn("Some error occurred while mutating!"); + return (-1); + } else { + return (1); + } +} + +=head2 valid + + Title : valid + Usage : $boolean = $obj->valid($label) + Function: tests if a label exists inside the object + Returns : boolean + Args : label + +=cut + +# argument: label +# returns: 1 YES 0 NO +sub valid { + my ($self,$label)=@_; + my $checkme; + my @labels=$self->all_labels; + foreach $checkme (@labels) { + if ($label == $checkme) { + return (1); # found + } + } + return (0); # not found +} + + +=head2 start + + Title : start + Usage : $startlabel=$obj->start() + Function: returns the label of the first nucleotide of the object (exon, CDS) + Returns : label + Args : none + +=cut + +sub start { + my ($self) = @_; + return $self->{'start'}; # common for all classes BUT DNA (which redefines it) and Transcript (that takes the information from the Exons) +} + +=head2 end + + Title : end + Usage : $endlabel=$obj->end() + Function: returns the label of the last nucleotide of the object (exon, CDS) + Returns : label + Args : none + +=cut + +sub end { + my ($self) = @_; + return $self->{'end'}; +} + +=head2 strand + + Title : strand + Usage : $strand=$obj->strand() + $obj->strand($strand) + Function: gets or sets strand information, being 1 or -1 (forward or reverse) + Returns : -1 or 1 + Args : none OR -1 or 1 + +=cut + +sub strand { + my ($self,$strand) = @_; + if ($strand) { + if (($strand != 1)&&($strand != -1)) { + $self->warn("strand information not changed because strand identifier not valid"); + } else { + $self->{'strand'} = $strand; + } + } + return $self->{'strand'}; +} + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + Returns : a string either 'dna','rna','protein'. + Args : none + Note : "circular dna" is set as dna + +=cut + + +sub alphabet { + my %valid_type = map {$_, 1} qw( dna rna protein ); + my ($self,$value) = @_; + if (defined $value) { + $value =~ s/circular dna/dna/; + unless ( $valid_type{$value} ) { + $self->warn("Molecular type '$value' is not a valid type"); + } + $self->{'alphabet'} = $value; + } + return $self->{'alphabet'}; +} + +=head2 coordinate_start + + Title : coordinate_start + Usage : $coordstartlabel=$obj->coordinate_start() + : $coordstartlabel=$obj->coordinate_start($label) + Function: returns and optionally sets the first label of the coordinate + system used + For some objects only labels inside the object or in frame (for + Translation objects) will be allowed to get set as coordinate start + + Returns : label. It returns 0 if label not found. + Errorcode -1 + Args : an optional reference $label that is position 1 + +=cut + + +sub coordinate_start { + my ($self,$label) = @_; + if ($label) { + if ($self->valid($label)) { + $self->{'coordinate_start'} = $label; + } else { + $self->warn("The label you are trying to set as coordinate_start is not valid for this object"); + } + } + my $coord_start = $self->{'coordinate_start'}; + if ($coord_start) { + return $coord_start; + } else { + return $self->start(); + } +} + +=head2 label + + Title : label + Usage : $seq->label($position) + : $seq->label($position,$firstlabel) + Examples: $nextlabel=$seq->label(2,$label) -> retrieves the following label + : $prevlabel=$seq->label(-1,$label) -> retrieves the preceding label + + Function: returns the label of the nucleotide at $position from current + coordinate start + Returns : a label. It returns 0 if label not found. + Errorcode -1 + Args : a position, + an optional reference $firstlabel that is to be used as position 1 + an optional strand (1 or -1) argument + if strand argument is not given, it will default to the object + argument. This argument is useful when a call is issued from a child + of a parent object containing the subseq method + +=cut + + +sub label { + my ($self,$position,$firstlabel,$strand)=@_; + my $label; + unless (defined ($firstlabel)) { + $firstlabel=$self->coordinate_start; + } + unless ($position) { # if position = 0 complain ? + $self->warn("Position not given or position 0"); + return (-1); + } + unless (defined ($strand)) { # if optional [strand] argument not given + $strand=$self->strand; + } + if ($strand == 1) { + if ($position > 0) { + $label=$self->{'seq'}->down_get_label_at_pos($position,$firstlabel) + } else { # if < 0 + $label=$self->{'seq'}->up_get_label_at_pos(1 - $position,$firstlabel) + } + } else { + if ($position > 0) { + $label=$self->{'seq'}->up_get_label_at_pos($position,$firstlabel) + } else { # if < 0 + $label=$self->{'seq'}->down_get_label_at_pos(1 - $position,$firstlabel) + } + } + return $label; +} + + +=head2 position + + Title : position + Usage : $seq->position($label) + : $seq->position($label,$firstlabel) + Function: returns the position of nucleotide at $label + Returns : the position of the label from current coordinate start + Errorcode 0 + Args : a label pointing to a certain nucleotide (e.g. start of exon) + an optional "firstlabel" as reference to count from + an optional strand (1 or -1) argument + if strand argument is not given, it will default to the object + argument. This argument is useful when a call is issued from a child + of a parent object containing the subseq method + +=cut + + +sub position { + my ($self,$label,$firstlabel,$strand)=@_; + unless (defined ($strand)) { # if optional [strand] argument not given + $strand=$self->strand; + } + unless (defined ($firstlabel)) { + $firstlabel=$self->coordinate_start; + } + unless ($self->valid($label)) { + $self->warn("label not valid"); + return (0); + } + if ($firstlabel == $label) { + return (1); + } + my ($coordpos,$position0,$position); + $position0=$self->{'seq'}->down_get_pos_of_label($label); + $coordpos=$self->{'seq'}->down_get_pos_of_label($firstlabel); + $position=$position0-$coordpos+1; + if ($position <= 0) { + $position--; + } + if ($strand == -1) { + #print "\n----------DEBUGSEQPOS label $label firstlabel $firstlabel strand $strand: position=",1-$position; + return (1-$position); + } else { + #print "\n----------DEBUGSEQPOS label $label firstlabel $firstlabel strand $strand: position=",$position; + return ($position); + } +} + +=head2 follows + + Title : follows + Usage : $seq->follows($firstlabel,$secondlabel) + : $seq->follows($firstlabel,$secondlabel,$strand) + Function: checks if SECONDlabel follows FIRSTlabel, undependent of the strand + i.e. it checks downstream for forward strand and + upstream for reverse strand + Returns : 1 or 0 + Errorcode -1 + Args : two labels + an optional strand (1 or -1) argument + if strand argument is not given, it will default to the object + argument. This argument is useful when a call is issued from a child + of a parent object containing the subseq method + +=cut + +#' +# wraparound to is_downstream and is_upstream that chooses the correct one +# depending on the strand +sub follows { + my ($self,$firstlabel,$secondlabel,$strand)=@_; + unless (defined ($strand)) { # if optional [strand] argument not given + $strand=$self->strand; + } + if ($strand == 1) { + return ($self->{'seq'}->is_downstream($firstlabel,$secondlabel)); + } else { + return ($self->{'seq'}->is_upstream($firstlabel,$secondlabel)); + } +} +# +#=head2 translate +# +# Title : translate +# Usage : $protein_seq = $obj->translate +# Function: Provides the translation of the DNA sequence +# using full IUPAC ambiguities in DNA/RNA and amino acid codes. +# +# The resulting translation is identical to EMBL/TREMBL database +# translations. +# +# Returns : a string +# Args : character for terminator (optional) defaults to '*' +# character for unknown amino acid (optional) defaults to 'X' +# frame (optional) valid values 0, 1, 3, defaults to 0 +# codon table id (optional) defaults to 1 +# +#=cut +# +#sub translate { +# my ($self) = shift; +# return ($self->translate_string($self->seq,@_)); +#} +# +#=head2 translate_string +# +# Title : translate_string +# Usage : $protein_seq = $obj->translate_string("attcgtgttgatcgatta"); +# Function: Like translate, but can be used to translate subsequences after +# having retrieved them as string. +# Args : 1st argument is a string. Optional following arguments: like in +# the translate method +# +#=cut +# +# +#sub translate_string { +# my($self) = shift; +# my($seq) = shift; +# my($stop, $unknown, $frame, $tableid) = @_; +# my($i, $len, $output) = (0,0,''); +# my($codon) = ""; +# my $aa; +# +# +# ## User can pass in symbol for stop and unknown codons +# unless(defined($stop) and $stop ne '') { $stop = "*"; } +# unless(defined($unknown) and $unknown ne '') { $unknown = "X"; } +# unless(defined($frame) and $frame ne '') { $frame = 0; } +# +# ## the codon table ID +# if ($self->translation_table) { +# $tableid = $self->translation_table; +# } +# unless(defined($tableid) and $tableid ne '') { $tableid = 1; } +# +# ##Error if monomer is "Amino" +# $self->warn("Can't translate an amino acid sequence.") +# if (defined $self->alphabet && $self->alphabet eq 'protein'); +# +# ##Error if frame is not 0, 1 or 2 +# $self->warn("Valid values for frame are 0, 1, 2, not [$frame].") +# unless ($frame == 0 or $frame == 1 or $frame == 2); +# +# #thows a warning if ID is invalid +# my $codonTable = Bio::Tools::CodonTable->new( -id => $tableid); +# +# # deal with frame offset. +# if( $frame ) { +# $seq = substr ($seq,$frame); +# } +# +# for $codon ( grep { CORE::length == 3 } split(/(.{3})/, $seq) ) { +# my $aa = $codonTable->translate($codon); +# if ($aa eq '*') { +# $output .= $stop; +# } +# elsif ($aa eq 'X') { +# $output .= $unknown; +# } +# else { +# $output .= $aa ; +# } +# } +# #if( substr($output,-1,1) eq $stop ) { +# # chop $output; +# #} +# +# return ($output); +#} + +=head2 gene + + Title : gene + Usage : my $gene=$obj->gene; + Function: Gets or sets the reference to the LiveSeq::Gene object. + Objects that are features of a LiveSeq Gene will have this + attribute set automatically. + + Returns : reference to an object of class Gene + Note : if Gene object is not set, this method will return 0; + Args : none or reference to object of class Bio::LiveSeq::Gene + +=cut + +sub gene { + my ($self,$value) = @_; + if (defined $value) { + $self->{'gene'} = $value; + } + unless (exists $self->{'gene'}) { + return (0); + } else { + return $self->{'gene'}; + } +} + +=head2 obj_valid + + Title : obj_valid + Usage : if ($obj->obj_valid) {do something;} + Function: Checks if start and end labels are still valid for the ojbect, + i.e. tests if the LiveSeq object is still valid + Returns : boolean + Args : none + +=cut + +sub obj_valid { + my $self=shift; + unless (($self->{'seq'}->valid($self->start()))&&($self->{'seq'}->valid($self->end()))) { + return (0); + } + return (1); +} + +=head2 name + + Title : name + Usage : $name = $obj->name; + : $name = $obj->name("ABCD"); + Function: Returns or sets the name of the object. + If there is no name, it will return "unknown"; + Returns : A string + Args : None + +=cut + +sub name { + my ($self,$value) = @_; + if (defined $value) { + $self->{'name'} = $value; + } + unless (exists $self->{'name'}) { + return "unknown"; + } else { + return $self->{'name'}; + } +} + +=head2 desc + + Title : desc + Usage : $desc = $obj->desc; + : $desc = $obj->desc("ABCD"); + Function: Returns or sets the description of the object. + If there is no description, it will return "unknown"; + Returns : A string + Args : None + +=cut + +sub desc { + my ($self,$value) = @_; + if (defined $value) { + $self->{'desc'} = $value; + } + unless (exists $self->{'desc'}) { + return "unknown"; + } else { + return $self->{'desc'}; + } +} + +=head2 source + + Title : source + Usage : $name = $obj->source; + : $name = $obj->source("Homo sapiens"); + Function: Returns or sets the organism that is source of the object. + If there is no source, it will return "unknown"; + Returns : A string + Args : None + +=cut + +sub source { + my ($self,$value) = @_; + if (defined $value) { + $self->{'source'} = $value; + } + unless (exists $self->{'source'}) { + return "unknown"; + } else { + return $self->{'source'}; + } +} + +sub delete_Obj { + my $self = shift; + my @values= values %{$self}; + my @keys= keys %{$self}; + + foreach my $key ( @keys ) { + delete $self->{$key}; + } + foreach my $value ( @values ) { + if (index(ref($value),"LiveSeq") != -1) { # object case + eval { + # delete $self->{$value}; + $value->delete_Obj; + }; + } elsif (index(ref($value),"ARRAY") != -1) { # array case + my @array=@{$value}; + my $element; + foreach $element (@array) { + eval { + $element->delete_Obj; + }; + } + } elsif (index(ref($value),"HASH") != -1) { # object case + my %hash=%{$value}; + my $element; + foreach $element (%hash) { + eval { + $element->delete_Obj; + }; + } + } + } + return(1); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,858 @@ +# $Id: Transcript.pm,v 1.17 2002/09/25 08:58:23 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Transcript +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Transcript - Transcript class for LiveSeq + +=head1 SYNOPSIS + + # documentation needed + +=head1 DESCRIPTION + +This stores informations about coding sequences (CDS). +The implementation is that a Transcript object accesses a collection of +Exon objects, inferring from them the nucleotide structure and sequence. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Transcript; +$VERSION=5.2; + +# Version history: +# Tue Mar 21 14:38:02 GMT 2000 v 1.0 begun +# Tue Mar 21 17:45:31 GMT 2000 v 1.1 new() created +# Wed Mar 22 19:40:13 GMT 2000 v 1.4 all_Exons() seq(), length(), all_labels() +# Thu Mar 23 19:08:36 GMT 2000 v 1.5 follows() replaces is_downstream() +# Thu Mar 23 20:59:02 GMT 2000 v 2.0 valid _inside_position label position +# Fri Mar 24 18:33:18 GMT 2000 v 2.2 rewritten position(), now should work with diverse coordinate_starts +# Sat Mar 25 04:08:18 GMT 2000 v 2.21 added firstlabel to position and label so that Translation can exploit it +# Sat Mar 25 06:39:27 GMT 2000 v 2.3 started override of subseq, works just internally +# Mon Mar 27 19:05:15 BST 2000 v 2.4 subseq finished, it works with coord_start +# Fri Mar 31 18:48:07 BST 2000 v 2.5 started downstream_seq() +# Mon Apr 3 17:37:34 BST 2000 v 2.52 upstream_seq added +# Fri Apr 7 03:29:43 BST 2000 v 2.6 up/downstream now can use Gene information +# Sat Apr 8 12:59:58 BST 2000 v 3.0 all_Exons now skips no more valid exons +# Sat Apr 8 13:32:08 BST 2000 v 3.1 get_Translation added +# Wed Apr 12 12:37:08 BST 2000 v 3.2 all_Exons updates Transcript's start/end +# Wed Apr 12 12:41:22 BST 2000 v 3.3 each Exon has "transcript" attribute added +# Wed Apr 12 16:35:56 BST 2000 v 3.4 started coding _deletecheck +# Wed Apr 12 23:40:19 BST 2000 v 3.5 start and end redefined here, no more checks after deletion to refix start/end attributes. And no need of those. Eliminated hence from new() +# Wed Apr 12 23:47:02 BST 2000 v 3.9 finished _deletecheck, debugging starts +# Thu Apr 13 00:37:16 BST 2000 v 4.0 debugging done: seems working OK +# Thu Apr 27 16:18:55 BST 2000 v 4.1 translation_table added +# Tue May 16 17:57:40 BST 2000 v 4.11 corrected bug in docs of downstream_seq +# Wed May 17 16:48:34 BST 2000 v 4.2 frame() added +# Mon May 22 15:22:12 BST 2000 v 4.21 labelsubseq tweaked for cases where startlabel==endlabel (no useless follow() query!) +# Thu Jun 22 20:02:39 BST 2000 v 4.3 valid() moved to SeqI, to be inherited as the general one +# Thu Jun 22 20:27:57 BST 2000 v 4.4 optimized labelsubseq coded! +# Thu Jun 22 21:17:51 BST 2000 v 4.44 in_which_Exon() added +# Sat Jun 24 00:49:55 BST 2000 v 4.5 new subseq() that exploits the new fast labelsubseq +# Thu Jun 29 16:31:19 BST 2000 v 5.0 downsream_seq and upstream_seq recoded so that if entry is RNA it will return sequences up to the entry limits -> it should be properly debugged, expecially the upstream_seq one +# Wed Jul 12 04:01:53 BST 2000 v 5.1 croak -> carp+return(-1) +# Wed Mar 28 15:16:21 BST 2001 v 5.2 carp -> warn,throw (coded methods in SeqI) + +use strict; +# use Carp qw(carp cluck); +use vars qw($VERSION @ISA); +use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it +use Bio::LiveSeq::Exon 1.0; # uses Exon to create new exon in case of deletion +@ISA=qw(Bio::LiveSeq::SeqI); + +=head2 new + + Title : new + Usage : $transcript = Bio::LiveSeq::Transcript->new(-exons => \@obj_refs); + + Function: generates a new Bio::LiveSeq::Transcript + Returns : reference to a new object of class Transcript + Errorcode -1 + Args : reference to an array of Exon object references + +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($obj,%transcript); + + my @exons=@{$args{-exons}}; + + $obj = \%transcript; + $obj = bless $obj, $class; + + unless (@exons) { + $obj->warn("$class not initialised because exons array empty"); + return(-1); + } + + # now useless, after start and end methods have been overridden here + my $firstexon = $exons[0]; + #my $lastexon = $exons[-1]; + #my $start = $firstexon->start; + #my $end = $lastexon->end; + my $strand = $firstexon->strand; + my $seq = $firstexon->{'seq'}; + $obj->alphabet('rna'); + + unless (_checkexons(\@exons)) { + $obj->warn("$class not initialised because of problems in the exon structure"); + return(-1); + } + $obj->{'strand'}=$strand; + $obj->{'exons'}=\@exons; + $obj->{'seq'}=$seq; + + # set Transcript into each Exon + my $exon; + foreach $exon (@exons) { + $exon->{'transcript'}=$obj; + } + return $obj; +} + + +=head2 all_Exons + + Title : all_Exons + Usage : $transcript_obj->all_Exons() + Function: returns references to all Exon objects the Transcript is composed of + Example : foreach $exon ($transcript->all_Exons()) { do_something } + Returns : array of object references + Args : none + +=cut + +sub all_Exons { + my $self=shift; + my $exonsref=$self->{'exons'}; + my @exons=@{$exonsref}; + my @newexons; + my $exon; + foreach $exon (@exons) { + unless ($exon->obj_valid) { + $self->warn("$exon no more valid, start or end label lost, skipping....",1); # ignorable + } else { + push(@newexons,$exon); + } + } + if ($#exons != $#newexons) { + # update exons field + $self->{'exons'}=\@newexons; + } + return (@newexons); +} + +=head2 downstream_seq + + Title : downstream_seq + Usage : $transcript_obj->downstream_seq() + : $transcript_obj->downstream_seq(64) + Function: returns a string of nucleotides downstream of the end of the + CDS. If there is some information of the real mRNA, from features in + an attached Gene object, it will return up to those boundaries. + Otherwise it will return 1000 nucleotides. + If an argument is given it will override the default 1000 number + and return instead /that/ requested number of nucleotides. + But if a Gene object is attached, this argument will be ignored. + Returns : string + Args : an optional integer number of nucleotides to be returned instead of + the default if no gene attached + +=cut + +sub downstream_seq { + my ($self,$howmany)=@_; + my $str; + if (defined ($howmany)) { + unless ($howmany > 0) { + $self->throw("No sense in asking less than 1 downstream nucleotides!"); + } + } else { + unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve until the end + #$str=$DNAobj->labelsubseq($self->end,undef,undef,"unsecuremoderequested"); + #return(substr($str,1)); # delete first nucleotide that is the last of Transcript + if ($self->gene) { # if there is Gene object attached fetch relevant info + $str=$self->{'seq'}->labelsubseq($self->end,undef,$self->gene->maxtranscript->end); # retrieve from end of this Transcript to end of the maxtranscript + $str=substr($str,1); # delete first nucleotide that is the last of Transcript + if (CORE::length($str) > 0) { + return($str); + } else { # if there was no downstream through the gene's maxtranscript, go the usual way + $howmany = 1000; + } + } else { + $howmany = 1000; + } + } + } + my @exons=$self->all_Exons; + my $strand=$self->strand(); + my $lastexon=$exons[-1]; + my $lastexonlength=$lastexon->length; + # $howmany nucs after end of last exon + #my $downstream_seq=$lastexon->subseq($lastexonlength+1,undef,$howmany); + my $downstream_seq; + + if ($howmany) { + $downstream_seq=substr($lastexon->labelsubseq($self->end,$howmany,undef,"unsecuremoderequested"),1); + } else { + if ($strand == 1) { + $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->end,"unsecuremoderequested"),1); + } else { + $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->start,"unsecuremoderequested"),1); + } + } + return $downstream_seq; +} + +=head2 upstream_seq + + Title : upstream_seq + Usage : $transcript_obj->upstream_seq() + : $transcript_obj->upstream_seq(64) + Function: just like downstream_seq but returns nucleotides before the ATG + Note : the default, if no Gene information present and no nucleotides + number given, is to return up to 400 nucleotides. + +=cut + +sub upstream_seq { + my ($self,$howmany)=@_; + if (defined ($howmany)) { + unless ($howmany > 0) { + $self->throw("No sense in asking less than 1 upstream nucleotides!"); + } + } else { + unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve from the start + if ($self->gene) { # if there is Gene object attached fetch relevant info + my $str=$self->{'seq'}->labelsubseq($self->gene->maxtranscript->start,undef,$self->start); # retrieve from start of maxtranscript to start of this Transcript + chop $str; # delete last nucleotide that is the A of starting ATG + if (length($str) > 0) { + return($str); + } else { # if there was no upstream through the gene's maxtranscript, go the usual way + $howmany = 400; + } + } else { + $howmany = 400; + } + } + } + my @exons=$self->all_Exons; + my $firstexon=$exons[0]; + + my $upstream_seq; + my $strand=$self->strand(); + + if ($howmany) {# $howmany nucs before begin of first exon + my $labelbefore=$firstexon->label(-$howmany,$firstexon->start); + if ($labelbefore < 1) { + if ($strand == 1) { + $labelbefore=$self->{'seq'}->start; + } else { + $labelbefore=$self->{'seq'}->end; + } + } + $upstream_seq=$firstexon->labelsubseq($labelbefore,undef,$firstexon->start,"unsecuremoderequested"); + chop $upstream_seq; + } else { + if ($strand == 1) { + $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->start,undef,$self->start,"unsecuremoderequested"); + chop $upstream_seq; # delete last nucleotide that is the A of starting ATG + } else { + $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->end,undef,$self->start,"unsecuremoderequested"); + chop $upstream_seq; # delete last nucleotide that is the A of starting ATG + } + } + return $upstream_seq; +} + +# These get redefined here, overriding the SeqI one because they draw their +# information from the Exons a Transcript is built of +# optional argument: firstlabel. If not given, it checks coordinate_start +# This is useful when called by Translation +# also used by _delete +sub label { + my ($self,$position,$firstlabel)=@_; + unless ($position) { # if position = 0 complain ? + $self->warn("Position not given or position 0"); + return (-1); + } + my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); + my ($label,@labels,$length,$arraypos); + unless (defined ($firstlabel)) { + $firstlabel=$self->coordinate_start; # this is inside Transcript obj + } + my $coord_pos=$self->_inside_position($firstlabel); + $length=$self->length; + #if ($strand == 1) { + if ($position < 1) { + $position++; # to account for missing of 0 position + } + $arraypos=$position+$coord_pos-2; + #print "\n=-=-=-=-DEBUG: arraypos $arraypos, pos $position, coordpos: $coord_pos"; + if ($arraypos < 0) { + $label=$self->{'seq'}->label($arraypos,$start,$strand); #? + } elsif ($arraypos >= $length) { + $label=$self->{'seq'}->label($arraypos-$length+2,$end,$strand); #? + } else { # inside the Transcript + @labels=$self->all_labels; + $label=$labels[$arraypos]; + } + #} +} + +# argument: label +# returns: position of label according to coord_start +# errorcode: 0 label not found +# optional argument: firstlabel. If not given, it checks coordinate_start +# This is useful when called by Translation +sub position { + my ($self,$label,$firstlabel)=@_; + unless ($self->{'seq'}->valid($label)) { + $self->warn("label is not valid"); + return (0); + } + unless (defined ($firstlabel)) { + $firstlabel=$self->coordinate_start; # this is inside Transcript obj + } + if ($label == $firstlabel) { + return (1); + } + my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); + my ($position,$in_pos,$out_pos,$coord_pos); + my $length=$self->length; + $coord_pos=$self->_inside_position($firstlabel); + if ($self->valid($label)) { # if label is inside the Transcript + $in_pos=$self->_inside_position($label); + $position=$in_pos-$coord_pos+1; + if ($position <= 0) { + return ($position-1); # accounts for the missing of the 0 position + } + } else { + if ($self->follows($end,$label)) { # label after end of transcript + $out_pos=$self->{'seq'}->position($label,$end,$strand); + #print "\n+++++++++DEBUG label $label FOLLOWS end $end outpos $out_pos coordpos $coord_pos"; + $position=$out_pos+$length-$coord_pos; + } elsif ($self->follows($label,$start)) { # label before begin of transcript + #print "\n+++++++++DEBUG label $label BEFORE start $start outpos $out_pos coordpos $coord_pos"; + $out_pos=$self->{'seq'}->position($label,$start,$strand); + $position=$out_pos-$coord_pos+1; + } else { # label is in intron (not valid, not after, not before)! + $self->warn("Cannot give position of label pointing to intron according to CDS numbering!",1); + return (0); + } + } + return ($position); +} + +sub seq { + my $self=shift; + my ($exon,$str); + my @exons=$self->all_Exons(); + foreach $exon (@exons) { + $str .= $exon->seq(); + } + return $str; +} + +sub length { + my $self=shift; + my ($exon,$length); + my @exons=$self->all_Exons(); + foreach $exon (@exons) { + $length += $exon->length(); + } + return $length; +} + +sub all_labels { + my $self=shift; + my ($exon,@labels); + my @exons=$self->all_Exons(); + foreach $exon (@exons) { + push (@labels,$exon->all_labels()); + } + return @labels; +} + +# redefined here so that it will retrieve effective subseq without introns +# otherwise it would have retrieved an underlying DNA (possibly with introns) +# subsequence +# Drawback: this is really bulky, label->position and then a call to +# subseq that will do the opposite position-> label +# +# one day this can be rewritten as the main one so that the normal subseq +# will rely on this one and hence avoid this double (useless and lengthy) +# conversion between labels and positions +sub old_labelsubseq { + my ($self,$start,$length,$end)=@_; + my ($pos1,$pos2); + if ($start) { + unless ($self->valid($start)) { + $self->warn("Start label not valid"); return (-1); + } + $pos1=$self->position($start); + } + if ($end) { + if ($end == $start) { + $length=1; + } else { + unless ($self->valid($end)) { + $self->warn("End label not valid"); return (-1); + } + unless ($self->follows($start,$end) == 1) { + $self->warn("End label does not follow Start label!"); return (-1); + } + $pos2=$self->position($end); + undef $length; + } + } + return ($self->subseq($pos1,$pos2,$length)); +} + +# rewritten, eventually + +sub labelsubseq { + my ($self,$start,$length,$end,$unsecuremode)=@_; + unless (defined $unsecuremode && + $unsecuremode eq "unsecuremoderequested") + { # to skip security checks (faster) + if ($start) { + unless ($self->valid($start)) { + $self->warn("Start label not valid"); return (-1); + } + } else { + $start=$self->start; + } + if ($end) { + if ($end == $start) { + $length=1; + undef $end; + } else { + undef $length; # end argument overrides length argument + unless ($self->valid($end)) { + $self->warn("End label not valid"); return (-1); + } + unless ($self->follows($start,$end) == 1) { + $self->warn("End label does not follow Start label!"); return (-1); + } + } + } else { + $end=$self->end; + } + } + my ($seq,$exon,$startexon,$endexon); my @exonlabels; + my @exons=$self->all_Exons; + EXONCHECK: + foreach $exon (@exons) { + if ((!(defined($startexon)))&&($exon->valid($start))) { # checks only if not yet found + $startexon=$exon; + } + if ($exon->valid($end)) { + $endexon=$exon; + } + if ((!(defined($seq)) && (defined($startexon)))) { # initializes only once + if ((defined($endexon)) && ($endexon eq $startexon)) { # then perfect, we are finished + if ($length) { + $seq = $startexon->labelsubseq($start,$length,undef,"unsecuremoderequested"); + + + last EXONCHECK; + } else { + $seq = $startexon->labelsubseq($start,undef,$end,"unsecuremoderequested"); + } + last EXONCHECK; + } else { # get up to the end of the exon + $seq = $startexon->labelsubseq($start,undef,undef,"unsecuremoderequested"); + } + } + if (($startexon)&&($exon ne $startexon)) { + if (defined($endexon)) { # we arrived to the last exon + $seq .= $endexon->labelsubseq(undef,undef,$end,"unsecuremoderequested"); # get from the start of the exon + last EXONCHECK; + + } elsif (defined($startexon)) { # we are in a whole-exon-in-the-middle case + $seq .= $exon->seq; # we add it completely to the seq + } # else, we still have to reach the start point, exon useless, we move on + if ($length) { # if length argument specified + if (($seq && (CORE::length($seq) >= $length))) { + last EXONCHECK; + } + } + } + } + if ($length) { + return (substr($seq,0,$length)); + } else { + return ($seq); + } +} + + +# argument: label +# returns: the objref and progressive number of the Exon containing that label +# errorcode: -1 +sub in_which_Exon { + my ($self,$label)=@_; + my ($count,$exon); + my @exons=$self->all_Exons; + foreach $exon (@exons) { + $count++; # 1st exon is numbered "1" + if ($exon->valid($label)) { + return ($exon,$count) + } + } + return (-1); # if nothing found +} + +# recoded to exploit the new fast labelsubseq() +# valid only inside Transcript +sub subseq { + my ($self,$pos1,$pos2,$length) = @_; + my ($str,$startlabel,$endlabel); + if (defined ($pos1)) { + if ($pos1 == 0) { # if position = 0 complain + $self->warn("Position cannot be 0!"); return (-1); + } + if ((defined ($pos2))&&($pos1>$pos2)) { + $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); + } + $startlabel=$self->label($pos1); + unless ($self->valid($startlabel)) { + $self->warn("Start label not valid"); return (-1); + } + if ($startlabel < 1) { + $self->warn("position $pos1 not valid as start of subseq!"); return (-1); + } + } else { + $startlabel=$self->start; + } + if (defined ($pos2)) { + if ($pos2 == 0) { # if position = 0 complain + $self->warn("Position cannot be 0!"); return (-1); + } + undef $length; + if ((defined ($pos1))&&($pos1>$pos2)) { + $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); + } + $endlabel=$self->label($pos2); + unless ($self->valid($endlabel)) { + $self->warn("End label not valid"); return (-1); + } + if ($endlabel < 1) { + $self->warn("position $pos2 not valid as end of subseq!"); return (-1); + } + } else { + unless (defined ($length)) { + $endlabel=$self->end; + } + } + return ($self->labelsubseq($startlabel,$length,$endlabel,"unsecuremoderequested")); +} + +# works only inside the transcript, complains if asked outside +sub old_subseq { + my ($self,$pos1,$pos2,$length) = @_; + my ($str,$startcount,$endcount,$seq,$seqlength); + if (defined ($length)) { + if ($length < 1) { + $self->warn("No sense asking for a subseq of length < 1"); + return (-1); + } + } + my $firstlabel=$self->coordinate_start; # this is inside Transcript obj + my $coord_pos=$self->_inside_position($firstlabel); # TESTME old + $seq=$self->seq; + $seqlength=CORE::length($seq); + unless (defined ($pos1)) { + $startcount=1+$coord_pos-1; # i.e. coord_pos + } else { + if ($pos1 == 0) { # if position = 0 complain + $self->warn("Position cannot be 0!"); return (-1); + } elsif ($pos1 < 0) { + $pos1++; + } + if ((defined ($pos2))&&($pos1>$pos2)) { + $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!"); + return (-1); + } + $startcount=$pos1+$coord_pos-1; + } + unless (defined ($pos2)) { + ; + } else { + if ($pos2 == 0) { # if position = 0 complain + $self->warn("Position cannot be 0!"); return (-1); + } elsif ($pos2 < 0) { + $pos2++; + } + if ((defined ($pos1))&&($pos1>$pos2)) { + $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!"); + return (-1); + } + $endcount=$pos2+$coord_pos-1; + if ($endcount > $seqlength) { + #print "\n###DEBUG###: pos1 $pos1 pos2 $pos2 coordpos $coord_pos endcount $endcount seqln $seqlength\n"; + $self->warn("Cannot access end position after the end of Transcript"); + return (-1); + } + $length=$endcount-$startcount+1; + } + #print "\n###DEBUG pos1 $pos1 pos2 $pos2 start $startcount end $endcount length $length coordpos $coord_pos\n"; + my $offset=$startcount-1; + if ($offset < 0) { + $self->warn("Cannot access startposition before the beginning of Transcript, returning from start",1); # ignorable + return (substr($seq,0,$length)); + } elsif ($offset >= $seqlength) { + $self->warn("Cannot access startposition after the end of Transcript"); + return (-1); + } else { + $str=substr($seq,$offset,$length); + if (CORE::length($str) < $length) { + $self->warn("Attention, cannot return the length requested ". + "for subseq",1) if $self->verbose > 0; # ignorable + } + return $str; + } +} + +# redefined so that it doesn't require other methods (after deletions) to +# reset it. +sub start { + my $self = shift; + my $exonsref=$self->{'exons'}; + my @exons=@{$exonsref}; + return ($exons[0]->start); +} + +sub end { + my $self = shift; + my $exonsref=$self->{'exons'}; + my @exons=@{$exonsref}; + return ($exons[-1]->end); +} + + +# internal methods begin here + +# returns: position of label in transcript's all_labels +# with STARTlabel == 1 +# errorcode 0 -> label not found +# argument: label +sub _inside_position { + my ($self,$label)=@_; + my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); + my ($position,$checkme); + my @labels=$self->all_labels; + foreach $checkme (@labels) { + $position++; + if ($label == $checkme) { + return ($position); + } + } + return (0); +} + +# returns 1 OK or 0 ERROR +# arguments: reference to array of Exon object references +sub _checkexons { + my ($exon,$thisstart); + my $self=$exon; + my $exonsref=$_[0]; + my @exons=@{$exonsref}; + + my $firstexon = $exons[0]; + + unless (ref($firstexon) eq "Bio::LiveSeq::Exon") { + $self->warn("Object not of class Exon"); + return (0); + } + my $strand = $firstexon->strand; + + my $prevend = $firstexon->end; + shift @exons; # skip first one + foreach $exon (@exons) { + unless (ref($exon) eq "Bio::LiveSeq::Exon") { # object class check + $self->warn("Object not of class Exon"); + return (0); + } + if ($exon->strand != $strand) { # strand consistency check + $self->warn("Exons' strands not consistent when trying to create Transcript"); + return (0); + } + $thisstart = $exon->start; + unless ($exon->{'seq'}->follows($prevend,$thisstart,$strand)) { + $self->warn("Exons not in correct order when trying to create Transcript"); + return (0); + } + $prevend = $exon->end; + } + return (1); +} + +=head2 get_Translation + + Title : valid + Usage : $translation = $obj->get_Translation() + Function: retrieves the reference to the object of class Translation (if any) + attached to a LiveSeq object + Returns : object reference + Args : none + +=cut + +sub get_Translation { + my $self=shift; + return ($self->{'translation'}); # this is set when Translation->new is called +} + +# this checks so that deletion spanning multiple exons is +# handled accordingly and correctly +# arguments: begin and end label of a deletion +# this is called BEFORE any deletion in the chain +sub _deletecheck { + my ($self,$startlabel,$endlabel)=@_; + my $exonsref=$self->{'exons'}; + my @exons=@{$exonsref}; + my ($startexon,$endexon,$exon); + $startexon=$endexon=0; + foreach $exon (@exons) { + if (($startexon == 0)&&($exon->valid($startlabel))) { + $startexon=$exon; # exon containing start of deletion + } + if (($endexon == 0)&&($exon->valid($endlabel))) { + $endexon=$exon; # exon containing end of deletion + } + if (($startexon)&&($endexon)) { + last; # don't check further + } + } + my $nextend=$self->label(2,$endlabel); # retrieve the next label + my $prevstart=$self->label(-1,$startlabel); # retrieve the prev label + + if ($startexon eq $endexon) { # intra-exon deletion + if (($startexon->start eq $startlabel) && ($startexon->end eq $endlabel)) { + # let's delete the entire exon + my @newexons; + foreach $exon (@exons) { + unless ($exon eq $startexon) { + push(@newexons,$exon); + } + } + $self->{'exons'}=\@newexons; + } elsif ($startexon->start eq $startlabel) { # special cases + $startexon->{'start'}=$nextend; # set a new start of exon + } elsif ($startexon->end eq $endlabel) { + $startexon->{'end'}=$prevstart; # set a new end of exon + } else { + return; # no problem + } + } else { # two new exons to be created, inter-exons deletion + my @newexons; + my $exonobj; + my $dna=$self->{'seq'}; + my $strand=$self->strand; + my $notmiddle=1; # flag for skipping exons in the middle of deletion + foreach $exon (@exons) { + if ($exon eq $startexon) { + $exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$exon->start,'-end'=>$prevstart,'-strand'=>$strand); # new partial exon + push(@newexons,$exonobj); + $notmiddle=0; # now we enter totally deleted exons + } elsif ($exon eq $endexon) { + $exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$nextend,'-end'=>$exon->end,'-strand'=>$strand); # new partial exon + push(@newexons,$exonobj); + $notmiddle=1; # exiting totally deleted exons + } else { + if ($notmiddle) { # if before or after exons with deletion + push(@newexons,$exon); + }# else skip them + } + } + $self->{'exons'}=\@newexons; + } +} + +=head2 translation_table + + Title : translation_table + Usage : $name = $obj->translation_table; + : $name = $obj->translation_table(11); + Function: Returns or sets the translation_table used for translating the + transcript. + If it has never been set, it will return undef. + Returns : an integer + +=cut + +sub translation_table { + my ($self,$value) = @_; + if (defined $value) { + $self->{'translation_table'} = $value; + } + unless (exists $self->{'translation_table'}) { + return (undef); + } else { + return $self->{'translation_table'}; + } +} + +=head2 frame + + Title : frame + Usage : $frame = $transcript->frame($label); + Function: Returns the frame of a particular nucleotide. + Frame can be 0 1 or 2 and means the position in the codon triplet + of the particulat nucleotide. 0 is the first codon_position. + Codon_position (1 2 3) is simply frame+1. + If the label asked for is not inside the Transcript, -1 will be + returned. + Args : a label + Returns : 0 1 or 2 + Errorcode -1 + +=cut + +# args: label +# returns: frame of nucleotide (0 1 2) +# errorcode: -1 +sub frame { + my ($self,$inputlabel)=@_; + my @labels=$self->all_labels; + my ($label,$frame,$count); + foreach $label (@labels) { + if ($inputlabel == $label) { + return ($count % 3); + } + $count++; # 0 1 2 3 4.... + } + return (-1); # label not found amid Transcript labels +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LiveSeq/Translation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/Translation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,347 @@ +# $Id: Translation.pm,v 1.12 2002/09/25 08:57:52 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::Translation +# +# Cared for by Joseph Insana +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::Translation - Translation class for LiveSeq + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +This stores informations about aminoacids translations of transcripts. +The implementation is that a Translation object is the translation of +a Transcript object, with different possibilities of manipulation, +different coordinate system and eventually its own ranges (protein domains). + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::Translation; +$VERSION=1.8; + +# Version history: +# Thu Mar 23 14:41:52 GMT 2000 v.1.0 begun +# Sat Mar 25 04:08:59 GMT 2000 v 1.2 valid(), label(), position() +# Tue Mar 28 03:37:17 BST 2000 v 1.3 added inheritance from Transcript, subseq relies on it! +# Fri Mar 31 16:53:53 BST 2000 v 1.4 new seq() function that checks for stop codons: it now returns only up to the stop but doesn't continue if stop not found +# Fri Mar 31 18:45:07 BST 2000 v 1.41 now it asks for Transcript->downstream_seq +# Fri Mar 31 19:20:04 BST 2000 v 1.49 seq() now works correctly +# Thu Apr 13 00:10:29 BST 2000 v 1.5 start and end now take the information from Transcript +# Thu Apr 27 16:18:55 BST 2000 v 1.6 translation_table info added +# Thu May 11 17:30:41 BST 2000 v 1.66 position method updated so to return a position also for labels not in frame (not at 1st triplet position) +# Mon May 22 14:59:14 BST 2000 v 1.7 labelsubseq added +# Mon May 22 15:22:12 BST 2000 v 1.71 labelsubseq tweaked for cases where startlabel==endlabel (no useless follow() query!) +# Mon May 22 15:28:49 BST 2000 v 1.74 modified seq() so that the "*" is printed +# Wed Jun 7 04:02:18 BST 2000 v 1.75 added offset() +# Thu Jun 29 15:10:22 BST 2000 v 1.76 bug corrected for elongation mutations, if stop codon is not found downstream +# Wed Mar 28 16:37:37 BST 2001 v 1.8 carp -> warn,throw (coded methods in SeqI) + +use strict; +#use Carp qw(croak carp cluck); +use vars qw($VERSION @ISA); +use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it +use Bio::PrimarySeq; +@ISA=qw(Bio::LiveSeq::Transcript ); + + +=head2 new + + Title : new + Usage : $protein = Bio::LiveSeq::Translation->new(-transcript => $transcr); + + Function: generates a new Bio::LiveSeq::Translation + Returns : reference to a new object of class Translation + Errorcode -1 + Args : reference to an object of class Transcript + +=cut + +sub new { + my ($thing, %args) = @_; + my $class = ref($thing) || $thing; + my ($obj,%translation); + + my $transcript=$args{-transcript}; + + $obj = \%translation; + $obj = bless $obj, $class; + + unless ($transcript) { + $obj->throw("$class not initialised because no -transcript given"); + } + unless (ref($transcript) eq "Bio::LiveSeq::Transcript") { + $obj->throw("$class not initialised because no object of class Transcript given"); + } + + #my $startbase = $transcript->start; + #my $endbase = $transcript->end; + my $strand = $transcript->strand; + my $seq = $transcript->{'seq'}; + + $obj->{'strand'}=$strand; + $obj->{'seq'}=$seq; + $obj->{'transcript'}=$transcript; + $obj->{'alphabet'}="protein"; + + $transcript->{'translation'}=$obj;# set the Translation ref into its Transcript + return $obj; +} + +=head2 get_Transcript + + Title : valid + Usage : $transcript = $obj->get_Transcript() + Function: retrieves the reference to the object of class Transcript (if any) + attached to a LiveSeq object + Returns : object reference + Args : none + +=cut + +sub get_Transcript { + my $self=shift; + return ($self->{'transcript'}); +} + +# These get redefined here, overriding the SeqI ones + +sub change { + my ($self)=@_; + $self->warn("Cannot change a Translation object!\nChanges have to be issued at the nucleotide level!"); + return (-1); +} +sub positionchange { + my ($self)=@_; + $self->warn("Cannot change a Translation object!\nChanges have to be issued at the nucleotide level!"); + return (-1); +} +sub labelchange { + my ($self)=@_; + $self->warn("Cannot change a Translation object!\nChanges have to be issued at the nucleotide level!"); + return (-1); +} + +# this just returns the translation of the transcript, without checking for +# stop codons +sub transl_seq { + my $self=shift; + my $transcript=$self->get_Transcript; + my $translation=$transcript->translate(undef, undef, undef, + $self->translation_table)->seq; + return $translation; +} + +# version 1.74 -> now the "*" is printed +sub seq { + my $self=shift; + my $proteinseq; + my $transcript=$self->get_Transcript; + my $translation=$transcript->translate(undef, undef, undef, + $self->translation_table)->seq; + my $stop_pos=index($translation,"*"); + if ($stop_pos == -1) { # no stop present, continue downstream + my $downstreamseq=$transcript->downstream_seq(); + #carp "the downstream is: $downstreamseq"; # debug + my $cdnaseq=$transcript->seq(); + my $extendedseq = new Bio::PrimarySeq(-seq => "$cdnaseq$downstreamseq", + -alphabet => 'dna' + ); + + $translation=$extendedseq->translate(undef, undef, undef, + $self->translation_table)->seq; + #carp "the new translation is: $translation"; # debug + $stop_pos=index($translation,"*"); + if ($stop_pos == -1) { # still no stop present, return warning + $self->warn("Warning: no stop codon found in the retrieved sequence downstream of Transcript ",1); + undef $stop_pos; + $proteinseq=$translation; + } else { + $proteinseq=substr($translation,0,$stop_pos+1); + #carp "the new stopped translation is: $proteinseq, because the stop is at position $stop_pos"; # debug + } + } else { + $proteinseq=substr($translation,0,$stop_pos+1); + } + return $proteinseq; +} + +sub length { + my $self=shift; + my $seq=$self->seq; + my $length=length($seq); + return $length; +} + +sub all_labels { + my $self=shift; + return $self->get_Transcript->all_labels; +} + +# counts in triplet. Only a label matching the beginning of a triplet coding +# for an aminoacid is considered valid when setting coordinate_start +# (i.e. only in frame!) +sub valid { + my ($self,$label)=@_; + my $i; + my @labels=$self->get_Transcript->all_labels; + my $length=$#labels; + while ($i <= $length) { + if ($label == $labels[$i]) { + return (1); # found + } + $i=$i+3; + } + return (0); # not found +} + +# returns the label to the first nucleotide of the triplet coding for $position aminoacid +sub label { + my ($self,$position)=@_; + my $firstlabel=$self->coordinate_start; # this is in_frame checked + if ($position > 0) { + $position=$position*3-2; + } else { # if position = 0 this will be caught by Transcript, error thrown + $position=$position*3; + } + return $self->get_Transcript->label($position,$firstlabel); + # check for coord_start different +} + +# returns position (aminoacids numbering) of a particular label +# used to return 0 for not in frame labels +# now returns the position anyway (after version 1.66) +sub position { + my ($self,$label)=@_; + my $firstlabel=$self->coordinate_start; # this is in_frame checked + my $position=$self->get_Transcript->position($label,$firstlabel); + use integer; + my $modulus=$position % 3; + if ($position == 0) { + return (0); + } elsif ($position > 0) { + if ($modulus != 1) { + $self->warn("Attention! Label $label is not in frame ". + "(1st position of triplet) with protein",1) if $self->verbose > 0; # ignorable + if ($modulus == 2) { + return ($position / 3 + 1); + } else { # i.e. modulus == 0 + return ($position / 3); + } + } + return ($position / 3 + 1); + } else { # pos < 0 + if ($modulus != 0) { + $self->warn("Attention! Label $label is not in frame ". + "(1st position of triplet) with protein",1) if $self->verbose > 0; # ignorable + return ($position / 3 - 1); # ok for both other positions + } + return ($position / 3); + } + $self->throw( "WEIRD: execution shouldn't have reached here"); + return (0); # this should never happen, but just in case +} + +# note: it inherits subseq and labelsubseq from Transcript! + +sub start { + my $self=shift; + return ($self->{'transcript'}->start); +} + +sub end { + my $self=shift; + return ($self->{'transcript'}->end); +} + +=head2 aa_ranges + + Title : aa_ranges + Usage : @proteinfeatures = $translation->aa_ranges() + Function: to retrieve all the LiveSeq AARange objects attached to a + Translation, usually created out of a SwissProt database entry + crossreferenced from an EMBL CDS feature. + Returns : an array + Args : none + +=cut + +# returns an array of obj_ref of AARange objects attached to the Translation +sub aa_ranges { + my $self=shift; + return ($self->{'aa_ranges'}); +} + +sub translation_table { + my $self=shift; + $self->get_Transcript->translation_table(@_); +} + +# returns all aminoacids "affected" i.e. all aminoacids coded by any codon +# "touched" by the range selected between the labels, even if only partially. + +# it's not optimized for performance but it's useful + +sub labelsubseq { + my ($self,$start,$length,$end)=@_; + my ($pos1,$pos2); + my $transcript=$self->get_Transcript; + if ($start) { + unless ($transcript->valid($start)) { + $self->warn("Start label not valid"); return (-1); + } + $pos1=$self->position($start); + } + if ($end) { + if ($end == $start) { + $length=1; + } else { + unless ($transcript->valid($end)) { + $self->warn("End label not valid"); return (-1); + } + unless ($transcript->follows($start,$end) == 1) { + $self->warn("End label does not follow Start label!"); return (-1); + } + $pos2=$self->position($end); + $length=$pos2-$pos1+1; + } + } + my $sequence=$self->seq; + return (substr($sequence,$pos1-1,$length)); +} + +# return the offset in aminoacids from LiveSeq protein sequence and SwissProt +# sequence (usually as a result of an INIT_MET or a gap) +sub offset { + my $self=shift; + return ($self->{'offset'}); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LocatableSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LocatableSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,432 @@ +# $Id: LocatableSeq.pm,v 1.22.2.1 2003/03/31 11:49:51 heikki Exp $ +# +# BioPerl module for Bio::LocatableSeq +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LocatableSeq - A Sequence object with start/end points on it + that can be projected into a MSA or have coordinates relative to another seq. + +=head1 SYNOPSIS + + + use Bio::LocatableSeq; + my $seq = new Bio::LocatableSeq(-seq => "CAGT-GGT", + -id => "seq1", + -start => 1, + -end => 7); + + +=head1 DESCRIPTION + + + # a normal sequence object + $locseq->seq(); + $locseq->id(); + + # has start,end points + $locseq->start(); + $locseq->end(); + + # inheriets off RangeI, so range operations possible + +=head1 FEEDBACK + + +=head2 Mailing Lists + + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + + +The locatable sequence object was developed mainly because the +SimpleAlign object requires this functionality, and in the rewrite +of the Sequence object we had to decide what to do with this. + +It is, to be honest, not well integrated with the rest of bioperl, for +example, the trunc() function does not return a LocatableSeq object, +as some might have thought. There are all sorts of nasty gotcha's +about interactions between coordinate systems when these sort of +objects are used. + + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::LocatableSeq; +use vars qw(@ISA); +use strict; + +use Bio::PrimarySeq; +use Bio::RangeI; +use Bio::Location::Simple; +use Bio::Location::Fuzzy; + + +@ISA = qw(Bio::PrimarySeq Bio::RangeI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($start,$end,$strand) = + $self->_rearrange( [qw(START END STRAND)], + @args); + + defined $start && $self->start($start); + defined $end && $self->end($end); + defined $strand && $self->strand($strand); + + return $self; # success - we hope! +} + +=head2 start + + Title : start + Usage : $obj->start($newval) + Function: + Returns : value of start + Args : newvalue (optional) + +=cut + +sub start{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'start'} = $value; + } + return $self->{'start'}; + +} + +=head2 end + + Title : end + Usage : $obj->end($newval) + Function: + Returns : value of end + Args : newvalue (optional) + +=cut + +sub end { + my $self = shift; + if( @_ ) { + my $value = shift; + my $string = $self->seq; + if ($string and $self->start) { + my $s2 = $string; + $string =~ s/[.-]+//g; + my $len = CORE::length $string; + my $new_end = $self->start + $len - 1 ; + my $id = $self->id; + $self->warn("In sequence $id residue count gives value $len. +Overriding value [$value] with value $new_end for Bio::LocatableSeq::end().") + and $value = $new_end if $new_end != $value and $self->verbose > 0; + } + + $self->{'end'} = $value; + } + return $self->{'end'}; + +} + +=head2 strand + + Title : strand + Usage : $obj->strand($newval) + Function: + Returns : value of strand + Args : newvalue (optional) + +=cut + +sub strand{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'strand'} = $value; + } + return $self->{'strand'}; +} + +=head2 get_nse + + Title : get_nse + Usage : + Function: read-only name of form id/start-end + Example : + Returns : + Args : + +=cut + +sub get_nse{ + my ($self,$char1,$char2) = @_; + + $char1 ||= "/"; + $char2 ||= "-"; + + $self->throw("Attribute id not set") unless $self->id(); + $self->throw("Attribute start not set") unless $self->start(); + $self->throw("Attribute end not set") unless $self->end(); + + return $self->id() . $char1 . $self->start . $char2 . $self->end ; + +} + + +=head2 no_gap + + Title : no_gaps + Usage :$self->no_gaps('.') + Function: + + Gets number of gaps in the sequence. The count excludes + leading or trailing gap characters. + + Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of + these, '.' and '-' are counted as gap characters unless an + optional argument specifies one of them. + + Returns : number of internal gaps in the sequnce. + Args : a gap character (optional) + +=cut + +sub no_gaps { + my ($self,$char) = @_; + my ($seq, $count) = (undef, 0); + + # default gap characters + $char ||= '-.'; + + $self->warn("I hope you know what you are doing setting gap to [$char]") + unless $char =~ /[-.]/; + + $seq = $self->seq; + return 0 unless $seq; # empty sequence does not have gaps + + $seq =~ s/^([$char]+)//; + $seq =~ s/([$char]+)$//; + $count++ while $seq =~ /[$char]+/g; + + return $count; + +} + + +=head2 column_from_residue_number + + Title : column_from_residue_number + Usage : $col = $seq->column_from_residue_number($resnumber) + Function: + + This function gives the position in the alignment + (i.e. column number) of the given residue number in the + sequence. For example, for the sequence + + Seq1/91-97 AC..DEF.GH + + column_from_residue_number(94) returns 5. + + An exception is thrown if the residue number would lie + outside the length of the aligment + (e.g. column_from_residue_number( "Seq2", 22 ) + + Returns : A column number for the position of the + given residue in the given sequence (1 = first column) + Args : A residue number in the whole sequence (not just that + segment of it in the alignment) + +=cut + +sub column_from_residue_number { + my ($self, $resnumber) = @_; + + $self->throw("Residue number has to be a positive integer, not [$resnumber]") + unless $resnumber =~ /^\d+$/ and $resnumber > 0; + + if ($resnumber >= $self->start() and $resnumber <= $self->end()) { + my @residues = split //, $self->seq; + my $count = $self->start(); + my $i; + for ($i=0; $i < @residues; $i++) { + if ($residues[$i] ne '.' and $residues[$i] ne '-') { + $count == $resnumber and last; + $count++; + } + } + # $i now holds the index of the column. + # The actual column number is this index + 1 + + return $i+1; + } + + $self->throw("Could not find residue number $resnumber"); + +} + + +=head2 location_from_column + + Title : location_from_column + Usage : $loc = $ali->location_from_column( $seq_number, $column_number) + Function: + + This function gives the residue number in the sequence with + the given name for a given position in the alignment + (i.e. column number) of the given. Gaps complicate this + process and force the output to be a L where + values can be undefined. For example, for the alignment + + Seq1/91-97 AC..DEF.G. + Seq2/1-9 .CYHDEFKGK + + location_from_column( Seq1/91-97, 3 ) position 93 + location_from_column( Seq1/91-97, 2 ) position 92^93 + location_from_column( Seq1/91-97, 10) position 97^98 + location_from_column( Seq2/1-9, 1 ) position undef + + An exact position returns a Bio::Location::Simple object + where where location_type() returns 'EXACT', if a position + is between bases location_type() returns 'IN-BETWEEN'. + Column before the first residue returns undef. Note that if + the position is after the last residue in the alignment, + that there is no guarantee that the original sequence has + residues after that position. + + An exception is thrown if the column number is not within + the sequence. + + Returns : Bio::Location::Simple or undef + Args : A column number + Throws : If column is not within the sequence + +See L for more. + +=cut + +sub location_from_column { + my ($self, $column) = @_; + + $self->throw("Column number has to be a positive integer, not [$column]") + unless $column =~ /^\d+$/ and $column > 0; + $self->throw("Column number [column] is larger than". + " sequence length [". $self->length. "]") + unless $column <= $self->length; + + my ($loc); + my $s = $self->subseq(1,$column); + $s =~ s/\W//g; + my $pos = CORE::length $s; + + my $start = $self->start || 0 ; + if ($self->subseq($column, $column) =~ /[a-zA-Z]/ ) { + $loc = new Bio::Location::Simple + (-start => $pos + $start - 1, + -end => $pos + $start - 1, + -strand => 1 + ); + } + elsif ($pos == 0 and $self->start == 1) { + } else { + $loc = new Bio::Location::Simple + (-start => $pos + $start - 1, + -end => $pos +1 + $start - 1, + -strand => 1, + -location_type => 'IN-BETWEEN' + ); + } + return $loc; +} + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::LocatableSeq object which + has the reversed complement of the sequence. For protein + sequences this throws an exception of "Sequence is a + protein. Cannot revcom" + + Returns : A new Bio::LocatableSeq object + Args : none + +=cut + +sub revcom { + my ($self) = @_; + + my $new = $self->SUPER::revcom; + $new->strand($self->strand * -1); + $new->start($self->start) if $self->start; + $new->end($self->end) if $self->end; + return $new; +} + + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + + Example : + Returns : a fresh Bio::PrimarySeqI implementing object + Args : Two integers denoting first and last columns of the + sequence to be included into sub-sequence. + + +=cut + +sub trunc { + + my ($self, $start, $end) = @_; + my $new = $self->SUPER::trunc($start, $end); + + $start = $self->location_from_column($start); + $start ? ($start = $start->start) : ($start = 1); + + $end = $self->location_from_column($end); + $end = $end->start if $end; + + $new->strand($self->strand); + $new->start($start) if $start; + $new->end($end) if $end; + + return $new; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/Atomic.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/Atomic.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,405 @@ +# $Id: Atomic.pm,v 1.6 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::Atomic +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::Atomic - Implementation of a Atomic Location on a Sequence + +=head1 SYNOPSIS + + use Bio::Location::Atomic; + + my $location = new Bio::Location::Atomic(-start => 1, -end => 100, + -strand => 1 ); + + if( $location->strand == -1 ) { + printf "complement(%d..%d)\n", $location->start, $location->end; + } else { + printf "%d..%d\n", $location->start, $location->end; + } + +=head1 DESCRIPTION + +This is an implementation of Bio::LocationI to manage simple location +information on a Sequence. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::Atomic; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::LocationI; + + +@ISA = qw(Bio::Root::Root Bio::LocationI); + +sub new { + my ($class, @args) = @_; + my $self = {}; + + bless $self,$class; + + my ($v,$start,$end,$strand,$seqid) = $self->_rearrange([qw(VERBOSE + START + END + STRAND + SEQ_ID)],@args); + defined $v && $self->verbose($v); + defined $strand && $self->strand($strand); + + defined $start && $self->start($start); + defined $end && $self->end($end); + if( defined $self->start && defined $self->end && + $self->start > $self->end && $self->strand != -1 ) { + $self->warn("When building a location, start ($start) is expected to be less than end ($end), ". + "however it was not. Switching start and end and setting strand to -1"); + + $self->strand(-1); + my $e = $self->end; + my $s = $self->start; + $self->start($e); + $self->end($s); + } + $seqid && $self->seq_id($seqid); + + return $self; +} + +=head2 start + + Title : start + Usage : $start = $loc->start(); + Function: get/set the start of this range + Returns : the start of this range + Args : optionaly allows the start to be set + : using $loc->start($start) + +=cut + +sub start { + my ($self, $value) = @_; + $self->min_start($value) if( defined $value ); + return $self->SUPER::start(); +} + +=head2 end + + Title : end + Usage : $end = $loc->end(); + Function: get/set the end of this range + Returns : the end of this range + Args : optionaly allows the end to be set + : using $loc->end($start) + +=cut + +sub end { + my ($self, $value) = @_; + + $self->min_end($value) if( defined $value ); + return $self->SUPER::end(); +} + +=head2 strand + + Title : strand + Usage : $strand = $loc->strand(); + Function: get/set the strand of this range + Returns : the strandidness (-1, 0, +1) + Args : optionaly allows the strand to be set + : using $loc->strand($strand) + +=cut + +sub strand { + my ($self, $value) = @_; + + if ( defined $value ) { + if ( $value eq '+' ) { $value = 1; } + elsif ( $value eq '-' ) { $value = -1; } + elsif ( $value eq '.' ) { $value = 0; } + elsif ( $value != -1 && $value != 1 && $value != 0 ) { + $self->throw("$value is not a valid strand info"); + } + $self->{'_strand'} = $value + } + # let's go ahead and force to '0' if + # we are requesting the strand without it + # having been set previously + return $self->{'_strand'} || 0; +} + +=head2 length + + Title : length + Usage : $len = $loc->length(); + Function: get the length in the coordinate space this location spans + Example : + Returns : an integer + Args : none + + +=cut + +sub length { + my ($self) = @_; + return abs($self->end() - $self->start()) + 1; +} + +=head2 min_start + + Title : min_start + Usage : my $minstart = $location->min_start(); + Function: Get minimum starting location of feature startpoint + Returns : integer or undef if no minimum starting point. + Args : none + +=cut + +sub min_start { + my ($self,$value) = @_; + + if(defined($value)) { + $self->{'_start'} = $value; + } + return $self->{'_start'}; +} + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get maximum starting location of feature startpoint. + + In this implementation this is exactly the same as min_start(). + + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +sub max_start { + my ($self,@args) = @_; + return $self->min_start(@args); +} + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get start position type (ie <,>, ^). + + In this implementation this will always be 'EXACT'. + + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +sub start_pos_type { + my($self) = @_; + return 'EXACT'; +} + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : none + +=cut + +sub min_end { + my($self,$value) = @_; + + if(defined($value)) { + $self->{'_end'} = $value; + } + return $self->{'_end'}; +} + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get maximum ending location of feature endpoint + + In this implementation this is exactly the same as min_end(). + + Returns : integer or undef if no maximum ending point. + Args : none + +=cut + +sub max_end { + my($self,@args) = @_; + return $self->min_end(@args); +} + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get end position type (ie <,>, ^) + + In this implementation this will always be 'EXACT'. + + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +sub end_pos_type { + my($self) = @_; + return 'EXACT'; +} + +=head2 location_type + + Title : location_type + Usage : my $location_type = $location->location_type(); + Function: Get location type encoded as text + Returns : string ('EXACT', 'WITHIN', 'BETWEEN') + Args : none + +=cut + +sub location_type { + my ($self) = @_; + return 'EXACT'; +} + +=head2 is_remote + + Title : is_remote + Usage : $self->is_remote($newval) + Function: Getset for is_remote value + Returns : value of is_remote + Args : newvalue (optional) + + +=cut + +sub is_remote { + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'is_remote'} = $value; + } + return $self->{'is_remote'}; + +} + +=head2 each_Location + + Title : each_Location + Usage : @locations = $locObject->each_Location($order); + Function: Conserved function call across Location:: modules - will + return an array containing the component Location(s) in + that object, regardless if the calling object is itself a + single location or one containing sublocations. + Returns : an array of Bio::LocationI implementing objects - for + Simple locations, the return value is just itself. + Args : + +=cut + +sub each_Location { + my ($self) = @_; + return ($self); +} + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: returns the FeatureTable string of this location + Returns : string + Args : none + +=cut + +sub to_FTstring { + my($self) = @_; + if( $self->start == $self->end ) { + return $self->start; + } + my $str = $self->start . ".." . $self->end; + if( $self->strand == -1 ) { + $str = sprintf("complement(%s)", $str); + } + return $str; +} + + +sub trunc { + my ($self,$start,$end,$relative_ori) = @_; + + my $newstart = $self->start - $start+1; + my $newend = $self->end - $start+1; + my $newstrand = $relative_ori * $self->strand; + + my $out; + if( $newstart < 1 || $newend > ($end-$start+1) ) { + $out = Bio::Location::Atomic->new(); + $out->start($self->start); + $out->end($self->end); + $out->strand($self->strand); + $out->seq_id($self->seqid); + $out->is_remote(1); + } else { + $out = Bio::Location::Atomic->new(); + $out->start($newstart); + $out->end($newend); + $out->strand($newstrand); + $out->seq_id(); + } + + return $out; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/AvWithinCoordPolicy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/AvWithinCoordPolicy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,129 @@ +# $Id: AvWithinCoordPolicy.pm,v 1.4 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::AvWithinCoordPolicy +# +# Cared for by Hilmar Lapp +# and Jason Stajich +# +# Copyright Hilmar Lapp, Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::AvWithinCoordPolicy - class implementing +Bio::Location::CoordinatePolicy as the average for WITHIN and the widest possible and reasonable range otherwise + +=head1 SYNOPSIS + +See Bio::Location::CoordinatePolicyI + +=head1 DESCRIPTION + +CoordinatePolicyI implementing objects are used by Bio::LocationI +implementing objects to determine integer-valued coordinates when +asked for it. + +This class will compute the coordinates such that for fuzzy locations +of type WITHIN and BETWEEN the average of the two limits will be +returned, and for all other locations it will return the widest +possible range, but by using some common sense. This means that +e.g. locations like "E5..100" (start before position 5) will return 5 +as start (returned values have to be positive integers). + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email Ehlapp@gmx.netE, Ejason@bioperl.orgE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::AvWithinCoordPolicy; +use vars qw(@ISA); +use strict; + +use Bio::Location::WidestCoordPolicy; + +@ISA = qw(Bio::Location::WidestCoordPolicy); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + return $self; +} + + + +=head2 start + + Title : start + Usage : $start = $policy->start($location); + Function: Get the integer-valued start coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub start { + my ($self,$loc) = @_; + + if(($loc->start_pos_type() eq 'WITHIN') || + ($loc->start_pos_type() eq 'BETWEEN')) { + my ($min, $max) = ($loc->min_start(), $loc->max_start()); + return int(($min+$max)/2) if($min && $max); + } + return $self->SUPER::start($loc); +} + +=head2 end + + Title : end + Usage : $end = $policy->end($location); + Function: Get the integer-valued end coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub end { + my ($self,$loc) = @_; + + if(($loc->end_pos_type() eq 'WITHIN') || + ($loc->end_pos_type() eq 'BETWEEN')) { + my ($min, $max) = ($loc->min_end(), $loc->max_end()); + return int(($min+$max)/2) if($min && $max); + } + return $self->SUPER::end($loc); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/CoordinatePolicyI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/CoordinatePolicyI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,111 @@ +# $Id: CoordinatePolicyI.pm,v 1.4 2002/10/22 07:38:34 lapp Exp $ +# +# BioPerl module for Bio::Location::CoordinatePolicyI +# Cared for by Hilmar Lapp +# and Jason Stajich +# +# Copyright Hilmar Lapp, Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::CoordinatePolicyI - Abstract interface for objects implementing +a certain policy of computing integer-valued coordinates of a Location + +=head1 SYNOPSIS + + # get a location, e.g., from a SeqFeature + $location = $feature->location(); + # examine its coordinate computation policy + print "Location of feature ", $feature->primary_tag(), " employs a ", + ref($location->coordinate_policy()), + " instance for coordinate computation\n"; + # change the policy, e.g. because the user chose to do so + $location->coordinate_policy(Bio::Location::NarrowestCoordPolicy->new()); + +=head1 DESCRIPTION + +Objects implementing this interface are used by Bio::LocationI +implementing objects to determine integer-valued coordinates when +asked for it. While this may seem trivial for simple locations, there +are different ways to do it for fuzzy or compound (split) +locations. Classes implementing this interface implement a certain +policy, like 'always widest range', 'always smallest range', 'mean for +BETWEEN locations', etc. By installing a different policy object in a +Location object, the behaviour of coordinate computation can be changed +on-the-fly, and with a single line of code client-side. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email hlapp@gmx.net, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::CoordinatePolicyI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; + +@ISA = qw(Bio::Root::RootI); + +=head2 start + + Title : start + Usage : $start = $policy->start($location); + Function: Get the integer-valued start coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub start { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 end + + Title : end + Usage : $end = $policy->end($location); + Function: Get the integer-valued end coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub end { + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/Fuzzy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/Fuzzy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,560 @@ +# $Id: Fuzzy.pm,v 1.24 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::Fuzzy +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::Fuzzy - Implementation of a Location on a Sequence +which has unclear start and/or end locations + +=head1 SYNOPSIS + + use Bio::Location::Fuzzy; + my $fuzzylocation = new Bio::Location::Fuzzy(-start => '<30', + -end => 90, + -location_type => '.'); + + print "location string is ", $fuzzylocation->to_FTstring(), "\n"; + print "location is of the type ", $fuzzylocation->location_type, "\n"; + +=head1 DESCRIPTION + +This module contains the necessary methods for representing a +Fuzzy Location, one that does not have clear start and/or end points. +This will initially serve to handle features from Genbank/EMBL feature +tables that are written as 1^100 meaning between bases 1 and 100 or +E100..300 meaning it starts somewhere before 100. Advanced +implementations of this interface may be able to handle the necessary +logic of overlaps/intersection/contains/union. It was constructed to +handle fuzzy locations that can be represented in Genbank/EMBL. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Location::Fuzzy; +use vars qw(@ISA ); +use strict; + +use Bio::Location::FuzzyLocationI; +use Bio::Location::Atomic; + +@ISA = qw(Bio::Location::Atomic Bio::Location::FuzzyLocationI ); + +BEGIN { + use vars qw( %FUZZYCODES %FUZZYPOINTENCODE %FUZZYRANGEENCODE + @LOCATIONCODESBSANE ); + + @LOCATIONCODESBSANE = (undef, 'EXACT', 'WITHIN', 'BETWEEN', + 'BEFORE', 'AFTER'); + + %FUZZYCODES = ( 'EXACT' => '..', # Position is 'exact + # Exact position is unknown, but is within the range specified, ((1.2)..100) + 'WITHIN' => '.', + # 1^2 + 'BETWEEN' => '^', + # <100 + 'BEFORE' => '<', + # >10 + 'AFTER' => '>'); + + # The following regular expressions map to fuzzy location types. Every + # expression must match the complete encoded point string, and must + # contain two groups identifying min and max. Empty matches are automatic. + # converted to undef, except for 'EXACT', for which max is set to equal + # min. + %FUZZYPOINTENCODE = ( + '\>(\d+)(.{0})' => 'AFTER', + '\<(.{0})(\d+)' => 'BEFORE', + '(\d+)' => 'EXACT', + '(\d+)(.{0})\>' => 'AFTER', + '(.{0})(\d+)\<' => 'BEFORE', + '(\d+)\.(\d+)' => 'WITHIN', + '(\d+)\^(\d+)' => 'BETWEEN', + ); + + %FUZZYRANGEENCODE = ( '\.' => 'WITHIN', + '\.\.' => 'EXACT', + '\^' => 'BETWEEN' ); + +} + +=head2 new + + Title : new + Usage : my $fuzzyloc = new Bio::Location::Fuzzy( @args); + Function: + Returns : + Args : -start => value for start (initialize by superclass) + -end => value for end (initialize by superclass) + -strand => value for strand (initialize by superclass) + -location_type => either ('EXACT', 'WITHIN', 'BETWEEN') OR + ( 1,2,3) + -start_ext=> extension for start - defaults to 0, + -start_fuz= fuzzy code for start can be + ( 'EXACT', 'WITHIN', 'BETWEEN', 'BEFORE', 'AFTER') OR + a value 1 - 5 corresponding to index+1 above + -end_ext=> extension for end - defaults to 0, + -end_fuz= fuzzy code for end can be + ( 'EXACT', 'WITHIN', 'BETWEEN', 'BEFORE', 'AFTER') OR + a value 1 - 5 corresponding to index+1 above + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($location_type, $start_ext, $start_fuz, $end_ext, $end_fuz) = + $self->_rearrange([ qw(LOCATION_TYPE START_EXT START_FUZ + END_EXT END_FUZ ) + ], @args); + + $location_type && $self->location_type($location_type); + $start_ext && $self->max_start($self->min_start + $start_ext); + $end_ext && $self->max_end($self->min_end + $end_ext); + $start_fuz && $self->start_pos_type($start_fuz); + $end_fuz && $self->end_pos_type($end_fuz); + + return $self; +} + +=head2 location_type + + Title : location_type + Usage : my $location_type = $location->location_type(); + Function: Get location type encoded as text + Returns : string ('EXACT', 'WITHIN', 'BETWEEN') + Args : none + +=cut + +sub location_type { + my ($self,$value) = @_; + if( defined $value || ! defined $self->{'_location_type'} ) { + $value = 'EXACT' unless defined $value; + if(! defined $FUZZYCODES{$value}) { + $value = uc($value); + if( $value =~ /\.\./ ) { + $value = 'EXACT'; + } elsif( $value =~ /^\.$/ ) { + $value = 'WITHIN'; + } elsif( $value =~ /\^/ ) { + $value = 'BETWEEN'; + + + $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". $self->start. "] and [". $self->end. "]") + if defined $self->start && defined $self->end && ($self->end - 1 == $self->start); + + + } elsif( $value ne 'EXACT' && $value ne 'WITHIN' && + $value ne 'BETWEEN' ) { + $self->throw("Did not specify a valid location type"); + } + } + $self->{'_location_type'} = $value; + } + return $self->{'_location_type'}; +} + +=head1 LocationI methods + +=head2 length + + Title : length + Usage : $length = $fuzzy_loc->length(); + Function: Get the length of this location. + + Note that the length of a fuzzy location will always depend + on the currently active interpretation of start and end. The + result will therefore vary for different CoordinatePolicy objects. + + Returns : an integer + Args : none + +=cut + +#sub length { +# my($self) = @_; +# return $self->SUPER::length() if( !$self->start || !$self->end); +# $self->warn('Length is not valid for a FuzzyLocation'); +# return 0; +#} + +=head2 start + + Title : start + Usage : $start = $fuzzy->start(); + Function: get/set start of this range, handling fuzzy_starts + Returns : a positive integer representing the start of the location + Args : start location on set (can be fuzzy point string) + +=cut + +sub start { + my($self,$value) = @_; + if( defined $value ) { + my ($encode,$min,$max) = $self->_fuzzypointdecode($value); + $self->start_pos_type($encode); + $self->min_start($min); + $self->max_start($max); + } + + $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". $self->SUPER::start. "] and [". $self->SUPER::end. "]") + if $self->location_type eq 'BETWEEN' && defined $self->SUPER::end && ($self->SUPER::end - 1 == $self->SUPER::start); + + return $self->SUPER::start(); +} + +=head2 end + + Title : end + Usage : $end = $fuzzy->end(); + Function: get/set end of this range, handling fuzzy_ends + Returns : a positive integer representing the end of the range + Args : end location on set (can be fuzzy string) + +=cut + +sub end { + my($self,$value) = @_; + if( defined $value ) { + my ($encode,$min,$max) = $self->_fuzzypointdecode($value); + $self->end_pos_type($encode); + $self->min_end($min); + $self->max_end($max); + } + + $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". $self->SUPER::start. "] and [". $self->SUPER::end. "]") + if $self->location_type eq 'BETWEEN' && defined $self->SUPER::start && ($self->SUPER::end - 1 == $self->SUPER::start); + + return $self->SUPER::end(); +} + +=head2 min_start + + Title : min_start + Usage : $min_start = $fuzzy->min_start(); + Function: get/set the minimum starting point + Returns : the minimum starting point from the contained sublocations + Args : integer or undef on set + +=cut + +sub min_start { + my ($self,@args) = @_; + + if(@args) { + $self->{'_min_start'} = $args[0]; # the value may be undef! + } + return $self->{'_min_start'}; +} + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get/set maximum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : integer or undef on set + +=cut + +sub max_start { + my ($self,@args) = @_; + + if(@args) { + $self->{'_max_start'} = $args[0]; # the value may be undef! + } + return $self->{'_max_start'}; +} + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get/set start position type. + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : a string on set + +=cut + +sub start_pos_type { + my ($self,$value) = @_; + if(defined $value && $value =~ /^\d+$/ ) { + if( $value == 0 ) { $value = 'EXACT'; } + else { + my $v = $LOCATIONCODESBSANE[$value]; + if( ! defined $v ) { + $self->warn("Provided value $value which I don't understand, reverting to 'EXACT'"); + $v = 'EXACT'; + } + $value = $v; + } + } + if(defined($value)) { + $self->{'_start_pos_type'} = $value; + } + return $self->{'_start_pos_type'}; +} + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get/set minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : integer or undef on set + +=cut + +sub min_end { + my ($self,@args) = @_; + + if(@args) { + $self->{'_min_end'} = $args[0]; # the value may be undef! + } + return $self->{'_min_end'}; +} + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get/set maximum ending location of feature endpoint + Returns : integer or undef if no maximum ending point. + Args : integer or undef on set + +=cut + +sub max_end { + my ($self,@args) = @_; + + if(@args) { + $self->{'_max_end'} = $args[0]; # the value may be undef! + } + return $self->{'_max_end'}; +} + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get/set end position type. + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : a string on set + +=cut + +sub end_pos_type { + my ($self,$value) = @_; + if( defined $value && $value =~ /^\d+$/ ) { + if( $value == 0 ) { $value = 'EXACT'; } + else { + my $v = $LOCATIONCODESBSANE[$value]; + if( ! defined $v ) { + $self->warn("Provided value $value which I don't understand, reverting to 'EXACT'"); + $v = 'EXACT'; + } + $value = $v; + } + } + + if(defined($value)) { + $self->{'_end_pos_type'} = $value; + } + return $self->{'_end_pos_type'}; +} + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + Returns : seq_id + Args : [optional] seq_id value to set + +=cut + +=head2 coordinate_policy + + Title : coordinate_policy + Usage : $policy = $location->coordinate_policy(); + $location->coordinate_policy($mypolicy); # set may not be possible + Function: Get the coordinate computing policy employed by this object. + + See Bio::Location::CoordinatePolicyI for documentation about + the policy object and its use. + + The interface *does not* require implementing classes to accept + setting of a different policy. The implementation provided here + does, however, allow to do so. + + Implementors of this interface are expected to initialize every + new instance with a CoordinatePolicyI object. The implementation + provided here will return a default policy object if none has + been set yet. To change this default policy object call this + method as a class method with an appropriate argument. Note that + in this case only subsequently created Location objects will be + affected. + + Returns : A Bio::Location::CoordinatePolicyI implementing object. + Args : On set, a Bio::Location::CoordinatePolicyI implementing object. + +=cut + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: Get/Set seq_id that location refers to + Returns : seq_id + Args : [optional] seq_id value to set + +=cut + +sub to_FTstring { + my ($self) = @_; + my (%vals) = ( 'start' => $self->start, + 'min_start' => $self->min_start, + 'max_start' => $self->max_start, + 'start_code' => $self->start_pos_type, + 'end' => $self->end, + 'min_end' => $self->min_end, + 'max_end' => $self->max_end, + 'end_code' => $self->end_pos_type ); + + my (%strs) = ( 'start' => '', + 'end' => ''); + my ($delimiter) = $FUZZYCODES{$self->location_type}; + # I'm lazy, lets do this in a loop since behaviour will be the same for + # start and end + foreach my $point ( qw(start end) ) { + if( $vals{$point."_code"} ne 'EXACT' ) { + + if( (!defined $vals{"min_$point"} || + !defined $vals{"max_$point"}) + && ( $vals{$point."_code"} eq 'WITHIN' || + $vals{$point."_code"} eq 'BETWEEN') + ) { + $vals{"min_$point"} = '' unless defined $vals{"min_$point"}; + $vals{"max_$point"} = '' unless defined $vals{"max_$point"}; + + $self->warn("Fuzzy codes for start are in a strange state, (". + join(",", ($vals{"min_$point"}, + $vals{"max_$point"}, + $vals{$point."_code"})). ")"); + return ''; + } + if( defined $vals{$point."_code"} && + ($vals{$point."_code"} eq 'BEFORE' || + $vals{$point."_code"} eq 'AFTER') + ) { + $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}}; + } + if( defined $vals{"min_$point"} ) { + $strs{$point} .= $vals{"min_$point"}; + } + if( defined $vals{$point."_code"} && + ($vals{$point."_code"} eq 'WITHIN' || + $vals{$point."_code"} eq 'BETWEEN') + ) { + $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}}; + } + if( defined $vals{"max_$point"} ) { + $strs{$point} .= $vals{"max_$point"}; + } + if(($vals{$point."_code"} eq 'WITHIN') || + ($vals{$point."_code"} eq 'BETWEEN')) { + $strs{$point} = "(".$strs{$point}.")"; + } + } else { + $strs{$point} = $vals{$point}; + } + + } + my $str = $strs{'start'} . $delimiter . $strs{'end'}; + if($self->is_remote() && $self->seq_id()) { + $str = $self->seq_id() . ":" . $str; + } + if( $self->strand == -1 ) { + $str = "complement(" . $str . ")"; + } elsif($self->location_type() eq "WITHIN") { + $str = "(".$str.")"; + } + return $str; +} + +=head2 _fuzzypointdecode + + Title : _fuzzypointdecode + Usage : ($type,$min,$max) = $self->_fuzzypointdecode('<5'); + Function: Decode a fuzzy string. + Returns : A 3-element array consisting of the type of location, the + minimum integer, and the maximum integer describing the range + of coordinates this start or endpoint refers to. Minimum or + maximum coordinate may be undefined. + : Returns empty array on fail. + Args : fuzzypoint string + +=cut + +sub _fuzzypointdecode { + my ($self, $string) = @_; + return () if( !defined $string); + # strip off leading and trailing space + $string =~ s/^\s*(\S+)\s*/$1/; + foreach my $pattern ( keys %FUZZYPOINTENCODE ) { + if( $string =~ /^$pattern$/ ) { + my ($min,$max) = ($1,$2); + if($FUZZYPOINTENCODE{$pattern} eq 'EXACT') { + $max = $min; + } else { + $max = undef if(length($max) == 0); + $min = undef if(length($min) == 0); + } + return ($FUZZYPOINTENCODE{$pattern},$min,$max); + } + } + if( $self->verbose >= 1 ) { + $self->warn("could not find a valid fuzzy encoding for $string"); + } + return (); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/FuzzyLocationI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/FuzzyLocationI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,204 @@ +# $Id: FuzzyLocationI.pm,v 1.17 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::FuzzyLocationI +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::FuzzyLocationI - Abstract interface of a Location on a Sequence +which has unclear start/end location + +=head1 SYNOPSIS + + # Get a FuzzyLocationI object somehow + print "Fuzzy FT location string is ", $location->to_FTstring(); + print "location is of the type ", $location->loc_type, "\n"; + +=head1 DESCRIPTION + +This interface encapsulates the necessary methods for representing a +Fuzzy Location, one that does not have clear start and/or end points. +This will initially serve to handle features from Genbank/EMBL feature +tables that are written as 1^100 meaning between bases 1 and 100 or +E100..300 meaning it starts somewhere before 100. Advanced +implementations of this interface may be able to handle the necessary +logic of overlaps/intersection/contains/union. It was constructed to +handle fuzzy locations that can be represented in Genbank/EMBL. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::FuzzyLocationI; +use vars qw(@ISA); +use strict; + +use Bio::LocationI; +use Carp; + +@ISA = qw(Bio::LocationI); + +=head1 LocationI methods + +=head2 location_type + + Title : loc_type + Usage : my $location_type = $location->location_type(); + Function: Get location type encoded as text + Returns : string ('EXACT', 'WITHIN', 'BETWEEN') + Args : none + +=cut + +sub location_type { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Bio::LocationI methods + +Bio::LocationI methods follow + +=head2 min_start + + Title : min_start + Usage : my $minstart = $location->min_start(); + Function: Get minimum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get maximum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get start position type (ie <,>, ^) + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : none + +=cut + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get maximum ending location of feature endpoint + Returns : integer or undef if no maximum ending point. + Args : none + +=cut + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get end position type (ie <,>, ^) + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + Returns : seq_id + Args : [optional] seq_id value to set + +=cut + +=head2 coordinate_policy + + Title : coordinate_policy + Usage : $policy = $location->coordinate_policy(); + $location->coordinate_policy($mypolicy); # set may not be possible + Function: Get the coordinate computing policy employed by this object. + + See Bio::Location::CoordinatePolicyI for documentation about + the policy object and its use. + + The interface *does not* require implementing classes to accept + setting of a different policy. The implementation provided here + does, however, allow to do so. + + Implementors of this interface are expected to initialize every + new instance with a CoordinatePolicyI object. The implementation + provided here will return a default policy object if none has + been set yet. To change this default policy object call this + method as a class method with an appropriate argument. Note that + in this case only subsequently created Location objects will be + affected. + + Returns : A Bio::Location::CoordinatePolicyI implementing object. + Args : On set, a Bio::Location::CoordinatePolicyI implementing object. + +=cut + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: returns the FeatureTable string of this location + Returns : string + Args : none + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/NarrowestCoordPolicy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/NarrowestCoordPolicy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,128 @@ +# $Id: NarrowestCoordPolicy.pm,v 1.7 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::NarrowestCoordPolicy +# +# Cared for by Hilmar Lapp +# and Jason Stajich +# +# Copyright Hilmar Lapp, Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::NarrowestCoordPolicy - class implementing +Bio::Location::CoordinatePolicy as the narrowest possible and reasonable range + +=head1 SYNOPSIS + +See Bio::Location::CoordinatePolicyI + +=head1 DESCRIPTION + +CoordinatePolicyI implementing objects are used by Bio::LocationI +implementing objects to determine integer-valued coordinates when +asked for it. + +This class will compute the coordinates such that always the narrowest possible +range is returned, but by using some common sense. This means that e.g. +locations like "E5..100" (start before position 5) will return 5 as start +(returned values have to be positive integers). + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email hlapp@gmx.net, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::NarrowestCoordPolicy; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Location::CoordinatePolicyI; + +@ISA = qw(Bio::Root::Root Bio::Location::CoordinatePolicyI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + return $self; +} + + + +=head2 start + + Title : start + Usage : $start = $policy->start($location); + Function: Get the integer-valued start coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub start { + my ($self,$loc) = @_; + + # For performance reasons we don't check that it's indeed a Bio::LocationI + # object. Hopefully, Location-object programmers are smart enough. + my $pos = $loc->max_start(); + # if max is not defined or equals 0 we resort to min + $pos = $loc->min_start() if(! $pos); + return $pos; +} + +=head2 end + + Title : end + Usage : $end = $policy->end($location); + Function: Get the integer-valued end coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub end { + my ($self,$loc) = @_; + + # For performance reasons we don't check that it's indeed a Bio::LocationI + # object. Hopefully, Location-object programmers are smart enough. + my $pos = $loc->min_end(); + # if min is not defined or equals 0 we resort to max + $pos = $loc->max_end() if(! $pos); + return $pos; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/Simple.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/Simple.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,361 @@ +# $Id: Simple.pm,v 1.31 2002/10/22 07:38:35 lapp Exp $ +# +# BioPerl module for Bio::Location::Simple +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::Simple - Implementation of a Simple Location on a Sequence + +=head1 SYNOPSIS + + use Bio::Location::Simple; + + my $location = new Bio::Location::Simple(-start => 1, -end => 100, + -strand => 1 ); + + if( $location->strand == -1 ) { + printf "complement(%d..%d)\n", $location->start, $location->end; + } else { + printf "%d..%d\n", $location->start, $location->end; + } + +=head1 DESCRIPTION + +This is an implementation of Bio::LocationI to manage exact location +information on a Sequence: '22' or '12..15' or '16^17'. + +You can test the type of the location using lenght() function () or +directly location_type() which can one of two values: 'EXACT' or +'IN-BETWEEN'. + + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::Simple; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Location::Atomic; + + +@ISA = qw( Bio::Location::Atomic ); + +BEGIN { + use vars qw( %RANGEENCODE %RANGEDECODE ); + + %RANGEENCODE = ('\.\.' => 'EXACT', + '\^' => 'IN-BETWEEN' ); + + %RANGEDECODE = ('EXACT' => '..', + 'IN-BETWEEN' => '^' ); + +} + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($locationtype) = $self->_rearrange([qw(LOCATION_TYPE)],@args); + + $locationtype && $self->location_type($locationtype); + + return $self; +} + +=head2 start + + Title : start + Usage : $start = $loc->start(); + Function: get/set the start of this range + Returns : the start of this range + Args : optionaly allows the start to be set + : using $loc->start($start) + +=cut + +sub start { + my ($self, $value) = @_; + + $self->{'_start'} = $value if defined $value ; + + $self->throw("Only adjacent residues when location type ". + "is IN-BETWEEN. Not [". $self->{'_start'}. "] and [". + $self->{'_end'}. "]" ) + if defined $self->{'_start'} && defined $self->{'_end'} && + $self->location_type eq 'IN-BETWEEN' && + ($self->{'_end'} - 1 != $self->{'_start'}); + return $self->{'_start'}; +} + + +=head2 end + + Title : end + Usage : $end = $loc->end(); + Function: get/set the end of this range + Returns : the end of this range + Args : optionaly allows the end to be set + : using $loc->end($start) + +=cut + +sub end { + my ($self, $value) = @_; + + $self->{'_end'} = $value if defined $value ; + $self->throw("Only adjacent residues when location type ". + "is IN-BETWEEN. Not [". $self->{'_start'}. "] and [". + $self->{'_end'}. "]" ) + if defined $self->{'_start'} && defined $self->{'_end'} && + $self->location_type eq 'IN-BETWEEN' && + ($self->{'_end'} - 1 != $self->{'_start'}); + + return $self->{'_end'}; +} + +=head2 strand + + Title : strand + Usage : $strand = $loc->strand(); + Function: get/set the strand of this range + Returns : the strandidness (-1, 0, +1) + Args : optionaly allows the strand to be set + : using $loc->strand($strand) + +=cut + +=head2 length + + Title : length + Usage : $len = $loc->length(); + Function: get the length in the coordinate space this location spans + Example : + Returns : an integer + Args : none + + +=cut + +sub length { + my ($self) = @_; + if ($self->location_type eq 'IN-BETWEEN' ) { + return 0; + } else { + return abs($self->end - $self->start) + 1; + } + +} + +=head2 min_start + + Title : min_start + Usage : my $minstart = $location->min_start(); + Function: Get minimum starting location of feature startpoint + Returns : integer or undef if no minimum starting point. + Args : none + +=cut + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get maximum starting location of feature startpoint. + + In this implementation this is exactly the same as min_start(). + + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get start position type (ie <,>, ^). + + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN', 'IN-BETWEEN') + Args : none + +=cut + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : none + +=cut + + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get maximum ending location of feature endpoint + + In this implementation this is exactly the same as min_end(). + + Returns : integer or undef if no maximum ending point. + Args : none + +=cut + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get end position type (ie <,>, ^) + + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN', 'IN-BETWEEN') + Args : none + +=cut + +=head2 location_type + + Title : location_type + Usage : my $location_type = $location->location_type(); + Function: Get location type encoded as text + Returns : string ('EXACT' or 'IN-BETWEEN') + Args : 'EXACT' or '..' or 'IN-BETWEEN' or '^' + +=cut + +sub location_type { + my ($self, $value) = @_; + + if( defined $value || ! defined $self->{'_location_type'} ) { + $value = 'EXACT' unless defined $value; + $value = uc $value; + if (! defined $RANGEDECODE{$value}) { + $value = '\^' if $value eq '^'; + $value = '\.\.' if $value eq '..'; + $value = $RANGEENCODE{$value}; + } + $self->throw("Did not specify a valid location type. [$value] is no good") + unless defined $value; + $self->{'_location_type'} = $value; + } + $self->throw("Only adjacent residues when location type ". + "is IN-BETWEEN. Not [". $self->{'_start'}. "] and [". + $self->{'_end'}. "]" ) + if $self->{'_location_type'} eq 'IN-BETWEEN' && + defined $self->{'_start'} && + defined $self->{'_end'} && + ($self->{'_end'} - 1 != $self->{'_start'}); + + return $self->{'_location_type'}; +} + +=head2 is_remote + + Title : is_remote + Usage : $self->is_remote($newval) + Function: Getset for is_remote value + Returns : value of is_remote + Args : newvalue (optional) + + +=cut + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: returns the FeatureTable string of this location + Returns : string + Args : none + +=cut + +sub to_FTstring { + my($self) = @_; + + my $str; + if( $self->start == $self->end ) { + return $self->start; + } + $str = $self->start . $RANGEDECODE{$self->location_type} . $self->end; + if($self->is_remote() && $self->seq_id()) { + $str = $self->seq_id() . ":" . $str; + } + if( $self->strand == -1 ) { + $str = "complement(".$str.")"; + } + return $str; +} + +# +# not tested +# +sub trunc { + my ($self,$start,$end,$relative_ori) = @_; + + my $newstart = $self->start - $start+1; + my $newend = $self->end - $start+1; + my $newstrand = $relative_ori * $self->strand; + + my $out; + if( $newstart < 1 || $newend > ($end-$start+1) ) { + $out = Bio::Location::Simple->new(); + $out->start($self->start); + $out->end($self->end); + $out->strand($self->strand); + $out->seq_id($self->seqid); + $out->is_remote(1); + } else { + $out = Bio::Location::Simple->new(); + $out->start($newstart); + $out->end($newend); + $out->strand($newstrand); + $out->seq_id(); + } + + return $out; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/Split.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/Split.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,579 @@ +# $Id: Split.pm,v 1.35 2002/12/28 03:26:32 lapp Exp $ +# +# BioPerl module for Bio::Location::SplitLocation +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::Split - Implementation of a Location on a Sequence +which has multiple locations (start/end points) + +=head1 SYNOPSIS + + use Bio::Location::Split; + + my $splitlocation = new Bio::Location::Split(); + $splitlocation->add_sub_Location(new Bio::Location::Simple(-start=>1, + -end=>30, + -strand=>1)); + $splitlocation->add_sub_Location(new Bio::Location::Simple(-start=>50, + -end=>61, + -strand=>1)); + my @sublocs = $splitlocation->sub_Location(); + + my $count = 1; + # print the start/end points of the sub locations + foreach my $location ( sort { $a->start <=> $b->start } + @sublocs ) { + printf "sub feature %d [%d..%d]\n", + $count, $location->start,$location->end, "\n"; + $count++; + } + +=head1 DESCRIPTION + +This implementation handles locations which span more than one +start/end location, or and/or lie on different sequences. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::Split; +use vars qw(@ISA @CORBALOCATIONOPERATOR); +use strict; + +use Bio::Root::Root; +use Bio::Location::SplitLocationI; +use Bio::Location::Atomic; + +@ISA = qw(Bio::Location::Atomic Bio::Location::SplitLocationI ); + +BEGIN { + # as defined by BSANE 0.03 + @CORBALOCATIONOPERATOR= ('NONE','JOIN', undef, 'ORDER'); +} + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + # initialize + $self->{'_sublocations'} = []; + my ( $type, $seqid, $locations ) = + $self->_rearrange([qw(SPLITTYPE + SEQ_ID + LOCATIONS + )], @args); + if( defined $locations && ref($locations) =~ /array/i ) { + $self->add_sub_Location(@$locations); + } + $seqid && $self->seq_id($seqid); + $type = lc ($type); + $self->splittype($type || 'JOIN'); + return $self; +} + +=head2 each_Location + + Title : each_Location + Usage : @locations = $locObject->each_Location($order); + Function: Conserved function call across Location:: modules - will + return an array containing the component Location(s) in + that object, regardless if the calling object is itself a + single location or one containing sublocations. + Returns : an array of Bio::LocationI implementing objects + Args : Optional sort order to be passed to sub_Location() + +=cut + +sub each_Location { + my ($self, $order) = @_; + my @locs = (); + foreach my $subloc ($self->sub_Location($order)) { + # Recursively check to get hierarchical split locations: + push @locs, $subloc->each_Location($order); + } + return @locs; +} + +=head2 sub_Location + + Title : sub_Location + Usage : @sublocs = $splitloc->sub_Location(); + Function: Returns the array of sublocations making up this compound (split) + location. Those sublocations referring to the same sequence as + the root split location will be sorted by start position (forward + sort) or end position (reverse sort) and come first (before + those on other sequences). + + The sort order can be optionally specified or suppressed by the + value of the first argument. The default is no sort. + + Returns : an array of Bio::LocationI implementing objects + Args : Optionally 1, 0, or -1 for specifying a forward, no, or reverse + sort order + +=cut + +sub sub_Location { + my ($self, $order) = @_; + $order = 0 unless defined $order; + if( defined($order) && ($order !~ /^-?\d+$/) ) { + $self->throw("value $order passed in to sub_Location is $order, an invalid value"); + } + $order = 1 if($order > 1); + $order = -1 if($order < -1); + + my @sublocs = defined $self->{'_sublocations'} ? @{$self->{'_sublocations'}} : (); + + # return the array if no ordering requested + return @sublocs if( ($order == 0) || (! @sublocs) ); + + # sort those locations that are on the same sequence as the top (`master') + # if the top seq is undefined, we take the first defined in a sublocation + my $seqid = $self->seq_id(); + my $i = 0; + while((! defined($seqid)) && ($i <= $#sublocs)) { + $seqid = $sublocs[$i++]->seq_id(); + } + if((! $self->seq_id()) && $seqid) { + $self->warn("sorted sublocation array requested but ". + "root location doesn't define seq_id ". + "(at least one sublocation does!)"); + } + my @locs = ($seqid ? + grep { $_->seq_id() eq $seqid; } @sublocs : + @sublocs); + if(@locs) { + if($order == 1) { + # Schwartzian transforms for performance boost + @locs = map { $_->[0] } + sort { (defined $a && defined $b) ? + $a->[1] <=> $b->[1] : $a ? -1 : 1 } + map { [$_, $_->start] } @locs; + + } else { # $order == -1 + @locs = map {$_->[0]} + sort { + (defined $a && defined $b) ? + $b->[1] <=> $a->[1] : $a ? -1 : 1 } + map { [$_, $_->end] } @locs; + } + } + # push the rest unsorted + if($seqid) { + push(@locs, grep { $_->seq_id() ne $seqid; } @sublocs); + } + # done! + return @locs; +} + +=head2 add_sub_Location + + Title : add_sub_Location + Usage : $splitloc->add_sub_Location(@locationIobjs); + Function: add an additional sublocation + Returns : number of current sub locations + Args : list of Bio::LocationI implementing object(s) to add + +=cut + +sub add_sub_Location { + my ($self,@args) = @_; + my @locs; + foreach my $loc ( @args ) { + if( !ref($loc) || ! $loc->isa('Bio::LocationI') ) { + $self->throw("Trying to add $loc as a sub Location but it doesn't implement Bio::LocationI!"); + next; + } + push @{$self->{'_sublocations'}}, $loc; + } + + return scalar @{$self->{'_sublocations'}}; +} + +=head2 splittype + + Title : splittype + Usage : $splittype = $fuzzy->splittype(); + Function: get/set the split splittype + Returns : the splittype of split feature (join, order) + Args : splittype to set + +=cut + +sub splittype { + my ($self, $value) = @_; + if( defined $value || ! defined $self->{'_splittype'} ) { + $value = 'JOIN' unless( defined $value ); + $self->{'_splittype'} = uc ($value); + } + return $self->{'_splittype'}; +} + +=head2 is_single_sequence + + Title : is_single_sequence + Usage : if($splitloc->is_single_sequence()) { + print "Location object $splitloc is split ". + "but only across a single sequence\n"; + } + Function: Determine whether this location is split across a single or + multiple sequences. + + This implementation ignores (sub-)locations that do not define + seq_id(). The same holds true for the root location. + + Returns : TRUE if all sublocations lie on the same sequence as the root + location (feature), and FALSE otherwise. + Args : none + +=cut + +sub is_single_sequence { + my ($self) = @_; + + my $seqid = $self->seq_id(); + foreach my $loc ($self->sub_Location(0)) { + $seqid = $loc->seq_id() if(! $seqid); + if(defined($loc->seq_id()) && ($loc->seq_id() ne $seqid)) { + return 0; + } + } + return 1; +} + +=head1 LocationI methods + +=head2 strand + + Title : strand + Usage : $obj->strand($newval) + Function: For SplitLocations, setting the strand of the container + (this object) is a short-cut for setting the strand of all + sublocations. + + In get-mode, checks if no sub-location is remote, and if + all have the same strand. If so, it returns that shared + strand value. Otherwise it returns undef. + + Example : + Returns : on get, value of strand if identical between sublocations + (-1, 1, or undef) + Args : new value (-1 or 1, optional) + + +=cut + +sub strand{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'strand'} = $value; + # propagate to all sublocs + foreach my $loc ($self->sub_Location(0)) { + $loc->strand($value) if ! $loc->is_remote(); + } + } else { + my ($strand, $lstrand); + foreach my $loc ($self->sub_Location(0)) { + # we give up upon any location that's remote or doesn't have + # the strand specified, or has a differing one set than + # previously seen. + # calling strand() is potentially expensive if the subloc is also + # a split location, so we cache it + $lstrand = $loc->strand(); + if((! $lstrand) || + ($strand && ($strand != $lstrand)) || + $loc->is_remote()) { + $strand = undef; + last; + } elsif(! $strand) { + $strand = $lstrand; + } + } + return $strand; + } +} + +=head2 start + + Title : start + Usage : $start = $location->start(); + Function: get the starting point of the first (sorted) sublocation + Returns : integer + Args : none + +=cut + +sub start { + my ($self,$value) = @_; + if( defined $value ) { + $self->throw("Trying to set the starting point of a split location, that is not possible, try manipulating the sub Locations"); + } + return $self->SUPER::start(); +} + +=head2 end + + Title : end + Usage : $end = $location->end(); + Function: get the ending point of the last (sorted) sublocation + Returns : integer + Args : none + +=cut + +sub end { + my ($self,$value) = @_; + if( defined $value ) { + $self->throw("Trying to set the ending point of a split location, that is not possible, try manipulating the sub Locations"); + } + return $self->SUPER::end(); +} + +=head2 min_start + + Title : min_start + Usage : $min_start = $location->min_start(); + Function: get the minimum starting point + Returns : the minimum starting point from the contained sublocations + Args : none + +=cut + +sub min_start { + my ($self, $value) = @_; + + if( defined $value ) { + $self->throw("Trying to set the minimum starting point of a split location, that is not possible, try manipulating the sub Locations"); + } + my @locs = $self->sub_Location(1); + return $locs[0]->min_start() if @locs; + return undef; +} + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get maximum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +sub max_start { + my ($self,$value) = @_; + + if( defined $value ) { + $self->throw("Trying to set the maximum starting point of a split location, that is not possible, try manipulating the sub Locations"); + } + my @locs = $self->sub_Location(1); + return $locs[0]->max_start() if @locs; + return undef; +} + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get start position type (ie <,>, ^) + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +sub start_pos_type { + my ($self,$value) = @_; + + if( defined $value ) { + $self->throw("Trying to set the start_pos_type of a split location, that is not possible, try manipulating the sub Locations"); + } + my @locs = $self->sub_Location(); + return ( @locs ) ? $locs[0]->start_pos_type() : undef; +} + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : none + +=cut + +sub min_end { + my ($self,$value) = @_; + + if( defined $value ) { + $self->throw("Trying to set the minimum end point of a split location, that is not possible, try manipulating the sub Locations"); + } + # reverse sort locations by largest ending to smallest ending + my @locs = $self->sub_Location(-1); + return $locs[0]->min_end() if @locs; + return undef; +} + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get maximum ending location of feature endpoint + Returns : integer or undef if no maximum ending point. + Args : none + +=cut + +sub max_end { + my ($self,$value) = @_; + + if( defined $value ) { + $self->throw("Trying to set the maximum end point of a split location, that is not possible, try manipulating the sub Locations"); + } + # reverse sort locations by largest ending to smallest ending + my @locs = $self->sub_Location(-1); + return $locs[0]->max_end() if @locs; + return undef; +} + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get end position type (ie <,>, ^) + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +sub end_pos_type { + my ($self,$value) = @_; + + if( defined $value ) { + $self->throw("Trying to set end_pos_type of a split location, that is not possible, try manipulating the sub Locations"); + } + my @locs = $self->sub_Location(); + return ( @locs ) ? $locs[0]->end_pos_type() : undef; +} + + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + + We override this here in order to propagate to all sublocations + which are not remote (provided this root is not remote either) + Returns : seq_id + Args : [optional] seq_id value to set + + +=cut + +sub seq_id { + my ($self, $seqid) = @_; + + if(! $self->is_remote()) { + foreach my $subloc ($self->sub_Location(0)) { + $subloc->seq_id($seqid) if ! $subloc->is_remote(); + } + } + return $self->SUPER::seq_id($seqid); +} + +=head2 coordinate_policy + + Title : coordinate_policy + Usage : $policy = $location->coordinate_policy(); + $location->coordinate_policy($mypolicy); # set may not be possible + Function: Get the coordinate computing policy employed by this object. + + See Bio::Location::CoordinatePolicyI for documentation about + the policy object and its use. + + The interface *does not* require implementing classes to accept + setting of a different policy. The implementation provided here + does, however, allow to do so. + + Implementors of this interface are expected to initialize every + new instance with a CoordinatePolicyI object. The implementation + provided here will return a default policy object if none has + been set yet. To change this default policy object call this + method as a class method with an appropriate argument. Note that + in this case only subsequently created Location objects will be + affected. + + Returns : A Bio::Location::CoordinatePolicyI implementing object. + Args : On set, a Bio::Location::CoordinatePolicyI implementing object. + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: returns the FeatureTable string of this location + Returns : string + Args : none + +=cut + +sub to_FTstring { + my ($self) = @_; + my @strs; + foreach my $loc ( $self->sub_Location() ) { + my $str = $loc->to_FTstring(); + # we only append the remote seq_id if it hasn't been done already + # by the sub-location (which it should if it knows it's remote) + # (and of course only if it's necessary) + if( (! $loc->is_remote) && + defined($self->seq_id) && defined($loc->seq_id) && + ($loc->seq_id ne $self->seq_id) ) { + $str = sprintf("%s:%s", $loc->seq_id, $str); + } + push @strs, $str; + } + + my $str = sprintf("%s(%s)",lc $self->splittype, join(",", @strs)); + return $str; +} + +# we'll probably need to override the RangeI methods since our locations will +# not be contiguous. + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/SplitLocationI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/SplitLocationI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,244 @@ +# $Id: SplitLocationI.pm,v 1.14 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::SplitLocationI +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SplitLocationI - Abstract interface of a Location on a Sequence +which has multiple locations (start/end points) + +=head1 SYNOPSIS + + # get a SplitLocationI somehow + print $splitlocation->start, "..", $splitlocation->end, "\n"; + my @sublocs = $splitlocation->sub_Location(); + + my $count = 1; + # print the start/end points of the sub locations + foreach my $location ( sort { $a->start <=> $b->start } + @sublocs ) { + printf "sub feature %d [%d..%d]\n", $location->start,$location->end; + $count++; + } + +=head1 DESCRIPTION + +This interface encapsulates the necessary methods for representing the +location of a sequence feature that has more that just a single +start/end pair. Some examples of this are the annotated exons in a +gene or the annotated CDS in a sequence file. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::SplitLocationI; +use vars qw(@ISA); +use strict; + +use Bio::LocationI; +use Carp; + +@ISA = qw(Bio::LocationI); + + +=head2 sub_Location + + Title : sub_Location + Usage : @locations = $feat->sub_Location(); + Function: Returns an array of LocationI objects + Returns : An array + Args : none + +=cut + +sub sub_Location { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 splittype + + Title : splittype + Usage : $splittype = $fuzzy->splittype(); + Function: get/set the split splittype + Returns : the splittype of split feature (join, order) + Args : splittype to set + +=cut + +sub splittype { + my($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 is_single_sequence + + Title : is_single_sequence + Usage : if($splitloc->is_single_sequence()) { + print "Location object $splitloc is split ". + "but only across a single sequence\n"; + } + Function: Determine whether this location is split across a single or + multiple sequences. + Returns : TRUE if all sublocations lie on the same sequence as the root + location (feature), and FALSE otherwise. + Args : none + +=cut + +sub is_single_sequence { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head1 Bio::LocationI methods + +Bio::LocationI inherited methods follow + +=head2 min_start + + Title : min_start + Usage : my $minstart = $location->min_start(); + Function: Get minimum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get maximum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get start position type (ie <,>, ^) + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : none + +=cut + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get maximum ending location of feature endpoint + Returns : integer or undef if no maximum ending point. + Args : none + +=cut + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get end position type (ie <,>, ^) + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + Returns : seq_id + Args : [optional] seq_id value to set + +=cut + +=head2 coordinate_policy + + Title : coordinate_policy + Usage : $policy = $location->coordinate_policy(); + $location->coordinate_policy($mypolicy); # set may not be possible + Function: Get the coordinate computing policy employed by this object. + + See Bio::Location::CoordinatePolicyI for documentation about + the policy object and its use. + + The interface *does not* require implementing classes to accept + setting of a different policy. The implementation provided here + does, however, allow to do so. + + Implementors of this interface are expected to initialize every + new instance with a CoordinatePolicyI object. The implementation + provided here will return a default policy object if none has + been set yet. To change this default policy object call this + method as a class method with an appropriate argument. Note that + in this case only subsequently created Location objects will be + affected. + + Returns : A Bio::Location::CoordinatePolicyI implementing object. + Args : On set, a Bio::Location::CoordinatePolicyI implementing object. + +=cut + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: returns the FeatureTable string of this location + Returns : string + Args : none + +=cut + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Location/WidestCoordPolicy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/WidestCoordPolicy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,128 @@ +# $Id: WidestCoordPolicy.pm,v 1.6 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::WidestCoordPolicy +# +# Cared for by Hilmar Lapp +# and Jason Stajich +# +# Copyright Hilmar Lapp, Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::WidestCoordPolicy - class implementing +Bio::Location::CoordinatePolicy as the widest possible and reasonable range + +=head1 SYNOPSIS + +See Bio::Location::CoordinatePolicyI + +=head1 DESCRIPTION + +CoordinatePolicyI implementing objects are used by Bio::LocationI +implementing objects to determine integer-valued coordinates when +asked for it. + +This class will compute the coordinates such that always the widest possible +range is returned, but by using some common sense. This means that e.g. +locations like "E5..100" (start before position 5) will return 5 as start +(returned values have to be positive integers). + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email hlapp@gmx.net, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Location::WidestCoordPolicy; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Location::CoordinatePolicyI; + +@ISA = qw(Bio::Root::Root Bio::Location::CoordinatePolicyI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + return $self; +} + + + +=head2 start + + Title : start + Usage : $start = $policy->start($location); + Function: Get the integer-valued start coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub start { + my ($self,$loc) = @_; + + # For performance reasons we don't check that it's indeed a Bio::LocationI + # object. Hopefully, Location-object programmers are smart enough. + my $pos = $loc->min_start(); + # if min is not defined or equals 0 we resort to max + $pos = $loc->max_start() if(! $pos); + return $pos; +} + +=head2 end + + Title : end + Usage : $end = $policy->end($location); + Function: Get the integer-valued end coordinate of the given location as + computed by this computation policy. + Returns : A positive integer number. + Args : A Bio::LocationI implementing object. + +=cut + +sub end { + my ($self,$loc) = @_; + + # For performance reasons we don't check that it's indeed a Bio::LocationI + # object. Hopefully, Location-object programmers are smart enough. + my $pos = $loc->max_end(); + # if max is not defined or equals 0 we resort to min + $pos = $loc->min_end() if(! $pos); + return $pos; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/LocationI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LocationI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,415 @@ +# $Id: LocationI.pm,v 1.18 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module for Bio::LocationI +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LocationI - Abstract interface of a Location on a Sequence + +=head1 SYNOPSIS + + # get a LocationI somehow + printf( "start = %d, end = %d, strand = %s, seq_id = %s\n", + $location->start, $location->end, $location->strand, + $location->seq_id); + print "location str is ", $location->to_FTstring(), "\n"; + + +=head1 DESCRIPTION + +This Interface defines the methods for a Bio::LocationI, an object +which encapsulates a location on a biological sequence. Locations +need not be attached to actual sequences as they are stand alone +objects. LocationI objects are used by L objects to +manage and represent locations for a Sequence Feature. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LocationI; +use vars qw(@ISA $coord_policy); +use strict; + +use Bio::RangeI; +use Bio::Location::WidestCoordPolicy; +use Carp; + +@ISA = qw(Bio::RangeI); + +BEGIN { + $coord_policy = Bio::Location::WidestCoordPolicy->new(); +} + +=head2 location_type + + Title : location_type + Usage : my $location_type = $location->location_type(); + Function: Get location type encoded as text + Returns : string ('EXACT', 'WITHIN', 'BETWEEN') + Args : none + +=cut + +sub location_type { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 start + + Title : start + Usage : $start = $location->start(); + Function: Get the start coordinate of this location as defined by + the currently active coordinate computation policy. In + simple cases, this will return the same number as + min_start() and max_start(), in more ambiguous cases like + fuzzy locations the number may be equal to one or neither + of both. + + We override this here from RangeI in order to delegate + 'get' to a L implementing + object. Implementing classes may also wish to provide + 'set' functionality, in which case they *must* override + this method. The implementation provided here will throw + an exception if called with arguments. + + Returns : A positive integer value. + Args : none + +See L for more information + +=cut + +sub start { + my ($self,@args) = @_; + + # throw if @args means that we don't support updating information + # in the interface but will delegate to the coordinate policy object + # for interpreting the 'start' value + + $self->throw_not_implemented if @args; + return $self->coordinate_policy()->start($self); +} + +=head2 end + + Title : end + Usage : $end = $location->end(); + Function: Get the end coordinate of this location as defined by the + currently active coordinate computation policy. In simple + cases, this will return the same number as min_end() and + max_end(), in more ambiguous cases like fuzzy locations + the number may be equal to one or neither of both. + + We override this here from Bio::RangeI in order to delegate + 'get' to a L implementing + object. Implementing classes may also wish to provide + 'set' functionality, in which case they *must* override + this method. The implementation provided here will throw + an exception if called with arguments. + + Returns : A positive integer value. + Args : none + +See L and L for more +information + +=cut + +sub end { + my ($self,@args) = @_; + + # throw if @args means that we don't support updating information + # in the interface but will delegate to the coordinate policy object + # for interpreting the 'end' value + $self->throw_not_implemented if @args; + return $self->coordinate_policy()->end($self); +} + +=head2 min_start + + Title : min_start + Usage : my $minstart = $location->min_start(); + Function: Get minimum starting point of feature. + + Note that an implementation must not call start() in this method. + + Returns : integer or undef if no minimum starting point. + Args : none + +=cut + +sub min_start { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get maximum starting point of feature. + + Note that an implementation must not call start() in this method + unless start() is overridden such as not to delegate to the + coordinate computation policy object. + + Returns : integer or undef if no maximum starting point. + Args : none + +=cut + +sub max_start { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get start position type encoded as text + + Known valid values are 'BEFORE' (<5..100), 'AFTER' (>5..100), + 'EXACT' (5..100), 'WITHIN' ((5.10)..100), 'BETWEEN', (5^6), with + their meaning best explained by their GenBank/EMBL location string + encoding in brackets. + + Returns : string ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +sub start_pos_type { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get minimum ending point of feature. + + Note that an implementation must not call end() in this method + unless end() is overridden such as not to delegate to the + coordinate computation policy object. + + Returns : integer or undef if no minimum ending point. + Args : none + +=cut + +sub min_end { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get maximum ending point of feature. + + Note that an implementation must not call end() in this method + unless end() is overridden such as not to delegate to the + coordinate computation policy object. + + Returns : integer or undef if no maximum ending point. + Args : none + +=cut + +sub max_end { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get end position encoded as text. + + Known valid values are 'BEFORE' (5..<100), 'AFTER' (5..>100), + 'EXACT' (5..100), 'WITHIN' (5..(90.100)), 'BETWEEN', (5^6), with + their meaning best explained by their GenBank/EMBL location string + encoding in brackets. + + Returns : string ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : none + +=cut + +sub end_pos_type { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + Returns : seq_id (a string) + Args : [optional] seq_id value to set + +=cut + +sub seq_id { + my ($self, $seqid) = @_; + if( defined $seqid ) { + $self->{'_seqid'} = $seqid; + } + return $self->{'_seqid'}; +} + +=head2 is_remote + + Title : is_remote + Usage : $is_remote_loc = $loc->is_remote() + Function: Whether or not a location is a remote location. + + A location is said to be remote if it is on a different + 'object' than the object which 'has' this + location. Typically, features on a sequence will sometimes + have a remote location, which means that the location of + the feature is on a different sequence than the one that is + attached to the feature. In such a case, $loc->seq_id will + be different from $feat->seq_id (usually they will be the + same). + + While this may sound weird, it reflects the location of the + kind of AB18375:450-900 which can be found in GenBank/EMBL + feature tables. + + Example : + Returns : TRUE if the location is a remote location, and FALSE otherwise + Args : + + +=cut + +sub is_remote{ + shift->throw_not_implemented(); +} + +=head2 coordinate_policy + + Title : coordinate_policy + Usage : $policy = $location->coordinate_policy(); + $location->coordinate_policy($mypolicy); # set may not be possible + Function: Get the coordinate computing policy employed by this object. + + See L for documentation + about the policy object and its use. + + The interface *does not* require implementing classes to + accept setting of a different policy. The implementation + provided here does, however, allow to do so. + + Implementors of this interface are expected to initialize + every new instance with a + L object. The + implementation provided here will return a default policy + object if none has been set yet. To change this default + policy object call this method as a class method with an + appropriate argument. Note that in this case only + subsequently created Location objects will be affected. + + Returns : A L implementing object. + Args : On set, a L implementing object. + +See L for more information + + +=cut + +sub coordinate_policy { + my ($self, $policy) = @_; + + if(defined($policy)) { + if(! $policy->isa('Bio::Location::CoordinatePolicyI')) { + $self->throw("Object of class ".ref($policy)." does not implement". + " Bio::Location::CoordinatePolicyI"); + } + if(ref($self)) { + $self->{'_coordpolicy'} = $policy; + } else { + # called as class method + $coord_policy = $policy; + } + } + return (ref($self) && exists($self->{'_coordpolicy'}) ? + $self->{'_coordpolicy'} : $coord_policy); +} + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: returns the FeatureTable string of this location + Returns : string + Args : none + +=cut + +sub to_FTstring { + my($self) = @_; + $self->throw_not_implemented(); +} + +=head2 each_Location + + Title : each_Location + Usage : @locations = $locObject->each_Location($order); + Function: Conserved function call across Location:: modules - will + return an array containing the component Location(s) in + that object, regardless if the calling object is itself a + single location or one containing sublocations. + Returns : an array of Bio::LocationI implementing objects + Args : Optional sort order to be passed to sub_Location() for Splits + +=cut + +sub each_Location { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/CytoMap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/CytoMap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,230 @@ +# $Id: CytoMap.pm,v 1.2 2002/10/22 07:45:15 lapp Exp $ +# +# BioPerl module for Bio::Map::CytoMap +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::CytoMap - A Bio::MapI compliant map implementation handling cytogenic bands + +=head1 SYNOPSIS + + use Bio::Map::CytoMap; + my $map = new Bio::Map::CytoMap(-name => 'human1', + -species => $human); + foreach my $marker ( @markers ) { # get a list of markers somewhere + $map->add_element($marker); + } + +=head1 DESCRIPTION + +This is the simple implementation of cytogenetic maps based on +L. It handles the essential storage of name, species, +type, and units as well as in memory representation of the elements of +a map. + +For CytoMaps type is hard coded to be 'cytogeneticmap' and +units are set to '' but can be set to something else. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email heikki@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich jason@bioperl.org +Lincoln Stein lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::CytoMap; +use vars qw(@ISA $MAPCOUNT); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Map::SimpleMap; + +@ISA = qw(Bio::Root::Root Bio::Map::SimpleMap); +BEGIN { $MAPCOUNT = 1; } + +=head2 Modified methods + +All methods present in L are implemted by this +class. Most of the methods are inherited from SimpleMap. The following +methods have been modified to refelect the needs of cytogenetic maps. + +=head2 new + + Title : new + Usage : my $obj = new Bio::Map::CytoMap(); + Function: Builds a new Bio::Map::CytoMap object + Returns : Bio::Map::CytoMap + Args : -name => name of map (string) + -species => species for this map (Bio::Species) [optional] + -elements=> elements to initialize with + (arrayref of Bio::Map::MappableI objects) [optional] + + -uid => Unique Id +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_elements'} = []; + $self->{'_name'} = ''; + $self->{'_species'} = ''; + $self->{'_units'} = ''; + $self->{'_type'} = 'cyto'; + $self->{'_uid'} = $MAPCOUNT++; + my ($name, $type,$species, $units, + $elements,$uid) = $self->_rearrange([qw(NAME TYPE + SPECIES UNITS + ELEMENTS UID)], @args); + defined $name && $self->name($name); + defined $species && $self->species($species); + defined $units && $self->units($units); + defined $type && $self->type($type); + defined $uid && $self->unique_id($uid); + + if( $elements && ref($elements) =~ /array/ ) { + foreach my $item ( @$elements ) { + $self->add_element($item); + } + } + return $self; +} + +=head2 type + + Title : type + Usage : my $type = $map->type + Function: Get hard-coded Map type + Returns : String coding map type + Args : + +=cut + +sub type { + my ($self) = @_; + return $self->{'_type'}; +} + + +=head2 length + + Title : length + Usage : my $length = $map->length(); + Function: Retrieves the length of the map, + Returns : undef since length is not calculatable for + cytogenetic maps + Args : none + +=cut + +sub length{ + my ($self,@args) = @_; + return undef; +} + +=head2 Methods inherited from L + +=cut + +=head2 species + + Title : species + Usage : my $species = $map->species; + Function: Get/Set Species for a map + Returns : Bio::Species object or string + Args : (optional) Bio::Species or string + +=cut + +=head2 units + + Title : units + Usage : $map->units('cM'); + Function: Get/Set units for a map + Returns : units for a map + Args : units for a map (string) + +=cut + +=head2 name + + Title : name + Usage : my $name = $map->name + Function: Get/Set Map name + Returns : Map name + Args : (optional) string + +=cut + +=head2 unique_id + + Title : unique_id + Usage : my $id = $map->unique_id; + Function: Get/Set the unique ID for this map + Returns : a unique identifier + Args : [optional] new identifier to set + +=cut + +=head2 each_element + + Title : each_element + Usage : my @elements = $map->each_element; + Function: Retrieves all the elements in a map + unordered + Returns : Array of Bio::Map::MappableI objects + Args : none + + +=cut + +=head2 New methods + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/CytoMarker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/CytoMarker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,320 @@ +# $Id: CytoMarker.pm,v 1.3 2002/10/22 07:45:15 lapp Exp $ +# +# BioPerl module for Bio::Map::CytoMarker +# +# Cared for by Heikki Lehvaslaiho heikki@ebi.ac.uk +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::CytoMarker - An object representing a marker. + +=head1 SYNOPSIS + + $o_usat = new Bio::Map::CytoMarker(-name=>'Chad Super Marker 2', + -position => $pos); + +=head1 DESCRIPTION + +This object handles markers with a positon in a cytogenetic map known. +This marker will have a name and a position. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email heikki@ebi.ac.uk + +=head1 CONTRIBUTORS + +Chad Matsalla bioinformatics1@dieselwurks.com +Lincoln Stein lstein@cshl.org +Jason Stajich jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Map::CytoMarker; +use vars qw(@ISA); +use strict; +use Bio::Map::Marker; +use Bio::Map::CytoPosition; +use Bio::RangeI; + +@ISA = qw(Bio::Map::Marker Bio::RangeI ); + + +=head2 Bio::Map::MarkerI methods + +=cut + +=head2 get_position_object + + Title : get_position_class + Usage : my $pos = $marker->get_position_object(); + Function: To get an object of the default Position class + for this Marker. Subclasses should redefine this method. + The Position needs to be L. + Returns : L + Args : none + +=cut + +sub get_position_object { + my ($self) = @_; + return new Bio::Map::CytoPosition(); +} + + +=head2 Comparison methods + +The numeric values for cutogeneic loctions go from the p tip of +chromosome 1, down to the q tip and similarly throgh consecutive +chromosomes, through X and end the the q tip of X. See +L for more details. + +The numeric values for cytogenetic positions are ranges of type +L, so MarkerI type of operators (equals, less_than, +greater_than) are not very meaningful, but they might be of some use +combined with L methods (overlaps, contains, equals, +intersection, union). equals(), present in both interfaces is treated +as a more precice RangeI method. + +CytoMarker has a method L which might turn out to be useful +in this context. + +The L and L methods are implemented by +comparing the end values of the range, so you better first check that +markers do not overlap, or you get an opposite result than expected. +The numerical values are not metric, so avarages are not meaningful. + +Note: These methods always return a value. A false value (0) might +mean that you have not set the position! Check those warnings. + +=cut + +=head2 Bio::Map::MarkerI comparison methods + +=cut + +=head2 tuple + + Title : tuple + Usage : ($me, $you) = $self->_tuple($compare) + Function: Utility method to extract numbers and test for missing values. + Returns : two ranges or tuple of -1 + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +=head2 less_than + + Title : less_than + Usage : if( $mappable->less_than($m2) ) ... + Function: Tests if a position is less than another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + + +sub less_than { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 if $me == -1 or $you == -1 ; + + $me = $me->end; + $you = $you->start; + + print STDERR "me=$me, you=$you\n" if $self->verbose > 0; + return $me < $you; +} + +=head2 greater_than + + Title : greater_than + Usage : if( $mappable->greater_than($m2) ) ... + Function: Tests if position is greater than another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + + +sub greater_than { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 if $me == -1 or $you == -1 ; + + $me = $me->start; + $you = $you->end; + print STDERR "me=$me, you=$you\n" if $self->verbose > 0; + return $me > $you; +} + +=head2 RangeI methods + +=cut + + +=head2 equals + + Title : equals + Usage : if( $mappable->equals($mapable2)) ... + Function: Test if a position is equal to another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub equals { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 unless $me->isa('Bio::RangeI') and $you->isa('Bio::RangeI'); + + return $me->equals($you); +} + +=head2 overlaps + + Title : overlaps + Usage : if($r1->overlaps($r2)) { do stuff } + Function : tests if $r2 overlaps $r1 + Args : a range to test for overlap with + Returns : true if the ranges overlap, false otherwise + Inherited: Bio::RangeI + +=cut + +sub overlaps { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 unless $me->isa('Bio::RangeI') and $you->isa('Bio::RangeI'); + + return $me->overlaps($you); +} + +=head2 contains + + Title : contains + Usage : if($r1->contains($r2) { do stuff } + Function : tests wether $r1 totaly contains $r2 + Args : a range to test for being contained + Returns : true if the argument is totaly contained within this range + Inherited: Bio::RangeI + +=cut + +sub contains { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 unless $me->isa('Bio::RangeI') and $you->isa('Bio::RangeI'); + print STDERR "me=", $me->start. "-", $me->end, " ", + "you=", $you->start. "-", $you->end, "\n" + if $self->verbose > 0; + + return $me->contains($you); +} + +=head2 intersection + + Title : intersection + Usage : ($start, $stop, $strand) = $r1->intersection($r2) + Function : gives the range that is contained by both ranges + Args : a range to compare this one to + Returns : nothing if they do not overlap, or the range that they do overlap + Inherited: Bio::RangeI::intersection + +=cut + +sub intersection { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 unless $me->isa('Bio::RangeI') and $you->isa('Bio::RangeI'); + + return $me->intersection($you); +} + +=head2 union + + Title : union + Usage : ($start, $stop, $strand) = $r1->union($r2); + : ($start, $stop, $strand) = Bio::Range->union(@ranges); + Function : finds the minimal range that contains all of the ranges + Args : a range or list of ranges to find the union of + Returns : the range containing all of the ranges + Inherited: Bio::RangeI::union + +=cut + +sub union { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 unless $me->isa('Bio::RangeI') and $you->isa('Bio::RangeI'); + + return $me->union($you); +} + + +=head2 New methods + +=cut + + +=head2 get_chr + + Title : get_chr + Usage : my $mychr = $marker->get_chr(); + Function: Read only method for the chromosome string of the location. + A shotrcut to $marker->position->chr(). + Returns : chromosome value + Args : [optional] new chromosome value + +=cut + + +sub get_chr { + my ($self) = @_; + return undef unless $self->position; + return $self->position->chr; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/CytoPosition.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/CytoPosition.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,595 @@ +# $Id: CytoPosition.pm,v 1.4 2002/10/22 07:38:35 lapp Exp $ +# +# BioPerl module for Bio::Map::CytoPosition +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::CytoPosition - Marker class with cytogenetic band storing attributes + +=head1 SYNOPSIS + + $m1 = Bio::Map::CytoPosition->new ( '-id' => 'A1', + '-value' => '2q1-3' + ); + $m2 = Bio::Map::CytoPosition->new ( '-id' => 'A2', + '-value' => '2q2' + ); + + if ($m1->cytorange->overlaps($m2->cytorange)) { + print "Makers overlap\n"; + } + + +=head1 DESCRIPTION + +CytoPosition is marker (Bio::Map::MarkerI compliant) with a location in a +cytogenetic map. See L for more information. + +Cytogenetic locations are names of bands visible in stained mitotic +eucaryotic chromosomes. The naming follows strict rules which are +consistant at least in higher vertebates, e.g. mammals. The chromosome +name preceds the band names. + +The shorter arm of the chromosome is called 'p' ('petit') and usually +drawn pointing up. The lower arm is called 'q' ('queue'). The bands +are named from the region separting these, a centromere (cen), towards +the tips or telomeric regions (ter) counting from 1 upwards. Depending +of the resolution used the bands are identified with one or more +digit. The first digit determines the major band and subsequent digits +sub bands: p1 band can be divided into subbands p11, p12 and 13 and +p11 can furter be divided into subbands p11.1 and p11.2. The dot after +second digit makes it easier to read the values. A region between ands +is given from the centromere outwards towards the telomere (e.g. 2p2-5 +or 3p21-35) or from a band in the p arm to a band in the q arm. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Map::CytoPosition; +use vars qw(@ISA $VERSION); + +use strict; +use integer; + +$VERSION=1.0; + +# Object preamble - inheritance + +use Bio::Variation::VariantI; +use Bio::RangeI; +use Bio::Map::Position; + +@ISA = qw( Bio::Map::Position Bio::Variation::VariantI ); + + +=head2 cytorange + + Title : cytorange + Usage : $obj->cytorange(); + Function: + + Converts cytogenetic location set by value method into + an integer range. The chromosome number determines the + "millions" in the values. Human X and Y chromosome + symbols are represented by values 100 and 101. + + The localization within chromosomes are converted into + values between the range of 0 and 200,000: + + pter cen qter + |------------------------|-------------------------| + 0 100,000 200,000 + + The values between -100,000 through 0 for centromere to + 100,000 would have reflected the band numbering better but + use of positive integers was choosen since the + transformation is very easy. These values are not metric. + + Each band defines a range in a chromosome. A band string + is converted into a range by padding it with lower and and + higher end digits (for q arm: '0' and '9') to the length + of five. The arm and chromosome values are added to these: + e.g. 21000 & 21999 (band 21) + 100,000 (q arm) + 2,000,000 + (chromosome 2) => 2q21 : 2,121,000 .. 2,121,999. Note that + this notation breaks down if there is a band or a subband + using digit 9 in its name! This is not the case in human + karyotype. + + The full algorithm used for bands: + + if arm is 'q' then + pad char for start is '0', for end '9' + range is chromosome + 100,000 + padded range start or end + elsif arm is 'p' then + pad char for start is '9', for end '0' + range is chromosome + 100,000 - padded range start or end + + Example : Returns : Bio::Range object or undef + Args : none + +=cut + + +sub cytorange { + my ($self) = @_; + my ($chr, $r, $band, $band2, $arm, $arm2, $lc, $uc, $lcchar, $ucchar) = undef; + + return $r if not defined $self->value; # returns undef + $self->value =~ + # -----1----- --------2--------- -----3----- -------4------- ---6--- + m/([XY]|[0-9]+)(cen|qcen|pcen|[pq])?(ter|[.0-9]+)?-?([pq]?(cen|ter)?)?([.0-9]+)?/; + $self->warn("Not a valid value: ". $self->value), return $r + if not defined $1 ; # returns undef + + $chr = uc $1; + $self->chr($chr); + + $chr = 100 if $chr eq 'X'; + $chr = 101 if $chr eq 'Y'; + $chr *= 1000000; + + $r = new Bio::Range(); + + $band = ''; + if (defined $3 ) { + $2 || $self->throw("$& does not make sense: 'arm' or 'cen' missing"); + $band = $3; + $band =~ tr/\.//d; + } + if (defined $6 ) { + $arm2 = $4; + $arm2 = $2 if $4 eq ''; # it is not necessary to repeat the arm [p|q] + $band2 = $6; + $band2 =~ tr/\.//d; + #find the correct order +# print STDERR "-|$&|----2|$2|-----3|$band|---4|$4|--------arm2|$arm2|-------------\n"; + if ($band ne '' and (defined $arm2 and $2 ne $arm2 and $arm2 eq 'q') ) { + $lc = 'start'; $lcchar = '9'; + $uc = 'end'; $ucchar = '9'; + } + elsif ($band ne 'ter' and $2 ne $arm2 and $arm2 eq 'p') { + $lc = 'end'; $lcchar = '9'; + $uc = 'start'; $ucchar = '9'; + } + elsif ($band eq 'ter' and $arm2 eq 'p') { + $uc = 'start'; $ucchar = '9'; + } # $2 eq $arm2 + elsif ($arm2 eq 'q') { + if (_pad($band, 5, '0') < _pad($band2, 5, '0')) { + $lc = 'start'; $lcchar = '0'; + $uc = 'end'; $ucchar = '9'; + } else { + $lc = 'end'; $lcchar = '9'; + $uc = 'start'; $ucchar = '0'; + } + } + elsif ($arm2 eq 'p') { + if (_pad($band, 5, '0') < _pad($band2, 5, '0')) { + $lc = 'end'; $lcchar = '0'; + $uc = 'start'; $ucchar = '9'; + } else { + $lc = 'start'; $lcchar = '9'; + $uc = 'end'; $ucchar = '0'; + } + } + else { + $self->throw("How did you end up here? $&"); + } + + #print STDERR "-------$arm2--------$band2---------$ucchar--------------\n"; + if ( (defined $arm2 and $arm2 eq 'p') or (defined $arm2 and $arm2 eq 'p') ) { + $r->$uc(-(_pad($band2, 5, $ucchar)) + 100000 + $chr ); + if (defined $3 and $3 eq 'ter') { + $r->end(200000 + $chr); + } + elsif ($2 eq 'cen' or $2 eq 'qcen' or $2 eq 'pcen'){ + $r->$lc(100000 + $chr); + } + elsif ($2 eq 'q') { + $r->$lc(_pad($band, 5, $lcchar) + 100000 + $chr ); + } else { + $r->$lc(-(_pad($band, 5, $lcchar)) + 100000 + $chr ); + } + } else { #if:$arm2=q e.g. 9p22-q32 + #print STDERR "-------$arm2--------$band2---------$ucchar--------------\n"; + $r->$uc(_pad($band2, 5, $ucchar) + 100000 + $chr); + if ($2 eq 'cen' or $2 eq 'pcen') { + $r->$lc(100000 + $chr); + } + elsif ($2 eq 'p') { + if ($3 eq 'ter') { + $r->$lc(200000 + $chr); + } else { + $r->$lc(-(_pad($band, 5, $lcchar)) + 100000 + $chr); + } + } else { #$2.==q + $r->$lc(_pad($band, 5, $lcchar) + 100000 + $chr); + } + } + } + # + # e.g. 10p22.1-cen + # + elsif (defined $4 and $4 ne '') { + #print STDERR "$4-----$&----\n"; + if ($4 eq 'cen' || $4 eq 'qcen' || $4 eq 'pcen') { # e.g. 10p22.1-cen; + # '10pcen-qter' does not really make sense but lets have it in anyway + $r->end(100000 + $chr); + if ($2 eq 'p') { + if ($3 eq 'ter') { + $r->start($chr); + } else { + $r->start(_pad($band, 5, '9') + $chr); + } + } + elsif ($2 eq 'cen') { + $self->throw("'cen-cen' does not make sense: $&"); + } else { + $self->throw("Only order p-cen is valid: $&"); + } + } + elsif ($4 eq 'qter' || $4 eq 'ter') { # e.g. 10p22.1-qter, 1p21-qter, 10pcen-qter, 7q34-qter + $r->end(200000 + $chr); + if ($2 eq 'p'){ + $r->start(-(_pad($band, 5, '9')) + 100000 + $chr); #??? OK? + } + elsif ($2 eq 'q') { + $r->start(_pad($band, 5, '0') + 100000 + $chr); + } + elsif ($2 eq 'cen' || $2 eq 'qcen' || $2 eq 'pcen' ) { + $r->start(100000 + $chr); + } + } + elsif ($4 eq 'pter' ) { + #print STDERR "$2,$3--$4-----$&----\n"; + $r->start( $chr); + if ($2 eq 'p'){ + $r->end(-(_pad($band, 5, '0')) + 100000 + $chr); + } + elsif ($2 eq 'q') { + $r->end(_pad($band, 5, '9') + 100000 + $chr); + } + elsif ($2 eq 'cen' || $2 eq 'qcen' || $2 eq 'pcen' ) { + $r->end(100000 + $chr); + } + } else { # -p or -q at the end of the range + $self->throw("lone '$4' in $& does not make sense"); + } + } + # + # e.g 10p22.1, 10pter + # + elsif (defined $3 ) { + if ($2 eq 'p') { + if ($3 eq 'ter') { # e.g. 10pter + $r = new Bio::Range('-start' => $chr, + '-end' => $chr, + ); + } else { # e.g 10p22.1 + $r = new Bio::Range('-start' => -(_pad($band, 5, '9')) + 100000 + $chr, + '-end' => -(_pad($band, 5, '0')) + 100000 + $chr, + ); + } + } elsif ($2 eq 'q') { + if ($3 eq 'ter') { # e.g. 10qter + $r = new Bio::Range('-start' => 200000 + $chr, + '-end' => 200000 + $chr, + ); + } else { # e.g 10q22.1 + $r = new Bio::Range('-start' => _pad($band, 5, '0') + 100000 + $chr, + '-end' => _pad($band, 5, '9') + 100000 + $chr, + ); + } + } else { # e.g. 10qcen1.1 ! + $self->throw("'cen' in $& does not make sense"); + } + } + # + # e.g. 10p + # + elsif (defined $2 ) { # e.g. 10p + if ($2 eq'p' ) { + $r = new Bio::Range('-start' => $chr, + '-end' => 100000 + $chr + ); + } + elsif ($2 eq'q' ) { + $r = new Bio::Range('-start' => 100000 + $chr, + '-end' => 200000 + $chr + ); + } else { # $2 eq 'cen' || 'qcen' + $r = new Bio::Range('-start' => 100000 + $chr, + '-end' => 100000 + $chr + ); + } + } + # + # chr only, e.g. X + # + else { + $r = new Bio::Range('-start' => $chr, + '-end' => 200000 + $chr + ); + } + return $r; +} + + +sub _pad { + my ($string, $len, $pad_char) = @_; + die "function _pad needs a positive integer length, not [$len]" + unless $len =~ /^\+?\d+$/; + die "function _pad needs a single character pad_char, not [$pad_char]" + unless length $pad_char == 1; + $string ||= ''; +# $padded = $text . $pad_char x ( $pad_len - length( $text ) ); + return $string . $pad_char x ( $len - length( $string ) ); + +# my $slen = length $string; +# my $add = $len - $slen; +# return $string if $add <= 0; +# return $string .= $char x $add; +} + + +=head2 range2value + + Title : range2value + Usage : $obj->range2value(); + Function: + + Sets and returns the value string based on start and end + values of the Bio::Range object passes as an argument. + + Example : + Returns : string or false + Args : Bio::Range object + +=cut + +sub range2value { + my ($self,$value) = @_; + if( defined $value) { + if( ! $value->isa('Bio::Range') ) { + $self->throw("Is not a Bio::Range object but a [$value]"); + return undef; + } + if( ! $value->start ) { + $self->throw("Start is not defined in [$value]"); + return undef; + } + if( ! $value->end ) { + $self->throw("End is not defined in [$value]"); + return undef; + } + if( $value->start < 100000 ) { + $self->throw("Start value has to be in millions, not ". $value->start); + return undef; + } + if( $value->end < 100000 ) { + $self->throw("End value has to be in millions, not ". $value->end); + return undef; + } + + my ($chr, $arm, $band) = $value->start =~ /(\d+)(\d)(\d{5})/; + my ($chr2, $arm2, $band2) = $value->end =~ /(\d+)(\d)(\d{5})/; + + #print STDERR join ("\t", $value->start, $value->end ),"\n"; + #print STDERR join ("\t", $chr, $arm, $band, $chr2, $arm2, $band2), "\n"; + + my ($chrS, $armS, $bandS, $arm2S, $band2S, $sep) = ('', '', '', '', '', '' ); + LOC: { + # + # chromosome + # + if ($chr == 100) { + $chrS = 'X'; + } + elsif ($chr == 100) { + $chrS = 'Y'; + } else { + $chrS = $chr; + } + last LOC if $arm == 0 and $arm2 == 2 and $band == 0 and $band2 == 0 ; + # + # arm + # + if ($arm == $arm2 ) { + if ($arm == 0) { + $armS = 'p'; + #$armS = 'pter' if $band == 0 and $band2 == 0; + $bandS = 'ter' if $band == 0; + #$arm2S = 'p'; #? + } + elsif ($arm == 2) { + $armS = 'q'; + $bandS = 'ter' if $band == 0; + } else { + $armS = 'q'; + #$arm2S = 'q'; #? + $armS = 'cen', if $band == 0;# and $band2 == 0; + } + } else { + if ($arm == 0) { + $armS = 'p'; + $arm2S = 'q'; + $arm2S = '' if $band == 0 and $band2 == 0; + } else { + $armS = 'q'; + $arm2S = 'p'; + $arm2S = '' if $arm2 == 2 and $band == 0 and $band2 == 0; + } + } + last LOC if $band == $band2 ; + my $c; + # + # first band (ter is hadled with the arm) + # + if ($bandS ne 'ter') { + if ($armS eq 'p') { + $band = 100000 - $band; + $c = '9'; + } else { + $c = '0'; + } + $band =~ s/$c+$//; + $bandS = $band; + $bandS = substr($band, 0, 2). '.'. substr($band, 2) if length $band > 2; + } + last LOC unless $band2; + # + # second band + # + if ($arm2 == 0) { + $arm2S = 'p'; + $band2 = 100000 - $band2; + $c = '0'; + } else { # 1 or 2 + $arm2S = 'q'; + $c = '9'; + } + if ($band2 == 0) { + if ($arm2 == 1) { + $arm2S = 'p'; + $band2S = 'cen'; + } else { + $band2S = 'ter'; + } + last LOC; + } + last LOC if $band eq $band2 and $arm == $arm2; + + $band2 =~ s/$c+$//; + $band2S = $band2; + $band2S = substr($band2, 0, 2). '.'. substr($band2, 2) if length $band2 > 2; + + } # end of LOC: + + if ($armS eq 'p' and $arm2S eq 'p') { + my $tmp = $band2S; + $band2S = $bandS; + $bandS = $tmp; + } + $band2S = '' if $bandS eq $band2S ; + $armS = '' if $bandS eq 'cen'; + $arm2S = '' if $armS eq $arm2S and $band2S ne 'ter'; + $sep = '-' if $arm2S || $band2S; + $self->value( $chrS. $armS. $bandS. $sep. $arm2S. $band2S); + } + return $self->value; +} + +=head2 value + + Title : value + Usage : my $pos = $position->value; + Function: Get/Set the value for this postion + Returns : scalar, value + Args : [optional] new value to set + +=cut + +sub value { + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_value'} = $value; + $self->{'_numeric'} = $self->cytorange($value); + } + return $self->{'_value'}; +} + +=head2 numeric + + Title : numeric + Usage : my $num = $position->numeric; + Function: Read-only method that is guarantied to return a numeric + representation for this position. + + This instanse of the method can also be set, but you better + know what you are doing. + + Returns : Bio::RangeI object + Args : optional Bio::RangeI object + +See L for more information. + +=cut + +sub numeric { + my ($self, $value) = @_; + + if ($value) { + $self->throw("This is not a Bio::RangeI object but a [$value]") + unless $value->isa('Bio::RangeI'); + $self->{'_numeric'} = $value; + $self->{'_value'} = $self->range2value($value); + } + return $self->{'_numeric'}; +} + + +=head2 chr + + Title : chr + Usage : my $mychr = $position->chr(); + Function: Get/Set method for the chromosome string of the location. + Returns : chromosome value + Args : [optional] new chromosome value + +=cut + +sub chr { + my ($self,$chr) = @_; + if( defined $chr ) { + $self->{'_chr'} = $chr; + } + return $self->{'_chr'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/LinkageMap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/LinkageMap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,245 @@ +# BioPerl module for Bio::Map::LinkageMap +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::LinkageMap - A representation of a genetic linkage map. + +=head1 SYNOPSIS + + use Bio::Map::LinkageMap; + # create a new map + my $map = new Bio::Map::LinkageMap(-name => 'Chads Superterriffic Map', + -type => 'Linkage', + -units=> 'cM'); + # create the location of a marker for that map + my $position = new Bio::Map::LinkagePosition( -positions => 1, + -distance => "22.3"); + # create a marker and place it at that position + my $marker = new Bio::Map::Marker::Microsatellite( + -name => 'SuuuperMarker', + -position => $position); + # place that marker on that map + $map->add_element($marker); + + # done! + +=head1 DESCRIPTION + +This object describes the basic functionality of a genetic linkage map in +Bioperl. Each 'position' can have one or more markers that map some number of +units from the markers at the previous position. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics1@dieselwurks.com + +=head1 CONTRIBUTORS + +Lincoln Stein lstein@cshl.org +Heikki Lehvaslaiho heikki@ebi.ac.uk +Jason Stajich jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Map::LinkageMap; +use vars qw(@ISA); +use strict; +use Bio::Map::SimpleMap; + +@ISA = qw(Bio::Map::SimpleMap); + +=head2 new + + Title : new + Usage : my $linkage_map = new Bio::Map::LinkageMap(); + Function: Builds a new Bio::Map::LinkageMap object + Returns : Bio::Map::LinkageMap + Args : -name => the name of the map (string) [optional] + -type => the type of this map (string, defaults to Linkage) [optional] + -species => species for this map (Bio::Species) [optional] + -units => the map units (string, defaults to cM) [optional] + -elements=> elements to initialize with + (arrayref of Bio::Map::MappableI objects) [optional] + + -uid => Unique ID of this map +=cut + +# new provided by SimpleMap + + + +=head2 length() + + Title : length() + Usage : my $length = $map->length(); + Function: Retrieves the length of the map. In the case of a LinkageMap, the + length is the sum of all marker distances. + Returns : An integer representing the length of this LinkageMap. Will return + undef if length is not calculateable + Args : None. + + +=cut + +sub length { + my ($self) = @_; + my $total_distance; + foreach (@{$self->{'_elements'}}) { + if ($_) { + $total_distance += ($_->position()->each_position_value($self))[0]; + } + } + return $total_distance; +} + +=head2 add_element($marker) + + Title : add_element($marker) + Usage : $map->add_element($marker) + Function: Add a Bio::Map::MappableI object to the Map + Returns : none + Args : Bio::Map::MappableI object + Notes : It is strongly recommended that you use a + Bio::Map::LinkagePosition as the position in any + Bio::Map::Mappable that you create to place on this + map. Using some other Bio::Map::Position might work but might + be unpredictable. + N.B. I've added Bio::Map::OrderedPosition which should achieve + similar things from LinkagePosition and will work for + RH markers too. +=cut + +#' +sub _add_element { + my ($self,$marker) = @_; + + my $o_position = $marker->position(); + + $self->debug( "marker position is ". $marker->position()); +# print("add_element: \$o_position is $o_position\n"); +# print("add_element: \$marker is $marker\n"); + + my $position; + unless ( $o_position->isa('Bio::Map::LinkagePosition') || + $o_position->isa('Bio::Map::OrderedPosition') + ) { + $self->warn("You really should use a Linkage Position for this object. This insures that there is only one position. Trying anyway..."); + my @p = ( $o_position->each_position_value($self)); + $position = shift @p; + if( ! defined $position ) { + $self->throw("This marker ($marker) does not have a position in this map ($self)"); + } + } else { + $position = $o_position->order; + } + + if ($self->{'_elements'}[$position]) { + $self->warn("Replacing the marker in position $position because in a linkage map the position is a key."); + } + $self->{'_elements'}[$position] = $marker; +} + +=head2 each_element + + Title : each_element + Usage : my @elements = $map->each_element; + Function: Retrieves all the elements in a map + _ordered_. + Returns : An array containing MappableI objects. + Args : None. + Notes : This is a useless concept in the context of a linkage map but is + included if you want a list of all of the marker names on the map. + +=cut + +sub each_element { + my ($self) = @_; + return @{$self->{'_elements'}}; +} + +=head2 implemented by Bio::Map::SimpleMap + +=cut + +=head2 name($new_name) + + Title : name($new_name) + Usage : my $name = $map->name($new_name) _or_ + my $length = $map->name() + Function: Get/set the name of the map. + Returns : The current name of the map. + Args : If provided, the name of the map is set to $new_name. + +=head2 species + + Title : species + Usage : my $species = $map->species; + Function: Get/Set Species for a map + Returns : Bio::Species object + Args : (optional) Bio::Species + + +=head2 units + + Title : units + Usage : $map->units('cM'); + Function: Get/Set units for a map + Returns : units for a map + Args : units for a map (string) + + +=head2 type + + Title : type + Usage : my $type = $map->type + Function: Get/Set Map type + Returns : String coding map type + Args : (optional) string + +=head2 unique_id + + Title : unique_id + Usage : my $id = $map->unique_id; + Function: Get/Set the unique ID for this map + Returns : a unique identifier + Args : [optional] new identifier to set + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/LinkagePosition.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/LinkagePosition.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,110 @@ +# BioPerl module for Bio::Map::LinkagePosition +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::LinkagePosition - Create a Position for a Marker that will be placed + on a Bio::Map::LinkageMap + +=head1 SYNOPSIS + + use Bio::Map::Position; + my $position = new Bio::Map::LinkagePosition(-positions => 1, + -distance => 22.1 ); + + # can get listing of positions + my @positions = $position->each_position; + + +=head1 DESCRIPTION + +Position for a Bio::Map::MarkerI compliant object that will be +placed on a Bio::Map::LinkageMap. See L and +L for details + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics1@dieselwurks.com + +=head1 CONTRIBUTORS + +Lincoln Stein, lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk +Jason Stajich jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::LinkagePosition; +use vars qw(@ISA); +use strict; +require 'dumpvar.pl'; + +use Bio::Map::OrderedPosition; + +@ISA = qw(Bio::Map::OrderedPosition); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Map::LinkagePosition(-positions => $position, + -distance => $distance ); + Function: Builds a new Bio::Map::LinkagePosition object + Returns : Bio::Map::LinkagePosition + Args : -order => the relative order of this marker on a linkage map + -positions => positions on a map +=cut + +=head2 Bio::Map::PositionI methods + +=cut + +=head2 order + + Title : order + Usage : $o_position->order($new_position) _or_ + $o_position->order() + Function: get/set the order position of this position in a map + Returns : + Args : If $new_position is provided, the current position of this Position + will be set to $new_position. + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/MapI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/MapI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,203 @@ +# $Id: MapI.pm,v 1.6 2002/10/22 07:45:15 lapp Exp $ +# +# BioPerl module for Bio::Map::MapI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::MapI - Interface for describing Map objects in bioperl + +=head1 SYNOPSIS + + # get a MapI somehowe + my $name = $map->name(); # string + my $length = $map->length(); # integer + my $species= $map->species; # Bio::Species + my $type = $map->type(); # genetic/sts/rh/ + +=head1 DESCRIPTION + +This object describes the basic functionality of a Map in bioperl. +Maps are anything from Genetic Map to Sequence Map to and Assembly Map +to Restriction Enzyme to FPC. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Lincoln Stein, lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Map::MapI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; +use Carp; + +@ISA = qw(Bio::Root::RootI); + +=head2 species + + Title : species + Usage : my $species = $map->species; + Function: Get/Set Species for a map + Returns : L object + Args : (optional) Bio::Species + +=cut + +sub species{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 units + + Title : units + Usage : $map->units('cM'); + Function: Get/Set units for a map + Returns : units for a map + Args : units for a map (string) + +=cut + +sub units{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 type + + Title : type + Usage : my $type = $map->type + Function: Get/Set Map type + Returns : String coding map type + Args : (optional) string + +=cut + +sub type { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 name + + Title : name + Usage : my $name = $map->name + Function: Get/Set Map name + Returns : Map name + Args : (optional) string + +=cut + +sub name { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 length + + Title : length + Usage : my $length = $map->length(); + Function: Retrieves the length of the map, + It is possible for the length to be unknown + for maps such as Restriction Enzyme, will return undef + in that case + Returns : integer representing length of map in current units + will return undef if length is not calculateable + Args : none + +=cut + +sub length{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 unique_id + + Title : unique_id + Usage : my $id = $map->unique_id; + Function: Get/Set the unique ID for this map + Returns : a unique identifier + Args : [optional] new identifier to set + +=cut + +sub unique_id{ + my ($self,$id) = @_; + $self->throw_not_implemented(); +} + +=head2 add_element + + Title : add_element + Usage : $map->add_element($marker) + Function: Add a Bio::Map::MappableI object to the Map + Returns : none + Args : Bio::Map::MappableI object + +=cut + +sub add_element{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 each_element + + Title : each_element + Usage : my @elements = $map->each_element; + Function: Retrieves all the elements in a map + unordered + Returns : Array of Map elements (L) + Args : + + +=cut + +sub each_element{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/MappableI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/MappableI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,145 @@ +# $Id: MappableI.pm,v 1.9 2002/10/22 07:45:15 lapp Exp $ +# +# BioPerl module for Bio::Map::MappableI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::MappableI - An object that can be placed in a map + +=head1 SYNOPSIS + + # get a Bio::Map::MappableI somehow + my $position = $element->map_position(); + # these methods will be important for building sorted lists + if( $position->equals($p2) ) { + # do something + } elsif( $position->less_tha($p2) ) {} + elsif( $position->greater_than($p2) ) { } + + +=head1 DESCRIPTION + +This object handles the generic notion of an element placed on a +(linear) Map. Elements can have multiple positions in a map such as +is the case of Restriction enzyme cut sites on a sequence map. For +exact information about an element's position in a map one must query +the associate PositionI object which is accessible through the +position() method. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Heikki Lehvaslaiho heikki@ebi.ac.uk +Lincoln Stein lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# 'Let the code begin... + + +package Bio::Map::MappableI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; +use Carp; + +@ISA = qw(Bio::Root::RootI); + +=head2 position + + Title : position + Usage : my $position = $mappable->position(); + Function: Get/Set the Bio::Map::PositionI for a mappable element + Returns : Bio::Map::PositionI + Args : (optional) Bio::Map::PositionI + +=cut + +sub position{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 equals + + Title : equals + Usage : if( $mappable->equals($mapable2)) ... + Function: Test if a position is equal to another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub equals{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 less_than + + Title : less_than + Usage : if( $mappable->less_than($m2) ) ... + Function: Tests if a position is less than another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub less_than{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 greater_than + + Title : greater_than + Usage : if( $mappable->greater_than($m2) ) ... + Function: Tests if position is greater than another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub greater_than{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/Marker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/Marker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,537 @@ +# BioPerl module for Bio::Map::Marker +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::Marker - An central map object representing a generic marker +that can have multiple location in several maps. + +=head1 SYNOPSIS + + # get map objects somehow + + # a marker with complex localisation + $o_usat = new Bio::Map::Marker (-name=>'Chad Super Marker 2', + -positions => [ [$map1, $position1], + [$map1, $position2] + ] ); + + # The markers deal with Bio::Map::Position objects which can also + # be explicitely created and passed on to markers as an array ref: + $o_usat2 = new Bio::Map::Marker (-name=>'Chad Super Marker 3', + -positions => [ $pos1, + $pos2 + ] ); + + # a marker with unique position in a map + $marker1 = new Bio::Map::Marker (-name=>'hypervariable1', + -map => $map1, + -position => 100 + ) + + # an other way of creating a marker with unique position in a map: + $marker2 = new Bio::Map::Marker (-name=>'hypervariable2'); + $map1->add_marker($marker2); + $marker2->position(100); + + # position method is a short cut for get/set'ing unigue positions + # which overwrites previous values + # to place a marker to other maps or to have multiple positions + # for a map within the same map use add_position() + + $marker2->add_position(200); # new position in the same map + $marker2->add_position($map2,200); # new map + + # setting a map() in a marker or adding a marker into a map are + # identical mathods. Both set the bidirectional connection which is + # used by the marker to remember its latest, default map. + + # Regardes of how marker positions are created, they are stored and + # returned as Bio::Map::PositionI objects: + + # unique position + print $marker1->position->value, "\n"; + # several positions + foreach $pos ($marker2->each_position($map1)) { + print $pos->value, "\n"; + } + +See L and L for more information. + +=head1 DESCRIPTION + +This object handles the notion of a generic marker. This marker will +have a name and a position in a map. + +This object is intended to be used by a marker parser like Mapmaker.pm +and then blessed into the proper type of marker (ie Microsatellite) by +the calling script. + +=head2 Design principles + +A Marker is a central object in Bio::Map name space. A Map is a holder +class for objects. A Marker has a Position in a Map. A Marker can be +compared to an other Markers using boolean methods. Positions can have +non-numeric values or other methods to store the locations, so they +have a method numeric() which does the conversion. + +A Marker has a convinience method position() which is able to create +Positions of required class from scalars by calling method +get_position_object(). + +For more complex situations, a Marker can have multiple positions in +multiple Maps. It is therefore possible to extract Positions (all or +belonging to certain Map) and compare Markers to them. It is up to the +programmer to make sure position values and Maps they belong to can be +sensibly compared. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics1@dieselwurks.com + +=head1 CONTRIBUTORS + +Heikki Lehvaslaiho heikki@ebi.ac.uk +Lincoln Stein lstein@cshl.org +Jason Stajich jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Map::Marker; +use vars qw(@ISA); +use strict; +use Bio::Root::Root; +use Bio::Map::MarkerI; +use Bio::Map::Position; + +@ISA = qw(Bio::Root::Root Bio::Map::MarkerI); + +=head2 new + + Title : new + Usage : $o_marker = new Bio::Map::Marker( -name => 'Whizzy marker', + -position => $position); + Function: Builds a new Bio::Map::Marker object + Returns : Bio::Map::Marker + Args : + -name => name of this microsatellite + [optional], string,default 'Unknown' + + -positions => map position for this marker, [optional] + Bio::Map::PositionI-inherited obj, no default) + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + $self->{'_positions'} = []; + my ($name, $map, $position, $positions) = + $self->_rearrange([qw(NAME + MAP + POSITION + POSITIONS + )], @args); + if ($name) { $self->name($name); } + else {$self->name('Unnamed marker'); } + $position && $self->position($position); + $positions && $self->positions($positions); + $map && $self->map($map); + + return $self; +} + +=head2 name + + Title : name + Usage : $o_usat->name($new_name) _or_ + my $name = $o_usat->name() + Function: Get/Set the name for this Microsatellite + Returns : A scalar representing the current name of this marker + Args : If provided, the current name of this marker + will be set to $new_name. + +=cut + +sub name { + my ($self,$name) = @_; + my $last = $self->{'_name'}; + if ($name) { + $self->{'_name'} = $name; + } + return $last; +} + + +=head2 map + + Title : map + Usage : my $mymap = $marker->map(); + Function: Get/Set the default map for the marker. + This is set by L method + Returns : L + Args : [optional] new L + +=cut + +sub map { + my ($self,$map) = @_; + if( defined $map ) { + $self->thow('This is [$map], not Bio::Map::MapI object') + unless $map->isa('Bio::Map::MapI'); + $self->{'_default_map'} = $map; + } + return $self->{'_default_map'}; +} + + + +=head2 get_position_object + + Title : get_position_class + Usage : my $pos = $marker->get_position_object(); + Function: To get an object of the default Position class + for this Marker. Subclasses should redefine this method. + The Position needs to be Bio::Map::PositionI. + Returns : Bio::Map::Position + Args : none + +See L and L for more information. + +=cut + +sub get_position_object { + my ($self) = @_; + return new Bio::Map::Position(); +} + + +=head2 position + + Title : position + Usage : $position = $mappable->position($map); OR + $mappable->position($position); # $position can be Bio::Map::PositionI + $mappable->position(100); # or scalar if the marker has a default map + $mappable->position($map, 100); # if not give explicit $map + Function: Get/Set the Bio::Map::PositionI for a mappable element + in a specific Map + Adds the marker to a map automatically if Map is given. + Altenaitvely, you can add the merker to the map first + (L) to set the default map + Returns : Bio::Map::PositionI + Args : $position - Bio::Map::PositionI # Position we want to set + OR + $map - Bio::Map::MapI AND + scalar + OR + scalar, but only if the marker has been added to a map + +=cut + +sub position { + my ($self, $pos, $secondary_pos) = @_; + my ($map); + POS: { + if ($pos) { + if (ref($pos) eq 'SCALAR' || ref($pos) eq '') { + $map = $self->map; + } + elsif (ref($pos) eq 'ARRAY') { + $map = $pos->[0]; + $pos = $pos->[1]; + } + elsif ($pos->isa('Bio::Map::PositionI')) { + $pos->marker($self); + + $self->purge_positions; + $self->add_position($pos); + $map = $pos->map; + $map->add_element($self) unless defined($self->map) && $self->map eq $map; + last POS; + } + + elsif ($pos->isa('Bio::Map::MapI')) { + $map = $pos; + $pos = $secondary_pos; + } else { + $map = $self->map; + } + $self->throw("You need to add a marker to a map before ". + "you can set positions without explicit map!" ) + unless $map; + $self->throw("Position better be scalar, not [$pos=". ref($pos) ."]") + unless ref($pos) eq 'SCALAR' || ref($pos) eq ''; + + my $newpos = $self->get_position_object; + $newpos->map($map); + $newpos->value($pos); + $newpos->marker($self); + + $map->add_element($self) unless defined($self->map) && $self->map eq $map; + $self->purge_positions; + $self->add_position($newpos) + } + } + my @array = $self->each_position(); + $self->warn('More than one value is associated with this position') + if scalar @array > 1; + return $array[0]; +} + +=head2 add_position + + Title : add_position + Usage : $position->add_position($position) + Function: Add the Position to the Marker container. + If you are using this method, you need to + add the Marker to the Map yourself + Returns : none + Args : Position - Reference to Bio::Map::PositionI + +=cut + +sub add_position{ + my ($self, $pos) = @_; + $self->throw("Must give a Position") unless defined $pos; + + $self->throw("Must give a Bio::Map::PositionI, not [". ref($pos) ."]") + unless $pos->isa('Bio::Map::PositionI'); + + my $map = $pos->map; + $map->add_element($self) unless defined($self->map) && $self->map eq $map; + + push @{$self->{'_positions'}}, $pos; + +} + +=head2 positions + + Title : positions + Usage : $mappable->positions([$pos1, $pos2, $pos3]); + Function: Add multiple Bio::Map::PositionI for a mappable element + in a Map. + Returns : boolean + Args : array ref of $map/value tuples or array ref of Positions + +=cut + +sub positions { + my ($self, $arrayref) = @_; + my ($map); + $self->throw_not_implemented(); +} + +=head2 each_position + + Title : each_position + Usage : my @positions = $position->each_position('mapname'); + Function: Retrieve a list of Positions + Returns : Array of L + Args : none + +=cut + +sub each_position { + my ($self,$mapname) = @_; + $self->warn("Retrieving positions in a named map only is ". + "not implemented. Getting all.") if $mapname; + return @{$self->{'_positions'}}; +} + +=head2 purge_positions + + Title : purge_positions + Usage : $marker->purge_positions + Function: Remove all the position values stored for a Marker + Returns : none + Args : [optional] only purge values for a given map + +=cut + +sub purge_positions{ + my ($self, $map) = @_; + $self->warn("Retrieving positions in a named map only, not implemented ") if $map; + $self->{'_positions'} = []; +} + +=head2 known_maps + + Title : known_maps + Usage : my @maps = $marker->known_maps + Function: Returns the list of maps that this position has values for + Returns : list of Bio::Map::MapI unique ids + Args : none + +=cut + +sub known_maps{ + my ($self) = @_; + my %hash; + foreach my $pos ($self->each_position) { + $hash{$pos->map->unique_id} = 1; + } + return keys %hash; +} + +=head2 in_map + + Title : in_map + Usage : if ( $position->in_map($map) ) {} + Function: Tests if a position has values in a specific map + Returns : boolean + Args : a map unique id OR Bio::Map::MapI + +=cut + +sub in_map{ + my ($self,$map) = @_; + + $self->throw("Need an argument") unless $map; + + if (ref($map) && $map->isa('Bio::Map::MapI')) { + foreach my $pos ($self->each_position) { + return 1 if $pos->map eq $map; + } + } else { # assuming a scalar + foreach my $pos ($self->each_position) { + return 1 if $pos->map->unique_id eq $map; + } + } + return 0; +} + +=head2 Comparison methods + +=cut + +=head2 tuple + + Title : tuple + Usage : ($me, $you) = $self->_tuple($compare) + Function: Utility ethod to extract numbers and test for missing values. + Returns : tuple values + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub tuple { + my ($self,$compare) = @_; + my ($me, $you) = (-1, -1); + + $self->warn("Trying to compare [". $self->name. "] to nothing.") && + return ($me, $you) unless defined $compare; + $self->warn("[". $self->name. "] has no position.") && + return ($me, $you) unless $self->position; + + $me = $self->position->numeric; + + if( $compare->isa('Bio::Map::MappableI') ){ + $self->warn("[". $compare->name. "] has no position.") && + return ($me, $you) unless $compare->position; + + $you = $compare->position->numeric; + return ($me, $you); + + } elsif( $compare->isa('Bio::Map::PositionI') ) { + + $you = $compare->numeric; + return ($me, $you); + + } else { + $self->warn("Can only run equals with Bio::Map::MappableI or ". + "Bio::Map::PositionI not [$compare]"); + } + return ($me, $you); +} + + +=head2 equals + + Title : equals + Usage : if( $mappable->equals($mapable2)) ... + Function: Test if a position is equal to another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub equals { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 if $me == -1 or $you == -1 ; + return $me == $you; +} + +=head2 less_than + + Title : less_than + Usage : if( $mappable->less_than($m2) ) ... + Function: Tests if a position is less than another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub less_than { + my ($self,$compare) = @_; + + my ($me, $you) = $self->tuple($compare); + return 0 if $me == -1 or $you == -1 ; + return $me < $you; +} + +=head2 greater_than + + Title : greater_than + Usage : if( $mappable->greater_than($m2) ) ... + Function: Tests if position is greater than another position + Returns : boolean + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub greater_than { + my ($self,$compare) = @_; + + + my ($me, $you) = $self->tuple($compare); + return 0 if $me == -1 or $you == -1 ; + return $me > $you; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/MarkerI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/MarkerI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ +# $Id: MarkerI.pm,v 1.9 2002/10/22 07:45:15 lapp Exp $ +# +# BioPerl module for Bio::Map::MarkerI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::MarkerI - Interface for basic marker functionality + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the interface here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Heikki Lehvaslaiho heikki@ebi.ac.uk +Lincoln Stein lstein@cshl.org +Jason Stajich jason@bioperl.org +Chad Matsalla bioinformatics1@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::MarkerI; +use vars qw(@ISA); +use strict; +use Bio::Map::MappableI; + +@ISA = qw(Bio::Map::MappableI); + +=head2 name($new_name) + + Title : name($new_name) + Usage : my $name = $o_usat->name($new_name) _or_ + my $name = $o_usat->name() + Function: Get/Set the name for this Marker + Returns : A scalar representing the current name of this Marker + Args : If provided, the current name of this Marker + will be set to $new_name. + +=cut + +sub name { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 add_position + + Title : add_position + Usage : $position->add_position($map,'100') + Function: Add a numeric or string position to the PositionI container + Returns : none + Args : Map - Reference to Bio::Map::MapI + String or Numeric coding for a position on a map + +=cut + +sub add_position{ + my ($self,$map,$value) = @_; + $self->throw_not_implemented(); +} + + +=head2 each_position + + Title : positions + Usage : my @positions = $position->each_position_value('mapname'); + Function: Retrieve a list of positions coded as strings or ints + Returns : Array of position values + Args : none + +=cut + +sub each_position { + my ($self,$mapname) = @_; + $self->throw_not_implemented(); +} + +=head2 purge_positions + + Title : purge_positions + Usage : $position->purge_positions + Function: Remove all the position values stored for a Marker + Returns : none + Args : [optional] only purge values for a given map + +=cut + +sub purge_position_values{ + my ($self, $map) = @_; + $self->throw_not_implemented(); +} + +=head2 known_maps + + Title : known_maps + Usage : my @maps = $marker->known_maps + Function: Returns the list of maps that this position has values for + Returns : list of Bio::Map::MapI unique ids + Args : none + +=cut + +sub known_maps{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 in_map + + Title : in_map + Usage : if ( $position->in_map($map) ) {} + Function: Tests if a position has values in a specific map + Returns : boolean + Args : a map unique id OR Bio::Map::MapI + + +=cut + +sub in_map{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 get_position_object + + Title : get_position_class + Usage : my $pos = $marker->get_position_object(); + Function: To get an object of the default Position class + for this Marker. Subclasses should redefine this method. + The Position needs to be L. + Returns : L + Args : none + +=cut + +sub get_position_object { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 tuple + + Title : tuple + Usage : ($me, $you) = $self->_tuple($compare) + Function: Utility method to extract numbers and test for missing values. + Makes writing subsequent tests easier. + Returns : a tuple of values or ranges + Args : Bio::Map::MappableI or Bio::Map::PositionI + +=cut + +sub tuple { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + +=head2 Bio::Map::MappableI methods + +=cut + + +=head2 position + + Title : position + Usage : my position_string = $position->position('mapname'); + Function: Get/Set method for single value positions. + Gives a simplified interface when only one map and + one position per marker is used. + Returns : a position value + Args : optional: + Map - Reference to Bio::Map::MapI + String or Numeric coding for a position on a map + +=cut + +sub position{ + my ($self,$map, $value) = @_; + $self->throw_not_implemented(); +} + +=head2 equals + + Title : equals + Usage : if( $mappable->equals($mapable2)) ... + Function: Test if a position is equal to another position + Returns : boolean + Args : Bio::Map::MappableI + +=head2 less_than + + Title : less_than + Usage : if( $mappable->less_than($m2) ) ... + Function: Tests if a position is less than another position + Returns : boolean + Args : Bio::Map::MappableI + +=head2 greater_than + + Title : greater_than + Usage : if( $mappable->greater_than($m2) ) ... + Function: Tests if position is greater than another position + Returns : boolean + Args : Bio::Map::MappableI + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/Microsatellite.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/Microsatellite.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,381 @@ +# BioPerl module for Bio::Map::Microsatellite +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::Microsatellite - An object representing a Microsatellite marker. + +=head1 SYNOPSIS + + $o_usat = new Bio::Map::Microsatellite + (-name=>'Chad Super Marker 2', + -sequence => 'gctgactgatcatatatatatatatatatatatatatatatcgcgatcgtga', + -motif => 'at', + -repeats => 15, + -repeat_start_position => 11 + ); + + $sequence_before_usat = $o_usat->get_leading_flank(); + $sequence_after_usat = $o_usat->get_trailing_flank(); + + +=head1 DESCRIPTION + +This object handles the notion of an Microsatellite. This microsatellite can +be placed on a (linear) Map or used on its own. If this Microsatellites +will be used in a mapping context (it doesn't have to, you know) it can have +multiple positions in a map. For information about a Microsatellite's position +in a map one must query the associate PositionI object which is accessible +through the position() method. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics1@dieselwurks.com + +=head1 CONTRIBUTORS + +Heikki Lehvaslaiho heikki@ebi.ac.uk +Lincoln Stein lstein@cshl.org +Jason Stajich jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Map::Microsatellite; +use vars qw(@ISA); +use strict; +use Bio::Root::Root; +use Bio::Map::Marker; + +@ISA = qw(Bio::Map::Marker); + +=head2 new + + Title : new + Usage : $o_usat = + Function: Builds a new Bio::Map::Microsatellite object + Returns : Bio::Map::Microsatellite + Args : + -name => name of this microsatellite (optional, string, + default 'Unknown microsatellite') + -positions => position(s) for this marker in maps[optional], + An array reference of tuples (array refs themselves) + Each tuple conatins a Bio::Map::MapI-inherited object and a + Bio::Map::PositionI-inherited obj, no default) + -sequence => the sequence of this microsatellite (optional, + scalar, no default) + -motif => the repeat motif of this microsatellite (optional, + scalar, no default) + -repeats => the number of motif repeats for this microsatellite + (optional, scalar, no default) + -repeat_start_position => the starting position of the + microsatellite in this sequence. The first base of the + sequence is position "1". (optional, scalar, no default) + + Note : Creating a Bio::Map::Microsatellite object with no position + might be useful for microsatellite people wanting to embrace + and extend this module. Me! Me! Me! + - using repeat_start_position will trigger a mechinism to + calculate a value for repeat_end_position. + + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($map, $position, $sequence, $motif, $repeats, $start) = + $self->_rearrange([qw(MAP + POSITION + SEQUENCE + MOTIF + REPEATS + REPEAT_START_POSITION + )], @args); + if( ! $self->name ) { + $self->name('Unnamed microsatellite'); + } + $map && $self->map($map); + $position && $self->position($position); + $sequence && $self->sequence($sequence); + $self->motif(defined $motif ? $motif : 'Unknown motif'); + $repeats && $self->repeats($repeats); + $start && $self->repeat_start_position($start); + return $self; +} + +=head2 Bio::Map::Marker methods + +=cut + +=head2 position + + Title : position + Usage : my $position = $mappable->position($map); OR + $mappable->position($map,$position); OR + Function: Get/Set the Bio::Map::PositionI for a mappable element + in a specific Map + Returns : Bio::Map::PositionI + Args : $map =Bio::Map::MapI # Map we are talking about + $position = Bio::Map::PositionI # Position we want to set + + +=head2 name($new_name) + + Title : name($new_name) + Usage : $o_usat->name($new_name) _or_ + my $name = $o_usat->name() + Function: Get/Set the name for this Microsatellite + Returns : A scalar representing the current name of this Microsatellite + Args : If provided, the current name of this Microsatellite + will be set to $new_name. + +=head2 motif($new_motif) + + Title : motif($new_motif) + Usage : my $motif = $o_usat->motif($new_motif) _or_ + my $motif = $o_usat->motif() + Function: Get/Set the repeat motif for this Microsatellite + Returns : A scalar representing the current repeat motif of this + Microsatellite. + Args : If provided, the current repeat motif of this Microsatellite + will be set to $new_motif. + +=cut + +sub motif { + my ($self,$motif) = @_; + if ($motif) { + $self->{'_motif'} = $motif; + } + return $self->{'_motif'}; +} + +=head2 sequence($new_sequence) + + Title : sequence($new_sequence) + Usage : my $sequence = $o_usat->sequence($new_sequence) _or_ + my $sequence = $o_usat->sequence() + Function: Get/Set the sequence for this Microsatellite + Returns : A scalar representing the current sequence of this + Microsatellite. + Args : If provided, the current sequence of this Microsatellite + will be set to $new_sequence. + +=cut + +sub sequence { + my ($self,$sequence) = @_; + if ($sequence) { + $self->{'_sequence'} = $sequence; + } + return $self->{'_sequence'}; +} + +=head2 repeats($new_repeats) + + Title : repeats($new_repeats) + Usage : my $repeats = $o_usat->repeats($new_repeats) _or_ + my $repeats = $o_usat->repeats() + Function: Get/Set the repeat repeats for this Microsatellite + Returns : A scalar representing the current number of repeats of this + Microsatellite. + Args : If provided, the current number of repeats of this + Microsatellite will be set to $new_repeats. + +=cut + +sub repeats { + my ($self,$repeats) = @_; + if ($repeats) { + $self->{'_repeats'} = $repeats; + } + return $self->{'_repeats'}; +} + +=head2 repeat_start_position($new_repeat_start_position) + + Title : repeat_start_position($new_repeat_start_position) + Usage : my $repeat_start_position = + $o_usat->repeat_start_position($new_repeat_start_position) _or_ + my $repeat_start_position = $o_usat->repeat_start_position() + Function: Get/Set the repeat repeat_start_position for this + Microsatellite + Returns : A scalar representing the repeat start position for this + Microsatellite. + Args : If provided, the current repeat start position of this + Microsatellite will be set to $new_repeat_start_position. + This method will also try to set the repeat end position. This + depends on having information for the motif and the number of + repeats. If you want to use methods like get_trailing_flank or + get_leading flank, be careful to include the right information. + +=cut + +sub repeat_start_position { + my ($self,$repeat_start_position) = @_; + if ($repeat_start_position) { + $self->{'_repeat_start_position'} = $repeat_start_position; + $self->repeat_end_position("set"); + } + return $self->{'_repeat_start_position'}; +} + +=head2 repeat_end_position($value) + + Title : repeat_end_position($set) + Usage : $new_repeat_end_position = + $o_usat->repeat_end_position("set"); _or_ + $new_repeat_end_position = + $o_usat->repeat_end_position($value); _or_ + $current_repeat_end_position = $o_usat->repeat_end_position(); + Function: get/set the end position of the repeat in this sequence + Returns : A scalar representing the base index of the end of the + repeat in this Microsatellite. The first base in the sequence + is base 1. + Args : A scalar representing a value, the string "set", or no + argument (see Notes). + Notes : If you do not provide an argument to this method, the current + end position of the repeat in this Microsatellite will be + returned (a scalar). + If you provide the string "set" to this method it will set the + end position based on the start position, the length of the + motif, and the nuimber of repeats. + If you specify a value the current end position of the repeat + will be set to that value. This is a really bad idea. Don't do + it. + +=cut + +#' +sub repeat_end_position { + my ($self,$caller) = @_; + if( defined $caller ) { + if ($caller eq "set") { + $self->{'_repeat_end_position'} = + $self->{'_repeat_start_position'} + + (length($self->motif()) * $self->repeats()); + } + elsif ($caller) { + $self->{'_repeat_end_position'} = $caller; + } + } + return $self->{'_repeat_end_position'}; +} + +=head2 equals + + Title : equals + Usage : if( $mappable->equals($mapable2)) ... + Function: Test if a position is equal to another position + Returns : boolean + Args : Bio::Map::MappableI + +=cut + +sub equals { + my ($self,@args) = @_; + $self->warn("equals is not yet implemented in ". + ref($self)." yet. Check back real soon!"); +} + +=head2 less_than + + Title : less_than + Usage : if( $mappable->less_than($m2) ) ... + Function: Tests if a position is less than another position + Returns : boolean + Args : Bio::Map::MappableI + +=cut + +sub less_than { + my ($self,@args) = @_; + $self->warn("less_then is not yet implemented in ". + ref($self)." yet. Check back real soon!"); +} + +=head2 greater_than + + Title : greater_than + Usage : if( $mappable->greater_than($m2) ) ... + Function: Tests if position is greater than another position + Returns : boolean + Args : Bio::Map::MappableI + +=cut + +sub greater_than { + my ($self,@args) = @_; + $self->warn("greater_then is not yet implemented in ". + ref($self)." yet. Check back real soon!"); +} + +=head2 get_leading_flank() + + Title : get_leading_flank() + Usage : $leading_sequence = $o_usat->get_leading_flank(); + Returns : A scalar representing the sequence before the repeats in this + Microsatellite. + Args : None. + +=cut + +sub get_leading_flank { + my $self = shift; + return substr $self->sequence(),0,$self->repeat_start_position-1; + +} + +=head2 get_trailing_flank() + + Title : get_trailing_flank() + Usage : $trailing_flank = $o_usat->get_trailing_flank(); + Returns : A scalar representing the sequence after the repeats in this + Microsatellite. + Args : None. + +=cut + +sub get_trailing_flank { + my $self = shift; + return substr $self->sequence(),$self->repeat_end_position()-1; +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/OrderedPosition.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/OrderedPosition.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,248 @@ +# BioPerl module for Bio::Map::OrderedPosition +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::OrderedPosition - Abstracts the notion of a member + of an ordered list of markers. Each marker is a certain distance + from the one in the ordered list before it. + +=head1 SYNOPSIS + + use Bio::Map::OrderedPosition; + # the first marker in the sequence + my $position = new Bio::Map::OrderedPosition(-order => 1, + -positions => [ [ $map, 22.3] ] ); + # the second marker in the sequence, 15.6 units from the fist one + my $position2 = new Bio::Map::OrderedPosition(-order => 2, + -positions => [ [ $map, 37.9] ] ); + # the third marker in the sequence, coincidental with the second + # marker + my $position3 = new Bio::Map::OrderedPosition(-order => 3, + -posititions => [ [ $map, 37.9]] ); + +=head1 DESCRIPTION + +This object is an implementation of the PositionI interface and the +Position object handles the specific values of a position. +OrderedPosition is intended to be slightly more specific then Position +but only specific enough for a parser from the MarkerIO subsystem to +create and then pass to a client application to bless into the proper +type. For an example of how this is intended to work, see the +Mapmaker.pm. + +No units are assumed here - units are handled by context of which Map +a position is placed in. + +Se Bio::Map::Position for additional information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics1@dieselwurks.com + +=head1 CONTRIBUTORS + +Lincoln Stein, lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::OrderedPosition; +use vars qw(@ISA); +use strict; + +use Bio::Map::Position; + +@ISA = qw(Bio::Map::Position); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Map::OrderedPosition(); + Function: Builds a new Bio::Map::OrderedPosition object + Returns : Bio::Map::OrderedPosition + Args : -order - The order of this position + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); +# $self->{'_order'} = []; + + my ($map, $marker, $value, $order) = + $self->_rearrange([qw( MAP + MARKER + VALUE + ORDER + )], @args); +# print join ("|-|", ($map, $marker, $value, $order)), "\n"; + $map && $self->map($map); + $marker && $self->marker($marker); + $value && $self->value($value); + $order && $self->order($order); + + return $self; +} + +=head2 order + + Title : order + Usage : $o_position->order($new_position) _or_ + $o_position->order() + Function: get/set the order position of this position in a map + Returns : + Args : If $new_position is provided, the current position of this Position + will be set to $new_position. + +=cut + +sub order { + my ($self,$order) = @_; + if ($order) { + # no point in keeping the old ones + $self->{'_order'} = $order; + } + return $self->{'_order'}; +} + +=head2 Bio::Map::Position functions + +=cut + +=head2 known_maps + + Title : known_maps + Usage : my @maps = $position->known_maps + Function: Returns the list of maps that this position has values for + Returns : list of Bio::Map::MapI unique ids + Args : none + +=head2 in_map + + Title : in_map + Usage : if ( $position->in_map($map) ) {} + Function: Tests if a position has values in a specific map + Returns : boolean + Args : a map unique id OR Bio::Map::MapI + +=head2 each_position_value + + Title : positions + Usage : my @positions = $position->each_position_value($map); + Function: Retrieve a list of positions coded as strings or ints + Returns : Array of position values for a Map + Args : Bio::Map::MapI object to get positions for + +=head2 add_position_value + + Title : add_position_value + Usage : $position->add_position_value($map,'100'); + Function: Add a numeric or string position to the PositionI container + and assoiciate it with a Bio::Map::MapI + Returns : none + Args : $map - Bio::Map::MapI + String or Numeric coding for a position on a map + +=head2 purge_position_values + + Title : purge_position_values + Usage : $position->purge_position_values + Function: Remove all the position values stored for a position + Returns : none + Args : [optional] only purge values for a given map + +=head2 equals + + Title : equals + Usage : if( $mappable->equals($mapable2)) ... + Function: Test if a position is equal to another position. + Returns : boolean + Args : Bio::Map::PositionI + +=cut + +sub equals{ + my ($self,$compare) = @_; + return 0 if ( ! defined $compare || ! $compare->isa('Bio::Map::OrderedPosition')); + return ( $compare->order == $self->order); +} + +# admittedly these are really the best comparisons in the world +# but it is a first pass we'll need to refine the algorithm or not +# provide general comparisions and require these to be implemented +# by objects closer to the specific type of data + +=head2 less_than + + Title : less_than + Usage : if( $mappable->less_than($m2) ) ... + Function: Tests if a position is less than another position + It is assumed that 2 positions are in the same map. + Returns : boolean + Args : Bio::Map::PositionI + +=cut + + +sub less_than{ + my ($self,$compare) = @_; + return 0 if ( ! defined $compare || ! $compare->isa('Bio::Map::OrderedPosition')); + return ( $compare->order < $self->order); +} + +=head2 greater_than + + Title : greater_than + Usage : if( $mappable->greater_than($m2) ) ... + Function: Tests if position is greater than another position. + It is assumed that 2 positions are in the same map. + Returns : boolean + Args : Bio::Map::PositionI + +=cut + +sub greater_than{ + my ($self,$compare) = @_; + return 0 if ( ! defined $compare || ! $compare->isa('Bio::Map::OrderedPosition')); + return ( $compare->order > $self->order); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/OrderedPositionWithDistance.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/OrderedPositionWithDistance.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +# BioPerl module for Bio::Map::OrderedPositionWithDistance +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::OrderedPositionWithDistance - Abstracts the notion of a member + of an ordered list of markers. Each marker is a certain distance + from the one in the ordered list before it. + +=head1 SYNOPSIS + + use Bio::Map::OrderedPositionWithDistance; + # the first marker in the sequence + my $position = new Bio::Map::OrderedPositionWithDistance(-positions => 1, + -distance => 22.3 ); + # the second marker in the sequence, 15.6 units from the fist one + my $position2 = new Bio::Map::OrderedPositionWithDistance(-positions => 2, + -distance => 15.6 ); + # the third marker in the sequence, coincidental with the second + # marker + my $position3 = new Bio::Map::OrderedPositionWithDistance(-positions => 3, + -distance => 0 ); + + +=head1 DESCRIPTION + +This object is an implementation of the PositionI interface and the +Position object handles the specific values of a position. +OrderedPositionWithDistance is intended to be slightly more specific +then Position but only specific enough for a parser from the MarkerIO +subsystem to create and then pass to a client application to bless into +the proper type. For an example of how this is intended to work, see the +Mapmaker.pm. + +No units are assumed here - units are handled by context of which Map +a position is placed in. + +Se Bio::Map::Position for additional information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics1@dieselwurks.com + +=head1 CONTRIBUTORS + +Lincoln Stein, lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::OrderedPositionWithDistance; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Map::Position; + +@ISA = qw(Bio::Map::Position); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Map::OrderedPositionWithDistance(); + Function: Builds a new Bio::Map::OrderedPositionWithDistance object + Returns : Bio::Map::OrderedPositionWithDistance + Args : -positions - Should be a single value representing the order + of this marker within the list of markers + -distance - The distance this marker is from the marker before it. + 0 reflects coincidentality. + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + $self->{'_positions'} = []; + my ($positions,$distance) = $self->_rearrange([qw(POSITIONS DISTANCE)], @args); + if( ref($positions) =~ /array/i ) { + foreach my $p ( @$positions ) { + $self->add_position($p); + } + } else { + $self->add_position($positions); + } + $distance && $self->distance($distance); + + return $self; + +} + + +=head2 distance($new_distance) + + Title : distance($new_distance) + Usage : $position->distance(new_distance) _or_ + $position->distance() + Function: get/set the distance of this position from the previous marker + Returns : A scalar representing the current distance for this position. + Args : If $new_distance is provided the distance of this Position will + be set to $new_distance + +=cut + +sub distance { + my ($self,$distance) = @_; + if ($distance) { + $self->{'_distance'} = $distance; + } + return $self->{'_distance'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/Position.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/Position.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,202 @@ +# $Id: Position.pm,v 1.9 2002/10/22 07:45:16 lapp Exp $ +# +# BioPerl module for Bio::Map::Position +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::Position - A single position of a Marker in a Map + +=head1 SYNOPSIS + + use Bio::Map::Position; + my $position = new Bio::Map::Position(-map => $map, + -marker => $marker + -value => 100 + ); + +=head1 DESCRIPTION + +This object is an implementation of the PositionI interface that +handles the specific values of a position. This allows an element +(e.g. Marker) to have multiple positions within a map and still be +treated as a single entity. + +This does not directly handle the concept of a relative map in which +no known exact positions exist but markers are just ordered relative +to one another - in that case arbitrary values must be assigned for +position values. + +No units are assumed here - units are handled by context of which Map +a position is placed in or the subclass of this Position. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Lincoln Stein, lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk +Chad Matsalla, bioinformatics1@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::Position; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Map::PositionI; + +@ISA = qw(Bio::Root::Root Bio::Map::PositionI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Map::Position(); + Function: Builds a new Bio::Map::Position object + Returns : Bio::Map::Position + Args : -map a object + -marker a object + -value string or number + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($map, $marker, $value) = + $self->_rearrange([qw( MAP + MARKER + VALUE + )], @args); + + $map && $self->map($map); + $marker && $self->marker($marker); + $value && $self->value($value); + + return $self; +} + +=head2 map + + Title : map + Usage : my $id = map->$map; + Function: Get/Set the map the position is in. + Returns : L + Args : [optional] new L + +=cut + +sub map { + my ($self,$map) = @_; + if( defined $map ) { + $self->throw("This is [$map], not a Bio::Map::MapI object") + unless $map->isa('Bio::Map::MapI'); + $self->{'_map'} = $map; + } + return $self->{'_map'}; +} + +=head2 marker + + Title : marker + Usage : my $id = marker->$marker; + Function: Get/Set the marker the position is in. + Returns : L + Args : [optional] new L + +=cut + +sub marker { + my ($self,$marker) = @_; + if( defined $marker ) { + $self->thow("This is [$marker], not a Bio::Map::MarkerI object") + unless $marker->isa('Bio::Map::MarkerI'); + $self->{'_marker'} = $marker; + } + return $self->{'_marker'}; +} + +=head2 value + + Title : value + Usage : my $pos = $position->value; + Function: Get/Set the value for this postion + Returns : scalar, value + Args : [optional] new value to set + +=cut + +sub value { + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_value'} = $value; + } + return $self->{'_value'}; +} + +=head2 numeric + + Title : numeric + Usage : my $num = $position->numeric; + Function: Read-only method that is guarantied to return a numeric + representation for this position. + Returns : numeric (int or real) + Args : none + +=cut + +sub numeric { + my ($self) = @_; + my $num = $self->{'_value'} || 0; + + # expand this to cover scientific notation, too! + $self->throw("This value [$num] is not numeric!") + unless $num && $num =~ /^[+-]?[\d.]+$/; + return $num; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/PositionI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/PositionI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,144 @@ +# $Id: PositionI.pm,v 1.8 2002/10/22 07:45:16 lapp Exp $ +# +# BioPerl module for Bio::Map::PositionI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::PositionI - Abstracts the notion of a position having + a value in the context of a marker and a Map + +=head1 SYNOPSIS + + # do not use directly + +=head1 DESCRIPTION + +This object stores one of the postions a that a mappable object +(e.g. Marker) may have in a map (e.g. restriction enzymes or a SNP +mapped to several chromosomes). + +The method numeric() returns the position in a form that can be +compared between other positions of the same type. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Lincoln Stein, lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::PositionI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; +use Carp; + +@ISA = qw(Bio::Root::RootI); + + +=head2 map + + Title : map + Usage : my $id = map->$map; + Function: Get/Set the map the position is in. + Returns : L + Args : [optional] new L + +=cut + +sub map { + my ($self, $value) = @_; + $self->throw_not_implemented(); +} + +=head2 marker + + Title : marker + Usage : my $id = marker->$marker; + Function: Get/Set the marker the position is in. + Returns : L + Args : [optional] new L + +=cut + +sub marker { + my ($self, $value) = @_; + $self->throw_not_implemented(); +} + + +=head2 value + + Title : value + Usage : my $pos = $position->value; + Function: Get/Set the value for this position + Returns : scalar, value + Args : [optional] new value to set + +=cut + +sub value { + my ($self, $value) = @_; + $self->throw_not_implemented(); +} + +=head2 numeric + + Title : numeric + Usage : my $num = $position->numeric; + Function: Read-only method that is guarantied to return + representation for this position that can be compared with others + Returns : numeric (int, real or range) + Args : none + +=cut + +sub numeric { + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Map/SimpleMap.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Map/SimpleMap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,292 @@ +# $Id: SimpleMap.pm,v 1.8 2002/10/22 07:45:16 lapp Exp $ +# +# BioPerl module for Bio::Map::SimpleMap +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Map::SimpleMap - A MapI implementation handling the basics of a Map + +=head1 SYNOPSIS + + use Bio::Map::SimpleMap; + my $map = new Bio::Map::SimpleMap(-name => 'genethon', + -type => 'Genetic', + -units=> 'cM', + -species => $human); + foreach my $marker ( @markers ) { # get a list of markers somewhere + $map->add_element($marker); + } + +=head1 DESCRIPTION + +This is the basic implementation of a Bio::Map::MapI. It handles the +essential storage of name, species, type, and units as well as in +memory representation of the elements of a map. + +Subclasses might need to redefine or hardcode type(), length() and +units(). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Heikki Lehvaslaiho heikki@ebi.ac.uk +Lincoln Stein lstein@cshl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Map::SimpleMap; +use vars qw(@ISA $MAPCOUNT); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Map::MapI; + +@ISA = qw(Bio::Root::Root Bio::Map::MapI); +BEGIN { $MAPCOUNT = 1; } + +=head2 new + + Title : new + Usage : my $obj = new Bio::Map::SimpleMap(); + Function: Builds a new Bio::Map::SimpleMap object + Returns : Bio::Map::SimpleMap + Args : -name => name of map (string) + -species => species for this map (Bio::Species) [optional] + -units => map units (string) + -elements=> elements to initialize with + (arrayref of Bio::Map::MappableI objects) [optional] + + -uid => Unique Id +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_elements'} = []; + $self->{'_name'} = ''; + $self->{'_species'} = ''; + $self->{'_units'} = ''; + $self->{'_type'} = ''; + $self->{'_uid'} = $MAPCOUNT++; + my ($name, $type,$species, $units, + $elements,$uid) = $self->_rearrange([qw(NAME TYPE + SPECIES UNITS + ELEMENTS UID)], @args); + defined $name && $self->name($name); + defined $species && $self->species($species); + defined $units && $self->units($units); + defined $type && $self->type($type); + defined $uid && $self->unique_id($uid); + + if( $elements && ref($elements) =~ /array/ ) { + foreach my $item ( @$elements ) { + $self->add_element($item); + } + } + return $self; +} + +=head2 species + + Title : species + Usage : my $species = $map->species; + Function: Get/Set Species for a map + Returns : Bio::Species object or string + Args : (optional) Bio::Species or string + +=cut + +sub species{ + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_species'} = $value; + } + return $self->{'_species'}; +} + +=head2 units + + Title : units + Usage : $map->units('cM'); + Function: Get/Set units for a map + Returns : units for a map + Args : units for a map (string) + +=cut + +sub units{ + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_units'} = $value; + } + return $self->{'_units'}; +} + +=head2 type + + Title : type + Usage : my $type = $map->type + Function: Get/Set Map type + Returns : String coding map type + Args : (optional) string + +=cut + +sub type { + my ($self,$value) = @_; + # this may be hardcoded/overriden by subclasses + + if( defined $value ) { + $self->{'_type'} = $value; + } + return $self->{'_type'}; +} + + +=head2 name + + Title : name + Usage : my $name = $map->name + Function: Get/Set Map name + Returns : Map name + Args : (optional) string + +=cut + +sub name { + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_name'} = $value; + } + return $self->{'_name'}; +} + +=head2 length + + Title : length + Usage : my $length = $map->length(); + Function: Retrieves the length of the map, + It is possible for the length to be unknown + for maps such as Restriction Enzyme, will return undef + in that case + Returns : integer representing length of map in current units + will return undef if length is not calculateable + Args : none + +=cut + +sub length { + my ($self) = @_; + my ($len ) = 0; + + foreach my $marker ($self->each_element) { + $len = $marker->position->numeric if $marker->position->numeric > $len; + } + return $len; +} + + +=head2 unique_id + + Title : unique_id + Usage : my $id = $map->unique_id; + Function: Get/Set the unique ID for this map + Returns : a unique identifier + Args : [optional] new identifier to set + +=cut + +sub unique_id { + my ($self,$id) = @_; + if( defined $id ) { + $self->{'_uid'} = $id; + } + return $self->{'_uid'}; +} + + +=head2 add_element + + Title : add_element + Usage : $map->add_element($marker) + Function: Add a Bio::Map::MappableI object to the Map + Returns : none + Args : Bio::Map::MappableI object + +=cut + +sub add_element{ + my ($self,$mapelement) = @_; + return unless ( defined $mapelement); + + $self->throw("This is not a Bio::Map::MarkerI object but a [$self]") + unless $mapelement->isa('Bio::Map::MarkerI'); + + $mapelement->map($self); # tell the marker its default map + + push @{$self->{'_elements'}}, $mapelement; + +} + +=head2 each_element + + Title : each_element + Usage : my @elements = $map->each_element; + Function: Retrieves all the elements in a map + unordered + Returns : Array of Bio::Map::MappableI objects + Args : none + + +=cut + +sub each_element{ + my ($self) = @_; + return @{$self->{'_elements'}}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/MapIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/MapIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,247 @@ +# $Id: MapIO.pm,v 1.5 2002/10/22 07:45:09 lapp Exp $ +# +# BioPerl module for Bio::MapIO +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::MapIO - A Map Factory object + +=head1 SYNOPSIS + + use Bio::MapIO; + my $mapio = new Bio::MapIO(-format => "mapmaker", + -file => "mapfile.map"); + + while( my $map = $mapio->next_map ) { + # get each map + foreach my $marker ( $map->each_element ) { + # loop through the markers associated with the map + } + } + +=head1 DESCRIPTION + +This is the Factory object for reading Maps from a data stream or file. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::MapIO; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Factory::MapFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::Factory::MapFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::MapIO(); + Function: Builds a new Bio::MapIO object + Returns : Bio::MapIO + Args : + + +=cut + +sub new { + my($caller,@args) = @_; + + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::MapIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{'-file'} || $ARGV[0] ) || + 'mapmaker'; + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::MapIO::$format"->new(@args); + } + +} + +=head2 Bio::Factory::MapFactoryI methods + +=cut + +=head2 next_map + + Title : next_tree + Usage : my $map = $factory->next_map; + Function: Get a map from the factory + Returns : L + Args : none + + +=head2 write_map + + Title : write_tree + Usage : $factory->write_map($map); + Function: Write a map out through the factory + Returns : none + Args : L + +=cut + + +=head2 attach_EventHandler + + Title : attach_EventHandler + Usage : $parser->attatch_EventHandler($handler) + Function: Adds an event handler to listen for events + Returns : none + Args : L + +=cut + +sub attach_EventHandler{ + my ($self,$handler) = @_; + return if( ! $handler ); + if( ! $handler->isa('Bio::Event::EventHandlerI') ) { + $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::Event::EventHandlerI'); + } + $self->{'_handler'} = $handler; + return; +} + +=head2 _eventHandler + + Title : _eventHandler + Usage : private + Function: Get the EventHandler + Returns : L + Args : none + + +=cut + +sub _eventHandler{ + my ($self) = @_; + return $self->{'_handler'}; +} + +sub _initialize { + my($self, @args) = @_; + $self->{'_handler'} = undef; + + # initialize the IO part + $self->_initialize_io(@args); +# $self->attach_EventHandler(new Bio::MapIO::MapEventBuilder()); +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL MapIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self,$format) = @_; + my $module = "Bio::MapIO::" . $format; + my $ok; + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <_guess_format($filename) + Function: + Example : + Returns : guessed format of filename (lower case) + Args : + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'mapmaker' if /\.(map)$/i; + return 'mapxml' if /\.(xml)$/i; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/MapIO/mapmaker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/MapIO/mapmaker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,150 @@ +# $Id: mapmaker.pm,v 1.5 2002/10/22 07:45:16 lapp Exp $ +# +# BioPerl module for Bio::MapIO::mapmaker +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::MapIO::mapmaker - A Mapmaker Map reader + +=head1 SYNOPSIS + +# do not use this object directly it is accessed through the Bio::MapIO system + + use Bio::MapIO; + my $mapio = new Bio::MapIO(-format => "mapmaker", + -file => "mapfile.map"); + while( my $map = $mapio->next_map ) { + # get each map + foreach my $marker ( $map->each_element ) { + # loop through the markers associated with the map + } + } + +=head1 DESCRIPTION + +This object contains code for parsing and processing Mapmaker output +and creating L objects from it. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::MapIO::mapmaker; +use vars qw(@ISA); +use strict; + +use Bio::MapIO; +use Bio::Map::SimpleMap; +use Bio::Map::LinkagePosition; +use Bio::Map::Marker; + +@ISA = qw(Bio::MapIO ); + +=head2 next_map + + Title : next_tree + Usage : my $map = $factory->next_map; + Function: Get a map from the factory + Returns : L + Args : none + +=cut + +sub next_map{ + my ($self) = @_; + my ($ready,$map) = (0,new Bio::Map::SimpleMap('-name' => '', + '-units' => 'cM', + '-type' => 'Genetic')); + my @markers; + my $runningDistance = 0; + while( defined($_ = $self->_readline()) ) { + if ( $ready || /^\s+Markers\s+Distance/ ) { + unless ( $ready ) { $ready = 1; next } + } else { next } + + last if ( /-{5,}/); # map terminator is ------- + s/ +/\t/; + my ($number,$name,$distance) = split; + $runningDistance += $distance; + $runningDistance = '0.0' if $runningDistance == 0; +# print "$_|$number-$name-$distance---------"; + my $pos = new Bio::Map::LinkagePosition (-order => $number, + -map => $map, + -value => $runningDistance + ); + my $marker = new Bio::Map::Marker(-name=> $name, + -position => $pos, + ); + $marker->position($pos); +# use Data::Dumper; print Dumper($marker); exit; +# print $marker->position->value, "\n"; +# use Data::Dumper; print Dumper($pos); +# $map->add_element(new Bio::Map::Marker('-name'=> $name, +# '-position' => $pos, +# )); + } +# return undef if( ! $ready ); + return $map; +} + +=head2 write_map + + Title : write_tree + Usage : $factory->write_map($map); + Function: Write a map out through the factory + Returns : none + Args : Bio::Map::MapI + +=cut + +sub write_map{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Matrix/PhylipDist.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Matrix/PhylipDist.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,377 @@ +# BioPerl module for Bio::Matrix::PhylipDist +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Matrix::PhylipDist - A Phylip Distance Matrix object + +=head1 SYNOPSIS + + use Bio::Matrix::PhylipDist; + + my $dist = Bio::Matrix::PhylipDist->new(-file=>"protdist.out",-program=>"ProtDist"); + #or + my $dist = Bio::Matrix::PhylipDist->new(-fh=>$FH,-program=>"ProtDist"); + + #get specific entries + my $distance_value = $dist->get_entry('ALPHA','BETA'); + my @columns = $dist->get_column('ALPHA'); + my @rows = $dist->get_row('BETA'); + my @diagonal = $dist->get_diagonal(); + + #print the matrix in phylip numerical format + print $dist->print_matrix; + +=head1 DESCRIPTION + +Simple object for holding Distance Matrices generated by the following Phylip programs: + +1) dnadist +2) protdist +3) restdist + +It currently handles parsing of the matrix without the data output option. + + 5 +Alpha 0.00000 4.23419 3.63330 6.20865 3.45431 +Beta 4.23419 0.00000 3.49289 3.36540 4.29179 +Gamma 3.63330 3.49289 0.00000 3.68733 5.84929 +Delta 6.20865 3.36540 3.68733 0.00000 4.43345 +Epsilon 3.45431 4.29179 5.84929 4.43345 0.00000 + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + + +=head1 APPENDIX + + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a "_". + +=cut + +# Let the code begin... + +package Bio::Matrix::PhylipDist; +use strict; + +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +=head2 new + + Title : new + Usage : my $family = Bio::Matrix::PhylipDist->new(-file=>"protdist.out", + -program=>"protdist"); + Function: Constructor for PhylipDist Object + Returns : L + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($matrix,$values, $names,$file, $fh,$program) = $self->_rearrange([qw(MATRIX VALUES NAMES FILE FH PROGRAM)],@args); + + ($matrix && $values && $names) || $file || $fh || $self->throw("Need a file or file handle!"); + + $program && $self->program($program); + $self->_initialize_io(@args); + + $self->_matrix($matrix) if $matrix; + $self->_values($values) if $values; + $self->names($names) if $names; + if(!$matrix && !$values && !$names){ + $self->_parse(); + } + + return $self; +} + +=head2 _parse + + Title : _parse + Usage : $matrix->_parse(); + Function: internal method that parses the distance matrix file. + Returns : + Arguments: + +=cut + +sub _parse { + my ($self) = @_; + my @names; + my @values; + while (my $entry = $self->_readline){ + next if ($entry=~/^\s+\d+$/); + my ($n,@line) = split( /\s+/,$entry); + push @names, $n; + push @values, [@line]; + } + + my %dist; + my $i=0; + foreach my $name(@names){ + my $j=0; + foreach my $n(@names) { + $dist{$name}{$n} = [$i,$j]; + $j++; + } + $i++; + } + + $self->_matrix(\%dist); + $self->names(\@names); + $self->_values(\@values); +} + +=head2 get_entry + + Title : get_entry + Usage : $matrix->get_entry(); + Function: returns a particular entry + Returns : a float + Arguments: string id1, string id2 + +=cut + +sub get_entry { + my ($self,$row,$column) = @_; + $row && $column || $self->throw("Need at least 2 ids"); + my %matrix = %{$self->_matrix}; + my @values = @{$self->_values}; + if(ref $matrix{$row}{$column}){ + my ($i,$j) = @{$matrix{$row}{$column}}; + return $values[$i][$j]; + } + return; + +} + +=head2 get_row + + Title : get_row + Usage : $matrix->get_row('ALPHA'); + Function: returns a particular row + Returns : an array of float + Arguments: string id1 + +=cut + +sub get_row { + my ($self,$row) = @_; + $row || $self->throw("Need at least a row id"); + + my %matrix = %{$self->_matrix}; + my @values = @{$self->_values}; + my @names = @{$self->names}; + $matrix{$row} || return; + my @row = %{$matrix{$row}}; + my $row_pointer = $row[1]->[0]; + my $index = scalar(@names)-1; + return @{$values[$row_pointer]}[0..$index]; +} + +=head2 get_column + + Title : get_column + Usage : $matrix->get_column('ALPHA'); + Function: returns a particular column + Returns : an array of floats + Arguments: string id1 + +=cut + +sub get_column { + my ($self,$column) = @_; + $column || $self->throw("Need at least a column id"); + + my %matrix = %{$self->_matrix}; + my @values = @{$self->_values}; + my @names = @{$self->names}; + $matrix{$column} || return; + my @column = %{$matrix{$column}}; + my $row_pointer = $column[1]->[0]; + my @return; + for(my $i=0; $i < scalar(@names); $i++){ + push @return, $values[$i][$row_pointer]; + } + return @return; +} + +=head2 get_diagonal + + Title : get_diagonal + Usage : $matrix->get_diagonal(); + Function: returns the diagonal of the matrix + Returns : an array of float + Arguments: string id1 + +=cut + +sub get_diagonal { + my ($self) = @_; + my %matrix = %{$self->_matrix}; + my @values = @{$self->_values}; + my @return; + foreach my $name (@{$self->names}){ + my ($i,$j) = @{$matrix{$name}{$name}}; + push @return,$values[$i][$j]; + } + return @return; +} + +=head2 print_matrix + + Title : print_matrix + Usage : $matrix->print_matrix(); + Function: returns a string of the matrix in phylip format + Returns : a string + Arguments: + +=cut + +sub print_matrix { + my ($self) = @_; + my @names = @{$self->names}; + my @values = @{$self->_values}; + my %matrix = %{$self->_matrix}; + my $str; + $str.= (" "x 4). scalar(@names)."\n"; + foreach my $name (@names){ + my $newname = $name. (" " x (15-length($name))); + $str.=$newname; + my $count = 0; + foreach my $n (@names){ + my ($i,$j) = @{$matrix{$name}{$n}}; + if($count < $#names){ + $str.= $values[$i][$j]. " "; + } + else { + $str.= $values[$i][$j]; + } + $count++; + } + $str.="\n"; + } + return $str; +} + +=head2 _matrix + + Title : _matrix + Usage : $matrix->_matrix(); + Function: get/set for hash reference of the pointers + to the value matrix + Returns : hash reference + Arguments: hash reference + +=cut + +sub _matrix { + my ($self,$val) = @_; + if($val){ + $self->{'_matrix'} = $val; + } + return $self->{'_matrix'}; +} + + +=head2 names + + Title : names + Usage : $matrix->names(); + Function: get/set for array ref of names of sequences + Returns : an array reference + Arguments: an array reference + +=cut + +sub names { + my ($self,$val) = @_; + if($val){ + $self->{"_names"} = $val; + } + return $self->{'_names'}; +} + +=head2 program + + Title : program + Usage : $matrix->program(); + Function: get/set for the program name generating this + matrix + Returns : string + Arguments: string + +=cut + +sub program { + my ($self,$val) = @_; + if($val){ + $self->{'_program'} = $val; + } + return $self->{'_program'}; +} + +=head2 _values + + Title : _values + Usage : $matrix->_values(); + Function: get/set for array ref of the matrix containing + distance values + Returns : an array reference + Arguments: an array reference + +=cut + +sub _values { + my ($self,$val) = @_; + if($val){ + $self->{'_values'} = $val; + } + return $self->{'_values'}; +} + +1; + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/GOterm.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/GOterm.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,362 @@ +# $Id: GOterm.pm,v 1.12.2.4 2003/07/03 00:41:40 lapp Exp $ +# +# BioPerl module for Bio::Ontology::GOterm +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +GOterm - representation of GO terms + +=head1 SYNOPSIS + + $term = Bio::Ontology::GOterm->new + ( -go_id => "GO:0016847", + -name => "1-aminocyclopropane-1-carboxylate synthase", + -definition => "Catalysis of ...", + -is_obsolete => 0, + -comment => "" ); + + $term->add_definition_references( @refs ); + $term->add_secondary_GO_ids( @ids ); + $term->add_aliases( @aliases ); + + foreach my $dr ( $term->each_definition_reference() ) { + print $dr, "\n"; + } + + # etc. + +=head1 DESCRIPTION + +This is "dumb" class for GO terms (it provides no functionality related to graphs). +Implements Bio::Ontology::TermI. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + +package Bio::Ontology::GOterm; +use vars qw( @ISA ); +use strict; +use Bio::Ontology::Term; + +use constant GOID_DEFAULT => "GO:0000000"; +use constant TRUE => 1; +use constant FALSE => 0; + +@ISA = qw( Bio::Ontology::Term ); + + + + +=head2 new + + Title : new + Usage : $term = Bio::Ontology::GOterm->new( -go_id => "GO:0016847", + -name => "1-aminocyclopropane-1-carboxylate synthase", + -definition => "Catalysis of ...", + -is_obsolete => 0, + -comment => "" ); + Function: Creates a new Bio::Ontology::GOterm. + Returns : A new Bio::Ontology::GOterm object. + Args : -go_id => the goid of this GO term [GO:nnnnnnn] + or [nnnnnnn] (nnnnnnn is a zero-padded + integer of seven digits) + -name => the name of this GO term [scalar] + -definition => the definition of this GO term [scalar] + -ontology => the ontology for this term (a + Bio::Ontology::OntologyI compliant object) + -version => version information [scalar] + -is_obsolete => the obsoleteness of this GO term [0 or 1] + -comment => a comment [scalar] + +=cut + +sub new { + + my( $class,@args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $GO_id ) + = $self->_rearrange( [ qw( GO_ID ) ], @args ); + + $GO_id && $self->GO_id( $GO_id ); + + + return $self; + +} # new + + +=head2 init + + Title : init() + Usage : $term->init(); + Function: Initializes this GOterm to all "" and empty lists. + Returns : + Args : + +=cut + +sub init { + + my $self = shift; + + # first call the inherited version to properly chain up the hierarchy + $self->SUPER::init(@_); + + # then only initialize what we implement ourselves here + #$self->GO_id( GOID_DEFAULT ); + +} # init + + + + +=head2 GO_id + + Title : GO_id + Usage : $term->GO_id( "GO:0003947" ); + or + print $term->GO_id(); + Function: Set/get for the goid of this GO term. + + This is essentially an alias to identifier(), with added + format checking. + + Returns : The goid [GO:nnnnnnn]. + Args : The goid [GO:nnnnnnn] or [nnnnnnn] (nnnnnnn is a + zero-padded integer of seven digits) (optional). + +=cut + +sub GO_id { + my $self = shift; + my $value; + + if ( @_ ) { + $value = $self->_check_go_id( shift ); + unshift(@_, $value); + } + + return $self->identifier( @_ ); + +} # GO_id + + +=head2 get_secondary_GO_ids + + Title : get_secondary_GO_ids + Usage : @ids = $term->get_secondary_GO_ids(); + Function: Returns a list of secondary goids of this Term. + + This is aliased to remove_secondary_ids(). + + Returns : A list of secondary goids [array of [GO:nnnnnnn]] + (nnnnnnn is a zero-padded integer of seven digits). + Args : + +=cut + +sub get_secondary_GO_ids { + return shift->get_secondary_ids(@_); +} # get_secondary_GO_ids + + +=head2 add_secondary_GO_id + + Title : add_secondary_GO_id + Usage : $term->add_secondary_GO_id( @ids ); + or + $term->add_secondary_GO_id( $id ); + Function: Pushes one or more secondary goids into + the list of secondary goids. + + This is aliased to remove_secondary_ids(). + + Returns : + Args : One secondary goid [GO:nnnnnnn or nnnnnnn] or a list + of secondary goids [array of [GO:nnnnnnn or nnnnnnn]] + (nnnnnnn is a zero-padded integer of seven digits). + +=cut + +sub add_secondary_GO_id { + return shift->add_secondary_id(@_); +} # add_secondary_GO_id + + +=head2 remove_secondary_GO_ids + + Title : remove_secondary_GO_ids() + Usage : $term->remove_secondary_GO_ids(); + Function: Deletes (and returns) the secondary goids of this Term. + + This is aliased to remove_secondary_ids(). + + Returns : A list of secondary goids [array of [GO:nnnnnnn]] + (nnnnnnn is a zero-padded integer of seven digits). + Args : + +=cut + +sub remove_secondary_GO_ids { + return shift->remove_secondary_ids(@_); +} # remove_secondary_GO_ids + + + + +=head2 to_string + + Title : to_string() + Usage : print $term->to_string(); + Function: to_string method for GO terms. + Returns : A string representation of this GOterm. + Args : + +=cut + +sub to_string { + my( $self ) = @_; + + my $s = ""; + + $s .= "-- GO id:\n"; + $s .= ($self->GO_id() || '')."\n"; + $s .= "-- Name:\n"; + $s .= ($self->name() || '') ."\n"; + $s .= "-- Definition:\n"; + $s .= ($self->definition() || '') ."\n"; + $s .= "-- Category:\n"; + if ( defined( $self->ontology() ) ) { + $s .= $self->ontology()->name()."\n"; + } + else { + $s .= "\n"; + } + $s .= "-- Version:\n"; + $s .= ($self->version() || '') ."\n"; + $s .= "-- Is obsolete:\n"; + $s .= $self->is_obsolete()."\n"; + $s .= "-- Comment:\n"; + $s .= ($self->comment() || '') ."\n"; + $s .= "-- Definition references:\n"; + $s .= $self->_array_to_string( $self->get_dblinks() )."\n"; + $s .= "-- Secondary GO ids:\n"; + $s .= $self->_array_to_string( $self->get_secondary_GO_ids() )."\n"; + $s .= "-- Aliases:\n"; + $s .= $self->_array_to_string( $self->get_synonyms() ); + + return $s; + +} # to_string + + + + +# Title : _check_go_id +# Function: Checks whether the argument is [GO:nnnnnnn]. +# If "GO:" is not present, it adds it. +# Returns : The canonical GO id. +# Args : The value to be checked. +sub _check_go_id { + my ( $self, $value ) = @_; + unless ( $value =~ /^(GO:)?\d{7}$/ || $value eq GOID_DEFAULT ) { + $self->throw( "Found [" . $value + . "] where [GO:nnnnnnn] or [nnnnnnn] expected" ); + } + unless ( $value =~ /^GO:/ ) { + $value = "GO:".$value; + } + return $value; +} # _check_go_id + + + +# Title : _array_to_string +# Function: +# Returns : +# Args : +sub _array_to_string { + my( $self, @value ) = @_; + + my $s = ""; + + for ( my $i = 0; $i < scalar( @value ); ++$i ) { + if ( ! ref( $value[ $i ] ) ) { + $s .= "#" . $i . "\n-- " . $value[ $i ] . "\n"; + } + } + + return $s; + +} # _array_to_string + +################################################################# +# aliases or forwards to maintain backward compatibility +################################################################# + +*each_secondary_GO_id = \&get_secondary_GO_ids; +*add_secondary_GO_ids = \&add_secondary_GO_id; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/InterProTerm.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/InterProTerm.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,644 @@ +# $Id: InterProTerm.pm,v 1.4.2.2 2003/03/25 12:32:16 heikki Exp $ +# +# BioPerl module for Bio::Ontology::InterProTerm +# +# Cared for by Peter Dimitrov +# +# Copyright Peter Dimitrov +# (c) Peter Dimitrov, dimitrov@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# POD documentation - main docs before the code + +=head1 NAME + +InterProTerm - Implementation of InterProI term interface + +=head1 SYNOPSIS + + my $term = Bio::Ontology::InterProTerm->new( -interpro_id => "IPR000001", + -name => "Kringle", + -definition => "Kringles are autonomous structural domains ...", + -ontology => "Domain" + ); + print $term->interpro_id(), "\n"; + print $term->name(), "\n"; + print $term->definition(), "\n"; + print $term->is_obsolete(), "\n"; + print $term->ontology->name(), "\n"; + +=head1 DESCRIPTION + +This is a simple extension of L for InterPro terms. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::InterProTerm; +use vars qw(@ISA); +use strict; + +use Bio::Ontology::Term; +use Bio::Annotation::Reference; + +use constant INTERPRO_ID_DEFAULT => "IPR000000"; + +@ISA = qw( Bio::Ontology::Term ); + +=head2 new + + Title : new + Usage : $term = Bio::Ontology::InterProTerm->new( -interpro_id => "IPR000002", + -name => "Cdc20/Fizzy", + -definition => "The Cdc20/Fizzy region is almost always ...", + -ontology => "Domain" + ); + + Function: Creates a new Bio::Ontology::InterProTerm. + Example : + Returns : A new Bio::Ontology::InterProTerm object. + Args : + -interpro_id => the InterPro ID of the term. Has the form IPRdddddd, where dddddd is a zero-padded six digit number + -name => the name of this InterPro term [scalar] + -definition => the definition/abstract of this InterPro term [scalar] + -ontology => ontology of InterPro terms [Bio::Ontology::OntologyI] + -comment => a comment [scalar] + +=cut + +sub new{ + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ( $interpro_id, + $short_name) + = $self->_rearrange( [qw( INTERPRO_ID + SHORT_NAME + ) + ], @args ); + + $interpro_id && $self->interpro_id( $interpro_id ); + $short_name && $self->short_name( $short_name ); + + return $self; +} + +=head2 init + + Title : init + Usage : $term->init(); + Function: Initializes this InterProTerm to all "" and empty lists. + Example : + Returns : + Args : + + +=cut + +sub init{ + my $self = shift; + + # first call the inherited version to properly chain up the hierarchy + $self->SUPER::init(@_); + + # then only initialize what we implement ourselves here + $self->interpro_id( INTERPRO_ID_DEFAULT ); + $self->short_name(""); + +} + +=head2 _check_interpro_id + + Title : _check_interpro_id + Usage : + Function: Performs simple check in order to validate that its argument has the form IPRdddddd, where dddddd is a zero-padded six digit number. + Example : + Returns : Returns its argument if valid, otherwise throws exception. + Args : String + + +=cut + +sub _check_interpro_id{ + my ($self, $value) = @_; + + $self->throw( "InterPro ID ".$value." is incorrect\n" ) + unless ( $value =~ /^IPR\d{6}$/ || + $value eq INTERPRO_ID_DEFAULT ); + + return $value; +} + +=head2 interpro_id + + Title : interpro_id + Usage : $obj->interpro_id($newval) + Function: Set/get for the interpro_id of this InterProTerm + Example : + Returns : value of interpro_id (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub interpro_id{ + my ($self, $value) = @_; + + if( defined $value) { + $value = $self->_check_interpro_id($value); + } + + return $self->identifier($value); +} + +=head2 short_name + + Title : short_name + Usage : $obj->short_name($newval) + Function: Set/get for the short name of this InterProTerm. + Example : + Returns : value of short_name (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub short_name{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'short_name'} = $value ? $value : undef; + } + + return $self->{'short_name'}; +} + +=head2 protein_count + + Title : protein_count + Usage : $obj->protein_count($newval) + Function: Set/get for the protein count of this InterProTerm. + Example : + Returns : value of protein_count (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub protein_count{ + my ($self,$value) = @_; + + if( defined $value) { + $self->{'protein_count'} = $value ? $value : undef; + } + + return $self->{'protein_count'}; +} + +=head2 get_references + + Title : get_references + Usage : + Function: Get the references for this InterPro term. + Example : + Returns : An array of L objects + Args : + + +=cut + +sub get_references{ + my $self = shift; + + return @{$self->{"_references"}} if exists($self->{"_references"}); + return (); +} + +=head2 add_reference + + Title : add_reference + Usage : + Function: Add one or more references to this InterPro term. + Example : + Returns : + Args : One or more L objects. + + +=cut + +sub add_reference{ + my $self = shift; + + $self->{"_references"} = [] unless exists($self->{"_references"}); + push(@{$self->{"_references"}}, @_); +} + +=head2 remove_references + + Title : remove_references + Usage : + Function: Remove all references for this InterPro term. + Example : + Returns : The list of previous references as an array of + L objects. + Args : + + +=cut + +sub remove_references{ + my $self = shift; + + my @arr = $self->get_references(); + $self->{"_references"} = []; + return @arr; +} + +=head2 get_members + + Title : get_members + Usage : @arr = get_members() + Function: Get the list of member(s) for this object. + Example : + Returns : An array of Bio::Annotation::DBLink objects + Args : + + +=cut + +sub get_members{ + my $self = shift; + + return @{$self->{'_members'}} if exists($self->{'_members'}); + return (); +} + +=head2 add_member + + Title : add_member + Usage : + Function: Add one or more member(s) to this object. + Example : + Returns : + Args : One or more Bio::Annotation::DBLink objects. + + +=cut + +sub add_member{ + my $self = shift; + + $self->{'_members'} = [] unless exists($self->{'_members'}); + push(@{$self->{'_members'}}, @_); +} + +=head2 remove_members + + Title : remove_members + Usage : + Function: Remove all members for this class. + Example : + Returns : The list of previous members as an array of + Bio::Annotation::DBLink objects. + Args : + + +=cut + +sub remove_members{ + my $self = shift; + + my @arr = $self->get_members(); + $self->{'_members'} = []; + return @arr; +} + +=head2 get_examples + + Title : get_examples + Usage : @arr = get_examples() + Function: Get the list of example(s) for this object. + + This is an element of the InterPro xml schema. + + Example : + Returns : An array of Bio::Annotation::DBLink objects + Args : + + +=cut + +sub get_examples{ + my $self = shift; + + return @{$self->{'_examples'}} if exists($self->{'_examples'}); + return (); +} + +=head2 add_example + + Title : add_example + Usage : + Function: Add one or more example(s) to this object. + + This is an element of the InterPro xml schema. + + Example : + Returns : + Args : One or more Bio::Annotation::DBLink objects. + + +=cut + +sub add_example{ + my $self = shift; + + $self->{'_examples'} = [] unless exists($self->{'_examples'}); + push(@{$self->{'_examples'}}, @_); +} + +=head2 remove_examples + + Title : remove_examples + Usage : + Function: Remove all examples for this class. + + This is an element of the InterPro xml schema. + + Example : + Returns : The list of previous examples as an array of + Bio::Annotation::DBLink objects. + Args : + + +=cut + +sub remove_examples{ + my $self = shift; + + my @arr = $self->get_examples(); + $self->{'_examples'} = []; + return @arr; +} + +=head2 get_external_documents + + Title : get_external_documents + Usage : @arr = get_external_documents() + Function: Get the list of external_document(s) for this object. + + This is an element of the InterPro xml schema. + + Example : + Returns : An array of Bio::Annotation::DBLink objects + Args : + + +=cut + +sub get_external_documents{ + my $self = shift; + + return @{$self->{'_external_documents'}} if exists($self->{'_external_documents'}); + return (); +} + +=head2 add_external_document + + Title : add_external_document + Usage : + Function: Add one or more external_document(s) to this object. + + This is an element of the InterPro xml schema. + + Example : + Returns : + Args : One or more Bio::Annotation::DBLink objects. + + +=cut + +sub add_external_document{ + my $self = shift; + + $self->{'_external_documents'} = [] unless exists($self->{'_external_documents'}); + push(@{$self->{'_external_documents'}}, @_); +} + +=head2 remove_external_documents + + Title : remove_external_documents + Usage : + Function: Remove all external_documents for this class. + + This is an element of the InterPro xml schema. + + Example : + Returns : The list of previous external_documents as an array of + Bio::Annotation::DBLink objects. + Args : + + +=cut + +sub remove_external_documents{ + my $self = shift; + + my @arr = $self->get_external_documents(); + $self->{'_external_documents'} = []; + return @arr; +} + +=head2 class_list + + Title : class_list + Usage : $obj->class_list($newval) + Function: Set/get for class list element of the InterPro xml schema + Example : + Returns : reference to an array of Bio::Annotation::DBLink objects + Args : reference to an array of Bio::Annotation::DBLink objects + + +=cut + +sub class_list{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'class_list'} = $value; + } + + return $self->{'class_list'}; +} + +=head2 to_string + + Title : to_string() + Usage : print $term->to_string(); + Function: to_string method for InterPro terms. + Returns : A string representation of this InterPro term. + Args : + +=cut + +sub to_string { + my($self) = @_; + my $s = ""; + + $s .= "-- InterPro id:\n"; + $s .= $self->interpro_id()."\n"; + if (defined $self->name) { + $s .= "-- Name:\n"; + $s .= $self->name()."\n"; + $s .= "-- Definition:\n"; + $s .= $self->definition()."\n"; + $s .= "-- Category:\n"; + if ( defined( $self->ontology() ) ) { + $s .= $self->ontology()->name()."\n"; + } else { + $s .= "\n"; + } + $s .= "-- Version:\n"; + $s .= $self->version()."\n"; + $s .= "-- Is obsolete:\n"; + $s .= $self->is_obsolete()."\n"; + $s .= "-- Comment:\n"; + $s .= $self->comment()."\n"; + if (defined $self->references) { + $s .= "-- References:\n"; + foreach my $ref ( @{$self->references} ) { + $s .= $ref->authors."\n".$ref->title."\n".$ref->location."\n\n"; + }; + $s .= "\n"; + } + if (defined $self->member_list) { + $s .= "-- Member List:\n"; + foreach my $ref ( @{$self->member_list} ) { + $s .= $ref->database."\t".$ref->primary_id."\n"; + }; + $s .= "\n"; + } + if (defined $self->external_doc_list) { + $s .= "-- External Document List:\n"; + foreach my $ref ( @{$self->external_doc_list} ) { + $s .= $ref->database."\t".$ref->primary_id."\n"; + }; + $s .= "\n"; + } + if (defined $self->examples) { + $s .= "-- Examples:\n"; + foreach my $ref ( @{$self->examples} ) { + $s .= $ref->database."\t".$ref->primary_id."\t".$ref->comment."\n"; + }; + $s .= "\n"; + } + if (defined $self->class_list) { + $s .= "-- Class List:\n"; + foreach my $ref ( @{$self->class_list} ) { + $s .= $ref->primary_id."\n"; + }; + $s .= "\n"; + } + if ($self->get_secondary_ids) { + $s .= "-- Secondary IDs:\n"; + foreach my $ref ( $self->get_secondary_ids() ) { + $s .= $ref."\n"; + }; + $s .= "\n"; + } + } + else { + $s .= "InterPro term not fully instantiated\n"; + } + return $s; +} + +=head1 Deprecated methods + +These are here for backwards compatibility. + +=cut + +=head2 secondary_ids + + Title : secondary_ids + Usage : $obj->secondary_ids($newval) + Function: This is deprecated. Use get_secondary_ids() or + add_secondary_id() instead. + Example : + Returns : reference to an array of strings + Args : reference to an array of strings + + +=cut + +sub secondary_ids{ + my $self = shift; + my @ids; + + $self->warn("secondary_ids is deprecated. Use ". + "get_secondary_ids/add_secondary_id instead."); + + # set mode? + if(@_) { + my $sids = shift; + if($sids) { + $self->add_secondary_id(@$sids); + @ids = @$sids; + } else { + # we interpret setting to undef as removing the array + $self->remove_secondary_ids(); + } + } else { + # no; get mode + @ids = $self->get_secondary_ids(); + } + return \@ids; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/Ontology.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/Ontology.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,683 @@ +# $Id: Ontology.pm,v 1.2.2.4 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Bio::Ontology::Ontology +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Ontology::Ontology - standard implementation of an Ontology + +=head1 SYNOPSIS + + use Bio::Ontology::Ontology; + + # create ontology object + my $ont = Bio::Ontology::Ontology->new(-name => "OBF"); + + # add terms, relationships ... + my $bp = Bio::Ontology::Term->new(-name => "Bioperl"); + my $obf = Bio::Ontology::Term->new(-name => "OBF"); + my $partof = Bio::Ontology::RelationshipType->get_instance("PART_OF"); + $ont->add_term($bp); + $ont->add_term($obf); + $ont->add_relationship($bp, $obf, $partof); + + # then query + my @terms = $ont->get_root_terms(); # "OBF" + my @desc = $ont->get_descendant_terms($terms[0], $partof); # "Bioperl" + # ... see methods for other ways to query + + # for advanced users, you can re-use the query engine outside of an + # ontology to let one instance manage multiple ontologies + my $ont2 = Bio::Ontology::Ontology->new(-name => "Foundations", + -engine => $ont->engine()); + + +=head1 DESCRIPTION + +This is a no-frills implementation of L. + +The query functions are implemented by delegation to an +OntologyEngineI implementation. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::Ontology; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Ontology::OntologyI; +use Bio::Ontology::SimpleOntologyEngine; + +@ISA = qw(Bio::Root::Root Bio::Ontology::OntologyI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Ontology::Ontology(); + Function: Builds a new Bio::Ontology::Ontology object + Returns : an instance of Bio::Ontology::Ontology + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($name,$auth,$def,$id,$engine) = + $self->_rearrange([qw(NAME + AUTHORITY + DEFINITION + IDENTIFIER + ENGINE) + ], + @args); + defined($name) && $self->name($name); + defined($auth) && $self->authority($auth); + defined($def) && $self->definition($def); + defined($id) && $self->identifier($id); + $engine = Bio::Ontology::SimpleOntologyEngine->new() unless $engine; + $self->engine($engine); + + return $self; +} + +=head1 Methods from L + +=cut + +=head2 name + + Title : name + Usage : $obj->name($newval) + Function: Get/set the name of the ontology. + Example : + Returns : value of name (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub name{ + my $self = shift; + + return $self->{'name'} = shift if @_; + return $self->{'name'}; +} + +=head2 authority + + Title : authority + Usage : $obj->authority($newval) + Function: Get/set the authority for this ontology, for instance the + DNS base for the organization granting the name of the + ontology and identifiers for the terms. + + This attribute is optional and should not generally + expected by applications to have been set. It is here to + follow the rules for namespaces, which ontologies serve as + for terms. + + Example : + Returns : value of authority (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub authority{ + my $self = shift; + + return $self->{'authority'} = shift if @_; + return $self->{'authority'}; +} + +=head2 definition + + Title : definition + Usage : $obj->definition($newval) + Function: Get/set a descriptive definition of the ontology. + Example : + Returns : value of definition (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub definition{ + my $self = shift; + + return $self->{'definition'} = shift if @_; + return $self->{'definition'}; +} + +=head2 identifier + + Title : identifier + Usage : $id = $obj->identifier() + Function: Get an identifier for this ontology. + + This is primarily intended for look-up purposes. The value + is not modifiable and is determined automatically by the + implementation. Also, the identifier's uniqueness will only + hold within the scope of a particular application's run + time since it is derived from a memory location. + + Example : + Returns : value of identifier (a scalar) + Args : + + +=cut + +sub identifier{ + my $self = shift; + + if(@_) { + $self->throw("cannot modify identifier for ".ref($self)) + if exists($self->{'identifier'}); + my $id = shift; + $self->{'identifier'} = $id if $id; + } + if(! exists($self->{'identifier'})) { + ($self->{'identifier'}) = "$self" =~ /(0x[0-9a-fA-F]+)/; + } + return $self->{'identifier'}; +} + +=head2 close + + Title : close + Usage : + Function: Release any resources this ontology may occupy. In order + to efficiently release unused memory or file handles, you + should call this method once you are finished with an + ontology. + + Example : + Returns : TRUE on success and FALSE otherwise + Args : none + + +=cut + +sub close{ + my $self = shift; + + # if it is in the ontology store, remove it from there + my $store = Bio::Ontology::OntologyStore->get_instance(); + $store->remove_ontology($self); + # essentially we need to dis-associate from the engine here + $self->engine(undef); + return 1; +} + +=head1 Implementation-specific public methods + +=cut + +=head2 engine + + Title : engine + Usage : $engine = $obj->engine() + Function: Get/set the ontology engine to which all the query methods + delegate. + Example : + Returns : an object implementing L + Args : on set, new value (an object implementing + L, or undef) + + +=cut + +sub engine{ + my $self = shift; + + if(@_) { + my $engine = shift; + if($engine && (! (ref($engine) && + $engine->isa("Bio::Ontology::OntologyEngineI")))) { + $self->throw("object of class ".ref($engine)." does not implement". + " Bio::Ontology::OntologyEngineI. Bummer!"); + } + $self->{'engine'} = $engine; + } + return $self->{'engine'}; +} + +=head1 Methods defined in L + +=cut + +=head2 add_term + + Title : add_term + Usage : add_term(TermI term): TermI + Function: Adds TermI object to the ontology engine term store + + If the ontology property of the term object was not set, + this implementation will set it to itself upon adding the + term. + + Example : $oe->add_term($term) + Returns : its argument. + Args : object of class TermI. + + +=cut + +sub add_term{ + my $self = shift; + my $term = shift; + + # set ontology if not set already + $term->ontology($self) if $term && (! $term->ontology()); + return $self->engine->add_term($term,@_); +} + +=head2 add_relationship + + Title : add_relationship + Usage : add_relationship(RelationshipI relationship): RelationshipI + add_relatioship(TermI subject, TermI predicate, TermI object) + Function: Adds a relationship object to the ontology engine. + Example : + Returns : Its argument. + Args : A RelationshipI object. + + +=cut + +sub add_relationship{ + my $self = shift; + my $rel = shift; + + if($rel && $rel->isa("Bio::Ontology::TermI")) { + # we need to construct the relationship object on the fly + my ($predicate,$object) = @_; + $rel = Bio::Ontology::Relationship->new(-subject_term => $rel, + -object_term => $object, + -predicate_term => $predicate, + -ontology => $self); + } + # set ontology if not set already + $rel->ontology($self) unless $rel->ontology(); + return $self->engine->add_relationship($rel); +} + +=head2 get_relationships + + Title : get_relationships + Usage : get_relationships(TermI term): RelationshipI[] + Function: Retrieves all relationship objects in the ontology, or all + relationships of a given term. + Example : + Returns : Array of Bio::Ontology::RelationshipI objects + Args : Optionally, a Bio::Ontology::TermI compliant object + + +=cut + +sub get_relationships{ + my $self = shift; + my $term = shift; + if($term) { + # we don't need to filter in this case + return $self->engine->get_relationships($term); + } + # else we need to filter by ontology + return grep { my $ont = $_->ontology; + # the first condition is a superset of the second, but + # we add it here for efficiency reasons, as many times + # it will short-cut to true and is supposedly faster than + # string comparison + ($ont == $self) || ($ont->name eq $self->name); + } $self->engine->get_relationships(@_); +} + +=head2 get_predicate_terms + + Title : get_predicate_terms + Usage : get_predicate_terms(): TermI[] + Function: Retrieves all relationship types. + Example : + Returns : Array of TermI objects + Args : + + +=cut + +sub get_predicate_terms{ + my $self = shift; + return grep { $_->ontology->name eq $self->name; + } $self->engine->get_predicate_terms(@_); +} + +=head2 get_child_terms + + Title : get_child_terms + Usage : get_child_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all child terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_child_terms is a special + case of get_descendant_terms, limiting the search to the + direct descendants. + + Note that a returned term may possibly be in another + ontology than this one, because the underlying engine may + manage multiple ontologies and the relationships of terms + between them. If you only want descendants within this + ontology, you need to filter the returned array. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_child_terms{ + return shift->engine->get_child_terms(@_); +} + +=head2 get_descendant_terms + + Title : get_descendant_terms + Usage : get_descendant_terms(TermI term, TermI[] rel_types): TermI[] + Function: Retrieves all descendant terms of a given term, that + satisfy a relationship among those that are specified in + the second argument or undef otherwise. + + Note that a returned term may possibly be in another + ontology than this one, because the underlying engine may + manage multiple ontologies and the relationships of terms + between them. If you only want descendants within this + ontology, you need to filter the returned array. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_descendant_terms{ + return shift->engine->get_descendant_terms(@_); +} + +=head2 get_parent_terms + + Title : get_parent_terms + Usage : get_parent_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all parent terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_parent_terms is a special + case of get_ancestor_terms, limiting the search to the + direct ancestors. + + Note that a returned term may possibly be in another + ontology than this one, because the underlying engine may + manage multiple ontologies and the relationships of terms + between them. If you only want descendants within this + ontology, you need to filter the returned array. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_parent_terms{ + return shift->engine->get_parent_terms(@_); +} + +=head2 get_ancestor_terms + + Title : get_ancestor_terms + Usage : get_ancestor_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all ancestor terms of a given term, that satisfy + a relationship among those that are specified in the second + argument or undef otherwise. + + Note that a returned term may possibly be in another + ontology than this one, because the underlying engine may + manage multiple ontologies and the relationships of terms + between them. If you only want descendants within this + ontology, you need to filter the returned array. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_ancestor_terms{ + return shift->engine->get_ancestor_terms(@_); +} + +=head2 get_leaf_terms + + Title : get_leaf_terms + Usage : get_leaf_terms(): TermI[] + Function: Retrieves all leaf terms from the ontology. Leaf term is a + term w/o descendants. + + Example : @leaf_terms = $obj->get_leaf_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_leaf_terms{ + my $self = shift; + return grep { my $ont = $_->ontology; + # the first condition is a superset of the second, but + # we add it here for efficiency reasons, as many times + # it will short-cut to true and is supposedly faster than + # string comparison + ($ont == $self) || ($ont->name eq $self->name); + } $self->engine->get_leaf_terms(@_); +} + +=head2 get_root_terms() + + Title : get_root_terms + Usage : get_root_terms(): TermI[] + Function: Retrieves all root terms from the ontology. Root term is a + term w/o descendants. + + Example : @root_terms = $obj->get_root_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_root_terms{ + my $self = shift; + return grep { my $ont = $_->ontology; + # the first condition is a superset of the second, but + # we add it here for efficiency reasons, as many times + # it will short-cut to true and is supposedly faster than + # string comparison + ($ont == $self) || ($ont->name eq $self->name); + } $self->engine->get_root_terms(@_); +} + +=head2 get_all_terms + + Title : get_all_terms + Usage : get_all_terms: TermI[] + Function: Retrieves all terms from the ontology. + + We do not mandate an order here in which the terms are + returned. In fact, the default implementation will return + them in unpredictable order. + + Example : @terms = $obj->get_all_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_all_terms{ + my $self = shift; + return grep { my $ont = $_->ontology; + # the first condition is a superset of the second, but + # we add it here for efficiency reasons, as many times + # it will short-cut to true and is supposedly faster than + # string comparison + ($ont == $self) || ($ont->name eq $self->name); + } $self->engine->get_all_terms(@_); +} + +=head2 find_terms + + Title : find_terms + Usage : ($term) = $oe->find_terms(-identifier => "SO:0000263"); + Function: Find term instances matching queries for their attributes. + + An implementation may not support querying for arbitrary + attributes, but can generally be expected to accept + -identifier and -name as queries. If both are provided, + they are implicitly intersected. + + Example : + Returns : an array of zero or more Bio::Ontology::TermI objects + Args : Named parameters. The following parameters should be recognized + by any implementations: + + -identifier query by the given identifier + -name query by the given name + + +=cut + +sub find_terms{ + my $self = shift; + return grep { $_->ontology->name eq $self->name; + } $self->engine->find_terms(@_); +} + +=head1 Factory for relationships and terms + +=cut + +=head2 relationship_factory + + Title : relationship_factory + Usage : $fact = $obj->relationship_factory() + Function: Get (and set, if the engine supports it) the object + factory to be used when relationship objects are created by + the implementation on-the-fly. + + Example : + Returns : value of relationship_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : + + +=cut + +sub relationship_factory{ + return shift->engine->relationship_factory(@_); +} + +=head2 term_factory + + Title : term_factory + Usage : $fact = $obj->term_factory() + Function: Get (and set, if the engine supports it) the object + factory to be used when term objects are created by + the implementation on-the-fly. + + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : + + +=cut + +sub term_factory{ + return shift->engine->term_factory(@_); +} + + +################################################################# +# aliases +################################################################# + +*get_relationship_types = \&get_predicate_terms; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/OntologyEngineI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/OntologyEngineI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,413 @@ +# $Id: OntologyEngineI.pm,v 1.2.2.3 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for OntologyEngineI +# +# Cared for by Peter Dimitrov +# +# (c) Peter Dimitrov +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +OntologyEngineI - Interface a minimal Ontology implementation should satisfy + +=head1 SYNOPSIS + + # see documentation of methods + +=head1 DESCRIPTION + +This describes the minimal interface an ontology query engine should +provide. It intentionally doesn't make explicit references to the +ontology being a DAG, nor does it mandate that the ontology be a +vocabulary. Rather, it tries to generically express what should be +accessible (queriable) about an ontology. + +The idea is to allow for different implementations for different +purposes, which may then differ as to which operations are efficient +and which aren't, and how much richer the functionality is on top of +this minimalistic set of methods. Check modules in the Bio::Ontology +namespace to find out which implementations exist. At the time of +writing, there is a SimpleOntologyEngine (which does not use +Graph.pm), and a Graph.pm-based implementation in SimpleGOEngine. + +Ontology parsers in Bio::OntologyIO are required to return an +implementation of this interface. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::OntologyEngineI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 add_term + + Title : add_term + Usage : add_term(TermI term): TermI + Function: Adds TermI object to the ontology engine term store + Example : $oe->add_term($term) + Returns : its argument. + Args : object of class TermI. + + +=cut + +sub add_term{ + shift->throw_not_implemented(); +} + +=head2 add_relationship + + Title : add_relationship + Usage : add_relationship(RelationshipI relationship): RelationshipI + Function: Adds a relationship object to the ontology engine. + Example : + Returns : Its argument. + Args : A RelationshipI object. + + +=cut + +sub add_relationship{ + shift->throw_not_implemented(); +} + +=head2 get_relationships + + Title : get_relationships + Usage : get_relationships(TermI term): RelationshipI[] + Function: Retrieves all relationship objects from this ontology engine, + or all relationships of a term if a term is supplied. + Example : + Returns : Array of Bio::Ontology::RelationshipI objects + Args : None, or a Bio::Ontology::TermI compliant object for which + to retrieve the relationships. + + +=cut + +sub get_relationships{ + shift->throw_not_implemented(); +} + +=head2 get_predicate_terms + + Title : get_predicate_terms + Usage : get_predicate_terms(): TermI[] + Function: + Example : + Returns : + Args : + + +=cut + +sub get_predicate_terms{ + shift->throw_not_implemented(); +} + +=head2 get_child_terms + + Title : get_child_terms + Usage : get_child_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all child terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_child_terms is a special + case of get_descendant_terms, limiting the search to the + direct descendants. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_child_terms{ + shift->throw_not_implemented(); +} + +=head2 get_descendant_terms + + Title : get_descendant_terms + Usage : get_descendant_terms(TermI term, TermI[] rel_types): TermI[] + Function: Retrieves all descendant terms of a given term, that + satisfy a relationship among those that are specified in + the second argument or undef otherwise. + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_descendant_terms{ + shift->throw_not_implemented(); +} + +=head2 get_parent_terms + + Title : get_parent_terms + Usage : get_parent_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all parent terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_parent_terms is a special + case of get_ancestor_terms, limiting the search to the + direct ancestors. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_parent_terms{ + shift->throw_not_implemented(); +} + +=head2 get_ancestor_terms + + Title : get_ancestor_terms + Usage : get_ancestor_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all ancestor terms of a given term, that satisfy + a relationship among those that are specified in the second + argument or undef otherwise. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_ancestor_terms{ + shift->throw_not_implemented(); +} + +=head2 get_leaf_terms + + Title : get_leaf_terms + Usage : get_leaf_terms(): TermI[] + Function: Retrieves all leaf terms from the ontology. Leaf term is a + term w/o descendants. + + Example : @leaf_terms = $obj->get_leaf_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_leaf_terms{ + shift->throw_not_implemented(); +} + +=head2 get_root_terms + + Title : get_root_terms + Usage : get_root_terms(): TermI[] + Function: Retrieves all root terms from the ontology. Root term is a + term w/o ancestors. + + Example : @root_terms = $obj->get_root_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_root_terms{ + shift->throw_not_implemented(); +} + +=head1 Factory for relationships and terms + +=cut + +=head2 relationship_factory + + Title : relationship_factory + Usage : $fact = $obj->relationship_factory() + Function: Get (and set, if the implementation supports it) the object + factory to be used when relationship objects are created by + the implementation on-the-fly. + + Example : + Returns : value of relationship_factory (a Bio::Factory::ObjectFactory + compliant object) + Args : + + +=cut + +sub relationship_factory{ + return shift->throw_not_implemented(); +} + +=head2 term_factory + + Title : term_factory + Usage : $fact = $obj->term_factory() + Function: Get (and set, if the implementation supports it) the object + factory to be used when term objects are created by + the implementation on-the-fly. + + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactory + compliant object) + Args : + + +=cut + +sub term_factory{ + return shift->throw_not_implemented(); +} + +=head1 Decorator Methods + + These methods come with a default implementation that uses the + abstract methods defined for this interface. This may not be very + efficient, and hence implementors are encouraged to override these + methods if they can provide more efficient implementations. + +=cut + +=head2 get_all_terms + + Title : get_all_terms + Usage : get_all_terms: TermI[] + Function: Retrieves all terms from the ontology. + + This is more a decorator method. We provide a default + implementation here that loops over all root terms and gets + all descendants for each root term. The overall union of + terms is then made unique by name and ontology. + + We do not mandate an order here in which the terms are + returned. In fact, the default implementation will return + them in unpredictable order. + + Engine implementations that can provide a more efficient + method for obtaining all terms should definitely override + this. + + Example : @terms = $obj->get_all_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_all_terms{ + my $self = shift; + # get all root nodes + my @roots = $self->get_root_terms(); + # accumulate all descendants for each root term + my @terms = map { $self->get_descendant_terms($_); } @roots; + # add on the root terms themselves + push(@terms, @roots); + # make unique by name and ontology + my %name_map = map { ($_->name."@".$_->ontology->name, $_); } @terms; + # done + return values %name_map; +} + +=head2 find_terms + + Title : find_terms + Usage : ($term) = $oe->find_terms(-identifier => "SO:0000263"); + Function: Find term instances matching queries for their attributes. + + An implementation may not support querying for arbitrary + attributes, but can generally be expected to accept + -identifier and -name as queries. If both are provided, + they are implicitly intersected. + + Example : + Returns : an array of zero or more Bio::Ontology::TermI objects + Args : Named parameters. The following parameters should be recognized + by any implementation: + + -identifier query by the given identifier + -name query by the given name + + +=cut + +sub find_terms{ + my $self = shift; + my %params = @_; + @params{ map { lc $_; } keys %params } = values %params; # lowercase keys + + my @terms = grep { + my $ok = exists($params{-identifier}) ? + $_->identifier() eq $params{-identifier} : 1; + $ok && ((! exists($params{-name})) || + ($_->name() eq $params{-name})); + } $self->get_all_terms(); + return @terms; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/OntologyI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/OntologyI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,435 @@ +# $Id: OntologyI.pm,v 1.2.2.4 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Bio::Ontology::OntologyI +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Ontology::OntologyI - Interface for an ontology implementation + +=head1 SYNOPSIS + + # see method documentation + +=head1 DESCRIPTION + +This describes the minimal interface an ontology implementation must +provide. In essence, it represents a namespace with description on top +of the query interface OntologyEngineI. + +This interface inherits from L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::OntologyI; +use vars qw(@ISA); +use strict; + +use Bio::Ontology::OntologyEngineI; + +@ISA = qw( Bio::Ontology::OntologyEngineI ); + +=head1 Methods defined in this interface. + +=cut + +=head2 name + + Title : name + Usage : $obj->name($newval) + Function: Get/set the name of this ontology. + Example : + Returns : value of name (a scalar) + Args : + + +=cut + +sub name{ + shift->throw_not_implemented(); +} + +=head2 authority + + Title : authority + Usage : $auth = $obj->authority() + Function: Get/set the authority for this ontology, for instance the + DNS base for the organization granting the name of the + ontology and identifiers for the terms. + + This attribute is optional and should not generally + expected by applications to have been set. It is here to + follow the rules for namespaces, which ontologies serve as + for terms. + + Example : + Returns : value of authority (a scalar) + Args : + + +=cut + +sub authority{ + shift->throw_not_implemented(); +} + +=head2 identifier + + Title : identifier + Usage : $id = $obj->identifier() + Function: Get an identifier for this ontology. + + This is primarily intended for look-up purposes. Clients + should not expect the value to be modifiable, and it may + not be allowed to set its value from outside. Also, the + identifier's uniqueness may only hold within the scope of a + particular application's run time, i.e., it may be a memory + location. + + Example : + Returns : value of identifier (a scalar) + Args : + + +=cut + +sub identifier{ + shift->throw_not_implemented(); +} + +=head2 definition + + Title : definition + Usage : $def = $obj->definition() + Function: Get a descriptive definition for this ontology. + Example : + Returns : value of definition (a scalar) + Args : + + +=cut + +sub definition{ + shift->throw_not_implemented(); +} + +=head2 close + + Title : close + Usage : + Function: Release any resources this ontology may occupy. In order + to efficiently release used memory or file handles, you + should call this method once you are finished with an + ontology. + + Example : + Returns : TRUE on success and FALSE otherwise + Args : none + + +=cut + +sub close{ + shift->throw_not_implemented(); +} + +=head1 Methods inherited from L + +Their documentations are copied here for completeness. In most use +cases, you will want to access the query methods of an ontology, not +just the name and description ... + +=cut + +=head2 add_term + + Title : add_term + Usage : add_term(TermI term): TermI + Function: Adds TermI object to the ontology engine term store. + + For ease of use, if the ontology property of the term + object was not set, an implementation is encouraged to set + it to itself upon adding the term. + + Example : $oe->add_term($term) + Returns : its argument. + Args : object of class TermI. + + +=cut + +=head2 add_relationship + + Title : add_relationship + Usage : add_relationship(RelationshipI relationship): RelationshipI + Function: Adds a relationship object to the ontology engine. + Example : + Returns : Its argument. + Args : A RelationshipI object. + + +=cut + +=head2 get_relationships + + Title : get_relationships + Usage : get_relationships(TermI term): RelationshipI[] + Function: Retrieves all relationship objects from this ontology engine, + or all relationships of a term if a term is supplied. + Example : + Returns : Array of Bio::Ontology::RelationshipI objects + Args : None, or a Bio::Ontology::TermI compliant object for which + to retrieve the relationships. + + +=cut + +=head2 get_predicate_terms + + Title : get_predicate_terms + Usage : get_predicate_terms(): TermI[] + Function: + Example : + Returns : + Args : + + +=cut + +=head2 get_child_terms + + Title : get_child_terms + Usage : get_child_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all child terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_child_terms is a special + case of get_descendant_terms, limiting the search to the + direct descendants. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +=head2 get_descendant_terms + + Title : get_descendant_terms + Usage : get_descendant_terms(TermI term, TermI[] rel_types): TermI[] + Function: Retrieves all descendant terms of a given term, that + satisfy a relationship among those that are specified in + the second argument or undef otherwise. + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +=head2 get_parent_terms + + Title : get_parent_terms + Usage : get_parent_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all parent terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_parent_terms is a special + case of get_ancestor_terms, limiting the search to the + direct ancestors. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +=head2 get_ancestor_terms + + Title : get_ancestor_terms + Usage : get_ancestor_terms(TermI term, TermI[] predicate_terms): TermI[] + Function: Retrieves all ancestor terms of a given term, that satisfy + a relationship among those that are specified in the second + argument or undef otherwise. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +=head2 get_leaf_terms + + Title : get_leaf_terms + Usage : get_leaf_terms(): TermI[] + Function: Retrieves all leaf terms from the ontology. Leaf term is a + term w/o descendants. + + Example : @leaf_terms = $obj->get_leaf_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +=head2 get_root_terms() + + Title : get_root_terms + Usage : get_root_terms(): TermI[] + Function: Retrieves all root terms from the ontology. Root term is a + term w/o descendants. + + Example : @root_terms = $obj->get_root_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +=head2 get_all_terms + + Title : get_all_terms + Usage : get_all_terms: TermI[] + Function: Retrieves all terms from the ontology. + + We do not mandate an order here in which the terms are + returned. In fact, the default implementation will return + them in unpredictable order. + + Example : @terms = $obj->get_all_terms() + Returns : Array of TermI objects. + Args : + + +=cut + + +=head2 find_terms + + Title : find_terms + Usage : ($term) = $oe->find_terms(-identifier => "SO:0000263"); + Function: Find term instances matching queries for their attributes. + + An implementation may not support querying for arbitrary + attributes, but can generally be expected to accept + -identifier and -name as queries. If both are provided, + they are implicitly intersected. + + Example : + Returns : an array of zero or more Bio::Ontology::TermI objects + Args : Named parameters. The following parameters should be recognized + by any implementation: + + -identifier query by the given identifier + -name query by the given name + + +=cut + +=head1 Factory for relationships and terms + +=cut + +=head2 relationship_factory + + Title : relationship_factory + Usage : $fact = $obj->relationship_factory() + Function: Get (and set, if the implementation supports it) the object + factory to be used when relationship objects are created by + the implementation on-the-fly. + + Example : + Returns : value of relationship_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : + + +=cut + +sub relationship_factory{ + return shift->throw_not_implemented(); +} + +=head2 term_factory + + Title : term_factory + Usage : $fact = $obj->term_factory() + Function: Get (and set, if the implementation supports it) the object + factory to be used when term objects are created by + the implementation on-the-fly. + + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : + + +=cut + +sub term_factory{ + return shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/OntologyStore.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/OntologyStore.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,260 @@ +# $Id: OntologyStore.pm,v 1.1.2.2 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Bio::Ontology::OntologyStore +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Ontology::OntologyStore - A repository of ontologies + +=head1 SYNOPSIS + + # see documentation of methods + +=head1 DESCRIPTION + +The primary purpose of this module is that of a singleton repository +of L instances from which an Ontology +instance can be retrieved by name or identifier. This enables TermI +implementations to return their corresponding OntologyI through using +this singleton store instead of storing a direct reference to the +Ontology object. The latter would almost inevitably lead to memory +cycles, and would therefore potentially blow up an application. + +As a user of Ontology objects and Term objects you almost certainly +will not need to deal with this module. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::OntologyStore; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; + + +@ISA = qw(Bio::Root::Root ); + +# these are the static ontology stores by name and by identifier - there is +# only one of each in any application +my %ont_store_by_name = (); +my %ont_store_by_id = (); +# also, this is really meant as a singleton object, so we try to enforce it +my $instance = undef; + +=head2 new + + Title : new + Usage : my $obj = new Bio::Ontology::OntologyStore(); + Function: Returns the Bio::Ontology::OntologyStore object. + + Unlike usual implementations of new, this implementation + will try to return a previously instantiated store, if + there is any. It is just a synonym for get_instance. In + order to avoid ambiguities in your code, you may rather + want to call rather get_instance explicitly, which also + usually is better associated with this kind of behaviour. + + Returns : an instance of Bio::Ontology::OntologyStore + Args : + + +=cut + +sub new { + return shift->get_instance(@_); +} + +=head2 get_instance + + Title : get_instance + Usage : + Function: Get an instance of this class for perusal. + + Since by design this class is meant to be used as a + singleton, the implementation will return a previously + instantianted store if there is one, and instantiate a new + one otherwise. In order to use this class by means of an + instance, call this method for added code clarity, not + new(). + + Example : + Returns : an instance of this class + Args : named parameters, if any (currently, there are no + class-specific parameters other than those accepted by + L. + + +=cut + +sub get_instance{ + my ($self,@args) = @_; + + if(! $instance) { + $instance = $self->SUPER::new(@args); + } + return $instance; +} + +=head2 get_ontology + + Title : get_ontology + Usage : + Function: Get a previously instantiated and registered instance of + this class by name or by identifier. + + One of the main purposes of this class is to enable TermI + implementations to return their respective ontology without + keeping a strong reference to the respective ontology + object. Only objects previously registered objects can be + retrieved. + + This is a class method, hence you can call it on the class + name, without dereferencing an object. + + Example : + Returns : a L implementing object, or undef + if the query could not be satisfied + Args : Named parameters specifying the query. The following parameters + are recognized: + -name query the store for an ontology with the given name + -id query for an ontology with the given identifier + If both are specified, an implicit AND logical operator is + assumed. + +=cut + +sub get_ontology{ + my ($self,@args) = @_; + my $ont; + + my ($name,$id) = $self->_rearrange([qw(NAME ID)], @args); + if($id) { + $ont = $ont_store_by_id{$id}; + return unless $ont; # no AND can be satisfied in this case + } + if($name) { + my $o = $ont_store_by_name{$name}; + if((! $ont) || ($ont->identifier() eq $o->identifier())) { + $ont = $o; + } else { + $ont = undef; + } + } + return $ont; +} + +=head2 register_ontology + + Title : register_ontology + Usage : + Function: Registers the given Ontology object for later retrieval + by name and identifier. + + Example : + Returns : TRUE on success and FALSE otherwise + Args : the L object(s) to register + + +=cut + +sub register_ontology{ + my ($self,@args) = @_; + my $ret = 1; + + foreach my $ont (@args) { + if(! (ref($ont) && $ont->isa("Bio::Ontology::OntologyI"))) { + $self->throw((ref($ont) ? ref($ont) : $ont)." does not implement ". + "Bio::Ontology::OntologyI or is not an object"); + } + if($self->get_ontology(-name => $ont->name())) { + $self->warn("ontology with name \"".$ont->name(). + "\" already exists in the store, ignoring new one"); + $ret = 0; + next; + } + if($self->get_ontology(-id => $ont->identifier())) { + $self->warn("ontology with id \"".$ont->identifier(). + "\" already exists in the store, ignoring new one"); + $ret = 0; + next; + } + $ont_store_by_name{$ont->name()} = $ont; + $ont_store_by_id{$ont->identifier()} = $ont; + } + return $ret; +} + +=head2 remove_ontology + + Title : remove_ontology + Usage : + Function: Remove the specified ontology from the store. + Example : + Returns : TRUE on success and FALSE otherwise + Args : the L implementing object(s) + to be removed from the store + + +=cut + +sub remove_ontology{ + my $self = shift; + my $ret = 1; + + foreach my $ont (@_) { + $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI") + unless $ont && ref($ont) && $ont->isa("Bio::Ontology::OntologyI"); + # remove it from both the id hash and the name hash + delete $ont_store_by_id{$ont->identifier()}; + delete $ont_store_by_name{$ont->name()} if $ont->name(); + } + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/Path.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/Path.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,194 @@ +# $Id: Path.pm,v 1.1.2.2 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Path +# +# Cared for by Hilmar Lapp +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Path - a path for an ontology term graph + +=head1 SYNOPSIS + + $path = Bio::Ontology::Path->new( -identifier => "16847", + -subject_term => $subj, + -object_term => $obj, + -predicate_term => $pred, + -distance => 3 ); + +=head1 DESCRIPTION + +This is a basic implementation of Bio::Ontology::PathI. + +Essiantially this is a very thin extension of +L. It basically adds a method distance(). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + + Hilmar Lapp + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::Path; +use vars qw( @ISA ); +use strict; +use Bio::Ontology::PathI; +use Bio::Ontology::Relationship; + +@ISA = qw( Bio::Ontology::Relationship + Bio::Ontology::PathI ); + + + + +=head2 new + + Title : new + Usage : $rel = Bio::Ontology::Path->new(-identifier => "16847", + -subject_term => $subject, + -object_term => $object, + -predicate_term => $type ); + -distance => 3 ); + Function: Creates a new Bio::Ontology::Path. + Returns : A new Bio::Ontology::Path object. + Args : -identifier => the identifier of this relationship [scalar] + -subject_term => the subject term [Bio::Ontology::TermI] + -object_term => the object term [Bio::Ontology::TermI] + -predicate_term => the predicate term [Bio::Ontology::TermI] + -distance => the distance between subject and object + +=cut + +sub new { + + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $distance ) = + $self->_rearrange( [qw( DISTANCE) + ], @args ); + + $distance && $self->distance($distance); + + return $self; + +} # new + + + +=head2 init + + Title : init() + Usage : $rel->init(); + Function: Initializes this Path to all undef. + Returns : + Args : + +=cut + +sub init { + my $self = shift; + + $self->SUPER::init(@_); + $self->{ "_distance" } = undef; + +} # init + + +=head2 distance + + Title : distance + Usage : $obj->distance($newval) + Function: Get/set the distance between the two terms connected + by this path. + + Note that modifying the distance may not be meaningful. The + implementation here is not connected to any graph engine, + so changing an existing value may simply render the + attribute's value wrong. + + Example : + Returns : value of distance (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub distance{ + my $self = shift; + + return $self->{'_distance'} = shift if @_; + return $self->{'_distance'}; +} + +=head2 to_string + + Title : to_string() + Usage : print $rel->to_string(); + Function: to_string method for Path. + Returns : A string representation of this Path. + Args : + +=cut + +sub to_string { + my( $self ) = @_; + + my $s = $self->SUPER::to_string(); + $s .= "-- Distance:\n"; + $s .= $self->distance() if defined($self->distance()); + $s .= "\n"; + + return $s; + +} # to_string + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/PathI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/PathI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,170 @@ +# $Id: PathI.pm,v 1.1.2.2 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for PathI +# +# Cared for by Hilmar Lapp +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +PathI - Interface for a path between ontology terms + +=head1 SYNOPSIS + + # see documentation of methods and an implementation, e.g., + # Bio::Ontology::Path + +=head1 DESCRIPTION + +This is the minimal interface for a path between two terms in +an ontology. Ontology engines may use this. + +Essentially this is a very thin extension of the +L interface. It basically adds an +attribute distance(). For a RelationshipI, you can think of distance as +equal to zero (subject == object) or 1 (subject != object). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::PathI; +use vars qw(@ISA); +use strict; +use Bio::Ontology::RelationshipI; + +@ISA = qw( Bio::Ontology::RelationshipI ); + + +=head2 distance + + Title : distance + Usage : $obj->distance($newval) + Function: Get (and set if the implementation allows it) the distance + between the two terms connected by this path. + + Example : + Returns : value of distance (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub distance{ + return shift->throw_not_implemented(); +} + +=head1 Bio::Ontology::RelationshipI Methods + +=cut + +=head2 subject_term + + Title : subject_term + Usage : $subj = $rel->subject_term(); + Function: Set/get for the subject term of this Relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The subject term [Bio::Ontology::TermI]. + Args : + +=cut + +=head2 object_term + + Title : object_term + Usage : $object = $rel->object_term(); + Function: Set/get for the object term of this Relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The object term [Bio::Ontology::TermI]. + Args : + +=cut + +=head2 predicate_term + + Title : predicate_term + Usage : $type = $rel->predicate_term(); + Function: Set/get for the predicate of this relationship. + + For a path the predicate (relationship type) is defined as + the greatest common denominator of all predicates + (relationship types) encountered along the path. I.e., if + predicate A is-a predicate B, the greatest common + denominator for a path containing both predicates A and B is B + + Returns : The predicate term [Bio::Ontology::TermI]. + Args : + +=cut + +=head2 ontology + + Title : ontology + Usage : $ont = $obj->ontology() + Function: Get the ontology that defined this relationship. + Example : + Returns : an object implementing L + Args : + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/Relationship.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/Relationship.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,405 @@ +# $Id: Relationship.pm,v 1.4.2.3 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Relationship +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Relationship - a relationship for an ontology + +=head1 SYNOPSIS + + $rel = Bio::Ontology::Relationship->new( -identifier => "16847", + -subject_term => $subj, + -object_term => $obj, + -predicate_term => $pred ); + +=head1 DESCRIPTION + +This is a basic implementation of Bio::Ontology::RelationshipI. + +The terminology we use here is the one commonly used for ontologies, +namely the triple of (subject, predicate, object), which in addition +is scoped in a namespace (ontology). It is called triple because it is +a tuple of three ontology terms. + +There are other terminologies in use for expressing relationships. For +those who it helps to better understand the concept, the triple of +(child, relationship type, parent) would be equivalent to the +terminology chosen here, disregarding the question whether the notion +of parent and child is sensible in the context of the relationship +type or not. Especially in the case of ontologies with a wide variety +of predicates the parent/child terminology and similar ones can +quickly become ambiguous (e.g., A synthesises B), meaningless (e.g., A +binds B), or even conflicting (e.g., A is-parent-of B), and are +therefore strongly discouraged. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 CONTRIBUTORS + + Hilmar Lapp, email: hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::Relationship; +use vars qw( @ISA ); +use strict; +use Bio::Root::Root; +use Bio::Ontology::TermI; +use Bio::Ontology::RelationshipI; + +@ISA = qw( Bio::Root::Root + Bio::Ontology::RelationshipI ); + + + + +=head2 new + + Title : new + Usage : $rel = Bio::Ontology::Relationship->new(-identifier => "16847", + -subject_term => $subject, + -object_term => $object, + -predicate_term => $type ); + Function: Creates a new Bio::Ontology::Relationship. + Returns : A new Bio::Ontology::Relationship object. + Args : -identifier => the identifier of this relationship [scalar] + -subject_term => the subject term [Bio::Ontology::TermI] + -object_term => the object term [Bio::Ontology::TermI] + -predicate_term => the predicate term [Bio::Ontology::TermI] + +=cut + +sub new { + + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $identifier, + $subject_term, + $child, # for backwards compatibility + $object_term, + $parent, # for backwards compatibility + $predicate_term, + $reltype, # for backwards compatibility + $ont) + = $self->_rearrange( [qw( IDENTIFIER + SUBJECT_TERM + CHILD_TERM + OBJECT_TERM + PARENT_TERM + PREDICATE_TERM + RELATIONSHIP_TYPE + ONTOLOGY) + ], @args ); + + $self->init(); + + $self->identifier( $identifier ); + $subject_term = $child unless $subject_term; + $object_term = $parent unless $object_term; + $predicate_term = $reltype unless $predicate_term; + $self->subject_term( $subject_term) if $subject_term; + $self->object_term( $object_term) if $object_term; + $self->predicate_term( $predicate_term ) if $predicate_term; + $self->ontology($ont) if $ont; + + return $self; + +} # new + + + +=head2 init + + Title : init() + Usage : $rel->init(); + Function: Initializes this Relationship to all undef. + Returns : + Args : + +=cut + +sub init { + my( $self ) = @_; + + $self->{ "_identifier" } = undef; + $self->{ "_subject_term" } = undef; + $self->{ "_object_term" } = undef; + $self->{ "_predicate_term" } = undef; + $self->ontology(undef); + +} # init + + + +=head2 identifier + + Title : identifier + Usage : $rel->identifier( "100050" ); + or + print $rel->identifier(); + Function: Set/get for the identifier of this Relationship. + Returns : The identifier [scalar]. + Args : The identifier [scalar] (optional). + +=cut + +sub identifier { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_identifier" } = $value; + } + + return $self->{ "_identifier" }; +} # identifier + + + + +=head2 subject_term + + Title : subject_term + Usage : $rel->subject_term( $subject ); + or + $subject = $rel->subject_term(); + Function: Set/get for the subject term of this Relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The subject term [Bio::Ontology::TermI]. + Args : The subject term [Bio::Ontology::TermI] (optional). + +=cut + +sub subject_term { + my ( $self, $term ) = @_; + + if ( defined $term ) { + $self->_check_class( $term, "Bio::Ontology::TermI" ); + $self->{ "_subject_term" } = $term; + } + + return $self->{ "_subject_term" }; + +} # subject_term + + + +=head2 object_term + + Title : object_term + Usage : $rel->object_term( $object ); + or + $object = $rel->object_term(); + Function: Set/get for the object term of this Relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The object term [Bio::Ontology::TermI]. + Args : The object term [Bio::Ontology::TermI] (optional). + +=cut + +sub object_term { + my ( $self, $term ) = @_; + + if ( defined $term ) { + $self->_check_class( $term, "Bio::Ontology::TermI" ); + $self->{ "_object_term" } = $term; + } + + return $self->{ "_object_term" }; +} + + + +=head2 predicate_term + + Title : predicate_term + Usage : $rel->predicate_term( $type ); + or + $type = $rel->predicate_term(); + Function: Set/get for the predicate (relationship type) of this + relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The predicate term [Bio::Ontology::TermI]. + Args : The predicate term [Bio::Ontology::TermI] (optional). + +=cut + +sub predicate_term { + my ( $self, $term ) = @_; + + if ( defined $term ) { + $self->_check_class( $term, "Bio::Ontology::TermI" ); + $self->{ "_predicate_term" } = $term; + } + + return $self->{ "_predicate_term" }; +} + + +=head2 ontology + + Title : ontology + Usage : $ont = $obj->ontology() + Function: Get/set the ontology that defined this relationship. + Example : + Returns : an object implementing L + Args : on set, undef or an object implementing + L (optional) + + +=cut + +sub ontology{ + my $self = shift; + my $ont; + + if(@_) { + $ont = shift; + if($ont) { + $ont = Bio::Ontology::Ontology->new(-name => $ont) if ! ref($ont); + if(! $ont->isa("Bio::Ontology::OntologyI")) { + $self->throw(ref($ont)." does not implement ". + "Bio::Ontology::OntologyI. Bummer."); + } + } + return $self->{"_ontology"} = $ont; + } + return $self->{"_ontology"}; +} + +=head2 to_string + + Title : to_string() + Usage : print $rel->to_string(); + Function: to_string method for Relationship. + Returns : A string representation of this Relationship. + Args : + +=cut + +sub to_string { + my( $self ) = @_; + + local $^W = 0; + + my $s = ""; + + $s .= "-- Identifier:\n"; + $s .= $self->identifier()."\n"; + $s .= "-- Subject Term Identifier:\n"; + $s .= $self->subject_term()->identifier()."\n"; + $s .= "-- Object Term Identifier:\n"; + $s .= $self->object_term()->identifier()."\n"; + $s .= "-- Relationship Type Identifier:\n"; + $s .= $self->predicate_term()->identifier(); + + return $s; + +} # to_string + + + +sub _check_class { + my ( $self, $value, $expected_class ) = @_; + + if ( ! defined( $value ) ) { + $self->throw( "Found [undef] where [$expected_class] expected" ); + } + elsif ( ! ref( $value ) ) { + $self->throw( "Found [scalar] where [$expected_class] expected" ); + } + elsif ( ! $value->isa( $expected_class ) ) { + $self->throw( "Found [" . ref( $value ) . "] where [$expected_class] expected" ); + } + +} # _check_type + +################################################################# +# aliases for backwards compatibility +################################################################# + +=head1 Deprecated Methods + + These methods are deprecated and defined here solely to preserve + backwards compatibility. + +=cut + +*child_term = \&subject_term; +*parent_term = \&object_term; +*relationship_type = \&predicate_term; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/RelationshipFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/RelationshipFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,115 @@ +# $Id: RelationshipFactory.pm,v 1.1.2.1 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Bio::Ontology::RelationshipFactory +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Ontology::RelationshipFactory - Instantiates a new Bio::Ontology::RelationshipI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Ontology::RelationshipFactory; + + # the default type is Bio::Ontology::Relationship + my $factory = new Bio::Ontology::RelationshipFactory(-type => 'Bio::Ontology::GOterm'); + my $clu = $factory->create_object(-name => 'peroxisome', + -ontology => 'Gene Ontology', + -identifier => 'GO:0005777'); + + +=head1 DESCRIPTION + +This object will build L objects generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::RelationshipFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactory; + +@ISA = qw(Bio::Factory::ObjectFactory); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Ontology::RelationshipFactory(); + Function: Builds a new Bio::Ontology::RelationshipFactory object + Returns : Bio::Ontology::RelationshipFactory + Args : -type => string, name of a L + derived class. + The default is L. + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + # make sure this matches our requirements + $self->interface("Bio::Ontology::RelationshipI"); + $self->type($self->type() || "Bio::Ontology::Relationship"); + + return $self; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/RelationshipI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/RelationshipI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,191 @@ +# $Id: RelationshipI.pm,v 1.2.2.2 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for RelationshipI +# +# Cared for by Peter Dimitrov +# +# (c) Peter Dimitrov +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +RelationshipI - Interface for a relationship between ontology terms + +=head1 SYNOPSIS + + # see documentation of methods and an implementation, e.g., + # Bio::Ontology::Relationship + +=head1 DESCRIPTION + +This is the minimal interface for a relationship between two terms in +an ontology. Ontology engines will use this. + +The terminology we use here is the one commonly used for ontologies, +namely the triple of (subject, predicate, object), which in addition +is scoped in a namespace (ontology). It is called triple because it is +a tuple of three ontology terms. + +There are other terminologies in use for expressing relationships. For +those who it helps to better understand the concept, the triple of +(child, relationship type, parent) would be equivalent to the +terminology chosen here, disregarding the question whether the notion +of parent and child is sensible in the context of the relationship +type or not. Especially in the case of ontologies with a wide variety +of predicates the parent/child terminology and similar ones can +quickly become ambiguous (e.g., A synthesises B), meaningless (e.g., A +binds B), or even conflicting (e.g., A is-parent-of B), and are +therefore strongly discouraged. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +=head1 CONTRIBUTORS + + Hilmar Lapp, email: hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::RelationshipI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 identifier + + Title : identifier + Usage : print $rel->identifier(); + Function: Set/get for the identifier of this Relationship. + + Note that this may not necessarily be used by a particular + ontology. + + Returns : The identifier [scalar]. + Args : + +=cut + +sub identifier{ + shift->throw_not_implemented(); +} + +=head2 subject_term + + Title : subject_term + Usage : $subj = $rel->subject_term(); + Function: Set/get for the subject term of this Relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The subject term [Bio::Ontology::TermI]. + Args : + +=cut + +sub subject_term{ + shift->throw_not_implemented(); +} + +=head2 object_term + + Title : object_term + Usage : $object = $rel->object_term(); + Function: Set/get for the object term of this Relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The object term [Bio::Ontology::TermI]. + Args : + +=cut + +sub object_term{ + shift->throw_not_implemented(); +} + +=head2 predicate_term + + Title : predicate_term + Usage : $type = $rel->predicate_term(); + Function: Set/get for the relationship type of this relationship. + + The common convention for ontologies is to express + relationships between terms as triples (subject, predicate, + object). + + Returns : The relationship type [Bio::Ontology::TermI]. + Args : + +=cut + +sub predicate_term{ + shift->throw_not_implemented(); +} + +=head2 ontology + + Title : ontology + Usage : $ont = $obj->ontology() + Function: Get the ontology that defined (is the scope for) this + relationship. + Example : + Returns : an object implementing L + Args : + + +=cut + +sub ontology{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/RelationshipType.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/RelationshipType.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,438 @@ +# $Id: RelationshipType.pm,v 1.5.2.5 2003/09/08 12:16:19 heikki Exp $ +# +# BioPerl module for Bio::Ontology::RelationshipType +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +RelationshipType - a relationship type for an ontology + +=head1 SYNOPSIS + + # + +=head1 DESCRIPTION + +This class can be used to model various types of relationships +(such as "IS_A", "PART_OF", "CONTAINS", "FOUND_IN"). + +This class extends L, so it essentially is-a +L. In addition, all methods are overridden such +as to make the object immutable. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Ontology::RelationshipType; +use vars qw( @ISA ); +use strict; +use Bio::Ontology::Term; + + +use constant PART_OF => "PART_OF"; +use constant IS_A => "IS_A"; +use constant CONTAINS => "CONTAINS"; +use constant FOUND_IN => "FOUND_IN"; + + +@ISA = qw( Bio::Ontology::Term ); + + +# +# cache for terms +# +my %term_name_map = (); + + +=head2 get_instance + + Title : get_instance + Usage : $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" ); + $CONTAINS = Bio::Ontology::RelationshipType->get_instance( "CONTAINS" ); + $FOUND_IN = Bio::Ontology::RelationshipType->get_instance( "FOUND_IN" ); + Function: Factory method to create instances of RelationshipType + Returns : [Bio::Ontology::RelationshipType] + Args : "IS_A" or "PART_OF" or "CONTAINS" or "FOUND_IN" [scalar] + the ontology [Bio::Ontology::OntologyI] (optional) + +=cut + +sub get_instance { + my ( $class, $name, $ont ) = @_; + + $class->throw("must provide predicate name") unless $name; + + # is one in the cache? + my $reltype = $term_name_map{$name}; + + if($reltype && + # check whether ontologies match + (($ont && $reltype->ontology() && + ($ont->name() eq $reltype->ontology->name())) || + (! ($reltype->ontology() || $ont)))) { + # we're done, return cached type + return $reltype; + } + # valid relationship type? + if ( ! (($name eq IS_A) || ($name eq PART_OF) || + ($name eq CONTAINS) || ( $name eq FOUND_IN ))) { + my $msg = "Found unknown type of relationship: [" . $name . "]\n"; + $msg .= "Known types are: [" . IS_A . "], [" . PART_OF . "], [" . CONTAINS . "], [" . FOUND_IN . "]"; + $class->throw( $msg ); + } + # if we get here we need to create the rel.type + $reltype = $class->new(-name => $name, + -ontology => $ont); + # cache it (FIXME possibly overrides one from another ontology) + $term_name_map{$name} = $reltype; + return $reltype; +} # get_instance + + + +=head2 init + + Title : init() + Usage : $type->init(); + Function: Initializes this to all undef and empty lists. + Returns : + Args : + +=cut + +sub init { + my $self = shift; + + $self->SUPER::init(); + + # at this point we don't really need to do anything special for us +} # init + + + +=head2 equals + + Title : equals + Usage : if ( $type->equals( $other_type ) ) { ... + Function: Compares this type to another one, based on string "eq" of + the "identifier" field, if at least one of the two types has + the identifier set, or string eq of the name otherwise. + Returns : true or false + Args : [Bio::Ontology::RelationshipType] + +=cut + +sub equals { + my( $self, $type ) = @_; + + $self->_check_class( $type, "Bio::Ontology::RelationshipType" ); + + if ( $self->identifier() xor $type->identifier() ) { + $self->warn("comparing relationship types when only ". + "one has an identifier will always return false" ); + } + + return + ($self->identifier() || $type->identifier()) ? + $self->identifier() eq $type->identifier() : + $self->name() eq $type->name(); + +} # equals + + + +=head2 identifier + + Title : identifier + Usage : $term->identifier( "IS_A" ); + or + print $term->identifier(); + Function: Set/get for the immutable identifier of this Type. + Returns : The identifier [scalar]. + Args : The identifier [scalar] (optional). + +=cut + +sub identifier { + my $self = shift; + my $ret = $self->SUPER::identifier(); + if(@_) { + $self->throw($self->veto_change("identifier",$ret,$_[0])) + if $ret && ($ret ne $_[0]); + $ret = $self->SUPER::identifier(@_); + } + return $ret; +} # identifier + + + + +=head2 name + + Title : name + Usage : $term->name( "is a type" ); + or + print $term->name(); + Function: Set/get for the immutable name of this Type. + Returns : The name [scalar]. + Args : The name [scalar] (optional). + +=cut + +sub name { + my $self = shift; + my $ret = $self->SUPER::name(); + if(@_) { + $self->throw($self->veto_change("name",$ret,$_[0])) + if $ret && ($ret ne $_[0]); + $ret = $self->SUPER::name(@_); + } + return $ret; +} # name + + + + + +=head2 definition + + Title : definition + Usage : $term->definition( "" ); + or + print $term->definition(); + Function: Set/get for the immutable definition of this Type. + Returns : The definition [scalar]. + Args : The definition [scalar] (optional). + +=cut + +sub definition { + my $self = shift; + my $ret = $self->SUPER::definition(); + if(@_) { + $self->veto_change("definition",$ret,$_[0]) + if $ret && ($ret ne $_[0]); + $ret = $self->SUPER::definition(@_); + } + # let's be nice and return something readable here + return $ret if $ret; + return $self->name()." relationship predicate (type)" if $self->name(); +} # definition + + + +=head2 ontology + + Title : ontology + Usage : $term->ontology( $top ); + or + $top = $term->ontology(); + Function: Set/get for the ontology this relationship type lives in. + Returns : The ontology [Bio::Ontology::OntologyI]. + Args : On set, the ontology [Bio::Ontology::OntologyI] (optional). + +=cut + +sub ontology { + my $self = shift; + my $ret = $self->SUPER::ontology(); + if(@_) { + my $ont = shift; + if($ret) { + $self->throw($self->veto_change("ontology",$ret->name, + $ont ? $ont->name : $ont)) + unless $ont && ($ont->name() eq $ret->name()); + } + $ret = $self->SUPER::ontology($ont,@_); + } + return $ret; +} # category + + + +=head2 version + + Title : version + Usage : $term->version( "1.00" ); + or + print $term->version(); + Function: Set/get for immutable version information. + Returns : The version [scalar]. + Args : The version [scalar] (optional). + +=cut + +sub version { + my $self = shift; + my $ret = $self->SUPER::version(); + if(@_) { + $self->throw($self->veto_change("version",$ret,$_[0])) + if $ret && ($ret ne $_[0]); + $ret = $self->SUPER::version(@_); + } + return $ret; +} # version + + + +=head2 is_obsolete + + Title : is_obsolete + Usage : $term->is_obsolete( 1 ); + or + if ( $term->is_obsolete() ) + Function: Set/get for the immutable obsoleteness of this Type. + Returns : the obsoleteness [0 or 1]. + Args : the obsoleteness [0 or 1] (optional). + +=cut + +sub is_obsolete { + my $self = shift; + my $ret = $self->SUPER::is_obsolete(); + if(@_) { + $self->throw($self->veto_change("is_obsolete",$ret,$_[0])) + if $ret && ($ret != $_[0]); + $ret = $self->SUPER::is_obsolete(@_); + } + return $ret; +} # is_obsolete + + + + + +=head2 comment + + Title : comment + Usage : $term->comment( "..." ); + or + print $term->comment(); + Function: Set/get for an arbitrary immutable comment about this Type. + Returns : A comment. + Args : A comment (optional). + +=cut + +sub comment { + my $self = shift; + my $ret = $self->SUPER::comment(); + if(@_) { + $self->throw($self->veto_change("comment",$ret,$_[0])) + if $ret && ($ret ne $_[0]); + $ret = $self->SUPER::comment(@_); + } + return $ret; +} # comment + +=head1 Private methods + +May be overridden in a derived class, but should +never be called from outside. + +=cut + +sub _check_class { + my ( $self, $value, $expected_class ) = @_; + + if ( ! defined( $value ) ) { + $self->throw( "Found [undef] where [$expected_class] expected" ); + } + elsif ( ! ref( $value ) ) { + $self->throw( "Found [scalar] where [$expected_class] expected" ); + } + elsif ( ! $value->isa( $expected_class ) ) { + $self->throw( "Found [" . ref( $value ) . "] where [$expected_class] expected" ); + } + +} # _check_type + +=head2 veto_change + + Title : veto_change + Usage : + Function: Called if an attribute is changed. Setting an attribute is + considered a change if it had a value before and the attempt + to set it would change the value. + + This method returns the message to be printed in the exception. + + Example : + Returns : A string + Args : The name of the attribute that was attempted to change. + Optionally, the old value and the new value for reporting + purposes only. + + +=cut + +sub veto_change{ + my ($self,$attr,$old,$new) = @_; + + my $changetype = $old ? ($new ? "change" : "unset") : "change"; + my $msg = "attempt to $changetype attribute $attr in ".ref($self). + ", which is immutable"; + $msg .= " (\"$old\" to \"$new\")" if $old && $new; + return $msg; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/SimpleGOEngine.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/SimpleGOEngine.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,888 @@ +# $Id: SimpleGOEngine.pm,v 1.3.2.6 2003/06/30 05:04:06 lapp Exp $ +# +# BioPerl module for Bio::Ontology::SimpleGOEngine +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +SimpleGOEngine - a Ontology Engine for GO implementing OntologyEngineI + +=head1 SYNOPSIS + + use Bio::Ontology::SimpleGOEngine; + + my $parser = Bio::Ontology::SimpleGOEngine->new + ( -defs_file => "/home/czmasek/GO/GO.defs", + -files => ["/home/czmasek/GO/component.ontology", + "/home/czmasek/GO/function.ontology", + "/home/czmasek/GO/process.ontology"] ); + + my $engine = $parser->parse(); + + my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" ); + + +=head1 DESCRIPTION + +Needs Graph.pm from CPAN. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + + +package Bio::Ontology::SimpleGOEngine; + +use Graph::Directed; + +use vars qw( @ISA ); +use strict; +use Bio::Root::Root; +use Bio::Ontology::RelationshipType; +use Bio::Ontology::RelationshipFactory; +use Bio::Ontology::OntologyEngineI; + +use constant TRUE => 1; +use constant FALSE => 0; +use constant IS_A => "IS_A"; +use constant PART_OF => "PART_OF"; +use constant TERM => "TERM"; +use constant TYPE => "TYPE"; +use constant ONTOLOGY => "ONTOLOGY"; + +@ISA = qw( Bio::Root::Root + Bio::Ontology::OntologyEngineI ); + + + +=head2 new + + Title : new + Usage : $engine = Bio::Ontology::SimpleGOEngine->new() + Function: Creates a new SimpleGOEngine + Returns : A new SimpleGOEngine object + Args : + +=cut + +sub new { + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + $self->init(); + + return $self; +} # new + + + +=head2 init + + Title : init() + Usage : $engine->init(); + Function: Initializes this Engine. + Returns : + Args : + +=cut + +sub init { + my ( $self ) = @_; + + $self->{ "_is_a_relationship" } = Bio::Ontology::RelationshipType->get_instance( IS_A ); + $self->{ "_part_of_relationship" } = Bio::Ontology::RelationshipType->get_instance( PART_OF ); + + $self->graph( Graph::Directed->new() ); + + # set defaults for the factories + $self->relationship_factory(Bio::Ontology::RelationshipFactory->new( + -type => "Bio::Ontology::Relationship")); + +} # init + + + +=head2 is_a_relationship + + Title : is_a_relationship() + Usage : $IS_A = $engine->is_a_relationship(); + Function: Returns a Bio::Ontology::RelationshipType object for "is-a" + relationships + Returns : Bio::Ontology::RelationshipType set to "IS_A" + Args : + +=cut + +sub is_a_relationship { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->throw( "Attempted to change immutable field" ); + } + + return $self->{ "_is_a_relationship" }; +} # is_a_relationship + + + +=head2 part_of_relationship + + Title : part_of_relationship() + Usage : $PART_OF = $engine->part_of_relationship(); + Function: Returns a Bio::Ontology::RelationshipType object for "part-of" + relationships + Returns : Bio::Ontology::RelationshipType set to "PART_OF" + Args : + +=cut + +sub part_of_relationship { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->throw( "Attempted to change immutable field" ); + } + + return $self->{ "_part_of_relationship" }; +} # part_of_relationship + + + + +=head2 add_term + + Title : add_term + Usage : $engine->add_term( $term_obj ); + Function: Adds a Bio::Ontology::TermI to this engine + Returns : true if the term was added and false otherwise (e.g., if the + term already existed in the ontology engine) + Args : Bio::Ontology::TermI + + +=cut + +sub add_term { + my ( $self, $term ) = @_; + + return FALSE if $self->has_term( $term ); + + my $goid = $self->_get_id($term); + + $self->graph()->add_vertex( $goid ); + $self->graph()->set_attribute( TERM, $goid, $term ); + + return TRUE; + +} # add_term + + + +=head2 has_term + + Title : has_term + Usage : $engine->has_term( $term ); + Function: Checks whether this engine contains a particular term + Returns : true or false + Args : Bio::Ontology::TermI + or + erm identifier (e.g. "GO:0012345") + + +=cut + +sub has_term { + my ( $self, $term ) = @_; + $term = $self->_get_id( $term ); + if ( $self->graph()->has_vertex( $term ) ) { + return TRUE; + } + else { + return FALSE; + } + +} # has_term + + + +=head2 add_relationship + + Title : add_relationship + Usage : $engine->add_relationship( $relationship ); + $engine->add_relatioship( $subject_term, $predicate_term, $object_term, $ontology ); + $engine->add_relatioship( $subject_id, $predicate_id, $object_id, $ontology); + Function: Adds a relationship to this engine + Returns : true if successfully added, false otherwise + Args : term id, Bio::Ontology::TermI (rel.type), term id, ontology + or + Bio::Ontology::TermI, Bio::Ontology::TermI (rel.type), Bio::Ontology::TermI, ontology + or + Bio::Ontology::RelationshipI + +=cut + +# term objs or term ids +sub add_relationship { + my ( $self, $child, $type, $parent, $ont ) = @_; + + if ( scalar( @_ ) == 2 ) { + $self->_check_class( $child, "Bio::Ontology::RelationshipI" ); + $type = $child->predicate_term(); + $parent = $child->object_term(); + $ont = $child->ontology(); + $child = $child->subject_term(); + } + + + $self->_check_class( $type, "Bio::Ontology::TermI" ); + + my $parentid = $self->_get_id( $parent ); + my $childid = $self->_get_id( $child ); + + my $g = $self->graph(); + + $self->add_term($child) unless $g->has_vertex( $childid ); + $self->add_term($parent) unless $g->has_vertex( $parentid ); + + # This prevents multi graphs. + if ( $g->has_edge( $parentid, $childid ) ) { + return FALSE; + } + + $g->add_edge( $parentid, $childid ); + $g->set_attribute( TYPE, $parentid, $childid, $type ); + $g->set_attribute( ONTOLOGY, $parentid, $childid, $ont ); + + return TRUE; + +} # add_relationship + + + + +=head2 get_relationships + + + Title : get_relationships + Usage : $engine->get_relationships( $term ); + Function: Returns all relationships of a term, or all relationships in + the graph if no term is specified. + Returns : Relationship[] + Args : term id + or + Bio::Ontology::TermI + +=cut + +sub get_relationships { + my ( $self, $term ) = @_; + + my $g = $self->graph(); + + # obtain the ID if term provided + my $termid; + if($term) { + $termid = $self->_get_id( $term ); + # check for presence in the graph + if ( ! $g->has_vertex( $termid ) ) { + $self->throw( "no term with identifier \"$termid\" in ontology" ); + } + } + + # now build the relationships + my $relfact = $self->relationship_factory(); + # we'll build the relationships from edges + my @rels = (); + my @edges = $g->edges($termid); + while(@edges) { + my $startid = shift(@edges); + my $endid = shift(@edges); + my $rel = $relfact->create_object( + -subject_term => $self->get_terms($endid), + -object_term => $self->get_terms($startid), + -predicate_term => $g->get_attribute(TYPE, + $startid, $endid), + -ontology => $g->get_attribute(ONTOLOGY, + $startid, $endid)); + push( @rels, $rel ); + } + + return @rels; + +} # get_relationships + +=head2 get_all_relationships + + + Title : get_all_relationships + Usage : @rels = $engine->get_all_relationships(); + Function: Returns all relationships in the graph. + Returns : Relationship[] + Args : + +=cut + +sub get_all_relationships { + return shift->get_relationships(@_); +} # get_all_relationships + + + +=head2 get_predicate_terms + + Title : get_predicate_terms + Usage : $engine->get_predicate_terms(); + Function: Returns the types of relationships this engine contains + Returns : Bio::Ontology::RelationshipType[] + Args : + + +=cut + +sub get_predicate_terms { + my ( $self ) = @_; + + my @a = ( $self->is_a_relationship(), + $self->part_of_relationship() ); + + return @a; +} # get_predicate_terms + + + + +=head2 get_child_terms + + Title : get_child_terms + Usage : $engine->get_child_terms( $term_obj, @rel_types ); + $engine->get_child_terms( $term_id, @rel_types ); + Function: Returns the children of this term + Returns : Bio::Ontology::TermI[] + Args : Bio::Ontology::TermI, Bio::Ontology::RelationshipType[] + or + term id, Bio::Ontology::RelationshipType[] + + if NO Bio::Ontology::RelationshipType[] is indicated: children + of ALL types are returned + +=cut + +sub get_child_terms { + my ( $self, $term, @types ) = @_; + + return $self->_get_child_parent_terms_helper( $term, TRUE, @types ); + +} # get_child_terms + + +=head2 get_descendant_terms + + Title : get_descendant_terms + Usage : $engine->get_descendant_terms( $term_obj, @rel_types ); + $engine->get_descendant_terms( $term_id, @rel_types ); + Function: Returns the descendants of this term + Returns : Bio::Ontology::TermI[] + Args : Bio::Ontology::TermI, Bio::Ontology::RelationshipType[] + or + term id, Bio::Ontology::RelationshipType[] + + if NO Bio::Ontology::RelationshipType[] is indicated: descendants + of ALL types are returned + +=cut + +sub get_descendant_terms { + my ( $self, $term, @types ) = @_; + + my %ids = (); + my @ids = (); + + $term = $self->_get_id( $term ); + + if ( ! $self->graph()->has_vertex( $term ) ) { + $self->throw( "Ontology does not contain a term with an identifier of \"$term\"" ); + } + + $self->_get_descendant_terms_helper( $term, \%ids, \@types ); + + while( ( my $id ) = each ( %ids ) ) { + push( @ids, $id ); + } + + return $self->get_terms( @ids ); + +} # get_descendant_terms + + + + +=head2 get_parent_terms + + Title : get_parent_terms + Usage : $engine->get_parent_terms( $term_obj, @rel_types ); + $engine->get_parent_terms( $term_id, @rel_types ); + Function: Returns the parents of this term + Returns : Bio::Ontology::TermI[] + Args : Bio::Ontology::TermI, Bio::Ontology::RelationshipType[] + or + term id, Bio::Ontology::RelationshipType[] + + if NO Bio::Ontology::RelationshipType[] is indicated: parents + of ALL types are returned + +=cut + +sub get_parent_terms { + my ( $self, $term, @types ) = @_; + + return $self->_get_child_parent_terms_helper( $term, FALSE, @types ); + +} # get_parent_terms + + + +=head2 get_ancestor_terms + + Title : get_ancestor_terms + Usage : $engine->get_ancestor_terms( $term_obj, @rel_types ); + $engine->get_ancestor_terms( $term_id, @rel_types ); + Function: Returns the ancestors of this term + Returns : Bio::Ontology::TermI[] + Args : Bio::Ontology::TermI, Bio::Ontology::RelationshipType[] + or + term id, Bio::Ontology::RelationshipType[] + + if NO Bio::Ontology::RelationshipType[] is indicated: ancestors + of ALL types are returned + +=cut + +sub get_ancestor_terms { + my ( $self, $term, @types ) = @_; + + my %ids = (); + my @ids = (); + + $term = $self->_get_id( $term ); + + if ( ! $self->graph()->has_vertex( $term ) ) { + $self->throw( "Ontology does not contain a term with an identifier of \"$term\"" ); + } + + $self->_get_ancestor_terms_helper( $term, \%ids, \@types ); + + while( ( my $id ) = each ( %ids ) ) { + push( @ids, $id ); + } + + return $self->get_terms( @ids ); + +} # get_ancestor_terms + + + + + +=head2 get_leaf_terms + + Title : get_leaf_terms + Usage : $engine->get_leaf_terms(); + Function: Returns the leaf terms + Returns : Bio::Ontology::TermI[] + Args : + +=cut + +sub get_leaf_terms { + my ( $self ) = @_; + + my @a = $self->graph()->sink_vertices(); + + return $self->get_terms( @a ); + +} + + + +=head2 get_root_terms() + + Title : get_root_terms + Usage : $engine->get_root_terms(); + Function: Returns the root terms + Returns : Bio::Ontology::TermI[] + Args : + +=cut + +sub get_root_terms { + my ( $self ) = @_; + + + my @a = $self->graph()->source_vertices(); + + return $self->get_terms( @a ); + +} + + +=head2 get_terms + + Title : get_terms + Usage : @terms = $engine->get_terms( "GO:1234567", "GO:2234567" ); + Function: Returns term objects with given identifiers + Returns : Bio::Ontology::TermI[], or the term corresponding to the + first identifier if called in scalar context + Args : term ids[] + + +=cut + +sub get_terms { + my ( $self, @ids ) = @_; + + my @terms = (); + + foreach my $id ( @ids ) { + if ( $self->graph()->has_vertex( $id ) ) { + push( @terms, $self->graph()->get_attribute( TERM, $id ) ); + } + } + + return wantarray ? @terms : shift(@terms); + +} # get_terms + + +=head2 get_all_terms + + Title : get_all_terms + Usage : $engine->get_all_terms(); + Function: Returns all terms in this engine + Returns : Bio::Ontology::TermI[] + Args : + +=cut + +sub get_all_terms { + my ( $self ) = @_; + + return( $self->get_terms( $self->graph()->vertices() ) ); + +} # get_all_terms + + +=head2 find_terms + + Title : find_terms + Usage : ($term) = $oe->find_terms(-identifier => "SO:0000263"); + Function: Find term instances matching queries for their attributes. + + This implementation can efficiently resolve queries by + identifier. + + Example : + Returns : an array of zero or more Bio::Ontology::TermI objects + Args : Named parameters. The following parameters should be recognized + by any implementations: + + -identifier query by the given identifier + -name query by the given name + + +=cut + +sub find_terms{ + my ($self,@args) = @_; + my @terms; + + my ($id,$name) = $self->_rearrange([qw(IDENTIFIER NAME)],@args); + + if(defined($id)) { + @terms = $self->get_terms($id); + } else { + @terms = $self->get_all_terms(); + } + if(defined($name)) { + @terms = grep { $_->name() eq $name; } @terms; + } + return @terms; +} + +=head2 relationship_factory + + Title : relationship_factory + Usage : $fact = $obj->relationship_factory() + Function: Get/set the object factory to be used when relationship + objects are created by the implementation on-the-fly. + + Example : + Returns : value of relationship_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : on set, a Bio::Factory::ObjectFactoryI compliant object + + +=cut + +sub relationship_factory{ + my $self = shift; + + return $self->{'relationship_factory'} = shift if @_; + return $self->{'relationship_factory'}; +} + +=head2 term_factory + + Title : term_factory + Usage : $fact = $obj->term_factory() + Function: Get/set the object factory to be used when term objects are + created by the implementation on-the-fly. + + Note that this ontology engine implementation does not + create term objects on the fly, and therefore setting this + attribute is meaningless. + + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : on set, a Bio::Factory::ObjectFactoryI compliant object + + +=cut + +sub term_factory{ + my $self = shift; + + if(@_) { + $self->warn("setting term factory, but ".ref($self). + " does not create terms on-the-fly"); + return $self->{'term_factory'} = shift; + } + return $self->{'term_factory'}; +} + +=head2 graph + + Title : graph() + Usage : $engine->graph(); + Function: Returns the Graph this engine is based on + Returns : Graph + Args : + +=cut + +sub graph { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->_check_class( $value, "Graph::Directed" ); + $self->{ "_graph" } = $value; + } + + return $self->{ "_graph" }; +} # graph + + + +# Internal methods +# ---------------- + + +# Checks the correct format of a GOBO-formatted id +# Gets the id out of a term or id string +sub _get_id { + my ( $self, $term ) = @_; + + if(ref($term)) { + return $term->GO_id() if $term->isa("Bio::Ontology::GOterm"); + # if not a GOterm, use standard API + $self->throw("Object doesn't implement Bio::Ontology::TermI. ". + "Bummer.") + unless $term->isa("Bio::Ontology::TermI"); + $term = $term->identifier(); + } + # don't fuss if it looks remotely standard + return $term if $term =~ /^[A-Z]{1,8}:\d{3,}$/; + # prefix with something if only numbers + if($term =~ /^\d+$/) { + $self->warn(ref($self).": identifier [$term] is only numbers - ". + "prefixing with 'GO:'"); + return "GO:" . $term; + } + # we shouldn't have gotten here if it's at least a remotely decent ID + $self->warn(ref($self). + ": Are you sure '$term' is a valid identifier? ". + "If you see problems, this may be the cause."); + return $term; +} # _get_id + + +# Helper for getting children and parent terms +sub _get_child_parent_terms_helper { + my ( $self, $term, $do_get_child_terms, @types ) = @_; + + foreach my $type ( @types ) { + $self->_check_class( $type, "Bio::Ontology::TermI" ); + } + + my @relative_terms = (); + + $term = $self->_get_id( $term ); + if ( ! $self->graph()->has_vertex( $term ) ) { + $self->throw( "Ontology does not contain a term with an identifier of \"$term\"" ); + } + + my @all_relative_terms = (); + if ( $do_get_child_terms ) { + @all_relative_terms = $self->graph()->successors( $term ); + } + else { + @all_relative_terms = $self->graph()->predecessors( $term ); + } + + foreach my $relative ( @all_relative_terms ) { + if ( scalar( @types ) > 0 ) { + foreach my $type ( @types ) { + my $relative_type; + if ( $do_get_child_terms ) { + $relative_type = $self->graph()->get_attribute( TYPE, $term, $relative ); + } + else { + $relative_type = $self->graph()->get_attribute( TYPE, $relative, $term ); + } + if ( $relative_type->equals( $type ) ) { + push( @relative_terms, $relative ); + } + } + } + else { + push( @relative_terms, $relative ); + } + } + + return $self->get_terms( @relative_terms ); + +} # get_child_terms + + +# Recursive helper +sub _get_descendant_terms_helper { + my ( $self, $term, $ids_ref, $types_ref ) = @_; + + my @child_terms = $self->get_child_terms( $term, @$types_ref ); + + if ( scalar( @child_terms ) < 1 ) { + return; + } + + foreach my $child_term ( @child_terms ) { + my $child_term_id = $child_term->identifier(); + $ids_ref->{ $child_term_id } = 0; + $self->_get_descendant_terms_helper( $child_term_id, $ids_ref, $types_ref ); + } + +} # _get_descendant_terms_helper + + +# Recursive helper +sub _get_ancestor_terms_helper { + my ( $self, $term, $ids_ref, $types_ref ) = @_; + + my @parent_terms = $self->get_parent_terms( $term, @$types_ref ); + + if ( scalar( @parent_terms ) < 1 ) { + return; + } + + foreach my $parent_term ( @parent_terms ) { + my $parent_term_id = $parent_term->identifier(); + $ids_ref->{ $parent_term_id } = 0; + $self->_get_ancestor_terms_helper( $parent_term_id, $ids_ref, $types_ref ); + } + +} # get_ancestor_terms_helper + + + +sub _check_class { + my ( $self, $value, $expected_class ) = @_; + + if ( ! defined( $value ) ) { + $self->throw( "Found [undef] where [$expected_class] expected" ); + } + elsif ( ! ref( $value ) ) { + $self->throw( "Found [scalar] where [$expected_class] expected" ); + } + elsif ( ! $value->isa( $expected_class ) ) { + $self->throw( "Found [" . ref( $value ) . "] where [$expected_class] expected" ); + } + +} # _check_class + + +################################################################# +# aliases +################################################################# + +*get_relationship_types = \&get_predicate_terms; + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/SimpleOntologyEngine.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/SimpleOntologyEngine.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1050 @@ +# $Id: SimpleOntologyEngine.pm,v 1.3.2.5 2003/07/03 00:41:40 lapp Exp $ +# +# BioPerl module for SimpleOntologyEngine +# +# Cared for by Peter Dimitrov +# +# Copyright Peter Dimitrov +# (c) Peter Dimitrov, dimitrov@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# POD documentation - main docs before the code + +=head1 NAME + +SimpleOntologyEngine - Implementation of OntologyEngineI interface + +=head1 SYNOPSIS + + my $soe = Bio::Ontology::SimpleOntologyEngine->new; + + +=head1 DESCRIPTION + +This is a "simple" implementation of Bio::Ontology::OntologyEngineI. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +=head1 CONTRIBUTORS + +Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::SimpleOntologyEngine; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::Root; +use Bio::Ontology::RelationshipFactory; +use Bio::Ontology::OntologyEngineI; +use Data::Dumper; + +@ISA = qw( Bio::Root::Root Bio::Ontology::OntologyEngineI ); + +=head2 new + + Title : new + Usage : $soe = Bio::Ontology::SimpleOntologyEngine->new; + Function: Initializes the ontology engine. + Example : $soe = Bio::Ontology::SimpleOntologyEngine->new; + Returns : Object of class SimpleOntologyEngine. + Args : + + +=cut + +sub new{ + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); +# my %param = @args; + + $self->_term_store( {} ); + $self->_relationship_store( {} ); + $self->_inverted_relationship_store( {} ); + $self->_relationship_type_store( {} ); + $self->_instantiated_terms_store( {} ); + + # set defaults for the factories + $self->relationship_factory(Bio::Ontology::RelationshipFactory->new( + -type => "Bio::Ontology::Relationship")); + return $self; +} + +=head2 _instantiated_terms_store + + Title : _instantiated_terms_store + Usage : $obj->_instantiated_terms_store($newval) + Function: + Example : + Returns : hash + Args : empty hash + + +=cut + +sub _instantiated_terms_store{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_instantiated_terms_store'} = $value; + } + return $self->{'_instantiated_terms_store'}; +} + +=head2 mark_instantiated + + Title : mark_instantiated + Usage : $self->mark_instantiated(TermI[] terms): TermI[] + Function: Marks TermI objects as fully instantiated, + allowing for proper counting of the number of terms in the term store. +The TermI objects has to be already stored in the term store in order to + be marked. + Example : $self->mark_instantiated($term); + Returns : its argument or throws an exception if a term is not + in the term store. + Args : array of objects of class TermI. + +=cut + +sub mark_instantiated{ + my ($self, @terms) = @_; + + foreach my $term (@terms) { + $self->throw( "term ".$term->identifier." not in the term store\n" ) + if !defined $self->_term_store->{$term->identifier}; + $self->_instantiated_terms_store->{$term->identifier} = 1; + } + + return @terms; +} + +=head2 mark_uninstantiated + + Title : mark_uninstantiated + Usage : $self->mark_uninstantiated(TermI[] terms): TermI[] + Function: Marks TermI objects as not fully instantiated, + Example : $self->mark_uninstantiated($term); + Returns : its argument or throws an exception if a term is not + in the term store(if the term is not marked it does nothing). + Args : array of objects of class TermI. + + +=cut + +sub mark_uninstantiated{ + my ($self, @terms) = @_; + + foreach my $term (@terms) { + $self->throw( "term ".$term->identifier." not in the term store\n" ) + if !defined $self->_term_store->{$term->identifier}; + delete $self->_instantiated_terms_store->{$term->identifier} + if defined $self->_instantiated_terms_store->{$term->identifier}; + } + + return @terms; +} + +=head2 _term_store + + Title : term_store + Usage : $obj->_term_store($newval) + Function: + Example : + Returns : reference to an array of Bio::Ontology::TermI objects + Args : reference to an array of Bio::Ontology::TermI objects + +=cut + +sub _term_store{ + my ($self, $value) = @_; + + if( defined $value) { + if ( defined $self->{'_term_store'}) { + $self->throw("_term_store already defined\n"); + } + else { + $self->{'_term_store'} = $value; + } + } + + return $self->{'_term_store'}; +} + +=head2 add_term + + Title : add_term + Usage : add_term(TermI term): TermI + Function: Adds TermI object to the ontology engine term store. + Marks the term fully instantiated by default. + Example : $soe->add_term($term) + Returns : its argument. + Args : object of class TermI. + +=cut + +sub add_term{ + my ($self, $term) = @_; + my $term_store = $self->_term_store; + + if ( defined $term_store -> {$term->identifier}) { + $self->throw( "term ".$term->identifier." already defined\n" ); + } + else { + $term_store->{$term->identifier} = $term; + $self->_instantiated_terms_store->{$term->identifier} = 1; + } + + return $term; +} + +=head2 get_term_by_identifier + + Title : get_term_by_identifier + Usage : get_term_by_identifier(String[] id): TermI[] + Function: Retrieves terms from the term store by their identifier + field, or undef if not there. + Example : $term = $soe->get_term_by_identifier("IPR000001"); + Returns : An array of zero or more Bio::Ontology::TermI objects. + Args : An array of identifier strings + + +=cut + +sub get_term_by_identifier{ + my ($self, @ids) = @_; + my @ans = (); + + foreach my $id (@ids) { + my $term = $self->_term_store->{$id}; + push @ans, $term if defined $term; + } + + return @ans; +} + +=head2 _get_number_rels + + Title : get_number_rels + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _get_number_rels{ + my ($self) = @_; + my $num_rels = 0; + + foreach my $entry ($self->_relationship_store) { + $num_rels += scalar keys %$entry; + } + return $num_rels; +} + +=head2 _get_number_terms + + Title : _get_number_terms + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _get_number_terms{ + my ($self) = @_; + + return scalar $self->_filter_unmarked( values %{$self->_term_store} ); + +} + +=head2 _relationship_store + + Title : _storerelationship_store + Usage : $obj->relationship_store($newval) + Function: + Example : + Returns : reference to an array of Bio::Ontology::TermI objects + Args : reference to an array of Bio::Ontology::TermI objects + + +=cut + +sub _relationship_store{ + my ($self, $value) = @_; + + if( defined $value) { + if ( defined $self->{'_relationship_store'}) { + $self->throw("_relationship_store already defined\n"); + } + else { + $self->{'_relationship_store'} = $value; + } + } + + return $self->{'_relationship_store'}; +} + +=head2 _inverted_relationship_store + + Title : _inverted_relationship_store + Usage : + Function: + Example : + Returns : reference to an array of Bio::Ontology::TermI objects + Args : reference to an array of Bio::Ontology::TermI objects + + +=cut + +sub _inverted_relationship_store{ + my ($self, $value) = @_; + + if( defined $value) { + if ( defined $self->{'_inverted_relationship_store'}) { + $self->throw("_inverted_relationship_store already defined\n"); + } + else { + $self->{'_inverted_relationship_store'} = $value; + } + } + + return $self->{'_inverted_relationship_store'}; +} + +=head2 _relationship_type_store + + Title : _relationship_type_store + Usage : $obj->_relationship_type_store($newval) + Function: + Example : + Returns : reference to an array of Bio::Ontology::RelationshipType objects + Args : reference to an array of Bio::Ontology::RelationshipType objects + + +=cut + +sub _relationship_type_store{ + my ($self, $value) = @_; + + if( defined $value) { + if ( defined $self->{'_relationship_type_store'}) { + $self->throw("_relationship_type_store already defined\n"); + } + else { + $self->{'_relationship_type_store'} = $value; + } + } + + return $self->{'_relationship_type_store'}; +} + +=head2 _add_relationship_simple + + Title : _add_relationship_simple + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _add_relationship_simple{ + my ($self, $store, $rel, $inverted) = @_; + my $parent_id; + my $child_id; + + if ($inverted) { + $parent_id = $rel->subject_term->identifier; + $child_id = $rel->object_term->identifier; + } + else { + $parent_id = $rel->object_term->identifier; + $child_id = $rel->subject_term->identifier; + } + if((defined $store->{$parent_id}->{$child_id}) && + ($store->{$parent_id}->{$child_id}->name != $rel->predicate_term->name)){ + $self->throw("relationship ".Dumper($rel->predicate_term). + " between ".$parent_id." and ".$child_id. + " already defined as ". + Dumper($store->{$parent_id}->{$child_id})."\n"); + } + else { + $store->{$parent_id}->{$child_id} = $rel->predicate_term; + } +} + +=head2 add_relationship + + Title : add_relationship + Usage : add_relationship(RelationshipI relationship): RelationshipI + Function: Adds a relationship object to the ontology engine. + Example : + Returns : Its argument. + Args : A RelationshipI object. + + +=cut + +sub add_relationship{ + my ($self, $rel) = @_; + + $self->_add_relationship_simple($self->_relationship_store, + $rel, 0); + $self->_add_relationship_simple($self->_inverted_relationship_store, + $rel, 1); + $self->_relationship_type_store->{ + $self->_unique_termid($rel->predicate_term)} = $rel->predicate_term; + + return $rel; +} + +=head2 get_relationships + + Title : get_relationships + Usage : get_relationships(): RelationshipI[] + Function: Retrieves all relationship objects. + Example : + Returns : Array of RelationshipI objects + Args : + + +=cut + +sub get_relationships{ + my $self = shift; + my $term = shift; + my @rels; + my $store = $self->_relationship_store; + my $relfact = $self->relationship_factory(); + + my @parent_ids = $term ? + # if a term is supplied then only get the term's parents + (map { $_->identifier(); } $self->get_parent_terms($term)) : + # otherwise use all parent ids + (keys %{$store}); + # add the term as a parent too if one is supplied + push(@parent_ids,$term->identifier) if $term; + + foreach my $parent_id (@parent_ids) { + my $parent_entry = $store->{$parent_id}; + + # if a term is supplied, add a relationship for the parent to the term + # except if the parent is the term itself (we added that one before) + if($term && ($parent_id ne $term->identifier())) { + my $parent_term = $self->get_term_by_identifier($parent_id); + push(@rels, + $relfact->create_object(-object_term => $parent_term, + -subject_term => $term, + -predicate_term => + $parent_entry->{$term->identifier}, + -ontology => $term->ontology() + ) + ); + + } else { + # otherwise, i.e., no term supplied, or the parent equals the + # supplied term + my $parent_term = $term ? + $term : $self->get_term_by_identifier($parent_id); + foreach my $child_id (keys %$parent_entry) { + my $rel_info = $parent_entry->{$child_id}; + + push(@rels, + $relfact->create_object(-object_term => $parent_term, + -subject_term => + $self->get_term_by_identifier( + $child_id), + -predicate_term => $rel_info, + -ontology =>$parent_term->ontology + ) + ); + } + } + } + + return @rels; +} + +=head2 get_all_relationships + + Title : get_all_relationships + Usage : get_all_relationships(): RelationshipI[] + Function: Retrieves all relationship objects. + Example : + Returns : Array of RelationshipI objects + Args : + + +=cut + +sub get_all_relationships{ + return shift->get_relationships(); +} + +=head2 get_predicate_terms + + Title : get_predicate_terms + Usage : get_predicate_terms(): TermI[] + Function: Retrives all relationship types stored in the engine + Example : + Returns : reference to an array of Bio::Ontology::RelationshipType objects + Args : + + +=cut + +sub get_predicate_terms{ + my ($self) = @_; + + return values %{$self->_relationship_type_store}; +} + +=head2 _is_rel_type + + Title : _is_rel_type + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _is_rel_type{ + my ($self, $term, @rel_types) = @_; + + foreach my $rel_type (@rel_types) { + if($rel_type->identifier || $term->identifier) { + return 1 if $rel_type->identifier eq $term->identifier; + } else { + return 1 if $rel_type->name eq $term->name; + } + } + + return 0; +} + +=head2 _typed_traversal + + Title : _typed_traversal + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _typed_traversal{ + my ($self, $rel_store, $level, $term_id, @rel_types) = @_; + return undef if !defined($rel_store->{$term_id}); + my %parent_entry = %{$rel_store->{$term_id}}; + my @children = keys %parent_entry; + + my @ans; + + if (@rel_types > 0) { + @ans = (); + + foreach my $child_id (@children) { + push @ans, $child_id + if $self->_is_rel_type( $rel_store->{$term_id}->{$child_id}, + @rel_types); + } + } + else { + @ans = @children; + } + if ($level < 1) { + my @ans1 = (); + + foreach my $child_id (@ans) { + push @ans1, $self->_typed_traversal($rel_store, + $level - 1, $child_id, @rel_types) + if defined $rel_store->{$child_id}; + } + push @ans, @ans1; + } + + return @ans; +} + +=head2 get_child_terms + + Title : get_child_terms + Usage : get_child_terms(TermI term, TermI[] predicate_terms): TermI[] + get_child_terms(TermI term, RelationshipType[] predicate_terms): TermI[] + Function: Retrieves all child terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_child_terms is a special + case of get_descendant_terms, limiting the search to the + direct descendants. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list of relationship type terms. + + +=cut + +sub get_child_terms{ + my ($self, $term, @relationship_types) = @_; + + $self->throw("must provide TermI compliant object") + unless defined($term) && $term->isa("Bio::Ontology::TermI"); + + return $self->_filter_unmarked( + $self->get_term_by_identifier( + $self->_typed_traversal($self->_relationship_store, + 1, + $term->identifier, + @relationship_types) ) ); +} + +=head2 get_descendant_terms + + Title : get_descendant_terms + Usage : get_descendant_terms(TermI term, TermI[] rel_types): TermI[] + get_child_terms(TermI term, RelationshipType[] predicate_terms): TermI[] + Function: Retrieves all descendant terms of a given term, that + satisfy a relationship among those that are specified in + the second argument or undef otherwise. Uses + _typed_traversal to find all descendants. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list of relationship type terms. + + +=cut + +sub get_descendant_terms{ + my ($self, $term, @relationship_types) = @_; + + $self->throw("must provide TermI compliant object") + unless defined($term) && $term->isa("Bio::Ontology::TermI"); + + return $self->_filter_unmarked( + $self->_filter_repeated( + $self->get_term_by_identifier( + $self->_typed_traversal($self->_relationship_store, + 0, + $term->identifier, + @relationship_types) ) ) ); +} + +=head2 get_parent_terms + + Title : get_parent_terms + Usage : get_parent_terms(TermI term, TermI[] predicate_terms): TermI[] + get_child_terms(TermI term, RelationshipType[] predicate_terms): TermI[] + Function: Retrieves all parent terms of a given term, that satisfy a + relationship among those that are specified in the second + argument or undef otherwise. get_parent_terms is a special + case of get_ancestor_terms, limiting the search to the + direct ancestors. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list of relationship type terms. + + +=cut + +sub get_parent_terms{ + my ($self, $term, @relationship_types) = @_; + $self->throw("term must be a valid object, not undef") unless defined $term; + + return $self->_filter_unmarked( + $self->get_term_by_identifier( + $self->_typed_traversal($self->_inverted_relationship_store, + 1, + $term->identifier, + @relationship_types) ) ); +} + +=head2 get_ancestor_terms + + Title : get_ancestor_terms + Usage : get_ancestor_terms(TermI term, TermI[] predicate_terms): TermI[] + get_child_terms(TermI term, RelationshipType[] predicate_terms): TermI[] + Function: Retrieves all ancestor terms of a given term, that satisfy + a relationship among those that are specified in the second + argument or undef otherwise. Uses _typed_traversal to find + all ancestors. + + Example : + Returns : Array of TermI objects. + Args : First argument is the term of interest, second is the list + of relationship type terms. + + +=cut + +sub get_ancestor_terms{ + my ($self, $term, @relationship_types) = @_; + $self->throw("term must be a valid object, not undef") unless defined $term; + + return $self->_filter_unmarked( + $self->_filter_repeated( + $self->get_term_by_identifier( + $self->_typed_traversal($self->_inverted_relationship_store, + 0, + $term->identifier, + @relationship_types) ) ) ); +} + +=head2 get_leaf_terms + + Title : get_leaf_terms + Usage : get_leaf_terms(): TermI[] + Function: Retrieves all leaf terms from the ontology. Leaf term is a term w/o descendants. + Example : @leaf_terms = $obj->get_leaf_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_leaf_terms{ + my ($self) = @_; + my @leaf_terms; + + foreach my $term (values %{$self->_term_store}) { + push @leaf_terms, $term + if !defined $self->_relationship_store->{$term->identifier} && + defined $self->_instantiated_terms_store->{$term->identifier}; + } + + return @leaf_terms; +} + +=head2 get_root_terms + + Title : get_root_terms + Usage : get_root_terms(): TermI[] + Function: Retrieves all root terms from the ontology. Root term is a term w/o descendants. + Example : @root_terms = $obj->get_root_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_root_terms{ + my ($self) = @_; + my @root_terms; + + foreach my $term (values %{$self->_term_store}) { + push @root_terms, $term + if !defined $self->_inverted_relationship_store->{$term->identifier} && + defined $self->_instantiated_terms_store->{$term->identifier}; + } + + return @root_terms; +} + +=head2 _filter_repeated + + Title : _filter_repeated + Usage : @lst = $self->_filter_repeated(@old_lst); + Function: Removes repeated terms + Example : + Returns : List of unique TermI objects + Args : List of TermI objects + + +=cut + +sub _filter_repeated{ + my ($self, @args) = @_; + my %h; + + foreach my $element (@args) { + $h{$element->identifier} = $element if !defined $h{$element->identifier}; + } + + return values %h; +} + +=head2 get_all_terms + + Title : get_all_terms + Usage : get_all_terms(): TermI[] + Function: Retrieves all terms currently stored in the ontology. + Example : @all_terms = $obj->get_all_terms() + Returns : Array of TermI objects. + Args : + + +=cut + +sub get_all_terms{ + my ($self) = @_; + + return $self->_filter_unmarked( values %{$self->_term_store} ); +} + +=head2 find_terms + + Title : find_terms + Usage : ($term) = $oe->find_terms(-identifier => "SO:0000263"); + Function: Find term instances matching queries for their attributes. + + This implementation can efficiently resolve queries by + identifier. + + Example : + Returns : an array of zero or more Bio::Ontology::TermI objects + Args : Named parameters. The following parameters should be recognized + by any implementations: + + -identifier query by the given identifier + -name query by the given name + + +=cut + +sub find_terms{ + my ($self,@args) = @_; + my @terms; + + my ($id,$name) = $self->_rearrange([qw(IDENTIFIER NAME)],@args); + + if(defined($id)) { + @terms = $self->get_term_by_identifier($id); + } else { + @terms = $self->get_all_terms(); + } + if(defined($name)) { + @terms = grep { $_->name() eq $name; } @terms; + } + return @terms; +} + + +=head2 relationship_factory + + Title : relationship_factory + Usage : $fact = $obj->relationship_factory() + Function: Get/set the object factory to be used when relationship + objects are created by the implementation on-the-fly. + + Example : + Returns : value of relationship_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : on set, a Bio::Factory::ObjectFactoryI compliant object + + +=cut + +sub relationship_factory{ + my $self = shift; + + return $self->{'relationship_factory'} = shift if @_; + return $self->{'relationship_factory'}; +} + +=head2 term_factory + + Title : term_factory + Usage : $fact = $obj->term_factory() + Function: Get/set the object factory to be used when term objects are + created by the implementation on-the-fly. + + Note that this ontology engine implementation does not + create term objects on the fly, and therefore setting this + attribute is meaningless. + + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactoryI + compliant object) + Args : on set, a Bio::Factory::ObjectFactoryI compliant object + + +=cut + +sub term_factory{ + my $self = shift; + + if(@_) { + $self->warn("setting term factory, but ".ref($self). + " does not create terms on-the-fly"); + return $self->{'term_factory'} = shift; + } + return $self->{'term_factory'}; +} + +=head2 _filter_unmarked + + Title : _filter_unmarked + Usage : _filter_unmarked(TermI[] terms): TermI[] + Function: Removes the uninstantiated terms from the list of terms + Example : + Returns : array of fully instantiated TermI objects + Args : array of TermI objects + + +=cut + +sub _filter_unmarked{ + my ($self, @terms) = @_; + my @filtered_terms = (); + + if ( scalar(@terms) >= 1) { + foreach my $term (@terms) { + push @filtered_terms, $term + if defined $self->_instantiated_terms_store->{$term->identifier}; + } + } + + return @filtered_terms; +} + +=head2 remove_term_by_id + + Title : remove_term_by_id + Usage : remove_term_by_id(String id): TermI + Function: Removes TermI object from the ontology engine using the + string id as an identifier. Current implementation does not + enforce consistency of the relationships using that term. + Example : $term = $soe->remove_term_by_id($id); + Returns : Object of class TermI or undef if not found. + Args : The string identifier of a term. + + +=cut + +sub remove_term_by_id{ + my ($self, $id) = @_; + + if ( $self->get_term_by_identifier($id) ) { + my $term = $self->{_term_store}->{$id}; + delete $self->{_term_store}->{$id}; + return $term; + } + else { + $self->warn("Term with id '$id' is not in the term store"); + return undef; + } +} + +=head2 to_string + + Title : to_string + Usage : print $sv->to_string(); + Function: Currently returns formatted string containing the number of + terms and number of relationships from the ontology engine. + Example : print $sv->to_string(); + Returns : + Args : + + +=cut + +sub to_string{ + my ($self) = @_; + my $s = ""; + + $s .= "-- # Terms:\n"; + $s .= scalar($self->get_all_terms)."\n"; + $s .= "-- # Relationships:\n"; + $s .= $self->_get_number_rels."\n"; + + return $s; +} + +=head2 _unique_termid + + Title : _unique_termid + Usage : + Function: Returns a string that can be used as ID using fail-over + approaches. + + If the identifier attribute is not set, it uses the + combination of name and ontology name, provided both are + set. If they aren't, it returns the name alone. + + Note that this is a private method. Call from inheriting + classes but not from outside. + + Example : + Returns : a string + Args : a Bio::Ontology::TermI compliant object + + +=cut + +sub _unique_termid{ + my $self = shift; + my $term = shift; + + return $term->identifier() if $term->identifier(); + my $id = $term->ontology->name() if $term->ontology(); + if($id) { + $id .= '|'; + } else { + $id = ''; + } + $id .= $term->name(); +} + + +################################################################# +# aliases +################################################################# + +*get_relationship_types = \&get_predicate_terms; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/Term.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/Term.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,746 @@ +# $Id: Term.pm,v 1.8.2.3 2003/05/27 22:00:52 lapp Exp $ +# +# BioPerl module for Bio::Ontology::Term +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +Term - interface for ontology terms + +=head1 SYNOPSIS + +#get Bio::Ontology::TermI somehow. + + print $term->identifier(), "\n"; + print $term->name(), "\n"; + print $term->definition(), "\n"; + print $term->is_obsolete(), "\n"; + print $term->comment(), "\n"; + + foreach my $synonym ( $term->each_synonym() ) { + print $synonym, "\n"; + } + +=head1 DESCRIPTION + +This is "dumb" interface for ontology terms providing basic methods +(it provides no functionality related to graphs). It implements the +L interface. + +This class also implements L and +L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + +package Bio::Ontology::Term; +use vars qw( @ISA ); +use strict; +use Bio::Root::Object; +use Bio::Ontology::TermI; +use Bio::Ontology::Ontology; +use Bio::Ontology::OntologyStore; +use Bio::IdentifiableI; +use Bio::DescribableI; + +use constant TRUE => 1; +use constant FALSE => 0; + +@ISA = qw( Bio::Root::Root + Bio::Ontology::TermI + Bio::IdentifiableI + Bio::DescribableI + ); + + + +=head2 new + + Title : new + Usage : $term = Bio::Ontology::Term->new( -identifier => "16847", + -name => "1-aminocyclopropane-1-carboxylate synthase", + -definition => "Catalysis of ...", + -is_obsolete => 0, + -comment => "" ); + Function: Creates a new Bio::Ontology::Term. + Returns : A new Bio::Ontology::Term object. + Args : -identifier => the identifier of this term [scalar] + -name => the name of this term [scalar] + -definition => the definition of this term [scalar] + -ontology => the ontology this term lives in + (a L object) + -version => version information [scalar] + -is_obsolete => the obsoleteness of this term [0 or 1] + -comment => a comment [scalar] + +=cut + +sub new { + + my( $class,@args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $identifier, + $name, + $definition, + $category, + $ont, + $version, + $is_obsolete, + $comment, + $dblinks) + = $self->_rearrange( [ qw( IDENTIFIER + NAME + DEFINITION + CATEGORY + ONTOLOGY + VERSION + IS_OBSOLETE + COMMENT + DBLINKS + ) ], @args ); + + $self->init(); + + $identifier && $self->identifier( $identifier ); + $name && $self->name( $name ); + $definition && $self->definition( $definition ); + $category && $self->category( $category ); + $ont && $self->ontology( $ont ); + defined($version) && $self->version( $version ); + defined($is_obsolete) && $self->is_obsolete( $is_obsolete ); + $comment && $self->comment( $comment ); + ref($dblinks) && $self->add_dblink(@$dblinks); + + return $self; + +} # new + + + +sub init { + + my $self = shift; + + $self->identifier(undef); + $self->name(undef); + $self->comment(undef); + $self->definition(undef); + $self->ontology(undef); + $self->is_obsolete(0); + $self->remove_synonyms(); + $self->remove_dblinks(); + $self->remove_secondary_ids(); + +} # init + + + +=head2 identifier + + Title : identifier + Usage : $term->identifier( "0003947" ); + or + print $term->identifier(); + Function: Set/get for the identifier of this Term. + Returns : The identifier [scalar]. + Args : The identifier [scalar] (optional). + +=cut + +sub identifier { + my $self = shift; + + return $self->{'identifier'} = shift if @_; + return $self->{'identifier'}; +} # identifier + + + + +=head2 name + + Title : name + Usage : $term->name( "N-acetylgalactosaminyltransferase" ); + or + print $term->name(); + Function: Set/get for the name of this Term. + Returns : The name [scalar]. + Args : The name [scalar] (optional). + +=cut + +sub name { + my $self = shift; + + return $self->{'name'} = shift if @_; + return $self->{'name'}; +} # name + + + + + +=head2 definition + + Title : definition + Usage : $term->definition( "Catalysis of ..." ); + or + print $term->definition(); + Function: Set/get for the definition of this Term. + Returns : The definition [scalar]. + Args : The definition [scalar] (optional). + +=cut + +sub definition { + my $self = shift; + + return $self->{'definition'} = shift if @_; + return $self->{'definition'}; +} # definition + + +=head2 ontology + + Title : ontology + Usage : $ont = $term->ontology(); + or + $term->ontology( $ont ); + Function: Get the ontology this term is in. + + Note that with the ontology in hand you can query for all + related terms etc. See L. + + Returns : The ontology of this Term as a L + implementing object. + Args : On set, the ontology of this Term as a L + implementing object or a string representing its name. + +=cut + +sub ontology { + my $self = shift; + my $ont; + + if(@_) { + $ont = shift; + if($ont) { + $ont = Bio::Ontology::Ontology->new(-name => $ont) if ! ref($ont); + if(! $ont->isa("Bio::Ontology::OntologyI")) { + $self->throw(ref($ont)." does not implement ". + "Bio::Ontology::OntologyI. Bummer."); + } + } + return $self->{"_ontology"} = $ont; + } + return $self->{"_ontology"}; +} # ontology + + +=head2 version + + Title : version + Usage : $term->version( "1.00" ); + or + print $term->version(); + Function: Set/get for version information. + Returns : The version [scalar]. + Args : The version [scalar] (optional). + +=cut + +sub version { + my $self = shift; + + return $self->{'version'} = shift if @_; + return $self->{'version'}; +} # version + + + +=head2 is_obsolete + + Title : is_obsolete + Usage : $term->is_obsolete( 1 ); + or + if ( $term->is_obsolete() ) + Function: Set/get for the obsoleteness of this Term. + Returns : the obsoleteness [0 or 1]. + Args : the obsoleteness [0 or 1] (optional). + +=cut + +sub is_obsolete{ + my $self = shift; + + return $self->{'is_obsolete'} = shift if @_; + return $self->{'is_obsolete'}; +} # is_obsolete + + + + + +=head2 comment + + Title : comment + Usage : $term->comment( "Consider the term ..." ); + or + print $term->comment(); + Function: Set/get for an arbitrary comment about this Term. + Returns : A comment. + Args : A comment (optional). + +=cut + +sub comment{ + my $self = shift; + + return $self->{'comment'} = shift if @_; + return $self->{'comment'}; +} # comment + + + + +=head2 get_synonyms + + Title : get_synonyms + Usage : @aliases = $term->get_synonyms; + Function: Returns a list of aliases of this Term. + Returns : A list of aliases [array of [scalar]]. + Args : + +=cut + +sub get_synonyms { + my $self = shift; + + return @{ $self->{ "_synonyms" } } if exists($self->{ "_synonyms" }); + return (); +} # get_synonyms + + +=head2 add_synonym + + Title : add_synonym + Usage : $term->add_synonym( @asynonyms ); + or + $term->add_synonym( $synonym ); + Function: Pushes one or more synonyms into the list of synonyms. + Returns : + Args : One synonym [scalar] or a list of synonyms [array of [scalar]]. + +=cut + +sub add_synonym { + my ( $self, @values ) = @_; + + return unless( @values ); + + # avoid duplicates + foreach my $syn (@values) { + next if grep { $_ eq $syn; } @{$self->{ "_synonyms" }}; + push( @{ $self->{ "_synonyms" } }, $syn ); + } + +} # add_synonym + + +=head2 remove_synonyms + + Title : remove_synonyms() + Usage : $term->remove_synonyms(); + Function: Deletes (and returns) the synonyms of this Term. + Returns : A list of synonyms [array of [scalar]]. + Args : + +=cut + +sub remove_synonyms { + my ( $self ) = @_; + + my @a = $self->get_synonyms(); + $self->{ "_synonyms" } = []; + return @a; + +} # remove_synonyms + +=head2 get_dblinks + + Title : get_dblinks() + Usage : @ds = $term->get_dblinks(); + Function: Returns a list of each dblinks of this GO term. + Returns : A list of dblinks [array of [scalars]]. + Args : + +=cut + +sub get_dblinks { + my $self = shift; + + return @{$self->{ "_dblinks" }} if exists($self->{ "_dblinks" }); + return (); +} # get_dblinks + + +=head2 add_dblink + + Title : add_dblink + Usage : $term->add_dblink( @dbls ); + or + $term->add_dblink( $dbl ); + Function: Pushes one or more dblinks onto the list of dblinks. + Returns : + Args : One dblink [scalar] or a list of + dblinks [array of [scalars]]. + +=cut + +sub add_dblink { + my ( $self, @values ) = @_; + + return unless( @values ); + + # avoid duplicates + foreach my $dbl (@values) { + next if grep { $_ eq $dbl; } @{$self->{ "_dblinks" }}; + push( @{ $self->{ "_dblinks" } }, $dbl ); + } + +} # add_dblink + + +=head2 remove_dblinks + + Title : remove_dblinks() + Usage : $term->remove_dblinks(); + Function: Deletes (and returns) the definition references of this GO term. + Returns : A list of definition references [array of [scalars]]. + Args : + +=cut + +sub remove_dblinks { + my ( $self ) = @_; + + my @a = $self->get_dblinks(); + $self->{ "_dblinks" } = []; + return @a; + +} # remove_dblinks + +=head2 get_secondary_ids + + Title : get_secondary_ids + Usage : @ids = $term->get_secondary_ids(); + Function: Returns a list of secondary identifiers of this Term. + + Secondary identifiers mostly originate from merging terms, + or possibly also from splitting terms. + + Returns : A list of secondary identifiers [array of [scalar]] + Args : + +=cut + +sub get_secondary_ids { + my $self = shift; + + return @{$self->{"_secondary_ids"}} if exists($self->{"_secondary_ids"}); + return (); +} # get_secondary_ids + + +=head2 add_secondary_id + + Title : add_secondary_id + Usage : $term->add_secondary_id( @ids ); + or + $term->add_secondary_id( $id ); + Function: Adds one or more secondary identifiers to this term. + Returns : + Args : One or more secondary identifiers [scalars] + +=cut + +sub add_secondary_id { + my $self = shift; + + return unless @_; + + # avoid duplicates + foreach my $id (@_) { + next if grep { $_ eq $id; } @{$self->{ "_secondary_ids" }}; + push( @{ $self->{ "_secondary_ids" } }, $id ); + } + +} # add_secondary_id + + +=head2 remove_secondary_ids + + Title : remove_secondary_ids + Usage : $term->remove_secondary_ids(); + Function: Deletes (and returns) the secondary identifiers of this Term. + Returns : The previous list of secondary identifiers [array of [scalars]] + Args : + +=cut + +sub remove_secondary_ids { + my $self = shift; + + my @a = $self->get_secondary_ids(); + $self->{ "_secondary_ids" } = []; + return @a; + +} # remove_secondary_ids + + +# Title :_is_true_or_false +# Function: Checks whether the argument is TRUE or FALSE. +# Returns : +# Args : The value to be checked. +sub _is_true_or_false { + my ( $self, $value ) = @_; + unless ( $value !~ /\D/ && ( $value == TRUE || $value == FALSE ) ) { + $self->throw( "Found [" . $value + . "] where " . TRUE . " or " . FALSE . " expected" ); + } +} # _is_true_or_false + +=head1 Methods implementing L and L + +=cut + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. + + This is a synonym for identifier(). + + Returns : A scalar + +=cut + +sub object_id { + return shift->identifier(@_); +} + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + This forwards to ontology()->authority(). Note that you + cannot set the authority before having set the ontology or + the namespace (which will set the ontology). + + Returns : A scalar + Args : on set, the new value (a scalar) + +=cut + +sub authority { + my $self = shift; + my $ont = $self->ontology(); + + return $ont->authority(@_) if $ont; + $self->throw("cannot manipulate authority prior to ". + "setting the namespace or ontology") if @_; + return undef; +} + + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection. + + This forwards to ontology() (set mode) and + ontology()->name() (get mode). I.e., setting the namespace + will set the ontology to one matching that name in the + ontology store, or to one newly created. + + Returns : A scalar + Args : on set, the new value (a scalar) + +=cut + +sub namespace { + my $self = shift; + + $self->ontology(@_) if(@_); + my $ont = $self->ontology(); + return defined($ont) ? $ont->name() : undef; +} + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user. + + The definition in L states that the + string should not contain spaces. As this isn't very + sensible for ontology terms, we relax this here. The + implementation just forwards to name(). + + Returns : A scalar + Args : on set, the new value (a scalar) + +=cut + +sub display_name { + return shift->name(@_); +} + + +=head2 description + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. + + This forwards to definition(). The caveat is that the text + will often be longer for ontology term definitions than the + 255 characters stated in the definition in + L. + + Returns : A scalar + Args : on set, the new value (a scalar) + +=cut + +sub description { + return shift->definition(@_); +} + +################################################################# +# aliases or forwards to maintain backward compatibility +################################################################# + +=head1 Deprecated methods + +Used for looking up the methods that supercedes them. + +=cut + +=head2 category + + Title : category + Usage : + Function: This method is deprecated. Use ontology() instead. + Example : + Returns : + Args : + + +=cut + +sub category { + my $self = shift; + + $self->warn("TermI::category is deprecated and being phased out. ". + "Use TermI::ontology instead."); + + # called in set mode? + if(@_) { + # yes; what is incompatible with ontology() is if we were given + # a TermI object + my $arg = shift; + $arg = $arg->name() if ref($arg) && $arg->isa("Bio::Ontology::TermI"); + return $self->ontology($arg,@_); + } else { + # No, called in get mode. This is always incompatible with ontology() + # since category is supposed to return a TermI. + my $ont = $self->ontology(); + my $term; + if(defined($ont)) { + $term = Bio::Ontology::Term->new(-name => $ont->name(), + -identifier =>$ont->identifier()); + } + return $term; + } +} # category + +*each_synonym = \&get_synonyms; +*add_synonyms = \&add_synonym; +*each_dblink = \&get_dblinks; +*add_dblinks = \&add_dblink; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/TermFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/TermFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,134 @@ +# $Id: TermFactory.pm,v 1.1.2.2 2003/03/27 10:07:56 lapp Exp $ +# +# BioPerl module for Bio::Ontology::TermFactory +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Ontology::TermFactory - Instantiates a new Bio::Ontology::TermI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Ontology::TermFactory; + + # the default type is Bio::Ontology::Term + my $factory = new Bio::Ontology::TermFactory(-type => 'Bio::Ontology::GOterm'); + my $term = $factory->create_object(-name => 'peroxisome', + -ontology => 'Gene Ontology', + -identifier => 'GO:0005777'); + + +=head1 DESCRIPTION + +This object will build L objects generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Ontology::TermFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactory; + +@ISA = qw(Bio::Factory::ObjectFactory); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Ontology::TermFactory(); + Function: Builds a new Bio::Ontology::TermFactory object + Returns : Bio::Ontology::TermFactory + Args : -type => string, name of a L derived class. + The default is L. + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + # make sure this matches our requirements + $self->interface("Bio::Ontology::TermI"); + $self->type($self->type() || "Bio::Ontology::Term"); + + return $self; +} + + +=head2 create_object + + Title : create_object + Usage : my $term = $factory->create_object(); + Function: Instantiates new Bio::Ontology::TermI (or one of its child classes) + + This object allows us to genericize the instantiation of + Term objects. + + Returns : L compliant object + The return type is configurable using new(-type =>"..."). + Args : initialization parameters specific to the type of term + object we want. Typically + -name => $name + -identifier => identifier for the term + -ontology => ontology for the term + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Ontology/TermI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Ontology/TermI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,308 @@ +# $Id: TermI.pm,v 1.8.2.3 2003/05/27 22:00:52 lapp Exp $ +# +# BioPerl module for Bio::Ontology::Term +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +TermI - interface for ontology terms + +=head1 SYNOPSIS + + #get Bio::Ontology::TermI somehow. + + print $term->identifier(), "\n"; + print $term->name(), "\n"; + print $term->definition(), "\n"; + print $term->is_obsolete(), "\n"; + print $term->comment(), "\n"; + + foreach my $synonym ( $term->get_synonyms() ) { + print $synonym, "\n"; + } + + +=head1 DESCRIPTION + +This is "dumb" interface for ontology terms providing basic methods +(it provides no functionality related to graphs). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + +package Bio::Ontology::TermI; +use vars qw( @ISA ); +use strict; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + + + +=head2 identifier + + Title : identifier + Usage : $term->identifier( "0003947" ); + or + print $term->identifier(); + Function: Set/get for the identifier of this Term. + Returns : The identifier [scalar]. + Args : The identifier [scalar] (optional). + +=cut + +sub identifier { + shift->throw_not_implemented(); +} # identifier + + + + +=head2 name + + Title : name + Usage : $term->name( "N-acetylgalactosaminyltransferase" ); + or + print $term->name(); + Function: Set/get for the name of this Term. + Returns : The name [scalar]. + Args : The name [scalar] (optional). + +=cut + +sub name { + shift->throw_not_implemented(); +} # name + + + + + +=head2 definition + + Title : definition + Usage : $term->definition( "Catalysis of ..." ); + or + print $term->definition(); + Function: Set/get for the definition of this Term. + Returns : The definition [scalar]. + Args : The definition [scalar] (optional). + +=cut + +sub definition { + shift->throw_not_implemented(); +} # definition + + + +=head2 ontology + + Title : ontology + Usage : $ont = $term->ontology(); + or + $term->ontology( $ont ); + Function: Get the ontology this term is in. + + An implementation may not permit the value of this + attribute to be changed once it is set, since that may have + serious consequences (note that with the ontology in hand + you can query for all related terms etc). + + Note for implementors: you will almost certainly have to + take special precaution in order not to create cyclical + references in memory. + + Returns : The ontology of this Term as a L + implementing object. + Args : On set, the ontology of this Term as a L + implementing object or a string representing its name. + +=cut + +sub ontology { + shift->throw_not_implemented(); +} # ontology + + + +=head2 version + + Title : version + Usage : $term->version( "1.00" ); + or + print $term->version(); + Function: Set/get for version information. + Returns : The version [scalar]. + Args : The version [scalar] (optional). + +=cut + +sub version { + shift->throw_not_implemented(); +} # version + + + + +=head2 is_obsolete + + Title : is_obsolete + Usage : $term->is_obsolete( 1 ); + or + if ( $term->is_obsolete() ) + Function: Set/get for the obsoleteness of this Term. + Returns : the obsoleteness [0 or 1]. + Args : the obsoleteness [0 or 1] (optional). + +=cut + +sub is_obsolete { + shift->throw_not_implemented(); +} # is_obsolete + + + +=head2 comment + + Title : comment + Usage : $term->comment( "Consider the term ..." ); + or + print $term->comment(); + Function: Set/get for an arbitrary comment about this Term. + Returns : A comment. + Args : A comment (optional). + +=cut + +sub comment { + shift->throw_not_implemented(); +} # comment + + + + +=head2 get_synonyms + + Title : get_synonyms + Usage : @aliases = $term->get_synonyms(); + Function: Returns a list of aliases of this Term. + + If an implementor of this interface permits modification of + this array property, the class should define at least + methods add_synonym() and remove_synonyms(), with obvious + functionality. + + Returns : A list of aliases [array of [scalar]]. + Args : + +=cut + +sub get_synonyms { + shift->throw_not_implemented(); +} # get_synonyms + +=head2 get_dblinks + + Title : get_dblinks() + Usage : @ds = $term->get_dblinks(); + Function: Returns a list of each dblink of this term. + + If an implementor of this interface permits modification of + this array property, the class should define at least + methods add_dblink() and remove_dblinks(), with obvious + functionality. + + Returns : A list of dblinks [array of [scalars]]. + Args : + +=cut + +sub get_dblinks { + shift->throw_not_implemented(); +} # get_dblinks + +=head2 get_secondary_ids + + Title : get_secondary_ids + Usage : @ids = $term->get_secondary_ids(); + Function: Returns a list of secondary identifiers of this Term. + + Secondary identifiers mostly originate from merging terms, + or possibly also from splitting terms. + + If an implementor of this interface permits modification of + this array property, the class should define at least + methods add_secondary_id() and remove_secondary_ids(), with + obvious functionality. + + Returns : A list of secondary identifiers [array of [scalar]] + Args : + +=cut + +sub get_secondary_ids { + shift->throw_not_implemented(); +} # get_secondary_ids + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/OntologyIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,288 @@ +# $Id: OntologyIO.pm,v 1.3.2.1 2003/03/13 02:09:19 lapp Exp $ +# +# BioPerl module for Bio::OntologyIO +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::OntologyIO - Parser factory for Ontology formats + +=head1 SYNOPSIS + + use Bio::OntologyIO; + + my $parser = Bio::OntologyIO->new(-format => "go", ...); + + while(my $ont = $parser->next_ontology()) { + print "read ontology ",$ont->name()," with ", + scalar($ont->get_root_terms)," root terms, and ", + scalar($ont->get_leaf_terms)," leaf terms\n"; + } + +=head1 DESCRIPTION + +This is the parser factory for different ontology sources and +formats. Conceptually, it is very similar to L, but the +difference is that the chunk of data returned as an object is an +entire ontology. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +# +# Maps from format name to driver suitable for the format. +# +my %format_driver_map = ( + "go" => "goflat", + "so" => "soflat", + "interpro" => "InterProParser", + ); + +=head2 new + + Title : new + Usage : my $parser = Bio::OntologyIO->new(-format => 'go', @args); + Function: Returns a stream of ontologies opened on the specified input + for the specified format. + Returns : An ontology parser (an instance of Bio::OntologyIO) initialized + for the specified format. + Args : Named parameters. Common parameters are + + -format - the format of the input; supported right now are + 'go' (synonymous with goflat), 'so' (synonymous + with soflat), and 'interpro' + -file - the file holding the data + -fh - the stream providing the data (-file and -fh are + mutually exclusive) + -ontology_name - the name of the ontology + -engine - the L object + to be reused (will be created otherwise); note + that every L will + qualify as well since that one inherits from the + former. + -term_factory - the ontology term factory to use. Provide a + value only if you know what you are doing. + + DAG-Edit flat file parsers will usually also accept the + following parameters. + + -defs_file - the name of the file holding the term + definitions + -files - an array ref holding the file names (for GO, + there will usually be 3 files: component.ontology, + function.ontology, process.ontology) + + Other parameters are specific to the parsers. + +=cut + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::OntologyIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $class->_map_format($param{'-format'}); + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::OntologyIO::$format"->new(@args); + } +} + +sub _initialize { + my($self, @args) = @_; + + # initialize factories etc + my ($eng,$fact,$ontname) = + $self->_rearrange([qw(TERM_FACTORY) + ], @args); + + # term object factory + $self->term_factory($fact) if $fact; + + # initialize the Bio::Root::IO part + $self->_initialize_io(@args); +} + +=head2 next_ontology + + Title : next_ontology + Usage : $ont = $stream->next_ontology() + Function: Reads the next ontology object from the stream and returns it. + Returns : a L compliant object, or undef at the + end of the stream + Args : none + + +=cut + +sub next_ontology { + shift->throw_not_implemented(); +} + +=head2 term_factory + + Title : term_factory + Usage : $obj->term_factory($newval) + Function: Get/set the ontology term factory to use. + + As a user of this module it is not necessary to call this + method as there will be default. In order to change the + default, the easiest way is to instantiate + L with the proper -type + argument. Most if not all parsers will actually use this + very implementation, so even easier than the aforementioned + way is to simply call + $ontio->term_factory->type("Bio::Ontology::MyTerm"). + + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactoryI object) + Args : on set, new value (a Bio::Factory::ObjectFactoryI object, optional) + + +=cut + +sub term_factory{ + my $self = shift; + + return $self->{'term_factory'} = shift if @_; + return $self->{'term_factory'}; +} + +=head1 Private Methods + + Some of these are actually 'protected' in OO speak, which means you + may or will want to utilize them in a derived ontology parser, but + you should not call them from outside. + +=cut + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL OntologyIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self, $format) = @_; + my $module = "Bio::OntologyIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <close(); +} + +sub _map_format { + my $self = shift; + my $format = shift; + my $mod; + + if($format) { + $mod = $format_driver_map{lc($format)}; + $mod = lc($format) unless $mod; + } else { + $self->throw("unable to guess ontology format, specify -format"); + } + return $mod; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/OntologyIO/Handlers/InterProHandler.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO/Handlers/InterProHandler.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,718 @@ +# $Id: InterProHandler.pm,v 1.7.2.2 2003/03/27 10:07:57 lapp Exp $ +# +# BioPerl module for InterProHandler +# +# Cared for by Peter Dimitrov +# +# Copyright Peter Dimitrov +# (c) Peter Dimitrov, dimitrov@gnf.org, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# POD documentation - main docs before the code + +=head1 NAME + +InterProHandler - XML handler class for InterProParser + +=head1 SYNOPSIS + + # do not use directly - used and instantiated by InterProParser + +=head1 DESCRIPTION + +Handles xml events generated by InterProParser when parsing InterPro +xml files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + +bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + +bioperl-bugs@bioperl.org +http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO::Handlers::InterProHandler; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::Root; +use Bio::Ontology::Ontology; +use Bio::Ontology::RelationshipType; +use Bio::Ontology::SimpleOntologyEngine; +use Bio::Annotation::Reference; +use Data::Dumper; + +@ISA = qw(Bio::Root::Root); + +my ($record_count, $processed_count, $is_a_rel, $contains_rel, $found_in_rel); + +=head2 new + + Title : new + Usage : $h = Bio::OntologyIO::Handlers::InterProHandler->new; + Function: Initializes global variables + Example : + Returns : an InterProHandler object + Args : + + +=cut + +sub new{ + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($eng,$ont,$name,$fact) = + $self->_rearrange([qw(ENGINE + ONTOLOGY + ONTOLOGY_NAME + TERM_FACTORY) + ],@args); + + if(defined($ont)) { + $self->ontology($ont); + } else { + $name = "InterPro" unless $name; + $self->ontology(Bio::Ontology::Ontology->new(-name => $name)); + } + $self->ontology_engine($eng) if $eng; + + $self->term_factory($fact) if $fact; + + $is_a_rel = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + $contains_rel = Bio::Ontology::RelationshipType->get_instance( "CONTAINS" ); + $found_in_rel = Bio::Ontology::RelationshipType->get_instance( "FOUND_IN" ); + $self->_cite_skip(0); + $self->secondary_accessions_map( {} ); + + return $self; +} + +=head2 ontology_engine + + Title : ontology_engine + Usage : $obj->ontology_engine($newval) + Function: Get/set ontology engine. Can be initialized only once. + Example : + Returns : value of ontology_engine (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub ontology_engine{ + my ($self, $value) = @_; + + if( defined $value) { + if ( defined $self->{'ontology_engine'}) { + $self->throw("ontology_engine already defined"); + } else { + $self->throw(ref($value)." does not implement ". + "Bio::Ontology::OntologyEngineI. Bummer.") + unless $value->isa("Bio::Ontology::OntologyEngineI"); + $self->{'ontology_engine'} = $value; + + # don't forget to set this as the engine of the ontology, otherwise + # those two might not point to the same object + my $ont = $self->ontology(); + if($ont && $ont->can("engine") && (!$ont->engine())) { + $ont->engine($value); + } + + $self->debug(ref($self) . + "::ontology_engine: registering ontology engine (". + ref($value)."):\n". + $value->to_string."\n"); + } + } + + return $self->{'ontology_engine'}; +} + +=head2 ontology + + Title : ontology + Usage : + Function: Get the ontology to add the InterPro terms to. + + The value is determined automatically once ontology_engine + has been set and if it hasn't been set before. + + Example : + Returns : A L implementing object. + Args : On set, a L implementing object. + +=cut + +sub ontology{ + my ($self,$ont) = @_; + + if(defined($ont)) { + $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI". + ". Bummer.") + unless $ont->isa("Bio::Ontology::OntologyI"); + $self->{'_ontology'} = $ont; + } + return $self->{'_ontology'}; +} + +=head2 term_factory + + Title : term_factory + Usage : $obj->term_factory($newval) + Function: Get/set the ontology term object factory + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactory instance) + Args : on set, new value (a Bio::Factory::ObjectFactory instance + or undef, optional) + + +=cut + +sub term_factory{ + my $self = shift; + + return $self->{'term_factory'} = shift if @_; + return $self->{'term_factory'}; +} + +=head2 _cite_skip + + Title : _cite_skip + Usage : $obj->_cite_skip($newval) + Function: + Example : + Returns : value of _cite_skip (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _cite_skip{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_cite_skip'} = $value; + } + + return $self->{'_cite_skip'}; +} + +=head2 _hash + + Title : _hash + Usage : $obj->_hash($newval) + Function: + Example : + Returns : value of _hash (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _hash{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_hash'} = $value; + } + + return $self->{'_hash'}; +} + +=head2 _stack + + Title : _stack + Usage : $obj->_stack($newval) + Function: + Example : + Returns : value of _stack (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _stack{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_stack'} = $value; + } + return $self->{'_stack'}; +} + +=head2 _top + + Title : _top + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _top{ + my ($self, $_stack) = @_; + my @stack = @{$_stack}; + + return (@stack >= 1) ? $stack[@stack - 1] : undef; +} + +=head2 _term + + Title : _term + Usage : $obj->_term($newval) + Function: Get/set method for the term currently processed. + Example : + Returns : value of term (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _term{ + my ($self, $value) = @_; + + if(defined $value) { + $self->{'_term'} = $value; + } + + return $self->{'_term'}; +} + +=head2 _clear_term + + Title : _clear_term + Usage : + Function: Removes the current term from the handler + Example : + Returns : + Args : + + +=cut + +sub _clear_term{ + my ($self) = @_; + + delete $self->{'_term'}; +} + +=head2 _names + + Title : _names + Usage : $obj->_names($newval) + Function: + Example : + Returns : value of _names (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _names{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_names'} = $value; + } + + return $self->{'_names'}; +} + +=head2 _create_relationship + + Title : _create_relationship + Usage : + Function: Helper function. Adds relationships to one of the relationship stores. + Example : + Returns : + Args : + + +=cut + +sub _create_relationship{ + my ($self, $ref_id, $rel_type_term) = @_; + my $ont = $self->ontology(); + my $fact = $self->term_factory(); + my $term_temp = ($ont->engine->get_term_by_identifier($ref_id))[0]; + + my $rel = Bio::Ontology::Relationship->new( -predicate_term => $rel_type_term ); + + if (!defined $term_temp) { + $term_temp = $ont->engine->add_term( $fact->create_object( -InterPro_id => $ref_id ) ); + $ont->engine->mark_uninstantiated($term_temp); + } + my $rel_type_name = $self->_top($self->_names); + + if ($rel_type_name eq 'parent_list' || $rel_type_name eq 'found_in') { + $rel->object_term( $term_temp ); + $rel->subject_term( $self->_term ); + } else { + $rel->object_term( $self->_term ); + $rel->subject_term( $term_temp ); + } + $rel->ontology($ont); + $ont->add_relationship($rel); +} + +=head2 start_element + + Title : start_element + Usage : + Function: This is a method that is derived from XML::SAX::Base and + has to be overridden for processing start of xml element + events. Used internally only. + + Example : + Returns : + Args : + + +=cut + +sub start_element { + my ($self, $element) = @_; + my $ont = $self->ontology(); + my $fact = $self->term_factory(); + + if ($element->{Name} eq 'interprodb') { + $ont->add_term($fact->create_object(-identifier => "Family", + -name => "Family") ); + $ont->add_term($fact->create_object(-identifier => "Domain", + -name => "Domain") ); + $ont->add_term($fact->create_object(-identifier => "Repeat", + -name => "Repeat") ); + $ont->add_term($fact->create_object(-identifier => "PTM", + -name => "post-translational modification")); + } elsif ($element->{Name} eq 'interpro') { + my %record_args = %{$element->{Attributes}}; + my $id = $record_args{"id"}; + my $term_temp = ($ont->engine->get_term_by_identifier($id))[0]; + + $self->_term( + (!defined $term_temp) + ? $ont->add_term( $fact->create_object(-InterPro_id => $id) ) + : $term_temp + ); + + $self->_term->ontology( $ont ); + $self->_term->short_name( $record_args{"short_name"} ); + $self->_term->protein_count( $record_args{"protein_count"} ); + $self->_increment_record_count(); + $self->_stack([{ interpro => undef }]); + $self->_names(["interpro"]); + + ## Adding a relationship between the newly created InterPro term + ## and the term describing its type + + my $rel = Bio::Ontology::Relationship->new( -predicate_term => $is_a_rel ); + $rel->object_term( ($ont->engine->get_term_by_identifier($record_args{"type"}))[0] ); + $rel->subject_term( $self->_term ); + $rel->ontology($ont); + $ont->add_relationship($rel); + } + elsif (defined $self->_stack) { + my %hash = (); + + if (keys %{$element->{Attributes}} > 0) { + foreach my $key (keys %{$element->{Attributes}}) { + $hash{$key} = $element->{Attributes}->{$key}; + } + } + push @{$self->_stack}, \%hash; + if ($element->{Name} eq 'rel_ref') { + my $ref_id = $element->{Attributes}->{"ipr_ref"}; + my $parent = $self->_top($self->_names); + + if ($parent eq 'parent_list' || $parent eq 'child_list') { + $self->_create_relationship($ref_id, $is_a_rel); + } + if ($parent eq 'contains' ) { + $self->_create_relationship($ref_id, $contains_rel); + } + if ($parent eq 'found_in' ) { + $self->_create_relationship($ref_id, $found_in_rel); + } + } + elsif ($element->{Name} eq 'abstract') { + $self->_cite_skip(1); + } + push @{$self->_names}, $element->{Name}; + } + +} + +=head2 _char_storage + + Title : _char_storage + Usage : $obj->_char_storage($newval) + Function: + Example : + Returns : value of _char_storage (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _char_storage{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_char_storage'} = $value; + } + + return $self->{'_char_storage'}; +} + +=head2 characters + + Title : characters + Usage : + Function: This is a method that is derived from XML::SAX::Base and has to be overridden for processing xml characters events. Used internally only. + Example : + Returns : + Args : + + +=cut + +sub characters { + my ($self, $characters) = @_; + my $text = $characters->{Data}; + + chomp $text; + $text =~ s/^(\s+)//; + $self->{_char_storage} .= $text; + +} + +=head2 end_element + + Title : end_element + Usage : + Function: This is a method that is derived from XML::SAX::Base and has to be overridden for processing end of xml element events. Used internally only. + Example : + Returns : + Args : + + +=cut + +sub end_element { + my ($self, $element) = @_; + + if ($element->{Name} eq 'interprodb') { + $self->debug("Interpro DB Parser Finished: $record_count read, $processed_count processed\n"); + } + elsif ($element->{Name} eq 'interpro') { + $self->_clear_term; + $self->_increment_processed_count(); + } + elsif ($element->{Name} ne 'cite') { + $self->{_char_storage} =~ s/<\/?p>//g; + if ((defined $self->_stack)) { + my $current_hash = pop @{$self->_stack}; + my $parent_hash = $self->_top($self->_stack); + my $current_hash_key = pop @{$self->_names}; + + if (keys %{$current_hash} > 0 && $self->_char_storage ne "") { + $current_hash->{comment} = $self->_char_storage; + push @{ $parent_hash->{$current_hash_key} }, $current_hash + } + elsif ($self->_char_storage ne ""){ + push @{ $parent_hash->{$current_hash_key} }, { 'accumulated_text_12345' => $self->_char_storage }; + } + elsif (keys %{$current_hash} > 0) { + push @{ $parent_hash->{$current_hash_key} }, $current_hash; + } + if ($element->{Name} eq 'pub_list') { + my @refs = (); + + foreach my $pub_record ( @{ $current_hash->{publication} } ) { + my $ref = Bio::Annotation::Reference->new; + my $loc = $pub_record->{location}->[0]; + + $ref->location( $pub_record->{journal}->[0]->{accumulated_text_12345}.", ".$loc->{firstpage}."-".$loc->{lastpage}.", ".$loc->{volume}.", ".$pub_record->{year}->[0]->{accumulated_text_12345}); + $ref->title( $pub_record->{title}->[0]->{accumulated_text_12345} ); + my $ttt = $pub_record->{author_list}->[0]; + + $ref->authors( $ttt->{accumulated_text_12345} ); + $ref->medline( scalar($ttt->{dbkey}) ) + if exists($ttt->{db}) && $ttt->{db} eq "MEDLINE"; + push @refs, $ref; + } + $self->_term->add_reference(@refs); + } + elsif ($element->{Name} eq 'name') { + $self->_term->name( $self->_char_storage ); + } + elsif ($element->{Name} eq 'abstract') { + $self->_term->definition( $self->_char_storage ); + $self->_cite_skip(0); + } + elsif ($element->{Name} eq 'member_list') { + my @refs = (); + + foreach my $db_xref ( @{ $current_hash->{db_xref} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $db_xref->{db}, + -primary_id => $db_xref->{dbkey} + ); + } + $self->_term->add_member(@refs); + } + elsif ($element->{Name} eq 'sec_list') { + my @refs = (); + + foreach my $sec_ac ( @{ $current_hash->{sec_ac} } ) { + push @refs, $sec_ac->{sec_ac}; + } + $self->_term->add_secondary_id(@refs); + $self->secondary_accessions_map->{$self->_term->identifier} = \@refs; + } + elsif ($element->{Name} eq 'example_list') { + my @refs = (); + + foreach my $example ( @{ $current_hash->{example} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $example->{db_xref}->[0]->{db}, + -primary_id => $example->{db_xref}->[0]->{dbkey}, + -comment => $example->{comment} + ); + } + $self->_term->add_example(@refs); + } + elsif ($element->{Name} eq 'external_doc_list') { + my @refs = (); + + foreach my $db_xref ( @{ $current_hash->{db_xref} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $db_xref->{db}, + -primary_id => $db_xref->{dbkey} + ); + } + $self->_term->add_external_document(@refs); + } + elsif ($element->{Name} eq 'class_list') { + my @refs = (); + + foreach my $classification ( @{ $current_hash->{classification} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $classification->{class_type}, + -primary_id => $classification->{id} + ); + } + $self->_term->class_list(\@refs); + } + elsif ($element->{Name} eq 'deleted_entries') { + my @refs = (); + + foreach my $del_ref ( @{ $current_hash->{del_ref} } ) { + my $term = ($self->ontology_engine->get_term_by_identifier( $del_ref->{id} ))[0]; + + $term->is_obsolete(1) if defined $term; + } + } + } + $self->_char_storage( '' ) if !$self->_cite_skip; + } +} + +=head2 secondary_accessions_map + + Title : secondary_accessions_map + Usage : $obj->secondary_accessions_map($newval) + Function: + Example : $map = $interpro_handler->secondary_accessions_map(); + Returns : Reference to a hash that maps InterPro identifier to an + array reference of secondary accessions following the InterPro + xml schema. + Args : Empty hash reference + + +=cut + +sub secondary_accessions_map{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'secondary_accessions_map'} = $value; + } + + return $self->{'secondary_accessions_map'}; +} + +=head2 _increment_record_count + + Title : _increment_record_count + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _increment_record_count{ + $record_count++; +} + +=head2 _increment_processed_count + + Title : _increment_processed_count + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _increment_processed_count{ + $processed_count++; + print $processed_count."\n" if $processed_count % 100 == 0; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/OntologyIO/InterProParser.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO/InterProParser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,248 @@ +# $GNF: projects/gi/symgene/src/perl/seqproc/Bio/OntologyIO/InterProParser.pm,v 1.5 2003/02/07 22:05:58 pdimitro Exp $ +# +# BioPerl module for InterProParser +# +# Cared for by Peter Dimitrov +# +# Copyright Peter Dimitrov +# (c) Peter Dimitrov, dimitrov@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# POD documentation - main docs before the code + +=head1 NAME + +InterProParser - Parser for InterPro xml files. + +=head1 SYNOPSIS + + # don't use this module directly - use Bio::OntologyIO with instead + my $ipp = Bio::OntologyIO->new( -format => 'interpro', + -file => 't/data/interpro.xml', + -ontology_engine => 'simple' ); + +=head1 DESCRIPTION + + Use InterProParser to parse InterPro files in xml format. Typical + use is the interpro.xml file published by EBI. The xml records + should follow the format described in interpro.dtd, although the dtd + file is not needed, and the XML file will not be validated against + it. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO::InterProParser; +use vars qw(@ISA); +use strict; +#use Carp; +use XML::Parser::PerlSAX; +use Bio::Ontology::SimpleOntologyEngine; +use Bio::Ontology::TermFactory; +use Bio::OntologyIO; +use Bio::OntologyIO::Handlers::InterProHandler; + +@ISA = qw( Bio::OntologyIO ); + +=head2 new + + Title : new + Usage : + Function: Initializes objects needed for parsing. + Example : $ipp = Bio::OntologyIO::InterProParser->new( + -file => 't/data/interpro.xml', + -ontology_engine => 'simple' ) + + Returns : Object of class Bio::OntologyIO::InterProParser. + Args : + + -file - file name + -ontology_engine - type of ontology engine. Should satisfy the + OntologyEngine interface requirements. Currently + the only option is 'simple'. In the future + Graph.pm based engine will be added to the + choices. + + +=cut + +# in reality we let OntologyIO handle the first pass initialization +# and instead override _initialize(). +sub _initialize{ + my $self = shift; + + $self->SUPER::_initialize(@_); + + my ($eng,$eng_type,$name) = + $self->_rearrange([qw(ENGINE + ONTOLOGY_ENGINE + ONTOLOGY_NAME) + ], @_); + + my $ip_h = Bio::OntologyIO::Handlers::InterProHandler->new( + -ontology_name => $name); + + if(! $eng) { + if(lc($eng_type) eq 'simple') { + $eng = Bio::Ontology::SimpleOntologyEngine->new(); + } else { + $self->throw("ontology engine type '$eng_type' ". + "not implemented yet"); + } + } + if($eng->isa("Bio::Ontology::OntologyI")) { + $ip_h->ontology($eng); + $eng = $eng->engine() if $eng->can('engine'); + } + $self->{_ontology_engine} = $eng; + $ip_h->ontology_engine($eng); + + $self->{_parser} = XML::Parser::PerlSAX->new( Handler => $ip_h ); + $self->{_interpro_handler} = $ip_h; + + # default term object factory + $self->term_factory(Bio::Ontology::TermFactory->new( + -type => "Bio::Ontology::InterProTerm")) + unless $self->term_factory(); + $ip_h->term_factory($self->term_factory()); + +} + +=head2 parse + + Title : parse + Usage : + Function: Performs the actual parsing. + Example : $ipp->parse(); + Returns : + Args : + + +=cut + +sub parse{ + my $self = shift; + + my $ret = $self->{_parser}->parse( Source => { + SystemId => $self->file() } ); + $self->_is_parsed(1); + return $ret; +} + +=head2 next_ontology + + Title : next_ontology + Usage : $ipp->next_ontology() + Function: Parses the input file and returns the next InterPro ontology + available. + + Usually there will be only one ontology returned from an + InterPro XML input. + + Example : $ipp->next_ontology(); + Returns : Returns the ontology as a L + compliant object. + Args : + + +=cut + +sub next_ontology{ + my $self = shift; + + $self->parse() unless $self->_is_parsed(); + # there is only one ontology in an InterPro source file + if(exists($self->{'_ontology_engine'})) { + my $ont = $self->{_interpro_handler}->ontology(); + delete $self->{_ontology_engine}; + return $ont; + } + return undef; +} + +=head2 _is_parsed + + Title : _is_parsed + Usage : $obj->_is_parsed($newval) + Function: + Example : + Returns : value of _is_parsed (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub _is_parsed{ + my $self = shift; + + return $self->{'_is_parsed'} = shift if @_; + return $self->{'_is_parsed'}; +} + +=head2 secondary_accessions_map + + Title : secondary_accessions_map + Usage : $obj->secondary_accessions_map() + Function: This method is merely for convenience, and one should + normally use the InterProTerm secondary_ids method to access + the secondary accessions. + Example : $map = $interpro_parser->secondary_accessions_map; + Returns : Reference to a hash that maps InterPro identifier to an + array reference of secondary accessions following the InterPro + xml schema. + Args : Empty hash reference + +=cut + +sub secondary_accessions_map{ + my ($self) = @_; + + return $self->{_interpro_handler}->{secondary_accessions_map}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/OntologyIO/dagflat.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO/dagflat.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,799 @@ +# $Id: dagflat.pm,v 1.2.2.6 2003/06/30 05:04:06 lapp Exp $ +# +# BioPerl module for Bio::OntologyIO::dagflat +# +# Cared for by Hilmar Lapp, hlapp at gmx.net +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) Hilmar Lapp, hlapp at gmx.net, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +dagflat - a base class parser for GO flat-file type formats + +=head1 SYNOPSIS + + use Bio::OntologyIO; + + # do not use directly -- use via Bio::OntologyIO + # e.g., the GO parser is a simple extension of this class + my $parser = Bio::OntologyIO->new + ( -format => "go", + -defs_file => "/home/czmasek/GO/GO.defs", + -files => ["/home/czmasek/GO/component.ontology", + "/home/czmasek/GO/function.ontology", + "/home/czmasek/GO/process.ontology"] ); + + my $go_ontology = $parser->next_ontology(); + + my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" ); + +=head1 DESCRIPTION + +Needs Graph.pm from CPAN. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head2 CONTRIBUTOR + + Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO::dagflat; + +use vars qw( @ISA ); +use strict; + +use Bio::Root::IO; +use Bio::Ontology::SimpleGOEngine; +use Bio::Ontology::Ontology; +use Bio::Ontology::TermFactory; +use Bio::OntologyIO; + +use constant TRUE => 1; +use constant FALSE => 0; + + +@ISA = qw( Bio::OntologyIO ); + + +=head2 new + + Title : new + Usage : $parser = Bio::OntologyIO->new( + -format => "go", + -defs_file => "/path/to/GO.defs", + -files => ["/path/to/component.ontology", + "/path/to/function.ontology", + "/path/to/process.ontology"] ); + Function: Creates a new dagflat parser. + Returns : A new dagflat parser object, implementing L. + Args : -defs_file => the name of the file holding the term + definitions + -files => a single ontology flat file holding the + term relationships, or an array ref holding + the file names (for GO, there will usually be + 3 files: component.ontology, function.ontology, + process.ontology) + -file => if there is only a single flat file, it may + also be specified via the -file parameter + -ontology_name => the name of the ontology; if not specified the + parser will auto-discover it by using the term + that starts with a '$', and converting underscores + to spaces + -engine => the Bio::Ontology::OntologyEngineI object + to be reused (will be created otherwise); note + that every Bio::Ontology::OntologyI will + qualify as well since that one inherits from the + former. + +=cut + +# in reality, we let OntologyIO::new do the instantiation, and override +# _initialize for all initialization work +sub _initialize { + my ($self, @args) = @_; + + $self->SUPER::_initialize( @args ); + + my ( $defs_file_name,$files,$name,$eng ) = + $self->_rearrange([qw( DEFS_FILE + FILES + ONTOLOGY_NAME + ENGINE) + ], + @args ); + + $self->_done( FALSE ); + $self->_not_first_record( FALSE ); + $self->_term( "" ); + delete $self->{'_ontologies'}; + + # ontology engine (and possibly name if it's an OntologyI) + $eng = Bio::Ontology::SimpleGOEngine->new() unless $eng; + if($eng->isa("Bio::Ontology::OntologyI")) { + $self->ontology_name($eng->name()); + $eng = $eng->engine() if $eng->can('engine'); + } + $self->_ont_engine($eng); + + # flat files to parse + $self->defs_file( $defs_file_name ); + $self->{_flat_files} = $files ? ref($files) ? $files : [$files] : []; + + # ontology name (overrides implicit one through OntologyI engine) + $self->ontology_name($name) if $name; + +} # _initialize + +=head2 ontology_name + + Title : ontology_name + Usage : $obj->ontology_name($newval) + Function: Get/set the name of the ontology parsed by this module. + Example : + Returns : value of ontology_name (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub ontology_name{ + my $self = shift; + + return $self->{'ontology_name'} = shift if @_; + return $self->{'ontology_name'}; +} + + +=head2 parse + + Title : parse() + Usage : $parser->parse(); + Function: Parses the files set with "new" or with methods + defs_file and _flat_files. + + Normally you should not need to call this method as it will + be called automatically upon the first call to + next_ontology(). + + Returns : [Bio::Ontology::OntologyEngineI] + Args : + +=cut + +sub parse { + my $self = shift; + + # setup the default term factory if not done by anyone yet + $self->term_factory(Bio::Ontology::TermFactory->new( + -type => "Bio::Ontology::Term")) + unless $self->term_factory(); + + # create the ontology object itself + my $ont = Bio::Ontology::Ontology->new(-name => $self->ontology_name(), + -engine => $self->_ont_engine()); + + # parse definitions + while( my $term = $self->_next_term() ) { + $self->_add_term( $term, $ont ); + } + + # set up the ontology of the relationship types + foreach ($self->_part_of_relationship(), $self->_is_a_relationship()) { + $_->ontology($ont); + } + + # pre-seed the IO system with the first flat file if -file wasn't provided + if(! $self->_fh) { + $self->_initialize_io(-file => shift(@{$self->_flat_files()})); + } + + while($self->_fh) { + $self->_parse_flat_file($ont); + # advance to next flat file if more are available + if(@{$self->_flat_files()}) { + $self->close(); + $self->_initialize_io(-file => shift(@{$self->_flat_files()})); + } else { + last; # nothing else to parse so terminate the loop + } + } + $self->_add_ontology($ont); + + # not needed anywhere, only because of backward compatibility + return $self->_ont_engine(); + +} # parse + +=head2 next_ontology + + Title : next_ontology + Usage : + Function: Get the next available ontology from the parser. This is the + method prescribed by Bio::OntologyIO. + Example : + Returns : An object implementing Bio::Ontology::OntologyI, and undef if + there is no more ontology in the input. + Args : + + +=cut + +sub next_ontology{ + my $self = shift; + + # parse if not done already + $self->parse() unless exists($self->{'_ontologies'}); + # return next available ontology + return shift(@{$self->{'_ontologies'}}) if exists($self->{'_ontologies'}); + return undef; +} + + +=head2 defs_file + + Title : defs_file + Usage : $parser->defs_file( "GO.defs" ); + Function: Set/get for the term definitions filename. + Returns : The term definitions file name [string]. + Args : On set, the term definitions file name [string] (optional). + +=cut + +sub defs_file { + my $self = shift; + + if ( @_ ) { + my $f = shift; + $self->{ "_defs_file_name" } = $f; + $self->_defs_io->close() if $self->_defs_io(); + if(defined($f)) { + $self->_defs_io( Bio::Root::IO->new( -input => $f ) ); + } + } + return $self->{ "_defs_file_name" }; +} # defs_file + + +=head2 _flat_files + + Title : _flat_files + Usage : $files_to_parse = $parser->_flat_files(); + Function: Get the array of ontology flat files that need to be parsed. + + Note that this array will decrease in elements over the + parsing process. Therefore, it's value outside of this + module will be limited. Also, be careful not to alter the + array unless you know what you are doing. + + Returns : a reference to an array of zero or more strings + Args : none + +=cut + +sub _flat_files { + my $self = shift; + + $self->{_flat_files} = [] unless exists($self->{_flat_files}); + return $self->{_flat_files}; +} + + +# INTERNAL METHODS +# ---------------- + +=head2 _defs_io + + Title : _defs_io + Usage : $obj->_defs_io($newval) + Function: Get/set the Bio::Root::IO instance representing the + definition file, if provided (see defs_file()). + Example : + Returns : value of _defs_io (a Bio::Root::IO object) + Args : on set, new value (a Bio::Root::IO object or undef, optional) + + +=cut + +sub _defs_io{ + my $self = shift; + + return $self->{'_defs_io'} = shift if @_; + return $self->{'_defs_io'}; +} + +sub _add_ontology { + my $self = shift; + $self->{'_ontologies'} = [] unless exists($self->{'_ontologies'}); + foreach my $ont (@_) { + $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI") + unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI"); + # the ontology name may have been auto-discovered while parsing + # the file + $ont->name($self->ontology_name) unless $ont->name(); + push(@{$self->{'_ontologies'}}, $ont); + } +} + +# This simply delegates. See SimpleGOEngine. +sub _add_term { + my ( $self, $term, $ont ) = @_; + + $term->ontology($ont) if $ont && (! $term->ontology); + $self->_ont_engine()->add_term( $term ); +} # _add_term + + + +# This simply delegates. See SimpleGOEngine +sub _part_of_relationship { + my $self = shift; + + return $self->_ont_engine()->part_of_relationship(@_); +} # _part_of_relationship + + + +# This simply delegates. See SimpleGOEngine +sub _is_a_relationship { + my $self = shift; + + return $self->_ont_engine()->is_a_relationship(@_); +} # _is_a_relationship + + + +# This simply delegates. See SimpleGOEngine +sub _add_relationship { + my ( $self, $parent, $child, $type, $ont ) = @_; + + # note the triple terminology (subject,predicate,object) corresponds to + # (child,type,parent) + $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont ); + + +} # _add_relationship + + +# This simply delegates. See SimpleGOEngine +sub _has_term { + my $self = shift; + + return $self->_ont_engine()->has_term( @_ ); +} # _add_term + + + +# This parses the relationships files +sub _parse_flat_file { + my $self = shift; + my $ont = shift; + + my @stack = (); + my $prev_spaces = -1; + my $prev_term = ""; + + while( my $line = $self->_readline() ) { + + if ( $line =~ /^!/ ) { + next; + } + + my $current_term = $self->_get_first_termid( $line ); + my @isa_parents = $self->_get_isa_termids( $line ); + my @partof_parents = $self->_get_partof_termids( $line ); + my @syns = $self->_get_synonyms( $line ); + my @sec_go_ids = $self->_get_secondary_termids( $line ); + my @cross_refs = $self->_get_db_cross_refs( $line ); + + + if ( ! $self->_has_term( $current_term ) ) { + my $term =$self->_create_ont_entry($self->_get_name($line, + $current_term), + $current_term ); + $self->_add_term( $term, $ont ); + } + + my $current_term_object = $self->_ont_engine()->get_terms( $current_term ); + + $current_term_object->add_dblink( @cross_refs ); + $current_term_object->add_secondary_id( @sec_go_ids ); + $current_term_object->add_synonym( @syns ); + unless ( $line =~ /^\$/ ) { + $current_term_object->ontology( $ont ); + } + foreach my $parent ( @isa_parents ) { + if ( ! $self->_has_term( $parent ) ) { + my $term = $self->_create_ont_entry($self->_get_name($line, + $parent), + $parent ); + $self->_add_term( $term, $ont ); + } + + $self->_add_relationship( $parent, + $current_term, + $self->_is_a_relationship(), + $ont); + + } + foreach my $parent ( @partof_parents ) { + if ( ! $self->_has_term( $parent ) ) { + my $term = $self->_create_ont_entry($self->_get_name($line, + $parent), + $parent ); + $self->_add_term( $term, $ont ); + } + + $self->_add_relationship( $parent, + $current_term, + $self->_part_of_relationship(), + $ont); + } + + my $current_spaces = $self->_count_spaces( $line ); + + if ( $current_spaces != $prev_spaces ) { + + if ( $current_spaces == $prev_spaces + 1 ) { + push( @stack, $prev_term ); + } + elsif ( $current_spaces < $prev_spaces ) { + my $n = $prev_spaces - $current_spaces; + for ( my $i = 0; $i < $n; ++$i ) { + pop( @stack ); + } + } + else { + $self->throw( "format error (file ".$self->file.")" ); + } + } + + my $parent = $stack[ @stack - 1 ]; + + # add a relationship if the line isn't the one with the root term + # of the ontology (which is also the name of the ontology) + if ( index($line,'$') != 0 ) { + if ( $line !~ /^\s*[<%]/ ) { + $self->throw( "format error (file ".$self->file.")" ); + } + my $reltype = ($line =~ /^\s*_part_of_relationship() : + $self->_is_a_relationship(); + $self->_add_relationship( $parent, $current_term, $reltype, + $ont); + } + + $prev_spaces = $current_spaces; + + $prev_term = $current_term; + + } + return $ont; +} # _parse_relationships_file + + + +# Parses the 1st term id number out of line. +sub _get_first_termid { + my ( $self, $line ) = @_; + + if ( $line =~ /;\s*([A-Z]{1,8}:\d{3,})/ ) { + return $1; + } + else { + $self->throw( "format error: no term id in line \"$line\"" ); + } + +} # _get_first_termid + + + +# Parses the name out of line. +sub _get_name { + my ( $self, $line, $termid ) = @_; + + if ( $line =~ /([^;<%]+);\s*$termid/ ) { + my $name = $1; + # remove trailing and leading whitespace + $name =~ s/\s+$//; + $name =~ s/^\s+//; + # remove leading dollar character; also we default the name of the + # ontology to this name if preset to something else + if(index($name,'$') == 0) { + $name = substr($name,1); + # replace underscores by spaces for setting the ontology name + $self->ontology_name(join(" ",split(/_/,$name))) + unless $self->ontology_name(); + } + return $name; + } + else { + return undef; + } +} # _get_name + + +# Parses the synonyms out of line. +sub _get_synonyms { + my ( $self, $line ) = @_; + + my @synonyms = (); + + while ( $line =~ /synonym\s*:\s*([^;^<^%]+)/g ) { + my $syn = $1; + $syn =~ s/\s+$//; + $syn =~ s/^\s+//; + push( @synonyms, $syn ); + } + return @synonyms; + +} # _get_synonyms + + + +# Parses the db cross refs out of line. +sub _get_db_cross_refs { + my ( $self, $line ) = @_; + + my @refs = (); + + while ( $line =~ /;([^;^<^%^:]+:[^;^<^%^:]+)/g ) { + my $ref = $1; + if ( $ref =~ /synonym/ || $ref =~ /[A-Z]{1,8}:\d{3,}/ ) { + next; + } + $ref =~ s/\s+$//; + $ref =~ s/^\s+//; + push( @refs, $ref ); + } + return @refs; + +} + + +# Parses the secondary go ids out of a line +sub _get_secondary_termids { + my ( $self, $line ) = @_; + my @secs = (); + + while ( $line =~ /,\s*([A-Z]{1,8}:\d{3,})/g ) { + my $sec = $1; + push( @secs, $sec ); + } + return @secs; + +} # _get_secondary_termids + + + +# Parses the is a ids out of a line +sub _get_isa_termids { + my ( $self, $line ) = @_; + + my @ids = (); + + $line =~ s/[A-Z]{1,8}:\d{3,}//; + + while ( $line =~ /%[^<^,]*?([A-Z]{1,8}:\d{3,})/g ) { + push( @ids, $1 ); + } + return @ids; +} # _get_isa_termids + + + +# Parses the part of ids out of a line +sub _get_partof_termids { + my ( $self, $line ) = @_; + + my @ids = (); + + $line =~ s/[A-Z]{1,8}:\d{3,}//; + + while ( $line =~ /<[^%^,]*?([A-Z]{1,8}:\d{3,})/g ) { + push( @ids, $1 ); + } + return @ids; + + +} # _get_partof_termids + + + + +# Counts the spaces at the beginning of a line in the relationships files +sub _count_spaces { + my ( $self, $line ) = @_; + + if ( $line =~ /^(\s+)/ ) { + return length( $1 ); + } + else { + return 0; + } +} # _count_spaces + + + + +# "next" method for parsing the defintions file +sub _next_term { + my ( $self ) = @_; + + if ( ($self->_done() == TRUE) || (! $self->_defs_io())) { + return undef; + } + + my $line = ""; + my $termid = ""; + my $next_term = $self->_term(); + my $def = ""; + my $comment = ""; + my @def_refs = (); + my $isobsolete; + + while( $line = ( $self->_defs_io->_readline() ) ) { + + if ( $line !~ /\S/ + || $line =~ /^\s*!/ ) { + next; + } + elsif ( $line =~ /^\s*term:\s*(.+)/ ) { + $self->_term( $1 ); + last if $self->_not_first_record(); + $next_term = $1; + $self->_not_first_record( TRUE ); + } + elsif ( $line =~ /^\s*[a-z]{1,8}id:\s*(.+)/ ) { + $termid = $1; + } + elsif ( $line =~ /^\s*definition:\s*(.+)/ ) { + $def = $1; + $isobsolete = 1 if index($def,"OBSOLETE") == 0; + } + elsif ( $line =~ /^\s*definition_reference:\s*(.+)/ ) { + push( @def_refs, $1 ); + } + elsif ( $line =~ /^\s*comment:\s*(.+)/ ) { + $comment = $1; + } + } + $self->_done( TRUE ) unless $line; # we'll come back until done + + return $self->_create_ont_entry( $next_term, $termid, $def, + $comment, \@def_refs, $isobsolete); +} # _next_term + + + + + +# Holds the GO engine to be parsed into +sub _ont_engine { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_ont_engine" } = $value; + } + + return $self->{ "_ont_engine" }; +} # _ont_engine + + + + +# Used to create ontology terms. +# Arguments: name, id +sub _create_ont_entry { + my ( $self, $name, $termid, $def, $cmt, $dbxrefs, $obsolete ) = @_; + + if((!defined($obsolete)) && (index(lc($name),"obsolete") == 0)) { + $obsolete = 1; + } + my $term = $self->term_factory->create_object(-name => $name, + -identifier => $termid, + -definition => $def, + -comment => $cmt, + -dblinks => $dbxrefs, + -is_obsolete => $obsolete); + + return $term; +} # _create_ont_entry + + + +# Holds whether first record or not +sub _not_first_record { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_not_first_record" } = $value; + } + + return $self->{ "_not_first_record" }; +} # _not_first_record + + + +# Holds whether done or not +sub _done { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_done" } = $value; + } + + return $self->{ "_done" }; +} # _done + + +# Holds a term. +sub _term { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_term" } = $value; + } + + return $self->{ "_term" }; +} # _term + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/OntologyIO/goflat.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO/goflat.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,161 @@ +# $Id: goflat.pm,v 1.1.2.3 2003/05/27 22:00:52 lapp Exp $ +# +# BioPerl module for Bio::OntologyIO::goflat +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +goflat - a parser for the Gene Ontology flat-file format + +=head1 SYNOPSIS + + use Bio::OntologyIO; + + # do not use directly -- use via Bio::OntologyIO + my $parser = Bio::OntologyIO->new + ( -format => "go", + -defs_file => "/home/czmasek/GO/GO.defs", + -files => ["/home/czmasek/GO/component.ontology", + "/home/czmasek/GO/function.ontology", + "/home/czmasek/GO/process.ontology"] ); + + my $go_ontology = $parser->next_ontology(); + + my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" ); + +=head1 DESCRIPTION + +Needs Graph.pm from CPAN. + +This is essentially a very thin derivation of the dagflat parser. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head2 CONTRIBUTOR + + Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO::goflat; + +use vars qw( @ISA ); +use strict; + +use Bio::Ontology::TermFactory; +use Bio::OntologyIO::dagflat; + +use constant TRUE => 1; +use constant FALSE => 0; + + +@ISA = qw( Bio::OntologyIO::dagflat ); + + +=head2 new + + Title : new + Usage : $parser = Bio::OntologyIO->new( + -format => "go", + -defs_file => "/path/to/GO.defs", + -files => ["/path/to/component.ontology", + "/path/to/function.ontology", + "/path/to/process.ontology"] ); + Function: Creates a new goflat parser. + Returns : A new goflat parser object, implementing L. + Args : -defs_file => the name of the file holding the term + definitions + -files => a single ontology flat file holding the + term relationships, or an array ref holding + the file names (for GO, there will usually be + 3 files: component.ontology, function.ontology, + process.ontology) + -file => if there is only a single flat file, it may + also be specified via the -file parameter + -ontology_name => the name of the ontology; if not specified the + parser will auto-discover it by using the term + that starts with a '$', and converting underscores + to spaces + -engine => the Bio::Ontology::OntologyEngineI object + to be reused (will be created otherwise); note + that every Bio::Ontology::OntologyI will + qualify as well since that one inherits from the + former. + +=cut + +# in reality, we let OntologyIO::new do the instantiation, and override +# _initialize for all initialization work +sub _initialize { + my ($self, @args) = @_; + + $self->SUPER::_initialize( @args ); + + # default term object factory + $self->term_factory(Bio::Ontology::TermFactory->new( + -type => "Bio::Ontology::GOterm")) + unless $self->term_factory(); + +} # _initialize + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/OntologyIO/soflat.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO/soflat.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,155 @@ +# $Id: soflat.pm,v 1.1.2.3 2003/05/27 22:00:52 lapp Exp $ +# +# BioPerl module for Bio::OntologyIO::soflat +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) Hilmar Lapp, hlapp at gnf.org, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002-3. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +soflat - a parser for the Sequence Ontology flat-file format + +=head1 SYNOPSIS + + use Bio::OntologyIO; + + # do not use directly -- use via Bio::OntologyIO + my $parser = Bio::OntologyIO->new + ( -format => "so", # or soflat + -defs_file => "/home/czmasek/SO/SO.defs", + -file => "/home/czmasek/SO/sofa.ontology" ); + + my $sofa_ontology = $parser->next_ontology(); + + my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" ); + +=head1 DESCRIPTION + +Needs Graph.pm from CPAN. + +This is essentially a very thin derivation of the dagflat base-parser. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head2 CONTRIBUTOR + + Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO::soflat; + +use vars qw( @ISA ); +use strict; + +use Bio::Ontology::TermFactory; +use Bio::OntologyIO::dagflat; + +use constant TRUE => 1; +use constant FALSE => 0; + + +@ISA = qw( Bio::OntologyIO::dagflat ); + + +=head2 new + + Title : new + Usage : $parser = Bio::OntologyIO->new( + -format => "soflat", + -files => ["/path/to/sofa.ontology"] ); + Function: Creates a new soflat parser. + Returns : A new soflat parser object, implementing Bio::OntologyIO. + Args : -defs_file => the name of the file holding the term + definitions + -files => a single ontology flat file holding the + term relationships, or an array ref holding + the file names + -file => if there is only a single flat file, it may + also be specified via the -file parameter + -ontology_name => the name of the ontology; if not specified the + parser will auto-discover it by using the term + that starts with a '$', and converting underscores + to spaces + -engine => the L object + to be reused (will be created otherwise); note + that every L will + qualify as well since that one inherits from the + former. + +=cut + +# in reality, we let OntologyIO::new do the instantiation, and override +# _initialize for all initialization work +sub _initialize { + my ($self, @args) = @_; + + $self->SUPER::_initialize( @args ); + + # default term object factory + $self->term_factory(Bio::Ontology::TermFactory->new( + -type => "Bio::Ontology::GOterm")) + unless $self->term_factory(); + +} # _initialize + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Perl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Perl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,697 @@ +# $Id: Perl.pm,v 1.16.2.1 2003/03/25 12:32:15 heikki Exp $ +# +# BioPerl module for Bio::Perl +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Perl - Functional access to BioPerl for people who don't know objects + +=head1 SYNOPSIS + + use Bio::Perl; + + # will guess file format from extension + $seq_object = read_sequence($filename); + + # forces genbank format + $seq_object = read_sequence($filename,'genbank'); + + # reads an array of sequences + @seq_object_array = read_all_sequences($filename,'fasta'); + + # sequences are Bio::Seq objects, so the following methods work + # for more info see L, or do 'perldoc Bio/Seq.pm' + + print "Sequence name is ",$seq_object->display_id,"\n"; + print "Sequence acc is ",$seq_object->accession_number,"\n"; + print "First 5 bases is ",$seq_object->subseq(1,5),"\n"; + + # get the whole sequence as a single string + + $sequence_as_a_string = $seq_object->seq(); + + # writing sequences + + write_sequence(">$filename",'genbank',$seq_object); + + write_sequence(">$filename",'genbank',@seq_object_array); + + # making a new sequence from just strings you have + # from something else + + $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA", + "myname","AL12232"); + + + # getting a sequence from a database (assumes internet connection) + + $seq_object = get_sequence('swissprot',"ROA1_HUMAN"); + + $seq_object = get_sequence('embl',"AI129902"); + + $seq_object = get_sequence('genbank',"AI129902"); + + # BLAST a sequence (assummes an internet connection) + + $blast_report = blast_sequence($seq_object); + + write_blast(">blast.out",$blast_report); + + +=head1 DESCRIPTION + +Easy first time access to BioPerl via functions + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email bioperl-l@bio.perl.org + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + + +package Bio::Perl; +use vars qw(@ISA @EXPORT @EXPORT_OK $DBOKAY); +use strict; +use Carp; +use Exporter; + +use Bio::SeqIO; +use Bio::Seq; +BEGIN { + eval { + require Bio::DB::EMBL; + require Bio::DB::GenBank; + require Bio::DB::SwissProt; + require Bio::DB::RefSeq; + require Bio::DB::GenPept; + }; + if( $@ ) { + $DBOKAY = 0; + } else { + $DBOKAY = 1; + } +} + +@ISA = qw(Exporter); + +@EXPORT = qw(read_sequence read_all_sequences write_sequence + new_sequence get_sequence translate translate_as_string + reverse_complement revcom revcom_as_string + reverse_complement_as_string blast_sequence write_blast); + +@EXPORT_OK = @EXPORT; + + +=head2 read_sequence + + Title : read_sequence + Usage : $seq = read_sequence('sequences.fa') + $seq = read_sequence($filename,'genbank'); + + # pipes are fine + $seq = read_sequence("my_fetching_program $id |",'fasta'); + + Function: Reads the top sequence from the file. If no format is given, it will + try to guess the format from the filename. If a format is given, it + forces that format. The filename can be any valid perl open() string + - in particular, you can put in pipes + + Returns : A Bio::Seq object. A quick synopsis: + $seq_object->display_id - name of the sequence + $seq_object->seq - sequence as a string + + Args : Two strings, first the filename - any Perl open() string is ok + Second string is the format, which is optional + +For more information on Seq objects see L. + +=cut + +sub read_sequence{ + my ($filename,$format) = @_; + + if( !defined $filename ) { + confess "read_sequence($filename) - usage incorrect"; + } + + my $seqio; + + if( defined $format ) { + $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format); + } else { + $seqio = Bio::SeqIO->new( '-file' => $filename); + } + + my $seq = $seqio->next_seq(); + + return $seq; +} + + +=head2 read_all_sequences + + Title : read_all_sequences + Usage : @seq_object_array = read_all_sequences($filename); + @seq_object_array = read_all_sequences($filename,'genbank'); + + Function: Just as the function above, but reads all the sequences in the + file and loads them into an array. + + For very large files, you will run out of memory. When this + happens, you've got to use the SeqIO system directly (this is + not so hard! Don't worry about it!). + + Returns : array of Bio::Seq objects + + Args : two strings, first the filename (any open() string is ok) + second the format (which is optional) + +See L and L for more information + +=cut + +sub read_all_sequences{ + my ($filename,$format) = @_; + + if( !defined $filename ) { + confess "read_all_sequences($filename) - usage incorrect"; + } + + my $seqio; + + if( defined $format ) { + $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format); + } else { + $seqio = Bio::SeqIO->new( '-file' => $filename); + } + + my @seq_array; + + while( my $seq = $seqio->next_seq() ) { + push(@seq_array,$seq); + } + + return @seq_array; +} + + +=head2 write_sequence + + Title : write_sequence + Usage : write_sequence(">new_file.gb",'genbank',$seq) + write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects) + + Function: writes sequences in the specified format + + Returns : true + + Args : filename as a string, must provide an open() output file + format as a string + one or more sequence objects + + +=cut + +sub write_sequence{ + my ($filename,$format,@sequence_objects) = @_; + + if( scalar(@sequence_objects) == 0 ) { + confess("write_sequence(filename,format,sequence_object)"); + } + + my $error = 0; + my $seqname = "sequence1"; + + # catch users who haven't passed us a filename we can open + if( $filename !~ /^\>/ && $filename !~ /^|/ ) { + $filename = ">".$filename; + } + + my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format); + + foreach my $seq ( @sequence_objects ) { + my $seq_obj; + + if( !ref $seq ) { + if( length $seq > 50 ) { + # odds are this is a sequence as a string, and someone has not figured out + # how to make objects. Warn him/her and then make a sequence object + # from this + if( $error == 0 ) { + carp("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n"); + $error = 1; + } + + $seq_obj = new_sequence($seq,$seqname); + $seqname++; + } else { + confess("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?"); + } + } else { + if( !$seq->isa("Bio::SeqI") ) { + confess("object [$seq] is not a Bio::Seq object; can't write it out"); + } + $seq_obj = $seq; + } + + # finally... we get to write out the sequence! + $seqio->write_seq($seq_obj); + } + 1; +} + +=head2 new_sequence + + Title : new_sequence + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub new_sequence{ + my ($seq,$name,$accession) = @_; + + if( !defined $seq ) { + confess("new_sequence(sequence_as_string) usage"); + } + + $name ||= "no-name-for-sequence"; + + my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name); + + $accession && $seq_object->accession_number($accession); + + return $seq_object; +} + +=head2 blast_sequence + + Title : blast_sequence + Usage : $blast_result = blast_sequence($seq) + $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE'); + + Function: If the computer has Internet accessibility, blasts + the sequence using the NCBI BLAST server against nrdb. + + It choose the flavour of BLAST on the basis of the sequence. + + This function uses Bio::Tools::Run::RemoteBlast, which itself + use Bio::SearchIO - as soon as you want to more, check out + these modules + Returns : Bio::Search::Result::GenericResult.pm + + Args : Either a string of protein letters or nucleotides, or a + Bio::Seq object + +=cut + +sub blast_sequence { + my ($seq,$verbose) = shift; + + if( !defined $verbose ) { + $verbose = 1; + } + + if( !ref $seq ) { + $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id'); + } elsif ( !$seq->isa('Bio::PrimarySeqI') ) { + croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted"); + } + + require Bio::Tools::Run::RemoteBlast; + + my $prog = 'blastp'; + my $e_val= '1e-10'; + + my @params = ( '-prog' => $prog, + '-expect' => $e_val, + '-readmethod' => 'SearchIO' ); + + my $factory = Bio::Tools::Run::RemoteBlast->new(@params); + + my $r = $factory->submit_blast($seq); + if( $verbose ) { + print STDERR "Submitted Blast for [".$seq->id."] "; + } + sleep 5; + + my $result; + + LOOP : + while( my @rids = $factory->each_rid) { + foreach my $rid ( @rids ) { + my $rc = $factory->retrieve_blast($rid); + if( !ref($rc) ) { + if( $rc < 0 ) { + $factory->remove_rid($rid); + } + if( $verbose ) { + print STDERR "."; + } + sleep 10; + } else { + $result = $rc->next_result(); + $factory->remove_rid($rid); + last LOOP; + } + } + } + + if( $verbose ) { + print STDERR "\n"; + } + return $result; +} + +=head2 write_blast + + Title : write_blast + Usage : write_blast($filename,$blast_report); + + Function: Writes a BLAST result object (or more formally + a SearchIO result object) out to a filename + in BLAST-like format + + Returns : none + + Args : filename as a string + Bio::SearchIO::Results object + +=cut + +sub write_blast { + my ($filename,$blast) = @_; + + if( $filename !~ /^\>/ && $filename !~ /^|/ ) { + $filename = ">".$filename; + } + + my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename); + + $output->write_result($blast); + +} + +=head2 get_sequence + + Title : get_sequence + Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN"); + + Function: If the computer has Internet accessibility, gets + the sequence from Internet accessible databases. Currently + this supports Swissprot, EMBL, GenBank and RefSeq. + + Swissprot and EMBL are more robust than GenBank fetching. + + If the user is trying to retrieve a RefSeq entry from + GenBank/EMBL, the query is silently redirected. + + Returns : A Bio::Seq object + + Args : database type - one of swiss, embl, genbank or refseq + identifier or accession number + +=cut + +my $genbank_db = undef; +my $genpept_db = undef; +my $embl_db = undef; +my $swiss_db = undef; +my $refseq_db = undef; + +sub get_sequence{ + my ($db_type,$identifier) = @_; + if( ! $DBOKAY ) { + confess("Your system does not have IO::String installed so the DB retrieval method is not available"); + return; + } + $db_type = lc($db_type); + + my $db; + + if( $db_type =~ /genbank/ ) { + if( !defined $genbank_db ) { + $genbank_db = Bio::DB::GenBank->new(); + } + $db = $genbank_db; + } + if( $db_type =~ /genpept/ ) { + if( !defined $genpept_db ) { + $genpept_db = Bio::DB::GenPept->new(); + } + $db = $genpept_db; + } + + if( $db_type =~ /swiss/ ) { + if( !defined $swiss_db ) { + $swiss_db = Bio::DB::SwissProt->new(); + } + $db = $swiss_db; + } + + if( $db_type =~ /embl/ ) { + if( !defined $embl_db ) { + $embl_db = Bio::DB::EMBL->new(); + } + $db = $embl_db; + } + + if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and + $identifier =~ /^\s*N\S+_/)) { + if( !defined $refseq_db ) { + $refseq_db = Bio::DB::RefSeq->new(); + } + $db = $refseq_db; + } + + my $seq; + + if( $identifier =~ /^\w+\d+$/ ) { + $seq = $db->get_Seq_by_acc($identifier); + } else { + $seq = $db->get_Seq_by_id($identifier); + } + + return $seq; +} + + +=head2 translate + + Title : translate + Usage : $seqobj = translate($seq_or_string_scalar) + + Function: translates a DNA sequence object OR just a plain + string of DNA to amino acids + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub translate { + my ($scalar) = shift; + + my $obj; + + if( ref $scalar ) { + if( !$scalar->isa("Bio::PrimarySeqI") ) { + confess("Expecting a sequence object not a $scalar"); + } else { + $obj= $scalar; + + } + + } else { + + # check this looks vaguely like DNA + my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); + + if( $n < length($scalar) * 0.85 ) { + confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); + } + + $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); + } + + return $obj->translate(); +} + + +=head2 translate_as_string + + Title : translate_as_string + Usage : $seqstring = translate_as_string($seq_or_string_scalar) + + Function: translates a DNA sequence object OR just a plain + string of DNA to amino acids + Returns : A stirng of just amino acids + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub translate_as_string { + my ($scalar) = shift; + + my $obj = Bio::Perl::translate($scalar); + + return $obj->seq; +} + + +=head2 reverse_complement + + Title : reverse_complement + Usage : $seqobj = reverse_complement($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a Bio::Seq - if you want a string, you + can use reverse_complement_as_string + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub reverse_complement { + my ($scalar) = shift; + + my $obj; + + if( ref $scalar ) { + if( !$scalar->isa("Bio::PrimarySeqI") ) { + confess("Expecting a sequence object not a $scalar"); + } else { + $obj= $scalar; + + } + + } else { + + # check this looks vaguely like DNA + my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); + + if( $n < length($scalar) * 0.85 ) { + confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); + } + + $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); + } + + return $obj->revcom(); +} + +=head2 revcom + + Title : revcom + Usage : $seqobj = revcom($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a Bio::Seq - if you want a string, you + can use reverse_complement_as_string + + This is an alias for reverse_complement + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub revcom { + return &Bio::Perl::reverse_complement(@_); +} + + +=head2 reverse_complement_as_string + + Title : reverse_complement_as_string + Usage : $string = reverse_complement_as_string($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a string + Returns : A string of DNA letters + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub reverse_complement_as_string { + my ($scalar) = shift; + + my $obj = &Bio::Perl::reverse_complement($scalar); + + return $obj->seq; +} + + +=head2 revcom_as_string + + Title : revcom_as_string + Usage : $string = revcom_as_string($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a string + Returns : A string of DNA letters + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub revcom_as_string { + my ($scalar) = shift; + + my $obj = &Bio::Perl::reverse_complement($scalar); + + return $obj->seq; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/Correlate.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/Correlate.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,368 @@ +# $Id: Correlate.pm,v 1.5 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::Correlate +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +Correlate - Representation of a correlating phenotype in a given species + +=head1 SYNOPSIS + + use Bio::Phenotype::Correlate; + + $co = Bio::Phenotype::Correlate->new( -name => "4(Tas1r3)", + -description => "mouse correlate of human phenotype MIM 605865", + -species => $mouse, + -type => "homolog", + -comment => "type=homolog is putative" ); + + print $co->name(); + print $co->description(); + print $co->species()->binomial(); + print $co->type(); + print $co->comment(); + + print $co->to_string(); + +=head1 DESCRIPTION + +This class models correlating phenotypes. +Its creation was inspired by the OMIM database where many human phenotypes +have a correlating mouse phenotype. Therefore, this class is intended +to be used together with a phenotype class. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + +package Bio::Phenotype::Correlate; +use vars qw( @ISA ); +use strict; +use Bio::Root::Object; +use Bio::Species; + +@ISA = qw( Bio::Root::Root ); + + +=head2 new + + Title : new + Usage : $co = Bio::Phenotype::Correlate->new( -name => "4(Tas1r3)", + -description => "mouse correlate of human phenotype MIM 605865", + -species => $mouse, + -type => "homolog", + -comment => "type=homolog is putative" ); + Function: Creates a new Correlate object. + Returns : A new Correlate object. + Args : -name => a name or id + -description => a description + -species => the species of this correlating phenotype [Bio::Species] + -type => the type of correlation + -comment => a comment + +=cut + +sub new { + + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $name, $desc, $species, $type, $comment ) + = $self->_rearrange( [ qw( NAME + DESCRIPTION + SPECIES + TYPE + COMMENT ) ], @args ); + + $self->init(); + + $name && $self->name( $name ); + $desc && $self->description( $desc ); + $species && $self->species( $species ); + $type && $self->type( $type ); + $comment && $self->comment( $comment ); + + return $self; + +} # new + + + + +=head2 init + + Title : init() + Usage : $co->init(); + Function: Initializes this Correlate to all "". + Returns : + Args : + +=cut + +sub init { + + my( $self ) = @_; + + $self->name( "" ); + $self->description( "" ); + my $species = Bio::Species->new(); + $species->classification( qw( species Undetermined ) ); + $self->species( $species ); + $self->type( "" ); + $self->comment( "" ); + +} # init + + + + +=head2 name + + Title : name + Usage : $co->name( "4(Tas1r3)" ); + or + print $co->name(); + Function: Set/get for the name or id of this Correlate. + Returns : The name or id of this Correlate. + Args : The name or id of this Correlate (optional). + +=cut + +sub name { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_name" } = $value; + } + + return $self->{ "_name" }; + +} # name + + + + +=head2 description + + Title : description + Usage : $co->description( "mouse correlate of human phenotype MIM 03923" ); + or + print $co->description(); + Function: Set/get for the description of this Correlate. + Returns : A description of this Correlate. + Args : A description of this Correlate (optional). + +=cut + +sub description { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_description" } = $value; + } + + return $self->{ "_description" }; + +} # description + + + + +=head2 species + + Title : species + Usage : $co->species( $species ); + or + $species = $co->species(); + Function: Set/get for the species of this Correlate. + Returns : The Bio::Species of this Correlate [Bio::Species]. + Args : The Bio::Species of this Correlate [Bio::Species] (optional). + +=cut + +sub species { + + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->_check_ref_type( $value, "Bio::Species" ); + $self->{ "_species" } = $value; + } + + return $self->{ "_species" }; + +} # species + + + + +=head2 type + + Title : type + Usage : $co->type( "homolog" ); + or + print $co->type(); + Function: Set/get for the type of this Correlate. + Returns : The type of this Correlate. + Args : The type of this Correlate (optional). + +=cut + +sub type { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_type" } = $value; + } + + return $self->{ "_type" }; + +} # type + + + + +=head2 comment + + Title : comment + Usage : $co->comment( "doubtful" ); + or + print $co->comment(); + Function: Set/get for an arbitrary comment about this Correlate. + Returns : A comment. + Args : A comment (optional). + +=cut + +sub comment { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_comment" } = $value; + } + + return $self->{ "_comment" }; + +} # comment + + + +=head2 to_string + + Title : to_string() + Usage : print $co->to_string(); + Function: To string method for Correlate objects. + Returns : A string representations of this Correlate. + Args : + +=cut + +sub to_string { + + my ( $self ) = @_; + + my $s = ""; + + $s .= "-- Name:\n"; + $s .= $self->name()."\n"; + $s .= "-- Description:\n"; + $s .= $self->description()."\n"; + $s .= "-- Species:\n"; + $s .= $self->species()->binomial()."\n"; + $s .= "-- Type of correlation:\n"; + $s .= $self->type()."\n"; + $s .= "-- Comment:\n"; + $s .= $self->comment(); + + return $s; + +} # to_string + + + + +# Title : _check_ref_type +# Function: Checks for the correct type. +# Returns : +# Args : The value to be checked, the expected class. +sub _check_ref_type { + my ( $self, $value, $expected_class ) = @_; + + if ( ! defined( $value ) ) { + $self->throw( ( caller( 1 ) )[ 3 ] .": Found [undef" + ."] where [$expected_class] expected" ); + } + elsif ( ! ref( $value ) ) { + $self->throw( ( caller( 1 ) )[ 3 ] .": Found scalar" + ." where [$expected_class] expected" ); + } + elsif ( ! $value->isa( $expected_class ) ) { + $self->throw( ( caller( 1 ) )[ 3 ] .": Found [". ref( $value ) + ."] where [$expected_class] expected" ); + } +} # _check_ref_type + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/Measure.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/Measure.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,368 @@ +# $Id: Measure.pm,v 1.5 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::Measure +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +Measure - Representation of context/value(-range)/unit triplets + +=head1 SYNOPSIS + + use Bio::Phenotype::Measure; + + my $measure = Bio::Phenotype::Measure->new( -context => "length", + -description => "reduced length in 4(Tas1r3)", + -start => 0, + -end => 15, + -unit => "mm", + -comment => "see also Miller et al" ); + + print $measure->context(); + print $measure->description(); + print $measure->start(); + print $measure->end(); + print $measure->unit(); + print $measure->comment(); + + print $measure->to_string(); + +=head1 DESCRIPTION + +Measure is for biochemically defined phenotypes or any other types of measures. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + +package Bio::Phenotype::Measure; +use vars qw( @ISA ); +use strict; +use Bio::Root::Object; + +@ISA = qw( Bio::Root::Root ); + + +=head2 new + + Title : new + Usage : my $me = Bio::Phenotype::Measure->new( -context => "length", + -description => "reduced length in 4(Tas1r3)", + -start => 0, + -end => 15, + -unit => "mm", + -comment => "see Miller also et al" ); + Function: Creates a new Measure object. + Returns : A new Measure object. + Args : -context => the context + -description => a description + -start => the start value + -end => the end value + -unit => the unit + -comment => a comment + +=cut + +sub new { + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $con, $desc, $start, $end, $unit, $comment ) + = $self->_rearrange( [ qw( CONTEXT + DESCRIPTION + START + END + UNIT + COMMENT ) ], @args ); + + $self->init(); + + $con && $self->context( $con ); + $desc && $self->description( $desc ); + $start && $self->start( $start ); + $end && $self->end( $end ); + $unit && $self->unit( $unit ); + $comment && $self->comment( $comment ); + + return $self; + +} # new + + + + +=head2 init + + Title : init() + Usage : $measure->init(); + Function: Initializes this Measure to all "". + Returns : + Args : + +=cut + +sub init { + my( $self ) = @_; + + $self->context( "" ); + $self->description( "" ); + $self->start( "" ); + $self->end( "" ); + $self->unit( "" ); + $self->comment( "" ); + +} # init + + + + +=head2 context + + Title : context + Usage : $measure->context( "Ca-conc" ); + or + print $measure->context(); + Function: Set/get for the context of this Measure. + Returns : The context. + Args : The context (optional). + +=cut + +sub context { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_context" } = $value; + } + + return $self->{ "_context" }; + +} # context + + + + +=head2 description + + Title : description + Usage : $measure->description( "reduced in 4(Tas1r3)" ); + or + print $measure->description(); + Function: Set/get for the description of this Measure. + Returns : A description. + Args : A description (optional). + +=cut + +sub description { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_description" } = $value; + } + + return $self->{ "_description" }; + +} # description + + + + +=head2 start + + Title : start + Usage : $measure->start( 330 ); + or + print $measure->start(); + Function: Set/get for the start value of this Measure. + Returns : The start value. + Args : The start value (optional). + +=cut + +sub start { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_start" } = $value; + } + + return $self->{ "_start" }; + +} # start + + + + +=head2 end + + Title : end + Usage : $measure->end( 459 ); + or + print $measure->end(); + Function: Set/get for the end value of this Measure. + Returns : The end value. + Args : The end value (optional). + +=cut + +sub end { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_end" } = $value; + } + + return $self->{ "_end" }; + +} # end + + + + +=head2 unit + + Title : unit + Usage : $measure->unit( "mM" ); + or + print $measure->unit(); + Function: Set/get for the unit of this Measure. + Returns : The unit. + Args : The unit (optional). + +=cut + +sub unit { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_unit" } = $value; + } + + return $self->{ "_unit" }; + +} # unit + + + + +=head2 comment + + Title : comment + Usage : $measure->comment( "see also Miller et al" ); + or + print $measure->comment(); + Function: Set/get for an arbitrary comment about this Measure. + Returns : A comment. + Args : A comment (optional). + +=cut + +sub comment { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_comment" } = $value; + } + + return $self->{ "_comment" }; + +} # comment + + + + +=head2 to_string + + Title : to_string() + Usage : print $measure->to_string(); + Function: To string method for Measure objects. + Returns : A string representations of this Measure. + Args : + +=cut + +sub to_string { + my ( $self ) = @_; + + my $s = ""; + + $s .= "-- Context:\n"; + $s .= $self->context()."\n"; + $s .= "-- Description:\n"; + $s .= $self->description()."\n"; + $s .= "-- Start:\n"; + $s .= $self->start()."\n"; + $s .= "-- End:\n"; + $s .= $self->end()."\n"; + $s .= "-- Unit:\n"; + $s .= $self->unit()."\n"; + $s .= "-- Comment:\n"; + $s .= $self->comment(); + + return $s; + +} # to_string + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/OMIM/MiniMIMentry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/OMIM/MiniMIMentry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,299 @@ +# $Id: MiniMIMentry.pm,v 1.5 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::OMIM::MiniMIMentry +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +MiniMIMentry - Representation of a Mini MIM entry + +=head1 SYNOPSIS + + use Bio::Phenotype::OMIM::MiniMIMentry; + + $mm = Bio::Phenotype::OMIM::MiniMIMentry->new( -description => "The central form of ...", + -created => "Victor A. McKusick: 6/4/1986", + -contributors => "Kelly A. Przylepa - revised: 03/18/2002", + -edited => "alopez: 06/03/1997" ); + + +=head1 DESCRIPTION + +This class representats of Mini MIM entries. +This class is intended to be used together with a OMIM entry class. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + +package Bio::Phenotype::OMIM::MiniMIMentry; +use vars qw( @ISA ); +use strict; +use Bio::Root::Object; + +@ISA = qw( Bio::Root::Root ); + + + + +=head2 new + + Title : new + Usage : $mm = Bio::Phenotype::OMIM::MiniMIMentry->new( -description => "The central form of ...", + -created => "Victor A. McKusick: 6/4/1986", + -contributors => "Kelly A. Przylepa - revised: 03/18/2002", + -edited => "alopez: 06/03/1997" ); + + Function: Creates a new MiniMIMentry object. + Returns : A new MiniMIMentry object. + Args : -description => a description + -created => name(s) and date(s) (free form) + -contributors => name(s) and date(s) (free form) + -edited => name(s) and date(s) (free form) + +=cut + +sub new { + + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $desc, $created, $contributors, $edited ) + = $self->_rearrange( [ qw( DESCRIPTION + CREATED + CONTRIBUTORS + EDITED ) ], @args ); + + $self->init(); + + $desc && $self->description( $desc ); + $created && $self->created( $created ); + $contributors && $self->contributors( $contributors ); + $edited && $self->edited( $edited ); + + return $self; + +} # new + + + + +=head2 init + + Title : init() + Usage : $mm->init(); + Function: Initializes this MiniMIMentry to all "". + Returns : + Args : + +=cut + +sub init { + + my( $self ) = @_; + + $self->description( "" ); + $self->created( "" ); + $self->contributors( "" ); + $self->edited( "" ); + + +} # init + + + + +=head2 description + + Title : description + Usage : $mm->description( "The central form of ..." ); + or + print $mm->description(); + Function: Set/get for the description field of the Mini MIM database. + Returns : The description. + Args : The description (optional). + +=cut + +sub description { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_description" } = $value; + } + + return $self->{ "_description" }; + +} # description + + + + +=head2 created + + Title : created + Usage : $mm->created( "Victor A. McKusick: 6/4/1986" ); + or + print $mm->created(); + Function: Set/get for the created field of the Mini MIM database. + Returns : Name(s) and date(s) [scalar - free form]. + Args : Name(s) and date(s) [scalar - free form] (optional). + +=cut + +sub created { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_created" } = $value; + } + + return $self->{ "_created" }; + +} # created + + + + +=head2 contributors + + Title : contributors + Usage : $mm->contributors( "Kelly A. Przylepa - revised: 03/18/2002" ); + or + print $mm->contributors(); + Function: Set/get for the contributors field of the Mini MIM database. + Returns : Name(s) and date(s) [scalar - free form]. + Args : Name(s) and date(s) [scalar - free form] (optional). + +=cut + +sub contributors { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_contributors" } = $value; + } + + return $self->{ "_contributors" }; + +} # contributors + + + + +=head2 edited + + Title : edited + Usage : $mm->edited( "alopez: 06/03/1997" ); + or + print $mm->edited(); + Function: Set/get for the edited field of the Mini MIM database. + Returns : Name(s) and date(s) [scalar - free form]. + Args : Name(s) and date(s) [scalar - free form] (optional). + +=cut + +sub edited { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_edited" } = $value; + } + + return $self->{ "_edited" }; + +} # edited + + + + +=head2 to_string + + Title : to_string() + Usage : print $mm->to_string(); + Function: To string method for MiniMIMentry objects. + Returns : A string representations of this MiniMIMentry. + Args : + +=cut + +sub to_string { + my ( $self ) = @_; + + my $s = ""; + + $s .= "-- Description:\n"; + $s .= $self->description()."\n"; + $s .= "-- Created:\n"; + $s .= $self->created()."\n"; + $s .= "-- Contributors:\n"; + $s .= $self->contributors()."\n"; + $s .= "-- Edited:\n"; + $s .= $self->edited(); + + return $s; + +} # to_string + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/OMIM/OMIMentry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/OMIM/OMIMentry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,801 @@ +# $Id: OMIMentry.pm,v 1.8 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::OMIM::OMIMentry +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +OMIMentry - represents OMIM (Online Mendelian Inheritance in Man) database entries + +=head1 SYNOPSIS + + $obj = Bio::Phenotype::OMIM::OMIMentry->new( -mim_number => 200000, + -description => "This is ...", + -more_than_two_genes => 1 ); + +=head1 DESCRIPTION + +Inherits from Bio::Phenotype::PhenotypeI. +Bio::Phenotype::OMIM::OMIMparser parses the flat file representation +of OMIM (i.e. files "omim.txt" and "genemap") returning OMIMentry objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Phenotype::OMIM::OMIMentry; +use vars qw( @ISA ); +use strict; + +use Bio::Phenotype::Phenotype; +use Bio::Phenotype::OMIM::MiniMIMentry; +use Bio::Phenotype::OMIM::OMIMentryAllelicVariant; + +use constant TRUE => 1; +use constant FALSE => 0; +use constant DEFAULT_MIM_NUMER => 0; + +@ISA = qw( Bio::Phenotype::Phenotype ); + + + + +=head2 new + + Title : new + Usage : $obj = Bio::Phenotype::OMIM::OMIMentry->new( -mim_number => 200000, + -description => "This is ...", + -more_than_two_genes => 1 ); + Function: Creates a new OMIMentry object. + Returns : A new OMIMentry object. + Args : -mim_number => the MIM number + -title => the title or name + -alternative_titles_and_symbols => the "alternative titles and symbols" + -more_than_two_genes => can phenotype can be caused by mutation in any of two or more genes? + -is_separate => is this phenotype separate from those represented by other entries + -description => the description of this phenotype + -mapping_method => the mapping method + -gene_status => the gene status of this + -comment => a comment + -species => ref to the the species (human) + -created => created by whom/when + -edited => edited by whom/when + -contributors => contributed by whom/when + -additional_references => "see also" + -clinical_symptoms => the clinical symptoms + -minimim => the Mini MIM associated with this OMIM antry + +=cut + +sub new { + + my( $class,@args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $mim_number, + $title, + $alternative_titles_and_symbols, + $more_than_two_genes, + $is_separate, + $description, + $mapping_method, + $gene_status, + $comment, + $species, + $created, + $edited, + $contributors, + $additional_references, + $clinical_symptoms, + $miniMIM ) + = $self->_rearrange( [ qw( MIM_NUMBER + TITLE + ALTERNATIVE_TITLES_AND_SYMBOLS + MORE_THAN_TWO_GENES + IS_SEPARATE + DESCRIPTION + MAPPING_METHOD + GENE_STATUS + COMMENT + SPECIES + CREATED + EDITED + CONTRIBUTORS + ADDITIONAL_REFERENCES + CLINICAL_SYMPTOMS + MINIMIM ) ], @args ); + + $self->init(); + + $mim_number && $self->MIM_number( $mim_number ); + $title && $self->title( $title ); + $alternative_titles_and_symbols && $self->alternative_titles_and_symbols( $alternative_titles_and_symbols ); + $more_than_two_genes && $self->more_than_two_genes( $more_than_two_genes ); + $is_separate && $self->is_separate( $is_separate ); + $description && $self->description( $description ); + $mapping_method && $self->mapping_method( $mapping_method ); + $gene_status && $self->gene_status( $gene_status ); + $comment && $self->comment( $comment ); + $species && $self->species( $species ); + $created && $self->created( $created ); + $edited && $self->edited( $edited ); + $contributors && $self->contributors( $contributors ); + $additional_references && $self->additional_references( $additional_references ); + $clinical_symptoms && $self->clinical_symptoms( $clinical_symptoms ); + $miniMIM && $self->miniMIM( $miniMIM ); + + return $self; + +} # new + + + +=head2 init + + Title : init() + Usage : $obj->init(); + Function: Initializes this OMIMentry to all "" and empty lists. + Returns : + Args : + +=cut + +sub init { + + my( $self ) = @_; + + $self->MIM_number( DEFAULT_MIM_NUMER ); + $self->title( "" ); + $self->alternative_titles_and_symbols( "" ); + $self->more_than_two_genes( FALSE ); + $self->is_separate( FALSE ); + $self->description( "" ); + $self->mapping_method( "" ); + $self->gene_status( "" ); + $self->comment( "" ); + my $species = Bio::Species->new(); + $species->classification( qw( sapiens Homo ) ); + $self->species( $species ); + $self->created( "" ); + $self->edited( "" ); + $self->contributors( "" ); + $self->additional_references( "" ); + $self->clinical_symptoms( "" ); + $self->remove_Correlates(); + $self->remove_References(); + $self->remove_AllelicVariants(); + $self->remove_CytoPositions(); + $self->remove_gene_symbols(); + $self->remove_Genotypes(); + $self->remove_DBLinks(); + $self->remove_keywords(); + $self->remove_Variants(); + $self->remove_Measures(); + $self->miniMIM( Bio::Phenotype::OMIM::MiniMIMentry->new() ); + +} # init + + + +sub to_string { + + my( $self ) = @_; + + my $s = ""; + + $s .= "-- MIM number:\n"; + $s .= $self->MIM_number()."\n\n"; + $s .= "-- Title:\n"; + $s .= $self->title()."\n\n"; + $s .= "-- Alternative Titles and Symbols:\n"; + $s .= $self->alternative_titles_and_symbols()."\n\n"; + $s .= "-- Can be caused by Mutation in any of two or more Genes:\n"; + $s .= $self->more_than_two_genes()."\n\n"; + $s .= "-- Phenotype is separate:\n"; + $s .= $self->is_separate()."\n\n"; + $s .= "-- Description:\n"; + $s .= $self->description()."\n\n"; + $s .= "-- Species:\n"; + $s .= $self->species()->binomial()."\n\n"; + $s .= "-- Clinical Symptoms:\n"; + $s .= $self->clinical_symptoms()."\n\n"; + $s .= "-- Allelic Variants:\n"; + $s .= $self->_array_to_string( $self->each_AllelicVariant() )."\n"; + $s .= "-- Cyto Positions:\n"; + $s .= $self->_array_to_string( $self->each_CytoPosition() )."\n"; + $s .= "-- Gene Symbols:\n"; + $s .= $self->_array_to_string( $self->each_gene_symbol() )."\n"; + $s .= "-- Correlates:\n"; + $s .= $self->_array_to_string( $self->each_Correlate() )."\n"; + $s .= "-- References:\n"; + $s .= $self->_array_to_string( $self->each_Reference() )."\n"; + $s .= "-- Additional References:\n"; + $s .= $self->additional_references()."\n\n"; + $s .= "-- Mapping Method:\n"; + $s .= $self->mapping_method()."\n\n"; + $s .= "-- Gene status:\n"; + $s .= $self->gene_status()."\n\n"; + $s .= "-- Created:\n"; + $s .= $self->created()."\n\n"; + $s .= "-- Contributors:\n"; + $s .= $self->contributors()."\n\n"; + $s .= "-- Edited:\n"; + $s .= $self->edited()."\n\n"; + $s .= "-- Comment:\n"; + $s .= $self->comment()."\n\n"; + $s .= "-- MiniMIM:\n"; + $s .= $self->miniMIM()->to_string()."\n\n"; + return $s; + + +} # to_string + + + +=head2 MIM_number + + Title : MIM_number + Usage : $omim->MIM_number( "100050" ); + or + print $omim->MIM_number(); + Function: Set/get for the MIM number of this OMIM entry. + Returns : The MIM number [an integer larger than 100000]. + Args : The MIM number [an integer larger than 100000] (optional). + +=cut + +sub MIM_number { + my ( $self, $value ) = @_; + + if ( defined $value ) { + if ( $value =~ /\D/ + || ( $value < 100000 && $value != DEFAULT_MIM_NUMER ) ) { + $self->throw( "Found [$value]" + . " where [integer larger than 100000] expected" ); + } + $self->{ "_MIM_number" } = $value; + } + + return $self->{ "_MIM_number" }; + +} # MIM_number + + + + +=head2 title + + Title : title + Usage : $omim->title( "AARSKOG SYNDROME" ); + or + print $omim->title(); + Function: Set/get for the title or name of this OMIM entry. + This method is an alias to the method "name" of + Bio::Phenotype::PhenotypeI. + Returns : The title [scalar]. + Args : The title [scalar] (optional). + +=cut + +sub title { + my ( $self, $value ) = @_; + + $self->name( $value ); + +} # title + + + + +=head2 alternative_titles_and_symbols + + Title : alternative_titles_and_symbols + Usage : $omim->alternative_titles_and_symbols( "AORTIC ANEURYSM, ABDOMINAL" ); + or + print $omim->alternative_titles_and_symbols(); + Function: Set/get for the "alternative titles and symbols" of this OMIM entry. + Currently, everything after the first line of title (TI) field is + considered "alternative titles and symbols". + Returns : "alternative titles and symbols" [scalar]. + Args : "alternative titles and symbols" [scalar] (optional). + +=cut + +sub alternative_titles_and_symbols { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_alternative_titles_and_symbols" } = $value; + } + + return $self->{ "_alternative_titles_and_symbols" }; + +} # alternative_titles_and_symbols + + + + +=head2 more_than_two_genes + + Title : more_than_two_genes + Usage : $omim->more_than_two_genes( 1 ); + or + print $omim->more_than_two_genes(); + Function: This is true if this phenotype can be caused + by mutation in any of two or more genes. + In OMIM, this is indicated by a number symbol (#) + before an entry number (e.g. #114480 -- BREAST CANCER). + Returns : [1 or 0]. + Args : [1 or 0] (optional). + +=cut + +sub more_than_two_genes { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->_is_true_or_false( $value ); + $self->{ "_more_than_two_genes" } = $value; + } + + return $self->{ "_more_than_two_genes" }; + +} # more_than_two_genes + + + + +=head2 is_separate + + Title : is_separate + Usage : $omim->is_separate( 1 ); + or + print $omim->is_separate(); + Function: This is true if the phenotype determined by the gene at + the given locus is separate from those represented by + other entries where "is_separate" is true and if the mode + of inheritance of the phenotype has been proved + (in the judgment of the authors and editors). + In OMIM, this is indicated by a asterisk (*) + before an entry number (e.g. *113705 BREAST CANCER, + TYPE 1; BRCA1). + Returns : [1 or 0]. + Args : [1 or 0] (optional). + +=cut + +sub is_separate { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->_is_true_or_false( $value ); + $self->{ "_is_separate" } = $value; + } + + return $self->{ "_is_separate" }; + +} # is_separate + + + + +=head2 mapping_method + + Title : mapping_method + Usage : $omim->mapping_method( "PCR of somatic cell hybrid DNA" ); + or + print $omim->mapping_method(); + Function: Set/get for the mapping method of this OMIM entry. + Returns : The mapping method [scalar]. + Args : The mapping method [scalar] (optional). + +=cut + +sub mapping_method { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_mapping_method" } = $value; + } + + return $self->{ "_mapping_method" }; + +} # mapping_method + + + + +=head2 gene_status + + Title : gene_status + Usage : $omim->gene_status( "C" ); + or + print $omim->gene_status(); + Function: Set/get for the gene status of this OMIM entry. + The certainty with which assignment of loci to chromosomes or the linkage + between two loci has been established has been graded into the following + classes: + C = confirmed - observed in at least two laboratories or in several families. + P = provisional - based on evidence from one laboratory or one family. + I = inconsistent - results of different laboratories disagree. + L = limbo - evidence not as strong as that provisional, but included for + heuristic reasons. (Same as `tentative'.) + + Returns : [C, P, I, or L]. + Args : [C, P, I, or L] (optional). + +=cut + +sub gene_status { + my ( $self, $value ) = @_; + + if ( defined $value ) { + #unless ( $value eq "C" + # || $value eq "P" + # || $value eq "I" + # || $value eq "L" + # || $value eq "A" # !? + # || $value eq "H" # !? + # || $value eq "U" # !? + # || $value eq "" ) { + # $self->throw( "Found [$value]" + # . " where [C, P, I, or L] expected" ); + #} + unless ( $value eq "C" + || $value eq "P" + || $value eq "I" + || $value eq "L" + || $value eq "" ) { + $value = ""; + } + + $self->{ "_gene_status" } = $value; + } + + return $self->{ "_gene_status" }; + +} # gene_status + + + + +=head2 clinical_symptoms + + Title : clinical_symptoms + Usage : $omim->clinical_symptoms( "Patients with ..." ); + or + print $omim->clinical_symptoms(); + Function: Set/get for the clinical symptoms of this OMIM entry. + Returns : The clinical symptoms [scalar]. + Args : The clinical symptoms [scalar] (optional). + +=cut + +sub clinical_symptoms { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_clinical_symptoms" } = $value; + } + + return $self->{ "_clinical_symptoms" }; + +} # clinical_symptoms + + + + + +=head2 created + + Title : created + Usage : $omim->created( "Victor A. McKusick: 6/4/1986" ); + or + print $omim->created(); + Function: Set/get for the created field of the OMIM database. + Returns : Name(s) and date(s) [scalar - free form]. + Args : Name(s) and date(s) [scalar - free form] (optional). + +=cut + +sub created { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_created" } = $value; + } + + return $self->{ "_created" }; + +} # created + + + + +=head2 contributors + + Title : contributors + Usage : $omim->contributors( "Kelly A. Przylepa - revised: 03/18/2002" ); + or + print $omim->contributors(); + Function: Set/get for the contributors field of the OMIM database. + Returns : Name(s) and date(s) [scalar - free form]. + Args : Name(s) and date(s) [scalar - free form] (optional). + +=cut + +sub contributors { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_contributors" } = $value; + } + + return $self->{ "_contributors" }; + +} # contributors + + + + +=head2 edited + + Title : edited + Usage : $omim->edited( "alopez: 06/03/1997" ); + or + print $omim->edited(); + Function: Set/get for the edited field of the OMIM database. + Returns : Name(s) and date(s) [scalar - free form]. + Args : Name(s) and date(s) [scalar - free form] (optional). + +=cut + +sub edited { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_edited" } = $value; + } + + return $self->{ "_edited" }; + +} # edited + + + + +=head2 additional_references + + Title : additional_references + Usage : $omim->additional_references( "Miller er al." ); + or + print $omim->additional_references(); + Function: Set/get for the additional references of this OMIM antry + (see also). + Returns : additional reference [scalar]. + Args : additional reference [scalar] (optional). + +=cut + +sub additional_references { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_additional_references" } = $value; + } + + return $self->{ "_additional_references" }; + +} # additional_references + + + + +=head2 miniMIM + + Title : miniMIM + Usage : $omim->miniMIM( $MM ); + or + $MM = $omim->miniMIM(); + Function: Set/get for the Mini MIM associated with this OMIM antry + (see also). + Returns : [Bio::Phenotype::OMIM::MiniMIMentry]. + Args : [Bio::Phenotype::OMIM::MiniMIMentry] (optional). + +=cut + +sub miniMIM { + + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->_check_ref_type( $value, "Bio::Phenotype::OMIM::MiniMIMentry" ); + $self->{ "_mini_mim" } = $value; + } + + return $self->{ "_mini_mim" }; +} + + + + +=head2 each_AllelicVariant + + Title : each_AllelicVariant() + Usage : @avs = $obj->each_AllelicVariant(); + Function: Returns a list of Bio::Phenotype::OMIM::OMIMentryAllelicVariant objects + associated with this OMIM entry. + Returns : A list of Bio::Phenotype::OMIM::OMIMentryAllelicVariant objects. + Args : + +=cut + +sub each_AllelicVariant { + my ( $self ) = @_; + + if ( $self->{ "_allelic_variants" } ) { + return @{ $self->{ "_allelic_variants" } }; + } + else { + return my @a = (); + } + +} # each_AllelicVariant + + +=head2 add_AllelicVariants + + Title : add_AllelicVariants + Usage : $obj->add_AllelicVariants( @avs ); + or + $obj->add_AllelicVariants( $av ); + Function: Pushes one or more OMIMentryAllelicVariant + into the list of OMIMentryAllelicVariants. + Returns : + Args : Bio::Phenotype::OMIM::OMIMentryAllelicVariant object(s). + +=cut + +sub add_AllelicVariants { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Phenotype::OMIM::OMIMentryAllelicVariant" ); + } + + push( @{ $self->{ "_allelic_variants" } }, @values ); + +} # add_AllelicVariants + + +=head2 remove_AllelicVariants + + Title : remove_AllelicVariants + Usage : $obj->remove_AllelicVariants(); + Function: Deletes (and returns) the list of OMIMentryAllelicVariant objects + associated with this OMIM entry. + Returns : A list of OMIMentryAllelicVariant objects. + Args : + +=cut + +sub remove_AllelicVariants { + my ( $self ) = @_; + + my @a = $self->each_AllelicVariant(); + $self->{ "_allelic_variants" } = []; + return @a; + +} # remove_AllelicVariants + + + + + +# Title : _array_to_string +# Function: +# Returns : +# Args : +sub _array_to_string { + my( $self, @value ) = @_; + + my $s = ""; + + for ( my $i = 0; $i < scalar( @value ); ++$i ) { + if ( ! ref( $value[ $i ] ) ) { + $s .= "#" . $i . "\n-- Value:\n" . $value[ $i ] . "\n"; + } + elsif ( $value[ $i ]->isa( "Bio::Phenotype::OMIM::OMIMentryAllelicVariant" ) + || $value[ $i ]->isa( "Bio::Phenotype::Correlate" ) ) { + $s .= "#" . $i . "\n" . ( $value[ $i ] )->to_string() . "\n"; + } + elsif ( $value[ $i ]->isa( "Bio::Annotation::Reference" ) ) { + $s .= "#".$i."\n-- Authors:\n".( $value[ $i ] )->authors()."\n"; + $s .= "-- Title:\n".( $value[ $i ] )->title()."\n"; + $s .= "-- Location:\n".( $value[ $i ] )->location()."\n"; + } + elsif ( $value[ $i ]->isa( "Bio::Map::CytoPosition" ) ) { + $s .= "#" . $i . "\n-- Value:\n" . ( $value[ $i ] )->value() . "\n"; + } + } + + return $s; + +} # _array_to_string + + +# Title :_is_true_or_false +# Function: Checks whether the argument is 1 or 0. +# Returns : +# Args : The value to be checked. +sub _is_true_or_false { + my ( $self, $value ) = @_; + unless ( $value !~ /\D/ && ( $value == TRUE || $value == FALSE ) ) { + $self->throw( "Found [" . $value + . "] where " . TRUE . " or " . FALSE . " expected" ); + } +} # _is_true_or_false + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/OMIM/OMIMentryAllelicVariant.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/OMIM/OMIMentryAllelicVariant.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,432 @@ +# $Id: OMIMentryAllelicVariant.pm,v 1.5 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::OMIM::OMIMentryAllelicVariant +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + + +=head1 NAME + +OMIMentryAllelicVariant - Representation of a allelic variant of the OMIM database + +=head1 SYNOPSIS + + use Bio::Phenotype::OMIM::OMIMentryAllelicVariant; + + $av = Bio::Phenotype::OMIM::OMIMentryAllelicVariant->new( -number => ".0001", + -title => "ALCOHOL INTOLERANCE", + -symbol => "ALDH2*2", + -description => "The ALDH2*2-encoded ...", + -aa_ori => "GLU", + -aa_mut => "LYS", + -position => 487, + -additional_mutations => "IVS4DS, G-A, +1" ); + +=head1 DESCRIPTION + +This class models the allelic variant of the OMIM database. +This class is intended to be used together with a OMIM entry class. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. + +=cut + + +# Let the code begin... + + +package Bio::Phenotype::OMIM::OMIMentryAllelicVariant; +use vars qw( @ISA ); +use strict; +use Bio::Root::Object; + +@ISA = qw( Bio::Root::Root ); + + + +=head2 new + + Title : new + Usage : $av = Bio::Phenotype::OMIM::OMIMentryAllelicVariant->new( -number => ".0001", + -title => "ALCOHOL INTOLERANCE", + -symbol => "ALDH2*2", + -description => "The ALDH2*2-encoded ...", + -aa_ori => "GLU", + -aa_mut => "LYS", + -position => 487, + -additional_mutations => "IVS4DS, G-A, +1" ); + Function: Creates a new OMIMentryAllelicVariant object. + Returns : A new OMIMentryAllelicVariant object. + Args : -number => the OMIM allelic variant number + -title => the title + -symbol => a symbol + -description => a description + -aa_ori => the original amino acid + -aa_mut => the mutated amino acid + -position => the position of the mutation + -additional_mutations => free form description of additional mutations + +=cut + +sub new { + + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $number, $title, $symbol, $desc, $ori, $mut, $pos, $am ) + = $self->_rearrange( [ qw( NUMBER + TITLE + SYMBOL + DESCRIPTION + AA_ORI + AA_MUT + POSITION + ADDITIONAL_MUTATIONS ) ], @args ); + + $self->init(); + + $number && $self->number( $number ); + $title && $self->title( $title ); + $symbol && $self->symbol( $symbol ); + $desc && $self->description( $desc ); + $ori && $self->aa_ori( $ori ); + $mut && $self->aa_mut( $mut ); + $pos && $self->position( $pos ); + $am && $self->additional_mutations( $am ); + + return $self; + +} # new + + + + +=head2 init + + Title : init() + Usage : $av->init(); + Function: Initializes this OMIMentryAllelicVariant to all "". + Returns : + Args : + +=cut + +sub init { + my( $self ) = @_; + + $self->number( "" ); + $self->title( "" ); + $self->symbol( "" ); + $self->description( "" ); + $self->aa_ori( "" ); + $self->aa_mut( "" ); + $self->position( "" ); + $self->additional_mutations( "" ); + +} # init + + + + +=head2 number + + Title : number + Usage : $av->number( ".0001" ); + or + print $av->number(); + Function: Set/get for the OMIM allelic variant number of this + OMIMentryAllelicVariant. + Returns : The OMIM allelic variant number. + Args : The OMIM allelic variant number (optional). + +=cut + +sub number { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_number" } = $value; + } + + return $self->{ "_number" }; + +} # number + + + +=head2 title + + Title : title + Usage : $av->title( "ALCOHOL INTOLERANCE" ); + or + print $av->title(); + Function: Set/get for the title of this OMIMentryAllelicVariant. + Returns : The title. + Args : The title (optional). + +=cut + +sub title { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_title" } = $value; + } + + return $self->{ "_title" }; + +} # title + + + + +=head2 symbol + + Title : symbol + Usage : $av->symbol( "ALDH2*2" ); + or + print $av->symbol(); + Function: Set/get for the symbol of this OMIMentryAllelicVariant. + Returns : A symbol. + Args : A symbol (optional). + +=cut + +sub symbol { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_symbol" } = $value; + } + + return $self->{ "_symbol" }; + +} # symbol + + + + +=head2 description + + Title : description + Usage : $av->description( "The ALDH2*2-encoded protein has a change ..." ); + or + print $av->description(); + Function: Set/get for the description of this OMIMentryAllelicVariant. + Returns : A description. + Args : A description (optional). + +=cut + +sub description { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_description" } = $value; + } + + return $self->{ "_description" }; + +} # description + + + + +=head2 aa_ori + + Title : aa_ori + Usage : $av->aa_ori( "GLU" ); + or + print $av->aa_ori(); + Function: Set/get for the original amino acid(s). + Returns : The original amino acid(s). + Args : The original amino acid(s) (optional). + +=cut + +sub aa_ori { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_aa_ori" } = $value; + } + + return $self->{ "_aa_ori" }; + +} # aa_ori + + + + +=head2 aa_mut + + Title : aa_mut + Usage : $av->aa_mut( "LYS" ); + or + print $av->aa_mut(); + Function: Set/get for the mutated amino acid(s). + Returns : The mutated amino acid(s). + Args : The mutated amino acid(s) (optional). + +=cut + +sub aa_mut { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_aa_mut" } = $value; + } + + return $self->{ "_aa_mut" }; + +} # aa_mut + + + + +=head2 position + + Title : position + Usage : $av->position( 487 ); + or + print $av->position(); + Function: Set/get for the position of the mutation. + Returns : The position of the mutation. + Args : The position of the mutation (optional). + +=cut + +sub position { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_position" } = $value; + } + + return $self->{ "_position" }; + +} # position + + + + +=head2 additional_mutations + + Title : additional_mutations + Usage : $av->additional_mutations( "1-BP DEL, 911T" ); + or + print $av->additional_mutations(); + Function: Set/get for free form description of (additional) mutation(s). + Returns : description of (additional) mutation(s). + Args : description of (additional) mutation(s) (optional). + +=cut + +sub additional_mutations { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_additional_mutations" } = $value; + } + + return $self->{ "_additional_mutations" }; + +} # additional_mutations + + + +=head2 to_string + + Title : to_string() + Usage : print $av->to_string(); + Function: To string method for OMIMentryAllelicVariant objects. + Returns : A string representations of this OMIMentryAllelicVariant. + Args : + +=cut + +sub to_string { + my( $self ) = @_; + + my $s = ""; + + $s .= "-- Number:\n"; + $s .= $self->number()."\n"; + $s .= "-- Title:\n"; + $s .= $self->title()."\n"; + $s .= "-- Symbol:\n"; + $s .= $self->symbol()."\n"; + $s .= "-- Description:\n"; + $s .= $self->description()."\n"; + $s .= "-- Original AA(s):\n"; + $s .= $self->aa_ori()."\n"; + $s .= "-- Mutated AA(s):\n"; + $s .= $self->aa_mut()."\n"; + $s .= "-- Position:\n"; + $s .= $self->position()."\n"; + $s .= "-- Additional Mutation(s):\n"; + $s .= $self->additional_mutations(); + + return $s; + +} # to_string + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/OMIM/OMIMparser.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/OMIM/OMIMparser.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,909 @@ +# $Id: OMIMparser.pm,v 1.8.2.1 2003/03/25 12:32:16 heikki Exp $ +# +# BioPerl module for Bio::Phenotype::OMIM::OMIMparser +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +OMIMparser - parser for the OMIM database + +=head1 SYNOPSIS + + use Bio::Phenotype::OMIM::OMIMparser; + + # The OMIM database is available as textfile at: + # ftp://ncbi.nlm.nih.gov/repository/OMIM/omim.txt.Z + # The genemap is available as textfile at: + # ftp://ncbi.nlm.nih.gov/repository/OMIM/genemap + + $omim_parser = Bio::Phenotype::OMIM::OMIMparser->new( -genemap => "/path/to/genemap", + -omimtext => "/path/to/omim.txt" ); + + while ( my $omim_entry = $omim_parser->next_phenotype() ) { + # This prints everything. + print( $omim_entry->to_string() ); + print "\n\n"; + + # This gets individual data (some of them object-arrays) + # (and illustrates the relevant methods of OMIMentry). + my $numb = $omim_entry->MIM_number(); # *FIELD* NO + my $title = $omim_entry->title(); # *FIELD* TI - first line + my $alt = $omim_entry->alternative_titles_and_symbols(); # *FIELD* TI - additional lines + my $mtt = $omim_entry->more_than_two_genes(); # "#" before title + my $sep = $omim_entry->is_separate(); # "*" before title + my $desc = $omim_entry->description(); # *FIELD* TX + my $mm = $omim_entry->mapping_method(); # from genemap + my $gs = $omim_entry->gene_status(); # from genemap + my $cr = $omim_entry->created(); # *FIELD* CD + my $cont = $omim_entry->contributors(); # *FIELD* CN + my $ed = $omim_entry->edited(); # *FIELD* ED + my $sa = $omim_entry->additional_references(); # *FIELD* SA + my $cs = $omim_entry->clinical_symptoms(); # *FIELD* CS + my $comm = $omim_entry->comment(); # from genemap + + my $mini_mim = $omim_entry->miniMIM(); # *FIELD* MN + # A Bio::Phenotype::OMIM::MiniMIMentry object. + # class Bio::Phenotype::OMIM::MiniMIMentry + # provides the following: + # - description() + # - created() + # - contributors() + # - edited() + # + # Prints the contents of the MINI MIM entry (most OMIM entries do + # not have MINI MIM entries, though). + print $mini_mim->description()."\n"; + print $mini_mim->created()."\n"; + print $mini_mim->contributors()."\n"; + print $mini_mim->edited()."\n"; + + my @corrs = $omim_entry->each_Correlate(); # from genemap + # Array of Bio::Phenotype::Correlate objects. + # class Bio::Phenotype::Correlate + # provides the following: + # - name() + # - description() (not used) + # - species() (always mouse) + # - type() ("OMIM mouse correlate") + # - comment() + + my @refs = $omim_entry->each_Reference(); # *FIELD* RF + # Array of Bio::Annotation::Reference objects. + + + my @avs = $omim_entry->each_AllelicVariant(); # *FIELD* AV + # Array of Bio::Phenotype::OMIM::OMIMentryAllelicVariant objects. + # class Bio::Phenotype::OMIM::OMIMentryAllelicVariant + # provides the following: + # - number (e.g ".0001" ) + # - title (e.g "ALCOHOL INTOLERANCE" ) + # - symbol (e.g "ALDH2*2" ) + # - description (e.g "The ALDH2*2-encoded protein has a change ..." ) + # - aa_ori (used if information in the form "LYS123ARG" is found) + # - aa_mut (used if information in the form "LYS123ARG" is found) + # - position (used if information in the form "LYS123ARG" is found) + # - additional_mutations (used for e.g. "1-BP DEL, 911T") + + my @cps = $omim_entry->each_CytoPosition(); # from genemap + # Array of Bio::Map::CytoPosition objects. + + my @gss = $omim_entry->each_gene_symbol(); # from genemap + # Array of strings. + + # do something ... + } + +=head1 DESCRIPTION + +This parser returns Bio::Phenotype::OMIM::OMIMentry objects +(which inherit from Bio::Phenotype::PhenotypeI). +It parses the OMIM database available as +ftp://ncbi.nlm.nih.gov/repository/OMIM/omim.txt.Z +together with (optionally) the gene map file at +ftp://ncbi.nlm.nih.gov/repository/OMIM/genemap. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Phenotype::OMIM::OMIMparser; + +use vars qw( @ISA ); +use strict; + +use Bio::Root::IO; +use Bio::Root::Root; +use Bio::Species; +use Bio::Annotation::Reference; +use Bio::Map::CytoPosition; +use Bio::Phenotype::OMIM::OMIMentry; +use Bio::Phenotype::OMIM::OMIMentryAllelicVariant; +use Bio::Phenotype::Correlate; + +@ISA = qw( Bio::Root::Root ); + + +use constant DEFAULT_STATE => 0; +use constant MIM_NUMBER_STATE => 1; +use constant TITLE_STATE => 2; +use constant TEXT_STATE => 3; +use constant MINI_MIM_TEXT_STATE => 4; +use constant ALLELIC_VARIANT_STATE => 5; +use constant SEE_ALSO_STATE => 6; +use constant REF_STATE => 7; +use constant SYMPT_STATE => 8; +use constant CONTRIBUTORS_STATE => 9; +use constant CREATED_BY_STATE => 10; +use constant EDITED_BY_STATE => 11; +use constant MINI_MIM_EDITED_BY_STATE => 12; +use constant MINI_MIM_CREATED_BY_STATE => 13; +use constant MINI_MIM_CONTRIBUTORS_STATE => 14; +use constant TRUE => 1; +use constant FALSE => 0; + + + +=head2 new + + Title : new + Usage : $omim_parser = Bio::Phenotype::OMIM::OMIMparser->new( -genemap => "/path/to/genemap", + -omimtext => "/path/to/omim.txt" ); + Function: Creates a new OMIMparser. + Returns : A new OMIMparser object. + Args : -genemap => the genemap file name (optional) + -omimtext => the omim text file name + +=cut + +sub new { + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $genemap_file_name, $omimtxt_file_name ) + = $self->_rearrange( [ qw( GENEMAP OMIMTEXT ) ], @args ); + + $self->init(); + + $genemap_file_name && $self->genemap_file_name( $genemap_file_name ); + + $omimtxt_file_name && $self->omimtxt_file_name( $omimtxt_file_name); + + return $self; +} + + + + +=head2 next_phenotype + + Title : next_phenotype() + Usage : while ( my $omim_entry = $omim_parser->next_phenotype() ) { + # do something with $omim_entry + } + Function: Returns an Bio::Phenotype::OMIM::OMIMentry or + undef once the end of the omim text file is reached. + Returns : A Bio::Phenotype::OMIM::OMIMentry. + Args : + +=cut + +sub next_phenotype { + my ( $self ) = @_; + + unless( defined( $self->_OMIM_text_file() ) ) { + $self->_no_OMIM_text_file_provided_error(); + } + + if ( $self->_done() == TRUE ) { + return undef; + } + + my $fieldtag = ""; + my $contents = ""; + my $line = ""; + my $state = DEFAULT_STATE; + my $saw_mini_min_flag = FALSE; + my %record = (); + + while( $line = ( $self->_OMIM_text_file )->_readline() ) { + if ( $line =~ /^\s*\*RECORD\*/ ) { + if ( $self->_is_not_first_record() == TRUE ) { + $self->_add_to_hash( $state, $contents,\%record ); + my $omim_entry = $self->_createOMIMentry( \%record ); + return $omim_entry; + } + else { + $self->_is_not_first_record( TRUE ); + } + + } + elsif ( $line =~ /^\s*\*FIELD\*\s*(\S+)/ ) { + $fieldtag = $1; + if ( $state != DEFAULT_STATE ) { + $self->_add_to_hash( $state, $contents,\%record ); + } + $contents = ""; + + if ( $fieldtag eq "NO" ) { + $state = MIM_NUMBER_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "TI" ) { + $state = TITLE_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "TX" ) { + $state = TEXT_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "MN" ) { + $state = MINI_MIM_TEXT_STATE; + $saw_mini_min_flag = TRUE; + } + elsif ( $fieldtag eq "AV" ) { + $state = ALLELIC_VARIANT_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "SA" ) { + $state = SEE_ALSO_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "RF" ) { + $state = REF_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "CS" ) { + $state = SYMPT_STATE; + $saw_mini_min_flag = FALSE; + } + elsif ( $fieldtag eq "CN" ) { + if ( $saw_mini_min_flag == TRUE ) { + $state = MINI_MIM_CONTRIBUTORS_STATE; + } + else { + $state = CONTRIBUTORS_STATE; + } + } + elsif ( $fieldtag eq "CD" ) { + if ( $saw_mini_min_flag == TRUE ) { + $state = MINI_MIM_CREATED_BY_STATE; + } + else { + $state = CREATED_BY_STATE; + } + } + elsif ( $fieldtag eq "ED" ) { + if ( $saw_mini_min_flag == TRUE ) { + $state = MINI_MIM_EDITED_BY_STATE; + } + else { + $state = EDITED_BY_STATE; + } + } + else { + print "Warning: Unknown tag: $fieldtag\n"; + } + + } + else { + $contents .= $line; + } + } + + $self->_OMIM_text_file()->close(); + $self->_done( TRUE ); + + unless( %record ) { + $self->_not_a_OMIM_text_file_error(); + } + + $self->_add_to_hash( $state, $contents,\%record ); + + my $omim_entry = $self->_createOMIMentry( \%record ); + + return $omim_entry; + +} # next_phenotype + + + + +=head2 init + + Title : init() + Usage : $omim_parser->init(); + Function: Initializes this OMIMparser to all "". + Returns : + Args : + +=cut + +sub init { + my ( $self ) = @_; + + $self->genemap_file_name( "" ); + $self->omimtxt_file_name( "" ); + $self->_genemap_hash( {} ); + $self->_OMIM_text_file( undef ); + $self->_is_not_first_record( FALSE ); + $self->_done( FALSE ); + +} # init + + + + +=head2 genemap_file_name + + Title : genemap_file_name + Usage : $omimparser->genemap_file_name( "genemap" ); + Function: Set/get for the genemap file name. + Returns : The genemap file name [string]. + Args : The genemap file name [string] (optional). + +=cut + +sub genemap_file_name { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_genemap_file_name" } = $value; + if ( $value =~ /\W/ ) { + _genemap_hash( $self->_read_genemap( $value ) ); + } + } + + return $self->{ "_genemap_file_name" }; +} # genemap_file_name + + + + +=head2 omimtxt_file_name + + Title : omimtxt_file_name + Usage : $omimparser->omimtxt_file_name( "omim.txt" ); + Function: Set/get for the omim text file name. + Returns : The the omim text file name [string]. + Args : The the omim text file name [string] (optional). + +=cut + +sub omimtxt_file_name { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_omimtxt_file_name" } = $value; + if ( $value =~ /\W/ ) { + $self->_OMIM_text_file( new Bio::Root::IO->new( -file => $value ) ); + } + } + + return $self->{ "_omimtxt_file_name" }; +} # omimtxt_file_name + + + + + +sub _createOMIMentry { + my ( $self, $record_ref ) = @_; + + my $omim_entry = Bio::Phenotype::OMIM::OMIMentry->new(); + my $mini_mim = Bio::Phenotype::OMIM::MiniMIMentry->new(); + + while ( ( my $key, my $val ) = each( %$record_ref ) ) { + + $val =~ s/^\s+//; + $val =~ s/\s+$//; + + if ( $key == MIM_NUMBER_STATE ) { + $val =~ s/\s+//g; + $val =~ s/\D//g; + + $omim_entry->MIM_number( $val ); + + my $gm = $self->_genemap_hash(); + if ( exists( $$gm{ $val } ) ) { + $self->_parse_genemap( $omim_entry, $val ); + } + + } + elsif ( $key == TITLE_STATE ) { + my ( $title, $alt_titles ) = $self->_parse_title( $val ); + $omim_entry->title( $title ); + $omim_entry->alternative_titles_and_symbols( $alt_titles ); + if ( $title =~ /^\*/ ) { + $omim_entry->is_separate( TRUE ); + } + elsif ( $title =~ /^#/ ) { + $omim_entry->more_than_two_genes( TRUE ); + } + } + elsif ( $key == TEXT_STATE ) { + $omim_entry->description( $val ); + } + elsif ( $key == ALLELIC_VARIANT_STATE ) { + my @allelic_variants = $self->_parse_allelic_variants( $val ); + $omim_entry->add_AllelicVariants( @allelic_variants ); + } + elsif ( $key == SEE_ALSO_STATE ) { + $omim_entry->additional_references( $val ); + } + elsif ( $key == REF_STATE ) { + my @refs = $self->_parse_references( $val ); + $omim_entry->add_References( @refs ); + } + elsif ( $key == SYMPT_STATE ) { + $omim_entry->clinical_symptoms( $val ); + } + elsif ( $key == CONTRIBUTORS_STATE ) { + $omim_entry->contributors( $val ); + } + elsif ( $key == CREATED_BY_STATE ) { + $omim_entry->created( $val ); + } + elsif ( $key == EDITED_BY_STATE ) { + $omim_entry->edited( $val ); + } + elsif ( $key == MINI_MIM_TEXT_STATE ) { + $mini_mim->description( $val ); + } + elsif ( $key == MINI_MIM_CONTRIBUTORS_STATE ) { + $mini_mim->contributors( $val ); + } + elsif ( $key == MINI_MIM_CREATED_BY_STATE ) { + $mini_mim->created( $val ); + } + elsif ( $key == MINI_MIM_EDITED_BY_STATE ) { + $mini_mim->edited( $val ); + } + + } + + my $man = Bio::Species->new(); + $man->classification( qw( sapiens Homo ) ); + $man->common_name( "man" ); + $omim_entry->species( $man ); + $omim_entry->miniMIM( $mini_mim ); + + return $omim_entry; + +} # _createOMIMentry + + + + +sub _parse_genemap { + my ( $self, $omim_entry, $val ) = @_; + + my $genemap_line = ${ $self->_genemap_hash() }{ $val }; + my @a = split( /\|/, $genemap_line ); + + my $locations = $a[ 4 ]; + if ( defined ( $locations ) ) { + $locations =~ s/\s+//g; + my @ls = split( /[,;]/, $locations ); + my @cps; + foreach my $l ( @ls ) { + my $cp = Bio::Map::CytoPosition->new( -value => $l ); + push( @cps, $cp ); + } + $omim_entry->add_CytoPositions( @cps ); + } + + my $gene_symbols = $a[ 5 ]; + if ( defined ( $gene_symbols ) ) { + $gene_symbols =~ s/\s+//g; + my @gss = split( /[,;]/, $gene_symbols ); + $omim_entry->add_gene_symbols( @gss ); + } + + my $mouse_correlates = $a[ 16 ]; + if ( defined ( $mouse_correlates ) ) { + $mouse_correlates =~ s/\s+//g; + my @mcs = split( /[,;]/, $mouse_correlates ); + my @cs; + foreach my $mc ( @mcs ) { + my $mouse = Bio::Species->new(); + $mouse->classification( qw( musculus Mus ) ); + $mouse->common_name( "mouse" ); + my $c = Bio::Phenotype::Correlate->new(); + $c->name( $mc ); + $c->species( $mouse ); + $c->type( "OMIM mouse correlate" ); + + push( @cs, $c ); + } + $omim_entry->add_Correlates( @cs ); + } + + $omim_entry->gene_status( $a[ 6 ] ) if defined $a[ 6 ]; + $omim_entry->mapping_method( $a[ 10 ] ) if defined $a[ 10 ]; + $omim_entry->comment( $a[ 11 ] ) if defined $a[ 11 ]; + +} # _parse_genemap + + + + +sub _parse_allelic_variants { + my ( $self, $text ) = @_; + + my @allelic_variants; + my $number = ""; + my $title = ""; + my $symbol_mut_line = ""; + my $prev_line = ""; + my $description = ""; + my $saw_empty_line = FALSE; + + my @lines = split( /\n/, $text ); + + foreach my $line ( @lines ) { + if ( $line !~ /\w/ ) { + $saw_empty_line = TRUE; + } + elsif ( $line =~ /^\s*(\.\d+)/ ) { + my $current_number = $1; + if ( $number ne "" ) { + my $allelic_variant = $self->_create_allelic_variant( $number, $title, + $symbol_mut_line, $description ); + + push( @allelic_variants, $allelic_variant ); + } + $number = $current_number; + $title = ""; + $prev_line = ""; + $symbol_mut_line = ""; + $description = ""; + $saw_empty_line = FALSE; + } + elsif ( $title eq "" ) { + $title = $line; + } + elsif ( $saw_empty_line == FALSE ) { + $prev_line = $line; + } + elsif ( $saw_empty_line == TRUE ) { + if ( $prev_line ne "" ) { + $symbol_mut_line = $prev_line; + $prev_line = ""; + } + if ( $description ne "" ) { + $description .= "\n" . $line; + } + else { + $description = $line; + } + } + } + + my $allelic_variant = $self->_create_allelic_variant( $number, $title, + $symbol_mut_line, $description ); + + push( @allelic_variants, $allelic_variant ); + + return @allelic_variants; + +} # _parse_allelic_variants + + + + +sub _create_allelic_variant { + my ( $self, $number, $title, $symbol_mut_line, $description ) = @_; + + my $symbol = ""; + my $mutation = ""; + my $aa_ori = ""; + my $aa_mut = ""; + my $position = ""; + + if ( $symbol_mut_line =~ /\s*(.+?)\s*,\s*([a-z]{3})(\d+)([a-z]{3})/i ) { + $symbol = $1; + $aa_ori = $2; + $aa_mut = $4; + $position = $3; + } + elsif ( $symbol_mut_line =~ /\s*(.+?)\s*,\s*(.+)/ ) { + $symbol = $1; + $mutation = $2; + } + else { + $symbol = $symbol_mut_line; + } + + if ( ! defined( $description ) ) { die( "undef desc" ); } + if ( ! defined( $mutation ) ) { die( "undef mutation" ); } + + + my $allelic_variant = Bio::Phenotype::OMIM::OMIMentryAllelicVariant->new(); + $allelic_variant->number( $number ); + $allelic_variant->aa_ori( $aa_ori ); + $allelic_variant->aa_mut( $aa_mut ); + $allelic_variant->position( $position ); + $allelic_variant->title( $title ); + $allelic_variant->symbol( $symbol ); + $allelic_variant->description( $description ); + $allelic_variant->additional_mutations( $mutation ); + + return $allelic_variant; + +} # _create_allelic_variant + + + + +sub _parse_title { + my ( $self, $text ) = @_; + my $title = ""; + if ( $text =~ /^(.+)\n/ ) { + $title = $1; + $text =~ s/^.+\n//; + } + else { + $title = $text; + $text = ""; + + } + + return ( $title, $text ); +} # _parse_title + + + + +sub _parse_references { + my ( $self, $text ) = @_; + + $text =~ s/\A\s+//; + $text =~ s/\s+\z//; + $text =~ s/\A\d+\.\s*//; + + my @references; + + my @texts = split( /\s*\n\s*\n\s*\d+\.\s*/, $text ); + + foreach my $t ( @texts ) { + + my $authors = ""; + my $title = ""; + my $location = ""; + + $t =~ s/\s+/ /g; + + if ( $t =~ /(.+?)\s*:\s*(.+?[.?!])\s+(.+?)\s+(\S+?)\s*:\s*(\w?\d+.*)\s*,\s*(\d+)/ ) { + $authors = $1; + $title = $2; + my $journal = $3; + my $volume = $4; + my $fromto = $5; + my $year = $6; + my $from = "", + my $to = ""; + if ( $fromto =~ /(\d+)-+(\d+)/ ) { + $from = $1; + $to = "-".$2; + } + elsif ( $fromto =~ /\A(\w+)/ ) { + $from = $1; + } + $location = $journal." ".$volume." ".$from.$to." (".$year.")"; + } + + + elsif ( $t =~ /(.+?)\s*:\s*(.+?[.?!])\s*(.+?)\z/ ) { + $authors = $1; + $title = $2; + $location = $3; + } + else { + $title = $t; + } + + my $ref = Bio::Annotation::Reference->new( -title => $title, + -location => $location, + -authors => $authors ); + push( @references, $ref ); + + } + return @references; + +} # _parse_references + + + + +sub _genemap_hash { + my ( $self, $value ) = @_; + + if ( defined $value ) { + unless ( ref( $value ) eq "HASH" ) { + $self->throw( "Argument to method \"_genemap_hash\" is not a reference to an Hash" ); + } + $self->{ "_genemap_hash" } = $value; + + } + + return $self->{ "_genemap_hash" }; +} # _genemap_hash + + + + +sub _is_not_first_record { + + my ( $self, $value ) = @_; + + if ( defined $value ) { + unless ( $value == FALSE || $value == TRUE ) { + $self->throw( "Found [$value] where [" . TRUE + ." or " . FALSE . "] expected" ); + } + $self->{ "_not_first_record" } = $value; + } + + return $self->{ "_not_first_record" }; +} # _is_not_first_record + + + + +sub _done { + my ( $self, $value ) = @_; + + if ( defined $value ) { + unless ( $value == FALSE || $value == TRUE ) { + $self->throw( "Found [$value] where [" . TRUE + ." or " . FALSE . "] expected" ); + } + $self->{ "_done" } = $value; + } + + return $self->{ "_done" }; +} # _done + + + + +sub _OMIM_text_file { + my ( $self, $value ) = @_; + + if ( defined $value ) { + unless ( $value->isa( "Bio::Root::IO" ) ) { + $self->throw( "[$value] is not a valid \"Bio::Root::IO\"" ); + } + $self->{ "_omimtxt_file" } = $value; + + } + + return $self->{ "_omimtxt_file" }; +} # _OMIM_text_file + + + + +sub _read_genemap { + my ( $self, $genemap_file_name ) = @_; + + my $line = ""; + my %genemap_hash = (); + my $genemap_file = new Bio::Root::IO->new( -file => $genemap_file_name ); + my @a = (); + my %gm = (); + + while( $line = $genemap_file->_readline() ) { + @a = split( /\|/, $line ); + unless( scalar( @a ) == 18 ) { + $self->throw( "Gene map file \"".$self->genemap_file_name() + . "\" is not in the expected format" ); + } + $gm{ $a[ 9 ] } = $line; + } + $genemap_file->close(); + $self->_genemap_hash( \%gm ); + +} #_read_genemap + + + + +sub _no_OMIM_text_file_provided_error { + my ( $self ) = @_; + + my $msg = "Need to indicate a OMIM text file to read from with\n"; + $msg .= "either \"OMIMparser->new( -omimtext => \"path/to/omim.txt\" );\"\n"; + $msg .= "or \"\$omim_parser->omimtxt_file_name( \"path/to/omim.txt\" );\""; + $self->throw( $msg ); +} # _no_OMIM_text_file_provided_error + + + + +sub _not_a_OMIM_text_file_error { + my ( $self ) = @_; + + my $msg = "File \"".$self->omimtxt_file_name() . + "\" appears not to be a OMIM text file"; + $self->throw( $msg ); +} # _not_a_OMIM_text_file_error + + + + +sub _add_to_hash { + my ( $self, $state, $contents, $record_ref ) = @_; + + if ( exists( $record_ref->{ $state } ) ) { + chomp( $record_ref->{ $state } ); + $record_ref->{ $state } = $record_ref->{ $state } . $contents; + } + else { + $record_ref->{ $state } = $contents; + } +} # _add_to_hash + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/Phenotype.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/Phenotype.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1011 @@ +# $Id: Phenotype.pm,v 1.5 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::Phenotype +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Phenotype - A class for modeling phenotypes + +=head1 SYNOPSIS + + #get Bio::Phenotype::PhenotypeI somehow + + print $phenotype->name(), "\n"; + print $phenotype->description(), "\n"; + + my @keywords = ( "achondroplasia", "dwarfism" ); + $phenotype->add_keywords( @keywords ); + foreach my $keyword ( $phenotype->each_keyword() ) { + print $keyword, "\n"; + } + $phenotype->remove_keywords(); + + + foreach my $gene_symbol ( $phenotype->each_gene_symbol() ) { + print $gene_symbol, "\n"; + } + + foreach my $corr ( $phenotype->each_Correlate() ) { + # Do something with $corr + } + + foreach my $var ( $phenotype->each_Variant() ) { + # Do something with $var (mutation) + } + + foreach my $measure ( $phenotype->each_Measure() ) { + # Do something with $measure + } + + +=head1 DESCRIPTION + +This superclass implements common methods for classes modelling phenotypes. +Bio::Phenotype::OMIM::OMIMentry is an example of an instantiable phenotype +class (the design of this interface was partially guided by the need +to model OMIM entries). +Please note. This class provides methods to associate mutations +(methods "each_Variant", ...) and genotypes (methods "each_Genotype", ...) +with phenotypes. Yet, these aspects might need some future enhancements, +especially since there is no "genotype" class yet. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Phenotype::Phenotype; +use vars qw( @ISA ); +use strict; + +use Bio::Root::Root; +use Bio::Phenotype::PhenotypeI; +use Bio::Species; +use Bio::Variation::VariantI; +use Bio::Annotation::DBLink; +use Bio::Annotation::Reference; +use Bio::Phenotype::Measure; +use Bio::Phenotype::Correlate; +use Bio::Map::CytoPosition; +use Bio::Range; + + +@ISA = qw( Bio::Phenotype::PhenotypeI ); + + + + +=head2 new + + Title : new + Usage : $obj = Bio::Phenotype::Phenotype->new( -name => "XY", + -description => "This is ..." ); + Function: Creates a new Phenotype object. + Returns : A new Phenotype object. + Args : -name => the name + -description => the description of this phenotype + -species => ref to the the species + -comment => a comment + +=cut + +sub new { + + my( $class,@args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $name, + $description, + $species, + $comment ) + = $self->_rearrange( [ qw( NAME + DESCRIPTION + SPECIES + COMMENT ) ], @args ); + + $self->init(); + + $name && $self->name( $name ); + $description && $self->description( $description ); + $species && $self->species( $species ); + $comment && $self->comment( $comment ); + + return $self; + +} # new + + +=head2 init + + Title : init() + Usage : $obj->init(); + Function: Initializes this OMIMentry to all "" and empty lists. + Returns : + Args : + +=cut + +sub init { + + my( $self ) = @_; + + + $self->name( "" ); + $self->description( "" ); + my $species = Bio::Species->new(); + $species->classification( qw( sapiens Homo ) ); + $self->species( $species ); + $self->comment( "" ); + $self->remove_Correlates(); + $self->remove_References(); + $self->remove_CytoPositions(); + $self->remove_gene_symbols(); + $self->remove_Genotypes(); + $self->remove_DBLinks(); + $self->remove_keywords(); + $self->remove_Variants(); + $self->remove_Measures(); + +} # init + + +=head2 name + + Title : name + Usage : $obj->name( "r1" ); + or + print $obj->name(); + Function: Set/get for the name or id of this phenotype. + Returns : A name or id [scalar]. + Args : A name or id [scalar] (optional). + +=cut + +sub name { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_name" } = $value; + } + + return $self->{ "_name" }; + +} # name + + +=head2 description + + Title : description + Usage : $obj->description( "This is ..." ); + or + print $obj->description(); + Function: Set/get for the description of this phenotype. + Returns : A description [scalar]. + Args : A description [scalar] (optional). + +=cut + +sub description { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_description" } = $value; + } + + return $self->{ "_description" }; + +} # description + + + + +=head2 species + + Title : species + Usage : $obj->species( $species ); + or + $species = $obj->species(); + Function: Set/get for the species of this phenotype. + Returns : A species [Bio::Species]. + Args : A species [Bio::Species] (optional). + +=cut + +sub species { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->_check_ref_type( $value, "Bio::Species" ); + $self->{ "_species" } = $value; + } + + return $self->{ "_species" }; + +} # species + + + + +=head2 comment + + Title : comment + Usage : $obj->comment( "putative" ); + or + print $obj->comment(); + Function: Set/get for a comment about this phenotype. + Returns : A comment [scalar]. + Args : A comment [scalar] (optional). + +=cut + +sub comment { + my ( $self, $value ) = @_; + + if ( defined $value ) { + $self->{ "_comment" } = $value; + } + + return $self->{ "_comment" }; + +} # comment + + + + +=head2 each_gene_symbol + + Title : each_gene_symbol() + Usage : @gs = $obj->each_gene_symbol(); + Function: Returns a list of gene symbols [scalars, most likely Strings] + associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub each_gene_symbol { + my ( $self ) = @_; + + if ( $self->{ "_gene_symbols" } ) { + return @{ $self->{ "_gene_symbols" } }; + } + else { + return my @a = (); + } + +} # each_gene_symbol + + +=head2 add_gene_symbols + + Title : add_gene_symbols + Usage : $obj->add_gene_symbols( @gs ); + or + $obj->add_gene_symbols( $gs ); + Function: Pushes one or more gene symbols [scalars, most likely Strings] + into the list of gene symbols. + Returns : + Args : scalar(s). + +=cut + +sub add_gene_symbols { + my ( $self, @values ) = @_; + + return unless( @values ); + + push( @{ $self->{ "_gene_symbols" } }, @values ); + +} # add_gene_symbols + + +=head2 remove_gene_symbols + + Usage : $obj->remove_gene_symbols(); + Function: Deletes (and returns) the list of gene symbols [scalars, + most likely Strings] associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub remove_gene_symbols { + my ( $self ) = @_; + + my @a = $self->each_gene_symbol(); + $self->{ "_gene_symbols" } = []; + return @a; + +} # remove_gene_symbols + + + + +=head2 each_Variant + + Title : each_Variant() + Usage : @vs = $obj->each_Variant(); + Function: Returns a list of Bio::Variation::VariantI implementing objects + associated with this phenotype. + This is for representing the actual mutation(s) causing this + phenotype. + {* The "variants" data member and its methods will/might need to be + changed/improved in one way or another, CZ 09/06/02 *} + Returns : A list of Bio::Variation::VariantI implementing objects. + Args : + +=cut + +sub each_Variant { + my ( $self ) = @_; + + if ( $self->{ "_variants" } ) { + return @{ $self->{ "_variants" } }; + } + else { + return my @a = (); + } + +} # each_Variant + + +=head2 add_Variants + + Usage : $obj->add_Variants( @vs ); + or + $obj->add_Variants( $v ); + Function: Pushes one or more Bio::Variation::VariantI implementing objects + into the list of Variants. + Returns : + Args : Bio::Variation::VariantI implementing object(s). + +=cut + +sub add_Variants { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Variation::VariantI" ); + } + + push( @{ $self->{ "_variants" } }, @values ); + +} # add_Variants + + +=head2 remove_Variants + + Title : remove_Variants + Usage : $obj->remove_Variants(); + Function: Deletes (and returns) the list of Bio::Variation::VariantI implementing + objects associated with this phenotype. + Returns : A list of Bio::Variation::VariantI implementing objects. + Args : + +=cut + +sub remove_Variants { + my ( $self ) = @_; + + my @a = $self->each_Variant(); + $self->{ "_variants" } = []; + return @a; + +} # remove_Variants + + + + +=head2 each_Reference + + Title : each_Reference() + Usage : @refs = $obj->each_Reference(); + Function: Returns a list of Bio::Annotation::Reference objects + associated with this phenotype. + Returns : A list of Bio::Annotation::Reference objects. + Args : + +=cut + +sub each_Reference { + my ( $self ) = @_; + + if ( $self->{ "_references" } ) { + return @{ $self->{ "_references" } }; + } + else { + return my @a = (); + } + +} # each_Reference + + +=head2 add_References + + Title : add_References + Usage : $obj->add_References( @refs ); + or + $obj->add_References( $ref ); + Function: Pushes one or more Bio::Annotation::Reference objects + into the list of References. + Returns : + Args : Bio::Annotation::Reference object(s). + +=cut + +sub add_References { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Annotation::Reference" ); + } + + push( @{ $self->{ "_references" } }, @values ); + +} # add_References + + +=head2 remove_References + + Title : remove_References() + Usage : $obj->remove_References(); + Function: Deletes (and returns) the list of Bio::Annotation::Reference objects + associated with this phenotype. + Returns : A list of Bio::Annotation::Reference objects. + Args : + +=cut + +sub remove_References { + my ( $self ) = @_; + + my @a = $self->each_Reference(); + $self->{ "_references" } = []; + return @a; + +} # remove_References + + + + +=head2 each_CytoPosition + + Title : each_CytoPosition() + Usage : @cps = $obj->each_CytoPosition(); + Function: Returns a list of Bio::Map::CytoPosition objects + associated with this phenotype. + Returns : A list of Bio::Map::CytoPosition objects. + Args : + +=cut + +sub each_CytoPosition { + my ( $self ) = @_; + + if ( $self->{ "_cyto_positions" } ) { + return @{ $self->{ "_cyto_positions" } }; + } + else { + return my @a = (); + } + +} # each_CytoPosition + + +=head2 add_CytoPositions + + Title : add_CytoPositions + Usage : $obj->add_CytoPositions( @cps ); + or + $obj->add_CytoPositions( $cp ); + Function: Pushes one or more Bio::Map::CytoPosition objects + into the list of CytoPositions. + Returns : + Args : Bio::Map::CytoPosition object(s). + +=cut + +sub add_CytoPositions { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Map::CytoPosition" ); + } + + push( @{ $self->{ "_cyto_positions" } }, @values ); + +} # add_CytoPositions + + +=head2 remove_CytoPositions + + Title : remove_CytoPositions + Usage : $obj->remove_CytoPositions(); + Function: Deletes (and returns) the list o fBio::Map::CytoPosition objects + associated with this phenotype. + Returns : A list of Bio::Map::CytoPosition objects. + Args : + +=cut + +sub remove_CytoPositions { + my ( $self ) = @_; + + my @a = $self->each_CytoPosition(); + $self->{ "_cyto_positions" } = []; + return @a; + +} # remove_CytoPositions + + + + +=head2 each_Correlate + + Title : each_Correlate() + Usage : @corrs = $obj->each_Correlate(); + Function: Returns a list of Bio::Phenotype::Correlate objects + associated with this phenotype. + (Correlates are correlating phenotypes in different species; + inspired by mouse correlates of human phenotypes in the OMIM + database.) + Returns : A list of Bio::Phenotype::Correlate objects. + Args : + +=cut + +sub each_Correlate { + my ( $self ) = @_; + + if ( $self->{ "_correlates" } ) { + return @{ $self->{ "_correlates" } }; + } + else { + return my @a = (); + } + +} # each_Correlate + + + + +=head2 add_Correlates + + Title : add_Correlates + Usage : $obj->add_Correlates( @corrs ); + or + $obj->add_Correlates( $corr ); + Function: Pushes one or more Bio::Phenotype::Correlate objects + into the list of Correlates. + Returns : + Args : Bio::Phenotype::Correlate object(s). + +=cut + +sub add_Correlates { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Phenotype::Correlate" ); + } + + push( @{ $self->{ "_correlates" } }, @values ); + +} # add_Correlates + + +=head2 remove_Correlates + + Title : remove_Correlates + Usage : $obj->remove_Correlates(); + Function: Deletes (and returns) the list of Bio::Phenotype::Correlate objects + associated with this phenotype. + Returns : A list of Bio::Phenotype::Correlate objects. + Args : + +=cut + +sub remove_Correlates { + my ( $self ) = @_; + + my @a = $self->each_Correlate(); + $self->{ "_correlates" } = []; + return @a; + +} # remove_Correlates + + + + +=head2 each_Measure + + Title : each_Measure() + Usage : @ms = $obj->each_Measure(); + Function: Returns a list of Bio::Phenotype::Measure objects + associated with this phenotype. + (Measure is for biochemically defined phenotypes + or any other types of measures.) + Returns : A list of Bio::Phenotype::Measure objects. + Args : + +=cut + +sub each_Measure { + my ( $self ) = @_; + + if ( $self->{ "_measures" } ) { + return @{ $self->{ "_measures" } }; + } + else { + return my @a = (); + } + +} # each_Measure + + +=head2 add_Measures + + Title : add_Measures + Usage : $obj->add_Measures( @ms ); + or + $obj->add_Measures( $m ); + Function: Pushes one or more Bio::Phenotype::Measure objects + into the list of Measures. + Returns : + Args : Bio::Phenotype::Measure object(s). + +=cut + +sub add_Measures { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Phenotype::Measure" ); + } + + push( @{ $self->{ "_measures" } }, @values ); + +} # add_Measures + + +=head2 remove_Measures + + Title : remove_Measures + Usage : $obj->remove_Measures(); + Function: Deletes (and returns) the list of Bio::Phenotype::Measure objects + associated with this phenotype. + Returns : A list of Bio::Phenotype::Measure objects. + Args : + +=cut + +sub remove_Measures { + my ( $self ) = @_; + + my @a = $self->each_Measure(); + $self->{ "_measures" } = []; + return @a; + +} # remove_Measures + + + + +=head2 each_keyword + + Title : each_keyword() + Usage : @kws = $obj->each_keyword(); + Function: Returns a list of key words [scalars, most likely Strings] + associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub each_keyword { + my ( $self ) = @_; + + if ( $self->{ "_keywords" } ) { + return @{ $self->{ "_keywords" } }; + } + else { + return my @a = (); + } + +} # each_keyword + + +=head2 add_keywords + + Title : add_keywords + Usage : $obj->add_keywords( @kws ); + or + $obj->add_keywords( $kw ); + Function: Pushes one or more keywords [scalars, most likely Strings] + into the list of key words. + Returns : + Args : scalar(s). + +=cut + +sub add_keywords { + my ( $self, @values ) = @_; + + return unless( @values ); + + push( @{ $self->{ "_keywords" } }, @values ); + +} # add_keywords + + +=head2 remove_keywords + + Title : remove_keywords + Usage : $obj->remove_keywords(); + Function: Deletes (and returns) the list of key words [scalars, + most likely Strings] associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub remove_keywords { + my ( $self ) = @_; + + my @a = $self->each_keyword(); + $self->{ "_keywords" } = []; + return @a; + +} # remove_keywords + + + + +=head2 each_DBLink + + Title : each_DBLink() + Usage : @dbls = $obj->each_DBLink(); + Function: Returns a list of Bio::Annotation::DBLink objects + associated with this phenotype. + Returns : A list of Bio::Annotation::DBLink objects. + Args : + +=cut + +sub each_DBLink { + my ( $self ) = @_; + + if ( $self->{ "_db_links" } ) { + return @{ $self->{ "_db_links" } }; + } + else { + return my @a = (); + } + +} + + +=head2 add_DBLink + + Title : add_DBLink + Usage : $obj->add_DBLinks( @dbls ); + or + $obj->add_DBLinks( $dbl ); + Function: Pushes one or more Bio::Annotation::DBLink objects + into the list of DBLinks. + Returns : + Args : Bio::Annotation::DBLink object(s). + +=cut + +sub add_DBLinks { + my ( $self, @values ) = @_; + + return unless( @values ); + + foreach my $value ( @values ) { + $self->_check_ref_type( $value, "Bio::Annotation::DBLink" ); + } + + push( @{ $self->{ "_db_links" } }, @values ); + +} # add_DBLinks + + +=head2 remove_DBLinks + + Title : remove_DBLinks + Usage : $obj->remove_DBLinks(); + Function: Deletes (and returns) the list of Bio::Annotation::DBLink objects + associated with this phenotype. + Returns : A list of Bio::Annotation::DBLink objects. + Args : + +=cut + +sub remove_DBLinks { + my ( $self ) = @_; + + my @a = $self->each_DBLink(); + $self->{ "_db_links" } = []; + return @a; + +} # remove_DBLinks + + + + +=head2 each_Genotype + + Title : each_Reference() + Usage : @gts = $obj->each_Reference(); + Function: Returns a list of "Genotype" objects + associated with this phenotype. + {* the "genotypes" data member and its methods certainly will/needs to be + changed/improved in one way or another since there is + no "Genotype" class yet, CZ 09/06/02 *} + Returns : A list of "Genotype" objects. + Args : + +=cut + +sub each_Genotype { + my ( $self ) = @_; + + if ( $self->{ "_genotypes" } ) { + return @{ $self->{ "_genotypes" } }; + } + else { + return my @a = (); + } + +} # each_Genotype + + +=head2 add_Genotypes + + Title : add_Genotypes + Usage : $obj->add_Genotypes( @gts ); + or + $obj->add_Genotypes( $gt ); + Function: Pushes one or more "Genotypes" + into the list of "Genotypes". + Returns : + Args : "Genotypes(s)". + +=cut + +sub add_Genotypes { + my ( $self, @values ) = @_; + + return unless( @values ); + + #foreach my $value ( @values ) { + # $self->_check_ref_type( $value, "Bio::GenotypeI" ); + #} + + push( @{ $self->{ "_genotypes" } }, @values ); + +} # add_Genotypes + + +=head2 remove_Genotypes + + Title : remove_Genotypes + Usage : $obj->remove_Genotypes(); + Function: Deletes (and returns) the list of "Genotype" objects + associated with this phenotype. + Returns : A list of "Genotype" objects. + Args : + +=cut + +sub remove_Genotypes { + my ( $self ) = @_; + + my @a = $self->each_Genotype(); + $self->{ "_genotypes" } = []; + return @a; + +} # remove_Genotypes + + +=head2 _check_ref_type + + Title : _check_ref_type + Usage : $self->_check_ref_type( $value, "Bio::Annotation::DBLink" ); + Function: Checks for the correct type. + Returns : + Args : The value to be checked, the expected class. + +=cut + +sub _check_ref_type { + my ( $self, $value, $expected_class ) = @_; + + if ( ! defined( $value ) ) { + $self->throw( ( caller( 1 ) )[ 3 ] .": Found [undef" + ."] where [$expected_class] expected" ); + } + elsif ( ! ref( $value ) ) { + $self->throw( ( caller( 1 ) )[ 3 ] .": Found scalar" + ." where [$expected_class] expected" ); + } + elsif ( ! $value->isa( $expected_class ) ) { + $self->throw( ( caller( 1 ) )[ 3 ] .": Found [". ref( $value ) + ."] where [$expected_class] expected" ); + } +} # _check_ref_type + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Phenotype/PhenotypeI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Phenotype/PhenotypeI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,783 @@ +# $Id: PhenotypeI.pm,v 1.6 2002/12/12 18:27:01 czmasek Exp $ +# +# BioPerl module for Bio::Phenotype::PhenotypeI +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +PhenotypeI - An interface for classes modeling phenotypes + +=head1 SYNOPSIS + + #get Bio::Phenotype::PhenotypeI somehow + + print $phenotype->name(), "\n"; + print $phenotype->description(), "\n"; + + my @keywords = ( "achondroplasia", "dwarfism" ); + $phenotype->add_keywords( @keywords ); + foreach my $keyword ( $phenotype->each_keyword() ) { + print $keyword, "\n"; + } + $phenotype->remove_keywords(); + + + foreach my $gene_symbol ( $phenotype->each_gene_symbol() ) { + print $gene_symbol, "\n"; + } + + foreach my $corr ( $phenotype->each_Correlate() ) { + # Do something with $corr + } + + foreach my $var ( $phenotype->each_Variant() ) { + # Do something with $var (mutation) + } + + foreach my $measure ( $phenotype->each_Measure() ) { + # Do something with $measure + } + + +=head1 DESCRIPTION + +This superclass defines common methods for classes modelling phenotypes. +Bio::Phenotype::OMIM::OMIMentry is an example of an instantiable phenotype +class (the design of this interface was partially guided by the need +to model OMIM entries). +Please note. This interface provides methods to associate mutations +(methods "each_Variant", ...) and genotypes (methods "each_Genotype", ...) +with phenotypes. Yet, these aspects might need some future enhancements, +especially since there is no "genotype" class yet. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Phenotype::PhenotypeI; +use vars qw( @ISA ); +use strict; +use Bio::Root::Root; +use Bio::Species; +use Bio::Variation::VariantI; +use Bio::Annotation::DBLink; +use Bio::Annotation::Reference; +use Bio::Phenotype::Measure; +use Bio::Phenotype::Correlate; +use Bio::Map::CytoPosition; +use Bio::Range; + + +@ISA = qw( Bio::Root::Root ); + + + +=head2 name + + Title : name + Usage : $obj->name( "r1" ); + or + print $obj->name(); + Function: Set/get for the name or id of this phenotype. + Returns : A name or id [scalar]. + Args : A name or id [scalar] (optional). + +=cut + +sub name { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # name + + + + +=head2 description + + Title : description + Usage : $obj->description( "This is ..." ); + or + print $obj->description(); + Function: Set/get for the description of this phenotype. + Returns : A description [scalar]. + Args : A description [scalar] (optional). + +=cut + +sub description { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # description + + + + +=head2 species + + Title : species + Usage : $obj->species( $species ); + or + $species = $obj->species(); + Function: Set/get for the species of this phenotype. + Returns : A species [Bio::Species]. + Args : A species [Bio::Species] (optional). + +=cut + +sub species { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # species + + + + +=head2 comment + + Title : comment + Usage : $obj->comment( "putative" ); + or + print $obj->comment(); + Function: Set/get for a comment about this phenotype. + Returns : A comment [scalar]. + Args : A comment [scalar] (optional). + +=cut + +sub comment { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # comment + + + + +=head2 each_gene_symbol + + Title : each_gene_symbol() + Usage : @gs = $obj->each_gene_symbol(); + Function: Returns a list of gene symbols [scalars, most likely Strings] + associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub each_gene_symbol { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_gene_symbol + + +=head2 add_gene_symbols + + Title : add_gene_symbols + Usage : $obj->add_gene_symbols( @gs ); + or + $obj->add_gene_symbols( $gs ); + Function: Pushes one or more gene symbols [scalars, most likely Strings] + into the list of gene symbols. + Returns : + Args : scalar(s). + +=cut + +sub add_gene_symbols { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_gene_symbols + + +=head2 remove_gene_symbols + + Usage : $obj->remove_gene_symbols(); + Function: Deletes (and returns) the list of gene symbols [scalars, + most likely Strings] associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub remove_gene_symbols { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_gene_symbols + + + + +=head2 each_Variant + + Title : each_Variant() + Usage : @vs = $obj->each_Variant(); + Function: Returns a list of Bio::Variation::VariantI implementing objects + associated with this phenotype. + This is for representing the actual mutation(s) causing this + phenotype. + {* The "variants" data member and its methods will/might need to be + changed/improved in one way or another, CZ 09/06/02 *} + Returns : A list of Bio::Variation::VariantI implementing objects. + Args : + +=cut + +sub each_Variant { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_Variant + + +=head2 add_Variants + + Usage : $obj->add_Variants( @vs ); + or + $obj->add_Variants( $v ); + Function: Pushes one or more Bio::Variation::VariantI implementing objects + into the list of Variants. + Returns : + Args : Bio::Variation::VariantI implementing object(s). + +=cut + +sub add_Variants { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_Variants + + +=head2 remove_Variants + + Title : remove_Variants + Usage : $obj->remove_Variants(); + Function: Deletes (and returns) the list of Bio::Variation::VariantI implementing + objects associated with this phenotype. + Returns : A list of Bio::Variation::VariantI implementing objects. + Args : + +=cut + +sub remove_Variants { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_Variants + + + + +=head2 each_Reference + + Title : each_Reference() + Usage : @refs = $obj->each_Reference(); + Function: Returns a list of Bio::Annotation::Reference objects + associated with this phenotype. + Returns : A list of Bio::Annotation::Reference objects. + Args : + +=cut + +sub each_Reference { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_Reference + + +=head2 add_References + + Title : add_References + Usage : $obj->add_References( @refs ); + or + $obj->add_References( $ref ); + Function: Pushes one or more Bio::Annotation::Reference objects + into the list of References. + Returns : + Args : Bio::Annotation::Reference object(s). + +=cut + +sub add_References { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_References + + +=head2 remove_References + + Title : remove_References() + Usage : $obj->remove_References(); + Function: Deletes (and returns) the list of Bio::Annotation::Reference objects + associated with this phenotype. + Returns : A list of Bio::Annotation::Reference objects. + Args : + +=cut + +sub remove_References { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_References + + + + +=head2 each_CytoPosition + + Title : each_CytoPosition() + Usage : @cps = $obj->each_CytoPosition(); + Function: Returns a list of Bio::Map::CytoPosition objects + associated with this phenotype. + Returns : A list of Bio::Map::CytoPosition objects. + Args : + +=cut + +sub each_CytoPosition { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_CytoPosition + + +=head2 add_CytoPositions + + Title : add_CytoPositions + Usage : $obj->add_CytoPositions( @cps ); + or + $obj->add_CytoPositions( $cp ); + Function: Pushes one or more Bio::Map::CytoPosition objects + into the list of CytoPositions. + Returns : + Args : Bio::Map::CytoPosition object(s). + +=cut + +sub add_CytoPositions { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_CytoPositions + + +=head2 remove_CytoPositions + + Title : remove_CytoPositions + Usage : $obj->remove_CytoPositions(); + Function: Deletes (and returns) the list o fBio::Map::CytoPosition objects + associated with this phenotype. + Returns : A list of Bio::Map::CytoPosition objects. + Args : + +=cut + +sub remove_CytoPositions { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_CytoPositions + + + + +=head2 each_Correlate + + Title : each_Correlate() + Usage : @corrs = $obj->each_Correlate(); + Function: Returns a list of Bio::Phenotype::Correlate objects + associated with this phenotype. + (Correlates are correlating phenotypes in different species; + inspired by mouse correlates of human phenotypes in the OMIM + database.) + Returns : A list of Bio::Phenotype::Correlate objects. + Args : + +=cut + +sub each_Correlate { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_Correlate + + + + +=head2 add_Correlates + + Title : add_Correlates + Usage : $obj->add_Correlates( @corrs ); + or + $obj->add_Correlates( $corr ); + Function: Pushes one or more Bio::Phenotype::Correlate objects + into the list of Correlates. + Returns : + Args : Bio::Phenotype::Correlate object(s). + +=cut + +sub add_Correlates { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_Correlates + + +=head2 remove_Correlates + + Title : remove_Correlates + Usage : $obj->remove_Correlates(); + Function: Deletes (and returns) the list of Bio::Phenotype::Correlate objects + associated with this phenotype. + Returns : A list of Bio::Phenotype::Correlate objects. + Args : + +=cut + +sub remove_Correlates { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_Correlates + + + + +=head2 each_Measure + + Title : each_Measure() + Usage : @ms = $obj->each_Measure(); + Function: Returns a list of Bio::Phenotype::Measure objects + associated with this phenotype. + (Measure is for biochemically defined phenotypes + or any other types of measures.) + Returns : A list of Bio::Phenotype::Measure objects. + Args : + +=cut + +sub each_Measure { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_Measure + + +=head2 add_Measures + + Title : add_Measures + Usage : $obj->add_Measures( @ms ); + or + $obj->add_Measures( $m ); + Function: Pushes one or more Bio::Phenotype::Measure objects + into the list of Measures. + Returns : + Args : Bio::Phenotype::Measure object(s). + +=cut + +sub add_Measures { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_Measures + + +=head2 remove_Measures + + Title : remove_Measures + Usage : $obj->remove_Measures(); + Function: Deletes (and returns) the list of Bio::Phenotype::Measure objects + associated with this phenotype. + Returns : A list of Bio::Phenotype::Measure objects. + Args : + +=cut + +sub remove_Measures { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_Measures + + + + +=head2 each_keyword + + Title : each_keyword() + Usage : @kws = $obj->each_keyword(); + Function: Returns a list of key words [scalars, most likely Strings] + associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub each_keyword { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_keyword + + +=head2 add_keywords + + Title : add_keywords + Usage : $obj->add_keywords( @kws ); + or + $obj->add_keywords( $kw ); + Function: Pushes one or more keywords [scalars, most likely Strings] + into the list of key words. + Returns : + Args : scalar(s). + +=cut + +sub add_keywords { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_keywords + + +=head2 remove_keywords + + Title : remove_keywords + Usage : $obj->remove_keywords(); + Function: Deletes (and returns) the list of key words [scalars, + most likely Strings] associated with this phenotype. + Returns : A list of scalars. + Args : + +=cut + +sub remove_keywords { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_keywords + + + + +=head2 each_DBLink + + Title : each_DBLink() + Usage : @dbls = $obj->each_DBLink(); + Function: Returns a list of Bio::Annotation::DBLink objects + associated with this phenotype. + Returns : A list of Bio::Annotation::DBLink objects. + Args : + +=cut + +sub each_DBLink { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} + + +=head2 add_DBLink + + Title : add_DBLink + Usage : $obj->add_DBLinks( @dbls ); + or + $obj->add_DBLinks( $dbl ); + Function: Pushes one or more Bio::Annotation::DBLink objects + into the list of DBLinks. + Returns : + Args : Bio::Annotation::DBLink object(s). + +=cut + +sub add_DBLinks { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_DBLinks + + +=head2 remove_DBLinks + + Title : remove_DBLinks + Usage : $obj->remove_DBLinks(); + Function: Deletes (and returns) the list of Bio::Annotation::DBLink objects + associated with this phenotype. + Returns : A list of Bio::Annotation::DBLink objects. + Args : + +=cut + +sub remove_DBLinks { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_DBLinks + + + + +=head2 each_Genotype + + Title : each_Reference() + Usage : @gts = $obj->each_Reference(); + Function: Returns a list of "Genotype" objects + associated with this phenotype. + {* the "genotypes" data member and its methods certainly will/needs to be + changed/improved in one way or another since there is + no "Genotype" class yet, CZ 09/06/02 *} + Returns : A list of "Genotype" objects. + Args : + +=cut + +sub each_Genotype { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # each_Genotype + + +=head2 add_Genotypes + + Title : add_Genotypes + Usage : $obj->add_Genotypes( @gts ); + or + $obj->add_Genotypes( $gt ); + Function: Pushes one or more "Genotypes" + into the list of "Genotypes". + Returns : + Args : "Genotypes(s)". + +=cut + +sub add_Genotypes { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # add_Genotypes + + +=head2 remove_Genotypes + + Title : remove_Genotypes + Usage : $obj->remove_Genotypes(); + Function: Deletes (and returns) the list of "Genotype" objects + associated with this phenotype. + Returns : A list of "Genotype" objects. + Args : + +=cut + +sub remove_Genotypes { + my ( $self ) = @_; + + $self->throw_not_implemented(); + +} # remove_Genotypes + + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/PrimarySeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/PrimarySeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,866 @@ +# $Id: PrimarySeq.pm,v 1.73.2.1 2003/06/29 00:25:27 jason Exp $ +# +# bioperl module for Bio::PrimarySeq +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::PrimarySeq - Bioperl lightweight Sequence Object + +=head1 SYNOPSIS + + # The Bio::SeqIO for file reading, Bio::DB::GenBank for + # database reading + + use Bio::Seq; + use Bio::SeqIO; + use Bio::DB::GenBank; + + #make from memory + $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG', + -id => 'GeneFragment-12', + -accession_number => 'X78121', + -alphabet => 'dna', + -is_circular => 1 + ); + print "Sequence ", $seqobj->id(), " with accession ", + $seqobj->accession_number, "\n"; + + # read from file + $inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta'); + $seqobj = $inputstream->next_seq(); + print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n"; + + + # to get out parts of the sequence. + + print "Sequence ", $seqobj->id(), " with accession ", + $seqobj->accession_number, " and desc ", $seqobj->desc, "\n"; + + $string = $seqobj->seq(); + $string2 = $seqobj->subseq(1,40); + + +=head1 DESCRIPTION + +PrimarySeq is a lightweight Sequence object, storing little more than +the sequence, its name, a computer useful unique name. It does not +contain sequence features or other information. To have a sequence +with sequence features you should use the Seq object which uses this +object - go perldoc Bio::Seq + +Although newusers will use Bio::PrimarySeq alot, in general you will +be using it from the Bio::Seq object. For more information on Bio::Seq +go perldoc Bio::Seq. For interest you might like to known that +Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls +to do with sequence to it (the has-a relationship lets us get out of a +otherwise nasty cyclical reference in Perl which would leak memory). + +Sequence objects are defined by the Bio::PrimarySeqI interface, and this +object is a pure Perl implementation of the interface (if that's +gibberish to you, don't worry. The take home message is that this +object is the bioperl default sequence object, but other people can +use their own objects as sequences if they so wish). If you are +interested in wrapping your own objects as compliant Bioperl sequence +objects, then you should read the Bio::PrimarySeqI documentation + +The documenation of this object is a merge of the Bio::PrimarySeq and +Bio::PrimarySeqI documentation. This allows all the methods which you can +call on sequence objects here. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::PrimarySeq; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::PrimarySeqI; +use Bio::IdentifiableI; +use Bio::DescribableI; + +@ISA = qw(Bio::Root::Root Bio::PrimarySeqI + Bio::IdentifiableI Bio::DescribableI); + +# +# setup the allowed values for alphabet() +# + +my %valid_type = map {$_, 1} qw( dna rna protein ); + +=head2 new + + Title : new + Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new primary seq object from + basic constructors, being a string for the sequence + and strings for id and accession_number. + + Note that you can provide an empty sequence string. However, in + this case you MUST specify the type of sequence you wish to + initialize by the parameter -alphabet. See alphabet() for possible + values. + Returns : a new Bio::PrimarySeq object + Args : -seq => sequence string + -display_id => display id of the sequence (locus name) + -accession_number => accession number + -primary_id => primary id (Genbank id) + -namespace => the namespace for the accession + -authority => the authority for the namespace + -desc => description text + -alphabet => sequence type (alphabet) (dna|rna|protein) + -id => alias for display id + -is_circular => boolean field for whether or not sequence is circular + +=cut + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid, + $desc,$alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len) = + $self->_rearrange([qw(SEQ + DISPLAY_ID + ACCESSION_NUMBER + PRIMARY_ID + NAMESPACE + AUTHORITY + VERSION + OBJECT_ID + DESC + ALPHABET + ID + IS_CIRCULAR + DIRECT + REF_TO_SEQ + LENGTH + )], + @args); + if( defined $id && defined $given_id ) { + if( $id ne $given_id ) { + $self->throw("Provided both id and display_id constructor ". + "functions. [$id] [$given_id]"); + } + } + if( defined $given_id ) { $id = $given_id; } + + # let's set the length before the seq -- if there is one, this length is + # going to be invalidated + defined $len && $self->length($len); + + # if alphabet is provided we set it first, so that it won't be guessed + # when the sequence is set + $alphabet && $self->alphabet($alphabet); + + # if there is an alphabet, and direct is passed in, assumme the alphabet + # and sequence is ok + + if( $direct && $ref_to_seq) { + $self->{'seq'} = $$ref_to_seq; + if( ! $alphabet ) { + $self->_guess_alphabet(); + } # else it has been set already above + } else { +# print STDERR "DEBUG: setting sequence to [$seq]\n"; + # note: the sequence string may be empty + $self->seq($seq) if defined($seq); + } + + $id && $self->display_id($id); + $acc && $self->accession_number($acc); + defined $pid && $self->primary_id($pid); + $desc && $self->desc($desc); + $is_circular && $self->is_circular($is_circular); + $ns && $self->namespace($ns); + $auth && $self->authority($auth); + defined($v) && $self->version($v); + defined($oid) && $self->object_id($oid); + + return $self; +} + +sub direct_seq_set { + my $obj = shift; + return $obj->{'seq'} = shift if @_; + return undef; +} + + +=head2 seq + + Title : seq + Usage : $string = $obj->seq() + Function: Returns the sequence as a string of letters. The + case of the letters is left up to the implementer. + Suggested cases are upper case for proteins and lower case for + DNA sequence (IUPAC standard), but you should not rely on this + Returns : A scalar + Args : Optionally on set the new value (a string). An optional second + argument presets the alphabet (otherwise it will be guessed). + Both parameters may also be given in named paramater style + with -seq and -alphabet being the names. + +=cut + +sub seq { + my ($obj,@args) = @_; + + if( scalar(@args) == 0 ) { + return $obj->{'seq'}; + } + + my ($value,$alphabet) = @args; + + + if(@args) { + if(defined($value) && (! $obj->validate_seq($value))) { + $obj->throw("Attempting to set the sequence to [$value] ". + "which does not look healthy"); + } + # if a sequence was already set we make sure that we re-adjust the + # mol.type, otherwise we skip guessing if mol.type is already set + # note: if the new seq is empty or undef, we don't consider that a + # change (we wouldn't have anything to guess on anyway) + my $is_changed_seq = + exists($obj->{'seq'}) && (CORE::length($value || '') > 0); + $obj->{'seq'} = $value; + # new alphabet overridden by arguments? + if($alphabet) { + # yes, set it no matter what + $obj->alphabet($alphabet); + } elsif( # if we changed a previous sequence to a new one + $is_changed_seq || + # or if there is no alphabet yet at all + (! defined($obj->alphabet()))) { + # we need to guess the (possibly new) alphabet + $obj->_guess_alphabet(); + } # else (seq not changed and alphabet was defined) do nothing + # if the seq is changed, make sure we unset a possibly set length + $obj->length(undef) if $is_changed_seq; + } + return $obj->{'seq'}; +} + +=head2 validate_seq + + Title : validate_seq + Usage : if(! $seq->validate_seq($seq_str) ) { + print "sequence $seq_str is not valid for an object of type ", + ref($seq), "\n"; + } + Function: Validates a given sequence string. A validating sequence string + must be accepted by seq(). A string that does not validate will + lead to an exception if passed to seq(). + + The implementation provided here does not take alphabet() into + account. Allowed are all letters (A-Z) and '-','.', '*' and '?'. + + Example : + Returns : 1 if the supplied sequence string is valid for the object, and + 0 otherwise. + Args : The sequence string to be validated. + + +=cut + +sub validate_seq { + my ($self,$seqstr) = @_; + if( ! defined $seqstr ){ $seqstr = $self->seq(); } + return 0 unless( defined $seqstr); + if((CORE::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) { + $self->warn("seq doesn't validate, mismatch is " . + ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g)); + return 0; + } + return 1; +} + +=head2 subseq + + Title : subseq + Usage : $substring = $obj->subseq(10,40); + Function: returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence + Returns : a string + Args : integer for start position + integer for end position + OR + Bio::LocationI location for subseq (strand honored) + +=cut + +sub subseq { + my ($self,$start,$end,$replace) = @_; + + if( ref($start) && $start->isa('Bio::LocationI') ) { + my $loc = $start; + $replace = $end; # do we really use this anywhere? scary. HL + my $seq = ""; + foreach my $subloc ($loc->each_Location()) { + my $piece = $self->subseq($subloc->start(), + $subloc->end(), $replace); + if($subloc->strand() < 0) { + $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq(); + } + $seq .= $piece; + } + return $seq; + } elsif( defined $start && defined $end ) { + if( $start > $end ){ + $self->throw("in subseq, start [$start] has to be ". + "greater than end [$end]"); + } + if( $start <= 0 || $end > $self->length ) { + $self->throw("You have to have start positive\n\tand length less ". + "than the total length of sequence [$start:$end] ". + "Total ".$self->length.""); + } + + # remove one from start, and then length is end-start + $start--; + if( defined $replace ) { + return substr( $self->seq(), $start, ($end-$start), $replace); + } else { + return substr( $self->seq(), $start, ($end-$start)); + } + } else { + $self->warn("Incorrect parameters to subseq - must be two integers ". + "or a Bio::LocationI object not ($start,$end)"); + } +} + +=head2 length + + Title : length + Usage : $len = $seq->length(); + Function: Get the length of the sequence in number of symbols (bases + or amino acids). + + You can also set this attribute, even to a number that does + not match the length of the sequence string. This is useful + if you don''t want to set the sequence too, or if you want + to free up memory by unsetting the sequence. In the latter + case you could do e.g. + + $seq->length($seq->length); + $seq->seq(undef); + + Note that if you set the sequence to a value other than + undef at any time, the length attribute will be + invalidated, and the length of the sequence string will be + reported again. Also, we won''t let you lie about the length. + + Example : + Returns : integer representing the length of the sequence. + Args : Optionally, the value on set + +=cut + +sub length { + my $self = shift; + my $len = CORE::length($self->seq() || ''); + + if(@_) { + my $val = shift; + if(defined($val) && $len && ($len != $val)) { + $self->throw("You're trying to lie about the length: ". + "is $len but you say ".$val); + } + $self->{'_seq_length'} = $val; + } elsif(defined($self->{'_seq_length'})) { + return $self->{'_seq_length'}; + } + return $len; +} + +=head2 display_id + + Title : display_id or display_name + Usage : $id_string = $obj->display_id(); + Function: returns the display id, aka the common name of the Sequence object. + + The semantics of this is that it is the most likely string to + be used as an identifier of the sequence, and likely to have + "human" readability. The id is equivalent to the ID field of + the GenBank/EMBL databanks and the id field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id to + embed other information. Bioperl does not use any embedded + information in the ID field, and people are encouraged to use + other mechanisms (accession field for example, or extending + the sequence object) to solve this. + + With the new Bio::DescribeableI interface, display_name aliases + to this method. + + Returns : A string + Args : None + + +=cut + +sub display_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'display_id'} = $value; + } + return $obj->{'display_id'}; + +} + +=head2 accession_number + + Title : accession_number or object_id + Usage : $unique_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should + return "unknown". + + [Note this method name is likely to change in 1.3] + + With the new Bio::IdentifiableI interface, this is aliased + to object_id + + Returns : A string + Args : A string (optional) for setting + +=cut + +sub accession_number { + my( $obj, $acc ) = @_; + + if (defined $acc) { + $obj->{'accession_number'} = $acc; + } else { + $acc = $obj->{'accession_number'}; + $acc = 'unknown' unless defined $acc; + } + return $acc; +} + +=head2 primary_id + + Title : primary_id + Usage : $unique_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage their + own object ids in a way the implementaiton can control + clients can expect one id to map to one object. + + For sequences with no natural primary id, this method + should return a stringified memory location. + + Returns : A string + Args : A string (optional, for setting) + +=cut + +sub primary_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'primary_id'} = $value; + } + if( ! exists $obj->{'primary_id'} ) { + return "$obj"; + } + return $obj->{'primary_id'}; + +} + + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : a string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : none + + +=cut + +sub alphabet { + my ($obj,$value) = @_; + if (defined $value) { + $value = lc $value; + unless ( $valid_type{$value} ) { + $obj->throw("Molecular type '$value' is not a valid type (". + join(',', map "'$_'", sort keys %valid_type) . + ") lowercase"); + } + $obj->{'alphabet'} = $value; + } + return $obj->{'alphabet'}; +} + +=head2 desc + + Title : desc or description + Usage : $obj->desc($newval) + Function: Get/set description of the sequence. + + description is an alias for this for compliance with the + Bio::DescribeableI interface. + + Example : + Returns : value of desc (a string) + Args : newvalue (a string or undef, optional) + + +=cut + +sub desc{ + my $self = shift; + + return $self->{'desc'} = shift if @_; + return $self->{'desc'}; +} + +=head2 can_call_new + + Title : can_call_new + Usage : + Function: + Example : + Returns : true + Args : + + +=cut + +sub can_call_new { + my ($self) = @_; + + return 1; + +} + +=head2 id + + Title : id + Usage : $id = $seq->id() + Function: This is mapped on display_id + Example : + Returns : + Args : + + +=cut + +sub id { + return shift->display_id(@_); +} + +=head2 is_circular + + Title : is_circular + Usage : if( $obj->is_circular) { /Do Something/ } + Function: Returns true if the molecule is circular + Returns : Boolean value + Args : none + +=cut + +sub is_circular{ + my $self = shift; + return $self->{'is_circular'} = shift if @_; + return $self->{'is_circular'}; +} + +=head1 Methods for Bio::IdentifiableI compliance + +=cut + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + This is aliased to accession_number(). + Returns : A scalar + + +=cut + +sub object_id { + return shift->accession_number(@_); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Returns : A number + +=cut + +sub version{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_version'} = $value; + } + return $self->{'_version'}; +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + +=cut + +sub authority { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'authority'} = $value; + } + return $obj->{'authority'}; +} + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + Returns : A scalar + + +=cut + +sub namespace{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'namespace'} = $value; + } + return $self->{'namespace'} || ""; +} + +=head1 Methods for Bio::DescribableI compliance + +This comprises of display_name and description. + +=cut + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user + the string should have no spaces (ideally, though a cautious + user of this interface would not assumme this) and should be + less than thirty characters (though again, double checking + this is a good idea) + + This is aliased to display_id(). + Returns : A scalar + +=cut + +sub display_name { + return shift->display_id(@_); +} + +=head2 description + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. The string should not be greater than 255 characters + and clients can feel justified at truncating strings at 255 + characters for the purposes of display + + This is aliased to desc(). + Returns : A scalar + +=cut + +sub description { + return shift->desc(@_); +} + +=head1 Methods Inherited from Bio::PrimarySeqI + +These methods are available on Bio::PrimarySeq, although they are +actually implemented on Bio::PrimarySeqI + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::SeqI implementing object which + is the reversed complement of the sequence. For protein + sequences this throws an exception of + "Sequence is a protein. Cannot revcom" + + The id is the same id as the orginal sequence, and the + accession number is also indentical. If someone wants to + track that this sequence has be reversed, it needs to + define its own extensions + + To do an inplace edit of an object you can go: + + $seqobj = $seqobj->revcom(); + + This of course, causes Perl to handle the garbage + collection of the old object, but it is roughly speaking as + efficient as an inplace edit. + + Returns : A new (fresh) Bio::SeqI object + Args : none + +=cut + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + + Example : + Returns : a fresh Bio::SeqI implementing object + Args : + + +=cut + +=head1 Internal methods + +These are internal methods to PrimarySeq + +=cut + +=head2 _guess_alphabet + + Title : _guess_alphabet + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _guess_alphabet { + my ($self) = @_; + my ($str,$str2,$total,$atgc,$u,$type); + + $str = $self->seq(); + $str =~ s/\-\.\?//g; + + $total = CORE::length($str); + if( $total == 0 ) { + $self->throw("Got a sequence with no letters in - ". + "cannot guess alphabet [$str]"); + } + + $u = ($str =~ tr/Uu//); + $atgc = ($str =~ tr/ATGCNatgcn//); + + if( ($atgc / $total) > 0.85 ) { + $type = 'dna'; + } elsif( (($atgc + $u) / $total) > 0.85 ) { + $type = 'rna'; + } else { + $type = 'protein'; + } + + $self->alphabet($type); + return $type; +} + +############################################################################ +# aliases due to name changes or to compensate for our lack of consistency # +############################################################################ + +sub accession { + my $self = shift; + + $self->warn(ref($self)."::accession is deprecated, ". + "use accession_number() instead"); + return $self->accession_number(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/PrimarySeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/PrimarySeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,722 @@ +# $Id: PrimarySeqI.pm,v 1.50.2.3 2003/08/29 15:37:39 birney Exp $ +# +# BioPerl module for Bio::PrimarySeqI +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::PrimarySeqI [Developers] - Interface definition for a Bio::PrimarySeq + +=head1 SYNOPSIS + + + # Bio::PrimarySeqI is the interface class for sequences. + + # If you are a newcomer to bioperl, you should + # start with Bio::Seq documentation. This + # documentation is mainly for developers using + # Bioperl. + + # to test this is a seq object + + $obj->isa("Bio::PrimarySeqI") || + $obj->throw("$obj does not implement the Bio::PrimarySeqI interface"); + + # accessors + + $string = $obj->seq(); + $substring = $obj->subseq(12,50); + $display = $obj->display_id(); # for human display + $id = $obj->primary_id(); # unique id for this object, + # implementation defined + $unique_key= $obj->accession_number(); + # unique biological id + + # object manipulation + + eval { + $rev = $obj->revcom(); + }; + if( $@ ) { + $obj->throw("Could not reverse complement. ". + "Probably not DNA. Actual exception\n$@\n"); + } + + $trunc = $obj->trunc(12,50); + + # $rev and $trunc are Bio::PrimarySeqI compliant objects + + +=head1 DESCRIPTION + +This object defines an abstract interface to basic sequence +information - for most users of the package the documentation (and +methods) in this class are not useful - this is a developers only +class which defines what methods have to be implmented by other Perl +objects to comply to the Bio::PrimarySeqI interface. Go "perldoc +Bio::Seq" or "man Bio::Seq" for more information on the main class for +sequences. + + +PrimarySeq is an object just for the sequence and its name(s), nothing +more. Seq is the larger object complete with features. There is a pure +perl implementation of this in Bio::PrimarySeq. If you just want to +use Bio::PrimarySeq objects, then please read that module first. This +module defines the interface, and is of more interest to people who +want to wrap their own Perl Objects/RDBs/FileSystems etc in way that +they "are" bioperl sequence objects, even though it is not using Perl +to store the sequence etc. + + +This interface defines what bioperl consideres necessary to "be" a +sequence, without providing an implementation of this. (An +implementation is provided in Bio::PrimarySeq). If you want to provide +a Bio::PrimarySeq 'compliant' object which in fact wraps another +object/database/out-of-perl experience, then this is the correct thing +to wrap, generally by providing a wrapper class which would inheriet +from your object and this Bio::PrimarySeqI interface. The wrapper class +then would have methods lists in the "Implementation Specific +Functions" which would provide these methods for your object. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::PrimarySeqI; +use vars qw(@ISA ); +use strict; +use Bio::Root::RootI; +use Bio::Tools::CodonTable; + +@ISA = qw(Bio::Root::RootI); + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 seq + + Title : seq + Usage : $string = $obj->seq() + Function: Returns the sequence as a string of letters. The + case of the letters is left up to the implementer. + Suggested cases are upper case for proteins and lower case for + DNA sequence (IUPAC standard), + but implementations are suggested to keep an open mind about + case (some users... want mixed case!) + Returns : A scalar + Status : Virtual + +=cut + +sub seq { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 subseq + + Title : subseq + Usage : $substring = $obj->subseq(10,40); + Function: returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence + + Start cannot be larger than end but can be equal + + Returns : a string + Args : + Status : Virtual + +=cut + +sub subseq{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 display_id + + Title : display_id + Usage : $id_string = $obj->display_id(); + Function: returns the display id, aka the common name of the Sequence object. + + The semantics of this is that it is the most likely string + to be used as an identifier of the sequence, and likely to + have "human" readability. The id is equivalent to the ID + field of the GenBank/EMBL databanks and the id field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id + to embed other information. Bioperl does not use any + embedded information in the ID field, and people are + encouraged to use other mechanisms (accession field for + example, or extending the sequence object) to solve this. + + Notice that $seq->id() maps to this function, mainly for + legacy/convience issues + Returns : A string + Args : None + Status : Virtual + + +=cut + +sub display_id { + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 accession_number + + Title : accession_number + Usage : $unique_biological_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should return + "unknown". + Returns : A string + Args : None + Status : Virtual + + +=cut + +sub accession_number { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + + +=head2 primary_id + + Title : primary_id + Usage : $unique_implementation_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage their + own object ids in a way the implementaiton can control + clients can expect one id to map to one object. + + For sequences with no accession number, this method should + return a stringified memory location. + + [Note this method name is likely to change in 1.3] + + Returns : A string + Args : None + Status : Virtual + + +=cut + +sub primary_id { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + +=head2 can_call_new + + Title : can_call_new + Usage : if( $obj->can_call_new ) { + $newobj = $obj->new( %param ); + } + Function: can_call_new returns 1 or 0 depending + on whether an implementation allows new + constructor to be called. If a new constructor + is allowed, then it should take the followed hashed + constructor list. + + $myobject->new( -seq => $sequence_as_string, + -display_id => $id + -accession_number => $accession + -alphabet => 'dna', + ); + Example : + Returns : 1 or 0 + Args : + + +=cut + +sub can_call_new{ + my ($self,@args) = @_; + + # we default to 0 here + + return 0; +} + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : a string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : none + Status : Virtual + + +=cut + +sub alphabet{ + my ( $self ) = @_; + $self->throw_not_implemented(); +} + +sub moltype{ + my ($self,@args) = @_; + + $self->warn("moltype: pre v1.0 method. Calling alphabet() instead..."); + $self->alphabet(@args); +} + + +=head1 Optional Implementation Functions + +The following functions rely on the above functions. An +implementing class does not need to provide these functions, as they +will be provided by this class, but is free to override these +functions. + +All of revcom(), trunc(), and translate() create new sequence +objects. They will call new() on the class of the sequence object +instance passed as argument, unless can_call_new() returns FALSE. In +the latter case a Bio::PrimarySeq object will be created. Implementors +which really want to control how objects are created (eg, for object +persistence over a database, or objects in a CORBA framework), they +are encouraged to override these methods + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::PrimarySeqI implementing object which + is the reversed complement of the sequence. For protein + sequences this throws an exception of "Sequence is a + protein. Cannot revcom" + + The id is the same id as the original sequence, and the + accession number is also indentical. If someone wants to + track that this sequence has be reversed, it needs to + define its own extensions + + To do an inplace edit of an object you can go: + + $seq = $seq->revcom(); + + This of course, causes Perl to handle the garbage + collection of the old object, but it is roughly speaking as + efficient as an inplace edit. + + Returns : A new (fresh) Bio::PrimarySeqI object + Args : none + + +=cut + +sub revcom{ + my ($self) = @_; + + # check the type is good first. + my $t = $self->alphabet; + + if( $t eq 'protein' ) { + $self->throw("Sequence is a protein. Cannot revcom"); + } + + if( $t ne 'dna' && $t ne 'rna' ) { + if( $self->can('warn') ) { + $self->warn("Sequence is not dna or rna, but [$t]. ". + "Attempting to revcom, but unsure if this is right"); + } else { + warn("[$self] Sequence is not dna or rna, but [$t]. ". + "Attempting to revcom, but unsure if this is right"); + } + } + + # yank out the sequence string + + my $str = $self->seq(); + + # if is RNA - map to DNA then map back + + if( $t eq 'rna' ) { + $str =~ tr/uU/tT/; + } + + # revcom etc... + + $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + my $revseq = CORE::reverse $str; + + if( $t eq 'rna' ) { + $revseq =~ tr/tT/uU/; + } + + my $seqclass; + if($self->can_call_new()) { + $seqclass = ref($self); + } else { + $seqclass = 'Bio::PrimarySeq'; + $self->_attempt_to_load_Seq(); + } + my $out = $seqclass->new( '-seq' => $revseq, + '-display_id' => $self->display_id, + '-accession_number' => $self->accession_number, + '-alphabet' => $self->alphabet, + '-desc' => $self->desc(), + '-verbose' => $self->verbose + ); + return $out; + +} + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + + Example : + Returns : a fresh Bio::PrimarySeqI implementing object + Args : Two integers denoting first and last base of the sub-sequence. + + +=cut + +sub trunc{ + my ($self,$start,$end) = @_; + + my $str; + if( defined $start && ref($start) && + $start->isa('Bio::LocationI') ) { + $str = $self->subseq($start); # start is a location actually + } elsif( !$end ) { + $self->throw("trunc start,end -- there was no end for $start"); + } elsif( $end < $start ) { + my $msg = "start [$start] is greater than end [$end]. \n". + "If you want to truncated and reverse complement, \n". + "you must call trunc followed by revcom. Sorry."; + $self->throw($msg); + } else { + $str = $self->subseq($start,$end); + } + + my $seqclass; + if($self->can_call_new()) { + $seqclass = ref($self); + } else { + $seqclass = 'Bio::PrimarySeq'; + $self->_attempt_to_load_Seq(); + } + + my $out = $seqclass->new( '-seq' => $str, + '-display_id' => $self->display_id, + '-accession_number' => $self->accession_number, + '-alphabet' => $self->alphabet, + '-desc' => $self->desc(), + '-verbose' => $self->verbose + ); + return $out; +} + +=head2 translate + + Title : translate + Usage : $protein_seq_obj = $dna_seq_obj->translate + #if full CDS expected: + $protein_seq_obj = $cds_seq_obj->translate(undef,undef,undef,undef,1); + Function: + + Provides the translation of the DNA sequence using full + IUPAC ambiguities in DNA/RNA and amino acid codes. + + The full CDS translation is identical to EMBL/TREMBL + database translation. Note that the trailing terminator + character is removed before returning the translation + object. + + Note: if you set $dna_seq_obj->verbose(1) you will get a + warning if the first codon is not a valid initiator. + + + Returns : A Bio::PrimarySeqI implementing object + Args : character for terminator (optional) defaults to '*' + character for unknown amino acid (optional) defaults to 'X' + frame (optional) valid values 0, 1, 2, defaults to 0 + codon table id (optional) defaults to 1 + complete coding sequence expected, defaults to 0 (false) + boolean, throw exception if not complete CDS (true) or defaults to +warning (false) + coding sequence expected to be complete at 5', defaults to false + coding sequence expected to be complete at 3', defaults to false + +=cut + +sub translate { + my($self) = shift; + my($stop, $unknown, $frame, $tableid, $fullCDS, $throw, $complete5, +$complete3) = @_; + my($i, $len, $output) = (0,0,''); + my($codon) = ""; + my $aa; + + ## User can pass in symbol for stop and unknown codons + unless(defined($stop) and $stop ne '') { $stop = "*"; } + unless(defined($unknown) and $unknown ne '') { $unknown = "X"; } + unless(defined($frame) and $frame ne '') { $frame = 0; } + + ## the codon table ID + unless(defined($tableid) and $tableid ne '') { $tableid = 1; } + + ##Error if monomer is "Amino" + $self->throw("Can't translate an amino acid sequence.") if + ($self->alphabet eq 'protein'); + + ##Error if frame is not 0, 1 or 2 + $self->throw("Valid values for frame are 0, 1, 2, not [$frame].") unless + ($frame == 0 or $frame == 1 or $frame == 2); + + #warns if ID is invalid + my $codonTable = Bio::Tools::CodonTable->new( -id => $tableid); + + my ($seq) = $self->seq(); + + # deal with frame offset. + if( $frame ) { + $seq = substr ($seq,$frame); + } + + # Translate it + $output = $codonTable->translate($seq); + # Use user-input stop/unknown + $output =~ s/\*/$stop/g; + $output =~ s/X/$unknown/g; + + # $complete5 and $complete3 indicate completeness of + # the coding sequence at the 5' and 3' ends. Complete + # if true, default to false. These are in addition to + # $fullCDS, for backwards compatibility + defined($complete5) or ($complete5 = $fullCDS ? 1 : 0); + defined($complete3) or ($complete3 = $fullCDS ? 1 : 0); + + my $id = $self->display_id; + + # only if we are expecting to be complete at the 5' end + if($complete5) { + # if the initiator codon is not ATG, the amino acid needs to changed into M + if(substr($output,0,1) ne 'M') { + if($codonTable->is_start_codon(substr($seq, 0, 3)) ) { + $output = 'M' . substr($output, 1); + } + elsif($throw) { + $self->throw("Seq [$id]: Not using a valid initiator codon!"); + } else { + $self->warn("Seq [$id]: Not using a valid initiator codon!"); + } + } + } + + # only if we are expecting to be complete at the 3' end + if($complete3) { + #remove the stop character + if(substr($output, -1, 1) eq $stop) { + chop $output; + } else { + $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!"); + $self->warn("Seq [$id]: Not using a valid terminator codon!"); + } + } + + # only if we are expecting to translate a complete coding region + if($complete5 and $complete3) { + # test if there are terminator characters inside the protein sequence! + if($output =~ /\*/) { + $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!"); + $self->warn("Seq [$id]: Terminator codon inside CDS!"); + } + } + + my $seqclass; + if($self->can_call_new()) { + $seqclass = ref($self); + } else { + $seqclass = 'Bio::PrimarySeq'; + $self->_attempt_to_load_Seq(); + } + my $out = $seqclass->new( '-seq' => $output, + '-display_id' => $self->display_id, + '-accession_number' => $self->accession_number, + # is there anything wrong with retaining the + # description? + '-desc' => $self->desc(), + '-alphabet' => 'protein', + '-verbose' => $self->verbose + ); + return $out; + +} + +=head2 id + + Title : id + Usage : $id = $seq->id() + Function: ID of the sequence. This should normally be (and actually is in + the implementation provided here) just a synonym for display_id(). + Example : + Returns : A string. + Args : + + +=cut + +sub id { + return shift->display_id(); +} + + +=head2 length + + Title : length + Usage : $len = $seq->length() + Function: + Example : + Returns : integer representing the length of the sequence. + Args : + + +=cut + +sub length { + shift->throw_not_implemented(); +} + +=head2 desc + + Title : desc + Usage : $seq->desc($newval); + $description = $seq->desc(); + Function: Get/set description text for a seq object + Example : + Returns : value of desc + Args : newvalue (optional) + + +=cut + +sub desc { + my ($self,$value) = @_; + $self->throw_not_implemented(); +} + + +=head2 is_circular + + Title : is_circular + Usage : if( $obj->is_circular) { /Do Something/ } + Function: Returns true if the molecule is circular + Returns : Boolean value + Args : none + +=cut + +sub is_circular{ + shift->throw_not_implemented(); +} + +=head1 Private functions + +These are some private functions for the PrimarySeqI interface. You do not +need to implement these functions + +=head2 _attempt_to_load_Seq + + Title : _attempt_to_load_Seq + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _attempt_to_load_Seq{ + my ($self) = @_; + + if( $main::{'Bio::PrimarySeq'} ) { + return 1; + } else { + eval { + require Bio::PrimarySeq; + }; + if( $@ ) { + my $text = "Bio::PrimarySeq could not be loaded for [$self]\n". + "This indicates that you are using Bio::PrimarySeqI ". + "without Bio::PrimarySeq loaded or without providing a ". + "complete implementation.\nThe most likely problem is that there ". + "has been a misconfiguration of the bioperl environment\n". + "Actual exception:\n\n"; + $self->throw("$text$@\n"); + return 0; + } + return 1; + } + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Range.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Range.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,293 @@ +# $Id: Range.pm,v 1.17 2002/10/22 07:38:24 lapp Exp $ +# +# BioPerl module for Bio::Range +# +# Cared for by Heikki Lehvaslaiho +# +# Copywright Matthew Pocock +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::Range - Pure perl RangeI implementation + +=head1 SYNOPSIS + + $range = new Bio::Range(-start=>10, -end=>30, -strand=>+1); + $r2 = new Bio::Range(-start=>15, -end=>200, -strand=>+1); + + print join(', ', $range->union($r2), "\n"; + print join(', ', $range->intersection($r2), "\n"; + + print $range->overlaps($r2), "\n"; + print $range->contains($r2), "\n"; + +=head1 DESCRIPTION + +This provides a pure perl implementation of the BioPerl range +interface. + +Ranges are modeled as having (start, end, length, strand). They use +Bio-coordinates - all points E= start and E= end are within the +range. End is always greater-than or equal-to start, and length is +greather than or equal to 1. The behaviour of a range is undefined if +ranges with negative numbers or zero are used. + +So, in summary: + + length = end - start + 1 + end >= start + strand = (-1 | 0 | +1) + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Range; + +use strict; +use Carp; +use integer; +use Bio::RangeI; +use Bio::Root::Root; + +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root Bio::RangeI); + +=head1 Constructors + +=head2 new + + Title : new + Usage : $range = Bio::Range->new(-start => 100, -end=> 200, -strand = +1); + Function: generates a new Bio::Range + Returns : a new range + Args : two of (-start, -end, '-length') - the third is calculated + : -strand (defaults to 0) + +=cut + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + my ($strand, $start, $end, $length) = + $self->_rearrange([qw(STRAND + START + END + LENGTH + )],@args); + $self->strand($strand || 0); + + if(defined $start ) { + $self->start($start); + if(defined $end) { + $self->end($end); + } elsif(defined $length) { + $self->end($self->start()+ $length - 1); + } + } elsif(defined $end && defined $length ) { + $self->end($end); + $self->start($self->end() - $length + 1); + } + return $self; +} + +=head1 Member variable access + +These methods let you get at and set the member variables + +=head2 start + + Title : start + Function : return or set the start co-ordinate + Example : $s = $range->start(); $range->start(7); + Returns : the value of the start co-ordinate + Args : optionally, the new start co-ordinate + Overrides: Bio::RangeI::start + +=cut + +sub start { + my ($self,$value) = @_; + if( defined $value) { + $self->throw("'$value' is not an integer.\n") + unless $value =~ /^[-+]?\d+$/; + $self->{'start'} = $value; + } + return $self->{'start'}; +} + +=head2 end + + Title : end + Function : return or set the end co-ordinate + Example : $e = $range->end(); $range->end(2000); + Returns : the value of the end co-ordinate + Args : optionally, the new end co-ordinate + Overrides: Bio::RangeI::end + +=cut + +sub end { + + my ($self,$value) = @_; + if( defined $value) { + $self->throw("'$value' is not an integer.\n") + unless $value =~ /^[-+]?\d+$/; + $self->{'end'} = $value; + } + return $self->{'end'}; +} + +=head2 strand + + Title : strand + Function : return or set the strandidness + Example : $st = $range->strand(); $range->strand(-1); + Returns : the value of the strandedness (-1, 0 or 1) + Args : optionaly, the new strand - (-1, 0, 1) or (-, ., +). + Overrides: Bio::RangeI::Strand + +=cut + +sub strand { + my $self = shift; + if(@_) { + my $val = shift; + $val =~ tr/+/1/; + $val =~ tr/-/-1/; + $val =~ tr/./0/; + if($val == -1 || $val == 0 || $val == 1 ) { + $self->{'strand'} = $val; + } + } + return $self->{'strand'}; +} + +=head2 length + + Title : length + Function : returns the length of this range + Example : $length = $range->length(); + Returns : the length of this range, equal to end - start + 1 + Args : if you attempt to set the length, and exeption will be thrown + Overrides: Bio::RangeI::Length + +=cut + +sub length { + my $self = shift; + if(@_) { + confess ref($self), "->length() is read-only"; + } + return $self->end() - $self->start() + 1; +} + +=head2 toString + + Title : toString + Function: stringifies this range + Example : print $range->toString(), "\n"; + Returns : a string representation of this range + +=cut + +sub toString { + my $self = shift; + return "(${\$self->start}, ${\$self->end}) strand=${\$self->strand}"; +} + +=head1 Boolean Methods + +These methods return true or false. + + $range->overlaps($otherRange) && print "Ranges overlap\n"; + +=head2 overlaps + + Title : overlaps + Usage : if($r1->overlaps($r2)) { do stuff } + Function : tests if $r2 overlaps $r1 + Args : a range to test for overlap with + Returns : true if the ranges overlap, false otherwise + Inherited: Bio::RangeI + +=head2 contains + + Title : contains + Usage : if($r1->contains($r2) { do stuff } + Function : tests wether $r1 totaly contains $r2 + Args : a range to test for being contained + Returns : true if the argument is totaly contained within this range + Inherited: Bio::RangeI + +=head2 equals + + Title : equals + Usage : if($r1->equals($r2)) + Function : test whether $r1 has the same start, end, length as $r2 + Args : a range to test for equality + Returns : true if they are describing the same range + Inherited: Bio::RangeI + +=head1 Geometrical methods + +These methods do things to the geometry of ranges, and return +triplets (start, end, strand) from which new ranges could be built. + +=head2 intersection + + Title : intersection + Usage : ($start, $stop, $strand) = $r1->intersection($r2) + Function : gives the range that is contained by both ranges + Args : a range to compare this one to + Returns : nothing if they do not overlap, or the range that they do overlap + Inherited: Bio::RangeI::intersection + +=cut + +=head2 union + + Title : union + Usage : ($start, $stop, $strand) = $r1->union($r2); + : ($start, $stop, $strand) = Bio::Range->union(@ranges); + Function : finds the minimal range that contains all of the ranges + Args : a range or list of ranges to find the union of + Returns : the range containing all of the ranges + Inherited: Bio::RangeI::union + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/RangeI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/RangeI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,443 @@ +# $Id: RangeI.pm,v 1.30 2002/11/05 02:55:12 lapp Exp $ +# +# BioPerl module for Bio::RangeI +# +# Cared for by Lehvaslaiho +# +# Copyright Matthew Pocock +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::RangeI - Range interface + +=head1 SYNOPSIS + + #Do not run this module directly + +=head1 DESCRIPTION + +This provides a standard BioPerl range interface that should be +implemented by any object that wants to be treated as a range. This +serves purely as an abstract base class for implementers and can not +be instantiated. + +Ranges are modeled as having (start, end, length, strand). They use +Bio-coordinates - all points E= start and E= end are within the +range. End is always greater-than or equal-to start, and length is +greather than or equal to 1. The behaviour of a range is undefined if +ranges with negative numbers or zero are used. + +So, in summary: + + length = end - start + 1 + end >= start + strand = (-1 | 0 | +1) + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk + +=head1 CONTRIBUTORS + +Juha Muilu (muilu@ebi.ac.uk) + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::RangeI; + +use strict; +use Carp; +use Bio::Root::RootI; +use vars qw(@ISA); +use integer; +use vars qw( @ISA %STRAND_OPTIONS ); + +@ISA = qw( Bio::Root::RootI ); + +BEGIN { +# STRAND_OPTIONS contains the legal values for the strand options + %STRAND_OPTIONS = map { $_, '_'.$_ } + ( + 'strong', # ranges must have the same strand + 'weak', # ranges must have the same strand or no strand + 'ignore', # ignore strand information + ); +} + +# utility methods +# + +# returns true if strands are equal and non-zero +sub _strong { + my ($r1, $r2) = @_; + my ($s1, $s2) = ($r1->strand(), $r2->strand()); + + return 1 if $s1 != 0 && $s1 == $s2; +} + +# returns true if strands are equal or either is zero +sub _weak { + my ($r1, $r2) = @_; + my ($s1, $s2) = ($r1->strand(), $r2->strand()); + return 1 if $s1 == 0 || $s2 == 0 || $s1 == $s2; +} + +# returns true for any strandedness +sub _ignore { + return 1; +} + +# works out what test to use for the strictness and returns true/false +# e.g. $r1->_testStrand($r2, 'strong') +sub _testStrand() { + my ($r1, $r2, $comp) = @_; + return 1 unless $comp; + my $func = $STRAND_OPTIONS{$comp}; + return $r1->$func($r2); +} + +=head1 Abstract methods + +These methods must be implemented in all subclasses. + +=head2 start + + Title : start + Usage : $start = $range->start(); + Function: get/set the start of this range + Returns : the start of this range + Args : optionaly allows the start to be set + using $range->start($start) + +=cut + +sub start { + shift->throw_not_implemented(); +} + +=head2 end + + Title : end + Usage : $end = $range->end(); + Function: get/set the end of this range + Returns : the end of this range + Args : optionaly allows the end to be set + using $range->end($end) + +=cut + +sub end { + shift->throw_not_implemented(); +} + +=head2 length + + Title : length + Usage : $length = $range->length(); + Function: get/set the length of this range + Returns : the length of this range + Args : optionaly allows the length to be set + using $range->length($length) + +=cut + +sub length { + shift->throw_not_implemented(); +} + +=head2 strand + + Title : strand + Usage : $strand = $range->strand(); + Function: get/set the strand of this range + Returns : the strandidness (-1, 0, +1) + Args : optionaly allows the strand to be set + using $range->strand($strand) + +=cut + +sub strand { + shift->throw_not_implemented(); +} + +=head1 Boolean Methods + +These methods return true or false. They throw an error if start and +end are not defined. + + $range->overlaps($otherRange) && print "Ranges overlap\n"; + +=head2 overlaps + + Title : overlaps + Usage : if($r1->overlaps($r2)) { do stuff } + Function: tests if $r2 overlaps $r1 + Args : arg #1 = a range to compare this one to (mandatory) + arg #2 = strand option ('strong', 'weak', 'ignore') (optional) + Returns : true if the ranges overlap, false otherwise + +=cut + +sub overlaps { + my ($self, $other, $so) = @_; + + $self->throw("start is undefined") unless defined $self->start; + $self->throw("end is undefined") unless defined $self->end; + $self->throw("not a Bio::RangeI object") unless defined $other && + $other->isa('Bio::RangeI'); + $other->throw("start is undefined") unless defined $other->start; + $other->throw("end is undefined") unless defined $other->end; + + return + ($self->_testStrand($other, $so) + and not ( + ($self->start() > $other->end() or + $self->end() < $other->start() ) + )); +} + +=head2 contains + + Title : contains + Usage : if($r1->contains($r2) { do stuff } + Function: tests whether $r1 totally contains $r2 + Args : arg #1 = a range to compare this one to (mandatory) + alternatively, integer scalar to test + arg #2 = strand option ('strong', 'weak', 'ignore') (optional) + Returns : true if the argument is totaly contained within this range + +=cut + +sub contains { + my ($self, $other, $so) = @_; + $self->throw("start is undefined") unless defined $self->start; + $self->throw("end is undefined") unless defined $self->end; + + if(defined $other && ref $other) { # a range object? + $other->throw("Not a Bio::RangeI object") unless $other->isa('Bio::RangeI'); + $other->throw("start is undefined") unless defined $other->start; + $other->throw("end is undefined") unless defined $other->end; + + return ($self->_testStrand($other, $so) and + $other->start() >= $self->start() and + $other->end() <= $self->end()); + } else { # a scalar? + $self->throw("'$other' is not an integer.\n") unless $other =~ /^[-+]?\d+$/; + return ($other >= $self->start() and $other <= $self->end()); + } +} + +=head2 equals + + Title : equals + Usage : if($r1->equals($r2)) + Function: test whether $r1 has the same start, end, length as $r2 + Args : a range to test for equality + Returns : true if they are describing the same range + +=cut + +sub equals { + my ($self, $other, $so) = @_; + + $self->throw("start is undefined") unless defined $self->start; + $self->throw("end is undefined") unless defined $self->end; + $other->throw("Not a Bio::RangeI object") unless $other->isa('Bio::RangeI'); + $other->throw("start is undefined") unless defined $other->start; + $other->throw("end is undefined") unless defined $other->end; + + return ($self->_testStrand($other, $so) and + $self->start() == $other->start() and + $self->end() == $other->end() ); +} + +=head1 Geometrical methods + +These methods do things to the geometry of ranges, and return +Bio::RangeI compliant objects or triplets (start, stop, strand) from +which new ranges could be built. + + +=head2 intersection + + Title : intersection + Usage : ($start, $stop, $strand) = $r1->intersection($r2) + Function: gives the range that is contained by both ranges + Args : arg #1 = a range to compare this one to (mandatory) + arg #2 = strand option ('strong', 'weak', 'ignore') (optional) + Returns : undef if they do not overlap, + or the range that they do overlap + (in an objectlike the calling one) + +=cut + +sub intersection { + my ($self, $other, $so) = @_; + return unless $self->_testStrand($other, $so); + + $self->throw("start is undefined") unless defined $self->start; + $self->throw("end is undefined") unless defined $self->end; + $other->throw("Not a Bio::RangeI object") unless $other->isa('Bio::RangeI'); + $other->throw("start is undefined") unless defined $other->start; + $other->throw("end is undefined") unless defined $other->end; + + my @start = sort {$a<=>$b} + ($self->start(), $other->start()); + my @end = sort {$a<=>$b} + ($self->end(), $other->end()); + + my $start = pop @start; + my $end = shift @end; + + my $union_strand; # Strand for the union range object. + + if($self->strand == $other->strand) { + $union_strand = $other->strand; + } else { + $union_strand = 0; + } + + if($start > $end) { + return undef; + } else { + return $self->new('-start' => $start, + '-end' => $end, + '-strand' => $union_strand + ); + #return ($start, $end, $union_strand); + } +} + +=head2 union + + Title : union + Usage : ($start, $stop, $strand) = $r1->union($r2); + : ($start, $stop, $strand) = Bio::RangeI->union(@ranges); + my $newrange = Bio::RangeI->union(@ranges); + Function: finds the minimal range that contains all of the ranges + Args : a range or list of ranges to find the union of + Returns : the range object containing all of the ranges + +=cut + +sub union { + my $self = shift; + my @ranges = @_; + if(ref $self) { + unshift @ranges, $self; + } + + my @start = sort {$a<=>$b} + map( { $_->start() } @ranges); + my @end = sort {$a<=>$b} + map( { $_->end() } @ranges); + + my $start = shift @start; + while( !defined $start ) { + $start = shift @start; + } + + my $end = pop @end; + + my $union_strand; # Strand for the union range object. + + foreach(@ranges) { + if(! defined $union_strand) { + $union_strand = $_->strand; + next; + } else { + if($union_strand ne $_->strand) { + $union_strand = 0; + last; + } + } + } + return undef unless $start or $end; + if( wantarray() ) { + return ( $start,$end,$union_strand); + } else { + return $self->new('-start' => $start, + '-end' => $end, + '-strand' => $union_strand + ); + } +} + +=head2 overlap_extent + + Title : overlap_extent + Usage : ($a_unique,$common,$b_unique) = $a->overlap_extent($b) + Function: Provides actual amount of overlap between two different + ranges. + Example : + Returns : array of values for + - the amount unique to a + - the amount common to both + - the amount unique to b + Args : a range + + +=cut + +sub overlap_extent{ + my ($a,$b) = @_; + + $a->throw("start is undefined") unless defined $a->start; + $a->throw("end is undefined") unless defined $a->end; + $b->throw("Not a Bio::RangeI object") unless $b->isa('Bio::RangeI'); + $b->throw("start is undefined") unless defined $b->start; + $b->throw("end is undefined") unless defined $b->end; + + my ($au,$bu,$is,$ie); + if( ! $a->overlaps($b) ) { + return ($a->length,0,$b->length); + } + + if( $a->start < $b->start ) { + $au = $b->start - $a->start; + } else { + $bu = $a->start - $b->start; + } + + if( $a->end > $b->end ) { + $au += $a->end - $b->end; + } else { + $bu += $b->end - $a->end; + } + my $intersect = $a->intersection($b); + $ie = $intersect->end; + $is = $intersect->start; + + return ($au,$ie-$is+1,$bu); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Err.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Err.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1138 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::Root::Err.pm +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 22 July 1996 +# REVISION: $Id: Err.pm,v 1.15 2002/10/22 07:38:37 lapp Exp $ +# STATUS : Alpha +# +# For documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# Copyright (c) 1996-8 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# Retain this notice and note any modifications made. +#----------------------------------------------------------------------------- + +package Bio::Root::Err; +use strict; + +use Bio::Root::Global qw(:devel $CGI); +use Bio::Root::Vector (); +use Bio::Root::Object;# qw(:std); +use Exporter (); + +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); +@ISA = qw( Bio::Root::Object Bio::Root::Vector Exporter ); +@EXPORT = qw(); +@EXPORT_OK = qw( %ERR_FIELDS @ERR_TYPES &format_stack_entry &throw &warning); +%EXPORT_TAGS = ( + data => [qw(%ERR_FIELDS @ERR_TYPES)], + std => [qw(&throw &warning)] + ); + +use vars qw($ID $VERSION); +$ID = 'Bio::Root::Err'; +$VERSION = 0.041; + +%Bio::Root::Err::ERR_FIELDS = (TYPE=>1, MSG=>1, NOTE=>1, CONTEXT=>1, + TECH=>1, STACK=>1 ); + +@Bio::Root::Err::ERR_TYPES = qw(WARNING EXCEPTION FATAL); + + +## MAIN POD DOCUMENTATION: + +=head1 NAME + +Bio::Root::Err - Exception class for Perl 5 objects + +=head1 SYNOPSIS + +=head2 Object Creation + +B is a wrapper for Bio::Root::Err.pm objects so clients +do not have to create these objects directly. Please see +B as well as L<_initialize>() +for a more complete treatment +of how to create Bio::Root::Err.pm objects. + + use Bio::Root::Err; + + $err = Bio::Root::Err->new(-MSG =>"Bad data: $data", + -STACK =>[\caller(0), \caller(1), ...], + ); + + +To use the L() method directly: + + use Bio::Root::Err (:std); + + throw( $object_ref, 'Error message', 'additional note', 'technical note'); + +The C<$object_ref> argument should be a reference to a Bio::Root::Object.pm. + +See also L. + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the INSTALL file. + +=head1 DESCRIPTION + +A Bio::Root::Err.pm object encapsulates data and methods that facilitate +working with errors and exceptional conditions that arise in Perl objects. +There are no biological semantics in this module, as one may suspect from its +location in the Bio:: hierarchy. The location of this module serves to +separate it from the namespaces of other Perl Error modules. It also makes it +convenient for use by Bio:: objects. + +The motivation for having an error object is to allow +Perl 5 objects to deal with errors or exceptional conditions that +can arise during their construction or manipulation. For example: + + (1) A complex object can break in many ways. + (2) Tracking errors within a set of nested objects can be difficult. + (3) The way an error is reported should be context-sensitive: + a web-user needs different information than does the + software engineer. + +Bio::Root::Err.pm, along with B, attempt to make such +problems tractable. Please see the L documentation for more +about my error handling philosophy. + +A B object is an example of a Vector-Object: This module +inherits both from B and B. This +permits a single Err object to exist within a linked list of Err objects OR +alone. See the L documentation for more about Vector-Objects. + +B + +=head2 Other Exception Strategies + +Exception handling with Perl 5 objects is currently not as evolved as one +would like. The error handling used by B and Bio::Root::Err.pm +relies on Perl's built-in error/exception handling with eval/die, +which is not very object-aware. What I've attempted to do with these +modules is to make eval/die more object-savvy, as well as make Perl 5 +objects more eval/die-savvy (but the current strategy is basically a hack). + +It would be great if Perl could throw an object reference with die(). +This would permit more intelligent and easy to write exception handlers. +For now the Err.pm object is reconstructed from the output of L(). + +There are some other third-party Exception classes such as +Torsten Ekedahl's B or Ken Steven's Throwable.pm or +Graham Barr's Error.pm (see L). These modules +attempt to introduce a traditional "try-catch-throw" exception handling mechanism +into Perl. Future version of my modules (and perhaps Perl itself) may utilize one +of these. + +=head1 USAGE + +A demo script that illustrates working with Bio::Root::Err objects is +examples/root_object/error.pl. + + +=head1 DEPENDENCIES + +Bio::Root::Err.pm inherits from B and B. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl modules. +Send your comments and suggestions preferably to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and +their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR Steve Chervitz + +Email sac@bioperl.org + +See L section for where to send bug reports and comments. + +=head1 VERSION + +Bio::Root::Err.pm, 0.041 + + +=head1 SEE ALSO + + Bio::Root::Object.pm - Core object + Bio::Root::Vector.pm - Vector object + Bio::Root::Global.pm - Manages global variables/constants + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/ - Bioperl Project Homepage + +=head2 Other Exception Modules + + Experimental::Exception.pm - ftp://ftp.matematik.su.se/pub/teke/ + Error.pm - http://www.cpan.org/authors/id/GBARR/ + Throwable.pm - mailto:kstevens@globeandmail.ca + + http://genome-www.stanford.edu/perlOOP/exceptions.html + +=head1 ACKNOWLEDGEMENTS + +This module was developed under the auspices of the Saccharomyces Genome +Database: + http://genome-www.stanford.edu/Saccharomyces + +Other Bioperl developers contributed ideas including Ewan Birney, Ian Korf, +Chris Dagdigian, Georg Fuellen, and Steven Brenner. + +=head1 COPYRIGHT + +Copyright (c) 1996-8 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=head1 TODO + +=over 2 + +=item * Improve documentation. + +=item * Experiment with other Exception modules. + +=back + +=cut + +## END MAIN POD DOCUMENTATION' + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + + +######################################################## +# CONSTRUCTOR # +######################################################## + + +=head2 _initialize + + Usage : n/a; automatically called by Bio::Root::Object::new() + Purpose : Initializes key Bio::Root::Err.pm data. + Returns : String (the -MAKE constructor option.) + Argument : Named parameters passed from new() + : (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). + : -MSG => basic description of the exception. + : -NOTE => additional note to indicate cause of exception + : or provide information about how to fix/report it + : -TECH => addition note with technical information + : of interest to developer. + : -STACK => array reference containing caller() data + : -TYPE => string, one of @Bio::Root::Err::ERR_TYPES + : (default = exception). + : -CONTEXT => array reference + : -OBJ => Err object to be cloned. + +See Also : B + +=cut + +#---------------- +sub _initialize { +#---------------- + my( $self, @param ) = @_; + + my $make = $self->Bio::Root::Object::_initialize( @param ); + + my( $msg, $note, $tech, $stack, $type, $context, $obj) = + $self->_rearrange([qw(MSG NOTE TECH STACK TYPE CONTEXT OBJ)], @param); + + ## NOTE: Don't eval {} the construction process for Err objects. + + if($make =~ /clone/i) { + $self->_set_clone($obj); + } else { + if(!$self->_build_from_string($msg, $note, $tech)) { +# print "Failed to rebuild: msg = $msg";; + $self->set('msg', $msg ); + $self->_set_type( $type ); + $self->_set_context($context); + $self->_set_list_data('note', $note ); + $self->_set_list_data('tech', $tech ); + $self->_set_list_data('stack', $stack ); + } + $self->set_display(); + } + + $DEBUG and do{ print STDERR "---> Initialized Err (${\ref($self)}).\n\n"; + # $self->print(); + }; + $make; +} + +## +## Destructor: Not needed currently. Perhaps if and when Vector is used by delegation. +## + +##################################################################################### +## ACCESSORS ## +##################################################################################### + + + +=head2 _set_clone + + Usage : n/a; internal method used by _initialize() + Purpose : Copy all Bio::Root::Err.pm data members into a new object reference. + Argument : object ref for object to be cloned. + Comments : Does not cloning the vector since this method is + : typically used to extract a single Err object from its vector. + +=cut + +#--------------- +sub _set_clone { +#--------------- + my($self, $obj) = @_; + + ref($obj) || throw($self, "Can't clone $ID object: Not an object ref ($obj)"); + + $self->{'_type'} = $obj->{'_type'}; + $self->{'_msg'} = $obj->{'_msg'}; + $self->{'_note'} = $obj->{'_note'}; + $self->{'_tech'} = $obj->{'_tech'}; + $self->{'_stack'} = $obj->{'_stack'}; + $self->{'_context'} = $obj->{'_context'}; +# $self->clone_vector($obj); +} + + +=head2 _build_from_string + + Usage : n/a; called by _initialize() + Purpose : Re-create an Err.pm object from a string containing Err data. + Returns : boolean, (was the Err.pm object rebuilt?) + Argument : message, note, tech passed from _initialize() + : The message is examined to see if it contains a stringified error. + +See Also : L<_initialize>(), L(), L<_has_err>() + +=cut + +#---------------------- +sub _build_from_string { +#---------------------- + my ($self, $msg, $note, $tech) = @_; + my @list = split "\n", $msg; + my ($mode,$line); + my $rebuilt = 0; + + # print "$ID: Attempting to build from string: $msg";; + + MEMBER: + foreach $line (@list) { + if($line =~ /^-+$/) { last MEMBER; } + if($line =~ /^-+ (\w+) -+$/) { $self->{'_type'} = $1; $rebuilt = 1; next MEMBER; } + if($line =~ /^MSG: *(\w.*)/) { my $msg = $1; + if($self->_has_err($msg)) { + die "Duplicate error."; + } + $self->{'_msg'} = $msg; + $mode = 'msg'; + next MEMBER; } + if($line =~ /^CONTEXT: *(\w.*)/) { push @{$self->{'_context'}}, $1; $mode = 'context'; next MEMBER; } + if($line =~ /^NOTE: *(\w.*)/) { push @{$self->{'_note'}}, $1; $mode = 'note'; next MEMBER; } + if($line =~ /^TECH: *(\w.*)/) { push @{$self->{'_tech'}}, $1; $mode = 'tech'; next MEMBER; } + if($line =~ /^STACK:/) { $mode = 'stack'; next MEMBER; } + next MEMBER if !$mode; + SWITCH: { + local $_ = $mode; + m/msg/ && do{ $self->{'_msg'} .= "$line\n"; last SWITCH; }; + m/note/ && do{ push @{$self->{'_note'}}, $line; last SWITCH; }; + m/context/ && do{ push @{$self->{'_context'}}, $line; last SWITCH; }; + m/tech/ && do{ push @{$self->{'_tech'}}, $line; last SWITCH; }; + m/stack/ && do{ push @{$self->{'_stack'}}, $line; last SWITCH; }; + next MEMBER; + } + } + if($rebuilt) { + ## Optionally add additional notes. + $self->_set_list_data('note', $note) if defined $note; + $self->_set_list_data('tech', $tech) if defined $tech; + } + + $rebuilt; +} + + +=head2 _has_err + + Usage : n/a; internal method called by _build_from_string() + Purpose : Deterimine if an Err has already been set to prevent duplicate Errs. + Returns : boolean + +See Also : L<_build_from_string>() + +=cut + +#------------- +sub _has_err { +#------------- + my ($self, $msg) = @_; + + $msg =~ s/^\s+//; + $msg =~ s/\s+$//; + + my $err = $self->first; + my ($existing_msg); + do { +# print "checking err object $self\n"; + $existing_msg = $err->msg; + $existing_msg =~ s/^\s+//; + $existing_msg =~ s/\s+$//; +# print " msg: $existing_msg";; + return 1 if $existing_msg eq $msg; + + } while($err = $err->next); + + 0; +} + + +=head2 _set_type + + Usage : n/a; internal method + Purpose : Sets the type of Err (warning, exception, fatal) + : Called by _initialize() + Argument : string + +=cut + +#---------------- +sub _set_type { +#---------------- + my( $self, $data ) = @_; + $data ||= 'EXCEPTION'; + +# printf "\n$ID: Setting type (%s) for err = %s\n", $data, $self->msg;; + + my (@type); + if( @type = grep /$data/i, @Bio::Root::Err::ERR_TYPES ) { + $self->{'_type'} = $type[0]; + } else { + $self->{'_type'} = 'EXCEPTION'; + } + +# print "type = $self->{'_type'} for $self";; +} + + + +=head2 _set_list_data + + Usage : n/a; internal method used by set(). + : $err->_set_list_data( $member, $data); + Purpose : For data members which are anonymous arrays: note, tech, stack, + : adds the given data to the list. + Arguments : $member = any of qw(note tech stack) + : $data = string + Comments : Splits $data on tab. Each item + : of the split is a new entry. + : To clobber the current data (unusual situation), you must first + : call set() with no data then call again with desired data. + +=cut + +#------------------- +sub _set_list_data { +#------------------- + my( $self, $member, $data ) = @_; + + # Sensitive to data member name changes. + $member = "_\l$member"; + +# $DEBUG && do {printf STDERR "\n$ID: Setting \"%s\" list data (%s)\n", $member, $data;; }; + + defined $self->{$member} and return $self->_add_list_data( $member, $data ); + + if( $data ) { + $self->{$member} = []; + if( $member =~ /stack/) { + foreach (@$data) { + push @{ $self->{$member}}, format_stack_entry(@$_); + } + } else { + my @entries = split "\t", $data; + foreach (@entries) { + next if /^$/; +# $DEBUG && do {print STDERR "adding $member: $_";;}; + push @{ $self->{$member}}, $_; + } + } + } else { + $self->{$member} = undef; + } +} + + +=head2 _set_context + + Usage : n/a; internal method used by set(). + Purpose : Sets the object containment context for the exception. + : (this is the hierarchy of objects in which the + : exception occurred.) + +=cut + +#------------------ +sub _set_context { +#------------------ + my($self, $aref) = @_; + + eval { + if (!ref $aref) { +# push @{$aref}, sprintf "object %s \"%s\"",ref($self->parent), $self->parent->name; + push @{$aref}, "UNKNOWN CONTEXT"; + } + }; + if($@) { push @{$aref}, 'undefined object'; } + + if($self->type eq 'EXCEPTION') { + $aref->[0] = "Exception thrown by \l$aref->[0]"; + } else { + $aref->[0] = "Error in \l$aref->[0]"; + } + + my $script = ($0 =~ /([\w\/\.]+)/, $1); + push @$aref, "SCRIPT: $script"; + + $self->{'_context'} = $aref; + +# print "$ID: _set_context():\n"; +# foreach(@$aref) { print " $_\n"; } +# ; +} + + + +=head2 set + + Usage : $err->set( $member, $data ); + Purpose : General accessor for setting any Err.pm data member. + Example : $err->set('note', 'this is an additional note.'); + Returns : n/a + Argument : $member = string, any of qw(msg type note tech stack) + : $data = string + Throws : n/a + Comments : Note, tech, and stack items are appended to any existing + : notes, tech notes, and stack. + : There should be no need to mess with the stack. + +=cut + +#--------- +sub set { +#--------- + my( $self, $member, $data ) = @_; + + local $_ = "\u$member"; + SWITCH: { + /msg/i && do{ $self->{'_msg'} = (defined $data ? $data : 'Unknown error'); last SWITCH; }; + /type/i && do{ $self->_set_type( $data ); last SWITCH; }; + /note|tech|stack/i && do{ $self->_set_list_data( $member, $data); last SWITCH}; + warn "\n*** Invalid or unspecified Err data member: $member\n\n"; + } +} + + +=head2 msg + + Usage : $message = $err->msg; + Purpose : Get the main message associated with the exception. + Returns : String + Argument : optional string to be used as a delimiter. + +See Also : L(), L() + +=cut + + +#------- +sub msg { my($self,$delimiter) = @_; $self->get('msg',$delimiter); } +#------- + + +=head2 type + + Usage : $type = $err->type; + Purpose : Get the type of Err (warning, exception, fatal) + Returns : String + Argument : optional string to be used as a delimiter. + +See Also : L(), L() + +=cut + +#-------- +sub type { my($self,$delimiter) = @_; $self->get('type',$delimiter); } +#-------- + + +=head2 note + + Usage : $note = $err->note; + : $note = $err->note('

'); + Purpose : Get any general note associated with the exception. + Returns : String + Argument : optional string to be used as a delimiter. + +See Also : L(), L() + +=cut + +#--------- +sub note { my($self,$delimiter) = @_; $self->get('note',$delimiter); } +#--------- + + +=head2 tech + + Usage : $tech = $err->tech; + : $tech = $err->tech('

'); + Purpose : Get any technical note associate with the exception. + Returns : String + Argument : optional string to be used as a delimiter. + +See Also : L(), L() + +=cut + +#------------ +sub tech { my($self,$delimiter) = @_; $self->get('tech',$delimiter); } +#------------ + + + +=head2 stack + + Usage : $stack = $err->stack; + : $stack = $err->stack('

'); + Purpose : Get the call stack for the exception. + Returns : String + Argument : optional string to be used as a delimiter. + +See Also : L(), L() + +=cut + +#---------- +sub stack { my($self,$delimiter) = @_; $self->get('stack',$delimiter); } +#---------- + + + +=head2 context + + Usage : $context = $err->context; + : $context = $err->context('

'); + Purpose : Get the containment context of the object which generated the exception. + Returns : String + Argument : optional string to be used as a delimiter. + +See Also : L(), L() + +=cut + +#------------ +sub context { my($self,$delimiter) = @_; $self->get('context',$delimiter); } +#------------ + + + +=head2 get + + Usage : $err->get($member, $delimiter); + Purpose : Get specific data from the Err.pm object. + Returns : String in scalar context. + : Array in list context. + Argument : $member = any of qw(msg type note tech stack context) or combination. + : $delimiter = optional string to be used as a delimiter + : between member data. + +See Also : L(), L(), L(), L(), L(), L(), L() + +=cut + +#--------- +sub get { +#--------- + my( $self, $member, $delimiter ) = @_; + + my $outer_delim = $delimiter || "\n"; +# my $outer_delim = ($CGI ? "\n

" : $delimiter); ## Subtle bug here. + + my (@out); + local $_ = $member; + SWITCH: { + /type/i && do{ push (@out, $self->{'_type'},$outer_delim) }; +# /msg/i && do{ print "getting msg";; push (@out, (defined $self->{'_msg'} ? $self->{'_msg'} : ''),$outer_delim); print "msg: @out<---";; }; + /msg/i && do{ push (@out, (defined $self->{'_msg'} ? $self->{'_msg'} : ''),$outer_delim); }; + /note/i && do{ push (@out, $self->_get_list_data('note', $delimiter ),$outer_delim) }; + /tech/i && do{ push (@out, $self->_get_list_data('tech', $delimiter ),$outer_delim) }; + /stack/i && do{ push (@out, $self->_get_list_data('stack', $delimiter ),$outer_delim) }; + /context/i && do{ push (@out, $self->_get_list_data('context', $delimiter ),$outer_delim) }; + + ## CAN'T USE THE FOLLOWING FORM SINCE IT FAILS WHEN $member EQUALS 'msgnote'. +# /note|tech|stack/ && do{ push @out, $self->_get_list_data( $_, $delimiter ); }; + + last SWITCH; + $self->warn("Invalid or undefined Err data member ($member)."); + } +# $DEBUG && do{ print STDERR "OUTER DELIM = $outer_delim \nOUT: \n @out <---";;}; + wantarray ? @out : join('',@out); +} + + + +=head2 _get_list_data + + Usage : n/a; internal method used by get() + Purpose : Gets data for members which are list refs (note, tech, stack, context) + Returns : Array + Argument : ($member, $delimiter) + +See Also : L() + +=cut + +#------------------- +sub _get_list_data { +#------------------- + my( $self, $member, $delimiter ) = @_; + $delimiter ||= "\t"; + # Sensitive to data member name changes. + $member = "_\l$member"; + return if !defined $self->{$member}; + join( $delimiter, @{$self->{$member}} ); +} + + + +=head2 get_all + + Usage : (same as get()) + Purpose : Get specific data from all errors in an Err.pm object. + Returns : Array in list context. + : String in scalar context. + Argument : (same as get()) + +See Also : L() + +=cut + +#------------ +sub get_all { +#------------ + my( $self, $member, $delimiter ) = @_; + + if( $self->size() == 1) { + return $self->get( $member, $delimiter); + } else { + my $err = $self; + + ### Return data from multiple errors in a list. + if(wantarray) { + my @out; + do{ push @out, $err->get( $member); + } while($err = $err->prev()); + return @out; + + } else { + ### Return data from multiple errors in a string with each error's data + ### bracketed by a "Error #n\n" line and two delimiters. + my $out = ''; + if($err->size() == 1) { + $out = $err->get( $member, $delimiter); + } else { + do{ #$out .= "Error #${\$err->rank()}$delimiter"; + $out .= $err->get( $member, $delimiter); + $out .= $delimiter.$delimiter; + } while($err = $err->prev()); + } + return $out; + } + } +} + +##################################################################################### +## INSTANCE METHODS ## +##################################################################################### + + +=head2 _add_note + + Usage : n/a; internal method called by _add_list_data() + Purpose : adds a new note. + +See Also : L<_add_list_data>() + +=cut + +#--------------- +sub _add_note { +#--------------- + my( $self, $data ) = @_; + + if( defined $self->{'_note'} ) { + push @{ $self->{'_note'}}, $data; + } else { + $self->_set_list_data('note', $data ); + } +} + +#---------------------------------------------------------------------- +=head2 _add_tech() + + Usage : n/a; internal method called by _add_list_data() + Purpose : adds a new technical note. + +See Also : L<_add_list_data>() + +=cut + +#------------- +sub _add_tech { +#------------- + my( $self, $data ) = @_; + + if( defined $self->{'_tech'} ) { + push @{ $self->{'_tech'}}, $data; + } else { + $self->_set_list_data('Tech', $data ); + } +} + + +=head2 _add_list_data + + Usage : n/a; called by _set_list_data() + Purpose : adds a new note or tech note. + +See Also : L<_set_list_data>() + +=cut + +#-------------------- +sub _add_list_data { +#-------------------- + my( $self, $member, $data ) = @_; + + local $_ = $member; + SWITCH: { + /note/i && do{ $self->_add_note( $data ); }; + /tech/i && do{ $self->_add_tech( $data ); }; + } +} + + + +=head2 print + + Usage : $err->print; + Purpose : Prints Err data to STDOUT or a FileHandle. + Returns : Call to print + Argument : Named parameters for string() + Comments : Uses string() to get data. + +See Also : L() + +=cut + +#----------- +sub print { +#----------- + my( $self, %param ) = @_; +# my $OUT = $self->parent->fh(); +# print $OUT $self->string(%param); + print $self->string(%param); +} + + +=head2 string + + Usage : $err->string( %named_parameters); + Purpose : Stringify the data contained in the Err.pm object. + Example : print STDERR $err->string; + Returns : String + Argument : Named parameters (optional) passed to + : Bio::Root::IOManager::set_display(). + +See Also : L(), L<_build_from_string>(), B + +=cut + +#----------- +sub string { +#----------- + my( $self, @param ) = @_; + + my %param = @param; + $self->set_display( @param ); + my $show = $self->show; + my $out = $param{-BEEP} ? "\a" : ''; + + my $err = $param{-CURRENT} ? $self->last : $self->first; + +# my $err1 = $err; +# my $errL = $self->last; +# print "\n\nERR 1: ${\$err1->msg}"; +# print "\nERR L: ${\$errL->msg}";; + + my $numerate = $err->size() >1; + my $count = 0; + my ($title); + my $hasnote = defined $self->{'_note'}; + my $hastech = defined $self->{'_tech'}; + + while (ref $err) { + $count++; +# $out .= sprintf "\nERROR #%d:", $count; + + if(not $title = $err->{'_type'}) { + $err = $err->next(); + next; + } + if( $numerate) { + ## The rank data is a bit screwy at present. + $out .= sprintf "\n%s %s %s\n", '-'x 20, $title,'-'x 20; + } else { + $out .= sprintf "\n%s %s %s\n", '-'x20, $title,'-'x20; + } + $show =~ /msg|default/i and $out .= "MSG: " . $err->msg("\n"); + $show =~ /note|default/i and $hasnote and $out .= "NOTE: ".$err->note("\n"); + $show =~ /tech|default/i and $hastech and $out .= "TECH: ".$err->tech("\n"); + $show =~ /context|default/i and $out .= "CONTEXT: ".$err->context("\n"); + $show =~ /stack|default/i and $out .= "STACK: \n".$err->stack("\n"); + $out .= sprintf "%s%s%s\n",'-'x 20, '-'x (length($title)+2), '-'x 20; + +# print "$ID: string: cumulative err:\n$out\n";; + + $err = $err->next(); + } + + $out; +} + + + +=head2 is_fatal + + Usage : $err->is_fatal; + Purpose : Determine if the error is of type 'FATAL' + Returns : Boolean + Status : Experimental, Deprecated + +=cut + +#-------------- +sub is_fatal { my $self = shift; $self->{'_type'} eq 'FATAL'; } +#-------------- + +##################################################################################### +## CLASS METHODS ## +##################################################################################### + + +=head2 throw + + Usage : throw($object, [message], [note], [technical note]); + : This method is exported. + Purpose : Class method version of Bio::Root::Object::throw(). + Returns : die()s with the contents of the Err object in a string. + : If the global strictness is less than -1, die is not called and + : the error is printed to STDERR. + Argument : [0] = object throwing the error. + : [1] = optional message about the error. + : [2] = optional note about the error. + : [3] = optional technical note about the error. + Comments : The glogal verbosity level is not used. For verbosity-sensitive + : behavior, use Bio::Root::Object::throw(). + Status : Experimental + : This method is an alternative to Bio::Root::Object::throw() + : and is not as well developed or documented as that method. + +See Also : L(), B B() + +=cut + +#---------- +sub throw { +#---------- + my($obj, @param) = @_; + +# print "Throwing exception for object ${\ref $self} \"${\$self->name}\"\n"; + my $err = new Bio::Root::Err( + -MSG =>$param[0], + -NOTE =>$param[1], + -TECH =>$param[2], + -STACK =>scalar(Bio::Root::Object::stack_trace($obj,2)), + -CONTEXT =>Bio::Root::Object::containment($obj), + -TYPE =>'EXCEPTION', + # -PARENT =>$obj, + ); + + if(strictness() < -1) { + print STDERR $err->string(-BEEP=>1) unless verbosity() < 0; + } else { + die $err->string; + } + + 0; +} + + +=head2 warning + + Usage : warning($object, [message], [note], [technical note]); + : This method is exported. + Purpose : Class method version of Bio::Root::Object::warn(). + Returns : Prints the contents of the error to STDERR and returns false (0). + : If the global strictness() is > 1, warn() calls are converted + : into throw() calls. + Argument : [0] = object producing the warning. + : [1] = optional message about the error. + : [2] = optional note about the error. + : [3] = optional technical note about the error. + : + Comments : The glogal verbosity level is not used. For verbosity-sensitive + : behavior, use Bio::Root::Object::warn(). + Status : Experimental + : This method is an alternative to Bio::Root::Object::warn() + : and is not as well developed or documented as that method. + +See Also : L, B, B + +=cut + +#----------- +sub warning { +#----------- + my($obj, @param) = @_; + +# print "Throwing exception for object ${\ref $self} \"${\$self->name}\"\n"; + my $err = new Bio::Root::Err( + -MSG =>$param[0], + -NOTE =>$param[1], + -TECH =>$param[2], + -STACK =>scalar(Bio::Root::Object::stack_trace($obj,2)), + -CONTEXT =>Bio::Root::Object::containment($obj), + -TYPE =>'WARNING', + # -PARENT =>$obj, + ); + + if(strictness() > 1) { + die $err->string; + + } else { + print STDERR $err->string(-BEEP=>1) unless $DONT_WARN; + } + + 0; +} + + +=head2 format_stack_entry + + Usage : &format_stack_entry(,,,,,) + : This function is exported. + Purpose : Creates a single stack entry given a caller() list. + Argument : List of scalars (output of the caller() method). + Returns : String = class_method($line) + : e.g., Bio::Root::Object::name(1234) + +=cut + +#------------------------ +sub format_stack_entry { +#------------------------ + my( $class, $file, $line, $classmethod, $hasargs, $wantarray) = @_; + +# if($DEBUG) { +# print STDERR "format_stack_entry data:\n"; +# foreach(@_) {print STDERR "$_\n"; } ; +# } + + $classmethod ||= 'unknown class/method'; + $line ||= 'unknown line'; + return "$classmethod($line)"; +} + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for this documentation to become obsolete as this module is still evolving. +Always double check this info and search for members not described here. + +=back + +An instance of Bio::Root::Err.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + ------------------------------------------------------------------------ + _type fatal | warning | exception (one of @Bio::Root::Err::ERR_TYPES). + + _msg Terse description: Main cause of error. + + _note List reference. Verbose description: probable cause & troubleshooting for user. + + _tech List reference. Technical notes of interest to programmer. + + _stack List reference. Stack trace: list of "class::method(line number)" strings. + + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Exception.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Exception.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,468 @@ +#----------------------------------------------------------------- +# $Id: Exception.pm,v 1.14 2002/06/29 00:42:17 sac Exp $ +# +# BioPerl module Bio::Root::Exception +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +=head1 NAME + +Bio::Root::Exception - Generic exception objects for Bioperl + +=head1 SYNOPSIS + +=head2 Throwing exceptions using B: + + use Bio::Root::Exception; + use Error; + + # Set Error::Debug to include stack trace data in the error messages + $Error::Debug = 1; + + $file = shift; + open (IN, $file) || + throw Bio::Root::FileOpenException ( "Can't open file $file for reading", $!); + +=head2 Throwing exceptions using B: + + # Here we have an object that ISA Bio::Root::Root, so it inherits throw(). + + open (IN, $file) || + $object->throw(-class => 'Bio::Root::FileOpenException', + -text => "Can't open file $file for reading", + -value => $!); + +=head2 Catching and handling exceptions using B: + + use Bio::Root::Exception; + use Error qw(:try); + + # Note that we need to import the 'try' tag from Error.pm + + # Set Error::Debug to include stack trace data in the error messages + $Error::Debug = 1; + + $file = shift; + try { + open (IN, $file) || + throw Bio::Root::FileOpenException ( "Can't open file $file for reading", $!); + } + catch Bio::Root::FileOpenException with { + my $err = shift; + print STDERR "Using default input file: $default_file\n"; + open (IN, $default_file) || die "Can't open $default_file"; + } + otherwise { + my $err = shift; + print STDERR "An unexpected exception occurred: \n$err"; + + # By placing an the error object reference within double quotes, + # you're invoking its stringify() method. + } + finally { + # Any code that you want to execute regardless of whether or not + # an exception occurred. + }; + # the ending semicolon is essential! + + +=head2 Defining a new Exception type as a subclass of Bio::Root::Exception: + + @Bio::TestException::ISA = qw( Bio::Root::Exception ); + + +=head1 DESCRIPTION + +=head2 Exceptions defined in B + +These are generic exceptions for typical problem situations that could arise +in any module or script. + +=over 8 + +=item Bio::Root::Exception() + +=item Bio::Root::NotImplemented() + +=item Bio::Root::IOException() + +=item Bio::Root::FileOpenException() + +=item Bio::Root::SystemException() + +=item Bio::Root::BadParameter() + +=item Bio::Root::OutOfRange() + +=item Bio::Root::NoSuchThing() + +=back + +Using defined exception classes like these is a good idea because it +indicates the basic nature of what went wrong in a convenient, +computable way. + +If there is a type of exception that you want to throw +that is not covered by the classes listed above, it is easy to define +a new one that fits your needs. Just write a line like the following +in your module or script where you want to use it (or put it somewhere +that is accessible to your code): + + @NoCanDoException::ISA = qw( Bio::Root::Exception ); + +All of the exceptions defined in this module inherit from a common +base class exception, Bio::Root::Exception. This allows a user to +write a handler for all Bioperl-derived exceptions as follows: + + use Bio::Whatever; + use Error qw(:try); + + try { + # some code that depends on Bioperl + } + catch Bio::Root::Exception with { + my $err = shift; + print "A Bioperl exception occurred:\n$err\n"; + }; + +So if you do create your own exceptions, just be sure they inherit +from Bio::Root::Exception directly, or indirectly by inheriting from a +Bio::Root::Exception subclass. + +The exceptions in Bio::Root::Exception are extensions of Graham Barr's +B module available from CPAN. Despite this dependency, the +Bio::Root::Exception module does not explicitly C. +This permits Bio::Root::Exception to be loaded even when +Error.pm is not available. + +=head2 Throwing exceptions within Bioperl modules + +Error.pm is not part of the Bioperl distibution, and may not be +present within any given perl installation. So, when you want to +throw an exception in a Bioperl module, the safe way to throw it +is to use B which can use Error.pm +when it's available. See documentation in Bio::Root::Root for details. + +=head1 SEE ALSO + +See the C directory of the Bioperl distribution for +working demo code. + +B for information about throwing +Bio::Root::Exception-based exceptions. + +B (available from CPAN, author: GBARR) + +Error.pm is helping to guide the design of exception handling in Perl 6. +See these RFC's: + + http://dev.perl.org/rfc/63.pod + + http://dev.perl.org/rfc/88.pod + + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 EXCEPTIONS + +=cut + +# Define some generic exceptions.' + +package Bio::Root::Exception; + +use strict; + +my $debug = $Error::Debug; # Prevents the "used only once" warning. +my $DEFAULT_VALUE = "__DUMMY__"; # Permits eval{} based handlers to work + +=head2 B + + Purpose : A generic base class for all BioPerl exceptions. + By including a "catch Bio::Root::Exception" block, you + should be able to trap all BioPerl exceptions. + Example : throw Bio::Root::Exception("A generic exception", $!); + +=cut + +#--------------------------------------------------------- +@Bio::Root::Exception::ISA = qw( Error ); +#--------------------------------------------------------- + +=head2 Methods defined by Bio::Root::Exception + +=over 4 + +=item B< new() > + + Purpose : Guarantees that -value is set properly before + calling Error::new(). + + Arguments: key-value style arguments same as for Error::new() + + You can also specify plain arguments as ($message, $value) + where $value is optional. + + -value, if defined, must be non-zero and not an empty string + in order for eval{}-based exception handlers to work. + These require that if($@) evaluates to true, which will not + be the case if the Error has no value (Error overloads + numeric operations to the Error::value() method). + + It is OK to create Bio::Root::Exception objects without + specifing -value. In this case, an invisible dummy value is used. + + If you happen to specify a -value of zero (0), it will + be replaced by the string "The number zero (0)". + + If you happen to specify a -value of empty string (""), it will + be replaced by the string "An empty string ("")". + +=cut + +sub new { + my ($class, @args) = @_; + my ($value, %params); + if( @args % 2 == 0 && $args[0] =~ /^-/) { + %params = @args; + $value = $params{'-value'}; + } + else { + $params{-text} = $args[0]; + $value = $args[1]; + } + + if( defined $value and not $value) { + $value = "The number zero (0)" if $value == 0; + $value = "An empty string (\"\")" if $value eq ""; + } + else { + $value ||= $DEFAULT_VALUE; + } + $params{-value} = $value; + + my $self = $class->SUPER::new( %params ); + return $self; +} + +=item pretty_format() + + Purpose : Get a nicely formatted string containing information about the + exception. Format is similar to that produced by + Bio::Root::Root::throw(), with the addition of the name of + the exception class in the EXCEPTION line and some other + data available via the Error object. + Example : print $error->pretty_format; + +=cut + +sub pretty_format { + my $self = shift; + my $msg = $self->text; + my $stack = ''; + if( $Error::Debug ) { + $stack = $self->_reformat_stacktrace(); + } + my $value_string = $self->value ne $DEFAULT_VALUE ? "VALUE: ".$self->value."\n" : ""; + my $class = ref($self); + + my $title = "------------- EXCEPTION: $class -------------"; + my $footer = "\n" . '-' x CORE::length($title); + my $out = "\n$title\n" . + "MSG: $msg\n". $value_string. $stack. $footer . "\n"; + return $out; +} + + +# Reformatting of the stack performed by _reformat_stacktrace: +# 1. Shift the file:line data in line i to line i+1. +# 2. change xxx::__ANON__() to "try{} block" +# 3. skip the "require" and "Error::subs::try" stack entries (boring) +# This means that the first line in the stack won't have any file:line data +# But this isn't a big issue since it's for a Bio::Root::-based method +# that doesn't vary from exception to exception. + +sub _reformat_stacktrace { + my $self = shift; + my $msg = $self->text; + my $stack = $self->stacktrace(); + $stack =~ s/\Q$msg//; + my @stack = split( /\n/, $stack); + my @new_stack = (); + my ($method, $file, $linenum, $prev_file, $prev_linenum); + my $stack_count = 0; + foreach my $i( 0..$#stack ) { + # print "STACK-ORIG: $stack[$i]\n"; + if( ($stack[$i] =~ /^\s*([^(]+)\s*\(.*\) called at (\S+) line (\d+)/) || + ($stack[$i] =~ /^\s*(require 0) called at (\S+) line (\d+)/)) { + ($method, $file, $linenum) = ($1, $2, $3); + $stack_count++; + } + else{ + next; + } + if( $stack_count == 1 ) { + push @new_stack, "STACK: $method"; + ($prev_file, $prev_linenum) = ($file, $linenum); + next; + } + + if( $method =~ /__ANON__/ ) { + $method = "try{} block"; + } + if( ($method =~ /^require/ and $file =~ /Error\.pm/ ) || + ($method =~ /^Error::subs::try/ ) ) { + last; + } + push @new_stack, "STACK: $method $prev_file:$prev_linenum"; + ($prev_file, $prev_linenum) = ($file, $linenum); + } + push @new_stack, "STACK: $prev_file:$prev_linenum"; + + return join "\n", @new_stack; +} + +=item B< stringify() > + + Purpose : Overrides Error::stringify() to call pretty_format(). + This is called automatically when an exception object + is placed between double quotes. + Example : catch Bio::Root::Exception with { + my $error = shift; + print "$error"; + } + +See Also: L + +=cut + +sub stringify { + my ($self, @args) = @_; + return $self->pretty_format( @args ); +} + + + +=back + +=head1 Subclasses of Bio::Root::Exception + + +=head2 B + + Purpose : Indicates that a method has not been implemented. + Example : throw Bio::Root::NotImplemented( + -text => "Method \"foo\" not implemented in module FooBar.", + -value => "foo" ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::NotImplemented::ISA = qw( Bio::Root::Exception ); +#--------------------------------------------------------- + +=head2 B + + Purpose : Indicates that some input/output-related trouble has occurred. + Example : throw Bio::Root::IOException( + -text => "Can't save data to file $file.", + -value => $! ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::IOException::ISA = qw( Bio::Root::Exception ); +#--------------------------------------------------------- + + +=head2 B + + Purpose : Indicates that a file could not be opened. + Example : throw Bio::Root::FileOpenException( + -text => "Can't open file $file for reading.", + -value => $! ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::FileOpenException::ISA = qw( Bio::Root::IOException ); +#--------------------------------------------------------- + + +=head2 B + + Purpose : Indicates that a system call failed. + Example : unlink($file) or throw Bio::Root::SystemException( + -text => "Can't unlink file $file.", + -value => $! ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::SystemException::ISA = qw( Bio::Root::Exception ); +#--------------------------------------------------------- + + +=head2 B + + Purpose : Indicates that one or more parameters supplied to a method + are invalid, unspecified, or conflicting. + Example : throw Bio::Root::BadParameter( + -text => "Required parameter \"-foo\" was not specified", + -value => "-foo" ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::BadParameter::ISA = qw( Bio::Root::Exception ); +#--------------------------------------------------------- + + +=head2 B + + Purpose : Indicates that a specified (start,end) range or + an index to an array is outside the permitted range. + Example : throw Bio::Root::OutOfRange( + -text => "Start coordinate ($start) cannot be less than zero.", + -value => $start ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::OutOfRange::ISA = qw( Bio::Root::Exception ); +#--------------------------------------------------------- + + +=head2 B + + Purpose : Indicates that a requested thing cannot be located + and therefore could possibly be bogus. + Example : throw Bio::Root::NoSuchThing( + -text => "Accession M000001 could not be found.", + -value => "M000001" ); + +=cut + +#--------------------------------------------------------- +@Bio::Root::NoSuchThing::ISA = qw( Bio::Root::Exception ); +#--------------------------------------------------------- + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Global.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Global.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,261 @@ +#-------------------------------------------------------------------------------- +# PACKAGE : Bio::Root::Global.pm +# PURPOSE : Provides global data, objects, and methods potentially useful to +# many different modules and scripts. +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 3 Sep 1996 +# REVISION: $Id: Global.pm,v 1.8 2002/01/11 08:05:31 sac Exp $ +# +# INSTALLATION: +# This module is included with the central Bioperl distribution: +# http://bio.perl.org/Core/Latest +# ftp://bio.perl.org/pub/DIST +# Follow the installation instructions included in the README file. +# +# COMMENTS: Edit the $AUTHORITY string to a desired e-mail address. +# +# STRICTNESS, VERBOSITY, and variables containing the words WARN and FATAL +# are considered experimental. The purpose & usage of these is explained +# in Bio::Root::Object.pm. +# +# MODIFIED: +# sac --- Fri Jan 8 00:04:28 1999 +# * Added BEGIN block to set $CGI if script is running as a cgi. +# sac --- Tue Dec 1 1998 +# * Added $STRICTNESS and $VERBOSITY. +# * Deprecated WARN_ON_FATAL, FATAL_ON_WARN, DONT_WARN and related methods. +# These will eventually be removed. +# sac --- Fri 5 Jun 1998: Added @DAYS. +# sac --- Sun Aug 16 1998: Added $RECORD_ERR and &record_err(). +#-------------------------------------------------------------------------------- + +### POD Documentation: + +=head1 NAME + +Bio::Root::Global - Global variables and utility functions + +=head1 SYNOPSIS + + # no real synopsis - see Bio::Root::Object + +=head1 DESCRIPTION + +The Bio::Root::Global file contains all the global flags +about erro warning etc, and also utility functions, eg +to map numbers to roman numerals. + +These functions are generally called by Bio::Root::Object +or somewhere similar, and not directly + + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=cut + +package Bio::Root::Global; +use strict; + +BEGIN { + use vars qw($CGI $TIMEOUT_SECS); + + # $CGI is a boolean to indicate if the script is running as a CGI. + # Useful for conditionally producing HTML-formatted messages + # or suppressing messages appropriate only for interactive sessions. + + $CGI = 1 if $ENV{REMOTE_ADDR} || $ENV{REMOTE_HOST}; +} + +use Exporter (); +use vars qw($BASE_YEAR @DAYS @MONTHS); + +use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); +@ISA = qw( Exporter ); +@EXPORT_OK = qw($AUTHORITY $NEWLINE + $DEBUG $MONITOR $TESTING + $DONT_WARN $WARN_ON_FATAL $FATAL_ON_WARN $RECORD_ERR + $STRICTNESS $VERBOSITY $TIMEOUT_SECS + $CGI $GLOBAL + $BASE_YEAR %ROMAN_NUMS @MONTHS @DAYS + &roman2int &debug &monitor &testing &dont_warn &record_err + &warn_on_fatal &fatal_on_warn &strictness &verbosity + ); + +%EXPORT_TAGS = ( + + std =>[qw($DEBUG $MONITOR $TESTING $NEWLINE + $DONT_WARN $WARN_ON_FATAL $FATAL_ON_WARN $RECORD_ERR + $STRICTNESS $VERBOSITY + &debug &monitor &testing &dont_warn + &warn_on_fatal &fatal_on_warn &record_err + &strictness &verbosity + &roman2int $AUTHORITY $CGI $GLOBAL)], + + obj =>[qw($GLOBAL)], + + devel =>[qw($DEBUG $MONITOR $TESTING $DONT_WARN + $WARN_ON_FATAL $FATAL_ON_WARN $RECORD_ERR + $STRICTNESS $VERBOSITY $NEWLINE + &debug &monitor &testing &dont_warn + &strictness &verbosity + &warn_on_fatal &fatal_on_warn)], + + data =>[qw($BASE_YEAR %ROMAN_NUMS @MONTHS @DAYS)], + + ); + +# Note: record_err() is not included in the devel tag to allow Bio::Root:Object.pm +# to define it without a name clash. + +###################################### +## Data ## +###################################### + +use vars qw($AUTHORITY $DEBUG $MONITOR $TESTING $DONT_WARN $WARN_ON_FATAL + $FATAL_ON_WARN $RECORD_ERR $STRICTNESS $VERBOSITY $NEWLINE + %ROMAN_NUMS $GLOBAL); + +# Who should receive feedback from users and possibly automatic error messages. +$AUTHORITY = 'sac@bioperl.org'; + +$DEBUG = 0; +$MONITOR = 0; +$TESTING = 0; +$DONT_WARN = 0; +$WARN_ON_FATAL = 0; +$FATAL_ON_WARN = 0; +$RECORD_ERR = 0; +$STRICTNESS = 0; +$VERBOSITY = 0; +$TIMEOUT_SECS = 30; # Number of seconds to wait for input in I/O functions. + +$BASE_YEAR = 1900; +$NEWLINE = $ENV{'NEWLINE'} || undef; + +%ROMAN_NUMS = ('1'=>'I', '2'=>'II', '3'=>'III', '4'=>'IV', '5'=>'V', + '6'=>'VI', '7'=>'VII', '8'=>'VIII', '9'=>'IX', '10'=>'X', + '11'=>'XI', '12'=>'XII', '13'=>'XIII', '14'=>'XIV', '15'=>'XV', + '16'=>'XVI', '17'=>'XVII', '18'=>'XVIII', '19'=>'XIX', '20'=>'XX', + '21'=>'XXI', '22'=>'XXII', + ); + +@MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); +@DAYS = qw(Sun Mon Tue Wed Thu Fri Sat); + +# The implicit global object. Used for trapping miscellaneous errors/exceptions. +# Created without using or requiring Bio::Root::Object.pm, because Object.pm uses Global.pm. +# Just be sure to use Bio::Root::Object.pm, or a module that uses it. + +$GLOBAL = {}; +bless $GLOBAL, 'Bio::Root::Object'; +$GLOBAL->{'_name'} = 'Global object'; + + +###################################### +## Methods ## +###################################### + +sub roman2int { + my $roman = uc(shift); + foreach (keys %ROMAN_NUMS) { + return $_ if $ROMAN_NUMS{$_} eq $roman; + } +# Alternatively: +# my @int = grep $ROMAN_NUMS{$_} eq $roman, keys %ROMAN_NUMS; +# return $int[0]; + undef; +} + +sub debug { + my $level = shift; + if( defined $level) { $DEBUG = $level } + else { $DEBUG = 0 } +# $MONITOR and do{ print STDERR $DEBUG ? "Debug on ($DEBUG).\n\n" : "Debug off.\n\n"; }; + $MONITOR and do{ print STDERR $DEBUG ? "Debug on ($DEBUG).\n\n" : ""; }; + $DEBUG; +} + +sub monitor { + my $level = shift; + if( defined $level) { $MONITOR = $level } + else { $MONITOR = 0 } + $DEBUG and (print STDERR "Monitor on ($MONITOR).\n\n"); + $MONITOR; +} + +sub testing { + my $level = shift; + if( defined $level) { $TESTING = $level } + else { $TESTING = 0 } + $TESTING ? ($MONITOR && print STDERR "Testing on ($TESTING).\n\n") : ($MONITOR && print STDERR "Testing off.\n\n"); + $TESTING; +} + +sub strictness { +# Values can integers from -2 to 2 +# See Bio::Root::Object::strict() for more explanation. + my $arg = shift; + if( defined $arg) { $STRICTNESS = $arg} + $DEBUG && print STDERR "\n*** STRICTNESS: $arg ***\n\n"; + $STRICTNESS; +} + +sub verbosity { +# Values can integers from -1 to 1 +# See Bio::Root::Object::verbose() for more explanation. + my $arg = shift; + if( defined $arg) { $VERBOSITY = $arg} + $DEBUG && print STDERR "\n*** VERBOSITY: $arg ***\n\n"; + $VERBOSITY; +} + +sub record_err { + if( defined shift) { $RECORD_ERR = 1} + else { $RECORD_ERR = 0 } + $RECORD_ERR ? ($DEBUG && print STDERR "\n*** RECORD_ERR on. ***\n\n") : ($DEBUG && print STDERR "RECORD_ERR off.\n\n"); + $RECORD_ERR; +} + +## +## The following methods are deprecated and will eventually be removed. +## + +sub dont_warn { + my $arg = shift; + !$CGI and print STDERR "\n$0: Deprecated method dont_warn() called. Use verbosity(-1) instead\n"; + if( $arg) { verbosity(-1)} + else { verbosity(0); } +} + +sub warn_on_fatal { + my $arg = shift; + !$CGI and print STDERR "\n$0: Deprecated method warn_on_fatal() called. Use strictness(-2) instead\n"; + if( $arg) { strictness(-2)} + else { strictness(0); } +} + +sub fatal_on_warn { + my $arg = shift; + !$CGI and print STDERR "\n$0: Deprecated method fatal_on_warn() called. Use strictness(2) instead\n"; + if( $arg) { strictness(2)} + else { strictness(0); } +} + +##################################################################################### +# END OF PACKAGE +##################################################################################### + +1; + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/HTTPget.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/HTTPget.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,315 @@ +# $Id: HTTPget.pm,v 1.4 2002/10/22 07:38:37 lapp Exp $ +# +# BioPerl module for fallback HTTP get operations. +# Module is proxy-aware +# +# Cared for by Chris Dagdigian +# but all of the good stuff was written by +# Lincoln Stein. +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Root::HTTPget - module for fallback HTTP get operations when +LWP:: is unavailable + +=head1 SYNOPSIS + + Use Bio::Root::HTTPget; + + my $response = get('http://localhost'); + $response = get('http://localhost/images'); + + $response = eval { get('http://fred:secret@localhost/ladies_only/') + } or warn $@; + + $response = eval { get('http://jeff:secret@localhost/ladies_only/') + } or warn $@; + + $response = get('http://localhost/images/navauthors.gif'); + $response = get(-url=>'http://www.google.com', + -proxy=>'http://www.modperl.com'); + +=head1 DESCRIPTION + +This is basically an last-chance module for doing network HTTP get requests in +situations where more advanced external CPAN modules such as LWP:: are not +installed. + +The particular reason this module was developed was so that the Open Bio +Database Access code can fallback to fetching the default registry files +from http://open-bio.org/registry/ without having to depend on +external dependencies like Bundle::LWP for network HTTP access. + +The core of this module was written by Lincoln Stein. It can handle proxies +and HTTP-based proxy authentication. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + + Cared for by Chris Dagdigian + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Root::HTTPget; + +use strict; +use Bio::Root::Root; +use IO::Socket qw(:DEFAULT :crlf); +use vars '@ISA'; + +@ISA = qw(Bio::Root::Root); + + +=head2 get + + Title : get + Usage : + Function: + Example : + Returns : string + Args : + +=cut + +sub get { + my ($url,$proxy,$timeout,$auth_user,$auth_pass) = + __PACKAGE__->_rearrange([qw(URL PROXY TIMEOUT USER PASS)],@_); + my $dest = $proxy || $url; + + my ($host,$port,$path,$user,$pass) + = _http_parse_url($dest) or __PACKAGE__->throw("invalid URL $url"); + $auth_user ||= $user; + $auth_pass ||= $pass; + $path = $url if $proxy; + + # set up the connection + my $socket = _http_connect($host,$port) or __PACKAGE__->throw("can't connect: $@"); + + # the request + print $socket "GET $path HTTP/1.0$CRLF"; + print $socket "User-Agent: Bioperl fallback fetcher/1.0$CRLF"; + # Support virtual hosts + print $socket "HOST: $host$CRLF"; + + if ($auth_user && $auth_pass) { # authentication information + my $token = _encode_base64("$auth_user:$auth_pass"); + print $socket "Authorization: Basic $token$CRLF"; + } + print $socket "$CRLF"; + + # read the response + my $response; + { + local $/ = "$CRLF$CRLF"; + $response = <$socket>; + } + + my ($status_line,@other_lines) = split $CRLF,$response; + my ($stat_code,$stat_msg) = $status_line =~ m!^HTTP/1\.[01] (\d+) (.+)! + or __PACKAGE__->throw("invalid response from web server: got $response"); + + my %headers = map {/^(\S+): (.+)/} @other_lines; + if ($stat_code == 302 || $stat_code == 301) { # redirect + my $location = $headers{Location} or __PACKAGE__->throw("invalid redirect: no Location header"); + return get($location,$proxy,$timeout); # recursive call + } + + elsif ($stat_code == 401) { # auth required + my $auth_required = $headers{'WWW-Authenticate'}; + $auth_required =~ /^Basic realm="([^\"]+)"/ + or __PACKAGE__->throw("server requires unknown type of authentication: $auth_required"); + __PACKAGE__->throw("request failed: $status_line, realm = $1"); + } + + elsif ($stat_code != 200) { + __PACKAGE__->throw("request failed: $status_line"); + } + + $response = ''; + while (1) { + my $bytes = read($socket,$response,2048,length $response); + last unless $bytes > 0; + } + + $response; +} + +=head2 getFH + + Title : getFH + Usage : + Function: + Example : + Returns : string + Args : + +=cut + +sub getFH { + my ($url,$proxy,$timeout,$auth_user,$auth_pass) = + __PACKAGE__->_rearrange([qw(URL PROXY TIMEOUT USER PASS)],@_); + my $dest = $proxy || $url; + + my ($host,$port,$path,$user,$pass) + = _http_parse_url($dest) or __PACKAGE__->throw("invalid URL $url"); + $auth_user ||= $user; + $auth_pass ||= $pass; + $path = $url if $proxy; + + # set up the connection + my $socket = _http_connect($host,$port) or __PACKAGE__->throw("can't connect: $@"); + + # the request + print $socket "GET $path HTTP/1.0$CRLF"; + print $socket "User-Agent: Bioperl fallback fetcher/1.0$CRLF"; + # Support virtual hosts + print $socket "HOST: $host$CRLF"; + + if ($auth_user && $auth_pass) { # authentication information + my $token = _encode_base64("$auth_user:$auth_pass"); + print $socket "Authorization: Basic $token$CRLF"; + } + print $socket "$CRLF"; + + # read the response + my $response; + { + local $/ = "$CRLF$CRLF"; + $response = <$socket>; + } + + my ($status_line,@other_lines) = split $CRLF,$response; + my ($stat_code,$stat_msg) = $status_line =~ m!^HTTP/1\.[01] (\d+) (.+)! + or __PACKAGE__->throw("invalid response from web server: got $response"); + + my %headers = map {/^(\S+): (.+)/} @other_lines; + if ($stat_code == 302 || $stat_code == 301) { # redirect + my $location = $headers{Location} or __PACKAGE__->throw("invalid redirect: no Location header"); + return get($location,$proxy,$timeout); # recursive call + } + + elsif ($stat_code == 401) { # auth required + my $auth_required = $headers{'WWW-Authenticate'}; + $auth_required =~ /^Basic realm="([^\"]+)"/ + or __PACKAGE__->throw("server requires unknown type of authentication: $auth_required"); + __PACKAGE__->throw("request failed: $status_line, realm = $1"); + } + + elsif ($stat_code != 200) { + __PACKAGE__->throw("request failed: $status_line"); + } + + # Now that we are reasonably sure the socket and request + # are OK we pass the socket back as a filehandle so it can + # be processed by the caller... + + $socket; + +} + + +=head2 _http_parse_url + + Title : + Usage : + Function: + Example : + Returns : + Args : + +=cut + +sub _http_parse_url { + my $url = shift; + my ($user,$pass,$hostent,$path) = + $url =~ m!^http://(?:([^:]+):([^:]+)@)?([^/]+)(/?[^\#]*)! or return; + $path ||= '/'; + my ($host,$port) = split(':',$hostent); + return ($host,$port||80,$path,$user,$pass); +} + +=head2 _http_connect + + Title : + Usage : + Function: + Example : + Returns : + Args : + +=cut + +sub _http_connect { + my ($host,$port,$timeout) = @_; + my $sock = IO::Socket::INET->new(Proto => 'tcp', + Type => SOCK_STREAM, + PeerHost => $host, + PeerPort => $port, + Timeout => $timeout, + ); + $sock; +} + + +=head2 _encode_base64 + + Title : + Usage : + Function: + Example : + Returns : + Args : + +=cut + +sub _encode_base64 { + my $res = ""; + my $eol = $_[1]; + $eol = "\n" unless defined $eol; + pos($_[0]) = 0; # ensure start at the beginning + + $res = join '', map( pack('u',$_)=~ /^.(\S*)/, ($_[0]=~/(.{1,45})/gs)); + + $res =~ tr|` -_|AA-Za-z0-9+/|; # `# help emacs + # fix padding at the end + my $padding = (3 - length($_[0]) % 3) % 3; + $res =~ s/.{$padding}$/'=' x $padding/e if $padding; + # break encoded string into lines of no more than 76 characters each + if (length $eol) { + $res =~ s/(.{1,76})/$1$eol/g; + } + return $res; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/IO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/IO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,930 @@ +# $Id: IO.pm,v 1.37.2.3 2003/06/28 21:57:04 jason Exp $ +# +# BioPerl module for Bio::Root::IO +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Root::IO - module providing several methods often needed when dealing with file IO + +=head1 SYNOPSIS + + # utilize stream I/O in your module + $self->{'io'} = Bio::Root::IO->new(-file => "myfile"); + $self->{'io'}->_print("some stuff"); + $line = $self->{'io'}->_readline(); + $self->{'io'}->_pushback($line); + $self->{'io'}->close(); + + # obtain platform-compatible filenames + $path = Bio::Root::IO->catfile($dir, $subdir, $filename); + # obtain a temporary file (created in $TEMPDIR) + ($handle) = $io->tempfile(); + +=head1 DESCRIPTION + +This module provides methods that will usually be needed for any sort +of file- or stream-related input/output, e.g., keeping track of a file +handle, transient printing and reading from the file handle, a close +method, automatically closing the handle on garbage collection, etc. + +To use this for your own code you will either want to inherit from +this module, or instantiate an object for every file or stream you are +dealing with. In the first case this module will most likely not be +the first class off which your class inherits; therefore you need to +call _initialize_io() with the named parameters in order to set file +handle, open file, etc automatically. + +Most methods start with an underscore, indicating they are private. In +OO speak, they are not private but protected, that is, use them in +your module code, but a client code of your module will usually not +want to call them (except those not starting with an underscore). + +In addition this module contains a couple of convenience methods for +cross-platform safe tempfile creation and similar tasks. There are +some CPAN modules related that may not be available on all +platforms. At present, File::Spec and File::Temp are attempted. This +module defines $PATHSEP, $TEMPDIR, and $ROOTDIR, which will always be set, +and $OPENFLAGS, which will be set if either of File::Spec or File::Temp fails. + +The -noclose boolean (accessed via the noclose method) prevents a +filehandle from being closed when the IO object is cleaned up. This +is special behavior when a object like a parser might share a +filehandle with an object like an indexer where it is not proper to +close the filehandle as it will continue to be reused until the end of the +stream is reached. In general you won't want to play with this flag. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Root::IO; +use vars qw(@ISA $FILESPECLOADED $FILETEMPLOADED $FILEPATHLOADED + $TEMPDIR $PATHSEP $ROOTDIR $OPENFLAGS $VERBOSE); +use strict; + +use Symbol; +use POSIX qw(dup); +use IO::Handle; +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + +my $TEMPCOUNTER; +my $HAS_WIN32 = 0; + +BEGIN { + $TEMPCOUNTER = 0; + $FILESPECLOADED = 0; + $FILETEMPLOADED = 0; + $FILEPATHLOADED = 0; + $VERBOSE = 1; + + # try to load those modules that may cause trouble on some systems + eval { + require File::Path; + $FILEPATHLOADED = 1; + }; + if( $@ ) { + print STDERR "Cannot load File::Path: $@" if( $VERBOSE > 0 ); + # do nothing + } + + + # If on Win32, attempt to find Win32 package + + if($^O =~ /mswin/i) { + eval { + require Win32; + $HAS_WIN32 = 1; + }; + } + + # Try to provide a path separator. Why doesn't File::Spec export this, + # or did I miss it? + if($^O =~ /mswin/i) { + $PATHSEP = "\\"; + } elsif($^O =~ /macos/i) { + $PATHSEP = ":"; + } else { # unix + $PATHSEP = "/"; + } + eval { + require File::Spec; + $FILESPECLOADED = 1; + $TEMPDIR = File::Spec->tmpdir(); + $ROOTDIR = File::Spec->rootdir(); + require File::Temp; # tempfile creation + $FILETEMPLOADED = 1; + }; + if( $@ ) { + if(! defined($TEMPDIR)) { # File::Spec failed + # determine tempdir + if (defined $ENV{'TEMPDIR'} && -d $ENV{'TEMPDIR'} ) { + $TEMPDIR = $ENV{'TEMPDIR'}; + } elsif( defined $ENV{'TMPDIR'} && -d $ENV{'TMPDIR'} ) { + $TEMPDIR = $ENV{'TMPDIR'}; + } + if($^O =~ /mswin/i) { + $TEMPDIR = 'C:\TEMP' unless $TEMPDIR; + $ROOTDIR = 'C:'; + } elsif($^O =~ /macos/i) { + $TEMPDIR = "" unless $TEMPDIR; # what is a reasonable default on Macs? + $ROOTDIR = ""; # what is reasonable?? + } else { # unix + $TEMPDIR = "/tmp" unless $TEMPDIR; + $ROOTDIR = "/"; + } + if (!( -d $TEMPDIR && -w $TEMPDIR )) { + $TEMPDIR = '.'; # last resort + } + } + # File::Temp failed (alone, or File::Spec already failed) + # + # determine open flags for tempfile creation -- we'll have to do this + # ourselves + use Fcntl; + use Symbol; + $OPENFLAGS = O_CREAT | O_EXCL | O_RDWR; + for my $oflag (qw/FOLLOW BINARY LARGEFILE EXLOCK NOINHERIT TEMPORARY/){ + my ($bit, $func) = (0, "Fcntl::O_" . $oflag); + no strict 'refs'; + $OPENFLAGS |= $bit if eval { $bit = &$func(); 1 }; + } + } +} + +=head2 new + + Title : new + Usage : + Function: Overridden here to automatically call _initialize_io(). + Example : + Returns : new instance of this class + Args : named parameters + + +=cut + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + + $self->_initialize_io(@args); + return $self; +} + +=head2 _initialize_io + + Title : initialize_io + Usage : $self->_initialize_io(@params); + Function: Initializes filehandle and other properties from the parameters. + + Currently recognizes the following named parameters: + -file name of file to open + -input name of file, or GLOB, or IO::Handle object + -fh file handle (mutually exclusive with -file) + -flush boolean flag to autoflush after each write + -noclose boolean flag, when set to true will not close a + filehandle (must explictly call close($io->_fh) + Returns : TRUE + Args : named parameters + + +=cut + +sub _initialize_io { + my($self, @args) = @_; + + $self->_register_for_cleanup(\&_io_cleanup); + + my ($input, $noclose, $file, $fh, $flush) = $self->_rearrange([qw(INPUT + NOCLOSE + FILE FH + FLUSH)], @args); + + delete $self->{'_readbuffer'}; + delete $self->{'_filehandle'}; + $self->noclose( $noclose) if defined $noclose; + # determine whether the input is a file(name) or a stream + if($input) { + if(ref(\$input) eq "SCALAR") { + # we assume that a scalar is a filename + if($file && ($file ne $input)) { + $self->throw("input file given twice: $file and $input disagree"); + } + $file = $input; + } elsif(ref($input) && + ((ref($input) eq "GLOB") || $input->isa('IO::Handle'))) { + # input is a stream + $fh = $input; + } else { + # let's be strict for now + $self->throw("unable to determine type of input $input: ". + "not string and not GLOB"); + } + } + if(defined($file) && defined($fh)) { + $self->throw("Providing both a file and a filehandle for reading - only one please!"); + } + + if(defined($file) && ($file ne '')) { + $fh = Symbol::gensym(); + open ($fh,$file) || + $self->throw("Could not open $file: $!"); + $self->file($file); + } + $self->_fh($fh) if $fh; # if not provided, defaults to STDIN and STDOUT + + $self->_flush_on_write(defined $flush ? $flush : 1); + + return 1; +} + +=head2 _fh + + Title : _fh + Usage : $obj->_fh($newval) + Function: Get/set the file handle for the stream encapsulated. + Example : + Returns : value of _filehandle + Args : newvalue (optional) + +=cut + +sub _fh { + my ($obj, $value) = @_; + if ( defined $value) { + $obj->{'_filehandle'} = $value; + } + return $obj->{'_filehandle'}; +} + +=head2 mode + + Title : mode + Usage : $obj->mode() + Function: + Example : + Returns : mode of filehandle: + 'r' for readable + 'w' for writeable + '?' if mode could not be determined + Args : -force (optional), see notes. + Notes : once mode() has been called, the filehandle's mode is cached + for further calls to mode(). to override this behavior so + that mode() re-checks the filehandle's mode, call with arg + -force + +=cut + +sub mode { + my ($obj, @arg) = @_; + my %param = @arg; + return $obj->{'_mode'} if defined $obj->{'_mode'} and !$param{-force}; + + print STDERR "testing mode... " if $obj->verbose; + + # we need to dup() the original filehandle because + # doing fdopen() calls on an already open handle causes + # the handle to go stale. is this going to work for non-unix + # filehandles? -allen + + my $fh = Symbol::gensym(); + + my $iotest = new IO::Handle; + + #test for a readable filehandle; + $iotest->fdopen( dup(fileno($obj->_fh)) , 'r' ); + if($iotest->error == 0){ + + # note the hack here, we actually have to try to read the line + # and if we get something, pushback() it into the readbuffer. + # this is because solaris and windows xp (others?) don't set + # IO::Handle::error. for non-linux the r/w testing is done + # inside this read-test, instead of the write test below. ugh. + + if($^O eq 'linux'){ + $obj->{'_mode'} = 'r'; + my $line = $iotest->getline; + $obj->_pushback($line) if defined $line; + $obj->{'_mode'} = defined $line ? 'r' : 'w'; + return $obj->{'_mode'}; + } else { + my $line = $iotest->getline; + $obj->_pushback($line) if defined $line; + $obj->{'_mode'} = defined $line ? 'r' : 'w'; + return $obj->{'_mode'}; + } + } + $iotest->clearerr; + + #test for a writeable filehandle; + $iotest->fdopen( dup(fileno($obj->_fh)) , 'w' ); + if($iotest->error == 0){ + $obj->{'_mode'} = 'w'; +# return $obj->{'_mode'}; + } + + #wtf type of filehandle is this? +# $obj->{'_mode'} = '?'; + return $obj->{'_mode'}; +} + +=head2 file + + Title : file + Usage : $obj->file($newval) + Function: Get/set the filename, if one has been designated. + Example : + Returns : value of file + Args : newvalue (optional) + + +=cut + +sub file { + my ($obj, $value) = @_; + if ( defined $value) { + $obj->{'_file'} = $value; + } + return $obj->{'_file'}; +} + +=head2 _print + + Title : _print + Usage : $obj->_print(@lines) + Function: + Example : + Returns : writes output + +=cut + +sub _print { + my $self = shift; + my $fh = $self->_fh() || \*STDOUT; + print $fh @_; +} + +=head2 _readline + + Title : _readline + Usage : $obj->_readline(%args) + Function: Reads a line of input. + + Note that this method implicitely uses the value of $/ that is + in effect when called. + + Note also that the current implementation does not handle pushed + back input correctly unless the pushed back input ends with the + value of $/. + + Example : + Args : Accepts a hash of arguments, currently only -raw is recognized + passing (-raw => 1) prevents \r\n sequences from being changed + to \n. The default value of -raw is undef, allowing \r\n to be + converted to \n. + Returns : + +=cut + +sub _readline { + my $self = shift; + my %param =@_; + my $fh = $self->_fh || \*ARGV; + my $line; + + # if the buffer been filled by _pushback then return the buffer + # contents, rather than read from the filehandle + $line = shift @{$self->{'_readbuffer'}} || <$fh>; + + #don't strip line endings if -raw is specified + $line =~ s/\r\n/\n/g if( (!$param{-raw}) && (defined $line) ); + + return $line; +} + +=head2 _pushback + + Title : _pushback + Usage : $obj->_pushback($newvalue) + Function: puts a line previously read with _readline back into a buffer. + buffer can hold as many lines as system memory permits. + Example : + Returns : + Args : newvalue + +=cut + +sub _pushback { + my ($obj, $value) = @_; + + $obj->{'_readbuffer'} ||= []; + push @{$obj->{'_readbuffer'}}, $value; +} + +=head2 close + + Title : close + Usage : $io->close() + Function: Closes the file handle associated with this IO instance. + Will not close the FH if -noclose is specified + Returns : none + Args : none + +=cut + +sub close { + my ($self) = @_; + return if $self->noclose; # don't close if we explictly asked not to + if( defined $self->{'_filehandle'} ) { + $self->flush; + return if( \*STDOUT == $self->_fh || + \*STDERR == $self->_fh || + \*STDIN == $self->_fh + ); # don't close STDOUT fh + if( ! ref($self->{'_filehandle'}) || + ! $self->{'_filehandle'}->isa('IO::String') ) { + close($self->{'_filehandle'}); + } + } + $self->{'_filehandle'} = undef; + delete $self->{'_readbuffer'}; +} + + +=head2 flush + + Title : flush + Usage : $io->flush() + Function: Flushes the filehandle + Returns : none + Args : none + +=cut + +sub flush { + my ($self) = shift; + + if( !defined $self->{'_filehandle'} ) { + $self->throw("Attempting to call flush but no filehandle active"); + } + + if( ref($self->{'_filehandle'}) =~ /GLOB/ ) { + my $oldh = select($self->{'_filehandle'}); + $| = 1; + select($oldh); + } else { + $self->{'_filehandle'}->flush(); + } +} + +=head2 noclose + + Title : noclose + Usage : $obj->noclose($newval) + Function: Get/Set the NOCLOSE flag - setting this to true will + prevent a filehandle from being closed + when an object is cleaned up or explicitly closed + This is a bit of hack + Returns : value of noclose (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub noclose{ + my $self = shift; + + return $self->{'_noclose'} = shift if @_; + return $self->{'_noclose'}; +} + +sub _io_cleanup { + my ($self) = @_; + + $self->close(); + my $v = $self->verbose; + + # we are planning to cleanup temp files no matter what + if( exists($self->{'_rootio_tempfiles'}) && + ref($self->{'_rootio_tempfiles'}) =~ /array/i) { + if( $v > 0 ) { + print STDERR "going to remove files ", + join(",", @{$self->{'_rootio_tempfiles'}}), "\n"; + } + unlink (@{$self->{'_rootio_tempfiles'}} ); + } + # cleanup if we are not using File::Temp + if( $self->{'_cleanuptempdir'} && + exists($self->{'_rootio_tempdirs'}) && + ref($self->{'_rootio_tempdirs'}) =~ /array/i) { + + if( $v > 0 ) { + print STDERR "going to remove dirs ", + join(",", @{$self->{'_rootio_tempdirs'}}), "\n"; + } + $self->rmtree( $self->{'_rootio_tempdirs'}); + } +} + +=head2 exists_exe + + Title : exists_exe + Usage : $exists = $obj->exists_exe('clustalw'); + $exists = Bio::Root::IO->exists_exe('clustalw') + $exists = Bio::Root::IO::exists_exe('clustalw') + Function: Determines whether the given executable exists either as file + or within the path environment. The latter requires File::Spec + to be installed. + On Win32-based system, .exe is automatically appended to the program + name unless the program name already ends in .exe. + Example : + Returns : 1 if the given program is callable as an executable, and 0 otherwise + Args : the name of the executable + +=cut + +sub exists_exe { + my ($self, $exe) = @_; + $exe = $self if(!(ref($self) || $exe)); + $exe .= '.exe' if(($^O =~ /mswin/i) && ($exe !~ /\.(exe|com|bat|cmd)$/i)); + return $exe if(-e $exe); # full path and exists + + # Ewan's comment. I don't think we need this. People should not be + # asking for a program with a pathseparator starting it + + # $exe =~ s/^$PATHSEP//; + + # Not a full path, or does not exist. Let's see whether it's in the path. + if($FILESPECLOADED) { + foreach my $dir (File::Spec->path()) { + my $f = Bio::Root::IO->catfile($dir, $exe); + return $f if(-e $f && -x $f ); + } + } + return 0; +} + +=head2 tempfile + + Title : tempfile + Usage : my ($handle,$tempfile) = $io->tempfile(); + Function: Returns a temporary filename and a handle opened for writing and + and reading. + + Caveats : If you do not have File::Temp on your system you should avoid + specifying TEMPLATE and SUFFIX. (We don't want to recode + everything, okay?) + Returns : a 2-element array, consisting of temporary handle and temporary + file name + Args : named parameters compatible with File::Temp: DIR (defaults to + $Bio::Root::IO::TEMPDIR), TEMPLATE, SUFFIX. + +=cut + +#' +sub tempfile { + my ($self, @args) = @_; + my ($tfh, $file); + my %params = @args; + + # map between naming with and without dash + foreach my $key (keys(%params)) { + if( $key =~ /^-/ ) { + my $v = $params{$key}; + delete $params{$key}; + $params{uc(substr($key,1))} = $v; + } else { + # this is to upper case + my $v = $params{$key}; + delete $params{$key}; + $params{uc($key)} = $v; + } + } + $params{'DIR'} = $TEMPDIR if(! exists($params{'DIR'})); + unless (exists $params{'UNLINK'} && + defined $params{'UNLINK'} && + ! $params{'UNLINK'} ) { + $params{'UNLINK'} = 1; + } else { $params{'UNLINK'} = 0 } + + if($FILETEMPLOADED) { + if(exists($params{'TEMPLATE'})) { + my $template = $params{'TEMPLATE'}; + delete $params{'TEMPLATE'}; + ($tfh, $file) = File::Temp::tempfile($template, %params); + } else { + ($tfh, $file) = File::Temp::tempfile(%params); + } + } else { + my $dir = $params{'DIR'}; + $file = $self->catfile($dir, + (exists($params{'TEMPLATE'}) ? + $params{'TEMPLATE'} : + sprintf( "%s.%s.%s", + $ENV{USER} || 'unknown', $$, + $TEMPCOUNTER++))); + + # sneakiness for getting around long filenames on Win32? + if( $HAS_WIN32 ) { + $file = Win32::GetShortPathName($file); + } + + # taken from File::Temp + if ($] < 5.006) { + $tfh = &Symbol::gensym; + } + # Try to make sure this will be marked close-on-exec + # XXX: Win32 doesn't respect this, nor the proper fcntl, + # but may have O_NOINHERIT. This may or may not be in Fcntl. + local $^F = 2; + # Store callers umask + my $umask = umask(); + # Set a known umaskr + umask(066); + # Attempt to open the file + if ( sysopen($tfh, $file, $OPENFLAGS, 0600) ) { + # Reset umask + umask($umask); + } else { + $self->throw("Could not open tempfile $file: $!\n"); + } + } + + if( $params{'UNLINK'} ) { + push @{$self->{'_rootio_tempfiles'}}, $file; + } + + + return wantarray ? ($tfh,$file) : $tfh; +} + +=head2 tempdir + + Title : tempdir + Usage : my ($tempdir) = $io->tempdir(CLEANUP=>1); + Function: Creates and returns the name of a new temporary directory. + + Note that you should not use this function for obtaining "the" + temp directory. Use $Bio::Root::IO::TEMPDIR for that. Calling this + method will in fact create a new directory. + + Returns : The name of a new temporary directory. + Args : args - ( key CLEANUP ) indicates whether or not to cleanup + dir on object destruction, other keys as specified by File::Temp + +=cut + +sub tempdir { + my ( $self, @args ) = @_; + if($FILETEMPLOADED && File::Temp->can('tempdir') ) { + return File::Temp::tempdir(@args); + } + + # we have to do this ourselves, not good + # + # we are planning to cleanup temp files no matter what + my %params = @args; + $self->{'_cleanuptempdir'} = ( defined $params{CLEANUP} && + $params{CLEANUP} == 1); + my $tdir = $self->catfile($TEMPDIR, + sprintf("dir_%s-%s-%s", + $ENV{USER} || 'unknown', $$, + $TEMPCOUNTER++)); + mkdir($tdir, 0755); + push @{$self->{'_rootio_tempdirs'}}, $tdir; + return $tdir; +} + +=head2 catfile + + Title : catfile + Usage : $path = Bio::Root::IO->catfile(@dirs,$filename); + Function: Constructs a full pathname in a cross-platform safe way. + + If File::Spec exists on your system, this routine will merely + delegate to it. Otherwise it tries to make a good guess. + + You should use this method whenever you construct a path name + from directory and filename. Otherwise you risk cross-platform + compatibility of your code. + + You can call this method both as a class and an instance method. + + Returns : a string + Args : components of the pathname (directories and filename, NOT an + extension) + +=cut + +sub catfile { + my ($self, @args) = @_; + + return File::Spec->catfile(@args) if($FILESPECLOADED); + # this is clumsy and not very appealing, but how do we specify the + # root directory? + if($args[0] eq '/') { + $args[0] = $ROOTDIR; + } + return join($PATHSEP, @args); +} + +=head2 rmtree + + Title : rmtree + Usage : Bio::Root::IO->rmtree($dirname ); + Function: Remove a full directory tree + + If File::Path exists on your system, this routine will merely + delegate to it. Otherwise it runs a local version of that code. + + You should use this method to remove directories which contain + files. + + You can call this method both as a class and an instance method. + + Returns : number of files successfully deleted + Args : roots - rootdir to delete or reference to list of dirs + + verbose - a boolean value, which if TRUE will cause + C to print a message each time it + examines a file, giving the name of the file, and + indicating whether it's using C or + C to remove it, or that it's skipping it. + (defaults to FALSE) + + safe - a boolean value, which if TRUE will cause C + to skip any files to which you do not have delete + access (if running under VMS) or write access (if + running under another OS). This will change in the + future when a criterion for 'delete permission' + under OSs other than VMS is settled. (defaults to + FALSE) + +=cut + +# taken straight from File::Path VERSION = "1.0403" +sub rmtree { + my($self,$roots, $verbose, $safe) = @_; + if( $FILEPATHLOADED ) { + return File::Path::rmtree ($roots, $verbose, $safe); + } + + my $force_writeable = ($^O eq 'os2' || $^O eq 'dos' || $^O eq 'MSWin32' + || $^O eq 'amigaos'); + my $Is_VMS = $^O eq 'VMS'; + + my(@files); + my($count) = 0; + $verbose ||= 0; + $safe ||= 0; + if ( defined($roots) && length($roots) ) { + $roots = [$roots] unless ref $roots; + } else { + $self->warn("No root path(s) specified\n"); + return 0; + } + + my($root); + foreach $root (@{$roots}) { + $root =~ s#/\z##; + (undef, undef, my $rp) = lstat $root or next; + $rp &= 07777; # don't forget setuid, setgid, sticky bits + if ( -d _ ) { + # notabene: 0777 is for making readable in the first place, + # it's also intended to change it to writable in case we have + # to recurse in which case we are better than rm -rf for + # subtrees with strange permissions + chmod(0777, ($Is_VMS ? VMS::Filespec::fileify($root) : $root)) + or $self->warn("Can't make directory $root read+writeable: $!") + unless $safe; + if (opendir(DIR, $root) ){ + @files = readdir DIR; + closedir(DIR); + } else { + $self->warn( "Can't read $root: $!"); + @files = (); + } + + # Deleting large numbers of files from VMS Files-11 filesystems + # is faster if done in reverse ASCIIbetical order + @files = reverse @files if $Is_VMS; + ($root = VMS::Filespec::unixify($root)) =~ s#\.dir\z## if $Is_VMS; + @files = map("$root/$_", grep $_!~/^\.{1,2}\z/s,@files); + $count += $self->rmtree([@files],$verbose,$safe); + if ($safe && + ($Is_VMS ? !&VMS::Filespec::candelete($root) : !-w $root)) { + print "skipped $root\n" if $verbose; + next; + } + chmod 0777, $root + or $self->warn( "Can't make directory $root writeable: $!") + if $force_writeable; + print "rmdir $root\n" if $verbose; + if (rmdir $root) { + ++$count; + } + else { + $self->warn( "Can't remove directory $root: $!"); + chmod($rp, ($Is_VMS ? VMS::Filespec::fileify($root) : $root)) + or $self->warn("and can't restore permissions to " + . sprintf("0%o",$rp) . "\n"); + } + } + else { + + if ($safe && + ($Is_VMS ? !&VMS::Filespec::candelete($root) + : !(-l $root || -w $root))) + { + print "skipped $root\n" if $verbose; + next; + } + chmod 0666, $root + or $self->warn( "Can't make file $root writeable: $!") + if $force_writeable; + print "unlink $root\n" if $verbose; + # delete all versions under VMS + for (;;) { + unless (unlink $root) { + $self->warn( "Can't unlink file $root: $!"); + if ($force_writeable) { + chmod $rp, $root + or $self->warn("and can't restore permissions to " + . sprintf("0%o",$rp) . "\n"); + } + last; + } + ++$count; + last unless $Is_VMS && lstat $root; + } + } + } + + $count; +} + +=head2 _flush_on_write + + Title : _flush_on_write + Usage : $obj->_flush_on_write($newval) + Function: Boolean flag to indicate whether to flush + the filehandle on writing when the end of + a component is finished (Sequences,Alignments,etc) + Returns : value of _flush_on_write + Args : newvalue (optional) + + +=cut + +sub _flush_on_write { + my ($self,$value) = @_; + if( defined $value) { + $self->{'_flush_on_write'} = $value; + } + return $self->{'_flush_on_write'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/IOManager.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/IOManager.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1149 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::Root::IOManager.pm +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 26 Mar 1997 +# REVISION: $Id: IOManager.pm,v 1.13 2002/10/22 07:38:37 lapp Exp $ +# STATUS : Alpha +# +# For documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# MODIFICATION NOTES: See bottom of file. +# +# Copyright (c) 1997-2000 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#----------------------------------------------------------------------------- + +package Bio::Root::IOManager; + +use Bio::Root::Global qw(:devel $CGI $TIMEOUT_SECS); +use Bio::Root::Object (); +use Bio::Root::Utilities qw(:obj); +use FileHandle (); + +@ISA = qw(Bio::Root::Object); + +use strict; +use vars qw($ID $VERSION $revision); +$ID = 'Bio::Root::IOManager'; +$VERSION = 0.043; + +## POD Documentation: + +=head1 NAME + +Bio::Root::IOManager - Input and output manager for Perl5 objects. + +=head1 SYNOPSIS + +=head2 Object Creation + +The creation of Bio::Root::IOManager.pm objects is handled by Bio::Root::Object.pm +which delegates various I/O tasks to this module. + + use Bio::Root::IOManager; + + $myIO = new Bio::Root::IOManager(-WHERE =>'/usr/tmp/data.out', + -PARENT =>$self); + + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + + +=head1 DESCRIPTION + +This module encapsulates the data and methods necessary for regulating +input/output (I/O) of data from Perl objects. +It is concerned with "where" to get input or send output as opposed to "what" to get. +IOManager.pm is intended to consolidate various I/O issues for +Perl objects and provide an object-oriented way to do I/O things such as: + +=over 4 + +=item * passing filehandles between objects, + +=item * opening and reading input from files or STDIN, + +=item * routine file management (compressing, uncompressing, and deleting). + +=back + +Subclasses of B have access to all methods defined in +IOManager.pm since B employs Bio::Root::IOManager.pm +by a delegation mechanism. + +It is not clear yet how much objects really need to do the fancy I/O gymnastics as +supported by IOManager. Most of the time, objects simply send output to STDOUT +which is managed at the script/program level. The fancy I/O manipulations are +considered experimental and have not been adequately tested or utilized. +I'm not really satisfied with the current L/L strategy. +The additional functionality is not often utilized in typical +applications. Is the extra complexity worth it? + +B + + +=head2 Generic Data Access & Manipulation + +The L method provided permits the following: + +=over 4 + +=item * read from a file or STDIN. + +=item * read a single record or a stream containing multiple records. + +=item * specify a record separator. + +=item * store all input data in memory or process the data stream as it is being read. + +=back + +=head1 DEPENDENCIES + +Bio::Root::IOManager.pm inherits from B and uses B. +B is also used for routine file manipulations +compression/uncompression/deletion. + +=head1 SEE ALSO + + Bio::Root::Object.pm - Core object + Bio::Root::Utilities.pm - Generic utilty object + Bio::Root::Global.pm - Manages global variables/constants + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/ - Bioperl Project Homepage + + FileHandle.pm (included in the Perl distribution or CPAN). + +=head1 TODO + +Experiment with using the newer B included in the Perl distribution, +instead of FileHandle.pm. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl modules. +Send your comments and suggestions preferably to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and +their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 VERSION + +Bio::Root::IOManager.pm, 0.043 + +=head1 ACKNOWLEDGEMENTS + +This module was developed under the auspices of the Saccharomyces Genome +Database: + http://genome-www.stanford.edu/Saccharomyces + +=head1 COPYRIGHT + +Copyright (c) 1997-98 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +# +## +### +#### END of main POD documentation. +### +## +#' + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + + + +##################################################################################### +## CONSTRUCTOR ## +##################################################################################### + + +## Using default constructor and destructor inherited from Bio::Root::Object.pm + +## Could perhaps set the file data member. + + +##################################################################################### +## ACCESSORS ## +##################################################################################### + + +=head2 file + + Usage : $object->file([filename]); + Purpose : Set/Get name of a file associated with an object. + Example : $object->file('/usr/home/me/data.txt'); + Returns : String (full path name) + Argument : String (full path name) OR a FileHandle or TypeGlob reference + : (argument only required for setting) + Throws : Exception if the file appears to be empty or non-existent + Comments : File can be text or binary. + +See Also : L, L, L + +=cut + +#-------- +sub file { +#-------- + my $self = shift; + if($_[0]) { + my $file = $_[0]; + if(not ref $file and not -s $file) { + $self->throw("File is empty or non-existent: $file"); + } + $self->{'_file'} = $file; + } + $self->{'_file'}; +} + + + +=head2 set_fh + + Usage : $self->set_fh( named_parameters ) + Purpose : Sets various FileHandle data members ('fh', 'fherr'). + : Provides a public interface for _open_fh(). + Returns : n/a + Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) + : -PATH => string (filename) or a FileHandle object ref. + : -PRE => string, prefix for opening (e.g., '>', '>>'). + : -POST => string, postfix for opening (e.g., '|'), for commands. + : -WHICH => string, 'err' for setting output path for errors. + : + Throws : Exception propagated from _open_fh() + Examples : $self->set_fh(); # Create anonymous FileHandle object + : $self->set_fh(-PATH =>'fileName', # Open for writing + : -PRE =>'>'); + : $self->set_fh(-PATH =>'fileName', # Open error log file in append mode. + : -PRE =>'>>', + : -WHICH =>'err'); + : $self->set_fh(-PATH =>$obj->fh()); # Copy a file handle from another object. + : + Comments : set_read() and set_display() provide + : interfaces for set_fh(). + Status : Experimental + +See also : L, L. + +=cut + +#----------- +sub set_fh { +#----------- + my( $self, %param) = @_; + + no strict 'subs'; + my( $path, $prefix, $postfix, $which) = + $self->_rearrange([PATH,PRE,POST,WHICH],%param); + use strict 'subs'; + $prefix ||= ''; + $postfix ||= ''; + $which ||= ''; + my $fullpath = "$prefix$path$postfix"; + my($fh); + + $DEBUG and print STDERR "set_fh($fullpath) for ${\$self->name()}\n"; + + if($which eq 'err') { + if(ref($path) =~ /FileHandle|GLOB/ ) { + $fh = $path; + } else { + if(defined $self->{'_fherr'}) { $self->_close_fh('err');} + if( not $fh = $self->_open_fh("$fullpath")) { + $fh = $self->_open_fh("errors.$$"); + $fh || return; + $self->warn("Couldn't set error output to $fullpath", + "Set to file errors.$$"); + } + } + $self->{'_fherr_name'} = $fullpath; + $self->{'_fherr'} = $fh; + + } else { + if(ref($path) =~ /FileHandle|GLOB/ ) { + $fh = $path; + } else { + if(defined $self->{'_fh'}) { $self->_close_fh();} + if( not $fh = $self->_open_fh("$fullpath")) { + $fh = $self->_open_fh("out.$$"); + $fh || return; + $self->warn("Couldn't set output to $fullpath", + "Set to file out.$$"); + } + } + $self->{'_fh_name'} = $fullpath; + $self->{'_fh'} = $fh; + $DEBUG && print STDERR "$ID: set fh to: $fh"; + } +} + + + +#=head2 _open_fh +# +# Purpose : Creates a new FileHandle object and returns it. +# : This method can be used when you need to +# : pass FileHandles between objects. +# Returns : The new FileHandle object. +# Throws : Exception: if the call to new FileHandle fails. +# Examples : $self->_open_fh(); # Create anonymous FileHandle object +# : $self->_open_fh('fileName'); # Open for reading +# : $self->_open_fh('>fileName'); # Open for writing +# Status : Experimental +# +#See also : L, L, L, L +# +#=cut + +#------------- +sub _open_fh { +#------------- + my( $self, $arg) = @_; + my( $filehandle); + + $DEBUG and print STDERR "_open_fh() $arg\n"; + + $filehandle = new FileHandle $arg; + +# if($arg =~ /STD[IO]/) { +# $filehandle = new FileHandle; +# $filehandle = *$arg; +# } else { +# $filehandle = new FileHandle $arg; +# } + + (ref $filehandle) || $self->throw("Can't create new FileHandle $arg", + "Cause: $!"); + return $filehandle; +} + + + +#=head2 _close_fh +# +# Purpose : Destroy a FileHandle object. +# Returns : n/a +# Status : Experimental +# +#See also : L<_open_fh()|_open_fh>, L +# +#=cut + +#-------------- +sub _close_fh { +#-------------- + my( $self, $arg) = @_; + $arg ||= ''; + if($arg eq 'err') { + close $self->{'_fherr'}; + undef $self->{'_fherr'}; + } else { + close $self->{'_fh'}; + undef $self->{'_fh'}; + } +} + + +=head2 set_display + + Usage : $self->set_display([-WHERE=>'path'], + : [-SHOW =>'what is to be displayed'], + : [-MODE =>'file open mode']) + Purpose : Sets a new FileHandle object for output. + : - Sets the objects 'show' data member to 'default' if it is not defined. + : - Is a wrapper for setting an object's STDOUT filehandle: + : Checks the -WHERE parameter and the status of the object's current + : filehandle {'_fh'} and does one of three things: + : 1. If $param{-WHERE} is defined and is not 'STDOUT', it is sent to + : set_fh() to open a new fh, + : 2. else, if 'fh' has already been defined, it is returned, + : 3. else, if where equals 'STDOUT', \*STDOUT is returned. + : 4. else, \*STDOUT is returned. + : + : Thus, if an object has already set its 'fh' to some location, + : it can still print to 'STDOUT' by explicitly passing -WHERE='STDOUT' + : to display(). + : + Arguments : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE). + : (all are optional). + : -WHERE => full path name of file to write to or 'STDOUT'. + : -SHOW => what data is to be displayed. Becomes $self->{'_show'} + : Default = 'default'. This results in a call to + : _display_stats() method when display() is called + : -MODE => mode for opening file. Default is overwrite '>'. + : + Returns : FileHandle object reference or typglob reference (\*STDOUT). + Throws : Exception propagated from set_fh(). + Example : $self->set_display(); + : $self->set_display(-WHERE=>'./data.out'); + : $self->set_display(-WHERE=>$obj->fh()); + Status : Experimental + Comments : I'm not satisfied with the current display()/set_display() strategy. + +See also : L, L + +=cut + +#----------------' +sub set_display { +#---------------- + my( $self, @param ) = @_; + my ($show, $where, $mode) = $self->_rearrange([qw(SHOW WHERE MODE)], @param); + + ## Default mode: overwrite any existing file. + $mode ||= '>'; + $where ||= 'STDOUT'; + + $self->{'_show'} = ($show || 'default'); + + $DEBUG and print STDERR "$ID set_display() show: $self->{'_show'}\twhere: -->$where<--\n"; + + if( defined $where and $where !~ /STDOUT/) { +# print "setting file handle object\n"; + $self->set_fh(-PATH =>$where, + -PRE =>$mode); + } elsif( not defined $self->{'_fh'} or $where =~ /STDOUT/) { + return \*STDOUT; + } else { +# print STDERR "filehandle already set for this object: ${\$self->fh('name')}\n"; + } + + return $self->{'_fh'}; +} + + + +=head2 set_read + + Purpose : Sets a new FileHandle object for input. + : Same logic as set_display() but creates filehandle for read only. + Returns : The input FileHandle object or \*STDIN. + Arguments : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE). + : $param{-WHERE} = full path name of file to write to. + Access : Public + Status : Experimental, Deprecated + : + WARNING : THIS METHOD HAS NOT BEEN TESTED AND IS LIKELY UNNECESSARY. + : USE THE read() METHOD INSTEAD. + : + : Note also that set_read() uses the same data member as set_display() + : so it is currently not possible to simultaneously have + : different displaying and reading filehandles. This degree of + : I/O control has not been necessary. + +See also : L, L + +=cut + +#------------- +sub set_read { +#------------- + my( $self, @param ) = @_; + my ($where, $mode) = $self->_rearrange([qw(WHERE MODE)], @param); + + ## Default mode: read only. + $mode ||= '<'; + $where ||= 'STDIN'; + + if( ref($where) and $where !~ /STDIN/) { +# print "setting file handle object\n"; + $self->set_fh(-PATH =>$where, + -PRE =>$mode); + } elsif( not defined $self->{'_fh'} or $where =~ /STDIN/) { + return \*STDIN; + } else { +# print STDERR "filehandle already set for this object: ${\$self->fh('name')}\n"; + } + + return $self->{'_fh'}; +} + + + +=head2 set_display_err + + Purpose : Sets a new FileHandle object for outputing error information. + : Same logic as set_display() but creates a filehandle in + : append mode. + Returns : The output FileHandle object for saving errors or \*STDERR. + Status : Experimental + WARNING : NOT TESTED + +See also : L, L + +=cut + +#-------------------- +sub set_display_err { +#-------------------- + my( $self, @param ) = @_; + my ($where, $mode) = $self->_rearrange([qw(WHERE MODE)], @param); + + ## Default mode: read only. + $mode ||= '>>'; + $where ||= 'STDERR'; + + $DEBUG and print STDERR "set_display_err() object: ${\$self->name()}\n"; + + if( ref($where) and $where !~ /STDERR/) { +# print "setting file handle object\n"; + $self->set_fh(-PATH =>$where, + -PRE =>$mode); + } elsif( not defined $self->{'_fherr'} or $where =~ /STDERR/) { + return \*STDERR; + } else { +# print STDERR "filehandle already set for this object: ${\$self->fh('name')}\n"; + } + + return $self->{'_fherr'}; +} + + +##################################### +# GET ACCESSORS +##################################### + + +=head2 show + + Usage : $self->show() + Purpose : Get the string used to specify what to display + : using the display() method. + Returns : String or undef if no show data member is defined. + Arguments : n/a + +See also : L + +=cut + +#---------- +sub show { my $self= shift; $self->{'_show'}; } +#---------- + + + +=head2 fh + + Usage : $object->fh(['name']) + Purpose : Accessor for an object's FileHandle object or the argument used + : to create that object. + Returns : One of the following: + : 1. The arguments used when the filehandle was created ('fh_name'). + : 2. The FileHandle object reference previously assigned to $self->{'_fh'}. + : 3. Typeglob reference \*STDIN, \*STDOUT or \*STDERR. + Example : $self->fh(); # returns filehandle for the STDIN/STDOUT path. + : $self->fh('err'); # returns filehandle for the err file. + : $self->fh('name'); # returns fh creation arguments. + : $self->fh('errname'); # returns fh creation arguments for the err file. + Status : Experimental + +See also : L, L, L, L + +=cut + +#--------' +sub fh { +#-------- + my( $self, $type, $stream) = @_; + $stream ||= 'out'; + $stream = ($stream eq 'in') ? \*STDIN : \*STDOUT; + + ## Problem: Without named parameters, how do you know if + ## a single argument is to be assigned to $type or $stream? + ## Function prototypes could be used, or separate methods: + ## fh_out(), fh_in(), fh_err(). + $type or return ($self->{'_fh'} || $stream); + + if( $type =~ /name/){ + if($type =~ /err/ ) { return $self->{'_fherr_name'}; } + else { return $self->{'_fh_name'}; } + + } else { + if($type =~ /err/ ) { return ($self->{'_fherr'} || \*STDERR); } + else { return ($self->{'_fh'} || $stream); } + } +} + + +##################################################################################### +## INSTANCE METHODS ## +##################################################################################### + + +## +## INPUT METHODS: +## + + +=head2 read + + Usage : $object->read(); + Purpose : Read raw textual data from a file or STDIN. + : Optionally process each record it as it is read. + Example : $data = $object->read(-FILE =>'usr/people/me/data.txt', + : -REC_SEP =>"\n:", + : -FUNC =>\&process_rec); + : $data = $object->read(-FILE =>\*FILEHANDLE); + : $data = $object->read(-FILE =>new FileHandle $file, 'r'); + : + Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) + : (all optional) + : -FILE => string (full path to file) or a reference + : to a FileHandle object or typeglob. This is an + : optional parameter (if not defined, STDIN is used). + : -REC_SEP => record separator to be used + : when reading in raw data. If none is supplied, + : the default record separator is used ($/). + : $/ is localized to this method but be careful if + : you do any additional file reading in functions + : called by this method (see the -FUNC parameter). + : Such methods will use the value of $/ set + : by read() (if a -RE_SEP is supplied). + : -FUNC => reference to a function to be called for each + : record. The return value of this function is now checked: + : if false, the reading is terminated. + : Typically -FUNC supplies a closure. + : -HANDLE => reference to a FileHandle object or a + : typeglob to be use for reading input. + : The FileHandle object should be configured to + : read from a desired file before calling this + : method. If both -handle and -file are defined, + : -handle takes precedence. + : (The -HANDLE parameter is no longer necessary + : since -FILE can now contain a FileHandle ref.) + : -WAIT => integer (number of seconds to wait for input + : before timing out. Default = 20 seconds). + : + Returns : string, array, or undef depending on the arguments. + : If a function reference is supplied, this function will be + : called using the contents of each record as it is read in. + : If no function reference is supplied, the data are returned as a + : string in scalar context or as a list in array context. + : The data are not altered; blank lines are not removed. + : + Throws : Exception if no input is read from source. + : Exception if no input is read within WAIT seconds. + : Exception if FUNC is not a function reference. + : Propagates any exceptions thrown by create_filehandle() + : + Comments : Gets the file name from the current file data member. + : If no file has been defined, this method will attempt to + : read from STDIN. + : + : COMPRESSED FILES: + : read() will attempt to use gzip -cd to read the file + : if it appears to be compressed (binary file test). + : + : If the raw data is to be returned, wantarray is used to + : determine how the data are to be returned (list or string). + : + : Sets the file data member to be the supplied file name. + : (if any is supplied). + + : The read() method is a fairly new implementation + : and uses a different approach than display(). + : For example, set_read() is not used. + + Bugs : The following error is generated by Perl's FileHandle.pm module + : when using the -w switch. It can be ignored for now: + "Close on unopened file at /tools/perl/5.003/lib/FileHandle.pm line 255." + +See Also : L, L + +=cut + +#----------' +sub read { +#---------- + my($self, @param) = @_; + my( $rec_sep, $func_ref, $wait ) = + $self->_rearrange([qw( REC_SEP FUNC WAIT)], @param); + + my $fmt = (wantarray ? 'list' : 'string'); + $wait ||= $TIMEOUT_SECS; # seconds to wait before timing out. + + my $FH = $Util->create_filehandle( -client => $self, @param); + + # Set the record separator (if necessary) using dynamic scope. + my $prev_rec_sep; + $prev_rec_sep = $/ if scalar $rec_sep; # save the previous rec_sep + local $/ = $rec_sep if scalar $rec_sep; + + # Verify that we have a proper reference to a function. + if($func_ref) { + if(not ref($func_ref) =~ /CODE/) { + $self->throw("Not a function reference: $func_ref, ${\ref $func_ref}"); + } + } + + $DEBUG && printf STDERR "$ID: read(): rec_sep = %s; func = %s\n",$/, ($func_ref?'defined':'none'); + + my($data, $lines, $alarm_available); + + $alarm_available = 1; + + eval { + alarm(0); + }; + if($@) { + # alarm() not available (ActiveState perl for win32 doesn't have it. + # See jitterbug PR#98) + $alarm_available = 0; + } + + $SIG{ALRM} = sub { die "Timed out!"; }; + + eval { + $alarm_available and alarm($wait); + + READ_LOOP: + while(<$FH>) { + # Default behavior: read all lines. + # If &$func_ref returns false, exit this while loop. + # Uncomment to skip lines with only white space or record separators +# next if m@^(\s*|$/*)$@; + + $lines++; + $alarm_available and alarm(0); # Deactivate the alarm as soon as we start reading. + my($result); + if($func_ref) { + # Need to reset $/ for any called function. + local $/ = $prev_rec_sep if defined $prev_rec_sep; + $result = &$func_ref($_) or last READ_LOOP; + } else { + $data .= $_; + } + } + }; + if($@ =~ /Timed out!/) { + $self->throw("Timed out while waiting for input from $self->{'_input_type'}.", "Timeout period = $wait seconds.\nFor a longer time out period, supply a -wait => parameter\n". + "or edit \$TIMEOUT_SECS in Bio::Root::Global.pm."); + } elsif($@ =~ /\S/) { + my $err = $@; + $self->throw("Unexpected error during read: $err"); + } + + close ($FH) unless $self->{'_input_type'} eq 'STDIN'; + + if($data) { + $DEBUG && do{ + print STDERR "$ID: $lines records read.\nReturning $fmt.\n" }; + + return ($fmt eq 'list') ? split("$/", $data) : $data; + + } elsif(not $func_ref) { + $self->throw("No data input from $self->{'_input_type'}"); + } + delete $self->{'_input_type'}; + undef; +} + + +## +## OUTPUT METHODS: +## + + +=head2 display + + Usage : $self->set_display(named parameters) + Purpose : Provides a default display method which calls set_display() + : and also invokes methods to display an object's stats + : if necessary ( _print_stats_header() and _displayStats() ). + Returns : True (1). + Throws : Propagates any exceptions thrown by set_display(). + Arguments : Named parameters for set_display(). + Comments : I'm not satisfied with the current display()/set_display() strategy. + +See also : L + +=cut + +#------------- +sub display { +#------------- + my( $self, %param ) = @_; + + $DEBUG && print STDERR "$ID display for ${\ref($self)}\n"; + + my $OUT = $self->set_display(%param); +# my $OUT = $self->set_display( %param ); +# print "$ID: OUT = $OUT";; + + $DEBUG && do{ print STDERR "display(): WHERE = $OUT;\nSHOW = $self->{'_show'}";;}; + + if($self->{'_show'} =~ /stats|default/i) { + if($param{-HEADER}) { + $self->_print_stats_header($OUT); + } + $self->parent->_display_stats($OUT); + } + 1; +} + + + +=head2 _print_stats_header + + Usage : n/a; internal method. + : $obj->_print_stats_header(filehandle); + Purpose : Prints a header containing basic info about the object + : such as the class and name of the object followed by a + : line of hyphens. + Status : Experimental + +=cut + +#------------------------ +sub _print_stats_header { +#------------------------ + my($self, $OUT) = @_; + + printf $OUT "\nSTATS FOR %s \"%s\"\n",ref($self->parent),$self->parent->name(); + printf $OUT "%s\n", '-'x60; +} + + + + +## +## FILE MANIPULATION METHODS: +## + + + +=head2 file_date + + Usage : $object->file_date( %named_parameters); + Purpose : Get the last modified date of a file. + Example : $object->file_date(); + : $object->file_date(-FMT =>'yyyy-mmm-dd', + -FILE =>'/usr/people/me/data.txt'); + : $object->file_date(-FMT =>'yyyy-mmm-dd'); + Returns : String (date) + Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) + : -FILE => string (filename full path) + : -FMT => string (format for the returned date string) + : + Throws : Exception if no file is specified or the file is non-existent + : (Propagated from Utilities::file_date()) + Comments : File can be text or binary. + +See Also : L, L + +=cut + +#--------------- +sub file_date { +#--------------- + my ($self, @param) = @_; + my ($file, $fmt) = $self->_rearrange([qw(FILE FMT)], @param); + + if(not $file ||= $self->{'_file'}) { + $self->throw("Can't get file date: no file specified"); + } + $fmt ||= ''; + $Util->file_date($file, $fmt); +} + + + +=head2 compress_file + + Usage : $object->compress_file([filename]); + Purpose : Compresses a file if not already compressed. + : Compresses to a temorary file if user is not owner of supplied file. + Example : $object->file('/usr/home/me/data.txt'); + : $object->compress_file(); + Argument : String (full path name) (optional). + : If no argument is provided, the file data member is used. + Returns : String (compressed file name, full path). + : Sets the file data member to the compressed name + : when not operating on a file supplied as an argument. + : Returns false (undef) if the file is already compressed + : (binary test). + Throws : Exception if no file is specified. + : Propagates any exception thrown by Bio::Root::Utilities::compress() + : if the file cannot be compressed(). + : Tests if file is already compressed to avoid trivial error due to + : the file already being compressed. + : + Comments : Relies on the compress() method of Bio::Root::Utilities.pm + : to implement the file compression functionality. + : (Currently, Bio::Root::Utilities::compress() uses gzip.) + : + : If the user is not the owner of the file, the file is + : compressed to a tmp file. + : + : All file compressing/uncompressing requests should go through + : compress_file()/uncompress_file(). This serves to confine the + : dependency between IOManager.pm module and Utilities.pm + : which helps maintainability. + : + Bugs : Only compresses text files. This obviates a dependency on + : particular file suffixes but is not good if you + : want to compress a binary file. + : + : May not be taint-safe. + +See Also : L, L, L + +=cut + +#----------------- +sub compress_file { +#----------------- + my ($self, $file) = @_; + my $myfile = 0; + + if(!$file) { + $file = $self->{'_file'}; + $myfile = 1; + } + + $file or $self->throw("Can't compress data file: no file specified"); + + #printf STDERR "$ID: Compressing data file for %s\n $file\n",$self->name(); + + my ($newfile); + if (-T $file) { + $newfile = -o $file ? $Util->compress($file) : $Util->compress($file, 1); + # set the current file to the new name. + $self->file($newfile) if $myfile; + } + $newfile; +} + + + +=head2 uncompress_file + + Usage : $object->uncompress_file([filename]); + Purpose : Uncompresses the file containing the raw report. + : Uncompresses to a temorary file if user is not owner of supplied file. + Example : $object->file('/usr/home/me/data.txt.gz'); + : $object->uncompress_file(); + Argument : String (full path name) (optional). + : If no argument is provided, the file data member is used. + Returns : String (uncompressed file name, full path). + : Sets the file data member to the uncompressed name + : when not operating on a file supplied as an argument. + : Returns false (undef) if the file is already uncompressed. + : + Throws : Exception if no file is specified. + : Propagates any exception thrown by Bio::Root::Utilities::compress() + : if the file cannot be uncompressed(). + : Tests if file is already uncompressed to avoid trivial error due to + : the file already being uncompressed. + Comments : See comments for compress_file(). They apply here as well. + : + Bugs : Considers all binary files to be compressed. This obviates + : a dependency on particular file suffixes. + : May not be taint safe. + +See Also : L, L, L + +=cut + +#-------------------- +sub uncompress_file { +#-------------------- + my ($self, $file) = @_; + my $myfile = 0; + + if(!$file) { + $file = $self->{'_file'}; + $myfile = 1; + } + + $file or $self->throw("Can't compress file: no file specified"); + + #printf STDERR "$ID: Uncompressing data file for %s\n $file",$self->name(); + + my ($newfile); + if (-B $file) { + $newfile = -o $file ? $Util->uncompress($file) : $Util->uncompress($file, 1); + # set the current file to the new name & return it. + $self->file($newfile) if $myfile; + } + $newfile; +} + + +=head2 delete_file + + Usage : $object->delete_file([filename]); + Purpose : Delete a file. + Example : $object->delete_file('/usr/people/me/data.txt'); + Returns : String (name of file which was deleted) if successful, + : undef if file does not exist. + : Sets the file data member to undef + : when not operating on a file supplied as an argument. + Argument : String (full path name) (optional). + : If no argument is provided, the file data member is used. + Throws : Exception if the user is not the owner of the file. + : Propagates any exception thrown by Bio::Root::Utilities::delete(). + : if the file cannot be deleted. + Comments : Be careful with this method: there is no undelete(). + : Relies on the delete() method provided by Bio::Root::Utilities.pm + : to implement the file deletion functionality. + : This method is not taint-safe. + : It is intended for off-line maintenance use only. + +See Also : L, L + +=cut + +#----------------- +sub delete_file { +#----------------- + my ($self, $file) = @_; + my $myfile = 0; + + if(!$file) { + $file = $self->{'_file'}; + $myfile = 1; + } + return undef unless -e $file; + + -o $file or + $self->throw("Can't delete file $file: Not owner."); + +# $DEBUG and print STDERR "$ID: Deleting data file for ",$self->name(); + + eval{ $Util->delete($file); }; + + if(!$@ and $myfile) { + $self->{'_file'} = undef; + } + $file; +} + + + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for this documentation to become obsolete as this module is still evolving. +Always double check this info and search for members not described here. + +=back + +An instance of Bio::Root::IOManager.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + ------------------------------------------------------------------------ + _show Selects display options. + + _fh FileHandle object for redirecting STDIN or STDOUT. + + _fherr FileHandle object for error data. Append mode. + + _fh_name The arguments used to create fh. + + _fherr_name The arguments used to create fherr. + + INHERITED DATA MEMBERS + + _parent (From Bio::Root::Object.pm> Object reference for the owner of this IOManager. + +=cut + + +MODIFICATION NOTES: +------------------- + +17 Feb 1999, sac: + * Using $Global::TIMEOUT_SECS + +3 Feb 1999, sac: + * Added timeout support to read(). + * Moved the FileHandle creation code out of read() and into + Bio::Root::Utilties since it's of more general use. + + 24 Nov 1998, sac: + * Modified read(), compress(), and uncompress() to properly + deal with file ownership issues. + + 19 Aug 1998, sac: + * Fixed bug in display(), which wasn't returning true (1). + + 0.023, 20 Jul 1998, sac: + * read() can now use a supplied FileHandle or GLOB ref (\*IN). + * A few other touch-ups in read(). + + 0.022, 16 Jun 1998, sac: + * read() now terminates reading when a supplied &$func_ref + returns false. + + 0.021, May 1998, sac: + * Refined documentation to use 5.004 pod2html. + * Properly using typglob refs as necessary + (e.g., set_display(), set_fh()). + +0.031, 2 Sep 1998, sac: + * Doc changes only + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Object.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Object.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2161 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::Root::Object.pm +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 23 July 1996 +# REVISION: $Id: Object.pm,v 1.23 2002/10/22 07:38:37 lapp Exp $ +# STATUS : Alpha +# +# For documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# MODIFICATION NOTES: See bottom of file. +# +# Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# Retain this notice and note any modifications made. +#----------------------------------------------------------------------------- + +package Bio::Root::Object; +use strict; + +require 5.002; +use Bio::Root::Global qw(:devel $AUTHORITY $CGI); +use Bio::Root::Root; + +use Exporter (); + +#use AutoLoader; +#*AUTOLOAD = \&AutoLoader::AUTOLOAD; + +use vars qw(@EXPORT_OK %EXPORT_TAGS); +@EXPORT_OK = qw($VERSION &find_object &stack_trace &containment &_rearrange); +%EXPORT_TAGS = ( std => [qw(&stack_trace &containment)] ); + +use vars qw($ID $VERSION %Objects_created $Revision @ISA); + +@ISA = qw(Bio::Root::Root); + + +# %Objects_created can be used for tracking all objects created. +# See _initialize() for details. + +$ID = 'Bio::Root::Object'; +$VERSION = 0.041; +$Revision = '$Id: Object.pm,v 1.23 2002/10/22 07:38:37 lapp Exp $'; #' + +### POD Documentation: + +=head1 NAME + +Bio::Root::Object - A core Perl 5 object. + +=head1 SYNOPSIS + +Use this module as the root of your inheritance tree. + +=head2 Object Creation + + require Bio::Root::Object; + + $dad = new Bio::Root::Object(); + $son = new Bio::Root::Object(-name => 'Junior', + -parent => $dad, + -make => 'full'); + + +See the L method for a complete description of parameters. +See also L. + +=head1 DESCRIPTION + +B attempts to encapsulate the "core" Perl5 +object: What are the key data and behaviors ALL (or at least most) Perl5 +objects should have? + +=head2 Rationale + +Use of B within the Bioperl framework facilitates +operational consistency across the different modules defined within +the B namespace. Not all objects need to derive from +B. However, when generating lots of different types +of potentially complex objects which should all conform to a set of +basic expectations, this module may be handy. + +At the very least, this module saves you from re-writing the L +method for each module you develop. It also permits consistent and +robust handling of C<-tag =E value> method arguments via the +L method and provides a +object-oriented way handle exceptions and warnings via the L and L methods. + +See L for some other handy methods. + +=head2 Fault-Tolerant Objects + +A major motivation for this module was to promote the creation of robust, +fault-tolerant Perl5 objects. The L method relies on Perl's built-in +C exception mechanism to generate fatal exceptions. +The data comprising an exception is managed by the B +module, which essentially allows the data thrown by a C event to be +wrapped into an object that can be easily examined and possibly re-thrown. + +The intent here is three-fold: + +=over 4 + +=item 1 Detailed error reporting. + +Allow objects to report detailed information about the error condition +(who, what, where, why, how). + +=item 2 Handle complex errors in objects. + +The goal is to make it relatively painless to detect and handle the wide +variety of errors possible with a complex Perl object. +Perl's error handling mechanism is a might clunky when it comes to +handling complex errors within complex objects, but it is improving. + +=item 3 Efficient & easy exception handling. + +To enable robust exception handling without incurring a significant +performance penalty in the resulting code. Ideally, exception handling +code should be transparent to the cpu until and unless an exception +arises. + +=back + +These goals may at times be at odds and we are not claiming +to have achieved the perfect balance. Ultimately, we want self- +sufficient object-oriented systems able to deal with their own errors. +This area should improve as the module, and Perl, evolve. +One possible modification might be to utilize Graham Barr's B +module or Torsten Ekedahl's B module +(see L). +Technologies such as these may eventually be +incorporated into future releases of Perl. The exception handling +used by B can be expected to change as Perl's +exception handling mechanism evolves. + +B In this discussion and elsewhere in this module, +the terms "Exception" and "Error" are used interchangeably to mean +"something unexpected occurred" either as a result of incorrect user +input or faulty internal processing. + +=head1 USAGE + +=head2 Basic Exception handling + +Object construction is a common place for exceptions to occur. By wrapping +the construction in an C block, we can prevent the exception from +crashing the script and attempt to recover gracefully: + + # Package Foo.pm IS-A Bio::Root::Object.pm + + $obj = eval { new Foo(@data) }; # ending semicolon required. + if($@) { + print STDERR "\nTrouble creating Foo object: $@\n"; + recover_gracefully($@); + } + +A common strategy when generating lots of objects is to collect +data about which objects failed to build but still permit +the successfully created ones get processed: + + @errs = (); + foreach $thing ( @stuff ) { + my $obj = eval { new Foo($thing) }; + if($@) { + push @err, [$thing, $@]; + } + else { + process_obj($obj); + } + } + +Post-mortem reporting, logging, or analysis of the problems ensues: + + if(@errs) { + printf "\n%d things failed:\n", scalar(@errs); + foreach(@errs) { print "$err->[0], ";} + + print "\n\nTrapped exceptions:\n"; + foreach(@errs) { print "$err->[1]\n";} + } + +New with B is the ability to C with an object +reference in C<$@> instead of just a string. This feature is not yet +exploited in Bio::Root::Object.pm but may be in future versions. +Bio::Root::Err.pm objects can be reconstructed from the contents of C<$@>: + + eval{ # exception-prone code here... }; + if($@) { + $err = new Bio::Root::Err($@); + printf "Trouble: %s\n". $err->msg; + printf "Stack trace: %s\n". $err->stack; + } + + +=head2 Demo Scripts + +Some demo script that illustrate working with Bio::Root::Objects +are included with the distribution in the examples/root_object directory. + + +=head1 STRICTNESS & VERBOSITY + +There are two global variables that can be used to control sensitivity to +exceptions/warnings and the amount of reporting for all objects within a process. +These are accessed via functions B and B exported by +Bio::Root::Global (see L). + + $STRICTNESS - Regulates the sensitivity of the object to exceptions and warnings. + + $VERBOSITY - Regulates the amount of reporting by an object. + + +The L and L methods of B +originally operated at the the object level, to permit individual +strictness and verbosity levels for different objects. This level of +control is not usually required and can often be inconvenient; one +typically wants to set these properties globally for a given +script. While this sacrifices some flexibility, it saves time and +memory when working with lots of objects. For instance, child objects +don't have to worry about checking their parents to determine their +strictness/verbosity levels. Strictness and verbosity are +globally-defined values, but different classes of objects can be +differentially sensitive to these values depending on design criteria. + +Strictness and verbosity can be positive or negative. Negative +verbosity equals terseness; negative strictness equals permissiveness. +In B only the Bio::Root::Root::throw() and +Bio::Root::Root::warn() methods (see L) are sensitive to +these values as indicated in the tables below: + + +---------+ + | throw() | v e r b o s i t y + +---------+ ------------------------------------- + -1 0 1 + s ---------- ----------- ---------- + t + r -2 -- throw() converted into warn() + i + c -1 | Exception Exception Exception + t 0 |_ printed printed printed + n 1 | without with with stack + e 2 | stack trace stack trace trace and + s | sysbeep + s + + + +---------+ + | warn() | v e r b o s i t y + +---------+ -------------------------------------- + -1 0 1 + s ---------- ----------- ----------- + t + r -2 | Warning Warning Warning + i -1 |_ not printed printed + c 0 | printed without with stack + t 1 | but stack trace trace and + n | attached* sysbeep + e + s 2 -- warn() converted into throw() + s + + (*) Warnings will be attached to an object if the + -record_err =>1 flag is set when constructing the object + or if $object->record_err(1) is called subsequent to creation. + +See the methods L, L, L, +Bio::Root::Root::throw(), and Bio::Root::Root::warn() in +L for more details. + + +=head1 DEPENDENCIES + +As the B does not inherit from any modules +but wraps (i.e., provides an interface and delegates +functionality to) other modules in the Bio::Root:: hierarchy: + + Module Purpose + -------------------- ------------------------------------ + Bio::Root::Err.pm Exception handling + Bio::Root::IOManager.pm Input/output of object data or error data + Bio::Root::Xref.pm Arbitrary links between objects + +All of these modules are loaded only when necessary. +B is an object representing an exception. +B and B are more experimental. They are +utilized via delegation, which permits them to be developed and utilized +independently of B. + +Since this module is at the root of potentially many different objects +in a particular application, efficiency is important. Bio::Root::Object.pm is +intended to be a lightweight, lean and mean module. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl modules. +Send your comments and suggestions preferably to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and +their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 VERSION + +Bio::Root::Object.pm, 0.041 + + +=head1 TODO + +=over 0 + +=item * Experiment with other Exception classes. + +Consider incorporating a more widely-used Error/Exception module +(see L). + +=item * Think about integration with Data::Dumper.pm for persisting objects. + +=back + +=head1 SEE ALSO + +L - Error/Exception object +L - Input/Output manager object +L - Manages dynamic lists of objects +L - Cross-reference object +L - Manages global variables/constants + +http://bio.perl.org/Projects/modules.html - Online module documentation +http://bio.perl.org/ - Bioperl Project Homepage + +=head2 Other Exception Modules + +Experimental::Exception.pm - ftp://ftp.matematik.su.se/pub/teke/ +Error.pm - http://www.cpan.org/authors/id/GBARR/ +Throwable.pm - mailto:kstevens@globeandmail.ca + +http://genome-www.stanford.edu/perlOOP/exceptions.html + +=head1 ACKNOWLEDGEMENTS + +This module was developed under the auspices of the Saccharomyces Genome +Database: + +http://genome-www.stanford.edu/Saccharomyces + +Other Bioperl developers contributed ideas including Ewan Birney, Ian Korf, +Chris Dagdigian, Georg Fuellen, and Steven Brenner. + +=head1 COPYRIGHT + +Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + + +# +## +### +#### END of main POD documentation. ' +### +## +# + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + +# +# This object is deprecated as the root of the inheritance tree, but some +# modules depend on it as a legacy. We issue a deprecation warning for all +# other modules. +# +my @inheriting_modules = ('Bio::Tools::Blast', 'Bio::Root::Object', + 'Bio::Root::IOManager'); + + +####################################################### +# CONSTRUCTOR/DESTRUCTOR # +####################################################### + + +=head2 new + + Purpose : Creates a blessed object reference (hash) for the indicated class + : and calls _initialize() for the class passing it all parameters. + Usage : new CLASS_NAME [ %named_parameters]; + Example : $obj = new Bio::Root::Object 'george'; + : $obj = Bio::Root::Object->new(-name => 'L56163', + : -parent => $obj2 ); + : $obj = Bio::Root::Object->new(); + Returns : Blessed hash reference. + Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWERCASE). + : (all are optional) + : -NAME => arbitrary string to identify an object; + : should be unique within its class. + : -PARENT => blessed reference for an object that + : is responsible for the present object + : (e.g., a container). + : -MAKE => string to specify special constructor option. + : -OBJ => object reference for an object to be cloned. + : -RECORD_ERR => boolean (if true, attach all Err.pm objects generated by + : warn() or throw() calls to the present object; + : default = false). + : + : The use of STRICT and VERBOSE in constructors is no longer + : necessary since there is no object-specific strict or verbose setting. + : Use the strictness() and verbosity() functions exported by + : Bio::Root::Global.pm. These options are still provided + : in the constructor but the will affect *all* objects within a + : given process. + : + : -STRICT => integer (level of strictness: -2, -1, 0, 1, 2). + : -VERBOSE => integer (level of verbosity: -1, 0, 1) + : Verbosity can be used to control how much reporting + : an object should do generally. In this module, + : verbosity affects the behavior of throw() and warn() + : only. + : + : + Comments : This method creates blessed HASH references. + : An object is free to define its own strict, and verbose + : behavior as well as its own make (constructor) options. + +See Also : L<_initialize()|_initialize>, L, L, L, L, L, L, and Bio::Root::Root::throw() and Bio::Root::Root::warn() in L + +=cut + +#---------- +sub new { +#---------- + my($class, @param) = @_; + my $self = {}; + bless $self, ref($class) || $class; + $DEBUG==2 && print STDERR "CREATING $self"; + $self->_initialize(@param); + $self; +} + + +=head2 _initialize + + Purpose : Initializes key Bio::Root::Object.pm data (name, parent, make, strict). + : Called by new(). + Usage : n/a; automatically called by Bio::Root::Object::new() + Returns : String containing the -MAKE constructor option or 'default' + : if none defined (if a -MAKE parameter is defined, the value + : returned will be that obtained from the make() method.) + : This return value saves any subclass from having to call + : $self->make() during construction. For example, within a + : subclass _initialize() method, invoke the Bio::Root::Object:: + : initialize() method as follows: + : my $make = $self->SUPER::_initialize(@param); + Argument : Named parameters passed from new() + : (PARAMETER TAGS CAN BE ALL UPPER OR ALL LOWER CASE). + Comments : This method calls name(), make(), parent(), strict(), index() + : and thus enables polymorphism on these methods. To save on method + : call overhead, these methods are called only if the data need + : to be set. + : + : The _set_clone() method is called if the -MAKE option includes + : the string 'clone' (e.g., -MAKE => 'clone'). + : + : The index() method is called if the -MAKE option includes + : the string 'index'. (This is an experimental feature) + : (Example: -MAKE => 'full_index'). + : + : NOTE ON USING _rearrange(): + : + : _rearrange() is a handy method for working with tagged (named) + : parameters and it permits case-insensitive in tag names + : as well as handling tagged or un-tagged parameters. + : _initialize() does not currently call _rearrange() since + : there is a concern about performance when setting many objects. + : One issue is that _rearrange() could be called with many elements + : yet the caller is interested in only a few. Also, derived objects + : typically invoke _rearrange() in their constructors as well. + : This could particularly degrade performance when creating lots + : of objects with extended inheritance hierarchies and lots of tagged + : parameters which are passes along the inheritance hierarchy. + : + : One thing that may help is if _rearrange() deleted all parameters + : it extracted. This would require passing a reference to the param list + : and may add excessive dereferencing overhead. + : It also would cause problems if the same parameters are used by + : different methods or objects. + +See Also : L, L, L, L, L, L, L + +=cut + +#---------------- +sub _initialize { +#---------------- + local($^W) = 0; + my($self, %param) = @_; + + if(! grep { ref($self) =~ /$_/; } @inheriting_modules) { + $self->warn("Class " . ref($self) . + " inherits from Bio::Root::Object, which is deprecated. ". + "Try changing your inheritance to Bio::Root::Root."); + } + my($name, $parent, $make, $strict, $verbose, $obj, $record_err) = ( + ($param{-NAME}||$param{'-name'}), ($param{-PARENT}||$param{'-parent'}), + ($param{-MAKE}||$param{'-make'}), ($param{-STRICT}||$param{'-strict'}), + ($param{-VERBOSE}||$param{'-verbose'}), + ($param{-OBJ}||$param{'-obj'}, $param{-RECORD_ERR}||$param{'-record_err'}) + ); + + ## See "Comments" above regarding use of _rearrange(). +# $self->_rearrange([qw(NAME PARENT MAKE STRICT VERBOSE OBJ)], %param); + + $DEBUG and do{ print STDERR ">>>> Initializing $ID (${\ref($self)}) ",$name||'anon';}; + + if(defined($make) and $make =~ /clone/i) { + $self->_set_clone($obj); + + } else { + $name ||= ($#_ == 1 ? $_[1] : ''); # If a single arg is given, use as name. + + ## Another performance issue: calling name(), parent(), strict(), make() + ## Any speed diff with conditionals to avoid method calls? + + $self->name($name) if $name; + $self->parent($parent) if $parent; + $self->{'_strict'} = $strict || undef; + $self->{'_verbose'} = $verbose || undef; + $self->{'_record_err'} = $record_err || undef; + + if($make) { + $make = $self->make($make); + + # Index the Object in the global object hash only if requested. + # This feature is not used much. If desired, an object can always + # call Bio::Root::Object::index() any time after construction. + $self->index() if $make =~ /index/; + } + } + + $DEBUG and print STDERR "---> Initialized $ID (${\ref($self)}) ",$name,"\n"; + + ## Return data of potential use to subclass constructors. +# return (($make || 'default'), $strict); # maybe (?) + return $make || 'default'; +} + + + +=head2 DESTROY + + Purpose : Provides indication that the object is being reclaimed + : by the GC for debugging purposes only. + Usage : n/a; automatically called by Perl when the ref count + : on the object drops to zero. + Argument : n/a + Comments : Setting the global $DEBUG to 2 will print messages upon + : object destruction. + : Subclasses should override this method to + : clean up any resources (open file handles, etc.) + : The overridden method should end with a call to + : SUPER::DESTROY; + +See Also : L + +=cut + +#----------- +sub DESTROY { +#----------- + my $self=shift; + + $DEBUG==2 && print STDERR "DESTROY called in $ID for ${\$self->to_string} ($self)\n"; +} + + +=head2 destroy + + Purpose : Clean up any resources allocated by the object and + : remove links to all objects connected to the present + : object with the ultimate aim of signaling the GC to + : reclaim all memory allocated for the object. + : This method breaks links to any Err, IOManager, and Xref objects + : and drops the present object as a child from any parent objects. + Usage : $object->destroy(); undef $object; + : undef-ing the object reference signals the GC to reclaim + : the object's memory. + Returns : undef + Argument : n/a + Comments : Circular reference structures are problematic for garbage + : collection schemes such as Perl's which are based on reference + : counting. If you create such structures outside of + : the parent-child relationship, be sure to properly break + : the circularity when destroying the object. + : Subclasses should override this method to call destroy() + : on any contained child objects. The overridden method + : should end with a call to SUPER::destroy(). + Bugs : Bio::Root::Xref.pm objects have not been tested and + : may not be handled properly here. + : Bio::Root::Vector.pm objects are also not yet handled + : properly so beware of crunching lots of Vector objects. + +=cut + +#-------------' +sub destroy { +#------------- +## Note: Cannot delete parent and xref object refs since they are not +## owned by this object, merely associated with it. + my $self = shift; + + if(ref($self->{'_parent'})) { + $self->{'_parent'}->_drop_child($self); + undef $self->{'_parent'}; + } + + if(ref($self->{'_io'})) { + $self->{'_io'}->destroy; + undef $self->{'_io'}; + } + + if(ref($self->{'_err'})) { + $self->{'_err'}->remove_all; + undef $self->{'_err'}; + } + + if(ref($self->{'_xref'})) { + $self->{'_xref'}->remove_all; + undef $self->{'_xref'}; + } + + $self->_remove_from_index if scalar %Objects_created; +} + + +=head2 _drop_child + + Usage : $object->_drop_child(object_ref) + : Used internally by destroy(). + Purpose : To remove a parent-to-child inter-object relationship. + : The aim here is to break cyclical object refs to permit Perl's + : GC to reclaim the object's memory. The expectation is that + : a child object requests of its parent that the parent drop the + : child object making the request. Parents do not drop children + : unless requested by the child in question. + Example : $self->parent->_drop_child($self); + Returns : undef + Argument : Object reference for the child object to be dropped + Throws : Exception if an object ref is not provided as an argument. + Comments : This is a simplistic version that systematically checks every + : data member, searching all top-level array, hash, and scalar + : data members. + : It does not recurse through all levels of complex data members. + : Subclasses could override this method to handle complex child + : data members for more optimal child searching. However, the + : version here is probably sufficient for most situations. + : + : _drop_child() is called by Bio::Root::Object::destroy() for + : all objects with parents. + Status : Experimental + +See Also : L + +=cut + +#---------------' +sub _drop_child { +#--------------- + my ($self, $child) = @_; + my ($member, $found); + + $self->throw("Child not defined or not an object ($child).") unless ref $child; + + local($^W = 0); + foreach $member (keys %{$self}) { + next unless ref($self->{$member}); + # compare references. + if (ref($self->{$member}) eq 'ARRAY') { + my ($i); + for($i=0; $i < @{$self->{$member}}; $i++) { + if ($self->{$member}->[$i] eq $child) { + $DEBUG==2 && print STDERR "Removing array child $child\n"; + undef $self->{$member}->[$i]; + $found = 1; last; + } + } + } elsif(ref($self->{$member}) eq 'HASH') { + foreach(keys %{$self->{$member}}) { + if ($self->{$member}->{$_} eq $child) { + $DEBUG==2 && print STDERR "Removing hash child $child\n"; + undef $self->{$member}->{$_}; + $found = 1; last; + } + } + } else { + if ($self->{$member} eq $child) { + $DEBUG==2 && print STDERR "Removing child $child\n"; + undef $self->{$member}; + $found = 1; last; + } + } + } + # Child not found: + # It is possible that a child object has a parent but has not yet been added to + # the parent due to a failure during construction of the child. Not warning. + #$self->warn(sprintf "Child %s not found in Parent %s.", $child->to_string, $self->to_string) unless $found; + + undef; +} + + +################################################################# +# ACCESSORS & INSTANCE METHODS +################################################################# + + + +=head2 name + + Usage : $object->name([string]); + Purpose : Set/Get an object's common name. + Example : $myName = $myObj->name; + : $myObj->name('fred'); + Returns : String consisting of the object's name or + : "anonymous " if name is not set. + : Thus, this method ALWAYS returns some string. + Argument : String to be used as the common name of the object. + : Should be unique within its class. + +See also : L + +=cut + +#--------- +sub name { +#--------- + my $self = shift; + +# $DEBUG and do{ print STDERR "\n$ID: name(@_) called.";; }; + + if (@_) { $self->{'_name'} = shift } + return defined $self->{'_name'} ? $self->{'_name'} : 'anonymous '.ref($self); +} + + +=head2 to_string + + Usage : $object->to_string(); + Purpose : Get an object as a simple string useful for debugging purposes. + Example : print $myObj->to_string; # prints: Object "" + Returns : String consisting of the package name + object's name + : Object's name is obtained by calling the name() method. + Argument : n/a + Throws : n/a + +See also : L + +=cut + +#------------- +sub to_string { +#------------- + my $self = shift; + return sprintf "Object %s \"%s\"", ref($self), $self->name; +} + + +=head2 parent + + Usage : $object->parent([object | 'null']); + Purpose : Set/Get the current object's source object. + : An object's source object (parent) is defined as the object + : that is responsible for creating the current object (child). + : The parent object may also have a special mechanism for + : destroying the child object. This should be included + : in the parent object's DESTROY method which should end with a + : call to $self->SUPER::DESTROY. + Example : $myObj->parent($otherObject); + Returns : Object reference for the parent object or undef if none is set. + Argument : Blessed object reference (optional) or the string 'null'. + : 'null' = sets the object's _parent field to undef, + : breaking the child object's link to its parent. + Throws : Exception if argument is not an object reference or 'null'. + Comments : This method may be renamed 'parent' in the near future. + : When and if this happens, parent() will still be supported but + : will be deprecated. + +See also : L + +=cut + +#------------' +sub parent { +#------------ + my ($self) = shift; + if (@_) { + my $arg = shift; + if(ref $arg) { + $self->{'_parent'} = $arg; + } elsif($arg =~ /null/i) { + $self->{'_parent'} = undef; + } else { + $self->throw("Can't set parent using $arg: Not an object"); + } + } + $self->{'_parent'}; +} + + +=head2 src_obj + + Usage : $object->src_obj([object | 'null']); + : THIS METHOD IS NOW DEPRECATED. USE parent() INSTEAD. + Purpose : Set/Get the current object's source object (parent). + +See also : L + +=cut + +#------------' +sub src_obj { +#------------ + my ($self) = shift; + $self->warn("DEPRECATED METHOD src_obj() CALLED. USE parent() INSTEAD.\n"); + $self->parent(@_); +} + + +=head2 has_name + + Usage : $object->has_name(); + Purpose : To determine if an object has a name. + Returns : True (1) if the object's {'Name'} data member is defined. + : False otherwise. + Comments : One may argue, why not just use the name() method as a + : combination setter/getter? has_name() is necessary for + : the following reasons: + : (1) If an object's name is not defined, name() returns + : "anonymous ". + : (2) If an object's name is 0 (zero) or '' (empty string), + : conditionals that simply check name() would fail incorrectly. + +See also : L + +=cut + +#--------------' +sub has_name { my $self = shift; return defined $self->{'_name'}; } +#-------------- + + + +=head2 make + + Usage : $object->make([string]); + Purpose : Set/Get an object's constructor option. + : make() is intended for use during object construction + : to essentially permit alternate constructors since + : Perl doesn't have a built-in mechanism for this. + Example : $make = $object->make(); + : $object->make('optionA'); + Returns : String consisting of the object's make option + : or 'default' if make is not set. + : Thus, this method ALWAYS returns some string. + Argument : String to be used as an option during object construction. + Comments : A typical use of a make option is when cloning an object + : from an existing object. In this case, the new() method + : is called with -MAKE => 'clone'. + +See also : L<_initialize()|_initialize>, L + +=cut + +#----------' +sub make { +#---------- + my $self = shift; + if(@_) { $self->{'_make'} = shift; } + $self->{'_make'} || 'default'; +} + + +=head2 err + + Usage : $self->err([$data], [$delimit]) + Purpose : Check for exceptions/warnings and get data about them. + : (object validation and error data retrieval) + Example : $self->err && print "has err"; + : $errCount = $self->err('count'); + : $errMsgs = $self->err('msg',"\t"); + : @errNotes = $self->err('note'); + Returns : One of the following: + : 1. If no arguments are given + : a. If the object has an error, the err data member is + : returned (this is an Bio::Root::Err.pm object), + : b. otherwise, undef is returned. + : 2. The number of Errs in the object's err data member (if $data eq 'count'). + : 3. A string containing data from a specific field from an object's err member. + : -- If the object contains multiple errors, data for all errors will be + : strung together in reverse chronological order with each error's data + : preceeded by "Error #n\n" and followed by two delimiters. + : 4. A list containing data from a specific field from an object's err member. + : -- If the object contains multiple errors, data for all errors will be + : added in reverse chronological order as separate elements in the list + : with NO "Error #n\n" identifier. Individual err list data + : (note,tech,stack) will be tab-delimited. + Arguments : $data = The name of a specific Err data member (see %Bio::Root::Err::ERR_FIELDS) + : OR 'count'. + : $delimit = The delimiter separating a single Err's list data member's elements. + : Default is "\n". For multi-error objects, two of these + : delimiters separate data from different errors. + : If wantarray is true or delimiter is 'list', data from multiple + : errors will be returned as a list + : + Comments : Since Err objects are now fatal and are not attached to the object by default, + : this method is largely moot. It is a relic from the former + : error "polling" days. + : It is handy for accessing non-fatal warnings thrown by the object, + : or in situations where fatal errors are converted to warnings + : as when $self->strict is -1 or $WARN_ON_FATAL is true. + : (Note: an object now only attaches Err objects to itself when + : constructed with -RECORD_ERR =>1 or if the global $RECORD_ERR is true). + : + : This method is intended mainly to test whether or not an object + : has any Err objects associated with it and if so, obtaining the + : Err object or specific data about it. + : For obtaining ALL data about an error, use err_string(). + : For more detailed manipulations with the Err data, retrieve the + : Err object and process its data as necessary. + +See also : L, L, L + +=cut + +#---------- +sub err { +#---------- + my( $self, $data, $delimit) = @_; + + return undef unless defined $self->{'_err'}; + + $data ||= 'member'; +# $delimit ||= (wantarray ? 'list' : "\n"); + $delimit ||= "\n"; + + $data eq 'member' and return $self->{'_err'}; + $data eq 'count' and return $self->{'_err'}->size(); + + return $self->{'_err'}->get_all($data, $delimit ); +} + + +=head2 record_err + + Usage : $object->record_err([0|1]); + Purpose : Set/Get indicator for whether an object should save + : the Bio::Root::Err.pm objects it generates via calls + : to throw() or warn(). + Example : $myObj->record_err(1) + Returns : Boolean (0|1) + Argument : Boolean (0|1) + Comments : Record_err is generally useful only for examining + : warnings produced by an object, since calls to throw() + : are normally fatal (unless strictness is set to -2). + : To turn on recording of errors for all objects in a process, + : use Bio::Root::Global::record_err(). + Status : Experimental + +See also : L, and record_err() in L + +=cut + +#--------------- +sub record_err { +#--------------- + my $self = shift; + + if (@_) { $self->{'_record_err'} = shift } + return $self->{'_record_err'} || 0; +} + + +=head2 err_state + + Usage : $object->err_state(); + Purpose : To assess the status of the object's Err object (if any). + Returns : A string: 'EXCEPTION' | 'WARNING' | 'FATAL' | 'OKAY' + : (OKAY is returned if there are no Errors) + Status : Experimental + +=cut + +#-------------' +sub err_state { +#------------- + my $self = shift; + return 'OKAY' if not defined $self->{'_err'}; + $self->{'_errState'} || 'OKAY'; +} + + +=head2 clear_err + + Purpose : To remove any error associated with the given object. + Usage : $myObj->clear_err; + +See also : L + +=cut + +#------------- +sub clear_err { +#------------- + my $self = shift; + undef $self->{'_err'}; +} + + + + + +=head2 containment + + Usage : $aref = $object->containment(); + : Since this method can be exported, the following can be used: + : $aref = containment($object); + Purpose : To determine the containment hierarchy of a object. + Returns : An array reference in which each element is a string + : containing the class and name of + : the object in which this object is contained. + : Indentation increases progressively as the + : hierarchy is traversed. + : E.g., Object MyClass "Foo" + : Contained in object YourClass "Bar" + : Contained in object HisClass "Moo" + Comments : This method will report only one object at each level + : since an object can currently have only one source object. + Status : Exported + +See also : L + +=cut + +#------------------ +sub containment { +#------------------ + my( $self) = @_; + my(@hierarchy); + +# print "$ID: getting err hierarchy.\n"; + push @hierarchy, $self->to_string; + my $obj = $self; + my $count = 0; + + while( ref $obj->parent) { + $obj = $obj->parent; + push @hierarchy, sprintf "%sContained in %s", ' ' x ++$count, $obj->to_string; + } + return \@hierarchy; +} + + +=head2 set_stats + + Usage : $object->set_stats(KEY => DATA [,KEY2 => DATA2]) + Purpose : To declare and initialize a set of statistics germain + : to an object. Each statistic name becomes a data member + : prefixed with an underscore (if not already) and first + : character after the underscore is lowercased. + Example : $object->set_stats('num_A' =>1, + : 'Num_B' =>10 ): + : This sets : + : $object->{'_num_A'} = 1 + : $object->{'_num_B'} = 10; + Returns : n/a + Comments : This method implements a convention for naming Perl + : object data members with a leading underscore, + : consistent with the naming convention of private methods. + : Data members should not be part of an object's public + : interface. The leading underscore helps flag the members + : as private and also prevents inadvertant clobbering. + +=cut + +#--------------' +sub set_stats { +#-------------- + my( $self, %param ) = @_; + + my ($val); + foreach (keys %param) { + $val = $param{$_};; + s/^(\w)/_\l$1/; + $self->{$_} = $val; + } +} + + +=head2 strict + + Usage : $object->strict( [-2|-1|0|1|2] ); + : warn $message if $object->strict > 0; + Purpose : To make the object hyper- or hyposensitive to exceptions & warnings. + : Strict = 2 : extremely hyper-sensitive, converts warn() into throw(). + : Strict = 1 : hyper-sensitive, but calls to warn are not converted. + : Strict = 0 : no change (throw() = fatal, warn() = non-fatal). + : Strict = -1 : hypo-sensitive, but calls to throw are not converted. + : Strict = -2 : extremely hypo-sensitive, converts throw() into warn() + : + : Two degrees of positive and negative values for strict permit + : the following functionality: + : 1. Setting strict to 2 or -2 leads to more dramatic strictness + : or permissiveness, respectively. With 2, all calls to warn() + : become calls to throw() and are therefore fatal. With -2, + : the opposite is true and calls to throw become non-fatal. + : A strict value of 2 is thus an object-level version of + : Perl's "use strict" pragma. + : + : 2. Setting strict to 1 or -1 does not affect the behavior of + : throw() and warn(). This allows an object to implement its + : its own strictness policy. A strict value of 1 is thus an + : an object-level version of Perl's -w flag. + : + Returns : Integer between -2 to 2. + Comments : This method no longer accesses an object-specific strictness + : level but rather the global $STRICTNESS variable + : defined in Bio::Root::Global.pm and accessed via the + : strictness() method exported by that package. + : Thus, all objects share the same strictness which + : is generally more convenient. + Status : Experimental + +See also : warn() and throw() in L, L, strictness() in L + +=cut + +#------------ +sub strict { +#------------ + my $self = shift; + + # Use global strictness? + if( $self->{'_use_global_strictness'}) { + return &strictness(@_); + } + else { + # Object-specific strictness + if (@_) { $self->{'_strict'} = shift; } + defined($self->{'_strict'}) + ? return $self->{'_strict'} + : (ref $self->{'_parent'} ? $self->{'_parent'}->strict : 0); + } +} + +=head2 use_global_strictness + + Usage : $object->use_global_strictnness( [1|0] ); + Purpose : Set/Get accessor for a flag indicating whether or not + : to use the global strictness setting or to instead use + : object-specific strictness. + Returns : Boolean + Comments : + Status : Experimental + +See also : L, L, strictness() in L + +=cut + +sub use_global_strictness { + my ($self, $value) = @_; + + if( defined $value ) { + $self->{'_use_global_strictness'} = $value; + } + + return $self->{'_use_global_strictness'}; +} + + +=head2 clone + + Purpose : To deeply copy an object. + : Creates a new object reference containing an exact + : copy of an existing object and all its data members. + Usage : $myClone = $myObj->clone; + Comments : This method only clones the Bio::Root::Object data members. + : To fully clone an object that has data members beyond + : those inherited from Bio::Root::Object, you must provide a + : constructor in your class to copy all data of an object + : data into the clone. For an example, see how _set_clone() + : is called by _initialize() in this class. + : + : clone() will pass the named parameters {-MAKE=>'clone'} + : and {-OBJ=>$self} to the object's constructor. The + : constructor should then either check the -MAKE parameter + : directly or should check the return value from + : a call to the superclass constructor (see _initialize() + : for an example) and then copy the required data members from OBJ + : into the new object, bypassing the normal construction process. + : Cloning of objects has not been extensively tested. + : USE WITH CAUTION. + Status : Experimental + +See Also : L<_set_clone()|_set_clone>, L<_initialize()|_initialize> + +=cut + +#-------------' +sub clone { +#------------- + my($self) = shift; + +# warn sprintf "\nCloning %s \"%s\"\n\n", ref($self),$self->name; + + my $clone = $self->new(-MAKE =>'clone', + -OBJ =>$self); + if($self->err()) { $clone->err($self->err); } + $clone; +} + + + +=head2 _set_clone + + Usage : n/a; internal method used by _initialize() + : $self->_set_clone($object_to_be_cloned) + Purpose : Deep copy all Bio::Root::Object.pm data members + : into a new object reference. + : (This is basically a copy constructor). + Argument : object ref for object to be cloned. + Throws : Exception if argument is not an object reference. + Comments : Data members which are objects are cloned (parent, io, err). + : Cloning of objects has not been extensively tested. + : USE WITH CAUTION. + +See Also : L<_initialize()|_initialize> + +=cut + +#---------------- +sub _set_clone { +#---------------- + my($self, $obj) = @_; + + ref($obj) || throw($self, "Can't clone $ID object: Not an object ref ($obj)"); + + local($^W) = 0; # suppress 'uninitialized' warnings. + + $self->{'_name'} = $obj->{'_name'}; + $self->{'_strict'} = $obj->{'_strict'}; + $self->{'_make'} = $obj->{'_make'}; + $self->{'_verbose'} = $obj->{'_verbose'}; + $self->{'_errState'} = $obj->{'_errState'}; + ## Better to use can() with Perl 5.004. + $self->{'_parent'} = ref($obj->{'_parent'}) and $obj->{'_parent'}->clone; + $self->{'_io'} = ref($obj->{'_io'}) and $obj->{'_io'}->clone; + $self->{'_err'} = ref($obj->{'_err'}) and $obj->{'_err'}->clone; +} + + + +=head2 verbose + + Usage : $object->verbose([-1|0|1]); + Purpose : Set/Get an indicator for how much ruporting an object should do. + Argument : integer (-1, 0, or 1) + Returns : integer (-1, 0, or 1) + : Returns 0 if verbosity has not been defined. + : Verbosity > 0 indicates extra reporting. + : Verbosity < 0 indicates minimal reporting. + : Verbosity = 0 or undefined indicates default reporting. + Comments : This method no longer accesses an object-specific verbosity + : level but rather the global $VERBOSITY variable + : defined in Bio::Root::Global.pm and accessed via the + : verbosity() method exported by that package. + : Thus, all objects share the same verbosity which + : is generally more convenient. + Status : Experimental + +See Also : L, L, verbosity() in L + +=cut + +#------------ +sub verbose { +#------------ + my $self = shift; + + # Using global verbosity + return &verbosity(@_); + + # Object-specific verbosity (not used unless above code is commented out) + if(@_) { $self->{'_verbose'} = shift; } + defined($self->{'_verbose'}) + ? return $self->{'_verbose'} + : (ref $self->{'_parent'} ? $self->{'_parent'}->verbose : 0); +} + + + +=head1 I/O-RELATED METHODS (Delegated to B) + +=head2 _io + + Usage : $object->_io() + Purpose : Get the Bio::Root::IOManager.pm object for the current object. + +See also : L, L, L + +=cut + +#------- +sub _io { my $self = shift; return $self->{'_io'}; } +#------- + + + +=head2 _set_io + + Usage : n/a; internal use only. + Purpose : Sets a new Bio::Root::IOManager.pm object for the current object. + +See also : L, L, L + +=cut + +#------------ +sub _set_io { +#------------ + my $self = shift; + + require Bio::Root::IOManager; + +# See PR#192. +# $self->{'_io'} = new Bio::Root::IOManager(-PARENT=>$self, @_); + $self->{'_io'} = new Bio::Root::IOManager(-PARENT=>$self); +} + + + +=head2 set_display + + Usage : $object->set_display( %named_parameters). + : See Bio::Root::IOManager::set_display() for a description of parameters. + Purpose : Sets the output stream for displaying data associated with an object. + : Delegates to Bio::Root::IOManager::set_display(). + Argument : Named parameters (optional). + : See Bio::Root::IOManager::set_display() for a + : description of arguments. + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + : I'm not satisfied with the current display()/set_display() strategy. + +See also : set_display() in L + +=cut + +#----------------' +sub set_display { +#---------------- + my($self, @param) = @_; + + $self->_set_io(@param) if !ref($self->{'_io'}); + + eval { $self->{'_io'}->set_display(@param); }; + + if($@) { + my $er = $@; + $self->throw(-MSG=>$er, -NOTE=>"Can't set_display for ${\$self->name}"); + } + + return $self->{'_io'}->fh; +} + + +=head2 display + + Usage : $object->display( named parameters) + : See Bio::Root::IOManager::display() for a description of parameters. + Purpose : Output information about an object's data. + : Delegates this task to Bio::Root::IOManager::display() + Argument : Named parameters for IOManager::set_display() + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + : IOManager::set_display()is then called on the new IOManager object. + : + : The motivation behind the display() method and IOManager.pm + : is to allow for flexible control over output of an + : object's data to/from filehandles, pipes, or STDIN/STDOUT, + : and for passing file handles between objects. Currently, + : it is used mainly for output to STDOUT. + : + : There is some concern whether this much functionality is + : actually necessary, hence the "Experimental" status of this + : method. + : + : ------- + : It might be worthwhile to also have a string() method + : that will put an object's data into a string that can be + : further processed as desired. Stringification for persistence + : issues might be best handled by Data::Dumper.pm. + : + : When overriding this method, use the following syntax: + : + : sub display { + : my ($self, %param) = @_; + : $self->SUPER::display(%param); + : my $OUT = $self->fh(); + : print $OUT "\nSome data...\n"; + : ... + : } + : Now $OUT holds a filhandle reference (or the string 'STDOUT') + : which can be passed to other methods to display different + : data for the object. + : _set_display() is automatically called with $OUT as the sole + : argument (after $self) by IOManager.pm::display() + : if the -SHOW parameter is set to 'stats' or 'default'. + : + Bugs : Because the $OUT variable can be a FileHandle or a string, + : it is necessary to include the line before using $OUT in + : print statements: + : I am considering a cleaner way of dealing with this. + : Setting $OUT to a glob (*main::STDOUT) was unsuccessful. + : + : I'm not satisfied with the current display()/set_display() strategy. + +See also : display() in L + +=cut + +#------------- +sub display { +#------------- + my( $self, @param ) = @_; + $self->{'_io'} || $self->set_display(@param); + $self->{'_io'}->display(@param); +} + + + + +=head2 _display_stats + + Usage : n/a; called automatically by Bio::Root::Object::display(-SHOW=>'stats'); + Purpose : Display stereotypical data for an object. + : Automatically called via display(). + Argument : Filehandle reference or string 'STDOUT' 'STDIN' 'STDERR' + Status : Experimental + +See also : L + +=cut + +#------------------- +sub _display_stats { +#------------------- + my($self, $OUT) = @_; + + + printf ( $OUT "%-15s: %s\n","NAME", $self->name()); + printf ( $OUT "%-15s: %s\n","MAKE", $self->make()); + if($self->parent) { + printf ( $OUT "%-15s: %s\n","PARENT", $self->parent->to_string); + } + printf ( $OUT "%-15s: %d\n",'ERRORS', (defined $self->err('count') ? $self->err('count') : 0)); ###JES### + printf ( $OUT "%-15s: %s\n","ERR STATE", $self->err_state()); + if($self->err()) { + print $OUT "ERROR:\n"; + $self->print_err(); + } +} + + + +=head2 read + + Usage : $object->read( named parameters) + : See Bio::Root::IOManager::read() for a description of parameters. + Purpose : Inputs data from an arbitrary source (file or STDIN). + : Delegates this task to Bio::Root::IOManager::read(). + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + : See the comments for the display() method for some comments + : about IO issues for objects. + : Note that the read() method uses a different strategy than + : the display() method. + : IO issues are considered experimental. + +See also : L, read() in L + +=cut + +#-------- +sub read { +#-------- + my $self = shift; + + $self->_set_io(@_) if not defined $self->{'_io'}; + + $self->{'_io'}->read(@_); +} + + + +=head2 fh + + Usage : $object->fh(['name']) + : See Bio::Root::IOManager::fh() for a complete usage description. + Purpose : Get an object's current FileHandle object or IO stream indicator. + : Delegates to Bio::Root::IOManager.pm. + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : fh() in L + +=cut + +#--------' +sub fh { +#-------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->fh(@_); +} + + +=head2 show + + Usage : $object->show() + : See Bio::Root::IOManager::show() for details. + Purpose : Get the string used to specify what to display + : using the display() method. + : Delegates to Bio::Root::IOManager.pm. + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : show() in L, set_display() in L + +=cut + +#----------- +sub show { +#----------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->show; +} + + + +=head2 file + + Usage : $object->file() + : See Bio::Root::IOManager::file() for details. + Purpose : Set/Get name of a file associated with an object. + : Delegates to Bio::Root::IOManager.pm. + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : file() in L + +=cut + +#--------- +sub file { +#--------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->file(@_); +} + + +=head2 compress_file + + Usage : $object->compress_file([filename]) + : See Bio::Root::IOManager::compress_file() for details. + Purpose : Compress a file associated with the current object. + : Delegates to Bio::Root::IOManager.pm. + Throws : Propagates exceptions thrown by Bio::Root::IOManager.pm + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : L, compress_file() in L + +=cut + +#------------------- +sub compress_file { +#------------------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->compress_file(@_); +} + + + +=head2 uncompress_file + + Usage : $object->uncompress_file([filename]) + : Delegates to Bio::Root::IOManager.pm. + Purpose : Uncompress a file associated with the current object. + Throws : Propagates exceptions thrown by Bio::Root::IOManager.pm + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : L, uncompress_file() in L + +=cut + +#-------------------- +sub uncompress_file { +#-------------------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->uncompress_file(@_); +} + + +=head2 delete_file + + Usage : $object->delete_file([filename]) + : See Bio::Root::IOManager::delete_file() for details. + Purpose : Delete a file associated with the current object. + : Delegates to Bio::Root::IOManager.pm. + Throws : Propagates exceptions thrown by Bio::Root::IOManager.pm + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : L, delete_file() in L + +=cut + +#----------------- +sub delete_file { +#----------------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->delete_file(@_); +} + + +=head2 file_date + + Usage : $object->file_date( %named_parameters ) + : See Bio::Root::IOManager::file_date() for details. + Purpose : Obtain the last modified data of a file. + : Delegates to Bio::Root::IOManager.pm. + Example : $object->file_date('/usr/home/me/data.txt'); + Throws : Propagates exceptions thrown by Bio::Root::IOManager.pm + Status : Experimental + Comments : Sets the IOManager.pm object if it is not set. + +See also : L, file_date() in L + +=cut + +#--------------- +sub file_date { +#--------------- + my $self = shift; + $self->_set_io(@_) if !defined $self->{'_io'}; + $self->{'_io'}->file_date(@_); +} + + + +=head1 EXPERIMENTAL METHODS + + +=head2 xref + + Usage : $object->xref([object | 'null']); + Purpose : Sets/Gets an object(s) cross-referenced + : to the current object. + Example : $myObj->xref('null'); #remove all xrefs + : $myObj->xref($otherObject); #add a cross referenced object + Argument : Object reference or 'null' ('undef' also accepted). + Returns : Object reference or undef if the object has no xref set. + Throws : fatal error if argument is not an object reference or 'null'. + Comments : An Xref.pm object is a vectorized wrapper for an object. + : Thus, the number of objects cross-referenced can grow + : and shrink at will. + Status : Experimental + WARNING : NOT FULLY TESTED. + +See Also : L + +=cut + +#--------- +sub xref { +#--------- + my $self = shift; + if(@_) { + my $arg = shift; + if(ref $arg) { + require Bio::Root::Xref; + + if( !defined $self->{'_xref'}) { + $self->{'_xref'} = new Bio::Root::Xref(-PARENT =>$self, + -OBJ =>$arg); + } else { + $self->{'_xref'}->add($arg); + } + } elsif($arg =~ /null|undef/i) { + undef $self->{'_xref'}; + } else { + $self->throw("Can't set Xref using $arg: Not an object"); + } + } + + $self->{'_xref'}; +} + + + +=head2 index + + Purpose : To add an object to a package global hash of objects + : for tracking or rapid retrieval. + Usage : $self->index(); + Status : Experimental + Comments : The object's name is used to index it into a hash. Objects in + : different classes (packages) will be indexed in different hashes. + : An object's name should thus be unique within its class. + : To find an object, use find_object(). + : Uses the package global %Objects_created. + +See also : L + +=cut + +#---------- +sub index { +#---------- + my $self = shift; + my $class = ref $self; + my $objName = $self->{'_name'}; + + if( not defined $objName ) { + $self->throw("Can't index $class object \"$objName\"."); + } + + $DEBUG and do{ print STDERR "$ID: Indexing $class object \"$objName\"."; ; }; + + $Objects_created{ $class }->{ $objName } = $self; +} + +#---------------------- +sub _remove_from_index { +#---------------------- + my $self = shift; + my $class = ref $self; + my $objName = $self->{'_name'}; + + undef $Objects_created{$class}->{$objName} if exists $Objects_created{$class}->{$objName}; +} + + + +=head2 find_object + + Purpose : To obtain any object reference based on its unique name + : within its class. + Usage : $myObj = &find_object('fred'); + : No need to specify the class (package) name of the object. + Comments : To use this method, the object must be previously + : indexed by Bio::Root::Object.pm. This can be accomplished + : by including 'index' in the -MAKE parameter during object + : construction OR by calling the index() method on the + : the object at any point after construction. + : This is not an instance method. + Status : Experimental + +See also : L + +=cut + +#--------------- +sub find_object { +#--------------- + my $name = shift; # Assumes name has been validated. + my $class = undef; + my $object = undef; + + foreach $class ( keys %Objects_created ) { + if( exists $Objects_created{ $class }->{ $name } ) { + $object = $Objects_created{ $class }->{ $name }; + last; + } + } + $object; +} + + + +=head2 has_warning + + Purpose : Test whether or not an object has a non-fatal error (warning). + Usage : $self->has_warning; + Comments : This method is not usually needed. Checking err() is + : sufficient since throw()ing an exception is a fatal event + : and must be handled when it occurs. + Status : Experimental + +See also : L, warn() in L, throw() in L + +=cut + +#---------------- +sub has_warning { +#---------------- + my $self = shift; + my $errData = $self->err('type'); + return 1 if $errData =~ /WARNING/; + 0; +} + + + +=head2 print_err + + Usage : print_err([-WHERE=>FileHandle_object [,-SHOW=>msg|note|tech|stack] or any combo]) + Purpose : Reports error data for any errors an object may have + : as a string. This will only print warnings since exceptions + : are fatal (unless a strictness of -2 is used). + Example : $myObj->print_err; + : $myObj->print_err(-WHERE=>$myObj->fh('err'), -SHOW=>'msgtechstack'); + Argument : SHOW parameter : specify a sub-set of the err data. + : WHERE parameter : specify a filehandle for printing. + Returns : n/a + Status : Experimental + +See also : L, L + +=cut + +#------------- +sub print_err { +#------------- + my( $self, %param ) = @_; + +# print "$ID: print_err()\n"; + + my $OUT = $self->set_display(%param); + +# print "$ID: OUT = $OUT\n"; + + print $OUT $self->err_string( %param ); + +# print "$ID: done print_err()\n"; +} + + + +=head2 err_string + + Usage : err_string([-SHOW =>msg|note|tech|stack]) + : err_string([-SHOW =>'msgnote'] or other combos) + Purpose : Reports all warnings generated by the object as a string. + Example : $errData = $myObj->err_string; + : print MYHANDLE $myObj->err_string(); + Argument : SHOW parameter : return a specific sub-set of the err data. + Returns : A string containing the error data of the object. + Comments : This method is provided as a safer and slightly easier to type + : alternative to $self->err->string. + Status : Experimental + +See also : L, string() in L + +=cut + +#---------------- +sub err_string { +#---------------- + my( $self, %param ) = @_; + my($out); + my $errCount = $self->err('count'); + +# print "$ID: err_string(): count = $errCount\n"; + + if( $errCount) { + $out = sprintf("\n%d error%s in %s \"%s\"\n", + $errCount, $errCount>1?'s':'', ref($self), $self->name); + $out .= $self->err->string( %param ); + } else { + $out = sprintf("\nNo errors in %s \"%s\"\n", ref($self), $self->name); + } + $out; +} + + + + +################################################################# +# DEPRECATED or HIGHLY EXPERIMENTAL METHODS +################################################################# + +=head1 HIGHLY EXPERIMENTAL/DEPRECATED METHODS + +=head2 terse + + Usage : $object->terse([0|1]); + Purpose : Set/Get an indicator to report less than the normal amount. + Argument : Boolean (0|1) + Returns : Boolean (0|1) + Comments : This method is for reducing the amount of reporting + : an object will do. + : terse can be set during object construction with the + : -TERSE => 1 flag. + : Not putting this method in IOManager.pm since that class + : is concerned with "where" to report, not "what" or "how much". + Status : Deprecated + : Use verbose() with a negative value instead. + +See also : L + +=cut + +#---------- +sub terse { +#---------- + my $self = shift; + if(@_) { $self->{'_verbose'} = -1 * shift; } + + $self->warn("Deprecated method 'terse()'. Use verbose(-1) instead."); + + my $verbosity = $self->{'_verbose'} or + (ref $self->{'_parent'} and $self->{'_parent'}->verbose) or 0; + + return $verbosity * -1; +} + + +#---------------------- +=head2 set_err_data() +#---------------------- + + Usage : $object->set_err_data( field, data); + Purpose : Alters data within the last error set by the object. + : Interface to Bio::Root::Err::set(). + Returns : Calls Bio::Root::Err::set() + Argument : field = string, name of Bio::Root::Err.pm data field to set. + : data = string, data to set it to. + Throws : Exception if object has no errors. + Status : Deprecated + +See Also : set() in L + +=cut + +#----------------- +sub set_err_data { +#----------------- + my( $self, $field, $data) = @_; + + $self->throw("Object has no errors.") if !$self->{'_err'}; + +# print "$ID: set_err_data($field) with data = $data\n in object ${\$self->name}:\n", $self->err->last->string(-CURRENT=>1); ; + + $self->{'_err'}->last->set( $field, $data ); +} + +=head2 set_read + + Usage : see Bio::Root::IOManager::set_read() + Purpose : Sets an input stream for importing data associated with an object. + : Delegates to Bio::Root::IOManager::set_read(). + Status : Experimental + WARNING! : This method has not been tested. + +See also : set_read() in L + +=cut + +#-------------- +sub set_read { +#-------------- + my($self,%param) = @_; + + $self->_set_io(%param) if !defined $self->{'_io'}; + + $self->{'_io'}->set_read(%param); +} + + + +=head2 set_log_err + + Usage : see Bio::Root::IOManager::set_log_err() + Purpose : Sets the output stream for logging information about + : an object's errors. + : Delegates to Bio::Root::IOManager::set_log_err(). + Status : Experimental + WARNING! : This method has not been tested. + +See also : set_log_err() in L + +=cut + +#---------------' +sub set_log_err { +#--------------- + my($self,%param) = @_; + + $self->_set_io(%param) if !defined $self->{'_io'}; + + $self->{'_io'}->set_log_err(%param); +} + + +1; +__END__ + + +##################################################################################### +# END OF CLASS # +##################################################################################### + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for this documentation to become obsolete as this module is still evolving. +Always double check this info and search for members not described here. + +=back + +An instance of Bio::Root::Object.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + ------------------------------------------------------------------------ + _name Common name for an object useful for indexing. + Should be unique within its class. + + _parent The object which created and is responsible for this object. + When a parent is destroyed, it takes all of its children with it. + + _err Bio::Root::Err.pm object reference. Undefined if the object has no error + or if the _record_err member is false (which is the default). + If object has multiple errors, err becomes a linked + list of Err objects and the err member always points to latest err. + In theory, an object should care only about whether or not it HAS + an Err not how many it has. I've tried to make the management of + multiple errors as opaque as possible to Bio::Root::Object. + + _errState One of @Bio::Root::Err::ERR_TYPES. Allows an object to quickly determine the + the type of error it has (if any) without having to examine + potentially multiple Err object(s). + + _xref Bio::Root::Xref object (Vector) for tracking other object(s) related to the + present object not by inheritance or composition but by some arbitrary + criteria. This is a new, experimental feature and is not fully implemented. + + _make Used as a switch for custom object initialization. Provides a + mechanism for alternate constructors. This is somewhat experimental. + It may be useful for contruction of complex objects and may be of + use for determining how an object was constructed post facto. + + _io Bio::Root::IOManager.pm object reference. Used primarily for handling the + display of an object's data. + + _strict Integer flag to set the sensitivity to exceptions/warnings + for a given object. + + _verbose Boolean indicator for reporting more or less than the normal amount. + + _record_err Boolean indicator for attaching all thrown exception objects + to the current object. Default = false (don't attach exceptions). + +=cut + + +MODIFICATION NOTES: +----------------------- +0.041, sac --- Thu Feb 4 03:50:58 1999 + * warn() utilizes the Global $CGI indicator to supress output + when script is running as a CGI. + +0.04, sac --- Tue Dec 1 04:32:01 1998 + * Incorporated the new globals $STRICTNESS and $VERBOSITY + and eliminated WARN_ON_FATAL, FATAL_ON_WARN and DONT_WARN. + * Deprecated terse() since it is better to think of terseness + as negative verbosity. + * Removed autoloading-related code and comments. + +0.035, 28 Sep 1998, sac: + * Added _drop_child() method to attempt to break cyclical refs + between parent and child objects. + * Added to_string() method. + * Err objects no longer know their parents (no need). + +0.031, 2 Sep 1998, sac: + * Documentation changes only. Wrapped the data member docs + at the bottom in POD comments which fixes compilation bug + caused by commenting out __END__. + +0.03, 16 Aug 1998, sac: + * Calls to warn() or throw() now no longer result in Err.pm objects + being attached to the current object. For discussion about this + descision, see comments under err(). + * Added the -RECORD_ERR constructor option and Global::record_err() + method to enable the attachment of Err.pm object to the current + object. + * Minor bug fixes with parameter handling (%param -> @param). + * Added note about AUTOLOADing. + +0.023, 20 Jul 1998, sac: + * Changes in Bio::Root::IOManager::read(). + * Improved memory management (destroy(), DESTROY(), and changes + in Bio::Root::Vector.pm). + +0.022, 16 Jun 1998, sac: + * Changes in Bio::Root::IOManager::read(). + +0.021, May 1998, sac: + * Touched up _set_clone(). + * Refined documentation in this and other Bio::Root modules + (converted to use pod2html in Perl 5.004) + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Root.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Root.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,427 @@ +package Bio::Root::Root; +use strict; + +# $Id: Root.pm,v 1.30 2002/12/16 09:44:28 birney Exp $ + +=head1 NAME + +Bio::Root::Root - Hash-based implementation of Bio::Root::RootI + +=head1 SYNOPSIS + + # any bioperl or bioperl compliant object is a RootI + # compliant object + + # Here's how to throw and catch an exception using the eval-based syntax. + + $obj->throw("This is an exception"); + + eval { + $obj->throw("This is catching an exception"); + }; + + if( $@ ) { + print "Caught exception"; + } else { + print "no exception"; + } + + # Alternatively, using the new typed exception syntax in the throw() call: + + $obj->throw( -class => 'Bio::Root::BadParameter', + -text => "Can't open file $file", + -value => $file); + + # Exceptions can be used in an eval{} block as shown above or within + # a try{} block if you have installed the Error.pm module. + # Here's a brief example. For more, see Bio::Root::Exception + + use Error qw(:try); + + try { + $obj->throw( # arguments as above ); + } + catch Bio::Root::FileOpenException with { + my $err = shift; + print "Handling exception $err\n"; + }; + +=head1 DESCRIPTION + +This is a hashref-based implementation of the Bio::Root::RootI +interface. Most bioperl objects should inherit from this. + +See the documentation for Bio::Root::RootI for most of the methods +implemented by this module. Only overridden methods are described +here. + +=head2 Throwing Exceptions + +One of the functionalities that Bio::Root::RootI provides is the +ability to throw() exceptions with pretty stack traces. Bio::Root::Root +enhances this with the ability to use B (available from CPAN) +if it has also been installed. + +If Error.pm has been installed, throw() will use it. This causes an +Error.pm-derived object to be thrown. This can be caught within a +C block, from wich you can extract useful bits of +information. If Error.pm is not installed, it will use the +Bio::Root::RootI-based exception throwing facilty. + +=head2 Typed Exception Syntax + +The typed exception syntax of throw() has the advantage of plainly +indicating the nature of the trouble, since the name of the class +is included in the title of the exception output. + +To take advantage of this capability, you must specify arguments +as named parameters in the throw() call. Here are the parameters: + +=over 4 + +=item -class + +name of the class of the exception. +This should be one of the classes defined in B, +or a custom error of yours that extends one of the exceptions +defined in B. + +=item -text + +a sensible message for the exception + +=item -value + +the value causing the exception or $!, if appropriate. + +=back + +Note that Bio::Root::Exception does not need to be imported into +your module (or script) namespace in order to throw exceptions +via Bio::Root::Root::throw(), since Bio::Root::Root imports it. + +=head2 Try-Catch-Finally Support + +In addition to using an eval{} block to handle exceptions, you can +also use a try-catch-finally block structure if B has been +installed in your system (available from CPAN). See the documentation +for Error for more details. + +Here's an example. See the B module for +other pre-defined exception types: + + try { + open( IN, $file) || $obj->throw( -class => 'Bio::Root::FileOpenException', + -text => "Cannot open file $file for reading", + -value => $!); + } + catch Bio::Root::BadParameter with { + my $err = shift; # get the Error object + # Perform specific exception handling code for the FileOpenException + } + catch Bio::Root::Exception with { + my $err = shift; # get the Error object + # Perform general exception handling code for any Bioperl exception. + } + otherwise { + # A catch-all for any other type of exception + } + finally { + # Any code that you want to execute regardless of whether or not + # an exception occurred. + }; + # the ending semicolon is essential! + + +=head1 CONTACT + +Functions originally from Steve Chervitz. Refactored by Ewan Birney. +Re-refactored by Lincoln Stein. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' + +use vars qw(@ISA $DEBUG $ID $Revision $VERSION $VERBOSITY $ERRORLOADED); +use strict; +use Bio::Root::RootI; +use Bio::Root::IO; + +@ISA = 'Bio::Root::RootI'; + +BEGIN { + + $ID = 'Bio::Root::Root'; + $VERSION = 1.0; + $Revision = '$Id: Root.pm,v 1.30 2002/12/16 09:44:28 birney Exp $ '; + $DEBUG = 0; + $VERBOSITY = 0; + $ERRORLOADED = 0; + + # Check whether or not Error.pm is available. + + # $main::DONT_USE_ERROR is intended for testing purposes and also + # when you don't want to use the Error module, even if it is installed. + # Just put a INIT { $DONT_USE_ERROR = 1; } at the top of your script. + if( not $main::DONT_USE_ERROR ) { + if ( eval "require Error" ) { + import Error qw(:try); + require Bio::Root::Exception; + $ERRORLOADED = 1; + $Error::Debug = 1; # enable verbose stack trace + } + } + if( !$ERRORLOADED ) { + require Carp; import Carp qw( confess ); + } + $main::DONT_USE_ERROR; # so that perl -w won't warn "used only once" + +} + + + +=head2 new + + Purpose : generic instantiation function can be overridden if + special needs of a module cannot be done in _initialize + +=cut + +sub new { +# my ($class, %param) = @_; + my $class = shift; + my $self = {}; + bless $self, ref($class) || $class; + + if(@_ > 1) { + # if the number of arguments is odd but at least 3, we'll give + # it a try to find -verbose + shift if @_ % 2; + my %param = @_; + ## See "Comments" above regarding use of _rearrange(). + $self->verbose($param{'-VERBOSE'} || $param{'-verbose'}); + } + return $self; +} + + +=head2 verbose + + Title : verbose + Usage : $self->verbose(1) + Function: Sets verbose level for how ->warn behaves + -1 = no warning + 0 = standard, small warning + 1 = warning with stack trace + 2 = warning becomes throw + Returns : The current verbosity setting (integer between -1 to 2) + Args : -1,0,1 or 2 + + +=cut + +sub verbose { + my ($self,$value) = @_; + # allow one to set global verbosity flag + return $DEBUG if $DEBUG; + return $VERBOSITY unless ref $self; + + if (defined $value || ! defined $self->{'_root_verbose'}) { + $self->{'_root_verbose'} = $value || 0; + } + return $self->{'_root_verbose'}; +} + +sub _register_for_cleanup { + my ($self,$method) = @_; + if($method) { + if(! exists($self->{'_root_cleanup_methods'})) { + $self->{'_root_cleanup_methods'} = []; + } + push(@{$self->{'_root_cleanup_methods'}},$method); + } +} + +sub _unregister_for_cleanup { + my ($self,$method) = @_; + my @methods = grep {$_ ne $method} $self->_cleanup_methods; + $self->{'_root_cleanup_methods'} = \@methods; +} + + +sub _cleanup_methods { + my $self = shift; + return unless ref $self && $self->isa('HASH'); + my $methods = $self->{'_root_cleanup_methods'} or return; + @$methods; + +} + +=head2 throw + + Title : throw + Usage : $obj->throw("throwing exception message"); + or + $obj->throw( -class => 'Bio::Root::Exception', + -text => "throwing exception message", + -value => $bad_value ); + Function: Throws an exception, which, if not caught with an eval or + a try block will provide a nice stack trace to STDERR + with the message. + If Error.pm is installed, and if a -class parameter is + provided, Error::throw will be used, throwing an error + of the type specified by -class. + If Error.pm is installed and no -class parameter is provided + (i.e., a simple string is given), A Bio::Root::Exception + is thrown. + Returns : n/a + Args : A string giving a descriptive error message, optional + Named parameters: + '-class' a string for the name of a class that derives + from Error.pm, such as any of the exceptions + defined in Bio::Root::Exception. + Default class: Bio::Root::Exception + '-text' a string giving a descriptive error message + '-value' the value causing the exception, or $! (optional) + + Thus, if only a string argument is given, and Error.pm is available, + this is equivalent to the arguments: + -text => "message", + -class => Bio::Root::Exception + Comments : If Error.pm is installed, and you don't want to use it + for some reason, you can block the use of Error.pm by + Bio::Root::Root::throw() by defining a scalar named + $main::DONT_USE_ERROR (define it in your main script + and you don't need the main:: part) and setting it to + a true value; you must do this within a BEGIN subroutine. + +=cut + +#' + +sub throw{ + my ($self,@args) = @_; + + my ( $text, $class ) = $self->_rearrange( [qw(TEXT CLASS)], @args); + + if( $ERRORLOADED ) { +# print STDERR " Calling Error::throw\n\n"; + + # Enable re-throwing of Error objects. + # If the error is not derived from Bio::Root::Exception, + # we can't guarantee that the Error's value was set properly + # and, ipso facto, that it will be catchable from an eval{}. + # But chances are, if you're re-throwing non-Bio::Root::Exceptions, + # you're probably using Error::try(), not eval{}. + # TODO: Fix the MSG: line of the re-thrown error. Has an extra line + # containing the '----- EXCEPTION -----' banner. + if( ref($args[0])) { + if( $args[0]->isa('Error')) { + my $class = ref $args[0]; + throw $class ( @args ); + } else { + my $text .= "\nWARNING: Attempt to throw a non-Error.pm object: " . ref$args[0]; + my $class = "Bio::Root::Exception"; + throw $class ( '-text' => $text, '-value' => $args[0] ); + } + } else { + $class ||= "Bio::Root::Exception"; + + my %args; + if( @args % 2 == 0 && $args[0] =~ /^-/ ) { + %args = @args; + $args{-text} = $text; + $args{-object} = $self; + } + + throw $class ( scalar keys %args > 0 ? %args : @args ); # (%args || @args) puts %args in scalar context! + } + } + else { +# print STDERR " Not calling Error::throw\n\n"; + $class ||= ''; + my $std = $self->stack_trace_dump(); + my $title = "------------- EXCEPTION $class -------------"; + my $footer = "\n" . '-' x CORE::length($title); + $text ||= ''; + + my $out = "\n$title\n" . + "MSG: $text\n". $std . $footer . "\n"; + + die $out; + } +} + +=head2 debug + + Title : debug + Usage : $obj->debug("This is debugging output"); + Function: Prints a debugging message when verbose is > 0 + Returns : none + Args : message string(s) to print to STDERR + +=cut + +sub debug{ + my ($self,@msgs) = @_; + + if( $self->verbose > 0 ) { + print STDERR join("", @msgs); + } +} + +=head2 _load_module + + Title : _load_module + Usage : $self->_load_module("Bio::SeqIO::genbank"); + Function: Loads up (like use) the specified module at run time on demand. + Example : + Returns : TRUE on success. Throws an exception upon failure. +. + Args : The module to load (_without_ the trailing .pm). + +=cut + +sub _load_module { + my ($self, $name) = @_; + my ($module, $load, $m); + $module = "_<$name.pm"; + return 1 if $main::{$module}; + + # untaint operation for safe web-based running (modified after a fix + # a fix by Lincoln) HL + if ($name !~ /^([\w:]+)$/) { + $self->throw("$name is an illegal perl package name"); + } + + $load = "$name.pm"; + my $io = Bio::Root::IO->new(); + # catfile comes from IO + $load = $io->catfile((split(/::/,$load))); + eval { + require $load; + }; + if ( $@ ) { + $self->throw("Failed to load module $name. ".$@); + } + return 1; +} + + +sub DESTROY { + my $self = shift; + my @cleanup_methods = $self->_cleanup_methods or return; + for my $method (@cleanup_methods) { + $method->($self); + } +} + + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/RootI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/RootI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,580 @@ +# $Id: RootI.pm,v 1.61 2002/12/16 09:44:28 birney Exp $ +# +# BioPerl module for Bio::Root::RootI +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code +# +# This was refactored to have chained calls to new instead +# of chained calls to _initialize +# +# added debug and deprecated methods --Jason Stajich 2001-10-12 +# + +=head1 NAME + +Bio::Root::RootI - Abstract interface to root object code + +=head1 SYNOPSIS + + # any bioperl or bioperl compliant object is a RootI + # compliant object + + $obj->throw("This is an exception"); + + eval { + $obj->throw("This is catching an exception"); + }; + + if( $@ ) { + print "Caught exception"; + } else { + print "no exception"; + } + + # Using throw_not_implemented() within a RootI-based interface module: + + package Foo; + @ISA = qw( Bio::Root::RootI ); + + sub foo { + my $self = shift; + $self->throw_not_implemented; + } + + +=head1 DESCRIPTION + +This is just a set of methods which do not assume B about the object +they are on. The methods provide the ability to throw exceptions with nice +stack traces. + +This is what should be inherited by all bioperl compliant interfaces, even +if they are exotic XS/CORBA/Other perl systems. + +=head2 Using throw_not_implemented() + +The method L should be +called by all methods within interface modules that extend RootI so +that if an implementation fails to override them, an exception will be +thrown. + +For example, say there is an interface module called C that +provides a method called C. Since this method is considered +abstract within FooI and should be implemented by any module claiming to +implement C, the C method should consist of the +following: + + sub foo { + my $self = shift; + $self->throw_not_implemented; + } + +So, if an implementer of C forgets to implement C +and a user of the implementation calls C, a +B exception will result. + +Unfortunately, failure to implement a method can only be determined at +run time (i.e., you can't verify that an implementation is complete by +running C on it). So it should be standard practice for a test +of an implementation to check each method and verify that it doesn't +throw a B. + +=head1 CONTACT + +Functions originally from Steve Chervitz. Refactored by Ewan +Birney. Re-refactored by Lincoln Stein. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Root::RootI; + +use vars qw($DEBUG $ID $Revision $VERSION $VERBOSITY); +use strict; +use Carp 'confess','carp'; + +BEGIN { + $ID = 'Bio::Root::RootI'; + $VERSION = 1.0; + $Revision = '$Id: RootI.pm,v 1.61 2002/12/16 09:44:28 birney Exp $ '; + $DEBUG = 0; + $VERBOSITY = 0; +} + +sub new { + my $class = shift; + my @args = @_; + unless ( $ENV{'BIOPERLDEBUG'} ) { + carp("Use of new in Bio::Root::RootI is deprecated. Please use Bio::Root::Root instead"); + } + eval "require Bio::Root::Root"; + return Bio::Root::Root->new(@args); +} + +# for backwards compatibility +sub _initialize { + my($self,@args) = @_; + return 1; +} + + +=head2 throw + + Title : throw + Usage : $obj->throw("throwing exception message") + Function: Throws an exception, which, if not caught with an eval brace + will provide a nice stack trace to STDERR with the message + Returns : nothing + Args : A string giving a descriptive error message + + +=cut + +sub throw{ + my ($self,$string) = @_; + + my $std = $self->stack_trace_dump(); + + my $out = "\n-------------------- EXCEPTION --------------------\n". + "MSG: ".$string."\n".$std."-------------------------------------------\n"; + die $out; + +} + +=head2 warn + + Title : warn + Usage : $object->warn("Warning message"); + Function: Places a warning. What happens now is down to the + verbosity of the object (value of $obj->verbose) + verbosity 0 or not set => small warning + verbosity -1 => no warning + verbosity 1 => warning with stack trace + verbosity 2 => converts warnings into throw + Example : + Returns : + Args : + +=cut + +sub warn{ + my ($self,$string) = @_; + + my $verbose; + if( $self->can('verbose') ) { + $verbose = $self->verbose; + } else { + $verbose = 0; + } + + if( $verbose == 2 ) { + $self->throw($string); + } elsif( $verbose == -1 ) { + return; + } elsif( $verbose == 1 ) { + my $out = "\n-------------------- WARNING ---------------------\n". + "MSG: ".$string."\n"; + $out .= $self->stack_trace_dump; + + print STDERR $out; + return; + } + + my $out = "\n-------------------- WARNING ---------------------\n". + "MSG: ".$string."\n". + "---------------------------------------------------\n"; + print STDERR $out; +} + +=head2 deprecated + + Title : deprecated + Usage : $obj->deprecated("Method X is deprecated"); + Function: Prints a message about deprecation + unless verbose is < 0 (which means be quiet) + Returns : none + Args : Message string to print to STDERR + +=cut + +sub deprecated{ + my ($self,$msg) = @_; + if( $self->verbose >= 0 ) { + print STDERR $msg, "\n", $self->stack_trace_dump; + } +} + +=head2 stack_trace_dump + + Title : stack_trace_dump + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub stack_trace_dump{ + my ($self) = @_; + + my @stack = $self->stack_trace(); + + shift @stack; + shift @stack; + shift @stack; + + my $out; + my ($module,$function,$file,$position); + + + foreach my $stack ( @stack) { + ($module,$file,$position,$function) = @{$stack}; + $out .= "STACK $function $file:$position\n"; + } + + return $out; +} + + +=head2 stack_trace + + Title : stack_trace + Usage : @stack_array_ref= $self->stack_trace + Function: gives an array to a reference of arrays with stack trace info + each coming from the caller(stack_number) call + Returns : array containing a reference of arrays + Args : none + + +=cut + +sub stack_trace{ + my ($self) = @_; + + my $i = 0; + my @out; + my $prev; + while( my @call = caller($i++)) { + # major annoyance that caller puts caller context as + # function name. Hence some monkeying around... + $prev->[3] = $call[3]; + push(@out,$prev); + $prev = \@call; + } + $prev->[3] = 'toplevel'; + push(@out,$prev); + return @out; +} + + +=head2 _rearrange + + Usage : $object->_rearrange( array_ref, list_of_arguments) + Purpose : Rearranges named parameters to requested order. + Example : $self->_rearrange([qw(SEQUENCE ID DESC)],@param); + : Where @param = (-sequence => $s, + : -desc => $d, + : -id => $i); + Returns : @params - an array of parameters in the requested order. + : The above example would return ($s, $i, $d). + : Unspecified parameters will return undef. For example, if + : @param = (-sequence => $s); + : the above _rearrange call would return ($s, undef, undef) + Argument : $order : a reference to an array which describes the desired + : order of the named parameters. + : @param : an array of parameters, either as a list (in + : which case the function simply returns the list), + : or as an associative array with hyphenated tags + : (in which case the function sorts the values + : according to @{$order} and returns that new array.) + : The tags can be upper, lower, or mixed case + : but they must start with a hyphen (at least the + : first one should be hyphenated.) + Source : This function was taken from CGI.pm, written by Dr. Lincoln + : Stein, and adapted for use in Bio::Seq by Richard Resnick and + : then adapted for use in Bio::Root::Object.pm by Steve Chervitz, + : then migrated into Bio::Root::RootI.pm by Ewan Birney. + Comments : + : Uppercase tags are the norm, + : (SAC) + : This method may not be appropriate for method calls that are + : within in an inner loop if efficiency is a concern. + : + : Parameters can be specified using any of these formats: + : @param = (-name=>'me', -color=>'blue'); + : @param = (-NAME=>'me', -COLOR=>'blue'); + : @param = (-Name=>'me', -Color=>'blue'); + : @param = ('me', 'blue'); + : A leading hyphenated argument is used by this function to + : indicate that named parameters are being used. + : Therefore, the ('me', 'blue') list will be returned as-is. + : + : Note that Perl will confuse unquoted, hyphenated tags as + : function calls if there is a function of the same name + : in the current namespace: + : -name => 'foo' is interpreted as -&name => 'foo' + : + : For ultimate safety, put single quotes around the tag: + : ('-name'=>'me', '-color' =>'blue'); + : This can be a bit cumbersome and I find not as readable + : as using all uppercase, which is also fairly safe: + : (-NAME=>'me', -COLOR =>'blue'); + : + : Personal note (SAC): I have found all uppercase tags to + : be more managable: it involves less single-quoting, + : the key names stand out better, and there are no method naming + : conflicts. + : The drawbacks are that it's not as easy to type as lowercase, + : and lots of uppercase can be hard to read. + : + : Regardless of the style, it greatly helps to line + : the parameters up vertically for long/complex lists. + +=cut + +sub _rearrange { + my $dummy = shift; + my $order = shift; + + return @_ unless (substr($_[0]||'',0,1) eq '-'); + push @_,undef unless $#_ %2; + my %param; + while( @_ ) { + (my $key = shift) =~ tr/a-z\055/A-Z/d; #deletes all dashes! + $param{$key} = shift; + } + map { $_ = uc($_) } @$order; # for bug #1343, but is there perf hit here? + return @param{@$order}; +} + + +#----------------' +sub _rearrange_old { +#---------------- + my($self,$order,@param) = @_; + + # JGRG -- This is wrong, because we don't want + # to assign empty string to anything, and this + # code is actually returning an array 1 less + # than the length of @param: + + ## If there are no parameters, we simply wish to return + ## an empty array which is the size of the @{$order} array. + #return ('') x $#{$order} unless @param; + + # ...all we need to do is return an empty array: + # return unless @param; + + # If we've got parameters, we need to check to see whether + # they are named or simply listed. If they are listed, we + # can just return them. + + # The mod test fixes bug where a single string parameter beginning with '-' gets lost. + # This tends to happen in error messages such as: $obj->throw("-id not defined") + return @param unless (defined($param[0]) && $param[0]=~/^-/o && ($#param % 2)); + + # Tester +# print "\n_rearrange() named parameters:\n"; +# my $i; for ($i=0;$i<@param;$i+=2) { printf "%20s => %s\n", $param[$i],$param[$i+1]; }; ; + + # Now we've got to do some work on the named parameters. + # The next few lines strip out the '-' characters which + # preceed the keys, and capitalizes them. + for (my $i=0;$i<@param;$i+=2) { + $param[$i]=~s/^\-//; + $param[$i]=~tr/a-z/A-Z/; + } + + # Now we'll convert the @params variable into an associative array. + # local($^W) = 0; # prevent "odd number of elements" warning with -w. + my(%param) = @param; + + # my(@return_array); + + # What we intend to do is loop through the @{$order} variable, + # and for each value, we use that as a key into our associative + # array, pushing the value at that key onto our return array. + # my($key); + + #foreach (@{$order}) { + # my($value) = $param{$key}; + # delete $param{$key}; + #push(@return_array,$param{$_}); + #} + + return @param{@{$order}}; + +# print "\n_rearrange() after processing:\n"; +# my $i; for ($i=0;$i<@return_array;$i++) { printf "%20s => %s\n", ${$order}[$i], $return_array[$i]; } ; + + # return @return_array; +} + +=head2 _register_for_cleanup + + Title : _register_for_cleanup + Usage : -- internal -- + Function: Register a method to be called at DESTROY time. This is useful + and sometimes essential in the case of multiple inheritance for + classes coming second in the sequence of inheritance. + Returns : + Args : a code reference + +The code reference will be invoked with the object as the first +argument, as per a method. You may register an unlimited number of +cleanup methods. + +=cut + +sub _register_for_cleanup { + my ($self,$method) = @_; + $self->throw_not_implemented(); +} + +=head2 _unregister_for_cleanup + + Title : _unregister_for_cleanup + Usage : -- internal -- + Function: Remove a method that has previously been registered to be called + at DESTROY time. If called with a methoda method to be called at DESTROY time. + Has no effect if the code reference has not previously been registered. + Returns : nothing + Args : a code reference + +=cut + +sub _unregister_for_cleanup { + my ($self,$method) = @_; + $self->throw_not_implemented(); +} + +=head2 _cleanup_methods + + Title : _cleanup_methods + Usage : -- internal -- + Function: Return current list of registered cleanup methods. + Returns : list of coderefs + Args : none + +=cut + +sub _cleanup_methods { + my $self = shift; + unless ( $ENV{'BIOPERLDEBUG'} || $self->verbose > 0 ) { + carp("Use of Bio::Root::RootI is deprecated. Please use Bio::Root::Root instead"); + } + return; +} + +=head2 throw_not_implemented + + Purpose : Throws a Bio::Root::NotImplemented exception. + Intended for use in the method definitions of + abstract interface modules where methods are defined + but are intended to be overridden by subclasses. + Usage : $object->throw_not_implemented(); + Example : sub method_foo { + $self = shift; + $self->throw_not_implemented(); + } + Returns : n/a + Args : n/a + Throws : A Bio::Root::NotImplemented exception. + The message of the exception contains + - the name of the method + - the name of the interface + - the name of the implementing class + + If this object has a throw() method, $self->throw will be used. + If the object doesn't have a throw() method, + Carp::confess() will be used. + + +=cut + +#' + +sub throw_not_implemented { + my $self = shift; + my $package = ref $self; + my $iface = caller(0); + my @call = caller(1); + my $meth = $call[3]; + + my $message = "Abstract method \"$meth\" is not implemented by package $package.\n" . + "This is not your fault - author of $package should be blamed!\n"; + + # Checking if Error.pm is available in case the object isn't decended from + # Bio::Root::Root, which knows how to check for Error.pm. + + # EB - this wasn't working and I couldn't figure out! + # SC - OK, since most RootI objects will be Root.pm-based, + # and Root.pm can deal with Error.pm. + # Still, I'd like to know why it wasn't working... + + if( $self->can('throw') ) { + $self->throw( -text => $message, + -class => 'Bio::Root::NotImplemented'); + } + else { + confess $message ; + } +} + + +=head2 warn_not_implemented + + Purpose : Generates a warning that a method has not been implemented. + Intended for use in the method definitions of + abstract interface modules where methods are defined + but are intended to be overridden by subclasses. + Generally, throw_not_implemented() should be used, + but warn_not_implemented() may be used if the method isn't + considered essential and convenient no-op behavior can be + provided within the interface. + Usage : $object->warn_not_implemented( method-name-string ); + Example : $self->warn_not_implemented( "get_foobar" ); + Returns : Calls $self->warn on this object, if available. + If the object doesn't have a warn() method, + Carp::carp() will be used. + Args : n/a + + +=cut + +#' + +sub warn_not_implemented { + my $self = shift; + my $package = ref $self; + my $iface = caller(0); + my @call = caller(1); + my $meth = $call[3]; + + my $message = "Abstract method \"$meth\" is not implemented by package $package.\n" . + "This is not your fault - author of $package should be blamed!\n"; + + if( $self->can('warn') ) { + $self->warn( $message ); + } + else { + carp $message ; + } +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Utilities.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Utilities.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1202 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::Root::Utilities.pm +# PURPOSE : Provides general-purpose utilities of potential interest to any Perl script. +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : Feb 1996 +# REVISION: $Id: Utilities.pm,v 1.21 2002/10/22 07:38:37 lapp Exp $ +# STATUS : Alpha +# +# This module manages file compression and uncompression using gzip or +# the UNIX compress programs (see the compress() and uncompress() methods). +# Also, it can create filehandles from gzipped files. If you want to use a +# different compression utility (such as zip, pkzip, stuffit, etc.) you +# are on your own. +# +# If you manage to incorporate an alternate compression utility into this +# module, please post a note to the bio.perl.org mailing list +# bioperl-l@bioperl.org +# +# TODO : Configure $GNU_PATH during installation. +# Improve documentation (POD). +# Make use of Date::Manip and/or Date::DateCalc as appropriate. +# +# MODIFICATIONS: See bottom of file. +# +# Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +#----------------------------------------------------------------------------- + +package Bio::Root::Utilities; +use strict; + +BEGIN { + use vars qw($Loaded_POSIX $Loaded_IOScalar); + $Loaded_POSIX = 1; + unless( eval "require POSIX" ) { + $Loaded_POSIX = 0; + } +} + +use Bio::Root::Global qw(:data :std $TIMEOUT_SECS); +use Bio::Root::Object (); +use Exporter (); +#use AutoLoader; +#*AUTOLOAD = \&AutoLoader::AUTOLOAD; + +use vars qw( @ISA @EXPORT_OK %EXPORT_TAGS ); +@ISA = qw( Bio::Root::Root Exporter); +@EXPORT_OK = qw($Util); +%EXPORT_TAGS = ( obj => [qw($Util)], + std => [qw($Util)],); + +use vars qw($ID $VERSION $Util $GNU_PATH $DEFAULT_NEWLINE); + +$ID = 'Bio::Root::Utilities'; +$VERSION = 0.05; + +# $GNU_PATH points to the directory containing the gzip and gunzip +# executables. It may be required for executing gzip/gunzip +# in some situations (e.g., when $ENV{PATH} doesn't contain this dir. +# Customize $GNU_PATH for your site if the compress() or +# uncompress() functions are generating exceptions. +$GNU_PATH = ''; +#$GNU_PATH = '/tools/gnu/bin/'; + +$DEFAULT_NEWLINE = "\012"; # \n (used if get_newline() fails for some reason) + +## Static UTIL object. +$Util = {}; +bless $Util, $ID; +$Util->{'_name'} = 'Static Utilities object'; + +## POD Documentation: + +=head1 NAME + +Bio::Root::Utilities - General-purpose utility module + +=head1 SYNOPSIS + +=head2 Object Creation + + use Bio::Root::Utilities qw(:obj); + +There is no need to create a new Bio::Root::Utilities.pm object when +the C<:obj> tag is used. This tag will import the static $Util +object created by Bio::Root::Utilities.pm into your name space. This +saves you from having to call C. + +You are free to not use the :obj tag and create the object as you +like, but a Bio::Root::Utilities object is not configurable; any given +script only needs a single copy. + + $date_stamp = $Util->date_format('yyy-mm-dd'); + + $clean = $Util->untaint($dirty); + + $Util->mail_authority("Something you should know about..."); + + ...and other methods. See below. + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +Provides general-purpose utilities of potential interest to any Perl script. +Scripts and modules are expected to use the static $Util object exported by +this package with the C<:obj> tag. + +=head1 DEPENDENCIES + +B inherits from B. +It also relies on the GNU gzip program for file compression/uncompression. + +=head1 SEE ALSO + + Bio::Root::Object.pm - Core object + Bio::Root::Global.pm - Manages global variables/constants + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/ - Bioperl Project Homepage + + FileHandle.pm (included in the Perl distribution or CPAN). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl modules. +Send your comments and suggestions preferably to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and +their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 VERSION + +Bio::Root::Utilities.pm, 0.042 + +=head1 ACKNOWLEDGEMENTS + +This module was developed under the auspices of the Saccharomyces Genome +Database: + http://genome-www.stanford.edu/Saccharomyces + +=head1 COPYRIGHT + +Copyright (c) 1997-98 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +# +## +### +#### END of main POD documentation. +### +## +#' + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + + +############################################################################ +## INSTANCE METHODS ## +############################################################################ + +=head2 date_format + + Title : date_format + Usage : $Util->date_format( [FMT], [DATE]) + Purpose : -- Get a string containing the formated date or time + : taken when this routine is invoked. + : -- Provides a way to avoid using `date`. + : -- Provides an interface to localtime(). + : -- Interconverts some date formats. + : + : (For additional functionality, use Date::Manip or + : Date::DateCalc available from CPAN). + Example : $Util->date_format(); + : $date = $Util->date_format('yyyy-mmm-dd', '11/22/92'); + Returns : String (unless 'list' is provided as argument, see below) + : + : 'yyyy-mm-dd' = 1996-05-03 # default format. + : 'yyyy-dd-mm' = 1996-03-05 + : 'yyyy-mmm-dd' = 1996-May-03 + : 'd-m-y' = 3-May-1996 + : 'd m y' = 3 May 1996 + : 'dmy' = 3may96 + : 'mdy' = May 3, 1996 + : 'ymd' = 96may3 + : 'md' = may3 + : 'year' = 1996 + : 'hms' = 23:01:59 # 'hms' can be tacked on to any of the above options + : # to add the time stamp: eg 'dmyhms' + : 'full' | 'unix' = UNIX-style date: Tue May 5 22:00:00 1998 + : 'list' = the contents of localtime(time) in an array. + Argument : (all are optional) + : FMT = yyyy-mm-dd | yyyy-dd-mm | yyyy-mmm-dd | + : mdy | ymd | md | d-m-y | hms | hm + : ('hms' may be appended to any of these to + : add a time stamp) + : + : DATE = String containing date to be converted. + : Acceptable input formats: + : 12/1/97 (for 1 December 1997) + : 1997-12-01 + : 1997-Dec-01 + Throws : + Comments : Relies on the $BASE_YEAR constant exported by Bio:Root::Global.pm. + : + : If you don't care about formatting or using backticks, you can + : always use: $date = `date`; + : + : For more features, use Date::Manip.pm, (which I should + : probably switch to...) + +See Also : L(), L() + +=cut + +#---------------' +sub date_format { +#--------------- + my $self = shift; + my $option = shift; + my $date = shift; # optional date to be converted. + + my($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst); + + $option ||= 'yyyy-mm-dd'; + + my ($month_txt, $day_txt, $month_num, $fullYear); + my (@date); + + # Load a supplied date for conversion: + if(defined($date) && ($date =~ /[\D-]+/)) { + if( $date =~ /\//) { + ($mon,$mday,$year) = split(/\//, $date); + } elsif($date =~ /(\d{4})-(\d{1,2})-(\d{1,2})/) { + ($year,$mon,$mday) = ($1, $2, $3); + } elsif($date =~ /(\d{4})-(\w{3,})-(\d{1,2})/) { + ($year,$mon,$mday) = ($1, $2, $3); + $mon = $self->month2num($2); + } else { + print STDERR "\n*** Unsupported input date format: $date\n"; + } + if(length($year) == 4) { $year = substr $year, 2; } + $mon -= 1; + } else { + ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = @date = + localtime(($date ? $date : time())); + return @date if $option =~ /list/i; + } + $month_txt = $MONTHS[$mon]; + $day_txt = $DAYS[$wday] if defined $wday; + $month_num = $mon+1; + $fullYear = $BASE_YEAR+$year; + +# print "sec: $sec, min: $min, hour: $hour, month: $mon, m-day: $mday, year: $year\nwday: $wday, yday: $yday, dst: $isdst";; + + if( $option =~ /yyyy-mm-dd/i ) { + $date = sprintf "%4d-%02d-%02d",$fullYear,$month_num,$mday; + } elsif( $option =~ /yyyy-dd-mm/i ) { + $date = sprintf "%4d-%02d-%02d",$fullYear,$mday,$month_num; + } elsif( $option =~ /yyyy-mmm-dd/i ) { + $date = sprintf "%4d-%3s-%02d",$fullYear,$month_txt,$mday; + } elsif( $option =~ /full|unix/i ) { + $date = sprintf "%3s %3s %2d %02d:%02d:%02d %d",$day_txt, $month_txt, $mday, $hour, $min, $sec, $fullYear; + } elsif( $option =~ /mdy/i ) { + $date = "$month_txt $mday, $fullYear"; + } elsif( $option =~ /ymd/i ) { + $date = $year."\l$month_txt$mday"; + } elsif( $option =~ /dmy/i ) { + $date = $mday."\l$month_txt$year"; + } elsif( $option =~ /md/i ) { + $date = "\l$month_txt$mday"; + } elsif( $option =~ /d-m-y/i ) { + $date = "$mday-$month_txt-$fullYear"; + } elsif( $option =~ /d m y/i ) { + $date = "$mday $month_txt $fullYear"; + } elsif( $option =~ /year/i ) { + $date = $fullYear; + } elsif( $option =~ /dmy/i ) { + $date = $mday.'-'.$month_txt.'-'.$fullYear; + } elsif($option and $option !~ /hms/i) { + print STDERR "\n*** Unrecognized date format request: $option\n"; + } + + if( $option =~ /hms/i) { + $date .= " $hour:$min:$sec" if $date; + $date ||= "$hour:$min:$sec"; + } + + return $date || join(" ", @date); +} + + +=head2 month2num + + Title : month2num + Purpose : Converts a string containing a name of a month to integer + : representing the number of the month in the year. + Example : $Util->month2num("march"); # returns 3 + Argument : The string argument must contain at least the first + : three characters of the month's name. Case insensitive. + Throws : Exception if the conversion fails. + +=cut + +#--------------' +sub month2num { +#-------------- + + my ($self, $str) = @_; + + # Get string in proper format for conversion. + $str = substr($str, 0, 3); + for(0..$#MONTHS) { + return $_+1 if $str =~ /$MONTHS[$_]/i; + } + $self->throw("Invalid month name: $str"); +} + +=head2 num2month + + Title : num2month + Purpose : Does the opposite of month2num. + : Converts a number into a string containing a name of a month. + Example : $Util->num2month(3); # returns 'Mar' + Throws : Exception if supplied number is out of range. + +=cut + +#------------- +sub num2month { +#------------- + my ($self, $num) = @_; + + $self->throw("Month out of range: $num") if $num < 1 or $num > 12; + return $MONTHS[$num]; +} + +=head2 compress + + Title : compress + Usage : $Util->compress(filename, [tmp]); + Purpose : Compress a file to conserve disk space. + Example : $Util->compress("/usr/people/me/data.txt"); + Returns : String (name of compressed file, full path). + Argument : filename = String (name of file to be compressed, full path). + : If the supplied filename ends with '.gz' or '.Z', + : that extension will be removed before attempting to compress. + : tmp = boolean, + : If true, (or if user is not the owner of the file) + : the file is compressed to a tmp file + : If false, file is clobbered with the compressed version. + Throws : Exception if file cannot be compressed + : If user is not owner of the file, generates a warning + : and compresses to a tmp file. + : To avoid this warning, use the -o file test operator + : and call this function with a true second argument. + Comments : Attempts to compress using gzip (default compression level). + : If that fails, will attempt to use compress. + : In some situations, the full path to the gzip executable + : may be required. This can be specified with the $GNU_PATH + : package global variable. When installed, $GNU_PATH is an + : empty string. + +See Also : L() + +=cut + +#------------' +sub compress { +#------------ + my $self = shift; + my $fileName = shift; + my $tmp = shift || 0; + + if($fileName =~ /(\.gz|\.Z)$/) { $fileName =~ s/$1$//; }; + $DEBUG && print STDERR "gzipping file $fileName"; + + my ($compressed, @args); + + if($tmp or not -o $fileName) { + if($Loaded_POSIX) { + $compressed = POSIX::tmpnam; + } else { + $compressed = _get_pseudo_tmpnam(); + } + $compressed .= ".tmp.bioperl"; + $compressed .= '.gz'; + @args = ($GNU_PATH."gzip -f < $fileName > $compressed"); + not $tmp and + $self->warn("Not owner of file $fileName\nCompressing to tmp file $compressed."); + $tmp = 1; + } else { + $compressed = "$fileName.gz"; + @args = ($GNU_PATH.'gzip', '-f', $fileName); + } + + if(system(@args) != 0) { + # gzip may not be present. Try compress. + $compressed = "$fileName.Z"; + if($tmp) { + @args = ("/usr/bin/compress -f < $fileName > $compressed"); + } else { + @args = ('/usr/bin/compress', '-f', $fileName); + } + system(@args) == 0 or + $self->throw("Failed to gzip/compress file $fileName: $!", + "Confirm current \$GNU_PATH: $GNU_PATH", + "Edit \$GNU_PATH in Bio::Root::Utilities.pm if necessary."); + } + + return $compressed; +} + + +=head2 uncompress + + Title : uncompress + Usage : $Util->uncompress(filename, [tmp]); + Purpose : Uncompress a file. + Example : $Util->uncompress("/usr/people/me/data.txt.gz"); + Returns : String (name of uncompressed file, full path). + Argument : filename = String (name of file to be uncompressed, full path). + : If the supplied filename does not end with '.gz' or '.Z' + : a '.gz' will be appended before attempting to uncompress. + : tmp = boolean, + : If true, (or if user is not the owner of the file) + : the file is uncompressed to a tmp file + : If false, file is clobbered with the uncompressed version. + Throws : Exception if file cannot be uncompressed + : If user is not owner of the file, generates a warning + : and uncompresses to a tmp file. + : To avoid this warning, use the -o file test operator + : and call this function with a true second argument. + Comments : Attempts to uncompress using gunzip. + : If that fails, will use uncompress. + : In some situations, the full path to the gzip executable + : may be required. This can be specified with the $GNU_PATH + : package global variable. When installed, $GNU_PATH is an + : empty string. + +See Also : L() + +=cut + +#--------------- +sub uncompress { +#--------------- + my $self = shift; + my $fileName = shift; + my $tmp = shift || 0; + + if(not $fileName =~ /(\.gz|\.Z)$/) { $fileName .= '.gz'; } + $DEBUG && print STDERR "gunzipping file $fileName"; + + my($uncompressed, @args); + + if($tmp or not -o $fileName) { + if($Loaded_POSIX) { + $uncompressed = POSIX::tmpnam; + } else { + $uncompressed = _get_pseudo_tmpnam(); + } + $uncompressed .= ".tmp.bioperl"; + @args = ($GNU_PATH."gunzip -f < $fileName > $uncompressed"); + not $tmp and $self->verbose > 0 and + $self->warn("Not owner of file $fileName\nUncompressing to tmp file $uncompressed."); + $tmp = 1; + } else { + @args = ($GNU_PATH.'gunzip', '-f', $fileName); + ($uncompressed = $fileName) =~ s/(\.gz|\.Z)$//; + } + +# $ENV{'PATH'} = '/tools/gnu/bin'; + + if(system(@args) != 0) { + # gunzip may not be present. Try uncompress. + ($uncompressed = $fileName) =~ s/(\.gz|\.Z)$//; + if($tmp) { + @args = ("/usr/bin/uncompress -f < $fileName > $uncompressed"); + } else { + @args = ('/usr/bin/uncompress', '-f', $fileName); + } + system(@args) == 0 or + $self->throw("Failed to gunzip/uncompress file $fileName: $!", + "Confirm current \$GNU_PATH: $GNU_PATH", + "Edit \$GNU_PATH in Bio::Root::Utilities.pm if necessary."); + } + + return $uncompressed; +} + + +=head2 file_date + + Title : file_date + Usage : $Util->file_date( filename [,date_format]) + Purpose : Obtains the date of a given file. + : Provides flexible formatting via date_format(). + Returns : String = date of the file as: yyyy-mm-dd (e.g., 1997-10-15) + Argument : filename = string, full path name for file + : date_format = string, desired format for date (see date_format()). + : Default = yyyy-mm-dd + Thows : Exception if no file is provided or does not exist. + Comments : Uses the mtime field as obtained by stat(). + +=cut + +#-------------- +sub file_date { +#-------------- + my ($self, $file, $fmt) = @_; + + $self->throw("No such file: $file") if not $file or not -e $file; + + $fmt ||= 'yyyy-mm-dd'; + + my @file_data = stat($file); + return $self->date_format($fmt, $file_data[9]); # mtime field +} + + +=head2 untaint + + Title : untaint + Purpose : To remove nasty shell characters from untrusted data + : and allow a script to run with the -T switch. + : Potentially dangerous shell meta characters: &;`'\"|*?!~<>^()[]{}$\n\r + : Accept only the first block of contiguous characters: + : Default allowed chars = "-\w.', ()" + : If $relax is true = "-\w.', ()\/=%:^<>*" + Usage : $Util->untaint($value, $relax) + Returns : String containing the untained data. + Argument: $value = string + : $relax = boolean + Comments: + This general untaint() function may not be appropriate for every situation. + To allow only a more restricted subset of special characters + (for example, untainting a regular expression), then using a custom + untainting mechanism would permit more control. + + Note that special trusted vars (like $0) require untainting. + +=cut + +#------------` +sub untaint { +#------------ + my($self,$value,$relax) = @_; + $relax ||= 0; + my $untainted; + + $DEBUG and print STDERR "\nUNTAINT: $value\n"; + + defined $value || return; + + if( $relax ) { + $value =~ /([-\w.\', ()\/=%:^<>*]+)/; + $untainted = $1 +# } elsif( $relax == 2 ) { # Could have several degrees of relax. +# $value =~ /([-\w.\', ()\/=%:^<>*]+)/; +# $untainted = $1 + } else { + $value =~ /([-\w.\', ()]+)/; + $untainted = $1 + } + + $DEBUG and print STDERR "UNTAINTED: $untainted\n"; + + $untainted; +} + + +=head2 mean_stdev + + Title : mean_stdev + Usage : ($mean, $stdev) = $Util->mean_stdev( @data ) + Purpose : Calculates the mean and standard deviation given a list of numbers. + Returns : 2-element list (mean, stdev) + Argument : list of numbers (ints or floats) + Thows : n/a + +=cut + +#--------------- +sub mean_stdev { +#--------------- + my ($self, @data) = @_; + my $mean = 0; + foreach (@data) { $mean += $_; } + $mean /= scalar @data; + my $sum_diff_sqd = 0; + foreach (@data) { $sum_diff_sqd += ($mean - $_) * ($mean - $_); } + my $stdev = sqrt(abs($sum_diff_sqd/(scalar @data)-1)); + return ($mean, $stdev); +} + + +=head2 count_files + + Title : count_files + Purpose : Counts the number of files/directories within a given directory. + : Also reports the number of text and binary files in the dir + : as well as names of these files and directories. + Usage : count_files(\%data) + : $data{-DIR} is the directory to be analyzed. Default is ./ + : $data{-PRINT} = 0|1; if 1, prints results to STDOUT, (default=0). + Argument : Hash reference (empty) + Returns : n/a; + : Modifies the hash ref passed in as the sole argument. + : $$href{-TOTAL} scalar + : $$href{-NUM_TEXT_FILES} scalar + : $$href{-NUM_BINARY_FILES} scalar + : $$href{-NUM_DIRS} scalar + : $$href{-T_FILE_NAMES} array ref + : $$href{-B_FILE_NAMES} array ref + : $$href{-DIRNAMES} array ref + +=cut + +#---------------- +sub count_files { +#---------------- + my $self = shift; + my $href = shift; # Reference to an empty hash. + my( $name, @fileLine); + my $dir = $$href{-DIR} || './'; + my $print = $$href{-PRINT} || 0; + + ### Make sure $dir ends with / + $dir !~ /\/$/ and do{ $dir .= '/'; $$href{-DIR} = $dir; }; + + open ( PIPE, "ls -1 $dir |" ) || $self->throw("Can't open input pipe: $!"); + + ### Initialize the hash data. + $$href{-TOTAL} = 0; + $$href{-NUM_TEXT_FILES} = $$href{-NUM_BINARY_FILES} = $$href{-NUM_DIRS} = 0; + $$href{-T_FILE_NAMES} = []; + $$href{-B_FILE_NAMES} = []; + $$href{-DIR_NAMES} = []; + while( ) { + chomp(); + $$href{-TOTAL}++; + if( -T $dir.$_ ) { + $$href{-NUM_TEXT_FILES}++; push @{$$href{-T_FILE_NAMES}}, $_; } + if( -B $dir.$_ and not -d $dir.$_) { + $$href{-NUM_BINARY_FILES}++; push @{$$href{-B_FILE_NAMES}}, $_; } + if( -d $dir.$_ ) { + $$href{-NUM_DIRS}++; push @{$$href{-DIR_NAMES}}, $_; } + } + close PIPE; + + if( $print) { + printf( "\n%4d %s\n", $$href{-TOTAL}, "total files+dirs in $dir"); + printf( "%4d %s\n", $$href{-NUM_TEXT_FILES}, "text files"); + printf( "%4d %s\n", $$href{-NUM_BINARY_FILES}, "binary files"); + printf( "%4d %s\n", $$href{-NUM_DIRS}, "directories"); + } +} + + +#=head2 file_info +# +# Title : file_info +# Purpose : Obtains a variety of date for a given file. +# : Provides an interface to Perl's stat(). +# Status : Under development. Not ready. Don't use! +# +#=cut + +#-------------- +sub file_info { +#-------------- + my ($self, %param) = @_; + my ($file, $get, $fmt) = $self->_rearrange([qw(FILE GET FMT)], %param); + $get ||= 'all'; + $fmt ||= 'yyyy-mm-dd'; + + my($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, + $atime, $mtime, $ctime, $blksize, $blocks) = stat $file; + + if($get =~ /date/i) { + ## I can get the elapsed time since the file was modified but + ## it's not so straightforward to get the date in a nice format... + ## Think about using a standard CPAN module for this, like + ## Date::Manip or Date::DateCalc. + + my $date = $mtime; + my $elsec = time - $mtime; + printf "\nFile age: %.0f sec %.0f hrs %.0f days", $elsec, $elsec/3600, $elsec/(3600*24);; + my $days = sprintf "%.0f", $elsec/(3600*24); + } elsif($get eq 'all') { + return stat $file; + } +} + + +#------------ +sub delete { +#------------ + my $self = shift; + my $fileName = shift; + if(not -e $fileName) { + $self->throw("Can't delete file $fileName: Does not exist."); + } elsif(not -o $fileName) { + $self->throw("Can't delete file $fileName: Not owner."); + } + my $ulval = unlink($fileName) > 0 or + $self->throw("Failed to delete file $fileName: $!"); +} + + +=head2 create_filehandle + + Usage : $object->create_filehandle(); + Purpose : Create a FileHandle object from a file or STDIN. + : Mainly used as a helper method by read() and get_newline(). + Example : $data = $object->create_filehandle(-FILE =>'usr/people/me/data.txt') + Argument : Named parameters (case-insensitive): + : (all optional) + : -CLIENT => object reference for the object submitting + : the request. This facilitates use by + : Bio::Root::IOManager::read(). Default = $Util. + : -FILE => string (full path to file) or a reference + : to a FileHandle object or typeglob. This is an + : optional parameter (if not defined, STDIN is used). + Returns : Reference to a FileHandle object. + Throws : Exception if cannot open a supplied file or if supplied with a + : reference that is not a FileHandle ref. + Comments : If given a FileHandle reference, this method simply returns it. + : This method assumes the user wants to read ascii data. So, if + : the file is binary, it will be treated as a compressed (gzipped) + : file and access it using gzip -ce. The problem here is that not + : all binary files are necessarily compressed. Therefore, + : this method should probably have a -mode parameter to + : specify ascii or binary. + +See Also : L(), L(), + +=cut + +#--------------------- +sub create_filehandle { +#--------------------- + my($self, @param) = @_; + my($client, $file, $handle) = + $self->_rearrange([qw( CLIENT FILE HANDLE )], @param); + + if(not ref $client) { $client = $self; } + $file ||= $handle; + if( $client->can('file')) { + $file = $client->file($file); + } + + my $FH; # = new FileHandle; + + my ($handle_ref); + + if($handle_ref = ref($file)) { + if($handle_ref eq 'FileHandle') { + $FH = $file; + $client->{'_input_type'} = "FileHandle"; + } elsif($handle_ref eq 'GLOB') { + $FH = $file; + $client->{'_input_type'} = "Glob"; + } else { + $self->throw("Can't read from $file: Not a FileHandle or GLOB ref."); + } + $self->verbose > 0 and printf STDERR "$ID: reading data from FileHandle\n"; + + } elsif($file) { + $client->{'_input_type'} = "FileHandle for $file"; + + # Use gzip -cd to access compressed data. + if( -B $file ) { + $client->{'_input_type'} .= " (compressed)"; + $file = "${GNU_PATH}gzip -cd $file |" + } + + $FH = new FileHandle; + open ($FH, $file) || $self->throw("Can't access data file: $file", + "$!"); + $self->verbose > 0 and printf STDERR "$ID: reading data from file $file\n"; + + } else { + # Read from STDIN. + $FH = \*STDIN; + $self->verbose > 0 and printf STDERR "$ID: reading data from STDIN\n"; + $client->{'_input_type'} = "STDIN"; + } + + return $FH; + } + +=head2 get_newline + + Usage : $object->get_newline(); + Purpose : Determine the character(s) used for newlines in a given file or + : input stream. Delegates to Bio::Root::Utilities::get_newline() + Example : $data = $object->get_newline(-CLIENT => $anObj, + : -FILE =>'usr/people/me/data.txt') + Argument : Same arguemnts as for create_filehandle(). + Returns : Reference to a FileHandle object. + Throws : Propogates and exceptions thrown by Bio::Root::Utilities::get_newline(). + +See Also : L(), L() + +=cut + +#----------------- +sub get_newline { +#----------------- + my($self, @param) = @_; + + return $NEWLINE if defined $NEWLINE; + + my($client ) = + $self->_rearrange([qw( CLIENT )], @param); + + my $FH = $self->create_filehandle(@param); + + if(not ref $client) { $client = $self; } + + if($client->{'_input_type'} =~ /STDIN|Glob|compressed/) { + # Can't taste from STDIN since we can't seek 0 on it. + # Are other non special Glob refs seek-able? + # Attempt to guess newline based on platform. + # Not robust since we could be reading Unix files on a Mac, e.g. + if(defined $ENV{'MACPERL'}) { + $NEWLINE = "\015"; # \r + } else { + $NEWLINE = "\012"; # \n + } + } else { + $NEWLINE = $self->taste_file($FH); + } + + close ($FH) unless ($client->{'_input_type'} eq 'STDIN' || + $client->{'_input_type'} eq 'FileHandle' || + $client->{'_input_type'} eq 'Glob' ); + + delete $client->{'_input_type'}; + + return $NEWLINE || $DEFAULT_NEWLINE; + } + + +=head2 taste_file + + Usage : $object->taste_file( ); + : Mainly a utility method for get_newline(). + Purpose : Sample a filehandle to determine the character(s) used for a newline. + Example : $char = $Util->taste_file($FH) + Argument : Reference to a FileHandle object. + Returns : String containing an octal represenation of the newline character string. + : Unix = "\012" ("\n") + : Win32 = "\012\015" ("\r\n") + : Mac = "\015" ("\r") + Throws : Exception if no input is read within $TIMEOUT_SECS seconds. + : Exception if argument is not FileHandle object reference. + : Warning if cannot determine neewline char(s). + Comments : Based on code submitted by Vicki Brown (vlb@deltagen.com). + +See Also : L() + +=cut + +#--------------- +sub taste_file { +#--------------- + my ($self, $FH) = @_; + my $BUFSIZ = 256; # Number of bytes read from the file handle. + my ($buffer, $octal, $str, $irs, $i); + my $wait = $TIMEOUT_SECS; + + ref($FH) eq 'FileHandle' or $self->throw("Can't taste file: not a FileHandle ref"); + + $buffer = ''; + + # this is a quick hack to check for availability of alarm(); just copied + # from Bio/Root/IOManager.pm HL 02/19/01 + my $alarm_available = 1; + eval { + alarm(0); + }; + if($@) { + # alarm() not available (ActiveState perl for win32 doesn't have it. + # See jitterbug PR#98) + $alarm_available = 0; + } + $SIG{ALRM} = sub { die "Timed out!"; }; + my $result; + eval { + $alarm_available && alarm( $wait ); + $result = read($FH, $buffer, $BUFSIZ); # read the $BUFSIZ characters of file + $alarm_available && alarm(0); + }; + if($@ =~ /Timed out!/) { + $self->throw("Timed out while waiting for input.", + "Timeout period = $wait seconds.\nFor longer time before timing out, edit \$TIMEOUT_SECS in Bio::Root::Global.pm."); + + } elsif(not $result) { + my $err = $@; + $self->throw("read taste failed to read from FileHandle.", $err); + + } elsif($@ =~ /\S/) { + my $err = $@; + $self->throw("Unexpected error during read: $err"); + } + + seek($FH, 0, 0) or $self->throw("seek failed to seek 0 on FileHandle."); + + my @chars = split(//, $buffer); + + for ($i = 0; $i <$BUFSIZ; $i++) { + if (($chars[$i] eq "\012")) { + unless ($chars[$i-1] eq "\015") { + # Unix + $octal = "\012"; + $str = '\n'; + $irs = "^J"; + last; + } + } elsif (($chars[$i] eq "\015") && ($chars[$i+1] eq "\012")) { + # DOS + $octal = "\015\012"; + $str = '\r\n'; + $irs = "^M^J"; + last; + } elsif (($chars[$i] eq "\015")) { + # Mac + $octal = "\015"; + $str = '\r'; + $irs = "^M"; + last; + } + } + if (not $octal) { + $self->warn("Could not determine newline char. Using '\012'"); + $octal = "\012"; + } else { +# print STDERR "NEWLINE CHAR = $irs\n"; + } + return($octal); +} + +###################################### +##### Mail Functions ######## +###################################### + +=head2 mail_authority + + Title : mail_authority + Usage : $Util->mail_authority( $message ) + Purpose : Syntactic sugar to send email to $Bio::Root::Global::AUTHORITY + +See Also : L() + +=cut + +sub mail_authority { + + my( $self, $message ) = @_; + my $script = $self->untaint($0,1); + + send_mail( -TO=>$AUTHORITY, -SUBJ=>$script, -MSG=>$message); + +} + + +=head2 send_mail + + Title : send_mail + Usage : $Util->send_mail( named_parameters ) + Purpose : Provides an interface to /usr/lib/sendmail + Returns : n/a + Argument : Named parameters: (case-insensitive) + : -TO => e-mail address to send to + : -SUBJ => subject for message (optional) + : -MSG => message to be sent (optional) + : -CC => cc: e-mail address (optional) + Thows : Exception if TO: address appears bad or is missing + Comments : Based on TomC's tip at: + : http://www.perl.com/CPAN-local/doc/FMTEYEWTK/safe_shellings + : + : Using default 'From:' information. + : sendmail options used: + : -t: ignore the address given on the command line and + : get To:address from the e-mail header. + : -oi: prevents send_mail from ending the message if it + : finds a period at the start of a line. + +See Also : L() + +=cut + + +#-------------' +sub send_mail { +#------------- + my( $self, @param) = @_; + my($recipient,$subj,$message,$cc) = $self->_rearrange([qw(TO SUBJ MSG CC)],@param); + + $self->throw("Invalid or missing e-mail address: $recipient") + if not $recipient =~ /\S+\@\S+/; + + $cc ||= ''; $subj ||= ''; $message ||= ''; + + open (SENDMAIL, "|/usr/lib/sendmail -oi -t") || + $self->throw("Can't send mail: sendmail cannot fork: $!"); + +print SENDMAIL <yes_reply( [query_string]); + Purpose : To test an STDIN input value for affirmation. + Example : print +( $Util->yes_reply('Are you ok') ? "great!\n" : "sorry.\n" ); + : $Util->yes_reply('Continue') || die; + Returns : Boolean, true (1) if input string begins with 'y' or 'Y' + Argument: query_string = string to be used to prompt user (optional) + : If not provided, 'Yes or no' will be used. + : Question mark is automatically appended. + +=cut + +#------------- +sub yes_reply { +#------------- + my $self = shift; + my $query = shift; + my $reply; + $query ||= 'Yes or no'; + print "\n$query? (y/n) [n] "; + chomp( $reply = ); + $reply =~ /^y/i; +} + + + +=head2 request_data + + Title : request_data() + Usage : $Util->request_data( [value_name]); + Purpose : To request data from a user to be entered via keyboard (STDIN). + Example : $name = $Util->request_data('Name'); + : # User will see: % Enter Name: + Returns : String, (data entered from keyboard, sans terminal newline.) + Argument: value_name = string to be used to prompt user. + : If not provided, 'data' will be used, (not very helpful). + : Question mark is automatically appended. + +=cut + +#---------------- +sub request_data { +#---------------- + my $self = shift; + my $data = shift || 'data'; + print "Enter $data: "; + # Remove the terminal newline char. + chomp($data = ); + $data; +} + +sub quit_reply { +# Not much used since you can use request_data() +# and test for an empty string. + my $self = shift; + my $reply; + chop( $reply = ); + $reply =~ /^q.*/i; +} + + +=head2 verify_version + + Purpose : Checks the version of Perl used to invoke the script. + : Aborts program if version is less than the given argument. + Usage : verify_version('5.000') + +=cut + +#------------------ +sub verify_version { +#------------------ + my $self = shift; + my $reqVersion = shift; + + $] < $reqVersion and do { + printf STDERR ( "\a\n%s %0.3f.\n", "** Sorry. This Perl script requires at least version", $reqVersion); + printf STDERR ( "%s %0.3f %s\n\n", "You are running Perl version", $], "Please update your Perl!\n\n" ); + exit(1); + } +} + +# Purpose : Returns a string that can be used as a temporary file name. +# Based on localtime. +# This is used if POSIX is not available. + +sub _get_pseudo_tmpnam { + + my $date = localtime(time()); + + my $tmpnam = 'tmpnam'; + + if( $date =~ /([\d:]+)\s+(\d+)\s*$/ ) { + $tmpnam = $2. '_' . $1; + $tmpnam =~ s/:/_/g; + } + return $tmpnam; +} + + +1; +__END__ + +MODIFICATION NOTES: +--------------------- + +17 Feb 1999, sac: + * Using global $TIMEOUT_SECS in taste_file(). + +13 Feb 1999, sac: + * Renamed get_newline_char() to get_newline() since it could be >1 char. + +3 Feb 1999, sac: + * Added three new methods: create_filehandle, get_newline_char, taste_file. + create_filehandle represents functionality that was formerly buried + within Bio::Root::IOManager::read(). + +2 Dec 1998, sac: + * Removed autoloading code. + * Modified compress(), uncompress(), and delete() to properly + deal with file ownership issues. + +3 Jun 1998, sac: + * Improved file_date() to be less reliant on the output of ls. + (Note the word 'less'; it still relies on ls). + +5 Jul 1998, sac: + * compress() & uncompress() will write files to a temporary location + if the first attempt to compress/uncompress fails. + This allows users to access compressed files in directories in which they + lack write permission. + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Vector.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Vector.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1050 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::Root::Vector.pm +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 15 April 1997 +# REVISION: $Id: Vector.pm,v 1.10 2002/10/22 07:38:37 lapp Exp $ +# STATUS : Alpha +# +# WARNING: This is considered an experimental module. +# +# For documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# MODIFIED: +# sac --- Fri Nov 6 14:24:48 1998 +# * Added destroy() method (experimental). +# 0.023, 20 Jul 1998, sac: +# * Improved memory management (_destroy_master()). +# +# Copyright (c) 1997 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#----------------------------------------------------------------------------- + +package Bio::Root::Vector; + +use Bio::Root::Global qw(:devel); +use Bio::Root::Object (); + +# @ISA = qw(Bio::Root::Object); # Eventually perhaps... + +use vars qw($ID $VERSION); +$ID = 'Bio::Root::Vector'; +$VERSION = 0.04; + +use strict; +my @SORT_BY = ('rank','name'); + +## POD Documentation: + +=head1 NAME + +Bio::Root::Vector - Interface for managing linked lists of Perl5 objects. + +=head1 SYNOPSIS + +=head2 Object Creation + +B This +package is currently designed to be inherited along with another class +that provides a constructor (e.g., B). +The Vector provides a set of methods that can then be used for managing +sets of objects. + +See L. + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + + +=head1 DESCRIPTION + +Bio::Root::Vector.pm provides an interface for creating and manipulating +dynamic sets (linked lists) of Perl5 objects. This is an abstract class (ie., +there is no constructor) and as such is expected to be inherited along with +some other class (see note above). + +Vectors are handy when, for example, an object may contain one or more +other objects of a certain class. The container object knows only +that is has at least one such object; the multiplex nature of the contained +object is managed by the contained object via its Vector interface. +The methods for adding, removing, counting, listing, and sorting all objects +are bundled together in Vector.pm. + +Thus, the current Bio::Root::Vector class is somewhat of a cross between an +interator and a composite design pattern. At present, a number of classes +utilize Bio::Root::Vector's composite-like behavior to implement a composite +pattern (Bio::SeqManager.pm, for example). +This is not necessarily ideal and is expected to change. + +=head1 USAGE + +For a usage demo of Bio::Root::Vector.pm see the scripts in the +examples/root_object/vector directory. + + +=head1 DEPENDENCIES + +Bio::Root::Vector.pm does not directly inherit from B but +creates an manager object which does. + +=head1 BUGS/FEATURES + +By default, all Vectors are doubly-linked lists. This relieves one from +the burden of worrying about whether a given Vector is single- or doubly-linked. +However, when generating lots of Vectors or extremely large vectors, memory +becomes an issue. In particular, signaling the GC to free the memory for +an object when you want to remove it. B + +Although it is not required, all objects within a vector are expected +to derive from the same class (package). Support for heterogeneous +Vectors has not been explicitly added (but they should work fine, as long +as you know what you're doing). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl modules. +Send your comments and suggestions preferably to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and +their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 VERSION + +Bio::Root::Vector.pm, 0.04 + +=head1 TODO + +=over 4 + +=item * (Maybe) create an container class version of this module + +to permit Vectors to be instantiated. Thus, instead of inherited +from both Object.pm and Vector.pm, you could create a Vector.pm object. + +=item * Improve documentation. + +=back + +=head1 SEE ALSO + + Bio::Root::Object.pm - Core object + Bio::Root::Err.pm - Error/Exception object + Bio::Root::Global.pm - Manages global variables/constants + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/ - Bioperl Project Homepage + +=head1 ACKNOWLEDGEMENTS + +This module was developed under the auspices of the Saccharomyces Genome +Database: + http://genome-www.stanford.edu/Saccharomyces + +=head1 COPYRIGHT + +Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + + +#' +## +### +#### END of main POD documentation. +### +## +# + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + +######################################################## +# CONSTRUCTOR # +######################################################## + +## No constructor. See _set_master() for construction of {Master} data member. + +## Destructor: Use remove_all() or remove(). + +# New Idea for destructor +#------------- +sub destroy { +#------------- + my $self = shift; + local($^W) = 0; + undef $self->{'_prev'}; + undef $self->{'_next'}; + undef $self->{'_rank'}; + undef $self->{'_master'}; +} + +##################################################################################### +## ACCESSORS ## +##################################################################################### + + +=head2 set_rank + + Purpose : To set an object's rank to an arbitrary numeric + : value to be used for sorting the objects of the Vector. + Usage : $self->set_rank(-RANK =>numeric_ranking_data, + : -RANK_BY =>ranking_criterion_string); + : or without the named parameters as (rank, rank_by). + Throws : warning (if rank is set without also setting RANK_BY) + Comments : It is redundant for every object to store RANK_BY data. + : For this reason, the RANK_BY data is stored with the master + : object associated with the vector. + +See Also : L(), L() + +=cut + +#-------------' +sub set_rank { +#------------- + my( $self, %param) = @_; + + $self->_set_master($self) unless $self->{'_master'}->{'_set'}; + + my($rank, $rank_by) = $self->master->_rearrange([qw(RANK RANK_BY)], %param); + + $DEBUG==1 and do{ print STDERR "$ID:set_rank() = $rank; Criteria: $rank_by."; ; }; + + $self->{'_rank'} = ($rank || undef); + $self->{'_master'}->{'_rankBy'} = ($rank_by || undef); + if( defined $self->{'_rank'} and not defined $self->{'_master'}->{'_rankBy'} ) { + return $self->master->warn('Rank defined without ranking criteria.'); + } + 1; +} + +sub _set_rank_by { + my( $self, $arg) = @_; + $self->{'_master'}->{'_rankBy'} = $arg || 'unknown'; +} + +sub _set_master { + ## A vector does not need a master object unless it needs to grow. + my($self,$obj) = @_; + +# print "$ID: _set_master() new Master object for ${\$obj->name}."; ; + + require Bio::Root::Object; + my $master = {}; + bless $master, 'Bio::Root::Object'; + + $master->{'_set'} = 1; ## Special member indicating that this method has been called. + ## Necessary since perl will generate an anonymous {Master} + ## hash ref on the fly. This ref will not be blessed however. + $master->{'_first'} = $obj; + $master->{'_last'} = $obj; + $master->{'_size'} = 1; + $master->{'_index'}->{$obj->name()} = $obj; + $self->{'_master'} = $master; + + $self->{'_rank'} = 1; + $self->{'_prev'} = undef; + $self->{'_next'} = undef; +# $self->{'_master'}->{'_rankBy'} = undef; # Default rank is the order of addition to Vector. +} + +sub _destroy_master { +# This is called when the last object in the vector is being remove()d + my $self = shift; + + return if !$self->master or !$self->master->{'_set'}; + + my $master = $self->master; + + ## Get rid of the Vector master object. + ref $master->{'_first'} and (%{$master->{'_first'}} = (), undef $master->{'_first'}); + ref $master->{'_last'} and (%{$master->{'_last'}} = (), undef $master->{'_last'}); + ref $master->{'_index'} and (%{$master->{'_index'}} = (), undef $master->{'_index'}); + %{$master} = (); + undef $master; +} + + +=head2 clone_vector + + Purpose : Call this method to clone the whole vector. + : NOT calling this method will extract the vector element. + Usage : $self->clone_vector(); + Throws : Exception if argument is not an object reference. + Comments : This method is usually called from within a module's + : _set_clone() method for modules that inherit from + : Bio::Root::Vector.pm. + +=cut + +#-----------------' +sub clone_vector { +#----------------- + my($self, $obj) = @_; + + ref($obj) || throw($self, "Can't clone $ID object: Not an object ref ($obj)"); + + $self->{'_prev'} = $obj->{'_prev'}; + $self->{'_next'} = $obj->{'_next'}; + $self->{'_rank'} = $obj->{'_rank'}; + $self->{'_master'} = $obj->{'_master'}; +} + + +=head2 prev + + Purpose : Returns the previous object in the Vector or undef + : if on first object. + Usage : $self->prev + +=cut + +#-------- +sub prev { my $self = shift; $self->{'_prev'}; } +#-------- + + + +=head2 next + + Purpose : Returns the next object in the Vector or undef + : if on last object. + Usage : $self->next + +=cut + +#-------- +sub next { my $self = shift; $self->{'_next'}; } +#-------- + + + +=head2 first + + Purpose : Returns the first object in the Vector or $self + : if Vector size = 1. + Usage : $self->first + +=cut + +#---------- +sub first { +#---------- + my $self = shift; + defined $self->{'_master'} ? $self->{'_master'}->{'_first'} : $self; +} + + +=head2 last + + Purpose : Returns the last object in the Vector or + : $self if Vector size = 1. + Usage : $self->last + +=cut + +#------- +sub last { +#------- + my $self = shift; + defined $self->{'_master'} ? $self->{'_master'}->{'_last'} : $self; +} + + + +=head2 rank + + Purpose : Returns the rank of the current object or 1 + : if rank is not defined. + Usage : $self->rank + +See Also : L() + +=cut + +#--------- +sub rank { my $self = shift; $self->{'_rank'} || 1; } +#--------- + + + +=head2 rank_by + + Purpose : Returns the ranking criterion or the string 'order of addition' + : if rankBy has not been explicitly set. + Usage : $self->rank_by + +See Also : L() + +=cut + +#----------- +sub rank_by { +#----------- + my $self = shift; + defined $self->{'_master'} ? ($self->{'_master'}->{'_rankBy'}||'order of addition') + : 'unranked'; +} + + + +=head2 size + + Purpose : Returns the number of objects currently in the Vector. + Usage : $self->size + +=cut + +#--------- +sub size { +#--------- + my $self = shift; + defined $self->{'_master'} ? $self->{'_master'}->{'_size'} : 1; +} + + +=head2 master + + Purpose : Returns the master object associated with the Vector. + : (should probably be a private method). + Usage : $self->master + +=cut + +#----------- +sub master { my $self = shift; $self->{'_master'}; } +#----------- + + +## Not sure what these potentially dangerous methods are used for. +## Should be unnecessary and probably can be removed. +sub set_prev { my($self,$obj) = @_; $self->{'_prev'} = $obj; } +sub set_next { my($self,$obj) = @_; $self->{'_next'} = $obj; } + +############################################################################# +# INSTANCE METHODS ## +############################################################################# + + +=head2 is_first + + Purpose : Test whether the current object is the first in the Vector. + Usage : $self->is_first + +=cut + +#------------ +sub is_first { my($self) = shift; return not defined $self->{'_prev'}; } +#------------ + + +=head2 is_last + + Purpose : Test whether the current object is the last in the Vector. + Usage : $self->is_last + +=cut + +#------------ +sub is_last { my($self) = shift; return not defined $self->{'_next'}; } +#------------ + + + + +=head2 get + + Purpose : Retrives an object from the Vector given its name. + : Returns undef if the object cannot be found. + Usage : $self->get(object_name) + Examples : $self->get($obj->name) + +See Also : L() + +=cut + +#-------- +sub get { +#-------- + my($self,$name) = @_; + + my ($obj); +# print "$ID get(): getting $name\n"; + + if($self->{'_master'}->{'_set'}) { +# my @names = keys %{$self->{'_master'}->{'_index'}}; +# print "$ID: names in hash:\n@names";; +# print " returning $self->{'_master'}->{'_index'}->{$name}\n"; + local($^W) = 0; + $obj = $self->{'_master'}->{'_index'}->{$name}; + } + + elsif($self->name =~ /$name/i) { +# print " returning self\n"; + $obj = $self; + } + + if(not ref $obj) { + $self->throw("Can't get object named \"$name\": object not set or name undefined."); + } + $obj; +} + +## Former strategy: hunt through the list for the object. +## No longer needed since master indexes all objects. +# do{ +# if($obj->name eq $name) { return $obj; } +# +# } while($obj = $current->prev()); + + + + +=head2 add + + Purpose : Add an object to the end of a Vector. + Usage : $self->add(object_reference) + +See also : L(), L() + +=cut + +#-------- +sub add { +#-------- + my($self,$new,$index) = @_; + + $self->_set_master($self) unless $self->{'_master'}->{'_set'}; + +# print "\n\nADDING TO VECTOR ${\ref $self} ${\$self->name}\nFOR PARENT: ${\ref $self->parent} ${\$self->parent->name}\n\n"; + + $self->{'_next'} = $new; + $new->{'_prev'} = $self; + $self->{'_master'}->{'_last'} = $new; + $self->{'_master'}->{'_size'}++; + $new->{'_master'} = $self->{'_master'}; + $new->_incrementRank(); + $new->Bio::Root::Vector::_index(); + +# printf "NEW CONTENTS: (n=%s)\n", $self->size; +# my $obj = $self->first; +# my $count=0; +# do { print "\n","Object #",++$count,"\n"; +# $obj->display; +# } while($obj=$obj->next); +# ; +} + + +sub _index { + my($self) = @_; + my $name = $self->name; + + # Generate unique name, if necessary, for indexing purposes. + if( not $name or $name =~ /anonymous/) { + $name ||= ''; + $name .= $self->size(); + } +# print "$ID: _index() called for $name\n"; + + $self->{'_master'}->{'_index'}->{$name} = $self; +} + +sub _incrementRank { + my $self = shift; + return if not defined $self->{'_prev'}; + $self->{'_rank'} = $self->{'_prev'}->rank() + 1; +} + + +=head2 remove + + Purpose : Remove the current object from the Vector. + Usage : $self->remove([-RET=>'first'|'last'|'next'|'prev'], [-UPDATE => 0|1]) + Returns : The first, last, next, or previous object in the Vector + : depending on the value of the -RET parameter. + : Default = next. + Argument : Named parameters: (TAGS CAN BE ALL UPPER OR ALL LOWER CASE) + : -RET => string: 'first', 'last', 'prev' 'next' + : THis indicates the object to be returned. + : -UPDATE => boolean, if non-zero all objects in the vector + : will be re-ranked. + Comments : The -UPDATE parameter should be set to true to force a re-updating + : of the rank data for each object. (default = 0, no update). + +See Also : L(), L(), L(), L() + +=cut + +#----------- +sub remove { +#----------- + my($self,%param) = @_; + my $updateRank = $param{-UPDATE} || $param{'-update'} || 0; + my $ret = $param{-RET} || $param{'-ret'} || 'next'; + + $DEBUG==2 && do{ print STDERR "$ID: removing ${\$self->name}; ret = $ret";; }; + + ## This set of conditionals involves primarily pointer shuffling. + ## The special case of destroying a vector of size 1 is handled. + + if($self->is_first()) { + $DEBUG==2 && print STDERR "---> removing first object: ${\$self->name()}.\n"; + if($self->is_last) { +# print "Removing only object in vector: ${\$self->name}.\n"; + $self->_destroy_master(); + return $self->destroy; + } else { + undef ($self->{'_next'}->{'_prev'}); + $self->_update_first($self->{'_next'}); + } + + } elsif($self->is_last()) { + $DEBUG==2 && print STDERR "---> removing last object: ${\$self->name()}.\n"; + undef ($self->{'_prev'}->{'_next'}); + $self->_update_last($self->{'_prev'}); + + } else { + $DEBUG==2 && print STDERR "---> removing internal object.\n"; + $self->{'_prev'}->{'_next'} = $self->{'_next'}; + $self->{'_next'}->{'_prev'} = $self->{'_prev'}; + } + + $updateRank && $self->_update_rank(); + $self->{'_master'}->{'_size'}--; + +# print "new vector size = ",$self->size,"\n"; ; + + my($retObj); + + if( $self->size) { + if($ret eq 'first') { $retObj = $self->first(); } + elsif($ret eq 'last') { $retObj = $self->last(); } + elsif($ret eq 'next') { $retObj = $self->next(); } + elsif($ret eq 'prev') { $retObj = $self->prev(); } + } + + ## Destroy the object. +# $self->destroy; + + $DEBUG && do{ print STDERR "$ID: returning ${\$retObj->name}";; }; + + $retObj; +} + +sub _update_first { + my($self,$first) = @_; + $DEBUG && print STDERR "Updating first.\n"; + undef ($first->{'_prev'}); + $self->{'_master'}->{'_first'} = $first; +} + +sub _update_last { + my($self,$last) = @_; + $DEBUG && print STDERR "Updating last.\n"; + undef ($last->{'_next'}); + $self->{'_master'}->{'_last'} = $last; +} + + +=head2 remove_all + + Purpose : Remove all objects currently in the Vector. + Usage : $self->remove_all + +See Also : L(), L(), L() + +=cut + +#--------------- +sub remove_all { +#--------------- + my($self,%param) = @_; + + $DEBUG==2 && print STDERR "DESTROYING VECTOR $self ${\$self->name}"; + +# print "$ID Removing all."; + + $self = $self->first(); + + while(ref $self) { +# print "$ID: removing ${\$self->name}\n"; + $self = $self->remove(-RET=>'next'); + } +} + + +=head2 shift + + Purpose : Remove the first object from the Vector. + : This is a wrapper for remove(). + Usage : $self->shift([-RET=>'first'|'last'|'next'|'prev']) + Returns : The object returned by remove(). + +See Also : L(), L() + +=cut + +#--------- +sub shift { +#--------- + my($self,%param) = @_; + $self = $self->first(); + $self = $self->remove(%param); +} + + +=head2 chop + + Purpose : Remove the last object from the Vector. + : This is a wrapper for remove(). + Usage : $self->chop([-RET=>'first'|'last'|'next'|'prev']) + Returns : The object returned by remove(). + +See Also : L(), L() + +=cut + +#---------- +sub chop { +#---------- + my($self,%param) = @_; + $self = $self->last(); + $self = $self->remove(%param); +} + + + +=head2 insert + + Purpose : Insert a new object into the vector relative to the current object. + Usage : $self->insert(object_ref, ['before'|'after']) + Examples : $self->insert($obj) # Default insert after $self + : $self->insert($obj,'before') + Returns : The new number of objects in the vector (int). + Throws : exception if the first argument is not a reference. + +See Also : L(), L() + +=cut + +#----------- +sub insert { +#----------- + my($self,$object,$where) = @_; + my($first); + $where ||= 'after'; + + $self->_set_master($self) unless $self->{'_master'}->{'_set'}; + + ref($object) || return $self->master->throw("Can't insert. Not an object: $object"); + + if($where eq 'before') { + $object->{'_next'} = $self; + $object->{'_prev'} = $self->{'_prev'}; + $object->{'_master'} = $self->{'_master'}; + $self->{'_prev'}->{'_next'} = $object; + $self->{'_prev'} = $object; + } else { + $object->{'_prev'} = $self; + $object->{'_next'} = $self->{'_next'}; + $object->{'_master'} = $self->{'_master'}; + $self->{'_next'}->{'_prev'} = $object; + $self->{'_next'} = $object; + } + $self->{'_master'}->{'_size'}++; + $object->Bio::Root::Vector::_index(); ##Fully qualified to disambiguate a potentially common method name. + $self->_update_rank(); +} + +sub _update_rank { + my($self) = @_; + my $current = $self->first(); + my $count = 0; + $DEBUG && print STDERR "$ID: Updating rank.\n"; + do{ + $count++; + $current->{'_rank'} = $count; + + } while($current = $current->next()); +} + + +=head2 list + + Purpose : Returns objects in the Vector as an array or array slice. + Usage : $self->list([starting_object|'first'] [,ending_object|'last']) + Examples : $self->list + : $self->list('first',$self->prev) + +See Also : L() + +=cut + +#---------- +sub list { +#---------- + my($self,$start,$stop) = @_; + my(@list); + + $start ||= 1; + $stop ||= 'last'; + + if( $start =~ /first|beg|start/i or $start <= 1 ) { + $start = $self->first(); + } + + if( $stop =~ /last|end|stop/i ) { + $stop = $self->last(); + } + + ref($start) || ($start = $self->first()); + ref($stop) || ($stop = $self->last()); + + my $obj = $start; + my $fini = 0; + do{ + push @list, $obj; + if($obj eq $stop) { $fini = 1; } + } while( $obj = $obj->next() and !$fini); + + @list; +} + + +=head2 sort + + Purpose : Sort the objects in the Vector. + Usage : $self->sort(['rank'|'name'], [reverse]) + Returns : The last object of the sorted Vector. + Argument : First argument can be 'name' or 'rank' to sort on + : the object's name or rank data member, respectively. + : If reverse is non-zero, sort will be in reverse order. + Example : $self->sort() # Default sort by rank, not reverse. + : $self->sort('name','reverse') + +=cut + +#---------' +sub sort { +#--------- + my ($self,$sortBy,$reverse) = @_; + my (@unsortedList,@sortedList); + + $sortBy ||= 'rank'; + my $rankBy = $self->rank_by; + + ### Build the initial unsorted list. + my $obj = $self->first(); + do{ + push @unsortedList, $obj; + } while( $obj = $obj->next()); + +# print "UNSORTED LIST:\n"; +# foreach(@unsortedList) {print $_->name().' '};; + + ### Sort it. + if( $sortBy =~ /rank/i) { +# print "sorting by rank"; + if($reverse) { +# print " (reverse).\n"; + @sortedList = reverse sort _sort_by_rank @unsortedList; + } else { + @sortedList = sort _sort_by_rank @unsortedList; + } + } elsif( $sortBy =~ /name/i) { +# print "sorting by name"; + if($reverse) { +# print "(reverse).\n"; + @sortedList = reverse sort _sort_by_name @unsortedList; + } else { + @sortedList = sort _sort_by_name @unsortedList; + } + } else { +# print "unknown sort criteria: $sortBy\n"; + $self->warn("Invalid sorting criteria: $sortBy.", + "Sorting by rank."); + @sortedList = sort _sort_by_rank @unsortedList; + } + + +# if($reverse) { @sortedList = reverse sort @sortedList; } + +# print "SORTED LIST:\n"; +# foreach(@sortedList) {print $_->name().' '};; + + ### Re-load the Vector with the sorted list. + my $count=0; + + $self = $sortedList[0]; + $self->_set_master($self); + $self->_set_rank_by($rankBy); + + my($i); + my $current = $self; + for($i=1; $i<@sortedList; $current=$sortedList[$i], $i++) { + $current->add($sortedList[$i]); + if($i==$#sortedList) { $sortedList[$i]->{'_next'} = undef;} + } + + $self->last(); +} + +sub _sort_by_rank { my $aRank = $a->rank(); my $bRank = $b->rank(); $aRank <=> $bRank; } + +sub _sort_by_name { my $aName = $a->name(); my $bName = $b->name(); $aName cmp $bName; } + + + +=head2 valid_any + + Purpose : Determine if at least one object in the Vector is valid. + Usage : $self->valid_any + Status : Deprecated. + Comments : A non-valid object should throw an exception that must be + : be caught an dealt with on the spot. + +See Also : B + +=cut + +#------------- +sub valid_any { +#------------- + my $self = &shift(@_); + + my $obj = $self->first(); + do{ + return 1 if $obj->valid(); + } while( $obj = $obj->next()); + + return undef; +} + + +=head2 valid_all + + Purpose : Determine if all objects in the Vector are valid. + Usage : $self->valid_all + Comments : A non-valid object should throw an exception that must be + : be caught an dealt with on the spot. + +See Also : B + +=cut + +#-------------- +sub valid_all { +#-------------- + my $self = &shift(@_); + + my $obj = $self->first(); + do{ + return unless $obj->valid(); + } while( $obj = $obj->next()); + + return 1; +} + +sub _display_stats { +# This could be fleshed out a bit... + + my( $self, $OUT ) = @_; + + printf ( $OUT "%-11s %s\n","RANK:", $self->rank()); + printf ( $OUT "%-11s %s\n","RANK BY:", $self->rank_by()); +} + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for this documentation to become obsolete as this module is still evolving. +Always double check this info and search for members not described here. + +=back + +Bio::Root::Vector.pm objects currently cannot be instantiated. Vector.pm must be inherited +along with Bio::Root::Object.pm (or an object that provides a constructor). +Vector.pm defines the following fields: + + FIELD VALUE + ------------------------------------------------------------------------ + _prev Reference to the previous object in the Vector. + + _next Reference to the next object in the Vector. + + _rank Rank relative to other objects in the Vector. + Default rank = chronological order of addition to the Vector. + + _master A reference to an Bio::Root::Object that acts as a manager for + the given Vector. There is only one master per Vector. + A master object is only needed when the Vector size is >1. + The master object manages the following Vector data: + + _first - Reference to the first object in the Vector. + _last - Reference to the last object in the Vector. + _size - Total number of objects in the Vector. + _rankBy - Criteria used to rank the object. + Default: chronological order of addition. + _index - Hash reference for quick access to any object + based on its name. + Bio::Root::Object{'_err'} - Holds any errors affecting the + Vector as a whole. + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Root/Xref.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Root/Xref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,197 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::Root::Xref.pm +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 8 May 1997 +# REVISION: $Id: Xref.pm,v 1.9 2002/10/22 07:38:37 lapp Exp $ +# STATUS : Pre-Alpha +# +# WARNING: This is considered an experimental module. +# +# Copyright (c) 1997-8 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#----------------------------------------------------------------------------- + +package Bio::Root::Xref; + +use Bio::Root::Global; +use Bio::Root::Object (); +use Bio::Root::Vector (); + +@Bio::Root::Xref::ISA = qw( Bio::Root::Vector Bio::Root::Object ); + +use vars qw($ID $VERSION); +$ID = 'Bio::Root::Xref'; +$VERSION = 0.01; + +## POD Documentation: + +=head1 NAME + +Bio::Root::Xref - A generic cross-reference object. + +B + +=head1 SYNOPSIS + +=head2 Object Creation + + use Bio::Root::Object; + + $myObj->xref($object_ref); + +=head2 Object Manipulation + + Accessors + --------------------------------------------------------------------- + obj() - Get the cross-referenced object. + desc() - Description of the nature of the cross-reference. + set_desc() - Set description. + type() - Symmetric or assymetric. + + Methods + --------------------------------------------------------------------- + clear() - remove all cross-references within the Xref object (not implemented). + +=head1 DESCRIPTION + +An instance of B manages sets of objects not +necessarily related by inheritance or composition, but by an arbitrary +criterion defined by the client. Currently, Bio::Root::Xref inherits +from both B and B. An Xref +object is an example of a heterogeneous Vector object since different +objects in the vector need not all derive from the same base class. + +The two objects involved in the cross-reference typically involve a +symmetrical relationship in which each will have a Xref object relating it +to the other object. This relationship is not necessarily transitive, +however: if A is an xref of B and B is an xref of C, A is not +necessarily an xref of C. Assymetric Xrefs are also possible. + +The establishment of cross-references is managed by B. +See the xref() method in that module. + +B + + +=head1 SEE ALSO + + Bio::Root::Object.pm - Core object + Bio::Root::Global.pm - Manages global variables/constants + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/ - Bioperl Project Homepage + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl modules. +Send your comments and suggestions preferably to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and +their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 VERSION + +Bio::Root::Xref.pm, 0.01 pre-alpha + +=head1 COPYRIGHT + +Copyright (c) 1997-8 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + + +=head1 TODO + +Update documentation to work with pod2html from Perl 5.004. + +=cut + +# +## +### +#### END of main POD documentation. +### +## +# + + + +##################################################################################### +## CONSTRUCTOR ## +##################################################################################### + +sub _initialize { + my( $self, %param ) = @_; + + $self->SUPER::_initialize(%param); + + $self->{'_obj'} = ($param{-OBJ} || undef); + + ## By default, all Xrefs are symmetric. + ## Create symmetric cross-reference in obj. + if(!$param{-ASYM}) { + $self->{'_obj'}->xref(-OBJ=>$param{-PARENT}); + $self->{'_type'} = 'sym'; + } else { + $self->{'_type'} = 'asym'; + } +} + + +##################################################################################### +## ACCESSORS ## +##################################################################################### + +sub obj {my ($self) = shift; return $self->{'_obj'}; } +sub desc {my ($self) = shift; return $self->{'_desc'}; } +sub type {my ($self) = shift; return $self->{'_type'}; } + +sub set_desc {my ($self,$desc) = @_; + $self->{'_desc'} = $desc; + } + +sub clear { +## Not implemented. Need to do this carefully. +## Not sure if this method is needed. + my ($self) = @_; +} + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + +=head1 DATA MEMBERS + + _obj : The object being cross-referenced to the parent. + _type : Symmetric or asymmetric + _desc : Description associated with the cross-reference + + INHERITED DATA MEMBERS (from Bio::Root::Object) + + _parent : The object receiving the cross-reference. + _name : Descriptive nature of the cross-reference. + +=cut + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/BlastUtils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/BlastUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,528 @@ +=head1 NAME + +Bio::Search::BlastUtils - Utility functions for Bio::Search:: BLAST objects + +=head1 SYNOPSIS + +This module is just a collection of subroutines, not an object. + +=head1 DESCRIPTION + +The BlastUtils.pm module is a collection of subroutines used primarily by +Bio::Search::Hit::BlastHit objects for some of the additional +functionality, such as HSP tiling. Right now, the BlastUtils is just a +collection of methods, not an object, and it's tightly coupled to +Bio::Search::Hit::BlastHit. A goal for the future is to generalize it +to work based on the Bio::Search interfaces, then it can work with any +objects that implements them. + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +=cut + +#' + +package Bio::Search::BlastUtils; + + + +=head2 tile_hsps + + Usage : tile_hsps( $sbjct ); + : This is called automatically by Bio::Search::Hit::BlastHit + : during object construction or + : as needed by methods that rely on having tiled data. + Purpose : Collect statistics about the aligned sequences in a set of HSPs. + : Calculates the following data across all HSPs: + : -- total alignment length + : -- total identical residues + : -- total conserved residues + Returns : n/a + Argument : A Bio::Search::Hit::BlastHit object + Throws : n/a + Comments : + : This method is *strongly* coupled to Bio::Search::Hit::BlastHit + : (it accesses BlastHit data members directly). + : TODO: Re-write this to the Bio::Search::Hit::HitI interface. + : + : This method performs more careful summing of data across + : all HSPs in the Sbjct object. Only HSPs that are in the same strand + : and frame are tiled. Simply summing the data from all HSPs + : in the same strand and frame will overestimate the actual + : length of the alignment if there is overlap between different HSPs + : (often the case). + : + : The strategy is to tile the HSPs and sum over the + : contigs, collecting data separately from overlapping and + : non-overlapping regions of each HSP. To facilitate this, the + : HSP.pm object now permits extraction of data from sub-sections + : of an HSP. + : + : Additional useful information is collected from the results + : of the tiling. It is possible that sub-sequences in + : different HSPs will overlap significantly. In this case, it + : is impossible to create a single unambiguous alignment by + : concatenating the HSPs. The ambiguity may indicate the + : presence of multiple, similar domains in one or both of the + : aligned sequences. This ambiguity is recorded using the + : ambiguous_aln() method. + : + : This method does not attempt to discern biologically + : significant vs. insignificant overlaps. The allowable amount of + : overlap can be set with the overlap() method or with the -OVERLAP + : parameter used when constructing the Blast & Sbjct objects. + : + : For a given hit, both the query and the sbjct sequences are + : tiled independently. + : + : -- If only query sequence HSPs overlap, + : this may suggest multiple domains in the sbjct. + : -- If only sbjct sequence HSPs overlap, + : this may suggest multiple domains in the query. + : -- If both query & sbjct sequence HSPs overlap, + : this suggests multiple domains in both. + : -- If neither query & sbjct sequence HSPs overlap, + : this suggests either no multiple domains in either + : sequence OR that both sequences have the same + : distribution of multiple similar domains. + : + : This method can deal with the special case of when multiple + : HSPs exactly overlap. + : + : Efficiency concerns: + : Speed will be an issue for sequences with numerous HSPs. + : + Bugs : Currently, tile_hsps() does not properly account for + : the number of non-tiled but overlapping HSPs, which becomes a problem + : as overlap() grows. Large values overlap() may thus lead to + : incorrect statistics for some hits. For best results, keep overlap() + : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and + : Ambiguous Alignments" section in L. + +See Also : L<_adjust_contigs>(), L + +=cut + +#-------------- +sub tile_hsps { +#-------------- + my $sbjct = shift; + + $sbjct->{'_tile_hsps'} = 1; + $sbjct->{'_gaps_query'} = 0; + $sbjct->{'_gaps_sbjct'} = 0; + + ## Simple summation scheme. Valid if there is only one HSP. + if((defined($sbjct->{'_n'}) and $sbjct->{'_n'} == 1) or $sbjct->num_hsps == 1) { + my $hsp = $sbjct->hsp; + $sbjct->{'_length_aln_query'} = $hsp->length('query'); + $sbjct->{'_length_aln_sbjct'} = $hsp->length('sbjct'); + $sbjct->{'_length_aln_total'} = $hsp->length('total'); + ($sbjct->{'_totalIdentical'},$sbjct->{'_totalConserved'}) = $hsp->matches(); + $sbjct->{'_gaps_query'} = $hsp->gaps('query'); + $sbjct->{'_gaps_sbjct'} = $hsp->gaps('sbjct'); + +# print "_tile_hsps(): single HSP, easy stats.\n"; + return; + } else { +# print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n"; + $sbjct->{'_length_aln_query'} = 0; + $sbjct->{'_length_aln_sbjct'} = 0; + $sbjct->{'_length_aln_total'} = 0; + $sbjct->{'_totalIdentical'} = 0; + $sbjct->{'_totalConserved'} = 0; + } + + ## More than one HSP. Must tile HSPs. +# print "\nTiling HSPs for $sbjct\n"; + my($hsp, $qstart, $qstop, $sstart, $sstop); + my($frame, $strand, $qstrand, $sstrand); + my(@qcontigs, @scontigs); + my $qoverlap = 0; + my $soverlap = 0; + my $max_overlap = $sbjct->{'_overlap'}; + + foreach $hsp ($sbjct->hsps()) { +# printf " HSP: %s\n%s\n",$hsp->name, $hsp->str('query'); +# printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), $hsp->length(-TYPE=>'cons'), $hsp->length(-TYPE=>'cons',-START=>0,-STOP=>10); + ($qstart, $qstop) = $hsp->range('query'); + ($sstart, $sstop) = $hsp->range('sbjct'); + $frame = $hsp->frame; + $frame = -1 unless defined $frame; + ($qstrand, $sstrand) = $hsp->strand; + + my ($qgaps, $sgaps) = $hsp->gaps(); + $sbjct->{'_gaps_query'} += $qgaps; + $sbjct->{'_gaps_sbjct'} += $sgaps; + + $sbjct->{'_length_aln_total'} += $hsp->length; + ## Collect contigs in the query sequence. + $qoverlap = &_adjust_contigs('query', $hsp, $qstart, $qstop, \@qcontigs, $max_overlap, $frame, $qstrand); + + ## Collect contigs in the sbjct sequence (needed for domain data and gapped Blast). + $soverlap = &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, \@scontigs, $max_overlap, $frame, $sstrand); + + ## Collect overall start and stop data for query and sbjct over all HSPs. + if(not defined $sbjct->{'_queryStart'}) { + $sbjct->{'_queryStart'} = $qstart; + $sbjct->{'_queryStop'} = $qstop; + $sbjct->{'_sbjctStart'} = $sstart; + $sbjct->{'_sbjctStop'} = $sstop; + } else { + $sbjct->{'_queryStart'} = ($qstart < $sbjct->{'_queryStart'} ? $qstart : $sbjct->{'_queryStart'}); + $sbjct->{'_queryStop'} = ($qstop > $sbjct->{'_queryStop'} ? $qstop : $sbjct->{'_queryStop'}); + $sbjct->{'_sbjctStart'} = ($sstart < $sbjct->{'_sbjctStart'} ? $sstart : $sbjct->{'_sbjctStart'}); + $sbjct->{'_sbjctStop'} = ($sstop > $sbjct->{'_sbjctStop'} ? $sstop : $sbjct->{'_sbjctStop'}); + } + } + + ## Collect data across the collected contigs. + +# print "\nQUERY CONTIGS:\n"; +# print " gaps = $sbjct->{'_gaps_query'}\n"; + + # TODO: Account for strand/frame issue! + # Strategy: collect data on a per strand+frame basis and save the most significant one. + my (%qctg_dat); + foreach(@qcontigs) { +# print " query contig: $_->{'start'} - $_->{'stop'}\n"; +# print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; + ($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); + $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1; + $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'}; + $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'}; + $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand; + } + + # Find longest contig. + my @sortedkeys = reverse sort { $qctg_dat{ $a }->{'length_aln_query'} <=> $qctg_dat{ $b }->{'length_aln_query'} } keys %qctg_dat; + + # Save the largest to the sbjct: + my $longest = $sortedkeys[0]; + $sbjct->{'_length_aln_query'} = $qctg_dat{ $longest }->{'length_aln_query'}; + $sbjct->{'_totalIdentical'} = $qctg_dat{ $longest }->{'totalIdentical'}; + $sbjct->{'_totalConserved'} = $qctg_dat{ $longest }->{'totalConserved'}; + $sbjct->{'_qstrand'} = $qctg_dat{ $longest }->{'qstrand'}; + + ## Collect data for sbjct contigs. Important for gapped Blast. + ## The totalIdentical and totalConserved numbers will be the same + ## as determined for the query contigs. + +# print "\nSBJCT CONTIGS:\n"; +# print " gaps = $sbjct->{'_gaps_sbjct'}\n"; + + my (%sctg_dat); + foreach(@scontigs) { +# print " sbjct contig: $_->{'start'} - $_->{'stop'}\n"; +# print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; + ($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); + $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1; + $sctg_dat{ "$frame$strand" }->{'frame'} = $frame; + $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand; + } + + @sortedkeys = reverse sort { $sctg_dat{ $a }->{'length_aln_sbjct'} <=> $sctg_dat{ $b }->{'length_aln_sbjct'} } keys %sctg_dat; + + # Save the largest to the sbjct: + $longest = $sortedkeys[0]; + + $sbjct->{'_length_aln_sbjct'} = $sctg_dat{ $longest }->{'length_aln_sbjct'}; + $sbjct->{'_frame'} = $sctg_dat{ $longest }->{'frame'}; + $sbjct->{'_sstrand'} = $sctg_dat{ $longest }->{'sstrand'}; + + if($qoverlap) { + if($soverlap) { $sbjct->ambiguous_aln('qs'); +# print "\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n"; + } + else { $sbjct->ambiguous_aln('q'); +# print "\n*** AMBIGUOUS ALIGNMENT: Query\n\n"; + } + } elsif($soverlap) { + $sbjct->ambiguous_aln('s'); +# print "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n"; + } + + # Adjust length based on BLAST flavor. + my $prog = $sbjct->algorithm; + if($prog eq 'TBLASTN') { + $sbjct->{'_length_aln_sbjct'} /= 3; + } elsif($prog eq 'BLASTX' ) { + $sbjct->{'_length_aln_query'} /= 3; + } elsif($prog eq 'TBLASTX') { + $sbjct->{'_length_aln_query'} /= 3; + $sbjct->{'_length_aln_sbjct'} /= 3; + } +} + + + +=head2 _adjust_contigs + + Usage : n/a; called automatically during object construction. + Purpose : Builds HSP contigs for a given BLAST hit. + : Utility method called by _tile_hsps() + Returns : + Argument : + Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches() + : for invalid sub-sequence ranges. + Status : Experimental + Comments : This method does not currently support gapped alignments. + : Also, it does not keep track of the number of HSPs that + : overlap within the amount specified by overlap(). + : This will lead to significant tracking errors for large + : overlap values. + +See Also : L(), L + +=cut + +#------------------- +sub _adjust_contigs { +#------------------- + my ($seqType, $hsp, $start, $stop, $contigs_ref, $max_overlap, $frame, $strand) = @_; + + my $overlap = 0; + my ($numID, $numCons); + +# print STDERR "Testing $seqType data: HSP (${\$hsp->name}); $start, $stop, strand=$strand, frame=$frame\n"; + foreach(@$contigs_ref) { +# print STDERR " Contig: $_->{'start'} - $_->{'stop'}, strand=$_->{'strand'}, frame=$_->{'frame'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n"; + + # Don't merge things unless they have matching strand/frame. + next unless ($_->{'frame'} == $frame and $_->{'strand'} == $strand); + + ## Test special case of a nested HSP. Skip it. + if($start >= $_->{'start'} and $stop <= $_->{'stop'}) { +# print STDERR "----> Nested HSP. Skipping.\n"; + $overlap = 1; + next; + } + + ## Test for overlap at beginning of contig. + if($start < $_->{'start'} and $stop > ($_->{'start'} + $max_overlap)) { +# print STDERR "----> Overlaps beg: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n"; + # Collect stats over the non-overlapping region. + eval { + ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType, + -START =>$start, + -STOP =>$_->{'start'}-1); + }; + if($@) { warn "\a\n$@\n"; } + else { + $_->{'start'} = $start; # Assign a new start coordinate to the contig + $_->{'iden'} += $numID; # and add new data to #identical, #conserved. + $_->{'cons'} += $numCons; + $overlap = 1; + } + } + + ## Test for overlap at end of contig. + if($stop > $_->{'stop'} and $start < ($_->{'stop'} - $max_overlap)) { +# print STDERR "----> Overlaps end: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n"; + # Collect stats over the non-overlapping region. + eval { + ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType, + -START =>$_->{'stop'}, + -STOP =>$stop); + }; + if($@) { warn "\a\n$@\n"; } + else { + $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig + $_->{'iden'} += $numID; # and add new data to #identical, #conserved. + $_->{'cons'} += $numCons; + $overlap = 1; + } + } + $overlap && do { +# print STDERR " New Contig data:\n"; +# print STDERR " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n"; + last; + }; + } + ## If there is no overlap, add the complete HSP data. + !$overlap && do { +# print STDERR "No overlap. Adding new contig.\n"; + ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType); + push @$contigs_ref, {'start'=>$start, 'stop'=>$stop, + 'iden'=>$numID, 'cons'=>$numCons, + 'strand'=>$strand, 'frame'=>$frame}; + }; + $overlap; +} + +=head2 get_exponent + + Usage : &get_exponent( number ); + Purpose : Determines the power of 10 exponent of an integer, float, + : or scientific notation number. + Example : &get_exponent("4.0e-206"); + : &get_exponent("0.00032"); + : &get_exponent("10."); + : &get_exponent("1000.0"); + : &get_exponent("e+83"); + Argument : Float, Integer, or scientific notation number + Returns : Integer representing the exponent part of the number (+ or -). + : If argument == 0 (zero), return value is "-999". + Comments : Exponents are rounded up (less negative) if the mantissa is >= 5. + : Exponents are rounded down (more negative) if the mantissa is <= -5. + +=cut + +#------------------ +sub get_exponent { +#------------------ + my $data = shift; + + my($num, $exp) = split /[eE]/, $data; + + if( defined $exp) { + $num = 1 if not $num; + $num >= 5 and $exp++; + $num <= -5 and $exp--; + } elsif( $num == 0) { + $exp = -999; + } elsif( not $num =~ /\./) { + $exp = CORE::length($num) -1; + } else { + $exp = 0; + $num .= '0' if $num =~ /\.$/; + my ($c); + my $rev = 0; + if($num !~ /^0/) { + $num = reverse($num); + $rev = 1; + } + do { $c = chop($num); + $c == 0 && $exp++; + } while( $c ne '.'); + + $exp = -$exp if $num == 0 and not $rev; + $exp -= 1 if $rev; + } + return $exp; +} + +=head2 collapse_nums + + Usage : @cnums = collapse_nums( @numbers ); + Purpose : Collapses a list of numbers into a set of ranges of consecutive terms: + : Useful for condensing long lists of consecutive numbers. + : EXPANDED: + : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32 + : COLLAPSED: + : 1-6 10 12-15 17 18 20-22 24 26 30-32 + Argument : List of numbers sorted numerically. + Returns : List of numbers mixed with ranges of numbers (see above). + Throws : n/a + +See Also : L + +=cut + +#------------------ +sub collapse_nums { +#------------------ +# This is probably not the slickest connectivity algorithm, but will do for now. + my @a = @_; + my ($from, $to, $i, @ca, $consec); + + $consec = 0; + for($i=0; $i < @a; $i++) { + not $from and do{ $from = $a[$i]; next; }; + if($a[$i] == $a[$i-1]+1) { + $to = $a[$i]; + $consec++; + } else { + if($consec == 1) { $from .= ",$to"; } + else { $from .= $consec>1 ? "\-$to" : ""; } + push @ca, split(',', $from); + $from = $a[$i]; + $consec = 0; + $to = undef; + } + } + if(defined $to) { + if($consec == 1) { $from .= ",$to"; } + else { $from .= $consec>1 ? "\-$to" : ""; } + } + push @ca, split(',', $from) if $from; + + @ca; +} + + +=head2 strip_blast_html + + Usage : $boolean = &strip_blast_html( string_ref ); + : This method is exported. + Purpose : Removes HTML formatting from a supplied string. + : Attempts to restore the Blast report to enable + : parsing by Bio::SearchIO::blast.pm + Returns : Boolean: true if string was stripped, false if not. + Argument : string_ref = reference to a string containing the whole Blast + : report containing HTML formatting. + Throws : Croaks if the argument is not a scalar reference. + Comments : Based on code originally written by Alex Dong Li + : (ali@genet.sickkids.on.ca). + : This method does some Blast-specific stripping + : (adds back a '>' character in front of each HSP + : alignment listing). + : + : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES! + : + : Removal of the HTML tags and accurate reconstitution of the + : non-HTML-formatted report is highly dependent on structure of + : the HTML-formatted version. For example, it assumes that first + : line of each alignment section (HSP listing) starts with a + : anchor tag. This permits the reconstruction of the + : original report in which these lines begin with a ">". + : This is required for parsing. + : + : If the structure of the Blast report itself is not intended to + : be a standard, the structure of the HTML-formatted version + : is even less so. Therefore, the use of this method to + : reconstitute parsable Blast reports from HTML-format versions + : should be considered a temorary solution. + +See Also : B + +=cut + +#-------------------- +sub strip_blast_html { +#-------------------- + # This may not best way to remove html tags. However, it is simple. + # it won't work under following conditions: + # 1) if quoted > appears in a tag (does this ever happen?) + # 2) if a tag is split over multiple lines and this method is + # used to process one line at a time. + + my ($string_ref) = shift; + + ref $string_ref eq 'SCALAR' or + croak ("Can't strip HTML: ". + "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n"); + + my $str = $$string_ref; + my $stripped = 0; + + # Removing "" and adding the '>' character for + # HSP alignment listings. + $str =~ s/(\A|\n)]+> ?/>/sgi and $stripped = 1; + + # Removing all "<>" tags. + $str =~ s/<[^>]+>| //sgi and $stripped = 1; + + # Re-uniting any lone '>' characters. + $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1; + + $$string_ref = $str; + $stripped; +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/DatabaseI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/DatabaseI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +#----------------------------------------------------------------- +# $Id: DatabaseI.pm,v 1.6 2002/10/22 07:38:38 lapp Exp $ +# +# BioPerl module Bio::Search::DatabaseI +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::DatabaseI - Interface for a database used in a sequence search + +=head1 SYNOPSIS + +Bio::Search::DatabaseI objects should not be instantiated since this +module defines a pure interface. + +Given an object that implements the Bio::Search::DatabaseI interface, +you can do the following things with it: + + $name = $db->name(); + + $date = $db->date(); + + $num_letters = $db->letters(); + + $num_entries = $db->entries(); + +=head1 DESCRIPTION + +This module defines methods for an object that provides metadata +information about a database used for sequence searching. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. + +=cut + +# Let the code begin... + +package Bio::Search::DatabaseI; + +use strict; +use Bio::Root::RootI; +use vars qw( @ISA ); + +@ISA = qw( Bio::Root::RootI); + + +=head2 name + + Usage : $name = $db->name(); + Purpose : Get the name of the database searched. + Returns : String + Argument : n/a + +=cut + +sub name { + my $self = shift; + $self->throw_not_implemented; +} + +=head2 date + + Usage : $date = $db->date(); + Purpose : Get the creation date of the queried database. + Returns : String + Argument : n/a + +=cut + +sub date { + my $self = shift; + $self->throw_not_implemented; +} + + +=head2 letters + + Usage : $num_letters = $db->letters(); + Purpose : Get the number of letters in the queried database. + Returns : Integer + Argument : n/a + +=cut + +sub letters { + my $self = shift; + $self->throw_not_implemented; +} + + +=head2 entries + + Usage : $num_entries = $db->entries(); + Purpose : Get the number of entries in the queried database. + Returns : Integer + Argument : n/a + +=cut + +sub entries { + my $self = shift; + $self->throw_not_implemented; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/GenericDatabase.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/GenericDatabase.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,174 @@ +#----------------------------------------------------------------- +# $Id: GenericDatabase.pm,v 1.5 2002/10/22 07:38:38 lapp Exp $ +# +# BioPerl module Bio::Search::GenericDatabase +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::GenericDatabase - Generic implementation of Bio::Search::DatabaseI + +=head1 SYNOPSIS + + use Bio::Search::GenericDatabase; + + $db = Bio::Search::GenericDatabase->new( -name => 'my Blast db', + -date => '2001-03-13', + -length => 2352511, + -entries => 250000 ); + + $name = $db->name(); + $date = $db->date(); + $num_letters = $db->letters(); + $num_entries = $db->entries(); + +=head1 DESCRIPTION + +This module provides a basic implementation of B. +See documentation in that module for more information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +=head1 APPENDIX + + +The rest of the documentation details each of the object methods. + +=cut + +# Let the code begin... + +package Bio::Search::GenericDatabase; + +use strict; +use Bio::Search::DatabaseI; +use Bio::Root::Root; +use vars qw( @ISA ); + +@ISA = qw( Bio::Root::Root Bio::Search::DatabaseI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($name, $date, $length, $ents) = + $self->_rearrange( [qw(NAME DATE LENGTH ENTRIES)], @args); + + $name && $self->name($name); + $date && $self->date($date); + $length && $self->letters($length); + $ents && $self->entries($ents); + + return $self; +} + +=head2 name + +See L() for documentation + +This implementation is a combined set/get accessor. + +=cut + +#--------------- +sub name { +#--------------- + my $self = shift; + if(@_) { + my $name = shift; + $name =~ s/(^\s+|\s+$)//g; + $self->{'_db'} = $name; + } + $self->{'_db'}; +} + +=head2 date + +See L() for documentation + +This implementation is a combined set/get accessor. + +=cut + +#----------------------- +sub date { +#----------------------- + my $self = shift; + if(@_) { $self->{'_dbDate'} = shift; } + $self->{'_dbDate'}; +} + + +=head2 letters + +See L() for documentation + +This implementation is a combined set/get accessor. + +=cut + +#---------------------- +sub letters { +#---------------------- + my $self = shift; + if(@_) { $self->{'_dbLetters'} = shift; } + $self->{'_dbLetters'}; +} + + +=head2 entries + +See L() for documentation + +This implementation is a combined set/get accessor. + +=cut + +#------------------ +sub entries { +#------------------ + my $self = shift; + if(@_) { $self->{'_dbEntries'} = shift; } + $self->{'_dbEntries'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/BlastHSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/BlastHSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1735 @@ +#----------------------------------------------------------------- +# $Id: BlastHSP.pm,v 1.20 2002/12/24 15:45:33 jason Exp $ +# +# BioPerl module Bio::Search::HSP::BlastHSP +# +# (This module was originally called Bio::Tools::Blast::HSP) +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +## POD Documentation: + +=head1 NAME + +Bio::Search::HSP::BlastHSP - Bioperl BLAST High-Scoring Pair object + +=head1 SYNOPSIS + +The construction of BlastHSP objects is performed by +Bio::Factory::BlastHitFactory in a process that is +orchestrated by the Blast parser (B). +The resulting BlastHSPs are then accessed via +B). Therefore, you do not need to +use B) directly. If you need to construct +BlastHSPs directly, see the new() function for details. + +For B BLAST parsing usage examples, see the +B directory of the Bioperl distribution. + +=head1 DESCRIPTION + +A Bio::Search::HSP::BlastHSP object provides an interface to data +obtained in a single alignment section of a Blast report (known as a +"High-scoring Segment Pair"). This is essentially a pairwise +alignment with score information. + +BlastHSP objects are accessed via B +objects after parsing a BLAST report using the B +system. + +=head2 Start and End coordinates + +Sequence endpoints are swapped so that start is always less than +end. This affects For TBLASTN/X hits on the minus strand. Strand +information can be recovered using the strand() method. This +normalization step is standard Bioperl practice. It also facilitates +use of range information by methods such as match(). + +=over 1 + +=item * Supports BLAST versions 1.x and 2.x, gapped and ungapped. + +=back + +Bio::Search::HSP::BlastHSP.pm has the ability to extract a list of all +residue indices for identical and conservative matches along both +query and sbjct sequences. Since this degree of detail is not always +needed, this behavior does not occur during construction of the BlastHSP +object. These data will automatically be collected as necessary as +the BlastHSP.pm object is used. + +=head1 DEPENDENCIES + +Bio::Search::HSP::BlastHSP.pm is a concrete class that inherits from +B and B. +B and B are employed for creating +sequence and alignment objects, respectively. + +=head2 Relationship to SimpleAlign.pm & Seq.pm + +BlastHSP.pm can provide the query or sbjct sequence as a B +object via the L method. The BlastHSP.pm object can also create a +two-sequence B alignment object using the the query +and sbjct sequences via the L method. Creation of alignment +objects is not automatic when constructing the BlastHSP.pm object since +this level of functionality is not always required and would generate +a lot of extra overhead when crunching many reports. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 ACKNOWLEDGEMENTS + +This software was originally developed in the Department of Genetics +at Stanford University. I would also like to acknowledge my +colleagues at Affymetrix for useful feedback. + +=head1 SEE ALSO + + Bio::Search::Hit::BlastHit.pm - Blast hit object. + Bio::Search::Result::BlastResult.pm - Blast Result object. + Bio::Seq.pm - Biosequence object + +=head2 Links: + + http://bio.perl.org/Core/POD/Tools/Blast/BlastHit.pm.html + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project + http://bio.perl.org/ - Bioperl Project Homepage + +=head1 COPYRIGHT + +Copyright (c) 1996-2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + + +# END of main POD documentation. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Search::HSP::BlastHSP; + +use strict; +use Bio::SeqFeature::SimilarityPair; +use Bio::SeqFeature::Similarity; +use Bio::Search::HSP::HSPI; + +use vars qw( @ISA $GAP_SYMBOL $Revision %STRAND_SYMBOL ); + +use overload + '""' => \&to_string; + +$Revision = '$Id: BlastHSP.pm,v 1.20 2002/12/24 15:45:33 jason Exp $'; #' + +@ISA = qw(Bio::SeqFeature::SimilarityPair Bio::Search::HSP::HSPI); + +$GAP_SYMBOL = '-'; # Need a more general way to handle gap symbols. +%STRAND_SYMBOL = ('Plus' => 1, 'Minus' => -1 ); + + +=head2 new + + Usage : $hsp = Bio::Search::HSP::BlastHSP->new( %named_params ); + : Bio::Search::HSP::BlastHSP.pm objects are constructed + : automatically by Bio::SearchIO::BlastHitFactory.pm, + : so there is no need for direct instantiation. + Purpose : Constructs a new BlastHSP object and Initializes key variables + : for the HSP. + Returns : A Bio::Search::HSP::BlastHSP object + Argument : Named parameters: + : Parameter keys are case-insensitive. + : -RAW_DATA => array ref containing raw BLAST report data for + : for a single HSP. This includes all lines + : of the HSP alignment from a traditional BLAST + or PSI-BLAST (non-XML) report, + : -RANK => integer (1..n). + : -PROGRAM => string ('TBLASTN', 'BLASTP', etc.). + : -QUERY_NAME => string, id of query sequence + : -HIT_NAME => string, id of hit sequence + : + Comments : Having the raw data allows this object to do lazy parsing of + : the raw HSP data (i.e., not parsed until needed). + : + : Note that there is a fair amount of basic parsing that is + : currently performed in this module that would be more appropriate + : to do within a separate factory object. + : This parsing code will likely be relocated and more initialization + : parameters will be added to new(). + : +See Also : B, B + +=cut + +#---------------- +sub new { +#---------------- + my ($class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + # Initialize placeholders + $self->{'_queryGaps'} = $self->{'_sbjctGaps'} = 0; + my ($raw_data, $qname, $hname, $qlen, $hlen); + + ($self->{'_prog'}, $self->{'_rank'}, $raw_data, + $qname, $hname) = + $self->_rearrange([qw( PROGRAM + RANK + RAW_DATA + QUERY_NAME + HIT_NAME + )], @args ); + + # _set_data() does a fair amount of parsing. + # This will likely change (see comment above.) + $self->_set_data( @{$raw_data} ); + # Store the aligned query as sequence feature + my ($qb, $hb) = ($self->start()); + my ($qe, $he) = ($self->end()); + my ($qs, $hs) = ($self->strand()); + my ($qf,$hf) = ($self->query->frame(), + $self->hit->frame); + + $self->query( Bio::SeqFeature::Similarity->new (-start =>$qb, + -end =>$qe, + -strand =>$qs, + -bits =>$self->bits, + -score =>$self->score, + -frame =>$qf, + -seq_id => $qname, + -source =>$self->{'_prog'} )); + + $self->hit( Bio::SeqFeature::Similarity->new (-start =>$hb, + -end =>$he, + -strand =>$hs, + -bits =>$self->bits, + -score =>$self->score, + -frame =>$hf, + -seq_id => $hname, + -source =>$self->{'_prog'} )); + + # set lengths + $self->query->seqlength($qlen); # query + $self->hit->seqlength($hlen); # subject + + $self->query->frac_identical($self->frac_identical('query')); + $self->hit->frac_identical($self->frac_identical('hit')); + return $self; +} + +#sub DESTROY { +# my $self = shift; +# #print STDERR "--->DESTROYING $self\n"; +#} + + +# Title : _id_str; +# Purpose : Intended for internal use only to provide a string for use +# within exception messages to help users figure out which +# query/hit caused the problem. +# Returns : Short string with name of query and hit seq +sub _id_str { + my $self = shift; + if( not defined $self->{'_id_str'}) { + my $qname = $self->query->seqname; + my $hname = $self->hit->seqname; + $self->{'_id_str'} = "QUERY=\"$qname\" HIT=\"$hname\""; + } + return $self->{'_id_str'}; +} + +#================================================= +# Begin Bio::Search::HSP::HSPI implementation +#================================================= + +=head2 algorithm + + Title : algorithm + Usage : $alg = $hsp->algorithm(); + Function: Gets the algorithm specification that was used to obtain the hsp + For BLAST, the algorithm denotes what type of sequence was aligned + against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated + dna-prt, TBLASTN prt-translated dna, TBLASTX translated + dna-translated dna). + Returns : a scalar string + Args : none + +=cut + +#---------------- +sub algorithm { +#---------------- + my ($self,@args) = @_; + return $self->{'_prog'}; +} + + + + +=head2 signif() + + Usage : $hsp_obj->signif() + Purpose : Get the P-value or Expect value for the HSP. + Returns : Float (0.001 or 1.3e-43) + : Returns P-value if it is defined, otherwise, Expect value. + Argument : n/a + Throws : n/a + Comments : Provided for consistency with BlastHit::signif() + : Support for returning the significance data in different + : formats (e.g., exponent only), is not provided for HSP objects. + : This is only available for the BlastHit or Blast object. + +See Also : L, L, L + +=cut + +#----------- +sub signif { +#----------- + my $self = shift; + my $val ||= defined($self->{'_p'}) ? $self->{'_p'} : $self->{'_expect'}; + $val; +} + + + +=head2 evalue + + Usage : $hsp_obj->evalue() + Purpose : Get the Expect value for the HSP. + Returns : Float (0.001 or 1.3e-43) + Argument : n/a + Throws : n/a + Comments : Support for returning the expectation data in different + : formats (e.g., exponent only), is not provided for HSP objects. + : This is only available for the BlastHit or Blast object. + +See Also : L + +=cut + +#---------- +sub evalue { shift->{'_expect'} } +#---------- + + +=head2 p + + Usage : $hsp_obj->p() + Purpose : Get the P-value for the HSP. + Returns : Float (0.001 or 1.3e-43) or undef if not defined. + Argument : n/a + Throws : n/a + Comments : P-value is not defined with NCBI Blast2 reports. + : Support for returning the expectation data in different + : formats (e.g., exponent only) is not provided for HSP objects. + : This is only available for the BlastHit or Blast object. + +See Also : L + +=cut + +#----- +sub p { my $self = shift; $self->{'_p'}; } +#----- + +# alias +sub pvalue { shift->p(@_); } + +=head2 length + + Usage : $hsp->length( [seq_type] ) + Purpose : Get the length of the aligned portion of the query or sbjct. + Example : $hsp->length('query') + Returns : integer + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' (default = 'total') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : 'total' length is the full length of the alignment + : as reported in the denominators in the alignment section: + : "Identical = 34/120 Positives = 67/120". + +See Also : L + +=cut + +#----------- +sub length { +#----------- +## Developer note: when using the built-in length function within +## this module, call it as CORE::length(). + my( $self, $seqType ) = @_; + $seqType ||= 'total'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + $seqType ne 'total' and $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + $self->{$seqType.'Length'}; +} + + + +=head2 gaps + + Usage : $hsp->gaps( [seq_type] ) + Purpose : Get the number of gaps in the query, sbjct, or total alignment. + : Also can return query gaps and sbjct gaps as a two-element list + : when in array context. + Example : $total_gaps = $hsp->gaps(); + : ($qgaps, $sgaps) = $hsp->gaps(); + : $qgaps = $hsp->gaps('query'); + Returns : scalar context: integer + : array context without args: (int, int) = ('queryGaps', 'sbjctGaps') + Argument : seq_type: 'query' or 'hit' or 'sbjct' or 'total' + : ('sbjct' is synonymous with 'hit') + : (default = 'total', scalar context) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Throws : n/a + +See Also : L, L + +=cut + +#--------- +sub gaps { +#--------- + my( $self, $seqType ) = @_; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + $seqType ||= (wantarray ? 'list' : 'total'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + if($seqType =~ /list|array/i) { + return (($self->{'_queryGaps'} || 0), ($self->{'_sbjctGaps'} || 0)); + } + + if($seqType eq 'total') { + return ($self->{'_queryGaps'} + $self->{'_sbjctGaps'}) || 0; + } else { + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Gaps'} || 0; + } +} + + +=head2 frac_identical + + Usage : $hsp_object->frac_identical( [seq_type] ); + Purpose : Get the fraction of identical positions within the given HSP. + Example : $frac_iden = $hsp_object->frac_identical('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' or 'hit' or 'sbjct' or 'total' + : ('sbjct' is synonymous with 'hit') + : default = 'total' (but see comments below). + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : NCBI-BLAST uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : Therefore, when called without an argument or an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. + : + : To get the fraction identical among only the aligned residues, + : ignoring the gaps, call this method with an argument of 'query' + : or 'sbjct' ('sbjct' is synonymous with 'hit'). + +See Also : L, L, L + +=cut + +#------------------- +sub frac_identical { +#------------------- +# The value is calculated as opposed to storing it from the parsed results. +# This saves storage and also permits flexibility in determining for which +# sequence (query or sbjct) the figure is to be calculated. + + my( $self, $seqType ) = @_; + $seqType ||= 'total'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + if($seqType ne 'total') { + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + } + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + + sprintf( "%.2f", $self->{'_numIdentical'}/$self->{$seqType.'Length'}); +} + + +=head2 frac_conserved + + Usage : $hsp_object->frac_conserved( [seq_type] ); + Purpose : Get the fraction of conserved positions within the given HSP. + : (Note: 'conservative' positions are called 'positives' in the + : Blast report.) + Example : $frac_cons = $hsp_object->frac_conserved('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' or 'hit' or 'sbjct' or 'total' + : ('sbjct' is synonymous with 'hit') + : default = 'total' (but see comments below). + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : NCBI-BLAST uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : Therefore, when called without an argument or an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. + : + : To get the fraction conserved among only the aligned residues, + : ignoring the gaps, call this method with an argument of 'query' + : or 'sbjct'. + +See Also : L, L, L + +=cut + +#-------------------- +sub frac_conserved { +#-------------------- +# The value is calculated as opposed to storing it from the parsed results. +# This saves storage and also permits flexibility in determining for which +# sequence (query or sbjct) the figure is to be calculated. + + my( $self, $seqType ) = @_; + $seqType ||= 'total'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + if($seqType ne 'total') { + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + } + + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + + sprintf( "%.2f", $self->{'_numConserved'}/$self->{$seqType.'Length'}); +} + +=head2 query_string + + Title : query_string + Usage : my $qseq = $hsp->query_string; + Function: Retrieves the query sequence of this HSP as a string + Returns : string + Args : none + + +=cut + +#---------------- +sub query_string{ shift->seq_str('query'); } +#---------------- + +=head2 hit_string + + Title : hit_string + Usage : my $hseq = $hsp->hit_string; + Function: Retrieves the hit sequence of this HSP as a string + Returns : string + Args : none + + +=cut + +#---------------- +sub hit_string{ shift->seq_str('hit'); } +#---------------- + + +=head2 homology_string + + Title : homology_string + Usage : my $homo_string = $hsp->homology_string; + Function: Retrieves the homology sequence for this HSP as a string. + : The homology sequence is the string of symbols in between the + : query and hit sequences in the alignment indicating the degree + : of conservation (e.g., identical, similar, not similar). + Returns : string + Args : none + +=cut + +#---------------- +sub homology_string{ shift->seq_str('match'); } +#---------------- + +#================================================= +# End Bio::Search::HSP::HSPI implementation +#================================================= + +# Older method delegating to method defined in HSPI. + +=head2 expect + +See L + +=cut + +#---------- +sub expect { shift->evalue( @_ ); } +#---------- + + +=head2 rank + + Usage : $hsp->rank( [string] ); + Purpose : Get the rank of the HSP within a given Blast hit. + Example : $rank = $hsp->rank; + Returns : Integer (1..n) corresponding to the order in which the HSP + appears in the BLAST report. + +=cut + +#' + +#---------- +sub rank { shift->{'_rank'} } +#---------- + +# For backward compatibility +#---------- +sub name { shift->rank } +#---------- + +=head2 to_string + + Title : to_string + Usage : print $hsp->to_string; + Function: Returns a string representation for the Blast HSP. + Primarily intended for debugging purposes. + Example : see usage + Returns : A string of the form: + [BlastHSP] + e.g.: + [BlastHit] 1 + Args : None + +=cut + +#---------- +sub to_string { +#---------- + my $self = shift; + return "[BlastHSP] " . $self->rank(); +} + + +#=head2 _set_data (Private method) +# +# Usage : called automatically during object construction. +# Purpose : Parses the raw HSP section from a flat BLAST report and +# sets the query sequence, sbjct sequence, and the "match" data +# : which consists of the symbols between the query and sbjct lines +# : in the alignment. +# Argument : Array (all lines for a single, complete HSP, from a raw, +# flat (i.e., non-XML) BLAST report) +# Throws : Propagates any exceptions from the methods called ("See Also") +# +#See Also : L<_set_seq()|_set_seq>, L<_set_score_stats()|_set_score_stats>, L<_set_match_stats()|_set_match_stats>, L<_initialize()|_initialize> +# +#=cut + +#-------------- +sub _set_data { +#-------------- + my $self = shift; + my @data = @_; + my @queryList = (); # 'Query' = SEQUENCE USED TO QUERY THE DATABASE. + my @sbjctList = (); # 'Sbjct' = HOMOLOGOUS SEQUENCE FOUND IN THE DATABASE. + my @matchList = (); + my $matchLine = 0; # Alternating boolean: when true, load 'match' data. + my @linedat = (); + + #print STDERR "BlastHSP: set_data()\n"; + + my($line, $aln_row_len, $length_diff); + $length_diff = 0; + + # Collecting data for all lines in the alignment + # and then storing the collections for possible processing later. + # + # Note that "match" lines may not be properly padded with spaces. + # This loop now properly handles such cases: + # Query: 1141 PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVIXXXXX 1200 + # PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVI + # Sbjct: 1141 PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVILSLKL 1200 + + foreach $line( @data ) { + next if $line =~ /^\s*$/; + + if( $line =~ /^ ?Score/ ) { + $self->_set_score_stats( $line ); + } elsif( $line =~ /^ ?(Identities|Positives|Strand)/ ) { + $self->_set_match_stats( $line ); + } elsif( $line =~ /^ ?Frame = ([\d+-]+)/ ) { + # Version 2.0.8 has Frame information on a separate line. + # Storing frame according to SeqFeature::Generic::frame() + # which does not contain strand info (use strand()). + my $frame = abs($1) - 1; + $self->frame( $frame ); + } elsif( $line =~ /^(Query:?[\s\d]+)([^\s\d]+)/ ) { + push @queryList, $line; + $self->{'_match_indent'} = CORE::length $1; + $aln_row_len = (CORE::length $1) + (CORE::length $2); + $matchLine = 1; + } elsif( $matchLine ) { + # Pad the match line with spaces if necessary. + $length_diff = $aln_row_len - CORE::length $line; + $length_diff and $line .= ' 'x $length_diff; + push @matchList, $line; + $matchLine = 0; + } elsif( $line =~ /^Sbjct/ ) { + push @sbjctList, $line; + } + } + # Storing the query and sbjct lists in case they are needed later. + # We could make this conditional to save memory. + $self->{'_queryList'} = \@queryList; + $self->{'_sbjctList'} = \@sbjctList; + + # Storing the match list in case it is needed later. + $self->{'_matchList'} = \@matchList; + + if(not defined ($self->{'_numIdentical'})) { + my $id_str = $self->_id_str; + $self->throw( -text => "Can't parse match statistics. Possibly a new or unrecognized Blast format. ($id_str)"); + } + + if(!scalar @queryList or !scalar @sbjctList) { + my $id_str = $self->_id_str; + $self->throw( "Can't find query or sbjct alignment lines. Possibly unrecognized Blast format. ($id_str)"); + } +} + + +#=head2 _set_score_stats (Private method) +# +# Usage : called automatically by _set_data() +# Purpose : Sets various score statistics obtained from the HSP listing. +# Argument : String with any of the following formats: +# : blast2: Score = 30.1 bits (66), Expect = 9.2 +# : blast2: Score = 158.2 bits (544), Expect(2) = e-110 +# : blast1: Score = 410 (144.3 bits), Expect = 1.7e-40, P = 1.7e-40 +# : blast1: Score = 55 (19.4 bits), Expect = 5.3, Sum P(3) = 0.99 +# Throws : Exception if the stats cannot be parsed, probably due to a change +# : in the Blast report format. +# +#See Also : L<_set_data()|_set_data> +# +#=cut + +#-------------------- +sub _set_score_stats { +#-------------------- + my ($self, $data) = @_; + + my ($expect, $p); + + if($data =~ /Score = +([\d.e+-]+) bits \(([\d.e+-]+)\), +Expect = +([\d.e+-]+)/) { + # blast2 format n = 1 + $self->bits($1); + $self->score($2); + $expect = $3; + } elsif($data =~ /Score = +([\d.e+-]+) bits \(([\d.e+-]+)\), +Expect\((\d+)\) = +([\d.e+-]+)/) { + # blast2 format n > 1 + $self->bits($1); + $self->score($2); + $self->{'_n'} = $3; + $expect = $4; + + } elsif($data =~ /Score = +([\d.e+-]+) \(([\d.e+-]+) bits\), +Expect = +([\d.e+-]+), P = +([\d.e-]+)/) { + # blast1 format, n = 1 + $self->score($1); + $self->bits($2); + $expect = $3; + $p = $4; + + } elsif($data =~ /Score = +([\d.e+-]+) \(([\d.e+-]+) bits\), +Expect = +([\d.e+-]+), +Sum P\((\d+)\) = +([\d.e-]+)/) { + # blast1 format, n > 1 + $self->score($1); + $self->bits($2); + $expect = $3; + $self->{'_n'} = $4; + $p = $5; + + } else { + my $id_str = $self->_id_str; + $self->throw(-class => 'Bio::Root::Exception', + -text => "Can't parse score statistics: unrecognized format. ($id_str)", + -value => $data); + } + $expect = "1$expect" if $expect =~ /^e/i; + $p = "1$p" if defined $p and $p=~ /^e/i; + + $self->{'_expect'} = $expect; + $self->{'_p'} = $p || undef; + $self->significance( $p || $expect ); +} + + +#=head2 _set_match_stats (Private method) +# +# Usage : Private method; called automatically by _set_data() +# Purpose : Sets various matching statistics obtained from the HSP listing. +# Argument : blast2: Identities = 23/74 (31%), Positives = 29/74 (39%), Gaps = 17/74 (22%) +# : blast2: Identities = 57/98 (58%), Positives = 74/98 (75%) +# : blast1: Identities = 87/204 (42%), Positives = 126/204 (61%) +# : blast1: Identities = 87/204 (42%), Positives = 126/204 (61%), Frame = -3 +# : WU-blast: Identities = 310/553 (56%), Positives = 310/553 (56%), Strand = Minus / Plus +# Throws : Exception if the stats cannot be parsed, probably due to a change +# : in the Blast report format. +# Comments : The "Gaps = " data in the HSP header has a different meaning depending +# : on the type of Blast: for BLASTP, this number is the total number of +# : gaps in query+sbjct; for TBLASTN, it is the number of gaps in the +# : query sequence only. Thus, it is safer to collect the data +# : separately by examining the actual sequence strings as is done +# : in _set_seq(). +# +#See Also : L<_set_data()|_set_data>, L<_set_seq()|_set_seq> +# +#=cut + +#-------------------- +sub _set_match_stats { +#-------------------- + my ($self, $data) = @_; + + if($data =~ m!Identities = (\d+)/(\d+)!) { + # blast1 or 2 format + $self->{'_numIdentical'} = $1; + $self->{'_totalLength'} = $2; + } + + if($data =~ m!Positives = (\d+)/(\d+)!) { + # blast1 or 2 format + $self->{'_numConserved'} = $1; + $self->{'_totalLength'} = $2; + } + + if($data =~ m!Frame = ([\d+-]+)!) { + $self->frame($1); + } + + # Strand data is not always present in this line. + # _set_seq() will also set strand information. + if($data =~ m!Strand = (\w+) / (\w+)!) { + $self->{'_queryStrand'} = $1; + $self->{'_sbjctStrand'} = $2; + } + +# if($data =~ m!Gaps = (\d+)/(\d+)!) { +# $self->{'_totalGaps'} = $1; +# } else { +# $self->{'_totalGaps'} = 0; +# } +} + + + +#=head2 _set_seq_data (Private method) +# +# Usage : called automatically when sequence data is requested. +# Purpose : Sets the HSP sequence data for both query and sbjct sequences. +# : Includes: start, stop, length, gaps, and raw sequence. +# Argument : n/a +# Throws : Propagates any exception thrown by _set_match_seq() +# Comments : Uses raw data stored by _set_data() during object construction. +# : These data are not always needed, so it is conditionally +# : executed only upon demand by methods such as gaps(), _set_residues(), +# : etc. _set_seq() does the dirty work. +# +#See Also : L<_set_seq()|_set_seq> +# +#=cut + +#----------------- +sub _set_seq_data { +#----------------- + my $self = shift; + + $self->_set_seq('query', @{$self->{'_queryList'}}); + $self->_set_seq('sbjct', @{$self->{'_sbjctList'}}); + + # Liberate some memory. + @{$self->{'_queryList'}} = @{$self->{'_sbjctList'}} = (); + undef $self->{'_queryList'}; + undef $self->{'_sbjctList'}; + + $self->{'_set_seq_data'} = 1; +} + + + +#=head2 _set_seq (Private method) +# +# Usage : called automatically by _set_seq_data() +# : $hsp_obj->($seq_type, @data); +# Purpose : Sets sequence information for both the query and sbjct sequences. +# : Directly counts the number of gaps in each sequence (if gapped Blast). +# Argument : $seq_type = 'query' or 'sbjct' +# : @data = all seq lines with the form: +# : Query: 61 SPHNVKDRKEQNGSINNAISPTATANTSGSQQINIDSALRDRSSNVAAQPSLSDASSGSN 120 +# Throws : Exception if data strings cannot be parsed, probably due to a change +# : in the Blast report format. +# Comments : Uses first argument to determine which data members to set +# : making this method sensitive data member name changes. +# : Behavior is dependent on the type of BLAST analysis (TBLASTN, BLASTP, etc). +# Warning : Sequence endpoints are normalized so that start < end. This affects HSPs +# : for TBLASTN/X hits on the minus strand. Normalization facilitates use +# : of range information by methods such as match(). +# +#See Also : L<_set_seq_data()|_set_seq_data>, L, L, L, L +# +#=cut + +#------------- +sub _set_seq { +#------------- + my $self = shift; + my $seqType = shift; + my @data = @_; + my @ranges = (); + my @sequence = (); + my $numGaps = 0; + + foreach( @data ) { + if( m/(\d+) *([^\d\s]+) *(\d+)/) { + push @ranges, ( $1, $3 ) ; + push @sequence, $2; + #print STDERR "_set_seq found sequence \"$2\"\n"; + } else { + $self->warn("Bad sequence data: $_"); + } + } + + if( !(scalar(@sequence) and scalar(@ranges))) { + my $id_str = $self->_id_str; + $self->throw("Can't set sequence: missing data. Possibly unrecognized Blast format. ($id_str)"); + } + + # Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + $self->{$seqType.'Start'} = $ranges[0]; + $self->{$seqType.'Stop'} = $ranges[ $#ranges ]; + $self->{$seqType.'Seq'} = \@sequence; + + $self->{$seqType.'Length'} = abs($ranges[ $#ranges ] - $ranges[0]) + 1; + + # Adjust lengths for BLASTX, TBLASTN, TBLASTX sequences + # Converting nucl coords to amino acid coords. + + my $prog = $self->algorithm; + if($prog eq 'TBLASTN' and $seqType eq '_sbjct') { + $self->{$seqType.'Length'} /= 3; + } elsif($prog eq 'BLASTX' and $seqType eq '_query') { + $self->{$seqType.'Length'} /= 3; + } elsif($prog eq 'TBLASTX') { + $self->{$seqType.'Length'} /= 3; + } + + if( $prog ne 'BLASTP' ) { + $self->{$seqType.'Strand'} = 'Plus' if $prog =~ /BLASTN/; + $self->{$seqType.'Strand'} = 'Plus' if ($prog =~ /BLASTX/ and $seqType eq '_query'); + # Normalize sequence endpoints so that start < end. + # Reverse complement or 'minus strand' HSPs get flipped here. + if($self->{$seqType.'Start'} > $self->{$seqType.'Stop'}) { + ($self->{$seqType.'Start'}, $self->{$seqType.'Stop'}) = + ($self->{$seqType.'Stop'}, $self->{$seqType.'Start'}); + $self->{$seqType.'Strand'} = 'Minus'; + } + } + + ## Count number of gaps in each seq. Only need to do this for gapped Blasts. +# if($self->{'_gapped'}) { + my $seqstr = join('', @sequence); + $seqstr =~ s/\s//g; + my $num_gaps = CORE::length($seqstr) - $self->{$seqType.'Length'}; + $self->{$seqType.'Gaps'} = $num_gaps if $num_gaps > 0; +# } +} + + +#=head2 _set_residues (Private method) +# +# Usage : called automatically when residue data is requested. +# Purpose : Sets the residue numbers representing the identical and +# : conserved positions. These data are obtained by analyzing the +# : symbols between query and sbjct lines of the alignments. +# Argument : n/a +# Throws : Propagates any exception thrown by _set_seq_data() and _set_match_seq(). +# Comments : These data are not always needed, so it is conditionally +# : executed only upon demand by methods such as seq_inds(). +# : Behavior is dependent on the type of BLAST analysis (TBLASTN, BLASTP, etc). +# +#See Also : L<_set_seq_data()|_set_seq_data>, L<_set_match_seq()|_set_match_seq>, seq_inds() +# +#=cut + +#------------------ +sub _set_residues { +#------------------ + my $self = shift; + my @sequence = (); + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + # Using hashes to avoid saving duplicate residue numbers. + my %identicalList_query = (); + my %identicalList_sbjct = (); + my %conservedList_query = (); + my %conservedList_sbjct = (); + + my $aref = $self->_set_match_seq() if not ref $self->{'_matchSeq'}; + $aref ||= $self->{'_matchSeq'}; + my $seqString = join('', @$aref ); + + my $qseq = join('',@{$self->{'_querySeq'}}); + my $sseq = join('',@{$self->{'_sbjctSeq'}}); + my $resCount_query = $self->{'_queryStop'} || 0; + my $resCount_sbjct = $self->{'_sbjctStop'} || 0; + + my $prog = $self->algorithm; + if($prog !~ /^BLASTP|^BLASTN/) { + if($prog eq 'TBLASTN') { + $resCount_sbjct /= 3; + } elsif($prog eq 'BLASTX') { + $resCount_query /= 3; + } elsif($prog eq 'TBLASTX') { + $resCount_query /= 3; + $resCount_sbjct /= 3; + } + } + + my ($mchar, $schar, $qchar); + while( $mchar = chop($seqString) ) { + ($qchar, $schar) = (chop($qseq), chop($sseq)); + if( $mchar eq '+' ) { + $conservedList_query{ $resCount_query } = 1; + $conservedList_sbjct{ $resCount_sbjct } = 1; + } elsif( $mchar ne ' ' ) { + $identicalList_query{ $resCount_query } = 1; + $identicalList_sbjct{ $resCount_sbjct } = 1; + } + $resCount_query-- if $qchar ne $GAP_SYMBOL; + $resCount_sbjct-- if $schar ne $GAP_SYMBOL; + } + $self->{'_identicalRes_query'} = \%identicalList_query; + $self->{'_conservedRes_query'} = \%conservedList_query; + $self->{'_identicalRes_sbjct'} = \%identicalList_sbjct; + $self->{'_conservedRes_sbjct'} = \%conservedList_sbjct; + +} + + + + +#=head2 _set_match_seq (Private method) +# +# Usage : $hsp_obj->_set_match_seq() +# Purpose : Set the 'match' sequence for the current HSP (symbols in between +# : the query and sbjct lines.) +# Returns : Array reference holding the match sequences lines. +# Argument : n/a +# Throws : Exception if the _matchList field is not set. +# Comments : The match information is not always necessary. This method +# : allows it to be conditionally prepared. +# : Called by _set_residues>() and seq_str(). +# +#See Also : L<_set_residues()|_set_residues>, L +# +#=cut + +#------------------- +sub _set_match_seq { +#------------------- + my $self = shift; + + if( ! ref($self->{'_matchList'}) ) { + my $id_str = $self->_id_str; + $self->throw("Can't set HSP match sequence: No data ($id_str)"); + } + + my @data = @{$self->{'_matchList'}}; + + my(@sequence); + foreach( @data ) { + chomp($_); + ## Remove leading spaces; (note: aln may begin with a space + ## which is why we can't use s/^ +//). + s/^ {$self->{'_match_indent'}}//; + push @sequence, $_; + } + # Liberate some memory. + @{$self->{'_matchList'}} = undef; + $self->{'_matchList'} = undef; + + $self->{'_matchSeq'} = \@sequence; + + return $self->{'_matchSeq'}; +} + + +=head2 n + + Usage : $hsp_obj->n() + Purpose : Get the N value (num HSPs on which P/Expect is based). + : This value is not defined with NCBI Blast2 with gapping. + Returns : Integer or null string if not defined. + Argument : n/a + Throws : n/a + Comments : The 'N' value is listed in parenthesis with P/Expect value: + : e.g., P(3) = 1.2e-30 ---> (N = 3). + : Not defined in NCBI Blast2 with gaps. + : This typically is equal to the number of HSPs but not always. + : To obtain the number of HSPs, use Bio::Search::Hit::BlastHit::num_hsps(). + +See Also : L + +=cut + +#----- +sub n { my $self = shift; $self->{'_n'} || ''; } +#----- + + +=head2 matches + + Usage : $hsp->matches([seq_type], [start], [stop]); + Purpose : Get the total number of identical and conservative matches + : in the query or sbjct sequence for the given HSP. Optionally can + : report data within a defined interval along the seq. + : (Note: 'conservative' matches are called 'positives' in the + : Blast report.) + Example : ($id,$cons) = $hsp_object->matches('hit'); + : ($id,$cons) = $hsp_object->matches('query',300,400); + Returns : 2-element array of integers + Argument : (1) seq_type = 'query' or 'hit' or 'sbjct' (default = query) + : ('sbjct' is synonymous with 'hit') + : (2) start = Starting coordinate (optional) + : (3) stop = Ending coordinate (optional) + Throws : Exception if the supplied coordinates are out of range. + Comments : Relies on seq_str('match') to get the string of alignment symbols + : between the query and sbjct lines which are used for determining + : the number of identical and conservative matches. + +See Also : L, L, L, L + +=cut + +#----------- +sub matches { +#----------- + my( $self, %param ) = @_; + my(@data); + my($seqType, $beg, $end) = ($param{-SEQ}, $param{-START}, $param{-STOP}); + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + my($start,$stop); + + if(!defined $beg && !defined $end) { + ## Get data for the whole alignment. + push @data, ($self->{'_numIdentical'}, $self->{'_numConserved'}); + } else { + ## Get the substring representing the desired sub-section of aln. + $beg ||= 0; + $end ||= 0; + ($start,$stop) = $self->range($seqType); + if($beg == 0) { $beg = $start; $end = $beg+$end; } + elsif($end == 0) { $end = $stop; $beg = $end-$beg; } + + if($end >= $stop) { $end = $stop; } ##ML changed from if (end >stop) + else { $end += 1;} ##ML moved from commented position below, makes + ##more sense here +# if($end > $stop) { $end = $stop; } + if($beg < $start) { $beg = $start; } +# else { $end += 1;} + +# my $seq = substr($self->seq_str('match'), $beg-$start, ($end-$beg)); + + ## ML: START fix for substr out of range error ------------------ + my $seq = ""; + my $prog = $self->algorithm; + if (($prog eq 'TBLASTN') and ($seqType eq 'sbjct')) + { + $seq = substr($self->seq_str('match'), + int(($beg-$start)/3), int(($end-$beg+1)/3)); + + } elsif (($prog eq 'BLASTX') and ($seqType eq 'query')) + { + $seq = substr($self->seq_str('match'), + int(($beg-$start)/3), int(($end-$beg+1)/3)); + } else { + $seq = substr($self->seq_str('match'), + $beg-$start, ($end-$beg)); + } + ## ML: End of fix for substr out of range error ----------------- + + + ## ML: debugging code + ## This is where we get our exception. Try printing out the values going + ## into this: + ## +# print STDERR +# qq(*------------MY EXCEPTION --------------------\nSeq: ") , +# $self->seq_str("$seqType"), qq("\n),$self->rank,",( index:"; +# print STDERR $beg-$start, ", len: ", $end-$beg," ), (HSPRealLen:", +# CORE::length $self->seq_str("$seqType"); +# print STDERR ", HSPCalcLen: ", $stop - $start +1 ," ), +# ( beg: $beg, end: $end ), ( start: $start, stop: stop )\n"; + ## ML: END DEBUGGING CODE---------- + + if(!CORE::length $seq) { + my $id_str = $self->_id_str; + $self->throw("Undefined $seqType sub-sequence ($beg,$end). Valid range = $start - $stop ($id_str)"); + } + ## Get data for a substring. +# printf "Collecting HSP subsection data: beg,end = %d,%d; start,stop = %d,%d\n%s<---\n", $beg, $end, $start, $stop, $seq; +# printf "Original match seq:\n%s\n",$self->seq_str('match'); + $seq =~ s/ //g; # remove space (no info). + my $len_cons = CORE::length $seq; + $seq =~ s/\+//g; # remove '+' characters (conservative substitutions) + my $len_id = CORE::length $seq; + push @data, ($len_id, $len_cons); +# printf " HSP = %s\n id = %d; cons = %d\n", $self->rank, $len_id, $len_cons; ; + } + @data; +} + + +=head2 num_identical + + Usage : $hsp_object->num_identical(); + Purpose : Get the number of identical positions within the given HSP. + Example : $num_iden = $hsp_object->num_identical(); + Returns : integer + Argument : n/a + Throws : n/a + +See Also : L, L + +=cut + +#------------------- +sub num_identical { +#------------------- + my( $self) = shift; + + $self->{'_numIdentical'}; +} + + +=head2 num_conserved + + Usage : $hsp_object->num_conserved(); + Purpose : Get the number of conserved positions within the given HSP. + Example : $num_iden = $hsp_object->num_conserved(); + Returns : integer + Argument : n/a + Throws : n/a + +See Also : L, L + +=cut + +#------------------- +sub num_conserved { +#------------------- + my( $self) = shift; + + $self->{'_numConserved'}; +} + + + +=head2 range + + Usage : $hsp->range( [seq_type] ); + Purpose : Gets the (start, end) coordinates for the query or sbjct sequence + : in the HSP alignment. + Example : ($query_beg, $query_end) = $hsp->range('query'); + : ($hit_beg, $hit_end) = $hsp->range('hit'); + Returns : Two-element array of integers + Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query') + : ('sbjct' is synonymous with 'hit') + Throws : n/a + +See Also : L, L + +=cut + +#---------- +sub range { +#---------- + my ($self, $seqType) = @_; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + + return ($self->{$seqType.'Start'},$self->{$seqType.'Stop'}); +} + +=head2 start + + Usage : $hsp->start( [seq_type] ); + Purpose : Gets the start coordinate for the query, sbjct, or both sequences + : in the HSP alignment. + : NOTE: Start will always be less than end. + : To determine strand, use $hsp->strand() + Example : $query_beg = $hsp->start('query'); + : $hit_beg = $hsp->start('hit'); + : ($query_beg, $hit_beg) = $hsp->start(); + Returns : scalar context: integer + : array context without args: list of two integers + Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default= 'query') + : ('sbjct' is synonymous with 'hit') + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Throws : n/a + +See Also : L, L + +=cut + +#---------- +sub start { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /list|array/i) { + return ($self->{'_queryStart'}, $self->{'_sbjctStart'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Start'}; + } +} + +=head2 end + + Usage : $hsp->end( [seq_type] ); + Purpose : Gets the end coordinate for the query, sbjct, or both sequences + : in the HSP alignment. + : NOTE: Start will always be less than end. + : To determine strand, use $hsp->strand() + Example : $query_end = $hsp->end('query'); + : $hit_end = $hsp->end('hit'); + : ($query_end, $hit_end) = $hsp->end(); + Returns : scalar context: integer + : array context without args: list of two integers + Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default= 'query') + : ('sbjct' is synonymous with 'hit') + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Throws : n/a + +See Also : L, L, L + +=cut + +#---------- +sub end { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /list|array/i) { + return ($self->{'_queryStop'}, $self->{'_sbjctStop'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Stop'}; + } +} + + + +=head2 strand + + Usage : $hsp_object->strand( [seq_type] ) + Purpose : Get the strand of the query or sbjct sequence. + Example : print $hsp->strand('query'); + : ($query_strand, $hit_strand) = $hsp->strand(); + Returns : -1, 0, or 1 + : -1 = Minus strand, +1 = Plus strand + : Returns 0 if strand is not defined, which occurs + : for BLASTP reports, and the query of TBLASTN + : as well as the hit if BLASTX reports. + : In scalar context without arguments, returns queryStrand value. + : In array context without arguments, returns a two-element list + : of strings (queryStrand, sbjctStrand). + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : seq_type: 'query' or 'hit' or 'sbjct' or undef + : ('sbjct' is synonymous with 'hit') + Throws : n/a + +See Also : B<_set_seq()>, B<_set_match_stats()> + +=cut + +#----------- +sub strand { +#----------- + my( $self, $seqType ) = @_; + + # Hack to deal with the fact that SimilarityPair calls strand() + # which will lead to an error because parsing hasn't yet occurred. + # See SimilarityPair::new(). + return if $self->{'_initializing'}; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + + # $seqType could be '_list'. + $self->{'_queryStrand'} or $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + my $prog = $self->algorithm; + + if($seqType =~ /list|array/i) { + my ($qstr, $hstr); + if( $prog eq 'BLASTP') { + $qstr = 0; + $hstr = 0; + } + elsif( $prog eq 'TBLASTN') { + $qstr = 0; + $hstr = $STRAND_SYMBOL{$self->{'_sbjctStrand'}}; + } + elsif( $prog eq 'BLASTX') { + $qstr = $STRAND_SYMBOL{$self->{'_queryStrand'}}; + $hstr = 0; + } + else { + $qstr = $STRAND_SYMBOL{$self->{'_queryStrand'}} if defined $self->{'_queryStrand'}; + $hstr = $STRAND_SYMBOL{$self->{'_sbjctStrand'}} if defined $self->{'_sbjctStrand'}; + } + $qstr ||= 0; + $hstr ||= 0; + return ($qstr, $hstr); + } + local $^W = 0; + $STRAND_SYMBOL{$self->{$seqType.'Strand'}} || 0; +} + + +=head2 seq + + Usage : $hsp->seq( [seq_type] ); + Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object. + Example : $seqObj = $hsp->seq('query'); + Returns : Object reference for a Bio::Seq.pm object. + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query'). + : ('sbjct' is synonymous with 'hit') + Throws : Propagates any exception that occurs during construction + : of the Bio::Seq.pm object. + Comments : The sequence is returned in an array of strings corresponding + : to the strings in the original format of the Blast alignment. + : (i.e., same spacing). + +See Also : L, L, B + +=cut + +#------- +sub seq { +#------- + my($self,$seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + my $str = $self->seq_str($seqType); + + require Bio::Seq; + + new Bio::Seq (-ID => $self->to_string, + -SEQ => $str, + -DESC => "$seqType sequence", + ); +} + +=head2 seq_str + + Usage : $hsp->seq_str( seq_type ); + Purpose : Get the full query, sbjct, or 'match' sequence as a string. + : The 'match' sequence is the string of symbols in between the + : query and sbjct sequences. + Example : $str = $hsp->seq_str('query'); + Returns : String + Argument : seq_Type = 'query' or 'hit' or 'sbjct' or 'match' + : ('sbjct' is synonymous with 'hit') + Throws : Exception if the argument does not match an accepted seq_type. + Comments : Calls _set_seq_data() to set the 'match' sequence if it has + : not been set already. + +See Also : L, L, B<_set_match_seq()> + +=cut + +#------------ +sub seq_str { +#------------ + my($self,$seqType) = @_; + + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /sbjct|query/) { + my $seq = join('',@{$self->{$seqType.'Seq'}}); + $seq =~ s/\s+//g; + return $seq; + + } elsif( $seqType =~ /match/i) { + # Only need to call _set_match_seq() if the match seq is requested. + my $aref = $self->_set_match_seq() unless ref $self->{'_matchSeq'}; + $aref = $self->{'_matchSeq'}; + + return join('',@$aref); + + } else { + my $id_str = $self->_id_str; + $self->throw(-class => 'Bio::Root::BadParameter', + -text => "Invalid or undefined sequence type: $seqType ($id_str)\n" . + "Valid types: query, sbjct, match", + -value => $seqType); + } +} + +=head2 seq_inds + + Usage : $hsp->seq_inds( seq_type, class, collapse ); + Purpose : Get a list of residue positions (indices) for all identical + : or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hsp->seq_inds('query', 'identical'); + : @h_ind = $hsp->seq_inds('hit', 'conserved'); + : @h_ind = $hsp->seq_inds('hit', 'conserved', 1); + Returns : List of integers + : May include ranges if collapse is true. + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query) + : ('sbjct' is synonymous with 'hit') + : class = 'identical' or 'conserved' (default = identical) + : (can be shortened to 'id' or 'cons') + : (actually, anything not 'id' will evaluate to 'conserved'). + : collapse = boolean, if true, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + Comments : Calls _set_residues() to set the 'match' sequence if it has + : not been set already. + +See Also : L, B<_set_residues()>, L, L + +=cut + +#--------------- +sub seq_inds { +#--------------- + my ($self, $seqType, $class, $collapse) = @_; + + $seqType ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + $seqType = 'sbjct' if $seqType eq 'hit'; + + $self->_set_residues() unless defined $self->{'_identicalRes_query'}; + + $seqType = ($seqType !~ /^q/i ? 'sbjct' : 'query'); + $class = ($class !~ /^id/i ? 'conserved' : 'identical'); + + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + $class = "_\L$class\E"; + + my @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}}; + + require Bio::Search::BlastUtils if $collapse; + + return $collapse ? &Bio::Search::BlastUtils::collapse_nums(@ary) : @ary; +} + + +=head2 get_aln + + Usage : $hsp->get_aln() + Purpose : Get a Bio::SimpleAlign object constructed from the query + sbjct + : sequences of the present HSP object. + Example : $aln_obj = $hsp->get_aln(); + Returns : Object reference for a Bio::SimpleAlign.pm object. + Argument : n/a. + Throws : Propagates any exception ocurring during the construction of + : the Bio::SimpleAlign object. + Comments : Requires Bio::SimpleAlign. + : The Bio::SimpleAlign object is constructed from the query + sbjct + : sequence objects obtained by calling seq(). + : Gap residues are included (see $GAP_SYMBOL). + +See Also : L, L + +=cut + +#------------ +sub get_aln { +#------------ + my $self = shift; + + require Bio::SimpleAlign; + require Bio::LocatableSeq; + my $qseq = $self->seq('query'); + my $sseq = $self->seq('sbjct'); + + my $type = $self->algorithm =~ /P$|^T/ ? 'amino' : 'dna'; + my $aln = new Bio::SimpleAlign(); + $aln->add_seq(new Bio::LocatableSeq(-seq => $qseq->seq(), + -id => 'query_'.$qseq->display_id(), + -start => 1, + -end => CORE::length($qseq))); + + $aln->add_seq(new Bio::LocatableSeq(-seq => $sseq->seq(), + -id => 'hit_'.$sseq->display_id(), + -start => 1, + -end => CORE::length($sseq))); + + return $aln; +} + + +1; +__END__ + + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for these data member descriptions to become obsolete as +this module is still evolving. Always double check this info and search +for members not described here. + +=back + +An instance of Bio::Search::HSP::BlastHSP.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + -------------------------------------------------------------- + (member names are mostly self-explanatory) + + _score : + _bits : + _p : + _n : Integer. The 'N' value listed in parenthesis with P/Expect value: + : e.g., P(3) = 1.2e-30 ---> (N = 3). + : Not defined in NCBI Blast2 with gaps. + : To obtain the number of HSPs, use Bio::Search::Hit::BlastHit::num_hsps(). + _expect : + _queryLength : + _queryGaps : + _queryStart : + _queryStop : + _querySeq : + _sbjctLength : + _sbjctGaps : + _sbjctStart : + _sbjctStop : + _sbjctSeq : + _matchSeq : String. Contains the symbols between the query and sbjct lines + which indicate identical (letter) and conserved ('+') matches + or a mismatch (' '). + _numIdentical : + _numConserved : + _identicalRes_query : + _identicalRes_sbjct : + _conservedRes_query : + _conservedRes_sbjct : + _match_indent : The number of leading space characters on each line containing + the match symbols. _match_indent is 13 in this example: + Query: 285 QNSAPWGLARISHRERLNLGSFNKYLYDDDAG + Q +APWGLARIS G+ + Y YD+ AG + ^^^^^^^^^^^^^ + + +=cut + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/FastaHSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/FastaHSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,196 @@ +# $Id: FastaHSP.pm,v 1.4.2.1 2003/02/28 09:47:19 jason Exp $ +# +# BioPerl module for Bio::Search::HSP::FastaHSP +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::HSP::FastaHSP - HSP object for FASTA specific data + +=head1 SYNOPSIS + + # get a FastaHSP from a SearchIO stream + my $in = new Bio::SearchIO(-format => 'fasta', -file => 'filename.fasta'); + + while( my $r = $in->next_result) { + while( my $hit = $r->next_result ) { + while( my $hsp = $hit->next_hsp ) { + print "smith-waterman score (if available): ", + $hsp->sw_score(),"\n"; + } + } + } + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::HSP::FastaHSP; +use vars qw(@ISA); +use strict; + +use Bio::Search::HSP::GenericHSP; + +@ISA = qw(Bio::Search::HSP::GenericHSP ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::HSP::FastaHSP(); + Function: Builds a new Bio::Search::HSP::FastaHSP object + Returns : Bio::Search::HSP::FastaHSP + Args : -swscore => smith-waterman score + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($swscore) = $self->_rearrange([qw(SWSCORE)], @args); + + defined $swscore && $self->sw_score($swscore); + + return $self; +} + + +=head2 sw_score + + Title : sw_score + Usage : $obj->sw_score($newval) + Function: Get/Set Smith-Waterman score + Returns : value of sw_score + Args : newvalue (optional) + + +=cut + +sub sw_score{ + my ($self,$value) = @_; + if( defined $value || ! defined $self->{'_sw_score'} ) { + $value = 0 unless defined $value; # default value + $self->{'_sw_score'} = $value; + } + return $self->{'_sw_score'}; +} + + +sub get_aln { + my ($self) = @_; + require Bio::LocatableSeq; + require Bio::SimpleAlign; + my $aln = new Bio::SimpleAlign; + my $hs = $self->hit_string(); + my $qs = $self->query_string(); + + # fasta reports some extra 'regional' sequence information + # we need to clear out first + # this seemed a bit insane to me at first, but it appears to + # work --jason + + # we infer the end of the regional sequence where the first + # non space is in the homology string + # then we use the HSP->length to tell us how far to read + # to cut off the end of the sequence + + my ($start) = 0; + if( $self->homology_string() =~ /^(\s+)/ ) { + $start = CORE::length($1); + } + $self->debug("hs seq is '$hs'\n"); + $self->debug("qs seq is '$qs'\n"); + + + $hs = substr($hs, $start,$self->length('total')); + $qs = substr($qs, $start,$self->length('total')); + foreach my $seq ( $qs,$hs) { + foreach my $f ( '\\', '/', ' ') { + my $index = index($seq,$f); + while( $index >=0 && length($seq) > 0 ) { + substr($hs,$index,1) = ''; + substr($qs,$index,1) = ''; + $self->debug( "$f, $index+1, for ".length($seq). " ($seq)\n"); + $index = index($seq,$f,$index+1); + } + } + } + + my $seqonly = $qs; + $seqonly =~ s/[\-\s]//g; + my ($q_nm,$s_nm) = ($self->query->seq_id(), + $self->hit->seq_id()); + unless( defined $q_nm && CORE::length ($q_nm) ) { + $q_nm = 'query'; + } + unless( defined $s_nm && CORE::length ($s_nm) ) { + $s_nm = 'hit'; + } + my $query = new Bio::LocatableSeq('-seq' => $qs, + '-id' => $q_nm, + '-start' => 1, + '-end' => CORE::length($seqonly), + ); + $seqonly = $hs; + $seqonly =~ s/[\-\s]//g; + my $hit = new Bio::LocatableSeq('-seq' => $hs, + '-id' => $s_nm, + '-start' => 1, + '-end' => CORE::length($seqonly), + ); + $aln->add_seq($query); + $aln->add_seq($hit); + return $aln; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/GenericHSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/GenericHSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1125 @@ +# $Id: GenericHSP.pm,v 1.40.2.3 2003/03/24 20:44:45 jason Exp $ +# +# BioPerl module for Bio::Search::HSP::GenericHSP +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair + +=head1 SYNOPSIS + + my $hsp = new Bio::Search::HSP::GenericHSP( -algorithm => 'blastp', + -evalue => '1e-30', + ); + + $r_type = $hsp->algorithm + + $pvalue = $hsp->p(); + + $evalue = $hsp->evalue(); + + $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); + + $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); + + $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); + + $qseq = $hsp->query_string; + + $hseq = $hsp->hit_string; + + $homo_string = $hsp->homology_string; + + $len = $hsp->length( ['query'|'hit'|'total'] ); + + $len = $hsp->length( ['query'|'hit'|'total'] ); + + $rank = $hsp->rank; + + +=head1 DESCRIPTION + +This implementation is "Generic", meaning it is is suitable for +holding information about High Scoring pairs from most Search reports +such as BLAST and FastA. Specialized objects can be derived from +this. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich and Steve Chervitz + +Email jason@bioperl.org +Email sac@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::HSP::GenericHSP; +use vars qw(@ISA $GAP_SYMBOL); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::Similarity; +use Bio::Search::HSP::HSPI; + +@ISA = qw(Bio::Search::HSP::HSPI Bio::Root::Root ); + +BEGIN { + $GAP_SYMBOL = '-'; +} +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::HSP::GenericHSP(); + Function: Builds a new Bio::Search::HSP::GenericHSP object + Returns : Bio::Search::HSP::GenericHSP + Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc) + -evalue => evalue + -pvalue => pvalue + -bits => bit value for HSP + -score => score value for HSP (typically z-score but depends on + analysis) + -hsp_length=> Length of the HSP (including gaps) + -identical => # of residues that that matched identically + -conserved => # of residues that matched conservatively + (only protein comparisions; + conserved == identical in nucleotide comparisons) + -hsp_gaps => # of gaps in the HSP + -query_gaps => # of gaps in the query in the alignment + -hit_gaps => # of gaps in the subject in the alignment + -query_name => HSP Query sequence name (if available) + -query_start => HSP Query start (in original query sequence coords) + -query_end => HSP Query end (in original query sequence coords) + -hit_name => HSP Hit sequence name (if available) + -hit_start => HSP Hit start (in original hit sequence coords) + -hit_end => HSP Hit end (in original hit sequence coords) + -hit_length => total length of the hit sequence + -query_length=> total length of the query sequence + -query_seq => query sequence portion of the HSP + -hit_seq => hit sequence portion of the HSP + -homology_seq=> homology sequence for the HSP + -hit_frame => hit frame (only if hit is translated protein) + -query_frame => query frame (only if query is translated protein) + -rank => HSP rank + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($algo, $evalue, $pvalue, $identical, $conserved, + $gaps, $query_gaps, $hit_gaps, + $hit_seq, $query_seq, $homology_seq, + $hsp_len, $query_len,$hit_len, + $hit_name,$query_name,$bits,$score, + $hs,$he,$qs,$qe, + $qframe,$hframe, + $rank) = $self->_rearrange([qw(ALGORITHM + EVALUE + PVALUE + IDENTICAL + CONSERVED + HSP_GAPS + QUERY_GAPS + HIT_GAPS + HIT_SEQ + QUERY_SEQ + HOMOLOGY_SEQ + HSP_LENGTH + QUERY_LENGTH + HIT_LENGTH + HIT_NAME + QUERY_NAME + BITS + SCORE + HIT_START + HIT_END + QUERY_START + QUERY_END + QUERY_FRAME + HIT_FRAME + RANK + )], @args); + + $algo = 'GENERIC' unless defined $algo; + $self->algorithm($algo); + +# defined $evalue && $self->evalue($evalue) +# $hsp->significance is initialized by the +# the SimilarityPair object - let's only keep one +# value, don't need 2 slots. + + defined $pvalue && $self->pvalue($pvalue); + defined $bits && $self->bits($bits); + defined $score && $self->score($score); + my ($queryfactor, $hitfactor) = (0,0); + + if( $algo =~ /^(PSI)?T(BLAST|FAST)[NY]/oi ) { + $hitfactor = 1; + } elsif ($algo =~ /^(FAST|BLAST)(X|Y|XY)/oi ) { + $queryfactor = 1; + } elsif ($algo =~ /^T(BLAST|FAST)(X|Y|XY)/oi || + $algo =~ /^(BLAST|FAST)N/oi || + $algo eq 'WABA' || + $algo eq 'EXONERATE' || $algo eq 'MEGABLAST' || + $algo eq 'SMITH-WATERMAN' ){ + $hitfactor = 1; + $queryfactor = 1; + } elsif( $algo eq 'RPSBLAST' ) { + $queryfactor = $hitfactor = 0; + $qframe = $hframe = 0; + } + # Store the aligned query as sequence feature + my $strand; + unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin @args ($qs,$qe)"); } + unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); } + if ($qe > $qs) { # normal query: start < end + if ($queryfactor) { $strand = 1; } else { $strand = undef; } + } else { # reverse query (i dont know if this is possible, + # but feel free to correct) + if ($queryfactor) { $strand = -1; } else { $strand = undef; } + ($qs,$qe) = ($qe,$qs); + + } + $self->query( new Bio::SeqFeature::Similarity + ('-primary' => $self->primary_tag, + '-start' => $qs, + '-expect' => $evalue, + '-bits' => $bits, + '-end' => $qe, + '-strand' => $strand, + '-seq_id' => $query_name, + '-seqlength'=> $query_len, + '-source' => $algo, + ) ); + + # to determine frame from something like FASTXY which doesn't + # report the frame + if( defined $strand && ! defined $qframe && $queryfactor ) { + $qframe = ( $self->query->start % 3 ) * $strand; + } elsif( ! defined $strand ) { + $qframe = 0; + } + # store the aligned subject as sequence feature + if ($he > $hs) { # normal subject + if ($hitfactor) { $strand = 1; } else { $strand = undef; } + } else { + if ($hitfactor) { $strand = -1; } else { $strand = undef; } + ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end + } + + $self->hit( Bio::SeqFeature::Similarity->new + ('-start' => $hs, + '-end' => $he, + '-strand' => $strand, + '-expect' => $evalue, + '-bits' => $bits, + '-source' => $algo, + '-seq_id' => $hit_name, + '-seqlength' => $hit_len, + '-primary' => $self->primary_tag )); + + if( defined $strand && ! defined $hframe && $hitfactor ) { + $hframe = ( $hs % 3 ) * $strand; + } elsif( ! defined $strand ) { + $hframe = 0; + } + + $self->frame($qframe,$hframe); + + if( ! defined $query_len || ! defined $hit_len ) { + $self->throw("Must defined hit and query length"); + } + + if( ! defined $identical ) { + $self->warn("Did not defined the number of identical matches in the HSP assuming 0"); + $identical = 0; + } + if( ! defined $conserved ) { + $self->warn("Did not defined the number of conserved matches in the HSP assuming conserved == identical ($identical)") + if( $algo !~ /^((FAST|BLAST)N)|Exonerate/oi); + $conserved = $identical; + } + # protect for divide by zero if user does not specify + # hsp_len, query_len, or hit_len + + $self->num_identical($identical); + $self->num_conserved($conserved); + + if( $hsp_len ) { + $self->length('total', $hsp_len); + $self->frac_identical( 'total', $identical / $self->length('total')); + $self->frac_conserved( 'total', $conserved / $self->length('total')); + } + if( $hit_len ) { +# $self->length('hit', $self->hit->length); + $self->frac_identical( 'hit', $identical / $self->length('hit')); + $self->frac_conserved( 'hit', $conserved / $self->length('hit')); + } + if( $query_len ) { +# $self->length('query', $self->query->length); + $self->frac_identical( 'query', $identical / $self->length('query')) ; + $self->frac_conserved( 'query', $conserved / $self->length('query')); + } + $self->query_string($query_seq); + $self->hit_string($hit_seq); + $self->homology_string($homology_seq); + + if( defined $query_gaps ) { + $self->gaps('query', $query_gaps); + } elsif( defined $query_seq ) { + $self->gaps('query', scalar ( $query_seq =~ tr/\-//)); + } + if( defined $hit_gaps ) { + $self->gaps('hit', $hit_gaps); + } elsif( defined $hit_seq ) { + $self->gaps('hit', scalar ( $hit_seq =~ tr/\-//)); + } + if( ! defined $gaps ) { + $gaps = $self->gaps("query") + $self->gaps("hit"); + } + $self->gaps('total', $gaps); + $self->percent_identity($identical / $hsp_len ) if( $hsp_len > 0 ); + + $rank && $self->rank($rank); + return $self; +} + + + +=head2 Bio::Search::HSP::HSPI methods + +Implementation of Bio::Search::HSP::HSPI methods follow + +=head2 algorithm + + Title : algorithm + Usage : my $r_type = $hsp->algorithm + Function: Obtain the name of the algorithm used to obtain the HSP + Returns : string (e.g., BLASTP) + Args : [optional] scalar string to set value + +=cut + +sub algorithm{ + my ($self,$value) = @_; + my $previous = $self->{'_algorithm'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_algorithm'} = $value; + } + + return $previous; +} + +=head2 pvalue + + Title : pvalue + Usage : my $pvalue = $hsp->pvalue(); + Function: Returns the P-value for this HSP or undef + Returns : float or exponential (2e-10) + P-value is not defined with NCBI Blast2 reports. + Args : [optional] numeric to set value + +=cut + +sub pvalue { + my ($self,$value) = @_; + my $previous = $self->{'_pvalue'}; + if( defined $value ) { + $self->{'_pvalue'} = $value; + } + return $previous; +} + +=head2 evalue + + Title : evalue + Usage : my $evalue = $hsp->evalue(); + Function: Returns the e-value for this HSP + Returns : float or exponential (2e-10) + Args : [optional] numeric to set value + +=cut + +sub evalue { shift->significance(@_) } + +=head2 frac_identical + + Title : frac_identical + Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); + Function: Returns the fraction of identitical positions for this HSP + Returns : Float in range 0.0 -> 1.0 + Args : arg 1: 'query' = num identical / length of query seq (without gaps) + 'hit' = num identical / length of hit seq (without gaps) + 'total' = num identical / length of alignment (with gaps) + default = 'total' + arg 2: [optional] frac identical value to set for the type requested + +=cut + +sub frac_identical { + my ($self, $type,$value) = @_; + + $type = lc $type if defined $type; + $type = 'total' if( ! defined $type || + $type !~ /query|hit|total/); + my $previous = $self->{'_frac_identical'}->{$type}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + if( $type eq 'hit' || $type eq 'query' ) { + $self->$type()->frac_identical( $value); + } + $self->{'_frac_identical'}->{$type} = $value; + } + return $previous; + +} + +=head2 frac_conserved + + Title : frac_conserved + Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); + Function : Returns the fraction of conserved positions for this HSP. + This is the fraction of symbols in the alignment with a + positive score. + Returns : Float in range 0.0 -> 1.0 + Args : arg 1: 'query' = num conserved / length of query seq (without gaps) + 'hit' = num conserved / length of hit seq (without gaps) + 'total' = num conserved / length of alignment (with gaps) + default = 'total' + arg 2: [optional] frac conserved value to set for the type requested + +=cut + +sub frac_conserved { + my ($self, $type,$value) = @_; + $type = lc $type if defined $type; + $type = 'total' if( ! defined $type || + $type !~ /query|hit|total/); + my $previous = $self->{'_frac_conserved'}->{$type}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_frac_conserved'}->{$type} = $value; + } + return $previous; +} + +=head2 gaps + + Title : gaps + Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); + Function : Get the number of gaps in the query, hit, or total alignment. + Returns : Integer, number of gaps or 0 if none + Args : arg 1: 'query' = num gaps in query seq + 'hit' = num gaps in hit seq + 'total' = num gaps in whole alignment + default = 'total' + arg 2: [optional] integer gap value to set for the type requested + +=cut + +sub gaps { + my ($self, $type,$value) = @_; + $type = lc $type if defined $type; + $type = 'total' if( ! defined $type || + $type !~ /query|hit|subject|sbjct|total/); + $type = 'hit' if $type =~ /sbjct|subject/; + my $previous = $self->{'_gaps'}->{$type}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_gaps'}->{$type} = $value; + } + return $previous || 0; +} + +=head2 query_string + + Title : query_string + Usage : my $qseq = $hsp->query_string; + Function: Retrieves the query sequence of this HSP as a string + Returns : string + Args : [optional] string to set for query sequence + + +=cut + +sub query_string{ + my ($self,$value) = @_; + my $previous = $self->{'_query_string'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_query_string'} = $value; + # do some housekeeping so we know when to + # re-run _calculate_seq_positions + $self->{'_sequenceschanged'} = 1; + } + return $previous; +} + +=head2 hit_string + + Title : hit_string + Usage : my $hseq = $hsp->hit_string; + Function: Retrieves the hit sequence of this HSP as a string + Returns : string + Args : [optional] string to set for hit sequence + + +=cut + +sub hit_string{ + my ($self,$value) = @_; + my $previous = $self->{'_hit_string'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_hit_string'} = $value; + # do some housekeeping so we know when to + # re-run _calculate_seq_positions + $self->{'_sequenceschanged'} = 1; + } + return $previous; +} + +=head2 homology_string + + Title : homology_string + Usage : my $homo_string = $hsp->homology_string; + Function: Retrieves the homology sequence for this HSP as a string. + : The homology sequence is the string of symbols in between the + : query and hit sequences in the alignment indicating the degree + : of conservation (e.g., identical, similar, not similar). + Returns : string + Args : [optional] string to set for homology sequence + +=cut + +sub homology_string{ + my ($self,$value) = @_; + my $previous = $self->{'_homology_string'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_homology_string'} = $value; + # do some housekeeping so we know when to + # re-run _calculate_seq_positions + $self->{'_sequenceschanged'} = 1; + } + return $previous; +} + +=head2 length + + Title : length + Usage : my $len = $hsp->length( ['query'|'hit'|'total'] ); + Function : Returns the length of the query or hit in the alignment + (without gaps) + or the aggregate length of the HSP (including gaps; + this may be greater than either hit or query ) + Returns : integer + Args : arg 1: 'query' = length of query seq (without gaps) + 'hit' = length of hit seq (without gaps) + 'total' = length of alignment (with gaps) + default = 'total' + arg 2: [optional] integer length value to set for specific type + +=cut + +sub length { + + my $self = shift; + my $type = shift; + + $type = 'total' unless defined $type; + $type = lc $type; + + if( $type =~ /^q/i ) { + return $self->query()->length(shift); + } elsif( $type =~ /^(hit|subject|sbjct)/ ) { + return $self->hit()->length(shift); + } else { + my $v = shift; + if( defined $v ) { + $self->{'_hsplength'} = $v; + } + return $self->{'_hsplength'}; + } + return 0; # should never get here +} + +=head2 hsp_length + + Title : hsp_length + Usage : my $len = $hsp->hsp_length() + Function: shortcut length('hsp') + Returns : floating point between 0 and 100 + Args : none + +=cut + +sub hsp_length { return shift->length('hsp', shift); } + +=head2 percent_identity + + Title : percent_identity + Usage : my $percentid = $hsp->percent_identity() + Function: Returns the calculated percent identity for an HSP + Returns : floating point between 0 and 100 + Args : none + + +=cut + + +=head2 frame + + Title : frame + Usage : $hsp->frame($queryframe,$subjectframe) + Function: Set the Frame for both query and subject and insure that + they agree. + This overrides the frame() method implementation in + FeaturePair. + Returns : array of query and subjects if return type wants an array + or query frame if defined or subject frame + Args : none + Note : Frames are stored in the GFF way (0-2) not 1-3 + as they are in BLAST (negative frames are deduced by checking + the strand of the query or hit) + +=cut + +sub frame { + my ($self, $qframe, $sframe) = @_; + if( defined $qframe ) { + if( $qframe == 0 ) { + $qframe = 0; + } elsif( $qframe !~ /^([+-])?([1-3])/ ) { + $self->warn("Specifying an invalid query frame ($qframe)"); + $qframe = undef; + } else { + my $dir = $1; + $dir = '+' unless defined $dir; + if( ($dir eq '-' && $self->query->strand >= 0) || + ($dir eq '+' && $self->query->strand <= 0) ) { + $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")"); + } + # Set frame to GFF [0-2] - + # what if someone tries to put in a GFF frame! + $qframe = $2 - 1; + } + $self->query->frame($qframe); + } + if( defined $sframe ) { + if( $sframe == 0 ) { + $sframe = 0; + } elsif( $sframe !~ /^([+-])?([1-3])/ ) { + $self->warn("Specifying an invalid subject frame ($sframe)"); + $sframe = undef; + } else { + my $dir = $1; + $dir = '+' unless defined $dir; + if( ($dir eq '-' && $self->hit->strand >= 0) || + ($dir eq '+' && $self->hit->strand <= 0) ) + { + $self->warn("Subject frame ($sframe) did not match strand of subject (". $self->hit->strand() . ")"); + } + + # Set frame to GFF [0-2] + $sframe = $2 - 1; + } + $self->hit->frame($sframe); + } + if (wantarray() && $self->algorithm =~ /^T(BLAST|FAST)(X|Y|XY)/oi) + { + return ($self->query->frame(), $self->hit->frame()); + } elsif (wantarray()) { + ($self->query->frame() && + return ($self->query->frame(), undef)) || + ($self->hit->frame() && + return (undef, $self->hit->frame())); + } else { + ($self->query->frame() && + return $self->query->frame()) || + ($self->hit->frame() && + return $self->hit->frame()); + } +} + + +=head2 get_aln + + Title : get_aln + Usage : my $aln = $hsp->gel_aln + Function: Returns a Bio::SimpleAlign representing the HSP alignment + Returns : Bio::SimpleAlign + Args : none + +=cut + +sub get_aln { + my ($self) = @_; + require Bio::LocatableSeq; + require Bio::SimpleAlign; + my $aln = new Bio::SimpleAlign; + my $hs = $self->hit_string(); + my $qs = $self->query_string(); + # FASTA specific stuff moved to the FastaHSP object + my $seqonly = $qs; + $seqonly =~ s/[\-\s]//g; + my ($q_nm,$s_nm) = ($self->query->seq_id(), + $self->hit->seq_id()); + unless( defined $q_nm && CORE::length ($q_nm) ) { + $q_nm = 'query'; + } + unless( defined $s_nm && CORE::length ($s_nm) ) { + $s_nm = 'hit'; + } + my $query = new Bio::LocatableSeq('-seq' => $qs, + '-id' => $q_nm, + '-start' => 1, + '-end' => CORE::length($seqonly), + ); + $seqonly = $hs; + $seqonly =~ s/[\-\s]//g; + my $hit = new Bio::LocatableSeq('-seq' => $hs, + '-id' => $s_nm, + '-start' => 1, + '-end' => CORE::length($seqonly), + ); + $aln->add_seq($query); + $aln->add_seq($hit); + return $aln; +} + +=head2 num_conserved + + Title : num_conserved + Usage : $obj->num_conserved($newval) + Function: returns the number of conserved residues in the alignment + Returns : inetger + Args : integer (optional) + + +=cut + +sub num_conserved{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'num_conserved'} = $value; + } + return $self->{'num_conserved'}; +} + +=head2 num_identical + + Title : num_identical + Usage : $obj->num_identical($newval) + Function: returns the number of identical residues in the alignment + Returns : integer + Args : integer (optional) + + +=cut + +sub num_identical{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_num_identical'} = $value; + } + return $self->{'_num_identical'}; +} + +=head2 rank + + Usage : $hsp->rank( [string] ); + Purpose : Get the rank of the HSP within a given Blast hit. + Example : $rank = $hsp->rank; + Returns : Integer (1..n) corresponding to the order in which the HSP + appears in the BLAST report. + +=cut + +sub rank { + my ($self,$value) = @_; + if( defined $value) { + $self->{'_rank'} = $value; + } + return $self->{'_rank'}; +} + + +=head2 seq_inds + + Title : seq_inds + Purpose : Get a list of residue positions (indices) for all identical + : or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hsp->seq_inds('query', 'identical'); + : @h_ind = $hsp->seq_inds('hit', 'conserved'); + @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical'); + : @h_ind = $hsp->seq_inds('hit', 'conserved', 1); + Returns : List of integers + : May include ranges if collapse is true. + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query) + : ('sbjct' is synonymous with 'hit') + : class = 'identical' or 'conserved' or 'nomatch' or 'gap' + : (default = identical) + : (can be shortened to 'id' or 'cons') + : or 'conserved-not-identical' + : collapse = boolean, if true, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + Comments : + +See Also : L, + L + +=cut + +sub seq_inds{ + my ($self, $seqType, $class, $collapse) = @_; + + # prepare the internal structures - this is cached so + # if the strings have not changed we're okay + $self->_calculate_seq_positions(); + + $seqType ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + $seqType = 'sbjct' if $seqType eq 'hit'; + my $t = lc(substr($seqType,0,1)); + if( $t eq 'q' ) { + $seqType = 'query'; + } elsif ( $t eq 's' || $t eq 'h' ) { + $seqType = 'sbjct'; + } else { + $self->warn("unknown seqtype $seqType using 'query'"); + $seqType = 'query'; + } + $t = lc(substr($class,0,1)); + + if( $t eq 'c' ) { + if( $class =~ /conserved\-not\-identical/ ) { + $class = 'conserved'; + } else { + $class = 'conservedall'; + } + } elsif( $t eq 'i' ) { + $class = 'identical'; + } elsif( $t eq 'n' ) { + $class = 'nomatch'; + } elsif( $t eq 'g' ) { + $class = 'gap'; + } else { + $self->warn("unknown sequence class $class using 'identical'"); + $class = 'identical'; + } + + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + $class = "_\L$class\E"; + my @ary; + + if( $class eq '_gap' ) { + # this means that we are remapping the gap length that is stored + # in the hash (for example $self->{'_gapRes_query'} ) + # so we'll return an array which has the values of the position of the + # of the gap (the key in the hash) + the gap length (value in the + # hash for this key - 1. + + @ary = map { $_ > 1 ? + $_..($_ + $self->{"${class}Res$seqType"}->{$_} - 1) : + $_ } + sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}}; + } elsif( $class eq '_conservedall' ) { + @ary = sort { $a <=> $b } + keys %{ $self->{"_conservedRes$seqType"}}, + keys %{ $self->{"_identicalRes$seqType"}}, + } else { + @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}}; + } + require Bio::Search::BlastUtils if $collapse; + + return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary; +} + + +=head2 Inherited from Bio::SeqFeature::SimilarityPair + +These methods come from Bio::SeqFeature::SimilarityPair + +=head2 query + + Title : query + Usage : my $query = $hsp->query + Function: Returns a SeqFeature representing the query in the HSP + Returns : Bio::SeqFeature::Similarity + Args : [optional] new value to set + + +=head2 hit + + Title : hit + Usage : my $hit = $hsp->hit + Function: Returns a SeqFeature representing the hit in the HSP + Returns : Bio::SeqFeature::Similarity + Args : [optional] new value to set + + +=head2 significance + + Title : significance + Usage : $evalue = $obj->significance(); + $obj->significance($evalue); + Function: Get/Set the significance value + Returns : numeric + Args : [optional] new value to set + + +=head2 score + + Title : score + Usage : my $score = $hsp->score(); + Function: Returns the score for this HSP or undef + Returns : numeric + Args : [optional] numeric to set value + +=cut + +# overriding + +sub score { + my ($self,$value) = @_; + my $previous = $self->{'_score'}; + if( defined $value ) { + $self->{'_score'} = $value; + } + return $previous; +} + +=head2 bits + + Title : bits + Usage : my $bits = $hsp->bits(); + Function: Returns the bit value for this HSP or undef + Returns : numeric + Args : none + +=cut + +# overriding + +sub bits { + my ($self,$value) = @_; + my $previous = $self->{'_bits'}; + if( defined $value ) { + $self->{'_bits'} = $value; + } + return $previous; +} + + +=head2 strand + + Title : strand + Usage : $hsp->strand('quer') + Function: Retrieves the strand for the HSP component requested + Returns : +1 or -1 (0 if unknown) + Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject + 'query' to retrieve the query strand (default) + +=cut + +=head1 Private methods + +=cut + +=head2 _calculate_seq_positions + + Title : _calculate_seq_positions + Usage : $self->_calculate_seq_positions + Function: Internal function + Returns : + Args : + + +=cut + +sub _calculate_seq_positions { + my ($self,@args) = @_; + return unless ( $self->{'_sequenceschanged'} ); + $self->{'_sequenceschanged'} = 0; + my ($mchar, $schar, $qchar); + my ($seqString, $qseq,$sseq) = ( $self->homology_string(), + $self->query_string(), + $self->hit_string() ); + + # Using hashes to avoid saving duplicate residue numbers. + my %identicalList_query = (); + my %identicalList_sbjct = (); + my %conservedList_query = (); + my %conservedList_sbjct = (); + + my %gapList_query = (); + my %gapList_sbjct = (); + my %nomatchList_query = (); + my %nomatchList_sbjct = (); + + my $qdir = $self->query->strand || 1; + my $sdir = $self->hit->strand || 1; + my $resCount_query = ($qdir >=0) ? $self->query->end : $self->query->start; + my $resCount_sbjct = ($sdir >=0) ? $self->hit->end : $self->hit->start; + + my $prog = $self->algorithm; + if( $prog =~ /FAST/i ) { + # fasta reports some extra 'regional' sequence information + # we need to clear out first + # this seemed a bit insane to me at first, but it appears to + # work --jason + + # we infer the end of the regional sequence where the first + # non space is in the homology string + # then we use the HSP->length to tell us how far to read + # to cut off the end of the sequence + + # one possible problem is the sequence which + + my ($start) = (0); + if( $seqString =~ /^(\s+)/ ) { + $start = CORE::length($1); + } + + $seqString = substr($seqString, $start,$self->length('total')); + $qseq = substr($qseq, $start,$self->length('total')); + $sseq = substr($sseq, $start,$self->length('total')); + + $qseq =~ s![\\\/]!!g; + $sseq =~ s![\\\/]!!g; + } + + if($prog =~ /^(PSI)?T(BLAST|FAST)N/oi ) { + $resCount_sbjct = int($resCount_sbjct / 3); + } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) { + $resCount_query = int($resCount_query / 3); + } elsif($prog =~ /^T(BLAST|FAST)(X|Y|XY)/oi ) { + $resCount_query = int($resCount_query / 3); + $resCount_sbjct = int($resCount_sbjct / 3); + } + while( $mchar = chop($seqString) ) { + ($qchar, $schar) = (chop($qseq), chop($sseq)); + if( $mchar eq '+' || $mchar eq '.' || $mchar eq ':' ) { + $conservedList_query{ $resCount_query } = 1; + $conservedList_sbjct{ $resCount_sbjct } = 1; + } elsif( $mchar ne ' ' ) { + $identicalList_query{ $resCount_query } = 1; + $identicalList_sbjct{ $resCount_sbjct } = 1; + } elsif( $mchar eq ' ') { + $nomatchList_query{ $resCount_query } = 1; + $nomatchList_sbjct{ $resCount_sbjct } = 1; + } + if( $qchar eq $GAP_SYMBOL ) { + $gapList_query{ $resCount_query } ++; + } else { + $resCount_query -= $qdir; + } + if( $schar eq $GAP_SYMBOL ) { + $gapList_sbjct{ $resCount_query } ++; + } else { + $resCount_sbjct -=$sdir; + } + } + $self->{'_identicalRes_query'} = \%identicalList_query; + $self->{'_conservedRes_query'} = \%conservedList_query; + $self->{'_nomatchRes_query'} = \%nomatchList_query; + $self->{'_gapRes_query'} = \%gapList_query; + + $self->{'_identicalRes_sbjct'} = \%identicalList_sbjct; + $self->{'_conservedRes_sbjct'} = \%conservedList_sbjct; + $self->{'_nomatchRes_sbjct'} = \%nomatchList_sbjct; + $self->{'_gapRes_sbjct'} = \%gapList_sbjct; + return 1; +} + +=head2 n + +See documentation in L + +=cut + +#----- +sub n { + my $self = shift; + if(@_) { $self->{'_n'} = shift; } + defined $self->{'_n'} ? $self->{'_n'} : ''; +} + +=head2 range + +See documentation in L + +=cut + +#---------- +sub range { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + my ($start, $end); + if( $seqType eq 'query' ) { + $start = $self->query->start; + $end = $self->query->end; + } + else { + $start = $self->hit->start; + $end = $self->hit->end; + } + return ($start, $end); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/HMMERHSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/HMMERHSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,394 @@ +# $Id: HMMERHSP.pm,v 1.3 2002/10/22 07:45:17 lapp Exp $ +# +# BioPerl module for Bio::Search::HSP::HMMERHSP +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::HSP::HMMERHSP - A HSP object for HMMER results + +=head1 SYNOPSIS + + use Bio::Search::HSP::HMMERHSP; + # us it just like a Bio::Search::HSP::GenericHSP object + +=head1 DESCRIPTION + +This object is a specialization of L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::HSP::HMMERHSP; +use vars qw(@ISA); +use strict; +use Bio::Search::HSP::GenericHSP; + +@ISA = qw(Bio::Search::HSP::GenericHSP); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::HSP::HMMERHSP(); + Function: Builds a new Bio::Search::HSP::HMMERHSP object + Returns : Bio::Search::HSP::HMMERHSP + Args : + +Plus Bio::Seach::HSP::GenericHSP methods + + -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc) + -evalue => evalue + -pvalue => pvalue + -bits => bit value for HSP + -score => score value for HSP (typically z-score but depends on + analysis) + -hsp_length=> Length of the HSP (including gaps) + -identical => # of residues that that matched identically + -conserved => # of residues that matched conservatively + (only protein comparisions; + conserved == identical in nucleotide comparisons) + -hsp_gaps => # of gaps in the HSP + -query_gaps => # of gaps in the query in the alignment + -hit_gaps => # of gaps in the subject in the alignment + -query_name => HSP Query sequence name (if available) + -query_start => HSP Query start (in original query sequence coords) + -query_end => HSP Query end (in original query sequence coords) + -hit_name => HSP Hit sequence name (if available) + -hit_start => HSP Hit start (in original hit sequence coords) + -hit_end => HSP Hit end (in original hit sequence coords) + -hit_length => total length of the hit sequence + -query_length=> total length of the query sequence + -query_seq => query sequence portion of the HSP + -hit_seq => hit sequence portion of the HSP + -homology_seq=> homology sequence for the HSP + -hit_frame => hit frame (only if hit is translated protein) + -query_frame => query frame (only if query is translated protein) + +=cut + +=head2 Bio::Search::HSP::HSPI methods + +Implementation of Bio::Search::HSP::HSPI methods follow + +=head2 algorithm + + Title : algorithm + Usage : my $r_type = $hsp->algorithm + Function: Obtain the name of the algorithm used to obtain the HSP + Returns : string (e.g., BLASTP) + Args : [optional] scalar string to set value + +=cut + +=head2 pvalue + + Title : pvalue + Usage : my $pvalue = $hsp->pvalue(); + Function: Returns the P-value for this HSP or undef + Returns : float or exponential (2e-10) + P-value is not defined with NCBI Blast2 reports. + Args : [optional] numeric to set value + +=cut + +=head2 evalue + + Title : evalue + Usage : my $evalue = $hsp->evalue(); + Function: Returns the e-value for this HSP + Returns : float or exponential (2e-10) + Args : [optional] numeric to set value + +=cut + +=head2 frac_identical + + Title : frac_identical + Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); + Function: Returns the fraction of identitical positions for this HSP + Returns : Float in range 0.0 -> 1.0 + Args : arg 1: 'query' = num identical / length of query seq (without gaps) + 'hit' = num identical / length of hit seq (without gaps) + 'total' = num identical / length of alignment (with gaps) + default = 'total' + arg 2: [optional] frac identical value to set for the type requested + +=cut + +=head2 frac_conserved + + Title : frac_conserved + Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); + Function : Returns the fraction of conserved positions for this HSP. + This is the fraction of symbols in the alignment with a + positive score. + Returns : Float in range 0.0 -> 1.0 + Args : arg 1: 'query' = num conserved / length of query seq (without gaps) + 'hit' = num conserved / length of hit seq (without gaps) + 'total' = num conserved / length of alignment (with gaps) + default = 'total' + arg 2: [optional] frac conserved value to set for the type requested + +=cut + +=head2 gaps + + Title : gaps + Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); + Function : Get the number of gaps in the query, hit, or total alignment. + Returns : Integer, number of gaps or 0 if none + Args : arg 1: 'query' = num gaps in query seq + 'hit' = num gaps in hit seq + 'total' = num gaps in whole alignment + default = 'total' + arg 2: [optional] integer gap value to set for the type requested + +=cut + +=head2 query_string + + Title : query_string + Usage : my $qseq = $hsp->query_string; + Function: Retrieves the query sequence of this HSP as a string + Returns : string + Args : [optional] string to set for query sequence + + +=cut + +=head2 hit_string + + Title : hit_string + Usage : my $hseq = $hsp->hit_string; + Function: Retrieves the hit sequence of this HSP as a string + Returns : string + Args : [optional] string to set for hit sequence + + +=cut + + +=head2 homology_string + + Title : homology_string + Usage : my $homo_string = $hsp->homology_string; + Function: Retrieves the homology sequence for this HSP as a string. + : The homology sequence is the string of symbols in between the + : query and hit sequences in the alignment indicating the degree + : of conservation (e.g., identical, similar, not similar). + Returns : string + Args : [optional] string to set for homology sequence + +=cut + +=head2 length + + Title : length + Usage : my $len = $hsp->length( ['query'|'hit'|'total'] ); + Function : Returns the length of the query or hit in the alignment + (without gaps) + or the aggregate length of the HSP (including gaps; + this may be greater than either hit or query ) + Returns : integer + Args : arg 1: 'query' = length of query seq (without gaps) + 'hit' = length of hit seq (without gaps) + 'total' = length of alignment (with gaps) + default = 'total' + arg 2: [optional] integer length value to set for specific type + +=cut + +=head2 percent_identity + + Title : percent_identity + Usage : my $percentid = $hsp->percent_identity() + Function: Returns the calculated percent identity for an HSP + Returns : floating point between 0 and 100 + Args : none + + +=cut + + +=head2 frame + + Title : frame + Usage : $hsp->frame($queryframe,$subjectframe) + Function: Set the Frame for both query and subject and insure that + they agree. + This overrides the frame() method implementation in + FeaturePair. + Returns : array of query and subjects if return type wants an array + or query frame if defined or subject frame + Args : none + Note : Frames are stored in the GFF way (0-2) not 1-3 + as they are in BLAST (negative frames are deduced by checking + the strand of the query or hit) + +=cut + + +=head2 get_aln + + Title : get_aln + Usage : my $aln = $hsp->gel_aln + Function: Returns a Bio::SimpleAlign representing the HSP alignment + Returns : Bio::SimpleAlign + Args : none + +=cut + +sub get_aln { + my ($self) = shift; + $self->warn("Innapropriate to build a Bio::SimpleAlign from a HMMER HSP object"); + return undef; +} + +=head2 num_conserved + + Title : num_conserved + Usage : $obj->num_conserved($newval) + Function: returns the number of conserved residues in the alignment + Returns : inetger + Args : integer (optional) + + +=cut + +=head2 num_identical + + Title : num_identical + Usage : $obj->num_identical($newval) + Function: returns the number of identical residues in the alignment + Returns : integer + Args : integer (optional) + + +=cut + +=head2 seq_inds + + Title : seq_inds + Purpose : Get a list of residue positions (indices) for all identical + : or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hsp->seq_inds('query', 'identical'); + : @h_ind = $hsp->seq_inds('hit', 'conserved'); + : @h_ind = $hsp->seq_inds('hit', 'conserved', 1); + Returns : List of integers + : May include ranges if collapse is true. + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query) + : ('sbjct' is synonymous with 'hit') + : class = 'identical' or 'conserved' or 'nomatch' or 'gap' + : (default = identical) + : (can be shortened to 'id' or 'cons') + : + : collapse = boolean, if true, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + Comments : + +See Also : L, L + +=cut + +=head2 Inherited from Bio::SeqFeature::SimilarityPair + +These methods come from Bio::SeqFeature::SimilarityPair + +=head2 query + + Title : query + Usage : my $query = $hsp->query + Function: Returns a SeqFeature representing the query in the HSP + Returns : Bio::SeqFeature::Similarity + Args : [optional] new value to set + + +=head2 hit + + Title : hit + Usage : my $hit = $hsp->hit + Function: Returns a SeqFeature representing the hit in the HSP + Returns : Bio::SeqFeature::Similarity + Args : [optional] new value to set + + +=head2 significance + + Title : significance + Usage : $evalue = $obj->significance(); + $obj->significance($evalue); + Function: Get/Set the significance value + Returns : numeric + Args : [optional] new value to set + + +=head2 score + + Title : score + Usage : my $score = $hsp->score(); + Function: Returns the score for this HSP or undef + Returns : numeric + Args : [optional] numeric to set value + +=cut + +=head2 bits + + Title : bits + Usage : my $bits = $hsp->bits(); + Function: Returns the bit value for this HSP or undef + Returns : numeric + Args : none + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/HSPFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/HSPFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +# $Id: HSPFactory.pm,v 1.4 2002/10/22 07:45:17 lapp Exp $ +# +# BioPerl module for Bio::Search::HSP::HSPFactory +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::HSP::HSPFactory - A factory to create Bio::Search::HSP::HSPI objects + +=head1 SYNOPSIS + + use Bio::Search::HSP::HSPFactory; + my $factory = new Bio::Search::HSP::HSPFactory(); + my $resultobj = $factory->create(@args); + +=head1 DESCRIPTION + + +This is a general way of hiding the object creation process so that we +can dynamically change the objects that are created by the SearchIO +parser depending on what format report we are parsing. + +This object is for creating new HSPs. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::HSP::HSPFactory; +use vars qw(@ISA $DEFAULT_TYPE); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectFactoryI ); + +BEGIN { + $DEFAULT_TYPE = 'Bio::Search::HSP::GenericHSP'; +} + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::HSP::HSPFactory(); + Function: Builds a new Bio::Search::HSP::HSPFactory object + Returns : Bio::Search::HSP::HSPFactory + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($type) = $self->_rearrange([qw(TYPE)],@args); + $self->type($type) if defined $type; + return $self; +} + +=head2 create + + Title : create + Usage : $factory->create(%args) + Function: Create a new L object + Returns : L + Args : hash of initialization parameters + + +=cut + +sub create{ + my ($self,@args) = @_; + my $type = $self->type; + eval { $self->_load_module($type) }; + if( $@ ) { $self->throw("Unable to load module $type"); } + return $type->new(@args); +} + + +=head2 type + + Title : type + Usage : $factory->type('Bio::Search::HSP::GenericHSP'); + Function: Get/Set the HSP creation type + Returns : string + Args : [optional] string to set + +=cut + +sub type{ + my ($self,$type) = @_; + if( defined $type ) { + # redundancy with the create method which also calls _load_module + # I know - but this is not a highly called object so I am going + # to leave it in + eval {$self->_load_module($type) }; + if( $@ ){ $self->warn("Cannot find module $type, unable to set type") } + else { $self->{'_type'} = $type; } + } + return $self->{'_type'} || $DEFAULT_TYPE; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/HSPI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/HSPI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,701 @@ +#----------------------------------------------------------------- +# $Id: HSPI.pm,v 1.21.2.1 2003/01/22 22:51:00 jason Exp $ +# +# BioPerl module for Bio::Search::HSP::HSPI +# +# Cared for by Steve Chervitz +# and Jason Stajich +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::HSP::HSPI - Interface for a High Scoring Pair in a similarity search result + +=head1 SYNOPSIS + + $r_type = $hsp->algorithm + + $pvalue = $hsp->pvalue(); + + $evalue = $hsp->evalue(); + + $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); + + $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); + + $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); + + $qseq = $hsp->query_string; + + $hseq = $hsp->hit_string; + + $homology_string = $hsp->homology_string; + + $len = $hsp->length( ['query'|'hit'|'total'] ); + + $rank = $hsp->rank; + +=head1 DESCRIPTION + +Bio::Search::HSP::HSPI objects cannot be instantiated since this +module defines a pure interface. + +Given an object that implements the Bio::Search::HSP::HSPI interface, +you can do the following things with it: + +=head1 SEE ALSO + +This interface inherits methods from these other modules: + +L, +L +L + +Please refer to these modules for documentation of the +many additional inherited methods. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Steve Chervitz, Jason Stajich + +Email sac@bioperl.org +Email jason@bioperl.org + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz, Jason Stajich. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::HSP::HSPI; +use vars qw(@ISA); + +use Bio::Root::RootI; +use Bio::SeqFeature::SimilarityPair; + +use strict; +use Carp; + +@ISA = qw(Bio::SeqFeature::SimilarityPair Bio::Root::RootI); + + +=head2 algorithm + + Title : algorithm + Usage : my $r_type = $hsp->algorithm + Function: Obtain the name of the algorithm used to obtain the HSP + Returns : string (e.g., BLASTP) + Args : none + +=cut + +sub algorithm{ + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 pvalue + + Title : pvalue + Usage : my $pvalue = $hsp->pvalue(); + Function: Returns the P-value for this HSP or undef + Returns : float or exponential (2e-10) + P-value is not defined with NCBI Blast2 reports. + Args : none + +=cut + +sub pvalue { + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 evalue + + Title : evalue + Usage : my $evalue = $hsp->evalue(); + Function: Returns the e-value for this HSP + Returns : float or exponential (2e-10) + Args : none + +=cut + +sub evalue { + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 frac_identical + + Title : frac_identical + Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); + Function: Returns the fraction of identitical positions for this HSP + Returns : Float in range 0.0 -> 1.0 + Args : 'query' = num identical / length of query seq (without gaps) + 'hit' = num identical / length of hit seq (without gaps) + 'total' = num identical / length of alignment (with gaps) + default = 'total' + +=cut + +sub frac_identical { + my ($self, $type) = @_; + $self->throw_not_implemented; +} + +=head2 frac_conserved + + Title : frac_conserved + Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); + Function : Returns the fraction of conserved positions for this HSP. + This is the fraction of symbols in the alignment with a + positive score. + Returns : Float in range 0.0 -> 1.0 + Args : 'query' = num conserved / length of query seq (without gaps) + 'hit' = num conserved / length of hit seq (without gaps) + 'total' = num conserved / length of alignment (with gaps) + default = 'total' + +=cut + +sub frac_conserved { + my ($self, $type) = @_; + $self->throw_not_implemented; +} + +=head2 num_identical + + Title : num_identical + Usage : $obj->num_identical($newval) + Function: returns the number of identical residues in the alignment + Returns : integer + Args : integer (optional) + + +=cut + +sub num_identical{ + shift->throw_not_implemented; +} + +=head2 num_conserved + + Title : num_conserved + Usage : $obj->num_conserved($newval) + Function: returns the number of conserved residues in the alignment + Returns : inetger + Args : integer (optional) + + +=cut + +sub num_conserved{ + shift->throw_not_implemented(); +} + +=head2 gaps + + Title : gaps + Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); + Function : Get the number of gaps in the query, hit, or total alignment. + Returns : Integer, number of gaps or 0 if none + Args : 'query' = num conserved / length of query seq (without gaps) + 'hit' = num conserved / length of hit seq (without gaps) + 'total' = num conserved / length of alignment (with gaps) + default = 'total' + +=cut + +sub gaps { + my ($self, $type) = @_; + $self->throw_not_implemented; +} + +=head2 query_string + + Title : query_string + Usage : my $qseq = $hsp->query_string; + Function: Retrieves the query sequence of this HSP as a string + Returns : string + Args : none + + +=cut + +sub query_string{ + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 hit_string + + Title : hit_string + Usage : my $hseq = $hsp->hit_string; + Function: Retrieves the hit sequence of this HSP as a string + Returns : string + Args : none + + +=cut + +sub hit_string{ + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 homology_string + + Title : homology_string + Usage : my $homo_string = $hsp->homology_string; + Function: Retrieves the homology sequence for this HSP as a string. + : The homology sequence is the string of symbols in between the + : query and hit sequences in the alignment indicating the degree + : of conservation (e.g., identical, similar, not similar). + Returns : string + Args : none + +=cut + +sub homology_string{ + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 length + + Title : length + Usage : my $len = $hsp->length( ['query'|'hit'|'total'] ); + Function : Returns the length of the query or hit in the alignment (without gaps) + or the aggregate length of the HSP (including gaps; + this may be greater than either hit or query ) + Returns : integer + Args : 'query' = length of query seq (without gaps) + 'hit' = length of hit seq (without gaps) + 'total' = length of alignment (with gaps) + default = 'total' + Args : none + +=cut + +sub length{ + shift->throw_not_implemented(); +} + +=head2 percent_identity + + Title : percent_identity + Usage : my $percentid = $hsp->percent_identity() + Function: Returns the calculated percent identity for an HSP + Returns : floating point between 0 and 100 + Args : none + + +=cut + +sub percent_identity{ + my ($self) = @_; + return $self->frac_identical('hsp') * 100; +} + +=head2 get_aln + + Title : get_aln + Usage : my $aln = $hsp->gel_aln + Function: Returns a Bio::SimpleAlign representing the HSP alignment + Returns : Bio::SimpleAlign + Args : none + +=cut + +sub get_aln { + my ($self) = @_; + $self->throw_not_implemented; +} + + +=head2 seq_inds + + Title : seq_inds + Purpose : Get a list of residue positions (indices) for all identical + : or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hsp->seq_inds('query', 'identical'); + : @h_ind = $hsp->seq_inds('hit', 'conserved'); + : @h_ind = $hsp->seq_inds('hit', 'conserved', 1); + Returns : List of integers + : May include ranges if collapse is true. + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query) + ('sbjct' is synonymous with 'hit') + class = 'identical' or 'conserved' or 'nomatch' or 'gap' + (default = identical) + (can be shortened to 'id' or 'cons') + + collapse = boolean, if true, consecutive positions are merged + using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + collapses to "1-5 7 9-11". This is useful for + consolidating long lists. Default = no collapse. + Throws : n/a. + Comments : + +See Also : L, L + +=cut + +sub seq_inds { + shift->throw_not_implemented(); +} + +=head2 Inherited from Bio::SeqFeature::SimilarityPair + +These methods come from Bio::SeqFeature::SimilarityPair + +=head2 query + + Title : query + Usage : my $query = $hsp->query + Function: Returns a SeqFeature representing the query in the HSP + Returns : Bio::SeqFeature::Similarity + Args : [optional] new value to set + + +=head2 hit + + Title : hit + Usage : my $hit = $hsp->hit + Function: Returns a SeqFeature representing the hit in the HSP + Returns : Bio::SeqFeature::Similarity + Args : [optional] new value to set + + +=head2 significance + + Title : significance + Usage : $evalue = $obj->significance(); + $obj->significance($evalue); + Function: Get/Set the significance value (see Bio::SeqFeature::SimilarityPair) + Returns : significance value (scientific notation string) + Args : significance value (sci notation string) + + +=head2 score + + Title : score + Usage : my $score = $hsp->score(); + Function: Returns the score for this HSP or undef + Returns : numeric + Args : [optional] numeric to set value + +=head2 bits + + Title : bits + Usage : my $bits = $hsp->bits(); + Function: Returns the bit value for this HSP or undef + Returns : numeric + Args : none + +=cut + +# override + +=head2 strand + + Title : strand + Usage : $hsp->strand('query') + Function: Retrieves the strand for the HSP component requested + Returns : +1 or -1 (0 if unknown) + Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject + 'query' to retrieve the query strand (default) + 'list' or 'array' to retreive both query and hit together + +=cut + +sub strand { + my $self = shift; + my $val = shift; + $val = 'query' unless defined $val; + $val =~ s/^\s+//; + + if( $val =~ /^q/i ) { + return $self->query->strand(shift); + } elsif( $val =~ /^(hi|s)/i ) { + return $self->hit->strand(shift); + } elsif ( $val =~ m/^(list|array)/) { + return ($self->query->strand(shift), $self->hit->strand(shift)); + } else { + $self->warn("unrecognized component $val requested\n"); + } + return 0; +} + +=head2 start + + Title : start + Usage : $hsp->start('query') + Function: Retrieves the start for the HSP component requested + Returns : integer + Args : 'hit' or 'subject' or 'sbjct' to retrieve the start of the subject + 'query' to retrieve the query start (default) + +=cut + +sub start { + my $self = shift; + my $val = shift; + $val = 'query' unless defined $val; + $val =~ s/^\s+//; + + if( $val =~ /^q/i ) { + return $self->query->start(shift); + } elsif( $val =~ /^(hi|s)/i ) { + return $self->hit->start(shift); + } else { + $self->warn("unrecognized component $val requested\n"); + } + return 0; +} + +=head2 end + + Title : end + Usage : $hsp->end('query') + Function: Retrieves the end for the HSP component requested + Returns : integer + Args : 'hit' or 'subject' or 'sbjct' to retrieve the end of the subject + 'query' to retrieve the query end (default) + +=cut + +sub end { + my $self = shift; + my $val = shift; + $val = 'query' unless defined $val; + $val =~ s/^\s+//; + + if( $val =~ /^q/i ) { + return $self->query->end(shift); + } elsif( $val =~ /^(hi|s)/i ) { + return $self->hit->end(shift); + } else { + $self->warn("unrecognized component $val requested\n"); + } + return 0; +} + +=head2 seq_str + + Usage : $hsp->seq_str( seq_type ); + Purpose : Get the full query, sbjct, or 'match' sequence as a string. + : The 'match' sequence is the string of symbols in between the + : query and sbjct sequences. + Example : $str = $hsp->seq_str('query'); + Returns : String + Argument : seq_Type = 'query' or 'hit' or 'sbjct' or 'match' + : ('sbjct' is synonymous with 'hit') + : default is 'query' + Throws : Exception if the argument does not match an accepted seq_type. + Comments : + +See Also : L, L, B<_set_match_seq()> + +=cut + +sub seq_str { + my $self = shift; + my $type = shift; + if( $type =~ /^q/i ) { return $self->query_string(shift) } + elsif( $type =~ /^(s|hi)/i ) { return $self->hit_string(shift)} + elsif ( $type =~ /^(ho|ma)/i ) { return $self->hit_string(shift) } + else { + $self->warn("unknown sequence type $type"); + } + return ''; +} + + +=head2 rank + + Usage : $hsp->rank( [string] ); + Purpose : Get the rank of the HSP within a given Blast hit. + Example : $rank = $hsp->rank; + Returns : Integer (1..n) corresponding to the order in which the HSP + appears in the BLAST report. + +=cut + +sub rank { shift->throw_not_implemented } + +=head2 matches + + Usage : $hsp->matches([seq_type], [start], [stop]); + Purpose : Get the total number of identical and conservative matches + : in the query or sbjct sequence for the given HSP. Optionally can + : report data within a defined interval along the seq. + : (Note: 'conservative' matches are called 'positives' in the + : Blast report.) + Example : ($id,$cons) = $hsp_object->matches('hit'); + : ($id,$cons) = $hsp_object->matches('query',300,400); + Returns : 2-element array of integers + Argument : (1) seq_type = 'query' or 'hit' or 'sbjct' (default = query) + : ('sbjct' is synonymous with 'hit') + : (2) start = Starting coordinate (optional) + : (3) stop = Ending coordinate (optional) + Throws : Exception if the supplied coordinates are out of range. + Comments : Relies on seq_str('match') to get the string of alignment symbols + : between the query and sbjct lines which are used for determining + : the number of identical and conservative matches. + +See Also : L, L, L, L + +=cut + +#----------- +sub matches { +#----------- + my( $self, %param ) = @_; + my(@data); + my($seqType, $beg, $end) = ($param{-SEQ}, $param{-START}, $param{-STOP}); + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + if(!defined $beg && !defined $end) { + ## Get data for the whole alignment. + push @data, ($self->num_identical, $self->num_conserved); + } else { + ## Get the substring representing the desired sub-section of aln. + $beg ||= 0; + $end ||= 0; + my($start,$stop) = $self->range($seqType); + if($beg == 0) { $beg = $start; $end = $beg+$end; } + elsif($end == 0) { $end = $stop; $beg = $end-$beg; } + + if($end >= $stop) { $end = $stop; } ##ML changed from if (end >stop) + else { $end += 1;} ##ML moved from commented position below, makes + ##more sense here +# if($end > $stop) { $end = $stop; } + if($beg < $start) { $beg = $start; } +# else { $end += 1;} + +# my $seq = substr($self->seq_str('match'), $beg-$start, ($end-$beg)); + + ## ML: START fix for substr out of range error ------------------ + my $seq = ""; + if (($self->algorithm eq 'TBLASTN') and ($seqType eq 'sbjct')) + { + $seq = substr($self->seq_str('match'), + int(($beg-$start)/3), int(($end-$beg+1)/3)); + + } elsif (($self->algorithm eq 'BLASTX') and ($seqType eq 'query')) + { + $seq = substr($self->seq_str('match'), + int(($beg-$start)/3), int(($end-$beg+1)/3)); + } else { + $seq = substr($self->seq_str('match'), + $beg-$start, ($end-$beg)); + } + ## ML: End of fix for substr out of range error ----------------- + + + ## ML: debugging code + ## This is where we get our exception. Try printing out the values going + ## into this: + ## +# print STDERR +# qq(*------------MY EXCEPTION --------------------\nSeq: ") , +# $self->seq_str("$seqType"), qq("\n),$self->rank,",( index:"; +# print STDERR $beg-$start, ", len: ", $end-$beg," ), (HSPRealLen:", +# CORE::length $self->seq_str("$seqType"); +# print STDERR ", HSPCalcLen: ", $stop - $start +1 ," ), +# ( beg: $beg, end: $end ), ( start: $start, stop: stop )\n"; + ## ML: END DEBUGGING CODE---------- + + if(!CORE::length $seq) { + $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop"); + } + ## Get data for a substring. +# printf "Collecting HSP subsection data: beg,end = %d,%d; start,stop = %d,%d\n%s<---\n", $beg, $end, $start, $stop, $seq; +# printf "Original match seq:\n%s\n",$self->seq_str('match'); + $seq =~ s/ //g; # remove space (no info). + my $len_cons = CORE::length $seq; + $seq =~ s/\+//g; # remove '+' characters (conservative substitutions) + my $len_id = CORE::length $seq; + push @data, ($len_id, $len_cons); +# printf " HSP = %s\n id = %d; cons = %d\n", $self->rank, $len_id, $len_cons; ; + } + @data; +} + +=head2 n + + Usage : $hsp_obj->n() + Purpose : Get the N value (num HSPs on which P/Expect is based). + : This value is not defined with NCBI Blast2 with gapping. + Returns : Integer or null string if not defined. + Argument : n/a + Throws : n/a + Comments : The 'N' value is listed in parenthesis with P/Expect value: + : e.g., P(3) = 1.2e-30 ---> (N = 3). + : Not defined in NCBI Blast2 with gaps. + : This typically is equal to the number of HSPs but not always. + : To obtain the number of HSPs, use Bio::Search::Hit::HitI::num_hsps(). + +See Also : L + +=cut + +sub n { shift->throw_not_implemented } + +=head2 range + + Usage : $hsp->range( [seq_type] ); + Purpose : Gets the (start, end) coordinates for the query or sbjct sequence + : in the HSP alignment. + Example : ($query_beg, $query_end) = $hsp->range('query'); + : ($hit_beg, $hit_end) = $hsp->range('hit'); + Returns : Two-element array of integers + Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query') + : ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This is a convenience method for constructions such as + ($hsp->query->start, $hsp->query->end) + +=cut + +sub range { shift->throw_not_implemented } + + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/HSP/WABAHSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/HSP/WABAHSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,166 @@ +# $Id: WABAHSP.pm,v 1.5 2002/10/22 07:45:17 lapp Exp $ +# +# BioPerl module for Bio::Search::HSP::WABAHSP +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::HSP::WABAHSP - HSP object suitable for describing WABA alignments + +=head1 SYNOPSIS + +# use this object as you would a GenericHSP +# a few other methods have been added including state + +=head1 DESCRIPTION + +This object implements a few of the extra methods such as +hmmstate_string which returns the HMM state representation for the +WABA alignment. We also must implement a method to calculate +homology_string since it is not returned by the algorithm in the +machine readable format. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + + +package Bio::Search::HSP::WABAHSP; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; +use Bio::Search::HSP::GenericHSP; + +@ISA = qw(Bio::Search::HSP::GenericHSP ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::HSP::WABAHSP(); + Function: Builds a new Bio::Search::HSP::WABAHSP object + Returns : Bio::Search::HSP::WABAHSP + Args : -hmmstate_seq => the string representing the state output from WABA + +=cut + +sub new { + my($class,@args) = @_; + + # gotta do some preprocessing before we send the arguments to the superclass + my ($len,$qs,$hs) = Bio::Root::RootI->_rearrange([qw(HSP_LENGTH + QUERY_SEQ + HIT_SEQ)],@args); + if( $len != length($qs) ) { + Bio::Root::RootI->warn("HSP_LENGTH must equal length of query_seq string, using value from QUERY_SEQ\n"); + $len = length($qs); + } + my( $homol_seq,$gapct,$identical) = ('',0,0); + + for(my $i=0;$i<$len;$i++) { + my $q = substr($qs,$i,1); + my $h = substr($hs,$i,1); + if( $q eq '-' || $h eq '-' ) { + $homol_seq .= ' '; + $gapct ++; + } elsif( $q eq $h ) { + $homol_seq .= '|'; + $identical++; + } else { + $homol_seq .= ' '; + } + } + my $self = $class->SUPER::new('-conserved' => $identical, + '-identical' => $identical, + '-gaps' => $gapct, + '-homology_seq' => $homol_seq, + @args); + + my ($hmmst) = $self->_rearrange([qw(HMMSTATE_SEQ)],@args); + defined $hmmst && $self->hmmstate_string($hmmst); + + $self->add_tag_value('Target' , join(" ","Sequence:".$self->hit->seq_id, + $self->hit->start, $self->hit->end)); + + return $self; +} + +=head2 hmmstate_string + + Title : hmmstate_string + Usage : my $hmmseq = $wabahsp->hmmstate_string(); + Function: Get/Set the WABA HMM stateseq + Returns : string + Args : [optional] string + + +=cut + +sub hmmstate_string{ + my ($self,$val) = @_; + if( defined $val ) { + $self->{'_hmmstate_string'} = $val; + } + return $self->{'_hmmstate_string'}; +} + +=head2 homolgy_string + + Title : homolgy_string + Usage : my $homology_str = $hsp->homology_string(); + Function: Homology string must be calculated for a WABA HSP so we can do + so here and cache the result so it is only done once + Returns : string + Args : none + + +=cut + +sub homolgy_string{ + my ($self) = @_; + return ''; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Hit/BlastHit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Hit/BlastHit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2010 @@ +#----------------------------------------------------------------- +# $Id: BlastHit.pm,v 1.13 2002/10/22 09:36:19 sac Exp $ +# +# BioPerl module Bio::Search::Hit::BlastHit +# +# (This module was originally called Bio::Tools::Blast::Sbjct) +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +## POD Documentation: + +=head1 NAME + +Bio::Search::Hit::BlastHit - Bioperl BLAST Hit object + +=head1 SYNOPSIS + +The construction of BlastHit objects is performed by +Bio::SearchIO::blast::BlastHitFactory in a process that is +orchestrated by the Blast parser (B). +The resulting BlastHits are then accessed via +B). Therefore, you do not need to +use B) directly. If you need to construct +BlastHits directly, see the new() function for details. + +For B BLAST parsing usage examples, see the +B directory of the Bioperl distribution. + + +=head1 DESCRIPTION + +The Bio::Search::Hit::BlastHit.pm module encapsulates data and methods +for manipulating "hits" from a BLAST report. A BLAST hit is a +collection of HSPs along with other metadata such as sequence name +and score information. Hit objects are accessed via +B objects after parsing a BLAST report using +the B system. + +In Blast lingo, the "sbjct" sequences are all the sequences +in a target database which were compared against a "query" sequence. +The terms "sbjct" and "hit" will be used interchangeably in this module. +All methods that take 'sbjct' as an argument also support 'hit' as a +synonym. + +This module supports BLAST versions 1.x and 2.x, gapped and ungapped, +and PSI-BLAST. + + +=head2 HSP Tiling and Ambiguous Alignments + +If a Blast hit has more than one HSP, the Bio::Search::Hit::BlastHit.pm +object has the ability to merge overlapping HSPs into contiguous +blocks. This permits the BlastHit object to sum data across all HSPs +without counting data in the overlapping regions multiple times, which +would happen if data from each overlapping HSP are simply summed. HSP +tiling is performed automatically when methods of the BlastHit object +that rely on tiled data are invoked. These include +L, L, L, +L, L, +L, L. + +It also permits the assessment of an "ambiguous alignment" if the +query (or sbjct) sequences from different HSPs overlap +(see L). The existence +of an overlap could indicate a biologically interesting region in the +sequence, such as a repeated domain. The BlastHit object uses the +C<-OVERLAP> parameter to determine when two sequences overlap; if this is +set to 2 -- the default -- then any two sbjct or query HSP sequences +must overlap by more than two residues to get merged into the same +contig and counted as an overlap. See the L section below for +"issues" with HSP tiling. + + +The results of the HSP tiling is reported with the following ambiguity codes: + + 'q' = Query sequence contains multiple sub-sequences matching + a single region in the sbjct sequence. + + 's' = Subject (BlastHit) sequence contains multiple sub-sequences matching + a single region in the query sequence. + + 'qs' = Both query and sbjct sequences contain more than one + sub-sequence with similarity to the other sequence. + + +For addition information about ambiguous BLAST alignments, see +L and + + http://www-genome.stanford.edu/Sacch3D/help/ambig_aln.html + +=head1 DEPENDENCIES + +Bio::Search::Hit::BlastHit.pm is a concrete class that inherits from +B and B. and relies on +B. + + +=head1 BUGS + +One consequence of the HSP tiling is that methods that rely on HSP +tiling such as L, L, L +etc. may report misleading numbers when C<-OVERLAP> is set to a large +number. For example, say we have two HSPs and the query sequence tile +as follows: + + 1 8 22 30 40 60 + Full seq: ------------------------------------------------------------ + * ** * ** + HSP1: --------------- (6 identical matches) + ** ** ** + HSP2: ------------- (6 identical matches) + + +If C<-OVERLAP> is set to some number over 4, HSP1 and HSP2 will not be +tiled into a single contig and their numbers of identical matches will +be added, giving a total of 12, not 10 if they had be combined into +one contig. This can lead to number greater than 1.0 for methods +L and L. This is less of an issue +with gapped Blast since it tends to combine HSPs that would be listed +separately without gapping. (Fractions E1.0 can be viewed as a +signal for an interesting alignment that warrants further inspection, +thus turning this bug into a feature :-). + +Using large values for C<-OVERLAP> can lead to incorrect numbers +reported by methods that rely on HSP tiling but can be useful if you +care more about detecting ambiguous alignments. Setting C<-OVERLAP> +to zero will lead to the most accurate numbers for the +tiling-dependent methods but will be useless for detecting overlapping +HSPs since all HSPs will appear to overlap. + + +=head1 SEE ALSO + + Bio::Search::HSP::BlastHSP.pm - Blast HSP object. + Bio::Search::Result::BlastResult.pm - Blast Result object. + Bio::Search::Hit::HitI.pm - Interface implemented by BlastHit.pm + Bio::Root::Root.pm - Base class for BlastHit.pm + +Links: + + http://bio.perl.org/Core/POD/Search/Hit/Blast/BlastHSP.pm.html + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project + http://bio.perl.org/ - Bioperl Project Homepage + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 ACKNOWLEDGEMENTS + +This software was originally developed in the Department of Genetics +at Stanford University. I would also like to acknowledge my +colleagues at Affymetrix for useful feedback. + +=head1 COPYRIGHT + +Copyright (c) 1996-2001 Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Search::Hit::BlastHit; + +use strict; +use Bio::Search::Hit::HitI; +use Bio::Root::Root; +require Bio::Search::BlastUtils; +use vars qw( @ISA %SUMMARY_OFFSET $Revision); + +use overload + '""' => \&to_string; + +@ISA = qw( Bio::Root::Root Bio::Search::Hit::HitI ); + +$Revision = '$Id: BlastHit.pm,v 1.13 2002/10/22 09:36:19 sac Exp $'; #' + + +=head2 new + + Usage : $hit = Bio::Search::Hit::BlastHit->new( %named_params ); + : Bio::Search::Hit::BlastHit.pm objects are constructed + : automatically by Bio::SearchIO::BlastHitFactory.pm, + : so there is no need for direct instantiation. + Purpose : Constructs a new BlastHit object and Initializes key variables + : for the hit. + Returns : A Bio::Search::Hit::BlastHit object + Argument : Named Parameters: + : Parameter keys are case-insensitive. + : -RAW_DATA => array reference holding raw BLAST report data + : for a single hit. This includes all lines + : within the HSP alignment listing section of a + : traditional BLAST or PSI-BLAST (non-XML) report, + : starting at (or just after) the leading '>'. + : -HOLD_RAW_DATA => boolean, should -RAW_DATA be saved within the object. + : -QUERY_LEN => Length of the query sequence + : -ITERATION => integer (PSI-BLAST iteration number in which hit was found) + : -OVERLAP => integer (maximum overlap between adjacent + : HSPs when tiling) + : -PROGRAM => string (type of Blast: BLASTP, BLASTN, etc) + : -SIGNIF => significance + : -IS_PVAL => boolean, true if -SIGNIF contains a P-value + : -SCORE => raw BLAST score + : -FOUND_AGAIN => boolean, true if this was a hit from the + : section of a PSI-BLAST with iteration > 1 + : containing sequences that were also found + : in iteration 1. + Comments : This object accepts raw Blast report data not because it + : is required for parsing, but in order to retrieve it + : (only available if -HOLD_RAW_DATA is set to true). + +See Also : L, L + +=cut + +#------------------- +sub new { +#------------------- + my ($class, @args ) = @_; + my $self = $class->SUPER::new( @args ); + + my ($raw_data, $signif, $is_pval, $hold_raw); + + ($self->{'_blast_program'}, $self->{'_query_length'}, $raw_data, $hold_raw, + $self->{'_overlap'}, $self->{'_iteration'}, $signif, $is_pval, + $self->{'_score'}, $self->{'_found_again'} ) = + $self->_rearrange( [qw(PROGRAM + QUERY_LEN + RAW_DATA + HOLD_RAW_DATA + OVERLAP + ITERATION + SIGNIF + IS_PVAL + SCORE + FOUND_AGAIN )], @args ); + + # TODO: Handle this in parser. Just pass in name parameter. + $self->_set_id( $raw_data->[0] ); + + if($is_pval) { + $self->{'_p'} = $signif; + } else { + $self->{'_expect'} = $signif; + } + + if( $hold_raw ) { + $self->{'_hit_data'} = $raw_data; + } + + return $self; +} + +sub DESTROY { + my $self=shift; + #print STDERR "-->DESTROYING $self\n"; +} + + +#================================================= +# Begin Bio::Search::Hit::HitI implementation +#================================================= + +=head2 algorithm + + Title : algorithm + Usage : $alg = $hit->algorithm(); + Function: Gets the algorithm specification that was used to obtain the hit + For BLAST, the algorithm denotes what type of sequence was aligned + against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated + dna-prt, TBLASTN prt-translated dna, TBLASTX translated + dna-translated dna). + Returns : a scalar string + Args : none + +=cut + +#---------------- +sub algorithm { +#---------------- + my ($self,@args) = @_; + return $self->{'_blast_program'}; +} + +=head2 name + + Usage : $hit->name([string]); + Purpose : Set/Get a string to identify the hit. + Example : $name = $hit->name; + : $hit->name('M81707'); + Returns : String consisting of the hit's name or undef if not set. + Comments : The name is parsed out of the "Query=" line as the first chunk of + non-whitespace text. If you want the rest of the line, use + $hit->description(). + +See Also: L + +=cut + +#' + +#---------------- +sub name { +#---------------- + my $self = shift; + if (@_) { + my $name = shift; + $name =~ s/^\s+|(\s+|,)$//g; + $self->{'_name'} = $name; + } + return $self->{'_name'}; +} + +=head2 description + + Usage : $hit_object->description( [integer] ); + Purpose : Set/Get a description string for the hit. + This is parsed out of the "Query=" line as everything after + the first chunk of non-whitespace text. Use $hit->name() + to get the first chunk (the ID of the sequence). + Example : $description = $hit->description; + : $desc_60char = $hit->description(60); + Argument : Integer (optional) indicating the desired length of the + : description string to be returned. + Returns : String consisting of the hit's description or undef if not set. + +=cut + +#' + +#---------------- +sub description { +#---------------- + my( $self, $len ) = @_; + $len = (defined $len) ? $len : (CORE::length $self->{'_description'}); + return substr( $self->{'_description'}, 0 ,$len ); +} + +=head2 accession + + Title : accession + Usage : $acc = $hit->accession(); + Function: Retrieve the accession (if available) for the hit + Returns : a scalar string (empty string if not set) + Args : none + Comments: Accession numbers are extracted based on the assumption that they + are delimited by | characters (NCBI-style). If this is not the case, + use the name() method and parse it as necessary. + +See Also: L + +=cut + +#-------------------- +sub accession { +#-------------------- + my $self = shift; + if(@_) { $self->{'_accession'} = shift; } + $self->{'_accession'} || ''; +} + +=head2 raw_score + + Usage : $hit_object->raw_score(); + Purpose : Gets the BLAST score of the best HSP for the current Blast hit. + Example : $score = $hit_object->raw_score(); + Returns : Integer + Argument : n/a + Throws : n/a + +See Also : L + +=cut + +#---------- +sub raw_score { +#---------- + my $self = shift; + + # The check for $self->{'_score'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($score); + if(not defined($self->{'_score'})) { + $score = $self->hsp->score; + } else { + $score = $self->{'_score'}; + } + return $score; +} + + +=head2 length + + Usage : $hit_object->length(); + Purpose : Get the total length of the hit sequence. + Example : $len = $hit_object->length(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : Developer note: when using the built-in length function within + : this module, call it as CORE::length(). + +See Also : L, L + +=cut + +#----------- +sub length { +#----------- + my $self = shift; + return $self->{'_length'}; +} + +=head2 significance + +Equivalent to L + +=cut + +#---------------- +sub significance { shift->signif( @_ ); } +#---------------- + + +=head2 next_hsp + + Title : next_hsp + Usage : $hsp = $obj->next_hsp(); + Function : returns the next available High Scoring Pair object + Example : + Returns : Bio::Search::HSP::BlastHSP or undef if finished + Args : none + +=cut + +#---------------- +sub next_hsp { +#---------------- + my $self = shift; + + unless($self->{'_hsp_queue_started'}) { + $self->{'_hsp_queue'} = [$self->hsps()]; + $self->{'_hsp_queue_started'} = 1; + } + pop @{$self->{'_hsp_queue'}}; +} + +#================================================= +# End Bio::Search::Hit::HitI implementation +#================================================= + + +# Providing a more explicit method for getting name of hit +# (corresponds with column name in HitTableWriter) +#---------------- +sub hit_name { +#---------------- + my $self = shift; + $self->name( @_ ); +} + +# Older method Delegates to description() +#---------------- +sub desc { +#---------------- + my $self = shift; + return $self->description( @_ ); +} + +# Providing a more explicit method for getting description of hit +# (corresponds with column name in HitTableWriter) +#---------------- +sub hit_description { +#---------------- + my $self = shift; + return $self->description( @_ ); +} + +=head2 score + +Equivalent to L + +=cut + +#---------------- +sub score { shift->raw_score( @_ ); } +#---------------- + + +=head2 hit_length + +Equivalent to L + +=cut + +# Providing a more explicit method for getting length of hit +#---------------- +sub hit_length { shift->length( @_ ); } +#---------------- + + +=head2 signif + + Usage : $hit_object->signif( [format] ); + Purpose : Get the P or Expect value for the best HSP of the given BLAST hit. + : The value returned is the one which is reported in the description + : section of the Blast report. For Blast1 and WU-Blast2, this + : is a P-value, for Blast2, it is an Expect value. + Example : $obj->signif() # returns 1.3e-34 + : $obj->signif('exp') # returns -34 + : $obj->signif('parts') # returns (1.3, -34) + Returns : Float or scientific notation number (the raw P/Expect value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and P/Expect value + : is in scientific notation (see Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (see Comments). + Throws : n/a + Comments : The signif() method provides a way to deal with the fact that + : Blast1 and Blast2 formats (and WU- vs. NCBI-BLAST) differ in + : what is reported in the description lines of each hit in the + : Blast report. The signif() method frees any client code from + : having to know if this is a P-value or an Expect value, + : making it easier to write code that can process both + : Blast1 and Blast2 reports. This is not necessarily a good thing, + : since one should always know when one is working with P-values or + : Expect values (hence the deprecated status). + : Use of expect() is recommended since all hits will have an Expect value. + : + : Using the 'parts' argument is not recommended since it will not + : work as expected if the expect value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#------------- +sub signif { +#------------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my $val = defined($self->{'_p'}) ? $self->{'_p'} : $self->{'_expect'}; + + # $val can be zero. + defined($val) or $self->throw("Can't get P- or Expect value: HSPs may not have been set."); + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &Bio::Search::BlastUtils::get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw P/Expect-value. + return $val; +} + +#---------------- +sub raw_hit_data { +#---------------- + my $self = shift; + my $data = '>'; + # Need to add blank lines where we've removed them. + foreach( @{$self->{'_hit_data'}} ) { + if( $_ eq 'end') { + $data .= "\n"; + } + else { + $data .= /^\s*(Score|Query)/ ? "\n$_" : $_; + } + } + return $data; +} + + +#=head2 _set_length +# +# Usage : $hit_object->_set_length( "233" ); +# Purpose : Set the total length of the hit sequence. +# Example : $hit_object->_set_length( $len ); +# Returns : n/a +# Argument : Integer (only when setting). Any commas will be stripped out. +# Throws : n/a +# +#=cut + +#----------- +sub _set_length { +#----------- + my ($self, $len) = @_; + $len =~ s/,//g; # get rid of commas + $self->{'_length'} = $len; +} + +#=head2 _set_description +# +# Usage : Private method; called automatically during construction +# Purpose : Sets the description of the hit sequence. +# : For sequence without descriptions, does not set any description. +# Argument : Array containing description (multiple lines). +# Comments : Processes the supplied description: +# 1. Join all lines into one string. +# 2. Remove sequence id at the beginning of description. +# 3. Removes junk charactes at begin and end of description. +# +#=cut + +#-------------- +sub _set_description { +#-------------- + my( $self, @desc ) = @_; + my( $desc); + +# print STDERR "BlastHit: RAW DESC:\n@desc\n"; + + $desc = join(" ", @desc); + + my $name = $self->name; + + if($desc) { + $desc =~ s/^\s*\S+\s+//; # remove the sequence ID(s) + # This won't work if there's no description. + $desc =~ s/^\s*$name//; # ...but this should. + $desc =~ s/^[\s!]+//; + $desc =~ s/ \d+$//; + $desc =~ s/\.+$//; + $self->{'_description'} = $desc; + } + +# print STDERR "BlastHit: _set_description = $desc\n"; +} + +=head2 to_string + + Title : to_string + Usage : print $hit->to_string; + Function: Returns a string representation for the Blast Hit. + Primarily intended for debugging purposes. + Example : see usage + Returns : A string of the form: + [BlastHit] + e.g.: + [BlastHit] emb|Z46660|SC9725 S.cerevisiae chromosome XIII cosmid + Args : None + +=cut + +#---------------- +sub to_string { +#---------------- + my $self = shift; + return "[BlastHit] " . $self->name . " " . $self->description; +} + + +#=head2 _set_id +# +# Usage : Private method; automatically called by new() +# Purpose : Sets the name of the BlastHit sequence from the BLAST summary line. +# : The identifier is assumed to be the first +# : chunk of non-whitespace characters in the description line +# : Does not assume any semantics in the structure of the identifier +# : (Formerly, this method attempted to extract database name from +# : the seq identifiers, but this was prone to break). +# Returns : n/a +# Argument : String containing description line of the hit from Blast report +# : or first line of an alignment section (with or without the leading '>'). +# Throws : Warning if cannot locate sequence ID. +# +#See Also : L, L +# +#=cut + +#--------------- +sub _set_id { +#--------------- + my( $self, $desc ) = @_; + + # New strategy: Assume only that the ID is the first white space + # delimited chunk. Not attempting to extract accession & database name. + # Clients will have to interpret it as necessary. + if($desc =~ /^>?(\S+)\s*(.*)/) { + my ($name, $desc) = ($1, $2); + $self->name($name); + $self->{'_description'} = $desc; + # Note that this description comes from the summary section of the + # BLAST report and so may be truncated. The full description will be + # set from the alignment section. We're setting description here in case + # the alignment section isn't being parsed. + + # Assuming accession is delimited with | symbols (NCBI-style) + my @pieces = split(/\|/,$name); + my $acc = pop @pieces; + $self->accession( $acc ); + } + else { + $self->warn("Can't locate sequence identifier in summary line.", "Line = $desc"); + $desc = 'Unknown sequence ID' if not $desc; + $self->name($desc); + } +} + + +=head2 ambiguous_aln + + Usage : $ambig_code = $hit_object->ambiguous_aln(); + Purpose : Sets/Gets ambiguity code data member. + Example : (see usage) + Returns : String = 'q', 's', 'qs', '-' + : 'q' = query sequence contains overlapping sub-sequences + : while sbjct does not. + : 's' = sbjct sequence contains overlapping sub-sequences + : while query does not. + : 'qs' = query and sbjct sequence contains overlapping sub-sequences + : relative to each other. + : '-' = query and sbjct sequence do not contains multiple domains + : relative to each other OR both contain the same distribution + : of similar domains. + Argument : n/a + Throws : n/a + Status : Experimental + +See Also : L, L + +=cut + +#-------------------- +sub ambiguous_aln { +#-------------------- + my $self = shift; + if(@_) { $self->{'_ambiguous_aln'} = shift; } + $self->{'_ambiguous_aln'} || '-'; +} + + + +=head2 overlap + + Usage : $blast_object->overlap( [integer] ); + Purpose : Gets/Sets the allowable amount overlap between different HSP sequences. + Example : $blast_object->overlap(5); + : $overlap = $blast_object->overlap; + Returns : Integer. + Argument : integer. + Throws : n/a + Status : Experimental + Comments : Any two HSPs whose sequences overlap by less than or equal + : to the overlap() number of resides will be considered separate HSPs + : and will not get tiled by Bio::Search::BlastUtils::_adjust_contigs(). + +See Also : L, L + +=cut + +#------------- +sub overlap { +#------------- + my $self = shift; + if(@_) { $self->{'_overlap'} = shift; } + defined $self->{'_overlap'} ? $self->{'_overlap'} : 0; +} + + + + + + +=head2 bits + + Usage : $hit_object->bits(); + Purpose : Gets the BLAST bit score of the best HSP for the current Blast hit. + Example : $bits = $hit_object->bits(); + Returns : Integer + Argument : n/a + Throws : Exception if bit score is not set. + Comments : For BLAST1, the non-bit score is listed in the summary line. + +See Also : L + +=cut + +#--------- +sub bits { +#--------- + my $self = shift; + + # The check for $self->{'_bits'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($bits); + if(not defined($self->{'_bits'})) { + $bits = $self->hsp->bits; + } else { + $bits = $self->{'_bits'}; + } + return $bits; +} + + + +=head2 n + + Usage : $hit_object->n(); + Purpose : Gets the N number for the current Blast hit. + : This is the number of HSPs in the set which was ascribed + : the lowest P-value (listed on the description line). + : This number is not the same as the total number of HSPs. + : To get the total number of HSPs, use num_hsps(). + Example : $n = $hit_object->n(); + Returns : Integer + Argument : n/a + Throws : Exception if HSPs have not been set (BLAST2 reports). + Comments : Note that the N parameter is not reported in gapped BLAST2. + : Calling n() on such reports will result in a call to num_hsps(). + : The num_hsps() method will count the actual number of + : HSPs in the alignment listing, which may exceed N in + : some cases. + +See Also : L + +=cut + +#----- +sub n { +#----- + my $self = shift; + + # The check for $self->{'_n'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($n); + if(not defined($self->{'_n'})) { + $n = $self->hsp->n; + } else { + $n = $self->{'_n'}; + } + $n ||= $self->num_hsps; + + return $n; +} + + + +=head2 frame + + Usage : $hit_object->frame(); + Purpose : Gets the reading frame for the best HSP after HSP tiling. + : This is only valid for BLASTX and TBLASTN/X reports. + Example : $frame = $hit_object->frame(); + Returns : Integer (-2 .. +2) + Argument : n/a + Throws : Exception if HSPs have not been set (BLAST2 reports). + Comments : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : If you don't want the tiled data, iterate through each HSP + : calling frame() on each (use hsps() to get all HSPs). + +See Also : L + +=cut + +#----------' +sub frame { +#---------- + my $self = shift; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + # The check for $self->{'_frame'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($frame); + if(not defined($self->{'_frame'})) { + $frame = $self->hsp->frame; + } else { + $frame = $self->{'_frame'}; + } + return $frame; +} + + + + + +=head2 p + + Usage : $hit_object->p( [format] ); + Purpose : Get the P-value for the best HSP of the given BLAST hit. + : (Note that P-values are not provided with NCBI Blast2 reports). + Example : $p = $sbjct->p; + : $p = $sbjct->p('exp'); # get exponent only. + : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts + Returns : Float or scientific notation number (the raw P-value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and P-value + : is in scientific notation (See Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (See Comments). + Throws : Warns if no P-value is defined. Uses expect instead. + Comments : Using the 'parts' argument is not recommended since it will not + : work as expected if the P-value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#-------- +sub p { +#-------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my $val = $self->{'_p'}; + + # $val can be zero. + if(not defined $val) { + # P-value not defined, must be a NCBI Blast2 report. + # Use expect instead. + $self->warn( "P-value not defined. Using expect() instead."); + $val = $self->{'_expect'}; + } + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &Bio::Search::BlastUtils::get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw P-value. + return $val; +} + + + +=head2 expect + + Usage : $hit_object->expect( [format] ); + Purpose : Get the Expect value for the best HSP of the given BLAST hit. + Example : $e = $sbjct->expect; + : $e = $sbjct->expect('exp'); # get exponent only. + : ($num, $exp) = $sbjct->expect('parts'); # split sci notation into parts + Returns : Float or scientific notation number (the raw expect value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and Expect + : is in scientific notation (see Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (see Comments). + Throws : Exception if the Expect value is not defined. + Comments : Using the 'parts' argument is not recommended since it will not + : work as expected if the expect value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#----------- +sub expect { +#----------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my $val; + + # For Blast reports that list the P value on the description line, + # getting the expect value requires fully parsing the HSP data. + # For NCBI blast, there's no problem. + if(not defined($self->{'_expect'})) { + if( defined $self->{'_hsps'}) { + $self->{'_expect'} = $val = $self->hsp->expect; + } else { + # If _expect is not set and _hsps are not set, + # then this must be a P-value-based report that was + # run without setting the HSPs (shallow parsing). + $self->throw("Can't get expect value. HSPs have not been set."); + } + } else { + $val = $self->{'_expect'}; + } + + # $val can be zero. + defined($val) or $self->throw("Can't get Expect value."); + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &Bio::Search::BlastUtils::get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw Expect-value. + return $val; +} + + +=head2 hsps + + Usage : $hit_object->hsps(); + Purpose : Get a list containing all HSP objects. + : Get the numbers of HSPs for the current hit. + Example : @hsps = $hit_object->hsps(); + : $num = $hit_object->hsps(); # alternatively, use num_hsps() + Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects. + : Scalar context: integer (number of HSPs). + : (Equivalent to num_hsps()). + Argument : n/a. Relies on wantarray + Throws : Exception if the HSPs have not been collected. + +See Also : L, L + +=cut + +#--------- +sub hsps { +#--------- + my $self = shift; + + if (not ref $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + return wantarray + # returning list containing all HSPs. + ? @{$self->{'_hsps'}} + # returning number of HSPs. + : scalar(@{$self->{'_hsps'}}); +} + + + +=head2 hsp + + Usage : $hit_object->hsp( [string] ); + Purpose : Get a single BlastHSP.pm object for the present BlastHit.pm object. + Example : $hspObj = $hit_object->hsp; # same as 'best' + : $hspObj = $hit_object->hsp('best'); + : $hspObj = $hit_object->hsp('worst'); + Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object. + Argument : String (or no argument). + : No argument (default) = highest scoring HSP (same as 'best'). + : 'best' or 'first' = highest scoring HSP. + : 'worst' or 'last' = lowest scoring HSP. + Throws : Exception if the HSPs have not been collected. + : Exception if an unrecognized argument is used. + +See Also : L, L() + +=cut + +#---------- +sub hsp { +#---------- + my( $self, $option ) = @_; + $option ||= 'best'; + + if (not ref $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + my @hsps = @{$self->{'_hsps'}}; + + return $hsps[0] if $option =~ /best|first|1/i; + return $hsps[$#hsps] if $option =~ /worst|last/i; + + $self->throw("Can't get HSP for: $option\n" . + "Valid arguments: 'best', 'worst'"); +} + + + +=head2 num_hsps + + Usage : $hit_object->num_hsps(); + Purpose : Get the number of HSPs for the present Blast hit. + Example : $nhsps = $hit_object->num_hsps(); + Returns : Integer + Argument : n/a + Throws : Exception if the HSPs have not been collected. + +See Also : L + +=cut + +#------------- +sub num_hsps { +#------------- + my $self = shift; + + if (not defined $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + return scalar(@{$self->{'_hsps'}}); +} + + + +=head2 logical_length + + Usage : $hit_object->logical_length( [seq_type] ); + : (mostly intended for internal use). + Purpose : Get the logical length of the hit sequence. + : For query sequence of BLASTX and TBLASTX reports and the hit + : sequence of TBLASTN and TBLASTX reports, the returned length + : is the length of the would-be amino acid sequence (length/3). + : For all other BLAST flavors, this function is the same as length(). + Example : $len = $hit_object->logical_length(); + Returns : Integer + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This is important for functions like frac_aligned_query() + : which need to operate in amino acid coordinate space when dealing + : with T?BLASTX type reports. + +See Also : L, L, L + +=cut + +#-------------------- +sub logical_length { +#-------------------- + my $self = shift; + my $seqType = shift || 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + my $length; + + # For the sbjct, return logical sbjct length + if( $seqType eq 'sbjct' ) { + $length = $self->{'_logical_length'} || $self->{'_length'}; + } + else { + # Otherwise, return logical query length + $length = $self->{'_query_length'}; + + # Adjust length based on BLAST flavor. + if($self->{'_blast_program'} =~ /T?BLASTX/ ) { + $length /= 3; + } + } + return $length; +} + + +=head2 length_aln + + Usage : $hit_object->length_aln( [seq_type] ); + Purpose : Get the total length of the aligned region for query or sbjct seq. + : This number will include all HSPs + Example : $len = $hit_object->length_aln(); # default = query + : $lenAln = $hit_object->length_aln('query'); + Returns : Integer + Argument : seq_Type = 'query' or 'hit' or 'sbjct' (Default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : Exception if the argument is not recognized. + Comments : This method will report the logical length of the alignment, + : meaning that for TBLAST[NX] reports, the length is reported + : using amino acid coordinate space (i.e., nucleotides / 3). + : + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : If you don't want the tiled data, iterate through each HSP + : calling length() on each (use hsps() to get all HSPs). + +See Also : L, L, L, L, L, L + +=cut + +#---------------' +sub length_aln { +#--------------- + my( $self, $seqType ) = @_; + + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + my $data = $self->{'_length_aln_'.$seqType}; + + ## If we don't have data, figure out what went wrong. + if(!$data) { + $self->throw("Can't get length aln for sequence type \"$seqType\"" . + "Valid types are 'query', 'hit', 'sbjct' ('sbjct' = 'hit')"); + } + $data; +} + + +=head2 gaps + + Usage : $hit_object->gaps( [seq_type] ); + Purpose : Get the number of gaps in the aligned query, sbjct, or both sequences. + : Data is summed across all HSPs. + Example : $qgaps = $hit_object->gaps('query'); + : $hgaps = $hit_object->gaps('hit'); + : $tgaps = $hit_object->gaps(); # default = total (query + hit) + Returns : scalar context: integer + : array context without args: two-element list of integers + : (queryGaps, sbjctGaps) + : Array context can be forced by providing an argument of 'list' or 'array'. + : + : CAUTION: Calling this method within printf or sprintf is arrray context. + : So this function may not give you what you expect. For example: + : printf "Total gaps: %d", $hit->gaps(); + : Actually returns a two-element array, so what gets printed + : is the number of gaps in the query, not the total + : + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | 'list' (default = 'total') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through each HSP object. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : Not relying on wantarray since that will fail in situations + : such as printf "%d", $hit->gaps() in which you might expect to + : be printing the total gaps, but evaluates to array context. + +See Also : L + +=cut + +#---------- +sub gaps { +#---------- + my( $self, $seqType ) = @_; + + $seqType ||= (wantarray ? 'list' : 'total'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + $seqType = lc($seqType); + + if($seqType =~ /list|array/i) { + return ($self->{'_gaps_query'}, $self->{'_gaps_sbjct'}); + } + + if($seqType eq 'total') { + return ($self->{'_gaps_query'} + $self->{'_gaps_sbjct'}) || 0; + } else { + return $self->{'_gaps_'.$seqType} || 0; + } +} + + + +=head2 matches + + Usage : $hit_object->matches( [class] ); + Purpose : Get the total number of identical or conserved matches + : (or both) across all HSPs. + : (Note: 'conservative' matches are indicated as 'positives' + : in the Blast report.) + Example : ($id,$cons) = $hit_object->matches(); # no argument + : $id = $hit_object->matches('id'); + : $cons = $hit_object->matches('cons'); + Returns : Integer or a 2-element array of integers + Argument : class = 'id' | 'cons' OR none. + : If no argument is provided, both identical and conservative + : numbers are returned in a two element list. + : (Other terms can be used to refer to the conservative + : matches, e.g., 'positive'. All that is checked is whether or + : not the supplied string starts with 'id'. If not, the + : conservative matches are returned.) + Throws : Exception if the requested data cannot be obtained. + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : Does not rely on wantarray to return a list. Only checks for + : the presence of an argument (no arg = return list). + +See Also : L, L + +=cut + +#--------------- +sub matches { +#--------------- + my( $self, $arg) = @_; + my(@data,$data); + + if(!$arg) { + @data = ($self->{'_totalIdentical'}, $self->{'_totalConserved'}); + + return @data if @data; + + } else { + + if($arg =~ /^id/i) { + $data = $self->{'_totalIdentical'}; + } else { + $data = $self->{'_totalConserved'}; + } + return $data if $data; + } + + ## Something went wrong if we make it to here. + $self->throw("Can't get identical or conserved data: no data."); +} + + +=head2 start + + Usage : $sbjct->start( [seq_type] ); + Purpose : Gets the start coordinate for the query, sbjct, or both sequences + : in the BlastHit object. If there is more than one HSP, the lowest start + : value of all HSPs is returned. + Example : $qbeg = $sbjct->start('query'); + : $sbeg = $sbjct->start('hit'); + : ($qbeg, $sbeg) = $sbjct->start(); + Returns : scalar context: integer + : array context without args: list of two integers (queryStart, sbjctStart) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first automatically.. + : Remember that the start and end coordinates of all HSPs are + : normalized so that start < end. Strand information can be + : obtained by calling $hit->strand(). + +See Also : L, L, L, L, L + +=cut + +#---------- +sub start { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->start($seqType); + } else { + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + if($seqType =~ /list|array/i) { + return ($self->{'_queryStart'}, $self->{'_sbjctStart'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Start'}; + } + } +} + + +=head2 end + + Usage : $sbjct->end( [seq_type] ); + Purpose : Gets the end coordinate for the query, sbjct, or both sequences + : in the BlastHit object. If there is more than one HSP, the largest end + : value of all HSPs is returned. + Example : $qend = $sbjct->end('query'); + : $send = $sbjct->end('hit'); + : ($qend, $send) = $sbjct->end(); + Returns : scalar context: integer + : array context without args: list of two integers (queryEnd, sbjctEnd) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'sbjct' + : (case insensitive). If not supplied, 'query' is used. + Throws : n/a + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first automatically.. + : Remember that the start and end coordinates of all HSPs are + : normalized so that start < end. Strand information can be + : obtained by calling $hit->strand(). + +See Also : L, L, L, L, L + +=cut + +#---------- +sub end { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->end($seqType); + } else { + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + if($seqType =~ /list|array/i) { + return ($self->{'_queryStop'}, $self->{'_sbjctStop'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Stop'}; + } + } +} + +=head2 range + + Usage : $sbjct->range( [seq_type] ); + Purpose : Gets the (start, end) coordinates for the query or sbjct sequence + : in the HSP alignment. + Example : ($qbeg, $qend) = $sbjct->range('query'); + : ($sbeg, $send) = $sbjct->range('hit'); + Returns : Two-element array of integers + Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + +See Also : L, L + +=cut + +#---------- +sub range { +#---------- + my ($self, $seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + return ($self->start($seqType), $self->end($seqType)); +} + + +=head2 frac_identical + + Usage : $hit_object->frac_identical( [seq_type] ); + Purpose : Get the overall fraction of identical positions across all HSPs. + : The number refers to only the aligned regions and does not + : account for unaligned regions in between the HSPs, if any. + Example : $frac_iden = $hit_object->frac_identical('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' + : default = 'query' (but see comments below). + : ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : NCBI BLAST uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : + : Therefore, when called with an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. Total does NOT take into account HSP + : tiling, so it should not be used. + : + : To get the fraction identical among only the aligned residues, + : ignoring the gaps, call this method without an argument or + : with an argument of 'query' or 'hit'. + : + : If you need data for each HSP, use hsps() and then iterate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, L, L + +=cut + +#------------------ +sub frac_identical { +#------------------ + my ($self, $seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + ## Sensitive to member name format. + $seqType = lc($seqType); + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_totalIdentical'}/$self->{'_length_aln_'.$seqType}); +} + + + +=head2 frac_conserved + + Usage : $hit_object->frac_conserved( [seq_type] ); + Purpose : Get the overall fraction of conserved positions across all HSPs. + : The number refers to only the aligned regions and does not + : account for unaligned regions in between the HSPs, if any. + Example : $frac_cons = $hit_object->frac_conserved('hit'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' + : default = 'query' (but see comments below). + : ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Positives = 34/120 Positives = 67/120". + : NCBI BLAST uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : + : Therefore, when called with an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. Total does NOT take into account HSP + : tiling, so it should not be used. + : + : To get the fraction conserved among only the aligned residues, + : ignoring the gaps, call this method without an argument or + : with an argument of 'query' or 'hit'. + : + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, L + +=cut + +#-------------------- +sub frac_conserved { +#-------------------- + my ($self, $seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + ## Sensitive to member name format. + $seqType = lc($seqType); + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_totalConserved'}/$self->{'_length_aln_'.$seqType}); +} + + + + +=head2 frac_aligned_query + + Usage : $hit_object->frac_aligned_query(); + Purpose : Get the fraction of the query sequence which has been aligned + : across all HSPs (not including intervals between non-overlapping + : HSPs). + Example : $frac_alnq = $hit_object->frac_aligned_query(); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : n/a + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : To compute the fraction aligned, the logical length of the query + : sequence is used, meaning that for [T]BLASTX reports, the + : full length of the query sequence is converted into amino acids + : by dividing by 3. This is necessary because of the way + : the lengths of aligned sequences are computed. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, L, L + +=cut + +#---------------------- +sub frac_aligned_query { +#---------------------- + my $self = shift; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_length_aln_query'}/$self->logical_length('query')); +} + + + +=head2 frac_aligned_hit + + Usage : $hit_object->frac_aligned_hit(); + Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned + : across all HSPs (not including intervals between non-overlapping + : HSPs). + Example : $frac_alnq = $hit_object->frac_aligned_hit(); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : n/a + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : To compute the fraction aligned, the logical length of the sbjct + : sequence is used, meaning that for TBLAST[NX] reports, the + : full length of the sbjct sequence is converted into amino acids + : by dividing by 3. This is necessary because of the way + : the lengths of aligned sequences are computed. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, , L, L, L + +=cut + +#-------------------- +sub frac_aligned_hit { +#-------------------- + my $self = shift; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_length_aln_sbjct'}/$self->logical_length('sbjct')); +} + + +## These methods are being maintained for backward compatibility. + +=head2 frac_aligned_sbjct + +Same as L + +=cut + +#---------------- +sub frac_aligned_sbjct { my $self=shift; $self->frac_aligned_hit(@_); } +#---------------- + +=head2 num_unaligned_sbjct + +Same as L + +=cut + +#---------------- +sub num_unaligned_sbjct { my $self=shift; $self->num_unaligned_hit(@_); } +#---------------- + + + +=head2 num_unaligned_hit + + Usage : $hit_object->num_unaligned_hit(); + Purpose : Get the number of the unaligned residues in the hit sequence. + : Sums across all all HSPs. + Example : $num_unaln = $hit_object->num_unaligned_hit(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : See notes regarding logical lengths in the comments for frac_aligned_hit(). + : They apply here as well. + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + +See Also : L, L, L + +=cut + +#--------------------- +sub num_unaligned_hit { +#--------------------- + my $self = shift; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + my $num = $self->logical_length('sbjct') - $self->{'_length_aln_sbjct'}; + ($num < 0 ? 0 : $num ); +} + + +=head2 num_unaligned_query + + Usage : $hit_object->num_unaligned_query(); + Purpose : Get the number of the unaligned residues in the query sequence. + : Sums across all all HSPs. + Example : $num_unaln = $hit_object->num_unaligned_query(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : See notes regarding logical lengths in the comments for frac_aligned_query(). + : They apply here as well. + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + +See Also : L, L, L + +=cut + +#----------------------- +sub num_unaligned_query { +#----------------------- + my $self = shift; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + my $num = $self->logical_length('query') - $self->{'_length_aln_query'}; + ($num < 0 ? 0 : $num ); +} + + + +=head2 seq_inds + + Usage : $hit->seq_inds( seq_type, class, collapse ); + Purpose : Get a list of residue positions (indices) across all HSPs + : for identical or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hit->seq_inds('query', 'identical'); + : @h_ind = $hit->seq_inds('hit', 'conserved'); + : @h_ind = $hit->seq_inds('hit', 'conserved', 1); + Returns : Array of integers + : May include ranges if collapse is non-zero. + Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + : ('sbjct' is synonymous with 'hit') + : [1] class = 'identical' or 'conserved' (default = 'identical') + : (can be shortened to 'id' or 'cons') + : (actually, anything not 'id' will evaluate to 'conserved'). + : [2] collapse = boolean, if non-zero, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + Comments : Note that HSPs are not tiled for this. This could be a problem + : for hits containing mutually exclusive HSPs. + : TODO: Consider tiling and then reporting seq_inds for the + : best HSP contig. + +See Also : L + +=cut + +#------------- +sub seq_inds { +#------------- + my ($self, $seqType, $class, $collapse) = @_; + + $seqType ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + + $seqType = 'sbjct' if $seqType eq 'hit'; + + my (@inds, $hsp); + foreach $hsp ($self->hsps) { + # This will merge data for all HSPs together. + push @inds, $hsp->seq_inds($seqType, $class); + } + + # Need to remove duplicates and sort the merged positions. + if(@inds) { + my %tmp = map { $_, 1 } @inds; + @inds = sort {$a <=> $b} keys %tmp; + } + + $collapse ? &Bio::Search::BlastUtils::collapse_nums(@inds) : @inds; +} + + +=head2 iteration + + Usage : $sbjct->iteration( ); + Purpose : Gets the iteration number in which the Hit was found. + Example : $iteration_num = $sbjct->iteration(); + Returns : Integer greater than or equal to 1 + Non-PSI-BLAST reports will report iteration as 1, but this number + is only meaningful for PSI-BLAST reports. + Argument : none + Throws : none + +See Also : L + +=cut + +#---------------- +sub iteration { shift->{'_iteration'} } +#---------------- + + +=head2 found_again + + Usage : $sbjct->found_again; + Purpose : Gets a boolean indicator whether or not the hit has + been found in a previous iteration. + This is only applicable to PSI-BLAST reports. + + This method indicates if the hit was reported in the + "Sequences used in model and found again" section of the + PSI-BLAST report or if it was reported in the + "Sequences not found previously or not previously below threshold" + section of the PSI-BLAST report. Only for hits in iteration > 1. + + Example : if( $sbjct->found_again()) { ... }; + Returns : Boolean (1 or 0) for PSI-BLAST report iterations greater than 1. + Returns undef for PSI-BLAST report iteration 1 and non PSI_BLAST + reports. + Argument : none + Throws : none + +See Also : L + +=cut + +#---------------- +sub found_again { shift->{'_found_again'} } +#---------------- + + +=head2 strand + + Usage : $sbjct->strand( [seq_type] ); + Purpose : Gets the strand(s) for the query, sbjct, or both sequences + : in the best HSP of the BlastHit object after HSP tiling. + : Only valid for BLASTN, TBLASTX, BLASTX-query, TBLASTN-hit. + Example : $qstrand = $sbjct->strand('query'); + : $sstrand = $sbjct->strand('hit'); + : ($qstrand, $sstrand) = $sbjct->strand(); + Returns : scalar context: integer '1', '-1', or '0' + : array context without args: list of two strings (queryStrand, sbjctStrand) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : If you don't want the tiled data, iterate through each HSP + : calling strand() on each (use hsps() to get all HSPs). + : + : Formerly (prior to 10/21/02), this method would return the + : string "-1/1" for hits with HSPs on both strands. + : However, now that strand and frame is properly being accounted + : for during HSP tiling, it makes more sense for strand() + : to return the strand data for the best HSP after tiling. + : + : If you really want to know about hits on opposite strands, + : you should be iterating through the HSPs using methods on the + : HSP objects. + : + : A possible use case where knowing whether a hit has HSPs + : on both strands would be when filtering via SearchIO for hits with + : this property. However, in this case it would be better to have a + : dedicated method such as $hit->hsps_on_both_strands(). Similarly + : for frame. This could be provided if there is interest. + +See Also : B() + +=cut + +#----------' +sub strand { +#---------- + my ($self, $seqType) = @_; + + Bio::Search::BlastUtils::tile_hsps($self) if not $self->{'_tile_hsps'}; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + my ($qstr, $hstr); + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->strand($seqType); + } + elsif( defined $self->{'_qstrand'}) { + # Get the data computed during hsp tiling. + $qstr = $self->{'_qstrand'}; + $hstr = $self->{'_sstrand'}; + } + else { + # otherwise, iterate through all HSPs collecting strand info. + # This will return the string "-1/1" if there are HSPs on different strands. + # NOTE: This was the pre-10/21/02 procedure which will no longer be used, + # (unless the above elsif{} is commented out). + my (%qstr, %hstr); + foreach my $hsp( $self->hsps ) { + my ( $q, $h ) = $hsp->strand(); + $qstr{ $q }++; + $hstr{ $h }++; + } + $qstr = join( '/', sort keys %qstr); + $hstr = join( '/', sort keys %hstr); + } + + if($seqType =~ /list|array/i) { + return ($qstr, $hstr); + } elsif( $seqType eq 'query' ) { + return $qstr; + } else { + return $hstr; + } +} + + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). (An exception to this might +be for BlastHSP.pm which is more tightly coupled to BlastHit.pm and +may access BlastHit data members directly for efficiency purposes, but probably +should not). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for these data member descriptions to become obsolete as +this module is still evolving. Always double check this info and search +for members not described here. + +=back + +An instance of Bio::Search::Hit::BlastHit.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + -------------------------------------------------------------- + _hsps : Array ref for a list of Bio::Search::HSP::BlastHSP.pm objects. + : + _db : Database identifier from the summary line. + : + _desc : Description data for the hit from the summary line. + : + _length : Total length of the hit sequence. + : + _score : BLAST score. + : + _bits : BLAST score (in bits). Matrix-independent. + : + _p : BLAST P value. Obtained from summary section. (Blast1/WU-Blast only) + : + _expect : BLAST Expect value. Obtained from summary section. + : + _n : BLAST N value (number of HSPs) (Blast1/WU-Blast2 only) + : + _frame : Reading frame for TBLASTN and TBLASTX analyses. + : + _totalIdentical: Total number of identical aligned monomers. + : + _totalConserved: Total number of conserved aligned monomers (a.k.a. "positives"). + : + _overlap : Maximum number of overlapping residues between adjacent HSPs + : before considering the alignment to be ambiguous. + : + _ambiguous_aln : Boolean. True if the alignment of all HSPs is ambiguous. + : + _length_aln_query : Length of the aligned region of the query sequence. + : + _length_aln_sbjct : Length of the aligned region of the sbjct sequence. + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Hit/Fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Hit/Fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,121 @@ + +# +# BioPerl module for Bio::Search::Hit::Fasta +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Hit::Fasta - Hit object specific for Fasta-generated hits + +=head1 SYNOPSIS + + These objects are generated automatically by Bio::Search::Processor::Fasta, +and shouldn't be used directly. + + +=head1 DESCRIPTION + + Bio::Search::Hit::* objects are data structures that contain information +about specific hits obtained during a library search. Some information will +be algorithm-specific, but others will be generally defined, such as the +ability to obtain alignment objects corresponding to each hit. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::Search::Hit::Fasta; + +use vars qw($AUTOLOAD @ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::Search::Hit::HitI; + +@ISA = qw(Bio::Search::Hit::HitI); + +my @AUTOLOAD_OK = qw( _ID + _DESC + _SIZE + _INITN + _INIT1 + _OPT + _ZSC + _E_VAL + ); + +my %AUTOLOAD_OK = (); +@AUTOLOAD_OK{@AUTOLOAD_OK} = (1) x @AUTOLOAD_OK; + +# new() is inherited from Bio::Root::Object + +# _initialize is where the heavy stuff will happen when new is called + +sub _initialize { + my($self, %args) = @_; + + my $make = $self->SUPER::_initialize(%args); + + while (my ($key, $val) = each %args) { + $key = '_' . uc($key); + $self->$key($val); + } + + return $make; # success - we hope! +} + +sub AUTOLOAD { + my ($self, $val) = @_; + + $AUTOLOAD =~ s/.*:://; + + if ( $AUTOLOAD_OK{$AUTOLOAD} ) { + $self->{$AUTOLOAD} = $val if defined $val; + return $self->{$AUTOLOAD}; + } else { + $self->throw("Unallowed accessor: $AUTOLOAD !"); + } +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Hit/GenericHit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Hit/GenericHit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1504 @@ +# $Id: GenericHit.pm,v 1.20.2.1 2003/02/28 09:27:56 jason Exp $ +# +# BioPerl module for Bio::Search::Hit::GenericHit +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Hit::GenericHit - A generic implementation of the Bio::Search::Hit::HitI interface + +=head1 SYNOPSIS + + use Bio::Search::Hit::GenericHit; + my $hit = new Bio::Search::Hit::GenericHit(-algorithm => 'blastp'); + + # more likely + use Bio::SearchIO; + my $parser = new Bio::SearchIO(-format => 'blast', -file => 'result.bls'); + + my $result = $parser->next_result; + my $hit = $result->next_hit; + + +=head1 DESCRIPTION + +This object handles the hit data from a Database Sequence Search such +as FASTA or BLAST. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich and Steve Chervitz + +Email jason@bioperl.org +Email sac@bioperl.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Hit::GenericHit; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Search::Hit::HitI; +require Bio::Search::SearchUtils; + +@ISA = qw(Bio::Root::Root Bio::Search::Hit::HitI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Hit::GenericHit(); + Function: Builds a new Bio::Search::Hit::GenericHit object + Returns : Bio::Search::Hit::GenericHit + Args : -name => Name of Hit (required) + -description => Description (optional) + -accession => Accession number (optional) + -length => Length of the Hit (optional) + -score => Raw Score for the Hit (optional) + -significance => Significance value for the Hit (optional) + -algorithm => Algorithm used (BLASTP, FASTX, etc...) + -hsps => Array ref of HSPs for this Hit. + -iteration => integer for the PSI-Blast iteration number + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($hsps, $name,$query_len,$desc, $acc, $locus, $length, + $score,$algo,$signif,$bits, + $iter,$rank) = $self->_rearrange([qw(HSPS + NAME + QUERY_LEN + DESCRIPTION + ACCESSION + LOCUS + LENGTH SCORE ALGORITHM + SIGNIFICANCE BITS ITERATION + RANK )], @args); + + $self->{'_query_length'} = $query_len; + + if( ! defined $name ) { + $self->throw("Must have defined a valid name for Hit"); + } else { + $self->name($name); + } + + defined $acc && $self->accession($acc); + defined $locus && $self->locus($locus); + defined $desc && $self->description($desc); + defined $length && $self->length($length); + defined $algo && $self->algorithm($algo); + defined $signif && $self->significance($signif); + defined $score && $self->raw_score($score); + defined $bits && $self->bits($bits); + defined $iter && $self->iteration($iter); + defined $rank && $self->rank($rank); + + $self->{'_iterator'} = 0; + $self->{'_hsps'} = []; + if( defined $hsps ) { + if( ref($hsps) !~ /array/i ) { + $self->warn("Did not specify a valid array ref for the param HSPS ($hsps)"); + } else { + while( @$hsps ) { + $self->add_hsp(shift @$hsps ); + } + } + } + return $self; +} + +=head2 add_hsp + + Title : add_hsp + Usage : $hit->add_hsp($hsp) + Function: Add a HSP to the collection of HSPs for a Hit + Returns : number of HSPs in the Hit + Args : Bio::Search::HSP::HSPI object + + +=cut + +sub add_hsp { + my ($self,$hsp) = @_; + if( !defined $hsp || ! $hsp->isa('Bio::Search::HSP::HSPI') ) { + $self->warn("Must provide a valid Bio::Search::HSP::HSPI object to object: $self method: add_hsp value: $hsp"); + return undef; + } + push @{$self->{'_hsps'}}, $hsp; + return scalar @{$self->{'_hsps'}}; +} + + + +=head2 Bio::Search::Hit::HitI methods + +Implementation of Bio::Search::Hit::HitI methods + +=head2 name + + Title : name + Usage : $hit_name = $hit->name(); + Function: returns the name of the Hit sequence + Returns : a scalar string + Args : [optional] scalar string to set the name + +=cut + +sub name { + my ($self,$value) = @_; + my $previous = $self->{'_name'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_name'} = $value; + } + return $previous; +} + +=head2 accession + + Title : accession + Usage : $acc = $hit->accession(); + Function: Retrieve the accession (if available) for the hit + Returns : a scalar string (empty string if not set) + Args : none + +=cut + +sub accession { + my ($self,$value) = @_; + my $previous = $self->{'_accession'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_accession'} = $value; + } + return $previous; +} + +=head2 description + + Title : description + Usage : $desc = $hit->description(); + Function: Retrieve the description for the hit + Returns : a scalar string + Args : [optional] scalar string to set the descrition + +=cut + +sub description { + my ($self,$value) = @_; + my $previous = $self->{'_description'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_description'} = $value; + } + return $previous; +} + +=head2 length + + Title : length + Usage : my $len = $hit->length + Function: Returns the length of the hit + Returns : integer + Args : [optional] integer to set the length + +=cut + +sub length { + my ($self,$value) = @_; + my $previous = $self->{'_length'}; + if( defined $value || ! defined $previous ) { + $value = $previous = 0 unless defined $value; + $self->{'_length'} = $value; + } + return $previous; +} + + +=head2 algorithm + + Title : algorithm + Usage : $alg = $hit->algorithm(); + Function: Gets the algorithm specification that was used to obtain the hit + For BLAST, the algorithm denotes what type of sequence was aligned + against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated + dna-prt, TBLASTN prt-translated dna, TBLASTX translated + dna-translated dna). + Returns : a scalar string + Args : [optional] scalar string to set the algorithm + +=cut + +sub algorithm { + my ($self,$value) = @_; + my $previous = $self->{'_algorithm'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_algorithm'} = $value; + } + return $previous; +} + +=head2 raw_score + + Title : raw_score + Usage : $score = $hit->raw_score(); + Function: Gets the "raw score" generated by the algorithm. What + this score is exactly will vary from algorithm to algorithm, + returning undef if unavailable. + Returns : a scalar value + Args : [optional] scalar value to set the raw score + +=cut + +sub raw_score { + my ($self,$value) = @_; + my $previous = $self->{'_score'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_score'} = $value; + } + return $previous; +} + +=head2 significance + + Title : significance + Usage : $significance = $hit->significance(); + Function: Used to obtain the E or P value of a hit, i.e. the probability that + this particular hit was obtained purely by random chance. If + information is not available (nor calculatable from other + information sources), return undef. + Returns : a scalar value or undef if unavailable + Args : [optional] scalar value to set the significance + +=cut + +sub significance { + my ($self,$value) = @_; + my $previous = $self->{'_significance'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_significance'} = $value; + } + return $previous; +} + +=head2 bits + + Usage : $hit_object->bits(); + Purpose : Gets the bit score of the best HSP for the current hit. + Example : $bits = $hit_object->bits(); + Returns : Integer or undef if bit score is not set + Argument : n/a + Comments : For BLAST1, the non-bit score is listed in the summary line. + +See Also : L + +=cut + +#--------- +sub bits { +#--------- + my ($self) = @_; + + my $bits = $self->{'_bits'}; + if( ! defined $bits ) { + $bits = $self->{'_hsps'}->[0]->bits(); + $self->{'_bits'} = $bits; + } + return $bits; +} + +=head2 next_hsp + + Title : next_hsp + Usage : while( $hsp = $obj->next_hsp()) { ... } + Function : Returns the next available High Scoring Pair + Example : + Returns : Bio::Search::HSP::HSPI object or null if finished + Args : none + +=cut + +sub next_hsp { + my ($self) = @_; + $self->{'_iterator'} = 0 unless defined $self->{'_iterator'}; + return undef if $self->{'_iterator'} > scalar @{$self->{'_hsps'}}; + return $self->{'_hsps'}->[$self->{'_iterator'}++]; +} + + +=head2 hsps + + Usage : $hit_object->hsps(); + Purpose : Get a list containing all HSP objects. + : Get the numbers of HSPs for the current hit. + Example : @hsps = $hit_object->hsps(); + : $num = $hit_object->hsps(); # alternatively, use num_hsps() + Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects. + : Scalar context: integer (number of HSPs). + : (Equivalent to num_hsps()). + Argument : n/a. Relies on wantarray + Throws : Exception if the HSPs have not been collected. + +See Also : L, L + +=cut + +#--------- +sub hsps { +#--------- + my $self = shift; + + if (not ref $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + return wantarray + # returning list containing all HSPs. + ? @{$self->{'_hsps'}} + # returning number of HSPs. + : scalar(@{$self->{'_hsps'}}); +} + +=head2 num_hsps + + Usage : $hit_object->num_hsps(); + Purpose : Get the number of HSPs for the present Blast hit. + Example : $nhsps = $hit_object->num_hsps(); + Returns : Integer + Argument : n/a + Throws : Exception if the HSPs have not been collected. + +See Also : L + +=cut + +#------------- +sub num_hsps { + my $self = shift; + + if (not defined $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + return scalar(@{$self->{'_hsps'}}); +} + +=head2 rewind + + Title : rewind + Usage : $hit->rewind; + Function: Allow one to reset the HSP iteration to the beginning + Since this is an in-memory implementation + Returns : none + Args : none + +=cut + +sub rewind{ + my ($self) = @_; + $self->{'_iterator'} = 0; +} + +=head2 iteration + + Title : iteration + Usage : $obj->iteration($newval) + Function: PSI-BLAST iteration + Returns : value of iteration + Args : newvalue (optional) + + +=cut + +sub iteration{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_psiblast_iteration'} = $value; + } + return $self->{'_psiblast_iteration'}; + +} + + +=head2 ambiguous_aln + + Usage : $ambig_code = $hit_object->ambiguous_aln(); + Purpose : Sets/Gets ambiguity code data member. + Example : (see usage) + Returns : String = 'q', 's', 'qs', '-' + : 'q' = query sequence contains overlapping sub-sequences + : while sbjct does not. + : 's' = sbjct sequence contains overlapping sub-sequences + : while query does not. + : 'qs' = query and sbjct sequence contains overlapping sub-sequences + : relative to each other. + : '-' = query and sbjct sequence do not contains multiple domains + : relative to each other OR both contain the same distribution + : of similar domains. + Argument : n/a + Throws : n/a + Status : Experimental + +=cut + +#-------------------- +sub ambiguous_aln { +#-------------------- + my $self = shift; + if(@_) { $self->{'_ambiguous_aln'} = shift; } + $self->{'_ambiguous_aln'} || '-'; +} + +=head2 overlap + +See documentation in L + +=cut + +#------------- +sub overlap { +#------------- + my $self = shift; + if(@_) { $self->{'_overlap'} = shift; } + defined $self->{'_overlap'} ? $self->{'_overlap'} : 0; +} + + +=head2 n + + Usage : $hit_object->n(); + Purpose : Gets the N number for the current hit. + : This is the number of HSPs in the set which was ascribed + : the lowest P-value (listed on the description line). + : This number is not the same as the total number of HSPs. + : To get the total number of HSPs, use num_hsps(). + Example : $n = $hit_object->n(); + Returns : Integer + Argument : n/a + Throws : Exception if HSPs have not been set (BLAST2 reports). + Comments : Note that the N parameter is not reported in gapped BLAST2. + : Calling n() on such reports will result in a call to num_hsps(). + : The num_hsps() method will count the actual number of + : HSPs in the alignment listing, which may exceed N in + : some cases. + +See Also : L + +=cut + +#----- +sub n { +#----- + my $self = shift; + + # The check for $self->{'_n'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description + # line only. + + my ($n); + if(not defined($self->{'_n'})) { + $n = $self->hsp->n; + } else { + $n = $self->{'_n'}; + } + $n ||= $self->num_hsps; + + return $n; +} + +=head2 p + + Usage : $hit_object->p( [format] ); + Purpose : Get the P-value for the best HSP of the given BLAST hit. + : (Note that P-values are not provided with NCBI Blast2 reports). + Example : $p = $sbjct->p; + : $p = $sbjct->p('exp'); # get exponent only. + : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts + Returns : Float or scientific notation number (the raw P-value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and P-value + : is in scientific notation (See Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (See Comments). + Throws : Warns if no P-value is defined. Uses expect instead. + Comments : Using the 'parts' argument is not recommended since it will not + : work as expected if the P-value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#-------- +sub p { +#-------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my $val = $self->{'_p'}; + + # $val can be zero. + if(not defined $val) { + # P-value not defined, must be a NCBI Blast2 report. + # Use expect instead. + $self->warn( "P-value not defined. Using expect() instead."); + $val = $self->{'_expect'}; + } + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &Bio::Search::SearchUtils::get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw P-value. + return $val; +} + +=head2 hsp + + Usage : $hit_object->hsp( [string] ); + Purpose : Get a single HSPI object for the present HitI object. + Example : $hspObj = $hit_object->hsp; # same as 'best' + : $hspObj = $hit_object->hsp('best'); + : $hspObj = $hit_object->hsp('worst'); + Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object. + Argument : String (or no argument). + : No argument (default) = highest scoring HSP (same as 'best'). + : 'best' or 'first' = highest scoring HSP. + : 'worst' or 'last' = lowest scoring HSP. + Throws : Exception if the HSPs have not been collected. + : Exception if an unrecognized argument is used. + +See Also : L, L() + +=cut + +#---------- +sub hsp { +#---------- + my( $self, $option ) = @_; + $option ||= 'best'; + + if (not ref $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + my @hsps = @{$self->{'_hsps'}}; + + return $hsps[0] if $option =~ /best|first|1/i; + return $hsps[$#hsps] if $option =~ /worst|last/i; + + $self->throw("Can't get HSP for: $option\n" . + "Valid arguments: 'best', 'worst'"); +} + +=head2 logical_length + + Usage : $hit_object->logical_length( [seq_type] ); + : (mostly intended for internal use). + Purpose : Get the logical length of the hit sequence. + : If the Blast is a TBLASTN or TBLASTX, the returned length + : is the length of the would-be amino acid sequence (length/3). + : For all other BLAST flavors, this function is the same as length(). + Example : $len = $hit_object->logical_length(); + Returns : Integer + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This is important for functions like frac_aligned_query() + : which need to operate in amino acid coordinate space when dealing + : with [T]BLAST[NX] type reports. + +See Also : L, L, L + +=cut + +#-------------------- +sub logical_length { +#-------------------- + my $self = shift; + my $seqType = shift || 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + my $length; + + # For the sbjct, return logical sbjct length + if( $seqType eq 'sbjct' ) { + $length = $self->{'_logical_length'} || $self->{'_length'}; + } + else { + # Otherwise, return logical query length + $length = $self->{'_query_length'}; + $self->throw("Must have defined query_len") unless ( $length ); + + # Adjust length based on BLAST flavor. + if($self->algorithm =~ /T?BLASTX/ ) { + $length /= 3; + } + } + return $length; +} + +=head2 length_aln + + Usage : $hit_object->length_aln( [seq_type] ); + Purpose : Get the total length of the aligned region for query or sbjct seq. + : This number will include all HSPs + Example : $len = $hit_object->length_aln(); # default = query + : $lenAln = $hit_object->length_aln('query'); + Returns : Integer + Argument : seq_Type = 'query' or 'hit' or 'sbjct' (Default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : Exception if the argument is not recognized. + Comments : This method will report the logical length of the alignment, + : meaning that for TBLAST[NX] reports, the length is reported + : using amino acid coordinate space (i.e., nucleotides / 3). + : + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : If you don't want the tiled data, iterate through each HSP + : calling length() on each (use hsps() to get all HSPs). + +See Also : L, L, L, L, L, L + +=cut + +#---------------' +sub length_aln { +#--------------- + my( $self, $seqType, $num ) = @_; + + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + if( defined $num) { + return $self->{'_length_aln_'.$seqType} = $num; + } + + my $data = $self->{'_length_aln_'.$seqType}; + + ## If we don't have data, figure out what went wrong. + if(!$data) { + $self->throw("Can't get length aln for sequence type \"$seqType\". " . + "Valid types are 'query', 'hit', 'sbjct' ('sbjct' = 'hit')"); + } + return $data; +} + +=head2 gaps + + Usage : $hit_object->gaps( [seq_type] ); + Purpose : Get the number of gaps in the aligned query, hit, or both sequences. + : Data is summed across all HSPs. + Example : $qgaps = $hit_object->gaps('query'); + : $hgaps = $hit_object->gaps('hit'); + : $tgaps = $hit_object->gaps(); # default = total (query + hit) + Returns : scalar context: integer + : array context without args: two-element list of integers + : (queryGaps, hitGaps) + : Array context can be forced by providing an argument of 'list' or 'array'. + : + : CAUTION: Calling this method within printf or sprintf is arrray context. + : So this function may not give you what you expect. For example: + : printf "Total gaps: %d", $hit->gaps(); + : Actually returns a two-element array, so what gets printed + : is the number of gaps in the query, not the total + : + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | 'list' (default = 'total') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through each HSP object. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : Not relying on wantarray since that will fail in situations + : such as printf "%d", $hit->gaps() in which you might expect to + : be printing the total gaps, but evaluates to array context. + +See Also : L + +=cut + +#---------- +sub gaps { +#---------- + my( $self, $seqType, $num ) = @_; + + $seqType ||= (wantarray ? 'list' : 'total'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + $seqType = lc($seqType); + + if( defined $num ) { + $self->throw("Can't set gaps for seqType '$seqType'. Must be 'query' or 'hit'\n") unless ($seqType eq 'sbjct' or $seqType eq 'query'); + + return $self->{'_gaps_'.$seqType} = $num; + } + elsif($seqType =~ /list|array/i) { + return ($self->{'_gaps_query'}, $self->{'_gaps_sbjct'}); + } + elsif($seqType eq 'total') { + return ($self->{'_gaps_query'} + $self->{'_gaps_sbjct'}) || 0; + } else { + return $self->{'_gaps_'.$seqType} || 0; + } +} + + +=head2 matches + +See documentation in L + +=cut + +#--------------- +sub matches { +#--------------- + my( $self, $arg1, $arg2) = @_; + my(@data,$data); + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + if(!$arg1) { + @data = ($self->{'_totalIdentical'}, $self->{'_totalConserved'}); + + return @data if @data; + + } else { + + if( defined $arg2 ) { + $self->{'_totalIdentical'} = $arg1; + $self->{'_totalConserved'} = $arg2; + return ( $arg1, $arg2 ); + } + elsif($arg1 =~ /^id/i) { + $data = $self->{'_totalIdentical'}; + } else { + $data = $self->{'_totalConserved'}; + } + return $data if $data; + } + + ## Something went wrong if we make it to here. + $self->throw("Can't get identical or conserved data: no data."); +} + + +=head2 start + + Usage : $sbjct->start( [seq_type] ); + Purpose : Gets the start coordinate for the query, sbjct, or both sequences + : in the BlastHit object. If there is more than one HSP, the lowest start + : value of all HSPs is returned. + Example : $qbeg = $sbjct->start('query'); + : $sbeg = $sbjct->start('hit'); + : ($qbeg, $sbeg) = $sbjct->start(); + Returns : scalar context: integer + : array context without args: list of two integers (queryStart, sbjctStart) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first automatically.. + : Remember that the start and end coordinates of all HSPs are + : normalized so that start < end. Strand information can be + : obtained by calling $hit->strand(). + +See Also : L, L, L, + L + +=cut + +#---------- +sub start { +#---------- + my ($self, $seqType, $num) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + if( defined $num ) { + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Start'} = $num; + } + + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->start($seqType); + } else { + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + if($seqType =~ /list|array/i) { + return ($self->{'_queryStart'}, $self->{'_sbjctStart'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Start'}; + } + } +} + + +=head2 end + + Usage : $sbjct->end( [seq_type] ); + Purpose : Gets the end coordinate for the query, sbjct, or both sequences + : in the BlastHit object. If there is more than one HSP, the largest end + : value of all HSPs is returned. + Example : $qend = $sbjct->end('query'); + : $send = $sbjct->end('hit'); + : ($qend, $send) = $sbjct->end(); + Returns : scalar context: integer + : array context without args: list of two integers (queryEnd, sbjctEnd) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'sbjct' + : (case insensitive). If not supplied, 'query' is used. + Throws : n/a + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first automatically.. + : Remember that the start and end coordinates of all HSPs are + : normalized so that start < end. Strand information can be + : obtained by calling $hit->strand(). + +See Also : L, L, L + +=cut + +#---------- +sub end { +#---------- + my ($self, $seqType, $num) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + + if( defined $num ) { + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Stop'} = $num; + } + + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->end($seqType); + } else { + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + if($seqType =~ /list|array/i) { + return ($self->{'_queryStop'}, $self->{'_sbjctStop'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Stop'}; + } + } +} + +=head2 range + + Usage : $sbjct->range( [seq_type] ); + Purpose : Gets the (start, end) coordinates for the query or sbjct sequence + : in the HSP alignment. + Example : ($qbeg, $qend) = $sbjct->range('query'); + : ($sbeg, $send) = $sbjct->range('hit'); + Returns : Two-element array of integers + Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + +See Also : L, L + +=cut + +#---------- +sub range { +#---------- + my ($self, $seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + return ($self->start($seqType), $self->end($seqType)); +} + + +=head2 frac_identical + + Usage : $hit_object->frac_identical( [seq_type] ); + Purpose : Get the overall fraction of identical positions across all HSPs. + : The number refers to only the aligned regions and does not + : account for unaligned regions in between the HSPs, if any. + Example : $frac_iden = $hit_object->frac_identical('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' + : default = 'query' (but see comments below). + : ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : NCBI BLAST uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : + : Therefore, when called with an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. Total does NOT take into account HSP + : tiling, so it should not be used. + : + : To get the fraction identical among only the aligned residues, + : ignoring the gaps, call this method without an argument or + : with an argument of 'query' or 'hit'. + : + : If you need data for each HSP, use hsps() and then iterate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, L, L + +=cut + +#------------------ +sub frac_identical { +#------------------ + my ($self, $seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + ## Sensitive to member name format. + $seqType = lc($seqType); + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + my $ident = $self->{'_totalIdentical'}; + my $total = $self->{'_length_aln_'.$seqType}; + my $ratio = $ident / $total; + my $ratio_rounded = sprintf( "%.3f", $ratio); + + # Round down iff normal rounding yields 1 (just like blast) + $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1)); + return $ratio_rounded; +} + + + +=head2 frac_conserved + + Usage : $hit_object->frac_conserved( [seq_type] ); + Purpose : Get the overall fraction of conserved positions across all HSPs. + : The number refers to only the aligned regions and does not + : account for unaligned regions in between the HSPs, if any. + Example : $frac_cons = $hit_object->frac_conserved('hit'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' + : default = 'query' (but see comments below). + : ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Positives = 34/120 Positives = 67/120". + : NCBI BLAST uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : + : Therefore, when called with an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. Total does NOT take into account HSP + : tiling, so it should not be used. + : + : To get the fraction conserved among only the aligned residues, + : ignoring the gaps, call this method without an argument or + : with an argument of 'query' or 'hit'. + : + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, L + +=cut + +#-------------------- +sub frac_conserved { +#-------------------- + my ($self, $seqType) = @_; + $seqType ||= 'query'; + $seqType = 'sbjct' if $seqType eq 'hit'; + + ## Sensitive to member name format. + $seqType = lc($seqType); + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + my $consv = $self->{'_totalConserved'}; + my $total = $self->{'_length_aln_'.$seqType}; + my $ratio = $consv / $total; + my $ratio_rounded = sprintf( "%.3f", $ratio); + + # Round down iff normal rounding yields 1 (just like blast) + $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1)); + return $ratio_rounded; +} + + + + +=head2 frac_aligned_query + + Usage : $hit_object->frac_aligned_query(); + Purpose : Get the fraction of the query sequence which has been aligned + : across all HSPs (not including intervals between non-overlapping + : HSPs). + Example : $frac_alnq = $hit_object->frac_aligned_query(); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : n/a + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : To compute the fraction aligned, the logical length of the query + : sequence is used, meaning that for [T]BLASTX reports, the + : full length of the query sequence is converted into amino acids + : by dividing by 3. This is necessary because of the way + : the lengths of aligned sequences are computed. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, L, L + +=cut + +#---------------------- +sub frac_aligned_query { +#---------------------- + my $self = shift; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + sprintf( "%.2f", $self->{'_length_aln_query'}/ + $self->logical_length('query')); +} + + + +=head2 frac_aligned_hit + + Usage : $hit_object->frac_aligned_hit(); + Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned + : across all HSPs (not including intervals between non-overlapping + : HSPs). + Example : $frac_alnq = $hit_object->frac_aligned_hit(); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : n/a + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : To compute the fraction aligned, the logical length of the sbjct + : sequence is used, meaning that for TBLAST[NX] reports, the + : full length of the sbjct sequence is converted into amino acids + : by dividing by 3. This is necessary because of the way + : the lengths of aligned sequences are computed. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically. + +See Also : L, L, , L, L, L + +=cut + +#-------------------- +sub frac_aligned_hit { +#-------------------- + my $self = shift; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + sprintf( "%.2f", $self->{'_length_aln_sbjct'}/$self->logical_length('sbjct')); +} + + +## These methods are being maintained for backward compatibility. + +=head2 frac_aligned_sbjct + +Same as L + +=cut + +#---------------- +sub frac_aligned_sbjct { my $self=shift; $self->frac_aligned_hit(@_); } +#---------------- + +=head2 num_unaligned_sbjct + +Same as L + +=cut + +#---------------- +sub num_unaligned_sbjct { my $self=shift; $self->num_unaligned_hit(@_); } +#---------------- + + + +=head2 num_unaligned_hit + + Usage : $hit_object->num_unaligned_hit(); + Purpose : Get the number of the unaligned residues in the hit sequence. + : Sums across all all HSPs. + Example : $num_unaln = $hit_object->num_unaligned_hit(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : See notes regarding logical lengths in the comments for frac_aligned_hit(). + : They apply here as well. + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + +See Also : L, L, L + +=cut + +#--------------------- +sub num_unaligned_hit { +#--------------------- + my $self = shift; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + my $num = $self->logical_length('sbjct') - $self->{'_length_aln_sbjct'}; + ($num < 0 ? 0 : $num ); +} + + +=head2 num_unaligned_query + + Usage : $hit_object->num_unaligned_query(); + Purpose : Get the number of the unaligned residues in the query sequence. + : Sums across all all HSPs. + Example : $num_unaln = $hit_object->num_unaligned_query(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : See notes regarding logical lengths in the comments for frac_aligned_query(). + : They apply here as well. + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + +See Also : L, L, L + +=cut + +#----------------------- +sub num_unaligned_query { +#----------------------- + my $self = shift; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + my $num = $self->logical_length('query') - $self->{'_length_aln_query'}; + ($num < 0 ? 0 : $num ); +} + + + +=head2 seq_inds + + Usage : $hit->seq_inds( seq_type, class, collapse ); + Purpose : Get a list of residue positions (indices) across all HSPs + : for identical or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hit->seq_inds('query', 'identical'); + : @h_ind = $hit->seq_inds('hit', 'conserved'); + : @h_ind = $hit->seq_inds('hit', 'conserved', 1); + Returns : Array of integers + : May include ranges if collapse is non-zero. + Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + : ('sbjct' is synonymous with 'hit') + : [1] class = 'identical' or 'conserved' (default = 'identical') + : (can be shortened to 'id' or 'cons') + : (actually, anything not 'id' will evaluate to 'conserved'). + : [2] collapse = boolean, if non-zero, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + +See Also : L + +=cut + +#------------- +sub seq_inds { +#------------- + my ($self, $seqType, $class, $collapse) = @_; + + $seqType ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + + $seqType = 'sbjct' if $seqType eq 'hit'; + + my (@inds, $hsp); + foreach $hsp ($self->hsps) { + # This will merge data for all HSPs together. + push @inds, $hsp->seq_inds($seqType, $class); + } + + # Need to remove duplicates and sort the merged positions. + if(@inds) { + my %tmp = map { $_, 1 } @inds; + @inds = sort {$a <=> $b} keys %tmp; + } + + $collapse ? &Bio::Search::SearchUtils::collapse_nums(@inds) : @inds; +} + + +=head2 strand + +See documentation in L + +=cut + +#----------' +sub strand { +#---------- + my ($self, $seqType, $strnd) = @_; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + $seqType ||= (wantarray ? 'list' : 'query'); + $seqType = 'sbjct' if $seqType eq 'hit'; + + $seqType = lc($seqType); + + if( defined $strnd ) { + $self->throw("Can't set strand for seqType '$seqType'. Must be 'query' or 'hit'\n") unless ($seqType eq 'sbjct' or $seqType eq 'query'); + + return $self->{'_strand_'.$seqType} = $strnd; + } + + my ($qstr, $hstr); + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->strand($seqType); + } + elsif( defined $self->{'_strand_query'}) { + # Get the data computed during hsp tiling. + $qstr = $self->{'_strand_query'}; + $hstr = $self->{'_strand_sbjct'} + } + else { + # otherwise, iterate through all HSPs collecting strand info. + # This will return the string "-1/1" if there are HSPs on different strands. + # NOTE: This was the pre-10/21/02 procedure which will no longer be used, + # (unless the above elsif{} is commented out). + my (%qstr, %hstr); + foreach my $hsp( $self->hsps ) { + my ( $q, $h ) = $hsp->strand(); + $qstr{ $q }++; + $hstr{ $h }++; + } + $qstr = join( '/', sort keys %qstr); + $hstr = join( '/', sort keys %hstr); + } + + if($seqType =~ /list|array/i) { + return ($qstr, $hstr); + } elsif( $seqType eq 'query' ) { + return $qstr; + } else { + return $hstr; + } +} + +=head2 frame + +See documentation in L + +=cut + +#----------' +sub frame { +#---------- + my( $self, $frm ) = @_; + + Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; + + if( defined $frm ) { + return $self->{'_frame'} = $frm; + } + + # The check for $self->{'_frame'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($frame); + if(not defined($self->{'_frame'})) { + $frame = $self->hsp->frame; + } else { + $frame = $self->{'_frame'}; + } + return $frame; +} + +=head2 rank + + Title : rank + Usage : $obj->rank($newval) + Function: Get/Set the rank of this Hit in the Query search list + i.e. this is the Nth hit for a specific query + Returns : value of rank + Args : newvalue (optional) + + +=cut + +sub rank{ + my $self = shift; + return $self->{'_rank'} = shift if @_; + return $self->{'_rank'} || 1; +} + +=head2 locus + + Title : locus + Usage : $locus = $hit->locus(); + Function: Retrieve the locus (if available) for the hit + Returns : a scalar string (empty string if not set) + Args : none + +=cut + +sub locus { + my ($self,$value) = @_; + my $previous = $self->{'_locus'}; + if( defined $value || ! defined $previous ) { + unless (defined $value) { + if ($self->{'_name'} =~/(gb|emb|dbj|ref)\|(.*)\|(.*)/) { + $value = $previous = $3; + } else { + $value = $previous = ''; + } + } + $self->{'_locus'} = $value; + } + return $previous; +} + +=head2 each_accession_number + + Title : each_accession_number + Usage : @each_accession_number = $hit->each_accession_number(); + Function: Get each accession number listed in the description of the hit. + If there are no alternatives, then only the primary accession will + be given + Returns : list of all accession numbers in the description + Args : none + +=cut + +sub each_accession_number { + my ($self,$value) = @_; + my $desc = $self->{'_description'}; + #put primary accnum on the list + my @accnums; + push (@accnums,$self->{'_accession'}); + if( defined $desc ) { + while ($desc =~ /(\b\S+\|\S*\|\S*\s?)/g) { + my $id = $1; + my ($acc, $version); + if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) { + ($acc, $version) = split /\./, $2; + } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) { + ($acc, $version) = split /\./, $3; + } else { + #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README + #Database Name Identifier Syntax + #============================ ======================== + #GenBank gb|accession|locus + #EMBL Data Library emb|accession|locus + #DDBJ, DNA Database of Japan dbj|accession|locus + #NBRF PIR pir||entry + #Protein Research Foundation prf||name + #SWISS-PROT sp|accession|entry name + #Brookhaven Protein Data Bank pdb|entry|chain + #Patents pat|country|number + #GenInfo Backbone Id bbs|number + #General database identifier gnl|database|identifier + #NCBI Reference Sequence ref|accession|locus + #Local Sequence identifier lcl|identifier + $acc=$id; + } + push(@accnums, $acc); + } + } + return @accnums; +} + +=head2 tiled_hsps + +See documentation in L + +=cut + +sub tiled_hsps { + my $self = shift; + return $self->{'_tiled_hsps'} = shift if @_; + return $self->{'_tiled_hsps'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Hit/HMMERHit.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Hit/HMMERHit.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,306 @@ +# $Id: HMMERHit.pm,v 1.3 2002/10/22 07:45:17 lapp Exp $ +# +# BioPerl module for Bio::Search::Hit::HMMERHit +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Hit::HMMERHit - A Hit module for HMMER hits + +=head1 SYNOPSIS + + use Bio::Search::Hit::HMMERHit; + my $hit = new Bio::Search::Hit::HMMERHit; + # use it in the same way as Bio::Search::Hit::GenericHit + +=head1 DESCRIPTION + +This is a specialization of L. There +are a few news methods L and L. Note that +L and L make no sense for this object and will +return 0. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Hit::HMMERHit; +use vars qw(@ISA); +use strict; + +use Bio::Search::Hit::GenericHit; + +@ISA = qw(Bio::Search::Hit::GenericHit ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Hit::HMMERHit(); + Function: Builds a new Bio::Search::Hit::HMMERHit object + Returns : Bio::Search::Hit::HMMERHit + Args : + + Plus the Bio::Search::Hit::GenericHit inherited params + -name => Name of Hit (required) + -description => Description (optional) + -accession => Accession number (optional) + -length => Length of the Hit (optional) + -score => Raw Score for the Hit (optional) + -significance => Significance value for the Hit (optional) + -algorithm => Algorithm used (BLASTP, FASTX, etc...) + -hsps => Array ref of HSPs for this Hit. + + +=cut + + +=head2 next_domain + + Title : next_domain + Usage : my $domain = $hit->next_domain(); + Function: An alias for L, this will return the next HSP + Returns : L object + Args : none + + +=cut + +sub next_domain{ shift->next_hsp } + +=head2 domains + + Title : domains + Usage : my @domains = $hit->domains(); + Function: An alias for L, this will return the full list of hsps + Returns : array of L objects + Args : none + + +=cut + +sub domains{ shift->hsps() } + + +=head2 inherited Bio::Search::Hit::GenericHit methods + +=cut + +=head2 add_hsp + + Title : add_hsp + Usage : $hit->add_hsp($hsp) + Function: Add a HSP to the collection of HSPs for a Hit + Returns : number of HSPs in the Hit + Args : Bio::Search::HSP::HSPI object + + +=cut + +=head2 Bio::Search::Hit::HitI methods + +=cut + +=head2 name + + Title : name + Usage : $hit_name = $hit->name(); + Function: returns the name of the Hit sequence + Returns : a scalar string + Args : [optional] scalar string to set the name + +=cut + +=head2 accession + + Title : accession + Usage : $acc = $hit->accession(); + Function: Retrieve the accession (if available) for the hit + Returns : a scalar string (empty string if not set) + Args : none + +=cut + +=head2 description + + Title : description + Usage : $desc = $hit->description(); + Function: Retrieve the description for the hit + Returns : a scalar string + Args : [optional] scalar string to set the descrition + +=cut + +=head2 length + + Title : length + Usage : my $len = $hit->length + Function: Returns the length of the hit + Returns : integer + Args : [optional] integer to set the length + +=cut + +=head2 algorithm + + Title : algorithm + Usage : $alg = $hit->algorithm(); + Function: Gets the algorithm specification that was used to obtain the hit + For BLAST, the algorithm denotes what type of sequence was aligned + against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated + dna-prt, TBLASTN prt-translated dna, TBLASTX translated + dna-translated dna). + Returns : a scalar string + Args : [optional] scalar string to set the algorithm + +=cut + +=head2 raw_score + + Title : raw_score + Usage : $score = $hit->raw_score(); + Function: Gets the "raw score" generated by the algorithm. What + this score is exactly will vary from algorithm to algorithm, + returning undef if unavailable. + Returns : a scalar value + Args : [optional] scalar value to set the raw score + +=cut + +=head2 significance + + Title : significance + Usage : $significance = $hit->significance(); + Function: Used to obtain the E or P value of a hit, i.e. the probability that + this particular hit was obtained purely by random chance. If + information is not available (nor calculatable from other + information sources), return undef. + Returns : a scalar value or undef if unavailable + Args : [optional] scalar value to set the significance + +=cut + +=head2 bits + + Usage : $hit_object->bits(); + Purpose : Gets the bit score of the best HSP for the current hit. + Example : $bits = $hit_object->bits(); + Returns : Integer or undef if bit score is not set + Argument : n/a + +See Also : L + +=cut + +sub bits { return 0 } + +=head2 next_hsp + + Title : next_hsp + Usage : while( $hsp = $obj->next_hsp()) { ... } + Function : Returns the next available High Scoring Pair + Example : + Returns : Bio::Search::HSP::HSPI object or null if finished + Args : none + +=cut + +=head2 hsps + + Usage : $hit_object->hsps(); + Purpose : Get a list containing all HSP objects. + : Get the numbers of HSPs for the current hit. + Example : @hsps = $hit_object->hsps(); + : $num = $hit_object->hsps(); # alternatively, use num_hsps() + Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects. + : Scalar context: integer (number of HSPs). + : (Equivalent to num_hsps()). + Argument : n/a. Relies on wantarray + Throws : Exception if the HSPs have not been collected. + +See Also : L, L + +=cut + +=head2 num_hsps + + Usage : $hit_object->num_hsps(); + Purpose : Get the number of HSPs for the present Blast hit. + Example : $nhsps = $hit_object->num_hsps(); + Returns : Integer + Argument : n/a + Throws : Exception if the HSPs have not been collected. + +See Also : L + +=cut + +=head2 rewind + + Title : rewind + Usage : $hit->rewind; + Function: Allow one to reset the HSP iteration to the beginning + Since this is an in-memory implementation + Returns : none + Args : none + +=cut + +=head2 iteration + + Title : iteration + Usage : $obj->iteration($newval) + Function: PSI-BLAST iteration + Returns : value of iteration + Args : newvalue (optional) + + +=cut + + +sub iteration { return 0 } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Hit/HitFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Hit/HitFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +# $Id: HitFactory.pm,v 1.3 2002/10/22 07:45:17 lapp Exp $ +# +# BioPerl module for Bio::Search::Hit::HitFactory +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Hit::HitFactory - A factory to create Bio::Search::Hit::HitI objects + +=head1 SYNOPSIS + + use Bio::Search::Hit::HitFactory; + my $factory = new Bio::Search::Hit::HitFactory(); + my $resultobj = $factory->create(@args); + +=head1 DESCRIPTION + +This is a general way of hiding the object creation process so that we +can dynamically change the objects that are created by the SearchIO +parser depending on what format report we are parsing. + +This object is for creating new Hits. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Hit::HitFactory; +use vars qw(@ISA $DEFAULT_TYPE); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectFactoryI ); + +BEGIN { + $DEFAULT_TYPE = 'Bio::Search::Hit::GenericHit'; +} + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Hit::HitFactory(); + Function: Builds a new Bio::Search::Hit::HitFactory object + Returns : Bio::Search::Hit::HitFactory + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($type) = $self->_rearrange([qw(TYPE)],@args); + $self->type($type) if defined $type; + return $self; +} + +=head2 create + + Title : create + Usage : $factory->create(%args) + Function: Create a new L object + Returns : L + Args : hash of initialization parameters + + +=cut + +sub create{ + my ($self,@args) = @_; + my $type = $self->type; + eval { $self->_load_module($type) }; + if( $@ ) { $self->throw("Unable to load module $type"); } + return $type->new(@args); +} + + +=head2 type + + Title : type + Usage : $factory->type('Bio::Search::Hit::GenericHit'); + Function: Get/Set the Hit creation type + Returns : string + Args : [optional] string to set + + +=cut + +sub type{ + my ($self,$type) = @_; + if( defined $type ) { + # redundancy with the create method which also calls _load_module + # I know - but this is not a highly called object so I am going + # to leave it in + eval {$self->_load_module($type) }; + if( $@ ){ $self->warn("Cannot find module $type, unable to set type"); } + else { $self->{'_type'} = $type; } + } + return $self->{'_type'} || $DEFAULT_TYPE; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Hit/HitI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Hit/HitI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,711 @@ +#----------------------------------------------------------------- +# $Id: HitI.pm,v 1.17 2002/11/13 11:16:37 sac Exp $ +# +# BioPerl module Bio::Search::Hit::HitI +# +# Cared for by Steve Chervitz +# +# Originally created by Aaron Mackey +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Hit::HitI - Interface for a hit in a similarity search result + +=head1 SYNOPSIS + +Bio::Search::Hit::HitI objects should not be instantiated since this +module defines a pure interface. + +Given an object that implements the Bio::Search::Hit::HitI interface, +you can do the following things with it: + + $hit_name = $hit->name(); + + $desc = $hit->description(); + + $len = $hit->length + + $alg = $hit->algorithm(); + + $score = $hit->raw_score(); + + $significance = $hit->significance(); + + $rank = $hit->rank(); # the Nth hit for a specific query + + while( $hsp = $obj->next_hsp()) { ... } # process in iterator fashion + + for my $hsp ( $obj->hsps()()) { ... } # process in list fashion + +=head1 DESCRIPTION + + Bio::Search::Hit::* objects are data structures that contain information +about specific hits obtained during a library search. Some information will +be algorithm-specific, but others will be generally defined. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey, Steve Chervitz + +Email amackey@virginia.edu (original author) +Email sac@bioperl.org + +=head1 COPYRIGHT + +Copyright (c) 1999-2001 Aaron Mackey, Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Search::Hit::HitI; + +use Bio::Root::RootI; + +use vars qw(@ISA); +use strict; + +@ISA = qw( Bio::Root::RootI ); + + +=head2 name + + Title : name + Usage : $hit_name = $hit->name(); + Function: returns the name of the Hit sequence + Returns : a scalar string + Args : none + +=cut + +sub name { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 description + + Title : description + Usage : $desc = $hit->description(); + Function: Retrieve the description for the hit + Returns : a scalar string + Args : none + +=cut + +sub description { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 accession + + Title : accession + Usage : $acc = $hit->accession(); + Function: Retrieve the accession (if available) for the hit + Returns : a scalar string (empty string if not set) + Args : none + +=cut + +sub accession { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 locus + + Title : locus + Usage : $acc = $hit->locus(); + Function: Retrieve the locus(if available) for the hit + Returns : a scalar string (empty string if not set) + Args : none + +=cut + +sub locus { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 length + + Title : length + Usage : my $len = $hit->length + Function: Returns the length of the hit + Returns : integer + Args : none + +=cut + +sub length { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + + +=head2 algorithm + + Title : algorithm + Usage : $alg = $hit->algorithm(); + Function: Gets the algorithm specification that was used to obtain the hit + For BLAST, the algorithm denotes what type of sequence was aligned + against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated + dna-prt, TBLASTN prt-translated dna, TBLASTX translated + dna-translated dna). + Returns : a scalar string + Args : none + +=cut + +sub algorithm { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 raw_score + + Title : raw_score + Usage : $score = $hit->raw_score(); + Function: Gets the "raw score" generated by the algorithm. What + this score is exactly will vary from algorithm to algorithm, + returning undef if unavailable. + Returns : a scalar value + Args : none + +=cut + +sub raw_score { + $_[0]->throw_not_implemented; +} + +=head2 significance + + Title : significance + Usage : $significance = $hit->significance(); + Function: Used to obtain the E or P value of a hit, i.e. the probability that + this particular hit was obtained purely by random chance. If + information is not available (nor calculatable from other + information sources), return undef. + Returns : a scalar value or undef if unavailable + Args : none + +=cut + +sub significance { + $_[0]->throw_not_implemented; +} + +=head2 bits + + Usage : $hit_object->bits(); + Purpose : Gets the bit score of the best HSP for the current hit. + Example : $bits = $hit_object->bits(); + Returns : Integer or double for FASTA reports + Argument : n/a + Comments : For BLAST1, the non-bit score is listed in the summary line. + +See Also : L + +=cut + +#--------- +sub bits { +#--------- + $_[0]->throw_not_implemented(); +} + +=head2 next_hsp + + Title : next_hsp + Usage : while( $hsp = $obj->next_hsp()) { ... } + Function : Returns the next available High Scoring Pair + Example : + Returns : Bio::Search::HSP::HSPI object or null if finished + Args : none + +=cut + +sub next_hsp { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + + +=head2 hsps + + Usage : $hit_object->hsps(); + Purpose : Get a list containing all HSP objects. + : Get the numbers of HSPs for the current hit. + Example : @hsps = $hit_object->hsps(); + : $num = $hit_object->hsps(); # alternatively, use num_hsps() + Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects. + : Scalar context: integer (number of HSPs). + : (Equivalent to num_hsps()). + Argument : n/a. Relies on wantarray + Throws : Exception if the HSPs have not been collected. + +See Also : L, L + +=cut + +#--------- +sub hsps { +#--------- + my $self = shift; + + $self->throw_not_implemented(); +} + + + +=head2 num_hsps + + Usage : $hit_object->num_hsps(); + Purpose : Get the number of HSPs for the present Blast hit. + Example : $nhsps = $hit_object->num_hsps(); + Returns : Integer + Argument : n/a + Throws : Exception if the HSPs have not been collected. + +See Also : L + +=cut + +#------------- +sub num_hsps { +#------------- + shift->throw_not_implemented(); +} + + +=head2 seq_inds + + Usage : $hit->seq_inds( seq_type, class, collapse ); + Purpose : Get a list of residue positions (indices) across all HSPs + : for identical or conserved residues in the query or sbjct sequence. + Example : @s_ind = $hit->seq_inds('query', 'identical'); + : @h_ind = $hit->seq_inds('hit', 'conserved'); + : @h_ind = $hit->seq_inds('hit', 'conserved', 1); + Returns : Array of integers + : May include ranges if collapse is non-zero. + Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + : ('sbjct' is synonymous with 'hit') + : [1] class = 'identical' or 'conserved' (default = 'identical') + : (can be shortened to 'id' or 'cons') + : (actually, anything not 'id' will evaluate to 'conserved'). + : [2] collapse = boolean, if non-zero, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + +See Also : L + +=cut + +#------------- +sub seq_inds { +#------------- + my ($self, $seqType, $class, $collapse) = @_; + + $seqType ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + + $seqType = 'sbjct' if $seqType eq 'hit'; + + my (@inds, $hsp); + foreach $hsp ($self->hsps) { + # This will merge data for all HSPs together. + push @inds, $hsp->seq_inds($seqType, $class); + } + + # Need to remove duplicates and sort the merged positions. + if(@inds) { + my %tmp = map { $_, 1 } @inds; + @inds = sort {$a <=> $b} keys %tmp; + } + + $collapse ? &Bio::Search::BlastUtils::collapse_nums(@inds) : @inds; +} + +=head2 rewind + + Title : rewind + Usage : $hit->rewind; + Function: Allow one to reset the HSP iteration to the beginning + if possible + Returns : none + Args : none + +=cut + +sub rewind{ + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 iteration + + Usage : $hit->iteration( ); + Purpose : Gets the iteration number in which the Hit was found. + Example : $iteration_num = $sbjct->iteration(); + Returns : Integer greater than or equal to 1 + Non-PSI-BLAST reports will report iteration as 1, but this number + is only meaningful for PSI-BLAST reports. + Argument : none + Throws : none + +See Also : L + +=cut + +#---------------- +sub iteration { shift->throw_not_implemented } +#---------------- + +=head2 found_again + + Usage : $hit->found_again; + Purpose : Gets a boolean indicator whether or not the hit has + been found in a previous iteration. + This is only applicable to PSI-BLAST reports. + + This method indicates if the hit was reported in the + "Sequences used in model and found again" section of the + PSI-BLAST report or if it was reported in the + "Sequences not found previously or not previously below threshold" + section of the PSI-BLAST report. Only for hits in iteration > 1. + + Example : if( $sbjct->found_again()) { ... }; + Returns : Boolean (1 or 0) for PSI-BLAST report iterations greater than 1. + Returns undef for PSI-BLAST report iteration 1 and non PSI_BLAST + reports. + Argument : none + Throws : none + +See Also : L + +=cut + +#---------------- +sub found_again { shift->throw_not_implemented } +#---------------- + + +=head2 overlap + + Usage : $hit_object->overlap( [integer] ); + Purpose : Gets/Sets the allowable amount overlap between different HSP sequences. + Example : $hit_object->overlap(5); + : $overlap = $hit_object->overlap; + Returns : Integer. + Argument : integer. + Throws : n/a + Status : Experimental + Comments : Any two HSPs whose sequences overlap by less than or equal + : to the overlap() number of resides will be considered separate HSPs + : and will not get tiled by Bio::Search::BlastUtils::_adjust_contigs(). + +See Also : L, L + +=cut + +#------------- +sub overlap { shift->throw_not_implemented } + + +=head2 n + + Usage : $hit_object->n(); + Purpose : Gets the N number for the current Blast hit. + : This is the number of HSPs in the set which was ascribed + : the lowest P-value (listed on the description line). + : This number is not the same as the total number of HSPs. + : To get the total number of HSPs, use num_hsps(). + Example : $n = $hit_object->n(); + Returns : Integer + Argument : n/a + Throws : Exception if HSPs have not been set (BLAST2 reports). + Comments : Note that the N parameter is not reported in gapped BLAST2. + : Calling n() on such reports will result in a call to num_hsps(). + : The num_hsps() method will count the actual number of + : HSPs in the alignment listing, which may exceed N in + : some cases. + +See Also : L + +=cut + +#----- +sub n { shift->throw_not_implemented } + +=head2 p + + Usage : $hit_object->p( [format] ); + Purpose : Get the P-value for the best HSP of the given BLAST hit. + : (Note that P-values are not provided with NCBI Blast2 reports). + Example : $p = $sbjct->p; + : $p = $sbjct->p('exp'); # get exponent only. + : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts + Returns : Float or scientific notation number (the raw P-value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and P-value + : is in scientific notation (See Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (See Comments). + Throws : Warns if no P-value is defined. Uses expect instead. + Comments : Using the 'parts' argument is not recommended since it will not + : work as expected if the P-value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#-------- +sub p { shift->throw_not_implemented() } + +=head2 hsp + + Usage : $hit_object->hsp( [string] ); + Purpose : Get a single HSPI object for the present HitI object. + Example : $hspObj = $hit_object->hsp; # same as 'best' + : $hspObj = $hit_object->hsp('best'); + : $hspObj = $hit_object->hsp('worst'); + Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object. + Argument : String (or no argument). + : No argument (default) = highest scoring HSP (same as 'best'). + : 'best' or 'first' = highest scoring HSP. + : 'worst' or 'last' = lowest scoring HSP. + Throws : Exception if the HSPs have not been collected. + : Exception if an unrecognized argument is used. + +See Also : L, L() + +=cut + +#---------- +sub hsp { shift->throw_not_implemented } + +=head2 logical_length + + Usage : $hit_object->logical_length( [seq_type] ); + : (mostly intended for internal use). + Purpose : Get the logical length of the hit sequence. + : If the Blast is a TBLASTN or TBLASTX, the returned length + : is the length of the would-be amino acid sequence (length/3). + : For all other BLAST flavors, this function is the same as length(). + Example : $len = $hit_object->logical_length(); + Returns : Integer + Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This is important for functions like frac_aligned_query() + : which need to operate in amino acid coordinate space when dealing + : with [T]BLAST[NX] type reports. + +See Also : L, L, L + +=cut + +#-------------------- +sub logical_length { shift->throw_not_implemented() } + + +=head2 rank + + Title : rank + Usage : $obj->rank($newval) + Function: Get/Set the rank of this Hit in the Query search list + i.e. this is the Nth hit for a specific query + Returns : value of rank + Args : newvalue (optional) + + +=cut + +sub rank{ + my ($self,$value) = @_; + $self->throw_not_implemented(); +} + +=head2 each_accession_number + + Title : each_accession_number + Usage : $obj->each_accession_number + Function: Get each accession number listed in the description of the hit. + If there are no alternatives, then only the primary accession will + be given + Returns : list of all accession numbers in the description + Args : none + + +=cut + +sub each_accession_number{ + my ($self,$value) = @_; + $self->throw_not_implemented(); +} + + +=head2 tiled_hsps + + Usage : $hit_object->tiled_hsps( [integer] ); + Purpose : Gets/Sets an indicator for whether or not the HSPs in this Hit + : have been tiled. + : Methods that rely on HSPs being tiled should check this + : and then call SearchUtils::tile_hsps() if not. + Example : $hit_object->tiled_hsps(1); + : if( $hit_object->tiled_hsps ) { # do something } + Returns : Boolean (1 or 0) + Argument : integer (optional) + Throws : n/a + +=cut + +sub tiled_hsps { shift->throw_not_implemented } + + +=head2 strand + + Usage : $sbjct->strand( [seq_type] ); + Purpose : Gets the strand(s) for the query, sbjct, or both sequences + : in the best HSP of the BlastHit object after HSP tiling. + : Only valid for BLASTN, TBLASTX, BLASTX-query, TBLASTN-hit. + Example : $qstrand = $sbjct->strand('query'); + : $sstrand = $sbjct->strand('hit'); + : ($qstrand, $sstrand) = $sbjct->strand(); + Returns : scalar context: integer '1', '-1', or '0' + : array context without args: list of two strings (queryStrand, sbjctStrand) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') + ('sbjct' is synonymous with 'hit') + Throws : n/a + Comments : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : If you don't want the tiled data, iterate through each HSP + : calling strand() on each (use hsps() to get all HSPs). + : + : Formerly (prior to 10/21/02), this method would return the + : string "-1/1" for hits with HSPs on both strands. + : However, now that strand and frame is properly being accounted + : for during HSP tiling, it makes more sense for strand() + : to return the strand data for the best HSP after tiling. + : + : If you really want to know about hits on opposite strands, + : you should be iterating through the HSPs using methods on the + : HSP objects. + : + : A possible use case where knowing whether a hit has HSPs + : on both strands would be when filtering via SearchIO for hits with + : this property. However, in this case it would be better to have a + : dedicated method such as $hit->hsps_on_both_strands(). Similarly + : for frame. This could be provided if there is interest. + +See Also : B() + +=cut + +#---------' +sub strand { shift->throw_not_implemented } + + +=head2 frame + + Usage : $hit_object->frame(); + Purpose : Gets the reading frame for the best HSP after HSP tiling. + : This is only valid for BLASTX and TBLASTN/X type reports. + Example : $frame = $hit_object->frame(); + Returns : Integer (-2 .. +2) + Argument : n/a + Throws : Exception if HSPs have not been set. + Comments : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first automatically.. + : If you don't want the tiled data, iterate through each HSP + : calling frame() on each (use hsps() to get all HSPs). + +See Also : L + +=cut + +#---------' +sub frame { shift->throw_not_implemented } + + +=head2 matches + + Usage : $hit_object->matches( [class] ); + Purpose : Get the total number of identical or conserved matches + : (or both) across all HSPs. + : (Note: 'conservative' matches are indicated as 'positives' + : in BLAST reports.) + Example : ($id,$cons) = $hit_object->matches(); # no argument + : $id = $hit_object->matches('id'); + : $cons = $hit_object->matches('cons'); + Returns : Integer or a 2-element array of integers + Argument : class = 'id' | 'cons' OR none. + : If no argument is provided, both identical and conservative + : numbers are returned in a two element list. + : (Other terms can be used to refer to the conservative + : matches, e.g., 'positive'. All that is checked is whether or + : not the supplied string starts with 'id'. If not, the + : conservative matches are returned.) + Throws : Exception if the requested data cannot be obtained. + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first automatically.. + : + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : Does not rely on wantarray to return a list. Only checks for + : the presence of an argument (no arg = return list). + +See Also : L, L + +=cut + +sub matches { shift->throw_not_implemented } + +1; + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Processor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Processor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,115 @@ + +# +# BioPerl module for Bio::Search::Processor +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Processor - DESCRIPTION of Object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Search::Processor; + +use strict; +use vars qw(@ISA); + +=head2 new + + Title : new + Usage : $proc = new Bio::Search::Processor -file => $filename, + -algorithm => 'Algorithm' ; + Function: Used to specify and initialize a data processor of search + algorithm results. + Returns : A processor specific to the algorithm type, if it exists. + Args : -file => filename + -algorithm => algorithm specifier + -fh => filehandle to attach to (file or fh required) + +=cut + +sub new { + + my $type = shift; + my $proc; + my ($module, $load, $algorithm); + + my %args = @_; + + exists $args{'-algorithm'} or do { + print STDERR "Must supply an algorithm!"; + return undef; + }; + + $algorithm = $args{'-algorithm'} || $args{'-ALGORITHM'}; + + $module = "_new(@_); + return $proc; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Result/BlastResult.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Result/BlastResult.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,716 @@ +#----------------------------------------------------------------- +# $Id: BlastResult.pm,v 1.13 2002/12/24 15:48:41 jason Exp $ +# +# BioPerl module Bio::Search::Result::BlastResult +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Result::BlastResult - A top-level BLAST Report object + +=head1 SYNOPSIS + +The construction of BlastResult objects is performed by +by the B parser. +Therefore, you do not need to +use B) directly. If you need to construct +BlastHits directly, see the new() function for details. + +For B BLAST parsing usage examples, see the +B directory of the Bioperl distribution. + +=head1 DESCRIPTION + +This module supports BLAST versions 1.x and 2.x, gapped and ungapped, +and PSI-BLAST. + +=head1 DEPENDENCIES + +Bio::Search::Result::BlastResult.pm is a concrete class that inherits from B and B. It relies on two other modules: + +=over 4 + +=item B + +Encapsulates a single a single BLAST hit. + +=item B + +Provides an interface to a blast database metadata. + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 ACKNOWLEDGEMENTS + +This software was originally developed in the Department of Genetics +at Stanford University. I would also like to acknowledge my +colleagues at Affymetrix for useful feedback. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +=cut + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Search::Result::BlastResult; + +use strict; + +use Bio::Search::Result::ResultI; +use Bio::Root::Root; + +use overload + '""' => \&to_string; + +use vars qw(@ISA $Revision ); + +$Revision = '$Id: BlastResult.pm,v 1.13 2002/12/24 15:48:41 jason Exp $'; #' +@ISA = qw( Bio::Root::Root Bio::Search::Result::ResultI); + +#---------------- +sub new { +#---------------- + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + +#sub DESTROY { +# my $self = shift; +# print STDERR "->DESTROYING $self\n"; +#} + + +#================================================= +# Begin Bio::Search::Result::ResultI implementation +#================================================= + +=head2 next_hit + +See L for documentation + +=cut + +#---------------- +sub next_hit { +#---------------- + my ($self) = @_; + + unless(defined $self->{'_hit_queue'}) { + $self->{'_hit_queue'} = [$self->hits()]; + } + + shift @{$self->{'_hit_queue'}}; +} + +=head2 query_name + +See L for documentation + +=cut + +#---------------- +sub query_name { +#---------------- + my $self = shift; + if (@_) { + my $name = shift; + $name =~ s/^\s+|(\s+|,)$//g; + $self->{'_query_name'} = $name; + } + return $self->{'_query_name'}; +} + +=head2 query_length + +See L for documentation + +=cut + +#---------------- +sub query_length { +#---------------- + my $self = shift; + if(@_) { $self->{'_query_length'} = shift; } + return $self->{'_query_length'}; +} + +=head2 query_description + +See L for documentation + +=cut + +#---------------- +sub query_description { +#---------------- + my $self = shift; + if(@_) { + my $desc = shift; + defined $desc && $desc =~ s/(^\s+|\s+$)//g; + # Remove duplicated ID at beginning of description string + defined $desc && $desc =~ s/^$self->{'_query_name'}//o; + $self->{'_query_query_desc'} = $desc || ''; + } + return $self->{'_query_query_desc'}; +} + + +=head2 analysis_method + +See L for documentation + +This implementation ensures that the name matches /blast/i. + +=cut + +#---------------- +sub analysis_method { +#---------------- + my ($self, $method) = @_; + if($method ) { + if( $method =~ /blast/i) { + $self->{'_analysis_prog'} = $method; + } else { + $self->throw("method $method not supported in " . ref($self)); + } + } + return $self->{'_analysis_prog'}; +} + +=head2 analysis_method_version + +See L for documentation + +=cut + +#---------------- +sub analysis_method_version { +#---------------- + my ($self, $version) = @_; + if($version) { + $self->{'_analysis_progVersion'} = $version; + } + return $self->{'_analysis_progVersion'}; +} + + +=head2 analysis_query + +See L for documentation + +=cut + +#---------------- +sub analysis_query { +#---------------- + + my ($self) = @_; + if(not defined $self->{'_analysis_query'}) { + require Bio::PrimarySeq; + my $moltype = $self->analysis_method =~ /blastp|tblastn/i ? 'protein' : 'dna'; + $self->{'_analysis_query'} = Bio::PrimarySeq->new( -display_id => $self->query_name, + -desc => $self->query_description, + -moltype => $moltype + ); + $self->{'_analysis_query'}->length( $self->query_length ); + } + return $self->{'_analysis_query'}; +} + +=head2 analysis_subject + + Usage : $blastdb = $result->analyis_subject(); + Purpose : Get a Bio::Search::DatabaseI object containing + information about the database used in the BLAST analysis. + Returns : Bio::Search::DatabaseI object. + Argument : n/a + +=cut + +#--------------- +sub analysis_subject { +#--------------- + my ($self, $blastdb) = @_; + if($blastdb) { + if( ref $blastdb and $blastdb->isa('Bio::Search::DatabaseI')) { + $self->{'_analysis_sbjct'} = $blastdb; + } + else { + $self->throw(-class =>'Bio::Root::BadParameter', + -text => "Can't set BlastDB: not a Bio::Search::DatabaseI $blastdb" + ); + } + } + return $self->{'_analysis_sbjct'}; +} + +=head2 next_feature + + Title : next_feature + Usage : while( my $feat = $blast_result->next_feature ) { # do something } + Function: Returns the next feature available in the analysis result, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI compliant object, in this case, + each Bio::Search::HSP::BlastHSP object within each BlastHit. + Args : None + +=cut + +#--------------- +sub next_feature{ +#--------------- + my ($self) = @_; + my ($hit, $hsp); + $hit = $self->{'_current_hit'}; + unless( defined $hit ) { + $hit = $self->{'_current_hit'} = $self->next_hit; + return undef unless defined $hit; + } + $hsp = $hit->next_hsp; + unless( defined $hsp ) { + $self->{'_current_hit'} = undef; + return $self->next_feature; + } + return $hsp || undef; +} + + +sub algorithm { shift->analysis_method( @_ ); } +sub algorithm_version { shift->analysis_method_version( @_ ); } + +=head2 available_parameters + + Title : available_parameters + Usage : my @params = $report->available_paramters + Function: Returns the names of the available parameters + Returns : Return list of available parameters used for this report + Args : none + +=cut + +sub available_parameters{ + return (); +} + + +=head2 get_parameter + + Title : get_parameter + Usage : my $gap_ext = $report->get_parameter('gapext') + Function: Returns the value for a specific parameter used + when running this report + Returns : string + Args : name of parameter (string) + +=cut + +sub get_parameter{ + return ''; +} + +=head2 get_statistic + + Title : get_statistic + Usage : my $gap_ext = $report->get_statistic('kappa') + Function: Returns the value for a specific statistic available + from this report + Returns : string + Args : name of statistic (string) + +=cut + +sub get_statistic{ + return ''; +} + +=head2 available_statistics + + Title : available_statistics + Usage : my @statnames = $report->available_statistics + Function: Returns the names of the available statistics + Returns : Return list of available statistics used for this report + Args : none + +=cut + +sub available_statistics{ + return (); +} + +#================================================= +# End Bio::Search::Result::ResultI implementation +#================================================= + + +=head2 to_string + + Title : to_string + Usage : print $blast->to_string; + Function: Returns a string representation for the Blast result. + Primarily intended for debugging purposes. + Example : see usage + Returns : A string of the form: + [BlastResult] query= db=analysis_method . " query=" . $self->query_name . " " . $self->query_description .", db=" . $self->database_name; + return $str; +} + +#--------------- +sub database_name { +#--------------- + my $self = shift; + my $dbname = ''; + if( ref $self->analysis_subject) { + $dbname = $self->analysis_subject->name; + } + return $dbname; +} + +=head2 database_entries + + Title : database_entries + Usage : $num_entries = $result->database_entries() + Function: Used to obtain the number of entries contained in the database. + Returns : a scalar integer representing the number of entities in the database + or undef if the information was not available. + Args : [optional] new integer for the number of sequence entries in the db + + +=cut + +#--------------- +sub database_entries { +#--------------- + my $self = shift; + my $dbentries = ''; + if( ref $self->analysis_subject) { + $dbentries = $self->analysis_subject->entries; + } + return $dbentries; +} + + +=head2 database_letters + + Title : database_letters + Usage : $size = $result->database_letters() + Function: Used to obtain the size of database that was searched against. + Returns : a scalar integer (units specific to algorithm, but probably the + total number of residues in the database, if available) or undef if + the information was not available to the Processor object. + Args : [optional] new scalar integer for number of letters in db + + +=cut + +#--------------- +sub database_letters { +#--------------- + my $self = shift; + my $dbletters = ''; + if( ref $self->analysis_subject) { + $dbletters = $self->analysis_subject->letters; + } + return $dbletters; +} + +#--------------- +sub hits { +#--------------- + my $self = shift; + my @hits = (); + if( ref $self->{'_hits'}) { + @hits = @{$self->{'_hits'}}; + } + return @hits; +} + +=head2 add_hit + + Usage : $blast->add_hit( $hit ); + Purpose : Adds a hit object to the collection of hits in this BLAST result. + Returns : n/a + Argument : A Bio::Search::Hit::HitI object + Comments : For PSI-BLAST, hits from all iterations are lumped together. + For any given hit, you can determine the iteration in which it was + found by checking $hit->iteration(). + +=cut + +#--------------- +sub add_hit { +#--------------- + my ($self, $hit) = @_; + my $add_it = 1; + unless( ref $hit and $hit->isa('Bio::Search::Hit::HitI')) { + $add_it = 0; + $self->throw(-class =>'Bio::Root::BadParameter', + -text => "Can't add hit: not a Bio::Search::Hit::HitI: $hit" + ); + } + + # Avoid adding duplicate hits if we're doing multiple iterations (PSI-BLAST) +# if( $self->iterations > 1 ) { +# my $hit_name = $hit->name; +# if( grep $hit_name eq $_, @{$self->{'_hit_names'}}) { +# $add_it = 0; +# } +# } + + if( $add_it ) { + push @{$self->{'_hits'}}, $hit; + push @{$self->{'_hit_names'}}, $hit->name; + } +} + + +=head2 is_signif + + Usage : $blast->is_signif(); + Purpose : Determine if the BLAST report contains significant hits. + Returns : Boolean + Argument : n/a + Comments : BLAST reports without significant hits but with defined + : significance criteria will throw exceptions during construction. + : This obviates the need to check significant() for + : such objects. + +=cut + +#------------ +sub is_signif { my $self = shift; return $self->{'_is_significant'}; } +#------------ + + +=head2 matrix + + Usage : $blast_object->matrix(); + Purpose : Get the name of the scoring matrix used. + : This is extracted from the report. + Argument : n/a + Returns : string or undef if not defined + Comments : TODO: Deprecate this and implement get_parameter('matrix'). + +=cut + +#------------ +sub matrix { +#------------ + my $self = shift; + if(@_) { + $self->{'_matrix'} = shift; + } + $self->{'_matrix'}; +} + + +=head2 raw_statistics + + Usage : @stats = $blast_result->raw_statistics(); + Purpose : Get the raw, unparsed statistical parameter section of the Blast report. + This is the section at the end after the last HSP alignment. + Argument : n/a + Returns : Array of strings + +=cut + +#------------ +sub raw_statistics { +#------------ + my $self = shift; + if(@_) { + my $params = shift; + if( ref $params eq 'ARRAY') { + $self->{'_raw_statistics'} = $params; + } + else { + $self->throw(-class =>'Bio::Root::BadParameter', + -text => "Can't set statistical params: not an ARRAY ref: $params" + ); + } + } + if(not defined $self->{'_raw_statistics'}) { + $self->{'_raw_statistics'} = []; + } + + @{$self->{'_raw_statistics'}}; +} + + + +=head2 no_hits_found + + Usage : $nohits = $blast->no_hits_found( [iteration_number] ); + Purpose : Get boolean indicator indicating whether or not any hits + were present in the report. + + This is NOT the same as determining the number of hits via + the hits() method, which will return zero hits if there were no + hits in the report or if all hits were filtered out during the parse. + + Thus, this method can be used to distinguish these possibilities + for hitless reports generated when filtering. + + Returns : Boolean + Argument : (optional) integer indicating the iteration number (PSI-BLAST) + If iteration number is not specified and this is a PSI-BLAST result, + then this method will return true only if all iterations had + no hits found. + +=cut + +#----------- +sub no_hits_found { +#----------- + my ($self, $round) = @_; + + my $result = 0; # final return value of this method. + # Watch the double negative! + # result = 0 means "yes hits were found" + # result = 1 means "no hits were found" (for the indicated iteration or all iterations) + + # If a iteration was not specified and there were multiple iterations, + # this method should return true only if all iterations had no hits found. + if( not defined $round ) { + if( $self->{'_iterations'} > 1) { + $result = 1; + foreach my $i( 1..$self->{'_iterations'} ) { + if( not defined $self->{"_iteration_$i"}->{'_no_hits_found'} ) { + $result = 0; + last; + } + } + } + else { + $result = $self->{"_iteration_1"}->{'_no_hits_found'}; + } + } + else { + $result = $self->{"_iteration_$round"}->{'_no_hits_found'}; + } + + return $result; +} + + +=head2 set_no_hits_found + + Usage : $blast->set_no_hits_found( [iteration_number] ); + Purpose : Set boolean indicator indicating whether or not any hits + were present in the report. + Returns : n/a + Argument : (optional) integer indicating the iteration number (PSI-BLAST) + +=cut + +#----------- +sub set_no_hits_found { +#----------- + my ($self, $round) = @_; + $round ||= 1; + $self->{"_iteration_$round"}->{'_no_hits_found'} = 1; +} + + +=head2 iterations + + Usage : $num_iterations = $blast->iterations; (get) + $blast->iterations($num_iterations); (set) + Purpose : Set/get the number of iterations in the Blast Report (PSI-BLAST). + Returns : Total number of iterations in the report + Argument : integer (when setting) + +=cut + +#---------------- +sub iterations { +#---------------- + my ($self, $num ) = @_; + if( defined $num ) { + $self->{'_iterations'} = $num; + } + return $self->{'_iterations'}; +} + + +=head2 psiblast + + Usage : if( $blast->psiblast ) { ... } + Purpose : Set/get a boolean indicator whether or not the report + is a PSI-BLAST report. + Returns : 1 if PSI-BLAST, undef if not. + Argument : 1 (when setting) + +=cut + +#---------------- +sub psiblast { +#---------------- + my ($self, $val ) = @_; + if( $val ) { + $self->{'_psiblast'} = 1; + } + return $self->{'_psiblast'}; +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Result/GenericResult.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Result/GenericResult.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,761 @@ +# $Id: GenericResult.pm,v 1.15 2002/12/05 13:46:34 heikki Exp $ +# +# BioPerl module for Bio::Search::Result::GenericResult +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Result::GenericResult - Generic Implementation of Bio::Search::Result::ResultI interface applicable to most search results. + +=head1 SYNOPSIS + + + # typically one gets Results from a SearchIO stream + use Bio::SearchIO; + my $io = new Bio::SearchIO(-format => 'blast', + -file => 't/data/HUMBETGLOA.tblastx'); + while( my $result = $io->next_result) { + # process all search results within the input stream + while( my $hit = $result->next_hits()) { + # insert code here for hit processing + } + } + + use Bio::Search::Result::GenericResult; + my @hits = (); # would be a list of Bio::Search::Hit::HitI objects + # typically these are created from a Bio::SearchIO stream + my $result = new Bio::Search::Result::GenericResult + ( -query_name => 'HUMBETGLOA', + -query_accession => '' + -query_description => 'Human haplotype C4 beta-globin gene, complete cds.' + -query_length => 3002 + -database_name => 'ecoli.aa' + -database_letters => 4662239, + -database_entries => 400, + -parameters => { 'e' => '0.001' }, + -statistics => { 'kappa' => 0.731 }, + -algorithm => 'blastp', + -algorithm_version => '2.1.2', + ); + + my $id = $result->query_name(); + + my $desc = $result->query_description(); + + my $name = $result->database_name(); + + my $size = $result->database_letters(); + + my $num_entries = $result->database_entries(); + + my $gap_ext = $result->get_parameter('e'); + + my @params = $result->available_parameters; + + my $kappa = $result->get_statistic('kappa'); + + my @statnames = $result->available_statistics; + + + +=head1 DESCRIPTION + +This object is an implementation of the Bio::Search::Result::ResultI +interface and provides a generic place to store results from a +sequence database search. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich and Steve Chervitz + +Email jason@bioperl.org +Email sac@bioperl.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Result::GenericResult; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Search::Result::ResultI; + +use overload + '""' => \&to_string; + +@ISA = qw(Bio::Root::Root Bio::Search::Result::ResultI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Result::GenericResult(); + Function: Builds a new Bio::Search::Result::GenericResult object + Returns : Bio::Search::Result::GenericResult + Args : -query_name => Name of query Sequence + -query_accession => Query accession number (if available) + -query_description => Description of query sequence + -query_length => Length of query sequence + -database_name => Name of database + -database_letters => Number of residues in database + -database_entries => Number of entries in database + -parameters => hash ref of search parameters (key => value) + -statistics => hash ref of search statistics (key => value) + -algorithm => program name (blastx) + -algorithm_version => version of the algorithm (2.1.2) + -algorithm_reference => literature reference string for this algorithm + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_hits'} = []; + $self->{'_hitindex'} = 0; + $self->{'_statistics'} = {}; + $self->{'_parameters'} = {}; + + my ($qname,$qacc,$qdesc,$qlen, + $dbname,$dblet,$dbent,$params, + $stats, $hits, $algo, $algo_v, + $prog_ref, $algo_r) = $self->_rearrange([qw(QUERY_NAME + QUERY_ACCESSION + QUERY_DESCRIPTION + QUERY_LENGTH + DATABASE_NAME + DATABASE_LETTERS + DATABASE_ENTRIES + PARAMETERS + STATISTICS + HITS + ALGORITHM + ALGORITHM_VERSION + PROGRAM_REFERENCE + ALGORITHM_REFERENCE + )],@args); + + $algo_r ||= $prog_ref; + defined $algo && $self->algorithm($algo); + defined $algo_v && $self->algorithm_version($algo_v); + defined $algo_r && $self->algorithm_reference($algo_r); + + defined $qname && $self->query_name($qname); + defined $qacc && $self->query_accession($qacc); + defined $qdesc && $self->query_description($qdesc); + defined $qlen && $self->query_length($qlen); + defined $dbname && $self->database_name($dbname); + defined $dblet && $self->database_letters($dblet); + defined $dbent && $self->database_entries($dbent); + + if( defined $params ) { + if( ref($params) !~ /hash/i ) { + $self->throw("Must specify a hash reference with the the parameter '-parameters"); + } + while( my ($key,$value) = each %{$params} ) { + $self->add_parameter($key,$value); + } + } + if( defined $stats ) { + if( ref($stats) !~ /hash/i ) { + $self->throw("Must specify a hash reference with the the parameter '-statistics"); + } + while( my ($key,$value) = each %{$stats} ) { + $self->add_statistic($key,$value); + } + } + + if( defined $hits ) { + $self->throw("Must define arrayref of Hits when initializing a $class\n") unless ref($hits) =~ /array/i; + + foreach my $s ( @$hits ) { + $self->add_hit($s); + } + } + return $self; +} + +=head2 algorithm + + Title : algorithm + Usage : my $r_type = $hsp->algorithm + Function: Obtain the name of the algorithm used to obtain the Result + Returns : string (e.g., BLASTP) + Args : [optional] scalar string to set value + +=cut + +sub algorithm{ + my ($self,$value) = @_; + my $previous = $self->{'_algorithm'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_algorithm'} = $value; + } + return $previous; +} + +=head2 algorithm_version + + Title : algorithm_version + Usage : my $r_version = $hsp->algorithm_version + Function: Obtain the version of the algorithm used to obtain the Result + Returns : string (e.g., 2.1.2) + Args : [optional] scalar string to set algorithm version value + +=cut + +sub algorithm_version{ + my ($self,$value) = @_; + my $previous = $self->{'_algorithm_version'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_algorithm_version'} = $value; + } + + return $previous; +} + +=head2 Bio::Search::Result::ResultI interface methods + +Bio::Search::Result::ResultI implementation + +=head2 next_hit + + Title : next_hit + Usage : while( $hit = $result->next_hit()) { ... } + Function: Returns the next available Hit object, representing potential + matches between the query and various entities from the database. + Returns : a Bio::Search::Hit::HitI object or undef if there are no more. + Args : none + + +=cut + +sub next_hit { + my ($self,@args) = @_; + my $index = $self->_nexthitindex; + return undef if $index > scalar @{$self->{'_hits'}}; + return $self->{'_hits'}->[$index]; +} + +=head2 query_name + + Title : query_name + Usage : $id = $result->query_name(); + Function: Get the string identifier of the query used by the + algorithm that performed the search. + Returns : a string. + Args : [optional] new string value for query name + +=cut + +sub query_name { + my ($self,$value) = @_; + my $previous = $self->{'_queryname'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_queryname'} = $value; + } + return $previous; +} + +=head2 query_accession + + Title : query_accession + Usage : $id = $result->query_accession(); + Function: Get the accession (if available) for the query sequence + Returns : a string + Args : [optional] new string value for accession + +=cut + +sub query_accession { + my ($self,$value) = @_; + my $previous = $self->{'_queryacc'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_queryacc'} = $value; + } + return $previous; +} + +=head2 query_length + + Title : query_length + Usage : $id = $result->query_length(); + Function: Get the length of the query sequence + used in the search. + Returns : a number + Args : [optional] new integer value for query length + +=cut + +sub query_length { + my ($self,$value) = @_; + my $previous = $self->{'_querylength'}; + if( defined $value || ! defined $previous ) { + $value = $previous = 0 unless defined $value; + $self->{'_querylength'} = $value; + } + return $previous; +} + +=head2 query_description + + Title : query_description + Usage : $id = $result->query_description(); + Function: Get the description of the query sequence + used in the search. + Returns : a string + Args : [optional] new string for the query description + +=cut + +sub query_description { + my ($self,$value) = @_; + my $previous = $self->{'_querydesc'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_querydesc'} = $value; + } + return $previous; +} + + +=head2 database_name + + Title : database_name + Usage : $name = $result->database_name() + Function: Used to obtain the name of the database that the query was searched + against by the algorithm. + Returns : a scalar string + Args : [optional] new string for the db name + +=cut + +sub database_name { + my ($self,$value) = @_; + my $previous = $self->{'_dbname'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_dbname'} = $value; + } + return $previous; +} + +=head2 database_letters + + Title : database_letters + Usage : $size = $result->database_letters() + Function: Used to obtain the size of database that was searched against. + Returns : a scalar integer (units specific to algorithm, but probably the + total number of residues in the database, if available) or undef if + the information was not available to the Processor object. + Args : [optional] new scalar integer for number of letters in db + + +=cut + +sub database_letters { + my ($self,$value) = @_; + my $previous = $self->{'_dbletters'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_dbletters'} = $value; + } + return $previous; +} + +=head2 database_entries + + Title : database_entries + Usage : $num_entries = $result->database_entries() + Function: Used to obtain the number of entries contained in the database. + Returns : a scalar integer representing the number of entities in the database + or undef if the information was not available. + Args : [optional] new integer for the number of sequence entries in the db + + +=cut + +sub database_entries { + my ($self,$value) = @_; + my $previous = $self->{'_dbentries'}; + if( defined $value || ! defined $previous ) { + $value = $previous = '' unless defined $value; + $self->{'_dbentries'} = $value; + } + return $previous; +} + +=head2 get_parameter + + Title : get_parameter + Usage : my $gap_ext = $report->get_parameter('gapext') + Function: Returns the value for a specific parameter used + when running this report + Returns : string + Args : name of parameter (string) + +=cut + +sub get_parameter{ + my ($self,$name) = @_; + return $self->{'_parameters'}->{$name}; +} + +=head2 available_parameters + + Title : available_parameters + Usage : my @params = $report->available_paramters + Function: Returns the names of the available parameters + Returns : Return list of available parameters used for this report + Args : none + +=cut + +sub available_parameters{ + my ($self) = @_; + return keys %{$self->{'_parameters'}}; +} + + +=head2 get_statistic + + Title : get_statistic + Usage : my $gap_ext = $report->get_statistic('kappa') + Function: Returns the value for a specific statistic available + from this report + Returns : string + Args : name of statistic (string) + +=cut + +sub get_statistic{ + my ($self,$key) = @_; + return $self->{'_statistics'}->{$key}; +} + +=head2 available_statistics + + Title : available_statistics + Usage : my @statnames = $report->available_statistics + Function: Returns the names of the available statistics + Returns : Return list of available statistics used for this report + Args : none + +=cut + +sub available_statistics{ + my ($self) = @_; + return keys %{$self->{'_statistics'}}; +} + +=head2 Bio::Search::Report + +Bio::Search::Result::GenericResult specific methods + +=head2 add_hit + + Title : add_hit + Usage : $report->add_hit($hit) + Function: Adds a HitI to the stored list of hits + Returns : Number of HitI currently stored + Args : Bio::Search::Hit::HitI + +=cut + +sub add_hit { + my ($self,$s) = @_; + if( $s->isa('Bio::Search::Hit::HitI') ) { + push @{$self->{'_hits'}}, $s; + } else { + $self->warn("Passed in " .ref($s). + " as a Hit which is not a Bio::Search::HitI... skipping"); + } + return scalar @{$self->{'_hits'}}; +} + + +=head2 rewind + + Title : rewind + Usage : $result->rewind; + Function: Allow one to reset the Hit iteration to the beginning + Since this is an in-memory implementation + Returns : none + Args : none + +=cut + +sub rewind{ + my ($self) = @_; + $self->{'_hitindex'} = 0; +} + + +=head2 _nexthitindex + + Title : _nexthitindex + Usage : private + +=cut + +sub _nexthitindex{ + my ($self,@args) = @_; + return $self->{'_hitindex'}++; +} + + + +=head2 add_parameter + + Title : add_parameter + Usage : $report->add_parameter('gapext', 11); + Function: Adds a parameter + Returns : none + Args : key - key value name for this parama + value - value for this parameter + +=cut + +sub add_parameter{ + my ($self,$key,$value) = @_; + $self->{'_parameters'}->{$key} = $value; +} + + +=head2 add_statistic + + Title : add_statistic + Usage : $report->add_statistic('lambda', 2.3); + Function: Adds a parameter + Returns : none + Args : key - key value name for this parama + value - value for this parameter + +=cut + +sub add_statistic { + my ($self,$key,$value) = @_; + $self->{'_statistics'}->{$key} = $value; + return; +} + + +=head2 num_hits + + Title : num_hits + Usage : my $hitcount= $result->num_hits + Function: returns the number of hits for this query result + Returns : integer + Args : none + + +=cut + +sub num_hits{ + my ($self) = shift; + if (not defined $self->{'_hits'}) { + $self->throw("Can't get Hits: data not collected."); + } + return scalar(@{$self->{'_hits'}}); +} + + +=head2 hits + + Title : hits + Usage : my @hits = $result->hits + Function: Returns the available hits for this Result + Returns : Array of L objects + Args : none + + +=cut + +sub hits{ + my ($self) = shift; + my @hits = (); + if( ref $self->{'_hits'}) { + @hits = @{$self->{'_hits'}}; + } + return @hits; +} + +=head2 algorithm_reference + + Title : algorithm_reference + Usage : $obj->algorithm_reference($newval) + Function: + Returns : string containing literature reference for the algorithm + Args : newvalue string (optional) + Comments: Formerly named program_reference(), which is still supported + for backwards compatibility. + +=cut + +sub algorithm_reference{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'algorithm_reference'} = $value; + } + return $self->{'algorithm_reference'}; +} + + +sub program_reference { shift->algorithm_reference(@_); } + + +=head2 no_hits_found + +See documentation in L + +=cut + +#----------- +sub no_hits_found { +#----------- + my ($self, $round) = @_; + + my $result = 0; # final return value of this method. + # Watch the double negative! + # result = 0 means "yes hits were found" + # result = 1 means "no hits were found" (for the indicated iteration or all iterations) + + # If a iteration was not specified and there were multiple iterations, + # this method should return true only if all iterations had no hits found. + if( not defined $round ) { + if( $self->{'_iterations'} > 1) { + $result = 1; + foreach my $i( 1..$self->{'_iterations'} ) { + if( not defined $self->{"_iteration_$i"}->{'_no_hits_found'} ) { + $result = 0; + last; + } + } + } + else { + $result = $self->{"_iteration_1"}->{'_no_hits_found'}; + } + } + else { + $result = $self->{"_iteration_$round"}->{'_no_hits_found'}; + } + + return $result; +} + + +=head2 set_no_hits_found + +See documentation in L + +=cut + +#----------- +sub set_no_hits_found { +#----------- + my ($self, $round) = @_; + $round ||= 1; + $self->{"_iteration_$round"}->{'_no_hits_found'} = 1; +} + + +=head2 iterations + +See documentation in L + +=cut + +#---------------- +sub iterations { +#---------------- + my ($self, $num ) = @_; + if( defined $num ) { + $self->{'_iterations'} = $num; + } + return $self->{'_iterations'}; +} + + +=head2 psiblast + +See documentation in L + +=cut + +#---------------- +sub psiblast { +#---------------- + my ($self, $val ) = @_; + if( $val ) { + $self->{'_psiblast'} = 1; + } + return $self->{'_psiblast'}; +} + + +=head2 to_string + + Title : to_string + Usage : print $blast->to_string; + Function: Returns a string representation for the Blast result. + Primarily intended for debugging purposes. + Example : see usage + Returns : A string of the form: + [GenericResult] query= db=algorithm . " query=" . $self->query_name . " " . $self->query_description .", db=" . $self->database_name; + return $str; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Result/HMMERResult.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Result/HMMERResult.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,445 @@ +# $Id: HMMERResult.pm,v 1.3 2002/10/22 07:45:18 lapp Exp $ +# +# BioPerl module for Bio::Search::Result::HMMERResult +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Result::HMMERResult - A Result object for HMMER results + +=head1 SYNOPSIS + + use Bio::Search::Result::HMMERResult; + my $result = new Bio::Search::Result::HMMERResult + ( -hmm_name => 'pfam', + -sequence_file => 'roa1.pep', + -hits => \@hits); + + # generally we use Bio::SearchIO to build these objects + use Bio::SearchIO; + my $in = new Bio::SearchIO(-format => 'hmmer', + -file => 'result.hmmer'); + while( my $result = $in->next_result ) { + print $result->query_name, " ", $result->algorithm, " ", $result->num_hits(), " hits\n"; + } + +=head1 DESCRIPTION + +This is a specialization of L. +There are a few extra methods, specifically L, +L, L, and L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Result::HMMERResult; +use vars qw(@ISA); +use strict; + +use Bio::Search::Result::GenericResult; + + +@ISA = qw(Bio::Search::Result::GenericResult ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Result::HMMERResult(); + Function: Builds a new Bio::Search::Result::HMMERResult object + Returns : Bio::Search::Result::HMMERResult + Args : -hmm_name => string, name of hmm file + -sequence_file => name of the sequence file + +plus Bio::Search::Result::GenericResult parameters + + -query_name => Name of query Sequence + -query_accession => Query accession number (if available) + -query_description => Description of query sequence + -query_length => Length of query sequence + -database_name => Name of database + -database_letters => Number of residues in database + -database_entries => Number of entries in database + -parameters => hash ref of search parameters (key => value) + -statistics => hash ref of search statistics (key => value) + -algorithm => program name (blastx) + -algorithm_version => version of the algorithm (2.1.2) + -program_reference => literature reference string for this algorithm + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($hmm,$seqfile) = $self->_rearrange([qw(HMM_NAME SEQUENCE_FILE)], + @args); + + defined( $seqfile) && $self->sequence_file($seqfile); + defined( $hmm) && $self->hmm_name($hmm); + + return $self; +} + + +=head2 hmm_name + + Title : hmm_name + Usage : $obj->hmm_name($newval) + Function: Get/Set the value of hmm_name + Returns : value of hmm_name + Args : newvalue (optional) + + +=cut + +sub hmm_name{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_hmm_name'} = $value; + } + return $self->{'_hmm_name'}; +} + + +=head2 sequence_file + + Title : sequence_file + Usage : $obj->sequence_file($newval) + Function: Get/Set the value of sequence_file + Returns : value of sequence_file + Args : newvalue (optional) + + +=cut + +sub sequence_file{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_sequence_file'} = $value; + } + return $self->{'_sequence_file'}; + +} + + +=head2 next_model + + Title : next_model + Usage : my $domain = $result->next_model + Function: Returns the next domain - this + is an alias for next_hit + Returns : L object + Args : none + + +=cut + +sub next_model{ shift->next_hit } + +=head2 models + + Title : models + Usage : my @domains = $result->models; + Function: Returns the list of HMM models seen - this + is an alias for hits() + Returns : Array of L objects + Args : none + + +=cut + +sub models{ shift->hits } + +=head2 Bio::Search::Result::GenericResult inherited methods + +=cut + +=head2 algorithm + + Title : algorithm + Usage : my $r_type = $hsp->algorithm + Function: Obtain the name of the algorithm used to obtain the Result + Returns : string (e.g., BLASTP) + Args : [optional] scalar string to set value + +=cut + +=head2 algorithm_version + + Title : algorithm_version + Usage : my $r_version = $hsp->algorithm_version + Function: Obtain the version of the algorithm used to obtain the Result + Returns : string (e.g., 2.1.2) + Args : [optional] scalar string to set algorithm version value + +=cut + +=head2 Bio::Search::Result::ResultI interface methods + +Bio::Search::Result::ResultI implementation + +=head2 next_hit + + Title : next_hit + Usage : while( $hit = $result->next_hit()) { ... } + Function: Returns the next available Hit object, representing potential + matches between the query and various entities from the database. + Returns : a Bio::Search::Hit::HitI object or undef if there are no more. + Args : none + + +=cut + +=head2 query_name + + Title : query_name + Usage : $id = $result->query_name(); + Function: Get the string identifier of the query used by the + algorithm that performed the search. + Returns : a string. + Args : [optional] new string value for query name + +=cut + +=head2 query_accession + + Title : query_accession + Usage : $id = $result->query_accession(); + Function: Get the accession (if available) for the query sequence + Returns : a string + Args : [optional] new string value for accession + +=cut + +=head2 query_length + + Title : query_length + Usage : $id = $result->query_length(); + Function: Get the length of the query sequence + used in the search. + Returns : a number + Args : [optional] new integer value for query length + +=cut + +=head2 query_description + + Title : query_description + Usage : $id = $result->query_description(); + Function: Get the description of the query sequence + used in the search. + Returns : a string + Args : [optional] new string for the query description + +=cut + +=head2 database_name + + Title : database_name + Usage : $name = $result->database_name() + Function: Used to obtain the name of the database that the query was searched + against by the algorithm. + Returns : a scalar string + Args : [optional] new string for the db name + +=cut + +=head2 database_letters + + Title : database_letters + Usage : $size = $result->database_letters() + Function: Used to obtain the size of database that was searched against. + Returns : a scalar integer (units specific to algorithm, but probably the + total number of residues in the database, if available) or undef if + the information was not available to the Processor object. + Args : [optional] new scalar integer for number of letters in db + + +=cut + +=head2 database_entries + + Title : database_entries + Usage : $num_entries = $result->database_entries() + Function: Used to obtain the number of entries contained in the database. + Returns : a scalar integer representing the number of entities in the database + or undef if the information was not available. + Args : [optional] new integer for the number of sequence entries in the db + + +=cut + +=head2 get_parameter + + Title : get_parameter + Usage : my $gap_ext = $report->get_parameter('gapext') + Function: Returns the value for a specific parameter used + when running this report + Returns : string + Args : name of parameter (string) + +=cut + +=head2 available_parameters + + Title : available_parameters + Usage : my @params = $report->available_paramters + Function: Returns the names of the available parameters + Returns : Return list of available parameters used for this report + Args : none + +=cut + +=head2 get_statistic + + Title : get_statistic + Usage : my $gap_ext = $report->get_statistic('kappa') + Function: Returns the value for a specific statistic available + from this report + Returns : string + Args : name of statistic (string) + +=cut + +=head2 available_statistics + + Title : available_statistics + Usage : my @statnames = $report->available_statistics + Function: Returns the names of the available statistics + Returns : Return list of available statistics used for this report + Args : none + +=cut + +=head2 Bio::Search::Result::GenericResult specific methods + +=cut + +=head2 add_hit + + Title : add_hit + Usage : $report->add_hit($hit) + Function: Adds a HitI to the stored list of hits + Returns : Number of HitI currently stored + Args : Bio::Search::Hit::HitI + +=cut + +=head2 rewind + + Title : rewind + Usage : $result->rewind; + Function: Allow one to reset the Hit iteration to the beginning + Since this is an in-memory implementation + Returns : none + Args : none + +=cut + +sub rewind{ + my ($self) = @_; + $self->{'_hitindex'} = 0; +} + + +=head2 add_parameter + + Title : add_parameter + Usage : $report->add_parameter('gapext', 11); + Function: Adds a parameter + Returns : none + Args : key - key value name for this parama + value - value for this parameter + +=cut + +=head2 add_statistic + + Title : add_statistic + Usage : $report->add_statistic('lambda', 2.3); + Function: Adds a parameter + Returns : none + Args : key - key value name for this parama + value - value for this parameter + +=cut + +=head2 num_hits + + Title : num_hits + Usage : my $hitcount= $result->num_hits + Function: returns the number of hits for this query result + Returns : integer + Args : none + + +=cut + +=head2 hits + + Title : hits + Usage : my @hits = $result->hits + Function: Returns the available hits for this Result + Returns : Array of L objects + Args : none + + +=cut + +=head2 program_reference + + Title : program_reference + Usage : $obj->program_reference($newval) + Function: + Returns : value of the literature reference for the algorithm + Args : newvalue (optional) + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Result/ResultFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Result/ResultFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +# $Id: ResultFactory.pm,v 1.3 2002/10/22 07:45:18 lapp Exp $ +# +# BioPerl module for Bio::Search::Result::ResultFactory +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Result::ResultFactory - A factory to create Bio::Search::Result::ResultI objects + +=head1 SYNOPSIS + + use Bio::Search::Result::ResultFactory; + my $factory = new Bio::Search::Result::ResultFactory(); + my $resultobj = $factory->create(@args); + +=head1 DESCRIPTION + +This is a general way of hiding the object creation process so that we +can dynamically change the objects that are created by the SearchIO +parser depending on what format report we are parsing. + +This object is for creating new Results. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Result::ResultFactory; +use vars qw(@ISA $DEFAULT_TYPE); +use strict; + +use Bio::Root::Root; +use Bio::Factory::ObjectFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectFactoryI ); + +BEGIN { + $DEFAULT_TYPE = 'Bio::Search::Result::GenericResult'; +} + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Result::ResultFactory(); + Function: Builds a new Bio::Search::Result::ResultFactory object + Returns : Bio::Search::Result::ResultFactory + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($type) = $self->_rearrange([qw(TYPE)],@args); + $self->type($type) if defined $type; + return $self; +} + +=head2 create + + Title : create + Usage : $factory->create(%args) + Function: Create a new L object + Returns : L + Args : hash of initialization parameters + + +=cut + +sub create{ + my ($self,@args) = @_; + my $type = $self->type; + eval { $self->_load_module($type) }; + if( $@ ) { $self->throw("Unable to load module $type: $@"); } + return $type->new(@args); +} + + +=head2 type + + Title : type + Usage : $factory->type('Bio::Search::Result::GenericResult'); + Function: Get/Set the Result creation type + Returns : string + Args : [optional] string to set + + +=cut + +sub type{ + my ($self,$type) = @_; + if( defined $type ) { + # redundancy with the create method which also calls _load_module + # I know - but this is not a highly called object so I am going + # to leave it in + eval {$self->_load_module($type) }; + if( $@ ){ $self->warn("Cannot find module $type, unable to set type"); } + else { $self->{'_type'} = $type; } + } + return $self->{'_type'} || $DEFAULT_TYPE; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Result/ResultI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Result/ResultI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,461 @@ +#----------------------------------------------------------------- +# $Id: ResultI.pm,v 1.16 2002/11/13 11:23:11 sac Exp $ +# +# BioPerl module Bio::Search::Result::ResultI +# +# Cared for by Steve Chervitz +# +# Originally created by Aaron Mackey +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Result::ResultI - Abstract interface to Search Result objects + +=head1 SYNOPSIS + +# Bio::Search::Result::ResultI objects cannot be instantiated since this +# module defines a pure interface. + +# Given an object that implements the Bio::Search::Result::ResultI interface, +# you can do the following things with it: + + use Bio::SearchIO; + my $io = new Bio::SearchIO(-format => 'blast', + -file => 't/data/HUMBETGLOA.tblastx'); + my $result = $io->next_result; + while( $hit = $result->next_hit()) { # enter code here for hit processing + } + + my $id = $result->query_name(); + + my $desc = $result->query_description(); + + my $dbname = $result->database_name(); + + my $size = $result->database_letters(); + + my $num_entries = $result->database_entries(); + + my $gap_ext = $result->get_parameter('gapext'); + + my @params = $result->available_parameters; + + my $kappa = $result->get_statistic('kappa'); + + my @statnames = $result->available_statistics; + +=head1 DESCRIPTION + +Bio::Search::Result::ResultI objects are data structures containing +the results from the execution of a search algorithm. As such, it may +contain various algorithm specific information as well as details of +the execution, but will contain a few fundamental elements, including +the ability to return Bio::Search::Hit::HitI objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Aaron Mackey Eamackey@virginia.eduE (original author) + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 1999-2001 Aaron Mackey, Steve Chervitz. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + + +package Bio::Search::Result::ResultI; + +use strict; +use vars qw(@ISA); + +use Bio::AnalysisResultI; + +@ISA = qw( Bio::AnalysisResultI ); + + +=head2 next_hit + + Title : next_hit + Usage : while( $hit = $result->next_hit()) { ... } + Function: Returns the next available Hit object, representing potential + matches between the query and various entities from the database. + Returns : a Bio::Search::Hit::HitI object or undef if there are no more. + Args : none + + +=cut + +sub next_hit { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 query_name + + Title : query_name + Usage : $id = $result->query_name(); + Function: Get the string identifier of the query used by the + algorithm that performed the search. + Returns : a string. + Args : none + +=cut + +sub query_name { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 query_accession + + Title : query_accession + Usage : $id = $result->query_accession(); + Function: Get the accession (if available) for the query sequence + Returns : a string + Args : none + +=cut + +sub query_accession { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + + +=head2 query_length + + Title : query_length + Usage : $id = $result->query_length(); + Function: Get the length of the query sequence + used in the search. + Returns : a number + Args : none + +=cut + +sub query_length { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + +=head2 query_description + + Title : query_description + Usage : $id = $result->query_description(); + Function: Get the description of the query sequence + used in the search. + Returns : a string + Args : none + +=cut + +sub query_description { + my ($self,@args) = @_; + $self->throw_not_implemented; +} + + +=head2 database_name + + Title : database_name + Usage : $name = $result->database_name() + Function: Used to obtain the name of the database that the query was searched + against by the algorithm. + Returns : a scalar string + Args : none + +=cut + +sub database_name { + my ($self,@args) = @_; + + $self->throw_not_implemented; +} + +=head2 database_letters + + Title : database_letters + Usage : $size = $result->database_letters() + Function: Used to obtain the size of database that was searched against. + Returns : a scalar integer (units specific to algorithm, but probably the + total number of residues in the database, if available) or undef if + the information was not available to the Processor object. + Args : none + + +=cut + +sub database_letters { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 database_entries + + Title : database_entries + Usage : $num_entries = $result->database_entries() + Function: Used to obtain the number of entries contained in the database. + Returns : a scalar integer representing the number of entities in the database + or undef if the information was not available. + Args : none + + +=cut + +sub database_entries { + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 get_parameter + + Title : get_parameter + Usage : my $gap_ext = $result->get_parameter('gapext') + Function: Returns the value for a specific parameter used + when running this result + Returns : string + Args : name of parameter (string) + +=cut + +sub get_parameter{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 available_parameters + + Title : available_parameters + Usage : my @params = $result->available_parameters + Function: Returns the names of the available parameters + Returns : Return list of available parameters used for this result + Args : none + +=cut + +sub available_parameters{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 get_statistic + + Title : get_statistic + Usage : my $gap_ext = $result->get_statistic('kappa') + Function: Returns the value for a specific statistic available + from this result + Returns : string + Args : name of statistic (string) + +=cut + +sub get_statistic{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 available_statistics + + Title : available_statistics + Usage : my @statnames = $result->available_statistics + Function: Returns the names of the available statistics + Returns : Return list of available statistics used for this result + Args : none + +=cut + +sub available_statistics{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 algorithm + + Title : algorithm + Usage : my $r_type = $result->algorithm + Function: Obtain the name of the algorithm used to obtain the Result + Returns : string (e.g., BLASTP) + Args : [optional] scalar string to set value + +=cut + +sub algorithm{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 algorithm_version + + Title : algorithm_version + Usage : my $r_version = $result->algorithm_version + Function: Obtain the version of the algorithm used to obtain the Result + Returns : string (e.g., 2.1.2) + Args : [optional] scalar string to set algorithm version value + +=cut + +sub algorithm_version{ + my ($self) = @_; + $self->throw_not_implemented(); +} + + +=head2 algorithm_reference + + Title : algorithm_reference + Usage : $obj->algorithm_reference($newval) + Function: + Returns : value of the literature reference for the algorithm + Args : newvalue (optional) + Comments: The default implementation in ResultI returns an empty string + rather than throwing a NotImplemented exception, since + the ref may not always be available and is not critical. + +=cut + +sub algorithm_reference{ + my ($self) = @_; + return ''; +} + +=head2 num_hits + + Title : num_hits + Usage : my $hitcount= $result->num_hits + Function: returns the number of hits for this query result + Returns : integer + Args : none + + +=cut + +sub num_hits{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 hits + + Title : hits + Usage : my @hits = $result->hits + Function: Returns the available hits for this Result + Returns : Array of L objects + Args : none + + +=cut + +sub hits{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 no_hits_found + + Usage : $nohits = $blast->no_hits_found( [iteration_number] ); + Purpose : Get boolean indicator indicating whether or not any hits + were present in the report. + + This is NOT the same as determining the number of hits via + the hits() method, which will return zero hits if there were no + hits in the report or if all hits were filtered out during the parse. + + Thus, this method can be used to distinguish these possibilities + for hitless reports generated when filtering. + + Returns : Boolean + Argument : (optional) integer indicating the iteration number (PSI-BLAST) + If iteration number is not specified and this is a PSI-BLAST result, + then this method will return true only if all iterations had + no hits found. + +=cut + +#----------- +sub no_hits_found { shift->throw_not_implemented } + + + +=head2 set_no_hits_found + + Usage : $blast->set_no_hits_found( [iteration_number] ); + Purpose : Set boolean indicator indicating whether or not any hits + were present in the report. + Returns : n/a + Argument : (optional) integer indicating the iteration number (PSI-BLAST) + +=cut + +sub set_no_hits_found { shift->throw_not_implemented } + + +=head2 iterations + + Usage : $num_iterations = $blast->iterations; (get) + $blast->iterations($num_iterations); (set) + Purpose : Set/get the number of iterations in the Blast Report (PSI-BLAST). + Returns : Total number of iterations in the report + Argument : integer (when setting) + +=cut + +sub iterations { shift->throw_not_implemented } + + +=head2 psiblast + + Usage : if( $blast->psiblast ) { ... } + Purpose : Set/get a boolean indicator whether or not the report + is a PSI-BLAST report. + Returns : 1 if PSI-BLAST, undef if not. + Argument : 1 (when setting) + +=cut + +sub psiblast { shift->throw_not_implemented } + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/Result/WABAResult.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/Result/WABAResult.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,129 @@ +# $Id: WABAResult.pm,v 1.2 2002/10/22 07:45:18 lapp Exp $ +# +# BioPerl module for Bio::Search::Result::WABAResult +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Search::Result::WABAResult - Result object for WABA alignment output + +=head1 SYNOPSIS + +# use this object exactly as you would a GenericResult +# the only extra method is query_database which is the +# name of the file where the query sequence came from + +=head1 DESCRIPTION + +This object is for WABA result output, there is little difference +between this object and a GenericResult save the addition of one +method query_database. Expect many of the fields for GenericResult to +be empty however as WABA was not intended to provide a lot of extra +information other than the alignment. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Search::Result::WABAResult; +use vars qw(@ISA); +use strict; + +use Bio::Search::Result::GenericResult; + +@ISA = qw( Bio::Search::Result::GenericResult ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Search::Result::WABAResult(); + Function: Builds a new Bio::Search::Result::WABAResult object + Returns : Bio::Search::Result::WABAResult + Args : -query_database => "name of the database where the query came from" + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($db) = $self->_rearrange([qw(QUERY_DATABASE)], @args); + defined $db && $self->query_database($db); + return $self; +} + +=head2 query_database + + Title : query_database + Usage : $obj->query_database($newval) + Function: Data field for the database filename where the + query sequence came from + Returns : value of query_database + Args : newvalue (optional) + + +=cut + +sub query_database{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'query_database'} = $value; + } + return $self->{'query_database'}; +} + + +=head2 All other methods are inherited from Bio::Search::Result::GenericResult + +See the L for complete +documentation of the rest of the methods that are available for this +module. + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Search/SearchUtils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Search/SearchUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,547 @@ +=head1 NAME + +Bio::Search::SearchUtils - Utility functions for Bio::Search:: objects + +=head1 SYNOPSIS + +This module is just a collection of subroutines, not an object. + +=head1 DESCRIPTION + +The SearchUtils.pm module is a collection of subroutines used primarily by +Bio::Search::Hit::HitI objects for some of the additional +functionality, such as HSP tiling. Right now, the SearchUtils is just a +collection of methods, not an object. + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +=cut + +#' + +package Bio::Search::SearchUtils; + +use strict; +use vars qw($DEBUG); +$DEBUG = 1; + +=head2 tile_hsps + + Usage : tile_hsps( $sbjct ); + : This is called automatically by methods in Bio::Search::Hit::GenericHit + : that rely on having tiled data. + Purpose : Collect statistics about the aligned sequences in a set of HSPs. + : Calculates the following data across all HSPs: + : -- total alignment length + : -- total identical residues + : -- total conserved residues + Returns : n/a + Argument : A Bio::Search::Hit::HitI object + Throws : n/a + Comments : + : This method performs more careful summing of data across + : all HSPs in the Sbjct object. Only HSPs that are in the same strand + : and frame are tiled. Simply summing the data from all HSPs + : in the same strand and frame will overestimate the actual + : length of the alignment if there is overlap between different HSPs + : (often the case). + : + : The strategy is to tile the HSPs and sum over the + : contigs, collecting data separately from overlapping and + : non-overlapping regions of each HSP. To facilitate this, the + : HSP.pm object now permits extraction of data from sub-sections + : of an HSP. + : + : Additional useful information is collected from the results + : of the tiling. It is possible that sub-sequences in + : different HSPs will overlap significantly. In this case, it + : is impossible to create a single unambiguous alignment by + : concatenating the HSPs. The ambiguity may indicate the + : presence of multiple, similar domains in one or both of the + : aligned sequences. This ambiguity is recorded using the + : ambiguous_aln() method. + : + : This method does not attempt to discern biologically + : significant vs. insignificant overlaps. The allowable amount of + : overlap can be set with the overlap() method or with the -OVERLAP + : parameter used when constructing the Hit object. + : + : For a given hit, both the query and the sbjct sequences are + : tiled independently. + : + : -- If only query sequence HSPs overlap, + : this may suggest multiple domains in the sbjct. + : -- If only sbjct sequence HSPs overlap, + : this may suggest multiple domains in the query. + : -- If both query & sbjct sequence HSPs overlap, + : this suggests multiple domains in both. + : -- If neither query & sbjct sequence HSPs overlap, + : this suggests either no multiple domains in either + : sequence OR that both sequences have the same + : distribution of multiple similar domains. + : + : This method can deal with the special case of when multiple + : HSPs exactly overlap. + : + : Efficiency concerns: + : Speed will be an issue for sequences with numerous HSPs. + : + Bugs : Currently, tile_hsps() does not properly account for + : the number of non-tiled but overlapping HSPs, which becomes a problem + : as overlap() grows. Large values overlap() may thus lead to + : incorrect statistics for some hits. For best results, keep overlap() + : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and + : Ambiguous Alignments" section in L. + +See Also : L<_adjust_contigs>(), L + +=cut + +#-------------- +sub tile_hsps { +#-------------- + my $sbjct = shift; + + $sbjct->tiled_hsps(1); + $sbjct->gaps('query', 0); + $sbjct->gaps('hit', 0); + + ## Simple summation scheme. Valid if there is only one HSP. + if( $sbjct->n == 1 or $sbjct->num_hsps == 1) { + my $hsp = $sbjct->hsp; + $sbjct->length_aln('query', $hsp->length('query')); + $sbjct->length_aln('hit', $hsp->length('sbjct')); + $sbjct->length_aln('total', $hsp->length('total')); + $sbjct->matches( $hsp->matches() ); + $sbjct->gaps('query', $hsp->gaps('query')); + $sbjct->gaps('sbjct', $hsp->gaps('sbjct')); + +# print "_tile_hsps(): single HSP, easy stats.\n"; + return; + } else { +# print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n"; + $sbjct->length_aln('query', 0); + $sbjct->length_aln('sbjct', 0); + $sbjct->length_aln('total', 0); + $sbjct->matches( 0, 0); + } + + ## More than one HSP. Must tile HSPs. +# print "\nTiling HSPs for $sbjct\n"; + my($hsp, $qstart, $qstop, $sstart, $sstop); + my($frame, $strand, $qstrand, $sstrand); + my(@qcontigs, @scontigs); + my $qoverlap = 0; + my $soverlap = 0; + my $max_overlap = $sbjct->overlap; + my $hit_qgaps = 0; + my $hit_sgaps = 0; + my $hit_len_aln = 0; + my %start_stop; + + foreach $hsp ($sbjct->hsps()) { +# printf " HSP: %s\n%s\n",$hsp->name, $hsp->str('query'); +# printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), $hsp->length(-TYPE=>'cons'), $hsp->length(-TYPE=>'cons',-START=>0,-STOP=>10); + ($qstart, $qstop) = $hsp->range('query'); + ($sstart, $sstop) = $hsp->range('sbjct'); + $frame = $hsp->frame; + $frame = -1 unless defined $frame; + ($qstrand, $sstrand) = ($hsp->query->strand, + $hsp->hit->strand); + + # Note: No correction for overlap. + my ($qgaps, $sgaps) = ($hsp->gaps('query'), $hsp->gaps('hit')); + $hit_qgaps += $qgaps; + $hit_sgaps += $sgaps; + $hit_len_aln += $hsp->length; + + ## Collect contigs in the query sequence. + $qoverlap = &_adjust_contigs('query', $hsp, $qstart, $qstop, + \@qcontigs, $max_overlap, $frame, + $qstrand); + + ## Collect contigs in the sbjct sequence (needed for domain data and gapped Blast). + $soverlap = &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, + \@scontigs, $max_overlap, $frame, + $sstrand); + + ## Collect overall start and stop data for query and sbjct over all HSPs. + if(not defined $start_stop{'qstart'}) { + $start_stop{'qstart'} = $qstart; + $start_stop{'qstop'} = $qstop; + $start_stop{'sstart'} = $sstart; + $start_stop{'sstop'} = $sstop; + } else { + $start_stop{'qstart'} = ($qstart < $start_stop{'qstart'} ? + $qstart : $start_stop{'qstart'} ); + $start_stop{'qstop'} = ($qstop > $start_stop{'qstop'} ? + $qstop : $start_stop{'qstop'} ); + $start_stop{'sstart'} = ($sstart < $start_stop{'sstart'} ? + $sstart : $start_stop{'sstart'} ); + $start_stop{'sstop'} = ($sstop > $start_stop{'sstop'} ? + $sstop : $start_stop{'sstop'} ); + } + } + + # Store the collected data in the Hit object + $sbjct->gaps('query', $hit_qgaps); + $sbjct->gaps('hit', $hit_sgaps); + $sbjct->length_aln('total', $hit_len_aln); + + $sbjct->start('query',$start_stop{'qstart'}); + $sbjct->end('query', $start_stop{'qstop'}); + $sbjct->start('hit', $start_stop{'sstart'}); + $sbjct->end('hit', $start_stop{'sstop'}); + + ## Collect data across the collected contigs. + +# print "\nQUERY CONTIGS:\n"; +# print " gaps = $sbjct->{'_gaps_query'}\n"; + + # Account for strand/frame. + # Strategy: collect data on a per strand+frame basis and save the most significant one. + my (%qctg_dat); + foreach(@qcontigs) { +# print " query contig: $_->{'start'} - $_->{'stop'}\n"; +# print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; + ($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); + $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1; + $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'}; + $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'}; + $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand; + } + + # Find longest contig. + my @sortedkeys = reverse sort { $qctg_dat{ $a }->{'length_aln_query'} <=> $qctg_dat{ $b }->{'length_aln_query'} } keys %qctg_dat; + + # Save the largest to the sbjct: + my $longest = $sortedkeys[0]; + $sbjct->length_aln('query', $qctg_dat{ $longest }->{'length_aln_query'}); + $sbjct->matches($qctg_dat{ $longest }->{'totalIdentical'}, + $qctg_dat{ $longest }->{'totalConserved'}); + $sbjct->strand('query', $qctg_dat{ $longest }->{'qstrand'}); + + ## Collect data for sbjct contigs. Important for gapped Blast. + ## The totalIdentical and totalConserved numbers will be the same + ## as determined for the query contigs. + +# print "\nSBJCT CONTIGS:\n"; +# print " gaps = ", $sbjct->gaps('sbjct'), "\n"; + + my (%sctg_dat); + foreach(@scontigs) { +# print " sbjct contig: $_->{'start'} - $_->{'stop'}\n"; +# print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; + ($frame, $strand) = ($_->{'frame'}, $_->{'strand'}); + $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1; + $sctg_dat{ "$frame$strand" }->{'frame'} = $frame; + $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand; + } + + @sortedkeys = reverse sort { $sctg_dat{ $a }->{'length_aln_sbjct'} <=> $sctg_dat{ $b }->{'length_aln_sbjct'} } keys %sctg_dat; + + # Save the largest to the sbjct: + $longest = $sortedkeys[0]; + + $sbjct->length_aln('sbjct', $sctg_dat{ $longest }->{'length_aln_sbjct'}); + $sbjct->frame( $sctg_dat{ $longest }->{'frame'} ); + $sbjct->strand('hit', $sctg_dat{ $longest }->{'sstrand'}); + + if($qoverlap) { + if($soverlap) { $sbjct->ambiguous_aln('qs'); +# print "\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n"; + } + else { $sbjct->ambiguous_aln('q'); +# print "\n*** AMBIGUOUS ALIGNMENT: Query\n\n"; + } + } elsif($soverlap) { + $sbjct->ambiguous_aln('s'); +# print "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n"; + } + + # Adjust length based on BLAST flavor. + my $prog = $sbjct->algorithm; + if($prog eq 'TBLASTN') { + $sbjct->length_aln('sbjct', $sbjct->length_aln('sbjct')/3); + } elsif($prog eq 'BLASTX' ) { + $sbjct->length_aln('query', $sbjct->length_aln('query')/3); + } elsif($prog eq 'TBLASTX') { + $sbjct->length_aln('query', $sbjct->length_aln('query')/3); + $sbjct->length_aln('sbjct', $sbjct->length_aln('sbjct')/3); + } +} + + + +=head2 _adjust_contigs + + Usage : n/a; called automatically during object construction. + Purpose : Builds HSP contigs for a given BLAST hit. + : Utility method called by _tile_hsps() + Returns : + Argument : + Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches() + : for invalid sub-sequence ranges. + Status : Experimental + Comments : This method does not currently support gapped alignments. + : Also, it does not keep track of the number of HSPs that + : overlap within the amount specified by overlap(). + : This will lead to significant tracking errors for large + : overlap values. + +See Also : L(), L + +=cut + +#------------------- +sub _adjust_contigs { +#------------------- + my ($seqType, $hsp, $start, $stop, $contigs_ref, + $max_overlap, $frame, $strand) = @_; + + my $overlap = 0; + my ($numID, $numCons); + +# printf STDERR "Testing $seqType data: HSP (%s); $start, $stop, strand=$strand, frame=$frame\n", $hsp->$seqType()->seq_id if $DEBUG; + + foreach ( @$contigs_ref) { + # print STDERR " Contig: $_->{'start'} - $_->{'stop'}, strand=$_->{'strand'}, frame=$_->{'frame'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n" if $DEBUG; + # Don't merge things unless they have matching strand/frame. + next unless ($_->{'frame'} == $frame and $_->{'strand'} == $strand); + + ## Test special case of a nested HSP. Skip it. + if($start >= $_->{'start'} and $stop <= $_->{'stop'}) { +# print STDERR "----> Nested HSP. Skipping.\n"; + $overlap = 1; + next; + } + + ## Test for overlap at beginning of contig. + if($start < $_->{'start'} and $stop > ($_->{'start'} + $max_overlap)) { +# print STDERR "----> Overlaps beg: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n"; + # Collect stats over the non-overlapping region. + eval { + ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType, + -START =>$start, + -STOP =>$_->{'start'}-1); + }; + if($@) { warn "\a\n$@\n"; } + else { + $_->{'start'} = $start; # Assign a new start coordinate to the contig + $_->{'iden'} += $numID; # and add new data to #identical, #conserved. + $_->{'cons'} += $numCons; + $overlap = 1; + } + } + + ## Test for overlap at end of contig. + if($stop > $_->{'stop'} and + $start < ($_->{'stop'} - $max_overlap)) { +# print STDERR "----> Overlaps end: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n"; + # Collect stats over the non-overlapping region. + eval { + ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType, + -START =>$_->{'stop'}, + -STOP =>$stop); + }; + if($@) { warn "\a\n$@\n"; } + else { + $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig + $_->{'iden'} += $numID; # and add new data to #identical, #conserved. + $_->{'cons'} += $numCons; + $overlap = 1; + } + } + $overlap && do { +# print STDERR " New Contig data:\n"; +# print STDERR " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n"; + last; + }; + } + ## If there is no overlap, add the complete HSP data. + !$overlap && do { +# print STDERR "No overlap. Adding new contig.\n"; + ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType); + push @$contigs_ref, {'start'=>$start, 'stop'=>$stop, + 'iden'=>$numID, 'cons'=>$numCons, + 'strand'=>$strand, 'frame'=>$frame}; + }; + $overlap; +} + +=head2 get_exponent + + Usage : &get_exponent( number ); + Purpose : Determines the power of 10 exponent of an integer, float, + : or scientific notation number. + Example : &get_exponent("4.0e-206"); + : &get_exponent("0.00032"); + : &get_exponent("10."); + : &get_exponent("1000.0"); + : &get_exponent("e+83"); + Argument : Float, Integer, or scientific notation number + Returns : Integer representing the exponent part of the number (+ or -). + : If argument == 0 (zero), return value is "-999". + Comments : Exponents are rounded up (less negative) if the mantissa is >= 5. + : Exponents are rounded down (more negative) if the mantissa is <= -5. + +=cut + +#------------------ +sub get_exponent { +#------------------ + my $data = shift; + + my($num, $exp) = split /[eE]/, $data; + + if( defined $exp) { + $num = 1 if not $num; + $num >= 5 and $exp++; + $num <= -5 and $exp--; + } elsif( $num == 0) { + $exp = -999; + } elsif( not $num =~ /\./) { + $exp = CORE::length($num) -1; + } else { + $exp = 0; + $num .= '0' if $num =~ /\.$/; + my ($c); + my $rev = 0; + if($num !~ /^0/) { + $num = reverse($num); + $rev = 1; + } + do { $c = chop($num); + $c == 0 && $exp++; + } while( $c ne '.'); + + $exp = -$exp if $num == 0 and not $rev; + $exp -= 1 if $rev; + } + return $exp; +} + +=head2 collapse_nums + + Usage : @cnums = collapse_nums( @numbers ); + Purpose : Collapses a list of numbers into a set of ranges of consecutive terms: + : Useful for condensing long lists of consecutive numbers. + : EXPANDED: + : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32 + : COLLAPSED: + : 1-6 10 12-15 17 18 20-22 24 26 30-32 + Argument : List of numbers sorted numerically. + Returns : List of numbers mixed with ranges of numbers (see above). + Throws : n/a + +See Also : L + +=cut + +#------------------ +sub collapse_nums { +#------------------ +# This is probably not the slickest connectivity algorithm, but will do for now. + my @a = @_; + my ($from, $to, $i, @ca, $consec); + + $consec = 0; + for($i=0; $i < @a; $i++) { + not $from and do{ $from = $a[$i]; next; }; + if($a[$i] == $a[$i-1]+1) { + $to = $a[$i]; + $consec++; + } else { + if($consec == 1) { $from .= ",$to"; } + else { $from .= $consec>1 ? "\-$to" : ""; } + push @ca, split(',', $from); + $from = $a[$i]; + $consec = 0; + $to = undef; + } + } + if(defined $to) { + if($consec == 1) { $from .= ",$to"; } + else { $from .= $consec>1 ? "\-$to" : ""; } + } + push @ca, split(',', $from) if $from; + + @ca; +} + + +=head2 strip_blast_html + + Usage : $boolean = &strip_blast_html( string_ref ); + : This method is exported. + Purpose : Removes HTML formatting from a supplied string. + : Attempts to restore the Blast report to enable + : parsing by Bio::SearchIO::blast.pm + Returns : Boolean: true if string was stripped, false if not. + Argument : string_ref = reference to a string containing the whole Blast + : report containing HTML formatting. + Throws : Croaks if the argument is not a scalar reference. + Comments : Based on code originally written by Alex Dong Li + : (ali@genet.sickkids.on.ca). + : This method does some Blast-specific stripping + : (adds back a '>' character in front of each HSP + : alignment listing). + : + : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES! + : + : Removal of the HTML tags and accurate reconstitution of the + : non-HTML-formatted report is highly dependent on structure of + : the HTML-formatted version. For example, it assumes that first + : line of each alignment section (HSP listing) starts with a + : anchor tag. This permits the reconstruction of the + : original report in which these lines begin with a ">". + : This is required for parsing. + : + : If the structure of the Blast report itself is not intended to + : be a standard, the structure of the HTML-formatted version + : is even less so. Therefore, the use of this method to + : reconstitute parsable Blast reports from HTML-format versions + : should be considered a temorary solution. + +See Also : B + +=cut + +#-------------------- +sub strip_blast_html { +#-------------------- + # This may not best way to remove html tags. However, it is simple. + # it won't work under following conditions: + # 1) if quoted > appears in a tag (does this ever happen?) + # 2) if a tag is split over multiple lines and this method is + # used to process one line at a time. + + my ($string_ref) = shift; + + ref $string_ref eq 'SCALAR' or + croak ("Can't strip HTML: ". + "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n"); + + my $str = $$string_ref; + my $stripped = 0; + + # Removing "" and adding the '>' character for + # HSP alignment listings. + $str =~ s/(\A|\n)]+> ?/>/sgi and $stripped = 1; + + # Removing all "<>" tags. + $str =~ s/<[^>]+>| //sgi and $stripped = 1; + + # Re-uniting any lone '>' characters. + $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1; + + $$string_ref = $str; + $stripped; +} + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchDist.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchDist.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,247 @@ +# $Id: SearchDist.pm,v 1.16 2002/10/22 07:38:24 lapp Exp $ + +# +# BioPerl module for Bio::SearchDist +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchDist - A perl wrapper around Sean Eddy's histogram object + +=head1 SYNOPSIS + + $dis = Bio::SearchDist->new(); + foreach $score ( @scores ) { + $dis->add_score($score); + } + + if( $dis->fit_evd() ) { + foreach $score ( @scores ) { + $evalue = $dis->evalue($score); + print "Score $score had an evalue of $evalue\n"; + } + } else { + warn("Could not fit histogram to an EVD!"); + } + +=head1 DESCRIPTION + +The Bio::SearchDist object is a wrapper around Sean Eddy's excellent +histogram object. The histogram object can bascially take in a number +of scores which are sensibly distributed somewhere around 0 that come +from a supposed Extreme Value Distribution. Having add all the scores +from a database search via the add_score method you can then fit a +extreme value distribution using fit_evd(). Once fitted you can then +get out the evalue for each score (or a new score) using +evalue($score). + +The fitting procedure is better described in Sean Eddy's own code +(available from http://hmmer.wustl.edu, or in the histogram.h header +file in Compile/SW). Bascially it fits a EVD via a maximum likelhood +method with pruning of the top end of the distribution so that real +positives are discarded in the fitting procedure. This comes from +an orginally idea of Richard Mott's and the likelhood fitting +is from a book by Lawless [should ref here]. + + +The object relies on the fact that the scores are sensibly distributed +around about 0 and that integer bins are sensible for the +histogram. Scores based on bits are often ideal for this (bits based +scoring mechanisms is what this histogram object was originally +designed for). + + +=head1 CONTACT + +The original code this was based on comes from the histogram module as +part of the HMMer2 package. Look at http://hmmer.wustl.edu/ + +Its use in Bioperl is via the Compiled XS extension which is cared for +by Ewan Birney (birney@sanger.ac.uk). Please contact Ewan first about +the use of this module + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchDist; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; + +BEGIN { + eval { + require Bio::Ext::Align; + }; + if ( $@ ) { +print $@; + print STDERR ("\nThe C-compiled engine for histogram object (Bio::Ext::Align) has not been installed.\n Please install the bioperl-ext package\n\n"); + exit(1); + } +} + + +@ISA = qw(Bio::Root::Root); + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my($min, $max, $lump) = + $self->_rearrange([qw(MIN MAX LUMP)], @args); + + if( ! $min ) { + $min = -100; + } + + if( ! $max ) { + $max = +100; + } + + if( ! $lump ) { + $lump = 50; + } + + $self->_engine(&Bio::Ext::Align::new_Histogram($min,$max,$lump)); + + return $self; +} + +=head2 add_score + + Title : add_score + Usage : $dis->add_score(300); + Function: Adds a single score to the distribution + Returns : nothing + Args : + + +=cut + +sub add_score{ + my ($self,$score) = @_; + my ($eng); + $eng = $self->_engine(); + #$eng->AddToHistogram($score); + $eng->add($score); +} + +=head2 fit_evd + + Title : fit_evd + Usage : $dis->fit_evd(); + Function: fits an evd to the current distribution + Returns : 1 if it fits successfully, 0 if not + Args : + + +=cut + +sub fit_evd{ + my ($self,@args) = @_; + + return $self->_engine()->fit_EVD(10000,1); +} + +=head2 fit_Gaussian + + Title : fit_Gaussian + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub fit_Gaussian{ + my ($self,$high) = @_; + + if( ! defined $high ) { + $high = 10000; + } + + return $self->_engine()->fit_Gaussian($high); +} + + +=head2 evalue + + Title : evalue + Usage : $eval = $dis->evalue($score) + Function: Returns the evalue of this score + Returns : float + Args : + + +=cut + +sub evalue{ + my ($self,$score) = @_; + + return $self->_engine()->evalue($score); + +} + + + +=head2 _engine + + Title : _engine + Usage : $obj->_engine($newval) + Function: underlyine bp_sw:: histogram engine + Returns : value of _engine + Args : newvalue (optional) + + +=cut + +sub _engine{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_engine'} = $value; + } + return $self->{'_engine'}; +} + + +## End of Package + +1; +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,538 @@ +# $Id: SearchIO.pm,v 1.18 2002/12/13 13:54:03 jason Exp $ +# +# BioPerl module for Bio::SearchIO +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO - Driver for parsing Sequence Database Searches (Blast,FASTA,...) + +=head1 SYNOPSIS + + use Bio::SearchIO; + # format can be 'fasta', 'blast' + my $searchio = new Bio::SearchIO( -format => 'blastxml', + -file => 'blastout.xml' ); + while ( my $result = $searchio->next_result() ) { + while( my $hit = $result->next_hit ) { + # process the Bio::Search::Hit::HitI object + while( my $hsp = $hit->next_hsp ) { + # process the Bio::Search::HSP::HSPI object + } + } + +=head1 DESCRIPTION + +This is a driver for instantiating a parser for report files from +sequence database searches. This object serves as a wrapper for the +format parsers in Bio::SearchIO::* - you should not need to ever +use those format parsers directly. (For people used to the SeqIO +system it, we are deliberately using the same pattern). + +Once you get a SearchIO object, calling next_result() gives you back +a L compliant object, which is an object that +represents one Blast/Fasta/HMMER whatever report. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich & Steve Chervitz + +Email jason@bioperl.org +Email sac@bioperl.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO; +use strict; +use vars qw(@ISA); + +# Object preamble - inherits from Bio::Root::IO + +use Bio::Root::IO; +use Bio::Event::EventGeneratorI; +use Bio::SearchIO::SearchResultEventBuilder; +use Bio::AnalysisParserI; +use Symbol(); + +@ISA = qw( Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO(); + Function: Builds a new Bio::SearchIO object + Returns : Bio::SearchIO initialized with the correct format + Args : Args : -file => $filename + -format => format + -fh => filehandle to attach to + -result_factory => Object implementing Bio::Factory::ResultFactoryI + -hit_factory => Object implementing Bio::Factory::HitFactoryI + -writer => Object implementing Bio::SearchIO::SearchWriterI + -output_format => output format, which will dynamically load writer + +See L, L, +L + +=cut + +sub new { + my($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::SearchIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{'-file'} || $ARGV[0] ) || 'blast'; + + my $output_format = $param{'-output_format'}; + my $writer = undef; + + if( defined $output_format ) { + if( defined $param{'-writer'} ) { + my $dummy = Bio::Root::Root->new(); + $dummy->throw("Both writer and output format specified - not good"); + } + + if( $output_format =~ /^blast$/i ) { + $output_format = 'TextResultWriter'; + } + my $output_module = "Bio::SearchIO::Writer::".$output_format; + $class->_load_module($output_module); + $writer = $output_module->new(); + push(@args,"-writer",$writer); + } + + + # normalize capitalization to lower case + $format = "\L$format"; + + return undef unless( $class->_load_format_module($format) ); + return "Bio::SearchIO::${format}"->new(@args); + } +} + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::SearchIO->newFh(-file=>$filename, + -format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::SearchIO->newFh(-file=>$filename, + -format=>'Format') + $result = <$fh>; # read a ResultI object + print $fh $result; # write a ResultI object + Returns : filehandle tied to the Bio::SearchIO::Fh class + Args : + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $result = <$fh>; # read a ResultI object + print $fh $result; # write a ResultI object + Returns : filehandle tied to the Bio::SearchIO::Fh class + Args : + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +=head2 attach_EventHandler + + Title : attach_EventHandler + Usage : $parser->attatch_EventHandler($handler) + Function: Adds an event handler to listen for events + Returns : none + Args : Bio::SearchIO::EventHandlerI + +See L + +=cut + +sub attach_EventHandler{ + my ($self,$handler) = @_; + return if( ! $handler ); + if( ! $handler->isa('Bio::SearchIO::EventHandlerI') ) { + $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI'); + } + $self->{'_handler'} = $handler; + return; +} + +=head2 _eventHandler + + Title : _eventHandler + Usage : private + Function: Get the EventHandler + Returns : Bio::SearchIO::EventHandlerI + Args : none + +See L + +=cut + +sub _eventHandler{ + my ($self) = @_; + return $self->{'_handler'}; +} + +sub _initialize { + my($self, @args) = @_; + $self->{'_handler'} = undef; + # not really necessary unless we put more in RootI + #$self->SUPER::_initialize(@args); + + # initialize the IO part + $self->_initialize_io(@args); + $self->attach_EventHandler(new Bio::SearchIO::SearchResultEventBuilder()); + $self->{'_reporttype'} = ''; + + my ( $writer, $rfactory, $hfactory, $use_factories ) = + $self->_rearrange([qw(WRITER + RESULT_FACTORY + HIT_FACTORY + USE_FACTORIES)], @args); + + $self->writer( $writer ) if $writer; + + # TODO: Resolve this issue: + # The $use_factories flag is a temporary hack to allow factory-based and + # non-factory based SearchIO objects to co-exist. + # steve --- Sat Dec 22 04:41:20 2001 + if( $use_factories) { + if( not defined $self->{'_result_factory'}) { + $self->result_factory( $rfactory || $self->default_result_factory_class->new ); + } + if( not defined $self->{'_hit_factory'}) { + $self->hit_factory( $hfactory || $self->default_hit_factory_class->new ); + } + } +} + +=head2 next_result + + Title : next_result + Usage : $result = stream->next_result + Function: Reads the next ResultI object from the stream and returns it. + + Certain driver modules may encounter entries in the stream that + are either misformatted or that use syntax not yet understood + by the driver. If such an incident is recoverable, e.g., by + dismissing a feature of a feature table or some other non-mandatory + part of an entry, the driver will issue a warning. In the case + of a non-recoverable situation an exception will be thrown. + Do not assume that you can resume parsing the same stream after + catching the exception. Note that you can always turn recoverable + errors into exceptions by calling $stream->verbose(2) (see + Bio::Root::RootI POD page). + Returns : A Bio::Search::Result::ResultI object + Args : n/a + +See L + +=cut + +sub next_result { + my ($self) = @_; + $self->throw_not_implemented; +} + +=head2 write_result + + Title : write_result + Usage : $stream->write_result($result_result, @other_args) + Function: Writes data from the $result_result object into the stream. + : Delegates to the to_string() method of the associated + : WriterI object. + Returns : 1 for success and 0 for error + Args : Bio::Search:Result::ResultI object, + : plus any other arguments for the Writer + Throws : Bio::Root::Exception if a Writer has not been set. + +See L + +=cut + +sub write_result { + my ($self, $result, @args) = @_; + + if( not ref($self->{'_result_writer'}) ) { + $self->throw("ResultWriter not defined."); + } + my $str = $self->writer->to_string( $result, @args ); + #print "Got string: \n$str\n"; + $self->_print( "$str" ); + + return 1; +} + + +=head2 writer + + Title : writer + Usage : $writer = $stream->writer; + Function: Sets/Gets a SearchWriterI object to be used for this searchIO. + Returns : 1 for success and 0 for error + Args : Bio::SearchIO::SearchWriterI object (when setting) + Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object + is passed in. + +=cut + +sub writer { + my ($self, $writer) = @_; + if( ref($writer) and $writer->isa( 'Bio::SearchIO::SearchWriterI' )) { + $self->{'_result_writer'} = $writer; + } + elsif( defined $writer ) { + $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer"); + } + return $self->{'_result_writer'}; +} + + +=head2 hit_factory + + Title : hit_factory + Usage : $hit_factory = $stream->hit_factory; (get) + : $stream->hit_factory( $factory ); (set) + Function: Sets/Gets a factory object to create hit objects for this SearchIO + Returns : Bio::Factory::HitFactoryI object + Args : Bio::Factory::HitFactoryI object (when setting) + Throws : Bio::Root::Exception if a non-Bio::Factory::HitFactoryI object + is passed in. + Comments: A SearchIO implementation should provide a default hit factory. + +See L + +=cut + +sub hit_factory { + my ($self, $fact) = @_; + if( ref $fact and $fact->isa( 'Bio::Factory::HitFactoryI' )) { + $self->{'_hit_factory'} = $fact; + } + elsif( defined $fact ) { + $self->throw("Can't set HitFactory. Not a Bio::Factory::HitFactoryI: $fact"); + } + return $self->{'_hit_factory'}; +} + +=head2 result_factory + + Title : result_factory + Usage : $result_factory = $stream->result_factory; (get) + : $stream->result_factory( $factory ); (set) + Function: Sets/Gets a factory object to create result objects for this + SearchIO. + Returns : Bio::Factory::ResultFactoryI object + Args : Bio::Factory::ResultFactoryI object (when setting) + Throws : Bio::Root::Exception if a non-Bio::Factory::ResultFactoryI object + is passed in. + Comments: A SearchIO implementation should provide a default result factory. + +See L + +=cut + +sub result_factory { + my ($self, $fact) = @_; + if( ref $fact and $fact->isa( 'Bio::Factory::ResultFactoryI' )) { + $self->{'_result_factory'} = $fact; + } + elsif( defined $fact ) { + $self->throw("Can't set ResultFactory. Not a Bio::Factory::ResultFactoryI: $fact"); + } + return $self->{'_result_factory'}; +} + + +=head2 result_count + + Title : result_count + Usage : $num = $stream->result_count; + Function: Gets the number of Blast results that have been parsed. + Returns : integer + Args : none + Throws : none + +=cut + +sub result_count { + my $self = shift; + $self->throw_not_implemented; +} + + +=head2 default_hit_factory_class + + Title : default_hit_factory_class + Usage : $res_factory = $obj->default_hit_factory_class()->new( @args ) + Function: Returns the name of the default class to be used for creating + Bio::Search::Hit::HitI objects. + Example : + Returns : A string containing the name of a class that implements + the Bio::Search::Hit::HitI interface. + Args : none + Comments: Bio::SearchIO does not implement this method. It throws a + NotImplemented exception + +See L + +=cut + +sub default_hit_factory_class { + my $self = shift; +# TODO: Uncomment this when Jason's SearchIO code conforms +# $self->throw_not_implemented; +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL SearchIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self,$format) = @_; + my $module = "Bio::SearchIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <_guess_format($filename) + Function: + Example : + Returns : guessed format of filename (lower case) + Args : + +=cut + + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'blast' if (/blast/i or /\.bl\w$/i); + return 'fasta' if (/fasta/i or /\.fas$/i); + return 'blastxml' if (/blast/i and /\.xml$/i); + return 'exonerate' if ( /\.exonerate/i or /\.exon/i ); +} + +sub close { + my $self = shift; + if( $self->writer ) { + $self->_print($self->writer->end_report()); + } + $self->SUPER::close(@_); +} + +sub DESTROY { + my $self = shift; + $self->close(); +} + +sub TIEHANDLE { + my $class = shift; + return bless {processor => shift}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'processor'}->next_result() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'processor'}->next_result(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'processor'}->write_result(@_); +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/EventHandlerI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/EventHandlerI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,233 @@ +# $Id: EventHandlerI.pm,v 1.8 2002/10/22 07:45:18 lapp Exp $ +# +# BioPerl module for Bio::SearchIO::EventHandlerI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::EventHandlerI - An abstract Event Handler for Search Result parsing + +=head1 SYNOPSIS + +# do not use this object directly it is an interface +# See Bio::SearchIO::SearchResultEventBuilder for an implementation + + use Bio::SearchIO::SearchResultEventBuilder; + my $handler = new Bio::SearchIO::SearchResultEventBuilder(); + +=head1 DESCRIPTION + +This interface describes the basic methods needed to handle Events +thrown from parsing a Search Result such as FASTA, BLAST, or HMMer. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::EventHandlerI; +use vars qw(@ISA); +use strict; +use Carp; + +use Bio::Event::EventHandlerI; + +@ISA = qw (Bio::Event::EventHandlerI); + +=head2 start_result + + Title : start_result + Usage : $handler->start_result($data) + Function: Begins a result event cycle + Returns : none + Args : Type of Result + +=cut + +sub start_result { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 end_result + + Title : end_result + Usage : $handler->end_result($data) + Function: Ends a result event cycle + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_result{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 start_hsp + + Title : start_hsp + Usage : $handler->start_hsp($data) + Function: Start a HSP event cycle + Returns : none + Args : type of element + associated hashref + +=cut + +sub start_hsp{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 end_hsp + + Title : end_hsp + Usage : $handler->end_hsp() + Function: Ends a HSP event cycle + Returns : Bio::Search::HSP::HSPI object + Args : type of event and associated hashref + +=cut + +sub end_hsp{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 start_hit + + Title : start_hit + Usage : $handler->start_hit() + Function: Starts a Hit event cycle + Returns : none + Args : type of event and associated hashref + + +=cut + +sub start_hit { + my ($self,@args) = @_; + $self->throw_not_implemented +} + +=head2 end_hit + + Title : end_hit + Usage : $handler->end_hit() + Function: Ends a Hit event cycle + Returns : Bio::Search::Hit::HitI object + Args : type of event and associated hashref + + +=cut + +sub end_hit { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + +=head2 register_factory + + Title : register_factory + Usage : $handler->register_factory('TYPE',$factory); + Function: Register a specific factory for a object type class + Returns : none + Args : string representing the class and + Bio::Factory::ObjectFactoryI + +See L for more information + +=cut + +sub register_factory{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + + +=head2 factory + + Title : factory + Usage : my $f = $handler->factory('TYPE'); + Function: Retrieves the associated factory for requested 'TYPE' + Returns : a Bio::Factory::ObjectFactoryI or undef if none registered + Args : name of factory class to retrieve + +See L for more information + +=cut + +sub factory{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 Bio::Event::EventHandlerI methods + +=cut + +=head2 will_handle + + Title : will_handle + Usage : if( $handler->will_handle($event_type) ) { ... } + Function: Tests if this event builder knows how to process a specific event + Returns : boolean + Args : event type name + + +=cut + +=head2 SAX methods + +See L for the additional SAX methods. + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/FastHitEventBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/FastHitEventBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,286 @@ +# $Id: FastHitEventBuilder.pm,v 1.6 2002/12/05 13:46:35 heikki Exp $ +# +# BioPerl module for Bio::SearchIO::FastHitEventBuilder +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::FastHitEventBuilder - Event Handler for SearchIO events. + +=head1 SYNOPSIS + + # Do not use this object directly, this object is part of the SearchIO + # event based parsing system. + + # to use the FastHitEventBuilder do this + + use Bio::SearchIO::FastHitEventBuilder; + + my $searchio = new Bio::SearchIO(-format => $format, -file => $file); + + $searchio->attach_EventHandler(new Bio::SearchIO::FastHitEventBuilder); + + while( my $r = $searchio->next_result ) { + while( my $h = $r->next_hit ) { + # note that Hits will NOT have HSPs + } + } + +=head1 DESCRIPTION + +This object handles Search Events generated by the SearchIO classes +and build appropriate Bio::Search::* objects from them. This object +is intended for lightweight parsers which only want Hits and not deal +with the overhead of HSPs. It is a lot faster than the standard +parser event handler but of course you are getting less information +and less objects out. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::FastHitEventBuilder; +use vars qw(@ISA %KNOWNEVENTS); +use strict; + +use Bio::Root::Root; +use Bio::SearchIO::EventHandlerI; +use Bio::Search::HSP::HSPFactory; +use Bio::Search::Hit::HitFactory; +use Bio::Search::Result::ResultFactory; + +@ISA = qw(Bio::Root::Root Bio::SearchIO::EventHandlerI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::FastHitEventBuilder(); + Function: Builds a new Bio::SearchIO::FastHitEventBuilder object + Returns : Bio::SearchIO::FastHitEventBuilder + Args : -hit_factory => Bio::Factory::ObjectFactoryI + -result_factory => Bio::Factory::ObjectFactoryI + +See L for more information + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($hspF,$hitF,$resultF) = $self->_rearrange([qw(HIT_FACTORY + RESULT_FACTORY)],@args); + $self->register_factory('hit', $hitF || Bio::Search::Hit::HitFactory->new()); + $self->register_factory('result', $resultF || Bio::Search::Result::ResultFactory->new()); + + return $self; +} + +# new comes from the superclass + +=head2 will_handle + + Title : will_handle + Usage : if( $handler->will_handle($event_type) ) { ... } + Function: Tests if this event builder knows how to process a specific event + Returns : boolean + Args : event type name + + +=cut + +sub will_handle{ + my ($self,$type) = @_; + # these are the events we recognize + return ( $type eq 'hit' || $type eq 'result' ); +} + +=head2 SAX methods + +=cut + +=head2 start_result + + Title : start_result + Usage : $handler->start_result($resulttype) + Function: Begins a result event cycle + Returns : none + Args : Type of Report + +=cut + +sub start_result { + my ($self,$type) = @_; + $self->{'_resulttype'} = $type; + $self->{'_hits'} = []; + return; +} + +=head2 end_result + + Title : end_result + Usage : my @results = $parser->end_result + Function: Finishes a result handler cycle Returns : A Bio::Search::Result::ResultI + Args : none + +=cut + +sub end_result { + my ($self,$type,$data) = @_; + if( defined $data->{'runid'} && + $data->{'runid'} !~ /^\s+$/ ) { + + if( $data->{'runid'} !~ /^lcl\|/) { + $data->{"RESULT-query_name"}= $data->{'runid'}; + } else { + ($data->{"RESULT-query_name"},$data->{"RESULT-query_description"}) = split(/\s+/,$data->{"RESULT-query_description"},2); + } + + if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) { + my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1 + # this is for |123|gb|ABC1.1| + $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/); + $data->{"RESULT-query_accession"}= $acc; + } + delete $data->{'runid'}; + } + my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); } + grep { /^RESULT/ } keys %{$data}; + + $args{'-algorithm'} = uc( $args{'-algorithm_name'} || $type); + $args{'-hits'} = $self->{'_hits'}; + my $result = $self->factory('result')->create(%args); + $self->{'_hits'} = []; + return $result; +} + +=head2 start_hit + + Title : start_hit + Usage : $handler->start_hit() + Function: Starts a Hit event cycle + Returns : none + Args : type of event and associated hashref + + +=cut + +sub start_hit{ + my ($self,$type) = @_; + return; +} + + +=head2 end_hit + + Title : end_hit + Usage : $handler->end_hit() + Function: Ends a Hit event cycle + Returns : Bio::Search::Hit::HitI object + Args : type of event and associated hashref + + +=cut + +sub end_hit{ + my ($self,$type,$data) = @_; + my %args = map { my $v = $data->{$_}; s/HIT//; ($_ => $v); } grep { /^HIT/ } keys %{$data}; + $args{'-algorithm'} = uc( $args{'-algorithm_name'} || $type); + $args{'-query_len'} = $data->{'RESULT-query_length'}; + my ($hitrank) = scalar @{$self->{'_hits'}} + 1; + $args{'-rank'} = $hitrank; + my $hit = $self->factory('hit')->create(%args); + push @{$self->{'_hits'}}, $hit; + $self->{'_hsps'} = []; + return $hit; +} + +=head2 Factory methods + +=cut + +=head2 register_factory + + Title : register_factory + Usage : $handler->register_factory('TYPE',$factory); + Function: Register a specific factory for a object type class + Returns : none + Args : string representing the class and + Bio::Factory::ObjectFactoryI + +See L for more information + +=cut + +sub register_factory{ + my ($self, $type,$f) = @_; + if( ! defined $f || ! ref($f) || + ! $f->isa('Bio::Factory::ObjectFactoryI') ) { + $self->throw("Cannot set factory to value $f".ref($f)."\n"); + } + $self->{'_factories'}->{lc($type)} = $f; +} + + +=head2 factory + + Title : factory + Usage : my $f = $handler->factory('TYPE'); + Function: Retrieves the associated factory for requested 'TYPE' + Returns : a Bio::Factory::ObjectFactoryI or undef if none registered + Args : name of factory class to retrieve + +See L for more information + +=cut + +sub factory{ + my ($self,$type) = @_; + return $self->{'_factories'}->{lc($type)} || $self->throw("No factory registered for $type"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/SearchResultEventBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/SearchResultEventBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,365 @@ +# $Id: SearchResultEventBuilder.pm,v 1.25.2.1 2003/01/17 20:32:54 jason Exp $ +# +# BioPerl module for Bio::SearchIO::SearchResultEventBuilder +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::SearchResultEventBuilder - Event Handler for SearchIO events. + +=head1 SYNOPSIS + +# Do not use this object directly, this object is part of the SearchIO +# event based parsing system. + +=head1 DESCRIPTION + +This object handles Search Events generated by the SearchIO classes +and build appropriate Bio::Search::* objects from them. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::SearchResultEventBuilder; +use vars qw(@ISA %KNOWNEVENTS); +use strict; + +use Bio::Root::Root; +use Bio::SearchIO::EventHandlerI; +use Bio::Search::HSP::HSPFactory; +use Bio::Search::Hit::HitFactory; +use Bio::Search::Result::ResultFactory; + +@ISA = qw(Bio::Root::Root Bio::SearchIO::EventHandlerI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::SearchResultEventBuilder(); + Function: Builds a new Bio::SearchIO::SearchResultEventBuilder object + Returns : Bio::SearchIO::SearchResultEventBuilder + Args : -hsp_factory => Bio::Factory::ObjectFactoryI + -hit_factory => Bio::Factory::ObjectFactoryI + -result_factory => Bio::Factory::ObjectFactoryI + +See L for more information + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($hspF,$hitF,$resultF) = $self->_rearrange([qw(HSP_FACTORY + HIT_FACTORY + RESULT_FACTORY)],@args); + $self->register_factory('hsp', $hspF || Bio::Search::HSP::HSPFactory->new()); + $self->register_factory('hit', $hitF || Bio::Search::Hit::HitFactory->new()); + $self->register_factory('result', $resultF || Bio::Search::Result::ResultFactory->new()); + + return $self; +} + +# new comes from the superclass + +=head2 will_handle + + Title : will_handle + Usage : if( $handler->will_handle($event_type) ) { ... } + Function: Tests if this event builder knows how to process a specific event + Returns : boolean + Args : event type name + + +=cut + +sub will_handle{ + my ($self,$type) = @_; + # these are the events we recognize + return ( $type eq 'hsp' || $type eq 'hit' || $type eq 'result' ); +} + +=head2 SAX methods + +=cut + +=head2 start_result + + Title : start_result + Usage : $handler->start_result($resulttype) + Function: Begins a result event cycle + Returns : none + Args : Type of Report + +=cut + +sub start_result { + my ($self,$type) = @_; + $self->{'_resulttype'} = $type; + $self->{'_hits'} = []; + $self->{'_hsps'} = []; + return; +} + +=head2 end_result + + Title : end_result + Usage : my @results = $parser->end_result + Function: Finishes a result handler cycle + Returns : A Bio::Search::Result::ResultI + Args : none + +=cut + +sub end_result { + my ($self,$type,$data) = @_; + if( defined $data->{'runid'} && + $data->{'runid'} !~ /^\s+$/ ) { + + if( $data->{'runid'} !~ /^lcl\|/) { + $data->{"RESULT-query_name"}= $data->{'runid'}; + } else { + ($data->{"RESULT-query_name"},$data->{"RESULT-query_description"}) = split(/\s+/,$data->{"RESULT-query_description"},2); + } + + if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) { + my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1 + # this is for |123|gb|ABC1.1| + $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/); + $data->{"RESULT-query_accession"}= $acc; + } + delete $data->{'runid'}; + } + my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); } + grep { /^RESULT/ } keys %{$data}; + + $args{'-algorithm'} = uc( $args{'-algorithm_name'} || + $data->{'RESULT-algorithm_name'} || $type); + $args{'-hits'} = $self->{'_hits'}; + my $result = $self->factory('result')->create(%args); + $self->{'_hits'} = []; + return $result; +} + +=head2 start_hsp + + Title : start_hsp + Usage : $handler->start_hsp($name,$data) + Function: Begins processing a HSP event + Returns : none + Args : type of element + associated data (hashref) + +=cut + +sub start_hsp { + my ($self,@args) = @_; + return; +} + +=head2 end_hsp + + Title : end_hsp + Usage : $handler->end_hsp() + Function: Finish processing a HSP event + Returns : none + Args : type of event and associated hashref + + +=cut + +sub end_hsp { + my ($self,$type,$data) = @_; + # this code is to deal with the fact that Blast XML data + # always has start < end and one has to infer strandedness + # from the frame which is a problem for the Search::HSP object + # which expect to be able to infer strand from the order of + # of the begin/end of the query and hit coordinates + if( defined $data->{'HSP-query_frame'} && # this is here to protect from undefs + (( $data->{'HSP-query_frame'} < 0 && + $data->{'HSP-query_start'} < $data->{'HSP-query_end'} ) || + $data->{'HSP-query_frame'} > 0 && + ( $data->{'HSP-query_start'} > $data->{'HSP-query_end'} ) ) + ) + { + # swap + ($data->{'HSP-query_start'}, + $data->{'HSP-query_end'}) = ($data->{'HSP-query_end'}, + $data->{'HSP-query_start'}); + } + if( defined $data->{'HSP-hit_frame'} && # this is here to protect from undefs + ((defined $data->{'HSP-hit_frame'} && $data->{'HSP-hit_frame'} < 0 && + $data->{'HSP-hit_start'} < $data->{'HSP-hit_end'} ) || + defined $data->{'HSP-hit_frame'} && $data->{'HSP-hit_frame'} > 0 && + ( $data->{'HSP-hit_start'} > $data->{'HSP-hit_end'} ) ) + ) + { + # swap + ($data->{'HSP-hit_start'}, + $data->{'HSP-hit_end'}) = ($data->{'HSP-hit_end'}, + $data->{'HSP-hit_start'}); + } + $data->{'HSP-query_frame'} ||= 0; + $data->{'HSP-hit_frame'} ||= 0; + # handle Blast 2.1.2 which did not support data member: hsp_align-len + $data->{'HSP-query_length'} ||= length ($data->{'HSP-query_seq'} || ''); + $data->{'HSP-hit_length'} ||= length ($data->{'HSP-hit_seq'} || ''); + $data->{'HSP-hsp_length'} ||= length ($data->{'HSP-homology_seq'} || ''); + + my %args = map { my $v = $data->{$_}; s/HSP//; ($_ => $v) } + grep { /^HSP/ } keys %{$data}; + + $args{'-algorithm'} = uc( $args{'-algorithm_name'} || + $data->{'RESULT-algorithm_name'} || $type); + # copy this over from result + $args{'-query_name'} = $data->{'RESULT-query_name'}; + $args{'-hit_name'} = $data->{'HIT-name'}; + my ($rank) = scalar @{$self->{'_hsps'}} + 1; + $args{'-rank'} = $rank; + + my $hsp = $self->factory('hsp')->create(%args); + push @{$self->{'_hsps'}}, $hsp; + return $hsp; +} + + +=head2 start_hit + + Title : start_hit + Usage : $handler->start_hit() + Function: Starts a Hit event cycle + Returns : none + Args : type of event and associated hashref + + +=cut + +sub start_hit{ + my ($self,$type) = @_; + $self->{'_hsps'} = []; + return; +} + + +=head2 end_hit + + Title : end_hit + Usage : $handler->end_hit() + Function: Ends a Hit event cycle + Returns : Bio::Search::Hit::HitI object + Args : type of event and associated hashref + + +=cut + +sub end_hit{ + my ($self,$type,$data) = @_; + my %args = map { my $v = $data->{$_}; s/HIT//; ($_ => $v); } grep { /^HIT/ } keys %{$data}; + + # I hate special cases, but this is here because NCBI BLAST XML + # doesn't play nice and is undergoing mutation -jason + if( $args{'-name'} =~ /BL_ORD_ID/ ) { + ($args{'-name'}, $args{'-description'}) = split(/\s+/,$args{'-description'},2); + } + $args{'-algorithm'} = uc( $args{'-algorithm_name'} || + $data->{'RESULT-algorithm_name'} || $type); + $args{'-hsps'} = $self->{'_hsps'}; + $args{'-query_len'} = $data->{'RESULT-query_length'}; + my ($hitrank) = scalar @{$self->{'_hits'}} + 1; + $args{'-rank'} = $hitrank; + my $hit = $self->factory('hit')->create(%args); + push @{$self->{'_hits'}}, $hit; + $self->{'_hsps'} = []; + return $hit; +} + +=head2 Factory methods + +=cut + +=head2 register_factory + + Title : register_factory + Usage : $handler->register_factory('TYPE',$factory); + Function: Register a specific factory for a object type class + Returns : none + Args : string representing the class and + Bio::Factory::ObjectFactoryI + +See L for more information + +=cut + +sub register_factory{ + my ($self, $type,$f) = @_; + if( ! defined $f || ! ref($f) || + ! $f->isa('Bio::Factory::ObjectFactoryI') ) { + $self->throw("Cannot set factory to value $f".ref($f)."\n"); + } + $self->{'_factories'}->{lc($type)} = $f; +} + + +=head2 factory + + Title : factory + Usage : my $f = $handler->factory('TYPE'); + Function: Retrieves the associated factory for requested 'TYPE' + Returns : a Bio::Factory::ObjectFactoryI or undef if none registered + Args : name of factory class to retrieve + +See L for more information + +=cut + +sub factory{ + my ($self,$type) = @_; + return $self->{'_factories'}->{lc($type)} || $self->throw("No factory registered for $type"); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/SearchWriterI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/SearchWriterI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,127 @@ +#----------------------------------------------------------------- +# $Id: SearchWriterI.pm,v 1.7 2002/12/01 00:05:01 jason Exp $ +# +# BioPerl module Bio::SearchIO::SearchWriterI +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#----------------------------------------------------------------- + +=head1 NAME + +Bio::SearchIO::SearchWriterI - Interface for outputting parsed Search results + +=head1 SYNOPSIS + +Bio::SearchIO::SearchWriterI objects cannot be instantiated since this +module defines a pure interface. + +Given an object that implements the Bio::SearchIO::SearchWriterI interface, +you can do the following things with it: + + print $writer->to_string( $result_obj, @args ); + +=head1 DESCRIPTION + +This module defines abstract methods that all subclasses must implement +to be used for outputting results from B +objects. + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. + +=cut + +package Bio::SearchIO::SearchWriterI; + +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + +=head2 to_string + + Purpose : Produces data for each Search::Result::ResultI in a string. + : This is an abstract method. For some useful implementations, + : see ResultTableWriter.pm, HitTableWriter.pm, + : and HSPTableWriter.pm. + Usage : print $writer->to_string( $result_obj, @args ); + Argument : $result_obj = A Bio::Search::Result::ResultI object + : @args = any additional arguments used by your implementation. + Returns : String containing data for each search Result or any of its + : sub-objects (Hits and HSPs). + Throws : n/a + +=cut + +sub to_string { + my ($self, $result, @args) = @_; + $self->throw_not_implemented; +} + +=head2 end_report + + Title : end_report + Usage : $self->end_report() + Function: The method to call when ending a report, this is + mostly for cleanup for formats which require you to + have something at the end of the document () + for HTML + Returns : string + Args : none + + +=cut + +sub end_report { + my $self = shift; + return ''; +} + +=head2 filter + + Title : filter + Usage : $writer->filter('hsp', \&hsp_filter); + Function: Filter out either at HSP,Hit,or Result level + Returns : none + Args : string => data type, + CODE reference + + +=cut + +# yes this is an implementation in the interface, +# yes it assumes that the underlying class is hash-based +# yes that might not be a good idea, but until people +# start extending the SearchWriterI interface I think +# this is an okay way to go + +sub filter { + my ($self,$method,$code) = @_; + return undef unless $method; + $method = uc($method); + if( $method ne 'HSP' && + $method ne 'HIT' && + $method ne 'RESULT' ) { + $self->warn("Unknown method $method"); + return undef; + } + if( $code ) { + $self->throw("Must provide a valid code reference") unless ref($code) =~ /CODE/; + $self->{$method} = $code; + } + return $self->{$method}; +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/Writer/HSPTableWriter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/Writer/HSPTableWriter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +# $Id: HSPTableWriter.pm,v 1.12 2002/11/23 15:32:24 jason Exp $ + +=head1 NAME + +Bio::SearchIO::Writer::HSPTableWriter - Tab-delimited data for Bio::Search::HSP::HSPI objects + +=head1 SYNOPSIS + +=head2 Example 1: Using the default columns + + use Bio::SearchIO; + use Bio::SearchIO::Writer::HSPTableWriter; + + my $in = Bio::SearchIO->new(); + + my $writer = Bio::SearchIO::Writer::HSPTableWriter->new(); + + my $out = Bio::SearchIO->new( -writer => $writer ); + + while ( my $result = $in->next_result() ) { + $out->write_result($result, ($in->report_count - 1 ? 0 : 1) ); + } + +=head2 Example 2: Specifying a subset of columns + + use Bio::SearchIO; + use Bio::SearchIO::Writer::HSPTableWriter; + + my $in = Bio::SearchIO->new(); + + my $writer = Bio::SearchIO::Writer::HSPTableWriter->new( + -columns => [qw( + query_name + query_length + hit_name + hit_length + rank + frac_identical_query + expect + )] ); + + my $out = Bio::SearchIO->new( -writer => $writer, + -file => ">searchio.out" ); + + while ( my $result = $in->next_result() ) { + $out->write_result($result, ($in->report_count - 1 ? 0 : 1) ); + } + +=head2 Custom Labels + +You can also specify different column labels if you don't want to use +the defaults. Do this by specifying a C<-labels> hash reference +parameter when creating the HSPTableWriter object. The keys of the +hash should be the column number (left-most column = 1) for the label(s) +you want to specify. Here's an example: + + my $writer = Bio::SearchIO::Writer::HSPTableWriter->new( + -columns => [qw( query_name + query_length + hit_name + hit_length )], + -labels => { 1 => 'QUERY_GI', + 3 => 'HIT_IDENTIFIER' } ); + + +=head1 DESCRIPTION + +Bio::SearchIO::Writer::HSPTableWriter generates output at the finest +level of granularity for data within a search result. Data for each HSP +within each hit in a search result is output in tab-delimited format, +one row per HSP. + +=head2 Available Columns + +Here are the columns that can be specified in the C<-columns> +parameter when creating a HSPTableWriter object. If a C<-columns> parameter +is not specified, this list, in this order, will be used as the default. + + query_name # Sequence identifier of the query. + query_length # Full length of the query sequence + hit_name # Sequence identifier of the hit + hit_length # Full length of the hit sequence + round # Round number for hit (PSI-BLAST) + rank + expect # Expect value for the alignment + score # Score for the alignment (e.g., BLAST score) + bits # Bit score for the alignment + frac_identical_query # fraction of identical substitutions in query + frac_identical_hit # fraction of identical substitutions in hit + frac_conserved_query # fraction of conserved substitutions in query + frac_conserved_hit # fraction of conserved substitutions in hit + length_aln_query # Length of the aligned portion of the query sequence + length_aln_hit # Length of the aligned portion of the hit sequence + gaps_query # Number of gaps in the aligned query sequence + gaps_hit # Number of gaps in the aligned hit sequence + gaps_total # Number of gaps in the aligned query and hit sequences + start_query # Starting coordinate of the aligned portion of the query sequence + end_query # Ending coordinate of the aligned portion of the query sequence + start_hit # Starting coordinate of the aligned portion of the hit sequence + end_hit # Ending coordinate of the aligned portion of the hit sequence + strand_query # Strand of the aligned query sequence + strand_hit # Strand of the aligned hit sequence + frame # Reading frame of the aligned query sequence + hit_description # Full description of the hit sequence + query_description # Full description of the query sequence + +For more details about these columns, see the documentation for the +corresponding method in Bio::Search::HSP::HSPI. + +=head1 TODO + +Figure out the best way to incorporate algorithm-specific score columns. +The best route is probably to have algorith-specific subclasses +(e.g., BlastHSPTableWriter, FastaHSPTableWriter). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports +and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 SEE ALSO + + Bio::SearchIO::Writer::HitTableWriter + Bio::SearchIO::Writer::ResultTableWriter + +=head1 METHODS + +=cut + +package Bio::SearchIO::Writer::HSPTableWriter; + +use strict; +use Bio::SearchIO::Writer::ResultTableWriter; + +use vars qw( @ISA ); +@ISA = qw( Bio::SearchIO::Writer::ResultTableWriter ); + + +# Array fields: column, object, method[/argument], printf format, column label +# Methods for result object are defined in Bio::Search::Result::ResultI. +# Methods for hit object are defined in Bio::Search::Hit::HitI. +# Methods for hsp object are defined in Bio::Search::HSP::HSPI. +# Tech note: If a bogus method is supplied, it will result in all values to be zero. +# Don't know why this is. +# TODO (maybe): Allow specification of signif_format (i.e., separate mantissa/exponent) +my %column_map = ( + 'query_name' => ['1', 'result', 'query_name', 's', 'QUERY' ], + 'query_length' => ['2', 'result', 'query_length', 'd', 'LEN_Q'], + 'hit_name' => ['3', 'hit', 'name', 's', 'HIT'], + 'hit_length' => ['4', 'hit', 'hit_length', 'd', 'LEN_H'], + 'round' => ['5', 'hit', 'iteration', 'd', 'ROUND', 'hit'], + 'rank' => ['6', 'hsp', 'rank', 'd', 'RANK'], + 'expect' => ['7', 'hsp', 'expect', '.1e', 'EXPCT'], + 'score' => ['8', 'hsp', 'score', 'd', 'SCORE'], + 'bits' => ['9', 'hsp', 'bits', 'd', 'BITS'], + 'frac_identical_query' => ['10', 'hsp', 'frac_identical/query', '.2f', 'FR_IDQ'], + 'frac_identical_hit' => ['11', 'hsp', 'frac_identical/hit', '.2f', 'FR_IDH'], + 'frac_conserved_query' => ['12', 'hsp', 'frac_conserved/query', '.2f', 'FR_CNQ'], + 'frac_conserved_hit' => ['13', 'hsp', 'frac_conserved/hit', '.2f', 'FR_CNH'], + 'length_aln_query' => ['14', 'hsp', 'length/query', 'd', 'LN_ALQ'], + 'length_aln_hit' => ['15', 'hsp', 'length/hit', 'd', 'LN_ALH'], + 'gaps_query' => ['16', 'hsp', 'gaps/query', 'd', 'GAPS_Q'], + 'gaps_hit' => ['17', 'hsp', 'gaps/hit', 'd', 'GAPS_H'], + 'gaps_total' => ['18', 'hsp', 'gaps/total', 'd', 'GAPS_QH'], + 'start_query' => ['19', 'hsp', 'start/query', 'd', 'START_Q'], + 'end_query' => ['20', 'hsp', 'end/query', 'd', 'END_Q'], + 'start_hit' => ['21', 'hsp', 'start/hit', 'd', 'START_H'], + 'end_hit' => ['22', 'hsp', 'end/hit', 'd', 'END_H'], + 'strand_query' => ['23', 'hsp', 'strand/query', 'd', 'STRND_Q'], + 'strand_hit' => ['24', 'hsp', 'strand/hit', 'd', 'STRND_H'], + 'frame' => ['25', 'hsp', 'frame', 's', 'FRAME'], + 'hit_description' => ['26', 'hit', 'hit_description', 's', 'DESC_H'], + 'query_description' => ['27', 'result', 'query_description', 's', 'DESC_Q'], + ); + +sub column_map { return %column_map } + + +=head2 to_string() + +Note: this method is not intended for direct use. +The SearchIO::write_result() method calls it automatically +if the writer is hooked up to a SearchIO object as illustrated in +L. + + Title : to_string() + : + Usage : print $writer->to_string( $result_obj, [$include_labels] ); + : + Argument : $result_obj = A Bio::Search::Result::ResultI object + : $include_labels = boolean, if true column labels are included (default: false) + : + Returns : String containing tab-delimited set of data for each HSP + : in each Hit of the supplied ResultI object. + : + Throws : n/a + +=cut + +sub to_string { + my ($self, $result, $include_labels) = @_; + + my $str = $include_labels ? $self->column_labels() : ''; + my ($resultfilter,$hitfilter, + $hspfilter) = ( $self->filter('RESULT'), + $self->filter('HIT'), + $self->filter('HSP')); + if( ! defined $resultfilter || &{$resultfilter}($result) ) { + my $func_ref = $self->row_data_func; + my $printf_fmt = $self->printf_fmt; + $result->can('rewind') && + $result->rewind(); # insure we're at the beginning + while( my $hit = $result->next_hit) { + next if( defined $hitfilter && ! &{$hitfilter}($hit) ); + $hit->can('rewind') && $hit->rewind;# insure we're at the beginning + while(my $hsp = $hit->next_hsp) { + next if ( defined $hspfilter && ! &{$hspfilter}($hsp)); + my @row_data = &{$func_ref}($result, $hit, $hsp); + $str .= sprintf "$printf_fmt\n", @row_data; + } + } + } + $str =~ s/\t\n/\n/gs; + return $str; +} + +=head2 end_report + + Title : end_report + Usage : $self->end_report() + Function: The method to call when ending a report, this is + mostly for cleanup for formats which require you to + have something at the end of the document. Nothing for + a text message. + Returns : string + Args : none + +=cut + +sub end_report { + return ''; +} + +=head2 filter + + Title : filter + Usage : $writer->filter('hsp', \&hsp_filter); + Function: Filter out either at HSP,Hit,or Result level + Returns : none + Args : string => data type, + CODE reference + + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/Writer/HTMLResultWriter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/Writer/HTMLResultWriter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,889 @@ +# $Id: HTMLResultWriter.pm,v 1.12.2.4 2003/09/15 16:08:55 jason Exp $ +# +# BioPerl module for Bio::SearchIO::Writer::HTMLResultWriter +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# Changes 2003-07-31 (jason) +# Gary has cleaned up the code a lot to produce better looking +# HTML +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::Writer::HTMLResultWriter - Object to implement writing a Bio::Search::ResultI in HTML. + +=head1 SYNOPSIS + + use Bio::SearchIO; + use Bio::SearchIO::Writer::HTMLResultWriter; + + my $in = new Bio::SearchIO(-format => 'blast', + -file => shift @ARGV); + + my $writer = new Bio::SearchIO::Writer::HTMLResultWriter(); + my $out = new Bio::SearchIO(-writer => $writer); + $out->write_result($in->next_result); + + + # to filter your output + my $MinLength = 100; # need a variable with scope outside the method + sub hsp_filter { + my $hsp = shift; + return 1 if $hsp->length('total') > $MinLength; + } + sub result_filter { + my $result = shift; + return $hsp->num_hits > 0; + } + + my $writer = new Bio::SearchIO::Writer::HTMLResultWriter + (-filters => { 'HSP' => \&hsp_filter} ); + my $out = new Bio::SearchIO(-writer => $writer); + $out->write_result($in->next_result); + + # can also set the filter via the writer object + $writer->filter('RESULT', \&result_filter); + +=head1 DESCRIPTION + +This object implements the SearchWriterI interface which will produce +a set of HTML for a specific Bio::Search::Report::ReportI interface. + + +You can also provide the argument -filters => \%hash to filter the at +the hsp, hit, or result level. %hash is an associative array which +contains any or all of the keys (HSP, HIT, RESULT). The values +pointed to by these keys would be references to a subroutine which +expects to be passed an object - one of Bio::Search::HSP::HSPI, +Bio::Search::Hit::HitI, and Bio::Search::Result::ResultI respectively. +Each function needs to return a boolean value as to whether or not the +passed element should be included in the output report - true if it is to be included, false if it to be omitted. + +For example to filter on sequences in the database which are too short +for your criteria you would do the following. + +Define a hit filter method + + sub hit_filter { + my $hit = shift; + return $hit->length E 100; # test if length of the hit sequence + # long enough + } + my $writer = new Bio::SearchIO::Writer::TextResultWriter( + -filters => { 'HIT' =E \&hit_filter } + ); + +Another example would be to filter HSPs on percent identity, let's +only include HSPs which are 75% identical or better. + + sub hsp_filter { + my $hsp = shift; + return $hsp->percent_identity E 75; + } + my $writer = new Bio::SearchIO::Writer::TextResultWriter( + -filters => { 'HSP' =E \&hsp_filter } + ); + +See L for more info on the filter method. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason-at-bioperl-dot-org + +=head1 CONTRIBUTORS + +Gary Williams G.Williams@hgmp.mrc.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + + +package Bio::SearchIO::Writer::HTMLResultWriter; +use vars qw(@ISA %RemoteURLDefault + $MaxDescLen $DATE $AlignmentLineWidth $Revision); +use strict; +$Revision = '$Id: HTMLResultWriter.pm,v 1.12.2.4 2003/09/15 16:08:55 jason Exp $'; #' + +# Object preamble - inherits from Bio::Root::RootI + +BEGIN { + $DATE = localtime(time); + %RemoteURLDefault = ( 'PROTEIN' => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein&cmd=search&term=%s', + 'NUCLEOTIDE' => 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=nucleotide&cmd=search&term=%s' + ); + + $MaxDescLen = 60; + $AlignmentLineWidth = 60; +} + +use Bio::Root::Root; +use Bio::SearchIO::SearchWriterI; + +@ISA = qw(Bio::Root::Root Bio::SearchIO::SearchWriterI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::Writer::HTMLResultWriter(); + Function: Builds a new Bio::SearchIO::Writer::HTMLResultWriter object + Returns : Bio::SearchIO::Writer::HTMLResultWriter + Args : -filters => hashref with any or all of the keys (HSP HIT RESULT) + which have values pointing to a subroutine reference + which will expect to get a + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($p,$n,$filters) = $self->_rearrange([qw(PROTEIN_URL + NUCLEOTIDE_URL + FILTERS)],@args); + $self->remote_database_url('p',$p || $RemoteURLDefault{'PROTEIN'}); + $self->remote_database_url('n',$n || $RemoteURLDefault{'NUCLEOTIDE'}); + + if( defined $filters ) { + if( !ref($filters) =~ /HASH/i ) { + $self->warn("Did not provide a hashref for the FILTERS option, ignoring."); + } else { + while( my ($type,$code) = each %{$filters} ) { + $self->filter($type,$code); + } + } + } + + return $self; +} + +=head2 remote_database_url + + Title : remote_database_url + Usage : $obj->remote_database_url($type,$newval) + Function: This should return or set a string that contains a %s which can be + filled in with sprintf. + Returns : value of remote_database_url + Args : $type - 'PROTEIN' or 'P' for protein URLS + 'NUCLEOTIDE' or 'N' for nucleotide URLS + $value - new value to set [optional] + + +=cut + +sub remote_database_url{ + my ($self,$type,$value) = @_; + if( ! defined $type || $type !~ /^(P|N)/i ) { + $self->warn("Must provide a type (PROTEIN or NUCLEOTIDE)"); + return ''; + } + $type = uc $1; + if( defined $value) { + $self->{'remote_database_url'}->{$type} = $value; + } + return $self->{'remote_database_url'}->{$type}; +} + +=head2 to_string + + Purpose : Produces data for each Search::Result::ResultI in a string. + : This is an abstract method. For some useful implementations, + : see ResultTableWriter.pm, HitTableWriter.pm, + : and HSPTableWriter.pm. + Usage : print $writer->to_string( $result_obj, @args ); + Argument : $result_obj = A Bio::Search::Result::ResultI object + : @args = any additional arguments used by your implementation. + Returns : String containing data for each search Result or any of its + : sub-objects (Hits and HSPs). + Throws : n/a + +=cut + +sub to_string { + my ($self,$result,$num) = @_; + $num ||= 0; + return unless defined $result; + my ($resultfilter,$hitfilter, $hspfilter) = ( $self->filter('RESULT'), + $self->filter('HIT'), + $self->filter('HSP') ); + return '' if( defined $resultfilter && ! &{$resultfilter}($result) ); + + my ($qtype,$dbtype,$dbseqtype,$type); + my $alg = $result->algorithm; + + # This is actually wrong for the FASTAs I think + if( $alg =~ /T(FAST|BLAST)([XY])/i ) { + $qtype = $dbtype = 'translated'; + $dbseqtype = $type = 'PROTEIN'; + } elsif( $alg =~ /T(FAST|BLAST)N/i ) { + $qtype = ''; + $dbtype = 'translated'; + $type = 'PROTEIN'; + $dbseqtype = 'NUCLEOTIDE'; + } elsif( $alg =~ /(FAST|BLAST)N/i || + $alg =~ /(WABA|EXONERATE)/i ) { + $qtype = $dbtype = ''; + $type = $dbseqtype = 'NUCLEOTIDE'; + } elsif( $alg =~ /(FAST|BLAST)P/ || $alg =~ /SSEARCH/i ) { + $qtype = $dbtype = ''; + $type = $dbseqtype = 'PROTEIN'; + } elsif( $alg =~ /(FAST|BLAST)[XY]/i ) { + $qtype = 'translated'; + $dbtype = 'PROTEIN'; + $dbseqtype = $type = 'PROTEIN'; + } else { + print STDERR "algorithm was ", $result->algorithm, " couldn't match\n"; + } + + + my %baselens = ( 'Sbjct:' => ( $dbtype eq 'translated' ) ? 3 : 1, + 'Query:' => ( $qtype eq 'translated' ) ? 3 : 1); + + my $str; + if( ! defined $num || $num <= 1 ) { + $str = &{$self->start_report}($result); + } + + $str .= &{$self->title}($result); + + $str .= $result->algorithm_reference || $self->algorithm_reference($result); + $str .= &{$self->introduction}($result); + + $str .= "

+ + "; + + my $hspstr = '

'; + if( $result->can('rewind')) { + $result->rewind(); # support stream based parsing routines + } + + while( my $hit = $result->next_hit ) { + next if( $hitfilter && ! &{$hitfilter}($hit) ); + my $nm = $hit->name(); + + $self->debug( "no $nm for name (".$hit->description(). "\n") + unless $nm; + my ($gi,$acc) = &{$self->id_parser}($nm); + my $p = "%-$MaxDescLen". "s"; + my $descsub; + if( length($hit->description) > ($MaxDescLen - 3) ) { + $descsub = sprintf($p, + substr($hit->description,0,$MaxDescLen-3) . "..."); + } else { + $descsub = sprintf($p,$hit->description); + } + + my $url_desc = &{$self->hit_link_desc()}($self,$hit, $result); + my $url_align = &{$self->hit_link_align()}($self,$hit, $result); + + my @hsps = $hit->hsps; + + # failover to first HSP if the data does not contain a + # bitscore/significance value for the Hit (NCBI XML data for one) + + $str .= sprintf('

'."\n", + $url_desc, $descsub, + ($hit->raw_score ? $hit->raw_score : + (defined $hsps[0] ? $hsps[0]->score : ' ')), + $acc, + ( $hit->significance ? $hit->significance : + (defined $hsps[0] ? $hsps[0]->evalue : ' ')) + ); + + $hspstr .= "\n". + sprintf(">%s %s\n
Length = %s

\n\n", $url_align, + defined $hit->description ? $hit->description : '', + &_numwithcommas($hit->length)); + my $ct = 0; + foreach my $hsp (@hsps ) { + next if( $hspfilter && ! &{$hspfilter}($hsp) ); + $hspstr .= sprintf(" Score = %s bits (%s), Expect = %s", + $hsp->bits, $hsp->score, $hsp->evalue); + if( defined $hsp->pvalue ) { + $hspstr .= ", P = ".$hsp->pvalue; + } + $hspstr .= "
\n"; + $hspstr .= sprintf(" Identities = %d/%d (%d%%)", + ( $hsp->frac_identical('total') * + $hsp->length('total')), + $hsp->length('total'), + $hsp->frac_identical('total') * 100); + + if( $type eq 'PROTEIN' ) { + $hspstr .= sprintf(", Positives = %d/%d (%d%%)", + ( $hsp->frac_conserved('total') * + $hsp->length('total')), + $hsp->length('total'), + $hsp->frac_conserved('total') * 100); + } + if( $hsp->gaps ) { + $hspstr .= sprintf(", Gaps = %d/%d (%d%%)", + $hsp->gaps('total'), + $hsp->length('total'), + (100 * $hsp->gaps('total') / + $hsp->length('total'))); + } + + my ($hframe,$qframe) = ( $hsp->hit->frame, $hsp->query->frame); + my ($hstrand,$qstrand) = ($hsp->hit->strand,$hsp->query->strand); + # so TBLASTX will have Query/Hit frames + # BLASTX will have Query frame + # TBLASTN will have Hit frame + if( $hstrand || $qstrand ) { + $hspstr .= ", Frame = "; + my ($signq, $signh); + unless( $hstrand ) { + $hframe = undef; + # if strand is null or 0 then it is protein + # and this no frame + } else { + $signh = $hstrand < 0 ? '-' : '+'; + } + unless( $qstrand ) { + $qframe = undef; + # if strand is null or 0 then it is protein + } else { + $signq =$qstrand < 0 ? '-' : '+'; + } + # remember bioperl stores frames as 0,1,2 (GFF way) + # BLAST reports reports as 1,2,3 so + # we have to add 1 to the frame values + if( defined $hframe && ! defined $qframe) { + $hspstr .= "$signh".($hframe+1); + } elsif( defined $qframe && ! defined $hframe) { + $hspstr .= "$signq".($qframe+1); + } else { + $hspstr .= sprintf(" %s%d / %s%d", + $signq,$qframe+1, + $signh, $hframe+1); + } + } +# $hspstr .= "

\n

";
+	    $hspstr .= "

\n

";
+	    
+	    my @hspvals = ( {'name' => 'Query:',
+			     'seq'  => $hsp->query_string,
+			     'start' => ($qstrand >= 0 ? 
+					 $hsp->query->start : 
+					 $hsp->query->end),
+			     'end'   => ($qstrand >= 0 ? 
+					 $hsp->query->end : 
+					 $hsp->query->start),
+			     'index' => 0,
+			     'direction' => $qstrand || 1
+			     },
+			    { 'name' => ' 'x6,
+			      'seq'  => $hsp->homology_string,
+			      'start' => undef,
+			      'end'   => undef,
+			      'index' => 0,
+			      'direction' => 1
+			      },
+			    { 'name'  => 'Sbjct:',
+			      'seq'   => $hsp->hit_string,
+			      'start' => ($hstrand >= 0 ? 
+					  $hsp->hit->start : 
+					  $hsp->hit->end),
+			      'end'   => ($hstrand >= 0 ? 
+					  $hsp->hit->end : 
+					  $hsp->hit->start),
+			      'index' => 0, 
+			      'direction' => $hstrand || 1
+			      }
+			    );	    
+	    
+	    
+	    # let's set the expected length (in chars) of the starting number
+	    # in an alignment block so we can have things line up
+	    # Just going to try and set to the largest
+	    
+	    my ($numwidth) = sort { $b <=> $a }(length($hspvals[0]->{'start'}),
+						length($hspvals[0]->{'end'}),
+						length($hspvals[2]->{'start'}),
+						length($hspvals[2]->{'end'}));
+	    my $count = 0;
+	    while ( $count <= $hsp->length('total') ) {
+		foreach my $v ( @hspvals ) {
+		    my $piece = substr($v->{'seq'}, $v->{'index'} + $count,
+				       $AlignmentLineWidth);
+		    my $cp = $piece;
+		    my $plen = scalar ( $cp =~ tr/\-//);
+		    my ($start,$end) = ('','');
+		    if( defined $v->{'start'} ) { 
+			$start = $v->{'start'};
+			# since strand can be + or - use the direction
+			# to signify which whether to add or substract from end
+			my $d = $v->{'direction'} * ( $AlignmentLineWidth - $plen )*
+			    $baselens{$v->{'name'}};
+			if( length($piece) < $AlignmentLineWidth ) {
+			    $d = (length($piece) - $plen) * $v->{'direction'} * 
+				$baselens{$v->{'name'}};
+			}
+			$end   = $v->{'start'} + $d - $v->{'direction'};
+			$v->{'start'} += $d;
+		    }
+		    $hspstr .= sprintf("%s %-".$numwidth."s %s %s\n",
+				       $v->{'name'},
+				       $start,
+				       $piece,
+				       $end
+				       );
+		}
+		$count += $AlignmentLineWidth;
+		$hspstr .= "\n\n";
+	    }
+	    $hspstr .= "
\n"; + } +# $hspstr .= "
\n"; + } + + + # make table of search statistics and end the web page + $str .= "
Sequences producing significant alignments:Score
(bits)
E
value
%s %s%s%.2g

\n".$hspstr."


Search Parameters

\n"; + + foreach my $param ( $result->available_parameters ) { + $str .= "\n"; + + } + $str .= "
ParameterValue
$param". $result->get_parameter($param) ."

Search Statistics

\n"; + foreach my $stat ( sort $result->available_statistics ) { + $str .= "\n"; + } + $str .= "
StatisticValue
$stat". $result->get_statistic($stat). "

".$self->footer() . "

\n"; + return $str; +} + +=head2 hit_link_desc + + Title : hit_link_desc + Usage : $self->hit_link_desc(\&link_function); + Function: Get/Set the function which provides an HTML + link(s) for the given hit to be used + within the description section at the top of the BLAST report. + This allows a person reading the report within + a web browser to go to one or more database entries for + the given hit from the description section. + Returns : Function reference + Args : Function reference + See Also: L + +=cut + +sub hit_link_desc{ + my( $self, $code ) = @_; + if ($code) { + $self->{'_hit_link_desc'} = $code; + } + return $self->{'_hit_link_desc'} || \&default_hit_link_desc; +} + +=head2 default_hit_link_desc + + Title : defaulthit_link_desc + Usage : $self->default_hit_link_desc($hit, $result) + Function: Provides an HTML link(s) for the given hit to be used + within the description section at the top of the BLAST report. + This allows a person reading the report within + a web browser to go to one or more database entries for + the given hit from the description section. + Returns : string containing HTML markup ", L, L + +=cut + +sub default_hit_link_desc { + my($self, $hit, $result) = @_; + my $type = ( $result->algorithm =~ /(P|X|Y)$/i ) ? 'PROTEIN' : 'NUCLEOTIDE'; + my ($gi,$acc) = &{$self->id_parser}($hit->name); + + my $url = length($self->remote_database_url($type)) > 0 ? + sprintf('%s', + sprintf($self->remote_database_url($type),$gi || $acc), + $hit->name()) : $hit->name(); + + return $url; +} + + +=head2 hit_link_align + + Title : hit_link_align + Usage : $self->hit_link_align(\&link_function); + Function: Get/Set the function which provides an HTML link(s) + for the given hit to be used + within the HSP alignment section of the BLAST report. + This allows a person reading the report within + a web browser to go to one or more database entries for + the given hit from the alignment section. + Returns : string containing HTML markup ", L, L + +=cut + +sub hit_link_align { + my ($self,$code) = @_; + if ($code) { + $self->{'_hit_link_align'} = $code; + } + return $self->{'_hit_link_align'} || \&default_hit_link_desc; +} + +=head2 start_report + + Title : start_report + Usage : $index->start_report( CODE ) + Function: Stores or returns the code to + write the start of the block, the block + and the start of the <BODY> block of HTML. Useful + for (for instance) specifying alternative + HTML if you are embedding the output in + an HTML page which you have already started. + (For example a routine returning a null string). + Returns \&default_start_report (see below) if not + set. + Example : $index->start_report( \&my_start_report ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub start_report { + my( $self, $code ) = @_; + if ($code) { + $self->{'_start_report'} = $code; + } + return $self->{'_start_report'} || \&default_start_report; +} + +=head2 default_start_report + + Title : default_start_report + Usage : $self->default_start_report($result) + Function: The default method to call when starting a report. + Returns : sting + Args : First argument is a Bio::Search::Result::ResultI + +=cut + +sub default_start_report { + my ($result) = @_; + return sprintf( + qq{<HTML> + <HEAD> <CENTER><TITLE>Bioperl Reformatted HTML of %s output with Bioperl Bio::SearchIO system + + + + + + + },$result->algorithm,$Revision); + +} + +=head2 title + + Title : title + Usage : $self->title($CODE) + + Function: Stores or returns the code to provide HTML for the given + BLAST report that will appear at the top of the BLAST report + HTML output. Useful for (for instance) specifying + alternative routines to write your own titles. + Returns \&default_title (see below) if not + set. + Example : $index->title( \&my_title ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub title { + my( $self, $code ) = @_; + if ($code) { + $self->{'_title'} = $code; + } + return $self->{'_title'} || \&default_title; +} + +=head2 default_title + + Title : default_title + Usage : $self->default_title($result) + Function: Provides HTML for the given BLAST report that will appear + at the top of the BLAST report HTML output. + Returns : string containing HTML markup + The default implementation returns

HTML + containing text such as: + "Bioperl Reformatted HTML of BLASTP Search Report + for gi|1786183|gb|AAC73113.1|" + Args : First argument is a Bio::Search::Result::ResultI + +=cut + +sub default_title { + my ($result) = @_; + + return sprintf( + qq{

Bioperl Reformatted HTML of %s Search Report
for %s

}, + $result->algorithm, + $result->query_name()); +} + + +=head2 introduction + + Title : introduction + Usage : $self->introduction($CODE) + + Function: Stores or returns the code to provide HTML for the given + BLAST report detailing the query and the + database information. + Useful for (for instance) specifying + routines returning alternative introductions. + Returns \&default_introduction (see below) if not + set. + Example : $index->introduction( \&my_introduction ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub introduction { + my( $self, $code ) = @_; + if ($code) { + $self->{'_introduction'} = $code; + } + return $self->{'_introduction'} || \&default_introduction; +} + +=head2 default_introduction + + Title : default_introduction + Usage : $self->default_introduction($result) + Function: Outputs HTML to provide the query + and the database information + Returns : string containing HTML + Args : First argument is a Bio::Search::Result::ResultI + Second argument is string holding literature citation + +=cut + +sub default_introduction { + my ($result) = @_; + + return sprintf( + qq{ + Query= %s %s
(%s letters)
+

+ Database: %s

%s sequences; %s total letters

+

+ }, + $result->query_name, + $result->query_description, + &_numwithcommas($result->query_length), + $result->database_name(), + &_numwithcommas($result->database_entries()), + &_numwithcommas($result->database_letters()), + ); +} + +=head2 end_report + + Title : end_report + Usage : $self->end_report() + Function: The method to call when ending a report, this is + mostly for cleanup for formats which require you to + have something at the end of the document () + for HTML + Returns : string + Args : none + +=cut + +sub end_report { + return "\n\n"; +} + +# copied from Bio::Index::Fasta +# useful here as well + +=head2 id_parser + + Title : id_parser + Usage : $index->id_parser( CODE ) + Function: Stores or returns the code used by record_id to + parse the ID for record from a string. Useful + for (for instance) specifying a different + parser for different flavours of FASTA file. + Returns \&default_id_parser (see below) if not + set. If you supply your own id_parser + subroutine, then it should expect a fasta + description line. An entry will be added to + the index for each string in the list returned. + Example : $index->id_parser( \&my_id_parser ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub id_parser { + my( $self, $code ) = @_; + + if ($code) { + $self->{'_id_parser'} = $code; + } + return $self->{'_id_parser'} || \&default_id_parser; +} + + + +=head2 default_id_parser + + Title : default_id_parser + Usage : $id = default_id_parser( $header ) + Function: The default Fasta ID parser for Fasta.pm + Returns $1 from applying the regexp /^>\s*(\S+)/ + to $header. + Returns : ID string + The default implementation checks for NCBI-style + identifiers in the given string ('gi|12345|AA54321'). + For these IDs, it extracts the GI and accession and + returns a two-element list of strings (GI, acc). + Args : a fasta header line string + +=cut + +sub default_id_parser { + my ($string) = @_; + my ($gi,$acc); + if( $string =~ s/gi\|(\d+)\|?// ) + { $gi = $1; $acc = $1;} + + if( $string =~ /(\w+)\|([A-Z\d\.\_]+)(\|[A-Z\d\_]+)?/ ) { + $acc = defined $2 ? $2 : $1; + } else { + $acc = $string; + $acc =~ s/^\s+(\S+)/$1/; + $acc =~ s/(\S+)\s+$/$1/; + } + return ($gi,$acc); +} + +sub MIN { $a <=> $b ? $a : $b; } +sub MAX { $a <=> $b ? $b : $a; } + +sub footer { + my ($self) = @_; + return "


Produced by Bioperl module ".ref($self)." on $DATE
Revision: $Revision
\n" + +} + +=head2 algorithm_reference + + Title : algorithm_reference + Usage : my $reference = $writer->algorithm_reference($result); + Function: Returns the appropriate Bibliographic reference for the + algorithm format being produced + Returns : String + Args : L to reference + + +=cut + +sub algorithm_reference { + my ($self,$result) = @_; + return '' if( ! defined $result || !ref($result) || + ! $result->isa('Bio::Search::Result::ResultI')) ; + if( $result->algorithm =~ /BLAST/i ) { + my $res = $result->algorithm . ' ' . $result->algorithm_version . "

"; + if( $result->algorithm_version =~ /WashU/i ) { + return $res . +"Copyright (C) 1996-2000 Washington University, Saint Louis, Missouri USA.
+All Rights Reserved.

+Reference: Gish, W. (1996-2000) http://blast.wustl.edu

"; + } else { + return $res . +"Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer,
+Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
+\"Gapped BLAST and PSI-BLAST: a new generation of protein database search
+programs\", Nucleic Acids Res. 25:3389-3402.

"; + + } + } elsif( $result->algorithm =~ /FAST/i ) { + return $result->algorithm . " " . $result->algorithm_version . "
" . + "\nReference: Pearson et al, Genomics (1997) 46:24-36

"; + } else { + return ''; + } +} + +# from Perl Cookbook 2.17 +sub _numwithcommas { + my $num = reverse( $_[0] ); + $num =~ s/(\d{3})(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $num; +} + +=head2 Methods Bio::SearchIO::SearchWriterI + +L inherited methods. + +=head2 filter + + Title : filter + Usage : $writer->filter('hsp', \&hsp_filter); + Function: Filter out either at HSP,Hit,or Result level + Returns : none + Args : string => data type, + CODE reference + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/Writer/HitTableWriter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/Writer/HitTableWriter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,305 @@ +# $Id: HitTableWriter.pm,v 1.14 2002/12/24 15:46:47 jason Exp $ + +=head1 NAME + +Bio::SearchIO::Writer::HitTableWriter - Tab-delimited data for Bio::Search::Hit::HitI objects + +=head1 SYNOPSIS + +=head2 Example 1: Using the default columns + + use Bio::SearchIO; + use Bio::SearchIO::Writer::HitTableWriter; + + my $in = Bio::SearchIO->new(); + + my $writer = Bio::SearchIO::Writer::HitTableWriter->new(); + + my $out = Bio::SearchIO->new( -writer => $writer ); + + while ( my $result = $in->next_result() ) { + $out->write_result($result, ($in->report_count - 1 ? 0 : 1) ); + } + +=head2 Example 2: Specifying a subset of columns + + use Bio::SearchIO; + use Bio::SearchIO::Writer::HitTableWriter; + + my $in = Bio::SearchIO->new(); + + my $writer = Bio::SearchIO::Writer::HitTableWriter->new( + -columns => [qw( + query_name + query_length + hit_name + hit_length + frac_identical_query + expect + )] ); + + my $out = Bio::SearchIO->new( -writer => $writer, + -file => ">searchio.out" ); + + while ( my $result = $in->next_result() ) { + $out->write_result($result, ($in->report_count - 1 ? 0 : 1) ); + } + +=head2 Custom Labels + +You can also specify different column labels if you don't want to use +the defaults. Do this by specifying a C<-labels> hash reference +parameter when creating the HitTableWriter object. The keys of the +hash should be the column number (left-most column = 1) for the label(s) +you want to specify. Here's an example: + + my $writer = Bio::SearchIO::Writer::HitTableWriter->new( + -columns => [qw( query_name + query_length + hit_name + hit_length )], + -labels => { 1 => 'QUERY_GI', + 3 => 'HIT_IDENTIFIER' } ); + + +=head1 DESCRIPTION + +Bio::SearchIO::Writer::HitTableWriter outputs summary data +for each Hit within a search result. Output is in tab-delimited format, +one row per Hit. + +The reason why this is considered summary data is that if a hit +contains multiple HSPs, the HSPs will be tiled and +the data represents a summary across all HSPs. +See below for which columns are affected. +See the docs in L + for more details on HSP tiling. + +=head2 Available Columns + +Here are the columns that can be specified in the C<-columns> +parameter when creating a HitTableWriter object. If a C<-columns> parameter +is not specified, this list, in this order, will be used as the default. + + query_name # Sequence identifier of the query. + query_length # Full length of the query sequence + hit_name # Sequence identifier of the hit + hit_length # Full length of the hit sequence + round # Round number for hit (PSI-BLAST) + expect # Expect value for the alignment + score # Score for the alignment (e.g., BLAST score) + bits # Bit score for the alignment + num_hsps # Number of HSPs (not the "N" value) + frac_identical_query* # fraction of identical substitutions in query + frac_identical_hit* # fraction of identical substitutions in hit + frac_conserved_query* # fraction of conserved substitutions in query + frac_conserved_hit* # fraction of conserved substitutions in hit + frac_aligned_query* # fraction of the query sequence that is aligned + frac_aligned_hit* # fraction of the hit sequence that is aligned + length_aln_query* # Length of the aligned portion of the query sequence + length_aln_hit* # Length of the aligned portion of the hit sequence + gaps_query* # Number of gaps in the aligned query sequence + gaps_hit* # Number of gaps in the aligned hit sequence + gaps_total* # Number of gaps in the aligned query and hit sequences + start_query* # Starting coordinate of the aligned portion of the query sequence + end_query* # Ending coordinate of the aligned portion of the query sequence + start_hit* # Starting coordinate of the aligned portion of the hit sequence + end_hit* # Ending coordinate of the aligned portion of the hit sequence + strand_query # Strand of the aligned query sequence + strand_hit # Strand of the aligned hit sequence + frame # Frame of the alignment (0,1,2) + ambiguous_aln # Ambiguous alignment indicator ('qs', 'q', 's') + hit_description # Full description of the hit sequence + query_description # Full description of the query sequence + +Items marked with a C<*> report data summed across all HSPs +after tiling them to avoid counting data from overlapping regions +multiple times. + +For more details about these columns, see the documentation for the +corresponding method in Bio::Search::Result::BlastHit. + +=head1 TODO + +Figure out the best way to incorporate algorithm-specific score columns. +The best route is probably to have algorithm-specific subclasses +(e.g., BlastHitTableWriter, FastaHitTableWriter). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports +and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001, 2002 Steve Chervitz. All Rights Reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 SEE ALSO + +L, +L + +=head1 METHODS + +=cut + +package Bio::SearchIO::Writer::HitTableWriter; + +use strict; +use Bio::SearchIO::Writer::ResultTableWriter; + +use vars qw( @ISA ); +@ISA = qw( Bio::SearchIO::Writer::ResultTableWriter ); + + +# Array fields: column, object, method[/argument], printf format, +# column label Methods for result object are defined in +# Bio::Search::Result::ResultI. Methods for hit object are defined in +# Bio::Search::Hit::HitI. Tech note: If a bogus method is supplied, +# it will result in all values to be zero. Don't know why this is. + +# TODO (maybe): Allow specification of separate mantissa/exponent for +# significance data. + +my %column_map = ( + 'query_name' => ['1', 'result', 'query_name', 's', 'QUERY' ], + 'query_length' => ['2', 'result', 'query_length', 'd', 'LEN_Q'], + 'hit_name' => ['3', 'hit', 'name', 's', 'HIT'], + 'hit_length' => ['4', 'hit', 'length', 'd', 'LEN_H'], + 'round' => ['5', 'hit', 'iteration', 'd', 'ROUND'], + 'expect' => ['6', 'hit', 'significance', '.1e', 'EXPCT'], + 'score' => ['7', 'hit', 'raw_score', 'd', 'SCORE'], + 'bits' => ['8', 'hit', 'bits', 'd', 'BITS'], + 'num_hsps' => ['9', 'hit', 'num_hsps', 'd', 'HSPS'], + 'frac_identical_query' => ['10', 'hit', 'frac_identical/query', '.2f', 'FR_IDQ'], + 'frac_identical_hit' => ['11', 'hit', 'frac_identical/hit', '.2f', 'FR_IDH'], + 'frac_conserved_query' => ['12', 'hit', 'frac_conserved/query', '.2f', 'FR_CNQ'], + 'frac_conserved_hit' => ['13', 'hit', 'frac_conserved/hit', '.2f', 'FR_CNH'], + 'frac_aligned_query' => ['14', 'hit', 'frac_aligned_query', '.2f', 'FR_ALQ'], + 'frac_aligned_hit' => ['15', 'hit', 'frac_aligned_hit', '.2f', 'FR_ALH'], + 'length_aln_query' => ['16', 'hit', 'length_aln/query', 'd', 'LN_ALQ'], + 'length_aln_hit' => ['17', 'hit', 'length_aln/hit', 'd', 'LN_ALH'], + 'gaps_query' => ['18', 'hit', 'gaps/query', 'd', 'GAPS_Q'], + 'gaps_hit' => ['19', 'hit', 'gaps/hit', 'd', 'GAPS_H'], + 'gaps_total' => ['20', 'hit', 'gaps/total', 'd', 'GAPS_QH'], + 'start_query' => ['21', 'hit', 'start/query', 'd', 'START_Q'], + 'end_query' => ['22', 'hit', 'end/query', 'd', 'END_Q'], + 'start_hit' => ['23', 'hit', 'start/hit', 'd', 'START_H'], + 'end_hit' => ['24', 'hit', 'end/hit', 'd', 'END_H'], + 'strand_query' => ['25', 'hit', 'strand/query', 's', 'STRND_Q'], + 'strand_hit' => ['26', 'hit', 'strand/hit', 's', 'STRND_H'], + 'frame' => ['27', 'hit', 'frame', 'd', 'FRAME'], + 'ambiguous_aln' => ['28', 'hit', 'ambiguous_aln', 's', 'AMBIG'], + 'hit_description' => ['29', 'hit', 'description', 's', 'DESC_H'], + 'query_description' => ['30', 'result', 'query_description', 's', 'DESC_Q'], + ); + +sub column_map { return %column_map } + + +=head2 to_string() + +Note: this method is not intended for direct use. The +SearchIO::write_result() method calls it automatically if the writer +is hooked up to a SearchIO object as illustrated in +L. + + Title : to_string() + : + Usage : print $writer->to_string( $result_obj, [$include_labels] ); + : + Argument : $result_obj = A Bio::Search::Result::BlastResult object + : $include_labels = boolean, if true column labels are included (default: false) + : + Returns : String containing tab-delimited set of data for each hit + : in a BlastResult object. Some data is summed across multiple HSPs. + : + Throws : n/a + +=cut + +#---------------- +sub to_string { +#---------------- + my ($self, $result, $include_labels) = @_; + + my $str = $include_labels ? $self->column_labels() : ''; + my $func_ref = $self->row_data_func; + my $printf_fmt = $self->printf_fmt; + + my ($resultfilter,$hitfilter) = ( $self->filter('RESULT'), + $self->filter('HIT') ); + if( ! defined $resultfilter || + &{$resultfilter}($result) ) { + $result->can('rewind') && + $result->rewind(); # insure we're at the beginning + foreach my $hit($result->hits) { + next if( defined $hitfilter && ! &{$hitfilter}($hit)); + my @row_data = map { defined $_ ? $_ : 0 } &{$func_ref}($result, $hit); + $str .= sprintf "$printf_fmt\n", @row_data; + } + } + $str =~ s/\t\n/\n/gs; + return $str; +} + +=head2 end_report + + Title : end_report + Usage : $self->end_report() + Function: The method to call when ending a report, this is + mostly for cleanup for formats which require you to + have something at the end of the document. Nothing for + a text message. + Returns : string + Args : none + +=cut + +sub end_report { + return ''; +} + + +=head2 filter + + Title : filter + Usage : $writer->filter('hsp', \&hsp_filter); + Function: Filter out either at HSP,Hit,or Result level + Returns : none + Args : string => data type, + CODE reference + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/Writer/ResultTableWriter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/Writer/ResultTableWriter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,458 @@ +# $Id: ResultTableWriter.pm,v 1.13 2002/12/05 13:46:35 heikki Exp $ + +=head1 NAME + +Bio::SearchIO::Writer::ResultTableWriter - Outputs tab-delimited data for each Bio::Search::Result::ResultI object. + +=head1 SYNOPSIS + +=head2 Example 1: Using the default columns + + use Bio::SearchIO; + use Bio::SearchIO::Writer::ResultTableWriter; + + my $in = Bio::SearchIO->new(); + + my $writer = Bio::SearchIO::Writer::ResultTableWriter->new(); + + my $out = Bio::SearchIO->new( -writer => $writer ); + + while ( my $result = $in->next_result() ) { + $out->write_result($result, ($in->report_count - 1 ? 0 : 1) ); + } + +=head2 Example 2: Specifying a subset of columns + + use Bio::SearchIO; + use Bio::SearchIO::Writer::ResultTableWriter; + + my $in = Bio::SearchIO->new(); + + my $writer = Bio::SearchIO::Writer::ResultTableWriter->new( + -columns => [qw( + query_name + query_length + )] ); + + my $out = Bio::SearchIO->new( -writer => $writer, + -file => ">result.out" ); + + while ( my $result = $in->next_result() ) { + $out->write_result($result, ($in->report_count - 1 ? 0 : 1) ); + } + +=head2 Custom Labels + +You can also specify different column labels if you don't want to use +the defaults. Do this by specifying a C<-labels> hash reference +parameter when creating the ResultTableWriter object. The keys of the +hash should be the column number (left-most column = 1) for the label(s) +you want to specify. Here's an example: + + my $writer = Bio::SearchIO::Writer::ResultTableWriter->new( + -columns => [qw( query_name + query_length + query_description )], + -labels => { 1 => 'QUERY_GI', + 2 => 'QUERY_LENGTH' } ); + + +=head1 DESCRIPTION + +Bio::SearchIO::Writer::ResultTableWriter outputs data in tab-delimited +format for each search result, one row per search result. This is a very +coarse-grain level of information since it only includes data +stored in the Bio::Search::Result::ResultI object itself and does not +include any information about hits or HSPs. + +You most likely will never use this object but instead will use one of +its subclasses: Bio::SearchIO::Writer::HitTableWriter or +Bio::SearchIO::Writer::HSPTableWriter. + +=head2 Available Columns + +Here are the columns that can be specified in the C<-columns> +parameter when creating a ResultTableWriter object. If a C<-columns> parameter +is not specified, this list, in this order, will be used as the default. + + query_name + query_length + query_description + +For more details about these columns, see the documentation for the +corresponding method in L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz Esac@bioperl.orgE + +See L for where to send bug reports +and comments. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 SEE ALSO + +L, +L + +=head1 METHODS + +=cut + + +package Bio::SearchIO::Writer::ResultTableWriter; + +use strict; +use Bio::Root::Root; +use Bio::SearchIO::SearchWriterI; + +use vars qw( @ISA ); +@ISA = qw( Bio::Root::Root Bio::SearchIO::SearchWriterI ); + +# Array fields: column, object, method[/argument], printf format, column label +# Methods are defined in Bio::Search::Result::ResultI. +# Tech note: If a bogus method is supplied, it will result in all values to be zero. +# Don't know why this is. +my %column_map = ( + 'query_name' => ['1', 'result', 'query_name', 's', 'QUERY' ], + 'query_length' => ['2', 'result', 'query_length', 'd', 'LEN_Q'], + 'query_description' => ['3', 'result', 'query_description', 's', 'DESC_Q'], + ); + +sub column_map { return %column_map } + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my( $col_spec, $label_spec, + $filters ) = $self->_rearrange( [qw(COLUMNS + LABELS + FILTERS)], @args); + + $self->_set_cols( $col_spec ); + $self->_set_labels( $label_spec ) if $label_spec; + $self->_set_printf_fmt(); + $self->_set_row_data_func(); + $self->_set_column_labels(); + + if( defined $filters ) { + if( !ref($filters) =~ /HASH/i ) { + $self->warn("Did not provide a hashref for the FILTERS option, ignoring."); + } else { + while( my ($type,$code) = each %{$filters} ) { + $self->filter($type,$code); + } + } + } + + + return $self; +} + + +# Purpose : Stores the column spec internally. Also performs QC on the +# user-supplied column specification. +# +sub _set_cols { + my ($self, $col_spec_ref) = @_; + return if defined $self->{'_cols'}; # only set columns once + + my %map = $self->column_map; + + if( not defined $col_spec_ref) { + print STDERR "\nUsing default column map.\n"; + $col_spec_ref = [ map { $_ } sort { $map{$a}->[0] <=> $map{$b}->[0] } keys %map ]; + } + + if( ref($col_spec_ref) eq 'ARRAY') { + # printf "%d columns to process\n", scalar(@$col_spec_ref); + my @col_spec = @{$col_spec_ref}; + while( my $item = lc(shift @col_spec) ) { + if( not defined ($map{$item}) ) { + $self->throw(-class =>'Bio::Root::BadParameter', + -text => "Unknown column name: $item" + ); + } + push @{$self->{'_cols'}}, $item; + #print "pushing on to col $col_num, $inner: $item\n"; + } + } + else { + $self->throw(-class =>'Bio::Root::BadParameter', + -text => "Can't set columns: not a ARRAY ref", + -value => $col_spec_ref + ); + } +} + +sub _set_printf_fmt { + my ($self) = @_; + + my @cols = $self->columns(); + my %map = $self->column_map; + + my $printf_fmt = ''; + + foreach my $col ( @cols ) { + $printf_fmt .= "\%$map{$col}->[3]\t"; + } + + $printf_fmt =~ s/\\t$//; + + $self->{'_printf_fmt'} = $printf_fmt; +} + +sub printf_fmt { shift->{'_printf_fmt'} } + +# Sets the data to be used for the labels. +sub _set_labels { + my ($self, $label_spec) = @_; + if( ref($label_spec) eq 'HASH') { + foreach my $col ( sort { $a <=> $b } keys %$label_spec ) { +# print "LABEL: $col $label_spec->{$col}\n"; + $self->{'_custom_labels'}->{$col} = $label_spec->{$col}; + } + } + else { + $self->throw(-class =>'Bio::Root::BadParameter', + -text => "Can't set labels: not a HASH ref: $label_spec" + ); + } +} + +sub _set_column_labels { + my $self = shift; + + my @cols = $self->columns; + my %map = $self->column_map; + my $printf_fmt = ''; + my (@data, $label, @underbars); + + my $i = 0; + foreach my $col( @cols ) { + $i++; + $printf_fmt .= "\%s\t"; + + if(defined $self->{'_custom_labels'}->{$i}) { + $label = $self->{'_custom_labels'}->{$i}; + } + else { + $label = $map{$col}->[4]; + } + push @data, $label; + push @underbars, '-' x length($label); + + } + $printf_fmt =~ s/\\t$//; + + my $str = sprintf "$printf_fmt\n", @data; + + $str =~ s/\t\n/\n/; + $str .= sprintf "$printf_fmt\n", @underbars; + + $str =~ s/\t\n/\n/gs; + $self->{'_column_labels'} = $str; +} + +# Purpose : Generate a function that will call the appropriate +# methods on the result, hit, and hsp objects to retrieve the column data +# specified in the column spec. +# +# We should only have to go through the column spec once +# for a given ResultTableWriter. To permit this, we'll generate code +# for a method that returns an array of the data for a row of output +# given a result, hit, and hsp object as arguments. +# +sub _set_row_data_func { + my $self = shift; + + # Now we need to generate a string that can be eval'd to get the data. + my @cols = $self->columns(); + my %map = $self->column_map; + my @data; + while( my $col = shift @cols ) { + my $object = $map{$col}->[1]; + my $method = $map{$col}->[2]; + my $arg = ''; + if( $method =~ m!(\w+)/(\w+)! ) { + $method = $1; + $arg = "\"$2\""; + } + push @data, "\$$object->$method($arg)"; + } + my $code = join( ",", @data); + + if( $self->verbose > 0 ) { +## Begin Debugging + $self->debug( "Data to print:\n"); + foreach( 0..$#data) { $self->debug( " [". ($_+ 1) . "] $data[$_]\n");} + $self->debug( "CODE:\n$code\n"); + $self->debug("Printf format: ". $self->printf_fmt. "\n"); +## End Debugging + } + + my $func = sub { + my ($result, $hit, $hsp) = @_; + my @r = eval $code; + # This should reduce the occurrence of those opaque "all zeros" bugs. + if( $@ ) { $self->throw("Trouble in ResultTableWriter::_set_row_data_func() eval: $@\n\n"); + } + return @r; + }; + $self->{'_row_data_func'} = $func; +} + +sub row_data_func { shift->{'_row_data_func'} } + + +=head2 to_string() + +Note: this method is not intended for direct use. The +SearchIO::write_result() method calls it automatically if the writer +is hooked up to a SearchIO object as illustrated in L. + + Title : to_string() + : + Usage : print $writer->to_string( $result_obj, [$include_labels] ); + : + Argument : $result_obj = A Bio::Search::Result::ResultI object + : $include_labels = boolean, if true column labels are included (default: false) + : + Returns : String containing tab-delimited set of data for each hit + : in a ResultI object. Some data is summed across multiple HSPs. + : + Throws : n/a + +=cut + +#---------------- +sub to_string { +#---------------- + my ($self, $result, $include_labels) = @_; + + my $str = $include_labels ? $self->column_labels() : ''; + my $resultfilter = $self->filter('RESULT'); + if( ! defined $resultfilter || + &{$resultfilter}($result) ) { + my @row_data = &{$self->{'_row_data_func'}}( $result ); + $str .= sprintf "$self->{'_printf_fmt'}\n", @row_data; + $str =~ s/\t\n/\n/gs; + } + return $str; +} + + + +sub columns { + my $self = shift; + my @cols; + if( ref $self->{'_cols'} ) { + @cols = @{$self->{'_cols'}}; + } + else { + my %map = $self->column_map; + @cols = sort { $map{$a}->[0] <=> $map{$b}->[0] } keys %map; + } + return @cols; +} + + +=head2 column_labels + + Usage : print $result_obj->column_labels(); + Purpose : Get column labels for to_string(). + Returns : String containing column labels. Tab-delimited. + Argument : n/a + Throws : n/a + +=cut + +sub column_labels { shift->{'_column_labels'} } + +=head2 end_report + + Title : end_report + Usage : $self->end_report() + Function: The method to call when ending a report, this is + mostly for cleanup for formats which require you to + have something at the end of the document. Nothing for + a text message. + Returns : string + Args : none + +=cut + +sub end_report { + return ''; +} + +=head2 filter + + Title : filter + Usage : $writer->filter('hsp', \&hsp_filter); + Function: Filter out either at HSP,Hit,or Result level + Returns : none + Args : string => data type, + CODE reference + + +=cut + + +# Is this really needed? +#=head2 signif_format +# +# Usage : $writer->signif_format( [FMT] ); +# Purpose : Allows retrieval of the P/Expect exponent values only +# : or as a two-element list (mantissa, exponent). +# Usage : $writer->signif_format('exp'); +# : $writer->signif_format('parts'); +# Returns : String or '' if not set. +# Argument : String, FMT = 'exp' (return the exponent only) +# : = 'parts'(return exponent + mantissa in 2-elem list) +# : = undefined (return the raw value) +# Comments : P/Expect values are still stored internally as the full, +# : scientific notation value. +# +#=cut +# +##------------- +#sub signif_format { +##------------- +# my $self = shift; +# if(@_) { $self->{'_signif_format'} = shift; } +# return $self->{'_signif_format'}; +#} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/Writer/TextResultWriter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/Writer/TextResultWriter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,765 @@ +# $Id: TextResultWriter.pm,v 1.5.2.5 2003/09/15 16:19:24 jason Exp $ +# +# BioPerl module for Bio::SearchIO::Writer::TextResultWriter +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::Writer::TextResultWriter - Object to implement writing a Bio::Search::ResultI in Text. + +=head1 SYNOPSIS + + use Bio::SearchIO; + use Bio::SearchIO::Writer::TextResultWriter; + + my $in = new Bio::SearchIO(-format => 'blast', + -file => shift @ARGV); + + my $writer = new Bio::SearchIO::Writer::TextResultWriter(); + my $out = new Bio::SearchIO(-writer => $writer); + $out->write_result($in->next_result); + +=head1 DESCRIPTION + +This object implements the SearchWriterI interface which will produce +a set of Text for a specific Bio::Search::Report::ReportI interface. + +You can also provide the argument -filters => \%hash to filter the at +the hsp, hit, or result level. %hash is an associative array which +contains any or all of the keys (HSP, HIT, RESULT). The values +pointed to by these keys would be references to a subroutine which +expects to be passed an object - one of Bio::Search::HSP::HSPI, +Bio::Search::Hit::HitI, and Bio::Search::Result::ResultI respectively. +Each function needs to return a boolean value as to whether or not the +passed element should be included in the output report - true if it is to be included, false if it to be omitted. + +For example to filter on sequences in the database which are too short +for your criteria you would do the following. + +Define a hit filter method + + sub hit_filter { + my $hit = shift; + return $hit->length E 100; # test if length of the hit sequence + # long enough + } + my $writer = new Bio::SearchIO::Writer::TextResultWriter( + -filters => { 'HIT' =E \&hit_filter } + ); + +Another example would be to filter HSPs on percent identity, let's +only include HSPs which are 75% identical or better. + + sub hsp_filter { + my $hsp = shift; + return $hsp->percent_identity E 75; + } + my $writer = new Bio::SearchIO::Writer::TextResultWriter( + -filters => { 'HSP' =E \&hsp_filter } + ); + +See L for more info on the filter method. + + +This module will use the module Text::Wrap if it is installed to wrap +the Query description line. If you do not have Text::Wrap installed +this module will work fine but you won't have the Query line wrapped. +You will see a warning about this when you first instantiate a +TextResultWriter - to avoid these warnings from showing up, simply set +the verbosity upon initialization to -1 like this: my $writer = new +Bio::SearchIO::Writer::TextResultWriter(-verbose => -1); + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::Writer::TextResultWriter; +use vars qw(@ISA $MaxNameLen $MaxDescLen $AlignmentLineWidth + $DescLineLen $TextWrapLoaded); +use strict; + +# Object preamble - inherits from Bio::Root::RootI + +BEGIN { + $MaxDescLen = 65; + $AlignmentLineWidth = 60; + eval { require Text::Wrap; $TextWrapLoaded = 1;}; + if( $@ ) { + $TextWrapLoaded = 0; + } +} + +use Bio::Root::Root; +use Bio::SearchIO::SearchWriterI; +use POSIX; + +@ISA = qw(Bio::Root::Root Bio::SearchIO::SearchWriterI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::Writer::TextResultWriter(); + Function: Builds a new Bio::SearchIO::Writer::TextResultWriter object + Returns : Bio::SearchIO::Writer::TextResultWriter + Args : -filters => hashref with any or all of the keys (HSP HIT RESULT) + which have values pointing to a subroutine reference + which will expect to get a + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($filters) = $self->_rearrange([qw(FILTERS)],@args); + if( defined $filters ) { + if( !ref($filters) =~ /HASH/i ) { + $self->warn("Did not provide a hashref for the FILTERS option, ignoring."); + } else { + while( my ($type,$code) = each %{$filters} ) { + $self->filter($type,$code); + } + } + } + unless( $TextWrapLoaded ) { + $self->warn("Could not load Text::Wrap - the Query Description will not be line wrapped\n"); + } else { + $Text::Wrap::columns = $MaxDescLen; + } + return $self; +} + + +=head2 to_string + + Purpose : Produces data for each Search::Result::ResultI in a string. + : This is an abstract method. For some useful implementations, + : see ResultTableWriter.pm, HitTableWriter.pm, + : and HSPTableWriter.pm. + Usage : print $writer->to_string( $result_obj, @args ); + Argument : $result_obj = A Bio::Search::Result::ResultI object + : @args = any additional arguments used by your implementation. + Returns : String containing data for each search Result or any of its + : sub-objects (Hits and HSPs). + Throws : n/a + +=cut + +sub to_string { + my ($self,$result,$num) = @_; + $num ||= 0; + return unless defined $result; + my ($resultfilter,$hitfilter, $hspfilter) = ( $self->filter('RESULT'), + $self->filter('HIT'), + $self->filter('HSP') ); + return '' if( defined $resultfilter && ! &{$resultfilter}($result) ); + + my ($qtype,$dbtype,$dbseqtype,$type); + my $alg = $result->algorithm; + + # This is actually wrong for the FASTAs I think + if( $alg =~ /T(FAST|BLAST)([XY])/i ) { + $qtype = $dbtype = 'translated'; + $dbseqtype = $type = 'PROTEIN'; + } elsif( $alg =~ /T(FAST|BLAST)N/i ) { + $qtype = ''; + $dbtype = 'translated'; + $type = 'PROTEIN'; + $dbseqtype = 'NUCLEOTIDE'; + } elsif( $alg =~ /(FAST|BLAST)N/i || + $alg =~ /(WABA|EXONERATE)/i ) { + $qtype = $dbtype = ''; + $type = $dbseqtype = 'NUCLEOTIDE'; + } elsif( $alg =~ /(FAST|BLAST)P/ || $alg =~ /SSEARCH/i ) { + $qtype = $dbtype = ''; + $type = $dbseqtype = 'PROTEIN'; + } elsif( $alg =~ /(FAST|BLAST)[XY]/i ) { + $qtype = 'translated'; + $dbtype = 'PROTEIN'; + $dbseqtype = $type = 'PROTEIN'; + } else { + print STDERR "algorithm was ", $result->algorithm, " couldn't match\n"; + } + + + my %baselens = ( 'Sbjct:' => ( $dbtype eq 'translated' ) ? 3 : 1, + 'Query:' => ( $qtype eq 'translated' ) ? 3 : 1); + + my $str; + if( ! defined $num || $num <= 1 ) { + $str = &{$self->start_report}($result); + } + + $str .= &{$self->title}($result); + + $str .= $result->algorithm_reference || $self->algorithm_reference($result); + $str .= &{$self->introduction}($result); + + + $str .= qq{ + Score E +Sequences producing significant alignments: (bits) value +}; + my $hspstr = ''; + if( $result->can('rewind')) { + $result->rewind(); # support stream based parsing routines + } + while( my $hit = $result->next_hit ) { + next if( defined $hitfilter && ! &{$hitfilter}($hit) ); + my $nm = $hit->name(); + $self->debug( "no $nm for name (".$hit->description(). "\n") + unless $nm; + my ($gi,$acc) = &{$self->id_parser}($nm); + my $p = "%-$MaxDescLen". "s"; + my $descsub; + my $desc = sprintf("%s %s",$nm,$hit->description); + if( length($desc) - 3 > $MaxDescLen) { + $descsub = sprintf($p, + substr($desc,0,$MaxDescLen-3) . + "..."); + } else { + $descsub = sprintf($p,$desc); + } + + $str .= sprintf("%s %-4s %s\n", + $descsub, + defined $hit->raw_score ? $hit->raw_score : ' ', + defined $hit->significance ? $hit->significance : '?'); + my @hsps = $hit->hsps; + + $hspstr .= sprintf(">%s %s\n%9sLength = %d\n\n", + $hit->name, + defined $hit->description ? $hit->description : '', + '', # empty is for the %9s in the str formatting + $hit->length); + + foreach my $hsp ( @hsps ) { + next if( defined $hspfilter && ! &{$hspfilter}($hsp) ); + $hspstr .= sprintf(" Score = %4s bits (%s), Expect = %s", + $hsp->bits, $hsp->score, $hsp->evalue); + if( $hsp->pvalue ) { + $hspstr .= ", P = ".$hsp->pvalue; + } + $hspstr .= "\n"; + $hspstr .= sprintf(" Identities = %d/%d (%d%%)", + ( $hsp->frac_identical('total') * + $hsp->length('total')), + $hsp->length('total'), + POSIX::floor($hsp->frac_identical('total') + * 100)); + + if( $type eq 'PROTEIN' ) { + $hspstr .= sprintf(", Positives = %d/%d (%d%%)", + ( $hsp->frac_conserved('total') * + $hsp->length('total')), + $hsp->length('total'), + POSIX::floor($hsp->frac_conserved('total') * 100)); + + } + if( $hsp->gaps ) { + $hspstr .= sprintf(", Gaps = %d/%d (%d%%)", + $hsp->gaps('total'), + $hsp->length('total'), + POSIX::floor(100 * $hsp->gaps('total') / + $hsp->length('total'))); + } + $hspstr .= "\n"; + my ($hframe,$qframe) = ( $hsp->hit->frame, + $hsp->query->frame); + my ($hstrand,$qstrand) = ($hsp->hit->strand,$hsp->query->strand); + # so TBLASTX will have Query/Hit frames + # BLASTX will have Query frame + # TBLASTN will have Hit frame + if( $hstrand || $qstrand ) { + $hspstr .= " Frame = "; + my ($signq, $signh); + unless( $hstrand ) { + $hframe = undef; + # if strand is null or 0 then it is protein + # and this no frame + } else { + $signh = $hstrand < 0 ? '-' : '+'; + } + unless( $qstrand ) { + $qframe = undef; + # if strand is null or 0 then it is protein + } else { + $signq =$qstrand < 0 ? '-' : '+'; + } + # remember bioperl stores frames as 0,1,2 (GFF way) + # BLAST reports reports as 1,2,3 so + # we have to add 1 to the frame values + if( defined $hframe && ! defined $qframe) { + $hspstr .= "$signh".($hframe+1); + } elsif( defined $qframe && ! defined $hframe) { + $hspstr .= "$signq".($qframe+1); + } else { + $hspstr .= sprintf(" %s%d / %s%d", + $signq,$qframe+1, + $signh, $hframe+1); + } + } + $hspstr .= "\n\n"; + + my @hspvals = ( {'name' => 'Query:', + 'seq' => $hsp->query_string, + 'start' => ( $hstrand >= 0 ? + $hsp->query->start : + $hsp->query->end), + 'end' => ($qstrand >= 0 ? + $hsp->query->end : + $hsp->query->start), + 'index' => 0, + 'direction' => $qstrand || 1 + }, + { 'name' => ' 'x6, # this might need to adjust for long coordinates?? + 'seq' => $hsp->homology_string, + 'start' => undef, + 'end' => undef, + 'index' => 0, + 'direction' => 1 + }, + { 'name' => 'Sbjct:', + 'seq' => $hsp->hit_string, + 'start' => ($hstrand >= 0 ? + $hsp->hit->start : $hsp->hit->end), + 'end' => ($hstrand >= 0 ? + $hsp->hit->end : $hsp->hit->start), + 'index' => 0, + 'direction' => $hstrand || 1 + } + ); + + + # let's set the expected length (in chars) of the starting number + # in an alignment block so we can have things line up + # Just going to try and set to the largest + + my ($numwidth) = sort { $b <=> $a }(length($hspvals[0]->{'start'}), + length($hspvals[0]->{'end'}), + length($hspvals[2]->{'start'}), + length($hspvals[2]->{'end'})); + my $count = 0; + while ( $count <= $hsp->length('total') ) { + foreach my $v ( @hspvals ) { + my $piece = substr($v->{'seq'}, $v->{'index'} +$count, + $AlignmentLineWidth); + my $cp = $piece; + my $plen = scalar ( $cp =~ tr/\-//); + my ($start,$end) = ('',''); + if( defined $v->{'start'} ) { + $start = $v->{'start'}; + # since strand can be + or - use the direction + # to signify which whether to add or substract from end + my $d = $v->{'direction'} * ( $AlignmentLineWidth - $plen )* + $baselens{$v->{'name'}}; + if( length($piece) < $AlignmentLineWidth ) { + $d = (length($piece) - $plen) * $v->{'direction'} * + $baselens{$v->{'name'}}; + } + $end = $v->{'start'} + $d - $v->{'direction'}; + $v->{'start'} += $d; + } + $hspstr .= sprintf("%s %-".$numwidth."s %s %s\n", + $v->{'name'}, + $start, + $piece, + $end + ); + } + $count += $AlignmentLineWidth; + $hspstr .= "\n"; + } + } + $hspstr .= "\n"; + } + $str .= "\n\n".$hspstr; + + $str .= sprintf(qq{ Database: %s + Posted date: %s + Number of letters in database: %s + Number of sequences in database: %s + +Matrix: %s +}, + $result->database_name(), + $result->get_statistic('posted_date') || + POSIX::strftime("%b %d, %Y %I:%M %p",localtime), + &_numwithcommas($result->database_entries()), + &_numwithcommas($result->database_letters()), + $result->get_parameter('matrix') || ''); + + if( defined (my $open = $result->get_parameter('gapopen')) ) { + $str .= sprintf("Gap Penalties Existence: %d, Extension: %d\n", + $open || 0, $result->get_parameter('gapext') || 0); + } + + # skip those params we've already output + foreach my $param ( grep { ! /matrix|gapopen|gapext/i } + $result->available_parameters ) { + $str .= "$param: ". $result->get_parameter($param) ."\n"; + + } + $str .= "Search Statistics\n"; + # skip posted date, we already output it + foreach my $stat ( sort grep { ! /posted_date/ } + $result->available_statistics ) { + my $expect = $result->get_parameter('expect'); + my $v = $result->get_statistic($stat); + if( $v =~ /^\d+$/ ) { + $v = &_numwithcommas($v); + } + if( defined $expect && + $stat eq 'seqs_better_than_cutoff' ) { + $str .= "seqs_better_than_$expect: $v\n"; + } else { + my $v = + $str .= "$stat: $v\n"; + } + } + $str .= "\n\n"; + return $str; +} + + +=head2 start_report + + Title : start_report + Usage : $index->start_report( CODE ) + Function: Stores or returns the code to + write the start of the block, the block + and the start of the <BODY> block of HTML. Useful + for (for instance) specifying alternative + HTML if you are embedding the output in + an HTML page which you have already started. + (For example a routine returning a null string). + Returns \&default_start_report (see below) if not + set. + Example : $index->start_report( \&my_start_report ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub start_report { + my( $self, $code ) = @_; + if ($code) { + $self->{'_start_report'} = $code; + } + return $self->{'_start_report'} || \&default_start_report; +} + +=head2 default_start_report + + Title : default_start_report + Usage : $self->default_start_report($result) + Function: The default method to call when starting a report. + Returns : sting + Args : First argument is a Bio::Search::Result::ResultI + +=cut + +sub default_start_report { + my ($result) = @_; + return ""; +} + +=head2 title + + Title : title + Usage : $self->title($CODE) + + Function: Stores or returns the code to provide HTML for the given + BLAST report that will appear at the top of the BLAST report + HTML output. Useful for (for instance) specifying + alternative routines to write your own titles. + Returns \&default_title (see below) if not + set. + Example : $index->title( \&my_title ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub title { + my( $self, $code ) = @_; + if ($code) { + $self->{'_title'} = $code; + } + return $self->{'_title'} || \&default_title; +} + +=head2 default_title + + Title : default_title + Usage : $self->default_title($result) + Function: Provides HTML for the given BLAST report that will appear + at the top of the BLAST report output. + Returns : empty for text implementation + Args : First argument is a Bio::Search::Result::ResultI + +=cut + +sub default_title { + my ($result) = @_; + return ""; +# The HTML implementation +# return sprintf( +# qq{<CENTER><H1><a href="http://bioperl.org">Bioperl</a> Reformatted HTML of %s Search Report<br> for %s</H1></CENTER>}, +# $result->algorithm, +# $result->query_name()); +} + + +=head2 introduction + + Title : introduction + Usage : $self->introduction($CODE) + + Function: Stores or returns the code to provide HTML for the given + BLAST report detailing the query and the + database information. + Useful for (for instance) specifying + routines returning alternative introductions. + Returns \&default_introduction (see below) if not + set. + Example : $index->introduction( \&my_introduction ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub introduction { + my( $self, $code ) = @_; + if ($code) { + $self->{'_introduction'} = $code; + } + return $self->{'_introduction'} || \&default_introduction; +} + +=head2 default_introduction + + Title : default_introduction + Usage : $self->default_introduction($result) + Function: Outputs HTML to provide the query + and the database information + Returns : string containing HTML + Args : First argument is a Bio::Search::Result::ResultI + Second argument is string holding literature citation + +=cut + +sub default_introduction { + my ($result) = @_; + + return sprintf( + qq{ +Query= %s + (%s letters) + +Database: %s + %s sequences; %s total letters +}, + &_linewrap($result->query_name . " " . + $result->query_description), + &_numwithcommas($result->query_length), + $result->database_name(), + &_numwithcommas($result->database_entries()), + &_numwithcommas($result->database_letters()), + ); +} + +=head2 end_report + + Title : end_report + Usage : $self->end_report() + Function: The method to call when ending a report, this is + mostly for cleanup for formats which require you to + have something at the end of the document (</BODY></HTML>) + for HTML + Returns : string + Args : none + +=cut + +sub end_report { + return ""; +} + + +# copied from Bio::Index::Fasta +# useful here as well + +=head2 id_parser + + Title : id_parser + Usage : $index->id_parser( CODE ) + Function: Stores or returns the code used by record_id to + parse the ID for record from a string. Useful + for (for instance) specifying a different + parser for different flavours of FASTA file. + Returns \&default_id_parser (see below) if not + set. If you supply your own id_parser + subroutine, then it should expect a fasta + description line. An entry will be added to + the index for each string in the list returned. + Example : $index->id_parser( \&my_id_parser ) + Returns : ref to CODE if called without arguments + Args : CODE + +=cut + +sub id_parser { + my( $self, $code ) = @_; + + if ($code) { + $self->{'_id_parser'} = $code; + } + return $self->{'_id_parser'} || \&default_id_parser; +} + + + +=head2 default_id_parser + + Title : default_id_parser + Usage : $id = default_id_parser( $header ) + Function: The default Fasta ID parser for Fasta.pm + Returns $1 from applying the regexp /^>\s*(\S+)/ + to $header. + Returns : ID string + Args : a fasta header line string + +=cut + +sub default_id_parser { + my ($string) = @_; + my ($gi,$acc); + if( $string =~ s/gi\|(\d+)\|?// ) + { $gi = $1; $acc = $1;} + + if( $string =~ /(\w+)\|([A-Z\d\.\_]+)(\|[A-Z\d\_]+)?/ ) { + $acc = defined $2 ? $2 : $1; + } else { + $acc = $string; + $acc =~ s/^\s+(\S+)/$1/; + $acc =~ s/(\S+)\s+$/$1/; + } + return ($gi,$acc); +} + +sub MIN { $a <=> $b ? $a : $b; } +sub MAX { $a <=> $b ? $b : $a; } + + +=head2 algorithm_reference + + Title : algorithm_reference + Usage : my $reference = $writer->algorithm_reference($result); + Function: Returns the appropriate Bibliographic reference for the + algorithm format being produced + Returns : String + Args : L<Bio::Search::Result::ResultI> to reference + + +=cut + +sub algorithm_reference{ + my ($self,$result) = @_; + return '' if( ! defined $result || !ref($result) || + ! $result->isa('Bio::Search::Result::ResultI')) ; + if( $result->algorithm =~ /BLAST/i ) { + my $res = $result->algorithm . ' '. $result->algorithm_version. "\n"; + if( $result->algorithm_version =~ /WashU/i ) { + return $res .qq{ +Copyright (C) 1996-2000 Washington University, Saint Louis, Missouri USA. +All Rights Reserved. + +Reference: Gish, W. (1996-2000) http://blast.wustl.edu +}; + } else { + return $res . qq{ +Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, +Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), +"Gapped BLAST and PSI-BLAST: a new generation of protein database search +programs", Nucleic Acids Res. 25:3389-3402. +}; + } + } elsif( $result->algorithm =~ /FAST/i ) { + return $result->algorithm. " ". $result->algorithm_version . "\n". + "\nReference: Pearson et al, Genomics (1997) 46:24-36\n"; + } else { + return ''; + } +} + +# from Perl Cookbook 2.17 +sub _numwithcommas { + my $num = reverse( $_[0] ); + $num =~ s/(\d{3})(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $num; +} + +sub _linewrap { + my ($str) = @_; + if($TextWrapLoaded) { + return Text::Wrap::wrap("","",$str); # use Text::Wrap + } else { return $str; } # cannot wrap +} +=head2 Methods Bio::SearchIO::SearchWriterI + +L<Bio::SearchIO::SearchWriterI> inherited methods. + +=head2 filter + + Title : filter + Usage : $writer->filter('hsp', \&hsp_filter); + Function: Filter out either at HSP,Hit,or Result level + Returns : none + Args : string => data type, + CODE reference + + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/blast.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/blast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,988 @@ +# $Id: blast.pm,v 1.42.2.14 2003/09/15 16:19:01 jason Exp $ +# +# BioPerl module for Bio::SearchIO::blast +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::blast - Event generator for event based parsing of blast reports + +=head1 SYNOPSIS + + # Do not use this object directly - it is used as part of the + # Bio::SearchIO system. + + use Bio::SearchIO; + my $searchio = new Bio::SearchIO(-format => 'blast', + -file => 't/data/ecolitst.bls'); + while( my $result = $searchio->next_result ) { + while( my $hit = $result->next_hit ) { + while( my $hsp = $hit->next_hsp ) { + # ... + } + } + } + +=head1 DESCRIPTION + +This object encapsulated the necessary methods for generating events +suitable for building Bio::Search objects from a BLAST report file. +Read the L<Bio::SearchIO> for more information about how to use this. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + + +package Bio::SearchIO::blast; +use strict; +use vars qw(@ISA %MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS); +use Bio::SearchIO; + +@ISA = qw(Bio::SearchIO ); + +BEGIN { + # mapping of NCBI Blast terms to Bioperl hash keys + %MODEMAP = ('BlastOutput' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); + + # This should really be done more intelligently, like with + # XSLT + + %MAPPING = + ( + 'Hsp_bit-score' => 'HSP-bits', + 'Hsp_score' => 'HSP-score', + 'Hsp_evalue' => 'HSP-evalue', + 'Hsp_pvalue' => 'HSP-pvalue', + 'Hsp_query-from' => 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_positive' => 'HSP-conserved', + 'Hsp_identity' => 'HSP-identical', + 'Hsp_gaps' => 'HSP-hsp_gaps', + 'Hsp_hitgaps' => 'HSP-hit_gaps', + 'Hsp_querygaps' => 'HSP-query_gaps', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_align-len' => 'HSP-hsp_length', + 'Hsp_query-frame'=> 'HSP-query_frame', + 'Hsp_hit-frame' => 'HSP-hit_frame', + + 'Hit_id' => 'HIT-name', + 'Hit_len' => 'HIT-length', + 'Hit_accession' => 'HIT-accession', + 'Hit_def' => 'HIT-description', + 'Hit_signif' => 'HIT-significance', + 'Hit_score' => 'HIT-score', + 'Iteration_iter-num' => 'HIT-iteration', + + 'BlastOutput_program' => 'RESULT-algorithm_name', + 'BlastOutput_version' => 'RESULT-algorithm_version', + 'BlastOutput_query-def'=> 'RESULT-query_name', + 'BlastOutput_query-len'=> 'RESULT-query_length', + 'BlastOutput_query-acc'=> 'RESULT-query_accession', + 'BlastOutput_querydesc'=> 'RESULT-query_description', + 'BlastOutput_db' => 'RESULT-database_name', + 'BlastOutput_db-len' => 'RESULT-database_entries', + 'BlastOutput_db-let' => 'RESULT-database_letters', + + 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'}, + 'Parameters_expect' => { 'RESULT-parameters' => 'expect'}, + 'Parameters_include' => { 'RESULT-parameters' => 'include'}, + 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'}, + 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'}, + 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'}, + 'Parameters_gap-extend'=> { 'RESULT-parameters' => 'gapext'}, + 'Parameters_filter' => {'RESULT-parameters' => 'filter'}, + 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps'}, + + 'Statistics_db-len' => {'RESULT-statistics' => 'dbentries'}, + 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters'}, + 'Statistics_hsp-len' => { 'RESULT-statistics' => 'effective_hsplength'}, + 'Statistics_query-len' => { 'RESULT-statistics' => 'querylength'}, + 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'}, + 'Statistics_eff-spaceused' => { 'RESULT-statistics' => 'effectivespaceused'}, + 'Statistics_eff-dblen' => { 'RESULT-statistics' => 'effectivedblength'}, + 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' }, + 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' }, + 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'}, + 'Statistics_framewindow'=> { 'RESULT-statistics' => 'frameshiftwindow'}, + 'Statistics_decay'=> { 'RESULT-statistics' => 'decayconst'}, + + 'Statistics_T'=> { 'RESULT-statistics' => 'T'}, + 'Statistics_A'=> { 'RESULT-statistics' => 'A'}, + 'Statistics_X1'=> { 'RESULT-statistics' => 'X1'}, + 'Statistics_X2'=> { 'RESULT-statistics' => 'X2'}, + 'Statistics_S1'=> { 'RESULT-statistics' => 'S1'}, + 'Statistics_S2'=> { 'RESULT-statistics' => 'S2'}, + 'Statistics_hit_to_db' => { 'RESULT-statistics' => 'Hits_to_DB'}, + 'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'}, + 'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'}, + 'Statistics_num_suc_extensions' => { 'RESULT-statistics' => 'num_successful_extensions'}, + 'Statistics_seqs_better_than_cutoff' => { 'RESULT-statistics' => 'seqs_better_than_cutoff'}, + 'Statistics_posted_date' => { 'RESULT-statistics' => 'posted_date'}, + + # WU-BLAST stats + 'Statistics_DFA_states'=> { 'RESULT-statistics' => 'num_dfa_states'}, + 'Statistics_DFA_size'=> { 'RESULT-statistics' => 'dfa_size'}, + + 'Statistics_search_cputime' => { 'RESULT-statistics' => 'search_cputime'}, + 'Statistics_total_cputime' => { 'RESULT-statistics' => 'total_cputime'}, + 'Statistics_search_actualtime' => { 'RESULT-statistics' => 'search_actualtime'}, + 'Statistics_total_actualtime' => { 'RESULT-statistics' => 'total_actualtime'}, + + 'Statistics_noprocessors' => { 'RESULT-statistics' => 'no_of_processors'}, + 'Statistics_neighbortime' => { 'RESULT-statistics' => 'neighborhood_generate_time'}, + 'Statistics_starttime' => { 'RESULT-statistics' => 'start_time'}, + 'Statistics_endtime' => { 'RESULT-statistics' => 'end_time'}, + ); + + $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; +} + + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::blast(); + Function: Builds a new Bio::SearchIO::blast object + Returns : Bio::SearchIO::blast + Args : -fh/-file => filehandle/filename to BLAST file + -format => 'blast' + +=cut + +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result{ + my ($self) = @_; + + my $data = ''; + my $seentop = 0; + my ($reporttype,$seenquery,$reportline); + $self->start_document(); + my @hit_signifs; + + while( defined ($_ = $self->_readline )) { + next if( /^\s+$/); # skip empty lines + next if( /CPU time:/); + next if( /^>\s*$/); + + if( /^([T]?BLAST[NPX])\s*(.+)$/i || + /^(PSITBLASTN)\s+(.+)$/i || + /^(RPS-BLAST)\s*(.+)$/i || + /^(MEGABLAST)\s*(.+)$/i + ) { + if( $seentop ) { + $self->_pushback($_); + $self->in_element('hsp') && + $self->end_element({ 'Name' => 'Hsp'}); + $self->in_element('hit') && + $self->end_element({ 'Name' => 'Hit'}); + $self->end_element({ 'Name' => 'BlastOutput'}); + return $self->end_document(); + } + $self->start_element({ 'Name' => 'BlastOutput' } ); + $self->{'_result_count'}++; + $seentop = 1; + $reporttype = $1; + $reportline = $_; # to fix the fact that RPS-BLAST output is wrong + $self->element({ 'Name' => 'BlastOutput_program', + 'Data' => $reporttype}); + + $self->element({ 'Name' => 'BlastOutput_version', + 'Data' => $2}); + } elsif ( /^Query=\s*(.+)$/ ) { + my $q = $1; + my $size = 0; + if( defined $seenquery ) { + $self->_pushback($reportline); + $self->_pushback($_); + $self->in_element('hsp') && + $self->end_element({'Name'=> 'Hsp'}); + $self->in_element('hit') && + $self->end_element({'Name'=> 'Hit'}); + $self->end_element({'Name' => 'BlastOutput'}); + return $self->end_document(); + } else { + if( ! defined $reporttype ) { + $self->start_element({'Name' => 'BlastOutput'}); + $seentop = 1; + $self->{'_result_count'}++; + } + } + $seenquery = $q; + $_ = $self->_readline; + while( defined ($_) ) { + if( /^Database:/ ) { + $self->_pushback($_); + last; + } + chomp; + if( /\((\-?[\d,]+)\s+letters.*\)/ ) { + $size = $1; + $size =~ s/,//g; + last; + } else { + $q .= " $_"; + $q =~ s/ +/ /g; + $q =~ s/^ | $//g; + } + + $_ = $self->_readline; + } + chomp($q); + my ($nm,$desc) = split(/\s+/,$q,2); + $self->element({ 'Name' => 'BlastOutput_query-def', + 'Data' => $nm}); + $self->element({ 'Name' => 'BlastOutput_query-len', + 'Data' => $size}); + defined $desc && $desc =~ s/\s+$//; + $self->element({ 'Name' => 'BlastOutput_querydesc', + 'Data' => $desc}); + + if( my @pieces = split(/\|/,$nm) ) { + my $acc = pop @pieces; + $acc = pop @pieces if( ! defined $acc || $acc =~ /^\s+$/); + $self->element({ 'Name' => 'BlastOutput_query-acc', + 'Data' => $acc}); + } + + } elsif( /Sequences producing significant alignments:/ ) { + descline: + while( defined ($_ = $self->_readline() )) { + if( /^>/ ) { + $self->_pushback($_); + last descline; + } elsif( /(\d+)\s+([\d\.\-eE]+)(\s+\d+)?\s*$/) { + # the last match is for gapped BLAST output + # which will report the number of HSPs for the Hit + my ($score, $evalue) = ($1, $2); + # Some data clean-up so e-value will appear numeric to perl + $evalue =~ s/^e/1e/i; + push @hit_signifs, [ $evalue, $score ]; + } + } + } elsif( /Sequences producing High-scoring Segment Pairs:/ ) { + # skip the next line + $_ = $self->_readline(); + + while( defined ($_ = $self->_readline() ) && + ! /^\s+$/ ) { + my @line = split; + pop @line; # throw away first number which is for 'N'col + push @hit_signifs, [ pop @line, pop @line]; + } + } elsif ( /^Database:\s*(.+)$/ ) { + my $db = $1; + + while( defined($_ = $self->_readline) ) { + if( /^\s+(\-?[\d\,]+)\s+sequences\;\s+(\-?[\d,]+)\s+total\s+letters/){ + my ($s,$l) = ($1,$2); + $s =~ s/,//g; + $l =~ s/,//g; + $self->element({'Name' => 'BlastOutput_db-len', + 'Data' => $s}); + $self->element({'Name' => 'BlastOutput_db-let', + 'Data' => $l}); + last; + } else { + chomp; + $db .= $_; + } + } + $self->element({'Name' => 'BlastOutput_db', + 'Data' => $db}); + } elsif( /^>(\S+)\s*(.*)?/ ) { + chomp; + + $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'}); + $self->in_element('hit') && $self->end_element({ 'Name' => 'Hit'}); + + $self->start_element({ 'Name' => 'Hit'}); + my $id = $1; + my $restofline = $2; + $self->element({ 'Name' => 'Hit_id', + 'Data' => $id}); + my ($acc, $version); + if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) { + ($acc, $version) = split /\./, $2; + } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) { + ($acc, $version) = split /\./, $3; + } else { + #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README + #Database Name Identifier Syntax + #============================ ======================== + #GenBank gb|accession|locus + #EMBL Data Library emb|accession|locus + #DDBJ, DNA Database of Japan dbj|accession|locus + #NBRF PIR pir||entry + #Protein Research Foundation prf||name + #SWISS-PROT sp|accession|entry name + #Brookhaven Protein Data Bank pdb|entry|chain + #Patents pat|country|number + #GenInfo Backbone Id bbs|number + #General database identifier gnl|database|identifier + #NCBI Reference Sequence ref|accession|locus + #Local Sequence identifier lcl|identifier + $acc=$id; + } + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $acc}); + + my $v = shift @hit_signifs; + if( defined $v ) { + $self->element({'Name' => 'Hit_signif', + 'Data' => $v->[0]}); + $self->element({'Name' => 'Hit_score', + 'Data' => $v->[1]}); + } + while(defined($_ = $self->_readline()) ) { + next if( /^\s+$/ ); + chomp; + if( /Length\s*=\s*([\d,]+)/ ) { + my $l = $1; + $l =~ s/\,//g; + $self->element({ 'Name' => 'Hit_len', + 'Data' => $l }); + last; + } else { + $restofline .= $_; + } + } + $restofline =~ s/\s+/ /g; + $self->element({ 'Name' => 'Hit_def', + 'Data' => $restofline}); + } elsif( /\s+(Plus|Minus) Strand HSPs:/i ) { + next; + } elsif( ($self->in_element('hit') || + $self->in_element('hsp')) && # wublast + m/Score\s*=\s*(\S+)\s* # Bit score + \(([\d\.]+)\s*bits\), # Raw score + \s*Expect\s*=\s*([^,\s]+), # E-value + \s*(?:Sum)?\s* # SUM + P(?:\(\d+\))?\s*=\s*([^,\s]+) # P-value + /ox + ) { + my ($score, $bits,$evalue,$pvalue) = ($1,$2,$3,$4); + $evalue =~ s/^e/1e/i; + $pvalue =~ s/^e/1e/i; + $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'}); + $self->start_element({'Name' => 'Hsp'}); + $self->element( { 'Name' => 'Hsp_score', + 'Data' => $score}); + $self->element( { 'Name' => 'Hsp_bit-score', + 'Data' => $bits}); + $self->element( { 'Name' => 'Hsp_evalue', + 'Data' => $evalue}); + $self->element( {'Name' => 'Hsp_pvalue', + 'Data' => $pvalue}); + } elsif( ($self->in_element('hit') || + $self->in_element('hsp')) && # ncbi blast + m/Score\s*=\s*(\S+)\s*bits\s* # Bit score + (?:\((\d+)\))?, # Missing for BLAT pseudo-BLAST fmt + \s*Expect(?:\(\d+\))?\s*=\s*(\S+) # E-value + /ox) { + my ($bits,$score,$evalue) = ($1,$2,$3); + $evalue =~ s/^e/1e/i; + $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'}); + + $self->start_element({'Name' => 'Hsp'}); + $self->element( { 'Name' => 'Hsp_score', + 'Data' => $score}); + $self->element( { 'Name' => 'Hsp_bit-score', + 'Data' => $bits}); + $self->element( { 'Name' => 'Hsp_evalue', + 'Data' => $evalue}); + } elsif( $self->in_element('hsp') && + m/Identities\s*=\s*(\d+)\s*\/\s*(\d+)\s*[\d\%\(\)]+\s* + (?:,\s*Positives\s*=\s*(\d+)\/(\d+)\s*[\d\%\(\)]+\s*)? # pos only valid for Protein alignments + (?:\,\s*Gaps\s*=\s*(\d+)\/(\d+))? # Gaps + /oxi + ) { + $self->element( { 'Name' => 'Hsp_identity', + 'Data' => $1}); + $self->element( {'Name' => 'Hsp_align-len', + 'Data' => $2}); + if( defined $3 ) { + $self->element( { 'Name' => 'Hsp_positive', + 'Data' => $3}); + } else { + $self->element( { 'Name' => 'Hsp_positive', + 'Data' => $1}); + } + if( defined $6 ) { + $self->element( { 'Name' => 'Hsp_gaps', + 'Data' => $5}); + } + + $self->{'_Query'} = { 'begin' => 0, 'end' => 0}; + $self->{'_Sbjct'} = { 'begin' => 0, 'end' => 0}; + + if( /(Frame\s*=\s*.+)$/ ) { + # handle wu-blast Frame listing on same line + $self->_pushback($1); + } + } elsif( $self->in_element('hsp') && + /Strand\s*=\s*(Plus|Minus)\s*\/\s*(Plus|Minus)/i ) { + # consume this event ( we infer strand from start/end) + next; + } elsif( $self->in_element('hsp') && + /Frame\s*=\s*([\+\-][1-3])\s*(\/\s*([\+\-][1-3]))?/ ){ + my ($one,$two)= ($1,$2); + my ($queryframe,$hitframe); + if( $reporttype eq 'TBLASTX' ) { + ($queryframe,$hitframe) = ($one,$two); + $hitframe =~ s/\/\s*//g; + } elsif( $reporttype =~ /^(PSI)?TBLASTN/oi ) { + ($hitframe,$queryframe) = ($one,0); + } elsif( $reporttype eq 'BLASTX' ) { + ($queryframe,$hitframe) = ($one,0); + } + $self->element({'Name' => 'Hsp_query-frame', + 'Data' => $queryframe}); + + $self->element({'Name' => 'Hsp_hit-frame', + 'Data' => $hitframe}); + } elsif( /^Parameters:/ || /^\s+Database:\s+?/ || /^\s+Subset/ || + /^\s+Subset/ || /^\s*Lambda/ || /^\s*Histogram/ || + ( $self->in_element('hsp') && /WARNING|NOTE/ ) ) { + $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'}); + $self->in_element('hit') && $self->end_element({'Name' => 'Hit'}); + next if /^\s+Subset/; + my $blast = ( /^(\s+Database\:)|(\s*Lambda)/ ) ? 'ncbi' : 'wublast'; + if( /^\s*Histogram/ ) { + $blast = 'btk'; + } + my $last = ''; + # default is that gaps are allowed + $self->element({'Name' => 'Parameters_allowgaps', + 'Data' => 'yes'}); + while( defined ($_ = $self->_readline ) ) { + if( /^(PSI)?([T]?BLAST[NPX])\s*(.+)$/i || + /^(RPS-BLAST)\s*(.+)$/i || + /^(MEGABLAST)\s*(.+)$/i ) { + $self->_pushback($_); + # let's handle this in the loop + last; + } elsif( /^Query=/ ) { + $self->_pushback($reportline); + $self->_pushback($_); + # -- Superfluous I think + $self->in_element('hsp') && + $self->end_element({'Name' => 'Hsp'}); + $self->in_element('hit') && + $self->end_element({'Name' => 'Hit'}); + # -- + $self->end_element({ 'Name' => 'BlastOutput'}); + return $self->end_document(); + } + + # here is where difference between wublast and ncbiblast + # is better handled by different logic + if( /Number of Sequences:\s+([\d\,]+)/i || + /of sequences in database:\s+([\d,]+)/i) { + my $c = $1; + $c =~ s/\,//g; + $self->element({'Name' => 'Statistics_db-len', + 'Data' => $c}); + } elsif ( /letters in database:\s+(\-?[\d,]+)/i) { + my $s = $1; + $s =~ s/,//g; + $self->element({'Name' => 'Statistics_db-let', + 'Data' => $s}); + } elsif( $blast eq 'btk' ) { + next; + } elsif( $blast eq 'wublast' ) { + if( /E=(\S+)/ ) { + $self->element({'Name' => 'Parameters_expect', + 'Data' => $1}); + } elsif( /nogaps/ ) { + $self->element({'Name' => 'Parameters_allowgaps', + 'Data' => 'no'}); + } elsif( $last =~ /(Frame|Strand)\s+MatID\s+Matrix name/i ){ + s/^\s+//; + #throw away first two slots + my @vals = split; + splice(@vals, 0,2); + my ($matrix,$lambda,$kappa,$entropy) = @vals; + $self->element({'Name' => 'Parameters_matrix', + 'Data' => $matrix}); + $self->element({'Name' => 'Statistics_lambda', + 'Data' => $lambda}); + $self->element({'Name' => 'Statistics_kappa', + 'Data' => $kappa}); + $self->element({'Name' => 'Statistics_entropy', + 'Data' => $entropy}); + } elsif( m/^\s+Q=(\d+),R=(\d+)\s+/ox ) { + $self->element({'Name' => 'Parameters_gap-open', + 'Data' => $1}); + $self->element({'Name' => 'Parameters_gap-extend', + 'Data' => $2}); + } elsif( /(\S+\s+\S+)\s+DFA:\s+(\S+)\s+\((.+)\)/ ) { + if( $1 eq 'states in') { + $self->element({'Name' => 'Statistics_DFA_states', + 'Data' => "$2 $3"}); + } elsif( $1 eq 'size of') { + $self->element({'Name' => 'Statistics_DFA_size', + 'Data' => "$2 $3"}); + } + } elsif( /^\s+Time to generate neighborhood:\s+(\S+\s+\S+\s+\S+)/ ) { + $self->element({'Name' => 'Statistics_neighbortime', + 'Data' => $1}); + } elsif( /processors\s+used:\s+(\d+)/ ) { + $self->element({'Name' => 'Statistics_noprocessors', + 'Data' => $1}); + } elsif( m/^\s+(\S+)\s+cpu\s+time:\s+(\S+\s+\S+\s+\S+)\s+ + Elapsed:\s+(\S+)/ox ) { + my $cputype = lc($1); + $self->element({'Name' => "Statistics_$cputype\_cputime", + 'Data' => $2}); + $self->element({'Name' => "Statistics_$cputype\_actualtime", + 'Data' => $3}); + } elsif( /^\s+Start:/ ) { + my ($junk,$start,$stime,$end,$etime) = + split(/\s+(Start|End)\:\s+/,$_); + chomp($stime); + $self->element({'Name' => 'Statistics_starttime', + 'Data' => $stime}); + chomp($etime); + $self->element({'Name' => 'Statistics_endtime', + 'Data' => $etime}); + } elsif( !/^\s+$/ ) { + $self->debug( "unmatched stat $_"); + } + + } elsif ( $blast eq 'ncbi' ) { + if( m/^Matrix:\s+(\S+)/oxi ) { + $self->element({'Name' => 'Parameters_matrix', + 'Data' => $1}); + } elsif( /Lambda/ ) { + $_ = $self->_readline; + s/^\s+//; + my ($lambda, $kappa, $entropy) = split; + $self->element({'Name' => 'Statistics_lambda', + 'Data' => $lambda}); + $self->element({'Name' => 'Statistics_kappa', + 'Data' => $kappa}); + $self->element({'Name' => 'Statistics_entropy', + 'Data' => $entropy}); + } elsif( m/effective\s+search\s+space\s+used:\s+(\d+)/ox ) { + $self->element({'Name' => 'Statistics_eff-spaceused', + 'Data' => $1}); + } elsif( m/effective\s+search\s+space:\s+(\d+)/ox ) { + $self->element({'Name' => 'Statistics_eff-space', + 'Data' => $1}); + } elsif( m/Gap\s+Penalties:\s+Existence:\s+(\d+)\, + \s+Extension:\s+(\d+)/ox) { + $self->element({'Name' => 'Parameters_gap-open', + 'Data' => $1}); + $self->element({'Name' => 'Parameters_gap-extend', + 'Data' => $2}); + } elsif( /effective\s+HSP\s+length:\s+(\d+)/ ) { + $self->element({'Name' => 'Statistics_hsp-len', + 'Data' => $1}); + } elsif( /effective\s+length\s+of\s+query:\s+([\d\,]+)/ ) { + my $c = $1; + $c =~ s/\,//g; + $self->element({'Name' => 'Statistics_query-len', + 'Data' => $c}); + } elsif( m/effective\s+length\s+of\s+database:\s+ + ([\d\,]+)/ox){ + my $c = $1; + $c =~ s/\,//g; + $self->element({'Name' => 'Statistics_eff-dblen', + 'Data' => $c}); + } elsif( m/^(T|A|X1|X2|S1|S2):\s+(.+)/ox ) { + my $v = $2; + chomp($v); + $self->element({'Name' => "Statistics_$1", + 'Data' => $v}); + } elsif( m/frameshift\s+window\,\s+decay\s+const:\s+ + (\d+)\,\s+([\.\d]+)/ox ) { + $self->element({'Name'=> 'Statistics_framewindow', + 'Data' => $1}); + $self->element({'Name'=> 'Statistics_decay', + 'Data' => $2}); + } elsif( m/^Number\s+of\s+Hits\s+to\s+DB:\s+(\S+)/ox ) { + $self->element({'Name' => 'Statistics_hit_to_db', + 'Data' => $1}); + } elsif( m/^Number\s+of\s+extensions:\s+(\S+)/ox ) { + $self->element({'Name' => 'Statistics_num_extensions', + 'Data' => $1}); + } elsif( m/^Number\s+of\s+successful\s+extensions:\s+ + (\S+)/ox ) { + $self->element({'Name' => 'Statistics_num_suc_extensions', + 'Data' => $1}); + } elsif( m/^Number\s+of\s+sequences\s+better\s+than\s+ + (\S+):\s+(\d+)/ox ) { + $self->element({'Name' => 'Parameters_expect', + 'Data' => $1}); + $self->element({'Name' => 'Statistics_seqs_better_than_cutoff', + 'Data' => $2}); + } elsif( /^\s+Posted\s+date:\s+(.+)/ ) { + my $d = $1; + chomp($d); + $self->element({'Name' => 'Statistics_posted_date', + 'Data' => $d}); + } elsif( ! /^\s+$/ ) { + $self->debug( "unmatched stat $_"); + } + } + $last = $_; + } + } elsif( $self->in_element('hsp') ) { + # let's read 3 lines at a time; + my %data = ( 'Query' => '', + 'Mid' => '', + 'Hit' => '' ); + my ($l,$len); + for( my $i = 0; + defined($_) && $i < 3; + $i++ ){ + chomp; + if( ($i == 0 && /^\s+$/) || ($l = /^\s*Lambda/i) ) { + $self->_pushback($_) if defined $_; + # this fixes bug #1443 + $self->end_element({'Name' => 'Hsp'}); + $self->end_element({'Name' => 'Hit'}) if $l; + last; + } + if( /^((Query|Sbjct):\s+(\d+)\s*)(\S+)\s+(\d+)/ ) { + $data{$2} = $4; + $len = length($1); + $self->{"\_$2"}->{'begin'} = $3 unless $self->{"_$2"}->{'begin'}; + $self->{"\_$2"}->{'end'} = $5; + } else { + $self->throw("no data for midline $_") + unless (defined $_ && defined $len); + $data{'Mid'} = substr($_,$len); + } + $_ = $self->_readline(); + } + $self->characters({'Name' => 'Hsp_qseq', + 'Data' => $data{'Query'} }); + $self->characters({'Name' => 'Hsp_hseq', + 'Data' => $data{'Sbjct'}}); + $self->characters({'Name' => 'Hsp_midline', + 'Data' => $data{'Mid'} }); + } else { + $self->debug( "unrecognized line $_"); + } + } + + if( $seentop ) { + # double back check that hits and hsps are closed + # this in response to bug #1443 (may be uncessary due to fix + # above, but making double sure) + $self->in_element('hsp') && + $self->end_element({'Name' => 'Hsp'}); + $self->in_element('hit') && + $self->end_element({'Name' => 'Hit'}); + $self->end_element({'Name' => 'BlastOutput'}); + } +# $self->end_element({'Name' => 'BlastOutput'}) unless ! $seentop; + return $self->end_document(); +} + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + my $type = $MODEMAP{$nm}; + if( $type ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + unshift @{$self->{'_elements'}}, $type; + if( $type eq 'result') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + } else { + # cleanup some things + if( defined $self->{'_values'} ) { + foreach my $k ( grep { /^\U$type\-/ } + keys %{$self->{'_values'}} ) { + delete $self->{'_values'}->{$k}; + } + } + } + } +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $type = $MODEMAP{$nm}; + my $rc; + if($nm eq 'BlastOutput_program' && + $self->{'_last_data'} =~ /(t?blast[npx])/i ) { + $self->{'_reporttype'} = uc $1; + } + + # Hsp are sort of weird, in that they end when another + # object begins so have to detect this in end_element for now + if( $nm eq 'Hsp' ) { + foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { + $self->element({'Name' => $_, + 'Data' => $self->{'_last_hspdata'}->{$_}}); + } + $self->{'_last_hspdata'} = {}; + $self->element({'Name' => 'Hsp_query-from', + 'Data' => $self->{'_Query'}->{'begin'}}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => $self->{'_Query'}->{'end'}}); + + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => $self->{'_Sbjct'}->{'begin'}}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => $self->{'_Sbjct'}->{'end'}}); + } + if( $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_reporttype'}, + $self->{'_values'}); + } + shift @{$self->{'_elements'}}; + + } elsif( $MAPPING{$nm} ) { + + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + # this is where we shove in the data from the + # hashref info about params or statistics + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } else { + $self->debug( "unknown nm $nm, ignoring\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( defined $type && $type eq 'result' ); + return $rc; + +} + +=head2 element + + Title : element + Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); + Function: Convience method that calls start_element, characters, end_element + Returns : none + Args : Hash ref with the keys 'Name' and 'Data' + + +=cut + +sub element{ + my ($self,$data) = @_; + $self->start_element($data); + $self->characters($data); + $self->end_element($data); +} + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,$data) = @_; + return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ ); + + if( $self->in_element('hsp') && + $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { + $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; + } + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,$name) = @_; + return 0 if ( ! defined $name && + ! defined $self->{'_elements'} || + scalar @{$self->{'_elements'}} == 0) ; + foreach ( @{$self->{'_elements'}} ) { + if( $_ eq $name ) { + return 1; + } + } + return 0; +} + + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,$name) = @_; + return 0 if ! defined $self->{'_elements'}->[0]; + return ( $self->{'_elements'}->[0] eq $name) +} + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document + Function: Handle a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_elements'} = []; +} + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document + Function: Handles an end document event + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_document{ + my ($self,@args) = @_; + return $self->{'_result'}; +} + + +sub write_result { + my ($self, $blast, @args) = @_; + + if( not defined($self->writer) ) { + $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS"); + $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() ); + } + $self->SUPER::write_result( $blast, @args ); +} + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +sub report_count { shift->result_count } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/blastxml.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/blastxml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,436 @@ +# $Id: blastxml.pm,v 1.24 2002/10/26 09:32:16 sac Exp $ +# +# BioPerl module for Bio::SearchIO::blastxml +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing. + +=head1 SYNOPSIS + + use Bio::SearchIO; + my $searchin = new Bio::SearchIO(-format => 'blastxml', + -file => 't/data/plague_yeast.bls.xml'); + while( my $result = $searchin->next_result ) { + } + + # one can also request that the parser NOT keep the XML data in memory + # by using the tempfile initialization flag. + my $searchin = new Bio::SearchIO(-tempfile => 1, + -format => 'blastxml', + -file => 't/data/plague_yeast.bls.xml'); + while( my $result = $searchin->next_result ) { + } + +=head1 DESCRIPTION + +This object implements a NCBI Blast XML parser. + +There is one additional initialization flag from the SearchIO defaults +- that is the -tempfile flag. If specified as true, then the parser +will write out each report to a temporary filehandle rather than +holding the entire report as a string in memory. The reason this is +done in the first place is NCBI reports have an uncessary E<lt>?xml +version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports +have an additional unecessary RPS-BLAST tag at the top of each report. +So we currently have implemented the work around by preparsing the +file (yes it makes the process slower, but it works). + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SearchIO::blastxml; +use vars qw(@ISA $DTD %MAPPING %MODEMAP $DEBUG); +use strict; + +$DTD = 'ftp://ftp.ncbi.nlm.nih.gov/blast/documents/NCBI_BlastOutput.dtd'; +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::SearchIO; +use XML::Parser::PerlSAX; +use XML::Handler::Subs; +use HTML::Entities; +use IO::File; + + +BEGIN { + # mapping of NCBI Blast terms to Bioperl hash keys + %MODEMAP = ('BlastOutput' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); + + %MAPPING = ( + # HSP specific fields + 'Hsp_bit-score' => 'HSP-bits', + 'Hsp_score' => 'HSP-score', + 'Hsp_evalue' => 'HSP-evalue', + 'Hsp_query-from' => 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_positive' => 'HSP-conserved', + 'Hsp_identity' => 'HSP-identical', + 'Hsp_gaps' => 'HSP-gaps', + 'Hsp_hitgaps' => 'HSP-hit_gaps', + 'Hsp_querygaps' => 'HSP-query_gaps', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_align-len' => 'HSP-hsp_length', + 'Hsp_query-frame'=> 'HSP-query_frame', + 'Hsp_hit-frame' => 'HSP-hit_frame', + + # these are ignored for now + 'Hsp_num' => 'HSP-order', + 'Hsp_pattern-from' => 'patternend', + 'Hsp_pattern-to' => 'patternstart', + 'Hsp_density' => 'hspdensity', + + # Hit specific fields + 'Hit_id' => 'HIT-name', + 'Hit_len' => 'HIT-length', + 'Hit_accession' => 'HIT-accession', + 'Hit_def' => 'HIT-description', + 'Hit_num' => 'HIT-order', + 'Iteration_iter-num' => 'HIT-iteration', + 'Iteration_stat' => 'HIT-iteration_statistic', + + 'BlastOutput_program' => 'RESULT-algorithm_name', + 'BlastOutput_version' => 'RESULT-algorithm_version', + 'BlastOutput_query-def' => 'RESULT-query_description', + 'BlastOutput_query-len' => 'RESULT-query_length', + 'BlastOutput_db' => 'RESULT-database_name', + 'BlastOutput_reference' => 'RESULT-program_reference', + 'BlastOutput_query-ID' => 'runid', + + 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'}, + 'Parameters_expect' => { 'RESULT-parameters' => 'expect'}, + 'Parameters_include' => { 'RESULT-parameters' => 'include'}, + 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'}, + 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'}, + 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'}, + 'Parameters_gap-extend'=> { 'RESULT-parameters' => 'gapext'}, + 'Parameters_filter' => {'RESULT-parameters' => 'filter'}, + 'Statistics_db-num' => 'RESULT-database_entries', + 'Statistics_db-len' => 'RESULT-database_letters', + 'Statistics_hsp-len' => { 'RESULT-statistics' => 'hsplength'}, + 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'}, + 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' }, + 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' }, + 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'}, + ); + eval { require Time::HiRes }; + if( $@ ) { $DEBUG = 0; } +} + + +@ISA = qw(Bio::SearchIO ); + +=head2 new + + Title : new + Usage : my $searchio = new Bio::SearchIO(-format => 'blastxml', + -file => 'filename', + -tempfile => 1); + Function: Initializes the object - this is chained through new in SearchIO + Returns : Bio::SearchIO::blastxml object + Args : One additional argument from the format and file/fh parameters. + -tempfile => boolean. Defaults to false. Write out XML data + to a temporary filehandle to send to + PerlSAX parser. +=cut + +=head2 _initialize + + Title : _initialize + Usage : private + Function: Initializes the object - this is chained through new in SearchIO + +=cut + +sub _initialize{ + my ($self,@args) = @_; + $self->SUPER::_initialize(@args); + my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args); + defined $usetempfile && $self->use_tempfile($usetempfile); + $self->{'_xmlparser'} = new XML::Parser::PerlSAX(); + $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0); +} + +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result { + my ($self) = @_; + + my $data = ''; + my $firstline = 1; + my ($tfh); + if( $self->use_tempfile ) { + $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!"); + $tfh->autoflush(1); + } + my $okaytoprocess; + while( defined( $_ = $self->_readline) ) { + if( /^RPS-BLAST/i ) { + $self->{'_type'} = 'RPSBLAST'; + next; + } + if( /^<\?xml version/ && ! $firstline) { + $self->_pushback($_); + last; + } + $_ = decode_entities($_); +# s/\'/\`/g; +# s/\>/\>/g; +# s/\</\</g; + $okaytoprocess = 1; + if( defined $tfh ) { + print $tfh $_; + } else { + $data .= $_; + } + $firstline = 0; + } + + return undef unless( $okaytoprocess); + + my %parser_args; + if( defined $tfh ) { + seek($tfh,0,0); + %parser_args = ('Source' => { 'ByteStream' => $tfh }, + 'Handler' => $self); + } else { + %parser_args = ('Source' => { 'String' => $data }, + 'Handler' => $self); + } + my $result; + my $starttime; + if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; } + + eval { + $result = $self->{'_xmlparser'}->parse(%parser_args); + $self->{'_result_count'}++; + }; + if( $@ ) { + $self->warn("error in parsing a report:\n $@"); + $result = undef; + } + if( $DEBUG ) { + $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime))); + } + # parsing magic here - but we call event handlers rather than + # instantiating things + return $result; +} + +=head2 SAX methods + +=cut + +=head2 start_document + + Title : start_document + Usage : $parser->start_document; + Function: SAX method to indicate starting to parse a new document + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; +} + +=head2 end_document + + Title : end_document + Usage : $parser->end_document; + Function: SAX method to indicate finishing parsing a new document + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub end_document{ + my ($self,@args) = @_; + return $self->{'_result'}; +} + +=head2 start_element + + Title : start_element + Usage : $parser->start_element($data) + Function: SAX method to indicate starting a new element + Returns : none + Args : hash ref for data + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + + if( my $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + } + + if($nm eq 'BlastOutput') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + } +} + +=head2 end_element + + Title : end_element + Usage : $parser->end_element($data) + Function: Signals finishing an element + Returns : Bio::Search object dpending on what type of element + Args : hash ref for data + +=cut + +sub end_element{ + my ($self,$data) = @_; + + my $nm = $data->{'Name'}; + my $rc; + if($nm eq 'BlastOutput_program' && + $self->{'_last_data'} =~ /(t?blast[npx])/i ) { + $self->{'_type'} = uc $1; + } + + if( my $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_type'}, + $self->{'_values'}); + } + } elsif( $MAPPING{$nm} ) { + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } elsif( $nm eq 'Iteration' || $nm eq 'Hit_hsps' || $nm eq 'Parameters' || + $nm eq 'BlastOutput_param' || $nm eq 'Iteration_hits' || + $nm eq 'Statistics' || $nm eq 'BlastOutput_iterations' ){ + + } else { + + $self->debug("ignoring unrecognized element type $nm\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( $nm eq 'BlastOutput' ); + return $rc; +} + +=head2 characters + + Title : characters + Usage : $parser->characters($data) + Function: Signals new characters to be processed + Returns : characters read + Args : hash ref with the key 'Data' + + +=cut + +sub characters{ + my ($self,$data) = @_; + return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ ); + + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 use_tempfile + + Title : use_tempfile + Usage : $obj->use_tempfile($newval) + Function: Get/Set boolean flag on whether or not use a tempfile + Example : + Returns : value of use_tempfile + Args : newvalue (optional) + + +=cut + +sub use_tempfile{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_use_tempfile'} = $value; + } + return $self->{'_use_tempfile'}; +} + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/chado.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/chado.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,254 @@ +# $Id: chado.pm,v 1.1 2002/12/03 08:13:55 cjm Exp $ +# +# BioPerl module for Bio::SearchIO::chado +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::chado - chado sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SearchIO handler system. Go: + + $stream = Bio::SearchIO->new(-file => $filename, -format => 'chado'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from chado flat +file databases. CURRENTLY ONLY TO + + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SearchIO::chado; +use vars qw(@ISA); +use strict; + +use Bio::SearchIO; +use Bio::SeqFeature::Generic; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + + +use Bio::SeqIO::chado; + +use Data::Stag qw(:all); + +# should really inherit off of a chado helper... +@ISA = qw(Bio::SearchIO Bio::SeqIO::chado); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + my $wclass = $self->default_handler_class; + $self->handler($wclass->new); + $self->{_end_of_data} = 0; + $self->handler->S("chado"); + return; +} + +sub DESTROY { + my $self = shift; + $self->end_of_data(); + $self->SUPER::DESTROY(); +} + +sub end_of_data { + my $self = shift; + $self->{_end_of_data} = 1; + $self->handler->E("chado"); +} + +sub default_handler_class { + return "Data::Stag::BaseHandler"; +} + +=head2 write_result + + Title : write_result + Usage : $stream->write_result($result) + Function: writes the $result object (must be result) to the stream + Returns : 1 for success and 0 for error + Args : Bio::Result + + +=cut + +sub write_result { + my ($self,$result) = @_; + + if( !defined $result ) { + $self->throw("Attempting to write with no result!"); + } + + my $w = $self->handler; + $w->S("result"); +# my $result_temp_uid = $self->get_temp_uid($result); + + my @stats = + (map { + [analysisprop=>[ + [pkey=>$_], + [pval=>$result->get_statistic($_)]]] + } $result->available_statistics); + my @params = + (map { + [analysisprop=>[ + [pkey=>$_], + [pval=>$result->get_parameter($_)]]] + } $result->available_parameters); + + my $cid = $self->get_temp_uid($result); + $w->ev(companalysis=>[ + [companalysis_id=>$cid], + [datasource=>$result->database_name], + @stats, + @params, + ] + ); + while( my $hit = $result->next_hit ) { + # process the Bio::Search::Hit::HitI object + $self->write_hit($hit, $cid); + } + $w->E("result"); + return 1; +} + +sub write_hit { + my $self = shift; + my $hit = shift; + my $cid = shift; + + my $w = $self->handler; + my $hit_id = $self->get_temp_uid($hit); + + # we should determine the type by the type of blast; + # eg blastx gives CDS for hit and CDS_exon for HSP + my $fnode = + [feature=> [ + [feature_id=>$hit_id], + [name=>$hit->name], + [typename=>"hit"], + [analysisfeature=>[ + [rawscore=>$hit->raw_score], + [significance=>$hit->significance], + [analysis_id=>$cid]]]]]; + $w->ev(@$fnode); + foreach my $hsp ( $hit->hsps) { + $self->write_hsp($hsp, $hit_id); + } + return 1; +} + +sub write_hsp { + my $self = shift; + my $hsp = shift; + my $hid = shift; + + my $w = $self->handler; + my $hsp_id = $self->get_temp_uid($hsp); + my $order = 0; + my @lnodes = + map { + my ($nbeg, $nend, $strand) = + $self->bp2ib([$hsp->start($_), + $hsp->end($_), + $hsp->strand($_) + ]); + my $src = $_ eq 'query' ? $hsp->query->seq_id : $hsp->hit->seq_id; + [featureloc=>[ + [nbeg=>$nbeg], + [nend=>$nend], + [strand=>$strand], + [srcfeature=>$src], + [group=>0], + [order=>$order++], + ] + ] + } qw(query subject); + my $fnode = + [feature => [ + + [feature_id=>$hsp_id], + [typename=>"hsp"], + [analysisfeature=>[ + [rawscore=>$hsp->score], + [significance=>$hsp->significance], + ] + ], + @lnodes, + ] + ]; + $w->ev(@$fnode); + $w->ev(feature_relationship=>[ + [subjfeature_id=>$hsp_id], + [objfeature_id=>$hid] + ] + ); + return 1; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/chadosxpr.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/chadosxpr.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,78 @@ +# $Id: chadosxpr.pm,v 1.2 2002/12/05 13:46:35 heikki Exp $ +# +# BioPerl module for Bio::SearchIO::chadosxpr +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::chadosxpr - chadosxpr sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SearchIO handler system. Go: + + $stream = Bio::SearchIO->new(-file => $filename, -format => 'chadosxpr'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Search objects to and from chadosxpr flat +file databases. CURRENTLY ONLY TO + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SearchIO::chadosxpr; +use Bio::SearchIO::chado; +use vars qw(@ISA); +use strict; + +use Data::Stag::SxprWriter; + +@ISA = qw(Bio::SearchIO::chado); + +sub default_handler_class { + return "Data::Stag::SxprWriter"; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/exonerate.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/exonerate.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,578 @@ +# $Id: exonerate.pm,v 1.3.2.3 2003/03/29 20:30:54 jason Exp $ +# +# BioPerl module for Bio::SearchIO::exonerate +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::exonerate - parser for Exonerate + +=head1 SYNOPSIS + + # do not use this module directly, it is a driver for SearchIO + + use Bio::SearchIO; + my $searchio = new Bio::SearchIO(-file => 'file.exonerate', + -format => 'exonerate'); + + + while( my $r = $searchio->next_result ) { + print $r->query_name, "\n"; + } + +=head1 DESCRIPTION + +This is a driver for the SearchIO system for parsing Exonerate (Guy +Slater) output. You can get Exonerate at +http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate/?cvsroot=Ensembl +[until Guy puts up a Web reference,publication for it.]). + +An optional parameter -min_intron is supported by the L<new> +initialization method. This is if you run Exonerate with a different +minimum intron length (default is 30) the parser will be able to +detect the difference between standard deletions and an intron. Still +some room to play with there that might cause this to get +misinterpreted that has not been fully tested or explored. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::exonerate; +use strict; +use vars qw(@ISA @STATES %MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $MIN_INTRON); +use Bio::SearchIO; + +@ISA = qw(Bio::SearchIO ); + +use POSIX; + + +%MODEMAP = ('ExonerateOutput' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); +%MAPPING = + ( + 'Hsp_query-from'=> 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_score' => 'HSP-score', + 'Hsp_qlength' => 'HSP-query_length', + 'Hsp_hlength' => 'HSP-hit_length', + 'Hsp_align-len' => 'HSP-hsp_length', + 'Hsp_identity' => 'HSP-identical', + 'Hsp_gaps' => 'HSP-hsp_gaps', + 'Hsp_hitgaps' => 'HSP-hit_gaps', + 'Hsp_querygaps' => 'HSP-query_gaps', + + 'Hit_id' => 'HIT-name', + 'Hit_desc' => 'HIT-description', + 'Hit_len' => 'HIT-length', + 'Hit_score' => 'HIT-score', + + 'ExonerateOutput_program' => 'RESULT-algorithm_name', + 'ExonerateOutput_query-def' => 'RESULT-query_name', + 'ExonerateOutput_query-desc'=> 'RESULT-query_description', + 'ExonerateOutput_query-len' => 'RESULT-query_length', + ); + +$DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; + +$MIN_INTRON=30; # This is the minimum intron size + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::exonerate(); + Function: Builds a new Bio::SearchIO::exonerate object + Returns : an instance of Bio::SearchIO::exonerate + Args : + + +=cut + +sub new { + my ($class) = shift; + my $self = $class->SUPER::new(@_); + + my ($min_intron) = $self->_rearrange([qw(MIN_INTRON)], @_); + if( $min_intron ) { + $MIN_INTRON = $min_intron; + } + $self; +} + +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result{ + my ($self) = @_; + $self->{'_last_data'} = ''; + my ($reporttype,$seenquery,$reportline); + $self->start_document(); + my @hit_signifs; + my $seentop; + my (@q_ex, @m_ex, @h_ex); ## gc addition + while( defined($_ = $self->_readline) ) { + if( /^Query:\s+(\S+)(\s+(.+))?/ ) { + if( $seentop ) { + $self->end_element({'Name' => 'ExonerateOutput'}); + $self->_pushback($_); + return $self->end_document(); + } + $seentop = 1; + my ($nm,$desc) = ($1,$2); + chomp($desc) if defined $desc; + $self->{'_result_count'}++; + $self->start_element({'Name' => 'ExonerateOutput'}); + $self->element({'Name' => 'ExonerateOutput_query-def', + 'Data' => $nm }); + $self->element({'Name' => 'ExonerateOutput_query-desc', + 'Data' => $desc }); + $self->element({'Name' => 'ExonerateOutput_program', + 'Data' => 'Exonerate' }); + + } elsif ( /^Target:\s+(\S+)(\s+(.+))?/ ) { + my ($nm,$desc) = ($1,$2); + chomp($desc) if defined $desc; + $self->start_element({'Name' => 'Hit'}); + $self->element({'Name' => 'Hit_id', + 'Data' => $nm}); + $self->element({'Name' => 'Hit_desc', + 'Data' => $desc}); + } elsif( s/^cigar:\s+(\S+)\s+ # query sequence id + (\d+)\s+(\d+)\s+([\-\+])\s+ # query start-end-strand + (\S+)\s+ # target sequence id + (\d+)\s+(\d+)\s+([\-\+])\s+ # target start-end-strand + (\d+)\s+ # score + //ox ) { + + ## gc note: + ## $qe and $he are no longer used for calculating the ends, + ## just the $qs and $hs values and the alignment and insert lenghts + my ($qs,$qe,$qstrand) = ($2,$3,$4); + my ($hs,$he,$hstrand) = ($6,$7,$8); + my $score = $9; +# $self->element({'Name' => 'ExonerateOutput_query-len', +# 'Data' => $qe}); +# $self->element({'Name' => 'Hit_len', +# 'Data' => $he}); + + my @rest = split; + if( $qstrand eq '-' ) { + $qstrand = -1; + ($qs,$qe) = ($qe,$qs); # flip-flop if we're on opp strand + $qs--; $qe++; + } else { $qstrand = 1; } + if( $hstrand eq '-' ) { + $hstrand = -1; + ($hs,$he) = ($he,$hs); # flip-flop if we're on opp strand + $hs--; $he++; + } else { $hstrand = 1; } + # okay let's do this right and generate a set of HSPs + # from the cigar line + + ## gc note: + ## add one because these values are zero-based + ## this calculation was originally done lower in the code, + ## but it's clearer to do it just once at the start + $qs++; + $hs++; + + my ($aln_len,$inserts,$deletes) = (0,0,0); + while( @rest >= 2 ) { + my ($state,$len) = (shift @rest, shift @rest); + if( $state eq 'I' ) { + $inserts+=$len; + } elsif( $state eq 'D' ) { + if( $len >= $MIN_INTRON ) { + $self->start_element({'Name' => 'Hsp'}); + + $self->element({'Name' => 'Hsp_score', + 'Data' => $score}); + $self->element({'Name' => 'Hsp_align-len', + 'Data' => $aln_len}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => $aln_len - + ($inserts + $deletes)}); + + # HSP ends where the other begins + $self->element({'Name' => 'Hsp_query-from', + 'Data' => $qs}); + ## gc note: + ## $qs is now the start of the next hsp + ## the end of this hsp is 1 before this position + ## (or 1 after in case of reverse strand) + $qs += $aln_len*$qstrand; + $self->element({'Name' => 'Hsp_query-to', + 'Data' => $qs - ($qstrand*1)}); + + $hs += $deletes*$hstrand; + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => $hs}); + $hs += $aln_len*$hstrand; + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => $hs-($hstrand*1)}); + + $self->element({'Name' => 'Hsp_align-len', + 'Data' => $aln_len + $inserts + + $deletes}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => $aln_len }); + + $self->element({'Name' => 'Hsp_gaps', + 'Data' => $inserts + $deletes}); + $self->element({'Name' => 'Hsp_querygaps', + 'Data' => $inserts}); + $self->element({'Name' => 'Hsp_hitgaps', + 'Data' => $deletes}); + +## gc addition start + + $self->element({'Name' => 'Hsp_qseq', + 'Data' => shift @q_ex, + }); + $self->element({'Name' => 'Hsp_hseq', + 'Data' => shift @h_ex, + }); + $self->element({'Name' => 'Hsp_midline', + 'Data' => shift @m_ex, + }); +## gc addition end + $self->end_element({'Name' => 'Hsp'}); + + $aln_len = $inserts = $deletes = 0; + } + $deletes+=$len; + } else { + $aln_len += $len; + } + } + $self->start_element({'Name' => 'Hsp'}); + +## gc addition start + + $self->element({'Name' => 'Hsp_qseq', + 'Data' => shift @q_ex, + }); + $self->element({'Name' => 'Hsp_hseq', + 'Data' => shift @h_ex, + }); + $self->element({'Name' => 'Hsp_midline', + 'Data' => shift @m_ex, + }); +## gc addition end + + $self->element({'Name' => 'Hsp_score', + 'Data' => $score}); + + $self->element({'Name' => 'Hsp_query-from', + 'Data' => $qs}); + + $qs += $aln_len*$qstrand; + $self->element({'Name' => 'Hsp_query-to', + 'Data' => $qs - ($qstrand*1)}); + + $hs += $deletes*$hstrand; + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => $hs}); + $hs += $aln_len*$hstrand; + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => $hs -($hstrand*1)}); + + $self->element({'Name' => 'Hsp_align-len', + 'Data' => $aln_len}); + + $self->element({'Name' => 'Hsp_identity', + 'Data' => $aln_len - ($inserts + $deletes)}); + + $self->element({'Name' => 'Hsp_gaps', + 'Data' => $inserts + $deletes}); + + $self->element({'Name' => 'Hsp_querygaps', + 'Data' => $inserts}); + $self->element({'Name' => 'Hsp_hitgaps', + 'Data' => $deletes}); + $self->end_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hit_score', + 'Data' => $score}); + $self->end_element({'Name' => 'Hit'}); + $self->end_element({'Name' => 'ExonerateOutput'}); + + return $self->end_document(); + } else { + } + } + return $self->end_document() if( $seentop ); +} + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + my $type = $MODEMAP{$nm}; + + if( $type ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + unshift @{$self->{'_elements'}}, $type; + + if($type eq 'result') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + } + } + +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $type = $MODEMAP{$nm}; + my $rc; + + if( $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_reporttype'}, + $self->{'_values'}); + } + shift @{$self->{'_elements'}}; + + } elsif( $MAPPING{$nm} ) { + + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } else { + $self->debug( "unknown nm $nm, ignoring\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( defined $type && $type eq 'result' ); + return $rc; +} + +=head2 element + + Title : element + Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); + Function: Convience method that calls start_element, characters, end_element + Returns : none + Args : Hash ref with the keys 'Name' and 'Data' + + +=cut + +sub element{ + my ($self,$data) = @_; + $self->start_element($data); + $self->characters($data); + $self->end_element($data); +} + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,$data) = @_; + + return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ ); + + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,$name) = @_; + return 0 if ( ! defined $name && + ! defined $self->{'_elements'} || + scalar @{$self->{'_elements'}} == 0) ; + foreach ( @{$self->{'_elements'}} ) { + if( $_ eq $name ) { + return 1; + } + } + return 0; +} + + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,$name) = @_; + return 0 if ! defined $self->{'_elements'}->[0]; + return ( $self->{'_elements'}->[0] eq $name) +} + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document + Function: Handle a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_elements'} = []; + $self->{'_reporttype'} = 'exonerate'; +} + + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document + Function: Handles an end document event + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_document{ + my ($self,@args) = @_; + return $self->{'_result'}; +} + + +sub write_result { + my ($self, $blast, @args) = @_; + + if( not defined($self->writer) ) { + $self->warn("Writer not defined. Using a $DEFAULT_WRITER_CLASS"); + $self->writer( $DEFAULT_WRITER_CLASS->new() ); + } + $self->SUPER::write_result( $blast, @args ); +} + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +sub report_count { shift->result_count } + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1066 @@ +# $Id: fasta.pm,v 1.33.2.3 2003/08/28 16:01:03 jason Exp $ +# +# BioPerl module for Bio::SearchIO::fasta +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::fasta - A SearchIO parser for FASTA results + +=head1 SYNOPSIS + + # Do not use this object directly, use it through the SearchIO system + use Bio::SearchIO; + my $searchio = new Bio::SearchIO(-format => 'fasta', + -file => 'report.FASTA'); + while( my $result = $searchio->next_result ) { + # .... + } + +=head1 DESCRIPTION + +This object contains the event based parsing code for FASTA format reports. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::fasta; +use vars qw(@ISA %MODEMAP %MAPPING $IDLENGTH); +use strict; + +# Object preamble - inherits from Bio::Root::RootI + +use Bio::SearchIO; +use POSIX; + +BEGIN { + # Set IDLENGTH to a new value if you have + # compile FASTA with a different ID length + # (actually newest FASTA allows the setting of this + # via -C parameter, default is 6) + $IDLENGTH = 6; + + # mapping of NCBI Blast terms to Bioperl hash keys + %MODEMAP = ('FastaOutput' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); + + # This should really be done more intelligently, like with + # XSLT + + %MAPPING = + ( + 'Hsp_bit-score' => 'HSP-bits', + 'Hsp_score' => 'HSP-score', + 'Hsp_sw-score' => 'HSP-swscore', + 'Hsp_evalue' => 'HSP-evalue', + 'Hsp_query-from'=> 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_positive' => 'HSP-conserved', + 'Hsp_identity' => 'HSP-identical', + 'Hsp_gaps' => 'HSP-hsp_gaps', + 'Hsp_hitgaps' => 'HSP-hit_gaps', + 'Hsp_querygaps' => 'HSP-query_gaps', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_align-len' => 'HSP-hsp_length', + 'Hsp_query-frame'=> 'HSP-query_frame', + 'Hsp_hit-frame' => 'HSP-hit_frame', + + 'Hit_id' => 'HIT-name', + 'Hit_len' => 'HIT-length', + 'Hit_accession' => 'HIT-accession', + 'Hit_def' => 'HIT-description', + 'Hit_signif' => 'HIT-significance', + 'Hit_score' => 'HIT-score', + + 'FastaOutput_program' => 'RESULT-algorithm_name', + 'FastaOutput_version' => 'RESULT-algorithm_version', + 'FastaOutput_query-def'=> 'RESULT-query_name', + 'FastaOutput_querydesc'=> 'RESULT-query_description', + 'FastaOutput_query-len'=> 'RESULT-query_length', + 'FastaOutput_db' => 'RESULT-database_name', + 'FastaOutput_db-len' => 'RESULT-database_entries', + 'FastaOutput_db-let' => 'RESULT-database_letters', + + 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'}, + 'Parameters_expect' => { 'RESULT-parameters' => 'expect'}, + 'Parameters_include' => { 'RESULT-parameters' => 'include'}, + 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'}, + 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'}, + 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'}, + 'Parameters_gap-ext' => { 'RESULT-parameters' => 'gapext'}, + 'Parameters_word-size' => { 'RESULT-parameters' => 'wordsize'}, + 'Parameters_ktup' => { 'RESULT-parameters' => 'ktup'}, + 'Parameters_filter' => {'RESULT-parameters' => 'filter'}, + 'Statistics_db-num' => { 'RESULT-statistics' => 'dbentries'}, + 'Statistics_db-len' => { 'RESULT-statistics' => 'dbletters'}, + 'Statistics_hsp-len' => { 'RESULT-statistics' => 'hsplength'}, + 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'}, + 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' }, + 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' }, + 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'}, + ); +} + + +@ISA = qw(Bio::SearchIO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::fasta(); + Function: Builds a new Bio::SearchIO::fasta object + Returns : Bio::SearchIO::fasta + Args : -idlength - set ID length to something other + than the default (7), this is only + necessary if you have compiled FASTA + with a new default id length to display + in the HSP alignment blocks + +=cut + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + return unless @args; + my ($idlength) = $self->_rearrange([qw(IDLENGTH)],@args); + $self->idlength($idlength || $IDLENGTH); + $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::FastaHSP')); + + return 1; +} + +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result{ + my ($self) = @_; + + my $data = ''; + my $seentop = 0; + my $current_hsp; + $self->start_document(); + my @hit_signifs; + while( defined ($_ = $self->_readline )) { + next if( ! $self->in_element('hsp') && + /^\s+$/); # skip empty lines + if( /(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i || /(\S+) compares a/ || + ( m/^# / && ($_ = $self->_readline) && + /(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i || /(\S+) compares a/ + ) + ) { + if( $seentop ) { + $self->_pushback($_); + $self->end_element({ 'Name' => 'FastaOutput'}); + return $self->end_document(); + } + $self->{'_reporttype'} = $1; + $self->start_element({ 'Name' => 'FastaOutput' } ); + $self->{'_result_count'}++; + $seentop = 1; + + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'}}); + $_ = $self->_readline(); + my ($version) = (/version\s+(\S+)/); + $version = '' unless defined $version; + $self->{'_version'} = $version; + $self->element({ 'Name' => 'FastaOutput_version', + 'Data' => $version}); + + my ($last, $leadin, $type, $querylen, $querytype, $querydef); + + while( defined($_ = $self->_readline()) ) { + if( /^ ( + (?:\s+>) | # fa33 lead-in + (?:\s*\d+\s*>>>) # fa34 mlib lead-in + ) + (.*) + /x + ) { + ($leadin, $querydef) = ($1, $2); + if ($leadin =~ m/>>>/) { + if($querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt)\s*$/o ) { + ($querydef, $querylen, $querytype) = ($1, $2, $3); + last; + } + } else { + if( $last =~ /(\S+)[:,]\s*(\d+)\s+(aa|nt)/ ) { + ($querylen, $querytype) = ($2, $3); + $querydef ||= $1; + last; + } + } + } elsif ( m/^\s*vs\s+\S+/o ) { + if ( $last =~ /(\S+)[,:]\s+(\d+)\s+(aa|nt)/o) { + ($querydef, $querylen, $querytype) = ($1, $2, $3); + last; + } + } + $last = $_; + } + + if( $self->{'_reporttype'} && + $self->{'_reporttype'} eq 'FASTA' + ) { + if( $querytype eq 'nt') { + $self->{'_reporttype'} = 'FASTN' ; + } elsif( $querytype eq 'aa' ) { + $self->{'_reporttype'} = 'FASTP' ; + } + } + my ($name, $descr) = $querydef =~ m/^(\S+)\s*(.*?)\s*$/o; + $self->element({'Name' => 'FastaOutput_query-def', + 'Data' => $name}); + $self->element({'Name' => 'FastaOutput_querydesc', + 'Data' => $descr}); + if ($querylen) { + $self->element({'Name' => 'FastaOutput_query-len', + 'Data' => $querylen}); + } else { + $self->warn("unable to find and set query length"); + } + + if( $last =~ /^\s*vs\s+(\S+)/ || + ($last =~ /^searching\s+(\S+)\s+library/) || + (defined $_ && /^\s*vs\s+(\S+)/) || + (defined ($_ = $self->_readline()) && /^\s*vs\s+(\S+)/) + ) { + $self->element({'Name' => 'FastaOutput_db', + 'Data' => $1}); + } elsif (m/^\s+opt(?:\s+E\(\))?$/o) { + # histogram ... read over it more rapidly than the larger outer loop: + while (defined($_ = $self->_readline)) { + last if m/^>\d+/; + } + } + + } elsif( /(\d+) residues in\s+(\d+)\s+sequences/ ) { + $self->element({'Name' => 'FastaOutput_db-let', + 'Data' => $1}); + $self->element({'Name' => 'FastaOutput_db-len', + 'Data' => $2}); + $self->element({'Name' => 'Statistics_db-len', + 'Data' => $1}); + $self->element({'Name' => 'Statistics_db-num', + 'Data' => $2}); + } elsif( /Lambda=\s*(\S+)/ ) { + $self->element({'Name' => 'Statistics_lambda', + 'Data' => $1}); + } elsif (/K=\s*(\S+)/) { + $self->element({'Name' => 'Statistics_kappa', + 'Data' => $1}); + } elsif( /^\s*(Smith-Waterman).+(\S+)\s*matrix [^\]]*?(xS)?\]/ ) { + $self->element({'Name' => 'Parameters_matrix', + 'Data' => $2}); + $self->element({'Name' => 'Parameters_filter', + 'Data' => defined $3 ? 1 : 0, + }); + $self->{'_reporttype'} = $1; + + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'}}); + + } elsif( /The best( related| unrelated)? scores are:/ ) { + my $rel = $1; + my @labels = split; + @labels = map { + if ($_ =~ m/^E\((\d+)\)$/o) { + $self->element({'Name' => 'Statistics_eff-space', 'Data' => $1}); + "evalue"; + } else { + $_; + } + } @labels[$rel ? 5 : 4 .. $#labels]; + + while( defined ($_ = $self->_readline() ) && + ! /^\s+$/ ) { + my @line = split; + + if ($line[-1] =~ m/\=/o && $labels[-1] eq 'fs') { + # unlabelled alignment hit; + push @labels, "aln_code"; + } + + my %data; + @data{@labels} = splice(@line, @line - @labels); + if ($line[-1] =~ m/\[([1-6rf])\]/o) { + my $fr = $1; + $data{lframe} = ($fr =~ /\d/o ? + ($fr <= 3 ? "+$fr" : "-@{[$fr-3]}") : + ($fr eq 'f' ? '+1' : '-1') + ); + pop @line; + } else { + $data{lframe} = '0'; + } + + if ($line[-1] =~ m/^\(?(\d+)\)$/) { + $data{hit_len} = $1; + pop @line; + if ($line[-1] =~ m/^\($/) { + pop @line; + } + } else { + $data{hit_len} = 0; + } + + # rebuild the first part of the line, preserving spaces: + ($_) = m/^(\S+(?:\s+\S+){$#line})/; + + my ($id, $desc) = split(/\s+/,$_,2); + my @pieces = split(/\|/,$id); + my $acc = pop @pieces; + $acc =~ s/\.\d+$//; + + @data{qw(id desc acc)} = ($id, $desc, $acc); + + push @hit_signifs, \%data; + } + } elsif( /^\s*([T]?FAST[XYAF]).+,\s*(\S+)\s*matrix[^\]]+?(xS)?\]\s*ktup:\s*(\d+)/ ) { + $self->element({'Name' => 'Parameters_matrix', + 'Data' => $2}); + $self->element({'Name' => 'Parameters_filter', + 'Data' => defined $3 ? 1 : 0, + }); + $self->element({'Name' => 'Parameters_ktup', + 'Data' => $4}); + $self->{'_reporttype'} = $1 if( $self->{'_reporttype'} !~ /FAST[PN]/i ) ; + + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'}}); + + } elsif( /(?:gap\-pen|open\/ext):\s+([\-\+]?\d+)\s*\/\s*([\-\+]?\d+).+width:\s+(\d+)/ ) { + $self->element({'Name' => 'Parameters_gap-open', + 'Data' => $1}); + $self->element({'Name' => 'Parameters_gap-ext', + 'Data' => $2}); + $self->element({'Name' => 'Parameters_word-size', + 'Data' => $3}); + } elsif( /^>>(.+?)\s+\((\d+)\s*(aa|nt)\)$/ ) { + if( $self->in_element('hsp') ) { + $self->end_element({ 'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({ 'Name' => 'Hit'}); + } + + $self->start_element({'Name' => 'Hit'}); + $self->element({ 'Name' => 'Hit_len', + 'Data' => $2}); + my ($id,$desc) = split(/\s+/,$1,2); + $self->element({ 'Name' => 'Hit_id', + 'Data' => $id}); + my @pieces = split(/\|/,$id); + my $acc = pop @pieces; + $acc =~ s/\.\d+$//; + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $acc}); + $self->element({ 'Name' => 'Hit_def', + 'Data' => $desc}); + + $_ = $self->_readline(); + my ($score,$bits,$e) = /Z-score: \s* (\S+) \s* + (?: bits: \s* (\S+) \s+ )? + (?: E|expect ) \s* \(\) :? \s*(\S+)/x; + $bits = $score unless defined $bits; + + my $v = shift @hit_signifs; + if( defined $v ) { + @{$v}{qw(evalue bits z-sc)} = ($e, $bits, $score); + } + + $self->element({'Name' => 'Hit_signif', + 'Data' => $v ? $v->{evalue} : $e }); + $self->element({'Name' => 'Hit_score', + 'Data' => $v ? $v->{bits} : $bits }); + $self->start_element({'Name' => 'Hsp'}); + + $self->element({'Name' => 'Hsp_score', + 'Data' => $v ? $v->{'z-sc'} : $score }); + $self->element({'Name' => 'Hsp_evalue', + 'Data' => $v ? $v->{evalue} : $e }); + $self->element({'Name' => 'Hsp_bit-score', + 'Data' => $v ? $v->{bits} : $bits }); + $_ = $self->_readline(); + if( /Smith-Waterman score:\s*(\d+)/ ) { + $self->element({'Name' => 'Hsp_sw-score', + 'Data' => $1}); + } + if( / (\S+)\% \s* identity + (?:\s* \( \s* (\S+)\% \s* ungapped \) )? + \s* in \s* (\d+) \s+ (?:aa|nt) \s+ overlap \s* + \( (\d+) \- (\d+) : (\d+) \- (\d+) \) + /x ) { + my ($identper,$gapper,$len,$querystart, + $queryend,$hitstart,$hitend) = ($1,$2,$3,$4,$5,$6,$7); + my $ident = POSIX::ceil(($identper/100) * $len); + my $gaps = ( defined $gapper ) ? POSIX::ceil ( ($gapper/100) * $len) : undef; + + $self->element({'Name' => 'Hsp_gaps', + 'Data' => $gaps}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => $ident}); + $self->element({'Name' => 'Hsp_positive', + 'Data' => $ident}); + $self->element({'Name' => 'Hsp_align-len', + 'Data' => $len}); + + $self->element({'Name' => 'Hsp_query-from', + 'Data' => $querystart}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => $queryend}); + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => $hitstart}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => $hitend}); + + } + + if ($v) { + $self->element({'Name' => 'Hsp_querygaps', 'Data' => $v->{qgaps} }) if exists $v->{qgaps}; + $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $v->{lgaps} }) if exists $v->{lgaps}; + + if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) { + if( 8 == scalar grep { exists $v->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) { + if ($v->{ax0} < $v->{an0}) { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($v->{px0} - $v->{ax0}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($v->{an0} - $v->{pn0}) % 3) + 1]}" }); + } + if ($v->{ax1} < $v->{an1}) { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($v->{px1} - $v->{ax1}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($v->{an1} - $v->{pn1}) % 3) + 1]}" }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => $v->{lframe} }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $v->{lframe} }); + } + + } else { + $self->warn( "unable to parse FASTA score line: $_"); + } + } elsif( /\d+\s*residues\s*in\s*\d+\s*query\s*sequences/ ) { + if( $self->in_element('hsp') ) { + $self->end_element({'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({'Name' => 'Hit'}); + } + +# $_ = $self->_readline(); +# my ( $liblen,$libsize) = /(\d+)\s+residues\s*in(\d+)\s*library/; + # fast forward to the end of the file as there is + # nothing else left to do with this file and want to be sure and + # reset it + while(defined($_ = $self->_readline() ) ) { + last if( /^Function used was/); + if( /(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+sequence)|(sequence\s+database)/ ) { + $self->_pushback($_); + } + } + + if (@hit_signifs) { + # process remaining best hits + for my $h (@hit_signifs) { + # Hsp_score Hsp_evalue Hsp_bit-score + # Hsp_sw-score Hsp_gaps Hsp_identity Hsp_positive + # Hsp_align-len Hsp_query-from Hsp_query-to + # Hsp_hit-from Hsp_hit-to Hsp_qseq Hsp_midline + + $self->start_element({'Name' => 'Hit'}); + $self->element({ 'Name' => 'Hit_len', + 'Data' => $h->{hit_len} + }) if exists $h->{hit_len}; + $self->element({ 'Name' => 'Hit_id', + 'Data' => $h->{id} + }) if exists $h->{id}; + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $h->{acc} + }) if exists $h->{acc}; + $self->element({ 'Name' => 'Hit_def', + 'Data' => $h->{desc} + }) if exists $h->{desc}; + $self->element({'Name' => 'Hit_signif', + 'Data' => $h->{evalue} + }) if exists $h->{evalue}; + $self->element({'Name' => 'Hit_score', + 'Data' => $h->{bits} + }) if exists $h->{bits}; + + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} }) if exists $h->{'z-sc'}; + $self->element({'Name' => 'Hsp_evalue', 'Data' => $h->{evalue} }) if exists $h->{evalue}; + $self->element({'Name' => 'Hsp_bit-score', 'Data' => $h->{bits} }) if exists $h->{bits}; + $self->element({'Name' => 'Hsp_sw-score', 'Data' => $h->{sw} }) if exists $h->{sw}; + $self->element({'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} }) if exists $h->{'%_gid'}; + $self->element({'Name' => 'Hsp_identity', 'Data' => POSIX::ceil($h->{'%_id'} * $h->{alen}) }) + if (exists $h->{'%_id'} && exists $h->{alen}); + $self->element({'Name' => 'Hsp_positive', 'Data' => 100 * $h->{'%_id'} }) if exists $h->{'%_id'}; + $self->element({'Name' => 'Hsp_align-len', 'Data' => $h->{alen} }) if exists $h->{alen}; + $self->element({'Name' => 'Hsp_query-from', 'Data' => $h->{an0} }) if exists $h->{an0}; + $self->element({'Name' => 'Hsp_query-to', 'Data' => $h->{ax0} }) if exists $h->{ax0}; + $self->element({'Name' => 'Hsp_hit-from', 'Data' => $h->{an1} }) if exists $h->{an1}; + $self->element({'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1} }) if exists $h->{ax1}; + + $self->element({'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps} }) if exists $h->{qgaps}; + $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps} }) if exists $h->{lgaps}; + + if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) { + if( 8 == scalar grep { exists $h->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) { + if ($h->{ax0} < $h->{an0}) { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}" }); + } + if ($h->{ax1} < $h->{an1}) { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}" }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => $h->{lframe} }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $h->{lframe} }); + } + + $self->end_element({'Name' => 'Hsp'}); + $self->end_element({'Name' => 'Hit'}); + } + } + + $self->end_element({ 'Name' => 'FastaOutput'}); + return $self->end_document(); + } elsif( /^\s*\d+\s*>>>/) { + if ($self->within_element('FastaOutput')) { + if( $self->in_element('hsp') ) { + $self->end_element({'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({'Name' => 'Hit'}); + } + + if (@hit_signifs) { + # process remaining best hits + for my $h (@hit_signifs) { + $self->start_element({'Name' => 'Hit'}); + $self->element({ 'Name' => 'Hit_len', + 'Data' => $h->{hit_len} + }) if exists $h->{hit_len}; + $self->element({ 'Name' => 'Hit_id', + 'Data' => $h->{id} + }) if exists $h->{id}; + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $h->{acc} + }) if exists $h->{acc}; + $self->element({ 'Name' => 'Hit_def', + 'Data' => $h->{desc} + }) if exists $h->{desc}; + $self->element({'Name' => 'Hit_signif', + 'Data' => $h->{evalue} + }) if exists $h->{evalue}; + $self->element({'Name' => 'Hit_score', + 'Data' => $h->{bits} + }) if exists $h->{bits}; + + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} }) if exists $h->{'z-sc'}; + $self->element({'Name' => 'Hsp_evalue', 'Data' => $h->{evalue} }) if exists $h->{evalue}; + $self->element({'Name' => 'Hsp_bit-score', 'Data' => $h->{bits} }) if exists $h->{bits}; + $self->element({'Name' => 'Hsp_sw-score', 'Data' => $h->{sw} }) if exists $h->{sw}; + $self->element({'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} }) if exists $h->{'%_gid'}; + $self->element({'Name' => 'Hsp_identity', 'Data' => POSIX::ceil($h->{'%_id'} * $h->{alen}) }) + if (exists $h->{'%_id'} && exists $h->{alen}); + $self->element({'Name' => 'Hsp_positive', 'Data' => $h->{'%_id'} }) if exists $h->{'%_id'}; + $self->element({'Name' => 'Hsp_align-len', 'Data' => $h->{alen} }) if exists $h->{alen}; + $self->element({'Name' => 'Hsp_query-from', 'Data' => $h->{an0} }) if exists $h->{an0}; + $self->element({'Name' => 'Hsp_query-to', 'Data' => $h->{ax0} }) if exists $h->{ax0}; + $self->element({'Name' => 'Hsp_hit-from', 'Data' => $h->{an1} }) if exists $h->{an1}; + $self->element({'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1} }) if exists $h->{ax1}; + + $self->element({'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps} }) if exists $h->{qgaps}; + $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps} }) if exists $h->{lgaps}; + + if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) { + if( 8 == scalar grep { exists $h->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) { + if ($h->{ax0} < $h->{an0}) { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}" }); + } + if ($h->{ax1} < $h->{an1}) { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}" }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => $h->{lframe} }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $h->{lframe} }); + } + + $self->end_element({'Name' => 'Hsp'}); + $self->end_element({'Name' => 'Hit'}); + } + } + $self->end_element({ 'Name' => 'FastaOutput' }); + $self->_pushback($_); + return $self->end_document(); + } else { + $self->start_element({ 'Name' => 'FastaOutput' }); + $self->{'_result_count'}++; + $seentop = 1; + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'} }); + $self->element({ 'Name' => 'FastaOutput_version', + 'Data' => $self->{'_version'} }); + + my ($type, $querylen, $querytype, $querydef); + + if( /^\s*\d+\s*>>>(.*)/ ) { + $querydef = $1; + if($querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt)\s*$/o ) { + ($querydef, $querylen, $querytype) = ($1, $2, $3); + } + } + + if( $self->{'_reporttype'} && + $self->{'_reporttype'} eq 'FASTA' + ) { + if( $querytype eq 'nt') { + $self->{'_reporttype'} = 'FASTN' ; + } elsif( $querytype eq 'aa' ) { + $self->{'_reporttype'} = 'FASTP' ; + } + } + my ($name,$descr) = ($querydef =~ m/^(\S+)(?:\s+(.*))?\s*$/o); + $self->element({'Name' => 'FastaOutput_query-def', + 'Data' => $name}); + $self->element({'Name' => 'FastaOutput_querydesc', + 'Data' => $descr}); + if ($querylen) { + $self->element({'Name' => 'FastaOutput_query-len', + 'Data' => $querylen}); + } else { + $self->warn("unable to find and set query length"); + } + + + if( defined ($_ = $self->_readline()) && /^\s*vs\s+(\S+)/ ) { + $self->element({'Name' => 'FastaOutput_db', + 'Data' => $1}); + } + } + } elsif( $self->in_element('hsp' ) ) { + + my @data = ( '','',''); + my $count = 0; + my $len = $self->idlength + 1; + my ($seq1_id); + while( defined($_ ) ) { + chomp; + $self->debug( "$count $_\n"); + + if( /residues in \d+\s+query\s+sequences/) { + $self->_pushback($_); + last; + } elsif( /^>>/ ) { + $self->_pushback($_); + last; + } elsif (/^\s*\d+\s*>>>/) { + $self->_pushback($_); + last; + } + if( $count == 0 ) { + unless( /^\s+\d+/ || /^\s+$/) { + $self->_pushback($_); + $count = 2; + } + } elsif( $count == 1 || $count == 3 ) { + if( /^(\S+)\s+/ ) { + $len = CORE::length($1) if $len < CORE::length($1); + s/\s+$//; # trim trailing spaces,we don't want them + $data[$count-1] = substr($_,$len); + } elsif( /^\s+(\d+)/ ) { + $count = -1; + $self->_pushback($_); + } elsif( /^\s+$/ || length($_) == 0) { + $count = 5; + # going to skip these + } else { + $self->warn("Unrecognized alignment line ($count) '$_'"); + } + } elsif( $count == 2 ) { + if( /^\s+\d+\s+/ ) { + $self->warn("$_\n"); + $count = 4; + } else { + # toss the first IDLENGTH characters of the line + if( length($_) >= $len ) { + $data[$count-1] = substr($_,$len); + } + } + } + last if( $count++ >= 5); + $_ = $self->_readline(); + } + if( length($data[0]) > 0 || length($data[2]) > 0 ) { + $self->characters({'Name' => 'Hsp_qseq', + 'Data' => $data[0] }); + $self->characters({'Name' => 'Hsp_midline', + 'Data' => $data[1]}); + $self->characters({'Name' => 'Hsp_hseq', + 'Data' => $data[2]}); + } + } else { + if( ! $seentop ) { + $self->debug($_); + $self->warn("unrecognized FASTA Family report file!"); + return undef; + } + } + } +} + + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + if( my $type = $MODEMAP{$nm} ) { + $self->_mode($type); + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + unshift @{$self->{'_elements'}}, $type; + } + if($nm eq 'FastaOutput') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_mode'} = ''; + } + +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $rc; + # Hsp are sort of weird, in that they end when another + # object begins so have to detect this in end_element for now + if( $nm eq 'Hsp' ) { + foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { + $self->element({'Name' => $_, + 'Data' => $self->{'_last_hspdata'}->{$_}}); + } + $self->{'_last_hspdata'} = {} + } + + if( my $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_reporttype'}, + $self->{'_values'}); + } + shift @{$self->{'_elements'}}; + + } elsif( $MAPPING{$nm} ) { + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } else { + $self->warn( "unknown nm $nm, ignoring\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( $nm eq 'FastaOutput' ); + return $rc; + +} + +=head2 element + + Title : element + Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); + Function: Convience method that calls start_element, characters, end_element + Returns : none + Args : Hash ref with the keys 'Name' and 'Data' + + +=cut + +sub element{ + my ($self,$data) = @_; + $self->start_element($data); + $self->characters($data); + $self->end_element($data); +} + + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,$data) = @_; + + return unless ( defined $data->{'Data'} ); + if( $data->{'Data'} =~ /^\s+$/ ) { + return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/; + } + + if( $self->in_element('hsp') && + $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { + + $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; + } + + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 _mode + + Title : _mode + Usage : $obj->_mode($newval) + Function: + Example : + Returns : value of _mode + Args : newvalue (optional) + + +=cut + +sub _mode{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_mode'} = $value; + } + return $self->{'_mode'}; +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,$name) = @_; + return 0 if ( ! defined $name && + ! defined $self->{'_elements'} || + scalar @{$self->{'_elements'}} == 0) ; + foreach ( @{$self->{'_elements'}} ) { + if( $_ eq $name || $_ eq $MODEMAP{$name} ) { + return 1; + } + } + return 0; +} + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,$name) = @_; + return 0 if ! defined $self->{'_elements'}->[0]; + return ( $self->{'_elements'}->[0] eq $name || + (exists $MODEMAP{$name} && $self->{'_elements'}->[0] eq $MODEMAP{$name}) + ); +} + + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document + Function: Handles a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_mode'} = ''; + $self->{'_elements'} = []; +} + + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document + Function: Handles an end document event + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_document{ + my ($self,@args) = @_; + return $self->{'_result'}; +} + +=head2 idlength + + Title : idlength + Usage : $obj->idlength($newval) + Function: Internal storage of the length of the ID desc + in the HSP alignment blocks. Defaults to + $IDLENGTH class variable value + Returns : value of idlength + Args : newvalue (optional) + + +=cut + +sub idlength{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_idlength'} = $value; + } + return $self->{'_idlength'} || $IDLENGTH; +} + + +=head2 result_count + + Title : result_count + Usage : my $count = $searchio->result_count + Function: Returns the number of results we have processed + Returns : integer + Args : none + + +=cut + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/hmmer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/hmmer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,858 @@ +# $Id: hmmer.pm,v 1.13.2.3 2003/08/07 15:40:36 jason Exp $ +# +# BioPerl module for Bio::SearchIO::hmmer +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::hmmer - A parser for HMMER output (hmmpfam, hmmsearch) + +=head1 SYNOPSIS + + # do not use this class directly it is available through Bio::SearchIO + use Bio::SearchIO; + my $in = new Bio::SearchIO(-format => 'hmmer', + -file => 't/data/L77119.hmmer'); + while( my $result = $in->next_result ) { + # this is a Bio::Search::Result::HMMERResult object + print $result->query_name(), " for HMM ", $result->hmm_name(), "\n"; + while( my $hit = $result->next_hit ) { + print $hit->name(), "\n"; + while( my $hsp = $hit->next_hsp ) { + print "length is ", $hsp->length(), "\n"; + } + } + } + +=head1 DESCRIPTION + +This object implements a parser for HMMER output. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::SearchIO::hmmer; +use vars qw(@ISA); +use strict; +use vars qw(@ISA %MAPPING %MODEMAP + $DEFAULT_HSP_FACTORY_CLASS + $DEFAULT_HIT_FACTORY_CLASS + $DEFAULT_RESULT_FACTORY_CLASS + ); +use Bio::SearchIO; +use Bio::Factory::ObjectFactory; + +@ISA = qw(Bio::SearchIO ); + +BEGIN { + # mapping of HMMER items to Bioperl hash keys + %MODEMAP = ('HMMER_Output' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); + + %MAPPING = ( + 'Hsp_bit-score' => 'HSP-bits', + 'Hsp_score' => 'HSP-score', + 'Hsp_evalue' => 'HSP-evalue', + 'Hsp_query-from' => 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_positive' => 'HSP-conserved', + 'Hsp_identity' => 'HSP-identical', + 'Hsp_gaps' => 'HSP-hsp_gaps', + 'Hsp_hitgaps' => 'HSP-hit_gaps', + 'Hsp_querygaps' => 'HSP-query_gaps', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_align-len' => 'HSP-hsp_length', + 'Hsp_query-frame'=> 'HSP-query_frame', + 'Hsp_hit-frame' => 'HSP-hit_frame', + + 'Hit_id' => 'HIT-name', + 'Hit_len' => 'HIT-length', + 'Hit_accession' => 'HIT-accession', + 'Hit_desc' => 'HIT-description', + 'Hit_signif' => 'HIT-significance', + 'Hit_score' => 'HIT-score', + + 'HMMER_program' => 'RESULT-algorithm_name', + 'HMMER_version' => 'RESULT-algorithm_version', + 'HMMER_query-def' => 'RESULT-query_name', + 'HMMER_query-len' => 'RESULT-query_length', + 'HMMER_query-acc' => 'RESULT-query_accession', + 'HMMER_querydesc' => 'RESULT-query_description', + 'HMMER_hmm' => 'RESULT-hmm_name', + 'HMMER_seqfile' => 'RESULT-sequence_file', + ); + $DEFAULT_HIT_FACTORY_CLASS = 'Bio::Factory::HMMERHitFactory'; + $DEFAULT_HSP_FACTORY_CLASS = 'Bio::Factory::HMMERHSPFactory'; + $DEFAULT_RESULT_FACTORY_CLASS = 'Bio::Factory::HMMERResultFactory'; +} + + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::hmmer(); + Function: Builds a new Bio::SearchIO::hmmer object + Returns : Bio::SearchIO::hmmer + Args : -fh/-file => HMMER filename + -format => 'hmmer' + +=cut + +sub _initialize { + my ($self,@args) = @_; + $self->SUPER::_initialize(@args); + my $handler = $self->_eventHandler; + # new object initialization code + $handler->register_factory('result', + Bio::Factory::ObjectFactory->new( + -type => 'Bio::Search::Result::HMMERResult', + -interface => 'Bio::Search::Result::ResultI')); + + $handler->register_factory('hit', + Bio::Factory::ObjectFactory->new( + -type => 'Bio::Search::Hit::HMMERHit', + -interface => 'Bio::Search::Hit::HitI')); + + $handler->register_factory('hsp', + Bio::Factory::ObjectFactory->new( + -type => 'Bio::Search::HSP::HMMERHSP', + -interface => 'Bio::Search::HSP::HSPI')); + $self->{'_hmmidline'} = 'HMMER 2.2g (August 2001)'; +} +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result{ + my ($self) = @_; + my $seentop = 0; + my $reporttype; + my ($last,@hitinfo,@hspinfo,%hspinfo,%hitinfo); + + my @alignemnt_lines; + + $self->start_document(); + while( defined ($_ = $self->_readline )) { + my $lineorig = $_; + chomp; + if( /^HMMER\s+(\S+)\s+\((.+)\)/o ) { + my ($prog,$version) = split; + if( $seentop ) { + $self->_pushback($_); + $self->end_element({'Name' => 'HMMER_Output'}); + return $self->end_document(); + } + $self->{'_hmmidline'} = $_; + $self->start_element({'Name' => 'HMMER_Output'}); + $self->{'_result_count'}++; + $seentop = 1; + if( defined $last ) { + ($reporttype) = split(/\s+/,$last); + $self->element({'Name' => 'HMMER_program', + 'Data' => uc ($reporttype)}); + } + $self->element({'Name' => 'HMMER_version', + 'Data' => $version}); + } elsif( s/^HMM file:\s+//o ) { + $self->{'_hmmfileline'} = $lineorig; + $self->element({'Name' => 'HMMER_hmm', + 'Data' => $_}); + } elsif( s/^Sequence\s+(file|database):\s+//o ) { + $self->{'_hmmseqline'} = $lineorig; + $self->element({'Name' => 'HMMER_seqfile', + 'Data' => $_}); + } elsif( s/^Query(\s+(sequence|HMM))?:\s+//o) { + if( ! $seentop ) { + # we're in a multi-query report + $self->_pushback($self->{'_hmmidline'}); + $self->_pushback($self->{'_hmmfileline'}); + $self->_pushback($self->{'_hmmseqline'}); + $self->_pushback($lineorig); + next; + } + s/\s+$//; + $self->element({'Name' => 'HMMER_query-def', + 'Data' => $_}); + } elsif( s/^Accession:\s+//o ) { + s/\s+$//; + $self->element({'Name' => 'HMMER_query-acc', + 'Data' => $_}); + } elsif( s/^Description:\s+//o ) { + s/\s+$//; + $self->element({'Name' => 'HMMER_querydesc', + 'Data' => $_}); + } elsif( defined $self->{'_reporttype'} && + $self->{'_reporttype'} eq 'HMMSEARCH' ) { + # PROCESS HMMSEARCH RESULTS HERE + if( /^Scores for complete sequences/o ) { + while( defined($_ = $self->_readline) ) { + last if( /^\s+$/); + next if( /^Sequence\s+Description/o || /^\-\-\-/o ); + my @line = split; + my ($name, $n,$evalue,$score)= (shift @line, + pop @line, + pop @line, + pop @line); + my $desc = join(' ', @line); + push @hitinfo, [ $name, $desc,$evalue,$score]; + $hitinfo{$name} = $#hitinfo; + } + } elsif( /^Parsed for domains:/o ) { + @hspinfo = (); + + while( defined($_ = $self->_readline) ) { + last if( /^\s+$/); + next if( /^(Model|Sequence)\s+Domain/o || /^\-\-\-/o ); + if( my ($n,$domainnum,$domainct, @vals) = + (m!^(\S+)\s+ # host name + (\d+)/(\d+)\s+ # num/num (ie 1 of 2) + (\d+)\s+(\d+).+? # sequence start and end + (\d+)\s+(\d+)\s+ # hmm start and end + \S+\s+ # [] + (\S+)\s+ # score + (\S+) # evalue + \s*$!ox ) ) { + # array lookup so that we can get rid of things + # when they've been processed + my $info = $hitinfo[$hitinfo{$n}]; + if( !defined $info ) { + $self->warn("Incomplete Sequence information, can't find $n hitinfo says $hitinfo{$n}"); + next; + } + push @hspinfo, [ $n, @vals]; + } + } + } elsif( /^Alignments of top/o ) { + my ($prelength,$lastdomain,$count,$width); + $count = 0; + my %domaincounter; + my $second_tier=0; + while( defined($_ = $self->_readline) ) { + next if( /^Align/o || + /^\s+RF\s+[x\s]+$/o); + if( /^Histogram/o || m!^//!o ) { + if( $self->in_element('hsp')) { + $self->end_element({'Name' => 'Hsp'}); + } + if( $self->within_element('hit')) { + $self->end_element({'Name' => 'Hit'}); + } + last; + } + chomp; + + if( /^\s*(.+):\s+domain\s+(\d+)\s+of\s+(\d+)\,\s+from\s+(\d+)\s+to\s+(\d+)/o ) { + my ($name,$domainct,$domaintotal, + $from,$to) = ($1,$2,$3,$4,$5); + $domaincounter{$name}++; + if( ! defined $lastdomain || $lastdomain ne $name ) { + if( $self->within_element('hit') ) { + if( $self->within_element('hsp') ) { + $self->end_element({'Name' => 'Hsp'}); + } + $self->end_element({'Name' => 'Hit'}); + } + $self->start_element({'Name' => 'Hit'}); + my $info = [@{$hitinfo[$hitinfo{$name}] || $self->throw("Could not find hit info for $name: Insure that your database contains only unique sequence names")}]; + if( $info->[0] ne $name ) { + $self->throw("Somehow the Model table order does not match the order in the domains (got ".$info->[0].", expected $name)"); + } + $self->element({'Name' => 'Hit_id', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_desc', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_signif', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_score', + 'Data' => shift @{$info}}); + } + $self->end_element({'Name' => 'Hsp'}) + if $self->in_element('hsp'); + + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => 0}); + $self->element({'Name' => 'Hsp_positive', + 'Data' => 0}); + my $HSPinfo = shift @hspinfo; + my $id = shift @$HSPinfo; + if( $id ne $name ) { + $self->throw("Somehow the domain list details do not match the table (got $id, expected $name)"); + } + if( $domaincounter{$name} == $domaintotal) { + $hitinfo[$hitinfo{$name}] = undef; + } + + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_query-from', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_score', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_evalue', + 'Data' => shift @$HSPinfo}); + $lastdomain = $name; + } else { + # Might want to change this so that it + # accumulates all the of the alignment lines into + # three array slots and then tests for the + # end of the line + if( /^(\s+\*\-\>)(\S+)/o ) { # start of domain + $prelength = CORE::length($1); + $width = 0; + # $width = CORE::length($2); + $self->element({'Name' =>'Hsp_qseq', + 'Data' => $2}); + $count = 0; + $second_tier = 0; + } elsif ( /^(\s+)(\S+)\<\-\*\s*$/o ) { #end of domain + $self->element({'Name' =>'Hsp_qseq', + 'Data' => $2}); + $width = CORE::length($2); + $count = 0; + } elsif( ($count != 1 && /^\s+$/o) || + CORE::length($_) == 0 || + /^\s+\-?\*\s*$/ ) { + next; + } elsif( $count == 0 ) { + $prelength -= 3 unless ($second_tier++); + unless( defined $prelength) { + # $self->warn("prelength not set"); + next; + } + $self->element({'Name' => 'Hsp_qseq', + 'Data' => substr($_,$prelength)}); + } elsif( $count == 1) { + if( ! defined $prelength ) { + $self->warn("prelength not set"); + } + if( $width ) { + $self->element({'Name' => 'Hsp_midline', + 'Data' => substr($_, + $prelength, + $width)}); + } else { + $self->debug( "midline is $_\n") if( CORE::length($_) <= $prelength && $self->verbose > 0); + $self->element({'Name' => 'Hsp_midline', + 'Data' => substr($_, + $prelength)}); + } + } elsif( $count == 2) { + if( /^\s+(\S+)\s+(\d+|\-)\s+(\S*)\s+(\d+|\-)/o ) { + $self->element({'Name' => 'Hsp_hseq', + 'Data' => $3}); + } else { + $self->warn("unrecognized line: $_\n"); + } + } + $count = 0 if $count++ >= 2; + } + } + } elsif( /^Histogram/o || m!^//!o ) { + + while( my $HSPinfo = shift @hspinfo ) { + my $id = shift @$HSPinfo; + my $info = [@{$hitinfo[$hitinfo{$id}]}]; + next unless defined $info; + $self->start_element({'Name' => 'Hit'}); + $self->element({'Name' => 'Hit_id', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_desc', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_signif', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_score', + 'Data' => shift @{$info}}); + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_query-from', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_score', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_evalue', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => 0}); + $self->element({'Name' => 'Hsp_positive', + 'Data' => 0}); + $self->element({'Name' => 'Hsp_positive', + 'Data' => 0}); + $self->end_element({'Name' => 'Hsp'}); + $self->end_element({'Name' => 'Hit'}); + } + @hitinfo = (); + %hitinfo = (); + last; + } + } elsif( defined $self->{'_reporttype'} && + $self->{'_reporttype'} eq 'HMMPFAM' ) { + if( /^Scores for sequence family/o ) { + while( defined($_ = $self->_readline) ) { + last if( /^\s+$/); + next if( /^Model\s+Description/o || /^\-\-\-/o ); + chomp; + my @line = split; + my ($model,$n,$evalue,$score) = (shift @line, + pop @line, + pop @line, + pop @line); + my $desc = join(' ', @line); + push @hitinfo, [ $model, $desc,$score,$evalue,$n]; + $hitinfo{$model} = $#hitinfo; + } + } elsif( /^Parsed for domains:/o ) { + @hspinfo = (); + while( defined($_ = $self->_readline) ) { + last if( /^\s+$/); + next if( /^Model\s+Domain/o || /^\-\-\-/o ); + chomp; + if( my ($n,$domainnum,$domainct,@vals) = + (m!^(\S+)\s+ # domain name + \s+(\d+)/(\d+)\s+ # domain num out of num + (\d+)\s+(\d+).+? # seq start, end + (\d+)\s+(\d+)\s+ # hmm start, end + \S+\s+ # [] + (\S+)\s+ # score + (\S+) # evalue + \s*$!ox ) ) { + my $hindex = $hitinfo{$n}; + if( ! defined $hindex ) { + push @hitinfo, [ $n, '',$vals[5],$vals[6], + $domainct]; + $hitinfo{$n} = $#hitinfo; + $hindex = $#hitinfo; + } + my $info = $hitinfo[$hindex]; + if( ! defined $info ) { + $self->warn("incomplete Domain information, can't find $n hitinfo says $hitinfo{$n}"); + next; + } + push @hspinfo, [ $n, @vals]; + } + } + } elsif( /^Alignments of top/o ) { + my ($prelength,$lastdomain,$count,$width); + $count = 0; + my $second_tier=0; + while( defined($_ = $self->_readline) ) { + next if( /^Align/o || /^\s+RF\s+[x\s]+$/o); + + if( /^Histogram/o || m!^//!o || /^Query sequence/o ) { + if( $self->in_element('hsp')) { + $self->end_element({'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({'Name' => 'Hit'}); + } + $self->end_element({'Name' => 'HMMER_Output'}); + if( /^Query sequence/o ) { $self->_pushback($_); } + return $self->end_document(); + last; + } + chomp; + if( m/(\S+):.*from\s+(\d+)\s+to\s+(\d+)/o ) { + my ($name,$from,$to) = ($1,$2,$3); + if( ! defined $lastdomain || $lastdomain ne $name ) { + + if( $self->within_element('hit') ) { + if( $self->in_element('hsp') ) { + $self->end_element({'Name' => 'Hsp'}); + } + $self->end_element({'Name' => 'Hit'}); + } + $self->start_element({'Name' => 'Hit'}); + my $info = [@{$hitinfo[$hitinfo{$name}]}]; + if( ! defined $info || + $info->[0] ne $name ) { + $self->warn("Somehow the Model table order does not match the order in the domains (got ".$info->[0].", expected $name). We're +back loading this from the alignment information instead"); + $info = [$name, '', + /score\s+([^,\s]+),\s+E\s+=\s+(\S+)/ox]; + push @hitinfo, $info; + $hitinfo{$name} = $#hitinfo; + } + + $self->element({'Name' => 'Hit_id', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_desc', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_score', + 'Data' => shift @{$info}}); + $self->element({'Name' => 'Hit_signif', + 'Data' => shift @{$info}}); + } + if( $self->within_element('hsp') ) { + #if( defined $lastdomain ) { + $self->end_element({'Name' => 'Hsp'}); + } + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => 0}); + $self->element({'Name' => 'Hsp_positive', + 'Data' => 0}); + my $HSPinfo = shift @hspinfo; + my $id = shift @$HSPinfo; + + if( $id ne $name ) { + $self->throw("Somehow the domain list details do not match the table (got $id, expected $name)"); + } + $self->element({'Name' => 'Hsp_query-from', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_score', + 'Data' => shift @$HSPinfo}); + $self->element({'Name' => 'Hsp_evalue', + 'Data' => shift @$HSPinfo}); + + $lastdomain = $name; + } else { + if( /^(\s+\*\-\>)(\S+)/o ) { # start of domain + $prelength = CORE::length($1); + $width = 0; + # $width = CORE::length($2); + $self->element({'Name' =>'Hsp_hseq', + 'Data' => $2}); + $count = 0; + $second_tier = 0; + } elsif ( /^(\s+)(\S+)\<\-?\*?\s*$/o ) { #end of domain + $prelength -= 3 unless ($second_tier++); + $self->element({'Name' =>'Hsp_hseq', + 'Data' => $2}); + $width = CORE::length($2); + $count = 0; + } elsif( CORE::length($_) == 0 || + ($count != 1 && /^\s+$/o) || + /^\s+\-?\*\s*$/ ) { + next; + } elsif( $count == 0 ) { + $prelength -= 3 unless ($second_tier++); + unless( defined $prelength) { + # $self->warn("prelength not set"); + next; + } + $self->element({'Name' => 'Hsp_hseq', + 'Data' => substr($_,$prelength)}); + } elsif( $count == 1 ) { + if( ! defined $prelength ) { + $self->warn("prelength not set"); + } + if( $width ) { + $self->element({'Name' => 'Hsp_midline', + 'Data' => substr($_,$prelength,$width)}); + } else { + $self->element({'Name' => 'Hsp_midline', + 'Data' => substr($_,$prelength)}); + } + } elsif( $count == 2 ) { + if( /^\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)/o || + /^\s+(\S+)\s+(\-)\s+(\S*)\s+(\-)/o + ) { + $self->element({'Name' => 'Hsp_qseq', + 'Data' => $3}); + } else { + $self->warn("unrecognized line ($count): $_\n"); + } + } + $count = 0 if $count++ >= 2; + } + } + } else { + $self->debug($_); + } + } + $last = $_; + } + + $self->end_element({'Name' => 'HMMER_Output'}) unless ! $seentop; + return $self->end_document(); +} + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + my $type = $MODEMAP{$nm}; + if( $type ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + unshift @{$self->{'_elements'}}, $type; + } + if(defined $type && + $type eq 'result') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + } +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $type = $MODEMAP{$nm}; + my $rc; + + if($nm eq 'HMMER_program' ) { + if( $self->{'_last_data'} =~ /(HMM\S+)/i ) { + $self->{'_reporttype'} = uc $1; + } + } + # Hsp are sort of weird, in that they end when another + # object begins so have to detect this in end_element for now + if( $nm eq 'Hsp' ) { + foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { + $self->element({'Name' => $_, + 'Data' => $self->{'_last_hspdata'}->{$_}}); + } + $self->{'_last_hspdata'} = {}; + } + if( $type ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_reporttype'}, + $self->{'_values'}); + } + my $lastelem = shift @{$self->{'_elements'}}; + } elsif( $MAPPING{$nm} ) { + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } else { + $self->debug( "unknown nm $nm, ignoring\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( defined $type && $type eq 'result' ); + return $rc; +} + +=head2 element + + Title : element + Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); + Function: Convience method that calls start_element, characters, end_element + Returns : none + Args : Hash ref with the keys 'Name' and 'Data' + + +=cut + +sub element{ + my ($self,$data) = @_; + $self->start_element($data); + $self->characters($data); + $self->end_element($data); +} + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,$data) = @_; + + if( $self->in_element('hsp') && + $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/o && + defined $data->{'Data'} ) { + $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; + } + return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o ); + + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,$name) = @_; + return 0 if ( ! defined $name || + ! defined $self->{'_elements'} || + scalar @{$self->{'_elements'}} == 0) ; + foreach ( @{$self->{'_elements'}} ) { + return 1 if( $_ eq $name ); + } + return 0; +} + + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'within' because 'in' only + tests its immediete parent. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,$name) = @_; + return 0 if ! defined $self->{'_elements'}->[0]; + return ( $self->{'_elements'}->[0] eq $name) +} + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document + Function: Handle a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_elements'} = []; +} + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document + Function: Handles an end document event + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_document{ + my ($self) = @_; + return $self->{'_result'}; +} + +=head2 result_count + + Title : result_count + Usage : my $count = $searchio->result_count + Function: Returns the number of results we have processed + Returns : integer + Args : none + + +=cut + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/psiblast.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/psiblast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1181 @@ +#------------------------------------------------------------------ +# $Id: psiblast.pm,v 1.17 2002/12/24 15:48:41 jason Exp $ +# +# BioPerl module Bio::SearchIO::psiblast +# +# Cared for by Steve Chervitz <sac@bioperl.org> +# +# You may distribute this module under the same terms as perl itself +#------------------------------------------------------------------ + +=head1 NAME + +Bio::SearchIO::psiblast - Parser for traditional BLAST and PSI-BLAST reports + +=head1 SYNOPSIS + + use Bio::SearchIO; + + my $in = Bio::SearchIO->new( -format => 'psiblast', + -file => 'report.blastp' ); + + while ( my $blast = $in->next_result() ) { + foreach my $hit ( $blast->hits ) { + print "Hit: $hit\n"; + } + } + + # BLAST hit filtering function. All hits of each BLAST report must satisfy + # this criteria to be retained. If a hit fails this test, it is ignored. + # If all hits of a report fail, the report will be considered hitless. + # But we can distinguish this from the case where there were no + # hits in the report by testing the function $blast->no_hits_found(). + + my $filt_func = sub{ my $hit=shift; + $hit->frac_identical('query') >= 0.5 + && $hit->frac_aligned_query >= 0.50 + }; + + # Not supplying a -file or -fh parameter means read from STDIN + + my $in2 = Bio::SearchIO->new( -format => 'psiblast', + -hit_filter => $filt_func + ); + + +=head1 DESCRIPTION + +This module parses BLAST and PSI-BLAST reports and acts as a factory for +objects that encapsulate BLAST results: +L<Bio::Search::Result::BlastResult>, L<Bio::Search::Hit::BlastHit>, +L<Bio::Search::HSP::BlastHSP>. + +This module does not parse XML-formatted BLAST reports. +See L<Bio::SearchIO::blastxml|Bio::SearchIO::blastxml> if you need to do that. + +To use this module, the only module you need to C<use> is +Bio::SearchIO.pm. SearchIO knows how to load this module when you +supply a C<-format =E<gt> 'psiblast'> parameters to its C<new>() +function. For more information about the SearchIO system, see +documentation in Bio::SearchIO.pm. + +=head2 PSI-BLAST Support + +In addition to BLAST1 and BLAST2 reports, this module can also handle +PSI-BLAST reports. When accessing the set of Hits in a result, hits +from different iterations are lumped together but can be distinguished by +interrogating L<Bio::Search::Hit::BlastHit::iteration> and +L<Bio::Search::Hit::BlastHit::found_again>. + +If you want to collect hits only from a certain iteration during parsing, +supply a function using the C<-HIT_FILTER> parameter. + +=head1 EXAMPLES + +To get a feel for how to use this, have look at scripts in the +B<examples/searchio> and B<examples/searchio/writer> directory of the Bioperl +distribution as well as the test script B<t/SearchIO.t>. + +=head1 SEE ALSO + +For more documentation about working with Blast result objects that are +produced by this parser, see L<Bio::Search::Result::BlastResult>, +L<Bio::Search::Hit::BlastHit>, L<Bio::Search::HSP::BlastHSP>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz E<lt>sac@bioperl.orgE<gt> + +See L<the FEEDBACK section | FEEDBACK> for where to send bug reports and comments. + +=head1 ACKNOWLEDGEMENTS + +I would like to acknowledge my colleagues at Affymetrix for useful +feedback. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::psiblast; + +use strict; +use vars qw( @ISA + $MAX_HSP_OVERLAP + $DEFAULT_MATRIX + $DEFAULT_SIGNIF + $DEFAULT_SCORE + $DEFAULT_BLAST_WRITER_CLASS + $DEFAULT_HIT_FACTORY_CLASS + $DEFAULT_RESULT_FACTORY_CLASS + ); + +use Bio::SearchIO; +use Bio::Search::Result::BlastResult; +use Bio::Factory::BlastHitFactory; +use Bio::Factory::BlastResultFactory; +use Bio::Tools::StateMachine::IOStateMachine qw($INITIAL_STATE $FINAL_STATE); + +@ISA = qw( Bio::SearchIO + Bio::Tools::StateMachine::IOStateMachine ); + +$MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs. +$DEFAULT_MATRIX = 'BLOSUM62'; +$DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied. +$DEFAULT_SCORE = 0; # Value used as score cutoff if none supplied. +$DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; +$DEFAULT_HIT_FACTORY_CLASS = 'Bio::Factory::BlastHitFactory'; +$DEFAULT_RESULT_FACTORY_CLASS = 'Bio::Factory::BlastResultFactory'; + +my %state = ( + 'header' => 'Header', + 'descr' => 'Descriptions', + 'align' => 'Alignment', + 'footer' => 'Footer', + 'iteration' => 'Iteration', # psiblast + 'nohits' => 'No Hits' + ); + +# These are the expected transitions assuming a "normal" report (Blast2 or PSI-Blast). +my @state_transitions = ( [ $INITIAL_STATE, $state{'header'}], + [ $state{'header'}, $state{'descr'} ], + [ $state{'header'}, $state{'iteration'} ], + [ $state{'iteration'}, $state{'descr'} ], + [ $state{'iteration'}, $state{'nohits'} ], + [ $state{'descr'}, $state{'align'} ], + [ $state{'align'}, $state{'align'} ], + [ $state{'align'}, $state{'footer'} ], + [ $state{'align'}, $state{'iteration'} ], # psiblast + [ $state{'nohits'}, $state{'iteration'} ], # psiblast + [ $state{'nohits'}, $state{'footer'} ], + [ $state{'footer'}, $state{'header'} ], + [ $state{'footer'}, $FINAL_STATE] + ); + +my $current_iteration; # psiblast + +=head2 new + + Usage : Bio::SearchIO::psiblast->new( %named_parameters ) + Purpose : Parse traditional BLAST or PSI-BLAST data a file or input stream. + : Handles Blast1, Blast2, NCBI and WU Blast reports. + : Populates Bio::Search:: objects with data extracted from the report. + : (The exact type of Bio::Search objects depends on the type of + : Bio::Factory::ResultFactory and Bio::Factory::HitFactory you hook up + : to the SearchIO object.) + Returns : Bio::SearchIO::psiblast object. + Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). + : These are in addition to those specified by Bio::SearchIO::new() (see). + : -SIGNIF => number (float or scientific notation number to be used + : as a P- or Expect value cutoff; default = 999.) + : -SCORE => number (integer or scientific notation number to be used + : as a score value cutoff; default = 0.) + : -HIT_FILTER => func_ref (reference to a function to be used for + : filtering out hits based on arbitrary criteria. + : This function should take a + : Bio::Search::Hit::BlastHit.pm object as its first + : argument and return a boolean value, + : true if the hit should be filtered out). + : Sample filter function: + : -HIT_FILTER => sub { $hit = shift; + : $hit->gaps == 0; }, + : Historical note: This parameter was formerly + called -FILT_FUNC in the older + Bio::Tools::Blast::parse method. Using + -FILT_FUNC will still work for backward + compatibility. + : -CHECK_ALL_HITS => boolean (check all hits for significance against + : significance criteria. Default = false. + : If false, stops processing hits after the first + : non-significant hit or the first hit that fails + : the hit_filter call. This speeds parsing, + : taking advantage of the fact that the hits + : are processed in the order they are ranked.) + : -MIN_LEN => integer (to be used as a minimum query sequence length + : sequences below this length will not be processed). + : default = no minimum length). + : -STATS => boolean (collect key statistical parameters for the report: + : matrix, filters, etc. default = false). + : This requires extra parsing + : so if you aren't interested in this info, don't + : set this parameter. Note that the unparsed + : parameter section of a Blast report is always + : accessible via $blast->raw_statistics(). + : -BEST => boolean (only process the best hit of each report; + : default = false). + : -OVERLAP => integer (the amount of overlap to permit between + : adjacent HSPs when tiling HSPs. A reasonable value is 2. + : Default = $Bio::SearchIO::psiblast::MAX_HSP_OVERLAP) + : -HOLD_RAW_DATA => boolean (store the raw alignment sections for each hit. + : used with the -SHALLOW_PARSE option). + : -SHALLOW_PARSE => boolean (only minimal parsing; does not parse HSPs. + : Hit data is limited to what can be obtained + : from the description line. + : Replaces the older NO_ALIGNS option.) + : + : + Comments : Do NOT remove the HTML from an HTML-formatted Blast report by using the + : "Save As" option of a web browser to save it as text. This renders the + : report unparsable. + Throws : An exception will be thrown if a BLAST report contains a FATAL: error. + +=cut + +sub new { + my ($class, %args) = @_; + + # TODO: Resolve this issue: + # Temporary hack to allow factory-based and non-factory based + # SearchIO objects co-exist. + # steve --- Sat Dec 22 04:41:20 2001 + $args{-USE_FACTORIES} = 1; + + my $self = $class->Bio::SearchIO::new(%args); + $self->_init_state_machine( %args, -transition_table => \@state_transitions); + + $self->_init_parse_params( %args ); + + $self->pause_between_reports( 1 ); + + $self->{'_result_count'} = 0; + + return $self; +} + + +sub default_result_factory_class { + my $self = shift; + return $DEFAULT_RESULT_FACTORY_CLASS; +} + +sub default_hit_factory_class { + my $self = shift; + return $DEFAULT_HIT_FACTORY_CLASS; +} + +sub check_for_new_state { + my ($self) = @_; + + # Ignore blank lines + my $chunk = $self->SUPER::check_for_new_state(1); + + my $newstate = undef; + + # End the run if there's no more input. + if( ! $chunk ) { + return $self->final_state; + } + $self->clear_state_change_cache; + my $curr_state = $self->current_state; + + if( $chunk =~ /^(<.*>)?T?BLAST[NPX]/ ) { + $newstate = $state{header}; + $self->state_change_cache( $chunk ); + } + + elsif ($chunk =~ /^Sequences producing/ ) { + $newstate = $state{descr}; + $self->state_change_cache( $chunk ); + } + + elsif ($chunk =~ /No hits found/i ) { + $newstate = $state{nohits}; + $self->state_change_cache( $chunk ); + } + + elsif ($chunk =~ /^\s*Searching/ ) { + $newstate = $state{iteration}; + } + + elsif ($chunk =~ /^>(.*)/ ) { + $newstate = $state{align}; + $self->state_change_cache( "$1\n" ); + } + + elsif ($chunk =~ /^(CPU time|Parameters):/ ) { + $newstate = $state{footer}; + $self->state_change_cache( $chunk ); + } + + # Necessary to distinguish " Database:" lines that start a footer section + # from those that are internal to a footer section. + elsif ($chunk =~ /^\s+Database:/ && $curr_state ne $state{'footer'}) { + $newstate = $state{footer}; + $self->state_change_cache( $chunk ); + } + + if( $curr_state ne $INITIAL_STATE and not $newstate ) { +# print "Appending input cache with ($curr_state): $chunk\n"; + $self->append_input_cache( $chunk ); + } + + return $newstate; +} + + +sub change_state { + my ($self, $state) = @_; + + my $from_state = $self->current_state; + my $verbose = $self->verbose; + $verbose and print STDERR ">>>>> Changing state from $from_state to $state\n"; + + if ( $self->validate_transition( $from_state, $state ) ) { + + # Now we know the current state is complete + # and all data from it is now in the input cache. + my @data = $self->get_input_cache(); + +# if($from_state eq $state{iteration} ) { +# do{ +# print STDERR "State change cache: ", $self->state_change_cache, "\n"; +# print STDERR "Input cache ($from_state):\n@data\n\n"; +# }; +# } + + # Now we need to process the input cache data. + # Remember, at this point, the current state is the "from" state + # of the state transition. The "to" state doesn't get set until + # the finalize_state_change() call at the end of this method. + + if($from_state eq $state{header} ) { + $self->_process_header( @data ); + } + elsif($from_state eq $state{descr} ) { + $self->_process_descriptions( @data ); + } + elsif($from_state eq $state{iteration} ) { + $self->_process_iteration( @data, $self->state_change_cache() ); + } + elsif($from_state eq $state{align} ) { + $self->_process_alignment( @data ); + } + elsif($from_state eq $state{footer} ) { + my $ok_to_pause = not $state eq $self->final_state; + $self->_process_footer( $ok_to_pause, @data ); + } + + $self->finalize_state_change( $state, 1 ); + } +} + + +sub _add_error { + my ($self, $err) = @_; + if( $err ) { + push @{$self->{'_blast_errs'}}, $err; + } +} + +sub _clear_errors { + my $self = shift; + $self->{'_blast_errs'} = undef; +} + +#--------- +sub errors { +#--------- + my $self = shift; + my @errs = (); + @errs = @{$self->{'_blast_errs'}} if ref $self->{'_blast_errs'}; + return @errs; +} + + +#---------------------- +sub _init_parse_params { +#---------------------- +#Initializes parameters used during parsing of Blast reports. + + my ( $self, @param ) = @_; + # -FILT_FUNC has been replaced by -HIT_FILTER. + # Leaving -FILT_FUNC in place for backward compatibility + my($signif, $filt_func, $hit_filter, $min_len, $check_all, + $gapped, $score, $overlap, $stats, $best, $shallow_parse, + $hold_raw) = + $self->_rearrange([qw(SIGNIF FILT_FUNC HIT_FILTER MIN_LEN + CHECK_ALL_HITS GAPPED SCORE + OVERLAP STATS BEST SHALLOW_PARSE HOLD_RAW_DATA)], @param); + + $self->{'_hit_filter'} = $hit_filter || $filt_func || 0; + $self->{'_check_all'} = $check_all || 0; + $self->{'_shallow_parse'} = $shallow_parse || 0; + $self->{'_hold_raw_data'} = $hold_raw || 0; + + $self->_set_signif($signif, $min_len, $self->{'_hit_filter'}, $score); + $self->best_hit_only($best) if $best; + $self->{'_blast_count'} = 0; + + $self->{'_collect_stats'} = defined($stats) ? $stats : 0; + + # TODO: Automatically determine whether gapping was used. + # e.g., based on version number. Otherwise, have to read params. + $self->{'_gapped'} = $gapped || 1; + + # Clear any errors from previous parse. + $self->_clear_errors; + undef $self->{'_hit_count'}; + undef $self->{'_num_hits_significant'}; +} + +#=head2 _set_signif +# +# Usage : n/a; called automatically by _init_parse_params() +# Purpose : Sets significance criteria for the BLAST object. +# Argument : Obligatory three arguments: +# : $signif = float or sci-notation number or undef +# : $min_len = integer or undef +# : $hit_filter = function reference or undef +# : +# : If $signif is undefined, a default value is set +# : (see $DEFAULT_SIGNIF; min_length = not set). +# Throws : Exception if significance value is defined but appears +# : out of range or invalid. +# : Exception if $hit_filter if defined and is not a func ref. +# Comments : The significance of a BLAST report can be based on +# : the P (or Expect) value and/or the length of the query sequence. +# : P (or Expect) values GREATER than '_max_significance' are not significant. +# : Query sequence lengths LESS than '_min_length' are not significant. +# : +# : Hits can also be screened using arbitrary significance criteria +# : as discussed in the parse() method. +# : +# : If no $signif is defined, the '_max_significance' level is set to +# : $DEFAULT_SIGNIF (999). +# +#See Also : L<signif>(), L<min_length>(), L<_init_parse_params>(), L<parse>() +# +#=cut + +#----------------- +sub _set_signif { +#----------------- + my( $self, $sig, $len, $func, $score ) = @_; + + if(defined $sig) { + $self->{'_confirm_significance'} = 1; + if( $sig =~ /[^\d.e-]/ or $sig <= 0) { + $self->throw(-class => 'Bio::Root::BadParameter', + -text => "Invalid significance value: $sig\n". + "Must be a number greater than zero."); + } + $self->{'_max_significance'} = $sig; + } else { + $self->{'_max_significance'} = $DEFAULT_SIGNIF; + } + + if(defined $score) { + $self->{'_confirm_significance'} = 1; + if( $score =~ /[^\de+]/ or $score <= 0) { + $self->throw(-class => 'Bio::Root::BadParameter', + -text => "Invalid score value: $score\n". + "Must be an integer greater than zero."); + } + $self->{'_min_score'} = $score; + } else { + $self->{'_min_score'} = $DEFAULT_SCORE; + } + + if(defined $len) { + if($len =~ /\D/ or $len <= 0) { + $self->warn("Invalid minimum length value: $len", + "Value must be an integer > 0. Value not set."); + } else { + $self->{'_min_length'} = $len; + } + } + + if(defined $func) { + $self->{'_check_all'} = 1; + $self->{'_confirm_significance'} = 1; + if($func and not ref $func eq 'CODE') { + $self->throw("Not a function reference: $func", + "The -hit_filter parameter must be function reference."); + } + } +} + +=head2 signif + +Synonym for L<max_significance()|max_significance> + +=cut + +#----------- +sub signif { shift->max_significance } + + +=head2 max_significance + + Usage : $obj->max_significance(); + Purpose : Gets the P or Expect value used as significance screening cutoff. + This is the value of the -signif parameter supplied to new(). + Hits with P or E-value above this are skipped. + Returns : Scientific notation number with this format: 1.0e-05. + Argument : n/a + Comments : Screening of significant hits uses the data provided on the + : description line. For NCBI BLAST1 and WU-BLAST, this data + : is P-value. for NCBI BLAST2 it is an Expect value. + +=cut + +#----------- +sub max_significance { +#----------- + my $self = shift; + my $sig = $self->{'_max_significance'}; + sprintf "%.1e", $sig; +} + +=head2 min_score + + Usage : $obj->min_score(); + Purpose : Gets the Blast score used as screening cutoff. + This is the value of the -score parameter supplied to new(). + Hits with scores below this are skipped. + Returns : Integer or scientific notation number. + Argument : n/a + Comments : Screening of significant hits uses the data provided on the + : description line. + +=cut + +#----------- +sub min_score { +#----------- + my $self = shift; + return $self->{'_min_score'}; +} + +=head2 min_length + + Usage : $obj->min_length(); + Purpose : Gets the query sequence length used as screening criteria. + This is the value of the -min_len parameter supplied to new(). + Hits with sequence length below this are skipped. + Returns : Integer + Argument : n/a + +See Also : L<signif()|signif> + +=cut + +#-------------- +sub min_length { +#-------------- + my $self = shift; + $self->{'_min_length'}; +} + +=head2 highest_signif + + Usage : $value = $obj->highest_signif(); + Purpose : Gets the largest significance (P- or E-value) observed in + the report. + : For NCBI BLAST1 and WU-BLAST, this is a P-value. + : For NCBI BLAST2 it is an Expect value. + Returns : Float or sci notation number + Argument : n/a + +=cut + +sub highest_signif { shift->{'_highestSignif'} } + +=head2 lowest_signif + + Usage : $value = $obj->lowest_signif(); + Purpose : Gets the largest significance (P- or E-value) observed in + the report. + : For NCBI BLAST1 and WU-BLAST, this is a P-value. + : For NCBI BLAST2 it is an Expect value. + Returns : Float or sci notation number + Argument : n/a + +=cut + +sub lowest_signif { shift->{'_lowestSignif'} } + +=head2 highest_score + + Usage : $value = $obj->highest_score(); + Purpose : Gets the largest BLAST score observed in the report. + Returns : Integer or sci notation number + Argument : n/a + +=cut + +sub highest_score { shift->{'_highestScore'} } + +=head2 lowest_score + + Usage : $value = $obj->lowest_score(); + Purpose : Gets the largest BLAST score observed in the report. + Returns : Integer or sci notation number + Argument : n/a + +=cut + +sub lowest_score { shift->{'_lowestScore'} } + + +# Throws : Exception if BLAST report contains a FATAL: error. +sub _process_header { + my ($self, @data) = @_; + +# print STDERR "Processing Header...\n"; + + $current_iteration = 0; + $self->{'_result_count'}++; + # Finish off the current Blast object, if any + my $blast = $self->{'_current_blast'} = $self->result_factory->create_result(); + + my ($set_method, $set_query, $set_db, $set_length); + my ($query_start, $query_desc); + + foreach my $line (@data) { + if( $line =~ /^(<.*>)?(T?BLAST[NPX])\s+(.*)$/ ) { + $blast->analysis_method( $2 ); + $blast->analysis_method_version( $3 ); + $set_method = 1; + } + elsif ($line =~ /^Query= *(.+)$/ ) { + $query_start = 1; + my $info = $1; + $info =~ s/TITLE //; + # Split the query line into two parts. + # Using \s instead of ' ' + $info =~ /(\S+?)\s+(.*)/; + # set name of Blast object and return. + $blast->query_name($1 || 'UNKNOWN'); + $query_desc = $2 || ''; + $set_query = 1; + } + elsif ($line =~ /^Database:\s+(.+)$/ ) { + require Bio::Search::GenericDatabase; + my $blastdb = Bio::Search::GenericDatabase->new( -name => $1 ); + $blast->analysis_subject( $blastdb ); + $set_db = 1; + } + elsif( $line =~ m/^\s+\(([\d|,]+) letters\)/ ) { + my $length = $1; + $length =~ s/,//g; + $self->_set_query_length( $length ); + $set_length = 1; + $blast->query_description( $query_desc ); + $query_start = 0; + } + elsif( $line =~ /WARNING: (.+?)/ ) { + $self->warn( $1 ); + } + elsif( $line =~ /FATAL: (.+?)/ ) { + $self->throw("FATAL BLAST Report Error: $1"); + } + # This needs to be the last elsif block. + elsif( $query_start ) { + # Handling multi-line query descriptions. + chomp( $line ); + $query_desc .= " $line"; + } + } + if (!$set_method) { + $self->throw("Can't determine type of BLAST program."); + } + if (!$set_query) { + $self->throw("Can't determine name of query sequence."); + } + if(!$set_db) { + $self->throw("Can't determine name of database."); + } + if(!$set_length) { + $self->throw("Can't determine sequence length from BLAST report."); + } + +} + +sub _process_descriptions { + my ($self, @data) = @_; +# print STDERR "Processing Descriptions...\n"; + + # Step through each line parsing out the P/Expect value + # All we really need to do is check the first one, if it doesn't + # meet the significance requirement, we can skip the report. + # BUT: we want to collect data for all hits anyway to get min/max signif. + + my $max_signif = $self->max_significance; + my $min_score = $self->min_score; + my $layout_set = $self->{'_layout'} || 0; + my ($layout, $sig, $hitid, $score, $is_p_value); + + if( $data[0] =~ /^\s*Sequences producing.*Score\s+P/i ) { + $is_p_value = 1; + } else { + $is_p_value = 0; + } + + my $hit_found_again; + + desc_loop: + foreach my $line (@data) { + last desc_loop if $line =~ / NONE |End of List|Results from round/; + next desc_loop if $line =~ /^\.\./; + + if($line =~ /^Sequences used in model/ ) { + #Sequences used in model and found again: + $hit_found_again = 1; + next; + } + elsif($line =~ /^Sequences not found previously/ ) { + #Sequences not found previously or not previously below threshold: + $hit_found_again = 0; + next; + } + + ## Checking the significance value (P- or Expect value) of the hit + ## in the description line. + + next desc_loop unless $line =~ /\d/; + + # TODO: These regexps need testing on a variety of reports. + if ( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s*$/) { + $hitid = $1; + $score = $2; + $layout = 2; + } elsif( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s{1,}\d+\s*$/) { + $hitid = $1; + $score = $2; + $layout = 1; + } else { + $self->warn("Can't parse description line\n $line"); + next desc_loop; + } + not $layout_set and ($self->_layout($layout), $layout_set = 1); + + $sig = &_parse_signif( $line, $layout, $self->{'_gapped'} ); + +# print STDERR " Parsed signif for $hitid: $sig (layout=$layout)\n"; + + $self->{'_hit_hash'}->{$hitid}->{'signif'} = $sig; + $self->{'_hit_hash'}->{$hitid}->{'score'} = $score; + $self->{'_hit_hash'}->{$hitid}->{'found_again'} = $hit_found_again; + $self->{'_hit_hash'}->{'is_pval'} = $is_p_value; + + last desc_loop if (not $self->{'_check_all'} and + ($sig > $max_signif or $score < $min_score)); + + $self->_process_significance($sig, $score); + } + +# printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'}; +} + + +#=head2 _set_query_length +# +# Usage : n/a; called automatically during Blast report parsing. +# Purpose : Sets the length of the query sequence (extracted from report). +# Returns : integer (length of the query sequence) +# Throws : Exception if cannot determine the query sequence length from +# : the BLAST report. +# : Exception if the length is below the min_length cutoff (if any). +# Comments : The logic here is a bit different from the other _set_XXX() +# : methods since the significance of the BLAST report is assessed +# : if MIN_LENGTH is set. +# +#=cut + +#--------------- +sub _set_query_length { +#--------------- + my ($self, $length) = @_; + + my($sig_len); + if(defined($self->{'_min_length'})) { + local $^W = 0; + if($length < $self->{'_min_len'}) { + $self->throw("Query sequence too short (Query= ${\$self->{'_current_blast'}->query_name}, length= $length)", + "Minimum length is $self->{'_min_len'}"); + } + } + + $self->{'_current_blast'}->query_length($length); +} + + +# Records the highest and lowest significance (P- or E-value) and +# score encountered in a given report. +sub _set_hi_low_signif_and_score { + my($self, $sig, $score) = @_; + + my $hiSig = $self->{'_highestSignif'} || 0; + my $lowSig = $self->{'_lowestSignif'} || $DEFAULT_SIGNIF; + my $hiScore = $self->{'_highestScore'} || 0; + my $lowScore = $self->{'_lowestScore'} || $DEFAULT_SIGNIF; + + $self->{'_highestSignif'} = ($sig > $hiSig) + ? $sig : $hiSig; + + $self->{'_lowestSignif'} = ($sig < $lowSig) + ? $sig : $lowSig; + + $self->{'_highestScore'} = ($score > $hiScore) + ? $score : $hiScore; + + $self->{'_lowestScore'} = ($score < $lowScore) + ? $score : $lowScore; +} + + +sub _process_significance { + my($self, $sig, $score) = @_; + + $self->_set_hi_low_signif_and_score($sig, $score); + + # Significance value assessment. + if($sig <= $self->{'_max_significance'} and $score >= $self->{'_min_score'}) { + $self->{'_num_hits_significant'}++; + } + $self->{'_num_hits'}++; + + $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'}; +} + +#=head2 _layout +# +# Usage : n/a; internal method. +# Purpose : Set/Get indicator for the layout of the report. +# Returns : Integer (1 | 2) +# : Defaults to 2 if not set. +# Comments : Blast1 and WashU-Blast2 have a layout = 1. +# : This is intended for internal use by this and closely +# : allied modules like BlastHit.pm and BlastHSP.pm. +# +#=cut + +#------------ +sub _layout { +#------------ + my $self = shift; + if(@_) { + $self->{'_layout'} = shift; + } + $self->{'_layout'} || 2; +} + +#=head2 _parse_signif +# +# Usage : $signif = _parse_signif(string, layout, gapped); +# : This is a class function. +# Purpose : Extracts the P- or Expect value from a single line of a BLAST description section. +# Example : _parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1); +# : _parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2); +# Argument : string = line from BLAST description section +# : layout = integer (1 or 2) +# : gapped = boolean (true if gapped Blast). +# Returns : Float (0.001 or 1e-03) +# Status : Static +# +#=cut + +#------------------ +sub _parse_signif { +#------------------ + my ($line, $layout, $gapped) = @_; + + local $_ = $line; + my @linedat = split(); + + # When processing both Blast1 and Blast2 reports + # in the same run, offset needs to be configured each time. + # NOTE: We likely won't be supporting mixed report streams. Not needed. + + my $offset = 0; + $offset = 1 if $layout == 1 or not $gapped; + + my $signif = $linedat[ $#linedat - $offset ]; + + # fail-safe check + if(not $signif =~ /[.-]/) { + $offset = ($offset == 0 ? 1 : 0); + $signif = $linedat[ $#linedat - $offset ]; + } + + $signif = "1$signif" if $signif =~ /^e/i; + return $signif; +} + + +=head2 best_hit_only + + Usage : print "only getting best hit.\n" if $obj->best_hit_only(); + Purpose : Set/Get the indicator for whether or not to processing only + : the best BlastHit. + Returns : Boolean (1 | 0) + Argument : n/a + +=cut + +#---------- +sub best_hit_only { +#---------- + my $self = shift; + if(@_) { $self->{'_best'} = shift; } + $self->{'_best'}; +} + +sub _process_alignment { + my ($self, @data) = @_; +# print STDERR "Processing Alignment...\n"; + + # If all of the significant hits have been parsed, + # return if we're not checking all or if we don't need to get + # the Blast stats (parameters at footer of report). + if(defined $self->{'_hit_count'} and + defined $self->{'_num_hits_significant'}) { + return if $self->{'_hit_count'} >= $self->{'_num_hits_significant'} and + not ($self->{'_check_all'} or $self->{'_collect_stats'}); + } + + # Return if we're only interested in the best hit. + # This has to occur after checking for the parameters section + # in the footer (since we may still be interested in them). + return if $self->best_hit_only and ( defined $self->{'_hit_count'} and $self->{'_hit_count'} >=1); + + push @data, 'end'; + +# print STDERR "\nALIGNMENT DATA:\n@data\n"; + + my $max_signif = $self->max_significance; + my $min_score = $self->min_score; + + my ($hitid, $score, $signif, $is_pval, $found_again); + if( $data[0] =~ /^(\S+)\s+/ ) { + $hitid = $1; + return unless defined $self->{'_hit_hash'}->{$hitid}; + $score = $self->{'_hit_hash'}->{$hitid}->{'score'}; + $signif = $self->{'_hit_hash'}->{$hitid}->{'signif'}; + $found_again = $self->{'_hit_hash'}->{$hitid}->{'found_again'}; + $is_pval = $self->{'_hit_hash'}->{'is_pval'}; +# print STDERR " Got hitid: $hitid ($signif, $score, P?=$is_pval)\n"; + } + + # Now construct the BlastHit objects from the alignment section + + # debug(1); + + $self->{'_hit_count'}++; + + # If not confirming significance, _process_descriptions will not have been run, + # so we need to count the total number of hits here. + if( not $self->{'_confirm_significance'}) { + $self->{'_num_hits'}++; + } + + my %hit_params = ( -RESULT => $self->{'_current_blast'}, + -RAW_DATA =>\@data, + -SIGNIF => $signif, + -IS_PVAL => $is_pval, + -SCORE => $score, + -RANK => $self->{'_hit_count'}, + -RANK_BY => 'order', + -OVERLAP => $self->{'_overlap'} || $MAX_HSP_OVERLAP, + -FOUND_AGAIN => $found_again, + -SHALLOW_PARSE => $self->{'_shallow_parse'}, + -HOLD_RAW_DATA => $self->{'_hold_raw_data'}, + ); + + my $hit; + $hit = $self->hit_factory->create_hit( %hit_params ); + + # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>; + # The BLAST report may have not had a description section. + if(not $self->{'_has_descriptions'}) { + $self->_process_significance($hit->signif, $score); + } + + # Collect overall signif data if we don't already have it, + # (as occurs if no -signif or -score parameter are supplied). + my $hit_signif = $hit->signif; + + if (not $self->{'_confirm_significance'} ) { + $self->_set_hi_low_signif_and_score($hit_signif, $score); + } + + # Test significance using custom function (if supplied) + my $add_hit = 0; + + my $hit_filter = $self->{'_hit_filter'} || 0; + + if($hit_filter) { + if(&$hit_filter($hit)) { + $add_hit = 1; + } + } elsif($hit_signif <= $max_signif and $score >= $min_score) { + $add_hit = 1; + } + $add_hit && $self->{'_current_blast'}->add_hit( $hit ); +} + + +sub _process_footer { + my ($self, $ok_to_pause, @data) = @_; +# print STDERR "Processing Footer...\n"; + + $self->{'_current_blast'}->raw_statistics( [@data] ); + + if($self->{'_collect_stats'}) { + foreach my $line (@data) { + if( $line =~ /^\s*Matrix:\s*(\S+)/i ) { + $self->{'_current_blast'}->matrix( $1 ); + } + elsif( $line =~ /^\s*Number of Sequences:\s*(\d+)/i ) { + $self->{'_current_blast'}->analysis_subject->entries( $1 ); + } + elsif( $line =~ /^\s*length of database:\s*(\d+)/i ) { + $self->{'_current_blast'}->analysis_subject->letters( $1 ); + } + elsif( $line =~ /^\s*Posted date:\s*(.+)$/i ) { + $self->{'_current_blast'}->analysis_subject->date( $1 ); + } + } + } + + if( $self->errors ) { + my $num_err = scalar($self->errors); + $self->warn( "$num_err Blast parsing errors occurred."); + foreach( $self->errors ) { print STDERR "$_\n"; }; + } + + if( $self->{'_pause_between_reports'} and $ok_to_pause ) { + $self->pause; + } + +} + +sub _process_nohits { + my $self = shift; +# print STDERR "Processing No Hits (iteration = $current_iteration)\n"; + $self->{'_current_blast'}->set_no_hits_found( $current_iteration ); +} + + +sub _process_iteration { + my ($self, @data) = @_; +# print STDERR "Processing Iteration\n"; +# print STDERR " Incrementing current iteration (was=$current_iteration)\n"; + $current_iteration++; + $self->{'_current_blast'}->iterations( $current_iteration ); + + foreach( @data ) { + if( /Results from round \d+/i ) { + $self->{'_current_blast'}->psiblast( 1 ); + } + elsif( /No hits found/i ) { + $self->_process_nohits(); + last; + } + elsif( /^\s*Sequences/i ) { + $self->_process_descriptions( @data ); + last; + } + } +} + +sub pause_between_reports { + my ($self, $setting) = @_; + if( defined $setting ) { + $self->{'_pause_between_reports'} = $setting; + } + $self->{'_pause_between_reports'}; +} + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +# For backward compatibility: +sub report_count { shift->result_count } + +sub next_result { + my ($self) = @_; + # print STDERR "next_result() called\n"; + if( not $self->running ) { + $self->run; + } + else { + $self->resume; + } + my $blast = $self->{'_current_blast'}; + $self->{'_current_blast'} = undef; + return $blast; +} + +=head2 write_result + + Title : write_result + Usage : $stream->write_result($result_result, @other_args) + Function: Writes data from the $result_result object into the stream. + : Delegates to the to_string() method of the associated + : WriterI object. + Returns : 1 for success and 0 for error + Args : Bio::Search:Result::ResultI object, + : plus any other arguments for the Writer + Throws : Bio::Root::Exception if a Writer has not been set. + +See L<Bio::Root::Exception> + +=cut + +sub write_result { + my ($self, $blast, @args) = @_; + + if( not defined($self->writer) ) { + $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS"); + $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() ); + } + $self->SUPER::write_result( $blast, @args ); +} + + + +1; +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SearchIO/waba.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/waba.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,517 @@ +# $Id: waba.pm,v 1.8 2002/12/11 22:12:32 jason Exp $ +# +# BioPerl module for Bio::SearchIO::waba +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program +alignment output + +=head1 SYNOPSIS + + # do not use this object directly, rather through Bio::SearchIO + + use Bio::SearchIO; + my $in = new Bio::SearchIO(-format => 'waba', + -file => 'output.wab'); + while( my $result = $in->next_result ) { + while( my $hit = $result->next_hit ) { + while( my $hsp = $result->next_hsp ) { + + } + } + } + +=head1 DESCRIPTION + +This parser will process the waba output (NOT the human readable format). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::waba; +use vars qw(@ISA %MODEMAP %MAPPING @STATES); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SearchIO; + +use POSIX; + +BEGIN { + # mapping of NCBI Blast terms to Bioperl hash keys + %MODEMAP = ('WABAOutput' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); + @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq); + %MAPPING = + ( + 'Hsp_query-from'=> 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_stateseq' => 'HSP-hmmstate_seq', + 'Hsp_align-len' => 'HSP-hsp_length', + + 'Hit_id' => 'HIT-name', + 'Hit_accession' => 'HIT-accession', + + 'WABAOutput_program' => 'RESULT-algorithm_name', + 'WABAOutput_version' => 'RESULT-algorithm_version', + 'WABAOutput_query-def'=> 'RESULT-query_name', + 'WABAOutput_query-db' => 'RESULT-query_database', + 'WABAOutput_db' => 'RESULT-database_name', + ); +} + + +@ISA = qw(Bio::SearchIO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::waba(); + Function: Builds a new Bio::SearchIO::waba object + Returns : Bio::SearchIO::waba + Args : see Bio::SearchIO + +=cut + +sub _initialize { + my ($self,@args) = @_; + $self->SUPER::_initialize(@args); + $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult')); + + $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP')); +} + + +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result{ + my ($self) = @_; + + my ($curquery,$curhit); + my $state = -1; + $self->start_document(); + my @hit_signifs; + while( defined ($_ = $self->_readline )) { + + if( $state == -1 ) { + my ($qid, $qhspid,$qpercent, $junk, + $alnlen,$qdb,$qacc,$qstart,$qend,$qstrand, + $hitdb,$hacc,$hstart,$hend, + $hstrand) = + ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid + (\d+(\.\d+)?)\%\s+ # get the percentage + of\s+(\d+)\s+ # get the length of the alignment + (\S+)\s+ # this is the query database + (\S+):(\d+)\-(\d+) # The accession:start-end for query + \s+([\-\+]) # query strand + \s+(\S+)\. # hit db + (\S+):(\d+)\-(\d+) # The accession:start-end for hit + \s+([\-\+])\s*$ # hit strand + /ox ); + + # Curses. Jim's code is 0 based, the following is to readjust + $hstart++; $hend++; $qstart++; $qend++; + + if( ! defined $alnlen ) { + $self->warn("Unable to parse the rest of the WABA alignment info for: $_"); + last; + } + $self->{'_reporttype'} = 'WABA'; # hardcoded - only + # one type of WABA AFAIK + if( defined $curquery && + $curquery ne $qid ) { + $self->end_element({'Name' => 'Hit'}); + $self->_pushback($_); + $self->end_element({'Name' => 'WABAOutput'}); + return $self->end_document(); + } + + if( defined $curhit && + $curhit ne $hacc) { + # slight duplication here -- keep these in SYNC + $self->end_element({'Name' => 'Hit'}); + $self->start_element({'Name' => 'Hit'}); + $self->element({'Name' => 'Hit_id', + 'Data' => $hacc}); + $self->element({'Name' => 'Hit_accession', + 'Data' => $hacc}); + + } elsif ( ! defined $curquery ) { + $self->start_element({'Name' => 'WABAOutput'}); + $self->{'_result_count'}++; + $self->element({'Name' => 'WABAOutput_query-def', + 'Data' => $qid }); + $self->element({'Name' => 'WABAOutput_program', + 'Data' => 'WABA'}); + $self->element({'Name' => 'WABAOutput_query-db', + 'Data' => $qdb}); + $self->element({'Name' => 'WABAOutput_db', + 'Data' => $hitdb}); + + # slight duplication here -- keep these N'SYNC ;-) + $self->start_element({'Name' => 'Hit'}); + $self->element({'Name' => 'Hit_id', + 'Data' => $hacc}); + $self->element({'Name' => 'Hit_accession', + 'Data' => $hacc}); + } + + + # strand is inferred by start,end values + # in the Result Builder + if( $qstrand eq '-' ) { + ($qstart,$qend) = ($qend,$qstart); + } + if( $hstrand eq '-' ) { + ($hstart,$hend) = ($hend,$hstart); + } + + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_query-from', + 'Data' => $qstart}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => $qend}); + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => $hstart}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => $hend}); + $self->element({'Name' => 'Hsp_align-len', + 'Data' => $alnlen}); + + $curquery = $qid; + $curhit = $hacc; + $state = 0; + } elsif( ! defined $curquery ) { + $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ ); + next; + } else { + chomp; + $self->element({'Name' => $STATES[$state++], + 'Data' => $_}); + if( $state >= scalar @STATES ) { + $state = -1; + $self->end_element({'Name' => 'Hsp'}); + } + } + } + if( defined $curquery ) { + $self->end_element({'Name' => 'Hit'}); + $self->end_element({'Name' => 'WABAOutput'}); + return $self->end_document(); + } + return undef; +} + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + if( my $type = $MODEMAP{$nm} ) { + $self->_mode($type); + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + unshift @{$self->{'_elements'}}, $type; + } + if($nm eq 'WABAOutput') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_mode'} = ''; + } + +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $rc; + # Hsp are sort of weird, in that they end when another + # object begins so have to detect this in end_element for now + if( $nm eq 'Hsp' ) { + foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { + $self->element({'Name' => $_, + 'Data' => $self->{'_last_hspdata'}->{$_}}); + } + $self->{'_last_hspdata'} = {} + } + + if( my $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_reporttype'}, + $self->{'_values'}); + } + shift @{$self->{'_elements'}}; + + } elsif( $MAPPING{$nm} ) { + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } else { + $self->warn( "unknown nm $nm ignoring\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( $nm eq 'WABAOutput' ); + return $rc; + +} + +=head2 element + + Title : element + Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); + Function: Convience method that calls start_element, characters, end_element + Returns : none + Args : Hash ref with the keys 'Name' and 'Data' + + +=cut + +sub element{ + my ($self,$data) = @_; + $self->start_element($data); + $self->characters($data); + $self->end_element($data); +} + + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,$data) = @_; + + return unless ( defined $data->{'Data'} ); + if( $data->{'Data'} =~ /^\s+$/ ) { + return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/; + } + + if( $self->in_element('hsp') && + $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { + + $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; + } + + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 _mode + + Title : _mode + Usage : $obj->_mode($newval) + Function: + Example : + Returns : value of _mode + Args : newvalue (optional) + + +=cut + +sub _mode{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_mode'} = $value; + } + return $self->{'_mode'}; +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,$name) = @_; + return 0 if ( ! defined $name && + ! defined $self->{'_elements'} || + scalar @{$self->{'_elements'}} == 0) ; + foreach ( @{$self->{'_elements'}} ) { + if( $_ eq $name ) { + return 1; + } + } + return 0; +} + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,$name) = @_; + return 0 if ! defined $self->{'_elements'}->[0]; + return ( $self->{'_elements'}->[0] eq $name) +} + + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document + Function: Handles a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_mode'} = ''; + $self->{'_elements'} = []; +} + + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document + Function: Handles an end document event + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_document{ + my ($self,@args) = @_; + return $self->{'_result'}; +} + +=head2 result_count + + Title : result_count + Usage : my $count = $searchio->result_count + Function: Returns the number of results we have processed + Returns : integer + Args : none + + +=cut + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +sub report_count { shift->result_count } + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1250 @@ +# $Id: Seq.pm,v 1.76.2.2 2003/07/03 20:01:32 jason Exp $ +# +# BioPerl module for Bio::Seq +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq - Sequence object, with features + +=head1 SYNOPSIS + + # This is the main sequence object in Bioperl + + # gets a sequence from a file + $seqio = Bio::SeqIO->new( '-format' => 'embl' , -file => 'myfile.dat'); + $seqobj = $seqio->next_seq(); + + # SeqIO can both read and write sequences; see Bio::SeqIO + # for more information and examples + + # get from database + $db = Bio::DB::GenBank->new(); + $seqobj = $db->get_Seq_by_acc('X78121'); + + # make from strings in script + $seqobj = Bio::Seq->new( -display_id => 'my_id', + -seq => $sequence_as_string); + + # gets sequence as a string from sequence object + $seqstr = $seqobj->seq(); # actual sequence as a string + $seqstr = $seqobj->subseq(10,50); # slice in biological coordinates + + # retrieves information from the sequence + # features must implement Bio::SeqFeatureI interface + + @features = $seqobj->get_SeqFeatures(); # just top level + foreach my $feat ( @features ) { + print "Feature ",$feat->primary_tag," starts ",$feat->start," ends ", + $feat->end," strand ",$feat->strand,"\n"; + + # features retain link to underlying sequence object + print "Feature sequence is ",$feat->seq->seq(),"\n" + } + + # sequences may have a species + + if( defined $seq->species ) { + print "Sequence is from ",$species->binomial_name," [",$species->common_name,"]\n"; + } + + # annotation objects are Bio::AnnotationCollectionI's + $ann = $seqobj->annotation(); # annotation object + + # references is one type of annotations to get. Also get + # comment and dblink. Look at Bio::AnnotationCollection for + # more information + + foreach my $ref ( $ann->get_Annotations('reference') ) { + print "Reference ",$ref->title,"\n"; + } + + # you can get truncations, translations and reverse complements, these + # all give back Bio::Seq objects themselves, though currently with no + # features transfered + + my $trunc = $seqobj->trunc(100,200); + my $rev = $seqobj->revcom(); + + # there are many options to translate - check out the docs + my $trans = $seqobj->translate(); + + # these functions can be chained together + + my $trans_trunc_rev = $seqobj->trunc(100,200)->revcom->translate(); + + + +=head1 DESCRIPTION + +A Seq object is a sequence with sequence features placed on it. The +Seq object contains a PrimarySeq object for the actual sequence and +also implements its interface. + +In Bioperl we have 3 main players that people are going to use frequently + + Bio::PrimarySeq - just the sequence and its names, nothing else. + Bio::SeqFeatureI - a location on a sequence, potentially with a sequence + and annotation. + Bio::Seq - A sequence and a collection of sequence features + (an aggregate) with its own annotation. + +Although Bioperl is not tied heavily to file formats these distinctions do +map to file formats sensibly and for some bioinformaticians this might help + + Bio::PrimarySeq - Fasta file of a sequence + Bio::SeqFeatureI - A single entry in an EMBL/GenBank/DDBJ feature table + Bio::Seq - A single EMBL/GenBank/DDBJ entry + +By having this split we avoid a lot of nasty circular references +(sequence features can hold a reference to a sequence without the sequence +holding a reference to the sequence feature). See L<Bio::PrimarySeq> and +L<Bio::SeqFeatureI> for more information. + +Ian Korf really helped in the design of the Seq and SeqFeature system. + +=head1 EXAMPLES + +A simple and fundamental block of code + + use Bio::SeqIO; + + my $seqIOobj = Bio::SeqIO->new(-file=>"1.fa"); # create a SeqIO object + my $seqobj = $seqIOobj->next_seq; # get a Seq object + +With the Seq object in hand one has access to a powerful set of Bioperl +methods and Bioperl objects. This next script will take a file of sequences +in EMBL format and create a file of the reverse-complemented sequences +in Fasta format using Seq objects. It also prints out details about the +exons it finds as sequence features in Genbank Flat File format. + + use Bio::Seq; + use Bio::SeqIO; + + $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); + $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); + + while((my $seqobj = $seqin->next_seq())) { + print "Seen sequence ",$seqobj->display_id,", start of seq ", + substr($seqobj->seq,1,10),"\n"; + if( $seqobj->alphabet eq 'dna') { + $rev = $seqobj->revcom; + $id = $seqobj->display_id(); + $id = "$id.rev"; + $rev->display_id($id); + $seqout->write_seq($rev); + } + + foreach $feat ( $seqobj->get_SeqFeatures() ) { + if( $feat->primary_tag eq 'exon' ) { + print STDOUT "Location ",$feat->start,":", + $feat->end," GFF[",$feat->gff_string,"]\n"; + } + } + } + +Let's examine the script. The lines below import the Bioperl modules. +Seq is the main Bioperl sequence object and SeqIO is the Bioperl support +for reading sequences from files and to files + + use Bio::Seq; + use Bio::SeqIO; + +These two lines create two SeqIO streams: one for reading in sequences +and one for outputting sequences: + + $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); + $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); + +Notice that in the "$seqout" case there is a greater-than sign, +indicating the file is being opened for writing. + +Using the + + '-argument' => value + +syntax is common in Bioperl. The file argument is like an argument +to open() . You can also pass in filehandles or FileHandle objects by +using the -fh argument (see L<Bio::SeqIO> documentation for details). +Many formats in Bioperl are handled, including Fasta, EMBL, GenBank, +Swissprot (swiss), PIR, and GCG. + + $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); + $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); + +This is the main loop which will loop progressively through sequences +in a file, and each call to $seqio-E<gt>next_seq() provides a new Seq +object from the file: + + while((my $seqobj = $seqio->next_seq())) { + +This print line below accesses fields in the Seq object directly. The +$seqobj-E<gt>display_id is the way to access the display_id attribute +of the Seq object. The $seqobj-E<gt>seq method gets the actual +sequence out as string. Then you can do manipulation of this if +you want to (there are however easy ways of doing truncation, +reverse-complement and translation). + + print "Seen sequence ",$seqobj->display_id,", start of seq ", + substr($seqobj->seq,1,10),"\n"; + +Bioperl has to guess the alphabet of the sequence, being either 'dna', +'rna', or 'protein'. The alphabet attribute is one of these three +possibilities. + + if( $seqobj->alphabet eq 'dna') { + +The $seqobj-E<gt>revcom method provides the reverse complement of the Seq +object as another Seq object. Thus, the $rev variable is a reference to +another Seq object. For example, one could repeat the above print line +for this Seq object (putting $rev in place of $seqobj). In this +case we are going to output the object into the file stream we built +earlier on. + + $rev = $seqobj->revcom; + +When we output it, we want the id of the outputted object +to be changed to "$id.rev", ie, with .rev on the end of the name. The +following lines retrieve the id of the sequence object, add .rev +to this and then set the display_id of the rev sequence object to +this. Notice that to set the display_id attribute you just need +call the same method, display_id(), with the new value as an argument. +Getting and setting values with the same method is common in Bioperl. + + $id = $seqobj->display_id(); + $id = "$id.rev"; + $rev->display_id($id); + +The write_seq method on the SeqIO output object, $seqout, writes the +$rev object to the filestream we built at the top of the script. +The filestream knows that it is outputting in fasta format, and +so it provides fasta output. + + $seqout->write_seq($rev); + +This block of code loops over sequence features in the sequence +object, trying to find ones who have been tagged as 'exon'. +Features have start and end attributes and can be outputted +in Genbank Flat File format, GFF, a standarized format for sequence +features. + + foreach $feat ( $seqobj->get_SeqFeatures() ) { + if( $feat->primary_tag eq 'exon' ) { + print STDOUT "Location ",$feat->start,":", + $feat->end," GFF[",$feat->gff_string,"]\n"; + } + } + +The code above shows how a few Bio::Seq methods suffice to read, parse, +reformat and analyze sequences from a file. A full list of methods +available to Bio::Seq objects is shown below. Bear in mind that some of +these methods come from PrimarySeq objects, which are simpler +than Seq objects, stripped of features (see L<Bio::PrimarySeq> for +more information). + + # these methods return strings, and accept strings in some cases: + + $seqobj->seq(); # string of sequence + $seqobj->subseq(5,10); # part of the sequence as a string + $seqobj->accession_number(); # when there, the accession number + $seqobj->moltype(); # one of 'dna','rna',or 'protein' + $seqobj->seq_version() # when there, the version + $seqobj->keywords(); # when there, the Keywords line + $seqobj->length() # length + $seqobj->desc(); # description + $seqobj->primary_id(); # a unique id for this sequence regardless + # of its display_id or accession number + $seqobj->display_id(); # the human readable id of the sequence + +Some of these values map to fields in common formats. For example, The +display_id() method returns the LOCUS name of a Genbank entry, +the (\S+) following the E<gt> character in a Fasta file, the ID from +a SwissProt file, and so on. The desc() method will return the DEFINITION +line of a Genbank file, the description following the display_id in a +Fasta file, and the DE field in a SwissProt file. + + # the following methods return new Seq objects, but + # do not transfer features across to the new object: + + $seqobj->trunc(5,10) # truncation from 5 to 10 as new object + $seqobj->revcom # reverse complements sequence + $seqobj->translate # translation of the sequence + + # if new() can be called this method returns 1, else 0 + + $seqobj->can_call_new + + # the following method determines if the given string will be accepted + # by the seq() method - if the string is acceptable then validate() + # returns 1, or 0 if not + + $seqobj->validate_seq($string) + + # the following method returns or accepts a Species object: + + $seqobj->species(); + +Please see L<Bio::Species> for more information on this object. + + # the following method returns or accepts an Annotation object + # which in turn allows access to Annotation::Reference + # and Annotation::Comment objects: + + $seqobj->annotation(); + +These annotations typically refer to entire sequences, unlike +features. See L<Bio::AnnotationCollectionI>, +L<Bio::Annotation::Collection>, L<Bio::Annotation::Reference>, and +L<Bio::Annotation::Comment> for details. + +It is also important to be able to describe defined portions of a +sequence. The combination of some description and the corresponding +sub-sequence is called a feature - an exon and its coordinates within +a gene is an example of a feature, or a domain within a protein. + + # the following methods return an array of SeqFeatureI objects: + + $seqobj->get_SeqFeatures # The 'top level' sequence features + $seqobj->get_all_SeqFeatures # All sequence features, including sub-seq + # features, such as features in an exon + + # to find out the number of features use: + + $seqobj->feature_count + +Here are just some of the methods available to SeqFeatureI objects: + + # these methods return numbers: + + $feat->start # start position (1 is the first base) + $feat->end # end position (2 is the second base) + $feat->strand # 1 means forward, -1 reverse, 0 not relevant + + # these methods return or accept strings: + + $feat->primary_tag # the name of the sequence feature, eg + # 'exon', 'glycoslyation site', 'TM domain' + $feat->source_tag # where the feature comes from, eg, 'EMBL_GenBank', + # or 'BLAST' + + # this method returns the more austere PrimarySeq object, not a + # Seq object - the main difference is that PrimarySeq objects do not + # themselves contain sequence features + + $feat->seq # the sequence between start,end on the + # correct strand of the sequence + +See L<Bio::PrimarySeq> for more details on PrimarySeq objects. + + # useful methods for feature comparisons, for start/end points + + $feat->overlaps($other) # do $feat and $other overlap? + $feat->contains($other) # is $other completely within $feat? + $feat->equals($other) # do $feat and $other completely agree? + + # one can also add features + + $seqobj->add_SeqFeature($feat) # returns 1 if successful + $seqobj->add_SeqFeature(@features) # returns 1 if successful + + # sub features. For complex join() statements, the feature + # is one sequence feature with many sub SeqFeatures + + $feat->sub_SeqFeature # returns array of sub seq features + +Please see L<Bio::SeqFeatureI> and L<Bio::SeqFeature::Generic>, +for more information on sequence features. + +It is worth mentioning that one can also retrieve the start and end +positions of a feature using a Bio::LocationI object: + + $location = $feat->location # $location is a Bio::LocationI object + $location->start; # start position + $location->end; # end position + +This is useful because one needs a Bio::Location::SplitLocationI object +in order to retrieve the coordinates inside the Genbank or EMBL join() +statements (e.g. "CDS join(51..142,273..495,1346..1474)"): + + if ( $feat->location->isa('Bio::Location::SplitLocationI') && + $feat->primary_tag eq 'CDS' ) { + foreach $loc ( $feat->location->sub_Location ) { + print $loc->start . ".." . $loc->end . "\n"; + } + } + +See L<Bio::LocationI> and L<Bio::Location::SplitLocationI> for more +information. + +=head1 Implemented Interfaces + +This class implements the following interfaces. + +=over 4 + +=item Bio::SeqI + +Note that this includes implementing Bio::PrimarySeqI. + +=item Bio::IdentifiableI + +=item Bio::DescribableI + +=item Bio::AnnotatableI + +=item Bio::FeatureHolderI + +=back + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, inspired by Ian Korf objects + +Email birney@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich E<lt>jason@bioperl.orgE<gt> + +=head1 APPENDIX + + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a "_". + +=cut + +#' +# Let the code begin... + + +package Bio::Seq; +use vars qw(@ISA $VERSION); +use strict; + + +# Object preamble - inherits from Bio::Root::Object + +use Bio::Root::Root; +use Bio::SeqI; +use Bio::Annotation::Collection; +use Bio::PrimarySeq; +use Bio::IdentifiableI; +use Bio::DescribableI; +use Bio::AnnotatableI; +use Bio::FeatureHolderI; + +$VERSION = '1.1'; +@ISA = qw(Bio::Root::Root Bio::SeqI + Bio::IdentifiableI Bio::DescribableI + Bio::AnnotatableI Bio::FeatureHolderI); + +=head2 new + + Title : new + Usage : $seq = Bio::Seq->new( -seq => 'ATGGGGGTGGTGGTACCCT', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new Seq object from + basic constructors, being a string for the sequence + and strings for id and accession_number + Returns : a new Bio::Seq object + +=cut + +sub new { + my($caller,@args) = @_; + + if( $caller ne 'Bio::Seq') { + $caller = ref($caller) if ref($caller); + } + + # we know our inherietance heirarchy + my $self = Bio::Root::Root->new(@args); + bless $self,$caller; + + # this is way too sneaky probably. We delegate the construction of + # the Seq object onto PrimarySeq and then pop primary_seq into + # our primary_seq slot + + my $pseq = Bio::PrimarySeq->new(@args); + + # as we have just made this, we know it is ok to set hash directly + # rather than going through the method + + $self->{'primary_seq'} = $pseq; + + # setting this array is now delayed until the final + # moment, again speed ups for non feature containing things + # $self->{'_as_feat'} = []; + + + my ($ann, $pid,$feat,$species) = &Bio::Root::RootI::_rearrange($self,[qw(ANNOTATION PRIMARY_ID FEATURES SPECIES)], @args); + + # for a number of cases - reading fasta files - these are never set. This + # gives a quick optimisation around testing things later on + + if( defined $ann || defined $pid || defined $feat || defined $species ) { + $pid && $self->primary_id($pid); + $species && $self->species($species); + $ann && $self->annotation($ann); + + if( defined $feat ) { + if( ref($feat) !~ /ARRAY/i ) { + if( ref($feat) && $feat->isa('Bio::SeqFeatureI') ) { + $self->add_SeqFeature($feat); + } else { + $self->warn("Must specify a valid Bio::SeqFeatureI or ArrayRef of Bio::SeqFeatureI's with the -features init parameter for ".ref($self)); + } + } else { + foreach my $feature ( @$feat ) { + $self->add_SeqFeature($feature); + } + } + } + } + + return $self; +} + +=head1 PrimarySeq interface + + +The PrimarySeq interface provides the basic sequence getting +and setting methods for on all sequences. + +These methods implement the Bio::PrimarySeq interface by delegating +to the primary_seq inside the object. This means that you +can use a Seq object wherever there is a PrimarySeq, and +of course, you are free to use these functions anyway. + +=cut + +=head2 seq + + Title : seq + Usage : $string = $obj->seq() + Function: Get/Set the sequence as a string of letters. The + case of the letters is left up to the implementer. + Suggested cases are upper case for proteins and lower case for + DNA sequence (IUPAC standard), + but implementations are suggested to keep an open mind about + case (some users... want mixed case!) + Returns : A scalar + Args : Optionally on set the new value (a string). An optional second + argument presets the alphabet (otherwise it will be guessed). + Both parameters may also be given in named paramater style + with -seq and -alphabet being the names. + +=cut + +sub seq { + return shift->primary_seq()->seq(@_); +} + +=head2 validate_seq + + Title : validate_seq + Usage : if(! $seq->validate_seq($seq_str) ) { + print "sequence $seq_str is not valid for an object of type ", + ref($seq), "\n"; + } + Function: Validates a given sequence string. A validating sequence string + must be accepted by seq(). A string that does not validate will + lead to an exception if passed to seq(). + + The implementation provided here does not take alphabet() into + account. Allowed are all letters (A-Z) and '-','.', and '*'. + + Example : + Returns : 1 if the supplied sequence string is valid for the object, and + 0 otherwise. + Args : The sequence string to be validated. + + +=cut + +sub validate_seq { + return shift->primary_seq()->validate_seq(@_); +} + +=head2 length + + Title : length + Usage : $len = $seq->length() + Function: + Example : + Returns : Integer representing the length of the sequence. + Args : None + +=cut + +sub length { + return shift->primary_seq()->length(@_); +} + +=head1 Methods from the Bio::PrimarySeqI interface + +=cut + +=head2 subseq + + Title : subseq + Usage : $substring = $obj->subseq(10,40); + Function: Returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence + + Start cannot be larger than end but can be equal + + Returns : A string + Args : 2 integers + + +=cut + +sub subseq { + return shift->primary_seq()->subseq(@_); +} + +=head2 display_id + + Title : display_id + Usage : $id = $obj->display_id or $obj->display_id($newid); + Function: Gets or sets the display id, also known as the common name of + the Seq object. + + The semantics of this is that it is the most likely string + to be used as an identifier of the sequence, and likely to + have "human" readability. The id is equivalent to the LOCUS + field of the GenBank/EMBL databanks and the ID field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id + to embed other information. Bioperl does not use any + embedded information in the ID field, and people are + encouraged to use other mechanisms (accession field for + example, or extending the sequence object) to solve this. + + Notice that $seq->id() maps to this function, mainly for + legacy/convenience issues. + Returns : A string + Args : None or a new id + + +=cut + +sub display_id { + return shift->primary_seq->display_id(@_); +} + + + +=head2 accession_number + + Title : accession_number + Usage : $unique_biological_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should return + "unknown". + + Can also be used to set the accession number. + Example : $key = $seq->accession_number or $seq->accession_number($key) + Returns : A string + Args : None or an accession number + + +=cut + +sub accession_number { + return shift->primary_seq->accession_number(@_); +} + +=head2 desc + + Title : desc + Usage : $seqobj->desc($string) or $seqobj->desc() + Function: Sets or gets the description of the sequence + Example : + Returns : The description + Args : The description or none + + +=cut + +sub desc { + return shift->primary_seq->desc(@_); +} + +=head2 primary_id + + Title : primary_id + Usage : $unique_implementation_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage + their own object ids in a way the implementation can control + clients can expect one id to map to one object. + + For sequences with no natural id, this method should return + a stringified memory location. + + Can also be used to set the primary_id. + + Also notice that this method is not delegated to the + internal Bio::PrimarySeq object + + [Note this method name is likely to change in 1.3] + + Example : $id = $seq->primary_id or $seq->primary_id($id) + Returns : A string + Args : None or an id + + +=cut + +sub primary_id { + my ($obj,$value) = @_; + + if( defined $value) { + $obj->{'primary_id'} = $value; + } + if( ! exists $obj->{'primary_id'} ) { + return "$obj"; + } + return $obj->{'primary_id'}; +} + +=head2 can_call_new + + Title : can_call_new + Usage : if ( $obj->can_call_new ) { + $newobj = $obj->new( %param ); + } + Function: can_call_new returns 1 or 0 depending + on whether an implementation allows new + constructor to be called. If a new constructor + is allowed, then it should take the followed hashed + constructor list. + + $myobject->new( -seq => $sequence_as_string, + -display_id => $id + -accession_number => $accession + -alphabet => 'dna', + ); + Example : + Returns : 1 or 0 + Args : None + + +=cut + +sub can_call_new { + return 1; +} + +=head2 alphabet + + Title : alphabet + Usage : if ( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called <type> because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : A string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : None + + +=cut + +sub alphabet { + my $self = shift; + return $self->primary_seq->alphabet(@_) if @_ && defined $_[0]; + return $self->primary_seq->alphabet(); +} + +sub is_circular { shift->primary_seq->is_circular } + +=head1 Methods for Bio::IdentifiableI compliance + +=cut + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + This is aliased to accession_number(). + Returns : A scalar + + +=cut + +sub object_id { + return shift->accession_number(@_); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Returns : A number + +=cut + +sub version{ + return shift->primary_seq->version(@_); +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + +=cut + +sub authority { + return shift->primary_seq()->authority(@_); +} + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + Returns : A scalar + + +=cut + +sub namespace{ + return shift->primary_seq()->namespace(@_); +} + +=head1 Methods for Bio::DescribableI compliance + +=cut + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user + the string should have no spaces (ideally, though a cautious + user of this interface would not assumme this) and should be + less than thirty characters (though again, double checking + this is a good idea) + + This is aliased to display_id(). + Returns : A scalar + +=cut + +sub display_name { + return shift->display_id(@_); +} + +=head2 description + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. The string should not be greater than 255 characters + and clients can feel justified at truncating strings at 255 + characters for the purposes of display + + This is aliased to desc(). + Returns : A scalar + +=cut + +sub description { + return shift->desc(@_); +} + +=head1 Methods for implementing Bio::AnnotatableI + +=cut + +=head2 annotation + + Title : annotation + Usage : $ann = $seq->annotation or $seq->annotation($annotation) + Function: Gets or sets the annotation + Returns : L<Bio::AnnotationCollectionI> object + Args : None or L<Bio::AnnotationCollectionI> object + +See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection> +for more information + +=cut + +sub annotation { + my ($obj,$value) = @_; + if( defined $value ) { + $obj->throw("object of class ".ref($value)." does not implement ". + "Bio::AnnotationCollectionI. Too bad.") + unless $value->isa("Bio::AnnotationCollectionI"); + $obj->{'_annotation'} = $value; + } elsif( ! defined $obj->{'_annotation'}) { + $obj->{'_annotation'} = new Bio::Annotation::Collection; + } + return $obj->{'_annotation'}; +} + +=head1 Methods to implement Bio::FeatureHolderI + +This includes methods for retrieving, adding, and removing features. + +=cut + +=head2 get_SeqFeatures + + Title : get_SeqFeatures + Usage : + Function: Get the feature objects held by this feature holder. + + Features which are not top-level are subfeatures of one or + more of the returned feature objects, which means that you + must traverse the subfeature arrays of each top-level + feature object in order to traverse all features associated + with this sequence. + + Use get_all_SeqFeatures() if you want the feature tree + flattened into one single array. + + Example : + Returns : an array of Bio::SeqFeatureI implementing objects + Args : none + +At some day we may want to expand this method to allow for a feature +filter to be passed in. + +=cut + +sub get_SeqFeatures{ + my $self = shift; + + if( !defined $self->{'_as_feat'} ) { + $self->{'_as_feat'} = []; + } + + return @{$self->{'_as_feat'}}; +} + +=head2 get_all_SeqFeatures + + Title : get_all_SeqFeatures + Usage : @feat_ary = $seq->get_all_SeqFeatures(); + Function: Returns the tree of feature objects attached to this + sequence object flattened into one single array. Top-level + features will still contain their subfeature-arrays, which + means that you will encounter subfeatures twice if you + traverse the subfeature tree of the returned objects. + + Use get_SeqFeatures() if you want the array to contain only + the top-level features. + + Returns : An array of Bio::SeqFeatureI implementing objects. + Args : None + + +=cut + +# this implementation is inherited from FeatureHolderI + +=head2 feature_count + + Title : feature_count + Usage : $seq->feature_count() + Function: Return the number of SeqFeatures attached to a sequence + Returns : integer representing the number of SeqFeatures + Args : None + + +=cut + +sub feature_count { + my ($self) = @_; + + if (defined($self->{'_as_feat'})) { + return ($#{$self->{'_as_feat'}} + 1); + } else { + return 0; + } +} + +=head2 add_SeqFeature + + Title : add_SeqFeature + Usage : $seq->add_SeqFeature($feat); + $seq->add_SeqFeature(@feat); + Function: Adds the given feature object (or each of an array of feature + objects to the feature array of this + sequence. The object passed is required to implement the + Bio::SeqFeatureI interface. + Returns : 1 on success + Args : A Bio::SeqFeatureI implementing object, or an array of such objects. + + +=cut + +sub add_SeqFeature { + my ($self,@feat) = @_; + + $self->{'_as_feat'} = [] unless $self->{'_as_feat'}; + + foreach my $feat ( @feat ) { + if( !$feat->isa("Bio::SeqFeatureI") ) { + $self->throw("$feat is not a SeqFeatureI and that's what we expect..."); + } + + # make sure we attach ourselves to the feature if the feature wants it + my $aseq = $self->primary_seq; + $feat->attach_seq($aseq) if $aseq; + + push(@{$self->{'_as_feat'}},$feat); + } + return 1; +} + +=head2 remove_SeqFeatures + + Title : remove_SeqFeatures + Usage : $seq->remove_SeqFeatures(); + Function: Flushes all attached SeqFeatureI objects. + + To remove individual feature objects, delete those from the returned + array and re-add the rest. + Example : + Returns : The array of Bio::SeqFeatureI objects removed from this seq. + Args : None + + +=cut + +sub remove_SeqFeatures { + my $self = shift; + + return () unless $self->{'_as_feat'}; + my @feats = @{$self->{'_as_feat'}}; + $self->{'_as_feat'} = []; + return @feats; +} + +=head1 Methods provided in the Bio::PrimarySeqI interface + + +These methods are inherited from the PrimarySeq interface +and work as one expects, building new Bio::Seq objects +or other information as expected. See L<Bio::PrimarySeq> +for more information. + +Sequence Features are B<not> transfered to the new objects. +This is possibly a mistake. Anyone who feels the urge in +dealing with this is welcome to give it a go. + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::Seq object which + is the reversed complement of the sequence. For protein + sequences this throws an exception of "Sequence is a protein. + Cannot revcom" + + The id is the same id as the original sequence, and the + accession number is also identical. If someone wants to track + that this sequence has be reversed, it needs to define its own + extensions + + To do an in-place edit of an object you can go: + + $seq = $seq->revcom(); + + This of course, causes Perl to handle the garbage collection of + the old object, but it is roughly speaking as efficient as an + in-place edit. + + Returns : A new (fresh) Bio::Seq object + Args : None + + +=cut + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence + + Example : + Returns : A fresh Seq object + Args : A Seq object + + +=cut + +=head2 id + + Title : id + Usage : $id = $seq->id() + Function: This is mapped on display_id + Returns : value of display_id() + Args : [optional] value to update display_id + + +=cut + +sub id { + return shift->display_id(@_); +} + + +=head1 Seq only methods + + +These methods are specific to the Bio::Seq object, and not +found on the Bio::PrimarySeq object + +=head2 primary_seq + + Title : primary_seq + Usage : $seq->primary_seq or $seq->primary_seq($newval) + Function: Get or set a PrimarySeq object + Example : + Returns : PrimarySeq object + Args : None or PrimarySeq object + + +=cut + +sub primary_seq { + my ($obj,$value) = @_; + + if( defined $value) { + if( ! ref $value || ! $value->isa('Bio::PrimarySeqI') ) { + $obj->throw("$value is not a Bio::PrimarySeq compliant object"); + } + + $obj->{'primary_seq'} = $value; + # descend down over all seqfeature objects, seeing whether they + # want an attached seq. + + foreach my $sf ( $obj->get_SeqFeatures() ) { + $sf->attach_seq($value); + } + + } + return $obj->{'primary_seq'}; + +} + +=head2 species + + Title : species + Usage : $species = $seq->species() or $seq->species($species) + Function: Gets or sets the species + Returns : L<Bio::Species> object + Args : None or L<Bio::Species> object + +See L<Bio::Species> for more information + +=cut + +sub species { + my ($self, $species) = @_; + if ($species) { + $self->{'species'} = $species; + } else { + return $self->{'species'}; + } +} + +=head1 Internal methods + +=cut + +# keep AUTOLOAD happy +sub DESTROY { } + +############################################################################ +# aliases due to name changes or to compensate for our lack of consistency # +############################################################################ + +# in all other modules we use the object in the singular -- +# lack of consistency sucks +*flush_SeqFeature = \&remove_SeqFeatures; +*flush_SeqFeatures = \&remove_SeqFeatures; + +# this is now get_SeqFeatures() (from FeatureHolderI) +*top_SeqFeatures = \&get_SeqFeatures; + +# this is now get_all_SeqFeatures() in FeatureHolderI +sub all_SeqFeatures{ + return shift->get_all_SeqFeatures(@_); +} + +sub accession { + my $self = shift; + $self->warn(ref($self)."::accession is deprecated, ". + "use accession_number() instead"); + return $self->accession_number(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/BaseSeqProcessor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/BaseSeqProcessor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,306 @@ +# $Id: BaseSeqProcessor.pm,v 1.2 2002/11/02 21:04:19 lapp Exp $ +# +# BioPerl module for Bio::Seq::BaseSeqProcessor +# +# Cared for by Hilmar Lapp <hlapp at gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::BaseSeqProcessor - Base implementation for a SequenceProcessor + +=head1 SYNOPSIS + + # you need to derive your own processor from this one + +=head1 DESCRIPTION + +This provides just a basic framework for implementations of +L<Bio::Factory::SequenceProcessorI>. + +Essentially what it does is support a parameter to new() to set +sequence factory and source stream, and a next_seq() implementation +that will use a queue to be filled by a class overriding +process_seq(). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bioperl.org/bioperl-bugs/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::BaseSeqProcessor; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Factory::SequenceProcessorI; + +@ISA = qw(Bio::Root::Root Bio::Factory::SequenceProcessorI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Seq::BaseSeqProcessor(); + Function: Builds a new Bio::Seq::BaseSeqProcessor object + Returns : an instance of Bio::Seq::BaseSeqProcessor + Args : Named parameters. Currently supported are + -seqfactory the Bio::Factory::SequenceFactoryI object to use + -source_stream the Bio::Factory::SequenceStreamI object to + which we are chained + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($stream,$fact) = + $self->_rearrange([qw(SOURCE_STREAM SEQFACTORY)], @args); + + $self->{'_queue'} = []; + $self->sequence_factory($fact) if $fact; + $self->source_stream($stream) if $stream; + + return $self; +} + +=head1 L<Bio::Factory::SequenceProcessorI> methods + +=cut + +=head2 source_stream + + Title : source_stream + Usage : $obj->source_stream($newval) + Function: Get/set the source sequence stream for this sequence + processor. + + Example : + Returns : A Bio::Factory::SequenceStreamI compliant object + Args : on set, new value (a Bio::Factory::SequenceStreamI compliant + object) + + +=cut + +sub source_stream{ + my $self = shift; + + if(@_) { + my $stream = shift; + my $fact = $stream->sequence_factory(); + $self->sequence_factory($fact) + unless $self->sequence_factory() || (! $fact); + return $self->{'source_stream'} = $stream; + } + return $self->{'source_stream'}; +} + +=head1 L<Bio::Factory::SequenceStreamI> methods + +=cut + +=head2 next_seq + + Title : next_seq + Usage : $seq = stream->next_seq + Function: Reads the next sequence object from the stream and returns it. + + This implementation will obtain objects from the source + stream as necessary and pass them to process_seq() for + processing. This method will return the objects one at a + time that process_seq() returns. + + Returns : a Bio::Seq sequence object + Args : none + +See L<Bio::Factory::SequenceStreamI::next_seq> + +=cut + +sub next_seq{ + my $self = shift; + my $seq; + + # if the queue is empty, fetch next from source and process it + if(@{$self->{'_queue'}} == 0) { + my @seqs = (); + while($seq = $self->source_stream->next_seq()) { + @seqs = $self->process_seq($seq); + # we may get zero seqs returned + last if @seqs; + } + push(@{$self->{'_queue'}}, @seqs) if @seqs; + } + # take next from the queue of seqs + $seq = shift(@{$self->{'_queue'}}); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object into the stream + + This implementation passes the sequences to the source + stream unaltered. You need to override this in order to + have sequence objects altered before output. + + Returns : 1 for success and 0 for error + Args : Bio::Seq object + +=cut + +sub write_seq{ + return shift->source_stream->write_seq(@_); +} + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($seqfactory) + Function: Get the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : none + + +=cut + +sub sequence_factory{ + my $self = shift; + + return $self->{'sequence_factory'} = shift if @_; + return $self->{'sequence_factory'}; +} + +=head2 object_factory + + Title : object_factory + Usage : $obj->object_factory($newval) + Function: This is an alias to sequence_factory with a more generic name. + Example : + Returns : a L<Bio::Factory::ObjectFactoryI> compliant object + Args : on set, new value (a L<Bio::Factory::ObjectFactoryI> + compliant object or undef, optional) + + +=cut + +sub object_factory{ + return shift->sequence_factory(@_); +} + +=head2 close + + Title : close + Usage : + Function: Closes the stream. We override this here in order to cascade + to the source stream. + Example : + Returns : + Args : none + + +=cut + +sub close{ + my $self = shift; + return $self->source_stream() ? $self->source_stream->close(@_) : 1; +} + +=head1 To be overridden by a derived class + +=cut + +=head2 process_seq + + Title : process_seq + Usage : + Function: This is the method that is supposed to do the actual + processing. It needs to be overridden to do what you want + it to do. + + Generally, you do not have to override or implement any other + method to derive your own sequence processor. + + The implementation provided here just returns the unaltered + input sequence and hence is not very useful other than + serving as a neutral default processor. + + Example : + Returns : An array of zero or more Bio::PrimarySeqI (or derived + interface) compliant object as the result of processing the + input sequence. + Args : A Bio::PrimarySeqI (or derived interface) compliant object + to be processed. + + +=cut + +sub process_seq{ + my ($self,$seq) = @_; + + return ($seq); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/EncodedSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/EncodedSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,592 @@ +# $Id: EncodedSeq.pm,v 1.5.2.1 2003/04/28 12:11:57 jason Exp $ +# +# BioPerl module for Bio::Seq::EncodedSeq +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::EncodedSeq - subtype of L<Bio::LocatableSeq|Bio::LocatableSeq> to store DNA that encodes a protein + +=head1 SYNOPSIS + + $obj = new Bio::Seq::EncodedSeq(-seq => $dna, + -encoding => "CCCCCCCIIIIICCCCC", + -start => 1, + -strand => 1, + -length => 17); + + # splice out (and possibly revcomp) the coding sequence + $cds = obj->cds; + + # obtain the protein translation of the sequence + $prot = $obj->translate; + + # other access/inspection routines as with Bio::LocatableSeq and + # Bio::SeqI; note that coordinates are relative only to the DNA + # sequence, not it's implicit encoded protein sequence. + +=head1 DESCRIPTION + +Bio::Seq::EncodedSeq is a L<Bio::LocatableSeq|Bio::LocatableSeq> +object that holds a DNA sequence as well as information about the +coding potential of that DNA sequence. It is meant to be useful in an +alignment context, where the DNA may contain frameshifts, gaps and/or +introns, or in describing the transcript of a gene. An EncodedSeq +provides the ability to access the "spliced" coding sequence, meaning +that all introns and gaps are removed, and any frameshifts are +adjusted to provide a "clean" CDS. + +In order to make simultaneous use of either the DNA or the implicit +encoded protein sequence coordinates, please see +L<Bio::Coordinate::EncodingPair>. + +=head1 ENCODING + +We use the term "encoding" here to refer to the series of symbols that +we use to identify which residues of a DNA sequence are protein-coding +(i.e. part of a codon), intronic, part of a 5' or 3', frameshift +"mutations", etc. From this information, a Bio::Seq::EncodedSeq is +able to "figure out" its translational CDS. There are two sets of +coding characters, one termed "implicit" and one termed "explicit". + +The "implict" encoding is a bit simpler than the "explicit" encoding: +'C' is used for any nucleotide that's part of a codon, 'U' for any +UTR, etc. The full list is shown below: + + Code Meaning + ---- ------- + C coding + I intronic + U untranslated + G gapped (for use in alignments) + F a "forward", +1 frameshift + B a "backward", -1 frameshift + +The "explicit" encoding is just an expansion of the "implicit" +encoding, to denote phase: + + Code Meaning + ---- ------- + C coding, 1st codon position + D coding, 2nd codon position + E coding, 3rd codon position + + I intronic, phase 0 (relative to intron begin) + J intronic, phase 1 + K intronic, phase 2 + + U untranslated 3'UTR + V untranslated 5'UTR + + G gapped (for use in alignments) + F a "forward", +1 frameshift + B a "backward", -1 frameshift + +Note that the explicit coding is meant to provide easy access to +position/phase specific nucleotides: + + $obj = new Bio::Seq::EncodedSeq (-seq => "ACAATCAGACTACG...", + -encoding => "CCCCCCIII..." + ); + + # fetch arrays of nucleotides at each codon position: + my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1); + my @pos2 = $obj->dnaseq(encoding => 'D'); + my @pos3 = $obj->dnaseq(encoding => 'E'); + + # fetch arrays of "3-1" codon dinucleotides, useful for genomic + # signature analyses without compounding influences of codon bias: + my @pairs = $obj->dnaseq(encoding => 'EC'); + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Seq::EncodedSeq; + +use strict; +use vars qw(@ISA); + +use Bio::LocatableSeq; + +@ISA = qw(Bio::LocatableSeq); + +=head2 new + + Title : new + Usage : $obj = Bio::Seq::EncodedSeq->new(-seq => "AGTACGTGTCATG", + -encoding => "CCCCCCFCCCCCC", + -id => "myseq", + -start => 1, + -end => 13, + -strand => 1 + ); + Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA + sequence + Returns : a new Bio::Seq::EncodedSeq object + + Args : seq - primary nucleotide sequence used to encode the + protein; note that any positions involved in a + gap ('G') or backward frameshift ('B') should + have one or more gap characters; if the encoding + specifies G or B, but no (or not enough) gap + characters exist, *they'll be added*; similary, + if there are gap characters without a + corresponding G or B encoding, G's will be + inserted into the encoding. This allows some + flexibility in specifying your sequence and + coding without having to calculate a lot of the + encoding for yourself. + + encoding - a string of characters (see Encoding Table) + describing backwards frameshifts implied by the + encoding but not present in the sequence will be + added (as '-'s) to the sequence. If not + supplied, it will be assumed that all positions + are coding (C). Encoding may include either + implicit phase encoding characters (i.e. "CCC") + and/or explicit encoding characters (i.e. "CDE"). + Additionally, prefixed numbers may be used to + denote repetition (i.e. "27C3I28C"). + + Alternatively, encoding may be a hashref + datastructure, with encoding characters as keys + and Bio::LocationI objects (or arrayrefs of + Bio::LocationI objects) as values, e.g.: + + { C => [ Bio::Location::Simple->new(1,9), + Bio::Location::Simple->new(11,13) ], + F => Bio::Location::Simple->new(10,10), + } # same as "CCCCCCCCCFCCC" + + Note that if the location ranges overlap, the + behavior of the encoding will be undefined + (well, it will be defined, but only according to + the order in which the hash keys are read, which + is basically undefined ... just don't do that). + + id, start, end, strand - as with Bio::LocatableSeq; note + that the coordinates are relative to the + encoding DNA sequence, not the implicit protein + sequence. If strand is reversed, then the + encoding is assumed to be relative to the + reverse strand as well. + +=cut + +#' + +sub new { + + my ($self, @args) = @_; + $self = $self->SUPER::new(@args, -alphabet => 'dna'); + my ($enc) = $self->_rearrange([qw(ENCODING)], @args); + # set the real encoding: + if ($enc) { + $self->encoding($enc); + } else { + $self->_recheck_encoding; + } + return $self; +} + +=head2 encoding + + Title : encoding + Usage : $obj->encoding("CCCCCC"); + $obj->encoding( -encoding => { I => $location } ); + $enc = $obj->encoding(-explicit => 1); + $enc = $obj->encoding("CCCCCC", -explicit => 1); + $enc = $obj->encoding(-location => $location, + -explicit => 1, + -absolute => 1 ); + Function: get/set the objects encoding, either globally or by location(s). + Returns : the (possibly new) encoding string. + Args : encoding - see the encoding argument to the new() function. + + explicit - whether or not to return explicit phase + information in the coding (i.e. "CCC" becomes + "CDE", "III" becomes "IJK", etc); defaults to 0. + + location - optional; location to get/set the encoding. + Defaults to the entire sequence. + + absolute - whether or not the locational elements (either + in the encoding hashref or the location + argument) are relative to the absolute start/end + of the Bio::LocatableSeq, or to the internal, + relative coordinate system (beginning at 1); + defaults to 0 (i.e. relative) + +=cut + +sub encoding { + + my ($self, @args) = @_; + my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args); + + if (!$enc) { + # do nothing; _recheck_encoding will fix for us, if necessary + } elsif (ref $enc eq 'HASH') { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "Hashref functionality not yet implemented;\nplease email me if you really need this."); + #TODO: finish all this + while (my ($char, $locs) = each %$enc) { + if (ref $locs eq 'ARRAY') { + } elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) { + } else { + $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}"); + } + } + } elsif (! ref $enc) { + $enc = uc $enc; + $exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o); + + if ($enc =~ m/\d/o) { # numerically "enhanced" encoding + my $numenc = $enc; + $enc = ''; + while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) { + my ($num, $char) = ($1, $2); + $num = 1 unless $num; + $enc .= $char x $num; + } + } + + if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) { + $self->throw("Unrecognized character '$1' in implicit encoding"); + } elsif ($enc =~ m/[^CDEIJKUVGFB]/) { + $self->throw("Unrecognized character '$1' in explicit encoding"); + } + + if ($loc) { # a global location, over which to apply the specified encoding. + + # balk if too many non-gap characters present in encoding: + my ($ct) = $enc =~ tr/GB/GB/; + $ct = length($enc) - $ct; + $self->throw("Location length doesn't match number of coding chars in encoding!") + if ($loc->location_type eq 'EXACT' && $loc->length != $ct); + + my $start = $loc->start; + my $end = $loc->end; + + # strip any encoding that hangs off the ends of the sequence: + if ($start < $self->start) { + my $diff = $self->start - $start; + $start = $self->start; + $enc = substr($enc, $diff); + } + if ($end > $self->end) { + my $diff = $end - $self->end; + $end = $self->end; + $enc = substr($enc, -$diff); + } + + my $currenc = $self->{_encoding}; + my $currseq = $self->seq; + + my ($spanstart, $spanend) = ($self->column_from_residue_number($start), + $self->column_from_residue_number($end) ); + + if ($currseq) { + # strip any gaps in sequence spanned by this location: + my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}}) + (.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}}) + (.*) + /x; + $in =~ s/[\.\-]+//g; + $currseq = $before . $in . $after; + # change seq without changing the alphabet + $self->seq($currseq,$self->alphabet()); + } + + $currenc = reverse $currenc if $self->strand < 0; + substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1), + $self->strand >= 0 ? $enc : reverse $enc); + $currenc = reverse $currenc if $self->strand < 0; + + $self->{_encoding} = $currenc; + $self->_recheck_encoding; + + $currenc = $self->{_encoding}; + $currenc = reverse $currenc if $self->strand < 0; + $enc = substr($currenc, $spanstart, length $enc); + $enc = reverse $enc if $self->strand < 0; + + return $exp ? $enc: $self->_convert2implicit($enc); + + } else { + # presume a global redefinition; strip any current gap + # characters in the sequence so they don't corrupt the + # encoding + my $dna = $self->seq; + $dna =~ s/[\.\-]//g; + $self->seq($dna, 'dna'); + $self->{_encoding} = $enc; + } + } else { + $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}"); + } + + $self->_recheck_encoding(); + + return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding}); +} + +sub _convert2implicit { + + my ($self, $enc) = @_; + + $enc =~ s/[DE]/C/g; + $enc =~ s/[JK]/I/g; + $enc =~ s/V/U/g; + + return $enc; +} + +sub _recheck_encoding { + + my $self = shift; + + my @enc = split //, ($self->{_encoding} || ''); + + my @nt = split(//, $self->SUPER::seq); + @nt = reverse @nt if $self->strand && $self->strand < 0; + + # make sure an encoding exists! + @enc = ('C') x scalar grep { !/[\.\-]/o } @nt + unless @enc; + + # check for gaps to be truly present in the sequence + # and vice versa + my $i; + for ($i = 0 ; $i < @nt && $i < @enc ; $i++) { + if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) { + splice(@enc, $i, 0, 'G'); + } elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) { + splice(@nt, $i, 0, '-'); + } + } + if ($i < @enc) { + # extra encoding; presumably all gaps? + for ( ; $i < @enc ; $i++) { + if ($enc[$i] =~ m/[GB]/o) { + push @nt, '-'; + } else { + $self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc])); + } + } + } elsif ($i < @nt) { + for ( ; $i < @nt ; $i++) { + if ($nt[$i] =~ m/[\.\-]/o) { + push @enc, 'G'; + } else { + push @enc, 'C'; + } + } + } + + my @cde_array = qw(C D E); + my @ijk_array = qw(I J K); + # convert any leftover implicit coding into explicit coding + my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0); + for ($i = 0 ; $i < @enc ; $i++) { + if ($enc[$i] =~ m/[CDE]/o) { + my $temp_index = $Cct %3; + $enc[$i] = $cde_array[$temp_index]; + $Cct++; $Ict = 0; $Uct = 1; + $self->warn("3' untranslated encoding (V) seen prior to other coding symbols") + if ($Vct && !$Vwarned++); + } elsif ($enc[$i] =~ m/[IJK]/o) { + $enc[$i] = $ijk_array[$Ict % 3]; + $Ict++; $Uct = 1; + $self->warn("3' untranslated encoding (V) seen before other coding symbols") + if ($Vct && !$Vwarned++); + } elsif ($enc[$i] =~ m/[UV]/o) { + if ($Uct == 1) { + $enc[$i] = 'V'; + $Vct = 1; + } + } elsif ($enc[$i] eq 'B') { + $Cct++; $Ict++ + } elsif ($enc[$i] eq 'G') { + # gap; leave alone + } + } + + @nt = reverse @nt if $self->strand && $self->strand < 0; + + $self->{'seq'} = join('', @nt); + # $self->seq(join('', @nt), 'dna'); + $self->{_encoding} = join '', @enc; +} + +=head2 cds + + Title : cds + Usage : $cds = $obj->cds(-nogaps => 1); + Function: obtain the "spliced" DNA sequence, by removing any + nucleotides that participate in an UTR, forward frameshift + or intron, and replacing any unknown nucleotide implied by + a backward frameshift or gap with N's. + Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only + of "CCCC..". + Args : nogaps - strip any gap characters (resulting from 'G' or 'B' + encodings), rather than replacing them with N's. + +=cut + +sub cds { + + my ($self, @args) = @_; + + my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args); + $nogaps = 0 unless defined $nogaps; + + my @nt = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq; + my @enc = split //, $self->_convert2implicit($self->{_encoding}); + + my ($start, $end) = (0, scalar @nt); + + if ($loc) { + $start = $loc->start; + $start++ if $loc->location_type eq 'IN-BETWEEN'; + $start = $self->column_from_residue_number($start); + $start--; + + $end = $loc->end; + $end = $self->column_from_residue_number($end); + + ($start, $end) = ($end, $start) if $self->strand < 0; + $start--; + } + + for (my $i = $start ; $i < $end ; $i++) { + if ($enc[$i] eq 'I' || $enc[$i] eq 'U' || $enc[$i] eq 'F') { + # remove introns, untranslated and forward frameshift nucleotides + $nt[$i] = undef; + } elsif ($enc[$i] eq 'G' || $enc[$i] eq 'B') { + # replace gaps and backward frameshifts with N's, unless asked not to. + $nt[$i] = $nogaps ? undef : 'N'; + } + } + + return ($self->can_call_new ? ref($self) : __PACKAGE__)->new + (-seq => join('', grep { defined } @nt[$start..--$end]), + -start => $self->start, + -end => $self->end, + -strand => 1, -alphabet => 'dna'); +} + +=head2 translate + + Title : translate + Usage : $prot = $obj->translate(@args); + Function: obtain the protein sequence encoded by the underlying DNA + sequence; same as $obj->cds()->translate(@args). + Returns : a Bio::PrimarySeq object. + Args : same as the translate() function of Bio::PrimarySeqI + +=cut + +sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) }; + +=head2 protseq + + Title : seq + Usage : $protseq = $obj->protseq(); + Function: obtain the raw protein sequence encoded by the underlying + DNA sequence; This is the same as calling + $obj->translate()->seq(); + Returns : a string of single-letter amino acid codes + Args : same as the seq() function of Bio::PrimarySeq; note that this + function may not be used to set the protein sequence; see + the dnaseq() function for that. + +=cut + +sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq }; + +=head2 dnaseq + + Title : dnaseq + Usage : $dnaseq = $obj->dnaseq(); + $obj->dnaseq("ACGTGTCGT", "CCCCCCCCC"); + $obj->dnaseq(-seq => "ATG", + -encoding => "CCC", + -location => $loc ); + @introns = $obj->$dnaseq(-encoding => 'I') + Function: get/set the underlying DNA sequence; will overwrite any + current DNA and/or encoding information present. + Returns : a string of single-letter nucleotide codes, including any + gaps implied by the encoding. + Args : seq - the DNA sequence to be used as a replacement + encoding - the encoding of the DNA sequence (see the new() + constructor); defaults to all 'C' if setting a + new DNA sequence. If no new DNA sequence is + being provided, then the encoding is used as a + "filter" for which to return fragments of + non-overlapping DNA that match the encoding. + location - optional, the location of the DNA sequence to + get/set; defaults to the entire sequence. + +=cut + +sub dnaseq { + + my ($self, @args) = @_; + my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args); + + $self + +} + +# need to overload this so that we truncate both the seq and the encoding! +sub trunc { + + my ($self, $start, $end) = @_; + my $new = $self->SUPER::trunc($start, $end); + $start--; + my $enc = $self->{_encoding}; + $enc = reverse $enc if $self->strand < 0; + $enc = substr($enc, $start, $end - $start); + $enc = reverse $enc if $self->strand < 0; + $new->encoding($enc); + return $new; +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/LargePrimarySeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/LargePrimarySeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,292 @@ +# $Id: LargePrimarySeq.pm,v 1.27 2002/12/01 00:05:21 jason Exp $ +# +# BioPerl module for Bio::Seq::LargePrimarySeq +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself +# +# updated to utilize File::Temp - jason 2000-12-12 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::LargePrimarySeq - PrimarySeq object that stores sequence as +files in the tempdir (as found by File::Temp) or the default method in +Bio::Root::Root + +=head1 SYNOPSIS + + # normal primary seq usage + +=head1 DESCRIPTION + +This object stores a sequence as a series of files in a temporary +directory. The aim is to allow someone the ability to store very large +sequences (eg, E<gt> 100MBases) in a file system without running out of memory +(eg, on a 64 MB real memory machine!). + +Of course, to actually make use of this functionality, the programs +which use this object B<must> not call $primary_seq-E<gt>seq otherwise the +entire sequence will come out into memory and probably paste your +machine. However, calls $primary_seq-E<gt>subseq(10,100) will cause only +90 characters to be brought into real memory. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, Jason Stajich + +Email birney@ebi.ac.uk +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::LargePrimarySeq; +use vars qw($AUTOLOAD @ISA); +use strict; + +use Bio::PrimarySeq; +use Bio::Root::IO; + +@ISA = qw(Bio::PrimarySeq Bio::Root::IO); + +sub new { + my ($class, %params) = @_; + + # don't let PrimarySeq set seq until we have + # opened filehandle + + my $seq = $params{'-seq'} || $params{'-SEQ'}; + if($seq ) { + delete $params{'-seq'}; + delete $params{'-SEQ'}; + } + my $self = $class->SUPER::new(%params); + $self->_initialize_io(%params); + my $tempdir = $self->tempdir( CLEANUP => 1); + my ($tfh,$file) = $self->tempfile( DIR => $tempdir ); + + $tfh && $self->_fh($tfh); + $file && $self->_filename($file); + $self->length(0); + $seq && $self->seq($seq); + + return $self; +} + + +sub length { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'length'} = $value; + } + + return (defined $obj->{'length'}) ? $obj->{'length'} : 0; +} + +=head2 seq + + Title : seq + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub seq { + my ($self, $data) = @_; + if( defined $data ) { + if( $self->length() == 0) { + $self->add_sequence_as_string($data); + } else { + $self->warn("Trying to reset the seq string, cannot do this with a LargePrimarySeq - must allocate a new object"); + } + } + return $self->subseq(1,$self->length); +} + +=head2 subseq + + Title : subseq + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub subseq{ + my ($self,$start,$end) = @_; + my $string; + my $fh = $self->_fh(); + + if( ref($start) && $start->isa('Bio::LocationI') ) { + my $loc = $start; + if( $loc->length == 0 ) { + $self->warn("Expect location lengths to be > 0"); + return ''; + } elsif( $loc->end < $loc->start ) { + # what about circular seqs + $self->warn("Expect location start to come before location end"); + } + my $seq = ''; + if( $loc->isa('Bio::Location::SplitLocationI') ) { + foreach my $subloc ( $loc->sub_Location ) { + if(! seek($fh,$subloc->start() - 1,0)) { + $self->throw("Unable to seek on file $start:$end $!"); + } + my $ret = read($fh, $string, $subloc->length()); + if( !defined $ret ) { + $self->throw("Unable to read $start:$end $!"); + } + if( $subloc->strand < 0 ) { + $string = Bio::PrimarySeq->new(-seq => $string)->revcom()->seq(); + } + $seq .= $string; + } + } else { + if(! seek($fh,$loc->start()-1,0)) { + $self->throw("Unable to seek on file ".$loc->start.":". + $loc->end ." $!"); + } + my $ret = read($fh, $string, $loc->length()); + if( !defined $ret ) { + $self->throw("Unable to read ".$loc->start.":". + $loc->end ." $!"); + } + $seq = $string; + } + if( defined $loc->strand && + $loc->strand < 0 ) { + $seq = Bio::PrimarySeq->new(-seq => $seq)->revcom()->seq(); + } + return $seq; + } + if( $start <= 0 || $end > $self->length ) { + $self->throw("Attempting to get a subseq out of range $start:$end vs ". + $self->length); + } + if( $end < $start ) { + $self->throw("Attempting to subseq with end ($end) less than start ($start). To revcom use the revcom function with trunc"); + } + + if(! seek($fh,$start-1,0)) { + $self->throw("Unable to seek on file $start:$end $!"); + } + my $ret = read($fh, $string, $end-$start+1); + if( !defined $ret ) { + $self->throw("Unable to read $start:$end $!"); + } + return $string; +} + +=head2 add_sequence_as_string + + Title : add_sequence_as_string + Usage : $seq->add_sequence_as_string("CATGAT"); + Function: Appends additional residues to an existing LargePrimarySeq object. + This allows one to build up a large sequence without storing + entire object in memory. + Returns : Current length of sequence + Args : string to append + +=cut + +sub add_sequence_as_string{ + my ($self,$str) = @_; + my $len = $self->length + CORE::length($str); + my $fh = $self->_fh(); + if(! seek($fh,0,2)) { + $self->throw("Unable to seek end of file: $!"); + } + $self->_print($str); + $self->length($len); +} + + +=head2 _filename + + Title : _filename + Usage : $obj->_filename($newval) + Function: + Example : + Returns : value of _filename + Args : newvalue (optional) + + +=cut + +sub _filename{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_filename'} = $value; + } + return $obj->{'_filename'}; + +} +=head2 alphabet + + Title : alphabet + Usage : $obj->alphabet($newval) + Function: + Example : + Returns : value of alphabet + Args : newvalue (optional) + + +=cut + +sub alphabet{ + my ($self,$value) = @_; + if( defined $value) { + $self->SUPER::alphabet($value); + } + return $self->SUPER::alphabet() || 'dna'; + +} + +sub DESTROY { + my $self = shift; + my $fh = $self->_fh(); + close($fh) if( defined $fh ); + # this should be handled by Tempfile removal, but we'll unlink anyways. + unlink $self->_filename() if defined $self->_filename() && -e $self->_filename; + $self->SUPER::DESTROY(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/LargeSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/LargeSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +# $Id: LargeSeq.pm,v 1.11 2002/10/22 07:38:40 lapp Exp $ +# +# BioPerl module for Bio::Seq::LargeSeq +# +# Cared for by Ewan Birney, Jason Stajich +# +# Copyright Ewan Birney, Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::LargeSeq - SeqI compliant object that stores sequence as files in /tmp + +=head1 SYNOPSIS + + # normal primary seq usage + +=head1 DESCRIPTION + +This object stores a sequence as a series of files in a temporary +directory. The aim is to allow someone the ability to store very large +sequences (eg, E<gt> 100MBases) in a file system without running out of memory +(eg, on a 64 MB real memory machine!). + +Of course, to actually make use of this functionality, the programs +which use this object B<must> not call $primary_seq-E<gt>seq otherwise the +entire sequence will come out into memory and probably paste your +machine. However, calls $primary_seq-E<gt>subseq(10,100) will cause only +90 characters to be brought into real memory. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::LargeSeq; +use vars qw($AUTOLOAD @ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Objecttest 8, + +use Bio::Seq::LargePrimarySeq; +use Bio::Seq; + +@ISA = qw(Bio::Seq); + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($pseq) = $self->_rearrange([qw(PRIMARYSEQ)], @args); + + if( ! defined $pseq ) { + $pseq = new Bio::Seq::LargePrimarySeq(@args); + } + $self->primary_seq($pseq); + + return $self; +} + + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + + Example : + Returns : a fresh Bio::SeqI object + Args : + +=cut + +sub trunc { + my ($self, $s, $e) = @_; + return new Bio::Seq::LargeSeq( + '-display_id' => $self->display_id, + '-accession_number' => $self->accession_number, + '-desc' => $self->desc, + '-alphabet' => $self->alphabet, + -primaryseq => + $self->primary_seq->trunc($s,$e)); + +} + +=head2 Bio::Seq::LargePrimarySeq methods + +=cut + +=head2 add_sequence_as_string + + Title : add_sequence_as_string + Usage : $seq->add_sequence_as_string("CATGAT"); + Function: Appends additional residues to an existing LargePrimarySeq object. + This allows one to build up a large sequence without storing + entire object in memory. + Returns : Current length of sequence + Args : string to append + +=cut + +sub add_sequence_as_string { + my ($self,$str) = @_; + return $self->primary_seq->add_sequence_as_string($str); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/PrimaryQual.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/PrimaryQual.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,459 @@ +# $Id: PrimaryQual.pm,v 1.17 2002/10/22 07:38:40 lapp Exp $ +# +# bioperl module for Bio::PrimaryQual +# +# Cared for by Chad Matsalla <bioinformatics@dieselwurks.com> +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::PrimaryQual - Bioperl lightweight Quality Object + +=head1 SYNOPSIS + + use Bio::Seq::PrimaryQual; + + # you can use either a space-delimited string for quality + + my $string_quals = "10 20 30 40 50 40 30 20 10"; + my $qualobj = Bio::Seq::PrimaryQual->new + ( '-qual' => $string_quals, + '-id' => 'QualityFragment-12', + '-accession_number' => 'X78121', + ); + + # _or_ you can use an array of quality values + + my @q2 = split/ /,$string_quals; + $qualobj = Bio::Seq::PrimaryQual->new( '-qual' => \@q2, + '-primary_id' => 'chads primary_id', + '-desc' => 'chads desc', + '-accession_number' => 'chads accession_number', + '-id' => 'chads id' + ); + + # to get the quality values out: + + my @quals = @{$qualobj->qual()}; + + # to give _new_ quality values + + my $newqualstring = "50 90 1000 20 12 0 0"; + $qualobj->qual($newqualstring); + + +=head1 DESCRIPTION + +This module provides a mechanism for storing quality +values. Much more useful as part of +Bio::Seq::SeqWithQuality where these quality values +are associated with the sequence information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Seq::PrimaryQual; +use vars qw(@ISA %valid_type); +use strict; + +use Bio::Root::Root; +use Bio::Seq::QualI; + +@ISA = qw(Bio::Root::Root Bio::Seq::QualI); + + +=head2 new() + + Title : new() + Usage : $qual = Bio::Seq::PrimaryQual->new + ( -qual => '10 20 30 40 50 50 20 10', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new Bio::Seq::PrimaryQual object from basic + constructors, being a string _or_ a reference to an array for the + sequence and strings for id and accession_number. Note that you + can provide an empty quality string. + Returns : a new Bio::Seq::PrimaryQual object + +=cut + + + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + # default: turn ON the warnings (duh) + my($qual,$id,$acc,$pid,$desc,$given_id) = + $self->_rearrange([qw(QUAL + DISPLAY_ID + ACCESSION_NUMBER + PRIMARY_ID + DESC + ID + )], + @args); + if( defined $id && defined $given_id ) { + if( $id ne $given_id ) { + $self->throw("Provided both id and display_id constructor functions. [$id] [$given_id]"); + } + } + if( defined $given_id ) { $id = $given_id; } + + # note: the sequence string may be empty + $self->qual($qual ? $qual : []); + $id && $self->display_id($id); + $acc && $self->accession_number($acc); + $pid && $self->primary_id($pid); + $desc && $self->desc($desc); + + return $self; +} + +=head2 qual() + + Title : qual() + Usage : @quality_values = @{$obj->qual()}; + Function: Returns the quality as a reference to an array containing the + quality values. The individual elements of the quality array are + not validated and can be any numeric value. + Returns : A reference to an array. + +=cut + +sub qual { + my ($self,$value) = @_; + + if( ! defined $value || length($value) == 0 ) { + $self->{'qual'} ||= []; + } elsif( ref($value) =~ /ARRAY/i ) { + # if the user passed in a reference to an array + $self->{'qual'} = $value; + } elsif(! $self->validate_qual($value)){ + $self->throw("Attempting to set the quality to [$value] which does not look healthy"); + } else { + $self->{'qual'} = [split(/\s+/,$value)]; + } + + return $self->{'qual'}; +} + +=head2 validate_qual($qualstring) + + Title : validate_qual($qualstring) + Usage : print("Valid.") if { &validate_qual($self,$qualities); } + Function: Make sure that the quality, if it has length > 0, contains at + least one digit. Note that quality strings are parsed into arrays + using split/\d+/,$quality_string, so make sure that your quality + scalar looks like this if you want it to be parsed properly. + Returns : 1 for a valid sequence (WHY? Shouldn\'t it return 0? <boggle>) + Args : a scalar (any scalar, why PrimarySeq author?) and a scalar + containing the string to validate. + +=cut + +sub validate_qual { + # how do I validate quality values? + # \d+\s+\d+..., I suppose + my ($self,$qualstr) = @_; + # why the CORE?? -- (Because Bio::PrimarySeqI namespace has a + # length method, you have to qualify + # which length to use) + return 0 if (!defined $qualstr || CORE::length($qualstr) <= 0); + return 1 if( $qualstr =~ /\d/); + + return 0; +} + +=head2 subqual($start,$end) + + Title : subqual($start,$end) + Usage : @subset_of_quality_values = @{$obj->subqual(10,40)}; + Function: returns the quality values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be equal. + Returns : A reference to an array. + Args : a start position and an end position + +=cut + + +sub subqual { + my ($self,$start,$end) = @_; + + if( $start > $end ){ + $self->throw("in subqual, start [$start] has to be greater than end [$end]"); + } + + if( $start <= 0 || $end > $self->length ) { + $self->throw("You have to have start positive and length less than the total length of sequence [$start:$end] Total ".$self->length.""); + } + + # remove one from start, and then length is end-start + + $start--; + $end--; + my @sub_qual_array = @{$self->{qual}}[$start..$end]; + + # return substr $self->seq(), $start, ($end-$start); + return \@sub_qual_array; + +} + +=head2 display_id() + + Title : display_id() + Usage : $id_string = $obj->display_id(); + Function: returns the display id, aka the common name of the Quality + object. + The semantics of this is that it is the most likely string to be + used as an identifier of the quality sequence, and likely to have + "human" readability. The id is equivalent to the ID field of the + GenBank/EMBL databanks and the id field of the Swissprot/sptrembl + database. In fasta format, the >(\S+) is presumed to be the id, + though some people overload the id to embed other information. + Bioperl does not use any embedded information in the ID field, + and people are encouraged to use other mechanisms (accession + field for example, or extending the sequence object) to solve + this. Notice that $seq->id() maps to this function, mainly for + legacy/convience issues + Returns : A string + Args : None + +=cut + +sub display_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'display_id'} = $value; + } + return $obj->{'display_id'}; + +} + +=head2 accession_number() + + Title : accession_number() + Usage : $unique_biological_key = $obj->accession_number(); + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the unique id + for the implemetation, allowing multiple objects to have the same + accession number in a particular implementation. For sequences + with no accession number, this method should return "unknown". + Returns : A string + Args : None + +=cut + +sub accession_number { + my( $obj, $acc ) = @_; + + if (defined $acc) { + $obj->{'accession_number'} = $acc; + } else { + $acc = $obj->{'accession_number'}; + $acc = 'unknown' unless defined $acc; + } + return $acc; +} + +=head2 primary_id() + + Title : primary_id() + Usage : $unique_implementation_key = $obj->primary_id(); + Function: Returns the unique id for this object in this implementation. + This allows implementations to manage their own object ids in a + way the implementaiton can control clients can expect one id to + map to one object. For sequences with no accession number, this + method should return a stringified memory location. + Returns : A string + Args : None + +=cut + +sub primary_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'primary_id'} = $value; + } + return $obj->{'primary_id'}; + +} + +=head2 desc() + + Title : desc() + Usage : $qual->desc($newval); + $description = $qual->desc(); + Function: Get/set description text for a qual object + Example : + Returns : Value of desc + Args : newvalue (optional) + +=cut + +sub desc { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'desc'} = $value; + } + return $obj->{'desc'}; + +} + +=head2 id() + + Title : id() + Usage : $id = $qual->id(); + Function: Return the ID of the quality. This should normally be (and + actually is in the implementation provided here) just a synonym + for display_id(). + Returns : A string. + Args : None. + +=cut + +sub id { + my ($self,$value) = @_; + if( defined $value ) { + return $self->display_id($value); + } + return $self->display_id(); +} + +=head2 length() + + Title : length() + Usage : $length = $qual->length(); + Function: Return the length of the array holding the quality values. + Under most circumstances, this should match the number of quality + values but no validation is done when the PrimaryQual object is + constructed and non-digits could be put into this array. Is this + a bug? Just enough rope... + Returns : A scalar (the number of elements in the quality array). + Args : None. + +=cut + +sub length { + my $self = shift; + if (ref($self->{qual}) ne "ARRAY") { + $self->warn("{qual} is not an array here. Why? It appears to be ".ref($self->{qual})."(".$self->{qual}."). Good thing this can _never_ happen."); + } + return scalar(@{$self->{qual}}); +} + +=head2 qualat($position) + + Title : qualat($position) + Usage : $quality = $obj->qualat(10); + Function: Return the quality value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the first + two bases of the sequence. Start cannot be larger than end but can + be equal. + Returns : A scalar. + Args : A position. + +=cut + +sub qualat { + my ($self,$val) = @_; + my @qualat = @{$self->subqual($val,$val)}; + if (scalar(@qualat) == 1) { + return $qualat[0]; + } + else { + $self->throw("AAAH! qualat provided more then one quality."); + } +} + +=head2 to_string() + + Title : to_string() + Usage : $quality = $obj->to_string(); + Function: Return a textual representation of what the object contains. + For this module, this function will return: + qual + display_id + accession_number + primary_id + desc + id + length + Returns : A scalar. + Args : None. + +=cut + +sub to_string { + my ($self,$out,$result) = shift; + $out = "qual: ".join(',',@{$self->qual()}); + foreach (qw(display_id accession_number primary_id desc id)) { + $result = $self->$_(); + if (!$result) { $result = "<unset>"; } + $out .= "$_: $result\n"; + } + return $out; +} + + +sub to_string_automatic { + my ($self,$sub_result,$out) = shift; + foreach (sort keys %$self) { + print("Working on $_\n"); + eval { $self->$_(); }; + if ($@) { $sub_result = ref($_); } + elsif (!($sub_result = $self->$_())) { + $sub_result = "<unset>"; + } + if (ref($sub_result) eq "ARRAY") { + print("This thing ($_) is an array!\n"); + $sub_result = join(',',@$sub_result); + } + $out .= "$_: ".$sub_result."\n"; + } + return $out; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/PrimedSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/PrimedSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,480 @@ +# BioPerl module for Bio::PrimedSeq +# +# Cared for by Chad Matsalla <bioinformatics1@dieselwurks.com> +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 Bio::Seq::PrimedSeq + +Bio::Seq::PrimedSeq - A representation of a sequence and two primers flanking a +target region for amplification + +=head1 SYNOPSIS + + # create a sequence + my $sequence = "ctagctagctagctagctagctagctagctgatcgtagctagctagct"; + # create left and right primer seqfeatures + # unfortunately, I haven't created constructors for these yet. + my $left = Bio::SeqFeature::Primer(); + my $right = Bio::SeqFeature::Primer(); + # now create the PrimedSeq + $primedseq = new Bio::Seq::PrimedSeq( + -seq => $sequence, + -display_id => "chads_fantastic_sequence", + -LEFT_PRIMER => $left, + -RIGHT_PRIMER => $right, + -TARGET => '513,26' + -PRIMER_PRODUCT_SIZE_RANGE => '100-500' + -PRIMER_FILE_FLAG => '0' + -PRIMER_LIBERAL_BASE => '1' + -PRIMER_NUM_RETURN => '1' + -PRIMER_FIRST_BASE_INDEX => '1' + -PRIMER_EXPLAIN_FLAG => '1' + -PRIMER_PRODUCT_SIZE => '185' + ); + # get the amplified region + my $amplified_sequence = $primed_seq->get_amplified_sequence(); + +=head1 DESCRIPTION + +This module is a slightly glorified capsule containg a primed seqBuence. It was +created to address the fact that a primer is more the a seqfeature and there +need to be ways to represent the primer-sequence complex and the behaviors and +attributes that are associated with the complex. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::PrimedSeq; +use vars qw(@ISA); +use strict; + +use Bio::RangeI; + +@ISA = qw(Bio::Seq); + + +=head2 new + + Title : new() + Usage : $primed_sequence = new Bio::SeqFeature::Primer( -seq => $sequence, + -left_primer => $left_primer, + -right_primer => $right_primer); + Function: A constructor for an object representing a primed sequence + Returns : A Bio::Seq::PrimedSeq object + Args : + -seq => a Bio::Seq object + -left_primer => a Bio::SeqFeature::Primer object + -right_primer => a Bio::SeqFeature::Primer object + Many other parameters can be included including all of the output + parameters from the primer3 program. +Developer Notes: This is incomplete and doesn't work. As of ISMB2002 I am working on it. + + +=cut + +sub new { + my($class,@args) = @_; + my %arguments = @args; + my $self = $class->SUPER::new(@args); + # these are the absolute minimum components required to make + # a primedseq + my $newkey; + foreach my $key (sort keys %arguments) { + ($newkey = $key) =~ s/-//; + $self->{$newkey} = $arguments{$key}; + push @{$self->{arguments}},$newkey; + } + # and now the insurance- make sure that things are ok + if (!$self->{target_sequence} || !$self->{left_primer} || !$self->{right_primer} ) { + $self->throw("You must provide a target_sequence, left_primer, and right_primer to create this object."); + } + if (ref($self->{target_sequence}) ne "Bio::Seq") { + $self->throw("The target_sequence must be a Bio::Seq to create this object."); + } + if (ref($self->{left_primer}) ne "Bio::SeqFeature::Primer" || ref($self->{right_primer}) ne "Bio::SeqFeature::Primer") { + $self->throw("You must provide a left_primer and right_primer, both as Bio::SeqFeature::Primer to create this object."); + } + return $self; +} + + +=head2 get_left_primer + + Title : get_left_primer(); + Usage : $left_primer = $primedseq->get_left_primer(); + Function: A getter for the left primer in thie PrimedSeq object. + Returns : A Bio::SeqFeature::Primer object + Args : None. + +=cut + +sub get_left_primer() { + my $self = shift; + + + + +} + + + + + + + + + + + + +=head2 Bio::RangeI methods + +List of interfaces inherited from Bio::RangeI (see L<Bio::RangeI> +for details). + +=head2 start + + Title : start + Usage : $start = $feat->start + Function: Returns the start coordinate of the feature + Returns : integer + Args : none +Developer Notes: + This is entirely dependent on the sequence to which this primer is attached! + I think that there could be trouble if one takes this primer from sequence 1 + and naively place it on sequence 2 without updating this + ** This is incomplete at this time. +=cut + +sub start() { + my $self = shift; + + +} + + + + +=head2 end + + Title : end + Usage : $end = $feat->end + Function: Returns the end coordinate of the feature + Returns : integer + Args : none +Developer Notes: + ** This is incomplete at this time. +=cut + +sub end() { + my $self = shift; + + +} + +=head2 strand + + Title : strand + Usage : $strand = $feat->strand() + Function: Returns strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : none +Developer Notes: + ** This is incomplete at this time. + + +=cut + +sub strand() { + my $self = shift; +} + + +=head2 SeqFeatureI specific methods + +New method interfaces. + +=head2 sub_SeqFeature + + Title : sub_SeqFeature + Usage : @feats = $feat->sub_SeqFeature(); + Function: Returns an array of sub Sequence Features + Returns : An array + Args : none + +=cut + +sub sub_SeqFeature{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 display_id + + Title : display_id + Usage : $name = $feat->display_id() + Function: Returns the human-readable ID of the + feature for displays. + Returns : a string + Args : none + +=cut + +sub display_id { + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + Function: Returns the primary tag for a feature, + eg 'exon' + Returns : a string + Args : none + + +=cut + +sub primary_tag{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); + +} + +=head2 source_tag + + Title : source_tag + Usage : $tag = $feat->source_tag() + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub source_tag{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 has_tag + + Title : has_tag + Usage : $tag_exists = $self->has_tag('some_tag') + Function: + Returns : TRUE if the specified tag exists, and FALSE otherwise + Args : + + +=cut + +sub has_tag{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); + +} + +=head2 each_tag_value + + Title : each_tag_value + Usage : @values = $self->each_tag_value('some_tag') + Function: + Returns : An array comprising the values of the specified tag. + Args : + + +=cut + +sub each_tag_value { + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 all_tags + + Title : all_tags + Usage : @tags = $feat->all_tags() + Function: gives all tags for this feature + Returns : an array of strings + Args : none + + +=cut + +sub all_tags{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 gff_string + + Title : gff_string + Usage : $str = $feat->gff_string; + $str = $feat->gff_string($gff_formatter); + Function: Provides the feature information in GFF format. + + The implementation provided here returns GFF2 by default. If you + want a different version, supply an object implementing a method + gff_string() accepting a SeqFeatureI object as argument. E.g., to + obtain GFF1 format, do the following: + + my $gffio = Bio::Tools::GFF->new(-gff_version => 1); + $gff1str = $feat->gff_string($gff1io); + + Returns : A string + Args : Optionally, an object implementing gff_string(). + + +=cut + +sub gff_string{ + my ($self,$formatter) = @_; + + $formatter = $self->_static_gff_formatter unless $formatter; + return $formatter->gff_string($self); +} + +my $static_gff_formatter = undef; + +=head2 _static_gff_formatter + + Title : _static_gff_formatter + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _static_gff_formatter{ + my ($self,@args) = @_; + + if( !defined $static_gff_formatter ) { + $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2); + } + return $static_gff_formatter; +} + + + +=head1 RangeI methods + +These methods are inherited from RangeI and can be used +directly from a SeqFeatureI interface. Remember that a +SeqFeature is-a RangeI, and so wherever you see RangeI you +can use a feature ($r in the below documentation). + +=head2 overlaps + + Title : overlaps + Usage : if($feat->overlaps($r)) { do stuff } + if($feat->overlaps(200)) { do stuff } + Function: tests if $feat overlaps $r + Args : a RangeI to test for overlap with, or a point + Returns : true if the Range overlaps with the feature, false otherwise + + +=head2 contains + + Title : contains + Usage : if($feat->contains($r) { do stuff } + Function: tests whether $feat totally contains $r + Args : a RangeI to test for being contained + Returns : true if the argument is totaly contained within this range + + +=head2 equals + + Title : equals + Usage : if($feat->equals($r)) + Function: test whether $feat has the same start, end, strand as $r + Args : a RangeI to test for equality + Returns : true if they are describing the same range + + +=head1 Geometrical methods + +These methods do things to the geometry of ranges, and return +triplets (start, stop, strand) from which new ranges could be built. + +=head2 intersection + + Title : intersection + Usage : ($start, $stop, $strand) = $feat->intersection($r) + Function: gives the range that is contained by both ranges + Args : a RangeI to compare this one to + Returns : nothing if they do not overlap, or the range that they do overlap + +=head2 union + + Title : union + Usage : ($start, $stop, $strand) = $feat->union($r); + : ($start, $stop, $strand) = Bio::RangeI->union(@ranges); + Function: finds the minimal range that contains all of the ranges + Args : a range or list of ranges to find the union of + Returns : the range containing all of the ranges + +=cut + +=head2 location + + Title : location + Usage : my $location = $seqfeature->location() + Function: returns a location object suitable for identifying location + of feature on sequence or parent feature + Returns : Bio::LocationI object + Args : none + + +=cut + +sub location { + my ($self) = @_; + + $self->throw_not_implemented(); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/QualI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/QualI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,563 @@ +# $Id: QualI.pm,v 1.4 2002/10/22 07:38:40 lapp Exp $ +# +# BioPerl module for Bio::Seq::QualI +# +# Cared for by Chad Matsalla <bioinformatics@dieselwurks.com +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::QualI - Interface definition for a Bio::Seq::Qual + +=head1 SYNOPSIS + + # get a Bio::Seq::Qual compliant object somehow + + # to test this is a seq object + + $obj->isa("Bio::Seq::QualI") || $obj->throw("$obj does not implement the Bio::Seq::QualI interface"); + + # accessors + + $string = $obj->qual(); + $substring = $obj->subqual(12,50); + $display = $obj->display_id(); # for human display + $id = $obj->primary_id(); # unique id for this object, implementation defined + $unique_key= $obj->accession_number(); + # unique biological id + + + +=head1 DESCRIPTION + +This object defines an abstract interface to basic quality +information. PrimaryQual is an object just for the quality and its +name(s), nothing more. There is a pure perl implementation of this in +Bio::Seq::PrimaryQual. If you just want to use Bio::Seq::PrimaryQual +objects, then please read that module first. This module defines the +interface, and is of more interest to people who want to wrap their own +Perl Objects/RDBs/FileSystems etc in way that they "are" bioperl quality +objects, even though it is not using Perl to store the sequence etc. + +This interface defines what bioperl consideres necessary to "be" a +sequence of qualities, without providing an implementation of this. (An +implementation is provided in Bio::Seq::PrimaryQual). If you want to +provide a Bio::Seq::PrimaryQual 'compliant' object which in fact wraps +another object/database/out-of-perl experience, then this is the correct +thing to wrap, generally by providing a wrapper class which would inheriet +from your object and this Bio::Seq::QualI interface. The wrapper +class then would have methods lists in the "Implementation Specific +Functions" which would provide these methods for your object. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +This module is heavily based on Bio::Seq::PrimarySeq and is modeled after +or outright copies sections of it. Thanks Ewan! + +Email bioinformatics@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::QualI; +use vars qw(@ISA); +use strict; +use Carp; + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 qual() + + Title : qual() + Usage : @quality_values = @{$obj->qual()}; + Function: Returns the quality as a reference to an array containing the + quality values. The individual elements of the quality array are + not validated and can be any numeric value. + Returns : A reference to an array. + Status : + +=cut + +sub qual { + my ($self) = @_; + if( $self->can('throw') ) { + $self->throw("Bio::Seq::QualI definition of qual - implementing class did not provide this method"); + } else { + confess("Bio::Seq::QualI definition of qual - implementing class did not provide this method"); + } +} + +=head2 subqual($start,$end) + + Title : subqual($start,$end) + Usage : @subset_of_quality_values = @{$obj->subseq(10,40)}; + Function: returns the quality values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the first + two bases of the sequence. Start cannot be larger than end but can + be equal. + Returns : A reference to an array. + Args : a start position and an end position + + +=cut + +sub subqual { + my ($self) = @_; + + if( $self->can('throw') ) { + $self->throw("Bio::Seq::QualI definition of subqual - implementing class did not provide this method"); + } else { + confess("Bio::Seq::QualI definition of subqual - implementing class did not provide this method"); + } + +} + +=head2 display_id() + + Title : display_id() + Usage : $id_string = $obj->display_id() _or_ + $id_string = $obj->display_id($new_display_id); + Function: Returns the display id, aka the common name of the Quality + object. + The semantics of this is that it is the most likely string to be + used as an identifier of the quality sequence, and likely to have + "human" readability. The id is equivalent to the ID field of the + GenBank/EMBL databanks and the id field of the Swissprot/sptrembl + database. In fasta format, the >(\S+) is presumed to be the id, + though some people overload the id to embed other information. + Bioperl does not use any embedded information in the ID field, + and people are encouraged to use other mechanisms (accession field + for example, or extending the sequence object) to solve this. + Notice that $seq->id() maps to this function, mainly for + legacy/convience issues + Returns : A string + Args : If an arg is provided, it will replace the existing display_id + in the object. + + +=cut + +sub display_id { + my ($self) = @_; + + if( $self->can('throw') ) { + $self->throw("Bio::Seq::QualI definition of id - implementing class did not provide this method"); + } else { + confess("Bio::Seq::QualI definition of id - implementing class did not provide this method"); + } + +} + + +=head2 accession_number() + + Title : accession_number() + Usage : $unique_biological_key = $obj->accession_number(); _or_ + $unique_biological_key = $obj->accession_number($new_acc_num); + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the unique id + for the implemetation, allowing multiple objects to have the same + accession number in a particular implementation. For sequences + with no accession number, this method should return "unknown". + Returns : A string. + Args : If an arg is provided, it will replace the existing + accession_number in the object. + +=cut + +sub accession_number { + my ($self,@args) = @_; + + if( $self->can('throw') ) { + $self->throw("Bio::Seq::QualI definition of seq - implementing class did not provide this method"); + } else { + confess("Bio::Seq::QualI definition of seq - implementing class did not provide this method"); + } + +} + + + +=head2 primary_id() + + Title : primary_id() + Usage : $unique_implementation_key = $obj->primary_id(); _or_ + $unique_implementation_key = $obj->primary_id($new_prim_id); + Function: Returns the unique id for this object in this implementation. + This allows implementations to manage their own object ids in a + way the implementaiton can control clients can expect one id to + map to one object. For sequences with no accession number, this + method should return a stringified memory location. + Returns : A string + Args : If an arg is provided, it will replace the existing + primary_id in the object. + +=cut + +sub primary_id { + my ($self,@args) = @_; + + if( $self->can('throw') ) { + $self->throw("Bio::Seq::QualI definition of qual - implementing class did not provide this method"); + } else { + confess("Bio::Seq::QualI definition of qual - implementing class did not provide this method"); + } + +} + + +=head2 can_call_new() + + Title : can_call_new() + Usage : if( $obj->can_call_new ) { + $newobj = $obj->new( %param ); + } + Function: can_call_new returns 1 or 0 depending on whether an + implementation allows new constructor to be called. If a new + constructor is allowed, then it should take the followed hashed + constructor list. + $myobject->new( -qual => $quality_as_string, + -display_id => $id, + -accession_number => $accession, + ); + Example : + Returns : 1 or 0 + Args : + + +=cut + +sub can_call_new{ + my ($self,@args) = @_; + # we default to 0 here + return 0; +} + +=head2 qualat($position) + + Title : qualat($position) + Usage : $quality = $obj->qualat(10); + Function: Return the quality value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the first + two bases of the sequence. Start cannot be larger than end but can + be equal. + Returns : A scalar. + Args : A position. + +=cut + +sub qualat { + my ($self,$value) = @_; + if( $self->can('warn') ) { + $self->warn("Bio::Seq::QualI definition of qualat - implementing class did not provide this method"); + } else { + warn("Bio::Seq::QualI definition of qualat - implementing class did not provide this method"); + } + return ''; +} + +=head1 Optional Implementation Functions + +The following functions rely on the above functions. A implementing +class does not need to provide these functions, as they will be +provided by this class, but is free to override these functions. + +All of revcom(), trunc(), and translate() create new sequence +objects. They will call new() on the class of the sequence object +instance passed as argument, unless can_call_new() returns FALSE. In +the latter case a Bio::PrimarySeq object will be created. Implementors +which really want to control how objects are created (eg, for object +persistence over a database, or objects in a CORBA framework), they +are encouraged to override these methods + +=head2 revcom + + Title : revcom + Usage : @rev = @{$qual->revcom()}; + Function: Produces a new Bio::Seq::QualI implementing object which + is reversed from the original quality array. + The id is the same id as the orginal sequence, and the accession number + is also indentical. If someone wants to track that this sequence has + been reversed, it needs to define its own extensions + + To do an inplace edit of an object you can go: + + $qual = $qual->revcom(); + + This of course, causes Perl to handle the garbage collection of the old + object, but it is roughly speaking as efficient as an inplace edit. + Returns : A new (fresh) Bio::Seq::PrimaryQualI object + Args : none + +=cut + +sub revcom{ + my ($self) = @_; + # this is the cleanest way + my @qualities = @{$self->seq()}; + my @reversed_qualities = reverse(@qualities); + my $seqclass; + if($self->can_call_new()) { + $seqclass = ref($self); + } else { + $seqclass = 'Bio::Seq::PrimaryQual'; + # Wassat? + # $self->_attempt_to_load_Seq(); + } + # the \@reverse_qualities thing works simply because I will it to work. + my $out = $seqclass->new( '-qual' => \@reversed_qualities, + '-display_id' => $self->display_id, + '-accession_number' => $self->accession_number, + '-desc' => $self->desc() + ); + return $out; +} + +=head2 trunc() + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + Returns : a fresh Bio::Seq::QualI implementing object + Args : Two integers denoting first and last base of the sub-sequence. + + +=cut + +sub trunc { + my ($self,$start,$end) = @_; + + if( !$end ) { + if( $self->can('throw') ) { + $self->throw("trunc start,end"); + } else { + confess("[$self] trunc start,end"); + } + } + + if( $end < $start ) { + if( $self->can('throw') ) { + $self->throw("$end is smaller than $start. if you want to truncated and reverse complement, you must call trunc followed by revcom. Sorry."); + } else { + confess("[$self] $end is smaller than $start. If you want to truncated and reverse complement, you must call trunc followed by revcom. Sorry."); + } + } + + my $r_qual = $self->subqual($start,$end); + + my $seqclass; + if($self->can_call_new()) { + $seqclass = ref($self); + } else { + $seqclass = 'Bio::Seq::PrimaryQual'; + # wassat? + # $self->_attempt_to_load_Seq(); + } + my $out = $seqclass->new( '-qual' => $r_qual, + '-display_id' => $self->display_id, + '-accession_number' => $self->accession_number, + '-desc' => $self->desc() + ); + return $out; +} + + +=head2 translate() + + Title : translate() + Usage : $protein_seq_obj = $dna_seq_obj->translate + #if full CDS expected: + $protein_seq_obj = $cds_seq_obj->translate(undef,undef,undef,undef,1); + Function: Completely useless in this interface. + Returns : Nothing. + Args : Nothing. + +=cut + + +sub translate { + return 0; +} + + +=head2 id() + + Title : id() + Usage : $id = $qual->id() + Function: ID of the quality. This should normally be (and actually is in + the implementation provided here) just a synonym for display_id(). + Example : + Returns : A string. + Args : + + +=cut + +sub id { + my ($self)= @_; + return $self->display_id(); +} + +=head2 length() + + Title : length() + Usage : $length = $qual->length(); + Function: Return the length of the array holding the quality values. + Under most circumstances, this should match the number of quality + values but no validation is done when the PrimaryQual object is + constructed and non-digits could be put into this array. Is this a + bug? Just enough rope... + Returns : A scalar (the number of elements in the quality array). + Args : None. + +=cut + +sub length { + my ($self)= @_; + if( $self->can('throw') ) { + $self->throw("Bio::Seq::QualI definition of length - implementing class did not provide this method"); + } else { + confess("Bio::Seq::QualI definition of length - implementing class did not provide this method"); + } +} + + +=head2 desc() + + Title : desc() + Usage : $qual->desc($newval); + $description = $seq->desc(); + Function: Get/set description text for a qual object + Example : + Returns : value of desc + Args : newvalue (optional) + +=cut + +sub desc { + my ($self,$value) = @_; + if( $self->can('warn') ) { + $self->warn("Bio::Seq::QualI definition of desc - implementing class did not provide this method"); + } else { + warn("Bio::Seq::QualI definition of desc - implementing class did not provide this method"); + } + return ''; +} + +# These methods are here for backward compatibility with the old, 0.5 +# Seq objects. They all throw warnings that someone is using a +# deprecated method, and may eventually be removed completely from +# this object. However, they are important to ease the transition from +# the old system. + +=head1 Private functions + +These are some private functions for the PrimarySeqI interface. You do not +need to implement these functions + +=head2 _attempt_to_load_Seq + + Title : _attempt_to_load_Seq + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _attempt_to_load_Seq{ + my ($self) = @_; + + if( $main::{'Bio::Seq::PrimaryQual'} ) { + return 1; + } else { + eval { + require Bio::Seq::PrimaryQual; + }; + if( $@ ) { + if( $self->can('throw') ) { + $self->throw("Bio::Seq::PrimaryQual could not be loaded for $self\nThis indicates that you are using Bio::Seq::PrimaryQualI without Bio::Seq::PrimaryQual loaded and without providing a complete solution\nThe most likely problem is that there has been a misconfiguration of the bioperl environment\nActual exception\n\n$@\n"); + } else { + confess("Bio::Seq::PrimarySeq could not be loaded for $self\nThis indicates that you are usnig Bio::Seq::PrimaryQualI without Bio::Seq::PrimaryQual loaded and without providing a complete solution\nThe most likely problem is that there has been a misconfiguration of the bioperl environment\nActual exception\n\n$@\n"); + } + return 0; + } + return 1; + } + +} + + +=head2 qualtype() + + Title : qualtype() + Usage : if( $obj->qualtype eq 'phd' ) { /Do Something/ } + Function: At this time, this function is not used for + Bio::Seq::PrimaryQual objects. In fact, now it is a month later and + I just completed the Bio::Seq::SeqWithQuality object and this is + definitely deprecated. + Returns : Nothing. (not implemented) + Args : none + Status : Virtual + + +=cut + +sub qualtype { + my ($self,@args) = @_; + if( $self->can('throw') ) { + # $self->throw("Bio::Seq::QualI definition of qual - implementing class did not provide this method"); + $self->throw("qualtypetype is not used with quality objects."); + } else { + # confess("Bio::Seq::QualI definition of qual - implementing class did not provide this method"); + confess("qualtype is not used with quality objects."); + } + + +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/RichSeq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/RichSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,390 @@ +# $Id: RichSeq.pm,v 1.9 2002/11/11 18:16:31 lapp Exp $ +# +# BioPerl module for Bio::Seq::RichSeq +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::RichSeq - Module implementing a sequence created from a rich +sequence database entry + +=head1 SYNOPSIS + +See Bio::Seq::RichSeqI and documentation of methods. + +=head1 DESCRIPTION + +This module implements Bio::Seq::RichSeqI, an interface for sequences +created from or created for entries from/of rich sequence databanks, +like EMBL, GenBank, and SwissProt. Methods added to the Bio::SeqI +interface therefore focus on databank-specific information. Note that +not every rich databank format may use all of the properties provided. + +=head1 Implemented Interfaces + +This class implementes the following interfaces. + +=over 4 + +=item Bio::Seq::RichSeqI + +Note that this includes implementing Bio::PrimarySeqI and Bio::SeqI. + +=item Bio::IdentifiableI + +=item Bio::DescribableI + +=item Bio::AnnotatableI + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::RichSeq; +use vars qw($AUTOLOAD @ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::Seq; +use Bio::Seq::RichSeqI; + +@ISA = qw(Bio::Seq Bio::Seq::RichSeqI); + + +=head2 new + + Title : new + Usage : $seq = Bio::Seq::RichSeq->new( -seq => 'ATGGGGGTGGTGGTACCCT', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new seq object from + basic constructors, being a string for the sequence + and strings for id and accession_number + Returns : a new Bio::Seq::RichSeq object + +=cut + +sub new { + # standard new call.. + my($caller,@args) = @_; + my $self = $caller->SUPER::new(@args); + + $self->{'_dates'} = []; + $self->{'_secondary_accession'} = []; + + my ($dates, $xtra, $sv, + $keywords, $pid, $mol, + $division ) = $self->_rearrange([qw(DATES + SECONDARY_ACCESSIONS + SEQ_VERSION + KEYWORDS + PID + MOLECULE + DIVISION + )], + @args); + defined $division && $self->division($division); + defined $mol && $self->molecule($mol); + defined $keywords && $self->keywords($keywords); + defined $sv && $self->seq_version($sv); + defined $pid && $self->pid($pid); + + if( defined $dates ) { + if( ref($dates) =~ /array/i ) { + foreach ( @$dates) { + $self->add_date($_); + } + } else { + $self->add_date($dates); + } + } + + if( defined $xtra ) { + if( ref($xtra) =~ /array/i ) { + foreach ( @$xtra) { + $self->add_secondary_accession($_); + } + } else { + $self->add_secondary_accession($xtra); + } + } + + return $self; +} + + +=head2 division + + Title : division + Usage : $obj->division($newval) + Function: + Returns : value of division + Args : newvalue (optional) + + +=cut + +sub division { + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_division'} = $value; + } + return $obj->{'_division'}; + +} + +=head2 molecule + + Title : molecule + Usage : $obj->molecule($newval) + Function: + Returns : type of molecule (DNA, mRNA) + Args : newvalue (optional) + + +=cut + +sub molecule { + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_molecule'} = $value; + } + return $obj->{'_molecule'}; + +} + +=head2 add_date + + Title : add_date + Usage : $self->add_date($datestr) + Function: adds a date + Example : + Returns : a date string or an array of such strings + Args : + + +=cut + +sub add_date { + my ($self,@dates) = @_; + foreach my $dt ( @dates ) { + push(@{$self->{'_dates'}},$dt); + } +} + +=head2 get_dates + + Title : get_dates + Usage : + Function: + Example : + Returns : an array of date strings + Args : + + +=cut + +sub get_dates{ + my ($self) = @_; + return @{$self->{'_dates'}}; +} + + +=head2 pid + + Title : pid + Usage : + Function: Get (and set, depending on the implementation) the PID property + for the sequence. + Example : + Returns : a string + Args : + + +=cut + +sub pid { + my ($self,$pid) = @_; + + if(defined($pid)) { + $self->{'_pid'} = $pid; + } + return $self->{'_pid'}; +} + + +=head2 accession + + Title : accession + Usage : $obj->accession($newval) + Function: Whilst the underlying sequence object does not + have an accession, so we need one here. + + In this implementation this is merely a synonym for + accession_number(). + Example : + Returns : value of accession + Args : newvalue (optional) + + +=cut + +sub accession { + my ($obj,@args) = @_; + return $obj->accession_number(@args); +} + +=head2 add_secondary_accession + + Title : add_secondary_accession + Usage : $self->add_domment($ref) + Function: adds a secondary_accession + Example : + Returns : + Args : a string or an array of strings + + +=cut + +sub add_secondary_accession { + my ($self) = shift; + foreach my $dt ( @_ ) { + push(@{$self->{'_secondary_accession'}},$dt); + } +} + +=head2 get_secondary_accessions + + Title : get_secondary_accessions + Usage : + Function: + Example : + Returns : An array of strings + Args : + + +=cut + +sub get_secondary_accessions{ + my ($self,@args) = @_; + return @{$self->{'_secondary_accession'}}; +} + +=head2 seq_version + + Title : seq_version + Usage : $obj->seq_version($newval) + Function: + Example : + Returns : value of seq_version + Args : newvalue (optional) + + +=cut + +sub seq_version{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_seq_version'} = $value; + } + return $obj->{'_seq_version'}; + +} + + +=head2 keywords + + Title : keywords + Usage : $obj->keywords($newval) + Function: + Returns : value of keywords (a string) + Args : newvalue (optional) (a string) + + +=cut + +sub keywords { + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_keywords'} = $value; + } + return $obj->{'_keywords'}; + +} + +# +## +### Deprecated methods kept for ease of transtion +## +# + +sub each_date { + my ($self) = @_; + $self->warn("Deprecated method... please use get_dates"); + return $self->get_dates; +} + + +sub each_secondary_accession { + my ($self) = @_; + $self->warn("each_secondary_accession - deprecated method. use get_secondary_accessions"); + return $self->get_secondary_accessions; + +} + +sub sv { + my ($obj,$value) = @_; + $obj->warn("sv - deprecated method. use seq_version"); + $obj->seq_version($value); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/RichSeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/RichSeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,219 @@ +# $Id: RichSeqI.pm,v 1.8 2002/10/22 07:38:40 lapp Exp $ +# +# BioPerl module for Bio::Seq::RichSeqI +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::RichSeqI - RichSeq interface, mainly for database orientated sequences + +=head1 SYNOPSIS + + @secondary = $richseq->get_secondary_accessions; + $division = $richseq->division; + $mol = $richseq->molecule; + @dates = $richseq->get_dates; + $seq_version = $richseq->seq_version; + $pid = $richseq->pid; + $keywords = $richseq->keywords; + +=head1 DESCRIPTION + +This interface extends the Bio::SeqI interface to give additional functionality +to sequences with richer data sources, in particular from database sequences +(EMBL, GenBank and Swissprot). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::RichSeqI; +use vars qw(@ISA); +use strict; +use Bio::SeqI; + +@ISA = ('Bio::SeqI'); + + +=head2 get_secondary_accessions + + Title : get_secondary_accessions + Usage : + Function: Get the secondary accessions for a sequence. + Example : + Returns : an array of strings + Args : none + + +=cut + +sub get_secondary_accessions{ + my ($self,@args) = @_; + + $self->throw("hit get_secondary_accessions in interface definition - error"); + +} + + +=head2 division + + Title : division + Usage : + Function: Get (and set, depending on the implementation) the divison for + a sequence. + + Examples from GenBank are PLN (plants), PRI (primates), etc. + Example : + Returns : a string + Args : + + +=cut + +sub division{ + my ($self,@args) = @_; + + $self->throw("hit division in interface definition - error"); + +} + + +=head2 molecule + + Title : molecule + Usage : + Function: Get (and set, depending on the implementation) the molecule + type for the sequence. + + This is not necessarily the same as Bio::PrimarySeqI::alphabet(), + because it is databank-specific. + Example : + Returns : a string + Args : + + +=cut + +sub molecule{ + my ($self,@args) = @_; + + $self->throw("hit molecule in interface definition - error"); +} + +=head2 pid + + Title : pid + Usage : + Function: Get (and set, depending on the implementation) the PID property + for the sequence. + Example : + Returns : a string + Args : + + +=cut + +sub pid { + my ($self,@args) = @_; + + $self->throw("hit pid in interface definition - error"); +} + +=head2 get_dates + + Title : get_dates + Usage : + Function: Get (and set, depending on the implementation) the dates the + databank entry specified for the sequence + Example : + Returns : an array of strings + Args : + + +=cut + +sub get_dates{ + my ($self,@args) = @_; + + $self->throw("hit get_dates in interface definition - error"); + +} + + +=head2 seq_version + + Title : seq_version + Usage : + Function: Get (and set, depending on the implementation) the version string + of the sequence. + Example : + Returns : a string + Args : + + +=cut + +sub seq_version{ + my ($self,@args) = @_; + + $self->throw("hit seq_version in interface definition - error"); + +} + +=head2 keywords + + Title : keywords + Usage : $obj->keywords($newval) + Function: + Returns : value of keywords (a string) + Args : newvalue (optional) (a string) + + +=cut + +sub keywords { + my ($self) = @_; + $self->throw("hit keywords in interface definition - error"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/SeqBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/SeqBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,673 @@ +# $Id: SeqBuilder.pm,v 1.6 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::Seq::SeqBuilder +# +# Cared for by Hilmar Lapp <hlapp at gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::SeqBuilder - Configurable object builder for sequence stream parsers + +=head1 SYNOPSIS + + use Bio::SeqIO; + + # usually you won't instantiate this yourself -- a SeqIO object + # will have one already + my $seqin = Bio::SeqIO->new(-fh => \*STDIN, -format => "genbank"); + my $builder = $seqin->sequence_builder(); + + # if you need only sequence, id, and description (e.g. for + # conversion to FASTA format): + $builder->want_none(); + $builder->add_wanted_slot('display_id','desc','seq'); + + # if you want everything except the sequence and features + $builder->want_all(1); # this is the default if it's untouched + $builder->add_unwanted_slot('seq','features'); + + # if you want only human sequences shorter than 5kb and skip all + # others + $builder->add_object_condition(sub { + my $h = shift; + return 0 if $h->{'-length'} > 5000; + return 0 if exists($h->{'-species'}) && + ($h->{'-species'}->binomial() ne "Homo sapiens"); + return 1; + }); + + # when you are finished with configuring the builder, just use + # the SeqIO API as you would normally + while(my $seq = $seqin->next_seq()) { + # do something + } + +=head1 DESCRIPTION + +This is an implementation of L<Bio::Factory::ObjectBuilderI> used by +parsers of rich sequence streams. It provides for a relatively +easy-to-use configurator of the parsing flow. + +Configuring the parsing process may be for you if you need much less +information, or much less sequences, than the stream actually +contains. Configuration can in both cases speed up the parsing time +considerably, because unwanted sections or the rest of unwanted +sequences are skipped over by the parser. + +See the methods of the class-specific implementation section for +further documentation of what can be configured. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::SeqBuilder; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Factory::ObjectBuilderI; + +@ISA = qw(Bio::Root::Root Bio::Factory::ObjectBuilderI); + +my %slot_param_map = ("add_SeqFeature" => "features", + ); +my %param_slot_map = ("features" => "add_SeqFeature", + ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Seq::SeqBuilder(); + Function: Builds a new Bio::Seq::SeqBuilder object + Returns : an instance of Bio::Seq::SeqBuilder + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'wanted_slots'} = []; + $self->{'unwanted_slots'} = []; + $self->{'object_conds'} = []; + $self->{'_objhash'} = {}; + $self->want_all(1); + + return $self; +} + +=head1 Methods for implementing L<Bio::Factory::ObjectBuilderI> + +=cut + +=head2 want_slot + + Title : want_slot + Usage : + Function: Whether or not the object builder wants to populate the + specified slot of the object to be built. + + The slot can be specified either as the name of the + respective method, or the initialization parameter that + would be otherwise passed to new() of the object to be + built. + + Note that usually only the parser will call this + method. Use add_wanted_slots and add_unwanted_slots for + configuration. + + Example : + Returns : TRUE if the object builder wants to populate the slot, and + FALSE otherwise. + Args : the name of the slot (a string) + + +=cut + +sub want_slot{ + my ($self,$slot) = @_; + my $ok = 0; + + $slot = substr($slot,1) if substr($slot,0,1) eq '-'; + if($self->want_all()) { + foreach ($self->get_unwanted_slots()) { + # this always overrides in want-all mode + return 0 if($slot eq $_); + } + if(! exists($self->{'_objskel'})) { + $self->{'_objskel'} = $self->sequence_factory->create_object(); + } + if(exists($param_slot_map{$slot})) { + $ok = $self->{'_objskel'}->can($param_slot_map{$slot}); + } else { + $ok = $self->{'_objskel'}->can($slot); + } + return $ok if $ok; + # even if the object 'cannot' do this slot, it might have been + # added to the list of wanted slot, so carry on + } + foreach ($self->get_wanted_slots()) { + if($slot eq $_) { + $ok = 1; + last; + } + } + return $ok; +} + +=head2 add_slot_value + + Title : add_slot_value + Usage : + Function: Adds one or more values to the specified slot of the object + to be built. + + Naming the slot is the same as for want_slot(). + + The object builder may further filter the content to be + set, or even completely ignore the request. + + If this method reports failure, the caller should not add + more values to the same slot. In addition, the caller may + find it appropriate to abandon the object being built + altogether. + + This implementation will allow the caller to overwrite the + return value from want_slot(), because the slot is not + checked against want_slot(). + + Note that usually only the parser will call this method, + but you may call it from anywhere if you know what you are + doing. A derived class may be used to further manipulate + the value to be added. + + Example : + Returns : TRUE on success, and FALSE otherwise + Args : the name of the slot (a string) + parameters determining the value to be set + + OR + + alternatively, a list of slotname/value pairs in the style + of named parameters as they would be passed to new(), where + each element at an even index is the parameter (slot) name + starting with a dash, and each element at an odd index is + the value of the preceding name. + + +=cut + +sub add_slot_value{ + my ($self,$slot,@args) = @_; + + my $h = $self->{'_objhash'}; + return unless $h; + # multiple named parameter variant of calling? + if((@args > 1) && (@args % 2) && (substr($slot,0,1) eq '-')) { + unshift(@args, $slot); + while(@args) { + my $key = shift(@args); + $h->{$key} = shift(@args); + } + } else { + if($slot eq 'add_SeqFeature') { + $slot = '-'.$slot_param_map{$slot}; + $h->{$slot} = [] unless $h->{$slot}; + push(@{$h->{$slot}}, @args); + } else { + $slot = '-'.$slot unless substr($slot,0,1) eq '-'; + $h->{$slot} = $args[0]; + } + } + return 1; +} + +=head2 want_object + + Title : want_object + Usage : + Function: Whether or not the object builder is still interested in + continuing with the object being built. + + If this method returns FALSE, the caller should not add any + more values to slots, or otherwise risks that the builder + throws an exception. In addition, make_object() is likely + to return undef after this method returned FALSE. + + Note that usually only the parser will call this + method. Use add_object_condition for configuration. + + Example : + Returns : TRUE if the object builder wants to continue building + the present object, and FALSE otherwise. + Args : none + + +=cut + +sub want_object{ + my $self = shift; + + my $ok = 1; + foreach my $cond ($self->get_object_conditions()) { + $ok = &$cond($self->{'_objhash'}); + last unless $ok; + } + delete $self->{'_objhash'} unless $ok; + return $ok; +} + +=head2 make_object + + Title : make_object + Usage : + Function: Get the built object. + + This method is allowed to return undef if no value has ever + been added since the last call to make_object(), or if + want_object() returned FALSE (or would have returned FALSE) + before calling this method. + + For an implementation that allows consecutive building of + objects, a caller must call this method once, and only + once, between subsequent objects to be built. I.e., a call + to make_object implies 'end_object.' + + Example : + Returns : the object that was built + Args : none + + +=cut + +sub make_object{ + my $self = shift; + + my $obj; + if(exists($self->{'_objhash'}) && %{$self->{'_objhash'}}) { + $obj = $self->sequence_factory->create_object(%{$self->{'_objhash'}}); + } + $self->{'_objhash'} = {}; # reset + return $obj; +} + +=head1 Implementation specific methods + +These methods allow to conveniently configure this sequence object +builder as to which slots are desired, and under which circumstances a +sequence object should be abandoned altogether. The default mode is +want_all(1), which means the builder will report all slots as wanted +that the object created by the sequence factory supports. + +You can add specific slots you want through add_wanted_slots(). In +most cases, you will want to call want_none() before in order to relax +zero acceptance through a list of wanted slots. + +Alternatively, you can add specific unwanted slots through +add_unwanted_slots(). In this case, you will usually want to call +want_all(1) before (which is the default if you never touched the +builder) to restrict unrestricted acceptance. + +I.e., want_all(1) means want all slots except for the unwanted, and +want_none() means only those explicitly wanted. + +If a slot is in both the unwanted and the wanted list, the following +rules hold. In want-all mode, the unwanted list overrules. In +want-none mode, the wanted list overrides the unwanted list. If this +is confusing to you, just try to avoid having slots at the same time +in the wanted and the unwanted lists. + +=cut + +=head2 get_wanted_slots + + Title : get_wanted_slots + Usage : $obj->get_wanted_slots($newval) + Function: Get the list of wanted slots + Example : + Returns : a list of strings + Args : + + +=cut + +sub get_wanted_slots{ + my $self = shift; + + return @{$self->{'wanted_slots'}}; +} + +=head2 add_wanted_slot + + Title : add_wanted_slot + Usage : + Function: Adds the specified slots to the list of wanted slots. + Example : + Returns : TRUE + Args : an array of slot names (strings) + + +=cut + +sub add_wanted_slot{ + my ($self,@slots) = @_; + + my $myslots = $self->{'wanted_slots'}; + foreach my $slot (@slots) { + if(! grep { $slot eq $_; } @$myslots) { + push(@$myslots, $slot); + } + } + return 1; +} + +=head2 remove_wanted_slots + + Title : remove_wanted_slots + Usage : + Function: Removes all wanted slots added previously through + add_wanted_slots(). + Example : + Returns : the previous list of wanted slot names + Args : none + + +=cut + +sub remove_wanted_slots{ + my $self = shift; + my @slots = $self->get_wanted_slots(); + $self->{'wanted_slots'} = []; + return @slots; +} + +=head2 get_unwanted_slots + + Title : get_unwanted_slots + Usage : $obj->get_unwanted_slots($newval) + Function: Get the list of unwanted slots. + Example : + Returns : a list of strings + Args : none + + +=cut + +sub get_unwanted_slots{ + my $self = shift; + + return @{$self->{'unwanted_slots'}}; +} + +=head2 add_unwanted_slot + + Title : add_unwanted_slot + Usage : + Function: Adds the specified slots to the list of unwanted slots. + Example : + Returns : TRUE + Args : an array of slot names (strings) + + +=cut + +sub add_unwanted_slot{ + my ($self,@slots) = @_; + + my $myslots = $self->{'unwanted_slots'}; + foreach my $slot (@slots) { + if(! grep { $slot eq $_; } @$myslots) { + push(@$myslots, $slot); + } + } + return 1; +} + +=head2 remove_unwanted_slots + + Title : remove_unwanted_slots + Usage : + Function: Removes the list of unwanted slots added previously through + add_unwanted_slots(). + Example : + Returns : the previous list of unwanted slot names + Args : none + + +=cut + +sub remove_unwanted_slots{ + my $self = shift; + my @slots = $self->get_unwanted_slots(); + $self->{'unwanted_slots'} = []; + return @slots; +} + +=head2 want_none + + Title : want_none + Usage : + Function: Disables all slots. After calling this method, want_slot() + will return FALSE regardless of slot name. + + This is different from removed_wanted_slots() in that it + also sets want_all() to FALSE. Note that it also resets the + list of unwanted slots in order to avoid slots being in + both lists. + + Example : + Returns : TRUE + Args : none + + +=cut + +sub want_none{ + my $self = shift; + + $self->want_all(0); + $self->remove_wanted_slots(); + $self->remove_unwanted_slots(); + return 1; +} + +=head2 want_all + + Title : want_all + Usage : $obj->want_all($newval) + Function: Whether or not this sequence object builder wants to + populate all slots that the object has. Whether an object + supports a slot is generally determined by what can() + returns. You can add additional 'virtual' slots by calling + add_wanted_slot. + + This will be ON by default. Call $obj->want_none() to + disable all slots. + + Example : + Returns : TRUE if this builder wants to populate all slots, and + FALSE otherwise. + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub want_all{ + my $self = shift; + + return $self->{'want_all'} = shift if @_; + return $self->{'want_all'}; +} + +=head2 get_object_conditions + + Title : get_object_conditions + Usage : + Function: Get the list of conditions an object must meet in order to + be 'wanted.' See want_object() for where this is used. + + Conditions in this implementation are closures (anonymous + functions) which are passed one parameter, a hash reference + the keys of which are equal to initialization + paramaters. The closure must return TRUE to make the object + 'wanted.' + + Conditions will be implicitly ANDed. + + Example : + Returns : a list of closures + Args : none + + +=cut + +sub get_object_conditions{ + my $self = shift; + + return @{$self->{'object_conds'}}; +} + +=head2 add_object_condition + + Title : add_object_condition + Usage : + Function: Adds a condition an object must meet in order to be 'wanted.' + See want_object() for where this is used. + + Conditions in this implementation must be closures + (anonymous functions). These will be passed one parameter, + which is a hash reference with the sequence object + initialization paramters being the keys. + + Conditions are implicitly ANDed. If you want other + operators, perform those tests inside of one closure + instead of multiple. This will also be more efficient. + + Example : + Returns : TRUE + Args : the list of conditions + + +=cut + +sub add_object_condition{ + my ($self,@conds) = @_; + + if(grep { ref($_) ne 'CODE'; } @conds) { + $self->throw("conditions against which to validate an object ". + "must be anonymous code blocks"); + } + push(@{$self->{'object_conds'}}, @conds); + return 1; +} + +=head2 remove_object_conditions + + Title : remove_object_conditions + Usage : + Function: Removes the conditions an object must meet in order to be + 'wanted.' + Example : + Returns : The list of previously set conditions (an array of closures) + Args : none + + +=cut + +sub remove_object_conditions{ + my $self = shift; + my @conds = $self->get_object_conditions(); + $self->{'object_conds'} = []; + return @conds; +} + +=head1 Methods to control what type of object is built + +=cut + +=head2 sequence_factory + + Title : sequence_factory + Usage : $obj->sequence_factory($newval) + Function: Get/set the sequence factory to be used by this object + builder. + Example : + Returns : the Bio::Factory::SequenceFactoryI implementing object to use + Args : on set, new value (a Bio::Factory::SequenceFactoryI + implementing object or undef, optional) + + +=cut + +sub sequence_factory{ + my $self = shift; + + if(@_) { + delete $self->{'_objskel'}; + return $self->{'sequence_factory'} = shift; + } + return $self->{'sequence_factory'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/SeqFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/SeqFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,156 @@ +# $Id: SeqFactory.pm,v 1.8 2002/10/25 22:49:04 lapp Exp $ +# +# BioPerl module for Bio::Seq::SeqFactory +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::SeqFactory - Instantiates a new Bio::PrimarySeqI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Seq::SeqFactory; + my $factory = new Bio::Seq::SeqFactory; + my $seq = $factory->create(-seq => 'WYRAVLC', + -id => 'name'); + + # If you want the factory to create Bio::Seq objects instead + # of the default Bio::PrimarySeq objects, use the -type parameter: + + my $factory = new Bio::Seq::SeqFactory(-type => 'Bio::Seq'); + + +=head1 DESCRIPTION + +This object will build L<Bio::PrimarySeqI> and L<Bio::SeqI> objects +generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::SeqFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::SequenceFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Factory::SequenceFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Seq::SeqFactory(); + Function: Builds a new Bio::Seq::SeqFactory object + Returns : Bio::Seq::SeqFactory + Args : -type => string, name of a PrimarySeqI derived class + This is optional. Default=Bio::PrimarySeq. + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($type) = $self->_rearrange([qw(TYPE)], @args); + if( ! defined $type ) { + $type = 'Bio::PrimarySeq'; + } + $self->type($type); + return $self; +} + + +=head2 create + + Title : create + Usage : my $seq = $seqbuilder->create(-seq => 'CAGT', -id => 'name'); + Function: Instantiates new Bio::SeqI (or one of its child classes) + This object allows us to genericize the instantiation of sequence + objects. + Returns : Bio::PrimarySeq object (default) + The return type is configurable using new(-type =>"..."). + Args : initialization parameters specific to the type of sequence + object we want. Typically + -seq => $str, + -display_id => $name + +=cut + +sub create { + my ($self,@args) = @_; + return $self->type->new(-verbose => $self->verbose, @args); +} + +=head2 type + + Title : type + Usage : $obj->type($newval) + Function: + Returns : value of type + Args : newvalue (optional) + + +=cut + +sub type{ + my ($self,$value) = @_; + if( defined $value) { + eval "require $value"; + if( $@ ) { $self->throw("$@: Unrecognized Sequence type for SeqFactory '$value'");} + + my $a = bless {},$value; + unless( $a->isa('Bio::PrimarySeqI') || + $a->isa('Bio::Seq::QualI') ) { + $self->throw("Must provide a valid Bio::PrimarySeqI or Bio::Seq::QualI or child class to SeqFactory Not $value"); + } + $self->{'type'} = $value; + } + return $self->{'type'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/SeqFastaSpeedFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/SeqFastaSpeedFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,142 @@ +# $Id: SeqFastaSpeedFactory.pm,v 1.3 2002/11/07 23:54:23 lapp Exp $ +# +# BioPerl module for Bio::Seq::SeqFactory +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::SeqFactory - Instantiates a new Bio::PrimarySeqI (or derived class) through a factory + +=head1 SYNOPSIS + + use Bio::Seq::SeqFactory; + my $factory = new Bio::Seq::SeqFactory; + my $seq = $factory->create(-seq => 'WYRAVLC', + -id => 'name'); + + # If you want the factory to create Bio::Seq objects instead + # of the default Bio::PrimarySeq objects, use the -type parameter: + + my $factory = new Bio::Seq::SeqFactory(-type => 'Bio::Seq'); + + +=head1 DESCRIPTION + +This object will build Bio::Seq objects generically. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::SeqFastaSpeedFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Factory::SequenceFactoryI; +use Bio::Seq; +use Bio::PrimarySeq; + +@ISA = qw(Bio::Root::Root Bio::Factory::SequenceFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Seq::SeqFactory(); + Function: Builds a new Bio::Seq::SeqFactory object + Returns : Bio::Seq::SeqFactory + Args : -type => string, name of a PrimarySeqI derived class + This is optional. Default=Bio::PrimarySeq. + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + + +=head2 create + + Title : create + Usage : my $seq = $seqbuilder->create(-seq => 'CAGT', -id => 'name'); + Function: Instantiates a new Bio::Seq object, correctly built but very + fast, knowing stuff about Bio::PrimarySeq and Bio::Seq + Returns : Bio::Seq + + Args : initialization parameters specific to the type of sequence + object we want. Typically + -seq => $str, + -id => $name + +=cut + +sub create { + my ($self,%param) = @_; + + my $sequence = $param{'-seq'} || $param{'-SEQ'}; + my $fulldesc = $param{'-desc'} || $param{'-DESC'}; + my $id = $param{'-id'} || $param{'-ID'} || + $param{'-primary_id'} || $param{'-PRIMARY_ID'}; + + my $seq = bless {}, "Bio::Seq"; + my $t_pseq = $seq->{'primary_seq'} = bless {}, "Bio::PrimarySeq"; + $t_pseq->{'seq'} = $sequence; + $t_pseq->{'desc'} = $fulldesc; + $t_pseq->{'display_id'} = $id; + $t_pseq->{'primary_id'} = $id; + $seq->{'primary_id'} = $id; # currently Bio::Seq does not delegate this + if( $sequence ) { + $t_pseq->_guess_alphabet(); + } + + return $seq; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/SeqWithQuality.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/SeqWithQuality.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,916 @@ +# $Id: SeqWithQuality.pm,v 1.17 2002/12/19 22:02:38 matsallac Exp $ +# +# BioPerl module for Bio::Seq::QualI +# +# Cared for by Chad Matsalla <bioinformatics@dieselwurks.com +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::SeqWithQuality - Bioperl object packaging a sequence with its quality + +=head1 SYNOPSIS + + use Bio::PrimarySeq; + use Bio::Seq::PrimaryQual; + + # make from memory + my $qual = Bio::Seq::SeqWithQuality->new + ( -qual => '10 20 30 40 50 50 20 10', + -seq => 'ATCGATCG', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + # make from objects + # first, make a PrimarySeq object + my $seqobj = Bio::PrimarySeq->new + ( -seq => 'atcgatcg', + -id => 'GeneFragment-12', + -accession_number => 'X78121', + -alphabet => 'dna' + ); + + # now make a PrimaryQual object + my $qualobj = Bio::Seq::PrimaryQual->new + ( -qual => '10 20 30 40 50 50 20 10', + -id => 'GeneFragment-12', + -accession_number => 'X78121', + -alphabet => 'dna' + ); + + # now make the SeqWithQuality object + my $swqobj = Bio::Seq::SeqWithQuality->new + ( -seq => $seqobj, + -qual => $qualobj + ); + # done! + + $swqobj->id(); # the id of the SeqWithQuality object + # may not match the the id of the sequence or + # of the quality (check the pod, luke) + $swqobj->seq(); # the sequence of the SeqWithQuality object + $swqobj->qual(); # the quality of the SeqWithQuality object + + # to get out parts of the sequence. + + print "Sequence ", $seqobj->id(), " with accession ", + $seqobj->accession, " and desc ", $seqobj->desc, "\n"; + + $string2 = $seqobj->subseq(1,40); + +=head1 DESCRIPTION + +This object stores base quality values together with the sequence string. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics@dieselwurks.com + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +package Bio::Seq::SeqWithQuality; + +use vars qw(@ISA); + +use strict; +use Bio::Root::Root; +use Bio::Seq::QualI; +use Bio::PrimarySeqI; +use Bio::PrimarySeq; +use Bio::Seq::PrimaryQual; + +@ISA = qw(Bio::Root::Root Bio::PrimarySeqI Bio::Seq::QualI); + +=head2 new() + + Title : new() + Usage : $qual = Bio::Seq::SeqWithQuality ->new + ( -qual => '10 20 30 40 50 50 20 10', + -seq => 'ATCGATCG', + -id => 'human_id', + -accession_number => 'AL000012', + -trace_indices => '0 5 10 15 20 25 30 35' + ); + Function: Returns a new Bio::Seq::SeqWithQual object from basic + constructors. + Returns : a new Bio::Seq::PrimaryQual object + Notes : Arguments: + -qual can be a quality string (see Bio::Seq::PrimaryQual for more + information on this) or a reference to a Bio::Seq::PrimaryQual + object. + -seq can be a sequence string (see Bio::PrimarySeq for more + information on this) or a reference to a Bio::PrimaryQual object. + -seq, -id, -accession_number, -primary_id, -desc, -id behave like + this: + 1. if they are provided on construction of the + Bio::Seq::SeqWithQuality they will be set as the descriptors for + the object unless changed by one of the following mechanisms: + a) $obj->set_common_descriptors() is used and both the -seq and + the -qual object have the same descriptors. These common + descriptors will then become the descriptors for the + Bio::Seq::SeqWithQual object. + b) the descriptors are manually set using the seq(), id(), + desc(), or accession_number(), primary_id(), + 2. if no descriptors are provided, the new() constructor will see + if the descriptor used in the PrimarySeq and in the + PrimaryQual objects match. If they do, they will become + the descriptors for the SeqWithQuality object. + + To eliminate ambiguity, I strongly suggest you set the + descriptors manually on construction of the object. Really. + -trace_indices : a space_delimited list of trace indices + (where would the peaks be drawn if this list of qualities + was to be plotted?) + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + # default: turn OFF the warnings + $self->{supress_warnings} = 1; + my($qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet,$trace_indices) = + $self->_rearrange([qw( + QUAL + SEQ + DISPLAY_ID + ACCESSION_NUMBER + PRIMARY_ID + DESC + ID + ALPHABET + TRACE_INDICES + )], + @args); + # first, deal with the sequence and quality information + if ( defined $id && defined $given_id ) { + if( $id ne $given_id ) { + $self->throw("Provided both id and display_id constructor functions. [$id] [$given_id]"); + } + } + if( defined $given_id ) { + $self->display_id($given_id); + $id = $given_id; + } + if (!$seq) { + my $id; + unless ($self->{supress_warnings} == 1) { + $self->warn("You did not provide sequence information during the construction of a Bio::Seq::SeqWithQuality object. Sequence components for this object will be empty."); + } + if (!$alphabet) { + $self->throw("If you want me to create a PrimarySeq object for your empty sequence <boggle> you must specify a -alphabet to satisfy the constructor requirements for a Bio::PrimarySeq object with no sequence. Read the POD for it, luke."); + } + $self->{seq_ref} = Bio::PrimarySeq->new + ( + -seq => "", + -accession_number => $acc, + -primary_id => $pid, + -desc => $desc, + -display_id => $id, + -alphabet => $alphabet + ); + } + elsif (ref($seq) eq "Bio::PrimarySeq" ) { + $self->{seq_ref} = $seq; + } + + else { + my $seqobj = Bio::PrimarySeq->new + ( + -seq => $seq, + -accession_number => $acc, + -primary_id => $pid, + -desc => $desc, + -display_id => $id, + ); + $self->{seq_ref} = $seqobj; + } + + if (!$qual) { + $self->{qual_ref} = Bio::Seq::PrimaryQual->new + ( + -qual => "", + -accession_number => $acc, + -primary_id => $pid, + -desc => $desc, + -display_id => $id, + ); + } + elsif (ref($qual) eq "Bio::Seq::PrimaryQual") { + $self->{qual_ref} = $qual; + } + else { + my $qualobj = Bio::Seq::PrimaryQual->new + ( + -qual => $qual, + -accession_number => $acc, + -primary_id => $pid, + -desc => $desc, + -display_id => $id, + -trace_indices => $trace_indices + ); + $self->{qual_ref} = $qualobj; + } + + # now try to set the descriptors for this object + $self->_set_descriptors($qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet); + $self->length(); + + return $self; +} + +=head2 _common_id() + + Title : _common_id() + Usage : $common_id = $self->_common_id(); + Function: Compare the display_id of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->display_id() + Args : None. + +=cut + +#' +sub _common_id { + my $self = shift; + return if (!$self->{seq_ref} || !$self->{qual_ref}); + my $sid = $self->{seq_ref}->display_id(); + return if (!$sid); + return if (!$self->{qual_ref}->display_id()); + return $sid if ($sid eq $self->{qual_ref}->display_id()); + # should this become a warning? + # print("ids $sid and $self->{qual_ref}->display_id() do not match. Bummer.\n"); +} + +=head2 _common_display_id() + + Title : _common_id() + Usage : $common_id = $self->_common_display_id(); + Function: Compare the display_id of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->display_id() + Args : None. + +=cut + +#' +sub _common_display_id { + my $self = shift; + $self->common_id(); +} + +=head2 _common_accession_number() + + Title : _common_accession_number() + Usage : $common_id = $self->_common_accession_number(); + Function: Compare the accession_number() of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->accession_number() + Args : None. + +=cut + +#' +sub _common_accession_number { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + my $acc = $self->{seq_ref}->accession_number(); + # if (!$acc) { print("the seqref has no acc.\n"); } + return if (!$acc); + # if ($acc eq $self->{qual_ref}->accession_number()) { print("$acc matches ".$self->{qual_ref}->accession_number()."\n"); } + return $acc if ($acc eq $self->{qual_ref}->accession_number()); + # should this become a warning? + # print("accession numbers $acc and $self->{qual_ref}->accession_number() do not match. Bummer.\n"); +} + +=head2 _common_primary_id() + + Title : _common_primary_id() + Usage : $common_primard_id = $self->_common_primary_id(); + Function: Compare the primary_id of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->primary_id() + Args : None. + +=cut + +#' +sub _common_primary_id { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + my $pid = $self->{seq_ref}->primary_id(); + return if (!$pid); + return $pid if ($pid eq $self->{qual_ref}->primary_id()); + # should this become a warning? + # print("primary_ids $pid and $self->{qual_ref}->primary_id() do not match. Bummer.\n"); + +} + +=head2 _common_desc() + + Title : _common_desc() + Usage : $common_desc = $self->_common_desc(); + Function: Compare the desc of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->desc() + Args : None. + +=cut + +#' +sub _common_desc { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + my $des = $self->{seq_ref}->desc(); + return if (!$des); + return $des if ($des eq $self->{qual_ref}->desc()); + # should this become a warning? + # print("descriptions $des and $self->{qual_ref}->desc() do not match. Bummer.\n"); + +} + +=head2 set_common_descriptors() + + Title : set_common_descriptors() + Usage : $self->set_common_descriptors(); + Function: Compare the descriptors (id,accession_number,display_id, + primary_id, desc) for the PrimarySeq and PrimaryQual objects + within the SeqWithQuality object. If they match, make that + descriptor the descriptor for the SeqWithQuality object. + Returns : Nothing. + Args : None. + +=cut + +sub set_common_descriptors { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + &_common_id(); + &_common_display_id(); + &_common_accession_number(); + &_common_primary_id(); + &_common_desc(); +} + +=head2 alphabet() + + Title : alphabet(); + Usage : $molecule_type = $obj->alphabet(); + Function: Get the molecule type from the PrimarySeq object. + Returns : What what PrimarySeq says the type of the sequence is. + Args : None. + +=cut + +sub alphabet { + my $self = shift; + return $self->{seq_ref}->alphabet(); +} + +=head2 display_id() + + Title : display_id() + Usage : $id_string = $obj->display_id(); + Function: Returns the display id, aka the common name of the Quality + object. + The semantics of this is that it is the most likely string to be + used as an identifier of the quality sequence, and likely to have + "human" readability. The id is equivalent to the ID field of the + GenBank/EMBL databanks and the id field of the Swissprot/sptrembl + database. In fasta format, the >(\S+) is presumed to be the id, + though some people overload the id to embed other information. + Bioperl does not use any embedded information in the ID field, + and people are encouraged to use other mechanisms (accession + field for example, or extending the sequence object) to solve + this. Notice that $seq->id() maps to this function, mainly for + legacy/convience issues. + This method sets the display_id for the SeqWithQuality object. + Returns : A string + Args : If a scalar is provided, it is set as the new display_id for + the SeqWithQuality object. + Status : Virtual + +=cut + +sub display_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'display_id'} = $value; + } + return $obj->{'display_id'}; + +} + +=head2 accession_number() + + Title : accession_number() + Usage : $unique_biological_key = $obj->accession_number(); + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the unique id + for the implemetation, allowing multiple objects to have the same + accession number in a particular implementation. For sequences + with no accession number, this method should return "unknown". + This method sets the accession_number for the SeqWithQuality + object. + Returns : A string (the value of accession_number) + Args : If a scalar is provided, it is set as the new accession_number + for the SeqWithQuality object. + Status : Virtual + + +=cut + +sub accession_number { + my( $obj, $acc ) = @_; + + if (defined $acc) { + $obj->{'accession_number'} = $acc; + } else { + $acc = $obj->{'accession_number'}; + $acc = 'unknown' unless defined $acc; + } + return $acc; +} + +=head2 primary_id() + + Title : primary_id() + Usage : $unique_implementation_key = $obj->primary_id(); + Function: Returns the unique id for this object in this implementation. + This allows implementations to manage their own object ids in a + way the implementaiton can control clients can expect one id to + map to one object. For sequences with no accession number, this + method should return a stringified memory location. + This method sets the primary_id for the SeqWithQuality + object. + Returns : A string. (the value of primary_id) + Args : If a scalar is provided, it is set as the new primary_id for + the SeqWithQuality object. + +=cut + +sub primary_id { + my ($obj,$value) = @_; + if ($value) { + $obj->{'primary_id'} = $value; + } + return $obj->{'primary_id'}; + +} + +=head2 desc() + + Title : desc() + Usage : $qual->desc($newval); _or_ + $description = $qual->desc(); + Function: Get/set description text for this SeqWithQuality object. + Returns : A string. (the value of desc) + Args : If a scalar is provided, it is set as the new desc for the + SeqWithQuality object. + +=cut + +sub desc { + # a mechanism to set the disc for the SeqWithQuality object. + # probably will be used most often by set_common_features() + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'desc'} = $value; + } + return $obj->{'desc'}; +} + +=head2 id() + + Title : id() + Usage : $id = $qual->id(); + Function: Return the ID of the quality. This should normally be (and + actually is in the implementation provided here) just a synonym + for display_id(). + Returns : A string. (the value of id) + Args : If a scalar is provided, it is set as the new id for the + SeqWithQuality object. + +=cut + +sub id { + my ($self,$value) = @_; + if (!$self) { $self->throw("no value for self in $value"); } + if( defined $value ) { + return $self->display_id($value); + } + return $self->display_id(); +} + +=head2 seq + + Title : seq() + Usage : $string = $obj->seq(); _or_ + $obj->seq("atctatcatca"); + Function: Returns the sequence that is contained in the imbedded in the + PrimarySeq object within the SeqWithQuality object + Returns : A scalar (the seq() value for the imbedded PrimarySeq object.) + Args : If a scalar is provided, the SeqWithQuality object will + attempt to set that as the sequence for the imbedded PrimarySeq + object. Otherwise, the value of seq() for the PrimarySeq object + is returned. + Notes : This is probably not a good idea because you then should call + length() to make sure that the sequence and quality are of the + same length. Even then, how can you make sure that this sequence + belongs with that quality? I provided this to give you rope to + hang yourself with. Tie it to a strong device and use a good + knot. + +=cut + +sub seq { + my ($self,$value) = @_; + if( defined $value) { + $self->{seq_ref}->seq($value); + $self->length(); + } + return $self->{seq_ref}->seq(); +} + +=head2 qual() + + Title : qual() + Usage : @quality_values = @{$obj->qual()}; _or_ + $obj->qual("10 10 20 40 50"); + Function: Returns the quality as imbedded in the PrimaryQual object + within the SeqWithQuality object. + Returns : A reference to an array containing the quality values in the + PrimaryQual object. + Args : If a scalar is provided, the SeqWithQuality object will + attempt to set that as the quality for the imbedded PrimaryQual + object. Otherwise, the value of qual() for the PrimaryQual + object is returned. + Notes : This is probably not a good idea because you then should call + length() to make sure that the sequence and quality are of the + same length. Even then, how can you make sure that this sequence + belongs with that quality? I provided this to give you a strong + board with which to flagellate yourself. + +=cut + +sub qual { + my ($self,$value) = @_; + + if( defined $value) { + $self->{qual_ref}->qual($value); + # update the lengths + $self->length(); + } + return $self->{qual_ref}->qual(); +} + + + +=head2 trace_indices() + + Title : trace_indices() + Usage : @trace_indice_values = @{$obj->trace_indices()}; _or_ + $obj->trace_indices("10 10 20 40 50"); + Function: Returns the trace_indices as imbedded in the Primaryqual object + within the SeqWithQualiity object. + Returns : A reference to an array containing the trace_indice values in the + PrimaryQual object. + Args : If a scalar is provided, the SeqWithuQuality object will + attempt to set that as the trace_indices for the imbedded PrimaryQual + object. Otherwise, the value of trace_indices() for the PrimaryQual + object is returned. + Notes : This is probably not a good idea because you then should call + length() to make sure that the sequence and trace_indices are of the + same length. Even then, how can you make sure that this sequence + belongs with that trace_indicex? I provided this to give you a strong + board with which to flagellate yourself. + +=cut + +sub trace_indices { + my ($self,$value) = @_; + + if( defined $value) { + $self->{qual_ref}->trace_indices($value); + # update the lengths + $self->length(); + } + return $self->{qual_ref}->trace_indices(); +} + + + + +=head2 length() + + Title : length() + Usage : $length = $seqWqual->length(); + Function: Get the length of the SeqWithQuality sequence/quality. + Returns : Returns the length of the sequence and quality if they are + both the same. Returns "DIFFERENT" if they differ. + Args : None. + +=cut + +sub length { + my $self = shift; + if (!$self->{seq_ref}) { + unless ($self->{supress_warnings} == 1) { + $self->warn("Can't find {seq_ref} here in length()."); + } + return; + } + if (!$self->{qual_ref}) { + unless ($self->{supress_warnings} == 1) { + $self->warn("Can't find {qual_ref} here in length()."); + } + return; + } + my $seql = $self->{seq_ref}->length(); + + if ($seql != $self->{qual_ref}->length()) { + unless ($self->{supress_warnings} == 1) { + $self->warn("Sequence length (".$seql.") is different from quality length (".$self->{qual_ref}->length().") in the SeqWithQuality object. This can only lead to problems later."); + } + $self->{'length'} = "DIFFERENT"; + } + else { + $self->{'length'} = $seql; + } + return $self->{'length'}; +} + + +=head2 qual_obj + + Title : qual_obj($different_obj) + Usage : $qualobj = $seqWqual->qual_obj(); _or_ + $qualobj = $seqWqual->qual_obj($ref_to_primaryqual_obj); + Function: Get the PrimaryQual object that is imbedded in the + SeqWithQuality object or if a reference to a PrimaryQual object + is provided, set this as the PrimaryQual object imbedded in the + SeqWithQuality object. + Returns : A reference to a Bio::Seq::SeqWithQuality object. + +=cut + +sub qual_obj { + my ($self,$value) = @_; + if (defined($value)) { + if (ref($value) eq "Bio::Seq::PrimaryQual") { + $self->{qual_ref} = $value; + + $self->debug("You successfully changed the PrimaryQual object within a SeqWithQuality object. ID's for the SeqWithQuality object may now not be what you expect. Use something like set_common_descriptors() to fix them if you care,"); + } + else { + $self->debug("You tried to change the PrimaryQual object within a SeqWithQuality object but you passed a reference to an object that was not a Bio::Seq::PrimaryQual object. Thus your change failed. Sorry.\n"); + } + } + return $self->{qual_ref}; +} + + +=head2 seq_obj + + Title : seq_obj() + Usage : $seqobj = $seqWqual->qual_obj(); _or_ + $seqobj = $seqWqual->seq_obj($ref_to_primary_seq_obj); + Function: Get the PrimarySeq object that is imbedded in the + SeqWithQuality object or if a reference to a PrimarySeq object is + provided, set this as the PrimarySeq object imbedded in the + SeqWithQuality object. + Returns : A reference to a Bio::PrimarySeq object. + +=cut + +sub seq_obj { + my ($self,$value) = @_; + if( defined $value) { + if (ref($value) eq "Bio::PrimarySeq") { + $self->debug("You successfully changed the PrimarySeq object within a SeqWithQuality object. ID's for the SeqWithQuality object may now not be what you expect. Use something like set_common_descriptors() to fix them if you care,"); + } else { + $self->debug("You tried to change the PrimarySeq object within a SeqWithQuality object but you passed a reference to an object that was not a Bio::PrimarySeq object. Thus your change failed. Sorry.\n"); + } + } + return $self->{seq_ref}; +} + +=head2 _set_descriptors + + Title : _set_descriptors() + Usage : $seqWqual->_qual_obj($qual,$seq,$id,$acc,$pid,$desc,$given_id, + $alphabet); + Function: Set the descriptors for the SeqWithQuality object. Try to + match the descriptors in the PrimarySeq object and in the + PrimaryQual object if descriptors were not provided with + construction. + Returns : Nothing. + Args : $qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet as found + in the new() method. + Notes : Really only intended to be called by the new() method. If + you want to invoke a similar function try + set_common_descriptors(). + +=cut + + +sub _set_descriptors { + my ($self,$qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet) = @_; + my ($c_id,$c_acc,$c_pid,$c_desc); + if (!$self->display_id()) { + if ($c_id = $self->_common_id() ) { $self->display_id($c_id); } + else { + if ($self->{seq_ref}) { + # print("Using seq_ref to set id to ".$self->{seq_ref}->display_id()."\n"); + # ::dumpValue($self->{seq_ref}); + $self->display_id($self->{seq_ref}->id()); + } + elsif ($self->{qual_ref}) { + $self->display_id($self->{qual_ref}->id()); + } + } + } + if ($acc) { $self->accession_number($acc); } + elsif ($c_acc = $self->_common_accession_number() ) { $self->accession_number($c_acc); } + if ($pid) { $self->primary_id($pid); } + elsif ($c_pid = $self->_common_primary_id() ) { $self->primary_id($c_pid); } + if ($desc) { $self->desc($desc); } + elsif ($c_desc = $self->_common_desc() ) { $self->desc($c_desc); } +} + +=head2 subseq($start,$end) + + Title : subseq($start,$end) + Usage : $subsequence = $obj->subseq($start,$end); + Function: Returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence. + Returns : A string. + Args : Two positions. + +=cut + +sub subseq { + my ($self,@args) = @_; + # does a single value work? + return $self->{seq_ref}->subseq(@args); +} + +=head2 baseat($position) + + Title : baseat($position) + Usage : $base_at_position_6 = $obj->baseat("6"); + Function: Returns a single base at the given position, where the first + base is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence. + Returns : A scalar. + Args : A position. + +=cut + +sub baseat { + my ($self,$val) = @_; + return $self->{seq_ref}->subseq($val,$val); +} + +=head2 subqual($start,$end) + + Title : subqual($start,$end) + Usage : @qualities = @{$obj->subqual(10,20); + Function: returns the quality values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be equal. + Returns : A reference to an array. + Args : a start position and an end position + +=cut + +sub subqual { + my ($self,@args) = @_; + return $self->{qual_ref}->subqual(@args); +} + +=head2 qualat($position) + + Title : qualat($position) + Usage : $quality = $obj->qualat(10); + Function: Return the quality value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be equal. + Returns : A scalar. + Args : A position. + +=cut + +sub qualat { + my ($self,$val) = @_; + return $self->{qual_ref}->qualat($val); +} + +=head2 sub_trace_index($start,$end) + + Title : sub_trace_index($start,$end) + Usage : @trace_indices = @{$obj->sub_trace_index(10,20); + Function: returns the trace index values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be e_trace_index. + Returns : A reference to an array. + Args : a start position and an end position + +=cut + +sub sub_trace_index { + my ($self,@args) = @_; + return $self->{qual_ref}->sub_trace_index(@args); +} + +=head2 trace_index_at($position) + + Title : trace_index_at($position) + Usage : $trace_index = $obj->trace_index_at(10); + Function: Return the trace_index value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be etrace_index_. + Returns : A scalar. + Args : A position. + +=cut + +sub trace_index_at { + my ($self,$val) = @_; + return $self->{qual_ref}->trace_index_at($val); +} + +=head2 to_string() + + Title : to_string() + Usage : $quality = $obj->to_string(); + Function: Return a textual representation of what the object contains. + For this module, this function will return: + qual + seq + display_id + accession_number + primary_id + desc + id + length_sequence + length_quality + Returns : A scalar. + Args : None. + +=cut + +sub to_string { + my ($self,$out,$result) = shift; + $out = "qual: ".join(',',@{$self->qual()})."\n"; + foreach (qw(seq display_id accession_number primary_id desc id)) { + $result = $self->$_(); + if (!$result) { $result = "<unset>"; } + $out .= "$_: $result\n"; + } + return $out; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/SequenceTrace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/SequenceTrace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,747 @@ +# $Id: SequenceTrace.pm,v 1.1.2.1 2003/03/25 12:32:16 heikki Exp $ +# +# BioPerl module for Bio::Seq::SeqWithQuality +# +# Cared for by Chad Matsalla <bioinformatics@dieselwurks.com +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::SequenceTrace - Bioperl object packaging a sequence with its trace + +=head1 SYNOPSIS + + # example code here + +=head1 DESCRIPTION + +This object stores a sequence with its trace. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics@dieselwurks.com + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +package Bio::Seq::SequenceTrace; + +use vars qw(@ISA); + +use strict; +use Bio::Root::Root; +use Bio::Seq::QualI; +use Bio::PrimarySeqI; +use Bio::PrimarySeq; +use Bio::Seq::PrimaryQual; +use Bio::Seq::TraceI; + +@ISA = qw(Bio::Root::Root Bio::Seq::SeqWithQuality Bio::Seq::TraceI); + +=head2 new() + + Title : new() + Usage : $st = Bio::Seq::SequenceTrace->new + ( -sequencewithquality => Bio::Seq::SequenceWithQuality, + -trace_a => \@trace_values_for_a_channel, + -trace_t => \@trace_values_for_t_channel, + -trace_g => \@trace_values_for_g_channel, + -trace_c => \@trace_values_for_c_channel, + -trace_indices => '0 5 10 15 20 25 30 35' + ); + Function: Returns a new Bio::Seq::SequenceTrace object from basic + constructors. + Returns : a new Bio::Seq::SequenceTrace object +Arguments: I think that these are all describes in the usage above. + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + # default: turn OFF the warnings + $self->{supress_warnings} = 1; + my($sequence_with_quality,$trace_indices,$trace_a,$trace_t, + $trace_g,$trace_c) = + $self->_rearrange([qw( + SEQUENCEWITHQUALITY + TRACE_INDICES + TRACE_A + TRACE_T + TRACE_G)], @args); + # first, deal with the sequence and quality information + if ($sequence_with_quality && ref($sequence_with_quality) eq "Bio::Seq::SeqWithQuality") { + $self->{swq} = $sequence_with_quality; + } + else { + $self->throw("A Bio::Seq::SequenceTrace object must be created with a + Bio::Seq::SeqWithQuality object."); + } + $self->{trace_a} = $trace_a ? $trace_a : undef; + $self->{trace_t} = $trace_t ? $trace_t : undef; + $self->{trace_g} = $trace_g ? $trace_g : undef; + $self->{trace_c} = $trace_c ? $trace_c : undef; + $self->{trace_indices} = $trace_indices ? $trace_indices : undef; + return $self; +} + +=head2 trace($base,\@new_values) + + Title : trace($base,\@new_values) + Usage : @trace_Values = @{$obj->trace($base,\@new_values)}; + Function: Returns the trace values as a reference to an array containing the + trace values. The individual elements of the trace array are not validated + and can be any numeric value. + Returns : A reference to an array. + Status : +Arguments: $base : which color channel would you like the trace values for? + - $base must be one of "A","T","G","C" + \@new_values : a reference to an array of values containing trace + data for this base + +=cut + +sub trace { + my ($self,$base_channel,$values) = @_; + $base_channel =~ tr/A-Z/a-z/; + if (length($base_channel) > 1 && $base_channel !~ /a|t|g|c/) { + $self->throw("The base channel must be a, t, g, or c"); + } + if ( $values && ref($values) eq "ARRAY") { + $self->{trace_$base_channel} = $values; + } + elsif ($values) { + $self->warn("You tried to change the traces for the $base_channel but + the values you wave were not a reference to an array."); + } + return $self->{trace_$base_channel}; +} + + +=head2 trace_indices($new_indices) + + Title : trace_indices($new_indices) + Usage : $indices = $obj->trace_indices($new_indices); + Function: Return the trace iindex points for this object. + Returns : A scalar + Args : If used, the trace indices will be set to the provided value. + +=cut + +sub trace_indices { + my ($self,$trace_indices)= @_; + if ($trace_indices) { $self->{trace_indices} = $trace_indices; } + return $self->{trace_indices}; +} + + + + + + + + + + + + + +=head2 _common_id() + + Title : _common_id() + Usage : $common_id = $self->_common_id(); + Function: Compare the display_id of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->display_id() + Args : None. + +=cut + +#' +sub _common_id { + my $self = shift; + return if (!$self->{seq_ref} || !$self->{qual_ref}); + my $sid = $self->{seq_ref}->display_id(); + return if (!$sid); + return if (!$self->{qual_ref}->display_id()); + return $sid if ($sid eq $self->{qual_ref}->display_id()); + # should this become a warning? + # print("ids $sid and $self->{qual_ref}->display_id() do not match. Bummer.\n"); +} + +=head2 _common_display_id() + + Title : _common_id() + Usage : $common_id = $self->_common_display_id(); + Function: Compare the display_id of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->display_id() + Args : None. + +=cut + +#' +sub _common_display_id { + my $self = shift; + $self->common_id(); +} + +=head2 _common_accession_number() + + Title : _common_accession_number() + Usage : $common_id = $self->_common_accession_number(); + Function: Compare the accession_number() of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->accession_number() + Args : None. + +=cut + +#' +sub _common_accession_number { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + my $acc = $self->{seq_ref}->accession_number(); + # if (!$acc) { print("the seqref has no acc.\n"); } + return if (!$acc); + # if ($acc eq $self->{qual_ref}->accession_number()) { print("$acc matches ".$self->{qual_ref}->accession_number()."\n"); } + return $acc if ($acc eq $self->{qual_ref}->accession_number()); + # should this become a warning? + # print("accession numbers $acc and $self->{qual_ref}->accession_number() do not match. Bummer.\n"); +} + +=head2 _common_primary_id() + + Title : _common_primary_id() + Usage : $common_primard_id = $self->_common_primary_id(); + Function: Compare the primary_id of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->primary_id() + Args : None. + +=cut + +#' +sub _common_primary_id { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + my $pid = $self->{seq_ref}->primary_id(); + return if (!$pid); + return $pid if ($pid eq $self->{qual_ref}->primary_id()); + # should this become a warning? + # print("primary_ids $pid and $self->{qual_ref}->primary_id() do not match. Bummer.\n"); + +} + +=head2 _common_desc() + + Title : _common_desc() + Usage : $common_desc = $self->_common_desc(); + Function: Compare the desc of {qual_ref} and {seq_ref}. + Returns : Nothing if they don't match. If they do return + {seq_ref}->desc() + Args : None. + +=cut + +#' +sub _common_desc { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + my $des = $self->{seq_ref}->desc(); + return if (!$des); + return $des if ($des eq $self->{qual_ref}->desc()); + # should this become a warning? + # print("descriptions $des and $self->{qual_ref}->desc() do not match. Bummer.\n"); + +} + +=head2 set_common_descriptors() + + Title : set_common_descriptors() + Usage : $self->set_common_descriptors(); + Function: Compare the descriptors (id,accession_number,display_id, + primary_id, desc) for the PrimarySeq and PrimaryQual objects + within the SeqWithQuality object. If they match, make that + descriptor the descriptor for the SeqWithQuality object. + Returns : Nothing. + Args : None. + +=cut + +sub set_common_descriptors { + my $self = shift; + return if ($self->{seq_ref} || $self->{qual_ref}); + &_common_id(); + &_common_display_id(); + &_common_accession_number(); + &_common_primary_id(); + &_common_desc(); +} + +=head2 alphabet() + + Title : alphabet(); + Usage : $molecule_type = $obj->alphabet(); + Function: Get the molecule type from the PrimarySeq object. + Returns : What what PrimarySeq says the type of the sequence is. + Args : None. + +=cut + +sub alphabet { + my $self = shift; + return $self->{seq_ref}->alphabet(); +} + +=head2 display_id() + + Title : display_id() + Usage : $id_string = $obj->display_id(); + Function: Returns the display id, aka the common name of the Quality + object. + The semantics of this is that it is the most likely string to be + used as an identifier of the quality sequence, and likely to have + "human" readability. The id is equivalent to the ID field of the + GenBank/EMBL databanks and the id field of the Swissprot/sptrembl + database. In fasta format, the >(\S+) is presumed to be the id, + though some people overload the id to embed other information. + Bioperl does not use any embedded information in the ID field, + and people are encouraged to use other mechanisms (accession + field for example, or extending the sequence object) to solve + this. Notice that $seq->id() maps to this function, mainly for + legacy/convience issues. + This method sets the display_id for the SeqWithQuality object. + Returns : A string + Args : If a scalar is provided, it is set as the new display_id for + the SeqWithQuality object. + Status : Virtual + +=cut + +sub display_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'display_id'} = $value; + } + return $obj->{'display_id'}; + +} + +=head2 accession_number() + + Title : accession_number() + Usage : $unique_biological_key = $obj->accession_number(); + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the unique id + for the implemetation, allowing multiple objects to have the same + accession number in a particular implementation. For sequences + with no accession number, this method should return "unknown". + This method sets the accession_number for the SeqWithQuality + object. + Returns : A string (the value of accession_number) + Args : If a scalar is provided, it is set as the new accession_number + for the SeqWithQuality object. + Status : Virtual + + +=cut + +sub accession_number { + my( $obj, $acc ) = @_; + + if (defined $acc) { + $obj->{'accession_number'} = $acc; + } else { + $acc = $obj->{'accession_number'}; + $acc = 'unknown' unless defined $acc; + } + return $acc; +} + +=head2 primary_id() + + Title : primary_id() + Usage : $unique_implementation_key = $obj->primary_id(); + Function: Returns the unique id for this object in this implementation. + This allows implementations to manage their own object ids in a + way the implementaiton can control clients can expect one id to + map to one object. For sequences with no accession number, this + method should return a stringified memory location. + This method sets the primary_id for the SeqWithQuality + object. + Returns : A string. (the value of primary_id) + Args : If a scalar is provided, it is set as the new primary_id for + the SeqWithQuality object. + +=cut + +sub primary_id { + my ($obj,$value) = @_; + if ($value) { + $obj->{'primary_id'} = $value; + } + return $obj->{'primary_id'}; + +} + +=head2 desc() + + Title : desc() + Usage : $qual->desc($newval); _or_ + $description = $qual->desc(); + Function: Get/set description text for this SeqWithQuality object. + Returns : A string. (the value of desc) + Args : If a scalar is provided, it is set as the new desc for the + SeqWithQuality object. + +=cut + +sub desc { + # a mechanism to set the disc for the SeqWithQuality object. + # probably will be used most often by set_common_features() + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'desc'} = $value; + } + return $obj->{'desc'}; +} + +=head2 id() + + Title : id() + Usage : $id = $qual->id(); + Function: Return the ID of the quality. This should normally be (and + actually is in the implementation provided here) just a synonym + for display_id(). + Returns : A string. (the value of id) + Args : If a scalar is provided, it is set as the new id for the + SeqWithQuality object. + +=cut + +sub id { + my ($self,$value) = @_; + if (!$self) { $self->throw("no value for self in $value"); } + if( defined $value ) { + return $self->display_id($value); + } + return $self->display_id(); +} + +=head2 seq + + Title : seq() + Usage : $string = $obj->seq(); _or_ + $obj->seq("atctatcatca"); + Function: Returns the sequence that is contained in the imbedded in the + PrimarySeq object within the SeqWithQuality object + Returns : A scalar (the seq() value for the imbedded PrimarySeq object.) + Args : If a scalar is provided, the SeqWithQuality object will + attempt to set that as the sequence for the imbedded PrimarySeq + object. Otherwise, the value of seq() for the PrimarySeq object + is returned. + Notes : This is probably not a good idea because you then should call + length() to make sure that the sequence and quality are of the + same length. Even then, how can you make sure that this sequence + belongs with that quality? I provided this to give you rope to + hang yourself with. Tie it to a strong device and use a good + knot. + +=cut + +sub seq { + my ($self,$value) = @_; + if( defined $value) { + $self->{seq_ref}->seq($value); + $self->length(); + } + return $self->{seq_ref}->seq(); +} + +=head2 qual() + + Title : qual() + Usage : @quality_values = @{$obj->qual()}; _or_ + $obj->qual("10 10 20 40 50"); + Function: Returns the quality as imbedded in the PrimaryQual object + within the SeqWithQuality object. + Returns : A reference to an array containing the quality values in the + PrimaryQual object. + Args : If a scalar is provided, the SeqWithQuality object will + attempt to set that as the quality for the imbedded PrimaryQual + object. Otherwise, the value of qual() for the PrimaryQual + object is returned. + Notes : This is probably not a good idea because you then should call + length() to make sure that the sequence and quality are of the + same length. Even then, how can you make sure that this sequence + belongs with that quality? I provided this to give you a strong + board with which to flagellate yourself. + +=cut + +sub qual { + my ($self,$value) = @_; + + if( defined $value) { + $self->{qual_ref}->qual($value); + # update the lengths + $self->length(); + } + return $self->{qual_ref}->qual(); +} + + + + +=head2 length() + + Title : length() + Usage : $length = $seqWqual->length(); + Function: Get the length of the SeqWithQuality sequence/quality. + Returns : Returns the length of the sequence and quality if they are + both the same. Returns "DIFFERENT" if they differ. + Args : None. + +=cut + +sub length { + my $self = shift; + # what do I return here? Whew. Ambiguity... + ######## + +} + + +=head2 qual_obj + + Title : qual_obj($different_obj) + Usage : $qualobj = $seqWqual->qual_obj(); _or_ + $qualobj = $seqWqual->qual_obj($ref_to_primaryqual_obj); + Function: Get the PrimaryQual object that is imbedded in the + SeqWithQuality object or if a reference to a PrimaryQual object + is provided, set this as the PrimaryQual object imbedded in the + SeqWithQuality object. + Returns : A reference to a Bio::Seq::SeqWithQuality object. + +=cut + +sub qual_obj { + my ($self,$value) = @_; + return $self->{swq}->qual_obj($value); +} + + +=head2 seq_obj + + Title : seq_obj() + Usage : $seqobj = $seqWqual->qual_obj(); _or_ + $seqobj = $seqWqual->seq_obj($ref_to_primary_seq_obj); + Function: Get the PrimarySeq object that is imbedded in the + SeqWithQuality object or if a reference to a PrimarySeq object is + provided, set this as the PrimarySeq object imbedded in the + SeqWithQuality object. + Returns : A reference to a Bio::PrimarySeq object. + +=cut + +sub seq_obj { + my ($self,$value) = @_; + return $self->{swq}->seq_obj($value); +} + +=head2 _set_descriptors + + Title : _set_descriptors() + Usage : $seqWqual->_qual_obj($qual,$seq,$id,$acc,$pid,$desc,$given_id, + $alphabet); + Function: Set the descriptors for the SeqWithQuality object. Try to + match the descriptors in the PrimarySeq object and in the + PrimaryQual object if descriptors were not provided with + construction. + Returns : Nothing. + Args : $qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet as found + in the new() method. + Notes : Really only intended to be called by the new() method. If + you want to invoke a similar function try + set_common_descriptors(). + +=cut + + +sub _set_descriptors { + my ($self,$qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet) = @_; + $self->{swq}->_seq_descriptors($qual,$seq,$id,$acc,$pid,$desc,$given_id,$alphabet); +} + +=head2 subseq($start,$end) + + Title : subseq($start,$end) + Usage : $subsequence = $obj->subseq($start,$end); + Function: Returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence. + Returns : A string. + Args : Two positions. + +=cut + +sub subseq { + my ($self,@args) = @_; + # does a single value work? + return $self->{swq}->subseq(@args); +} + +=head2 baseat($position) + + Title : baseat($position) + Usage : $base_at_position_6 = $obj->baseat("6"); + Function: Returns a single base at the given position, where the first + base is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence. + Returns : A scalar. + Args : A position. + +=cut + +sub baseat { + my ($self,$val) = @_; + return $self->{swq}->subseq($val,$val); +} + +=head2 subqual($start,$end) + + Title : subqual($start,$end) + Usage : @qualities = @{$obj->subqual(10,20); + Function: returns the quality values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be equal. + Returns : A reference to an array. + Args : a start position and an end position + +=cut + +sub subqual { + my ($self,@args) = @_; + return $self->{swq}->subqual(@args); +} + +=head2 qualat($position) + + Title : qualat($position) + Usage : $quality = $obj->qualat(10); + Function: Return the quality value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be equal. + Returns : A scalar. + Args : A position. + +=cut + +sub qualat { + my ($self,$val) = @_; + return $self->{swq}->qualat($val); +} + +=head2 sub_trace_index($start,$end) + + Title : sub_trace_index($start,$end) + Usage : @trace_indices = @{$obj->sub_trace_index(10,20); + Function: returns the trace index values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be e_trace_index. + Returns : A reference to an array. + Args : a start position and an end position + +=cut + +sub sub_trace_index { + my ($self,$start,$end) = @_; + + if( $start > $end ){ + $self->throw("in sub_trace_index, start [$start] has to be greater than end [$end]"); + } + + if( $start <= 0 || $end > $self->length ) { + $self->throw("You have to have start positive and length less than the total length of sequence [$start:$end] Total ".$self->length.""); + } + + # remove one from start, and then length is end-start + + $start--; + $end--; + my @sub_trace_index_array = @{$self->{trace_indices}}[$start..$end]; + + # return substr $self->seq(), $start, ($end-$start); + return \@sub_trace_index_array; + +} + + + + +=head2 trace_index_at($position) + + Title : trace_index_at($position) + Usage : $trace_index = $obj->trace_index_at(10); + Function: Return the trace_index value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the + first two bases of the sequence. Start cannot be larger than + end but can be etrace_index_. + Returns : A scalar. + Args : A position. + +=cut + +sub trace_index_at { + my ($self,$val) = @_; + my @trace_index_at = @{$self->sub_trace_index($val,$val)}; + if (scalar(@trace_index_at) == 1) { + return $trace_index_at[0]; + } + else { + $self->throw("AAAH! trace_index_at provided more then one quality."); + } +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Seq/TraceI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq/TraceI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,237 @@ +# BioPerl module for Bio::Seq::TraceI +# +# Cared for by Chad Matsalla <bioinformatics@dieselwurks.com +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq::TraceI - Interface definition for a Bio::Seq::Trace + +=head1 SYNOPSIS + + # get a Bio::Seq::Qual compliant object somehow + $st = &get_object_somehow(); + + # to test this is a seq object + $st->isa("Bio::Seq::TraceI") + || $obj->throw("$obj does not implement the Bio::Seq::TraceI interface"); + + # set the trace for T to be @trace_points + my $arrayref = $st->trace("T",\@trace_points); + # get the trace points for "C" + my $arrayref = $st->trace("C"); + # get a subtrace for "G" from 10 to 100 + $arrayref = $st->subtrace("G",10,100); + # what is the trace value for "A" at position 355? + my $trace_calue = $st->traceat("A",355); + # create a false trace for "A" with $accuracy + $arrayref = $st->false_trace("A",Bio::Seq::SeqWithQuality, $accuracy); + # does this trace have entries for each base? + $bool = $st->is_complete(); + # how many entries are there in this trace? + $length = $st->length(); + + + +=head1 DESCRIPTION + +This object defines an abstract interface to basic trace information. This +information may have come from an ABI- or scf- formatted file or may have been +made up. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Seq::TraceI; +use vars qw(@ISA); +use strict; +use Carp; +use Dumpvalue; + +=head1 Implementation Specific Functions + +These functions are the ones that a specific implementation must +define. + +=head2 trace($base,\@new_values) + + Title : trace($base,\@new_values) + Usage : @trace_Values = @{$obj->trace($base,\@new_values)}; + Function: Returns the trace values as a reference to an array containing the + trace values. The individual elements of the trace array are not validated + and can be any numeric value. + Returns : A reference to an array. + Status : +Arguments: $base : which color channel would you like the trace values for? + - $base must be one of "A","T","G","C" + \@new_values : a reference to an array of values containing trace + data for this base + +=cut + +sub trace { + my ($self) = @_; + if( $self->can('throw') ) { + $self->throw("Bio::Seq::TraceI definition of trace - implementing class did not provide this method"); + } else { + confess("Bio::Seq::TraceI definition of trace - implementing class did not provide this method"); + } +} + +=head2 subtrace($base,$start,$end) + + Title : subtrace($base,$start,$end) + Usage : @subset_of_traces = @{$obj->subtrace("A",10,40)}; + Function: returns the trace values from $start to $end, where the + first value is 1 and the number is inclusive, ie 1-2 are the first + two trace values of this base. Start cannot be larger than end but can + be equal. + Returns : A reference to an array. + Args : $base: "A","T","G" or "C" + $start: a start position + $end : an end position + +=cut + +sub subtrace { + my ($self) = @_; + + if( $self->can('throw') ) { + $self->throw("Bio::Seq::TraceI definition of subtrace - implementing class did not provide this method"); + } else { + confess("Bio::Seq::TraceI definition of subtrace - implementing class did not provide this method"); + } + +} + +=head2 can_call_new() + + Title : can_call_new() + Usage : if( $obj->can_call_new ) { + $newobj = $obj->new( %param ); + } + Function: can_call_new returns 1 or 0 depending on whether an + implementation allows new constructor to be called. If a new + constructor is allowed, then it should take the followed hashed + constructor list. + $myobject->new( -qual => $quality_as_string, + -display_id => $id, + -accession_number => $accession, + ); + Example : + Returns : 1 or 0 + Args : + + +=cut + +sub can_call_new{ + my ($self,@args) = @_; + # we default to 0 here + return 0; +} + +=head2 traceat($channel,$position) + + Title : qualat($channel,$position) + Usage : $trace = $obj->traceat(500); + Function: Return the trace value at the given location, where the + first value is 1 and the number is inclusive, ie 1-2 are the first + two bases of the sequence. Start cannot be larger than end but can + be equal. + Returns : A scalar. + Args : A base and a position. + +=cut + +sub traceat { + my ($self,$value) = @_; + if( $self->can('warn') ) { + $self->warn("Bio::Seq::TraceI definition of traceat - implementing class did not provide this method"); + } else { + warn("Bio::Seq::TraceI definition of traceat - implementing class did not provide this method"); + } + return ''; +} + +=head2 length() + + Title : length() + Usage : $length = $obj->length("A"); + Function: Return the length of the array holding the trace values for the "A" + channel. A check should be done to make sure that this Trace object + is_complete() before doing this to prevent hazardous results. + Returns : A scalar (the number of elements in the quality array). + Args : If used, get the traces from that channel. Default to "A" + +=cut + +sub length { + my ($self)= @_; + if( $self->can('throw') ) { + $self->throw("Bio::Seq::TraceI definition of length - implementing class did not provide this method"); + } else { + confess("Bio::Seq::TraceI definition of length - implementing class did not provide this method"); + } +} + +=head2 trace_indices($new_indices) + + Title : trace_indices($new_indices) + Usage : $indices = $obj->trace_indices($new_indices); + Function: Return the trace iindex points for this object. + Returns : A scalar + Args : If used, the trace indices will be set to the provided value. + +=cut + +sub trace_indices { + my ($self)= @_; + if( $self->can('throw') ) { + $self->throw("Bio::Seq::TraceI definition of trace_indices - implementing class did not provide this method"); + } else { + confess("Bio::Seq::TraceI definition of trace_indices - implementing class did not provide this method"); + } +} + + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqAnalysisParserI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqAnalysisParserI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,104 @@ +# $Id: SeqAnalysisParserI.pm,v 1.12 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module for Bio::SeqAnalysisParserI +# +# Cared for by Jason Stajich <jason@bioperl.org>, +# and Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Jason Stajich, Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqAnalysisParserI - Sequence analysis output parser interface + +=head1 SYNOPSIS + + # get a SeqAnalysisParserI somehow, e.g. by + my $parser = Bio::Factory::SeqAnalysisParserFactory->get_parser( + '-input' => 'inputfile', '-method' => 'genscan'); + while( my $feature = $parser->next_feature() ) { + print "Feature from ", $feature->start, " to ", $feature->end, "\n"; + } + +=head1 DESCRIPTION + +SeqAnalysisParserI is a generic interface for describing sequence analysis +result parsers. Sequence analysis in this sense is a search for similarities +or the identification of features on the sequence, like a databank search or a +a gene prediction result. + +The concept behind this interface is to have a generic interface in sequence +annotation pipelines (as used e.g. in high-throughput automated +sequence annotation). This interface enables plug-and-play for new analysis +methods and their corresponding parsers without the necessity for modifying +the core of the annotation pipeline. In this concept the annotation pipeline +has to rely on only a list of methods for which to process the results, and a +factory from which it can obtain the corresponding parser implementing this +interface. + +See Bio::Factory::SeqAnalysisParserFactoryI and +Bio::Factory::SeqAnalysisParserFactory for interface and an implementation +of the corresponding factory. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Jason Stajich + +Email Hilmar Lapp E<lt>hlapp@gmx.netE<gt>, Jason Stajich E<lt>jason@bioperl.orgE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::SeqAnalysisParserI; +use strict; +use vars qw(@ISA); +use Bio::Root::RootI; +use Carp; +@ISA = qw(Bio::Root::RootI); + +=head2 next_feature + + Title : next_feature + Usage : $seqfeature = $obj->next_feature(); + Function: Returns the next feature available in the analysis result, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI implementing object, or undef if there are no + more features. + Args : none + +=cut + +sub next_feature { + my ($self) = shift; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/AnnotationAdaptor.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/AnnotationAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,440 @@ +# $Id: AnnotationAdaptor.pm,v 1.4 2002/11/11 18:16:31 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::AnnotationAdaptor +# +# Cared for by Hilmar Lapp <hlapp at gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Hilmar Lapp, hlapp at gmx.net, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::AnnotationAdaptor - integrates SeqFeatureIs annotation + +=head1 SYNOPSIS + + use Bio::SeqFeature::Generic; + use Bio::SeqFeature::AnnotationAdaptor; + + # obtain a SeqFeatureI implementing object somehow + my $feat = Bio::SeqFeature::Generic->new(-start => 10, -end => 20); + + # add tag/value annotation + $feat->add_tag_value("mytag", "value of tag mytag"); + $feat->add_tag_value("mytag", "another value of tag mytag"); + + # Bio::SeqFeature::Generic also provides annotation(), which returns a + # Bio::AnnotationCollectionI compliant object + $feat->annotation->add_Annotation("dbxref", $dblink); + + # to integrate tag/value annotation with AnnotationCollectionI + # annotation, use this adaptor, which also implements + # Bio::AnnotationCollectionI + my $anncoll = Bio::SeqFeature::AnnotationAdaptor(-feature => $feat); + + # this will now return tag/value pairs as + # Bio::Annotation::SimpleValue objects + my @anns = $anncoll->get_Annotations("mytag"); + # other added before annotation is available too + my @dblinks = $anncoll->get_Annotations("dbxref"); + + # also supports transparent adding of tag/value pairs in + # Bio::AnnotationI flavor + my $tagval = Bio::Annotation::SimpleValue->new(-value => "some value", + -tagname => "some tag"); + $anncoll->add_Annotation($tagval); + # this is now also available from the feature's tag/value system + my @vals = $feat->each_tag_value("some tag"); + +=head1 DESCRIPTION + +L<Bio::SeqFeatureI> defines light-weight annotation of features +through tag/value pairs. Conversely, L<Bio::AnnotationCollectionI> +together with L<Bio::AnnotationI> defines an annotation bag, which is +better typed, but more heavy-weight because it contains every single +piece of annotation as objects. The frequently used base +implementation of Bio::SeqFeatureI, Bio::SeqFeature::Generic, defines +an additional slot for AnnotationCollectionI-compliant annotation. + +This adaptor provides a L<Bio::AnnotationCollectionI> compliant, +unified, and integrated view on the annotation of L<Bio::SeqFeatureI> +objects, including tag/value pairs, and annotation through the +annotation() method, if the object supports it. Code using this +adaptor does not need to worry about the different ways of possibly +annotating a SeqFeatureI object, but can instead assume that it +strictly follows the AnnotationCollectionI scheme. The price to pay is +that retrieving and adding annotation will always use objects instead +of light-weight tag/value pairs. + +In other words, this adaptor allows us to keep the best of both +worlds. If you create tens of thousands of feature objects, and your +only annotation is tag/value pairs, you are best off using the +features' native tag/value system. If you create a smaller number of +features, but with rich and typed annotation mixed with tag/value +pairs, this adaptor may be for you. Since its implementation is by +double-composition, you only need to create one instance of the +adaptor. In order to transparently annotate a feature object, set the +feature using the feature() method. Every annotation you add will be +added to the feature object, and hence will not be lost when you set +feature() to the next object. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp at gmx.net + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::AnnotationAdaptor; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::AnnotatableI; +use Bio::AnnotationCollectionI; +use Bio::Annotation::SimpleValue; + +@ISA = qw(Bio::Root::Root Bio::AnnotationCollectionI Bio::AnnotatableI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SeqFeature::AnnotationAdaptor(); + Function: Builds a new Bio::SeqFeature::AnnotationAdaptor object + Returns : an instance of Bio::SeqFeature::AnnotationAdaptor + Args : Named parameters + -feature the Bio::SeqFeatureI implementing object to adapt + (mandatory to be passed here, or set via feature() + before calling other methods) + -annotation the Bio::AnnotationCollectionI implementing object + for storing richer annotation (this will default to + the $feature->annotation() if it supports it) + -tagvalue_factory the object factory to use for creating tag/value + pair representing objects + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($feat,$anncoll,$fact) = + $self->_rearrange([qw(FEATURE + ANNOTATION + TAGVALUE_FACTORY)], @args); + + $self->feature($feat) if $feat; + $self->annotation($anncoll) if $feat; + $self->tagvalue_object_factory($fact) if $fact; + + return $self; +} + +=head2 feature + + Title : feature + Usage : $obj->feature($newval) + Function: Get/set the feature that this object adapts to an + AnnotationCollectionI. + Example : + Returns : value of feature (a Bio::SeqFeatureI compliant object) + Args : new value (a Bio::SeqFeatureI compliant object, optional) + + +=cut + +sub feature{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'feature'} = $value; + } + return $self->{'feature'}; +} + +=head2 annotation + + Title : annotation + Usage : $obj->annotation($newval) + Function: Get/set the AnnotationCollectionI implementing object used by + this adaptor to store additional annotation that cannot be stored + by the SeqFeatureI itself. + + If requested before having been set, the value will default to the + annotation object of the feature if it has one. + Example : + Returns : value of annotation (a Bio::AnnotationCollectionI compliant object) + Args : new value (a Bio::AnnotationCollectionI compliant object, optional) + + +=cut + +sub annotation{ + my ($self,$value) = @_; + + if( defined $value) { + $self->{'annotation'} = $value; + } + if((! exists($self->{'annotation'})) && + $self->feature()->can('annotation')) { + return $self->feature()->annotation(); + } + return $self->{'annotation'}; +} + +=head1 AnnotationCollectionI implementing methods + +=cut + +=head2 get_all_annotation_keys + + Title : get_all_annotation_keys + Usage : $ac->get_all_annotation_keys() + Function: gives back a list of annotation keys, which are simple text strings + Returns : list of strings + Args : none + +=cut + +sub get_all_annotation_keys{ + my ($self) = @_; + my @keys = (); + + # get the tags from the feature object + push(@keys, $self->feature()->all_tags()); + # ask the annotation implementation in addition, while avoiding duplicates + if($self->annotation()) { + push(@keys, + grep { ! $self->feature->has_tag($_); } + $self->annotation()->get_all_annotation_keys()); + } + # done + return @keys; +} + + +=head2 get_Annotations + + Title : get_Annotations + Usage : my @annotations = $collection->get_Annotations('key') + Function: Retrieves all the Bio::AnnotationI objects for a specific key + Returns : list of Bio::AnnotationI - empty if no objects stored for a key + Args : string which is key for annotations + +=cut + +sub get_Annotations{ + my ($self, $key) = @_; + my @anns = (); + + # if the feature has tag/value pair for this key as the tag + if($self->feature()->has_tag($key)) { + my $fact = $self->tagvalue_object_factory(); + # add each tag/value pair as a SimpleValue object + foreach my $val ($self->feature()->each_tag_value($key)) { + my $ann; + if($fact) { + $ann = $fact->create_object(-value => $val, -tagname => $key); + } else { + $ann = Bio::Annotation::SimpleValue->new(-value => $val, + -tagname => $key); + } + push(@anns, $ann); + } + } + # add what is in the annotation implementation if any + if($self->annotation()) { + push(@anns, $self->annotation->get_Annotations($key)); + } + # done + return @anns; +} + +=head2 get_num_of_annotations + + Title : get_num_of_annotations + Usage : my $count = $collection->get_num_of_annotations() + Function: Returns the count of all annotations stored in this collection + Returns : integer + Args : none + + +=cut + +sub get_num_of_annotations{ + my ($self) = @_; + + # first, count the number of tags on the feature + my $num_anns = 0; + foreach ($self->feature()->all_tags()) { + $num_anns += $self->feature()->each_tag_value($_); + } + # add from the annotation implementation if any + if($self->annotation()) { + $num_anns += $self->annotation()->get_num_of_annotations(); + } + # done + return $num_anns; +} + +=head1 Implementation specific functions - to allow adding + +=cut + +=head2 add_Annotation + + Title : add_Annotation + Usage : $self->add_Annotation('reference',$object); + $self->add_Annotation($object,'Bio::MyInterface::DiseaseI'); + $self->add_Annotation($object); + $self->add_Annotation('disease',$object,'Bio::MyInterface::DiseaseI'); + Function: Adds an annotation for a specific key. + + If the key is omitted, the object to be added must provide a value + via its tagname(). + + If the archetype is provided, this and future objects added under + that tag have to comply with the archetype and will be rejected + otherwise. + + This implementation will add all Bio::Annotation::SimpleValue + objects to the adapted features as tag/value pairs. Caveat: this + may potentially result in information loss if a derived object + is supplied. + + Returns : none + Args : annotation key ('disease', 'dblink', ...) + object to store (must be Bio::AnnotationI compliant) + [optional] object archetype to map future storage of object + of these types to + +=cut + +sub add_Annotation{ + my ($self,$key,$object,$archetype) = @_; + + # if there's no key we use the tagname() as key + if(ref($key) && $key->isa("Bio::AnnotationI") && + (! ($object && ref($object)))) { + $archetype = $object if $object; + $object = $key; + $key = $object->tagname(); + $key = $key->name() if $key && ref($key); # OntologyTermI + $self->throw("Annotation object must have a tagname if key omitted") + unless $key; + } + + if( !defined $object ) { + $self->throw("Must have at least key and object in add_Annotation"); + } + + if( ! (ref($object) && $object->isa("Bio::AnnotationI")) ) { + $self->throw("object must be a Bio::AnnotationI compliant object, otherwise we wont add it!"); + } + + # ready to add -- if it's a SimpleValue, we add to the feature's tags, + # otherwise we'll add to the annotation collection implementation + + if($object->isa("Bio::Annotation::SimpleValue") && + $self->feature()->can('add_tag_value')) { + return $self->feature()->add_tag_value($key, $object->value()); + } else { + my $anncoll = $self->annotation(); + if(! $anncoll) { + $anncoll = Bio::Annotation::Collection->new(); + $self->annotation($anncoll); + } + if($anncoll->can('add_Annotation')) { + return $anncoll->add_Annotation($key,$object,$archetype); + } + $self->throw("Annotation implementation does not allow adding!"); + } +} + +=head1 Additional methods + +=cut + +=head2 tagvalue_object_factory + + Title : tagvalue_object_factory + Usage : $obj->tagval_object_factory($newval) + Function: Get/set the object factory to use for creating objects that + represent tag/value pairs (e.g., + Bio::Annotation::SimpleValue). + + The object to be created is expected to follow + Bio::Annotation::SimpleValue in terms of supported + arguments at creation time, and the methods. + + Example : + Returns : A Bio::Factory::ObjectFactoryI compliant object + Args : new value (a Bio::Factory::ObjectFactoryI compliant object, + optional) + + +=cut + +sub tagvalue_object_factory{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'tagval_object_factory'} = $value; + } + return $self->{'tagval_object_factory'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Collection.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Collection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,452 @@ +# $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $ +# +# BioPerl module for Bio::SeqFeature::Collection +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Collection - A container class for SeqFeatures +suitable for performing operations such as finding features within a +range, that match a certain feature type, etc. + +=head1 SYNOPSIS + + use Bio::SeqFeature::Collection; + use Bio::Location::Simple; + use Bio::Tools::GFF; + use Bio::Root::IO; + # let's first input some features + my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile + ("t","data","myco_sites.gff"), + -gff_version => 2); + my @features = (); + # loop over the input stream + while(my $feature = $gffio->next_feature()) { + # do something with feature + push @features, $feature; + } + $gffio->close(); + # build the Collection object + my $col = new Bio::SeqFeature::Collection(); + # add these features to the object + my $totaladded = $col->add_features(\@features); + + my @subset = $col->features_in_range(-start => 1, + -end => 25000, + -strand => 1, + -contain => 0); + # subset should have 18 entries for this dataset + print "size is ", scalar @subset, "\n"; + @subset = $col->features_in_range(-range => Bio::Location::Simple->new + (-start => 70000, + -end => 150000, + -strand => -1), + -contain => 1, + -strandmatch => 'strong'); + + # subset should have 22 entries for this dataset + print "size is ", scalar @subset, "\n"; + print "total number of features in collection is ", + $col->feature_count(),"\n"; + +=head1 DESCRIPTION + +This object will efficiently allow one for query subsets of ranges +within a large collection of sequence features (in fact the objects +just have to be Bio::RangeI compliant). This is done by the creation +of bins which are stored in order in a B-Tree data structure as +provided by the DB_File interface to the Berkeley DB. + +This is based on work done by Lincoln for storage in a mysql instance +- this is intended to be an embedded in-memory implementation for +easily quering for subsets of a large range set. All features are +held in memory even if the -usefile flag is provided. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Using code and strategy developed by Lincoln Stein (lstein@cshl.org) +in Bio::DB::GFF implementation. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Collection; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::DB::GFF::Util::Binning; +use DB_File; +use Bio::Location::Simple; + +@ISA = qw(Bio::Root::Root ); + + +# This may need to get re-optimized for BDB usage as these +# numbers were derived empirically by Lincoln on a mysql srv +# running on his laptop + +# this is the largest that any reference sequence can be (100 megabases) +use constant MAX_BIN => 100_000_000; + +# this is the smallest bin (1 K) +use constant MIN_BIN => 1_000; + +=head2 new + + Title : new + Usage : my $obj = new Bio::SeqFeature::Collection(); + Function: Builds a new Bio::SeqFeature::Collection object + Returns : Bio::SeqFeature::Collection + Args : + + -minbin minimum value to use for binning + (default is 100,000,000) + -maxbin maximum value to use for binning + (default is 1,000) + -usefile boolean to use a file to store + BTREE rather than an in-memory structure + (default is false or in-memory). + + -features Array ref of features to add initially + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN + USEFILE + FEATURES)],@args); + + defined $maxbin && $self->max_bin($maxbin); + defined $minbin && $self->min_bin($minbin); + + defined $features && $self->add_features($features); + $DB_BTREE->{'flags'} = R_DUP ; + $DB_BTREE->{'compare'} = \&_compare; +# $DB_BTREE->{'compare'} = \&_comparepack; + $self->{'_btreehash'} = {}; + + my $tmpname = undef; + if( $usefile ) { + $self->{'_io'} = new Bio::Root::IO; + (undef,$tmpname) = $self->{'_io'}->tempfile(); + unlink($tmpname); + $self->debug("tmpfile is $tmpname"); + } + $self->{'_btree'} = tie %{$self->{'_btreehash'}}, + 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE; + +# possibly storing/retrieving as floats for speed improvement? +# $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } ); +# $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } ); + + $self->{'_features'} = []; + return $self; +} + + +=head2 add_features + + Title : add_features + Usage : $collection->add_features(\@features); + Function: + Returns : number of features added + Args : arrayref of Bio::SeqFeatureI objects to index + + +=cut + +sub add_features{ + my ($self,$feats) = @_; + if( ref($feats) !~ /ARRAY/i ) { + $self->warn("Must provide a valid Array reference to add_features"); + return 0; + } + my $count = 0; + foreach my $f ( @$feats ) { + if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) { + $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n"); + next; + } + my $bin = bin($f->start,$f->end,$self->min_bin); + + push @{$self->{'_features'}}, $f; + $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}}; + $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n"); + $count++; + } + return $count; +} + + +=head2 features_in_range + + Title : features_in_range + Usage : my @features = $collection->features_in_range($range) + Function: Retrieves a list of features which were contained or overlap the + the requested range (see Args for way to specify overlap or + only those containe)d + Returns : List of Bio::SeqFeatureI objects + Args : -range => Bio::RangeI object defining range to search, + OR + -start => start, + -end => end, + -strand => strand + + -contain => boolean - true if feature must be completely + contained with range + OR false if should include features that simply overlap + the range. Default: true. + -strandmatch => 'strong', ranges must have the same strand + 'weak', ranges must have the same + strand or no strand + 'ignore', ignore strand information + Default. 'ignore'. + +=cut + +sub features_in_range{ + my $self = shift; + my (@args) = @_; + my ($range, $contain, $strandmatch,$start,$end,$strand); + if( @args == 1 ) { + $range = shift @args; + } else { + ($start,$end,$strand,$range, + $contain,$strandmatch) = $self->_rearrange([qw(START END + STRAND + RANGE CONTAIN + STRANDMATCH)], + @args); + $contain = 1 unless defined $contain; + } + $strand = 1 unless defined $strand; + if( $strand !~ /^([\-\+])$/ && + $strand !~ /^[\-\+]?1$/ ) { + $self->warn("must provide a valid numeric or +/- for strand"); + return (); + } + if( defined $1 ) { $strand .= 1; } + + if( !defined $start && !defined $end ) { + if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") ) + { + $self->warn("Must defined a valid Range for the method feature_in_range"); + return (); + } + ($start,$end,$strand) = ($range->start,$range->end,$range->strand); + } + my $r = new Bio::Location::Simple(-start => $start, + -end => $end, + -strand => $strand); + + my @features; + my $maxbin = $self->max_bin; + my $minbin = $self->min_bin; + my $tier = $maxbin; + my ($k,$v,@bins) = ("",undef); + while ($tier >= $minbin) { + my ($tier_start,$tier_stop) = (bin_bot($tier,$start), + bin_top($tier,$end)); + if( $tier_start == $tier_stop ) { + my @vals = $self->{'_btree'}->get_dup($tier_start); + if( scalar @vals > 0 ) { + push @bins, map { $self->{'_features'}->[$_] } @vals; + } + } else { + $k = $tier_start; + my @vals; + for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR); + $rc == 0; + $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) { + last if( $k > $tier_stop || $k < $tier_start); + push @vals, $v; + } + foreach my $v ( @vals ) { + if( defined $self->{'_features'}->[$v] ) { + push @bins, $self->{'_features'}->[$v] ; + } else { + + } + + } + } + $tier /= 10; + } + + $strandmatch = 'ignore' unless defined $strandmatch; + return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins : + grep { $r->overlaps($_,$strandmatch)} @bins; +} + +=head2 remove_features + + Title : remove_features + Usage : $collection->remove_features(\@array) + Function: Removes the requested sequence features (based on features + which have the same location) + Returns : Number of features removed + Args : Arrayref of Bio::RangeI objects + + +=cut + +sub remove_features{ + my ($self,$feats) = @_; + if( ref($feats) !~ /ARRAY/i ) { + $self->warn("Must provide a valid Array reference to remove_features"); + return 0; + } + my $countprocessed = 0; + foreach my $f ( @$feats ) { + next if ! ref($f) || ! $f->isa('Bio::RangeI'); + my $bin = bin($f->start,$f->end,$self->min_bin); + my @vals = $self->{'_btree'}->get_dup($bin); + my $vcount = scalar @vals; + foreach my $v ( @vals ) { + # eventually this array will become sparse... + if( $self->{'_features'}->[$v] == $f ) { + $self->{'_features'}->[$v] = undef; + $self->{'_btree'}->del_dup($bin,$v); + $vcount--; + } + } + if( $vcount == 0 ) { + $self->{'_btree'}->del($bin); + } + } +} + +=head2 get_all_features + + Title : get_all_features + Usage : my @f = $col->get_all_features() + Function: Return all the features stored in this collection (Could be large) + Returns : Array of Bio::RangeI objects + Args : None + + +=cut + +sub get_all_features{ + my ($self) = @_; + return grep {defined $_} @{ $self->{'_features'} }; +} + + +=head2 min_bin + + Title : min_bin + Usage : my $minbin= $self->min_bin; + Function: Get/Set the minimum value to use for binning + Returns : integer + Args : [optional] minimum bin value + + +=cut + +sub min_bin { + my ($self,$min) = @_; + if( defined $min ) { + $self->{'_min_bin'} = $min; + } + return $self->{'_min_bin'} || MIN_BIN; +} + +=head2 max_bin + + Title : max_bin + Usage : my $maxbin= $self->max_bin; + Function: Get/Set the maximum value to use for binning + Returns : integer + Args : [optional] maximum bin value + + +=cut + +sub max_bin { + my ($self,$max) = @_; + if( defined $max ) { + $self->{'_max_bin'} = $max; + } + return $self->{'max_bin'} || MAX_BIN; +} + +=head2 feature_count + + Title : feature_count + Usage : my $c = $col->feature_count() + Function: Retrieve the total number of features in the collection + Returns : integer + Args : none + + +=cut + +sub feature_count{ + my ($self) = @_; + return scalar ( grep {defined $_} @{ $self->{'_features'} }); +} + +sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 } + elsif ( defined $_[1] && ! defined $_[0] ) { return 1} + $_[0] <=> $_[1]} + +sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;} + +sub DESTROY { + my $self = shift; + $self->SUPER::DESTROY(); + if( defined $self->{'_io'} ) { + $self->{'_io'}->_io_cleanup(); + $self->{'_io'} = undef; + } + $self->{'_btree'} = undef; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/CollectionI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/CollectionI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,144 @@ +# $Id: CollectionI.pm,v 1.2 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::CollectionI +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::CollectionI - An interface for a collection of SeqFeatureI objects. + +=head1 SYNOPSIS + + +# get a Bio::SeqFeature::CollectionI somehow +# perhaps a Bio::SeqFeature::Collection + + + use Bio::SeqFeature::Collection; + my $collection = new Bio::SeqFeature::Collection; + $collection->add_features(\@featurelist); + + + $collection->features(-attributes => + [ { 'location' => new Bio::Location::Simple + (-start=> 1, -end => 300) , + 'overlaps' }]); + +=head1 DESCRIPTION + +This interface describes the basic methods needed for a collection of Sequence Features. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::CollectionI; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::RootI; + +@ISA = qw( Bio::Root::RootI ); + + +=head2 add_features + + Title : add_features + Usage : $collection->add_features(\@features); + Function: + Returns : number of features added + Args : arrayref of Bio::SeqFeatureI objects to index + +=cut + +sub add_features{ + shift->throw_not_implemented(); +} + + +=head2 features + + Title : features + Usage : my @f = $collection->features(@args); + Returns : a list of Bio::SeqFeatureI objects + Args : see below + Status : public + +This routine will retrieve features associated with this collection +object. It can be used to return all features, or a subset based on +their type, location, or attributes. + + -types List of feature types to return. Argument is an array + of Bio::Das::FeatureTypeI objects or a set of strings + that can be converted into FeatureTypeI objects. + + -callback A callback to invoke on each feature. The subroutine + will be passed to each Bio::SeqFeatureI object in turn. + + -attributes A hash reference containing attributes to match. + +The -attributes argument is a hashref containing one or more attributes +to match against: + + -attributes => { Gene => 'abc-1', + Note => 'confirmed' } + +Attribute matching is simple exact string matching, and multiple +attributes are ANDed together. See L<Bio::DB::ConstraintsI> for a +more sophisticated take on this. + +If one provides a callback, it will be invoked on each feature in +turn. If the callback returns a false value, iteration will be +interrupted. When a callback is provided, the method returns undef. + +=cut + +sub features{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Computation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Computation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,568 @@ +# $Id: Computation.pm,v 1.11 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Generic +# +# Cared for by mark Fiers <m.w.e.j.fiers@plant.wag-ur.nl> +# +# Copyright Ewan Birney, Mark Fiers +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Computation - Computation SeqFeature + +=head1 SYNOPSIS + + $feat = new Bio::SeqFeature::Computation ( + -start => 10, -end => 100, + -strand => -1, -primary => 'repeat', + -program_name => 'GeneMark', + -program_date => '12-5-2000', + -program_version => 'x.y', + -database_name => 'Arabidopsis', + -database_date => '12-dec-2000', + -computation_id => 2231, + -score => { no_score => 334 } ); + + +=head1 DESCRIPTION + +Bio::SeqFeature::Computation extends the Generic seqfeature object with +a set of computation related fields and a more flexible set of storing +more types of score and subseqfeatures. It is compatible with the Generic +SeqFeature object. + +The new way of storing score values is similar to the tag structure in the +Generic object. For storing sets of subseqfeatures the array containg the +subseqfeatures is now a hash which contains arrays of seqfeatures +Both the score and subSeqfeature methods can be called in exactly the same +way, the value's will be stored as a 'default' score or subseqfeature. + +=cut + +#' + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, Mark Fiers + +Ewan Birney E<lt>birney@sanger.ac.ukE<gt> +Mark Fiers E<lt>m.w.e.j.fiers@plant.wag-ur.nlE<gt> + +=head1 DEVELOPERS + +This class has been written with an eye out of inheritance. The fields +the actual object hash are: + + _gsf_sub_hash = reference to a hash containing sets of sub arrays + _gsf_score_hash= reference to a hash for the score values + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqFeature::Computation; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::SeqFeature::Generic); + +sub new { + my ( $class, @args) = @_; + + my $self = $class->SUPER::new(@args); + + + my ( $computation_id, + $program_name, $program_date, $program_version, + $database_name, $database_date, $database_version) = + $self->_rearrange([qw(COMPUTATION_ID + PROGRAM_NAME + PROGRAM_DATE + PROGRAM_VERSION + DATABASE_NAME + DATABASE_DATE + DATABASE_VERSION + )],@args); + + $program_name && $self->program_name($program_name); + $program_date && $self->program_date($program_date); + $program_version && $self->program_version($program_version); + $database_name && $self->database_name($database_name); + $database_date && $self->database_date($database_date); + $database_version && $self->database_version($database_version); + $computation_id && $self->computation_id($computation_id); + + return $self; +} + +=head2 has_score + + Title : has_score + Usage : $value = $self->has_score('some_score') + Function: Tests wether a feature contains a score + Returns : TRUE if the SeqFeature has the score, + and FALSE otherwise. + Args : The name of a score + +=cut + +sub has_score { + my ($self, $score) = @_; + return undef unless defined $score; + return exists $self->{'_gsf_score_hash'}->{$score}; +} + +=head2 add_score_value + + Title : add_score_value + Usage : $self->add_score_value('P_value',224); + Returns : TRUE on success + Args : score (string) and value (any scalar) + +=cut + +sub add_score_value { + my ($self, $score, $value) = @_; + if( ! defined $score || ! defined $value ) { + $self->warn("must specify a valid $score and $value to add_score_value"); + return 0; + } + + if ( !defined $self->{'_gsf_score_hash'}->{$score} ) { + $self->{'_gsf_score_hash'}->{$score} = []; + } + + push(@{$self->{'_gsf_score_hash'}->{$score}},$value); +} + +=head2 score + + Title : score + Usage : $value = $comp_obj->score() + $comp_obj->score($value) + Function: Returns the 'default' score or sets the 'default' score + This method exist for compatibility options + It would equal ($comp_obj->each_score_value('default'))[0]; + Returns : A value + Args : (optional) a new value for the 'default' score + +=cut + +sub score { + my ($self, $value) = @_; + my @v; + if (defined $value) { + + if( ref($value) =~ /HASH/i ) { + while( my ($t,$val) = each %{ $value } ) { + $self->add_score_value($t,$val); + } + } else { + @v = $value; + $self->add_score_value('default', $value); + } + + } else { + @v = $self->each_score_value('default'); + } + return $v[0]; +} + +=head2 each_score_value + + Title : each_score_value + Usage : @values = $gsf->each_score_value('note'); + Function: Returns a list of all the values stored + under a particular score. + Returns : A list of scalars + Args : The name of the score + +=cut + +sub each_score_value { + my ($self, $score) = @_; + if ( ! exists $self->{'_gsf_score_hash'}->{$score} ) { + $self->warn("asking for score value that does not exist $score"); + return undef; + } + return @{$self->{'_gsf_score_hash'}->{$score}}; +} + + +=head2 all_scores + + Title : all_scores + Usage : @scores = $feat->all_scores() + Function: Get a list of all the scores in a feature + Returns : An array of score names + Args : none + + +=cut + +sub all_scores { + my ($self, @args) = @_; + + return keys %{$self->{'_gsf_score_hash'}}; +} + + +=head2 remove_score + + Title : remove_score + Usage : $feat->remove_score('some_score') + Function: removes a score from this feature + Returns : nothing + Args : score (string) + + +=cut + +sub remove_score { + my ($self, $score) = @_; + + if ( ! exists $self->{'_gsf_score_hash'}->{$score} ) { + $self->warn("trying to remove a score that does not exist: $score"); + } + + delete $self->{'_gsf_score_hash'}->{$score}; +} + +=head2 computation_id + + Title : computation_id + Usage : $computation_id = $feat->computation_id() + $feat->computation_id($computation_id) + Function: get/set on program name information + Returns : string + Args : none if get, the new value if set + + +=cut + +sub computation_id { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_computation_id'} = $value; + } + + return $self->{'_gsf_computation_id'}; +} + + + + +=head2 program_name + + Title : program_name + Usage : $program_name = $feat->program_name() + $feat->program_name($program_name) + Function: get/set on program name information + Returns : string + Args : none if get, the new value if set + + +=cut + +sub program_name { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_program_name'} = $value; + } + + return $self->{'_gsf_program_name'}; +} + +=head2 program_date + + Title : program_date + Usage : $program_date = $feat->program_date() + $feat->program_date($program_date) + Function: get/set on program date information + Returns : date (string) + Args : none if get, the new value if set + + +=cut + +sub program_date { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_program_date'} = $value; + } + + return $self->{'_gsf_program_date'}; +} + + +=head2 program_version + + Title : program_version + Usage : $program_version = $feat->program_version() + $feat->program_version($program_version) + Function: get/set on program version information + Returns : date (string) + Args : none if get, the new value if set + + +=cut + +sub program_version { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_program_version'} = $value; + } + + return $self->{'_gsf_program_version'}; +} + +=head2 database_name + + Title : database_name + Usage : $database_name = $feat->database_name() + $feat->database_name($database_name) + Function: get/set on program name information + Returns : string + Args : none if get, the new value if set + +=cut + +sub database_name { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_database_name'} = $value; + } + + return $self->{'_gsf_database_name'}; +} + +=head2 database_date + + Title : database_date + Usage : $database_date = $feat->database_date() + $feat->database_date($database_date) + Function: get/set on program date information + Returns : date (string) + Args : none if get, the new value if set + + +=cut + +sub database_date { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_database_date'} = $value; + } + + return $self->{'_gsf_database_date'}; +} + + +=head2 database_version + + Title : database_version + Usage : $database_version = $feat->database_version() + $feat->database_version($database_version) + Function: get/set on program version information + Returns : date (string) + Args : none if get, the new value if set + + +=cut + +sub database_version { + my ($self,$value) = @_; + + if (defined($value)) { + $self->{'_gsf_database_version'} = $value; + } + + return $self->{'_gsf_database_version'}; + +} + +=head2 sub_SeqFeature_type + + Title : sub_SeqFeature_type + Usage : $sub_SeqFeature_type = $feat->sub_SeqFeature_type() + $feat->sub_SeqFeature_type($sub_SeqFeature_type) + Function: sub_SeqFeature_type is automatically set when adding + a sub_computation (sub_SeqFeature) to a computation object + Returns : sub_SeqFeature_type (string) + Args : none if get, the new value if set + +=cut + +sub sub_SeqFeature_type { + my ($self, $value) = @_; + + if (defined($value)) { + $self->{'_gsf_sub_SeqFeature_type'} = $value; + } + return $self->{'_gsf_sub_SeqFeature_type'}; +} + +=head2 all_sub_SeqFeature_types + + Title : all_Sub_SeqFeature_types + Usage : @all_sub_seqfeature_types = $comp->all_Sub_SeqFeature_types(); + Function: Returns an array with all subseqfeature types + Returns : An array + Args : none + +=cut + +sub all_sub_SeqFeature_types { + my ($self) = @_; + return keys ( %{$self->{'gsf_sub_hash'}} ); +} + +=head2 sub_SeqFeature + + Title : sub_SeqFeature('sub_feature_type') + Usage : @feats = $feat->sub_SeqFeature(); + @feats = $feat->sub_SeqFeature('sub_feature_type'); + Function: Returns an array of sub Sequence Features of a specific + type or, if the type is ommited, all sub Sequence Features + Returns : An array + Args : (optional) a sub_SeqFeature type (ie exon, pattern) + +=cut + +sub sub_SeqFeature { + my ($self, $ssf_type) = @_; + my (@return_array) = (); + if ($ssf_type eq '') { + #return all sub_SeqFeatures + foreach (keys ( %{$self->{'gsf_sub_hash'}} )){ + push @return_array, @{$self->{'gsf_sub_hash'}->{$_}}; + } + return @return_array; + } else { + if (defined ($self->{'gsf_sub_hash'}->{$ssf_type})) { + return @{$self->{'gsf_sub_hash'}->{$ssf_type}}; + } else { + $self->warn("$ssf_type is not a valid sub SeqFeature type"); + } + } +} + +=head2 add_sub_SeqFeature + + Title : add_sub_SeqFeature + Usage : $feat->add_sub_SeqFeature($subfeat); + $feat->add_sub_SeqFeature($subfeat,'sub_seqfeature_type') + $feat->add_sub_SeqFeature($subfeat,'EXPAND') + $feat->add_sub_SeqFeature($subfeat,'EXPAND','sub_seqfeature_type') + Function: adds a SeqFeature into a specific subSeqFeature array. + with no 'EXPAND' qualifer, subfeat will be tested + as to whether it lies inside the parent, and throw + an exception if not. + If EXPAND is used, the parents start/end/strand will + be adjusted so that it grows to accommodate the new + subFeature, + optionally a sub_seqfeature type can be defined. + Returns : nothing + Args : An object which has the SeqFeatureI interface + : (optional) 'EXPAND' + : (optional) 'sub_SeqFeature_type' + +=cut + +sub add_sub_SeqFeature{ + my ($self,$feat,$var1, $var2) = @_; + $var1 = '' unless( defined $var1); + $var2 = '' unless( defined $var2); + my ($expand, $ssf_type) = ('', $var1 . $var2); + $expand = 'EXPAND' if ($ssf_type =~ s/EXPAND//); + + if ( !$feat->isa('Bio::SeqFeatureI') ) { + $self->warn("$feat does not implement Bio::SeqFeatureI. Will add it anyway, but beware..."); + } + + if($expand eq 'EXPAND') { + $self->_expand_region($feat); + } else { + if ( !$self->contains($feat) ) { + $self->throw("$feat is not contained within parent feature, and expansion is not valid"); + } + } + + $ssf_type = 'default' if ($ssf_type eq ''); + + if (!(defined ($self->{'gsf_sub_hash'}->{$ssf_type}))) { + @{$self->{'gsf_sub_hash'}->{$ssf_type}} = (); + } + $feat->sub_SeqFeature_type($ssf_type); + push @{$self->{'gsf_sub_hash'}->{$ssf_type}}, $feat; +} + +=head2 flush_sub_SeqFeature + + Title : flush_sub_SeqFeature + Usage : $sf->flush_sub_SeqFeature + $sf->flush_sub_SeqFeature('sub_SeqFeature_type'); + Function: Removes all sub SeqFeature or all sub SeqFeatures + of a specified type + (if you want to remove a more specific subset, take + an array of them all, flush them, and add + back only the guys you want) + Example : + Returns : none + Args : none + + +=cut + +sub flush_sub_SeqFeature { + my ($self, $ssf_type) = @_; + if ($ssf_type) { + if ((defined ($self->{'gsf_sub_hash'}->{$ssf_type}))) { + delete $self->{'gsf_sub_hash'}->{$ssf_type}; + } else { + $self->warn("$ssf_type is not a valid sub SeqFeature type"); + } + } else { + $self->{'_gsf_sub_hash'} = {}; # zap the complete hash implicitly. + } +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/FeaturePair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/FeaturePair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,475 @@ +# $Id: FeaturePair.pm,v 1.17 2002/10/08 08:38:31 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::FeaturePair +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::FeaturePair - hold pair feature information e.g. blast hits + +=head1 SYNOPSIS + + my $feat = new Bio::SeqFeature::FeaturePair(-feature1 => $f1, + -feature2 => $f2, + ); + + # Bio::SeqFeatureI methods can be used + + my $start = $feat->start; + my $end = $feat->end; + + # Bio::FeaturePair methods can be used + my $hstart = $feat->hstart; + my $hend = $feat->hend; + + my $feature1 = $feat->feature1; # returns feature1 object + +=head1 DESCRIPTION + +A sequence feature object where the feature is itself a feature on +another sequence - e.g. a blast hit where residues 1-40 of a protein +sequence SW:HBA_HUMAN has hit to bases 100 - 220 on a genomic sequence +HS120G22. The genomic sequence coordinates are used to create one +sequence feature $f1 and the protein coordinates are used to create +feature $f2. A FeaturePair object can then be made + + my $fp = new Bio::SeqFeature::FeaturePair(-feature1 => $f1, # genomic + -feature2 => $f2, # protein + ); + +This object can be used as a standard Bio::SeqFeatureI in which case + + my $gstart = $fp->start # returns start coord on feature1 - genomic seq. + my $gend = $fp->end # returns end coord on feature1. + +In general standard Bio::SeqFeatureI method calls return information +in feature1. + +Data in the feature 2 object are generally obtained using the standard +methods prefixed by h (for hit!) + + my $pstart = $fp->hstart # returns start coord on feature2 = protein seq. + my $pend = $fp->hend # returns end coord on feature2. + +If you wish to swap feature1 and feature2 around : + + $feat->invert + +so... + + $feat->start # etc. returns data in $feature2 object + + +No sub_SeqFeatures or tags can be stored in this object directly. Any +features or tags are expected to be stored in the contained objects +feature1, and feature2. + +=head1 CONTACT + +Ewan Birney E<lt>birney@sanger.ac.ukE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::FeaturePair; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeatureI; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::SeqFeature::Generic); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($feature1,$feature2) = + $self->_rearrange([qw(FEATURE1 + FEATURE2 + )],@args); + + # Store the features in the object + $feature1 && $self->feature1($feature1); + $feature2 && $self->feature2($feature2); + return $self; +} + +=head2 feature1 + + Title : feature1 + Usage : $f = $featpair->feature1 + $featpair->feature1($feature) + Function: Get/set for the query feature + Returns : Bio::SeqFeatureI + Args : Bio::SeqFeatureI + + +=cut + +sub feature1 { + my ($self,$arg) = @_; + if ( defined($arg) || !defined $self->{'feature1'} ) { + $arg = new Bio::SeqFeature::Generic() unless( defined $arg); + $self->throw("Argument [$arg] must be a Bio::SeqFeatureI") + unless (ref($arg) && $arg->isa("Bio::SeqFeatureI")); + $self->{'feature1'} = $arg; + } + return $self->{'feature1'}; +} + +=head2 feature2 + + Title : feature2 + Usage : $f = $featpair->feature2 + $featpair->feature2($feature) + Function: Get/set for the hit feature + Returns : Bio::SeqFeatureI + Args : Bio::SeqFeatureI + + +=cut + +sub feature2 { + my ($self,$arg) = @_; + + if ( defined($arg) || ! defined $self->{'feature2'}) { + $arg = new Bio::SeqFeature::Generic unless( defined $arg); + $self->throw("Argument [$arg] must be a Bio::SeqFeatureI") + unless (ref($arg) && $arg->isa("Bio::SeqFeatureI")); + $self->{'feature2'} = $arg; + } + return $self->{'feature2'}; +} + +=head2 start + + Title : start + Usage : $start = $featpair->start + $featpair->start(20) + Function: Get/set on the start coordinate of feature1 + Returns : integer + Args : [optional] beginning of feature + +=cut + +sub start { + my ($self,$value) = @_; + return $self->feature1->start($value); +} + +=head2 end + + Title : end + Usage : $end = $featpair->end + $featpair->end($end) + Function: get/set on the end coordinate of feature1 + Returns : integer + Args : [optional] ending point of feature + + +=cut + +sub end{ + my ($self,$value) = @_; + return $self->feature1->end($value); +} + +=head2 strand + + Title : strand + Usage : $strand = $feat->strand() + $feat->strand($strand) + Function: get/set on strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : [optional] strand information to set + + +=cut + +sub strand{ + my ($self,$arg) = @_; + return $self->feature1->strand($arg); +} + +=head2 location + + Title : location + Usage : $location = $featpair->location + $featpair->location($location) + Function: Get/set location object (using feature1) + Returns : Bio::LocationI object + Args : [optional] LocationI to store + +=cut + +sub location { + my ($self,$value) = @_; + return $self->feature1->location($value); +} + +=head2 score + + Title : score + Usage : $score = $feat->score() + $feat->score($score) + Function: get/set on score information + Returns : float + Args : none if get, the new value if set + + +=cut + +sub score { + my ($self,$arg) = @_; + return $self->feature1->score($arg); +} + +=head2 frame + + Title : frame + Usage : $frame = $feat->frame() + $feat->frame($frame) + Function: get/set on frame information + Returns : 0,1,2 + Args : none if get, the new value if set + + +=cut + +sub frame { + my ($self,$arg) = @_; + return $self->feature1->frame($arg); +} + +=head2 primary_tag + + Title : primary_tag + Usage : $ptag = $featpair->primary_tag + Function: get/set on the primary_tag of feature1 + Returns : 0,1,2 + Args : none if get, the new value if set + + +=cut + +sub primary_tag{ + my ($self,$arg) = @_; + return $self->feature1->primary_tag($arg); +} + +=head2 source_tag + + Title : source_tag + Usage : $tag = $feat->source_tag() + $feat->source_tag('genscan'); + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub source_tag{ + my ($self,$arg) = @_; + return $self->feature1->source_tag($arg); +} + +=head2 seqname + + Title : seqname + Usage : $obj->seq_id($newval) + Function: There are many cases when you make a feature that you + do know the sequence name, but do not know its actual + sequence. This is an attribute such that you can store + the seqname. + + This attribute should *not* be used in GFF dumping, as + that should come from the collection in which the seq + feature was found. + Returns : value of seqname + Args : newvalue (optional) + + +=cut + +sub seqname{ + my ($self,$arg) = @_; + return $self->feature1->seq_id($arg); +} + +=head2 hseqname + + Title : hseqname + Usage : $featpair->hseqname($newval) + Function: Get/set method for the name of + feature2. + Returns : value of $feature2->seq_id + Args : newvalue (optional) + + +=cut + +sub hseqname { + my ($self,$arg) = @_; + return $self->feature2->seq_id($arg); +} + + +=head2 hstart + + Title : hstart + Usage : $start = $featpair->hstart + $featpair->hstart(20) + Function: Get/set on the start coordinate of feature2 + Returns : integer + Args : none + +=cut + +sub hstart { + my ($self,$value) = @_; + return $self->feature2->start($value); +} + +=head2 hend + + Title : hend + Usage : $end = $featpair->hend + $featpair->hend($end) + Function: get/set on the end coordinate of feature2 + Returns : integer + Args : none + + +=cut + +sub hend{ + my ($self,$value) = @_; + return $self->feature2->end($value); +} + + +=head2 hstrand + + Title : hstrand + Usage : $strand = $feat->strand() + $feat->strand($strand) + Function: get/set on strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : none + + +=cut + +sub hstrand{ + my ($self,$arg) = @_; + return $self->feature2->strand($arg); +} + +=head2 hscore + + Title : hscore + Usage : $score = $feat->score() + $feat->score($score) + Function: get/set on score information + Returns : float + Args : none if get, the new value if set + + +=cut + +sub hscore { + my ($self,$arg) = @_; + return $self->feature2->score($arg); +} + +=head2 hframe + + Title : hframe + Usage : $frame = $feat->frame() + $feat->frame($frame) + Function: get/set on frame information + Returns : 0,1,2 + Args : none if get, the new value if set + + +=cut + +sub hframe { + my ($self,$arg) = @_; + return $self->feature2->frame($arg); +} + +=head2 hprimary_tag + + Title : hprimary_tag + Usage : $ptag = $featpair->hprimary_tag + Function: Get/set on the primary_tag of feature2 + Returns : 0,1,2 + Args : none if get, the new value if set + + +=cut + +sub hprimary_tag{ + my ($self,$arg) = @_; + return $self->feature2->primary_tag($arg); +} + +=head2 hsource_tag + + Title : hsource_tag + Usage : $tag = $feat->hsource_tag() + $feat->source_tag('genscan'); + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub hsource_tag{ + my ($self,$arg) = @_; + return $self->feature2->source_tag($arg); +} + +=head2 invert + + Title : invert + Usage : $tag = $feat->invert + Function: Swaps feature1 and feature2 around + Returns : Nothing + Args : none + + +=cut + +sub invert { + my ($self) = @_; + + my $tmp = $self->feature1; + + $self->feature1($self->feature2); + $self->feature2($tmp); + return undef; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/Exon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/Exon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,226 @@ +# $Id: Exon.pm,v 1.8 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::Exon +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::Exon - a feature representing an exon + +=head1 SYNOPSIS + + # obtain an exon instance $exon somehow + print "exon from ", $exon->start(), " to ", $exon->end(), + " on seq ", $exon->seq_id(), ", strand ", $exon->strand(), + ", encodes the peptide sequence ", + $exon->cds()->translate()->seq(), "\n"; + +=head1 DESCRIPTION + +This module implements a feature representing an exon by implementing the +Bio::SeqFeature::Gene::ExonI interface. + +Apart from that, this class also implements Bio::SeqFeatureI by inheriting +off Bio::SeqFeature::Generic. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::Exon; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::Generic; +use Bio::SeqFeature::Gene::ExonI; + +@ISA = qw(Bio::SeqFeature::Generic Bio::SeqFeature::Gene::ExonI); + +# +# A list of allowed exon types. See primary_tag(). +# +my @valid_exon_types = ('initial', 'internal', 'terminal'); + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + my ($primary) = + $self->_rearrange([qw(PRIMARY)],@args); + + $primary = 'exon' unless $primary; + $self->primary_tag($primary); # this will also set is_coding() + $self->strand(0) if(! defined($self->strand())); + return $self; +} + + +=head2 is_coding + + Title : is_coding + Usage : if($exon->is_coding()) { + # do something + } + if($is_utr) { + $exon->is_coding(0); + } + Function: Get/set whether or not the exon codes for amino acid. + Returns : TRUE if the object represents a feature translated into protein, + and FALSE otherwise. + Args : A boolean value on set. + + +=cut + +sub is_coding { + my ($self,$val) = @_; + + if(defined($val)) { + $self->{'_iscoding'} = $val; + } + return $self->{'_iscoding'}; +} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + $feat->primary_tag('exon') + Function: Get/set the primary tag for the exon feature. + + This method is overridden here in order to allow only for + tag values following a certain convention. For consistency reasons, + the tag value must either contain the string 'exon' or the string + 'utr' (both case-insensitive). In the case of 'exon', a string + describing the type of exon may be appended or prefixed. Presently, + the following types are allowed: initial, internal, and terminal + (all case-insensitive). + + If the supplied tag value matches 'utr' (case-insensitive), + is_coding() will automatically be set to FALSE, and to TRUE + otherwise. + + Returns : A string. + Args : A string on set. + + +=cut + +# sub primary_tag { +# my ($self,$value) = @_; + +# if(defined($value)) { +# if((lc($value) =~ /utr/i) || (lc($value) eq "exon") || +# ((lc($value) =~ /exon/i) && +# (grep { $value =~ /$_/i; } @valid_exon_types))) { +# $self->is_coding($value =~ /utr/i ? 0 : 1); +# } else { +# $self->throw("primary tag $value is invalid for object of class ". +# ref($self)); +# } +# } +# return $self->SUPER::primary_tag($value); +# } + +=head2 location + + Title : location + Usage : my $location = $exon->location() + Function: Returns a location object suitable for identifying the location + of the exon on the sequence or parent feature. + + This method is overridden here to restrict allowed location types + to non-compound locations. + + Returns : Bio::LocationI object + Args : none + + +=cut + +sub location { + my ($self,$value) = @_; + + if(defined($value) && $value->isa('Bio::Location::SplitLocationI')) { + $self->throw("split or compound location is not allowed ". + "for an object of type " . ref($self)); + } + return $self->SUPER::location($value); +} + +=head2 cds + + Title : cds() + Usage : $cds = $exon->cds(); + Function: Get the coding sequence of the exon as a sequence object. + + The sequence of the returned object is prefixed by Ns (lower case) + if the frame of the exon is defined and different from zero. The + result is that the first base starts a codon (frame 0). + + This implementation returns undef if the particular exon is + not translated to protein, i.e., is_coding() returns FALSE. Undef + will also be returned if no sequence is attached to this exon + feature. + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub cds { + my ($self) = @_; + + # UTR is not translated + return undef if(! $self->is_coding()); + + my $seq = $self->seq(); + if(defined($seq) && defined($self->frame()) && ($self->frame() != 0)) { + my $prefix = "n" x $self->frame(); + $seq->seq($prefix . $seq->seq()); + } + return $seq; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/ExonI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/ExonI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,117 @@ +# $Id: ExonI.pm,v 1.6 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::ExonI +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::ExonI - Interface for a feature representing an exon + +=head1 SYNOPSIS + +See documentation of methods. + +=head1 DESCRIPTION + +A feature representing an exon. An exon in this definition is +transcribed and at least for one particular transcript not spliced out +of the pre-mRNA. However, it does not necessarily code for amino acid. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::ExonI; +use vars qw(@ISA); +use strict; + +use Carp; +use Bio::SeqFeatureI; + +@ISA = qw(Bio::SeqFeatureI); + + +=head2 is_coding + + Title : is_coding + Usage : if($exon->is_coding()) { + # do something + } + Function: Whether or not the exon codes for amino acid. + Returns : TRUE if the object represents a feature translated into protein, + and FALSE otherwise. + Args : + + +=cut + +sub is_coding { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 cds + + Title : cds() + Usage : $cds = $exon->cds(); + Function: Get the coding sequence of the exon as a sequence object. + + The returned sequence object must be in frame 0, i.e., the first + base starts a codon. + + An implementation may return undef, indicating that a coding + sequence does not exist, e.g. for a UTR (untranslated region). + + Returns : A L<Bio::PrimarySeqI> implementing object. + Args : + + +=cut + +sub cds { + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/GeneStructure.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/GeneStructure.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,401 @@ +# $Id: GeneStructure.pm,v 1.14 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::GeneStructure +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::GeneStructure - A feature representing an arbitrarily + complex structure of a gene + +=head1 SYNOPSIS + +See documentation of methods. + +=head1 DESCRIPTION + +A feature representing a gene structure. As of now, a gene structure +really is only a collection of transcripts. See +Bio::SeqFeature::Gene::TranscriptI (interface) and +Bio::SeqFeature::Gene::Transcript (implementation) for the features of +such objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::GeneStructure; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::Generic; +use Bio::SeqFeature::Gene::GeneStructureI; + +@ISA = qw(Bio::SeqFeature::Generic Bio::SeqFeature::Gene::GeneStructureI); + + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + my ($primary) = + $self->_rearrange([qw(PRIMARY + )],@args); + + $primary = 'genestructure' unless $primary; + $self->primary_tag($primary); + $self->strand(0) if(! defined($self->strand())); + return $self; +} + +=head2 transcripts + + Title : transcripts + Usage : @transcripts = $gene->transcripts(); + Function: Get the transcripts of this gene structure. Many gene structures + will have only one transcript. + + Returns : An array of Bio::SeqFeature::Gene::TranscriptI implementing objects. + Args : + + +=cut + +sub transcripts { + my ($self) = @_; + + return () unless exists($self->{'_transcripts'}); + return @{$self->{'_transcripts'}}; +} + +=head2 add_transcript + + Title : add_transcript() + Usage : $gene->add_transcript($transcript); + Function: Add a transcript to this gene structure. + Returns : + Args : A Bio::SeqFeature::Gene::TranscriptI implementing object. + + +=cut + +sub add_transcript { + my ($self, $fea) = @_; + + if(!$fea || ! $fea->isa('Bio::SeqFeature::Gene::TranscriptI') ) { + $self->throw("$fea does not implement Bio::SeqFeature::Gene::TranscriptI"); + } + if(! exists($self->{'_transcripts'})) { + $self->{'_transcripts'} = []; + } + $self->_expand_region($fea); + $fea->parent($self); + push(@{$self->{'_transcripts'}}, $fea); +} + +=head2 flush_transcripts + + Title : flush_transcripts() + Usage : $gene->flush_transcripts(); + Function: Remove all transcripts from this gene structure. + Returns : + Args : + + +=cut + +sub flush_transcripts { + my ($self) = @_; + + if(exists($self->{'_transcripts'})) { + delete($self->{'_transcripts'}); + } +} + +=head2 add_transcript_as_features + + Title : add_transcript_as_features + Usage : $gene->add_transcript_as_features(@featurelist); + Function: take a list of Bio::SeqFeatureI objects and turn them into a + Bio::SeqFeature::Gene::Transcript object. Add that transcript to the gene. + Returns : nothing + Args : a list of Bio::SeqFeatureI compliant objects + + +=cut + +sub add_transcript_as_features{ + my ($self,@features) = @_; + my $transcript=Bio::SeqFeature::Gene::Transcript->new; + foreach my $fea (@features) { + + if ($fea->primary_tag =~ /utr/i) { #UTR / utr/ 3' utr / utr5 etc. + $transcript->add_utr($fea); + } elsif ($fea->primary_tag =~ /promot/i) { #allow for spelling differences + $transcript->add_promoter($fea); + } elsif ($fea->primary_tag =~ /poly.*A/i) { #polyA, POLY_A, etc. + $transcript->poly_A_site($fea); + } else { #assume the rest are exons + $transcript->add_exon($fea); + } + } + $self->add_transcript($transcript); + +} + + +=head2 promoters + + Title : promoters + Usage : @prom_sites = $gene->promoters(); + Function: Get the promoter features of this gene structure. + + This method basically merges the promoters returned by transcripts. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : An array of Bio::SeqFeatureI implementing objects. + Args : + + +=cut + +sub promoters { + my ($self) = @_; + my @transcripts = $self->transcripts(); + my @feas = (); + + foreach my $tr (@transcripts) { + push(@feas, $tr->promoters()); + } + return @feas; +} + + +=head2 exons + + Title : exons() + Usage : @exons = $gene->exons(); + @inital_exons = $gene->exons('Initial'); + Function: Get all exon features or all exons of a specified type of this gene + structure. + + Exon type is treated as a case-insensitive regular expression and + optional. For consistency, use only the following types: + initial, internal, terminal, utr, utr5prime, and utr3prime. + A special and virtual type is 'coding', which refers to all types + except utr. + + This method basically merges the exons returned by transcripts. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects. + Args : An optional string specifying the type of exon. + + +=cut + +sub exons { + my ($self, @args) = @_; + my @transcripts = $self->transcripts(); + my @feas = (); + + foreach my $tr (@transcripts) { + push(@feas, $tr->exons(@args)); + } + return @feas; +} + +=head2 introns + + Title : introns() + Usage : @introns = $gene->introns(); + Function: Get all introns of this gene structure. + + Note that this class currently generates these features on-the-fly, + that is, it simply treats all regions between exons as introns. + It assumes that the exons in the transcripts do not overlap. + + This method basically merges the introns returned by transcripts. + + Returns : An array of Bio::SeqFeatureI implementing objects. + Args : + + +=cut + +sub introns { + my ($self) = @_; + my @transcripts = $self->transcripts(); + my @feas = (); + + foreach my $tr (@transcripts) { + push(@feas, $tr->introns()); + } + return @feas; +} + +=head2 poly_A_sites + + Title : poly_A_sites() + Usage : @polyAsites = $gene->poly_A_sites(); + Function: Get the poly-adenylation sites of this gene structure. + + This method basically merges the poly-adenylation sites returned by + transcripts. + + Returns : An array of Bio::SeqFeatureI implementing objects. + Args : + + +=cut + +sub poly_A_sites { + my ($self) = @_; + my @transcripts = $self->transcripts(); + my @feas = (); + + foreach my $tr (@transcripts) { + push(@feas, $tr->poly_A_site()); + } + return @feas; +} + +=head2 utrs + + Title : utrs() + Usage : @utr_sites = $gene->utrs('3prime'); + @utr_sites = $gene->utrs('5prime'); + @utr_sites = $gene->utrs(); + Function: Get the features representing untranslated regions (UTR) of this + gene structure. + + You may provide an argument specifying the type of UTR. Currently + the following types are recognized: 5prime 3prime for UTR on the + 5' and 3' end of the CDS, respectively. + + This method basically merges the UTRs returned by transcripts. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects + representing the UTR regions or sites. + Args : Optionally, either 3prime, or 5prime for the the type of UTR + feature. + + +=cut + +sub utrs { + my ($self,@args) = @_; + my @transcripts = $self->transcripts(); + my @feas = (); + + foreach my $tr (@transcripts) { + push(@feas, $tr->utrs(@args)); + } + return @feas; +} + +=head2 sub_SeqFeature + + Title : sub_SeqFeature + Usage : @feats = $gene->sub_SeqFeature(); + Function: Returns an array of all subfeatures. + + This method is defined in Bio::SeqFeatureI. We override this here + to include the transcripts. + + Returns : An array Bio::SeqFeatureI implementing objects. + Args : none + + +=cut + +sub sub_SeqFeature { + my ($self) = @_; + my @feas = (); + + # get what the parent already has + @feas = $self->SUPER::sub_SeqFeature(); + push(@feas, $self->transcripts()); + return @feas; +} + +=head2 flush_sub_SeqFeature + + Title : flush_sub_SeqFeature + Usage : $gene->flush_sub_SeqFeature(); + $gene->flush_sub_SeqFeature(1); + Function: Removes all subfeatures. + + This method is overridden from Bio::SeqFeature::Generic to flush + all additional subfeatures, i.e., transcripts, which is + almost certainly not what you want. To remove only features added + through $gene->add_sub_SeqFeature($feature) pass any + argument evaluating to TRUE. + + Example : + Returns : none + Args : Optionally, an argument evaluating to TRUE will suppress flushing + of all gene structure-specific subfeatures (transcripts). + + +=cut + +sub flush_sub_SeqFeature { + my ($self,$fea_only) = @_; + + $self->SUPER::flush_sub_SeqFeature(); + if(! $fea_only) { + $self->flush_transcripts(); + } +} + +1; + + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/GeneStructureI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/GeneStructureI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,195 @@ +# $Id: GeneStructureI.pm,v 1.8 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::GeneStructureI +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::GeneStructure - A feature representing an arbitrarily + complex structure of a gene + +=head1 SYNOPSIS + + #documentaion needed + +=head1 DESCRIPTION + +A feature representing a gene structure. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::GeneStructureI; +use vars qw(@ISA); +use strict; + +use Carp; +use Bio::SeqFeatureI; + +@ISA = qw(Bio::SeqFeatureI); + +=head2 transcripts + + Title : transcripts() + Usage : @transcripts = $gene->transcripts(); + Function: Get the transcript features/sites of this gene structure. + + See Bio::SeqFeature::Gene::TranscriptI for properties of the + returned objects. + + Returns : An array of Bio::SeqFeature::Gene::TranscriptI implementing objects + representing the promoter regions or sites. + Args : + + +=cut + +sub transcripts { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 promoters + + Title : promoters() + Usage : @prom_sites = $gene->promoters(); + Function: Get the promoter features/sites of this gene structure. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : An array of Bio::SeqFeatureI implementing objects representing the + promoter regions or sites. + Args : + +=cut + +sub promoters { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 exons + + Title : exons() + Usage : @exons = $gene->exons(); + @inital = $gene->exons('Initial'); + Function: Get all exon features or all exons of specified type of this gene + structure. + + Refer to the documentation of the class that produced this gene + structure object for information about the possible types. + + See Bio::SeqFeature::Gene::ExonI for properties of the + returned objects. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects + representing the exon regions. + Args : An optional string specifying the type of the exon. + +=cut + +sub exons { + my ($self, $type) = @_; + $self->throw_not_implemented(); +} + +=head2 introns + + Title : introns() + Usage : @introns = $gene->introns(); + Function: Get all introns of this gene structure. + Returns : An array of Bio::SeqFeatureI implementing objects representing the + introns. + Args : + + +=cut + +sub introns { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 poly_A_sites + + Title : poly_A_sites() + Usage : @polyAsites = $gene->poly_A_sites(); + Function: Get the poly-adenylation features/sites of this gene structure. + Returns : An array of Bio::SeqFeatureI implementing objects representing the + poly-adenylation regions or sites. + Args : + + +=cut + +sub poly_A_sites { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 utrs + + Title : utrs() + Usage : @utr_sites = $gene->utrs(); + Function: Get the UTR features/sites of this gene structure. + + See Bio::SeqFeature::Gene::ExonI for properties of the + returned objects. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects + representing the UTR regions or sites. + Args : + + +=cut + +sub utrs { + my ($self) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/Intron.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/Intron.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,83 @@ +# $Id: Intron.pm,v 1.3 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::Intron +# +# Cared for by David Block <dblock@gene.pbi.nrc.ca> +# +# Copyright David Block +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::Intron - DESCRIPTION of Object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - David Block + +Email dblock@gene.pbi.nrc.ca + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::Intron; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SeqFeature::Gene::NC_Feature; + +@ISA = qw(Bio::SeqFeature::Gene::NC_Feature); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/NC_Feature.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/NC_Feature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,118 @@ +# $Id: NC_Feature.pm,v 1.5 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::NC_Feature.pm +# +# Cared for by David Block <dblock@gene.pbi.nrc.ca> +# +# Copyright David Block +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::NC_Feature.pm - DESCRIPTION of Object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - David Block + +Email dblock@gnf.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::SeqFeature::Gene::NC_Feature; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::SeqFeature::Generic); +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); +} + + + +=head2 is_coding + + Title : is_coding + Usage : if ($feature->is_coding()) { + #do something + } + Function: Whether or not the feature codes for amino acid. + Returns : FALSE + Args : none + + +=cut + +sub is_coding { + my ($self,@args) = @_; + return; +} + +=head2 cds + + Title : cds + Usage : $cds=$feature->cds(); + Function: get the coding sequence of this feature + Returns : undef + Args : none + + +=cut + +sub cds { + my ($self,@args) = @_; + return; + +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/Poly_A_site.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/Poly_A_site.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,83 @@ +# $Id: Poly_A_site.pm,v 1.4 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::Poly_A_site +# +# Cared for by David Block <dblock@gene.pbi.nrc.ca> +# +# Copyright David Block +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::poly_A_site - DESCRIPTION of Object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - David Block + +Email dblock@gene.pbi.nrc.ca + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::Poly_A_site; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SeqFeature::Gene::NC_Feature; + +@ISA = qw(Bio::SeqFeature::Gene::NC_Feature); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/Promoter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/Promoter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,84 @@ +# $Id: Promoter.pm,v 1.4 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::Promoter +# +# Cared for by David Block <dblock@gene.pbi.nrc.ca> +# +# Copyright David Block +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::Promoter - Describes a promotor + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - David Block + +Email dblock@gene.pbi.nrc.ca + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::Promoter; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SeqFeature::Gene::NC_Feature; + +@ISA = qw(Bio::SeqFeature::Gene::NC_Feature); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/Transcript.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/Transcript.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,799 @@ +# $Id: Transcript.pm,v 1.25 2002/12/29 09:37:51 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::Transcript +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::Transcript - A feature representing a transcript + +=head1 SYNOPSIS + +See documentation of methods. + +=head1 DESCRIPTION + +A feature representing a transcript. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::SeqFeature::Gene::Transcript; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqFeature::Gene::TranscriptI; +use Bio::SeqFeature::Generic; +use Bio::PrimarySeq; + +@ISA = qw(Bio::SeqFeature::Generic Bio::SeqFeature::Gene::TranscriptI); + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + my ($primary) = $self->_rearrange([qw(PRIMARY)],@args); + + $primary = 'transcript' unless $primary; + $self->primary_tag($primary); + $self->strand(0) if(! defined($self->strand())); + return $self; +} + + +=head2 promoters + + Title : promoters() + Usage : @proms = $transcript->promoters(); + Function: Get the promoter features/sites of this transcript. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : An array of Bio::SeqFeatureI implementing objects representing the + promoter regions or sites. + Args : + + +=cut + +sub promoters { + my ($self) = @_; + return $self->get_feature_type('Bio::SeqFeature::Gene::Promoter'); +} + +=head2 add_promoter + + Title : add_promoter() + Usage : $transcript->add_promoter($feature); + Function: Add a promoter feature/site to this transcript. + + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : + Args : A Bio::SeqFeatureI implementing object. + + +=cut + +sub add_promoter { + my ($self, $fea) = @_; + $self->_add($fea,'Bio::SeqFeature::Gene::Promoter'); +} + +=head2 flush_promoters + + Title : flush_promoters() + Usage : $transcript->flush_promoters(); + Function: Remove all promoter features/sites from this transcript. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : the removed features as a list + Args : none + + +=cut + +sub flush_promoters { + my ($self) = @_; + return $self->_flush('Bio::SeqFeature::Gene::Promoter'); +} + +=head2 exons + + Title : exons() + Usage : @exons = $gene->exons(); + ($inital_exon) = $gene->exons('Initial'); + Function: Get all exon features or all exons of specified type of this + transcript. + + Exon type is treated as a case-insensitive regular expression and + is optional. For consistency, use only the following types: + initial, internal, terminal. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects. + Args : An optional string specifying the primary_tag of the feature. + + +=cut + +sub exons { + my ($self, $type) = @_; + return $self->get_unordered_feature_type('Bio::SeqFeature::Gene::ExonI', + $type); +} + +=head2 exons_ordered + + Title : exons_ordered + Usage : @exons = $gene->exons_ordered(); + @exons = $gene->exons_ordered("Internal"); + Function: Get an ordered list of all exon features or all exons of specified + type of this transcript. + + Exon type is treated as a case-insensitive regular expression and + is optional. For consistency, use only the following types: + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects. + Args : An optional string specifying the primary_tag of the feature. + +=cut + +sub exons_ordered { + my ($self,$type) = @_; + return $self->get_feature_type('Bio::SeqFeature::Gene::ExonI', $type); +} + +=head2 add_exon + + Title : add_exon() + Usage : $transcript->add_exon($exon,'initial'); + Function: Add a exon feature to this transcript. + + The second argument denotes the type of exon. Mixing exons with and + without a type is likely to cause trouble in exons(). Either + leave out the type for all exons or for none. + + Presently, the following types are known: initial, internal, + terminal, utr, utr5prime, and utr3prime (all case-insensitive). + UTR should better be added through utrs()/add_utr(). + + If you wish to use other or additional types, you will almost + certainly have to call exon_type_sortorder() in order to replace + the default sort order, or mrna(), cds(), protein(), and exons() + may yield unexpected results. + + Returns : + Args : A Bio::SeqFeature::Gene::ExonI implementing object. + A string indicating the type of the exon (optional). + + +=cut + +sub add_exon { + my ($self, $fea) = @_; + if(! $fea->isa('Bio::SeqFeature::Gene::ExonI') ) { + $self->throw("$fea does not implement Bio::SeqFeature::Gene::ExonI"); + } + $self->_add($fea,'Bio::SeqFeature::Gene::Exon'); +} + +=head2 flush_exons + + Title : flush_exons() + Usage : $transcript->flush_exons(); + $transcript->flush_exons('terminal'); + Function: Remove all or a certain type of exon features from this transcript. + + See add_exon() for documentation about types. + + Calling without a type will not flush UTRs. Call flush_utrs() for + this purpose. + Returns : the deleted features as a list + Args : A string indicating the type of the exon (optional). + + +=cut + +sub flush_exons { + my ($self, $type) = @_; + return $self->_flush('Bio::SeqFeature::Gene::Exon',$type); +} + +=head2 introns + + Title : introns() + Usage : @introns = $gene->introns(); + Function: Get all intron features this gene structure. + + Note that this implementation generates these features + on-the-fly, that is, it simply treats all regions between + exons as introns, assuming that exons do not overlap. A + consequence is that a consistent correspondence between the + elements in the returned array and the array that exons() + returns will exist only if the exons are properly sorted + within their types (forward for plus- strand and reverse + for minus-strand transcripts). To ensure correctness the + elements in the array returned will always be sorted. + + Returns : An array of Bio::SeqFeature::Gene::Intron objects representing + the intron regions. + Args : + + +=cut + +sub introns { + my ($self) = @_; + my @introns = (); + my @exons = $self->exons(); + my ($strand, $rev_order); + + # if there's 1 or less exons we're done + return () unless($#exons > 0); + # record strand and order (a minus-strand transcript is likely to have + # the exons stacked in reverse order) + foreach my $exon (@exons) { + $strand = $exon->strand(); + last if $strand; # we're done if we've got 1 or -1 + } + $rev_order = ($exons[0]->end() < $exons[1]->start() ? 0 : 1); + + # Make sure exons are sorted. Because we assume they don't overlap, we + # simply sort by start position. + if((! defined($strand)) || ($strand != -1) || (! $rev_order)) { + # always sort forward for plus-strand transcripts, and for negative- + # strand transcripts that appear to be unsorted or forward sorted + @exons = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [ $_, $_->start()] } @exons; + } else { + # sort in reverse order for transcripts on the negative strand and + # found to be in reverse order + @exons = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->start()] } @exons; + } + # loop over all intervening gaps + for(my $i = 0; $i < $#exons; $i++) { + my ($start, $end); + my $intron; + + if(defined($exons[$i]->strand()) && + (($exons[$i]->strand() * $strand) < 0)) { + $self->throw("Transcript mixes plus and minus strand exons. ". + "Computing introns makes no sense then."); + } + $start = $exons[$i+$rev_order]->end() + 1; # $i or $i+1 + $end = $exons[$i+1-$rev_order]->start() - 1; # $i+1 or $i + $intron = Bio::SeqFeature::Gene::Intron->new( + '-start' => $start, + '-end' => $end, + '-strand' => $strand, + '-primary' => 'intron', + '-source' => ref($self)); + my $seq = $self->entire_seq(); + $intron->attach_seq($seq) if $seq; + $intron->seq_id($self->seq_id()); + push(@introns, $intron); + } + return @introns; +} + +=head2 poly_A_site + + Title : poly_A_site() + Usage : $polyAsite = $transcript->poly_A_site(); + Function: Get/set the poly-adenylation feature/site of this transcript. + Returns : A Bio::SeqFeatureI implementing object representing the + poly-adenylation region. + Args : A Bio::SeqFeatureI implementing object on set, or FALSE to flush + a previously set object. + + +=cut + +sub poly_A_site { + my ($self, $fea) = @_; + if ($fea) { + $self->_add($fea,'Bio::SeqFeature::Gene::Poly_A_site'); + } + return ($self->get_feature_type('Bio::SeqFeature::Gene::Poly_A_site'))[0]; +} + +=head2 utrs + + Title : utrs() + Usage : @utr_sites = $transcript->utrs('utr3prime'); + @utr_sites = $transcript->utrs('utr5prime'); + @utr_sites = $transcript->utrs(); + Function: Get the features representing untranslated regions (UTR) of this + transcript. + + You may provide an argument specifying the type of UTR. Currently + the following types are recognized: utr5prime utr3prime for UTR on the + 5' and 3' end of the CDS, respectively. + + Returns : An array of Bio::SeqFeature::Gene::UTR objects + representing the UTR regions or sites. + Args : Optionally, either utr3prime, or utr5prime for the the type of UTR + feature. + + +=cut + +sub utrs { + my ($self, $type) = @_; + return $self->get_feature_type('Bio::SeqFeature::Gene::UTR',$type); + +} + +=head2 add_utr + + Title : add_utr() + Usage : $transcript->add_utr($utrobj, 'utr3prime'); + $transcript->add_utr($utrobj); + Function: Add a UTR feature/site to this transcript. + + The second parameter is optional and denotes the type of the UTR + feature. Presently recognized types include 'utr5prime' and 'utr3prime' + for UTR on the 5' and 3' end of a gene, respectively. + + Calling this method is the same as calling + add_exon($utrobj, 'utr'.$type). In this sense a UTR object is a + special exon object, which is transcribed, not spliced out, but + not translated. + + Note that the object supplied should return FALSE for is_coding(). + Otherwise cds() and friends will become confused. + + Returns : + Args : A Bio::SeqFeature::Gene::UTR implementing object. + + +=cut + +sub add_utr { + my ($self, $fea, $type) = @_; + $self->_add($fea,'Bio::SeqFeature::Gene::UTR',$type); +} + +=head2 flush_utrs + + Title : flush_utrs() + Usage : $transcript->flush_utrs(); + $transcript->flush_utrs('utr3prime'); + Function: Remove all or a specific type of UTR features/sites from this + transcript. + + Cf. add_utr() for documentation about recognized types. + Returns : a list of the removed features + Args : Optionally a string denoting the type of UTR feature. + + +=cut + +sub flush_utrs { + my ($self, $type) = @_; + return $self->_flush('Bio::SeqFeature::Gene::UTR',$type); +} + +=head2 sub_SeqFeature + + Title : sub_SeqFeature + Usage : @feats = $transcript->sub_SeqFeature(); + Function: Returns an array of all subfeatures. + + This method is defined in Bio::SeqFeatureI. We override this here + to include the exon etc features. + + Returns : An array Bio::SeqFeatureI implementing objects. + Args : none + + +=cut + +sub sub_SeqFeature { + my ($self) = @_; + my @feas; + + # get what the parent already has + @feas = $self->SUPER::sub_SeqFeature(); + # add the features we have in addition + push(@feas, $self->exons()); # this includes UTR features + push(@feas, $self->promoters()); + push(@feas, $self->poly_A_site()) if($self->poly_A_site()); + return @feas; +} + +=head2 flush_sub_SeqFeature + + Title : flush_sub_SeqFeature + Usage : $transcript->flush_sub_SeqFeature(); + $transcript->flush_sub_SeqFeature(1); + Function: Removes all subfeatures. + + This method is overridden from Bio::SeqFeature::Generic to flush + all additional subfeatures like exons, promoters, etc., which is + almost certainly not what you want. To remove only features added + through $transcript->add_sub_SeqFeature($feature) pass any + argument evaluating to TRUE. + + Example : + Returns : none + Args : Optionally, an argument evaluating to TRUE will suppress flushing + of all transcript-specific subfeatures (exons etc.). + + +=cut + +sub flush_sub_SeqFeature { + my ($self,$fea_only) = @_; + + $self->SUPER::flush_sub_SeqFeature(); + if(! $fea_only) { + $self->flush_promoters(); + $self->flush_exons(); + $self->flush_utrs(); + $self->poly_A_site(0); + } +} + + +=head2 cds + + Title : cds + Usage : $seq = $transcript->cds(); + Function: Returns the CDS (coding sequence) as defined by the exons + of this transcript and the attached sequence. + + If no sequence is attached this method will return undef. + + Note that the implementation provided here returns a + concatenation of all coding exons, thereby assuming that + exons do not overlap. + + Note also that you cannot set the CDS via this method. Set + a single CDS feature as a single exon, or derive your own + class if you want to store a predicted CDS. + + Example : + Returns : A Bio::PrimarySeqI implementing object. + Args : + +=cut + +sub cds { + my ($self) = @_; + my @exons = $self->exons_ordered(); #this is always sorted properly according to strand + my $strand; + + return undef unless(@exons); + # record strand (a minus-strand transcript must have the exons sorted in + # reverse order) + foreach my $exon (@exons) { + if(defined($exon->strand()) && (! $strand)) { + $strand = $exon->strand(); + } + if($exon->strand() && (($exon->strand() * $strand) < 0)) { + $self->throw("Transcript mixes coding exons on plus and minus ". + "strand. This makes no sense."); + } + } + my $cds = $self->_make_cds(@exons); + return undef unless $cds; + return Bio::PrimarySeq->new('-id' => $self->seq_id(), + '-seq' => $cds, + '-alphabet' => "dna"); +} + +=head2 protein + + Title : protein() + Usage : $protein = $transcript->protein(); + Function: Get the protein encoded by the transcript as a sequence object. + + The implementation provided here simply calls translate() on the + object returned by cds(). + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub protein { + my ($self) = @_; + my $seq; + + $seq = $self->cds(); + return $seq->translate() if $seq; + return undef; +} + +=head2 mrna + + Title : mrna() + Usage : $mrna = $transcript->mrna(); + Function: Get the mRNA of the transcript as a sequence object. + + The difference to cds() is that the sequence object returned by + this methods will also include UTR and the poly-adenylation site, + but not promoter sequence (TBD). + + HL: do we really need this method? + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub mrna { + my ($self) = @_; + my ($seq, $mrna, $elem); + + # get the coding part + $seq = $self->cds(); + if(! $seq) { + $seq = Bio::PrimarySeq->new('-id' => $self->seq_id(), + '-alphabet' => "rna", + '-seq' => ""); + } + # get and add UTR sequences + $mrna = ""; + foreach $elem ($self->utrs('utr5prime')) { + $mrna .= $elem->seq()->seq(); + } + $seq->seq($mrna . $seq->seq()); + $mrna = ""; + foreach $elem ($self->utrs('utr3prime')) { + $mrna .= $elem->seq()->seq(); + } + $seq->seq($seq->seq() . $mrna); + if($self->poly_A_site()) { + $seq->seq($seq->seq() . $self->poly_A_site()->seq()->seq()); + } + return undef if($seq->length() == 0); + return $seq; +} + +sub _get_typed_keys { + my ($self, $keyprefix, $type) = @_; + my @keys = (); + my @feas = (); + + # make case-insensitive + $type = ($type ? lc($type) : ""); + # pull out all feature types that exist and match + @keys = grep { /^_$keyprefix$type/i; } (keys(%{$self})); + return @keys; +} + +sub _make_cds { + my ($self,@exons) = @_; + my $cds = ""; + + foreach my $exon (@exons) { + next if((! defined($exon->seq())) || (! $exon->is_coding())); + my $phase = length($cds) % 3; + # let's check the simple case + if((! defined($exon->frame())) || ($phase == $exon->frame())) { + # this one fits exactly, or frame of the exon is undefined (should + # we warn about that?); we bypass the $exon->cds() here (hmm, + # not very clean style, but I don't see where this screws up) + $cds .= $exon->seq()->seq(); + } else { + # this one is probably from exon shuffling and needs some work + my $seq = $exon->cds(); # now $seq is guaranteed to be in frame 0 + next if(! $seq); + $seq = $seq->seq(); + # adjustment needed? + if($phase > 0) { + # how many Ns can we chop off the piece to be added? + my $n_crop = 0; + if($seq =~ /^(n+)/i) { + $n_crop = length($1); + } + if($n_crop >= $phase) { + # chop off to match the phase + $seq = substr($seq, $phase); + } else { + # fill in Ns + $seq = ("n" x (3-$phase)) . $seq; + } + } + $cds .= $seq; + } + } + return $cds; +} + +=head2 features + + Title : features + Usage : my @features=$transcript->features; + Function: returns all the features associated with this transcript + Returns : a list of SeqFeatureI implementing objects + Args : none + + +=cut + + +sub features { + my ($self) = shift; + $self->{'_features'} = [] unless defined $self->{'_features'}; + return @{$self->{'_features'}}; +} + +=head2 features_ordered + + Title : features_ordered + Usage : my @features=$transcript->features_ordered; + Function: returns all the features associated with this transcript, + in order by feature start, according to strand + Returns : a list of SeqFeatureI implementing objects + Args : none + + +=cut + +sub features_ordered{ + my ($self) = @_; + return $self->_stranded_sort(@{$self->{'_features'}}); +} + + +sub get_unordered_feature_type{ + my ($self, $type, $pri)=@_; + my @list; + foreach ($self->features) { + if ($_->isa($type)) { + if ($pri && $_->primary_tag !~ /$pri/i) { + next; + } + push @list,$_; + } + } + return @list; + +} + +sub get_feature_type { + my ($self)=shift; + return $self->_stranded_sort($self->get_unordered_feature_type(@_)); +} + +#This was fixed by Gene Cutler - the indexing on the list being reversed +#fixed a bad bug. Thanks Gene! +sub _flush { + my ($self, $type, $pri)=@_; + my @list=$self->features; + my @cut; + for (reverse (0..$#list)) { + if ($list[$_]->isa($type)) { + if ($pri && $list[$_]->primary_tag !~ /$pri/i) { + next; + } + push @cut, splice @list, $_, 1; #remove the element of $type from @list + #and return each of them in @cut + } + } + $self->{'_features'}=\@list; + return reverse @cut; +} + +sub _add { + my ($self, $fea, $type)=@_; + require Bio::SeqFeature::Gene::Promoter; + require Bio::SeqFeature::Gene::UTR; + require Bio::SeqFeature::Gene::Exon; + require Bio::SeqFeature::Gene::Intron; + require Bio::SeqFeature::Gene::Poly_A_site; + + if(! $fea->isa('Bio::SeqFeatureI') ) { + $self->throw("$fea does not implement Bio::SeqFeatureI"); + } + if(! $fea->isa($type) ) { + $fea=$self->_new_of_type($fea,$type); + } + if (! $self->strand) { + $self->strand($fea->strand); + } else { + if ($self->strand * $fea->strand == -1) { + $self->throw("$fea is on opposite strand from $self"); + } + } + + $self->_expand_region($fea); + if(defined($self->entire_seq()) && (! defined($fea->entire_seq())) && + $fea->can('attach_seq')) { + $fea->attach_seq($self->entire_seq()); + } + if (defined $self->parent) { + $self->parent->_expand_region($fea); + } + push(@{$self->{'_features'}}, $fea); + 1; +} + +sub _stranded_sort { + my ($self,@list)=@_; + my $strand; + foreach my $fea (@list) { + if($fea->strand()) { + # defined and != 0 + $strand = $fea->strand() if(! $strand); + if(($fea->strand() * $strand) < 0) { + $strand = undef; + last; + } + } + } + if (defined $strand && $strand == - 1) { #reverse strand + return map { $_->[0] } sort {$b->[1] <=> $a->[1]} map { [$_, $_->start] } @list; + } else { #undef or forward strand + return map { $_->[0] } sort {$a->[1] <=> $b->[1]} map { [$_, $_->start] } @list; + } +} + +sub _new_of_type { + my ($self, $fea, $type, $pri)= @_; + my $primary; + if ($pri) { + $primary = $pri; #can set new primary tag if desired + } else { + ($primary) = $type =~ /.*::(.+)/; #or else primary is just end of type string + } + bless $fea,$type; + $fea->primary_tag($primary); + return $fea; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/TranscriptI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/TranscriptI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,253 @@ +# $Id: TranscriptI.pm,v 1.7 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::TranscriptI +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::TranscriptI - Interface for a feature representing a + transcript of exons, promoter(s), UTR, and a poly-adenylation site. + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +A feature representing a transcript. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::TranscriptI; +use vars qw(@ISA); +use strict; + +use Carp; +use Bio::SeqFeatureI; + +@ISA = qw(Bio::SeqFeatureI); + +=head2 promoters + + Title : promoters() + Usage : @proms = $transcript->promoters(); + Function: Get the promoter features of this transcript. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : An array of Bio::SeqFeatureI implementing objects representing the + promoter regions or sites. + Args : + +=cut + +sub promoters { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 exons + + Title : exons() + Usage : @exons = $transcript->exons(); + @inital = $transcript->exons('Initial'); + Function: Get the individual exons this transcript comprises of, or all exons + of a specified type. + + Refer to the documentation of the class that produced this + transcript object for information about the possible types. + + See Bio::SeqFeature::Gene::ExonI for properties of the + returned objects. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects + Args : An optional string specifying the type of the exon. + +=cut + +sub exons { + my ($self, $type) = @_; + $self->throw_not_implemented(); +} + +=head2 introns + + Title : introns() + Usage : @introns = $transcript->introns(); + Function: Get all introns this transcript comprises of. + Returns : An array of Bio::SeqFeatureI implementing objects representing the + introns. + Args : + + +=cut + +sub introns { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 poly_A_site + + Title : poly_A_site() + Usage : $polyAsite = $transcript->poly_A_site(); + Function: Get the poly-adenylation site of this transcript. + Returns : A Bio::SeqFeatureI implementing object. + Args : + + +=cut + +sub poly_A_site { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 utrs + + Title : utrs() + Usage : @utr_sites = $transcript->utrs(); + Function: Get the UTR regions this transcript comprises of. + + See Bio::SeqFeature::Gene::ExonI for properties of the + returned objects. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects + Args : + + +=cut + +sub utrs { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 mrna + + Title : mrna() + Usage : $mrna = $transcript->mrna(); + Function: Get the mRNA of the transcript as a sequence object. + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub mrna { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 cds + + Title : cds() + Usage : $cds = $transcript->cds(); + Function: Get the CDS (coding sequence) of the transcript as a sequence + object. + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub cds { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 protein + + Title : protein() + Usage : $protein = $transcript->protein(); + Function: Get the protein encoded by the transcript as a sequence object. + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub protein { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 parent + + Title : parent + Usage : $obj->parent($newval) + Function: get the parent gene of the transcript + Returns : value of parent - a Bio::SeqFeature::Gene::GeneStructureI-compliant object + Args : a Bio::SeqFeature::Gene::GeneStructureI-compliant object (optional) + + +=cut + +sub parent{ + my ($self,$value) = @_; + if( defined $value) { + if ($value->isa("Bio::SeqFeature::Gene::GeneStructureI")) { + $self->{'parent'} = $value; + } else { + $self->throw("$value must be a Bio::SeqFeature::Gene::GeneStructureI") + } + } + return $self->{'parent'}; +} + + +1; + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Gene/UTR.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/UTR.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,106 @@ +# $Id: UTR.pm,v 1.6 2002/10/22 07:45:20 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::UTR +# +# Cared for by David Block <dblock@gene.pbi.nrc.ca> +# +# Copyright David Block +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::UTR - A feature representing an untranslated region + that is part of a transcription unit + +=head1 SYNOPSIS + +See documentation of methods + +=head1 DESCRIPTION + +A UTR is a Bio::SeqFeature::Gene::ExonI compliant object that is +non-coding, and can be either 5' or 3' in a transcript. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - David Block + +Email dblock@gene.pbi.nrc.ca + +=head1 CONTRIBUTORS + +This is based on the Gene Structure scaffolding erected by Hilmar Lapp +(hlapp@gmx.net). + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Gene::UTR; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SeqFeature::Gene::NC_Feature; + +@ISA = qw(Bio::SeqFeature::Gene::NC_Feature); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + return $self; +} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + Function: Returns the primary tag for a feature, + eg 'utr5prime'. This method insures that 5prime/3prime information + is uniformly stored + Returns : a string + Args : none + +=cut + +sub primary_tag{ + my ($self,$val) = @_; + if( defined $val ) { + if ($val =~ /(3|5)/ ) { $val = "utr$1prime" } + else { $self->warn("tag should contain indication if this is 3 or 5 prime. Preferred text is 'utr3prime' or 'utr5prime'. Using user text of '$val'");} + } + $self->SUPER::primary_tag($val); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Generic.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Generic.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1031 @@ +# $Id: Generic.pm,v 1.74.2.1 2003/09/09 20:12:37 lstein Exp $ +# +# BioPerl module for Bio::SeqFeature::Generic +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Generic - Generic SeqFeature + +=head1 SYNOPSIS + + $feat = new Bio::SeqFeature::Generic ( -start => 10, -end => 100, + -strand => -1, -primary => 'repeat', + -source_tag => 'repeatmasker', + -score => 1000, + -tag => { + new => 1, + author => 'someone', + sillytag => 'this is silly!' } ); + + $feat = new Bio::SeqFeature::Generic ( -gff_string => $string ); + # if you want explicitly GFF1 + $feat = new Bio::SeqFeature::Generic ( -gff1_string => $string ); + + # add it to an annotated sequence + + $annseq->add_SeqFeature($feat); + + + +=head1 DESCRIPTION + +Bio::SeqFeature::Generic is a generic implementation for the +Bio::SeqFeatureI interface, providing a simple object to provide all +the information for a feature on a sequence. + +For many Features, this is all you will need to use (for example, this +is fine for Repeats in DNA sequence or Domains in protein +sequence). For other features, which have more structure, this is a +good base class to extend using inheritence to have new things: this +is what is done in the Bio::SeqFeature::Gene, +Bio::SeqFeature::Transcript and Bio::SeqFeature::Exon, which provide +well coordinated classes to represent genes on DNA sequence (for +example, you can get the protein sequence out from a transcript +class). + +For many Features, you want to add some piece of information, for +example a common one is that this feature is 'new' whereas other +features are 'old'. The tag system, which here is implemented using a +hash can be used here. You can use the tag system to extend the +SeqFeature::Generic programmatically: that is, you know that you have +read in more information into the tag 'mytag' which you can then +retrieve. This means you do not need to know how to write inherieted +Perl to provide more complex information on a feature, and/or, if you +do know but you do not want to write a new class every time you need +some extra piece of information, you can use the tag system to easily +store and then retrieve information. + +The tag system can be written in/out of GFF format, and also into EMBL +format via the SeqIO system + +=head1 Implemented Interfaces + +This class implementes the following interfaces. + +=over 4 + +=item Bio::SeqFeatureI + +Note that this includes implementing Bio::RangeI. + +=item Bio::AnnotatableI + +=item Bio::FeatureHolderI + +Features held by a feature are essentially sub-features. + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Ewan Birney E<lt>birney@sanger.ac.ukE<gt> + +=head1 DEVELOPERS + +This class has been written with an eye out of inheritence. The fields +the actual object hash are: + + _gsf_tag_hash = reference to a hash for the tags + _gsf_sub_array = reference to an array for subfeatures + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Generic; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeatureI; +use Bio::AnnotatableI; +use Bio::FeatureHolderI; +use Bio::Annotation::Collection; +use Bio::Location::Simple; +use Bio::Tools::GFF; +#use Tie::IxHash; + +@ISA = qw(Bio::Root::Root Bio::SeqFeatureI + Bio::AnnotatableI Bio::FeatureHolderI); + +sub new { + my ( $caller, @args) = @_; + my ($self) = $caller->SUPER::new(@args); + + $self->{'_parse_h'} = {}; + $self->{'_gsf_tag_hash'} = {}; +# tie %{$self->{'_gsf_tag_hash'}}, "Tie::IxHash"; + + # bulk-set attributes + $self->set_attributes(@args); + + # done - we hope + return $self; +} + + +=head2 set_attributes + + Title : set_attributes + Usage : + Function: Sets a whole array of parameters at once. + Example : + Returns : none + Args : Named parameters, in the form as they would otherwise be passed + to new(). Currently recognized are: + + -start start position + -end end position + -strand strand + -primary primary tag + -source source tag + -frame frame + -score score value + -tag a reference to a tag/value hash + -gff_string GFF v.2 string to initialize from + -gff1_string GFF v.1 string to initialize from + -seq_id the display name of the sequence + -annotation the AnnotationCollectionI object + -location the LocationI object + +=cut + +sub set_attributes { + my ($self,@args) = @_; + my ($start, $end, $strand, $primary_tag, $source_tag, $primary, $source, $frame, + $score, $tag, $gff_string, $gff1_string, + $seqname, $seqid, $annot, $location,$display_name) = + $self->_rearrange([qw(START + END + STRAND + PRIMARY_TAG + SOURCE_TAG + PRIMARY + SOURCE + FRAME + SCORE + TAG + GFF_STRING + GFF1_STRING + SEQNAME + SEQ_ID + ANNOTATION + LOCATION + DISPLAY_NAME + )], @args); + $location && $self->location($location); + $gff_string && $self->_from_gff_string($gff_string); + $gff1_string && do { + $self->gff_format(Bio::Tools::GFF->new('-gff_version' => 1)); + $self->_from_gff_stream($gff1_string); + }; + $primary_tag && $self->primary_tag($primary_tag); + $source_tag && $self->source_tag($source_tag); + $primary && $self->primary_tag($primary); + $source && $self->source_tag($source); + defined $start && $self->start($start); + defined $end && $self->end($end); + defined $strand && $self->strand($strand); + defined $frame && $self->frame($frame); + $score && $self->score($score); + $annot && $self->annotation($annot); + defined $display_name && $self->display_name($display_name); + if($seqname) { + $self->warn("-seqname is deprecated. Please use -seq_id instead."); + $seqid = $seqname unless $seqid; + } + $seqid && $self->seq_id($seqid); + $tag && do { + foreach my $t ( keys %$tag ) { + $self->add_tag_value($t,$tag->{$t}); + } + }; +} + + +=head2 direct_new + + Title : direct_new + Usage : my $obj = Bio::SeqFeature::Generic->direct_new + Function: create a blessed hash - for performance improvement in + object creation + Returns : Bio::SeqFeature::Generic object + Args : none + + +=cut + +sub direct_new { + my ( $class) = @_; + my ($self) = {}; + + bless $self,$class; + + return $self; +} + +=head2 location + + Title : location + Usage : my $location = $seqfeature->location() + Function: returns a location object suitable for identifying location + of feature on sequence or parent feature + Returns : Bio::LocationI object + Args : [optional] Bio::LocationI object to set the value to. + + +=cut + +sub location { + my($self, $value ) = @_; + + if (defined($value)) { + unless (ref($value) and $value->isa('Bio::LocationI')) { + $self->throw("object $value pretends to be a location but ". + "does not implement Bio::LocationI"); + } + $self->{'_location'} = $value; + } + elsif (! $self->{'_location'}) { + # guarantees a real location object is returned every time + $self->{'_location'} = Bio::Location::Simple->new(); + } + return $self->{'_location'}; +} + + +=head2 start + + Title : start + Usage : $start = $feat->start + $feat->start(20) + Function: Get/set on the start coordinate of the feature + Returns : integer + Args : none + + +=cut + +sub start { + my ($self,$value) = @_; + return $self->location->start($value); +} + +=head2 end + + Title : end + Usage : $end = $feat->end + $feat->end($end) + Function: get/set on the end coordinate of the feature + Returns : integer + Args : none + + +=cut + +sub end { + my ($self,$value) = @_; + return $self->location->end($value); +} + +=head2 length + + Title : length + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub length { + my ($self) = @_; + return $self->end - $self->start() + 1; +} + +=head2 strand + + Title : strand + Usage : $strand = $feat->strand() + $feat->strand($strand) + Function: get/set on strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : none + + +=cut + +sub strand { + my ($self,$value) = @_; + return $self->location->strand($value); +} + +=head2 score + + Title : score + Usage : $score = $feat->score() + $feat->score($score) + Function: get/set on score information + Returns : float + Args : none if get, the new value if set + + +=cut + +sub score { + my ($self,$value) = @_; + + if (defined($value)) { + if ( $value !~ /^[+-]?\d+\.?\d*(e-\d+)?/ ) { + $self->throw("'$value' is not a valid score"); + } + $self->{'_gsf_score'} = $value; + } + + return $self->{'_gsf_score'}; +} + +=head2 frame + + Title : frame + Usage : $frame = $feat->frame() + $feat->frame($frame) + Function: get/set on frame information + Returns : 0,1,2, '.' + Args : none if get, the new value if set + + +=cut + +sub frame { + my ($self,$value) = @_; + + if ( defined $value ) { + if ( $value !~ /^[0-2.]$/ ) { + $self->throw("'$value' is not a valid frame"); + } + if( $value eq '.' ) { $value = '.'; } + $self->{'_gsf_frame'} = $value; + } + return $self->{'_gsf_frame'}; +} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + $feat->primary_tag('exon') + Function: get/set on the primary tag for a feature, + eg 'exon' + Returns : a string + Args : none + + +=cut + +sub primary_tag { + my ($self,$value) = @_; + if ( defined $value ) { + $self->{'_primary_tag'} = $value; + } + return $self->{'_primary_tag'}; +} + +=head2 source_tag + + Title : source_tag + Usage : $tag = $feat->source_tag() + $feat->source_tag('genscan'); + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub source_tag { + my ($self,$value) = @_; + + if( defined $value ) { + $self->{'_source_tag'} = $value; + } + return $self->{'_source_tag'}; +} + +=head2 has_tag + + Title : has_tag + Usage : $value = $self->has_tag('some_tag') + Function: Tests wether a feature contaings a tag + Returns : TRUE if the SeqFeature has the tag, + and FALSE otherwise. + Args : The name of a tag + + +=cut + +sub has_tag { + my ($self, $tag) = @_; + return exists $self->{'_gsf_tag_hash'}->{$tag}; +} + +=head2 add_tag_value + + Title : add_tag_value + Usage : $self->add_tag_value('note',"this is a note"); + Returns : TRUE on success + Args : tag (string) and value (any scalar) + + +=cut + +sub add_tag_value { + my ($self, $tag, $value) = @_; + $self->{'_gsf_tag_hash'}->{$tag} ||= []; + push(@{$self->{'_gsf_tag_hash'}->{$tag}},$value); +} + + +=head2 get_tag_values + + Title : get_tag_values + Usage : @values = $gsf->get_tag_values('note'); + Function: Returns a list of all the values stored + under a particular tag. + Returns : A list of scalars + Args : The name of the tag + + +=cut + +sub get_tag_values { + my ($self, $tag) = @_; + + if( ! defined $tag ) { return (); } + if ( ! exists $self->{'_gsf_tag_hash'}->{$tag} ) { + $self->throw("asking for tag value that does not exist $tag"); + } + return @{$self->{'_gsf_tag_hash'}->{$tag}}; +} + + +=head2 get_all_tags + + Title : get_all_tags + Usage : @tags = $feat->get_all_tags() + Function: Get a list of all the tags in a feature + Returns : An array of tag names + Args : none + + +=cut + +sub get_all_tags { + my ($self, @args) = @_; + return keys %{ $self->{'_gsf_tag_hash'}}; +} + +=head2 remove_tag + + Title : remove_tag + Usage : $feat->remove_tag('some_tag') + Function: removes a tag from this feature + Returns : the array of values for this tag before removing it + Args : tag (string) + + +=cut + +sub remove_tag { + my ($self, $tag) = @_; + + if ( ! exists $self->{'_gsf_tag_hash'}->{$tag} ) { + $self->throw("trying to remove a tag that does not exist: $tag"); + } + my @vals = @{$self->{'_gsf_tag_hash'}->{$tag}}; + delete $self->{'_gsf_tag_hash'}->{$tag}; + return @vals; +} + +=head2 attach_seq + + Title : attach_seq + Usage : $sf->attach_seq($seq) + Function: Attaches a Bio::Seq object to this feature. This + Bio::Seq object is for the *entire* sequence: ie + from 1 to 10000 + Example : + Returns : TRUE on success + Args : a Bio::PrimarySeqI compliant object + + +=cut + +sub attach_seq { + my ($self, $seq) = @_; + + if ( ! ($seq && ref($seq) && $seq->isa("Bio::PrimarySeqI")) ) { + $self->throw("Must attach Bio::PrimarySeqI objects to SeqFeatures"); + } + + $self->{'_gsf_seq'} = $seq; + + # attach to sub features if they want it + foreach ( $self->sub_SeqFeature() ) { + $_->attach_seq($seq); + } + + return 1; +} + +=head2 seq + + Title : seq + Usage : $tseq = $sf->seq() + Function: returns the truncated sequence (if there) for this + Example : + Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence + bounded by start & end, or undef if there is no sequence attached + Args : none + + +=cut + +sub seq { + my ($self, $arg) = @_; + + if ( defined $arg ) { + $self->throw("Calling SeqFeature::Generic->seq with an argument. You probably want attach_seq"); + } + + if ( ! exists $self->{'_gsf_seq'} ) { + return undef; + } + + # assumming our seq object is sensible, it should not have to yank + # the entire sequence out here. + + my $seq = $self->{'_gsf_seq'}->trunc($self->start(), $self->end()); + + + if ( $self->strand == -1 ) { + + # ok. this does not work well (?) + #print STDERR "Before revcom", $seq->str, "\n"; + $seq = $seq->revcom; + #print STDERR "After revcom", $seq->str, "\n"; + } + + return $seq; +} + +=head2 entire_seq + + Title : entire_seq + Usage : $whole_seq = $sf->entire_seq() + Function: gives the entire sequence that this seqfeature is attached to + Example : + Returns : a Bio::PrimarySeqI compliant object, or undef if there is no + sequence attached + Args : + + +=cut + +sub entire_seq { + my ($self) = @_; + + return $self->{'_gsf_seq'}; +} + + +=head2 seq_id + + Title : seq_id + Usage : $obj->seq_id($newval) + Function: There are many cases when you make a feature that you + do know the sequence name, but do not know its actual + sequence. This is an attribute such that you can store + the ID (e.g., display_id) of the sequence. + + This attribute should *not* be used in GFF dumping, as + that should come from the collection in which the seq + feature was found. + Returns : value of seq_id + Args : newvalue (optional) + + +=cut + +sub seq_id { + my ($obj,$value) = @_; + if ( defined $value ) { + $obj->{'_gsf_seq_id'} = $value; + } + return $obj->{'_gsf_seq_id'}; +} + +=head2 display_name + + Title : display_name + Usage : $featname = $obj->display_name + Function: Implements the display_name() method, which is a human-readable + name for the feature. + Returns : value of display_name (a string) + Args : Optionally, on set the new value or undef + +=cut + +sub display_name{ + my $self = shift; + + return $self->{'display_name'} = shift if @_; + return $self->{'display_name'}; +} + +=head1 Methods for implementing Bio::AnnotatableI + +=cut + +=head2 annotation + + Title : annotation + Usage : $obj->annotation($annot_obj) + Function: Get/set the annotation collection object for annotating this + feature. + + Example : + Returns : A Bio::AnnotationCollectionI object + Args : newvalue (optional) + + +=cut + +sub annotation { + my ($obj,$value) = @_; + + # we are smart if someone references the object and there hasn't been + # one set yet + if(defined $value || ! defined $obj->{'annotation'} ) { + $value = new Bio::Annotation::Collection unless ( defined $value ); + $obj->{'annotation'} = $value; + } + return $obj->{'annotation'}; +} + +=head1 Methods to implement Bio::FeatureHolderI + +This includes methods for retrieving, adding, and removing +features. Since this is already a feature, features held by this +feature holder are essentially sub-features. + +=cut + +=head2 get_SeqFeatures + + Title : get_SeqFeatures + Usage : @feats = $feat->get_SeqFeatures(); + Function: Returns an array of sub Sequence Features + Returns : An array + Args : none + + +=cut + +sub get_SeqFeatures { + my ($self) = @_; + + if ($self->{'_gsf_sub_array'}) { + return @{$self->{'_gsf_sub_array'}}; + } else { + return; + } +} + +=head2 add_SeqFeature + + Title : add_SeqFeature + Usage : $feat->add_SeqFeature($subfeat); + $feat->add_SeqFeature($subfeat,'EXPAND') + Function: adds a SeqFeature into the subSeqFeature array. + with no 'EXPAND' qualifer, subfeat will be tested + as to whether it lies inside the parent, and throw + an exception if not. + + If EXPAND is used, the parent's start/end/strand will + be adjusted so that it grows to accommodate the new + subFeature + Returns : nothing + Args : An object which has the SeqFeatureI interface + + +=cut + +#' +sub add_SeqFeature{ + my ($self,$feat,$expand) = @_; + + if ( !$feat->isa('Bio::SeqFeatureI') ) { + $self->warn("$feat does not implement Bio::SeqFeatureI. Will add it anyway, but beware..."); + } + + if($expand && ($expand eq 'EXPAND')) { + $self->_expand_region($feat); + } else { + if ( !$self->contains($feat) ) { + $self->throw("$feat is not contained within parent feature, and expansion is not valid"); + } + } + + $self->{'_gsf_sub_array'} = [] unless exists($self->{'_gsf_sub_array'}); + push(@{$self->{'_gsf_sub_array'}},$feat); + +} + +=head2 remove_SeqFeatures + + Title : remove_SeqFeatures + Usage : $sf->remove_SeqFeatures + Function: Removes all sub SeqFeatures + + If you want to remove only a subset, remove that subset from the + returned array, and add back the rest. + + Example : + Returns : The array of Bio::SeqFeatureI implementing sub-features that was + deleted from this feature. + Args : none + + +=cut + +sub remove_SeqFeatures { + my ($self) = @_; + + my @subfeats = @{$self->{'_gsf_sub_array'}}; + $self->{'_gsf_sub_array'} = []; # zap the array implicitly. + return @subfeats; +} + +=head1 GFF-related methods + +=cut + +=head2 gff_format + + Title : gff_format + Usage : # get: + $gffio = $feature->gff_format(); + # set (change the default version of GFF2): + $feature->gff_format(Bio::Tools::GFF->new(-gff_version => 1)); + Function: Get/set the GFF format interpreter. This object is supposed to + format and parse GFF. See Bio::Tools::GFF for the interface. + + If this method is called as class method, the default for all + newly created instances will be changed. Otherwise only this + instance will be affected. + Example : + Returns : a Bio::Tools::GFF compliant object + Args : On set, an instance of Bio::Tools::GFF or a derived object. + + +=cut + +sub gff_format { + my ($self, $gffio) = @_; + + if(defined($gffio)) { + if(ref($self)) { + $self->{'_gffio'} = $gffio; + } else { + $Bio::SeqFeatureI::static_gff_formatter = $gffio; + } + } + return (ref($self) && exists($self->{'_gffio'}) ? + $self->{'_gffio'} : $self->_static_gff_formatter); +} + +=head2 gff_string + + Title : gff_string + Usage : $str = $feat->gff_string; + $str = $feat->gff_string($gff_formatter); + Function: Provides the feature information in GFF format. + + We override this here from Bio::SeqFeatureI in order to use the + formatter returned by gff_format(). + + Returns : A string + Args : Optionally, an object implementing gff_string(). + + +=cut + +sub gff_string{ + my ($self,$formatter) = @_; + + $formatter = $self->gff_format() unless $formatter; + return $formatter->gff_string($self); +} + +# =head2 slurp_gff_file +# +# Title : slurp_file +# Usage : @features = Bio::SeqFeature::Generic::slurp_gff_file(\*FILE); +# Function: Sneaky function to load an entire file as in memory objects. +# Beware of big files. +# +# This method is deprecated. Use Bio::Tools::GFF instead, which can +# also handle large files. +# +# Example : +# Returns : +# Args : +# +# =cut + +sub slurp_gff_file { + my ($f) = @_; + my @out; + if ( !defined $f ) { + die "Must have a filehandle"; + } + + Bio::Root::Root->warn("deprecated method slurp_gff_file() called in Bio::SeqFeature::Generic. Use Bio::Tools::GFF instead."); + + while(<$f>) { + + my $sf = Bio::SeqFeature::Generic->new('-gff_string' => $_); + push(@out, $sf); + } + + return @out; + +} + +=head2 _from_gff_string + + Title : _from_gff_string + Usage : + Function: Set feature properties from GFF string. + + This method uses the object returned by gff_format() for the + actual interpretation of the string. Set a different GFF format + interpreter first if you need a specific version, like GFF1. (The + default is GFF2.) + Example : + Returns : + Args : a GFF-formatted string + + +=cut + +sub _from_gff_string { + my ($self, $string) = @_; + + $self->gff_format()->from_gff_string($self, $string); +} + + +=head2 _expand_region + + Title : _expand_region + Usage : $self->_expand_region($feature); + Function: Expand the total region covered by this feature to + accomodate for the given feature. + + May be called whenever any kind of subfeature is added to this + feature. add_sub_SeqFeature() already does this. + Returns : + Args : A Bio::SeqFeatureI implementing object. + + +=cut + +sub _expand_region { + my ($self, $feat) = @_; + if(! $feat->isa('Bio::SeqFeatureI')) { + $self->warn("$feat does not implement Bio::SeqFeatureI"); + } + # if this doesn't have start/end set - forget it! + if((! defined($self->start())) && (! defined $self->end())) { + $self->start($feat->start()); + $self->end($feat->end()); + $self->strand($feat->strand) unless defined($self->strand()); + } else { + my $range = $self->union($feat); + $self->start($range->start); + $self->end($range->end); + $self->strand($range->strand); + } +} + +=head2 _parse + + Title : _parse + Usage : + Function: Parsing hints + Example : + Returns : + Args : + + +=cut + +sub _parse { + my ($self) = @_; + + return $self->{'_parse_h'}; +} + +=head2 _tag_value + + Title : _tag_value + Usage : + Function: For internal use only. Convenience method for those tags that + may only have a single value. + Returns : + Args : + + +=cut + +sub _tag_value { + my ($self, $tag, $value) = @_; + + if(defined($value) || (! $self->has_tag($tag))) { + $self->remove_tag($tag) if($self->has_tag($tag)); + $self->add_tag_value($tag, $value); + } + return ($self->each_tag_value($tag))[0]; +} + +####################################################################### +# aliases for methods that changed their names in an attempt to make # +# bioperl names more consistent # +####################################################################### + +sub seqname { + my $self = shift; + $self->warn("SeqFeatureI::seqname() is deprecated. Please use seq_id() instead."); + return $self->seq_id(@_); +} + +sub display_id { + my $self = shift; + $self->warn("SeqFeatureI::display_id() is deprecated. Please use display_name() instead."); + return $self->display_name(@_); +} + +# this is towards consistent naming +sub each_tag_value { return shift->get_tag_values(@_); } +sub all_tags { return shift->get_all_tags(@_); } + +# we revamped the feature containing property to implementing +# Bio::FeatureHolderI +*sub_SeqFeature = \&get_SeqFeatures; +*add_sub_SeqFeature = \&add_SeqFeature; +*flush_sub_SeqFeatures = \&remove_SeqFeatures; +# this one is because of inconsistent naming ... +*flush_sub_SeqFeature = \&remove_SeqFeatures; + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/PositionProxy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/PositionProxy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,449 @@ +# $Id: PositionProxy.pm,v 1.4 2002/10/22 07:38:41 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::PositionProxy +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::PositionProxy - handle features when truncation/revcom sequences span a feature + +=head1 SYNOPSIS + + $proxy = new Bio::SeqFeature::PositionProxy ( -loc => $loc, + -parent => $basefeature); + + $seq->add_SeqFeature($feat); + + + +=head1 DESCRIPTION + +PositionProxy is a Proxy Sequence Feature to handle truncation +and revcomp without duplicating all the data within the sequence features. +It holds a new location for a sequence feature and the original feature +it came from to provide the additional annotation information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Ewan Birney E<lt>birney@sanger.ac.ukE<gt> + +=head1 DEVELOPERS + +This class has been written with an eye out of inheritence. The fields +the actual object hash are: + + _gsf_tag_hash = reference to a hash for the tags + _gsf_sub_array = reference to an array for sub arrays + _gsf_start = scalar of the start point + _gsf_end = scalar of the end point + _gsf_strand = scalar of the strand + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::PositionProxy; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeatureI; +use Bio::Tools::GFF; + + +@ISA = qw(Bio::Root::Root Bio::SeqFeatureI); + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + + my ($feature,$location) = $self->_rearrange([qw(PARENT LOC)],@args); + + if( !defined $feature || !ref $feature || !$feature->isa('Bio::SeqFeatureI') ) { + $self->throw("Must have a parent feature, not a [$feature]"); + } + + if( $feature->isa("Bio::SeqFeature::PositionProxy") ) { + $feature = $feature->parent(); + } + + if( !defined $location || !ref $location || !$location->isa('Bio::LocationI') ) { + $self->throw("Must have a location, not a [$location]"); + } + + + return $self; +} + + +=head2 location + + Title : location + Usage : my $location = $seqfeature->location() + Function: returns a location object suitable for identifying location + of feature on sequence or parent feature + Returns : Bio::LocationI object + Args : none + + +=cut + +sub location { + my($self, $value ) = @_; + + if (defined($value)) { + unless (ref($value) and $value->isa('Bio::LocationI')) { + $self->throw("object $value pretends to be a location but ". + "does not implement Bio::LocationI"); + } + $self->{'_location'} = $value; + } + elsif (! $self->{'_location'}) { + # guarantees a real location object is returned every time + $self->{'_location'} = Bio::Location::Simple->new(); + } + return $self->{'_location'}; +} + + +=head2 parent + + Title : parent + Usage : my $sf = $proxy->parent() + Function: returns the seqfeature parent of this proxy + Returns : Bio::SeqFeatureI object + Args : none + + +=cut + +sub parent { + my($self, $value ) = @_; + + if (defined($value)) { + unless (ref($value) and $value->isa('Bio::SeqFeatureI')) { + $self->throw("object $value pretends to be a location but ". + "does not implement Bio::SeqFeatureI"); + } + $self->{'_parent'} = $value; + } + + return $self->{'_parent'}; +} + + + +=head2 start + + Title : start + Usage : $start = $feat->start + $feat->start(20) + Function: Get + Returns : integer + Args : none + + +=cut + +sub start { + my ($self,$value) = @_; + return $self->location->start($value); +} + +=head2 end + + Title : end + Usage : $end = $feat->end + $feat->end($end) + Function: get + Returns : integer + Args : none + + +=cut + +sub end { + my ($self,$value) = @_; + return $self->location->end($value); +} + +=head2 length + + Title : length + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub length { + my ($self) = @_; + return $self->end - $self->start() + 1; +} + +=head2 strand + + Title : strand + Usage : $strand = $feat->strand() + $feat->strand($strand) + Function: get/set on strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : none + + +=cut + +sub strand { + my ($self,$value) = @_; + return $self->location->strand($value); +} + + +=head2 attach_seq + + Title : attach_seq + Usage : $sf->attach_seq($seq) + Function: Attaches a Bio::Seq object to this feature. This + Bio::Seq object is for the *entire* sequence: ie + from 1 to 10000 + Example : + Returns : TRUE on success + Args : + + +=cut + +sub attach_seq { + my ($self, $seq) = @_; + + if ( !defined $seq || !ref $seq || ! $seq->isa("Bio::PrimarySeqI") ) { + $self->throw("Must attach Bio::PrimarySeqI objects to SeqFeatures"); + } + + $self->{'_gsf_seq'} = $seq; + + # attach to sub features if they want it + + foreach my $sf ( $self->sub_SeqFeature() ) { + if ( $sf->can("attach_seq") ) { + $sf->attach_seq($seq); + } + } + return 1; +} + +=head2 seq + + Title : seq + Usage : $tseq = $sf->seq() + Function: returns the truncated sequence (if there) for this + Example : + Returns : sub seq on attached sequence bounded by start & end + Args : none + + +=cut + +sub seq { + my ($self, $arg) = @_; + + if ( defined $arg ) { + $self->throw("Calling SeqFeature::PositionProxy->seq with an argument. You probably want attach_seq"); + } + + if ( ! exists $self->{'_gsf_seq'} ) { + return undef; + } + + # assumming our seq object is sensible, it should not have to yank + # the entire sequence out here. + + my $seq = $self->{'_gsf_seq'}->trunc($self->start(), $self->end()); + + + if ( $self->strand == -1 ) { + $seq = $seq->revcom; + } + + return $seq; +} + +=head2 entire_seq + + Title : entire_seq + Usage : $whole_seq = $sf->entire_seq() + Function: gives the entire sequence that this seqfeature is attached to + Example : + Returns : + Args : + + +=cut + +sub entire_seq { + my ($self) = @_; + + return undef unless exists($self->{'_gsf_seq'}); + return $self->{'_gsf_seq'}; +} + + +=head2 seqname + + Title : seqname + Usage : $obj->seq_id($newval) + Function: There are many cases when you make a feature that you + do know the sequence name, but do not know its actual + sequence. This is an attribute such that you can store + the seqname. + + This attribute should *not* be used in GFF dumping, as + that should come from the collection in which the seq + feature was found. + Returns : value of seqname + Args : newvalue (optional) + + +=cut + +sub seqname { + my ($obj,$value) = @_; + if ( defined $value ) { + $obj->{'_gsf_seqname'} = $value; + } + return $obj->{'_gsf_seqname'}; +} + + + +=head2 Proxies + +These functions chain back to the parent for all non sequence related stuff. + + +=cut + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + Function: Returns the primary tag for a feature, + eg 'exon' + Returns : a string + Args : none + + +=cut + +sub primary_tag{ + my ($self,@args) = @_; + + return $self->parent->primary_tag(); +} + +=head2 source_tag + + Title : source_tag + Usage : $tag = $feat->source_tag() + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub source_tag{ + my ($self) = @_; + + return $self->parent->source_tag(); +} + + +=head2 has_tag + + Title : has_tag + Usage : $tag_exists = $self->has_tag('some_tag') + Function: + Returns : TRUE if the specified tag exists, and FALSE otherwise + Args : + + +=cut + +sub has_tag{ + my ($self,$tag) = @_; + + return $self->parent->has_tag($tag); +} + +=head2 each_tag_value + + Title : each_tag_value + Usage : @values = $self->each_tag_value('some_tag') + Function: + Returns : An array comprising the values of the specified tag. + Args : + + +=cut + +sub each_tag_value { + my ($self,$tag) = @_; + + return $self->parent->each_tag_value($tag); +} + +=head2 all_tags + + Title : all_tags + Usage : @tags = $feat->all_tags() + Function: gives all tags for this feature + Returns : an array of strings + Args : none + + +=cut + +sub all_tags{ + my ($self) = @_; + + return $self->parent->all_tags(); +} diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Primer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Primer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,320 @@ +# $Id: Primer.pm,v 1.10 2002/10/30 14:21:58 heikki Exp $ +# +# BioPerl module for Bio::SeqFeature::Primer +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Primer - Primer Generic SeqFeature + +=head1 SYNOPSIS + + A synopsis does not yet exist for this module. + +=head1 DESCRIPTION + + A description does not yet exist for this module. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Chad Matsalla E<lt>bioinformatics1@dieselwurks.comE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Primer; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::Generic; +use Bio::Seq; +use Dumpvalue qw(dumpValue); + +my $dumper = new Dumpvalue(); + + +@ISA = qw(Bio::Root::Root Bio::SeqFeature::Generic); + + + +=head2 new() + + Title : new() + Usage : + Function: + Example : + Returns : + Args : +Devel notes: I think that I want to accept a hash + +=cut + + +sub new { + my ($class, @args) = @_; + my %arguments = @args; + my $self = $class->SUPER::new(@args); + # these are from generic.pm, with which i started + $self->{'_parse_h'} = {}; + $self->{'_gsf_tag_hash'} = {}; + # things that belong with the primer + my ($sequence, $primer_sequence_id, $id) = + $self->{'_gsf_seqname'} = $self->{primer_sequence_id}; + # i am going to keep an array of the things that have been passed + # into the object on construction. this will aid retrieval of these + # things later + foreach my $argument (sort keys %arguments) { + if ($argument eq "-SEQUENCE" || $argument eq "-sequence") { + if (ref($arguments{$argument}) eq "Bio::Seq") { + $self->{seq} = $arguments{$argument}; + } + else { + $self->{seq} = new Bio::Seq( -seq => $arguments{$argument}, + -id => $arguments{-id}); + } + $self->{tags}->{$argument} = "A Bio::Seq. Use seq() to get this 'tag'"; + } + else { + (my $fixed = $argument) =~ s/-//; + $self->{tags}->{$fixed} = $arguments{$argument}; + } + } + if (!$self->{seq}) { + $self->throw("You must pass in a sequence to construct this object."); + } + + # a bunch of things now need to be set for this SeqFeature + # things like: + # TARGET=513,26 + # PRIMER_FIRST_BASE_INDEX=1 + # PRIMER_LEFT=484,20 + return $self; +} + + +=head2 seq() + + Title : seq() + Usage : $seq = $primer->seq(); + Function: Return the _entire_ sequence associated with this Primer. + Returns : A Bio::Seq object + Args : None. +Develper Note: Do you want to be able to set the sequence associated with this + SeqFeature? + +=cut + +sub seq { + my $self = shift; + return $self->{seq}; +} + + + +=head2 all_tags() + + Title : all_tags() + Usage : @tags = $primer->all_tags(); + Function: Return a list of tag names for this Primer. + Returns : An array of strings representing the names of tags in this Primer + Args : None. + Notes : When the Bio::SeqFeature::Primer object is created, the user can + pass in an arbitrary hash containing key->value pairs. This is allowed + because I didn't want to assume that the user was trying to model a + primer3 construct. + +=cut + +#' + +sub all_tags { + my $self = shift; + my @tags = sort keys %{$self->{tags}}; + return @tags; +} + + +=head2 primary_tag() + + Title : primary_tag() + Usage : $tag = $feature->primary_tag(); + Function: Returns the string "Primer" + Returns : A string. + Args : None. + +=cut + +sub primary_tag { + return "Primer"; +} + +=head2 source_tag() + + Title : source_tag() + Usage : $tag = $feature->source_tag(); + Function: Returns the source of this tag. + Returns : A string. + Args : If an argument is provided, the source of this SeqFeature + is set to that argument. + +=cut + +sub source_tag { + my ($self,$insource) = @_; + if ($insource) { $self->{source} = $insource; } + return $self->{source}; +} + +=head2 has_tag() + + Title : has_tag() + Usage : $true_or_false = $feature->has_tag('MELTING_TEMPERATURE'); + Function: Does this SeqFeature have this tag? + Returns : TRUE or FALSE + Args : A string. + +=cut + +sub has_tag { + my ($self,$tagname) = @_; + if ($self->{tags}->{$tagname}) { return "TRUE"; } + return { "FALSE" }; +} + +=head2 each_tag_value() + + Title : each_tag_value() + Usage : $tag = $feature->each_tag_value('MELTING_TEMPERATURE'); + Function: Returns the value of this tag. + Returns : Unknown. Whatever the value of the given tag was. + Args : None. + +=cut + +sub each_tag_value { + my ($self,$tagname) = @_; + return $self->{tags}->{$tagname}; +} + +=head2 location() + + Title : location() + Usage : $tag = $feature->location(); + Function: returns a location object suitable for identifying location of + feature on sequence or parent feature + Returns : a bio::locationi object. + Args : none. +Developer Notes: Chad has no idea how to implement this at this time. + +=cut + +sub location { + my $self = shift; + $self->warn("Chad has not written the code for this yet."); +} + +=head2 start() + + Title : start() + Usage : $start_position = $feature->start($new_position); + Function: Return the start position of this Primer. + Returns : The start position of this Primer. + Args : If an argument is provided, the start position of this + Primer is set to that position. + +=cut + +sub start { + my ($self,$new_position) = @_; + if ($new_position) { $self->{start_position} = $new_position; } + return $self->{start_position}; +} + +=head2 end() + + Title : end() + Usage : $end_position = $feature->end($new_position); + Function: Return the end position of this Primer. + Returns : The end position of this Primer. + Args : If an argument is provided, the end position of this + Primer is set to that position. + +=cut + +sub end { + my ($self,$new_position) = @_; + if ($new_position) { $self->{end_position} = $new_position; } + return $self->{end_position}; +} + +=head2 strand() + + Title : strand() + Usage : + Function: + Returns : + Args : +Developer Notes: Chad has no idea how to implement this at this time. + +=cut + +sub strand { + my $self = shift; + $self->warn("Chad has not implemented this method at this time."); +} + +=head2 display_id() + + Title : display_id() + Usage : $id = $feature->display_id($new_id) + Function: Returns the display ID for this Primer feature + Returns : A scalar. + Args : If an argument is provided, the display_id of this Primer is + set to that value. + +=cut + +sub display_id { + my ($self,$newid) = @_; + if ($newid) { $self->seq()->display_id($newid); } + return $self->seq()->display_id(); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/Similarity.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Similarity.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,206 @@ +# $Id: Similarity.pm,v 1.10 2002/11/01 21:39:05 jason Exp $ +# +# BioPerl module for Bio::SeqFeature::Similarity +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Similarity - A sequence feature based on similarity + +=head1 SYNOPSIS + + # obtain a similarity feature somehow + print "significance: ", $sim_fea->significance(), "\n"; + print "bit score: ", $sim_fea->bits(), "\n"; + print "score: ", $sim_fea->score(), "\n"; + print "fraction of identical residues: ", $sim_fea->frac_identical(), "\n"; + +=head1 DESCRIPTION + +This module is basically a sequence features based on similarity, and therefore +has support for measures assessing the similarity. + +Everything else is inherited from L<Bio::SeqFeature::Generic>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net or hilmar.lapp@pharma.novartis.com + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Similarity; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::SeqFeature::Generic); + +sub new { + my ( $caller, @args) = @_; + my ($self) = $caller->SUPER::new(@args); + + my ($primary,$evalue, $bits, $frac,$seqlen,$seqdesc) = + $self->_rearrange([qw(PRIMARY + EXPECT + BITS + FRAC + SEQDESC + SEQLENGTH + )],@args); + + defined $evalue && $self->significance($evalue); + defined $bits && $self->bits($bits); + defined $frac && $self->frac_identical($frac); + defined $seqlen && $self->seqlength($seqlen); + defined $seqdesc && $self->seqdesc($seqdesc); + $primary = 'similarity' unless defined $primary; + $self->primary_tag($primary) unless( defined $self->primary_tag() ); + $self->strand(0) unless( defined $self->strand() ); + + return $self; +} + +=head2 significance + + Title : significance + Usage : $evalue = $obj->significance(); + $obj->significance($evalue); + Function: + Returns : + Args : + + +=cut + +sub significance { + my ($self, $value) = @_; + + return $self->_tag_value('signif', $value); +} + +=head2 bits + + Title : bits + Usage : $bits = $obj->bits(); + $obj->bits($value); + Function: + Returns : + Args : + + +=cut + +sub bits { + my ($self, $value) = @_; + + return $self->_tag_value('Bits', $value); +} + +=head2 frac_identical + + Title : frac_identical + Usage : $fracid = $obj->frac_identical(); + $obj->frac_identical($value); + Function: + Returns : + Args : + + +=cut + +sub frac_identical { + my ($self, $value) = @_; + + return $self->_tag_value('FracId', $value); +} + +=head2 seqlength + + Title : seqlength + Usage : $len = $obj->seqlength(); + $obj->seqlength($len); + Function: + Returns : + Args : + + +=cut + +sub seqlength { + my ($self, $value) = @_; + + return $self->_tag_value('SeqLength', $value); +} + +=head2 seqdesc + + Title : seqdesc + Usage : $desc = $obj->seqdesc(); + $obj->seqdesc($desc); + Function: At present this method is a shorthand for + $obj->annotation()->description(). + + Note that this is not stored in the tag system and hence will + not be included in the return value of gff_string(). + Returns : + Args : + + +=cut + +sub seqdesc { + my ($self, $value) = @_; + + if( defined $value ) { + my $v = Bio::Annotation::SimpleValue->new(); + $v->value($value); + $self->annotation->add_Annotation('description',$v); + } + my ($v) = $self->annotation()->get_Annotations('description'); + return $v ? $v->value : undef; +} + +# +# Everything else is just inherited from SeqFeature::Generic. +# + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeature/SimilarityPair.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/SimilarityPair.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,343 @@ +# $Id: SimilarityPair.pm,v 1.21 2002/12/24 15:15:32 jason Exp $ +# +# BioPerl module for Bio::SeqFeature::SimilarityPair +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::SimilarityPair - Sequence feature based on the similarity + of two sequences. + +=head1 SYNOPSIS + + $sim_pair = Bio::SeqFeature::SimilarityPair->from_searchResult($blastHit); + + $sim = $sim_pair->query(); # a Bio::SeqFeature::Similarity object - the query + $sim = $sim_pair->hit(); # dto - the hit. + + # some properties for the similarity pair + $expect = $sim_pair->significance(); + $score = $sim_pair->score(); + $bitscore = $sim_pair->bits(); + + # this will not write the description for the sequence (only its name) + print $sim_pair->query()->gff_string(), "\n"; + +=head1 DESCRIPTION + +Lightweight similarity search result as a pair of Similarity +features. This class inherits off Bio::SeqFeature::FeaturePair and +therefore implements Bio::SeqFeatureI, whereas the two features of the +pair are descendants of Bio::SeqFeature::Generic, with better support +for representing similarity search results in a cleaner way. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net or hilmar.lapp@pharma.novartis.com + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::SimilarityPair; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Similarity; +use Bio::SearchIO; + +@ISA = qw(Bio::SeqFeature::FeaturePair); + +=head2 new + + Title : new + Usage : my $similarityPair = new Bio::SeqFeature::SimilarityPair + (-hit => $hit, + -query => $query, + -source => 'blastp'); + Function: Initializes a new SimilarityPair object + Returns : Bio::SeqFeature::SimilarityPair + Args : -query => The query in a Feature pair + -hit => (formerly '-subject') the subject/hit in a Feature pair + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + # Hack to deal with the fact that SimilarityPair calls strand() + # which will lead to an error in Bio::Search::HSP::BlastHSP + # because parsing hasn't yet occurred. + # TODO: Remove this when BlastHSP doesn't do lazy parsing. + $self->{'_initializing'} = 1; + + my ($primary, $hit, $query, $fea1, $source,$sbjct) = + $self->_rearrange([qw(PRIMARY + HIT + QUERY + FEATURE1 + SOURCE + SUBJECT + )],@args); + + if( $sbjct ) { + # undeprecated by Jason before 1.1 release + # $self->deprecated("use of -subject deprecated: SimilarityPair now uses 'hit'"); + if(! $hit) { $hit = $sbjct } + else { + $self->warn("-hit and -subject were specified, using -hit and ignoring -subject"); + } + } + + # make sure at least the query feature exists -- this refers to feature1 + if($query && ! $fea1) { $self->query( $query); } + else { $self->query('null'); } # call with no args sets a default value for query + + $hit && $self->hit($hit); + # the following refer to feature1, which has been ensured to exist + if( defined $primary || ! defined $self->primary_tag) { + $primary = 'similarity' unless defined $primary; + $self->primary_tag($primary); + } + + $source && $self->source_tag($source); + $self->strand(0) unless( defined $self->strand() ); + + $self->{'_initializing'} = 0; # See "Hack" note above + return $self; +} + +# +# Everything else is just inherited from SeqFeature::FeaturePair. +# + +=head2 query + + Title : query + Usage : $query_feature = $obj->query(); + $obj->query($query_feature); + Function: The query object for this similarity pair + Returns : Bio::SeqFeature::Similarity + Args : [optional] Bio::SeqFeature::Similarity + +See L<Bio::SeqFeature::Similarity>, L<Bio::SeqFeature::FeaturePair> + +=cut + +sub query { + my ($self, @args) = @_; + my $f = $self->feature1(); + if( ! @args || ( !ref($args[0]) && $args[0] eq 'null') ) { + if( ! defined( $f) ) { + @args = Bio::SeqFeature::Similarity->new(); + } elsif( ! $f->isa('Bio::SeqFeature::Similarity') && + $f->isa('Bio::SeqFeatureI') ) { + # a Bio::SeqFeature::Generic was placeholder for feature1 + my $newf = new + Bio::SeqFeature::Similarity( -start => $f->start(), + -end => $f->end(), + -strand => $f->strand(), + -primary => $f->primary_tag(), + -source => $f->source_tag(), + -seq_id => $f->seq_id(), + -score => $f->score(), + -frame => $f->frame(), + ); + foreach my $tag ( $newf->all_tags ) { + $tag->add_tag($tag, $newf->each_tag($tag)); + } + @args = $newf; + } else { + @args = (); + } + } + return $self->feature1(@args); +} + + + + +=head2 subject + + Title : subject + Usage : $sbjct_feature = $obj->subject(); + $obj->subject($sbjct_feature); + Function: Get/Set Subject for a SimilarityPair + Returns : Bio::SeqFeature::Similarity + Args : [optional] Bio::SeqFeature::Similarity + Notes : Deprecated. Use the method 'hit' instead + +=cut + +sub subject { + my $self = shift; +# $self->deprecated("Method subject deprecated: use hit() instead"); + $self->hit(@_); +} + +*sbjct = \&subject; + +=head2 hit + + Title : hit + Usage : $sbjct_feature = $obj->hit(); + $obj->hit($sbjct_feature); + Function: Get/Set Hit for a SimilarityPair + Returns : Bio::SeqFeature::Similarity + Args : [optional] Bio::SeqFeature::Similarity + + +=cut + +sub hit { + my ($self, @args) = @_; + my $f = $self->feature2(); + if(! @args || (!ref($args[0]) && $args[0] eq 'null') ) { + if( ! defined( $f) ) { + @args = Bio::SeqFeature::Similarity->new(); + } elsif( ! $f->isa('Bio::SeqFeature::Similarity') && + $f->isa('Bio::SeqFeatureI')) { + # a Bio::SeqFeature::Generic was placeholder for feature2 + my $newf = new + Bio::SeqFeature::Similarity( -start => $f->start(), + -end => $f->end(), + -strand => $f->strand(), + -primary => $f->primary_tag(), + -source => $f->source_tag(), + -seq_id => $f->seq_id(), + -score => $f->score(), + -frame => $f->frame(), + ); + foreach my $tag ( $newf->all_tags ) { + $tag->add_tag($tag, $newf->each_tag($tag)); + } + @args = $newf; + } + } + return $self->feature2(@args); +} + +=head2 source_tag + + Title : source_tag + Usage : $source = $obj->source_tag(); # i.e., program + $obj->source_tag($evalue); + Function: Gets the source tag (program name typically) for a feature + Returns : string + Args : [optional] string + + +=cut + +sub source_tag { + my ($self, @args) = @_; + + if(@args) { + $self->hit()->source_tag(@args); + } + return $self->query()->source_tag(@args); +} + +=head2 significance + + Title : significance + Usage : $evalue = $obj->significance(); + $obj->significance($evalue); + Function: + Returns : + Args : + + +=cut + +sub significance { + my ($self, @args) = @_; + + if(@args) { + $self->hit()->significance(@args); + } + return $self->query()->significance(@args); +} + +=head2 score + + Title : score + Usage : $score = $obj->score(); + $obj->score($value); + Function: + Returns : + Args : + + +=cut + +sub score { + my ($self, @args) = @_; + + if(@args) { + $self->hit()->score(@args); + } + return $self->query()->score(@args); +} + +=head2 bits + + Title : bits + Usage : $bits = $obj->bits(); + $obj->bits($value); + Function: + Returns : + Args : + + +=cut + +sub bits { + my ($self, @args) = @_; + + if(@args) { + $self->hit()->bits(@args); + } + return $self->query()->bits(@args); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqFeatureI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeatureI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,630 @@ +# $Id: SeqFeatureI.pm,v 1.43.2.5 2003/08/28 19:29:34 jason Exp $ +# +# BioPerl module for Bio::SeqFeatureI +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeatureI - Abstract interface of a Sequence Feature + +=head1 SYNOPSIS + + # get a seqfeature somehow, eg, + + foreach $feat ( $seq->top_SeqFeatures() ) { + print "Feature from ", $feat->start, "to ", + $feat->end, " Primary tag ", $feat->primary_tag, + ", produced by ", $feat->source_tag(), "\n"; + + if( $feat->strand == 0 ) { + print "Feature applicable to either strand\n"; + } else { + print "Feature on strand ", $feat->strand,"\n"; # -1,1 + } + + foreach $tag ( $feat->all_tags() ) { + print "Feature has tag ", $tag, "with values, ", + join(' ',$feat->each_tag_value($tag)), "\n"; + } + print "new feature\n" if $feat->has_tag('new'); + # features can have sub features + my @subfeat = $feat->get_SeqFeatures(); + } + +=head1 DESCRIPTION + +This interface is the functions one can expect for any Sequence +Feature, whatever its implementation or whether it is a more complex +type (eg, a Gene). This object doesn\'t actually provide any +implemention, it just provides the definitions of what methods one can +call. See Bio::SeqFeature::Generic for a good standard implementation +of this object + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeatureI; +use vars qw(@ISA $HasInMemory); +use strict; + +BEGIN { + eval { require Bio::DB::InMemoryCache }; + if( $@ ) { $HasInMemory = 0 } + else { $HasInMemory = 1 } +} + +use Bio::RangeI; +use Bio::Seq; + +use Carp; + +@ISA = qw(Bio::RangeI); + +=head1 SeqFeatureI specific methods + +New method interfaces. + +=cut + +=head2 get_SeqFeatures + + Title : get_SeqFeatures + Usage : @feats = $feat->get_SeqFeatures(); + Function: Returns an array of sub Sequence Features + Returns : An array + Args : none + + +=cut + +sub get_SeqFeatures{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 display_name + + Title : display_name + Usage : $name = $feat->display_name() + Function: Returns the human-readable name of the feature for displays. + Returns : a string + Args : none + +=cut + +sub display_name { + shift->throw_not_implemented(); +} + +=head2 primary_tag + + Title : primary_tag + Usage : $tag = $feat->primary_tag() + Function: Returns the primary tag for a feature, + eg 'exon' + Returns : a string + Args : none + + +=cut + +sub primary_tag{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); + +} + +=head2 source_tag + + Title : source_tag + Usage : $tag = $feat->source_tag() + Function: Returns the source tag for a feature, + eg, 'genscan' + Returns : a string + Args : none + + +=cut + +sub source_tag{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + +=head2 has_tag + + Title : has_tag + Usage : $tag_exists = $self->has_tag('some_tag') + Function: + Returns : TRUE if the specified tag exists, and FALSE otherwise + Args : + + +=cut + +sub has_tag{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); + +} + +=head2 get_tag_values + + Title : get_tag_values + Usage : @values = $self->get_tag_values('some_tag') + Function: + Returns : An array comprising the values of the specified tag. + Args : + + +=cut + +sub get_tag_values { + shift->throw_not_implemented(); +} + +=head2 get_all_tags + + Title : get_all_tags + Usage : @tags = $feat->get_all_tags() + Function: gives all tags for this feature + Returns : an array of strings + Args : none + + +=cut + +sub get_all_tags{ + shift->throw_not_implemented(); +} + +=head2 attach_seq + + Title : attach_seq + Usage : $sf->attach_seq($seq) + Function: Attaches a Bio::Seq object to this feature. This + Bio::Seq object is for the *entire* sequence: ie + from 1 to 10000 + + Note that it is not guaranteed that if you obtain a feature + from an object in bioperl, it will have a sequence + attached. Also, implementors of this interface can choose + to provide an empty implementation of this method. I.e., + there is also no guarantee that if you do attach a + sequence, seq() or entire_seq() will not return undef. + + The reason that this method is here on the interface is to + enable you to call it on every SeqFeatureI compliant + object, and that it will be implemented in a useful way and + set to a useful value for the great majority of use + cases. Implementors who choose to ignore the call are + encouraged to specifically state this in their + documentation. + + Example : + Returns : TRUE on success + Args : a Bio::PrimarySeqI compliant object + + +=cut + +sub attach_seq { + shift->throw_not_implemented(); +} + +=head2 seq + + Title : seq + Usage : $tseq = $sf->seq() + Function: returns the truncated sequence (if there is a sequence attached) + for this feature + Example : + Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence + bounded by start & end, or undef if there is no sequence attached + Args : none + + +=cut + +sub seq { + shift->throw_not_implemented(); +} + +=head2 entire_seq + + Title : entire_seq + Usage : $whole_seq = $sf->entire_seq() + Function: gives the entire sequence that this seqfeature is attached to + Example : + Returns : a Bio::PrimarySeqI compliant object, or undef if there is no + sequence attached + Args : none + + +=cut + +sub entire_seq { + shift->throw_not_implemented(); +} + + +=head2 seq_id + + Title : seq_id + Usage : $obj->seq_id($newval) + Function: There are many cases when you make a feature that you + do know the sequence name, but do not know its actual + sequence. This is an attribute such that you can store + the ID (e.g., display_id) of the sequence. + + This attribute should *not* be used in GFF dumping, as + that should come from the collection in which the seq + feature was found. + Returns : value of seq_id + Args : newvalue (optional) + + +=cut + +sub seq_id { + shift->throw_not_implemented(); +} + +=head2 gff_string + + Title : gff_string + Usage : $str = $feat->gff_string; + $str = $feat->gff_string($gff_formatter); + Function: Provides the feature information in GFF format. + + The implementation provided here returns GFF2 by default. If you + want a different version, supply an object implementing a method + gff_string() accepting a SeqFeatureI object as argument. E.g., to + obtain GFF1 format, do the following: + + my $gffio = Bio::Tools::GFF->new(-gff_version => 1); + $gff1str = $feat->gff_string($gff1io); + + Returns : A string + Args : Optionally, an object implementing gff_string(). + + +=cut + +sub gff_string{ + my ($self,$formatter) = @_; + + $formatter = $self->_static_gff_formatter unless $formatter; + return $formatter->gff_string($self); +} + +my $static_gff_formatter = undef; + +=head2 _static_gff_formatter + + Title : _static_gff_formatter + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _static_gff_formatter{ + my ($self,@args) = @_; + + if( !defined $static_gff_formatter ) { + $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2); + } + return $static_gff_formatter; +} + +=head1 Bio::RangeI methods + +List of interfaces inherited from Bio::RangeI (see L<Bio::RangeI> +for details). + +=cut + +=head2 start + + Title : start + Usage : $start = $feat->start + Function: Returns the start coordinate of the feature + Returns : integer + Args : none + + +=head2 end + + Title : end + Usage : $end = $feat->end + Function: Returns the end coordinate of the feature + Returns : integer + Args : none + +=head2 strand + + Title : strand + Usage : $strand = $feat->strand() + Function: Returns strand information, being 1,-1 or 0 + Returns : -1,1 or 0 + Args : none + + +=cut + +=head1 Decorating methods + +These methods have an implementation provided by Bio::SeqFeatureI, +but can be validly overwritten by subclasses + +=head2 spliced_seq + + Title : spliced_seq + + Usage : $seq = $feature->spliced_seq() + $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs) + + Function: Provides a sequence of the feature which is the most + semantically "relevant" feature for this sequence. A + default implementation is provided which for simple cases + returns just the sequence, but for split cases, loops over + the split location to return the sequence. In the case of + split locations with remote locations, eg + + join(AB000123:5567-5589,80..1144) + + in the case when a database object is passed in, it will + attempt to retrieve the sequence from the database object, + and "Do the right thing", however if no database object is + provided, it will generate the correct number of N's (DNA) + or X's (protein, though this is unlikely). + + This function is deliberately "magical" attempting to + second guess what a user wants as "the" sequence for this + feature + + Implementing classes are free to override this method with + their own magic if they have a better idea what the user + wants + + Args : [optional] A Bio::DB::RandomAccessI compliant object + Returns : A Bio::Seq + +=cut + +sub spliced_seq { + my $self = shift; + my $db = shift; + + if( ! $self->location->isa("Bio::Location::SplitLocationI") ) { + return $self->seq(); # nice and easy! + } + + # redundant test, but the above ISA is probably not ideal. + if( ! $self->location->isa("Bio::Location::SplitLocationI") ) { + $self->throw("not atomic, not split, yikes, in trouble!"); + } + + my $seqstr; + my $seqid = $self->entire_seq->display_id; + # This is to deal with reverse strand features + # so we are really sorting features 5' -> 3' on their strand + # i.e. rev strand features will be sorted largest to smallest + # as this how revcom CDSes seem to be annotated in genbank. + # Might need to eventually allow this to be programable? + # (can I mention how much fun this is NOT! --jason) + + my ($mixed,$mixedloc,$fstrand) = (0); + + if( defined $db && + ref($db) && !$db->isa('Bio::DB::RandomAccessI') ) { + $self->warn("Must pass in a valid Bio::DB::RandomAccessI object for access to remote locations for spliced_seq"); + $db = undef; + } elsif( defined $db && + $HasInMemory && ! $db->isa('Bio::DB::InMemoryCache') ) { + $db = new Bio::DB::InMemoryCache(-seqdb => $db); + } + + if( $self->isa('Bio::Das::SegmentI') && + ! $self->absolute ) { + $self->warn("Calling spliced_seq with a Bio::Das::SegmentI ". + "which does have absolute set to 1 -- be warned ". + "you may not be getting things on the correct strand"); + } + + my @locs = map { $_->[0] } + # sort so that most negative is first basically to order + # the features on the opposite strand 5'->3' on their strand + # rather than they way most are input which is on the fwd strand + + sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation + map { + $fstrand = $_->strand unless defined $fstrand; + $mixed = 1 if defined $_->strand && $fstrand != $_->strand; + if( defined $_->seq_id ) { + $mixedloc = 1 if( $_->seq_id ne $seqid ); + } + [ $_, $_->start* ($_->strand || 1)]; + } $self->location->each_Location; + + if ( $mixed ) { + $self->warn("Mixed strand locations, spliced seq using the input ". + "order rather than trying to sort"); + @locs = $self->location->each_Location; + } elsif( $mixedloc ) { + # we'll use the prescribed location order + @locs = $self->location->each_Location; + } + + + foreach my $loc ( @locs ) { + if( ! $loc->isa("Bio::Location::Atomic") ) { + $self->throw("Can only deal with one level deep locations"); + } + my $called_seq; + if( $fstrand != $loc->strand ) { + $self->warn("feature strand is different from location strand!"); + } + # deal with remote sequences + if( defined $loc->seq_id && + $loc->seq_id ne $seqid ) { + if( defined $db ) { + my $sid = $loc->seq_id; + $sid =~ s/\.\d+$//g; + eval { + $called_seq = $db->get_Seq_by_acc($sid); + }; + if( $@ ) { + $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@"); + $called_seq = undef; + } + } else { + $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)"); + $called_seq = undef; + } + if( !defined $called_seq ) { + $seqstr .= 'N' x $self->length; + next; + } + } else { + $called_seq = $self->entire_seq; + } + + if( $self->isa('Bio::Das::SegmentI') ) { + my ($s,$e) = ($loc->start,$loc->end); + $seqstr .= $called_seq->subseq($s,$e)->seq(); + } else { + # This is dumb subseq should work on locations... + if( $loc->strand == 1 ) { + $seqstr .= $called_seq->subseq($loc->start,$loc->end); + } else { + $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq(); + } + } + } + my $out = Bio::Seq->new( -id => $self->entire_seq->display_id . "_spliced_feat", + -seq => $seqstr); + + return $out; +} + +=head1 RangeI methods + +These methods are inherited from RangeI and can be used +directly from a SeqFeatureI interface. Remember that a +SeqFeature is-a RangeI, and so wherever you see RangeI you +can use a feature ($r in the below documentation). + +=head2 overlaps + + Title : overlaps + Usage : if($feat->overlaps($r)) { do stuff } + if($feat->overlaps(200)) { do stuff } + Function: tests if $feat overlaps $r + Args : a RangeI to test for overlap with, or a point + Returns : true if the Range overlaps with the feature, false otherwise + + +=head2 contains + + Title : contains + Usage : if($feat->contains($r) { do stuff } + Function: tests whether $feat totally contains $r + Args : a RangeI to test for being contained + Returns : true if the argument is totaly contained within this range + + +=head2 equals + + Title : equals + Usage : if($feat->equals($r)) + Function: test whether $feat has the same start, end, strand as $r + Args : a RangeI to test for equality + Returns : true if they are describing the same range + + +=head1 Geometrical methods + +These methods do things to the geometry of ranges, and return +triplets (start, stop, strand) from which new ranges could be built. + +=head2 intersection + + Title : intersection + Usage : ($start, $stop, $strand) = $feat->intersection($r) + Function: gives the range that is contained by both ranges + Args : a RangeI to compare this one to + Returns : nothing if they do not overlap, or the range that they do overlap + +=head2 union + + Title : union + Usage : ($start, $stop, $strand) = $feat->union($r); + : ($start, $stop, $strand) = Bio::RangeI->union(@ranges); + Function: finds the minimal range that contains all of the ranges + Args : a range or list of ranges to find the union of + Returns : the range containing all of the ranges + +=cut + +=head2 location + + Title : location + Usage : my $location = $seqfeature->location() + Function: returns a location object suitable for identifying location + of feature on sequence or parent feature + Returns : Bio::LocationI object + Args : none + + +=cut + +sub location { + my ($self) = @_; + + $self->throw_not_implemented(); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,243 @@ +# $Id: SeqI.pm,v 1.25 2002/12/05 13:46:30 heikki Exp $ +# +# BioPerl module for Bio::SeqI +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqI [Developers] - Abstract Interface of Sequence (with features) + +=head1 SYNOPSIS + + # Bio::SeqI is the interface class for sequences. + + # If you are a newcomer to bioperl, you should + # start with Bio::Seq documentation. This + # documentation is mainly for developers using + # Bioperl. + + # Bio::SeqI implements Bio::PrimarySeqI + $seq = $seqobj->seq(); # actual sequence as a string + $seqstr = $seqobj->subseq(10,50); + + # Bio::SeqI has annotationcollections + + $ann = $seqobj->annotation(); # annotation object + + # Bio::SeqI has sequence features + # features must implement Bio::SeqFeatureI + + @features = $seqobj->get_SeqFeatures(); # just top level + @features = $seqobj->get_all_SeqFeatures(); # descend into sub features + + + +=head1 DESCRIPTION + +Bio::SeqI is the abstract interface of annotated Sequences. These +methods are those which you can be guarenteed to get for any Bio::SeqI +- for most users of the package the documentation (and methods) in +this class are not at useful - this is a developers only class which +defines what methods have to be implmented by other Perl objects to +comply to the Bio::SeqI interface. Go "perldoc Bio::Seq" or "man +Bio::Seq" for more information. + + +There aren't many here, because too many complicated functions here +prevent implementations which are just wrappers around a database or +similar delayed mechanisms. + +Most of the clever stuff happens inside the SeqFeatureI system. + +A good reference implementation is Bio::Seq which is a pure perl +implementation of this class with alot of extra pieces for extra +manipulation. However, if you want to be able to use any sequence +object in your analysis, if you can do it just using these methods, +then you know you will be future proof and compatible with other +implementations of Seq. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + + +package Bio::SeqI; +use strict; + +use vars qw(@ISA); +use Bio::PrimarySeqI; +use Bio::AnnotatableI; +use Bio::FeatureHolderI; + +# Object preamble - inheriets from Bio::PrimarySeqI + +@ISA = qw(Bio::PrimarySeqI Bio::AnnotatableI Bio::FeatureHolderI); + +=head2 get_SeqFeatures + + Title : get_SeqFeatures + Usage : my @feats = $seq->get_SeqFeatures(); + Function: retrieve just the toplevel sequence features attached to this seq + Returns : array of Bio::SeqFeatureI objects + Args : none + +This method comes through extension of Bio::FeatureHolderI. See +L<Bio::FeatureHolderI> and L<Bio::SeqFeatureI> for more information. + +=cut + +=head2 get_all_SeqFeatures + + Title : get_all_SeqFeatures + Usage : @features = $annseq->get_all_SeqFeatures() + Function: returns all SeqFeatures, included sub SeqFeatures + Returns : an array of Bio::SeqFeatureI objects + Args : none + +This method comes through extension of Bio::FeatureHolderI. See +L<Bio::FeatureHolderI> and L<Bio::SeqFeatureI> for more information. + +=cut + +=head2 feature_count + + Title : feature_count + Usage : $seq->feature_count() + Function: Return the number of SeqFeatures attached to a sequence + Returns : integer representing the number of SeqFeatures + Args : none + +This method comes through extension of Bio::FeatureHolderI. See +L<Bio::FeatureHolderI> for more information. + +=cut + +=head2 seq + + Title : seq + Usage : my $string = $seq->seq(); + Function: Retrieves the sequence string for the sequence object + Returns : string + Args : none + + +=cut + +sub seq{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 write_GFF + + Title : write_GFF + Usage : $seq->write_GFF(\*FILEHANDLE); + Function: Convience method to write out all the sequence features + in GFF format to the provided filehandle (STDOUT by default) + Returns : none + Args : [optional] filehandle to write to (default is STDOUT) + + +=cut + +sub write_GFF{ + my ($self,$fh) = @_; + + $fh || do { $fh = \*STDOUT; }; + + foreach my $sf ( $self->get_all_SeqFeatures() ) { + print $fh $sf->gff_string, "\n"; + } + +} + +=head2 annotation + + Title : annotation + Usage : $obj->annotation($seq_obj) + Function: retrieve the attached annotation object + Returns : Bio::AnnotationCollectionI or none; + +See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection> +for more information. This method comes through extension from +L<Bio::AnnotatableI>. + +=cut + +=head2 species + + Title : species + Usage : + Function: Gets or sets the species + Example : $species = $self->species(); + Returns : Bio::Species object + Args : Bio::Species object or none; + +See L<Bio::Species> for more information + +=cut + +sub species { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 primary_seq + + Title : primary_seq + Usage : $obj->primary_seq($newval) + Function: Retrieve the underlying Bio::PrimarySeqI object if available. + This is in the event one has a sequence with lots of features + but want to be able to narrow the object to just one with + the basics of a sequence (no features or annotations). + Returns : Bio::PrimarySeqI + Args : Bio::PrimarySeqI or none; + +See L<Bio::PrimarySeqI> for more information + +=cut + +sub primary_seq { + my ($self) = @_; + $self->throw_not_implemented; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,748 @@ + +# $Id: SeqIO.pm,v 1.59.2.4 2003/09/14 19:16:53 jason Exp $ +# +# BioPerl module for Bio::SeqIO +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# and Lincoln Stein <lstein@cshl.org> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself +# +# _history +# October 18, 1999 Largely rewritten by Lincoln Stein + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO - Handler for SeqIO Formats + +=head1 SYNOPSIS + + use Bio::SeqIO; + + $in = Bio::SeqIO->new(-file => "inputfilename" , '-format' => 'Fasta'); + $out = Bio::SeqIO->new(-file => ">outputfilename" , '-format' => 'EMBL'); + # note: we quote -format to keep older Perls from complaining. + + while ( my $seq = $in->next_seq() ) { + $out->write_seq($seq); + } + +Now, to actually get at the sequence object, use the standard Bio::Seq +methods (look at L<Bio::Seq> if you don't know what they are) + + use Bio::SeqIO; + + $in = Bio::SeqIO->new(-file => "inputfilename" , '-format' => 'genbank'); + + while ( my $seq = $in->next_seq() ) { + print "Sequence ",$seq->id," first 10 bases ",$seq->subseq(1,10),"\n"; + } + + +The SeqIO system does have a filehandle binding. Most people find this +a little confusing, but it does mean you write the world's smallest +reformatter + + use Bio::SeqIO; + + $in = Bio::SeqIO->newFh(-file => "inputfilename" , '-format' => 'Fasta'); + $out = Bio::SeqIO->newFh('-format' => 'EMBL'); + + # World's shortest Fasta<->EMBL format converter: + print $out $_ while <$in>; + + +=head1 DESCRIPTION + +Bio::SeqIO is a handler module for the formats in the SeqIO set (eg, +Bio::SeqIO::fasta). It is the officially sanctioned way of getting at +the format objects, which most people should use. + +The Bio::SeqIO system can be thought of like biological file handles. +They are attached to filehandles with smart formatting rules (eg, +genbank format, or EMBL format, or binary trace file format) and +can either read or write sequence objects (Bio::Seq objects, or +more correctly, Bio::SeqI implementing objects, of which Bio::Seq is +one such object). If you want to know what to do with a Bio::Seq +object, read L<Bio::Seq>. + +The idea is that you request a stream object for a particular format. +All the stream objects have a notion of an internal file that is read +from or written to. A particular SeqIO object instance is configured +for either input or output. A specific example of a stream object is +the Bio::SeqIO::fasta object. + +Each stream object has functions + + $stream->next_seq(); + +and + + $stream->write_seq($seq); + +As an added bonus, you can recover a filehandle that is tied to the +SeqIO object, allowing you to use the standard E<lt>E<gt> and print operations +to read and write sequence objects: + + use Bio::SeqIO; + + $stream = Bio::SeqIO->newFh(-format => 'Fasta'); # read from standard input + + while ( $seq = <$stream> ) { + # do something with $seq + } + +and + + print $stream $seq; # when stream is in output mode + +This makes the simplest ever reformatter + + #!/usr/local/bin/perl + + $format1 = shift; + $format2 = shift || die "Usage: reformat format1 format2 < input > output"; + + use Bio::SeqIO; + + $in = Bio::SeqIO->newFh(-format => $format1 ); + $out = Bio::SeqIO->newFh(-format => $format2 ); + #note: you might want to quote -format to keep older perl's from complaining. + + print $out $_ while <$in>; + + +=head1 CONSTRUCTORS + +=head2 Bio::SeqIO-E<gt>new() + + $seqIO = Bio::SeqIO->new(-file => 'filename', -format=>$format); + $seqIO = Bio::SeqIO->new(-fh => \*FILEHANDLE, -format=>$format); + $seqIO = Bio::SeqIO->new(-format => $format); + +The new() class method constructs a new Bio::SeqIO object. The +returned object can be used to retrieve or print Seq objects. new() +accepts the following parameters: + +=over 4 + +=item -file + +A file path to be opened for reading or writing. The usual Perl +conventions apply: + + 'file' # open file for reading + '>file' # open file for writing + '>>file' # open file for appending + '+<file' # open file read/write + 'command |' # open a pipe from the command + '| command' # open a pipe to the command + +=item -fh + +You may provide new() with a previously-opened filehandle. For +example, to read from STDIN: + + $seqIO = Bio::SeqIO->new(-fh => \*STDIN); + +Note that you must pass filehandles as references to globs. + +If neither a filehandle nor a filename is specified, then the module +will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt> +semantics. + +A string filehandle is handy if you want to modify the output in the +memory, before printing it out. The following program reads in EMBL +formatted entries from a file and prints them out in fasta format with +some HTML tags: + + use Bio::SeqIO; + use IO::String; + my $in = Bio::SeqIO->new('-file' => "emblfile" , + '-format' => 'EMBL'); + while ( my $seq = $in->next_seq() ) { + # the output handle is reset for every file + my $stringio = IO::String->new($string); + my $out = Bio::SeqIO->new('-fh' => $stringio, + '-format' => 'fasta'); + # output goes into $string + $out->write_seq($seq); + # modify $string + $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g; + # print into STDOUT + print $string; + } + +=item -format + +Specify the format of the file. Supported formats include: + + Fasta FASTA format + EMBL EMBL format + GenBank GenBank format + swiss Swissprot format + PIR Protein Information Resource format + GCG GCG format + raw Raw format (one sequence per line, no ID) + ace ACeDB sequence format + game GAME XML format + phd phred output + qual Quality values (get a sequence of quality scores) + Fastq Fastq format + SCF SCF tracefile format + ABI ABI tracefile format + ALF ALF tracefile format + CTF CTF tracefile format + ZTR ZTR tracefile format + PLN Staden plain tracefile format + EXP Staden tagged experiment tracefile format + +If no format is specified and a filename is given then the module +will attempt to deduce the format from the filename suffix. If this +is unsuccessful then Fasta format is assumed. + +The format name is case insensitive. 'FASTA', 'Fasta' and 'fasta' are +all valid suffixes. + +Currently, the tracefile formats (except for SCF) require installation +of the external Staden "io_lib" package, as well as the +Bio::SeqIO::staden::read package available from the bioperl-ext +repository. + +=item -flush + +By default, all files (or filehandles) opened for writing sequences +will be flushed after each write_seq() (making the file immediately +usable). If you don't need this facility and would like to marginally +improve the efficiency of writing multiple sequences to the same file +(or filehandle), pass the -flush option '0' or any other value that +evaluates as defined but false: + + my $gb = new Bio::SeqIO -file => "<gball.gbk", + -format => "gb"; + my $fa = new Bio::SeqIO -file => ">gball.fa", + -format => "fasta", + -flush => 0; # go as fast as we can! + while($seq = $gb->next_seq) { $fa->write_seq($seq) } + + +=back + +=head2 Bio::SeqIO-E<gt>newFh() + + $fh = Bio::SeqIO->newFh(-fh => \*FILEHANDLE, -format=>$format); + $fh = Bio::SeqIO->newFh(-format => $format); + # etc. + +This constructor behaves like new(), but returns a tied filehandle +rather than a Bio::SeqIO object. You can read sequences from this +object using the familiar E<lt>E<gt> operator, and write to it using +print(). The usual array and $_ semantics work. For example, you can +read all sequence objects into an array like this: + + @sequences = <$fh>; + +Other operations, such as read(), sysread(), write(), close(), and printf() +are not supported. + +=head1 OBJECT METHODS + +See below for more detailed summaries. The main methods are: + +=head2 $sequence = $seqIO-E<gt>next_seq() + +Fetch the next sequence from the stream. + +=head2 $seqIO-E<gt>write_seq($sequence [,$another_sequence,...]) + +Write the specified sequence(s) to the stream. + +=head2 TIEHANDLE(), READLINE(), PRINT() + +These provide the tie interface. See L<perltie> for more details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. + +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, Lincoln Stein + +Email birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' Let the code begin... + +package Bio::SeqIO; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Factory::SequenceStreamI; +use Bio::Factory::FTLocationFactory; +use Bio::Seq::SeqBuilder; +use Symbol(); + +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::Factory::SequenceStreamI); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; +} + +my %valid_alphabet_cache; + +=head2 new + + Title : new + Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'Format') + Function: Returns a new seqstream + Returns : A Bio::SeqIO stream initialised with the appropriate format + Args : Named parameters: + -file => $filename + -fh => filehandle to attach to + -format => format + + Additional arguments may be used to set factories and + builders involved in the sequence object creation. None of + these must be provided, they all have reasonable defaults. + -seqfactory the L<Bio::Factory::SequenceFactoryI> object + -locfactory the L<Bio::Factory::LocationFactoryI> object + -objbuilder the L<Bio::Factory::ObjectBuilderI> object + +See L<Bio::SeqIO::Handler> + +=cut + +my $entry = 0; + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::SeqIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ) || + 'fasta'; + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::SeqIO::$format"->new(@args); + } +} + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') + $sequence = <$fh>; # read a sequence object + print $fh $sequence; # write a sequence object + Returns : filehandle tied to the Bio::SeqIO::Fh class + Args : + +See L<Bio::SeqIO::Fh> + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $sequence = <$fh>; # read a sequence object + print $fh $sequence; # write a sequence object + Returns : filehandle tied to Bio::SeqIO class + Args : none + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +# _initialize is chained for all SeqIO classes + +sub _initialize { + my($self, @args) = @_; + + # flush is initialized by the Root::IO init + + my ($seqfact,$locfact,$objbuilder) = + $self->_rearrange([qw(SEQFACTORY + LOCFACTORY + OBJBUILDER) + ], @args); + + $locfact = Bio::Factory::FTLocationFactory->new(-verbose => $self->verbose) if ! $locfact; + $objbuilder = Bio::Seq::SeqBuilder->new(-verbose => $self->verbose) unless $objbuilder; + $self->sequence_builder($objbuilder); + $self->location_factory($locfact); + # note that this should come last because it propagates the sequence + # factory to the sequence builder + $seqfact && $self->sequence_factory($seqfact); + + # initialize the IO part + $self->_initialize_io(@args); +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = stream->next_seq + Function: Reads the next sequence object from the stream and returns it. + + Certain driver modules may encounter entries in the stream that + are either misformatted or that use syntax not yet understood + by the driver. If such an incident is recoverable, e.g., by + dismissing a feature of a feature table or some other non-mandatory + part of an entry, the driver will issue a warning. In the case + of a non-recoverable situation an exception will be thrown. + Do not assume that you can resume parsing the same stream after + catching the exception. Note that you can always turn recoverable + errors into exceptions by calling $stream->verbose(2). + Returns : a Bio::Seq sequence object + Args : none + +See L<Bio::Root::RootI>, L<Bio::Factory::SeqStreamI>, L<Bio::Seq> + +=cut + +sub next_seq { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::SeqIO object."); +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + +=cut + +sub write_seq { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot write to a generic Bio::SeqIO object."); +} + + +=head2 alphabet + + Title : alphabet + Usage : $self->alphabet($newval) + Function: Set/get the molecule type for the Seq objects to be created. + Example : $seqio->alphabet('protein') + Returns : value of alphabet: 'dna', 'rna', or 'protein' + Args : newvalue (optional) + Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein' + +=cut + +sub alphabet { + my ($self, $value) = @_; + + if ( defined $value) { + $value = lc $value; + unless ($valid_alphabet_cache{$value}) { + # instead of hard-coding the allowed values once more, we check by + # creating a dummy sequence object + eval { + require Bio::PrimarySeq; + my $seq = Bio::PrimarySeq->new('-verbose' => $self->verbose, + '-alphabet' => $value); + + }; + if ($@) { + $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values."); + } + $valid_alphabet_cache{$value} = 1; + } + $self->{'alphabet'} = $value; + } + return $self->{'alphabet'}; +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL SeqIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self, $format) = @_; + my $module = "Bio::SeqIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <<END; +$self: $format cannot be found +Exception $@ +For more information about the SeqIO system please see the SeqIO docs. +This includes ways of checking for formats at compile time, not run time +END + ; + } + return $ok; +} + +=head2 _concatenate_lines + + Title : _concatenate_lines + Usage : $s = _concatenate_lines($line, $continuation_line) + Function: Private. Concatenates two strings assuming that the second stems + from a continuation line of the first. Adds a space between both + unless the first ends with a dash. + + Takes care of either arg being empty. + Example : + Returns : A string. + Args : + +=cut + +sub _concatenate_lines { + my ($self, $s1, $s2) = @_; + + $s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2); + return ($s1 ? $s1 : "") . ($s2 ? $s2 : ""); +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: This method is deprecated. Call _fh() instead. + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +sub _filehandle { + my ($self,@args) = @_; + return $self->_fh(@args); +} + +=head2 _guess_format + + Title : _guess_format + Usage : $obj->_guess_format($filename) + Function: guess format based on file suffix + Example : + Returns : guessed format of filename (lower case) + Args : + Notes : formats that _filehandle() will guess include fasta, + genbank, scf, pir, embl, raw, gcg, ace, bsml, swissprot, + fastq and phd/phred + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'fasta' if /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i; + return 'genbank' if /\.(gb|gbank|genbank|gbk|gbs)$/i; + return 'scf' if /\.scf$/i; + return 'scf' if /\.scf$/i; + return 'abi' if /\.abi$/i; + return 'alf' if /\.alf$/i; + return 'ctf' if /\.ctf$/i; + return 'ztr' if /\.ztr$/i; + return 'pln' if /\.pln$/i; + return 'exp' if /\.exp$/i; + return 'pir' if /\.pir$/i; + return 'embl' if /\.(embl|ebl|emb|dat)$/i; + return 'raw' if /\.(txt)$/i; + return 'gcg' if /\.gcg$/i; + return 'ace' if /\.ace$/i; + return 'bsml' if /\.(bsm|bsml)$/i; + return 'swiss' if /\.(swiss|sp)$/i; + return 'phd' if /\.(phd|phred)$/i; + return 'fastq' if /\.fastq$/i; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'seqio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'seqio'}->next_seq() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'seqio'}->next_seq(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'seqio'}->write_seq(@_); +} + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($seqfactory) + Function: Get/Set the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : [optional] Bio::Factory::SequenceFactoryI + + +=cut + +sub sequence_factory{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) { + $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)."::sequence_factory()"); + } + $self->{'_seqio_seqfactory'} = $obj; + my $builder = $self->sequence_builder(); + if($builder && $builder->can('sequence_factory') && + (! $builder->sequence_factory())) { + $builder->sequence_factory($obj); + } + } + $self->{'_seqio_seqfactory'}; +} + +=head2 object_factory + + Title : object_factory + Usage : $obj->object_factory($newval) + Function: This is an alias to sequence_factory with a more generic name. + Example : + Returns : value of object_factory (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub object_factory{ + return shift->sequence_factory(@_); +} + +=head2 sequence_builder + + Title : sequence_builder + Usage : $seqio->sequence_builder($seqfactory) + Function: Get/Set the L<Bio::Factory::ObjectBuilderI> used to build sequence + objects. + + If you do not set the sequence object builder yourself, it + will in fact be an instance of L<Bio::Seq::SeqBuilder>, and + you may use all methods documented there to configure it. + + Returns : a L<Bio::Factory::ObjectBuilderI> compliant object + Args : [optional] a L<Bio::Factory::ObjectBuilderI> compliant object + + +=cut + +sub sequence_builder{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::ObjectBuilderI') ) { + $self->throw("Must provide a valid Bio::Factory::ObjectBuilderI object to ".ref($self)."::sequence_builder()"); + } + $self->{'_object_builder'} = $obj; + } + $self->{'_object_builder'}; +} + +=head2 location_factory + + Title : location_factory + Usage : $seqio->location_factory($locfactory) + Function: Get/Set the Bio::Factory::LocationFactoryI object to be used for + location string parsing + Returns : a L<Bio::Factory::LocationFactoryI> implementing object + Args : [optional] on set, a L<Bio::Factory::LocationFactoryI> implementing + object. + + +=cut + +sub location_factory{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::LocationFactoryI') ) { + $self->throw("Must provide a valid Bio::Factory::LocationFactoryI". + " object to ".ref($self)."->location_factory()"); + } + $self->{'_seqio_locfactory'} = $obj; + } + $self->{'_seqio_locfactory'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/FTHelper.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/FTHelper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,315 @@ +# $Id: FTHelper.pm,v 1.55 2002/11/05 02:55:12 lapp Exp $ +# +# BioPerl module for Bio::SeqIO::FTHelper +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::FTHelper - Helper class for Embl/Genbank feature tables + +=head1 SYNOPSIS + +Used by Bio::SeqIO::EMBL to help process the Feature Table + +=head1 DESCRIPTION + +Represents one particular Feature with the following fields + + key - the key of the feature + loc - the location string of the feature + <other fields> - other fields + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 CONTRIBUTORS + +Jason Stajich jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqIO::FTHelper; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::Generic; +use Bio::Location::Simple; +use Bio::Location::Fuzzy; +use Bio::Location::Split; + + +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + +sub new { + my ($class, @args) = @_; + + # no chained new because we make lots and lots of these. + my $self = {}; + bless $self,$class; + $self->{'_field'} = {}; + return $self; +} + +=head2 _generic_seqfeature + + Title : _generic_seqfeature + Usage : $fthelper->_generic_seqfeature($annseq, "GenBank") + Function: processes fthelper into a generic seqfeature + Returns : TRUE on success and otherwise FALSE + Args : The Bio::Factory::LocationFactoryI object to use for parsing + location strings. The ID (e.g., display_id) of the sequence on which + this feature is located, optionally a string indicating the source + (GenBank/EMBL/SwissProt) + + +=cut + +sub _generic_seqfeature { + my ($fth, $locfac, $seqid, $source) = @_; + my ($sf); + + # set a default if not specified + if(! defined($source)) { + $source = "EMBL/GenBank/SwissProt"; + } + + # initialize feature object + $sf = Bio::SeqFeature::Generic->direct_new(); + + # parse location; this may cause an exception, in which case we gently + # recover and ignore this feature + my $loc; + eval { + $loc = $locfac->from_string($fth->loc); + }; + if(! $loc) { + $fth->warn("exception while parsing location line [" . $fth->loc . + "] in reading $source, ignoring feature " . + $fth->key() . " (seqid=" . $seqid . "): " . $@); + return; + } + + # set additional location attributes + if($seqid && (! $loc->is_remote())) { + $loc->seq_id($seqid); # propagates if it is a split location + } + + # set attributes of feature + $sf->location($loc); + $sf->primary_tag($fth->key); + $sf->source_tag($source); + foreach my $key ( keys %{$fth->field} ){ + foreach my $value ( @{$fth->field->{$key}} ) { + $sf->add_tag_value($key,$value); + } + } + return $sf; +} + + +=head2 from_SeqFeature + + Title : from_SeqFeature + Usage : @fthelperlist = Bio::SeqIO::FTHelper::from_SeqFeature($sf, + $context_annseq); + Function: constructor of fthelpers from SeqFeatures + : + : The additional annseq argument is to allow the building of FTHelper + : lines relevant to particular sequences (ie, when features are spread over + : enteries, knowing how to build this) + Returns : an array of FThelpers + Args : seq features + + +=cut + +sub from_SeqFeature { + my ($sf, $context_annseq) = @_; + my @ret; + + # + # If this object knows how to make FThelpers, then let it + # - this allows us to store *really* weird objects that can write + # themselves to the EMBL/GenBank... + # + + if ( $sf->can("to_FTHelper") ) { + return $sf->to_FTHelper($context_annseq); + } + + my $fth = Bio::SeqIO::FTHelper->new(); + my $key = $sf->primary_tag(); + my $locstr = $sf->location->to_FTstring; + + # ES 25/06/01 Commented out this code, Jason to double check + #The location FT string for all simple subseqfeatures is already + #in the Split location FT string + + # going into sub features + #foreach my $sub ( $sf->sub_SeqFeature() ) { + #my @subfth = &Bio::SeqIO::FTHelper::from_SeqFeature($sub); + #push(@ret, @subfth); + #} + + $fth->loc($locstr); + $fth->key($key); + $fth->field->{'note'} = []; + #$sf->source_tag && do { push(@{$fth->field->{'note'}},"source=" . $sf->source_tag ); }; + + ($sf->can('score') && $sf->score) && do { push(@{$fth->field->{'note'}}, + "score=" . $sf->score ); }; + ($sf->can('frame') && $sf->frame) && do { push(@{$fth->field->{'note'}}, + "frame=" . $sf->frame ); }; + #$sf->strand && do { push(@{$fth->field->{'note'}},"strand=" . $sf->strand ); }; + + foreach my $tag ( $sf->all_tags ) { + # Tags which begin with underscores are considered + # private, and are therefore not printed + next if $tag =~ /^_/; + if ( !defined $fth->field->{$tag} ) { + $fth->field->{$tag} = []; + } + foreach my $val ( $sf->each_tag_value($tag) ) { + push(@{$fth->field->{$tag}},$val); + } + } + push(@ret, $fth); + + unless (@ret) { + $context_annseq->throw("Problem in processing seqfeature $sf - no fthelpers. Error!"); + } + foreach my $ft (@ret) { + if ( !$ft->isa('Bio::SeqIO::FTHelper') ) { + $sf->throw("Problem in processing seqfeature $sf - made a $fth!"); + } + } + + return @ret; + +} + + +=head2 key + + Title : key + Usage : $obj->key($newval) + Function: + Example : + Returns : value of key + Args : newvalue (optional) + + +=cut + +sub key { + my ($obj, $value) = @_; + if ( defined $value ) { + $obj->{'key'} = $value; + } + return $obj->{'key'}; + +} + +=head2 loc + + Title : loc + Usage : $obj->loc($newval) + Function: + Example : + Returns : value of loc + Args : newvalue (optional) + + +=cut + +sub loc { + my ($obj, $value) = @_; + if ( defined $value ) { + $obj->{'loc'} = $value; + } + return $obj->{'loc'}; +} + + +=head2 field + + Title : field + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub field { + my ($self) = @_; + + return $self->{'_field'}; +} + +=head2 add_field + + Title : add_field + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub add_field { + my ($self, $key, $val) = @_; + + if ( !exists $self->field->{$key} ) { + $self->field->{$key} = []; + } + push( @{$self->field->{$key}} , $val); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/MultiFile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/MultiFile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,251 @@ +# $Id: MultiFile.pm,v 1.8 2002/10/22 07:38:42 lapp Exp $ +# +# BioPerl module for Bio::SeqIO::MultiFile +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::MultiFile - Treating a set of files as a single input stream + +=head1 SYNOPSIS + + $seqin = Bio::SeqIO::MultiFile( '-format' => 'Fasta', + '-files' => ['file1','file2'] ); + while((my $seq = $seqin->next_seq)) { + # do something with $seq + } + +=head1 DESCRIPTION + +Bio::SeqIO::MultiFile provides a simple way of bundling a whole +set of identically formatted sequence input files as a single stream. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqIO::MultiFile; +use strict; +use vars qw(@ISA); +use Bio::SeqIO; + +@ISA = qw(Bio::SeqIO); + + +# _initialize is where the heavy stuff will happen when new is called + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + + my ($file_array,$format) = $self->_rearrange([qw( + FILES + FORMAT + )], + @args, + ); + if( !defined $file_array || ! ref $file_array ) { + $self->throw("Must have an array files for MultiFile"); + } + + if( !defined $format ) { + $self->throw("Must have a format for MultiFile"); + } + + $self->{'_file_array'} = []; + + $self->_set_file(@$file_array); + $self->_format($format); + if( $self->_load_file() == 0 ) { + $self->throw("Unable even to initialise the first file"); + } +} + +=head2 next_seq + + Title : next_seq + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub next_seq{ + my ($self,@args) = @_; + + my $seq = $self->_current_seqio->next_seq(); + if( !defined $seq ) { + if( $self->_load_file() == 0) { + return undef; + } else { + return $self->next_seq(); + } + } else { + return $seq; + } + +} + +=head2 next_primary_seq + + Title : next_primary_seq + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub next_primary_seq{ + my ($self,@args) = @_; + + my $seq = $self->_current_seqio->next_primary_seq(); + if( !defined $seq ) { + if( $self->_load_file() == 0) { + return undef; + } else { + return $self->next_primary_seq(); + } + } else { + return $seq; + } + +} + +=head2 _load_file + + Title : _load_file + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _load_file{ + my ($self,@args) = @_; + + my $file = shift(@{$self->{'_file_array'}}); + if( !defined $file ) { + return 0; + } + my $seqio = Bio::SeqIO->new( '-format' => $self->_format(), -file => $file); + # should throw an exception - but if not... + if( !defined $seqio) { + $self->throw("no seqio built for $file!"); + } + + $self->_current_seqio($seqio); + return 1; +} + +=head2 _set_file + + Title : _set_file + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _set_file{ + my ($self,@files) = @_; + + push(@{$self->{'_file_array'}},@files); + +} + +=head2 _current_seqio + + Title : _current_seqio + Usage : $obj->_current_seqio($newval) + Function: + Example : + Returns : value of _current_seqio + Args : newvalue (optional) + + +=cut + +sub _current_seqio{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_current_seqio'} = $value; + } + return $obj->{'_current_seqio'}; + +} + +=head2 _format + + Title : _format + Usage : $obj->_format($newval) + Function: + Example : + Returns : value of _format + Args : newvalue (optional) + + +=cut + +sub _format{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_format'} = $value; + } + return $obj->{'_format'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/abi.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/abi.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +# $Id: abi.pm,v 1.7 2002/10/22 07:38:42 lapp Exp $ +# BioPerl module for Bio::SeqIO::abi +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::abi - abi trace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from abi trace +files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Aaron Mackey + +Email: amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::abi; +use vars qw(@ISA $READ_AVAIL); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +push @ISA, qw( Bio::SeqIO ); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; + if ($@) { + $READ_AVAIL = 0; + } else { + push @ISA, "Bio::SeqIO::staden::read"; + $READ_AVAIL = 1; + } +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq')); + } + unless ($READ_AVAIL) { + Bio::Root::Root->throw( -class => 'Bio::Root::SystemException', + -text => "Bio::SeqIO::staden::read is not available; make sure the bioperl-ext package has been installed successfully!" + ); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my ($self) = @_; + + my ($seq, $id, $desc, $qual) = $self->read_trace($self->_fh, 'abi'); + + # create the seq object + $seq = $self->sequence_factory->create(-seq => $seq, + -id => $id, + -primary_id => $id, + -desc => $desc, + -alphabet => 'DNA', + -qual => $qual + ); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + + my $fh = $self->_fh; + foreach my $seq (@seq) { + $self->write_trace($fh, $seq, 'abi'); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/ace.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/ace.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,196 @@ +# $Id: ace.pm,v 1.15 2002/10/25 16:23:16 jason Exp $ +# +# BioPerl module for Bio::SeqIO::ace +# +# Cared for by James Gilbert <jgrg@sanger.ac.uk> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::ace - ace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and +from ace file format. It only parses a DNA or +Peptide objects contained in the ace file, +producing PrimarySeq objects from them. All +other objects in the files will be ignored. It +doesn't attempt to parse any annotation attatched +to the containing Sequence or Protein objects, +which would probably be impossible, since +everyone's ACeDB schema can be different. + +It won't parse ace files containing Timestamps +correctly either. This can easily be added if +considered necessary. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - James Gilbert + +Email: jgrg@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::SeqIO::ace; +use strict; +use vars qw(@ISA); + +use Bio::SeqIO; +use Bio::Seq; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::PrimarySeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +{ + my %bio_mol_type = ( + 'dna' => 'dna', + 'peptide' => 'protein', + ); + + sub next_seq { + my( $self ) = @_; + local $/ = ""; # Split input on blank lines + + my $fh = $self->_filehandle; + my( $type, $id ); + while (<$fh>) { + if (($type, $id) = /^(DNA|Peptide)[\s:]+(.+?)\s*\n/si) { + s/^.+$//m; # Remove first line + s/\s+//g; # Remove whitespace + last; + } + } + # Return if there weren't any DNA or peptide objects + return unless $type; + + # Choose the molecule type + my $mol_type = $bio_mol_type{lc $type} + or $self->throw("Can't get Bio::Seq molecule type for '$type'"); + + # Remove quotes from $id + $id =~ s/^"|"$//g; + + # Un-escape forward slashes, double quotes, percent signs, + # semi-colons, tabs, and backslashes (if you're mad enough + # to have any of these as part of object names in your acedb + # database). + $id =~ s/\\([\/"%;\t\\])/$1/g; +#" + # Called as next_seq(), so give back a Bio::Seq + return $self->sequence_factory->create( + -seq => $_, + -primary_id => $id, + -display_id => $id, + -alphabet => $mol_type, + ); + } +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object(s) + + +=cut + +sub write_seq { + my ($self, @seq) = @_; + + foreach my $seq (@seq) { + $self->throw("Did not provide a valid Bio::PrimarySeqI object") + unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); + my $mol_type = $seq->alphabet; + my $id = $seq->display_id; + + # Escape special charachers in id + $id =~ s/([\/"%;\t\\])/\\$1/g; +#" + # Print header for DNA or Protein object + if ($mol_type eq 'dna') { + $self->_print( + qq{\nSequence : "$id"\nDNA "$id"\n}, + qq{\nDNA : "$id"\n}, + ); + } + elsif ($mol_type eq 'protein') { + $self->_print( + qq{\nProtein : "$id"\nPeptide "$id"\n}, + qq{\nPeptide : "$id"\n}, + ); + } + else { + $self->throw("Don't know how to produce ACeDB output for '$mol_type'"); + } + + # Print the sequence + my $str = $seq->seq; + my( $formatted_seq ); + while ($str =~ /(.{1,60})/g) { + $formatted_seq .= "$1\n"; + } + $self->_print($formatted_seq, "\n"); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/alf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/alf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +# $Id: alf.pm,v 1.7 2002/10/22 07:38:42 lapp Exp $ +# BioPerl module for Bio::SeqIO::alf +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::alf - alf trace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from alf trace +files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Aaron Mackey + +Email: amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::alf; +use vars qw(@ISA $READ_AVAIL); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +push @ISA, qw( Bio::SeqIO ); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; + if ($@) { + $READ_AVAIL = 0; + } else { + push @ISA, "Bio::SeqIO::staden::read"; + $READ_AVAIL = 1; + } +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq')); + } + unless ($READ_AVAIL) { + Bio::Root::Root->throw( -class => 'Bio::Root::SystemException', + -text => "Bio::SeqIO::staden::read is not available; make sure the bioperl-ext package has been installed successfully!" + ); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my ($self) = @_; + + my ($seq, $id, $desc, $qual) = $self->read_trace($self->_fh, 'alf'); + + # create the seq object + $seq = $self->sequence_factory->create(-seq => $seq, + -id => $id, + -primary_id => $id, + -desc => $desc, + -alphabet => 'DNA', + -qual => $qual + ); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + + my $fh = $self->_fh; + foreach my $seq (@seq) { + $self->write_trace($fh, $seq, 'alf'); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/bsml.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/bsml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1470 @@ +# +# BioPerl module for Bio::SeqIO::bsml +# +# Cared for by Charles Tilford (tilfordc@bms.com) +# Copyright (C) Charles Tilford 2001 +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# Also at: http://www.gnu.org/copyleft/lesser.html + + +# Much of the basic documentation in this module has been +# cut-and-pasted from the embl.pm (Ewan Birney) SeqIO module. + + +=head1 NAME + +Bio::SeqIO::bsml - BSML sequence input/output stream + +=head1 SYNOPSIS + + It is probably best not to use this object directly, but rather go + through the SeqIO handler system. To read a BSML file: + + $stream = Bio::SeqIO->new( -file => $filename, -format => 'bsml'); + + while ( my $bioSeqObj = $stream->next_seq() ) { + # do something with $bioSeqObj + } + + To write a Seq object to the current file handle in BSML XML format: + + $stream->write_seq( -seq => $seqObj); + + If instead you would like a XML::DOM object containing the BSML, use: + + my $newXmlObject = $stream->to_bsml( -seq => $seqObj); + +=head1 DEPENDENCIES + + In addition to parts of the Bio:: hierarchy, this module uses: + + XML::DOM + +=head1 DESCRIPTION + + This object can transform Bio::Seq objects to and from BSML (XML) + flatfiles. + +=head2 NOTE: + + 2/1/02 - I have changed the API to more closely match argument + passing used by other BioPerl methods ( -tag => value ). Internal + methods are using the same API, but you should not be calling those + anyway... + +=head1 FEEDBACK + +=head2 Mailing Lists + + User feedback is an integral part of the evolution of this and other + Bioperl modules. Send your comments and suggestions preferably to one + of the Bioperl mailing lists. Your participation is much + appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + + Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head2 Things Still to Do + + * The module now uses the new Collection.pm system. However, + Annotations associated with a Feature object still seem to use the + old system, so parsing with the old methods are included.. + + * Generate Seq objects with no sequence data but an assigned + length. This appears to be an issue with Bio::Seq. It is possible + (and reasonable) to make a BSML document with features but no + sequence data. + + * Support <Seq-data-import>. Do not know how commonly this is used. + + * Some features are awaiting implementation in later versions of + BSML. These include: + + * Nested feature support + + * Complex feature (ie joins) + + * Unambiguity in strand (ie -1,0,1, not just 'complement' ) + + * More friendly dblink structures + + * Location.pm (or RangeI::union?) appears to have a bug when 'expand' + is used. + + * More intelligent hunting for sequence and feature titles? It is not + terribly clear where the most appropriate field is located, better + grepping (eg looking for a reasonable count for spaces and numbers) + may allow for titles better than "AE008041". + +=head1 AUTHOR - Charles Tilford + +Bristol-Myers Squibb Bioinformatics + +Email tilfordc@bms.com + +I have developed the BSML specific code for this package, but have used +code from other SeqIO packages for much of the nuts-and-bolts. In particular +I have used code from the embl.pm module either directly or as a framework +for many of the subroutines that are common to SeqIO modules. + +=cut + +package Bio::SeqIO::bsml; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::SeqFeature::Generic; +use Bio::Species; +use XML::DOM; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + +@ISA = qw(Bio::SeqIO); + +my $idcounter = {}; # Used to generate unique id values +my $nvtoken = ": "; # The token used if a name/value pair has to be stuffed + # into a single line + +=head1 METHODS + +=cut + +# LS: this seems to get overwritten on line 1317, generating a redefinition error. Dead code? +# CAT: This was inappropriately added in revision 1.10 - I added the check for existance of a sequence factory to the actual _initialize +# sub _initialize { +# my($self,@args) = @_; +# $self->SUPER::_initialize(@args); +# if( ! defined $self->sequence_factory ) { +# $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::RichSeq')); +# } +# } + +=head2 next_seq + + Title : next_seq + Usage : my $bioSeqObj = $stream->next_seq + Function: Retrieves the next sequence from a SeqIO::bsml stream. + Returns : A reference to a Bio::Seq::RichSeq object + Args : + +=cut + +sub next_seq { + my $self = shift; + my ($desc); + my $bioSeq = $self->sequence_factory->create(-verbose =>$self->verbose()); + + unless (exists $self->{'domtree'}) { + $self->throw("A BSML document has not yet been parsed."); + return undef; + } + my $dom = $self->{'domtree'}; + my $seqElements = $dom->getElementsByTagName ("Sequence"); + if ($self->{'current_node'} == $seqElements->getLength ) { + # There are no more <Sequence>s to process + return undef; + } + my $xmlSeq = $seqElements->item($self->{'current_node'}); + + # Assume that title attribute contains the best display id + if (my $val = $xmlSeq->getAttribute( "title")) { + $bioSeq->display_id($val); + } + + # Set the molecule type + if (my $val = $xmlSeq->getAttribute( "molecule" )) { + my %mol = ('dna' => 'DNA', 'rna' => 'RNA', 'aa' => 'protein'); + $bioSeq->molecule($mol{ lc($val) }); + } + + # Set the accession number + if (my $val = $xmlSeq->getAttribute( "ic-acckey" )) { + $bioSeq->accession_number($val); + } + + # Get the sequence data for the element + if (my $seqData = &FIRSTDATA($xmlSeq->getElementsByTagName("Seq-data") + ->item(0) ) ) { + # Sequence data exists, transfer to the Seq object + # Remove white space and CRs (not neccesary?) + $seqData =~ s/[\s\n\r]//g; + $bioSeq->seq($seqData); + } elsif (my $import = $xmlSeq->getElementsByTagName("Seq-dataimport") + ->item(0) ) { +#>>>> # What about <Seq-data-import> ?? + + } elsif (my $val = $xmlSeq->getAttribute("length")) { + # No sequence defined, set the length directly + +#>>>> # This does not appear to work - length is apparently calculated + # from the sequence. How to make a "virtual" sequence??? Such + # creatures are common in BSML... + $bioSeq->length($val); + } + + my $species = Bio::Species->new(); + my @classification = (); + + # Peruse the generic <Attributes> - those that are direct children of + # the <Sequence> or the <Feature-tables> element + # Sticky wicket here - data not controlled by schema, could be anything + my @seqDesc = (); + my %specs = ('common_name' => 'y', + 'genus' => 'y', + 'species' => 'y', + 'sub_species' => 'y', ); + my %seqMap = ( + 'add_date' => [ 'date' ], + 'keywords' => [ 'keyword', ], + 'seq_version' => [ 'version' ], + 'division' => [ 'division' ], + 'add_secondary_accession' => ['accession'], + 'pid' => ['pid'], + 'primary_id' => [ 'primary.id', 'primary_id' ], + ); + my $floppies = &GETFLOPPIES($xmlSeq); + foreach my $attr (@{$floppies}) { + # Don't want to get attributes from <Feature> or <Table> elements yet + my $parent = $attr->getParentNode->getNodeName; + next unless($parent eq "Sequence" || $parent eq "Feature-tables"); + + my ($name, $content) = &FLOPPYVALS($attr); + $name = lc($name); + if (exists $specs{$name}) { # It looks like part of species... + $species->$name($content); + next; + } + my $value = ""; + # Cycle through the Seq methods: + foreach my $method (keys %seqMap) { + # Cycle through potential matching attributes: + foreach my $match (@{$seqMap{$method}}) { + # If the <Attribute> name matches one of the keys, + # set $value, unless it has already been set + $value ||= $content if ($name =~ /$match/i); + } + if ($value ne "") { + $bioSeq->$method($value); + last; + } + } + next if ($value ne ""); + + if ($name =~ /^species$/i) { # Uh, it's the species designation? + if ($content =~ / /) { + # Assume that a full species name has been provided + # This will screw up if the last word is the subspecies... + my @break = split " ", $content; + @classification = reverse @break; + } else { + $classification[0] = $content; + } + next; + } + if ($name =~ /sub[_ ]?species/i) { # Should be the subspecies... + $species->sub_species( $content ); + next; + } + if ($name =~ /classification/i) { # Should be species classification + # We will assume that there are spaces separating the terms: + my @bits = split " ", $content; + # Now make sure there is not other cruft as well (eg semi-colons) + for my $i (0..$#bits) { + $bits[$i] =~ /(\w+)/; + $bits[$i] = $1; + } + $species->classification( @bits ); + next; + } + if ($name =~ /comment/) { + my $com = Bio::Annotation::Comment->new('-text' => $content); + # $bioSeq->annotation->add_Comment($com); + $bioSeq->annotation->add_Annotation('comment', $com); + next; + } + # Description line - collect all descriptions for later assembly + if ($name =~ /descr/) { + push @seqDesc, $content; + next; + } + # Ok, we have no idea what this attribute is. Dump to SimpleValue + my $simp = Bio::Annotation::SimpleValue->new( -value => $content); + $bioSeq->annotation->add_Annotation($name, $simp); + } + unless ($#seqDesc < 0) { + $bioSeq->desc( join "; ", @seqDesc); + } + +#>>>> This should be modified so that any IDREF associated with the + # <Reference> is then used to associate the reference with the + # appropriate Feature + + # Extract out <Reference>s associated with the sequence + my @refs; + my %tags = ( + -title => "RefTitle", + -authors => "RefAuthors", + -location => "RefJournal", + ); + foreach my $ref ( $xmlSeq->getElementsByTagName ("Reference") ) { + my %refVals; + foreach my $tag (keys %tags) { + my $rt = &FIRSTDATA($ref->getElementsByTagName($tags{$tag}) + ->item(0)); + $rt =~ s/^[\s\r\n]+//; # Kill leading space + $rt =~ s/[\s\r\n]+$//; # Kill trailing space + $rt =~ s/[\s\r\n]+/ /; # Collapse internal space runs + $refVals{$tag} = $rt; + } + my $reference = Bio::Annotation::Reference->new( %refVals ); + + # Pull out any <Reference> information hidden in <Attributes> + my %refMap = ( + comment => [ 'comment', 'remark' ], + medline => [ 'medline', ], + pubmed => [ 'pubmed' ], + start => [ 'start', 'begin' ], + end => [ 'stop', 'end' ], + ); + my @refCom = (); + my $floppies = &GETFLOPPIES($ref); + foreach my $attr (@{$floppies}) { + my ($name, $content) = &FLOPPYVALS($attr); + my $value = ""; + # Cycle through the Seq methods: + foreach my $method (keys %refMap) { + # Cycle through potential matching attributes: + foreach my $match (@{$refMap{$method}}) { + # If the <Attribute> name matches one of the keys, + # set $value, unless it has already been set + $value ||= $content if ($name =~ /$match/i); + } + if ($value ne "") { + my $str = '$reference->' . $method . "($value)"; + eval($str); + next; + } + } + next if ($value ne ""); + # Don't know what the <Attribute> is, dump it to comments: + push @refCom, $name . $nvtoken . $content; + } + unless ($#refCom < 0) { + # Random stuff was found, tack it to the comment field + my $exist = $reference->comment; + $exist .= join ", ", @refCom; + $reference->comment($exist); + } + push @refs, $reference; + } + $bioSeq->annotation->add_Annotation('reference'=>$_) foreach @refs; + + # Extract the <Feature>s for this <Sequence> + foreach my $feat ( $xmlSeq->getElementsByTagName("Feature") ) { + $bioSeq->add_SeqFeature( $self->_parse_bsml_feature($feat) ); + } + + $species->classification( @classification ); + $bioSeq->species( $species ); + +# $seq->annotation->add_DBLink(@links); -> + + $self->{'current_node'}++; + return $bioSeq; +} +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Get all the <Attribute> and <Qualifier> children for an object, and +# return them as an array reference +# ('floppy' since these elements have poor/no schema control) +sub GETFLOPPIES { + my $obj = shift; + + my @floppies; + my $attributes = $obj->getElementsByTagName ("Attribute"); + for (my $i = 0; $i < $attributes->getLength; $i++) { + push @floppies, $attributes->item($i); + } + my $qualifiers = $obj->getElementsByTagName ("Qualifier"); + for (my $i = 0; $i < $qualifiers->getLength; $i++) { + push @floppies, $qualifiers->item($i); + } + return \@floppies; +} +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Given a DOM <Attribute> or <Qualifier> object, return the [name, value] pair +sub FLOPPYVALS { + my $obj = shift; + + my ($name, $value); + if ($obj->getNodeName eq "Attribute") { + $name = $obj->getAttribute('name'); + $value = $obj->getAttribute('content'); + } elsif ($obj->getNodeName eq "Qualifier") { + # Wheras <Attribute>s require both 'name' and 'content' attributes, + # <Qualifier>s can technically have either blank (and sometimes do) + my $n = $obj->getAttribute('value-type'); + $name = $n if ($n ne ""); + my $v = $obj->getAttribute('value'); + $value = $v if ($v ne ""); + } + return ($name, $value); +} +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Returns the value of the first TEXT_NODE encountered below an element +# Rational - avoid grabbing a comment rather than the PCDATA. Not foolproof... +sub FIRSTDATA { + my $element = shift; + return undef unless ($element); + + my $hopefuls = $element->getChildNodes; + my $data; + for (my $i = 0; $i < $hopefuls->getLength; $i++) { + if ($hopefuls->item($i)->getNodeType == + XML::DOM::Node::TEXT_NODE() ) { + $data = $hopefuls->item($i)->getNodeValue; + last; + } + } + return $data; +} +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Just collapses whitespace runs in a string +sub STRIP { + my $string = shift; + $string =~ s/[\s\r\n]+/ /g; + return $string; +} + +=head2 to_bsml + + Title : to_bsml + Usage : my $domDoc = $obj->to_bsml(@args) + Function: Generates an XML structure for one or more Bio::Seq objects. + If $seqref is an array ref, the XML tree generated will include + all the sequences in the array. + Returns : A reference to the XML DOM::Document object generated / modified + Args : Argument array in form of -key => val. Recognized keys: + + -seq A Bio::Seq reference, or an array reference of many of them + + -xmldoc Specifies an existing XML DOM document to add the sequences + to. If included, then only data (no page formatting) will + be added. If not, a new XML::DOM::Document will be made, + and will be populated with both <Sequence> data, as well as + <Page> display elements. + + -nodisp Do not generate <Display> elements, or any children + thereof, even if -xmldoc is not set. + + -skipfeat If set to 'all', all <Feature>s will be skipped. If it is + a hash reference, any <Feature> with a class matching a key + in the hash will be skipped - for example, to skip 'source' + and 'score' features, use: + + -skipfeat => { source => 'Y', score => 'Y' } + + -skiptags As above: if set to 'all', no tags are included, and if a + hash reference, those specific tags will be ignored. + + Skipping some or all tags and features can result in + noticable speed improvements. + + -nodata If true, then <Seq-data> will not be included. This may be + useful if you just want annotations and do not care about + the raw ACTG information. + + -return Default is 'xml', which will return a reference to the BSML + XML object. If set to 'seq' will return an array ref of the + <Sequence> objects added (rather than the whole XML object) + + -close Early BSML browsers will crash if an element *could* have + children but does not, and is closed as an empty element + e.g. <Styles/>. If -close is true, then such tags are given + a comment child to explicitly close them e.g. <Styles><!-- + --></Styles>. This is default true, set to "0" if you do + not want this behavior. + + Examples : my $domObj = $stream->to_bsml( -seq => \@fourCoolSequenceObjects, + -skipfeat => { source => 1 }, + ); + + # Or add sequences to an existing BSML document: + $stream->to_bsml( -seq => \@fourCoolSequenceObjects, + -skipfeat => { source => 1 }, + -xmldoc => $myBsmlDocumentInProgress, ); + +=cut + +sub to_bsml { + my $self = shift; + my $args = $self->_parseparams( -close => 1, + -return => 'xml', + @_); + $args->{NODISP} ||= $args->{NODISPLAY}; + my $seqref = $args->{SEQ}; + $seqref = (ref($seqref) eq 'ARRAY') ? $seqref : [ $seqref ]; + + ############################# + # Basic BSML XML Components # + ############################# + + my $xml; + my ($bsmlElem, $defsElem, $seqsElem, $dispElem); + if ($args->{XMLDOC}) { + # The user has provided an existing XML DOM object + $xml = $args->{XMLDOC}; + unless ($xml->isa("XML::DOM::Document")) { + die ('SeqIO::bsml.pm error:\n'. + 'When calling ->to_bsml( { xmldoc => $myDoc }), $myDoc \n' . + 'should be an XML::DOM::Document object, or an object that\n'. + 'inherits from that class (like BsmlHelper.pm)'); + } + } else { + # The user has not provided a new document, make one from scratch + $xml = XML::DOM::Document->new(); + $xml->setXMLDecl( $xml->createXMLDecl("1.0") ); + my $url = "http://www.labbook.com/dtd/bsml2_2.dtd"; + my $doc = $xml->createDocumentType("Bsml",$url); + $xml->setDoctype($doc); + $bsmlElem = $self->_addel( $xml, 'Bsml'); + $defsElem = $self->_addel( $bsmlElem, 'Definitions'); + $seqsElem = $self->_addel( $defsElem, 'Sequences'); + unless ($args->{NODISP}) { + $dispElem = $self->_addel( $bsmlElem, 'Display'); + my $stylElem = $self->_addel( $dispElem, 'Styles'); + my $style = $self->_addel( $stylElem, 'Style', { + type => "text/css" }); + my $styleText = + qq(Interval-widget { display : "1"; }\n) . + qq(Feature { display-auto : "1"; }); + $style->appendChild( $xml->createTextNode($styleText) ); + } + } + + # Establish fundamental BSML elements, if they do not already exist + $bsmlElem ||= $xml->getElementsByTagName("Bsml")->item(0); + $defsElem ||= $xml->getElementsByTagName("Definitions")->item(0); + $seqsElem ||= $xml->getElementsByTagName("Sequences")->item(0); + + ############### + # <Sequences> # + ############### + + # Map over Bio::Seq to BSML + my %mol = ('dna' => 'DNA', 'rna' => 'RNA', 'protein' => 'AA'); + my @xmlSequences; + + foreach my $bioSeq (@{$seqref}) { + my $xmlSeq = $xml->createElement("Sequence"); + my $FTs = $xml->createElement("Feature-tables"); + + # Array references to hold <Reference> objects: + my $seqRefs = []; my $featRefs = []; + # Array references to hold <Attribute> values (not objects): + my $seqDesc = []; + push @{$seqDesc}, ["comment" , "This file generated to BSML 2.2 standards - joins will be collapsed to a single feature enclosing all members of the join"]; + push @{$seqDesc}, ["description" , eval{$bioSeq->desc}]; + foreach my $kwd ( eval{@{$bioSeq->keywords || []}} ) { + push @{$seqDesc}, ["keyword" , $kwd]; + } + push @{$seqDesc}, ["version" , eval{$bioSeq->seq_version}]; + push @{$seqDesc}, ["division" , eval{$bioSeq->division}]; + push @{$seqDesc}, ["pid" , eval{$bioSeq->pid}]; +# push @{$seqDesc}, ["bio_object" , ref($bioSeq)]; + my $pid = eval{$bioSeq->primary_id} || ''; + if( $pid ne $bioSeq ) { + push @{$seqDesc}, ["primary_id" , eval{$bioSeq->primary_id}]; + } + foreach my $dt (eval{$bioSeq->get_dates()} ) { + push @{$seqDesc}, ["date" , $dt]; + } + foreach my $ac (eval{$bioSeq->get_secondary_accessions()} ) { + push @{$seqDesc}, ["secondary_accession" , $ac]; + } + + # Determine the accession number and a unique identifier + my $acc = $bioSeq->accession_number eq "unknown" ? + "" : $bioSeq->accession_number; + my $id; + my $pi = $bioSeq->primary_id; + if ($pi && $pi !~ /Bio::/) { + # Not sure I understand what primary_id is... It sometimes + # is a string describing a reference to a BioSeq object... + $id = "SEQ" . $bioSeq->primary_id; + } else { + # Nothing useful found, make a new unique ID + $id = $acc || ("SEQ-io" . $idcounter->{Sequence}++); + } + # print "$id->",ref($bioSeq->primary_id),"\n"; + # An id field with spaces is interpreted as an idref - kill the spaces + $id =~ s/ /-/g; + # Map over <Sequence> attributes + my %attr = ( 'title' => $bioSeq->display_id, + 'length' => $bioSeq->length, + 'ic-acckey' => $acc, + 'id' => $id, + 'representation' => 'raw', + ); + $attr{molecule} = $mol{ lc($bioSeq->molecule) } if $bioSeq->can('molecule'); + + + foreach my $a (keys %attr) { + $xmlSeq->setAttribute($a, $attr{$a}) if (defined $attr{$a} && + $attr{$a} ne ""); + } + # Orphaned Attributes: + $xmlSeq->setAttribute('topology', 'circular') + if ($bioSeq->is_circular); + # <Sequence> strand, locus + + $self->_add_page($xml, $xmlSeq) if ($dispElem); + ################ + # <Attributes> # + ################ + + # Check for Bio::Annotations on the * <Sequence> *. + $self->_parse_annotation( -xml => $xml, -obj => $bioSeq, + -desc => $seqDesc, -refs => $seqRefs); + + # Incorporate species data + if (ref($bioSeq->species) eq 'Bio::Species') { + # Need to peer into Bio::Species ... + my @specs = ('common_name', 'genus', 'species', 'sub_species'); + foreach my $sp (@specs) { + next unless (my $val = $bioSeq->species()->$sp()); + push @{$seqDesc}, [$sp , $val]; + } + push @{$seqDesc}, ['classification', + (join " ", $bioSeq->species->classification) ]; + # Species::binomial will return "genus species sub_species" ... + } elsif (my $val = $bioSeq->species) { + # Ok, no idea what it is, just dump it in there... + push @{$seqDesc}, ["species", $val]; + } + + # Add the description <Attribute>s for the <Sequence> + foreach my $seqD (@{$seqDesc}) { + $self->_addel($xmlSeq, "Attribute", { + name => $seqD->[0], content => $seqD->[1]}) if ($seqD->[1]); + } + + # If sequence references were added, make a Feature-table for them + unless ($#{$seqRefs} < 0) { + my $seqFT = $self->_addel($FTs, "Feature-table", { + title => "Sequence References", }); + foreach my $feat (@{$seqRefs}) { + $seqFT->appendChild($feat); + } + } + + # This is the appropriate place to add <Feature-tables> + $xmlSeq->appendChild($FTs); + + ############# + # <Feature> # + ############# + +#>>>> # Perhaps it is better to loop through top_Seqfeatures?... +#>>>> # ...however, BSML does not have a hierarchy for Features + + if (defined $args->{SKIPFEAT} && + $args->{SKIPFEAT} eq 'all') { + $args->{SKIPFEAT} = { all => 1}; + } + foreach my $class (keys %{$args->{SKIPFEAT}}) { + $args->{SKIPFEAT}{lc($class)} = $args->{SKIPFEAT}{$class}; + } + # Loop through all the features + my @features = $bioSeq->all_SeqFeatures(); + if (@features && !$args->{SKIPFEAT}{all}) { + my $ft = $self->_addel($FTs, "Feature-table", { + title => "Features", }); + foreach my $bioFeat (@features ) { + my $featDesc = []; + my $class = lc($bioFeat->primary_tag); + # The user may have specified to ignore this type of feature + next if ($args->{SKIPFEAT}{$class}); + my $id = "FEAT-io" . $idcounter->{Feature}++; + my $xmlFeat = $self->_addel( $ft, 'Feature', { + 'id' => $id, + 'class' => $class , + 'value-type' => $bioFeat->source_tag }); + # Check for Bio::Annotations on the * <Feature> *. + $self->_parse_annotation( -xml => $xml, -obj => $bioFeat, + -desc => $featDesc, -id => $id, + -refs =>$featRefs, ); + # Add the description stuff for the <Feature> + foreach my $de (@{$featDesc}) { + $self->_addel($xmlFeat, "Attribute", { + name => $de->[0], content => $de->[1]}) if ($de->[1]); + } + $self->_parse_location($xml, $xmlFeat, $bioFeat); + + # loop through the tags, add them as <Qualifiers> + next if (defined $args->{SKIPTAGS} && + $args->{SKIPTAGS} =~ /all/i); + # Tags can consume a lot of CPU cycles, and can often be + # rather non-informative, so -skiptags can allow total or + # selective omission of tags. + foreach my $tag ($bioFeat->all_tags()) { + next if (exists $args->{SKIPTAGS}{$tag}); + foreach my $val ($bioFeat->each_tag_value($tag)) { + $self->_addel( $xmlFeat, 'Qualifier', { + 'value-type' => $tag , + 'value' => $val }); + } + } + } + } + + ############## + # <Seq-data> # + ############## + + # Add sequence data + if ( (my $data = $bioSeq->seq) && !$args->{NODATA} ) { + my $d = $self->_addel($xmlSeq, 'Seq-data'); + $d->appendChild( $xml->createTextNode($data) ); + } + + # If references were added, make a Feature-table for them + unless ($#{$featRefs} < 0) { + my $seqFT = $self->_addel($FTs, "Feature-table", { + title => "Feature References", }); + foreach my $feat (@{$featRefs}) { + $seqFT->appendChild($feat); + } + } + + # Place the completed <Sequence> tree as a child of <Sequences> + $seqsElem->appendChild($xmlSeq); + push @xmlSequences, $xmlSeq; + } + + # Prevent browser crashes by explicitly closing empty elements: + if ($args->{CLOSE}) { + my @problemChild = ('Sequences', 'Sequence', 'Feature-tables', + 'Feature-table', 'Screen', 'View',); + foreach my $kid (@problemChild) { + foreach my $prob ($xml->getElementsByTagName($kid)) { + unless ($prob->hasChildNodes) { + $prob->appendChild( + $xml->createComment(" Must close <$kid> explicitly ")); + } + } + } + } + + if (defined $args->{RETURN} && + $args->{RETURN} =~ /seq/i) { + return \@xmlSequences; + } else { + return $xml; + } +} + +=head2 write_seq + + Title : write_seq + Usage : $obj->write_seq(@args) + Function: Prints out an XML structure for one or more Bio::Seq objects. + If $seqref is an array ref, the XML tree generated will include + all the sequences in the array. This method is fairly simple, + most of the processing is performed within to_bsml. + Returns : A reference to the XML object generated / modified + Args : Argument array. Recognized keys: + + -seq A Bio::Seq reference, or an array reference of many of them + + Alternatively, the method may be called simply as... + + $obj->write_seq( $bioseq ) + + ... if only a single argument is passed, it is assumed that + it is the sequence object (can also be an array ref of + many Seq objects ) + +-printmime If true prints "Content-type: $mimetype\n\n" at top of + document, where $mimetype is the value designated by this + key. For generic XML use text/xml, for BSML use text/x-bsml + + -return This option will be supressed, since the nature of this + method is to print out the XML document. If you wish to + retrieve the <Sequence> objects generated, use the to_bsml + method directly. + +=cut + +sub write_seq { + my $self = shift; + my $args = $self->_parseparams( @_); + if ($#_ == 0 ) { + # If only a single value is passed, assume it is the seq object + unshift @_, "-seq"; + } + # Build a BSML XML DOM object based on the sequence(s) + my $xml = $self->to_bsml( @_, + -return => undef ); + # Convert to a string + my $out = $xml->toString; + # Print after putting a return after each element - more readable + $out =~ s/>/>\n/g; + $self->_print("Content-type: " . $args->{PRINTMIME} . "\n\n") + if ($args->{PRINTMIME}); + $self->_print( $out ); + # Return the DOM tree in case the user wants to do something with it + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return $xml; +} + +=head1 INTERNAL METHODS +#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#- + + The following methods are used for internal processing, and should probably + not be accessed by the user. + +=head2 _parse_location + + Title : _parse_location + Usage : $obj->_parse_location($xmlDocument, $parentElem, $SeqFeatureObj) + Function: Adds <Interval-loc> and <Site-loc> children to <$parentElem> based + on locations / sublocations found in $SeqFeatureObj. If + sublocations exist, the original location will be ignored. + Returns : An array ref containing the elements added to the parent. + These will have already been added to <$parentElem> + Args : 0 The DOM::Document being modified + 1 The DOM::Element parent that you want to add to + 2 Reference to the Bio::SeqFeature being analyzed + +=cut + + ############################### + # <Interval-loc> & <Site-loc> # + ############################### + +sub _parse_location { + my $self = shift; + my ($xml, $xmlFeat, $bioFeat) = @_; + my $bioLoc = $bioFeat->location; + my @locations; + if (ref($bioLoc) =~ /Split/) { + @locations = $bioLoc->sub_Location; + # BSML 2.2 does not recognize / support joins. For this reason, + # we will just use the upper-level location. The line below can + # be deleted or commented out if/when BSML 3 supports complex + # interval deffinitions: + @locations = ($bioLoc); + } else { + @locations = ($bioLoc); + } + my @added = (); + + # Add the site or interval positional information: + foreach my $loc (@locations) { + my ($start, $end) = ($loc->start, $loc->end); + my %locAttr; + # Strand information is not well described in BSML + $locAttr{complement} = 1 if ($loc->strand == -1); + if ($start ne "" && ($start == $end || $end eq "")) { + $locAttr{sitepos} = $start; + push @added, $self->_addel($xmlFeat,'Site-loc',\%locAttr); + } elsif ($start ne "" && $end ne "") { + if ($start > $end) { + # The feature is on the complementary strand + ($start, $end) = ($end, $start); + $locAttr{complement} = 1; + } + $locAttr{startpos} = $start; + $locAttr{endpos} = $end; + push @added, $self->_addel($xmlFeat,'Interval-loc',\%locAttr); + } else { + warn "Failure to parse SeqFeature location. Start = '$start' & End = '$end'"; + } + } + return \@added; +} + +=head2 _parse_bsml_feature + + Title : _parse_bsml_feature + Usage : $obj->_parse_bsml_feature($xmlFeature ) + Function: Will examine the <Feature> element provided by $xmlFeature and + return a generic seq feature. + Returns : Bio::SeqFeature::Generic + Args : 0 XML::DOM::Element <Feature> being analyzed. + +=cut + +sub _parse_bsml_feature { + my $self = shift; + my ($feat) = @_; + + my $basegsf = new Bio::SeqFeature::Generic; + # score + # frame + # source_tag + + # Use the class as the primary tag value, if it is present + if ( my $val = $feat->getAttribute("class") ) { + $basegsf->primary_tag($val); + } + + # Positional information is in <Interval-loc>s or <Site-loc>s + # We need to grab these in order, to try to recreate joins... + my @locations = (); + foreach my $kid ($feat->getChildNodes) { + my $nodeName = $kid->getNodeName; + next unless ($nodeName eq "Interval-loc" || + $nodeName eq "Site-loc"); + push @locations, $kid; + } + if ($#locations == 0) { + # There is only one location specified + $self->_parse_bsml_location($locations[0], $basegsf); + } elsif ($#locations > 0) { +#>>>> # This is not working, I think the error is somewhere downstream + # of add_sub_SeqFeature, probably in RangeI::union ? + # The sub features are added fine, but the EXPANDed parent feature + # location has a messed up start - Bio::SeqFeature::Generic ref + # instead of an integer - and an incorrect end - the end of the first + # sub feature added, not of the union of all of them. + + # Also, the SeqIO::genbank.pm output is odd - the sub features appear + # to be listed with the *previous* feature, not this one. + + foreach my $location (@locations) { + my $subgsf = $self->_parse_bsml_location($location); + # print "start ", $subgsf->start,"\n"; + # print "end ", $subgsf->end,"\n"; + $basegsf->add_sub_SeqFeature($subgsf, 'EXPAND'); + } + # print $feat->getAttribute('id'),"\n"; + # print $basegsf->primary_tag,"\n"; + + } else { + # What to do if there are no locations? Nothing needed? + } + + # Look at any <Attribute>s or <Qualifier>s that are present: + my $floppies = &GETFLOPPIES($feat); + foreach my $attr (@{$floppies}) { + my ($name, $content) = &FLOPPYVALS($attr); + + if ($name =~ /xref/i) { + # Do we want to put these in DBLinks?? + } + + # Don't know what the object is, dump it to a tag: + $basegsf->add_tag_value(lc($name), $content); + } + + # Mostly this helps with debugging, but may be of utility... + # Add a tag holding the BSML id value + if ( (my $val = $feat->getAttribute('id')) && + !$basegsf->has_tag('bsml-id')) { + # Decided that this got a little sloppy... +# $basegsf->add_tag_value("bsml-id", $val); + } + return $basegsf; +} + +=head2 _parse_bsml_location + + Title : _parse_bsml_location + Usage : $obj->_parse_bsml_feature( $intOrSiteLoc, $gsfObject ) + Function: Will examine the <Interval-loc> or <Site-loc> element provided + Returns : Bio::SeqFeature::Generic + Args : 0 XML::DOM::Element <Interval/Site-loc> being analyzed. + 1 Optional SeqFeature::Generic to use + +=cut + +sub _parse_bsml_location { + my $self = shift; + my ($loc, $gsf) = @_; + + $gsf ||= new Bio::SeqFeature::Generic; + my $type = $loc->getNodeName; + my ($start, $end); + if ($type eq 'Interval-loc') { + $start = $loc->getAttribute('startpos'); + $end = $loc->getAttribute('endpos'); + } elsif ($type eq 'Site-loc') { + $start = $end = $loc->getAttribute('sitepos'); + } else { + warn "Unknown location type '$type', could not make GSF\n"; + return undef; + } + $gsf->start($start); + $gsf->end($end); + + # BSML does not have an explicit method to set undefined strand + if (my $s = $loc->getAttribute("complement")) { + if ($s) { + $gsf->strand(-1); + } else { + $gsf->strand(1); + } + } else { + # We're setting "strand nonspecific" here - bad idea? + # In most cases the user likely meant it to be on the + strand + $gsf->strand(0); + } + + return $gsf; +} + +=head2 _parse_reference + + Title : _parse_reference + Usage : $obj->_parse_reference(@args ) + Function: Makes a new <Reference> object from a ::Reference, which is + then stored in an array provide by -refs. It will be + appended to the XML tree later. + Returns : + Args : Argument array. Recognized keys: + + -xml The DOM::Document being modified + + -refobj The Annotation::Reference Object + + -refs An array reference to hold the new <Reference> DOM object + + -id Optional. If the XML id for the 'calling' element is + provided, it will be placed in any <Reference> refs + attribute. + +=cut + +sub _parse_reference { + my $self = shift; + my $args = $self->_parseparams( @_); + my ($xml, $ref, $refRef) = ($args->{XML}, $args->{REFOBJ}, $args->{REFS}); + + ############### + # <Reference> # + ############### + + my $xmlRef = $xml->createElement("Reference"); +#>> This may not be the right way to make a BSML dbxref... + if (my $link = $ref->medline) { + $xmlRef->setAttribute('dbxref', $link); + } + + # Make attributes for some of the characteristics + my %stuff = ( start => $ref->start, + end => $ref->end, + rp => $ref->rp, + comment => $ref->comment, + pubmed => $ref->pubmed, + ); + foreach my $s (keys %stuff) { + $self->_addel($xmlRef, "Attribute", { + name => $s, content => $stuff{$s} }) if ($stuff{$s}); + } + $xmlRef->setAttribute('refs', $args->{ID}) if ($args->{ID}); + # Add the basic information + # Should probably check for content before creation... + $self->_addel($xmlRef, "RefAuthors")-> + appendChild( $xml->createTextNode(&STRIP($ref->authors)) ); + $self->_addel($xmlRef, "RefTitle")-> + appendChild( $xml->createTextNode(&STRIP($ref->title)) ); + $self->_addel($xmlRef, "RefJournal")-> + appendChild( $xml->createTextNode(&STRIP($ref->location)) ); + # References will be added later in a <Feature-Table> + push @{$refRef}, $xmlRef; +} + +=head2 _parse_annotation + + Title : _parse_annotation + Usage : $obj->_parse_annotation(@args ) + Function: Will examine any Annotations found in -obj. Data found in + ::Comment and ::DBLink structures, as well as Annotation + description fields are stored in -desc for later + generation of <Attribute>s. <Reference> objects are generated + from ::References, and are stored in -refs - these will + be appended to the XML tree later. + Returns : + Args : Argument array. Recognized keys: + + -xml The DOM::Document being modified + + -obj Reference to the Bio object being analyzed + + -descr An array reference for holding description text items + + -refs An array reference to hold <Reference> DOM objects + + -id Optional. If the XML id for the 'calling' element is + provided, it will be placed in any <Reference> refs + attribute. + +=cut + +sub _parse_annotation { + my $self = shift; + my $args = $self->_parseparams( @_); + my ($xml, $obj, $descRef, $refRef) = + ( $args->{XML}, $args->{OBJ}, $args->{DESC}, $args->{REFS} ); + # No good place to put any of this (except for references). Most stuff + # just gets dumped to <Attribute>s + my $ann = $obj->annotation; + return undef unless ($ann); +# use BMS::Branch; my $debug = BMS::Branch->new( ); warn "$obj :"; $debug->branch($ann); + unless (ref($ann) =~ /Collection/) { + # Old style annotation. It seems that Features still use this + # form of object + $self->_parse_annotation_old(@_); + return; + } + + foreach my $key ($ann->get_all_annotation_keys()) { + foreach my $thing ($ann->get_Annotations($key)) { + if ($key eq 'description') { + push @{$descRef}, ["description" , $thing->value]; + } elsif ($key eq 'comment') { + push @{$descRef}, ["comment" , $thing->text]; + } elsif ($key eq 'dblink') { + # DBLinks get dumped to attributes, too + push @{$descRef}, ["db_xref" , $thing->database . ":" + . $thing->primary_id ]; + if (my $com = $thing->comment) { + push @{$descRef}, ["link" , $com->text ]; + } + + } elsif ($key eq 'reference') { + $self->_parse_reference( @_, -refobj => $thing ); + } elsif (ref($thing) =~ /SimpleValue/) { + push @{$descRef}, [$key , $thing->value]; + } else { + # What is this?? + push @{$descRef}, ["error", "bsml.pm did not understand ". + "'$key' = '$thing'" ]; + } + } + } +} + +=head2 _parse_annotation_old + + Title : _parse_annotation_old + Usage : $obj->_parse_annotation_old(@args) + Function: As above, but for the old Annotation system. + Apparently needed because Features are still using the old-style + annotations? + Returns : + Args : Argument array. Recognized keys: + + -xml The DOM::Document being modified + + -obj Reference to the Bio object being analyzed + + -descr An array reference for holding description text items + + -refs An array reference to hold <Reference> DOM objects + + -id Optional. If the XML id for the 'calling' element is + provided, it will be placed in any <Reference> refs + attribute. + +=cut + + ############### + # <Reference> # + ############### + +sub _parse_annotation_old { + my $self = shift; + my $args = $self->_parseparams( @_); + my ($xml, $obj, $descRef, $refRef) = + ( $args->{XML}, $args->{OBJ}, $args->{DESC}, $args->{REFS} ); + # No good place to put any of this (except for references). Most stuff + # just gets dumped to <Attribute>s + if (my $ann = $obj->annotation) { + push @{$descRef}, ["annotation", $ann->description]; + foreach my $com ($ann->each_Comment) { + push @{$descRef}, ["comment" , $com->text]; + } + + # Gene names just get dumped to <Attribute name="gene"> + foreach my $gene ($ann->each_gene_name) { + push @{$descRef}, ["gene" , $gene]; + } + + # DBLinks get dumped to attributes, too + foreach my $link ($ann->each_DBLink) { + push @{$descRef}, ["db_xref" , + $link->database . ":" . $link->primary_id ]; + if (my $com = $link->comment) { + push @{$descRef}, ["link" , $com->text ]; + } + } + + # References get produced and temporarily held + foreach my $ref ($ann->each_Reference) { + $self->_parse_reference( @_, -refobj => $ref ); + } + } +} + +=head2 _add_page + + Title : _add_page + Usage : $obj->_add_page($xmlDocument, $xmlSequenceObject) + Function: Adds a simple <Page> and <View> structure for a <Sequence> + Returns : a reference to the newly created <Page> + Args : 0 The DOM::Document being modified + 1 Reference to the <Sequence> object + +=cut + +sub _add_page { + my $self = shift; + my ($xml, $seq) = @_; + my $disp = $xml->getElementsByTagName("Display")->item(0); + my $page = $self->_addel($disp, "Page"); + my ($width, $height) = ( 7.8, 5.5); + my $screen = $self->_addel($page, "Screen", { + width => $width, height => $height, }); +# $screen->appendChild($xml->createComment("Must close explicitly")); + my $view = $self->_addel($page, "View", { + seqref => $seq->getAttribute('id'), + title => $seq->getAttribute('title'), + title1 => "{NAME}", + title2 => "{LENGTH} {UNIT}", + }); + $self->_addel($view, "View-line-widget", { + shape => 'horizontal', + hcenter => $width/2 + 0.7, + 'linear-length' => $width - 2, + }); + $self->_addel($view, "View-axis-widget"); + return $page; +} + + +=head2 _addel + + Title : _addel + Usage : $obj->_addel($parentElem, 'ChildName', + { anAttr => 'someValue', anotherAttr => 'aValue',}) + Function: Add an element with attribute values to a DOM tree + Returns : a reference to the newly added element + Args : 0 The DOM::Element parent that you want to add to + 1 The name of the new child element + 2 Optional hash reference containing + attribute name => attribute value assignments + +=cut + +sub _addel { + my $self = shift; + my ($root, $name, $attr) = @_; + + # Find the DOM::Document for the parent + my $doc = $root->getOwnerDocument || $root; + my $elem = $doc->createElement($name); + foreach my $a (keys %{$attr}) { + $elem->setAttribute($a, $attr->{$a}); + } + $root->appendChild($elem); + return $elem; +} + +=head2 _show_dna + + Title : _show_dna + Usage : $obj->_show_dna($newval) + Function: (cut-and-pasted directly from embl.pm) + Returns : value of _show_dna + Args : newvalue (optional) + +=cut + +sub _show_dna { + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_show_dna'} = $value; + } + return $obj->{'_show_dna'}; +} + +=head2 _initialize + + Title : _initialize + Usage : $dom = $obj->_initialize(@args) + Function: Coppied from embl.pm, and augmented with initialization of the + XML DOM tree + Returns : + Args : -file => the XML file to be parsed + +=cut + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + # hash for functions for decoding keys. + $self->{'_func_ftunit_hash'} = {}; + $self->_show_dna(1); # sets this to one by default. People can change it + + my %param = @args; # From SeqIO.pm + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + if ( exists $param{-file} && $param{-file} !~ /^>/) { + # Is it blasphemy to add your own keys to an object in another package? + # domtree => the parsed DOM tree retruned by XML::DOM + $self->{'domtree'} = $self->_parse_xml( $param{-file} ); + # current_node => the <Sequence> node next in line for next_seq + $self->{'current_node'} = 0; + } + + $self->sequence_factory( new Bio::Seq::SeqFactory + ( -verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')) + if( ! defined $self->sequence_factory ); +} + + +=head2 _parseparams + + Title : _parseparams + Usage : my $paramHash = $obj->_parseparams(@args) + Function: Borrowed from Bio::Parse.pm, who borrowed it from CGI.pm + Lincoln Stein -> Richard Resnick -> here + Returns : A hash reference of the parameter keys (uppercase) pointing to + their values. + Args : An array of key, value pairs. Easiest to pass values as: + -key1 => value1, -key2 => value2, etc + Leading "-" are removed. + +=cut + +sub _parseparams { + my $self = shift; + my %hash = (); + my @param = @_; + + # Hacked out from Parse.pm + # The next few lines strip out the '-' characters which + # preceed the keys, and capitalizes them. + for (my $i=0;$i<@param;$i+=2) { + $param[$i]=~s/^\-//; + $param[$i]=~tr/a-z/A-Z/; + } + pop @param if @param %2; # not an even multiple + %hash = @param; + return \%hash; +} + +=head2 _parse_xml + + Title : _parse_xml + Usage : $dom = $obj->_parse_xml($filename) + Function: uses XML::DOM to construct a DOM tree from the BSML document + Returns : a reference to the parsed DOM tree + Args : 0 Path to the XML file needing to be parsed + +=cut + +sub _parse_xml { + my $self = shift; + my $file = shift; + + unless (-e $file) { + $self->throw("Could not parse non-existant XML file '$file'."); + return undef; + } + my $parser = new XML::DOM::Parser; + my $doc = $parser->parsefile ($file); + return $doc; +} + +sub DESTROY { + my $self = shift; + # Reports off the net imply that DOM::Parser will memory leak if you + # do not explicitly dispose of it: + # http://aspn.activestate.com/ASPN/Mail/Message/perl-xml/788458 + my $dom = $self->{'domtree'}; + # For some reason the domtree can get undef-ed somewhere... + $dom->dispose if ($dom); +} + + +=head1 TESTING SCRIPT + + The following script may be used to test the conversion process. You + will need a file of the format you wish to test. The script will + convert the file to BSML, store it in /tmp/bsmltemp, read that file + into a new SeqIO stream, and write it back as the original + format. Comparison of this second file to the original input file + will allow you to track where data may be lost or corrupted. Note + that you will need to specify $readfile and $readformat. + + use Bio::SeqIO; + # Tests preservation of details during round-trip conversion: + # $readformat -> BSML -> $readformat + my $tempspot = "/tmp/bsmltemp"; # temp folder to hold generated files + my $readfile = "rps4y.embl"; # The name of the file you want to test + my $readformat = "embl"; # The format of the file being tested + + system "mkdir $tempspot" unless (-d $tempspot); + # Make Seq object from the $readfile + my $biostream = Bio::SeqIO->new( -file => "$readfile" ); + my $seq = $biostream->next_seq(); + + # Write BSML from SeqObject + my $bsmlout = Bio::SeqIO->new( -format => 'bsml', + -file => ">$tempspot/out.bsml"); + warn "\nBSML written to $tempspot/out.bsml\n"; + $bsmlout->write_seq($seq); + # Need to kill object for following code to work... Why is this so? + $bsmlout = ""; + + # Make Seq object from BSML + my $bsmlin = Bio::SeqIO->new( -file => "$tempspot/out.bsml", + -format => 'bsml'); + my $seq2 = $bsmlin->next_seq(); + + # Write format back from Seq Object + my $genout = Bio::SeqIO->new( -format => $readformat, + -file => ">$tempspot/out.$readformat"); + $genout->write_seq($seq2); + warn "$readformat written to $tempspot/out.$readformat\n"; + + # BEING LOST: + # Join information (not possible in BSML 2.2) + # Sequence type (??) + +=cut + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/chado.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/chado.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,417 @@ +# $Id: chado.pm,v 1.1 2002/12/03 08:13:55 cjm Exp $ +# +# BioPerl module for Bio::SeqIO::chado +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::chado - chado sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'chado'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from chado flat +file databases. CURRENTLY ONLY TO + + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::chado; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + +use Data::Stag qw(:all); + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } + my $wclass = $self->default_handler_class; + $self->handler($wclass->new); + $self->{_end_of_data} = 0; + $self->handler->S("chado"); + return; +} + +sub DESTROY { + my $self = shift; + $self->end_of_data(); + $self->SUPER::DESTROY(); +} + +sub end_of_data { + my $self = shift; + $self->{_end_of_data} = 1; + $self->handler->E("chado"); +} + +sub default_handler_class { + return "Data::Stag::BaseHandler"; +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + +=cut + +sub next_seq { + my ($self,@args) = @_; + my $seq = $self->sequence_factory->create + ( + # '-verbose' =>$self->verbose(), + # %params, + # -seq => $seqc, + # -annotation => $annotation, + # -features => \@features + ); + return $seq; +} + +sub handler { + my $self = shift; + $self->{_handler} = shift if @_; + return $self->{_handler}; +} + + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq + + +=cut + +sub write_seq { + my ($self,$seq) = @_; + + if( !defined $seq ) { + $self->throw("Attempting to write with no seq!"); + } + + if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { + $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); + } + + # get a handler - must inherit from Data::Stag::BaseHandler; + my $w = $self->handler; + + # start of data + $w->S("seqset"); + + # my $seq_temp_uid = $self->get_temp_uid($seq); + + my $seq_temp_uid = $seq->accession . '.' . ($seq->can('seq_version') ? $seq->seq_version : $seq->version); + + # data structure representing the core sequence for this record + my $seqnode = + Data::Stag->new(feature=>[ + [feature_id=>$seq_temp_uid], + [dbxrefstr=>$seq->accession_number], + [name=>$seq->display_name], + [residues=>$seq->seq], + ]); + + # soft properties + my %prop = (); + + my ($div, $mol); + my $len = $seq->length(); + + if ( $seq->can('division') ) { + $div=$seq->division; + } + if( !defined $div || ! $div ) { $div = 'UNK'; } + + if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) { + $mol = $seq->alphabet || 'DNA'; + } + + + my $circular = 'linear '; + $circular = 'circular' if $seq->is_circular; + + # cheeky hack - access symbol table + no strict 'refs'; + map { + $prop{$_} = + $ {*$_}; + } qw(mol div circular); + use strict 'refs'; + + map { + $prop{$_} = $seq->$_() if $seq->can($_); + } qw(desc keywords); + + local($^W) = 0; # supressing warnings about uninitialized fields. + + # Organism lines + if (my $spec = $seq->species) { + my ($species, $genus, @class) = $spec->classification(); + my $OS; + if( $spec->common_name ) { + $OS = $spec->common_name; + } else { + $OS = "$genus $species"; + } + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + } + + # Reference lines + my $count = 1; + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + # TODO + } + # Comment lines + + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + $seqnode->add_featureprop([[pkey=>'comment'],[pval=>$comment->text]]); + } + + # throw the writer an event + $w->ev(@$seqnode); + + $seqnode = undef; # free memory + + # make events for all the features within the record + foreach my $sf ( $seq->top_SeqFeatures ) { + $self->write_sf($sf, $seq_temp_uid); + } + + # data end + $w->E("seqset"); + return 1; +} + +# ---- +# writes a seq feature +# ---- + +sub write_sf { + my $self = shift; + my $sf = shift; + my $seq_temp_uid = shift; + + my $w = $self->handler; + + my %props = + map { + $_=>[$sf->each_tag_value($_)] + } $sf->all_tags; + + my $loc = $sf->location; + my $name = $sf->display_name; + my $type = $sf->primary_tag; + my @subsfs = $sf->sub_SeqFeature; + my @locnodes = (); + my $sid = $loc->is_remote ? $loc->seq_id : $seq_temp_uid; + if( $loc->isa("Bio::Location::SplitLocationI") ) { + # turn splitlocs into subfeatures + my $n = 1; + push(@subsfs, + map { + my $ssf = + Bio::SeqFeature::Generic->new( + + -start=>$_->start, + -end=>$_->end, + -strand=>$_->strand, + -primary=>$self->subpartof($type), + ); + if ($_->is_remote) { + $ssf->location->is_remote(1); + $ssf->location->seq_id($_->seq_id); + } + $ssf; + } $loc->each_Location); + } + elsif( $loc->isa("Bio::Location::RemoteLocationI") ) { + # turn splitlocs into subfeatures + my $n = 1; + push(@subsfs, + map { + Bio::SeqFeature::Generic->new( +# -name=>$name.'.'.$n++, + -start=>$_->start, + -end=>$_->end, + -strand=>$_->strand, + -primary=>$self->subpartof($type), + ) + } $loc->each_Location); + } + else { + my ($beg, $end, $strand) = $self->bp2ib($loc); + @locnodes = ( + [featureloc=>[ + [nbeg=>$beg], + [nend=>$end], + [strand=>$strand], + [srcfeature_id=>$sid], + [group=>0], + [rank=>0], + ] + ] + ); + } + my $feature_id = $self->get_temp_uid($sf); + + my $fnode = + [feature=>[ + [feature_id=>$feature_id], + [name=>$name], + [typename=>$type], + @locnodes, + (map { + my $k = $_; + map { [featureprop=>[[pkey=>$k],[pval=>$_]]] } @{$props{$k}} + } keys %props), + ]]; + $w->ev(@$fnode); + + foreach my $ssf (@subsfs) { + my $ssfid = $self->write_sf($ssf, $sid); + $w->ev(feature_relationship=>[ + [subjfeature_id=>$ssfid], + [objfeature_id=>$feature_id] + ] + ); + } + return $feature_id; +} + +# private; +# an ID for this session that should be +# unique... hmm +sub session_id { + my $self = shift; + $self->{_session_id} = shift if @_; + if (!$self->{_session_id}) { + $self->{_session_id} = $$.time; + } + return $self->{_session_id}; +} + + +our $next_id = 1; +our %obj2id_hash = (); +sub get_temp_uid { + my $self = shift; + my $ob = shift; + my $id = $obj2id_hash{$ob}; + if (!$id) { + $id = $next_id++; + $obj2id_hash{$ob} = $id; + } + return $self->session_id.'.'.$id; +} + +# interbase and directional semantics +sub bp2ib { + my $self = shift; + my $loc = shift; + my ($s, $e, $str) = + ref($loc) eq "ARRAY" ? (@$loc) : ($loc->start, $loc->end, $loc->strand); + if ($str < 0) { + ($s, $e) = ($e, $s); + } + $s--; + return ($s, $e, $str); +} + +sub subpartof { + my $self = shift; + my $type = 'partof_'.shift; + $type =~ s/partof_CDS/CDS_exon/; + $type =~ s/partof_\wRNA/exon/; + return $type; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/chadoitext.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/chadoitext.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,78 @@ +# $Id: chadoitext.pm,v 1.2 2002/12/05 13:46:36 heikki Exp $ +# +# BioPerl module for Bio::SeqIO::chadoitext +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::chadoitext - chadoitext sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'chadoitext'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from chadoitext flat +file databases. CURRENTLY ONLY TO + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::chadoitext; +use Bio::SeqIO::chado; +use vars qw(@ISA); +use strict; + +use Data::Stag::ITextWriter; + +@ISA = qw(Bio::SeqIO::chado); + +sub default_handler_class { + return "Data::Stag::ITextWriter"; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/chadosxpr.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/chadosxpr.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,80 @@ +# $Id: chadosxpr.pm,v 1.2 2002/12/05 13:46:36 heikki Exp $ +# +# BioPerl module for Bio::SeqIO::chadosxpr +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::chadosxpr - chadosxpr sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'chadosxpr'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from chadosxpr flat +file databases. CURRENTLY ONLY TO + + + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::chadosxpr; +use Bio::SeqIO::chado; +use vars qw(@ISA); +use strict; + +use Data::Stag::SxprWriter; + +@ISA = qw(Bio::SeqIO::chado); + +sub default_handler_class { + return "Data::Stag::SxprWriter"; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/chadoxml.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/chadoxml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,79 @@ +# $Id: chadoxml.pm,v 1.2 2002/12/05 13:46:36 heikki Exp $ +# +# BioPerl module for Bio::SeqIO::chadoxml +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::chadoxml - chadoxml sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'chadoxml'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from chadoxml flat +file databases. CURRENTLY ONLY TO + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::chadoxml; +use Bio::SeqIO::chado; +use vars qw(@ISA); +use strict; + +use Data::Stag::XMLWriter; + +@ISA = qw(Bio::SeqIO::chado); + +sub default_handler_class { + return "Data::Stag::XMLWriter"; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/ctf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/ctf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +# $Id: ctf.pm,v 1.8 2002/10/22 07:38:42 lapp Exp $ +# BioPerl module for Bio::SeqIO::ctf +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::ctf - ctf trace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from ctf trace +files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Aaron Mackey + +Email: amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::ctf; +use vars qw(@ISA $READ_AVAIL); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +push @ISA, qw( Bio::SeqIO ); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; + if ($@) { + $READ_AVAIL = 0; + } else { + push @ISA, "Bio::SeqIO::staden::read"; + $READ_AVAIL = 1; + } +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); + } + unless ($READ_AVAIL) { + Bio::Root::Root->throw( -class => 'Bio::Root::SystemException', + -text => "Bio::SeqIO::staden::read is not available; make sure the bioperl-ext package has been installed successfully!" + ); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my ($self) = @_; + + my ($seq, $id, $desc, $qual) = $self->read_trace($self->_fh, 'ctf'); + + # create the seq object + $seq = $self->sequence_factory->create(-seq => $seq, + -id => $id, + -primary_id => $id, + -desc => $desc, + -alphabet => 'DNA', + -qual => $qual + ); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + + my $fh = $self->_fh; + foreach my $seq (@seq) { + $self->write_trace($fh, $seq, 'ctf'); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/embl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/embl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1240 @@ +# $Id: embl.pm,v 1.57.2.6 2003/09/14 19:06:51 jason Exp $ +# +# BioPerl module for Bio::SeqIO::EMBL +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::embl - EMBL sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the AnnSeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL'); + + while ( (my $seq = $stream->next_seq()) ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from EMBL flat +file databases. + +There is alot of flexibility here about how to dump things which I need +to document fully. + +There should be a common object that this and genbank share (probably +with swissprot). Too much of the magic is identical. + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + +=item _id_generation_func() + +This is function which is called as + + print "ID ", $func($annseq), "\n"; + +To generate the ID line. If it is not there, it generates a sensible ID +line using a number of tools. + +If you want to output annotations in embl format they need to be +stored in a Bio::Annotation::Collection object which is accessible +through the Bio::SeqI interface method L<annotation()|annotation>. + +The following are the names of the keys which are polled from a +L<Bio::Annotation::Collection> object. + +reference - Should contain Bio::Annotation::Reference objects +comment - Should contain Bio::Annotation::Comment objects +dblink - Should contain Bio::Annotation::DBLink objects + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqIO::embl; +use vars qw(@ISA); +use strict; +use Bio::SeqIO::FTHelper; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + # hash for functions for decoding keys. + $self->{'_func_ftunit_hash'} = {}; + $self->_show_dna(1); # sets this to one by default. People can change it + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + +=cut + +sub next_seq { + my ($self,@args) = @_; + my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, + $date, $comment, @date_arr); + + my ($annotation, %params, @features) = ( new Bio::Annotation::Collection); + + $line = $self->_readline; # This needs to be before the first eof() test + + if( !defined $line ) { + return undef; # no throws - end of file + } + + if( $line =~ /^\s+$/ ) { + while( defined ($line = $self->_readline) ) { + $line =~/^\S/ && last; + } + } + if( !defined $line ) { + return undef; # end of file + } + $line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book"); + $line =~ /^ID\s+(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/; + $name = $1; + $mol = $2; + $div = $3; + if(! $name) { + $name = "unknown id"; + } + my $alphabet; + + # this is important to have the id for display in e.g. FTHelper, otherwise + # you won't know which entry caused an error + if($mol) { + if ( $mol =~ /circular/ ) { + $params{'-is_circular'} = 1; + $mol =~ s|circular ||; + } + if (defined $mol ) { + if ($mol =~ /DNA/) { + $alphabet='dna'; + } + elsif ($mol =~ /RNA/) { + $alphabet='rna'; + } + elsif ($mol =~ /AA/) { + $alphabet='protein'; + } + } + + } + +# $self->warn("not parsing upper annotation in EMBL file yet!"); + my $buffer = $line; + + local $_; + + BEFORE_FEATURE_TABLE : + until( !defined $buffer ) { + $_ = $buffer; + + # Exit at start of Feature table + last if /^F[HT]/; + + # Description line(s) + if (/^DE\s+(\S.*\S)/) { + $desc .= $desc ? " $1" : $1; + } + + #accession number + if( /^AC\s+(.*)?/ ) { + my @accs = split(/[; ]+/, $1); # allow space in addition + $params{'-accession_number'} = shift @accs + unless defined $params{'-accession_number'}; + push @{$params{'-secondary_accessions'}}, @accs; + } + + #version number + if( /^SV\s+\S+\.(\d+);?/ ) { + my $sv = $1; + #$sv =~ s/\;//; + $params{'-seq_version'} = $sv; + $params{'-version'} = $sv; + } + + #date (NOTE: takes last date line) + if( /^DT\s+(.+)$/ ) { + my $date = $1; + push @{$params{'-dates'}}, $date; + } + + #keywords + if( /^KW\s+(.*)\S*$/ ) { + my @kw = split(/\s*\;\s*/,$1); + push @{$params{'-keywords'}}, @kw; + } + + # Organism name and phylogenetic information + elsif (/^O[SC]/) { + my $species = $self->_read_EMBL_Species(\$buffer); + $params{'-species'}= $species; + } + + # References + elsif (/^R/) { + my @refs = $self->_read_EMBL_References(\$buffer); + foreach my $ref ( @refs ) { + $annotation->add_Annotation('reference',$ref); + } + } + + # DB Xrefs + elsif (/^DR/) { + my @links = $self->_read_EMBL_DBLink(\$buffer); + foreach my $dblink ( @links ) { + $annotation->add_Annotation('dblink',$dblink); + } + } + + # Comments + elsif (/^CC\s+(.*)/) { + $comment .= $1; + $comment .= " "; + while (defined ($_ = $self->_readline) ) { + if (/^CC\s+(.*)/) { + $comment .= $1; + $comment .= " "; + } + else { + last; + } + } + my $commobj = Bio::Annotation::Comment->new(); + $commobj->text($comment); + $annotation->add_Annotation('comment',$commobj); + $comment = ""; + } + + # Get next line. + $buffer = $self->_readline; + } + + while( defined ($_ = $self->_readline) ) { + /^FT \w/ && last; + /^SQ / && last; + /^CO / && last; + } + $buffer = $_; + + if (defined($buffer) && $buffer =~ /^FT /) { + until( !defined ($buffer) ) { + my $ftunit = $self->_read_FTHelper_EMBL(\$buffer); + # process ftunit + + push(@features, + $ftunit->_generic_seqfeature($self->location_factory(), $name)); + + if( $buffer !~ /^FT/ ) { + last; + } + } + } + + + # skip comments + while( defined ($buffer) && $buffer =~ /^XX/ ) { + $buffer = $self->_readline(); + } + + if( $buffer =~ /^CO/ ) { + until( !defined ($buffer) ) { + my $ftunit = $self->_read_FTHelper_EMBL(\$buffer); + # process ftunit + push(@features, + $ftunit->_generic_seqfeature($self->location_factory(), + $name)); + + if( $buffer !~ /^CO/ ) { + last; + } + } + } + if( $buffer !~ /^SQ/ ) { + while( defined ($_ = $self->_readline) ) { + /^SQ/ && last; + } + } + + $seqc = ""; + while( defined ($_ = $self->_readline) ) { + /^\/\// && last; + $_ = uc($_); + s/[^A-Za-z]//g; + $seqc .= $_; + } + my $seq = $self->sequence_factory->create + (-verbose => $self->verbose(), + -division => $div, + -seq => $seqc, + -desc => $desc, + -display_id => $name, + -annotation => $annotation, + -molecule => $mol, + -alphabet => $alphabet, + -features => \@features, + %params); + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : array of 1 to n Bio::SeqI objects + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + + foreach my $seq ( @seqs ) { + $self->throw("Attempting to write with no seq!") unless defined $seq; + unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) { + $self->warn("$seq is not a SeqI compliant sequence object!") + if $self->verbose >= 0; + unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) { + $self->throw("$seq is not a PrimarySeqI compliant sequence object!"); + } + } + my $str = $seq->seq || ''; + + my $mol; + my $div; + my $len = $seq->length(); + + if ($seq->can('division') && defined $seq->division) { + $div = $seq->division(); + } + $div ||= 'UNK'; + + if ($seq->can('molecule')) { + $mol = $seq->molecule(); + $mol = 'RNA' if defined $mol && $mol =~ /RNA/; # no 'mRNA' + } + elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) { + my $alphabet =$seq->primary_seq->alphabet; + if ($alphabet eq 'dna') { + $mol ='DNA'; + } + elsif ($alphabet eq 'rna') { + $mol='RNA'; + } + elsif ($alphabet eq 'protein') { + $mol='AA'; + } + } + $mol ||= 'XXX'; + $mol = "circular $mol" if $seq->is_circular; + + my $temp_line; + if( $self->_id_generation_func ) { + $temp_line = &{$self->_id_generation_func}($seq); + } else { + $temp_line = sprintf("%-11sstandard; $mol; $div; %d BP.", $seq->id(), $len); + } + + $self->_print( "ID $temp_line\n","XX\n"); + + # Write the accession line if present + my( $acc ); + { + if( my $func = $self->_ac_generation_func ) { + $acc = &{$func}($seq); + } elsif( $seq->isa('Bio::Seq::RichSeqI') && + defined($seq->accession_number) ) { + $acc = $seq->accession_number; + $acc = join(";", $acc, $seq->get_secondary_accessions); + } elsif ( $seq->can('accession_number') ) { + $acc = $seq->accession_number; + } + + if (defined $acc) { + $self->_print("AC $acc;\n", + "XX\n"); + } + } + + # Write the sv line if present + { + my( $sv ); + if (my $func = $self->_sv_generation_func) { + $sv = &{$func}($seq); + } elsif($seq->isa('Bio::Seq::RichSeqI') && + defined($seq->seq_version)) { + $sv = "$acc.". $seq->seq_version(); + } + if (defined $sv) { + $self->_print( "SV $sv\n", + "XX\n"); + } + } + + # Date lines + my $switch=0; + if( $seq->can('get_dates') ) { + foreach my $dt ( $seq->get_dates() ) { + $self->_write_line_EMBL_regex("DT ","DT ",$dt,'\s+|$',80);#' + $switch=1; + } + if ($switch == 1) { + $self->_print("XX\n"); + } + } + + # Description lines + $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80); #' + $self->_print( "XX\n"); + + # if there, write the kw line + { + my( $kw ); + if( my $func = $self->_kw_generation_func ) { + $kw = &{$func}($seq); + } elsif( $seq->can('keywords') ) { + $kw = $seq->keywords; + if( ref($kw) =~ /ARRAY/i ) { + $kw = join("; ", @$kw); + } + $kw .= '.' if( defined $kw && $kw !~ /\.$/ ); + } + if (defined $kw) { + $self->_write_line_EMBL_regex("KW ", "KW ", + $kw, '\s+|$', 80); #' + $self->_print( "XX\n"); + + } + } + + # Organism lines + + if ($seq->can('species') && (my $spec = $seq->species)) { + my($species, @class) = $spec->classification(); + my $genus = $class[0]; + my $OS = "$genus $species"; + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + if (my $common = $spec->common_name) { + $OS .= " ($common)"; + } + $self->_print("OS $OS\n"); + my $OC = join('; ', reverse(@class)) .'.'; + $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80); #' + if ($spec->organelle) { + $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80); #' + } + $self->_print("XX\n"); + } + + # Reference lines + my $t = 1; + if ( $seq->can('annotation') && defined $seq->annotation ) { + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + $self->_print( "RN [$t]\n"); + + # Having no RP line is legal, but we need both + # start and end for a valid location. + my $start = $ref->start; + my $end = $ref->end; + if ($start and $end) { + $self->_print( "RP $start-$end\n"); + } elsif ($start or $end) { + $self->throw("Both start and end are needed for a valid RP line. Got: start='$start' end='$end'"); + } + + if (my $med = $ref->medline) { + $self->_print( "RX MEDLINE; $med.\n"); + } + if (my $pm = $ref->pubmed) { + $self->_print( "RX PUBMED; $pm.\n"); + } + $self->_write_line_EMBL_regex("RA ", "RA ", + $ref->authors . ";", + '\s+|$', 80); #' + + # If there is no title to the reference, it appears + # as a single semi-colon. All titles must end in + # a semi-colon. + my $ref_title = $ref->title || ''; + $ref_title =~ s/[\s;]*$/;/; + $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80); #' + $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80); #' + if ($ref->comment) { + $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80); #' + } + $self->_print("XX\n"); + $t++; + } + + + # DB Xref lines + if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) { + foreach my $dr (@db_xref) { + my $db_name = $dr->database; + my $prim = $dr->primary_id; + my $opt = $dr->optional_id || ''; + + my $line = "$db_name; $prim; $opt."; + $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80); #' + } + $self->_print("XX\n"); + } + + # Comment lines + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80); #' + $self->_print("XX\n"); + } + } + # "\\s\+\|\$" + + ## FEATURE TABLE + + $self->_print("FH Key Location/Qualifiers\n"); + $self->_print("FH\n"); + + my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : (); + if ($feats[0]) { + if( defined $self->_post_sort ) { + # we need to read things into an array. + # Process. Sort them. Print 'em + + my $post_sort_func = $self->_post_sort(); + my @fth; + + foreach my $sf ( @feats ) { + push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); + } + + @fth = sort { &$post_sort_func($a,$b) } @fth; + + foreach my $fth ( @fth ) { + $self->_print_EMBL_FTHelper($fth); + } + } else { + # not post sorted. And so we can print as we get them. + # lower memory load... + + foreach my $sf ( @feats ) { + my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); + foreach my $fth ( @fth ) { + if( $fth->key eq 'CONTIG') { + $self->_show_dna(0); + } + $self->_print_EMBL_FTHelper($fth); + } + } + } + } + + if( $self->_show_dna() == 0 ) { + $self->_print( "//\n"); + return; + } + $self->_print( "XX\n"); + + # finished printing features. + + $str =~ tr/A-Z/a-z/; + + # Count each nucleotide + my $alen = $str =~ tr/a/a/; + my $clen = $str =~ tr/c/c/; + my $glen = $str =~ tr/g/g/; + my $tlen = $str =~ tr/t/t/; + + my $olen = $len - ($alen + $tlen + $clen + $glen); + if( $olen < 0 ) { + $self->warn("Weird. More atgc than bases. Problem!"); + } + + $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n"); + + my $nuc = 60; # Number of nucleotides per line + my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line + my $out_pat = 'A11' x 6; # Pattern for packing a line + my $length = length($str); + + # Calculate the number of nucleotides which fit on whole lines + my $whole = int($length / $nuc) * $nuc; + + # Print the whole lines + my( $i ); + for ($i = 0; $i < $whole; $i += $nuc) { + my $blocks = pack $out_pat, + unpack $whole_pat, + substr($str, $i, $nuc); + $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)); + } + + # Print the last line + if (my $last = substr($str, $i)) { + my $last_len = length($last); + my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10; + my $blocks = pack $out_pat, + unpack($last_pat, $last); + $self->_print(sprintf(" $blocks%9d\n", $length)); # Add the length to the end + } + + $self->_print( "//\n"); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; + } +} + +=head2 _print_EMBL_FTHelper + + Title : _print_EMBL_FTHelper + Usage : + Function: Internal function + Returns : + Args : + + +=cut + +sub _print_EMBL_FTHelper { + my ($self,$fth,$always_quote) = @_; + $always_quote ||= 0; + + if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!"); + } + + + #$self->_print( "FH Key Location/Qualifiers\n"); + #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc)); + # let + if( $fth->key eq 'CONTIG' ) { + $self->_print("XX\n"); + $self->_write_line_EMBL_regex("CO ", + "CO ",$fth->loc, + '\,|$',80); #' + return; + } + $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key), + "FT ",$fth->loc, + '\,|$',80); #' + foreach my $tag ( keys %{$fth->field} ) { + if( ! defined $fth->field->{$tag} ) { next; } + foreach my $value ( @{$fth->field->{$tag}} ) { + $value =~ s/\"/\"\"/g; + if ($value eq "_no_value") { + $self->_write_line_EMBL_regex("FT ", + "FT ", + "/$tag",'.|$',80); #' + } + elsif( $always_quote == 1 || $value !~ /^\d+$/ ) { + my $pat = $value =~ /\s/ ? '\s|$' : '.|$'; + $self->_write_line_EMBL_regex("FT ", + "FT ", + "/$tag=\"$value\"",$pat,80); + } + else { + $self->_write_line_EMBL_regex("FT ", + "FT ", + "/$tag=$value",'.|$',80); #' + } + # $self->_print( "FT /", $tag, "=\"", $value, "\"\n"); + } + } +} + +#' +=head2 _read_EMBL_References + + Title : _read_EMBL_References + Usage : + Function: Reads references from EMBL format. Internal function really + Example : + Returns : + Args : + + +=cut + +sub _read_EMBL_References { + my ($self,$buffer) = @_; + my (@refs); + + # assumme things are starting with RN + + if( $$buffer !~ /^RN/ ) { + warn("Not parsing line '$$buffer' which maybe important"); + } + my $b1; + my $b2; + my $title; + my $loc; + my $au; + my $med; + my $pm; + my $com; + + while( defined ($_ = $self->_readline) ) { + /^R/ || last; + /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;}; + /^RX MEDLINE;\s+(\d+)/ && do {$med=$1}; + /^RX PUBMED;\s+(\d+)/ && do {$pm=$1}; + /^RA (.*)/ && do { + $au = $self->_concatenate_lines($au,$1); next; + }; + /^RT (.*)/ && do { + $title = $self->_concatenate_lines($title,$1); next; + }; + /^RL (.*)/ && do { + $loc = $self->_concatenate_lines($loc,$1); next; + }; + /^RC (.*)/ && do { + $com = $self->_concatenate_lines($com,$1); next; + }; + } + + my $ref = new Bio::Annotation::Reference; + $au =~ s/;\s*$//g; + $title =~ s/;\s*$//g; + + $ref->start($b1); + $ref->end($b2); + $ref->authors($au); + $ref->title($title); + $ref->location($loc); + $ref->medline($med); + $ref->comment($com); + $ref->pubmed($pm); + + push(@refs,$ref); + $$buffer = $_; + + return @refs; +} + +=head2 _read_EMBL_Species + + Title : _read_EMBL_Species + Usage : + Function: Reads the EMBL Organism species and classification + lines. + Example : + Returns : A Bio::Species object + Args : a reference to the current line buffer + +=cut + +sub _read_EMBL_Species { + my( $self, $buffer ) = @_; + my $org; + + $_ = $$buffer; + my( $sub_species, $species, $genus, $common, @class ); + while (defined( $_ ||= $self->_readline )) { + + if (/^OS\s+(\S+)(?:\s+([^\(]\S*))?(?:\s+([^\(]\S*))?(?:\s+\((.*)\))?/) { + $genus = $1; + $species = $2 || 'sp.'; + $sub_species = $3 if $3; + $common = $4 if $4; + } + elsif (s/^OC\s+//) { + # only split on ';' or '.' so that + # classification that is 2 words will + # still get matched + # use map to remove trailing/leading spaces + chomp; + push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/); + } + elsif (/^OG\s+(.*)/) { + $org = $1; + } + else { + last; + } + + $_ = undef; # Empty $_ to trigger read of next line + } + + $$buffer = $_; + + # Don't make a species object if it is "Unknown" or "None" + return if $genus =~ /^(Unknown|None)$/i; + + # Bio::Species array needs array in Species -> Kingdom direction + if ($class[$#class] eq $genus) { + push( @class, $species ); + } else { + push( @class, $genus, $species ); + } + @class = reverse @class; + + my $make = Bio::Species->new(); + $make->classification( \@class, "FORCE" ); # no name validation please + $make->common_name( $common ) if $common; + $make->sub_species( $sub_species ) if $sub_species; + $make->organelle ( $org ) if $org; + return $make; +} + +=head2 _read_EMBL_DBLink + + Title : _read_EMBL_DBLink + Usage : + Function: Reads the EMBL database cross reference ("DR") lines + Example : + Returns : A list of Bio::Annotation::DBLink objects + Args : + +=cut + +sub _read_EMBL_DBLink { + my( $self,$buffer ) = @_; + my( @db_link ); + + $_ = $$buffer; + while (defined( $_ ||= $self->_readline )) { + + if (my($databse, $prim_id, $sec_id) + = /^DR ([^\s;]+);\s*([^\s;]+);\s*([^\s;]+)?\.$/) { + my $link = Bio::Annotation::DBLink->new(); + $link->database ( $databse ); + $link->primary_id ( $prim_id ); + $link->optional_id( $sec_id ) if $sec_id; + push(@db_link, $link); + } + else { + last; + } + + $_ = undef; # Empty $_ to trigger read of next line + } + + $$buffer = $_; + + return @db_link; +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +sub _filehandle{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_filehandle'} = $value; + } + return $obj->{'_filehandle'}; + +} + +=head2 _read_FTHelper_EMBL + + Title : _read_FTHelper_EMBL + Usage : _read_FTHelper_EMBL($buffer) + Function: reads the next FT key line + Example : + Returns : Bio::SeqIO::FTHelper object + Args : filehandle and reference to a scalar + + +=cut + +sub _read_FTHelper_EMBL { + my ($self,$buffer) = @_; + + my ($key, # The key of the feature + $loc, # The location line from the feature + @qual, # An arrray of lines making up the qualifiers + ); + + if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/) { + $key = $1; + $loc = $2; + # Read all the lines up to the next feature + while ( defined($_ = $self->_readline) ) { + if (/^FT(\s+)(.+?)\s*$/) { + # Lines inside features are preceeded by 19 spaces + # A new feature is preceeded by 3 spaces + if (length($1) > 4) { + # Add to qualifiers if we're in the qualifiers + if (@qual) { + push(@qual, $2); + } + # Start the qualifier list if it's the first qualifier + elsif (substr($2, 0, 1) eq '/') { + @qual = ($2); + } + # We're still in the location line, so append to location + else { + $loc .= $2; + } + } else { + # We've reached the start of the next feature + last; + } + } else { + # We're at the end of the feature table + last; + } + } + } elsif( $$buffer =~ /^CO\s+(\S+)/) { + $key = 'CONTIG'; + $loc = $1; + # Read all the lines up to the next feature + while ( defined($_ = $self->_readline) ) { + if (/^CO\s+(\S+)\s*$/) { + $loc .= $1; + } else { + # We've reached the start of the next feature + last; + } + } + } else { + # No feature key + return; + } + + # Put the first line of the next feature into the buffer + $$buffer = $_; + + # Make the new FTHelper object + my $out = new Bio::SeqIO::FTHelper(); + $out->verbose($self->verbose()); + $out->key($key); + $out->loc($loc); + + # Now parse and add any qualifiers. (@qual is kept + # intact to provide informative error messages.) + QUAL: for (my $i = 0; $i < @qual; $i++) { + $_ = $qual[$i]; + my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?} + or $self->throw("Can't see new qualifier in: $_\nfrom:\n" + . join('', map "$_\n", @qual)); + if (defined $value) { + # Do we have a quoted value? + if (substr($value, 0, 1) eq '"') { + # Keep adding to value until we find the trailing quote + # and the quotes are balanced + while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #" + $i++; + my $next = $qual[$i]; + unless (defined($next)) { + warn("Unbalanced quote in:\n", map("$_\n", @qual), + "No further qualifiers will be added for this feature"); + last QUAL; + } + + # Join to value with space if value or next line contains a space + $value .= (grep /\s/, ($value, $next)) ? " $next" : $next; + } + # Trim leading and trailing quotes + $value =~ s/^"|"$//g; + # Undouble internal quotes + $value =~ s/""/"/g; #" + } + } else { + $value = '_no_value'; + } + + # Store the qualifier + $out->field->{$qualifier} ||= []; + push(@{$out->field->{$qualifier}},$value); + } + + return $out; +} + +=head2 _write_line_EMBL + + Title : _write_line_EMBL + Usage : + Function: internal function + Example : + Returns : + Args : + + +=cut + +sub _write_line_EMBL { + my ($self,$pre1,$pre2,$line,$length) = @_; + + $length || die "Miscalled write_line_EMBL without length. Programming error!"; + my $subl = $length - length $pre2; + my $linel = length $line; + my $i; + + my $sub = substr($line,0,$length - length $pre1); + + $self->_print( "$pre1$sub\n"); + + for($i= ($length - length $pre1);$i < $linel;) { + $sub = substr($line,$i,($subl)); + $self->_print( "$pre2$sub\n"); + $i += $subl; + } + +} + +=head2 _write_line_EMBL_regex + + Title : _write_line_EMBL_regex + Usage : + Function: internal function for writing lines of specified + length, with different first and the next line + left hand headers and split at specific points in the + text + Example : + Returns : nothing + Args : file handle, first header, second header, text-line, regex for line breaks, total line length + + +=cut + +sub _write_line_EMBL_regex { + my ($self,$pre1,$pre2,$line,$regex,$length) = @_; + + #print STDOUT "Going to print with $line!\n"; + + $length || die "Programming error - called write_line_EMBL_regex without length."; + + my $subl = $length - (length $pre1) -1 ; + + + + my( @lines ); + while(defined $line && + $line =~ m/(.{1,$subl})($regex)/g) { + push(@lines, $1.$2); + } + foreach (@lines) { s/\s+$//; } + + # Print first line + my $s = shift(@lines) || ''; + $self->_print( "$pre1$s\n"); + + # Print the rest + foreach my $s ( @lines ) { + $s = '' if( !defined $s ); + $self->_print( "$pre2$s\n"); + } +} + +=head2 _post_sort + + Title : _post_sort + Usage : $obj->_post_sort($newval) + Function: + Returns : value of _post_sort + Args : newvalue (optional) + + +=cut + +sub _post_sort{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_post_sort'} = $value; + } + return $obj->{'_post_sort'}; + +} + +=head2 _show_dna + + Title : _show_dna + Usage : $obj->_show_dna($newval) + Function: + Returns : value of _show_dna + Args : newvalue (optional) + + +=cut + +sub _show_dna{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_show_dna'} = $value; + } + return $obj->{'_show_dna'}; + +} + +=head2 _id_generation_func + + Title : _id_generation_func + Usage : $obj->_id_generation_func($newval) + Function: + Returns : value of _id_generation_func + Args : newvalue (optional) + + +=cut + +sub _id_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_id_generation_func'} = $value; + } + return $obj->{'_id_generation_func'}; + +} + +=head2 _ac_generation_func + + Title : _ac_generation_func + Usage : $obj->_ac_generation_func($newval) + Function: + Returns : value of _ac_generation_func + Args : newvalue (optional) + + +=cut + +sub _ac_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_ac_generation_func'} = $value; + } + return $obj->{'_ac_generation_func'}; + +} + +=head2 _sv_generation_func + + Title : _sv_generation_func + Usage : $obj->_sv_generation_func($newval) + Function: + Returns : value of _sv_generation_func + Args : newvalue (optional) + + +=cut + +sub _sv_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_sv_generation_func'} = $value; + } + return $obj->{'_sv_generation_func'}; + +} + +=head2 _kw_generation_func + + Title : _kw_generation_func + Usage : $obj->_kw_generation_func($newval) + Function: + Returns : value of _kw_generation_func + Args : newvalue (optional) + + +=cut + +sub _kw_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_kw_generation_func'} = $value; + } + return $obj->{'_kw_generation_func'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/exp.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/exp.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +# $Id: exp.pm,v 1.8 2002/10/22 07:38:42 lapp Exp $ +# BioPerl module for Bio::SeqIO::exp +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::exp - exp trace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from exp trace +files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Aaron Mackey + +Email: amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::exp; +use vars qw(@ISA $READ_AVAIL); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +push @ISA, qw( Bio::SeqIO ); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; + if ($@) { + $READ_AVAIL = 0; + } else { + push @ISA, "Bio::SeqIO::staden::read"; + $READ_AVAIL = 1; + } +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); + } + unless ($READ_AVAIL) { + Bio::Root::Root->throw( -class => 'Bio::Root::SystemException', + -text => "Bio::SeqIO::staden::read is not available; make sure the bioperl-ext package has been installed successfully!" + ); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my ($self) = @_; + + my ($seq, $id, $desc, $qual) = $self->read_trace($self->_fh, 'exp'); + + # create the seq object + $seq = $self->sequence_factory->create(-seq => $seq, + -id => $id, + -primary_id => $id, + -desc => $desc, + -alphabet => 'DNA', + -qual => $qual + ); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + + my $fh = $self->_fh; + foreach my $seq (@seq) { + $self->write_trace($fh, $seq, 'exp'); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,271 @@ +# $Id: fasta.pm,v 1.41.2.4 2003/09/18 02:43:16 jason Exp $ +# BioPerl module for Bio::SeqIO::fasta +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# and Lincoln Stein <lstein@cshl.org> +# +# Copyright Ewan Birney & Lincoln Stein +# +# You may distribute this module under the same terms as perl itself +# _history +# October 18, 1999 Largely rewritten by Lincoln Stein + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::fasta - fasta sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from fasta flat +file databases. + +A method L<preferred_id_type()> can be used to specify the type of ID +we would like to parse from the fasta line. By default 'display' is +used, which means it parses everything from the '>' to the first space +and makes that the 'display_id' for the sequence. + +Can be one of: + - accession + - accession.version + - display + - primary + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via the +web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Ewan Birney & Lincoln Stein + +Email: birney@ebi.ac.uk + lstein@cshl.org + +=head1 CONTRIBUTORS + +Jason Stajich, jason-at-bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::fasta; +use vars qw(@ISA $WIDTH @SEQ_ID_TYPES $DEFAULT_SEQ_ID_TYPE); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; +use Bio::Seq::SeqFastaSpeedFactory; + +@ISA = qw(Bio::SeqIO); + +@SEQ_ID_TYPES = qw(accession accession.version display primary); +$DEFAULT_SEQ_ID_TYPE = 'display'; + +BEGIN { $WIDTH = 60} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + my ($width) = $self->_rearrange([qw(WIDTH)], @args); + $width && $self->width($width); + unless ( defined $self->sequence_factory ) { + $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new()); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +sub next_seq { + my( $self ) = @_; + my $seq; + my $alphabet; + local $/ = "\n>"; + return unless my $entry = $self->_readline; + + chomp($entry); + if ($entry =~ m/\A\s*\Z/s) { # very first one + return unless $entry = $self->_readline; + chomp($entry); + } + $entry =~ s/^>//; + + my ($top,$sequence) = split(/\n/,$entry,2); + defined $sequence && $sequence =~ s/>//g; +# my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s +# or $self->throw("Can't parse fasta entry"); + + my ($id,$fulldesc); + if( $top =~ /^\s*(\S+)\s*(.*)/ ) { + ($id,$fulldesc) = ($1,$2); + } + + if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space + # between > and name \AE + defined $sequence && $sequence =~ s/\s//g; # Remove whitespace + + # for empty sequences we need to know the mol.type + $alphabet = $self->alphabet(); + if(defined $sequence && length($sequence) == 0) { + if(! defined($alphabet)) { + # let's default to dna + $alphabet = "dna"; + } + } else { + # we don't need it really, so disable + $alphabet = undef; + } + + $seq = $self->sequence_factory->create( + -seq => $sequence, + -id => $id, + # Ewan's note - I don't think this healthy + # but obviously to taste. + #-primary_id => $id, + -desc => $fulldesc, + -alphabet => $alphabet, + -direct => 1, + ); + + + + + # if there wasn't one before, set the guessed type + unless ( defined $alphabet ) { + $self->alphabet($seq->alphabet()); + } + return $seq; + +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : array of 1 to n Bio::PrimarySeqI objects + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + my $width = $self->width; + foreach my $seq (@seq) { + $self->throw("Did not provide a valid Bio::PrimarySeqI object") + unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); + + my $str = $seq->seq; + my $top; + + # Allow for different ids + my $id_type = $self->preferred_id_type; + if( $id_type =~ /^acc/i ) { + $top = $seq->accession_number(); + if( $id_type =~ /vers/i ) { + $top .= "." . $seq->version(); + } + } elsif($id_type =~ /^displ/i ) { + $top = $seq->display_id(); + } elsif($id_type =~ /^pri/i ) { + $top = $seq->primary_id(); + } + + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + if(length($str) > 0) { + $str =~ s/(.{1,$width})/$1\n/g; + } else { + $str = "\n"; + } + $self->_print (">",$top,"\n",$str) or return; + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 width + + Title : width + Usage : $obj->width($newval) + Function: Get/Set the line width for FASTA output + Returns : value of width + Args : newvalue (optional) + + +=cut + +sub width{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'width'} = $value; + } + return $self->{'width'} || $WIDTH; +} + +=head2 preferred_id_type + + Title : preferred_id_type + Usage : $obj->preferred_id_type('accession') + Function: Get/Set the preferred type of identifier to use in the ">ID" position + for FASTA output. + Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES. + Default = $Bio::SeqIO::fasta::DEFAULT_SEQ_ID_TYPE ('display'). + Args : string when setting. This must be one of values defined in + @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values: + accession, accession.version, display, primary + Throws : fatal exception if the supplied id type is not in @SEQ_ID_TYPES. + +=cut + +sub preferred_id_type { + my ($self,$type) = @_; + if( defined $type ) { + if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) { + $self->throw(-class=>'Bio::Root::BadParameter', + -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES"); + } + $self->{'_seq_id_type'} = lc($type); +# print STDERR "Setting preferred_id_type=$type\n"; + } + $self->{'_seq_id_type'} || $DEFAULT_SEQ_ID_TYPE; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/fastq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/fastq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +# BioPerl module for Bio::SeqIO::fastq +# +# Cared for by Tony Cox <avc@sanger.ac.uk> +# +# Copyright Tony Cox +# +# You may distribute this module under the same terms as perl itself +# _history +# October 29, 2001 incept data + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::fastq - fastq sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq and Bio::Seq::SeqWithQuality +objects to and from fastq flat file databases. + +Fastq is a file format used frequently at the Sanger Centre to bundle +a fasta sequence and its quality data. A typical fastaq entry takes +the from: + + @HCDPQ1D0501 + GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT..... + +HCDPQ1D0501 + !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65..... + +Fastq files have sequence and quality data on a single line and the +quality values are single-byte encoded. To retrieve the decimal values +for qualities you need to subtract 33 (or Octal 41) from each byte and +then convert to a '2 digit + 1 space' integer. You can check if 33 is +the right number because the first byte which is always '!' +corresponds to a quality value of 0. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Tony Cox + +Email: avc@sanger.ac.uk + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::fastq; +use vars qw(@ISA); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); + } +} + + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my( $self ) = @_; + my $seq; + my $alphabet; + local $/ = "\n\@"; + + return unless my $entry = $self->_readline; + + if ($entry eq '@') { # very first one + return unless $entry = $self->_readline; + } + my ($top,$sequence,$top2,$qualsequence) = $entry =~ /^ + \@?(.+?)\n + ([^\@]*?)\n + \+?(.+?)\n + (.*)\n + /xs + or $self->throw("Can't parse fastq entry"); + my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/ + or $self->throw("Can't parse fastq header"); + if ($id eq '') {$id=$fulldesc;} # FIX incase no space between \@ and name + $sequence =~ s/\s//g; # Remove whitespace + $qualsequence =~ s/\s//g; + + if(length($sequence) != length($qualsequence)){ + $self->warn("Fastq sequence/quality data length mismatch error\n"); + $self->warn("Sequence: $top, seq length: ",length($sequence), " Qual length: ", length($qualsequence), " \n"); + $self->warn("$sequence\n"); + $self->warn("$qualsequence\n"); + $self->warn("FROM ENTRY: \n\n$entry\n"); + } + + my @qual = split('', $qualsequence); + + my $qual; + foreach (@qual) {$qual .= (unpack("C",$_) - 33) ." "}; + + + # for empty sequences we need to know the mol.type + $alphabet = $self->alphabet(); + if(length($sequence) == 0) { + if(! defined($alphabet)) { + # let's default to dna + $alphabet = "dna"; + } + } else { + # we don't need it really, so disable + $alphabet = undef; + } + + # create the SeqWithQuality object + $seq = $self->sequence_factory->create( + -qual => $qual, + -seq => $sequence, + -id => $id, + -primary_id => $id, + -desc => $fulldesc, + -alphabet => $alphabet + ); + + # if there wasn't one before, set the guessed type + $self->alphabet($seq->alphabet()); + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq::SeqWithQuality or Bio::seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + my $str = $seq->seq; + my $top = $seq->display_id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + if(length($str) > 0) { + $str =~ s/(.{1,60})/$1\n/g; + } else { + $str = "\n"; + } + + $self->_print (">",$top,"\n",$str) or return; + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 write_qual + + Title : write_qual + Usage : $stream->write_qual(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq::SeqWithQuality object + + +=cut + +sub write_qual { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + unless ($seq->isa("Bio::Seq::SeqWithQuality")){ + warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); + next; + } + my @qual = @{$seq->qual}; + my $top = $seq->display_id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + my $qual = "" ; + if(scalar(@qual) > 0) { + my $max = 60; + for (my $q = 0;$q<scalar(@qual);$q++){ + $qual .= $qual[$q] . " "; + if(length($qual) > $max){ + $qual .= "\n"; + $max += 60; + } + } + } else { + $qual = "\n"; + } + + $self->_print (">",$top,"\n",$qual,"\n") or return; + } + return 1; +} + +=head2 write_fastq + + Title : write_fastq + Usage : $stream->write_fastq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq::SeqWithQuality object + + +=cut + +sub write_fastq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + unless ($seq->isa("Bio::Seq::SeqWithQuality")){ + warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); + next; + } + my $str = $seq->seq; + my @qual = @{$seq->qual}; + my $top = $seq->display_id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + if(length($str) == 0) { + $str = "\n"; + } + my $qual = "" ; + if(scalar(@qual) > 0) { + for (my $q = 0;$q<scalar(@qual);$q++){ + $qual .= chr($qual[$q] + 33); + } + } else { + $qual = "\n"; + } + + $self->_print ("\@",$top,"\n",$str,"\n") or return; + $self->_print ("+",$top,"\n",$qual,"\n") or return; + } + return 1; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/game.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/game.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,672 @@ +# $Id: game.pm,v 1.26.2.1 2003/06/28 22:23:15 jason Exp $ +# +# BioPerl module for Bio::SeqIO::game +# +# Cared for by Brad Marshall <bradmars@yahoo.com> +# +# Copyright Ewan Birney & Lincoln Stein & Brad Marshall +# +# You may distribute this module under the same terms as perl itself +# _history +# June 25, 2000 written by Brad Marshall +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::game - Parses GAME XML 0.1 and higher into and out of Bio::Seq objects. + +=head1 SYNOPSIS + +To use this module you need XML::Parser, XML::Parser::PerlSAX +and XML::Writer. + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from bioxml seq, +computation, feature and annotation dtds,versions 0.1 and higher. +These can be found at http://www.bioxml.org/dtds/current. It does +this using the idHandler, seqHandler and featureHandler modules you +should have gotten with this one. + +The idea is that any bioxml features can be turned into bioperl +annotations. When Annotations and computations are parsed in, they +gain additional info in the bioperl SeqFeature tag attribute. These +can be used to reconstitute a computation or annotation by the bioxml +with the bx-handler module when write_seq is called. + +If you use this to write SeqFeatures that were not generated from +computations or annotations, it will output a list of bioxml features. +Some data may be lost in this step, since bioxml features just have a +span, type and description - nothing about the anlysis performed. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - Technical bioperl discussion + bioxml-dev@bioxml.org - Technical discussion - Moderate volume + bioxml-announce@bioxml.org - General Announcements - Pretty dead + http://www.bioxml.org/MailingLists/ - About the mailing lists + +=head1 AUTHOR - Brad Marshall & Ewan Birney & Lincoln Stein + +Email: bradmars@yahoo.com + birney@sanger.ac.uk + lstein@cshl.org + + +=head1 CONTRIBUTORS + +Jason Stajich E<lt>jason@bioperl.orgE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::game; +use vars qw(@ISA); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::SeqIO::game::seqHandler; +use Bio::SeqIO::game::featureHandler; +use Bio::SeqIO::game::idHandler; +use XML::Parser::PerlSAX; +use Bio::SeqFeature::Generic; +use XML::Writer; + +use Bio::Seq; + +@ISA = qw(Bio::SeqIO); + + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + my $xmlfile = ""; + $self->{'counter'} = 0; + $self->{'id_counter'} = 1; + $self->{'leftovers'} = undef; + $self->{'header'} = undef; + $self->{'chunkable'} = undef; + $self->{'xmldoc'} = undef; + + $self->_export_subfeatures(1); + $self->_group_subfeatures(1); + $self->_subfeature_types('exons', 'promoters','poly_A_sites', + 'utrs','introns','sub_SeqFeature'); + + # filehandle is stored by superclass _initialize +} + + +=head2 _export_subfeatures + + Title : _export_subfeatures + Usage : $obj->_export_subfeatures + Function: export all subfeatures (also in the geneprediction structure) + Returns : value of _export_subfeatures + Args : newvalue (optional) + +=cut + +sub _export_subfeatures{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_export_subfeatures'} = $value; + } + return $obj->{'_export_subfeatures'}; + +} + +=head2 _group_subfeatures + + Title : _group_subfeatures + Usage : $obj->_group_subfeatures + Function: Groups all subfeatures in separate feature_sets + Returns : value of _group_subfeatures + Args : newvalue (optional) + +=cut + +sub _group_subfeatures{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_group_subfeatures'} = $value; + } + return $obj->{'_group_subfeatures'}; +} + +=head2 _subfeature_types + + Title : _subfeature_types + Usage : $obj->_subfeature_types + Function: array of all possible subfeatures, it should be a + name of a function which + : returns an arrau of sub_seqfeatures when called: + @array = $feature->subfeaturetype() + Returns : array of _subfeature_types + Args : array of subfeature types (optional) + +=cut + +sub _subfeature_types{ + my $obj = shift; + if( @_ ) { + my @values = @_; + $obj->{'_subfeature_types'} = \@values; + } + return @{$obj->{'_subfeature_types'}}; + +} + +=head2 _add_subfeature_type + + Title : _add_subfeature_type + Usage : $obj->_add_subfeature_type + Function: add one possible subfeature, it should be a name of a function which + : returns an arrau of sub_seqfeatures when called: @array = $feature->subfeaturetyp() + Returns : 1 + Args : one subfeature type (optional) + +=cut + +sub _add_subfeature_type{ + my $obj = shift; + if( @_ ) { + my @values = @_; + push @{$obj->{'_subfeature_types'}}, @values; + } + return 1; + +} + + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +sub next_seq { + my $self = shift; + + + # The header is the top level stuff in the XML file. + # IE before the first <bx-seq:seq> tag. + # If you don't include this in each 'chunk', the + # parser will barf. + my $header; + unless ($self->{'header'}) { + while (my $next_line = $self->_readline) { + if($next_line=~/<bx-seq:seq?/) { + $header .= $`; + $self->{'header'}=$header; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } else { + $header .= $next_line; + } + } + if ($self->{'header'}=~m|<bx-game:flavor>.*chunkable.*</bx-game:flavor>|) { + $self->{'chunkable'}=1; + } + + } + + my $not_top_level; + my $xmldoc; + my $seq; + # If chunkable, we read in the document until the next + # TOP LEVEL sequence. + if ($self->{'chunkable'}) { + $xmldoc = $self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + # Maintain depth of computations and annotations. + # We only want TOP LEVEL seqs if chunkable. + while ($next_line=~ m|<bx-computation:computation|g) { + $not_top_level++; + } + while ($next_line=~ m|<bx-annotation:annotation|g) { + $not_top_level++; + } + while ($next_line=~ m|</bx-computation:computation|g) { + $not_top_level--; + } + while ($next_line=~ m|</bx-annotation:annotation|g) { + $not_top_level--; + } + if($next_line=~/<bx-seq:seq?/) { + if (!$not_top_level) { + $xmldoc .= $`; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } + } else { + $xmldoc .= $next_line; + } + } + # Make sure the 'doc chunk' has a closing tag + # to make the parser happy. + if (!$xmldoc=~m|</bx-game:game>|){ + $xmldoc .= "</bx-game:game>"; + } + # Grab the TOP LEVEL seq.. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + } else { # No sequences. + return 0; + } + # Get the seq out of the array. + $seq = @{$self->{'seqs'}}[0]; + # If not chunkable, + # only read document into memory once! + } elsif (!$self->{'xmldoc'}) { + $self->{'xmldoc'}=$self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + $self->{'xmldoc'} .= $next_line; + } + $xmldoc=$self->{'xmldoc'}; + # Get the seq id index. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + $seq = shift @{$self->{'seqs'}}; + } else { # No sequences. + return 0; + } + my $seq = @{$self->{'seqs'}}[0]; + # if we already have the doc in memory, + # just get the doc. + } elsif ($self->{'xmldoc'}) { + $xmldoc=$self->{'xmldoc'}; + $seq = shift @{$self->{'seqs'}}; + } + # If there's more sequences: + if ($seq) { + # Get the next seq. + my $handler = Bio::SeqIO::game::seqHandler->new(-seq => $seq); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + my $pseq = $parser->parse(Source => { String => $xmldoc }); + # get the features. + my $fhandler = Bio::SeqIO::game::featureHandler->new($pseq->id(), + $pseq->length(), + $pseq->alphabet()); + $options = {Handler=>$fhandler}; + + $parser = XML::Parser::PerlSAX->new($options); + my $features = $parser->parse(Source => { String => $xmldoc }); + my $seq = Bio::Seq->new(); + # Build the Bioperl Seq and return it. + foreach my $feature (@{$features}) { + $seq->add_SeqFeature($feature); + } + $seq->primary_seq($pseq); + return $seq; + } else { + return 0; + } +} + +=head2 next_primary_seq + + Title : next_primary_seq + Usage : $seq = $stream->next_primary_seq() + Function: returns the next primary sequence (ie no seq_features) in the stream + Returns : Bio::PrimarySeq object + Args : NONE + +=cut + +sub next_primary_seq { + my $self=shift; + + # The header is the top level stuff in the XML file. + # IE before the first <bx-seq:seq> tag. + # If you don't include this in each 'chunk', the + # parser will barf. + my $header; + unless ($self->{'header'}) { + while (my $next_line = $self->_readline) { + if($next_line=~/<bx-seq:seq?/) { + $header .= $`; + $self->{'header'}=$header; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } else { + $header .= $next_line; + } + } + if ($self->{'header'}=~m|<bx-game:flavor>.*chunkable.*</bx-game:flavor>|) { + $self->{'chunkable'}=1; + } + + } + + my $not_top_level = 0; + my $xmldoc; + my $seq; + # If chunkable, we read in the document until the next + # TOP LEVEL sequence. + if ($self->{'chunkable'}) { + $xmldoc = $self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + # Maintain depth of computations and annotations. + # We only want TOP LEVEL seqs if chunkable. + while ($next_line=~ m|<bx-computation:computation|g) { + $not_top_level++; + } + while ($next_line=~ m|<bx-annotation:annotationn|g) { + $not_top_level++; + } + while ($next_line=~ m|</bx-computation:computation|g) { + $not_top_level--; + } + while ($next_line=~ m|</bx-annotation:annotationn|g) { + $not_top_level--; + } + if($next_line=~/<bx-seq:seq?/) { + if (!$not_top_level) { + $xmldoc .= $`; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } + } else { + $xmldoc .= $next_line; + } + } + # Make sure the 'doc chunk' has a closing tag + # to make the parser happy. + if (!$xmldoc=~m|</bx-game:game>|){ + $xmldoc .= "</bx-game:game>"; + } + # Grab the TOP LEVEL seq.. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + } else { # No sequences. + return 0; + } + $seq = @{$self->{'seqs'}}[0]; + # If not chunkable, + # only read document into memory once! + } elsif (!$self->{'xmldoc'}) { + $self->{'xmldoc'}=$self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + $self->{'xmldoc'} .= $next_line; + } + $xmldoc=$self->{'xmldoc'}; + # Get the seq id index. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + $seq = shift @{$self->{'seqs'}}; + } else { # No sequences. + return 0; + } + my $seq = @{$self->{'seqs'}}[0]; + # if we already have the doc in memory, + # just get the doc. + } elsif ($self->{'xmldoc'}) { + $xmldoc=$self->{'xmldoc'}; + $seq = shift @{$self->{'seqs'}}; + } + + #print $xmldoc; + + if ($seq) { + # Get the next seq. + my $handler = Bio::SeqIO::game::seqHandler->new(-seq => $seq); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + my $pseq = $parser->parse(Source => { String => $xmldoc }); + return $pseq; + } else { + return 0; + } +} + + +=head2 write_seq + + Title : write_seq + Usage : Not Yet Implemented! $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + + my $bxfeat = "http://www.bioxml.org/dtds/current/feature.dtd"; + my $bxann = "http://www.bioxml.org/dtds/current/annotation.dtd"; + my $bxcomp = "http://www.bioxml.org/dtds/current/computation.dtd"; + my $bxgame = "http://www.bioxml.org/dtds/current/game.dtd"; + my $bxlink = "http://www.bioxml.org/dtds/current/link.dtd"; + my $bxseq = "http://www.bioxml.org/dtds/current/seq.dtd"; + + my $writer = new XML::Writer(OUTPUT => $self->_fh || \*STDOUT, + NAMESPACES => 1, + DATA_MODE => 1, + DATA_INDENT => 4, + PREFIX_MAP => { + '' => '', # to keep undef warnings away in XML::Writer, fill in with something as a default prefix later? + $bxfeat => 'bx-feature', + $bxann => 'bx-annotation', + $bxcomp => 'bx-computation', + $bxgame => 'bx-game', + $bxlink => 'bx-link', + $bxseq => 'bx-seq' + }); + $writer->xmlDecl("UTF-8"); + $writer->doctype("bx-game:game", 'game', $bxgame); + $writer ->startTag ([$bxgame, 'game']); + $writer->startTag ([$bxgame, 'flavor']); + $writer->characters('chunkable'); + $writer->endTag ([$bxgame, 'flavor']); + + foreach my $seq (@seqs) { + $writer->startTag([$bxseq, 'seq'], + [$bxseq, 'id'] => $seq->display_id, + [$bxseq, 'length'] => $seq->length, + [$bxseq, 'type'] => $seq->alphabet); + if ($seq->length > 0) { + $writer->startTag([$bxseq, 'residues']); + $writer->characters($seq->seq); + $writer->endTag([$bxseq, 'residues']); + } + $writer->endTag([$bxseq, 'seq']); + + my @feats = $seq->all_SeqFeatures; + + my $features; + foreach my $feature (@feats) { + if ($feature->has_tag('annotation_id')) { + my @ann_id = $feature->each_tag_value('annotation_id'); + push (@{$features->{'annotations'}->{$ann_id[0]}}, $feature); + } elsif ($feature->has_tag('computation_id')) { + my @comp_id = $feature->each_tag_value('computation_id'); + push (@{$features->{'computations'}->{$comp_id[0]}}, $feature); + } else { + push (@{$features->{'everybody_else'}}, $feature); + } + } + foreach my $key (keys %{$features->{'annotations'}}) { + $writer->startTag([$bxann, 'annotation'], + [$bxann, 'id']=>$key + ); + $writer->startTag([$bxann, 'seq_link']); + $writer->startTag([$bxlink, 'link']); + $writer->emptyTag([$bxlink, 'ref_link'], + [$bxlink, 'ref'] => $seq->display_id()); + $writer->endTag([$bxlink, 'link']); + $writer->endTag([$bxann, 'seq_link']); + $self->__draw_feature_set($writer, $seq, $bxann, "", @{$features->{'annotations'}->{$key}}); + $writer->endTag([$bxann, 'annotation']); + } + + foreach my $key (keys %{$features->{'computations'}}) { + $writer->startTag([$bxcomp, 'computation'], + [$bxcomp, 'id']=>$key + ); + $writer->startTag([$bxcomp, 'seq_link']); + $writer->startTag([$bxlink, 'link']); + $writer->emptyTag([$bxlink, 'ref_link'], + [$bxlink, 'ref'] => $seq->display_id()); + $writer->endTag([$bxlink, 'link']); + $writer->endTag([$bxcomp, 'seq_link']); + $self->__draw_feature_set($writer, $seq, $bxcomp, "", @{$features->{'computations'}->{$key}}); + $writer->endTag([$bxcomp, 'computation']); + } + foreach my $feature (@{$features->{'everybody_else'}}) { + $self->__draw_feature($writer, $feature, $seq, "", + $self->_export_subfeatures()); + } + } + $writer->endTag([$bxgame, 'game']); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + + +#these two subroutines are very specific! + +sub __draw_feature_set { + my ($self, $writer, $seq, $namespace, $parent, @features) = @_; + my ($feature_set_id); + + my $bxfeat = "http://www.bioxml.org/dtds/current/feature.dtd"; + + if ($self->_export_subfeatures() && $self->_group_subfeatures()) { + $feature_set_id = $self->{'id_counter'}; $self->{'id_counter'}++; + $writer->startTag([$namespace, 'feature_set'], + [$namespace, 'id'] => $feature_set_id); + foreach my $feature (@features) { + $self->__draw_feature($writer, $feature, $seq, $parent , 0); + } + $writer->endTag([$namespace, 'feature_set']); + foreach my $feature (@features) { + foreach my $subset ($self->_subfeature_types()) { + if (my @subfeatures = eval ( '$feature->' . $subset . '()' )) { + my @id = $feature->each_tag_value('id'); + $self->__draw_feature_set($writer, $seq, $namespace, $id[0], @subfeatures); + } + } + } + + } else { + $feature_set_id = $self->{'id_counter'}; $self->{'id_counter'}++; + $writer->startTag([$namespace, 'feature_set'], + [$namespace, 'id'] => $feature_set_id); + foreach my $feature (@features) { + $self->__draw_feature($writer, $feature, $seq, "" , $self->_export_subfeatures()); + } + $writer->endTag([$namespace, 'feature_set']); + } +} + + +sub __draw_feature { + my ($self, $writer, $feature, $seq, $parent, $recursive) = @_; + my ($subfeature, $subset, @subfeatures, $score, $score_val, $score_no); + my $bxfeat = "http://www.bioxml.org/dtds/current/feature.dtd"; + + if (!$feature->has_tag('id')) { + $feature->add_tag_value('id', $self->{'id_counter'}); + $self->{'id_counter'}++; + } + + my @id = $feature->each_tag_value('id'); + if ($parent) { + $writer->startTag([$bxfeat, 'feature'], + [$bxfeat, 'id'] => $id[0] + ); + } else { + $writer->startTag([$bxfeat, 'feature'], + [$bxfeat, 'id'] => $id[0], + [$bxfeat, 'parent'] => $parent + ); + } + $writer->startTag([$bxfeat, 'type']); + $writer->characters($feature->primary_tag()); + $writer->endTag([$bxfeat, 'type']); + foreach $score ($feature->all_tags()) { + next if ($score eq 'id'); + $writer->startTag([$bxfeat, 'score'], + [$bxfeat, 'type'] => $score + ); + $score_no = 0; + foreach $score_val ($feature->each_tag_value($score)) { + next unless defined $score_val; + $writer->characters(' ') if ($score_no > 0); + $writer->characters($score_val); + $score_no++; + } + $writer->endTag([$bxfeat, 'score']); + } + + $writer->startTag([$bxfeat, 'seq_relationship'], + [$bxfeat, 'seq'] => $seq->display_id, + [$bxfeat, 'type'] => 'query' + ); + + $writer->startTag([$bxfeat, 'span']); + $writer->startTag([$bxfeat, 'start']); + $writer->characters($feature->start()); + $writer->endTag([$bxfeat, 'start']); + $writer->startTag([$bxfeat, 'end']); + $writer->characters($feature->end()); + $writer->endTag([$bxfeat, 'end']); + $writer->endTag([$bxfeat, 'span']); + $writer->endTag([$bxfeat, 'seq_relationship']); + $writer->endTag([$bxfeat, 'feature']); + + #proces subseqfeature's, exons, introns, promotors, whatever... + if ($recursive) { + foreach $subset ($self->_subfeature_types()) { + #determine if it exists + if (@subfeatures = eval ( '$feature->' . $subset . '()' )) { + foreach $subfeature (@subfeatures) { + $self->__draw_feature ($writer, $subfeature, $seq, $id[0], 1); + } + } + } + } +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/game/featureHandler.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/game/featureHandler.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,341 @@ +# $Id: featureHandler.pm,v 1.9 2002/06/04 02:54:48 jason Exp $ +# +# BioPerl module for Bio::SeqIO::game::featureHandler +# +# Cared for by Brad Marshall <bradmars@yahoo.com> +# +# Copyright Brad Marshall +# +# You may distribute this module under the same terms as perl itself +# _history +# June 25, 2000 written by Brad Marshall +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::game::featureHandler - GAME helper via PerlSAX helper. + +=head1 SYNOPSIS + +GAME helper for parsing new Feature objects from GAME XML. Do not use directly. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and +other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - Bioperl list + bioxml-dev@bioxml.org - Technical discussion - Moderate volume + bioxml-announce@bioxml.org - General Announcements - Pretty dead + http://www.bioxml.org/MailingLists/ - About the mailing lists + +=head1 AUTHOR - Brad Marshall + +Email: bradmars@yahoo.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# This template file is in the Public Domain. +# You may do anything you want with this file. +# + +package Bio::SeqIO::game::featureHandler; + +use Bio::SeqFeature::Generic; +use XML::Handler::Subs; + +use vars qw{ $AUTOLOAD @ISA }; +use strict; + +@ISA = qw(XML::Handler::Subs); + +sub new { + my ($caller,$seq,$length,$type) = @_; + my $class = ref($caller) || $caller; + my $self = bless ({ + seq => $seq, + type => $type, + length => $length, + string => '', + feat => {}, + feats => [], + comp_id => 1, + }, $class); + return $self; +} + +=head2 start_document + + Title : start_document + Usage : $obj->start_document + Function: PerlSAX method called when a new document is initialized + Returns : nothing + Args : document name + +=cut + +# Basic PerlSAX +sub start_document { + my ($self, $document) = @_; + + $self->{'Names'} = []; + $self->{'Nodes'} = []; + $self->{'feats'} = []; + +} + +=head2 end_document + + Title : end_document + Usage : $obj->end_document + Function: PerlSAX method called when a document is finished for cleaning up + Returns : list of features seen + Args : document name + +=cut + +sub end_document { + my ($self, $document) = @_; + + delete $self->{'Names'}; + return $self->{'feats'}; +} + +=head2 start_element + + Title : start_element + Usage : $obj->start_element + Function: PerlSAX method called when a new element is reached + Returns : nothing + Args : element object + +=cut + +sub start_element { + my ($self, $element) = @_; + + push @{$self->{'Names'}}, $element->{'Name'}; + $self->{'string'} = ''; + + if ($self->in_element('bx-feature:seq_relationship')) { + if (defined $element->{'Attributes'}->{'bx-feature:seq'} && + defined $self->{'seq'} && + $element->{'Attributes'}->{'bx-feature:seq'} eq $self->{'seq'}) { + $self->{'in_current_seq'} = 'true'; + } + } + + + if ($self->in_element('bx-computation:computation')) { + $self->{'feat'} = {}; + if (defined $element->{'Attributes'}->{'bx-computation:id'}) { + $self->{'feat'}->{'computation_id'} = $element->{'Attributes'}->{'bx-computation:id'}; + } else { + $self->{'feat'}->{'computation_id'} = $self->{'comp_id'}; + $self->{'comp_id'}++; + } + } + + if ($self->in_element('bx-feature:feature')) { + if (defined $element->{'Attributes'}->{'bx-feature:id'}) { + $self->{'feat'}->{'id'} = $element->{'Attributes'}->{'bx-feature:id'}; + } + } + + if ($self->in_element('bx-annotation:annotation')) { + $self->{'feat'} = {}; + $self->{'feat'}->{'annotation_id'} = $element->{'Attributes'}->{'bx-annotation:id'}; + $self->{'feat'}->{'annotation_name'} = $element->{'Attributes'}->{'bx-annotation:name'}; + } + + return 0; +} + +=head2 end_element + + Title : end_element + Usage : $obj->end_element + Function: PerlSAX method called when an element is finished + Returns : nothing + Args : element object + +=cut + +sub end_element { + my ($self, $element) = @_; + + if ($self->in_element('bx-computation:program')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'feat'}->{'source_tag'} = $self->{'string'}; + } + + if ($self->in_element('bx-annotation:author')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'feat'}->{'source_tag'} = "Annotated by $self->{'string'}."; + } + + if ($self->in_element('bx-feature:type')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'feat'}->{'primary_tag'} = $self->{'string'}; + } + + if ($self->in_element('bx-feature:start')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'feat'}->{'start'} = $self->{'string'}; + } + + if ($self->in_element('bx-feature:end')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'feat'}->{'end'} = $self->{'string'}; + } + + if ($self->in_element('bx-computation:score')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'feat'}->{'score'} = $self->{'string'}; + } + + if ($self->in_element('bx-feature:seq_relationship')) { + + if ($self->{'feat'}->{'start'} > $self->{'feat'}->{'end'}) { + my $new_start = $self->{'feat'}->{'end'}; + $self->{'feat'}->{'end'} = $self->{'feat'}->{'start'}; + $self->{'feat'}->{'start'} = $new_start; + $self->{'feat'}->{'strand'} = -1; + } else { + $self->{'feat'}->{'strand'} = 1; + } + my $new_feat = new Bio::SeqFeature::Generic + ( + -start => $self->{'feat'}->{'start'}, + -end => $self->{'feat'}->{'end'}, + -strand => $self->{'feat'}->{'strand'}, + -source => $self->{'feat'}->{'source_tag'}, + -primary => $self->{'feat'}->{'primary_tag'}, + -score => $self->{'feat'}->{'score'}, + ); + + if (defined $self->{'feat'}->{'computation_id'}) { + $new_feat->add_tag_value('computation_id', + $self->{'feat'}->{'computation_id'} ); + } elsif (defined $self->{'feat'}->{'annotation_id'}) { + $new_feat->add_tag_value('annotation_id', + $self->{'feat'}->{'annotation_id'} ); + } + if (defined $self->{'feat'}->{'id'}) { + $new_feat->add_tag_value('id', $self->{'feat'}->{'id'} ); + } + + push @{$self->{'feats'}}, $new_feat; + $self->{'feat'} = { + seqid => $self->{'feat'}->{'curr_seqid'}, + primary_tag => $self->{'feat'}->{'primary_tag'}, + source_tag => $self->{'feat'}->{'source_tag'}, + computation_id => $self->{'feat'}->{'computation_id'}, + annotation_id => $self->{'feat'}->{'annotation_id'} + } + } + + + pop @{$self->{'Names'}}; + pop @{$self->{'Nodes'}}; + +} + +=head2 characters + + Title : characters + Usage : $obj->end_element + Function: PerlSAX method called when text between XML tags is reached + Returns : nothing + Args : text + +=cut + +sub characters { + my ($self, $text) = @_; + $self->{'string'} .= $text->{'Data'}; +} + +=head2 in_element + + Title : in_element + Usage : $obj->in_element + Function: PerlSAX method called to test if state is in a specific element + Returns : boolean + Args : name of element + +=cut + +sub in_element { + my ($self, $name) = @_; + + return (defined $self->{'Names'}[-1] && + $self->{'Names'}[-1] eq $name); +} + +=head2 within_element + + Title : within_element + Usage : $obj->within_element + Function: PerlSAX method called to list depth within specific element + Returns : boolean + Args : name of element + +=cut + +sub within_element { + my ($self, $name) = @_; + + my $count = 0; + foreach my $el_name (@{$self->{'Names'}}) { + $count ++ if ($el_name eq $name); + } + + return $count; +} + +=head2 AUTOLOAD + + Title : AUTOLOAD + Usage : do not use directly + Function: autoload handling of missing DESTROY method + Returns : nothing + Args : text + +=cut + +# Others +sub AUTOLOAD { + my $self = shift; + + my $method = $AUTOLOAD; + $method =~ s/.*:://; + return if $method eq 'DESTROY'; + + print "UNRECOGNIZED $method\n"; +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/game/idHandler.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/game/idHandler.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,177 @@ +# $Id: idHandler.pm,v 1.8 2001/11/20 02:09:38 lstein Exp $ +# +# BioPerl module for Bio::SeqIO::game::idHandler +# +# Cared for by Brad Marshall <bradmars@yahoo.com> +# +# Copyright Brad Marshall +# +# You may distribute this module under the same terms as perl itself +# _history +# June 25, 2000 written by Brad Marshall +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::game::idHandler - GAME helper via PerlSAX helper. + +=head1 SYNOPSIS + +GAME helper for parsing new ID objects from GAME XML. Do not use directly + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and +other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - Bioperl list + bioxml-dev@bioxml.org - Technical discussion - Moderate volume + bioxml-announce@bioxml.org - General Announcements - Pretty dead + http://www.bioxml.org/MailingLists/ - About the mailing lists + +=head1 AUTHOR - Brad Marshall + +Email: bradmars@yahoo.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# This template file is in the Public Domain. +# You may do anything you want with this file. +# + +package Bio::SeqIO::game::idHandler; +use Bio::Root::Root; + +use vars qw{ $AUTOLOAD @ISA }; +use strict; +@ISA = qw(Bio::Root::Root); +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + # initialize ids + $self->{'ids'} = []; + + return $self; +} + +=head2 start_document + + Title : start_document + Usage : $obj->start_document + Function: PerlSAX method called when a new document is initialized + Returns : nothing + Args : document name + +=cut + +# Basic PerlSAX +sub start_document { + my ($self, $document) = @_; +} + +=head2 end_document + + Title : end_document + Usage : $obj->end_document + Function: PerlSAX method called when a document is finished for cleaning up + Returns : list of ids seen + Args : document name + +=cut + +sub end_document { + my ($self, $document) = @_; + return $self->{'ids'}; +} + +=head2 start_element + + Title : start_element + Usage : $obj->start_element + Function: PerlSAX method called when a new element is reached + Returns : nothing + Args : element object + +=cut + +sub start_element { + my ($self, $element) = @_; + + if ($element->{'Name'} eq 'bx-seq:seq') { + if ($element->{'Attributes'}->{'bx-seq:id'}) { + push @{$self->{'ids'}}, $element->{'Attributes'}->{'bx-seq:id'}; + } else { + if ($self->can('warn')) { + $self->warn('WARNING: Attribute bx-seq:id is required on bx-seq:seq. Sequence will not be parsed.'); + } else { + warn('WARNING: Attribute bx-seq:id is required on bx-seq:seq. Sequence will not be parsed.'); + } + } + } + return 0; +} + +=head2 end_element + + Title : end_element + Usage : $obj->end_element + Function: PerlSAX method called when an element is finished + Returns : nothing + Args : element object + +=cut + +sub end_element { + my ($self, $element) = @_; + +} + +=head2 characters + + Title : characters + Usage : $obj->end_element + Function: PerlSAX method called when text between XML tags is reached + Returns : nothing + Args : text + +=cut + +sub characters { + my ($self, $text) = @_; +} + + +=head2 AUTOLOAD + + Title : AUTOLOAD + Usage : do not use directly + Function: autoload handling of missing DESTROY method + Returns : nothing + Args : text + +=cut + +# Others +sub AUTOLOAD { + my $self = shift; + + my $method = $AUTOLOAD; + $method =~ s/.*:://; + return if $method eq 'DESTROY'; + + print "UNRECOGNIZED $method\n"; +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/game/seqHandler.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/game/seqHandler.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,289 @@ +# $Id: seqHandler.pm,v 1.15 2002/06/24 04:29:31 jason Exp $ +# +# BioPerl module for Bio::SeqIO::game::seqHandler +# +# Cared for by Brad Marshall <bradmars@yahoo.com> +# +# Copyright Brad Marshall +# +# You may distribute this module under the same terms as perl itself +# _history +# June 25, 2000 written by Brad Marshall +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::game::seqHandler - GAME helper via PerlSAX helper. + +=head1 SYNOPSIS + +GAME helper for parsing new Sequence objects from GAME XML. Do not use directly + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and +other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - Bioperl list + bioxml-dev@bioxml.org - Technical discussion - Moderate volume + bioxml-announce@bioxml.org - General Announcements - Pretty dead + http://www.bioxml.org/MailingLists/ - About the mailing lists + +=head1 AUTHOR - Brad Marshall + +Email: bradmars@yahoo.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# This template file is in the Public Domain. +# You may do anything you want with this file. +# + +package Bio::SeqIO::game::seqHandler; +use vars qw{ $AUTOLOAD @ISA }; + +use XML::Handler::Subs; +use Bio::Root::Root; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::Root::Root XML::Handler::Subs); + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($seq,$sb) = $self->_rearrange([qw(SEQ SEQBUILDER)], @args); + $self->{'string'} = ''; + $self->{'seq'} = $seq; + $self->sequence_factory($sb || new Bio::Seq::SeqFactory(-type => 'Bio::Seq')); + return $self; +} + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($builder) + Function: Get/Set the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : [optional] Bio::Factory::SequenceFactoryI + + +=cut + +sub sequence_factory{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) { + $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)." sequence_factory()"); + } + $self->{'_seqio_seqfactory'} = $obj; + } + if( ! defined $self->{'_seqio_seqfactory'} ) { + $self->throw("No SequenceBuilder defined for SeqIO::game::seqHandler object"); + } + + return $self->{'_seqio_seqfactory'}; +} + +=head2 start_document + + Title : start_document + Usage : $obj->start_document + Function: PerlSAX method called when a new document is initialized + Returns : nothing + Args : document name + +=cut + +# Basic PerlSAX +sub start_document { + my ($self, $document) = @_; + $self->{'in_current_seq'} = 'false'; + $self->{'Names'} = []; + $self->{'string'} = ''; +} + +=head2 end_document + + Title : end_document + Usage : $obj->end_document + Function: PerlSAX method called when a document is finished for cleaning up + Returns : list of sequences seen + Args : document name + +=cut + +sub end_document { + my ($self, $document) = @_; + delete $self->{'Names'}; + return $self->sequence_factory->create + ( -seq => $self->{'residues'}, + -alphabet => $self->{'alphabet'}, + -id => $self->{'seq'}, + -accession => $self->{'accession'}, + -desc => $self->{'desc'}, + -length => $self->{'length'}, + ); +} + + +=head2 start_element + + Title : start_element + Usage : $obj->start_element + Function: PerlSAX method called when a new element is reached + Returns : nothing + Args : element object + +=cut + +sub start_element { + my ($self, $element) = @_; + + push @{$self->{'Names'}}, $element->{'Name'}; + $self->{'string'} = ''; + + if ($element->{'Name'} eq 'bx-seq:seq') { + if ($element->{'Attributes'}->{'bx-seq:id'} eq $self->{'seq'}) { + $self->{'in_current_seq'} = 'true'; + $self->{'alphabet'} = $element->{'Attributes'}->{'bx-seq:type'}; + $self->{'length'} = $element->{'Attributes'}->{'bx-seq:length'}; + } else { + #This is not the sequence we want to import, but that's ok + } + } + return 0; +} + +=head2 end_element + + Title : end_element + Usage : $obj->end_element + Function: PerlSAX method called when an element is finished + Returns : nothing + Args : element object + +=cut + +sub end_element { + my ($self, $element) = @_; + + if ($self->{'in_current_seq'} eq 'true') { + if ($self->in_element('bx-seq:residues')) { + while ($self->{'string'} =~ s/\s+//) {}; + $self->{'residues'} = $self->{'string'}; + } + + + if ($self->in_element('bx-seq:name')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'name'} = $self->{'string'}; + } + + + if ($self->in_element('bx-link:id') && $self->within_element('bx-link:dbxref')) { + $self->{'string'} =~ s/^\s+//g; + $self->{'string'} =~ s/\s+$//; + $self->{'string'} =~ s/\n//g; + $self->{'accession'} = $self->{'string'}; + } + + if ($self->in_element('bx-seq:description')) { + $self->{'desc'} = $self->{'string'}; + } + + if ($self->in_element('bx-seq:seq')) { + $self->{'in_current_seq'} = 'false'; + } + } + + pop @{$self->{'Names'}}; + +} + +=head2 characters + + Title : characters + Usage : $obj->end_element + Function: PerlSAX method called when text between XML tags is reached + Returns : nothing + Args : text + +=cut + +sub characters { + my ($self, $text) = @_; + $self->{'string'} .= $text->{'Data'}; +} + +=head2 in_element + + Title : in_element + Usage : $obj->in_element + Function: PerlSAX method called to test if state is in a specific element + Returns : boolean + Args : name of element + +=cut + +sub in_element { + my ($self, $name) = @_; + + return ($self->{'Names'}[-1] eq $name); +} + +=head2 within_element + + Title : within_element + Usage : $obj->within_element + Function: PerlSAX method called to list depth within specific element + Returns : boolean + Args : name of element + +=cut + +sub within_element { + my ($self, $name) = @_; + + my $count = 0; + foreach my $el_name (@{$self->{'Names'}}) { + $count ++ if ($el_name eq $name); + } + + return $count; +} + +=head2 AUTOLOAD + + Title : AUTOLOAD + Usage : do not use directly + Function: autoload handling of missing DESTROY method + Returns : nothing + Args : text + +=cut + +# Others +sub AUTOLOAD { + my $self = shift; + + my $method = $AUTOLOAD; + $method =~ s/.*:://; + return if $method eq 'DESTROY'; + + print "UNRECOGNIZED $method\n"; +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/gcg.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/gcg.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,298 @@ +# $Id: gcg.pm,v 1.21 2002/10/25 16:22:01 jason Exp $ +# +# BioPerl module for Bio::SeqIO::gcg +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# and Lincoln Stein <lstein@cshl.org> +# +# Copyright Ewan Birney & Lincoln Stein +# +# You may distribute this module under the same terms as perl itself +# +# _history +# October 18, 1999 Largely rewritten by Lincoln Stein + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::gcg - GCG sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from GCG flat +file databases. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Ewan Birney & Lincoln Stein + +Email: E<lt>birney@ebi.ac.ukE<gt> + E<lt>lstein@cshl.orgE<gt> + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::gcg; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + +=cut + +sub next_seq { + my ($self,@args) = @_; + my($id,$type,$desc,$line,$chksum,$sequence,$date,$len); + + while( defined($_ = $self->_readline()) ) { + + ## Get the descriptive info (anything before the line with '..') + unless( /\.\.$/ ) { $desc.= $_; } + ## Pull ID, Checksum & Type from the line containing '..' + /\.\.$/ && do { $line = $_; chomp; + if(/Check\:\s(\d+)\s/) { $chksum = $1; } + if(/Type:\s(\w)\s/) { $type = $1; } + if(/(\S+)\s+Length/) + { $id = $1; } + if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ ) + { $len = $1; $date = $2;} + last; + } + } + return if ( !defined $_); + chomp($desc); # remove last "\n" + + while( defined($_ = $self->_readline()) ) { + + ## This is where we grab the sequence info. + + if( /\.\.$/ ) { + $self->throw("Looks like start of another sequence. See documentation. "); + } + + next if($_ eq "\n"); ## skip whitespace lines in formatted seq + s/[^a-zA-Z]//g; ## remove anything that is not alphabet char + # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL + $sequence .= $_; + } + ##If we parsed out a checksum, we might as well test it + + if(defined $chksum) { + unless(_validate_checksum($sequence,$chksum)) { + $self->throw("Checksum failure on parsed sequence."); + } + } + + ## Remove whitespace from identifier because the constructor + ## will throw a warning otherwise... + if(defined $id) { $id =~ s/\s+//g;} + + ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate + ## keyword that the constructor expects... + if(defined $type) { + if($type eq "N") { $type = "dna"; } + if($type eq "P") { $type = "prot"; } + } + + return $self->sequence_factory->create(-seq => $sequence, + -id => $id, + -desc => $desc, + -type => $type, + -dates => [ $date ] + ); +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the formatted $seq object into the stream + Returns : 1 for success and 0 for error + Args : array of Bio::PrimarySeqI object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + for my $seq (@seq) { + $self->throw("Did not provide a valid Bio::PrimarySeqI object") + unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); + + my $str = $seq->seq; + my $comment = $seq->desc; + my $id = $seq->id; + my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P'; + my $timestamp; + + if( $seq->can('get_dates') ) { + ($timestamp) = $seq->get_dates; + } else { + $timestamp = localtime(time); + } + my($sum,$offset,$len,$i,$j,$cnt,@out); + + $len = length($str); + ## Set the offset if we have any non-standard numbering going on + $offset=1; + # checksum + $sum = $self->GCG_checksum($seq); + + #Output the sequence header info + push(@out,"$comment\n"); + push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n"); + + #Format the sequence + $i = $#out + 1; + for($j = 0 ; $j < $len ; ) { + if( $j % 50 == 0) { + $out[$i] = sprintf("%8d ",($j+$offset)); #numbering + } + $out[$i] .= sprintf("%s",substr($str,$j,10)); + $j += 10; + if( $j < $len && $j % 50 != 0 ) { + $out[$i] .= " "; + }elsif($j % 50 == 0 ) { + $out[$i++] .= "\n\n"; + } + } + local($^W) = 0; + if($j % 50 != 0 ) { + $out[$i] .= "\n"; + } + $out[$i] .= "\n"; + return unless $self->_print(@out); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 GCG_checksum + + Title : GCG_checksum + Usage : $cksum = $gcgio->GCG_checksum($seq); + Function : returns a gcg checksum for the sequence specified + + This method can also be called as a class method. + Example : + Returns : a GCG checksum string + Argument : a Bio::PrimarySeqI implementing object + +=cut + +sub GCG_checksum { + my ($self,$seqobj) = @_; + my $index = 0; + my $checksum = 0; + my $char; + + my $seq = $seqobj->seq(); + $seq =~ tr/a-z/A-Z/; + + foreach $char ( split(/[\.\-]*/, $seq)) { + $index++; + $checksum += ($index * (unpack("c",$char) || 0) ); + if( $index == 57 ) { + $index = 0; + } + } + + return ($checksum % 10000); +} + +=head2 _validate_checksum + + Title : _validate_checksum + Usage : n/a - internal method + Function: if parsed gcg sequence contains a checksum field + : we compare it to a value computed here on the parsed + : sequence. A checksum mismatch would indicate some + : type of parsing failure occured. + : + Returns : 1 for success, 0 for failure + Args : string containing parsed seq, value of parsed cheksum + + +=cut + +sub _validate_checksum { + my($seq,$parsed_sum) = @_; + my($i,$len,$computed_sum,$cnt); + + $len = length($seq); + + #Generate the GCG Checksum value + + for($i=0; $i<$len ;$i++) { + $cnt++; + $computed_sum += $cnt * ord(substr($seq,$i,1)); + ($cnt == 57) && ($cnt=0); + } + $computed_sum %= 10000; + + ## Compare and decide if success or failure + + if($parsed_sum == $computed_sum) { + return 1; + } else { return 0; } + + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/genbank.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/genbank.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1434 @@ +# $Id: genbank.pm,v 1.76.2.12 2003/09/13 23:33:04 jason Exp $ +# +# BioPerl module for Bio::SeqIO::GenBank +# +# Cared for by Elia Stupka <elia@tll.org.sg> +# +# Copyright Elia Stupka +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::GenBank - GenBank sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'GenBank'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from GenBank flat +file databases. + +There is alot of flexibility here about how to dump things which I need +to document fully. + +=head2 Mapping of record properties to object properties + +This section is supposed to document which sections and properties of +a GenBank databank record end up where in the Bioperl object model. It +is far from complete and presently focuses only on those mappings +which may be non-obvious. $seq in the text refers to the +Bio::Seq::RichSeqI implementing object returned by the parser for each +record. + +=over 4 + +=item GI number + +$seq-E<gt>primary_id + +=back + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + +=item _id_generation_func() + +This is function which is called as + + print "ID ", $func($seq), "\n"; + +To generate the ID line. If it is not there, it generates a sensible ID +line using a number of tools. + + +If you want to output annotations in genbank format they need to be +stored in a Bio::Annotation::Collection object which is accessible +through the Bio::SeqI interface method L<annotation()|annotation>. + +The following are the names of the keys which are polled from a +L<Bio::Annotation::Collection> object. + +reference - Should contain Bio::Annotation::Reference objects +comment - Should contain Bio::Annotation::Comment objects + +segment - Should contain a Bio::Annotation::SimpleValue object +origin - Should contain a Bio::Annotation::SimpleValue object + +=back + +=head1 Where does the data go? + +Data parsed in Bio::SeqIO::genbank is stored in a variety of data +fields in the sequence object that is returned. More information in +the HOWTOs about exactly what each field means and where it goes. +Here is a partial list of fields. + +Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you +the top level object which defines a function called NAME() which +stores this information. + +Items listed as Annotation 'NAME' tell you the data is stored the +associated Bio::Annotation::Colection object which is associated with +Bio::Seq objects. If it is explictly requested that no annotations +should be stored when parsing a record of course they won't be +available when you try and get them. If you are having this problem +look at the type of SeqBuilder that is being used to contruct your +sequence object. + +Comments Annotation 'comment' +References Annotation 'reference' +Segment Annotation 'segment' +Origin Annotation 'origin' + +Accessions PrimarySeq accession_number() +Secondary accessions RichSeq get_secondary_accessions() +Keywords RichSeq keywords() +Dates RichSeq get_dates() +Molecule RichSeq molecule() +Seq Version RichSeq seq_version() +PID RichSeq pid() +Division RichSeq division() +Features Seq get_SeqFeatures() +Alphabet PrimarySeq alphabet() +Definition PrimarySeq description() or desc() +Version PrimarySeq version() + +Sequence PrimarySeq seq() + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Elia Stupka + +Email elia@tll.org.sg + +=head1 CONTRIBUTORS + +Ewan Birney birney@ebi.ac.uk +Jason Stajich jason@bioperl.org +Chris Mungall cjm@fruitfly.bdgp.berkeley.edu +Lincoln Stein lstein@cshl.org +Heikki Lehvaslaiho, heikki@ebi.ac.uk +Hilmar Lapp, hlapp@gmx.net +Donald G. Jackson, donald.jackson@bms.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::genbank; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::SeqIO::FTHelper; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + # hash for functions for decoding keys. + $self->{'_func_ftunit_hash'} = {}; + $self->_show_dna(1); # sets this to one by default. People can change it + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + +=cut + +sub next_seq { + my ($self,@args) = @_; + my $builder = $self->sequence_builder(); + my $seq; + my %params; + + RECORDSTART: while (1) { + my $buffer; + my (@acc, @features); + my ($display_id, $annotation); + my $species; + + # initialize; we may come here because of starting over + @features = (); + $annotation = undef; + @acc = (); + $species = undef; + %params = (-verbose => $self->verbose); # reset hash + local($/) = "\n"; + while(defined($buffer = $self->_readline())) { + last if index($buffer,'LOCUS ') == 0; + } + return undef if( !defined $buffer ); # end of file + $buffer =~ /^LOCUS\s+(\S.*)$/ || + $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"); + + my @tokens = split(' ', $1); + + # this is important to have the id for display in e.g. FTHelper, + # otherwise you won't know which entry caused an error + $display_id = shift(@tokens); + $params{'-display_id'} = $display_id; + # may still be useful if we don't want the seq + $params{'-length'} = shift(@tokens); + # the alphabet of the entry + $params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein'; + # for aa there is usually no 'molecule' (mRNA etc) + if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) { + $params{'-molecule'} = shift(@tokens); + my $circ = shift(@tokens); + if ($circ eq 'circular') { + $params{'-is_circular'} = 1; + $params{'-division'} = shift(@tokens); + } else { + # 'linear' or 'circular' may actually be omitted altogether + $params{'-division'} = + (CORE::length($circ) == 3 ) ? $circ : shift(@tokens); + } + } else { + $params{'-molecule'} = 'PRT' if($params{'-alphabet'} eq 'aa'); + $params{'-division'} = shift(@tokens); + } + my $date = join(' ', @tokens); # we lump together the rest + # this is per request bug #1513 + # we can handle + # 9-10-2003 + # 9-10-03 + #09-10-2003 + #09-10-03 + if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) { + if( length($date) < 11 ) { # improperly formatted date + # But we'll be nice and fix it for them + my ($d,$m,$y) = ($2,$3,$4); + if( length($d) == 1 ) { + $d = "0$d"; + } + # guess the century here + if( length($y) == 2 ) { + if( $y > 60 ) { # arbitrarily guess that '60' means 1960 + $y = "19$y"; + } else { + $y = "20$y"; + } + $self->warn("Date was malformed, guessing the century for $date to be $y\n"); + } + $params{'-dates'} = [join('-',$d,$m,$y)]; + } else { + $params{'-dates'} = [$date]; + } + } + + # set them all at once + $builder->add_slot_value(%params); + %params = (); + + # parse the rest if desired, otherwise start over + if(! $builder->want_object()) { + $builder->make_object(); + next RECORDSTART; + } + + # set up annotation depending on what the builder wants + if($builder->want_slot('annotation')) { + $annotation = new Bio::Annotation::Collection; + } + $buffer = $self->_readline(); + until( !defined ($buffer) ) { + $_ = $buffer; + + # Description line(s) + if (/^DEFINITION\s+(\S.*\S)/) { + my @desc = ($1); + while ( defined($_ = $self->_readline) ) { + if( /^\s+(.*)/ ) { push (@desc, $1); next }; + last; + } + $builder->add_slot_value(-desc => join(' ', @desc)); + # we'll continue right here because DEFINITION always comes + # at the top of the entry + } + # accession number (there can be multiple accessions) + if( /^ACCESSION\s+(\S.*\S)/ ) { + push(@acc, split(/\s+/,$1)); + while( defined($_ = $self->_readline) ) { + /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next }; + last; + } + $buffer = $_; + next; + } + # PID + elsif( /^PID\s+(\S+)/ ) { + $params{'-pid'} = $1; + } + #Version number + elsif( /^VERSION\s+(.+)$/ ) { + my ($acc,$gi) = split(' ',$1); + if($acc =~ /^\w+\.(\d+)/) { + $params{'-version'} = $1; + $params{'-seq_version'} = $1; + } + if($gi && (index($gi,"GI:") == 0)) { + $params{'-primary_id'} = substr($gi,3); + } + } + #Keywords + elsif( /^KEYWORDS\s+(.*)/ ) { + my @kw = split(/\s*\;\s*/,$1); + while( defined($_ = $self->_readline) ) { + chomp; + /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next }; + last; + } + + @kw && $kw[-1] =~ s/\.$//; + $params{'-keywords'} = \@kw; + $buffer = $_; + next; + } + # Organism name and phylogenetic information + elsif (/^SOURCE/) { + if($builder->want_slot('species')) { + $species = $self->_read_GenBank_Species(\$buffer); + $builder->add_slot_value(-species => $species); + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } + #References + elsif (/^REFERENCE/) { + if($annotation) { + my @refs = $self->_read_GenBank_References(\$buffer); + foreach my $ref ( @refs ) { + $annotation->add_Annotation('reference',$ref); + } + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } + #Comments + elsif (/^COMMENT\s+(.*)/) { + if($annotation) { + my $comment = $1; + while (defined($_ = $self->_readline)) { + last if (/^\S/); + $comment .= $_; + } + $comment =~ s/\n/ /g; + $comment =~ s/ +/ /g; + $annotation->add_Annotation( + 'comment', + Bio::Annotation::Comment->new(-text => $comment)); + $buffer = $_; + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } elsif( /^SEGMENT\s+(.+)/ ) { + if($annotation) { + my $segment = $1; + while (defined($_ = $self->_readline)) { + last if (/^\S/); + $segment .= $_; + } + $segment =~ s/\n/ /g; + $segment =~ s/ +/ /g; + $annotation->add_Annotation( + 'segment', + Bio::Annotation::SimpleValue->new(-value => $segment)); + $buffer = $_; + } else { + while(defined($buffer = $self->_readline())) { + last if substr($buffer,0,1) ne ' '; + } + } + next; + } + + # Exit at start of Feature table, or start of sequence + last if( /^(FEATURES|ORIGIN)/ ); + # Get next line and loop again + $buffer = $self->_readline; + } + return undef if(! defined($buffer)); + + # add them all at once for efficiency + $builder->add_slot_value(-accession_number => shift(@acc), + -secondary_accessions => \@acc, + %params); + $builder->add_slot_value(-annotation => $annotation) if $annotation; + %params = (); # reset before possible re-use to avoid setting twice + + # start over if we don't want to continue with this entry + if(! $builder->want_object()) { + $builder->make_object(); + next RECORDSTART; + } + + # some "minimal" formats may not necessarily have a feature table + if($builder->want_slot('features') && defined($_) && /^FEATURES/) { + # need to read the first line of the feature table + $buffer = $self->_readline; + + # DO NOT read lines in the while condition -- this is done as a side + # effect in _read_FTHelper_GenBank! + while( defined($buffer) ) { + # check immediately -- not at the end of the loop + # note: GenPept entries obviously do not have a BASE line + last if(($buffer =~ /^BASE/) || ($buffer =~ /^ORIGIN/) || + ($buffer =~ /^CONTIG/) ); + + # slurp in one feature at a time -- at return, the start of + # the next feature will have been read already, so we need + # to pass a reference, and the called method must set this + # to the last line read before returning + + my $ftunit = $self->_read_FTHelper_GenBank(\$buffer); + + # fix suggested by James Diggans + + if( !defined $ftunit ) { + # GRRRR. We have fallen over. Try to recover + $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover"); + unless( ($buffer =~ /^\s{5,5}\S+/) or ($buffer =~ /^\S+/)) { + $buffer = $self->_readline(); + } + next; # back to reading FTHelpers + } + + # process ftunit + my $feat = + $ftunit->_generic_seqfeature($self->location_factory(), + $display_id); + # add taxon_id from source if available + if($species && ($feat->primary_tag eq 'source') && + $feat->has_tag('db_xref') && (! $species->ncbi_taxid())) { + foreach my $tagval ($feat->get_tag_values('db_xref')) { + if(index($tagval,"taxon:") == 0) { + $species->ncbi_taxid(substr($tagval,6)); + } + } + } + # add feature to list of features + push(@features, $feat); + } + $builder->add_slot_value(-features => \@features); + $_ = $buffer; + } + if( defined ($_) ) { + if( /^CONTIG/ && $builder->want_slot('features')) { + $b = " $_"; # need 5 spaces to treat it like a feature + my $ftunit = $self->_read_FTHelper_GenBank(\$b); + if( ! defined $ftunit ) { + $self->warn("unable to parse the CONTIG feature\n"); + } else { + push(@features, + $ftunit->_generic_seqfeature($self->location_factory(), + $display_id)); + } + } elsif(! /^(ORIGIN|\/\/)/ ) { # advance to the sequence, if any + while (defined( $_ = $self->_readline) ) { + last if /^(ORIGIN|\/\/)/; + } + } + } + if(! $builder->want_object()) { + $builder->make_object(); # implicit end-of-object + next RECORDSTART; + } + if($builder->want_slot('seq')) { + # the fact that we want a sequence does not necessarily mean that + # there also is a sequence ... + if(defined($_) && s/^ORIGIN//) { + chomp; + if( $annotation && length($_) > 0 ) { + $annotation->add_Annotation('origin', + Bio::Annotation::SimpleValue->new(-value => $_)); + } + my $seqc = ''; + while( defined($_ = $self->_readline) ) { + /^\/\// && last; + $_ = uc($_); + s/[^A-Za-z]//g; + $seqc .= $_; + } + $self->debug("sequence length is ". length($seqc) ."\n"); + $builder->add_slot_value(-seq => $seqc); + } + } elsif ( defined($_) && (substr($_,0,2) ne '//')) { + # advance to the end of the record + while( defined($_ = $self->_readline) ) { + last if substr($_,0,2) eq '//'; + } + } + # Unlikely, but maybe the sequence is so weird that we don't want it + # anymore. We don't want to return undef if the stream's not exhausted + # yet. + $seq = $builder->make_object(); + next RECORDSTART unless $seq; + last RECORDSTART; + } # end while RECORDSTART + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : array of 1 to n Bio::SeqI objects + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + + foreach my $seq ( @seqs ) { + $self->throw("Attempting to write with no seq!") unless defined $seq; + + if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { + $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); + } + + my $str = $seq->seq; + + my ($div, $mol); + my $len = $seq->length(); + + if ( $seq->can('division') ) { + $div=$seq->division; + } + if( !defined $div || ! $div ) { $div = 'UNK'; } + my $alpha = $seq->alphabet; + if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) { + $mol = $alpha || 'DNA'; + } + + my $circular = 'linear '; + $circular = 'circular' if $seq->is_circular; + + local($^W) = 0; # supressing warnings about uninitialized fields. + + my $temp_line; + if( $self->_id_generation_func ) { + $temp_line = &{$self->_id_generation_func}($seq); + } else { + my $date = ''; + if( $seq->can('get_dates') ) { + ($date) = $seq->get_dates(); + } + $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s", + 'LOCUS', $seq->id(),$len, + (lc($alpha) eq 'protein') ? ('aa','', '') : + ('bp', '',$mol),$circular, + $div,$date); + } + + $self->_print("$temp_line\n"); + $self->_write_line_GenBank_regex("DEFINITION ", " ", + $seq->desc(),"\\s\+\|\$",80); + + # if there, write the accession line + + if( $self->_ac_generation_func ) { + $temp_line = &{$self->_ac_generation_func}($seq); + $self->_print("ACCESSION $temp_line\n"); + } else { + my @acc = (); + push(@acc, $seq->accession_number()); + if( $seq->isa('Bio::Seq::RichSeqI') ) { + push(@acc, $seq->get_secondary_accessions()); + } + $self->_print("ACCESSION ", join(" ", @acc), "\n"); + # otherwise - cannot print <sigh> + } + + # if PID defined, print it + if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) { + $self->_print("PID ", $seq->pid(), "\n"); + } + + # if there, write the version line + + if( defined $self->_sv_generation_func() ) { + $temp_line = &{$self->_sv_generation_func}($seq); + if( $temp_line ) { + $self->_print("VERSION $temp_line\n"); + } + } else { + if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) { + my $id = $seq->primary_id(); # this may be a GI number + $self->_print("VERSION ", + $seq->accession_number(), ".", $seq->seq_version, + ($id && ($id =~ /^\d+$/) ? " GI:".$id : ""), + "\n"); + } + } + + # if there, write the keywords line + + if( defined $self->_kw_generation_func() ) { + $temp_line = &{$self->_kw_generation_func}($seq); + $self->_print("KEYWORDS $temp_line\n"); + } else { + if( $seq->can('keywords') ) { + my $kw = $seq->keywords; + if( ref($kw) =~ /ARRAY/i ) { + $kw = join("; ", @$kw); + } + $kw .= '.' if( $kw !~ /\.$/ ); + $self->_print("KEYWORDS $kw\n"); + } + } + + # SEGMENT if it exists + foreach my $ref ( $seq->annotation->get_Annotations('segment') ) { + $self->_print(sprintf ("%-11s %s\n",'SEGMENT', + $ref->value)); + } + + # Organism lines + if (my $spec = $seq->species) { + my ($species, $genus, @class) = $spec->classification(); + my $OS; + if( $spec->common_name ) { + $OS = $spec->common_name; + } else { + $OS = "$genus $species"; + } + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + $self->_print("SOURCE $OS\n"); + $self->_print(" ORGANISM ", + ($spec->organelle() ? $spec->organelle()." " : ""), + "$genus $species", "\n"); + my $OC = join('; ', (reverse(@class), $genus)) .'.'; + $self->_write_line_GenBank_regex(' 'x12,' 'x12, + $OC,"\\s\+\|\$",80); + } + + # Reference lines + my $count = 1; + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + $temp_line = sprintf ("REFERENCE $count (%s %d to %d)", + ($seq->alphabet() eq "protein" ? + "residues" : "bases"), + $ref->start,$ref->end); + $self->_print("$temp_line\n"); + $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12, + $ref->authors,"\\s\+\|\$",80); + $self->_write_line_GenBank_regex(" TITLE "," "x12, + $ref->title,"\\s\+\|\$",80); + $self->_write_line_GenBank_regex(" JOURNAL "," "x12, + $ref->location,"\\s\+\|\$",80); + if ($ref->comment) { + $self->_write_line_GenBank_regex(" REMARK "," "x12, + $ref->comment,"\\s\+\|\$",80); + } + if( $ref->medline) { + $self->_write_line_GenBank_regex(" MEDLINE "," "x12, + $ref->medline, "\\s\+\|\$",80); + # I am assuming that pubmed entries only exist when there + # are also MEDLINE entries due to the indentation + # This could be a wrong assumption + if( $ref->pubmed ) { + $self->_write_line_GenBank_regex(" PUBMED "," "x12, + $ref->pubmed, "\\s\+\|\$", + 80); + } + } + $count++; + } + # Comment lines + + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + $self->_write_line_GenBank_regex("COMMENT "," "x12, + $comment->text,"\\s\+\|\$",80); + } + $self->_print("FEATURES Location/Qualifiers\n"); + + my $contig; + if( defined $self->_post_sort ) { + # we need to read things into an array. Process. Sort them. Print 'em + + my $post_sort_func = $self->_post_sort(); + my @fth; + + foreach my $sf ( $seq->top_SeqFeatures ) { + push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); + } + + @fth = sort { &$post_sort_func($a,$b) } @fth; + + foreach my $fth ( @fth ) { + $self->_print_GenBank_FTHelper($fth); + } + } else { + # not post sorted. And so we can print as we get them. + # lower memory load... + + foreach my $sf ( $seq->top_SeqFeatures ) { + my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); + foreach my $fth ( @fth ) { + if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $sf->throw("Cannot process FTHelper... $fth"); + } + $self->_print_GenBank_FTHelper($fth); + } + } + } + if( $seq->length == 0 ) { $self->_show_dna(0) } + + if( $self->_show_dna() == 0 ) { + $self->_print("\n//\n"); + return; + } + +# finished printing features. + + $str =~ tr/A-Z/a-z/; + +# Count each nucleotide + unless( $mol eq 'protein' ) { + my $alen = $str =~ tr/a/a/; + my $clen = $str =~ tr/c/c/; + my $glen = $str =~ tr/g/g/; + my $tlen = $str =~ tr/t/t/; + + my $olen = $len - ($alen + $tlen + $clen + $glen); + if( $olen < 0 ) { + $self->warn("Weird. More atgc than bases. Problem!"); + } + + my $base_count = sprintf("BASE COUNT %8s a %6s c %6s g %6s t%s\n", + $alen,$clen,$glen,$tlen, + ( $olen > 0 ) ? sprintf("%6s others",$olen) : ''); + $self->_print($base_count); + } + my ($o) = $seq->annotation->get_Annotations('origin'); + $self->_print(sprintf("%-6s%s\n",'ORIGIN',$o ? $o->value : '')); + +# print out the sequence + my $nuc = 60; # Number of nucleotides per line + my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line + my $out_pat = 'A11' x 6; # Pattern for packing a line + my $length = length($str); + + # Calculate the number of nucleotides which fit on whole lines + my $whole = int($length / $nuc) * $nuc; + + # Print the whole lines + my $i; + for ($i = 0; $i < $whole; $i += $nuc) { + my $blocks = pack $out_pat, + unpack $whole_pat, + substr($str, $i, $nuc); + chop $blocks; + $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59)); + } + + # Print the last line + if (my $last = substr($str, $i)) { + my $last_len = length($last); + my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10; + my $blocks = pack $out_pat, + unpack($last_pat, $last); + $blocks =~ s/ +$//; + $self->_print(sprintf("%9d $blocks\n", $length - $last_len + 1)); + } + + $self->_print("//\n"); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; + } +} + +=head2 _print_GenBank_FTHelper + + Title : _print_GenBank_FTHelper + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _print_GenBank_FTHelper { + my ($self,$fth,$always_quote) = @_; + + if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!"); + } + if( defined $fth->key && + $fth->key eq 'CONTIG' ) { + $self->_write_line_GenBank_regex(sprintf("%-12s",$fth->key), + ' 'x12,$fth->loc,"\,\|\$",80); + } else { + $self->_write_line_GenBank_regex(sprintf(" %-16s",$fth->key), + " "x21, + $fth->loc,"\,\|\$",80); + } + + if( !defined $always_quote) { $always_quote = 0; } + + foreach my $tag ( keys %{$fth->field} ) { + foreach my $value ( @{$fth->field->{$tag}} ) { + $value =~ s/\"/\"\"/g; + if ($value eq "_no_value") { + $self->_write_line_GenBank_regex(" "x21, + " "x21, + "/$tag","\.\|\$",80); + } + elsif( $always_quote == 1 || $value !~ /^\d+$/ ) { + my ($pat) = ($value =~ /\s/ ? '\s|$' : '.|$'); + $self->_write_line_GenBank_regex(" "x21, + " "x21, + "/$tag=\"$value\"",$pat,80); + } else { + $self->_write_line_GenBank_regex(" "x21, + " "x21, + "/$tag=$value","\.\|\$",80); + } + } + } + +} + + +=head2 _read_GenBank_References + + Title : _read_GenBank_References + Usage : + Function: Reads references from GenBank format. Internal function really + Returns : + Args : + + +=cut + +sub _read_GenBank_References{ + my ($self,$buffer) = @_; + my (@refs); + my $ref; + + # assumme things are starting with RN + + if( $$buffer !~ /^REFERENCE/ ) { + warn("Not parsing line '$$buffer' which maybe important"); + } + + $_ = $$buffer; + + my (@title,@loc,@authors,@com,@medline,@pubmed); + + REFLOOP: while( defined($_) || defined($_ = $self->_readline) ) { + if (/^ AUTHORS\s+(.*)/) { + push (@authors, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push (@authors, $1);next;}; + last; + } + $ref->authors(join(' ', @authors)); + } + if (/^ TITLE\s+(.*)/) { + push (@title, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push (@title, $1); + next; + }; + last; + } + $ref->title(join(' ', @title)); + } + if (/^ JOURNAL\s+(.*)/) { + push(@loc, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push(@loc, $1); + next; + }; + last; + } + $ref->location(join(' ', @loc)); + redo REFLOOP; + } + if (/^ REMARK\s+(.*)/) { + push (@com, $1); + while ( defined($_ = $self->_readline) ) { + /^\s{3,}(.*)/ && do { push(@com, $1); + next; + }; + last; + } + $ref->comment(join(' ', @com)); + redo REFLOOP; + } + if( /^ MEDLINE\s+(.*)/ ) { + push(@medline,$1); + while ( defined($_ = $self->_readline) ) { + /^\s{4,}(.*)/ && do { push(@medline, $1); + next; + }; + last; + } + $ref->medline(join(' ', @medline)); + redo REFLOOP; + } + if( /^ PUBMED\s+(.*)/ ) { + push(@pubmed,$1); + while ( defined($_ = $self->_readline) ) { + /^\s{5,}(.*)/ && do { push(@pubmed, $1); + next; + }; + last; + } + $ref->pubmed(join(' ', @pubmed)); + redo REFLOOP; + } + + /^REFERENCE/ && do { + # store current reference + $self->_add_ref_to_array(\@refs,$ref) if $ref; + # reset + @authors = (); + @title = (); + @loc = (); + @com = (); + @pubmed = (); + @medline = (); + # create the new reference object + $ref = Bio::Annotation::Reference->new(); + # check whether start and end base is given + if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)/){ + $ref->start($1); + $ref->end($2); + } + }; + + /^(FEATURES)|(COMMENT)/ && last; + + $_ = undef; # Empty $_ to trigger read of next line + } + + # store last reference + $self->_add_ref_to_array(\@refs,$ref) if $ref; + + $$buffer = $_; + + #print "\nnumber of references found: ", $#refs+1,"\n"; + + return @refs; +} + +# +# This is undocumented as it shouldn't be called by anywhere else as +# read_GenBank_References. For those who still want to know: +# +# Purpose: adds a Reference object to an array of Reference objects, takes +# care of possible cleanups to be done (currently, only author and title +# will be chopped of trailing semicolons). +# Parameters: +# a reference to an array of Reference objects +# the Reference object to be added +# Returns: nothing +# +sub _add_ref_to_array { + my ($self, $refs, $ref) = @_; + + # first, polish author and title by removing possible trailing semicolons + my $au = $ref->authors(); + my $title = $ref->title(); + $au =~ s/;\s*$//g if $au; + $title =~ s/;\s*$//g if $title; + $ref->authors($au); + $ref->title($title); + # the rest should be clean already, so go ahead and add it + push(@{$refs}, $ref); +} + +=head2 _read_GenBank_Species + + Title : _read_GenBank_Species + Usage : + Function: Reads the GenBank Organism species and classification + lines. + Example : + Returns : A Bio::Species object + Args : a reference to the current line buffer + +=cut + +sub _read_GenBank_Species { + my( $self,$buffer) = @_; + my @organell_names = ("chloroplast", "mitochondr"); + # only those carrying DNA, apart from the nucleus + + $_ = $$buffer; + + my( $sub_species, $species, $genus, $common, $organelle, @class ); + # upon first entering the loop, we must not read a new line -- the SOURCE + # line is already in the buffer (HL 05/10/2000) + while (defined($_) || defined($_ = $self->_readline())) { + # de-HTMLify (links that may be encountered here don't contain + # escaped '>', so a simple-minded approach suffices) + s/<[^>]+>//g; + if (/^SOURCE\s+(.*)/) { + # FIXME this is probably mostly wrong (e.g., it yields things like + # Homo sapiens adult placenta cDNA to mRNA + # which is certainly not what you want) + $common = $1; + $common =~ s/\.$//; # remove trailing dot + } elsif (/^\s+ORGANISM/) { + my @spflds = split(' ', $_); + shift(@spflds); # ORGANISM + if(grep { $_ =~ /^$spflds[0]/i; } @organell_names) { + $organelle = shift(@spflds); + } + $genus = shift(@spflds); + if(@spflds) { + $species = shift(@spflds); + } else { + $species = "sp."; + } + $sub_species = shift(@spflds) if(@spflds); + } elsif (/^\s+(.+)/) { + # only split on ';' or '.' so that + # classification that is 2 words will + # still get matched + # use map to remove trailing/leading spaces + push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1); + } else { + last; + } + + $_ = undef; # Empty $_ to trigger read of next line + } + + $$buffer = $_; + + # Don't make a species object if it's empty or "Unknown" or "None" + return unless $genus and $genus !~ /^(Unknown|None)$/i; + + # Bio::Species array needs array in Species -> Kingdom direction + if ($class[$#class] eq $genus) { + push( @class, $species ); + } else { + push( @class, $genus, $species ); + } + @class = reverse @class; + + my $make = Bio::Species->new(); + $make->classification( \@class, "FORCE" ); # no name validation please + $make->common_name( $common ) if $common; + $make->sub_species( $sub_species ) if $sub_species; + $make->organelle($organelle) if $organelle; + return $make; +} + +=head2 _read_FTHelper_GenBank + + Title : _read_FTHelper_GenBank + Usage : _read_FTHelper_GenBank($buffer) + Function: reads the next FT key line + Example : + Returns : Bio::SeqIO::FTHelper object + Args : filehandle and reference to a scalar + + +=cut + +sub _read_FTHelper_GenBank { + my ($self,$buffer) = @_; + + my ($key, # The key of the feature + $loc # The location line from the feature + ); + my @qual = (); # An arrray of lines making up the qualifiers + + if ($$buffer =~ /^ (\S+)\s+(.+?)\s*$/o) { + $key = $1; + $loc = $2; + # Read all the lines up to the next feature + while ( defined($_ = $self->_readline) ) { + if (/^(\s+)(.+?)\s*$/o) { + # Lines inside features are preceded by 21 spaces + # A new feature is preceded by 5 spaces + if (length($1) > 6) { + # Add to qualifiers if we're in the qualifiers, or if it's + # the first qualifier + if (@qual || (index($2,'/') == 0)) { + push(@qual, $2); + } + # We're still in the location line, so append to location + else { + $loc .= $2; + } + } else { + # We've reached the start of the next feature + last; + } + } else { + # We're at the end of the feature table + last; + } + } + } else { + # No feature key + $self->debug("no feature key!\n"); + # change suggested by JDiggans to avoid infinite loop- + # see bugreport 1062. + # reset buffer to prevent infinite loop + $$buffer = $self->_readline(); + return; + } + + # Put the first line of the next feature into the buffer + $$buffer = $_; + + # Make the new FTHelper object + my $out = new Bio::SeqIO::FTHelper(); + $out->verbose($self->verbose()); + $out->key($key); + $out->loc($loc); + + # Now parse and add any qualifiers. (@qual is kept + # intact to provide informative error messages.) + QUAL: for (my $i = 0; $i < @qual; $i++) { + $_ = $qual[$i]; + my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?}) + or $self->warn("cannot see new qualifier in feature $key: ". + $qual[$i]); + #or $self->throw("Can't see new qualifier in: $_\nfrom:\n" + # . join('', map "$_\n", @qual)); + $qualifier = '' unless( defined $qualifier); + if (defined $value) { + # Do we have a quoted value? + if (substr($value, 0, 1) eq '"') { + # Keep adding to value until we find the trailing quote + # and the quotes are balanced + while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { + if($i >= $#qual) { + $self->warn("Unbalanced quote in:\n" . + join('', map("$_\n", @qual)) . + "No further qualifiers will " . + "be added for this feature"); + last QUAL; + } + $i++; # modifying a for-loop variable inside of the loop + # is not the best programming style ... + my $next = $qual[$i]; + + # add to value with a space unless the value appears + # to be a sequence (translation for example) + if(($value.$next) =~ /[^A-Za-z"-]/) { + $value .= " "; + } + $value .= $next; + } + # Trim leading and trailing quotes + $value =~ s/^"|"$//g; + # Undouble internal quotes + $value =~ s/""/\"/g; + } + } else { + $value = '_no_value'; + } + # Store the qualifier + $out->field->{$qualifier} ||= []; + push(@{$out->field->{$qualifier}},$value); + } + return $out; +} + +=head2 _write_line_GenBank + + Title : _write_line_GenBank + Usage : + Function: internal function + Example : + Returns : + Args : + + +=cut + +sub _write_line_GenBank{ + my ($self,$pre1,$pre2,$line,$length) = @_; + + $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!"); + my $subl = $length - length $pre2; + my $linel = length $line; + my $i; + + my $sub = substr($line,0,$length - length $pre1); + + $self->_print("$pre1$sub\n"); + + for($i= ($length - length $pre1);$i < $linel;) { + $sub = substr($line,$i,($subl)); + $self->_print("$pre2$sub\n"); + $i += $subl; + } + +} + +=head2 _write_line_GenBank_regex + + Title : _write_line_GenBank_regex + Usage : + Function: internal function for writing lines of specified + length, with different first and the next line + left hand headers and split at specific points in the + text + Example : + Returns : nothing + Args : file handle, first header, second header, text-line, regex for line breaks, total line length + + +=cut + +sub _write_line_GenBank_regex { + my ($self,$pre1,$pre2,$line,$regex,$length) = @_; + + #print STDOUT "Going to print with $line!\n"; + + $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!"); + +# if( length $pre1 != length $pre2 ) { +# $self->throw( "Programming error - cannot called write_line_GenBank_regex with different length pre1 and pre2 tags!"); +# } + + my $subl = $length - (length $pre1) - 2; + my @lines = (); + + CHUNK: while($line) { + foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) { + if($line =~ m/^(.{1,$subl})($pat)(.*)/) { + $line = $3; + # be strict about not padding spaces according to + # genbank format + my $l = $1.$2; + $l =~ s/\s+$//; + push(@lines, $l); + next CHUNK; + } + } + # if we get here none of the patterns matched $subl or less chars + $self->warn("trouble dissecting \"$line\" into chunks ". + "of $subl chars or less - this tag won't print right"); + # insert a space char to prevent infinite loops + $line = substr($line,0,$subl) . " " . substr($line,$subl); + } + + my $s = shift @lines; + $self->_print("$pre1$s\n"); + foreach my $s ( @lines ) { + $self->_print("$pre2$s\n"); + } +} + +=head2 _post_sort + + Title : _post_sort + Usage : $obj->_post_sort($newval) + Function: + Returns : value of _post_sort + Args : newvalue (optional) + + +=cut + +sub _post_sort{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_post_sort'} = $value; + } + return $obj->{'_post_sort'}; +} + +=head2 _show_dna + + Title : _show_dna + Usage : $obj->_show_dna($newval) + Function: + Returns : value of _show_dna + Args : newvalue (optional) + + +=cut + +sub _show_dna{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_show_dna'} = $value; + } + return $obj->{'_show_dna'}; +} + +=head2 _id_generation_func + + Title : _id_generation_func + Usage : $obj->_id_generation_func($newval) + Function: + Returns : value of _id_generation_func + Args : newvalue (optional) + + +=cut + +sub _id_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_id_generation_func'} = $value; + } + return $obj->{'_id_generation_func'}; +} + +=head2 _ac_generation_func + + Title : _ac_generation_func + Usage : $obj->_ac_generation_func($newval) + Function: + Returns : value of _ac_generation_func + Args : newvalue (optional) + + +=cut + +sub _ac_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_ac_generation_func'} = $value; + } + return $obj->{'_ac_generation_func'}; +} + +=head2 _sv_generation_func + + Title : _sv_generation_func + Usage : $obj->_sv_generation_func($newval) + Function: + Returns : value of _sv_generation_func + Args : newvalue (optional) + + +=cut + +sub _sv_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_sv_generation_func'} = $value; + } + return $obj->{'_sv_generation_func'}; + +} + +=head2 _kw_generation_func + + Title : _kw_generation_func + Usage : $obj->_kw_generation_func($newval) + Function: + Returns : value of _kw_generation_func + Args : newvalue (optional) + + +=cut + +sub _kw_generation_func{ + my ($obj,$value) = @_; + if( defined $value ) { + $obj->{'_kw_generation_func'} = $value; + } + return $obj->{'_kw_generation_func'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/largefasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/largefasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,170 @@ +# $Id: largefasta.pm,v 1.18 2002/12/27 19:42:32 birney Exp $ +# BioPerl module for Bio::SeqIO::largefasta +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# _history +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::largefasta - method i/o on very large fasta sequence files + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from fasta flat +file databases. + +This module handles very large sequence files by using the +Bio::Seq::LargePrimarySeq module to store all the sequence data in +a file. This can be a problem if you have limited disk space on your +computer because this will effectively cause 2 copies of the sequence +file to reside on disk for the life of the +Bio::Seq::LargePrimarySeq object. The default location for this is +specified by the L<File::Spec>-E<gt>tmpdir routine which is usually /tmp +on UNIX. If a sequence file is larger than the swap space (capacity +of the /tmp dir) this could cause problems for the machine. It is +possible to set the directory where the temporary file is located by +adding the following line to your code BEFORE calling next_seq. See +L<Bio::Seq::LargePrimarySeq> for more information. + + $Bio::Seq::LargePrimarySeq::DEFAULT_TEMP_DIR = 'newdir'; + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Jason Stajich + +Email: jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::largefasta; +use vars qw(@ISA $FASTALINELEN); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +$FASTALINELEN = 60; +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::LargePrimarySeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +sub next_seq { + my ($self) = @_; +# local $/ = "\n"; + my $largeseq = $self->sequence_factory->create(); + my ($id,$fulldesc,$entry); + my $count = 0; + my $seen = 0; + while( defined ($entry = $self->_readline) ) { + if( $seen == 1 && $entry =~ /^\s*>/ ) { + $self->_pushback($entry); + return $largeseq; + } +# if ( ($entry eq '>') || eof($self->_fh) ) { $seen = 1; next; } + if ( ($entry eq '>') ) { $seen = 1; next; } + elsif( $entry =~ /\s*>(.+?)$/ ) { + $seen = 1; + ($id,$fulldesc) = ($1 =~ /^\s*(\S+)\s*(.*)$/) + or $self->warn("Can't parse fasta header"); + $largeseq->display_id($id); + $largeseq->primary_id($id); + $largeseq->desc($fulldesc); + } else { + $entry =~ s/\s+//g; + $largeseq->add_sequence_as_string($entry); + } + (++$count % 1000 == 0 && $self->verbose() > 0) && print "line $count\n"; + } + if( ! $seen ) { return undef; } + return $largeseq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + my $top = $seq->id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + $self->_print (">",$top,"\n"); + my $end = $seq->length(); + my $start = 1; + while( $start < $end ) { + my $stop = $start + $FASTALINELEN - 1; + $stop = $end if( $stop > $end ); + $self->_print($seq->subseq($start,$stop), "\n"); + $start += $FASTALINELEN; + } + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/locuslink.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/locuslink.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,593 @@ +# $Id: locuslink.pm,v 1.2.2.2 2003/03/13 02:09:20 lapp Exp $ +# +# BioPerl module for Bio::SeqIO::locuslink +# +# Cared for by Keith Ching <kching at gnf.org> +# +# Copyright Keith Ching +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Keith Ching, kching at gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::locuslink - DESCRIPTION of Object + +=head1 SYNOPSIS + + # don't instantiate directly - instead do + my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN); + +=head1 DESCRIPTION + +This module parses LocusLink into Bio::SeqI objects with rich +annotation, but no sequence. + +The input file has to be in the LL_tmpl format - the tabular format +will not work. + +The way the current implementation populates the object is rather a +draft work than a finished work of art. Note that at this stage the +locuslink entries cannot be round-tripped, because the parser loses +certain information. For instance, most of the alternative transcript +descriptions are not retained. The parser also misses any element +that deals with visual representation (e.g., 'button') except for the +URLs. Almost all of the pieces of the annotation are kept in the +L<Bio::Annotation::Collection> object. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Keith Ching + +Email kching at gnf.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::SeqIO::locuslink; + +use strict; +use vars qw(@ISA); + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; +use Bio::Species; +use Bio::Annotation::DBLink; +#use Bio::Annotation::Reference; +use Bio::Annotation::Comment; +use Bio::Annotation::SimpleValue; +use Bio::Annotation::OntologyTerm; +use Bio::Annotation::Collection; + +@ISA = qw(Bio::SeqIO); + +# list of all the field names in locuslink +my @locuslink_keys = qw( + ACCNUM + ALIAS_PROT + ALIAS_SYMBOL + ASSEMBLY + BUTTON + CDD + CHR + COMP + CONTIG + CURRENT_LOCUSID + DB_DESCR + DB_LINK + ECNUM + EVID + EXTANNOT + GO + GRIF + LINK + LOCUSID + LOCUS_CONFIRMED + LOCUS_TYPE + MAP + MAPLINK + NC + NG + NM + NP + NR + OFFICIAL_GENE_NAME + OFFICIAL_SYMBOL + OMIM + ORGANISM + PHENOTYPE + PHENOTYPE_ID + PMID + PREFERRED_GENE_NAME + PREFERRED_PRODUCT + PREFERRED_SYMBOL + PRODUCT + PROT + RELL + STATUS + STS + SUMFUNC + SUMMARY + TRANSVAR + TYPE + UNIGENE + XG + XM + XP + XR + ); + +# list of fields to make simple annotations from +# fields not listed here or as a key in feature hash are ignored (lost). +my %anntype_map = ( + SimpleValue => [qw( + ALIAS_PROT + ALIAS_SYMBOL + CDD + CHR + CURRENT_LOCUSID + ECNUM + EXTANNOT + MAP + NC + NR + OFFICIAL_GENE_NAME + OFFICIAL_SYMBOL + PHENOTYPE + PREFERRED_GENE_NAME + PREFERRED_PRODUCT + PREFERRED_SYMBOL + PRODUCT + RELL + SUMFUNC + ) + ], + Comment => [qw( + SUMMARY + ) + ], + ); + + +# certain fields are not named the same as the symgene database list +my %dbname_map = ( + pfam => 'Pfam', + smart => 'SMART', + NM => 'RefSeq', + NP => 'RefSeq', + XP => 'RefSeq', + XM => 'RefSeq', + NG => 'RefSeq', + XG => 'RefSeq', + XR => 'RefSeq', + PROT => 'GenBank', + ACCNUM => 'GenBank', + CONTIG => 'GenBank', + # certain fields are not named the same as the symgene + # database list: rename the fields the symgene database name + # key = field name in locuslink + # value = database name in sym + #GO => 'GO', + OMIM => 'MIM', + GRIF => 'GRIF', + STS => 'STS', + UNIGENE => 'UniGene', + ); + +# certain CDD entries use the wrong prefix for the accession number +# cddprefix will replace the key w/ the value for these entries +my %cddprefix = ( + pfam => 'PF', + smart => 'SM', + ); + +# alternate mappings if one field does not exist +my %alternate_map = ( + OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME', + OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL', + ); + +# for these field names, we only care about the first value X in value X|Y|Z +my @ll_firstelements = qw( + NM + NP + NG + XG + XM + XP + XR + PROT + STS + ACCNUM + CONTIG + GRIF + ); + +# these fields need to be flattened into a single string, using the given +# join string +my %flatten_tags = ( + ASSEMBLY => '', + ORGANISM => '', + OFFICIAL_SYMBOL => '', + OFFICIAL_GENE_NAME => '', + LOCUSID => '', + PMID => '', + PREFERRED_SYMBOL => ', ', + PREFERRED_GENE_NAME => ', ' +); + +# set the default search pattern for all the field names +my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys; + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + + # overwrite the search pattern w/ the first value pattern + foreach my $key(@ll_firstelements){ + $feature_pat_map{$key}="^$key: ([^|]+)"; + } + + # special search pattern for cdd entries + foreach my $key(keys %cddprefix) { + $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)'; + } + + # special patterns for specific fields + $feature_pat_map{MAP} = '^MAP: (.+?)\|'; + $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|'; + $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|'; + $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|'; + $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|'; + $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+'; + + # set the sequence factory of none has been set already + if(! $self->sequence_factory()) { + $self->sequence_factory(Bio::Seq::SeqFactory->new( + -type => 'Bio::Seq::RichSeq')); + } +} + + +######################### +# +sub search_pattern{ +# +######################### + my ($self, + $entry, #text to search + $searchconfirm, #to make sure you got the right thing + $searchpattern, + $searchtype) = @_; + my @query = $entry=~/$searchpattern/gm; + if ($searchconfirm ne "FALSE"){ + $self->warn("No $searchtype found\n$entry\n") unless @query; + foreach (@query){ + if (!($_=~/$searchconfirm/)){ + $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n"); + } + }#endforeach + }#endsearchconfirm + return(@query); +}#endsub +############ +# +sub read_species{ +# +############ + my ($spline)=@_; + my $species; + my $genus; + ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/; + my $make = Bio::Species->new(); + $make->classification( ($species,$genus) ); + return $make; +} +################ +# +sub read_dblink{ +# +################ + my ($ann,$db,$ref)=@_; + my @results=$ref ? @$ref : (); + foreach my $id(@results){ + if($id){ + $ann->add_Annotation('dblink', + Bio::Annotation::DBLink->new( + -database =>$db , + -primary_id =>$id)); + } + } + return($ann); +} + +################ +# +sub read_reference{ +# +################ + my ($ann,$db,$results)=@_; + + if($results){ + chomp($results); + my @ids=split(/,/,$results); + $ann = read_dblink($ann,$db,\@ids) if @ids; + } + return $ann; +}#endsub + +################ +# +sub add_annotation{ +# +################ + my ($ac,$type,$text,$anntype)=@_; + my @args; + + $anntype = 'SimpleValue' unless $anntype; + SWITCH : { + $anntype eq 'SimpleValue' && do { + push(@args, -value => $text, -tagname => $type); + last SWITCH; + }; + $anntype eq 'Comment' && do { + push(@args, -text => $text, -tagname => 'comment'); + last SWITCH; + }; + } + $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args)); + return($ac); +}#endsub + +################ +# +sub add_annotation_ref{ +# +################ + my ($ann,$type,$textref)=@_; + my @text=$textref ? @$textref : (); + + foreach my $text(@text){ + $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text)); + } + return($ann); +}#endsub + +################ +# +sub make_unique{ +# +############## + my ($ann,$key) = @_; + + my %seen = (); + foreach my $dbl ($ann->remove_Annotations($key)) { + if(! $seen{$dbl->as_text()}) { + $seen{$dbl->as_text()} = 1; + $ann->add_Annotation($dbl); + } + } + return $ann; +} + +################ +# +sub next_seq{ +# +############## + my ($self, @args)=@_; + my (@results,$search,$ref,$cddref); + + # LOCUSLINK entries begin w/ >> + local $/="\n>>"; + + # slurp in a whole entry and return if no more entries + return unless my $entry = $self->_readline; + + # strip the leading '>>' is it's the first entry + if (index($entry,'>>') == 0) { #first entry + $entry = substr($entry,2); + } + + # we aren't interested in obsoleted entries, so we need to loop + # and skip those until we've found the next not obsoleted + my %record = (); + while($entry && ($entry =~ /\w/)) { + if (!($entry=~/LOCUSID/)){ + $self->throw("No LOCUSID in first line of record. ". + "Not LocusLink in my book."); + } + # see whether it's an obsoleted entry, and if so jump to the next + # one entry right away + if($entry =~ /^CURRENT_LOCUSID:/m) { + # read next entry and continue + $entry = $self->_readline; + %record = (); + next; + } + # loop through list of features and get field values + # place into record hash as array refs + foreach my $key (keys %feature_pat_map){ + $search=$feature_pat_map{$key}; + @results=$self->search_pattern($entry,'FALSE',$search,$search); + $record{$key} = @results ? [@results] : undef; + }#endfor + # terminate loop as this one hasn't been obsoleted + last; + } + + # we have reached the end-of-file ... + return unless %record; + + # special processing for CDD entries like pfam and smart + my ($PRESENT,@keep); + foreach my $key(keys %cddprefix){ + #print "check CDD $key\n"; + if($record{$key}) { + @keep=(); + foreach my $list (@{$record{$key}}) { + # replace AC with correct AC number + push(@keep,$cddprefix{$key}.$list); + } + # replace CDD ref with correctly prefixed AC number + $record{$key} = [@keep]; + } + } + # modify CDD references @=(); + if($record{CDD}) { + @keep=(); + foreach my $cdd (@{$record{CDD}}) { + $PRESENT = undef; + foreach my $key (keys %cddprefix) { + if ($cdd=~/$key/){ + $PRESENT = 1; + last; + } + } + push(@keep,$cdd) if(! $PRESENT); + } + $record{CDD} = [@keep]; + } + + # create annotation collection - we'll need it now + my $ann = Bio::Annotation::Collection->new(); + + foreach my $field(keys %dbname_map){ + $ann=read_dblink($ann,$dbname_map{$field},$record{$field}); + } + + # add GO link as an OntologyTerm annotation + if($record{GO}) { + for(my $j = 0; $j < @{$record{GO}}; $j++) { + my $goann = Bio::Annotation::OntologyTerm->new( + -identifier => $record{GO}->[$j], + -name => $record{GO_DESC}->[$j], + -ontology => $record{GO_CAT}->[$j]); + $ann->add_Annotation($goann); + } + } + + $ann=add_annotation_ref($ann,'URL',$record{LINK}); + $ann=add_annotation_ref($ann,'URL',$record{DB_LINK}); + + # presently we can't store types of dblinks - hence make unique + make_unique($ann,'dblink'); + + # everything else gets a simple tag or comment value annotation + foreach my $anntype (keys %anntype_map) { + foreach my $key (@{$anntype_map{$anntype}}){ + if($record{$key}){ + foreach (@{$record{$key}}){ + #print "$key\t\t$_\n"; + $ann=add_annotation($ann,$key,$_,$anntype); + } + } + } + } + + # flatten designated attributes into a scalar value + foreach my $field (keys %flatten_tags) { + if($record{$field}) { + $record{$field} = join($flatten_tags{$field}, + @{$record{$field}}); + } + } + + # annotation that expects the array flattened out + $ann=read_reference($ann,'PUBMED',$record{PMID}); + if($record{ASSEMBLY}) { + my @assembly=split(/,/,$record{ASSEMBLY}); + $ann=read_dblink($ann,'GenBank',\@assembly); + } + + # replace fields w/ alternate if original does not exist + foreach my $fieldval (keys %alternate_map){ + if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){ + $record{$fieldval}=$record{$alternate_map{$fieldval}}; + } + } + + # create sequence object (i.e., let seq.factory create one) + my $seq = $self->sequence_factory->create( + -verbose => $self->verbose(), + -accession_number => $record{LOCUSID}, + -desc => $record{OFFICIAL_GENE_NAME}, + -display_id => $record{OFFICIAL_SYMBOL}, + -species => read_species($record{ORGANISM}), + -annotation => $ann); + + # dump out object contents + # show_obj([$seq]); + + return($seq); +} + +################ +# +sub show_obj{ +# +################ + my ($seqlistref)=@_; + my @list=@$seqlistref; + my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' ); + my ($ann,@values,$val); + + foreach my $seq(@list){ + $out->write_seq($seq); + $ann=$seq->annotation; + foreach my $key ( $ann->get_all_annotation_keys() ) { + @values = $ann->get_Annotations($key); + foreach my $value ( @values ) { + # value is an Bio::AnnotationI, and defines a "as_text" method + $val=$value->as_text; + print "Annotation ",$key,"\t\t",$val,"\n"; + } + } + } +}#endsub + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/phd.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/phd.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,252 @@ +# $Id: phd.pm,v 1.17 2002/12/09 23:50:23 matsallac Exp $ +# +# Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::phd - .phd file input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L<Bio::SeqIO> class. + +=head1 DESCRIPTION + +This object can transform .phd files (from Phil Green's phred basecaller) +to and from Bio::Seq::SeqWithQuality objects + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR Chad Matsalla + +Chad Matsalla +bioinformatics@dieselwurks.com + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# 'Let the code begin... + +package Bio::SeqIO::phd; +use vars qw(@ISA); +use strict; +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::SeqWithQuality')); + } +} + +=head2 next_seq() + + Title : next_seq() + Usage : $swq = $stream->next_seq() + Function: returns the next phred sequence in the stream + Returns : Bio::Seq::SeqWithQuality object + Args : NONE + Notes : This is really redundant because AFAIK there is no such thing as + a .phd file that contains more then one sequence. It is included as + an interface thing and for consistency. + +=cut + +sub next_seq { + my ($self,@args) = @_; + my ($entry,$done,$qual,$seq); + my ($id,@lines, @bases, @qualities) = (''); + if (!($entry = $self->_readline)) { return; } + if ($entry =~ /^BEGIN_SEQUENCE\s+(\S+)/) { + $id = $1; + } + my $in_dna = 0; + my $base_number = 0; + while ($entry = $self->_readline) { + return if (!$entry); + chomp($entry); + if ($entry =~ /^BEGIN_CHROMAT:\s+(\S+)/) { + # this is where I used to grab the ID + if (!$id) { + $id = $1; + } + $entry = $self->_readline(); + } + if ($entry =~ /^BEGIN_DNA/) { + $entry =~ /^BEGIN_DNA/; + $in_dna = 1; + $entry = $self->_readline(); + } + if ($entry =~ /^END_DNA/) { + $in_dna = 0; + } + if ($entry =~ /^END_SEQUENCE/) { + } + if (!$in_dna) { next; } + $entry =~ /(\S+)\s+(\S+)/; + push @bases,$1; + push @qualities,$2; + push(@lines,$entry); + } + # $self->debug("csmCreating objects with id = $id\n"); + my $swq = $self->sequence_factory->create + (-seq => join('',@bases), + -qual => \@qualities, + -id => $id, + -primary_id => $id, + -display_id => $id, + ); + return $swq; +} + +=head2 write_seq + + Title : write_seq(-SeqWithQuality => $swq, <comments>) + Usage : $obj->write_seq( -SeqWithQuality => $swq,); + Function: Write out an scf. + Returns : Nothing. + Args : Requires: a reference to a SeqWithQuality object to form the + basis for the scf. Any other arguments are assumed to be comments + and are put into the comments section of the scf. Read the + specifications for scf to decide what might be good to put in here. + Notes : These are the comments that reside in the header of a phd file + at the present time. If not provided in the parameter list for + write_phd(), the following default values will be used: + CHROMAT_FILE: $swq->id() + ABI_THUMBPRINT: 0 + PHRED_VERSION: 0.980904.e + CALL_METHOD: phred + QUALITY_LEVELS: 99 + TIME: <current time> + TRACE_ARRAY_MIN_INDEX: 0 + TRACE_ARRAY_MAX_INDEX: unknown + CHEM: unknown + DYE: unknown + IMPORTANT: This method does not write the trace index where this + call was made. All base calls are placed at index 1. + + +=cut + +sub write_seq { + my ($self,@args) = @_; + my @phredstack; + my ($label,$arg); + + my ($swq, $chromatfile, $abithumb, + $phredversion, $callmethod, + $qualitylevels,$time, + $trace_min_index, + $trace_max_index, + $chem, $dye + ) = $self->_rearrange([qw(SEQWITHQUALITY + CHROMAT_FILE + ABI_THUMBPRINT + PHRED_VERSION + CALL_METHOD + QUALITY_LEVELS + TIME + TRACE_ARRAY_MIN_INDEX + TRACE_ARRAY_MAX_INDEX + CHEM + DYE + )], @args); + + unless (ref($swq) eq "Bio::Seq::SeqWithQuality") { + $self->throw("You must pass a Bio::Seq::SeqWithQuality object to write_scf as a parameter named \"SeqWithQuality\""); + } + my $id = $swq->id(); + if (!$id) { $id = "UNDEFINED in SeqWithQuality Object"; } + push @phredstack,("BEGIN_SEQUENCE $id","","BEGIN_COMMENT",""); + + $chromatfile = 'undefined in write_phd' unless defined $chromatfile; + push @phredstack,"CHROMAT_FILE: $chromatfile"; + + $abithumb = 0 unless defined $abithumb; + push @phredstack,"ABI_THUMBPRINT: $abithumb"; + + $phredversion = "0.980904.e" unless defined $phredversion; + push @phredstack,"PHRED_VERSION: $phredversion"; + + $callmethod = 'phred' unless defined $callmethod; + push @phredstack,"CALL_METHOD: $callmethod"; + + $qualitylevels = 99 unless defined $qualitylevels; + push @phredstack,"QUALITY_LEVELS: $qualitylevels"; + + $time = localtime() unless defined $time; + push @phredstack,"TIME: $time"; + + $trace_min_index = 0 unless defined $trace_min_index; + push @phredstack,"TRACE_ARRAY_MIN_INDEX: $trace_min_index"; + + $trace_max_index = '10000' unless defined $trace_max_index; + push @phredstack,"TRACE_ARRAY_MAX_INDEX: $trace_max_index"; + + $chem = 'unknown' unless defined $chem; + push @phredstack,"CHEM: $chem"; + + $dye = 'unknown' unless defined $dye; + push @phredstack, "DYE: $dye"; + + push @phredstack,("END_COMMENT","","BEGIN_DNA"); + + foreach (@phredstack) { $self->_print($_."\n"); } + + my $length = $swq->length(); + if ($length eq "DIFFERENT") { + $self->throw("Can't create the phd because the sequence and the quality in the SeqWithQuality object are of different lengths."); + } + for (my $curr = 1; $curr<=$length; $curr++) { + $self->_print (uc($swq->baseat($curr))." ". + $swq->qualat($curr)." 10". + "\n"); + } + $self->_print ("END_DNA\n\nEND_SEQUENCE\n"); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/pir.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/pir.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,157 @@ +# $Id: pir.pm,v 1.18 2002/10/25 16:23:16 jason Exp $ +# +# BioPerl module for Bio::SeqIO::PIR +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself +# +# _history +# October 18, 1999 Largely rewritten by Lincoln Stein + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::pir - PIR sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from pir flat +file databases. + +Note: This does not completely preserve the PIR format - quality +information about sequence is currently discarded since bioperl +does not have a mechanism for handling these encodings in sequence +data. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + +Aaron Mackey E<lt>amackey@virginia.eduE<gt> +Lincoln Stein E<lt>lstein@cshl.orgE<gt> +Jason Stajich E<lt>jason@bioperl.orgE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::pir; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +sub next_seq { + my ($self) = @_; + local $/ = "\n>"; + return unless my $line = $self->_readline; + if( $line eq '>' ) { # handle the very first one having no comment + return unless $line = $self->_readline; + } + my ($top, $desc,$seq) = ( $line =~ /^(.+?)\n(.+?)\n([^>]*)/s ) or + $self->throw("Cannot parse entry PIR entry [$line]"); + + + my ( $type,$id ) = ( $top =~ /^>?([PF])1;(\S+)\s*$/ ) or + $self->throw("PIR stream read attempted without leading '>P1;' [ $line ]"); + + # P - indicates complete protein + # F - indicates protein fragment + # not sure how to stuff these into a Bio object + # suitable for writing out. + $seq =~ s/\*//g; + $seq =~ s/[\(\)\.\/\=\,]//g; + $seq =~ s/\s+//g; # get rid of whitespace + + my ($alphabet) = ('protein'); + # TODO - not processing SFS data + return $self->sequence_factory->create + (-seq => $seq, + -primary_id => $id, + -id => $type. '1;' . $id, + -desc => $desc, + -alphabet => $alphabet + ); +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Array of Bio::PrimarySeqI objects + + +=cut + +sub write_seq { + my ($self, @seq) = @_; + for my $seq (@seq) { + $self->throw("Did not provide a valid Bio::PrimarySeqI object") + unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); + my $str = $seq->seq(); + return unless $self->_print(">".$seq->id(), + "\n", $seq->desc(), "\n", + $str, "*\n"); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/pln.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/pln.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,141 @@ +# $Id: pln.pm,v 1.7 2002/10/22 07:38:42 lapp Exp $ +# BioPerl module for Bio::SeqIO::pln +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::pln - pln trace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from pln trace +files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Aaron Mackey + +Email: amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::pln; +use vars qw(@ISA $READ_AVAIL); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +push @ISA, qw( Bio::SeqIO ); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; + if ($@) { + $READ_AVAIL = 0; + } else { + push @ISA, "Bio::SeqIO::staden::read"; + $READ_AVAIL = 1; + } +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq')); + } + unless ($READ_AVAIL) { + Bio::Root::Root->throw( -class => 'Bio::Root::SystemException', + -text => "Bio::SeqIO::staden::read is not available; make sure the bioperl-ext package has been installed successfully!" + ); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my ($self) = @_; + + my ($seq, $id, $desc, $qual) = $self->read_trace($self->_fh, 'pln'); + + # create the seq object + $seq = $self->sequence_factory->create(-seq => $seq, + -id => $id, + -primary_id => $id, + -desc => $desc, + -alphabet => 'DNA', + -qual => $qual + ); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + + my $fh = $self->_fh; + foreach my $seq (@seq) { + $self->write_trace($fh, $seq, 'pln'); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/qual.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/qual.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,235 @@ +# $Id: qual.pm,v 1.22 2002/12/27 19:42:32 birney Exp $ +# +# Copyright (c) 1997-9 bioperl, Chad Matsalla. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::Qual - .qual file input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class +(see L<Bio::SeqIO> for details). + +=head1 DESCRIPTION + +This object can transform .qual (similar to fasta) objects to and from +Bio::Seq::SeqWithQuality objects. See L<Bio::Seq::SeqWithQuality> for +details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR Chad Matsalla + +Chad Matsalla +bioinformatics@dieselwurks.com + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::qual; +use vars qw(@ISA); +use strict; +use Bio::SeqIO; +use Bio::Seq::SeqFactory; +require 'dumpvar.pl'; + +@ISA = qw(Bio::SeqIO); + + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::PrimaryQual')); + } +} + +=head2 next_seq() + + Title : next_seq() + Usage : $scf = $stream->next_seq() + Function: returns the next scf sequence in the stream + Returns : Bio::Seq::PrimaryQual object + Notes : Get the next quality sequence from the stream. + +=cut + +sub next_seq { + my ($self,@args) = @_; + my ($qual,$seq); + my $alphabet; + local $/ = "\n>"; + + return unless my $entry = $self->_readline; + + if ($entry eq '>') { # very first one + return unless $entry = $self->_readline; + } + + # original: my ($top,$sequence) = $entry =~ /^(.+?)\n([^>]*)/s + my ($top,$sequence) = $entry =~ /^(.+?)\n([^>]*)/s + or $self->throw("Can't parse entry [$entry]"); + my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/ + or $self->throw("Can't parse fasta header"); + $id =~ s/^>//; + # create the seq object + $sequence =~ s/\n+/ /g; + return $self->sequence_factory->create + (-qual => $sequence, + -id => $id, + -primary_id => $id, + -display_id => $id, + -desc => $fulldesc + ); +} + +=head2 _next_qual + + Title : _next_qual + Usage : $seq = $stream->_next_qual() (but do not do + that. Use $stream->next_seq() instead) + Function: returns the next quality in the stream + Returns : Bio::Seq::PrimaryQual object + Args : NONE + Notes : An internal method. Gets the next quality in + the stream. + +=cut + +sub _next_qual { + my $qual = next_primary_qual( $_[0], 1 ); + return $qual; +} + +=head2 next_primary_qual() + + Title : next_primary_qual() + Usage : $seq = $stream->next_primary_qual() + Function: returns the next sequence in the stream + Returns : Bio::PrimaryQual object + Args : NONE + +=cut + +sub next_primary_qual { + # print("CSM next_primary_qual!\n"); + my( $self, $as_next_qual ) = @_; + my ($qual,$seq); + local $/ = "\n>"; + + return unless my $entry = $self->_readline; + + if ($entry eq '>') { # very first one + return unless $entry = $self->_readline; + } + + my ($top,$sequence) = $entry =~ /^(.+?)\n([^>]*)/s + or $self->throw("Can't parse entry [$entry]"); + my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/ + or $self->throw("Can't parse fasta header"); + $id =~ s/^>//; + # create the seq object + $sequence =~ s/\n+/ /g; + if ($as_next_qual) { + $qual = Bio::Seq::PrimaryQual->new(-qual => $sequence, + -id => $id, + -primary_id => $id, + -display_id => $id, + -desc => $fulldesc + ); + } + return $qual; +} + +=head2 write_seq + + Title : write_seq(-source => $source, -header => "some information") + Usage : $obj->write_seq( -source => $source, + -header => "some information"); + Function: Write out an list of quality values to a fasta-style file. + Returns : Nothing. + Args : Requires: a reference to a SeqWithQuality object or a + PrimaryQual object as the -source. Optional: information + for the header. + Notes : If no -header is provided, $obj->id() will be used where + $obj is a reference to either a SeqWithQuality object or a + PrimaryQual object. If $source->id() fails, ">unknown" will be + the header. If the SeqWithQuality object has $source->length() of + "DIFFERENT" (read the pod, luke), write_seq will use the length + of the PrimaryQual object within the SeqWithQuality object. + +=cut + +sub write_seq { + my ($self,@args) = @_; + my ($source) = $self->_rearrange([qw(SOURCE)], @args); + + if (!$source || ( !$source->isa('Bio::Seq::SeqWithQuality') && + !$source->isa('Bio::Seq::PrimaryQual') )) { + $self->throw("You must pass a Bio::Seq::SeqWithQuality or a Bio::Seq::PrimaryQual object to write_seq as a parameter named \"source\""); + } + my $header = $source->id(); + if (!$header) { $header = "unknown"; } + my @quals = $source->qual(); + # ::dumpValue(\@quals); + $self->_print (">$header \n"); + my (@slice,$max,$length); + $length = $source->length(); + if ($length eq "DIFFERENT") { + $self->warn("You passed a SeqWithQuality object that contains a sequence and quality of differing lengths. Using the length of the PrimaryQual component of the SeqWithQuality object."); + $length = $source->qual_obj()->length(); + } + # print("Printing $header to a file.\n"); + for (my $count = 1; $count<=$length; $count+= 50) { + if ($count+50 > $length) { $max = $length; } + else { $max = $count+49; } + my @slice = @{$source->subqual($count,$max)}; + $self->_print (join(' ',@slice), "\n"); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/raw.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/raw.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,178 @@ +#----------------------------------------------------------------------------- +# PACKAGE : Bio::SeqIO::raw +# AUTHOR : Ewan Birney <birney@ebi.ac.uk> +# CREATED : Feb 16 1999 +# REVISION: $Id: raw.pm,v 1.15.2.1 2003/02/05 21:55:21 jason Exp $ +# +# Copyright (c) 1997-9 bioperl, Ewan Birney. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# _History_ +# +# Ewan Birney <birney@ebi.ac.uk> developed the SeqIO +# schema and the first prototype modules. +# +# This code is based on his Bio::SeqIO::Fasta module with +# the necessary minor tweaks necessary to get it to read +# and write raw formatted sequences made by +# chris dagdigian <dag@sonsorol.org> +# +# October 18, 1999 Largely rewritten by Lincoln Stein +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::raw - raw sequence file input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L<Bio::SeqIO> class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from raw flat +file databases. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS + + Ewan Birney E<lt>birney@ebi.ac.ukE<gt> + Lincoln Stein E<lt>lstein@cshl.orgE<gt> + +=head1 CONTRIBUTORS + + Jason Stajich E<lt>jason@bioperl.org<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::SeqIO::raw; +use strict; +use vars qw(@ISA); + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + + +=cut + +sub next_seq{ + my ($self,@args) = @_; + ## When its 1 sequence per line with no formatting at all, + ## grabbing it should be easy :) + + my $nextline = $self->_readline(); + if( !defined $nextline ){ return undef; } + + my $sequence = uc($nextline); + $sequence =~ s/\W//g; + + return $self->sequence_factory->create(-seq => $sequence); +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Array of Bio::PrimarySeqI objects + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + $self->throw("Must provide a valid Bio::PrimarySeqI object") + unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); + $self->_print($seq->seq, "\n") or return; + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 write_qual + + Title : write_qual + Usage : $stream->write_qual($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_qual { + my ($self,@seq) = @_; + my @qual = (); + foreach (@seq) { + unless ($_->isa("Bio::Seq::SeqWithQuality")){ + warn("You cannot write raw qualities without supplying a Bio::Seq::SeqWithQuality object! You passed a ", ref($_), "\n"); + next; + } + @qual = @{$_->qual}; + if(scalar(@qual) == 0) { + $qual[0] = "\n"; + } + + $self->_print (join " ", @qual,"\n") or return; + + } + return 1; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/scf.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/scf.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1262 @@ +# $Id: scf.pm,v 1.23 2002/11/01 11:16:25 heikki Exp $ +# +# Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::scf - .scf file input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class, see +L<Bio::SeqIO> for more information. + +=head1 DESCRIPTION + +This object can transform .scf files to and from +Bio::Seq::SeqWithQuality objects. Mechanisms are present to retrieve +trace data from scf files. + +=head1 FEEDBACK + +=head2 Mailing Lists + + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR Chad Matsalla + +Chad Matsalla +bioinformatics@dieselwurks.com + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org +Tony Cox, avc@sanger.ac.uk +Heikki Lehvaslaiho, heikki@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::scf; +use vars qw(@ISA $DEFAULT_QUALITY); +use strict; +use Bio::SeqIO; +use Bio::Seq::SeqFactory; +require 'dumpvar.pl'; + +BEGIN { + $DEFAULT_QUALITY= 10; +} + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::SeqWithQuality')); + } +} + +=head2 next_seq() + + Title : next_seq() + Usage : $scf = $stream->next_seq() + Function: returns the next scf sequence in the stream + Returns : Bio::Seq::SeqWithQuality object + Args : NONE + Notes : Fills the interface specification for SeqIO. + The SCF specification does not provide for having more then + one sequence in a given scf. So once the filehandle has been open + and passed to SeqIO don't expect to run this function more then + once on a given scf unless you embraced and extended the SCF + standard. (But that's just C R A Z Y talk, isn't it.) + +=cut + +#' +sub next_seq { + my ($self) = @_; + my ($seq, $seqc, $fh, $buffer, $offset, $length, $read_bytes, @read, + %names); + # set up a filehandle to read in the scf + $fh = $self->_filehandle(); + unless ($fh) { # simulate the <> function + if ( !fileno(ARGV) or eof(ARGV) ) { + return unless my $ARGV = shift; + open(ARGV,$ARGV) or + $self->throw("Could not open $ARGV for SCF stream reading $!"); + } + $fh = \*ARGV; + } + binmode $fh; # for the Win32/Mac crowds + return unless read $fh, $buffer, 128; # no exception; probably end of file + # the first thing to do is parse the header. This is common + # among all versions of scf. + $self->_set_header($buffer); + # the rest of the the information is different between the + # the different versions of scf. + my $byte = "n"; + if ($self->{'version'} lt "3.00") { + # first gather the trace information + $length = $self->{'samples'}*$self->{sample_size}*4; + $buffer = $self->read_from_buffer($fh,$buffer,$length); + if ($self->{sample_size} == 1) { + $byte = "c"; + } + @read = unpack "${byte}${length}",$buffer; + # these traces need to be split + $self->_set_v2_traces(\@read); + # now go and get the base information + $offset = $self->{bases_offset}; + $length = ($self->{bases} * 12); + seek $fh,$offset,0; + $buffer = $self->read_from_buffer($fh,$buffer,$length); + # now distill the information into its fractions. + $self->_set_v2_bases($buffer); + } else { + my $transformed_read; + foreach (qw(A C G T)) { + $length = $self->{'samples'}*$self->{sample_size}; + $buffer = $self->read_from_buffer($fh,$buffer,$length); + if ($self->{sample_size} == 1) { + $byte = "c"; + } + @read = unpack "${byte}${length}",$buffer; + # this little spurt of nonsense is because + # the trace values are given in the binary + # file as unsigned shorts but they really + # are signed. 30000 is an arbitrary number + # (will there be any traces with a given + # point greater then 30000? I hope not. + # once the read is read, it must be changed + # from relative + for (my $element=0; $element < scalar(@read); $element++) { + if ($read[$element] > 30000) { + $read[$element] = $read[$element] - 65536; + } + } + $transformed_read = $self->_delta(\@read,"backward"); + $self->{'traces'}->{$_} = join(' ',@{$transformed_read}); + } + # now go and get the peak index information + $offset = $self->{bases_offset}; + $length = ($self->{bases} * 4); + seek $fh,$offset,0; + $buffer = $self->read_from_buffer($fh,$buffer,$length); + $self->_set_v3_peak_indices($buffer); + # now go and get the accuracy information + $buffer = $self->read_from_buffer($fh,$buffer,$length); + $self->_set_v3_base_accuracies($buffer); + # OK, now go and get the base information. + $length = $self->{bases}; + $buffer = $self->read_from_buffer($fh,$buffer,$length); + $self->{'parsed'}->{'sequence'} = unpack("a$length",$buffer); + # now, finally, extract the calls from the accuracy information. + $self->_set_v3_quality($self); + } + # now go and get the comment information + $offset = $self->{comments_offset}; + seek $fh,$offset,0; + $length = $self->{comment_size}; + $buffer = $self->read_from_buffer($fh,$buffer,$length); + $self->_set_comments($buffer); + return $self->sequence_factory->create + (-seq => $self->{'parsed'}->{'sequence'}, + -qual => $self->{'parsed'}->{'qualities'}, + -id => $self->{'comments'}->{'NAME'} + ); +} + + +=head2 _set_v3_quality() + + Title : _set_v3_quality() + Usage : $self->_set_v3_quality() + Function: Set the base qualities from version3 scf's + Returns : Nothing. Alters $self. + Args : None. + Notes : + +=cut + +#' +sub _set_v3_quality { + my $self = shift; + my @bases = split//,$self->{'parsed'}->{'sequence'}; + my (@qualities,$currbase,$currqual,$counter); + for ($counter=0; $counter <= $#bases ; $counter++) { + $currbase = uc($bases[$counter]); + if ($currbase eq "A") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'A'}->[$counter]; } + elsif ($currbase eq "C") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'C'}->[$counter]; } + elsif ($currbase eq "G") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'G'}->[$counter]; } + elsif ($currbase eq "T") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'T'}->[$counter]; } + else { $currqual = "unknown"; } + push @qualities,$currqual; + } + $self->{'parsed'}->{'qualities'} = \@qualities; +} + +=head2 _set_v3_peak_indices($buffer) + + Title : _set_v3_peak_indices($buffer) + Usage : $self->_set_v3_peak_indices($buffer); + Function: Unpacks the base accuracies for version3 scf + Returns : Nothing. Alters $self + Args : A scalar containing binary data. + Notes : + +=cut + +sub _set_v3_peak_indices { + my ($self,$buffer) = @_; + my $length = length($buffer); + my ($offset,@read,@positions); + @read = unpack "N$length",$buffer; + $self->{'parsed'}->{'peak_indices'} = join(' ',@read); +} + +=head2 _set_v3_base_accuracies($buffer) + + Title : _set_v3_base_accuracies($buffer) + Usage : $self->_set_v3_base_accuracies($buffer) + Function: Set the base accuracies for version 3 scf's + Returns : Nothing. Alters $self. + Args : A scalar containing binary data. + Notes : + +=cut + +#' +sub _set_v3_base_accuracies { + my ($self,$buffer) = @_; + my $length = length($buffer); + my $qlength = $length/4; + my $offset = 0; + my (@qualities,@sorter,$counter,$round,$last_base); + foreach (qw(A C G T)) { + my @read; + $last_base = $offset + $qlength; + for (;$offset < $last_base; $offset += $qlength) { + @read = unpack "c$qlength", substr($buffer,$offset,$qlength); + $self->{'parsed'}->{'base_accuracies'}->{"$_"} = \@read; + } + } +} + + +=head2 _set_comments($buffer) + + Title : _set_comments($buffer) + Usage : $self->_set_comments($buffer); + Function: Gather the comments section from the scf and parse it into its + components. + Returns : Nothing. Modifies $self. + Args : The buffer. It is expected that the buffer contains a binary + string for the comments section of an scf file according to + the scf file specifications. + Notes : None. Works like Jello. + +=cut + +sub _set_comments { + my ($self,$buffer) = @_; + my $size = length($buffer); + my $comments_retrieved = unpack "a$size",$buffer; + $comments_retrieved =~ s/\0//; + my @comments_split = split/\n/,$comments_retrieved; + if (@comments_split) { + foreach (@comments_split) { + /(\w+)=(.*)/; + if ($1 && $2) { + $self->{'comments'}->{$1} = $2; + } + } + } + return; +} + +=head2 _set_header() + + Title : _set_header($buffer) + Usage : $self->_set_header($buffer); + Function: Gather the header section from the scf and parse it into its + components. + Returns : Nothing. Modifies $self. + Args : The buffer. It is expected that the buffer contains a binary + string for the header section of an scf file according to the + scf file specifications. + Notes : None. + +=cut + +sub _set_header { + my ($self,$buffer) = @_; + ($self->{'scf'}, + $self->{'samples'}, + $self->{'sample_offset'}, + $self->{'bases'}, + $self->{'bases_left_clip'}, + $self->{'bases_right_clip'}, + $self->{'bases_offset'}, + $self->{'comment_size'}, + $self->{'comments_offset'}, + $self->{'version'}, + $self->{'sample_size'}, + $self->{'code_set'}, + @{$self->{'header_spare'}} ) = unpack "a4 NNNNNNNN a4 NN N20", $buffer; + return; + +} + +=head2 _set_v2_bases($buffer) + + Title : _set_v2_bases($buffer) + Usage : $self->_set_v2_bases($buffer); + Function: Gather the bases section from the scf and parse it into its + components. + Returns : Nothing. Modifies $self. + Args : The buffer. It is expected that the buffer contains a binary + string for the bases section of an scf file according to the + scf file specifications. + Notes : None. + +=cut + +sub _set_v2_bases { + my ($self,$buffer) = @_; + my $length = length($buffer); + my ($offset2,$currbuff,$currbase,$currqual,$sequence,@qualities,@indices); + my @read; + for ($offset2=0;$offset2<$length;$offset2+=12) { + @read = unpack "N C C C C a C3", substr($buffer,$offset2,$length); + push @indices,$read[0]; + $currbase = uc($read[5]); + if ($currbase eq "A") { $currqual = $read[1]; } + elsif ($currbase eq "C") { $currqual = $read[2]; } + elsif ($currbase eq "G") { $currqual = $read[3]; } + elsif ($currbase eq "T") { $currqual = $read[4]; } + else { $currqual = "UNKNOWN"; } + $sequence .= $currbase; + push @qualities,$currqual; + } + unless (!@indices) { + $self->{'parsed'}->{'peak_indices'} = join(' ',@indices); + } + $self->{'parsed'}->{'sequence'} = $sequence; + unless (!@qualities) { + $self->{'parsed'}->{'qualities'} = join(' ',@qualities); + } +} + +=head2 _set_v2_traces(\@traces_array) + + Title : _set_v2_traces(\@traces_array) + Usage : $self->_set_v2_traces(\@traces_array); + Function: Parses an scf Version2 trace array into its base components. + Returns : Nothing. Modifies $self. + Args : A reference to an array of the unpacked traces section of an + scf version2 file. + +=cut + +sub _set_v2_traces { + my ($self,$rread) = @_; + my @read = @$rread; + my $array = 0; + for (my $offset2 = 0; $offset2< scalar(@read); $offset2+=4) { + if ($array) { + push @{$self->{'traces'}->{'A'}},$read[$offset2]; + push @{$self->{'traces'}->{'C'}},$read[$offset2+1]; + push @{$self->{'traces'}->{'G'}},$read[$offset2+3]; + push @{$self->{'traces'}->{'T'}},$read[$offset2+2]; + } else { + $self->{'traces'}->{'A'} .= " ".$read[$offset2]; + $self->{'traces'}->{'C'} .= " ".$read[$offset2+1]; + $self->{'traces'}->{'G'} .= " ".$read[$offset2+2]; + $self->{'traces'}->{'T'} .= " ".$read[$offset2+3]; + } + } + return; +} + +=head2 get_trace($base_channel) + + Title : get_trace($base_channel) + Usage : @a_trace = @{$obj->get_trace("A")}; + Function: Return the trace data for the given base. + Returns : A reference to an array containing the trace data for the + given base. + Args : A,C,G, or T. Any other input throws. + Notes : + +=cut + +sub get_trace { + my ($self,$base_channel) = @_; + $base_channel =~ tr/a-z/A-Z/; + if ($base_channel !~ /A|T|G|C/) { + $self->throw("You tried to ask for a base channel that wasn't A,T,G, or C. Ask for one of those next time."); + } elsif ($base_channel) { + my @temp = split(' ',$self->{'traces'}->{$base_channel}); + return \@temp; + } +} + +=head2 get_peak_indices() + + Title : get_peak_indices() + Usage : @a_trace = @{$obj->get_peak_indices()}; + Function: Return the peak indices for this scf. + Returns : A reference to an array containing the peak indices for this scf. + Args : None. + Notes : + +=cut + +sub get_peak_indices { + my ($self) = shift; + my @temp = split(' ',$self->{'parsed'}->{'peak_indices'}); + return \@temp; +} + + +=head2 get_header() + + Title : get_header() + Usage : %header = %{$obj->get_header()}; + Function: Return the header for this scf. + Returns : A reference to a hash containing the header for this scf. + Args : None. + Notes : + +=cut + +sub get_header { + my ($self) = shift; + my %header; + foreach (qw(scf samples sample_offset bases bases_left_clip + bases_right_clip bases_offset comment_size comments_offset + version sample_size code_set peak_indices)) { + $header{"$_"} = $self->{"$_"}; + } + return \%header; +} + +=head2 _dump_traces_incoming($transformed) + + Title : _dump_traces_incoming("transformed") + Usage : &_dump_traces($ra,$rc,$rg,$rt); + Function: Used in debugging. Prints all traces one beside each other. + Returns : Nothing. + Args : References to the arrays containing the traces for A,C,G,T. + Notes : Beats using dumpValue, I'll tell ya. Much better then using + join' ' too. + - if a scalar is included as an argument (any scalar), this + procedure will dump the _delta'd trace. If you don't know what + that means you should not be using this. + +=cut + +#' +sub _dump_traces_incoming { + my ($self) = @_; + my (@sA,@sT,@sG,@sC); + # @sA = @{$self->{'traces'}->{'A'}}; + # @sC = @{$self->{'traces'}->{'C'}}; + # @sG = @{$self->{'traces'}->{'G'}}; + # @sT = @{$self->{'traces'}->{'T'}}; + @sA = @{$self->get_trace('A')}; + @sC = @{$self->get_trace('C')}; + @sG = @{$self->get_trace('G')}; + @sT = @{$self->get_trace('t')}; + print ("Count\ta\tc\tg\tt\n"); + for (my $curr=0; $curr < scalar(@sG); $curr++) { + print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n"); + } + return; +} + +=head2 _dump_traces_outgoing($transformed) + + Title : _dump_traces_outgoing("transformed") + Usage : &_dump_traces_outgoing(($ra,$rc,$rg,$rt); + Function: Used in debugging. Prints all traces one beside each other. + Returns : Nothing. + Args : References to the arrays containing the traces for A,C,G,T. + Notes : Beats using dumpValue, I\'ll tell ya. Much better then using + join' ' too. + - if a scalar is included as an argument (any scalar), this + procedur will dump the _delta'd trace. If you don't know what + that means you should not be using this. + +=cut + +sub _dump_traces_outgoing { + my ($self,$transformed) = @_; + my (@sA,@sT,@sG,@sC); + if ($transformed) { + @sA = @{$self->{'text'}->{'t_samples_a'}}; + @sC = @{$self->{'text'}->{'t_samples_c'}}; + @sG = @{$self->{'text'}->{'t_samples_g'}}; + @sT = @{$self->{'text'}->{'t_samples_t'}}; + } + else { + @sA = @{$self->{'text'}->{'samples_a'}}; + @sC = @{$self->{'text'}->{'samples_c'}}; + @sG = @{$self->{'text'}->{'samples_g'}}; + @sT = @{$self->{'text'}->{'samples_t'}}; + } + print ("Count\ta\tc\tg\tt\n"); + for (my $curr=0; $curr < scalar(@sG); $curr++) { + print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n"); + } + return; +} + +=head2 write_seq + + Title : write_seq(-SeqWithQuality => $swq, <comments>) + Usage : $obj->write_seq( -SeqWithQuality => $swq, + -version => 2, + -CONV => "Bioperl-Chads Mighty SCF writer."); + Function: Write out an scf. + Returns : Nothing. + Args : Requires: a reference to a SeqWithQuality object to form the + basis for the scf. + if -version is provided, it should be "2" or "3". A SCF of that + version will be written. + Any other arguments are assumed to be comments and are put into + the comments section of the scf. Read the specifications for scf + to decide what might be good to put in here. + + Notes : + Someday: (All of this stuff is easy easy easy I just don't have + the requirement or the time.) + - Change the peak scaling factor? + - Change the width of the peak? + - Change the overlap between peaks? + +=cut + +#' +sub write_seq { + my ($self,%args) = @_; + my %comments; + my ($label,$arg); + + my ($swq) = $self->_rearrange([qw(SEQWITHQUALITY)], %args); + unless (ref($swq) eq "Bio::Seq::SeqWithQuality") { + $self->throw("You must pass a Bio::Seq::SeqWithQuality object to write_seq as a parameter named \"SeqWithQuality\""); + } + # verify that there is some sequence or some qualities + # If the $swq with quality has no qualities, set all qualities to 0. + # If the $swq has no sequence, set the sequence to N\'s. + $self->_fill_missing_data($swq); + + # all of the rest of the arguments are comments for the scf + foreach $arg (sort keys %args) { + next if ($arg =~ /SeqWithQuality/i); + ($label = $arg) =~ s/^\-//; + $comments{$label} = $args{$arg}; + } + if (!$comments{'NAME'}) { $comments{'NAME'} = $swq->id(); } + # HA! Bwahahahaha. + $comments{'CONV'} = "Bioperl-Chads Mighty SCF writer." unless defined $comments{'CONV'}; + # now deal with the version of scf they want to write + if ($comments{version}) { + if ($comments{version} != 2 && $comments{version} != 3) { + $self->warn("This module can only write version 2.0 or 3.0 scf's. Writing a version 2.0 scf by default."); + $comments{version} = "2.00"; + } + if ($comments{'version'} > 2) { + $comments{'version'} = "3.00"; + } + } + else { + $comments{'version'} = "2.00"; + } + + + + # set a few things in the header + $self->{'header'}->{'magic'} = ".scf"; + $self->{'header'}->{'sample_size'} = "2"; + $self->{'header'}->{'bases'} = length($swq->seq()); + $self->{'header'}->{'bases_left_clip'} = "0"; + $self->{'header'}->{'bases_right_clip'} = "0"; + $self->{'header'}->{'version'} = $comments{'version'}; + $self->{'header'}->{'sample_size'} = "2"; + $self->{'header'}->{'code_set'} = "9"; + @{$self->{'header'}->{'spare'}} = qw(0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0); + + # create the binary for the comments and file it in $self->{'binaries'}->{'comments'} + $self->_set_binary_comments(\%comments); + # create the binary and the strings for the traces, bases, offsets (if necessary), and accuracies (if necessary) + $self->_set_binary_tracesbases($comments{'version'},$swq->seq(),$swq->qual()); + + # now set more things in the header + $self->{'header'}->{'samples_offset'} = "128"; + + my ($b_base_offsets,$b_base_accuracies,$samples_size,$bases_size); + # + # version 2 + # + if ($self->{'header'}->{'version'} == 2) { + $samples_size = $self->{'header'}->{'samples'} * 4 * + $self->{'header'}->{'sample_size'}; + $bases_size = length($swq->seq()) * 12; + $self->{'header'}->{'bases_offset'} = 128 + length($self->{'binaries'}->{'samples_all'}); + $self->{'header'}->{'comments_offset'} = 128 + length($self->{'binaries'}->{'samples_all'}) + length($self->{'binaries'}->{'v2_bases'}); + $self->{'header'}->{'comments_size'} = length($self->{'binaries'}->{'comments'}); + $self->{'header'}->{'private_size'} = "0"; + $self->{'header'}->{'private_offset'} = 128 + $samples_size + + $bases_size + $self->{'header'}->{'comments_size'}; + } + else { + $self->{'header'}->{'bases_offset'} = 128 + length($self->{'binaries'}->{'samples_all'}); + $self->{'header'}->{'comments_size'} = length($self->{'binaries'}->{'comments'}); + # this is: + # bases_offset + base_offsets + accuracies + called_bases + reserved + $self->{'header'}->{'comments_offset'} = $self->{'header'}->{'bases_offset'} + 4*$self->{header}->{'bases'} + 4*$self->{header}->{'bases'} + $self->{header}->{'bases'} + 3*$self->{header}->{'bases'}; + $self->{'header'}->{'private_size'} = "0"; + $self->{'header'}->{'private_offset'} = $self->{'header'}->{'comments_offset'} + $self->{'header'}->{'comments_size'}; + } + + $self->_set_binary_header(); + + # should something better be done rather then returning after + # writing? I don't do any exception trapping here + if ($comments{'version'} == 2) { + # print ("Lengths:\n"); + # print("Header : ".length($self->{'binaries'}->{'header'})."\n"); + # print("Traces : ".length($self->{'binaries'}->{'samples_all'})."\n"); + # print("Bases : ".length($self->{'binaries'}->{'v2_bases'})."\n"); + # print("Comments: ".length($self->{'binaries'}->{'comments'})."\n"); + $self->_print ($self->{'binaries'}->{'header'}) or return; + $self->_print ($self->{'binaries'}->{'samples_all'}) or return; + $self->_print ($self->{'binaries'}->{'v2_bases'}) or return; + $self->_print ($self->{'binaries'}->{'comments'}) or return; + } + elsif ($comments{'version'} ==3) { + # print ("Lengths:\n"); + # print("Header : ".length($self->{'binaries'}->{'header'})."\n"); + # print("Traces : ".length($self->{'binaries'}->{'samples_all'})."\n"); + # print("Offsets : ".length($self->{'binaries'}->{'v3_peak_offsets'})."\n"); + # print("Accuracy: ".length($self->{'binaries'}->{'v3_accuracies_all'})."\n"); + # print("Bases : ".length($self->{'binaries'}->{'v3_called_bases'})."\n"); + # print("Reserved: ".length($self->{'binaries'}->{'v3_reserved'})."\n"); + # print("Comments: ".length($self->{'binaries'}->{'comments'})."\n"); + $self->{'header'}->{'comments_offset'} = + 128+length($self->{'binaries'}->{'samples_all'})+ + length($self->{'binaries'}->{'v3_peak_offsets'})+ + length($self->{'binaries'}->{'v3_accuracies_all'})+ + length($self->{'binaries'}->{'v3_called_bases'})+ + length($self->{'binaries'}->{'v3_reserved'}); + $self->{'header'}->{'spare'}->[1] = + $self->{'header'}->{'comments_offset'} + + length($self->{'binaries'}->{'comments'}); + $self->_set_binary_header(); + $self->_print ($self->{'binaries'}->{'header'}) or print("Couldn't write header\n"); + $self->_print ($self->{'binaries'}->{'samples_all'}) or print("Couldn't write samples\n"); + $self->_print ($self->{'binaries'}->{'v3_peak_offsets'}) or print("Couldn't write peak offsets\n"); + $self->_print ($self->{'binaries'}->{'v3_accuracies_all'}) or print("Couldn't write accuracies\n"); + $self->_print ($self->{'binaries'}->{'v3_called_bases'}) or print("Couldn't write called_bases\n"); + $self->_print ($self->{'binaries'}->{'v3_reserved'}) or print("Couldn't write reserved\n"); + $self->_print ($self->{'binaries'}->{'comments'}) or print ("Couldn't write comments\n"); + } + + # kinda unnecessary, given the close() below, but maybe that'll go + # away someday. + $self->flush if $self->_flush_on_write && defined $self->_fh; + + $self->close(); +} + +=head2 _set_binary_header() + + Title : _set_binary_header(); + Usage : $self->_set_binary_header(); + Function: Provide the binary string that will be used as the header for + a scfv2 document. + Returns : A binary string. + Args : None. Uses the entries in the $self->{'header'} hash. These + are set on construction of the object (hopefully correctly!). + Notes : + +=cut + +sub _set_binary_header { + my ($self) = shift; + my $binary = pack "a4 NNNNNNNN a4 NN N20", + ( + $self->{'header'}->{'magic'}, + $self->{'header'}->{'samples'}, + $self->{'header'}->{'samples_offset'}, + $self->{'header'}->{'bases'}, + $self->{'header'}->{'bases_left_clip'}, + $self->{'header'}->{'bases_right_clip'}, + $self->{'header'}->{'bases_offset'}, + $self->{'header'}->{'comments_size'}, + $self->{'header'}->{'comments_offset'}, + $self->{'header'}->{'version'}, + $self->{'header'}->{'sample_size'}, + $self->{'header'}->{'code_set'}, + @{$self->{'header'}->{'spare'}}); + $self->{'binaries'}->{'header'} = $binary; +} + +=head2 _set_binary_tracesbases($version,$sequence,$ref_quality) + + Title : _set_binary_tracesbases($version,$sequence,$ref_quality) + Usage : $self->_set_binary_tracesbases($version,$sequence, + $ref_quality); + Function: Constructs the trace and base strings for all scfs + Returns : Nothing. Alters self. + Args : $version - "2" or "3" + $sequence - a scalar containing arbitrary sequence data + $ref_quality - a reference to an array containing quality + values + Notes : This is a really complicated thing. + +=cut + +sub _set_binary_tracesbases { + my ($self,$version,$sequence,$rqual) = @_; + $sequence =~ tr/a-z/A-Z/; + $self->{'info'}->{'sequence'} = $sequence; + $self->{'info'}->{'sequence_length'} = length($sequence); + my @quals = @$rqual; + # build the ramp for the first base. + # a ramp looks like this "1 4 13 29 51 71 80 71 51 29 13 4 1" times the quality score. + # REMEMBER: A C G T + # note to self-> smooth this thing out a bit later + @{$self->{'text'}->{'ramp'}} = qw( 1 4 13 29 51 75 80 75 51 29 13 4 1 ); + # the width of the ramp + $self->{'text'}->{'ramp_width'} = scalar(@{$self->{'text'}->{'ramp'}}); + # how far should the peaks overlap? + $self->{'text'}->{'ramp_overlap'} = 1; + # where should the peaks be located? + $self->{'text'}->{'peak_at'} = 7; + $self->{'text'}->{'ramp_total_length'} = + $self->{'info'}->{'sequence_length'} * $self->{'text'}->{'ramp_width'} + - $self->{'info'}->{'sequence_length'} * $self->{'text'}->{'ramp_overlap'}; + # create some empty arrays + # my (@sam_a,@sam_c,@sam_g,@sam_t,$pos); + my $pos; + my $total_length = $self->{'text'}->{ramp_total_length}; + for ($pos=0;$pos<=$total_length;$pos++) { + $self->{'text'}->{'samples_a'}[$pos] = $self->{'text'}->{'samples_c'}[$pos] + = $self->{'text'}->{'samples_g'}[$pos] = $self->{'text'}->{'samples_t'}[$pos] = "0"; + } + # $self->_dump_traces(); + # now populate them + my ($current_base,$place_base_at,$peak_quality,$ramp_counter,$current_ramp,$ramp_position); + my $sequence_length = $self->{'info'}->{'sequence_length'}; + my $half_ramp = int($self->{'text'}->{'ramp_width'}/2); + for ($pos = 0; $pos<$sequence_length;$pos++) { + $current_base = substr($self->{'info'}->{'sequence'},$pos,1); + # where should the peak for this base be placed? Modeled after a mktrace scf + $place_base_at = ($pos * $self->{'text'}->{'ramp_width'}) - + ($pos * $self->{'text'}->{'ramp_overlap'}) - + $half_ramp + $self->{'text'}->{'ramp_width'} - 1; + push @{$self->{'text'}->{'v3_peak_offsets'}},$place_base_at; + $peak_quality = $quals[$pos]; + if ($current_base eq "A") { + $ramp_position = $place_base_at - $half_ramp; + for ($current_ramp = 0; $current_ramp < $self->{'text'}->{'ramp_width'}; $current_ramp++) { + $self->{'text'}->{'samples_a'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp]; + } + push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,$peak_quality,0,0,0,$current_base,0,0,0); + push @{$self->{'text'}->{'v3_base_accuracy_a'}},$peak_quality; + foreach (qw(g c t)) { + push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0; + } + } + elsif ($current_base eq "C") { + $ramp_position = $place_base_at - $half_ramp; + for ($current_ramp = 0; $current_ramp < $self->{'text'}->{'ramp_width'}; $current_ramp++) { + $self->{'text'}->{'samples_c'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp]; + } + push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,0,$peak_quality,0,0,$current_base,0,0,0); + push @{$self->{'text'}->{'v3_base_accuracy_c'}},$peak_quality; + foreach (qw(g a t)) { + push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0; + } + } elsif ($current_base eq "G") { + $ramp_position = $place_base_at - $half_ramp; + for ($current_ramp = 0; + $current_ramp < $self->{'text'}->{'ramp_width'}; + $current_ramp++) { + $self->{'text'}->{'samples_g'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp]; + } + push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,0,0,$peak_quality,0,$current_base,0,0,0); + push @{$self->{'text'}->{"v3_base_accuracy_g"}},$peak_quality; + foreach (qw(a c t)) { + push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0; + } + } + elsif( $current_base eq "T" ) { + $ramp_position = $place_base_at - $half_ramp; + for ($current_ramp = 0; $current_ramp < $self->{'text'}->{'ramp_width'}; $current_ramp++) { + $self->{'text'}->{'samples_t'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp]; + } + push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,0,0,0,$peak_quality,$current_base,0,0,0); + push @{$self->{'text'}->{'v3_base_accuracy_t'}},$peak_quality; + foreach (qw(g c a)) { + push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0; + } + } elsif ($current_base eq "N") { + $ramp_position = $place_base_at - $half_ramp; + for ($current_ramp = 0; + $current_ramp < $self->{'text'}->{'ramp_width'}; + $current_ramp++) { + $self->{'text'}->{'samples_a'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp]; + } + push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,$peak_quality, + $peak_quality,$peak_quality,$peak_quality, + $current_base,0,0,0); + foreach (qw(a c g t)) { + push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0; + } + } + else { + # don't print this. + # print ("The current base ($current_base) is not a base. Hmmm.\n"); + } + } + foreach (qw(a c g t)) { + pop @{$self->{'text'}->{"samples_$_"}}; + } + + # set the samples in the header + $self->{'header'}->{'samples'} = scalar(@{$self->{'text'}->{'samples_a'}}); + + # create the final trace string (this is version dependent) + $self->_make_trace_string($version); + # create the binary for v2 bases + if ($self->{'header'}->{'version'} == 2) { + my ($packstring,@pack_array,$pos2,$tester,@unpacked); + for ($pos = 0; $pos<$sequence_length;$pos++) { + my @pack_array = @{$self->{'text'}->{'v2_bases'}}[$pos*9..$pos*9+8]; + $self->{'binaries'}->{'v2_bases'} .= pack "N C C C C a C3",@pack_array; + } + # now create the binary for the traces + my $trace_pack_length = scalar(@{$self->{'text'}->{'samples_all'}}); + $self->{'binaries'}->{'samples_all'} .= pack "n$trace_pack_length",@{$self->{'text'}->{'samples_all'}}; + } + else { + # now for the version 3 stuff! + # delta the trace data + my @temp; + foreach (qw(a c g t)) { + $self->{'text'}->{"t_samples_$_"} = $self->_delta($self->{'text'}->{"samples_$_"},"forward"); + if ($_ eq 'a') { + @temp = @{$self->{'text'}->{"t_samples_a"}}; + @{$self->{'text'}->{'samples_all'}} = @{$self->{'text'}->{"t_samples_a"}}; + } + else { + push @{$self->{'text'}->{'samples_all'}},@{$self->{'text'}->{"t_samples_$_"}}; + } + } + # now create the binary for the traces + my $trace_pack_length = scalar(@{$self->{'text'}->{'samples_all'}}); + + $self->{'binaries'}->{'samples_all'} .= pack "n$trace_pack_length",@{$self->{'text'}->{'samples_all'}}; + + # peak offsets + my $length = scalar(@{$self->{'text'}->{'v3_peak_offsets'}}); + $self->{'binaries'}->{'v3_peak_offsets'} = pack "N$length",@{$self->{'text'}->{'v3_peak_offsets'}}; + # base accuracies + @{$self->{'text'}->{'v3_accuracies_all'}} = @{$self->{'text'}->{"v3_base_accuracy_a"}}; + foreach (qw(c g t)) { + @{$self->{'text'}->{'v3_accuracies_all'}} = (@{$self->{'text'}->{'v3_accuracies_all'}},@{$self->{'text'}->{"v3_base_accuracy_$_"}}); + } + $length = scalar(@{$self->{'text'}->{'v3_accuracies_all'}}); + + $self->{'binaries'}->{'v3_accuracies_all'} = pack "c$length",@{$self->{'text'}->{'v3_accuracies_all'}}; + # called bases + $length = length($self->{'info'}->{'sequence'}); + my @seq = split(//,$self->{'info'}->{'sequence'}); + # pack the string + $self->{'binaries'}->{'v3_called_bases'} = $self->{'info'}->{'sequence'}; + # finally, reserved for future use + $length = $self->{'info'}->{'sequence_length'}; + for (my $counter=0; $counter < $length; $counter++) { + push @temp,0; + } + $self->{'binaries'}->{'v3_reserved'} = pack "N$length",@temp; + } +} + +=head2 _make_trace_string($version) + + Title : _make_trace_string($version) + Usage : $self->_make_trace_string($version) + Function: Merges trace data for the four bases to produce an scf + trace string. _requires_ $version + Returns : Nothing. Alters $self. + Args : $version - a version number. "2" or "3" + Notes : + +=cut + +sub _make_trace_string { + my ($self,$version) = @_; + my @traces; + my @traces_view; + my @as = @{$self->{'text'}->{'samples_a'}}; + my @cs = @{$self->{'text'}->{'samples_c'}}; + my @gs = @{$self->{'text'}->{'samples_g'}}; + my @ts = @{$self->{'text'}->{'samples_t'}}; + if ($version == 2) { + for (my $curr=0; $curr < scalar(@as); $curr++) { + $as[$curr] = $DEFAULT_QUALITY unless defined $as[$curr]; + $cs[$curr] = $DEFAULT_QUALITY unless defined $cs[$curr]; + $gs[$curr] = $DEFAULT_QUALITY unless defined $gs[$curr]; + $ts[$curr] = $DEFAULT_QUALITY unless defined $ts[$curr]; + push @traces,($as[$curr],$cs[$curr],$gs[$curr],$ts[$curr]); + } + } + elsif ($version == 3) { + @traces = (@as,@cs,@gs,@ts); + } + else { + $self->throw("No idea what version required to make traces here. You gave #$version# Bailing."); + } + my $length = scalar(@traces); + $self->{'text'}->{'samples_all'} = \@traces; + +} + +=head2 _set_binary_comments(\@comments) + + Title : _set_binary_comments(\@comments) + Usage : $self->_set_binary_comments(\@comments); + Function: Provide a binary string that will be the comments section of + the scf file. See the scf specifications for detailed + specifications for the comments section of an scf file. Hint: + CODE=something\nBODE=something\n\0 + Returns : Nothing. Alters self. + Args : A reference to an array containing comments. + Notes : None. + +=cut + +sub _set_binary_comments { + my ($self,$rcomments) = @_; + my $comments_string = ''; + my %comments = %$rcomments; + foreach my $key (sort keys %comments) { + $comments{$key} ||= ''; + $comments_string .= "$key=$comments{$key}\n"; + } + $comments_string .= "\n\0"; + $self->{'header'}->{'comments'} = $comments_string; + my $length = length($comments_string); + $self->{'binaries'}->{'comments'} = pack "A$length",$comments_string; + $self->{'header'}->{'comments'} = $comments_string; +} + +=head2 _fill_missing_data($swq) + + Title : _fill_missing_data($swq) + Usage : $self->_fill_missing_data($swq); + Function: If the $swq with quality has no qualities, set all qualities + to 0. + If the $swq has no sequence, set the sequence to N's. + Returns : Nothing. Modifies the SeqWithQuality that was passed as an + argument. + Args : A reference to a Bio::Seq::SeqWithQuality + Notes : None. + +=cut + +#' +sub _fill_missing_data { + my ($self,$swq) = @_; + my $qual_obj = $swq->qual_obj(); + my $seq_obj = $swq->seq_obj(); + if ($qual_obj->length() == 0 && $seq_obj->length() != 0) { + my $fake_qualities = ("$DEFAULT_QUALITY ")x$seq_obj->length(); + $swq->qual($fake_qualities); + } + if ($seq_obj->length() == 0 && $qual_obj->length != 0) { + my $sequence = ("N")x$qual_obj->length(); + $swq->seq($sequence); + } +} + +=head2 _delta(\@trace_data,$direction) + + Title : _delta(\@trace_data,$direction) + Usage : $self->_delta(\@trace_data,$direction); + Function: + Returns : A reference to an array containing modified trace values. + Args : A reference to an array containing trace data and a string + indicating the direction of conversion. ("forward" or + "backward"). + Notes : This code is taken from the specification for SCF3.2. + http://www.mrc-lmb.cam.ac.uk/pubseq/manual/formats_unix_4.html + +=cut + + +sub _delta { + my ($self,$rsamples,$direction) = @_; + my @samples = @$rsamples; + # /* If job == DELTA_IT: + # * change a series of sample points to a series of delta delta values: + # * ie change them in two steps: + # * first: delta = current_value - previous_value + # * then: delta_delta = delta - previous_delta + # * else + # * do the reverse + # */ + # int i; + # uint_2 p_delta, p_sample; + + my ($i,$num_samples,$p_delta,$p_sample,@samples_converted); + + # c-programmers are funny people with their single-letter variables + + if ( $direction eq "forward" ) { + $p_delta = 0; + for ($i=0; $i < scalar(@samples); $i++) { + $p_sample = $samples[$i]; + $samples[$i] = $samples[$i] - $p_delta; + $p_delta = $p_sample; + } + $p_delta = 0; + for ($i=0; $i < scalar(@samples); $i++) { + $p_sample = $samples[$i]; + $samples[$i] = $samples[$i] - $p_delta; + $p_delta = $p_sample; + } + } + elsif ($direction eq "backward") { + $p_sample = 0; + for ($i=0; $i < scalar(@samples); $i++) { + $samples[$i] = $samples[$i] + $p_sample; + $p_sample = $samples[$i]; + } + $p_sample = 0; + for ($i=0; $i < scalar(@samples); $i++) { + $samples[$i] = $samples[$i] + $p_sample; + $p_sample = $samples[$i]; + } + } + else { + $self->warn("Bad direction. Use \"forward\" or \"backward\"."); + } + return \@samples; +} + +=head2 _unpack_magik($buffer) + + Title : _unpack_magik($buffer) + Usage : $self->_unpack_magik($buffer) + Function: What unpack specification should be used? Try them all. + Returns : Nothing. + Args : A buffer containing arbitrary binary data. + Notes : Eliminate the ambiguity and the guesswork. Used in the + adaptation of _delta(), mostly. + +=cut + +sub _unpack_magik { + my ($self,$buffer) = @_; + my $length = length($buffer); + my (@read,$counter); + foreach (qw(c C s S i I l L n N v V)) { + @read = unpack "$_$length", $buffer; + print ("----- Unpacked with $_\n"); + for ($counter=0; $counter < 20; $counter++) { + print("$read[$counter]\n"); + } + } +} + +=head2 read_from_buffer($filehandle,$buffer,$length) + + Title : read_from_buffer($filehandle,$buffer,$length) + Usage : $self->read_from_buffer($filehandle,$buffer,$length); + Function: Read from the buffer. + Returns : $buffer, containing a read of $length + Args : a filehandle, a buffer, and a read length + Notes : I just got tired of typing + "unless (length($buffer) == $length)" so I put it here. + +=cut + +sub read_from_buffer { + my ($self,$fh,$buffer,$length) = @_; + read $fh, $buffer, $length; + unless (length($buffer) == $length) { + $self->warn("The read was incomplete! Trying harder."); + my $missing_length = $length - length($buffer); + my $buffer2; + read $fh,$buffer2,$missing_length; + $buffer .= $buffer2; + if (length($buffer) != $length) { + $self->throw("Unexpected end of file while reading from SCF file. I should have read $length but instead got ".length($buffer)."! Current file position is ".tell($fh)."."); + } + } + + return $buffer; +} + +=head2 _dump_keys() + + Title : _dump_keys() + Usage : &_dump_keys($a_reference_to_some_hash) + Function: Dump out the keys in a hash. + Returns : Nothing. + Args : A reference to a hash. + Notes : A debugging method. + +=cut + +sub _dump_keys { + my $rhash = shift; + if ($rhash !~ /HASH/) { + print("_dump_keys: that was not a hash.\nIt was #$rhash# which was this reference:".ref($rhash)."\n"); + return; + } + print("_dump_keys: The keys for $rhash are:\n"); + foreach (sort keys %$rhash) { + print("$_\n"); + } +} + +=head2 _dump_base_accuracies() + + Title : _dump_base_accuracies() + Usage : $self->_dump_base_accuracies(); + Function: Dump out the v3 base accuracies in an easy to read format. + Returns : Nothing. + Args : None. + Notes : A debugging method. + +=cut + +sub _dump_base_accuracies { + my $self = shift; + print("Dumping base accuracies! for v3\n"); + print("There are this many elements in a,c,g,t:\n"); + print(scalar(@{$self->{'text'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n"); + my $number_traces = scalar(@{$self->{'text'}->{'v3_base_accuracy_a'}}); + for (my $counter=0; $counter < $number_traces; $counter++ ) { + print("$counter\t"); + print $self->{'text'}->{'v3_base_accuracy_a'}->[$counter]."\t"; + print $self->{'text'}->{'v3_base_accuracy_c'}->[$counter]."\t"; + print $self->{'text'}->{'v3_base_accuracy_g'}->[$counter]."\t"; + print $self->{'text'}->{'v3_base_accuracy_t'}->[$counter]."\t"; + print("\n"); + } +} + +=head2 _dump_peak_indices_incoming() + + Title : _dump_peak_indices_incoming() + Usage : $self->_dump_peak_indices_incoming(); + Function: Dump out the v3 peak indices in an easy to read format. + Returns : Nothing. + Args : None. + Notes : A debugging method. + +=cut + +sub _dump_peak_indices_incoming { + my $self = shift; + print("Dump peak indices incoming!\n"); + my $length = $self->{'bases'}; + print("The length is $length\n"); + for (my $count=0; $count < $length; $count++) { + print("$count\t$self->{parsed}->{peak_indices}->[$count]\n"); + } +} + +=head2 _dump_base_accuracies_incoming() + + Title : _dump_base_accuracies_incoming() + Usage : $self->_dump_base_accuracies_incoming(); + Function: Dump out the v3 base accuracies in an easy to read format. + Returns : Nothing. + Args : None. + Notes : A debugging method. + +=cut + +sub _dump_base_accuracies_incoming { + my $self = shift; + print("Dumping base accuracies! for v3\n"); + # print("There are this many elements in a,c,g,t:\n"); + # print(scalar(@{$self->{'parsed'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n"); + my $number_traces = $self->{'bases'}; + for (my $counter=0; $counter < $number_traces; $counter++ ) { + print("$counter\t"); + foreach (qw(A T G C)) { + print $self->{'parsed'}->{'base_accuracies'}->{$_}->[$counter]."\t"; + } + print("\n"); + } +} + + +=head2 _dump_comments() + + Title : _dump_comments() + Usage : $self->_dump_comments(); + Function: Debug dump the comments section from the scf. + Returns : Nothing. + Args : Nothing. + Notes : None. + +=cut + +sub _dump_comments { + my ($self) = @_; + warn ("SCF comments:\n"); + foreach my $k (keys %{$self->{'comments'}}) { + warn ("\t {$k} ==> ", $self->{'comments'}->{$k}, "\n"); + } +} + + +1; +__END__ + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/swiss.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/swiss.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1251 @@ +# $Id: swiss.pm,v 1.66.2.4 2003/09/13 22:16:43 jason Exp $ +# +# BioPerl module for Bio::SeqIO::swiss +# +# Cared for by Elia Stupka <elia@tll.org.sg> +# +# Copyright Elia Stupka +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::swiss - Swissprot sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'swiss'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from swissprot flat +file databases. + +There is a lot of flexibility here about how to dump things which I need +to document fully. + + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + +=item _id_generation_func() + +This is function which is called as + + print "ID ", $func($seq), "\n"; + +To generate the ID line. If it is not there, it generates a sensible ID +line using a number of tools. + +If you want to output annotations in swissprot format they need to be +stored in a Bio::Annotation::Collection object which is accessible +through the Bio::SeqI interface method L<annotation()|annotation>. + +The following are the names of the keys which are polled from a +L<Bio::Annotation::Collection> object. + +reference - Should contain Bio::Annotation::Reference objects +comment - Should contain Bio::Annotation::Comment objects +dblink - Should contain Bio::Annotation::DBLink objects +gene_name - Should contain Bio::Annotation::SimpleValue object + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Elia Stupka + +Email elia@tll.org.sg + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqIO::swiss; +use vars qw(@ISA); +use strict; +use Bio::SeqIO; +use Bio::SeqIO::FTHelper; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Tools::SeqStats; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; +use Bio::Annotation::SimpleValue; +use Bio::Annotation::StructuredValue; + +@ISA = qw(Bio::SeqIO); + + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + + # hash for functions for decoding keys. + $self->{'_func_ftunit_hash'} = {}; + $self->_show_dna(1); # sets this to one by default. People can change it + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + + +=cut + +sub next_seq { + my ($self,@args) = @_; + my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, + $date,$comment,@date_arr); + + my $genename = ""; + my ($annotation, %params, @features) = ( new Bio::Annotation::Collection); + + $line = $self->_readline; + + if( !defined $line) { + return undef; # no throws - end of file + } + + if( $line =~ /^\s+$/ ) { + while( defined ($line = $self->_readline) ) { + $line =~ /\S/ && last; + } + } + if( !defined $line ) { + return undef; # end of file + } + + # fixed to allow _DIVISION to be optional for bug #946 + # see bug report for more information + $line =~ /^ID\s+([^\s_]+)(_([^\s_]+))?\s+([^\s;]+);\s+([^\s;]+);/ + || $self->throw("swissprot stream with no ID. Not swissprot in my book"); + + if( $3 ) { + $name = "$1$2"; + $params{'-division'} = $3; + } else { + $name = $1; + $params{'-division'} = 'UNK'; + $params{'-primary_id'} = $1; + } + $params{'-alphabet'} = 'protein'; + # this is important to have the id for display in e.g. FTHelper, otherwise + # you won't know which entry caused an error + $params{'-display_id'} = $name; + + my $buffer = $line; + + BEFORE_FEATURE_TABLE : + until( !defined ($buffer) ) { + $_ = $buffer; + + # Exit at start of Feature table + last if /^FT/; + # and at the sequence at the latest HL 05/11/2000 + last if /^SQ/; + + # Description line(s) + if (/^DE\s+(\S.*\S)/) { + $desc .= $desc ? " $1" : $1; + } + #Gene name + elsif(/^GN\s+(.*)/) { + $genename .= " " if $genename; + $genename .= $1; + # has GN terminated yet? + if($genename =~ s/[\. ]+$//) { + my $gn = Bio::Annotation::StructuredValue->new(); + foreach my $gene (split(/ AND /, $genename)) { + $gene =~ s/^\(//; + $gene =~ s/\)$//; + $gn->add_value([-1,-1], split(/ OR /, $gene)); + } + $annotation->add_Annotation('gene_name',$gn, + "Bio::Annotation::SimpleValue"); + } + } + #accession number(s) + elsif( /^AC\s+(.+)/) { + my @accs = split(/[; ]+/, $1); # allow space in addition + $params{'-accession_number'} = shift @accs + unless defined $params{'-accession_number'}; + push @{$params{'-secondary_accessions'}}, @accs; + } + #version number + elsif( /^SV\s+(\S+);?/ ) { + my $sv = $1; + $sv =~ s/\;//; + $params{'-seq_version'} = $sv; + } + #date + elsif( /^DT\s+(.*)/ ) { + my $date = $1; + $date =~ s/\;//; + $date =~ s/\s+$//; + push @{$params{'-dates'}}, $date; + } + # Organism name and phylogenetic information + elsif (/^O[SCG]/) { + my $species = $self->_read_swissprot_Species(\$buffer); + $params{'-species'}= $species; + # now we are one line ahead -- so continue without reading the next + # line HL 05/11/2000 + next; + } + # References + elsif (/^R/) { + my $refs = $self->_read_swissprot_References(\$buffer); + + foreach my $r (@$refs) { + $annotation->add_Annotation('reference',$r); + } + # now we are one line ahead -- so continue without reading the next + # line HL 05/11/2000 + next; + } + #Comments + elsif (/^CC\s{3}(.*)/) { + $comment .= $1; + $comment .= "\n"; + while (defined ($buffer = $self->_readline)) { + if ($buffer =~ /^CC\s{3}(.*)/) { + $comment .= $1; + $comment .= "\n"; + } + else { + last; + } + } + my $commobj = Bio::Annotation::Comment->new(); + # note: don't try to process comments here -- they may contain + # structure. LP 07/30/2000 + $commobj->text($comment); + $annotation->add_Annotation('comment',$commobj); + $comment = ""; + # now we are one line ahead -- so continue without reading the next + # line HL 05/11/2000 + next; + } + #DBLinks + elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) { + my $dblinkobj = Bio::Annotation::DBLink->new(); + $dblinkobj->database($1); + $dblinkobj->primary_id($2); + $dblinkobj->optional_id($3); + my $comment = $4; + if(length($comment) > 0) { + # edit comment to get rid of leading space and trailing dot + if( $comment =~ /^\s*(\S+)\./ ) { + $dblinkobj->comment($1); + } else { + $dblinkobj->comment($comment); + } + } + $annotation->add_Annotation('dblink',$dblinkobj); + } + #keywords + elsif( /^KW\s+(.*)$/ ) { + my @kw = split(/\s*\;\s*/,$1); + defined $kw[-1] && $kw[-1] =~ s/\.$//; + push @{$params{'-keywords'}}, @kw; + } + + + # Get next line. Getting here assumes that we indeed need to read the + # line. + $buffer = $self->_readline; + } + + $buffer = $_; + + FEATURE_TABLE : + # if there is no feature table, or if we've got beyond, exit loop or don't + # even enter HL 05/11/2000 + while (defined ($buffer) && ($buffer =~ /^FT/)) { + my $ftunit = $self->_read_FTHelper_swissprot(\$buffer); + + # process ftunit + # when parsing of the line fails we get undef returned + if($ftunit) { + push(@features, + $ftunit->_generic_seqfeature($self->location_factory(), + $params{'-seqid'}, "SwissProt")); + } else { + $self->warn("failed to parse feature table line for seq " . + $params{'-display_id'}); + } + } + if( $buffer !~ /^SQ/ ) { + while( defined($_ = $self->_readline) ) { + /^SQ/ && last; + } + } + $seqc = ""; + while( defined ($_ = $self->_readline) ) { + /^\/\// && last; + $_ = uc($_); + s/[^A-Za-z]//g; + $seqc .= $_; + } + + my $seq= $self->sequence_factory->create + (-verbose => $self->verbose, + %params, + -seq => $seqc, + -desc => $desc, + -features => \@features, + -annotation => $annotation, + ); + + # The annotation doesn't get added by the contructor + $seq->annotation($annotation); + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : array of 1 to n Bio::SeqI objects + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + foreach my $seq ( @seqs ) { + $self->throw("Attempting to write with no seq!") unless defined $seq; + + if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { + $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); + } + + my $i; + my $str = $seq->seq; + + my $mol; + my $div; + my $len = $seq->length(); + + if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) { + $div = 'UNK'; + } + + if( ! $seq->can('alphabet') || ! defined ($mol = $seq->alphabet) ) { + $mol = 'XXX'; + } + + my $temp_line; + if( $self->_id_generation_func ) { + $temp_line = &{$self->_id_generation_func}($seq); + } else { + #$temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", + # $seq->primary_id()."_".$div,$mol,$len); + # Reconstructing the ID relies heavily upon the input source having + # been in a format that is parsed as this routine expects it -- that is, + # by this module itself. This is bad, I think, and immediately breaks + # if e.g. the Bio::DB::GenPept module is used as input. + # Hence, switch to display_id(); _every_ sequence is supposed to have + # this. HL 2000/09/03 + $mol =~ s/protein/PRT/; + $temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", + $seq->display_id(), $mol, $len); + } + + $self->_print( "ID $temp_line\n"); + + # if there, write the accession line + local($^W) = 0; # supressing warnings about uninitialized fields + + if( $self->_ac_generation_func ) { + $temp_line = &{$self->_ac_generation_func}($seq); + $self->_print( "AC $temp_line\n"); + } else { + if ($seq->can('accession_number') ) { + $self->_print("AC ",$seq->accession_number,";"); + if ($seq->can('get_secondary_accessions') ) { + foreach my $sacc ($seq->get_secondary_accessions) { + $self->_print(" ",$sacc,";"); + } + $self->_print("\n"); + } + else { + $self->_print("\n"); + } + } + # otherwise - cannot print <sigh> + } + + # Date lines + + if( $seq->can('get_dates') ) { + foreach my $dt ( $seq->get_dates() ) { + $self->_write_line_swissprot_regex("DT ","DT ", + $dt,"\\s\+\|\$",80); + } + } + + #Definition lines + $self->_write_line_swissprot_regex("DE ","DE ",$seq->desc(),"\\s\+\|\$",80); + + #Gene name + if ((my @genes = $seq->annotation->get_Annotations('gene_name') ) ) { + $self->_print("GN ", + join(' OR ', + map { + $_->isa("Bio::Annotation::StructuredValue") ? + $_->value(-joins => [" AND ", " OR "]) : + $_->value(); + } @genes), + ".\n"); + } + + # Organism lines + if ($seq->can('species') && (my $spec = $seq->species)) { + my($species, @class) = $spec->classification(); + my $genus = $class[0]; + my $OS = "$genus $species"; + if ($class[$#class] =~ /viruses/i) { + # different OS / OC syntax for viruses LP 09/16/2000 + shift @class; + } + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + foreach (($spec->variant, $spec->common_name)) { + $OS .= " ($_)" if $_; + } + $self->_print( "OS $OS.\n"); + my $OC = join('; ', reverse(@class)) .'.'; + $self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",80); + if ($spec->organelle) { + $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",80); + } + if ($spec->ncbi_taxid) { + $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n"); + } + } + + # Reference lines + my $t = 1; + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + $self->_print( "RN [$t]\n"); + # changed by lorenz 08/03/00 + # j.gilbert and h.lapp agreed that the rp line in swissprot seems + # more like a comment than a parseable value, so print it as is + if ($ref->rp) { + $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp, + "\\s\+\|\$",80); + } + if ($ref->comment) { + $self->_write_line_swissprot_regex("RC ","RC ",$ref->comment, + "\\s\+\|\$",80); + } + if ($ref->medline) { + # new RX format in swissprot LP 09/17/00 + if ($ref->pubmed) { + $self->_write_line_swissprot_regex("RX ","RX ", + "MEDLINE=".$ref->medline. + "; PubMed=".$ref->pubmed.";", + "\\s\+\|\$",80); + } else { + $self->_write_line_swissprot_regex("RX MEDLINE; ","RX MEDLINE; ", + $ref->medline.".","\\s\+\|\$",80); + } + } + my $author = $ref->authors .';' if($ref->authors); + my $title = $ref->title .';' if( $ref->title); + + $self->_write_line_swissprot_regex("RA ","RA ",$author,"\\s\+\|\$",80); + $self->_write_line_swissprot_regex("RT ","RT ",$title,"\\s\+\|\$",80); + $self->_write_line_swissprot_regex("RL ","RL ",$ref->location,"\\s\+\|\$",80); + $t++; + } + + # Comment lines + + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + foreach my $cline (split ("\n", $comment->text)) { + while (length $cline > 74) { + $self->_print("CC ",(substr $cline,0,74),"\n"); + $cline = substr $cline,74; + } + $self->_print("CC ",$cline,"\n"); + } + } + + foreach my $dblink ( $seq->annotation->get_Annotations('dblink') ) + { + if (defined($dblink->comment)&&($dblink->comment)) { + $self->_print("DR ",$dblink->database,"; ",$dblink->primary_id,"; ", + $dblink->optional_id,"; ",$dblink->comment,".\n"); + } elsif($dblink->optional_id) { + $self->_print("DR ",$dblink->database,"; ", + $dblink->primary_id,"; ", + $dblink->optional_id,".\n"); + } + else { + $self->_print("DR ",$dblink->database, + "; ",$dblink->primary_id,"; ","-.\n"); + } + } + + # if there, write the kw line + { + my( $kw ); + if( my $func = $self->_kw_generation_func ) { + $kw = &{$func}($seq); + } elsif( $seq->can('keywords') ) { + $kw = $seq->keywords; + if( ref($kw) =~ /ARRAY/i ) { + $kw = join("; ", @$kw); + } + $kw .= '.' if( $kw !~ /\.$/ ); + } + $self->_write_line_swissprot_regex("KW ","KW ", + $kw, "\\s\+\|\$",80); + } + + #Check if there are seqfeatures before printing the FT line + my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : (); + if ($feats[0]) { + if( defined $self->_post_sort ) { + + # we need to read things into an array. Process. Sort them. Print 'em + + my $post_sort_func = $self->_post_sort(); + my @fth; + + foreach my $sf ( @feats ) { + push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); + } + @fth = sort { &$post_sort_func($a,$b) } @fth; + + foreach my $fth ( @fth ) { + $self->_print_swissprot_FTHelper($fth); + } + } else { + # not post sorted. And so we can print as we get them. + # lower memory load... + + foreach my $sf ( @feats ) { + my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); + foreach my $fth ( @fth ) { + if( ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $sf->throw("Cannot process FTHelper... $fth"); + } + + $self->_print_swissprot_FTHelper($fth); + } + } + } + + if( $self->_show_dna() == 0 ) { + return; + } + } + # finished printing features. + + # molecular weight + my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0]; + # checksum + # was crc32 checksum, changed it to crc64 + my $crc64 = $self->_crc64(\$str); + $self->_print( sprintf("SQ SEQUENCE %4d AA; %d MW; %16s CRC64;\n", + $len,$mw,$crc64)); + $self->_print( " "); + my $linepos; + for ($i = 0; $i < length($str); $i += 10) { + $self->_print( substr($str,$i,10), " "); + $linepos += 11; + if( ($i+10)%60 == 0 && (($i+10) < length($str))) { + $self->_print( "\n "); + } + } + $self->_print( "\n//\n"); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; + } +} + +# Thanks to James Gilbert for the following two. LP 08/01/2000 + +=head2 _generateCRCTable + + Title : _generateCRCTable + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _generateCRCTable { + # 10001000001010010010001110000100 + # 32 + my $poly = 0xEDB88320; + my ($self) = shift; + + $self->{'_crcTable'} = []; + foreach my $i (0..255) { + my $crc = $i; + for (my $j=8; $j > 0; $j--) { + if ($crc & 1) { + $crc = ($crc >> 1) ^ $poly; + } + else { + $crc >>= 1; + } + } + ${$self->{'_crcTable'}}[$i] = $crc; + } +} + + +=head2 _crc32 + + Title : _crc32 + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _crc32 { + my( $self, $str ) = @_; + + $self->throw("Argument to crc32() must be ref to scalar") + unless ref($str) eq 'SCALAR'; + + $self->_generateCRCTable() unless exists $self->{'_crcTable'}; + + my $len = length($$str); + + my $crc = 0xFFFFFFFF; + for (my $i = 0; $i < $len; $i++) { + # Get upper case value of each letter + my $int = ord uc substr $$str, $i, 1; + $crc = (($crc >> 8) & 0x00FFFFFF) ^ + ${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ]; + } + return $crc; +} + +=head2 _crc64 + + Title : _crc64 + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _crc64{ + my ($self, $sequence) = @_; + my $POLY64REVh = 0xd8000000; + my @CRCTableh = 256; + my @CRCTablel = 256; + my $initialized; + + + my $seq = $$sequence; + + my $crcl = 0; + my $crch = 0; + if (!$initialized) { + $initialized = 1; + for (my $i=0; $i<256; $i++) { + my $partl = $i; + my $parth = 0; + for (my $j=0; $j<8; $j++) { + my $rflag = $partl & 1; + $partl >>= 1; + $partl |= (1 << 31) if $parth & 1; + $parth >>= 1; + $parth ^= $POLY64REVh if $rflag; + } + $CRCTableh[$i] = $parth; + $CRCTablel[$i] = $partl; + } + } + + foreach (split '', $seq) { + my $shr = ($crch & 0xFF) << 24; + my $temp1h = $crch >> 8; + my $temp1l = ($crcl >> 8) | $shr; + my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF; + $crch = $temp1h ^ $CRCTableh[$tableindex]; + $crcl = $temp1l ^ $CRCTablel[$tableindex]; + } + my $crc64 = sprintf("%08X%08X", $crch, $crcl); + + return $crc64; + +} + +=head2 _print_swissprot_FTHelper + + Title : _print_swissprot_FTHelper + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _print_swissprot_FTHelper { + my ($self,$fth,$always_quote) = @_; + $always_quote ||= 0; + my ($start,$end) = ('?', '?'); + + if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { + $fth->warn("$fth is not a FTHelper class. ". + "Attempting to print, but there could be tears!"); + } + + if( $fth->loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) { + $start = $1 if defined $1; + $end = $2 if defined $2; + + # to_FTString only returns one value when start == end, #JB955 + # so if no match is found, assume it is both start and end #JB955 + } else { + $start = $end = $fth->loc; + } + + my $desc = ""; + $desc = @{$fth->field->{"description"}}[0]."." + if exists $fth->field->{"description"}; + $self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s ", + substr($fth->key,0,8), + $start,$end), + "FT ", + $desc.'.','\s+|$',80); +} +#' + +=head2 _read_swissprot_References + + Title : _read_swissprot_References + Usage : + Function: Reads references from swissprot format. Internal function really + Example : + Returns : + Args : + + +=cut + +sub _read_swissprot_References{ + my ($self,$buffer) = @_; + my (@refs); + my ($b1, $b2, $rp, $title, $loc, $au, $med, $com, $pubmed); + + if ($$buffer !~ /^RP/) { + $$buffer = $self->_readline; + } + if( !defined $$buffer ) { return undef; } + if( $$buffer =~ /^RP/ ) { + if ($$buffer =~ /^RP (SEQUENCE OF (\d+)-(\d+).*)/) { + $rp=$1; + $b1=$2; + $b2=$3; + } + elsif ($$buffer =~ /^RP (.*)/) { + $rp=$1; + } + + } + while( defined ($_ = $self->_readline) ) { + #/^CC/ && last; + /^RN/ && last; # separator between references ! LP 07/25/2000 + #/^SQ/ && last; # there may be sequences without CC lines! HL 05/11/2000 + /^[^R]/ && last; # may be the safest exit point HL 05/11/2000 + /^RX MEDLINE;\s+(\d+)/ && do {$med=$1}; + /^RX MEDLINE=(\d+);\s+PubMed=(\d+);/ && do {$med=$1;$pubmed=$2}; + /^RA (.*)/ && do { $au .= $au ? " $1" : $1; next;}; + /^RT (.*)/ && do { $title .= $title ? " $1" : $1; next;}; + /^RL (.*)/ && do { $loc .= $loc ? " $1" : $1; next;}; + /^RC (.*)/ && do { $com .= $com ? " $1" : $1; next;}; + } + + my $ref = new Bio::Annotation::Reference; + $au =~ s/;\s*$//g; + if( defined $title ) { + $title =~ s/;\s*$//g; + } + + $ref->start($b1); + $ref->end($b2); + $ref->authors($au); + $ref->title($title); + $ref->location($loc); + $ref->medline($med); + $ref->pubmed($pubmed) if (defined $pubmed); + $ref->comment($com); + $ref->rp($rp); + + push(@refs,$ref); + $$buffer = $_; + return \@refs; +} + + +=head2 _read_swissprot_Species + + Title : _read_swissprot_Species + Usage : + Function: Reads the swissprot Organism species and classification + lines. + Example : + Returns : A Bio::Species object + Args : + +=cut + +sub _read_swissprot_Species { + my( $self, $buffer ) = @_; + my $org; + + $_ = $$buffer; + my( $subspecies, $species, $genus, $common, $variant, $ncbi_taxid ); + my @class; + my ($binomial, $descr); + my $osline = ""; + + while (defined( $_ ||= $self->_readline )) { + last unless /^O[SCGX]/; + # believe it or not, but OS may come multiple times -- at this time + # we can't capture multiple species + if(/^OS\s+(\S.+)/ && (! defined($binomial))) { + $osline .= " " if $osline; + $osline .= $1; + if($osline =~ s/(,|, and|\.)$//) { + ($binomial, $descr) = $osline =~ /(\S[^\(]+)(.*)/; + ($genus, $species, $subspecies) = split(/\s+/, $binomial); + $species = "sp." unless $species; + while($descr =~ /\(([^\)]+)\)/g) { + my $item = $1; + # strain etc may not necessarily come first (yes, swissprot + # is messy) + if((! defined($variant)) && + (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) || + ($item =~ /^(biovar|pv\.|type\s+)/))) { + $variant = $item; + } elsif($item =~ s/^subsp\.\s+//) { + if(! $subspecies) { + $subspecies = $item; + } elsif(! $variant) { + $variant = $item; + } + } elsif(! defined($common)) { + # we're only interested in the first common name + $common = $item; + if((index($common, '(') >= 0) && + (index($common, ')') < 0)) { + $common .= ')'; + } + } + } + } + } + elsif (s/^OC\s+//) { + push(@class, split /[\;\.]\s*/); + if($class[0] =~ /viruses/i) { + # viruses have different OS/OC syntax + my @virusnames = split(/\s+/, $binomial); + $species = (@virusnames > 1) ? pop(@virusnames) : ''; + $genus = join(" ", @virusnames); + $subspecies = undef; + } + } + elsif (/^OG\s+(.*)/) { + $org = $1; + } + elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) { + my $taxstring = $1; + # we only keep the first one and ignore all others + if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) { + $ncbi_taxid = $1; + } else { + $self->throw("$taxstring doesn't look like NCBI_TaxID"); + } + } + + $_ = undef; # Empty $_ to trigger read of next line + } + + $$buffer = $_; + + # Don't make a species object if it is "Unknown" or "None" + return if $genus =~ /^(Unknown|None)$/i; + + if ($class[$#class] eq $genus) { + push( @class, $species ); + } else { + push( @class, $genus, $species ); + } + + @class = reverse @class; + + my $taxon = Bio::Species->new(); + $taxon->classification( \@class, "FORCE" ); # no name validation please + $taxon->common_name( $common ) if $common; + $taxon->sub_species( $subspecies ) if $subspecies; + $taxon->organelle ( $org ) if $org; + $taxon->ncbi_taxid ( $ncbi_taxid ) if $ncbi_taxid; + $taxon->variant($variant) if $variant; + + # done + return $taxon; +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +# inherited from SeqIO.pm ! HL 05/11/2000 + +=head2 _read_FTHelper_swissprot + + Title : _read_FTHelper_swissprot + Usage : _read_FTHelper_swissprot(\$buffer) + Function: reads the next FT key line + Example : + Returns : Bio::SeqIO::FTHelper object + Args : filehandle and reference to a scalar + + +=cut + +sub _read_FTHelper_swissprot { + # initial version implemented by HL 05/10/2000 + # FIXME this may not be perfect, so please review + my ($self,$buffer) = @_; + my ($key, # The key of the feature + $loc, # The location line from the feature + $desc, # The descriptive text + ); + + if ($$buffer =~ /^FT (\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/) { + $key = $1; + my $loc1 = $2; + my $loc2 = $3; + $loc = "$loc1..$loc2"; + if($4 && (length($4) > 0)) { + $desc = $4; + chomp($desc); + } else { + $desc = ""; + } + # Read all the continuation lines up to the next feature + while (defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/) { + $desc .= $1; + chomp($desc); + } + $desc =~ s/\.$//; + } else { + # No feature key. What's this? + $self->warn("No feature key in putative feature table line: $_"); + return; + } + + # Put the first line of the next feature into the buffer + $$buffer = $_; + + # Make the new FTHelper object + my $out = new Bio::SeqIO::FTHelper(-verbose => $self->verbose()); + $out->key($key); + $out->loc($loc); + + # store the description if there is one + if($desc && (length($desc) > 0)) { + $out->field->{"description"} ||= []; + push(@{$out->field->{"description"}}, $desc); + } + return $out; +} + + +=head2 _write_line_swissprot + + Title : _write_line_swissprot + Usage : + Function: internal function + Example : + Returns : + Args : + + +=cut + +sub _write_line_swissprot{ + my ($self,$pre1,$pre2,$line,$length) = @_; + + $length || die "Miscalled write_line_swissprot without length. Programming error!"; + my $subl = $length - length $pre2; + my $linel = length $line; + my $i; + + my $sub = substr($line,0,$length - length $pre1); + + $self->_print( "$pre1$sub\n"); + + for($i= ($length - length $pre1);$i < $linel;) { + $sub = substr($line,$i,($subl)); + $self->_print( "$pre2$sub\n"); + $i += $subl; + } + +} + +=head2 _write_line_swissprot_regex + + Title : _write_line_swissprot_regex + Usage : + Function: internal function for writing lines of specified + length, with different first and the next line + left hand headers and split at specific points in the + text + Example : + Returns : nothing + Args : file handle, first header, second header, text-line, regex for line breaks, total line length + + +=cut + +sub _write_line_swissprot_regex { + my ($self,$pre1,$pre2,$line,$regex,$length) = @_; + + #print STDOUT "Going to print with $line!\n"; + + $length || die "Miscalled write_line_swissprot without length. Programming error!"; + + if( length $pre1 != length $pre2 ) { + print STDERR "len 1 is ", length $pre1, " len 2 is ", length $pre2, "\n"; + die "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!"; + } + + my $subl = $length - (length $pre1) -1 ; + my @lines; + + while($line =~ m/(.{1,$subl})($regex)/g) { + push(@lines, $1.$2); + } + + my $s = shift @lines; + $self->_print( "$pre1$s\n"); + foreach my $s ( @lines ) { + $self->_print( "$pre2$s\n"); + } +} + +=head2 _post_sort + + Title : _post_sort + Usage : $obj->_post_sort($newval) + Function: + Returns : value of _post_sort + Args : newvalue (optional) + + +=cut + +sub _post_sort{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_post_sort'} = $value; + } + return $obj->{'_post_sort'}; + +} + +=head2 _show_dna + + Title : _show_dna + Usage : $obj->_show_dna($newval) + Function: + Returns : value of _show_dna + Args : newvalue (optional) + + +=cut + +sub _show_dna{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_show_dna'} = $value; + } + return $obj->{'_show_dna'}; + +} + +=head2 _id_generation_func + + Title : _id_generation_func + Usage : $obj->_id_generation_func($newval) + Function: + Returns : value of _id_generation_func + Args : newvalue (optional) + + +=cut + +sub _id_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_id_generation_func'} = $value; + } + return $obj->{'_id_generation_func'}; + +} + +=head2 _ac_generation_func + + Title : _ac_generation_func + Usage : $obj->_ac_generation_func($newval) + Function: + Returns : value of _ac_generation_func + Args : newvalue (optional) + + +=cut + +sub _ac_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_ac_generation_func'} = $value; + } + return $obj->{'_ac_generation_func'}; + +} + +=head2 _sv_generation_func + + Title : _sv_generation_func + Usage : $obj->_sv_generation_func($newval) + Function: + Returns : value of _sv_generation_func + Args : newvalue (optional) + + +=cut + +sub _sv_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_sv_generation_func'} = $value; + } + return $obj->{'_sv_generation_func'}; + +} + +=head2 _kw_generation_func + + Title : _kw_generation_func + Usage : $obj->_kw_generation_func($newval) + Function: + Returns : value of _kw_generation_func + Args : newvalue (optional) + + +=cut + +sub _kw_generation_func{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_kw_generation_func'} = $value; + } + return $obj->{'_kw_generation_func'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqIO/ztr.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/ztr.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,173 @@ +# $Id: ztr.pm,v 1.8 2002/10/22 07:38:42 lapp Exp $ +# BioPerl module for Bio::SeqIO::ztr +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::ztr - ztr trace sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from ztr trace +files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Aaron Mackey + +Email: amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::ztr; +use vars qw(@ISA $READ_AVAIL); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +push @ISA, qw( Bio::SeqIO ); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; + if ($@) { + $READ_AVAIL = 0; + } else { + push @ISA, "Bio::SeqIO::staden::read"; + $READ_AVAIL = 1; + } +} + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); + } + + my ($compression) = $self->_rearrange([qw[COMPRESSION]], @args); + $compression = 2 unless defined $compression; + $self->compression($compression); + + unless ($READ_AVAIL) { + Bio::Root::Root->throw( -class => 'Bio::Root::SystemException', + -text => "Bio::SeqIO::staden::read is not available; make sure the bioperl-ext package has been installed successfully!" + ); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my ($self) = @_; + + my ($seq, $id, $desc, $qual) = $self->read_trace($self->_fh, 'ztr'); + + # create the seq object + $seq = $self->sequence_factory->create(-seq => $seq, + -id => $id, + -primary_id => $id, + -desc => $desc, + -alphabet => 'DNA', + -qual => $qual + ); + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + + my $fh = $self->_fh; + foreach my $seq (@seq) { + $self->write_trace($fh, $seq, 'ztr' . $self->compression); + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 compression + + Title : compression + Usage : $stream->compression(3); + Function: determines the level of ZTR compression + Returns : the current (or newly set) value. + Args : 1, 2 or 3 - any other (defined) value will cause the compression + to be reset to the default of 2. + + +=cut + +sub compression { + + my ($self, $val) = @_; + + if (defined $val) { + if ($val =~ m/^1|2|3$/o) { + $self->{_compression} = $val; + } else { + $self->{_compression} = 2; + } + } + + return $self->{_compression}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SeqUtils.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,320 @@ +# $Id: SeqUtils.pm,v 1.11.2.1 2003/08/11 20:11:17 jason Exp $ +# +# BioPerl module for Bio::SeqUtils +# +# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqUtils - Additional methods for PrimarySeq objects + +=head1 SYNOPSIS + + use Bio::SeqUtils; + # get a Bio::PrimarySeqI compliant object, $seq, somehow + $util = new Bio::SeqUtils; + $polypeptide_3char = $util->seq3($seq); + # or + $polypeptide_3char = Bio::SeqUtils->seq3($seq); + + # set the sequence string (stored in one char code in the object) + Bio::SeqUtils->seq3($seq, $polypeptide_3char); + + # translate a sequence in all six frames + @seqs = Bio::SeqUtils->translate_6frames($seq); + +=head1 DESCRIPTION + +This class is a holder of methods that work on Bio::PrimarySeqI- +compliant sequence objects, e.g. Bio::PrimarySeq and +Bio::Seq. These methods are not part of the Bio::PrimarySeqI +interface and should in general not be essential to the primary function +of sequence objects. If you are thinking of adding essential +functions, it might be better to create your own sequence class. +See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more. + +The methods take as their first argument a sequence object. It is +possible to use methods without first creating a SeqUtils object, +i.e. use it as an anonymous hash. + +The first two methods, seq3() and seq3in(), give out or read in protein +sequences coded in three letter IUPAC amino acid codes. + +The next two methods, translate_3frames() and translate_6frames(), wrap +around the standard translate method to give back an array of three +forward or all six frame translations. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqUtils; +use vars qw(@ISA %ONECODE %THREECODE); +use strict; +use Carp; + +@ISA = qw(Bio::Root::Root); +# new inherited from RootI + +BEGIN { + + %ONECODE = + ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D', + 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H', + 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M', + 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R', + 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W', + 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*', + 'Sec' => 'U' + ); + + %THREECODE = + ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp', + 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His', + 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met', + 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg', + 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp', + 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter', + 'U' => 'Sec' + ); +} + +=head2 seq3 + + Title : seq3 + Usage : $string = Bio::SeqUtils->seq3($seq) + Function: + + Read only method that returns the amino acid sequence as a + string of three letter codes. alphabet has to be + 'protein'. Output follows the IUPAC standard plus 'Ter' for + terminator. Any unknown character, including the default + unknown character 'X', is changed into 'Xaa'. A noncoded + aminoacid selenocystein is recognized (Sec, U). + + Returns : A scalar + Args : character used for stop in the protein sequence optional, + defaults to '*' string used to separate the output amino + acid codes, optional, defaults to '' + +=cut + +sub seq3 { + my ($self, $seq, $stop, $sep ) = @_; + + $seq->isa('Bio::PrimarySeqI') || + $self->throw('Not a Bio::PrimarySeqI object but [$self]'); + $seq->alphabet eq 'protein' || + $self->throw('Not a protein sequence'); + + if (defined $stop) { + length $stop != 1 and $self->throw('One character stop needed, not [$stop]'); + $THREECODE{$stop} = "Ter"; + } + $sep ||= ''; + + my $aa3s; + foreach my $aa (split //, uc $seq->seq) { + $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next; + $aa3s .= 'Xaa'. $sep; + } + $sep and substr($aa3s, -(length $sep), length $sep) = '' ; + return $aa3s; +} + +=head2 seq3in + + Title : seq3in + Usage : $string = Bio::SeqUtils->seq3in($seq, 'MetGlyTer') + Function: + + Method for in-place changing of the sequence of a + Bio::PrimarySeqI sequence object. The three letter amino + acid input string is converted into one letter code. Any + unknown character triplet, including the default 'Xaa', is + converted into 'X'. + + Returns : Bio::PrimarySeq object; + Args : character to be used for stop in the protein seqence, + optional, defaults to '*' + character to be used for unknown in the protein seqence, + optional, defaults to 'X' + +=cut + +sub seq3in { + my ($self, $seq, $string, $stop, $unknown) = @_; + + $seq->isa('Bio::PrimarySeqI') || + $self->throw('Not a Bio::PrimarySeqI object but [$self]'); + $seq->alphabet eq 'protein' || + $self->throw('Not a protein sequence'); + + if (defined $stop) { + length $stop != 1 and $self->throw('One character stop needed, not [$stop]'); + $ONECODE{'Ter'} = $stop; + } + if (defined $unknown) { + length $unknown != 1 and $self->throw('One character stop needed, not [$unknown]'); + $ONECODE{'Xaa'} = $unknown; + } + + my ($aas, $aa3); + my $length = (length $string) - 2; + for (my $i = 0 ; $i < $length ; $i += 3) { + $aa3 = substr($string, $i, 3); + $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next; + $aas .= 'X'; + } + $seq->seq($aas); + return $seq; +} + +=head2 translate_3frames + + Title : translate_3frames + Usage : @prots = Bio::SeqUtils->translate_3frames($seq) + Function: Translate a nucleotide sequence in three forward frames. + The IDs of the sequences are appended with '-0F', '-1F', '-2F'. + Returns : An array of seq objects + Args : sequence object + same arguments as to Bio::PrimarySeqI::translate + +=cut + +sub translate_3frames { + my ($self, $seq, @args ) = @_; + + $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.') + unless $seq->can('translate'); + + my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args; + my @seqs; + my $f = 0; + while ($f != 3) { + my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw ); + $translation->id($seq->id. "-". $f. "F"); + push @seqs, $translation; + $f++; + } + + return @seqs; +} + +=head2 translate_6frames + + Title : translate_6frames + Usage : @prots = Bio::SeqUtils->translate_6frames($seq) + Function: translate a nucleotide sequence in all six frames + The IDs of the sequences are appended with '-0F', '-1F', '-2F', + '-0R', '-1R', '-2R'. + Returns : An array of seq objects + Args : sequence object + same arguments as to Bio::PrimarySeqI::translate + +=cut + +sub translate_6frames { + my ($self, $seq, @args ) = @_; + + my @seqs = $self->translate_3frames($seq, @args); + $seq->seq($seq->revcom->seq); + my @seqs2 = $self->translate_3frames($seq, @args); + foreach my $seq2 (@seqs2) { + my ($tmp) = $seq2->id; + $tmp =~ s/F$/R/g; + $seq2->id($tmp); + } + return @seqs, @seqs2; +} + + +=head2 valid_aa + + Title : valid_aa + Usage : my @aa = $table->valid_aa + Function: Retrieves a list of the valid amino acid codes. + The list is ordered so that first 21 codes are for unique + amino acids. The rest are ['B', 'Z', 'X', '*']. + Returns : array of all the valid amino acid codes + Args : [optional] $code => [0 -> return list of 1 letter aa codes, + 1 -> return list of 3 letter aa codes, + 2 -> return associative array of both ] + +=cut + +sub valid_aa{ + my ($self,$code) = @_; + + if( ! $code ) { + my @codes; + foreach my $c ( sort values %ONECODE ) { + push @codes, $c unless ( $c =~ /[BZX\*]/ ); + } + push @codes, qw(B Z X *); # so they are in correct order ? + return @codes; + } + elsif( $code == 1 ) { + my @codes; + foreach my $c ( sort keys %ONECODE ) { + push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ ); + } + push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' ); + return @codes; + } + elsif( $code == 2 ) { + my %codes = %ONECODE; + foreach my $c ( keys %ONECODE ) { + my $aa = $ONECODE{$c}; + $codes{$aa} = $c; + } + return %codes; + } else { + $self->warn("unrecognized code in ".ref($self)." method valid_aa()"); + return (); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/SimpleAlign.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SimpleAlign.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1934 @@ +# $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $ +# BioPerl module for SimpleAlign +# +# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code +# +# History: +# 11/3/00 Added threshold feature to consensus and consensus_aa - PS +# May 2001 major rewrite - Heikki Lehvaslaiho + +=head1 NAME + +Bio::SimpleAlign - Multiple alignments held as a set of sequences + +=head1 SYNOPSIS + + # use Bio::AlignIO to read in the alignment + $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam'); + $aln = $str->next_aln(); + + # some descriptors + print $aln->length, "\n"; + print $aln->no_residues, "\n"; + print $aln->is_flush, "\n"; + print $aln->no_sequences, "\n"; + print $aln->percentage_identity, "\n"; + print $aln->consensus_string(50), "\n"; + + # find the position in the alignment for a sequence location + $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6; + + # extract sequences and check values for the alignment column $pos + foreach $seq ($aln->each_seq) { + $res = $seq->subseq($pos, $pos); + $count{$res}++; + } + foreach $res (keys %count) { + printf "Res: %s Count: %2d\n", $res, $count{$res}; + } + + +=head1 DESCRIPTION + +SimpleAlign handles multiple alignments of sequences. It is very +permissive of types (it won't insist on things being all same length +etc): really it is a SequenceSet explicitly held in memory with a +whole series of built in manipulations and especially file format +systems for read/writing alignments. + +SimpleAlign basically views an alignment as an immutable block of +text. SimpleAlign *is not* the object to be using if you want to +perform complex alignment manipulations. + +However for lightweight display/formatting and minimal manipulation +(e.g. removing all-gaps columns) - this is the one to use. + +SimpleAlign uses a subclass of L<Bio::PrimarySeq> class +L<Bio::LocatableSeq> to store its sequences. These are subsequences +with a start and end positions in the parent reference sequence. + +Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in +the alignment, and this is the key for the internal hashes. +(name,start,end is abbreviated nse in the code). However, in many +cases people don't want the name/start-end to be displayed: either +multiple names in an alignment or names specific to the alignment +(ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called +'displayname', and generally is what is used to print out the +alignment. They default to name/start-end. + +The SimpleAlign Module came from Ewan Birney's Align module. + +=head1 PROGRESS + +SimpleAlign is being slowly converted to bioperl coding standards, +mainly by Ewan. + +=over 3 + +=item Use Bio::Root::Object - done + +=item Use proper exceptions - done + +=item Use hashed constructor - not done! + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Ewan Birney, birney@sanger.ac.uk + +=head1 CONTRIBUTORS + +Richard Adams, Richard.Adams@ed.ac.uk, +David J. Evans, David.Evans@vir.gla.ac.uk, +Heikki Lehvaslaiho, heikki@ebi.ac.uk, +Allen Smith, allens@cpan.org, +Jason Stajich, jason@bioperl.org, +Anthony Underwood, aunderwood@phls.org.uk, +Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu + + +=head1 SEE ALSO + +L<Bio::LocatableSeq.pm> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# 'Let the code begin... + +package Bio::SimpleAlign; +use vars qw(@ISA %CONSERVATION_GROUPS); +use strict; + +use Bio::Root::Root; +use Bio::LocatableSeq; # uses Seq's as list +use Bio::Align::AlignI; + +BEGIN { + # This data should probably be in a more centralized module... + # it is taken from Clustalw documentation + # These are all the positively scoring groups that occur in the + # Gonnet Pam250 matrix. The strong and weak groups are + # defined as strong score >0.5 and weak score =<0.5 respectively. + + %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA + NEQK + NHQK + NDEQ + QHRK + MILV + MILF + HY + FYW) + ], + 'weak' => [ qw(CSA + ATV + SAG + STNK + STPA + SGND + SNDEQK + NDEQHK + NEQHRK + FVLIM + HFY) ], + ); + +} + +@ISA = qw(Bio::Root::Root Bio::Align::AlignI); + +=head2 new + + Title : new + Usage : my $aln = new Bio::SimpleAlign(); + Function : Creates a new simple align object + Returns : Bio::SimpleAlign + Args : -source => string representing the source program + where this alignment came from + +=cut + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($src) = $self->_rearrange([qw(SOURCE)], @args); + $src && $self->source($src); + # we need to set up internal hashs first! + + $self->{'_seq'} = {}; + $self->{'_order'} = {}; + $self->{'_start_end_lists'} = {}; + $self->{'_dis_name'} = {}; + $self->{'_id'} = 'NoName'; + $self->{'_symbols'} = {}; + # maybe we should automatically read in from args. Hmmm... + + return $self; # success - we hope! +} + +=head1 Modifier methods + +These methods modify the MSE by adding, removing or shuffling complete +sequences. + +=head2 add_seq + + Title : add_seq + Usage : $myalign->add_seq($newseq); + Function : Adds another sequence to the alignment. *Does not* align + it - just adds it to the hashes. + Returns : nothing + Args : a Bio::LocatableSeq object + order (optional) + +See L<Bio::LocatableSeq> for more information + +=cut + +sub addSeq { + my $self = shift; + $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead."); + $self->add_seq(@_); +} + +sub add_seq { + my $self = shift; + my $seq = shift; + my $order = shift; + my ($name,$id,$start,$end); + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); + } + + $id = $seq->id() ||$seq->display_id || $seq->primary_id; + $start = $seq->start(); + $end = $seq->end(); + + # build the symbol list for this sequence, + # will prune out the gap and missing/match chars + # when actually asked for the symbol list in the + # symbol_chars + map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); + + if( !defined $order ) { + $order = keys %{$self->{'_seq'}}; + } + $name = sprintf("%s/%d-%d",$id,$start,$end); + + if( $self->{'_seq'}->{$name} ) { + $self->warn("Replacing one sequence [$name]\n"); + } + else { + #print STDERR "Assigning $name to $order\n"; + + $self->{'_order'}->{$order} = $name; + + unless( exists( $self->{'_start_end_lists'}->{$id})) { + $self->{'_start_end_lists'}->{$id} = []; + } + push @{$self->{'_start_end_lists'}->{$id}}, $seq; + } + + $self->{'_seq'}->{$name} = $seq; + +} + + +=head2 remove_seq + + Title : remove_seq + Usage : $aln->remove_seq($seq); + Function : Removes a single sequence from an alignment + Returns : + Argument : a Bio::LocatableSeq object + +=cut + +sub removeSeq { + my $self = shift; + $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead."); + $self->remove_seq(@_); +} + +sub remove_seq { + my $self = shift; + my $seq = shift; + my ($name,$id,$start,$end); + + $self->throw("Need Bio::Locatable seq argument ") + unless ref $seq && $seq->isa('Bio::LocatableSeq'); + + $id = $seq->id(); + $start = $seq->start(); + $end = $seq->end(); + $name = sprintf("%s/%d-%d",$id,$start,$end); + + if( !exists $self->{'_seq'}->{$name} ) { + $self->throw("Sequence $name does not exist in the alignment to remove!"); + } + + delete $self->{'_seq'}->{$name}; + + # we need to remove this seq from the start_end_lists hash + + if (exists $self->{'_start_end_lists'}->{$id}) { + # we need to find the sequence in the array. + + my ($i, $found);; + for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) { + if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) { + $found = 1; + last; + } + } + if ($found) { + splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1; + } + else { + $self->throw("Could not find the sequence to remoce from the start-end list"); + } + } + else { + $self->throw("There is no seq list for the name $id"); + } + return 1; + # we can't do anything about the order hash but that is ok + # because each_seq will handle it +} + + +=head2 purge + + Title : purge + Usage : $aln->purge(0.7); + Function: + + Removes sequences above given sequence similarity + This function will grind on large alignments. Beware! + + Example : + Returns : An array of the removed sequences + Args : float, threshold for similarity + +=cut + +sub purge { + my ($self,$perc) = @_; + my (%duplicate, @dups); + + my @seqs = $self->each_seq(); + + for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment + my $seq = $seqs[$i]; + + #skip if already in duplicate hash + next if exists $duplicate{$seq->display_id} ; + my $one = $seq->seq(); + + my @one = split '', $one; #split to get 1aa per array element + + for (my $j=$i+1;$j < @seqs;$j++) { + my $seq2 = $seqs[$j]; + + #skip if already in duplicate hash + next if exists $duplicate{$seq2->display_id} ; + + my $two = $seq2->seq(); + my @two = split '', $two; + + my $count = 0; + my $res = 0; + for (my $k=0;$k<@one;$k++) { + if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && + $one[$k] eq $two[$k]) { + $count++; + } + if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && + $two[$k] ne '.' && $two[$k] ne '-' ) { + $res++; + } + } + + my $ratio = 0; + $ratio = $count/$res unless $res == 0; + + # if above threshold put in duplicate hash and push onto + # duplicate array for returning to get_unique + if ( $ratio > $perc ) { + print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0; + $duplicate{$seq2->display_id} = 1; + push @dups, $seq2; + } + } + } + foreach my $seq (@dups) { + $self->remove_seq($seq); + } + return @dups; +} + +=head2 sort_alphabetically + + Title : sort_alphabetically + Usage : $ali->sort_alphabetically + Function : + + Changes the order of the alignemnt to alphabetical on name + followed by numerical by number. + + Returns : + Argument : + +=cut + +sub sort_alphabetically { + my $self = shift; + my ($seq,$nse,@arr,%hash,$count); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + + $count = 0; + + %{$self->{'_order'}} = (); # reset the hash; + + foreach $nse ( sort _alpha_startend keys %hash) { + $self->{'_order'}->{$count} = $nse; + + $count++; + } + 1; +} + +=head1 Sequence selection methods + +Methods returning one or more sequences objects. + +=head2 each_seq + + Title : each_seq + Usage : foreach $seq ( $align->each_seq() ) + Function : Gets an array of Seq objects from the alignment + Returns : an array + Argument : + +=cut + +sub eachSeq { + my $self = shift; + $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead."); + $self->each_seq(); +} + +sub each_seq { + my $self = shift; + my (@arr,$order); + + foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) { + if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) { + push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}}); + } + } + + return @arr; +} + + +=head2 each_alphabetically + + Title : each_alphabetically + Usage : foreach $seq ( $ali->each_alphabetically() ) + Function : + + Returns an array of sequence object sorted alphabetically + by name and then by start point. + Does not change the order of the alignment + + Returns : + Argument : + +=cut + +sub each_alphabetically { + my $self = shift; + my ($seq,$nse,@arr,%hash,$count); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + + foreach $nse ( sort _alpha_startend keys %hash) { + push(@arr,$hash{$nse}); + } + + return @arr; + +} + +sub _alpha_startend { + my ($aname,$astart,$bname,$bstart); + ($aname,$astart) = split (/-/,$a); + ($bname,$bstart) = split (/-/,$b); + + if( $aname eq $bname ) { + return $astart <=> $bstart; + } + else { + return $aname cmp $bname; + } + +} + +=head2 each_seq_with_id + + Title : each_seq_with_id + Usage : foreach $seq ( $align->each_seq_with_id() ) + Function : + + Gets an array of Seq objects from the + alignment, the contents being those sequences + with the given name (there may be more than one) + + Returns : an array + Argument : a seq name + +=cut + +sub eachSeqWithId { + my $self = shift; + $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead."); + $self->each_seq_with_id(@_); +} + +sub each_seq_with_id { + my $self = shift; + my $id = shift; + + $self->throw("Method each_seq_with_id needs a sequence name argument") + unless defined $id; + + my (@arr, $seq); + + if (exists($self->{'_start_end_lists'}->{$id})) { + @arr = @{$self->{'_start_end_lists'}->{$id}}; + } + return @arr; +} + +=head2 get_seq_by_pos + + Title : get_seq_by_pos + Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment + Function : + + Gets a sequence based on its position in the alignment. + Numbering starts from 1. Sequence positions larger than + no_sequences() will thow an error. + + Returns : a Bio::LocatableSeq object + Args : positive integer for the sequence osition + +=cut + +sub get_seq_by_pos { + + my $self = shift; + my ($pos) = @_; + + $self->throw("Sequence position has to be a positive integer, not [$pos]") + unless $pos =~ /^\d+$/ and $pos > 0; + $self->throw("No sequence at position [$pos]") + unless $pos <= $self->no_sequences ; + + my $nse = $self->{'_order'}->{--$pos}; + return $self->{'_seq'}->{$nse}; +} + +=head1 Create new alignments + +The result of these methods are horizontal or vertical subsets of the +current MSE. + +=head2 select + + Title : select + Usage : $aln2 = $aln->select(1, 3) # three first sequences + Function : + + Creates a new alignment from a continuous subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::SimpleAlign object + Args : positive integer for the first sequence + positive integer for the last sequence to include (optional) + +=cut + +sub select { + my $self = shift; + my ($start, $end) = @_; + + $self->throw("Select start has to be a positive integer, not [$start]") + unless $start =~ /^\d+$/ and $start > 0; + $self->throw("Select end has to be a positive integer, not [$end]") + unless $end =~ /^\d+$/ and $end > 0; + $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]") + unless $start <= $end; + + my $aln = new $self; + foreach my $pos ($start .. $end) { + $aln->add_seq($self->get_seq_by_pos($pos)); + } + $aln->id($self->id); + return $aln; +} + +=head2 select_noncont + + Title : select_noncont + Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences + Function : + + Creates a new alignment from a subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::SimpleAlign object + Args : array of integers for the sequences + +=cut + +sub select_noncont { + my $self = shift; + my (@pos) = @_; + my $end = $self->no_sequences; + foreach ( @pos ) { + $self->throw("position must be a positive integer, > 0 and <= $end not [$_]") + unless( /^\d+$/ && $_ > 0 && $_ <= $end ); + } + my $aln = new $self; + foreach my $p (@pos) { + $aln->add_seq($self->get_seq_by_pos($p)); + } + $aln->id($self->id); + return $aln; +} + +=head2 slice + + Title : slice + Usage : $aln2 = $aln->slice(20, 30) + Function : + + Creates a slice from the alignment inclusive of start and + end columns. Sequences with no residues in the slice are + excluded from the new alignment and a warning is printed. + Slice beyond the length of the sequence does not do + padding. + + Returns : a Bio::SimpleAlign object + Args : positive integer for start column + positive integer for end column + +=cut + +sub slice { + my $self = shift; + my ($start, $end) = @_; + + $self->throw("Slice start has to be a positive integer, not [$start]") + unless $start =~ /^\d+$/ and $start > 0; + $self->throw("Slice end has to be a positive integer, not [$end]") + unless $end =~ /^\d+$/ and $end > 0; + $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]") + unless $start <= $end; + my $aln_length = $self->length; + $self->throw("This alignment has only ". $self->length. + " residues. Slice start [$start] is too bigger.") + if $start > $self->length; + + my $aln = new $self; + $aln->id($self->id); + foreach my $seq ( $self->each_seq() ) { + + my $new_seq = new Bio::LocatableSeq (-id => $seq->id); + + # seq + my $seq_end = $end; + $seq_end = $seq->length if $end > $seq->length; + my $slice_seq = $seq->subseq($start, $seq_end); + $new_seq->seq( $slice_seq ); + + # start + if ($start > 1) { + my $pre_start_seq = $seq->subseq(1, $start - 1); + $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n"; + $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); + } else { + $new_seq->start( $seq->start); + } + + # end + $slice_seq =~ s/\W//g; + $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 ); + + if ($new_seq->start and $new_seq->end >= $new_seq->start) { + $aln->add_seq($new_seq); + } else { + my $nse = $seq->get_nse(); + $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.". + " Sequence excluded from the new alignment."); + } + + } + + return $aln; +} + +=head2 remove_columns + + Title : remove_column + Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) + Function : + Creates an aligment with columns removed corresponding to + the specified criteria. + Returns : a L<Bio::SimpleAlign> object + Args : array ref of types, 'match'|'weak'|'strong'|'mismatch' + +=cut + +sub remove_columns{ + my ($self,$type) = @_; + my %matchchars = ( 'match' => '\*', + 'weak' => '\.', + 'strong' => ':', + 'mismatch'=> ' ', + ); + #get the characters to delete against + my $del_char; + foreach my $type(@{$type}){ + $del_char.= $matchchars{$type}; + } + + my $match_line = $self->match_line; + my $aln = new $self; + + my @remove; + my $length = 0; + + #do the matching to get the segments to remove + while($match_line=~m/[$del_char]/g){ + my $start = pos($match_line)-1; + $match_line=~/\G[$del_char]+/gc; + my $end = pos($match_line)-1; + + #have to offset the start and end for subsequent removes + $start-=$length; + $end -=$length; + $length += ($end-$start+1); + push @remove, [$start,$end]; + } + + #remove the segments + $aln = $self->_remove_col($aln,\@remove); + + return $aln; +} + +sub _remove_col { + my ($self,$aln,$remove) = @_; + my @new; + + #splice out the segments and create new seq + foreach my $seq($self->each_seq){ + my $new_seq = new Bio::LocatableSeq(-id=>$seq->id); + my $sequence; + foreach my $pair(@{$remove}){ + my $start = $pair->[0]; + my $end = $pair->[1]; + $sequence = $seq->seq unless $sequence; + my $spliced; + $spliced .= $start > 0 ? substr($sequence,0,$start) : ''; + $spliced .= substr($sequence,$end+1,$seq->length-$end+1); + $sequence = $spliced; + if ($start == 1) { + $new_seq->start($end); + } + else { + $new_seq->start( $seq->start); + } + # end + if($end >= $seq->end){ + $new_seq->end( $start); + } + else { + $new_seq->end($seq->end); + } + } + $new_seq->seq($sequence); + push @new, $new_seq; + } + #add the new seqs to the alignment + foreach my $new(@new){ + $aln->add_seq($new); + } + return $aln; +} + +=head1 Change sequences within the MSE + +These methods affect characters in all sequences without changeing the +alignment. + + +=head2 map_chars + + Title : map_chars + Usage : $ali->map_chars('\.','-') + Function : + + Does a s/$arg1/$arg2/ on the sequences. Useful for gap + characters + + Notice that the from (arg1) is interpretted as a regex, + so be careful about quoting meta characters (eg + $ali->map_chars('.','-') wont do what you want) + + Returns : + Argument : 'from' rexexp + 'to' string + +=cut + +sub map_chars { + my $self = shift; + my $from = shift; + my $to = shift; + my ($seq,$temp); + + $self->throw("Need exactly two arguments") + unless defined $from and defined $to; + + foreach $seq ( $self->each_seq() ) { + $temp = $seq->seq(); + $temp =~ s/$from/$to/g; + $seq->seq($temp); + } + return 1; +} + + +=head2 uppercase + + Title : uppercase() + Usage : $ali->uppercase() + Function : Sets all the sequences to uppercase + Returns : + Argument : + +=cut + +sub uppercase { + my $self = shift; + my $seq; + my $temp; + + foreach $seq ( $self->each_seq() ) { + $temp = $seq->seq(); + $temp =~ tr/[a-z]/[A-Z]/; + + $seq->seq($temp); + } + return 1; +} + +=head2 cigar_line + + Title : cigar_line() + Usage : $align->cigar_line() + Function : Generates a "cigar" line for each sequence in the alignment + The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58 + where A,B,C,etc. are the sequence identifiers, and the numbers + refer to conserved positions within the alignment + Args : none + +=cut + +sub cigar_line { + my ($self) = @_; + + my %cigar; + my %clines; + my @seqchars; + my $seqcount = 0; + my $sc; + foreach my $seq ( $self->each_seq ) { + push @seqchars, [ split(//, uc ($seq->seq)) ]; + $sc = scalar(@seqchars); + } + + foreach my $pos ( 0..$self->length ) { + my $i=0; + foreach my $seq ( @seqchars ) { + $i++; +# print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n"; + if ($seq->[$pos] eq '.') { + if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) { + $clines{$i}.=$cigar{$i}; + } + } + else { + if (! defined $cigar{$i}) { + $clines{$i}.=($pos+1).","; + } + $cigar{$i}=$pos+1; + } + if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) { + $clines{$i}.=$cigar{$i}; + } + } + } + for(my $i=1; $i<$sc+1;$i++) { + print STDERR "Seq $i cigar line ".$clines{$i}."\n"; + } + return %clines; +} + +=head2 match_line + + Title : match_line() + Usage : $align->match_line() + Function : Generates a match line - much like consensus string + except that a line indicating the '*' for a match. + Args : (optional) Match line characters ('*' by default) + (optional) Strong match char (':' by default) + (optional) Weak match char ('.' by default) + +=cut + +sub match_line { + my ($self,$matchlinechar, $strong, $weak) = @_; + my %matchchars = ( 'match' => $matchlinechar || '*', + 'weak' => $weak || '.', + 'strong' => $strong || ':', + 'mismatch'=> ' ', + ); + + + my @seqchars; + my $seqcount = 0; + my $alphabet; + foreach my $seq ( $self->each_seq ) { + push @seqchars, [ split(//, uc ($seq->seq)) ]; + $alphabet = $seq->alphabet unless defined $alphabet; + } + my $refseq = shift @seqchars; + # let's just march down the columns + my $matchline; + POS: foreach my $pos ( 0..$self->length ) { + my $refchar = $refseq->[$pos]; + next unless $refchar; # skip '' + my %col = ($refchar => 1); + my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' '); + foreach my $seq ( @seqchars ) { + $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' || + $seq->[$pos] eq ' ' ); + $col{$seq->[$pos]}++; + } + my @colresidues = sort keys %col; + my $char = $matchchars{'mismatch'}; + # if all the values are the same + if( $dash ) { $char = $matchchars{'mismatch'} } + elsif( @colresidues == 1 ) { $char = $matchchars{'match'} } + elsif( $alphabet eq 'protein' ) { # only try to do weak/strong + # matches for protein seqs + TYPE: foreach my $type ( qw(strong weak) ) { + # iterate through categories + my %groups; + # iterate through each of the aa in the col + # look to see which groups it is in + foreach my $c ( @colresidues ) { + foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) { + push @{$groups{$f}},$c; + } + } + GRP: foreach my $cols ( values %groups ) { + @$cols = sort @$cols; + # now we are just testing to see if two arrays + # are identical w/o changing either one + + # have to be same len + next if( scalar @$cols != scalar @colresidues ); + # walk down the length and check each slot + for($_=0;$_ < (scalar @$cols);$_++ ) { + next GRP if( $cols->[$_] ne $colresidues[$_] ); + } + $char = $matchchars{$type}; + last TYPE; + } + } + } + $matchline .= $char; + } + return $matchline; +} + +=head2 match + + Title : match() + Usage : $ali->match() + Function : + + Goes through all columns and changes residues that are + identical to residue in first sequence to match '.' + character. Sets match_char. + + USE WITH CARE: Most MSE formats do not support match + characters in sequences, so this is mostly for output + only. NEXUS format (Bio::AlignIO::nexus) can handle + it. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +=cut + +sub match { + my ($self, $match) = @_; + + $match ||= '.'; + my ($matching_char) = $match; + $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #'; + $self->map_chars($matching_char, '-'); + + my @seqs = $self->each_seq(); + return 1 unless scalar @seqs > 1; + + my $refseq = shift @seqs ; + my @refseq = split //, $refseq->seq; + my $gapchar = $self->gap_char; + + foreach my $seq ( @seqs ) { + my @varseq = split //, $seq->seq(); + for ( my $i=0; $i < scalar @varseq; $i++) { + $varseq[$i] = $match if defined $refseq[$i] && + ( $refseq[$i] =~ /[A-Za-z\*]/ || + $refseq[$i] =~ /$gapchar/ ) + && $refseq[$i] eq $varseq[$i]; + } + $seq->seq(join '', @varseq); + } + $self->match_char($match); + return 1; +} + + +=head2 unmatch + + Title : unmatch() + Usage : $ali->unmatch() + Function : Undoes the effect of method match. Unsets match_char. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +See L<match> and L<match_char> + +=cut + +sub unmatch { + my ($self, $match) = @_; + + $match ||= '.'; + + my @seqs = $self->each_seq(); + return 1 unless scalar @seqs > 1; + + my $refseq = shift @seqs ; + my @refseq = split //, $refseq->seq; + my $gapchar = $self->gap_char; + foreach my $seq ( @seqs ) { + my @varseq = split //, $seq->seq(); + for ( my $i=0; $i < scalar @varseq; $i++) { + $varseq[$i] = $refseq[$i] if defined $refseq[$i] && + ( $refseq[$i] =~ /[A-Za-z\*]/ || + $refseq[$i] =~ /$gapchar/ ) && + $varseq[$i] eq $match; + } + $seq->seq(join '', @varseq); + } + $self->match_char(''); + return 1; +} + +=head1 MSE attibutes + +Methods for setting and reading the MSE attributes. + +Note that the methods defining character semantics depend on the user +to set them sensibly. They are needed only by certain input/output +methods. Unset them by setting to an empty string (''). + +=head2 id + + Title : id + Usage : $myalign->id("Ig") + Function : Gets/sets the id field of the alignment + Returns : An id string + Argument : An id string (optional) + +=cut + +sub id { + my ($self, $name) = @_; + + if (defined( $name )) { + $self->{'_id'} = $name; + } + + return $self->{'_id'}; +} + +=head2 missing_char + + Title : missing_char + Usage : $myalign->missing_char("?") + Function : Gets/sets the missing_char attribute of the alignment + It is generally recommended to set it to 'n' or 'N' + for nucleotides and to 'X' for protein. + Returns : An missing_char string, + Argument : An missing_char string (optional) + +=cut + +sub missing_char { + my ($self, $char) = @_; + + if (defined $char ) { + $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1; + $self->{'_missing_char'} = $char; + } + + return $self->{'_missing_char'}; +} + +=head2 match_char + + Title : match_char + Usage : $myalign->match_char('.') + Function : Gets/sets the match_char attribute of the alignment + Returns : An match_char string, + Argument : An match_char string (optional) + +=cut + +sub match_char { + my ($self, $char) = @_; + + if (defined $char ) { + $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1; + $self->{'_match_char'} = $char; + } + + return $self->{'_match_char'}; +} + +=head2 gap_char + + Title : gap_char + Usage : $myalign->gap_char('-') + Function : Gets/sets the gap_char attribute of the alignment + Returns : An gap_char string, defaults to '-' + Argument : An gap_char string (optional) + +=cut + +sub gap_char { + my ($self, $char) = @_; + + if (defined $char || ! defined $self->{'_gap_char'} ) { + $char= '-' unless defined $char; + $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; + $self->{'_gap_char'} = $char; + } + return $self->{'_gap_char'}; +} + +=head2 symbol_chars + + Title : symbol_chars + Usage : my @symbolchars = $aln->symbol_chars; + Function: Returns all the seen symbols (other than gaps) + Returns : array of characters that are the seen symbols + Args : boolean to include the gap/missing/match characters + +=cut + +sub symbol_chars{ + my ($self,$includeextra) = @_; + if( ! defined $self->{'_symbols'} ) { + $self->warn("Symbol list was not initialized"); + return (); + } + my %copy = %{$self->{'_symbols'}}; + if( ! $includeextra ) { + foreach my $char ( $self->gap_char, $self->match_char, + $self->missing_char) { + delete $copy{$char} if( defined $char ); + } + } + return keys %copy; +} + +=head1 Alignment descriptors + +These read only methods describe the MSE in various ways. + + +=head2 consensus_string + + Title : consensus_string + Usage : $str = $ali->consensus_string($threshold_percent) + Function : Makes a strict consensus + Returns : + Argument : Optional treshold ranging from 0 to 100. + The consensus residue has to appear at least threshold % + of the sequences at a given location, otherwise a '?' + character will be placed at that location. + (Default value = 0%) + +=cut + +sub consensus_string { + my $self = shift; + my $threshold = shift; + my $len; + my ($out,$count); + + $out = ""; + + $len = $self->length - 1; + + foreach $count ( 0 .. $len ) { + $out .= $self->_consensus_aa($count,$threshold); + } + return $out; +} + +sub _consensus_aa { + my $self = shift; + my $point = shift; + my $threshold_percent = shift || -1 ; + my ($seq,%hash,$count,$letter,$key); + + foreach $seq ( $self->each_seq() ) { + $letter = substr($seq->seq,$point,1); + $self->throw("--$point-----------") if $letter eq ''; + ($letter =~ /\./) && next; + # print "Looking at $letter\n"; + $hash{$letter}++; + } + my $number_of_sequences = $self->no_sequences(); + my $threshold = $number_of_sequences * $threshold_percent / 100. ; + $count = -1; + $letter = '?'; + + foreach $key ( sort keys %hash ) { + # print "Now at $key $hash{$key}\n"; + if( $hash{$key} > $count && $hash{$key} >= $threshold) { + $letter = $key; + $count = $hash{$key}; + } + } + return $letter; +} + + +=head2 consensus_iupac + + Title : consensus_iupac + Usage : $str = $ali->consensus_iupac() + Function : + + Makes a consensus using IUPAC ambiguity codes from DNA + and RNA. The output is in upper case except when gaps in + a column force output to be in lower case. + + Note that if your alignment sequences contain a lot of + IUPAC ambiquity codes you often have to manually set + alphabet. Bio::PrimarySeq::_guess_type thinks they + indicate a protein sequence. + + Returns : consensus string + Argument : none + Throws : on protein sequences + +=cut + +sub consensus_iupac { + my $self = shift; + my $out = ""; + my $len = $self->length-1; + + # only DNA and RNA sequences are valid + foreach my $seq ( $self->each_seq() ) { + $self->throw("Seq [". $seq->get_nse. "] is a protein") + if $seq->alphabet eq 'protein'; + } + # loop over the alignment columns + foreach my $count ( 0 .. $len ) { + $out .= $self->_consensus_iupac($count); + } + return $out; +} + +sub _consensus_iupac { + my ($self, $column) = @_; + my ($string, $char, $rna); + + #determine all residues in a column + foreach my $seq ( $self->each_seq() ) { + $string .= substr($seq->seq, $column, 1); + } + $string = uc $string; + + # quick exit if there's an N in the string + if ($string =~ /N/) { + $string =~ /\W/ ? return 'n' : return 'N'; + } + # ... or if there are only gap characters + return '-' if $string =~ /^\W+$/; + + # treat RNA as DNA in regexps + if ($string =~ /U/) { + $string =~ s/U/T/; + $rna = 1; + } + + # the following s///'s only need to be done to the _first_ ambiguity code + # as we only need to see the _range_ of characters in $string + + if ($string =~ /[VDHB]/) { + $string =~ s/V/AGC/; + $string =~ s/D/AGT/; + $string =~ s/H/ACT/; + $string =~ s/B/CTG/; + } + + if ($string =~ /[SKYRWM]/) { + $string =~ s/S/GC/; + $string =~ s/K/GT/; + $string =~ s/Y/CT/; + $string =~ s/R/AG/; + $string =~ s/W/AT/; + $string =~ s/M/AC/; + } + + # and now the guts of the thing + + if ($string =~ /A/) { + $char = 'A'; # A A + if ($string =~ /G/) { + $char = 'R'; # A and G (purines) R + if ($string =~ /C/) { + $char = 'V'; # A and G and C V + if ($string =~ /T/) { + $char = 'N'; # A and G and C and T N + } + } elsif ($string =~ /T/) { + $char = 'D'; # A and G and T D + } + } elsif ($string =~ /C/) { + $char = 'M'; # A and C M + if ($string =~ /T/) { + $char = 'H'; # A and C and T H + } + } elsif ($string =~ /T/) { + $char = 'W'; # A and T W + } + } elsif ($string =~ /C/) { + $char = 'C'; # C C + if ($string =~ /T/) { + $char = 'Y'; # C and T (pyrimidines) Y + if ($string =~ /G/) { + $char = 'B'; # C and T and G B + } + } elsif ($string =~ /G/) { + $char = 'S'; # C and G S + } + } elsif ($string =~ /G/) { + $char = 'G'; # G G + if ($string =~ /C/) { + $char = 'S'; # G and C S + } elsif ($string =~ /T/) { + $char = 'K'; # G and T K + } + } elsif ($string =~ /T/) { + $char = 'T'; # T T + } + + $char = 'U' if $rna and $char eq 'T'; + $char = lc $char if $string =~ /\W/; + + return $char; +} + +=head2 is_flush + + Title : is_flush + Usage : if( $ali->is_flush() ) + : + : + Function : Tells you whether the alignment + : is flush, ie all of the same length + : + : + Returns : 1 or 0 + Argument : + +=cut + +sub is_flush { + my ($self,$report) = @_; + my $seq; + my $length = (-1); + my $temp; + + foreach $seq ( $self->each_seq() ) { + if( $length == (-1) ) { + $length = CORE::length($seq->seq()); + next; + } + + $temp = CORE::length($seq->seq()); + if( $temp != $length ) { + $self->warn("expecting $length not $temp from ". + $seq->display_id) if( $report ); + $self->debug("expecting $length not $temp from ". + $seq->display_id); + $self->debug($seq->seq(). "\n"); + return 0; + } + } + + return 1; +} + + +=head2 length + + Title : length() + Usage : $len = $ali->length() + Function : Returns the maximum length of the alignment. + To be sure the alignment is a block, use is_flush + Returns : + Argument : + +=cut + +sub length_aln { + my $self = shift; + $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead."); + $self->length(@_); +} + +sub length { + my $self = shift; + my $seq; + my $length = (-1); + my ($temp,$len); + + foreach $seq ( $self->each_seq() ) { + $temp = CORE::length($seq->seq()); + if( $temp > $length ) { + $length = $temp; + } + } + + return $length; +} + + +=head2 maxdisplayname_length + + Title : maxdisplayname_length + Usage : $ali->maxdisplayname_length() + Function : + + Gets the maximum length of the displayname in the + alignment. Used in writing out various MSE formats. + + Returns : integer + Argument : + +=cut + +sub maxname_length { + my $self = shift; + $self->warn(ref($self). "::maxname_length - deprecated method.". + " Use maxdisplayname_length() instead."); + $self->maxdisplayname_length(); +} + +sub maxnse_length { + my $self = shift; + $self->warn(ref($self). "::maxnse_length - deprecated method.". + " Use maxnse_length() instead."); + $self->maxdisplayname_length(); +} + +sub maxdisplayname_length { + my $self = shift; + my $maxname = (-1); + my ($seq,$len); + + foreach $seq ( $self->each_seq() ) { + $len = CORE::length $self->displayname($seq->get_nse()); + + if( $len > $maxname ) { + $maxname = $len; + } + } + + return $maxname; +} + +=head2 no_residues + + Title : no_residues + Usage : $no = $ali->no_residues + Function : number of residues in total in the alignment + Returns : integer + Argument : + +=cut + +sub no_residues { + my $self = shift; + my $count = 0; + + foreach my $seq ($self->each_seq) { + my $str = $seq->seq(); + + $count += ($str =~ s/[^A-Za-z]//g); + } + + return $count; +} + +=head2 no_sequences + + Title : no_sequences + Usage : $depth = $ali->no_sequences + Function : number of sequence in the sequence alignment + Returns : integer + Argument : + +=cut + +sub no_sequences { + my $self = shift; + + return scalar($self->each_seq); +} + + +=head2 average_percentage_identity + + Title : average_percentage_identity + Usage : $id = $align->average_percentage_identity + Function: The function uses a fast method to calculate the average + percentage identity of the alignment + Returns : The average percentage identity of the alignment + Args : None + Notes : This method implemented by Kevin Howe calculates a figure that is + designed to be similar to the average pairwise identity of the + alignment (identical in the absence of gaps), without having to + explicitly calculate pairwise identities proposed by Richard Durbin. + Validated by Ewan Birney ad Alex Bateman. + +=cut + +sub average_percentage_identity{ + my ($self,@args) = @_; + + my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', + 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); + + my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes); + + if (! $self->is_flush()) { + $self->throw("All sequences in the alignment must be the same length"); + } + + @seqs = $self->each_seq(); + $len = $self->length(); + + # load the each hash with correct keys for existence checks + + for( my $index=0; $index < $len; $index++) { + foreach my $letter (@alphabet) { + $countHashes[$index]->{$letter} = 0; + } + } + foreach my $seq (@seqs) { + my @seqChars = split //, $seq->seq(); + for( my $column=0; $column < @seqChars; $column++ ) { + my $char = uc($seqChars[$column]); + if (exists $countHashes[$column]->{$char}) { + $countHashes[$column]->{$char}++; + } + } + } + + $total = 0; + $divisor = 0; + for(my $column =0; $column < $len; $column++) { + my %hash = %{$countHashes[$column]}; + $subdivisor = 0; + foreach my $res (keys %hash) { + $total += $hash{$res}*($hash{$res} - 1); + $subdivisor += $hash{$res}; + } + $divisor += $subdivisor * ($subdivisor - 1); + } + return $divisor > 0 ? ($total / $divisor )*100.0 : 0; +} + +=head2 percentage_identity + + Title : percentage_identity + Usage : $id = $align->percentage_identity + Function: The function calculates the average percentage identity + (aliased for average_percentage_identity) + Returns : The average percentage identity + Args : None + +=cut + +sub percentage_identity { + my $self = shift; + return $self->average_percentage_identity(); +} + +=head2 overall_percentage_identity + + Title : percentage_identity + Usage : $id = $align->percentage_identity + Function: The function calculates the percentage identity of + the conserved columns + Returns : The percentage identity of the conserved columns + Args : None + +=cut + +sub overall_percentage_identity{ + my ($self,@args) = @_; + + my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', + 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); + + my ($len, $total, @seqs, @countHashes); + + if (! $self->is_flush()) { + $self->throw("All sequences in the alignment must be the same length"); + } + + @seqs = $self->each_seq(); + $len = $self->length(); + + # load the each hash with correct keys for existence checks + for( my $index=0; $index < $len; $index++) { + foreach my $letter (@alphabet) { + $countHashes[$index]->{$letter} = 0; + } + } + foreach my $seq (@seqs) { + my @seqChars = split //, $seq->seq(); + for( my $column=0; $column < @seqChars; $column++ ) { + my $char = uc($seqChars[$column]); + if (exists $countHashes[$column]->{$char}) { + $countHashes[$column]->{$char}++; + } + } + } + + $total = 0; + for(my $column =0; $column < $len; $column++) { + my %hash = %{$countHashes[$column]}; + foreach ( values %hash ) { + next if( $_ == 0 ); + $total++ if( $_ == scalar @seqs ); + last; + } + } + return ($total / $len ) * 100.0; +} + +=head1 Alignment positions + +Methods to map a sequence position into an alignment column and back. +column_from_residue_number() does the former. The latter is really a +property of the sequence object and can done using +L<Bio::LocatableSeq::location_from_column>: + + # select somehow a sequence from the alignment, e.g. + my $seq = $aln->get_seq_by_pos(1); + #$loc is undef or Bio::LocationI object + my $loc = $seq->location_from_column(5); + + +=head2 column_from_residue_number + + Title : column_from_residue_number + Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber) + Function: + + This function gives the position in the alignment + (i.e. column number) of the given residue number in the + sequence with the given name. For example, for the + alignment + + Seq1/91-97 AC..DEF.GH + Seq2/24-30 ACGG.RTY.. + Seq3/43-51 AC.DDEFGHI + + column_from_residue_number( "Seq1", 94 ) returns 5. + column_from_residue_number( "Seq2", 25 ) returns 2. + column_from_residue_number( "Seq3", 50 ) returns 9. + + An exception is thrown if the residue number would lie + outside the length of the aligment + (e.g. column_from_residue_number( "Seq2", 22 ) + + Note: If the the parent sequence is represented by more than + one alignment sequence and the residue number is present in + them, this method finds only the first one. + + Returns : A column number for the position in the alignment of the + given residue in the given sequence (1 = first column) + Args : A sequence id/name (not a name/start-end) + A residue number in the whole sequence (not just that + segment of it in the alignment) + +=cut + +sub column_from_residue_number { + my ($self, $name, $resnumber) = @_; + + $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name}; + $self->throw("Second argument residue number missing") unless $resnumber; + + foreach my $seq ($self->each_seq_with_id($name)) { + my $col; + eval { + $col = $seq->column_from_residue_number($resnumber); + }; + next if $@; + return $col; + } + + $self->throw("Could not find a sequence segment in $name ". + "containing residue number $resnumber"); + +} + +=head1 Sequence names + +Methods to manipulate the display name. The default name based on the +sequence id and subsequence positions can be overridden in various +ways. + +=head2 displayname + + Title : displayname + Usage : $myalign->displayname("Ig", "IgA") + Function : Gets/sets the display name of a sequence in the alignment + : + Returns : A display name string + Argument : name of the sequence + displayname of the sequence (optional) + +=cut + +sub get_displayname { + my $self = shift; + $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead."); + $self->displayname(@_); +} + +sub set_displayname { + my $self = shift; + $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead."); + $self->displayname(@_); +} + +sub displayname { + my ($self, $name, $disname) = @_; + + $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name}; + + if( $disname and $name) { + $self->{'_dis_name'}->{$name} = $disname; + return $disname; + } + elsif( defined $self->{'_dis_name'}->{$name} ) { + return $self->{'_dis_name'}->{$name}; + } else { + return $name; + } +} + +=head2 set_displayname_count + + Title : set_displayname_count + Usage : $ali->set_displayname_count + Function : + + Sets the names to be name_# where # is the number of + times this name has been used. + + Returns : + Argument : + +=cut + +sub set_displayname_count { + my $self= shift; + my (@arr,$name,$seq,$count,$temp,$nse); + + foreach $seq ( $self->each_alphabetically() ) { + $nse = $seq->get_nse(); + + #name will be set when this is the second + #time (or greater) is has been seen + + if( defined $name and $name eq ($seq->id()) ) { + $temp = sprintf("%s_%s",$name,$count); + $self->displayname($nse,$temp); + $count++; + } else { + $count = 1; + $name = $seq->id(); + $temp = sprintf("%s_%s",$name,$count); + $self->displayname($nse,$temp); + $count++; + } + } + return 1; +} + +=head2 set_displayname_flat + + Title : set_displayname_flat + Usage : $ali->set_displayname_flat() + Function : Makes all the sequences be displayed as just their name, + not name/start-end + Returns : 1 + Argument : + +=cut + +sub set_displayname_flat { + my $self = shift; + my ($nse,$seq); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse(); + $self->displayname($nse,$seq->id()); + } + return 1; +} + +=head2 set_displayname_normal + + Title : set_displayname_normal + Usage : $ali->set_displayname_normal() + Function : Makes all the sequences be displayed as name/start-end + Returns : + Argument : + +=cut + +sub set_displayname_normal { + my $self = shift; + my ($nse,$seq); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse(); + $self->displayname($nse,$nse); + } + return 1; +} + +=head2 source + + Title : source + Usage : $obj->source($newval) + Function: sets the Alignment source program + Example : + Returns : value of source + Args : newvalue (optional) + + +=cut + +sub source{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_source'} = $value; + } + return $self->{'_source'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Species.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Species.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,338 @@ +# $Id: Species.pm,v 1.24 2002/12/05 13:46:30 heikki Exp $ +# +# BioPerl module for Bio::Species +# +# Cared for by James Gilbert <jgrg@sanger.ac.uk> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Species - Generic species object + +=head1 SYNOPSIS + + $species = Bio::Species->new(-classification => [@classification]); + # Can also pass classification + # array to new as below + + $species->classification(qw( sapiens Homo Hominidae + Catarrhini Primates Eutheria + Mammalia Vertebrata Chordata + Metazoa Eukaryota )); + + $genus = $species->genus(); + + $bi = $species->binomial(); # $bi is now "Homo sapiens" + + # For storing common name + $species->common_name("human"); + + # For storing subspecies + $species->sub_species("accountant"); + +=head1 DESCRIPTION + +Provides a very simple object for storing phylogenetic +information. The classification is stored in an array, +which is a list of nodes in a phylogenetic tree. Access to +getting and setting species and genus is provided, but not +to any of the other node types (eg: "phylum", "class", +"order", "family"). There's plenty of scope for making the +model more sophisticated, if this is ever needed. + +A methods are also provided for storing common +names, and subspecies. + +=head1 CONTACT + +James Gilbert email B<jgrg@sanger.ac.uk> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +#' Let the code begin... + + +package Bio::Species; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::Root::Root; + + +@ISA = qw(Bio::Root::Root); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'classification'} = []; + $self->{'common_name'} = undef; + my ($classification) = $self->_rearrange([qw(CLASSIFICATION)], @args); + if( defined $classification && + (ref($classification) eq "ARRAY") ) { + $self->classification(@$classification); + } + return $self; +} + +=head2 classification + + Title : classification + Usage : $self->classification(@class_array); + @classification = $self->classification(); + Function: Fills or returns the classification list in + the object. The array provided must be in + the order SPECIES, GENUS ---> KINGDOM. + Checks are made that species is in lower case, + and all other elements are in title case. + Example : $obj->classification(qw( sapiens Homo Hominidae + Catarrhini Primates Eutheria Mammalia Vertebrata + Chordata Metazoa Eukaryota)); + Returns : Classification array + Args : Classification array + OR + A reference to the classification array. In the latter case + if there is a second argument and it evaluates to true, + names will not be validated. + + +=cut + + +sub classification { + my ($self,@args) = @_; + + if (@args) { + + my ($classif,$force); + if(ref($args[0])) { + $classif = shift(@args); + $force = shift(@args); + } else { + $classif = \@args; + } + + # Check the names supplied in the classification string + # Species should be in lower case + if(! $force) { + $self->validate_species_name($classif->[0]); + # All other names must be in title case + foreach (@$classif) { + $self->validate_name( $_ ); + } + } + # Store classification + $self->{'classification'} = $classif; + } + return @{$self->{'classification'}}; +} + +=head2 common_name + + Title : common_name + Usage : $self->common_name( $common_name ); + $common_name = $self->common_name(); + Function: Get or set the common name of the species + Example : $self->common_name('human') + Returns : The common name in a string + Args : String, which is the common name (optional) + +=cut + +sub common_name{ + my $self = shift; + + return $self->{'common_name'} = shift if @_; + return $self->{'common_name'}; +} + +=head2 variant + + Title : variant + Usage : $obj->variant($newval) + Function: Get/set variant information for this species object (strain, + isolate, etc). + Example : + Returns : value of variant (a scalar) + Args : new value (a scalar or undef, optional) + + +=cut + +sub variant{ + my $self = shift; + + return $self->{'variant'} = shift if @_; + return $self->{'variant'}; +} + +=head2 organelle + + Title : organelle + Usage : $self->organelle( $organelle ); + $organelle = $self->organelle(); + Function: Get or set the organelle name + Example : $self->organelle('Chloroplast') + Returns : The organelle name in a string + Args : String, which is the organelle name + +=cut + +sub organelle { + my($self, $name) = @_; + + if ($name) { + $self->{'organelle'} = $name; + } else { + return $self->{'organelle'} + } +} + +=head2 species + + Title : species + Usage : $self->species( $species ); + $species = $self->species(); + Function: Get or set the scientific species name. The species + name must be in lower case. + Example : $self->species( 'sapiens' ); + Returns : Scientific species name as string + Args : Scientific species name as string + +=cut + + +sub species { + my($self, $species) = @_; + + if ($species) { + $self->validate_species_name( $species ); + $self->{'classification'}[0] = $species; + } + return $self->{'classification'}[0]; +} + +=head2 genus + + Title : genus + Usage : $self->genus( $genus ); + $genus = $self->genus(); + Function: Get or set the scientific genus name. The genus + must be in title case. + Example : $self->genus( 'Homo' ); + Returns : Scientific genus name as string + Args : Scientific genus name as string + +=cut + + +sub genus { + my($self, $genus) = @_; + + if ($genus) { + $self->validate_name( $genus ); + $self->{'classification'}[1] = $genus; + } + return $self->{'classification'}[1]; +} + +=head2 sub_species + + Title : sub_species + Usage : $obj->sub_species($newval) + Function: + Returns : value of sub_species + Args : newvalue (optional) + + +=cut + +sub sub_species { + my( $self, $sub ) = @_; + + if ($sub) { + $self->{'_sub_species'} = $sub; + } + return $self->{'_sub_species'}; +} + +=head2 binomial + + Title : binomial + Usage : $binomial = $self->binomial(); + $binomial = $self->binomial('FULL'); + Function: Returns a string "Genus species", or "Genus species subspecies", + the first argument is 'FULL' (and the species has a subspecies). + Args : Optionally the string 'FULL' to get the full name including + the subspecies. + +=cut + + +sub binomial { + my( $self, $full ) = @_; + + my( $species, $genus ) = $self->classification(); + unless( defined $species) { + $species = 'sp.'; + $self->warn("classification was not set"); + } + $genus = '' unless( defined $genus); + my $bi = "$genus $species"; + if (defined($full) && ((uc $full) eq 'FULL')) { + my $ssp = $self->sub_species; + $bi .= " $ssp" if $ssp; + } + return $bi; +} + +sub validate_species_name { + my( $self, $string ) = @_; + + return 1 if $string eq "sp."; + return 1 if $string =~ /^[a-z][\w\s]+$/i; + $self->throw("Invalid species name '$string'"); +} + +sub validate_name { + return 1; # checking is disabled as there is really not much we can + # enforce HL 2002/10/03 +# my( $self, $string ) = @_; + +# return 1 if $string =~ /^[\w\s\-\,\.]+$/ or +# $self->throw("Invalid name '$string'"); +} + +=head2 ncbi_taxid + + Title : ncbi_taxid + Usage : $obj->ncbi_taxid($newval) + Function: Get/set the NCBI Taxon ID + Returns : the NCBI Taxon ID as a string + Args : newvalue to set or undef to unset (optional) + + +=cut + +sub ncbi_taxid { + my $self = shift; + + return $self->{'_ncbi_taxid'} = shift if @_; + return $self->{'_ncbi_taxid'}; +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/Atom.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/Atom.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,613 @@ +# $Id: Atom.pm,v 1.8 2002/10/22 07:38:44 lapp Exp $ +# +# bioperl module for Bio::Structure::Atom +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright Kris Boulez +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::Atom - Bioperl structure Object, describes an Atom + +=head1 SYNOPSIS + + #add synopsis here + +=head1 DESCRIPTION + +This object stores a Bio::Structure::Atom + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Structure::Atom; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Structure::Residue; +@ISA = qw(Bio::Root::Root); + + +=head2 new() + + Title : new() + Usage : $struc = Bio::Structure::Atom->new( + -id => 'human_id', + ); + + Function: Returns a new Bio::Structure::Atom object from basic + constructors. Probably most called from Bio::Structure::IO. + Returns : a new Bio::Structure::Atom object + +=cut + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($id, $x, $y, $z) = + $self->_rearrange([qw( + ID + X + Y + Z + )], + @args); + + $id && $self->id($id); + $x && $self->x($x); + $y && $self->y($y); + $z && $self->z($z); + + return $self; +} + + + +=head2 x() + + Title : x + Usage : $x = $atom->x($x); + Function: Set/gets the X coordinate for an Atom + Returns : The value for the X coordinate of the Atom (This is just a number, + it is expected to be in Angstrom, but no garantees) + Args : The X coordinate as a number + +=cut + +sub x { + my ($self,$value) = @_; + if( defined $value) { + # do we want to check if $value contains really a number ? + $self->{'x'} = $value; + } + return $self->{'x'}; +} + + +=head2 y() + + Title : y + Usage : $y = $atom->y($y); + Function: Set/gets the Y coordinate for an Atom + Returns : The value for the Y coordinate of the Atom (This is just a number, + it is eypected to be in Angstrom, but no garantees) + Args : The Y coordinate as a number + +=cut + +sub y { + my ($self,$value) = @_; + if( defined $value) { + # do we want to check if $value contains really a number ? + $self->{'y'} = $value; + } + return $self->{'y'}; +} + + +=head2 z() + + Title : z + Usage : $z = $atom->z($z); + Function: Set/gets the Z coordinate for an Atom + Returns : The value for the Z coordinate of the Atom (This is just a number, + it is ezpected to be in Angstrom, but no garantees) + Args : The Z coordinate as a number + +=cut + +sub z { + my ($self,$value) = @_; + if( defined $value) { + # do we want to check if $value contains really a number ? + $self->{'z'} = $value; + } + return $self->{'z'}; +} + + +=head2 xyz() + + Title : xyz + Usage : ($x,$y,$z) = $atom->xyz; + Function: Gets the XYZ coordinates for an Atom + Returns : A list with the value for the XYZ coordinate of the Atom + Args : + +=cut + +sub xyz { + my ($self) = @_; + + return ($self->x, $self->y, $self->z); +} + + +=head2 residue() + + Title : residue + Usage : + Function: No code here, all parent/child stuff via Entry + Returns : + Args : + +=cut + +sub residue { + my($self, $value) = @_; + + $self->throw("all parent/child stuff via Entry\n"); +} + + +=head2 icode() + + Title : icode + Usage : $icode = $atom->icode($icode) + Function: Sets/gets the icode + Returns : Returns the icode for this atom + Args : reference to an Atom + +=cut + +sub icode { + my($self, $value) = @_; + + if (defined $value) { + $self->{'icode'} = $value; + } + return $self->{'icode'}; +} + + +=head2 serial() + + Title : serial + Usage : $serial = $atom->serial($serial) + Function: Sets/gets the serial number + Returns : Returns the serial number for this atom + Args : reference to an Atom + +=cut + +sub serial { + my($self, $value) = @_; + + if (defined $value) { + $self->{'serial'} = $value; + } + return $self->{'serial'}; +} + + +=head2 occupancy() + + Title : occupancy + Usage : $occupancy = $atom->occupancy($occupancy) + Function: Sets/gets the occupancy + Returns : Returns the occupancy for this atom + Args : reference to an Atom + +=cut + +sub occupancy { + my($self, $value) = @_; + + if (defined $value) { + $self->{'occupancy'} = $value; + } + return $self->{'occupancy'}; +} + + +=head2 tempfactor() + + Title : tempfactor + Usage : $tempfactor = $atom->tempfactor($tempfactor) + Function: Sets/gets the tempfactor + Returns : Returns the tempfactor for this atom + Args : reference to an Atom + +=cut + +sub tempfactor { + my($self, $value) = @_; + + if (defined $value) { + $self->{'tempfactor'} = $value; + } + return $self->{'tempfactor'}; +} + + +=head2 segID() + + Title : segID + Usage : $segID = $atom->segID($segID) + Function: Sets/gets the segID + Returns : Returns the segID for this atom + Args : reference to an Atom + +=cut + +sub segID { + my($self, $value) = @_; + + if (defined $value) { + $self->{'segID'} = $value; + } + return $self->{'segID'}; +} + + +=head2 pdb_atomname() + + Title : pdb_atomname + Usage : $pdb_atomname = $atom->pdb_atomname($pdb_atomname) + Function: Sets/gets the pdb_atomname (atomname used in the PDB file) + Returns : Returns the pdb_atomname for this atom + Args : reference to an Atom + +=cut + +sub pdb_atomname { + my($self, $value) = @_; + + if (defined $value) { + $self->{'pdb_atomname'} = $value; + } + return $self->{'pdb_atomname'}; +} + + +=head2 element() + + Title : element + Usage : $element = $atom->element($element) + Function: Sets/gets the element + Returns : Returns the element for this atom + Args : reference to an Atom + +=cut + +sub element { + my($self, $value) = @_; + + if (defined $value) { + $self->{'element'} = $value; + } + return $self->{'element'}; +} + + +=head2 charge() + + Title : charge + Usage : $charge = $atom->charge($charge) + Function: Sets/gets the charge + Returns : Returns the charge for this atom + Args : reference to an Atom + +=cut + +sub charge { + my($self, $value) = @_; + + if (defined $value) { + $self->{'charge'} = $value; + } + return $self->{'charge'}; +} + + +=head2 sigx() + + Title : sigx + Usage : $sigx = $atom->sigx($sigx) + Function: Sets/gets the sigx + Returns : Returns the sigx for this atom + Args : reference to an Atom + +=cut + +sub sigx { + my($self, $value) = @_; + + if (defined $value) { + $self->{'sigx'} = $value; + } + return $self->{'sigx'}; +} + + +=head2 sigy() + + Title : sigy + Usage : $sigy = $atom->sigy($sigy) + Function: Sets/gets the sigy + Returns : Returns the sigy for this atom + Args : reference to an Atom + +=cut + +sub sigy { + my($self, $value) = @_; + + if (defined $value) { + $self->{'sigy'} = $value; + } + return $self->{'sigy'}; +} + + +=head2 sigz() + + Title : sigz + Usage : $sigz = $atom->sigz($sigz) + Function: Sets/gets the sigz + Returns : Returns the sigz for this atom + Args : reference to an Atom + +=cut + +sub sigz { + my($self, $value) = @_; + + if (defined $value) { + $self->{'sigz'} = $value; + } + return $self->{'sigz'}; +} + + +=head2 sigocc() + + Title : sigocc + Usage : $sigocc = $atom->sigocc($sigocc) + Function: Sets/gets the sigocc + Returns : Returns the sigocc for this atom + Args : reference to an Atom + +=cut + +sub sigocc { + my($self, $value) = @_; + + if (defined $value) { + $self->{'sigocc'} = $value; + } + return $self->{'sigocc'}; +} + + +=head2 sigtemp() + + Title : sigtemp + Usage : $sigtemp = $atom->sigtemp($sigtemp) + Function: Sets/gets the sigtemp + Returns : Returns the sigtemp for this atom + Args : reference to an Atom + +=cut + +sub sigtemp { + my($self, $value) = @_; + + if (defined $value) { + $self->{'sigtemp'} = $value; + } + return $self->{'sigtemp'}; +} + + +=head2 aniso() + + Title : aniso + Usage : $u12 = $atom->aniso("u12", $u12) + Function: Sets/gets the anisotropic temperature factors + Returns : Returns the requested factor for this atom + Args : reference to an Atom, name of the factor, value for the factor + +=cut + +sub aniso { + my($self, $name, $value) = @_; + + if ( !defined $name) { + $self->throw("You need to supply a name of the anisotropic temp factor you want to get"); + } + if (defined $value) { + $self->{$name} = $value; + } + return $self->{$name}; +} + +# placeholders +sub u11 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub u22 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub u33 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub u12 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub u13 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub u23 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub sigu11 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub sigu22 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub sigu33 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub sigu12 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub sigu13 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} +sub sigu23 { + my ($self, $name, $value) = @_; + $self->aniso($name,$value); +} + + + + + + + + + + + + + +=head2 id() + + Title : id + Usage : $atom->id("CZ2") + Function: Gets/sets the ID for this atom + Returns : the ID + Args : the ID + +=cut + +sub id { + my ($self, $value) = @_;; + if (defined $value) { + $self->{'id'} = $value; + } + return $self->{'id'}; +} + +sub DESTROY { + my $self = shift; + + # dummy, nothing needs to be done here +} + +# +# from here on only private methods +# + +=head2 _remove_residue() + + Title : _remove_residue + Usage : + Function: Removes the Residue this Atom is atttached to. + Returns : + Args : + +=cut + +sub _remove_residue { + my ($self) = shift; + + $self->throw("no code here at the moment\n"); +} + + +=head2 _grandparent() + + Title : _grandparent + Usage : + Function: get/set a symbolic reference to our grandparent + Returns : + Args : + +=cut + +sub _grandparent { + my($self,$symref) = @_; + + if (ref($symref)) { + $self->throw("Thou shall only pass strings in here, no references $symref\n"); + } + if (defined $symref) { + $self->{'grandparent'} = $symref; + } + return $self->{'grandparent'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/Chain.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/Chain.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,245 @@ +# $Id: Chain.pm,v 1.6 2002/10/22 07:38:44 lapp Exp $ +# +# bioperl module for Bio::Structure::Chain +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright Kris Boulez +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::Chain - Bioperl structure Object, describes a chain + +=head1 SYNOPSIS + + #add synopsis here + +=head1 DESCRIPTION + +This object stores a Bio::Structure::Chain + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Structure::Chain; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Structure::Entry; +use Bio::Structure::Model; +@ISA = qw(Bio::Root::Root); + + +=head2 new() + + Title : new() + Usage : $struc = Bio::Structure::Chain->new( + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new Bio::Structure::Chain object from basic + constructors. Probably most called from Bio::Structure::IO. + Returns : a new Bio::Structure::Chain object + +=cut + + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($id, $residue ) = + $self->_rearrange([qw( + ID + RESIDUE + )], + @args); + + $id && $self->id($id); + + $self->{'residue'} = []; + + # the 'smallest' item that can be added to a chain is a residue. + + $residue && $self->throw("use a method based on an Entry object for now"); + + return $self; +} + + + +=head2 residue() + + Title : residue + Usage : + Function: nothing usefull untill I get symbolic references to do what I want + Returns : + Args : + +=cut + +sub residue { + my ($self,$value) = @_; + + $self->throw("use a method on an Entry object to do what you want"); +} + + +=head2 add_residue() + + Title : add_residue + Usage : + Function: nothing usefull untill I get symbolic references to do what I want + Returns : + Args : + +=cut + +sub add_residue { + my($self,$value) = @_; + + $self->throw("you want entry->add_residue(chain, residue)\n"); +} + +=head2 model() + + Title : model + Usage : + Function: nothing usefull untill I get symbolic references to do what I want + Returns : + Args : + +=cut + +sub model { + my($self, $value) = @_; + + $self->throw("go via a Entry object please\n"); +} + + +=head2 id() + + Title : id + Usage : $chain->id("chain B") + Function: Gets/sets the ID for this chain + Returns : the ID + Args : the ID + +=cut + +sub id { + my ($self, $value) = @_;; + if (defined $value) { + $self->{'id'} = $value; + } + return $self->{'id'}; +} + + +sub DESTROY { + my $self = shift; + + # no specific destruction for now +} + + +# +# from here on only private methods +# + +=head2 _remove_residues() + + Title : _remove_residues + Usage : + Function: + Returns : + Args : + +=cut + +sub _remove_residues { + my ($self) = shift; + + $self->throw("nothing usefull in here, go see Entry\n"); +} + + +=head2 _remove_model() + + Title : _remove_model + Usage : + Function: Removes the Model this Chain is atttached to. + Returns : + Args : + +=cut + +sub _remove_model { + my ($self) = shift; + + $self->throw("go see an Entry object, nothing here\n"); +} + + +=head2 _grandparent() + + Title : _grandparent + Usage : + Function: get/set a symbolic reference to our grandparent + Returns : + Args : + +=cut + +sub _grandparent { + my($self,$symref) = @_; + + if (ref($symref)) { + $self->throw("Thou shall only pass strings in here, no references $symref\n"); + } + if (defined $symref) { + $self->{'grandparent'} = $symref; + } + return $self->{'grandparent'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/Entry.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/Entry.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,992 @@ +# $Id: Entry.pm,v 1.17 2002/10/22 07:38:44 lapp Exp $ +# +# bioperl module for Bio::Structure::Entry +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright Kris Boulez +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::Entry - Bioperl structure Object, describes the whole entry + +=head1 SYNOPSIS + + #add synopsis here + +=head1 DESCRIPTION + +This object stores a whole Bio::Structure entry. It can consist of one or +more models (Bio::Structure::Model), which in turn consist of one or more +chains (Bio::Structure::Chain). A chain is composed of residues +(Bio::Structure::Residue) and a residue consists of atoms (Bio::Structure::Atom) +If no specific model or chain is chosen, the first one is choosen. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Structure::Entry; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Structure::StructureI; +use Bio::Structure::Model; +use Bio::Structure::Chain; +use Bio::Annotation::Collection; +use Tie::RefHash; + +@ISA = qw(Bio::Root::Root Bio::Structure::StructureI); + + +=head2 new() + + Title : new() + Usage : $struc = Bio::Structure::Entry->new( + -id => 'structure_id', + ); + + Function: Returns a new Bio::Structure::Entry object from basic + constructors. Probably most called from Bio::Structure::IO. + Returns : a new Bio::Structure::Model object + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($id, $model, $chain, $residue ) = + $self->_rearrange([qw( + ID + MODEL + CHAIN + RESIDUE + )], + @args); + + # where to store parent->child relations (1 -> 1..n) + # value to this hash will be an array ref + # by using Tie::RefHash we can store references in this hash + $self->{'p_c'} = (); + tie %{ $self->{'p_c'} } , "Tie::RefHash"; + + # where to store child->parent relations (1 -> 1) + $self->{'c_p'} = (); + tie %{ $self->{'c_p'} } , "Tie::RefHash"; + + $id && $self->id($id); + + $self->{'model'} = []; + $model && $self->model($model); + + + if($chain) { + if ( ! defined($self->model) ) { # no model yet, create default one + $self->_create_default_model; + } + for my $m ($self->model) { # add this chain on all models + $m->chain($chain); + } + } + + $residue && $self->residue($residue); + + # taken from Bio::Seq (or should we just inherit Bio::Seq and override some methods) + my $ann = Bio::Annotation::Collection->new; + $self->annotation($ann); + + return $self; +} + + +=head2 model() + + Title : model + Function: Connects a (or a list of) Model objects to a Bio::Structure::Entry. + To add a Model (and keep the existing ones) use add_model() + It returns a list of Model objects. + Returns : list of Bio::Structure::Model objects + Args : One Model or a reference to an array of Model objects + +=cut + +sub model { + my ($self, $model) = @_; + + if( defined $model) { + if( (ref($model) eq "ARRAY") || + ($model->isa('Bio::Structure::Model')) ) { + # remove existing ones, tell they've become orphan + my @obj = $self->model; + if (@obj) { + for my $m (@obj) { + $self->_remove_from_graph($m); + $self->{'model'} = []; + } + } + # add the new ones + $self->add_model($self,$model); + } + else { + $self->throw("Supplied a $model to model, we want a Bio::Structure::Model or a list of these\n"); + } + } + # give back list of models via general get method + $self->get_models($self); +} + + + +=head2 add_model() + + Title : add_model + Usage : $structure->add_model($model); + Function: Adds a (or a list of) Model objects to a Bio::Structure::Entry. + Returns : + Args : One Model or a reference to an array of Model objects + +=cut + +sub add_model { + my($self,$entry,$model) = @_; + + # if only one argument and it's a model, change evrything one place + # this is for people calling $entry->add_model($model); + if ( !defined $model && ref($entry) =~ /^Bio::Structure::Model/) { + $model = $entry; + $entry = $self; + } + # $self and $entry are the same here, but it's used for uniformicity + if ( !defined($entry) || ref($entry) !~ /^Bio::Structure::Entry/) { + $self->throw("first argument to add_model needs to be a Bio::Structure::Entry object\n"); + } + if (defined $model) { + if (ref($model) eq "ARRAY") { + # if the user passed in a reference to an array + for my $m ( @{$model} ) { + if( ! $m->isa('Bio::Structure::Model') ) { + $self->throw("$m is not a Model\n"); + } + if ( $self->_parent($m) ) { + $self->throw("$m already assigned to a parent\n"); + } + push @{$self->{'model'}}, $m; + # create a stringified version of our ref + # not used untill we get symbolic ref working + #my $str_ref = "$self"; + #$m->_grandparent($str_ref); + } + } + elsif ( $model->isa('Bio::Structure::Model') ) { + if ( $self->_parent($model) ) { # already assigned to a parent + $self->throw("$model already assigned\n"); + } + push @{$self->{'model'}}, $model; + # create a stringified version of our ref + #my $str_ref = "$self"; + #$model->_grandparent($str_ref); + } + else { + $self->throw("Supplied a $model to add_model, we want a Model or list of Models\n"); + } + } + + my $array_ref = $self->{'model'}; + return $array_ref ? @{$array_ref} : (); +} + + +=head2 get_models() + + Title : get_models + Usage : $structure->get_models($structure); + Function: general get method for models attached to an Entry + Returns : a list of models attached to this entry + Args : an Entry + +=cut + +sub get_models { + my ($self, $entry) = @_; + + # self and entry can be the same + if ( !defined $entry) { + $entry = $self; + } + # pass through to add_model + $self->add_model($entry); +} + + + +=head2 id() + + Title : id + Usage : $entry->id("identity"); + Function: Gets/sets the ID + Returns : + Args : + +=cut + +sub id { + my ($self, $value) = @_; + if (defined $value) { + $self->{'id'} = $value; + } + return $self->{'id'}; +} + + +=head2 chain() + + Title : chain + Usage : @chains = $structure->chain($chain); + Function: Connects a (or a list of) Chain objects to a Bio::Structure::Entry. + Returns : list of Bio::Structure::Residue objects + Args : One Residue or a reference to an array of Residue objects + +=cut + +sub chain { + my ($self, $chain) = @_; + + if ( ! $self->model ) { + $self->_create_default_model; + } + my @models = $self->model; + my $first_model = $models[0]; + + if ( defined $chain) { + + if( (ref($chain) eq "ARRAY") || + ($chain->isa('Bio::Structure::Chain')) ) { + # remove existing ones, tell they've become orphan + my @obj = $self->get_chains($first_model); + if (@obj) { + for my $c (@obj) { + $self->_remove_from_graph($c); + } + } + # add the new ones + $self->add_chain($first_model,$chain); + } + else { + $self->throw("Supplied a $chain to chain, we want a Bio::Structure::Chain or a list of these\n"); + } + } + $self->get_chains($first_model); +} + + +=head2 add_chain() + + Title : add_chain + Usage : @chains = $structure->add_chain($add_chain); + Function: Adds a (or a list of) Chain objects to a Bio::Structure::Entry. + Returns : + Args : + +=cut + +sub add_chain { + my($self, $model, $chain) = @_; + + if (ref($model) !~ /^Bio::Structure::Model/) { + $self->throw("add_chain: first argument needs to be a Model object ($model)\n"); + } + if (defined $chain) { + if (ref($chain) eq "ARRAY") { + # if the user passed in a reference to an array + for my $c ( @{$chain} ) { + if( ! $c->isa('Bio::Structure::Chain') ) { + $self->throw("$c is not a Chain\n"); + } + if ( $self->_parent($c) ) { + $self->throw("$c already assigned to a parent\n"); + } + $self->_parent($c, $model); + $self->_child($model, $c); + # stringify $self ref + #my $str_ref = "$self"; + #$c->_grandparent($str_ref); + } + } + elsif ( $chain->isa('Bio::Structure::Chain') ) { + if ( $self->_parent($chain) ) { # already assigned to parent + $self->throw("$chain already assigned to a parent\n"); + } + $self->_parent($chain,$model); + $self->_child($model, $chain); + # stringify $self ref + #my $str_ref = "$self"; + #$chain->_grandparent($str_ref); + } + else { + $self->throw("Supplied a $chain to add_chain, we want a Chain or list of Chains\n"); + } + } + my $array_ref = $self->_child($model); + return $array_ref ? @{$array_ref} : (); +} + + +=head2 get_chains() + + Title : get_chains + Usage : $entry->get_chains($model); + Function: general get method for chains attached to a Model + Returns : a list of chains attached to this model + Args : a Model + +=cut + +sub get_chains { + my ($self, $model) = @_; + + if (! defined $model) { + $model = ($self->get_models)[0]; + } + # pass through to add_chain + $self->add_chain($model); +} + + +=head2 residue() + + Title : residue + Usage : @residues = $structure->residue($residue); + Function: Connects a (or a list of) Residue objects to a Bio::Structure::Entry. + Returns : list of Bio::Structure::Residue objects + Args : One Residue or a reference to an array of Residue objects + +=cut + +sub residue { + my ($self, $residue) = @_; + + if ( ! $self->model ) { + my $m = $self->_create_default_model; + $self->add_model($self,$m); + } + my @models = $self->model; + my $first_model = $models[0]; + + if ( ! $self->get_chains($first_model) ) { + my $c = $self->_create_default_chain; + $self->add_chain($first_model, $c); + } + my @chains = $self->get_chains($first_model); + my $first_chain = $chains[0]; + + if( defined $residue) { + if( (ref($residue) eq "ARRAY") || + ($residue->isa('Bio::Structure::Residue')) ) { + # remove existing ones, tell they've become orphan + my @obj = $self->get_residues($first_chain); + if (@obj) { + for my $r (@obj) { + $self->_remove_from_graph($r); + } + } + # add the new ones + $self->add_residue($first_chain,$residue); + } + else { + $self->throw("Supplied a $residue to residue, we want a Bio::Structure::Residue or a list of these\n"); + } + } + $self->get_residues($first_chain); +} + + +=head2 add_residue() + + Title : add_residue + Usage : @residues = $structure->add_residue($residue); + Function: Adds a (or a list of) Residue objects to a Bio::Structure::Entry. + Returns : list of Bio::Structure::Residue objects + Args : One Residue or a reference to an array of Residue objects + +=cut + +sub add_residue { + my($self,$chain,$residue) = @_; + + if (ref($chain) !~ /^Bio::Structure::Chain/) { + $self->throw("add_residue: first argument needs to be a Chain object\n"); + } + if (defined $residue) { + if (ref($residue) eq "ARRAY") { + # if the user passed in a reference to an array + for my $r ( @{$residue} ) { + if( ! $r->isa('Bio::Structure::Residue') ) { + $self->throw("$r is not a Residue\n"); + } + if ( $self->_parent($r) ) { + $self->throw("$r already belongs to a parent\n"); + } + $self->_parent($r, $chain); + $self->_child($chain, $r); + # stringify + my $str_ref = "$self"; + $r->_grandparent($str_ref); + } + } + elsif ( $residue->isa('Bio::Structure::Residue') ) { + if ( $self->_parent($residue) ) { + $self->throw("$residue already belongs to a parent\n"); + } + $self->_parent($residue, $chain); + $self->_child($chain, $residue); + # stringify + my $str_ref = "$self"; + $residue->_grandparent($str_ref); + } + else { + $self->throw("Supplied a $residue to add_residue, we want a Residue or list of Residues\n"); + } + } + my $array_ref = $self->_child($chain); + return $array_ref ? @{$array_ref} : (); +} + + +=head2 get_residues() + + Title : get_residues + Usage : $structure->get_residues($chain); + Function: general get method for residues attached to a Chain + Returns : a list of residues attached to this chain + Args : a chain + +=cut + +sub get_residues { + my ($self, $chain) = @_; + + if ( !defined $chain) { + $self->throw("get_residues needs a Chain as argument"); + } + # pass through to add_residue + $self->add_residue($chain); +} + + +=head2 add_atom() + + Title : add_atom + Usage : @atoms = $structure->add_atom($residue,$atom); + Function: Adds a (or a list of) Atom objects to a Bio::Structure::Residue. + Returns : list of Bio::Structure::Atom objects + Args : a residue and an atom + +=cut + +sub add_atom { + my($self,$residue,$atom) = @_; + + if (ref($residue) !~ /^Bio::Structure::Residue/) { + $self->throw("add_atom: first argument needs to be a Residue object\n"); + } + if (defined $atom) { + if (ref($atom) eq "ARRAY") { + # if the user passed in a reference to an array + for my $a ( @{$atom} ) { + if( ! $a->isa('Bio::Structure::Atom') ) { + $self->throw("$a is not an Atom\n"); + } + if ( $self->_parent($a) ) { + $self->throw("$a already belongs to a parent\n"); + } + $self->_parent($a, $residue); + $self->_child($residue, $a); + # stringify + #my $str_ref = "$self"; + #$r->_grandparent($str_ref); + } + } + #elsif ( $atom->isa('Bio::Structure::Atom') ) { + elsif ( ref($atom) =~ /^Bio::Structure::Atom/ ) { + if ( $self->_parent($atom) ) { + $self->throw("$atom already belongs to a parent\n"); + } + $self->_parent($atom, $residue); + $self->_child($residue, $atom); + # stringify + #my $str_ref = "$self"; + #$atom->_grandparent($str_ref); + } + } + my $array_ref = $self->_child($residue); + return $array_ref ? @{$array_ref} : (); +} + + +=head2 get_atoms() + + Title : get_atoms + Usage : $structure->get_atoms($residue); + Function: general get method for atoms attached to a Residue + Returns : a list of atoms attached to this residue + Args : a residue + +=cut + +sub get_atoms { + my ($self, $residue) = @_; + + if ( !defined $residue) { + $self->throw("get_atoms needs a Residue as argument"); + } + # pass through to add_atom + $self->add_atom($residue); +} + + +=head2 parent() + + Title : parent + Usage : $structure->parent($residue); + Function: returns the parent of the argument + Returns : the parent of the argument + Args : a Bio::Structure object + +=cut + +=head2 conect() + + Title : conect + Usage : $structure->conect($source); + Function: get/set method for conect + Returns : a list of serial numbers for atoms connected to source + (together with $entry->get_atom_by_serial($model, $serial) this should be OK for now) + Args : the serial number for the source atom + +=cut + +sub conect { + my ($self, $source, $serial, $type) = @_; + + if ( !defined $source ) { + $self->throw("You need to supply at least a source to conect"); + } + if ( defined $serial && defined $type ) { + if ( !exists(${$self->{'conect'}}{$source}) || ref(${$self->{'conect'}}{$source} !~ /^ARRAY/ ) ) { + ${$self->{'conect'}}{$source} = []; + } + # we also need to store type, a conect object might be better + my $c = $serial . "_" . $type; + push @{ ${$self->{'conect'}}{$source} }, $c; + } + return @{ ${$self->{'conect'}}{$source} }; +} + +=head2 get_all_conect_source() + + Title : get_all_conect_source + Usage : @sources = $structure->get_all_conect_source; + Function: get all the sources for the conect records + Returns : a list of serial numbers for atoms connected to source + (together with $entry->get_atom_by_serial($model, $serial) this should be OK for now) + Args : + Description : This is a bit of a kludge, but it's the best for now. Conect info might need + to go in a sepearte object + +=cut + +sub get_all_conect_source { + my ($self) = shift; + my (@sources); + + for my $source (sort {$a<=>$b} keys %{$self->{'conect'}}) { + push @sources, $source; + } + return @sources; +} + + +=head2 master() + + Title : master + Usage : $structure->master($source); + Function: get/set method for master + Returns : the master line + Args : the master line for this entry + +=cut + +sub master { + my ($self, $value) = @_; + if (defined $value) { + $self->{'master'} = $value; + } + return $self->{'master'}; +} + + +=head2 seqres() + + Title : seqres + Usage : $seqobj = $structure->seqres("A"); + Function: gets a sequence object containing the sequence from the SEQRES record. + if a chain-ID is given , the sequence for this chain is given, if none + is provided the first chain is choosen + Returns : a Bio::PrimarySeq + Args : the chain-ID of the chain you want the sequence from + +=cut + +sub seqres { + my ($self, $chainid) = @_; + my $s_u = "x3 A1 x7 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3 x1 A3"; + my (%seq_ch); + if ( !defined $chainid) { + my $m = ($self->get_models($self))[0]; + my $c = ($self->get_chains($m))[0]; + $chainid = $c->id; + } + my $seqres = ($self->annotation->get_Annotations("seqres"))[0]; + my $seqres_string = $seqres->as_text; +$self->debug("seqres : $seqres_string\n"); + $seqres_string =~ s/^Value: //; + # split into lines of 62 long + my @l = unpack("A62" x (length($seqres_string)/62), $seqres_string); + for my $line (@l) { + # get out chain_id and sequence + # we use a1, as A1 strips all spaces :( + my ($chid, $seq) = unpack("x3 a1 x7 A51", $line); + if ($chid eq " ") { + $chid = "default"; + } + $seq =~ s/(\w+)/\u\L$1/g; # ALA -> Ala (for SeqUtils) + $seq =~ s/\s//g; # strip all spaces + $seq_ch{$chid} .= $seq; +$self->debug("seqres : $chid $seq_ch{$chid}\n"); + } + # do we have a seqres for this chainid + if(! exists $seq_ch{$chainid} ) { + $self->warn("There is no SEQRES known for chainid \"$chainid\""); + return undef; + } + + # this will break for non-protein structures (about 10% for now) XXX KB + my $pseq = Bio::PrimarySeq->new(-alphabet => 'protein'); + $pseq = Bio::SeqUtils->seq3in($pseq,$seq_ch{$chainid}); + my $id = $self->id . "_" . $chainid; + $pseq->id($id); + return $pseq; +} + +=head2 get_atom_by_serial() + + Title : get_atom_by_serial + Usage : $structure->get_atom_by_serial($module, $serial); + Function: get the Atom for a for get_atom_by_serial + Returns : the Atom object with this serial number in the model + Args : Model on which to work, serial number for atom + (if only a number is supplied, the first model is chosen) + +=cut + +sub get_atom_by_serial { + my ($self, $model, $serial) = @_; + + if ($model =~ /^\d+$/ && !defined $serial) { # only serial given + $serial = $model; + my @m = $self->get_models($self); + $model = $m[0]; + } + if ( !defined $model || ref($model) !~ /^Bio::Structure::Model/ ) { + $self->throw("Could not find (first) model\n"); + } + if ( !defined $serial || ($serial !~ /^\d+$/) ) { + $self->throw("The serial number you provided looks fishy ($serial)\n"); + } + for my $chain ($self->get_chains($model) ) { + for my $residue ($self->get_residues($chain) ) { + for my $atom ($self->get_atoms($residue) ) { + # this could get expensive, do we cache ??? + next unless ($atom->serial == $serial); + return $atom; + } + } + } +} + +sub parent { + my ($self, $obj) = @_; + + if ( !defined $obj) { + $self->throw("parent: you need to supply an argument to get the parent from\n"); + } + + # for now we pass on to _parent, untill we get the symbolic ref thing working. + $self->_parent($obj); +} + +sub DESTROY { + my $self = shift; + + #print STDERR "DESTROY on $self being called\n"; + +## for my $pc (keys %{ $self->{'p_c'} } ) { +## next unless ( defined ${ $self->{'p_c'} }{$pc} ); +## delete ${$self->{'p_c'}}{$pc}; +## } +## for my $cp (keys %{ $self->{'c_p'} } ) { +## next unless ( defined ${ $self->{'c_p'} }{$cp} ); +## delete ${$self->{'c_p'}}{$cp}; +## } + %{ $self->{'p_c'} } = (); + %{ $self->{'c_p'} } = (); +} + +# copied from Bio::Seq.pm +# +=head2 annotation + + Title : annotation + Usage : $obj->annotation($seq_obj) + Function: + Example : + Returns : value of annotation + Args : newvalue (optional) + + +=cut + +sub annotation { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'annotation'} = $value; + } + return $obj->{'annotation'}; + +} + + +# +# from here on only private methods +# + +=head2 _remove_models() + + Title : _remove_models + Usage : + Function: Removes the models attached to an Entry. Tells the models they + don't belong to this Entry any more + Returns : + Args : + +=cut + +#' + +sub _remove_models { + my ($self) = shift; + + ; +} + + +=head2 _create_default_model() + + Title : _create_default_model + Usage : + Function: Creates a default Model for this Entry. Typical situation + in an X-ray structure where there is only one model + Returns : + Args : + +=cut + +sub _create_default_model { + my ($self) = shift; + + my $model = Bio::Structure::Model->new(-id => "default"); + return $model; +} + + +=head2 _create_default_chain() + + Title : _create_default_chain + Usage : + Function: Creates a default Chain for this Model. Typical situation + in an X-ray structure where there is only one chain + Returns : + Args : + +=cut + +sub _create_default_chain { + my ($self) = shift; + + my $chain = Bio::Structure::Chain->new(-id => "default"); + return $chain; +} + + + +=head2 _parent() + + Title : _parent + Usage : This is an internal function only. It is used to have one + place that keeps track of which object has which other object + as parent. Thus allowing the underlying modules (Atom, Residue,...) + to have no knowledge about all this (and thus removing the possibility + of reference cycles). + This method hides the details of manipulating references to an anonymous + hash. + Function: To get/set an objects parent + Returns : a reference to the parent if it exist, undef otherwise. In the + current implementation each node should have a parent (except Entry). + Args : + +=cut + +# manipulating the c_p hash + +sub _parent { + no strict "refs"; + my ($self, $key, $value) = @_; + + if ( (!defined $key) || (ref($key) !~ /^Bio::/) ) { + $self->throw("First argument to _parent needs to be a reference to a Bio:: object ($key)\n"); + } + if ( (defined $value) && (ref($value) !~ /^Bio::/) ) { + $self->throw("Second argument to _parent needs to be a reference to a Bio:: object\n"); + } + # no checking here for consistency of key and value, needs to happen in caller + + if (defined $value) { + # is this value already in, shout + if (defined ( $self->{'c_p'}->{$key}) && + exists ( $self->{'c_p'}->{$key}) + ) { + $self->throw("_parent: $key already has a parent ${$self->{'c_p'}}{$key}\n"); + } + ${$self->{'c_p'}}{$key} = $value; + } + return ${$self->{'c_p'}}{$key}; +} + + +=head2 _child() + + Title : _child + Usage : This is an internal function only. It is used to have one + place that keeps track of which object has which other object + as child. Thus allowing the underlying modules (Atom, Residue,...) + to have no knowledge about all this (and thus removing the possibility + to have no knowledge about all this (and thus removing the possibility + of reference cycles). + This method hides the details of manipulating references to an anonymous + hash. + Function: To get/set an object's child(ren) + Returns : a reference to an array of child(ren) if it exist, undef otherwise. + Args : + +=cut + +# manipulating the p_c hash +sub _child { + my ($self, $key, $value) = @_; + + if ( (!defined $key) || (ref($key) !~ /^Bio::/) ) { + $self->throw("First argument to _child needs to be a reference to a Bio:: object\n"); + } + if ( (defined $value) && (ref($value) !~ /^Bio::/) ) { + $self->throw("Second argument to _child needs to be a reference to a Bio:: object\n"); + } + # no checking here for consistency of key and value, needs to happen in caller + + if (defined $value) { + if ( !exists(${$self->{'p_c'}}{$key}) || ref(${$self->{'p_c'}}{$key}) !~ /^ARRAY/ ) { + ${$self->{'p_c'}}{$key} = []; + } + push @{ ${$self->{'p_c'}}{$key} }, $value; + } + return ${$self->{'p_c'}}{$key}; +} + + + +=head2 _remove_from_graph() + + Title : _remove_from_graph + Usage : This is an internal function only. It is used to remove from + the parent/child graph. We only remove the links from object to + his parent. Not the ones from object to its children. + Function: To remove an object from the parent/child graph + Returns : + Args : the object to be orphaned + +=cut + +sub _remove_from_graph { + my ($self, $object) = @_; + + if ( !defined($object) && ref($object) !~ /^Bio::/) { + $self->throw("_remove_from_graph needs a Bio object as argument"); + } + if ( $self->_parent($object) ) { + my $dad = $self->_parent($object); + # if we have a parent, remove me as being a child + for my $k (0 .. $#{$self->_child($dad)}) { + if ($object eq ${$self->{'p_c'}{$dad}}[$k]) { + splice(@{$self->{'p_c'}{$dad}}, $k,1); + } + } + delete( $self->{'c_p'}{$object}); + } +} + + +sub _print_stats_pc { + # print stats about the parent/child hashes + my ($self) =@_; + my $pc = scalar keys %{$self->{'p_c'}}; + my $cp = scalar keys %{$self->{'c_p'}}; + my $now_time = Time::HiRes::time(); + $self->debug("pc stats: P_C $pc C_P $cp $now_time\n"); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/IO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/IO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,575 @@ +# $Id: IO.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Structure::IO +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# and Lincoln Stein <lstein@cshl.org> +# and Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright 2001, 2002 Kris Boulez +# +# You may distribute this module under the same terms as perl itself +# +# _history +# October 18, 1999 Largely rewritten by Lincoln Stein +# November 16, 2001 Copied Bio::SeqIO to Bio::Structure::IO and modified +# where needed. Factoring out common methods +# (to Bio::Root::IO) might be a good idea. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::IO - Handler for Structure Formats + +=head1 SYNOPSIS + + use Bio::Structure::IO; + + $in = Bio::Structure::IO->new(-file => "inputfilename" , '-format' => 'pdb'); + $out = Bio::Structure::IO->new(-file => ">outputfilename" , '-format' => 'pdb'); + # note: we quote -format to keep older perl's from complaining. + + while ( my $struc = $in->next_structure() ) { + $out->write_structure($struc); + } + +now, to actually get at the structure object, use the standard Bio::Structure +methods (look at L<Bio::Structure> if you don't know what they are) + + use Bio::Structure::IO; + + $in = Bio::Structure::IO->new(-file => "inputfilename" , '-format' => 'pdb'); + + while ( my $struc = $in->next_structure() ) { + print "Structure ",$struc->id," number of models: ",scalar $struc->model,"\n"; + } + + + +=head1 DESCRIPTION + +[ The following description is a copy-paste from the Bio::SeqIO description. + This is not surprising as the code is also mostly a copy. ] + +Bio::Structure::IO is a handler module for the formats in the Structure::IO set +(eg, Bio::Structure::IO::pdb). It is the officially sanctioned way of getting at +the format objects, which most people should use. + +The Bio::Structure::IO system can be thought of like biological file handles. +They are attached to filehandles with smart formatting rules (eg, PDB format) +and can either read or write structure objects (Bio::Structure objects, or +more correctly, Bio::Structure::StructureI implementing objects, of which +Bio::Structure is one such object). If you want to know what to do with a +Bio::Structure object, read L<Bio::Structure> + +The idea is that you request a stream object for a particular format. +All the stream objects have a notion of an internal file that is read +from or written to. A particular Structure::IO object instance is configured +for either input or output. A specific example of a stream object is +the Bio::Structure::IO::pdb object. + +Each stream object has functions + + $stream->next_structure(); + +and + + $stream->write_structure($struc); + +also + + $stream->type() # returns 'INPUT' or 'OUTPUT' + +As an added bonus, you can recover a filehandle that is tied to the +Structure::IOIO object, allowing you to use the standard E<lt>E<gt> and print operations +to read and write structure::IOuence objects: + + use Bio::Structure::IO; + + $stream = Bio::Structure::IO->newFh(-format => 'pdb'); # read from standard input + + while ( $structure = <$stream> ) { + # do something with $structure + } + +and + + print $stream $structure; # when stream is in output mode + + +=head1 CONSTRUCTORS + +=head2 Bio::Structure::IO-E<gt>new() + + $stream = Bio::Structure::IO->new(-file => 'filename', -format=>$format); + $stream = Bio::Structure::IO->new(-fh => \*FILEHANDLE, -format=>$format); + $stream = Bio::Structure::IO->new(-format => $format); + +The new() class method constructs a new Bio::Structure::IO object. The +returned object can be used to retrieve or print Bio::Structure objects. +new() accepts the following parameters: + +=over 4 + +=item -file + +A file path to be opened for reading or writing. The usual Perl +conventions apply: + + 'file' # open file for reading + '>file' # open file for writing + '>>file' # open file for appending + '+<file' # open file read/write + 'command |' # open a pipe from the command + '| command' # open a pipe to the command + +=item -fh + +You may provide new() with a previously-opened filehandle. For +example, to read from STDIN: + + $strucIO = Bio::Structure::IO->new(-fh => \*STDIN); + +Note that you must pass filehandles as references to globs. + +If neither a filehandle nor a filename is specified, then the module +will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt> +semantics. + +A string filehandle is handy if you want to modify the output in the +memory, before printing it out. The following program reads in EMBL +formatted entries from a file and prints them out in fasta format with +some HTML tags: +[ not relevant for Bio::Structure::IO as only one format is supported + at the moment ] + + use Bio::SeqIO; + use IO::String; + my $in = Bio::SeqIO->new('-file' => "emblfile" , + '-format' => 'EMBL'); + while ( my $seq = $in->next_seq() ) { + # the output handle is reset for every file + my $stringio = IO::String->new($string); + my $out = Bio::SeqIO->new('-fh' => $stringio, + '-format' => 'fasta'); + # output goes into $string + $out->write_seq($seq); + # modify $string + $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g; + # print into STDOUT + print $string; + } + +=item -format + +Specify the format of the file. Supported formats include: + + PDB Protein Data Bank format + +If no format is specified and a filename is given, then the module +will attempt to deduce it from the filename. If this is unsuccessful, +PDB format is assumed. + +The format name is case insensitive. 'PDB', 'Pdb' and 'pdb' are +all supported. + +=back + +=head2 Bio::Structure::IO-E<gt>newFh() + + $fh = Bio::Structure::IO->newFh(-fh => \*FILEHANDLE, -format=>$format); + $fh = Bio::Structure::IO->newFh(-format => $format); + # etc. + +This constructor behaves like new(), but returns a tied filehandle +rather than a Bio::Structure::IO object. You can read structures from this +object using the familiar E<lt>E<gt> operator, and write to it using +print(). The usual array and $_ semantics work. For example, you can +read all structure objects into an array like this: + + @structures = <$fh>; + +Other operations, such as read(), sysread(), write(), close(), and printf() +are not supported. + +=head1 OBJECT METHODS + +See below for more detailed summaries. The main methods are: + +=head2 $structure = $structIO-E<gt>next_structure() + +Fetch the next structure from the stream. + +=head2 $structIO-E<gt>write_structure($struc [,$another_struc,...]) + +Write the specified structure(s) to the stream. + +=head2 TIEHANDLE(), READLINE(), PRINT() + +These provide the tie interface. See L<perltie> for more details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, Lincoln Stein, Kris Boulez + +Email birney@ebi.ac.uk, kris.boulez@algonomics + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Structure::IO; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::PrimarySeq; +use Symbol(); + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +=head2 new + + Title : new + Usage : $stream = Bio::Structure::IO->new(-file => $filename, -format => 'Format') + Function: Returns a new structIOstream + Returns : A Bio::Structure::IO handler initialised with the appropriate format + Args : -file => $filename + -format => format + -fh => filehandle to attach to + +=cut + +my $entry = 0; + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::Structure::IO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ) || + 'pdb'; + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( &_load_format_module($format) ); + return "Bio::Structure::IO::$format"->new(@args); + } +} + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::Structure::IO->newFh(-file=>$filename,-format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::Structure::IO->newFh(-file=>$filename,-format=>'Format') + $structure = <$fh>; # read a structure object + print $fh $structure; # write a structure object + Returns : filehandle tied to the Bio::Structure::IO::Fh class + Args : + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $structure = <$fh>; # read a structure object + print $fh $structure; # write a structure object + Returns : filehandle tied to the Bio::Structure::IO::Fh class + Args : + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + + +# _initialize is chained for all SeqIO classes + +sub _initialize { + my($self, @args) = @_; + + # not really necessary unless we put more in RootI + $self->SUPER::_initialize(@args); + + # initialize the IO part + $self->_initialize_io(@args); +} + +=head2 next_structure + + Title : next_structure + Usage : $structure = stream->next_structure + Function: Reads the next structure object from the stream and returns it. + + Certain driver modules may encounter entries in the stream that + are either misformatted or that use syntax not yet understood + by the driver. If such an incident is recoverable, e.g., by + dismissing a feature of a feature table or some other non-mandatory + part of an entry, the driver will issue a warning. In the case + of a non-recoverable situation an exception will be thrown. + Do not assume that you can resume parsing the same stream after + catching the exception. Note that you can always turn recoverable + errors into exceptions by calling $stream->verbose(2) (see + Bio::RootI POD page). + Returns : a Bio::Structure structure object + Args : none + +=cut + +sub next_structure { + my ($self, $struc) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::Structure::IO object."); +} + +# Do we want people to read out the sequence directly from a $structIO stream +# +##=head2 next_primary_seq +## +## Title : next_primary_seq +## Usage : $seq = $stream->next_primary_seq +## Function: Provides a primaryseq type of sequence object +## Returns : A Bio::PrimarySeqI object +## Args : none +## +## +##=cut +## +##sub next_primary_seq { +## my ($self) = @_; +## +## # in this case, we default to next_seq. This is because +## # Bio::Seq's are Bio::PrimarySeqI objects. However we +## # expect certain sub classes to override this method to provide +## # less parsing heavy methods to retrieving the objects +## +## return $self->next_seq(); +##} + +=head2 write_structure + + Title : write_structure + Usage : $stream->write_structure($structure) + Function: writes the $structure object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Structure object + +=cut + +sub write_seq { + my ($self, $struc) = @_; + $self->throw("Sorry, you cannot write to a generic Bio::Structure::IO object."); +} + + +# De we need this here +# +##=head2 alphabet +## +## Title : alphabet +## Usage : $self->alphabet($newval) +## Function: Set/get the molecule type for the Seq objects to be created. +## Example : $seqio->alphabet('protein') +## Returns : value of alphabet: 'dna', 'rna', or 'protein' +## Args : newvalue (optional) +## Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein' +## +##=cut +## +##sub alphabet { +## my ($self, $value) = @_; +## +## if ( defined $value) { +## # instead of hard-coding the allowed values once more, we check by +## # creating a dummy sequence object +## eval { +## my $seq = Bio::PrimarySeq->new('-alphabet' => $value); +## }; +## if($@) { +## $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values."); +## } +## $self->{'alphabet'} = "\L$value"; +## } +## return $self->{'alphabet'}; +##} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL Structure::IO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($format) = @_; + my ($module, $load, $m); + + $module = "_<Bio/Structure/IO/$format.pm"; + $load = "Bio/Structure/IO/$format.pm"; + + return 1 if $main::{$module}; + eval { + require $load; + }; + if ( $@ ) { + print STDERR <<END; +$load: $format cannot be found +Exception $@ +For more information about the Structure::IO system please see the +Bio::Structure::IO docs. This includes ways of checking for formats at +compile time, not run time +END + ; + return; + } + return 1; +} + +=head2 _concatenate_lines + + Title : _concatenate_lines + Usage : $s = _concatenate_lines($line, $continuation_line) + Function: Private. Concatenates two strings assuming that the second stems + from a continuation line of the first. Adds a space between both + unless the first ends with a dash. + + Takes care of either arg being empty. + Example : + Returns : A string. + Args : + +=cut + +sub _concatenate_lines { + my ($self, $s1, $s2) = @_; + $s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2); + return ($s1 ? $s1 : "") . ($s2 ? $s2 : ""); +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: This method is deprecated. Call _fh() instead. + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +sub _filehandle { + my ($self,@args) = @_; + return $self->_fh(@args); +} + +=head2 _guess_format + + Title : _guess_format + Usage : $obj->_guess_format($filename) + Function: + Example : + Returns : guessed format of filename (lower case) + Args : + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'fasta' if /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i; + return 'genbank' if /\.(gb|gbank|genbank)$/i; + return 'scf' if /\.scf$/i; + return 'pir' if /\.pir$/i; + return 'embl' if /\.(embl|ebl|emb|dat)$/i; + return 'raw' if /\.(txt)$/i; + return 'gcg' if /\.gcg$/i; + return 'ace' if /\.ace$/i; + return 'bsml' if /\.(bsm|bsml)$/i; + return 'pdb' if /\.(ent|pdb)$/i; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'structio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'structio'}->next_seq() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'structio'}->next_seq(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'structio'}->write_seq(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/IO/pdb.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/IO/pdb.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1452 @@ +# $Id: pdb.pm,v 1.9.2.2 2003/08/29 16:24:14 birney Exp $ +# +# BioPerl module for Bio::Structure::IO::pdb +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright 2001, 2002 Kris Boulez +# +# Framework is a copy of Bio::SeqIO::embl.pm +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::IO::pdb - PDB input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the Bio::Structure::IO handler system. Go: + + $stream = Bio::Structure::IO->new(-file => $filename, + -format => 'PDB'); + + while ( (my $structure = $stream->next_structure()) ) { + # do something with $structure + } + +=head1 DESCRIPTION + +This object can transform Bio::Structure objects to and from PDB flat +file databases. The working is similar to that of the Bio::SeqIO handlers. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Structure::IO::pdb; +use vars qw(@ISA); +use strict; +use Bio::Structure::IO; +use Bio::Structure::Entry; +#use Bio::Structure::Model; +#use Bio::Structure::Chain; +#use Bio::Structure::Residue; +use Bio::Structure::Atom; +use Bio::SeqFeature::Generic; +use Bio::Annotation::Reference; + +@ISA = qw(Bio::Structure::IO); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + + my ($noheader, $noatom) = + $self->_rearrange([qw( + NOHEADER + NOATOM + )], + @args); + $noheader && $self->_noheader($noheader); + $noatom && $self->_noatom($noatom); +} + + +=head2 next_structure; + + Title : next_structure + Usage : $struc = $stream->next_structure() + Function: returns the next structure in the stream + Returns : Bio::Structure object + Args : + + +=cut + +sub next_structure { + my ($self,@args) = @_; + my ($line); + my ($obslte, $title, $caveat, $compnd, $source, $keywds, + $expdta, $author, %revdat, $revdat, $sprsde, $jrnl, %remark, $dbref, + $seqadv, $seqres, $modres, $het, $hetnam, $hetsyn, $formul, $helix, + $sheet, $turn, $ssbond, $link, $hydbnd, $sltbrg, $cispep, + $site, $cryst1, $tvect,); + my $struc = Bio::Structure::Entry->new(-id => 'created from pdb.pm'); + my $all_headers = ( !$self->_noheader ); # we'll parse all headers and store as annotation + my %header; # stores all header RECORDs an is stored as annotations when ATOM is reached + + + $line = $self->_readline; # This needs to be before the first eof() test + + if( !defined $line ) { + return undef; # no throws - end of file + } + + if( $line =~ /^\s+$/ ) { + while( defined ($line = $self->_readline) ) { + $line =~/\S/ && last; + } + } + if( !defined $line ) { + return undef; # end of file + } + $line =~ /^HEADER\s+\S+/ || $self->throw("PDB stream with no HEADER. Not pdb in my book"); + my($header_line) = unpack "x10 a56", $line; + $header{'header'} = $header_line; + my($class, $depdate, $idcode) = unpack "x10 a40 a9 x3 a4", $line; + $idcode =~ s/^\s*(\S+)\s*$/$1/; + $struc->id($idcode); +$self->debug("PBD c $class d $depdate id $idcode\n"); # XXX KB + + my $buffer = $line; + + BEFORE_COORDINATES : + until( !defined $buffer ) { + $_ = $buffer; + + # Exit at start of coordinate section + last if /^(MODEL|ATOM|HETATM)/; + + # OBSLTE line(s) + if (/^OBSLTE / && $all_headers) { + $obslte = $self->_read_PDB_singlecontline("OBSLTE","12-70",\$buffer); + $header{'obslte'} = $obslte; + } + + # TITLE line(s) + if (/^TITLE / && $all_headers) { + $title = $self->_read_PDB_singlecontline("TITLE","11-70",\$buffer); + $header{'title'} = $title; + } + + # CAVEAT line(s) + if (/^CAVEAT / && $all_headers) { + $caveat = $self->_read_PDB_singlecontline("CAVEAT","12-70",\$buffer); + $header{'caveat'} = $caveat; + } + + # COMPND line(s) + if (/^COMPND / && $all_headers) { + $compnd = $self->_read_PDB_singlecontline("COMPND","11-70",\$buffer); + $header{'compnd'} = $compnd; +$self->debug("get COMPND $compnd\n"); + } + + # SOURCE line(s) + if (/^SOURCE / && $all_headers) { + $source = $self->_read_PDB_singlecontline("SOURCE","11-70",\$buffer); + $header{'source'} = $source; + } + + # KEYWDS line(s) + if (/^KEYWDS / && $all_headers) { + $keywds = $self->_read_PDB_singlecontline("KEYWDS","11-70",\$buffer); + $header{'keywds'} = $keywds; + } + + # EXPDTA line(s) + if (/^EXPDTA / && $all_headers) { + $expdta = $self->_read_PDB_singlecontline("EXPDTA","11-70",\$buffer); + $header{'expdta'} = $expdta; + } + + # AUTHOR line(s) + if (/^AUTHOR / && $all_headers) { + $author = $self->_read_PDB_singlecontline("AUTHOR","11-70",\$buffer); + $header{'author'} = $author; + } + + # REVDAT line(s) + # a bit more elaborate as we also store the modification number + if (/^REVDAT / && $all_headers) { + ##my($modnum,$rol) = unpack "x7 A3 x3 A53", $_; + ##$modnum =~ s/\s+//; # remove spaces + ##$revdat{$modnum} .= $rol; + my ($rol) = unpack "x7 a59", $_; + $revdat .= $rol; + $header{'revdat'} = $revdat; + } + + # SPRSDE line(s) + if (/^SPRSDE / && $all_headers) { + $sprsde = $self->_read_PDB_singlecontline("SPRSDE","12-70",\$buffer); + $header{'sprsde'} = $sprsde; + } + + # jRNL line(s) + if (/^JRNL / && $all_headers) { + $jrnl = $self->_read_PDB_jrnl(\$buffer); + $struc->annotation->add_Annotation('reference',$jrnl); + $header{'jrnl'} = 1; # when writing out, we need a way to check there was a JRNL record (not mandatory) + } + + # REMARK line(s) + # we only parse the "REMARK 1" lines (additional references) + # thre rest is stored in %remark (indexed on remarkNum) (pack does space-padding) + if (/^REMARK\s+(\d+)\s*/ && $all_headers) { + my $remark_num = $1; + if ($remark_num == 1) { + my @refs = $self->_read_PDB_remark_1(\$buffer); + # How can we find the primary reference when writing (JRNL record) XXX KB + foreach my $ref (@refs) { + $struc->annotation->add_Annotation('reference', $ref); + } + # $_ still holds the REMARK_1 line, $buffer now contains the first non + # REMARK_1 line. We need to parse it in this pass (so no else block) + $_ = $buffer; + } + # for the moment I don't see a better solution (other then using goto) + if (/^REMARK\s+(\d+)\s*/) { + my $r_num = $1; + if ($r_num != 1) { # other remarks, we store literlly at the moment + my ($rol) = unpack "x11 a59", $_; + $remark{$r_num} .= $rol; + } + } + } # REMARK + + # DBREF line(s) + # references to sequences in other databases + # we store as 'dblink' annotations and whole line as simple annotation (round-trip) + if (/^DBREF / && $all_headers) { + my ($rol) = unpack "x7 a61", $_; + $dbref .= $rol; + $header{'dbref'} = $dbref; + my ($db, $acc) = unpack "x26 a6 x1 a8", $_; + $db =~ s/\s*$//; + $acc =~ s/\s*$//; + my $link = Bio::Annotation::DBLink->new; + $link->database($db); + $link->primary_id($acc); + $struc->annotation->add_Annotation('dblink', $link); + } # DBREF + + # SEQADV line(s) + if (/^SEQADV / && $all_headers) { + my ($rol) = unpack "x7 a63", $_; + $seqadv .= $rol; + $header{'seqadv'} = $seqadv; + } # SEQADV + + # SEQRES line(s) + # this is (I think) the sequence of macromolecule that was analysed + # this will be returned when doing $struc->seq + if (/^SEQRES / && $all_headers) { + my ($rol) = unpack "x8 a62", $_; + $seqres .= $rol; + $header{'seqres'} = $seqres; + } # SEQRES + + # MODRES line(s) + if (/^MODRES / && $all_headers) { + my ($rol) = unpack "x7 a63", $_; + $modres .= $rol; + $header{'modres'} = $modres; + } # MODRES + + # HET line(s) + if (/^HET / && $all_headers) { + my ($rol) = unpack "x7 a63", $_; + $het .= $rol; + $header{'het'} = $het; + } # HET + + # HETNAM line(s) + if (/^HETNAM / && $all_headers) { + my ($rol) = unpack "x8 a62", $_; + $hetnam .= $rol; + $header{'hetnam'} = $hetnam; + } # HETNAM + + # HETSYN line(s) + if (/^HETSYN / && $all_headers) { + my ($rol) = unpack "x8 a62", $_; + $hetsyn .= $rol; + $header{'hetsyn'} = $hetsyn; + } # HETSYN + + # FORMUL line(s) + if (/^FORMUL / && $all_headers) { + my ($rol) = unpack "x8 a62", $_; + $formul .= $rol; + $header{'formul'} = $formul; + } # FORMUL + + # HELIX line(s) + # store as specific object ?? + if (/^HELIX / && $all_headers) { + my ($rol) = unpack "x7 a69", $_; + $helix .= $rol; + $header{'helix'} = $helix; + } # HELIX + + # SHEET line(s) + # store as specific object ?? + if (/^SHEET / && $all_headers) { + my ($rol) = unpack "x7 a63", $_; + $sheet .= $rol; + $header{'sheet'} = $sheet; + } # SHEET + + # TURN line(s) + # store as specific object ?? + if (/^TURN / && $all_headers) { + my ($rol) = unpack "x7 a63", $_; + $turn .= $rol; + $header{'turn'} = $turn; + } # TURN + + # SSBOND line(s) + # store in connection-like object (see parsing of CONECT record) + if (/^SSBOND / && $all_headers) { + my ($rol) = unpack "x7 a65", $_; + $ssbond .= $rol; + $header{'ssbond'} = $ssbond; + } # SSBOND + + # LINK + # store like SSBOND ? + if (/^LINK / && $all_headers) { + my ($rol) = unpack "x12 a60", $_; + $link .= $rol; + $header{'link'} = $link; + } # LINK + + # HYDBND + # store like SSBOND + if (/^HYDBND / && $all_headers) { + my ($rol) = unpack "x12 a60", $_; + $hydbnd .= $rol; + $header{'hydbnd'} = $hydbnd; + } # HYDBND + + # SLTBRG + # store like SSBOND ? + if (/^SLTBRG / && $all_headers) { + my ($rol) = unpack "x12 a60",$_; + $sltbrg .= $rol; + $header{'sltbrg'} = $sltbrg; + } # SLTBRG + + # CISPEP + # store like SSBOND ? + if (/^CISPEP / && $all_headers) { + my ($rol) = unpack "x7 a52", $_; + $cispep .= $rol; + $header{'cispep'} = $cispep; + } + + # SITE line(s) + if (/^SITE / && $all_headers) { + my ($rol) = unpack "x7 a54", $_; + $site .= $rol; + $header{'site'} = $site; + } # SITE + + # CRYST1 line + # store in some crystallographic subobject ? + if (/^CRYST1/ && $all_headers) { + my ($rol) = unpack "x6 a64", $_; + $cryst1 .= $rol; + $header{'cryst1'} = $cryst1; + } # CRYST1 + + # ORIGXn line(s) (n=1,2,3) + if (/^(ORIGX\d) / && $all_headers) { + my $origxn = lc($1); + my ($rol) = unpack "x10 a45", $_; + $header{$origxn} .= $rol; + } # ORIGXn + + # SCALEn line(s) (n=1,2,3) + if (/^(SCALE\d) / && $all_headers) { + my $scalen = lc($1); + my ($rol) = unpack "x10 a45", $_; + $header{$scalen} .= $rol; + } # SCALEn + + # MTRIXn line(s) (n=1,2,3) + if (/^(MTRIX\d) / && $all_headers) { + my $mtrixn = lc($1); + my ($rol) = unpack "x7 a53", $_; + $header{$mtrixn} .= $rol; + } # MTRIXn + + # TVECT line(s) + if (/^TVECT / && $all_headers) { + my ($rol) = unpack "x7 a63", $_; + $tvect .= $rol; + $header{'tvect'} = $tvect; + } + + # Get next line. + $buffer = $self->_readline; + } + + # store %header entries a annotations + if (%header) { + for my $record (keys %header) { + my $sim = Bio::Annotation::SimpleValue->new(); + $sim->value($header{$record}); + $struc->annotation->add_Annotation($record, $sim); + } + } + # store %remark entries as annotations + if (%remark) { + for my $remark_num (keys %remark) { + my $sim = Bio::Annotation::SimpleValue->new(); + $sim->value($remark{$remark_num}); + $struc->annotation->add_Annotation("remark_$remark_num", $sim); + } + } + + # Coordinate section, the real meat + # + # $_ contains a line beginning with (ATOM|MODEL) + + $buffer = $_; + + + if (defined($buffer) && $buffer =~ /^(ATOM |MODEL |HETATM)/ ) { # can you have an entry without ATOM ? + until( !defined ($buffer) ) { # (yes : 1a7z ) + # read in one model at a time + my $model = $self->_read_PDB_coordinate_section(\$buffer, $struc); + # add this to $struc + $struc->add_model($struc, $model); + + if ($buffer !~ /^MODEL /) { # if we get here we have multiple MODELs + last; + } + } + } + else { + $self->throw("Could not find a coordinate section in this record\n"); + } + + + until( !defined $buffer ) { + $_ = $buffer; + + # CONNECT records + if (/^CONECT/) { + # do not differentiate between different type of connect (column dependant) + my $conect_unpack = "x6 a5 a5 a5 a5 a5 a5 a5 a5 a5 a5 a5"; + my (@conect) = unpack $conect_unpack, $_; + for my $k (0 .. $#conect) { + $conect[$k] =~ s/\s//g; + } + my $source = shift @conect; + my $type; + for my $k (0 .. 9) { + next unless ($conect[$k] =~ /^\d+$/); + # 0..3 bond + if( $k <= 3 ) { + $type = "bond"; + } + # 4..5,7..8 hydrogen bonded + elsif( ($k >= 4 && $k <= 5) || ($k >= 7 && $k <= 8) ) { + $type = "hydrogen"; + } + # 6, 9 salt bridged + elsif( $k == 6 || $k == 9 ) { + $type = "saltbridged"; + } else { + $self->throw("k has impossible value ($k), check brain"); + } + $struc->conect($source, $conect[$k], $type); + } + } + + # MASTER record + if (/^MASTER /) { + # the numbers in here a checksums, we should use them :) + my ($rol) = unpack "x10 a60", $_; + $struc->master($rol); + } + + if (/^END/) { + # this it the end ... + } + + $buffer = $self->_readline; + } + + + return $struc; +} + +=head2 write_structure + + Title : write_structure + Usage : $stream->write_structure($struc) + Function: writes the $struc object (must be a Bio::Structure) to the stream + Returns : 1 for success and 0 for error + Args : Bio::Structure object + + +=cut + +sub write_structure { + my ($self, $struc) = @_; + if( !defined $struc ) { + $self->throw("Attempting to write with no structure!"); + } + + if( ! ref $struc || ! $struc->isa('Bio::Structure::StructureI') ) { + $self->throw(" $struc is not a StructureI compliant module."); + } + my ($ann, $string, $output_string, $key); + # HEADER + ($ann) = $struc->annotation->get_Annotations("header"); + if ($ann) { + $string = $ann->as_text; + $string =~ s/^Value: //; + $output_string = pack ("A10 A56", "HEADER", $string); + } else { # not read in via read_structure, create HEADER line + my $id = $struc->id; + if (!$id) { + $id = "UNK1"; + } + if (length($id) > 4) { + $id = substr($id,0,4); + } + my $classification = "DEFAULT CLASSIFICATION"; + my $dep_date = "24-JAN-70"; + $output_string = pack ("A10 A40 A12 A4", "HEADER", $classification, $dep_date, $id); + } + $output_string .= " " x (80 - length($output_string) ); + $self->_print("$output_string\n"); + + my (%header); + for $key ($struc->annotation->get_all_annotation_keys) { + $header{$key} = 1;; + } + + exists $header{'obslte'} && $self->_write_PDB_simple_record(-name => "OBSLTE ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("obslte"), -rol => "11-70"); + + exists $header{'title'} && $self->_write_PDB_simple_record(-name => "TITLE ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("title"), -rol => "11-70"); + + exists $header{'caveat'} && $self->_write_PDB_simple_record(-name => "CAVEAT ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("caveat"), -rol => "12-70"); + + exists $header{'compnd'} && $self->_write_PDB_simple_record(-name => "COMPND ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("compnd"), -rol => "11-70"); + + exists $header{'source'} && $self->_write_PDB_simple_record(-name => "SOURCE ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("source"), -rol => "11-70"); + + exists $header{'keywds'} && $self->_write_PDB_simple_record(-name => "KEYWDS ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("keywds"), -rol => "11-70"); + + exists $header{'expdta'} && $self->_write_PDB_simple_record(-name => "EXPDTA ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("expdta"), -rol => "11-70"); + + exists $header{'author'} && $self->_write_PDB_simple_record(-name => "AUTHOR ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("author"), -rol => "11-70"); + + exists $header{'revdat'} && $self->_write_PDB_simple_record(-name => "REVDAT ", + -annotation => $struc->annotation->get_Annotations("revdat"), -rol => "8-66"); + + exists $header{'sprsde'} && $self->_write_PDB_simple_record(-name => "SPRSDE ", -cont => "9-10", + -annotation => $struc->annotation->get_Annotations("sprsde"), -rol => "12-70"); + + # JRNL en REMARK 1 + my ($jrnl_done, $remark_1_counter); + if ( !exists $header{'jrnl'} ) { + $jrnl_done = 1; + } + foreach my $ref ($struc->annotation->get_Annotations('reference') ) { + if( !$jrnl_done ) { # JRNL record + $ref->authors && $self->_write_PDB_simple_record(-name => "JRNL AUTH", + -cont => "17-18", -rol => "20-70", -string => $ref->authors ); + $ref->title && $self->_write_PDB_simple_record(-name => "JRNL TITL", + -cont => "17-18", -rol => "20-70", -string => $ref->title ); + $ref->editors && $self->_write_PDB_simple_record(-name => "JRNL EDIT", + -cont => "17-18", -rol => "20-70", -string => $ref->editors ); + $ref->location && $self->_write_PDB_simple_record(-name => "JRNL REF ", + -cont => "17-18", -rol => "20-70", -string => $ref->location ); + $ref->editors && $self->_write_PDB_simple_record(-name => "JRNL EDIT", + -cont => "17-18", -rol => "20-70", -string => $ref->editors ); + $ref->encoded_ref && $self->_write_PDB_simple_record(-name => "JRNL REFN", + -cont => "17-18", -rol => "20-70", -string => $ref->encoded_ref ); + $jrnl_done = 1; + } else { # REMARK 1 + if (!$remark_1_counter) { # header line + my $remark_1_header_line = "REMARK 1" . " " x 70; + $self->_print("$remark_1_header_line\n"); + $remark_1_counter = 1; + } + # per reference header + my $rem_line = "REMARK 1 REFERENCE " . $remark_1_counter; + $rem_line .= " " x (80 - length($rem_line) ); + $self->_print($rem_line,"\n"); + $ref->authors && $self->_write_PDB_simple_record(-name => "REMARK 1 AUTH", + -cont => "17-18", -rol => "20-70", -string => $ref->authors ); + $ref->title && $self->_write_PDB_simple_record(-name => "REMARK 1 TITL", + -cont => "17-18", -rol => "20-70", -string => $ref->title ); + $ref->editors && $self->_write_PDB_simple_record(-name => "REMARK 1 EDIT", + -cont => "17-18", -rol => "20-70", -string => $ref->editors ); + $ref->location && $self->_write_PDB_simple_record(-name => "REMARK 1 REF ", + -cont => "17-18", -rol => "20-70", -string => $ref->location ); + $ref->editors && $self->_write_PDB_simple_record(-name => "REMARK 1 EDIT", + -cont => "17-18", -rol => "20-70", -string => $ref->editors ); + $ref->encoded_ref && $self->_write_PDB_simple_record(-name => "REMARK 1 REFN", + -cont => "17-18", -rol => "20-70", -string => $ref->encoded_ref ); + $remark_1_counter++; + } + } + if (! defined $remark_1_counter ) { # no remark 1 record written yet + my $remark_1_header_line = "REMARK 1" . " " x 70; + $self->_print("$remark_1_header_line\n"); # write dummy (we need this line) + } + + # REMARK's (not 1 at the moment, references) + my (%remarks, $remark_num); + for $key (keys %header) { + next unless ($key =~ /^remark_(\d+)$/); + next if ($1 == 1); + $remarks{$1} = 1; + } + for $remark_num (sort {$a <=> $b} keys %remarks) { + $self->_write_PDB_remark_record($struc, $remark_num); + } + + exists $header{'dbref'} && $self->_write_PDB_simple_record(-name => "DBREF ", + -annotation => $struc->annotation->get_Annotations("dbref"), -rol => "8-68"); + exists $header{'seqadv'} && $self->_write_PDB_simple_record(-name => "SEQADV ", + -annotation => $struc->annotation->get_Annotations("seqadv"), -rol => "8-70"); + exists $header{'seqres'} && $self->_write_PDB_simple_record(-name => "SEQRES ", + -annotation => $struc->annotation->get_Annotations("seqres"), -rol => "9-70"); + exists $header{'modres'} && $self->_write_PDB_simple_record(-name => "MODRES ", + -annotation => $struc->annotation->get_Annotations("modres"), -rol => "8-70"); + exists $header{'het'} && $self->_write_PDB_simple_record(-name => "HET ", + -annotation => $struc->annotation->get_Annotations("het"), -rol => "8-70"); + exists $header{'hetnam'} && $self->_write_PDB_simple_record(-name => "HETNAM ", + -annotation => $struc->annotation->get_Annotations("hetnam"), -rol => "9-70"); + exists $header{'hetsyn'} && $self->_write_PDB_simple_record(-name => "HETSYN ", + -annotation => $struc->annotation->get_Annotations("hetsyn"), -rol => "9-70"); + exists $header{'formul'} && $self->_write_PDB_simple_record(-name => "FORMUL ", + -annotation => $struc->annotation->get_Annotations("formul"), -rol => "9-70"); + exists $header{'helix'} && $self->_write_PDB_simple_record(-name => "HELIX ", + -annotation => $struc->annotation->get_Annotations("helix"), -rol => "8-76"); + exists $header{'sheet'} && $self->_write_PDB_simple_record(-name => "SHEET ", + -annotation => $struc->annotation->get_Annotations("sheet"), -rol => "8-70"); + exists $header{'turn'} && $self->_write_PDB_simple_record(-name => "TURN ", + -annotation => $struc->annotation->get_Annotations("turn"), -rol => "8-70"); + exists $header{'ssbond'} && $self->_write_PDB_simple_record(-name => "SSBOND ", + -annotation => $struc->annotation->get_Annotations("ssbond"), -rol => "8-72"); + exists $header{'link'} && $self->_write_PDB_simple_record(-name => "LINK ", + -annotation => $struc->annotation->get_Annotations("link"), -rol => "13-72"); + exists $header{'hydbnd'} && $self->_write_PDB_simple_record(-name => "HYDBND ", + -annotation => $struc->annotation->get_Annotations("hydbnd"), -rol => "13-72"); + exists $header{'sltbrg'} && $self->_write_PDB_simple_record(-name => "SLTBRG ", + -annotation => $struc->annotation->get_Annotations("sltbrg"), -rol => "13-72"); + exists $header{'cispep'} && $self->_write_PDB_simple_record(-name => "CISPEP ", + -annotation => $struc->annotation->get_Annotations("cispep"), -rol => "8-59"); + exists $header{'site'} && $self->_write_PDB_simple_record(-name => "SITE ", + -annotation => $struc->annotation->get_Annotations("site"), -rol => "8-61"); + exists $header{'cryst1'} && $self->_write_PDB_simple_record(-name => "CRYST1", + -annotation => $struc->annotation->get_Annotations("cryst1"), -rol => "7-70"); + for my $k (1..3) { + my $origxn = "origx".$k; + my $ORIGXN = uc($origxn)." "; + exists $header{$origxn} && $self->_write_PDB_simple_record(-name => $ORIGXN, + -annotation => $struc->annotation->get_Annotations($origxn), -rol => "11-55"); + } + for my $k (1..3) { + my $scalen = "scale".$k; + my $SCALEN = uc($scalen)." "; + exists $header{$scalen} && $self->_write_PDB_simple_record(-name => $SCALEN, + -annotation => $struc->annotation->get_Annotations($scalen), -rol => "11-55"); + } + for my $k (1..3) { + my $mtrixn = "mtrix".$k; + my $MTRIXN = uc($mtrixn)." "; + exists $header{$mtrixn} && $self->_write_PDB_simple_record(-name => $MTRIXN, + -annotation => $struc->annotation->get_Annotations($mtrixn), -rol => "8-60"); + } + exists $header{'tvect'} && $self->_write_PDB_simple_record(-name => "TVECT ", + -annotation => $struc->annotation->get_Annotations("tvect"), -rol => "8-70"); + + # write out coordinate section + # + my %het_res; # hetero residues + $het_res{'HOH'} = 1; # water is default + if (exists $header{'het'}) { + my ($het_line) = ($struc->annotation->get_Annotations("het"))[0]->as_text; + $het_line =~ s/^Value: //; + for ( my $k = 0; $k <= length $het_line ; $k += 63) { + my $l = substr $het_line, $k, 63; + $l =~ s/^\s*(\S+)\s+.*$/$1/; + $het_res{$l} = 1; + } + } + for my $model ($struc->get_models) { + # more then one model ? + if ($struc->get_models > 1) { + my $model_line = sprintf("MODEL %4d", $model->id); + $model_line .= " " x (80 - length($model_line) ); + $self->_print($model_line, "\n"); + } + for my $chain ($struc->get_chains($model)) { + my ($residue, $atom, $resname, $resnum, $atom_line, $atom_serial, $atom_icode, $chain_id); + my ($prev_resname, $prev_resnum, $prev_atomicode); # need these for TER record + my $wr_ter = 0; # have we already written out a TER for this chain + $chain_id = $chain->id; + if ( $chain_id eq "default" ) { + $chain_id = " "; + } +$self->debug("model_id: $model->id chain_id: $chain_id\n"); + for $residue ($struc->get_residues($chain)) { + ($resname, $resnum) = split /-/, $residue->id; + for $atom ($struc->get_atoms($residue)) { + if ($het_res{$resname}) { # HETATM + if ( ! $wr_ter && $resname ne "HOH" ) { # going from ATOM -> HETATM, we have to write TER + my $ter_line = "TER "; + $ter_line .= sprintf("%5d", $atom_serial + 1); + $ter_line .= " "; + $ter_line .= sprintf("%3s ", $prev_resname); + $ter_line .= $chain_id; + $ter_line .= sprintf("%4d", $prev_resnum); + $ter_line .= $atom_icode ? $prev_atomicode : " "; # 27 + $ter_line .= " " x (80 - length $ter_line); # extend to 80 chars + $self->_print($ter_line,"\n"); + $wr_ter = 1; + } + $atom_line = "HETATM"; + } else { + $atom_line = "ATOM "; + } + $atom_line .= sprintf("%5d ", $atom->serial); + $atom_serial = $atom->serial; # we need it for TER record + $atom_icode = $atom->icode; + # remember some stuff if next iteration needs writing TER + $prev_resname = $resname; + $prev_resnum = $resnum; + $prev_atomicode = $atom_icode; + # getting the name of the atom correct is subtrivial + my $atom_id = $atom->id; + # is pdb_atomname set, then use this (most probably set when + # reading in the PDB record) + my $pdb_atomname = $atom->pdb_atomname; + if( defined $pdb_atomname ) { + $atom_line .= sprintf("%-4s", $pdb_atomname); + } else { + # start (educated) guessing + my $element = $atom->element; + if( defined $element && $element ne "H") { + # element should be at first two positions (right justified) + # ie. Calcium should be "CA " + # C alpha should be " CA " + if( length($element) == 2 ) { + $atom_line .= sprintf("%-4s", $atom->id); + } else { + $atom_line .= sprintf(" %-3s", $atom->id); + } + } else { # old behaviour do a best guess + if ($atom->id =~ /^\dH/) { # H: four positions, left justified + $atom_line .= sprintf("%-4s", $atom->id); + } elsif (length($atom_id) == 4) { + if ($atom_id =~ /^(H\d\d)(\d)$/) { # turn H123 into 3H12 + $atom_line .= $2.$1; + } else { # no more guesses, no more alternatives + $atom_line .= $atom_id; + } + } else { # if we get here and it is not correct let me know + $atom_line .= sprintf(" %-3s", $atom->id); + } + } + } + # we don't do alternate location at this moment + $atom_line .= " "; # 17 + $atom_line .= sprintf("%3s",$resname); # 18-20 + $atom_line .= " ".$chain_id; # 21, 22 + $atom_line .= sprintf("%4d", $resnum); # 23-26 + $atom_line .= $atom->icode ? $atom->icode : " "; # 27 + $atom_line .= " "; # 28-30 + $atom_line .= sprintf("%8.3f", $atom->x); # 31-38 + $atom_line .= sprintf("%8.3f", $atom->y); # 39-46 + $atom_line .= sprintf("%8.3f", $atom->z); # 47-54 + $atom_line .= sprintf("%6.2f", $atom->occupancy); # 55-60 + $atom_line .= sprintf("%6.2f", $atom->tempfactor); # 61-66 + $atom_line .= " "; # 67-72 + $atom_line .= $atom->segID ? # segID 73-76 + sprintf("%-4s", $atom->segID) : + " "; + $atom_line .= $atom->element ? + sprintf("%2s", $atom->element) : + " "; + $atom_line .= $atom->charge ? + sprintf("%2s", $atom->charge) : + " "; + + $self->_print($atom_line,"\n"); + } + } + # write out TER record if it hasn't been written yet + if ( $resname ne "HOH" && ! $wr_ter ) { + my $ter_line = "TER "; + $ter_line .= sprintf("%5d", $atom_serial + 1); + $ter_line .= " "; + $ter_line .= sprintf("%3s ", $resname); + $ter_line .= $chain_id; + $ter_line .= sprintf("%4d", $resnum); + $ter_line .= $atom_icode ? $atom_icode : " "; # 27 + $ter_line .= " " x (80 - length $ter_line); # extend to 80 chars + $self->_print($ter_line,"\n"); + $wr_ter = 1; + } + } + if ($struc->get_models > 1) { # we need ENDMDL + my $endmdl_line = "ENDMDL" . " " x 74; + $self->_print($endmdl_line, "\n"); + } + } # for my $model + + # CONECT + my @sources = $struc->get_all_conect_source; + my ($conect_line,@conect, @bond, @hydbond, @saltbridge, $to, $type); + for my $source (@sources) { + # get all conect's + my @conect = $struc->conect($source); + # classify + for my $con (@conect) { + ($to, $type) = split /_/, $con; + if($type eq "bond") { + push @bond, $to; + } elsif($type eq "hydrogenbonded") { + push @hydbond, $to; + } elsif($type eq "saltbridged") { + push @saltbridge, $to; + } else { + $self->throw("type $type is unknown for conect"); + } + } + # and write out CONECT lines as long as there is something + # in one of the arrays + while ( @bond || @hydbond || @saltbridge) { + my ($b, $hb, $sb); + $conect_line = "CONECT". sprintf("%5d", $source); + for my $k (0..3) { + $b = shift @bond; + $conect_line .= $b ? sprintf("%5d", $b) : " "; + } + for my $k (4..5) { + $hb = shift @hydbond; + $conect_line .= $hb ? sprintf("%5d", $hb) : " "; + } + $sb = shift @saltbridge; + $conect_line .= $sb ? sprintf("%5d", $sb) : " "; + for my $k (7..8) { + $hb = shift @hydbond; + $conect_line .= $hb ? sprintf("%5d", $hb) : " "; + } + $sb = shift @saltbridge; + $conect_line .= $sb ? sprintf("%5d", $sb) : " "; + + $conect_line .= " " x (80 - length($conect_line) ); + $self->_print($conect_line, "\n"); + } + } + + + # MASTER line contains checksums, we should calculate them of course :) + my $master_line = "MASTER " . $struc->master; + $master_line .= " " x (80 - length($master_line) ); + $self->_print($master_line, "\n"); + + my $end_line = "END" . " " x 77; + $self->_print($end_line,"\n"); + + #$self->throw("write_structure is not yet implemented, start holding your breath\n"); +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +sub _filehandle{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_filehandle'} = $value; + } + return $obj->{'_filehandle'}; + +} + +=head2 _noatom + + Title : _noatom + Usage : $obj->_noatom($newval) + Function: + Example : + Returns : value of _noatom + Args : newvalue (optional) + + +=cut + +sub _noatom{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_noatom'} = $value; + } + return $obj->{'_noatom'}; + +} + +=head2 _noheader + + Title : _noheader + Usage : $obj->_noheader($newval) + Function: + Example : + Returns : value of _noheader + Args : newvalue (optional) + + +=cut + +sub _noheader{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'_noheader'} = $value; + } + return $obj->{'_noheader'}; + +} + +=head2 _read_PDB_singlecontline + + Title : _read_PDB_singlecontline + Usage : $obj->_read_PDB_singlecontline($record, $fromto, $buffer)) + Function: read single continued record from PDB + Returns : concatenated record entry (between $fromto columns) + Args : record, colunm delimiters, buffer + +=cut + +sub _read_PDB_singlecontline { + my ($self, $record, $fromto, $buffer) = @_; + my $concat_line; + + my ($begin, $end) = (split (/-/, $fromto)); + my $unpack_string = "x8 a2 "; + if($begin == 12) { # one additional space + $unpack_string .= "x1 a59"; + } else { + $unpack_string .= "a60"; + } + $_ = $$buffer; + while (defined( $_ ||= $self->_readline ) ) { + if ( /^$record/ ) { + my($cont, $rol) = unpack $unpack_string, $_; + if($cont =~ /\d$/ && $begin == 11) { # continuation line + # and text normally at pos 11 + $rol =~ s/^\s//; # strip leading space + } + ## no space (store litteraly) $concat_line .= $rol . " "; + $concat_line .= $rol; + } else { + last; + } + + $_ = undef; + } + $concat_line =~ s/\s$//; # remove trailing space + $$buffer = $_; + + return $concat_line; +} + + +=head2 _read_PDB_jrnl + + Title : _read_PDB_jrnl + Usage : $obj->_read_PDB_jrnl($\buffer)) + Function: read jrnl record from PDB + Returns : Bio::Annotation::Reference object + Args : + +=cut + +sub _read_PDB_jrnl { + my ($self, $buffer) = @_; + + $_ = $$buffer; + my ($auth, $titl,$edit,$ref,$publ,$refn); + while (defined( $_ ||= $self->_readline )) { + if (/^JRNL /) { + # this code belgons in a seperate method (shared with + # remark 1 parsing) + my ($rec, $subr, $cont, $rol) = unpack "A6 x6 A4 A2 x1 A51", $_; + $auth = $self->_concatenate_lines($auth,$rol) if ($subr eq "AUTH"); + $titl = $self->_concatenate_lines($titl,$rol) if ($subr eq "TITL"); + $edit = $self->_concatenate_lines($edit,$rol) if ($subr eq "EDIT"); + $ref = $self->_concatenate_lines($ref ,$rol) if ($subr eq "REF"); + $publ = $self->_concatenate_lines($publ,$rol) if ($subr eq "PUBL"); + $refn = $self->_concatenate_lines($refn,$rol) if ($subr eq "REFN"); + } else { + last; + } + + $_ = undef; # trigger reading of next line + } # while + + $$buffer = $_; + my $jrnl_ref = Bio::Annotation::Reference->new; + + $jrnl_ref->authors($auth); + $jrnl_ref->title($titl); + $jrnl_ref->location($ref); + $jrnl_ref->publisher($publ); + $jrnl_ref->editors($edit); + $jrnl_ref->encoded_ref($refn); + + return $jrnl_ref; +} # sub _read_PDB_jrnl + + +=head2 _read_PDB_remark_1 + + Title : _read_PDB_remark_1 + Usage : $obj->_read_PDB_remark_1($\buffer)) + Function: read "remark 1" record from PDB + Returns : array of Bio::Annotation::Reference objects + Args : + +=cut + +sub _read_PDB_remark_1 { + my ($self, $buffer) = @_; + + $_ = $$buffer; + my ($auth, $titl,$edit,$ref,$publ,$refn,$refnum); + my @refs; + + while (defined( $_ ||= $self->_readline )) { + if (/^REMARK 1 /) { + if (/^REMARK 1\s+REFERENCE\s+(\d+)\s*/) { + $refnum = $1; + if ($refnum != 1) { # this is first line of a reference + my $rref = Bio::Annotation::Reference->new; + $rref->authors($auth); + $rref->title($titl); + $rref->location($ref); + $rref->publisher($publ); + $rref->editors($edit); + $rref->encoded_ref($refn); + $auth = $titl = $edit = $ref = $publ = $refn = undef; + push @refs, $rref; + } + } else { + # this code belgons in a seperate method (shared with + # remark 1 parsing) + my ($rec, $subr, $cont, $rol) = unpack "A6 x6 A4 A2 x1 A51", $_; + $auth = $self->_concatenate_lines($auth,$rol) if ($subr eq "AUTH"); + $titl = $self->_concatenate_lines($titl,$rol) if ($subr eq "TITL"); + $edit = $self->_concatenate_lines($edit,$rol) if ($subr eq "EDIT"); + $ref = $self->_concatenate_lines($ref ,$rol) if ($subr eq "REF"); + $publ = $self->_concatenate_lines($publ,$rol) if ($subr eq "PUBL"); + $refn = $self->_concatenate_lines($refn,$rol) if ($subr eq "REFN"); + } + } else { + # have we seen any reference at all (could be single REMARK 1 line + if ( ! defined ($refnum) ) { + last; # get out of while() + } + + # create last reference + my $rref = Bio::Annotation::Reference->new; + $rref->authors($auth); + $rref->title($titl); + $rref->location($ref); + $rref->publisher($publ); + $rref->editors($edit); + $rref->encoded_ref($refn); + push @refs, $rref; + last; + } + + $_ = undef; # trigger reading of next line + } # while + + $$buffer = $_; + + return @refs; +} # sub _read_PDB_jrnl + + +=head2 _read_PDB_coordinate_section + + Title : _read_PDB_coordinate_section + Usage : $obj->_read_PDB_coordinate_section($\buffer)) + Function: read one model from a PDB + Returns : Bio::Structure::Model object + Args : + +=cut + +sub _read_PDB_coordinate_section { + my ($self, $buffer, $struc) = @_; + my ($model_num, $chain_name, $residue_name, $atom_name); # to keep track of state + $model_num = ""; + $chain_name = ""; + $residue_name = ""; + $atom_name = ""; + + my $atom_unpack = "x6 a5 x1 a4 a1 a3 x1 a1 a4 a1 x3 a8 a8 a8 a6 a6 x6 a4 a2 a2"; + my $anisou_unpack = "x6 a5 x1 a4 a1 a3 x1 a1 a4 a1 x1 a7 a7 a7 a7 a7 a7 a4 a2 a2"; + + my $model = Bio::Structure::Model->new; + $model->id('default'); + my $noatom = $self->_noatom; + my ($chain, $residue, $atom, $old); + my (%_ch_in_model); # which chains are already in this model + + $_ = $$buffer; + while (defined( $_ ||= $self->_readline )) { + # start of a new model + if (/^MODEL\s+(\d+)/) { + $model_num = $1; +$self->debug("_read_PDB_coor: parsing model $model_num\n"); + $model->id($model_num); + if (/^MODEL\s+\d+\s+\S+/) { # old format (pre 2.1) + $old = 1; + } + } + # old hier ook setten XXX + # ATOM lines, if first set chain + if (/^(ATOM |HETATM|SIGATM)/) { + my @line_elements = unpack $atom_unpack, $_; + my $pdb_atomname = $line_elements[1]; # need to get this before removing spaces + for my $k (0 .. $#line_elements) { + $line_elements[$k] =~ s/^\s+//; # remove leading space + $line_elements[$k] =~ s/\s+$//; # remove trailing space + $line_elements[$k] = undef if ($line_elements[$k] =~ /^\s*$/); + } + my ($serial, $atomname, $altloc, $resname, $chainID, $resseq, $icode, $x, $y, $z, + $occupancy, $tempfactor, $segID, $element, $charge) = @line_elements; + $chainID = 'default' if ( !defined $chainID ); + if ($chainID ne $chain_name) { # possibly a new chain + # fix for bug #1187 + # we can have ATOM/HETATM of an already defined chain (A B A B) + # e.g. 1abm + + if (exists $_ch_in_model{$chainID} ) { # we have already seen this chain in this model + $chain = $_ch_in_model{$chainID}; + } else { # we create a new chain + $chain = Bio::Structure::Chain->new; + $struc->add_chain($model,$chain); + $chain->id($chainID); + $_ch_in_model{$chainID} = $chain; + } + $chain_name = $chain->id; + } + + # fix from bug 1485, by dhoworth@mrc-lmb.cam.ac.uk + # passes visual inspection by Ewan and tests are ok. + # (bug fix was to add $icode here to make unique) + # original looked like + # my $res_name_num = $resname."-".$resseq; + + # to get around warning, set icode to "" if not defined + if( !defined $icode ) { + $icode = ""; + } + + my $res_name_num = $resname."-".$resseq.$icode; + if ($res_name_num ne $residue_name) { # new residue + $residue = Bio::Structure::Residue->new; + $struc->add_residue($chain,$residue); + $residue->id($res_name_num); + $residue_name = $res_name_num; + $atom_name = ""; # only needed inside a residue + } + # get out of here if we don't want the atom objects + if ($noatom) { + $_ = undef; + next; + } + # alternative location: only take first one + if ( $altloc && ($altloc =~ /\S+/) && ($atomname eq $atom_name) ) { + $_ = undef; # trigger reading next line + next; + } + if (/^(ATOM |HETATM)/) { # ATOM / HETATM + $atom_name = $atomname; + $atom = Bio::Structure::Atom->new; + $struc->add_atom($residue,$atom); + $atom->id($atomname); + $atom->pdb_atomname($pdb_atomname); # store away PDB atomname for writing out + $atom->serial($serial); + $atom->icode($icode); + $atom->x($x); + $atom->y($y); + $atom->z($z); + $atom->occupancy($occupancy); + $atom->tempfactor($tempfactor); + $atom->segID($segID); # deprecated but used by people + if (! $old ) { + $atom->element($element); + $atom->charge($charge); + } + } + else { # SIGATM + my $sigx = $x; + my $sigy = $y; + my $sigz = $z; + my $sigocc = $occupancy; + my $sigtemp = $tempfactor; + if ($atom_name ne $atomname) { # something wrong with PDB file + $self->throw("A SIGATM record should have the same $atomname as the previous record $atom_name\n"); + } + $atom->sigx($sigx); + $atom->sigy($sigy); + $atom->sigz($sigz); + $atom->sigocc($sigocc); + $atom->sigtemp($sigtemp); + + } + } # ATOM|HETARM|SIGATM + + # ANISOU | SIGUIJ lines + if (/^(ANISOU|SIGUIJ)/) { + if ($noatom) { + $_ = undef; + next; + } + my @line_elements = unpack $anisou_unpack, $_; + for my $k (0 .. $#line_elements) { + $line_elements[$k] =~ s/^\s+//; # remove leading space + $line_elements[$k] =~ s/\s+$//; # remove trailing space + $line_elements[$k] = undef if ($line_elements[$k] =~ /^\s*$/); + } + my ($serial, $atomname, $altloc, $resname, $chainID, $resseq, $icode, + $u11,$u22, $u33, $u12, $u13, $u23, $segID, $element, $charge) = @line_elements; +$self->debug("read_PDB_coor: parsing ANISOU record: $serial $atomname\n"); + if ( $altloc && ($altloc =~ /\S+/) && ($atomname eq $atom_name) ) { + $_ = undef; + next; + } + if (/^ANISOU/) { + if ($atom_name ne $atomname) { # something wrong with PDB file + $self->throw("A ANISOU record should have the same $atomname as the previous record $atom_name\n"); + } + $atom->aniso("u11",$u11); + $atom->aniso("u22",$u22); + $atom->aniso("u33",$u33); + $atom->aniso("u12",$u12); + $atom->aniso("u13",$u13); + $atom->aniso("u23",$u23); + } + else { # SIGUIJ + if ($atom_name ne $atomname) { # something wrong with PDB file + $self->throw("A SIGUIJ record should have the same $atomname as the previous record $atom_name\n"); + } + # could use different variable names, but hey ... + $atom->aniso("sigu11",$u11); + $atom->aniso("sigu22",$u22); + $atom->aniso("sigu33",$u33); + $atom->aniso("sigu12",$u12); + $atom->aniso("sigu13",$u13); + $atom->aniso("sigu23",$u23); + } + } # ANISOU | SIGUIJ + + if (/^TER /) { + $_ = undef; + next; + } + + if (/^ENDMDL/) { + $_ = $self->_readline; + last; + } + + if (/^(CONECT|MASTER)/) { # get out of here + # current line is OK + last; + } + $_ = undef; + + } # while + + $$buffer = $_; + + return $model; +} # _read_PDB_coordinate_section + + +sub _write_PDB_simple_record { + my ($self, @args) = @_; + my ($name, $cont , $annotation, $rol, $string) = + $self->_rearrange([qw( + NAME + CONT + ANNOTATION + ROL + STRING + )], + @args); + if (defined $string && defined $annotation) { + $self->throw("you can only supply one of -annoation or -string"); + } + my ($output_string, $ann_string, $t_string); + my ($rol_begin, $rol_end) = $rol =~ /^(\d+)-(\d+)$/; + my $rol_length = $rol_end - $rol_begin +1; + if ($string) { + if (length $string > $rol_length) { + # we might need to split $string in multiple lines + while (length $string > $rol_length) { + # other option might be to go for a bunch of substr's + my @c = split//,$string; + my $t = $rol_length; # index into @c + while ($c[$t] ne " ") { # find first space, going backwards +$self->debug("c[t]: $c[$t] $t\n"); + $t--; + if ($t == 0) { $self->throw("Found no space for $string\n"); } + } +$self->debug("t: $t rol_length: $rol_length\n"); + $ann_string .= substr($string, 0, $t); +$self->debug("ann_string: $ann_string\n"); + $ann_string .= " " x ($rol_length - $t ); + $string = substr($string, $t+1); + $string =~ s/^\s+//; +$self->debug("ann_string: $ann_string~~\nstring: $string~~\n"); + } + $ann_string .= $string; + } else { + $ann_string = $string; + } + } else { + $ann_string = $annotation->as_text; + $ann_string =~ s/^Value: //; + } + # ann_string contains the thing to write out, writing out happens below + my $ann_length = length $ann_string; + +$self->debug("ann_string: $ann_string\n"); + if ($cont) { + my ($c_begin, $c_end) = $cont =~ /^(\d+)-(\d+)$/; + if ( $ann_length > $rol_length ) { # we need to continuation lines + my $first_line = 1; + my $cont_number = 2; + my $out_line; + my $num_pos = $rol_length; + my $i = 0; + while( $i < $ann_length ) { + $t_string = substr($ann_string, $i, $num_pos); +$self->debug("t_string: $t_string~~$i $num_pos\n"); + if ($first_line) { + $out_line = $name . " " x ($rol_begin - $c_begin) . $t_string; + $out_line .= " " x (80 - length($out_line) ) . "\n"; + $first_line = 0; + $output_string = $out_line; + $i += $num_pos; # first do counter + if ($rol_begin - $c_end == 1) { # next line one character less + $num_pos--; + } + } else { + $out_line = $name . sprintf("%2d",$cont_number); + # a space after continuation number + if ($rol_begin - $c_end == 1) { # one space after cont number + $out_line .= " "; + $out_line .= $t_string; + } else { + $out_line .= " " x ($rol_begin - $c_end - 1) . $t_string; + } + $out_line .= " " x (80 -length($out_line) ) . "\n"; + $cont_number++; + $output_string .= $out_line; + $i += $num_pos; + } + } + } else { # no continuation + my $spaces = $rol_begin - $c_begin; # number of spaces need to insert + $output_string = $name . " " x $spaces . $ann_string; + $output_string .= " " x (80 - length($output_string) ); + } + } else { # no contintuation lines + if ($ann_length < $rol_length) { + $output_string = $name . $ann_string; + $output_string .= " " x (80 - length($output_string) ); + } else { + for (my $i = 0; $i < $ann_length; $i += $rol_length) { + my $out_line; + $t_string = substr($ann_string, $i, $rol_length); + $out_line = $name . $t_string; + $out_line .= " " x (80 -length($out_line) ) . "\n"; + $output_string .= $out_line; + } + } + } + $output_string =~ s/\n$//; # remove trailing newline + $self->_print("$output_string\n"); + +} + +sub _write_PDB_remark_record { + my ($self, $struc, $remark_num) = @_; + my ($ann) = $struc->annotation->get_Annotations("remark_$remark_num"); + my $name = sprintf("REMARK %3d ",$remark_num); + $self->_write_PDB_simple_record(-name => $name, -annotation => $ann, -rol => "12-70"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/Model.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/Model.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,294 @@ +# $Id: Model.pm,v 1.6 2002/10/22 07:38:44 lapp Exp $ +# +# bioperl module for Bio::Structure::Model +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright Kris Boulez +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::Model - Bioperl structure Object, describes a Model + +=head1 SYNOPSIS + + #add synopsis here + +=head1 DESCRIPTION + +This object stores a Bio::Structure::Chain + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Structure::Model; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Structure::Entry; +use Bio::Structure::Chain; +@ISA = qw(Bio::Root::Root); + + +=head2 new() + + Title : new() + Usage : $struc = Bio::Structure::Model->new( + -id => 'human_id', + ); + + Function: Returns a new Bio::Structure::Model object from basic + constructors. Probably most called from Bio::Structure::IO. + Returns : a new Bio::Structure::Model object + +=cut + + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($id, $chain, $residue ) = + $self->_rearrange([qw( + ID + CHAIN + RESIDUE + )], + @args); + + $id && $self->id($id); + + $chain && $self->throw("you have to add chain via an Entry object\n"); + + $residue && $self->throw("you have to add residues via an Entry object\n"); + + return $self; +} + + + +=head2 chain() + + Title : chain + Usage : + Function: will eventually allow parent/child navigation not via an Entry object + Returns : + Args : + +=cut + +sub chain { + my ($self,$value) = @_; + + $self->throw("go via an Entry object\n"); +} + + +=head2 add_chain() + + Title : add_chain + Usage : + Function: will eventually allow parent/child navigation not via an Entry object + Returns : + Args : + +=cut + +sub add_chain { + my ($self,$value) = @_; + + $self->throw("go via an Entry object for now\n"); +} + +=head2 entry() + + Title : entry + Usage : + Function: will eventually allow parent/child navigation not via an Entry object + Returns : + Args : + +=cut + +sub entry { + my($self) = @_; + + $self->throw("Model::entry go via an Entry object please\n"); +} + + +=head2 id() + + Title : id + Usage : $model->id("model 5") + Function: Gets/sets the ID for this model + Returns : the ID + Args : the ID + +=cut + +sub id { + my ($self, $value) = @_;; + if (defined $value) { + $self->{'id'} = $value; + } + return $self->{'id'}; +} + +=head2 residue() + + Title : residue + Usage : + Function: will eventually allow parent/child navigation not via an Entry object + Returns : + Args : + +=cut + +sub residue { + my ($self, @args) = @_; + + $self->throw("need to go via Entry object or learn symbolic refs\n"); +} + + +=head2 add_residue() + + Title : add_residue + Usage : + Function: will eventually allow parent/child navigation not via an Entry object + Returns : + Args : + +=cut + +sub add_residue { + my ($self, @args) = @_; + + $self->throw("go via entry->add_residue(chain, residue)\n"); +} + + + +sub DESTROY { + my $self = shift; + + # no specific DESTROY for now +} + +# +# from here on only private methods +# + +=head2 _remove_chains() + + Title : _remove_chains + Usage : + Function: Removes the chains attached to a Model. Tells the chains they + don't belong to this Model any more + Returns : + Args : + +=cut + +sub _remove_chains { + my ($self) = shift; + + $self->throw("use Entry methods pleae\n"); +} + + +=head2 _remove_entry() + + Title : _remove_entry + Usage : + Function: Removes the Entry this Model is atttached to. + Returns : + Args : + +=cut + +sub _remove_entry { + my ($self) = shift; + + $self->throw("use a method based on an Entry object\n"); +} + + +=head2 _create_default_chain() + + Title : _create_default_chain + Usage : + Function: Creates a default Chain for this Model. Typical situation + in an X-ray structure where there is only one chain + Returns : + Args : + +=cut + +sub _create_default_chain { + my ($self) = shift; + + my $chain = Bio::Structure::Chain->new(-id => "default"); +} + + +=head2 _grandparent() + + Title : _grandparent + Usage : + Function: get/set a symbolic reference to our grandparent + Returns : + Args : + +=cut + +sub _grandparent { + my($self,$symref) = @_; + + if (ref($symref)) { + $self->throw("Thou shall only pass strings in here, no references $symref\n"); + } + if (defined $symref) { + $self->{'grandparent'} = $symref; + } + return $self->{'grandparent'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/Residue.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/Residue.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,255 @@ +# $Id: Residue.pm,v 1.7 2002/10/22 07:38:44 lapp Exp $ +# +# bioperl module for Bio::Structure::Residue +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright Kris Boulez +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::Residue - Bioperl structure Object, describes a Residue + +=head1 SYNOPSIS + + #add synopsis here + +=head1 DESCRIPTION + +This object stores a Bio::Structure::Residue + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Structure::Residue; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Structure::Chain; +use Bio::Structure::Atom; +@ISA = qw(Bio::Root::Root); + + +=head2 new() + + Title : new() + Usage : $residue = Bio::Structure::Residue->new( + -id => 'human_id', + ); + + Function: Returns a new Bio::Structure::Residue object from basic + constructors. Probably most called from Bio::Structure::IO. + Returns : a new Bio::Structure::Residue object + +=cut + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($id, $atom ) = + $self->_rearrange([qw( + ID + ATOM + )], + @args); + + $id && $self->id($id); + + $self->{'atom'} = []; + + # the 'smallest' (and only) item that can be added to a residue is an atom + + $atom && $self->throw("add atoms via an Entry object entry->add_atom(residue,atom)\n"); + + return $self; +} + + + +=head2 atom() + + Title : atom + Usage : + Function: nothing usefull untill I get symbolic references to do what I want + Returns : + Args : + +=cut + +sub atom { + my ($self,$value) = @_; + + $self->throw("no code down here, go see an Entry object nearby\n"); +} + + +=head2 add_atom() + + Title : add_atom + Usage : + Function: nothing usefull untill I get symbolic references to do what I want + Returns : + Args : + +=cut + +sub add_atom { + my($self,$value) = @_; + + $self->throw("nothing here, use a method on an Entry object\n"); +} + + +=head2 chain() + + Title : chain + Usage : $chain = $residue->chain($chain) + Function: Sets the Chain this Residue belongs to + Returns : Returns the Chain this Residue belongs to + Args : reference to a Chain + +=cut + +sub chain { + my($self, $value) = @_; + + $self->throw("use an Entry based method please\n"); +} + + +=head2 id() + + Title : id + Usage : $residue->id("TRP-35") + Function: Gets/sets the ID for this residue + Returns : the ID + Args : the ID + +=cut + +sub id { + my ($self, $value) = @_;; + if (defined $value) { + $self->{'id'} = $value; + } + return $self->{'id'}; +} + + +=head2 DESTROY() + + Title : DESTROY + Usage : + Function: destructor ( get rid of circular references ) + Returns : + Args : + +=cut + +sub DESTROY { + my $self = shift; + + # no specific destruction for now +} + + +# +# from here on only private methods +# + +=head2 _remove_atoms() + + Title : _remove_atoms + Usage : + Function: Removes the atoms attached to a Residue. Tells the atoms they + don't belong to this Residue any more + Returns : + Args : + +=cut + +sub _remove_atoms { + my ($self) = shift; + + $self->throw("no code here\n"); +} + + +=head2 _remove_chain() + + Title : _remove_chain + Usage : + Function: Removes the Chain this Residue is atttached to. + Returns : + Args : + +=cut + +sub _remove_chain { + my ($self) = shift; + + $self->{'chain'} = undef; +} + + +=head2 _grandparent() + + Title : _grandparent + Usage : + Function: get/set a symbolic reference to our grandparent + Returns : + Args : + +=cut + +sub _grandparent { + my($self,$symref) = @_; + + if (ref($symref)) { + $self->throw("Thou shall only pass strings in here, no references $symref\n"); + } + if (defined $symref) { + $self->{'grandparent'} = $symref; + } + return $self->{'grandparent'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/SecStr/DSSP/Res.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/SecStr/DSSP/Res.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1319 @@ +# $id $ +# +# bioperl module for Bio::Structure::SecStr::DSSP::Res.pm +# +# Cared for by Ed Green <ed@compbio.berkeley.edu> +# +# Copyright Univ. of California +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::SecStr::DSSP::Res - Module for parsing/accessing dssp output + +=head1 SYNOPSIS + + my $dssp_obj = new Bio::Structure::SecStr::DSSP::Res( '-file' => 'filename.dssp' ); + + # or + + my $dssp-obj = new Bio::Structure::SecStr::DSSP::Res( '-fh' => \*STDOUT ); + + # get DSSP defined Secondary Structure for residue 20 + $sec_str = $dssp_obj->resSecStr( 20 ); + + # get dssp defined sec. structure summary for PDB residue # 10 of chain A + + $sec_str = $dssp_obj->resSecStrSum( '10:A' ); + +=head1 DESCRIPTION + +DSSP::Res is a module for objectifying DSSP output. Methods are then +available for extracting all the information within the output file +and convenient subsets of it. +The principal purpose of DSSP is to determine secondary structural +elements of a given structure. + + ( Dictionary of protein secondary structure: pattern recognition + of hydrogen-bonded and geometrical features. + Biopolymers. 1983 Dec;22(12):2577-637. ) + +The DSSP program is available from: + http://www.cmbi.kun.nl/swift/dssp + +This information is available on a per residue basis ( see resSecStr +and resSecStrSum methods ) or on a per chain basis ( see secBounds +method ). + +resSecStr() & secBounds() return one of the following: + 'H' = alpha helix + 'B' = residue in isolated beta-bridge + 'E' = extended strand, participates in beta ladder + 'G' = 3-helix (3/10 helix) + 'I' = 5 helix (pi helix) + 'T' = hydrogen bonded turn + 'S' = bend + '' = no assignment + +A more general classification is returned using the resSecStrSum() +method. The purpose of this is to have a method for DSSP and STRIDE +derived output whose range is the same. +Its output is one of the following: + + 'H' = helix ( => 'H', 'G', or 'I' from above ) + 'B' = beta ( => 'B' or 'E' from above ) + 'T' = turn ( => 'T' or 'S' from above ) + ' ' = no assignment ( => ' ' from above ) + +The methods are roughly divided into 3 sections: +1. Global features of this structure (PDB ID, total surface area, + etc.). These methods do not require an argument. +2. Residue specific features ( amino acid, secondary structure, + solvent exposed surface area, etc. ). These methods do require an + arguement. The argument is supposed to uniquely identify a + residue described within the structure. It can be of any of the + following forms: + ('#A:B') or ( #, 'A', 'B' ) + || | + || - Chain ID (blank for single chain) + |--- Insertion code for this residue. Blank for most residues. + |--- Numeric portion of residue ID. + + (#) + | + --- Numeric portion of residue ID. If there is only one chain and + it has no ID AND there is no residue with an insertion code at this + number, then this can uniquely specify a residue. + + ('#:C') or ( #, 'C' ) + | | + | -Chain ID + ---Numeric portion of residue ID. + + If a residue is incompletely specified then the first residue that + fits the arguments is returned. For example, if 19 is the argument + and there are three chains, A, B, and C with a residue whose number + is 19, then 19:A will be returned (assuming its listed first). + + Since neither DSSP nor STRIDE correctly handle alt-loc codes, they + are not supported by these modules. + +3. Value-added methods. Return values are not verbatem strings + parsed from DSSP or STRIDE output. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ed Green + +Email ed@compbio.berkeley.edu + + +=head1 APPENDIX + +The rest of the documentation details each method. +Internal methods are preceded with a _ + +=cut + +package Bio::Structure::SecStr::DSSP::Res; +use strict; +use vars qw(@ISA); +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::PrimarySeq; + +@ISA = qw(Bio::Root::Root); + +# Would be a class variable if Perl had them + + #attribute begin col # columns +our %lookUp = ( 'pdb_resnum' => [ 5, 5 ], + 'insertionco' => [ 10, 1 ], + 'pdb_chain' => [ 11, 1 ], + + 'amino_acid' => [ 13, 1 ], + 'term_sig' => [ 14, 1 ], + + 'ss_summary' => [ 16, 1 ], + '3tph' => [ 18, 1 ], + '4tph' => [ 19, 1 ], + '5tph' => [ 20, 1 ], + 'geo_bend' => [ 21, 1 ], + 'chirality' => [ 22, 1 ], + 'beta_br1la' => [ 23, 1 ], + 'beta_br2la' => [ 24, 1 ], + + 'bb_part1nu' => [ 25, 4 ], + 'bb_part2nu' => [ 29, 4 ], + 'betash_lab' => [ 33, 1 ], + + 'solv_acces' => [ 34, 4 ], + + 'hb1_nh_o_p' => [ 39, 6 ], + 'hb1_nh_o_e' => [ 46, 4 ], + + 'hb1_o_hn_p' => [ 50, 6 ], + 'hb1_o_hn_e' => [ 57, 4 ], + + 'hb2_nh_o_p' => [ 61, 6 ], + 'hb2_nh_o_e' => [ 68, 4 ], + + 'hb2_o_hn_p' => [ 72, 6 ], + 'hb2_o_hn_e' => [ 79, 4 ], + + 'tco' => [ 85, 6 ], + + 'kappa' => [ 91, 6 ], + + 'alpha' => [ 97, 6 ], + + 'phi' => [ 103, 6 ], + + 'psi' => [ 109, 6 ], + + 'x_ca' => [ 115, 7 ], + + 'y_ca' => [ 122, 7 ], + + 'z_ca' => [ 129, 7 ] ); + + +=head1 CONSTRUCTOR + + +=cut + + +=head2 new + + Title : new + Usage : makes new object of this class + Function : Constructor + Example : $dssp_obj = Bio::DSSP:Res->new( filename or FILEHANDLE ) + Returns : object (ref) + Args : filename ( must be proper DSSP output file ) + +=cut + +sub new { + my ( $class, @args ) = @_; + my $self = $class->SUPER::new( @args ); + my $io = Bio::Root::IO->new( @args ); + $self->_parse( $io->_fh() ); + $io->close(); + return $self; +} + +=head1 ACCESSORS + + +=cut + +# GLOBAL FEATURES / INFO / STATS + +=head2 totSurfArea + + Title : totSurfArea + Usage : returns total accessible surface area in square Ang. + Function : + Example : $surArea = $dssp_obj->totSurfArea(); + Returns : scalar + Args : none + +=cut + +sub totSurfArea { + my $self = shift; + return $self->{ 'Head' }->{ 'ProAccSurf' }; +} + +=head2 numResidues + + Title : numResidues + Usage : returns the total number of residues in all chains or + just the specified chain if a chain is specified + Function : + Example : $num_res = $dssp_obj->numResidues(); + Returns : scalar int + Args : none + + +=cut + +sub numResidues { + my $self = shift; + my $chain = shift; + if ( !( $chain ) ) { + return $self->{'Head'}->{'TotNumRes'}; + } + else { + my ( $num_res, + $cont_seg ); + my $cont_seg_pnt = $self->_contSegs(); + foreach $cont_seg ( @{ $cont_seg_pnt } ) { + if ( $chain eq $cont_seg->[ 2 ] ) { + # this segment is part of the chain we want + $num_res += ( $self->_toDsspKey( $cont_seg->[ 1 ] ) + - $self->_toDsspKey( $cont_seg->[ 0 ] ) + + 1 ); # this works because we know the + # the region between the start + # and end of a dssp key is + # continuous + } + } + return $num_res; + } +} + +# STRAIGHT FROM PDB ENTRY + +=head2 pdbID + + Title : pdbID + Usage : returns pdb identifier ( 1FJM, e.g.) + Function : + Example : $pdb_id = $dssp_obj->pdbID(); + Returns : scalar string + Args : none + + +=cut + +sub pdbID { + my $self = shift; + return $self->{'Head'}->{'PDB'}; +} + +=head2 pdbAuthor + + Title : pdbAuthor + Usage : returns author field + Function : + Example : $auth = $dssp_obj->pdbAuthor() + Returns : scalar string + Args : none + + +=cut + +sub pdbAuthor { + my $self = shift; + return $self->{'Head'}->{'AUTHOR'}; +} + +=head2 pdbCompound + + Title : pdbCompound + Usage : returns pdbCompound given in PDB file + Function : + Example : $cmpd = $dssp_obj->pdbCompound(); + Returns : scalar string + Args : none + + +=cut + +sub pdbCompound { + my $self = shift; + return $self->{'Head'}->{'COMPND'}; +} + +=head2 pdbDate + + Title : pdbDate + Usage : returns date given in PDB file + Function : + Example : $pdb_date = $dssp_obj->pdbDate(); + Returns : scalar + Args : none + + +=cut + +sub pdbDate { + my $self = shift; + return $self->{'Head'}->{'DATE'}; +} + +=head2 pdbHeader + + Title : pdbHeader + Usage : returns header info from PDB file + Function : + Example : $header = $dssp_obj->pdbHeader(); + Returns : scalar + Args : none + + +=cut + +sub pdbHeader { + my $self = shift; + return $self->{'Head'}->{'HEADER'}; +} + +=head2 pdbSource + + Title : pdbSource + Usage : returns pdbSource information from PDBSOURCE line + Function : + Example : $pdbSource = $dssp_obj->pdbSource(); + Returns : scalar + Args : none + + +=cut + +sub pdbSource { + my $self = shift; + return $self->{'Head'}->{'SOURCE'}; +} + + +# RESIDUE SPECIFIC ACCESSORS + +=head2 resAA + + Title : resAA + Usage : fetches the 1 char amino acid code, given an id + Function : + Example : $aa = $dssp_obj->aminoAcid( '20:A' ); # pdb id as arg + Returns : 1 character scalar string + Args : RESIDUE_ID + + +=cut + +sub resAA { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'amino_acid' }; +} + +=head2 resPhi + + Title : resPhi + Usage : returns phi angle of a single residue + Function : accessor + Example : $phi = $dssp_obj->resPhi( RESIDUE_ID ) + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resPhi { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'phi' }; +} + +=head2 resPsi + + Title : resPsi + Usage : returns psi angle of a single residue + Function : accessor + Example : $psi = $dssp_obj->resPsi( RESIDUE_ID ) + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resPsi { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'psi' }; +} + +=head2 resSolvAcc + + Title : resSolvAcc + Usage : returns solvent exposed area of this residue in + square Angstroms + Function : + Example : $solv_acc = $dssp_obj->resSolvAcc( RESIDUE_ID ); + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resSolvAcc { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'solv_acces' }; +} + +=head2 resSurfArea + + Title : resSurfArea + Usage : returns solvent exposed area of this residue in + square Angstroms + Function : + Example : $solv_acc = $dssp_obj->resSurfArea( RESIDUE_ID ); + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resSurfArea { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'solv_acces' }; +} + +=head2 resSecStr + + Title : resSecStr + Usage : $ss = $dssp_obj->resSecStr( RESIDUE_ID ); + Function : returns the DSSP secondary structural designation of this residue + Example : + Returns : a character ( 'B', 'E', 'G', 'H', 'I', 'S', 'T', or ' ' ) + Args : RESIDUE_ID + NOTE : The range of this method differs from that of the + resSecStr method in the STRIDE SecStr parser. That is because of the + slightly different format for STRIDE and DSSP output. The resSecStrSum + method exists to map these different ranges onto an identical range. + +=cut + +sub resSecStr { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + my $ss_char = $self->{ 'Res' }->[ $dssp_key ]->{ 'ss_summary' }; + return $ss_char if $ss_char; + return ' '; +} + + +=head2 resSecStrSum + + Title : resSecStrSum + Usage : $ss = $dssp_obj->resSecStrSum( $id ); + Function : returns what secondary structure group this residue belongs + to. One of: 'H': helix ( H, G, or I ) + 'B': beta ( B or E ) + 'T': turn ( T or S ) + ' ': none ( ' ' ) + This method is similar to resSecStr, but the information + it returns is less specific. + Example : + Returns : a character ( 'H', 'B', 'T', or ' ' ) + Args : dssp residue number of pdb residue identifier + + +=cut + +sub resSecStrSum { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + my $ss_char = $self->{ 'Res' }->[ $dssp_key ]->{ 'ss_summary' }; + if ( $ss_char eq 'H' || $ss_char eq 'G' || $ss_char eq 'I' ) { + return 'H'; + } + if ( $ss_char eq ' ' || !( $ss_char ) ) { + return ' '; + } + if ( $ss_char eq 'B' || $ss_char eq 'E' ) { + return 'B'; + } + else { + return 'T'; + } +} + +# DSSP SPECIFIC + +=head2 hBonds + + Title : hBonds + Usage : returns number of 14 different types of H Bonds + Function : + Example : $hb = $dssp_obj->hBonds + Returns : pointer to 14 element array of ints + Args : none + NOTE : The different type of H-Bonds reported are, in order: + TYPE O(I)-->H-N(J) + IN PARALLEL BRIDGES + IN ANTIPARALLEL BRIDGES + TYPE O(I)-->H-N(I-5) + TYPE O(I)-->H-N(I-4) + TYPE O(I)-->H-N(I-3) + TYPE O(I)-->H-N(I-2) + TYPE O(I)-->H-N(I-1) + TYPE O(I)-->H-N(I+0) + TYPE O(I)-->H-N(I+1) + TYPE O(I)-->H-N(I+2) + TYPE O(I)-->H-N(I+3) + TYPE O(I)-->H-N(I+4) + TYPE O(I)-->H-N(I+5) + +=cut + +sub hBonds { + my $self = shift; + return $self->{ 'HBond'}; +} + +=head2 numSSBr + + Title : numSSBr + Usage : returns info about number of SS-bridges + Function : + Example : @SS_br = $dssp_obj->numSSbr(); + Returns : 3 element scalar int array + Args : none + + +=cut + +sub numSSBr { + my $self = shift; + return ( $self->{'Head'}->{'TotSSBr'}, + $self->{'Head'}->{'TotIaSSBr'}, + $self->{'Head'}->{'TotIeSSBr'} ); +} + +=head2 resHB_O_HN + + Title : resHB_O_HN + Usage : returns pointer to a 4 element array + consisting of: relative position of binding + partner #1, energy of that bond (kcal/mol), + relative positionof binding partner #2, + energy of that bond (kcal/mol). If the bond + is not bifurcated, the second bond is reported + as 0, 0.0 + Function : accessor + Example : $oBonds_ptr = $dssp_obj->resHB_O_HN( RESIDUE_ID ) + Returns : pointer to 4 element array + Args : RESIDUE_ID + + +=cut + +sub resHB_O_HN { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return ( $self->{ 'Res' }->[ $dssp_key ]->{ 'hb1_o_hn_p' }, + $self->{ 'Res' }->[ $dssp_key ]->{ 'hb1_o_hn_e' }, + $self->{ 'Res' }->[ $dssp_key ]->{ 'hb2_o_hn_p' }, + $self->{ 'Res' }->[ $dssp_key ]->{ 'hb2_o_hn_e' } ); +} + + +=head2 resHB_NH_O + + Title : resHB_NH_O + Usage : returns pointer to a 4 element array + consisting of: relative position of binding + partner #1, energy of that bond (kcal/mol), + relative positionof binding partner #2, + energy of that bond (kcal/mol). If the bond + is not bifurcated, the second bond is reported + as 0, 0.0 + Function : accessor + Example : $nhBonds_ptr = $dssp_obj->resHB_NH_O( RESIDUE_ID ) + Returns : pointer to 4 element array + Args : RESIDUE_ID + + +=cut + +sub resHB_NH_O { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return ( $self->{ 'Res' }->[ $dssp_key ]->{ 'hb1_nh_o_p' }, + $self->{ 'Res' }->[ $dssp_key ]->{ 'hb1_nh_o_e' }, + $self->{ 'Res' }->[ $dssp_key ]->{ 'hb2_nh_o_p' }, + $self->{ 'Res' }->[ $dssp_key ]->{ 'hb2_nh_o_e' } ); +} + + +=head2 resTco + + Title : resTco + Usage : returns tco angle around this residue + Function : accessor + Example : resTco = $dssp_obj->resTco( RESIDUE_ID ) + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resTco { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'tco' }; +} + + +=head2 resKappa + + Title : resKappa + Usage : returns kappa angle around this residue + Function : accessor + Example : $kappa = $dssp_obj->resKappa( RESIDUE_ID ) + Returns : scalar + Args : RESIDUE_ID ( dssp or PDB ) + + +=cut + +sub resKappa { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'kappa' }; +} + + +=head2 resAlpha + + Title : resAlpha + Usage : returns alpha angle around this residue + Function : accessor + Example : $alpha = $dssp_obj->resAlpha( RESIDUE_ID ) + Returns : scalar + Args : RESIDUE_ID ( dssp or PDB ) + + +=cut + +sub resAlpha { + my $self = shift; + my @args = @_; + my $dssp_key = $self->_toDsspKey( @args ); + return $self->{ 'Res' }->[ $dssp_key ]->{ 'alpha' }; +} + +# VALUE ADDED METHODS (NOT JUST PARSE/REPORT) + +=head2 secBounds + + Title : secBounds + Usage : gets residue ids of boundary residues in each + contiguous secondary structural element of specified + chain + Function : returns pointer to array of 3 element arrays. First + two elements are the PDB IDs of the start and end points, + respectively and inclusively. The last element is the + DSSP secondary structural assignment code, + i.e. one of : ('B', 'E', 'G', 'H', 'I', 'S', 'T', or ' ') + Example : $ss_elements_pts = $dssp_obj->secBounds( 'A' ); + Returns : pointer to array of arrays + Args : chain id ( 'A', for example ). No arg => no chain id + + +=cut + +sub secBounds { + my $self = shift; + my $chain = shift; + my %sec_bounds; + + $chain = '-' if ( !( $chain ) || $chain eq ' ' || $chain eq '-' ); + + # if we've memoized this chain, use that + if ( $self->{ 'SecBounds' } ) { + # check to make sure chain is valid + if ( !( $self->{ 'SecBounds' }->{ $chain } ) ) { + $self->throw( "No such chain: $chain\n" ); + } + return $self->{ 'SecBounds' }->{ $chain }; + } + + my ( $cur_element, $i, $cur_chain, $beg, ); + + #initialize + $cur_element = $self->{ 'Res' }->[ 1 ]->{ 'ss_summary' }; + $beg = 1; + + for ( $i = 2; $i <= $self->_numResLines() - 1; $i++ ) { + if ( $self->{ 'Res' }->[ $i ]->{ 'amino_acid' } eq '!' ) { + # element is terminated by a chain discontinuity + push( @{ $sec_bounds{ $self->_pdbChain( $beg ) } }, + [ $self->_toPdbId( $beg ), + $self->_toPdbId( $i - 1 ), + $cur_element ] ); + $i++; + $beg = $i; + $cur_element = $self->{ 'Res' }->[ $i ]->{ 'ss_summary' }; + } + + elsif ( $self->{ 'Res' }->[ $i ]->{ 'ss_summary' } ne $cur_element ) { + # element is terminated by beginning of a new element + push( @{ $sec_bounds{ $self->_pdbChain( $beg ) } }, + [ $self->_toPdbId( $beg ), + $self->_toPdbId( $i - 1 ), + $cur_element ] ); + $beg = $i; + $cur_element = $self->{ 'Res' }->[ $i ]->{ 'ss_summary' }; + } + } + #last residue + if ( $self->{ 'Res' }->[ $i ]->{ 'ss_summary' } eq $cur_element ) { + push( @{ $sec_bounds{ $self->_pdbChain( $beg ) } }, + [ $self->_toPdbId( $beg ), + $self->_toPdbId( $i ), + $cur_element ] ); + } + + else { + push( @{ $sec_bounds{ $self->_pdbChain( $beg ) } }, + [ $self->_toPdbId( $beg ), + $self->_toPdbId( $i - 1 ), + $cur_element ] ); + push( @{ $sec_bounds{ $self->_pdbChain( $i ) } }, + [ $self->_toPdbId( $i ), + $self->_toPdbId( $i ), + $self->{ 'Res' }->[ $i ]->{ 'ss_summary' } ] ); + } + + $self->{ 'SecBounds' } = \%sec_bounds; + + # check to make sure chain is valid + if ( !( $self->{ 'SecBounds' }->{ $chain } ) ) { + $self->throw( "No such chain: $chain\n" ); + } + + return $self->{ 'SecBounds' }->{ $chain }; +} + + + +=head2 chains + + Title : chains + Usage : returns pointer to array of chain I.D.s (characters) + Function : + Example : $chains_pnt = $dssp_obj->chains(); + Returns : array of characters, one of which may be ' ' + Args : none + + +=cut + +sub chains { + my $self = shift; + my $cont_segs = $self->_contSegs(); + my %chains; + my $seg; + foreach $seg ( @{ $cont_segs } ) { + $chains{ $seg->[ 2 ] } = 1; + } + my @chains = keys( %chains ); + return \@chains; +} + + +=head2 getSeq + + Title : getSeq + Usage : returns a Bio::PrimarySeq object which represents a good + guess at the sequence of the given chain + Function : For most chains of most entries, the sequence returned by + this method will be very good. However, it is inherently + unsafe to rely on DSSP to extract sequence information about + a PDB entry. More reliable information can be obtained from + the PDB entry itself. + Example : $pso = $dssp_obj->getSeq( 'A' ); + Returns : (pointer to) a PrimarySeq object + Args : Chain identifier. If none given, ' ' is assumed. If no ' ' + chain, the first chain is used. + + +=cut + +sub getSeq { + my $self = shift; + my $chain = shift; + + my ( $pot_chain, + $seq, + $frag_num, + $frag, + $curPdbNum, + $lastPdbNum, + $gap_len, + $i, + $id, + ); + my @frags; + + if ( !( $chain ) ) { + $chain = ' '; + } + + if ( $self->{ 'Seq' }->{ $chain } ) { + return $self->{ 'Seq' }->{ $chain }; + } + + my $contSegs_pnt = $self->_contSegs(); + + # load up specified chain + foreach $pot_chain ( @{ $contSegs_pnt } ) { + if ( $pot_chain->[ 2 ] eq $chain ) { + push( @frags, $pot_chain ); + } + } + + # if that didn't work, just get the first one + if ( !( @frags ) ) { + $chain = $contSegs_pnt->[ 0 ]->[ 2 ]; + foreach $pot_chain ( @{ $contSegs_pnt } ) { + if ( $pot_chain->[ 2 ] eq $chain ) { + push( @frags, $chain ); + } + } + } + + # now build the sequence string + $seq = ""; + $frag_num = 0; + foreach $frag ( @frags ) { + $frag_num++; + if ( $frag_num > 1 ) { # we need to put in some gap seq + $curPdbNum = $self->_pdbNum( $frag->[ 0 ] ); + $gap_len = $curPdbNum - $lastPdbNum - 1; + if ( $gap_len > 0 ) { + $seq .= 'u' x $gap_len; + } + else { + $seq .= 'u'; + } + } + for ( $i = $frag->[ 0 ]; $i <= $frag->[ 1 ]; $i++ ) { + $seq .= $self->_resAA( $i ); + } + $lastPdbNum = $self->_pdbNum( $i - 1 ); + } + + + + $id = $self->pdbID(); + $id .= ":$chain"; + + $self->{ 'Seq' }->{ $chain } = Bio::PrimarySeq->new ( -seq => $seq, + -id => $id, + -moltype => 'protein' + ); + return $self->{ 'Seq' }->{ $chain }; +} + +=head1 INTERNAL METHODS + + +=cut + +=head2 _pdbChain + + Title : _pdbChain + Usage : returns the pdb chain id of given residue + Function : + Example : $chain_id = $dssp_obj->pdbChain( DSSP_KEY ); + Returns : scalar + Args : DSSP_KEY ( dssp or pdb ) + + +=cut + +sub _pdbChain { + my $self = shift; + my $dssp_key = shift; + return $self->{ 'Res' }->[ $dssp_key ]->{ 'pdb_chain' }; +} + +=head2 _resAA + + Title : _resAA + Usage : fetches the 1 char amino acid code, given a dssp id + Function : + Example : $aa = $dssp_obj->_resAA( dssp_id ); + Returns : 1 character scalar string + Args : dssp_id + + +=cut + +sub _resAA { + my $self = shift; + my $dssp_key = shift; + return $self->{ 'Res' }->[ $dssp_key ]->{ 'amino_acid' }; +} + + +=head2 _pdbNum + + Title : _pdbNum + Usage : fetches the numeric portion of the identifier for a given + residue as reported by the pdb entry. Note, this DOES NOT + uniquely specify a residue. There may be an insertion code + and/or chain identifier differences. + Function : + Example : $pdbNum = $self->_pdbNum( DSSP_ID ); + Returns : a scalar + Args : DSSP_ID + + +=cut + +sub _pdbNum { + my $self = shift; + my $dssp_key = shift; + return $self->{ 'Res' }->[ $dssp_key ]->{ 'pdb_resnum' }; +} + +=head2 _pdbInsCo + + Title : _pdbInsCo + Usage : fetches the Insertion Code for this residue, if it has one. + Function : + Example : $pdbNum = $self->_pdbInsCo( DSSP_ID ); + Returns : a scalar + Args : DSSP_ID + + +=cut + +sub _pdbInsCo { + my $self = shift; + my $dssp_key = shift; + return $self->{ 'Res' }->[ $dssp_key ]->{ 'insertionco' }; +} + +=head2 _toPdbId + + Title : _toPdbId + Usage : Takes a dssp key and builds the corresponding + PDB identifier string + Function : + Example : $pdbId = $self->_toPdbId( DSSP_ID ); + Returns : scalar + Args : DSSP_ID + +=cut + +sub _toPdbId { + my $self = shift; + my $dssp_key = shift; + my $pdbId = ( $self->_pdbNum( $dssp_key ). + $self->_pdbInsCo( $dssp_key ) ); + my $chain = $self->_pdbChain( $dssp_key ); + $pdbId = "$pdbId:$chain" if $chain; + return $pdbId; +} + +=head2 _contSegs + + Title : _contSegs + Usage : find the endpoints of continuous regions of this structure + Function : returns pointer to array of 3 element array. + Elements are the dssp keys of the start and end points of each + continuous element and its PDB chain id (may be blank). + Note that it is common to have several + continuous elements with the same chain id. This occurs + when an internal region is disordered and no structural + information is available. + Example : $cont_seg_ptr = $dssp_obj->_contSegs(); + Returns : pointer to array of arrays + Args : none + + +=cut + +sub _contSegs { + my $self = shift; + if ( $self->{ 'contSegs' } ) { + return $self->{ 'contSegs' }; + } + else { + # first time, so make contSegs + my ( $cur_chain, $i, $beg ); + my @contSegs; + #initialize + $cur_chain = $self->_pdbChain( 1 ); + $beg = 1; + #internal residues + for ( $i = 2; $i <= $self->_numResLines() - 1; $i++ ) { + if ( $self->{ 'Res' }->[ $i ]->{ 'amino_acid' } eq '!' ) { + push( @contSegs, [ $beg, $i - 1, $cur_chain ] ); + $beg = $i + 1; + $cur_chain = $self->_pdbChain( $i + 1 ); + } + } + # last residue must be the end of a chain + push( @contSegs, [ $beg, $i, $cur_chain ] ); + + $self->{ 'contSegs' } = \@contSegs; + return $self->{ 'contSegs' }; + } +} + +=head2 _numResLines + + Title : _numResLines + Usage : returns the total number of residue lines in this + dssp file. + This number is DIFFERENT than the number of residues in + the pdb file because dssp has chain termination and chain + discontinuity 'residues'. + Function : + Example : $num_res = $dssp_obj->_numResLines(); + Returns : scalar int + Args : none + + +=cut + +sub _numResLines { + my $self = shift; + return ( $#{$self->{ 'Res' }} ); +} + +=head2 _toDsspKey + + Title : _toDsspKey + Usage : returns the unique dssp integer key given a pdb residue id. + All accessor methods require (internally) + the dssp key. This method is very useful in converting + pdb keys to dssp keys so the accessors can accept pdb keys + as argument. PDB Residue IDs are inherently + problematic since they have multiple parts of + overlapping function and ill-defined or observed + convention in form. Input can be in any of the formats + described in the DESCRIPTION section above. + Function : + Example : $dssp_id = $dssp_obj->_pdbKeyToDsspKey( '10B:A' ) + Returns : scalar int + Args : pdb residue identifier: num[insertion code]:[chain] + + +=cut + +sub _toDsspKey { + # Consider adding lookup table for 'common' name (like 20:A) for + # fast access. Could be built during parse of input. + + my $self = shift; + my $arg_str; + + my ( $key_num, $chain_id, $ins_code ); + + # check to see how many args are given + if ( $#_ > 1 ) { # multiple args + $key_num = shift; + if ( $#_ > 1 ) { # still multiple args => ins. code, too + $ins_code = shift; + $chain_id = shift; + } + else { # just one more arg. => chain_id + $chain_id = shift; + } + } + else { # only single arg. Might be number or string + $arg_str = shift; + if ( $arg_str =~ /:/ ) { + # a chain is specified + ( $chain_id ) = ( $arg_str =~ /:(.)/); + $arg_str =~ s/:.//; + } + if ( $arg_str =~ /[A-Z]|[a-z]/ ) { + # an insertion code is specified + ( $ins_code ) = ( $arg_str =~ /([A-Z]|[a-z])/ ); + $arg_str =~ s/[A-Z]|[a-z]//g; + } + #now, get the number bit-> everything still around + $key_num = $arg_str; + } + + # Now find the residue which fits this description. Linear search is + # probably not the best way to do this, but oh well... + for ( my $i = 1; $i <= $self->_numResLines(); $i++ ) { + if ( $key_num == $self->{'Res'}->[$i]->{'pdb_resnum'} ) { + if ( $chain_id ) { # if a chain was specified + if ( $chain_id eq $self->{'Res'}->[$i]->{'pdb_chain'} ) { + # and it's the right one + if ( $ins_code ) { # if insertion code was specified + if ( $ins_code eq $self->{'Res'}->[$i]->{'insertionco'} ) { + # and it's the right one + return $i; + } + } + else { # no isertion code specified, this is it + return $i; + } + } + } + else { # no chain was specified + return $i; + } + } + } + $self->throw( "PDB key not found." ); +} + +=head2 _parse + + Title : _parse + Usage : parses dssp output + Function : + Example : used by the constructor + Returns : + Args : input source ( handled by Bio::Root:IO ) + + +=cut + +sub _parse { + my $self = shift; + my $file = shift; + my $cur; + my $current_chain; + my ( @elements, @hbond ); + my ( %head, %his, ); + my $element; + my $res_num; + + $cur = <$file>; + unless ( $cur =~ /^==== Secondary Structure Definition/ ) { + $self->throw( "Not dssp output" ); + return; + } + + $cur = <$file>; + ( $element ) = ( $cur =~ /^REFERENCE\s+(.+?)\s+\./ ); + $head{ 'REFERENCE' } = $element; + + $cur = <$file>; + @elements = split( /\s+/, $cur ); + pop( @elements ); # take off that annoying period + $head{ 'PDB' } = pop( @elements ); + $head{ 'DATE' } = pop( @elements ); + # now, everything else is "header" except for the word + # HEADER + shift( @elements ); + $element = shift( @elements ); + while ( @elements ) { + $element = $element." ".shift( @elements ); + } + $head{ 'HEADER' } = $element; + + $cur = <$file>; + ($element) = ( $cur =~ /^COMPND\s+(.+?)\s+\./ ); + $head{ 'COMPND' } = $element; + + $cur = <$file>; + ($element) = ( $cur =~ /^PDBSOURCE\s+(.+?)\s+\./ ); + $head{ 'SOURCE' } = $element; + + $cur = <$file>; + ($element) = ( $cur =~ /^AUTHOR\s+(.+?)\s+/ ); + $head{ 'AUTHOR' } = $element; + + $cur = <$file>; + @elements = split( /\s+/, $cur ); + shift( @elements ); + $head{ 'TotNumRes' } = shift( @elements ); + $head{ 'NumChain' } = shift( @elements ); + $head{ 'TotSSBr' } = shift( @elements ); + $head{ 'TotIaSSBr' } = shift( @elements ); + $head{ 'TotIeSSBr' } = shift( @elements ); + + $cur = <$file>; + ( $element ) = ( $cur =~ /\s*(\d+\.\d*)\s+ACCESSIBLE SURFACE OF PROTEIN/ ); + $head{ 'ProAccSurf' } = $element; + $self->{ 'Head' } = \%head; + + for ( my $i = 1; $i <= 14; $i++ ) { + $cur = <$file>; + ( $element ) = + $cur =~ /\s*(\d+)\s+\d+\.\d+\s+TOTAL NUMBER OF HYDROGEN/; + push( @hbond, $element ); +# $hbond{ $hBondType } = $element; + } + $self->{ 'HBond' } = \@hbond; + + my $histogram_finished = 0; + while ( !($histogram_finished) && chomp( $cur = <$file> ) ) { + if ( $cur =~ /RESIDUE AA STRUCTURE/ ) { + $histogram_finished = 1; + } + } + + while ( chomp( $cur = <$file> ) ) { + $res_num = substr( $cur, 0, 5 ); + $res_num =~ s/\s//g; + $self->{ 'Res' }->[ $res_num ] = &_parseResLine( $cur ); + } +} + + +=head2 _parseResLine + + Title : _parseResLine + Usage : parses a single residue line + Function : + Example : used internally + Returns : + Args : residue line ( string ) + + +=cut + +sub _parseResLine() { + my $cur = shift; + my ( $feat, $value ); + my %elements; + + foreach $feat ( keys %lookUp ) { + $value = substr( $cur, $lookUp{ $feat }->[0], + $lookUp{ $feat }->[1] ); + $value =~ s/\s//g; + $elements{$feat} = $value ; + } + + # if no chain id, make it '-' (like STRIDE...very convenient) + if ( !( $elements{ 'pdb_chain' } ) || $elements{ 'pdb_chain'} eq ' ' ) { + $elements{ 'pdb_chain' } = '-'; + } + return \%elements; +} + +return 1; #just because + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/SecStr/STRIDE/Res.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/SecStr/STRIDE/Res.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1112 @@ +# $id $ +# +# bioperl module for Bio::Structure::SecStr::STRIDE::Res.pm +# +# Cared for by Ed Green <ed@compbio.berkeley.edu> +# +# Copyright Univ. of California +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +=head1 NAME + +Bio::Structure::SecStr::STRIDE::Res - Module for parsing/accessing stride output + +=head1 SYNOPSIS + + my $stride_obj = new Bio::Structure::SecStr::STRIDE::Res( '-file' => 'filename.stride' ); + + # or + + my $stride_obj = new Bio::Structure::SecStr::STRIDE::Res( '-fh' => \*STDOUT ); + + # Get secondary structure assignment for PDB residue 20 of chain A + $sec_str = $stride_obj->resSecStr( '20:A' ); + + # same + $sec_str = $stride_obj->resSecStr( 20, 'A' ) + +=head1 DESCRIPTION + +STRIDE::Res is a module for objectifying STRIDE output. STRIDE is a +program (similar to DSSP) for assigning secondary structure to +individual residues of a pdb structure file. + + ( Knowledge-Based Protein Secondary Structure Assignment, + PROTEINS: Structure, Function, and Genetics 23:566-579 (1995) ) + +STRIDE is available here: +http://www.embl-heidelberg.de/argos/stride/down_stride.html + +Methods are then available for extracting all of the infomation +present within the output or convenient subsets of it. + +Although they are very similar in function, DSSP and STRIDE differ +somewhat in output format. Thes differences are reflected in the +return value of some methods of these modules. For example, both +the STRIDE and DSSP parsers have resSecStr() methods for returning +the secondary structure of a given residue. However, the range of +return values for DSSP is ( H, B, E, G, I, T, and S ) whereas the +range of values for STRIDE is ( H, G, I, E, B, b, T, and C ). See +individual methods for details. + +The methods are roughly divided into 3 sections: + + 1. Global features of this structure (PDB ID, total surface area, + etc.). These methods do not require an argument. + 2. Residue specific features ( amino acid, secondary structure, + solvent exposed surface area, etc. ). These methods do require an + arguement. The argument is supposed to uniquely identify a + residue described within the structure. It can be of any of the + following forms: + ('#A:B') or ( #, 'A', 'B' ) + || | + || - Chain ID (blank for single chain) + |--- Insertion code for this residue. Blank for most residues. + |--- Numeric portion of residue ID. + + (#) + | + --- Numeric portion of residue ID. If there is only one chain and + it has no ID AND there is no residue with an insertion code at this + number, then this can uniquely specify a residue. + + ('#:C') or ( #, 'C' ) + | | + | -Chain ID + ---Numeric portion of residue ID. + + If a residue is incompletely specified then the first residue that + fits the arguments is returned. For example, if 19 is the argument + and there are three chains, A, B, and C with a residue whose number + is 19, then 19:A will be returned (assuming its listed first). + + Since neither DSSP nor STRIDE correctly handle alt-loc codes, they + are not supported by these modules. + + 3. Value-added methods. Return values are not verbatem strings + parsed from DSSP or STRIDE output. + +=head1 FEEDBACK + +=head2 MailingLists + +UsUser feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ed Green + +Email ed@compbio.berkeley.edu + + +=head1 APPENDIX + +The Rest of the documentation details each method. +Internal methods are preceded with a _. + + +=cut + +package Bio::Structure::SecStr::STRIDE::Res; +use strict; +use vars qw(@ISA); +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::PrimarySeq; + +@ISA = qw(Bio::Root::Root); + +our %ASGTable = ( 'aa' => 0, + 'resNum' => 1, + 'ssAbbr' => 2, + 'ssName' => 3, + 'phi' => 4, + 'psi' => 5, + 'surfArea' => 6 ); + +our %AATable = ( 'ALA' => 'A', 'ARG' => 'R', 'ASN' => 'N', + 'ASP' => 'D', 'CYS' => 'C', 'GLN' => 'Q', + 'GLU' => 'E', 'GLY' => 'G', 'HIS' => 'H', + 'ILE' => 'I', 'LEU' => 'L', 'LYS' => 'K', + 'MET' => 'M', 'PHE' => 'F', 'PRO' => 'P', + 'SER' => 'S', 'THR' => 'T', 'TRP' => 'W', + 'TYR' => 'Y', 'VAL' => 'V' ); + +=head2 new + + Title : new + Usage : makes new object of this class + Function : Constructor + Example : $stride_obj = Bio::Structure::SecStr::STRIDE:Res->new( '-file' => filename + # or + '-fh' => FILEHANDLE ) + Returns : object (ref) + Args : filename or filehandle( must be proper STRIDE output ) + +=cut + +sub new { + my ( $class, @args ) = @_; + my $self = $class->SUPER::new( @args ); + my $io = Bio::Root::IO->new( @args ); + $self->_parse( $io ); # not passing filehandle ! + $io->close(); + return $self; +} + +# GLOBAL FEATURES / INFO / STATS + +=head2 totSurfArea + + Title : totSurfArea + Usage : returns sum of surface areas of all residues of all + chains considered. Result is memoized. + Function : + Example : $tot_SA = $stride_obj->totSurfArea(); + Returns : scalar + Args : none + + +=cut + +sub totSurfArea { + my $self = shift; + my $total = 0; + my ( $chain, $res ); + + if ( $self->{ 'SurfArea' } ) { + return $self->{ 'SurfArea' }; + } + else { + foreach $chain ( keys %{$self->{ 'ASG' }} ) { + for ( my $i = 1; $i <= $#{$self->{'ASG'}->{$chain}}; $i++ ) { + $total += + $self->{'ASG'}->{$chain}->[$i]->[$ASGTable{'surfArea'}]; + } + } + } + + $self->{ 'SurfArea' } = $total; + return $self->{ 'SurfArea' }; + +} + +=head2 numResidues + + Title : numResidues + Usage : returns total number of residues in all chains or + just the specified chain + Function : + Example : $tot_res = $stride_obj->numResidues(); + Returns : scalar int + Args : none or chain id + + +=cut + +sub numResidues { + my $self = shift; + my $chain = shift; + my $total = 0; + my $key; + foreach $key ( keys %{$self->{ 'ASG' }} ) { + if ( $chain ) { + if ( $key eq $chain ) { + $total += $#{$self->{ 'ASG' }{ $key }}; + } + } + else { + $total += $#{$self->{ 'ASG' }{ $key }}; + } + } + return $total; +} + +# STRAIGHT FROM THE PDB ENTRY + +=head2 pdbID + + Title : pdbID + Usage : returns pdb identifier ( 1FJM, e.g. ) + Function : + Example : $pdb_id = $stride_obj->pdbID(); + Returns : scalar string + Args : none + + +=cut + +sub pdbID { + my $self = shift; + return $self->{ 'PDB' }; +} +=head2 pdbAuthor + + Title : pdbAuthor + Usage : returns author of this PDB entry + Function : + Example : $auth = $stride_obj->pdbAuthor() + Returns : scalar string + Args : none + + +=cut + +sub pdbAuthor { + my $self = shift; + return join( ' ', @{ $self->{ 'HEAD' }->{ 'AUT' } } ); +} + +=head2 pdbCompound + + Title : pdbCompound + Usage : returns string of what was found on the + CMP lines + Function : + Example : $cmp = $stride_obj->pdbCompound(); + Returns : string + Args : none + + +=cut + +sub pdbCompound { + my $self = shift; + return join( ' ', @{ $self->{ 'HEAD' }->{ 'CMP' } } ); +} + +=head2 pdbDate + + Title : pdbDate + Usage : returns date given in PDB file + Function : + Example : $pdb_date = $stride_obj->pdbDate(); + Returns : scalar + Args : none + + +=cut + +sub pdbDate { + my $self = shift; + return $self->{ 'DATE' }; +} + +=head2 pdbHeader + + Title : pdbHeader + Usage : returns string of characters found on the PDB header line + Function : + Example : $head = $stride_obj->pdbHeader(); + Returns : scalar + Args : none + + +=cut + +sub pdbHeader { + my $self = shift; + return $self->{ 'HEAD' }->{ 'HEADER' }; +} + +=head2 pdbSource + + Title : pdbSource + Usage : returns string of what was found on SRC lines + Function : + Example : $src = $stride_obj->pdbSource(); + Returns : scalar + Args : none + + +=cut + +sub pdbSource { + my $self = shift; + return join( ' ', @{ $self->{ 'HEAD' }->{ 'SRC' } } ); +} + +# RESIDUE SPECIFIC ACCESSORS + +=head2 resAA + + Title : resAA + Usage : returns 1 letter abbr. of the amino acid specified by + the arguments + Function : + Examples : $aa = $stride_obj->resAA( RESIDUE_ID ); + Returns : scalar character + Args : RESIDUE_ID + + +=cut + +sub resAA { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return ( $AATable{$self->{'ASG'}->{$chain}->[$ord]->[$ASGTable{'aa'}]} ); +} + +=head2 resPhi + + Title : resPhi + Usage : returns phi angle of specified residue + Function : + Example : $phi = $stride_obj->resPhi( RESIDUE_ID ); + Returns : scaler + Args : RESIDUE_ID + + +=cut + +sub resPhi { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'phi' } ]; +} + +=head2 resPsi + + Title : resPsi + Usage : returns psi angle of specified residue + Function : + Example : $psi = $stride_obj->resPsi( RESIDUE_ID ); + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resPsi { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'psi' } ]; +} + +=head2 resSolvAcc + + Title : resSolvAcc + Usage : returns stride calculated surface area of specified residue + Function : + Example : $sa = $stride_obj->resSolvAcc( RESIDUE_ID ); + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resSolvAcc { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'surfArea' } ]; +} + +=head2 resSurfArea + + Title : resSurfArea + Usage : returns stride calculated surface area of specified residue + Function : + Example : $sa = $stride_obj->resSurfArea( RESIDUE_ID ); + Returns : scalar + Args : RESIDUE_ID + + +=cut + +sub resSurfArea { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'surfArea' } ]; +} + +=head2 resSecStr + + Title : resSecStr + Usage : gives one letter abbr. of stride determined secondary + structure of specified residue + Function : + Example : $ss = $stride_obj->resSecStr( RESIDUE_ID ); + Returns : one of: 'H' => Alpha Helix + 'G' => 3-10 helix + 'I' => PI-helix + 'E' => Extended conformation + 'B' or 'b' => Isolated bridge + 'T' => Turn + 'C' => Coil + ' ' => None + # NOTE: This range is slightly DIFFERENT from the + # DSSP method of the same name + Args : RESIDUE_ID + + +=cut + +sub resSecStr { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'ssAbbr' } ]; +} + +=head2 resSecStrSum + + Title : resSecStrSum + Usage : gives one letter summary of secondary structure of + specified residue. More general than secStruc() + Function : + Example : $ss_sum = $stride_obj->resSecStrSum( RESIDUE_ID ); + Returns : one of: 'H' (helix), 'B' (beta), 'T' (turn), or 'C' (coil) + Args : residue identifier(s) ( SEE INTRO NOTE ) + + +=cut + +sub resSecStrSum { + my $self = shift; + my @args = @_; + my $ss_char = $self->resSecStr( @args ); + + if ( $ss_char eq 'H' || $ss_char eq 'G' || $ss_char eq 'I' ) { + return 'H'; + } + if ( $ss_char eq 'E' || $ss_char eq 'B' || $ss_char eq 'b' ) { + return 'B'; + } + if ( $ss_char eq 'T' ) { + return 'T'; + } + else { + return 'C'; + } +} + +# STRIDE SPECIFIC + +=head2 resSecStrName + + Title : resSecStrName + Usage : gives full name of the secondary structural element + classification of the specified residue + Function : + Example : $ss_name = $stride_obj->resSecStrName( RESIDUE_ID ); + Returns : scalar string + Args : RESIDUE_ID + + +=cut + +sub resSecStrName { + my $self = shift; + my @args = @_; + my ( $ord, $chain ) = $self->_toOrdChain( @args ); + return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'ssName' } ]; +} + +=head2 strideLocs + + Title : strideLocs + Usage : returns stride determined contiguous secondary + structural elements as specified on the LOC lines + Function : + Example : $loc_pnt = $stride_obj->strideLocs(); + Returns : pointer to array of 5 element arrays. + 0 => stride name of structural element + 1 => first residue pdb key (including insertion code, if app.) + 2 => first residue chain id + 3 => last residue pdb key (including insertion code, if app.) + 4 => last residue chain id + NOTE the differences between this range and the range of SecBounds() + Args : none + + +=cut + +sub strideLocs { + my $self = shift; + return $self->{ 'LOC' }; +} + +# VALUE ADDED METHODS (NOT JUST PARSE/REPORT) + +=head2 secBounds + + Title : secBounds + Usage : gets residue ids of boundary residues in each + contiguous secondary structural element of specified + chain + Function : + Example : $ss_bound_pnt = $stride_obj->secBounds( 'A' ); + Returns : pointer to array of 3 element arrays. First two elements + are the PDB IDs of the start and end points, respectively + and inclusively. The last element is the STRIDE secondary + structural element code (same range as resSecStr). + Args : chain identifier ( one character ). If none, '-' is assumed + + +=cut + +sub secBounds { + # Requires a chain name. If left blank, we assume ' ' which equals '-' + my $self = shift; + my $chain = shift; + my @SecBounds; + + $chain = '-' if ( !( $chain ) || $chain eq ' ' || $chain eq '-' ); + + # if we've memoized this one, use that + if ( $self->{ 'SecBounds' }->{ $chain } ) { + return $self->{ 'SecBounds' }->{ $chain }; + } + + #check to make sure chain is valid + if ( !( $self->{ 'ASG' }->{ $chain } ) ) { + $self->throw( "No such chain: $chain\n" ); + } + + my $cur_element = $self->{ 'ASG' }->{ $chain }->[ 1 ]-> + [ $ASGTable{ 'ssAbbr' } ]; + my $beg = 1; + my $i; + + for ( $i = 2; $i <= $#{$self->{'ASG'}->{$chain}}; $i++ ) { + if ( $self->{ 'ASG' }->{ $chain }->[ $i ]->[ $ASGTable{ 'ssAbbr' } ] + ne $cur_element ) { + push( @SecBounds, [ $beg, $i -1 , $cur_element ] ); + $beg = $i; + $cur_element = $self->{ 'ASG' }->{ $chain }->[ $i ]-> + [ $ASGTable{ 'ssAbbr' } ]; + } + } + + if ( $self->{ 'ASG' }->{ $chain }->[ $i ]->[ $ASGTable{ 'ssAbbr' } ] + eq $cur_element ) { + push( @SecBounds, [ $beg, $i, $cur_element ] ); + } + else { + push( @SecBounds, [ $beg, $i - 1, $cur_element ], + [ $i, $i, $self->{ 'ASG' }->{ $chain }->[ $i ]-> + [ $ASGTable{ 'ssAbbr' } ] ] ); + } + + $self->{ 'SecBounds' }->{ $chain } = \@SecBounds; + return $self->{ 'SecBounds' }->{ $chain }; +} + +=head2 chains + + Title : chains + Usage : gives array chain I.D.s (characters) + Function : + Example : @chains = $stride_obj->chains(); + Returns : array of characters + Args : none + + +=cut + +sub chains { + my $self = shift; + my @chains = keys ( %{ $self->{ 'ASG' } } ); + return \@chains; +} + +=head2 getSeq + + Title : getSeq + Usage : returns a Bio::PrimarySeq object which represents an + approximation at the sequence of the specified chain. + Function : For most chain of most entries, the sequence returned by + this method will be very good. However, it it inherently + unsafe to rely on STRIDE to extract sequence information about + a PDB entry. More reliable information can be obtained from + the PDB entry itself. If a second option is given + (and evaluates to true), the sequence generated will + have 'X' in spaces where the pdb residue numbers are + discontinuous. In some cases this results in a + better sequence object (when the discontinuity is + due to regions which were present, but could not be + resolved). In other cases, it will result in a WORSE + sequence object (when the discontinuity is due to + historical sequence numbering and all sequence is + actually resolved). + Example : $pso = $dssp_obj->getSeq( 'A' ); + Returns : (pointer to) a PrimarySeq object + Args : Chain identifier. If none given, '-' is assumed. + + +=cut + +sub getSeq { + my $self = shift; + my $chain = shift; + my $fill_in = shift; + + if ( !( $chain ) ) { + $chain = '-'; + } + + if ( $self->{ 'Seq' }->{ $chain } ) { + return $self->{ 'Seq' }->{ $chain }; + } + + my ( $seq, + $num_res, + $last_res_num, + $cur_res_num, + $i, + $step, + $id + ); + + $seq = ""; + $num_res = $self->numResidues( $chain ); + $last_res_num = $self->_pdbNum( 1, $chain ); + for ( $i = 1; $i <= $num_res; $i++ ) { + if ( $fill_in ) { + $cur_res_num = $self->_pdbNum( $i, $chain ); + $step = $cur_res_num - $last_res_num; + if ( $step > 1 ) { + $seq .= 'X' x ( $step - 1 ); + } + } + $seq .= $self->_resAA( $i, $chain ); + $last_res_num = $cur_res_num; + } + + $id = $self->pdbID(); + $id .= "$chain"; + + $self->{ 'Seq' }->{ $chain } = Bio::PrimarySeq->new( -seq => $seq, + -id => $id, + -moltype => 'protein' + ); + + return $self->{ 'Seq' }->{ $chain }; +} + +=head1 INTERNAL METHODS + +=head2 _pdbNum + + Title : _pdbNum + Usage : fetches the numeric portion of the identifier for a given + residue as reported by the pdb entry. Note, this DOES NOT + uniquely specify a residue. There may be an insertion code + and/or chain identifier differences. + Function : + Example : $pdbNum = $self->pdbNum( 3, 'A' ); + Returns : a scalar + Args : valid ordinal num / chain combination + + +=cut + +sub _pdbNum { + my $self = shift; + my $ord = shift; + my $chain = shift; + if ( !( $self->{ 'ASG' }->{ $chain }->[ $ord ] ) ) { + $self->throw( "No such ordinal $ord in chain $chain.\n" ); + } + my $pdb_junk = $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'resNum' } ]; + my $num_part; + ( $num_part ) = ( $pdb_junk =~ /(-*\d+).*/ ); + return $num_part; +} + +=head2 _resAA + + Title : _resAA + Usage : returns 1 letter abbr. of the amino acid specified by + the arguments + Function : + Examples : $aa = $stride_obj->_resAA( 3, '-' ); + Returns : scalar character + Args : ( ord. num, chain ) + + +=cut + +sub _resAA { + my $self = shift; + my $ord = shift; + my $chain = shift; + if ( !( $self->{ 'ASG' }->{ $chain }->[ $ord ] ) ) { + $self->throw( "No such ordinal $ord in chain $chain.\n" ); + } + return ( $AATable{$self->{'ASG'}->{$chain}->[$ord]->[$ASGTable{'aa'}]} ); +} + +=head2 _pdbInsCo + + Title : _pdbInsCo + Usage : fetches the Insertion code for this residue. + Function : + Example : $pdb_ins_co = $self->_pdb_ins_co( 15, 'B' ); + Returns : a scalar + Args : ordinal number and chain + + +=cut + +sub _pdbInsCo { + my $self = shift; + my $ord = shift; + my $chain = shift; + if ( !( $self->{ 'ASG' }->{ $chain }->[ $ord ] ) ) { + $self->throw( "No such ordinal $ord in chain $chain.\n" ); + } + my $pdb_junk = $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'resNum' } ]; + my $letter_part; + ( $letter_part ) = ( $pdb_junk =~ /\d+(\D+)/ ); # insertion code can be any + # non-word character(s) + return $letter_part; +} + +=head2 _toOrdChain + + Title : _toOrdChain + Usage : takes any set of residue identifying parameters and + wrestles them into a two element array: the chain and the ordinal + number of this residue. This two element array can then be + efficiently used as keys in many of the above accessor methods + ('#A:B') or ( #, 'A', 'B' ) + || | + || - Chain ID (blank for single chain) + |--- Insertion code for this residue. Blank for most residues. + |--- Numeric portion of residue ID. + + (#) + | + --- Numeric portion of residue ID. If there is only one chain and + it has no ID AND there is no residue with an insertion code at this + number, then this can uniquely specify a residue. + + # ('#:C) or ( #, 'C' ) + | | + | -Chain ID + ---Numeric portion of residue ID. + + If a residue is incompletely specified then the first residue that + fits the arguments is returned. For example, if 19 is the argument + and there are three chains, A, B, and C with a residue whose number + is 19, then 19:A will be returned (assuming its listed first). + + Function : + Example : my ( $ord, $chain ) = $self->_toOrdChain( @args ); + Returns : two element array + Args : valid set of residue identifier(s) ( SEE NOTE ABOVE ) + + +=cut + +sub _toOrdChain { + my $self = shift; + my $arg_str; + + my ( $key_num, $chain_id, $ins_code, $key, $i ); + + # check to see how many args are given + if ( $#_ >= 1 ) { # multiple args + $key_num = shift; + if ( $#_ >= 1 ) { # still multiple args => ins. code, too + $ins_code = shift; + $chain_id = shift; + } + else { # just one more arg. => chain_id + $chain_id = shift; + } + } + else { # only single arg. Might be number or string + $arg_str = shift; + if ( $arg_str =~ /:/ ) { + # a chain is specified + ( $chain_id ) = ( $arg_str =~ /:(.)/); + $arg_str =~ s/:.//; + } + if ( $arg_str =~ /[A-Z]|[a-z]/ ) { + # an insertion code is specified + ( $ins_code ) = ( $arg_str =~ /([A-Z]|[a-z])/ ); + $arg_str =~ s/[A-Z]|[a-z]//g; + } + #now, get the number bit-> everything still around + $key_num = $arg_str; + } + + $key = "$key_num$ins_code"; + if ( !( $chain_id ) || $chain_id eq ' ' ) { + $chain_id = '-'; + } + + if ( !( $self->{ 'ASG' }->{ $chain_id } ) ) { + $self->throw( "No such chain: $chain_id" ); + } + + for ( $i = 1; $i <= $#{$self->{ 'ASG' }->{ $chain_id }}; $i++ ) { + if ( $self->{ 'ASG' }->{ $chain_id }->[ $i ]->[ $ASGTable{ 'resNum' } ] eq + $key ) { + return ( $i, $chain_id ); + } + } + + $self->throw( "No such key: $key" ); + +} + +=head2 _parse + + Title : _parse + Usage : as name suggests, parses stride output, creating object + Function : + Example : $self->_parse( $io ); + Returns : + Args : valid Bio::Root::IO object + + +=cut + +sub _parse { + my $self = shift; + my $io = shift; + my $file = $io->_fh(); + + # Parse top lines + if ( $self->_parseTop( $io ) ) { + $self->throw( "Not stride output" ); + } + + # Parse the HDR, CMP, SCR, and AUT lines + $self->_parseHead( $io ); + + # Parse the CHN, SEQ, STR, and LOC lines + $self->_parseSummary( $io ); # we're ignoring this + + # Parse the ASG lines + $self->_parseASG( $io ); +} + +=head2 _parseTop + + Title : _parseTop + Usage : makes sure this looks like stride output + Function : + Example : + Returns : + Args : + + +=cut + +sub _parseTop { + my $self = shift; + my $io = shift; + my $file = $io->_fh(); + my $cur = <$file>; + if ( $cur =~ /^REM ---/ ) { + return 0; + } + return 1; +} + +=head2 _parseHead + + Title : _parseHead + Usage : parses + Function : HDR, CMP, SRC, and AUT lines + Example : + Returns : + Args : + + +=cut + +sub _parseHead { + my $self = shift; + my $io = shift; + my $file = $io->_fh(); + my $cur; + my $element; + my ( @elements, @cmp, @src, @aut ); + my %head = {}; + my $still_head = 1; + + $cur = <$file>; + while ( $cur =~ /^REM / ) { + $cur = <$file>; + } + + if ( $cur =~ /^HDR / ) { + @elements = split( /\s+/, $cur ); + shift( @elements ); + pop( @elements ); + $self->{ 'PDB' } = pop( @elements ); + $self->{ 'DATE' } = pop( @elements ); + # now, everything else is "header" except for the word + # HDR + $element = join( ' ', @elements ); + $head{ 'HEADER' } = $element; + } + + $cur = <$file>; + while ( $cur =~ /^CMP / ) { + ( $cur ) = ( $cur =~ /^CMP\s+(.+?)\s*\w{4}$/ ); + push( @cmp, $cur ); + $cur = <$file>; + } + + while ( $cur =~ /^SRC / ) { + ( $cur ) = ( $cur =~ /^SRC\s+(.+?)\s*\w{4}$/ ); + push( @src, $cur ); + $cur = <$file>; + } + + while ( $cur =~ /^AUT / ) { + ( $cur ) = ( $cur =~ /^AUT\s+(.+?)\s*\w{4}$/ ); + push( @aut, $cur ); + $cur = <$file>; + } + + $head{ 'CMP' } = \@cmp; + $head{ 'SRC' } = \@src; + $head{ 'AUT' } = \@aut; + $self->{ 'HEAD' } = \%head; +} + +=head2 _parseSummary + + Title : _parseSummary + Usage : parses LOC lines + Function : + Example : + Returns : + Args : + + +=cut + +sub _parseSummary { + my $self = shift; + my $io = shift; + my $file = $io->_fh(); + my $cur = <$file>; + my $bound_set; + my $element; + my ( @elements, @cur ); + my @LOC_lookup = ( [ 5, 12 ], # Element name + # reduntdant [ 18, 3 ], # First residue name + [ 22, 5 ], # First residue PDB number + [ 28, 1 ], # First residue Chain ID + # redundant [ 35, 3 ], # Last residue name + [ 40, 5 ], # Last residue PDB number + [ 46, 1 ] ); # Last residue Chain ID + + #ignore these lines + while ( $cur =~ /^REM |^STR |^SEQ |^CHN / ) { + $cur = <$file>; + } + + while ( $cur =~ /^LOC / ) { + foreach $bound_set ( @LOC_lookup ) { + $element = substr( $cur, $bound_set->[ 0 ], $bound_set->[ 1 ] ); + $element =~ s/\s//g; + push( @cur, $element ); + } + push( @elements, [ @cur ] ); + $cur = <$file>; + @cur = (); + } + $self->{ 'LOC' } = \@elements; + +} + +=head2 _parseASG + + Title : _parseASG + Usage : parses ASG lines + Function : + Example : + Returns : + Args : + + +=cut + +sub _parseASG { + my $self = shift; + my $io = shift; + my $file = $io->_fh(); + my $cur = <$file>; + my $bound_set; + my $ord_num; + my ( $chain, $last_chain ); + my $element; + my %ASG; + my ( @cur, @elements ); + my @ASG_lookup = ( [ 5, 3 ], # Residue name + # [ 9, 1 ], # Chain ID + [ 10, 5 ], # PDB residue number (w/ins.code) + # [ 16, 4 ], # ordinal stride number + [ 24, 1 ], # one letter sec. stru. abbr. + [ 26, 13], # full sec. stru. name + [ 42, 7 ], # phi angle + [ 52, 7 ], # psi angle + [ 64, 5 ] );# residue solv. acc. + + while ( $cur =~ /^REM / ) { + $cur = <$file>; + } + + while ( $cur =~ /^ASG / ) { + # get ordinal number for array key + $ord_num = substr( $cur, 16, 4 ); + $ord_num =~ s/\s//g; + + # get the chain id + $chain = substr( $cur, 9, 1 ); + + if ( $last_chain && ( $chain ne $last_chain ) ) { + $ASG{ $last_chain } = [ @elements ]; + @elements = (); + } + + # now get the rest of the info on this line + foreach $bound_set ( @ASG_lookup ) { + $element = substr( $cur, $bound_set->[ 0 ], + $bound_set->[ 1 ] ); + $element =~ s/\s//g; + push( @cur, $element ); + } + $elements[ $ord_num ] = [ @cur ]; + $cur = <$file>; + @cur = (); + $last_chain = $chain; + } + + $ASG{ $chain } = [ @elements ]; + + $self->{ 'ASG' } = \%ASG; +} + +1; + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Structure/StructureI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Structure/StructureI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,76 @@ +# $Id: StructureI.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $ +# placeholder for Bio::Structure::StructureI + +# $Id: StructureI.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Structure::StructureI +# +# Cared for by Kris Boulez <kris.boulez@algonomics.com> +# +# Copyright Kris Boulez +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Structure::StructureI - Abstract Interface for a Structure objects + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the interface here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Kris Boulez + +Email kris.boulez@algonomics.com + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Structure::StructureI; +use Bio::Root::RootI; +use vars qw(@ISA); +use strict; + +@ISA = qw(Bio::Root::RootI); + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Symbol/Alphabet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Symbol/Alphabet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,191 @@ +# $Id: Alphabet.pm,v 1.6 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Symbol::Alphabet +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Symbol::Alphabet - DESCRIPTION of Object + +=head1 SYNOPSIS + + { + my $alphabet = new Bio::Symbols::Alphabet(-symbols => [ @s ], + -subalphabets => [ @alphas ] ); + + my @symbols = $alphabet->symbols; + my @subalphas = $alphabet->alphabets; + if( $alphabet->contains($symbol) ) { + # do something + } + } + +=head1 DESCRIPTION + +Alphabet contains set of symbols, which can be concatenated to +form symbol lists. Sequence string, for example, is stringified +representation of the symbol list (tokens of symbols). + +This module was implemented for the purposes of meeting the +BSANE/BioCORBA spec 0.3 only. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Symbol::Alphabet; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Symbol::AlphabetI; + +@ISA = qw(Bio::Root::Root Bio::Symbol::AlphabetI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Symbol::Alphabet(); + Function: Builds a new Bio::Symbol::Alphabet object + Returns : Bio::Symbol::Alphabet + Args : -symbols => Array ref of Bio::Symbol::SymbolI objects + -subalphas=> Array ref of Bio::Symbol::AlphabetI objects + representing sub alphabets +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->{'_symbols'} = []; + $self->{'_alphabets'} = []; + my ($symbols, $subalphas) = $self->_rearrange([qw(SYMBOLS SUBALPHAS)], + @args); + + defined $symbols && ref($symbols) =~ /array/i && $self->symbols(@$symbols); + defined $subalphas && ref($subalphas) =~ /array/i && $self->alphabets(@$subalphas); + return $self; +} + +=head2 AlphabetI Interface methods + +=cut + +=head2 symbols + + Title : symbols + Usage : my @symbols = $alphabet->symbols(); + Function: Get/Set Symbol list for an alphabet + List of symbols, which make up this alphabet. + Returns : Array of Bio::Symbol::SymbolI objects + Args : (optionalalphabets) Array of Bio::Symbol::SymbolI objects + +=cut + +sub symbols { + my ($self,@args) = @_; + if( @args ) { + $self->{'_symbols'} = []; + foreach my $symbol ( @args ) { + if( ! defined $symbol || ! ref($symbol) || + ! $symbol->isa('Bio::Symbol::SymbolI') ) { + $self->warn("Did not provide a proper Bio::Symbol::SymbolI to method 'symbols' (got $symbol)"); + } else { + push @{$self->{'_symbols'}}, $symbol; + } + } + } + return @{$self->{'_symbols'}}; +} + +=head2 alphabets + + Title : alphabets + Usage : my @alphabets = $alphabet->alphabets(); + Function: Get/Set Sub Alphabet list for an alphabet + Sub-alphabets. E.g. codons made from DNAxDNAxDNA alphabets + Returns : Array of Bio::Symbol::AlphabetI objects + Args : (optional) Array of Bio::Symbol::AlphabetI objects + +=cut + +sub alphabets { + my ($self,@args) = @_; + if( @args ) { + $self->{'_alphabets'} = []; + foreach my $alpha ( @args ) { + if( ! $alpha->isa('Bio::Symbol::AlphabetI') ) { + $self->warn("Did not provide a proper Bio::Symbol::AlphabetI to method 'alphabets' (got $alpha)"); + } else { + push @{$self->{'_alphabets'}}, $alpha; + } + } + } + return @{$self->{'_alphabets'}}; +} + +=head2 contains + + Title : contains + Usage : if($alphabet->contains($symbol)) { } + Function: Tests of Symbol is contained in this alphabet + Returns : Boolean + Args : Bio::Symbol::SymbolI + +=cut + +sub contains{ + my ($self,$testsymbol) = @_; + foreach my $symbol ( $self->symbols ) { + return 1 if( $symbol->equals($testsymbol) ); + } + return 0; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Symbol/AlphabetI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Symbol/AlphabetI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,160 @@ +# $Id: AlphabetI.pm,v 1.5 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Symbol::AlphabetI +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Symbol::AlphabetI - A Symbol Alphabet + +=head1 SYNOPSIS + + # get a Bio::Symbol::AlphabetI object somehow + my @symbols = $alphabet->symbols; + my @subalphas = $alphabet->alphabets; + if( $alphabet->contains($symbol) ) { + # do something + } + +=head1 DESCRIPTION + +Alphabet contains set of symbols, which can be concatenated to form +symbol lists. Sequence string, for example, is stringified +representation of the symbol list (tokens of symbols). + +This module was implemented for the purposes of meeting the +BSANE/BioCORBA spec 0.3 only. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Symbol::AlphabetI; +use strict; +use Bio::Root::RootI; + +=head2 AlphabetI Interface methods + +=cut + +=head2 symbols + + Title : symbols + Usage : my @symbols = $alphabet->symbols(); + Function: Get/Set Symbol list for an alphabet + List of symbols, which make up this alphabet. + Returns : Array of L<Bio::Symbol::SymbolI> objects + Args : (optional) Array of L<Bio::Symbol::SymbolI> objects + +=cut + +sub symbols{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 alphabets + + Title : alphabets + Usage : my @alphabets = $alphabet->alphabets(); + Function: Get/Set Sub Alphabet list for an alphabet + Sub-alphabets. E.g. codons made from DNAxDNAxDNA alphabets + Returns : Array of L<Bio::Symbol::AlphabetI> objects + Args : (optional) Array of L<Bio::Symbol::AlphabetI> objects + +=cut + +sub alphabets{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 contains + + Title : contains + Usage : if($alphabet->contains($symbol)) { } + Function: Tests of Symbol is contained in this alphabet + Returns : Boolean + Args : L<Bio::Symbol::SymbolI> + +=cut + +sub contains{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +# Other methods from BSANE - not sure if we will implement here or only in +# BioCORBA implementation + +# Resolve symbols from the token string. +# SymbolList to_symbol(in string tokens) raises ( IllegalSymbolException) ; + +# Convinience method, which returns gap symbol that do not +# match with any other symbols in the alphabet. +# Symbol get_gap_symbol() raises ( DoesNotExist) ; + + +# Returns a ambiguity symbol, which represent list of +# symbols. All symbols in a list must be members of +# this alphabet otherwise IllegalSymbolException is +# thrown. +# Symbol get_ambiguity( in SymbolList symbols) raises( IllegalSymbolException); + + +# Returns a Symbol, which represents ordered list of symbols +# given as a parameter. Each symbol in the list must be member of +# different sub-alphabet in the order defined by the alphabets +# attribute. For example, codons can be represented by a compound +# Alphabet of three DNA Alphabets, in which case the get_symbol( +# SymbolList[ a,g,t]) method of the Alphabet returns Symbol for +# the codon agt.<p> + +# IllegalSymbolException is raised if members of symbols +# are not Symbols over the alphabet defined by +# get_alphabets()-method +# Symbol get_symbol(in SymbolList symbols) raises(IllegalSymbolException) ; + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Symbol/DNAAlphabet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Symbol/DNAAlphabet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,121 @@ +# $Id: DNAAlphabet.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Symbol::DNAAlphabet +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Symbol::DNAAlphabet - A ready made DNA alphabet + +=head1 SYNOPSIS + + use Bio::Symbol::DNAAlphabet; + my $alpha = new Bio::Symbol::DNAAlphabet(); + foreach my $symbol ( $alpha->symbols ) { + print "symbol is $symbol\n"; + } + +=head1 DESCRIPTION + +This object builds an Alphabet with DNA symbols. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Symbol::DNAAlphabet; +use vars qw(@ISA); +use strict; + +use Bio::Symbol::Alphabet; +use Bio::Symbol::Symbol; +use Bio::Tools::IUPAC; + +@ISA = qw(Bio::Symbol::Alphabet); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Symbol::DNAAlphabet(); + Function: Builds a new Bio::Symbol::DNAAlphabet object + Returns : Bio::Symbol::DNAAlphabet + Args : + + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my %alphabet = Bio::Tools::IUPAC::iupac_iub(); + my %symbols; + foreach my $let ( keys %alphabet ) { + next unless @{$alphabet{$let}} == 1 || $let eq 'U'; + $symbols{$let} = new Bio::Symbol::Symbol(-name => $let, + -token => $let); + } + + foreach my $let ( keys %alphabet ) { + next if( $symbols{$let} || $let eq 'U'); + my @subsymbols; + + foreach my $sublet ( @{$alphabet{$let}} ) { + push @subsymbols, $symbols{$sublet}; + } + my $alpha = new Bio::Symbol::Alphabet(-symbols => \@subsymbols); + $symbols{$let} = new Bio::Symbol::Symbol(-name => $let, + -token => $let, + -matches => $alpha, + -symbols => \@subsymbols); + } + + $self->symbols(values %symbols); + return $self; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Symbol/ProteinAlphabet.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Symbol/ProteinAlphabet.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,123 @@ +# $Id: ProteinAlphabet.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Symbol::ProteinAlphabet +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Symbol::ProteinAlphabet - A ready made Protein alphabet + +=head1 SYNOPSIS + + use Bio::Symbol::ProteinAlphabet; + my $alpha = new Bio::Symbol::ProteinAlphabet(); + foreach my $symbol ( $alpha->symbols ) { + print "symbol is $symbol\n"; + } + +=head1 DESCRIPTION + +This object builds an Alphabet with Protein symbols. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Symbol::ProteinAlphabet; +use vars qw(@ISA); +use strict; + +use Bio::Symbol::Alphabet; +use Bio::Symbol::Symbol; +use Bio::Tools::IUPAC; +use Bio::SeqUtils; + +@ISA = qw(Bio::Symbol::Alphabet); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Symbol::ProteinAlphabet(); + Function: Builds a new Bio::Symbol::ProteinAlphabet object + Returns : Bio::Symbol::ProteinAlphabet + Args : + + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my %aa = Bio::SeqUtils->valid_aa(2); + my %codes = Bio::Tools::IUPAC->iupac_iup(); + my %symbols; + my @left; + + foreach my $let ( keys %codes ) { + next if( $let eq 'U'); + if( scalar @{$codes{$let}} != 1) { push @left, $let; next; } + $symbols{$let} = new Bio::Symbol::Symbol(-name => $aa{$let}, + -token => $let); + } + foreach my $l ( @left ) { + my @subsym; + foreach my $sym ( @{$codes{$l}} ) { + push @subsym, $symbols{$sym}; + } + my $alpha = new Bio::Symbol::Alphabet(-symbols => \@subsym); + $symbols{$l} = new Bio::Symbol::Symbol(-name => $aa{$l}, + -token => $l, + -matches => $alpha, + -symbols => \@subsym); + } + + $self->symbols(values %symbols); + return $self; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Symbol/Symbol.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Symbol/Symbol.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,232 @@ +# $Id: Symbol.pm,v 1.6 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Symbol::Symbol +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Symbol::Symbol - A biological symbol + +=head1 SYNOPSIS + + use Bio::Symbol::Symbol; + my $thymine = new Bio::Symbol::Symbol(-name => 'Thy', + -token=> 'T'); + my $a = new Bio::Symbol::Symbol(-token => 'A' ); + my $u = new Bio::Symbol::Symbol(-token => 'U' ); + my $g = new Bio::Symbol::Symbol(-token => 'G' ); + + my $M = new Bio::Symbol::Symbol(-name => 'Met', + -token => 'M', + -symbols => [ $a, $u, $g ]); + + my ($name,$token) = ($a->name, $a->token); + my @symbols = $a->symbols; + my $matches = $a->matches; + +=head1 DESCRIPTION + +Symbol represents a single token in the sequence. Symbol can have +multiple synonyms or matches within the same Alphabet, which +makes possible to represent ambiguity codes and gaps. + +Symbols can be also composed from ordered list other symbols. For +example, codons can be represented by single Symbol using a +compound Alphabet made from three DNA Alphabets. + +This module was implemented for the purposes of meeting the +BSANE/BioCORBA spec 0.3 only. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Symbol::Symbol; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Symbol::SymbolI; +use Bio::Symbol::Alphabet; +use Bio::Root::Root; + +@ISA = qw( Bio::Root::Root Bio::Symbol::SymbolI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Symbol::Symbol(); + Function: Builds a new Bio::Symbol::Symbol object + Returns : Bio::Symbol::Symbol + Args : -name => descriptive name (string) [e.g. Met] + -token => Shorthand token (string) [e.g. M] + -symbols => Symbols that make up this symbol (array) [e.g. AUG] + -matches => Alphabet in the event symbol is an ambiguity + code. + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + $self->{'_symbols'} = []; + + my ($name, $token, $symbols, + $matches) = $self->_rearrange([qw(NAME TOKEN SYMBOLS + MATCHES)], + @args); + $token && $self->token($token); + $name && $self->name($name); + $symbols && ref($symbols) =~ /array/i && $self->symbols(@$symbols); + $matches && $self->matches($matches); + return $self; +} + +=head2 name + + Title : name + Usage : my $name = $symbol->name(); + Function: Get/Set Descriptive name for Symbol + Returns : string + Args : (optional) string + +=cut + +sub name { + my ($self,$value) = @_; + if( $value ) { + $self->{'_name'} = $value; + } + return $self->{'_name'} || ''; +} + +=head2 token + + Title : token + Usage : my $token = $self->token(); + Function: Get/Set token for this symbol + Example : Letter A,C,G,or T for a DNA alphabet Symbol + Returns : string + Args : (optional) string + +=cut + +sub token{ + my ($self,$value) = @_; + if( $value ) { + $self->{'_token'} = $value; + } + return $self->{'_token'} || ''; +} + +=head2 symbols + + Title : symbols + Usage : my @symbols = $self->symbols(); + Function: Get/Set Symbols this Symbol is composed from + Example : Ambiguity symbols are made up > 1 base symbol + Returns : Array of Bio::Symbol::SymbolI objects + Args : (optional) Array of Bio::Symbol::SymbolI objects + + +=cut + +sub symbols{ + my ($self,@args) = @_; + if( @args ) { + $self->{'_symbols'} = [@args]; + } + return @{$self->{'_symbols'}}; +} + +=head2 matches + + Title : matches + Usage : my $matchalphabet = $symbol->matches(); + Function: Get/Set (Sub) alphabet of symbols matched by this symbol + including the symbol itself (i.e. if symbol is DNA + ambiguity code W then the matches contains symbols for W + and T) + Returns : Bio::Symbol::AlphabetI + Args : (optional) Bio::Symbol::AlphabetI + +=cut + +sub matches{ + my ($self,$matches) = @_; + + if( $matches ) { + if( ! $matches->isa('Bio::Symbol::AlphabetI') ) { + $self->warn("Must pass in a Bio::Symbol::AlphabetI object to matches function"); + # stick with previous value + } else { + $self->{'_matches'} = $matches; + } + } + return $self->{'_matches'}; +} + +=head2 equals + + Title : equals + Usage : if( $symbol->equals($symbol2) ) { } + Function: Tests if a symbol is equal to another + Returns : Boolean + Args : Bio::Symbol::SymbolI + +=cut + +sub equals{ + my ($self,$symbol2) = @_; + # Let's just test based on Tokens for now + # Doesn't handle DNA vs PROTEIN accidential comparisons + return $self->token eq $symbol2->token; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Symbol/SymbolI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Symbol/SymbolI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,168 @@ +# $Id: SymbolI.pm,v 1.6 2002/10/22 07:45:21 lapp Exp $ +# +# BioPerl module for Bio::Symbol::SymbolI +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Symbol::SymbolI - Interface for a Symbol + +=head1 SYNOPSIS + + # get a Bio::Symbol::SymbolI object somehow + + my ($name,$token) = ($symbol->name, $symbol->token); + my @symbols = $symbol->symbols; + my $matches = $symbol->matches; + +=head1 DESCRIPTION + +Symbol represents a single token in the sequence. Symbol can have +multiple synonyms or matches within the same Alphabet, which +makes possible to represent ambiguity codes and gaps. + +Symbols can be also composed from ordered list other symbols. For +example, codons can be represented by single Symbol using a +compound Alphabet made from three DNA Alphabets. + +This module was implemented for the purposes of meeting the +BSANE/BioCORBA spec 0.3 only. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Symbol::SymbolI; +use strict; +use Bio::Root::RootI; +use vars qw(@ISA); +@ISA = qw(Bio::Root::RootI); + +=head2 Bio::Symbol::SymbolI interface methods + +=cut + +=head2 name + + Title : name + Usage : my $name = $symbol->name(); + Function: Get/Set Descriptive name for Symbol + Returns : string + Args : (optional) string + +=cut + +sub name{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 token + + Title : token + Usage : my $token = $self->token(); + Function: Get/Set token for this symbol + Example : Letter A,C,G,or T for a DNA alphabet Symbol + Returns : string + Args : (optional) string + +=cut + +sub token{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 symbols + + Title : symbols + Usage : my @symbols = $self->symbols(); + Function: Get/Set Symbols this Symbol is composed from + Example : A codon is composed of 3 DNA symbols + Returns : Array of Bio::Symbol::SymbolI objects + Args : (optional) Array of Bio::Symbol::SymbolI objects + + +=cut + +sub symbols{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 matches + + Title : matches + Usage : my $matchalphabet = $symbol->matches(); + Function: Get/Set (Sub) alphabet of symbols matched by this symbol + including the symbol itself (i.e. if symbol is DNA + ambiguity code W then the matches contains symbols for W + and T) + Returns : Bio::Symbol::AlphabetI + Args : (optional) Bio::Symbol::AlphabetI + +=cut + +sub matches{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 equals + + Title : equals + Usage : if( $symbol->equals($symbol2) ) { } + Function: Tests if a symbol is equal to another + Returns : Boolean + Args : Bio::Symbol::SymbolI + +=cut + +sub equals{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Taxonomy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Taxonomy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,261 @@ +# $Id: Taxonomy.pm,v 1.1 2002/11/19 00:36:47 kortsch Exp $ +# +# BioPerl module for Bio::Taxonomy +# +# Cared for by Dan Kortschak +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Taxonomy - Conversion used bt the Taxonomy classes + +=head1 SYNOPSIS + + use Bio::Taxonomy; + +=head1 DESCRIPTION + +Provides methods for converting classifications into taxonomic +structures. + +=head1 CONTACT + +Dan Kortschak email B<kortschak@rsbs.anu.edu.au> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# code begins... + + +package Bio::Taxonomy; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + + +=head2 new + + Title : new + Usage : my $obj = new Bio::Taxonomy(); + Function: Builds a new Bio::Taxonomy object + Returns : Bio::Taxonomy + Args : -method -> method used to decide classification + (none|trust|lookup) + -ranks -> what ranks are there + +=cut + + +sub new { + my ($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'_method'}='none'; + $self->{'_ranks'}=[]; + $self->{'_rank_hash'}={}; + + my ($method,$ranks,$order) = $self->_rearrange([qw(METHOD RANKS ORDER)], @args); + + if ($method) { + $self->method($method); + } + + if (defined $ranks && + (ref($ranks) eq "ARRAY") ) { + $self->ranks(@$ranks); + } else { + # default ranks + # I think these are in the right order, but not sure: + # some parvorder|suborder and varietas|subspecies seem + # to be at the same level - any taxonomists? + # I don't expect that these will actually be used except as a way + # to find what ranks there are in taxonomic use + $self->ranks(('root', + 'superkingdom', + 'kingdom', + 'superphylum', + 'phylum', + 'subphylum', + 'superclass', + 'class', + 'subclass', + 'infraclass', + 'superorder', + 'order', + 'suborder', + 'parvorder', + 'infraorder', + 'superfamily', + 'family', + 'subfamily', + 'tribe', + 'subtribe', + 'genus', + 'subgenus', + 'species group', + 'species subgroup', + 'species', + 'subspecies', + 'varietas', + 'forma', + 'no rank')); + } + + return $self; +} + + +=head2 method + + Title : method + Usage : $obj = taxonomy->method($method); + Function: set or return the method used to decide classification + Returns : $obj + Args : $obj + +=cut + + +sub method { + my ($self,$value) = @_; + if (defined $value && $value=~/none|trust|lookup/) { + $self->{'_method'} = $value; + } + return $self->{'_method'}; +} + + +=head2 classify + + Title : classify + Usage : @obj[][0-1] = taxonomy->classify($species); + Function: return a ranked classification + Returns : @obj of taxa and ranks as word pairs separated by "@" + Args : Bio::Species object + +=cut + + +sub classify { + my ($self,$value) = @_; + my @ranks; + + if (! $value->isa('Bio::Species') ) { + $self->throw("Trying to classify $value which is not a Bio::Species object"); + } + + my @classes=reverse($value->classification); + + if ($self->method eq 'none') { + for (my $i=0; $i < @classes-2; $i++) { + ($ranks[$i][0],$ranks[$i][1])=($classes[$i],'no rank'); + } + push @ranks,[$classes[-2],'genus']; + push @ranks,[$value->binomial,'species']; + } elsif ($self->method eq 'trust') { + if (scalar(@classes)==scalar($self->ranks)) { + for (my $i=0; $i < @classes; $i++) { + if ($self->rank_of_number($i) eq 'species') { + push @ranks,[$value->binomial,$self->rank_of_number($i)]; + } else { + push @ranks,[$classes[$i],$self->rank_of_number($i)]; + } + } + } else { + $self->throw("Species object and taxonomy object cannot be reconciled"); + } + } elsif ($self->method eq 'lookup') { + # this will lookup a DB for the rank of a taxon name + # I imagine that some kind of Bio::DB class will be need to + # be given to the taxonomy object to act as an DB interface + # (I'm not sure how useful this is though - if you have a DB of + # taxonomy - why would you be doing things this way?) + $self->throw("Not yet implemented"); + } + + return @ranks; +} + + +=head2 level_of_rank + + Title : level_of_rank + Usage : $obj = taxonomy->level_of_rank($obj); + Function: returns the level of a rank name + Returns : $obj + Args : $obj + +=cut + + +sub level_of { + my ($self,$value) = @_; + + return $self->{'_rank_hash'}{$value}; +} + + +=head2 rank_of_number + + Title : rank_of_number + Usage : $obj = taxonomy->rank_of_number($obj); + Function: returns the rank name of a rank level + Returns : $obj + Args : $obj + +=cut + + +sub rank_of_number { + my ($self,$value) = @_; + + return ${$self->{'_ranks'}}[$value]; +} + + +=head2 ranks + + Title : ranks + Usage : @obj = taxonomy->ranks(@obj); + Function: set or return all ranks + Returns : @obj + Args : @obj + +=cut + + +sub ranks { + my ($self,@value) = @_; + + # currently this makes no uniqueness sanity check (this should be done) + # I am think that adding a way of converting multiple 'no rank' ranks + # to unique 'no rank #' ranks so that the level of a 'no rank' is + # abstracted way from the user - I'm not sure of the vlaue of this + + if (defined @value) { + $self->{'_ranks'}=\@value; + } + + for (my $i=0; $i <= @{$self->{'_ranks'}}-1; $i++) { + $self->{'_rank_hash'}{$self->{'_ranks'}[$i]}=$i unless $self->{'_ranks'}[$i] eq 'no rank'; + } + + return @{$self->{'_ranks'}}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Taxonomy/Taxon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Taxonomy/Taxon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,677 @@ +# $Id: Taxon.pm,v 1.1 2002/11/18 22:08:33 kortsch Exp $ +# +# BioPerl module for Bio::Taxonomy::Taxon +# +# Cared for by Dan Kortschak but pilfered extensively from +# the Bio::Tree::Node code of Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Taxonomy::Taxon - Generic Taxonomic Entity object + +=head1 SYNOPSIS + + use Bio::Taxonomy::Taxon; + my $taxonA = new Bio::Taxonomy::Taxon(); + my $taxonL = new Bio::Taxonomy::Taxon(); + my $taxonR = new Bio::Taxonomy::Taxon(); + + my $taxon = new Bio::Taxonomy::Taxon(); + $taxon->add_Descendents($nodeL); + $taxon->add_Descendents($nodeR); + + $species = $taxon->species; + +=head1 DESCRIPTION + +Makes a taxonomic unit suitable for use in a taxonomic tree + +=head1 CONTACT + +Dan Kortschak email B<kortschak@rsbs.anu.edu.au> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# code begins... + +package Bio::Taxonomy::Taxon; +use vars qw(@ISA $CREATIONORDER); +use strict; + +# Object preamble - inherits from Bio::Root::Object, Bio::Tree::NodeI, Bio::Species and Bio::Taxonomy +use Bio::Root::Root; +use Bio::Tree::NodeI; +use Bio::Taxonomy; +use Bio::Species; + +# import rank information from Bio::Taxonomy.pm +use vars qw(@RANK %RANK); + +@ISA = qw(Bio::Root::Root Bio::Tree::NodeI); + +BEGIN { + $CREATIONORDER = 0; +} + +=head2 new + + Title : new + Usage : my $obj = new Bio::Taxonomy::Taxon(); + Function: Builds a new Bio::Taxonomy::Taxon object + Returns : Bio::Taxonomy::Taxon + Args : -descendents => array pointer to descendents (optional) + -branch_length => branch length [integer] (optional) + -taxon => taxon + -id => unique taxon id for node (from NCBI's list preferably) + -rank => the taxonomic level of the node (also from NCBI) + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($children,$branchlen,$id,$taxon,$rank,$desc) = + + $self->_rearrange([qw(DESCENDENTS + BRANCH_LENGTH + ID + TAXON + RANK + DESC)], @args); + + $self->{'_desc'} = {}; + defined $desc && $self->description($desc); + defined $taxon && $self->taxon($taxon); + defined $id && $self->id($id); + defined $branchlen && $self->branch_length($branchlen); + defined $rank && $self->rank($rank); + + if( defined $children ) { + if( ref($children) !~ /ARRAY/i ) { + $self->warn("Must specify a valid ARRAY reference to initialize a Taxon's Descendents"); + } + foreach my $c ( @$children ) { + $self->add_Descendent($c); + } + } + $self->_creation_id($CREATIONORDER++); + return $self; +} + +=head2 add_Descendent + + Title : add_Descendent + Usage : $taxon->add_Descendant($taxon); + Function: Adds a descendent to a taxon + Returns : number of current descendents for this taxon + Args : Bio::Taxonomy::Taxon + boolean flag, true if you want to ignore the fact that you are + adding a second node with the same unique id (typically memory + location reference in this implementation). default is false and + will throw an error if you try and overwrite an existing node. + + +=cut + +sub add_Descendent{ + + my ($self,$node,$ignoreoverwrite) = @_; + + return -1 if( ! defined $node ) ; + if( ! $node->isa('Bio::Taxonomy::Taxon') ) { + $self->warn("Trying to add a Descendent who is not a Bio::Taxonomy::Taxon"); + return -1; + } + # do we care about order? + $node->{'_ancestor'} = $self; + if( $self->{'_desc'}->{$node->internal_id} && ! $ignoreoverwrite ) { + $self->throw("Going to overwrite a taxon which is $node that is already stored here, set the ignore overwrite flag (parameter 2) to true to ignore this in the future"); + } + + $self->{'_desc'}->{$node->internal_id} = $node; # is this safely unique - we've tested before at any rate?? + + $self->invalidate_height(); + + return scalar keys %{$self->{'_desc'}}; +} + + +=head2 each_Descendent + + Title : each_Descendent($sortby) + Usage : my @taxa = $taxon->each_Descendent; + Function: all the descendents for this taxon (but not their descendents + i.e. not a recursive fetchall) + Returns : Array of Bio::Taxonomy::Taxon objects + Args : $sortby [optional] "height", "creation" or coderef to be used + to sort the order of children taxa. + + +=cut + +sub each_Descendent{ + my ($self, $sortby) = @_; + + # order can be based on branch length (and sub branchlength) + + $sortby ||= 'height'; + + if (ref $sortby eq 'CODE') { + return sort $sortby values %{$self->{'_desc'}}; + } else { + if ($sortby eq 'height') { + return map { $_->[0] } + sort { $a->[1] <=> $b->[1] || + $a->[2] <=> $b->[2] } + map { [$_, $_->height, $_->internal_id ] } + values %{$self->{'_desc'}}; + } else { + return map { $_->[0] } + sort { $a->[1] <=> $b->[1] } + map { [$_, $_->height ] } + values %{$self->{'_desc'}}; + } + } +} + +=head2 remove_Descendent + + Title : remove_Descendent + Usage : $taxon->remove_Descedent($taxon_foo); + Function: Removes a specific taxon from being a Descendent of this taxon + Returns : nothing + Args : An array of Bio::taxonomy::Taxon objects which have be previously + passed to the add_Descendent call of this object. + +=cut + +sub remove_Descendent{ + my ($self,@nodes) = @_; + foreach my $n ( @nodes ) { + if( $self->{'_desc'}->{$n->internal_id} ) { + $n->{'_ancestor'} = undef; + $self->{'_desc'}->{$n->internal_id}->{'_ancestor'} = undef; + delete $self->{'_desc'}->{$n->internal_id}; + + } else { + $self->debug(sprintf("no taxon %s (%s) listed as a descendent in this taxon %s (%s)\n",$n->id, $n,$self->id,$self)); + $self->debug("Descendents are " . join(',', keys %{$self->{'_desc'}})."\n"); + } + } + 1; +} + + +=head2 remove_all_Descendents + + Title : remove_all_Descendents + Usage : $taxon->remove_All_Descendents() + Function: Cleanup the taxon's reference to descendents and reset + their ancestor pointers to undef, if you don't have a reference + to these objects after this call they will be cleanedup - so + a get_nodes from the Tree object would be a safe thing to do first + Returns : nothing + Args : none + + +=cut + +sub remove_all_Descendents{ + my ($self) = @_; + # this won't cleanup the taxa themselves if you also have + # a copy/pointer of them (I think)... + while( my ($node,$val) = each %{ $self->{'_desc'} } ) { + $val->{'_ancestor'} = undef; + } + $self->{'_desc'} = {}; + 1; +} + +=head2 get_Descendents + + Title : get_Descendents + Usage : my @taxa = $taxon->get_Descendents; + Function: Recursively fetch all the taxa and their descendents + *NOTE* This is different from each_Descendent + Returns : Array or Bio::Taxonomy::Taxon objects + Args : none + +=cut + +# implemented in the interface + +=head2 ancestor + + Title : ancestor + Usage : $taxon->ancestor($newval) + Function: Set the Ancestor + Returns : value of ancestor + Args : newvalue (optional) + +=cut + +sub ancestor { + my ($self, $value) = @_; + if (defined $value) { + $self->{'_ancestor'} = $value; + } + return $self->{'_ancestor'}; +} + +=head2 branch_length + + Title : branch_length + Usage : $obj->branch_length($newval) + Function: + Example : + Returns : value of branch_length + Args : newvalue (optional) + + +=cut + +sub branch_length { + my ($self,$value) = @_; + if( defined $value) { + $self->{'branch_length'} = $value; + } + return $self->{'branch_length'}; +} + +=head2 description + + Title : description + Usage : $obj->description($newval) + Function: + Example : + Returns : value of description + Args : newvalue (optional) + + +=cut + +sub description { + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_desc'} = $value; + } + return $self->{'_desc'}; +} + + +=head2 rank + + Title : rank + Usage : $obj->rank($newval) + Function: Set the taxonomic rank + Example : + Returns : taxonomic rank of taxon + Args : newvalue (optional) + + +=cut + +sub rank { + my ($self,$value) = @_; + if (defined $value) { + my $ranks=join("|",@RANK); + if ($value=~/$ranks/) { + $self->{'_rank'} = $value; + } else { + $self->throw("Attempted to set unknown taxonomic rank: $value.\n"); + } + } + return $self->{'_rank'}; +} + + +=head2 taxon + + Title : taxon + Usage : $obj->taxon($newtaxon) + Function: Set the name of the taxon + Example : + Returns : name of taxon + Args : newtaxon (optional) + + +=cut + +# because internal taxa have names too... +sub taxon { + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_taxon'} = $value; + } + return $self->{'_taxon'}; +} + + +=head2 id + + Title : id + Usage : $obj->id($newval) + Function: + Example : + Returns : value of id + Args : newvalue (optional) + + +=cut + +sub id { + my ($self,$value) = @_; + if( defined $value ) { + $self->{'_id'} = $value; + } + return $self->{'_id'}; +} + + + +sub DESTROY { + my ($self) = @_; + # try to insure that everything is cleaned up + $self->SUPER::DESTROY(); + if( defined $self->{'_desc'} && + ref($self->{'_desc'}) =~ /ARRAY/i ) { + while( my ($nodeid,$node) = each %{ $self->{'_desc'} } ) { + $node->{'_ancestor'} = undef; # ensure no circular references + $node->DESTROY(); + $node = undef; + } + $self->{'_desc'} = {}; + } +} + +=head2 internal_id + + Title : internal_id + Usage : my $internalid = $taxon->internal_id + Function: Returns the internal unique id for this taxon + (a monotonically increasing number for this in-memory implementation + but could be a database determined unique id in other + implementations) + Returns : unique id + Args : none + +=cut + +sub internal_id { + return $_[0]->_creation_id; +} + + +=head2 _creation_id + + Title : _creation_id + Usage : $obj->_creation_id($newval) + Function: a private method signifying the internal creation order + Returns : value of _creation_id + Args : newvalue (optional) + + +=cut + +sub _creation_id { + my ($self,$value) = @_; + if( defined $value) { + $self->{'_creation_id'} = $value; + } + return $self->{'_creation_id'} || 0; +} + + +# The following methods are implemented by NodeI decorated interface + +=head2 is_Leaf + + Title : is_Leaf + Usage : if( $node->is_Leaf ) + Function: Get Leaf status + Returns : boolean + Args : none + +=cut + +sub is_Leaf { + my ($self) = @_; + my $rc = 0; + $rc = 1 if( ! defined $self->{'_desc'} || + keys %{$self->{'_desc'}} == 0); + return $rc; +} + +=head2 to_string + + Title : to_string + Usage : my $str = $taxon->to_string() + Function: For debugging, provide a taxon as a string + Returns : string + Args : none + +=cut + +=head2 height + + Title : height + Usage : my $len = $taxon->height + Function: Returns the height of the tree starting at this + taxon. Height is the maximum branchlength. + Returns : The longest length (weighting branches with branch_length) to a leaf + Args : none + +=cut + +sub height { + my ($self) = @_; + + return $self->{'_height'} if( defined $self->{'_height'} ); + + if( $self->is_Leaf ) { + if( !defined $self->branch_length ) { + $self->debug(sprintf("Trying to calculate height of a taxon when a taxon (%s) has an undefined branch_length",$self->id || '?' )); + return 0; + } + return $self->branch_length; + } + my $max = 0; + foreach my $subnode ( $self->each_Descendent ) { + my $s = $subnode->height; + if( $s > $max ) { $max = $s; } + } + return ($self->{'_height'} = $max + ($self->branch_length || 1)); +} + + +=head2 invalidate_height + + Title : invalidate_height + Usage : private helper method + Function: Invalidate our cached value of the taxon's height in the tree + Returns : nothing + Args : none + +=cut + + +sub invalidate_height { + my ($self) = @_; + + $self->{'_height'} = undef; + if( $self->ancestor ) { + $self->ancestor->invalidate_height; + } +} + +=head2 classify + + Title : classify + Usage : @obj->classify() + Function: a method to return the classification of a species + Returns : name of taxon and ancestor's taxon recursively + Args : boolean to specify whether we want all taxa not just ranked +levels + + +=cut + +sub classify { + my ($self,$allnodes) = @_; + + my @classification=($self->taxon); + my $node=$self; + + while (defined $node->ancestor) { + push @classification, $node->ancestor->taxon if $allnodes==1; + $node=$node->ancestor; + } + + return (@classification); +} + + +=head2 has_rank + + Title : has_rank + Usage : $obj->has_rank($rank) + Function: a method to query ancestors' rank + Returns : boolean + Args : $rank + + +=cut + +sub has_rank { + my ($self,$rank) = @_; + + return $self if $self->rank eq $rank; + + while (defined $self->ancestor) { + return $self if $self->ancestor->rank eq $rank; + $self=$self->ancestor; + } + + return undef; +} + + +=head2 has_taxon + + Title : has_taxon + Usage : $obj->has_taxon($taxon) + Function: a method to query ancestors' taxa + Returns : boolean + Args : Bio::Taxonomy::Taxon object + + +=cut + +sub has_taxon { + my ($self,$taxon) = @_; + + return $self if + ((defined $self->id && $self->id == $taxon->id) || + ($self->taxon eq $taxon->taxon && $self->rank eq $taxon->rank)); + + while (defined $self->ancestor) { + return $self if + ((defined $self->id && $self->id == $taxon->id) || + ($self->taxon eq $taxon->taxon && $self->rank eq $taxon->rank) && + ($self->taxon ne 'no rank')); + $self=$self->ancestor; + } + + return undef; +} + + +=head2 distance_to_root + + Title : distance_to_root + Usage : $obj->distance_to_root + Function: a method to query ancestors' taxa + Returns : number of links to root + Args : + + +=cut + +sub distance_to_root { + my ($self,$taxon) = @_; + + my $count=0; + + while (defined $self->ancestor) { + $count++; + $self=$self->ancestor; + } + + return $count; +} + + +=head2 recent_common_ancestor + + Title : recent_common_ancestor + Usage : $obj->recent_common_ancestor($taxon) + Function: a method to query find common ancestors + Returns : Bio::Taxonomy::Taxon of query or undef if no ancestor of rank + Args : Bio::Taxonomy::Taxon + + +=cut + +sub recent_common_ancestor { + my ($self,$node) = @_; + + while (defined $node->ancestor) { + my $common=$self->has_taxon($node); + return $common if defined $common; + $node=$node->ancestor; + } + + return undef; +} + +=head2 species + + Title : species + Usage : $obj=$taxon->species; + Function: Returns a Bio::Species object reflecting the taxon's tree position + Returns : a Bio::Species object + Args : none + +=cut + +sub species { + my ($self) = @_; + my $species; + + if ($self->has_rank('subspecies') && $self->ancestor->rank eq 'species') { + $species = Bio::Species->new(-classification => $self->ancestor->classify); + $species->genus($self->ancestor->ancestor->taxon); + $species->species($self->ancestor->taxon); + $species->sub_species($self->taxon); + } elsif ($self->has_rank('species')) { + $species = Bio::Species->new(-classification => $self->classify); + $species->genus($self->ancestor->taxon); + $species->species($self->taxon); + } else { + $self->throw("Trying to create a species from a taxonomic entity without species rank. Use classify instead of species.\n"); + } + return $species; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Taxonomy/Tree.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Taxonomy/Tree.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,450 @@ +# $Id: Tree.pm,v 1.1 2002/11/18 22:08:33 kortsch Exp $ +# +# BioPerl module for Bio::Taxonomy::Tree +# +# Cared for by Dan Kortschak but pilfered extensively from Bio::Tree::Tree by Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Taxonomy::Tree - An Organism Level Implementation of TreeI interface. + +=head1 SYNOPSIS + + # like from a TreeIO + my $treeio = new Bio::TreeIO(-format => 'newick', -file => 'treefile.dnd'); + my $tree = $treeio->next_tree; + my @nodes = $tree->get_nodes; + my $root = $tree->get_root_node; + my @leaves = $tree->get_leaves; + + +=head1 DESCRIPTION + +This object holds handles to Taxonomic Nodes which make up a tree. + +=head1 EXAMPLES + + use Bio::Species; + use Bio::Taxonomy::Tree; + + my $human=new Bio::Species; + my $chimp=new Bio::Species; + my $bonobo=new Bio::Species; + + $human->classification(qw( sapiens Homo Hominidae + Catarrhini Primates Eutheria + Mammalia Euteleostomi Vertebrata + Craniata Chordata + Metazoa Eukaryota )); + $chimp->classification(qw( troglodytes Pan Hominidae + Catarrhini Primates Eutheria + Mammalia Euteleostomi Vertebrata + Craniata Chordata + Metazoa Eukaryota )); + $bonobo->classification(qw( paniscus Pan Hominidae + Catarrhini Primates Eutheria + Mammalia Euteleostomi Vertebrata + Craniata Chordata + Metazoa Eukaryota )); + + # ranks passed to $taxonomy match ranks of species + my @ranks = ('superkingdom','kingdom','phylum','subphylum', + 'no rank 1','no rank 2','class','no rank 3','order', + 'suborder','family','genus','species'); + + my $taxonomy=new Bio::Taxonomy(-ranks => \@ranks, + -method => 'trust', + -order => -1); + + my @nodes; + + my $tree1=new Bio::Taxonomy::Tree; + my $tree2=new Bio::Taxonomy::Tree; + + push @nodes,$tree1->make_species_branch($human,$taxonomy); + push @nodes,$tree2->make_species_branch($chimp,$taxonomy); + + my ($homo_sapiens)=$tree1->get_leaves; + + $tree1->splice($tree2); + + push @nodes,$tree1->add_species($bonobo,$taxonomy); + + my @taxa; + foreach my $leaf ($tree1->get_leaves) { + push @taxa,$leaf->taxon; + } + print join(", ",@taxa)."\n"; + + @taxa=(); + $tree1->remove_branch($homo_sapiens); + foreach my $leaf ($tree1->get_leaves) { + push @taxa,$leaf->taxon; + } + print join(", ",@taxa)."\n"; + +=head1 FEEDBACK + +See AUTHOR + +=head1 AUTHOR - Dan Kortschak + +Email kortschak@rsbs.anu.edu.au + +=head1 CONTRIBUTORS + +Mainly Jason Stajich + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Code begins... + + +package Bio::Taxonomy::Tree; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Tree::TreeFunctionsI; +use Bio::Tree::TreeI; +use Bio::Taxonomy::Taxon; + +# Import rank information from Bio::Taxonomy.pm +use vars qw(@RANK %RANK); + +@ISA = qw(Bio::Root::Root Bio::Tree::TreeI Bio::Tree::TreeFunctionsI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Taxonomy::Tree(); + Function: Builds a new Bio::Taxonomy::Tree object + Returns : Bio::Taxonomy::Tree + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->{'_rootnode'} = undef; + $self->{'_maxbranchlen'} = 0; + + my ($root)= $self->_rearrange([qw(ROOT)], @args); + if( $root ) { $self->set_root_node($root); } + return $self; +} + + +=head2 get_nodes + + Title : get_nodes + Usage : my @nodes = $tree->get_nodes() + Function: Return list of Bio::Taxonomy::Taxon objects + Returns : array of Bio::Taxonomy::Taxon objects + Args : (named values) hash with one value + order => 'b|breadth' first order or 'd|depth' first order + +=cut + +sub get_nodes{ + my ($self, @args) = @_; + + my ($order, $sortby) = $self->_rearrange([qw(ORDER SORTBY)],@args); + $order ||= 'depth'; + $sortby ||= 'height'; + + if ($order =~ m/^b|(breadth)$/oi) { + my $node = $self->get_root_node; + my @children = ($node); + for (@children) { + push @children, $_->each_Descendent($sortby); + } + return @children; + } + + if ($order =~ m/^d|(depth)$/oi) { + # this is depth-first search I believe + my $node = $self->get_root_node; + my @children = ($node,$node->get_Descendents($sortby)); + return @children; + } +} + +=head2 get_root_node + + Title : get_root_node + Usage : my $node = $tree->get_root_node(); + Function: Get the Top Node in the tree, in this implementation + Trees only have one top node. + Returns : Bio::Taxonomy::Taxon object + Args : none + +=cut + + +sub get_root_node{ + my ($self) = @_; + return $self->{'_rootnode'}; +} + +=head2 set_root_node + + Title : set_root_node + Usage : $tree->set_root_node($node) + Function: Set the Root Node for the Tree + Returns : Bio::Taxonomy::Taxon + Args : Bio::Taxonomy::Taxon + +=cut + + +sub set_root_node{ + my ($self,$value) = @_; + if( defined $value ) { + if( ! $value->isa('Bio::Taxonomy::Taxon') ) { + $self->warn("Trying to set the root node to $value which is not a Bio::Taxonomy::Taxon"); + return $self->get_root_node; + } + $self->{'_rootnode'} = $value; + } + return $self->get_root_node; +} + + +=head2 get_leaves + + Title : get_leaves + Usage : my @nodes = $tree->get_leaves() + Function: Return list of Bio::Taxonomy::Taxon objects + Returns : array of Bio::Taxonomy::Taxon objects + Args : + +=cut + + +sub get_leaves{ + my ($self) = @_; + + my $node = $self->get_root_node; + my @leaves; + my @children = ($node); + for (@children) { + push @children, $_->each_Descendent(); + } + for (@children) { + push @leaves, $_ if $_->is_Leaf; + } + return @leaves; +} + +=head2 make_species_branch + + Title : make_species_branch + Usage : @nodes = $tree->make_species_branch($species,$taxonomy) + Function: Return list of Bio::Taxonomy::Taxon objects based on a Bio::Species object + Returns : array of Bio::Taxonomy::Taxon objects + Args : Bio::Species and Bio::Taxonomy objects + +=cut + +# I'm not happy that make_species_branch and make_branch are seperate routines +# should be able to just make_branch and have it sort things out + +sub make_species_branch{ + my ($self,$species,$taxonomy) = @_; + + if (! $species->isa('Bio::Species') ) { + $self->throw("Trying to classify $species which is not a Bio::Species object"); + } + if (! $taxonomy->isa('Bio::Taxonomy') ) { + $self->throw("Trying to classify with $taxonomy which is not a Bio::Taxonomy object"); + } + + # this is done to make sure we aren't duplicating a path (let God sort them out) + if (defined $self->get_root_node) { + $self->get_root_node->remove_all_Descendents; + } + + my @nodes; + + # nb taxa in [i][0] and ranks in [i][1] + my @taxa=$taxonomy->classify($species); + + for (my $i = 0; $i < @taxa; $i++) { + $nodes[$i]=Bio::Taxonomy::Taxon->new(-taxon => $taxa[$i][0], + -rank => $taxa[$i][1]); + } + + for (my $i = 0; $i < @taxa-1; $i++) { + $nodes[$i]->add_Descendent($nodes[$i+1]); + } + + $self->set_root_node($nodes[0]); + + return @nodes; +} + + +=head2 make_branch + + Title : make_branch + Usage : $tree->make_branch($node) + Function: Make a linear Bio::Taxonomy::Tree object from a leafish node + Returns : + Args : Bio::Taxonomy::Taxon object + +=cut + + +sub make_branch{ + my ($self,$node) = @_; + + # this is done to make sure we aren't duplicating a path (let God sort them out) + # note that if you are using a linked set of node which include node + # already in the tree, this will break + $self->get_root_node->remove_all_Descendents; + + while (defined $node->ancestor) { + $self->set_root_node($node); + $node=$node->ancestor; + } +} + + +=head2 splice + + Title : splice + Usage : @nodes = $tree->splice($tree) + Function: Return a of Bio::Taxonomy::Tree object that is a fusion of two + Returns : array of Bio::Taxonomy::Taxon added to tree + Args : Bio::Taxonomy::Tree object + +=cut + + +sub splice{ + my ($self,$tree) = @_; + + my @nodes; + + my @newleaves = $tree->get_leaves; + foreach my $leaf (@newleaves) { + push @nodes,$self->add_branch($leaf); + } + + return @nodes; +} + +=head2 add_species + + Title : add_species + Usage : @nodes = $tree->add_species($species,$taxonomy) + Function: Return a of Bio::Taxonomy::Tree object with a new species added + Returns : array of Bio::Taxonomy::Taxon added to tree + Args : Bio::Species object + +=cut + + +sub add_species{ + my ($self,$species,$taxonomy) = @_; + + my $branch=Bio::Taxonomy::Tree->new; + my @nodes=$branch->make_species_branch($species,$taxonomy); + + my ($newleaf)=$branch->get_leaves; + + return $self->add_branch($newleaf); +} + +=head2 add_branch + + Title : add_branch + Usage : $tree->add_branch($node,boolean) + Function: Return a of Bio::Taxonomy::Tree object with a new branch added + Returns : array of Bio::Taxonomy::Taxon objects of the resulting tree + Args : Bio::Taxonomy::Taxon object + boolean flag to force overwrite of descendent + (see Bio::Node->add_Descendent) + +=cut + + +sub add_branch { + my ($self,$node,$force) = @_; + + my $best_node_level=0; + my ($best_node,@nodes,$common); + + my @leaves=$self->get_leaves; + foreach my $leaf (@leaves) { + $common=$node->recent_common_ancestor($leaf); # the root of the part to add + if (defined $common && ($common->distance_to_root > $best_node_level)) { + $best_node_level = $common->distance_to_root; + $best_node = $common; + } + } + + return unless defined $best_node; + + push @nodes,($self->get_root_node,$self->get_root_node->get_Descendents); + foreach my $node (@nodes) { + if ((defined $best_node->id && $best_node->id == $node->id) || + ($best_node->rank eq $node->rank && $best_node->taxon eq $node->taxon) && + ($best_node->rank ne 'no rank')) { + foreach my $descendent ($common->each_Descendent) { + $node->add_Descendent($descendent,$force); + } + } + + $self->set_root_node($node) if $node->distance_to_root==0; + } + + return ($common->get_Descendents); +} + +=head2 remove_branch + + Title : remove_branch + Usage : $tree->remove_branch($node) + Function: remove a branch up to the next multifurcation + Returns : + Args : Bio::Taxonomy::Taxon object + +=cut + + +sub remove_branch{ + my ($self,$node) = @_; + + # we can define a branch at any point along it + + while (defined $node->ancestor) { + last if $node->ancestor->each_Descendent > 1; + $node=$node->ancestor; + } + $node->remove_all_Descendents; # I'm not sure if this is necessary, + # but I don't see that remove_Descendent + # has the side effect of deleting + # descendent nodes of the deletee + $node->ancestor->remove_Descendent($node); +} + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/AlignFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/AlignFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,145 @@ +# $Id: AlignFactory.pm,v 1.8 2001/11/20 02:09:40 lstein Exp $ +# +# BioPerl module for Bio::Tools::AlignFactory +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::AlignFactory - Base object for alignment factories + +=head1 SYNOPSIS + +You wont be using this as an object, but using a dervied class +like Bio::Tools::pSW + +=head1 DESCRIPTION + +Holds common Alignment Factory attributes in place + +=head1 CONTACT + +http://bio.perl.org/ or birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tools::AlignFactory; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +@ISA = qw(Bio::Root::Root); + +BEGIN { + eval { + require Bio::Ext::Align; + }; + if ( $@ ) { + print STDERR ("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please install the bioperl-ext package\n\n"); + exit(1); + } +} + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + $self->_initialize(@args); + # set up defaults + + $self->{'kbyte'} = 20000; + $self->{'report'} = 0; + return $self; +} + + +=head2 kbyte + + Title : kbyte() + Usage : set/gets the amount of memory able to be used + Function : + : $factory->kbyte(200); + : + Returns : + Argument : memory in kilobytes + +=cut + +sub kbyte { + my ($self,$value) = @_; + + if( defined $value ) { + $self->{'kbyte'} = $value; + } + return $self->{'kbyte'}; +} + + +=head2 report + + Title : report() + Usage : set/gets the report boolean to issue reports or not + Function : + : $factory->report(1); # reporting goes on + : + Returns : n/a + Argument : 1 or 0 + +=cut + +sub report { + my ($self,$value) = @_; + + + if( defined $value ) { + if( $value != 1 && $value != 0 ) { + $self->throw("Attempting to modify AlignFactory Report with no boolean value!"); + } + $self->{'report'} = $value; + } + + return $self->{'report'}; +} + +=head2 set_memory_and_report + + Title : set_memory_and_report + Usage : Only used by subclasses. + Function: + Example : + Returns : + Args : + + +=cut + +sub set_memory_and_report{ + my ($self) = @_; + + if( $self->{'kbyte'} < 5 ) { + $self->throw("You can suggest aligning things with less than 5kb"); + } + + &Bio::Ext::Align::change_max_BaseMatrix_kbytes($self->{'kbyte'}); + + if( $self->{'report'} == 0 ) { + &Bio::Ext::Align::error_off(16); + } else { + &Bio::Ext::Align::error_on(16); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Alignment/Consed.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Alignment/Consed.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1808 @@ +# Bio::Tools::Alignment::Consed.pm +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Alignment::Consed - A module to work with objects from consed .ace files + +=head1 SYNOPSIS + + # a report for sequencing stuff + my $o_consed = new Bio::Tools::Alignment::Consed( + -acefile => "/path/to/an/acefile.ace.1", + -verbose => 1); + my $foo = $o_consed->set_reverse_designator("r"); + my $bar = $o_consed->set_forward_designator("f"); + + # get the contig numbers + my @keys = $o_consed->get_contigs(); + + # construct the doublets + my $setter_doublets = $o_consed->choose_doublets(); + + # get the doublets + my @doublets = $o_consed->get_doublets(); + +=head1 DESCRIPTION + +Bio::Tools::Alignment::Consed provides methods and objects to deal +with the output from the Consed package of programs. Specifically, +Bio::Tools::Alignment::Consed takes in the name of in .ace file and +provides objects for the results. + +A word about doublets: This module was written to accomodate a large +EST sequencing operation. In this case, EST's were sequenced from the +3' and from the 5' end of the EST. The objective was to find a +consensus sequence for these two reads. Thus, a contig of two is what +we wanted, and this contig should consist of the forward and reverse +reads of a getn clone. For example, for a forward designator of "F" +and a reverse designator of "R", if the two reads chad1F and chad1R +were in a single contig (for example Contig 5) it will be determined +that the consensus sequence for Contig 5 will be the sequence for +clone chad1. + +Doublets are good! + +This module parses .ace and related files. A detailed list of methods +can be found at the end of this document. + +I wrote a detailed rationale for design that may explain the reasons +why some things were done the way they were done. That document is +beyond the scope of this pod and can probably be found in the +directory from which this module was 'made' or at +http://www.dieselwurks.com/bioinformatics/consedpm_documentation.pdf + +Note that the pod in that document might be old but the original +rationale still stands. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 AUTHOR - Chad Matsalla + +chad@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' + +package Bio::Tools::Alignment::Consed; + +use strict; +use vars qw($VERSION @ISA $Contigs %DEFAULTS); +use FileHandle; +use Dumpvalue qw(dumpValue); +use Bio::Tools::Alignment::Trim; +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +$VERSION = '0.60'; + +BEGIN { + %DEFAULTS = ( 'f_designator' => 'f', + 'r_designator' => 'r'); +} + +=head2 new() + + Title : new(-acefile => $path_to_some_acefile, -verbose => "1") + Usage : $o_consed = Bio::Tools::Alignment::Consed-> + new(-acefile => $path_to_some_acefile, -verbose => "1"); + Function: Construct the Bio::Tools::Alignment::Consed object. Sets + verbosity for the following procedures, if necessary: + 1. Construct a new Bio::Tools::Alignment::Trim object, to + handle quality trimming 2. Read in the acefile and parse it + + Returns : A reference to a Bio::Tools::Alignment::Consed object. + Args : A hash. (-acefile) is the filename of an acefile. If a full path + is not specified "./" is prepended to the filename and used from + instantiation until destruction. If you want + Bio::Tools::Alignment::Consed to be noisy during parsing of + the acefile, specify some value for (-verbose). + +=cut + +sub new { + my ($class,%args) = @_; + my $self = $class->SUPER::new(%args); + + $self->{'filename'} = $args{'-acefile'}; + # this is special to UNIX and should probably use catfile + if (!($self->{'filename'} =~ /\//)) { + $self->{'filename'} = "./".$self->{'filename'}; + } + $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/; + $self->{'path'} = $1; + $self->_initialize_io('-file'=>$self->{'filename'}); + $self->{'o_trim'} = new Bio::Tools::Alignment::Trim(-verbose => $self->verbose()); + $self->set_forward_designator($DEFAULTS{'f_designator'}); + $self->set_reverse_designator($DEFAULTS{'r_designator'}); + + $self->_read_file(); + return $self; +} + +=head2 verbose() + + Title : verbose() + Usage : $o_consed->verbose(1); + Function: Set the verbosity level for debugging messages. On instantiation + of the Bio::Tools::Alignment::Consed object the verbosity level + is set to 0 (quiet). + Returns : 1 or 0. + Args : The verbosity levels are: + 0 - quiet + 1 - noisy + 2 - noisier + 3 - annoyingly noisy + Notes : This method for setting verbosity has largely been superseeded by + a sub-by-sub way, where for every sub you can provide a (-verbose) + switch. I am doing converting this bit-by-bit so do not be surprised + if some subs do not honour this. + +=cut + +# from RootI + +# backwards compat +sub set_verbose { (shift)->verbose(@_) } + +=head2 get_filename() + + Title : get_filename() + Usage : $o_consed->get_filename(); + Function: Returns the name of the acefile being used by the + Bio::Tools::Alignment::Consed object. + Returns : A scalar containing the name of a file. + Args : None. + +=cut + + +sub get_filename { + my $self = shift; + return $self->{'filename'}; +} + +=head2 count_sequences_with_grep() + + Title : count_sequences_with_grep() + Usage : $o_consed->count_sequences_with_grep(); + Function: Use /bin/grep to scan through the files in the ace project dir + and count sequences in those files. I used this method in the + development of this module to verify that I was getting all of the + sequences. It works, but it is (I think) unix-like platform + dependent. + Returns : A scalar containing the number of sequences in the ace project + directory. + Args : None. + Notes : If you are on a non-UNIX platform, you really do not have to use + this. It is more of a debugging routine designed to address very + specific problems. + + This method was reimplemented to be platform independent with a + pure perl implementation. The above note can be ignored. + +=cut + +sub count_sequences_with_grep { + my $self = shift; + my ($working_dir,$grep_cli,@total_grep_sequences); + # this should be migrated to a pure perl implementation ala + # Tom Christiansen's 'tcgrep' + # http://www.perl.com/language/ppt/src/grep/tcgrep + + open(FILE, $self->{'filename'}) or do { $self->warn("cannot open file ".$self->{'filename'}. " for grepping"); return}; + my $counter =0; + while(<FILE>) { $counter++ if(/^AF/); } + + close FILE; + opendir(SINGLETS,$self->{'path'}); + foreach my $f ( readdir(SINGLETS) ) { + next unless ($f =~ /\.singlets$/); + open(FILE, $self->catfile($self->{'path'},$f)) or do{ $self->warn("cannot open file ".$self->catfile($self->{'path'},$f)); next }; + while(<FILE>) { $counter++ if(/^>/) } + close FILE; + } + return $counter; +# Chad's platform implementation which required grep on the system + + # I tried to cause graceful exiting if not on *ix here + # then i took platforms from Bioperl*/PLATFORMS here. Is that good? + # print("\$^O is $^O\n"); +# if (!($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { +# $self->warn("Bio::Tools::Alignment::Consed::count_sequences_with_grep: This sub uses grep which is doesn't run on this operating system, AFAIK. Sorry .".$^O); +# return 1; +# } +# $grep_cli = `which grep`; +# if (!$grep_cli) { +# $self->warn("I couldn't see to find grep on this system, or the which command is broken. Bio::Tools::Alignment::Consed::count_sequences_with_grep requires grep and which to find grep."); +# return 1; +# } +# chomp $grep_cli; +# push(@total_grep_sequences, my @foo = `$grep_cli ^AF $self->{filename}`); +# my $cli = "$grep_cli \\> $self->{'path'}*.singlets"; +# push(@total_grep_sequences, @foo = `$cli`); +# return scalar(@total_grep_sequences); +} + +=head2 get_path() + + Title : get_path() + Usage : $o_consed->get_path(); + Function: Returns the path to the acefile this object is working with. + Returns : Scalar. The path to the working acefile. + Args : None. + +=cut + +sub get_path { + my $self = shift; + return $self->{'path'}; +} + +=head2 get_contigs() + + Title : get_contigs() + Usage : $o_consed->get_contigs(); + Function: Return the keys to the Bio::Tools::Alignment::Consed object. + Returns : An array containing the keynames in the + Bio::Tools::Alignment::Consed object. + Args : None. + Notes : This would normally be used to get the keynames for some sort of + iterator. These keys are worthless in general day-to-day use because + in the Consed acefile they are simply Contig1, Contig2, ... + +=cut + +sub get_contigs { + my ($self,$contig) = @_; + my @keys = (sort keys %{$self->{'contigs'}}); + return @keys; +} + +=head2 get_class($contig_keyname) + + Title : get_class($contig_keyname) + Usage : $o_consed->get_class($contig_keyname); + Function: Return the class name for this contig + Returns : A scalar representing the class of this contig. + Args : None. + Notes : + +=cut + +sub get_class { + my ($self,$contig) = @_; + return $self->{contigs}->{$contig}->{class}; +} + +=head2 get_quality_array($contig_keyname) + + Title : get_quality_array($contig_keyname) + Usage : $o_consed->get_quality_array($contig_keyname); + Function: Returns the quality for the consensus sequence for the given + contig as an array. See get_quality_scalar to get this as a scalar. + Returns : An array containing the quality for the consensus sequence with + the given keyname. + Args : The keyname of a contig. Note: This is a keyname. The key would + normally come from get_contigs. + Notes : Returns an array, not a reference. Is this a bug? <thinking> No. + Well, maybe. + Why was this developed like this? I was using FreezeThaw for object + persistence, and when it froze out these arrays it took a long time + to thaw it. Much better as a scalar. + +See L<get_quality_scalar()|get_quality_scalar> + +=cut + +sub get_quality_array { + my ($self,$contig) = @_; + my $quality = $self->{contigs}->{$contig}->{quality}; + # chad, what is with the s/// ? + # my @qualities = split + # (' ',($self->{contigs}->{$contig}->{quality} =~ s/\s+//)); + my @qualities = split + (' ',$self->{contigs}->{$contig}->{quality}); + return @qualities; +} + +=head2 get_quality_scalar($contig_keyname)) + + Title : get_quality_scalar($contig_keyname) + Usage : $o_consed->get_quality_scalar($contig_keyname); + Function: Returns the quality for the consensus sequence for the given + contig as a scalar. See get_quality_array to get this as an array. + Returns : An scalar containing the quality for the consensus sequence with + the given keyname. + Args : The keyname of a contig. Note this is a _keyname_. The key would + normally come from get_contigs. + Notes : Why was this developed like this? I was using FreezeThaw for object + persistence, and when it froze out these arrays it took a coon's age + to thaw it. Much better as a scalar. + +See L<get_quality_array()|get_quality_array> + +=cut + +#' +sub get_quality_scalar { + my ($self,$contig) = @_; + return $self->{'contigs'}->{$contig}->{'quality'}; +} + +=head2 freeze_hash() + + Title : freeze_hash() + Usage : $o_consed->freeze_hash(); + + Function: Use Ilya's FreezeThaw module to create a persistent data + object for this Bio::Tools::Alignment::Consed data + structure. In the case of AAFC, we use + Bio::Tools::Alignment::Consed to pre-process bunches of + sequences, freeze the structures, and send in a harvesting + robot later to do database stuff. + Returns : 0 or 1; + Args : None. + Notes : This procedure was removed so Consed.pm won't require + FreezeThaw. + + +=cut + +#' +sub freeze_hash { + my $self = shift; + $self->warn("This method (freeze_hash) was removed from the bioperl consed.pm. Sorry.\n"); + if (1==2) { + $self->debug("Bio::Tools::Alignment::Consed::freeze_hash: \$self->{path} is $self->{path}\n"); + my $filename = $self->{'path'}."frozen"; + my %contigs = %{$self->{'contigs'}}; + my $frozen = freeze(%contigs); + umask 0001; + open (FREEZE,">$filename") or do { + $self->warn( "Bio::Tools::Alignment::Consed could not freeze the contig hash because the file ($filename) could not be opened: $!\n"); + return 1; + }; + print FREEZE $frozen; + close FREEZE; + return 0; + } +} + +=head2 get_members($contig_keyname) + + Title : get_members($contig_keyname) + Usage : $o_consed->get_members($contig_keyname); + Function: Return the _names_ of the reads in this contig. + Returns : An array containing the names of the reads in this contig. + Args : The keyname of a contig. Note this is a keyname. The keyname + would normally come from get_contigs. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_members { + my ($self,$contig) = @_; + if (!$contig) { + $self->warn("You need to provide the name of a contig to use Bio::Tools::Alignment::Consed::get_members!\n"); + return; + } + return @{$self->{'contigs'}->{$contig}->{'member_array'}}; +} + +=head2 get_members_by_name($some_arbitrary_name) + + Title : get_members_by_name($some_arbitrary_name) + Usage : $o_consed->get_members_by_name($some_arbitrary_name); + Function: Return the names of the reads in a contig. This is the name given + to $contig{key} based on what is in the contig. This is different + from the keys retrieved through get_contigs(). + Returns : An array containing the names of the reads in the contig with this + name. + Args : The name of a contig. Not a key, but a name. + Notes : Highly inefficient. use some other method if possible. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_members_by_name { + my ($self,$name) = @_; + # build a list to try to screen for redundancy + my @contigs_with_that_name; + foreach my $currkey ( sort keys %{$self->{'contigs'}} ) { + next if (!$self->{'contigs'}->{$currkey}->{'name'}); + if ($self->{'contigs'}->{$currkey}->{'name'} eq "$name") { + push @contigs_with_that_name,$currkey; + } + } + my $count = @contigs_with_that_name; + if ($count == 1) { + my $contig_num = $contigs_with_that_name[0]; + return @{$self->{'contigs'}->{$contig_num}->{'member_array'}}; + } +} + +=head2 get_contig_number_by_name($some_arbitrary_name) + + Title : get_contig_number_by_name($some_arbitrary_name) + Usage : $o_consed->get_contig_number_by_name($some_arbitrary_name); + Function: Return the names of the reads in a contig. This is the name given + to $contig{key} based on what is in the contig. This is different + from the keys retrieved through get_contigs(). + Returns : An array containing the names of the reads in the contig with this + name. + Args : The name of a contig. Not a key, but a name. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_contig_number_by_name { + my ($self,$name) = @_; + foreach my $currkey (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$currkey}->{'name'} && + $self->{'contigs'}->{$currkey}->{'name'} eq "$name") { + return $currkey; + } + } +} + +=head2 get_sequence($contig_keyname) + + Title : get_sequence($contig_keyname) + Usage : $o_consed->get_sequence($contig_keyname); + Function: Returns the consensus sequence for a given contig. + Returns : A scalar containing a sequence. + Args : The keyname of a contig. Note this is a key. The key would + normally come from get_contigs. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_sequence { + my ($self,$contig) = @_; + return $self->{'contigs'}->{$contig}->{'consensus'}; +} + +=head2 set_final_sequence($some_sequence) + + Title : set_final_sequence($name,$some_sequence) + Usage : $o_consed->set_final_sequence($name,$some_sequence); + Function: Provides a manual way to set the sequence for a given key in the + contig hash. Rarely used. + Returns : 0 or 1; + Args : The name (not the keyname) of a contig and an arbitrary string. + Notes : A method with a questionable and somewhat mysterious origin. May + raise the dead or something like that. + +=cut + +sub set_final_sequence { + my ($self,$name,$sequence) = @_; + if (!$self->{'contigs'}->{$name}) { + $self->warn("You cannot set the final sequence for $name because it doesn't exist!\n"); + return 1; + } + else { + $self->{'contigs'}->{$name}->{'final_sequence'} = $sequence; + } + return 0; +} + +=head2 _read_file() + + Title : _read_file(); + Usage : _read_file(); + Function: An internal subroutine used to read in an acefile and parse it + into a Bio::Tools::Alignment::Consed object. + Returns : 0 or 1. + Args : Nothing. + Notes : This routine creates and saves the filhandle for reading the + files in {fh} + +=cut + +sub _read_file { + my ($self) = @_; + my ($line,$in_contig,$in_quality,$contig_number,$top); + # make it easier to type $fhl + while (defined($line=$self->_readline()) ) { + chomp $line; + # check if there is anything on this line + # if not, you can stop gathering consensus sequence + if (!$line) { + # if the line is blank you are no longer to gather consensus + # sequence or quality values + $in_contig = 0; + $in_quality = 0; + } + # you are currently gathering consensus sequence + elsif ($in_contig) { + if ($in_contig == 1) { + $self->debug("Adding $line to consensus of contig number $contig_number.\n"); + $self->{'contigs'}->{$contig_number}->{'consensus'} .= $line; + } + } + elsif ($in_quality) { + if (!$line) { + $in_quality = undef; + } + else { + # I wrote this in here because acefiles produced by cap3 do not have a leading space + # like the acefiles produced by phrap and there is the potential to have concatenated + # quality values like this: 2020 rather then 20 20 whre lines collide. Thanks Andrew for noticing. + if ($self->{'contigs'}->{$contig_number}->{'quality'} && !($self->{'contigs'}->{$contig_number}->{'quality'} =~ m/\ $/)) { + $self->{'contigs'}->{$contig_number}->{'quality'} .= " "; + } + $self->{'contigs'}->{$contig_number}->{'quality'} .= $line; + } + } + elsif ($line =~ /^BQ/) { + $in_quality = 1; + } + # the line /^CO/ like this: + # CO Contig1 796 1 1 U + # can be broken down as follows: + # CO - Contig! + # Contig1 - the name of this contig + # 796 - Number of bases in this contig + # 1 - Number of reads in this contig + # 1 - number of base segments in this contig + # U - Uncomplemented + elsif ($line =~ /^CO/) { + $line =~ m/^CO\ Contig(\d+)\ \d+\ \d+\ \d+\ (\w)/; + $contig_number = $1; + if ($2 eq "C") { + $self->debug("Contig $contig_number is complemented!\n"); + } + $self->{'contigs'}->{$contig_number}->{'member_array'} = []; + $self->{'contigs'}->{$contig_number}->{'contig_direction'} = "$2"; + $in_contig = 1; + } + # 000713 + # this BS is deprecated, I think. + # haha, I am really witty. <ew> + elsif ($line =~ /^BSDEPRECATED/) { + $line =~ m/^BS\s+\d+\s+\d+\s+(.+)/; + my $member = $1; + $self->{'contigs'}->{$contig_number}->{$member}++; + } + # the members of the contigs are determined by the AF line in the ace file + elsif ($line =~ /^AF/) { + $self->debug("I see an AF line here.\n"); + $line =~ /^AF\ (\S+)\ (\w)\ (\S+)/; + # push the name of the current read onto the member array for this contig + push @{$self->{'contigs'}->{$contig_number}->{'member_array'}},$1; + # the first read in the contig will be named the "top" read + if (!$top) { + $self->debug("\$top is not set.\n"); + if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") { + $self->debug("Reversing the order of the reads. The bottom will be $1\n"); + # if the contig sequence is marked as the complement, the top becomes the bottom and$ + $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3; + } + else { + $self->debug("NOT reversing the order of the reads. The top_name will be $1\n"); + # if the contig sequence is marked as the complement, the top becomes the bottom and$ + $self->{'contigs'}->{$contig_number}->{'top_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'top_start'} = $3; + } + $top = 1; + } + else { + # if the contig sequence is marked as the complement, the top becomes the bottom and the bottom becomes the top + if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") { + $self->debug("Reversing the order of the reads. The top will be $1\n"); + $self->{'contigs'}->{$contig_number}->{'top_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'top_start'} = $3; + } + else { + $self->debug("NOT reversing the order of the reads. The bottom will be $1\n"); + $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3; + } + $top = undef; + } + } + } + return 0; +} + +=head2 set_reverse_designator($some_string) + + Title : set_reverse_designator($some_string) + Usage : $o_consed->set_reverse_designator($some_string); + Function: Set the designator for the reverse read of contigs in this + Bio::Tools::Alignment::Consed object. Used to determine if + contigs containing two reads can be named. + Returns : The value of $o_consed->{reverse_designator} so you can check + to see that it was set properly. + Args : An arbitrary string. + Notes : May be useful only to me. <shrug> + +=cut + +sub set_reverse_designator { + my ($self,$reverse_designator) = @_; + $self->{'reverse_designator'} = $reverse_designator; + $self->{'o_trim'}->set_reverse_designator($reverse_designator); + return $self->{'reverse_designator'}; +} # end set_reverse_designator + +=head2 set_forward_designator($some_string) + + Title : set_forward_designator($some_string) + Usage : $o_consed->set_forward_designator($some_string); + Function: Set the designator for the forward read of contigs in this + Bio::Tools::Alignment::Consed object. Used to determine if + contigs containing two reads can be named. + Returns : The value of $o_consed->{forward_designator} so you can check + to see that it was set properly. + Args : An arbitrary string. + Notes : May be useful only to me. <shrug> + +=cut + +sub set_forward_designator { + my ($self,$forward_designator) = @_; + $self->{'forward_designator'} = $forward_designator; + $self->{'o_trim'}->set_forward_designator($forward_designator); + return $self->{'forward_designator'}; +} # end set_forward_designator + +=head2 set_designator_ignore_case("yes") + + Title : set_designator_ignore_case("yes") + Usage : $o_consed->set_designator_ignore_case("yes"); + Function: Deprecated. + Returns : Deprecated. + Args : Deprecated. + Notes : Deprecated. Really. Trust me. + +=cut + +sub set_designator_ignore_case { + my ($self,$ignore_case) = @_; + if ($ignore_case eq "yes") { + $self->{'designator_ignore_case'} = 1; + } + return $self->{'designator_ignore_case'}; +} # end set_designator_ignore_case + +=head2 set_trim_points_singlets_and_singletons() + + Title : set_trim_points_singlets_and_singletons() + Usage : $o_consed->set_trim_points_singlets_and_singletons(); + Function: Set the trim points for singlets and singletons based on + quality. Uses the Bio::Tools::Alignment::Trim object. Use + at your own risk because the Bio::Tools::Alignment::Trim + object was designed specifically for me and is mysterious + in its ways. Every time somebody other then me uses it a + swarm of locusts decends on a small Central American + village so do not say you weren't warned. + Returns : Nothing. + Args : None. + Notes : Working on exceptions and warnings here. + +See L<Bio::Tools::Alignment::Trim> for more information + +=cut + +#' to make my emacs happy + +sub set_trim_points_singlets_and_singletons { + my ($self) = @_; + $self->debug("Consed.pm : \$self is $self\n"); + my (@points,$trimmed_sequence); + if (!$self->{'doublets_set'}) { + $self->debug("You need to set the doublets before you use set_trim_points_singlets_and_doublets. Doing that now."); + $self->set_doublets(); + } + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$_}->{'class'} eq "singlet") { + $self->debug("Singlet $_\n"); + # this is what Warehouse wants + # my ($self,$sequence,$quality,$name) = @_; + # this is what Bio::Tools::Alignment::Trim::trim_singlet wants: + # my ($self,$sequence,$quality,$name,$class) = @_; + # the following several lines are to make the parameter passing legible. + my ($sequence,$quality,$name,$class); + $sequence = $self->{'contigs'}->{$_}->{'consensus'}; + if (!$self->{'contigs'}->{$_}->{'quality'}) { $quality = "unset"; } + else { $quality = $self->{'contigs'}->{$_}->{'quality'}; } + $name = $self->{'contigs'}->{$_}->{'name'}; + $class = $self->{'contigs'}->{$_}->{'class'}; + (@points) = @{$self->{'o_trim'}->trim_singlet($sequence,$quality,$name,$class)}; + $self->{'contigs'}->{$_}->{'start_point'} = $points[0]; + $self->{'contigs'}->{$_}->{'end_point'} = $points[1]; + $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]); + } + } + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_singlets_and_singletons: Done setting the quality trimpoints.\n"); + return; +} # end set_trim_points_singlet + +=head2 set_trim_points_doublets() + + Title : set_trim_points_doublets() + Usage : $o_consed->set_trim_points_doublets(); + Function: Set the trim points for doublets based on quality. Uses the + Bio::Tools::Alignment::Trim object. Use at your own risk because + the Bio::Tools::Alignment::Trim object was designed specifically + for me and is mysterious in its ways. Every time somebody other + then me uses it you risk a biblical plague being loosed on your + city. + Returns : Nothing. + Args : None. + Notes : Working on exceptions here. + +See L<Bio::Tools::Alignment::Trim> for more information + +=cut + +sub set_trim_points_doublets { + my $self = shift; + my @points; + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Restoring zeros for doublets.\n"); + # &show_missing_sequence($self); + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Setting doublet trim points.\n"); + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$_}->{'class'} eq "doublet") { + # my ($self,$sequence,$quality,$name,$class) = @_; + my @quals = split(' ',$self->{'contigs'}->{$_}->{'quality'}); + + (@points) = $self->{o_trim}->trim_doublet($self->{'contigs'}->{$_}->{'consensus'},$self->{'contigs'}->{$_}->{'quality'},$self->{'contigs'}->{$_}->{name},$self->{'contigs'}->{$_}->{'class'}); + $self->{'contigs'}->{$_}->{'start_point'} = $points[0]; + $self->{'contigs'}->{$_}->{'end_point'} = $points[1]; + # now set this + $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]); + # 010102 the deprecated way to do things: + } + } + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Done setting doublet trim points.\n"); + return; +} # end set_trim_points_doublets + +=head2 get_trimmed_sequence_by_name($name) + + Title : get_trimmed_sequence_by_name($name) + Usage : $o_consed->get_trimmed_sequence_by_name($name); + Function: Returns the trimmed_sequence of a contig with {name} eq $name. + Returns : A scalar- the trimmed sequence. + Args : The {name} of a contig. + Notes : + +=cut + +sub get_trimmed_sequence_by_name { + my ($self,$name) = @_; + my $trimmed_sequence; + my $contigname = &get_contig_number_by_name($self,$name); + my $class = $self->{'contigs'}->{$contigname}->{'class'}; + # what is this business and who was smoking crack while writing this? + # if ($class eq "singlet") { + # send the sequence, the quality, and the name + # $trimmed_sequence = $self->{o_trim}->trim_singlet($self->{'contigs'}->{$contigname}->{consensus},$self->{'contigs'}->{$contigname}->{'quality'},$name); + # } + return $self->{'contigs'}->{$contigname}->{'sequence_trimmed'}; +} + +=head2 set_dash_present_in_sequence_name("yes") + + Title : set_dash_present_in_sequence_name("yes") + Usage : $o_consed->set_dash_present_in_sequence_name("yes"); + Function: Deprecated. Part of an uncompleted thought. ("Oooh! Shiny!") + Returns : Nothing. + Args : "yes" to set {dash_present_in_sequence_name} to 1 + Notes : + +=cut + +sub set_dash_present_in_sequence_name { + my ($self,$dash_present) = @_; + if ($dash_present eq "yes") { + $self->{'dash_present_in_sequence_name'} = 1; + } + else { + $self->{'dash_present_in_sequence_name'} = 0; + } + return $self->{'dash_present_in_sequence_name'}; +} # end set_dash_present_in_sequence_name + +=head2 set_doublets() + + Title : set_doublets() + Usage : $o_consed->set_doublets(); + Function: Find pairs that have similar names and mark them as doublets + and set the {name}. + Returns : 0 or 1. + Args : None. + Notes : A complicated subroutine that iterates over the + Bio::Tools::Alignment::Consed looking for contigs of 2. If the + forward and reverse designator are removed from each of the reads + in {'member_array'} and the remaining reads are the same, {name} + is set to that name and the contig's class is set as "doublet". + If any of those cases fail the contig is marked as a "pair". + +=cut + +#' make my emacs happy + +sub set_doublets { + my ($self) = @_; + # set the designators in the Bio::Tools::Alignment::Trim object + + $self->{'o_trim'}->set_designators($self->{'reverse_designator'}, + $self->{'forward_designator'}); + # + foreach my $key_contig (sort keys %{$self->{'contigs'}}) { + + # if there is a member array (why would there not be? This should be a die()able offence + # but for now I will leave it + if ($self->{'contigs'}->{$key_contig}->{'member_array'}) { + # if there are two reads in this contig + # i am pretty sure that this is wrong but i am keeping it for reference + # if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2 || !$self->{'contigs'}->{$key_contig}->{'class'}) { + # <seconds later> + # <nod> WRONG. Was I on crack? + if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2) { + $self->{'contigs'}->{$key_contig}->{'num_members'} = 2; + $self->debug("\tThere are 2 members! Looking for the contig name...\n"); + my $name = _get_contig_name($self,$self->{'contigs'}->{$key_contig}->{'member_array'}); + $self->debug("The name is $name\n") if defined $name; + if ($name) { + $self->{'contigs'}->{$key_contig}->{'name'} = $name; + $self->{'contigs'}->{$key_contig}->{'class'} = "doublet"; + } + else { + $self->debug("$key_contig is a pair.\n"); + $self->{'contigs'}->{$key_contig}->{'class'} = "pair"; + } + } + # this is all fair and good but what about singlets? + # they have one reads in the member_array but certainly are not singletons + elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 1) { + # set the name to be the name of the read + $self->{'contigs'}->{$key_contig}->{name} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}[0]; + # set the number of members to be one + $self->{'contigs'}->{$key_contig}->{num_members} = 1; + # if this was a singlet, it would already belong to the class "singlet" + # so leave it alone + # if it is not a singlet, it is a singleton! lablel it appropriately + unless ($self->{'contigs'}->{$key_contig}->{'class'}) { + $self->{'contigs'}->{$key_contig}->{'class'} = "singleton"; + } + } + # set the multiplet characteristics + elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} >= 3) { + $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}; + $self->{'contigs'}->{$key_contig}->{'class'} = "multiplet"; + } + $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}; + + } + } + $self->{'doublets_set'} = "done"; + return 0; +} # end set_doublets + +=head2 set_singlets + + Title : set_singlets + Usage : $o_consed->set_singlets(); + Function: Read in a singlets file and place them into the + Bio::Tools::Alignment::Consed object. + Returns : Nothing. + Args : A scalar to turn on verbose parsing of the singlets file. + Notes : + +=cut + +sub set_singlets { + # parse out the contents of the singlets file + my ($self) = @_; + $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n"); + my $full_filename = $self->{'filename'}; + $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n"); + $full_filename =~ m/(.*\/)(.*ace.*)$/; + my ($base_path,$filename) = ($1,$2); + $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n"); + $filename =~ m/(.*)ace.*$/; + my $singletsfile = $base_path.$1."singlets"; + $self->debug("\$singletsfile is $singletsfile\n"); + if (-f $singletsfile) { + $self->debug("$singletsfile is indeed a file. Trying to open it...\n"); + } + my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile); + my ($sequence,$name,$count); + while ($_ = $singlets_fh->_readline()) { + chomp $_; + if (/\>/) { + if ($name && $sequence) { + $self->debug("Adding $name with sequence $sequence to hash...\n"); + push @{$self->{'contigs'}->{$name}->{'member_array'}},$name; + $self->{'contigs'}->{$name}->{'consensus'} = $sequence; + $self->{'contigs'}->{$name}->{'name'} = $name; + $self->{'contigs'}->{$name}->{"singlet"} = 1; + $self->{'contigs'}->{$name}->{'class'} = "singlet"; + } + $sequence = $name = undef; + $count++; + m/^\>(.*)\s\sCHROMAT/; + $name = $1; + if (!$name) { + m/\>(\S+)\s/; + $name = $1; + } + } + else { $sequence .= $_; } + } + if ($name && $sequence) { + $self->debug("Pushing the last of the singlets ($name)\n"); + @{$self->{'contigs'}->{$name}->{'member_array'}} = $name; + $self->{'contigs'}->{$name}->{'consensus'} = $sequence; + $self->{'contigs'}->{$name}->{'name'} = $name; + $self->{'contigs'}->{$name}->{"singlet"} = 1; + $self->{'contigs'}->{$name}->{'class'} = "singlet"; + } + $self->debug("Bio::Tools::Alignment::Consed::set_singlets: Done adding singlets to the singlets hash.\n"); + $self->{'singlets_set'} = "done"; + return 0; +} # end sub set_singlets + +=head2 get_singlets() + + Title : get_singlets() + Usage : $o_consed->get_singlets(); + Function: Return the keynames of the singlets. + Returns : An array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "singlet". + Args : None. + Notes : + +=cut + +sub get_singlets { + # returns an array of singlet names + # singlets have "singlet"=1 in the hash + my $self = shift; + if (!$self->{singlets_set}) { + $self->debug("You need to set the singlets before you get them. Doing that now."); + $self->set_singlets(); + } + + my (@singlets,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + # @array = @{$Consed::contigs{$key}->{'member_array'}}; + # somethimes a user will try to get a list of singlets before the classes for the rest of the + # contigs has been set (see t/test.t for how I figured this out. <bah> + # so either way, just return class=singlets + if (!$self->{'contigs'}->{$key}->{'class'}) { + # print("$key has no class. why?\n"); + } + elsif ($self->{'contigs'}->{$key}->{'class'} eq "singlet") { + push @singlets,$key; + } + } + return @singlets; +} + +=head2 set_quality_by_name($name,$quality) + + Title : set_quality_by_name($name,$quality) + Usage : $o_consed->set_quality_by_name($name,$quality); + Function: Deprecated. Make the contig with {name} have {'quality'} $quality. + Probably used for testing. + Returns : Nothing. + Args : The name of a contig and a scalar for its quality. + Notes : Deprecated. + +=cut + +sub set_quality_by_name { + # this is likely deprecated + my ($self,$name,$quality) = shift; + my $return; + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'} eq "$name" || $self->{'contigs'}->{'name'} eq "$name") { + $self->{'contigs'}->{'quality'} = $quality; + $return=1; + } + } + if ($return) { return "0"; } else { return "1"; } +} # end set quality by name + +=head2 set_singlet_quality() + + Title : set_singlet_quality() + Usage : $o_consed->set_singlet_quality(); + Function: For each singlet, go to the appropriate file in phd_dir and read + in the phred quality for that read and place it into {'quality'} + Returns : 0 or 1. + Args : None. + Notes : This is the next subroutine that will receive substantial revision + in the next little while. It really should eval the creation of + Bio::Tools::Alignment::Phred objects and go from there. + +=cut + +sub set_singlet_quality { + my $self = shift; + my $full_filename = $self->{'filename'}; + $full_filename =~ m/(.*\/)(.*)ace.*$/; + my ($base_path,$filename) = ($1,"$2"."qual"); + my $singletsfile = $base_path.$filename; + if (-f $singletsfile) { + # print("$singletsfile is indeed a file. Trying to open it...\n"); + } + else { + $self->warn("$singletsfile is not a file. Sorry.\n"); + return; + } + my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile); + my ($sequence,$name,$count); + my ($identity,$line,$quality,@qline); + while ($line = $singlets_fh->_readline()) { + chomp $line; + if ($line =~ /^\>/) { + $quality = undef; + $line =~ m/\>(\S*)\s/; + $identity = $1; + } + else { + if ($self->{'contigs'}->{$identity}) { + $self->{'contigs'}->{$identity}->{'quality'} .= "$line "; + } + } + + } + return 0; +} + +=head2 set_contig_quality() + + Title : set_contig_quality() + Usage : $o_consed->set_contig_quality(); + Function: Deprecated. + Returns : Deprecated. + Args : Deprecated. + Notes : Deprecated. Really. Trust me. + +=cut + +sub set_contig_quality { + # note: contigs _include_ singletons but _not_ singlets + my ($self) = shift; + # the unexpected results I am referring to here are a doubling of quality values. + # the profanity I uttered on discovering this reminded me of the simpsons: + # Ned Flanders: "That is the loudest profanity I have ever heard!" + $self->warn("set_contig_quality is deprecated and will likely produce unexpected results"); + my $full_filename = $self->{'filename'}; + # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual + # from Consed.pm + $full_filename =~ m/(.*\/)(.*)ace.*$/; + my ($base_path,$filename) = ($1,"$2"."contigs.qual"); + my $singletsfile = $base_path.$filename; + if (-f $singletsfile) { + # print("$singletsfile is indeed a file. Trying to open it...\n"); + } + else { + $self->warn("Bio::Tools::Alignment::Consed::set_contig_quality $singletsfile is not a file. Sorry.\n"); + return; + } + my $contig_quality_fh = Bio::Root::IO->new(-file => $singletsfile); + + my ($sequence,$name,$count,$identity,$line,$quality); + while ($line = $contig_quality_fh->_readline()) { + chomp $line; + if ($line =~ /^\>/) { + $quality = undef; + $line =~ m/\>.*Contig(\d+)\s/; + $identity = $1; + } + else { + if ($self->{'contigs'}->{$identity} ) { + $self->{'contigs'}->{$identity}->{'quality'} .= " $line"; + } + } + } +} # end set_contig_quality + +=head2 get_multiplets() + + Title : get_multiplets() + Usage : $o_consed->get_multiplets(); + Function: Return the keynames of the multiplets. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "multiplet". + Args : None. + Notes : + +=cut + +sub get_multiplets { + # returns an array of multiplet names + # multiplets have # members > 2 + my $self = shift; + my (@multiplets,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$key}->{'class'}) { + if ($self->{'contigs'}->{$key}->{'class'} eq "multiplet") { + push @multiplets,$key; + } + } + } + return @multiplets; +} + +=head2 get_all_members() + + Title : get_all_members() + Usage : @all_members = $o_consed->get_all_members(); + Function: Return a list of all of the read names in the + Bio::Tools::Alignment::Consed object. + Returns : An array containing all of the elements in all of the + {'member_array'}s. + Args : None. + Notes : + +=cut + +sub get_all_members { + my $self = shift; + my @members; + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($key =~ /^singlet/) { + push @members,$self->{'contigs'}->{$key}->{'member_array'}[0]; + } + elsif ($self->{'contigs'}->{$key}->{'member_array'}) { + push @members,@{$self->{'contigs'}->{$key}->{'member_array'}}; + } + # else { + # print("Bio::Tools::Alignment::Consed: $key is _not_ an array. Pushing $self->{'contigs'}->{$key}->{'member_array'} onto \@members\n"); + # push @members,$self->{'contigs'}->{$key}->{'member_array'}; + # } + } + return @members; +} + +=head2 sum_lets($total_only) + + Title : sum_lets($total_only) + Usage : $statistics = $o_consed->sum_lets($total_only); + Function: Provide numbers for how many sequences were accounted for in the + Bio::Tools::Alignment::Consed object. + Returns : If a scalar is present, returns the total number of + sequences accounted for in all classes. If no scalar passed + then returns a string that looks like this: + Singt/singn/doub/pair/mult/total : 2,0,1(2),0(0),0(0),4 + This example means the following: There were 1 singlets. + There were 0 singletons. There were 1 doublets for a total + of 2 sequences in this class. There were 0 pairs for a + total of 0 sequences in this class. There were 0 + multiplets for a total of 0 sequences in this class. There + were a total of 4 sequences accounted for in the + Bio::Tools::Alignment::Consed object. + Args : A scalar is optional to change the way the numbers are returned. + Notes: + +=cut + +sub sum_lets { + my ($self,$total_only) = @_; + my ($count,$count_multiplets,$multiplet_count); + my $singlets = &get_singlets($self); $count += $singlets; + my $doublets = &get_doublets($self); $count += ($doublets * 2); + my $pairs = &get_pairs($self); $count += ($pairs * 2); + my $singletons = &get_singletons($self); $count += $singletons; + my @multiplets = &get_multiplets($self); + $count_multiplets = @multiplets; + my $return_string; + foreach (@multiplets) { + my $number_members = $self->{'contigs'}->{$_}->{num_members}; + $multiplet_count += $number_members; + } + if ($multiplet_count) { + $count += $multiplet_count; + } + foreach (qw(multiplet_count singlets doublets pairs singletons multiplets count_multiplets)) { + no strict 'refs'; # renege for the block + if (!${$_}) { + ${$_} = 0; + } + } + if (!$multiplet_count) { $multiplet_count = 0; } + if ($total_only) { + return $count; + } + $return_string = "Singt/singn/doub/pair/mult/total : $singlets,$singletons,$doublets(". + ($doublets*2)."),$pairs(".($pairs*2)."),$count_multiplets($multiplet_count),$count"; + return $return_string; +} + +=head2 write_stats() + + Title : write_stats() + Usage : $o_consed->write_stats(); + Function: Write a file called "statistics" containing numbers similar to + those provided in sum_lets(). + Returns : Nothing. Write a file in $o_consed->{path} containing something + like this: + + 0,0,50(100),0(0),0(0),100 + + Where the numbers provided are in the format described in the + documentation for sum_lets(). + Args : None. + Notes : This might break platform independence, I do not know. + +See L<sum_lets()|sum_lets> + +=cut + +sub write_stats { + # worry about platform dependence here? + # oh shucksdarn. + my $self = shift; + my $stats_filename = $self->{'path'}."statistics"; + my $statistics_raw = $self->sum_lets; + (my $statsfilecontents = $statistics_raw) =~ s/.*\ \:\ //g; + umask 0001; + my $fh = new Bio::Root::IO(-file=>"$stats_filename"); + # open(STATSFILE,">$stats_filename") or print("Could not open the statsfile: $!\n"); + $fh->_print("$statsfilecontents"); + # close STATSFILE; + $fh->close(); +} + +=head2 get_singletons() + + Title : get_singletons() + Usage : @singletons = $o_consed->get_singletons(); + Function: Return the keynames of the singletons. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "singleton". + Args : None. + Notes : + +=cut + +sub get_singletons { + # returns an array of singleton names + # singletons are contigs with one member (see consed documentation) + my $self = shift; + my (@singletons,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$key}->{'class'}) { + # print ("$key class: $self->{'contigs'}->{$key}->{'class'}\n"); + } + else { + # print("$key belongs to no class. why?\n"); + } + if ($self->{'contigs'}->{$key}->{'member_array'}) { + @array = @{$self->{'contigs'}->{$key}->{'member_array'}}; + } + my $num_array_elem = @array; + if ($num_array_elem == 1 && $self->{'contigs'}->{$key}->{'class'} && $self->{'contigs'}->{$key}->{'class'} eq "singleton") { push @singletons,$key; } + } + return @singletons; +} + +=head2 get_pairs() + + Title : get_pairs() + Usage : @pairs = $o_consed->get_pairs(); + Function: Return the keynames of the pairs. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "pair". + Args : None. + Notes : + +=cut + +sub get_pairs { + # returns an array of pair contig names + # a pair is a contig of two where the names do not match + my $self = shift; + my (@pairs,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$key}->{'member_array'}) { + if (@{$self->{'contigs'}->{$key}->{'member_array'}} == 2 && + $self->{'contigs'}->{$key}->{'class'} eq "pair") { + push @pairs,$key; + } + } + } + return @pairs; +} + +=head2 get_name($contig_keyname) + + Title : get_name($contig_keyname) + Usage : $name = $o_consed->get_name($contig_keyname); + Function: Return the {name} for $contig_keyname. + Returns : A string. ({name}) + Args : A contig keyname. + Notes : + +=cut + +sub get_name { + my ($self,$contig) = @_; + return $self->{'contigs'}->{$contig}->{'name'}; +} + +=head2 _get_contig_name(\@array_containing_reads) + + Title : _get_contig_name(\@array_containing_reads) + Usage : $o_consed->_get_contig_name(\@array_containing_reads); + Function: The logic for the set_doublets subroutine. + Returns : The name for this contig. + Args : A reference to an array containing read names. + Notes : Depends on reverse_designator. Be sure this is set the way you + intend. + +=cut + +sub _get_contig_name { + my ($self,$r_array) = @_; + my @contig_members = @$r_array; + my @name_nodir; + foreach (@contig_members) { + # how can I distinguish the clone name from the direction label? + # look for $Consed::reverse_designator and $Consed::forward_designator + # what if you do not find _any_ of those? + my $forward_designator = $self->{'forward_designator'} || "f"; + my $reverse_designator = $self->{'reverse_designator'} || "r"; + my $any_hits = /(.+)($forward_designator.*)/ || /(.+)($reverse_designator.*)/||/(.+)(_.+)/; + my $name = $1; + my $suffix = $2; + if ($name) { + # print("\t\$name is $name "); + } + if ($suffix) { + # print("and \$suffix is $suffix.\n"); + } + # Jee, I hope we get a naming convention soon + if ($suffix) { + if ($suffix =~ /^$forward_designator/ || $suffix =~ /^$reverse_designator/) { + push @name_nodir,$name; + } + # bugwatch here! should this be unnested? + else { + push @name_nodir,"$name$suffix"; + } + } + } + # print("\@name_nodir: @name_nodir\n"); + my $mismatch = 0; + for (my $counter=0; $counter<@name_nodir;$counter++) { + next if ($name_nodir[0] eq $name_nodir[$counter]); + $mismatch = 1; + } + if ($mismatch == 0) { + # print("\tYou have a cohesive contig named $name_nodir[0].\n\n"); + return $name_nodir[0]; + } + else { + # print("\tYou have mixed names in this contig.\n\n"); + } +} # end _get_contig_name + +=head2 get_doublets() + + Title : get_doublets() + Usage : @doublets = $o_consed->get_doublets(); + Function: Return the keynames of the doublets. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "doublet". + Args : None. + Notes : + +=cut + +sub get_doublets { + my $self = shift; + if (!$self->{doublets_set}) { + $self->warn("You need to set the doublets before you can get them. Doing that now."); + $self->set_doublets(); + } + my @doublets; + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$_}->{name} && $self->{'contigs'}->{$_}->{'class'} eq "doublet") { + push @doublets,$_; + } + } + return @doublets; +} # end get_doublets + +=head2 dump_hash() + + Title : dump_hash() + Usage : $o_consed->dump_hash(); + Function: Use dumpvar.pl to dump out the Bio::Tools::Alignment::Consed + object to STDOUT. + Returns : Nothing. + Args : None. + Notes : I used this a lot in debugging. + +=cut + +sub dump_hash { + my $self = shift; + my $dumper = new Dumpvalue; + $self->debug( "Bio::Tools::Alignment::Consed::dump_hash - ". + "The following is the contents of the contig hash...\n"); + $dumper->dumpValue($self->{'contigs'}); +} + +=head2 dump_hash_compact() + + Title : dump_hash_compact() + Usage : $o_consed->dump_hash_compact(); + Function: Dump out the Bio::Tools::Alignment::Consed object in a compact way. + Returns : Nothing. + Args : Nothing. + Notes : Cleaner then dumpValue(), dumpHash(). I used this a lot in + debugging. + +=cut + +sub dump_hash_compact { + no strict 'refs'; # renege for the block + my ($self,$sequence) = @_; + # get the classes + my @singlets = $self->get_singlets(); + my @singletons = $self->get_singletons(); + my @doublets = $self->get_doublets(); + my @pairs = $self->get_pairs(); + my @multiplets = $self->get_multiplets(); + print("Name\tClass\tMembers\tQuality?\n"); + foreach (@singlets) { + my @members = $self->get_members($_); + print($self->get_name($_)."\tsinglets\t".(join',',@members)."\t"); + if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); } + else { print("no qualities found here\n"); } + + } + foreach (@singletons) { + my @members = $self->get_members($_); + print($self->get_name($_)."\tsingletons\t".(join',',@members)."\t"); + if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); } + else { print("no qualities found here\n"); } + } + foreach my $pair (@pairs) { + my @members = $self->get_members($pair); + my $name; + if (!$self->get_name($pair)) { + $name = "BLANK"; + } + else { $name = $self->get_name($pair); } + print("$name\tpairs\t".(join',',@members)."\n"); + } + foreach (@doublets) { + my @members = $self->get_members_by_name($_); + print("$_\tdoublets\t".(join',',@members)."\t"); + my $contig_number = &get_contig_number_by_name($self,$_); + if ($self->{'contigs'}->{$contig_number}->{'quality'}) { print("qualities found here\n"); } + else { print("no qualities found here\n"); } + # print($_."\tdoublets\t".(join',',@members)."\n"); + } + foreach (@multiplets) { + my @members = $self->get_members($_); + print("Contig $_"."\tmultiplets\t".(join',',@members)."\n"); + } +} # end dump_hash_compact + +=head2 get_phreds() + + Title : get_phreds() + Usage : @phreds = $o_consed->get_phreds(); + Function: For each doublet in the Bio::Tools::Alignment::Consed hash, go + and get the phreds for the top and bottom reads. Place them into + {top_phreds} and {bottom_phreds}. + Returns : Nothing. + Args : Nothing. + Notes : Requires parse_phd() and reverse_and_complement(). I realize that + it would be much more elegant to pull qualities as required but there + were certain "features" in the acefile that required a bit more + detailed work be done to get the qualities for certain parts of the + consensus sequence. In order to make _sure_ that this was done + properly I wrote things to do all steps and then I used dump_hash() + and checked each one to ensure expected bahavior. I have never changed + this, so there you are. + +=cut + +sub get_phreds { + # this subroutine is the target of a rewrite to use the Bio::Tools::Alignment::Phred object. + my $self = shift; + my $current_contig; + foreach $current_contig (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") { + $self->debug("$current_contig is a doublet. Going to parse_phd for top($self->{'contigs'}->{$current_contig}->{'top_name'}) and bottom($self->{'contigs'}->{$current_contig}->{'bottom_name'})\n"); + my $r_phreds_top = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'top_name'}); + my $r_phreds_bottom = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'bottom_name'}); + if ($self->{'contigs'}->{$current_contig}->{'top_complement'} eq "C") { + # print("Reversing and complementing...\n"); + $r_phreds_top = &reverse_and_complement($r_phreds_top); + } + if ($self->{'contigs'}->{$current_contig}->{'bottom_complement'} eq "C") { + $r_phreds_bottom = &reverse_and_complement($r_phreds_bottom); + } + $self->{'contigs'}->{$current_contig}->{'top_phreds'} = $r_phreds_top; + $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = $r_phreds_bottom; + } + } +} + +=head2 parse_phd($read_name) + + Title : parse_phd($read_name) + Usage : $o_consed->parse_phd($read_name); + Function: Suck in the contents of a .phd file. + Returns : A reference to an array containing the quality values for the read. + Args : The name of a read. + Notes : This is a significantly weak subroutine because it was always + intended that these functions, along with the functions provided by + get_phreds() be put into the Bio::SeqIO:phd module. This is done + now but the Bio::Tools::Alignment::Consed module has not be + rewritten to reflect this change. + +See L<Bio::SeqIO::phd> for more information. + +=cut + +sub parse_phd { + my ($self,$sequence_name) = @_; + $self->debug("Parsing phd for $sequence_name\n"); + my $in_dna = 0; + my $base_number = 0; + my (@bases,@current_line); + # print("parse_phd: $sequence_name\n"); + my $fh = new Bio::Root::IO(-file=>"$self->{path}/../phd_dir/$sequence_name.phd.1"); + # open(PHD,"<$self->{path}/../phd_dir/$sequence_name.phd.1") or + # die "Couldn't open the phred for $sequence_name\n"; + while ($fh->_readline()) { + # print("Reading a line from a phredfile!\n"); + chomp; + if (/^BEGIN_DNA/) { $in_dna = 1; next} + if (/^END_DNA/) { last; } + if (!$in_dna) { next; } + push(@bases,$_); + } + return \@bases; +} + +=head2 reverse_and_complement(\@source) + + Title : reverse_and_complement(\@source) + Usage : $reference_to_array = $o_consed->reverse_and_complement(\@source); + Function: A stub for the recursive routine reverse_recurse(). + Returns : A reference to a reversed and complemented array of phred data. + Args : A reference to an array of phred data. + Notes : + +=cut + +sub reverse_and_complement { + my $r_source = shift; + my $r_destination; + $r_destination = &reverse_recurse($r_source,$r_destination); + return $r_destination; +} + +=head2 reverse_recurse($r_source,$r_destination) + + Title : reverse_recurse(\@source,\@destination) + Usage : $o_consed->reverse_recurse(\@source,\@destination); + Function: A recursive routine to reverse and complement an array of phred + data. + Returns : A reference to an array containing reversed phred data. + Args : A reference to a source array and a reverence to a destination + array. + Notes : Recursion is kewl, but this sub should likely be _reverse_recurse. + +=cut + +my $current_base; +sub reverse_recurse($$) { + (my $r_source,my $r_destination) = @_; + if (!@$r_source) { return $r_destination; } + $_=pop(@$r_source); + (s/c/g/||s/g/c/||s/a/t/||s/t/a/); + push(@$r_destination,$_); + &reverse_recurse($r_source,$r_destination); +} + +=head2 show_missing_sequence() + + Title : show_missing_sequence(); + Usage : $o_consed->show_missing_sequence(); + Function: Used by set_trim_points_doublets() to fill in quality values where + consed (phrap?) set them to 0 at the beginning and/or end of the + consensus sequences. + Returns : Nothing. + Args : None. + Notes : Acts on doublets only. Really very somewhat quite ugly. A + disgusting kludge. <insert pride here> It was written stepwise with + no real plan because it was not really evident why consed (phrap?) + was doing this. + +=cut + +sub show_missing_sequence() { + # decide which sequence should not have been clipped at consensus position = 0 + my $self = shift; + &get_phreds($self); + my ($current_contig,@qualities); + foreach $current_contig (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") { + my $number_leading_xs = 0; + my $number_trailing_xs = 0; + my $measurer = $self->{'contigs'}->{$current_contig}->{'quality'}; + while ($measurer =~ s/^\ 0\ /\ /) { + $number_leading_xs++; + } + while ($measurer =~ s/\ 0(\s*)$/$1/) { + $number_trailing_xs++; + } + @qualities = split(' ',$self->{'contigs'}->{$current_contig}->{'quality'}); + my $in_initial_zeros = 0; + for (my $count=0;$count<scalar(@qualities); $count++) { + if ($qualities[$count] == 0) { + my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data); + # print("The quality of the consensus at ".($count+1)." is zero. Retrieving the real quality value.\n"); + # how do I know which strand to get these quality values from???? + # boggle + my $top_quality_here = $self->{'contigs'}->{$current_contig}->{'top_phreds'}->[0-$self->{'contigs'}->{$current_contig}->{'top_start'}+$count+1]; + my $bottom_quality_here = $self->{'contigs'}->{$current_contig}->{'bottom_phreds'}->[1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count]; + if (!$bottom_quality_here || (1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count)<0) { + $bottom_quality_here = "not found"; + } + if (!$top_quality_here) { + $top_quality_here = "not found"; + } + # print("Looking for quals at position $count of $current_contig: top position ".(0-$self->{'contigs'}->{$current_contig}->{top_start}+$count)." ($self->{'contigs'}->{$current_contig}->{top_name}) $top_quality_here , bottom position ".(1-$self->{'contigs'}->{$current_contig}->{bottom_start}+$count)." ($self->{'contigs'}->{$current_contig}->{bottom_name}) $bottom_quality_here\n"); + if ($count<$number_leading_xs) { + # print("$count is less then $number_leading_xs so I will get the quality from the top strand\n"); + # print("retrieved quality is ".$self->{'contigs'}->{$current_contig}->{top_phreds}[0-$self->{'contigs'}->{$current_contig}->{top_start}+$count+1]."\n"); + my $quality = $top_quality_here; + $quality =~ /\S+\s(\d+)\s+/; + $quality = $1; + # print("retrieved quality for leading zero $count is $quality\n"); + # t 9 9226 + $qualities[$count] = $quality; + } + else { + # this part is tricky + # if the contig is like this + # cccccccccccccccc + # ffffffffffffffffff + # rrrrrrrrrrrrrrrrr + # then take the quality value for the trailing zeros in the cons. seq from the r + # + # but if the contig is like this + # cccccccccccccccccc + # ffffffffffffffffffffffffffffffff + # rrrrrrrrrrrrrrrrrrrrrrrxxxxxxxxr + # ^^^ + # then any zeros that fall in the positions (^) must be decided whether the quality + # is the qual from the f or r strand. I will use the greater number + # does a similar situation exist for the leading zeros? i dunno + # + # print("$count is greater then $number_leading_xs so I will get the quality from the bottom strand\n"); + # print("retrieved quality is ".$contigs->{$current_contig}->{top_phreds}[0-$contigs->{$current_contig}->{top_start}+$count+1]."\n"); + # my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data); + if ($bottom_quality_here eq "not found") { + # $top_phred_position = 1-$contigs->{$current_contig}->{bottom_start}+$count; + # print("Going to get quality from here: $top_phred_position of the top.\n"); + # my $temp_quality - $contigs->{$current_contig}->{top_phreds} + # $quality = $contigs->{$current_contig}->{top_phreds}[$top_phred_position]; + $top_quality_here =~ /\w+\s(\d+)\s/; + $quality = $1; + } + elsif ($top_quality_here eq "not found") { + # $bottom_phred_position = 1+$contigs->{$current_contig}->{bottom_start}+$count; + # print("Going to get quality from here: $bottom_phred_position of the bottom.\n"); + # $quality = $contigs->{$current_contig}->{bottom_phreds}[$bottom_phred_position]; + # print("Additional: no top quality but bottom is $quality\n"); + $bottom_quality_here =~ /\w+\s(\d+)\s/; + $quality = $1; + } + else { + # print("Oh jeepers, there are 2 qualities to choose from at this position.\n"); + # print("Going to compare these phred qualities: top: #$top_quality_here# bottom: #$bottom_quality_here#\n"); + # now you have to compare them + # my $top_quality_phred = $contigs->{$current_contig}->{top_phreds}[$top_phred_position]; + # #t 40 875# + # print("regexing #$top_quality_here#... "); + $top_quality_here =~ /\w\ (\d+)\s/; + my $top_quality = $1; + # print("$top_quality\nregexing #$bottom_quality_here#... "); + $bottom_quality_here =~ /\w\ (\d+)\s/; + my $bottom_quality = $1; + # print("$bottom_quality\n"); + # print("top_quality: $top_quality bottom quality: $bottom_quality\n"); + if ($bottom_quality > $top_quality) { + # print("Chose to take the bottom quality: $bottom_quality\n"); + $quality = $bottom_quality; + } else { + # print("Chose to take the top quality: $top_quality\n"); + $quality = $top_quality; + } + } + if (!$quality) { + # print("Warning: no quality value for $current_contig, position $count!\n"); + # print("Additional data: top quality phred: $top_quality_here\n"); + # print("Additional data: bottom quality phred: $bottom_quality_here\n"); + } else { + $qualities[$count] = $quality; + } + } + } + + } + unless (!@qualities) { + $self->{'contigs'}->{$current_contig}->{'quality'} = join(" ",@qualities); + } + $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = undef; + $self->{'contigs'}->{$current_contig}->{'top_phreds'} = undef; + my $count = 1; + } # end foreach key + } +} + + +# Autoload methods go after =cut, and are processed by the autosplit program. + +1; +__END__ + +=head1 SEE ALSO + +perl(1). + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Alignment/Trim.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Alignment/Trim.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,656 @@ +# Bio::Tools::Alignment::Trim.pm +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of + sequence based on quality. + +=head1 SYNOPSIS + + use Bio::Tools::Alignment::Trim; + $o_trim = new Bio::Tools::Alignment::Trim; + $o_trim->set_reverse_designator("R"); + $o_trim->set_forward_designator("F"); + + +=head1 DESCRIPTION + +This is a specialized module designed by Chad for Chad to trim sequences +based on a highly specialized list of requirements. In other words, write +something that will trim sequences 'just like the people in the lab would +do manually'. + +I settled on a sliding-window-average style of search which is ugly and +slow but does _exactly_ what I want it to do. + +Mental note: rewrite this. + +It is very important to keep in mind the context in which this module was +written: strictly to support the projects for which Consed.pm was +designed. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing +lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::Tools::Alignment::Trim; + +use Bio::Root::Root; +use strict; +use Dumpvalue; + + + +use vars qw($VERSION @ISA %DEFAULTS); + +$VERSION = '0.01'; + +@ISA = qw(Bio::Root::Root); + +BEGIN { + %DEFAULTS = ( 'f_designator' => 'f', + 'r_designator' => 'r', + 'windowsize' => '10', + 'phreds' => '20'); +} + +=head2 new() + + Title : new() + Usage : $o_trim = Bio::Tools::Alignment::Trim->new(); + Function: Construct the Bio::Tools::Alignment::Trim object. No parameters + are required to create this object. It is strictly a bundle of + functions, as far as I am concerned. + Returns : A reference to a Bio::Tools::Alignment::Trim object. + Args : (optional) + -windowsize (default 10) + -phreds (default 20) + + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my($windowsize,$phreds) = + $self->_rearrange([qw( + WINDOWSIZE + PHREDS + )], + @args); + $self->{windowsize} = $windowsize || $DEFAULTS{'windowsize'}; + $self->{phreds} = $phreds || $DEFAULTS{'phreds'}; + # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0; + $self->set_designators($DEFAULTS{'f_designator'}, + $DEFAULTS{'r_designator'}); + return $self; +} + +=head2 set_designators($forward_designator,$reverse_designator) + + Title : set_designators(<forward>,<reverse>) + Usage : $o_trim->set_designators("F","R") + Function: Set the string by which the system determines whether a given + sequence represents a forward or a reverse read. + Returns : Nothing. + Args : two scalars: one representing the forward designator and one + representing the reverse designator + +=cut + +sub set_designators { + my $self = shift; + ($self->{'f_designator'},$self->{'r_designator'}) = @_; +} + +=head2 set_forward_designator($designator) + + Title : set_forward_designator($designator) + Usage : $o_trim->set_forward_designator("F") + Function: Set the string by which the system determines if a given + sequence is a forward read. + Returns : Nothing. + Args : A string representing the forward designator of this project. + +=cut + +sub set_forward_designator { + my ($self,$desig) = @_; + $self->{'f_designator'} = $desig; +} + +=head2 set_reverse_designator($reverse_designator) + + Title : set_reverse_designator($reverse_designator) + Function: Set the string by which the system determines if a given + sequence is a reverse read. + Usage : $o_trim->set_reverse_designator("R") + Returns : Nothing. + Args : A string representing the forward designator of this project. + +=cut + +sub set_reverse_designator { + my ($self,$desig) = @_; + $self->{'r_designator'} = $desig; +} + +=head2 get_designators() + + Title : get_designators() + Usage : $o_trim->get_designators() + Returns : A string describing the current designators. + Args : None + Notes : Really for informational purposes only. Duh. + +=cut + +sub get_designators { + my $self = shift; + return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'}); +} + +=head2 trim_leading_polys() + + Title : trim_leading_polys() + Usage : $o_trim->trim_leading_polys() + Function: Not implemented. Does nothing. + Returns : Nothing. + Args : None. + Notes : This function is not implemented. Part of something I wanted to + do but never got around to doing. + +=cut + +sub trim_leading_polys { + my ($self, $sequence) = @_; +} + +=head2 dump_hash() + + Title : dump_hash() + Usage : $o_trim->dump_hash() + Function: Unimplemented. + Returns : Nothing. + Args : None. + Notes : Does nothing. + +=cut + +sub dump_hash { + my $self = shift; + my %hash = %{$self->{'qualities'}}; +} # end dump_hash + +=head2 trim_singlet($sequence,$quality,$name,$class) + + Title : trim_singlet($sequence,$quality,$name,$class) + Usage : ($r_trim_points,$trimmed_sequence) = + @{$o_trim->trim_singlet($sequence,$quality,$name,$class)}; + Function: Trim a singlet based on its quality. + Returns : a reference to an array containing the forward and reverse + trim points and the trimmed sequence. + Args : $sequence : A sequence (SCALAR, please) + $quality : A _scalar_ of space-delimited quality values. + $name : the name of the sequence + $class : The class of the sequence. One of qw(singlet + singleton doublet pair multiplet) + Notes : At the time this was written the bioperl objects SeqWithQuality + and PrimaryQual did not exist. This is what is with the clumsy + passing of references and so on. I will rewrite this next time I + have to work with it. I also wasn't sure whether this function + should return just the trim points or the points and the sequence. + I decided that I always wanted both so that's how I implemented + it. + - Note that the size of the sliding windows is set during construction of + the Bio::Tools::Alignment::Trim object. + +=cut + +sub trim_singlet { + my ($self,$sequence,$quality,$name,$class) = @_; + # this split is done because I normally store quality values in a + # space-delimited scalar rather then in an array. + # I do this because serialization of the arrays is tough. + my @qual = split(' ',$quality); + my @points; + my $sequence_length = length($sequence); + my ($returnstring,$processed_sequence); + # smooth out the qualities + my $r_windows = &_sliding_window(\@qual,$self->{windowsize}); + # find out the leading and trailing trimpoints + my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds}); + my (@new_points,$trimmed_sequence); + # do you think that any sequence shorter then 100 should be + # discarded? I don't think that this should be the decision of this + # module. + # removed, 020926 + $points[0] = $start_base; + # whew! now for the end base + # required parameters: reference_to_windows,windowsize,$phredvalue,start_base + my $end_base = &_get_end($r_windows,$self->{windowsize}, + $self->{phreds},$start_base); + $points[1] = $end_base; + # now do the actual trimming + # CHAD : I don't think that it is a good idea to call chop_sequence here + # because chop_sequence also removes X's and N's and things + # and that is not always what is wanted + return \@points; +} + +=head2 trim_doublet($sequence,$quality,$name,$class) + + Title : trim_doublet($sequence,$quality,$name,$class) + Usage : ($r_trim_points,$trimmed_sequence) = + @{$o_trim->trim_singlet($sequence,$quality,$name,$class)}; + Function: Trim a singlet based on its quality. + Returns : a reference to an array containing the forward and reverse + Args : $sequence : A sequence + $quality : A _scalar_ of space-delimited quality values. + $name : the name of the sequence + $class : The class of the sequence. One of qw(singlet + singleton doublet pair multiplet) + Notes : At the time this was written the bioperl objects SeqWithQuality + and PrimaryQual did not exist. This is what is with the clumsy + passing of references and so on. I will rewrite this next time I + have to work with it. I also wasn't sure whether this function + should return just the trim points or the points and the sequence. + I decided that I always wanted both so that's how I implemented + it. + +=cut + +#' +sub trim_doublet { + my ($self,$sequence,$quality,$name,$class) = @_; + my @qual = split(' ',$quality); + my @points; + my $sequence_length = length($sequence); + my ($returnstring,$processed_sequence); + # smooth out the qualities + my $r_windows = &_sliding_window(\@qual,$self->{windowsize}); + # determine where the consensus sequence starts + my $offset = 0; + for (my $current = 0; $current<$sequence_length;$current++) { + if ($qual[$current] != 0) { + $offset = $current; + last; + } + } + # start_base required: r_quality,$windowsize,$phredvalue + my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds},$offset); + if ($start_base > ($sequence_length - 100)) { + $points[0] = ("FAILED"); + $points[1] = ("FAILED"); + return @points; + } + $points[0] = $start_base; + # + # whew! now for the end base + # + # required parameters: reference_to_windows,windowsize,$phredvalue,start_base + # | + # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome + my $end_base = $sequence_length; + my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual); + $points[1] = $end_base; + # CHAD : I don't think that it is a good idea to call chop_sequence here + # because chop_sequence also removes X's and N's and things + # and that is not always what is wanted + return @points; +} # end trim_doublet + +=head2 chop_sequence($name,$class,$sequence,@points) + + Title : chop_sequence($name,$class,$sequence,@points) + Usage : ($start_point,$end_point,$chopped_sequence) = + $o_trim->chop_sequence($name,$class,$sequence,@points); + Function: Chop a sequence based on its name, class, and sequence. + Returns : an array containing three scalars: + 1- the start trim point + 2- the end trim point + 3- the chopped sequence + Args : + $name : the name of the sequence + $class : The class of the sequence. One of qw(singlet + singleton doublet pair multiplet) + $sequence : A sequence + @points : An array containing two elements- the first contains + the start trim point and the second conatines the end trim + point. + +=cut + +sub chop_sequence { + my ($self,$name,$class,$sequence,@points) = @_; + print("Coming into chop_sequence, \@points are @points\n"); + my $fdesig = $self->{'f_designator'}; + my $rdesig = $self->{'r_designator'}; + if (!$points[0] && !$points[1]) { + $sequence = "junk"; + return $sequence; + } + if ($class eq "singlet" && $name =~ /$fdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "singlet" && $name =~ /$rdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "singleton" && $name =~ /$fdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "singleton" && $name =~ /$rdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "doublet") { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + # this is a _terrible_ to do this! i couldn't seem to find a better way + # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go + # no time to find a fix! + my $length_before_trimming = length($sequence); + my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g; + if ($subs_Xs) { + my $length_after_trimming = length($sequence); + my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming; + $points[0] += $number_Xs_trimmed; + } + $length_before_trimming = length($sequence); + my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g; + if ($subs_Ns) { + my $length_after_trimming = length($sequence); + my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming; + $points[1] -= $number_Ns_trimmed; + $points[1] -= 1; + } + push @points,$sequence; + print("chop_sequence \@points are @points\n"); + return @points; +} + +=head2 _get_start($r_quals,$windowsize,$phreds,$offset) + + Title : _get_start($r_quals,$windowsize,$phreds,$offset) + Usage : $start_base = $self->_get_start($r_windows,5,20); + Function: Provide the start trim point for this sequence. + Returns : a scalar representing the start of the sequence + Args : + $r_quals : A reference to an array containing quality values. In + context, this array of values has been smoothed by then + sliding window-look ahead algorithm. + $windowsize : The size of the window used when the sliding window + look-ahead average was calculated. + $phreds : <fill in what this does here> + $offset : <fill in what this does here> + +=cut + +sub _get_start { + my ($self,$r_quals,$windowsize,$phreds,$offset) = @_; + print("Using $phreds phreds\n") if $self->verbose > 0; + # this is to help determine whether the sequence is good at all + my @quals = @$r_quals; + my ($count,$count2,$qualsum); + if ($offset) { $count = $offset; } else { $count = 0; } + # search along the length of the sequence + for (; ($count+$windowsize) <= scalar(@quals); $count++) { + # sum all of the quality values in this window. + my $cumulative=0; + for($count2 = $count; $count2 < $count+$windowsize; $count2++) { + if (!$quals[$count2]) { + # print("Quals don't exist here!\n"); + } + else { + $qualsum += $quals[$count2]; + # print("Incremented qualsum to ($qualsum)\n"); + } + $cumulative++; + } + # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n"); + # if the total of windowsize * phreds is + if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; } + $qualsum = 0; + } + # if ($count > scalar(@quals)-$windowsize) { return; } + return $count; +} + +=head2 _get_end($r_qual,$windowsize,$phreds,$count) + + Title : _get_end($r_qual,$windowsize,$phreds,$count) + Usage : my $end_base = &_get_end($r_windows,20,20,$start_base); + Function: Get the end trim point for this sequence. + Returns : A scalar representing the end trim point for this sequence. + Args : + $r_qual : A reference to an array containing quality values. In + context, this array of values has been smoothed by then + sliding window-look ahead algorithm. + $windowsize : The size of the window used when the sliding window + look-ahead average was calculated. + $phreds : <fill in what this does here> + $count : Start looking for the end of the sequence here. + +=cut + +sub _get_end { + my ($r_qual,$windowsize,$phreds,$count) = @_; + my @quals = @$r_qual; + my $total_bases = scalar(@quals); + my ($count2,$qualsum,$end_of_quals,$bases_counted); + if (!$count) { $count=0; } + BASE: for (; $count < $total_bases; $count++) { + $bases_counted = 0; + $qualsum = 0; + POSITION: for($count2 = $count; $count2 < $total_bases; $count2++) { + $bases_counted++; + + if ($count2 == $total_bases-1) { + $qualsum += $quals[$count2]; + $bases_counted++; + last BASE; + } + elsif ($bases_counted == $windowsize) { + $qualsum += $quals[$count2]; + if ($qualsum < $bases_counted*$phreds) { + return $count+$bases_counted+$windowsize; + } + next BASE; + } + else { + $qualsum += $quals[$count2]; + } + } + if ($qualsum < $bases_counted*$phreds) { + return $count+$bases_counted+$windowsize; + } + else { } + $qualsum = 0; + } # end for + if ($end_of_quals) { + my $bases_for_average = $total_bases-$count2; + return $count2; + } + else { } + if ($qualsum) { } # print ("$qualsum\n"); + return $total_bases; +} # end get_end + +=head2 count_doublet_trailing_zeros($r_qual) + + Title : count_doublet_trailing_zeros($r_qual) + Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual); + Function: Find out when the trailing zero qualities start. + Returns : A scalar representing where the zeros start. + Args : A reference to an array of quality values. + Notes : Again, this should be rewritten to use PrimaryQual objects. + A more detailed explanation of why phrap puts these zeros here should + be written and placed here. Please email and hassle the author. + + +=cut + +sub count_doublet_trailing_zeros { + my ($r_qual) = shift; + my $number_of_trailing_zeros = 0; + my @qualities = @$r_qual; + for (my $current=scalar(@qualities);$current>0;$current--) { + if ($qualities[$current] && $qualities[$current] != 0) { + $number_of_trailing_zeros = scalar(@qualities)-$current; + return $current+1; + } + } + return scalar(@qualities); +} # end count_doublet_trailing_zeros + +=head2 _sliding_window($r_quals,$windowsize) + + Title : _sliding_window($r_quals,$windowsize) + Usage : my $r_windows = &_sliding_window(\@qual,$windowsize); + Function: Create a sliding window, look-forward-average on an array + of quality values. Used to smooth out differences in qualities. + Returns : A reference to an array containing the smoothed values. + Args : $r_quals: A reference to an array containing quality values. + $windowsize : The size of the sliding window. + Notes : This was written before PrimaryQual objects existed. They + should use that object but I haven't rewritten this yet. + +=cut + +#' +sub _sliding_window { + my ($r_quals,$windowsize) = @_; + my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted); + @quals = @$r_quals; + my $size_of_quality = scalar(@quals); + # do this loop for all of the qualities + for ($count=0; $count <= $size_of_quality; $count++) { + $bases_counted = 0; + BASE: for($count2 = $count; $count2 < $size_of_quality; $count2++) { + $bases_counted++; + # if the search hits the end of the averages, stop + # this is for the case near the end where bases remaining < windowsize + if ($count2 == $size_of_quality) { + $qualsum += $quals[$count2]; + last BASE; + } + # if the search hits the size of the window + elsif ($bases_counted == $windowsize) { + $qualsum += $quals[$count2]; + last BASE; + } + # otherwise add the quality value + unless (!$quals[$count2]) { + $qualsum += $quals[$count2]; + } + } + unless (!$qualsum || !$windowsize) { + $average = $qualsum / $bases_counted; + if (!$average) { $average = "0"; } + push @averages,$average; + } + $qualsum = 0; + } + # 02101 Yes, I repaired the mismatching numbers between averages and windows. + # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n"); + # print("There are ".scalar(@averages)." average values. They are @averages\n"); + return \@averages; + +} + +=head2 _print_formatted_qualities + + Title : _print_formatted_qualities(\@quals) + Usage : &_print_formatted_qualities(\@quals); + Returns : Nothing. Prints. + Args : A reference to an array containing quality values. + Notes : An internal procedure used in debugging. Prints out an array nicely. + +=cut + +sub _print_formatted_qualities { + my $rquals = shift; + my @qual = @$rquals; + for (my $count=0; $count<scalar(@qual) ; $count++) { + if (($count%10)==0) { print("\n$count\t"); } + if ($qual[$count]) { print ("$qual[$count]\t");} + else { print("0\t"); } + } + print("\n"); +} + +=head2 _get_end_old($r_qual,$windowsize,$phreds,$count) + + Title : _get_end_old($r_qual,$windowsize,$phreds,$count) + Usage : Deprecated. Don't use this! + Returns : Deprecated. Don't use this! + Args : Deprecated. Don't use this! + +=cut + +#' +sub _get_end_old { + my ($r_qual,$windowsize,$phreds,$count) = @_; + warn("Do Not Use this function (_get_end_old)"); + my $target = $windowsize*$phreds; + my @quals = @$r_qual; + my $total_bases = scalar(@quals); + my ($count2,$qualsum,$end_of_quals); + if (!$count) { $count=0; } + BASE: for (; $count < $total_bases; $count++) { + for($count2 = $count; $count2 < $count+$windowsize; $count2++) { + if ($count2 == scalar(@quals)-1) { + $qualsum += $quals[$count2]; + $end_of_quals = 1; + last BASE; + + } + $qualsum += $quals[$count2]; + } + if ($qualsum < $windowsize*$phreds) { + return $count+$windowsize; + } + $qualsum = 0; + } # end for +} # end get_end_old + + +# Autoload methods go after =cut, and are processed by the autosplit program. + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/AnalysisResult.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/AnalysisResult.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,323 @@ +# $Id: AnalysisResult.pm,v 1.12 2002/10/22 07:38:45 lapp Exp $ +# +# BioPerl module for Bio::Tools::AnalysisResult +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::AnalysisResult - Base class for analysis result objects and parsers + +=head1 SYNOPSIS + + # obtain a AnalysisResult derived object somehow + print "Method ", $result->analysis_method(), + ", version " $result->analysis_method_version(), + ", performed on ", $result->analysis_date(), "\n"; + # annotate a sequence utilizing SeqAnalysisParserI methods + while($feat = $result->next_feature()) { + $seq->add_SeqFeature($feat); + } + $result->close(); + # query object, e.g. a Bio::SeqI implementing object + $queryseq = $result->analysis_query(); + # Subject of the analysis -- may be undefined. Refer to derived module + # to find out what is returned. + $subject = $result->analysis_subject(); + +=head1 DESCRIPTION + +The AnalysisResult module is supposed to be the base class for modules +encapsulating parsers and interpreters for the result of a analysis that was +carried out with a query sequence. + +The notion of an analysis represented by this base class is that of a unary or +binary operator, taking either one query or a query and a subject and producing +a result. The query is e.g. a sequence, and a subject is either a sequence, +too, or a database of sequences. + +This module also implements the Bio::SeqAnalysisParserI interface, and thus +can be used wherever such an object fits. +See L<Bio::SeqAnalysisParserI|Bio::SeqAnalysisParserI>. +Developers will find a ready-to-use B<parse()> method, but need to implement +B<next_feature()> in an inheriting class. Support for initialization with input +file names and reading from streams is also ready to use. + +Note that this module does not provide support for B<running> an analysis. +Rather, it is positioned in the subsequent parsing step (concerned with +turning raw results into BioPerl objects). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::AnalysisResult; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::SeqAnalysisParserI; +use Bio::AnalysisResultI; + +@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI + Bio::AnalysisResultI Bio::Root::IO); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; +} + +sub _initialize { + my($self,@args) = @_; + + my $make = $self->SUPER::_initialize(@args); + + $self->_initialize_state(@args); + return $make; # success - we hope! +} + +=head2 _initialize_state + + Title : _initialize_state + Usage : n/a; usually called by _initialize() + Function: This method is for BioPerl B<developers> only, as indicated by the + leading underscore in its name. + + Performs initialization or reset of the state of this object. The + difference to _initialize() is that it may be called at any time, + and repeatedly within the lifetime of this object. B<Note, however, + that this is potentially dangerous in a multi-threading + environment. In general, calling this method twice is discouraged + for this reason. + + This method is supposed to reset the state such that any 'history' + is lost. State information that does not change during object + lifetime is not considered as history, e.g. parent, name, etc shall + not be reset. An inheriting object should only be concerned with + state information it introduces itself, and for everything else + call SUPER::_initialize_state(@args). + + An example is parsing an input file: a state reset implies + discarding any unread input, and the actual input itself, followed + by setting the new input. + + The argument syntax is the same as for L<new()|new> and L<_initialize()|_initialize>, + i.e., named parameters following the -name=>$value convention. + The following parameters are dealt with by the implementation + provided here: + -INPUT, -FH, -FILE + (tags are case-insensitive). + Example : + Returns : + Args : + +=cut + +sub _initialize_state { + my ($self,@args) = @_; + + $self->close(); + $self->_initialize_io(@args); + + $self->{'_analysis_sbjct'} = undef; + $self->{'_analysis_query'} = undef; + $self->{'_analysis_prog'} = undef; + $self->{'_analysis_progVersion'} = undef; + $self->{'_analysis_date'} = undef; + + return 1; +} + +# =head2 parse +# +# Title : parse +# Usage : $obj->parse(-input=>$inputobj, [ -params=>[@params] ], +# [ -method => $method ] ) +# Function: Sets up parsing for feature retrieval from an analysis file, +# or object. +# +# This method was originally required by SeqAnalysisParserI, but +# is now discouraged due to potential problems in a multi- +# threading environment (CORBA!). If called only once, it doesn't +# add any functionality to calling new() with the same +# parameters. +# +# The implementation provided here calls automatically +# _initialize_state() and passes on -input=>$inputobj and +# @params as final arguments. +# Example : +# Returns : void +# Args : B<input> - object/file where analysis are coming from +# B<params> - parameter to use when parsing/running analysis +# B<method> - method of analysis +# +# =cut + +sub parse { + my ($self, @args) = @_; + + my ($input, $params, $method) = + $self->_rearrange([qw(INPUT + PARAMS + METHOD + )], + @args); + + # initialize with new input + if($params) { + $self->_initialize_state('-input' => $input, @$params); + } else { + $self->_initialize_state('-input' => $input); + } + $self->analysis_method($method) if $method; +} + +=head2 analysis_query + + Usage : $query_obj = $result->analysis_query(); + Purpose : Set/Get the name of the query used to generate the result, that + is, the entity on which the analysis was performed. Will mostly + be a sequence object (Bio::PrimarySeq compatible). + Argument : + Returns : The object set before. Mostly a Bio::PrimarySeq compatible object. + +=cut + +#-------- +sub analysis_query { + my ($self, $obj) = @_; + if($obj) { + $self->{'_analysis_query'} = $obj; + } + return $self->{'_analysis_query'}; +} +#-------- + +=head2 analysis_subject + + Usage : $result->analyis_subject(); + Purpose : Set/Get the subject of the analysis against which it was + performed. For similarity searches it will probably be a database, + and for sequence feature predictions (exons, promoters, etc) it + may be a collection of models or homologous sequences that were + used, or undefined. + Returns : The object that was set before, or undef. + Argument : + +=cut + +#--------------- +sub analysis_subject { +#--------------- + my ($self, $sbjct_obj) = @_; + if($sbjct_obj) { + $self->{'_analysis_sbjct'} = $sbjct_obj; + } + return $self->{'_analysis_sbjct'}; +} + + +=head2 analysis_date + + Usage : $result->analysis_date(); + Purpose : Set/Get the date on which the analysis was performed. + Returns : String + Argument : + Comments : + +=cut + +#---------- +sub analysis_date { + my ($self, $date) = @_; + if($date) { + $self->{'_analysis_date'} = $date; + } + return $self->{'_analysis_date'}; +} +#---------- + +=head2 analysis_method + + Usage : $result->analysis_method(); + Purpose : Set/Get the name of the sequence analysis method that was used + to produce this result (BLASTP, FASTA, etc.). May also be the + actual name of a program. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method) { + $self->{'_analysis_prog'} = $method; + } + return $self->{'_analysis_prog'}; +} + +=head2 analysis_method_version + + Usage : $result->analysis_method_version(); + Purpose : Set/Get the version string of the analysis program. + : (e.g., 1.4.9MP, 2.0a19MP-WashU). + Returns : String + Argument : n/a + +=cut + +#--------------------- +sub analysis_method_version { +#--------------------- + my ($self, $version) = @_; + if($version) { + $self->{'_analysis_progVersion'} = $version; + } + return $self->{'_analysis_progVersion'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/BPbl2seq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPbl2seq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +# $Id: BPbl2seq.pm,v 1.21.2.2 2003/06/03 14:38:18 jason Exp $ +# +# Bioperl module Bio::Tools::BPbl2seq +# based closely on the Bio::Tools::BPlite modules +# Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +# Lorenz Pollak (lorenz@ist.org, bioperl port) +# +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# October 20, 2000 +# May 29, 2001 +# Fixed bug which prevented reading of more than one HSP / hit. +# This fix required changing calling syntax as described below. (PS) +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::BPbl2seq - Lightweight BLAST parser for pair-wise sequence +alignment using the BLAST algorithm. + +=head1 SYNOPSIS + + use Bio::Tools::BPbl2seq; + my $report = Bio::Tools::BPbl2seq->new(-file => 't/bl2seq.out'); + $report->sbjctName; + $report->sbjctLength; + while(my $hsp = $report->next_feature) { + $hsp->score; + $hsp->bits; + $hsp->percent; + $hsp->P; + $hsp->match; + $hsp->positive; + $hsp->length; + $hsp->querySeq; + $hsp->sbjctSeq; + $hsp->homologySeq; + $hsp->query->start; + $hsp->query->end; + $hsp->sbjct->start; + $hsp->sbjct->end; + $hsp->sbjct->seq_id; + $hsp->sbjct->overlaps($exon); + } + +=head1 DESCRIPTION + +BPbl2seq is a package for parsing BLAST bl2seq reports. BLAST bl2seq is a +program for comparing and aligning two sequences using BLAST. Although +the report format is similar to that of a conventional BLAST, there are a +few differences so that BPlite is unable to read bl2seq reports directly. + +From the user's perspective, one difference between bl2seq and +other blast reports is that the bl2seq report does not print out the +name of the first of the two aligned sequences. (The second sequence +name is given in the report as the name of the "hit"). Consequently, +BPbl2seq has no way of identifying the name of the initial sequence +unless it is passed to constructor as a second argument as in: + + my $report = Bio::Tools::BPbl2seq->new(\*FH, "ALEU_HORVU"); + +If the name of the first sequence (the "query") is not passed to +BPbl2seq.pm in this manner, the name of the first sequence will be +left as "unknown". (Note that to preserve a common interface with the +other BLAST programs the two sequences being compared are referred to +in bl2seq as "query" and "subject" although this is perhaps a bit +misleading when simply comparing 2 sequences as opposed to querying a +database.) + +In addition, since there will only be (at most) one "subject" (hit) in +a bl2seq report, one should use the method $report-E<gt>next_feature, +rather than $report-E<gt>nextSbjct-E<gt>nextHSP to obtain the next +high scoring pair. + +One should note that the previous (0.7) version of BPbl2seq used +slightly different syntax. That version had a bug and consequently the +old syntax has been eliminated. Attempts to use the old syntax will +return error messages explaining the (minor) recoding required to use +the current syntax. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 ACKNOWLEDGEMENTS + +Based on work of: +Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +Lorenz Pollak (lorenz@ist.org, bioperl port) + +=head1 CONTRIBUTORS + +Jason Stajich, jason@cgt.mc.duke.edu + +=cut + +#' +package Bio::Tools::BPbl2seq; + +use strict; +use vars qw(@ISA); +use Bio::Tools::BPlite; +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct +use Bio::SeqAnalysisParserI; +use Symbol; + +@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); + +#@ISA = qw(Bio::Tools::BPlite); + +=head2 new + + Title : new + Function: Create a new Bio::Tools::BPbl2seq object + Returns : Bio::Tools::BPbl2seq + Args : -file input file (alternative to -fh) + -fh input stream (alternative to -file) + -queryname name of query sequence + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + # initialize IO + $self->_initialize_io(@args); + + my ($queryname,$rt) = $self->_rearrange([qw(QUERYNAME + REPORT_TYPE)], @args); + $queryname = 'unknown' if( ! defined $queryname ); + if( $rt && $rt =~ /BLAST/i ) { + $self->{'BLAST_TYPE'} = uc($rt); + } else { + $self->warn("Must provide which type of BLAST was run (blastp,blastn, tblastn, tblastx, blastx) if you want strand information to get set properly for DNA query or subjects"); + } + my $sbjct = $self->getSbjct(); + $self->{'_current_sbjct'} = $sbjct; + + $self->{'_query'}->{'NAME'} = $queryname; + return $self; +} + + +=head2 getSbjct + + Title : + Usage : $sbjct = $obj->getSbjct(); + Function : Method of obtaining single "subject" of a bl2seq report + Example : my $sbjct = $obj->getSbjct ) {} + Returns : Sbjct object or null if finished + Args : + +=cut + +sub getSbjct { + my ($self) = @_; +# $self->_fastForward or return undef; + + ####################### + # get bl2seq "sbjct" name and length # + ####################### + my $length; + my $def; + READLOOP: while(defined ($_ = $self->_readline) ) { + if ($_ =~ /^>(.+)$/) { + $def = $1; + next READLOOP; + } + elsif ($_ =~ /^\s*Length\s.+\D(\d+)/i) { + $length = $1; + next READLOOP; + } + elsif ($_ =~ /^\s{0,2}Score/) { + $self->_pushback($_); + last READLOOP; + } + } + return undef if ! defined $def; + $def =~ s/\s+/ /g; + $def =~ s/\s+$//g; + + + #################### + # the Sbjct object # + #################### + my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, + '-length'=>$length, + '-parent'=>$self); + return $sbjct; +} + + + + +=head2 next_feature + + Title : next_feature + Usage : while( my $feat = $res->next_feature ) { # do something } + Function: calls next_feature function from BPlite. + Example : + Returns : A Bio::SeqFeatureI compliant object, in this case a + Bio::Tools::BPlite::HSP object, and FALSE if there are no more + HSPs. + Args : None + +=cut + +sub next_feature{ + my ($self) = @_; + my ($sbjct, $hsp); + $sbjct = $self->{'_current_sbjct'}; + unless( defined $sbjct ) { + $self->debug(" No hit object found for bl2seq report \n ") ; + return undef; + } + $hsp = $sbjct->nextHSP; + return $hsp || undef; +} + +=head2 queryName + + Title : + Usage : $name = $report->queryName(); + Function : get /set the name of the query + Example : + Returns : name of the query + Args : + +=cut + +sub queryName { + my ($self, $queryname) = @_; + if( $queryname ) { + $self->{'_query'}->{'NAME'} = $queryname; + } + $self->{'_query'}->{'NAME'}; +} + +=head2 sbjctName + + Title : + Usage : $name = $report->sbjctName(); + Function : returns the name of the Sbjct + Example : + Returns : name of the Sbjct + Args : + +=cut + +sub sbjctName { + my $self = shift; +# unless( defined $self->{'_current_sbjct'} ) { +# my $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct; +# return undef unless defined $sbjct; +# } + $self->{'_current_sbjct'}->{'NAME'} || ''; +} + +=head2 sbjctLength + + Title : sbjctLength + Usage : $length = $report->sbjctLength(); + Function : returns the length of the Sbjct + Example : + Returns : name of the Sbjct + Args : + +=cut + +sub sbjctLength { + my $self = shift; +# unless( defined $self->{'_current_sbjct'} ) { +# my $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct; +# return undef unless defined $sbjct; +# } + $self->{'_current_sbjct'}->{'LENGTH'}; +} + +=head2 P + + Title : P + Usage : + Function : Syntax no longer supported, error message only + +=cut + +sub P { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n "); +} + +=head2 percent + + Title : percent + Usage : $hsp->percent(); + Function : Syntax no longer supported, error message only + +=cut + +sub percent { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n "); +} + +=head2 match + + Title : match + Usage : $hsp->match(); + Function : Syntax no longer supported, error message only + +=cut + +sub match { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n "); +} + +=head2 positive + + Title : positive + Usage : $hsp->positive(); + Function : Syntax no longer supported, error message only + +=cut + +sub positive { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +=head2 querySeq + + Title : querySeq + Usage : $hsp->querySeq(); + Function : Syntax no longer supported, error message only + +=cut + +sub querySeq { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +=head2 sbjctSeq + + Title : sbjctSeq + Usage : $hsp->sbjctSeq(); + Function : Syntax no longer supported, error message only + +=cut + +sub sbjctSeq { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +=head2 homologySeq + + Title : homologySeq + Usage : $hsp->homologySeq(); + Function : Syntax no longer supported, error message only + +=cut + +sub homologySeq { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +=head2 qs + + Title : qs + Usage : $hsp->qs(); + Function : Syntax no longer supported, error message only + +=cut + +sub qs { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +=head2 ss + + Title : ss + Usage : $hsp->ss(); + Function : Syntax no longer supported, error message only + +=cut + +sub ss { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +=head2 hs + + Title : hs + Usage : $hsp->hs(); + Function : Syntax no longer supported, error message only + +=cut + +sub hs { + my $self = shift; + $self->throw("Syntax used is no longer supported.\n See BPbl2seq.pm documentation for current syntax.\n ") ; +} + +sub _fastForward { + my ($self) = @_; + return 0 if $self->{'REPORT_DONE'}; # empty report + while(defined( $_ = $self->_readline() ) ) { + if ($_ =~ /^>|^Parameters|^\s+Database:|^\s+Posted date:|^\s*Lambda/) { + $self->_pushback($_); + return 1; + } + } + $self->warn("Possible error (1) while parsing BLAST report!"); +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/BPlite.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPlite.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,448 @@ +# $Id: BPlite.pm,v 1.36.2.2 2003/02/20 00:39:03 jason Exp $ +############################################################################## +# Bioperl module Bio::Tools::BPlite +############################################################################## +# +# The original BPlite.pm module has been written by Ian Korf ! +# see http://sapiens.wustl.edu/~ikorf +# +# You may distribute this module under the same terms as perl itself + +=head1 NAME + +Bio::Tools::BPlite - Lightweight BLAST parser + +=head1 SYNOPSIS + + use Bio::Tools::BPlite; + my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); + + { + $report->query; + $report->database; + while(my $sbjct = $report->nextSbjct) { + $sbjct->name; + while (my $hsp = $sbjct->nextHSP) { + $hsp->score; + $hsp->bits; + $hsp->percent; + $hsp->P; + $hsp->EXP; + $hsp->match; + $hsp->positive; + $hsp->length; + $hsp->querySeq; + $hsp->sbjctSeq; + $hsp->homologySeq; + $hsp->query->start; + $hsp->query->end; + $hsp->hit->start; + $hsp->hit->end; + $hsp->hit->seq_id; + $hsp->hit->overlaps($exon); + } + } + + # the following line takes you to the next report in the stream/file + # it will return 0 if that report is empty, + # but that is valid for an empty blast report. + # Returns -1 for EOF. + + last if ($report->_parseHeader == -1); + redo; + } + + +=head1 DESCRIPTION + +BPlite is a package for parsing BLAST reports. The BLAST programs are a family +of widely used algorithms for sequence database searches. The reports are +non-trivial to parse, and there are differences in the formats of the various +flavors of BLAST. BPlite parses BLASTN, BLASTP, BLASTX, TBLASTN, and TBLASTX +reports from both the high performance WU-BLAST, and the more generic +NCBI-BLAST. + +Many people have developed BLAST parsers (I myself have made at least three). +BPlite is for those people who would rather not have a giant object +specification, but rather a simple handle to a BLAST report that works well +in pipes. + +=head2 Object + +BPlite has three kinds of objects, the report, the subject, and the HSP. To +create a new report, you pass a filehandle reference to the BPlite constructor. + + my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); # or any other filehandle + +The report has two attributes (query and database), and one method (nextSbjct). + + $report->query; # access to the query name + $report->database; # access to the database name + $report->nextSbjct; # gets the next subject + while(my $sbjct = $report->nextSbjct) { + # canonical form of use is in a while loop + } + +A subject is a BLAST hit, which should not be confused with an HSP (below). A +BLAST hit may have several alignments associated with it. A useful way of +thinking about it is that a subject is a gene and HSPs are the exons. Subjects +have one attribute (name) and one method (nextHSP). + + $sbjct->name; # access to the subject name + $sbjct->nextHSP; # gets the next HSP from the sbjct + while(my $hsp = $sbjct->nextHSP) { + # canonical form is again a while loop + } + +An HSP is a high scoring pair, or simply an alignment. HSP objects +inherit all the useful methods from RangeI/SeqFeatureI/FeaturePair, +but provide an additional set of attributes (score, bits, percent, P, +match, EXP, positive, length, querySeq, sbjctSeq, homologySeq) that +should be familiar to anyone who has seen a blast report. + +For lazy/efficient coders, two-letter abbreviations are available for the +attributes with long names (qs, ss, hs). Ranges of the aligned sequences in +query/subject and other information (like seqname) are stored +in SeqFeature objects (i.e.: $hsp-E<gt>query, $hsp-E<gt>subject which is equal to +$hsp-E<gt>feature1, $hsp-E<gt>feature2). querySeq, sbjctSeq and homologySeq do only +contain the alignment sequences from the blast report. + + $hsp->score; + $hsp->bits; + $hsp->percent; + $hsp->P; + $hsp->match; + $hsp->positive; + $hsp->length; + $hsp->querySeq; $hsp->qs; + $hsp->sbjctSeq; $hsp->ss; + $hsp->homologySeq; $hsp->hs; + $hsp->query->start; + $hsp->query->end; + $hsp->query->seq_id; + $hsp->hit->primary_tag; # "similarity" + $hsp->hit->source_tag; # "BLAST" + $hsp->hit->start; + $hsp->hit->end; + ... + +So a very simple look into a BLAST report might look like this. + + my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); + while(my $sbjct = $report->nextSbjct) { + print ">",$sbjct->name,"\n"; + while(my $hsp = $sbjct->nextHSP) { + print "\t",$hsp->start,"..",$hsp->end," ",$hsp->bits,"\n"; + } + } + +The output of such code might look like this: + + >foo + 100..155 29.5 + 268..300 20.1 + >bar + 100..153 28.5 + 265..290 22.1 + + +=head1 AUTHORS + +Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +Lorenz Pollak (lorenz@ist.org, bioperl port) + +=head1 ACKNOWLEDGEMENTS + +This software was developed at the Genome Sequencing Center at Washington +Univeristy, St. Louis, MO. + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 COPYRIGHT + +Copyright (C) 1999 Ian Korf. All Rights Reserved. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +package Bio::Tools::BPlite; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct +use Bio::SeqAnalysisParserI; +use Symbol; + +@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); + +# new comes from a RootI now + +=head2 new + + Title : new + Function: Create a new Bio::Tools::BPlite object + Returns : Bio::Tools::BPlite + Args : -file input file (alternative to -fh) + -fh input stream (alternative to -file) + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + # initialize IO + $self->_initialize_io(@args); + + $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST + + if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments + else {$self->{'REPORT_DONE'} = 1} # empty report + + return $self; # success - we hope! +} + +# for SeqAnalysisParserI compliance + +=head2 next_feature + + Title : next_feature + Usage : while( my $feat = $res->next_feature ) { # do something } + Function: SeqAnalysisParserI implementing function. This implementation + iterates over all HSPs. If the HSPs of the current subject match + are exhausted, it will automatically call nextSbjct(). + Example : + Returns : A Bio::SeqFeatureI compliant object, in this case a + Bio::Tools::BPlite::HSP object, and FALSE if there are no more + HSPs. + Args : None + +=cut + +sub next_feature{ + my ($self) = @_; + my ($sbjct, $hsp); + $sbjct = $self->{'_current_sbjct'}; + unless( defined $sbjct ) { + $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct; + return undef unless defined $sbjct; + } + $hsp = $sbjct->nextHSP; + unless( defined $hsp ) { + $self->{'_current_sbjct'} = undef; + return $self->next_feature; + } + return $hsp || undef; +} + +=head2 query + + Title : query + Usage : $query = $obj->query(); + Function : returns the query object + Example : + Returns : query object + Args : + +=cut + +sub query {shift->{'QUERY'}} + +=head2 qlength + + Title : qlength + Usage : $len = $obj->qlength(); + Function : returns the length of the query + Example : + Returns : length of query + Args : + +=cut + +sub qlength {shift->{'LENGTH'}} + +=head2 pattern + + Title : pattern + Usage : $pattern = $obj->pattern(); + Function : returns the pattern used in a PHIBLAST search + +=cut + +sub pattern {shift->{'PATTERN'}} + +=head2 query_pattern_location + + Title : query_pattern_location + Usage : $qpl = $obj->query_pattern_location(); + Function : returns reference to array of locations in the query sequence + of pattern used in a PHIBLAST search + +=cut + +sub query_pattern_location {shift->{'QPATLOCATION'}} + +=head2 database + + Title : database + Usage : $db = $obj->database(); + Function : returns the database used in this search + Example : + Returns : database used for search + Args : + +=cut + +sub database {shift->{'DATABASE'}} + +=head2 nextSbjct + + Title : nextSbjct + Usage : $sbjct = $obj->nextSbjct(); + Function : Method of iterating through all the Sbjct retrieved + from parsing the report + Example : while ( my $sbjct = $obj->nextSbjct ) {} + Returns : next Sbjct object or null if finished + Args : + +=cut + +sub nextSbjct { + my ($self) = @_; + + $self->_fastForward or return undef; + + ####################### + # get all sbjct lines # + ####################### + my $def = $self->_readline(); + while(defined ($_ = $self->_readline() ) ) { + if ($_ !~ /\w/) {next} + elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data + elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback($_); last} + elsif ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) { + $self->_pushback($_); + last; + } + else {$def .= $_} + } + $def =~ s/\s+/ /g; + $def =~ s/\s+$//g; + $def =~ s/Length = ([\d,]+)$//g; + my $length = $1; + return undef unless $def =~ /^>/; + $def =~ s/^>//; + + #################### + # the Sbjct object # + #################### + my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, + '-length'=>$length, + '-parent'=>$self); + return $sbjct; +} + +# begin private routines + +sub _parseHeader { + my ($self) = @_; + + # normally, _parseHeader will break out of the parse as soon as it + # reaches a new Subject (i.e. the first one after the header) if you + # call _parseHeader twice in a row, with nothing in between, all you + # accomplish is a ->nextSubject call.. so we need a flag to + # indicate that we have *entered* a header, before we are allowed to + # leave it! + + my $header_flag = 0; # here is the flag/ It is "false" at first, and + # is set to "true" when any valid header element + # is encountered + + $self->{'REPORT_DONE'} = 0; # reset this bit for a new report + while(defined($_ = $self->_readline() ) ) { + s/\(\s*\)//; + if ($_ =~ /^Query=(?:\s+([^\(]+))?/) { + $header_flag = 1; # valid header element found + my $query = $1; + while( defined($_ = $self->_readline() ) ) { + # Continue reading query name until encountering either + # a line that starts with "Database" or a blank line. + # The latter condition is needed in order to be able to + # parse megablast output correctly, since Database comes + # before (not after) the query. + if( ($_ =~ /^Database/) || ($_ =~ /^$/) ) { + $self->_pushback($_); last; + } + $query .= $_; + } + $query =~ s/\s+/ /g; + $query =~ s/^>//; + + my $length = 0; + if( $query =~ /\(([\d,]+)\s+\S+\)\s*$/ ) { + $length = $1; + $length =~ s/,//g; + } else { + $self->debug("length is 0 for '$query'\n"); + } + $self->{'QUERY'} = $query; + $self->{'LENGTH'} = $length; + } + elsif ($_ =~ /^(<b>)?(T?BLAST[NPX])\s+([\w\.-]+)\s+(\[[\w-]*\])/) { + $self->{'BLAST_TYPE'} = $2; + $self->{'BLAST_VERSION'} = $3; + } # BLAST report type - not a valid header element # JB949 + + # Support Paracel BTK output + elsif ( $_ =~ /(^[A-Z0-9_]+)\s+BTK\s+/ ) { + $self->{'BLAST_TYPE'} = $1; + $self->{'BTK'} = 1; + } + elsif ($_ =~ /^Database:\s+(.+)/) {$header_flag = 1;$self->{'DATABASE'} = $1} # valid header element found + elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) { + # For PHIBLAST reports + $header_flag = 1; # valid header element found + $self->{'PATTERN'} = $1; + push (@{$self->{'QPATLOCATION'}}, $2); + } + elsif (($_ =~ /^>/) && ($header_flag==1)) {$self->_pushback($_); return 1} # only leave if we have actually parsed a valid header! + elsif (($_ =~ /^Parameters|^\s+Database:/) && ($header_flag==1)) { # if we entered a header, and saw nothing before the stats at the end, then it was empty + $self->_pushback($_); + return 0; # there's nothing in the report + } + # bug fix suggested by MI Sadowski via Martin Lomas + # see bug report #1118 + if( ref($self->_fh()) !~ /GLOB/ && $self->_fh()->can('EOF') && eof($self->_fh()) ) { + $self->warn("unexpected EOF in file\n"); + return -1; + } + } + return -1; # EOF +} + +sub _fastForward { + my ($self) = @_; + return 0 if $self->{'REPORT_DONE'}; # empty report + while(defined( $_ = $self->_readline() ) ) { + if ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) { + return 0; + } elsif( $_ =~ /^>/ ) { + $self->_pushback($_); + return 1; + } + } + unless( $self->{'BTK'} ) { # Paracel BTK reports have no footer + $self->warn("Possible error (1) while parsing BLAST report!"); + } +} + +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/BPlite/HSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPlite/HSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,505 @@ +############################################################################### +# Bio::Tools::BPlite::HSP +############################################################################### +# HSP = High Scoring Pair (to all non-experts as I am) +# +# The original BPlite.pm module has been written by Ian Korf ! +# see http://sapiens.wustl.edu/~ikorf +# +# You may distribute this module under the same terms as perl itself + + +# +# BioPerl module for Bio::Tools::BPlite::HSP +# +# Cared for by Peter Schattner <schattner@alum.mit.edu> +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::BPlite::HSP - Blast report High Scoring Pair (HSP) + +=head1 SYNOPSIS + + use Bio::Tools::BPlite; + my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); + { + while(my $sbjct = $report->nextSbjct) { + while (my $hsp = $sbjct->nextHSP) { + $hsp->score; + $hsp->bits; + $hsp->percent; + $hsp->P; + $hsp->match; + $hsp->positive; + $hsp->length; + $hsp->querySeq; + $hsp->sbjctSeq; + $hsp->homologySeq; + $hsp->query->start; + $hsp->query->end; + $hsp->hit->start; + $hsp->hit->end; + $hsp->hit->seq_id; + $hsp->hit->overlaps($exon); + } + } + + # the following line takes you to the next report in the stream/file + # it will return 0 if that report is empty, + # but that is valid for an empty blast report. + # Returns -1 for EOF. + + last if ($report->_parseHeader == -1)); + + redo + } + +=head1 DESCRIPTION + +This object handles the High Scoring Pair data for a Blast report. +This is where the percent identity, query and hit sequence length, +P value, etc are stored and where most of the necessary information is located when building logic around parsing a Blast report. + +See L<Bio::Tools::BPlite> for more detailed information on the entire +BPlite Blast parsing system. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::BPlite::HSP; + +use vars qw(@ISA); +use strict; + +# to disable overloading comment this out: +#use overload '""' => '_overload'; + +# Object preamble - inheriets from Bio::SeqFeature::SimilarityPair + +use Bio::SeqFeature::SimilarityPair; +use Bio::SeqFeature::Similarity; + +@ISA = qw(Bio::SeqFeature::SimilarityPair); + +sub new { + my ($class, @args) = @_; + + # workaround to make sure frame is not set before strand is + # interpreted from query/hit info + # this workaround removes the key from the hash + # so the superclass does not try and work with it + # we'll take care of setting it in this module later on + + my %newargs = @args; + foreach ( keys %newargs ) { + if( /frame$/i ) { + delete $newargs{$_}; + } + } + # done with workaround + + my $self = $class->SUPER::new(%newargs); + + my ($score,$bits,$match,$hsplength,$positive,$gaps,$p,$exp,$qb,$qe,$sb, + $se,$qs,$ss,$hs,$qname,$sname,$qlength,$slength,$qframe,$sframe, + $blasttype) = + $self->_rearrange([qw(SCORE + BITS + MATCH + HSPLENGTH + POSITIVE + GAPS + P + EXP + QUERYBEGIN + QUERYEND + SBJCTBEGIN + SBJCTEND + QUERYSEQ + SBJCTSEQ + HOMOLOGYSEQ + QUERYNAME + SBJCTNAME + QUERYLENGTH + SBJCTLENGTH + QUERYFRAME + SBJCTFRAME + BLASTTYPE + )],@args); + + $blasttype = 'UNKNOWN' unless $blasttype; + $self->report_type($blasttype); + # Determine strand meanings + my ($queryfactor, $sbjctfactor) = (1,0); # default + if ($blasttype eq 'BLASTP' || $blasttype eq 'TBLASTN' ) { + $queryfactor = 0; + } + if ($blasttype eq 'TBLASTN' || $blasttype eq 'TBLASTX' || + $blasttype eq 'BLASTN' ) { + $sbjctfactor = 1; + } + + # Set BLAST type + $self->{'BLAST_TYPE'} = $blasttype; + + # Store the aligned query as sequence feature + my $strand; + if ($qe > $qb) { # normal query: start < end + if ($queryfactor) { $strand = 1; } else { $strand = undef; } + $self->query( Bio::SeqFeature::Similarity->new + (-start=>$qb, -end=>$qe, -strand=>$strand, + -source=>"BLAST" ) ) } + else { # reverse query (i dont know if this is possible, but feel free to correct) + if ($queryfactor) { $strand = -1; } else { $strand = undef; } + $self->query( Bio::SeqFeature::Similarity->new + (-start=>$qe, -end=>$qb, -strand=>$strand, + -source=>"BLAST" ) ) } + + # store the aligned hit as sequence feature + if ($se > $sb) { # normal hit + if ($sbjctfactor) { $strand = 1; } else { $strand = undef; } + $self->hit( Bio::SeqFeature::Similarity->new + (-start=>$sb, -end=>$se, -strand=>$strand, + -source=>"BLAST" ) ) } + else { # reverse hit: start bigger than end + if ($sbjctfactor) { $strand = -1; } else { $strand = undef; } + $self->hit( Bio::SeqFeature::Similarity->new + (-start=>$se, -end=>$sb, -strand=>$strand, + -source=>"BLAST" ) ) } + + # name the sequences + $self->query->seq_id($qname); # query name + $self->hit->seq_id($sname); # hit name + + # set lengths + $self->query->seqlength($qlength); # query length + $self->hit->seqlength($slength); # hit length + + # set object vars + $self->score($score); + $self->bits($bits); + + $self->significance($p); + $self->{'EXP'} = $exp; + + $self->query->frac_identical($match); + $self->hit->frac_identical($match); + $self->{'HSPLENGTH'} = $hsplength; + $self->{'PERCENT'} = int((1000 * $match)/$hsplength)/10; + $self->{'POSITIVE'} = $positive; + $self->{'GAPS'} = $gaps; + $self->{'QS'} = $qs; + $self->{'SS'} = $ss; + $self->{'HS'} = $hs; + + $self->frame($qframe, $sframe); + return $self; # success - we hope! +} + +# to disable overloading comment this out: +sub _overload { + my $self = shift; + return $self->start."..".$self->end." ".$self->bits; +} + +=head2 report_type + + Title : report_type + Usage : $type = $sbjct->report_type() + Function : Returns the type of report from which this hit was obtained. + This usually pertains only to BLAST and friends reports, for which + the report type denotes what type of sequence was aligned against + what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated dna-prt, + TBLASTN prt-translated dna, TBLASTX translated dna-translated dna). + Example : + Returns : A string (BLASTN, BLASTP, BLASTX, TBLASTN, TBLASTX, UNKNOWN) + Args : a string on set (you should know what you are doing) + +=cut + +sub report_type { + my ($self, $rpt) = @_; + if($rpt) { + $self->{'_report_type'} = $rpt; + } + return $self->{'_report_type'}; +} + +=head2 EXP + + Title : EXP + Usage : my $exp = $hsp->EXP; + Function: returns the EXP value for the HSP + Returns : string value + Args : none + Note : Patch provided by Sami Ashour for BTK parsing + + +=cut + +sub EXP{ + return $_[0]->{'EXP'}; +} + + +=head2 P + + Title : P + Usage : $hsp->P(); + Function : returns the P (significance) value for a HSP + Returns : (double) significance value + Args : + +=cut + +sub P { + my ($self, @args) = @_; + my $float = $self->significance(@args); + my $match = '([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?'; # Perl Cookbook 2.1 + if ($float =~ /^$match$/) { + # Is a C float + return $float; + } elsif ("1$float" =~ /^$match$/) { + # Almost C float, Jitterbug 974 + return "1$float"; + } else { + $self->warn("[HSP::P()] '$float' is not a known number format. Returning zero (0) instead."); + return 0; + } +} + +=head2 percent + + Title : percent + Usage : $hsp->percent(); + Function : returns the percent matching + Returns : (double) percent matching + Args : none + +=cut + +sub percent {shift->{'PERCENT'}} + + +=head2 match + + Title : match + Usage : $hsp->match(); + Function : returns the match + Example : + Returns : (double) frac_identical + Args : + +=cut + +sub match {shift->query->frac_identical(@_)} + +=head2 hsplength + + Title : hsplength + Usage : $hsp->hsplength(); + Function : returns the HSP length (including gaps) + Returns : (integer) HSP length + Args : none + +=cut + +sub hsplength {shift->{'HSPLENGTH'}} + +=head2 positive + + Title : positive + Usage : $hsp->positive(); + Function : returns the number of positive matches (symbols in the alignment + with a positive score) + Returns : (int) number of positive matches in the alignment + Args : none + +=cut + +sub positive {shift->{'POSITIVE'}} + +=head2 gaps + + Title : gaps + Usage : $hsp->gaps(); + Function : returns the number of gaps or 0 if none + Returns : (int) number of gaps or 0 if none + Args : none + +=cut + +sub gaps {shift->{'GAPS'}} + +=head2 querySeq + + Title : querySeq + Usage : $hsp->querySeq(); + Function : returns the query sequence + Returns : (string) the Query Sequence + Args : none + +=cut + +sub querySeq {shift->{'QS'}} + +=head2 sbjctSeq + + Title : sbjctSeq + Usage : $hsp->sbjctSeq(); + Function : returns the Sbjct sequence + Returns : (string) the Sbjct Sequence + Args : none + +=cut + +sub sbjctSeq {shift->{'SS'}} + +=head2 homologySeq + + Title : homologySeq + Usage : $hsp->homologySeq(); + Function : returns the homologous sequence + Returns : (string) homologous sequence + Args : none + +=cut + +sub homologySeq {shift->{'HS'}} + +=head2 qs + + Title : qs + Usage : $hsp->qs(); + Function : returns the Query Sequence (same as querySeq) + Returns : (string) query Sequence + Args : none + +=cut + +sub qs {shift->{'QS'}} + +=head2 ss + + Title : ss + Usage : $hsp->ss(); + Function : returns the subject sequence ( same as sbjctSeq) + Returns : (string) Sbjct Sequence + Args : none + +=cut + +sub ss {shift->{'SS'}} + +=head2 hs + + Title : hs + Usage : $hsp->hs(); + Function : returns the Homologous Sequence (same as homologySeq ) + Returns : (string) Homologous Sequence + Args : none + +=cut + +sub hs {shift->{'HS'}} + +sub frame { + my ($self, $qframe, $sframe) = @_; + if( defined $qframe ) { + if( $qframe == 0 ) { + $qframe = undef; + } elsif( $qframe !~ /^([+-])?([1-3])/ ) { + $self->warn("Specifying an invalid query frame ($qframe)"); + $qframe = undef; + } else { + if( ($1 eq '-' && $self->query->strand >= 0) || + ($1 eq '+' && $self->query->strand <= 0) ) { + $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")"); + } + # Set frame to GFF [0-2] + $qframe = $2 - 1; + } + $self->{'QFRAME'} = $qframe; + } + if( defined $sframe ) { + if( $sframe == 0 ) { + $sframe = undef; + } elsif( $sframe !~ /^([+-])?([1-3])/ ) { + $self->warn("Specifying an invalid hit frame ($sframe)"); + $sframe = undef; + } else { + if( ($1 eq '-' && $self->hit->strand >= 0) || + ($1 eq '+' && $self->hit->strand <= 0) ) + { + $self->warn("Hit frame ($sframe) did not match strand of hit (". $self->hit->strand() . ")"); + } + + # Set frame to GFF [0-2] + $sframe = $2 - 1; + } + $self->{'SFRAME'} = $sframe; + } + + (defined $qframe && $self->SUPER::frame($qframe) && + ($self->{'FRAME'} = $qframe)) || + (defined $sframe && $self->SUPER::frame($sframe) && + ($self->{'FRAME'} = $sframe)); + + if (wantarray() && + $self->{'BLAST_TYPE'} eq 'TBLASTX') + { + return ($self->{'QFRAME'}, $self->{'SFRAME'}); + } elsif (wantarray()) { + (defined $self->{'QFRAME'} && + return ($self->{'QFRAME'}, undef)) || + (defined $self->{'SFRAME'} && + return (undef, $self->{'SFRAME'})); + } else { + (defined $self->{'QFRAME'} && + return $self->{'QFRAME'}) || + (defined $self->{'SFRAME'} && + return $self->{'SFRAME'}); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/BPlite/Iteration.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPlite/Iteration.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,394 @@ +# $Id: Iteration.pm,v 1.15 2002/06/19 00:27:49 jason Exp $ +# Bioperl module Bio::Tools::BPlite::Iteration +# based closely on the Bio::Tools::BPlite modules +# Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +# Lorenz Pollak (lorenz@ist.org, bioperl port) +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# October 20, 2000 +# POD documentation - main docs before the code +# +# Added to get a simple_align object for a psiblast run with the -m 6 flag /AE +# + +=head1 NAME + +Bio::Tools::BPlite::Iteration - object for parsing single iteration +of a PSIBLAST report + +=head1 SYNOPSIS + + use Bio::Tools:: BPpsilite; + + open FH, "t/psiblastreport.out"; + $report = Bio::Tools::BPpsilite->new(-fh=>\*FH); + + # determine number of iterations executed by psiblast + $total_iterations = $report->number_of_iterations; + $last_iteration = $report->round($total_iterations); + + # Process only hits found in last iteration ... + $oldhitarray_ref = $last_iteration->oldhits; + HIT: while($sbjct = $last_iteration->nextSbjct) { + $id = $sbjct->name; + $is_old = grep /\Q$id\E/, @$oldhitarray_ref; + if ($is_old ){next HIT;} + # do something with new hit... + } + +=head2 ALIGNMENTS + + # This assumed that you have $db pointing to a database, $out to an output file + # $slxdir to a directory and $psiout + # note the alignments can only be obtained if the flag "-m 6" is run. + # It might also be necessary to use the flag -v to get all alignments + # + my @psiparams = ('database' => $db , 'output' => $out, 'j' => 3, 'm' => 6, + 'h' => 1.e-3 , 'F' => 'T' , 'Q' => $psiout ); + my $factory = Bio::Tools::Run::StandAloneBlast->new(@psiparams); + my $report = $factory->blastpgp($seq); + my $total_iterations = $report->number_of_iterations(); + my $last_iteration = $report->round($total_iterations); + my $align=$last_iteration->Align; + my $slxfile=$slxdir.$id.".slx"; + my $slx = Bio::AlignIO->new('-format' => 'selex','-file' => ">".$slxfile ); + $slx->write_aln($align); + +=head1 DESCRIPTION + +See the documentation for BPpsilite.pm for a description of the +Iteration.pm module. + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@cgt.mc.duke.edu + +=head1 ACKNOWLEDGEMENTS + +Based on work of: +Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +Lorenz Pollak (lorenz@ist.org, bioperl port) + +=head1 COPYRIGHT + +BPlite.pm is copyright (C) 1999 by Ian Korf. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +package Bio::Tools::BPlite::Iteration; + +use strict; +use vars qw(@ISA); +use Bio::Root::Root; # root object to inherit from +use Bio::Tools::BPlite; # +use Bio::Tools::BPlite::Sbjct; + +@ISA = qw(Bio::Root::Root); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + ($self->{'PARENT'},$self->{'ROUND'}) = + $self->_rearrange([qw(PARENT + ROUND + )],@args); + + $self->{'QUERY'} = $self->{'PARENT'}->{'QUERY'}; + $self->{'LENGTH'} = $self->{'PARENT'}->{'LENGTH'}; + + if($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments + else {$self->{'REPORT_DONE'} = 1} # empty report + + return $self; # success - we hope! +} + +=head2 query + + Title : query + Usage : $query = $obj->query(); + Function : returns the query object + Example : + Returns : query object + Args : + +=cut + +sub query {shift->{'QUERY'}} + +=head2 qlength + + Title : qlength + Usage : $len = $obj->qlength(); + Returns : length of query + Args : none + +=cut + +sub qlength {shift->{'LENGTH'}} + +=head2 newhits + + Title : newhits + Usage : $newhits = $obj->newhits(); + Returns : reference to an array listing all the hits + from the current iteration which were not identified + in the previous iteration + Args : none + +=cut + +sub newhits {shift->{'NEWHITS'}} + +=head2 oldhits + + Title : oldhits + Usage : $oldhits = $obj->oldhits(); + Returns : reference to an array listing all the hits from + the current iteration which were identified and + above threshold in the previous iteration + Args : none + +=cut + +sub oldhits {shift->{'OLDHITS'}} + + +=head2 nextSbjct + + Title : nextSbjct + Usage : $sbjct = $obj->nextSbjct(); + Function : Method of iterating through all the Sbjct retrieved + from parsing the report +#Example : while ( my $sbjct = $obj->nextSbjct ) {} + Returns : next Sbjct object or undef if finished + Args : + +=cut + +sub nextSbjct { + my ($self) = @_; + $self->_fastForward or return undef; + + ####################### + # get all sbjct lines # + ####################### + my $def = $self->_readline(); + + while(defined ($_ = $self->_readline) ) { + if ($_ !~ /\w/) {next} + elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data + elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback( $_); last} + elsif ($_ =~ /^(\d+) .* \d+$/) { # This is not correct at all + $self->_pushback($_); # 1: HSP does not work for -m 6 flag + $def = $1; # 2: length/name are incorrect + my $length = undef; # 3: Names are repeated many times. + my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, + '-length'=>$length, + '-parent'=>$self); + return $sbjct; + } # m-6 + elsif ($_ =~ /^Parameters|^\s+Database:|^\s+Posted date:/) { + $self->_pushback( $_); + last; + } else {$def .= $_} +} + $def =~ s/\s+/ /g; + $def =~ s/\s+$//g; + $def =~ s/Length = ([\d,]+)$//g; + my $length = $1; + return 0 unless $def =~ /^>/; + $def =~ s/^>//; + + #################### + # the Sbjct object # + #################### + my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, + '-length'=>$length, + '-parent'=>$self); + return $sbjct; +} + + +# This is added by /AE + +=head2 Align + + Title : Align + Usage : $SimpleAlign = $obj->Align(); + Function : Method to obtain a simpleAlign object from psiblast + Example : $SimpleAlign = $obj->Align(); + Returns : SimpleAlign object or undef if not found. + BUG : Only works if psiblast has been run with m 6 flag + Args : + +=cut + +sub Align { + use Bio::SimpleAlign; + my ($self) = @_; + $self->_fastForward or return undef; + my $lastline = $self->_readline(); + return undef unless $lastline =~ /^QUERY/; # If psiblast not run correctly + my (%sequence,%first,%last,$num); + + if ( $lastline =~ /^QUERY\s+(\d*)\s*([-\w]+)\s*(\d*)\s*$/){ + my $name='QUERY'; + my $start=$1; + my $seq=$2; + my $stop=$3; + $seq =~ s/-/\./g; + $start =~ s/ //g; + $stop =~ s/ //g; + $sequence{$name} .= $seq; + if ($first{$name} eq ''){$first{$name}=$start;} + if ($stop ne ''){$last{$name}=$stop;} +# print "FOUND:\t$seq\t$start\t$stop\n"; + $num=0; + } + while(defined($_ = $self->_readline()) ){ + chomp($_); + if ( $_ =~ /^QUERY\s+(\d+)\s*([\-A-Z]+)\s*(\+)\s*$/){ + my $name='QUERY'; + my $start=$1; + my $seq=$2; + my $stop=$3; + $seq =~ s/-/\./g; + $start =~ s/ //g; + $stop =~ s/ //g; + $sequence{$name} .= $seq; + if ($first{$name} eq '') { $first{$name} = $start;} + if ($stop ne '') { $last{$name}=$stop;} + $num=0; + } elsif ( $_ =~ /^(\d+)\s+(\d+)\s*([\-A-Z]+)\s*(\d+)\s*$/ ){ + my $name=$1.".".$num; + my $start=$2; + my $seq=$3; + my $stop=$4; + $seq =~ s/-/\./g; + $start =~ s/ //g; + $stop =~ s/ //g; + $sequence{$name} .= $seq; + if ($first{$name} eq ''){$first{$name}=$start;} + if ($stop ne ''){$last{$name}=$stop;} + $num++; + } + } + my $align = new Bio::SimpleAlign(); + my @keys=sort keys(%sequence); + foreach my $name (@keys){ + my $nse = $name."/".$first{$name}."-".$last{$name}; + my $seqobj = Bio::LocatableSeq->new( -seq => $sequence{$name}, + -id => $name, + -name => $nse, + -start => $first{$name}, + -end => $last{$name} + ); + + $align->add_seq($seqobj); + } + return $align; +} + +# Start of internal subroutines. + +sub _parseHeader { + my ($self) = @_; + my (@old_hits, @new_hits); + + my $newhits_true = ($self->{'ROUND'} < 2) ? 1 : 0 ; + while(defined($_ = $self->_readline()) ) { + if ($_ =~ /(\w\w|.*|\w+.*)\s\s+(\d+)\s+([-\.e\d]+)$/) { + my $id = $1; + my $score= $2; #not used currently + my $evalue= $3; #not used currently + if ($newhits_true) { push ( @new_hits, $id);} + else { push (@old_hits, $id);} + } + elsif ($_ =~ /^Sequences not found previously/) {$newhits_true = 1 ;} +# This is changed for "-m 6" option /AE + elsif ($_ =~ /^>/ || $_ =~ /^QUERY/) + { + $self->_pushback($_); + $self->{'OLDHITS'} = \@old_hits; + $self->{'NEWHITS'} = \@new_hits; + return 1; + } + elsif ($_ =~ /^Parameters|^\s+Database:|^\s*Results from round\s+(d+)/) { + $self->_pushback($_); + return 0; # no sequences found in this iteration + } + } + return 0; # no sequences found in this iteration +} + +sub _fastForward { + my ($self) = @_; + return 0 if $self->{'REPORT_DONE'}; # empty report + + while(defined($_ = $self->_readline()) ) { + if( $_ =~ /^>/ || + $_ =~ /^QUERY|^\d+ .* \d+$/ ) { # Changed to also handle "-m 6" /AE + $self->_pushback($_); + return 1; + } +# print "FASTFORWARD",$_,"\n"; + if ($_ =~ /^>|^Parameters|^\s+Database:/) { + $self->_pushback($_); + return 1; + } + } + $self->warn("Possible error (2) while parsing BLAST report!"); +} + + +=head2 _readline + + Title : _readline + Usage : $obj->_readline + Function: Reads a line of input. + + Note that this method implicitely uses the value of $/ that is + in effect when called. + + Note also that the current implementation does not handle pushed + back input correctly unless the pushed back input ends with the + value of $/. + Example : + Returns : + +=cut + +sub _readline{ + my ($self) = @_; + return $self->{'PARENT'}->_readline(); +} + +=head2 _pushback + + Title : _pushback + Usage : $obj->_pushback($newvalue) + Function: puts a line previously read with _readline back into a buffer + Example : + Returns : + Args : newvalue + +=cut + +sub _pushback { + my ($self, $arg) = @_; + return $self->{'PARENT'}->_pushback($arg); +} +1; +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/BPlite/Sbjct.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPlite/Sbjct.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,361 @@ +# $Id: Sbjct.pm,v 1.23.2.1 2003/02/20 00:39:03 jason Exp $ +############################################################################### +# Bio::Tools::BPlite::Sbjct +############################################################################### +# +# The original BPlite.pm module has been written by Ian Korf ! +# see http://sapiens.wustl.edu/~ikorf +# +# You may distribute this module under the same terms as perl itself +# +# BioPerl module for Bio::Tools::BPlite::Sbjct +# +# Cared for by Peter Schattner <schattner@alum.mit.edu> +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::BPlite::Sbjct - A Blast Subject (database search Hit) + +=head1 SYNOPSIS + + use Bio::Tools::BPlite + my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); + while(my $sbjct = $report->nextSbjct) { + $sbjct->name; # access to the hit name + "$sbjct"; # overloaded to return name + $sbjct->nextHSP; # gets the next HSP from the sbjct + while(my $hsp = $sbjct->nextHSP) { + # canonical form is again a while loop + } + +=head1 DESCRIPTION + +See L<Bio::Tools::BPlite> for a more detailed information about the +BPlite BLAST parsing objects. + +The original BPlite.pm module has been written by Ian Korf! +See http://sapiens.wustl.edu/~ikorf + +The Sbjct object encapsulates a Hit in a Blast database +search. The Subjects are the "Hits" for a particular query. A +Subject may be made up of multiple High Scoring Pairs (HSP) which are +accessed through the nextHSP method. + +If you are searching for the P-value or percent identity that is +specific to each HSP and you will need to use the nextHSP method to +get access to that data. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::BPlite::Sbjct; + +use strict; + +use Bio::Root::Root; # root object to inherit from +use Bio::Tools::BPlite::HSP; # we want to use HSP +#use overload '""' => 'name'; +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + ($self->{'NAME'},$self->{'LENGTH'}, + $self->{'PARENT'}) = + $self->_rearrange([qw(NAME + LENGTH + PARENT + )],@args); + $self->report_type($self->{'PARENT'}->{'BLAST_TYPE'} || 'UNKNOWN'); + $self->{'HSP_ALL_PARSED'} = 0; + + return $self; +} + +=head2 name + + Title : name + Usage : $name = $obj->name(); + Function : returns the name of the Sbjct + Example : + Returns : name of the Sbjct + Args : + +=cut + +sub name {shift->{'NAME'}} + +=head2 report_type + + Title : report_type + Usage : $type = $sbjct->report_type() + Function : Returns the type of report from which this hit was obtained. + This usually pertains only to BLAST and friends reports, for which + the report type denotes what type of sequence was aligned against + what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated dna-prt, + TBLASTN prt-translated dna, TBLASTX translated dna-translated dna). + Example : + Returns : A string (BLASTN, BLASTP, BLASTX, TBLASTN, TBLASTX, UNKNOWN) + Args : a string on set (you should know what you are doing) + +=cut + +sub report_type { + my ($self, $rpt) = @_; + if($rpt) { + $self->{'_report_type'} = $rpt; + } + return $self->{'_report_type'}; +} + +=head2 nextFeaturePair + + Title : nextFeaturePair + Usage : $name = $obj->nextFeaturePair(); + Function : same as the nextHSP function + Example : + Returns : next FeaturePair + Args : + +=cut + +sub nextFeaturePair {shift->nextHSP}; # just another name + +=head2 nextHSP + + Title : nextHSP + Usage : $hsp = $obj->nextHSP(); + Function : returns the next available High Scoring Pair + Example : + Returns : Bio::Tools::HSP or null if finished + Args : + +=cut + +sub nextHSP { + my ($self) = @_; + return undef if $self->{'HSP_ALL_PARSED'}; + + ############################ + # get and parse scorelines # + ############################ + my ($qframe, $sframe); + my $scoreline = $self->_readline(); + my $nextline = $self->_readline(); + return undef if not defined $nextline; + $scoreline .= $nextline; + my ($score, $bits); + if ($scoreline =~ /\d bits\)/) { + ($score, $bits) = $scoreline =~ + /Score = (\d+) \((\S+) bits\)/; # WU-BLAST + } + else { + ($bits, $score) = $scoreline =~ + /Score =\s+(\S+) bits \((\d+)/; # NCBI-BLAST + } + + my ($match, $hsplength) = ($scoreline =~ /Identities = (\d+)\/(\d+)/); + my ($positive) = ($scoreline =~ /Positives = (\d+)/); + my ($gaps) = ($scoreline =~ /Gaps = (\d+)/); + if($self->report_type() eq 'TBLASTX') { + ($qframe, $sframe) = $scoreline =~ /Frame =\s+([+-]\d)\s+\/\s+([+-]\d)/; + } elsif ($self->report_type() eq 'TBLASTN') { + ($sframe) = $scoreline =~ /Frame =\s+([+-]\d)/; + } else { + ($qframe) = $scoreline =~ /Frame =\s+([+-]\d)/; + } + $positive = $match if not defined $positive; + $gaps = '0' if not defined $gaps; + my ($p) = ($scoreline =~ /[Sum ]*P[\(\d+\)]* = (\S+)/); + unless (defined $p) {(undef, $p) = $scoreline =~ /Expect(\(\d+\))? =\s+(\S+)/} + my ($exp) = ($scoreline =~ /Expect(?:\(\d+\))? =\s+([^\s,]+)/); + $exp = -1 unless( defined $exp ); + + $self->throw("Unable to parse '$scoreline'") unless defined $score; + + ####################### + # get alignment lines # + ####################### + my (@hspline); + while( defined($_ = $self->_readline()) ) { + if ($_ =~ /^WARNING:|^NOTE:/) { + while(defined($_ = $self->_readline())) {last if $_ !~ /\S/} + } + elsif ($_ !~ /\S/) {next} + elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data + elsif ($_ =~ /^\s*Strand/) {next} # NCBI-BLAST non-data + elsif ($_ =~ /^\s*Score/) {$self->_pushback($_); last} + + elsif ($_ =~ /^>|^Histogram|^Searching|^Parameters|^\s+Database:|^CPU\stime|^\s*Lambda/) + { + #ps 5/28/01 + # elsif ($_ =~ /^>|^Parameters|^\s+Database:|^CPU\stime/) { + $self->_pushback($_); + + $self->{'HSP_ALL_PARSED'} = 1; + last; + } + elsif( $_ =~ /^\s*Frame/ ) { + if ($self->report_type() eq 'TBLASTX') { + ($qframe, $sframe) = $_ =~ /Frame = ([\+-]\d)\s+\/\s+([\+-]\d)/; + } elsif ($self->report_type() eq 'TBLASTN') { + ($sframe) = $_ =~ /Frame = ([\+-]\d)/; + } else { + ($qframe) = $_ =~ /Frame = ([\+-]\d)/; + } + } + else { + push @hspline, $_; # store the query line + $nextline = $self->_readline(); + # Skip "pattern" line when parsing PHIBLAST reports, otherwise store the alignment line + my $l1 = ($nextline =~ /^\s*pattern/) ? $self->_readline() : $nextline; + push @hspline, $l1; # store the alignment line + my $l2 = $self->_readline(); push @hspline, $l2; # grab/store the sbjct line + } + } + + ######################### + # parse alignment lines # + ######################### + my ($ql, $sl, $as) = ("", "", ""); + my ($qb, $qe, $sb, $se) = (0,0,0,0); + my (@QL, @SL, @AS); # for better memory management + + for(my $i=0;$i<@hspline;$i+=3) { + # warn $hspline[$i], $hspline[$i+2]; + $hspline[$i] =~ /^(?:Query|Trans):\s+(\d+)\s*([\D\S]+)\s+(\d+)/; + $ql = $2; $qb = $1 unless $qb; $qe = $3; + + my $offset = index($hspline[$i], $ql); + $as = substr($hspline[$i+1], $offset, CORE::length($ql)); + + $hspline[$i+2] =~ /^Sbjct:\s+(\d+)\s*([\D\S]+)\s+(\d+)/; + $sl = $2; $sb = $1 unless $sb; $se = $3; + + push @QL, $ql; push @SL, $sl; push @AS, $as; + } + + ################## + # the HSP object # + ################## + $ql = join("", @QL); + $sl = join("", @SL); + $as = join("", @AS); +# Query name and length are not in the report for a bl2seq report so {'PARENT'}->query and +# {'PARENT'}->qlength will not be available. + my ($qname, $qlength) = ('unknown','unknown'); + if ($self->{'PARENT'}->can('query')) { + $qname = $self->{'PARENT'}->query; + $qlength = $self->{'PARENT'}->qlength; + } + + my $hsp = new Bio::Tools::BPlite::HSP + ('-score' => $score, + '-bits' => $bits, + '-match' => $match, + '-positive' => $positive, + '-gaps' => $gaps, + '-hsplength' => $hsplength, + '-p' => $p, + '-exp' => $exp, + '-queryBegin' => $qb, + '-queryEnd' => $qe, + '-sbjctBegin' => $sb, + '-sbjctEnd' => $se, + '-querySeq' => $ql, + '-sbjctSeq' => $sl, + '-homologySeq'=> $as, + '-queryName' => $qname, +# '-queryName'=>$self->{'PARENT'}->query, + '-sbjctName' => $self->{'NAME'}, + '-queryLength'=> $qlength, +# '-queryLength'=>$self->{'PARENT'}->qlength, + '-sbjctLength'=> $self->{'LENGTH'}, + '-queryFrame' => $qframe, + '-sbjctFrame' => $sframe, + '-blastType' => $self->report_type()); + return $hsp; +} + +=head2 _readline + + Title : _readline + Usage : $obj->_readline + Function: Reads a line of input. + + Note that this method implicitely uses the value of $/ that is + in effect when called. + + Note also that the current implementation does not handle pushed + back input correctly unless the pushed back input ends with the + value of $/. + Example : + Returns : + +=cut + +sub _readline{ + my ($self) = @_; + return $self->{'PARENT'}->_readline(); +} + +=head2 _pushback + + Title : _pushback + Usage : $obj->_pushback($newvalue) + Function: puts a line previously read with _readline back into a buffer + Example : + Returns : + Args : newvalue + +=cut + +sub _pushback { + my ($self, $arg) = @_; + return $self->{'PARENT'}->_pushback($arg); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/BPpsilite.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPpsilite.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,357 @@ +# $Id: BPpsilite.pm,v 1.22 2002/10/22 07:38:45 lapp Exp $ +# Bioperl module Bio::Tools::BPpsilite +############################################################ +# based closely on the Bio::Tools::BPlite modules +# Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +# Lorenz Pollak (lorenz@ist.org, bioperl port) +# +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# October 20, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::BPpsilite - Lightweight BLAST parser for (iterated) psiblast reports + +=head1 SYNOPSIS + + use Bio::Tools::BPpsilite; + open FH, "t/psiblastreport.out"; + $report = Bio::Tools::BPpsilite->new(-fh=>\*FH); + + # determine number of iterations executed by psiblast + $total_iterations = $report->number_of_iterations; + $last_iteration = $report->round($total_iterations); + + # Process only hits found in last iteration ... + $oldhitarray_ref = $last_iteration->oldhits; + HIT: while($sbjct = $last_iteration->nextSbjct) { + $id = $sbjct->name; + $is_old = grep /\Q$id\E/, @$oldhitarray_ref; + if ($is_old ){next HIT;} + # do something with new hit... + } + + +=head1 DESCRIPTION + +BPpsilite is a package for parsing multiple iteration PSIBLAST +reports. It is based closely on Ian Korf's BPlite.pm module for +parsing single iteration BLAST reports (as modified by Lorenz Pollak). + +Two of the four basic objects of BPpsilite.pm are identical to the +corresponding objects in BPlite - the "HSP.pm" and "Sbjct.pm" objects. +This DESCRIPTION documents only the one new object, the "iteration", +as well as the additional methods that are implemented in BPpsilite +that are not in BPlite. See the BPlite documentation for information +on the BPlite, SBJCT and HSP objects. + +The essential difference between PSIBLAST and the other BLAST programs +(in terms of report parsing) is that PSIBLAST performs multiple +iterations of the BLASTing of the database and the results of all of +these iterations are stored in a single PSIBLAST report. (For general +information on PSIBLAST see the README.bla file in the standalone +BLAST distribution and references therein). PSIBLAST's use of multiple +iterations imposes additional demands on the report parser: * There +are several iterations of hits. Many of those hits will be repeated +in more than one iteration. Often only the last iteration will be of +interest. * Each iteration will list two different kinds of hits - +repeated hits that were used in the model and newly identified hits - +which may need to be processed in different manners * The total number +of iterations performed is not displayed in the report until (almost) +the very end of the report. (The user can specify a maximum number of +iterations for the PSIBLAST search, but the program may perform fewer +iterations if convergence is reached) + +BPpsilite addresses these issues by offering the following methods: + +* The total number of iteration used is given by the method + number_of_iterations as in: + + $total_iterations = $report->number_of_iterations; + +* Results from an arbitrary iteration round can be accessed by using + the 'round' method: + + $iteration3_report = $report->round(3); + +* The ids of the sequences which passed the significance threshold for + the first time in the "nth" iteration can be identified by using the + newhits method. Previously identified hits are identified by using + the oldhits method, as in: + + $oldhitarray_ref = $iteration3_report->oldhits; + $newhitarray_ref = $iteration3_report->newhits; + +BPpsilite.pm should work equally well on reports generated by the +StandAloneBlast.pm local BLAST module as with reports generated by +remote psiblast searches. For examples of usage of BPpsilite.pm, the +user is referred to the BPpsilite.t script in the "t" directory. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@cgt.mc.duke.edu + +=head1 ACKNOWLEDGEMENTS + +Based on work of: +Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +Lorenz Pollak (lorenz@ist.org, bioperl port) + +=head1 COPYRIGHT + +BPlite.pm is copyright (C) 1999 by Ian Korf. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +package Bio::Tools::BPpsilite; + +use strict; +use vars qw(@ISA); +use Bio::Tools::BPlite::Iteration; # +use Bio::Tools::BPlite::Sbjct; # Debug code +use Bio::Root::Root; # root interface to inherit from +use Bio::Root::IO; +use Bio::Tools::BPlite; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + # initialize IO + $self->_initialize_io(@args); + $self->{'_tempdir'} = $self->tempdir('CLEANUP' => 1); + $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST + $self->{'NEXT_ITERATION_NUMBER'} = 1; + $self->{'TOTAL_ITERATION_NUMBER'} = -1; # -1 indicates preprocessing not yet done + + if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments + else {$self->{'REPORT_DONE'} = 1} # empty report + + return $self; # success - we hope! +} + +=head2 query + + Title : query + Usage : $query = $obj->query(); + Function : returns the query object + Returns : query object + Args : + +=cut + +sub query {shift->{'QUERY'}} + +=head2 qlength + + Title : qlength + Usage : $len = $obj->qlength(); + Function : returns the length of the query + Returns : length of query + Args : + +=cut + +sub qlength {shift->{'LENGTH'}} + +=head2 database + + Title : database + Usage : $db = $obj->database(); + Function : returns the database used in this search + Returns : database used for search + Args : + +=cut + +sub database {shift->{'DATABASE'}} + +=head2 number_of_iterations + + Title : number_of_iterations + Usage : $total_iterations = $obj-> number_of_iterations(); + Function : returns the total number of iterations used in this search + Returns : total number of iterations used for search + Args : none + +=cut + + +=head2 pattern + + Title : database + Usage : $pattern = $obj->pattern(); + Function : returns the pattern used in a PHIBLAST search + +=cut + +sub pattern {shift->{'PATTERN'}} + +=head2 query_pattern_location + + Title : query_pattern_location + Usage : $qpl = $obj->query_pattern_location(); + Function : returns reference to array of locations in the query sequence + of pattern used in a PHIBLAST search + +=cut + +sub query_pattern_location {shift->{'QPATLOCATION'}} + + + + +sub number_of_iterations { + my $self = shift; + if ($self->{'TOTAL_ITERATION_NUMBER'} == -1){&_preprocess($self);} + $self->{'TOTAL_ITERATION_NUMBER'}; +} + +=head2 round + + Title : round + Usage : $Iteration3 = $report->round(3); + Function : Method of retrieving data from a specific iteration + Example : + Returns : reference to requested Iteration object or null if argument + is greater than total number of iterations + Args : number of the requested iteration + +=cut + +sub round { + my $self = shift; + my $iter_num = shift; + $self->_initialize_io(-file => Bio::Root::IO->catfile + ($self->{'_tempdir'},"iteration".$iter_num.".tmp")); + if( ! $self->_fh ) { + $self->throw("unable to re-open iteration file for round ".$iter_num); + } + return Bio::Tools::BPlite::Iteration->new(-round=>$iter_num, + -parent=>$self); +} + +# begin private routines + +sub _parseHeader { + my ($self) = @_; + + + while(defined ($_ = $self->_readline) ) { + if ($_ =~ /^Query=\s+([^\(]+)/) { + my $query = $1; + while(defined ($_ = $self->_readline)) { + last if $_ !~ /\S/; + $query .= $_; + } + $query =~ s/\s+/ /g; + $query =~ s/^>//; + $query =~ /\((\d+)\s+\S+\)\s*$/; + my $length = $1; + $self->{'QUERY'} = $query; + $self->{'LENGTH'} = $length; + } + elsif ($_ =~ /^Database:\s+(.+)/) {$self->{'DATABASE'} = $1} + elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) + { # For PHIBLAST reports + $self->{'PATTERN'} = $1; + push (@{$self->{'QPATLOCATION'}}, $2); + } elsif ($_ =~ /^>|^Results from round 1/) { + $self->_pushback($_); + return 1; + } elsif ($_ =~ /^Parameters|^\s+Database:/) { + $self->_pushback($_); + return 0; # there's nothing in the report + } + } +} + +=head2 _preprocess + + Title : _preprocess + Usage : internal routine, not called directly + Function : determines number of iterations in report and prepares + data so individual iterations canbe parsed in non-sequential + order + Example : + Returns : nothing. Sets TOTAL_ITERATION_NUMBER in object's hash + Args : reference to calling object + +=cut + +#' +sub _preprocess { + my $self = shift; +# $self->throw(" PSIBLAST report preprocessing not implemented yet!"); + + my $oldround = 0; + my ($currentline, $currentfile, $round); + +# open output file for data from iteration round #1 + $round = 1; + $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'}, + "iteration$round.tmp"); + open (FILEHANDLE, ">$currentfile") || + $self->throw("cannot open filehandle to write to file $currentfile"); + + while(defined ($currentline = $self->_readline()) ) { + if ($currentline =~ /^Results from round\s+(\d+)/) { + if ($oldround) { close (FILEHANDLE) ;} + $round = $1; + $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'}, + "iteration$round.tmp"); + + close FILEHANDLE; + open (FILEHANDLE, ">$currentfile") || + $self->throw("cannot open filehandle to write to file $currentfile"); + $oldround = $round; + }elsif ($currentline =~ /CONVERGED/){ # This is a fix for psiblast parsing with -m 6 /AE + $round--; + } + print FILEHANDLE $currentline ; + + } + $self->{'TOTAL_ITERATION_NUMBER'}= $round; +# It is necessary to close filehandle otherwise the whole +# file will not be read later !! + close FILEHANDLE; +} + +1; + +__END__ diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Blast.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Blast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,4104 @@ +#---------------------------------------------------------------------------- +# PACKAGE : Bio::Tools::Blast +# PURPOSE : To encapsulate code for running, parsing, and analyzing +# BLAST reports. +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : March 1996 +# REVISION: $Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $ +# STATUS : Alpha +# +# For the latest version and documentation, visit: +# http://bio.perl.org/Projects/Blast +# +# To generate documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#---------------------------------------------------------------------------- + +package Bio::Tools::Blast; +use strict; +use Exporter; + +use Bio::Tools::SeqAnal; +use Bio::Root::Global qw(:std); +use Bio::Root::Utilities qw(:obj); + +require 5.002; + +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS + $ID $VERSION $Blast @Blast_programs $Revision $Newline); + +@ISA = qw( Bio::Tools::SeqAnal Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw($VERSION $Blast); +%EXPORT_TAGS = ( obj => [qw($Blast)], + std => [qw($Blast)]); + +$ID = 'Bio::Tools::Blast'; +$VERSION = 0.09; +$Revision = '$Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $'; #' + +## Static Blast object. +$Blast = {}; +bless $Blast, $ID; +$Blast->{'_name'} = "Static Blast object"; + +@Blast_programs = qw(blastp blastn blastx tblastn tblastx); + +use vars qw($DEFAULT_MATRIX $DEFAULT_SIGNIF); +my $DEFAULT_MATRIX = 'BLOSUM62'; +my $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied. +my $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs. + +## POD Documentation: + +=head1 NAME + +Bio::Tools::Blast - Bioperl BLAST sequence analysis object + +=head1 SYNOPSIS + +=head2 Parsing Blast reports + +Parse an existing Blast report from file: + + use Bio::Tools::Blast; + + $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out', + -parse => 1, + -signif => '1e-10', + ); + +Parse an existing Blast report from STDIN: + + $blastObj = Bio::Tools::Blast->new( -parse => 1, + -signif => '1e-10', + ); + +Then send a Blast report to your script via STDIN. + +Full parameters for parsing Blast reports. + + %blastParam = ( + -run => \%runParam, + -file => '', + -parse => 1, + -signif => 1e-5, + -filt_func => \&my_filter, + -min_len => 15, + -check_all_hits => 0, + -strict => 0, + -stats => 1, + -best => 0, + -share => 0, + -exec_func => \&process_blast, + -save_array => \@blast_objs, # not used if -exce_func defined. + ); + +See L<parse()|parse> for a description of parameters and see L<USAGE | USAGE> for +more examples including how to parse streams containing multiple Blast +reports L<Using the Static $Blast Object>. + +See L<Memory Usage Issues> for information about how to make Blast +parsing be more memory efficient. + +=head2 Running Blast reports + +Run a new Blast2 at NCBI and then parse it: + + %runParam = ( + -method => 'remote', + -prog => 'blastp', + -database => 'swissprot', + -seqs => [ $seq ], # Bio::Seq.pm objects. + ); + + $blastObj = Bio::Tools::Blast->new( -run => \%runParam, + -parse => 1, + -signif => '1e-10', + -strict => 1, + ); + +Full parameters for running Blasts at NCBI using Webblast.pm: + + %runParam = ( + -method => 'remote', + -prog => 'blastp', + -version => 2, # BLAST2 + -database =>'swissprot', + -html => 0, + -seqs => [ $seqObject ], # Bio::Seq.pm object(s) + -descr => 250, + -align => 250, + -expect => 10, + -gap => 'on', + -matrix => 'PAM250', + -email => undef, # don't send report via e-mail if parsing. + -filter => undef, # use default + -gap_c => undef, # use default + -gap_e => undef, # use default + -word => undef, # use default + -min_len => undef, # use default + ); + +See L<run()|run> and L<USAGE | USAGE> for more information about running Blasts. + +=head2 HTML-formatting Blast reports + +Print an HTML-formatted version of a Blast report: + + use Bio::Tools::Blast qw(:obj); + + $Blast->to_html($filename); + $Blast->to_html(-file => $filename, + -header => "<H1>Blast Results</H1>"); + $Blast->to_html(-file => $filename, + -out => \@array); # store output + $Blast->to_html(); # use STDIN + +Results are sent directly to STDOUT unless an C<-out =E<gt> array_ref> +parameter is supplied. See L<to_html()|to_html> for details. + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +The Bio::Tools::Blast.pm module encapsulates data and methods for +running, parsing, and analyzing pre-existing BLAST reports. This +module defines an application programming interface (API) for working +with Blast reports. A Blast object is constructed from raw Blast +output and encapsulates the Blast results which can then be accessed +via the interface defined by the Blast object. + +The ways in which researchers use Blast data are many and varied. This +module attempts to be general and flexible enough to accommodate +different uses. The Blast module API is still at an early stage of +evolution and I expect it to continue to evolve as new uses for Blast +data are developed. Your L<FEEDBACK | FEEDBACK> is welcome. + +B<FEATURES:> + +=over 2 + +=item * Supports NCBI Blast1.x, Blast2.x, and WashU-Blast2.x, gapped +and ungapped. + +Can parse HTML-formatted as well as non-HTML-formatted reports. + +=item * Launch new Blast analyses remotely or locally. + +Blast objects can be constructed directly from the results of the +run. See L<run()|run>. + +=item * Construct Blast objects from pre-existing files or from a new run. + +Build a Blast object from a single file or build multiple Blast +objects from an input stream containing multiple reports. See +L<parse()|parse>. + +=item * Add hypertext links from a BLAST report. + +See L<to_html()|to_html>. + +=item * Generate sequence and sequence alignment objects from HSP +sequences. + +If you have Bio::Seq.pm and Bio::UnivAln.pm installed on your system, +they can be used for working with high-scoring segment pair (HSP) +sequences in the Blast alignment. (A new version of Bio::Seq.pm is +included in the distribution, see L<INSTALLATION | INSTALLATION>). For more +information about them, see: + + http://bio.perl.org/Projects/Sequence/ + http://bio.perl.org/Projects/SeqAlign/ + +=back + +A variety of different data can be extracted from the Blast report by +querying the Blast.pm object. Some basic examples are given in the +L<USAGE | USAGE> section. For some working scripts, see the links provided in +the L<the DEMO SCRIPTS section | DEMO> section. + +As a part of the incipient Bioperl framework, the Bio::Tools::Blast.pm +module inherits from B<Bio::Tools::SeqAnal.pm>, which provides some +generic functionality for biological sequence analysis. See the +documentation for that module for details +(L<Links to related modules>). + +=head2 The BLAST Program + +BLAST (Basic Local Alignment Search Tool) is a widely used algorithm +for performing rapid sequence similarity searches between a single DNA +or protein sequence and a large dataset of sequences. BLAST analyses +are typically performed by dedicated remote servers, such as the ones +at the NCBI. Individual groups may also run the program on local +machines. + +The Blast family includes 5 different programs: + + Query Seq Database + ------------ ---------- + blastp -- protein protein + blastn -- nucleotide nucleotide + blastx -- nucleotide* protein + tblastn -- protein nucleotide* + tblastx -- nucleotide* nucleotide* + + * = dynamically translated in all reading frames, both strands + +See L<References & Information about the BLAST program>. + +=head2 Versions Supported + +BLAST reports generated by different application front ends are similar +but not exactly the same. Blast reports are not intended to be exchange formats, +making parsing software susceptible to obsolescence. This module aims to +support BLAST reports generated by different implementations: + + Implementation Latest version tested + -------------- -------------------- + NCBI Blast1 1.4.11 [24-Nov-97] + NCBI Blast2 2.0.8 [Jan-5-1999] + WashU-BLAST2 2.0a19MP [05-Feb-1998] + GCG 1.4.8 [1-Feb-95] + +Support for both gapped and ungapped versions is included. Currently, there +is only rudimentary support for PSI-BLAST in that these reports can be parsed but +there is no special treatment of separate iteration rounds (they are all +merged together). + +=head2 References & Information about the BLAST program + +B<WEBSITES:> + + http://www.ncbi.nlm.nih.gov/BLAST/ - Homepage at NCBI + http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html - Help manual + http://blast.wustl.edu/ - WashU-Blast2 + +B<PUBLICATIONS:> (with PubMed links) + + Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. (1990). + "Basic local alignment search tool", J Mol Biol 215: 403-410. + +http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r + + Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, + Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997). + "Gapped BLAST and PSI-BLAST: a new generation of protein database + search programs", Nucleic Acids Res. 25:3389-3402. + +http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r + + Karlin, Samuel and Stephen F. Altschul (1990). Methods for + assessing the statistical significance of molecular sequence + features by using general scoring schemes. Proc. Natl. Acad. + Sci. USA 87:2264-68. + +http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2315319&form=6&db=m&Dopt=b + + Karlin, Samuel and Stephen F. Altschul (1993). Applications + and statistics for multiple high-scoring segments in molecu- + lar sequences. Proc. Natl. Acad. Sci. USA 90:5873-7. + +http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8390686&form=6&db=m&Dopt=b + +=head1 USAGE + +=head2 Creating Blast objects + +A Blast object can be constructed from the contents of a Blast report +using a set of named parameters that specify significance criteria for +parsing. The report data can be read in from an existing file +specified with the C<-file =E<gt> 'filename'> parameter or from a +STDIN stream containing potentially multiple Blast reports. If the +C<-file> parameter does not contain a valid filename, STDIN will be +used. Separate Blast objects will be created for each report in the +stream. + +To parse the report, you must include a C<-parse =E<gt> 1> parameter +in addition to any other parsing parameters +See L<parse()|parse> for a full description of parsing parameters. +To run a new report and then parse it, include a C<-run =E<gt> \%runParams> +parameter containing a reference to a hash +that hold the parameters required by the L<run()|run> method. + +The constructor for Blast objects is inherited from Bio::Tools::SeqAnal.pm. +See the B<_initialize>() method of that package for general information +relevant to creating Blast objects. (The B<new>() method, inherited from +B<Bio::Root::Object.pm>, calls B<_initialize>(). See L<Links to related modules>). + +The Blast object can read compressed (gzipped) Blast report +files. Compression/decompression uses the gzip or compress programs +that are standard on Unix systems and should not require special +configuration. If you can't or don't want to use gzip as the file +compression tool, either pre-uncompress your files before parsing with +this module or modify B<Bio::Root::Utilities.pm> to your liking. + +Blast objects can be generated either by direct instantiation as in: + + use Bio::Tools::Blast; + $blast = new Bio::Tools::Blast (%parameters); + +=head2 Using the Static $Blast Object + + use Bio::Tools::Blast qw(:obj); + +This exports the static $Blast object into your namespace. "Static" +refers to the fact that it has class scope and there is one of these +created when you use this module. The static $Blast object is +basically an empty object that is provided for convenience and is also +used for various internal chores. + +It is exported by this module and can be used for +parsing and running reports as well as HTML-formatting without having +to first create an empty Blast object. + +Using the static $Blast object for parsing a STDIN stream of Blast reports: + + use Bio::Tools::Blast qw(:obj); + + sub process_blast { + my $blastObj = shift; + print $blastObj->table(); + $blastObj->destroy; + } + + $Blast->parse( -parse => 1, + -signif => '1e-10', + -exec_func => \&process_blast, + ); + +Then pipe a stream of Blast reports into your script via STDIN. For +each Blast report extracted from the input stream, the parser will +generate a new Blast object and pass it to the function specified by +C<-exec_func>. The destroy() call tells Perl to free the memory +associated with the object, important if you are crunching through +many reports. This method is inherited from B<Bio::Root::Object.pm> +(see L<Links to related modules>). See L<parse()|parse> for a full +description of parameters and L<the DEMO SCRIPTS section | DEMO> for +additional examples. + +=head2 Running Blasts + +To run a Blast, create a new Blast object with a C<-run =E<gt> +\%runParams> parameter. Remote Blasts are performed by including a +C<-method =E<gt> 'remote'> parameter; local Blasts are performed by +including a C<-method =E<gt> 'local'> parameter. See +L<Running Blast reports> as well as the +L<the DEMO SCRIPTS section | DEMO> for examples. +Note that running local Blasts is not yet supported, see below. + +Note that the C<-seqs =E<gt> [ $seqs ]> run parameter must contain a +reference to an array of B<Bio::Seq.pm> objects +(L<Links to related modules>). Encapsulating the sequence in an +object makes sequence information much easier to handle as it can +be supplied in a variety of formats. Bio::Seq.pm is included with +this distribution (L<INSTALLATION | INSTALLATION>). + +Remote Blasts are implemented using the +B<Bio::Tools::Blast::Run::Webblast.pm> module. Local Blasts require +that you customize the B<Bio::Tools::Blast::Run::LocalBlast.pm> +module. The version of LocalBlast.pm included with this distribution +provides the basic framework for running local Blasts. +See L<Links to related modules>. + +=head2 Significance screening + +A C<-signif> parameter can be used to screen out all hits with +P-values (or Expect values) above a certain cutoff. For example, to +exclude all hits with Expect values above 1.0e-10: C<-signif =E<gt> +1e-10>. Providing a C<-signif> cutoff can speed up processing +tremendously, since only a small fraction of the report need be +parsed. This is because the C<-signif> value is used to screen hits +based on the data in the "Description" section of the Blast report: + +For NCBI BLAST2 reports: + + Score E + Sequences producing significant alignments: (bits) Value + + sp|P31376|YAB1_YEAST HYPOTHETICAL 74.1 KD PROTEIN IN CYS3-MDM10... 957 0.0 + +For BLAST1 or WashU-BLAST2 reports: + + Smallest + Sum + High Probability + Sequences producing High-scoring Segment Pairs: Score P(N) N + + PDB:3PRK_E Proteinase K complexed with inhibitor ........... 504 1.8e-50 1 + +Thus, the C<-signif> parameter will screen based on Expect values for +BLAST2 reports and based on P-values for BLAST1/WashU-BLAST2 reports. + +To screen based on other criteria, you can supply a C<-filt_func> +parameter containing a function reference that takes a +B<Bio::Tools::Sbjct.pm> object as an argument and returns a boolean, +true if the hit is to be screened out. See example below for +L<Screening hits using arbitrary criteria>. + +=head2 Get the best hit. + + $hit = $blastObj->hit; + +A "hit" is contained by a B<Bio::Tools::Blast::Sbjct.pm> object. + +=head2 Get the P-value or Expect value of the most significant hit. + + $p = $blastObj->lowest_p; + $e = $blastObj->lowest_expect; + +Alternatively: + + $p = $blastObj->hit->p; + $e = $blastObj->hit->expect; + +Note that P-values are not reported in NCBI Blast2 reports. + +=head2 Iterate through all the hits + + foreach $hit ($blastObj->hits) { + printf "%s\t %.1e\t %d\t %.2f\t %d\n", + $hit->name, $hit->expect, $hit->num_hsps, + $hit->frac_identical, $hit->gaps; + } + +Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm> +for other ways to work with hit objects (L<Links to related modules>). + +=head2 Screening hits using arbitrary criteria + + sub filter { $hit=shift; + return ($hit->gaps == 0 and + $hit->frac_conserved > 0.5); } + + $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out', + -parse => 1, + -filt_func => \&filter ); + +While the Blast object is parsing the report, each hit checked by calling +&filter($hit). All hits that generate false return values from &filter +are screened out and will not be added to the Blast object. +Note that the Blast object will normally stop parsing the report after +the first non-significant hit or the first hit that does not pass the +filter function. To force the Blast object to check all hits, +include a C<-check_all_hits =E<gt> 1> parameter. +Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm> +for other ways to work with hit objects. + +=over 4 + +=item Hit start, end coordinates. + + print $sbjct->start('query'); + print $sbjct->end('sbjct'); + +In array context, you can get information for both query and sbjct with one call: + + ($qstart, $sstart) = $sbjct->start(); + ($qend, $send) = $sbjct->end(); + +For important information regarding coordinate information, see +the L<HSP start, end, and strand> section below. +Also check out documentation for the start and end methods in B<Bio::Tools::Blast::Sbjct.pm>, +which explains what happens if there is more than one HSP. + +=back + +=head2 Working with HSPs + +=over 4 + +=item Iterate through all the HSPs of every hit + + foreach $hit ($blastObj->hits) { + foreach $hsp ($hit->hsps) { + printf "%.1e\t %d\t %.1f\t %.2f\t %.2f\t %d\t %d\n", + $hsp->expect, $hsp->score, $hsp->bits, + $hsp->frac_identical, $hsp->frac_conserved, + $hsp->gaps('query'), $hsp->gaps('sbjct'); + } + +Refer to the documentation for B<Bio::Tools::Blast::HSP.pm> +for other ways to work with hit objects (L<Links to related modules>). + +=back + +=over 4 + +=item Extract HSP sequence data as strings or sequence objects + +Get the first HSP of the first hit and the sequences +of the query and sbjct as strings. + + $hsp = $blast_obj->hit->hsp; + $query_seq = $hsp->seq_str('query'); + $hsp_seq = $hsp->seq_str('sbjct'); + +Get the indices of identical and conserved positions in the HSP query seq. + + @query_iden_indices = $hsp->seq_inds('query', 'identical'); + @query_cons_indices = $hsp->seq_inds('query', 'conserved'); + +Similarly for the sbjct sequence. + + @sbjct_iden_indices = $hsp->seq_inds('sbjct', 'identical'); + @sbjct_cons_indices = $hsp->seq_inds('sbjct', 'conserved'); + + print "Query in Fasta format:\n", $hsp->seq('query')->layout('fasta'); + print "Sbjct in Fasta format:\n", $hsp->seq('sbjct')->layout('fasta'); + +See the B<Bio::Seq.pm> package for more information about using these sequence objects +(L<Links to related modules>). + +=back + +=over 4 + +=item Create sequence alignment objects using HSP sequences + + $aln = $hsp->get_aln; + print " consensus:\n", $aln->consensus(); + print $hsp->get_aln->layout('fasta'); + + $ENV{READSEQ_DIR} = '/home/users/sac/bin/solaris'; + $ENV{READSEQ} = 'readseq'; + print $hsp->get_aln->layout('msf'); + +MSF formated layout requires Don Gilbert's ReadSeq program (not included). +See the B<Bio::UnivAln.pm> for more information about using these alignment objects +(L<Links to related modules>)'. + +=back + +=over 4 + +=item HSP start, end, and strand + +To facilitate HSP processing, endpoint data for each HSP sequence are +normalized so that B<start is always less than end>. This affects TBLASTN +and TBLASTX HSPs on the reverse complement or "Minus" strand. + +Some examples of obtaining start, end coordinates for HSP objects: + + print $hsp->start('query'); + print $hsp->end('sbjct'); + ($qstart, $sstart) = $hsp->start(); + ($qend, $send) = $hsp->end(); + +Strandedness of the HSP can be assessed using the strand() method +on the HSP object: + + print $hsp->strand('query'); + print $hsp->strand('sbjct'); + +These will return 'Minus' or 'Plus'. +Or, to get strand information for both query and sbjct with a single call: + + ($qstrand, $sstrand) = $hsp->strand(); + +=back + +=head2 Report Generation + +=over 4 + +=item Generate a tab-delimited table of all results. + + print $blastObj->table; + print $blastObj->table(0); # don't include hit descriptions. + print $blastObj->table_tiled; + +The L<table()|table> method returns data for each B<HSP> of each hit listed one per +line. The L<table_tiled()|table_tiled> method returns data for each B<hit, i.e., Sbjct> +listed one per line; data from multiple HSPs are combined after tiling to +reduce overlaps. See B<Bio::Tools::Blast::Sbjct.pm> for more information about +HSP tiling. These methods generate stereotypical, tab-delimited data for each +hit of the Blast report. The output is suitable for importation into +spreadsheets or database tables. Feel free to roll your own table function if +you need a custom table. + +For either table method, descriptions of each hit can be included if a +single, true argument is supplied (e.g., $blastObj-E<gt>table(1)). The description +will be added as the last field. This will significantly increase the size of +the table. Labels for the table columns can be obtained with L<table_labels()|table_labels> +and L<table_labels_tiled()|table_labels_tiled>. + +=back + +=over 4 + +=item Print a summary of the Blast report + + $blastObj->display(); + $blastObj->display(-show=>'hits'); + +L<display()|display> prints various statistics extracted from the Blast report +such as database name, database size, matrix used, etc. The +C<display(-show=E<gt>'hits')> call prints a non-tab-delimited table +attempting to line the data up into more readable columns. The output +generated is similar to L<table_tiled()|table_tiled>. + +=back + +=over 4 + +=item HTML-format an existing report + + use Bio::Tools::Blast qw(:obj); + + # Going straight from a non HTML report file to HTML output using + # the static $Blast object exported by Bio::Tools::Blast.pm + $Blast->to_html(-file => '/usr/people/me/blast.output.txt', + -header => qq|<H1>BLASTP Results</H1><A HREF="home.html">Home</A>| + ); + + # You can also use a specific Blast object created previously. + $blastObj->to_html; + +L<to_html()|to_html> will send HTML output, line-by-line, directly to STDOUT +unless an C<-out =E<gt> array_ref> parameter is supplied (e.g., C<-out +=E<gt> \@array>), in which case the HTML will be stored in @array, one +line per array element. The direct outputting permits faster response +time since Blast reports can be huge. The -header tag can contain a +string containing any HTML that you want to appear at the top of the +Blast report. + +=back + +=head1 DEMO SCRIPTS + +Sample Scripts are included in the central bioperl distribution in the +'examples/blast/' directory (see L<INSTALLATION | INSTALLATION>): + +=head2 Handy library for working with Bio::Tools::Blast.pm + + examples/blast/blast_config.pl + +=head2 Parsing Blast reports one at a time. + + examples/blast/parse_blast.pl + examples/blast/parse_blast2.pl + examples/blast/parse_positions.pl + +=head2 Parsing sets of Blast reports. + + examples/blast/parse_blast.pl + examples/blast/parse_multi.pl + + B<Warning:> See note about L<Memory Usage Issues>. + +=head2 Running Blast analyses one at a time. + + examples/blast/run_blast_remote.pl + +=head2 Running Blast analyses given a set of sequences. + + examples/blast/blast_seq.pl + +=head2 HTML-formatting Blast reports. + + examples/blast/html.pl + +=head1 TECHNICAL DETAILS + +=head2 Blast Modes + +A BLAST object may be created using one of three different modes as +defined by the B<Bio::Tools::SeqAnal.pm> package +(See L<Links to related modules>): + + -- parse - Load a BLAST report and parse it, storing parsed data in + Blast.pm object. + -- run - Run the BLAST program to generate a new report. + -- read - Load a BLAST report into the Blast object without parsing. + +B<Run mode support has recently been added>. The module +B<Bio::Tools::Blast::Run::Webblast.pm> is an modularized adaptation of +the webblast script by Alex Dong Li: + + http://www.genet.sickkids.on.ca/bioinfo_resources/software.html#webblast + +for running remote Blast analyses and saving the results locally. Run +mode can be combined with a parse mode to generate a Blast report and +then build the Blast object from the parsed results of this report +(see L<run()|run> and L<SYNOPSIS | SYNOPSIS>). + +In read mode, the BLAST report is read in by the Blast object but is +not parsed. This could be used to internalize a Blast report but not +parse it for results (e.g., generating HTML formatted output). + +=head2 Significant Hits + +This module permits the screening of hits on the basis of +user-specified criteria for significance. Currently, Blast reports can +be screened based on: + + CRITERIA PARAMETER VALUE + ---------------------------------- --------- ---------------- + 1) the best Expect (or P) value -signif float or sci-notation + 2) the length of the query sequence -min_length integer + 3) arbitrary criteria -filt_func function reference + +The parameters are used for construction of the BLAST object or when +running the L<parse()|parse> method on the static $Blast object. The +-SIGNIF value represents the number listed in the description section +at the top of the Blast report. For Blast2, this is an Expect value, +for Blast1 and WashU-Blast2, this is a P-value. The idea behind the +C<-filt_func> parameter is that the hit has to pass through a filter +to be considered significant. Refer to the documentation for +B<Bio::Tools::Blast::Sbjct.pm> for ways to work with hit objects. + +Using a C<-signif> parameter allows for the following: + +=over 2 + +=item Faster parsing. + +Each hit can be screened by examination of the description line alone +without fully parsing the HSP alignment section. + +=item Flexibility. + +The C<-signif> tag provides a more semantic-free way to specify the +value to be used as a basis for screening hits. Thus, C<-signif> can +be used for screening Blast1 or Blast2 reports. It is up to the user +to understand whether C<-signif> represents a P-value or an Expect +value. + +=back + +Any hit not meeting the significance criteria will not be added to the +"hit list" of the BLAST object. Also, a BLAST object without any hits +meeting the significance criteria will throw an exception during +object construction (a fatal event). + +=head2 Statistical Parameters + +There are numerous parameters which define the behavior of the BLAST +program and which are useful for interpreting the search +results. These parameters are extracted from the Blast report: + + filter -- for masking out low-complexity sequences or short repeats + matrix -- name of the substitution scoring matrix (e.g., BLOSUM62) + E -- Expect filter (screens out frequent scores) + S -- Cutoff score for segment pairs + W -- Word length + T -- Threshold score for word pairs + Lambda, -- Karlin-Altschul "sum" statistical parameters dependent on + K, H sequence composition. + G -- Gap creation penalty. + E -- Gap extension penalty. + +These parameters are not always needed. Extraction may be turned off +explicitly by including a C<-stats =E<gt> 0> parameter during object +construction. Support for all statistical parameters is not complete. + +For more about the meaning of parameters, check out the NCBI URLs given above. + +=head2 Module Organization + +The modules that comprise this Bioperl Blast distribution are location in the +Bio:: hierarchy as shown in the diagram below. + + Bio/ + | + +--------------------------+ + | | + Bio::Tools Bio::Root + | | + +----------------------+ Object.pm + | | | + SeqAnal.pm Blast.pm Blast/ + | + +---------+---------+------------+ + | | | | + Sbjct.pm HSP.pm HTML.pm Run/ + | + +------------+ + | | + Webblast.pm LocalBlast.pm + +Bio::Tools::Blast.pm is a concrete class that inherits from +B<Bio::Tools::SeqAnal.pm> and relies on other modules for parsing and +managing BLAST data. Worth mentioning about this hierarchy is the +lack of a "Parse.pm" module. Since parsing is considered central to +the purpose of the Bioperl Blast module (and Bioperl in general), it +seems somewhat unnatural to segregate out all parsing code. This +segregation could also lead to inefficiencies and harder to maintain +code. I consider this issue still open for debate. + +Bio::Tools::Blast.pm, B<Bio::Tools::Blast::Sbjct.pm>, and +B<Bio::Tools::Blast::HSP.pm> are mostly dedicated to parsing and all +can be used to instantiate objects. Blast.pm is the main "command and +control" module, inheriting some basic behaviors from SeqAnal.pm +(things that are not specific to Blast I<per se>). + +B<Bio::Tools::Blast::HTML.pm> contains functions dedicated to +generating HTML-formatted Blast reports and does not generate objects. + +=head2 Running Blasts: Details + +B<Bio::Tools::Blast::Run::Webblast.pm> contains a set of functions for +running Blast analyses at a remote server and also does not +instantiate objects. It uses a helper script called postclient.pl, +located in the Run directory. The proposed LocalBlast.pm module would +be used for running Blast reports on local machines and thus would be +customizable for different sites. It would operate in a parallel +fashion to Webblast.pm (i.e., being a collection of functions, taking +in sequence objects or files, returning result files). + +The Run modules are considered experimental. In particular, +Webblast.pm catures an HTML-formatted version of the Blast report from +the NCBI server and strips out the HTML in preparation for parsing. A +more direct approach would be to capture the Blast results directly +from the server using an interface to the NCBI toolkit. This approach +was recently proposed on the Bioperl mailing list: +http://www.uni-bielefeld.de/mailinglists/BCD/vsns-bcd-perl/9805/0000.html + +=head2 Memory Usage Issues + +Parsing large numbers of Blast reports (a few thousand or so) with +Bio::Tools::Blast.pm may lead to unacceptable memory usage situations. +This is somewhat dependent of the size and complexity of the reports. + +While this problem is under investigation, here are some workarounds +that fix the memory usage problem: + +=over 4 + +=item 1 Don't specify a -signif criterion when calling L<parse()|parse>. + +The C<-signif> value is used for imposing a upper limit to the expect- or +P-value for Blast hits to be parsed. For reasons that are still under +investigation, specifying a value for C<-signif> in the L<parse()|parse> +method prevents Blast objects from being fully +garbage collected. When using the B<parse_blast.pl> or B<parse_multi.pl> +scripts in C<examples/blast/> of the bioperl distribution), don't supply +a C<-signif> command-line parameter. + +=item 2 If you want to impose a -signif criterion, put it inside a +-filt_func. + +For the L<parse()|parse> method, a -signif =E<gt> 1e-5 parameter is equivalent +to using a filter function parameter of + + -filt_func => sub { my $hit = shift; return $hit->signif <= 1e-5; } + +Using the B<examples/blast/parse_multi.pl> script, you can supply a +command-line argument of + + -filt_func '$hit->signif <= 1e-5' + +For more information, see L<parse()|parse> and the section +L<Screening hits using arbitrary criteria>. + +=back + +=head1 TODO + +=over 4 + +=item * Develop a functional, prototype Bio::Tools::Blast::Run::LocalBlast.pm module. + +=item * Add support for PSI-BLAST and PHI-BLAST + +=item * Parse histogram of expectations and retrieve gif image in +Blast report (if present). + +=item * Further investigate memory leak that occurs when parsing Blast +streams whe supplying a -signif parameter to L<parse()|parse>. + +=item * Access Blast results directly from the NCBI server using a +Perl interface to the NCBI toolkit or XML formated Blast reports (when +available). + +=item * Further exploit Bio::UnivAln.pm and multiple-sequence +alignment programs using HSP sequence data. Some of this may best go +into a separate, dedicated module or script as opposed to burdening +Blast.pm, Sbjct.pm, and HSP.pm with additional functionality that is +not always required. + +=item * Add an example script for parsing Blast reports containing +HTML formatting. + +=back + +=head1 VERSION + +Bio::Tools::Blast.pm, 0.09 + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz, sac@bioperl.org + +See the L<FEEDBACK | FEEDBACK> section for where to send bug reports and comments. + +=head1 ACKNOWLEDGEMENTS + +This module was developed under the auspices of the Saccharomyces Genome +Database: + http://genome-www.stanford.edu/Saccharomyces + +Other contributors include: Alex Dong Li (webblast), Chris Dagdigian +(Seq.pm), Steve Brenner (Seq.pm), Georg Fuellen (Seq.pm, UnivAln.pm), +and untold others who have offered comments (noted in the +Bio/Tools/Blast/CHANGES file of the distribution). + +=head1 COPYRIGHT + +Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. This +module is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=head1 SEE ALSO + + Bio::Tools::SeqAnal.pm - Sequence analysis object base class. + Bio::Tools::Blast::Sbjct.pm - Blast hit object. + Bio::Tools::Blast::HSP.pm - Blast HSP object. + Bio::Tools::Blast::HTML.pm - Blast HTML-formating utility class. + Bio::Tools::Blast::Run::Webblast.pm - Utility module for running Blasts remotely. + Bio::Tools::Blast::Run::LocalBlast.pm - Utility module for running Blasts locally. + Bio::Seq.pm - Biosequence object + Bio::UnivAln.pm - Biosequence alignment object. + Bio::Root::Object.pm - Proposed base class for all Bioperl objects. + +=head2 Links to related modules + + Bio::Tools::SeqAnal.pm + http://bio.perl.org/Core/POD/Bio/Tools/SeqAnal.html + + Bio::Tools::Blast::Sbjct.pm + http://bio.perl.org/Core/POD/Bio/Tools/Blast/Sbjct.html + + Bio::Tools::Blast::HSP.pm + http://bio.perl.org/Core/POD/Bio/Tools/Blast/HSP.html + + Bio::Tools::Blast::HTML.pm + http://bio.perl.org/Core/POD/Bio/Tools/Blast/HTML.html + + Bio::Tools::Blast::Run::Webblast.pm + http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/Webblast.html + + Bio::Tools::Blast::Run::LocalBlast.pm + http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/LocalBlast.html + + Bio::Seq.pm + http://bio.perl.org/Core/POD/Seq.html + + Bio::UnivAln.pm + http://bio.perl.org/Projects/SeqAlign/ + Europe: http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/#univaln + + Bio::Root::Object.pm + http://bio.perl.org/Core/POD/Root/Object.html + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project + http://bio.perl.org/ - Bioperl Project Homepage + +L<References & Information about the BLAST program>. + +=head1 KNOWN BUGS + +There is a memory leak that occurs when parsing parsing streams +containing large numbers of Blast reports (a few thousand or so) and +specifying a -signif parameter to the L<parse()|parse> method. For a +workaround, see L<Memory Usage Issues>. + +Not sharing statistical parameters between different Blast objects +when parsing a multi-report stream has not been completely tested and +may be a little buggy. + +Documentation inconsistencies or inaccuracies may exist since this +module underwend a fair bit of re-working going from 0.75 to 0.80 +(corresponds to versions 0.04.4 to 0.05 of the bioperl distribution). + +=cut + +# +## +### +#### END of main POD documentation. +### +## +# + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private and +are intended for internal use by this module. They are B<not> +considered part of the public interface and are described here for +documentation purposes only. + +=cut + +############################################################################## +## CONSTRUCTOR ## +############################################################################## + + +sub new { + my ($class,@args) = @_; + $class->warn("Bio::Tools::BLAST is deprecated, use Bio::SearchIO system or Bio::Tools::BPlite"); +return $class->SUPER::new(@args); + } + +## The Blast.pm object relies on the the superclass constructor: +## Bio::Tools::SeqAnal::_initialize(). See that module for details. + +#------------- +sub destroy { +#------------- + my $self=shift; + $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}"; + if($self->{'_hits'}) { + foreach($self->hits) { + $_->destroy; + undef $_; + } + undef $self->{'_hits'}; + #$self->{'_hits'}->remove_all; ## When and if this member becomes a vector. + } + + $self->SUPER::destroy; +} + +##################################################################################### +## ACCESSORS ## +##################################################################################### + +=head2 run + + Usage : $object->run( %named_parameters ) + Purpose : Run a local or remote Blast analysis on one or more sequences. + Returns : String containing name of Blast output file if a single Blast + : is run. + : -- OR -- + : List of Blast objects if multiple Blasts are being run as a group. + Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). + : -METHOD => 'local' or 'remote' (default = remote), + : -PARSE => boolean, (true if the results are to be parsed after the run) + : -STRICT => boolean, the strict mode to use for the resulting Blast objects. + : ADDITIONAL PARAMETERS: + : See methods _run_remote() and _run_local() for required + : parameters necessary for running the blast report. + Throws : Exception if no Blast output file was obtained. + Comments : This method is called automatically during construction of a + : Blast.pm object when run parameters are sent to the constructor: + : $blastObj = new Bio::Tools::Blast (-RUN =>\%runParam, + : %parseParam ); + : + : The specific run methods (local or remote) called by run() + : must return a list containing the file name(s) with the Blast output. + : + : The run() method can perform single or multiple Blast runs + : (analogous to the way parse() works) depending on how many + : sequences are submitted. However, the running of multiple + : Blasts is probably better handled at the script level. See notes in + : the "TODO" section below. + : + : As for what to do with the Blast result file, that decision is + : left for the user who can direct the Blast object to delete, compress, + : or leave it alone. + : + : This method does not worry about load balancing, which + : is probably best handled at the server level. + : + TODO: : Support for running+parsing multiple Blast analyses with a + : single run() call is incomplete. One can generate multiple + : reports by placing more than one sequence object in the -seqs + : reference parameter. This saves some overhead in the code + : that executes the Blasts since all options are configured once. + : (This is analogous to parsing using the static $Blast object + : see parse() and _parse_stream()). + : + : The trouble is that Blast objects for all runs are constructed, + : parsed (if necessary), and then returned as a group + : This can require lots of memory when run+parsing many Blasts + : but should be fine if you just want to run a bunch Blasts. + : + : For now, when running+parsing Blasts, stick to running one + : Blast at a time, building the Blast object with the results + : of that report, and processing as necessary. + : + : Support for running PSI-Blast is not complete. + +See Also: L<_run_remote()|_run_remote>, L<_run_local()|_run_local>, L<parse()|parse> + +=cut + +#--------- +sub run { +#--------- + my ($self, %param) = @_; + my($method, $parse, $strict) = + $self->_rearrange([qw(METHOD PARSE STRICT)], %param); + + $strict = $self->strict($strict) if $strict; + + my (@files); + if($method =~ /loc/i) { + @files = $self->_run_local(%param); + + } else { + @files = $self->_run_remote(%param); + } + + $self->throw("Run Blast failed: no Blast output created.") if !@files; + + if(scalar(@files) == 1) { + # If there was just one Blast output file, prepare to incorporate it + # into the current Blast object. run() is called before parse() in the + # SeqAnal.pm constructor. + if($files[0] ne 'email') { + $self->file($files[0]); + } else { + # Can't do anything with the report. + $self->throw("Blast report to be sent via e-mail."); + } + + } else { + # If there are multiple report files, build individual Blast objects foreach. + # In this situation, the static $Blast object is being used to run + # a set of related Blasts, similar to the way parse() can be used. + # This strategy is not optimal since all reports are generated first + # before any are parsed. + # Untested. + + my(@objs); + foreach(@files) { + push @objs, new Bio::Tools::Blast(-FILE => $_, + -PARSE => $parse || 0, + -STRICT => $strict, + ); + } + return @objs; + } +} + +=head2 _run_remote + + Usage : n/a; internal method called by run() + : $object->_run_remote( %named_parameters ) + Purpose : Run Blast on a remote server. + Argument : Named parameters: + : See documentation for function &blast_remote in + : Bio::Tools::Blast::Run::Webblast.pm for description + : of parameters. + Comments : This method requires the Bio::Tools::Blast::Run::Webblast.pm + : which conforms to this minimal API: + : * export a method called &blast_remote that accepts a + : Bio::Tools::Blast.pm object + named parameters + : (specified in the Webblast.pm module). + : * return a list of names of files containing the raw Blast reports. + : (When building a Blast object, this list would contain a + : single file from which the Blast object is to be constructed). + +See Also : L<run()|run>, L<_run_local()|_run_local>, B<Bio::Tools::Blast::Run::Webblast.pm::blast_remote()>, L<Links to related modules> + +=cut + +#---------------- +sub _run_remote { +#---------------- + my ($self, %param) = @_; + + require Bio::Tools::Blast::Run::Webblast; + Bio::Tools::Blast::Run::Webblast->import(qw(&blast_remote)); + + &blast_remote($self, %param); +} + +=head2 _run_local + + Usage : n/a; internal method called by run() + : $object->_run_local(%named_parameters) + Purpose : Run Blast on a local machine. + Argument : Named parameters: + : See documentation for function &blast_local in + : Bio::Tools::Blast::Run::LocalBlast.pm for description + : of parameters. + Comments : This method requires the Bio::Tools::Blast::Run::LocalBlast.pm + : module which should be customized for your site. This module would + : contain all the commands, paths, environment variables, and other + : data necessary to run Blast commands on a local machine, but should + : not contain any semantics for specific query sequences. + : + : LocalBlast.pm should also conform to this minimal API: + : * export a method called &blast_local that accepts a + : Bio::Tools::Blast.pm object + named parameters + : (specified in the LocalBlast.pm module). + : * return a list of names of files containing the raw Blast reports. + : (When building a Blast object, this list would contain a + : single file from which the Blast object is to be constructed). + +See Also : L<run()|run>, L<_run_remote()|_run_remote>, B<Bio::Tools::Blast::Run::LocalBlast::blast_local()>, L<Links to related modules> + +=cut + +#-------------- +sub _run_local { +#-------------- + my ($self, %param) = @_; + + require Bio::Tools::Blast::Run::Webblast; + Bio::Tools::Blast::Run::Webblast->import(qw(&blast_local)); + + &blast_local($self, %param); +} + +=head2 db_remote + + Usage : @dbs = $Blast->db_remote( [seq_type] ); + Purpose : Get a list of available sequence databases for remote Blast analysis. + Returns : Array of strings + Argument : seq_type = 'p' or 'n' + : 'p' = Gets databases for peptide searches (default) + : 'n' = Gets databases for nucleotide searches + Throws : n/a + Comments : Peptide databases are a subset of the nucleotide databases. + : It is convenient to call this method on the static $Blast object + : as shown in Usage. + +See Also : L<db_local()|db_local> + +=cut + +#---------------- +sub db_remote { +#---------------- + my ($self, $type) = @_; + $type ||= 'p'; + + require Bio::Tools::Blast::Run::Webblast; + Bio::Tools::Blast::Run::Webblast->import(qw(@Blast_dbp_remote + @Blast_dbn_remote)); + + # We shouldn't have to fully qualify the Blast_dbX_remote arrays. Hm. + + my(@dbs); + if( $type =~ /^p|amino/i) { + @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbp_remote; + } else { + @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbn_remote; + } + @dbs; +} + +=head2 db_local + + Usage : @dbs = $Blast->db_local( [seq_type] ); + Purpose : Get a list of available sequence databases for local Blast analysis. + Returns : Array of strings + Argument : seq_type = 'p' or 'n' + : 'p' = Gets databases for peptide searches (default) + : 'n' = Gets databases for nucleotide searches + Throws : n/a + Comments : Peptide databases are a subset of the nucleotide databases. + : It is convenient to call this method on the static $Blast object. + as shown in Usage. + +See Also : L<db_remote()|db_remote> + +=cut + +#---------------- +sub db_local { +#---------------- + my ($self, $type) = @_; + $type ||= 'p'; + + require Bio::Tools::Blast::Run::LocalBlast; + Bio::Tools::Blast::Run::LocalBlast->import(qw(@Blast_dbp_local + @Blast_dbn_local)); + + # We shouldn't have to fully qualify the Blast_dbX_local arrays. Hm. + + my(@dbs); + if( $type =~ /^p|amino/i) { + @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbp_local; + } else { + @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbn_local; + } + @dbs; +} + +=head2 parse + + Usage : $blast_object->parse( %named_parameters ) + Purpose : Parse a Blast report from a file or STDIN. + : * Parses a raw BLAST data, populating Blast object with report data. + : * Sets the significance cutoff. + : * Extracts statistical parameters about the BLAST run. + : * Handles both single files and streams containing multiple reports. + Returns : integer (number of Blast reports parsed) + Argument : <named parameters>: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). + : -FILE => string (name of file containing raw Blast output. + : Optional. If a valid file is not supplied, + : STDIN will be used). + : -SIGNIF => number (float or scientific notation number to be used + : as a P- or Expect value cutoff; + : default = $DEFAULT_SIGNIF (999)). + : -FILT_FUNC => func_ref (reference to a function to be used for + : filtering out hits based on arbitrary criteria. + : This function should take a + : Bio::Tools::Blast::Sbjct.pm object as its first + : argument and return a boolean value, + : true if the hit should be filtered out). + : Sample filter function: + : -FILT_FUNC => sub { $hit = shift; + : $hit->gaps == 0; }, + : -CHECK_ALL_HITS => boolean (check all hits for significance against + : significance criteria. Default = false. + : If false, stops processing hits after the first + : non-significant hit or the first hit that fails + : the filt_func call. This speeds parsing, + : taking advantage of the fact that the hits + : are processed in the order they are ranked.) + : -MIN_LEN => integer (to be used as a minimum query sequence length + : sequences below this length will not be processed). + : default = no minimum length). + : -STATS => boolean (collect stats for report: matrix, filters, etc. + : default = false). + : -BEST => boolean (only process the best hit of each report; + : default = false). + : -OVERLAP => integer (the amount of overlap to permit between + : adjacent HSPs when tiling HSPs, + : Default = $MAX_HSP_OVERLAP (2)) + : + : PARAMETERS USED WHEN PARSING MULTI-REPORT STREAMS: + : -------------------------------------------------- + : -SHARE => boolean (set this to true if all reports in stream + : share the same stats. Default = true) + : Must be set to false when parsing both Blast1 and + : Blast2 reports in the same run or if you need + : statistical params for each report, Lambda, K, H). + : -STRICT => boolean (use strict mode for all Blast objects created. + : Increases sensitivity to errors. For single + : Blasts, this is parameter is sent to new().) + : -EXEC_FUNC => func_ref (reference to a function for processing each + : Blast object after it is parsed. Should accept a + : Blast object as its sole argument. Return value + : is ignored. If an -EXEC_FUNC parameter is supplied, + : the -SAVE_ARRAY parameter will be ignored.) + : -SAVE_ARRAY =>array_ref, (reference to an array for storing all + : Blast objects as they are created. + : Experimental. Not recommended.) + : -SIGNIF_FMT => boolean String of 'exp' or 'parts'. Sets the format + : for reporting P/Expect values. 'exp' reports + : only the exponent portion. 'parts' reports + : them as a 2 element list. See signif_fmt().. + : + Throws : Exception if BLAST report contains a FATAL: error. + : Propagates any exception thrown by read(). + : Propagates any exception thrown by called parsing methods. + Comments : This method can be called either directly using the static $Blast object + : or indirectly (by Bio::Tools::SeqAnal.pm) during constuction of an + : individual Blast object. + : + : HTML-formatted reports can be parsed as well. No special flag is required + : since it is detected automatically. The presence of HTML-formatting + : will result in slower performace, however, since it must be removed + : prior to parsing. Parsing HTML-formatted reports is highly + : error prone and is generally not recommended. + : + : If one has an HTML report, do NOT remove the HTML from it by using the + : "Save As" option of a web browser to save it as text. This renders the + : report unparsable. + : HTML-formatted reports can be parsed after running through the strip_html + : function of Blast::HTML.pm as in: + : require Bio::Tools::Blast::HTML; + : Bio::Tools::Blast::HTML->import(&strip_html); + : &strip_html(\$data); + : # where data contains full contents of an HTML-formatted report. + : TODO: write a demo script that does this. + +See Also : L<_init_parse_params()|_init_parse_params>, L<_parse_blast_stream()|_parse_blast_stream>, L<overlap()|overlap>, L<signif_fmt()|signif_fmt>, B<Bio::Root::Object::read()>, B<Bio::Tools::Blast::HTML.pm::strip_html()>, L<Links to related modules> + +=cut + +#--------- +sub parse { +#--------- +# $self might be the static $Blast object. + my ($self, @param) = @_; + + my($signif, $filt_func, $min_len, $check_all, $overlap, $stats, + $share, $strict, $best, $signif_fmt, $no_aligns) = + $self->_rearrange([qw(SIGNIF FILT_FUNC MIN_LEN CHECK_ALL_HITS + OVERLAP STATS SHARE STRICT + BEST EXPONENT NO_ALIGNS )], @param); + + ## Initialize the static Blast object with parameters that + ## apply to all Blast objects within a parsing session. + + &_init_parse_params($share, $filt_func, $check_all, + $signif, $min_len, $strict, + $best, $signif_fmt, $stats, $no_aligns + ); + + my $count = $self->_parse_blast_stream(@param); + +# print STDERR "\nDONE PARSING STREAM.\n"; + + if($Blast->{'_blast_errs'}) { + my @errs = @{$Blast->{'_blast_errs'}}; + printf STDERR "\n*** %d BLAST REPORTS HAD FATAL ERRORS:\n", scalar(@errs); + foreach(@errs) { print STDERR "$_\n"; } + @{$Blast->{'_blast_errs'}} = (); + } + + return $count; +} + +=head2 _init_parse_params + + Title : _init_parse_params + Usage : n/a; called automatically by parse() + Purpose : Initializes parameters used during parsing of Blast reports. + : This is a static method used by the $Blast object. + : Calls _set_signif(). + Example : + Returns : n/a + Args : Args extracted by parse(). + +See Also: L<parse()|parse>, L<_set_signif()|_set_signif> + +=cut + +#---------------------- +sub _init_parse_params { +#---------------------- + my ($share, $filt_func, $check_all, + $signif, $min_len, $strict, + $best, $signif_fmt, $stats, $no_aligns) = @_; + + ## Default is to share stats. + $Blast->{'_share'} = defined($share) ? $share : 1; + $Blast->{'_filt_func'} = $filt_func || 0; + $Blast->{'_check_all'} = $check_all || 0; + $Blast->{'_signif_fmt'} ||= $signif_fmt || ''; + $Blast->{'_no_aligns'} = $no_aligns || 0; + + &_set_signif($signif, $min_len, $filt_func); + $Blast->strict($strict) if defined $strict; + $Blast->best($best) if $best; + $Blast->{'_blast_count'} = 0; + + ## If $stats is false, miscellaneous statistical and other parameters + ## are NOT extracted from the Blast report (e.g., matrix name, filter used, etc.). + ## This can speed processing when crunching tons of Blast reports. + ## Default is to NOT get stats. + $Blast->{'_get_stats'} = defined($stats) ? $stats : 0; + + # Clear any errors from previous parse. + undef $Blast->{'_blast_errs'}; +} + +=head2 _set_signif + + Usage : n/a; called automatically by _init_parse_params() + : This is now a "static" method used only by $Blast. + : _set_signif($signif, $min_len, $filt_func); + Purpose : Sets significance criteria for the BLAST object. + Argument : Obligatory three arguments: + : $signif = float or sci-notation number or undef + : $min_len = integer or undef + : $filt_func = function reference or undef + : + : If $signif is undefined, a default value is set + : (see $DEFAULT_SIGNIF; min_length = not set). + Throws : Exception if significance value is defined but appears + : out of range or invalid. + : Exception if $filt_func if defined and is not a func ref. + Comments : The significance of a BLAST report can be based on + : the P (or Expect) value and/or the length of the query sequence. + : P (or Expect) values GREATER than '_significance' are not significant. + : Query sequence lengths LESS than '_min_length' are not significant. + : + : Hits can also be screened using arbitrary significance criteria + : as discussed in the parse() method. + : + : If no $signif is defined, the '_significance' level is set to + : $Bio::Tools::Blast::DEFAULT_SIGNIF (999). + +See Also : L<signif()|signif>, L<min_length()|min_length>, L<_init_parse_params()|_init_parse_params>, L<parse()|parse> + +=cut + +#----------------- +sub _set_signif { +#----------------- + my( $sig, $len, $func ) = @_; + + if(defined $sig) { + $Blast->{'_confirm_significance'} = 1; + if( $sig =~ /[^\d.e-]/ or $sig <= 0) { + $Blast->throw("Invalid significance value: $sig", + "Must be greater than zero."); + } + $Blast->{'_significance'} = $sig; + } else { + $Blast->{'_significance'} = $DEFAULT_SIGNIF; + $Blast->{'_check_all'} = 1 if not $Blast->{'_filt_func'}; + } + + if(defined $len) { + if($len =~ /\D/ or $len <= 0) { + $Blast->warn("Invalid minimum length value: $len", + "Value must be an integer > 0. Value not set."); + } else { + $Blast->{'_min_length'} = $len; + } + } + + if(defined $func) { + $Blast->{'_confirm_significance'} = 1; + if($func and not ref $func eq 'CODE') { + $Blast->throw("Not a function reference: $func", + "The -filt_func parameter must be function reference."); + } + } + } + +=head2 _parse_blast_stream + + Usage : n/a. Internal method called by parse() + Purpose : Obtains the function to be used during parsing and calls read(). + Returns : Integer (the number of blast reports read) + Argument : Named parameters (forwarded from parse()) + Throws : Propagates any exception thrown by _get_parse_blast_func() and read(). + +See Also : L<_get_parse_blast_func()|_get_parse_blast_func>, B<Bio::Root::Object::read()> + +=cut + +#---------------------- +sub _parse_blast_stream { +#---------------------- + my ($self, %param) = @_; + + my $func = $self->_get_parse_blast_func(%param); +# my $func = sub { my $data = shift; +# printf STDERR "Chunk length = %d\n", length($data); +# sleep(3); +# }; + + # Only setting the newline character once per session. + $Newline ||= $Util->get_newline(-client => $self, %param); + + $self->read(-REC_SEP =>"$Newline>", + -FUNC => $func, + %param); + + return $Blast->{'_blast_count'}; +} + +=head2 _get_parse_blast_func + + Usage : n/a; internal method used by _parse_blast_stream() + : $func_ref = $blast_object->_get_parse_blast_func() + Purpose : Generates a function ref to be used as a closure for parsing + : raw data as it is being loaded by Bio::Root::IOManager::read(). + Returns : Function reference (closure). + Comments : The the function reference contains a fair bit of logic + : at present. It could perhaps be split up into separate + : functions to make it more 'digestible'. + +See Also : L<_parse_blast_stream()|_parse_blast_stream> + +=cut + +#-------------------------- +sub _get_parse_blast_func { +#-------------------------- + my ($self, @param) = @_; + + my ($save_a, $exec_func) = + $self->_rearrange([qw(SAVE_ARRAY EXEC_FUNC)], @param); + +# $MONITOR && print STDERR "\nParsing Blast stream (5/dot, 250/line)\n"; + my $count = 0; + my $strict = $self->strict(); + + # Some parameter validation. + # Remember, all Blast parsing will use this function now. + # You won't need a exec-func or save_array when just creating a Blast object + # as in: $blast = new Bio::Tools::Blast(); + if($exec_func and not ref($exec_func) eq 'CODE') { + $self->throw("The -EXEC_FUNC parameter must be function reference.", + "exec_func = $exec_func"); + + } elsif($save_a and not ref($save_a) eq 'ARRAY') { + $self->throw("The -SAVE_ARRAY parameter must supply an array reference". + "when not using an -EXEC_FUNC parameter."); + } + + ## Might consider breaking this closure up if possible. + + return sub { + my ($data) = @_; + ## $data should contain one of three possible fragment types + ## from a Blast report: + ## 1. Header with description section, + ## 2. An alignment section for a single hit, or + ## 3. The final alignment section plus the footer section. + ## (record separator = "Newline>"). + +# print STDERR "\n(BLAST) DATA CHUNK: $data\n"; + + my ($current_blast, $current_prog, $current_vers, $current_db); + my $prev_blast; + my $contains_translation = 0; + +### steve --- Wed Mar 15 02:48:07 2000 +### In the process of addressing bug PR#95. Tricky. +### Using the $contains_translation to do so. Not complete +### and possibly won't fix. We'll see. + + # Check for header section. Start a new Blast object and + # parse the description section. +# if ($data =~ /\sQuery\s?=/s || ($contains_translation && $data =~ /Database:/s)) { + if ($data =~ /\sQuery\s?=/s) { + $Blast->{'_blast_count'}++; + print STDERR ".", $Blast->{'_blast_count'} % 50 ? '' : "\n" if $MONITOR; + + if($data =~ /$Newline\s+Translating/so) { + print STDERR "\nCONTAINS TRANSLATION\n"; + $contains_translation = 1; + } + + # If we're parsing a stream containing multiple reports, + # all subsequent header sections will contain the last hit of + # the previous report which needs to be parsed and added to that + # report if signifcant. It also contains the run parameters + # at the bottom of the Blast report. +# if($Blast->{'_blast_count'} > 1 || $contains_translation) { + if($Blast->{'_blast_count'} > 1) { +# print STDERR "\nMULTI-BLAST STREAM.\n"; + $Blast->{'_multi_stream'} = 1; + + if($data =~ /(.+?)$Newline(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so) { + ($current_prog, $current_vers, $data) = ($3, $4, $5); + # Final chunk containing last hit and last footer. + $Blast->{'_current_blast'}->_parse_alignment($1); + $prev_blast = $Blast->{'_current_blast'}; # finalized. +# } elsif($contains_translation) { +# $data =~ /(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so; +# ($current_prog, $current_vers, $data) = ($1, $2, $3); + } else { + $Blast->throw("Can't determine program type from BLAST report.", + "Checked for: @Blast_programs."); + # This has important implications for how to handle interval + # information for HSPs. TBLASTN uses nucleotides in query HSP + # but amino acids in the sbjct HSP sequence. + } + + if($data =~ m/Database:\s+(.+?)$Newline/so ) { + $current_db = $1; + } else { + # In some reports, the Database is only listed at end. + #$Blast->warn("Can't determine database name from BLAST report."); + } + + # Incyte_Fix: Nasty Invisible Bug. + # Records in blast report are delimited by '>', but... when + # there are no hits for a query, there won't be a '>'. That + # causes several blast reports to run together in the data + # passed to this routine. Need to get rid of non-hits in data + if ($data =~ /.+(No hits? found.+Sequences.+)/so) { + $data = $1; + } + # End Incyte_Fix + + } + + # Determine if we need to create a new Blast object + # or use the $self object for this method. + + if($Blast->{'_multi_stream'} or $self->name eq 'Static Blast object') { + # Strict mode is not object-specific but may be someday. +# print STDERR "\nCreating new Blast object.\n"; + $current_blast = new Bio::Tools::Blast(-STRICT => $strict); + } else { + $current_blast = $self; + } + $Blast->{'_current_blast'} = $current_blast; + + # If we're not sharing stats, set data on current blast object. + if(defined $current_prog and not $Blast->{'_share'}) { + $current_blast->program($current_prog); + $current_blast->program_version($current_vers); + $current_blast->database($current_db); + } + +# print STDERR "CURRENT BLAST = ", $current_blast->name, "\n"; + $current_blast->_parse_header($data); + + # If there were any descriptions in the header, + # we know if there are any significant hits. + # No longer throwing exception if there were no significant hits + # and a -signif parameter was specified. Doing so prevents the + # construction of a Blast object, which could still be useful. +# if($current_blast->{'_has_descriptions'} and $Blast->{'_confirm_significance'} and not $current_blast->is_signif) { +# $current_blast->throw("No significant BLAST hits for ${\$current_blast->name}"); + +# } + + } # Done parsing header/description section + +### For use with $contains_translation - not right - breaks regular report parsing. +# elsif(ref $Blast->{'_current_blast'} && $data !~ /\s*\w*\s*/s) { + elsif(ref $Blast->{'_current_blast'} ) { + # Process an alignment section. + $current_blast = $Blast->{'_current_blast'}; +# print STDERR "\nCONTINUING PROCESSING ALN WITH ", $current_blast->name, "\n"; +# print STDERR "DATA: $data\n"; + eval { + $current_blast->_parse_alignment($data); + }; + if($@) { + # push @{$self->{'_blast_errs'}}, $@; + } + } + + # If the current Blast object has been completely parsed + # (occurs with a single Blast stream), or if there is a previous + # Blast object (occurs with a multi Blast stream), + # execute a supplied function on it or store it in a supplied array. + + if( defined $prev_blast or $current_blast->{'_found_params'}) { + my $finished_blast = defined($prev_blast) ? $prev_blast : $current_blast; + + $finished_blast->_report_errors(); +# print STDERR "\nNEW BLAST OBJECT: ${\$finished_blast->name}\n"; + + if($exec_func) { +# print STDERR " RUNNING EXEC_FUNC...\n"; + &$exec_func($finished_blast); # ignoring any return value. + # Report processed, no longer need object. + $finished_blast->destroy; + undef $finished_blast; + } elsif($save_a) { +# print STDERR " SAVING IN ARRAY...\n"; + # We've already verified that if there is no exec_func + # then there must be a $save_array + push @$save_a, $finished_blast; + } + } + 1; + } + } + +=head2 _report_errors + + Title : _report_errors + Usage : n/a; Internal method called by _get_parse_blast_func(). + Purpose : Throw or warn about any errors encountered. + Returns : n/a + Args : n/a + Throws : If all hits generated exceptions, raise exception + : (a fatal event for the Blast object.) + : If some hits were okay but some were bad, generate a warning + : (a few bad applies should not spoil the bunch). + : This usually indicates a limiting B-value. + : When the parsing code fails, it is either all or nothing. + +=cut + +#------------------- +sub _report_errors { +#------------------- + my $self = shift; + + return unless ref($self->{'_blast_errs'}); +# ref($self->{'_blast_errs'}) || (print STDERR "\nNO ERRORS\n", return ); + + my @errs = @{$self->{'_blast_errs'}}; + + if(scalar @errs) { + my ($str); + @{$self->{'_blast_errs'}} = (); # clear the errs on the object. + # When there are many errors, in most of the cases, they are + # caused by the same problem. Only need to see full data for + # the first one. + if(scalar @errs > 2) { + $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]"; + $self->clear_err(); # clearing the existing set of errors (conserve memory). + # Not necessary, unless the -RECORD_ERR =>1 + # constructor option was used for Blast object. + } else { + $str = join("\n",@errs); + } + + if(not $self->{'_num_hits_significant'}) { + $self->throw(sprintf("Failed to parse any hit data (n=%d).", scalar(@errs)), + "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" + ); + } else { + $self->warn(sprintf("Some potential hits were not parsed (n=%d).", scalar(@errs)), + @errs > 2 ? "This may be due to a limiting B value (max alignment listings)." : "", + "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" + ); + } + } +} + +=head2 _parse_header + + Usage : n/a; called automatically by the _get_parse_blast_func(). + Purpose : Parses the header section of a BLAST report. + Argument : String containing the header+description section of a BLAST report. + Throws : Exception if description data cannot be parsed properly. + : Exception if there is a 'FATAL' error in the Blast report. + : Warning if there is a 'WARNING' in the Blast report. + : Warning if there are no significant hits. + Comments : Description section contains a single line for each hit listing + : the seq id, description, score, Expect or P-value, etc. + +See Also : L<_get_parse_blast_func()|_get_parse_blast_func> + +=cut + +#---------------------- +sub _parse_header { +#---------------------- + my( $self, $data ) = @_; + +# print STDERR "\n$ID: PARSING HEADER\n"; #$data\n"; + + $data =~ s/^\s+|\s+>?$//sg; + + if($data =~ /<HTML/i) { + $self->throw("Can't parse HTML-formatted BLAST reports.", +# "Such reports can be parsed with a special parsing \n". +# "script included in the examples/blast directory \n". +# "of the Bioperl distribution. (TODO)" + ); + # This was the old strategy, can't do it with new strategy + # since we don't have the whole report in one chunk. + # This could be the basis for the "special parsing script". +# require Bio::Tools::Blast::HTML; +# Bio::Tools::Blast::HTML->import(&strip_html); +# &strip_html(\$data); + } + + $data =~ /WARNING: (.+?)$Newline$Newline/so and $self->warn("$1") if $self->strict; + $data =~ /FATAL: (.+?)$Newline$Newline/so and $self->throw("FATAL BLAST ERROR = $1"); + # No longer throwing exception when no hits were found. Still reporting it. + $data =~ /No hits? found/i and $self->warn("No hits were found.") if $self->strict; + + # If this is the first Blast, the program, version, and database info + # pertain to it. Otherwise, they are for the previous report and have + # already been parsed out. + # Data is stored in the static Blast object. Data for subsequent reports + # will be stored in separate objects if the -share parameter is not set. + # See _get_parse_blast_func(). + + if($Blast->{'_blast_count'} == 1) { + if($data =~ /(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline/so) { + $Blast->program($2); + $Blast->program_version($3); + } else { + $self->throw("Can't determine program type from BLAST report.", + "Checked for: @Blast_programs."); + # This has important implications for how to handle interval + # information for HSPs. TBLASTN uses nucleotides in query HSP + # but amino acids in the sbjct HSP sequence. + } + + if($data =~ m/Database:\s+(.+?)$Newline/so ) { + $Blast->database($1); + } else { + # In some reports, the Database is only listed at end. + #$self->warn("Can't determine database name from BLAST report (_parse_header)\n$data\n."); + } + } + + my ($header, $descriptions); + + ## For efficiency reasons, we want to to avoid using $' and $`. + ## Therefore using single-line mode pattern matching. + + if($data =~ /(.+?)\nSequences producing.+?\n(.+)/s ) { + ($header, $descriptions) = ($1, $2); + $self->{'_has_descriptions'} = 1; + } else { + $header = $data; + $self->{'_has_descriptions'} = 0; + # Blast reports can legally lack description section. No need to warn. + #push @{$self->{'_blast_errs'}}, "Can't parse description data."; + } + + $self->_set_query($header); # The name of the sequence will appear in error report. +# print STDERR "\nQUERY = ", $Blast->{'_current_blast'}->query, "\n"; + + $self->_set_date($header) if $Blast->{'_get_stats'}; + $self->_set_length($header); + +# not $Blast->{'_confirm_significance'} and print STDERR "\nNOT PARSING DESCRIPTIONS.\n"; + + # Setting the absolute max and min significance levels. + $self->{'_highestSignif'} = 0; + $self->{'_lowestSignif'} = $DEFAULT_SIGNIF; + + if ($Blast->{'_confirm_significance'} || $Blast->{'_no_aligns'}) { + $self->_parse_descriptions($descriptions) if $descriptions; + } else { + $self->{'_is_significant'} = 1; + } + } + +#----------------------- +sub _parse_descriptions { +#----------------------- + my ($self, $desc) = @_; + + # NOTE: This method will not be called if the report lacks + # a description section. + +# print STDERR "\nPARSING DESCRIPTION DATA\n"; + + my @descriptions = split( $Newline, $desc); + my($line); + + # NOW step through each line parsing out the P/Expect value + # All we really need to do is check the first one, if it doesn't + # meet the significance requirement, we can skip the report. + # BUT: we want to collect data for all hits anyway to get min/max signif. + + my $my_signif = $self->signif; + my $layout_set = $Blast->{'_layout'} || 0; + my $layout; + my $count = 0; + my $sig; + + desc_loop: + foreach $line (@descriptions) { + $count++; + last desc_loop if $line =~ / NONE |End of List/; + next desc_loop if $line =~ /^\s*$/; + next desc_loop if $line =~ /^\.\./; + + ## Checking the significance value (P- or Expect value) of the hit + ## in the description line. + + # These regexps need testing on a variety of reports. + if ( $line =~ /\d+\s{1,5}[\de.-]+\s*$/) { + $layout = 2; + } elsif( $line =~ /\d+\s{1,5}[\de.-]+\s{1,}\d+\s*$/) { + $layout = 1; + } else { + $self->warn("Can't parse significance data in description line $line"); + next desc_loop; + } + not $layout_set and ($self->_layout($layout), $layout_set = 1); + + $sig = &_parse_signif( $line, $layout ); + +# print STDERR " Parsed signif ($layout) = $sig\n"; + + last desc_loop if ($sig > $my_signif and not $Blast->{'_check_all'}); + $self->_process_significance($sig, $my_signif); + } + +# printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'}; +} + +sub _process_significance { + my($self, $sig, $my_signif) = @_; + + $self->{'_highestSignif'} = ($sig > $self->{'_highestSignif'}) + ? $sig : $self->{'_highestSignif'}; + + $self->{'_lowestSignif'} = ($sig < $self->{'_lowestSignif'}) + ? $sig : $self->{'_lowestSignif'}; + + # Significance value assessment. + $sig <= $my_signif and $self->{'_num_hits_significant'}++; + $self->{'_num_hits'}++; + + $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'}; +} + +=head2 _parse_alignment + + Usage : n/a; called automatically by the _get_parse_blast_func(). + Purpose : Parses a single alignment section of a BLAST report. + Argument : String containing the alignment section. + Throws : n/a; All errors are trapped while parsing the hit data + : and are processed as a group when the report is + : completely processed (See _report_errors()). + : + Comments : Alignment section contains all HSPs for a hit. + : Requires Bio::Tools::Blast::Sbjct.pm. + : Optionally calls a filter function to screen the hit on arbitrary + : criteria. If the filter function returns true for a given hit, + : that hit will be skipped. + : + : If the Blast object was created with -check_all_hits set to true, + : all hits will be checked for significance and processed if necessary. + : If this field is false, the parsing will stop after the first + : non-significant hit. + : See parse() for description of parsing parameters. + +See Also : L<parse()|parse>, L<_get_parse_blast_func()|_get_parse_blast_func>, L<_report_errors()|_report_errors>, B<Bio::Tools::Blast::Sbjct()>, L<Links to related modules> + +=cut + +#---------------------- +sub _parse_alignment { +#---------------------- +# This method always needs to check detect if the $data argument +# contains the footer of a Blast report, indicating the last chunk +# of a single Blast stream. + + my( $self, $data ) = @_; + +# printf STDERR "\nPARSING ALIGNMENT DATA for %s $self.\n", $self->name; + + # NOTE: $self->{'_current_hit'} is an instance variable + # The $Blast object will not have this member. + + # If all of the significant hits have been parsed, + # return if we're not checking all or if we don't need to get + # the Blast stats (parameters at footer of report). + if(defined $self->{'_current_hit'} and + defined $self->{'_num_hits_significant'}) { + return if $self->{'_current_hit'} >= $self->{'_num_hits_significant'} and + not ($Blast->{'_check_all'} or $Blast->{'_get_stats'}); + } + + # Check for the presence of the Blast footer section. + # _parse_footer returns the alignment section. + $data = $self->_parse_footer($data); + + # Return if we're only interested in the best hit. + # This has to occur after checking for the parameters section + # in the footer (since we may still be interested in them). + return if $Blast->best and ( defined $self->{'_current_hit'} and $self->{'_current_hit'} >=1); + +# print "RETURNED FROM _parse_footer (", $self->to_string, ")"; +# print "\n --> FOUND PARAMS.\n" if $self->{'_found_params'}; +# print "\n --> DID NOT FIND PARAMS.\n" unless $self->{'_found_params'}; + + require Bio::Tools::Blast::Sbjct; + + $data =~ s/^\s+|\s+>?$//sg; + $data =~ s/$Newline$Newline/$Newline/sog; # remove blank lines. + my @data = split($Newline, $data); + push @data, 'end'; + +# print STDERR "\nALIGNMENT DATA:\n$data\n"; + + my $prog = $self->program; + my $check_all = $Blast->{'_check_all'}; + my $filt_func = $Blast->{'_filt_func'} || 0; + my $signif_fmt = $Blast->{'_signif_fmt'}; + my $my_signif = $self->signif; + my $err; + + # Now construct the Sbjct objects from the alignment section + +# debug(1); + + $self->{'_current_hit'}++; + + # If not confirming significance, _parse_descriptions will not have been run, + # so we need to count the total number of hits here. + if( not $Blast->{'_confirm_significance'}) { + $self->{'_num_hits'}++; + } + + if($Blast->{'_no_aligns'}) { +# printf STDERR "\nNOT PARSING ALIGNMENT DATA\n"; + return; + } + + my $hit; # Must be my'ed within hit_loop. + eval { + $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@data, + -PARENT =>$self, + -NAME =>$self->{'_current_hit'}, + -RANK =>$self->{'_current_hit'}, + -RANK_BY =>'order', + -PROGRAM =>$prog, + -SIGNIF_FMT=>$signif_fmt, + -OVERLAP =>$Blast->{'_overlap'} || $MAX_HSP_OVERLAP, + ); +# printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>; + # The BLAST report may have not had a description section. + if(not $self->{'_has_descriptions'}) { + $self->_process_significance($hit->signif, $my_signif); + } + }; + + if($@) { + # Throwing lots of errors can slow down the code substantially. + # Error handling code is not that efficient. + #print STDERR "\nERROR _parse_alignment: $@\n"; + push @{$self->{'_blast_errs'}}, $@; + $hit->destroy if ref $hit; + undef $hit; + } else { + # Collect overall signif data if we don't already have it, + # (as occurs if no -signif parameter is supplied). + my $hit_signif = $hit->signif; + + if (not $Blast->{'_confirm_significance'} ) { + $self->{'_highestSignif'} = ($hit_signif > $self->{'_highestSignif'}) + ? $hit_signif : $self->{'_highestSignif'}; + + $self->{'_lowestSignif'} = ($hit_signif < $self->{'_lowestSignif'}) + ? $hit_signif : $self->{'_lowestSignif'}; + } + + # Test significance using custom function (if supplied) + if($filt_func) { + if(&$filt_func($hit)) { + push @{$self->{'_hits'}}, $hit; + } else { + $hit->destroy; undef $hit; + } + } elsif($hit_signif <= $my_signif) { + push @{$self->{'_hits'}}, $hit; + } + } + + } + +=head2 _parse_footer + + Usage : n/a; internal function. called by _parse_alignment() + Purpose : Extracts statistical and other parameters from the BLAST report. + : Sets various key elements such as the program and version, + : gapping, and the layout for the report (blast1 or blast2). + Argument : Data to be parsed. + Returns : String containing an alignment section for processing by + : _parse_alignment(). + Throws : Exception if cannot find the parameters section of report. + : Warning if cannot determine if gapping was used. + : Warning if cannot determine the scoring matrix used. + Comments : This method must always get called, even if the -STATS + : parse() parameter is false. The reason is that the layout + : of the report and the presence of gapping must always be set. + : The determination whether to set additional stats is made + : by methods called by _parse_footer(). + +See Also : L<parse()|parse>, L<_parse_alignment()|_parse_alignment>, L<_set_database()|_set_database> + +=cut + +#--------------------- +sub _parse_footer { +#--------------------- +# Basic strategy: +# 1. figure out if we're supposed to get the stats, +# 2. figure out if the stats are to be shared. some, not all can be shared +# (eg., db info and matrix can be shared, karlin altschul params cannot. +# However, this method assumes they are all sharable.) +# 3. Parse the stats. +# 4. return the block before the parameters section if the supplied data +# contains a footer parameters section. + + my ($self, $data) = @_; + my ($client, $last_align, $params); + +# printf STDERR "\nPARSING PARAMETERS for %s $self.\n", $self->name; + + # Should the parameters be shared? + # If so, set $self to be the static $Blast object and return if + # the parameters were already set. + # Before returning, we need to extract the last alignment section + # from the parameter section, if any. + + if ($Blast->{'_share'}) { + $client = $self; + $self = $Blast if $Blast->{'_share'}; + } + + my $get_stats = $Blast->{'_get_stats'}; + + if( $data =~ /(.+?)${Newline}CPU time: (.*)/so) { + # NCBI-Blast2 format (v2.04). + ($last_align, $params) = ($1, $2); + return $last_align if $client->{'_found_params'}; + $self->_set_blast2_stats($params); + + } elsif( $data =~ /(.+?)${Newline}Parameters:(.*)/so) { + # NCBI-Blast1 or WashU-Blast2 format. + ($last_align, $params) = ($1, $2); + return $last_align if $client->{'_found_params'}; + $self->_set_blast1_stats($params); + + } elsif( $data =~ /(.+?)$Newline\s+Database:(.*)/so) { + # Gotta watch out for confusion with the Database: line in the header + # which will be present in the last hit of an internal Blast report + # in a multi-report stream. + + # NCBI-Blast2 format (v2.05). + ($last_align, $params) = ($1, $2); + return $last_align if $client->{'_found_params'}; + $self->_set_blast2_stats($params); + + } elsif( $data =~ /(.+?)$Newline\s*Searching/so) { + # trying to detect a Searching at the end of a PSI-blast round. + # Gotta watch out for confusion with the Searching line in the header + # which will be present in the last hit of an internal Blast report + # in a multi-report, non-PSI-blast stream. + + # PSI-Blast format (v2.08). + ($last_align) = ($1); + return $last_align; # if $client->{'_found_params'}; + } + + # If parameter section was found, set a boolean, + # otherwise return original data. + + if (defined($params)) { + $client->{'_found_params'} = 1; + } else { + return $data; + } + + $self->_set_database($params) if $get_stats; + + # The {'_gapped'} member should be set in the _set_blast?_stats() call. + # This is a last minute attempt to deduce it. + + if(!defined($self->{'_gapped'})) { + if($self->program_version() =~ /^1/) { + $self->{'_gapped'} = 0; + } else { + if($self->strict > 0) { + $self->warn("Can't determine if gapping was used. Assuming gapped."); + } + $self->{'_gapped'} = 1; + } + } + + return $last_align; +} + +=head2 _set_blast2_stats + + Usage : n/a; internal function called by _parse_footer() + Purpose : Extracts statistical and other parameters from BLAST2 report footer. + : Stats collected: database release, gapping, + : posted date, matrix used, filter used, Karlin-Altschul parameters, + : E, S, T, X, W. + Throws : Exception if cannot get "Parameters" section of Blast report. + +See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> + +=cut + +#---------------------' +sub _set_blast2_stats { +#--------------------- + my ($self, $data) = (@_); + + if($data =~ /$Newline\s*Gapped/so) { + $self->{'_gapped'} = 1; + } else { + $self->{'_gapped'} = 0; + } + + # Other stats are not always essential. + return unless $Blast->{'_get_stats'}; + + # Blast2 Doesn't report what filter was used in the parameters section. + # It just gives a warning that *some* filter was used in the header. + # You just have to know the defaults (currently: protein = SEG, nucl = DUST). + if($data =~ /\bfiltered\b/si) { + $self->{'_filter'} = 'DEFAULT FILTER'; + } else { + $self->{'_filter'} = 'NONE'; + } + + if($data =~ /Gapped$Newline\s*Lambda +K +H$Newline +(.+?)$Newline/so) { + my ($l, $k, $h) = split(/\s+/, $1); + $self->{'_lambda'} = $l || 'UNKNOWN'; + $self->{'_k'} = $k || 'UNKNOWN'; + $self->{'_h'} = $h || 'UNKNOWN'; + } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) { + my ($l, $k, $h) = split(/\s+/, $1); + $self->{'_lambda'} = $l || 'UNKNOWN'; + $self->{'_k'} = $k || 'UNKNOWN'; + $self->{'_h'} = $h || 'UNKNOWN'; + } + + if($data =~ /$Newline\s*Matrix: (.+?)$Newline/so) { + $self->{'_matrix'} = $1; + } else { + $self->{'_matrix'} = $DEFAULT_MATRIX.'?'; + if($self->strict > 0) { + $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX."); + } + } + + if($data =~ /$Newline\s*Gap Penalties: Existence: +(\d+), +Extension: (\d+)$Newline/so) { + $self->{'_gapCreation'} = $1; + $self->{'_gapExtension'} = $2; + } + if($data =~ /sequences better than (\d+):/s) { + $self->{'_expect'} = $1; + } + + if($data =~ /$Newline\s*T: (\d+)/o) { $self->{'_word_size'} = $1; } + if($data =~ /$Newline\s*A: (\d+)/o) { $self->{'_a'} = $1; } + if($data =~ /$Newline\s*S1: (\d+)/o) { $self->{'_s'} = $1; } + if($data =~ /$Newline\s*S2: (\d+)/o) { $self->{'_s'} .= ", $1"; } + if($data =~ /$Newline\s*X1: (\d+)/o) { $self->{'_x1'} = $1; } + if($data =~ /$Newline\s*X2: (\d+)/o) { $self->{'_x2'} = $1; } +} + +=head2 _set_blast1_stats + + Usage : n/a; internal function called by _parse_footer() + Purpose : Extracts statistical and other parameters from BLAST 1.x style eports. + : Handles NCBI Blast1 and WashU-Blast2 formats. + : Stats collected: database release, gapping, + : posted date, matrix used, filter used, Karlin-Altschul parameters, + : E, S, T, X, W. + +See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> + +=cut + +#---------------------- +sub _set_blast1_stats { +#---------------------- + my ($self, $data) = (@_); + + if(!$self->{'_gapped'} and $self->program_version() =~ /^2[\w\-\.]+WashU/) { + $self->_set_gapping_wu($data); + } else { + $self->{'_gapped'} = 0; + } + + # Other stats are not always essential. + return unless $Blast->{'_get_stats'}; + + if($data =~ /filter=(.+?)$Newline/so) { + $self->{'_filter'} = $1; + } elsif($data =~ /filter$Newline +(.+?)$Newline/so) { + $self->{'_filter'} = $1; + } else { + $self->{'_filter'} = 'NONE'; + } + + if($data =~ /$Newline\s*E=(\d+)$Newline/so) { $self->{'_expect'} = $1; } + + if($data =~ /$Newline\s*M=(\w+)$Newline/so) { $self->{'_matrix'} = $1; } + + if($data =~ /\s*Frame MatID Matrix name .+?$Newline +(.+?)$Newline/so) { + ## WU-Blast2. + my ($fr, $mid, $mat, $lu, $ku, $hu, $lc, $kc, $hc) = split(/\s+/,$1); + $self->{'_matrix'} = $mat || 'UNKNOWN'; + $self->{'_lambda'} = $lu || 'UNKNOWN'; + $self->{'_k'} = $ku || 'UNKNOWN'; + $self->{'_h'} = $hu || 'UNKNOWN'; + + } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) { + ## NCBI-Blast1. + my ($l, $k, $h) = split(/\s+/, $1); + $self->{'_lambda'} = $l || 'UNKNOWN'; + $self->{'_k'} = $k || 'UNKNOWN'; + $self->{'_h'} = $h || 'UNKNOWN'; + } + + if($data =~ /E +S +W +T +X.+?$Newline +(.+?)$Newline/so) { + # WashU-Blast2 + my ($fr, $mid, $len, $elen, $e, $s, $w, $t, $x, $e2, $s2) = split(/\s+/,$1); + $self->{'_expect'} ||= $e || 'UNKNOWN'; + $self->{'_s'} = $s || 'UNKNOWN'; + $self->{'_word_size'} = $w || 'UNKNOWN'; + $self->{'_t'} = $t || 'UNKNOWN'; + $self->{'_x'} = $x || 'UNKNOWN'; + + } elsif($data =~ /E +S +T1 +T2 +X1 +X2 +W +Gap$Newline +(.+?)$Newline/so) { + ## NCBI-Blast1. + my ($e, $s, $t1, $t2, $x1, $x2, $w, $gap) = split(/\s+/,$1); + $self->{'_expect'} ||= $e || 'UNKNOWN'; + $self->{'_s'} = $s || 'UNKNOWN'; + $self->{'_word_size'} = $w || 'UNKNOWN'; + $self->{'_t1'} = $t1 || 'UNKNOWN'; + $self->{'_t2'} = $t2 || 'UNKNOWN'; + $self->{'_x1'} = $x1 || 'UNKNOWN'; + $self->{'_x2'} = $x2 || 'UNKNOWN'; + $self->{'_gap'} = $gap || 'UNKNOWN'; + } + + if(!$self->{'_matrix'}) { + $self->{'_matrix'} = $DEFAULT_MATRIX.'?'; + if($self->strict > 0) { + $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX."); + } + } +} + +=head2 _set_gapping_wu + + Usage : n/a; internal function called by _set_blast1_stats() + Purpose : Determine if gapping_wu was on for WashU Blast reports. + Comments : In earlier versions, gapping was always specified + : but in the current version (2.0a19MP), gapping is on by default + : and there is no positive "gapping" indicator in the Parameters + : section. + +See Also : L<_set_blast1_stats()|_set_blast1_stats> + +=cut + +#-------------------- +sub _set_gapping_wu { +#-------------------- + my ($self, $data) = @_; + + if($data =~ /gaps?$Newline/so) { + $self->{'_gapped'} = ($data =~ /nogaps?$Newline/so) ? 0 : 1; + } else { + $self->{'_gapped'} = 1; + } +} + +=head2 _set_date + + Usage : n/a; internal function called by _parse_footer() + Purpose : Determine the date on which the Blast analysis was performed. + Comments : Date information is not consistently added to Blast output. + : Uses superclass method set_date() to set date from the file, + : (if any). + +See Also : L<_parse_footer()|_parse_footer>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> + +=cut + +#-------------- +sub _set_date { +#-------------- + my $self = shift; + my $data = shift; + + ### Network BLAST reports from NCBI are time stamped as follows: + #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77 + if($data =~ /Start:\s+(.+?)\s+End:/s) { + ## Calling superclass method to set the date. + ## If we can't get date from the report, file date is obtained. + $self->set_date($1); + } elsif($data =~ /Date:\s+(.*?)$Newline/so) { + ## E-mailed reports have a Date: field + $self->set_date($1); + } elsif( $data =~ /done\s+at (.+?)$Newline/so ) { + $self->set_date($1); + } elsif( $data =~ /$Newline([\w:, ]+), Up \d+/so ) { + $self->set_date($1); + } else { + ## Otherwise, let superclass attempt to get the file creation date. + $self->set_date() if $self->file; + } +} + +=head2 _set_length + + Usage : n/a; called automatically during Blast report parsing. + Purpose : Sets the length of the query sequence (extracted from report). + Returns : integer (length of the query sequence) + Throws : Exception if cannot determine the query sequence length from + : the BLAST report. + : Exception if the length is below the min_length cutoff (if any). + Comments : The logic here is a bit different from the other _set_XXX() + : methods since the significance of the BLAST report is assessed + : if MIN_LENGTH is set. + +See Also : B<Bio::Tools::SeqAnal::length()>, L<Links to related modules> + +=cut + +#--------------- +sub _set_length { +#--------------- + my ($self, $data) = @_; + + my ($length); + if( $data =~ m/$Newline\s+\(([\d|,]+) letters[\);]/so ) { + $length = $1; + $length =~ s/,//g; +# printf "Length = $length in BLAST for %s$Newline",$self->name; <STDIN>; + } else { + $self->throw("Can't determine sequence length from BLAST report."); + } + + my($sig_len); + if(defined($Blast->{'_min_length'})) { + local $^W = 0; + if($length < $Blast->{'_min_len'}) { + $self->throw("Query sequence too short for ${\$self->name} ($length)", + "Minimum length is $Blast->{'_min_len'}"); + } + } + + $self->length($length); # defined in superclass. +} + +=head2 _set_database + + Usage : n/a; called automatically during Blast report parsing. + Purpose : Sets the name of the database used by the BLAST analysis. + : Extracted from raw BLAST report. + Throws : Exception if the name of the database cannot be determined. + Comments : The database name is used by methods or related objects + : for database-specific parsing. + +See Also : L<parse()|parse>, B<Bio::Tools::SeqAnal::database()>,B<Bio::Tools::SeqAnal::_set_db_stats()>,L<Links to related modules> + +=cut + +#------------------ +sub _set_database { +#------------------ +# This now only sets data base information extracted from the report footer. + + my ($self, $data) = @_; + + my ($name, $date, $lets, $seqs); + + my $strict = $self->strict > 0; + + # This is fail-safe since DB name usually gets set in _parse_header() + # In some reports, the database is only listed at bottom (NCBI 2.0.8). + if($data =~ m/Database: +(.+?)$Newline/so ) { + $name = $1; + } elsif(not $self->database) { + $self->warn("Can't determine database name from BLAST report."); + } + + if($data =~ m/Posted date: +(.+?)$Newline/so ) { + $date = $1; + } elsif($data =~ m/Release date: +(.+?)$Newline/so ) { + $date = $1; + } elsif($strict) { + $self->warn("Can't determine database release date."); + } + + if($data =~ m/letters in database: +([\d,]+)/si || + $data =~ m/length of database: +([\d,]+)/si ) { + $lets = $1; + } elsif($strict) { + $self->warn("Can't determine number of letters in database.\n$data\n"); + } + + if($data =~ m/sequences in database: +([\d,]+)/si || + $data =~ m/number of sequences: +([\d,]+)/si ) { + $seqs = $1; + } elsif($strict) { + $self->warn("Can't determine number of sequences in database.\n$data\n"); + } + + $self->_set_db_stats( -NAME => $name, + -RELEASE => $date || '', + -LETTERS => $lets || '', + -SEQS => $seqs || '' + ); +} + +=head2 _set_query + + Usage : n/a; called automatically during Blast report parsing. + Purpose : Set the name of the query and the query description. + : Extracted from the raw BLAST report. + Returns : String containing name of query extracted from report. + Throws : Warning if the name of the query cannont be obtained. + +See Also : B<Bio::Tools::SeqAnal::query_desc()>,L<Links to related modules> + +=cut + +#--------------- +sub _set_query { +#--------------- + my $self = shift; + my $data = shift; + + if($data =~ m/${Newline}Query= *(.+?)$Newline/so ) { + my $info = $1; + $info =~ s/TITLE //; + # Split the query line into two parts. + # Using \s instead of ' ' + $info =~ /(\S+?)\s(.*)/; + $self->query_desc($2 || ''); + # set name of Blast object and return. + $self->name($1 || 'UNKNOWN'); + } else { + $self->warn("Can't determine query sequence name from BLAST report."); + } +# print STDERR "$Newline NAME = ${\$self->name}$Newline"; +} + +=head2 _parse_signif + + Usage : &_parse_signif(string, layout, gapped); + : This is a class function. + Purpose : Extracts the P- or Expect value from a single line of a BLAST description section. + Example : &_parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1); + : &_parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2); + Argument : string = line from BLAST description section + : layout = integer (1 or 2) + : gapped = boolean (true if gapped Blast). + Returns : Float (0.001 or 1e-03) + Status : Static + +=cut + +#------------------ +sub _parse_signif { +#------------------ + my ($line, $layout, $gapped) = @_; + + local $_ = $line; + my @linedat = split(); + + # When processing both Blast1 and Blast2 reports + # in the same run, offset needs to be configured each time. + + my $offset = 0; + $offset = 1 if $layout == 1 or not $gapped; + + my $signif = $linedat[ $#linedat - $offset ]; + + # fail-safe check + if(not $signif =~ /[.-]/) { + $offset = ($offset == 0 ? 1 : 0); + $signif = $linedat[ $#linedat - $offset ]; + } + + $signif = "1$signif" if $signif =~ /^e/i; + return $signif; +} + +## +## BEGIN ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT. +## + +sub program { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::program(@_) if @_; # set + $self->SUPER::program || $Blast->SUPER::program; # get +} + +sub program_version { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::program_version(@_) if @_; # set + $self->SUPER::program_version || $Blast->SUPER::program_version; # get +} + +sub database { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::database(@_) if @_; # set + $self->SUPER::database || $Blast->SUPER::database; # get +} + +sub database_letters { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::database_letters(@_) if @_; # set + $self->SUPER::database_letters || $Blast->SUPER::database_letters; # get +} + +sub database_release { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::database_release(@_) if @_; # set + $self->SUPER::database_release || $Blast->SUPER::database_release; # get +} + +sub database_seqs { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::database_seqs(@_) if @_; # set + $self->SUPER::database_seqs || $Blast->SUPER::database_seqs; # get +} + +sub date { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $self->SUPER::date(@_) if @_; # set + $self->SUPER::date || $Blast->SUPER::date; # get +} + +sub best { +## Overridden method to incorporate the BLAST object. + my $self = shift; + return $Blast->SUPER::best(@_) if @_; # set + $Blast->SUPER::best; # get +} + +=head2 signif + + Usage : $blast->signif(); + Purpose : Gets the P or Expect value used as significance screening cutoff. + Returns : Scientific notation number with this format: 1.0e-05. + Argument : n/a + Comments : Screening of significant hits uses the data provided on the + : description line. For Blast1 and WU-Blast2, this data is P-value. + : for Blast2 it is an Expect value. + : + : Obtains info from the static $Blast object if it has not been set + : for the current object. + +See Also : L<_set_signif()|_set_signif> + +=cut + +#----------- +sub signif { +#----------- + my $self = shift; + my $sig = $self->{'_significance'} || $Blast->{'_significance'}; + sprintf "%.1e", $sig; +} + +=head2 is_signif + + Usage : $blast->is_signif(); + Purpose : Determine if the BLAST report contains significant hits. + Returns : Boolean + Argument : n/a + Comments : BLAST reports without significant hits but with defined + : significance criteria will throw exceptions during construction. + : This obviates the need to check significant() for + : such objects. + +See Also : L<_set_signif()|_set_signif> + +=cut + +#------------ +sub is_signif { my $self = shift; return $self->{'_is_significant'}; } +#------------ + +# is_signif() doesn't incorporate the static $Blast object but is included +# here to be with the other 'signif' methods. + +=head2 signif_fmt + + Usage : $blast->signif_fmt( [FMT] ); + Purpose : Allows retrieval of the P/Expect exponent values only + : or as a two-element list (mantissa, exponent). + Usage : $blast_obj->signif_fmt('exp'); + : $blast_obj->signif_fmt('parts'); + Returns : String or '' if not set. + Argument : String, FMT = 'exp' (return the exponent only) + : = 'parts'(return exponent + mantissa in 2-elem list) + : = undefined (return the raw value) + Comments : P/Expect values are still stored internally as the full, + : scientific notation value. + : This method uses the static $Blast object since this issue + : will pertain to all Blast reports within a given set. + : This setting is propagated to Bio::Tools::Blast::Sbjct.pm. + +=cut + +#------------- +sub signif_fmt { +#------------- + my $self = shift; + if(@_) { $Blast->{'_signif_fmt'} = shift; } + $Blast->{'_signif_fmt'} || ''; +} + +=head2 min_length + + Usage : $blast->min_length(); + Purpose : Gets the query sequence length used as significance screening criteria. + Returns : Integer + Argument : n/a + Comments : Obtains info from the static $Blast object if it has not been set + : for the current object. + +See Also : L<_set_signif()|_set_signif>, L<signif()|signif> + +=cut + +#-------------- +sub min_length { +#-------------- + my $self = shift; + $self->{'_min_length'} || $Blast->{'_min_length'}; +} + +=head2 gapped + + Usage : $blast->gapped(); + Purpose : Set/Get boolean indicator for gapped BLAST. + Returns : Boolean + Argument : n/a + Comments : Obtains info from the static $Blast object if it has not been set + : for the current object. + +=cut + +#----------- +sub gapped { +#----------- + my $self = shift; + if(@_) { $self->{'_gapped'} = shift; } + $self->{'_gapped'} || $Blast->{'_gapped'}; +} + +=head2 _get_stats + + Usage : n/a; internal method. + Purpose : Set/Get indicator for collecting full statistics from report. + Returns : Boolean (0 | 1) + Comments : Obtains info from the static $Blast object which gets set + : by _init_parse_params(). + +=cut + +#--------------- +sub _get_stats { +#--------------- + my $self = shift; + $Blast->{'_get_stats'}; +} + +=head2 _layout + + Usage : n/a; internal method. + Purpose : Set/Get indicator for the layout of the report. + Returns : Integer (1 | 2) + : Defaults to 2 if not set. + Comments : Blast1 and WashU-Blast2 have a layout = 1. + : This is intended for internal use by this and closely + : allied modules like Sbjct.pm and HSP.pm. + : + : Obtains info from the static $Blast object if it has not been set + : for the current object. + +=cut + +#------------ +sub _layout { +#------------ + my $self = shift; + if(@_) { + # Optimization if we know all reports share the same stats. + if($Blast->{'_share'}) { + $Blast->{'_layout'} = shift; + } else { + $self->{'_layout'} = shift; + } + } + $self->{'_layout'} || $Blast->{'_layout'} || 2; +} + +## +## END ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT. +## + +=head2 hits + + Usage : $blast->hits(); + Purpose : Get a list containing all BLAST hit (Sbjct) objects. + : Get the numbers of significant hits. + Examples : @hits = $blast->hits(); + : $num_signif = $blast->hits(); + Returns : List context : list of Bio::Tools::Blast::Sbjct.pm objects + : or an empty list if there are no hits. + : Scalar context: integer (number of significant hits) + : or zero if there are no hits. + : (Equivalent to num_hits()). + Argument : n/a. Relies on wantarray. + Throws : n/a. + : Not throwing exception because the absence of hits may have + : resulted from stringent significance criteria, not a failure + : set the hits. + +See Also : L<hit()|hit>, L<num_hits()|num_hits>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif> + +=cut + +#---------- +sub hits { +#---------- + my $self = shift; + + if(wantarray) { + my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : (); + return @ary; + } else { + return $self->num_hits(); + } + +# my $num = ref($self->{'_hits'}) ? scalar(@{$self->{'_hits'}}) : 0; +# my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : (); +# +# return wantarray +# # returning list containing all hits or empty list. +# ? $self->{'_is_significant'} ? @ary : () +# # returning number of hits or 0. +# : $self->{'_is_significant'} ? $num : 0; +} + +=head2 hit + + Example : $blast_obj->hit( [class] ) + Purpose : Get a specific hit object. + : Provides some syntactic sugar for the hits() method. + Usage : $hitObj = $blast->hit(); + : $hitObj = $blast->hit('best'); + : $hitObj = $blast->hit('worst'); + : $hitObj = $blast->hit( $name ); + Returns : Object reference for a Bio::Tools::Blast::Sbjct.pm object. + : undef if there are no hit (Sbjct) objects defined. + Argument : Class (or no argument). + : No argument (default) = highest scoring hit (same as 'best'). + : 'best' or 'first' = highest scoring hit. + : 'worst' or 'last' = lowest scoring hit. + : $name = retrieve a hit by seq id (case-insensitive). + Throws : Exception if the Blast object has no significant hits. + : Exception if a hit cannot be found when supplying a specific + : hit sequence identifier as an argument. + Comments : 'best' = lowest significance value (P or Expect) among significant hits. + : 'worst' = highest sigificance value (P or Expect) among significant hits. + +See Also : L<hits()|hits>, L<num_hits()|num_hits>, L<is_signif()|is_signif> + +=cut + +#--------- +sub hit { +#--------- + my( $self, $option) = @_; + $option ||= 'best'; + + if($Blast->{'_no_aligns'} || ! ref($self->{'_hits'})) { + return undef; + } + + $self->{'_is_significant'} or + $self->throw("There were no significant hits.", + "Use num_hits(), hits(), is_signif() to check."); + + my @hits = @{$self->{'_hits'}}; + + return $hits[0] if $option =~ /^(best|first|1)$/i; + return $hits[$#hits] if $option =~ /^(worst|last)$/i; + + # Get hit by name. + foreach ( @hits ) { + return $_ if $_->name() =~ /$option/i; + } + + $self->throw("Can't get hit for: $option"); +} + +=head2 num_hits + + Usage : $blast->num_hits( ['total'] ); + Purpose : Get number of significant hits or number of total hits. + Examples : $num_signif = $blast-num_hits; + : $num_total = $blast->num_hits('total'); + Returns : Integer + Argument : String = 'total' (or no argument). + : No argument (Default) = return number of significant hits. + : 'total' = number of total hits. + Throws : n/a. + : Not throwing exception because the absence of hits may have + : resulted from stringent significance criteria, not a failure + : set the hits. + Comments : A significant hit is defined as a hit with an expect value + : (or P value for WU-Blast) at or below the -signif parameter + : used when parsing the report. Additionally, if a filter function + : was supplied, the significant hit must also pass that + : criteria. + +See Also : L<hits()|hits>, L<hit()|hit>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>, L<parse()|parse> + +=cut + +#------------- +sub num_hits { +#------------- + my( $self, $option) = @_; + $option ||= ''; + + $option =~ /total/i and return $self->{'_num_hits'} || 0; + + # Default: returning number of significant hits. +# return $self->{'_num_hits_significant'} || 0; +# return 0 if not ref $self->{'_hits'}; + + if(ref $self->{'_hits'}) { + return scalar(@{$self->{'_hits'}}); + } else { + return $self->{'_num_hits_significant'} || 0; + } +} + +=head2 lowest_p + + Usage : $blast->lowest_p() + Purpose : Get the lowest P-value among all hits in a BLAST report. + : Syntactic sugar for $blast->hit('best')->p(). + Returns : Float or scientific notation number. + : Returns -1.0 if lowest_p has not been set. + Argument : n/a. + Throws : Exception if the Blast report does not report P-values + : (as is the case for NCBI Blast2). + Comments : A value is returned regardless of whether or not there were + : significant hits ($DEFAULT_SIGNIF, currently 999). + +See Also : L<lowest_expect()|lowest_expect>, L<lowest_signif()|lowest_signif>, L<highest_p()|highest_p>, L<signif_fmt()|signif_fmt> + +=cut + +#------------ +sub lowest_p { +#------------ + my $self = shift; + + # Layout 2 = NCBI Blast 2.x does not report P-values. + $self->_layout == 2 and + $self->throw("Can't get P-value with BLAST2.", + "Use lowest_signif() or lowest_expect()"); + + return $self->{'_lowestSignif'} || -1.0; +} + +=head2 lowest_expect + + Usage : $blast->lowest_expect() + Purpose : Get the lowest Expect value among all hits in a BLAST report. + : Syntactic sugar for $blast->hit('best')->expect() + Returns : Float or scientific notation number. + : Returns -1.0 if lowest_expect has not been set. + Argument : n/a. + Throws : Exception if there were no significant hits and the report + : does not have Expect values on the description lines + : (i.e., Blast1, WashU-Blast2). + +See Also : L<lowest_p()|lowest_p>, L<lowest_signif()|lowest_signif>, L<highest_expect()|highest_expect>, L<signif_fmt()|signif_fmt> + +=cut + +#------------------ +sub lowest_expect { +#------------------ + my $self = shift; + + if ($self->_layout == 2) { + return $self->{'_lowestSignif'} || -1.0; + } + + if($self->{'_is_significant'}) { + my $bestHit = $self->{'_hits'}->[0]; + return $bestHit->expect(); + } else { + $self->throw("Can't get lowest expect value: no significant hits ", + "The format of this report requires expect values to be extracted$Newline". + "from the hits themselves."); + } +} + +=head2 highest_p + + Example : $blast->highest_p( ['overall']) + Purpose : Get the highest P-value among all hits in a BLAST report. + : Syntactic sugar for $blast->hit('worst')->p() + : Can also get the highest P-value overall (not just among signif hits). + Usage : $p_signif = $blast->highest_p(); + : $p_all = $blast->highest_p('overall'); + Returns : Float or scientific notation number. + : Returns -1.0 if highest_p has not been set. + Argument : String 'overall' or no argument. + : No argument = get highest P-value among significant hits. + Throws : Exception if object is created from a Blast2 report + : (which does not report P-values). + +See Also : L<highest_signif()|highest_signif>, L<lowest_p()|lowest_p>, L<_set_signif()|_set_signif>, L<signif_fmt()|signif_fmt> + +=cut + +#--------------- +sub highest_p { +#--------------- + my ($self, $overall) = @_; + + # Layout 2 = NCBI Blast 2.x does not report P-values. + $self->_layout == 2 and + $self->throw("Can't get P-value with BLAST2.", + "Use highest_signif() or highest_expect()"); + + $overall and return $self->{'_highestSignif'} || -1.0; + $self->hit('worst')->p(); +} + +=head2 highest_expect + + Usage : $blast_object->highest_expect( ['overall']) + Purpose : Get the highest Expect value among all significant hits in a BLAST report. + : Syntactic sugar for $blast->hit('worst')->expect() + Examples : $e_sig = $blast->highest_expect(); + : $e_all = $blast->highest_expect('overall'); + Returns : Float or scientific notation number. + : Returns -1.0 if highest_exoect has not been set. + Argument : String 'overall' or no argument. + : No argument = get highest Expect-value among significant hits. + Throws : Exception if there were no significant hits and the report + : does not have Expect values on the description lines + : (i.e., Blast1, WashU-Blast2). + +See Also : L<lowest_expect()|lowest_expect>, L<highest_signif()|highest_signif>, L<signif_fmt()|signif_fmt> + +=cut + +#------------------- +sub highest_expect { +#------------------- + my ($self, $overall) = @_; + + if ( $overall and $self->_layout == 2) { + return $self->{'_highestSignif'} || -1.0; + } + + if($self->{'_is_significant'}) { + return $self->hit('worst')->expect; + } else { + $self->throw("Can't get highest expect value: no significant hits ", + "The format of this report requires expect values to be extracted$Newline". + "from the hits themselves."); + } +} + +=head2 lowest_signif + + Usage : $blast_obj->lowest_signif(); + : Syntactic sugar for $blast->hit('best')->signif() + Purpose : Get the lowest P or Expect value among all hits + : in a BLAST report. + : This method is syntactic sugar for $blast->hit('best')->signif() + : The value returned is the one which is reported in the decription + : section of the Blast report. + : For Blast1 and WU-Blast2, this is a P-value, + : for NCBI Blast2, it is an Expect value. + Example : $blast->lowest_signif(); + Returns : Float or scientific notation number. + : Returns -1.0 if lowest_signif has not been set. + Argument : n/a. + Throws : n/a. + Status : Deprecated. Use lowest_expect() or lowest_p(). + Comments : The signif() method provides a way to deal with the fact that + : Blast1 and Blast2 formats differ in what is reported in the + : description lines of each hit in the Blast report. The signif() + : method frees any client code from having to know if this is a P-value + : or an Expect value, making it easier to write code that can process + : both Blast1 and Blast2 reports. This is not necessarily a good thing, since + : one should always know when one is working with P-values or + : Expect values (hence the deprecated status). + : Use of lowest_expect() is recommended since all hits will have an Expect value. + +See Also : L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif> + +=cut + +#------------------ +sub lowest_signif { +#------------------ + my ($self) = @_; + + return $self->{'_lowestSignif'} || -1.0; +} + +=head2 highest_signif + + Usage : $blast_obj->highest_signif('overall'); + : Syntactic sugar for $blast->hit('worst')->signif() + Purpose : Get the highest P or Expect value among all hits + : in a BLAST report. + : The value returned is the one which is reported in the decription + : section of the Blast report. + : For Blast1 and WU-Blast2, this is a P-value, + : for NCBI Blast2, it is an Expect value. + Example : $blast->highest_signif(); + Returns : Float or scientific notation number. + : Returns -1.0 if highest_signif has not been set. + Argument : Optional string 'overall' to get the highest overall significance value. + Throws : n/a. + Status : Deprecated. Use highest_expect() or highest_p(). + Comments : Analogous to lowest_signif(), q.v. + +See Also : L<lowest_signif()|lowest_signif>, L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif> + +=cut + +#--------------------- +sub highest_signif { +#--------------------- + my ($self, $overall) = @_; + + $overall and return $self->{'_highestSignif'} || -1.0; + + if($self->{'_is_significant'}) { + my $worst_hit = $self->hit('worst'); + if(defined $worst_hit) { + return $worst_hit->signif; + } else { + return $self->{'_highestSignif'}; + } + } +} + +=head2 matrix + + Usage : $blast_object->matrix(); + Purpose : Get the name of the scoring matrix used. + : This is extracted from the report. + Argument : n/a + Returns : string or undef if not defined + +=cut + +#------------ +sub matrix { my $self = shift; $self->{'_matrix'} || $Blast->{'_matrix'}; } +#------------ + +=head2 filter + + Usage : $blast_object->filter(); + Purpose : Get the name of the low-complexity sequence filter used. + : (SEG, SEG+XNU, DUST, NONE). + : This is extracted from the report. + Argument : n/a + Returns : string or undef if not defined + +=cut + +#---------- +sub filter { my $self = shift; $self->{'_filter'} || $Blast->{'_filter'}; } +#---------- + +=head2 expect + + Usage : $blast_object->expect(); + Purpose : Get the expect parameter (E) used for the Blast analysis. + : This is extracted from the report. + Argument : n/a + Returns : string or undef if not defined. + +=cut + +#----------- +sub expect { my $self = shift; $self->{'_expect'} || $Blast->{'_expect'}; } +#----------- + +=head2 karlin_altschul + + Usage : $blast_object->karlin_altschul(); + Purpose : Get the Karlin_Altschul sum statistics (Lambda, K, H) + : These are extracted from the report. + Argument : n/a + Returns : list of three floats (Lambda, K, H) + : If not defined, returns list of three zeros) + +=cut + +#--------------------- +sub karlin_altschul { +#--------------------- + my $self = shift; + if(defined($self->{'_lambda'})) { + ($self->{'_lambda'}, $self->{'_k'}, $self->{'_h'}); + } elsif(defined($Blast->{'_lambda'})) { + ($Blast->{'_lambda'}, $Blast->{'_k'}, $Blast->{'_h'}); + } else { + (0, 0, 0); + } +} + +=head2 word_size + + Usage : $blast_object->word_size(); + Purpose : Get the word_size used during the Blast analysis. + : This is extracted from the report. + Argument : n/a + Returns : integer or undef if not defined. + +=cut + +#-------------- +sub word_size { +#-------------- + my $self = shift; + $self->{'_word_size'} || $Blast->{'_word_size'}; +} + +=head2 s + + Usage : $blast_object->s(); + Purpose : Get the s statistic for the Blast analysis. + : This is extracted from the report. + Argument : n/a + Returns : integer or undef if not defined. + +=cut + +#------ +sub s { my $self = shift; $self->{'_s'} || $Blast->{'_s'}; } +#------ + +=head2 gap_creation + + Usage : $blast_object->gap_creation(); + Purpose : Get the gap creation penalty used for a gapped Blast analysis. + : This is extracted from the report. + Argument : n/a + Returns : integer or undef if not defined. + +See Also : L<gap_extension()|gap_extension> + +=cut + +#----------------- +sub gap_creation { +#----------------- + my $self = shift; + $self->{'_gapCreation'} || $Blast->{'_gapCreation'}; +} + +=head2 gap_extension + + Usage : $blast_object->gap_extension(); + Purpose : Get the gap extension penalty used for a gapped Blast analysis. + : This is extracted from the report. + Argument : n/a + Returns : integer or undef if not defined. + +See Also : L<gap_extension()|gap_extension> + +=cut + +#------------------- +sub gap_extension { +#------------------- + my $self = shift; + $self->{'_gapExtension'} || $Blast->{'_gapExtension'}; +} + +=head2 ambiguous_aln + + Usage : $blast_object->ambiguous_aln(); + Purpose : Test all hits and determine if any have an ambiguous alignment. + Example : print "ambiguous" if $blast->ambiguous_aln(); + Returns : Boolean (true if ANY significant hit has an ambiguous alignment) + Argument : n/a + Throws : n/a + Status : Experimental + Comments : An ambiguous BLAST alignment is defined as one where two or more + : different HSPs have significantly overlapping sequences such + : that it is not possible to create a unique alignment + : by simply concatenating HSPs. This may indicate the presence + : of multiple domains in one sequence relative to another. + : This method only indicates the presence of ambiguity in at + : least one significant hit. To determine the nature of the + : ambiguity, each hit must be examined. + +See Also : B<Bio::Tools::Blast::Sbjct::ambiguous_aln()>,L<Links to related modules> + +=cut + +#---------------- +sub ambiguous_aln { +#---------------- + my $self = shift; + foreach($self->hits()) { + return 1 if ($_->ambiguous_aln() ne '-'); + } + 0; +} + +=head2 overlap + + Usage : $blast_object->overlap([integer]); + Purpose : Set/Get the number of overlapping residues allowed when tiling multiple HSPs. + : Delegates to Bio::Tools::Blast::Sbjct::overlap(). + Throws : Exception if there are no significant hits. + Status : Experimental + +See Also : B<Bio::Tools::Blast::Sbjct::overlap()>,L<Links to related modules> + +=cut + +#------------ +sub overlap { +#------------ + my $self = shift; + if(not $self->hits) { + $self->throw("Can't get overlap data without significant hits."); + } + $self->hit->overlap(); +} + +=head2 homol_data + + Usage : @data = $blast_object->homo_data( %named_params ); + Purpose : Gets specific similarity data about each significant hit. + Returns : Array of strings: + : "Homology data" for each HSP is in the format: + : "<integer> <start> <stop>" + : Data for different HSPs are tab-delimited. + Argument : named parameters passed along to the hit objects. + Throws : n/a + Status : Experimental + Comments : This is a very experimental method used for obtaining an + : indication of: + : 1) how many HSPs are in a Blast alignment + : 2) how strong the similarity is between sequences in the HSP + : 3) the endpoints of the alignment (sequence monomer numbers) + +See Also : B<Bio::Tools::Blast::Sbjct::homol_data()>,L<Links to related modules> + +=cut + +#---------------- +sub homol_data { +#---------------- + + my ($self, %param) = @_; + my @hits = $self->hits(); + my @data = (); + + ## Note: Homology data can be either for the query sequence or the hit + ## (Sbjct) sequence. Default is for sbjct. This is specifyable via + ## $param{-SEQ}='sbjct' || 'query'. + + foreach ( @hits ) { + push @data, $_->homol_data(%param); + } + @data; +} + +=head1 REPORT GENERATING METHODS + +=head2 table + + Usage : $blast_obj->table( [get_desc]); + Purpose : Output data for each HSP of each hit in tab-delimited format. + Example : print $blast->table; + : print $blast->table(0); + : # Call table_labels() to print labels. + Argument : get_desc = boolean, if false the description of each hit is not included. + : Default: true (if not defined, include description column). + Returns : String containing tab-delimited set of data for each HSP + : of each significant hit. Different HSPs are separated by newlines. + : Left-to-Right order of fields: + : 1 QUERY_NAME # Sequence identifier of the query. + : 2 QUERY_LENGTH # Full length of the query sequence. + : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit". + : 4 SBJCT_LENGTH # Full length of the sbjct sequence. + : 5 EXPECT # Expect value for the alignment. + : 6 SCORE # Blast score for the alignment. + : 7 BITS # Bit score for the alignment. + : 8 NUM_HSPS # Number of HSPs (not the "N" value). + : 9 HSP_FRAC_IDENTICAL # fraction of identical substitutions. + : 10 HSP_FRAC_CONSERVED # fraction of conserved ("positive") substitutions. + : 11 HSP_QUERY_ALN_LENGTH # Length of the aligned portion of the query sequence. + : 12 HSP_SBJCT_ALN_LENGTH # Length of the aligned portion of the sbjct sequence. + : 13 HSP_QUERY_GAPS # Number of gaps in the aligned query sequence. + : 14 HSP_SBJCT_GAPS # Number of gaps in the aligned sbjct sequence. + : 15 HSP_QUERY_START # Starting coordinate of the query sequence. + : 16 HSP_QUERY_END # Ending coordinate of the query sequence. + : 17 HSP_SBJCT_START # Starting coordinate of the sbjct sequence. + : 18 HSP_SBJCT_END # Ending coordinate of the sbjct sequence. + : 19 HSP_QUERY_STRAND # Strand of the query sequence (TBLASTN/X only) + : 20 HSP_SBJCT_STRAND # Strand of the sbjct sequence (TBLASTN/X only) + : 21 HSP_FRAME # Frame for the sbjct translation (TBLASTN/X only) + : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from + : # the alignment section. + Throws : n/a + Comments : This method does not collect data based on tiling of the HSPs. + : The table will contains redundant information since the hit name, + : id, and other info for the hit are listed for each HSP. + : If you need more flexibility in the output format than this + : method provides, design a custom function. + +See Also : L<table_tiled()|table_tiled>, L<table_labels()|table_labels>, L<_display_hits()|_display_hits> + +=cut + +#----------- +sub table { +#----------- + my ($self, $get_desc) = @_; + my $str = ''; + + $get_desc = defined($get_desc) ? $get_desc : 1; +# $str .= $self->_table_labels($get_desc) unless $self->{'_labels'}; + + my $sigfmt = $self->signif_fmt(); + $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table. + my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e'; + + my ($hit, $hsp); + foreach $hit($self->hits) { + foreach $hsp($hit->hsps) { + # Note: range() returns a 2-element list. + $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s$Newline", + $self->name, $self->length, $hit->name, $hit->length, + $hit->expect($sigfmt), $hit->score, $hit->bits, + $hit->num_hsps, $hsp->frac_identical, $hsp->frac_conserved, + $hsp->length('query'), $hsp->length('sbjct'), + $hsp->gaps('list'), + $hsp->range('query'), $hsp->range('sbjct'), + $hsp->strand('query'), $hsp->strand('sbjct'), $hsp->frame, + ($get_desc ? $hit->desc : ''); + } + } + $str =~ s/\t$Newline/$Newline/gs; + $str; +} + +=head2 table_labels + + Usage : print $blast_obj->table_labels( [get_desc] ); + Purpose : Get column labels for table(). + Returns : String containing column labels. Tab-delimited. + Argument : get_desc = boolean, if false the description column is not included. + : Default: true (if not defined, include description column). + Throws : n/a + +See Also : L<table()|table> + +=cut + +#---------------- +sub table_labels { +#---------------- + my ($self, $get_desc) = @_; + $get_desc = defined($get_desc) ? $get_desc : 1; + my $descstr = $get_desc ? 'DESC' : ''; + my $descln = $get_desc ? '-----' : ''; + + my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", + 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', 'HSPS', + 'IDEN', 'CONSV', 'Q_ALN', 'S_ALN', 'Q_GAP', 'S_GAP', + 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', 'Q_STR', 'S_STR', 'FRAM', $descstr; + $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", + '-----', '-----', '-----', '-----', '-----', '-----', '-----', '-----', + '-----', '-----', '-----', '-----', '-----', '-----', + '-----', '-----', '-----','-----', '-----', '-----','-----', $descln; + + $self->{'_labels'} = 1; + $str =~ s/\t$Newline/$Newline/gs; + $str; +} + +=head2 table_tiled + + Purpose : Get data from tiled HSPs in tab-delimited format. + : Allows only minimal flexibility in the output format. + : If you need more flexibility, design a custom function. + Usage : $blast_obj->table_tiled( [get_desc]); + Example : print $blast->table_tiled; + : print $blast->table_tiled(0); + : # Call table_labels_tiled() if you want labels. + Argument : get_desc = boolean, if false the description of each hit is not included. + : Default: true (include description). + Returns : String containing tab-delimited set of data for each HSP + : of each significant hit. Multiple hits are separated by newlines. + : Left-to-Right order of fields: + : 1 QUERY_NAME # Sequence identifier of the query. + : 2 QUERY_LENGTH # Full length of the query sequence. + : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit". + : 4 SBJCT_LENGTH # Full length of the sbjct sequence. + : 5 EXPECT # Expect value for the alignment. + : 6 SCORE # Blast score for the alignment. + : 7 BITS # Bit score for the alignment. + : 8 NUM_HSPS # Number of HSPs (not the "N" value). + : 9 FRAC_IDENTICAL* # fraction of identical substitutions. + : 10 FRAC_CONSERVED* # fraction of conserved ("positive") substitutions . + : 11 FRAC_ALN_QUERY* # fraction of the query sequence that is aligned. + : 12 FRAC_ALN_SBJCT* # fraction of the sbjct sequence that is aligned. + : 13 QUERY_ALN_LENGTH* # Length of the aligned portion of the query sequence. + : 14 SBJCT_ALN_LENGTH* # Length of the aligned portion of the sbjct sequence. + : 15 QUERY_GAPS* # Number of gaps in the aligned query sequence. + : 16 SBJCT_GAPS* # Number of gaps in the aligned sbjct sequence. + : 17 QUERY_START* # Starting coordinate of the query sequence. + : 18 QUERY_END* # Ending coordinate of the query sequence. + : 19 SBJCT_START* # Starting coordinate of the sbjct sequence. + : 20 SBJCT_END* # Ending coordinate of the sbjct sequence. + : 21 AMBIGUOUS_ALN # Ambiguous alignment indicator ('qs', 'q', 's'). + : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from + : # the alignment section. + : + : * Items marked with a "*" report data summed across all HSPs + : after tiling them to avoid counting data from overlapping regions + : multiple times. + Throws : n/a + Comments : This function relies on tiling of the HSPs since it calls + : frac_identical() etc. on the hit as opposed to each HSP individually. + +See Also : L<table()|table>, L<table_labels_tiled()|table_labels_tiled>, B<Bio::Tools::Blast::Sbjct::"HSP Tiling and Ambiguous Alignments">, L<Links to related modules> + +=cut + +#---------------- +sub table_tiled { +#---------------- + my ($self, $get_desc) = @_; + my $str = ''; + + $get_desc = defined($get_desc) ? $get_desc : 1; + + my ($hit); + my $sigfmt = $self->signif_fmt(); + $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table. + my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e'; + + foreach $hit($self->hits) { + $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s$Newline", + $self->name, $self->length, $hit->name, $hit->length, + $hit->expect($sigfmt), $hit->score, $hit->bits, + $hit->num_hsps, $hit->frac_identical, $hit->frac_conserved, + $hit->frac_aligned_query, $hit->frac_aligned_hit, + $hit->length_aln('query'), $hit->length_aln('sbjct'), + $hit->gaps('list'), $hit->range('query'), $hit->range('sbjct'), + $hit->ambiguous_aln, ($get_desc ? $hit->desc : ''); + } + $str =~ s/\t$Newline/$Newline/gs; + $str; +} + +=head2 table_labels_tiled + + Usage : print $blast_obj->table_labels_tiled( [get_desc] ); + Purpose : Get column labels for table_tiled(). + Returns : String containing column labels. Tab-delimited. + Argument : get_desc = boolean, if false the description column is not included. + : Default: true (include description column). + Throws : n/a + +See Also : L<table_tiled()|table_tiled> + +=cut + +#--------------------- +sub table_labels_tiled { +#--------------------- + my ($self, $get_desc) = @_; + my $descstr = $get_desc ? 'DESC' : ''; + my $descln = $get_desc ? '-----' : ''; + + my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", + 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', + 'HSPS', 'FR_ID', 'FR_CN', 'FR_ALQ', 'FR_ALS', 'Q_ALN', + 'S_ALN', 'Q_GAP', 'S_GAP', 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', + 'AMBIG', $descstr; + $str =~ s/\t$Newline/$Newline/; + $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", + '-----', '-----', '------', '-----', '-----','-----', '-----', + '-----', '-----', '-----', '-----', '-----', '-----', + '-----', '-----', '-----','-----','-----', '-----', + '-----','-----', $descln; + + $self->{'_labels_tiled'} = 1; + $str =~ s/\t$Newline/$Newline/gs; + $str; +} + +=head2 display + + Usage : $blast_object->display( %named_parameters ); + Purpose : Display information about Bio::Tools::Blast.pm data members, + : E.g., parameters of the report, data for each hit., etc. + : Overrides Bio::Root::Object::display(). + Example : $object->display(-SHOW=>'stats'); + : $object->display(-SHOW=>'stats,hits'); + Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) + : -SHOW => 'file' | 'hits' | 'homol' + : -WHERE => filehandle (default = STDOUT) + Returns : n/a (print/printf is called) + Status : Experimental + Comments : For tab-delimited output, see table(). + +See Also : L<_display_homol()|_display_homol>, L<_display_hits()|_display_hits>, L<_display_stats()|_display_stats>, L<table()|table>, B<Bio::Root::Tools::SeqAnal::display()>,L<Links to related modules>, + +=cut + +#-------------- +sub display { +#-------------- + my( $self, %param ) = @_; + + $self->SUPER::display(%param); + my $OUT = $self->fh(); + + $self->show =~ /homol/i and $self->_display_homol($OUT); + $self->show =~ /hits/i and $self->_display_hits( %param ); + 1; +} + +=head2 _display_homol + + Usage : n/a; called automatically by display() + Purpose : Print homology data for hits in the BLAST report. + Example : n/a + Argument : one argument = filehandle object. + Returns : printf call. + Status : Experimental + +See Also : L<homol_data()|homol_data>, L<display()|display> + +=cut + +#------------------- +sub _display_homol { +#------------------- + my( $self, $OUT ) = @_; + + print $OUT "${Newline}BLAST HOMOLOGY DATA FOR: ${\$self->name()}$Newline"; + print $OUT '-'x40,"$Newline"; + + foreach ( $self->homol_data()) { + print $OUT "$_$Newline"; + } +} + +=head2 _display_stats + + Usage : n/a; called automatically by display() + Purpose : Display information about the Blast report "meta" data. + : Overrides Bio::Tools::SeqAnal::_display_stats() calling it first. + Example : n/a + Argument : one argument = filehandle object. + Returns : printf call. + Status : Experimental + +See Also : L<display()|display>, B<Bio::Tools::SeqAnal::_display_stats()>,L<Links to related modules> + +=cut + +#-------------------- +sub _display_stats { +#-------------------- + my( $self, $OUT ) = @_; + + $self->SUPER::_display_stats($OUT); + printf( $OUT "%-15s: %s$Newline", "GAPPED", $self->gapped ? 'YES' : 'NO'); + printf( $OUT "%-15s: %d$Newline", "TOTAL HITS", $self->num_hits('total')); + printf( $OUT "%-15s: %s$Newline", "CHECKED ALL", $Blast->{'_check_all'} ? 'YES' : 'NO'); + printf( $OUT "%-15s: %s$Newline", "FILT FUNC", $Blast->{'_filt_func'} ? 'YES' : 'NO'); + if($self->min_length) { + printf( $OUT "%-15s: Length >= %s$Newline", "MIN_LENGTH", $self->min_length); + } + + my $num_hits = $self->num_hits; + my $signif_str = ($self->_layout == 1) ? 'P' : 'EXPECT'; + + printf( $OUT "%-15s: %d$Newline", "SIGNIF HITS", $num_hits); + # Blast1: signif = P-value, Blast2: signif = Expect value. + + printf( $OUT "%-15s: %s ($signif_str-VALUE)$Newline", "SIGNIF CUTOFF", $self->signif); + printf( $OUT "%-15s: %s$Newline", "LOWEST $signif_str", $self->lowest_signif()); + printf( $OUT "%-15s: %s$Newline", "HIGHEST $signif_str", $self->highest_signif()); + + printf( $OUT "%-15s: %s (OVERALL)$Newline", "HIGHEST $signif_str", $self->highest_signif('overall')); + + if($Blast->_get_stats) { + my $warn = ($Blast->{'_share'}) ? '(SHARED STATS)' : ''; + printf( $OUT "%-15s: %s$Newline", "MATRIX", $self->matrix() || 'UNKNOWN'); + printf( $OUT "%-15s: %s$Newline", "FILTER", $self->filter() || 'UNKNOWN'); + printf( $OUT "%-15s: %s$Newline", "EXPECT", $self->expect() || 'UNKNOWN'); + printf( $OUT "%-15s: %s, %s, %s %s$Newline", "LAMBDA, K, H", $self->karlin_altschul(), $warn); + printf( $OUT "%-15s: %s$Newline", "WORD SIZE", $self->word_size() || 'UNKNOWN'); + printf( $OUT "%-15s: %s %s$Newline", "S", $self->s() || 'UNKNOWN', $warn); + if($self->gapped) { + printf( $OUT "%-15s: %s$Newline", "GAP CREATION", $self->gap_creation() || 'UNKNOWN'); + printf( $OUT "%-15s: %s$Newline", "GAP EXTENSION", $self->gap_extension() || 'UNKNOWN'); + } + } + print $OUT "$Newline"; +} + +=head2 _display_hits + + Usage : n/a; called automatically by display() + Purpose : Display data for each hit. Not tab-delimited. + Example : n/a + Argument : one argument = filehandle object. + Returns : printf call. + Status : Experimental + Comments : For tab-delimited output, see table(). + +See Also : L<display()|display>, B<Bio::Tools::Blast::Sbjct::display()>, L<table()|table>, L<Links to related modules> + +=cut + +sub _display_hits { + + my( $self, %param ) = @_; + my $OUT = $self->fh(); + my @hits = $self->hits(); + + ## You need a wide screen to see this properly. + # Header. + print $OUT "${Newline}BLAST HITS FOR: ${\$self->name()} length = ${\$self->length}$Newline"; + print "(This table requires a wide display.)$Newline"; + print $OUT '-'x80,"$Newline"; + + print $self->table_labels_tiled(0); + print $self->table_tiled(0); + + ## Doing this interactively since there is potentially a lot of data here. + ## Not quite satisfied with this approach. + + if (not $param{-INTERACTIVE}) { + return 1; + } else { + my ($reply); + print "${Newline}DISPLAY FULL HSP DATA? (y/n): [n] "; + chomp( $reply = <STDIN> ); + $reply =~ /^y.*/i; + + my $count = 0; + foreach ( @hits ) { + $count++; + print $OUT "$Newline$Newline",'-'x80,"$Newline"; + print $OUT "HSP DATA FOR HIT #$count (hit <RETURN>)"; + print $OUT "$Newline",'-'x80;<STDIN>; + $param{-SHOW} = 'hsp'; + $_->display( %param ); + } + } + 1; +} + +=head2 to_html + + Usage : $blast_object->to_html( [%named_parameters] ) + Purpose : To produce an HTML-formatted version of a BLAST report + : for efficient navigation of the report using a web browser. + Example : # Using the static Blast object: + : # Can read from STDIN or from a designated file: + : $Blast->to_html($file); + : $Blast->to_html(-FILE=>$file, -HEADER=>$header); + : (if no file is supplied, STDIN will be used). + : # saving HTML to an array: + : $Blast->to_html(-FILE=>$file, -OUT =>\@out); + : # Using a pre-existing blast object (must have been built from + : # a file, not STDIN: + : $blastObj->to_html(); + Returns : n/a, either prints report to STDOUT or saves to a supplied array + : if an '-OUT' parameter is defined (see below). + Argument : %named_parameters: (TAGS ARE AND CASE INSENSITIVE). + : -FILE => string containing name of a file to be processed. + : If not a valid file or undefined, STDIN will be used. + : Can skip the -FILE tag if supplying a filename + : as a single argument. + : -HEADER => string + : This should be an HTML-formatted string to be used + : as a header for the page, typically describing query sequence, + : database searched, the date of the analysis, and any + : additional links. + : If not supplied, no special header is used. + : Regardless of whether a header is supplied, the + : standard info at the top of the report is highlighted. + : This should include the <HEADER></HEADER> section + : of the page as well. + : + : -IN => array reference containing a raw Blast report. + : each line in a separate element in the array. + : If -IN is not supplied, read() is called + : and data is then read either from STDIN or a file. + : + : -OUT => array reference to hold the HTML output. + : If not supplied, output is sent to STDOUT. + Throws : Exception is propagated from $HTML::get_html_func() + : and Bio::Root::Object::read(). + Comments : The code that does the actual work is located in + : Bio::Tools::Blast::HTML::get_html_func(). + Bugs : Some hypertext links to external databases may not be + : correct. This due in part to the dynamic nature of + : the web. + : Hypertext links are not added to hits without database ids. + TODO : Possibly create a function to produce fancy default header + : using data extracted from the report (requires some parsing). + : For example, it would be nice to always include a date + +See Also : B<Bio::Tools::Blast::HTML::get_html_func()>, B<Bio::Root::Object::read()>, L<Links to related modules> + +=cut + +#------------ +sub to_html { +#------------ + my ($self, @param) = @_; + + # Permits syntax such as: $blast->to_html($filename); + my ($file, $header_html, $in_aref, $out_aref) = + $self->_rearrange([qw(FILE HEADER IN OUT)], @param); + + $self->file($file) if $file; + + # Only setting the newline character once for efficiency. + $Newline ||= $Util->get_newline(-client => $self, @param); + + $header_html ||= ''; + (ref($out_aref) eq 'ARRAY') ? push(@$out_aref, $header_html) : print "$header_html$Newline"; + + require Bio::Tools::Blast::HTML; + Bio::Tools::Blast::HTML->import(qw(&get_html_func)); + + my ($func); + eval{ $func = &get_html_func($out_aref); }; + if($@) { + my $err = $@; + $self->throw($err); + } + + eval { + if(!$header_html) { + $out_aref ? push(@$out_aref, "<html><body>$Newline") : print "<html><body>$Newline"; + } + + if (ref ($in_aref) =~ /ARRAY/) { + # If data is being supplied, process it. + foreach(@$in_aref) { + &$func($_); + } + } else { + # Otherwise, read it, processing as we go. + + $self->read(-FUNC => $func, @param); + } + $out_aref ? push(@$out_aref, "$Newline</pre></body></html>") : print "$Newline</pre></body></html>"; + }; + + if($@) { + # Check for trivial error (report already HTML formatted). + if($@ =~ /HTML formatted/) { + print STDERR "\a${Newline}Blast report appears to be HTML formatted already.$Newline$Newline"; + } else { + my $err = $@; + $self->throw($err); + } + } +} + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). (An exception to this might +be for Sbjct.pm or HSP.pm which are more tightly coupled to Blast.pm and +may access Blast data members directly for efficiency purposes, but probably +should not). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for these data member descriptions to become obsolete as +this module is still evolving. Always double check this info and search +for members not described here. + +=back + +An instance of Bio::Tools::Blast.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + -------------------------------------------------------------- + _significance P-value or Expect value cutoff (depends on Blast version: + Blast1/WU-Blast2 = P-value; Blast2 = Expect value). + Values GREATER than this are deemed not significant. + + _significant Boolean. True if the query has one or more significant hit. + + _min_length Integer. Query sequences less than this will be skipped. + + _confirm_significance Boolean. True if client has supplied significance criteria. + + _gapped Boolean. True if BLAST analysis has gapping turned on. + + _hits List of Sbjct.pm objects. + + _num_hits Number of hits obtained from the BLAST report. + + _num_hits_significant Number of significant based on Significant data members. + + _highestSignif Highest P or Expect value overall (not just what is stored in _hits). + + _lowestSignif Lowest P or Expect value overall (not just what is stored in _hits). + +The static $Blast object has a special set of members: + + _errs + _share + _stream + _get_stats + _gapped + _filt_func + + Miscellaneous statistical parameters: + ------------------------------------- + _filter, _matrix, _word_size, _expect, _gapCreation, _gapExtension, _s, + _lambda, _k, _h + + INHERITED DATA MEMBERS + ----------------------- + (See Bio::Tools::SeqAnal.pm for inherited data members.) + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Blast/CHANGES --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Blast/CHANGES Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,368 @@ +Revision history for Perl extension Bio::Tools::Blast.pm and related modules. + +0.091 Thu May 18 15:15:35 2000 + (These fixes were made on both branch-06 and the main trunk). + - Fixed Webblast.pm (jitterbug PR #220). It now uses SeqIO and IO::Scalar + for sending formatted query sequence data to the BLAST servers. + - Fixed the examples/blast/ scripts related to remote BLASTing. + - Added single quotes to an '_expect' data member in Sbjct + (reported by Janet Young in jitterbug PR #218). + - Fixed regexp bug in Blast::HTML.pm that caused accessions with + . and _ characters to not be recognized. + + +0.09 Wed Mar 22 00:44:01 2000 + - Handles Blast reports that don't contain description sections. + - Handles parsing of NCBI 2.0.9 Blast2 reports. + - Identifiers of hits are assumed to be the first + chunk of non-whitespace characters in the description line + Does not assume any semantics in the structure of the identifier. + (Blast::Sbjct::_set_id() formerly attempted to extract database + name from the seq identifiers, but this was prone to breaking). + - Fixed bug in the parsing of reports that lack an hits + (incorporated code fix submitted by Michael B. Thornton + jitterbug PR#114, PR#173). + - New helper script retrieve_blast.pl added to examples/blast/ + for retrieving Blast reports from NCBI after running + run_blast_remote.pl See docs in retrieve_blast.pl for details. + - Updated Webblast.pm with new URLs for the NCBI servers. + - Fixed bug (jitterbug PR#84) in the parsing of alignments that would + cause occaisionally cause some significant hits to be missed + (incorporates code suggested by Peter van Heusden). + - Added support to Blast.pm for not parsing alignment data + using the new NO_ALIGNS parsing option. Also added -noaligns + command-line option to blast_config.pl (addresses jitterbug PR#93). + - Blast::hit() method doesn't blow up if there are no hits + (fixes jitterbug PR#129). + - Addressed jitterbug PR#96 by correcting docs. If the significance + is defined but of bad format, no default is set. + - HSP.pm properly handles bit scores expressed using floating point + and scientific notation (fixes jitterbug report PR#110). + - Fixed logic error in HSP.pm that would incorrectly throw an + exception when the number of identical residues was zero + (fixes jitterbug PR#164). + - HSP::strand() returns -1, 0, 1 instead of 'Minus', 'Plus', '' + to be in-line with with behavior of SeqFeature. + +Fixes within Bio::Root +---------------------- + - Fixed bug reported in PR#85. Bio::Root::Utilities::get_newline() + returns "\n" by default. + - IOManager::read doesn't call alarm() if it isn't available + (fixes bug #PR98). + - Root::Object::_set_io() does not pass @_ to the IOManager constructor + call (incorporates suggestion from Scot Harker, jitterbut PR#192). + + +0.085 Tue Jun 29 05:15:40 1999 (bioperl 0.05.1) +(Note: the $VERSION variable was not changed to 0.085 in this release!) + - Fixed bugs reported in PR#s: 30, 31, 41, 44, 45, 47, 48 + - Blast.pm no longer throws exceptions for hitless reports, + even if a -signif criterion is specified. + - Better support for PSI Blasts (running and parsing) based on + comments/suggestions from Ross Crowhurst and Lu Sun. + - Bug fixes in Webblast.pm regarding blastServerURL and filenames + with special characters (bug reports from James Diggans and + Bradford Powell, respectively). + - Removed =head3 tags that were causing pod2man warnings. + - Fixed -best parsing switch. + - Added better POD to Webblast.pm + +0.080 Sat Apr 24 19:39:37 1999 (bioperl 0.05) + - Complete re-implementation of the Blast report parsing strategy. + The parser now always streamifies all Blast report(s) and + segments them hit-wise instead of report-wise. This improves + performance and memory usage (but doesn't solve the memory leak). + - Documented a workaround for the memory leak (see the docs for + Blast.pm under the section "Memory Usage Issues"). + - Webblast.pm now uses LWP instead of postclient.pl to post network + Blast requests (thanks to Ross Crowhurst for code submission). + This is a much more robust mechanism, although it now requires you + to have LWP installed ("a good thing"). For example, it makes it + easy to send requests and receive results through a firewall. + It can run Blast1, Blast2, and PSI-Blast at NCBI and also supports + WashU-Blast servers. + - Fixed some miscellaneous parsing bugs and errors reported by + Greg Colello, Xie Tao, and Richard Copley from the pre-release + - Updated example/blast scripts. Renamed parse.pl and parse2.pl to + parse_blast.pl and parse_blast2.pl. parse_stream.pl is gone + (since parse_blast.pl now handles both files and streams). + Renamed run.pl to run_blast_remote.pl and added additional + options to keep it in synch with the new Webblast.pm. + + +0.075 Wed Feb 17 01:29:57 1999 (bioperl 0.04.4) + - Bug fix in Blast::Run::postclient.pl which was always printing + usage information (reported by Ross Crowhurst). + - Improved error detection & reporting in Blast::Run::Webblast + and Blast::Run::postclient.pl. + - Bug fixes in HSP.pm: _set_match_stats() now properly handles + BLASTN output format and matches() properly handles BLASTX + reports. + - Fixed detection of substitution matrix type in Blast.pm. + - Updated Blast example scripts in examples/blast/ + (in particular, run.pl). + +0.074 Wed Feb 3 04:10:06 1999 (bioperl 0.04.3) + - Modified blast_config.pl to make the -prog and -db command-line + arguments mandatory. There are no default values for $opt_blast + and $opt_db. Also removed the -dna command-line argument. + - Fixed alignment section parsing bug in Blast.pm (bug report #24); + affects _parse_hsp_string(). + - Modified Blast::_display_hits() to use table_tiled(), thus + avoiding the buggy Sbjct::_display_hits(). + - Bug fix in Blast::HTML.pm regarding detection of HTML formatting. + - Changes in HSP::_set_data() to support BLAST-GP version 2.0.8. + - Changed the #! line of Bio/Tools/Blast/Run/postclient.pl + to the more standard /usr/bin/perl and predeclared some vars + to avoid compiler warnings. + - blast_config.pl and seqtools.pl were modified to include support + for the -wait option (sets timeout during file/stream reading). + - Assorted touch-ups to example/blast scripts. + +0.073 Mon Dec 28 15:18:29 1998 (bioperl 0.04.2) + - Made changes in Sbjct.pm and HSP.pm to allow parsing of older + versions of Blast. This affected HSP::_set_data() ("Query:?") + and Sbjct::_set_hsps() ("\s*Score"). + - Improved error handling in Sbjct::_set_hsps() and error + detection in HSP_set_data(). + - Added methods for obtaining the number of identical and conserved + residues (sans gaps) sequences in HSP.pm (num_identical, + num_conserved). + - Modified the default behavior of frac_identical() and + frac_conserved() to use the procedure used in the BLAST report: + BLAST-GP: uses total length of alignment in denominator + WU-BLAST: uses length of query sequence (sans gaps) in denominator. + When these methods are called with an argument ('sbjct' or 'query'), + the length of the sbjct or query sequence (sans gaps) is used in + the denominator (recommended). + - Fixed error in test #29 in the t/Blast.t test script. + +0.072 Wed Dec 16 05:05:21 1998 (bioperl 0.04.1) + - Fixed out of range exception in HSP::matches() as suggested by + Michael Lonetto in bio.perl.org bug report #11. + - Made changes in Sbjct.pm and HSP.pm to deal with the reporting of + TBLAST[NX] data in amino acid coordinate space. This affects + frac_identical() and frac_conserved() in HSP.pm, and _tile_hsps(), + length_aln(), frac_aligned_query(), frac_aligned_hit(), + frac_unaligned_query(), frac_unaligned_hit() in Sbjct.pm. + These also in response to bug report #11. + - Fixed behavior of frac_identical() and frac_aligned() in + both Sbjct.pm and HSP.pm to correspond to the data reported + by Blast. Default behavior now includes gaps instead of ignoring + them as before. This was in response to a report by Eli Venter. + - Cleaned up a few "uninitialized value" warnings. + +0.071 Thu Dec 10 18:41:51 1998 (bioperl 0.04) + - Bio::Tools::Blast::Sbjct::_set_id() to no longer uppercases + sequence ids for sbjct sequences. (Note however that the Blast + dataset may have been built with a tool that uppercases all + sequence ids. Check your raw Blast report.). + - Bio::Tools::Blast::HTML.pm incorporates the new URL for + information about the Bioperl Blast module. + +0.07 Thu Dec 3 13:43:06 1998 + - Reduced memory leak that becomes apparent when parsing many Blast + reports. This resulted in changes to Bio::Tools::Blast::_get_parse_func() + and the addition of a new function to Bio::Root::Object.pm that + explicitly breaks cyclic object reference structures. + (This problem was highlighted by recently Lincoln Stein, who is has + created a BoulderIO-based Blast parser. See BoulderIO-1.10 or later). + - Fixed support in blast_config.pl for supplying a custom filtering + function as a string using the -filt_func command-line option. + Added some examples of this to the parse.pl example script. + - Added start() and end() methods to Bio::Tools::Blast::HSP + and Bio::Tools::Blast::Sbjct.pm. Sbjct also received a range() + method (based on discussion with Ewan Birney). + - Fixed bugs in Blast.pm related to accessing the name of the + database, expecially in conjunction with stream parsing. + - A few fixes in Blast.pm and Sbjct regarding handling of + hit sequence and database names (reported by John Calley and + and Karl-Heinz Ott). + - Fixed regexp in Blast::_parse_hsp_data (reported by Brian Karlak). + - Added columns for start/end of the query/sbjct sequences to the + table_tiled() output. Added columns for HSP query/sbjct strand + and frame information to the table() output. + - Changes in Bio::Root::Object.pm regarding strict() and verbose() + which are no longer object-specific. + - Increased documentation in Bio::Root::Object.pm. + - Removed autoloading and several deprecated methods from the + Bio::Root::Object and Utilities. + + +0.062 Fri Sep 4 00:45:05 1998 + - Fixed bugs in Blast::_set_hits() which caused no hits to be + saved when _parse_hsp_data() threw an exception. + - Assorted documentation changes. This includes a fix in + the docs of Bio::Root::Object.pm that caused a compilation error + when you try to use it without installing it via Makefile.PL. + Most modules now have docs for data members in their PODs. + - Added method strip_html() to Bio::Tools::WWW.pm (a more general + version than that found in Bio::Tools::Blast::HTML.pm). + - Includes Bio::Seq.pm version 0.050 (formerly Bio::PreSeq.pm). + This version has Ewan Birney's modifications (start() and end() + methods added; numbering() now deprecated). + - By popular demand, Bio::Root::Object::src_obj() has been changed + to the more intuitive parent(). All modules have been updated to + incorporate this change. + +0.061 Wed Aug 26 12:30:34 1998 + - Fixed memory leak problem associated with Blast::_set_hits() that + lead to excessive memory usage when parsing large numbers reports, + expecially when using the -check_all=>1 parsing option. + (This may have solved the parse_stream.pl memory leak.) + - Fixed regexp in Sbjct::_set_hsps() to recognize sequence lengths + containing commas (reported by Lincoln Stein). + - Blast::signif() now always returns number in scientific notation. + - Fixed bug in Bio::Root::IOManager::display() (wasn't returning true). + - Changed Makefile.PL to always save backups for in-place edits + and prevents the 'inplace edit without backup' warning (reported by + Bobby Otillar). + - Assorted documentation changes/additions. + +0.06 Sun Aug 16 18:31:27 1998 + - Fixed bug in HSP::_set_match_stats() that failed to recognize Frames + on the + strand. + - Added HSP::strand() method for retrieval of strand information for + query and sbjct HSP sequences from nucleotide Blasts. + - Changed calls to Perl's length() function to CORE::length() + to avoid confusing the Perl 5.005_001 interpreter with + Bio::Tools::HSP::length() and Bio::Tools::Sbjct::length(). + (Reported by Mike Cariaso). + - Added column to the output of the Blast::table_tiled() method + to include fraction-aligned of the sbjct sequence. + - Uses new version of Bio::Root::Object.pm that has new exception + handling policy: exceptions and warnings are no longer by default + attached to the objects that threw them. See notes about this in + Bio::Root::Object.pm. (Main reason for 0.003-point jump in version). + - Added some safety net methods in Sbjct.pm: frac_aligned_sbjct() + and num_unaligned_sbjct(), which map to frac_aligned_hit() and + num_unaligned_hit(), respectively (API consistency issue). + - Minor changes/fixes in some of the demo scripts (seqtools.pl et al.). + - Removed "Modifications" notes from Blast-related modules. Added + pointer to distribution site instead, which consolidates this info. + - Modules can be used by copying directly from the lib/ directory into + your perl/lib directory without loss of functionality but at the expense of + increased startup time. It is still recommended that you use the standard + Makefile.PL procedure, however, since this will properly autosplit + the modules as well as run other checks. + +0.057 Thu Jul 23 02:28:26 1998 + - HSP.pm handles "match" sequence lines that are not properly padded. + Affects HSP::_set_data() and other HSP.pm methods that use residue + strings or indices (seq_str(), seq_inds()). (Reported by Libby Shoop). + - Added signif_fmt() method to Blast.pm to allow retrieval of + P/Expect value data in different formats (e.g., exponent only). + - Changes in the table() methods of Blast.pm to accomodate signif_fmt(). + - Minor changes in destructors to improve memory management. + - Added warnings in Blast.pm and parse_stream.pl regarding + memory usage issue when parsing Blast streams. + - Updated HTML::get_html_func() to work with the new version of + Bio::Root::IOManager::read(). + - Added new rexexps to HTML::_markup_report() to deal with hits + which lack a database identifier (e.g., gi|210095). + Added a few other SGD-specific markups to this method. + - New demo script "parse_multi.pl" provided as an alternative to the + memory-hungry parse_stream.pl when crunching many reports. + - Minor changes to parsing demo scripts and blast_config.pl. + - Added more tests to the t/blast.t test script. + - Small change in error handling in Webblast::_blast(): detects + when the Blast report file is empty and unlinks it. + - When the -email option is used, the list of files returned by + blast_remote() contains the string 'email' as the first element. + This provides a signal to Bio::Tools::Blast.pm. + +0.056 Wed Jul 15 03:42:20 1998 + - Fixed bug in HSP::seq_str() in which white space was not removed + from sequences generated by merging multiple lines, leading + to excessive warnings when building Bio::PreSeq objects. + This also involved related fixes in Bio::PreSeq.pm and + Bio::Root::Utilities.pm (q.v.). (Reported by Tim Dudgeon). + - Fixed bug in PreSeq.pm that caused bounds checking to fail + incorrectly when sequence numbering does not begin at 1. + - Uncompressing Blast reports no longer requires write access + in the directory containing the compressed files. + - Added another example script eg/blast/parse2.pl. + - Bio::Root::IOManager::read() now requires the supplied parsing + function reference to return true for each record parsed + (otherwise parsing stops). + +0.055 Tue Jun 23 13:56:34 1998 + - Fixed parsing problems for hit sequences without descriptions + or without database identifiers. This affected + Blast::_parse_hsp_data() and in Sbjct.pm, the methods + _set_id(), _set_desc(), and _set_hsps(). Most seriously, + _set_hsps() failed to locate the 'Length = ' line for + sequences without descriptions. + - Parses parameter data from NCBI Blast 2.0.5 (format change from 2.0.4) + This affected Blast::_set_parameters() and Blast::_parse_hsp_data(). + (Reported by Elliot Lefkowitz.) + - Minor change in HSP::_set_seq() regexp for identifying sequence data. + - Table output methods remove tabs from ends of lines. + - Updated Bio::Tools::Blast::HTML.pm to work with the + new version of read() updated in 0.054. + - Improved file_date() method of Bio::Root::Utilities.pm + +0.054 Tue Jun 16 20:24:36 1998 + - Minor change in the way Bio::Root::IOManager::read() works + (now checks return value of called func ref). Only affects + module internals. + - Minor changes in eg/ scripts and some documentation additions. + - Fixed a bug in Makefile.PL regarding detection of UnivAln.pm. + +0.053 Fri Jun 12 20:55:27 1998 + - Added the "collapse" functionality to the seq_inds() method of + HSP.pm and added a seq_inds() method to Sbjct.pm. + - Added a new demo script eg/blast/parse_positions.pl. + - Remove the "-residues" and "-tile_hsps" parsing options since + they are not necessary. + +0.052 Thu Jun 11 20:18:45 1998 + - Minor bug fixes in I/O methods. + - Improved documentation for Bio::Root modules. + - Miscellaneous documentation additions. + +0.051 Wed Jun 10 12:26:06 1998 + - Bio::Tools::Blast::HSP.pm does some additional deferring of + sequence data processing for a slight performance enhancement. + - Fixed formatting bug in Bio::Tools::Blast::HTML.pm that caused + uneven formatting of some description lines. + - Minor bug fixes in parsing code of Bio::Tools::Blast.pm. + - Some documentation additions. + - Makefile prints warning if Perl version is < 5.003. + +0.05 Fri Jun 5 15:58:22 1998 + - Improved gapped Blast support. + - Enhanced IO (reads from existing file or STDIN). + - Enhanced parsing: can parse an input stream containing + many reports. + - Created modularized version of Alex Dong Li's webblast. + - Added support for running Blasts remotely at NCBI using + Bio::Tools::Blast::Run::Webblast.pm. + - Added support for running local Blasts using the skeletal + Bio::Tools::Blast::Run::LocalBlast.pm. + _ Bio::Tools::Blast::HSP.pm can create Bio::PreSeq.pm sequence + objects and Bio::UnivAln.pm sequence alignment objects on demand. + - Encapsulated HTML-formatting code into the new module + Bio::Tools::Blast::HTML.pm. Updated markup regexps. + - Blast.pm module can parse HTML-formatted reports + (but beware of unforseen changes in the HTML!). + - Enhanced tab-delimited table output methods. + - Improved documentation: compatible with 5.004 pod2html. + - Created a variety of demo scripts (see eg/blast/) + - Miscellaneous bug fixes and API adjustments.. + - Bundled for distribution with MakeMaker and created test sript. + +0.03 Fri Mar 27 23:35:32 1998 + - Support for gapped Blast added. + - Improved parsing strategy. + - Inherits from Bio::Tools::SeqAnal.pm instead of Bio::Root::Object.pm + - Added more documentation. + - Miscellaneous bug fixes. + +0.01 March 1996 + - original version. + - Parses Blast 1.x report files. + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Blast/HSP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Blast/HSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1863 @@ +#---------------------------------------------------------------------------- +# PACKAGE : Bio::Tools::Blast::HSP +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : March 1996 +# STATUS : Alpha +# REVISION: $Id: HSP.pm,v 1.18 2002/10/22 07:38:48 lapp Exp $ +# +# For the latest version and documentation, visit the distribution site: +# http://genome-www.stanford.edu/perlOOP/bioperl/blast/ +# +# To generate documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#---------------------------------------------------------------------------- + +package Bio::Tools::Blast::HSP; + +use Bio::Root::Global qw(:devel); +use Bio::Root::Object (); +#use Bio::Root::Err qw(:std); + +@ISA = qw( Bio::Root::Object); + +use strict; +use vars qw($ID $GAP_SYMBOL @SCORE_CUTOFFS $Revision %STRAND_SYMBOL); +$ID = 'Bio::Tools::Blast::HSP'; +$Revision = '$Id: HSP.pm,v 1.18 2002/10/22 07:38:48 lapp Exp $'; #' + +$GAP_SYMBOL = '-'; # Need a more general way to handle gap symbols. +@SCORE_CUTOFFS = ( 100, 30 ); # Bit score cutoffs (see homol_score()). +%STRAND_SYMBOL = ('Plus' => 1, 'Minus' => -1); + +## POD Documentation: + +=head1 NAME + +Bio::Tools::Blast::HSP - Bioperl BLAST High-Scoring Segment Pair object + +=head1 SYNOPSIS + +=head2 Object Creation + +The construction of HSP objects is handled by Bio::Tools::Blast:: Sbjct.pm. +You should not need to use this package directly. See L<_initialize()|_initialize> +for a description of constructor parameters. + + require Bio::Tools::Blast::HSP; + + $hspObj = eval{ new Bio::Tools::Blast::HSP(-DATA =>\@hspData, + -PARENT =>$sbjct_object, + -NAME =>$hspCount, + -PROGRAM =>'TBLASTN', + ); + }; + +@hspData includes the raw BLAST report data for a specific HSP, +and is prepared by Bio::Tools::Blast::Sbjct.pm. + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +The Bio::Tools::Blast::HSP.pm module encapsulates data and methods for +manipulating, parsing, and analyzing HSPs ("High-scoring Segment Pairs") +derived from BLAST sequence analysis. + +This module is a utility module used by the B<Bio::Tools::Blast::Sbjct.pm> +and is not intended for separate use. Please see documentation for +B<Bio::Tools::Blast.pm> for some basic information about using +HSP objects (L<Links:>). + +=over 0 + +=item * Supports BLAST versions 1.x and 2.x, gapped and ungapped. + +=back + +Bio::Tools::Blast::HSP.pm has the ability to extract a list of all +residue indices for identical and conservative matches along both +query and sbjct sequences. Since this degree of detail is not always +needed, this behavior does not occur during construction of the HSP +object. These data will automatically be collected as necessary as +the HSP.pm object is used. + +=head1 DEPENDENCIES + +Bio::Tools::Blast::HSP.pm is a concrete class that inherits from +B<Bio::Root::Object.pm> and relies on B<Bio::Tools::Sbjct.pm> as a +container for HSP.pm objects. B<Bio::Seq.pm> and B<Bio::UnivAln.pm> +are employed for creating sequence and alignment objects, +respectively. + + +=head2 Relationship to UnivAln.pm & Seq.pm + +HSP.pm can provide the query or sbjct sequence as a B<Bio::Seq.pm> +object via the L<seq()|seq> method. The HSP.pm object can also create a +two-sequence B<Bio::UnivAln.pm> alignment object using the the query +and sbjct sequences via the L<get_aln()|get_aln> method. Creation of alignment +objects is not automatic when constructing the HSP.pm object since +this level of functionality is not always required and would generate +a lot of extra overhead when crunching many reports. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz, E<lt>sac@bioperl.orgE<gt> + +=head1 SEE ALSO + + Bio::Tools::Blast::Sbjct.pm - Blast hit object. + Bio::Tools::Blast.pm - Blast object. + Bio::Seq.pm - Biosequence object + Bio::UnivAln.pm - Biosequence alignment object. + Bio::Root::Object.pm - Proposed base class for all Bioperl objects. + +=head2 Links: + + http://bio.perl.org/Core/POD/Tools/Blast/Sbjct.pm.html + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project + http://bio.perl.org/ - Bioperl Project Homepage + +=head1 COPYRIGHT + +Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + + + +# +## +### +#### END of main POD documentation. +### +## +# + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B<not> considered part of the public interface and are described here +for documentation purposes only. + +=cut + +##################################################################################### +## CONSTRUCTOR ## +##################################################################################### + +=head2 _initialize + + Usage : n/a; automatically called by Bio::Root::Object::new() + : Bio::Tools::Blast::HSP.pm objects are constructed + : automatically by Bio::Tools::Sbjct.pm, so there is no need + : for direct consumption. + Purpose : Initializes HSP data and calls private methods to extract + : the data for a given HSP. + : Calls superclass constructor first (Bio::Root::Object.pm). + Returns : n/a + Argument : Named parameters passed from new(): + : All tags must be uppercase (does not call _rearrange()). + : -DATA => array ref containing raw data for one HSP. + : -PARENT => Sbjct.pm object ref. + : -NAME => integer (1..n). + : -PROGRAM => string ('TBLASTN', 'BLASTP', etc.). + +See Also : L<_set_data()|_set_data>, B<Bio::Root::Object::new()>, B<Bio::Tools::Blast::Sbjct::_set_hsps()> + +=cut + +#---------------- +sub _initialize { +#---------------- + my( $self, %param ) = @_; + + $self->SUPER::_initialize( %param ); + + # The gapped and program booleans may be needed after the HSP object + # is built. +# $self->{'_gapped'} = $param{-GAPPED} || 0; + $self->{'_prog'} = $param{-PROGRAM} || 0; + $self->_set_data( @{$param{-DATA}} ); +} + +##################################################################################### +## ACCESSORS ## +##################################################################################### + + +=head2 _set_data + + Usage : n/a; called automatically during object construction. + Purpose : Sets the query sequence, sbjct sequence, and the "match" data + : which consists of the symbols between the query and sbjct lines + : in the alignment. + Argument : Array (all lines from a single, complete HSP, one line per element) + Throws : Propagates any exceptions from the methods called ("See Also") + +See Also : L<_set_seq()|_set_seq>, L<_set_residues()|_set_residues>, L<_set_score_stats()|_set_score_stats>, L<_set_match_stats()|_set_match_stats>, L<_initialize()|_initialize> + +=cut + +#-------------- +sub _set_data { +#-------------- + my $self = shift; + my @data = @_; + my @queryList = (); # 'Query' = SEQUENCE USED TO QUERY THE DATABASE. + my @sbjctList = (); # 'Sbjct' = HOMOLOGOUS SEQUENCE FOUND IN THE DATABASE. + my @matchList = (); + my $matchLine = 0; # Alternating boolean: when true, load 'match' data. + my @linedat = (); + + $DEBUG and print STDERR "$ID: set_data()\n"; + + my($line, $aln_row_len, $length_diff); + $length_diff = 0; + + # Collecting data for all lines in the alignment + # and then storing the collections for possible processing later. + # + # Note that "match" lines may not be properly padded with spaces. + # This loop now properly handles such cases: + # Query: 1141 PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVIXXXXX 1200 + # PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVI + # Sbjct: 1141 PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVILSLKL 1200 + + foreach $line( @data ) { + next if $line =~ /^\s*$/; + + if( $line =~ /^ ?Score/ ) { + $self->_set_score_stats( $line ); + } elsif( $line =~ /^ ?(Identities|Positives|Strand)/ ) { + $self->_set_match_stats( $line ); + } elsif( $line =~ /^ ?Frame = ([\d+-]+)/ ) { + # Version 2.0.8 has Frame information on a separate line. + $self->{'_frame'} = $1; + } elsif( $line =~ /^(Query:?[\s\d]+)([^\s\d]+)/ ) { + push @queryList, $line; + $self->{'_match_indent'} = CORE::length $1; + $aln_row_len = (CORE::length $1) + (CORE::length $2); + $matchLine = 1; + } elsif( $matchLine ) { + # Pad the match line with spaces if necessary. + $length_diff = $aln_row_len - CORE::length $line; + $length_diff and $line .= ' 'x $length_diff; + push @matchList, $line; + $matchLine = 0; + } elsif( $line =~ /^Sbjct/ ) { + push @sbjctList, $line; + } + } + + # Storing the query and sbjct lists in case they are needed later. + # We could make this conditional to save memory. + $self->{'_queryList'} = \@queryList; + $self->{'_sbjctList'} = \@sbjctList; + + # Storing the match list in case it is needed later. + $self->{'_matchList'} = \@matchList; + + if(not defined ($self->{'_numIdentical'})) { + $self->throw("Can't parse match statistics.", + "Possibly a new or unrecognized Blast format."); + } + + if(!scalar @queryList or !scalar @sbjctList) { + $self->throw("Can't find query or sbjct alignment lines.", + "Possibly unrecognized Blast format."); + } +} + + + +=head2 _set_score_stats + + Usage : n/a; called automatically by _set_data() + Purpose : Sets various score statistics obtained from the HSP listing. + Argument : String with any of the following formats: + : blast2: Score = 30.1 bits (66), Expect = 9.2 + : blast2: Score = 158.2 bits (544), Expect(2) = e-110 + : blast1: Score = 410 (144.3 bits), Expect = 1.7e-40, P = 1.7e-40 + : blast1: Score = 55 (19.4 bits), Expect = 5.3, Sum P(3) = 0.99 + Throws : Exception if the stats cannot be parsed, probably due to a change + : in the Blast report format. + +See Also : L<_set_data()|_set_data> + +=cut + +#-------------------- +sub _set_score_stats { +#-------------------- + my ($self, $data) = @_; + + my ($expect, $p); + + if($data =~ /Score = +([\d.e+-]+) bits \(([\d.e+-]+)\), +Expect = +([\d.e+-]+)/) { + # blast2 format n = 1 + $self->{'_bits'} = $1; + $self->{'_score'} = $2; + $expect = $3; + } elsif($data =~ /Score = +([\d.e+-]+) bits \(([\d.e+-]+)\), +Expect\((\d+)\) = +([\d.e+-]+)/) { + # blast2 format n > 1 + $self->{'_bits'} = $1; + $self->{'_score'} = $2; + $self->{'_n'} = $3; + $expect = $4; + + } elsif($data =~ /Score = +([\d.e+-]+) \(([\d.e+-]+) bits\), +Expect = +([\d.e+-]+), P = +([\d.e-]+)/) { + # blast1 format, n = 1 + $self->{'_score'} = $1; + $self->{'_bits'} = $2; + $expect = $3; + $p = $4; + + } elsif($data =~ /Score = +([\d.e+-]+) \(([\d.e+-]+) bits\), +Expect = +([\d.e+-]+), +Sum P\((\d+)\) = +([\d.e-]+)/) { + # blast1 format, n > 1 + $self->{'_score'} = $1; + $self->{'_bits'} = $2; + $expect = $3; + $self->{'_n'} = $4; + $p = $5; + + } else { + $self->throw("Can't parse score statistics: unrecognized format.", "$data"); + } + + $expect = "1$expect" if $expect =~ /^e/i; + $p = "1$p" if defined $p and $p=~ /^e/i; + + $self->{'_expect'} = $expect; + $self->{'_p'} = $p || undef; + +} + + + +=head2 _set_match_stats + + Usage : n/a; called automatically by _set_data() + Purpose : Sets various matching statistics obtained from the HSP listing. + Argument : blast2: Identities = 23/74 (31%), Positives = 29/74 (39%), Gaps = 17/74 (22%) + : blast2: Identities = 57/98 (58%), Positives = 74/98 (75%) + : blast1: Identities = 87/204 (42%), Positives = 126/204 (61%) + : blast1: Identities = 87/204 (42%), Positives = 126/204 (61%), Frame = -3 + : WU-blast: Identities = 310/553 (56%), Positives = 310/553 (56%), Strand = Minus / Plus + Throws : Exception if the stats cannot be parsed, probably due to a change + : in the Blast report format. + Comments : The "Gaps = " data in the HSP header has a different meaning depending + : on the type of Blast: for BLASTP, this number is the total number of + : gaps in query+sbjct; for TBLASTN, it is the number of gaps in the + : query sequence only. Thus, it is safer to collect the data + : separately by examining the actual sequence strings as is done + : in _set_seq(). + +See Also : L<_set_data()|_set_data>, L<_set_seq()|_set_seq> + +=cut + +#-------------------- +sub _set_match_stats { +#-------------------- + my ($self, $data) = @_; + + if($data =~ m!Identities = (\d+)/(\d+)!) { + # blast1 or 2 format + $self->{'_numIdentical'} = $1; + $self->{'_totalLength'} = $2; + } + + if($data =~ m!Positives = (\d+)/(\d+)!) { + # blast1 or 2 format + $self->{'_numConserved'} = $1; + $self->{'_totalLength'} = $2; + } + + if($data =~ m!Frame = ([\d+-]+)!) { + $self->{'_frame'} = $1; + } + + # Strand data is not always present in this line. + # _set_seq() will also set strand information. + if($data =~ m!Strand = (\w+) / (\w+)!) { + $self->{'_queryStrand'} = $1; + $self->{'_sbjctStrand'} = $2; + } + +# if($data =~ m!Gaps = (\d+)/(\d+)!) { +# $self->{'_totalGaps'} = $1; +# } else { +# $self->{'_totalGaps'} = 0; +# } +} + + + +=head2 _set_seq_data + + Usage : n/a; called automatically when sequence data is requested. + Purpose : Sets the HSP sequence data for both query and sbjct sequences. + : Includes: start, stop, length, gaps, and raw sequence. + Argument : n/a + Throws : Propagates any exception thrown by _set_match_seq() + Comments : Uses raw data stored by _set_data() during object construction. + : These data are not always needed, so it is conditionally + : executed only upon demand by methods such as gaps(), _set_residues(), + : etc. _set_seq() does the dirty work. + +See Also : L<_set_seq()|_set_seq> + +=cut + +sub _set_seq_data { + my $self = shift; + + $self->_set_seq('query', @{$self->{'_queryList'}}); + $self->_set_seq('sbjct', @{$self->{'_sbjctList'}}); + + # Liberate some memory. + @{$self->{'_queryList'}} = @{$self->{'_sbjctList'}} = (); + undef $self->{'_queryList'}; + undef $self->{'_sbjctList'}; + + $self->{'_set_seq_data'} = 1; +} + + + +=head2 _set_seq + + Usage : n/a; called automatically by _set_seq_data() + : $hsp_obj->($seq_type, @data); + Purpose : Sets sequence information for both the query and sbjct sequences. + : Directly counts the number of gaps in each sequence (if gapped Blast). + Argument : $seq_type = 'query' or 'sbjct' + : @data = all seq lines with the form: + : Query: 61 SPHNVKDRKEQNGSINNAISPTATANTSGSQQINIDSALRDRSSNVAAQPSLSDASSGSN 120 + Throws : Exception if data strings cannot be parsed, probably due to a change + : in the Blast report format. + Comments : Uses first argument to determine which data members to set + : making this method sensitive data member name changes. + : Behavior is dependent on the type of BLAST analysis (TBLASTN, BLASTP, etc). + Warning : Sequence endpoints are normalized so that start < end. This affects HSPs + : for TBLASTN/X hits on the minus strand. Normalization facilitates use + : of range information by methods such as match(). + +See Also : L<_set_seq_data()|_set_seq_data>, L<matches()|matches>, L<range()|range>, L<start()|start>, L<end()|end> + +=cut + +#------------- +sub _set_seq { +#------------- + my $self = shift; + my $seqType = shift; + my @data = @_; + my @ranges = (); + my @sequence = (); + my $numGaps = 0; + + foreach( @data ) { + if( m/(\d+) *(\D+) *(\d+)/) { + push @ranges, ( $1, $3 ) ; + push @sequence, $2; + } else { + $self->warn("Bad sequence data: $_"); + } + } + + (scalar(@sequence) and scalar(@ranges)) || $self->throw("Can't set sequence: missing data", + "Possibly unrecognized Blast format."); + + # Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + $self->{$seqType.'Start'} = $ranges[0]; + $self->{$seqType.'Stop'} = $ranges[ $#ranges ]; + $self->{$seqType.'Seq'} = \@sequence; + + $self->{$seqType.'Length'} = abs($ranges[ $#ranges ] - $ranges[0]) + 1; + + # Adjust lengths for BLASTX, TBLASTN, TBLASTX sequences + # Converting nucl coords to amino acid coords. + + my $prog = $self->{'_prog'}; + if($prog eq 'TBLASTN' and $seqType eq '_sbjct') { + $self->{$seqType.'Length'} /= 3; + } elsif($prog eq 'BLASTX' and $seqType eq '_query') { + $self->{$seqType.'Length'} /= 3; + } elsif($prog eq 'TBLASTX') { + $self->{$seqType.'Length'} /= 3; + } + + $self->{$seqType.'Strand'} = 'Plus' if $prog =~ /BLAST[NX]/; + + # Normalize sequence endpoints so that start < end. + # Reverse complement or 'minus strand' HSPs get flipped here. + if($self->{$seqType.'Start'} > $self->{$seqType.'Stop'}) { + ($self->{$seqType.'Start'}, $self->{$seqType.'Stop'}) = + ($self->{$seqType.'Stop'}, $self->{$seqType.'Start'}); + $self->{$seqType.'Strand'} = 'Minus'; + } + + ## Count number of gaps in each seq. Only need to do this for gapped Blasts. +# if($self->{'_gapped'}) { + my $seqstr = join('', @sequence); + $seqstr =~ s/\s//g; + my $num_gaps = CORE::length($seqstr) - $self->{$seqType.'Length'}; + $self->{$seqType.'Gaps'} = $num_gaps if $num_gaps > 0; +# } +} + + +=head2 _set_residues + + Usage : n/a; called automatically when residue data is requested. + Purpose : Sets the residue numbers representing the identical and + : conserved positions. These data are obtained by analyzing the + : symbols between query and sbjct lines of the alignments. + Argument : n/a + Throws : Propagates any exception thrown by _set_seq_data() and _set_match_seq(). + Comments : These data are not always needed, so it is conditionally + : executed only upon demand by methods such as seq_inds(). + : Behavior is dependent on the type of BLAST analysis (TBLASTN, BLASTP, etc). + +See Also : L<_set_seq_data()|_set_seq_data>, L<_set_match_seq()|_set_match_seq>, L<seq_inds()|seq_inds> + +=cut + +#------------------ +sub _set_residues { +#------------------ + my $self = shift; + my @sequence = (); + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + # Using hashes to avoid saving duplicate residue numbers. + my %identicalList_query = (); + my %identicalList_sbjct = (); + my %conservedList_query = (); + my %conservedList_sbjct = (); + + my $aref = $self->_set_match_seq() if not ref $self->{'_matchSeq'}; + $aref ||= $self->{'_matchSeq'}; + my $seqString = join('', @$aref ); + + my $qseq = join('',@{$self->{'_querySeq'}}); + my $sseq = join('',@{$self->{'_sbjctSeq'}}); + my $resCount_query = $self->{'_queryStop'} || 0; + my $resCount_sbjct = $self->{'_sbjctStop'} || 0; + + my $prog = $self->{'_prog'}; + if($prog !~ /^BLASTP|^BLASTN/) { + if($prog eq 'TBLASTN') { + $resCount_sbjct /= 3; + } elsif($prog eq 'BLASTX') { + $resCount_query /= 3; + } elsif($prog eq 'TBLASTX') { + $resCount_query /= 3; + $resCount_sbjct /= 3; + } + } + + my ($mchar, $schar, $qchar); + while( $mchar = chop($seqString) ) { + ($qchar, $schar) = (chop($qseq), chop($sseq)); + if( $mchar eq '+' ) { + $conservedList_query{ $resCount_query } = 1; + $conservedList_sbjct{ $resCount_sbjct } = 1; + } elsif( $mchar ne ' ' ) { + $identicalList_query{ $resCount_query } = 1; + $identicalList_sbjct{ $resCount_sbjct } = 1; + } + $resCount_query-- if $qchar ne $GAP_SYMBOL; + $resCount_sbjct-- if $schar ne $GAP_SYMBOL; + } + $self->{'_identicalRes_query'} = \%identicalList_query; + $self->{'_conservedRes_query'} = \%conservedList_query; + $self->{'_identicalRes_sbjct'} = \%identicalList_sbjct; + $self->{'_conservedRes_sbjct'} = \%conservedList_sbjct; + +} + + + + +=head2 _set_match_seq + + Usage : n/a. Internal method. + : $hsp_obj->_set_match_seq() + Purpose : Set the 'match' sequence for the current HSP (symbols in between + : the query and sbjct lines.) + Returns : Array reference holding the match sequences lines. + Argument : n/a + Throws : Exception if the _matchList field is not set. + Comments : The match information is not always necessary. This method + : allows it to be conditionally prepared. + : Called by _set_residues>() and seq_str(). + +See Also : L<_set_residues()|_set_residues>, L<seq_str()|seq_str> + +=cut + +#------------------- +sub _set_match_seq { +#------------------- + my $self = shift; + +## DEBUGGING CODE: +# if($self->parent->name eq '1AK5_' and $self->parent->parent->name eq 'YAR073W') { +# print "\n_set_match_seq() called for HSP ", $self->name, " of hit ${\$self->parent->name} in query ${\$self->parent->parent->name}"; <STDIN>; + # } + + ref($self->{'_matchList'}) || $self->throw("Can't set HSP match sequence: No data"); + + my @data = @{$self->{'_matchList'}}; + + my(@sequence); + foreach( @data ) { + chomp($_); + ## Remove leading spaces; (note: aln may begin with a space + ## which is why we can't use s/^ +//). + s/^ {$self->{'_match_indent'}}//; + push @sequence, $_; + } + # Liberate some memory. + @{$self->{'_matchList'}} = undef; + $self->{'_matchList'} = undef; + + $self->{'_matchSeq'} = \@sequence; + +## DEBUGGING CODE: +# if($self->parent->name eq '1AK5_' and $self->parent->parent->name eq 'YAR073W') { +# print "RETURNING: $self->{'_matchSeq'}:\n @{$self->{'_matchSeq'}}";<STDIN>; +# } + + $self->{'_matchSeq'}; +} + + + +=head2 score + + Usage : $hsp_obj->score() + Purpose : Get the Blast score for the HSP. + Returns : Integer + Argument : n/a + Throws : n/a + +See Also : L<bits()|bits> + +=cut + +#--------- +sub score { my $self = shift; $self->{'_score'}; } +#--------- + + + +=head2 bits + + Usage : $hsp_obj->bits() + Purpose : Get the Blast score in bits for the HSP. + Returns : Float + Argument : n/a + Throws : n/a + + +See Also : L<score()|score> + +=cut + +#-------- +sub bits { my $self = shift; $self->{'_bits'}; } +#-------- + + + +=head2 n + + Usage : $hsp_obj->n() + Purpose : Get the N value (num HSPs on which P/Expect is based). + : This value is not defined with NCBI Blast2 with gapping. + Returns : Integer or null string if not defined. + Argument : n/a + Throws : n/a + Comments : The 'N' value is listed in parenthesis with P/Expect value: + : e.g., P(3) = 1.2e-30 ---> (N = 3). + : Not defined in NCBI Blast2 with gaps. + : This typically is equal to the number of HSPs but not always. + : To obtain the number of HSPs, use Bio::Tools::Blast::Sbjct::num_hsps(). + +See Also : L<score()|score> + +=cut + +#----- +sub n { my $self = shift; $self->{'_n'} || ''; } +#----- + + + +=head2 frame + + Usage : $hsp_obj->frame() + Purpose : Get the reading frame number (-/+ 1, 2, 3) (TBLASTN/X only). + Returns : Integer or null string if not defined. + Argument : n/a + Throws : n/a + +=cut + +#--------- +sub frame { my $self = shift; $self->{'_frame'} || ''; } +#--------- + + + +=head2 signif() + + Usage : $hsp_obj->signif() + Purpose : Get the P-value or Expect value for the HSP. + Returns : Float (0.001 or 1.3e-43) + : Returns P-value if it is defined, otherwise, Expect value. + Argument : n/a + Throws : n/a + Comments : Provided for consistency with Sbjct::signif() + : Support for returning the significance data in different + : formats (e.g., exponent only), is not provided for HSP objects. + : This is only available for the Sbjct or Blast object. + +See Also : L<p()|p>, L<expect()|expect>, B<Bio::Tools::Blast::Sbjct::signif()> + +=cut + +#----------- +sub signif { +#----------- + my $self = shift; + my $val ||= defined($self->{'_p'}) ? $self->{'_p'} : $self->{'_expect'}; + $val; +} + + + +=head2 expect + + Usage : $hsp_obj->expect() + Purpose : Get the Expect value for the HSP. + Returns : Float (0.001 or 1.3e-43) + Argument : n/a + Throws : n/a + Comments : Support for returning the expectation data in different + : formats (e.g., exponent only), is not provided for HSP objects. + : This is only available for the Sbjct or Blast object. + +See Also : L<p()|p> + +=cut + +#---------- +sub expect { my $self = shift; $self->{'_expect'}; } +#---------- + + + +=head2 p + + Usage : $hsp_obj->p() + Purpose : Get the P-value for the HSP. + Returns : Float (0.001 or 1.3e-43) or undef if not defined. + Argument : n/a + Throws : n/a + Comments : P-value is not defined with NCBI Blast2 reports. + : Support for returning the expectation data in different + : formats (e.g., exponent only) is not provided for HSP objects. + : This is only available for the Sbjct or Blast object. + +See Also : L<expect()|expect> + +=cut + +#----- +sub p { my $self = shift; $self->{'_p'}; } +#----- + + +=head2 length + + Usage : $hsp->length( [seq_type] ) + Purpose : Get the length of the aligned portion of the query or sbjct. + Example : $hsp->length('query') + Returns : integer + Argument : seq_type: 'query' | 'sbjct' | 'total' (default = 'total') + Throws : n/a + Comments : 'total' length is the full length of the alignment + : as reported in the denominators in the alignment section: + : "Identical = 34/120 Positives = 67/120". + : Developer note: when using the built-in length function within + : this module, call it as CORE::length(). + +See Also : L<gaps()|gaps> + +=cut + +#----------- +sub length { +#----------- + my( $self, $type ) = @_; + $type ||= 'total'; + + $type ne 'total' and $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + ## Sensitive to member name format. + $type = "_\L$type\E"; + $self->{$type.'Length'}; +} + + + +=head2 gaps + + Usage : $hsp->gaps( [seq_type] ) + Purpose : Get the number of gaps in the query, sbjct, or total alignment. + : Also can return query gaps and sbjct gaps as a two-element list + : when in array context. + Example : $total_gaps = $hsp->gaps(); + : ($qgaps, $sgaps) = $hsp->gaps(); + : $qgaps = $hsp->gaps('query'); + Returns : scalar context: integer + : array context without args: (int, int) = ('queryGaps', 'sbjctGaps') + Argument : seq_type: 'query' | 'sbjct' | 'total' + : (default = 'total', scalar context) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Throws : n/a + +See Also : L<length()|length>, L<matches()|matches> + +=cut + +#--------- +sub gaps { +#--------- + my( $self, $seqType ) = @_; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + $seqType ||= (wantarray ? 'list' : 'total'); + + if($seqType =~ /list|array/i) { + return (($self->{'_queryGaps'} || 0), ($self->{'_sbjctGaps'} || 0)); + } + + if($seqType eq 'total') { + return ($self->{'_queryGaps'} + $self->{'_sbjctGaps'}) || 0; + } else { + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Gaps'} || 0; + } +} + + + +=head2 matches + + Usage : $hsp->matches([seq_type], [start], [stop]); + Purpose : Get the total number of identical and conservative matches + : in the query or sbjct sequence for the given HSP. Optionally can + : report data within a defined interval along the seq. + : (Note: 'conservative' matches are called 'positives' in the + : Blast report.) + Example : ($id,$cons) = $hsp_object->matches('sbjct'); + : ($id,$cons) = $hsp_object->matches('query',300,400); + Returns : 2-element array of integers + Argument : (1) seq_type = 'query' | 'sbjct' (default = query) + : (2) start = Starting coordinate (optional) + : (3) stop = Ending coordinate (optional) + Throws : Exception if the supplied coordinates are out of range. + Comments : Relies on seq_str('match') to get the string of alignment symbols + : between the query and sbjct lines which are used for determining + : the number of identical and conservative matches. + +See Also : L<length()|length>, L<gaps()|gaps>, L<seq_str()|seq_str>, B<Bio::Tools::Blast::Sbjct::_adjust_contigs()> + +=cut + +#----------- +sub matches { +#----------- + my( $self, %param ) = @_; + my(@data); + my($seqType, $beg, $end) = ($param{-SEQ}, $param{-START}, $param{-STOP}); + $seqType ||= 'query'; + + if(!defined $beg && !defined $end) { + ## Get data for the whole alignment. + push @data, ($self->{'_numIdentical'}, $self->{'_numConserved'}); + } else { + ## Get the substring representing the desired sub-section of aln. + $beg ||= 0; + $end ||= 0; + my($start,$stop) = $self->range($seqType); + if($beg == 0) { $beg = $start; $end = $beg+$end; } + elsif($end == 0) { $end = $stop; $beg = $end-$beg; } + + if($end >= $stop) { $end = $stop; } ##ML changed from if (end >stop) + else { $end += 1;} ##ML moved from commented position below, makes + ##more sense here +# if($end > $stop) { $end = $stop; } + if($beg < $start) { $beg = $start; } +# else { $end += 1;} + +# my $seq = substr($self->seq_str('match'), $beg-$start, ($end-$beg)); + + ## ML: START fix for substr out of range error ------------------ + my $seq = ""; + if (($self->{'_prog'} eq 'TBLASTN') and ($seqType eq 'sbjct')) + { + $seq = substr($self->seq_str('match'), + int(($beg-$start)/3), int(($end-$beg+1)/3)); + + } elsif (($self->{'_prog'} eq 'BLASTX') and ($seqType eq 'query')) + { + $seq = substr($self->seq_str('match'), + int(($beg-$start)/3), int(($end-$beg+1)/3)); + } else { + $seq = substr($self->seq_str('match'), + $beg-$start, ($end-$beg)); + } + ## ML: End of fix for substr out of range error ----------------- + + + ## ML: debugging code + ## This is where we get our exception. Try printing out the values going + ## into this: + ## +# print STDERR +# qq(*------------MY EXCEPTION --------------------\nSeq: ") , +# $self->seq_str("$seqType"), qq("\n),$self->name,",( index:"; +# print STDERR $beg-$start, ", len: ", $end-$beg," ), (HSPRealLen:", +# CORE::length $self->seq_str("$seqType"); +# print STDERR ", HSPCalcLen: ", $stop - $start +1 ," ), +# ( beg: $beg, end: $end ), ( start: $start, stop: stop )\n"; + ## ML: END DEBUGGING CODE---------- + + if(!CORE::length $seq) { + $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop"); + } + ## Get data for a substring. +# printf "Collecting HSP subsection data: beg,end = %d,%d; start,stop = %d,%d\n%s<---\n", $beg, $end, $start, $stop, $seq; +# printf "Original match seq:\n%s\n",$self->seq_str('match'); + $seq =~ s/ //g; # remove space (no info). + my $len_cons = CORE::length $seq; + $seq =~ s/\+//g; # remove '+' characters (conservative substitutions) + my $len_id = CORE::length $seq; + push @data, ($len_id, $len_cons); +# printf " HSP = %s\n id = %d; cons = %d\n", $self->name, $len_id, $len_cons; <STDIN>; + } + @data; +} + + + +=head2 frac_identical + + Usage : $hsp_object->frac_identical( [seq_type] ); + Purpose : Get the fraction of identical positions within the given HSP. + Example : $frac_iden = $hsp_object->frac_identical('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'sbjct' | 'total' + : default = 'total' (but see comments below). + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : BLAST-GP uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : Therefore, when called without an argument or an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. + : + : To get the fraction identical among only the aligned residues, + : ignoring the gaps, call this method with an argument of 'query' + : or 'sbjct'. + +See Also : L<frac_conserved()|frac_conserved>, L<num_identical()|num_identical>, L<matches()|matches> + +=cut + +#------------------- +sub frac_identical { +#------------------- +# The value is calculated as opposed to storing it from the parsed results. +# This saves storage and also permits flexibility in determining for which +# sequence (query or sbjct) the figure is to be calculated. + + my( $self, $seqType ) = @_; + $seqType ||= 'total'; + + if($seqType ne 'total') { + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + } + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + + sprintf( "%.2f", $self->{'_numIdentical'}/$self->{$seqType.'Length'}); +} + + +=head2 frac_conserved + + Usage : $hsp_object->frac_conserved( [seq_type] ); + Purpose : Get the fraction of conserved positions within the given HSP. + : (Note: 'conservative' positions are called 'positives' in the + : Blast report.) + Example : $frac_cons = $hsp_object->frac_conserved('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'sbjct' + : default = 'total' (but see comments below). + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : BLAST-GP uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : Therefore, when called without an argument or an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. + : + : To get the fraction conserved among only the aligned residues, + : ignoring the gaps, call this method with an argument of 'query' + : or 'sbjct'. + +See Also : L<frac_conserved()|frac_conserved>, L<num_conserved()|num_conserved>, L<matches()|matches> + +=cut + +#-------------------- +sub frac_conserved { +#-------------------- +# The value is calculated as opposed to storing it from the parsed results. +# This saves storage and also permits flexibility in determining for which +# sequence (query or sbjct) the figure is to be calculated. + + my( $self, $seqType ) = @_; + $seqType ||= 'total'; + + if($seqType ne 'total') { + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + } + + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + + sprintf( "%.2f", $self->{'_numConserved'}/$self->{$seqType.'Length'}); +} + + +=head2 num_identical + + Usage : $hsp_object->num_identical(); + Purpose : Get the number of identical positions within the given HSP. + Example : $num_iden = $hsp_object->num_identical(); + Returns : integer + Argument : n/a + Throws : n/a + +See Also : L<num_conserved()|num_conserved>, L<frac_identical()|frac_identical> + +=cut + +#------------------- +sub num_identical { +#------------------- + my( $self) = shift; + + $self->{'_numIdentical'}; +} + + +=head2 num_conserved + + Usage : $hsp_object->num_conserved(); + Purpose : Get the number of conserved positions within the given HSP. + Example : $num_iden = $hsp_object->num_conserved(); + Returns : integer + Argument : n/a + Throws : n/a + +See Also : L<num_identical()|num_identical>, L<frac_conserved()|frac_conserved> + +=cut + +#------------------- +sub num_conserved { +#------------------- + my( $self) = shift; + + $self->{'_numConserved'}; +} + + + +=head2 range + + Usage : $hsp->range( [seq_type] ); + Purpose : Gets the (start, end) coordinates for the query or sbjct sequence + : in the HSP alignment. + Example : ($qbeg, $qend) = $hsp->range('query'); + : ($sbeg, $send) = $hsp->range('sbjct'); + Returns : Two-element array of integers + Argument : seq_type = string, 'query' or 'sbjct' (default = 'query') + : (case insensitive). + Throws : n/a + +See Also : L<start()|start>, L<end()|end> + +=cut + +#---------- +sub range { +#---------- + my ($self, $seqType) = @_; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + $seqType ||= 'query'; + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + + return ($self->{$seqType.'Start'},$self->{$seqType.'Stop'}); +} + +=head2 start + + Usage : $hsp->start( [seq_type] ); + Purpose : Gets the start coordinate for the query, sbjct, or both sequences + : in the HSP alignment. + Example : $qbeg = $hsp->start('query'); + : $sbeg = $hsp->start('sbjct'); + : ($qbeg, $sbeg) = $hsp->start(); + Returns : scalar context: integer + : array context without args: list of two integers + Argument : In scalar context: seq_type = 'query' or 'sbjct' + : (case insensitive). If not supplied, 'query' is used. + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Throws : n/a + +See Also : L<end()|end>, L<range()|range> + +=cut + +#---------- +sub start { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /list|array/i) { + return ($self->{'_queryStart'}, $self->{'_sbjctStart'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Start'}; + } +} + +=head2 end + + Usage : $hsp->end( [seq_type] ); + Purpose : Gets the end coordinate for the query, sbjct, or both sequences + : in the HSP alignment. + Example : $qbeg = $hsp->end('query'); + : $sbeg = $hsp->end('sbjct'); + : ($qbeg, $sbeg) = $hsp->end(); + Returns : scalar context: integer + : array context without args: list of two integers + Argument : In scalar context: seq_type = 'query' or 'sbjct' + : (case insensitive). If not supplied, 'query' is used. + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Throws : n/a + +See Also : L<start()|start>, L<range()|range> + +=cut + +#---------- +sub end { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /list|array/i) { + return ($self->{'_queryStop'}, $self->{'_sbjctStop'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Stop'}; + } +} + + + +=head2 strand + + Usage : $hsp_object->strand( [seq_type] ) + Purpose : Get the strand of the query or sbjct sequence. + Example : print $hsp->strand('query'); + : ($qstrand, $sstrand) = $hsp->strand(); + Returns : -1, 0, or 1 + : -1 = Minus strand, +1 = Plus strand + : Returns 0 if strand is not defined, which occurs + : for non-TBLASTN/X reports. + : In scalar context without arguments, returns queryStrand value. + : In array context without arguments, returns a two-element list + : of strings (queryStrand, sbjctStrand). + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : seq_type: 'query' | 'sbjct' or undef + Throws : n/a + +See Also : L<_set_seq()|_set_seq>, L<_set_match_stats()|_set_match_stats> + +=cut + +#----------- +sub strand { +#----------- + my( $self, $seqType ) = @_; + $seqType ||= (wantarray ? 'list' : 'query'); + + return '' if $seqType eq 'query' and $self->{'_prog'} eq 'TBLASTN'; + + ## Sensitive to member name format. + $seqType = "_\L$seqType\E"; + + # $seqType could be '_list'. + $self->{'_queryStrand'} or $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /list|array/i) { + return ('','') unless defined $self->{'_queryStrand'}; + return ($self->{'_queryStrand'}, $self->{'_sbjctStrand'}); + } + local $^W = 0; + $STRAND_SYMBOL{$self->{$seqType.'Strand'}} || 0; +} + + +##################################################################################### +## INSTANCE METHODS ## +##################################################################################### + + +=head2 seq + + Usage : $hsp->seq( [seq_type] ); + Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object. + Example : $seqObj = $hsp->seq('query'); + Returns : Object reference for a Bio::Seq.pm object. + Argument : seq_type = 'query' or 'sbjct' (default = 'query'). + Throws : Propagates any exception that occurs during construction + : of the Bio::Seq.pm object. + Comments : The sequence is returned in an array of strings corresponding + : to the strings in the original format of the Blast alignment. + : (i.e., same spacing). + +See Also : L<seq_str()|seq_str>, L<seq_inds()|seq_inds>, B<Bio::Seq.pm> + +=cut + +#------- +sub seq { +#------- + my($self,$seqType) = @_; + $seqType ||= 'query'; + my $str = $self->seq_str($seqType); + my $num = $self->name; + my $name = $seqType =~ /query/i + ? $self->parent->parent->name + : $self->parent->name; + + require Bio::Seq; + + new Bio::Seq (-ID => $name, + -SEQ => $str, + -DESC => "Blast HSP #$num, $seqType sequence", + ); +} + + + +=head2 seq_str + + Usage : $hsp->seq_str( seq_type ); + Purpose : Get the full query, sbjct, or 'match' sequence as a string. + : The 'match' sequence is the string of symbols in between the + : query and sbjct sequences. + Example : $str = $hsp->seq_str('query'); + Returns : String + Argument : seq_Type = 'query' or 'sbjct' or 'match' + Throws : Exception if the argument does not match an accepted seq_type. + Comments : Calls _set_residues() to set the 'match' sequence if it has + : not been set already. + +See Also : L<seq()|seq>, L<seq_inds()|seq_inds>, L<_set_match_seq()|_set_match_seq> + +=cut + +#------------ +sub seq_str { +#------------ + my($self,$seqType) = @_; + + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + if($seqType =~ /sbjct|query/) { + my $seq = join('',@{$self->{$seqType.'Seq'}}); + $seq =~ s/\s+//g; + return $seq; + + } elsif( $seqType =~ /match/i) { + # Only need to call _set_match_seq() if the match seq is requested. + my $aref = $self->_set_match_seq() unless ref $self->{'_matchSeq'}; + $aref = $self->{'_matchSeq'}; + +## DEBUGGING CODE: +# if($self->parent->name eq '1AK5_' and $self->parent->parent->name eq 'YAR073W') { +# print "seq_str():\n @$aref";<STDIN>; +# } + + return join('',@$aref); + + } else { + $self->throw("Invalid or undefined sequence type: $seqType", + "Valid types: query, sbjct, match"); + } +} + + + + +=head2 seq_inds + + Usage : $hsp->seq_inds( seq_type, class, collapse ); + Purpose : Get a list of residue positions (indices) for all identical + : or conserved residues in the query or sbjct sequence. + Example : @ind = $hsp->seq_inds('query', 'identical'); + : @ind = $hsp->seq_inds('sbjct', 'conserved'); + : @ind = $hsp->seq_inds('sbjct', 'conserved', 1); + Returns : List of integers + : May include ranges if collapse is true. + Argument : seq_type = 'query' or 'sbjct' (default = query) + : class = 'identical' or 'conserved' (default = identical) + : (can be shortened to 'id' or 'cons') + : (actually, anything not 'id' will evaluate to 'conserved'). + : collapse = boolean, if true, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + Comments : Calls _set_residues() to set the 'match' sequence if it has + : not been set already. + +See Also : L<seq()|seq>, L<_set_residues()|_set_residues>, L<collapse_nums()|collapse_nums>, B<Bio::Tools::Blast::Sbjct::seq_inds()> + +=cut + +#--------------- +sub seq_inds { +#--------------- + my ($self, $seq, $class, $collapse) = @_; + + $seq ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + + $self->_set_residues() unless defined $self->{'_identicalRes_query'}; + + $seq = ($seq !~ /^q/i ? 'sbjct' : 'query'); + $class = ($class !~ /^id/i ? 'conserved' : 'identical'); + + ## Sensitive to member name changes. + $seq = "_\L$seq\E"; + $class = "_\L$class\E"; + + my @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seq"}}; + + return $collapse ? &collapse_nums(@ary) : @ary; +} + + + + +=head2 get_aln + + Usage : $hsp->get_aln() + Purpose : Get a Bio::UnivAln.pm object constructed from the query + sbjct + : sequences of the present HSP object. + Example : $aln_obj = $hsp->get_aln(); + Returns : Object reference for a Bio::UnivAln.pm object. + Argument : n/a. + Throws : Propagates any exception ocurring during the construction of + : the Bio::UnivAln object. + Comments : Requires Bio::UnivAln.pm. + : The Bio::UnivAln.pm object is constructed from the query + sbjct + : sequence objects obtained by calling seq(). + : Gap residues are included (see $GAP_SYMBOL). It is important that + : Bio::UnivAln.pm recognizes the gaps correctly. A strategy for doing + : this is being considered. Currently it is hard-wired. + +See Also : L<seq()|seq>, B<Bio::UnivAln.pm> + +=cut + +#------------ +sub get_aln { +#------------ + my $self = shift; + + require Bio::UnivAln; + + my $qseq = $self->seq('query'); + my $sseq = $self->seq('sbjct'); + + my $desc = sprintf "HSP #%s of query %s vs. sbjct %s", + $self->name, $self->parent->parent->name, $self->parent->name; + + my $type = $self->{'_prog'} =~ /P$|^T/ ? 'amino' : 'dna'; + + Bio::UnivAln->new( -seqs => [$qseq, $sseq], + -desc => $desc, + -type => $type, + ); +} + + +=head2 display + + Usage : $sbjct_object->display( %named_parameters ); + Purpose : Display information about Bio::Tools::Blast::Sbjct.pm data members + : including: length, gaps, score, significance value, + : sequences and sequence indices. + Example : $object->display(-SHOW=>'stats'); + Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) + : -SHOW => 'hsp', + : -WHERE => filehandle (default = STDOUT) + Returns : n/a + Status : Experimental + Comments : For more control over the display of sequence data, + : use seq(), seq_str(), seq_inds(). + +See Also : L<_display_seq()|_display_seq>, L<seq()|seq>, L<seq_str()|seq_str>, L<seq_inds()|seq_inds>, L<_display_matches()|_display_matches>, B<Bio::Root::Object::display()> + +=cut + +#----------- +sub display { +#----------- + my( $self, %param ) = @_; + + my $sbjctName = $self->parent->name(); + my $queryName = $self->parent->parent->name(); + my $layout = $self->parent->parent->_layout(); + + my $OUT = $self->set_display(%param); + + printf( $OUT "%-15s: %d\n", "LENGTH TOTAL", $self->length('total') ); + printf( $OUT "%-15s: %d\n", "LENGTH QUERY", $self->length('query') ); + printf( $OUT "%-15s: %d\n", "LENGTH SBJCT", $self->length('sbjct') ); + printf( $OUT "%-15s: %d\n", "GAPS QUERY", $self->gaps('query') ); + printf( $OUT "%-15s: %d\n", "GAPS SBJCT", $self->gaps('sbjct') ); + printf( $OUT "%-15s: %d\n", "SCORE", $self->{'_score'} ); + printf( $OUT "%-15s: %0.1f\n", "BITS", $self->{'_bits'} ); + if($layout == 1) { + printf( $OUT "%-15s: %.1e\n", "P-VAL", $self->{'_p'} ); + printf( $OUT "%-15s: %.1e\n", "EXPECT", $self->{'_expect'} ); + } else { + printf( $OUT "%-15s: %.1e\n", "EXPECT", $self->{'_expect'} ); + } + + my $queryLength = $self->length('query'); + + printf( $OUT "%-15s: %d (%0.0f%%)\n", "IDENTICAL", $self->{'_numIdentical'}, + $self->{'_numIdentical'}/$queryLength * 100 ); + printf( $OUT "%-15s: %d (%0.0f%%) %s \n", "CONSERVED", $self->{'_numConserved'}, + $self->{'_numConserved'}/$queryLength * 100, + "includes identical" ); + + $self->_display_seq('query', $queryName, $OUT); + $self->_display_seq('sbjct', $sbjctName, $OUT); + $self->_display_matches($queryName, $sbjctName, $OUT); +} + + + + +=head2 _display_seq + + Usage : n/a; called automatically by display() + Purpose : Display information about query and sbjct HSP sequences. + : Prints the start, stop coordinates and the actual sequence. + Example : n/a + Argument : + Returns : printf call. + Status : Experimental + Comments : For more control, use seq(), seq_str(), or seq_inds(). + +See Also : L<display()|display>, L<seq()|seq>, L<seq_str()|seq_str>, L<seq_inds()|seq_inds>, L<_display_matches()|_display_matches> + +=cut + +#------------------ +sub _display_seq { +#------------------ + my( $self, $seqType, $name, $OUT ) = @_; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + + # Sensitive to member name changes. + my $mem = "_\L$seqType\E"; + printf( $OUT "\n%10s: %s\n%10s %s\n", "\U$seqType\E", "$name", "-----", + ('-'x ((CORE::length $name) + 2)) ); + printf( $OUT "%13s: %d\n", "START", $self->{$mem.'Start'} ); + printf( $OUT "%13s: %d\n", "STOP", $self->{$mem.'Stop'} ); + printf( $OUT "%13s: \n", "SEQ" ); + foreach( @{ $self->{$mem.'Seq'}} ) { + printf( $OUT "%15s%s\n", "", $_); + } +} + + +=head2 _display_matches + + Usage : n/a; called automatically by display() + Purpose : Display information about identical and conserved positions + : within both the query and sbjct sequences. + Example : n/a + Argument : + Returns : printf call. + Status : Experimental + Comments : For more control, use seq_inds(). + +See Also : L<display()|display>, L<seq_inds()|seq_inds>, L<_display_seq()|_display_seq>, + +=cut + +#-------------------- +sub _display_matches { +#-------------------- + my( $self, $queryName, $sbjctName, $OUT) = @_; + my($resNum, $count); + + $self->_set_residues() unless defined $self->{'_identicalRes_query'}; + + printf( $OUT "\n%10s: \n%10s\n", "HITS", "-----" ); + foreach( @{ $self->{'_matchSeq'}} ) { + printf( $OUT "%15s%s\n", "", $_ ); + } + + print $OUT "\n\U$queryName\E\n------------\n"; + printf( $OUT "\n%5s%s:\n%5s%s\n\t", "", "IDENTICAL RESIDUES IN $queryName (n=$self->{'_numIdentical'})", + "", "--------------------------------------------" ); + $count = 0; + foreach $resNum ( sort keys %{ $self->{'_identicalRes_query' }} ) { + $count++; + print $OUT "$resNum"; + $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t"); + } + + print $OUT "\n"; + + my $justConserved = ($self->{'_numConserved'})-($self->{'_numIdentical'}); + printf( $OUT "\n%5s%s:\n%5s%s\n\t", "","CONSERVED RESIDUES IN $queryName (n=$justConserved)", + "", "--------------------------------------------" ); + $count = 0; + foreach $resNum ( sort keys %{ $self->{'_conservedRes_query' }} ) { + $count++; + print $OUT "$resNum"; + $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t"); + } + + + print $OUT "\n\n\U$sbjctName\E\n------------\n"; + printf( $OUT "\n%5s%s:\n%5s%s\n\t", "", "IDENTICAL RESIDUES IN $sbjctName (n=$self->{'_numIdentical'})", + "", "--------------------------------------------" ); + $count = 0; + foreach $resNum ( sort keys %{ $self->{'_identicalRes_sbjct' }} ) { + $count++; + print $OUT "$resNum"; + $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t"); + } + + print $OUT "\n"; + $justConserved = ($self->{'_numConserved'})-($self->{'_numIdentical'}); + printf( $OUT "\n%5s%s:\n%5s%s\n\t", "","CONSERVED RESIDUES IN $sbjctName (n=$justConserved)", + "", "--------------------------------------------" ); + $count = 0; + foreach $resNum ( sort keys %{ $self->{'_conservedRes_sbjct' }} ) { + $count++; + print $OUT "$resNum"; + $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t"); + } +} + + + + +=head2 homol_data + + Usage : $data = $hsp_object->homo_data( %named_params ); + Purpose : Gets similarity data for a single HSP. + Returns : String: + : "Homology data" for each HSP is in the format: + : "<integer> <start> <stop>" + : where integer is the value returned by homol_score(). + Argument : Named params: (UPPER OR LOWERCASE TAGS) + : currently just one param is used: + : -SEQ =>'query' or 'sbjct' + Throws : n/a + Status : Experimental + Comments : This is a very experimental method used for obtaining a + : coarse indication of: + : 1) how strong the similarity is between the sequences in the HSP, + : 3) the endpoints of the alignment (sequence monomer numbers) + +See Also : L<homol_score()|homol_score>, B<Bio::Tools::Blast.::homol_data()>, B<Bio::Tools::Blast::Sbjct::homol_data()> + +=cut + +#--------------- +sub homol_data { +#--------------- + my ($self, %param) = @_; + my $seq = $param{-SEQ} || $param{'-seq'} || 'sbjct'; # 'query' or 'sbjct' + my $homolScore = $self->homol_score(); + # Sensitive to member name changes. + $seq = "_\L$seq\E"; + + $self->_set_seq_data() unless $self->{'_set_seq_data'}; + return ( $homolScore.' '.$self->{$seq.'Start'}.' '.$self->{$seq.'Stop'}); +} + + +=head2 homol_score + + Usage : $self->homol_score(); + Purpose : Get a homology score (integer 1 - 3) as a coarse representation of + : the strength of the similarity independent of sequence composition. + : Based on the Blast bit score. + Example : $hscore = $hsp->homol_score(); + Returns : Integer + Argument : n/a + Throws : n/a + Status : Experimental + Comments : See @Bio::Tools::Blast::HSP::SCORE_CUTOFFS for the specific values. + : Currently, BIT_SCORE HOMOL_SCORE + : --------- ----------- + : >=100 --> 3 + : 30-100 --> 2 + : < 30 --> 1 + +See Also : L<homol_data()|homol_data> + +=cut + +#---------------- +sub homol_score { +#---------------- + my $self = shift; + + if( $self->{'_bits'} >= $SCORE_CUTOFFS[0] ) { 1 } + elsif($self->{'_bits'} < $SCORE_CUTOFFS[0] and + $self->{'_bits'} >= $SCORE_CUTOFFS[1] ) { 2 } + else { 3 } +} + + +##################################################################################### +## CLASS METHODS ## +##################################################################################### + +=head1 CLASS METHODS + +=head2 collapse_nums + + Usage : @cnums = collapse_nums( @numbers ); + Purpose : Collapses a list of numbers into a set of ranges of consecutive terms: + : Useful for condensing long lists of consecutive numbers. + : EXPANDED: + : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32 + : COLLAPSED: + : 1-6 10 12-15 17 18 20-22 24 26 30-32 + Argument : List of numbers and sorted numerically. + Returns : List of numbers mixed with ranges of numbers (see above). + Throws : n/a + Comments : Probably belongs in a more general utility class. + +See Also : L<seq_inds()|seq_inds> + +=cut + +#------------------ +sub collapse_nums { +#------------------ +# This is not the slickest connectivity algorithm, but will do for now. + my @a = @_; + my ($from, $to, $i, @ca, $consec); + + $consec = 0; + for($i=0; $i < @a; $i++) { + not $from and do{ $from = $a[$i]; next; }; + if($a[$i] == $a[$i-1]+1) { + $to = $a[$i]; + $consec++; + } else { + if($consec == 1) { $from .= ",$to"; } + else { $from .= $consec>1 ? "\-$to" : ""; } + push @ca, split(',', $from); + $from = $a[$i]; + $consec = 0; + $to = undef; + } + } + if(defined $to) { + if($consec == 1) { $from .= ",$to"; } + else { $from .= $consec>1 ? "\-$to" : ""; } + } + push @ca, split(',', $from) if $from; + + @ca; +} + + +1; +__END__ + +##################################################################################### +# END OF CLASS +##################################################################################### + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for these data member descriptions to become obsolete as +this module is still evolving. Always double check this info and search +for members not described here. + +=back + +An instance of Bio::Tools::Blast::HSP.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + -------------------------------------------------------------- + (member names are mostly self-explanatory) + + _score : + _bits : + _p : + _n : Integer. The 'N' value listed in parenthesis with P/Expect value: + : e.g., P(3) = 1.2e-30 ---> (N = 3). + : Not defined in NCBI Blast2 with gaps. + : To obtain the number of HSPs, use Bio::Tools::Blast::Sbjct::num_hsps(). + _expect : + _queryLength : + _queryGaps : + _queryStart : + _queryStop : + _querySeq : + _sbjctLength : + _sbjctGaps : + _sbjctStart : + _sbjctStop : + _sbjctSeq : + _matchSeq : String. Contains the symbols between the query and sbjct lines + which indicate identical (letter) and conserved ('+') matches + or a mismatch (' '). + _numIdentical : + _numConserved : + _identicalRes_query : + _identicalRes_sbjct : + _conservedRes_query : + _conservedRes_sbjct : + _match_indent : The number of leading space characters on each line containing + the match symbols. _match_indent is 13 in this example: + Query: 285 QNSAPWGLARISHRERLNLGSFNKYLYDDDAG + Q +APWGLARIS G+ + Y YD+ AG + ^^^^^^^^^^^^^ + + INHERITED DATA MEMBERS + + _name : From Bio::Root::Object.pm. + : + _parent : From Bio::Root::Object.pm. This member contains a reference to the + : Bio::Tools::Blast::Sbjct.pm object to which this hit belongs. + + +=cut + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Blast/HTML.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Blast/HTML.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,754 @@ +#------------------------------------------------------------------------------- +# PACKAGE : Bio::Tools::Blast::HTML +# PURPOSE : To encapsulate code for HTML formatting BLAST reports. +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 28 Apr 1998 +# STATUS : Alpha +# REVISION: $Id: HTML.pm,v 1.15 2002/11/04 09:12:51 heikki Exp $ +# +# For the latest version and documentation, visit the distribution site: +# http://bio.perl.org/Projects/Blast/ +# +# To generate documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# CUSTOMIZATION NOTE: +# +# If your Blast reports are not getting marked up correctly, add or +# modify the regexps in _markup_report() to accomodate the format of +# your reports. +# +# Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#------------------------------------------------------------------------------- + +package Bio::Tools::Blast::HTML; +use strict; +use Exporter; + +use Bio::Tools::WWW qw(:obj); + +use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS + $ID %DbUrl %SGDUrl $Revision + $Acc $Pir_acc $Word $Signif $Int $Descrip); + +@ISA = qw(Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw(&get_html_func &strip_html); +%EXPORT_TAGS = ( std => [qw(&get_html_func &strip_html)] ); + +$ID = 'Bio::Tools::Blast::HTML'; +$Revision = '$Id: HTML.pm,v 1.15 2002/11/04 09:12:51 heikki Exp $'; #' + +my $_set_markup = 0; +my $_gi_link = ''; + + +## POD Documentation: + +=head1 NAME + +Bio::Tools::Blast::HTML - Bioperl Utility module for HTML formatting Blast reports + +=head1 SYNOPSIS + +=head2 Adding HTML-formatting + + use Bio::Tools::Blast::HTML qw(&get_html_func); + + $func = &get_html_func(); + + # Now as each line of the report is read, pass it to &$func($line). + +See L<get_html_func()|get_html_func> for details. +Also see B<Bio::Tools::Blast::to_html> for an example of usage. + + +=head2 Removing HTML-formatting + + use Bio::Tools::Blast::HTML qw(&strip_html); + + &strip_html(\$blast_report_string) + +See L<strip_html()|strip_html> for details. + + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +This module can be used to add HTML formatting to or remove HTML +formatting from a raw Blast sequence analysis report. Hypertext links +to the appropriate database are added for each hit sequence (GenBank, +Swiss-Prot, PIR, PDB, SGD). + +This module is intended for use by Bio::Tools::Blast.pm and related modules, +which provides a front-end to the methods in Bio::Tools::Blast::HTML.pm. + +=head1 DEPENDENCIES + +Bio::Tools::Blast::HTML.pm does not inherit from any other class +besides Exporter. It is used by B<Bio::Tools::Blast.pm> only. This +class relies on B<Bio::Tools::WWW.pm> to provide key URLS for adding +links in the Blast report to specific databases. + +The greatest dependency comes from the dynamic state of the web. URLs +are are likely to change in the future, so all links cannot be +guaranteed to work indefinitely. Feel free to report broken or +incorrect database links (L<FEEDBACK | FEEDBACK>). Thanks! + +=head1 SEE ALSO + + Bio::Tools::Blast.pm - Blast object. + Bio::Tools::WWW.pm - URL repository. + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project + http://bio.perl.org/ - Bioperl Project Homepage + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep +track the bugs and their resolution. Bug reports can be submitted +via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz, E<lt>sac@bioperl.orgE<gt> + +=head1 COPYRIGHT + +Copyright (c) 1998-2000 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + + +=cut + + +# +## +### +#### END of main POD documentation. +### +## +#' + + +###################### BEGIN FUNCTIONS ######################## + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B<not> considered part of the public interface and are described here +for documentation purposes only. + + + +=head2 get_html_func + + Usage : $func_ref = &get_html_func( [array_ref] ); + : This method is exported. + Purpose : Provides a function that adds HTML formatting to a + : raw Blast report line-by-line. + : Utility method used by to_html() in Bio::Tools::Blast.pm. + Returns : Reference to an anonymous function to be used while reading in + : the raw report. + : The function itself operates on the Blast report line-by-line + : HTML-ifying it and printing it to STDOUT (or saving in the supplied + : array ref) as it goes: + : foreach( @raw_report ) { &$func_ref($_); } + Argument : array ref (optional) for storing the HTML-formatted report. + : If no argument is supplied, HTML output is sent to STDOUT. + Throws : Croaks if an argument is supplied and is not an array ref. + : The anonymous function returned by this method croaks if + : the Blast output appears to be HTML-formatted already. + Comments : Adapted from a script by Keith Robison November 1993 + : krobison@nucleus.harvard.edu; http://golgi.harvard.edu/gilbert.html + : Modified extensively by Steve Chervitz and Mike Cherry. + : Some modifications are customizations for BLAST reports served up + : by the Saccharomyces Genome Database. + : Feel free to modify or replace portions of this code as necessary + : to accomodate new BLAST datasets or changes to the Blast format. + +See Also : B<Bio::Tools::Blast::to_html()> + +=cut + +#-------------------- +sub get_html_func { +#-------------------- + my ($out_aref) = @_; + + ## Key booleans used in parsing. + my $found_table = 0; # Located the table at top of report (a.k.a. 'descriptions'). + my $found_data = 0; # Nothing is done until this is true + my $skip = 0; # Skipping various items in the report header + my $ref_skip = 0; # so we can include nice HTML versions + # (e.g., references for the BLAST program). + my $getNote = 0; + my $getGenBankAlert = 0; + my $str = ''; + my $gi_link = \$_gi_link; + my $prog = ''; + + if( defined($out_aref) and not ref($out_aref) eq 'ARRAY') { + croak("Argument must be an ARRAY ref not a ${\ref $out_aref}."); + } + + my $refs = &_prog_ref_html; + + &_set_markup_data() if not $_set_markup; + + return sub { + # $_ contains a single line from a Blast report. + local $_ = shift; + + croak("Report appears to be HTML formatted already.") if m/<HTML>|<TITLE>|<PRE>/i; + + if(not $found_table) { + if($ref_skip) { + # Replacing an reference data with special HTML. + $ref_skip = 0 if /^\s+$/; + } + if($getNote) { + ## SAC: created this test since we are no longer reading from STDIN. + $out_aref ? push(@$out_aref, $_) : print $_; + $getNote = 0 if m/^\s+$/; + } elsif( m/(.*), Up \d.*/ or /Date: +(.+)/ or /Start: +(.+?) +End:/ ) { + ### Network BLAST reports from NCBI are time stamped as follows: + #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77 + my $date = "<b>BLASTed on:</b> $1<p>\n"; + $out_aref ? push(@$out_aref, $date) : print $date; + } elsif ( /^(<\w+>)?(T?BLAST[NPX])\s+(.*?)/ ) { + $found_data = 1; + local($^W) = 0; + s#(\S+)\s+(.*)#<P><B>Program:</B> $1 $2 $3<br>#o; + $out_aref ? push(@$out_aref, $_) : print $_; + $skip = 1; + $prog = $2; + if($prog =~ /BLASTN/) { + ## Prevent the error at Entrez when you ask for a nucl + ## entry with a protein GI number. + $$gi_link = $DbUrl{'gb_n'}; # nucleotide + } else { + $$gi_link = $DbUrl{'gb_p'}; # protein + } + } elsif ( m/^Query=/ ) { + # Keeping the "Query=" format to keep it parsable by Blast.pm + # (after stripping HTML). + s#Query= *(.*)#<title>$1\n

Query= $1#o; + $out_aref ? push(@$out_aref, $_) : print $_; + $skip = 1; + } elsif ( /Reference:/) { + $ref_skip = 1; + } elsif ( /^Database:/ ) { + &_markup_database(\$_); + $out_aref ? push(@$out_aref, $_) : print $_; + if ( /non-redundant genbank/i and $prog =~ /TBLAST[NX]/i) { + $getGenBankAlert = 1; + } + $skip = 1; + } elsif ( /sequences;/ ) { + $str = "$_

"; + $out_aref ? push(@$out_aref, $str) : print $str; + } elsif ( /^\s+\(\d+ letters\)\s+/ ) { + $str = "
    $_"; + $out_aref ? push(@$out_aref, $str) : print $str; + } elsif ( /^(WARNING|NOTICE):/i ) { + s#WARNING: *(.*)#

$1: $1#o; + $out_aref ? push(@$out_aref, $_) : print $_; + $getNote = 1; + } elsif ( /Score +E\s*$/ or /Probability\s*$/ ) { + # Put the last HTML-formatted lines before the main body of report. + $found_table = 1; + $skip = 0; + $out_aref ? push(@$out_aref, $refs) : print $refs; + if($getGenBankAlert) { + $str = &_genbank_alert; + $out_aref ? push(@$out_aref, $str) : print $str; + } + $str = "\n

";
+		$out_aref ? push(@$out_aref, $str) : print $str;
+	    }
+
+	} else {
+	    &_markup_report(\$_);
+	}
+
+	if ($found_data and not($skip or $ref_skip)) {
+	    $out_aref ? push(@$out_aref, $_) : print $_;
+	}
+	1;
+    } # end sub {}
+}
+
+
+
+
+=head2 _set_markup_data
+
+ Usage     : n/a; utility method used by get_html_func()
+ Purpose   : Sets various hashes and regexps used for adding HTML
+           : to raw Blast output.
+ Returns   : n/a
+ Comments  : These items need be set only once. 
+
+See Also   : L
+
+=cut
+
+#-------------------
+sub _set_markup_data {
+#-------------------
+    %DbUrl      = $BioWWW->search_url('all');
+    %SGDUrl     = $BioWWW->sgd_url('all');
+
+    $Signif  = '[\de.-]{3,}';        # Regexp for a P-value or Expect value. 
+    $Int     = ' *\d\d*';            # Regexp for an integer.
+    $Descrip = ' +.* {2,}?';         # Regexp for a description line.
+    $Acc     = '[A-Z][\d.]+';        # Regexp for GB/EMBL/DDJB/SP accession number
+    $Pir_acc = '[A-Z][A-Z0-9]{5,}';  # Regexp for PIR accession number
+    $Word    = '[\w_.]+';            # Regexp for a word. Include dot for version.
+    
+    $_set_markup = 1;
+}
+
+
+=head2 _markup_database
+
+ Usage     : n/a; utility method used by get_html_func()
+ Purpose   : Converts a cryptic database ID into a readable name.
+ Returns   : n/a
+ Comments  : This is used for converting local database IDs into
+           : understandable terms. At present, it only recognizes
+           : databases used locally at SGD. 
+
+See Also   : L
+
+=cut
+
+#---------------------
+sub _markup_database {
+#---------------------
+    my $line_ref = shift;
+    local $_ = $$line_ref;
+
+    $_ =~ s#YeastN#S. cerevisiae GenBank Data Set; #;
+    $_ =~ s#YeastP#Non-Redundant S. cerevisiae Protein Data Set; #;
+    $_ =~ s#genoSC#Complete DNA Sequence for the S. cerevisiae Genome; #;
+    $_ =~ s#YeastORF-P#Translation of all Standard S.c. ORFs; #;
+    $_ =~ s#YeastORF-N#Coding Sequence of all Standard S.c. ORFs; #;
+    s#Database: *(.*)#

Database: $1#o; + + $$line_ref = $_; +} + + +=head2 _markup_report + + Usage : n/a; utility function used by get_html_func() + Purpose : Adds HTML links to aid navigation of raw Blast output. + Returns : n/a + Comments : HTML-formatting is dependent on the Blast server that + : provided the Blast report. Currently, this function can handle reports + : produced by NCBI and SGD. Feel free to modify this function + : to accomodate reports produced by other servers/sites. + : + : This function is simply a collection of substitution regexps + : that recognize and modify the relevant lines of the Blast report. + : All non-header lines of the report are passed through this function, + : only the ones that match will get modified. + : + : The general scheme for adding links is as follows: + : (Some of the SGD markups do not follow this scheme precisely + : but this is the general trend.) + : + : For description lines in the summary table at the top of report: + : + : DB:SEQUENCE_ID DESCRIPTION SIGNIF_VAL + : DB = links to the indicated database (if not Gen/Embl/Ddbj). + : SEQUENCE_ID = links to GenBank entry for the sequence. + : SIGNIF_VAL = internal link to relevant alignment section. + : + : For the alignment sections in the body of the report: + : + : DB:SEQUENCE_ID (Back | Top) DESCRIPTION + : DB = links to the indicated database (if not Gen/Embl/Ddbj). + : SEQUENCE_ID = links to GenBank entry for the sequence. + : SIGNIF_VAL = internal link to alignment section. + : Back = internal link to description line in summary section. + : Top = internal link to top of page. + : + : 'DB' links are created for PDB, PIR, and SwissProt sequences. + : + : RE_PARSING HTML-FOMRATTED REPORTS: + : ---------------------------------- + : HTML-formatted reports generated by this module, as well as reports + : obtained from the NCBI servers, should be parsable + : by Bio::Tools::Blast.pm. Parsing HTML-formatted reports is + : slow, however, since the HTML must be removed prior to parsing. + : Parsing HTML-formatted reports is dependent on the specific structure + : of the HTML and is generally not recommended. + : + : Note that since URLs can change without notice, links will need updating. + : The links are obtained from Bio::Tools::WWW.pm updating that module + : will update this as well. + : + Bugs : Some links to external databases are incorrect + : (in particular, for 'bbs' and 'prf' databases on NCBI Blast reports. + : Some links may fail as a result of the dynamic nature of the web. + : Hypertext links are not added to hits without database ids. + +See Also : L, B, L() + +=cut + +#-------------------- +sub _markup_report { +#-------------------- + my $line_ref = shift; + local $_ = $$line_ref; +## +## REGEXPS FOR ALIGNMENT SECTIONS (within the body of the report, +## the text above the list of HSPs). +## +## If the HSP alignment sections don't start with a '>' we have no way +## of finding them. This occurs with reports saved from HTML-formatted +## web pages, which we shouldn't be processing here anyway. + +## To facilitate parsing of HTML-formatted reports by Bio::Tools::Blast.pm, +## the anchors should be added at the BEGINNING of the HSP +## alignment section lines and at the END of the description section lines. + + # Removing " ! " addded by GCG. + s/ ! / /; + + ### NCBI-specific markups for HSP alignment section lines: + + local($^W) = 0; + + # GenBank/EMBL, DDBJ hits (GenBank Format): + s@^>(gb|emb|dbj|ref)\|($Word)(\|$Word)?(.*)$@$1:$2$3$4
(Back|Top)@o; + + s@^>(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)(.*)$@$1:$2$3$4
(Back|Top)@o; + + # PIR hits + s@^>pir\|\|($Word)( .*)$@pir:$1 $2
(Back|Top)@o; + + # GI hits (GenBank Format): using a nested (()) + s@^>(gi)\|($Word)( +\(($Word)\))( .*)$@$1:$2$3$5
(Back|Top)@o; + + # GNL PID hits (GenBank Format): + s@^>(gnl)\|($Word)?(\|$Word) +\(($Word)\)( .*)$@$1:$2$3($4)$5
(Back|Top)@o; + + # BBS and PRF hits (what db?) (GenBank Format): + s@^>(bbs|prf)\|\|?($Word)( .*)$@$1:$2$3
(Back|Top)@o; + + # SwissProt hits: + s@^>sp\|($Word)\|($Word)?( .*)$@sp:$1|$2$3
(Back|Top)@o; + + + ## PDB ids with or without a chain identifier (GenBank format) + s@^>pdb\|(\d\w{3})\|[\w ] (.*)$@pdb:$1 (Back|Top) $2@o; + + + ### SGD-specific markups for HSP alignment section lines: + + ## PDB ids without chain identifier + s@^>PDB_UNIQUEP:(\d\w{3})_ (.*)$@PDB:$1 (Back|Top) $2@o; + + ## PDB ids with chain identifier + s@^>PDB_UNIQUEP:(\d\w{3})_([\w ]{1})(.*)$@PDB:$1 Chain:$2, (Back|Top) $3@o; + + s@^>($Word)PEPT:GI_(\d+)(.*)$@$1:GI_$2 $3
(Back|Top)@o; + + # The gcg blast dataset generating tools up-case all sbjct sequence IDs. + # This is fine for yeast but not worm. This is considered a hack here. + s@WORMPEPT:(\w+\.)(\S+)@WORMPEPT:$1\L$2\E@; + + s@^>WORMPEPT:(\S+)(.*)$@WORMPEP:$1 $2
(Back|Top)@o; + + s#^>(GB_$Word):($Word) ($Acc) (.*$)#$2|$3$4\t[GenBank / EMBL / SGD] #o; + + # Sac's version: ORF name is an external link into SGD: + s@^>ORFP:(\S*) +([\w-]+)(.*$)@ORFP:$1 $2$3
     [Gene/Sequence Resources / ORF Map] Back|Top@o; + +# Mike's version: +# s#^>ORFP:(\S*) (.*$)#ORFP:$1 $2\t[Gene/Sequence Resources / ORF Map] #o; + + s#^>ORFN:(\S*) (.*$)#ORFN:$1 $2\t[Gene/Sequence Resources] / ORF Map #o; + + s#^>NR_SC:GP-\S* gi\|(\w+)([\w\|]*) (.*$)#GenPept|$1 gp|$2 $3\t[GenPept / SGD] #o; + + s#^>NR_SC:SW-$Word SW:($Word) ($Acc) (.*$)#SWISS|$1 $2 $3\t[SwissProt / Entrez]#o; + + s#^>NR_SC:PIR-$Word PIR:($Word) (.*$)# PIR|$1 $2\t[PIR / Entrez]#o; + + s#^>CHRS:([A-Z][0-9]*) (.*)$#$1 $2: [Gene/Sequence Resources / ORF Map]#o; + + s#^>NOT:([A-Z]_[0-9]*-[0-9]*)( *)Chromosome ([0-9]*) from ([0-9]*) to ([0-9]*)$#$1 $2Chromosome $3 from $4 to $5 [Gene/Sequence Resources / ORF Map / Retrieve DNA]#o; + + s#^>UTR5_SC_[0-9]*:(\S*) 5' untranslated region, chr(\S*) ([0-9]*) - ([0-9]*)(.*$)#UTR5:$1 $1 5' untranslated region, chr$2 $3 - $4, $5\t[Gene/Sequence Resources / ORF Map]#o; + + # Hits without a db identifier. + # If any of the previous regexps succeed, the leading '>' will be removed. + # Otherwise, this regexp could cause trouble. + s@^>($Word)(.*)$@$1 $2
(Back|Top)@o; + +## +## REGEXPS FOR SUMMARY TABLE LINES AT TOP OF REPORT (a.k.a. 'descriptions') +## (table of sequence id, description, score, P/Expect value, n) +## +## Not using bold face to highlight the sequence id's since this can throw off +## off formatting of the line when the IDs are different lengths. This lead to +## the scores and P/Expect values not lining up properly. + + ### NCBI-specific markups for description lines: + + # GenBank/EMBL, DDBJ hits (GenBank Format): + s@^ ?(gb|emb|dbj|ref)\|($Word)(\|$Word)?($Descrip)($Int +)($Signif)(.*)$@$1:$2$3$4$5$6$7@o; + + s@^ ?(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)($Descrip)($Int +)($Signif)(.*)$@$1:$2$3$4$5$6$7@o; + + # Missing inner ID + s@^ ?pir\|\|($Word)?($Descrip)($Int) ($Signif)(.*)$@pir:$1 $2$3 $4$5@o; + + # GI hits (GenBank Format): using a nested (()) + s@^ ?gi\|($Word)( +\(($Word)\))($Descrip)($Int) ($Signif)(.*)$@gi:$1$2$4$5 $6$7@o; + + s@^ ?(gnl)\|($Word)?(\|$Word +)\(($Word)\)($Descrip)($Int) ($Signif)(.*)$@$1:$2$3($4)$5$6 $7$8@o; + + + s@^ ?(bbs|prf)\|\|?($Word)($Descrip)($Int) ($Signif)(.*)$@$1:$2 $3$4 $5$6@o; + + + ## SwissProt accessions (GenBank format) + s@^ ?sp\|($Word)(\|$Word)?($Descrip)($Int) ($Signif)(.*)$@sp:$1$2$3$4 $5$6@o; + + ## PDB ids with or without a chain ID (GenBank format) + s@^ ?pdb\|($Word)\|($Word)?($Descrip)($Int) ($Signif)(.*)$@pdb:$1_$2$3$4 $5$6@o; + + + ### SGD-specific markups for description lines: + + ## PDB ids without chain identifier + s@^ ?PDB_UNIQUEP:(\d\w{3})_($Descrip)($Int) ($Signif)(.*)$@PDB:$1 $2$3 $4$5@o; + + + ## PDB ids with chain identifier + s@^ ?PDB_UNIQUEP:(\d\w{3})_(\w)($Descrip)($Int) ($Signif)(.*)$@PDB:$1 Chain:$2$3$4 $5$6@o; + + + s@^ ?($Word)PEPT:GI_(\d+)($Descrip)($Int) ($Signif)(.*)$@$1:GI_$2 $3 $4 $5 $6@o; + + s@^ *WORMPEPT:(\S+)($Descrip)($Int) ($Signif)(.*)$@WORMPEP:$1 $2 $3 $4$5@o; + + ## Mike Cherry's markups. SAC note: added back database name to allow + ## the HTML-formatted version to be parsable by Blast.pm. + + s#^ ?(GB_$Word:)($Word)( *)($Acc)($Descrip)($Int) ( *$Signif) ( *\d*)$#GenBank\|$2\|$4 $3$5$6 $7 $8#o; + +# Mike's version: +# s#^ ?(ORFP:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 $5 $6#o; + +# My modification: + s@^ ?ORFP:(\S*) +([\w-]+)(.*[ ]{2,3})($Int) ($Signif) ($Int)$@ORFP:$1 $2$3$4 $5 $6@o; + + s#^ ?(ORFN:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 $5 $6#o; + + s#^ ?(NR_SC:GP-)(\S*) ( *)gi\|(\w+)([\w\|]*)($Descrip)($Int) ($Signif) ($Int)$#GenPept\|$4$3 gp|$2 $5$6$7 $8 $9#o; + + s#^ ?(NR_SC:SW-)$Word ( *)SW:($Word) ($Acc)($Descrip)($Int) ($Signif) ($Int)$#SWISS\|$3 SW:$3 $4 $5$6 $7 $8#o; + + s#^ ?(NR_SC:PIR-)$Word ( *)PIR:($Word)($Descrip)($Int) ($Signif) ($Int)$#PIR\|$3 $2 PIR:$3 $4$5 $6 $7#o; + + s#^ ?(CHRS:)([A-Z][0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1Segment:$2 $3 $4 $5 $6#o; + + s#^ ?(CHR[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1 $2 $3 $4 $5#o; + + s#^ ?(NOT:)([A-Z]_[0-9]*-[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 $5 $6#o; + + s#^ ?(UTR5_SC_[0-9]*:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#UTR5:$2 $3 $4 $5 $6#o; + + # Hits without a db identifier. + s@^ ?($Word)($Descrip)($Int) ($Signif)(.*)$@$1$2$3 $4$5@o; + + $$line_ref = $_; +} + + + + +=head2 _prog_ref_html + + Usage : n/a; utility method used by get_html_func(). + Purpose : Get a special alert for BLAST reports against all of GenBank/EMBL. + Returns : string with HTML + +See Also : L + +=cut + +#------------------ +sub _prog_ref_html { +#------------------ + return <<"QQ_REF_QQ"; +

+ +References: +

    +
  1. Altschul, Stephen F., Warren Gish, Webb Miller, Eugene W. Myers, and David J. Lipman (1990). +Basic local alignment search tool. +J. Mol. Biol. 215: 403-10. +
  2. Altschul et al. (1997), Gapped BLAST and PSI-BLAST: +a new generation of protein database search programs. +Nucl. Acids Res. 25: 3389-3402. +
  3. Program Descriptions: +BLAST2 | +WU-BLAST2 | +Help Manual +
+ +HTML formatting provided by the Bioperl Blast module. + + +

+ +QQ_REF_QQ + +# Not really a reference for the Blast algorithm itself but an interesting usage. +#

  • Gish, Warren, and David J. States (1993). Identification of protein coding regions by database similarity search. +#Nature Genetics 3:266-72. + +} + + +=head2 _genbank_alert + + Usage : n/a; utility method used by get_html_func(). + Purpose : Get a special alert for BLAST reports against all of GenBank/EMBL. + Returns : string with HTML + +See Also : L + +=cut + +#------------------ +sub _genbank_alert { +#------------------ + return << "QQ_GENBANK_QQ"; +

    CAUTION: Hits reported on this page may be derived from DNA sequences + that contain more than one gene. + To avoid mis-interpretation, always check database entries + for any sequence of interest to verify that the similarity + occurs within the described sequence. (E.g., A DNA sequence + for gene X as reported in GenBank may contain a 5' or 3' + fragment of coding sequence for a neighboring gene Y, yet will + be listed as gene X, since gene Y had not yet been identified). +QQ_GENBANK_QQ +} + + + +=head2 strip_html + + Usage : $boolean = &strip_html( string_ref ); + : This method is exported. + Purpose : Removes HTML formatting from a supplied string. + : Attempts to restore the Blast report to enable + : parsing by Bio::Tools::Blast.pm. + Returns : Boolean: true if string was stripped, false if not. + Argument : string_ref = reference to a string containing the whole Blast + : report. + Throws : Croaks if the argument is not a scalar reference. + Comments : Based on code originally written by Alex Dong Li + : (ali@genet.sickkids.on.ca). + : This method does some Blast-specific stripping + : (adds back a '>' character in front of each HSP + : alignment listing). + : + : THIS METHOD IS HIGHLY ERROR-PRONE! + : + : Removal of the HTML tags and accurate reconstitution of the + : non-HTML-formatted report is highly dependent on structure of + : the HTML-formatted version. For example, it assumes that first + : line of each alignment section (HSP listing) starts with a + : anchor tag. This permits the reconstruction of the + : original report in which these lines begin with a ">". + : This is required for parsing. + : + : If the structure of the Blast report itself is not intended to + : be a standard, the structure of the HTML-formatted version + : is even less so. Therefore, the use of this method to + : reconstitute parsable Blast reports from HTML-format versions + : should be considered a temorary solution. + +See Also : B + +=cut + +#--------------- +sub strip_html { +#--------------- + # This may not best way to remove html tags. However, it is simple. + # it won't work under following conditions: + # 1) if quoted > appears in a tag (does this ever happen?) + # 2) if a tag is split over multiple lines and this method is + # used to process one line at a time. + + my $string_ref = shift; + + ref $string_ref eq 'SCALAR' or + croak ("Can't strip HTML: ". + "Argument is should be a SCALAR reference not a ${\ref $string_ref}"); + + my $str = $$string_ref; + my $stripped = 0; + + # Removing "" and adding the '>' character for + # HSP alignment listings. + $str =~ s/(\A|\n)]+> ?/>/sgi and $stripped = 1; + + # Removing all "<>" tags. + $str =~ s/<[^>]+>| //sgi and $stripped = 1; + + # Re-uniting any lone '>' characters. + $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1; + + $$string_ref = $str; + $stripped; +} + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Blast/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Blast/README Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,55 @@ + +NOTE NOTE NOTE NOTE + +This modules are no longer supported. Use the Bio::SearchIO system for +your BLAST parsing. + + + +README for the Perl extension Bio::Tools::Blast and related modules +in the Bio/Tools/Blast/ directory of the bioperl distribution. + +$Id: README,v 1.5 2002/06/19 22:25:15 jason Exp $ + +### Notice: +### +### The Bio::Tools::Blast:: modules are no longer being +### maintained due to support issues. Please take a look +### at the Bio::Tools::BPlite modules and the Bio::SearchIO. +### Questions? Drop an email to the bioperl mailing list at +### bioperl-l@bioperl.org +### + + +The Bioperl Blast module provides an API to the BLAST program, and +thereby permits the parsing, running, HTML-formatting, and general +manipulation of Blast data by simple method calls on Perl objects. +It does not implement the Blast algorithm itself, relying on external +applications for this compute-intensive operation. + +Bio::Tools::Blast.pm is the central "brain" module. It makes use of +other modules in the Bio/Tools/Blast/ directory. Bio::Tools:Blast.pm +is the only module you need to import into your namespace for working +with Blast reports. + +Documentation and usage information about Blast.pm is contained in the +module itself. Run Blast.pm through pod2html (preferably from Perl +v5.004 or later) and view the resulting output with a web +browser. Other modules in the hierarchy are similarly documented but +Blast.pm contains the main documentation. + +On-line docs can be found at http://bio.perl.org/Projects/Blast/ + +Send bug reports using the bioperl bug-tracking system at +http://bio.perl.org/Bugs/ or send them via e-mail to +bioperl-bugs@bio.perl.org. + +Send general comments, questions, and feature requests to the bioperl +mailing lists: + + bioperl-l@bioperl.org (discussion) + +Have a Blast! + +Steve Chervitz +sac@neomorphic.com diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Blast/Sbjct.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Blast/Sbjct.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2417 @@ +#------------------------------------------------------------------------------ +# PACKAGE : Bio::Tools::Blast::Sbjct +# AUTHOR : Steve Chervitz (sac@bioperl.org) +# CREATED : 7 October 1996 +# STATUS : Alpha +# REVISION: $Id: Sbjct.pm,v 1.20 2002/10/22 07:38:48 lapp Exp $ +# +# For the latest version and documentation, visit the distribution site: +# http://genome-www.stanford.edu/perlOOP/bioperl/blast/ +# +# To generate documentation, run this module through pod2html +# (preferably from Perl v5.004 or better). +# +# Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +#------------------------------------------------------------------------------ + +package Bio::Tools::Blast::Sbjct; + +use Bio::Root::Global qw(:devel); +use Bio::Root::Object (); + +@ISA = qw( Bio::Root::Object Exporter ); + +use strict; +use vars qw($ID %SUMMARY_OFFSET $Revision); +$ID = 'Bio::Tools::Blast::Sbjct'; +$Revision = '$Id: Sbjct.pm,v 1.20 2002/10/22 07:38:48 lapp Exp $'; #' + +my $_prog = ''; +my $_signif_fmt = ''; + +## POD Documentation: + +=head1 NAME + +Bio::Tools::Blast::Sbjct - Bioperl BLAST "Hit" object + +=head1 SYNOPSIS + +=head2 Object Creation + +The construction of HSP objects is handled by B. +You should not need to use this package directly. See L<_initialize()|_initialize> +for a description of constructor parameters. + + require Bio::Tools::Blast::Sbjct; + + $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@hitData, + -PARENT =>$self, + -NAME =>5, + -RANK =>5, + -RANK_BY =>'order', + -MAKE =>'query' (or 'sbjct'), + -OVERLAP =>2, + -PROGRAM =>'TBLASTN' + ); + +@hitData includes the summary line for the hit as element [0], plus +all lines from the HSP alignment section of the BLAST report for +the present hit. + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +The Bio::Tools::Blast::Sbjct.pm module encapsulates data and methods for +parsing and manipulating "hits" from a BLAST report. +This module is a utility module used by the Bio::Tools::Blast.pm +and is not intended for separate use. + +In Blast lingo, the "sbjct" sequences are all the sequences +in a target database which were compared against a "query" sequence. +The terms "sbjct" and "hit" will be used interchangeably in this and related modules. + +This module supports BLAST versions 1.x and 2.x, gapped and ungapped. + + +=head2 HSP Tiling and Ambiguous Alignments + +If a Blast hit has more than one HSP, the Bio::Tools::Blast::Sbjct.pm +object has the ability to merge overlapping HSPs into contiguous +blocks. This permits the Sbjct object to sum data across all HSPs +without counting data in the overlapping regions multiple times, which +would happen if data from each overlapping HSP are simply summed. HSP +tiling is performed automatically when methods of the Sbjct object +that rely on tiled data are invoked. These include +L, L, L, +L, L, +L, L. + +It also permits the assessment of an "ambiguous alignment" if the +query (or sbjct) sequences from different HSPs overlap. The existence +of an overlap could indicate a biologically interesting region in the +sequence, such as a repeated domain. The Sbjct object uses the +-OVERLAP parameter to determine when two sequences overlap; if this is +set to 2 -- the default -- then any two sbjct or query HSP sequences +must overlap by more than two residues to get merged into the same +contig and counted as an overlap. See the L section below for +"issues" with HSP tiling. + + +The results of the HSP tiling is reported with the following ambiguity codes: + + 'q' = Query sequence contains multiple sub-sequences matching + a single region in the sbjct sequence. + + 's' = Sbjct sequence contains multiple sub-sequences matching + a single region in the query sequence. + + 'qs' = Both query and sbjct sequences contain more than one + sub-sequence with similarity to the other sequence. + + +For addition information about ambiguous BLAST alignments, see +L<_tile_hsps()|_tile_hsps> and + + http://www-genome.stanford.edu/Sacch3D/help/ambig_aln.html + +=head1 DEPENDENCIES + +Bio::Tools::Blast::Sbjct.pm is a concrete class that inherits from B +and relies on two other modules: + +=over 4 + +=item B + +Encapsulates a single high-scoring segment pair within a hit. + +=item B + +Provides a container for Sbjct.pm objects. + +=back + + +Bio::Tools::Blast::Sbjct.pm does not currently inherit from +Bio::Root::Vector.pm since Bio::Root::Vector.pm may be re-designed to +make it usable via delegation. Thus, a Blast.pm object would manage a +vector of Sbjct.pm objects. Stay tuned. + + +=head1 BUGS + +One consequence of the HSP tiling is that methods that rely on HSP +tiling such as L, L, L +etc. may report misleading numbers when C<-OVERLAP> is set to a large +number. For example, say we have two HSPs and the query sequence tile +as follows: + + 1 8 22 30 40 60 + Full seq: ------------------------------------------------------------ + * ** * ** + HSP1: --------------- (6 identical matches) + ** ** ** + HSP2: ------------- (6 identical matches) + + +If C<-OVERLAP> is set to some number over 4, HSP1 and HSP2 will not be +tiled into a single contig and their numbers of identical matches will +be added, giving a total of 12, not 10 if they had be combined into +one contig. This can lead to number greater than 1.0 for methods +L and L. This is less of an issue +with gapped Blast since it tends to combine HSPs that would be listed +separately without gapping. (Fractions E1.0 can be viewed as a +signal for an interesting alignment that warrants further inspection, +thus turning this bug into a feature). + +Using large values for C<-OVERLAP> can lead to incorrect numbers +reported by methods that rely on HSP tiling but can be useful if you +care more about detecting ambiguous alignments. Setting C<-OVERLAP> +to zero will lead to the most accurate numbers for the +tiling-dependent methods but will be useless for detecting overlapping +HSPs since all HSPs will appear to overlap. + + +=head1 SEE ALSO + + Bio::Tools::Blast::HSP.pm - Blast HSP object. + Bio::Tools::Blast.pm - Blast object. + Bio::Root::Object.pm - Proposed base class for all Bioperl objects. + +Links: + + http://bio.perl.org/Core/POD/Tools/Blast/HSP.pm.html + + http://bio.perl.org/Projects/modules.html - Online module documentation + http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project + http://bio.perl.org/ - Bioperl Project Homepage + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz, sac@bioperl.org + +See the L section for where to send bug reports and comments. + +=head1 COPYRIGHT + +Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + + + +# +## +### +#### END of main POD documentation. +### +## +#' + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + +##################################################################################### +## CONSTRUCTOR ## +##################################################################################### + +=head2 _initialize + + Usage : n/a; automatically called by Bio::Root::Object::new() + : Bio::Tools::Blast::Sbjct.pm objects are constructed + : automatically by Bio::Tools::Blast.pm, so there is no need + : for direct consumption. + Purpose : Initializes key varaiables and calls methods to parse a single Blast hit. + : Constructs Bio::Tools::Blast::HSP.pm objects for each + : high-scoring segment pair (HSP). + : Calls superclass constructor first (Bio::Root::Object.pm). + Returns : n/a + Argument : Named Parameters passed from new() + : ALL TAGS MUST BE UPPERCASE (does not call _rearrange()). + : -DATA => array reference holding all data for a single hit. + : The first element should hold the description + : line (from the desctiption section at the top of + : the report), remaining lines should hold all lines + : within the HSP alignment listing section of report. + : -PARENT => object reference to a Bio::Tools::Blast.pm object. + : -NAME => string (typically same as -RANK, just a temporary + : name to use until the actual name of hit is parsed), + : -RANK => integer, + : -RANK_BY => 'order', + : -OVERLAP => integer (maximum overlap between adjacent + : HSPs when tiling) + : -PROGRAM => string (type of Blast blastp, blastn, etc). + +See Also : L<_set_id()|_set_id>, L<_set_hsps()|_set_hsps>, L<_tile_hsps()|_tile_hsps>, B, B + +=cut + +#------------------- +sub _initialize { +#------------------- + my( $self, %param ) = @_; + + # $make not currently used. + my $make = $self->SUPER::_initialize( %param ); + + # Set various class data. + $_prog = $param{-PROGRAM} || ''; + $_signif_fmt = $param{-SIGNIF_FMT}; + + $self->{'_rank'} = $param{-RANK} || ''; + $self->_set_id( $param{-DATA}->[0]); + $self->_set_hsps( @{$param{-DATA}} ); + + $self->{'_overlap'} = $param{-OVERLAP} || 0; +} + +#-------------- +sub destroy { +#-------------- + my $self=shift; + if($self->{'_hsps'}) { + foreach($self->hsps) { + $_->destroy; + undef $_; + } + undef $self->{'_hsps'}; + } + $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}"; + $self->SUPER::destroy; +} + +##################################################################################### +## ACCESSORS ## +##################################################################################### + + +=head2 rank + + Usage : $sbjct->rank( integer or string ); + Purpose : Sets/Gets the rank of the current Sbjct object relative to + : other Sbjct objects managed by a given Blast object. + Example : $sbjct->rank(1); + Returns : The current rank value. + Argument : Integer or string to be used for ranking the hit + : relative to other hits. + Throws : n/a + Comments : The rank usually corresponds to the order the listing + : of hits in the BLAST report from lowest to highest p-value. + : Rank need not be restricted to this value. + : rank() may be provided by a delegated or inherited + : iterator class in the future (such as Bio::Root::Vector.pm). + +=cut + +#----------- +sub rank { +#----------- + my $self = shift; + if(@_) {$self->{'_rank'} = shift; } + $self->{'_rank'}; +} + + + +=head2 _set_id + + Usage : n/a; automatically called by _initialize() + Purpose : Sets the name of the Sbjct sequence from the BLAST summary line. + : The identifier is assumed to be the first + : chunk of non-whitespace characters in the description line + : Does not assume any semantics in the structure of the identifier + : (Formerly, this method attempted to extract database name from + : the seq identifiers, but this was prone to break). + Returns : n/a + Argument : String containing description line of the hit from Blast report + : or first line of an alignment section. + Throws : Warning if cannot locate sequence ID. + +See Also : L<_initialize()|_initialize>, B + +=cut + +#--------------- +sub _set_id { +#--------------- + my( $self, $desc ) = @_; + my ($seqID1, $seqID2, $dbID, @seqDat); + + local $_ = $desc; + my @linedat = split(); + my $data = $linedat[0]; + +# New strategy: Assume only that the ID is the first white space +# delimited chunk. Not attempting to extract database name. +# Clients will have to interpret it as necessary. + if($data =~ /^(\S+)\s*/) { + $self->name($1); + } + else { + $self->warn("Can't locate sequence identifier in summary line.", "Line = $data"); + $data = 'Unknown sequence ID' if not $data; + $self->name($data); + } + $self->{'_db'} = '-'; + +# Old strategy: assumes semantics in the identifier +# and tries to separate out database and id components. +# Too fancy and fragile! SAC, 2000-02-18 + +# # Proceeding from more standard (NCBI-like) to less standard. +# if($data =~ /(\S+?)[:\|]+(\S+?)[:\|]+(\S*)/) { +# # matches: database|id1|id2 or database||id1||id2 or database:id1:id2 +# $dbID = $1; +# $seqID1 = $2; +# $seqID2 = $3; +# if($seqID2 eq $seqID1) { undef($seqID2); } +# +# } elsif($data =~ /(\S+?)[:\|]+(\S+)/) { +# # matches: database|id1 or database:id1 +# $dbID = $1; +# $seqID1 = $2; +# +# } elsif($data =~ /^(\S+)\s+([gb|emb|dbj|sp|pir])\s+(\S+)*/) { +# # matches: id1 database id2 +# $seqID1 = $1; +# $dbID = $2; +# $seqID2 = $3; +# +# } elsif($data =~ /^(\S+)\s*/) { +# $seqID1 = $1; +# } +# +# ## Combine the multiple IDs. +# $seqID2 = scalar($seqID2) ? "/$seqID2" : ''; +# +# if( !scalar $seqID1) { +# $self->warn("Can't locate sequence identifier in summary line.", "Line = $data"); +# $self->name('Unknown sequence ID'); +# } else { +# $self->name($seqID1.$seqID2); +# } +# $self->{'_db'} = $dbID || '-'; +} + + +=head2 _set_hsps + + Usage : n/a; called automatically during object construction. + Purpose : Creates HSP.pm objects for each HSP in a BLAST hit alignment. + : Also collects the full description of the hit from the + : HSP alignment section. + Returns : n/a + Argument : List of strings containing raw BLAST report data for + : a single hit's HSP alignment data. + Throws : Warnings for each HSP.pm object that fails to be constructed. + : Exception if no HSP.pm objects can be constructed. + : Exception if can't parse length data for hit sequence. + Comments : Requires Bio::Tools::Blast::HSP.pm. + : Sets the description using the full string present in + : the alignment data. + : Also sets Expect and P-values for the Sbjct object by + : copying from the HSP object. + : While this sacrifices some memory efficiency, it + : improves access speed for these critical data. + +See Also : L<_initialize()|_initialize>, L<_set_desc()|_set_desc> + +=cut + +#-------------- +sub _set_hsps { +#-------------- + + my( $self, @data ) = @_; + my $start = 0; + my $hspCount = 0; + + require Bio::Tools::Blast::HSP; + +# printf STDERR "$ID _set_hsps(). DATA (%d lines) =\n@data\n", scalar(@data); ; + + my( @hspData, @hspList, @errs, @bad_names ); + my($line, $set_desc, @desc); + $set_desc = 0; + + hit_loop: + foreach $line( @data ) { + + if( $line =~ /^\s*Length = ([\d,]+)/ ) { + $self->_set_desc(@desc); + $set_desc = 1; + ($self->{'_length'} = $1) =~ s/,//g; # get rid of commas + next hit_loop; + } elsif( !$set_desc) { + $line =~ s/^\s+|\s+$//g; + push @desc, $line; + next hit_loop; + } elsif( $line =~ /^\s*Score/ ) { + ## This block is for setting multiple HSPs. + + if( not scalar @hspData ) { + $start = 1; + push @hspData, $line; + next hit_loop; + + } elsif( scalar @hspData) { + $hspCount++; + $DEBUG and do{ print STDERR +( $hspCount % 10 ? "+" : "+\n" ); }; + +# print STDERR "\n$ID: setting HSP: ${\$self->name}\n"; + my $hspObj = eval { new Bio::Tools::Blast::HSP(-DATA =>\@hspData, + -PARENT =>$self, + -NAME =>$hspCount, + -PROGRAM =>$_prog, + ); + }; + if($@) { +# print "$ID: ERROR:\n$@";; + push @errs, $@; + push @bad_names, "#$hspCount"; + $hspObj->destroy if ref $hspObj; + undef $hspObj; + } else { + push @hspList, $hspObj; + if (!defined($self->{'_expect'}) || $hspObj->expect() < $self->{'_expect'}) { + $self->{'_expect'} = $hspObj->expect(); + } + if (!defined($self->{'_p'}) || $hspObj->p() < $self->{'_p'}) { + $self->{'_p'} = $hspObj->p(); + } + } + @hspData = (); + push @hspData, $line; + next; + } else { + push @hspData, $line; + } + } elsif( $start ) { + ## This block is for setting the last HSP (which may be the first as well!). + if( $line =~ /^(end|>|Parameters|CPU|Database:)/ ) { + $hspCount++; + $DEBUG and do{ print STDERR +( $hspCount % 10 ? "+" : "+\n" ); }; + +# print STDERR "\n$ID: setting HSP: ${\$self->name}\n"; + + my $hspObj = eval { new Bio::Tools::Blast::HSP(-DATA =>\@hspData, + -PARENT =>$self, + -NAME =>$hspCount, + -PROGRAM =>$_prog, + ); + }; + if($@) { +# print "$ID: ERROR:\n$@";; + push @errs, $@; + push @bad_names, "#$hspCount"; + $hspObj->destroy if ref $hspObj; + undef $hspObj; + } else { + push @hspList, $hspObj; + if (!defined($self->{'_expect'}) || $hspObj->expect() < $self->{'_expect'}) { + $self->{'_expect'} = $hspObj->expect(); + } + if (!defined($self->{'_p'}) || $hspObj->p() < $self->{'_p'}) { + $self->{'_p'} = $hspObj->p(); + } + } + } else { + push @hspData, $line; + } + } + } + +# print STDERR "\n--------> Done building HSPs for ${\$self->name}\n"; + + $self->{'_length'} or $self->throw( "Can't determine hit sequence length."); + + # Adjust logical length based on BLAST flavor. + if($_prog =~ /TBLAST[NX]/) { + $self->{'_logical_length'} = $self->{'_length'} / 3; + } + + # Handling errors as done in Blast.pm. (as of version 0.073) + + if(@errs) { + my ($str); + # When there are many errors, in most of the cases, they are + # caused by the same problem. Only need to see full data for + # the first one. + if(@errs > 2) { + $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]"; + $self->clear_err(); # clearing the existing set of errors. + # Not necessary, unless the -RECORD_ERR =>1 + # constructor option was used for Blast object. + } else { + $str = join("\n",@errs); + } + + if( not scalar @hspList) { + $self->throw("Failed to create any HSP objects for $hspCount potential HSP(s).", + "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" + ); + } else { + $self->warn(sprintf("Could not create HSP objects for %d HSP(s): %s", scalar(@errs), join(', ',@bad_names)), + "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" + ); + } + + } else { + $self->{'_hsps'} = \@hspList; + } +} + +=head2 _set_desc + + Usage : n/a; called automatically by _set_hsps() + Purpose : Sets the description of the hit sequence. + : For sequence without descriptions, sets description to "-". + Argument : Array containing description (multiple lines). + Comments : _set_hsps() calls this method with the data from the + : HSP alignment listing, which contains the complete description. + : (Formerly, this was called from the _set_desc_data() method initially.) + +See Also : _set_hsps() + +=cut + +#-------------- +sub _set_desc { +#-------------- + my( $self, @desc ) = @_; + my( $desc); + +# print "$ID: RAW DESC:\n@desc";; + + $desc = join(" ", @desc); + + if($desc) { + $desc =~ s/^\s*\S+\s+//; # remove the sequence ID(s) + $desc =~ s/^[\s!]+//; + $desc =~ s/ \d+$//; + $desc =~ s/\.+$//; + $self->{'_desc'} = $desc || '-'; + } else { + $self->{'_desc'} = '-'; + } + +# print "$ID: _set_desc = $desc";; +} + + +=head2 _tile_hsps + + Usage : n/a; called automatically during object construction or + : as needed by methods that rely on having tiled data. + Purpose : Collect statistics about the aligned sequences in a set of HSPs. + : Calculates the following data across all HSPs: + : -- total alignment length + : -- total identical residues + : -- total conserved residues + Returns : n/a + Argument : n/a + Throws : n/a + Status : Experimental + Comments : + : This method performs more careful summing of data across + : all HSPs in the Sbjct object. Simply summing the data from all HSPs + : will overestimate the actual length of the alignment if there is + : overlap between different HSPs (often the case). + : The strategy is to tile the HSPs and sum over the + : contigs, collecting data separately from overlapping and + : non-overlapping regions of each HSP. To facilitate this, the + : HSP.pm object now permits extraction of data from sub-sections + : of an HSP. + : + : Additional useful information is collected from the results + : of the tiling. It is possible that sub-sequences in + : different HSPs will overlap significantly. In this case, it + : is impossible to create a single unambiguous alignment by + : concatenating the HSPs. The ambiguity may indicate the + : presence of multiple, similar domains in one or both of the + : aligned sequences. This ambiguity is recorded using the + : ambiguous_aln() method. + : + : This method does not attempt to discern biologically + : significant vs. insignificant overlaps. The allowable amount of + : overlap can be set with the overlap() method or with the -OVERLAP + : parameter used when constructing the Blast & Sbjct objects. + : + : For a given hit, both the query and the sbjct sequences are + : tiled independently. + : + : -- If only query sequence HSPs overlap, + : this may suggest multiple domains in the sbjct. + : -- If only sbjct sequence HSPs overlap, + : this may suggest multiple domains in the query. + : -- If both query & sbjct sequence HSPs overlap, + : this suggests multiple domains in both. + : -- If neither query & sbjct sequence HSPs overlap, + : this suggests either no multiple domains in either + : sequence OR that both sequences have the same + : distribution of multiple similar domains. + : + : This method can deal with the special case of when multiple + : HSPs exactly overlap. + : + : Efficiency concerns: + : Speed will be an issue for sequences with numerous HSPs. + : + Bugs : Currently, _tile_hsps() does not properly account for + : the number of non-tiled but overlapping HSPs, which becomes a problem + : as overlap() grows. Large values overlap() may thus lead to + : incorrect statistics for some hits. For best results, keep overlap() + : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and + : Ambiguous Alignments" section. + +See Also : L<_adjust_contigs()|_adjust_contigs>, L, L, L, L, L, L, L, L, L + +=cut + +#-------------- +sub _tile_hsps { +#-------------- + my $self = shift; +# my $gapped = $self->parent->gapped || 0; # no special treatment + + $self->{'_tile_hsps'} = 1; + $self->{'_gaps_query'} = 0; + $self->{'_gaps_sbjct'} = 0; + + ## Simple summation scheme. Valid if there is only one HSP. + if((defined($self->{'_n'}) and $self->{'_n'} == 1) or $self->num_hsps == 1) { + my $hsp = $self->hsp; + $self->{'_length_aln_query'} = $hsp->length('query'); + $self->{'_length_aln_sbjct'} = $hsp->length('sbjct'); + $self->{'_length_aln_total'} = $hsp->length('total'); + ($self->{'_totalIdentical'},$self->{'_totalConserved'}) = $hsp->matches(); + $self->{'_gaps_query'} = $hsp->gaps('query'); + $self->{'_gaps_sbjct'} = $hsp->gaps('sbjct'); + +# print "_tile_hsps(): single HSP, easy stats.\n"; + return; + } else { +# print STDERR "$ID: _tile_hsps: summing multiple HSPs\n"; + $self->{'_length_aln_query'} = 0; + $self->{'_length_aln_sbjct'} = 0; + $self->{'_length_aln_total'} = 0; + $self->{'_totalIdentical'} = 0; + $self->{'_totalConserved'} = 0; + } + + ## More than one HSP. Must tile HSPs. +# printf "\nTiling HSPs for %s (BLAST: %s)\n",$self->name, $self->parent->name; + my($hsp, $qstart, $qstop, $sstart, $sstop); + my(@qcontigs, @scontigs); + my $qoverlap = 0; + my $soverlap = 0; + my $max_overlap = $self->{'_overlap'}; + + foreach $hsp ($self->hsps()) { +# printf " HSP: %s\n%s\n",$hsp->name, $hsp->str('query'); +# printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), $hsp->length(-TYPE=>'cons'), $hsp->length(-TYPE=>'cons',-START=>0,-STOP=>10); ; + ($qstart, $qstop) = $hsp->range('query'); + ($sstart, $sstop) = $hsp->range('sbjct'); + + my ($qgaps, $sgaps) = $hsp->gaps(); + $self->{'_gaps_query'} += $qgaps; + $self->{'_gaps_sbjct'} += $sgaps; + + $self->{'_length_aln_total'} += $hsp->length; + ## Collect contigs in the query sequence. + $qoverlap = &_adjust_contigs('query', $hsp, $qstart, $qstop, \@qcontigs, $max_overlap); + + ## Collect contigs in the sbjct sequence (needed for domain data and gapped Blast). + $soverlap = &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, \@scontigs, $max_overlap); + + ## Collect overall start and stop data for query and sbjct over all HSPs. + if(not defined $self->{'_queryStart'}) { + $self->{'_queryStart'} = $qstart; + $self->{'_queryStop'} = $qstop; + $self->{'_sbjctStart'} = $sstart; + $self->{'_sbjctStop'} = $sstop; + } else { + $self->{'_queryStart'} = ($qstart < $self->{'_queryStart'} ? $qstart : $self->{'_queryStart'}); + $self->{'_queryStop'} = ($qstop > $self->{'_queryStop'} ? $qstop : $self->{'_queryStop'}); + $self->{'_sbjctStart'} = ($sstart < $self->{'_sbjctStart'} ? $sstart : $self->{'_sbjctStart'}); + $self->{'_sbjctStop'} = ($sstop > $self->{'_sbjctStop'} ? $sstop : $self->{'_sbjctStop'}); + } + } + + ## Collect data across the collected contigs. + +# print "\nQUERY CONTIGS:\n"; +# print " gaps = $self->{'_gaps_query'}\n"; + + foreach(@qcontigs) { +# print " query contig: $_->{'start'} - $_->{'stop'}\n"; +# print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; + $self->{'_length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1; + $self->{'_totalIdentical'} += $_->{'iden'}; + $self->{'_totalConserved'} += $_->{'cons'}; + } + + ## Collect data for sbjct contigs. Important for gapped Blast. + ## The totalIdentical and totalConserved numbers will be the same + ## as determined for the query contigs. + +# print "\nSBJCT CONTIGS:\n"; +# print " gaps = $self->{'_gaps_sbjct'}\n"; + + foreach(@scontigs) { +# print " sbjct contig: $_->{'start'} - $_->{'stop'}\n"; +# print " iden = $_->{'iden'}; cons = $_->{'cons'}\n"; + $self->{'_length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1; + } +# ; + + if($qoverlap) { + if($soverlap) { $self->ambiguous_aln('qs'); +# print "\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n"; + } + else { $self->ambiguous_aln('q'); +# print "\n*** AMBIGUOUS ALIGNMENT: Query\n\n"; + } + } elsif($soverlap) { + $self->ambiguous_aln('s'); +# print "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n"; + } + + # Adjust length based on BLAST flavor. + my $prog = $self->parent->program; + if($prog eq 'TBLASTN') { + $self->{'_length_aln_sbjct'} /= 3; + } elsif($prog eq 'BLASTX' ) { + $self->{'_length_aln_query'} /= 3; + } elsif($prog eq 'TBLASTX') { + $self->{'_length_aln_query'} /= 3; + $self->{'_length_aln_sbjct'} /= 3; + } +} + + + +=head2 _adjust_contigs + + Usage : n/a; called automatically during object construction. + Purpose : Builds HSP contigs for a given BLAST hit. + : Utility method called by _tile_hsps() + Returns : + Argument : + Throws : Exceptions propagated from Bio::Tools::Blast::HSP::matches() + : for invalid sub-sequence ranges. + Status : Experimental + Comments : This method does not currently support gapped alignments. + : Also, it does not keep track of the number of HSPs that + : overlap within the amount specified by overlap(). + : This will lead to significant tracking errors for large + : overlap values. + +See Also : L, L<_tile_hsps()|_tile_hsps>, B::matches + +=cut + +#------------------- +sub _adjust_contigs { +#------------------- + my ($seqType, $hsp, $start, $stop, $contigs_ref, $max_overlap) = @_; + + my $overlap = 0; + my ($numID, $numCons); + +# print "Testing $seqType data: HSP (${\$hsp->name}); $start, $stop\n"; + foreach(@$contigs_ref) { +# print " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n"; + + ## Test special case of a nested HSP. Skip it. + if($start >= $_->{'start'} and $stop <= $_->{'stop'}) { +# print "----> Nested HSP. Skipping.\n"; + $overlap = 1; + next; + } + + ## Test for overlap at beginning of contig. + if($start < $_->{'start'} and $stop > ($_->{'start'} + $max_overlap)) { +# print "----> Overlaps beg: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n";; + # Collect stats over the non-overlapping region. + eval { + ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType, + -START =>$start, + -STOP =>$_->{'start'}-1); + }; + if($@) { warn "\a\n$@\n"; } + else { + $_->{'start'} = $start; # Assign a new start coordinate to the contig + $_->{'iden'} += $numID; # and add new data to #identical, #conserved. + $_->{'cons'} += $numCons; + $overlap = 1; + } + } + + ## Test for overlap at end of contig. + if($stop > $_->{'stop'} and $start < ($_->{'stop'} - $max_overlap)) { +# print "----> Overlaps end: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n";; + # Collect stats over the non-overlapping region. + eval { + ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType, + -START =>$_->{'stop'}, + -STOP =>$stop); + }; + if($@) { warn "\a\n$@\n"; } + else { + $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig + $_->{'iden'} += $numID; # and add new data to #identical, #conserved. + $_->{'cons'} += $numCons; + $overlap = 1; + } + } + $overlap && do { +# print " New Contig data:\n"; +# print " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n"; + last; + }; + } + ## If there is no overlap, add the complete HSP data. + !$overlap && do { +# print "No overlap. Adding new contig.\n"; + ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType); + push @$contigs_ref, {'start'=>$start, 'stop'=>$stop, + 'iden'=>$numID, 'cons'=>$numCons }; + }; +# ; + $overlap; +} + + +=head2 ambiguous_aln + + Usage : $ambig_code = $sbjct_object->ambiguous_aln(); + Purpose : Sets/Gets ambiguity code data member. + Example : (see usage) + Returns : String = 'q', 's', 'qs', '-' + : 'q' = query sequence contains overlapping sub-sequences + : while sbjct does not. + : 's' = sbjct sequence contains overlapping sub-sequences + : while query does not. + : 'qs' = query and sbjct sequence contains overlapping sub-sequences + : relative to each other. + : '-' = query and sbjct sequence do not contains multiple domains + : relative to each other OR both contain the same distribution + : of similar domains. + Argument : n/a + Throws : n/a + Status : Experimental + +See Also : L<_tile_hsps()|_tile_hsps>, L + +=cut + +#-------------------- +sub ambiguous_aln { +#-------------------- + my $self = shift; + if(@_) { $self->{'_ambiguous_aln'} = shift; } + $self->{'_ambiguous_aln'} || '-'; +} + + + +=head2 overlap + + Usage : $blast_object->overlap( [integer] ); + Purpose : Gets/Sets the allowable amount overlap between different HSP sequences. + Example : $blast_object->overlap(5); + : $overlap = $blast_object->overlap(); + Returns : Integer. + Argument : integer. + Throws : n/a + Status : Experimental + Comments : Any two HSPs whose sequences overlap by less than or equal + : to the overlap() number of resides will be considered separate HSPs + : and will not get tiled by _adjust_contigs(). + +See Also : L<_adjust_contigs()|_adjust_contigs>, L + +=cut + +#------------- +sub overlap { +#------------- + my $self = shift; + if(@_) { $self->{'_overlap'} = shift; } + defined $self->{'_overlap'} ? $self->{'_overlap'} : 0; +} + + + + +=head2 score + + Usage : $sbjct_object->score(); + Purpose : Gets the BLAST score of the best HSP for the current Blast hit. + Example : $score = $sbjct_object->score(); + Returns : Integer + Argument : n/a + Throws : n/a + +See Also : L + +=cut + +#---------- +sub score { +#---------- + my $self = shift; + + # The check for $self->{'_score'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($score); + if(not defined($self->{'_score'})) { + $score = $self->hsp->score; + } else { + $score = $self->{'_score'}; + } + return $score; +} + + + +=head2 bits + + Usage : $sbjct_object->bits(); + Purpose : Gets the BLAST bit score of the best HSP for the current Blast hit. + Example : $bits = $sbjct_object->bits(); + Returns : Integer + Argument : n/a + Throws : Exception if bit score is not set. + Comments : For BLAST1, the non-bit score is listed in the summary line. + +See Also : L + +=cut + +#--------- +sub bits { +#--------- + my $self = shift; + + # The check for $self->{'_bits'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($bits); + if(not defined($self->{'_bits'})) { + $bits = $self->hsp->bits; + } else { + $bits = $self->{'_bits'}; + } + return $bits; +} + + + +=head2 n + + Usage : $sbjct_object->n(); + Purpose : Gets the N number for the current Blast hit. + : This is the number of HSPs in the set which was ascribed + : the lowest P-value (listed on the description line). + : This number is not the same as the total number of HSPs. + : To get the total number of HSPs, use num_hsps(). + Example : $n = $sbjct_object->n(); + Returns : Integer + Argument : n/a + Throws : Exception if HSPs have not been set (BLAST2 reports). + Comments : Note that the N parameter is not reported in gapped BLAST2. + : Calling n() on such reports will result in a call to num_hsps(). + : The num_hsps() method will count the actual number of + : HSPs in the alignment listing, which may exceed N in + : some cases. + +See Also : L + +=cut + +#----- +sub n { +#----- + my $self = shift; + + # The check for $self->{'_n'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($n); + if(not defined($self->{'_n'})) { + $n = $self->hsp->n; + } else { + $n = $self->{'_n'}; + } + $n ||= $self->num_hsps; + + return $n; +} + + + +=head2 frame + + Usage : $sbjct_object->frame(); + Purpose : Gets the reading frame for the hit sequence (TBLASTN/X only). + Example : $frame = $sbjct_object->frame(); + Returns : Integer (-3 .. +3). + Argument : n/a + Throws : Exception if HSPs have not been set (BLAST2 reports). + +See Also : L + +=cut + +#---------- +sub frame { +#---------- + my $self = shift; + + # The check for $self->{'_frame'} is a remnant from the 'query' mode days + # in which the sbjct object would collect data from the description line only. + + my ($frame); + if(not defined($self->{'_frame'})) { + $frame = $self->hsp->frame; + } else { + $frame = $self->{'_frame'}; + } + return $frame; +} + + + + + +=head2 p + + Usage : $sbjct_object->p( [format] ); + Purpose : Get the P-value for the given BLAST hit. + : (Note that P-values are not provided with NCBI Blast2 reports). + Example : $p = $sbjct->p; + : $p = $sbjct->p('exp'); # get exponent only. + : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts + Returns : Float or scientific notation number (the raw P-value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and P-value + : is in scientific notation (See Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (See Comments). + Throws : Exception if the P-value is not defined, which will occur + : with any NCBI Blast2 report. + Comments : Using the 'parts' argument is not recommended since it will not + : work as expected if the P-value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#-------- +sub p { +#-------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my ($val); + $fmt ||= $_signif_fmt; + + # $val can be zero. + if(not defined($val = $self->{'_p'})) { + ## P-value not defined, must be a NCBI Blast2 report. + my $note = ''; + if($self->parent->_layout() == 2) { + $note = "Blast2 does not report P-values. Use expect() instead."; + } + $self->throw("Can't get P-value: undefined.", $note); + } + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw P-value. + return $val; +} + + + +=head2 expect + + Usage : $sbjct_object->expect( [format] ); + Purpose : Get the Expect value for the given BLAST hit. + Example : $e = $sbjct->expect; + : $e = $sbjct->expect('exp'); # get exponent only. + : ($num, $exp) = $sbjct->expect('parts'); # split sci notation into parts + Returns : Float or scientific notation number (the raw expect value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and Expect + : is in scientific notation (see Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (see Comments). + Throws : Exception if the Expect value is not defined. + Comments : Using the 'parts' argument is not recommended since it will not + : work as expected if the expect value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#----------- +sub expect { +#----------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my $val = $self->{'_expect'}; + $fmt ||= $_signif_fmt; + + # $val can be zero. + defined($val) or $self->throw("Can't get Expect value: HSPs may not have been set."); + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw Expect-value. + return $val; +} + + + +=head2 signif + + Usage : $sbjct_object->signif( [format] ); + Purpose : Get the P or Expect value for the given BLAST hit. + : The value returned is the one which is reported in the description + : section of the Blast report. For Blast1 and WU-Blast2, this + : is a P-value, for Blast2, it is an Expect value. + Example : $obj->signif() # returns 1.3e-34 + : $obj->signif('exp') # returns -34 + : $obj->signif('parts') # returns (1.3, -34) + Returns : Float or scientific notation number (the raw P/Expect value, DEFAULT). + : Integer if format == 'exp' (the magnitude of the base 10 exponent). + : 2-element list (float, int) if format == 'parts' and P/Expect value + : is in scientific notation (see Comments). + Argument : format: string of 'raw' | 'exp' | 'parts' + : 'raw' returns value given in report. Default. (1.2e-34) + : 'exp' returns exponent value only (34) + : 'parts' returns the decimal and exponent as a + : 2-element list (1.2, -34) (see Comments). + Throws : n/a + Status : Deprecated. Use p() or expect(). + Comments : The signif() method provides a way to deal with the fact that + : Blast1 and Blast2 formats differ in what is reported in the + : description lines of each hit in the Blast report. The signif() + : method frees any client code from having to know if this is a P-value + : or an Expect value, making it easier to write code that can process + : both Blast1 and Blast2 reports. This is not necessarily a good thing, since + : one should always know when one is working with P-values or + : Expect values (hence the deprecated status). + : Use of expect() is recommended since all hits will have an Expect value. + : + : Using the 'parts' argument is not recommended since it will not + : work as expected if the expect value is not in scientific notation. + : That is, floats are not converted into sci notation before + : splitting into parts. + +See Also : L, L, L + +=cut + +#------------- +sub signif { +#------------- +# Some duplication of logic for p(), expect() and signif() for the sake of performance. + my ($self, $fmt) = @_; + + my $val = defined($self->{'_p'}) ? $self->{'_p'} : $self->{'_expect'}; + $fmt ||= $_signif_fmt; + + # $val can be zero. + defined($val) or $self->throw("Can't get P- or Expect value: HSPs may not have been set."); + + return $val if not $fmt or $fmt =~ /^raw/i; + ## Special formats: exponent-only or as list. + return &get_exponent($val) if $fmt =~ /^exp/i; + return (split (/eE/, $val)) if $fmt =~ /^parts/i; + + ## Default: return the raw P/Expect-value. + return $val; +} + + + +=head2 desc + + Usage : $sbjct_object->desc( [integer] ); + Purpose : Get the description for the given BLAST hit. + Example : (see usage) + Returns : String + Argument : Integer (optional) indicating the desired length of the + : description string to be returned. + Throws : n/a + +See Also : L<_set_desc()|_set_desc> + +=cut + +#--------- +sub desc { +#--------- + my( $self, $len ) = @_; + $len = (defined $len) ? $len : (CORE::length $self->{'_desc'}); + substr( $self->{'_desc'}, 0 ,$len ); +} + + + +=head2 database + + Usage : $sbjct_object->database(); + Purpose : Get the name of the database for the hit sequence. + Example : (see usage) + Returns : String + Argument : n/a + Throws : n/a + Status : Deprecated. Use Bio::Tools::Blast::database() + Extracting database name from the seq identifier is error prone. + Comments : Database id should be the same for all hits in a given + : BLAST report, however, they do not always have the same + : name as the database name extraced by the Blast.pm object. + : The Sbjct.pm database id is obtained from the summary line. + +=cut + +#-------------- +sub database { + my $self = shift; + $self->warn("Bio::Tools::Sbjct::database() is deprecated.\nNo useful information is provided by this method.\nUse Bio::Tools::Blast::database().\n"); + return $self->{'_db'}; +} +#-------------- + + + + +=head2 hsps + + Usage : $sbjct_object->hsps(); + Purpose : Get a list containing all HSP objects. + : Get the numbers of HSPs for the current hit. + Example : @hsps = $sbjct_object->hsps(); + : $num = $sbjct_object->hsps(); # alternatively, use num_hsps() + Returns : Array context : list of Bio::Tools::Blast::HSP.pm objects. + : Scalar context: integer (number of HSPs). + : (Equivalent to num_hsps()). + Argument : n/a. Relies on wantarray + Throws : Exception if the HSPs have not been collected. + +See Also : L, L, L<_set_hsps()|_set_hsps> + +=cut + +#--------- +sub hsps { +#--------- + my $self = shift; + + if (not ref $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + return wantarray + # returning list containing all HSPs. + ? @{$self->{'_hsps'}} + # returning number of HSPs. + : scalar(@{$self->{'_hsps'}}); +} + + + +=head2 hsp + + Usage : $sbjct_object->hsp( [string] ); + Purpose : Get a single HSP.pm object for the present Sbjct.pm object. + Example : $hspObj = $sbjct_object->hsp; # same as 'best' + : $hspObj = $sbjct_object->hsp('best'); + : $hspObj = $sbjct_object->hsp('worst'); + Returns : Object reference for a Bio::Tools::Blast::HSP.pm object. + Argument : String (or no argument). + : No argument (default) = highest scoring HSP (same as 'best'). + : 'best' or 'first' = highest scoring HSP. + : 'worst' or 'last' = lowest scoring HSP. + Throws : Exception if the HSPs have not been collected. + : Exception if an unrecognized argument is used. + +See Also : L, L, L<_set_hsps()|_set_hsps> + +=cut + +#---------- +sub hsp { +#---------- + my( $self, $option ) = @_; + $option ||= 'best'; + + if (not ref $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + my @hsps = @{$self->{'_hsps'}}; + + return $hsps[0] if $option =~ /best|first|1/i; + return $hsps[$#hsps] if $option =~ /worst|last/i; + + $self->throw("Can't get HSP for: $option", + "Valid arguments: 'best', 'worst'"); +} + + + +=head2 num_hsps + + Usage : $sbjct_object->num_hsps(); + Purpose : Get the number of HSPs for the present Blast hit. + Example : $nhsps = $sbjct_object->num_hsps(); + Returns : Integer + Argument : n/a + Throws : Exception if the HSPs have not been collected. + +See Also : L + +=cut + +#------------- +sub num_hsps { +#------------- + my $self = shift; + + if (not defined $self->{'_hsps'}) { + $self->throw("Can't get HSPs: data not collected."); + } + + return scalar(@{$self->{'_hsps'}}); +} + + + +=head2 length + + Usage : $sbjct_object->length(); + Purpose : Get the total length of the hit sequence. + Example : $len = $sbjct_object->length(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : Developer note: when using the built-in length function within + : this module, call it as CORE::length(). + +See Also : L, L + +=cut + +#----------- +sub length { +#----------- + my $self = shift; + $self->{'_length'}; +} + + +=head2 logical_length + + Usage : $sbjct_object->logical_length( [seq_type] ); + : (mostly intended for internal use). + Purpose : Get the logical length of the hit sequence. + : If the Blast is a TBLASTN or TBLASTX, the returned length + : is the length of the would-be amino acid sequence (length/3). + : For all other BLAST flavors, this function is the same as length(). + Example : $len = $sbjct_object->logical_length(); + Returns : Integer + Argument : seq_type = 'query' or 'sbjct' (default = 'query') + Throws : n/a + Comments : This is important for functions like frac_aligned_query() + : which need to operate in amino acid coordinate space when dealing + : with [T]BLAST[NX] type reports. + +See Also : L, L, L + +=cut + +#-------------------- +sub logical_length { +#-------------------- + my $self = shift; + my $seqType = shift || 'query'; + + # Return logical sbjct length + $seqType eq 'sbjct' and return + $self->{'_logical_length'} || $self->{'_length'}; + + # Otherwise, return logical query length + my $qlen = $self->parent->length; + + # Adjust length based on BLAST flavor. + my $prog = $self->parent->program; + if($prog =~ /T?BLASTX/ ) { + $qlen /= 3; + } + return $qlen; +} + + + +=head2 length_aln + + Usage : $sbjct_object->length_aln( [seq_type] ); + Purpose : Get the total length of the aligned region for query or sbjct seq. + : This number will include all HSPs + Example : $len = $sbjct_object->length_aln(); # default = query + : $lenAln = $sbjct_object->length_aln('query'); + Returns : Integer + Argument : seq_Type = 'query' | 'sbjct' (Default = 'query') + Throws : Exception if the argument is not recognized. + Comments : This method will report the logical length of the alignment, + : meaning that for TBLAST[NX] reports, the length is reported + : using amino acid coordinate space (i.e., nucleotides / 3). + : + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + : If you don't want the tiled data, iterate through each HSP + : calling length() on each (use hsps() to get the HSPs). + +See Also : L, L, L, L, L<_tile_hsps()|_tile_hsps>, B + +=cut + +#---------------' +sub length_aln { +#--------------- + my( $self, $type ) = @_; + + $type ||= 'query'; + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + my $data = $self->{'_length_aln_'.$type}; + + ## If we don't have data, figure out what went wrong. + if(!$data) { + $self->throw("Can't get length aln for sequence type \"$type\"", + "Valid types are 'query', 'sbjct'"); + } + $data; +} + + +=head2 gaps + + Usage : $sbjct_object->gaps( [seq_type] ); + Purpose : Get the number of gaps in the aligned query, sbjct, or both sequences. + : Data is summed across all HSPs. + Example : $qgaps = $sbjct_object->gaps('query'); + : $sgaps = $sbjct_object->gaps('sbjct'); + : $tgaps = $sbjct_object->gaps(); # default = total (query + sbjct) + Returns : scalar context: integer + : array context without args: two-element list of integers + : (queryGaps, sbjctGaps) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : seq_type: 'query' | 'sbjct' | 'total' | 'list' (default = 'total') + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through each HSP object. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + : Not relying on wantarray since that will fail in situations + : such as printf "%d", $hit->gaps() in which you might expect to + : be printing the total gaps, but evaluates to array context. + +See Also : L + +=cut + +#---------- +sub gaps { +#---------- + my( $self, $seqType ) = @_; + + $seqType ||= (wantarray ? 'list' : 'total'); + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + $seqType = lc($seqType); + + if($seqType =~ /list|array/i) { + return ($self->{'_gaps_query'}, $self->{'_gaps_sbjct'}); + } + + if($seqType eq 'total') { + return ($self->{'_gaps_query'} + $self->{'_gaps_sbjct'}) || 0; + } else { + return $self->{'_gaps_'.$seqType} || 0; + } +} + + + +=head2 matches + + Usage : $sbjct_object->matches( [class] ); + Purpose : Get the total number of identical or conserved matches + : (or both) across all HSPs. + : (Note: 'conservative' matches are indicated as 'positives' + : in the Blast report.) + Example : ($id,$cons) = $sbjct_object->matches(); # no argument + : $id = $sbjct_object->matches('id'); + : $cons = $sbjct_object->matches('cons'); + Returns : Integer or a 2-element array of integers + Argument : class = 'id' | 'cons' OR none. + : If no argument is provided, both identical and conservative + : numbers are returned in a two element list. + : (Other terms can be used to refer to the conservative + : matches, e.g., 'positive'. All that is checked is whether or + : not the supplied string starts with 'id'. If not, the + : conservative matches are returned.) + Throws : Exception if the requested data cannot be obtained. + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : Does not rely on wantarray to return a list. Only checks for + : the presence of an argument (no arg = return list). + +See Also : B, L + +=cut + +#--------------- +sub matches { +#--------------- + my( $self, $arg) = @_; + my(@data,$data); + + if(!$arg) { + @data = ($self->{'_totalIdentical'}, $self->{'_totalConserved'}); + + return @data if @data; + + } else { + + if($arg =~ /^id/i) { + $data = $self->{'_totalIdentical'}; + } else { + $data = $self->{'_totalConserved'}; + } + return $data if $data; + } + + ## Something went wrong if we make it to here. + $self->throw("Can't get identical or conserved data: no data."); +} + + +=head2 start + + Usage : $sbjct->start( [seq_type] ); + Purpose : Gets the start coordinate for the query, sbjct, or both sequences + : in the Sbjct object. If there is more than one HSP, the lowest start + : value of all HSPs is returned. + Example : $qbeg = $sbjct->start('query'); + : $sbeg = $sbjct->start('sbjct'); + : ($qbeg, $sbeg) = $sbjct->start(); + Returns : scalar context: integer + : array context without args: list of two integers (queryStart, sbjctStart) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'sbjct' + : (case insensitive). If not supplied, 'query' is used. + Throws : n/a + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first. + : Remember that the start and end coordinates of all HSPs are + : normalized so that start < end. Strand information can only be + : obtained on an HSP-by-HSP basis by calling $hsp->strand(). + +See Also : L, L, L, B() + +=cut + +#---------- +sub start { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->start($seqType); + } else { + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + if($seqType =~ /list|array/i) { + return ($self->{'_queryStart'}, $self->{'_sbjctStart'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Start'}; + } + } +} + + +=head2 end + + Usage : $sbjct->end( [seq_type] ); + Purpose : Gets the end coordinate for the query, sbjct, or both sequences + : in the Sbjct object. If there is more than one HSP, the largest end + : value of all HSPs is returned. + Example : $qend = $sbjct->end('query'); + : $send = $sbjct->end('sbjct'); + : ($qend, $send) = $sbjct->end(); + Returns : scalar context: integer + : array context without args: list of two integers (queryEnd, sbjctEnd) + : Array context can be "induced" by providing an argument of 'list' or 'array'. + Argument : In scalar context: seq_type = 'query' or 'sbjct' + : (case insensitive). If not supplied, 'query' is used. + Throws : n/a + Comments : This method requires that all HSPs be tiled. If there is more than one + : HSP and they have not already been tiled, they will be tiled first. + : Remember that the start and end coordinates of all HSPs are + : normalized so that start < end. Strand information can only be + : obtained on an HSP-by-HSP basis by calling $hsp->strand(). + +See Also : L, L, L, B() + +=cut + +#---------- +sub end { +#---------- + my ($self, $seqType) = @_; + + $seqType ||= (wantarray ? 'list' : 'query'); + + # If there is only one HSP, defer this call to the solitary HSP. + if($self->num_hsps == 1) { + return $self->hsp->end($seqType); + } else { + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + if($seqType =~ /list|array/i) { + return ($self->{'_queryStop'}, $self->{'_sbjctStop'}); + } else { + ## Sensitive to member name changes. + $seqType = "_\L$seqType\E"; + return $self->{$seqType.'Stop'}; + } + } +} + +=head2 range + + Usage : $sbjct->range( [seq_type] ); + Purpose : Gets the (start, end) coordinates for the query or sbjct sequence + : in the HSP alignment. + Example : ($qbeg, $qend) = $sbjct->range('query'); + : ($sbeg, $send) = $sbjct->range('sbjct'); + Returns : Two-element array of integers + Argument : seq_type = string, 'query' or 'sbjct' (default = 'query') + : (case insensitive). + Throws : n/a + +See Also : L, L + +=cut + +#---------- +sub range { +#---------- + my ($self, $seqType) = @_; + $seqType ||= 'query'; + return ($self->start($seqType), $self->end($seqType)); +} + + +=head2 frac_identical + + Usage : $sbjct_object->frac_identical( [seq_type] ); + Purpose : Get the overall fraction of identical positions across all HSPs. + : The number refers to only the aligned regions and does not + : account for unaligned regions in between the HSPs, if any. + Example : $frac_iden = $sbjct_object->frac_identical('query'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'sbjct' | 'total' + : default = 'total' (but see comments below). + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : BLAST-GP uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : Therefore, when called without an argument or an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. + : + : To get the fraction identical among only the aligned residues, + : ignoring the gaps, call this method with an argument of 'query' + : or 'sbjct'. + : + : If you need data for each HSP, use hsps() and then iterate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + +See Also : L, L, L, L<_tile_hsps()|_tile_hsps> + +=cut + +#------------------ +sub frac_identical { +#------------------ + my ($self, $seqType) = @_; + $seqType ||= 'total'; + + ## Sensitive to member name format. + $seqType = lc($seqType); + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_totalIdentical'}/$self->{'_length_aln_'.$seqType}); +} + + + +=head2 frac_conserved + + Usage : $sbjct_object->frac_conserved( [seq_type] ); + Purpose : Get the overall fraction of conserved positions across all HSPs. + : The number refers to only the aligned regions and does not + : account for unaligned regions in between the HSPs, if any. + Example : $frac_cons = $sbjct_object->frac_conserved('sbjct'); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : seq_type: 'query' | 'sbjct' | 'total' + : default = 'total' (but see comments below). + Throws : n/a + Comments : Different versions of Blast report different values for the total + : length of the alignment. This is the number reported in the + : denominators in the stats section: + : "Identical = 34/120 Positives = 67/120". + : BLAST-GP uses the total length of the alignment (with gaps) + : WU-BLAST uses the length of the query sequence (without gaps). + : Therefore, when called without an argument or an argument of 'total', + : this method will report different values depending on the + : version of BLAST used. + : + : To get the fraction conserved among only the aligned residues, + : ignoring the gaps, call this method with an argument of 'query' + : or 'sbjct'. + : + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + +See Also : L, L, L<_tile_hsps()|_tile_hsps> + +=cut + +#-------------------- +sub frac_conserved { +#-------------------- + my ($self, $seqType) = @_; + $seqType ||= 'total'; + + ## Sensitive to member name format. + $seqType = lc($seqType); + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_totalConserved'}/$self->{'_length_aln_'.$seqType}); +} + + + + +=head2 frac_aligned_query + + Usage : $sbjct_object->frac_aligned_query(); + Purpose : Get the fraction of the query sequence which has been aligned + : across all HSPs (not including intervals between non-overlapping + : HSPs). + Example : $frac_alnq = $sbjct_object->frac_aligned_query(); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : n/a + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : To compute the fraction aligned, the logical length of the query + : sequence is used, meaning that for [T]BLASTX reports, the + : full length of the query sequence is converted into amino acids + : by dividing by 3. This is necessary because of the way + : the lengths of aligned sequences are computed. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + +See Also : L, L<_tile_hsps()|_tile_hsps>, L, L + +=cut + +#---------------------- +sub frac_aligned_query { +#---------------------- + my $self = shift; + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_length_aln_query'}/$self->logical_length('query')); +} + + + +=head2 frac_aligned_hit + + Usage : $sbjct_object->frac_aligned_hit(); + Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned + : across all HSPs (not including intervals between non-overlapping + : HSPs). + Example : $frac_alnq = $sbjct_object->frac_aligned_hit(); + Returns : Float (2-decimal precision, e.g., 0.75). + Argument : n/a + Throws : n/a + Comments : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : To compute the fraction aligned, the logical length of the sbjct + : sequence is used, meaning that for TBLAST[NX] reports, the + : full length of the sbjct sequence is converted into amino acids + : by dividing by 3. This is necessary because of the way + : the lengths of aligned sequences are computed. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + +See Also : L, L, L<_tile_hsps()|_tile_hsps>, L, L + +=cut + +#-------------------- +sub frac_aligned_hit { +#-------------------- + my $self = shift; + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + sprintf( "%.2f", $self->{'_length_aln_sbjct'}/$self->logical_length('sbjct')); +} + +# Safety-net methods for those who try don't read or remember the API. +# Redirecting to the proper method. These 'sbjct' versions may be more +# consistent with the API of this module since there are numerous other +# instances of using 'sbjct' in arguments. However, 'sbjct' is a bit tech-ee. + +#----------------------- +sub frac_aligned_sbjct { my $self=shift; $self->frac_aligned_hit(@_); } +#----------------------- +sub num_unaligned_sbjct { my $self=shift; $self->num_unaligned_hit(@_); } +#----------------------- + + + +=head2 num_unaligned_hit + + Usage : $sbjct_object->num_unaligned_hit(); + Purpose : Get the number of the unaligned residues in the hit sequence. + : Sums across all all HSPs. + Example : $num_unaln = $sbjct_object->num_unaligned_hit(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : See notes regarding logical lengths in the comments for frac_aligned_hit(). + : They apply here as well. + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + +See Also : L, L<_tile_hsps()|_tile_hsps>, L + +=cut + +#--------------------- +sub num_unaligned_hit { +#--------------------- + my $self = shift; + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + my $num = $self->logical_length('sbjct') - $self->{'_length_aln_sbjct'}; + ($num < 0 ? 0 : $num ); +} + + +=head2 num_unaligned_query + + Usage : $sbjct_object->num_unaligned_query(); + Purpose : Get the number of the unaligned residues in the query sequence. + : Sums across all all HSPs. + Example : $num_unaln = $sbjct_object->num_unaligned_query(); + Returns : Integer + Argument : n/a + Throws : n/a + Comments : See notes regarding logical lengths in the comments for frac_aligned_query(). + : They apply here as well. + : If you need data for each HSP, use hsps() and then interate + : through the HSP objects. + : This method requires that all HSPs be tiled. If they have not + : already been tiled, they will be tiled first. + +See Also : L, L<_tile_hsps()|_tile_hsps>, L + +=cut + +#----------------------- +sub num_unaligned_query { +#----------------------- + my $self = shift; + + $self->_tile_hsps() if not $self->{'_tile_hsps'}; + + my $num = $self->logical_length('query') - $self->{'_length_aln_query'}; + ($num < 0 ? 0 : $num ); +} + + + +=head2 seq_inds + + Usage : $hit->seq_inds( seq_type, class, collapse ); + Purpose : Get a list of residue positions (indices) across all HSPs + : for identical or conserved residues in the query or sbjct sequence. + Example : @ind = $hit->seq_inds('query', 'identical'); + : @ind = $hit->seq_inds('sbjct', 'conserved'); + : @ind = $hit->seq_inds('sbjct', 'conserved', 1); + Returns : Array of integers + : May include ranges if collapse is non-zero. + Argument : seq_type = 'query' or 'sbjct' (default = query) + : class = 'identical' or 'conserved' (default = identical) + : (can be shortened to 'id' or 'cons') + : (actually, anything not 'id' will evaluate to 'conserved'). + : collapse = boolean, if non-zero, consecutive positions are merged + : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" + : collapses to "1-5 7 9-11". This is useful for + : consolidating long lists. Default = no collapse. + Throws : n/a. + +See Also : B + +=cut + +#------------- +sub seq_inds { +#------------- + my ($self, $seq, $class, $collapse) = @_; + + $seq ||= 'query'; + $class ||= 'identical'; + $collapse ||= 0; + + my (@inds, $hsp); + foreach $hsp ($self->hsps) { + # This will merge data for all HSPs together. + push @inds, $hsp->seq_inds($seq, $class); + } + + # Need to remove duplicates and sort the merged positions. + if(@inds) { + my %tmp = map { $_, 1 } @inds; + @inds = sort {$a <=> $b} keys %tmp; + } + + require Bio::Tools::Blast::HSP; + + $collapse ? &Bio::Tools::Blast::HSP::collapse_nums(@inds) : @inds; +} + + +##################################################################################### +## INSTANCE METHODS ## +##################################################################################### + + +=head2 display + + Usage : $sbjct_object->display( %named_parameters ); + Purpose : Display information about Bio::Tools::Blast::Sbjct.pm data members + Example : $object->display(-SHOW=>'stats'); + Argument : Named parameters: -SHOW => 'hsp', + : -WHERE => filehandle (default = STDOUT) + Returns : n/a + Status : Deprecated, Buggy. + : Use Blast::table() or Blast::table_tiled() instead. + +See Also : L<_display_stats()|_display_stats>, L<_display_hsps()|_display_hsps>, B::display + +=cut + +#------------ +sub display { +#------------ + my( $self, %param) = @_; + + $param{-HEADER} = 0; + $self->SUPER::display(%param); + + $self->show =~ /hsp/i and $self->_display_hsps( %param); +} + + +=head2 _display_stats + + Usage : n/a; called automatically by display() + Purpose : Display information about Bio::Tools::Blast.pm data members. + : Not tab-delimited. + : Prints the rank, name, database, score, p, n, length + : of the hit sequence, length of the aligned region, + : fraction identical, fraction conserved, and the fraction aligned + : for both the query and hit sequences. + Example : n/a + Argument : one argument = filehandle object. + Returns : printf call. + Status : Deprecated, Buggy. + : Use Blast::table() or Blast::table_tiled() instead. + +See Also : L + +=cut + +#------------------- +sub _display_stats { +#------------------- + my( $self, $OUT) = @_; + my $layout = $self->parent->_layout(); + + if($layout == 1) { + printf( $OUT "%-3d %-20s %-11s %-5d %-5d %-9.1e %-9.1e %-4d %-3d %-5d %-5d %-5s %-6.2f %-6.2f %-4d(%.2f) %-4d(%.2f)\n", + $self->rank(), $self->name(), + ($self->database() || 'UNKNOWN DB') , + $self->score(),$self->bits(),$self->p(),$self->expect(), + $self->gaps(), $self->n(), + $self->length(), $self->length_aln('query'), + $self->ambiguous_aln(), + $self->frac_aligned_query, $self->frac_aligned_hit, + $self->matches('iden'), $self->frac_identical('query'), + $self->matches('cons'), $self->frac_conserved('query')); + } else { + printf( $OUT "%-3d %-20s %-11s %-5d %-5d %-9.1e %-4d %-3d %-5d %-5d %-5s %-6.2f %-6.2f %-4d(%.2f) %-4d(%.2f)\n", + $self->rank(), $self->name(), + ($self->database() || 'UNKNOWN DB'), + $self->score(),$self->bits(),$self->expect(), + $self->gaps(), $self->num_hsps, + $self->length(), $self->length_aln('query'), + $self->ambiguous_aln(), + $self->frac_aligned_query, $self->frac_aligned_hit, + $self->matches('iden'), $self->frac_identical('query'), + $self->matches('cons'), $self->frac_conserved('query') ); + } + +} + + +=head2 _display_hsps + + Usage : n/a; called automatically by display() + Purpose : Display information about each HSP in the current BLAST hit. + Example : n/a + Argument : one argument = filehandle object. + Returns : printf call. + Status : Experimental + +See Also : L, B + +=cut + +#---------------- +sub _display_hsps { +#---------------- + my( $self, %param) = @_; + my $OUT = $self->fh(); + my $hspCount = 0; + my $reply = undef; + + not defined $self->{'_hsps'} and do{ print $OUT "\nHSP data not loaded.\n\n"; return; }; + +# print $OUT "\n",$self->num_hsps, " HSPs\n\n"; + + my($hsp); + foreach $hsp ( $self->hsps() ) { + $hspCount++; + print $OUT "\n ", '-'x25, "< HSP #$hspCount >", '-'x25, "\n"; + + $hsp->display( %param ); + + if( $hspCount < $self->num_hsps ) { + print "\n\n\t--------> FOR NEXT HSP, q TO QUIT <--------\n"; + chop( $reply = ); + $reply =~ /^q/i and return; + } + } +} + + + +=head2 homol_data + + Usage : $data = $sbjct_object->homo_data( %named_params ); + Purpose : Gets specific similarity data about all HSPs. + Returns : String + Argument : named parameters forwarded to Bio::Tools::Blast::HSP::homol_data(). + Throws : n/a + Status : Experimental + Comments : This is an experimental method used for obtaining an + : indication of: + : 1) how many HSPs are in a Blast alignment + : 2) how strong the similarity is between sequences in the HSP + : 3) the endpoints of the alignment (sequence monomer numbers) + : "Homology data" for each HSP is in the format: + : " " + : Data for different HSPs are tab-delimited. + +See Also : B, B + +=cut + +#--------------- +sub homol_data { +#--------------- + my ($self,%param) = @_; + my $data = $self->name(); + + foreach ($self->hsps) { + $data .="\t".$_->homol_data(%param); + } + ## Record ambiguous alignment status. + $data .= "\t".$self->ambiguous_aln(); + $data; +} + + +=head2 is_signif + + Usage : $sbjct_object->is_signif(); + Purpose : Determine if the given BLAST hit is significant. + Example : + Returns : Boolean + Argument : n/a + Throws : n/a + Comments : Uses criteria defined in the parent Blast.pm object + : to assess significance. Currently, only relies on + : P-value and length criteria. + : This mehtod is largely obsolete since are hits are now by + : definition significant. + +=cut + +#--------------- +sub is_signif { +#--------------- + my $self = shift; + return ($self->{'_significance'} <= $self->parent->signif and + $self->length > $self->parent->signif_len); +} + +##################################################################################### +## CLASS METHODS ## +##################################################################################### + +=head1 CLASS METHODS + +=head2 get_exponent + + Usage : &get_exponent( number ); + Purpose : Determines the power of 10 exponent of an integer, float, + : or scientific notation number. + Example : &get_exponent("4.0e-206"); + : &get_exponent("0.00032"); + : &get_exponent("10."); + : &get_exponent("1000.0"); + : &get_exponent("e+83"); + Argument : Float, Integer, or scientific notation number + Returns : Integer representing the exponent part of the number (+ or -). + : If argument == 0 (zero), return value is "-999". + Comments : Exponents are rounded up (less negative) if the mantissa is >= 5. + : Exponents are rounded down (more negative) if the mantissa is <= -5. + : This method probably belongs in a more general utility class. + +=cut + +#------------------ +sub get_exponent { +#------------------ + my $data = shift; + + my($num, $exp) = split /[eE]/, $data; + + if( defined $exp) { + $num = 1 if not $num; + $num >= 5 and $exp++; + $num <= -5 and $exp--; + } elsif( $num == 0) { + $exp = -999; + } elsif( not $num =~ /\./) { + $exp = CORE::length($num) -1; + } else { + $exp = 0; + $num .= '0' if $num =~ /\.$/; + my ($c); + my $rev = 0; + if($num !~ /^0/) { + $num = reverse($num); + $rev = 1; + } + do { $c = chop($num); + $c == 0 && $exp++; + } while( $c ne '.'); + + $exp = -$exp if $num == 0 and not $rev; + $exp -= 1 if $rev; + } + return $exp; +} + +1; +__END__ + +##################################################################################### +# END OF CLASS # +##################################################################################### + + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +Information about the various data members of this module is provided for those +wishing to modify or understand the code. Two things to bear in mind: + +=over 4 + +=item 1 Do NOT rely on these in any code outside of this module. + +All data members are prefixed with an underscore to signify that they are private. +Always use accessor methods. If the accessor doesn't exist or is inadequate, +create or modify an accessor (and let me know, too!). (An exception to this might +be for HSP.pm which is more tightly coupled to Sbjct.pm and +may access Sbjct data members directly for efficiency purposes, but probably +should not). + +=item 2 This documentation may be incomplete and out of date. + +It is easy for these data member descriptions to become obsolete as +this module is still evolving. Always double check this info and search +for members not described here. + +=back + +An instance of Bio::Tools::Blast::Sbjct.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + -------------------------------------------------------------- + _hsps : Array ref for a list of Bio::Tools::Blast::HSP.pm objects. + : + _db : Database identifier from the summary line. + : + _desc : Description data for the hit from the summary line. + : + _length : Total length of the hit sequence. + : + _score : BLAST score. + : + _bits : BLAST score (in bits). Matrix-independent. + : + _p : BLAST P value. Obtained from summary section. (Blast1/WU-Blast only) + : + _expect : BLAST Expect value. Obtained from summary section. + : + _n : BLAST N value (number of HSPs) (Blast1/WU-Blast2 only) + : + _frame : Reading frame for TBLASTN and TBLASTX analyses. + : + _totalIdentical: Total number of identical aligned monomers. + : + _totalConserved: Total number of conserved aligned monomers (a.k.a. "positives"). + : + _overlap : Maximum number of overlapping residues between adjacent HSPs + : before considering the alignment to be ambiguous. + : + _ambiguous_aln : Boolean. True if the alignment of all HSPs is ambiguous. + : + _length_aln_query : Length of the aligned region of the query sequence. + : + _length_aln_sbjct : Length of the aligned region of the sbjct sequence. + + + INHERITED DATA MEMBERS + ---------------------- + _name : From Bio::Root::Object.pm. String representing the name of the + : sbjct sequence obtained from the BLAST report. + : + _parent : From Bio::Root::Object.pm. This member contains a reference to the + : Bio::Tools::Blast.pm object to which this hit belongs. + + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/CodonTable.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/CodonTable.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,655 @@ +# $Id: CodonTable.pm,v 1.23 2002/10/22 07:38:45 lapp Exp $ +# +# bioperl module for Bio::Tools::CodonTable +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::CodonTable - Bioperl codon table object + +=head1 SYNOPSIS + + This is a read-only class for all known codon tables. The IDs are + the ones used by nucleotide sequence databases. All common IUPAC + ambiguity codes for DNA, RNA and animo acids are recognized. + + # to use + use Bio::Tools::CodonTable; + + # defaults to ID 1 "Standard" + $myCodonTable = Bio::Tools::CodonTable->new(); + $myCodonTable2 = Bio::Tools::CodonTable -> new ( -id => 3 ); + + # change codon table + $myCodonTable->id(5); + + # examine codon table + print join (' ', "The name of the codon table no.", $myCodonTable->id(4), + "is:", $myCodonTable->name(), "\n"); + + # translate a codon + $aa = $myCodonTable->translate('ACU'); + $aa = $myCodonTable->translate('act'); + $aa = $myCodonTable->translate('ytr'); + + # reverse translate an amino acid + @codons = $myCodonTable->revtranslate('A'); + @codons = $myCodonTable->revtranslate('Ser'); + @codons = $myCodonTable->revtranslate('Glx'); + @codons = $myCodonTable->revtranslate('cYS', 'rna'); + + #boolean tests + print "Is a start\n" if $myCodonTable->is_start_codon('ATG'); + print "Is a termianator\n" if $myCodonTable->is_ter_codon('tar'); + print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG'); + +=head1 DESCRIPTION + +Codon tables are also called translation tables or genetics codes +since that is what they try to represent. A bit more complete picture +of the full complexity of codon usage in various taxonomic groups +presented at the NCBI Genetic Codes Home page. + +CodonTable is a BioPerl class that knows all current translation +tables that are used by primary nucleotide sequence databases +(GenBank, EMBL and DDBJ). It provides methods to output information +about tables and relationships between codons and amino acids. + +This class and its methods recognized all common IUPAC ambiguity codes +for DNA, RNA and animo acids. The translation method follows the +conventions in EMBL and TREMBL databases. + +It is a nuisance to separate RNA and cDNA representations of nucleic +acid transcripts. The CodonTable object accepts codons of both type as +input and allows the user to set the mode for output when reverse +translating. Its default for output is DNA. + +Note: This class deals primarily with individual codons and amino + acids. However in the interest of speed you can L + longer sequence, too. The full complexity of protein translation + is tackled by L. + + +The amino acid codes are IUPAC recommendations for common amino acids: + + A Ala Alanine + R Arg Arginine + N Asn Asparagine + D Asp Aspartic acid + C Cys Cysteine + Q Gln Glutamine + E Glu Glutamic acid + G Gly Glycine + H His Histidine + I Ile Isoleucine + L Leu Leucine + K Lys Lysine + M Met Methionine + F Phe Phenylalanine + P Pro Proline + S Ser Serine + T Thr Threonine + W Trp Tryptophan + Y Tyr Tyrosine + V Val Valine + B Asx Aspartic acid or Asparagine + Z Glx Glutamine or Glutamic acid + X Xaa Any or unknown amino acid + + +It is worth noting that, "Bacterial" codon table no. 11 produces an +polypeptide that is, confusingly, identical to the standard one. The +only differences are in available initiator codons. + + +NCBI Genetic Codes home page: + http://www.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c + +EBI Translation Table Viewer: + http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi + +Amended ASN.1 version with ids 16 and 21 is at: + ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/ + +Thank your for Matteo diTomasso for the original Perl implementation +of these tables. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tools::CodonTable; +use vars qw(@ISA @NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA + %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; +use Bio::Tools::IUPAC; +use Bio::SeqUtils; + +@ISA = qw(Bio::Root::Root); + +# first set internal values for all translation tables + +BEGIN { + @NAMES = #id + ( + 'Standard', #1 + 'Vertebrate Mitochondrial',#2 + 'Yeast Mitochondrial',# 3 + 'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4 + 'Invertebrate Mitochondrial',#5 + 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6 + '', '', + 'Echinoderm Mitochondrial',#9 + 'Euplotid Nuclear',#10 + '"Bacterial"',# 11 + 'Alternative Yeast Nuclear',# 12 + 'Ascidian Mitochondrial',# 13 + 'Flatworm Mitochondrial',# 14 + 'Blepharisma Nuclear',# 15 + 'Chlorophycean Mitochondrial',# 16 + '', '', '', '', + 'Trematode Mitochondrial',# 21 + 'Scenedesmus obliquus Mitochondrial', #22 + 'Thraustochytrium Mitochondrial' #23 + ); + + @TABLES = + qw( + FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG + FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + '' '' + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG + FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG + FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG + FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + '' '' '' '' + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG + FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + ); + + + @STARTS = + qw( + ---M---------------M---------------M---------------------------- + --------------------------------MMMM---------------M------------ + ----------------------------------MM---------------------------- + --MM---------------M------------MMMM---------------M------------ + ---M----------------------------MMMM---------------M------------ + -----------------------------------M---------------------------- + '' '' + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + ---M---------------M------------MMMM---------------M------------ + -------------------M---------------M---------------------------- + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + '' '' '' '' + -----------------------------------M---------------M------------ + -----------------------------------M---------------------------- + --------------------------------M--M---------------M------------ + ); + + my @nucs = qw(t c a g); + my $x = 0; + ($CODONS, $TRCOL) = ({}, {}); + for my $i (@nucs) { + for my $j (@nucs) { + for my $k (@nucs) { + my $codon = "$i$j$k"; + $CODONS->{$codon} = $x; + $TRCOL->{$x} = $codon; + $x++; + } + } + } + %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub(); + %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup(); + %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2); + $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']'; + $TERMINATOR = '*'; +} + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my($id) = + $self->_rearrange([qw(ID + )], + @args); + + $id = 1 if ( ! $id ); + $id && $self->id($id); + return $self; # success - we hope! +} + +=head2 id + + Title : id + Usage : $obj->id(3); $id_integer = $obj->id(); + Function: + + Sets or returns the id of the translation table. IDs are + integers from 1 to 15, excluding 7 and 8 which have been + removed as redundant. If an invalid ID is given the method + returns 0, false. + + + Example : + Returns : value of id, a scalar, 0 if not a valid + Args : newvalue (optional) + +=cut + +sub id{ + my ($self,$value) = @_; + if( defined $value) { + if ( !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') { + $self->warn("Not a valid codon table ID [$value] "); + $value = 0; + } + $self->{'id'} = $value; + } + return $self->{'id'}; +} + +=head2 name + + Title : name + Usage : $obj->name() + Function: returns the descriptive name of the translation table + Example : + Returns : A string + Args : None + + +=cut + +sub name{ + my ($self) = @_; + + my ($id) = $self->{'id'}; + return $NAMES[$id-1]; +} + +=head2 translate + + Title : translate + Usage : $obj->translate('YTR') + Function: Returns a string of one letter amino acid codes from + nucleotide sequence input. The imput can be of any length. + + Returns 'X' for unknown codons and codons that code for + more than one amino acid. Returns an empty string if input + is not three characters long. Exceptions for these are: + + - IUPAC amino acid code B for Aspartic Acid and + Asparagine, is used. + - IUPAC amino acid code Z for Glutamic Acid, Glutamine is + used. + - if the codon is two nucleotides long and if by adding + an a third character 'N', it codes for a single amino + acid (with exceptions above), return that, otherwise + return empty string. + + Returns empty string for other input strings that are not + three characters long. + + Example : + Returns : a string of one letter ambiguous IUPAC amino acid codes + Args : ambiguous IUPAC nucleotide string + + +=cut + +sub translate { + my ($self, $seq) = @_; + $self->throw("Calling translate without a seq argument!") unless defined $seq; + return '' unless $seq; + + my $id = $self->id; + my ($partial) = 0; + $partial = 2 if length($seq) % 3 == 2; + + $seq = lc $seq; + $seq =~ tr/u/t/; + my $protein = ""; + if ($seq =~ /[^actg]/ ) { #ambiguous chars + for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) { + my $triplet = substr($seq, $i, 3); + if (exists $CODONS->{$triplet}) { + $protein .= substr($TABLES[$id-1], + $CODONS->{$triplet},1); + } else { + $protein .= $self->_translate_ambiguous_codon($triplet); + } + } + } else { # simple, strict translation + for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) { + my $triplet = substr($seq, $i, 3); + if (exists $CODONS->{$triplet}) { + $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1); + } else { + $protein .= 'X'; + } + } + } + if ($partial == 2) { # 2 overhanging nucleotides + my $triplet = substr($seq, ($partial -4)). "n"; + if (exists $CODONS->{$triplet}) { + my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1); + $protein .= $aa; + } else { + $protein .= $self->_translate_ambiguous_codon($triplet, $partial); + } + } + return $protein; +} + +sub _translate_ambiguous_codon { + my ($self, $triplet, $partial) = @_; + $partial ||= 0; + my $id = $self->id; + my $aa; + my @codons = _unambiquous_codons($triplet); + my %aas =(); + foreach my $codon (@codons) { + $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1; + } + my $count = scalar keys %aas; + if ( $count == 1 ) { + $aa = (keys %aas)[0]; + } + elsif ( $count == 2 ) { + if ($aas{'D'} and $aas{'N'}) { + $aa = 'B'; + } + elsif ($aas{'E'} and $aas{'Q'}) { + $aa = 'Z'; + } else { + $partial ? ($aa = '') : ($aa = 'X'); + } + } else { + $partial ? ($aa = '') : ($aa = 'X'); + } + return $aa; +} + +=head2 translate_strict + + Title : translate_strict + Usage : $obj->translate_strict('ACT') + Function: returns one letter amino acid code for a codon input + + Fast and simple translation. User is responsible to resolve + ambiguous nucleotide codes before calling this + method. Returns 'X' for unknown codons and an empty string + for input strings that are not three characters long. + + It is not recommended to use this method in a production + environment. Use method translate, instead. + + Example : + Returns : A string + Args : a codon = a three nucleotide character string + + +=cut + +sub translate_strict{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return ''; + } + elsif (!(defined $CODONS->{$value})) { + return 'X'; + } + else { + return substr($TABLES[$id-1],$CODONS->{$value},1); + } +} + +=head2 revtranslate + + Title : revtranslate + Usage : $obj->revtranslate('G') + Function: returns codons for an amino acid + + Returns an empty string for unknown amino acid + codes. Ambiquous IUPAC codes Asx,B, (Asp,D; Asn,N) and + Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three + letter amino acid codes are accepted. '*' and 'Ter' are + used for terminator. + + By default, the output codons are shown in DNA. If the + output is needed in RNA (tr/t/u/), add a second argument + 'RNA'. + + Example : $obj->revtranslate('Gly', 'RNA') + Returns : An array of three lower case letter strings i.e. codons + Args : amino acid, 'RNA' + +=cut + +sub revtranslate { + my ($self, $value, $coding) = @_; + my ($id) = $self->{'id'}; + my (@aas, $p); + my (@codons) = (); + + if (length($value) == 3 ) { + $value = lc $value; + $value = ucfirst $value; + $value = $THREELETTERSYMBOLS{$value}; + } + if ( defined $value and $value =~ /$VALID_PROTEIN/ + and length($value) == 1 ) { + $value = uc $value; + @aas = @{$IUPAC_AA{$value}}; + foreach my $aa (@aas) { + #print $aa, " -2\n"; + $aa = '\*' if $aa eq '*'; + while ($TABLES[$id-1] =~ m/$aa/g) { + $p = pos $TABLES[$id-1]; + push (@codons, $TRCOL->{--$p}); + } + } + } + + if ($coding and uc ($coding) eq 'RNA') { + for my $i (0..$#codons) { + $codons[$i] =~ tr/t/u/; + } + } + + return @codons; +} + +=head2 is_start_codon + + Title : is_start_codon + Usage : $obj->is_start_codon('ATG') + Function: returns true (1) for all codons that can be used as a + translation start, false (0) for others. + Example : $myCodonTable->is_start_codon('ATG') + Returns : boolean + Args : codon + + +=cut + +sub is_start_codon{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return 0; + } + else { + my $result = 1; + my @ms = map { substr($STARTS[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value); + foreach my $c (@ms) { + $result = 0 if $c ne 'M'; + } + return $result; + } +} + + + +=head2 is_ter_codon + + Title : is_ter_codon + Usage : $obj->is_ter_codon('GAA') + Function: returns true (1) for all codons that can be used as a + translation tarminator, false (0) for others. + Example : $myCodonTable->is_ter_codon('ATG') + Returns : boolean + Args : codon + + +=cut + +sub is_ter_codon{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return 0; + } + else { + my $result = 1; + my @ms = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value); + foreach my $c (@ms) { + $result = 0 if $c ne $TERMINATOR; + } + return $result; + } +} + +=head2 is_unknown_codon + + Title : is_unknown_codon + Usage : $obj->is_unknown_codon('GAJ') + Function: returns false (0) for all codons that are valid, + true (1) for others. + Example : $myCodonTable->is_unknown_codon('NTG') + Returns : boolean + Args : codon + + +=cut + +sub is_unknown_codon{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return 1; + } + else { + my $result = 0; + my @cs = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value); + $result = 1 if scalar @cs == 0; + return $result; + } +} + +=head2 _unambiquous_codons + + Title : _unambiquous_codons + Usage : @codons = _unambiquous_codons('ACN') + Function: + Example : + Returns : array of strings (one letter unambiguous amino acid codes) + Args : a codon = a three IUPAC nucleotide character string + +=cut + +sub _unambiquous_codons{ + my ($value) = @_; + my @nts = (); + my @codons = (); + my ($i, $j, $k); + @nts = map { $IUPAC_DNA{uc $_} } split(//, $value); + for my $i (@{$nts[0]}) { + for my $j (@{$nts[1]}) { + for my $k (@{$nts[2]}) { + push @codons, lc "$i$j$k"; + } + } + } + return @codons; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Coil.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Coil.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +# Parser module for Coil Bio::Tools::Coil +# +# Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Coil +# originally written by Marc Sohrmann (ms2@sanger.ac.uk) +# Written in BioPipe by Balamurugan Kumarasamy +# Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) + +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Coil + +=head1 SYNOPSIS + + use Bio::Tools::Coil + my $parser = new Bio::Tools::Coil(); + while( my $sp_feat = $parser->next_result($file) ) { + #do something + #eg + push @sp_feat, $sp_feat; + } + +=head1 DESCRIPTION + + Parser for Coil output + +=head1 FEEDBACK + +=head2 Mailing Lists + + User feedback is an integral part of the evolution of this and other + Bioperl modules. Send your comments and suggestions preferably to + the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + + Report bugs to the Bioperl bug tracking system to help us keep track + of the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + + Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Coil + originally written by Marc Sohrmann (ms2@sanger.ac.uk) + Written in BioPipe by Balamurugan Kumarasamy + Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org) + +=head1 APPENDIX + + The rest of the documentation details each of the object methods. + Internal methods are usually preceded with a _ + + +=cut + +package Bio::Tools::Coil; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::FeaturePair; +use Bio::Root::IO; +use Bio::SeqFeature::Generic; +@ISA = qw(Bio::Root::Root Bio::Root::IO); + + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 parse_results + + Title : parse_results + Usage : obj->parse_results + Function: Parses the coil output. Automatically called by + next_result() if not yet done. + Example : + Returns : + +=cut + +sub parse_results { + my ($self,$resfile) = @_; + my $filehandle = $resfile; + my %result_hash =_read_fasta($filehandle);#bala no file handle + my @ids = keys %result_hash; + my @feats; + foreach my $id (keys %result_hash){ + my $pep = reverse ($result_hash{$id}); + my $count = my $switch = 0; + my ($start, $end); + while (my $aa = chop $pep) { + $count++; + if (!$switch && $aa eq "x") { + $start = $count; + $switch = 1; + } + elsif ($switch && $aa ne "x") { + $end = $count-1; + my (%feature); + $feature{name} = $id; + $feature{start} = $start; + $feature{end} = $end; + $feature{source} = "Coils"; + $feature{primary} = 'ncoils'; + ($feature{program}) = 'ncoils'; + $feature{logic_name} = 'Coils'; + my $new_feat = $self->create_feature (\%feature); + $self->_add_prediction($new_feat); + $switch = 0; + } + } + } + + $self->_predictions_parsed(1); + +} +=head2 next_result + + Title : next_result + Usage : while($feat = $coil->next_result($file)) { + # do something + } + Function: Returns the next protein feature of the coil output file + Returns : + Args : + +=cut + +sub next_result{ + + my ($self,$resfile) = @_; + my $gene; + + $self->parse_results($resfile) unless $self->_predictions_parsed(); + + $gene = $self->_result(); + + return $gene; + +} + +=head2 _result + + Title : _result + Usage : $feat = $obj->_result() + Function: internal + Example : + Returns : + +=cut + +sub _result{ + my ($self) = @_; + + return undef unless(exists($self->{'_feats'}) && @{$self->{'_feats'}}); + return shift(@{$self->{'_feats'}}); + +} + +=head2 _add_prediction + + Title : _add_prediction() + Usage : $obj->_add_prediction($feat) + Function: internal + Example : + Returns : + +=cut + +sub _add_prediction { + my ($self, $gene) = @_; + + if(! exists($self->{'_feats'})) { + $self->{'_feats'} = []; + } + push(@{$self->{'_feats'}}, $gene); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + + +=head2 create_feature + + Title : create_feature + Usage : obj->create_feature(\%feature) + Function: Internal(not to be used directly) + Returns : + Args : + + +=cut + +sub create_feature { + my ($self, $feat) = @_; + + + # create feature object + my $feature = Bio::SeqFeature::Generic->new(-seq_id => $feat->{name}, + -start => $feat->{start}, + -end => $feat->{end}, + -score => $feat->{score}, + -source => $feat->{source}, + -primary => $feat->{primary}, + -logic_name => $feat->{logic_name}, + ); + $feature->add_tag_value('evalue',0); + $feature->add_tag_value('percent_id','NULL'); + $feature->add_tag_value("hid",$feat->{primary}); + + + return $feature; + +} + +=head2 _read_fasta + + Title : _read_fasta + Usage : obj->_read_fasta($file) + Function: Internal(not to be used directly) + Returns : + Args : + + +=cut + +sub _read_fasta { + local (*FILE) = @_; + my( $id , $seq , %name2seq);#bala + while () { + chomp;#bala + if (/^>(\S+)/) { + + my $new_id = $1; + if ($id) { + $name2seq{$id} = $seq; + } + $id = $new_id ; $seq = "" ; + } + elsif (eof) { + if ($id) { + $seq .= $_ ;#bala line instead of $_ + $name2seq{$id} = $seq; + } + } + else { + $seq .= $_ + } + } + return %name2seq; +} + + + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/ECnumber.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/ECnumber.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,601 @@ +# $Id: ECnumber.pm,v 1.7 2002/12/12 18:27:02 czmasek Exp $ +# +# BioPerl module for Bio::Tools::ECnumber +# +# Cared for by Christian M. Zmasek or +# +# (c) Christian M. Zmasek, czmasek@gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + + +=head1 NAME + +ECnumber - representation of EC numbers + +=head1 SYNOPSIS + + use Bio::Tools::ECnumber; + + + # Creation of ECnumber objects + # ---------------------------- + + my $EC1 = Bio::Tools::ECnumber->new( -ec_string => "4.3.2.1" ); + my $EC2 = Bio::Tools::ECnumber->new( -ec_string => "EC 1.1.1.1" ); + my $EC3 = Bio::Tools::ECnumber->new(); + + + # Copying + # ------- + + my $EC4 = $EC1->copy(); + + + # Modification of ECnumber objects + # -------------------------------- + + print $EC3->EC_string( "1.01.01.001" ); # Prints "1.1.1.1". + + + # To string + # --------- + + print $EC3->EC_string(); + + # or: + + print $EC3->to_string(); + + + + # Test for equality + # ----------------- + + # Against ECnumber object: + if ( $EC3->is_equal( $EC2 ) ) { # Prints "equal". + print "equal"; + } + + # Against string representation of EC number: + if ( ! $EC3->is_equal( "1.1.1.-" ) ) { # Prints "not equal". + print "not equal"; + } + + + # Test for membership + # ------------------- + + my $EC5 = Bio::Tools::ECnumber->new( -ec_string => "4.3.2.-" ); + + # Against ECnumber object. + if ( $EC1->is_member( $EC5 ) ) { # Prints "member". + print "member"; + } + + + # Against string representation of EC number. + if ( ! $EC1->is_member( "4.3.1.-" ) ) { # Prints "not member". + print "not member"; + } + + + +=head1 DESCRIPTION + +ECnumber is a representation of EC numbers [http://www.chem.qmul.ac.uk/iubmb/enzyme/]. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Christian M. Zmasek + +Email: czmasek@gnf.org or cmzmasek@yahoo.com + +WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/ + +Address: + + Genomics Institute of the Novartis Research Foundation + 10675 John Jay Hopkins Drive + San Diego, CA 92121 + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tools::ECnumber; +use vars qw( @ISA ); +use strict; +use Bio::Root::Object; + +use constant DEFAULT => "-"; +use constant TRUE => 1; +use constant FALSE => 0; + +@ISA = qw( Bio::Root::Root ); + + + + + +=head2 new + + Title : new + Usage : $EC1 = Bio::Tools::ECnumber->new( -ec_string => "4.3.2.1" ); + or + $EC2 = Bio::Tools::ECnumber->new( -ec_string => "4.3.2.2", + -comment => "Is EC 4.3.2.2" ); + or + $EC3 = Bio::Tools::ECnumber->new(); # EC3 is now "-.-.-.-" + Function: Creates a new ECnumber object. + Parses a EC number from "x.x.x.x", "EC x.x.x.x", + "ECx.x.x.x", or "EC:x.x.x.x"; + x being either a positive integer or a "-". + Returns : A new Ecnumber object. + Args : A string representing a EC number, e.g. "4.3.2.1" + or "EC 4.3.2.1" or "1.-.-.-". + +=cut + +sub new { + my( $class, @args ) = @_; + + my $self = $class->SUPER::new( @args ); + + my ( $EC_string, $comment ) + = $self->_rearrange( [ qw( EC_STRING COMMENT ) ], @args ); + + $self->init(); + + $EC_string && $self->EC_string( $EC_string ); + $comment && $self->comment( $comment ); + + return $self; + +} # new + + + +=head2 init + + Title : init() + Usage : $EC1->init(); # EC1 is now "-.-.-.-" + Function: Initializes this ECnumber to default values. + Returns : + Args : + +=cut + +sub init { + my( $self ) = @_; + + $self->enzyme_class( DEFAULT ); + $self->sub_class( DEFAULT ); + $self->sub_sub_class( DEFAULT ); + $self->serial_number( DEFAULT ); + $self->comment( "" ); + +} # init + + + +=head2 copy + + Title : copy() + Usage : $EC2 = $EC1->copy(); + Function: Creates a new ECnumber object which is an exact copy + of this ECnumber. + Returns : A copy of this ECnumber. + Args : + +=cut + +sub copy { + my( $self ) = @_; + + my $new_ec = $self->new(); + $new_ec->enzyme_class( $self->enzyme_class() ); + $new_ec->sub_class( $self->sub_class() ); + $new_ec->sub_sub_class( $self->sub_sub_class() ); + $new_ec->serial_number( $self->serial_number() ); + $new_ec->comment( $self->comment() ); + return $new_ec; + +} # copy + + + +=head2 EC_string + + Title : EC_string + Usage : $EC3->EC_string( "1.1.1.-" ); + or + print $EC3->EC_string(); + Function: Set/get for string representations of EC numbers. + Parses a EC number from "x.x.x.x", "EC x.x.x.x", + "ECx.x.x.x", or "EC:x.x.x.x"; + x being either a positive integer or a "-". + Returns : A string representations of a EC number. + Args : A string representations of a EC number. + +=cut + +sub EC_string { + my ( $self, $value ) = @_; + + if ( defined $value) { + $value =~ s/\s+//g; # Removes white space. + $value =~ s/^EC//i; # Removes "EC". + $value =~ s/^://; # Removes ":". + + if ( $value =~ /^([\d-]*)\.([\d-]*)\.([\d-]*)\.([\d-]*)$/ ) { + $self->enzyme_class( $1 ); + $self->sub_class( $2 ); + $self->sub_sub_class( $3 ); + $self->serial_number( $4 ); + } + else { + $self->throw( "Illegal format error [$value]" ); + } + } + + return $self->to_string(); + +} # EC_string + + + +=head2 to_string + + Title : to_string() + Usage : print $EC3->to_string(); + Function: To string method for EC numbers + (equals the "get" functionality of "EC_string"). + Returns : A string representations of a EC number. + Args : + +=cut + +sub to_string { + my ( $self ) = @_; + + my $s = $self->enzyme_class() . "."; + $s .= $self->sub_class() . "."; + $s .= $self->sub_sub_class() . "."; + $s .= $self->serial_number(); + return $s; + +} # to_string + + + +=head2 is_equal + + Title : is_equal + Usage : if ( $EC3->is_equal( $EC2 ) ) + or + if ( $EC3->is_equal( "1.1.1.-" ) ) + Function: Checks whether this ECnumber is equal to the argument + EC number (please note: "1.1.1.1" != "1.1.1.-"). + Returns : True (1) or false (0). + Args : A ECnumber object or a string representation of a EC number. + +=cut + +sub is_equal { + my ( $self, $value ) = @_; + + if ( $self->_is_not_reference( $value ) ) { + $value = $self->new( -ec_string => $value ); + } + else { + $self->_is_ECnumber_object( $value ); + } + + unless ( $self->enzyme_class() eq $value->enzyme_class() ) { + return FALSE; + } + unless ( $self->sub_class() eq $value->sub_class() ) { + return FALSE; + } + unless ( $self->sub_sub_class() eq $value->sub_sub_class() ) { + return FALSE; + } + unless ( $self->serial_number() eq $value->serial_number() ) { + return FALSE; + } + return TRUE; + +} # is_equal + + + +=head2 is_member + + Title : is_member + Usage : if ( $EC1->is_member( $EC5 ) ) + or + if ( $EC1->is_member( "4.3.-.-" ) ) + Function: Checks whether this ECnumber is a member of the (incomplete) + argument EC number (e.g. "1.1.1.1" is a member of "1.1.1.-" + but not of "1.1.1.2"). + Returns : True (1) or false (0). + Args : A ECnumber object or a string representation of a EC number. + +=cut + +sub is_member { + my ( $self, $value ) = @_; + + if ( $self->_is_not_reference( $value ) ) { + $value = $self->new( -ec_string => $value ); + } + else { + $self->_is_ECnumber_object( $value ); + } + $self->_check_for_illegal_defaults(); + $value->_check_for_illegal_defaults(); + + unless ( $value->enzyme_class() eq DEFAULT + || $self->enzyme_class() eq $value->enzyme_class() ) { + return FALSE; + } + unless ( $value->sub_class() eq DEFAULT + || $self->sub_class() eq $value->sub_class() ) { + return FALSE; + } + unless ( $value->sub_sub_class() eq DEFAULT + || $self->sub_sub_class() eq $value->sub_sub_class() ) { + return FALSE; + } + unless ( $value->serial_number() eq DEFAULT + || $self->serial_number() eq $value->serial_number() ) { + return FALSE; + } + return TRUE; + +} # is_member + + + +=head2 enzyme_class + + Title : enzyme_class + Usage : $EC1->enzyme_class( 1 ); + or + print $EC1->enzyme_class(); + Function: Set/get for the enzyme class number of ECnumbers. + Returns : The enzyme class number of this ECnumber. + Args : A positive integer or "-". + +=cut + +sub enzyme_class { + my ( $self, $value ) = @_; + + if ( defined $value) { + $self->{ "_enzyme_class" } = $self->_check_number( $value ); + } + + return $self->{ "_enzyme_class" }; + +} # enzyme_class + + + +=head2 sub_class + + Title : sub_class + Usage : $EC1->sub_class( 4 ); + or + print $EC1->sub_class(); + Function: Set/get for the enzyme sub class number of ECnumbers. + Returns : The enzyme sub class number of this ECnumber. + Args : A positive integer or "-". + +=cut + +sub sub_class { + my ( $self, $value ) = @_; + + if ( defined $value) { + $self->{ "_sub_class" } = $self->_check_number( $value ); + } + + return $self->{ "_sub_class" }; + +} # sub_class + + + +=head2 sub_sub_class + + Title : sub_sub_class + Usage : $EC1->sub_sub_class( 12 ); + or + print $EC1->sub_sub_class(); + Function: Set/get for the enzyme sub sub class number of ECnumbers. + Returns : The enzyme sub sub class number of this ECnumber. + Args : A positive integer or "-". + +=cut + +sub sub_sub_class { + my ( $self, $value ) = @_; + + if ( defined $value) { + $self->{ "_sub_sub_class" } = $self->_check_number( $value ); + } + + return $self->{ "_sub_sub_class" }; + +} # sub_sub_class + + + +=head2 serial_number + + Title : serial_number + Usage : $EC1->serial_number( 482 ); + or + print $EC1->serial_number(); + Function: Set/get for the serial number of ECnumbers. + Returns : The serial number of this ECnumber. + Args : A positive integer or "-". + +=cut + +sub serial_number { + my ( $self, $value ) = @_; + + if ( defined $value) { + $self->{ "_serial_number" } = $self->_check_number( $value ); + } + + return $self->{ "_serial_number" }; + +} # serial_number + + + +=head2 comment + + Title : comment + Usage : $EC1->comment( "deprecated" ); + or + print $EC1->comment(); + Function: Set/get for a arbitrary comment. + Returns : A comment [scalar]. + Args : A comment [scalar]. + +=cut + +sub comment { + my ( $self, $value ) = @_; + + if ( defined $value) { + $self->{ "_comment" } = $value; + } + + return $self->{ "_comment" }; + +} # comment + + + +# Title : _check_number +# Function: Checks and standardizes the individual numbers of a EC number +# (removes leading zeros, removes white spaces). +# Returns : A standardized number. +# Args : A string representing a number in a EC number. +sub _check_number { + my ( $self, $value ) = @_; + + my $original_value = $value; + $value =~ s/\s+//g; # Removes white space. + if ( $value eq "" ) { + $value = DEFAULT; + } + $value =~ s/^0+//; # Removes leading zeros. + if ( $value eq "" ) { # If it was "0" (or "00"), it would be "" now. + $value = "0"; + } + elsif ( $value ne DEFAULT + && $value =~ /\D/ ) { + $self->throw( "Illegal format error [$original_value]" ); + } + return $value; + +} # _check_number + + + +# Title : _check_for_illegal_defaults() +# Function: Checks for situations like "1.-.1.1", which +# are illegal in membership tests. +# Returns : +# Args : +sub _check_for_illegal_defaults { + my ( $self ) = @_; + + if ( ( $self->sub_sub_class() eq DEFAULT + && $self->serial_number() ne DEFAULT ) || + ( $self->sub_class() eq DEFAULT + && $self->sub_sub_class() ne DEFAULT ) || + ( $self->enzyme_class() eq DEFAULT + && $self->sub_class() ne DEFAULT ) ) { + $self->throw( "Illegal format error for comparison [" + . $self->to_string() . "]" ); + } + +} # _check_for_illegal_defaults + + + +# Title : _is_not_reference +# Function: Checks whether the argument is not a reference. +# Returns : True or false. +# Args : A scalar. +sub _is_not_reference { + my ( $self, $value ) = @_; + + return ( ! ref( $value ) ); + +} # _is_not_reference + + + +# Title : _is_ECnumber_object +# Function: Checks whether the arument is a ECnumber. +# Returns : +# Args : A reference. +sub _is_ECnumber_object { + my ( $self, $value ) = @_; + + unless( $value->isa( "Bio::Tools::ECnumber" ) ) { + $self->throw( "Found [". ref( $value ) + ."] where [Bio::Tools::ECnumber] expected" ); + } + +} # _is_ECnumber_object + + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/EPCR.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/EPCR.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,152 @@ +# $Id: EPCR.pm,v 1.8 2002/12/01 00:05:21 jason Exp $ +# +# BioPerl module for Bio::Tools::EPCR +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::EPCR - Parse ePCR output and make features + +=head1 SYNOPSIS + + # A simple annotation pipeline wrapper for ePCR data + # assuming ePCR data is already generated in file seq1.epcr + # and sequence data is in fasta format in file called seq1.fa + + use Bio::Tools::EPCR; + use Bio::SeqIO; + my $parser = new Bio::Tools::EPCR(-file => 'seq1.epcr'); + my $seqio = new Bio::SeqIO(-format => 'fasta', -file => 'seq1.fa'); + my $seq = $seqio->next_seq || die("cannot get a seq object from SeqIO"); + + while( my $feat = $parser->next_feature ) { + # add EPCR annotation to a sequence + $seq->add_SeqFeature($feat); + } + my $seqout = new Bio::SeqIO(-format => 'embl'); + $seqout->write_seq($seq); + + +=head1 DESCRIPTION + +This object serves as a parser for ePCR data, creating a +Bio::SeqFeatureI for each ePCR hit. These can be processed or added +as annotation to an existing Bio::SeqI object for the purposes of +automated annotation. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::EPCR; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqAnalysisParserI; +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO ); + +=head2 new + + Title : new + Usage : my $epcr = new Bio::Tools::EPCR(-file => $file); + Function: Initializes a new EPCR parser + Returns : Bio::Tools::EPCR + Args : -fh => filehandle + OR + -file => filename + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_feature + + Title : next_feature + Usage : $seqfeature = $obj->next_feature(); + Function: Returns the next feature available in the analysis result, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI implementing object, or undef if there are no + more features. + Args : none + +=cut + +sub next_feature { + my ($self) = @_; + my $line = $self->_readline; + return undef unless defined($line); + chomp($line); + my($seqname,$location,$mkrname, $rest) = split(/\s+/,$line,4); + + my ($start,$end) = ($location =~ /(\S+)\.\.(\S+)/); + + # If we require that e-PCR is run with D=1 we can detect a strand + # for now hardcoded to 0 + + my $strand = 0; + my $markerfeature = new Bio::SeqFeature::Generic ( '-start' => $start, + '-end' => $end, + '-strand' => $strand, + '-source' => 'e-PCR', + '-primary' => 'sts', + '-seq_id' => $seqname, + '-tag' => { + 'name'=> $mkrname, + 'note'=> $rest, + }); + return $markerfeature; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/ESTScan.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/ESTScan.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,369 @@ +# $Id: ESTScan.pm,v 1.10 2002/10/22 07:38:45 lapp Exp $ +# +# BioPerl module for Bio::Tools::ESTScan +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::ESTScan - Results of one ESTScan run + +=head1 SYNOPSIS + + $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan'); + # filehandle: + $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT ); + + # parse the results + # note: this class is-a Bio::Tools::AnalysisResult which implements + # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same + while($gene = $estscan->next_prediction()) { + # $gene is an instance of Bio::Tools::Prediction::Gene + foreach my $orf ($gene->exons()) { + # $orf is an instance of Bio::Tools::Prediction::Exon + $cds_str = $orf->predicted_cds(); + } + } + + # essential if you gave a filename at initialization (otherwise the file + # will stay open) + $estscan->close(); + +=head1 DESCRIPTION + +The ESTScan module provides a parser for ESTScan coding region prediction +output. + +This module inherits off L and therefore +implements the L interface. +See L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::ESTScan; +use vars qw(@ISA); +use strict; +use Symbol; + +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::Tools::Prediction::Exon; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my ($self,@args) = @_; + + # first call the inherited method! + my $make = $self->SUPER::_initialize_state(@args); + + if(! $self->analysis_method()) { + $self->analysis_method('ESTScan'); + } +} + +=head2 analysis_method + + Usage : $estscan->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /estscan/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /estscan/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_feature + + Title : next_feature + Usage : while($orf = $estscan->next_feature()) { + # do something + } + Function: Returns the next gene structure prediction of the ESTScan result + file. Call this method repeatedly until FALSE is returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_prediction() at present. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_prediction doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_prediction(@args); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $estscan->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the ESTScan result + file. Call this method repeatedly until FALSE is returned. + + So far, this method DOES NOT work for reverse strand predictions, + even though the code looks like. + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + my ($gene, $seq, $cds, $predobj); + my $numins = 0; + + # predictions are in the format of FASTA sequences and can be parsed one + # at a time + $seq = $self->_fasta_stream()->next_seq(); + return unless $seq; + # there is a new prediction + $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction", + '-source' => "ESTScan"); + # score starts the description + $seq->desc() =~ /^([\d.]+)\s*(.*)/ or + $self->throw("unexpected format of description: no score in " . + $seq->desc()); + $gene->score($1); + $seq->desc($2); + # strand may end the description + if($seq->desc() =~ /(.*)minus strand$/) { + my $desc = $1; + $desc =~ s/;\s+$//; + $seq->desc($desc); + $gene->strand(-1); + } else { + $gene->strand(1); + } + # check for the format: default or 'all-in-one' (option -a) + if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) { + # default format + $seq->desc($5); + $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan", + '-start' => $3, + '-end' => $4); + $predobj->strand($gene->strand()); + $predobj->score($gene->score()); # FIXME or $1, or $2 ? + $predobj->primary_tag("InternalExon"); + $predobj->seq_id($seq->display_id()); + # add to gene structure object + $gene->add_exon($predobj); + # add predicted CDS + $cds = $seq->seq(); + $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions + $cds = Bio::PrimarySeq->new('-seq' => $cds, + '-display_id' => $seq->display_id(), + '-desc' => $seq->desc(), + '-alphabet' => "dna"); + $gene->predicted_cds($cds); + $predobj->predicted_cds($cds); + if($gene->strand() == -1) { + $self->warn("reverse strand ORF, but unable to reverse coordinates!"); + } + } else { + # + # All-in-one format (hopefully). This encodes the following information + # into the sequence: + # 1) untranslated regions: stretches of lower-case letters + # 2) translated regions: stretches of upper-case letters + # 3) insertions in the translated regions: capital X + # 4) deletions in the translated regions: a single lower-case letter + # + # if reverse strand ORF, save a lot of hassle by reversing the sequence + if($gene->strand() == -1) { + $seq = $seq->revcom(); + } + my $seqstr = $seq->seq(); + while($seqstr =~ /^([a-z]*)([A-Z].*)$/) { + # leading 5'UTR + my $utr5 = $1; + # exon + 3'UTR + my $exonseq = $2; + # strip 3'UTR and following exons + if($exonseq =~ s/([a-z]{2,}.*)$//) { + $seqstr = $1; + } else { + $seqstr = ""; + } + # start: take care of yielding the absolute coordinate + my $start = CORE::length($utr5) + 1; + if($predobj) { + $start += $predobj->end() + $numins; + } + # for the end coordinate, we need to subtract the insertions + $cds = $exonseq; + $cds =~ s/[X]//g; + my $end = $start + CORE::length($cds) - 1; + # construct next exon object + $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start, + '-end' => $end); + $predobj->source_tag("ESTScan"); + $predobj->primary_tag("InternalExon"); + $predobj->seq_id($seq->display_id()); + $predobj->strand($gene->strand()); + $predobj->score($gene->score()); + # add the exon to the gene structure object + $gene->add_exon($predobj); + # add the predicted CDS + $cds = $exonseq; + $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions + $cds = Bio::PrimarySeq->new('-seq' => $cds, + '-display_id' => $seq->display_id(), + '-desc' => $seq->desc(), + '-alphabet' => "dna"); + # only store the first one in the overall prediction + $gene->predicted_cds($cds) unless $gene->predicted_cds(); + $predobj->predicted_cds($cds); + # add the predicted insertions and deletions as subfeatures + # of the exon + my $fea = undef; + while($exonseq =~ /([a-zX])/g) { + my $indel = $1; + # start and end: start looking at the position after the + # previous feature + if($fea) { + $start = $fea->start()+$numins; + $start -= 1 if($fea->primary_tag() eq 'insertion'); + } else { + $start = $predobj->start()+$numins-1; + } + #print "# numins = $numins, indel = $indel, start = $start\n"; + $start = index($seq->seq(), $indel, $start) + 1 - $numins; + $fea = Bio::SeqFeature::Generic->new('-start' => $start, + '-end' => $start); + $fea->source_tag("ESTScan"); + $fea->seq_id($seq->display_id()); + $fea->strand($predobj->strand()); + if($indel eq 'X') { + # an insertion (depends on viewpoint: to get the 'real' + # CDS, a base has to be inserted, i.e., the HMMER model + # inserted a base; however, the sequencing process deleted + # a base that was there). + $fea->primary_tag("insertion"); + # we need to count insertions because these are left out + # of any coordinates saved in the objects (which is correct + # because insertions change the original sequence, so + # coordinates wouldn't match) + $numins++; + } else { + # a deletion (depends on viewpoint: to get the 'real' + # CDS, a base has to be deleted, i.e., the HMMER model + # deleted a base; however, the sequencing process inserted + # a base that wasn't there). + $fea->primary_tag("deletion"); + $fea->add_tag_value('base', $indel); + } + $predobj->add_sub_SeqFeature($fea); + } + } + } + + return $gene; +} + +=head2 close + + Title : close + Usage : $result->close() + Function: Closes the file handle associated with this result file. + Inherited method, overridden. + Example : + Returns : + Args : + +=cut + +sub close { + my ($self, @args) = @_; + + delete($self->{'_fastastream'}); + $self->SUPER::close(@args); +} + +=head2 _fasta_stream + + Title : _fasta_stream + Usage : $result->_fasta_stream() + Function: Gets/Sets the FASTA sequence IO stream for reading the contents of + the file associated with this MZEF result object. + + If called for the first time, creates the stream from the filehandle + if necessary. + Example : + Returns : + Args : + +=cut + +sub _fasta_stream { + my ($self, $stream) = @_; + + if($stream || (! exists($self->{'_fastastream'}))) { + if(! $stream) { + $stream = Bio::SeqIO->new('-fh' => $self->_fh(), + '-format' => "fasta"); + } + $self->{'_fastastream'} = $stream; + } + return $self->{'_fastastream'}; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Eponine.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Eponine.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,299 @@ +# $Id: Eponine.pm,v 1.7 2002/10/22 07:38:45 lapp Exp $ +# +# BioPerl module for Bio::Tools::Eponine +# +# Cared for by Tania Oh +# +# Copyright Tania Oh +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Eponine - Results of one Eponine run + +=head1 SYNOPSIS + + use Bio::Tools::Run::Eponine; + use strict; + my $seq = "/data/seq.fa"; + my $threshold = "0.999"; + my @params = ( '-seq' => $seq, + '-threshold' => $threshold); + + my $factory = Bio::Tools::Run::Eponine->new(@params); + # run eponine against fasta + my $r = $factory->run_eponine($seq); + my $parser = Bio::Tools::Eponine->new($r); + + while (my $feat = $parser->next_prediction){ + #$feat contains array of SeqFeature + foreach my $orf($feat) { + print $orf->seq_id. "\n"; + } + } + +=head1 DESCRIPTION + +Parser for Eponine, a probabilistic transcription start site detector +optimized for mammalian genomic sequence. This module inherits off +Bio::Tools::AnalysisResult and therefore implements +Bio::SeqAnalysisParserI (see L and +L). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Tania Oh + + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Eponine; +use vars qw(@ISA); +use strict; + +use Bio::Tools::AnalysisResult; +use Bio::Tools::Prediction::Gene; +use Bio::Tools::Prediction::Exon; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my($self,@args) = @_; + + # first call the inherited method! + my $make = $self->SUPER::_initialize_state(@args); + + # handle our own parameters + + # our private state variables + $self->{'_preds_parsed'} = 0; + #array of Bio::SeqFeatures + $self->{'_flist'} =[]; +} + +=head2 analysis_method + + Usage : $mzef->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /mzef/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /epo/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_feature + + Title : next_feature + Usage : while($gene = $mzef->next_feature()) { + # do something + } + Function: Returns the next gene structure prediction of the MZEF result + file. Call this method repeatedly until FALSE is returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_prediction() at present. + + Note that with the present version of MZEF there will only be one + object returned, because MZEF does not predict individual genes + but just potential internal exons. + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_prediction doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_prediction(@args); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $mzef->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the MZEF result + file. Call this method repeatedly until FALSE is returned. + + Note that with the present version of MZEF there will only be one + object returned, because MZEF does not predict individual genes + but just potential internal exons. + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + my $gene; + + # if the prediction section hasn't been parsed yet, we do this now + $self->_parse_predictions() unless $self->_predictions_parsed(); + + # return the next gene structure (transcript) + return $self->_prediction(); +} + +=head2 _parse_predictions + + Title : _parse_predictions() + Usage : $obj->_parse_predictions() + Function: Parses the prediction section. Automatically called by + next_prediction() if not yet done. + Example : + Returns : + +=cut + +sub _parse_predictions { + my ($self) = @_; + + while(defined($_ = $self->_readline())) { + if (! /^\#/){ #ignore introductory lines + + my @element = split; + my (%feature); + $feature {name} = $element[0]; + $feature {score} = $element[5]; + $feature {start} = $element[3]; + $feature {end} = $element[4]; + $feature {strand} = $element[6]; + $feature {source}= 'Eponine'; + $feature {primary}= 'TSS'; + $feature {program} = 'eponine-scan'; + $feature {program_version} = '2'; + + $self->create_feature(\%feature); + next; + + } + } + $self->_predictions_parsed(1); +} + +=head2 create_feature + + Title : create_feature + Usage : obj->create_feature($feature) + Function: Returns an array of features + Returns : Returns an array of features + Args : none + +=cut + +sub create_feature { + my ($self, $feat) = @_; + #create and fill Bio::EnsEMBL::Seqfeature object + + my $tss = Bio::SeqFeature::Generic->new + ( -seq_id => $feat->{'name'}, + -start => $feat->{'start'}, + -end => $feat->{'end'}, + -strand => $feat->{'strand'}, + -score => $feat->{'score'}, + -source_tag => $feat->{'source'}, + -primary_tag => $feat->{'primary'}); + + + + if ($tss) { + # add to _flist + push(@{$self->{'_flist'}}, $tss); + } + + #print $tss->gff_string; +} + + + + + + +=head2 _prediction + + Title : _prediction() + Usage : $gene = $obj->_prediction() + Function: internal + Example : + Returns : + +=cut + +sub _prediction { + my ($self) = @_; + + return undef unless(exists($self->{'_flist'}) && @{$self->{'_flist'}}); + return shift(@{$self->{'_flist'}}); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + # array of pre-parsed predictions + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Est2Genome.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Est2Genome.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,259 @@ +# $Id: Est2Genome.pm,v 1.11 2002/12/05 13:46:36 heikki Exp $ +# +# BioPerl module for Bio::Tools::Est2Genome +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Est2Genome - Parse est2genome output, makes simple Bio::SeqFeature::Generic objects + +=head1 SYNOPSIS + + use Bio::Tools::Est2Genome; + + my $featureiter = new Bio::Tools::Est2Genome(-file => 'output.est2genome'); + + # This is going to be fixed to use the SeqAnalysisI next_feature + # Method eventually when we have the objects to put the data in + # properly + while( my $f = $featureiter->parse_next_gene ) { + # process Bio::SeqFeature::Generic objects here + } + +=head1 DESCRIPTION + +This module is a parser for est2genome [EMBOSS] alignments of est/cdna +sequence to genomic DNA. This is generally accepted as the best +program for predicting splice sites based on est/cdnas*. + +This module currently does not try pull out the ungapped alignments +(Segment) but may in the future. + + +* AFAIK + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Est2Genome; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::SeqFeature::Gene::Exon; +use Bio::SeqFeature::Gene::Intron; +use Bio::SeqFeature::Gene::GeneStructure; +use Bio::SeqFeature::SimilarityPair; + +@ISA = qw(Bio::Tools::AnalysisResult ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Est2Genome(); + Function: Builds a new Bio::Tools::Est2Genome object + Returns : an instance of Bio::Tools::Est2Genome + Args : -file => 'output.est2genome' or + -fh => \*EST2GENOMEOUTPUT + -genomefirst => 1 # genome was the first input (not standard) + +=cut + +sub _initialize_state { + my($self,@args) = @_; + + # call the inherited method first + my $make = $self->SUPER::_initialize_state(@args); + + my ($genome_is_first) = $self->_rearrange([qw(GENOMEFIRST)], @args); + + delete($self->{'_genome_is_first'}); + $self->{'_genome_is_first'} = $genome_is_first if(defined($genome_is_first)); + $self->analysis_method("est2genome"); +} + +=head2 analysis_method + + Usage : $sim4->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /est2genome/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /est2genome/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 parse_next_gene + + Title : parse_next_gene + Usage : @gene = $est2genome_result->parse_next_gene; + foreach $exon (@exons) { + # do something + } + + Function: Parses the next alignments of the est2genome result file and + returns the found exons as an array of + Bio::SeqFeature::SimilarityPair objects. Call + this method repeatedly until an empty array is returned to get the + results for all alignments. + + The $exon->seq_id() attribute will be set to the identifier of the + respective sequence for both sequences. + The length is accessible via the seqlength() + attribute of $exon->query() and + $exon->est_hit(). + Returns : An array (or array reference) of Bio::SeqFeature::SimilarityPair and Bio::SeqFeature::Generic objects + Args : none + + +=cut + +sub parse_next_gene { + my ($self) = @_; + my $seensegment = 0; + my @features; + my ($qstrand,$hstrand) = (1,1); + my $lasthseqname; + while( defined($_ = $self->_readline) ) { + if( /Note Best alignment is between (reversed|forward) est and (reversed|forward) genome, (but|and) splice\s+sites imply\s+(forward gene|REVERSED GENE)/) { + if( $seensegment ) { + $self->_pushback($_); + return wantarray ? @features : \@features; + } + $hstrand = -1 if $1 eq 'reversed'; + $qstrand = -1 if $4 eq 'REVERSED GENE'; + $self->debug( "1=$1, 2=$2, 4=$4\n"); + } + elsif( /^Exon/ ) { + my ($name,$len,$score,$qstart,$qend,$qseqname, + $hstart,$hend, $hseqname) = split; + $lasthseqname = $hseqname; + my $query = new Bio::SeqFeature::Similarity(-primary => $name, + -source => $self->analysis_method, + -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE + -start => $qstart, + -end => $qend, + -strand => $qstrand, + -score => $score, + -tag => { +# 'Location' => "$hstart..$hend", + 'Sequence' => "$hseqname", + } + ); + my $hit = new Bio::SeqFeature::Similarity(-primary => 'exon_hit', + -source => $self->analysis_method, + -seq_id => $hseqname, + -start => $hstart, + -end => $hend, + -strand => $hstrand, + -score => $score, + -tag => { +# 'Location' => "$qstart..$qend", + 'Sequence' => "$qseqname", + + } + ); + push @features, new Bio::SeqFeature::SimilarityPair + (-query => $query, + -hit => $hit, + -source => $self->analysis_method); + } elsif( /^([\-\+\?])(Intron)/) { + my ($name,$len,$score,$qstart,$qend,$qseqname) = split; + push @features, new Bio::SeqFeature::Generic(-primary => $2, + -source => $self->analysis_method, + -start => $qstart, + -end => $qend, + -strand => $qstrand, + -score => $score, + -seq_id => $qseqname, + -tag => { + 'Sequence' => $lasthseqname}); + } elsif( /^Span/ ) { + } elsif( /^Segment/ ) { + $seensegment = 1; + } elsif( /^\s+$/ ) { # do nothing + } else { + $self->warn( "unknown line $_\n"); + } + } + return undef unless( @features ); + return wantarray ? @features : \@features; +} + +=head2 next_feature + + Title : next_feature + Usage : $seqfeature = $obj->next_feature(); + Function: Returns the next feature available in the analysis result, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI implementing object, or undef if there are no + more features. + Args : none + +=cut + +sub next_feature { + my ($self) = shift; + $self->throw("We haven't really done this right, yet, use parse_next_gene"); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/FootPrinter.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/FootPrinter.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,254 @@ +# BioPerl module for Bio::Tools::FootPrinter +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::FootPrinter - DESCRIPTION of Object + +=head1 SYNOPSIS + + use Bio::Tools::FootPrinter; + + my $tool = Bio::Tools::FootPrinter->new(-file=>"footprinter.out"); + + while (my $result = $tool->next_feature){ + foreach my $feat($result->sub_SeqFeature){ + print $result->seq_id."\t".$feat->start."\t".$feat->end."\t".$feat->seq->seq."\n"; + } + } + +=head1 DESCRIPTION + +A parser for FootPrinter output + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::FootPrinter; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::Generic; +use Bio::PrimarySeq; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::FootPrinter(); + Function: Builds a new Bio::Tools::FootPrinter object + Returns : Bio::Tools::FootPrinter + Args : -fh/-file => $val, # for initing input, see Bio::Root::IO + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_feature + + Title : next_feature + Usage : my $r = $footprint->next_feature + Function: Get the next feature from parser data + Returns : L + Args : none + +=cut + +sub next_feature{ + my ($self) = @_; + $self->_parse_predictions() unless $self->_predictions_parsed(); + return shift @{$self->{'_feature'}}; + +} + +=head2 _add_feature + + Title : _add_feature + Usage : $footprint->_add_feature($feat) + Function: Add feature to array + Returns : none + Args : none + +=cut + +sub _add_feature { + my ($self,$feat) = @_; + if($feat){ + push @{$self->{'_feature'}},$feat; + } +} + +=head2 _parse_predictions + + Title : _parse_predictions + Usage : my $r = $footprint->_parse_predictions + Function: do the parsing + Returns : none + Args : none + +=cut + +sub _parse_predictions { + my ($self) = @_; + $/=""; + my ($seq,$third,$name); + while ($_ = $self->_readline) { + chomp; + my @array = split("\n",$_); + if($#array == 3){ + if($name){ + $name=~s/>//; + my $feat = $self->_parse($name,$seq,$third); + $self->_add_feature($feat); + } + $name = shift @array; + $seq=$array[0]; + $third=$array[2]; + next; + } + $seq.=$array[0]; + $third.=$array[2]; + } + $name=~s/>//; + my $feat = $self->_parse($name,$seq,$third); + $self->_add_feature($feat); + + $self->_predictions_parsed(1); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $footprint->_predictions_parsed(1) + Function: Get/Set for whether predictions parsed + Returns : 1/0 + Args : none + +=cut + +sub _predictions_parsed { + my ($self,$val) = @_; + if($val){ + $self->{'_predictions_parsed'} = $val; + } + return $self->{'_predictions_parsed'}; +} + + +=head2 _parse + + Title : _parse + Usage : $footprint->_parse($name,$seq,$pattern) + Function: do the actual parsing + Returns : L + Args : none + +=cut + +sub _parse { + my ($self,$name,$seq,$pattern) = @_; + my @char = split('',$pattern); + my $prev; + my $word; + my @words; + foreach my $c(@char){ + if(!$word){ + $word .= $c; + $prev = $c; + next; + } + if ($c eq $prev){ + $word.=$c; + $prev = $c; + } + else { + #remove words with only \s + $word=~s/\s+//g; + if ($word ne ''){ + push @words, $word; + } + $word=$c; + $prev = $c; + + } + } + $word=~s/\s+//g; + if($word ne ''){ + push @words, $word; + } + my $last; + my $feat = new Bio::SeqFeature::Generic(-seq_id=>$name); + my $offset=0; + foreach my $w(@words){ + if($w !~ /^$/){ + my $index = index($pattern,$w,$offset); + $offset = $index + length($w); + my $subfeat = new Bio::SeqFeature::Generic ( -seq_id=>$name, + -start => $index+1, + -end =>$index+length($w), + -source=>"FootPrinter"); + $feat->add_sub_SeqFeature($subfeat,'EXPAND'); + } + } + my $priseq = Bio::PrimarySeq->new(-id=>$name,-seq=>$seq); + $feat->attach_seq($priseq); + return $feat; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/GFF.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/GFF.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,621 @@ +# $Id: GFF.pm,v 1.26 2002/11/24 21:35:40 jason Exp $ +# +# BioPerl module for Bio::Tools::GFF +# +# Cared for by the Bioperl core team +# +# Copyright Matthew Pocock +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser + +=head1 SYNOPSIS + + use Bio::Tools::GFF; + + # specify input via -fh or -file + my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2); + my $feature; + # loop over the input stream + while($feature = $gffio->next_feature()) { + # do something with feature + } + $gffio->close(); + + # you can also obtain a GFF parser as a SeqAnalasisParserI in + # HT analysis pipelines (see Bio::SeqAnalysisParserI and + # Bio::Factory::SeqAnalysisParserFactory) + my $factory = Bio::Factory::SeqAnalysisParserFactory->new(); + my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff"); + while($feature = $parser->next_feature()) { + # do something with feature + } + +=head1 DESCRIPTION + +This class provides a simple GFF parser and writer. In the sense of a +SeqAnalysisParser, it parses an input file or stream into SeqFeatureI +objects, but is not in any way specific to a particular analysis +program and the output that program produces. + +That is, if you can get your analysis program spit out GFF, here is +your result parser. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Matthew Pocock + +Email mrp@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::GFF; + +use vars qw(@ISA); +use strict; + +use Bio::Root::IO; +use Bio::SeqAnalysisParserI; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); + +=head2 new + + Title : new + Usage : + Function: Creates a new instance. Recognized named parameters are -file, -fh, + and -gff_version. + + Returns : a new object + Args : names parameters + + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($gff_version) = $self->_rearrange([qw(GFF_VERSION)],@args); + + # initialize IO + $self->_initialize_io(@args); + + $gff_version ||= 2; + if(($gff_version != 1) && ($gff_version != 2)) { + $self->throw("Can't build a GFF object with the unknown version ". + $gff_version); + } + $self->gff_version($gff_version); + return $self; +} + +=head2 next_feature + + Title : next_feature + Usage : $seqfeature = $gffio->next_feature(); + Function: Returns the next feature available in the input file or stream, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI implementing object, or undef if there are no + more features. + Args : none + +=cut + +sub next_feature { + my ($self) = @_; + + my $gff_string; + + # be graceful about empty lines or comments, and make sure we return undef + # if the input's consumed + while(($gff_string = $self->_readline()) && defined($gff_string)) { + next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ || + $gff_string =~ /^\/\//); + last; + } + return undef unless $gff_string; + + my $feat = Bio::SeqFeature::Generic->new(); + $self->from_gff_string($feat, $gff_string); + + return $feat; +} + +=head2 from_gff_string + + Title : from_gff_string + Usage : $gff->from_gff_string($feature, $gff_string); + Function: Sets properties of a SeqFeatureI object from a GFF-formatted + string. Interpretation of the string depends on the version + that has been specified at initialization. + + This method is used by next_feature(). It actually dispatches to + one of the version-specific (private) methods. + Example : + Returns : void + Args : A Bio::SeqFeatureI implementing object to be initialized + The GFF-formatted string to initialize it from + +=cut + +sub from_gff_string { + my ($self, $feat, $gff_string) = @_; + + if($self->gff_version() == 1) { + $self->_from_gff1_string($feat, $gff_string); + } else { + $self->_from_gff2_string($feat, $gff_string); + } +} + +=head2 _from_gff1_string + + Title : _from_gff1_string + Usage : + Function: + Example : + Returns : void + Args : A Bio::SeqFeatureI implementing object to be initialized + The GFF-formatted string to initialize it from + +=cut + +sub _from_gff1_string { + my ($gff, $feat, $string) = @_; + chomp $string; + my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @group) = split(/\t/, $string); + + if ( !defined $frame ) { + $feat->throw("[$string] does not look like GFF to me"); + } + $frame = 0 unless( $frame =~ /^\d+$/); + $feat->seq_id($seqname); + $feat->source_tag($source); + $feat->primary_tag($primary); + $feat->start($start); + $feat->end($end); + $feat->frame($frame); + if ( $score eq '.' ) { + #$feat->score(undef); + } else { + $feat->score($score); + } + if ( $strand eq '-' ) { $feat->strand(-1); } + if ( $strand eq '+' ) { $feat->strand(1); } + if ( $strand eq '.' ) { $feat->strand(0); } + foreach my $g ( @group ) { + if ( $g =~ /(\S+)=(\S+)/ ) { + my $tag = $1; + my $value = $2; + $feat->add_tag_value($1, $2); + } else { + $feat->add_tag_value('group', $g); + } + } +} + +=head2 _from_gff2_string + + Title : _from_gff2_string + Usage : + Function: + Example : + Returns : void + Args : A Bio::SeqFeatureI implementing object to be initialized + The GFF2-formatted string to initialize it from + + +=cut + +sub _from_gff2_string { + my ($gff, $feat, $string) = @_; + chomp($string); + # according to the Sanger website, GFF2 should be single-tab separated elements, and the + # free-text at the end should contain text-translated tab symbols but no "real" tabs, + # so splitting on \t is safe, and $attribs gets the entire attributes field to be parsed later + my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string); + my $attribs = join '', @attribs; # just in case the rule against tab characters has been broken + if ( !defined $frame ) { + $feat->throw("[$string] does not look like GFF2 to me"); + } + $feat->seq_id($seqname); + $feat->source_tag($source); + $feat->primary_tag($primary); + $feat->start($start); + $feat->end($end); + $feat->frame($frame); + if ( $score eq '.' ) { + #$feat->score(undef); + } else { + $feat->score($score); + } + if ( $strand eq '-' ) { $feat->strand(-1); } + if ( $strand eq '+' ) { $feat->strand(1); } + if ( $strand eq '.' ) { $feat->strand(0); } + + + # + # this routine is necessay to allow the presence of semicolons in + # quoted text Semicolons are the delimiting character for new + # tag/value attributes. it is more or less a "state" machine, with + # the "quoted" flag going up and down as we pass thorugh quotes to + # distinguish free-text semicolon and hash symbols from GFF control + # characters + + + my $flag = 0; # this could be changed to a bit and just be twiddled + my @parsed; + + # run through each character one at a time and check it + # NOTE: changed to foreach loop which is more efficient in perl + # --jasons + + foreach my $a ( split //, $attribs ) { + # flag up on entering quoted text, down on leaving it + if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 } + elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"} + elsif( $a eq '#' && ! $flag ) { last } + push @parsed, $a; + } + $attribs = join "", @parsed; # rejoin into a single string + + # + # Please feel free to fix this and make it more "perlish" + + my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited + + foreach my $pair ( @key_vals ) { + # replace semicolons that were removed from free-text above. + $pair =~ s/INSERT_SEMICOLON_HERE/;/g; + + # separate the key from the value + my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair; + + + if( defined $values ) { + my @values; + # free text is quoted, so match each free-text block + # and remove it from the $values string + while ($values =~ s/"(.*?)"//){ + # and push it on to the list of values (tags may have + # more than one value... and the value may be undef) + push @values, $1; + } + + # and what is left over should be space-separated + # non-free-text values + + my @othervals = split /\s+/, $values; + foreach my $othervalue(@othervals){ + # get rid of any empty strings which might + # result from the split + if (CORE::length($othervalue) > 0) {push @values, $othervalue} + } + + foreach my $value(@values){ + $feat->add_tag_value($key, $value); + } + } + } +} + +=head2 write_feature + + Title : write_feature + Usage : $gffio->write_feature($feature); + Function: Writes the specified SeqFeatureI object in GFF format to the stream + associated with this instance. + Returns : none + Args : An array of Bio::SeqFeatureI implementing objects to be serialized + +=cut + +sub write_feature { + my ($self, @features) = @_; + foreach my $feature ( @features ) { + $self->_print($self->gff_string($feature)."\n"); + } +} + +=head2 gff_string + + Title : gff_string + Usage : $gffstr = $gffio->gff_string($feature); + Function: Obtain the GFF-formatted representation of a SeqFeatureI object. + The formatting depends on the version specified at initialization. + + This method is used by write_feature(). It actually dispatches to + one of the version-specific (private) methods. + Example : + Returns : A GFF-formatted string representation of the SeqFeature + Args : A Bio::SeqFeatureI implementing object to be GFF-stringified + +=cut + +sub gff_string{ + my ($self, $feature) = @_; + + if($self->gff_version() == 1) { + return $self->_gff1_string($feature); + } else { + return $self->_gff2_string($feature); + } +} + +=head2 _gff1_string + + Title : _gff1_string + Usage : $gffstr = $gffio->_gff1_string + Function: + Example : + Returns : A GFF1-formatted string representation of the SeqFeature + Args : A Bio::SeqFeatureI implementing object to be GFF-stringified + +=cut + +sub _gff1_string{ + my ($gff, $feat) = @_; + my ($str,$score,$frame,$name,$strand); + + if( $feat->can('score') ) { + $score = $feat->score(); + } + $score = '.' unless defined $score; + + if( $feat->can('frame') ) { + $frame = $feat->frame(); + } + $frame = '.' unless defined $frame; + + $strand = $feat->strand(); + if(! $strand) { + $strand = "."; + } elsif( $strand == 1 ) { + $strand = '+'; + } elsif ( $feat->strand == -1 ) { + $strand = '-'; + } + + if( $feat->can('seqname') ) { + $name = $feat->seq_id(); + $name ||= 'SEQ'; + } else { + $name = 'SEQ'; + } + + + $str = join("\t", + $name, + $feat->source_tag(), + $feat->primary_tag(), + $feat->start(), + $feat->end(), + $score, + $strand, + $frame); + + foreach my $tag ( $feat->all_tags ) { + foreach my $value ( $feat->each_tag_value($tag) ) { + $str .= " $tag=$value"; + } + } + + + return $str; +} + +=head2 _gff2_string + + Title : _gff2_string + Usage : $gffstr = $gffio->_gff2_string + Function: + Example : + Returns : A GFF2-formatted string representation of the SeqFeature + Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified + +=cut + +sub _gff2_string{ + my ($gff, $feat) = @_; + my ($str,$score,$frame,$name,$strand); + + if( $feat->can('score') ) { + $score = $feat->score(); + } + $score = '.' unless defined $score; + + if( $feat->can('frame') ) { + $frame = $feat->frame(); + } + $frame = '.' unless defined $frame; + + $strand = $feat->strand(); + if(! $strand) { + $strand = "."; + } elsif( $strand == 1 ) { + $strand = '+'; + } elsif ( $feat->strand == -1 ) { + $strand = '-'; + } + + if( $feat->can('seqname') ) { + $name = $feat->seq_id(); + $name ||= 'SEQ'; + } else { + $name = 'SEQ'; + } + $str = join("\t", + $name, + $feat->source_tag(), + $feat->primary_tag(), + $feat->start(), + $feat->end(), + $score, + $strand, + $frame); + + # the routine below is the only modification I made to the original + # ->gff_string routine (above) as on November 17th, 2000, the + # Sanger webpage describing GFF2 format reads: "From version 2 + # onwards, the attribute field must have a tag value structure + # following the syntax used within objects in a .ace file, + # flattened onto one line by semicolon separators. Tags must be + # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values + # must be quoted with double quotes". + + # MW + + my $valuestr; + my @all_tags = $feat->all_tags; + if (@all_tags) { # only play this game if it is worth playing... + $str .= "\t"; # my interpretation of the GFF2 + # specification suggests the need + # for this additional TAB character...?? + foreach my $tag ( @all_tags ) { + my $valuestr; # a string which will hold one or more values + # for this tag, with quoted free text and + # space-separated individual values. + foreach my $value ( $feat->each_tag_value($tag) ) { + if ($value =~ /[^A-Za-z0-9_]/){ + $value =~ s/\t/\\t/g; # substitute tab and newline + # characters + $value =~ s/\n/\\n/g; # to their UNIX equivalents + $value = '"' . $value . '" '} # if the value contains + # anything other than valid + # tag/value characters, then + # quote it + $value = "\"\"" unless defined $value; + # if it is completely empty, + # then just make empty double + # quotes + $valuestr .= $value . " "; # with a trailing space in case + # there are multiple values + # for this tag (allowed in GFF2 and .ace format) + } + $str .= "$tag $valuestr ; "; # semicolon delimited with no '=' sign + } + chop $str; chop $str # remove the trailing semicolon and space + } + return $str; +} + +=head2 gff_version + + Title : _gff_version + Usage : $gffversion = $gffio->gff_version + Function: + Example : + Returns : The GFF version this parser will accept and emit. + Args : none + +=cut + +sub gff_version { + my ($self, $value) = @_; + if(defined $value && (($value == 1) || ($value == 2))) { + $self->{'GFF_VERSION'} = $value; + } + return $self->{'GFF_VERSION'}; +} + +# Make filehandles + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') + $feature = <$fh>; # read a feature object + print $fh $feature ; # write a feature object + Returns : filehandle tied to the Bio::Tools::GFF class + Args : + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $feature = <$fh>; # read a feature object + print $fh $feature; # write a feature object + Returns : filehandle tied to Bio::Tools::GFF class + Args : none + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'gffio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'gffio'}->next_feature() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'gffio'}->next_feature(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'gffio'}->write_feature(@_); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Gel.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Gel.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,237 @@ +# $Id: Gel.pm,v 1.6 2002/10/22 07:45:22 lapp Exp $ +# BioPerl module for Bio::Tools::Gel +# Copyright Allen Day +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Gel - Calculates relative electrophoretic migration distances + +=head1 SYNOPSIS + + #An example of a virtual restriction digest and subsequent gel run + use Bio::Seq; + use Bio::Tools::RestrictionEnzyme; + use Bio::Tools::Gel; + + my $d = 'AAAAAAAAAGAATTCTTTTTTTTTTTTTTGAATTCGGGGGGGGGGGGGGGGGGGG'; + my $seq1 = Bio::Seq->new(-id=>'groundhog day',-seq=>$d); + my $EcoRI = Bio::Tools::RestrictionEnzyme->new(-NAME=>'EcoRI'); + my @cuts = $EcoRI->cut_seq($seq); + + my $gel = Bio::Tools::Gel->new(-seq=>\@cuts,-dilate=>10); + my %bands = $gel->bands; + foreach my $band (keys %bands){ + print $band,"\t",$bands{$band},"\n"; + } + + #prints: + #25 26.0205999132796 + #10 30 + #20 26.9897000433602 + + +=head1 DESCRIPTION + +This takes a set of sequences or Bio::Seq objects, and calculates their +respective migration distances using: + distance = dilation * (4 - log10(length(dna)); + +Source: Molecular Cloning, a Laboratory Manual. Sambrook, Fritsch, Maniatis. +CSHL Press, 1989. + +Bio::Tools::Gel currently calculates migration distances based solely on +the length of the nucleotide sequence. Secondary or tertiary structure, +curvature, and other biophysical attributes of a sequence are currently +not considered. Polypeptide migration is currently not supported. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Allen Day + +Email allenday@ucla.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Gel; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::PrimarySeq; + +@ISA = qw(Bio::Root::Root); + +=head2 new + + Title : new + Usage : my $gel = new Bio::Tools::Gel(-seq => $sequence,-dilate => 3); + Function: Initializes a new Gel + Returns : Bio::Tools::Gel + Args : -seq => Bio::Seq(s), scalar(s) or list of either/both + (default: none) + -dilate => Expand band migration distances (default: 1) + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($seqs,$dilate) = $self->_rearrange([qw(SEQ DILATE)], + @args); + if( ! ref($seqs) ) { + $self->add_band([$seqs]); + } elsif( ref($seqs) =~ /array/i || + $seqs->isa('Bio::PrimarySeqI') ) { + $self->add_band($seqs); + } + $self->dilate($dilate || 1); + + return $self; +} + + +=head2 add_band + + Title : add_band + Usage : $gel->add_band($seq); + Function: Calls _add_band with a (possibly created) Bio::Seq object. + Returns : + Args : Bio::Seq, scalar sequence, or list of either/both. + +=cut + +sub add_band { + my($self,$args) = @_; + + foreach my $arg (@$args){ + my $seq; + if( ! ref($arg) ) { + if( $arg =~ /^\d+/ ) { + $seq= Bio::PrimarySeq->new(-seq=>"N"x$arg, -id => $arg); + } else { + $seq= Bio::PrimarySeq->new(-seq=>$arg,-id=>length($arg)); + } + } elsif( $arg->isa('Bio::PrimarySeqI') ) { + $seq = $arg; + } + + $seq->validate_seq or $seq->throw("invalid symbol in sequence".$seq->seq()."\n"); + $self->_add_band($seq); + } +} + +=head2 _add_band + + Title : _add_band + Usage : $gel->_add_band($seq); + Function: Adds a new band to the gel. + Returns : + Args : Bio::Seq object + +=cut + +sub _add_band { + my($self,$arg) = @_; + if( defined $arg) { + push (@{$self->{'bands'}},$arg); + } +} + +=head2 dilate + + Title : dilate + Usage : $gel->dilate(1); + Function: Sets/retrieves the dilation factor. + Returns : dilation factor + Args : Float or none + +=cut + +sub dilate { + my($self,$arg) = @_; + return $self->{dilate} unless $arg; + $self->throw("-dilate should be numeric") if defined $arg and $arg =~ /[^e\d\.]/; + $self->{dilate} = $arg; + return $self->{dilate}; +} + +sub migrate { + my ($self,$arg) = @_; + $arg = $self unless $arg; + if ( $arg ) { + return 4 - log10($arg); + } else { return 0; } +} + +=head2 bands + + Title : bands + Usage : $gel->bands; + Function: Calculates migration distances of sequences. + Returns : hash of (seq_id => distance) + Args : + +=cut + +sub bands { + my $self = shift; + $self->throw("bands() is read-only") if @_; + + my %bands = (); + + foreach my $band (@{$self->{bands}}){ + my $distance = $self->dilate * migrate($band->length); + $bands{$band->id} = $distance; + } + + return %bands; +} + +=head2 log10 + + Title : log10 + Usage : log10($n); + Function: returns base 10 log of $n. + Returns : float + Args : float + +=cut + +#from programming perl +sub log10 { + my $n = shift; + return log($n)/log(10); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Genemark.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Genemark.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,515 @@ +# $Id: Genemark.pm,v 1.11.2.1 2003/04/24 08:51:48 heikki Exp $ +# +# BioPerl module for Bio::Tools::Genemark +# +# Cared for by Mark Fiers +# +# Copyright Hilmar Lapp, Mark Fiers +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Genemark - Results of one Genemark run + +=head1 SYNOPSIS + + $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark'); + # filehandle: + $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT ); + + # parse the results + # note: this class is-a Bio::Tools::AnalysisResult which implements + # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same + while($gene = $Genemark->next_prediction()) { + # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits + # off Bio::SeqFeature::Gene::Transcript. + # + # $gene->exons() returns an array of + # Bio::Tools::Prediction::Exon objects + # all exons: + @exon_arr = $gene->exons(); + + # initial exons only + @init_exons = $gene->exons('Initial'); + # internal exons only + @intrl_exons = $gene->exons('Internal'); + # terminal exons only + @term_exons = $gene->exons('Terminal'); + # singleton exons: + ($single_exon) = $gene->exons(); + } + + # essential if you gave a filename at initialization (otherwise the file + # will stay open) + $Genemark->close(); + +=head1 DESCRIPTION + +The Genemark module provides a parser for Genemark gene structure prediction +output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript- +derived object. + +This module has been developed around genemark.hmm for eukaryots v2.2a and will +probably not work with other versions. + + +This module also implements the Bio::SeqAnalysisParserI interface, and thus +can be used wherever such an object fits. See L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp, Mark Fiers + +Email hlapp@gmx.net + m.w.e.j.fiers@plant.wag-ur.nl + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Genemark; +use vars qw(@ISA); +use strict; +use Symbol; + +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::Tools::Prediction::Gene; +use Bio::Tools::Prediction::Exon; +use Bio::Seq; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my ($self,@args) = @_; + + # first call the inherited method! + $self->SUPER::_initialize_state(@args); + + # our private state variables + $self->{'_preds_parsed'} = 0; + $self->{'_has_cds'} = 0; + # array of pre-parsed predictions + $self->{'_preds'} = []; + # seq stack + $self->{'_seqstack'} = []; +} + +=head2 analysis_method + + Usage : $Genemark->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /GeneMark.hmm/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /Genemark\.hmm/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_feature + + Title : next_feature + Usage : while($gene = $Genemark->next_feature()) { + # do something + } + Function: Returns the next gene structure prediction of the Genemark result + file. Call this method repeatedly until FALSE is returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_prediction() at present. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_prediction doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_prediction(@args); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $Genemark->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the Genemark result + file. Call this method repeatedly until FALSE is returned. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + my $gene; + + # if the prediction section hasn't been parsed yet, we do this now + $self->_parse_predictions() unless $self->_predictions_parsed(); + + # get next gene structure + $gene = $self->_prediction(); + + return $gene; +} + +=head2 _parse_predictions + + Title : _parse_predictions() + Usage : $obj->_parse_predictions() + Function: Parses the prediction section. Automatically called by + next_prediction() if not yet done. + Example : + Returns : + +=cut + +sub _parse_predictions { + my ($self) = @_; + my %exontags = ('Initial' => 'Initial', + 'Internal' => 'Internal', + 'Terminal' => 'Terminal', + 'Single' => '', + '_na_' => ''); + my $exontag; + my $gene; + my $seqname; + my $exontype; + my $current_gene_no = -1; + + while(defined($_ = $self->_readline())) { + + if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) { + + # this is an exon, Genemark doesn't predict anything else + # $prednr corresponds to geneno. + my $prednr = $1; + + #exon no: + my $signalnr = 0; + if ($2) { my $signalnr = $2; } # used in tag: exon_no + + # split into fields + chomp(); + my @flds = split(' ', $_); + + # create the feature (an exon) object + my $predobj = Bio::Tools::Prediction::Exon->new(); + + + # define info depending on it being eu- or prokaryot + my ($start, $end, $orientation, $prediction_source); + + if ($self->analysis_method() =~ /PROKARYOTIC/i) { + $prediction_source = "Genemark.hmm.pro"; + $orientation = ($flds[1] eq '+') ? 1 : -1; + ($start, $end) = @flds[(2,3)]; + $exontag = "_na_"; + + } else { + $prediction_source = "Genemark.hmm.eu"; + $orientation = ($flds[2] eq '+') ? 1 : -1; + ($start, $end) = @flds[(4,5)]; + $exontag = $flds[3]; + } + + #store the data in the exon object + $predobj->source_tag($prediction_source); + $predobj->start($start); + $predobj->end($end); + $predobj->strand($orientation); + + $predobj->primary_tag($exontags{$exontag} . "Exon"); + + $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr); + + $predobj->is_coding(1); + + + # frame calculation as in the genscan module + # is to be implemented... + + #If the $prednr is not equal to the current gene, we + #need to make a new gene and close the old one + if($prednr != $current_gene_no) { + # a new gene, store the old one if it exists + if (defined ($gene)) { + $gene->seq_id($seqname); + $gene = undef ; + } + #and make a new one + $gene = Bio::Tools::Prediction::Gene->new + ( + '-primary' => "GenePrediction$prednr", + '-source' => $prediction_source); + $self->_add_prediction($gene); + $current_gene_no = $prednr; + } + + # Add the exon to the gene + $gene->add_exon($predobj, ($exontag eq "_na_" ? + undef : $exontags{$exontag})); + + } + + if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) { + $self->analysis_method($1); + + my $gm_version = $2; + + $self->analysis_method_version($gm_version); + next; + } + + #Matrix file for eukaryot version + if (/^Matrices file:\s+(\S+)?/i) { + $self->analysis_subject($1); + # since the line after the matrix file is always the date + # (in the output file's I have seen!) extract and store this + # here + if (defined(my $_date = $self->_readline())) { + chomp ($_date); + $self->analysis_date($_date); + } + } + + #Matrix file for prokaryot version + if (/^Model file name:\s+(\S+)/) { + $self->analysis_subject($1); + # since the line after the matrix file is always the date + # (in the output file's I have seen!) extract and store this + # here + my $_date = $self->_readline() ; + if (defined($_date = $self->_readline())) { + chomp ($_date); + $self->analysis_date($_date); + } + } + + if(/^Sequence[ file]? name:\s+(.+)\s*$/i) { + $seqname = $1; + # $self->analysis_subject($seqname); + next; + } + + + /^>/ && do { + $self->_pushback($_); + + # section of predicted aa sequences on recognition + # of a fasta start, read all sequences and find the + # appropriate gene + while (1) { + my ($aa_id, $seq) = $self->_read_fasta_seq(); + last unless ($aa_id); + + #now parse through the predictions to add the pred. protein + FINDPRED: foreach my $gene (@{$self->{'_preds'}}) { + $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; + my $geneno = $1; + if ($aa_id =~ /\|gene.$geneno\|/) { + #print "x SEQ : \n $seq \nXXXX\n"; + my $seqobj = Bio::Seq->new('-seq' => $seq, + '-display_id' => $aa_id, + '-alphabet' => "protein"); + $gene->predicted_protein($seqobj); + last FINDPRED; + } + + } + } + + last; + }; + } + + # if the analysis query object contains a ref to a Seq of PrimarySeq + # object, then extract the predicted sequences and add it to the gene + # object. + if (defined $self->analysis_query()) { + my $orig_seq = $self->analysis_query(); + FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) { + my $predseq = ""; + foreach my $exon ($gene->exons()) { + #print $exon->start() . " " . $exon->end () . "\n"; + $predseq .= $orig_seq->subseq($exon->start(), $exon->end()); + } + + my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq, + '-display_id' => "transl"); + $gene->predicted_cds($seqobj); + } + } + + + $self->_predictions_parsed(1); +} + +=head2 _prediction + + Title : _prediction() + Usage : $gene = $obj->_prediction() + Function: internal + Example : + Returns : + +=cut + +sub _prediction { + my ($self) = @_; + + return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); + return shift(@{$self->{'_preds'}}); +} + +=head2 _add_prediction + + Title : _add_prediction() + Usage : $obj->_add_prediction($gene) + Function: internal + Example : + Returns : + +=cut + +sub _add_prediction { + my ($self, $gene) = @_; + + if(! exists($self->{'_preds'})) { + $self->{'_preds'} = []; + } + push(@{$self->{'_preds'}}, $gene); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + +=head2 _has_cds + + Title : _has_cds() + Usage : $obj->_has_cds() + Function: Whether or not the result contains the predicted CDSs, too. + Example : + Returns : TRUE or FALSE + +=cut + +sub _has_cds { + my ($self, $val) = @_; + + $self->{'_has_cds'} = $val if $val; + if(! exists($self->{'_has_cds'})) { + $self->{'_has_cds'} = 0; + } + return $self->{'_has_cds'}; +} + +=head2 _read_fasta_seq + + Title : _read_fasta_seq() + Usage : ($id,$seqstr) = $obj->_read_fasta_seq(); + Function: Simple but specialised FASTA format sequence reader. Uses + $self->_readline() to retrieve input, and is able to strip off + the traling description lines. + Example : + Returns : An array of two elements. + +=cut + +sub _read_fasta_seq { + my ($self) = @_; + my ($id, $seq); + local $/ = ">"; + + return 0 unless (my $entry = $self->_readline()); + + $entry =~ s/^>//; + # complete the entry if the first line came from a pushback buffer + while(! ($entry =~ />$/)) { + last unless ($_ = $self->_readline()); + $entry .= $_; + } + + # delete everything onwards from an new fasta start (>) + $entry =~ s/\n>.*$//s; + # id and sequence + + if($entry =~ s/^(.+)\n//) { + $id = $1; + $id =~ s/ /_/g; + $seq = $entry; + $seq =~ s/\s//g; + #print "\n@@ $id \n@@ $seq \n##\n"; + } else { + $self->throw("Can't parse Genemark predicted sequence entry"); + } + $seq =~ s/\s//g; # Remove whitespace + return ($id, $seq); +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Genewise.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Genewise.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,294 @@ +# $Id: Genewise.pm,v 1.10 2002/12/18 01:54:51 jason Exp $ +# +# BioPerl module for Bio::Tools::Genewise +# +# Copyright Fugu Team +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Genewise - Results of one Genewise run + +=head1 SYNOPSIS + + use Bio::Tools::Genewise; + my $gw = Bio::Tools::Genewise(-file=>"genewise.out"); + + while (my $gene = $gw->next_prediction){ + my @transcripts = $gene->transcripts; + foreach my $t(@transcripts){ + my @exons = $t->exons; + foreach my $e(@exons){ + print $e->start." ".$e->end."\n"; + } + } + } + +=head1 DESCRIPTION + +This is the parser for the output of Genewise. It takes either a file handle or +a file name and returns a Bio::SeqFeature::Gene::GeneStructure object. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Fugu Team + + Email: fugui@worf.fugu-sg.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Genewise; +use vars qw(@ISA $Srctag); +use strict; +use Symbol; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Tools::AnalysisResult; +use Bio::SeqFeature::Generic; +use Bio::SeqFeature::Gene::Exon; +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Gene::Transcript; +use Bio::SeqFeature::Gene::GeneStructure; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); +$Srctag = 'genewise'; + +=head2 new + + Title : new + Usage : $obj->new(-file=>"genewise.out"); + $obj->new(-fh=>\*GW); + Function: Constructor for genewise wrapper. Takes either a file or filehandle + Example : + Returns : L + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + return $self; +} + +=head2 _get_strand + + Title : _get_strand + Usage : $obj->_get_strand + Function: takes start and end values, swap them if start>end and returns end + Example : + Returns :$start,$end,$strand + +=cut + +sub _get_strand { + my ($self,$start,$end) = @_; + $start || $self->throw("Need a start"); + $end || $self->throw("Need an end"); + my $strand; + if ($start > $end) { + my $tmp = $start; + $start = $end; + $end = $tmp; + $strand = -1; + } + else { + $strand = 1; + } + return ($start,$end,$strand); +} + +=head2 score + + Title : score + Usage : $obj->score + Function: get/set for score info + Example : + Returns : a score value + +=cut + +sub _score { + my ($self,$val) = @_; + if($val){ + $self->{'_score'} = $val; + } + return $self->{'_score'}; +} + +=head2 _prot_id + + Title : _prot_id + Usage : $obj->_prot_id + Function: get/set for protein id + Example : + Returns :a protein id + +=cut + +sub _prot_id { + my ($self,$val) = @_; + if($val){ + $self->{'_prot_id'} = $val; + } + return $self->{'_prot_id'}; +} + +=head2 _target_id + + Title : _target_id + Usage : $obj->_target_id + Function: get/set for genomic sequence id + Example : + Returns :a target id + +=cut + +sub _target_id { + my ($self,$val) = @_; + if($val){ + $self->{'_target_id'} = $val; + } + return $self->{'_target_id'}; +} + + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $genewise->next_prediction()) { + # do something + } + Function: Returns the gene structure prediction of the Genewise result + file. Call this method repeatedly until FALSE is returned. + + Example : + Returns : a Bio::SeqFeature::Gene::GeneStructure object + Args : + +=cut + + +sub next_prediction { + my ($self) = @_; + + my $genes = new Bio::SeqFeature::Gene::GeneStructure(-source => $Srctag); + my $transcript = new Bio::SeqFeature::Gene::Transcript(-source => $Srctag); + + local ($/) = "//"; + my $score; + my $prot_id; + my $target_id; + while ( defined($_ = $self->_readline) ) { + $self->debug( $_ ) if( $self->verbose > 0); + ($score) = $_=~m/Score\s+(\d+[\.][\d]+)/; + $self->_score($score) unless defined $self->_score; + ($prot_id) = $_=~m/Query protein:\s+(\S+)/; + $self->_prot_id($prot_id) unless defined $self->_prot_id; + ($target_id) = $_=~m/Target Sequence\s+(\S+)/; + $self->_target_id($target_id) unless defined $self->_target_id; + next unless /Gene\s+\d+\n/; + + #grab exon + supporting feature info + my @exons; + + unless ( @exons = $_ =~ m/(Exon .+\s+Supporting .+)/g ) { + @exons = $_ =~ m/(Exon .+\s+)/g; + + } + my $nbr = 1; + + #loop through each exon-supporting feature pair + foreach my $e (@exons){ + my ($e_start,$e_end,$phase) = $e =~ m/Exon\s+(\d+)\s+(\d+)\s+phase\s+(\d+)/; + my $e_strand; + ($e_start,$e_end,$e_strand) = $self->_get_strand($e_start,$e_end); + $transcript->strand($e_strand) unless $transcript->strand != 0; + + my $exon = new Bio::SeqFeature::Gene::Exon + (-seq_id=>$self->_target_id, + -source => $Srctag, + -start=>$e_start, + -end=>$e_end, + #-frame => $phase, + -strand=>$e_strand); + $exon->add_tag_value('phase',$phase); + if( $self->_prot_id ) { + $exon->add_tag_value('Sequence',"Protein:".$self->_prot_id); + } + $exon->add_tag_value("Exon",$nbr++); + + if( $e =~ m/Supporting\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) { + my ($geno_start,$geno_end, + $prot_start, + $prot_end) = ($1,$2,$3,$4); + + my $prot_strand; + ($prot_start,$prot_end, + $prot_strand) = $self->_get_strand($prot_start,$prot_end); + + my $pf = new Bio::SeqFeature::Generic + ( -start => $prot_start, + -end => $prot_end, + -seq_id => $self->_prot_id, + -score => $self->_score, + -strand => $prot_strand, + -source => $Srctag, + -primary=> 'supporting_protein_feature', + ); + my $geno_strand; + ($geno_start,$geno_end, + $geno_strand) = $self->_get_strand($geno_start,$geno_end); + my $gf = new Bio::SeqFeature::Generic + ( -start => $geno_start, + -end => $geno_end, + -seq_id => $self->_target_id, + -score => $self->_score, + -strand => $geno_strand, + -source => $Srctag, + -primary => 'supporting_genomic_feature', + ); + my $fp = new Bio::SeqFeature::FeaturePair(-feature1=>$gf, + -feature2=>$pf); + + $exon->add_tag_value( 'supporting_feature' => $fp ); + } + $transcript->add_exon($exon); + } + $transcript->seq_id($self->_target_id); + $genes->add_transcript($transcript); + $genes->seq_id($self->_target_id); + return $genes; + } +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Genomewise.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Genomewise.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,219 @@ +# $Id: Genomewise.pm,v 1.1.2.1 2003/03/25 12:32:16 heikki Exp $ +# +# BioPerl module for Bio::Tools::Genomewise +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Genomewise - Results of one Genomewise run + +=head1 SYNOPSIS + + use Bio::Tools::Genomewise; + my $gw = Bio::Tools::Genomewise(-file=>"genomewise.out"); + + while (my $gene = $gw->next_prediction){ + my @transcripts = $gw->transcripts; + foreach my $t(@transcripts){ + my @exons = $t->exons; + foreach my $e(@exons){ + print $e->start." ".$e->end."\n"; + } + } + } + +=head1 DESCRIPTION + +This is the parser for the output of Genewise. It takes either a file +handle or a file name and returns a +Bio::SeqFeature::Gene::GeneStructure object. You will need to specify +the proper target sequence id on the object with the +$feature-Eseq_id($seqid). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Fugu Team + + Email: fugui@worf.fugu-sg.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Genomewise; +use vars qw(@ISA $Srctag); +use strict; + +use Bio::Tools::Genewise; +use Bio::Tools::AnalysisResult; +use Bio::SeqFeature::Generic; +use Bio::SeqFeature::Gene::Exon; +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Gene::Transcript; +use Bio::SeqFeature::Gene::GeneStructure; + +@ISA = qw(Bio::Tools::Genewise); + +$Srctag = 'genomewise'; + +=head2 new + + Title : new + Usage : $obj->new(-file=>"genewise.out"); + $obj->new(-fh=>\*GW); + Function: Constructor for genomewise wrapper. Takes either a file or filehandle + Example : + Returns : L + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + +=head2 _get_strand + + Title : _get_strand + Usage : $obj->_get_strand + Function: takes start and end values, swap them if start>end and returns end + Example : + Returns :$start,$end,$strand + +=cut + +=head2 score + + Title : score + Usage : $obj->score + Function: get/set for score info + Example : + Returns : a score value + +=cut + +=head2 _prot_id + + Title : _prot_id + Usage : $obj->_prot_id + Function: get/set for protein id + Example : + Returns :a protein id + +=cut + +=head2 _target_id + + Title : _target_id + Usage : $obj->_target_id + Function: get/set for genomic sequence id + Example : + Returns :a target id + +=cut + + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $genewise->next_prediction()) { + # do something + } + Function: Returns the gene structure prediction of the Genomewise result + file. Call this method repeatedly until FALSE is returned. + + Example : + Returns : a Bio::SeqFeature::Gene::GeneStructure object + Args : + +=cut + + +sub next_prediction { + my ($self) = @_; + + my $genes; + while ($_ = $self->_readline) { + $self->debug( $_ ) if( $self->verbose > 0); + last if( /^\/\//); + + if( /^Gene\s+\d+\s*$/ ) { + $genes = new Bio::SeqFeature::Gene::GeneStructure + (-source => $Srctag, + -seq_id => $self->_target_id, # if this had been specified + ); + $_ = $self->_readline; + $self->debug( $_ ) if( $self->verbose > 0); + + unless ( /^Gene\s+(\d+)\s+(\d+)\s*$/ ) { + $self->warn("Unparseable genomewise output"); + last; + } + my $transcript = new Bio::SeqFeature::Gene::Transcript + (-source => $Srctag, + -seq_id => $self->_target_id, # if this had been specified + -start => $1, + -end => $2, + ); + my $nbr = 1; + while( $_ = $self->_readline ) { + $self->debug( $_ ) if( $self->verbose > 0); + + unless( m/^\s+Exon\s+(\d+)\s+(\d+)\s+phase\s+(\d+)/ ){ + $self->_pushback($_); + last; + } + my ($e_start,$e_end,$phase,$e_strand) = ($1,$2,$3); + + ($e_start,$e_end,$e_strand) = $self->_get_strand($e_start, + $e_end); + $transcript->strand($e_strand) unless $transcript->strand != 0; + + my $exon = new Bio::SeqFeature::Gene::Exon + (-seq_id=>$self->_target_id, + -source => $Srctag, + -start=>$e_start, + -end=>$e_end, + -frame => $phase, + -strand=>$e_strand); + $exon->add_tag_value("Exon",$nbr++); + $exon->add_tag_value('phase',$phase); + $transcript->add_exon($exon); + } + $genes->add_transcript($transcript); + last; # only process a single gene at a time + } + } + return $genes; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Genscan.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Genscan.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,491 @@ +# $Id: Genscan.pm,v 1.22 2002/10/22 07:38:46 lapp Exp $ +# +# BioPerl module for Bio::Tools::Genscan +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Genscan - Results of one Genscan run + +=head1 SYNOPSIS + + $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan'); + # filehandle: + $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT ); + + # parse the results + # note: this class is-a Bio::Tools::AnalysisResult which implements + # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same + while($gene = $genscan->next_prediction()) { + # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits + # off Bio::SeqFeature::Gene::Transcript. + # + # $gene->exons() returns an array of + # Bio::Tools::Prediction::Exon objects + # all exons: + @exon_arr = $gene->exons(); + + # initial exons only + @init_exons = $gene->exons('Initial'); + # internal exons only + @intrl_exons = $gene->exons('Internal'); + # terminal exons only + @term_exons = $gene->exons('Terminal'); + # singleton exons: + ($single_exon) = $gene->exons(); + } + + # essential if you gave a filename at initialization (otherwise the file + # will stay open) + $genscan->close(); + +=head1 DESCRIPTION + +The Genscan module provides a parser for Genscan gene structure prediction +output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript- +derived object. + +This module also implements the Bio::SeqAnalysisParserI interface, and thus +can be used wherever such an object fits. See L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Genscan; +use vars qw(@ISA); +use strict; +use Symbol; + +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::Tools::Prediction::Gene; +use Bio::Tools::Prediction::Exon; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my ($self,@args) = @_; + + # first call the inherited method! + $self->SUPER::_initialize_state(@args); + + # our private state variables + $self->{'_preds_parsed'} = 0; + $self->{'_has_cds'} = 0; + # array of pre-parsed predictions + $self->{'_preds'} = []; + # seq stack + $self->{'_seqstack'} = []; +} + +=head2 analysis_method + + Usage : $genscan->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /genscan/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /genscan/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_feature + + Title : next_feature + Usage : while($gene = $genscan->next_feature()) { + # do something + } + Function: Returns the next gene structure prediction of the Genscan result + file. Call this method repeatedly until FALSE is returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_prediction() at present. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_prediction doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_prediction(@args); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $genscan->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the Genscan result + file. Call this method repeatedly until FALSE is returned. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + my $gene; + + # if the prediction section hasn't been parsed yet, we do this now + $self->_parse_predictions() unless $self->_predictions_parsed(); + + # get next gene structure + $gene = $self->_prediction(); + + if($gene) { + # fill in predicted protein, and if available the predicted CDS + # + my ($id, $seq); + # use the seq stack if there's a seq on it + my $seqobj = pop(@{$self->{'_seqstack'}}); + if(! $seqobj) { + # otherwise read from input stream + ($id, $seq) = $self->_read_fasta_seq(); + # there may be no sequence at all, or none any more + if($id && $seq) { + $seqobj = Bio::PrimarySeq->new('-seq' => $seq, + '-display_id' => $id, + '-alphabet' => "protein"); + } + } + if($seqobj) { + # check that prediction number matches the prediction number + # indicated in the sequence id (there may be incomplete gene + # predictions that contain only signals with no associated protein + # and CDS, like promoters, poly-A sites etc) + $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; + my $prednr = $1; + if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) { + # this is not our sequence, so push back for next prediction + push(@{$self->{'_seqstack'}}, $seqobj); + } else { + $gene->predicted_protein($seqobj); + # CDS prediction, too? + if($self->_has_cds()) { + ($id, $seq) = $self->_read_fasta_seq(); + $seqobj = Bio::PrimarySeq->new('-seq' => $seq, + '-display_id' => $id, + '-alphabet' => "dna"); + $gene->predicted_cds($seqobj); + } + } + } + } + + return $gene; +} + +=head2 _parse_predictions + + Title : _parse_predictions() + Usage : $obj->_parse_predictions() + Function: Parses the prediction section. Automatically called by + next_prediction() if not yet done. + Example : + Returns : + +=cut + +sub _parse_predictions { + my ($self) = @_; + my %exontags = ('Init' => 'Initial', + 'Intr' => 'Internal', + 'Term' => 'Terminal', + 'Sngl' => ''); + my $gene; + my $seqname; + + while(defined($_ = $self->_readline())) { + if(/^\s*(\d+)\.(\d+)/) { + # exon or signal + my $prednr = $1; + my $signalnr = $2; # not used presently + if(! defined($gene)) { + $gene = Bio::Tools::Prediction::Gene->new( + '-primary' => "GenePrediction$prednr", + '-source' => 'Genscan'); + } + # split into fields + chomp(); + my @flds = split(' ', $_); + # create the feature object depending on the type of signal + my $predobj; + my $is_exon = grep {$_ eq $flds[1];} (keys(%exontags)); + if($is_exon) { + $predobj = Bio::Tools::Prediction::Exon->new(); + } else { + # PolyA site, or Promoter + $predobj = Bio::SeqFeature::Generic->new(); + } + # set common fields + $predobj->source_tag('Genscan'); + $predobj->score($flds[$#flds]); + $predobj->strand((($flds[2] eq '+') ? 1 : -1)); + my ($start, $end) = @flds[(3,4)]; + if($predobj->strand() == 1) { + $predobj->start($start); + $predobj->end($end); + } else { + $predobj->end($start); + $predobj->start($end); + } + # add to gene structure (should be done only when start and end + # are set, in order to allow for proper expansion of the range) + if($is_exon) { + # first, set fields unique to exons + $predobj->start_signal_score($flds[8]); + $predobj->end_signal_score($flds[9]); + $predobj->coding_signal_score($flds[10]); + $predobj->significance($flds[11]); + $predobj->primary_tag($exontags{$flds[1]} . 'Exon'); + $predobj->is_coding(1); + # Figure out the frame of this exon. This is NOT the frame + # given by Genscan, which is the absolute frame of the base + # starting the first predicted complete codon. By comparing + # to the absolute frame of the first base we can compute the + # offset of the first complete codon to the first base of the + # exon, which determines the frame of the exon. + my $cod_offset; + if($predobj->strand() == 1) { + $cod_offset = $flds[6] - (($predobj->start()-1) % 3); + # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond + # to offsets 2 and 1, resp. Offset 3 is the same as 0. + $cod_offset += 3 if($cod_offset < 1); + } else { + # On the reverse strand the Genscan frame also refers to + # the first base of the first complete codon, but viewed + # from forward, which is the third base viewed from + # reverse. + $cod_offset = $flds[6] - (($predobj->end()-3) % 3); + # Possible values are -2, -1, 0, 1, 2. Due to the reverse + # situation, {2,-1} and {1,-2} correspond to offsets + # 1 and 2, resp. Offset 3 is the same as 0. + $cod_offset -= 3 if($cod_offset >= 0); + $cod_offset = -$cod_offset; + } + # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon + # is the frame of the first base relative to the exon, or the + # number of bases the first codon is missing). + $predobj->frame(3 - $cod_offset); + # then add to gene structure object + $gene->add_exon($predobj, $exontags{$flds[1]}); + } elsif($flds[1] eq 'PlyA') { + $predobj->primary_tag("PolyAsite"); + $gene->poly_A_site($predobj); + } elsif($flds[1] eq 'Prom') { + $predobj->primary_tag("Promoter"); + $gene->add_promoter($predobj); + } + next; + } + if(/^\s*$/ && defined($gene)) { + # current gene is completed + $gene->seq_id($seqname); + $self->_add_prediction($gene); + $gene = undef; + next; + } + if(/^(GENSCAN)\s+(\S+)/) { + $self->analysis_method($1); + $self->analysis_method_version($2); + next; + } + if(/^Sequence\s+(\S+)\s*:/) { + $seqname = $1; + next; + } + + if(/^Parameter matrix:\s+(\S+)/i) { + $self->analysis_subject($1); + next; + } + + if(/^Predicted coding/) { + $self->_has_cds(1); + next; + } + /^>/ && do { + # section of predicted sequences + $self->_pushback($_); + last; + }; + } + $self->_predictions_parsed(1); +} + +=head2 _prediction + + Title : _prediction() + Usage : $gene = $obj->_prediction() + Function: internal + Example : + Returns : + +=cut + +sub _prediction { + my ($self) = @_; + + return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); + return shift(@{$self->{'_preds'}}); +} + +=head2 _add_prediction + + Title : _add_prediction() + Usage : $obj->_add_prediction($gene) + Function: internal + Example : + Returns : + +=cut + +sub _add_prediction { + my ($self, $gene) = @_; + + if(! exists($self->{'_preds'})) { + $self->{'_preds'} = []; + } + push(@{$self->{'_preds'}}, $gene); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + +=head2 _has_cds + + Title : _has_cds() + Usage : $obj->_has_cds() + Function: Whether or not the result contains the predicted CDSs, too. + Example : + Returns : TRUE or FALSE + +=cut + +sub _has_cds { + my ($self, $val) = @_; + + $self->{'_has_cds'} = $val if $val; + if(! exists($self->{'_has_cds'})) { + $self->{'_has_cds'} = 0; + } + return $self->{'_has_cds'}; +} + +=head2 _read_fasta_seq + + Title : _read_fasta_seq() + Usage : ($id,$seqstr) = $obj->_read_fasta_seq(); + Function: Simple but specialised FASTA format sequence reader. Uses + $self->_readline() to retrieve input, and is able to strip off + the traling description lines. + Example : + Returns : An array of two elements. + +=cut + +sub _read_fasta_seq { + my ($self) = @_; + my ($id, $seq); + local $/ = ">"; + + my $entry = $self->_readline(); + if($entry) { + $entry =~ s/^>//; + # complete the entry if the first line came from a pushback buffer + while($entry !~ />$/) { + last unless $_ = $self->_readline(); + $entry .= $_; + } + # delete everything onwards from an intervening empty line (at the + # end there might be statistics stuff) + $entry =~ s/\n\n.*$//s; + # id and sequence + if($entry =~ /^(\S+)\n([^>]+)/) { + $id = $1; + $seq = $2; + } else { + $self->throw("Can't parse Genscan predicted sequence entry"); + } + $seq =~ s/\s//g; # Remove whitespace + } + return ($id, $seq); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Grail.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Grail.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,261 @@ +# $Id: Grail.pm,v 1.6 2002/12/01 00:05:21 jason Exp $ +# +# BioPerl module for Bio::Tools::Grail +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Grail - Results of one Grail run + +=head1 SYNOPSIS + + $grail = Bio::Tools::Grail->new(-file => 'result.grail'); + # filehandle: + $grail = Bio::Tools::Grail->new( -fh => \*INPUT ); + + # parse the results + while($gene = $grail->next_prediction()) { + # $gene is an instance of Bio::Tools::Prediction::Gene + + # $gene->exons() returns an array of + # Bio::Tools::Prediction::Exon objects + # all exons: + @exon_arr = $gene->exons(); + + # initial exons only + @init_exons = $gene->exons('Initial'); + # internal exons only + @intrl_exons = $gene->exons('Internal'); + # terminal exons only + @term_exons = $gene->exons('Terminal'); + # singleton exons only -- should be same as $gene->exons() because + # there are no other exons supposed to exist in this structure + @single_exons = $gene->exons('Single'); + } + + # essential if you gave a filename at initialization (otherwise the file + # will stay open) + $genscan->close(); + +=head1 DESCRIPTION + +The Grail module provides a parser for Grail gene structure prediction +output. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::Grail; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Tools::Prediction::Gene; +use Bio::Tools::Prediction::Exon; +use Symbol; + +@ISA = qw(Bio::Root::IO Bio::Root::Root); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $grail->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the Grail result + file. Call this method repeatedly until FALSE is returned. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + + # get next gene structure + my $gene = $self->_prediction(); + + if($gene) { + # fill in predicted protein, and if available the predicted CDS + # + my ($id, $seq); + # use the seq stack if there's a seq on it + my $seqobj = pop(@{$self->{'_seqstack'}}); + if(! $seqobj) { + # otherwise read from input stream + ($id, $seq) = $self->_read_fasta_seq(); + $seqobj = Bio::PrimarySeq->new('-seq' => $seq, + '-display_id' => $id, + '-alphabet' => "protein"); + } + # check that prediction number matches the prediction number + # indicated in the sequence id (there may be incomplete gene + # predictions that contain only signals with no associated protein + # and CDS, like promoters, poly-A sites etc) + $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; + my $prednr = $1; + if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) { + # this is not our sequence, so push back for the next prediction + push(@{$self->{'_seqstack'}}, $seqobj); + } else { + $gene->predicted_protein($seqobj); + # CDS prediction, too? + if($self->_has_cds()) { + ($id, $seq) = $self->_read_fasta_seq(); + $seqobj = Bio::PrimarySeq->new('-seq' => $seq, + '-display_id' => $id, + '-alphabet' => "dna"); + $gene->predicted_cds($seqobj); + } + } + } + return $gene; +} + +=head2 _parse_predictions + + Title : _parse_predictions() + Usage : $obj->_parse_predictions() + Function: Parses the prediction section. Automatically called by + next_prediction() if not yet done. + Example : + Returns : + +=cut + +sub _parse_predictions { + my ($self) = @_; + + # code needs to go here + + $self->_predictions_parsed(1); +} + +=head2 _prediction + + Title : _prediction() + Usage : $gene = $obj->_prediction() + Function: internal + Example : + Returns : + +=cut + +sub _prediction { + my ($self) = @_; + + return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); + return shift(@{$self->{'_preds'}}); +} + +=head2 _add_prediction + + Title : _add_prediction() + Usage : $obj->_add_prediction($gene) + Function: internal + Example : + Returns : + +=cut + +sub _add_prediction { + my ($self, $gene) = @_; + + if(! exists($self->{'_preds'})) { + $self->{'_preds'} = []; + } + push(@{$self->{'_preds'}}, $gene); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + +=head2 _has_cds + + Title : _has_cds() + Usage : $obj->_has_cds() + Function: Whether or not the result contains the predicted CDSs, too. + Example : + Returns : TRUE or FALSE + +=cut + +sub _has_cds { + my ($self, $val) = @_; + + $self->{'_has_cds'} = $val if $val; + if(! exists($self->{'_has_cds'})) { + $self->{'_has_cds'} = 0; + } + return $self->{'_has_cds'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/HMMER/Domain.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/HMMER/Domain.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,363 @@ +# $Id: Domain.pm,v 1.11 2002/10/08 08:38:34 lapp Exp $ +# +# BioPerl module for Bio::Tools::HMMER::Domain +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::HMMER::Domain - One particular domain hit from HMMER + +=head1 SYNOPSIS + +Read the Bio::Tools::HMMER::Results docs + +=head1 DESCRIPTION + +A particular domain score. We reuse the Homol SeqFeature system +here, so this inherits off Homol SeqFeature. As this code +originally came from a separate project, there are some backward +compatibility stuff provided to keep this working with old code. + +Don't forget this inherits off Bio::SeqFeature, so all your usual +nice start/end/score stuff is ready for use. + +=head1 CONTACT + +Ewan Birney, birney@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +package Bio::Tools::HMMER::Domain; + +use vars qw(@ISA); +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Generic; +use strict; + + +@ISA = qw(Bio::SeqFeature::FeaturePair); + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + $self->{'alignlines'} = []; + + my $hmmf1 = Bio::SeqFeature::Generic->new(@args); + my $hmmf2 = Bio::SeqFeature::Generic->new(@args); + + $self->feature1($hmmf1); + $self->feature2($hmmf2); + + return $self; +} + +=head2 add_alignment_line + + Title : add_alignment_line + Usage : $domain->add_alignment_line($line_from_hmmer_output); + Function: add an alignment line to this Domain object + Returns : Nothing + Args : scalar + + Adds an alignment line, mainly for storing the HMMER alignments +as flat text which can be reguritated. You're right. This is *not +nice* and not the right way to do it. C'est la vie. + +=cut + +sub add_alignment_line { + my $self = shift; + my $line = shift; + push(@{$self->{'alignlines'}},$line); +} + +=head2 each_alignment_line + + Title : each_alignment_line + Usage : foreach $line ( $domain->each_alignment_line ) + Function: reguritates the alignment lines as they were fed in. + only useful realistically for printing. + Example : + Returns : + Args : None + + +=cut + +sub each_alignment_line { + my $self = shift; + return @{$self->{'alignlines'}}; +} + +=head2 get_nse + + Title : get_nse + Usage : $domain->get_nse() + Function: Provides a seqname/start-end format, useful + for unique keys. nse stands for name-start-end + It is used alot in Pfam + Example : + Returns : A string + Args : Optional seperator 1 and seperator 2 (default / and -) + + +=cut + + + +sub get_nse { + my $self = shift; + my $sep1 = shift; + my $sep2 = shift; + + if( !defined $sep2 ) { + $sep2 = "-"; + } + if( !defined $sep1 ) { + $sep1 = "/"; + } + + return sprintf("%s%s%d%s%d",$self->seq_id,$sep1,$self->start,$sep2,$self->end); +} + + +# =head2 start_seq + +# Title : start_seq +# Usage : Backward compatibility with old HMMER modules. +# should use $domain->start +# Function: +# Example : +# Returns : +# Args : + +# =cut + +sub start_seq { + my $self = shift; + my $start = shift; + + $self->warn("Using old domain->start_seq. Should use domain->start"); + return $self->start($start); +} + +# =head2 end_seq + +# Title : end_seq +# Usage : Backward compatibility with old HMMER modules. +# should use $domain->end +# Function: +# Example : +# Returns : +# Args : + +# =cut + +sub end_seq { + my $self = shift; + my $end = shift; + + $self->warn("Using old domain->end_seq. Should use domain->end"); + return $self->end($end); +} + +# =head2 start_hmm + +# Title : start_hmm +# Usage : Backward compatibility with old HMMER modules, and +# for convience. Equivalent to $self->homol_SeqFeature->start +# Function: +# Example : +# Returns : +# Args : + +# =cut + +sub start_hmm { + my $self = shift; + my $start = shift; + $self->warn("Using old domain->start_hmm. Should use domain->hstart"); + return $self->hstart($start); +} + +# =head2 end_hmm + +# Title : end_hmm +# Usage : Backward compatibility with old HMMER modules, and +# for convience. Equivalent to $self->homol_SeqFeature->start +# Function: +# Example : +# Returns : +# Args : + +# =cut + +sub end_hmm { + my $self = shift; + my $end = shift; + + $self->warn("Using old domain->end_hmm. Should use domain->hend"); + return $self->hend($end); +} + +=head2 hmmacc + + Title : hmmacc + Usage : $domain->hmmacc($newacc) + Function: set get for HMM accession number. This is placed in the homol + feature of the HMM + Example : + Returns : + Args : + + +=cut + +sub hmmacc{ + my ($self,$acc) = @_; + if( defined $acc ) { + $self->feature2->add_tag_value('accession',$acc); + } + my @vals = $self->feature2->each_tag_value('accession'); + return shift @vals; +} + +=head2 hmmname + + Title : hmmname + Usage : $domain->hmmname($newname) + Function: set get for HMM accession number. This is placed in the homol + feature of the HMM + Example : + Returns : + Args : + +=cut + +sub hmmname { + my ($self,$hname) = @_; + + + if( defined $hname ) { + $self->hseqname($hname); + } + + return $self->hseqname(); +} + +=head2 bits + + Title : bits + Usage : + Function: backward compatibility. Same as score + Example : + Returns : + Args : + +=cut + +sub bits{ + my ($self,$sc) = @_; + + return $self->score($sc); +} + +=head2 evalue + + Title : evalue + Usage : + Function: $domain->evalue($value); + Example : + Returns : + Args : + +=cut + +sub evalue{ + my ($self,$value) = @_; + + if( defined $value ) { + $self->add_tag_value('evalue',$value); + } + my @vals = $self->each_tag_value('evalue'); + return shift @vals; +} + +=head2 seqbits + + Title : seqbits + Usage : + Function: $domain->seqbits($value); + Example : + Returns : + Args : + +=cut + +sub seqbits { + my ($self,$value) = @_; + if( defined $value ) { + $self->add_tag_value('seqbits',$value); + } + my @vals = $self->each_tag_value('seqbits'); + return shift @vals; +} + +=head2 seq_range + + Title : seq_range + Usage : + Function: Throws an exception to catch scripts which need to upgrade + Example : + Returns : + Args : + +=cut + +sub seq_range{ + my ($self,@args) = @_; + + $self->throw("You have accessed an old method. Please recode your script to the new bioperl HMMER module"); +} + +=head2 hmm_range + + Title : hmm_range + Usage : + Function: Throws an exception to catch scripts which need to upgrade + Example : + Returns : + Args : + + +=cut + +sub hmm_range{ + my ($self,@args) = @_; + + $self->throw("You have accessed an old method. Please recode your script to the new bioperl HMMER module"); +} + +1; # says use was ok +__END__ + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/HMMER/Results.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/HMMER/Results.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,968 @@ +# $Id: Results.pm,v 1.22.2.1 2003/01/07 13:58:01 jason Exp $ +# +# Perl Module for HMMResults +# +# Cared for by Ewan Birney +# +#Copyright Genome Research Limited (1997). + +=head1 NAME + +Bio::Tools::HMMER::Results - Object representing HMMER output results + +=head1 SYNOPSIS + + # parse a hmmsearch file (can also parse a hmmpfam file) + $res = new Bio::Tools::HMMER::Results( -file => 'output.hmm' , -type => 'hmmsearch'); + + # print out the results for each sequence + foreach $seq ( $res->each_Set ) { + print "Sequence bit score is",$seq->bits,"\n"; + foreach $domain ( $seq->each_Domain ) { + print " Domain start ",$domain->start," end ",$domain->end, + " score ",$domain->bits,"\n"; + } + } + + # new result object on a sequence/domain cutoff of 25 bits sequence, 15 bits domain + $newresult = $res->filter_on_cutoff(25,15); + + # alternative way of getting out all domains directly + foreach $domain ( $res->each_Domain ) { + print "Domain on ",$domain->seq_id," with score ", + $domain->bits," evalue ",$domain->evalue,"\n"; + } + +=head1 DESCRIPTION + +This object represents HMMER output, either from hmmsearch or +hmmpfam. For hmmsearch, a series of HMMER::Set objects are made, one +for each sequence, which have the the bits score for the object. For +hmmpfam searches, only one Set object is made. + + +These objects come from the original HMMResults modules used +internally in Pfam, written by Ewan. Ewan then converted them to +bioperl objects in 1999. That conversion is meant to be backwardly +compatible, but may not be (caveat emptor). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://www.bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Tools::HMMER::Results; + +use vars qw(@ISA); +use Carp; +use strict; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Tools::HMMER::Domain; +use Bio::Tools::HMMER::Set; +use Bio::SeqAnalysisParserI; +use Symbol; + +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::SeqAnalysisParserI); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + $self->{'domain'} = []; # array of HMMUnits + $self->{'seq'} = {}; + + my ($parsetype) = $self->_rearrange([qw(TYPE)],@args); + $self->_initialize_io(@args); + if( !defined $parsetype ) { + $self->throw("No parse type provided. should be hmmsearch or hmmpfam"); + } + $self->parsetype($parsetype); + if( defined $self->_fh() ) { + if( $parsetype eq 'hmmsearch' ) { + $self->_parse_hmmsearch($self->_fh()); + } elsif ( $parsetype eq 'hmmpfam' ) { + $self->_parse_hmmpfam($self->_fh()); + } else { + $self->throw("Did not recoginise type $parsetype"); + } + } + + return $self; # success - we hope! +} + + +=head2 next_feature + + Title : next_feature + Usage : while( my $feat = $res->next_feature ) { # do something } + Function: SeqAnalysisParserI implementing function + Example : + Returns : A Bio::SeqFeatureI compliant object, in this case, + each DomainUnit object, ie, flattening the Sequence + aspect of this. + Args : None + + +=cut + +sub next_feature{ + my ($self) = @_; + + if( $self->{'_started_next_feature'} == 1 ) { + return shift @{$self->{'_next_feature_array'}}; + } else { + $self->{'_started_next_feature'} = 1; + my @array; + foreach my $seq ( $self->each_Set() ) { + foreach my $unit ( $seq->each_Domain() ) { + push(@array,$unit); + } + } + my $res = shift @array; + $self->{'_next_feature_array'} = \@array; + return $res; + } + + $self->throw("Should not reach here! Error!"); +} + + +=head2 number + + Title : number + Usage : print "There are ",$res->number," domains hit\n"; + Function: provides the number of domains in the HMMER report + +=cut + +sub number { + my $self = shift; + my @val; + my $ref; + $ref = $self->{'domain'}; + + + @val = @{$self->{'domain'}}; + return scalar @val; +} + +=head2 seqfile + + Title : seqfile + Usage : $obj->seqfile($newval) + Function: + Example : + Returns : value of seqfile + Args : newvalue (optional) + + +=cut + +sub seqfile{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'seqfile'} = $value; + } + return $self->{'seqfile'}; + +} + +=head2 hmmfile + + Title : hmmfile + Usage : $obj->hmmfile($newval) + Function: + Example : + Returns : value of hmmfile + Args : newvalue (optional) + + +=cut + +sub hmmfile{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'hmmfile'} = $value; + } + return $self->{'hmmfile'}; + +} + +=head2 add_Domain + + Title : add_Domain + Usage : $res->add_Domain($unit) + Function: adds a domain to the results array. Mainly used internally. + Args : A Bio::Tools::HMMER::Domain + + +=cut + +sub add_Domain { + my $self = shift; + my $unit = shift; + my $name; + + $name = $unit->seq_id(); + + if( ! exists $self->{'seq'}->{$name} ) { + $self->warn("Adding a domain of $name but with no HMMSequence. Will be kept in domain array but not added to a HMMSequence"); + } else { + $self->{'seq'}->{$name}->add_Domain($unit); + } + push(@{$self->{'domain'}},$unit); +} + + +=head2 each_Domain + + Title : each_Domain + Usage : foreach $domain ( $res->each_Domain() ) + Function: array of Domain units which are held in this report + Returns : array + Args : none + + +=cut + +sub each_Domain { + my $self = shift; + my (@arr,$u); + + foreach $u ( @{$self->{'domain'}} ) { + push(@arr,$u); + } + + return @arr; +} + + +=head2 domain_bits_cutoff_from_evalue + + Title : domain_bits_cutoff_from_evalue + Usage : $cutoff = domain_bits_cutoff_from_evalue(0.01); + Function: return a bits cutoff from an evalue using the + scores here. Somewhat interesting logic: + Find the two bit score which straddle the evalue + if( 25 is between these two points) return 25 + else return the midpoint. + + This logic tries to ensure that with large signal to + noise separation one still has sensible 25 bit cutoff + Returns : + Args : + +=cut + +sub domain_bits_cutoff_from_evalue { + my $self = shift; + my $eval = shift; + my ($dom,$prev,@doms,$cutoff,$sep,$seen); + + @doms = $self->each_Domain; + + + @doms = map { $_->[0] } + sort { $b->[1] <=> $a->[1] } + map { [ $_, $_->bits] } @doms; + $seen = 0; + foreach $_ ( @doms ) { + if( $_->evalue > $eval ) { + $seen = 1; + $dom = $_; + last; + } + $prev = $_; + } + + if( ! defined $prev || $seen == 0) { + $self->throw("Evalue is either above or below the list..."); + return undef; + } + + $sep = $prev->bits - $dom->bits ; + + if( $sep < 1 ) { + return $prev->bits(); + } + if( $dom->bits < 25 && $prev->bits > 25 ) { + return 25; + } + + return int( $dom->bits + $sep/2 ) ; + +} + + +sub dictate_hmm_acc { + my $self = shift; + my $acc = shift; + my ($unit); + + + foreach $unit ( $self->eachHMMUnit() ) { + $unit->hmmacc($acc); + } +} + +=head2 write_FT_output + + Title : write_FT_output + Usage : $res->write_FT_output(\*STDOUT,'DOMAIN') + Function: writes feature table output ala swissprot + Returns : + Args : + + +=cut + +sub write_FT_output { + my $self = shift; + my $file = shift; + my $idt = shift; + my ($seq,$unit); + + if( !defined $idt ) { + $idt = "DOMAIN"; + } + + foreach $seq ( $self->each_Set() ) { + print $file sprintf("ID %s\n",$seq->name()); + foreach $unit ( $seq->each_Domain() ) { + print $file sprintf("FT %s %d %d %s\n",$idt, + $unit->start,$unit->end,$unit->hmmname); + } + print $file "//\n"; + } +} + +=head2 filter_on_cutoff + + Title : filter_on_cutoff + Usage : $newresults = $results->filter_on_cutoff(25,15); + Function: Produces a new HMMER::Results module which has + been trimmed at the cutoff. + Returns : a Bio::Tools::HMMER::Results module + Args : sequence cutoff and domain cutoff. in bits score + if you want one cutoff, simply use same number both places + +=cut + +sub filter_on_cutoff { + my $self = shift; + my $seqthr = shift; + my $domthr = shift; + my ($new,$seq,$unit,@array,@narray); + + if( !defined $domthr ) { + $self->throw("hmmresults filter on cutoff needs two arguments"); + } + + $new = Bio::Tools::HMMER::Results->new(-type => $self->parsetype); + + foreach $seq ( $self->each_Set()) { + next if( $seq->bits() < $seqthr ); + $new->add_Set($seq); + foreach $unit ( $seq->each_Domain() ) { + next if( $unit->bits() < $domthr ); + $new->add_Domain($unit); + } + } + $new; +} + +=head2 write_ascii_out + + Title : write_ascii_out + Usage : $res->write_ascii_out(\*STDOUT) + Function: writes as + seq seq_start seq_end model-acc model_start model_end model_name + Returns : + Args : + + FIXME: Now that we have no modelacc, this is probably a bad thing. + +=cut + +# writes as seq sstart send modelacc hstart hend modelname + +sub write_ascii_out { + my $self = shift; + my $fh = shift; + my ($unit,$seq); + + if( !defined $fh) { + $fh = \*STDOUT; + } + + + foreach $seq ( $self->each_Set()) { + foreach $unit ( $seq->each_Domain()) { + print $fh sprintf("%s %4d %4d %s %4d %4d %4.2f %4.2g %s\n", + $unit->seq_id(),$unit->start(),$unit->end(), + $unit->hmmacc,$unit->hstart,$unit->hend, + $unit->bits,$unit->evalue,$unit->hmmname); + } + } + +} + +=head2 write_GDF_bits + + Title : write_GDF_bits + Usage : $res->write_GDF_bits(25,15,\*STDOUT) + Function: writes GDF format with a sequence,domain threshold + Returns : + Args : + +=cut + +sub write_GDF_bits { + my $self = shift; + my $seqt = shift; + my $domt = shift; + my $file = shift; + my $seq; + my $unit; + my (@array,@narray); + + if( !defined $file ) { + $self->throw("Attempting to use write_GDF_bits without passing in correct arguments!"); + return; + } + + foreach $seq ( $self->each_Set()) { + + if( $seq->bits() < $seqt ) { + next; + } + + foreach $unit ( $seq->each_Domain() ) { + if( $unit->bits() < $domt ) { + next; + } + push(@array,$unit); + } + + } + + @narray = sort { my ($aa,$bb,$st_a,$st_b); + $aa = $a->seq_id(); + $bb = $b->seq_id(); + if ( $aa eq $bb) { + $st_a = $a->start(); + $st_b = $b->start(); + return $st_a <=> $st_b; + } + else { + return $aa cmp $bb; + } } @array; + + foreach $unit ( @narray ) { + print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue); + } + +} + +sub write_scores_bits { + my $self = shift; + my $seqt = shift; + my $domt = shift; + my $file = shift; + my $seq; + my $unit; + my (@array,@narray); + + if( !defined $file ) { + carp("Attempting to use write_scores_bits without passing in correct arguments!"); + return; + } + + foreach $seq ( $self->eachHMMSequence()) { + + if( $seq->bits() < $seqt ) { + next; + } + + foreach $unit ( $seq->eachHMMUnit() ) { + if( $unit->bits() < $domt ) { + next; + } + push(@array,$unit); + } + + } + + @narray = sort { my ($aa,$bb,$st_a,$st_b); + $aa = $a->bits(); + $bb = $b->bits(); + return $aa <=> $bb; + } @array; + + foreach $unit ( @narray ) { + print $file sprintf("%4.2f %s\n",$unit->bits(),$unit->get_nse()); + } + +} + +sub write_GDF { + my $self = shift; + my $file = shift; + my $unit; + + if( !defined $file ) { + $file = \*STDOUT; + } + + + foreach $unit ( $self->eachHMMUnit() ) { + print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue); + } + +} + +sub highest_noise { + my $self = shift; + my $seqt = shift; + my $domt = shift; + my ($seq,$unit,$hseq,$hdom,$noiseseq,$noisedom); + + $hseq = $hdom = -100000; + + foreach $seq ( $self->eachHMMSequence()) { + if( $seq->bits() < $seqt && $seq->bits() > $hseq ) { + $hseq = $seq->bits(); + $noiseseq = $seq; + } + foreach $unit ( $seq->eachHMMUnit() ) { + if( (($seq->bits() < $seqt) || ($seq->bits() > $seqt && $unit->bits < $domt)) && $unit->bits() > $hdom ) { + $hdom = $unit->bits(); + $noisedom = $unit; + } + } + } + + + return ($noiseseq,$noisedom); + +} + + +sub lowest_true { + my $self = shift; + my $seqt = shift; + my $domt = shift; + my ($seq,$unit,$lowseq,$lowdom,$trueseq,$truedom); + + if( ! defined $domt ) { + carp "lowest true needs at least a domain threshold cut-off"; + return (0,0); + } + + $lowseq = $lowdom = 100000; + + foreach $seq ( $self->eachHMMSequence()) { + + if( $seq->bits() >= $seqt && $seq->bits() < $lowseq ) { + $lowseq = $seq->bits(); + $trueseq = $seq; + } + if( $seq->bits() < $seqt ) { + next; + } + + foreach $unit ( $seq->eachHMMUnit() ) { + if( $unit->bits() >= $domt && $unit->bits() < $lowdom ) { + $lowdom = $unit->bits(); + $truedom = $unit; + } + } + } + + + return ($trueseq,$truedom); + +} + + + +=head2 add_Set + + Title : add_Set + Usage : Mainly internal function + Function: + Returns : + Args : + + +=cut + +sub add_Set { + my $self = shift; + my $seq = shift; + my $name; + + $name = $seq->name(); + + if( exists $self->{'seq'}->{$name} ) { + $self->throw("You alredy have $name in HMMResults!"); + } + $self->{'seq'}->{$name} = $seq; +} + + +=head2 each_Set + + Title : each_Set + Usage : + Function: + Returns : + Args : + + +=cut + +sub each_Set { + my $self = shift; + my (@array,$name); + + + foreach $name ( keys %{$self->{'seq'}} ) { + push(@array,$self->{'seq'}->{$name}); + } + return @array; +} + + +=head2 get_Set + + Title : get_Set + Usage : $set = $res->get_Set('sequence-name'); + Function: returns the Set for a particular sequence + Returns : a HMMER::Set object + Args : name of the sequence + + +=cut + +sub get_Set { + my $self = shift; + my $name = shift; + + return $self->{'seq'}->{$name}; +} + + +=head2 _parse_hmmpfam + + Title : _parse_hmmpfam + Usage : $res->_parse_hmmpfam($filehandle) + Function: + Returns : + Args : + + +=cut + +sub _parse_hmmpfam { + my $self = shift; + my $file = shift; + + my ($id,$sqfrom,$sqto,$hmmf,$hmmt,$sc,$ev, + $unit,$nd,$seq,$name,$seqname,$from, + $to,%hash,%acc,$acc); + my $count = 0; + + while(<$file>) { + if( /^HMM file:\s+(\S+)/ ) { $self->hmmfile($1); next; } + elsif( /^Sequence file:\s+(\S+)/ ) { $self->seqfile($1); next } + elsif( /^Query(\s+sequence)?:\s+(\S+)/ ) { + + $seqname = $2; + + $seq = Bio::Tools::HMMER::Set->new(); + + $seq ->name($seqname); + $self->add_Set($seq); + %hash = (); + + while(<$file>){ + + if( /Accession:\s+(\S+)/ ) { $seq->accession($1); next } + elsif( s/^Description:\s+// ) { chomp; $seq->desc($_); next } + /^Parsed for domains/ && last; + + # This is to parse out the accession numbers in old Pfam format. + # now not support due to changes in HMMER. + + if( (($id,$acc, $sc, $ev, $nd) = /^\s*(\S+)\s+(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) { + $hash{$id} = $sc; # we need this for the sequence + # core of the domains below! + $acc {$id} = $acc; + + # this is the more common parsing routine + } elsif ( (($id,$sc, $ev, $nd) = + /^\s*(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/) ) { + + $hash{$id} = $sc; # we need this for the + # sequence score of hte domains below! + + } + } + + while(<$file>) { + /^Align/ && last; + /^\/\// && last; + # this is meant to match + + #Sequence Domain seq-f seq-t hmm-f hmm-t score E-value + #-------- ------- ----- ----- ----- ----- ----- ------- + #PF00621 1/1 198 372 .. 1 207 [] 281.6 1e-80 + + if( (($id, $sqfrom, $sqto, $hmmf,$hmmt,$sc, $ev) = + /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) { + $unit = Bio::Tools::HMMER::Domain->new(); + $unit->seq_id ($seqname); + $unit->hmmname ($id); + $unit->start ($sqfrom); + $unit->end ($sqto); + $unit->hstart($hmmf); + $unit->hend ($hmmt); + $unit->bits ($sc); + $unit->evalue ($ev); + + if( !exists($hash{$id}) ) { + $self->throw("HMMResults parsing error in hmmpfam for $id - can't find sequecne score"); + } + + $unit->seqbits($hash{$id}); + + if( defined $acc{$id} ) { + $unit->hmmacc($acc{$id}); + } + + # this should find it's own sequence! + $self->add_Domain($unit); + } + } + if( /^\/\// ) { next; } + + $_ = <$file>; + # parses alignment lines. Icky as we have to break on the same line + # that we need to read to place the alignment lines with the unit. + + while(1) { + (!defined $_ || /^\/\//) && last; + + # matches: + # PF00621: domain 1 of 1, from 198 to 372 + if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) { + + $name = $1; + $from = $2; + $to = $3; + + # find the HMMUnit which this alignment is from + + $unit = $self->get_unit_nse($seqname,$name,$from,$to); + if( !defined $unit ) { + $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!"); + $_ = <$file>; + next; + } + while(<$file>) { + /^\/\// && last; + /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last; + $unit->add_alignment_line($_); + } + } else { + $_ = <$file>; + } + } + + # back to main 'Query:' loop + } + } +} + +# mainly internal function + +sub get_unit_nse { + my $self = shift; + my $seqname = shift; + my $domname = shift; + my $start = shift; + my $end = shift; + + my($seq,$unit); + + $seq = $self->get_Set($seqname); + + if( !defined $seq ) { + $self->throw("Could not get sequence name $seqname - so can't get its unit"); + } + + foreach $unit ( $seq->each_Domain() ) { + if( $unit->hmmname() eq $domname && $unit->start() == $start && $unit->end() == $end ) { + return $unit; + } + } + + return undef; +} + + +=head2 _parse_hmmsearch + + Title : _parse_hmmsearch + Usage : $res->_parse_hmmsearch($filehandle) + Function: + Returns : + Args : + + +=cut + +sub _parse_hmmsearch { + my $self = shift; + my $file = shift; + my ($id,$sqfrom,$sqto,$sc,$ev,$unit,$nd,$seq,$hmmf,$hmmt, + $hmmfname,$hmmacc, $hmmid, %seqh); + my $count = 0; + + while(<$file>) { + /^HMM file:\s+(\S+)/ and do { $self->hmmfile($1); $hmmfname = $1 }; + /^Accession:\s+(\S+)/ and do { $hmmacc = $1 }; + /^Query HMM:\s+(\S+)/ and do { $hmmid = $1 }; + /^Sequence database:\s+(\S+)/ and do { $self->seqfile($1) }; + /^Scores for complete sequences/ && last; + } + + $hmmfname = "given" if not $hmmfname; + + while(<$file>) { + /^Parsed for domains/ && last; + if( (($id, $sc, $ev, $nd) = /(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) { + $seq = Bio::Tools::HMMER::Set->new(); + $seq->name($id); + $seq->bits($sc); + $seqh{$id} = $sc; + $seq->evalue($ev); + $self->add_Set($seq); + $seq->accession($hmmacc); + } + } + + while(<$file>) { + /^Alignments of top-scoring domains/ && last; + if( (($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev) = /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) { + $unit = Bio::Tools::HMMER::Domain->new(); + + $unit->seq_id($id); + $unit->hmmname($hmmfname); + $unit->start($sqfrom); + $unit->end($sqto); + $unit->bits($sc); + $unit->hstart($hmmf); + $unit->hend($hmmt); + $unit->evalue($ev); + $unit->seqbits($seqh{$id}); + $self->add_Domain($unit); + $count++; + } + } + + $_ = <$file>; + + ## Recognize and store domain alignments + + while(1) { + if( !defined $_ ) { + last; + } + /^Histogram of all scores/ && last; + + # matches: + # PF00621: domain 1 of 1, from 198 to 372 + if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) { + my $name = $1; + my $from = $2; + my $to = $3; + + # find the HMMUnit which this alignment is from + $unit = $self->get_unit_nse($name,$hmmfname,$from,$to); + + if( !defined $unit ) { + $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!"); + next; + } + while(<$file>) { + /^Histogram of all scores/ && last; + /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last; + $unit->add_alignment_line($_); + } + } + else { + $_ = <$file>; + } + } + + return $count; +} + +=head2 parsetype + + Title : parsetype + Usage : $obj->parsetype($newval) + Function: + Returns : value of parsetype + Args : newvalue (optional) + + +=cut + +sub parsetype{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_parsetype'} = $value; + } + return $self->{'_parsetype'}; +} + +1; # says use was ok +__END__ + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/HMMER/Set.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/HMMER/Set.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,254 @@ +# $Id: Set.pm,v 1.13 2002/10/22 07:45:23 lapp Exp $ +# +# BioPerl module for Bio::Tools::HMMER::Set +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::HMMER::Set - Set of identical domains from HMMER matches + +=head1 SYNOPSIS + + # get a Set object probably from the results object + print "Bits score over set ",$set->bits," evalue ",$set->evalue,"\n"; + + foreach $domain ( $set->each_Domain ) { + print "Domain start ",$domain->start," end ",$domain->end,"\n"; + } + +=head1 DESCRIPTION + +Represents a set of HMMER domains hitting one sequence. HMMER reports two +different scores, a per sequence total score (and evalue) and a per +domain score and evalue. This object represents a collection of the same +domain with the sequence bits score and evalue. (these attributes are also +on the per domain scores, which you can get there). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. + +Bug reports can be submitted via email or the web: + + bioperl-bugs@bioperl.org + http://www.bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::HMMER::Set; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Tools::HMMER::Domain; + +@ISA = qw( Bio::Root::Root ); + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($name,$acc,$desc) = $self->_rearrange([qw(NAME ACCESSION DESC)], + @args); + $name && $self->name($name); + $acc && $self->accession($acc); + $desc && $self->desc($desc); + + + $self->{'domains'} = []; + $self->{'domainnames'} = {}; + return $self; +} + +=head2 add_Domain + + Title : add_Domain + Usage : $set->add_Domain($domain) + Function: adds the domain to the list + Returns : nothing + Args : A Bio::Tools::HMMER::Domain object + +=cut + +sub add_Domain{ + my ($self,$domain) = @_; + + + if( ! defined $domain || ! $domain->isa("Bio::Tools::HMMER::Domain") ) { + $self->throw("[$domain] is not a Bio::Tools::HMMER::Domain. aborting"); + } + return if $self->{'domainnames'}->{$domain->get_nse}++; + push(@{$self->{'domains'}},$domain); + +} + +=head2 each_Domain + + Title : each_Domain + Usage : foreach $domain ( $set->each_Domain() ) + Function: returns an array of domain objects in this set + Returns : array + Args : none + + +=cut + +sub each_Domain{ + my ($self,@args) = @_; + + return @{$self->{'domains'}}; +} + +=head2 name + + Title : name + Usage : $obj->name($newval) + Function: + Example : + Returns : value of name + Args : newvalue (optional) + + +=cut + +sub name{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'name'} = $value; + } + return $obj->{'name'}; + +} + +=head2 desc + + Title : desc + Usage : $obj->desc($newval) + Function: + Example : + Returns : value of desc + Args : newvalue (optional) + +=cut + +sub desc{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'desc'} = $value; + } + return $self->{'desc'}; + +} + +=head2 accession + + Title : accession + Usage : $obj->accession($newval) + Function: + Example : + Returns : value of accession + Args : newvalue (optional) + + +=cut + +sub accession{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'accession'} = $value; + } + return $self->{'accession'}; +} + + +=head2 bits + + Title : bits + Usage : $obj->bits($newval) + Function: + Example : + Returns : value of bits + Args : newvalue (optional) + + +=cut + +sub bits{ + my ($obj,$value) = @_; + + if( defined $value) { + $obj->{'bits'} = $value; + } + return $obj->{'bits'}; + +} + +=head2 evalue + + Title : evalue + Usage : $obj->evalue($newval) + Function: + Example : + Returns : value of evalue + Args : newvalue (optional) + + +=cut + +sub evalue{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'evalue'} = $value; + } + return $obj->{'evalue'}; + +} + + +sub addHMMUnit { + my $self = shift; + my $unit = shift; + + $self->warn("Using old addHMMUnit call on Bio::Tools::HMMER::Set. Should replace with add_Domain"); + return $self->add_Domain($unit); +} + +sub eachHMMUnit { + my $self = shift; + $self->warn("Using old eachHMMUnit call on Bio::Tools::HMMER::Set. Should replace with each_Domain"); + return $self->each_Domain(); +} + +1; # says use was ok +__END__ + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Hmmpfam.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Hmmpfam.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,215 @@ +#BioPerl module for Bio::Tools::Hmmpfam +# +# Cared for by Balamurugan Kumarasamy +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::Tools::Hmmpfam + +=head1 SYNOPSIS + + use Bio::Tools::Hmmpfam; + my $hmmpfam_parser = new Bio::Tools::Hmmpfam(-fh =>$filehandle ); + while( my $hmmpfam_feat = $hmmpfam_parser->next_result ) { + push @hmmpfam_feat, $hmmpfam_feat; + } + +=head1 DESCRIPTION + + Parser for Hmmpfam program + +=head1 FEEDBACK + +=head2 Mailing Lists + + User feedback is an integral part of the evolution of this and other + Bioperl modules. Send your comments and suggestions preferably to + the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + + Report bugs to the Bioperl bug tracking system to help us keep track + of the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Balamurugan Kumarasamy + + Email: fugui@worf.fugu-sg.org + +=head1 APPENDIX + + The rest of the documentation details each of the object methods. + Internal methods are usually preceded with a _ + + +=cut + +package Bio::Tools::Hmmpfam; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::FeaturePair; +use Bio::Root::IO; +use Bio::SeqFeature::Generic; +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + + + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Hmmpfam(-fh=>$filehandle); + Function: Builds a new Bio::Tools::Hmmpfam object + Returns : Bio::Tools::Hmmpfam + Args : -filename + -fh (filehandle) + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + + +=head2 next_result + + Title : next_result + Usage : my $feat = $hmmpfam_parser->next_result + Function: Get the next result set from parser data + Returns : L + Args : none + +=cut + +sub next_result { + my ($self) = @_; + my $filehandle; + + my $line; + + my $id; + while ($_=$self->_readline()) { + $line = $_; + chomp $line; + + + last if $line=~m/^Alignments of top-scoring domains/; + next if ($line=~m/^Model/ || /^\-/ || /^$/); + + if ($line=~m/^Query sequence:\s+(\S+)/) { + $id = $1; + $self->seqname($id); + } + + if (my ($hid, $start, $end, $hstart, $hend, $score, $evalue) = $line=~m/^(\S+)\s+\S+\s+(\d+)\s+(\d+)\s+\S+\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)/) { + my %feature; + + ($feature{name}) = $self->seqname; + $feature{score} = $score; + $feature{p_value} = sprintf ("%.3e", $evalue); + $feature{start} = $start; + $feature{end} = $end; + $feature{hname} = $hid; + $feature{hstart} = $hstart; + $feature{hend} = $hend; + ($feature{source}) = 'pfam'; + $feature{primary} = $hid; + ($feature{program}) = 'pfam'; + ($feature{db}) = 'db1'; + ($feature{logic_name}) = 'hmmpfam'; + my $new_feat = $self->create_feature (\%feature); + return $new_feat + + } + next; + + } + return; +} + +=head2 create_feature + + Title : create_feature + Usage : my $feat=$hmmpfam_parser->create_feature($feature,$seqname) + Function: creates a SeqFeature Generic object + Returns : L + Args : + + +=cut + +sub create_feature { + my ($self, $feat) = @_; + + + + my $feature1= Bio::SeqFeature::Generic->new( -seqname =>$feat->{name}, + -start =>$feat->{start}, + -end =>$feat->{end}, + -score =>$feat->{score}, + -source =>$feat->{source}, + -primary =>$feat->{primary}, + ); + + + + my $feature2= Bio::SeqFeature::Generic->new( + -start =>$feat->{hstart}, + -end =>$feat->{hend}, + ); + + + + + my $featurepair = Bio::SeqFeature::FeaturePair->new; + $featurepair->feature1 ($feature1); + $featurepair->feature2 ($feature2); + + $featurepair->add_tag_value('evalue',$feat->{p_value}); + $featurepair->add_tag_value('percent_id','NULL'); + $featurepair->add_tag_value("hid",$feat->{primary}); + return $featurepair; + +} + +=head2 seqname + + Title : seqname + Usage : obj->seqname($seqname) + Function: Internal(not to be used directly) + Returns : + Args : seqname + +=cut + +sub seqname{ + my($self,$seqname)=@_; + + if(defined($seqname)) + { + $self->{'seqname'}=$seqname; + } + + return $self->{'seqname'}; + +} + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/IUPAC.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/IUPAC.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,308 @@ +# $Id: IUPAC.pm,v 1.19 2002/11/30 15:39:53 jason Exp $ +# +# BioPerl module for IUPAC +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object + +=head1 SYNOPSIS + + use Bio::Seq; + use Bio::Tools::IUPAC; + + my $ambiseq = new Bio::Seq (-seq => 'ARTCGUTGR', -alphabet => 'dna'); + my $stream = new Bio::Tools::IUPAC(-seq => $ambiseq); + + while ($uniqueseq = $stream->next_seq()) { + # process the unique Seq object. + } + +=head1 DESCRIPTION + +IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq +objects from an ambiquous Seq object (containing non-standard characters given +the meaning shown below) + + Extended Dna / Rna alphabet : + (includes symbols for nucleotide ambiguity) + ------------------------------------------ + Symbol Meaning Nucleic Acid + ------------------------------------------ + A A Adenine + C C Cytosine + G G Guanine + T T Thymine + U U Uracil + M A or C + R A or G + W A or T + S C or G + Y C or T + K G or T + V A or C or G + H A or C or T + D A or G or T + B C or G or T + X G or A or T or C + N G or A or T or C + + IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: + Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. + +----------------------------------- + + Amino Acid alphabet: + ------------------------------------------ + Symbol Meaning + ------------------------------------------ + A Alanine + B Aspartic Acid, Asparagine + C Cystine + D Aspartic Acid + E Glutamic Acid + F Phenylalanine + G Glycine + H Histidine + I Isoleucine + K Lysine + L Leucine + M Methionine + N Asparagine + P Proline + Q Glutamine + R Arginine + S Serine + T Threonine + V Valine + W Tryptophan + X Unknown + Y Tyrosine + Z Glutamic Acid, Glutamine + * Terminator + + + IUPAC-IUP AMINO ACID SYMBOLS: + Biochem J. 1984 Apr 15; 219(2): 345-373 + Eur J Biochem. 1993 Apr 1; 213(1): 2 + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://www.bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tools::IUPAC; + +use strict; +use vars qw(@ISA %IUP %IUB $AUTOLOAD); + +BEGIN { + %IUB = ( A => [qw(A)], + C => [qw(C)], + G => [qw(G)], + T => [qw(T)], + U => [qw(U)], + M => [qw(A C)], + R => [qw(A G)], + W => [qw(A T)], + S => [qw(C G)], + Y => [qw(C T)], + K => [qw(G T)], + V => [qw(A C G)], + H => [qw(A C T)], + D => [qw(A G T)], + B => [qw(C G T)], + X => [qw(G A T C)], + N => [qw(G A T C)] + ); + + %IUP = (A => [qw(A)], + B => [qw(D N)], + C => [qw(C)], + D => [qw(D)], + E => [qw(E)], + F => [qw(F)], + G => [qw(G)], + H => [qw(H)], + I => [qw(I)], + K => [qw(K)], + L => [qw(L)], + M => [qw(M)], + N => [qw(N)], + P => [qw(P)], + Q => [qw(Q)], + R => [qw(R)], + S => [qw(S)], + T => [qw(T)], + U => [qw(U)], + V => [qw(V)], + W => [qw(W)], + X => [qw(X)], + Y => [qw(Y)], + Z => [qw(E Q)], + '*' => ['*'] + ); + +} +use Bio::Root::Root; +@ISA = qw(Bio::Root::Root); + +=head2 new + + Title : new + Usage : new Bio::Tools::IUPAC $seq; + Function: returns a new seq stream (akin to SeqIO) + Returns : a Bio::Tools::IUPAC stream object that will produce unique + Seq objects on demand. + Args : an ambiguously coded Seq.pm object that has a specified 'type' + + +=cut + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($seq) = $self->_rearrange([qw(SEQ)],@args); + if((! defined($seq)) && @args && ref($args[0])) { + # parameter not passed as named parameter? + $seq = $args[0]; + } + $seq->isa('Bio::Seq') or + $self->throw("Must supply a Seq.pm object to IUPAC!"); + $self->{'_SeqObj'} = $seq; + if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) { + # nucleotide seq object + $self->{'_alpha'} = [ map { $IUB{uc($_)} } + split('', $self->{'_SeqObj'}->seq()) ]; + } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) { + # amino acid seq object + $self->{'_alpha'} = [ map { $IUP{uc($_)} } + split('', $self->{'_SeqObj'}->seq()) ]; + } else { # unknown type: we could make a guess, but let's not. + $self->throw("You must specify the 'type' of sequence provided to IUPAC"); + } + $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())]; + scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!"); + $self->{'_string'}->[0] = -1; + return $self; +} + +=head2 next_seq + + Title : next_seq + Usage : $iupac->next_seq() + Function: returns the next unique Seq object + Returns : a Seq.pm object + Args : none. + + +=cut + +sub next_seq{ + my ($self) = @_; + + for my $i ( 0 .. $#{$self->{'_string'}} ) { + next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1; + if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover + if ( $i == $#{$self->{'_string'}} ) { # end of possibilities + return undef; + } else { + $self->{'_string'}->[$i] = 0; + next; + } + } else { + $self->{'_string'}->[$i]++; + my $j = -1; + $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}})); + my $desc = $self->{'_SeqObj'}->desc(); + if ( !defined $desc ) { $desc = ""; } + + $self->{'_num'}++; + 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/; + $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/; + $self->{'_SeqObj'}->desc($desc); + $self->{'_num'} =~ s/,//g; + return $self->{'_SeqObj'}; + } + } +} + +=head2 iupac_iup + + Title : iupac_iup + Usage : my %aasymbols = $iupac->iupac_iup + Function: Returns a hash of PROTEIN symbols -> symbol components + Returns : Hash + Args : none + +=cut + +sub iupac_iup{ + return %IUP; + +} + +=head2 iupac_iub + + Title : iupac_iub + Usage : my %dnasymbols = $iupac->iupac_iub + Function: Returns a hash of DNA symbols -> symbol components + Returns : Hash + Args : none + +=cut + +sub iupac_iub{ + return %IUB; +} + +sub AUTOLOAD { + + my $self = shift @_; + my $method = $AUTOLOAD; + $method =~ s/.*:://; + return $self->{'_SeqObj'}->$method(@_) + unless $method eq 'DESTROY'; +} + +1; + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Lucy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Lucy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,691 @@ +# $Id: Lucy.pm,v 1.6 2002/10/22 07:38:46 lapp Exp $ +# +# BioPerl module for Bio::Tools::Lucy +# +# Copyright Her Majesty the Queen of England +# written by Andrew Walsh (paeruginosa@hotmail.com) during employment with +# Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Lucy - Object for analyzing the output from Lucy, + a vector and quality trimming program from TIGR + +=head1 SYNOPSIS + + # Create the Lucy object from an existing Lucy output file + @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1); + $lucyObj = Bio::Tools::Lucy->new(@params); + + # Get names of all sequences + $names = $lucyObj->get_sequence_names(); + + # Print seq and qual values for sequences >400 bp in order to run CAP3 + foreach $name (@$names) { + next unless $lucyObj->length_clear($name) > 400; + print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; + print QUAL ">$name\n", $lucyObj->quality($name), "\n"; + } + + # Get an array of Bio::PrimarySeq objects + @seqObjs = $lucyObj->get_Seq_Objs(); + + +=head1 DESCRIPTION + +Bio::Tools::Lucy.pm provides methods for analyzing the sequence and +quality values generated by Lucy program from TIGR. + +Lucy will identify vector, poly-A/T tails, and poor quality regions in +a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf) + +The input to Lucy can be the Phred sequence and quality files +generated from running Phred on a set of chromatograms. + +Lucy can be obtained (free of charge to academic users) from +www.tigr.org/softlab + +There are a few methods that will only be available if you make some +minor changes to the source for Lucy and then recompile. The changes +are in the 'lucy.c' file and there is a diff between the original and +the modified file in the Appendix + +Please contact the author of this module if you have any problems +making these modifications. + +You do not have to make these modifications to use this module. + +=head2 Creating a Lucy object + + @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1, + 'fwd_desig' => '_F', 'rev_desig' => '_R'); + $lucyObj = Bio::Tools::Lucy->new(@params); + +=head2 Using a Lucy object + + You should get an array with the sequence names in order to use + accessor methods. Note: The Lucy binary program will fail unless + the sequence names provided as input are unique. + + $names_ref = $lucyObj->get_sequence_names(); + + This code snippet will produce a Fasta format file with sequence + lengths and %GC in the description line. + + foreach $name (@$names) { + print FILE ">$name\t", + $lucyObj->length_clear($name), "\t", + $lucyObj->per_GC($name), "\n", + $lucyObj->sequence($name), "\n"; + } + + + Print seq and qual values for sequences >400 bp in order to assemble + them with CAP3 (or other assembler). + + foreach $name (@$names) { + next unless $lucyObj->length_clear($name) > 400; + print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; + print QUAL ">$name\n", $lucyObj->quality($name), "\n"; + } + + Get all the sequences as Bio::PrimarySeq objects (eg., for use with + Bio::Tools::Blast to perform BLAST). + + @seqObjs = $lucyObj->get_Seq_Objs(); + + Or use only those sequences that are full length and have a Poly-A + tail. + + foreach $name (@$names) { + next unless ($lucyObj->full_length($name) and $lucy->polyA($name)); + push @seqObjs, $lucyObj->get_Seq_Obj($name); + } + + + Get the names of those sequences that were rejected by Lucy. + + $rejects_ref = $lucyObj->get_rejects(); + + Print the names of the rejects and 1 letter code for reason they + were rejected. + + foreach $key (sort keys %$rejects_ref) { + print "$key: ", $rejects_ref->{$key}; + } + + There is a lot of other information available about the sequences + analyzed by Lucy (see APPENDIX). This module can be used with the + DBI module to store this sequence information in a database. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Andrew G. Walsh paeruginosa@hotmail.com + +=head1 APPENDIX + +Methods available to Lucy objects are described below. Please note +that any method beginning with an underscore is considered internal +and should not be called directly. + +=cut + + +package Bio::Tools::Lucy; + +use vars qw($VERSION $AUTOLOAD @ISA @ATTR %OK_FIELD); +use strict; +use Bio::PrimarySeq; +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); +@ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr); +foreach my $attr (@ATTR) { + $OK_FIELD{$attr}++ +} +$VERSION = "0.01"; + +sub AUTOLOAD { + my $self = shift; + my $attr = $AUTOLOAD; + $attr =~ s/.*:://; + $attr = lc $attr; + $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; + $self->{$attr} = shift if @_; + return $self->{$attr}; +} + +=head2 new + + Title : new + Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R', + fwd_desig => '_F') + Function: creates a Lucy object from Lucy analysis files + Returns : reference to Bio::Tools::Lucy object + Args : seqfile Fasta sequence file generated by Lucy + qualfile Quality values file generated by Lucy + infofile Info file created when Lucy is run with -debug 'infofile' option + stderrfile Standard error captured from Lucy when Lucy is run + with -info option and STDERR is directed to stderrfile + (ie. lucy ... 2> stderrfile). + Info in this file will include sequences dropped for low + quality. If you've modified Lucy source (see adv_stderr below), + it will also include info on which sequences were dropped because + they were vector, too short, had no insert, and whether a poly-A + tail was found (if Lucy was run with -cdna option). + lucy_verbose verbosity level (0-1). + fwd_desig The string used to determine whether sequence is a forward read. + The parser will assume that this match will occus at the + end of the sequence name string. + rev_desig As above, for reverse reads. + adv_stderr Can be set to a true value (1). Will only work if you have modified + the Lucy source code as outlined in DESCRIPTION and capture + the standard error from Lucy. + +If you don't provide filenames for qualfile, infofile or stderrfile, +the module will assume that .qual, .info, and .stderr are the file +extensions and search in the same directory as the .seq file for these +files. + +For example, if you create a Lucy object with $lucyObj = +Bio::Tools::Lucy-Enew(seqfile =Elucy.seq), the module will +find lucy.qual, lucy.info and lucy.stderr. + +You can omit any or all of the quality, info or stderr files, but you +will not be able to use all of the object methods (see method +documentation below). + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($attr, $value); + while (@args) { + $attr = shift @args; + $attr = lc $attr; + $value = shift @args; + $self->{$attr} = $value; + } + &_parse($self); + return $self; +} + +=head2 _parse + + Title : _parse + Usage : n/a (internal function) + Function: called by new() to parse Lucy output files + Returns : nothing + Args : none + +=cut + +sub _parse { + my $self = shift; + $self->{seqfile} =~ /^(\S+)\.\S+$/; + my $file = $1; + + print "Opening $self->{seqfile} for parsing...\n" if $self->{lucy_verbose}; + open SEQ, "$self->{seqfile}" or $self->throw("Could not open sequence file: $self->{seqfile}"); + my ($name, $line); + my $seq = ""; + my @lines = ; + while ($line = pop @lines) { + chomp $line; + if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) { + $name = $1; + if ($self->{fwd_desig}) { + $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/; + } + if ($self->{rev_desig}) { + $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/; + } + $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4 + $self->{sequences}{$name}{max_clone_len} = $3; + $self->{sequences}{$name}{med_clone_len} = $4; + $self->{sequences}{$name}{beg_clear} = $5; + $self->{sequences}{$name}{end_clear} = $6; + $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong. + my $beg = $5-1; # substr function begins with index 0 + $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg); + my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//; + my $countGC = $seq =~ tr/[GC]//; + $self->{sequences}{$name}{per_GC} = $countGC/$count * 100; + $seq = ""; + } + else { + $seq = $line.$seq; + } + } + + +# now parse quality values (check for presence of quality file first) + if ($self->{qualfile}) { + open QUAL, "$self->{qualfile}" or $self->throw("Could not open quality file: $self->{qualfile}"); + @lines = ; + } + elsif (-e "$file.qual") { + print "You did not set qualfile, but I'm opening $file.qual\n" if $self->{lucy_verbose}; + $self->qualfile("$file.qual"); + open QUAL, "$file.qual" or $self->throw("Could not open quality file: $file.qual"); + @lines = ; + } + else { + print "I did not find a quality file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; + @lines = (); + } + + my (@vals, @slice, $num, $tot, $vals); + my $qual = ""; + while ($line = pop @lines) { + chomp $line; + if ($line =~ /^>(\S+)/) { + $name = $1; + @vals = split /\s/ , $qual; + @slice = @vals[$self->{sequences}{$name}{beg_clear} .. $self->{sequences}{$name}{end_clear}]; + $vals = join "\t", @slice; + $self->{sequences}{$name}{quality} = $vals; + $qual = ""; + foreach $num (@slice) { + $tot += $num; + } + $num = @slice; + $self->{sequences}{$name}{avg_quality} = $tot/$num; + $tot = 0; + } + else { + $qual = $line.$qual; + } + } + +# determine whether reads are full length + + if ($self->{infofile}) { + open INFO, "$self->{infofile}" or $self->throw("Could not open info file: $self->{infofile}"); + @lines = ; + } + elsif (-e "$file.info") { + print "You did not set infofile, but I'm opening $file.info\n" if $self->{lucy_verbose}; + $self->infofile("$file.info"); + open INFO, "$file.info" or $self->throw("Could not open info file: $file.info"); + @lines = ; + } + else { + print "I did not find an info file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; + @lines = (); + } + + foreach (@lines) { + /^(\S+).+CLV\s+(\d+)\s+(\d+)$/; + if ($2>0 && $3>0) { + $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too + } + } + + +# parse rejects (and presence of poly-A if Lucy has been modified) + + if ($self->{stderrfile}) { + open STDERR_LUCY, "$self->{stderrfile}" or $self->throw("Could not open quality file: $self->{stderrfile}"); + @lines = ; + + } + elsif (-e "$file.stderr") { + print "You did not set stderrfile, but I'm opening $file.stderr\n" if $self->{lucy_verbose}; + $self->stderrfile("$file.stderr"); + open STDERR_LUCY, "$file.stderr" or $self->throw("Could not open quality file: $file.stderr"); + @lines = ; + } + else { + print "I did not find a standard error file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; + @lines = (); + } + + if ($self->{adv_stderr}) { + foreach (@lines) { + $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/; + $self->{reject}{$1} = "V" if /Vector: (\S+)/; + $self->{reject}{$1} = "E" if /Empty: (\S+)/; + $self->{reject}{$1} = "S" if /Short: (\S+)/; + $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/; + if (/Dropped PolyA: (\S+)/) { + $self->{reject}{$1} = "P"; + delete $self->{sequences}{$1}; + } + } + } + else { + foreach (@lines) { + $self->{reject}{$1} = "R" if /dropping\s+(\S+)/; + } + } + +} + +=head2 get_Seq_Objs + + Title : get_Seq_Objs + Usage : $lucyObj->get_Seq_Objs() + Function: returns an array of references to Bio::PrimarySeq objects + where -id = 'sequence name' and -seq = 'sequence' + + Returns : array of Bio::PrimarySeq objects + Args : none + +=cut + +sub get_Seq_Objs { + my $self = shift; + my($seqobj, @seqobjs); + foreach my $key (sort keys %{$self->{sequences}}) { + $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", + -id => "$key"); + push @seqobjs, $seqobj; + } + return \@seqobjs; +} + +=head2 get_Seq_Obj + + Title : get_Seq_Obj + Usage : $lucyObj->get_Seq_Obj($seqname) + Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name' + and -seq = 'sequence' + Returns : reference to Bio::PrimarySeq object + Args : name of a sequence + +=cut + +sub get_Seq_Obj { + my ($self, $key) = @_; + my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", + -id => "$key"); + return $seqobj; +} + +=head2 get_sequence_names + + Title : get_sequence_names + Usage : $lucyObj->get_sequence_names + Function: returns reference to an array of names of the sequences analyzed by Lucy. + These names are required for most of the accessor methods. + Note: The Lucy binary will fail unless sequence names are unique. + Returns : array reference + Args : none + +=cut + +sub get_sequence_names { + my $self = shift; + my @keys = sort keys %{$self->{sequences}}; + return \@keys; +} + +=head2 sequence + + Title : sequence + Usage : $lucyObj->sequence($seqname) + Function: returns the DNA sequence of one of the sequences analyzed by Lucy. + Returns : string + Args : name of a sequence + +=cut + +sub sequence { + my ($self, $key) = @_; + return $self->{sequences}{$key}{sequence}; +} + +=head2 quality + + Title : quality + Usage : $lucyObj->quality($seqname) + Function: returns the quality values of one of the sequences analyzed by Lucy. + This method depends on the user having provided a quality file. + Returns : string + Args : name of a sequence + +=cut + +sub quality { + my($self, $key) = @_; + return $self->{sequences}{$key}{quality}; +} + +=head2 avg_quality + + Title : avg_quality + Usage : $lucyObj->avg_quality($seqname) + Function: returns the average quality value for one of the sequences analyzed by Lucy. + Returns : float + Args : name of a sequence + +=cut + +sub avg_quality { + my($self, $key) = @_; + return $self->{sequences}{$key}{avg_quality}; +} + +=head2 direction + + Title : direction + Usage : $lucyObj->direction($seqname) + Function: returns the direction for one of the sequences analyzed by Lucy + providing that 'fwd_desig' or 'rev_desig' were set when the + Lucy object was created. + Strings returned are: 'F' for forward, 'R' for reverse. + Returns : string + Args : name of a sequence + +=cut + +sub direction { + my($self, $key) = @_; + return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction}; + return ""; +} + +=head2 length_raw + + Title : length_raw + Usage : $lucyObj->length_raw($seqname) + Function: returns the length of a DNA sequence prior to quality/ vector + trimming by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub length_raw { + my($self, $key) = @_; + return $self->{sequences}{$key}{length_raw}; +} + +=head2 length_clear + + Title : length_clear + Usage : $lucyObj->length_clear($seqname) + Function: returns the length of a DNA sequence following quality/ vector + trimming by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub length_clear { + my($self, $key) = @_; + return $self->{sequences}{$key}{length_clear}; +} + +=head2 start_clear + + Title : start_clear + Usage : $lucyObj->start_clear($seqname) + Function: returns the beginning position of good quality, vector free DNA sequence + determined by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub start_clear { + my($self, $key) = @_; + return $self->{sequences}{$key}{beg_clear}; +} + + +=head2 end_clear + + Title : end_clear + Usage : $lucyObj->end_clear($seqname) + Function: returns the ending position of good quality, vector free DNA sequence + determined by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub end_clear { + my($self, $key) = @_; + return $self->{sequences}{$key}{end_clear}; +} + +=head2 per_GC + + Title : per_GC + Usage : $lucyObj->per_GC($seqname) + Function: returns the percente of the good quality, vector free DNA sequence + determined by Lucy. + Returns : float + Args : name of a sequence + +=cut + +sub per_GC { + my($self, $key) = @_; + return $self->{sequences}{$key}{per_GC}; +} + +=head2 full_length + + Title : full_length + Usage : $lucyObj->full_length($seqname) + Function: returns the truth value for whether or not the sequence read was + full length (ie. vector present on both ends of read). This method + depends on the user having provided the 'info' file (Lucy must be + run with the -debug 'info_filename' option to get this file). + Returns : boolean + Args : name of a sequence + +=cut + +sub full_length { + my($self, $key) = @_; + return 1 if $self->{sequences}{$key}{full_length}; + return 0; +} + +=head2 polyA + + Title : polyA + Usage : $lucyObj->polyA($seqname) + Function: returns the truth value for whether or not a poly-A tail was detected + and clipped by Lucy. This method depends on the user having modified + the source for Lucy as outlined in DESCRIPTION and invoking Lucy with + the -cdna option and saving the standard error. + Note, the final sequence will not show the poly-A/T region. + Returns : boolean + Args : name of a sequence + +=cut + +sub polyA { + my($self, $key) = @_; + return 1 if $self->{sequences}{$key}{polyA}; + return 0; +} + +=head2 get_rejects + + Title : get_rejects + Usage : $lucyObj->get_rejects() + Function: returns a hash containing names of rejects and a 1 letter code for the + reason Lucy rejected the sequence. + Q- rejected because of low quality values + S- sequence was short + V- sequence was vector + E- sequence was empty + P- poly-A/T trimming caused sequence to be too short + In order to get the rejects, you must provide a file with the standard + error from Lucy. You will only get the quality category rejects unless + you have modified the source and recompiled Lucy as outlined in DESCRIPTION. + Returns : hash reference + Args : none + +=cut + +sub get_rejects { + my $self = shift; + return $self->{reject}; +} + +=head2 Diff for Lucy source code + + 352a353,354 + > /* AGW added next line */ + > fprintf(stderr, "Empty: %s\n", seqs[i].name); + 639a642,643 + > /* AGW added next line */ + > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name); + 678c682,686 + < if (left) seqs[i].left+=left; + --- + > if (left) { + > seqs[i].left+=left; + > /* AGW added next line */ + > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name); + > } + 681c689,693 + < if (right) seqs[i].right-=right; + --- + > if (right) { + > seqs[i].right-=right; + > /* AGW added next line */ + > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name); + > } + 682a695,696 + > /* AGW added next line */ + > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name); + 734a749,750 + > /* AGW added next line */ + > fprintf(stderr, "Vector: %s\n", seqs[i].name); + +=cut + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/MZEF.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/MZEF.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,344 @@ +# $Id: MZEF.pm,v 1.9 2002/10/22 07:38:46 lapp Exp $ +# +# BioPerl module for Bio::Tools::MZEF +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::MZEF - Results of one MZEF run + +=head1 SYNOPSIS + + $mzef = Bio::Tools::MZEF->new(-file => 'result.mzef'); + # filehandle: + $mzef = Bio::Tools::MZEF->new( -fh => \*INPUT ); + # to indicate that the sequence was reversed prior to feeding it to MZEF + # and that you want to have this reflected in the strand() attribute of + # the exons, as well have the coordinates translated to the non-reversed + # sequence + $mzef = Bio::Tools::MZEF->new( -file => 'result.mzef', + -strand => -1 ); + + # parse the results + # note: this class is-a Bio::Tools::AnalysisResult which implements + # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same + while($gene = $mzef->next_prediction()) { + # $gene is an instance of Bio::Tools::Prediction::Gene + + # $gene->exons() returns an array of + # Bio::Tools::Prediction::Exon objects + # all exons: + @exon_arr = $gene->exons(); + + # internal exons only + @intrl_exons = $gene->exons('Internal'); + # note that presently MZEF predicts only internal exons! + } + + # essential if you gave a filename at initialization (otherwise the file + # will stay open) + $mzef->close(); + +=head1 DESCRIPTION + +The MZEF module provides a parser for MZEF gene structure prediction +output. + +This module inherits off L and therefore +implements L. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::MZEF; +use vars qw(@ISA); +use strict; + +use Bio::Tools::AnalysisResult; +use Bio::Tools::Prediction::Gene; +use Bio::Tools::Prediction::Exon; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my($self,@args) = @_; + + # first call the inherited method! + my $make = $self->SUPER::_initialize_state(@args); + + # handle our own parameters + my ($strand, $params) = + $self->_rearrange([qw(STRAND + )], + @args); + + # our private state variables + $strand = 1 unless defined($strand); + $self->{'_strand'} = $strand; + $self->{'_preds_parsed'} = 0; + $self->{'_has_cds'} = 0; + # array of pre-parsed predictions + $self->{'_preds'} = []; +} + +=head2 analysis_method + + Usage : $mzef->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /mzef/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /mzef/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_feature + + Title : next_feature + Usage : while($gene = $mzef->next_feature()) { + # do something + } + Function: Returns the next gene structure prediction of the MZEF result + file. Call this method repeatedly until FALSE is returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_prediction() at present. + + Note that with the present version of MZEF there will only be one + object returned, because MZEF does not predict individual genes + but just potential internal exons. + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_prediction doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_prediction(@args); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $mzef->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the MZEF result + file. Call this method repeatedly until FALSE is returned. + + Note that with the present version of MZEF there will only be one + object returned, because MZEF does not predict individual genes + but just potential internal exons. + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + my $gene; + + # if the prediction section hasn't been parsed yet, we do this now + $self->_parse_predictions() unless $self->_predictions_parsed(); + + # return the next gene structure (transcript) + return $self->_prediction(); +} + +=head2 _parse_predictions + + Title : _parse_predictions() + Usage : $obj->_parse_predictions() + Function: Parses the prediction section. Automatically called by + next_prediction() if not yet done. + Example : + Returns : + +=cut + +sub _parse_predictions { + my ($self) = @_; + my ($method); # set but not used presently + my $exon_tag = "InternalExon"; + my $gene; + # my $seqname; # name given in output is poorly formatted + my $seqlen; + my $prednr = 1; + + while(defined($_ = $self->_readline())) { + if(/^\s*(\d+)\s*-\s*(\d+)\s+/) { + # exon or signal + if(! defined($gene)) { + $gene = Bio::Tools::Prediction::Gene->new( + '-primary' => "GenePrediction$prednr", + '-source' => 'MZEF'); + } + # we handle start-end first because may not be space delimited + # for large numbers + my ($start,$end) = ($1,$2); + s/^\s*(\d+)\s*-\s*(\d+)\s+//; + # split the rest into fields + chomp(); + # format: Coordinates P Fr1 Fr2 Fr3 Orf 3ss Cds 5ss + # index: 0 1 2 3 4 5 6 7 + my @flds = split(' ', $_); + # create the feature object depending on the type of signal -- + # which is always an (internal) exon for MZEF + my $predobj = Bio::Tools::Prediction::Exon->new(); + # set common fields + $predobj->source_tag('MZEF'); + $predobj->significance($flds[0]); + $predobj->score($flds[0]); # what shall we set as overall score? + $predobj->strand($self->{'_strand'}); # MZEF searches only one + if($predobj->strand() == 1) { + $predobj->start($start); + $predobj->end($end); + } else { + $predobj->start($seqlen-$end+1); + $predobj->end($seqlen-$start+1); + } + # set scores + $predobj->start_signal_score($flds[5]); + $predobj->end_signal_score($flds[7]); + $predobj->coding_signal_score($flds[6]); + # frame -- we simply extract the one with highest score from the + # orf field, and store the individual scores for now + my $frm = index($flds[4], "1"); + $predobj->frame(($frm < 0) ? undef : $frm); + $predobj->primary_tag($exon_tag); + $predobj->is_coding(1); + # add to gene structure (should be done only when start and end + # are set, in order to allow for proper expansion of the range) + $gene->add_exon($predobj); + next; + } + if(/^\s*Internal .*(MZEF)/) { + $self->analysis_method($1); + next; + } + if(/^\s*File_Name:\s+(\S+)\s+Sequence_length:\s+(\d+)/) { + # $seqname = $1; # this is too poor currently (file name truncated + # to 10 chars) in order to be sensible enough + $seqlen = $2; + next; + } + } + # $gene->seq_id($seqname); + $self->_add_prediction($gene) if defined($gene); + $self->_predictions_parsed(1); +} + +=head2 _prediction + + Title : _prediction() + Usage : $gene = $obj->_prediction() + Function: internal + Example : + Returns : + +=cut + +sub _prediction { + my ($self) = @_; + + return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); + return shift(@{$self->{'_preds'}}); +} + +=head2 _add_prediction + + Title : _add_prediction() + Usage : $obj->_add_prediction($gene) + Function: internal + Example : + Returns : + +=cut + +sub _add_prediction { + my ($self, $gene) = @_; + + if(! exists($self->{'_preds'})) { + $self->{'_preds'} = []; + } + push(@{$self->{'_preds'}}, $gene); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/OddCodes.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/OddCodes.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,443 @@ +#$Id: OddCodes.pm,v 1.10.2.1 2003/04/07 04:27:42 heikki Exp $ +#----------------------------------------------------------------------------- +# PACKAGE : OddCodes.pm +# PURPOSE : To write amino acid sequences in alternative alphabets +# AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl) +# SOURCE : +# CREATED : 8th July 2000 +# MODIFIED : +# DISCLAIMER : I am employed in the pharmaceutical industry but my +# : employers do not endorse or sponsor this module +# : in any way whatsoever. The above email address is +# : given purely for the purpose of easy communication +# : with the author, and does not imply any connection +# : between my employers and anything written below. +# LICENCE : You may distribute this module under the same terms +# : as the rest of BioPerl. +#---------------------------------------------------------------------------- + +=head1 NAME + +Bio::Tools::OddCodes - Object holding alternative alphabet coding for +one protein sequence + +=head1 SYNOPSIS + +Take a sequence object from eg, an inputstream, and creates an object +for the purposes of rewriting that sequence in another alphabet. +These are abbreviated amino acid sequence alphabets, designed to +simplify the statistical aspects of analysing protein sequences, by +reducing the combinatorial explosion of the 20-letter alphabet. These +abbreviated alphabets range in size from 2 to 8. + +Creating the OddCodes object, eg: + + my $inputstream = Bio::SeqIO->new( '-file' => "seqfile", + '-format' => 'Fasta'); + my $seqobj = $inputstream->next_seq(); + my $oddcode_obj = Bio::Tools::Oddcodes->new(-seq => $seqobj); + +or: + + my $seqobj = Bio::PrimarySeq->new + (-seq=>'[cut and paste a sequence here]', + -alphabet => 'protein', + -id => 'test'); + my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj); + +do the alternative coding, returning the answer as a reference to a string + + my $output = $oddcode_obj->structural(); + my $output = $oddcode_obj->chemical(); + my $output = $oddcode_obj->functional(); + my $output = $oddcode_obj->charge(); + my $output = $oddcode_obj->hydrophobic(); + my $output = $oddcode_obj->Dayhoff(); + my $output = $oddcode_obj->Sneath(); + my $output = $oddcode_obj->Stanfel(); + + +display sequence in new form, eg: + + my $new_coding = $$output; + print "\n$new_coding"; + +=head1 DESCRIPTION + +Bio::Tools::Oddcodes is a welterweight object for rewriting a protein +sequence in an alternative alphabet. 8 of these are provided, ranging +from the the 2-letter hydrophobic alphabet, to the 8-letter chemical +alphabet. These are useful for the statistical analysis of protein +sequences since they can partially avoid the combinatorial explosion +produced by the full 20-letter alphabet (eg. 400 dimers, 8000 trimers +etc.) + +The objects will print out a warning if the input sequence is not a +protein. If you know what you are doing, you can silence the warning +by setting verbose() to a negetive value. + +See Synopsis above for object creation code. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://www.bugzilla.bioperl.org/ + +=head1 AUTHOR + +Derek Gatherer + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +#' + +package Bio::Tools::OddCodes; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + + +sub new +{ + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($seqobj) = $self->_rearrange([qw(SEQ)],@args); + if((! defined($seqobj)) && @args && ref($args[0])) { + # parameter not passed as named parameter? + $seqobj = $args[0]; + } + unless ($seqobj->isa("Bio::PrimarySeqI")) + { + die("die in _init, OddCodes works only on PrimarySeqI +objects\n"); + } + + $self->{'_seqref'} = $seqobj; + + return $self; +} + +=head2 structural + + Title : structural + Usage : $output = $oddcode_obj->structural(); + Function: turns amino acid sequence into 3-letter structural alphabet + : A (ambivalent), E (external), I (internal) + Example : a sequence ACDEFGH will become AAEEIAE + Returns : Reference to the new sequence string + Args : none + +=cut + +sub structural() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); # see _pullseq() below + +# now the real business + + $seqstring =~ tr/[ACGPSTWY]/1/; + $seqstring =~ tr/[RNDQEHK]/2/; + $seqstring =~ tr/[ILMFV]/3/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/E/; + $seqstring =~ tr/3/I/; + + return \$seqstring; + +# and that's that one +} + +=head2 functional + + Title : functional + Usage : $output = $oddcode_obj->functional(); + Function: turns amino acid sequence into 4-letter functional alphabet + : A (acidic), C (basic), H (hydrophobic), P (polar) + Example : a sequence ACDEFGH will become HPAAHHC + Returns : Reference to the new sequence string + Args : none + +=cut + +sub functional() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[DE]/1/; + $seqstring =~ tr/[HKR]/2/; + $seqstring =~ tr/[AFILMPVW]/3/; + $seqstring =~ tr/[CGNQSTY]/4/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/C/; + $seqstring =~ tr/3/H/; + $seqstring =~ tr/4/P/; + + return \$seqstring; + +# and that's that one +} + +=head2 hydrophobic + + Title : hydrophobic + Usage : $output = $oddcode_obj->hydrophobic(); + Function: turns amino acid sequence into 2-letter hydrophobicity alphabet + : O (hydrophobic), I (hydrophilic) + Example : a sequence ACDEFGH will become OIIIOII + Returns : Reference to the new sequence string + Args : none + +=cut + +sub hydrophobic() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[AFILMPVW]/1/; + $seqstring =~ tr/[CDEGHKNQRSTY]/2/; + $seqstring =~ tr/1/I/; + $seqstring =~ tr/2/O/; + + return \$seqstring; + +# and that's that one +} + +=head2 Dayhoff + + Title : Dayhoff + Usage : $output = $oddcode_obj->Dayhoff(); + Function: turns amino acid sequence into 6-letter Dayhoff alphabet + Example : a sequence ACDEFGH will become CADDGCE + Returns : Reference to the new sequence string + Args : none + +=cut + +sub Dayhoff() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[C]/1/; + $seqstring =~ tr/[AGPST]/2/; + $seqstring =~ tr/[DENQ]/3/; + $seqstring =~ tr/[HKR]/4/; + $seqstring =~ tr/[ILMV]/5/; + $seqstring =~ tr/[FWY]/6/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/C/; + $seqstring =~ tr/3/D/; + $seqstring =~ tr/4/E/; + $seqstring =~ tr/5/F/; + $seqstring =~ tr/6/G/; + + return \$seqstring; + +# and that's that one +} + +=head2 Sneath + + Title : Sneath + Usage : $output = $oddcode_obj->Sneath(); + Function: turns amino acid sequence into 7-letter Sneath alphabet + Example : a sequence ACDEFGH will become CEFFHCF + Returns : Reference to the new sequence string + Args : none + +=cut + +sub Sneath() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[ILV]/1/; + $seqstring =~ tr/[AGP]/2/; + $seqstring =~ tr/[MNQ]/3/; + $seqstring =~ tr/[CST]/4/; + $seqstring =~ tr/[DE]/5/; + $seqstring =~ tr/[KR]/6/; + $seqstring =~ tr/[FHWY]/7/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/C/; + $seqstring =~ tr/3/D/; + $seqstring =~ tr/4/E/; + $seqstring =~ tr/5/F/; + $seqstring =~ tr/6/G/; + $seqstring =~ tr/7/H/; + + return \$seqstring; + +# and that's that one +} + +=head2 Stanfel + + Title : Stanfel + Usage : $output = $oddcode_obj->Stanfel(); + Function: turns amino acid sequence into 4-letter Stanfel alphabet + Example : a sequence ACDEFGH will become AACCDAE + Returns : Reference to the new sequence string + Args : none + +=cut + +sub Stanfel() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[ACGILMPSTV]/1/; + $seqstring =~ tr/[DENQ]/2/; + $seqstring =~ tr/[FWY]/3/; + $seqstring =~ tr/[HKR]/4/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/C/; + $seqstring =~ tr/3/D/; + $seqstring =~ tr/4/E/; + + return \$seqstring; + +# and that's that one +} + +=head2 chemical() + + Title : chemical + Usage : $output = $oddcode_obj->chemical(); + Function: turns amino acid sequence into 8-letter chemical alphabet + : A (acidic), L (aliphatic), M (amide), R (aromatic) + : C (basic), H (hydroxyl), I (imino), S (sulphur) + Example : a sequence ACDEFGH will become LSAARAC + Returns : Reference to the new sequence string + Args : none + +=cut + +sub chemical() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[DE]/1/; + $seqstring =~ tr/[AGILV]/2/; + $seqstring =~ tr/[NQ]/3/; + $seqstring =~ tr/[FWY]/4/; + $seqstring =~ tr/[RHK]/5/; + $seqstring =~ tr/[ST]/6/; + $seqstring =~ tr/P/7/; + $seqstring =~ tr/[CM]/8/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/L/; + $seqstring =~ tr/3/M/; + $seqstring =~ tr/4/R/; + $seqstring =~ tr/5/C/; + $seqstring =~ tr/6/H/; + $seqstring =~ tr/7/I/; + $seqstring =~ tr/8/S/; + + return \$seqstring; + +# and that's that one +} + +=head2 charge + + Title : charge + Usage : $output = $oddcode_obj->charge(); + Function: turns amino acid sequence into 3-letter charge alphabet + Example : a sequence ACDEFGH will become NNAANNC + Returns : Reference to the new sequence string + Args : none + +=cut + +sub charge() +{ + my $self = $_[0]; + my $seqstring = &_pullseq($self); + +# now the real business + + $seqstring =~ tr/[DE]/1/; + $seqstring =~ tr/[HKR]/2/; + $seqstring =~ tr/[ACFGILMNPQSTVWY]/3/; + $seqstring =~ tr/1/A/; + $seqstring =~ tr/2/C/; + $seqstring =~ tr/3/N/; + + return \$seqstring; + +# and that's that one +} + +# _pullseq is called within each of the subroutines +# it just checks a few things and returns the sequence + +sub _pullseq +{ + my $self = $_[0]; + + my $seqobj = $self->{'_seqref'}; + + unless ($seqobj->isa("Bio::PrimarySeqI")) + { + die("die, OddCodes works only on PrimarySeqI objects\n"); + } + $self->warn("\tAll OddCode alphabets need a protein sequence,\n". + "\tbut BioPerl thinks this is not: [". $seqobj->id. "]") + unless $seqobj->alphabet eq 'protein' or $self->verbose < 0;; + + my $seqstring = uc $seqobj->seq(); + + if(length($seqstring)<1) + { + die("$seqstring: die, sequence has zero length\n"); + } + return $seqstring; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Phylo/Molphy.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Phylo/Molphy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,244 @@ +# $Id: Molphy.pm,v 1.3 2002/10/22 07:45:23 lapp Exp $ +# +# BioPerl module for Bio::Tools::Phylo::Molphy +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Phylo::Molphy - DESCRIPTION of Object + +=head1 SYNOPSIS + + use Bio::Tools::Phylo::Molphy; + my $parser = new Bio::Tools::Phylo::Molphy(-file => 'output.protml'); + while( my $result = $parser->next_result ) { + + } + +=head1 DESCRIPTION + +A parser for Molphy output (protml,dnaml) + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Phylo::Molphy; +use vars qw(@ISA); +use strict; + +use Bio::Tools::Phylo::Molphy::Result; +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::TreeIO; +use IO::String; + +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Phylo::Molphy(); + Function: Builds a new Bio::Tools::Phylo::Molphy object + Returns : Bio::Tools::Phylo::Molphy + Args : -fh/-file => $val, # for initing input, see Bio::Root::IO + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_result + + Title : next_result + Usage : my $r = $molphy->next_result + Function: Get the next result set from parser data + Returns : Bio::Tools::Phylo::Molphy::Result object + Args : none + + +=cut + +sub next_result{ + my ($self) = @_; + + # A little statemachine for the parser here + my ($state,$transition_ct, + @transition_matrix, %transition_mat, @resloc,) = ( 0,0); + my ( %subst_matrix, @treelines, @treedata, %frequencies); + my ( $treenum,$possible_trees, $model); + while( defined ( $_ = $self->_readline()) ) { + if( /^Relative Substitution Rate Matrix/ ) { + if( %subst_matrix ) { + $self->_pushback($_); + last; + } + $state = 0; + my ( @tempdata); + @resloc = (); + while( defined ($_ = $self->_readline) ) { + last if (/^\s+$/); + # remove leading/trailing spaces + s/^\s+//; + s/\s+$//; + my @data = split; + my $i = 0; + for my $l ( @data ) { + if( $l =~ /\D+/ ) { + push @resloc, $l; + } + $i++; + } + push @tempdata, \@data; + } + my $i = 0; + for my $row ( @tempdata ) { + my $j = 0; + for my $col ( @$row ) { + if( $i == $j ) { + # empty string for diagonals + $subst_matrix{$resloc[$i]}->{$resloc[$j]} = ''; + } else { + $subst_matrix{$resloc[$i]}->{$resloc[$j]} = $col; + } + $j++; + } + $i++; + } + } elsif( /^Transition Probability Matrix/ ) { + if( /1\.0e7/ ) { + $state = 1; + $transition_ct = 0; + } else { + $state = 0; + } + } elsif ( /Acid Frequencies/ ) { + $state = 0; + $self->_readline(); # skip the next line + while( defined( $_ = $self->_readline) ) { + unless( /^\s+/) { + $self->_pushback($_); + last; + } + s/^\s+//; + s/\s+$//; + my ($index,$res,$model,$data) = split; + $frequencies{$res} = [ $model,$data]; + } + } elsif( /^(\d+)\s*\/\s*(\d+)\s+(.+)\s+model/ ) { + my @save = ($1,$2,$3); + # finish processing the transition_matrix + my $i =0; + foreach my $row ( @transition_matrix ) { + my $j = 0; + foreach my $col ( @$row ) { + $transition_mat{$resloc[$i]}->{$resloc[$j]} = $col; + $j++; + } + $i++; + } + + if( defined $treenum ) { + $self->_pushback($_); + last; + } + + $state = 2; + ($treenum,$possible_trees, $model) = @save; + $model =~ s/\s+/ /g; + } elsif( $state == 1 ) { + next if( /^\s+$/ ); + s/^\s+//; + s/\s+$//; + # because the matrix is split up into 2-10 column sets + push @{$transition_matrix[$transition_ct++]}, split ; + $transition_ct = 0 if $transition_ct % 20 == 0; + } elsif( $state == 2 ) { + if( s/^(\d+)\s+(\-?\d+(\.\d+)?)\s+// ) { + push @treedata, [ $1,$2]; + } + # save this for the end so that we can + # be efficient and only open one tree parser + push @treelines, $_; + } + } + # waiting till the end to do this, is it better + my @trees; + if( @treelines ) { + my $strdat = IO::String->new(join('',@treelines)); + my $treeio = new Bio::TreeIO(-fh => $strdat, + -format => 'newick'); + while( my $tree = $treeio->next_tree ) { + if( @treedata ) { + my $dat = shift @treedata; + # set the associated information + $tree->id($dat->[0]); + $tree->score($dat->[1]); + } + push @trees, $tree; + } + } + + my $result = new Bio::Tools::Phylo::Molphy::Result + (-trees => \@trees, + -substitution_matrix => \%subst_matrix, + -transition_matrix => \%transition_mat, + -frequencies => \%frequencies, + -model => $model, + -search_space => $possible_trees, + ); + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Phylo/Molphy/Result.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Phylo/Molphy/Result.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +# $Id: Result.pm,v 1.2 2002/10/22 07:45:24 lapp Exp $ +# +# BioPerl module for Bio::Tools::Phylo::Molphy::Result +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Phylo::Molphy::Result - DESCRIPTION of Object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Phylo::Molphy::Result; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; + + +@ISA = qw(Bio::Root::Root ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Phylo::Molphy::Result(); + Function: Builds a new Bio::Tools::Phylo::Molphy::Result object + Returns : Bio::Tools::Phylo::Molphy::Result + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($trees, + $smat,$tmat,$freq, + $model, $sspace, + ) = $self->_rearrange([qw(TREES SUBSTITUTION_MATRIX + TRANSITION_MATRIX FREQUENCIES + MODEL SEARCH_SPACE)], @args); + + if( $trees ) { + if(ref($trees) !~ /ARRAY/i ) { + $self->warn("Must have provided a valid array reference to initialize trees"); + } else { + foreach my $t ( @$trees ) { + $self->add_tree($t); + } + } + } + # initialize things through object methods to be a good + # little OO programmer + if( ref($smat) =~ /HASH/i ) { + $self->substitution_matrix($smat); + } + if( ref($tmat) =~ /HASH/i ) { + $self->transition_probability_matrix($tmat); + } + if( ref($freq) =~ /HASH/i ) { + $self->residue_frequencies($freq); + } + + $model && $self->model($model); + $sspace && $self->search_space($sspace); + $self->{'_treeiterator'} = 0; + + return $self; +} + +=head2 model + + Title : model + Usage : $obj->model($newval) + Function: + Returns : value of model + Args : newvalue (optional) + + +=cut + +sub model{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'model'} = $value; + } + return $self->{'model'}; + +} + +=head2 substitution_matrix + + Title : substitution_matrix + Usage : my $smat = $result->subsitution_matrix; + Function: Get the relative substitution matrix calculated in the ML procedure + Returns : reference to hash of hashes where key is the aa/nt name and value + is another hash ref which contains keys for all the aa/nt + possibilities + Args : none + + +=cut + +sub substitution_matrix{ + my ($self,$val) = @_; + if(defined $val ) { + if( ref($val) =~ /HASH/ ) { + foreach my $v (values %{$val} ) { + if( ref($v) !~ /HASH/i ) { + $self->warn("Must be a valid hashref of hashrefs for substition_matrix"); + return undef; + } + } + $self->{'_substitution_matrix'} = $val; + } else { + $self->warn("Must be a valid hashref of hashrefs for substition_matrix"); + return undef; + } + } + return $self->{'_substitution_matrix'}; +} + +=head2 transition_probability_matrix + + Title : transition_probability_matrix + Usage : my $matrixref = $molphy->transition_probablity_matrix(); + Function: Gets the observed transition probability matrix + Returns : hash of hashes of aa/nt transition to each other aa/nt + Args : none + + +=cut + +sub transition_probability_matrix{ + my ($self,$val) = @_; + if(defined $val ) { + if( ref($val) =~ /HASH/ ) { + foreach my $v (values %{$val} ) { + if( ref($v) !~ /HASH/i ) { + $self->warn("Must be a valid hashref of hashrefs for transition_probability_matrix"); + return undef; + } + } + $self->{'_TPM'} = $val; + } else { + $self->warn("Must be a valid hashref of hashrefs for transition_probablity_matrix"); + return undef; + } + } + + # fix this for nucml where there are 2 values (one is just a transformation + # of the either, but how to represent?) + return $self->{'_TPM'}; +} + +=head2 residue_frequencies + + Title : residue_frequencies + Usage : my %data = $molphy->residue_frequencies() + Function: Get the modeled and expected frequencies for + each of the residues in the sequence + Returns : hash of either aa (protml) or nt (nucml) frequencies + each key will point to an array reference where + 1st slot is model's expected frequency + 2nd slot is observed frequency in the data + $hash{'A'}->[0] = + Args : none + + +=cut + +#' + +sub residue_frequencies{ + my ($self,$val) = @_; + if(defined $val ) { + if( ref($val) =~ /HASH/ ) { + $self->{'_residue_frequencies'} = $val; + } else { + $self->warn("Must be a valid hashref of hashrefs for residue_frequencies"); + } + } + return %{$self->{'_residue_frequencies'}}; +} + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $factory->next_tree; + Function: Get the next tree from the factory + Returns : L + Args : none + +=cut + +sub next_tree{ + my ($self,@args) = @_; + return $self->{'_trees'}->[$self->{'_treeiterator'}++] || undef; +} + +=head2 rewind_tree + + Title : rewind_tree_iterator + Usage : $result->rewind_tree() + Function: Rewinds the tree iterator so that next_tree can be + called again from the beginning + Returns : none + Args : none + +=cut + +sub rewind_tree_iterator { + shift->{'_treeiterator'} = 0; +} + +=head2 add_tree + + Title : add_tree + Usage : $result->add_tree($tree); + Function: Adds a tree + Returns : integer which is the number of trees stored + Args : L + +=cut + +sub add_tree{ + my ($self,$tree) = @_; + if( $tree && ref($tree) && $tree->isa('Bio::Tree::TreeI') ) { + push @{$self->{'_trees'}},$tree; + } + return scalar @{$self->{'_trees'}}; +} + +=head2 search_space + + Title : search_space + Usage : $obj->search_space($newval) + Function: + Returns : value of search_space + Args : newvalue (optional) + + +=cut + +sub search_space{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'search_space'} = $value; + } + return $self->{'search_space'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Phylo/PAML.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Phylo/PAML.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,647 @@ +# PAML.pm,v 1.3 2002/06/20 18:50:37 amackey Exp +# +# BioPerl module for Bio::Tools::Phylo::PAML +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich, Aaron J Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml, +baseml, basemlg, codemlsites and yn00 + +=head1 SYNOPSIS + + #!/usr/bin/perl -Tw + use strict; + + use Bio::Tools::Phylo::PAML; + + # need to specify the output file name (or a fh) (defaults to + # -file => "codeml.mlc"); also, optionally, the directory in which + # the other result files (rst, 2ML.dS, etc) may be found (defaults + # to "./") + my $parser = new Bio::Tools::Phylo::PAML + (-file => "./results/mlc", -dir => "./results/"); + + # get the first/next result; a Bio::Tools::Phylo::PAML::Result object, + # which isa Bio::SeqAnalysisResultI object. + my $result = $parser->next_result(); + + # get the sequences used in the analysis; returns Bio::PrimarySeq + # objects (OTU = Operational Taxonomic Unit). + my @otus = $result->get_seqs(); + + # codon summary: codon usage of each sequence [ arrayref of { + # hashref of counts for each codon } for each sequence and the + # overall sum ], and positional nucleotide distribution [ arrayref + # of { hashref of frequencies for each nucleotide } for each + # sequence and overall frequencies ]: + my ($codonusage, $ntdist) = $result->get_codon_summary(); + + # example manipulations of $codonusage and $ntdist: + printf "There were %d '%s' codons in the first seq (%s)\n", + $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id(); + printf "There were %d '%s' codons used in all the sequences\n", + $codonusage->[$#{$codonusage}]->{AAA}, 'AAA'; + printf "Nucleotide '%c' was present %g of the time in seq %s\n", + 'A', $ntdist->[1]->{A}, $otus[1]->id(); + + # get Nei & Gojobori dN/dS matrix: + my $NGmatrix = $result->get_NGmatrix(); + + # get ML-estimated dN/dS matrix, if calculated; this corresponds to + # the runmode = -2, pairwise comparison usage of codeml + my $MLmatrix = $result->get_MLmatrix(); + + # These matrices are length(@otu) x length(@otu) "strict lower + # triangle" 2D-matrices, which means that the diagonal and + # everything above it is undefined. Each of the defined cells is a + # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t", + # "S" and "N". If a ML matrix, "lnL" will also be defined. + printf "The omega ratio for sequences %s vs %s was: %g\n", + $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega}; + + # with a little work, these matrices could also be passed to + # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building + # method that accepts a matrix of "distances" (using the LOWTRI + # option): + my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ]; + + # for runmode's other than -2, get tree topology with estimated + # branch lengths; returns a Bio::Tree::TreeI-based tree object with + # added PAML parameters at each node + my $tree = $result->get_tree(); + for my $node ($tree->get_nodes()) { + # inspect the tree: the "t" (time) parameter is available via + # $node->branch_length(); all other branch-specific parameters + # ("omega", "dN", etc.) are available via $node->param('omega'); + } + + # get any general model parameters: kappa (the + # transition/transversion ratio), NSsites model parameters ("p0", + # "p1", "w0", "w1", etc.), etc. + my $params = $result->get_model_params(); + printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1}; + + # for NSsites models, obtain arrayrefs of posterior probabilities + # for membership in each class for every position; probabilities + # correspond to classes w0, w1, ... etc. + my @probs = $result->get_posteriors(); + + # find, say, positively selected sites! + if ($params->{w2} > 1) { + for (my $i = 0; $i < @probs ; $i++) { + if ($probs[$i]->[2] > 0.5) { + # assumes model M1: three w's, w0, w1 and w2 (positive selection) + printf "position %d: (%g prob, %g omega, %g mean w)\n", + $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3]; + } + } + } else { print "No positive selection found!\n"; } + +=head1 DESCRIPTION + +This module is used to parse the output from the PAML programs codeml, +baseml, basemlg, codemlsites and yn00. You can use the +Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the +PAML programs, but this module is only useful to parse the output. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich, Aaron Mackey + +Email jason@bioperl.org +Email amackey@virginia.edu + +=head1 TODO + +check output from pre 1.12 + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Phylo::PAML; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::AnalysisParserI; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI); + +# other objects used: +use IO::String; +use Bio::TreeIO; +use Bio::Tools::Phylo::PAML::Result; +use Bio::PrimarySeq; + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Phylo::PAML(%args); + Function: Builds a new Bio::Tools::Phylo::PAML object + Returns : Bio::Tools::Phylo::PAML + Args : Hash of options: -file, -fh, -dir + -file (or -fh) should contain the contents of the PAML + outfile; -dir is the (optional) name of the directory in + which the PAML program was run (and includes other + PAML-generated files from which we can try to gather data) + +=cut + +sub new { + + my ($class, @args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + my ($dir) = $self->_rearrange([qw(DIR)], @args); + $self->{_dir} = $dir if defined $dir; + + return $self; +} + +=head2 Implement Bio::AnalysisParserI interface + +=cut + +=head2 next_result + + Title : next_result + Usage : $result = $obj->next_result(); + Function: Returns the next result available from the input, or + undef if there are no more results. + Example : + Returns : a Bio::Tools::Phylo::PAML::Result object + Args : none + +=cut + +sub next_result { + + my ($self) = @_; + + my %data; + # get the various codon and other sequence summary data, if necessary: + $self->_parse_summary + unless ($self->{'_summary'} && !$self->{'_summary'}->{'multidata'}); + + # OK, depending on seqtype and runmode now, one of a few things can happen: + my $seqtype = $self->{'_summary'}->{'seqtype'}; + if ($seqtype eq 'CODONML' || $seqtype eq 'AAML') { + while ($_ = $self->_readline) { + if ($seqtype eq 'CODONML' && + m/^pairwise comparison, codon frequencies:/o) { + + # runmode = -2, CODONML + $self->_pushback($_); + %data = $self->_parse_PairwiseCodon; + last; + + } elsif ($seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/o) { + + # runmode = -2, AAML + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "Pairwise AA not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_PairwiseAA; + # last; + } elsif (m/^Model \d+: /o) { + + # NSSitesBatch + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "NSsitesBatch not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_NSsitesBatch; + # last; + + } elsif (m/^TREE/) { + + # runmode = 0 + $self->_pushback($_); + %data = $self->_parse_Forestry; + last; + + } elsif (m/Heuristic tree search by stepwise addition$/o) { + + # runmode = 3 + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "StepwiseAddition not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_StepwiseAddition; + # last; + + } elsif (m/Heuristic tree search by NNI perturbation$/o) { + + # runmode = 4 + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "NNI Perturbation not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_Perturbation; + # last; + + } elsif (m/^stage 0:/o) { + + # runmode = (1 or 2) + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "StarDecomposition not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_StarDecomposition; + # last; + + } + } + } elsif ($seqtype eq 'BASEML') { + } elsif ($seqtype eq 'YN00') { + while ($_ = $self->_readline) { + if( m/^Estimation by the method/ ) { + $self->_pushback($_); + %data = $self->_parse_YN_Pairwise; + last; + } + } + } + if (%data) { + $data{'-version'} = $self->{'_summary'}->{'version'}; + $data{'-seqs'} = $self->{'_summary'}->{'seqs'}; + $data{'-patterns'} = $self->{'_summary'}->{'patterns'}; + $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'}; + $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'}; + $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'}; + return new Bio::Tools::Phylo::PAML::Result %data; + } else { + return undef; + } +} + + +sub _parse_summary { + + my ($self) = @_; + + # Depending on whether verbose > 0 or not, and whether the result + # set comes from a multi-data run, the first few lines could be + # various things; we're going to throw away any sequence data + # here, since we'll get it later anyways + + # multidata ? : \n\nData set 1\n + # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n + # [ sequence printout ] + # \nAfter deleting gaps. \d sites\n" + # : [ sequence printout ] + # CODONML (in paml 3.12 February 2002) <<-- what we want to see! + + my $SEQTYPES = qr( (?: (?: CODON | AA | BASE | CODON2AA ) ML ) | YN00 )x; + while ($_ = $self->_readline) { + if ( m/^($SEQTYPES) \s+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc + (?: \(in \s+ ([^\)]+?) \s* \) \s* )? # version: "paml 3.12 February 2002"; not present < 3.1 or YN00 + (\S+) \s* # tree filename + (?: (.+?) )? # model description (not there in YN00) + \s* $ # trim any trailing space + /ox + ) { + + @{$self->{_summary}}{qw(seqtype version treefile model)} = ($1, + $2, + $3, + $4); + last; + + } elsif (m/^Data set \d$/o) { + $self->{'_summary'} = {}; + $self->{'_summary'}->{'multidata'}++; + } + } + + unless (defined $self->{'_summary'}->{'seqtype'}) { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'Unknown format of PAML output'); + } + + + my $seqtype = $self->{'_summary'}->{'seqtype'}; + $self->debug( "seqtype is $seqtype\n"); + if ($seqtype eq "CODONML") { + + $self->_parse_inputparams(); # settings from the .ctl file that get printed + $self->_parse_patterns(); # codon patterns - not very interesting + $self->_parse_seqs(); # the sequences data used for analysis + $self->_parse_codoncts(); # counts and distributions of codon/nt usage + $self->_parse_codon_freqs(); # codon frequencies + $self->_parse_distmat(); # NG distance matrices + + + } elsif ($seqtype eq "AAML") { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'AAML parsing not yet implemented!'); + } elsif ($seqtype eq "CODON2AAML") { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'CODON2AAML parsing not yet implemented!'); + } elsif ($seqtype eq "BASEML") { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'BASEML parsing not yet implemented!'); + } elsif ($seqtype eq "YN00") { + $self->_parse_codon_freqs(); + $self->_parse_codoncts(); + $self->_parse_distmat(); # NG distance matrices + + } else { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'Unknown seqtype, not yet implemented!', + -value => $seqtype + ); + } + +} + + +sub _parse_inputparams { + my ($self) = @_; + +} + +sub _parse_codon_freqs { + my ($self) = @_; + my ($okay,$done) = (0,0); + while( defined($_ = $self->_readline ) ) { + if( /^Nei/ ) { $self->_pushback($_); last } + last if( $done); + next if ( /^\s+/); + next unless($okay || /^Codon position x base \(3x4\) table\, overall/ ); + $okay = 1; + if( s/^position\s+(\d+):\s+// ) { + my $pos = $1; + s/\s+$//; + my @bases = split; + foreach my $str ( @bases ) { + my ( $base,$freq) = split(/:/,$str,2); + $self->{'_summary'}->{'codonposition'}->[$pos-1]->{$base} = $freq; + } + $done = 1 if $pos == 3; + } + } + $done = 0; + while( defined( $_ = $self->_readline) ) { + if( /^Nei\s\&\sGojobori/ ) { $self->_pushback($_); last } + last if ( $done ); + if( /^Codon frequencies under model, for use in evolver:/ ){ + while( defined( $_ = $self->_readline) ) { + last if( /^\s+$/ ); + s/^\s+//; + s/\s+$//; + push @{$self->{'_summary'}->{'codonfreqs'}},[split]; + } + $done = 1; + } + } +} + +sub _parse_patterns { + my ($self) = @_; + my ($patternct,@patterns,$ns,$ls); + while( defined($_ = $self->_readline) ) { + if( $patternct ) { +# last unless ( @patterns == $patternct ); + last if( /^\s+$/ ); + s/^\s+//; + push @patterns, split; + } elsif( /^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/ ) { + ($ns,$ls) = ($1,$2); + } elsif( /^\# site patterns \=\s*(\d+)/ ) { + $patternct = $1; + } else { +# $self->debug("Unknown line: $_"); + } + } + $self->{'_summary'}->{'patterns'} = { -patterns => \@patterns, + -ns => $ns, + -ls => $ls}; +} + +sub _parse_seqs { + + # this should in fact be packed into a Bio::SimpleAlign object instead of + # an array but we'll stay with this for now + my ($self) = @_; + my (@firstseq,@seqs); + while( defined ($_ = $self->_readline) ) { + last if( /^\s+$/ && @seqs > 0 ); + next if ( /^\s+$/ ); + next if( /^\d+\s+$/ ); + + my ($name,$seqstr) = split(/\s+/,$_,2); + $seqstr =~ s/\s+//g; # remove whitespace + unless( @firstseq) { + @firstseq = split(//,$seqstr); + push @seqs, new Bio::PrimarySeq(-id => $name, + -seq => $seqstr); + } else { + + my $i = 0; + my $v; + while(($v = index($seqstr,'.',$i)) >= $i ) { + # replace the '.' with the correct seq from the + substr($seqstr,$v,1,$firstseq[$v]); + $i = $v; + } + $self->debug( "adding seq $seqstr\n"); + push @seqs, new Bio::PrimarySeq(-id => $name, + -seq => $seqstr); + } + } + $self->{'_summary'}->{'seqs'} = \@seqs; +} + +sub _parse_codoncts { } + +sub _parse_distmat { + my ($self) = @_; + my @results; + while( defined ($_ = $self->_readline) ) { + next if/^\s+$/; + last; + } + return unless (/^Nei\s*\&\s*Gojobori/); + # skip the next 3 lines + if( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) { + $self->_readline; + $self->_readline; + $self->_readline; + } + my $seqct = 0; + while( defined ($_ = $self->_readline ) ) { + last if( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} ); + next if( /^\s+$/ ); + chomp; + my ($seq,$rest) = split(/\s+/,$_,2); + my $j = 0; + while( $rest =~ + /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g ) { + $self->{'_summary'}->{'ngmatrix'}->[$j++]->[$seqct] = + { 'omega' => $1, + 'dN' => $3, + 'dS' => $5 }; + } + $seqct++; + } +} + +sub _parse_PairwiseCodon { + my ($self) = @_; + my @result; + my ($a,$b,$log,$model); + while( defined( $_ = $self->_readline) ) { + if( /^pairwise comparison, codon frequencies\:\s*(\S+)\./) { + $model = $1; + } elsif( /^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/ ) { + ($a,$b) = ($1,$3); + } elsif( /^lnL\s+\=\s*(\-?\d+(\.\d+)?)/ ) { + $log = $1; + } elsif( m/^t\=\s*(\d+(\.\d+)?)\s+ + S\=\s*(\d+(\.\d+)?)\s+ + N\=\s*(\d+(\.\d+)?)\s+ + dN\/dS\=\s*(\d+(\.\d+)?)\s+ + dN\=\s*(\d+(\.\d+)?)\s+ + dS\=\s*(\d+(\.\d+)?)/ox ) { + $result[$b-1]->[$a-1] = { + 'lnL' => $log, + 't' => $1, + 'S' => $3, + 'N' => $5, + 'omega' => $7, + 'dN' => $9, + 'dS' => $11 }; + } elsif( /^\s+$/ ) { + next; + } elsif( /^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/ ) { + } else { + $self->debug( "unknown line: $_"); + } + } + return ( -mlmatrix => \@result); +} + +sub _parse_YN_Pairwise { + my ($self) = @_; + my @result; + while( defined( $_ = $self->_readline) ) { + last if( /^seq\.\s+seq\./); + } + while( defined( $_ = $self->_readline) ) { + if( m/^\s+(\d+)\s+ # seq # + (\d+)\s+ # seq # + (\d+(\.\d+))\s+ # S + (\d+(\.\d+))\s+ # N + (\d+(\.\d+))\s+ # t + (\d+(\.\d+))\s+ # kappa + (\d+(\.\d+))\s+ # omega + (\d+(\.\d+))\s+ # dN + \+\-\s+ + (\d+(\.\d+))\s+ # dN SE + (\d+(\.\d+))\s+ # dS + \+\-\s+ + (\d+(\.\d+))\s+ # dS SE + /ox + ) { + + $result[$2-1]->[$1-1] = { + 'S' => $3, + 'N' => $5, + 't' => $7, + 'kappa' => $9, + 'omega' => $11, + 'dN' => $13, + 'dN_SE' => $15, + 'dS' => $17, + 'dS_SE' => $19, + }; + } elsif( /^\s+$/ ) { + next; + } + } + return ( -mlmatrix => \@result); +} + +sub _parse_Forestry { + + my ($self) = @_; + my %data = (-trees => []); + + + return %data +}; + +# parse the mlc file + +sub _parse_mlc { + my ($self) = @_; + my %data; + while( defined( $_ = $self->_readline) ) { + $self->debug( "mlc parse: $_"); + # Aaron this is where the parsing should begin + + # I'll do the Tree objects if you like - I'd do it by building + # an IO::String for the the tree data or does it make more + # sense to parse this out of a collection of files? + if( /^TREE/ ) { + # ... + while( defined($_ = $self->_readline) ) { + if( /^\(/) { + my $treestr = new IO::String($_); + my $treeio = new Bio::TreeIO(-fh => $treestr, + -format => 'newick'); + # this is very tenative here!! + push @{$self->{'_trees'}}, $treeio->next_tree; + } + } + } + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Phylo/PAML/Result.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Phylo/PAML/Result.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,396 @@ +# Result.pm,v 1.3 2002/06/20 18:50:39 amackey Exp +# +# BioPerl module for Bio::Tools::Phylo::PAML::Result +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich, Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Phylo::PAML::Result - A PAML result set object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich, Aaron Mackey + +Email jason@bioperl.org +Email amackey@virginia.edu + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Phylo::PAML::Result; +use vars qw(@ISA); +use strict; + + +use Bio::Root::Root; +use Bio::AnalysisResultI; +@ISA = qw(Bio::Root::Root Bio::AnalysisResultI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Phylo::PAML::Result(%data); + Function: Builds a new Bio::Tools::Phylo::PAML::Result object + Returns : Bio::Tools::Phylo::PAML::Result + Args : -trees => array reference of L objects + -MLmatrix => ML matrix + .... MORE ARGUMENTS LISTED HERE BY AARON AND JASON + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($trees,$mlmat,$seqs,$ngmatrix, + $codonpos,$codonfreq,$version) = $self->_rearrange([qw(TREES MLMATRIX + SEQS NGMATRIX + CODONPOS CODONFREQ + VERSION)], @args); + $self->reset_seqs; + if( $trees ) { + if(ref($trees) !~ /ARRAY/i ) { + $self->warn("Must have provided a valid array reference to initialize trees"); + } else { + foreach my $t ( @$trees ) { + $self->add_tree($t); + } + } + } + $self->{'_treeiterator'} = 0; + + if( $mlmat ) { + if( ref($mlmat) !~ /ARRAY/i ) { + $self->warn("Must have provided a valid array reference to initialize MLmatrix"); + } else { + $self->set_MLmatrix($mlmat); + } + } + if( $seqs ) { + if( ref($seqs) !~ /ARRAY/i ) { + $self->warn("Must have provided a valid array reference to initialize seqs"); + } else { + foreach my $s ( @$seqs ) { + $self->add_seq($s); + } + } + } + if( $ngmatrix ) { + if( ref($ngmatrix) !~ /ARRAY/i ) { + $self->warn("Must have provided a valid array reference to initialize NGmatrix"); + } else { + $self->set_NGmatrix($ngmatrix); + } + } + + if( $codonfreq ) { + + + } + + if( $codonpos ) { + if( ref($codonpos) !~ /ARRAY/i ) { + $self->warn("Must have provided a valid array reference to initialize codonpos"); + } else { + $self->set_codon_pos_basefreq(@$codonpos); + } + } + + $self->version($version) if defined $version; + + return $self; +} + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $factory->next_tree; + Function: Get the next tree from the factory + Returns : L + Args : none + +=cut + +sub next_tree{ + my ($self,@args) = @_; + return $self->{'_trees'}->[$self->{'_treeiterator'}++] || undef; +} + +=head2 rewind_tree + + Title : rewind_tree_iterator + Usage : $result->rewind_tree() + Function: Rewinds the tree iterator so that next_tree can be + called again from the beginning + Returns : none + Args : none + +=cut + +sub rewind_tree_iterator { + shift->{'_treeiterator'} = 0; +} + +=head2 add_tree + + Title : add_tree + Usage : $result->add_tree($tree); + Function: Adds a tree + Returns : integer which is the number of trees stored + Args : L + +=cut + +sub add_tree{ + my ($self,$tree) = @_; + if( $tree && ref($tree) && $tree->isa('Bio::Tree::TreeI') ) { + push @{$self->{'_trees'}},$tree; + } + return scalar @{$self->{'_trees'}}; +} + + +=head2 set_MLmatrix + + Title : set_MLmatrix + Usage : $result->set_MLmatrix($mat) + Function: Set the ML Matrix + Returns : none + Args : Arrayref to MLmatrix (must be arrayref to 2D matrix whic is + lower triangle pairwise) + + +=cut + +sub set_MLmatrix{ + my ($self,$mat) = @_; + return unless ( defined $mat ); + if( ref($mat) !~ /ARRAY/i ) { + $self->warn("Did not provide a valid 2D Array reference for set_MLmatrix"); + return; + } + $self->{'_mlmatrix'} = $mat; +} + +=head2 get_MLmatrix + + Title : get_MLmatrix + Usage : my $mat = $result->get_MLmatrix() + Function: Get the ML matrix + Returns : 2D Array reference + Args : none + + +=cut + +sub get_MLmatrix{ + my ($self,@args) = @_; + return $self->{'_mlmatrix'}; +} + +=head2 set_NGmatrix + + Title : set_NGmatrix + Usage : $result->set_NGmatrix($mat) + Function: Set the Nei & Gojobori Matrix + Returns : none + Args : Arrayref to NGmatrix (must be arrayref to 2D matrix whic is + lower triangle pairwise) + + +=cut + +sub set_NGmatrix{ + my ($self,$mat) = @_; + return unless ( defined $mat ); + if( ref($mat) !~ /ARRAY/i ) { + $self->warn("Did not provide a valid 2D Array reference for set_NGmatrix"); + return; + } + $self->{'_ngmatrix'} = $mat; +} + +=head2 get_NGmatrix + + Title : get_NGmatrix + Usage : my $mat = $result->get_NGmatrix() + Function: Get the Nei & Gojobori matrix + Returns : 2D Array reference + Args : none + + +=cut + +sub get_NGmatrix{ + my ($self,@args) = @_; + return $self->{'_ngmatrix'}; +} + + +=head2 add_seq + + Title : add_seq + Usage : $obj->add_seq($seq) + Function: Add a Bio::PrimarySeq to the Result + Returns : none + Args : Bio::PrimarySeqI +See also : L + +=cut + +sub add_seq{ + my ($self,$seq) = @_; + if( $seq ) { + unless( $seq->isa("Bio::PrimarySeqI") ) { + $self->warn("Must provide a valid Bio::PrimarySeqI to add_seq"); + return; + } + push @{$self->{'_seqs'}},$seq; + } + +} + +=head2 reset_seqs + + Title : reset_seqs + Usage : $result->reset_seqs + Function: Reset the OTU seqs stored + Returns : none + Args : none + + +=cut + +sub reset_seqs{ + my ($self) = @_; + $self->{'_seqs'} = []; +} + +=head2 get_seqs + + Title : get_seqs + Usage : my @otus = $result->get_seqs + Function: Get the seqs Bio::PrimarySeq (OTU = Operational Taxonomic Unit) + Returns : Array of Bio::PrimarySeq + Args : None +See also : L + +=cut + +sub get_seqs{ + my ($self) = @_; + return @{$self->{'_seqs'}}; +} + +=head2 set_codon_pos_basefreq + + Title : set_codon_pos_basefreq + Usage : $result->set_codon_pos_basefreq(@freqs) + Function: Set the codon position base frequencies + Returns : none + Args : Array of length 3 where each slot has a hashref + keyed on DNA base + + +=cut + +sub set_codon_pos_basefreq { + my ($self,@codonpos) = @_; + if( scalar @codonpos != 3 ) { + $self->warn("invalid array to set_codon_pos_basefreq, must be an array of length 3"); + return; + } + foreach my $pos ( @codonpos ) { + if( ref($pos) !~ /HASH/i || + ! exists $pos->{'A'} ) { + $self->warn("invalid array to set_codon_pos_basefreq, must be an array with hashreferences keyed on DNA bases, C,A,G,T"); + } + } + $self->{'_codonposbasefreq'} = [@codonpos]; +} + +=head2 get_codon_pos_basefreq + + Title : get_codon_pos_basefreq + Usage : my @basepos = $result->get_codon_pos_basefreq; + Function: Get the codon position base frequencies + Returns : Array of length 3 (each codon position), each + slot is a hashref keyed on DNA bases, the values are + the frequency of the base at that position for all sequences + Args : none + Note : The array starts at 0 so position '1' is in position '0' + of the array + +=cut + +sub get_codon_pos_basefreq{ + my ($self) = @_; + return @{$self->{'_codonposbasefreq'}}; +} + +=head2 version + + Title : version + Usage : $obj->version($newval) + Function: Get/Set version + Returns : value of version + Args : newvalue (optional) + + +=cut + +sub version{ + my $self = shift; + $self->{'_version'} = shift if @_; + return $self->{'_version'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Phylo/Phylip/ProtDist.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Phylo/Phylip/ProtDist.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,143 @@ +# BioPerl module for Bio::Tools::Phylo::Phylip::ProtDist +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Phylo::Phylip::ProtDist - DESCRIPTION of Object + +=head1 SYNOPSIS + + use Bio::Tools::Phylo::Phylip::ProtDist; + my $parser = new Bio::Tools::Phylo::Phylip::ProtDist(-file => 'outfile'); + while( my $result = $parser->next_matrix) { + + } + +=head1 DESCRIPTION + +A parser for ProtDist output into a Bio::Matrix::PhylipDist object + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Phylo::Phylip::ProtDist; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Matrix::PhylipDist; + +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Phylo::Phylip::ProtDist(); + Function: Builds a new Bio::Tools::Phylo::Phylip::ProtDist object + Returns : Bio::Tools::ProtDist + Args : -fh/-file => $val, # for initing input, see Bio::Root::IO + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_result + + Title : next_result + Usage : my $matrix = $parser->next_result + Function: Get the next result set from parser data + Returns : L + Args : none + + +=cut + +sub next_matrix{ + my ($self) = @_; + my @names; + my @values; + my $entry; + while ($entry=$self->_readline) { + if($#names >=0 && $entry =~/^\s+\d+$/){ + last; + } + elsif($entry=~/^\s+\d+\n$/){ + next; + } + my ($n,@line) = split( /\s+/,$entry); + push @names, $n; + push @values, [@line]; + } + $#names>=0 || return; + my %dist; + my $i=0; + foreach my $name(@names){ + my $j=0; + foreach my $n(@names) { + $dist{$name}{$n} = [$i,$j]; + $j++; + } + $i++; + } + my $matrix = Bio::Matrix::PhylipDist->new(-matrix=>\%dist, + -names =>\@names, + -values=>\@values); + return $matrix; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Prediction/Exon.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Prediction/Exon.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,224 @@ +# $Id: Exon.pm,v 1.10 2002/10/22 07:38:48 lapp Exp $ +# +# BioPerl module for Bio::Tools::Prediction::Exon +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Prediction::Exon - A predicted exon feature + +=head1 SYNOPSIS + +See documentation of methods. + +=head1 DESCRIPTION + +A feature representing a predicted exon. This class actually inherits +off Bio::SeqFeature::Gene::Exon and therefore has all that +functionality (also implements Bio::SeqFeatureI), plus a few methods +supporting predicted features, like various scores and a +significance. Even though these were inspired by GenScan results, at +least a subset should be generally useable for exon prediction +results. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Prediction::Exon; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::Gene::Exon; + +@ISA = qw(Bio::SeqFeature::Gene::Exon); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($primary) = $self->_rearrange([qw(PRIMARY)],@args); + + return $self; +} + + +=head2 predicted_cds + + Title : predicted_cds + Usage : $predicted_cds_dna = $exon->predicted_cds(); + $exon->predicted_cds($predicted_cds_dna); + Function: Get/Set the CDS (coding sequence) as predicted by a program. + + This method is independent of an attached_seq. There is no + guarantee whatsoever that the returned CDS has anything to do + (e.g., matches) with the sequence covered by the exons as annotated + through this object. + + Example : + Returns : A Bio::PrimarySeqI implementing object holding the DNA sequence + defined as coding by a prediction of a program. + Args : On set, a Bio::PrimarySeqI implementing object holding the DNA + sequence defined as coding by a prediction of a program. + +=cut + +sub predicted_cds { + my ($self, $cds) = @_; + + if(defined($cds)) { + $self->{'_predicted_cds'} = $cds; + } + return $self->{'_predicted_cds'}; +} + +=head2 predicted_protein + + Title : predicted_protein + Usage : $predicted_protein_seq = $exon->predicted_protein(); + $exon->predicted_protein($predicted_protein_seq); + Function: Get/Set the protein translation as predicted by a program. + + This method is independent of an attached_seq. There is no + guarantee whatsoever that the returned translation has anything to + do with the sequence covered by the exons as annotated + through this object, or the sequence returned by predicted_cds(), + although it should usually be just the standard translation. + + Example : + Returns : A Bio::PrimarySeqI implementing object holding the protein + translation as predicted by a program. + Args : On set, a Bio::PrimarySeqI implementing object holding the protein + translation as predicted by a program. + +=cut + +sub predicted_protein { + my ($self, $aa) = @_; + + if(defined($aa)) { + $self->{'_predicted_aa'} = $aa; + } + return $self->{'_predicted_aa'}; +} + +=head2 significance + + Title : significance + Usage : $evalue = $obj->significance(); + $obj->significance($evalue); + Function: + Returns : + Args : + + +=cut + +sub significance { + my ($self, $value) = @_; + + return $self->_tag_value('signif', $value); +} + +=head2 start_signal_score + + Title : start_signal_score + Usage : $sc = $obj->start_signal_score(); + $obj->start_signal_score($evalue); + Function: Get/Set a score for the exon start signal (acceptor splice site + or initiation signal). + Returns : + Args : + + +=cut + +sub start_signal_score { + my ($self, $value) = @_; + + return $self->_tag_value('AccScore', $value); +} + +=head2 end_signal_score + + Title : end_signal_score + Usage : $sc = $obj->end_signal_score(); + $obj->end_signal_score($evalue); + Function: Get/Set a score for the exon end signal (donor splice site + or termination signal). + Returns : + Args : + + +=cut + +sub end_signal_score { + my ($self, $value) = @_; + + return $self->_tag_value('DonScore', $value); +} + +=head2 coding_signal_score + + Title : coding_signal_score + Usage : $sc = $obj->coding_signal_score(); + $obj->coding_signal_score($evalue); + Function: Get/Set a score for the exon coding signal (e.g., coding potential). + Returns : + Args : + + +=cut + +sub coding_signal_score { + my ($self, $value) = @_; + + return $self->_tag_value('CodScore', $value); +} + +# +# Everything else is just inherited from SeqFeature::Generic. +# + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Prediction/Gene.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Prediction/Gene.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,157 @@ +# $Id: Gene.pm,v 1.8 2002/10/22 07:38:48 lapp Exp $ +# +# BioPerl module for Bio::Tools::Prediction::Gene +# +# Cared for by Hilmar Lapp +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Prediction::Gene - a predicted gene structure feature + +=head1 SYNOPSIS + +See documentation of methods. + +=head1 DESCRIPTION + +A feature representing a predicted gene structure. This class actually +inherits off Bio::SeqFeature::Gene::Transcript and therefore has all that +functionality, plus a few methods supporting predicted sequence features, +like a predicted CDS and a predicted translation. + +Exons held by an instance of this class will usually be instances of +Bio::Tools::Prediction::Exon, although they do not have to be. Refer to the +documentation of the class that produced the instance. + +Normally, you will not want to create an instance of this class yourself. +Instead, classes representing the results of gene structure prediction +programs will do that. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net or hilmar.lapp@pharma.novartis.com + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Prediction::Gene; +use vars qw(@ISA); +use strict; + +use Bio::SeqFeature::Gene::Transcript; + + +@ISA = qw(Bio::SeqFeature::Gene::Transcript); + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($primary) = $self->_rearrange([qw(PRIMARY)],@args); + + $primary = 'predicted_gene' unless $primary; + $self->primary_tag($primary); + return $self; +} + + +=head2 predicted_cds + + Title : predicted_cds + Usage : $predicted_cds_dna = $gene->predicted_cds(); + $gene->predicted_cds($predicted_cds_dna); + Function: Get/Set the CDS (coding sequence) as predicted by a program. + + This method is independent of an attached_seq. There is no + guarantee whatsoever that the returned CDS has anything to do + (e.g., matches) with the sequence covered by the exons as annotated + through this object. + + Example : + Returns : A Bio::PrimarySeqI implementing object holding the DNA sequence + defined as coding by a prediction of a program. + Args : On set, a Bio::PrimarySeqI implementing object holding the DNA + sequence defined as coding by a prediction of a program. + +=cut + +sub predicted_cds { + my ($self, $cds) = @_; + + if(defined($cds)) { + $self->{'_predicted_cds'} = $cds; + } + return $self->{'_predicted_cds'}; +} + +=head2 predicted_protein + + Title : predicted_protein + Usage : $predicted_protein_seq = $gene->predicted_protein(); + $gene->predicted_protein($predicted_protein_seq); + Function: Get/Set the protein translation as predicted by a program. + + This method is independent of an attached_seq. There is no + guarantee whatsoever that the returned translation has anything to + do with the sequence covered by the exons as annotated + through this object, or the sequence returned by predicted_cds(), + although it should usually be just the standard translation. + + Example : + Returns : A Bio::PrimarySeqI implementing object holding the protein + translation as predicted by a program. + Args : On set, a Bio::PrimarySeqI implementing object holding the protein + translation as predicted by a program. + +=cut + +sub predicted_protein { + my ($self, $aa) = @_; + + if(defined($aa)) { + $self->{'_predicted_aa'} = $aa; + } + return $self->{'_predicted_aa'}; +} + +# +# Everything else is just inherited from SeqFeature::GeneStructure. +# + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Primer3.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Primer3.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,566 @@ +# +# Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved. +# This module is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Primer3 - Create input for and work with the output from the +program primer3 + +=head1 SYNOPSIS + +Chad will put synopses here by the end of the second week of october, 2002. + +=head1 DESCRIPTION + +Bio::Tools::Primer3 creates the input files needed to design primers using +primer3 and provides mechanisms to access data in the primer3 output files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 AUTHOR - Chad Matsalla + +bioinformatics1@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::Primer3; + +use vars qw(@ISA); +use strict; +use Bio::Seq; +use Bio::SeqFeature::Primer; +use Bio::Seq::PrimedSeq; +use Bio::Seq::SeqFactory; + +use Bio::Root::Root; +use Bio::Root::IO; + +use Dumpvalue; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + + # Chad likes to use this to debug large hashes. +my $dumper = new Dumpvalue; + + # this was a bunch of the seqio things, now deprecated. delete it soon. + # sub _initialize { + # my($self,@args) = @_; + # $self->SUPER::_initialize(@args); + # if( ! defined $self->sequence_factory ) { + # $self->sequence_factory(new Bio::Seq::SeqFactory + # (-verbose => $self->verbose(), + # -type => 'Bio::Seq')); + # } + # } + + +=head2 new() + + Title : new() + Usage : + Function: + Returns : + Args : + Notes : + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my($filename) = $self->_rearrange([qw(FILE)],@args); + if (!$filename) { + print("Ahh grasshopper, you are planning to create a primer3 infile\n"); + return $self; + } + $self->{filename} = $filename; + # check to see that the file exists + # I think that catfile should be used here. + if (!-f $filename) { + print("That file doesn't exist. Bah.\n"); + } + $self->_initialize_io( -file => $filename ); + return $self; +} + + + + +=head2 null + + Title : + Usage : + Function: + Returns : + Args : + Notes : + +=cut + + + + + + + +=head2 next_primer() + + Title : next_primer() + Usage : $primer3 = $stream->next_primer() + Function: returns the next primer in the stream + Returns : Bio::Seq::PrimedSeq containing: + - 2 Bio::SeqFeature::Primer representing the primers + - 1 Bio::Seq representing the target sequence + - 1 Bio::Seq representing the amplified region + Args : NONE + Notes : + +=cut + +sub next_primer { + my $self = shift; + my $fh = $self->_fh(); + my ($line,%primer); + # first, read in the next set of primers + while ($line = $self->_readline()) { + chomp ($line); + last if ($line =~ /^=/); + $line =~ m/(^.*)\=(.*$)/; + $primer{$1} = $2; + } + # then, get the primers as SeqFeature::Primer objects + + my ($left,$right) = &_create_primer_features(\%primer); + # then, create the sequence to place them on + my $sequence = Bio::Seq->new(-seq => $primer{SEQUENCE}, + -id => $primer{PRIMER_SEQUENCE_ID}); + # print("Sequence is ".$primer{SEQUENCE}." and id is ".$primer{PRIMER_SEQUENCE_ID}."\n"); + my $primedseq = new Bio::Seq::PrimedSeq( + -target_sequence => $sequence, + -left_primer => $left, + -right_primer => $right, + -primer_sequence_id => $primer{PRIMER_SEQUENCE_ID}, + -primer_comment => $primer{PRIMER_COMMENT}, + -target => $primer{TARGET}, + -primer_product_size_range => $primer{PRIMER_PRODUCT_SIZE_RANGE}, + -primer_file_flag => $primer{PRIMER_FILE_FLAG}, + -primer_liberal_base => $primer{PRIMER_LIBERAL_BASE}, + -primer_num_return => $primer{PRIMER_NUM_RETURN}, + -primer_first_base_index => $primer{PRIMER_FIRST_BASE_INDEX}, + -primer_explain_flag => $primer{PRIMER_EXPLAIN_FLAG}, + -primer_pair_compl_any => $primer{PRIMER_PAIR_COMPL_ANY}, + -primer_pair_compl_end => $primer{PRIMER_PAIR_COMPL_END}, + -primer_product_size => $primer{PRIMER_PRODUCT_SIZE} + ); + return $primedseq; +} + + +=head2 _create_primer_features() + + Title : _create_primer_features() + Usage : &_create_primer_features() + Function: This is an internal method used by next_seq() to create the + Bio::SeqFeature::Primer objects necessary to represent the primers + themselves. + Returns : An array of 2 Bio::SeqFeature::Primer objects. + Args : None. + Notes : This is an internal method. Do not call this method. + +=cut + + +sub _create_primer_features { + my $rdat = shift; + my (%left,%right,$updir,$downdir,$var,$trunc); + my @variables = qw( + PRIMER_DIRECTION + PRIMER_DIRECTION_END_STABILITY + PRIMER_DIRECTION_EXPLAIN + PRIMER_DIRECTION_GC_PERCENT + PRIMER_DIRECTION_PENALTY + PRIMER_DIRECTION_SELF_ANY + PRIMER_DIRECTION_SELF_END + PRIMER_DIRECTION_SEQUENCE + PRIMER_DIRECTION_TM + PRIMER_FIRST_BASE_INDEX + ); + # create the hash to pass into the creation routine + # I do it this way because the primer3 outfile variables are exactly the same for each of + # left and right. I create two hashes- one for the left and one for the right primer. + foreach $updir (qw(LEFT RIGHT)) { + my %dat; + foreach (@variables) { + ($var = $_) =~ s/DIRECTION/$updir/e; + # should you truncate the name of each variable? + # for example, should the value be: PRIMER_RIGHT_PENALTY or PENALTY? + # i think it should be the second one + if (/^PRIMER_DIRECTION$/) { + $trunc = "PRIMER"; + } + elsif (/^PRIMER_FIRST_BASE_INDEX/) { + $trunc = "FIRST_BASE_INDEX"; + } + else { + ($trunc = $_) =~ s/PRIMER_DIRECTION_//; + } + $dat{"-$trunc"} = $rdat->{$var}; + } + if ($updir eq "LEFT") { + %left = %dat; + $left{-id} = $rdat->{PRIMER_SEQUENCE_ID}."-left"; + } + else { + %right = %dat; + $right{-id} = $rdat->{PRIMER_SEQUENCE_ID}."-right"; + } + } + my $primer_left = new Bio::SeqFeature::Primer(%left); + my $primer_right = new Bio::SeqFeature::Primer(%right); + return($primer_left,$primer_right); +} + + + + + + + + + +=head2 get_amplified_region() + + Title : get_amplified_region() + Usage : $primer->get_amplified_region() + Function: Returns a Bio::Seq object representing the sequence amplified + Returns : (I think) A Bio::Seq object + Args : None. + Notes : This is not implemented at this time. + Note to chad: implement this simple getter. + Developer notes: There obviously isn't a way for a single primer to know about + its amplified region unless it is paired with another primer. At this time + these object will generally be created with another so I will put in this + method. If there is no sequence null is returned. + + THIS DOES NOT BELONG HERE. Put this into something else. + + +=cut + +sub get_amplified_region { + my ($self) = @_; +} # end get_amplified_region + +=head2 get_amplification_error() + + Title : get_amplification_error() + Usage : + Function: + Returns : + Args : + Notes : +Developer Notes: + THIS DOES NOT BELONG HERE. Put this into something else. + +=cut + +sub get_amplification_error { + my $primer = $_[1]; + my $error = $Primer3::primers{$primer}{PRIMER_ERROR}; + if ($error) { return $error; } + else { return "Some error that primer3 didn't define.\n"; } +} + +=head2 _set_target() + + Title : _set_target() + Usage : &_set_target($self); + Function: + Returns : + Args : + Notes : +Developer Notes: Really I have no idea why I put this in here. + It can is referenced by new_deprecated and by run_primer3 + + +=cut + +sub _set_target { + my $self = shift; + my ($sequence,$primer,$primer_left,$primer_right,$position_left,$position_right,$boggle); + $boggle = 1; + foreach $primer (sort keys %{$self->{primers}}) { + $sequence = $self->{primers}{$primer}{SEQUENCE}; + $primer_left = $self->{primers}{$primer}{PRIMER_LEFT}; + $primer_right = $self->{primers}{$primer}{PRIMER_RIGHT}; + if (!$primer_left) { + $self->{primers}{$primer}{design_failed} = "1"; + } + else { + $primer_left =~ m/(.*)\,(.*)/; + $position_left = $1+$2-1; + $primer_right =~ m/(.*)\,(.*)/; + $position_right = $1-$2; + $self->{primers}{$primer}{left} = $position_left; + $self->{primers}{$primer}{right} = $position_right; + $self->{primers}{$primer}{amplified} = substr($sequence,$position_left,$position_right-$position_left); + } + } +} + +=head2 _read_file($self,$filename) + + Title : _read_file($self,$filename) + Usage : + Function: + Returns : A scalar containing the contents of $filename + Args : $self and the name of a file to parse. + Notes : +Developer notes: Honestly, I have no idea what this is for. + + +=cut + +sub _read_file { + # my ($self,$filename) = @_; + # set this to keep track of things.... + # $self->{outfilename} = $filename; + # to make this better for bioperl, chad should really be using catfile and things. + # + # my $fh = new FileHandle; + # open($fh,$filename) or die "I can't open the primer report ($filename) : $!\n"; + # # _parse_report(); + # # my %Primer3::primers; + # my ($output,$line); + # while ($line=<$fh>) { + # # print("Adding $line\n"); + # $output .= $line; + # } # end while + # # print("\$output is $output\n"); + # return $output; +} + + + + + +=head2 _parse_report() + + Title : _parse_report() + Usage : &_parse_report($self,$filename); + Function: Parse a primer3 outfile and place everything into an object under + {primers} with PRIMER_SEQUENCE_ID being the name of the keys for the + {primers} hash. + Returns : Nothing. + Args : $self and the name of a file to parse. + Notes : + +=cut + +sub _parse_report { + # old + # my ($self,$filename) = @_; + my ($self,$outputs) = @_; + # print("\$self is $self, \$outputs are $outputs\n"); + # print("in _parse_report, \$self is $self\n"); + # set this to keep track of things.... + my ($sequence_name,$line,$counter,$variable_name,$variable_value); + my @output = split/\n/,$outputs; + foreach $line (@output) { + # print("Reading line $line\n"); + next if ($line =~ /^\=/); + if ($line =~ m/^PRIMER_SEQUENCE_ID/) { + $line =~ m/(\S+)=(.*$)/; + $variable_name = $1; + $sequence_name = $2; + $variable_value = $2; + } + else { + $line =~ m/(\S+)=(.*$)/; + $variable_name = $1; + $variable_value = $2; + } + # print("$sequence_name\t$variable_name\t$variable_value\n"); + $self->{primers}{$sequence_name}{$variable_name} = $variable_value; + } # end while <> +} # end parse_report + +=head2 _construct_empty() + + Title : _construct_empty() + Usage : &_construct_empty($self); + Function: Construct an empty object that will be used to construct a primer3 + input "file" so that it can be run. + Returns : + Args : + Notes : + +=cut + +sub _construct_empty { + my $self = shift; + $self->{inputs} = {}; + return; +} + +=head2 add_target(%stuff) + + Title : add_target(%stuff) + Usage : $o_primer->add_target(%stuff); + Function: Add an target to the infile constructor. + Returns : + Args : A hash. Looks something like this: + $o_primer2->add_target( + -PRIMER_SEQUENCE_ID => "sN11902", + -PRIMER_COMMENT => "3831", + -SEQUENCE => "some_sequence", + -TARGET => "513,26", + -PRIMER_PRODUCT_SIZE_RANGE => "100-500", + -PRIMER_FILE_FLAG => "0", + -PRIMER_LIBERAL_BASE => "1", + -PRIMER_NUM_RETURN => "1", + -PRIMER_FIRST_BASE_INDEX => "1", + -PRIMER_EXPLAIN_FLAG => "1"); + The add_target() method does not validate the things you put into + this parameter hash. Read the docs for Primer3 to see which fields + do what and how they should be used. + Notes : To design primers, first create a new CSM::Primer3 object with the + -construct_infile parameter. Then, add targets using this method + (add_target()) with the target hash as above in the Args: section. + Be careful. No validation will be done here. All of those parameters + will be fed straight into primer3. + Once you are done adding targets, invoke the function run_primer3(). + Then retrieve the results using something like a loop around the array + from get_primer_sequence_IDs(); + +=cut + + +sub add_target { + my ($self,%args) = @_; + my ($currkey,$renamed,$sequence_id,$value); + if (!$args{-PRIMER_SEQUENCE_ID}) { + print("You cannot add an element to the primer3 infile without specifying the PRIMER_SEQUENCE_ID. Sorry.\n"); + } + else { + $sequence_id = $args{-PRIMER_SEQUENCE_ID}; + foreach $currkey (keys %args) { + # print("\$currkey is $currkey\n"); + next if ($currkey eq "-PRIMER_SEQUENCE_ID"); + ($renamed = $currkey) =~ s/-//; + # print("Adding $renamed to the hash under $sequence_id\n"); + $value = $args{$currkey}; + # print("\$value is $value\n"); + if ($renamed eq "SEQUENCE") { $value =~ s/\n//g; } + $self->{infile}{$sequence_id}{$renamed} = $value; + } + } +} + +=head2 get_primer_sequence_IDs() + + Title : get_primer_sequence_IDs() + Usage : $o_phred->get_primer_sequence_IDs(); + Function: Return the primer sequence ID's. These normally correspond to + the name of a sequence in a database but can be whatever was used when + the primer3 infile was constructed. + Returns : An array containing the names of the primer sequence ID's + Args : None. + Notes : This would be used as the basis for an iterator to loop around each + primer that was designed. + +=cut + +sub get_primer_sequence_IDs { + my $self = shift; + return sort keys %{$self->{primers}}; +} # end get keys + +=head2 dump_hash() + + Title : dump_hash() + Usage : $o_primer->dump_hash(); + Function: Dump out the CSM::Primer3 object. + Returns : Nothing. + Args : None. + Notes : Used extensively in debugging. + +=cut + +sub dump_hash { + my $self = shift; + my $dumper = new Dumpvalue; + $dumper->dumpValue($self); +} # end dump_hash + +=head2 dump_infile_hash() + + Title : dump_infile_hash() + Usage : $o_primer->dump_infile_hash(); + Function: Dump out the contents of the infile hash. + Returns : Nothing. + Args : None. + Notes : Used for debugging the construction of the infile. + +=cut + +sub dump_infile_hash { + my $self = shift; + my $dumper = new Dumpvalue; + $dumper->dumpValue($self->{infile}); +} + + + +1; +__END__ + +=head2 placeholder + + Title : This is a place holder so chad can cut and paste + Usage : + Function: + Returns : + Args : + Notes : + +=cut + +=head1 SEE ALSO + +perl(1). + +=cut diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Prints.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Prints.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,247 @@ +# $Id: Prints.pm,v 1.7 2002/10/22 07:45:22 lapp Exp $ +# +# BioPerl module for Bio::Tools::Prints +# +# Cared for by Balamurugan Kumarasamy +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code +# + +=head1 NAME + +Bio::Tools::Prints - Parser for FingerPRINTScanII program + +=head1 SYNOPSIS + + use Bio::Tools::Prints; + my $prints_parser = new Bio::Tools::Prints(-fh =>$filehandle ); + while( my $prints_feat = $prints_parser->next_result ) { + push @prints_feat, $prints_feat; + } + +=head1 DESCRIPTION + + PRINTScan II is a PRINTS fingerprint identification algorithm. + Copyright (C) 1998,1999 Phil Scordis + +=head1 FEEDBACK + +=head2 Mailing Lists + + User feedback is an integral part of the evolution of this and other + Bioperl modules. Send your comments and suggestions preferably to + the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + + Report bugs to the Bioperl bug tracking system to help us keep track + of the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Balamurugan Kumarasamy + + Email: fugui@worf.fugu-sg.org + +=head1 APPENDIX + + The rest of the documentation details each of the object methods. + Internal methods are usually preceded with a _ + + +=cut + +package Bio::Tools::Prints; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::FeaturePair; +use Bio::Root::IO; +use Bio::SeqFeature::Generic; +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + + + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Prints(-fh=>$filehandle); + Function: Builds a new Bio::Tools::Prints object + Returns : Bio::Tools::Prints + Args : -filename + -fh (filehandle) + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + + +=head2 next_result + + Title : next_result + Usage : my $feat = $prints_parser->next_result + Function: Get the next result set from parser data + Returns : L + Args : none + +=cut + +sub next_result { + my ($self) = @_; + my %printsac; + my @features; + my $line; + my $sequenceId; + + while ($_=$self->_readline()) { + + + $line = $_; + chomp $line; + + if ($line =~ s/^Sn;//) { # We have identified a Sn; line so there should be the following: + + ($sequenceId) = $line =~ /^\s*(\w+)/; + $self->seqname($sequenceId); + next; + } + + + if ($line =~ s/^1TBH//) { + my ($id) = $line =~ /^\s*(\w+)/; + my ($ac) = $line =~ /(PR\w+)\s*$/; + $printsac{$id} = $ac; + $self->print_sac(\%printsac); + next; + } + + + if ($line =~ s/^3TB//) { + + + + + if ($line =~ s/^[HN]//) { + my($num)=""; + $line =~ s/^\s+//; + + my @elements = split /\s+/, $line; + + my ($fingerprintName,$motifNumber,$temp,$tot,$percentageIdentity,$profileScore,$pvalue,$subsequence,$motifLength,$lowestMotifPosition,$matchPosition,$highestMotifPosition) = @elements; + + my $start = $matchPosition; + my $end = $matchPosition + $motifLength - 1; + my $print_sac = $self->print_sac; + + my %printsac = %{$print_sac}; + my $print = $printsac{$fingerprintName}; + my $seqname=$self->seqname; + my $feat = "$print,$start,$end,$percentageIdentity,$profileScore,$pvalue"; + my $new_feat = $self->create_feature($feat,$seqname); + return $new_feat; + } + if ($line =~ s/^F//) { + return; + } + next; } + next; + + } + + + +} + +=head2 create_feature + + Title : create_feature + Usage : my $feat=$prints_parser->create_feature($feature,$seqname) + Function: creates a SeqFeature Generic object + Returns : L + Args : + + +=cut + +sub create_feature { + my ($self, $feat,$sequenceId) = @_; + + my @f = split (/,/,$feat); + # create feature object + my $feature= Bio::SeqFeature::Generic->new(-seq_id =>$sequenceId, + -start=>$f[1], + -end => $f[2], + -score => $f[4], + -source => "PRINTS", + -primary =>$f[0], + -logic_name => "PRINTS", + ); + $feature->add_tag_value('evalue',$f[5]); + $feature->add_tag_value('percent_id',$f[3]); + + + + return $feature; + +} + +=head2 print_sac + + Title : print_sac + Usage : $prints_parser->print_sac($print_sac) + Function: get/set for print_sac + Returns : + Args : + + +=cut + +sub print_sac{ + my($self,$printsac)=@_; + + if(defined($printsac)) + { + $self->{'print_sac'}=$printsac; + } + return $self->{'print_sac'}; + +} + +=head2 seqname + + Title : seqname + Usage : $prints_parser->seqname($seqname) + Function: get/set for seqname + Returns : + Args : + + +=cut + +sub seqname{ + my($self,$seqname)=@_; + + if(defined($seqname)) + { + $self->{'seqname'}=$seqname; + } + + return $self->{'seqname'}; + +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Profile.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Profile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,169 @@ +# BioPerl module for Bio::Tools::Profile +# +# Cared for by Balamurugan Kumarasamy +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Profile - parse Profile output + +=head1 SYNOPSIS + + use Bio::Tools::Profile; + my $profile_parser = new Bio::Tools::Profile(-fh =>$filehandle ); + while( my $profile_feat = $profile_parser->next_result ) { + push @profile_feat, $profile_feat; +} + +=head1 DESCRIPTION + + Parser for Profile output + +=head1 FEEDBACK + +=head2 Mailing Lists + + User feedback is an integral part of the evolution of this and other + Bioperl modules. Send your comments and suggestions preferably to + the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + + Report bugs to the Bioperl bug tracking system to help us keep track + of the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ +=head1 AUTHOR - Balamurugan Kumarasamy + + Email: fugui@worf.fugu-sg.org + +=head1 APPENDIX + + The rest of the documentation details each of the object methods. + Internal methods are usually preceded with a _ + + +=cut + + +package Bio::Tools::Profile; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::FeaturePair; +use Bio::Root::IO; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + + + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Profile(); + Function: Builds a new Bio::Tools::Profile object + Returns : Bio::Tools::Profile + Args : -filename + -fh ($filehandle) + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_result + + Title : next_result + Usage : my $feat = $profile_parser->next_result + Function: Get the next result set from parser data + Returns : L + Args : none + + +=cut + +sub next_result { + my ($self) = @_; + + my %printsac; + my $line; + my @features; + while ($_=$self->_readline()) { + $line = $_; + chomp $line; + my ($nscore,$rawscore,$from,$to,$hfrom,$hto,$ac) = $line =~ /(\S+)\s+(\d+)\s*pos.\s+(\d*)\s*-\s+(\d*)\s*\[\s+(\d*),\s+(\S*)\]\s*(\w+)/; + #for example in this output line + #38.435 2559 pos. 19958 - 20212 [ 1, -1] PS50011|PROTEIN_KINASE_DOM Protein kinase domain profile. + #$nscore = 38.435 + #$rawscore = 2559 + #$from = 19958 + #$end = 20212 + #$hfrom = 1 + #$hto =-1 + #$ac = PS50011 + my $feat = "$ac,$from,$to,$hfrom,$hto,$nscore"; + my $new_feat= $self->create_feature($feat); + return $new_feat + + } +} + + +=head2 create_feature + + Title : create_feature + Usage : my $feat= $profile_parser->create_feature($feature) + Function: creates a Bio::SeqFeature::FeaturePair object + Returns : L + Args : + + +=cut + +sub create_feature { + my ($self, $feat) = @_; + + my @f = split (/,/,$feat); + + + my $hto = $f[4]; + + if ($f[4] =~ /-1/) { + + $hto = $f[2] - $f[1] + 1; + + } + + + my $feat1 = new Bio::SeqFeature::Generic ( -start => $f[1], + -end => $f[2], + -score => $f[5], + -source=>'pfscan', + -primary=>$f[0]); + + my $feat2 = new Bio::SeqFeature::Generic (-start => $f[3], + -end => $hto, + ); + + my $feature = new Bio::SeqFeature::FeaturePair(-feature1 => $feat1, + -feature2 => $feat2); + + return $feature; + +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Promoterwise.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Promoterwise.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,230 @@ +# BioPerl module for Bio::Tools::Promoterwise +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Promoterwise - DESCRIPTION of Object + +=head1 SYNOPSIS + + + use Bio::Tools::Promoterwise; + + my $pw = Bio::Tools::Promoterwise->new(-file=>"out", + -query1_seq=>$seq1, + -query2_seq=>$seq2); + while (my $fp = $pw->next_result){ + print "Hit Length: ".$fp->feature1->length."\n"; + print "Hit Start: ".$fp->feature1->start."\n"; + print "Hit End: ".$fp->feature1->end."\n"; + print "Hsps: \n"; + my @first_hsp = $fp->feature1->sub_SeqFeature; + my @second_hsp = $fp->feature2->sub_SeqFeature; + foreach my $i (0..$#first_hsp){ + print $first_hsp[$i]->start. " ".$first_hsp[$i]->end." ". + $second_hsp[$i]->start. " ".$second_hsp[$i]->end."\n"; + } + } + +=head1 DESCRIPTION + +Promoteriwise is an alignment algorithm that relaxes the constraint +that local alignments have to be co-linear. Otherwise it provides a +similar model to DBA, which is designed for promoter sequence +alignments. Promoterwise is written by Ewan Birney. It is part of +the wise2 package available at: +ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/ + +This module is the parser for the Promoterwise output in tab format. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Promoterwise; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Generic; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Promoterwise(); + Function: Builds a new Bio::Tools::Promoterwise object + Returns : L + Args : -fh/-file => $val, # for initing input, see Bio::Root::IO + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + my ($query1,$query2) = $self->_rearrange([qw(QUERY1_SEQ QUERY2_SEQ)],@args); + $self->query1_seq($query1) if ($query1); + $self->query2_seq($query2) if ($query2); + + return $self; +} + +=head2 next_result + + Title : next_result + Usage : my $r = $rpt_masker->next_result + Function: Get the next result set from parser data + Returns : an L + Args : none + + +=cut + +sub next_result { + my ($self) = @_; + $self->_parse unless $self->_parsed; + return $self->_next_result; +} + +sub _parse{ + my ($self) = @_; + my (%hash,@fp); + while ($_=$self->_readline()) { + chomp; + my @array = split; + push @{$hash{$array[$#array]}}, \@array; + } + foreach my $key(keys %hash){ + my $sf1 = Bio::SeqFeature::Generic->new(-primary=>"conserved_element", + -source_tag=>"promoterwise"); + $sf1->attach_seq($self->query1_seq) if $self->query1_seq; + my $sf2 = Bio::SeqFeature::Generic->new(-primary=>"conserved_element", + -source_tag=>"promoterwise"); + $sf2->attach_seq($self->query2_seq) if $self->query2_seq; + foreach my $info(@{$hash{$key}}){ + my ($score,$id1,$start_1,$end_1, $strand_1,$id2,$start_2,$end_2, + $strand_2,$group)= @{$info}; + if(!$sf1->strand && !$sf2->strand){ + $sf1->strand($strand_1); + $sf2->strand($strand_2); + $sf1->seq_id($id1); + $sf2->seq_id($id2); + $sf1->score($score); + $sf2->score($score); + } + my $sub1 = Bio::SeqFeature::Generic->new(-start=>$start_1, + -seq_id=>$id1, + -end =>$end_1, + -strand=>$strand_1, + -primary=>"conserved_element", + -source_tag=>"promoterwise", + -score=>$score); + $sub1->attach_seq($self->query1_seq) if $self->query1_seq; + + my $sub2 = Bio::SeqFeature::Generic->new(-start=>$start_2, + -seq_id=>$id2, + -end =>$end_2, + -strand=>$strand_2, + -primary=>"conserved_element", + -source_tag=>"promoterwise", + -score=>$score); + $sub2->attach_seq($self->query2_seq) if $self->query2_seq; + $sf1->add_SeqFeature($sub1,'EXPAND'); + $sf2->add_SeqFeature($sub2,'EXPAND'); + } + + my $fp = Bio::SeqFeature::FeaturePair->new(-feature1=>$sf1, + -feature2=>$sf2); + push @fp, $fp; + } + $self->_feature_pairs(\@fp); + $self->_parsed(1); + return; +} + +sub _feature_pairs { + my ($self,$fp) = @_; + if($fp){ + $self->{'_feature_pairs'} = $fp; + } + return $self->{'_feature_pairs'}; +} + +sub _next_result { + my ($self) = @_; + return undef unless (exists($self->{'_feature_pairs'}) && @{$self->{'_feature_pairs'}}); + return shift(@{$self->{'_feature_pairs'}}); +} +sub _parsed { + my ($self,$flag) = @_; + if($flag){ + $self->{'_flag'} = 1; + } + return $self->{'_flag'}; +} + +sub query1_seq { + my ($self,$val) = @_; + if($val){ + $self->{'query1_seq'} = $val; + } + return $self->{'query1_seq'}; +} +sub query2_seq { + my ($self,$val) = @_; + if($val){ + $self->{'query2_seq'} = $val; + } + return $self->{'query2_seq'}; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Pseudowise.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Pseudowise.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,263 @@ +# BioPerl module for Bio::Tools::Pseudowise +# +# Copyright Fugu Team +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Pseudowise - Results of one Pseudowise run + +=head1 SYNOPSIS + + use Bio::Tools::Pseudowise; + + my $parser = Bio::Tools::Pseudowise->new(-file=>"pw.out"); + while(my $feat = $parser->next_result){ + push @feat, $feat; + } + +=head1 DESCRIPTION + +Pseudowise is a pseudogene prediction program written by Ewan Birney as part of the +Wise Package. This module is the parser for the output of the program. + +http://www.sanger.ac.uk/software/wise2 + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Fugu Team + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Pseudowise; +use vars qw(@ISA); +use strict; +use Symbol; + +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::SeqFeature::Generic; +use Bio::SeqFeature::Gene::Exon; +use Bio::Tools::Run::WrapperBase; +use Bio::SeqFeature::FeaturePair; +use Bio::SeqFeature::Gene::Transcript; +use Bio::SeqFeature::Gene::GeneStructure; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my ($self,@args) = @_; + + # first call the inherited method! + $self->SUPER::_initialize_state(@args); + + # our private state variables + $self->{'_preds_parsed'} = 0; + $self->{'_has_cds'} = 0; + # array of pre-parsed predictions + $self->{'_preds'} = []; + # seq stack + $self->{'_seqstack'} = []; +} + +=head2 analysis_method + + Usage : $pseudowise->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /pseudowise/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /pseudowise/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $pseudowise->next_prediction()) { + # do something + } + Function: Returns the gene of the Pseudowise result + file. Call this method repeatedly until FALSE is returned. + + Example : + Returns : a Bio::SeqFeature::Generic + Args : + +=cut + +sub next_prediction { + my ($self,$filehandle) = @_; + my $gene; + + # if the prediction section hasn't been parsed yet, we do this now + $self->_parse_predictions($filehandle) unless $self->_predictions_parsed(); + + # get next gene structure + $gene = $self->_prediction(); + + return $gene; +} + +=head2 _parse_predictions + + Title : _parse_predictions() + Usage : $obj->_parse_predictions() + Function: Parses the prediction section. Automatically called by + next_prediction() if not yet done. + Example : + Returns : + +=cut + +sub _parse_predictions { + my ($self, $filehandle) = @_; + my $gene; + my @genes; + #The big parsing loop - parses exons and predicted peptides + while (<$filehandle>) + { + if (/Gene/i) + { + $gene = new Bio::SeqFeature::Generic ( + -primary => 'pseudogene', + -source => 'pseudowise'); + push @genes, $gene; + + while(<$filehandle>) { + my @gene_elements = split; + my $no = scalar(@gene_elements); + if ((/Gene/i) && $no == 3) { + my @element = split; + my $no = scalar(@element); + my $gene_start = $element[1]; + my $gene_end = $element[2]; + $gene->start($gene_start); + $gene->end($gene_end); + } + elsif (/Exon/i) { + my @element = split; + my $no = scalar(@element); + my $exon_start = $element[1]; + my $exon_end = $element[2]; + my $exon_phase = $element[4]; + my $exon = new Bio::SeqFeature::Generic ( + -start => $exon_start, + -end => $exon_end, + -primary => 'exon', + -source => 'pseudowise', + -frame => $exon_phase); + $gene->add_sub_SeqFeature($exon); + } + elsif ((/Gene/i) && $no != 3) { + $gene = new Bio::SeqFeature::Generic ( + -primary => 'pseudogene', + -source => 'pseudowise'); + push @genes, $gene; + } + } + } + } + $self->_add_prediction(\@genes); + $self->_predictions_parsed(1); + +} + +=head1 _prediction + + Title : _prediction() + Usage : $gene = $obj->_prediction() + Function: internal + Example : + Returns : + +=cut + +sub _prediction { + my ($self) = @_; + + return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); + return shift(@{$self->{'_preds'}}); +} + +=head2 _add_prediction + + Title : _add_prediction() + Usage : $obj->_add_prediction($gene) + Function: internal + Example : + Returns : + +=cut + +sub _add_prediction { + my ($self, $gene) = @_; + + if(! exists($self->{'_preds'})) { + $self->{'_preds'} = []; + } + push(@{$self->{'_preds'}}, $gene); +} + +=head2 _predictions_parsed + + Title : _predictions_parsed + Usage : $obj->_predictions_parsed + Function: internal + Example : + Returns : TRUE or FALSE + +=cut + +sub _predictions_parsed { + my ($self, $val) = @_; + + $self->{'_preds_parsed'} = $val if $val; + if(! exists($self->{'_preds_parsed'})) { + $self->{'_preds_parsed'} = 0; + } + return $self->{'_preds_parsed'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/RepeatMasker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/RepeatMasker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,157 @@ +# BioPerl module for Bio::Tools::RepeatMasker +# +# Cared for by Shawn Hoon +# +# Copyright Shawn Hoon +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::RepeatMasker - DESCRIPTION of Object + +=head1 SYNOPSIS + + use Bio::Tools::RepeatMasker; + my $parser = new Bio::Tools::RepeatMasker(-file => 'seq.fa.out'); + while( my $result = $parser->next_result ) { + + } + +=head1 DESCRIPTION + +A parser for RepeatMasker output + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Shawn Hoon + +Email shawnh@fugu-sg.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::RepeatMasker; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::SeqFeature::FeaturePair; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::RepeatMasker(); + Function: Builds a new Bio::Tools::RepeatMasker object + Returns : Bio::Tools::RepeatMasker + Args : -fh/-file => $val, # for initing input, see Bio::Root::IO + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + return $self; +} + +=head2 next_result + + Title : next_result + Usage : my $r = $rpt_masker->next_result + Function: Get the next result set from parser data + Returns : L + Args : none + + +=cut + +sub next_result{ + my ($self) = @_; + while ($_=$self->_readline()) { + if (/no repetitive sequences detected/) { + print STDERR "RepeatMasker didn't find any repetitive sequences\n"; + return ; + } + if (/\d+/) { #ignore introductory lines + my @element = split; + # ignore features with negatives + next if ($element[11-13] =~ /-/); + my (%feat1, %feat2); + my ($score, $query_name, $query_start, $query_end, $strand, + $repeat_name, $repeat_class ) = (split)[0, 4, 5, 6, 8, 9, 10]; + + my ($hit_start,$hit_end); + if ($strand eq '+') { + ($hit_start, $hit_end) = (split)[11, 12]; + $strand = 1; + } + elsif ($strand eq 'C') { + ($hit_start, $hit_end) = (split)[12, 13]; + $strand = -1; + } + my $rf = Bio::SeqFeature::Generic->new; + $rf->seq_id ($query_name); + $rf->score ($score); + $rf->start ($query_start); + $rf->end ($query_end); + $rf->strand ($strand); + $rf->source_tag ("RepeatMasker"); + $rf->primary_tag ($repeat_class); + my $rf2 = Bio::SeqFeature::Generic->new; + $rf2->seq_id ($repeat_name); + $rf2->score ($score); + $rf2->start ($hit_start); + $rf2->end ($hit_end); + $rf2->strand ($strand); + $rf2->source_tag ("RepeatMasker"); + $rf->primary_tag ($repeat_class); + my $fp = Bio::SeqFeature::FeaturePair->new(-feature1=>$rf, + -feature2=>$rf2); + + return $fp; + } + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/RestrictionEnzyme.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/RestrictionEnzyme.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1247 @@ +#------------------------------------------------------------------ +# $Id: RestrictionEnzyme.pm,v 1.25.2.1 2003/06/29 00:53:20 jason Exp $ +# +# BioPerl module Bio::Tools::RestrictionEnzyme +# +# Cared for by Steve Chervitz +# +# You may distribute this module under the same terms as perl itself +#------------------------------------------------------------------ + +## POD Documentation: + +=head1 NAME + +Bio::Tools::RestrictionEnzyme - Bioperl object for a restriction endonuclease +(cuts DNA at specific locations) + +=head1 SYNOPSIS + +=head2 Object Creation + + require Bio::Tools::RestrictionEnzyme; + + ## Create a new object by name. + + $re1 = new Bio::Tools::RestrictionEnzyme(-NAME =>'EcoRI'); + + ## Create a new object using special syntax + ## which specifies the enzyme name, recognition site, and cut position. + ## Used for enzymes not known to this module. + + $re2 = new Bio::Tools::RestrictionEnzyme(-NAME =>'EcoRV--GAT^ATC', + -MAKE =>'custom'); + + ## Get a list of the resulting fragments when a sequence is cut with + ## the given enzyme. The method expects a Bio::Seq object. + + @fragments = $re2->cut_seq($seqobj); + + ## Get a list of names of all available restriction enzymes + ## known to this module. + + @all = $re->available_list(); + + ## Get the names of restriction enzymes that have 6 bp + ## recognition sequences. + + @sixcutters = $re->available_list(6); + + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +The Bio::Tools::RestrictionEnzyme.pm module encapsulates generic data and +methods for using restriction endonucleases for in silico restriction +analysis of DNA sequences. + +=head2 Considerations + +This module is a precursor for a more full featured version that may do such +things as download data from online databases such as REBase http://www.neb.com/rebase/. +Thus, there is currently no functionality for obtaining data about commercial +availability for a restriction enzyme. + +At some point in the future, it may make sense to derive RestrictionEnzymes +from a class such as Bio::Enzyme or Bio::Prot::Protein (neither of which now +exist) so that more data about the enzyme and related information can be +easily obtained. + +This module is currently in use at + + http://genome-www.stanford.edu/Sacch3D/analysis/ + + +=head2 Digesting on Runs of N + +To digest a sequence on runs of N's in the sequence. Here's what you can do: + + $re_n = new Bio::Tools::RestrictionEnzyme(-name=>'N--NNNNN', + -make=>'custom'); + +Specify the number of N's you want to match in the -name parameter. +So the above example will recognize and cut at runs of 5 Ns. + If you wanted to cut at runs of 10 N's, you would use + + -name => 'N--NNNNNNNNNN' + +Note that you must use a specific number of N's, you cannot use a regexp to +digest at N+ for example, because the actual number of N's at each site are +not recorded when the sequence is analyzed. So cut_locations( ) wouldn't be +correct. + +=head1 EXAMPLES + +See the script examples/restriction.pl in the Bioperl distribution. + +=head1 DEPENDENCIES + +Bio::Tools::RestrictionEnzyme.pm is a concrete class that inherits from +B and uses by delegation B. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl +modules. Send your comments and suggestions preferably to one of the Bioperl +mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track the bugs +and their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz, Esac@bioperl.orgE + +=head1 COPYRIGHT + +Copyright (c) 1997-2002 Steve A. Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=head1 SEE ALSO + + Bio::Root::Root - Base class. + Bio::PrimarySeq - Lightweight sequence object. + + http://bio.perl.org/ - Bioperl Project Homepage + +=cut + +# +## +### +#### END of main POD documentation. +### +## +#' + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + + +package Bio::Tools::RestrictionEnzyme; +use strict; + +use Bio::Root::Root; +use Exporter; + +use vars qw (@ISA @EXPORT_OK %EXPORT_TAGS $ID $version @RE_available $Revision); + +@ISA = qw(Bio::Root::Root Exporter); +@EXPORT_OK = qw(@RE_available); +%EXPORT_TAGS = ( std => [qw(@RE_available)] ); + +$ID = 'Bio::Tools::RestrictionEnzyme'; +$version = 0.04; +$Revision = '$Id: RestrictionEnzyme.pm,v 1.25.2.1 2003/06/29 00:53:20 jason Exp $'; #' + +# Generated from REBASE version 208 (strider format), dated Aug 1 2002 +# using scripts/contributed/rebase2list.pl +# Syntax: RE-name => 'SITE CUTS-AT' where SITE and CUTS-AT are separated +# by a space. + +my %RE = ( + 'AasI' => 'GACNNNNNNGTC 7', + 'AatI' => 'AGGCCT 3', + 'AatII' => 'GACGTC 5', + 'AauI' => 'TGTACA 1', + 'AccI' => 'GTMKAC 2', + 'AccII' => 'CGCG 2', + 'AccIII' => 'TCCGGA 1', + 'Acc16I' => 'TGCGCA 3', + 'Acc65I' => 'GGTACC 1', + 'Acc113I' => 'AGTACT 3', + 'AccB1I' => 'GGYRCC 1', + 'AccB7I' => 'CCANNNNNTGG 7', + 'AclI' => 'AACGTT 2', + 'AcsI' => 'RAATTY 1', + 'AcvI' => 'CACGTG 3', + 'AcyI' => 'GRCGYC 2', + 'AdeI' => 'CACNNNGTG 6', + 'AfaI' => 'GTAC 2', + 'AfeI' => 'AGCGCT 3', + 'AflI' => 'GGWCC 1', + 'AflII' => 'CTTAAG 1', + 'AflIII' => 'ACRYGT 1', + 'AgeI' => 'ACCGGT 1', + 'AhaIII' => 'TTTAAA 3', + 'AhdI' => 'GACNNNNNGTC 6', + 'AhlI' => 'ACTAGT 1', + 'AleI' => 'CACNNNNGTG 5', + 'AluI' => 'AGCT 2', + 'Alw21I' => 'GWGCWC 5', + 'Alw44I' => 'GTGCAC 1', + 'AlwNI' => 'CAGNNNCTG 6', + 'Ama87I' => 'CYCGRG 1', + 'AocI' => 'CCTNAGG 2', + 'Aor51HI' => 'AGCGCT 3', + 'ApaI' => 'GGGCCC 5', + 'ApaBI' => 'GCANNNNNTGC 8', + 'ApaLI' => 'GTGCAC 1', + 'ApoI' => 'RAATTY 1', + 'AscI' => 'GGCGCGCC 2', + 'AseI' => 'ATTAAT 2', + 'AsiAI' => 'ACCGGT 1', + 'AsiSI' => 'GCGATCGC 5', + 'AsnI' => 'ATTAAT 2', + 'AspI' => 'GACNNNGTC 4', + 'Asp700I' => 'GAANNNNTTC 5', + 'Asp718I' => 'GGTACC 1', + 'AspEI' => 'GACNNNNNGTC 6', + 'AspHI' => 'GWGCWC 5', + 'AspLEI' => 'GCGC 3', + 'AspS9I' => 'GGNCC 1', + 'AsuI' => 'GGNCC 1', + 'AsuII' => 'TTCGAA 2', + 'AsuC2I' => 'CCSGG 2', + 'AsuNHI' => 'GCTAGC 1', + 'AvaI' => 'CYCGRG 1', + 'AvaII' => 'GGWCC 1', + 'AviII' => 'TGCGCA 3', + 'AvrII' => 'CCTAGG 1', + 'AxyI' => 'CCTNAGG 2', + 'BalI' => 'TGGCCA 3', + 'BamHI' => 'GGATCC 1', + 'BanI' => 'GGYRCC 1', + 'BanII' => 'GRGCYC 5', + 'BanIII' => 'ATCGAT 2', + 'BbeI' => 'GGCGCC 5', + 'BbrPI' => 'CACGTG 3', + 'BbuI' => 'GCATGC 5', + 'Bbv12I' => 'GWGCWC 5', + 'BclI' => 'TGATCA 1', + 'BcnI' => 'CCSGG 2', + 'BcoI' => 'CYCGRG 1', + 'BcuI' => 'ACTAGT 1', + 'BetI' => 'WCCGGW 1', + 'BfaI' => 'CTAG 1', + 'BfmI' => 'CTRYAG 1', + 'BfrI' => 'CTTAAG 1', + 'BfrBI' => 'ATGCAT 3', + 'BfuCI' => 'GATC 0', + 'BglI' => 'GCCNNNNNGGC 7', + 'BglII' => 'AGATCT 1', + 'BlnI' => 'CCTAGG 1', + 'BloHII' => 'CTGCAG 5', + 'BlpI' => 'GCTNAGC 2', + 'Bme18I' => 'GGWCC 1', + 'Bme1390I' => 'CCNGG 2', + 'Bme1580I' => 'GKGCMC 5', + 'BmtI' => 'GCTAGC 5', + 'BmyI' => 'GDGCHC 5', + 'BoxI' => 'GACNNNNGTC 5', + 'Bpu14I' => 'TTCGAA 2', + 'Bpu1102I' => 'GCTNAGC 2', + 'Bsa29I' => 'ATCGAT 2', + 'BsaAI' => 'YACGTR 3', + 'BsaBI' => 'GATNNNNATC 5', + 'BsaHI' => 'GRCGYC 2', + 'BsaJI' => 'CCNNGG 1', + 'BsaOI' => 'CGRYCG 4', + 'BsaWI' => 'WCCGGW 1', + 'BscI' => 'ATCGAT 2', + 'Bsc4I' => 'CCNNNNNNNGG 7', + 'BscBI' => 'GGNNCC 3', + 'BscFI' => 'GATC 0', + 'Bse8I' => 'GATNNNNATC 5', + 'Bse21I' => 'CCTNAGG 2', + 'Bse118I' => 'RCCGGY 1', + 'BseAI' => 'TCCGGA 1', + 'BseBI' => 'CCWGG 2', + 'BseCI' => 'ATCGAT 2', + 'BseDI' => 'CCNNGG 1', + 'BseJI' => 'GATNNNNATC 5', + 'BseLI' => 'CCNNNNNNNGG 7', + 'BsePI' => 'GCGCGC 1', + 'BseSI' => 'GKGCMC 5', + 'BseX3I' => 'CGGCCG 1', + 'BshI' => 'GGCC 2', + 'Bsh1236I' => 'CGCG 2', + 'Bsh1285I' => 'CGRYCG 4', + 'BshFI' => 'GGCC 2', + 'BshNI' => 'GGYRCC 1', + 'BshTI' => 'ACCGGT 1', + 'BsiBI' => 'GATNNNNATC 5', + 'BsiCI' => 'TTCGAA 2', + 'BsiEI' => 'CGRYCG 4', + 'BsiHKAI' => 'GWGCWC 5', + 'BsiHKCI' => 'CYCGRG 1', + 'BsiLI' => 'CCWGG 2', + 'BsiMI' => 'TCCGGA 1', + 'BsiQI' => 'TGATCA 1', + 'BsiSI' => 'CCGG 1', + 'BsiWI' => 'CGTACG 1', + 'BsiXI' => 'ATCGAT 2', + 'BsiYI' => 'CCNNNNNNNGG 7', + 'BsiZI' => 'GGNCC 1', + 'BslI' => 'CCNNNNNNNGG 7', + 'BsoBI' => 'CYCGRG 1', + 'Bsp13I' => 'TCCGGA 1', + 'Bsp19I' => 'CCATGG 1', + 'Bsp68I' => 'TCGCGA 3', + 'Bsp106I' => 'ATCGAT 2', + 'Bsp119I' => 'TTCGAA 2', + 'Bsp120I' => 'GGGCCC 1', + 'Bsp143I' => 'GATC 0', + 'Bsp143II' => 'RGCGCY 5', + 'Bsp1286I' => 'GDGCHC 5', + 'Bsp1407I' => 'TGTACA 1', + 'Bsp1720I' => 'GCTNAGC 2', + 'BspA2I' => 'CCTAGG 1', + 'BspCI' => 'CGATCG 4', + 'BspDI' => 'ATCGAT 2', + 'BspEI' => 'TCCGGA 1', + 'BspHI' => 'TCATGA 1', + 'BspLI' => 'GGNNCC 3', + 'BspLU11I' => 'ACATGT 1', + 'BspMII' => 'TCCGGA 1', + 'BspTI' => 'CTTAAG 1', + 'BspT104I' => 'TTCGAA 2', + 'BspT107I' => 'GGYRCC 1', + 'BspXI' => 'ATCGAT 2', + 'BsrBRI' => 'GATNNNNATC 5', + 'BsrFI' => 'RCCGGY 1', + 'BsrGI' => 'TGTACA 1', + 'BssAI' => 'RCCGGY 1', + 'BssECI' => 'CCNNGG 1', + 'BssHI' => 'CTCGAG 1', + 'BssHII' => 'GCGCGC 1', + 'BssKI' => 'CCNGG 0', + 'BssNAI' => 'GTATAC 3', + 'BssT1I' => 'CCWWGG 1', + 'Bst98I' => 'CTTAAG 1', + 'Bst1107I' => 'GTATAC 3', + 'BstACI' => 'GRCGYC 2', + 'BstAPI' => 'GCANNNNNTGC 7', + 'BstBI' => 'TTCGAA 2', + 'BstBAI' => 'YACGTR 3', + 'Bst4CI' => 'ACNGT 3', + 'BstC8I' => 'GCNNGC 3', + 'BstDEI' => 'CTNAG 1', + 'BstDSI' => 'CCRYGG 1', + 'BstEII' => 'GGTNACC 1', + 'BstENI' => 'CCTNNNNNAGG 5', + 'BstENII' => 'GATC 0', + 'BstFNI' => 'CGCG 2', + 'BstH2I' => 'RGCGCY 5', + 'BstHHI' => 'GCGC 3', + 'BstHPI' => 'GTTAAC 3', + 'BstKTI' => 'GATC 3', + 'BstMAI' => 'CTGCAG 5', + 'BstMCI' => 'CGRYCG 4', + 'BstMWI' => 'GCNNNNNNNGC 7', + 'BstNI' => 'CCWGG 2', + 'BstNSI' => 'RCATGY 5', + 'BstOI' => 'CCWGG 2', + 'BstPI' => 'GGTNACC 1', + 'BstPAI' => 'GACNNNNGTC 5', + 'BstSCI' => 'CCNGG 0', + 'BstSFI' => 'CTRYAG 1', + 'BstSNI' => 'TACGTA 3', + 'BstUI' => 'CGCG 2', + 'Bst2UI' => 'CCWGG 2', + 'BstXI' => 'CCANNNNNNTGG 8', + 'BstX2I' => 'RGATCY 1', + 'BstYI' => 'RGATCY 1', + 'BstZI' => 'CGGCCG 1', + 'BstZ17I' => 'GTATAC 3', + 'Bsu15I' => 'ATCGAT 2', + 'Bsu36I' => 'CCTNAGG 2', + 'BsuRI' => 'GGCC 2', + 'BsuTUI' => 'ATCGAT 2', + 'BtgI' => 'CCRYGG 1', + 'BthCI' => 'GCNGC 4', + 'Cac8I' => 'GCNNGC 3', + 'CaiI' => 'CAGNNNCTG 6', + 'CauII' => 'CCSGG 2', + 'CciNI' => 'GCGGCCGC 2', + 'CelII' => 'GCTNAGC 2', + 'CfoI' => 'GCGC 3', + 'CfrI' => 'YGGCCR 1', + 'Cfr9I' => 'CCCGGG 1', + 'Cfr10I' => 'RCCGGY 1', + 'Cfr13I' => 'GGNCC 1', + 'Cfr42I' => 'CCGCGG 4', + 'ChaI' => 'GATC 4', + 'ClaI' => 'ATCGAT 2', + 'CpoI' => 'CGGWCCG 2', + 'CspI' => 'CGGWCCG 2', + 'Csp6I' => 'GTAC 1', + 'Csp45I' => 'TTCGAA 2', + 'CspAI' => 'ACCGGT 1', + 'CviAII' => 'CATG 1', + 'CviJI' => 'RGCY 2', + 'CviRI' => 'TGCA 2', + 'CviTI' => 'RGCY 2', + 'CvnI' => 'CCTNAGG 2', + 'DdeI' => 'CTNAG 1', + 'DpnI' => 'GATC 2', + 'DpnII' => 'GATC 0', + 'DraI' => 'TTTAAA 3', + 'DraII' => 'RGGNCCY 2', + 'DraIII' => 'CACNNNGTG 6', + 'DrdI' => 'GACNNNNNNGTC 7', + 'DsaI' => 'CCRYGG 1', + 'DseDI' => 'GACNNNNNNGTC 7', + 'EaeI' => 'YGGCCR 1', + 'EagI' => 'CGGCCG 1', + 'Eam1105I' => 'GACNNNNNGTC 6', + 'Ecl136II' => 'GAGCTC 3', + 'EclHKI' => 'GACNNNNNGTC 6', + 'EclXI' => 'CGGCCG 1', + 'Eco24I' => 'GRGCYC 5', + 'Eco32I' => 'GATATC 3', + 'Eco47I' => 'GGWCC 1', + 'Eco47III' => 'AGCGCT 3', + 'Eco52I' => 'CGGCCG 1', + 'Eco72I' => 'CACGTG 3', + 'Eco81I' => 'CCTNAGG 2', + 'Eco88I' => 'CYCGRG 1', + 'Eco91I' => 'GGTNACC 1', + 'Eco105I' => 'TACGTA 3', + 'Eco130I' => 'CCWWGG 1', + 'Eco147I' => 'AGGCCT 3', + 'EcoHI' => 'CCSGG 0', + 'EcoICRI' => 'GAGCTC 3', + 'EcoNI' => 'CCTNNNNNAGG 5', + 'EcoO65I' => 'GGTNACC 1', + 'EcoO109I' => 'RGGNCCY 2', + 'EcoRI' => 'GAATTC 1', + 'EcoRII' => 'CCWGG 0', + 'EcoRV' => 'GATATC 3', + 'EcoT14I' => 'CCWWGG 1', + 'EcoT22I' => 'ATGCAT 5', + 'EcoT38I' => 'GRGCYC 5', + 'EgeI' => 'GGCGCC 3', + 'EheI' => 'GGCGCC 3', + 'ErhI' => 'CCWWGG 1', + 'EsaBC3I' => 'TCGA 2', + 'EspI' => 'GCTNAGC 2', + 'FatI' => 'CATG 0', + 'FauNDI' => 'CATATG 2', + 'FbaI' => 'TGATCA 1', + 'FblI' => 'GTMKAC 2', + 'FmuI' => 'GGNCC 4', + 'FnuDII' => 'CGCG 2', + 'Fnu4HI' => 'GCNGC 2', + 'FriOI' => 'GRGCYC 5', + 'FseI' => 'GGCCGGCC 6', + 'FspI' => 'TGCGCA 3', + 'FspAI' => 'RTGCGCAY 4', + 'Fsp4HI' => 'GCNGC 2', + 'FunI' => 'AGCGCT 3', + 'FunII' => 'GAATTC 1', + 'HaeI' => 'WGGCCW 3', + 'HaeII' => 'RGCGCY 5', + 'HaeIII' => 'GGCC 2', + 'HapII' => 'CCGG 1', + 'HgiAI' => 'GWGCWC 5', + 'HgiCI' => 'GGYRCC 1', + 'HgiJII' => 'GRGCYC 5', + 'HhaI' => 'GCGC 3', + 'Hin1I' => 'GRCGYC 2', + 'Hin6I' => 'GCGC 1', + 'HinP1I' => 'GCGC 1', + 'HincII' => 'GTYRAC 3', + 'HindII' => 'GTYRAC 3', + 'HindIII' => 'AAGCTT 1', + 'HinfI' => 'GANTC 1', + 'HpaI' => 'GTTAAC 3', + 'HpaII' => 'CCGG 1', + 'Hpy8I' => 'GTNNAC 3', + 'Hpy99I' => 'CGWCG 5', + 'Hpy178III' => 'TCNNGA 2', + 'Hpy188I' => 'TCNGA 3', + 'Hpy188III' => 'TCNNGA 2', + 'HpyCH4I' => 'CATG 3', + 'HpyCH4III' => 'ACNGT 3', + 'HpyCH4IV' => 'ACGT 1', + 'HpyCH4V' => 'TGCA 2', + 'HpyF10VI' => 'GCNNNNNNNGC 8', + 'Hsp92I' => 'GRCGYC 2', + 'Hsp92II' => 'CATG 4', + 'HspAI' => 'GCGC 1', + 'ItaI' => 'GCNGC 2', + 'KasI' => 'GGCGCC 1', + 'KpnI' => 'GGTACC 5', + 'Kpn2I' => 'TCCGGA 1', + 'KspI' => 'CCGCGG 4', + 'Ksp22I' => 'TGATCA 1', + 'KspAI' => 'GTTAAC 3', + 'Kzo9I' => 'GATC 0', + 'LpnI' => 'RGCGCY 3', + 'LspI' => 'TTCGAA 2', + 'MabI' => 'ACCWGGT 1', + 'MaeI' => 'CTAG 1', + 'MaeII' => 'ACGT 1', + 'MaeIII' => 'GTNAC 0', + 'MamI' => 'GATNNNNATC 5', + 'MboI' => 'GATC 0', + 'McrI' => 'CGRYCG 4', + 'MfeI' => 'CAATTG 1', + 'MflI' => 'RGATCY 1', + 'MhlI' => 'GDGCHC 5', + 'MlsI' => 'TGGCCA 3', + 'MluI' => 'ACGCGT 1', + 'MluNI' => 'TGGCCA 3', + 'Mly113I' => 'GGCGCC 2', + 'Mph1103I' => 'ATGCAT 5', + 'MroI' => 'TCCGGA 1', + 'MroNI' => 'GCCGGC 1', + 'MroXI' => 'GAANNNNTTC 5', + 'MscI' => 'TGGCCA 3', + 'MseI' => 'TTAA 1', + 'MslI' => 'CAYNNNNRTG 5', + 'MspI' => 'CCGG 1', + 'Msp20I' => 'TGGCCA 3', + 'MspA1I' => 'CMGCKG 3', + 'MspCI' => 'CTTAAG 1', + 'MspR9I' => 'CCNGG 2', + 'MssI' => 'GTTTAAAC 4', + 'MstI' => 'TGCGCA 3', + 'MunI' => 'CAATTG 1', + 'MvaI' => 'CCWGG 2', + 'MvnI' => 'CGCG 2', + 'MwoI' => 'GCNNNNNNNGC 7', + 'NaeI' => 'GCCGGC 3', + 'NarI' => 'GGCGCC 2', + 'NciI' => 'CCSGG 2', + 'NcoI' => 'CCATGG 1', + 'NdeI' => 'CATATG 2', + 'NdeII' => 'GATC 0', + 'NgoAIV' => 'GCCGGC 1', + 'NgoMIV' => 'GCCGGC 1', + 'NheI' => 'GCTAGC 1', + 'NlaIII' => 'CATG 4', + 'NlaIV' => 'GGNNCC 3', + 'Nli3877I' => 'CYCGRG 5', + 'NmuCI' => 'GTSAC 0', + 'NotI' => 'GCGGCCGC 2', + 'NruI' => 'TCGCGA 3', + 'NruGI' => 'GACNNNNNGTC 6', + 'NsbI' => 'TGCGCA 3', + 'NsiI' => 'ATGCAT 5', + 'NspI' => 'RCATGY 5', + 'NspIII' => 'CYCGRG 1', + 'NspV' => 'TTCGAA 2', + 'NspBII' => 'CMGCKG 3', + 'OliI' => 'CACNNNNGTG 5', + 'PacI' => 'TTAATTAA 5', + 'PaeI' => 'GCATGC 5', + 'PaeR7I' => 'CTCGAG 1', + 'PagI' => 'TCATGA 1', + 'PalI' => 'GGCC 2', + 'PauI' => 'GCGCGC 1', + 'PceI' => 'AGGCCT 3', + 'PciI' => 'ACATGT 1', + 'PdiI' => 'GCCGGC 3', + 'PdmI' => 'GAANNNNTTC 5', + 'Pfl23II' => 'CGTACG 1', + 'PflBI' => 'CCANNNNNTGG 7', + 'PflFI' => 'GACNNNGTC 4', + 'PflMI' => 'CCANNNNNTGG 7', + 'PfoI' => 'TCCNGGA 1', + 'PinAI' => 'ACCGGT 1', + 'Ple19I' => 'CGATCG 4', + 'PmaCI' => 'CACGTG 3', + 'PmeI' => 'GTTTAAAC 4', + 'PmlI' => 'CACGTG 3', + 'Ppu10I' => 'ATGCAT 1', + 'PpuMI' => 'RGGWCCY 2', + 'PpuXI' => 'RGGWCCY 2', + 'PshAI' => 'GACNNNNGTC 5', + 'PshBI' => 'ATTAAT 2', + 'PsiI' => 'TTATAA 3', + 'Psp03I' => 'GGWCC 4', + 'Psp5II' => 'RGGWCCY 2', + 'Psp6I' => 'CCWGG 0', + 'Psp1406I' => 'AACGTT 2', + 'PspAI' => 'CCCGGG 1', + 'Psp124BI' => 'GAGCTC 5', + 'PspEI' => 'GGTNACC 1', + 'PspGI' => 'CCWGG 0', + 'PspLI' => 'CGTACG 1', + 'PspN4I' => 'GGNNCC 3', + 'PspOMI' => 'GGGCCC 1', + 'PspPI' => 'GGNCC 1', + 'PspPPI' => 'RGGWCCY 2', + 'PssI' => 'RGGNCCY 5', + 'PstI' => 'CTGCAG 5', + 'PsuI' => 'RGATCY 1', + 'PsyI' => 'GACNNNGTC 4', + 'PvuI' => 'CGATCG 4', + 'PvuII' => 'CAGCTG 3', + 'RcaI' => 'TCATGA 1', + 'RsaI' => 'GTAC 2', + 'RsrII' => 'CGGWCCG 2', + 'Rsr2I' => 'CGGWCCG 2', + 'SacI' => 'GAGCTC 5', + 'SacII' => 'CCGCGG 4', + 'SalI' => 'GTCGAC 1', + 'SanDI' => 'GGGWCCC 2', + 'SatI' => 'GCNGC 2', + 'SauI' => 'CCTNAGG 2', + 'Sau96I' => 'GGNCC 1', + 'Sau3AI' => 'GATC 0', + 'SbfI' => 'CCTGCAGG 6', + 'ScaI' => 'AGTACT 3', + 'SciI' => 'CTCGAG 3', + 'ScrFI' => 'CCNGG 2', + 'SdaI' => 'CCTGCAGG 6', + 'SduI' => 'GDGCHC 5', + 'SecI' => 'CCNNGG 1', + 'SelI' => 'CGCG 0', + 'SexAI' => 'ACCWGGT 1', + 'SfcI' => 'CTRYAG 1', + 'SfeI' => 'CTRYAG 1', + 'SfiI' => 'GGCCNNNNNGGCC 8', + 'SfoI' => 'GGCGCC 3', + 'Sfr274I' => 'CTCGAG 1', + 'Sfr303I' => 'CCGCGG 4', + 'SfuI' => 'TTCGAA 2', + 'SgfI' => 'GCGATCGC 5', + 'SgrAI' => 'CRCCGGYG 2', + 'SgrBI' => 'CCGCGG 4', + 'SinI' => 'GGWCC 1', + 'SlaI' => 'CTCGAG 1', + 'SmaI' => 'CCCGGG 3', + 'SmiI' => 'ATTTAAAT 4', + 'SmiMI' => 'CAYNNNNRTG 5', + 'SmlI' => 'CTYRAG 1', + 'SnaBI' => 'TACGTA 3', + 'SpaHI' => 'GCATGC 5', + 'SpeI' => 'ACTAGT 1', + 'SphI' => 'GCATGC 5', + 'SplI' => 'CGTACG 1', + 'SrfI' => 'GCCCGGGC 4', + 'Sse9I' => 'AATT 0', + 'Sse232I' => 'CGCCGGCG 2', + 'Sse8387I' => 'CCTGCAGG 6', + 'Sse8647I' => 'AGGWCCT 2', + 'SseBI' => 'AGGCCT 3', + 'SspI' => 'AATATT 3', + 'SspBI' => 'TGTACA 1', + 'SstI' => 'GAGCTC 5', + 'SstII' => 'CCGCGG 4', + 'StuI' => 'AGGCCT 3', + 'StyI' => 'CCWWGG 1', + 'SunI' => 'CGTACG 1', + 'SwaI' => 'ATTTAAAT 4', + 'TaaI' => 'ACNGT 3', + 'TaiI' => 'ACGT 4', + 'TaqI' => 'TCGA 1', + 'TasI' => 'AATT 0', + 'TatI' => 'WGTACW 1', + 'TauI' => 'GCSGC 4', + 'TelI' => 'GACNNNGTC 4', + 'TfiI' => 'GAWTC 1', + 'ThaI' => 'CGCG 2', + 'TliI' => 'CTCGAG 1', + 'Tru1I' => 'TTAA 1', + 'Tru9I' => 'TTAA 1', + 'TscI' => 'ACGT 4', + 'TseI' => 'GCWGC 1', + 'Tsp45I' => 'GTSAC 0', + 'Tsp509I' => 'AATT 0', + 'Tsp4CI' => 'ACNGT 3', + 'TspEI' => 'AATT 0', + 'Tth111I' => 'GACNNNGTC 4', + 'TthHB8I' => 'TCGA 1', + 'UnbI' => 'GGNCC 0', + 'Van91I' => 'CCANNNNNTGG 7', + 'Vha464I' => 'CTTAAG 1', + 'VneI' => 'GTGCAC 1', + 'VpaK11AI' => 'GGWCC 0', + 'VpaK11BI' => 'GGWCC 1', + 'VspI' => 'ATTAAT 2', + 'XagI' => 'CCTNNNNNAGG 5', + 'XapI' => 'RAATTY 1', + 'XbaI' => 'TCTAGA 1', + 'XceI' => 'RCATGY 5', + 'XcmI' => 'CCANNNNNNNNNTGG 8', + 'XhoI' => 'CTCGAG 1', + 'XhoII' => 'RGATCY 1', + 'XmaI' => 'CCCGGG 1', + 'XmaIII' => 'CGGCCG 1', + 'XmaCI' => 'CCCGGG 1', + 'XmaJI' => 'CCTAGG 1', + 'XmiI' => 'GTMKAC 2', + 'XmnI' => 'GAANNNNTTC 5', + 'XspI' => 'CTAG 1', + 'ZhoI' => 'ATCGAT 2', + 'ZraI' => 'GACGTC 3', + 'Zsp2I' => 'ATGCAT 5', +); + +@RE_available = sort keys %RE; + + +=head1 new + + Title : new + Purpose : Initializes the RestrictionEnzyme object and calls + : superclass constructor last (Bio:Seq.pm). + Returns : n/a + Argument : Parameters passed to new() + Comments : A RestrictionEnzyme object manages its recognition sequence + : as a Bio::PrimarySeq object. + +See Also : L<_make_custom>(), L<_make_standard>(), B + +=cut + +#--------------- +sub new { +#--------------- + my($class, @args) = @_; + + my $self = $class->SUPER::new(@args); + my ($name,$make) = $self->_rearrange([qw(NAME MAKE)],@args); + $name && $self->name($name); + my %data; + if(defined $make && $make eq 'custom') { + %data = $self->_make_custom($name); + } else { + %data = $self->_make_standard($name); + } + $self->{'_seq'} = new Bio::PrimarySeq(%data, + -VERBOSE =>$self->verbose, + -alphabet => 'dna', + ); + return $self; +} + + +#=head1 _make_standard +# +# Title : _make_standard +# Usage : n/a; automatically called by _initialize() +# Purpose : Permits custom RE object construction from name. +# : 'EcoRI'. +# Returns : Hash containing named parameters for Bio::PrimarySeq.pm constructor. +# Argument : String containing string with special syntax. +# Throws : Exception if the requested enzyme name is unavailable. +# : NOTE: Case sensitive. +# +#See Also : L, L<_make_custom()|_make_custom> +# +#=cut + +#------------------ +sub _make_standard { +#------------------ + my($self, $name) = @_; + + $name =~ s/^\s+|\s+$//g; + + $self->is_available($name) || + $self->throw("Unavailable or undefined enzyme: $name (Note: CASE SENSITIVE)\n" . + "Currently available enzymes: \n@RE_available\n"); + + my @data = split( ' ', $RE{$name}); + my (%dat); + $dat{-SEQ} = $data[0]; + $dat{-NAME} = $dat{-ID}= $name; + $self->{'_cuts_after'} = $data[1]; + + return %dat; +} + + +#=head1 _make_custom +# +# Title : _make_custom +# Usage : n/a; automatically called by _initialize() +# Purpose : Permits custom RE object construction from strings +# : such as 'EcoRI--G^AATTC' as the name of the enzyme. +# Returns : Hash containing named parameters for Bio::PrimarySeq.pm constructor. +# Argument : String containing string with special syntax. +# Throws : Exception if the string has bad syntax. +# : Warning if the string did not specify cut position. +# : Places cut site after 5'-most position. +# +#See Also : L +# +#=cut + +#' +#----------------- +sub _make_custom { +#----------------- + my($self, $name) = @_; + + $name =~ s/\s+//g; + my @parts = split '--', $name; + my (%dat); + $dat{-NAME} = $dat{-ID} = $parts[0]; + $self->name($parts[0]); ## Reset name + + $parts[1] || return $self->throw("Undefined recognition site for $parts[0].", + "Use this syntax: EcoRV--GAT^ATC"); + ## Determine the cuts_after point. + my $cut_index = index $parts[1], '^'; + if( $cut_index <0) { $cut_index = 0; + $self->warn("Unknown cut position for $parts[0]. Assuming position 0\n" . + "Use carat to specify cut position (e.g., G^AATTC)"); } + $self->{'_cuts_after'} = $cut_index; + + ## Save the recognition sequence after removing the '^' + $parts[1] =~ s/\^//g; + $dat{-SEQ} = $parts[1]; + return %dat; +} + + +=head1 cuts_after + + Title : cuts_after + Usage : $num = $re->cuts_after(); + Purpose : Sets/Gets an integer indicating the position of cleavage + : relative to the 5' end of the recognition sequence. + Returns : Integer + Argument : Integer (optional) + Throws : Exception if argument is non-numeric. + Access : Public + Comments : This method is only needed to change the cuts at + : position. This data is automatically set during + : construction. + +See Also : L<_make_standard()|_make_standard>, L<_make_custom()|_make_custom> + +=cut + +#' +#--------------- +sub cuts_after { +#--------------- + my $self = shift; + if(@_) { my $num = shift; + if($num == 0 and $num ne '0') { + $self->throw("The cuts_after position be an integer ($num)"); + } + $self->{'_cuts_after'} = $num; + } + $self->{'_cuts_after'}; +} + + + +=head1 site + + Title : site + Usage : $re->site(); + Purpose : Gets the recognition sequence for the enzyme. + Example : $seq_string = $re->site(); + Returns : String containing recognition sequence indicating + : cleavage site as in 'G^AATTC'. + Argument : n/a + Throws : n/a + Comments : If you want a simple string representing the site without + any '^', use the string() method. + +See Also : L + +=cut + +#--------- +sub site { +#--------- + my $self = shift; + my $seq = $self->seq; + my $cuts_after = $self->cuts_after; + if($cuts_after > 0) { + if( $cuts_after >= $seq->length) { + return $seq->seq.'^'; + } else { + return $seq->subseq(1, $self->cuts_after).'^'.$seq->subseq($self->cuts_after+1, $seq->length); + } + } else { + return $seq->seq; + } +} + + +=head1 seq + + Title : seq + Usage : $re->seq(); + Purpose : Get the Bio::PrimarySeq.pm-derived object representing + : the recognition sequence + Returns : String + Argument : n/a + Throws : n/a + +See Also : L, L + +=cut + +#--------- +sub seq { my $self = shift; $self->{'_seq'}; } +#--------- + + + +=head1 string + + Title : string + Usage : $re->string(); + Purpose : Get a string representing the recognition sequence. + Returns : String. Does NOT contain a '^' representing the cut location + as returned by the site() method + Argument : n/a + Throws : n/a + Comments : Delegates to the Bio::PrimarySeq-derived object. + +See Also : L, L, L + +=cut + +#----------- +sub string { my $self = shift; $self->{'_seq'}->seq; } +#----------- + + + +=head1 revcom + + Title : revcom + Usage : $re->revcom(); + Purpose : Get a string representing the reverse complement of + : the recognition sequence. + Returns : String + Argument : n/a + Throws : n/a + Comments : Delegates to the Bio::PrimarySeq.pm-derived object, but needs to + get out the string from it, as now Bio::PrimarySeq->revcom makes a + Bio::PrimarySeq object + +See Also : L, L + +=cut + +#----------- +sub revcom { my $self = shift; $self->{'_seq'}->revcom->seq(); } +#----------- + + + +=head1 cut_seq + + Title : cut_seq + Usage : $re->cut_seq(); + Purpose : Conceptually cut or "digest" a DNA sequence with the given enzyme. + Example : $string = $re->cut_seq(); + Returns : List of strings containing the resulting fragments. + Argument : Reference to a Bio::PrimarySeq.pm-derived object. + Throws : Exception if argument is not an object. + : (Does not yet verify that it is derived from Bio::PrimarySeq.pm.) + Comments : Strategy relies on Perl's built-in split() function. + : Since split removes the recognition pattern, the resulting + : fragments are repaired after split()-ing. + : A side-effect of this is that for sites with ambiguous + : recognition sequence (i.e., containing N), the fragments + : will contain ambiguity characters instead of AGCT. + : + : There is currently no support for partial digestions. + : There is currently no support for circular sequences. + : (This should just involve merging the first and last frag + : if $seqObj->is_circular returns true). + +=cut + +#' +#------------- +sub cut_seq { +#------------- + my( $self, $seqObj) = @_; + if( !ref($seqObj) || + ! $seqObj->isa('Bio::PrimarySeqI') ) { + $self->throw( "Can't cut sequence. Missing or invalid object". + "seqObj: $seqObj"); + } + + my $cuts_after = $self->{'_cuts_after'}; + my ($site_3prime_seq, $site_5prime_seq); + my $reSeq = $self->seq; + if($cuts_after == 0) { + $site_3prime_seq = ''; + $site_5prime_seq = $reSeq->seq(); + } elsif($cuts_after == $reSeq->length) { + $site_3prime_seq = $reSeq->seq(); + $site_5prime_seq = ''; + } else { + $site_3prime_seq = $reSeq->subseq(1, $self->{'_cuts_after'}); + $site_5prime_seq = $reSeq->subseq($self->{'_cuts_after'}+1, $reSeq->length); + } + + $self->debug("3' site: $site_3prime_seq\n5' site: $site_5prime_seq\n"); + + my(@re_frags); + my $seq = uc $self->_expanded_string; + + if(!$self->palindromic and $self->name ne 'N') { + my $revseq = $self->_expanded_string( $reSeq->revcom->seq ); + $seq .= '|'.uc($revseq); + } + $self->debug(sprintf("$ID: site seq: %s\n\n", $seq)); + $self->debug(sprintf("$ID: splitting %s\n\n",$reSeq->seq)); + @re_frags = split(/$seq/i, $seqObj->seq); + + $self->debug("$ID: cut_seq, ".scalar @re_frags. " fragments.\n"); + + ## Re-attach the split recognition site back to the frags + ## since perl zapped them in the split() call. + my($i); + my $numFrags = scalar @re_frags; + for($i=0; $i<$numFrags; $i++) { + $i < $#re_frags and $re_frags[$i] = $re_frags[$i].$site_3prime_seq; + $i > 0 and $re_frags[$i] = $site_5prime_seq.$re_frags[$i]; + } + @re_frags; +} + +=head1 cut_locations + + Title : cut_locations + Usage : my $locations = $re->cut_locations(); + Purpose : Report the location of the recognition site(s) within + : an input sequence. + Example : my $locations = $re->annotate_seq($seqObj); + Returns : Arrayref of starting locations where enzyme would cut + Argument : Reference to a Bio::PrimarySeqI-derived sequence object. + Throws : n/a + Comments : + +=cut + +#----------------- +sub cut_locations { +#----------------- + my($self, $seqobj) = @_; + + my $site = $self->_expanded_string; + my $seq = $seqobj->seq; + study($seq); + my @locations; + while( $seq =~ /($site)/ig ) { + # $` is preceding string before pattern so length returns position + push @locations, length($`); + } + return \@locations; +} + +# Purpose : Expand nucleotide ambiguity codes to their representative letters +# Argument: (optional) the string to be expanded. If not supplied, used +# the string returned by $self->string(). +# Returns : String +sub _expanded_string { + my ($self, $str) = @_; + + $str ||= $self->string; + + if( $self->name ne 'N' ) { + $str =~ s/N|X/\./g; + $str =~ s/R/\[AG\]/g; + $str =~ s/Y/\[CT\]/g; + $str =~ s/S/\[GC\]/g; + $str =~ s/W/\[AT\]/g; + $str =~ s/M/\[AC\]/g; + $str =~ s/K/\[TG\]/g; + $str =~ s/B/\[CGT\]/g; + $str =~ s/D/\[AGT\]/g; + $str =~ s/H/\[ACT\]/g; + $str =~ s/V/\[ACG\]/g; + } + return $str; +} + + +=head1 annotate_seq + + Title : annotate_seq + Usage : $re->annotate_seq(); + Purpose : Identify the location of the recognition site(s) within + : an input sequence. Uses HTML. + Example : $annot_seq = $re->annotate_seq($seqObj); + Returns : String containing the annotated sequence. + Argument : Reference to a Bio::PrimarySeq.pm-derived sequence object. + Throws : n/a + Comments : The annotated sequence must be viewed with a web + : browser to see the location(s) of the recognition site(s). + +=cut + +#----------------- +sub annotate_seq { +#----------------- + my($self, $seqObj) = @_; + + my $site = $self->_expanded_string; + my $seq = $seqObj->seq; + + $seq =~ s|$site|$site|g; + return $seq; +} + + +=head1 palindromic + + Title : palindromic + Usage : $re->palindromic(); + Purpose : Determines if the recognition sequence is palindromic + : for the current restriction enzyme. + Returns : Boolean + Argument : n/a + Throws : n/a + Access : Public + Comments : A palindromic site (EcoRI): 5-GAATTC-3 + : 3-CTTAAG-5 + +=cut + +#---------------- +sub palindromic { +#---------------- + my $self = shift; + $self->string eq $self->revcom; +} + + + +=head1 is_available + + Title : is_available + Usage : $re->is_available(); + Purpose : Determine if an enzyme is available (to this module). + : (see the package lexical %RE). + Example : $re->is_available('EcoRI'); + : &Bio::Tools::RestrictionEnzyme::is_available($object,'EcoRI'); + Returns : Boolean + Argument : String + Throws : n/a + Comments : This method does NOT give information about + : commercial availability (yet). + : Enzyme names are CASE SENSITIVE. + +See Also : L + +=cut + +#---------------- +sub is_available { +#---------------- + my($self,$name) = @_; + exists $RE{$name}; +} + +#-------------- +sub available { +#-------------- + my($self,$name) = @_; + print STDERR "\nDeprecated method: $ID:: available(); ". + "use is_available() instead.\n"; + $self->is_available($name); +} + + +=head2 name + + Title : name + Usage : $obj->name($newval) + Function: + Example : + Returns : value of name + Args : newvalue (optional) + + +=cut + +sub name{ + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'name'} = $value; + } + return $obj->{'name'}; + +} + +=head1 available_list + + Title : available_list + Usage : $re->available_list([]); + Purpose : Retrieve a list of currently available enzymes. + Example : @all = $re->available_list(); ## All enzymes + : @six_cutters = $re->available_list(6); ## All 6-cutters + Returns : List of strings + Argument : Integer (optional) + Throws : n/a + Comments : This method may be more appropriate for a REData.pm class. + +See Also : L + +=cut + +#------------------- +sub available_list { +#------------------- + my($self,$size) = @_; + $size ||= 'all'; + + $size eq 'all' and return @RE_available; + + my(@data, @names); + foreach (@RE_available) { + @data = split /\s/, $RE{$_}; + if(length $data[0] == $size) { + push @names, $_; + } + } + @names; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Run/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Run/README Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,9 @@ + + This directory is now mainly in bioperl-run package +which you should install alongside Bioperl to get over +50 different runnable options. We might decide to merge +the two packages again, but the feeling is that bioperl +"core" is getting a little too big. + + Bioperl-run follows the same release tagging schedule +as Bioperl. Pick it up from www.bioperl.org and/or CPAN diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Run/RemoteBlast.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Run/RemoteBlast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,552 @@ +# $Id: RemoteBlast.pm,v 1.14.2.2 2003/09/03 18:29:50 jason Exp $ +# +# BioPerl module for Bio::Tools::Run::RemoteBlast +# +# Cared for by Jason Stajich, Mat Wiepert +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast +via HTTP + +=head1 SYNOPSIS + + #Remote-blast "factory object" creation and blast-parameter initialization + + use Bio::Tools::Run::RemoteBlast; + use strict; + my $prog = 'blastp'; + my $db = 'swissprot'; + my $e_val= '1e-10'; + + my @params = ( '-prog' => $prog, + '-data' => $db, + '-expect' => $e_val, + '-readmethod' => 'SearchIO' ); + + my $factory = Bio::Tools::Run::RemoteBlast->new(@params); + + #change a paramter + $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]'; + + #remove a parameter + delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; + + my $v = 1; + #$v is just to turn on and off the messages + + my $str = Bio::SeqIO->new(-file=>'amino.fa' , '-format' => 'fasta' ); + + while (my $input = $str->next_seq()){ + #Blast a sequence against a database: + + #Alternatively, you could pass in a file with many + #sequences rather than loop through sequence one at a time + #Remove the loop starting 'while (my $input = $str->next_seq())' + #and swap the two lines below for an example of that. + my $r = $factory->submit_blast($input); + #my $r = $factory->submit_blast('amino.fa'); + + print STDERR "waiting..." if( $v > 0 ); + while ( my @rids = $factory->each_rid ) { + foreach my $rid ( @rids ) { + my $rc = $factory->retrieve_blast($rid); + if( !ref($rc) ) { + if( $rc < 0 ) { + $factory->remove_rid($rid); + } + print STDERR "." if ( $v > 0 ); + sleep 5; + } else { + my $result = $rc->next_result(); + #save the output + my $filename = $result->query_name()."\.out"; + $factory->save_output($filename); + $factory->remove_rid($rid); + print "\nQuery Name: ", $result->query_name(), "\n"; + while ( my $hit = $result->next_hit ) { + next unless ( $v > 0); + print "\thit name is ", $hit->name, "\n"; + while( my $hsp = $hit->next_hsp ) { + print "\t\tscore is ", $hsp->score, "\n"; + } + } + } + } + } + } + + # This example shows how to change a CGI parameter: + $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM25'; + + # And this is how to delete a CGI parameter: + delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; + + +=head1 DESCRIPTION + +Class for remote execution of the NCBI Blast via HTTP. + +For a description of the many CGI parameters see: +http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html + +Various additional options and input formats are available. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Tools::Run::RemoteBlast; + +use vars qw($AUTOLOAD @ISA %BLAST_PARAMS $URLBASE %HEADER %RETRIEVALHEADER + $RIDLINE); +use strict; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::SeqIO; +use IO::String; +use Bio::Tools::BPlite; +use Bio::SearchIO; +use LWP; +use HTTP::Request::Common; +BEGIN { + $URLBASE = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi'; + %HEADER = ('CMD' => 'Put', + 'PROGRAM' => '', + 'DATABASE' => '', + 'FILTER' => 'L', + 'EXPECT' => '', + 'QUERY' => '', + 'CDD_SEARCH' => 'off', + 'COMPOSITION_BASED_STATISTICS' => 'off', + 'FORMAT_OBJECT' => 'Alignment', + 'SERVICE' => 'plain', + ); + + %RETRIEVALHEADER = ('CMD' => 'Get', + 'RID' => '', + 'ALIGNMENT_VIEW' => 'Pairwise', + 'DESCRIPTIONS' => 100, + 'ALIGNMENTS' => 50, + 'FORMAT_TYPE' => 'Text', + ); + + $RIDLINE = 'RID\s+=\s+(\S+)'; + + %BLAST_PARAMS = ( 'prog' => 'blastp', + 'data' => 'nr', + 'expect' => '1e-3', + 'readmethod' => 'SearchIO' + ); + +} + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +sub new { + my ($caller, @args) = @_; + # chained new + my $self = $caller->SUPER::new(@args); + # so that tempfiles are cleaned up + $self->_initialize_io(); + my ($prog, $data, $expect, + $readmethod) = $self->_rearrange([qw(PROG DATA + EXPECT + READMETHOD)], + @args); + + $readmethod = $BLAST_PARAMS{'readmethod'} unless defined $readmethod; + $prog = $BLAST_PARAMS{'prog'} unless defined $prog; + $data = $BLAST_PARAMS{'data'} unless defined $data; + $expect = $BLAST_PARAMS{'expect'} unless defined $expect; + $self->readmethod($readmethod); + $self->program($prog); + $self->database($data); + $self->expect($expect); + + return $self; +} + +=head2 header + + Title : header + Usage : my $header = $self->header + Function: Get/Set HTTP header for blast query + Returns : string + Args : none + +=cut + +sub header { + my ($self) = @_; + my %h = %HEADER; + $h{'PROGRAM'} = $self->program; + $h{'DATABASE'} = $self->database; + $h{'EXPECT'} = $self->expect; + return %h; +} + +=head2 readmethod + + Title : readmethod + Usage : my $readmethod = $self->readmethod + Function: Get/Set the method to read the blast report + Returns : string + Args : string [ Blast, BPlite ] + +=cut + +sub readmethod { + my ($self, $val) = @_; + if( defined $val ) { + $self->{'_readmethod'} = $val; + } + return $self->{'_readmethod'}; +} + + +=head2 program + + Title : program + Usage : my $prog = $self->program + Function: Get/Set the program to run + Returns : string + Args : string [ blastp, blastn, blastx, tblastn, tblastx ] + +=cut + +sub program { + my ($self, $val) = @_; + if( defined $val ) { + $val = lc $val; + if( $val !~ /t?blast[pnx]/ ) { + $self->warn("trying to set program to an invalid program name ($val) -- defaulting to blastp"); + $val = 'blastp'; + } +# $self->{'_program'} = $val; + $HEADER{'PROGRAM'} = $val; + } + return $HEADER{'PROGRAM'}; +} + + +=head2 database + + Title : database + Usage : my $db = $self->database + Function: Get/Set the database to search + Returns : string + Args : string [ swissprot, nr, nt, etc... ] + +=cut + +sub database { + my ($self, $val) = @_; + if( defined $val ) { +# $self->{'_database'} = $val; + $HEADER{'DATABASE'} = $val; + } + return $HEADER{'DATABASE'}; +} + + +=head2 expect + + Title : expect + Usage : my $expect = $self->expect + Function: Get/Set the E value cutoff + Returns : string + Args : string [ '1e-4' ] + +=cut + +sub expect { + my ($self, $val) = @_; + if( defined $val ) { +# $self->{'_expect'} = $val; + $HEADER{'EXPECT'} = $val; + } + return $HEADER{'EXPECT'}; +} + +=head2 ua + + Title : ua + Usage : my $ua = $self->ua or + $self->ua($ua) + Function: Get/Set a LWP::UserAgent for use + Returns : reference to LWP::UserAgent Object + Args : none + Comments: Will create a UserAgent if none has been requested before. + +=cut + +sub ua { + my ($self, $value) = @_; + if( ! defined $self->{'_ua'} ) { + $self->{'_ua'} = new LWP::UserAgent; + } + return $self->{'_ua'}; +} + +=head2 proxy + + Title : proxy + Usage : $httpproxy = $db->proxy('http') or + $db->proxy(['http','ftp'], 'http://myproxy' ) + Function: Get/Set a proxy for use of proxy + Returns : a string indicating the proxy + Args : $protocol : an array ref of the protocol(s) to set/get + $proxyurl : url of the proxy to use for the specified protocol + +=cut + +sub proxy { + my ($self,$protocol,$proxy) = @_; + return undef if ( !defined $self->ua || !defined $protocol + || !defined $proxy ); + return $self->ua->proxy($protocol,$proxy); +} + +sub add_rid { + my ($self, @vals) = @_; + foreach ( @vals ) { + $self->{'_rids'}->{$_} = 1; + } + return scalar keys %{$self->{'_rids'}}; +} + +sub remove_rid { + my ($self, @vals) = @_; + foreach ( @vals ) { + delete $self->{'_rids'}->{$_}; + } + return scalar keys %{$self->{'_rids'}}; +} + +sub each_rid { + my ($self) = @_; + return keys %{$self->{'_rids'}}; +} + +=head2 submit_blast + + Title : submit_blast + Usage : $self->submit_blast([$seq1,$seq2]); + Function: Submit blast jobs to ncbi blast queue on sequence(s) + Returns : Blast report object as defined by $self->readmethod + Args : input can be: + * sequence object + * array ref of sequence objects + * filename of file containing fasta formatted sequences + +=cut + +sub submit_blast { + my ($self, $input) = @_; + my @seqs = $self->_load_input($input); + return 0 unless ( @seqs ); + my $tcount = 0; + my %header = $self->header; + foreach my $seq ( @seqs ) { + #If query has a fasta header, the output has the query line. + $header{'QUERY'} = ">".(defined $seq->display_id() ? $seq->display_id() : ""). + " ".(defined $seq->desc() ? $seq->desc() : "")."\n".$seq->seq(); + my $request = POST $URLBASE, [%header]; + $self->warn($request->as_string) if ( $self->verbose > 0); + my $response = $self->ua->request( $request); + + if( $response->is_success ) { + if( $self->verbose > 0 ) { + my ($tempfh) = $self->tempfile(); + # Hmm, what exactly are we trying to do here? + print $tempfh $response->content; + close($tempfh); + undef $tempfh; + } + my @subdata = split(/\n/, $response->content ); + my $count = 0; + foreach ( @subdata ) { + if( /$RIDLINE/ ) { + $count++; + print STDERR $_ if( $self->verbose > 0); + $self->add_rid($1); + last; + } + } + if( $count == 0 ) { + $self->warn("req was ". $request->as_string() . "\n"); + $self->warn(join('', @subdata)); + } + $tcount += $count; + } else { + # should try and be a little more verbose here + $self->warn("req was ". $request->as_string() . "\n" . + $response->error_as_HTML); + $tcount = -1; + } + } + return $tcount; +} + +=head2 retrieve_blast + + Title : retrieve_blast + Usage : my $blastreport = $blastfactory->retrieve_blast($rid); + Function: Attempts to retrieve a blast report from remote blast queue + Returns : -1 on error, + 0 on 'job not finished', + Bio::Tools::BPlite or Bio::Tools::Blast object + (depending on how object was initialized) on success + Args : Remote Blast ID (RID) + +=cut + +sub retrieve_blast { + my($self, $rid) = @_; + my (undef,$tempfile) = $self->tempfile(); + my %hdr = %RETRIEVALHEADER; + $hdr{'RID'} = $rid; + my $req = POST $URLBASE, [%hdr]; + if( $self->verbose > 0 ) { + $self->warn("retrieve request is " . $req->as_string()); + } + my $response = $self->ua->request($req, $tempfile); + if( $self->verbose > 0 ) { + open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); + while() { print $_; } + close TMP; + } + if( $response->is_success ) { + my $size = -s $tempfile; + if( $size > 1000 ) { + my $blastobj; + if( $self->readmethod =~ /BPlite/ ) { + $blastobj = new Bio::Tools::BPlite(-file => $tempfile); + } else { + $blastobj = new Bio::SearchIO(-file => $tempfile, + -format => 'blast'); + } + #save tempfile + $self->file($tempfile); + return $blastobj; + } elsif( $size < 500 ) { # search had a problem + open(ERR, "<$tempfile") or $self->throw("cannot open file $tempfile"); + $self->warn(join("", )); + close ERR; + return -1; + } else { # still working + return 0; + } + } else { + $self->warn($response->error_as_HTML); + return -1; + } +} + +=head2 save_output + + Title : saveoutput + Usage : my $saveoutput = $self->save_output($filename) + Function: Method to save the blast report + Returns : 1 (throws error otherwise) + Args : string [rid, filename] + +=cut + +sub save_output { + my ($self, $filename) = @_; + if( ! defined $filename ) { + $self->throw("Can't save blast output. You must specify a filename to save to."); + } + #should be set when retrieving blast + my $blastfile = $self->file; + #open temp file and output file, have to filter out some HTML + open(TMP, $blastfile) or $self->throw("cannot open $blastfile"); + open(SAVEOUT, ">$filename") or $self->throw("cannot open $filename"); + my $seentop=0; + while() { + next if (/

    /);	
    +		if( /^(?:[T]?BLAST[NPX])\s*.+$/i ||
    +	   		/^RPS-BLAST\s*.+$/i ) {
    +	   		$seentop=1;
    +	   	}
    +	   	next if !$seentop;
    +	    if( $seentop ) {
    +			print SAVEOUT;
    +		}
    +	}
    +	close SAVEOUT;
    +	close TMP;
    +	return 1;	
    +}
    +
    +sub _load_input {
    +    my ($self, $input) = @_;
    +
    +    if( ! defined $input ) {
    +	$self->throw("Calling remote blast with no input");	
    +    }
    +    my @seqs;
    +    if( ! ref $input ) {
    +	if( -e $input ) {
    +	    my $seqio = new Bio::SeqIO(-format => 'fasta', -file => $input);
    +	    while( my $seq = $seqio->next_seq ) {
    +		push @seqs, $seq;
    +	    }
    +	} else {
    +	    $self->throw("Input $input was not a valid filename");
    +	}	
    +    } elsif( ref($input) =~ /ARRAY/i ) {
    +	foreach ( @$input ) {
    +	    if( ref($_) && $_->isa('Bio::PrimarySeqI') ) {
    +		push @seqs, $_;
    +	    } else {
    +		$self->warn("Trying to add a " . ref($_) .
    +			    " but expected a Bio::PrimarySeqI");
    +	    }
    +	}
    +	if( ! @seqs) {
    +	    $self->throw("Did not pass in valid input -- no sequence objects found");
    +	}
    +    } elsif( $input->isa('Bio::PrimarySeqI') ) {
    +	push @seqs, $input;
    +    }
    +    return @seqs;
    +}
    +1;
    +__END__
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Run/StandAloneBlast.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Run/StandAloneBlast.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,905 @@
    +# $Id: StandAloneBlast.pm,v 1.23.2.3 2003/03/29 20:18:51 jason Exp $
    +#
    +# BioPerl module for Bio::Tools::StandAloneBlast
    +#
    +# Cared for by Peter Schattner
    +#
    +# Copyright Peter Schattner
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::Run::StandAloneBlast - Object for the local execution of the
    +NCBI Blast program suite (blastall, blastpgp, bl2seq)
    +
    +=head1 SYNOPSIS
    +
    +Local-blast "factory object" creation and blast-parameter initialization:
    +
    + @params = ('database' => 'swissprot','outfile' => 'blast1.out', 
    +	    '_READMETHOD' => 'Blast');
    +
    + $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
    +
    +Blast a sequence against a database:
    +
    + $str = Bio::SeqIO->new(-file=>'t/amino.fa' , '-format' => 'Fasta' );
    + $input = $str->next_seq();
    + $input2 = $str->next_seq();
    + $blast_report = $factory->blastall($input);
    +
    +Run an iterated Blast (psiblast) of a sequence against a database:
    +
    + $factory->j(3);    # 'j' is blast parameter for # of iterations
    + $factory->outfile('psiblast1.out');
    + $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
    + $blast_report = $factory->blastpgp($input);
    +
    +Use blast to align 2 sequences against each other:
    +
    + $factory = Bio::Tools::Run::StandAloneBlast->new('outfile' => 'bl2seq.out');
    + $factory->bl2seq($input, $input2);
    +
    +Various additional options and input formats are available.  See the
    +DESCRIPTION section for details.
    +
    +=head1 DESCRIPTION
    +
    +This DESCRIPTION only documents Bio::Tools::Run::StandAloneBlast: - a
    +Bioperl object for running the NCBI standAlone BLAST package.  Blast,
    +itself, is a large & complex program - for more information regarding
    +BLAST, please see the BLAST documentation which accompanies the BLAST
    +distribution. BLAST is available from ftp://ncbi.nlm.nih.gov/blast/.
    +
    +(A source of confusion in documenting a BLAST interface is that the
    +term "program" is used in - at least - three different ways in the
    +BLAST documentation.  In this DESCRIPTION, "program" will refer to the
    +BLAST routine set by BLAST's C<-p> parameter that can be set to blastn,
    +blastp, tblastx etc.  We will use the term Blast "executable" to refer
    +to the various different executable files that may be called - ie
    +blastall, blastpgp or bl2seq.  In addition, there are several BLAST
    +capabilities (which are also referred to as "programs") and are
    +implemented by using specific combinations of BLAST executables,
    +programs and parameters.  They will be referred by their specific
    +names - eg PSIBLAST and PHIBLAST. )
    +
    +StandAloneBlast has been tested so far only under Linux. I expect
    +that it should also work under other Unix systems. However, since the
    +module is implemented using (unix) system calls, modification may be
    +necessary before StandAloneBlast would work under non-Unix
    +operating systems (eg Windows, MacOS).  Before running
    +StandAloneBlast it is necessary: to install BLAST on your system,
    +to edit set the environmental variable $BLASTDIR or your $PATH
    +variable to point to the BLAST directory, and to ensure that users
    +have execute privileges for the BLAST program.  If the databases
    +which will be searched by BLAST are located in the data subdirectory
    +of the blast program directory (the default installation location),
    +StandAloneBlast will find them; however, if the database files are
    +located in any other location, environmental variable $BLASTDATADIR
    +will need to be set to point to that directory.
    +
    +The use of the StandAloneBlast module is as follows: Initially, a
    +local blast "factory object" is created. The constructor may be passed
    +an optional array of (non-default) parameters to be used by the
    +factory, eg:
    +
    + @params = ('program' => 'blastn', 'database' => 'ecoli.nt');
    + $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
    +
    +Any parameters not explicitly set will remain as the defaults of the
    +BLAST executable.  Note each BLAST executable has somewhat different
    +parameters and options.  See the BLAST Documentation for a description
    +or run the BLAST executable from the command line followed solely with
    +a "-" to see a list of options and default values for that executable;
    +eg Eblastall -.
    +
    +BLAST parameters can be changed and/or examined at any time after the
    +factory has been created.  The program checks that any
    +parameter/switch being set/read is valid.  Except where specifically
    +noted, StandAloneBlast uses the same single-letter, case-sensitive
    +parameter names as the actual blast program.  Currently no checks are
    +included to verify that parameters are of the proper type (eg string
    +or numeric) or that their values are within the proper range.
    +
    +As an example, to change the value of the Blast parameter 'e' ('e' is
    +the parameter for expectation-value cutoff) 
    +
    + $expectvalue = 0.01;
    + $factory->e($expectvalue);
    +
    +Note that for improved script readibility one can modify the name of
    +the BLAST parameters as desired as long as the initial letter (and
    +case) of the parameter are preserved, eg
    +$factory-Eexpectvalue($expectvalue); Unfortunately, some of the BLAST
    +parameters are not the single letter one might expect (eg "iteration
    +round" in blastpgp is 'j'). Again one can check by using (eg)
    +
    + > blastpgp - .
    +
    +Once the factory has been created and the appropriate parameters set,
    + one can call one of the supported blast executables.  The input
    + sequence(s) to these executables may be fasta file(s) as described in
    + the BLAST documentation.
    +
    + $inputfilename = 't/testquery.fa';
    + $blast_report = $factory->blastall($inputfilename);
    +
    +In addition, sequence input may be in the form of either a Bio::Seq
    + object or or an array of Bio::Seq objects, eg
    +
    + $input = Bio::Seq->new(-id=>"test query",-seq=>"ACTACCCTTTAAATCAGTGGGGG");
    + $blast_report = $factory->blastall($input);
    +
    +For blastall and non-psiblast blastpgp runs, report object is either a
    +BPlite.pm or Bio::SearchIO object, selected by the user with the
    +parameter _READMETHOD.  (The leading underscore is needed to
    +distinguish this option from options which are passed to the BLAST
    +executable.) The default parser is Bio::SearchIO::blast.  For
    +(multiple iteration) psiblast and bl2seq runs the report is
    +automatically parsed by the BPpsilite.pm and BPbl2seq.pm parsers
    +respectively, since neither Blast.pm nor BPlite can parse these
    +reports. In any case, the "raw" blast report is also available. The
    +filename is set by the in the 'outfile' parameter and has the default
    +value of "blastreport.out".
    +
    +For psiblast execution in BLAST's "jumpstart" mode, the program must
    +be passed (in addition to the query sequence itself) an alignment
    +containing the query sequence (in the form of a SimpleAlign object) as
    +well as a "mask" specifying at what residues position-specific scoring
    +matrices (PSSMs) are to used and at what residues default scoring
    +matrices (eg BLOSUM) are to be used. See psiblast documentation for
    +more details.  The mask itself is a string of 0's and 1's which is the
    +same length as each sequence in the alignment and has a "1" at
    +locations where (PSSMs) are to be used and a "0" at all other
    +locations. So for example:
    +
    + $str = Bio::AlignIO->new(-file=> "cysprot.msf", '-format' => 'msf'  );
    + $aln = $str->next_aln();
    + $len = $aln->length_aln();
    + $mask =   '1' x $len;  # simple case where PSSM's to be used at all residues
    + $report = $factory->blastpgp("cysprot1.fa", $aln, $mask);
    +
    +For bl2seq execution, StandAloneBlast.pm can be combined with
    +AlignIO.pm to directly produce a SimpleAlign object from the alignment
    +of the two sequences produced by bl2seq as in:
    +
    + #Get 2 sequences
    + $str = Bio::SeqIO->new(-file=>'t/amino.fa' , '-format' => 'Fasta', );
    + my $seq3 = $str->next_seq();
    + my $seq4 = $str->next_seq();
    +
    + # Run bl2seq on them
    + $factory = Bio::Tools::Run::StandAloneBlast->new('outfile' => 'bl2seq.out');
    + my $bl2seq_report = $factory->bl2seq($seq3, $seq4);
    +
    + # Use AlignIO.pm to create a SimpleAlign object from the bl2seq report
    + $str = Bio::AlignIO->new(-file=> 'bl2seq.out','-format' => 'bl2seq');
    + $aln = $str->next_aln();
    +
    +For more examples of syntax and use of Blast.pm, the user is
    +encouraged to run the scripts standaloneblast.pl in the bioperl
    +/examples directory and StandAloneBlast.t in the bioperl /t directory.
    +
    +Note: There is a similar (but older) perl object interface offered by
    +nhgri. The nhgri module only supports blastall and does not support
    +blastpgp, psiblast, phiblast, bl2seq etc.  This module can be found at
    +http://genome.nhgri.nih.gov/blastall/.
    +
    +=head1 DEVELOPERS NOTES
    +
    +B
    +
    +Note: This module is still under development.  If you would like that a
    +specific BLAST feature be added to this perl interface, let me know.
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules. Send your comments and suggestions preferably to one
    +of the Bioperl mailing lists.  Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org               - General discussion
    +  http://bio.perl.org/MailList.html   - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution.  Bug reports can be submitted via email
    +or the web:
    +
    +  bioperl-bugs@bio.perl.org
    +  http://bio.perl.org/bioperl-bugs/
    +
    +=head1 AUTHOR -  Peter Schattner
    +
    +Email schattner@alum.mit.edu
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object
    +methods. Internal methods are usually preceded with a _
    +
    +=cut
    +
    +package Bio::Tools::Run::StandAloneBlast;
    +
    +use vars qw($AUTOLOAD @ISA $PROGRAMDIR  $DATADIR 
    +	    @BLASTALL_PARAMS @BLASTPGP_PARAMS 
    +	    @BL2SEQ_PARAMS @OTHER_PARAMS %OK_FIELD
    +	    );
    +use strict;
    +use Bio::Root::Root;
    +use Bio::Root::IO;
    +use Bio::Seq;
    +use Bio::SeqIO;
    +use Bio::Tools::BPbl2seq;
    +use Bio::Tools::BPpsilite;
    +use Bio::SearchIO;
    +use Bio::Tools::Run::WrapperBase;
    +use Bio::Factory::ApplicationFactoryI;
    +
    +BEGIN {      
    +
    +     @BLASTALL_PARAMS = qw( p d i e m o F G E X I q r v b f g Q
    +			    D a O J M W z K L Y S T l U y Z);
    +     @BLASTPGP_PARAMS = qw(d i A f e m o y P F G E X N g S H a I h c
    +			   j J Z O M v b C R W z K L Y p k T Q B l U);
    +     @BL2SEQ_PARAMS = qw(i j p g o d a G E X W M q r F e S T m);
    +
    +
    +# Non BLAST parameters start with underscore to differentiate them
    +# from BLAST parameters
    +     @OTHER_PARAMS = qw(_READMETHOD);
    +
    +# _READMETHOD = 'BPlite' (default) or 'Blast'
    +# my @other_switches = qw(QUIET);
    +
    +
    +# Authorize attribute fields
    +     foreach my $attr (@BLASTALL_PARAMS,  @BLASTPGP_PARAMS, 
    +		       @BL2SEQ_PARAMS, @OTHER_PARAMS )
    +     { $OK_FIELD{$attr}++; }
    +
    +# You will need to enable Blast to find the Blast program. This can be done
    +# in (at least) two different ways:
    +#  1. define an environmental variable blastDIR:
    +#	export BLASTDIR=/home/peter/blast   or
    +#  2. include a definition of an environmental variable BLASTDIR in every script that will
    +#     use StandAloneBlast.pm.
    +#	BEGIN {$ENV{BLASTDIR} = '/home/peter/blast/'; }
    +     $PROGRAMDIR = $ENV{'BLASTDIR'} || '';
    +     
    +# If local BLAST databases are not stored in the standard
    +# /data directory, the variable BLASTDATADIR will need to be set explicitly 
    +     $DATADIR =  $ENV{'BLASTDATADIR'} || $ENV{'BLASTDB'} || '';
    +}
    +
    +@ISA = qw(Bio::Root::Root 
    +	  Bio::Tools::Run::WrapperBase 
    +	  Bio::Factory::ApplicationFactoryI);
    +
    +=head1 BLAST parameters
    +
    +Essentially all BLAST parameter can be set via StandAloneBlast.pm.
    +Some of the most commonly used parameters are listed below.  All
    +parameters have defaults and are optional (I think.)  For a complete
    +listing of settable parameters, run the relevant executable BLAST
    +program with the option "-" as in blastall -
    +
    +=head2 Blastall
    +
    +  -p  Program Name [String]
    +        Input should be one of "blastp", "blastn", "blastx", 
    +        "tblastn", or "tblastx".
    +  -d  Database [String] default = nr
    +        The database specified must first be formatted with formatdb.
    +        Multiple database names (bracketed by quotations) will be accepted.
    +        An example would be -d "nr est"
    +  -i  Query File [File In]   Set by StandAloneBlast.pm from script.
    +    default = stdin. The query should be in FASTA format.  If multiple FASTA entries are in the input
    +        file, all queries will be searched.
    +  -e  Expectation value (E) [Real] default = 10.0
    +  -o  BLAST report Output File [File Out]  Optional,
    +	default = ./blastreport.out ; set by StandAloneBlast.pm		
    +  -S  Query strands to search against database (for blast[nx], and tblastx).  3 is both, 1 is top, 2 is bottom [Integer]
    +	default = 3
    +
    +=head2 Blastpgp (including Psiblast)
    +
    +  -j   is the maximum number of rounds (default 1; i.e., regular BLAST)
    +  -h   is the e-value threshold for including sequences in the
    +	score matrix model (default 0.001)
    +  -c   is the "constant" used in the pseudocount formula specified in the paper (default 10)
    +  -B  Multiple alignment file for PSI-BLAST "jump start mode"  Optional
    +  -Q  Output File for PSI-BLAST Matrix in ASCII [File Out]  Optional
    +
    +=head2 Bl2seq
    +
    +  -i  First sequence [File In]
    +  -j  Second sequence [File In]
    +  -p  Program name: blastp, blastn, blastx. For blastx 1st argument should be nucleotide [String]
    +    default = blastp
    +  -o  alignment output file [File Out] default = stdout
    +  -e  Expectation value (E) [Real]  default = 10.0
    +  -S  Query strands to search against database (blastn only).  3 is both, 1 is top, 2 is bottom [Integer]
    +    default = 3
    +
    +=cut
    +
    +sub new {
    +    my ($caller, @args) = @_;
    +    # chained new
    +    my $self = $caller->SUPER::new(@args);
    + 
    +    # to facilitiate tempfile cleanup
    +    my ($tfh,$tempfile) = $self->io->tempfile();
    +    close($tfh); # we don't want the filehandle, just a temporary name
    +    $self->outfile($tempfile);
    +    $self->_READMETHOD('Blast');
    +    while (@args)  {
    +	my $attr =   shift @args;
    +	my $value =  shift @args;
    +	next if( $attr eq '-verbose');
    +	# the workaround to deal with initializing
    +	$attr = 'p' if $attr =~ /^\s*program\s*$/;
    +	$self->$attr($value);
    +    }
    +    return $self;
    +}
    +
    +sub AUTOLOAD {
    +    my $self = shift;
    +    my $attr = $AUTOLOAD;
    +    $attr =~ s/.*:://;    
    +    my $attr_letter = substr($attr, 0, 1) ; 
    +
    +    # actual key is first letter of $attr unless first attribute
    +    # letter is underscore (as in _READMETHOD), the $attr is a BLAST
    +    # parameter and should be truncated to its first letter only
    +    $attr = ($attr_letter eq '_') ? $attr : $attr_letter;
    +    $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
    +#    $self->throw("Unallowed parameter: $attr !") unless $ok_field{$attr_letter};
    +    $self->{$attr_letter} = shift if @_;
    +    return $self->{$attr_letter};
    +}
    +
    +=head1 Methods
    +
    +=head2 executable
    +
    + Title   : executable
    + Usage   : my $exe = $blastfactory->executable('blastall');
    + Function: Finds the full path to the 'codeml' executable
    + Returns : string representing the full path to the exe
    + Args    : [optional] name of executable to set path to 
    +           [optional] boolean flag whether or not warn when exe is not found
    +
    +
    +=cut
    +
    +sub executable {
    +   my ($self, $exename, $exe,$warn) = @_;
    +   $exename = 'blastall' unless defined $exename;
    +
    +   if( defined $exe && -x $exe ) {
    +     $self->{'_pathtoexe'}->{$exename} = $exe;
    +   }
    +   unless( defined $self->{'_pathtoexe'}->{$exename} ) {
    +       my $f = $self->program_path($exename);	    
    +       $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f );
    +        
    +       #  This is how I meant to split up these conditionals --jason
    +       # if exe is null we will execute this (handle the case where
    +       # PROGRAMDIR pointed to something invalid)
    +       unless( $exe )  {  # we didn't find it in that last conditional
    +	   if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) {
    +	       $self->{'_pathtoexe'}->{$exename} = $exe;
    +	   } else { 
    +	       $self->warn("Cannot find executable for $exename") if $warn;
    +	       $self->{'_pathtoexe'}->{$exename} = undef;
    +	   }
    +       }
    +   }
    +   return $self->{'_pathtoexe'}->{$exename};
    +}
    +
    +
    +=head2 program_path
    +
    + Title   : program_path
    + Usage   : my $path = $factory->program_path();
    + Function: Builds path for executable 
    + Returns : string representing the full path to the exe
    + Args    : none
    +
    +=cut
    +
    +sub program_path {
    +    my ($self,$program_name) = @_;
    +    my @path;
    +    push @path, $self->program_dir if $self->program_dir;
    +    push @path, $program_name .($^O =~ /mswin/i ?'.exe':'');
    +
    +    return Bio::Root::IO->catfile(@path);
    +}
    +
    +=head2 program_dir
    +
    + Title   : program_dir
    + Usage   : my $dir = $factory->program_dir();
    + Function: Abstract get method for dir of program. To be implemented
    +           by wrapper.
    + Returns : string representing program directory 
    + Args    : none 
    +
    +=cut
    +
    +sub program_dir {
    +    $PROGRAMDIR;
    +}
    +
    +sub program {
    +    my $self = shift;
    +    if( wantarray ) {
    +	return ($self->executable, $self->p());
    +    } else {
    +	return $self->executable(@_);
    +    }
    +}
    +
    +=head2  blastall
    +
    + Title   : blastall
    + Usage   :  $blast_report = $factory->blastall('t/testquery.fa');
    +	or
    +	       $input = Bio::Seq->new(-id=>"test query",
    +				      -seq=>"ACTACCCTTTAAATCAGTGGGGG");
    +	       $blast_report = $factory->blastall($input);
    +	or 
    +	      $seq_array_ref = \@seq_array;  # where @seq_array is an array of Bio::Seq objects
    +	      $blast_report = $factory->blastall(\@seq_array);
    + Returns :  Reference to a Blast object or BPlite object 
    +           containing the blast report.
    + Args    : Name of a file or Bio::Seq object or an array of 
    +           Bio::Seq object containing the query sequence(s). 
    +           Throws an exception if argument is not either a string 
    +           (eg a filename) or a reference to a Bio::Seq object 
    +           (or to an array of Seq objects).  If argument is string, 
    +           throws exception if file corresponding to string name can 
    +           not be found.
    +
    +=cut
    +
    +sub blastall {
    +    my ($self,$input1) = @_;
    +    $self->io->_io_cleanup();
    +    my $executable = 'blastall';
    +    my $input2;
    +# Create input file pointer
    +    my $infilename1 = $self->_setinput($executable, $input1);
    +    if (! $infilename1) {$self->throw(" $input1 ($infilename1) not Bio::Seq object or array of Bio::Seq objects or file name!");}
    +
    +    $self->i($infilename1);	# set file name of sequence to be blasted to inputfilename1 (-i param of blastall)
    +    
    +    my $blast_report = &_generic_local_blast($self, $executable, 
    +					     $input1, $input2);
    +}
    +
    +=head2  blastpgp
    +
    + Title   : blastpgp
    + Usage   :  $blast_report = $factory-> blastpgp('t/testquery.fa');
    +	or
    +	       $input = Bio::Seq->new(-id=>"test query",
    +				      -seq=>"ACTADDEEQQPPTCADEEQQQVVGG");
    +	       $blast_report = $factory->blastpgp ($input);
    +	or 
    +	      $seq_array_ref = \@seq_array;  # where @seq_array is an array of Bio::Seq objects
    +	      $blast_report = $factory-> blastpgp(\@seq_array);
    + Returns : Reference to a Blast object or BPlite object containing 
    +           the blast report.
    + Args    : Name of a file or Bio::Seq object. In psiblast jumpstart 
    +           mode two additional arguments are required: a SimpleAlign 
    +           object one of whose elements is the query and a "mask" to 
    +           determine how BLAST should select scoring matrices see 
    +           DESCRIPTION above for more details.
    +
    +           Throws an exception if argument is not either a string 
    +           (eg a filename) or a reference to a Bio::Seq object 
    +           (or to an array of Seq objects).  If argument is string, 
    +           throws exception if file corresponding to string name can 
    +           not be found.
    + Returns : Reference to either a BPlite.pm, Blast.pm or BPpsilite.pm  
    +           object containing the blast report.
    +
    +=cut
    +
    +sub blastpgp {
    +    my $self = shift;
    +    my $executable = 'blastpgp';
    +    my $input1 = shift;
    +    my $input2 = shift;
    +    my $mask = shift;		# used by blastpgp's -B option to specify which residues are position aligned
    +
    +    my  ($infilename1, $infilename2 )  = $self->_setinput($executable, 
    +							  $input1, $input2, 
    +							  $mask);
    +    if (!$infilename1) {$self->throw(" $input1  not Bio::Seq object or array of Bio::Seq objects or file name!");}
    +    $self->i($infilename1);	# set file name of sequence to be blasted to inputfilename1 (-i param of blastpgp)
    +    if  ($input2) {
    +	unless ($infilename2) {$self->throw("$input2 not SimpleAlign Object in pre-aligned psiblast\n");}
    +	$self->B($infilename2);	# set file name of partial alignment to inputfilename2 (-B param of blastpgp)
    +    }
    +    my $blast_report = &_generic_local_blast($self, $executable, $input1, $input2);
    +}
    +
    +=head2   bl2seq
    +
    + Title   : bl2seq
    + Usage   : $factory-> blastpgp('t/seq1.fa', 't/seq2.fa');
    +	or
    +	  $input1 = Bio::Seq->new(-id=>"test query1",
    +				  -seq=>"ACTADDEEQQPPTCADEEQQQVVGG");
    +	  $input2 = Bio::Seq->new(-id=>"test query2",
    +				  -seq=>"ACTADDEMMMMMMMDEEQQQVVGG");
    +	  $blast_report = $factory->bl2seq ($input1,  $input2);
    + Returns : Reference to a BPbl2seq object containing the blast report.
    + Args    : Names of 2 files  or 2 Bio::Seq objects containing the 
    +           sequences to be aligned by bl2seq.
    +
    +           Throws an exception if argument is not either a pair of 
    +           strings (eg filenames) or  references to Bio::Seq objects.  
    +           If arguments are strings, throws exception if files 
    +           corresponding to string names can not be found.
    +
    +=cut
    +
    +sub bl2seq {
    +    my $self = shift;
    +    my $executable = 'bl2seq';
    +    my $input1 = shift;
    +    my $input2 = shift;
    +
    +# Create input file pointer
    +    my  ($infilename1, $infilename2 )  = $self->_setinput($executable, 
    +							  $input1, $input2);
    +    if (!$infilename1){$self->throw(" $input1  not Seq Object or file name!");}
    +    if (!$infilename2){$self->throw("$input2  not Seq Object or file name!");}
    +
    +    $self->i($infilename1);	# set file name of first sequence to 
    +                                # be aligned to inputfilename1 
    +                                # (-i param of bl2seq)
    +    $self->j($infilename2);	# set file name of first sequence to 
    +                                # be aligned to inputfilename2 
    +                                # (-j param of bl2seq)
    +
    +    my $blast_report = &_generic_local_blast($self, $executable);    
    +}
    +#################################################
    +
    +=head2  _generic_local_blast
    +
    + Title   : _generic_local_blast
    + Usage   :  internal function not called directly
    + Returns :  Blast or BPlite object
    + Args    :   Reference to calling object and name of BLAST executable 
    +
    +=cut
    +
    +sub _generic_local_blast {
    +    my $self = shift;
    +    my $executable = shift;
    +
    +    # Create parameter string to pass to Blast program
    +    my $param_string = $self->_setparams($executable);
    +
    +    # run Blast
    +    my $blast_report = &_runblast($self, $executable, $param_string);
    +}
    +
    +
    +=head2  _runblast
    +
    + Title   :  _runblast
    + Usage   :  Internal function, not to be called directly	
    + Function:   makes actual system call to Blast program
    + Example :
    + Returns : Report object in the appropriate format (BPlite, 
    +           BPpsilite, Blast, or BPbl2seq)
    + Args    : Reference to calling object, name of BLAST executable, 
    +           and parameter string for executable 
    +
    +=cut
    +
    +sub _runblast {
    +    my ($self,$executable,$param_string) = @_;
    +    my ($blast_obj,$exe);
    +    if( ! ($exe = $self->executable($executable)) ) {
    +	$self->warn("cannot find path to $executable");
    +	return undef;    
    +    }
    +    my $commandstring = $exe. $param_string;
    +   
    +    # next line for debugging
    +    $self->debug( "$commandstring \n");
    +
    +    my $status = system($commandstring);
    +
    +    $self->throw("$executable call crashed: $? $commandstring\n")  unless ($status==0) ;
    +    my $outfile = $self->o() ;	# get outputfilename
    +    my $signif = $self->e()  || 1e-5  ; 
    +
    +# set significance cutoff to set expectation value or default value
    +# (may want to make this value vary for different executables)
    +
    +# If running bl2seq or psiblast (blastpgp with multiple iterations),
    +# the specific parsers for these programs must be used (ie BPbl2seq or
    +# BPpsilite).  Otherwise either the Blast parser or the BPlite
    +# parsers can be selected.
    +
    +    if ($executable =~ /bl2seq/i)  {
    +        if( $self->verbose > 0 ) {
    +	 open(OUT, $outfile) || $self->throw("cannot open $outfile");
    +	 while() { $self->debug($_)}
    +	 close(OUT);
    +        }
    +# Added program info so BPbl2seq can compute strand info
    +	$blast_obj = Bio::Tools::BPbl2seq->new(-file => $outfile,
    +                                               -REPORT_TYPE => $self->p );
    +#	$blast_obj = Bio::Tools::BPbl2seq->new(-file => $outfile);
    +    }
    +    elsif ($executable =~ /blastpgp/i && defined $self->j() && 
    +	   $self->j() > 1)  {
    +	print "using psilite parser\n";
    +	$blast_obj = Bio::Tools::BPpsilite->new(-file => $outfile);
    +    }
    +    elsif ($self->_READMETHOD =~ /^Blast/i )  {
    +	$blast_obj = Bio::SearchIO->new(-file=>$outfile,
    +					-format => 'blast'   )  ;
    +    }
    +    elsif ($self->_READMETHOD =~ /^BPlite/i )  {
    +	$blast_obj = Bio::Tools::BPlite->new(-file=>$outfile);
    +    } else {
    +	$self->warn("Unrecognized readmethod ".$self->_READMETHOD. " or executable $executable\n");
    +    }
    +    return $blast_obj;
    +}
    +
    +=head2  _setinput
    +
    + Title   :  _setinput
    + Usage   :  Internal function, not to be called directly	
    + Function:   Create input file(s) for Blast executable
    + Example :
    + Returns : name of file containing Blast data input
    + Args    : Seq object reference or input file name
    +
    +=cut
    +
    +sub _setinput {
    +    my ($self, $executable, $input1, $input2) = @_;
    +    my ($seq, $temp, $infilename1, $infilename2,$fh ) ;
    +#  If $input1 is not a reference it better be the name of a file with
    +#  the sequence/ alignment data...
    +    $self->io->_io_cleanup();
    +
    +  SWITCH:  {
    +      unless (ref $input1) {
    +	  $infilename1 = (-e $input1) ? $input1 : 0 ;
    +	  last SWITCH; 
    +      }
    +#  $input may be an array of BioSeq objects...
    +      if (ref($input1) =~ /ARRAY/i ) {
    +	  ($fh,$infilename1) = $self->io->tempfile();
    +	  $temp =  Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta');
    +	  foreach $seq (@$input1) {
    +	      unless ($seq->isa("Bio::PrimarySeqI")) {return 0;}
    +	      $temp->write_seq($seq);
    +	  }
    +	  close $fh;
    +	  $fh = undef;
    +	  last SWITCH;
    +      }
    +#  $input may be a single BioSeq object...
    +      elsif ($input1->isa("Bio::PrimarySeqI")) {
    +	  ($fh,$infilename1) = $self->io->tempfile();
    +
    +# just in case $input1 is taken from an alignment and has spaces (ie
    +# deletions) indicated within it, we have to remove them - otherwise
    +# the BLAST programs will be unhappy
    +
    +	  my $seq_string =  $input1->seq();
    +	  $seq_string =~ s/\W+//g; # get rid of spaces in sequence
    +	  $input1->seq($seq_string);
    +	  $temp =  Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta');
    +	  $temp->write_seq($input1);
    +	  close $fh;
    +	  undef $fh;
    +#		$temp->write_seq($input1);
    +	  last SWITCH;
    +      }
    +      $infilename1 = 0;		# Set error flag if you get here
    +  }				# End SWITCH
    +    unless ($input2) { return $infilename1; }
    +  SWITCH2:  {
    +      unless (ref $input2) {
    +	  $infilename2 =   (-e $input2) ? $input2 : 0 ;
    +	  last SWITCH2; 
    +      }
    +      if ($input2->isa("Bio::PrimarySeqI")  && $executable  eq 'bl2seq' ) {
    +	  ($fh,$infilename2) = $self->io->tempfile();
    +
    +	  $temp =  Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta');
    +	  $temp->write_seq($input2);
    +	  close $fh;
    +	  undef $fh;
    +	  last SWITCH2;
    +      }
    +# Option for using psiblast's pre-alignment "jumpstart" feature
    +      elsif ($input2->isa("Bio::SimpleAlign")  && 
    +	     $executable  eq 'blastpgp' ) {
    +           # a bit of a lie since it won't be a fasta file
    +	  ($fh,$infilename2) = $self->io->tempfile(); 
    +
    +# first we retrieve the "mask" that determines which residues should
    +# by scored according to their position and which should be scored
    +# using the non-position-specific matrices
    +
    +	  my @mask = split("", shift );	#  get mask
    +
    +# then we have to convert all the residues in every sequence to upper
    +# case at the positions that we want psiblast to use position specific
    +# scoring
    +
    +	  foreach $seq ( $input2->each_seq() ) {
    +	      my @seqstringlist = split("",$seq->seq());
    +	      for (my $i = 0; $i < scalar(@mask); $i++) {
    +		  unless ( $seqstringlist[$i] =~ /[a-zA-Z]/ ) {next}
    +		  $seqstringlist[$i] = $mask[$i] ? uc $seqstringlist[$i]: lc $seqstringlist[$i] ;
    +	      }
    +	      my $newseqstring = join("", @seqstringlist);
    +	      $seq->seq($newseqstring);
    +	  }
    +          #  Now we need to write out the alignment to a file 
    +          # in the "psi format" which psiblast is expecting
    +	  $input2->map_chars('\.','-');
    +	  $temp =  Bio::AlignIO->new(-fh=> $fh, '-format' => 'psi');
    +	  $temp->write_aln($input2);
    +	  close $fh;
    +	  undef $fh;
    +	  last SWITCH2;
    +      }
    +      $infilename2 = 0;		# Set error flag if you get here
    +  }				# End SWITCH2
    +    return ($infilename1, $infilename2);
    +}
    +
    +=head2  _setparams
    +
    + Title   : _setparams
    + Usage   : Internal function, not to be called directly	
    + Function: Create parameter inputs for Blast program
    + Example :
    + Returns : parameter string to be passed to Blast 
    + Args    : Reference to calling object and name of BLAST executable
    +
    +=cut
    +
    +sub _setparams {
    +    my ($self,$executable) = @_;
    +    my ($attr, $value, @execparams);
    +
    +    if ($executable eq 'blastall') {@execparams = @BLASTALL_PARAMS; }
    +    if ($executable eq 'blastpgp') {@execparams = @BLASTPGP_PARAMS; }
    +    if ($executable eq 'bl2seq') {@execparams = @BL2SEQ_PARAMS; }
    +
    +    my $param_string = "";
    +    for $attr ( @execparams ) {
    +	$value = $self->$attr();
    +	next unless (defined $value);
    +# Need to prepend datadirectory to database name
    +	if ($attr  eq 'd' && ($executable ne 'bl2seq')) { 
    +# This is added so that you can specify a DB with a full path
    +	  if (! (-e $value.".nin" || -e $value.".pin")){ 
    +	    $value = File::Spec->catdir($DATADIR,$value);
    +	  }
    +	}
    +# put params in format expected by Blast
    +	$attr  = '-'. $attr ;       
    +	$param_string .= " $attr  $value ";
    +    }
    +
    +# if ($self->quiet()) { $param_string .= '  >/dev/null';}
    +
    +    return $param_string;
    +}
    +
    +
    +=head1 Bio::Tools::Run::Wrapper methods
    +
    +=cut
    +
    +=head2 no_param_checks
    +
    + Title   : no_param_checks
    + Usage   : $obj->no_param_checks($newval)
    + Function: Boolean flag as to whether or not we should
    +           trust the sanity checks for parameter values  
    + Returns : value of no_param_checks
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +=head2 save_tempfiles
    +
    + Title   : save_tempfiles
    + Usage   : $obj->save_tempfiles($newval)
    + Function: 
    + Returns : value of save_tempfiles
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +=head2 outfile_name
    +
    + Title   : outfile_name
    + Usage   : my $outfile = $tcoffee->outfile_name();
    + Function: Get/Set the name of the output file for this run
    +           (if you wanted to do something special)
    + Returns : string
    + Args    : [optional] string to set value to
    +
    +
    +=cut
    +
    +
    +=head2 tempdir
    +
    + Title   : tempdir
    + Usage   : my $tmpdir = $self->tempdir();
    + Function: Retrieve a temporary directory name (which is created)
    + Returns : string which is the name of the temporary directory
    + Args    : none
    +
    +
    +=cut
    +
    +=head2 cleanup
    +
    + Title   : cleanup
    + Usage   : $tcoffee->cleanup();
    + Function: Will cleanup the tempdir directory after a PAML run
    + Returns : none
    + Args    : none
    +
    +
    +=cut
    +
    +=head2 io
    +
    + Title   : io
    + Usage   : $obj->io($newval)
    + Function:  Gets a L object
    + Returns : L
    + Args    : none
    +
    +
    +=cut
    +
    +sub DESTROY {
    +    my $self= shift;
    +    unless ( $self->save_tempfiles ) {
    +	$self->cleanup();
    +    }
    +    $self->SUPER::DESTROY();
    +}
    +
    +1;
    +__END__
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Run/WrapperBase.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Run/WrapperBase.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,354 @@
    +# $Id: WrapperBase.pm,v 1.7.2.2 2003/03/11 10:52:26 jason Exp $
    +#
    +# BioPerl module for Bio::Tools::Run::WrapperBase
    +#
    +# Cared for by Jason Stajich 
    +#
    +# Copyright Jason Stajich
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::Run::WrapperBase - A Base object for wrappers around executables
    +
    +=head1 SYNOPSIS
    +
    +  # do not use this object directly, it provides the following methods
    +  # for its subclasses
    +
    +  my $errstr = $obj->error_string();
    +  my $exe    = $obj->executable();
    +  $obj->save_tempfiles($booleanflag)
    +  my $outfile= $obj->outfile_name();
    +  my $tempdir= $obj->tempdir(); # get a temporary dir for executing
    +  my $io     = $obj->io;  # Bio::Root::IO object
    +  my $cleanup= $obj->cleanup(); # remove tempfiles
    +
    +  $obj->run({-arg1 => $value});
    +
    +=head1 DESCRIPTION
    +
    +This is a basic module from which to build executable wrapper modules.
    +It has some basic methods to help when implementing new modules.
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules. Send your comments and suggestions preferably to
    +the Bioperl mailing list.  Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org              - General discussion
    +  http://bioperl.org/MailList.shtml  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +of the bugs and their resolution. Bug reports can be submitted via
    +email or the web:
    +
    +  bioperl-bugs@bioperl.org
    +  http://bioperl.org/bioperl-bugs/
    +
    +=head1 AUTHOR - Jason Stajich
    +
    +Email jason@bioperl.org
    +
    +Describe contact details here
    +
    +=head1 CONTRIBUTORS
    +
    +Additional contributors names and emails here
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods.
    +Internal methods are usually preceded with a _
    +
    +=cut
    +
    +
    +# Let the code begin...
    +
    +
    +package Bio::Tools::Run::WrapperBase;
    +use vars qw(@ISA);
    +use strict;
    +
    +# Object preamble - inherits from Bio::Root::Root
    +
    +use Bio::Root::RootI;
    +use Bio::Root::IO;
    +
    +@ISA = qw(Bio::Root::RootI);
    +
    +=head2 run
    +
    + Title   : run
    + Usage   : $wrapper->run({ARGS HERE});
    + Function: Support generic running with args passed in
    +           as a hashref
    + Returns : Depends on the implementation, status OR data
    + Args    : hashref of named arguments
    +
    +
    +=cut
    +
    +sub run {
    +   my ($self,@args) = @_;
    +   $self->throw_not_implemented();
    +}
    +
    +
    +=head2 error_string
    +
    + Title   : error_string
    + Usage   : $obj->error_string($newval)
    + Function: Where the output from the last analysus run is stored.
    + Returns : value of error_string
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +sub error_string{
    +   my ($self,$value) = @_;
    +   if( defined $value) {
    +      $self->{'_error_string'} = $value;
    +    }
    +    return $self->{'_error_string'} || '';
    +}
    +
    +
    +=head2 no_param_checks
    +
    + Title   : no_param_checks
    + Usage   : $obj->no_param_checks($newval)
    + Function: Boolean flag as to whether or not we should
    +           trust the sanity checks for parameter values  
    + Returns : value of no_param_checks
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +sub no_param_checks{
    +   my ($self,$value) = @_;
    +   if( defined $value || ! defined $self->{'no_param_checks'} ) {
    +       $value = 0 unless defined $value;
    +      $self->{'no_param_checks'} = $value;
    +    }
    +    return $self->{'no_param_checks'};
    +}
    +
    +=head2 save_tempfiles
    +
    + Title   : save_tempfiles
    + Usage   : $obj->save_tempfiles($newval)
    + Function: 
    + Returns : value of save_tempfiles
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +sub save_tempfiles{
    +   my ($self,$value) = @_;
    +   if( defined $value) {
    +      $self->{'save_tempfiles'} = $value;
    +    }
    +    return $self->{'save_tempfiles'};
    +}
    +
    +=head2 outfile_name
    +
    + Title   : outfile_name
    + Usage   : my $outfile = $wrapper->outfile_name();
    + Function: Get/Set the name of the output file for this run
    +           (if you wanted to do something special)
    + Returns : string
    + Args    : [optional] string to set value to
    +
    +
    +=cut
    +
    +sub outfile_name{
    +   my ($self,$nm) = @_;
    +   if( defined $nm || ! defined $self->{'_outfilename'} ) { 
    +       $nm = 'mlc' unless defined $nm;
    +       $self->{'_outfilename'} = $nm;
    +   }
    +   return $self->{'_outfilename'};
    +}
    +
    +
    +=head2 tempdir
    +
    + Title   : tempdir
    + Usage   : my $tmpdir = $self->tempdir();
    + Function: Retrieve a temporary directory name (which is created)
    + Returns : string which is the name of the temporary directory
    + Args    : none
    +
    +
    +=cut
    +
    +sub tempdir{
    +   my ($self) = @_;
    +   
    +   unless( $self->{'_tmpdir'} ) {
    +       $self->{'_tmpdir'} = $self->io->tempdir(CLEANUP => ! $self->save_tempfiles );
    +   }
    +   unless( -d $self->{'_tmpdir'} ) { 
    +       mkdir($self->{'_tmpdir'},0777);
    +   }
    +   $self->{'_tmpdir'};
    +}
    +
    +=head2 cleanup
    +
    + Title   : cleanup
    + Usage   : $wrapper->cleanup();
    + Function: Will cleanup the tempdir directory after a PAML run
    + Returns : none
    + Args    : none
    +
    +
    +=cut
    +
    +sub cleanup{
    +   my ($self) = @_;
    +   $self->io->_io_cleanup();
    +   if( defined $self->{'_tmpdir'} &&
    +       -d $self->{'_tmpdir'} ) {
    +       $self->io->rmtree($self->{'_tmpdir'});
    +   }
    +}
    +
    +=head2 io
    +
    + Title   : io
    + Usage   : $obj->io($newval)
    + Function: Gets a L object
    + Returns : L object
    + Args    : none
    +
    +
    +=cut
    +
    +sub io{
    +   my ($self) = @_;
    +   unless( defined $self->{'io'} ) {
    +       $self->{'io'} = new Bio::Root::IO(-verbose => $self->verbose());
    +   }
    +    return $self->{'io'};
    +}
    +
    +=head2 version
    +
    + Title   : version
    + Usage   : $version = $wrapper->version()
    + Function: Returns the program version (if available)
    + Returns : string representing version of the program 
    + Args    : [Optional] value to (re)set version string
    +
    +
    +=cut
    +
    +sub version{
    +   my ($self,@args) = @_;
    +   return undef;
    +}
    +
    +=head2 executable
    +
    + Title   : executable
    + Usage   : my $exe = $factory->executable();
    + Function: Finds the full path to the executable
    + Returns : string representing the full path to the exe
    + Args    : [optional] name of executable to set path to
    +           [optional] boolean flag whether or not warn when exe is not found
    +
    +=cut
    +
    +sub executable{
    +   my ($self, $exe,$warn) = @_;
    +
    +   if( defined $exe ) {
    +     $self->{'_pathtoexe'} = $exe;
    +   }
    +   unless( defined $self->{'_pathtoexe'} ) {
    +       my $prog_path = $self->program_path;
    +       if( $prog_path && -e $prog_path && -x $prog_path ) {
    +           $self->{'_pathtoexe'} = $prog_path;
    +       } else {
    +           my $exe;
    +           if( ( $exe = $self->io->exists_exe($self->program_name) ) &&
    +               -x $exe ) {
    +               $self->{'_pathtoexe'} = $exe;
    +           } else {
    +               $self->warn("Cannot find executable for ".$self->program_name) if $warn;
    +               $self->{'_pathtoexe'} = undef;
    +           }
    +       }
    +   }
    +   $self->{'_pathtoexe'};
    +}
    +
    +=head2 program_path
    +
    + Title   : program_path
    + Usage   : my $path = $factory->program_path();
    + Function: Builds path for executable 
    + Returns : string representing the full path to the exe
    + Args    : none
    +
    +=cut
    +
    +sub program_path {
    +    my ($self) = @_;
    +    my @path;
    +    push @path, $self->program_dir if $self->program_dir;
    +    push @path, $self->program_name.($^O =~ /mswin/i ?'.exe':'');
    +
    +    return Bio::Root::IO->catfile(@path);
    +}
    +
    +=head2 program_dir
    +
    + Title   : program_dir
    + Usage   : my $dir = $factory->program_dir();
    + Function: Abstract get method for dir of program. To be implemented
    +           by wrapper.
    + Returns : string representing program directory 
    + Args    : none 
    +
    +=cut
    +
    +sub program_dir {
    +    my ($self) = @_;
    +    $self->throw_not_implemented();
    +}
    +
    +=head2 program_name
    +
    + Title   : program_name
    + Usage   : my $name = $factory->program_name();
    + Function: Abstract get method for name of program. To be implemented
    +           by wrapper.
    + Returns : string representing program name
    + Args    : none
    +
    +=cut
    +
    +sub program_name {
    +    my ($self) = @_;
    +    $self->throw_not_implemented();
    +}
    +
    +
    +1;
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Seg.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Seg.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,186 @@
    +# $Id: Seg.pm,v 1.6 2002/10/22 07:45:22 lapp Exp $
    +#
    +# BioPerl module for Bio::Tools::Seg
    +#
    +# Copyright Balamurugan Kumarasamy
    +#
    +# You may distribute this module under the same terms as perl itself
    +#
    +# POD documentation - main docs before the code
    +#
    +# Copyright 
    +#
    +# You may distribute this module under the same terms as perl itself
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::Seg - parse Seg output (filter low complexity protein sequence)
    +
    +=head1 SYNOPSIS
    +
    +  use Bio::Tools::Seg;
    +  my $parser = new Bio::Tools::Seg(-fh =>$filehandle );
    +  while( my $seg_feat = $parser->next_result ) {
    +        #do something
    +        #eg
    +        push @seg_feat, $seg_feat;
    +  }
    +
    +=head1 DESCRIPTION
    +
    +Parser for Seg output
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules. Send your comments and suggestions preferably to
    +the Bioperl mailing list.  Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org              - General discussion
    +  http://bioperl.org/MailList.shtml  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +of the bugs and their resolution. Bug reports can be submitted via
    +email or the web:
    +
    + bioperl-bugs@bioperl.org
    + http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR - Bala
    +
    +Email savikalpa@fugu-sg.org
    +
    +
    +=head1 CONTRIBUTORS
    +
    +Additional contributors names and emails here
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods.
    +Internal methods are usually preceded with a _
    +
    +=cut
    +
    +package Bio::Tools::Seg;
    +use vars qw(@ISA);
    +use strict;
    +
    +use Bio::Root::Root;
    +use Bio::SeqFeature::FeaturePair;
    +use Bio::Root::IO;
    +use Bio::SeqFeature::Generic;
    +@ISA = qw(Bio::Root::Root Bio::Root::IO);
    +
    +
    +
    +
    +
    +=head2 new
    +
    + Title   : new
    + Usage   : my $obj = new Bio::Tools::Seg();
    + Function: Builds a new Bio::Tools::Seg object
    + Returns : Bio::Tools::Seg
    + Args    : -fh/-file => $val, # for initing input, see Bio::Root::IO
    +
    +
    +=cut
    +
    +
    +sub new {
    +      my($class,@args) = @_;
    +
    +      my $self = $class->SUPER::new(@args);
    +      $self->_initialize_io(@args);
    +
    +      return $self;
    +}
    +
    +=head2 next_result
    +
    + Title   : next_result
    + Usage   : my $feat = $seg->next_result
    + Function: Get the next result set from parser data
    + Returns : Bio::SeqFeature::Generic
    + Args    : none
    +
    +
    +=cut
    +
    +sub next_result {
    +        my ($self) = @_;
    +
    +        my $line;
    +        # parse
    +        my $id;
    +        while ($_=$self->_readline()) {
    +         $line = $_;
    +         chomp $line;
    +
    +          next if /^$/;
    +           if ($line=~/^\>/) { #if it is a line starting with a ">"
    +               $line=~/^\>\s*(\S+)\s*\((\d+)\-(\d+)\)\s*complexity=(\S+)/;
    +               my $id = $1;
    +               my $start = $2;
    +               my $end = $3;
    +               my $score = $4;
    +
    +               #for example in this line test_prot(214-226) complexity=2.26 (12/2.20/2.50)
    +               #$1 is test_prot  $2 is 214 $3 is 226 and $4 is 2.26
    +
    +               my (%feature);
    +               $feature{name} = $id;
    +               $feature{score} = $score;
    +               $feature{start} = $start;
    +               $feature{end} = $end;
    +               $feature{source} = "Seg";
    +               $feature{primary} = 'low_complexity';
    +               $feature{program} = "Seg";
    +               $feature{logic_name} = 'low_complexity';
    +               my $new_feat =  $self->create_feature (\%feature);
    +               return $new_feat;
    +            }
    +          next;
    +        }
    +
    +}
    +
    +
    +=head2 create_feature 
    +
    + Title   : create_feature
    + Usage   : obj->create_feature(\%feature)
    + Function: Internal(not to be used directly)
    + Returns : 
    + Args    :
    +
    +
    +=cut
    +
    +sub create_feature {
    +       my ($self, $feat) = @_;
    +
    +
    +       # create feature object
    +       my $feature = Bio::SeqFeature::Generic->new(-seq_id => $feat->{name},
    +                                                   -start  => $feat->{start},
    +                                                   -end    => $feat->{end},
    +                                                   -score  => $feat->{score},
    +                                                   -source => $feat->{source},
    +                                                   -primary => $feat->{primary},
    +                                                   -logic_name  => $feat->{logic_name}, 
    +                                               );
    +
    +          return $feature;
    +
    +}
    +
    +1;
    +
    +
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/SeqAnal.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/SeqAnal.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,834 @@
    +#-------------------------------------------------------------------------------
    +# PACKAGE : Bio::Tools::SeqAnal
    +# PURPOSE : To provide a base class for different sequence analysis tools.
    +# AUTHOR  : Steve Chervitz (sac@bioperl.org)
    +# CREATED : 27 Mar 1998
    +# REVISION: $Id: SeqAnal.pm,v 1.12 2002/10/22 07:38:46 lapp Exp $
    +# STATUS  : Alpha
    +#
    +# For documentation, run this module through pod2html
    +# (preferably from Perl v5.004 or better).
    +#-------------------------------------------------------------------------------
    +
    +package Bio::Tools::SeqAnal;
    +
    +use Bio::Root::Object ();
    +use Bio::Root::Global qw(:std);
    +
    +use strict;
    +use vars qw($ID $VERSION @ISA);
    +
    +@ISA        = qw( Bio::Root::Object );
    +$ID = 'Bio::Tools::SeqAnal';
    +$VERSION  = 0.011;
    +
    +
    +## POD Documentation:
    +
    +=head1 NAME
    +
    +Bio::Tools::SeqAnal - Bioperl sequence analysis base class.
    +
    +=head1 SYNOPSIS
    +
    +=head2 Object Creation
    +
    +This module is an abstract base class. Perl will let you instantiate it,
    +but it provides little functionality on its own. This module
    +should be used via a specialized subclass. See L<_initialize()|_initialize>
    +for a description of constructor parameters.
    +
    +    require Bio::Tools::SeqAnal;
    +
    +To run and parse a new report:
    +
    +    $hit = new Bio::Tools::SeqAnal ( -run   => \%runParams,
    +				     -parse => 1);
    +
    +To parse an existing report:
    +
    +    $hit = new Bio::Tools::SeqAnal ( -file  => 'filename.data',
    +				     -parse => 1);
    +
    +To run a report without parsing:
    +
    +    $hit = new Bio::Tools::SeqAnal ( -run   => \%runParams
    +				     );
    +
    +To read an existing report without parsing:
    +
    +    $hit = new Bio::Tools::SeqAnal ( -file  => 'filename.data',
    +				     -read  => 1);
    +
    +
    +=head1 INSTALLATION
    +
    +This module is included with the central Bioperl distribution:
    +
    +   http://bio.perl.org/Core/Latest
    +   ftp://bio.perl.org/pub/DIST
    +
    +Follow the installation instructions included in the README file.
    +
    +
    +=head1 DESCRIPTION
    +
    +Bio::Tools::SeqAnal.pm is a base class for specialized
    +sequence analysis modules such as B and B.
    +It provides some basic data and functionalities that are not unique to
    +a specialized module such as:
    +
    +=over 4
    +
    +=item * reading raw data into memory.
    +
    +=item * storing name and version of the program.
    +
    +=item * storing name of the query sequence.
    +
    +=item * storing name and version of the database.
    +
    +=item * storing & determining the date on which the analysis was performed.
    +
    +=item * basic file manipulations (compress, uncompress, delete).
    +
    +=back
    +
    +Some of these functionalities (reading, file maipulation) are inherited from
    +B, from which Bio::Tools::SeqAnal.pm derives.
    +
    +
    +
    +=head1 RUN, PARSE, and READ
    +
    +A SeqAnal.pm object can be created using one of three modes: run, parse, or read.
    +
    +  MODE      DESCRIPTION
    +  -----     -----------
    +  run       Run a new sequence analysis report. New results can then
    +            be parsed or saved for analysis later.
    +
    +  parse     Parse the data from a sequence analysis report loading it
    +            into the SeqAnal.pm object.
    +
    +  read      Read in data from an existing raw analysis report without
    +            parsing it. In the future, this may also permit persistent
    +            SeqAnal.pm objects. This mode is considered experimental.
    +
    +The mode is set by supplying switches to the constructor, see L<_initialize()|_initialize>.
    +
    +
    +
    +A key feature of SeqAnal.pm is the ability to access raw data in a
    +generic fashion. Regardless of what sequence analysis method is used,
    +the raw data always need to be read into memory.  The SeqAnal.pm class
    +utilizes the L method inherited from
    +B to permit the following:
    +
    +=over 4
    +
    +=item * read from a file or STDIN.
    +
    +=item * read a single record or a stream containing multiple records.
    +
    +=item * specify a record separator.
    +
    +=item * store all input data in memory or process the data stream as it is being read.
    +
    +=back
    +
    +By permitting the parsing of data as it is being read, each record can be
    +analyzed as it is being read and saved or discarded as necessary.
    +This can be useful when cruching through thousands of reports.
    +For examples of this, see the L methods defined in B and
    +B.
    +
    +
    +=head2 Parsing & Running
    +
    +Parsing and running of sequence analysis reports must be implemented for each
    +specific subclass of SeqAnal.pm. No-op stubs ("virtual methods") are provided here for
    +the L and L methods. See B and B
    +for examples.
    +
    +
    +=head1 DEPENDENCIES
    +
    +Bio::Tools::SeqAnal.pm is a concrete class that inherits from B.
    +This module also makes use of a number of functionalities inherited from
    +B (file manipulations such as reading, compressing, decompressing,
    +deleting, and obtaining date.
    +
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this and other Bioperl modules.
    +Send your comments and suggestions preferably to one of the Bioperl mailing lists.
    +Your participation is much appreciated.
    +
    +    bioperl-l@bioperl.org          - General discussion
    +    http://bio.perl.org/MailList.html             - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and
    +their resolution. Bug reports can be submitted via email or the web:
    +
    +    bioperl-bugs@bio.perl.org
    +    http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR
    +
    +Steve Chervitz, sac@bioperl.org
    +
    +See the L section for where to send bug reports and comments.
    +
    +=head1 VERSION
    +
    +Bio::Tools::SeqAnal.pm, 0.011
    +
    +=head1 COPYRIGHT
    +
    +Copyright (c) 1998 Steve Chervitz. All Rights Reserved.
    +This module is free software; you can redistribute it and/or
    +modify it under the same terms as Perl itself.
    +
    +
    +=head1 SEE ALSO
    +
    + http://bio.perl.org/Projects/modules.html  - Online module documentation
    + http://bio.perl.org/Projects/Blast/        - Bioperl Blast Project
    + http://bio.perl.org/                       - Bioperl Project Homepage
    +
    +
    +=cut
    +
    +
    +
    +#
    +##
    +###
    +#### END of main POD documentation.
    +###
    +##
    +#
    +
    +=head1 APPENDIX
    +
    +Methods beginning with a leading underscore are considered private
    +and are intended for internal use by this module. They are
    +B considered part of the public interface and are described here
    +for documentation purposes only.
    +
    +=cut
    +
    +##############################################################################
    +##                          CONSTRUCTOR                                     ##
    +##############################################################################
    +
    +
    +=head2 _initialize
    +
    + Usage     : n/a; automatically called by Bio::Root::Object::new()
    + Purpose   : Calls private methods to extract the raw report data,
    +           : Calls superclass constructor first (Bio::Root::Object.pm).
    + Returns   : string containing the make parameter value.
    + Argument  : Named parameters (TAGS CAN BE ALL UPPER OR ALL LOWER CASE).
    +           : The SeqAnal.pm constructor only processes the following
    +           : parameters passed from new()
    +           :     -RUN     => hash reference for named parameters to be used
    +           :                 for running a sequence analysis program.
    +           :                 These are dereferenced and passed to the run() method.
    +	   :     -PARSE   => boolean,
    +	   :     -READ    => boolean,
    +           :
    +           : If -RUN is HASH ref, the run() method will be called with the
    +           :   dereferenced hash.
    +           : If -PARSE is true, all parameters passed from new() are passed
    +           :   to the parse() method. This occurs after the run method call
    +           :   to enable combined running + parsing.
    +           : If -READ is true, all parameters passed from new() are passed
    +           :   to the read() method.
    +           : Either -PARSE or -READ should be true, not both.
    + Comments  : Does not calls _rearrange() to handle parameters since only
    +           : a few are required and there may be potentially many.
    +
    +See Also   : B, B
    +
    +=cut
    +
    +#-----------------
    +sub _initialize {
    +#-----------------
    +    my( $self, %param ) = @_;
    +
    +    my $make = $self->SUPER::_initialize(%param);
    +
    +    my($read, $parse, $runparam) = (
    +	($param{-READ}||$param{'-read'}), ($param{-PARSE}||$param{'-parse'}),
    +	($param{-RUN}||$param{'-run'})
    +				    );
    +
    +#	$self->_rearrange([qw(READ PARSE RUN)], @param);
    +	
    +    # Issue: How to keep all the arguments for running the analysis
    +    # separate from other arguments needed for parsing the results, etc?
    +    # Solution: place all the run arguments in a separate hash.
    +
    +    $self->run(%$runparam) if ref $runparam eq 'HASH';
    +
    +    if($parse) { $self->parse(%param); }
    +    elsif($read) { $self->read(%param) }
    +
    +    $make;
    +}
    +
    +#--------------
    +sub destroy {
    +#--------------
    +    my $self=shift;
    +    $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}";
    +    undef $self->{'_rawData'};
    +    $self->SUPER::destroy;
    +}
    +
    +
    +###############################################################################
    +#                                 ACCESSORS
    +###############################################################################
    +
    +# The mode of the SeqAnal object is no longer explicitly set.
    +# This simplifies the interface somewhat.
    +
    +##----------------------------------------------------------------------
    +#=head2 mode()
    +
    +# Usage     : $object->mode();
    +#	    :
    +# Purpose   : Set/Get the mode for the sequence analysis object.
    +#	    :
    +# Returns   : String
    +#	    :
    +# Argument  : n/a
    +#	    :
    +#	    :
    +# Comments  : The mode specifies how much detail to extract from the
    +#	    : sequence analysis report. There are three modes:
    +#	    :
    +#	    :    'parse' -- Parse the sequence analysis output data.
    +#	    :
    +#	    :     'read' -- Reads in the raw report but does not
    +#	    :               attempt to parse it. Useful when you just
    +#	    :               want to work with the output as-is
    +#	    :               (e.g., create HTML-formatted output).
    +#	    :
    +#	    :     'run'  -- Generates a new report.
    +#	    :
    +#	    : Allowable modes are defined by the exported package global array
    +#	    : @SeqAnal_modes.
    +#
    +#See Also   : _set_mode()
    +#=cut
    +##----------------------------------------------------------------------
    +#sub mode {
    +#    my $self = shift;
    +#    if(@_) { $self->{'_mode'} = lc(shift); }
    +#    $self->{'_mode'};
    +#}
    +#
    +
    +
    +=head2 best
    +
    + Usage     : $object->best();
    + Purpose   : Set/Get the indicator for processing only the best match.
    + Returns   : Boolean (1 | 0)
    + Argument  : n/a
    +
    +=cut
    +
    +#----------
    +sub best {
    +#----------
    +    my $self = shift;
    +    if(@_) { $self->{'_best'} = shift; }
    +    $self->{'_best'};
    +}
    +
    +
    +
    +=head2 _set_db_stats
    +
    + Usage     : $object->_set_db_stats();
    + Purpose   : Set stats about the database searched.
    + Returns   : String
    + Argument  : named parameters:
    +           :   -LETTERS =>   (number of letters in db)
    +           :   -SEQS    =>   (number of sequences in db)
    +
    +=cut
    +
    +#-------------------
    +sub _set_db_stats {
    +#-------------------
    +    my ($self, %param) = @_;
    +
    +    $self->{'_db'}        ||= $param{-NAME}    || '';
    +    $self->{'_dbRelease'}   = $param{-RELEASE} || '';
    +    ($self->{'_dbLetters'}  = $param{-LETTERS} || 0)  =~ s/,//g;
    +    ($self->{'_dbSeqs'}     = $param{-SEQS}    || 0) =~ s/,//g;
    +
    +}
    +
    +
    +=head2 database
    +
    + Usage     : $object->database();
    + Purpose   : Set/Get the name of the database searched.
    + Returns   : String
    + Argument  : n/a
    +
    +=cut
    +
    +#---------------
    +sub database {
    +#---------------
    +    my $self = shift;
    +    if(@_) { $self->{'_db'} = shift; }
    +    $self->{'_db'};
    +}
    +
    +
    +
    +=head2 database_release
    +
    + Usage     : $object->database_release();
    + Purpose   : Set/Get the release date of the queried database.
    + Returns   : String
    + Argument  : n/a
    +
    +=cut
    +
    +#-----------------------
    +sub database_release {
    +#-----------------------
    +    my $self = shift;
    +    if(@_) { $self->{'_dbRelease'} = shift; }
    +    $self->{'_dbRelease'};
    +}
    +
    +
    +=head2 database_letters
    +
    + Usage     : $object->database_letters();
    + Purpose   : Set/Get the number of letters in the queried database.
    + Returns   : Integer
    + Argument  : n/a
    +
    +=cut
    +
    +#----------------------
    +sub database_letters {
    +#----------------------
    +    my $self = shift;
    +    if(@_) { $self->{'_dbLetters'} = shift; }
    +    $self->{'_dbLetters'};
    +}
    +
    +
    +
    +=head2 database_seqs
    +
    + Usage     : $object->database_seqs();
    + Purpose   : Set/Get the number of sequences in the queried database.
    + Returns   : Integer
    + Argument  : n/a
    +
    +=cut
    +
    +#------------------
    +sub database_seqs {
    +#------------------
    +    my $self = shift;
    +    if(@_) { $self->{'_dbSeqs'} = shift; }
    +    $self->{'_dbSeqs'};
    +}
    +
    +
    +
    +=head2 set_date
    +
    + Usage     : $object->set_date([]);
    + Purpose   : Set the name of the date on which the analysis was performed.
    + Argument  : The optional string argument ca be the date or the
    +           : string 'file' in which case the date will be obtained from
    +           : the report file
    + Returns   : String
    + Throws    : Exception if no date is supplied and no file exists.
    + Comments  : This method attempts to set the date in either of two ways:
    +           :   1) using data passed in as an argument,
    +           :   2) using the Bio::Root::Utilities.pm file_date() method
    +           :      on the output file.
    +           : Another way is to extract the date from the contents of the
    +           : raw output data. Such parsing will have to be specialized
    +           : for different seq analysis reports. Override this method
    +           : to create such custom parsing code if desired.
    +
    +See Also   : L, B
    +
    +=cut
    +
    +#---------------
    +sub set_date {
    +#---------------
    +    my $self = shift;
    +    my $date = shift;
    +    my ($file);
    +
    +    if( !$date and ($file = $self->file)) {
    +	# If no date is passed and a file exists, determine date from the file.
    +	# (provided by superclass Bio::Root::Object.pm)
    +	eval {
    +	    $date = $self->SUPER::file_date(-FMT => 'd m y');
    +	};
    +	if($@) {
    +	    $date = 'UNKNOWN';
    +	    $self->warn("Can't set date of report.");
    +	}
    +    }
    +    $self->{'_date'} = $date;
    +}
    +
    +
    +
    +=head2 date
    +
    + Usage     : $object->date();
    + Purpose   : Get the name of the date on which the analysis was performed.
    + Returns   : String
    + Argument  : n/a
    + Comments  : This method is not a combination set/get, it only gets.
    +
    +See Also   : L
    +
    +=cut
    +
    +#----------
    +sub date {  my $self = shift;  $self->{'_date'}; }
    +#----------
    +
    +
    +
    +
    +=head2 length
    +
    + Usage     : $object->length();
    + Purpose   : Set/Get the length of the query sequence (number of monomers).
    + Returns   : Integer
    + Argument  : n/a
    + Comments  : Developer note: when using the built-in length function within
    +           : this module, call it as CORE::length().
    +
    +=cut
    +
    +#------------
    +sub length {
    +#------------
    +    my $self = shift;
    +    if(@_) { $self->{'_length'} = shift; }
    +    $self->{'_length'};
    +}
    +
    +=head2 program
    +
    + Usage     : $object->program();
    + Purpose   : Set/Get the name of the sequence analysis (BLASTP, FASTA, etc.)
    + Returns   : String
    + Argument  : n/a
    +
    +=cut
    +
    +#-------------
    +sub program {
    +#-------------
    +    my $self = shift;
    +    if(@_) { $self->{'_prog'} = shift; }
    +    $self->{'_prog'};
    +}
    +
    +
    +
    +=head2 program_version
    +
    + Usage     : $object->program_version();
    + Purpose   : Set/Get the version number of the sequence analysis program.
    +           : (e.g., 1.4.9MP, 2.0a19MP-WashU).
    + Returns   : String
    + Argument  : n/a
    +
    +=cut
    +
    +#---------------------
    +sub program_version {
    +#---------------------
    +    my $self = shift;
    +    if(@_) { $self->{'_progVersion'} = shift; }
    +    $self->{'_progVersion'};
    +}
    +
    +
    +=head2 query
    +
    + Usage     : $name = $object->query();
    + Purpose   : Get the name of the query sequence used to generate the report.
    + Argument  : n/a
    + Returns   : String
    + Comments  : Equivalent to $object->name().
    +
    +=cut
    +
    +#--------
    +sub query { my $self = shift; $self->name; }
    +#--------
    +
    +
    +=head2 query_desc
    +
    + Usage     : $object->desc();
    + Purpose   : Set/Get the description of the query sequence for the analysis.
    + Returns   : String
    + Argument  : n/a
    +
    +=cut
    +
    +#--------------
    +sub query_desc {
    +#--------------
    +    my $self = shift;
    +    if(@_) { $self->{'_qDesc'} = shift; }
    +    $self->{'_qDesc'};
    +}
    +
    +
    +
    +
    +=head2 display
    +
    + Usage     : $object->display();
    + Purpose   : Display information about Bio::Tools::SeqAnal.pm data members.
    +           : Overrides Bio::Root::Object::display().
    + Example   : $object->display(-SHOW=>'stats');
    + Argument  : Named parameters: -SHOW  => 'file' | 'stats'
    +           :                   -WHERE => filehandle (default = STDOUT)
    + Returns   : n/a
    + Status    : Experimental
    +
    +See Also   : L<_display_stats()|_display_stats>, L<_display_file()|_display_file>, B
    +
    +=cut
    +
    +#---------------
    +sub display {
    +#---------------
    +    my( $self, %param ) = @_;
    +
    +    $self->SUPER::display(%param);
    +
    +    my $OUT = $self->fh();
    +    $self->show =~ /file/i and $self->_display_file($OUT);
    +    1;
    +}
    +
    +
    +
    +=head2 _display_file
    +
    + Usage     : n/a; called automatically by display()
    + Purpose   : Print the contents of the raw report file.
    + Example   : n/a
    + Argument  : one argument = filehandle object.
    + Returns   : true (1)
    + Status    : Experimental
    +
    +See Also   : L
    +
    +=cut
    +
    +#------------------
    +sub _display_file {
    +#------------------
    +    my( $self, $OUT) = @_;
    +
    +    print $OUT scalar($self->read);
    +    1;
    +}
    +
    +
    +
    +=head2 _display_stats
    +
    + Usage     : n/a; called automatically by display()
    + Purpose   : Display information about Bio::Tools::SeqAnal.pm data members.
    +           : Prints the file name, program, program version, database name,
    +           : database version, query name, query length,
    + Example   : n/a
    + Argument  : one argument = filehandle object.
    + Returns   : printf call.
    + Status    : Experimental
    +
    +See Also   : B
    +
    +=cut
    +
    +#--------------------
    +sub _display_stats {
    +#--------------------
    +    my( $self, $OUT ) = @_;
    +
    +    printf( $OUT "\n%-15s: %s\n", "QUERY NAME", $self->query ||'UNKNOWN' );
    +    printf( $OUT "%-15s: %s\n", "QUERY DESC", $self->query_desc || 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "LENGTH", $self->length || 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "FILE", $self->file || 'STDIN');
    +    printf( $OUT "%-15s: %s\n", "DATE", $self->date || 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "PROGRAM", $self->program || 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "VERSION", $self->program_version || 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "DB-NAME", $self->database || 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "DB-RELEASE", ($self->database_release || 'UNKNOWN'));
    +    printf( $OUT "%-15s: %s\n", "DB-LETTERS", ($self->database_letters) ? $self->database_letters : 'UNKNOWN');
    +    printf( $OUT "%-15s: %s\n", "DB-SEQUENCES", ($self->database_seqs) ? $self->database_seqs : 'UNKNOWN');
    +}
    +
    +
    +#####################################################################################
    +##                                 VIRTUAL METHODS                                 ##
    +#####################################################################################
    +
    +=head1 VIRTUAL METHODS
    +
    +=head2 parse
    +
    + Usage     : $object->parse( %named_parameters )
    + Purpose   : Parse a raw sequence analysis report.
    + Returns   : Integer (number of sequence analysis reports parsed).
    + Argument  : Named parameters.
    + Throws    : Exception: virtual method not defined.
    +           : Propagates any exception thrown by read()
    + Status    : Virtual
    + Comments  : This is virtual method that should be overridden to
    +           : parse a specific type of data.
    +
    +See Also   : B
    +
    +=cut
    +
    +#---------
    +sub parse {
    +#---------
    +    my ($self, @param) = @_;
    +
    +    $self->throw("Virtual method parse() not defined ${ref($self)} objects.");
    +
    +    # The first step in parsing is reading in the data:
    +    $self->read(@param);
    +}
    +
    +
    +
    +=head2 run
    +
    + Usage     : $object->run( %named_parameters )
    + Purpose   : Run a sequence analysis program on one or more sequences.
    + Returns   : n/a
    +           : Run mode should be configurable to return a parsed object or
    +           : the raw results data.
    + Argument  : Named parameters:
    + Throws    : Exception: virtual method not defined.
    + Status    : Virtual
    +
    +=cut
    +
    +#--------
    +sub run {
    +#--------
    +    my ($self, %param) = @_;
    +    $self->throw("Virtual method run() not defined ${ref($self)} objects.");
    +}
    +
    +
    +1;
    +__END__
    +
    +#####################################################################################
    +#                                END OF CLASS                                       #
    +#####################################################################################
    +
    +
    +=head1 FOR DEVELOPERS ONLY
    +
    +=head2 Data Members
    +
    +Information about the various data members of this module is provided for those
    +wishing to modify or understand the code. Two things to bear in mind:
    +
    +=over 4
    +
    +=item 1 Do NOT rely on these in any code outside of this module.
    +
    +All data members are prefixed with an underscore to signify that they are private.
    +Always use accessor methods. If the accessor doesn't exist or is inadequate,
    +create or modify an accessor (and let me know, too!).
    +
    +=item 2 This documentation may be incomplete and out of date.
    +
    +It is easy for these data member descriptions to become obsolete as
    +this module is still evolving. Always double check this info and search
    +for members not described here.
    +
    +=back
    +
    +An instance of Bio::Tools::SeqAnal.pm is a blessed reference to a hash containing
    +all or some of the following fields:
    +
    + FIELD           VALUE
    + --------------------------------------------------------------
    +  _file            Full path to file containing raw sequence analysis report.
    +
    +  _mode            Affects how much detail to extract from the raw report.
    + 		   Future mode will also distinguish 'running' from 'parsing'
    +
    +
    + THE FOLLOWING MAY BE EXTRACTABLE FROM THE RAW REPORT FILE:
    +
    +  _prog            Name of the sequence analysis program.
    +
    +  _progVersion     Version number of the program.
    +
    +  _db              Database searched.
    +
    +  _dbRelease       Version or date of the database searched.
    +
    +  _dbLetters       Total number of letters in the database.
    +
    +  _dbSequences     Total number of sequences in the database.
    +
    +  _query           Name of query sequence.
    +
    +  _length          Length of the query sequence.
    +
    +  _date            Date on which the analysis was performed.
    +
    +
    +  INHERITED DATA MEMBERS
    +
    +  _name            From Bio::Root::Object.pm. String representing the name of the query sequence.
    + 		   Typically obtained from the report file.
    +
    +  _parent          From Bio::Root::Object.pm. This member contains a reference to the
    + 		   object to which this seq anal report belongs. Optional & experimenta.
    +                   (E.g., a protein object could create and own a Blast object.)
    +
    +=cut
    +
    +1;
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/SeqPattern.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/SeqPattern.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,926 @@
    +# $Id: SeqPattern.pm,v 1.14 2002/10/22 07:38:46 lapp Exp $
    +#
    +# bioperl module for Bio::Tools::SeqPattern
    +#
    +# Cared for by  Steve Chervitz  (sac@bioperl.org)
    +#
    +# Copyright  Steve Chervitz 
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::SeqPattern - Bioperl object for a sequence pattern or motif
    +
    +=head1 SYNOPSIS
    +
    +=head2 Object Creation
    +
    +    use Bio::Tools::SeqPattern ();
    +
    +    $pat1     = 'T[GA]AA...TAAT';
    +    $pattern1 = new Bio::Tools::SeqPattern(-SEQ =>$pattern, -TYPE =>'Dna'); 
    +
    +    $pat2     = '[VILM]R(GXX){3,2}...[^PG]';
    +    $pattern2 = new Bio::Tools::SeqPattern(-SEQ =>$pattern, -TYPE =>'Amino'); 
    +
    +=head1 DESCRIPTION
    +
    +The Bio::Tools::SeqPattern.pm module encapsulates generic data and
    +methods for manipulating regular expressions describing nucleic or
    +amino acid sequence patterns (a.k.a, "motifs").
    +
    +Bio::Tools::SeqPattern.pm is a concrete class that inherits from
    +B.
    +
    +This class grew out of a need to have a standard module for doing routine
    +tasks with sequence patterns such as:
    +
    +  -- Forming a reverse-complement version of a nucleotide sequence pattern
    +  -- Expanding patterns containing ambiguity codes
    +  -- Checking for invalid regexp characters
    +  -- Untainting yet preserving special characters in the pattern
    +
    +Other features to look for in the future:
    +
    +  -- Full pattern syntax checking
    +  -- Conversion between expanded and ondensed forms of the pattern
    +
    +=head1 MOTIVATIONS
    +
    +A key motivation for Bio::Tools::SeqPattern.pm is to have a way to
    +generate a reverse complement of a nucleotide sequence pattern.
    +This makes possible simultaneous pattern matching on both sense and 
    +anti-sense strands of a query sequence. 
    +
    +In principle, one could do such a search more inefficiently by testing 
    +against both sense and anti-sense versions of a sequence. 
    +It is entirely equivalent to test a regexp containing both sense and 
    +anti-sense versions of the *pattern* against one copy of the sequence.
    +The latter approach is much more efficient since:
    +
    +   1) You need only one copy of the sequence.
    +   2) Only one regexp is executed.
    +   3) Regexp patterns are typically much smaller than sequences.
    +
    +Patterns can be quite complex and it is often difficult to
    +generate the reverse complement pattern. The Bioperl SeqPattern.pm
    +addresses this problem, providing a convenient set of tools
    +for working with biological sequence regular expressions.
    +
    +Not all patterns have been tested. If you discover a pattern that
    +is not handled properly by Bio::Tools::SeqPattern.pm, please
    +send me some email (sac@bioperl.org). Thanks.
    +
    +=head1 OTHER FEATURES
    +
    +=head2 Extended Alphabet Support
    +
    +This module supports the same set of ambiguity codes for nucleotide 
    +sequences as supported by B. These ambiguity codes
    +define the behavior or the expand() method.
    +
    + ------------------------------------------
    + Symbol       Meaning      Nucleic Acid
    + ------------------------------------------
    +  A            A           Adenine
    +  C            C           Cytosine
    +  G            G           Guanine
    +  T            T           Thymine
    +  U            U           Uracil
    +  M          A or C  
    +  R          A or G        Any purine
    +  W          A or T    
    +  S          C or G     
    +  Y          C or T        Any pyrimidine
    +  K          G or T     
    +  V        A or C or G  
    +  H        A or C or T  
    +  D        A or G or T  
    +  B        C or G or T   
    +  X      G or A or T or C 
    +  N      G or A or T or C 
    +  .      G or A or T or C 
    +
    +
    +
    + ------------------------------------------
    + Symbol           Meaning   
    + ------------------------------------------
    + A        Alanine
    + C        Cysteine
    + D        Aspartic Acid
    + E        Glutamic Acid
    + F        Phenylalanine
    + G        Glycine
    + H        Histidine
    + I        Isoleucine
    + K        Lysine
    + L        Leucine
    + M        Methionine
    + N        Asparagine
    + P        Proline
    + Q        Glutamine
    + R        Arginine
    + S        Serine
    + T        Threonine
    + V        Valine
    + W        Tryptophan
    + Y        Tyrosine
    +
    + B        Aspartic Acid, Asparagine
    + Z        Glutamic Acid, Glutamine
    + X        Any amino acid
    + .        Any amino acid
    +
    +
    +=head2   Multiple Format Support
    +
    +Ultimately, this module should be able to build SeqPattern.pm objects
    +using a variety of pattern formats such as ProSite, Blocks, Prints, GCG, etc.
    +Currently, this module only supports patterns using a grep-like syntax.
    +
    +=head1 USAGE
    +
    +A simple demo script called seq_pattern.pl is included in the examples/
    +directory of the central Bioperl distribution.
    +
    +=head1 SEE ALSO
    +
    +L   - Base class.
    +L            - Lightweight sequence object.
    +
    +http://bio.perl.org/Projects/modules.html  - Online module documentation
    +http://bio.perl.org/                       - Bioperl Project Homepage 
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists 
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules.  Send your comments and suggestions preferably to one
    +of the Bioperl mailing lists.  Your participation is much appreciated.
    +
    +    bioperl-l@bioperl.org              - General discussion
    +    http://bio.perl.org/MailList.html  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution. Bug reports can be submitted via email
    +or the web:
    +
    +    bioperl-bugs@bio.perl.org
    +    http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR
    +
    +Steve Chervitz, sac@bioperl.org
    +
    +=head1 VERSION
    +
    +Bio::Tools::SeqPattern.pm, 0.011
    +
    +=head1 COPYRIGHT
    +
    +Copyright (c) 1997-8 Steve Chervitz. All Rights Reserved.
    +This module is free software; you can redistribute it and/or 
    +modify it under the same terms as Perl itself.
    +
    +=cut
    +
    +#
    +##
    +###
    +#### END of main POD documentation.
    +###
    +##
    +#'
    +# CREATED : 28 Aug 1997
    +
    +
    +package Bio::Tools::SeqPattern;
    +
    +use Bio::Root::Root;
    +@ISA = qw(Bio::Root::Root);
    +use strict;
    +use vars qw ($ID $VERSION);
    +$ID  = 'Bio::Tools::SeqPattern';
    +$VERSION = 0.011;
    +
    +## These constants may be more appropriate in a Bio::Dictionary.pm 
    +## type of class.
    +my $PURINES      = 'AG';
    +my $PYRIMIDINES  = 'CT';
    +my $BEE      = 'DN';
    +my $ZED      = 'EQ';
    +my $Regexp_chars = '\w,.\*()\[\]<>\{\}^\$';  # quoted for use in regexps
    +
    +## Package variables used in reverse complementing.
    +my (%Processed_braces, %Processed_asterics);
    +
    +#####################################################################################
    +##                                 CONSTRUCTOR                                     ##
    +#####################################################################################
    +
    +
    +=head1 new
    +
    + Title     : new
    + Usage     : my $seqpat = new Bio::Tools::SeqPattern();
    + Purpose   : Verifies that the type is correct for superclass (Bio::Seq.pm)
    +           : and calls superclass constructor last.
    + Returns   : n/a
    + Argument  : Parameters passed to new()
    + Throws    : Exception if the pattern string (seq) is empty.
    + Comments  : The process of creating a new SeqPattern.pm object
    +           : ensures that the pattern string is untained.
    +
    +See Also   : B, 
    +             B
    +
    +=cut
    +
    +#----------------
    +sub new {
    +#----------------
    +    my($class, %param) = @_;
    +    
    +    my $self = $class->SUPER::new(%param);
    +    my ($seq,$type) = $self->_rearrange([qw(SEQ TYPE)], %param);
    +
    +    $seq || $self->throw("Empty pattern.");
    +    my $t;
    +    # Get the type ready for Bio::Seq.pm
    +    if ($type =~ /nuc|[dr]na/i) {
    +	$t = 'Dna';
    +    } elsif ($type =~ /amino|pep|prot/i) {
    +	$t = 'Amino';
    +    }
    +    $seq =~ tr/a-z/A-Z/;  #ps 8/8/00 Canonicalize to upper case
    +    $self->str($seq);
    +    $self->type($t);
    +
    +    return $self;
    +}
    +
    +
    +=head1 alphabet_ok
    +
    + Title     : alphabet_ok
    + Usage     : $mypat->alphabet_ok;
    + Purpose   : Checks for invalid regexp characters.
    +           : Overrides Bio::Seq::alphabet_ok() to allow 
    +           : additional regexp characters ,.*()[]<>{}^$ 
    +           : in addition to the standard genetic alphabet.
    +           : Also untaints the pattern and sets the sequence
    +           : object's sequence to the untained string.
    + Returns   : Boolean (1 | 0)
    + Argument  : n/a
    + Throws    : Exception if the pattern contains invalid characters.
    + Comments  : Does not call the superclass method.
    +           : Actually permits any alphanumeric, not just the
    +           : standard genetic alphabet.
    +
    +=cut
    +
    +#----------------'
    +sub alphabet_ok {
    +#----------------
    +    my( $self) = @_;
    +
    +    return 1 if $self->{'_alphabet_checked'};
    +
    +    $self->{'_alphabet_checked'} = 1;
    +
    +    my $pat = $self->seq();
    +
    +    if($pat =~ /[^$Regexp_chars]/io) { 
    +	$self->throw("Pattern contains invalid characters: $pat",
    +		     'Legal characters: a-z,A-Z,0-9,,.*()[]<>{}^$ ');
    +    }
    +
    +    # Untaint pattern (makes code taint-safe).
    +    $pat  =~ /[$Regexp_chars]+/io; 
    +    $self->setseq(uc($&));
    +#    print STDERR "\npattern ok: $pat\n";
    +    1;
    +}
    +
    +=head1 expand
    +
    + Title     : expand
    + Usage     : $seqpat_object->expand();
    + Purpose   : Expands the sequence pattern using special ambiguity codes.
    + Example   : $pat = $seq_pat->expand();
    + Returns   : String containing fully expanded sequence pattern
    + Argument  : n/a
    + Throws    : Exception if sequence type is not recognized 
    +           : (i.e., is not one of [DR]NA, Amino)
    +
    +See Also   : B, L<_expand_pep>(), L<_expand_nuc>()
    +
    +=cut
    +
    +#----------
    +sub expand {
    +#----------
    +    my $self = shift;
    +
    +    if($self->type =~ /[DR]na/i) { $self->_expand_nuc(); }
    +    elsif($self->type =~ /Amino/i) { $self->_expand_pep(); }
    +    else{
    +	$self->throw("Don't know how to expand ${\$self->type} patterns.\n");
    +    }
    +}
    +
    +
    +=head1 _expand_pep
    +
    + Title     : _expand_pep
    + Usage     : n/a; automatically called by expand()
    + Purpose   : Expands peptide patterns
    + Returns   : String (the expanded pattern)
    + Argument  : String (the unexpanded pattern) 
    + Throws    : n/a
    +
    +See Also   : L(), L<_expand_nuc>()
    +
    +=cut
    +
    +#----------------
    +sub _expand_pep {
    +#----------------
    +    my ($self,$pat) = @_;
    +    $pat ||= $self->str;
    +    $pat =~ s/X/./g;
    +    $pat =~ s/^$/\$/;
    +
    +    ## Avoid nested situations: [bmnq] --/--> [[$ZED]mnq]
    +    ## Yet correctly deal with: fze[bmnq] ---> f[$BEE]e[$ZEDmnq]
    +    if($pat =~ /\[\w*[BZ]\w*\]/) {
    +	$pat =~ s/\[(\w*)B(\w*)\]/\[$1$ZED$2\]/g;
    +	$pat =~ s/\[(\w*)Z(\w*)\]/\[$1$BEE$2\]/g;
    +	$pat =~ s/B/\[$ZED\]/g;
    +	$pat =~ s/Z/\[$BEE\]/g;
    +    } else {
    +	$pat =~ s/B/\[$ZED\]/g;
    +	$pat =~ s/Z/\[$BEE\]/g;
    +    }
    +    $pat =~ s/\((.)\)/$1/g;  ## Doing these last since:
    +    $pat =~ s/\[(.)\]/$1/g;  ## Pattern could contain [B] (for example)
    +
    +    return $pat;
    +}
    +
    +
    +
    +=head1 _expand_nuc
    +
    + Title     : _expand_nuc
    + Purpose   : Expands nucleotide patterns
    + Returns   : String (the expanded pattern)
    + Argument  : String (the unexpanded pattern) 
    + Throws    : n/a
    +
    +See Also   : L(), L<_expand_pep>()
    +
    +=cut
    +
    +#---------------
    +sub _expand_nuc {
    +#---------------
    +    my ($self,$pat) = @_;
    +
    +    $pat ||= $self->str;
    +    $pat =~ s/N|X/./g;
    +    $pat =~ s/pu/R/ig;
    +    $pat =~ s/py/Y/ig;
    +    $pat =~ s/U/T/g;
    +    $pat =~ s/^$/\$/;
    +
    +    ## Avoid nested situations: [ya] --/--> [[ct]a]
    +    ## Yet correctly deal with: sg[ya] ---> [gc]g[cta]
    +    if($pat =~ /\[\w*[RYSWMK]\w*\]/) {
    +	$pat =~ s/\[(\w*)R(\w*)\]/\[$1$PURINES$2\]/g;
    +	$pat =~ s/\[(\w*)Y(\w*)\]/\[$1$PYRIMIDINES$2\]/g;
    +	$pat =~ s/\[(\w*)S(\w*)\]/\[$1GC$2\]/g;
    +	$pat =~ s/\[(\w*)W(\w*)\]/\[$1AT$2\]/g;
    +	$pat =~ s/\[(\w*)M(\w*)\]/\[$1AC$2\]/g;
    +	$pat =~ s/\[(\w*)K(\w*)\]/\[$1GT$2\]/g;
    +	$pat =~ s/\[(\w*)V(\w*)\]/\[$1ACG$2\]/g;
    +	$pat =~ s/\[(\w*)H(\w*)\]/\[$1ACT$2\]/g;
    +	$pat =~ s/\[(\w*)D(\w*)\]/\[$1AGT$2\]/g;
    +	$pat =~ s/\[(\w*)B(\w*)\]/\[$1CGT$2\]/g;
    +	$pat =~ s/R/\[$PURINES\]/g;
    +	$pat =~ s/Y/\[$PYRIMIDINES\]/g;
    +	$pat =~ s/S/\[GC\]/g;
    +	$pat =~ s/W/\[AT\]/g;
    +	$pat =~ s/M/\[AC\]/g;
    +	$pat =~ s/K/\[GT\]/g;
    +	$pat =~ s/V/\[ACG\]/g;
    +	$pat =~ s/H/\[ACT\]/g;
    +	$pat =~ s/D/\[AGT\]/g;
    +	$pat =~ s/B/\[CGT\]/g;
    +    } else {
    +	$pat =~ s/R/\[$PURINES\]/g;
    +	$pat =~ s/Y/\[$PYRIMIDINES\]/g;
    +	$pat =~ s/S/\[GC\]/g;
    +	$pat =~ s/W/\[AT\]/g;
    +	$pat =~ s/M/\[AC\]/g;
    +	$pat =~ s/K/\[GT\]/g;
    +	$pat =~ s/V/\[ACG\]/g;
    +	$pat =~ s/H/\[ACT\]/g;
    +	$pat =~ s/D/\[AGT\]/g;
    +	$pat =~ s/B/\[CGT\]/g;
    +    }
    +    $pat =~ s/\((.)\)/$1/g;  ## Doing thses last since:
    +    $pat =~ s/\[(.)\]/$1/g;  ## Pattern could contain [y] (for example)
    +
    +    return $pat;  
    +}
    +
    +
    +
    +=head1 revcom
    +
    + Title     : revcom
    + Usage     : revcom([1]);
    + Purpose   : Forms a pattern capable of recognizing the reverse complement
    +           : version of a nucleotide sequence pattern.
    + Example   : $pattern_object->revcom(); 
    +           : $pattern_object->revcom(1); ## returns expanded rev complement pattern.
    + Returns   : Object reference for a new Bio::Tools::SeqPattern containing
    +           : the revcom of the current pattern as its sequence.
    + Argument  : (1) boolean (optional) (default= false)
    +           :     true : expand the pattern before rev-complementing. 
    +           :     false: don't expand pattern before or after rev-complementing.
    + Throws    : Exception if called for amino acid sequence pattern.
    + Comments  : This method permits the simultaneous searching of both
    +           : sense and anti-sense versions of a nucleotide pattern
    +           : by means of a grep-type of functionality in which any
    +           : number of patterns may be or-ed into the recognition
    +           : pattern.
    +           : Overrides Bio::Seq::revcom() and calls it first thing. 
    +           : The order of _fixpat() calls is critical.
    +
    +See Also   : B, L<_fixpat_1>(), L<_fixpat_2>(), L<_fixpat_3>(), L<_fixpat_4>(), L<_fixpat_5>()
    +
    +=cut
    +
    +#-----------'
    +sub revcom {
    +#-----------
    +    my($self,$expand) = @_;
    +    
    +    if ($self->type !~ /Dna|Rna/i) {
    +	$self->throw("Can't get revcom for ${\$self->type} sequence types.\n");
    +    }
    +#    return $self->{'_rev'} if defined $self->{'_rev'};
    +
    +    $expand ||= 0;
    +    my $str = $self->str;
    +    $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
    +    my $rev = CORE::reverse $str;
    +    $rev    =~ tr/[](){}<>/][)(}{>_expand_nuc($rev);
    +#	print "\nExpanded: $rev\n";
    +    }
    +			      
    +    %Processed_braces = ();			      
    +    %Processed_asterics = ();			      
    +
    +    my $fixrev = _fixpat_1($rev);
    +#   print "FIX 1: $fixrev";;
    +
    +     $fixrev = _fixpat_2($fixrev);
    +#   print "FIX 2: $fixrev";;
    +
    +     $fixrev = _fixpat_3($fixrev);
    +#    print "FIX 3: $fixrev";;
    +
    +     $fixrev = _fixpat_4($fixrev);
    +#    print "FIX 4: $fixrev";;
    +    
    +     $fixrev = _fixpat_5($fixrev);
    +#    print "FIX 5: $fixrev";;
    +
    +##### Added by ps 8/7/00 to allow non-greedy matching
    +     $fixrev = _fixpat_6($fixrev);
    +#    print "FIX 6: $fixrev";;
    +
    +#    $self->{'_rev'} = $fixrev;
    +
    +     return new Bio::Tools::SeqPattern(-seq =>$fixrev, -type =>$self->type);
    +}
    +
    +
    +
    +=head1 _fixpat_1
    +
    + Title     : _fixpat_1
    + Usage     : n/a; called automatically by revcom()
    + Purpose   : Utility method for revcom()
    +           : Converts all {7,5} --> {5,7}     (Part I)
    +           :           and [T^] --> [^T]      (Part II)
    +           :           and *N   --> N*        (Part III)
    + Returns   : String (the new, partially reversed pattern)
    + Argument  : String (the expanded pattern)
    + Throws    : n/a
    +
    +See Also   : L()
    +
    +=cut
    +
    +#--------------
    +sub _fixpat_1 {
    +#--------------
    +    my $pat = shift;
    +    
    +    ## Part I:
    +    my (@done,@parts);
    +    while(1) {
    +	$pat =~ /(.*)\{(\S+?)\}(.*)/ or do{ push @done, $pat; last; };
    +	$pat = $1.'#{'.reverse($2).'}'.$3;
    +#	print "1: $1\n2: $2\n3: $3\n";
    +#	print "modified pat: $pat";;
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    $pat = join('', reverse @done);
    +
    +    ## Part II:
    +    @done = ();
    +    while(1) {
    +	$pat =~ /(.*)\[(\S+?)\](.*)/ or do{ push @done, $pat; last; };
    +	$pat = $1.'#['.reverse($2).']'.$3;
    +#	print "1: $1\n2: $2\n3: $3\n";
    +#	print "modified pat: $pat";;
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    $pat = join('', reverse @done);
    +
    +    ## Part III:
    +    @done = ();
    +    while(1) {
    +	$pat =~ /(.*)\*([\w.])(.*)/ or do{ push @done, $pat; last; };
    +	$pat = $1.'#'.$2.'*'.$3;
    +	$Processed_asterics{$2}++;
    +#	print "1: $1\n2: $2\n3: $3\n";
    +#	print "modified pat: $pat";;
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    return join('', reverse @done);
    +    
    +}
    +
    +
    +=head1 _fixpat_2
    +
    + Title     : _fixpat_2
    + Usage     : n/a; called automatically by revcom()
    + Purpose   : Utility method for revcom()
    +           : Converts all {5,7}Y ---> Y{5,7}
    +           :          and {10,}. ---> .{10,}
    + Returns   : String (the new, partially reversed pattern)
    + Argument  : String (the expanded, partially reversed pattern)
    + Throws    : n/a
    +
    +See Also   : L()
    +
    +=cut
    +
    +#--------------
    +sub _fixpat_2 {
    +#--------------
    +    my $pat = shift;
    +    
    +    local($^W) = 0;
    +    my (@done,@parts,$braces);
    +    while(1) {
    +#	$pat =~ s/(.*)([^])])(\{\S+?\})([\w.])(.*)/$1$2#$4$3$5/ or do{ push @done, $pat; last; };
    +	$pat =~ s/(.*)(\{\S+?\})([\w.])(.*)/$1#$3$2$4/ or do{ push @done, $pat; last; };
    +	$braces = $2;
    +	$braces =~ s/[{}]//g;
    +	$Processed_braces{"$3$braces"}++;
    +#	print "modified pat: $pat";;
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    return join('', reverse @done);
    +}
    +
    +
    +=head1 _fixpat_3
    +
    + Title     : _fixpat_3
    + Usage     : n/a; called automatically by revcom()
    + Purpose   : Utility method for revcom()
    +           : Converts all {5,7}(XXX) ---> (XXX){5,7}
    + Returns   : String (the new, partially reversed pattern)
    + Argument  : String (the expanded, partially reversed pattern)
    + Throws    : n/a
    +
    +See Also   : L()
    +
    +=cut
    +
    +#-------------
    +sub _fixpat_3 {
    +#-------------
    +    my $pat = shift;
    +    
    +    my (@done,@parts,$braces,$newpat,$oldpat);
    +    while(1) {
    +#	$pat =~ s/(.+)(\{\S+\})(\(\w+\))(.*)/$1#$3$2$4/ or do{ push @done, $pat; last; };
    +	if( $pat =~ /(.*)(.)(\{\S+\})(\(\w+\))(.*)/) {
    +	    $newpat = "$1#$2$4$3$5";
    +##ps	    $oldpat = "$1#$2$3$4$5";
    +#	    print "1: $1\n2: $2\n3: $3\n4: $4\n5: $5\n";
    +##ps	    $braces = $3;
    +##ps	    $braces =~ s/[{}]//g;
    +##ps	    if( exists $Processed_braces{"$2$braces"} || exists $Processed_asterics{$2}) {
    +##ps		$pat = $oldpat;  # Don't change it. Already processed.
    +#		print "saved pat: $pat";;
    +##ps	    } else {
    +#		print "new pat: $newpat";;
    +		$pat = $newpat;  # Change it.
    +##ps	    }
    +	} elsif( $pat =~ /^(\{\S+\})(\(\w+\))(.*)/) {
    +	    $pat = "#$2$1$3";
    +	} else { 
    +	    push @done, $pat; last; 
    +	}
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    return join('', reverse @done);
    +}
    +
    +
    +=head1 _fixpat_4
    +
    + Title     : _fixpat_4
    + Usage     : n/a; called automatically by revcom()
    + Purpose   : Utility method for revcom()
    +           : Converts all {5,7}[XXX] ---> [XXX]{5,7}
    + Returns   : String (the new, partially reversed pattern)
    + Argument  : String (the expanded, partially reversed  pattern)
    + Throws    : n/a
    +
    +See Also   : L()
    +
    +=cut
    +
    +#---------------
    +sub _fixpat_4 {
    +#---------------
    +    my $pat = shift;
    +    
    +    my (@done,@parts,$braces,$newpat,$oldpat);
    +    while(1) {
    +#	$pat =~ s/(.*)(\{\S+\})(\[\w+\])(.*)/$1#$3$2$4/ or do{ push @done, $pat; last; };
    +#	$pat =~ s/(.*)([^\w.])(\{\S+\})(\[\w+\])(.*)/$1$2#$4$3$5/ or do{ push @done, $pat; last; };
    +	if( $pat =~ /(.*)(.)(\{\S+\})(\[\w+\])(.*)/) {
    +	    $newpat = "$1#$2$4$3$5";
    +	    $oldpat = "$1#$2$3$4$5";
    +#	    print "1: $1\n2: $2\n3: $3\n4: $4\n5: $5\n";
    +	    $braces = $3;
    +	    $braces =~ s/[{}]//g;
    +	    if( (defined $braces and defined $2) and
    +		exists $Processed_braces{"$2$braces"} || exists $Processed_asterics{$2}) {
    +		$pat = $oldpat;  # Don't change it. Already processed.
    +#		print "saved pat: $pat";;
    +	    } else {
    +		$pat = $newpat;  # Change it.
    +#		print "new pat: $pat";;
    +	    }
    +	} elsif( $pat =~ /^(\{\S+\})(\[\w+\])(.*)/) {  
    +	    $pat = "#$2$1$3";
    +	} else { 
    +	    push @done, $pat; last; 
    +	}
    +	    
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    return join('', reverse @done);
    +}
    +
    +
    +=head1 _fixpat_5
    +
    + Title     : _fixpat_5
    + Usage     : n/a; called automatically by revcom()
    + Purpose   : Utility method for revcom()
    +           : Converts all *[XXX]  ---> [XXX]*
    +           :          and *(XXX)  ---> (XXX)*
    + Returns   : String (the new, partially reversed pattern)
    + Argument  : String (the expanded, partially reversed pattern)
    + Throws    : n/a
    +
    +See Also   : L()
    +
    +=cut
    +
    +#--------------
    +sub _fixpat_5 {
    +#--------------
    +    my $pat = shift;
    +    
    +    my (@done,@parts,$newpat,$oldpat);
    +    while(1) {
    +#	$pat =~ s/(.*)(\{\S+\})(\[\w+\])(.*)/$1#$3$2$4/ or do{ push @done, $pat; last; };
    +#	$pat =~ s/(.*)([^\w.])(\{\S+\})(\[\w+\])(.*)/$1$2#$4$3$5/ or do{ push @done, $pat; last; };
    +	if( $pat =~ /(.*)(.)\*(\[\w+\]|\(\w+\))(.*)/) {
    +	    $newpat = "$1#$2$3*$4";
    +	    $oldpat = "$1#$2*$3$4";
    +#	    print "1: $1\n2: $2\n3: $3\n4: $4\n";
    +	    if( exists $Processed_asterics{$2}) {
    +		$pat = $oldpat;  # Don't change it. Already processed.
    +#		print "saved pat: $pat";;
    +	    } else {
    +		$pat = $newpat;  # Change it.
    +#		print "new pat: $pat";;
    +	    }
    +	} elsif( $pat =~ /^\*(\[\w+\]|\(\w+\))(.*)/) {  
    +	    $pat = "#$1*$3";
    +	} else { 
    +	    push @done, $pat; last; 
    +	}
    +	    
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    return join('', reverse @done);
    +}
    +
    +
    +
    +
    +
    +############################
    +#
    +#  PS: Added 8/7/00 to allow non-greedy matching patterns
    +#
    +######################################
    +
    +=head1 _fixpat_6
    +
    + Title     : _fixpat_6
    + Usage     : n/a; called automatically by revcom()
    + Purpose   : Utility method for revcom()
    +           : Converts all ?Y{5,7}  ---> Y{5,7}?
    +           :          and ?(XXX){5,7}  ---> (XXX){5,7}?
    +           :          and ?[XYZ]{5,7}  ---> [XYZ]{5,7}?
    + Returns   : String (the new, partially reversed pattern)
    + Argument  : String (the expanded, partially reversed pattern)
    + Throws    : n/a
    +
    +See Also   : L()
    +
    +=cut
    +
    +#--------------
    +sub _fixpat_6 {
    +#--------------
    +    my $pat = shift;
    +    my (@done,@parts);
    +
    +   @done = ();
    +    while(1) {
    +	$pat =~   /(.*)\?(\[\w+\]|\(\w+\)|\w)(\{\S+?\})?(.*)/ or do{ push @done, $pat; last; };
    +     my $quantifier = $3 ? $3 : ""; # Shut up warning if no explicit quantifier
    + 	$pat = $1.'#'.$2.$quantifier.'?'.$4;
    +#	$pat = $1.'#'.$2.$3.'?'.$4;
    +
    +#	print "1: $1\n2: $2\n3: $3\n";
    +#	print "modified pat: $pat";;
    +	@parts = split '#', $pat;
    +	push @done, $parts[1];
    +	$pat = $parts[0];
    +#	print "done: $parts[1]<---\nnew pat: $pat<---";;
    +	last if not $pat;
    +    }
    +    return join('', reverse @done);
    +
    + }
    +
    +=head2 str
    +
    + Title   : str
    + Usage   : $obj->str($newval)
    + Function: 
    + Returns : value of str
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +sub str{
    +   my $obj = shift;
    +   if( @_ ) {
    +      my $value = shift;
    +      $obj->{'str'} = $value;
    +    }
    +    return $obj->{'str'};
    +
    +}
    +
    +=head2 type
    +
    + Title   : type
    + Usage   : $obj->type($newval)
    + Function: 
    + Returns : value of type
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +sub type{
    +   my $obj = shift;
    +   if( @_ ) {
    +      my $value = shift;
    +      $obj->{'type'} = $value;
    +    }
    +    return $obj->{'type'};
    +
    +}
    +
    +1;
    +
    +__END__
    +
    +#########################################################################
    +#  End of class 
    +#########################################################################
    +
    +=head1 FOR DEVELOPERS ONLY
    +
    +=head2 Data Members
    +
    +Information about the various data members of this module is provided
    +for those wishing to modify or understand the code. Two things to bear
    +in mind:
    +
    +=over 2
    +
    +=item 1 Do NOT rely on these in any code outside of this module. 
    +
    +All data members are prefixed with an underscore to signify that they
    +are private.  Always use accessor methods. If the accessor doesn't
    +exist or is inadequate, create or modify an accessor (and let me know,
    +too!).
    +
    +=item 2 This documentation may be incomplete and out of date.
    +
    +It is easy for this documentation to become obsolete as this module is
    +still evolving.  Always double check this info and search for members
    +not described here.
    +
    +=back
    +
    +An instance of Bio::Tools::RestrictionEnzyme.pm is a blessed reference
    +to a hash containing all or some of the following fields:
    +
    + FIELD          VALUE
    + ------------------------------------------------------------------------
    + _rev     : The corrected reverse complement of the fully expanded pattern.
    +
    + INHERITED DATA MEMBERS:
    +
    + _seq     : (From Bio::Seq.pm) The original, unexpanded input sequence after untainting.
    + _type    : (From Bio::Seq.pm) 'Dna' or 'Amino' 
    +
    +
    +=cut
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/SeqStats.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/SeqStats.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,712 @@
    +# $Id: SeqStats.pm,v 1.16.2.1 2003/02/28 13:17:06 heikki Exp $
    +#
    +# BioPerl module for Bio::Tools::SeqStats
    +#
    +# Cared for by
    +#
    +# Copyright Peter Schattner
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::SeqStats - Object holding statistics for one particular sequence
    +
    +=head1 SYNOPSIS
    +
    +  # build a primary nucleic acid or protein sequence object somehow
    +  # then build a statistics object from the sequence object
    +
    +  $seqobj = Bio::PrimarySeq->new(-seq=>'ACTGTGGCGTCAACTG',
    +                                 -alphabet=>'dna',
    +                                 -id=>'test');
    +  $seq_stats  =  Bio::Tools::SeqStats->new(-seq=>$seqobj);
    +
    +  # obtain a hash of counts of each type of monomer
    +  # (ie amino or nucleic acid)
    +  print "\nMonomer counts using statistics object\n";
    +  $seq_stats  =  Bio::Tools::SeqStats->new(-seq=>$seqobj);
    +  $hash_ref = $seq_stats->count_monomers();  # eg for DNA sequence
    +  foreach $base (sort keys %$hash_ref) {
    +      print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n";
    +  }
    +
    +  # or obtain the count directly without creating a new statistics object
    +  print "\nMonomer counts without statistics object\n";
    +  $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
    +  foreach $base (sort keys %$hash_ref) {
    +      print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n";
    +  }
    +
    +
    +  # obtain hash of counts of each type of codon in a nucleic acid sequence
    +  print "\nCodon counts using statistics object\n";
    +  $hash_ref = $seq_stats-> count_codons();  # for nucleic acid sequence
    +  foreach $base (sort keys %$hash_ref) {
    +      print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n";
    +  }
    +
    +  #  or
    +  print "\nCodon counts without statistics object\n";
    +  $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
    +  foreach $base (sort keys %$hash_ref) {
    +      print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n";
    +  }
    +
    +  # Obtain the molecular weight of a sequence. Since the sequence may contain
    +  # ambiguous monomers, the molecular weight is returned as a (reference to) a
    +  # two element array containing greatest lower bound (GLB) and least upper bound
    +  # (LUB) of the molecular weight
    +  $weight = $seq_stats->get_mol_wt();
    +  print "\nMolecular weight (using statistics object) of sequence ", $seqobj->id(),
    +       " is between ", $$weight[0], " and " ,
    +       $$weight[1], "\n";
    +
    +  #  or
    +  $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
    +  print "\nMolecular weight (without statistics object) of sequence ", $seqobj->id(),
    +       " is between ", $$weight[0], " and " ,
    +       $$weight[1], "\n";
    +
    +
    +=head1 DESCRIPTION
    +
    +Bio::Tools::SeqStats is a lightweight object for the calculation of
    +simple statistical and numerical properties of a sequence. By
    +"lightweight" I mean that only "primary" sequences are handled by the
    +object.  The calling script needs to create the appropriate primary
    +sequence to be passed to SeqStats if statistics on a sequence feature
    +are required.  Similarly if a codon count is desired for a
    +frame-shifted sequence and/or a negative strand sequence, the calling
    +script needs to create that sequence and pass it to the SeqStats
    +object.
    +
    +Nota that nucleotide sequences in bioperl do not strictly separate RNA
    +and DNA sequences. By convension, sequences from RNA molecules are
    +shown as is they were DNA. Objects are supposed to make the
    +distinction when needed. This class is one of the few where this
    +distinctions needs to be made. Internally, it changes all Ts into Us
    +before weight and monomer count.
    +
    +
    +SeqStats can be called in two distinct manners.  If only a single
    +computation is required on a given sequence object, the method can be
    +called easily using the SeqStats object directly:
    +
    +	$weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
    +
    +Alternately, if several computations will be required on a given
    +sequence object, an "instance" statistics object can be constructed
    +and used for the method calls:
    +
    +  $seq_stats  =  Bio::Tools::SeqStats->new($seqobj);
    +  $monomers = $seq_stats->count_monomers();
    +  $codons = $seq_stats->count_codons();
    +  $weight = $seq_stats->get_mol_wt();
    +
    +As currently implemented the object can return the following values
    +from a sequence:
    +
    +=over 3
    +
    +=item *
    +
    +The molecular weight of the sequence: get_mol_wt()
    +
    +=item *
    +
    +The number of each type of monomer present: count_monomers()
    +
    +=item *
    +
    +The number of each codon present in a nucleic acid sequence:
    +count_codons()
    +
    +=back
    +
    +For dna (and rna) sequences, single-stranded weights are returned. The
    +molecular weights are calculated for neutral - ie not ionized -
    +nucleic acids. The returned weight is the sum of the
    +base-sugar-phosphate residues of the chain plus one weight of water to
    +to account for the additional OH on the phosphate of the 5' residue
    +and the additional H on the sugar ring of the 3' residue.  Note that
    +this leads to a difference of 18 in calculated molecular weights
    +compared to some other available programs (eg Informax VectorNTI).
    +
    +Note that since sequences may contain ambiguous monomers (eg "M"
    +meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
    +returns a two-element array containing the greatest lower bound and
    +least upper bound of the molecule. (For a sequence with no ambiguous
    +monomers, the two elements of the returned array will be equal.) The
    +method count_codons() handles ambiguous bases by simply counting all
    +ambiguous codons together and issuing a warning to that effect.
    +
    +
    +=head1 DEVELOPERS NOTES
    +
    +Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules. Send your comments and suggestions preferably to one
    +of the Bioperl mailing lists.  Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org               - General discussion
    +  http://bio.perl.org/MailList.html   - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    + the bugs and their resolution.
    + Bug reports can be submitted via email or the web:
    +
    +  bioperl-bugs@bio.perl.org
    +  http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR -  Peter Schattner
    +
    +Email schattner@alum.mit.edu
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object
    +methods. Internal methods are usually preceded with a _
    +
    +=cut
    +
    +
    +package Bio::Tools::SeqStats;
    +use strict;
    +use vars qw(@ISA %Alphabets %Alphabets_strict $amino_weights
    +	    $rna_weights $dna_weights %Weights );
    +use Bio::Seq;
    +use Bio::Root::Root;
    +@ISA = qw(Bio::Root::Root);
    +
    +BEGIN {
    +    %Alphabets =   (
    +		    'dna'     => [ qw(A C G T R Y M K S W H B V D X N) ],
    +		    'rna'     => [ qw(A C G U R Y M K S W H B V D X N) ],
    +		    'protein' => [ qw(A R N D C Q E G H I L K M F
    +				      P S T W X Y V B Z *) ], # sac: added B, Z
    +		    );
    +
    +# SAC: new strict alphabet: doesn't allow any ambiguity characters.
    +    %Alphabets_strict = (
    +			 'dna'     => [ qw( A C G T ) ],
    +			 'rna'     => [ qw( A C G U ) ],
    +			 'protein'    => [ qw(A R N D C Q E G H I L K M F
    +					      P S T W Y V) ],
    +			 );
    +
    +
    +#  IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
    +#   Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
    +
    +#  Amino Acid alphabet
    +
    +# ------------------------------------------
    +# Symbol           Meaning
    +# ------------------------------------------
    +
    +    my $amino_A_wt = 89.09;
    +    my $amino_C_wt = 121.15;
    +    my $amino_D_wt = 133.1;
    +    my $amino_E_wt = 147.13;
    +    my $amino_F_wt = 165.19;
    +    my $amino_G_wt = 75.07;
    +    my $amino_H_wt = 155.16;
    +    my $amino_I_wt = 131.18;
    +    my $amino_K_wt = 146.19;
    +    my $amino_L_wt = 131.18;
    +    my $amino_M_wt = 149.22;
    +    my $amino_N_wt = 132.12;
    +    my $amino_P_wt = 115.13;
    +    my $amino_Q_wt = 146.15;
    +    my $amino_R_wt = 174.21;
    +    my $amino_S_wt = 105.09;
    +    my $amino_T_wt = 119.12;
    +    my $amino_V_wt = 117.15;
    +    my $amino_W_wt = 204.22;
    +    my $amino_Y_wt = 181.19;
    +
    +    $amino_weights = {
    +	'A'     => [$amino_A_wt, $amino_A_wt], #    Alanine
    +	'B'      => [$amino_N_wt, $amino_D_wt],	#   Aspartic Acid, Asparagine
    +	'C'      => [$amino_C_wt, $amino_C_wt],	#   Cystine
    +	'D'         => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
    +	'E'        => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
    +	'F'        => [$amino_F_wt, $amino_F_wt], # Phenylalanine
    +	'G'        => [$amino_G_wt, $amino_G_wt], # Glycine
    +	'H'        => [$amino_H_wt, $amino_H_wt], # Histidine
    +	'I'        => [$amino_I_wt, $amino_I_wt], # Isoleucine
    +	'K'        => [$amino_K_wt, $amino_K_wt], # Lysine
    +	'L'        => [$amino_L_wt, $amino_L_wt], # Leucine
    +	'M'        => [$amino_M_wt, $amino_M_wt], # Methionine
    +	'N'        => [$amino_N_wt, $amino_N_wt], # Asparagine
    +	'P'        => [$amino_P_wt, $amino_P_wt], # Proline
    +	'Q'        => [$amino_Q_wt, $amino_Q_wt], # Glutamine
    +	'R'        => [$amino_R_wt, $amino_R_wt], # Arginine
    +	'S'        => [$amino_S_wt, $amino_S_wt], # Serine
    +	'T'        => [$amino_T_wt, $amino_T_wt], # Threonine
    +	'V'        => [$amino_V_wt, $amino_V_wt], # Valine
    +	'W'        => [$amino_W_wt, $amino_W_wt], # Tryptophan
    +	'X'        => [$amino_G_wt, $amino_W_wt], # Unknown
    +	'Y'        => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
    +	'Z'        => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
    +    };
    +
    +    # Extended Dna / Rna alphabet
    +    use vars ( qw($C $O $N $H $P $water) );
    +    use vars ( qw($adenine   $guanine   $cytosine   $thymine   $uracil));
    +    use vars ( qw($ribose_phosphate   $deoxyribose_phosphate   $ppi));
    +    use vars ( qw($dna_A_wt   $dna_C_wt   $dna_G_wt  $dna_T_wt
    +		  $rna_A_wt   $rna_C_wt   $rna_G_wt   $rna_U_wt));
    +    use vars ( qw($dna_weights   $rna_weights   %Weights));
    +
    +    $C = 12.01;
    +    $O = 16.00;
    +    $N = 14.01;
    +    $H = 1.01;
    +    $P = 30.97;
    +    $water = 18.015;
    +
    +    $adenine = 5 * $C + 5 * $N + 5 * $H;
    +    $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
    +    $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
    +    $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
    +    $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
    +
    +    $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;      #neutral (unionized) form
    +    $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
    +
    +    # the following are single strand molecular weights / base
    +    $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
    +    $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
    +    $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
    +    $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
    +
    +    $rna_A_wt = $adenine + $ribose_phosphate - $water;
    +    $rna_C_wt = $cytosine + $ribose_phosphate - $water;
    +    $rna_G_wt = $guanine + $ribose_phosphate - $water;
    +    $rna_U_wt = $uracil + $ribose_phosphate - $water;
    +
    +    $dna_weights = {
    +	'A'             => [$dna_A_wt,$dna_A_wt],            # Adenine
    +	'C'             => [$dna_C_wt,$dna_C_wt],            # Cytosine
    +	'G'             => [$dna_G_wt,$dna_G_wt],            # Guanine
    +	'T'             => [$dna_T_wt,$dna_T_wt],            # Thymine
    +	'M'             => [$dna_C_wt,$dna_A_wt],            # A or C
    +	'R'             => [$dna_A_wt,$dna_G_wt],            # A or G
    +	'W'             => [$dna_T_wt,$dna_A_wt],            # A or T
    +	'S'             => [$dna_C_wt,$dna_G_wt],            # C or G
    +	'Y'             => [$dna_C_wt,$dna_T_wt],            # C or T
    +	'K'             => [$dna_T_wt,$dna_G_wt],            # G or T
    +	'V'             => [$dna_C_wt,$dna_G_wt],            # A or C or G
    +	'H'             => [$dna_C_wt,$dna_A_wt],            # A or C or T
    +	'D'             => [$dna_T_wt,$dna_G_wt],            # A or G or T
    +	'B'             => [$dna_C_wt,$dna_G_wt],            # C or G or T
    +	'X'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
    +	'N'             => [$dna_C_wt,$dna_G_wt],            # G or A or T or C
    +    };
    +
    +    $rna_weights =  {
    +	'A'             => [$rna_A_wt,$rna_A_wt],            # Adenine
    +	'C'             => [$rna_C_wt,$rna_C_wt],            # Cytosine
    +	'G'             => [$rna_G_wt,$rna_G_wt],            # Guanine
    +	'U'             => [$rna_U_wt,$rna_U_wt],            # Uracil
    +	'M'             => [$rna_C_wt,$rna_A_wt],            # A or C
    +	'R'             => [$rna_A_wt,$rna_G_wt],            # A or G
    +	'W'             => [$rna_U_wt,$rna_A_wt],            # A or U
    +	'S'             => [$rna_C_wt,$rna_G_wt],            # C or G
    +	'Y'             => [$rna_C_wt,$rna_U_wt],            # C or U
    +	'K'             => [$rna_U_wt,$rna_G_wt],            # G or U
    +	'V'             => [$rna_C_wt,$rna_G_wt],            # A or C or G
    +	'H'             => [$rna_C_wt,$rna_A_wt],            # A or C or U
    +	'D'             => [$rna_U_wt,$rna_G_wt],            # A or G or U
    +	'B'             => [$rna_C_wt,$rna_G_wt],            # C or G or U
    +	'X'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
    +	'N'             => [$rna_C_wt,$rna_G_wt],            # G or A or U or C
    +    };
    +
    +    %Weights =   (
    +		  'dna'     =>  $dna_weights,
    +		  'rna'     =>  $rna_weights,
    +		  'protein' =>  $amino_weights,
    +		  );
    +}
    +
    +sub new {
    +    my($class,@args) = @_;
    +    my $self = $class->SUPER::new(@args);
    +
    +    my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
    +    unless  ($seqobj->isa("Bio::PrimarySeqI")) {
    +	$self->throw(" SeqStats works only on PrimarySeqI objects  \n");
    +    }
    +    if ( !defined $seqobj->alphabet || ! defined $Alphabets{$seqobj->alphabet}) {
    +	$self->throw("Must have a valid alphabet defined for seq (".
    +		     join(",",keys %Alphabets));
    +    }
    +    $self->{'_seqref'} = $seqobj;
    +    # check the letters in the sequence
    +    $self->{'_is_strict'} = _is_alphabet_strict($seqobj); 
    +    return $self;
    +}
    +
    +=head2 count_monomers
    +
    + Title   : count_monomers
    + Usage   : $rcount = $seq_stats->count_monomers();
    +        or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
    + Function: Counts the number of each type of monomer (amino acid or
    +	   base) in the sequence.
    +           Ts are counted as Us in RNA sequences.
    + Example :
    + Returns : Reference to a hash in which keys are letters of the
    +           genetic alphabet used and values are number of occurrences
    +           of the letter in the sequence.
    + Args    : None or reference to sequence object
    + Throws  : Throws an exception if type of sequence is unknown (ie amino
    +           or nucleic)or if unknown letter in alphabet. Ambiguous
    +           elements are allowed.
    +
    +=cut
    +
    +sub count_monomers{
    +    my %count  = ();
    +    my $seqobj;
    +    my $_is_strict;
    +    my $element = '';
    +    my $_is_instance = 1 ;
    +    my $self = shift @_;
    +    my $object_argument = shift @_;
    +
    +    # First we need to determine if the present object is an instance
    +    # object or if the sequence object has been passed as an argument
    +
    +    if (defined $object_argument) {
    +	$_is_instance = 0;
    +    }
    +
    +    # If we are using an instance object...
    +    if ($_is_instance) {
    +	if ($self->{'_monomer_count'}) {
    +	    return $self->{'_monomer_count'}; # return count if previously calculated
    +	}
    +	$_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
    +        $seqobj =  $self->{'_seqref'};
    +    } else {
    +         #  otherwise...
    +	$seqobj =  $object_argument;
    +	
    +    #  Following two lines lead to error in "throw" routine
    +	$seqobj->isa("Bio::PrimarySeqI") ||
    +	    $self->throw(" SeqStats works only on PrimarySeqI objects  \n");
    +        # is alphabet OK? Is it strict?
    +	$_is_strict =  _is_alphabet_strict($seqobj);
    +    }
    +
    +    my $alphabet =  $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
    +	$Alphabets{$seqobj->alphabet}  ; # get array of allowed letters
    +	
    +    # convert everything to upper case to be safe
    +    my $seqstring = uc $seqobj->seq();
    +
    +    # Since T is used in RichSeq RNA sequences, do conversion locally
    +    $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
    +
    +    #  For each letter, count the number of times it appears in
    +    #  the sequence
    +  LETTER:
    +    foreach $element (@$alphabet) {
    +        # skip terminator symbol which may confuse regex
    +	next LETTER if $element eq '*';
    +	$count{$element} = ( $seqstring =~ s/$element/$element/g);
    +    }
    +
    +    if ($_is_instance) {
    +	$self->{'_monomer_count'} = \%count;  # Save in case called again later
    +    }
    +
    +    return \%count;
    +}
    +
    +=head2  get_mol_wt
    +
    + Title   : get_mol_wt
    + Usage   : $wt = $seqobj->get_mol_wt() or
    +           $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
    + Function: Calculate molecular weight of sequence
    +           Ts are counted as Us in RNA sequences.
    + Example :
    +
    + Returns : Reference to two element array containing lower and upper
    +           bounds of molecule molecular weight. (For dna (and rna)
    +           sequences, single-stranded weights are returned.)  If
    +           sequence contains no ambiguous elements, both entries in
    +           array are equal to molecular weight of molecule.
    + Args    : None or reference to sequence object
    + Throws  : Exception if type of sequence is unknown (ie not amino or
    +           nucleic) or if unknown letter in alphabet. Ambiguous
    +           elements are allowed.
    +
    +=cut
    +
    +sub get_mol_wt {
    +
    +    my $seqobj;
    +    my $_is_strict;
    +    my $element = '';
    +    my $_is_instance = 1 ;
    +    my $self = shift @_;
    +    my $object_argument = shift @_;
    +    my ($weight_array, $rcount);
    +
    +    if (defined $object_argument) {
    +	$_is_instance = 0;
    +    }
    +
    +    if ($_is_instance) {	
    +	if ($weight_array = $self->{'_mol_wt'}) {
    +            # return mol. weight if previously calculated
    +	    return $weight_array;	
    +	}
    +        $seqobj =  $self->{'_seqref'};
    +        $rcount = $self->count_monomers();
    +    } else {
    +	$seqobj =  $object_argument;
    +	$seqobj->isa("Bio::PrimarySeqI") ||
    +	    die("Error: SeqStats works only on PrimarySeqI objects  \n");
    +	$_is_strict =  _is_alphabet_strict($seqobj); # is alphabet OK?
    +        $rcount =  $self->count_monomers($seqobj);
    +    }
    +
    +    # We will also need to know what type of monomer we are dealing with
    +    my $moltype = $seqobj->alphabet();
    +
    +    # In general,the molecular weight is bounded below by the sum of the
    +    # weights of lower bounds of each alphabet symbol times the number of
    +    # occurrences of the symbol in the sequence. A similar upper bound on
    +    # the weight is also calculated.
    +
    +    # Note that for "strict" (ie unambiguous) sequences there is an
    +    # inefficiency since the upper bound = the lower bound (and is
    +    # calculated twice).  However, this decrease in performance will be
    +    # minor and leads to (IMO) significantly more readable code.
    +
    +    my $weight_lower_bound = 0;
    +    my $weight_upper_bound = 0;
    +    my $weight_table =  $Weights{$moltype};
    +
    +
    +    # compute weight of all the residues
    +    foreach $element (keys %$rcount) {
    +	$weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
    +	$weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
    +    }
    +    if ($moltype =~ /protein/) {
    +    	# remove of H2O during peptide bond formation.
    +    	$weight_lower_bound -= $water * ($seqobj->length - 1);
    +    	$weight_upper_bound -= $water * ($seqobj->length - 1);
    +    } else {
    +    	# Correction because phosphate of 5' residue has additional OH and
    +    	# sugar ring of 3' residue has additional H
    +    	$weight_lower_bound += $water;
    +    	$weight_upper_bound += $water;
    +    }
    +
    +    $weight_lower_bound = sprintf("%.0f", $weight_lower_bound);
    +    $weight_upper_bound = sprintf("%.0f", $weight_upper_bound);
    +
    +    $weight_array = [$weight_lower_bound, $weight_upper_bound];
    +
    +    if ($_is_instance) {
    +	$self->{'_mol_wt'} = $weight_array;  # Save in case called again later
    +    }
    +    return $weight_array;
    +}
    +
    +
    +=head2  count_codons
    +
    + Title   : count_codons
    + Usage   : $rcount = $seqstats->count_codons (); or
    +           $rcount = Bio::Tools::SeqStats->count_codons($seqobj);
    +
    + Function: Counts the number of each type of codons in a given frame
    +           for a dna or rna sequence.
    + Example :
    + Returns : Reference to a hash in which keys are codons of the genetic
    +           alphabet used and values are number of occurrences of the
    +           codons in the sequence. All codons with "ambiguous" bases
    +           are counted together.
    + Args    : None or reference to sequence object
    +
    + Throws  : an exception if type of sequence is unknown or protein.
    +
    +=cut
    +
    +sub count_codons {
    +    my $rcount = {};
    +    my $codon ;
    +    my $seqobj;
    +    my $_is_strict;
    +    my $element = '';
    +    my $_is_instance = 1 ;
    +    my $self = shift @_;
    +    my $object_argument = shift @_;
    +
    +    if (defined $object_argument) {
    +	$_is_instance = 0;
    +    }
    +
    +    if ($_is_instance) {
    +	if ($rcount = $self->{'_codon_count'}) {
    +	    return $rcount;        # return count if previously calculated
    +	}
    + 	$_is_strict =  $self->{'_is_strict'}; # retrieve "strictness"
    +        $seqobj =  $self->{'_seqref'};
    +    } else {
    +	$seqobj =  $object_argument;
    +	$seqobj->isa("Bio::PrimarySeqI") ||
    +	    die(" Error: SeqStats works only on PrimarySeqI objects  \n");
    +	$_is_strict =  _is_alphabet_strict($seqobj);
    +    }
    +
    +    # Codon counts only make sense for nucleic acid sequences
    +    my $alphabet = $seqobj->alphabet();
    +
    +    unless ($alphabet =~ /[dr]na/) {
    +	$seqobj->throw(" Codon counts only meaningful for dna or rna, ".
    +                       "not for $alphabet sequences. \n");
    +    }
    +
    +    # If sequence contains ambiguous bases, warn that codons
    +    # containing them will all be lumped together in the count.
    +
    +    if (!$_is_strict ) {
    +	$seqobj->warn(" Sequence $seqobj contains ambiguous bases.  \n".
    +                      " All codons with ambiguous bases will be added together in count.  \n");
    +    }
    +
    +    my $seq = $seqobj->seq();
    +
    +    # Now step through the string by threes and count the codons
    +
    +  CODON:
    +    while (length($seq) > 2) {
    +	$codon = substr($seq,0,3);
    +	$seq = substr($seq,3);
    +	if ($codon =~ /[^ACTGU]/) {
    +	    $$rcount{'ambiguous'}++; #lump together ambiguous codons
    +	    next CODON;
    +	}
    +	if (!defined $$rcount{$codon}) {
    +	    $$rcount{$codon}= 1 ;
    +	    next CODON;
    +	}
    +	$$rcount{$codon}++;  # default
    +    }
    +
    +
    +    if ($_is_instance) {
    +	$self->{'_codon_count'} = $rcount;  # Save in case called again later
    +    }
    +
    +    return $rcount;
    +}
    +
    +
    +=head2  _is_alphabet_strict
    +
    + Title   :   _is_alphabet_strict
    + Usage   :
    + Function: internal function to determine whether there are
    +           any ambiguous elements in the current sequence
    + Example :
    + Returns : 1 if strict alphabet is being used,
    +           0 if ambiguous elements are present
    + Args    :
    +
    + Throws  : an exception if type of sequence is unknown (ie amino or
    +           nucleic) or if unknown letter in alphabet. Ambiguous
    +           monomers are allowed.
    +
    +=cut
    +
    +sub _is_alphabet_strict {
    +
    +    my ($seqobj) = @_;
    +    my $moltype = $seqobj->alphabet();
    +
    +    # convert everything to upper case to be safe
    +    my $seqstring = uc $seqobj->seq();
    +
    +    # Since T is used in RichSeq RNA sequences, do conversion locally
    +    $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
    +
    +    # First we check if only the 'strict' letters are present in the
    +    # sequence string If not, we check whether the remaining letters
    +    # are ambiguous monomers or whether there are illegal letters in
    +    # the string
    +
    +    # $alpha_array is a ref to an array of the 'strictly' allowed letters
    +    my $alpha_array =   $Alphabets_strict{$moltype} ;
    +
    +    # $alphabet contains the allowed letters in string form
    +    my $alphabet = join ('', @$alpha_array) ;
    +    unless ($seqstring =~ /[^$alphabet]/)  {
    +	return 1 ;
    +    }
    +
    +    # Next try to match with the alphabet's ambiguous letters
    +    $alpha_array =   $Alphabets{$moltype} ;
    +    $alphabet = join ('', @$alpha_array) ;
    +
    +    unless ($seqstring =~ /[^$alphabet]/)  {
    +	return 0 ;
    +    }
    +
    +    # If we got here there is an illegal letter in the sequence
    +    $seqobj->throw(" Alphabet not OK for $seqobj \n");
    +
    +}
    +
    +=head2   _print_data
    +
    + Title   : _print_data
    + Usage   : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
    + Function: Displays dna / rna parameters (used for debugging)
    + Returns : 1
    + Args    : None
    +
    +Used for debugging.
    +
    +=cut
    +
    +sub _print_data {
    +
    +    print "\n adenine = :  $adenine \n";
    +    print "\n guanine = :  $guanine \n";
    +    print "\n cytosine = :  $cytosine \n";
    +    print "\n thymine = :  $thymine \n";
    +    print "\n uracil = :  $uracil \n";
    +
    +    print "\n dna_A_wt = :  $dna_A_wt \n";
    +    print "\n dna_C_wt = :  $dna_C_wt \n";
    +    print "\n dna_G_wt = :  $dna_G_wt \n";
    +    print "\n dna_T_wt = :  $dna_T_wt \n";
    +
    +    print "\n rna_A_wt = :  $rna_A_wt \n";
    +    print "\n rna_C_wt = :  $rna_C_wt \n";
    +    print "\n rna_G_wt = :  $rna_G_wt \n";
    +    print "\n rna_U_wt = :  $rna_U_wt \n";
    +
    +    return 1;
    +}
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/SeqWords.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/SeqWords.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,225 @@
    +# $Id: SeqWords.pm,v 1.7.2.1 2003/03/05 19:06:15 jason Exp $
    +
    +#---------------------------------------------------------------------------
    +# PACKAGE    : SeqWords.pm
    +# PURPOSE    : To count n-mers in any sequence of characters
    +# AUTHOR     : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
    +# SOURCE     : 
    +# CREATED    : 21st March 2000
    +# MODIFIED   : 
    +# DISCLAIMER : I am employed in the pharmaceutical industry but my 
    +#	     : employers do not endorse or sponsor this module
    +#	     : in any way whatsoever.  The above email address is
    +#	     : given purely for the purpose of easy communication
    +#            : with the author, and does not imply any connection
    +#	     : between my employers and anything written below.
    +# LICENCE    : You may distribute this module under the same terms 
    +#	     : as the rest of BioPerl.
    +#---------------------------------------------------------------------------
    +
    +=head1 NAME
    +
    +Bio::Tools::SeqWords - Object holding n-mer statistics for one sequence
    +
    +=head1 SYNOPSIS
    +
    +Take a sequence object from eg, an inputstream, and creates an object
    +for the purposes of holding n-mer word statistics about that sequence.
    +The sequence can be nucleic acid or protein, but the module is
    +probably most relevant for DNA.  The words are counted in a
    +non-overlapping manner, ie. in the style of a codon table, but with
    +any word length.  For overlapping word counts, a sequence can be
    +'shifted' to remove the first character and then the count repeated.
    +For counts on opposite strand (DNA/RNA), a reverse complement method
    +should be performed, and then the count repeated.
    +
    +Creating the SeqWords object, eg:
    +
    +  my $inputstream = Bio::SeqIO->new( -file => "seqfile", 
    +	                             -format => 'Fasta');
    +  my $seqobj = $inputstream->next_seq();
    +  my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
    +
    +or:
    +
    +  my $seqobj = Bio::PrimarySeq->new(-seq=>'[cut and paste a sequence here]', 
    +                                    -alphabet => 'dna', 
    +                                    -id => 'test');
    +  my $seq_word  =  Bio::Tools::SeqWords->new(-seq => $seqobj);
    +
    +obtain a hash of word counts, eg:
    +
    +  my $hash_ref = $seq_stats->count_words($word_length);
    +
    +display hash table, eg:
    +
    +  my %hash = %$hash_ref;
    +  foreach my $key(sort keys %hash)
    +  {
    +    print "\n$key\t$hash{$key}";
    +  }
    +
    +or	
    +
    +  my $hash_ref = Bio::SeqWords->count_words($seqobj,$word_length);
    +
    +
    +=head1 DESCRIPTION
    +
    +Bio:SeqWords is a featherweight object for the calculation of n-mer
    +word occurrences in a single sequence.  It is envisaged that the
    +object will be useful for construction of scripts which use n-mer word
    +tables as the raw material for statistical calculations; for instance,
    +hexamer frequency for the calculation of coding protential, or the
    +calculation of periodicity in repetitive DNA.  Triplet frequency is
    +already handled by Bio::SeqStats.pm (author: Peter Schattner).  There
    +are a few possible applications for protein, eg: hypothesised amino
    +acid 7-mers in heat shock proteins, or proteins with multiple simple
    +motifs.  Sometimes these protein periodicities are best seen when the
    +amino acid alphabet is truncated, eg Shulman alphabet.  Since there
    +are quite a few of these shortened alphabets, this module does not
    +specify any particular alphabet.
    +
    +See Synopsis above for object creation code.
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this
    +and other Bioperl modules. Send your comments and suggestions preferably
    +to one of the Bioperl mailing lists.
    +Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org                 - General discussion
    +  http://bio.perl.org/MailList.html     - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution.
    +Bug reports can be submitted via the web:
    +
    +  http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR
    +
    +Derek Gatherer, in the loosest sense of the word 'author'.  The
    +general shape of the module is lifted directly from Peter Schattner's
    +SeqStats.pm module.  The central subroutine to count the words is
    +adapted from original code provided by Dave Shivak, in response to a
    +query on the bioperl mailing list.  At least 2 other people provided
    +alternative means (equally good but not used in the end) of performing
    +the same calculation.  Thanks to all for your assistance.
    +
    +=head1 CONTRUBITORS
    +
    +Jason Stajich, jason-at-bioperl.org
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods. 
    +Internal methods are usually preceded with a _
    +
    +=cut
    +
    +package Bio::Tools::SeqWords;
    +use vars qw(@ISA);
    +use strict;
    +
    +use Bio::Root::Root;
    +
    +@ISA = qw(Bio::Root::Root);
    +
    +sub new {
    +    my($class,@args) = @_;
    +    # our new standard way of instantiation
    +    my $self = $class->SUPER::new(@args);
    +
    +    my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
    +    if((! defined($seqobj)) && @args && ref($args[0])) {
    +	# parameter not passed as named parameter?
    +	$seqobj = $args[0];
    +    }
    +    
    +    if(! $seqobj->isa("Bio::PrimarySeqI")) { 
    +	$self->throw(ref($self) . " works only on PrimarySeqI objects\n");
    +    }
    +	
    +    $self->{'_seqref'} = $seqobj;
    +    return $self; 
    +}
    +
    +
    +=head2 count_words
    +
    + Title   : count_words
    + Usage   : $word_count = $seq_stats->count_words($word_length); 
    + or 	 : $word_count = $seq_stats->Bio::SeqWords->($seqobj,$word_length);
    + Function: Counts non-overlapping words within a string
    +	 : any alphabet is used
    + Example : a sequence ACCGTCCGT, counted at word length 4,
    +	 : will give the hash
    +	 : ACCG 1, TCCG 1
    + Returns : Reference to a hash in which keys are words (any length) of the
    +         : alphabet used and values are number of occurrences of the word 
    +         : in the sequence.
    + Args    : Word length as scalar and, reference to sequence object if
    +         : required
    +
    +  Throws an exception word length is not a positive integer
    +  or if word length is longer than the sequence.
    +
    +=cut
    +
    +sub count_words
    +{
    +    my ($self,$seqobj,$word_length) = @_;
    +
    +    # check how we were called, and if necessary rearrange arguments
    +    if(ref($seqobj)) {
    +	# call as SeqWords->count_words($seq, $wordlen)
    +	if(! $seqobj->isa("Bio::PrimarySeqI")) { 
    +	    $self->throw("SeqWords works only on PrimarySeqI objects\n");
    +	}
    +    } else {
    +	# call as $obj->count_words($wordlen)
    +	$word_length = $seqobj;
    +	$seqobj = undef;
    +    }
    +
    +    if($word_length eq "" || $word_length =~ /[a-z]/i)
    +    {
    +	$self->throw("SeqWords cannot accept non-numeric characters".
    +		     " or a null value in the \$word_length variable\n");
    +    }
    +    elsif ($word_length <1 || ($word_length - int($word_length)) >0)
    +    {
    +	$self->throw("SeqWords requires the word length to be a ".
    +		     "positive integer\n");
    +    }
    +
    +    if(! defined($seqobj)) {
    +	$seqobj =  $self->{'_seqref'};
    +    }
    +    my $seqstring = uc $seqobj->seq();
    +
    +    if($word_length > length($seqstring))
    +    {
    +	$self->throw("die in count words, \$word_length is bigger ".
    +		     "than sequence length\n");
    +    }
    +
    +    my %codon = ();
    +
    +    # now the real business
    +    # JS - remove DNA assumption
    +    while($seqstring =~ /((\w){$word_length})/gim) {
    +	$codon{uc($1)}++;
    +    }
    +    return \%codon;
    +
    +# and that's it
    +}
    +
    +1;
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Sigcleave.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Sigcleave.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,604 @@
    +#-----------------------------------------------------------------------------
    +# PACKAGE : Bio::Tools::Sigcleave
    +# AUTHOR  : Chris Dagdigian, dag@sonsorol.org
    +# CREATED : Jan 28 1999
    +# REVISION: $Id: Sigcleave.pm,v 1.17 2002/10/22 07:45:22 lapp Exp $
    +#
    +# Copyright (c) 1997-9 bioperl, Chris Dagdigian and others. All Rights Reserved.
    +#           This module is free software; you can redistribute it and/or 
    +#           modify it under the same terms as Perl itself.
    +#
    +# _History_
    +#
    +# Object framework ripped from Steve Chervits's SeqPattern.pm
    +# 
    +# Core EGCG Sigcleave emulation from perl code developed by
    +# Danh Nguyen & Kamalakar Gulukota which itself was based 
    +# loosely on Colgrove's signal.c program.
    +#
    +# The overall idea is to replicate the output of the sigcleave
    +# program which was distributed with the EGCG extension to the GCG sequence
    +# analysis package. There is also an accessor method for just getting at
    +# the raw results.
    +#
    +#-----------------------------------------------------------------------------
    +
    +=head1 NAME
    +
    +Bio::Tools::Sigcleave - Bioperl object for sigcleave analysis
    +
    +=head1 SYNOPSIS
    +
    +=head2 Object Creation
    +
    +  use Bio::Tools::Sigcleave ();
    +
    +  # to keep the module backwar compatible, you can pass it a sequence string, but
    +  # there recommended say is to pass it a Seq object
    +
    +  # this works
    +  $seq = "MVLLLILSVLLLKEDVRGSAQSSERRVVAHMPGDIIIGALFSVHHQPTVDKVHERKCGAVREQYGI";
    +  $sig = new Bio::Tools::Sigcleave(-seq  => $seq,
    +                                                -type => 'protein',
    +                                                -threshold=>'3.5',
    +                                                );
    +  # but you do:
    +  $seqobj = Bio::PrimarySeq->new(-seq => $seq);
    +
    +  $sig = new Bio::Tools::Sigcleave(-seq  => $seqobj,
    +                                                -threshold=>'3.5',
    +                                                );
    +
    +
    +  # now you can detect procaryotic signal sequences as well as eucaryotic
    +  $sig->matrix('eucaryotic'); # or 'procaryotic'
    +
    +
    +=head2 Object Methods & Accessors
    +
    +  # you can use this method to fine tune the threshod before printing out the results
    +  $sig->result_count:
    +
    +  %raw_results      = $sig->signals;
    +  $formatted_output = $sig->pretty_print;
    +
    +=head1 DESCRIPTION
    +
    +"Sigcleave" was a program distributed as part of the free EGCG add-on
    +to earlier versions of the GCG Sequence Analysis package. A new
    +implementation of the algorithm is now part of EMBOSS package.
    +
    +From the EGCG documentation:
    +
    +  SigCleave uses the von Heijne method to locate signal sequences, and
    +  to identify the cleavage site. The method is 95% accurate in
    +  resolving signal sequences from non-signal sequences with a cutoff
    +  score of 3.5, and 75-80% accurate in identifying the cleavage
    +  site. The program reports all hits above a minimum value.
    +
    +The EGCG Sigcleave program was written by Peter Rice (E-mail:
    +pmr@sanger.ac.uk Post: Informatics Division, The Sanger Centre,
    +Wellcome Trust Genome Campus, Hinxton, Cambs, CB10 1SA, UK).
    +
    +Since EGCG is no longer distributed for the latest versions of GCG,
    +this code was developed to emulate the output of the original program
    +as much as possible for those who lost access to sigcleave when
    +upgrading to newer versions of GCG.
    +
    +There are 2 accessor methods for this object. "signals" will return a
    +perl associative array containing the sigcleave scores keyed by amino
    +acid position.  "pretty_print" returns a formatted string similar to
    +the output of the original sigcleave utility.
    +
    +In both cases, the "threshold" setting controls the score reporting
    +level. If no value for threshold is passed in by the user, the code
    +defaults to a reporting value of 3.5.
    +
    +In this implemntation the accessor will never return any
    +score/position pair which does not meet the threshold limit. This is
    +the slightly different from the behaviour of the 8.1 EGCG sigcleave
    +program which will report the highest of the under-threshold results
    +if nothing else is found.
    +
    +
    +Example of pretty_print output:
    +
    +	SIGCLEAVE of sigtest from: 1 to 146
    +
    +	Report scores over 3.5
    +	Maximum score 4.9 at residue 131
    +
    +	 Sequence:  FVILAAMSIQGSA-NLQTQWKSTASLALET
    +        	    | (signal)    | (mature peptide)
    +          	118            131
    +
    +	 Other entries above 3.5
    +
    +	Maximum score 3.7 at residue 112
    +
    +	 Sequence:  CSRQLFGWLFCKV-HPGAIVFVILAAMSIQGSANLQTQWKSTASLALET
    +         	   | (signal)    | (mature peptide)
    +           	99            112
    +
    +
    +=head1 FEEDBACK
    +
    +When updating and maintaining a module, it helps to know that people
    +are actually using it. Let us know if you find a bug, think this code
    +is useful or have any improvements/features to suggest.
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution. Bug reports can be submitted via email
    +or the web:
    +
    +    bioperl-bugs@bio.perl.org
    +    http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR
    +
    +Chris Dagdigian, dag@sonsorol.org  & others
    +
    +=head1 CONTRIBUTORS
    +
    +Heikki Lehvaslaiho, heikki@ebi.ac.uk
    +
    +=head1 VERSION
    +
    +Bio::Tools::Sigcleave.pm, $Id: Sigcleave.pm,v 1.17 2002/10/22 07:45:22 lapp Exp $
    +
    +=head1 COPYRIGHT
    +
    +Copyright (c) 1999 Chris Dagdigian & others. All Rights Reserved.
    +This module is free software; you can redistribute it and/or modify it
    +under the same terms as Perl itself.
    +
    +=head1 REFERENCES / SEE ALSO
    +
    +von Heijne G. (1986) "A new method for predicting signal sequences
    +cleavage sites."  Nucleic Acids Res. 14, 4683-4690.
    +
    +von Heijne G. (1987) in "Sequence Analysis in Molecular Biology:
    +Treasure Trove or Trivial Pursuit" (Acad. Press, (1987), 113-117).
    +
    +
    +=head1 APPENDIX
    +
    +The following documentation describes the various functions
    +contained in this module. Some functions are for internal 
    +use and are not meant to be called by the user; they are 
    +preceded by an underscore ("_").
    +
    +
    +=cut
    +
    +#
    +##
    +###
    +#### END of main POD documentation.
    +###
    +##
    +#
    +
    +
    +package Bio::Tools::Sigcleave;
    +
    +use Bio::Root::Root;
    +use Bio::PrimarySeq;
    +
    +@ISA = qw(Bio::Root::Root);
    +use strict;
    +use vars qw ($ID $VERSION %WeightTable_euc  %WeightTable_pro );
    +$ID  = 'Bio::Tools::Sigcleave';
    +$VERSION = 0.02;
    +
    +
    +  %WeightTable_euc = (
    +#Sample: 161 aligned sequences
    +# R     -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1  +1  +2 Expect
    + 'A' => [16, 13, 14, 15, 20, 18, 18, 17, 25, 15, 47,  6, 80, 18,  6, 14.5],
    + 'C' => [ 3,  6,  9,  7,  9, 14,  6,  8,  5,  6, 19,  3,  9,  8,  3,  4.5],
    + 'D' => [ 0,  0,  0,  0,  0,  0,  0,  0,  5,  3,  0,  5,  0, 10, 11,  8.9],
    + 'E' => [ 0,  0,  0,  1,  0,  0,  0,  0,  3,  7,  0,  7,  0, 13, 14, 10.0],
    + 'F' => [13,  9, 11, 11,  6,  7, 18, 13,  4,  5,  0, 13,  0,  6,  4,  5.6],
    + 'G' => [ 4,  4,  3,  6,  3, 13,  3,  2, 19, 34,  5,  7, 39, 10,  7, 12.1],
    + 'H' => [ 0,  0,  0,  0,  0,  1,  1,  0,  5,  0,  0,  6,  0,  4,  2,  3.4],
    + 'I' => [15, 15,  8,  6, 11,  5,  4,  8,  5,  1, 10,  5,  0,  8,  7,  7.4],
    + 'K' => [ 0,  0,  0,  1,  0,  0,  1,  0,  0,  4,  0,  2,  0, 11,  9, 11.3],
    + 'L' => [71, 68, 72, 79, 78, 45, 64, 49, 10, 23,  8, 20,  1,  8,  4, 12.1],
    + 'M' => [ 0,  3,  7,  4,  1,  6,  2,  2,  0,  0,  0,  1,  0,  1,  2,  2.7],
    + 'N' => [ 0,  1,  0,  1,  1,  0,  0,  0,  3,  3,  0, 10,  0,  4,  7,  7.1],
    + 'P' => [ 2,  0,  2,  0,  0,  4,  1,  8, 20, 14,  0,  1,  3,  0, 22,  7.4],
    + 'Q' => [ 0,  0,  0,  1,  0,  6,  1,  0, 10,  8,  0, 18,  3, 19, 10,  6.3],
    + 'R' => [ 2,  0,  0,  0,  0,  1,  0,  0,  7,  4,  0, 15,  0, 12,  9,  7.6],
    + 'S' => [ 9,  3,  8,  6, 13, 10, 15, 16, 26, 11, 23, 17, 20, 15, 10, 11.4],
    + 'T' => [ 2, 10,  5,  4,  5, 13,  7,  7, 12,  6, 17,  8,  6,  3, 10,  9.7],
    + 'V' => [20, 25, 15, 18, 13, 15, 11, 27,  0, 12, 32,  3,  0,  8, 17, 11.1],
    + 'W' => [ 4,  3,  3,  1,  1,  2,  6,  3,  1,  3,  0,  9,  0,  2,  0,  1.8],
    + 'Y' => [ 0,  1,  4,  0,  0,  1,  3,  1,  1,  2,  0,  5,  0,  1,  7,  5.6]
    +);
    +
    +  %WeightTable_pro = (
    +#Sample: 36 aligned sequences
    +#  R    -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1  +1  +2 Expect
    +  'A' => [0,  8,  8,  9,  6,  7,  5,  6,  7,  7, 24,  2, 31, 18,  4,  3.2],
    +  'C' => [1,  0,  0,  1,  1,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  1.0],
    +  'D' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  8,  2.0],
    +  'E' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  4,  8,  2.2],
    +  'F' => [2,  4,  3,  4,  1,  1,  8,  0,  4,  1,  0,  7,  0,  1,  0,  1.3],
    +  'G' => [4,  2,  2,  2,  3,  5,  2,  4,  2,  2,  0,  2,  2,  1,  0,  2.7],
    +  'H' => [0,  0,  1,  0,  0,  0,  0,  1,  1,  0,  0,  7,  0,  1,  0,  0.8],
    +  'I' => [3,  1,  5,  1,  5,  0,  1,  3,  0,  0,  0,  0,  0,  0,  2,  1.7],
    +  'K' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  2,  0,  3,  0,  2.5],
    +  'L' => [8, 11,  9,  8,  9, 13,  1,  0,  2,  2,  1,  2,  0,  0,  1,  2.7],
    +  'M' => [0,  2,  1,  1,  3,  2,  3,  0,  1,  2,  0,  4,  0,  0,  1,  0.6],
    +  'N' => [0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  0,  3,  0,  1,  4,  1.6],
    +  'P' => [0,  1,  1,  1,  1,  1,  2,  3,  5,  2,  0,  0,  0,  0,  5,  1.7],
    +  'Q' => [0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  0,  3,  0,  0,  1,  1.4],
    +  'R' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1.7],
    +  'S' => [1,  0,  1,  4,  4,  1,  5, 15,  5,  8,  5,  2,  2,  0,  0,  2.6],
    +  'T' => [2,  0,  4,  2,  2,  2,  2,  2,  5,  1,  3,  0,  1,  1,  2,  2.2],
    +  'V' => [5,  7,  1,  3,  1,  4,  7,  0,  0,  4,  3,  0,  0,  2,  0,  2.5],
    +  'W' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0.4],
    +  'Y' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  1,  0,  0,  0,  1.3]
    +);
    +
    +
    +##
    +## Now we calculate the _real_ values for the weight tables
    +##
    +##
    +## yeah yeah yeah there is lots of math here that gets repeated
    +## every single time a sigcleave object gets created. This is
    +## a quick hack to make sure that we get the scores as accurate as
    +## possible. Need all those significant digits....
    +##
    +## suggestions for speedup aproaches welcome
    +##
    +
    +
    +foreach my $i (keys %WeightTable_euc) {
    +    my $expected = $WeightTable_euc{$i}[15];
    +    if ($expected > 0) {
    +	for (my $j=0; $j<16; $j++) {
    +	    if ($WeightTable_euc{$i}[$j] == 0) {
    +		$WeightTable_euc{$i}[$j] = 1; 
    +		if ($j == 10 || $j == 12) {
    +		    $WeightTable_euc{$i}[$j] = 1.e-10;
    +		}
    +	    }
    +	    $WeightTable_euc{$i}[$j] = log($WeightTable_euc{$i}[$j]/$expected);
    +	}
    +    }
    +}
    +
    +
    +foreach my $i (keys %WeightTable_pro) {
    +    my $expected = $WeightTable_pro{$i}[15];
    +    if ($expected > 0) {
    +	for (my $j=0; $j<16; $j++) {
    +	    if ($WeightTable_pro{$i}[$j] == 0) {
    +		$WeightTable_pro{$i}[$j] = 1; 
    +		if ($j == 10 || $j == 12) {
    +		    $WeightTable_pro{$i}[$j] = 1.e-10;
    +		}
    +	    }
    +	    $WeightTable_pro{$i}[$j] = log($WeightTable_pro{$i}[$j]/$expected);
    +	}
    +    }
    +}
    +
    +
    +
    +#####################################################################################
    +##                                 CONSTRUCTOR                                     ##
    +#####################################################################################
    +
    +
    +sub new {
    +    my ($class, @args) = @_;
    +
    +    my $self = $class->SUPER::new(@args);
    +    #my $self = Bio::Seq->new(@args);
    +
    +    my ($seq, $threshold, $matrix) = $self->_rearrange([qw(SEQ THRESHOLD MATRIX)],@args);
    +
    +    defined $threshold && $self->threshold($threshold);
    +    $matrix && $self->matrix($matrix);
    +    $seq && $self->seq($seq);
    +
    +    return $self;
    +}
    +
    +
    +
    +=head1 threshold
    +
    + Title     : threshold
    + Usage     : $value = $self->threshold
    + Purpose   : Read/write method sigcleave score reporting threshold.
    + Returns   : float.
    + Argument  : new value, float
    + Throws    : on non-number argument
    + Comments  : defaults to 3.5
    + See Also   : n/a
    +
    +=cut
    +
    +#----------------
    +sub threshold {
    +#----------------
    +    my ($self, $value) = @_;
    +    if( defined $value) {
    +	$self->throw("I need a number, not [$value]")
    +	    if  $value !~ /^[+-]?[\d\.]+$/;
    +	$self->{'_threshold'} = $value;
    +    }
    +    return $self->{'_threshold'} || 3.5 ;
    +}
    +
    +=head1 matrix
    +
    + Title     : matrix
    + Usage     : $value = $self->matrix('procaryotic')
    + Purpose   : Read/write method sigcleave matrix.
    + Returns   : float.
    + Argument  : new value: 'eucaryotic' or 'procaryotic'
    + Throws    : on non-number argument
    + Comments  : defaults to 3.5
    + See Also   : n/a
    +
    +=cut
    +
    +#----------------
    +sub matrix {
    +#----------------
    +    my ($self, $value) = @_;
    +    if( defined $value) {
    +	$self->throw("I need 'eucaryotic' or 'procaryotic', not [$value]")
    +	    unless  $value eq 'eucaryotic' or $value eq 'procaryotic';
    +	$self->{'_matrix'} = $value;
    +    }
    +    return $self->{'_matrix'} || 'eucaryotic' ;
    +
    +}
    +
    +=head1 seq
    +
    + Title     : seq
    + Usage     : $value = $self->seq('procaryotic')
    + Purpose   : Read/write method sigcleave seq.
    + Returns   : float.
    + Argument  : new value: 'eucaryotic' or 'procaryotic'
    + Throws    : on non-number argument
    + Comments  : defaults to 3.5
    + See Also   : n/a
    +
    +=cut
    +
    +#----------------
    +sub seq {
    +#----------------
    +    my ($self, $value) = @_;
    +    if( defined $value) {
    +	if ($value->isa('Bio::PrimarySeqI')) {
    +	    $self->{'_seq'} = $value;
    +	} else {
    +	    $self->{'_seq'} = Bio::PrimarySeq->new(-seq=>$value, 
    +						   -alphabet=>'protein');
    +	}
    +    }
    +    return $self->{'_seq'};
    +}
    +
    +
    +
    +=head1 _Analyze
    +
    + Title     : _Analyze
    + Usage     : N/A This is an internal method. Not meant to be called from outside
    +           : the package
    +           :
    + Purpose   : calculates sigcleave score and amino acid position for the
    +           : given protein sequence. The score reporting threshold can
    +           : be adjusted by passing in the "threshold" parameter during
    +           : object construction. If no threshold is passed in, the code
    +           : defaults to reporting any scores equal to or above 3.5
    +           :
    + Returns   : nothing. results are added to the object
    + Argument  : none.
    + Throws    : nothing.
    + Comments  : nothing.
    +See Also   : n/a
    +
    +=cut
    +
    +#----------------
    +sub _Analyze {
    +#----------------
    +    my($self) = @_;
    +
    +    my %signals;
    +    my @hitWeight = ();
    +    my @hitsort   = ();
    +    my @hitpos    = ();
    +    my $maxSite   = "";
    +    my $seqPos    = "";
    +    my $istart    = "";
    +    my $iend      = "";
    +    my $icol      = "";
    +    my $i         = "";
    +    my $weight    = "";
    +    my $k         = 0;
    +    my $c         = 0;
    +    my $seqBegin  = 0;
    +    my $pVal      = -13;
    +    my $nVal      = 2;
    +    my $nHits     = 0;
    +    my $seqEnd    = $self->seq->length;
    +    my $pep       = $self->seq->seq;
    +    my $minWeight = $self->threshold;
    +    my $matrix    = $self->matrix;
    +
    +    ## The weight table is keyed by UPPERCASE letters so we uppercase
    +    ## the pep string because we don't want to alter the actual object
    +    ## sequence.
    +
    +    $pep =~ tr/a-z/A-Z/;
    +
    +    for ($seqPos = $seqBegin; $seqPos < $seqEnd; $seqPos++) {
    +	$istart = (0 > $seqPos + $pVal)? 0 : $seqPos + $pVal;
    +	$iend = ($seqPos + $nVal - 1 < $seqEnd)? $seqPos + $nVal - 1 : $seqEnd;
    +	$icol= $iend - $istart + 1;
    +	$weight = 0.00;
    +	for ($k=0; $k<$icol; $k++) {
    +	    $c = substr($pep, $istart + $k, 1);
    +
    +	    ## CD: The if(defined) stuff was put in here because Sigcleave.pm
    +	    ## CD: kept getting warnings about undefined vals during 'make test' ...
    +	    if ($matrix eq 'eucaryotic') {
    +		$weight += $WeightTable_euc{$c}[$k] if defined $WeightTable_euc{$c}[$k];
    +	    } else {
    +		$weight += $WeightTable_pro{$c}[$k] if defined $WeightTable_pro{$c}[$k];
    +	    }
    +	}
    +	$signals{$seqPos+1} = sprintf ("%.1f", $weight)	if $weight >= $minWeight;
    +    }
    +
    +    $self->{"_signal_scores"} = { %signals };
    +}
    +
    +
    +=head1 signals
    +
    + Title     : signals
    + Usage     : %sigcleave_results = $sig->signals;
    +           :
    + Purpose   : Accessor method for sigcleave results
    +           : 
    + Returns   : Associative array. The key value represents the amino acid position
    +           : and the value represents the score. Only scores that
    +           : are greater than or equal to the THRESHOLD value are reported.
    +           : 
    + Argument  : none.
    + Throws    : none.
    + Comments  : none.
    +See Also   : THRESHOLD
    +
    +=cut
    +
    +#----------------
    +sub signals {
    +#----------------
    +    my $self = shift;
    +    my %results;
    +    my $position;
    +
    +    # do the calculations
    +    $self->_Analyze;
    +
    +    foreach $position ( sort keys %{ $self->{'_signal_scores'} } ) {
    +	$results{$position} = $self->{'_signal_scores'}{$position};
    +    }
    +    return %results;
    +}
    +
    +
    +=head1 result_count
    +
    + Title     : result_count
    + Usage     : $count = $sig->result_count;
    +           :
    + Purpose   : Accessor method for sigcleave results
    +           : 
    + Returns   : Integer, number of results above the threshold
    +           : 
    + Argument  : none.
    + Throws    : none.
    + Comments  : none.
    +
    +See Also   : THRESHOLD
    +
    +=cut
    +
    +#----------------
    +sub result_count {
    +#----------------
    +    my $self = shift;
    +    $self->_Analyze;
    +    return keys %{ $self->{'_signal_scores'} };
    +}
    +
    +
    +=head1 pretty_print
    +
    + Title     : pretty_print
    + Usage     : $output = $sig->pretty_print;
    +           : print $sig->pretty_print;
    +           :
    + Purpose   : Emulates the output of the EGCG Sigcleave
    +           : utility.
    +           : 
    + Returns   : A formatted string.
    + Argument  : none.
    + Throws    : none.
    + Comments  : none.
    +See Also   : n/a
    +
    +=cut
    +
    +#----------------
    +sub pretty_print {
    +#----------------
    +    my $self = shift;
    +    my $pos;
    +    my $output;
    +    my $cnt = 1;
    +    my %results  = $self->signals;
    +    my @hits     = keys %results;
    +    my $hitcount = $#hits; $hitcount++;
    +    my $thresh   = $self->threshold;
    +    my $seqlen   = $self->seq->length || 0;
    +    my $name     = $self->seq->id || 'NONAME';
    +    my $pep      = $self->seq->seq;
    +    $pep      =~ tr/a-z/A-Z/;
    +
    +    $output = "SIGCLEAVE of $name from: 1 to $seqlen\n\n";
    +
    +    if ($hitcount > 0) {
    +	$output .= "Report scores over $thresh\n";
    +	foreach $pos ((sort { $results{$b} cmp $results{$a} } keys %results)) {
    +	    my $start = $pos - 15;
    +	    $start = 1 if $start < 1;
    +	    my $sig = substr($pep,$start -1,$pos-$start );
    +
    +	    $output .= sprintf ("Maximum score %1.1f at residue %3d\n",$results{$pos},$pos);
    +	    $output .= "\n";
    +	    $output .= " Sequence:  ";
    +	    $output .= $sig;
    +	    $output .= "-" x (15- length($sig));
    +	    $output .= "-";
    +	    $output .= substr($pep,$pos-1,50);
    +	    $output .= "\n";
    +	    $output .= " " x 12;
    +	    $output .= "| \(signal\)      | \(mature peptide\)\n";
    +	    $output .= sprintf("          %3d             %3d\n\n",$start,$pos);
    +
    +	    if (($hitcount > 1) && ($cnt == 1)) {
    +    		$output .= " Other entries above $thresh\n\n";
    +	    }
    +	    $cnt++;
    + 	}
    +    }
    +    $output;
    +}
    +
    +
    +1;
    +__END__
    +
    +
    +#########################################################################
    +#  End of class 
    +#########################################################################
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Signalp.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Signalp.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,251 @@
    +# Parser module for SignalP Bio::Tools::Signalp
    +#
    +# Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
    +# originally written by Marc Sohrmann (ms2@sanger.ac.uk)
    +# Written in BioPipe by Balamurugan Kumarasamy 
    +# Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org)
    +
    +# You may distribute this module under the same terms as perl itself
    +#
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::SignalP
    +
    +=head1 SYNOPSIS
    +
    + use Bio::Tools::SignalP;
    + my $parser = new Bio::Tools::SignalP(-fh =>$filehandle );
    + while( my $sp_feat = $parser->next_result ) {
    +       #do something
    +       #eg
    +       push @sp_feat, $sp_feat;
    + }
    +
    +=head1 DESCRIPTION
    +
    + Parser for SignalP output
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    + User feedback is an integral part of the evolution of this and other
    + Bioperl modules. Send your comments and suggestions preferably to
    + the Bioperl mailing list.  Your participation is much appreciated.
    +
    + bioperl-l@bioperl.org              - General discussion
    + http://bioperl.org/MailList.shtml  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    + Report bugs to the Bioperl bug tracking system to help us keep track
    + of the bugs and their resolution. Bug reports can be submitted via
    + email or the web:
    +
    + bioperl-bugs@bio.perl.org
    + http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR
    +
    + Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
    + originally written by Marc Sohrmann (ms2@sanger.ac.uk)
    + Written in BioPipe by Balamurugan Kumarasamy 
    + Cared for by the Fugu Informatics team (fuguteam@fugu-sg.org)
    +
    +=head1 APPENDIX
    +
    + The rest of the documentation details each of the object methods.
    + Internal methods are usually preceded with a _
    +
    +
    +=cut
    +
    +package Bio::Tools::Signalp;
    +use vars qw(@ISA);
    +use strict;
    +
    +use Bio::Root::Root;
    +use Bio::SeqFeature::FeaturePair;
    +use Bio::Root::IO;
    +use Bio::SeqFeature::Generic;
    +@ISA = qw(Bio::Root::Root Bio::Root::IO );
    +
    +
    +
    +=head2 new
    +
    + Title   : new
    + Usage   : my $obj = new Bio::Tools::SignalP();
    + Function: Builds a new Bio::Tools::SignalP object
    + Returns : Bio::Tools::SignalP
    + Args    : -fh/-file => $val, # for initing input, see Bio::Root::IO
    +
    +
    +=cut
    +
    +sub new {
    +      my($class,@args) = @_;
    +
    +      my $self = $class->SUPER::new(@args);
    +      $self->_initialize_io(@args);
    +
    +      return $self;
    +}
    +
    +=head2 next_result
    +
    + Title   : next_result
    + Usage   : my $feat = $signalp->next_result
    + Function: Get the next result set from parser data
    + Returns : Bio::SeqFeature::Generic
    + Args    : none
    +
    +
    +=cut
    +
    +sub next_result {
    +        my ($self) = @_;
    +        
    +        my $line;
    +        # parse
    +        my $id;
    +        my ( $fact1, $fact2, $end);
    +        while ($_=$self->_readline()) {
    +           $line = $_;
    +           chomp $line;
    +           
    +           if ($line=~/^\>(\S+)/) {
    +              $id = $1;
    +              $self->seqname($id);
    +              next;
    +           }
    +           elsif ($line=~/max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
    +              $fact1 = $2;
    +              $self->fact1($fact1);
    +              next;
    +           }
    +           elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
    +              $fact2 = $2;
    +                 $fact1 = $self->fact1;
    +                 $id = $self->seqname;
    +              
    +              if ($fact1 eq "YES" && $fact2 eq "YES") {
    +                  
    +                  my $line = $self->_readline();
    +              
    +                  if ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
    +                      $end = $1;
    +                  }
    +                  else {
    +                      $self->throw ("parsing problem in signalp");
    +                  }
    +                  my (%feature);
    +                  $feature{name} = $id;
    +                  $feature{start} = 1;
    +                  $feature{end} = $end;
    +                  $feature{source} = 'Signalp';
    +                  $feature{primary}= 'signal_peptide';
    +                  $feature{program} = 'Signalp';
    +                  $feature{logic_name} = 'signal_peptide';
    +                  
    +                  my $new_feat = $self->create_feature (\%feature);
    +                  return $new_feat;
    +                  
    +              }
    +               next;
    +               
    +           }
    +        
    +           next;
    +        
    +        }
    +        
    +}
    +
    +=head2 create_feature
    +
    + Title   : create_feature
    + Usage   : obj->create_feature(\%feature)
    + Function: Internal(not to be used directly)
    + Returns :
    + Args    :
    +
    +
    +=cut
    +
    +sub create_feature {
    +       my ($self, $feat) = @_;
    +
    +
    +       # create feature object
    +       my $feature = Bio::SeqFeature::Generic->new(
    +                                                 -seq_id=>$feat->{name},
    +                                                 -start       => $feat->{start},
    +                                                 -end         => $feat->{end},
    +                                                 -score       => $feat->{score},
    +                                                 -source      => $feat->{source},
    +                                                 -primary     => $feat->{primary},
    +                                                 -logic_name  => $feat->{logic_name}, 
    +                                               );
    +           
    +
    +          $feature->add_tag_value('evalue',0);
    +          $feature->add_tag_value('percent_id','NULL');
    +          $feature->add_tag_value("hid",$feat->{primary});
    +          
    +          return $feature; 
    +
    +}
    +=head2 seqname
    +
    + Title   : seqname
    + Usage   : obj->seqname($name)
    + Function: Internal(not to be used directly)
    + Returns :
    + Args    :
    +
    +
    +=cut
    +
    +sub seqname{
    +    my ($self,$seqname)=@_;
    +
    +    if (defined$seqname){
    +
    +        $self->{'seqname'}=$seqname;
    +    }
    +
    +    return $self->{'seqname'};
    +
    +}
    +
    +=head2 fact1
    +
    + Title   : fact1
    + Usage   : obj->fact1($fact1)
    + Function: Internal(not to be used directly)
    + Returns :
    + Args    :
    +
    +
    +=cut
    +
    +sub fact1{
    +    my ($self,$fact1)=@_;
    +
    +    if (defined$fact1){
    +
    +       $self->{'fact1'}=$fact1;
    +    }
    +
    +    return $self->{'fact1'};
    +
    +}
    +
    +
    +
    +1;
    +
    +
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Sim4/Exon.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Sim4/Exon.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,185 @@
    +
    +#
    +# BioPerl module for Bio::Tools::Sim4::Exon
    +#
    +# Cared for by Ewan Birney 
    +# and Hilmar Lapp 
    +#
    +# Copyright Ewan Birney, Hilmar Lapp
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::Sim4::Exon - A single exon determined by an alignment
    +
    +=head1 SYNOPSIS
    +
    +  # See Bio::Tools::Sim4::Results for a description of the context.
    +
    +  # an instance of this class is-a Bio::SeqFeature::SimilarityPair
    +
    +  # coordinates of the exon (recommended way):
    +  print "exon from ", $exon->start(),
    +  	" to ", $exon->end(), "\n";
    +
    +  # the same (feature1() inherited from Bio::SeqFeature::FeaturePair)
    +  print "exon from ", $exon->feature1()->start(),
    +  	" to ", $exon->feature1()->end(), "\n";
    +  # also the same (query() inherited from Bio::SeqFeature::SimilarityPair):
    +  print "exon from ", $exon->query()->start(),
    +  	" to ", $exon->query()->end(), "\n";
    +
    +  # coordinates on the matching EST (recommended way):
    +  print "matches on EST from ", $exon->est_hit()->start(),
    +  	" to ", $exon->est_hit()->end(), "\n";
    +
    +  # the same (feature2() inherited from Bio::SeqFeature::FeaturePair)
    +  print "matches on EST from ", $exon->feature2()->start(),
    +  	" to ", $exon->feature2()->end(), "\n";
    +  # also the same (subject() inherited from Bio::SeqFeature::SimilarityPair):
    +  print "exon from ", $exon->subject()->start(),
    +  	" to ", $exon->subject()->end(), "\n";
    +
    +=head1 DESCRIPTION
    +
    +This class inherits from Bio::SeqFeature::SimilarityPair and represents an
    +exon on a genomic sequence determined by similarity, that is, by aligning an
    +EST sequence (using Sim4 in this case). Consequently, the notion of query and
    +subject is always from the perspective of the genomic sequence: query refers
    +to the genomic seq, subject to the aligned EST hit. Because of this,
    +$exon-Estart(), $exon-Eend() etc will always return what you expect. 
    +
    +To get the coordinates on the matching EST, refer to the properties of the
    +feature returned by L().
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this
    +and other Bioperl modules. Send your comments and suggestions preferably
    + to one of the Bioperl mailing lists.
    +Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org          - General discussion
    +  http://bio.perl.org/MailList.html             - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    + the bugs and their resolution.
    + Bug reports can be submitted via email or the web:
    +
    +  bioperl-bugs@bio.perl.org
    +  http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR - Ewan Birney, Hilmar Lapp
    +
    +Email birney@sanger.ac.uk
    +Hilmar Lapp Ehlapp@gmx.netE or Ehilmar.lapp@pharma.novartis.comE.
    +
    +Describe contact details here
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
    +
    +=cut
    +
    +
    +# Let the code begin...
    +
    +
    +package Bio::Tools::Sim4::Exon;
    +use vars qw(@ISA);
    +use strict;
    +
    +use Bio::SeqFeature::FeaturePair;
    +use Bio::SeqFeature::Generic;
    +use Bio::SeqFeature::SimilarityPair;
    +
    +@ISA = qw(Bio::SeqFeature::SimilarityPair);
    +
    +sub new {
    +    my ($class,@args) = @_;
    +    my %param = @args;
    +    my $self = $class->SUPER::new(@args);
    +
    +    my ($prim, $source) = $self->_rearrange([qw(PRIMARY SOURCE)], @args);
    +
    +    $self->primary_tag('exon') unless $prim;
    +    $self->source_tag('Sim4') unless $source;
    +    $self->strand(0) unless defined($self->strand());
    +    $self->query();
    +    return $self; 
    +}
    +
    +=head2 percentage_id
    +
    + Title   : percentage_id
    + Usage   : $obj->percentage_id($newval)
    + Function: This is a synonym for 100 * $obj->est_hit()->frac_identical().
    + Returns : value of percentage_id
    + Args    : newvalue (optional)
    +
    +
    +=cut
    +
    +sub percentage_id {
    +    my ($self, @args) = @_;
    +    my $frac;
    +    my $val;
    +    my $delegated = 0;
    +    
    +    if(@args) {
    +	$frac = $args[0];
    +	$frac /= 100.0 if defined($frac);
    +    }
    +    if($self->query()->can('frac_identical')) {
    +	if(defined($frac)) {
    +	    $self->query()->frac_identical($frac);
    +	}
    +	$val = 100.0 * $self->query()->frac_identical();
    +	$delegated = 1;
    +    }
    +    if($self->est_hit()->can('frac_identical')) {
    +	if(defined($frac)) {
    +	    $self->est_hit()->frac_identical($frac);
    +	}
    +	# this intentiously overwrites previous $val
    +	$val = 100.0 * $self->est_hit()->frac_identical();
    +	$delegated = 1;
    +    }
    +    if(! $delegated) {
    +	if(@args) {
    +	    $val = shift(@args);
    +	    $self->{'percentage_id'} = $val;
    +	} else {
    +	    $val = $self->{'percentage_id'};
    +	}
    +    }
    +    return $val;
    +}
    +
    +=head2 est_hit
    +
    + Title   : est_hit
    + Usage   : $est_feature = $obj->est_hit();
    + Function: Returns the EST hit pointing to (i.e., aligned to by Sim4) this
    +           exon (i.e., genomic region). At present, merely a synonym for
    +           $obj->feature2().
    + Returns : An Bio::SeqFeatureI implementing object.
    + Args    : 
    +
    +
    +=cut
    +
    +sub est_hit {
    +    my $self = shift;
    +    return $self->feature2(@_);
    +}
    +
    +1;
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Sim4/Results.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Sim4/Results.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,435 @@
    +
    +#
    +# BioPerl module for Bio::Tools::Sim4::Results
    +#
    +# Cared for by Ewan Birney 
    +#          and Hilmar Lapp 
    +#
    +# Copyright Ewan Birney and Hilmar Lapp
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +# POD documentation - main docs before the code
    +
    +=head1 NAME
    +
    +Bio::Tools::Sim4::Results - Results of one Sim4 run
    +
    +=head1 SYNOPSIS
    +
    +   # to preset the order of EST and genomic file as given on the sim4 
    +   # command line:
    +   my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4',
    +                                             -estfirst => 1);
    +   # to let the order be determined automatically (by length comparison):
    +   $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' );
    +   # filehandle:
    +   $sim4 = Bio::Tools::Sim4::Results->new( -fh   => \*INPUT );
    +
    +   # parse the results
    +   while(my $exonset = $sim4->next_exonset()) {
    +       # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons
    +       # as sub features
    +       print "Delimited on sequence ", $exonset->seq_id(), 
    +             "from ", $exonset->start(), " to ", $exonset->end(), "\n";
    +       foreach my $exon ( $exonset->sub_SeqFeature() ) {
    +	  # $exon is-a Bio::SeqFeature::FeaturePair
    +	  print "Exon from ", $exon->start, " to ", $exon->end, 
    +                " on strand ", $exon->strand(), "\n";
    +          # you can get out what it matched using the est_hit attribute
    +          my $homol = $exon->est_hit();
    +          print "Matched to sequence ", $homol->seq_id, 
    +                " at ", $homol->start," to ", $homol->end, "\n";
    +      }
    +   }
    +
    +   # essential if you gave a filename at initialization (otherwise the file
    +   # stays open)
    +   $sim4->close();
    +
    +=head1 DESCRIPTION
    +
    +The sim4 module provides a parser and results object for sim4 output. The
    +sim4 results are specialised types of SeqFeatures, meaning you can add them
    +to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner.
    +
    +The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The 
    +$esthit = $exon-Eest_hit() is the alignment as a feature on the matching 
    +object (normally, an EST), in which the start/end points are where the hit
    +lies.
    +
    +To make this module work sensibly you need to run
    +
    +     sim4 genomic.fasta est.database.fasta
    +or
    +     sim4 est.fasta genomic.database.fasta
    +
    +To get the sequence identifiers recorded for the first sequence, too, use
    +A=4 as output option for sim4.
    +
    +One fiddle here is that there are only two real possibilities to the matching
    +criteria: either one sequence needs reversing or not. Because of this, it
    +is impossible to tell whether the match is in the forward or reverse strand
    +of the genomic DNA. We solve this here by assuming that the genomic DNA is
    +always forward. As a consequence, the strand attribute of the matching EST is
    +unknown, and the strand attribute of the genomic DNA (i.e., the Exon object)
    +will reflect the direction of the hit.
    +
    +See the documentation of parse_next_alignment() for abilities of the parser
    +to deal with the different output format options of sim4.
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules. Send your comments and suggestions preferably to one
    +of the Bioperl mailing lists.  Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org          - General discussion
    +  http://bio.perl.org/MailList.html             - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution.  Bug reports can be submitted via email
    +or the web:
    +
    +  bioperl-bugs@bio.perl.org
    +  http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR - Ewan Birney, Hilmar Lapp
    +
    +Email birney@sanger.ac.uk
    +      hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
    +
    +Describe contact details here
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
    +
    +=cut
    +
    +
    +# Let the code begin...
    +
    +
    +package Bio::Tools::Sim4::Results;
    +use vars qw(@ISA);
    +use strict;
    +
    +# Object preamble - inherits from Bio::Root::Object
    +
    +use File::Basename;
    +use Bio::Root::Root;
    +use Bio::Tools::AnalysisResult;
    +use Bio::Tools::Sim4::Exon;
    +
    +@ISA = qw(Bio::Tools::AnalysisResult);
    +
    +
    +sub _initialize_state {
    +    my($self,@args) = @_;
    +
    +    # call the inherited method first
    +    my $make = $self->SUPER::_initialize_state(@args);
    +
    +    my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
    +
    +    delete($self->{'_est_is_first'});
    +    $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
    +    $self->analysis_method("Sim4");
    +}
    +
    +=head2 analysis_method
    +
    + Usage     : $sim4->analysis_method();
    + Purpose   : Inherited method. Overridden to ensure that the name matches
    +             /sim4/i.
    + Returns   : String
    + Argument  : n/a
    +
    +=cut
    +
    +#-------------
    +sub analysis_method { 
    +#-------------
    +    my ($self, $method) = @_;  
    +    if($method && ($method !~ /sim4/i)) {
    +	$self->throw("method $method not supported in " . ref($self));
    +    }
    +    return $self->SUPER::analysis_method($method);
    +}
    +
    +=head2 parse_next_alignment
    +
    + Title   : parse_next_alignment
    + Usage   : @exons = $sim4_result->parse_next_alignment;
    +           foreach $exon (@exons) {
    +               # do something
    +           }
    + Function: Parses the next alignment of the Sim4 result file and returns the
    +           found exons as an array of Bio::Tools::Sim4::Exon objects. Call
    +           this method repeatedly until an empty array is returned to get the
    +           results for all alignments.
    +
    +           The $exon->seq_id() attribute will be set to the identifier of the
    +           respective sequence for both sequences if A=4 was used in the sim4
    +           run, and otherwise for the second sequence only. If the output does
    +           not contain the identifier, the filename stripped of path and 
    +           extension is used instead. In addition, the full filename 
    +           will be recorded for both features ($exon inherits off 
    +           Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length
    +           is accessible via the seqlength() attribute of $exon->query() and
    +           $exon->est_hit().
    +
    +           Note that this method is capable of dealing with outputs generated
    +           with format 0,1,3, and 4 (via the A=n option to sim4). It
    +           automatically determines which of the two sequences has been 
    +           reversed, and adjusts the coordinates for that sequence. It will
    +           also detect whether the EST sequence(s) were given as first or as
    +           second file to sim4, unless this has been specified at creation
    +           time of the object.
    +
    + Example :
    + Returns : An array of Bio::Tools::Sim4::Exon objects
    + Args    :
    +
    +
    +=cut
    +
    +sub parse_next_alignment {
    +   my ($self) = @_;
    +   my @exons = ();
    +   my %seq1props = ();
    +   my %seq2props = ();
    +   # we refer to the properties of each seq by reference
    +   my ($estseq, $genomseq, $to_reverse);
    +   my $started = 0;
    +   my $hit_direction = 1;
    +   my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
    +                       # output yet)
    +   
    +   while(defined($_ = $self->_readline())) {
    +       #chomp();
    +
    +       #
    +       # bascially, each sim4 'hit' starts with seq1...
    +       #
    +       /^seq1/ && do {
    +	   if($started) {
    +	       $self->_pushback($_);
    +	       last;
    +	   }
    +	   $started = 1;
    +
    +	   # filename and length of seq 1
    +	   /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
    +	       $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
    +	   $seq1props{'filename'} = $1;
    +	   $seq1props{'length'} = $2;
    +	   next;
    +       };
    +       /^seq2/ && do {
    +	   # the second hit has also the database name in the >name syntax 
    +	   # (in brackets).
    +	   /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
    +	       $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
    +	   $seq2props{'filename'} = $1;
    +	   $seq2props{'seqname'} = $2;
    +	   $seq2props{'length'} = $3;
    +	   next;
    +       };
    +       if(/^>(\S+)\s*(.*)$/) {
    +	   # output option was A=4, which not only gives the complete
    +	   # description lines, but also causes the longer sequence to be
    +	   # reversed if the second file contained one (genomic) sequence
    +	   $seq1props{'seqname'} = $1;
    +	   $seq1props{'description'} = $2 if $2;
    +	   $output_fmt = 4;
    +	   # we handle seq1 and seq2 both here
    +	   if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
    +	       $seq2props{'seqname'} = $1; # redundant, since already set above
    +	       $seq2props{'description'} = $2 if $2;
    +	   }
    +	   next;
    +       }
    +       /^\(complement\)/ && do {
    +	   $hit_direction = -1;
    +	   next;
    +       };
    +       # this matches
    +       # start-end (start-end) pctid%
    +       if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
    + 	   $seq1props{'start'} = $1;
    + 	   $seq1props{'end'} = $2;
    + 	   $seq2props{'start'} = $3;
    + 	   $seq2props{'end'} = $4;
    +	   my $pctid   = $5;
    +	   
    +	   if(! defined($estseq)) {
    +	       # for the first time here: need to set the references referring
    +	       # to seq1 and seq2 
    +	       if(! exists($self->{'_est_is_first'})) {
    +		   # detect which one is the EST by looking at the lengths,
    +		   # and assume that this holds throughout the entire result
    +		   # file (i.e., when this method is called for the next
    +		   # alignment, this will not be checked again)
    +		   if($seq1props{'length'} > $seq2props{'length'}) {
    +		       $self->{'_est_is_first'} = 0;
    +		   } else {
    +		       $self->{'_est_is_first'} = 1;
    +		   }
    +	       }
    +	       if($self->{'_est_is_first'}) {
    +		   $estseq = \%seq1props;
    +		   $genomseq = \%seq2props;
    +		   # if the EST is given first, A=4 selects the genomic
    +		   # seq for being reversed (reversing the EST is default)
    +		   $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
    +	       } else {
    +		   $estseq = \%seq2props;
    +		   $genomseq = \%seq1props;
    +		   # if the EST is the second, A=4 does not change the
    +		   # seq being reversed (always the EST is reversed)
    +		   $to_reverse = $estseq;
    +	       }
    +	   }
    +	   if($hit_direction == -1) {
    +	       # we have to reverse the coordinates of one of both seqs
    +	       my $tmp = $to_reverse->{'start'};
    +	       $to_reverse->{'start'} =
    +		   $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
    +	       $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
    +	   }
    +	   # create and initialize the exon object
    +	   my $exon = Bio::Tools::Sim4::Exon->new(
    +					    '-start' => $genomseq->{'start'},
    +					    '-end'   => $genomseq->{'end'},
    +					    '-strand' => $hit_direction);
    +	   if(exists($genomseq->{'seqname'})) {
    +	       $exon->seq_id($genomseq->{'seqname'});
    +	   } else {
    +	       # take filename stripped of path as fall back
    +	       my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
    +	       $exon->seq_id($basename);
    +	   }
    +	   $exon->feature1()->add_tag_value('filename',
    +					    $genomseq->{'filename'});
    +	   # feature1 is supposed to be initialized to a Similarity object,
    +           # but we provide a safety net
    +	   if($exon->feature1()->can('seqlength')) {
    +	       $exon->feature1()->seqlength($genomseq->{'length'});
    +	   } else {
    +	       $exon->feature1()->add_tag_value('SeqLength',
    +						$genomseq->{'length'});
    +	   }
    +	   # create and initialize the feature wrapping the 'hit' (the EST)
    +	   my $fea2 = Bio::SeqFeature::Similarity->new(
    +                                            '-start' => $estseq->{'start'},
    +					    '-end'   => $estseq->{'end'},
    +					    '-strand' => 0,
    +					    '-primary' => "aligning_EST");
    +	   if(exists($estseq->{'seqname'})) {
    +	       $fea2->seq_id($estseq->{'seqname'});
    +	   } else {
    +	       # take filename stripped of path as fall back
    +	       my ($basename) =
    +		   &File::Basename::fileparse($estseq->{'filename'}, '\..*');
    +	       $fea2->seq_id($basename);
    +	   }
    +	   $fea2->add_tag_value('filename', $estseq->{'filename'});
    +	   $fea2->seqlength($estseq->{'length'});
    +	   # store
    +	   $exon->est_hit($fea2);	   
    +	   # general properties
    +	   $exon->source_tag($self->analysis_method());
    +	   $exon->percentage_id($pctid);
    +	   $exon->score($exon->percentage_id());
    +	   # push onto array
    +	   push(@exons, $exon);
    +	   next; # back to while loop
    +       }
    +   }
    +   return @exons;
    +}
    +
    +=head2 next_exonset
    +
    + Title   : next_exonset
    + Usage   : $exonset = $sim4_result->parse_next_exonset;
    +           print "Exons start at ", $exonset->start(), 
    +                 "and end at ", $exonset->end(), "\n";
    +           foreach $exon ($exonset->sub_SeqFeature()) {
    +               # do something
    +           }
    + Function: Parses the next alignment of the Sim4 result file and returns the
    +           set of exons as a container of features. The container is itself
    +           a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon
    +           objects as sub features. Start, end, and strand of the container
    +           will represent the total region covered by the exons of this set.
    +
    +           See the documentation of parse_next_alignment() for further
    +           reference about parsing and how the information is stored.
    +
    + Example : 
    + Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon
    +           objects as sub features.
    + Args    :
    +
    +=cut
    +
    +sub next_exonset {
    +    my $self = shift;
    +    my $exonset;
    +
    +    # get the next array of exons
    +    my @exons = $self->parse_next_alignment();
    +    return if($#exons < 0);
    +    # create the container of exons as a feature object itself, with the
    +    # data of the first exon for initialization
    +    $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(),
    +					     '-end' => $exons[0]->end(),
    +					     '-strand' => $exons[0]->strand(),
    +					     '-primary' => "ExonSet");
    +    $exonset->source_tag($exons[0]->source_tag());
    +    $exonset->seq_id($exons[0]->seq_id());
    +    # now add all exons as sub features, with enabling EXPANsion of the region
    +    # covered in total
    +    foreach my $exon (@exons) {
    +	$exonset->add_sub_SeqFeature($exon, 'EXPAND');
    +    }
    +    return $exonset;
    +}
    +
    +=head2 next_feature
    +
    + Title   : next_feature
    + Usage   : while($exonset = $sim4->next_feature()) {
    +                  # do something
    +           }
    + Function: Does the same as L. See there for documentation of
    +           the functionality. Call this method repeatedly until FALSE is
    +           returned.
    +
    +           The returned object is actually a SeqFeatureI implementing object.
    +           This method is required for classes implementing the
    +           SeqAnalysisParserI interface, and is merely an alias for 
    +           next_exonset() at present.
    +
    + Example :
    + Returns : A Bio::SeqFeature::Generic object.
    + Args    :
    +
    +=cut
    +
    +sub next_feature {
    +    my ($self,@args) = @_;
    +    # even though next_exonset doesn't expect any args (and this method
    +    # does neither), we pass on args in order to be prepared if this changes
    +    # ever
    +    return $self->next_exonset(@args);
    +}
    +
    +1;
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/StateMachine/AbstractStateMachine.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/StateMachine/AbstractStateMachine.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,616 @@
    +#-----------------------------------------------------------------
    +# $Id: AbstractStateMachine.pm,v 1.9 2002/10/22 07:38:49 lapp Exp $
    +#
    +# BioPerl module Bio::Tools::StateMachine::AbstractStateMachine
    +#
    +# Cared for by Steve Chervitz 
    +#
    +# You may distribute this module under the same terms as perl itself
    +#-----------------------------------------------------------------
    +
    +=head1 NAME
    +
    +Bio::Tools::StateMachine::AbstractStateMachine - Abstract state machine object
    +
    +=head1 SYNOPSIS
    +
    +Here is a portion of an implementation. For the full example, see
    +examples/state-machine.pl in the Bioperl distribution.
    +
    +    package SimpleStateMachine;
    +
    +    use Bio::Root::Root;
    +    use Bio::Tools::StateMachine::AbstractStateMachine qw($INITIAL_STATE 
    +    							  $FINAL_STATE);
    +    use vars qw( @ISA );
    +
    +    @ISA = qw( Bio::Root::Root
    +    	   Bio::Tools::StateMachine::AbstractStateMachine );
    +
    +    my @state_transitions = (  [ $INITIAL_STATE, 'State1'],
    +    			       [ 'State1', 'State2' ],
    +    			       [ 'State2', $FINAL_STATE]
    +    				);
    +    sub new {
    +       my($caller,@args) = @_;
    +       my $self = $caller->SUPER::new( @args);
    +       $self->_init_state_machine( -transition_table => \@state_transitions );
    +       return $self;
    +    }
    +
    +
    +
    +=head1 DESCRIPTION
    +
    +B provides a generic framework for representing a
    +state machine. This is not an event-based framework where you register
    +handlers to be called when certain events occur. Instead, it provides
    +a set of methods that define the basic logic of an object that has
    +state behavior, that logic being:
    +
    +=over 4
    +
    +=item 1. Check for whether or not a new state has occurred in the external world.
    +
    +=item 2. If so, change the state of the machine to the new state.
    +
    +=item 3. Otherwise, keep checking current conditions for a new state.
    +
    +=item 4. Stop checking for new states if we reach the final state, or if an error occurs.
    +
    +=back
    +
    +A B is just a string representing the name of the state.  A
    +state machine is initialized with a B
    +consisting of a set of allowable transitions, where each B
    +is a two-element array in which the first element is the B, and the second element is the B.  This table permits
    +the AbstractStateMachine to determine if a requested transition is
    +valid.
    +
    +This module is flexible enough to represent both deterministic and
    +non-deterministic finite automata (DFAs and NFAs), but it is fairly
    +new and should be considered experimental.
    +
    +The key methods in AbstractStateMachine that define this logic of
    +operation are:
    +
    +=over 4
    +
    +=item check_for_new_state().
    +
    +Does whatever checking is necessary to determine if a state transition
    +should occur (for example, read a line of input from STDIN). If a
    +transition should occur, a string is returned containing the name of
    +the new state. Otherwise, it returns C.
    +
    +This method B as AbstractStateMachine does not
    +define it (and in fact will throw a NotImplemented exception if you
    +fail to implement it).
    +
    +=item change_state( C )
    +
    +Causes the machine to change its state to the state specified in the
    +argument. change_state() allows you to mapping a state transition to a
    +particular handler method that does whatever processing is needed to
    +deal with the state transition.
    +
    +=item run()
    +
    +This method keeps calling check_for_new_state() and if that method
    +returns a defined value (the name of the state to change to), it then
    +calls change_state( $state ), where $state is the value returned by
    +check_for_new_state().
    +
    +Before calling check_for_new_state(), the run() method checks the
    +current state of the machine and exits the loop if the current state
    +ever becomes $PAUSE_STATE, $ERROR_STATE, or $FINAL_STATE.
    +
    +=item append_input_cache( C )
    +
    +Adds data to a buffer for processing at the next state
    +transition. check_for_new_state() should call
    +append_input_cache() passing it any data it receives while checking
    +for a new state that should be processed later.
    +
    +=item get_input_cache()
    +
    +Retrieves the data stored by calling
    +append_input_cache(). change_state() should call get_input_cache() to
    +obtain the data to be processed for the current transition.
    +
    +=back
    +
    +This object defines an abstract class, meaning that some but not all methods 
    +have been implemented. Subclasses must define the methods not implemented here.
    +These include at a minimum:
    +
    +=over 4
    +
    +=item check_for_new_state()
    +
    +=item change_state()
    +
    +A default simple implementation of change_state() is provided, but
    +subclasses of AbstractStateMachine most likely will want to override
    +this method to do something useful according to the particular state
    +change.
    +
    +=back
    +
    +If your state machine needs to cache input while processing, you will
    +also need to provide implementations of these methods (which are no-op
    +in AbstractStateMachine):
    +
    +=over 3
    +
    +=item append_input_cache
    +
    +=item get_input_cache
    +
    +=item clear_input_cache
    +
    +=back
    +
    +There are some other nuances provided by AbstractStateMachine, such as
    +the ability to pause() and resume() the running of the machine.
    +
    +
    +=head1 EXAMPLES
    +
    +To get a feel for how to use this, have look at scripts in the
    +examples/state-machine directory of the Bioperl distribution. Also
    +have a look at Bio::Tools::StateMachine::IOStateMachine which provides
    +a Bio::Root::IO-based implementation of
    +AbstractStateMachine. Bio::SearchIO::psiblast subclasses
    +IOStateMachine.
    +
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists 
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules.  Send your comments and suggestions preferably to one
    +of the Bioperl mailing lists.  Your participation is much appreciated.
    +
    +    bioperl-l@bioperl.org              - General discussion
    +    http://bio.perl.org/MailList.html  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution. Bug reports can be submitted via email
    +or the web:
    +
    +    bioperl-bugs@bio.perl.org                   
    +    http://bugzilla.bioperl.org/           
    +
    +=head1 AUTHOR 
    +
    +Steve Chervitz, Esac@bioperl.orgE
    +
    +See the L section for where to send bug reports and comments.
    +
    +=head1 ACKNOWLEDGEMENTS
    +
    +I would like to acknowledge my colleagues at Affymetrix for useful
    +feedback.
    +
    +=head1 COPYRIGHT
    +
    +Copyright (c) 2001 Steve Chervitz. All Rights Reserved.
    +
    +This library is free software; you can redistribute it and/or modify
    +it under the same terms as Perl itself.
    +
    +=cut
    +
    +=head1 DISCLAIMER
    +
    +This software is provided "as is" without warranty of any kind.
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods.
    +Internal methods are usually preceded with a _
    +
    +=cut
    +
    +
    +# Let the code begin...
    +
    +package Bio::Tools::StateMachine::AbstractStateMachine;
    +
    +use strict;
    +use Bio::Root::RootI;
    +use Exporter ();
    +
    +use vars qw( @ISA @EXPORT_OK $INITIAL_STATE $FINAL_STATE $PAUSE_STATE $ERROR_STATE );
    +@ISA = qw( Bio::Root::RootI  Exporter );
    +@EXPORT_OK = qw( $INITIAL_STATE $FINAL_STATE $PAUSE_STATE $ERROR_STATE );
    +
    +@Bio::Tools::StateMachine::StateException::ISA = qw( Bio::Root::Exception );
    +
    +$INITIAL_STATE = 'Initial';
    +$FINAL_STATE = 'Final';
    +$PAUSE_STATE = 'Pause';
    +$ERROR_STATE = 'Error';
    +
    +sub _init_state_machine {
    +    my  ($self, @args ) = @_;
    +    my ($transition_table) = $self->_rearrange( [qw(TRANSITION_TABLE)], @args);
    +
    +    $self->verbose and print STDERR "Initializing State Machine...\n";
    +
    +    if($transition_table) {
    +        $self->_set_transition_table( $transition_table );
    +    }
    +
    +    $self->add_transition( $INITIAL_STATE, $FINAL_STATE );
    +    $self->_set_current_state( $INITIAL_STATE );
    +}
    +
    +sub reset {
    +    my $self = shift;
    +    $self->verbose and print STDERR "Resetting state machine\n";
    +    $self->_set_current_state( $INITIAL_STATE );
    +}
    +
    +sub _set_current_state {
    +    my ($self, $state) = @_;
    +    if( defined $state) {
    +	$self->verbose and print STDERR "  setting current state to $state\n";
    +	$self->{'_current_state'} = $state;
    +    }
    +}
    +
    +sub current_state { shift->{'_current_state'} }
    +
    +sub initial_state { $INITIAL_STATE }
    +
    +sub final_state { $FINAL_STATE }
    +
    +sub pause_state { $PAUSE_STATE }
    +
    +sub error_state { $ERROR_STATE }
    +
    +sub resume_state {
    +    my ($self, $state) = @_;
    +    if( $state ) {
    +      $self->{'_resume_state'} = $state;
    +    }
    +    $self->{'_resume_state'};
    +}
    +
    +sub _clear_resume_state {
    +    my $self = shift;
    +    undef $self->{'_resume_state'};
    +}
    +
    +=head2 running
    +
    +The machine is either running or not running. 
    +Once the machine has stopped running, it cannot be re-started.
    +Use pause() to temporarily halt a machine without exiting the run state.
    +
    +=cut 
    +
    +sub running { shift->{'_running'} }
    +
    +sub _set_running {
    +    my $self = shift;
    +    $self->{'_running'} = shift;
    +}
    +
    +sub run {
    +    my ($self, @args) = @_;
    +
    +    my $verbose = $self->verbose;
    +    my $curr_state = $self->current_state;
    +    $self->_set_running( 1 );
    +
    +    while( not ($curr_state eq $PAUSE_STATE ||
    +                $curr_state eq $ERROR_STATE ||
    +                $curr_state eq $FINAL_STATE )) {
    +
    +	$verbose and print STDERR "Current state (run): ${\$self->current_state}\n";
    +
    +        if( my $state = $self->check_for_new_state()) {
    +            $self->change_state( $state );
    +        }
    +
    +        $curr_state = $self->current_state;
    +    }
    +
    +    # Handle EOF situations
    +    if( not ($curr_state eq $PAUSE_STATE ||
    +             $curr_state eq $FINAL_STATE )) {
    +
    +        $self->change_state( $FINAL_STATE );
    +	$self->_set_running( 0 );
    +    }
    +
    +    $verbose and print STDERR "StateMachine Run complete ($curr_state).\n";
    +}
    +
    +# The pause() and resume() methods don't go through change_state()
    +sub pause {
    +    my ($self) = @_;
    +#    print "PAUSING...\n";
    +    $self->resume_state( $self->current_state );
    +    $self->_set_current_state( $PAUSE_STATE );
    +#    print "After pause(): Current state: ${\$self->current_state}\n";
    +}
    +
    +sub paused {
    +    my ($self) = @_;
    +    return $self->current_state eq $PAUSE_STATE;
    +}
    +
    +sub throw{
    +   my ($self,@args) = @_;
    +   $self->_set_current_state( $ERROR_STATE );
    +   $self->_set_running( 0 );
    +   $self->SUPER::throw( @args );
    +}
    +
    +sub error {
    +    my ($self, $err) = @_;
    +    return $self->current_state eq $ERROR_STATE;
    +}
    +
    +sub resume {
    +    my ($self) = @_;
    +
    +    # Don't resume if we're done.
    +    return if $self->current_state eq $FINAL_STATE;
    +
    +#    print "RESUMING...\n";
    +    $self->_set_current_state( $self->resume_state );
    +    $self->_clear_resume_state;
    +    $self->run();
    +}
    +
    +=head2 transition_table
    +
    + Arg      : n/a
    + Returns  : An array of array references to two-element arrays.
    +            Each array ref defines a single transition where
    +            the first element is the name of the "from" state and
    +            the second element is the name of the "to" state.
    +
    + Example  : $sm->transition_table( [ $INITIAL_STATE, 'State1'],
    +				   [ 'State1', 'State2' ],
    +				   [ 'State2', 'State3' ],
    +				   [ 'State3', $FINAL_STATE]
    +				 );
    +
    +=cut
    +
    +sub transition_table {
    +    my ($self) = @_;
    +
    +    return @{$self->{'_transition_table'}};
    +}
    +
    +sub _set_transition_table {
    +    my ($self, $table_ref) = @_;
    +
    +    my $verbose = $self->verbose;
    +    $verbose and print STDERR "Setting state transition table:\n";
    +
    +    if( not ref($table_ref) eq 'ARRAY') {
    +	$self->throw( -class => 'Bio::Root::BadParameter',
    +                      -text => "Can't set state transition table: Arg wasn't an array reference."
    +                    );
    +    }
    +
    +    foreach my $t (@$table_ref) {
    +        if( ref($t) and scalar(@$t) == 2 ) {
    +            push @{$self->{'_transition_table'}->{$t->[0]}}, $t->[1];
    +            $verbose and print STDERR "  adding: $t->[0] -> $t->[1]\n";
    +        }
    +        else {
    +            $self->throw( -class => 'Bio::Root::BadParameter',
    +                          -text => "Can't add state transition from table: Not a 2-element array reference ($t)"
    +                        );
    +        }
    +    }
    +}
    +
    +=head2 add_transition
    +
    + Arg      : Two string arguments where:
    +            First string = name of the "from" state.
    +            Second string = name of the "to" state.
    + Throws   : A Bio::Root::BadParameter exception if two arguments
    +            are not provided.
    +
    +=cut
    +
    +sub add_transition {
    +    my ($self, $from, $to) = @_;
    +
    +    if( defined($from) and defined($to) ) {
    +	push @{$self->{'_transition_table'}->{$from}}, $to;
    +    }
    +    else {
    +	$self->throw( -class => 'Bio::Root::BadParameter',
    +                      -text => "Can't add state transition: Insufficient arguments."
    +                    );
    +    }
    +}
    +
    +
    +=head2 change_state
    +
    + Purpose  : To cause the machine to change its state.
    + Argument : A String containing the name of the the new state.
    + Returns  : n/a
    + Throws   : A Bio::Tools::StateMachine::StateException exception if the
    +            state transition cannot be carried out.
    +
    +This is a default implementation that simply validates the state change
    +(by calling  validate_transition) and then calls finalize_state_change()
    +if the transition is valid.
    +
    +Subclasses of AbstractStateMachine most likely will want to override this 
    +method to do something useful according to the particular state change.
    +
    +=cut
    +
    +sub change_state {
    +    my ($self, $new_state) = @_;
    +
    +    $self->verbose and print STDERR "  changing state to $new_state\n";
    +
    +    if ( $self->validate_transition( $self->current_state, $new_state, 1 ) ) {
    +      $self->finalize_state_change( $new_state, 1 );
    +    }
    +
    +}
    +
    +
    +=head2 get_transitions_from
    +
    + Purpose  : Returns a list array references that have the indicated state
    +            in their 'from' slot.
    +
    +=cut
    +
    +sub get_transitions_from {
    +    my ($self, $state) = @_;
    +
    +    my @trans = ();
    +    if( ref $self->{'_transition_table'}->{$state}) {
    +        @trans = @{$self->{'_transition_table'}->{$state}};
    +    }
    +
    +    return @trans;
    +}
    +
    +=head2 validate_transition
    +
    + Purpose  : Determines if the desired state change is defined within 
    +            the set of registered transitions for this StateMachine.
    + Arg      : Two required arguments:
    +            [0] string defining the name of the "from" state (case sensitive)
    +            [1] string defining the name of the "to" state (case sensitive)
    + Returns  : True if the transition is valid.
    +            If not valid, throws an exception.
    + Throws   : A Bio::Tools::StateMachine::StateException if the desired 
    +            transition does not exist with the registered transitions
    +            for this machine.
    + Throws   : A Bio::Root::BadParameter if insufficient arguments are given.
    +
    +=cut
    +
    +sub validate_transition {
    +    my ($self, $from_state, $to_state ) = @_;
    +
    +    #print STDERR "  validating transition $from_state -> $to_state\n";
    +
    +    if( not( defined($from_state) and defined($to_state))) {
    +        $self->throw( -class => 'Bio::Root::BadParameter',
    +                      -text => "Can't validate state transition: Insufficient arguments.");
    +    }
    +
    +    my $is_valid = 0;
    +
    +    foreach my $t ( $self->get_transitions_from( $from_state ) ) {
    +        if( $t eq $to_state ) {
    +#        if( $t->[1] eq $to_state ) {
    +            $is_valid = 1;
    +            last;
    +        }
    +    }
    +
    +    if( not $is_valid ) {
    +        $self->throw( -class => 'Bio::Tools::StateMachine::StateException',
    +                      -text => "The desired state change is not valid for this machine: $from_state -> $to_state");
    +    }
    +
    +    #print STDERR "  valid!\n";
    +
    +    return $to_state;
    +}
    +
    +=head2 check_for_new_state
    +
    + Purpose : To do whatever checking is necessary to determine if 
    +            a state transition should occur. 
    + Argument : Any necessary data required to determine if the state 
    +            machine should change to a new state.
    + Returns  : A string containing the name of the new state if the 
    +            state machine should change to a new state. 
    +            Otherwise returns undef.
    +
    +This is a virtual method and must be implemented by a subclass to do 
    +whatever checking is necessary to determine if a state transition should occur.
    +If not implemented, calling this method will result in a 
    +Bio::Root::NotImplemented exception.
    +
    +=cut
    +
    +sub check_for_new_state {
    +    my ($self, $data) = @_;
    +    $self->throw_not_implemented;
    +}
    +
    +sub append_input_cache {
    +    my ($self, $data) = @_;
    +}
    +
    +sub get_input_cache {
    +    my $self = shift;
    +}
    +
    +sub clear_input_cache {
    +    my $self = shift;
    +}
    +
    +sub state_change_cache {
    +    my ($self, $data) = @_;
    +    if( defined $data ) {
    +        $self->{'_state_change_cache'} = $data;
    +    }
    +    return $self->{'_state_change_cache'};
    +}
    +
    +sub clear_state_change_cache {
    +    my ($self, $data) = @_;
    +    $self->{'_state_change_cache'} = undef;
    +}
    +
    +
    +=head2 finalize_state_change
    +
    + Purpose  : Performs routine operations to finish changing state.
    +            This method should be called at the end of change_state().
    + Usage    : finalize_state_change( $new_state, $clear_input_cache );
    + Argument : $new_state = the name of the state to change to.
    +            $clear_input_cache = boolean whether or not to zap whatever 
    +                                 was in the input cache. Depends on 
    +                                 the logic of your state machine.
    +
    +=cut
    +
    +sub finalize_state_change {
    +    my ($self, $to_state, $clear_input_cache ) = @_;
    +
    +    if( $self->paused ) {
    +        $self->resume_state( $to_state );
    +    }
    +    else {
    +        $self->_set_current_state( $to_state );
    +    }
    +    $self->clear_input_cache() if $clear_input_cache;
    +    $self->append_input_cache( $self->state_change_cache );
    +    $self->clear_state_change_cache();
    +}
    +
    +
    +1;
    +
    +
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/StateMachine/IOStateMachine.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/StateMachine/IOStateMachine.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,249 @@
    +#-----------------------------------------------------------------
    +# $Id: IOStateMachine.pm,v 1.6 2002/10/22 07:38:49 lapp Exp $
    +#
    +# BioPerl module Bio::Tools::StateMachine::IOStateMachine
    +#
    +# Cared for by Steve Chervitz 
    +#
    +# You may distribute this module under the same terms as perl itself
    +#-----------------------------------------------------------------
    +
    +=head1 NAME
    +
    +Bio::Tools::StateMachine::IOStateMachine - IO-based implementation of AbstractStateMachine
    +
    +=head1 SYNOPSIS
    +
    +    use Bio::Tools::StateMachine::IOStateMachine;
    +
    +    # A state machine that reads input from a file
    +    my $sm = Bio::Tools::StateMachine::IOStateMachine->new( -file => 'data.txt' );
    +
    +    # A state machine that reads input from a STDIN
    +    my $sm = Bio::Tools::StateMachine::IOStateMachine->new();
    +
    +    # A state machine that reads input from a STDIN
    +    # and times out if input doesn't arrive within 30 seconds.
    +    my $sm = Bio::Tools::StateMachine::IOStateMachine->new( -timeout_sec => 30 );
    +
    +
    +=head1 DESCRIPTION
    +
    +An implementation of AbstractStateMachine that samples an input stream
    +to determine whether a state change has occurred.
    +
    +=head1 EXAMPLES
    +
    +To get a feel for how to use this, have look at
    +Bio::SearchIO::psiblast which subclasses IOStateMachine.
    +
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists 
    +
    +User feedback is an integral part of the evolution of this and other
    +Bioperl modules.  Send your comments and suggestions preferably to one
    +of the Bioperl mailing lists.  Your participation is much appreciated.
    +
    +    bioperl-l@bioperl.org              - General discussion
    +    http://bio.perl.org/MailList.html  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +the bugs and their resolution. Bug reports can be submitted via email
    +or the web:
    +
    +    bioperl-bugs@bio.perl.org                   
    +    http://bugzilla.bioperl.org/           
    +
    +=head1 AUTHOR
    +
    +Steve Chervitz, Esac@bioperl.orgE
    +
    +See the L section for where to send bug reports and comments.
    +
    +=head1 COPYRIGHT
    +
    +Copyright (c) 2001 Steve Chervitz. All Rights Reserved.
    +
    +This library is free software; you can redistribute it and/or modify
    +it under the same terms as Perl itself.
    +
    +=cut
    +
    +=head1 DISCLAIMER
    +
    +This software is provided "as is" without warranty of any kind.
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods.
    +Internal methods are usually preceded with a _
    +
    +=cut
    +
    +# Let the code begin...
    +
    +package Bio::Tools::StateMachine::IOStateMachine;
    +
    +use strict;
    +use vars qw( @ISA @EXPORT_OK );
    +
    +use Bio::Root::IO;
    +use Bio::Tools::StateMachine::AbstractStateMachine qw($INITIAL_STATE $FINAL_STATE);
    +
    +@ISA = qw( Bio::Root::IO
    +           Bio::Tools::StateMachine::AbstractStateMachine
    +         );
    +
    +# Propagating the initial and final states from AbstractStateMachine
    +@EXPORT_OK = qw( $INITIAL_STATE $FINAL_STATE );
    +
    +=head2 _init_state_machine()
    +
    + Argument : Named parameter -TIMEOUT_SEC => seconds,
    +            to specify the number of seconds to allow before throwing
    +            an exception if input fails to arrive within that amount of time. 
    +
    +=cut
    +
    +sub _init_state_machine {
    +    my($self, @args) = @_;
    +
    +    $self->SUPER::_init_state_machine(@args);
    +
    +    my ($timeout) = $self->_rearrange( [qw(TIMEOUT_SECS)], @args);
    +
    +    if( defined $timeout ) {
    +	if($timeout =~ /^\d+$/ ) {
    +	    $self->{'_timeout_secs'} = $timeout;
    +	}
    +	else {
    +	    $self->throw(-class =>'Bio::Root::BadParameter',
    +			 -text => "TIMEOUT_SECS must be a number: $timeout",
    +			 -value => $timeout
    +			);
    +	}
    +    }
    +}
    +
    +=head2 check_for_new_state()
    + 
    + Purpose  : Obtains data from the input stream to be checked 
    +            for the existence of a new state.
    + Usage    : check_for_new_state( [$ignore_blank_lines] );
    + Argument : boolean: true if you want to ignore blank lines
    + Returns  : the next chunk of input ($/ is not altered)
    +            If there is no more input, returns undef.
    +
    +Subclasses should override this method and call it to obtain
    +the chunk of data for new state testing. 
    +
    +=cut
    +
    +sub check_for_new_state {
    +    my ($self, $ignore_blank_lines) = @_;
    +
    +    $self->verbose and print STDERR "Checking for new state...\n";
    +
    +    my $chunk = $self->next_input_chunk();
    +
    +    # Determine if we're supposed to ignore blanks and if so, loop
    +    # until we're either out of input or hit a non-blank line.
    +    if( defined $chunk && 
    +	$ignore_blank_lines and $chunk =~ /^\s*$/ ) {
    +        while(  $chunk = $self->next_input_chunk()) {
    +            last unless not $chunk or $chunk =~ /^\s*$/;
    +        }
    +    }
    +
    +    $self->verbose and print STDERR "  Input chunk: " . $chunk, "\n";
    +
    +    return $chunk;
    +}
    +
    +=head2 next_input_chunk()
    +
    + Argument : n/a
    + Returns  : The next chunk of input data from the IO stream
    +            To be used in determining what state the machine should be in.
    +
    +=cut
    +
    +sub next_input_chunk {
    +    my $self = shift;
    +
    +    $self->verbose and print STDERR "Getting next input chunk...\n", ;
    +
    +    if(not defined $self->{'_alarm_available'}) {
    +        $self->_check_if_alarm_available();
    +    }
    +
    +    $SIG{ALRM} = sub { die "Timed out!"; };
    +
    +    my $chunk;
    +
    +    eval {
    +        if( $self->{'_alarm_available'} and defined $self->{'_timeout_secs'}) {
    +	    alarm($self->{'_timeout_secs'});
    +	}
    +
    +        $chunk = $self->_readline();
    +
    +    };
    +    if($@ =~ /Timed out!/) {
    +	 $self->throw(-class => 'Bio::Root::IOException',
    +                      -text => "Timed out while waiting for input (timeout=$self->{'_timeout_secs'}s).");
    +     } elsif($@ =~ /\S/) {
    +         my $err = $@;
    +         $self->throw(-class => 'Bio::Root::IOException',
    +                      -text => "Unexpected error during readline: $err");
    +    }
    +
    +    return $chunk;
    +}
    +
    +
    +
    +# alarm() not available (ActiveState perl for win32 doesn't have it.
    +# See jitterbug PR#98)
    +sub _check_if_alarm_available {
    +    my $self = shift;
    +    eval {
    +        alarm(0);
    +    };
    +    if($@) {
    +        $self->{'_alarm_available'} = 0;
    +    }
    +    else {
    +        $self->{'_alarm_available'} = 1;
    +    }
    +}
    +
    +sub append_input_cache {
    +    my ($self, $data) = @_;
    +    push( @{$self->{'_input_cache'}}, $data) if defined $data;
    +}
    +
    +sub get_input_cache {
    +    my $self = shift;
    +    my @cache =  ();
    +    if( ref $self->{'_input_cache'} ) {
    +       @cache = @{$self->{'_input_cache'}};
    +    }
    +    return @cache;
    +}
    +
    +sub clear_input_cache {
    +    my $self = shift;
    +    @{$self->{'_input_cache'}} = ();
    +}
    +
    +
    +
    +1;
    +
    +
    +
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/Tmhmm.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/Tmhmm.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,213 @@
    +# $Id: Tmhmm.pm,v 1.6 2002/10/22 07:45:22 lapp Exp $
    +#
    +# BioPerl module for Bio::Tools::Tmhmm
    +#
    +# Copyright Balamurugan Kumarasamy
    +#
    +# You may distribute this module under the same terms as perl itself
    +#
    +# POD documentation - main docs before the code
    +#
    +# Copyright 
    +#
    +# You may distribute this module under the same terms as perl itself
    +
    +=head1 NAME
    +
    +Bio::Tools::Tmhmm - parse TmHMM output (transmembrane HMM)
    +
    +=head1 SYNOPSIS
    +
    +  use Bio::Tools::Tmhmm;
    +  my $parser = new Bio::Tools::Tmhmm(-fh =>$filehandle );
    +  while( my $tmhmm_feat = $parser->next_result ) {
    +     #do something
    +     #eg
    +     push @tmhmm_feat, $tmhmm_feat;
    +  }
    +
    +=head1 DESCRIPTION
    +
    +Parser for Tmhmm output
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists
    +
    +user feedback is an integral part of the evolution of this and other
    +Bioperl modules. Send your comments and suggestions preferably to
    +the Bioperl mailing list.  Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org              - General discussion
    +  http://bioperl.org/MailList.shtml  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track
    +of the bugs and their resolution. Bug reports can be submitted via
    +email or the web:
    +
    +  bioperl-bugs@bioperl.org
    +  http://bugzilla.bioperl.org/
    +
    +=head1 AUTHOR - Bala
    +
    +Email savikalpa@fugu-sg.org
    +
    +
    +=head1 APPENDIX
    +
    +The rest of the documentation details each of the object methods.
    +Internal methods are usually preceded with a _
    +
    +=cut
    +
    +package Bio::Tools::Tmhmm;
    +use vars qw(@ISA);
    +use strict;
    +
    +use Bio::Tools::AnalysisResult;
    +use Bio::Root::Root;
    +use Bio::SeqFeature::FeaturePair;
    +use Bio::Root::IO;
    +use Bio::SeqFeature::Generic;
    +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::Tools::AnalysisResult);
    +
    +
    +
    +=head2 new
    +
    + Title   : new
    + Usage   : my $obj = new Bio::Tools::Tmhmm();
    + Function: Builds a new Bio::Tools::Tmhmm object
    + Returns : Bio::Tools::Tmhmm
    + Args    : -fh/-file => $val, # for initing input, see Bio::Root::IO
    +
    +
    +=cut
    +
    +sub new {
    +      my($class,@args) = @_;
    +
    +      my $self = $class->SUPER::new(@args);
    +      $self->_initialize_io(@args);
    +
    +      return $self;
    +}
    +
    +
    +=head2 next_result
    +
    + Title   : next_result
    + Usage   : my $feat = $Tmhmm->next_result
    + Function: Get the next result set from parser data
    + Returns : Bio::SeqFeature::Generic
    + Args    : none
    +
    +
    +=cut
    +
    +sub next_result {
    +        my ($self) = @_;
    +
    +        my $line;
    +
    +        # parse
    +        my $id;
    +        while ($_=$self->_readline()) { 
    +           $line = $_;
    +           chomp $line;
    +
    +
    +           next if /^$/;
    +           if ($line=~/^#\s+(\S+)/) { 
    +                   #if the line starts with a '#' for example in # 13 Length: 522 
    +                   #assign 13 as the id.
    +
    +                    $id = $1;
    +                    my ($junk, $values) = split /:/;
    +                   $self->_seqname($id);
    +                    next;
    +           }
    +
    +           elsif ($line=~/^(\S+)\s+(\S+)\s+(\w+)\s+(\d+)\s+(\d+)/) {
    +
    +                    # Example :-  13      TMHMM2.0        inside       1   120
    +                    # assign $orien(inside) $start(1) and $end(120)
    +
    +
    +                    my $orien = $3;
    +                    my $start = $4;
    +                    my $end = $5;
    +                    $orien = uc ($orien);
    +
    +                    if ($orien eq "TMHELIX") {
    +                         my (%feature);
    +                         $feature{name} = $self->_seqname;
    +                         $feature{start} = $start;
    +                         $feature{end} = $end;
    +                         $feature{source} ='tmhmm';
    +                         $feature{primary}= 'transmembrane';
    +                         $feature{program} ='tmhmm';
    +                         $feature{logic_name} = 'TMHelix';
    +                         my $new_feat= $self->create_feature(\%feature);
    +                         return $new_feat;
    +                    }
    +                    next;
    +           }
    +           next;
    +        }
    +}
    +
    +=head2 create_feature
    +
    + Title   : create_feature
    + Usage   : obj->create_feature(\%feature)
    + Function: Internal(not to be used directly)
    + Returns : A Bio::SeqFeature::Generic object
    + Args    :
    +
    +=cut
    +
    +sub create_feature {
    +       my ($self, $feat) = @_;
    +
    +
    +       # create feature object
    +       my $feature = Bio::SeqFeature::Generic->new(-seq_id => $feat->{name},
    +                                                 -start    => $feat->{start},
    +                                                 -end      => $feat->{end},
    +                                                 -score    => $feat->{score},
    +                                                 -source   => $feat->{source},
    +                                                 -primary  => $feat->{primary},
    +                                                 -logic_name  => $feat->{logic_name}, 
    +                                               );
    +       return $feature;
    +   }
    +
    +=head2 _seqname
    +
    + Title   :   _seqname
    + Usage   :   obj->_seqname($seqname)
    + Function:   Internal(not to be used directly)
    + Returns :
    + Args    :   seqname
    +
    +=cut
    +
    +sub _seqname{
    +    my ($self,$seqname)=@_;
    +
    +    if (defined $seqname){
    +
    +        $self->{'seqname'}=$seqname;
    +    }
    +
    +    return $self->{'seqname'};
    +
    +}
    +
    +
    +1;
    +
    +
    diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/WWW.pm
    --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    +++ b/variant_effect_predictor/Bio/Tools/WWW.pm	Thu Apr 11 02:01:53 2013 -0400
    @@ -0,0 +1,1023 @@
    +#-----------------------------------------------------------------------------
    +# PACKAGE : Bio::Tools::WWW
    +# PURPOSE : To encapsulate commonly used URLs for web key websites in bioinformatics.
    +# AUTHOR  : Steve Chervitz
    +# CREATED : 27 Aug 1996 
    +# REVISION: $Id: WWW.pm,v 1.12 2002/10/22 07:38:46 lapp Exp $
    +#
    +# For documentation, run this module through pod2html 
    +# (preferably from Perl v5.004 or better).
    +#
    +# MODIFIED: 
    +#  0.014, sac --- Mon Aug 31 19:41:44 1998
    +#      * Updated and added a few URLs.
    +#      * Added method strip_html().
    +#      * Documentation changes.
    +#
    +#-----------------------------------------------------------------------------
    +
    +package	 Bio::Tools::WWW;
    +use strict;  
    +use Bio::Root::Root;
    +use Exporter      ();
    +use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS $ID $VERSION $BioWWW $Revision 
    +	    $AUTHORITY);
    +$AUTHORITY = 'nobody@localhost';
    +@ISA         = qw( Bio::Root::Root Exporter);
    +@EXPORT_OK   = qw($BioWWW);
    +%EXPORT_TAGS = ( obj => [qw($BioWWW)],
    +		 std => [qw($BioWWW)]);
    +
    +$ID = 'Bio::Tools::WWW';
    +$VERSION = 0.014;
    +$Revision = '$Id: WWW.pm,v 1.12 2002/10/22 07:38:46 lapp Exp $'; #'
    +
    +## Static object.
    +$BioWWW = {};
    +bless $BioWWW, $ID;
    +$BioWWW->{'_name'} = "Static $ID object";
    +
    +
    +## POD Documentation:
    +
    +=head1 NAME
    +
    +Bio::Tools::WWW - Bioperl manager for web resources related to biology.
    +
    +=head1 SYNOPSIS
    +
    +=head2 Object Creation
    +
    +    use Bio::Tools qw(:obj);
    +
    +    $pdb = $BioWWW->home_url('pdb');
    +
    +There is no need to create a new Bio::Tools::WWW.pm object when the
    +C<:obj> tag is used. This tag will import the static $BioWWW object
    +created by Bio::Tools::WWW.pm into your name space. This saves you
    +from having to call C.
    +
    +You are free to not use the :obj tag and create the object as you
    +like, but a Bio::Tools::WWW object is not configurable; any given
    +script only needs a single copy.
    +
    +=head1 INSTALLATION
    +
    +This module is included with the central Bioperl distribution:
    +
    +   http://bio.perl.org/Core/Latest
    +   ftp://bio.perl.org/pub/DIST
    +
    +You also need to define URLs for the following variables in this package:
    +
    +  $Not_found_url : Generic page to show in place of a 404 error.
    +  $Tmp_url       : Web-accessible site that is Used for scripts that 
    +                   need to generate temporary, web-accessible files.
    +                   The files need not necessarily be HTML files, but 
    +                   being on the same disk as the server will permit 
    +                   faster IO from server scripts.
    +
    +=head1 DESCRIPTION
    +
    +Bio::Tools::WWW is primarily a URL broker for a select set 
    +of sites related to bioinformatics/genome analysis. It 
    +definitely represents a biased, unexhaustive set.
    +It might be more accurate to call this module 
    +"Bio::Tools::URL.pm". But this module does handle some non-URL
    +things and it may do more of this in the future. Having one
    +module to cover all biologically relevant web utilities
    +makes it more convenient, especially at this early stage
    +of development. 
    +
    +Maintaining accurate URLs over time can be challenging as 
    +new web sites spring up and old sites are re-organized. Because
    +of this fact, the URLs in this module are not guaranteed to be
    +correct or exhaustive and will require periodic updating.
    +
    +=head2 URL Management
    +
    +By keeping URL management within Bio::Tools::WWW.pm, other generic 
    +modules can easily access a variety of different web sites without 
    +having to know about a potential multitude of specific modules 
    +specialized for one database or another. An alternative approach would
    +be to have addresses defined within modules specialized for different
    +web sites. This, however, may create maintenance headaches when updating
    +these addresses.
    +
    +=head2 Complex Websites
    +
    +Websites with complex datasets may require special treatment
    +within this module. As an example,
    +URLs for the Saccharomyces Genome Database are clustered
    +separately in this module, due to (1) the different ways to
    +access information at this database and (2) the familiarity 
    +of the developer with this database. The Bio::SGD::WWW.pm inherits from
    +Bio::Tools::WWW.pm to permit access to the URLs provided by Bio::Tools::WWW.pm
    +and to SGD-specific HTML and images. 
    +
    +The organization of Bio::Tools::WWW.pm is expected to evolve as 
    +websites get born, die, and mutate their APIs.
    +
    +=head1 SEE ALSO
    +
    + http://bio.perl.org/Projects/modules.html  - Online module documentation
    + http://bio.perl.org/                       - Bioperl Project Homepage
    +
    +=head1 FEEDBACK
    +
    +=head2 Mailing Lists 
    +
    +User feedback is an integral part of the evolution of this and other Bioperl modules.
    +Send your comments and suggestions preferably to one of the Bioperl mailing lists.
    +Your participation is much appreciated.
    +
    +  bioperl-l@bioperl.org                  - General discussion
    +  http://www.bioperl.org/MailList.shtml  - About the mailing lists
    +
    +=head2 Reporting Bugs
    +
    +Report bugs to the Bioperl bug tracking system to help us keep track the bugs and 
    +their resolution. Bug reports can be submitted via email or the web:
    +
    +    bioperl-bugs@bio.perl.org                   
    +    http://bugzilla.bioperl.org/           
    +
    +=head1 AUTHOR
    +
    +Steve Chervitz, sac@bioperl.org
    +
    +=head1 VERSION
    +
    +Bio::Tools::WWW.pm, 0.014
    +
    +=head1 COPYRIGHT
    +
    +Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved.
    +This module is free software; you can redistribute it and/or 
    +modify it under the same terms as Perl itself.
    +
    +
    +=cut
    +
    +
    +#
    +##
    +###
    +#### END of main POD documentation.
    +###
    +##
    +#
    +
    +
    +############################  DATA ##################################
    +
    +### Database homepage links.
    +my %Home_url =
    +    (
    +     'bioperl'         =>'http://bio.perl.org/',
    +     'bioperl-stanford'=>'http://genome-www.stanford.edu/perlOOP/bioperl/',
    +     'bioperl-schema'  =>'http://bio.perl.org/Projects/Schema/',
    +     'biomoo'          =>'http://bioinformatics.weizmann.ac.il/BioMOO/',
    +     'blast_ncbi'      =>'http://www.ncbi.nlm.nih.gov/BLAST/',
    +     'blast_wu'        =>'http://blast.wustl.edu/',
    +     'bsm'             =>'http://www.biochem.ucl.ac.uk/bsm/',
    +     'clustal'         =>'http://www.csc.fi/molbio/progs/clustalw/clustalw.html',
    +     'ebi'             =>'http://www.ebi.ac.uk/',
    +     'emotif'          =>'http://motif.Stanford.EDU/emotif',
    +     'entrez'          =>'http://www3.ncbi.nlm.nih.gov/Entrez/',
    +     'expasy'          =>'http://www.expasy.ch/',
    +     'gdb'             =>'http://www.gdb.org/',  # R.I.P. (Jan 1998); site still functional
    +     'mips'            =>'http://speedy.mips.biochem.mpg.de/',
    +     'mmdb'            =>'http://www.ncbi.nlm.nih.gov/Structure/',
    +     'modbase'         =>'http://guitar.rockefeller.edu/',
    +     'ncbi'            =>'http://www.ncbi.nlm.nih.gov/',
    +     'pedant'          =>'http://pedant.mips.biochem.mpg.de',
    +     'phylip'          =>'http://evolution.genetics.washington.edu/phylip.html',
    +     'pir'             =>'http://www-nbrf.georgetown.edu/pir/',
    +     'pfam'            =>'http://pfam.wustl.edu/',
    +     'pfam_uk'         =>'http://www.sanger.ac.uk/Software/Pfam/',
    +     'pfam_us'         =>'http://pfam.wustl.edu/',
    +     'pdb'             =>'http://www.pdb.bnl.gov/',
    +     'presage'         =>'http://presage.stanford.edu/',
    +     'geneQuiz'        =>'http://www.sander.ebi.ac.uk/genequiz/genomes/sc/',
    +     'molMov'          =>'http://bioinfo.mbb.yale.edu/MolMovDB/',
    +#     'protMot'         =>'http://bioinfo.mbb.yale.edu/ProtMotDB/', # old, use molMov instead
    +     'pubmed'          =>'http://www.ncbi.nlm.nih.gov/PubMed/',
    +     'sacch3d'         =>'http://genome-www.stanford.edu/Sacch3D/',
    +     'sgd'             =>'http://genome-www.stanford.edu/Saccharomyces/',
    +#     'scop'            =>'http://www.pdb.bnl.gov/scop/',
    +     'scop'            =>'http://scop.stanford.edu/scop/',
    +     'swissProt'       =>'http://www.expasy.ch/sprot/sprot-top.html',
    +     'webmol'          =>'http://genome-www.stanford.edu/structure/webmol/',
    +     'ypd'             =>'http://quest7.proteome.com/YPDhome.html',
    +     );
    +
    +### Database access CGI stems. (For some DBs the home URL can be used as the CGI stem)
    +my %Stem_url = 
    +    ( 
    +      'emotif'      =>'http://dna.Stanford.EDU/cgi-bin/emotif/',
    +      'entrez'      =>'http://www3.ncbi.nlm.nih.gov/htbin-post/Entrez/query?',
    +      'pdb'         =>'http://www.pdb.bnl.gov/pdb-bin/',
    +      'pfam_uk'     =>'http://www.sanger.ac.uk/cgi-bin/Pfam/',
    +      'pfam_us'     =>'http://pfam.wustl.edu/cgi-bin/',
    +      'pir'         =>'http://www-nbrf.georgetown.edu/cgi-bin/nbrfget?',
    +      );
    +
    +
    +### Database access stems/links.
    +my %Search_url = 
    +    ( #'3db'       =>'http://pdb.pdb.bnl.gov/cgi-bin/pdbids?3DB_ID=',   # Former stem
    +      '3db'          =>$Stem_url{'pdb'}.'opdbshort?oPDBid=',  # New stem (aug 1997)
    +      'embl'         =>$Home_url{'ebi'}.'htbin/emblfetch?',
    +      'expasy'       =>$Home_url{'expasy'}.'cgi-bin/',  # program name and query string must be supplied.
    +      'cath'         =>$Home_url{'bsm'}.'cath/CATHSrch.pl?type=PDB&query=',
    +      'cog_seq'      =>$Home_url{'ncbi'}.'cgi-bin/COG/nph-cognitor?seq=', # add sequence
    +      # To cog_orf, append ORF name ('YAL005c'). Case-sensitive! YAL005C won't work!
    +      'cog_orf'      =>$Home_url{'ncbi'}.'cgi-bin/COG/cogeseq?', 
    +      'ec1'          =>$Home_url{'gdb'}.'bin/bio/wais_q-bio?object_class_key=30&jhu_id=',
    +      'ec2'          =>$Home_url{'bsm'}.'enzymes/',
    +      'ec3'          =>$Home_url{'expasy'}.'cgi-bin/get-enzyme-entry?',
    +      'emotif_id'    =>$Stem_url{'emotif'}.'nph-identify?sequence=',
    +      'entrez'       =>$Stem_url{'entrez'}."db=p_r?db=1&choseninfo=ORF_NAME%20[Gene%20Name]\@1\@1&form=4&field=Gene%20Name&mode=0&retrievestring=ORF_NAME%20[Gene%20Name]",
    +      'gb_n'         =>$Stem_url{'entrez'}."db=n&form=6&dopt=g&uid=",
    +      'gb_p'         =>$Stem_url{'entrez'}."db=p&form=6&dopt=g&uid=",
    +      'gb_struct'    =>$Stem_url{'entrez'}."db=t&form=6&dopt=s&uid=",
    +      'pdb'          =>$Stem_url{'pdb'}.'send-text?filename=',
    +      'medline'      =>$Stem_url{'entrez'}.'form=6&db=m&Dopt=r&uid=',
    +      'mmdb'         =>$Stem_url{'entrez'}.'db=t&form=6&Dopt=s&uid=',
    +      'modbase_orf'  =>$Home_url{'modbase'}.'gm-cgi-bin/orf_page.cgi?pg1=0.5&pg2=1.0&orf=',
    +      # To the modbase_model, append yeast ORF name &pdb=<4-LETTER_CODE>&chain=
    +      'modbase_model' =>$Home_url{'modbase'}.'gm-cgi-bin/model_page.cgi?pg1=0.5&pg2=1.0&orf=',
    +      'molMov'       =>$Home_url{'molMov'}.'search.cgi?pdb=',
    +      'pdb'          =>$Stem_url{'pdb'}.'opdbshort?oPDBid=',  # same as 3db
    +      'pdb_coord'    =>$Stem_url{'pdb'}.'send-pdb?filename=', # retrieves full coordinate file
    +      'pfam'         =>$Home_url{'pfam'}.'cgi-bin/nph-hmm_search?evalue=1.0&protseq=',  # default: seq search, US
    +      'pfam_sp_uk'   =>$Stem_url{'pfam_uk'}.'swisspfamget.pl?name=',
    +      'pfam_seq_uk'  =>$Stem_url{'pfam_uk'}.'nph-search.cgi?evalue=1.0&type=normal&protseq=',
    +      'pfam_sp_us'   =>$Stem_url{'pfam_us'}.'getswisspfam?key=',
    +      'pfam_seq_us'  =>$Stem_url{'pfam_us'}.'nph-hmm_search?evalue=1.0&protseq=',
    +      'pfam_form'    =>$Home_url{'pfam'}.'cgi-bin/hmm_page.cgi', # interactive search form
    +      'pir_id'       =>$Stem_url{'pir'}.'fmt=c&xref=0&id=',
    +      'pir_acc'      =>$Stem_url{'pir'}.'fmt=c&xref=1&id=',
    +      'pir_uid'      =>$Stem_url{'pir'}.'uid=',
    +      'pdbSum'       =>$Home_url{'bsm'}.'cath/GetPDBSUMCODE.pl?code=',
    +#      'protMot'      =>$Home_url{'protMot'}.'search.cgi?pdb=', # old, use molMov instead
    +      'presage_sp'   =>$Home_url{'presage'}.'search.cgi?spac=',
    +      'swpr'         =>$Home_url{'expasy'}.'cgi-bin/get-sprot-entry?',
    +      'swModel'      =>$Home_url{'expasy'}.'cgi-bin/sprot-swmodel-sub?',
    +      'swprSearch'   =>$Home_url{'expasy'}.'cgi-bin/sprot-search-ful?',
    +      
    +      ###  SCOP tlev options can be appended to the stem after adding a PDB ID.
    +      ###  tlev options are: 'dm'(domain), 'sf'(superfamily), 'fa'(family), 'cf'(common fold), 'cl'(class)
    +      ###  E.g., search.cgi?pdb=1ARD;tlev=dm
    +
    +      'scop'         =>$Home_url{'scop'}.'search.cgi?pdb=',  ### better to use scop_pdb.
    +      'scop_pdb'     =>$Home_url{'scop'}.'search.cgi?pdb=',
    +      'scop_data'    =>$Home_url{'scop'}.'data/scop.',  ### Deprecated: frequent changes.
    +
    +      ## Search URLs for SGD/Sacch3D are contained %SGD_url and %S3d_url (below).
    +
    +      # For wormpep, the query string MUST end with "&keyword=" (after appending a sequence ID)
    +      'wormpep'        =>'http://www.sanger.ac.uk/cgi-bin/wormpep_fetch.pl?entry=', 
    +      'wormace'        =>'http://webace.sanger.ac.uk/cgi-bin/webace?db=wormace&class=Sequence&text=yes&object=',
    +
    +      ### YPD: You must use a valid gene name or ORF name (IFF there is no gene name).
    +      ###      For this reason it is most convenient to use SGD's Protein_Info link
    +      ###      which can accept either and will provide a proper link to YPD.
    +      'ypd'          =>'http://quest7.proteome.com/YPD/',  
    +      );
    +
    +
    +
    +### CGI stems for SGD and Sacch3D.
    +my %SGD_stem_url =
    +    ('stanford'      =>'http://genome-www.stanford.edu/',
    +     'sgd'           =>'http://genome-www.stanford.edu/cgi-bin/SGD/',  
    +     'sgd2'          =>'http://genome-www2.stanford.edu/cgi-bin/SGD/', 
    +     's3d'           =>'http://genome-www.stanford.edu/cgi-bin/SGD/Sacch3D/',  
    +     's3d2'          =>'http://genome-www2.stanford.edu/cgi-bin/SGD/Sacch3D/',  
    +     's3d3'          =>'http://genome-www3.stanford.edu/cgi-bin/SGD/Sacch3D/',  
    +     'sacchdb'       =>'http://genome-www.stanford.edu/cgi-bin/dbrun/SacchDB?',  
    +     );
    +
    +### SGD stems and links.
    +my %SGD_url = 
    +    ('home'         =>$Home_url{'sgd'},
    +     'help'         =>$Home_url{'sgd'}.'help/',
    +     'mammal'       =>$Home_url{'sgd'}.'mammal/',  
    +     'worm'         =>$Home_url{'sgd'}.'worm/',  
    +     'gene'         =>$SGD_stem_url{'sacchdb'}.'find+Locus+',
    +     'locus'        =>$SGD_stem_url{'sacchdb'}.'find+Locus+',
    +     'orf'          =>$SGD_stem_url{'sacchdb'}.'find+Locus+',
    +     'mipsorf'      =>$SGD_stem_url{'sgd'}."mips-orfs?",
    +     'gene_info'    =>$SGD_stem_url{'sacchdb'}.'find+Gene_Info+',
    +     'prot_info'    =>$SGD_stem_url{'sacchdb'}.'find+Protein_Info+',
    +     'seq'          =>$SGD_stem_url{'sgd'}.'seqDisplay?seq=',
    +     'gi'           =>$SGD_stem_url{'sacchdb'}.'find+Sequence+Database+=+GenPept+AND+NEXT+=+',
    +     'chr'          =>$SGD_stem_url{'sgd2'}.'seqTools?chr=',
    +     'chr_old'      =>$SGD_stem_url{'sgd'}.'dnaredir?chr=',
    +     'seq_an'       =>$SGD_stem_url{'sgd2'}.'seqTools?seqname=',
    +     'seq_an_old'   =>$SGD_stem_url{'sgd'}.'dnaredir?seqname=',
    +     'map_chr'      =>$SGD_stem_url{'sgd'}.'ORFMAP/ORFmap?chr=',
    +     'map_orf'      =>$SGD_stem_url{'sgd'}.'ORFMAP/ORFmap?seq=',
    +#     'chr'          =>$SGD_stem_url{'sgd2'}.'seqform?chr=',
    +#     'seg'          =>$SGD_stem_url{'sgd2'}.'seqform?seg=',
    +#     'fea'          =>$SGD_stem_url{'sgd2'}.'featureform?seg=',
    +     'feature'      =>$SGD_stem_url{'sgd2'}.'featureform?chr=', # complete with "5&beg=100&end=400"
    +     'search'       =>$SGD_stem_url{'sgd'}.'search?',
    +     'images'       =>$SGD_stem_url{'stanford'}.'images/',
    +     'suggest'      =>$SGD_stem_url{'stanford'}.'forms/sgd-suggestion.html',
    +     'tmp'          =>$SGD_stem_url{'stanford'}.'tmp/',
    +     );
    +
    +
    +### Sacch3D stems and links.
    +my %S3d_url =
    +    ('home'          =>$Home_url{'sacch3d'},
    +     'search'        =>$Home_url{'sacch3d'}.'search.html',
    +     'help'          =>$Home_url{'sacch3d'}.'help/',
    +     'new'           =>$Home_url{'sacch3d'}.'new/',
    +     'chrm'          =>$Home_url{'sacch3d'}.'data/chr',  
    +     'domains'       =>$Home_url{'sacch3d'}.'domains/',  
    +     'genequiz'      =>$Home_url{'sacch3d'}.'genequiz/',  
    +     'analysis'      =>$Home_url{'sacch3d'}.'analysis/',  
    +     'scop'          =>$SGD_stem_url{'s3d3'}.'getscop?data=',  
    +     'scop_fold'     =>$SGD_stem_url{'s3d3'}.'getscop?type=fold&data=',  
    +     'scop_class'    =>$SGD_stem_url{'s3d3'}.'getscop?type=class&data=',  
    +     'scop_gene'     =>$SGD_stem_url{'s3d3'}.'getscop?type=gene&data=',  
    +     'gene'          =>$SGD_stem_url{'s3d'}.'get?class=gene&item=',
    +     'orf'           =>$SGD_stem_url{'s3d'}.'get?class=orf&item=',
    +     'text'          =>$SGD_stem_url{'s3d'}.'get?class=text&item=',
    +     'pdb'           =>$SGD_stem_url{'s3d'}.'get?class=pdb&item=',
    +     'pdb_coord'     =>$SGD_stem_url{'s3d'}.'pdbcoord.pl?id=',
    +     'dsc'           =>$SGD_stem_url{'s3d'}.'dsc.pl?gene=',
    +     'emotif'        =>$SGD_stem_url{'s3d'}.'seq_search.pl?db=emotif&gene=',
    +     'pfam'          =>$SGD_stem_url{'s3d'}.'seq_search.pl?db=pfam&gene=',
    +     'pfam_uk'       =>$SGD_stem_url{'s3d'}.'seq_search.pl?db=pfam&loc=uk&gene=',
    +     'pfam_us'       =>$SGD_stem_url{'s3d'}.'seq_search.pl?db=pfam&loc=us&gene=',
    +     'blast_pdb'     =>$SGD_stem_url{'s3d'}.'getblast?db=pdb&name=',
    +     'blast_nr'      =>$SGD_stem_url{'s3d'}.'getblast?db=nr&name=',
    +     'blast_est'     =>$SGD_stem_url{'s3d'}.'getblast?db=est&name=',
    +     'blast_mammal'  =>$SGD_stem_url{'s3d'}.'getblast?db=mammal&name=',
    +     'blast_human'   =>$SGD_stem_url{'s3d'}.'getblast?db=human&name=',
    +     'blast_worm'    =>$SGD_stem_url{'s3d'}.'getblast?db=worm&name=',
    +     'blast_yeast'   =>$SGD_stem_url{'s3d'}.'getblast?db=yeast&name=',
    +     'blast_worm_yeast'=>$SGD_stem_url{'s3d'}.'getblast?db=worm&query=worm&name=',
    +     'patmatch'      =>$SGD_stem_url{'s3d2'}.'grepmatch?',  ## deprecated
    +     'grepmatch'     =>$SGD_stem_url{'s3d2'}.'grepmatch?',
    +     'pdb_neighbors' =>$SGD_stem_url{'s3d'}.'pdb_neighbors?id=CHAIN&gene=ORF_NAME',
    +     );
    +
    +
    +### 3D viewer stems.
    +my %Viewer_url = 
    +#    ('java'     =>$SGD_stem_url{'sgd'}.'Sacch3D/pdbViewer.pl?pdbCode=PDB&orf=',
    +    (
    +     'java'     =>$SGD_stem_url{'sgd'}.'Sacch3D/pdbViewer.pl?pdbCode=',  # Default java viewer
    +     'webmol'   =>$SGD_stem_url{'sgd'}.'Sacch3D/pdbViewer.pl?pdbCode=', 
    +     'codebase' =>$SGD_stem_url{'stanford'}.'structure/webmol/lib',
    +     'rasmol'   =>$Stem_url{'pdb'}.'send-ras?filename=',
    +     'chime'    =>$Stem_url{'pdb'}.'ccpeek?id=',
    +     'cn3d'     =>$Stem_url{'entrez'}.'db=t&form=6&Dopt=i&Complexity=Cn3D+Subset&uid=',
    +     'kinemage' =>'http://prosci.org/Kinemage',
    +     );
    +
    +
    +### Stock HTML
    +# The error reporting HTML strings represent some experiments in human psychology: 
    +# how do you induce users to report errors that you should know about yet not
    +# get flooded with trivial problems caused by novices?
    +my %Html = 
    +    ('authority'  =>qq|$AUTHORITY|,
    +     'trouble'    => <<"QQ_TROUBLE_QQ",
    +

    If this problem persists, please notify us. +Include a copy of this error page with your message. Thanks.

    +QQ_TROUBLE_QQ + 'notify' => <<"QQ_NOTIFY_QQ", +Please notify us. +Include a copy of this error page with your message. Thanks.

    +QQ_NOTIFY_QQ + 'ourFault' => <<"QQ_FAULT_QQ", +

    This is our fault! There is apparently a problem with our software +that we may not know about. Please notify us! +Include a copy of this error page with your message. Thanks.

    +QQ_FAULT_QQ + 'techDiff' => <<"QQ_TECH_QQ", +

    We are experiencing technical difficulties now.
    +We will have the problem fixed soon. Sorry for any inconvenience.

    +QQ_TECH_QQ + + ); + + +### Miscellaneous URLs. Configure as desired for your site. +my $Not_found_url = 'http://genome-www.stanford.edu/Sacch3D/notfound.html'; +my $Tmp_url = 'http://genome-www.stanford.edu/tmp/'; + + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B considered part of the public interface and are described here +for documentation purposes only. + +=cut + +######################################################################### +## ACCESSOR METHODS +######################################################################### + + +=head2 home_url + + Usage : $BioWWW->home_url() + Purpose : To obtain the homepage URL for a biological database or resource. + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments are: + : bioperl bioperl-schema biomoo bsm ebi emotif entrez + : expasy mips mmdb ncbi pir pfam pdb geneQuiz + : molMov pubmed sacch3d sgd scop swissProt webmol ypd + Throws : Warns if argument cannot be resolved to a URL. + Comments : The URLs listed here do not represent a complete list. + : Expect this to evolve and grow with time. + +See Also : L() + +=cut + +#------------- +sub home_url { +#------------- + my($self,$arg) = @_; + $arg eq 'all' and return %Home_url; + (exists $Home_url{$arg}) ? $Home_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 search_url + + Usage : $BioWWW->search_url() + Purpose : To provide a URL stem for a search engine at a biological database + : or resource. + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments are: + : 3db embl cath ec1 ec2 ec3 emotif_id entrez gb1 gb2 + : gb3 gb4 gb5 pdb medline mmdb pdb pdb_coord pfam pir_acc + : pdbSum molMov swpr swModel swprSearch scop scop_pdb scop_data + : ypd + Throws : Warns if argument cannot be resolved to a URL. + Comments : Unlike the homepage URLs, this method does not return a complete + : URL but a stem which must be further modified, typically by + : appending data to it, before it can be used. The data appended + : depends on the specific URL; typically, it is a database ID or + : other unique identifier. + : The requirements for each URL will be described here eventually. + : + : The URLs listed here do not represent a complete list. + : Expect this to evolve and grow with time. + : + : Given this complexity, it may be useful to provide special methods + : for these different URLs. This would however result in an + : explosion of methods that might make this module less + : maintainable and harder to use. + +See Also : L() + +=cut + +#-------------- +sub search_url { +#-------------- + my($self,$arg) = @_; + $arg eq 'all' and return %Search_url; + (exists $Search_url{$arg}) ? $Search_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 stem_url + + Usage : $BioWWW->stem_url() + Purpose : To obtain the minimal stem URL for searching a biological database or resource. + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments are: + : emotif entrez pdb + Throws : Warns if argument cannot be resolved to a URL. + Comments : The URLs stems returned by this method are much more minimal than + : this provided by search_url(). Use of these stems requires knowledge + : of the CGI scripts which they invoke. + +See Also : L() + +=cut + +#-------------- +sub stem_url { +#-------------- + my($self,$arg) = @_; + $arg eq 'all' and return %Stem_url; + (exists $Stem_url{$arg}) ? $Stem_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 viewer_url + + Usage : $BioWWW->viewer_url() + Purpose : To obtain the stem URL for a 3D viewer (RasMol, WebMol, Cn3D) + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments are: + : rasmol webmol cn3d java (java is an alias for webmol) + Throws : Warns if argument cannot be resolved to a URL. + Comments : The 4-letter Brookhaven PDB identifier must be appended to the + : URL provided by this method. + : The URLs listed here do not represent a complete list. + : Expect this to evolve and grow with time. + +=cut + +#--------------- +sub viewer_url { +#--------------- + my($self,$arg) = @_; + $arg eq 'all' and return %Viewer_url; + (exists $Viewer_url{$arg}) ? $Viewer_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 not_found_url + + Usage : $BioWWW->not_found_url() + Purpose : To obtain the URL for a web page to be shown in place of a 404 error. + Returns : String containing the URL (including "http://") + Argument : n/a + Throws : n/a + Comments : This URL should be customized as desired. + +=cut + +#----------------- +sub not_found_url { my $self = shift; $Not_found_url; } +#----------------- + + +=head2 tmp_url + + Usage : $BioWWW->tmp_url() + Purpose : To obtain the URL for a temporary, web-accessible directory. + Returns : String containing the URL (including "http://") + Argument : n/a + Throws : n/a + Comments : This URL should be customized as desired. + +=cut + +#----------- +sub tmp_url { my $self = shift; $Tmp_url; } +#----------- + + + +=head2 search_link + + Usage : $BioWWW->search_link(, , ) + Purpose : Wrapper for search_url() that returns the URL within an HTML anchor. + Returns : String containing the HTML anchor ( qq||) + Argument : = string to be used as argument for search_url() + : = string to be appended to the search URL stem. + : = string to be shown as the link text (default = ). + Throws : n/a + Status : Experimental + +See Also : L() + +=cut + +#--------------- +sub search_link { +#--------------- + my($self,$arg,$value,$text) = @_; + my $url = $self->search_url($arg); + $text ||= $value; + qq|$text|; +} + + + +=head2 viewer_link + + Usage : $BioWWW->viewer_link(, , ) + Purpose : Wrapper for viewer_url() that returns the complete URL within an HTML anchor. + Returns : String containing the HTML anchor ( qq||) + Argument : = string to be used as argument for viewer_url() + : = string to be appended to the viewer URL stem. + : = string to be shown as the link text (default = ). + Throws : n/a + Status : Experimental + +See Also : L() + +=cut + +#---------------- +sub viewer_link { +#---------------- + my($self,$arg,$value,$text) = @_; + my $url = $self->viewer_url($arg); + $text ||= $value; + qq|$text|; +} + + + +=head2 html + + Usage : $BioWWW->html() + Purpose : To obtain HTML-formatted text for frequently needed web-page messages. + Returns : String containing the HTML anchor ( qq||) + Argument : String. + : Currently acceptable arguments are: + : authority (mailto: link for webmaster; shows e-mail address as link) + : notify (wraps mailto:authority link with text for link "please notify us") + : ourFault ("this problem is our fault. If it persists ") + : trouble (same as ourFault but doesn't blame us for the problem) + : techDiff ("we are experiencing technical difficulties. Please stand by.") + Throws : n/a + Comments : The authority (webmaster) is imported from the Bio::Root::Global.pm + : module. The value for $AUTHORITY should be set there, or + : customize this module so that it doesn't use Bio::Root::Global.pm. + +=cut + +#---------- +sub html { +#---------- + my($self,$arg) = @_; + $arg eq 'all' and return %Html; + (exists $Html{$arg}) ? $Html{$arg} : "

    (missing HTML for \"$arg\")
    "; +} + + +### +### Below are accessors specialized for the Saccharomyces Genome Database +### It is possible that they will be moved to Bio::SGD::WWW.pm in the future. +### + + +=head2 sgd_url + + Usage : $BioWWW->sgd_url() + Purpose : To obtain the webpage URL or search stem for SGD. + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments (TODO). + Throws : Warns if argument cannot be resolved to a URL. + Comments : This accessor is specialized for the Saccharomyces Genome Database. + : It is possible that it will be moved to SGD::WWW.pm in the future. + +See Also : L() + +=cut + +#------------ +sub sgd_url { +#------------ + my($self,$arg) = @_; + $arg eq 'all' and return %SGD_url; + (exists $SGD_url{$arg}) ? $SGD_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 s3d_url + + Usage : $BioWWW->s3d_url() + Purpose : To obtain the webpage URL or search stem for Sacch3D. + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments (TODO). + Throws : Warns if argument cannot be resolved to a URL. + Comments : This accessor is specialized for the Saccharomyces Genome Database. + : It is possible that it will be moved to SGD::WWW.pm in the future. + +See Also : L() + +=cut + +#----------- +sub s3d_url { +#----------- + my($self,$arg) = @_; + $arg eq 'all' and return %S3d_url; + (exists $S3d_url{$arg}) ? $S3d_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 sgd_stem_url + + Usage : $BioWWW->sgd_stem_url() + Purpose : To obtain the minimal stem URL for a SGD/Sacch3D CGI script. + Returns : String containing the URL (including "http://") + Argument : String + : Currently acceptable arguments (TODO). + Throws : Warns if argument cannot be resolved to a URL. + Comments : This accessor is specialized for the Saccharomyces Genome Database. + : It is possible that it will be moved to SGD::WWW.pm in the future. + +See Also : L() + +=cut + +#----------------- +sub sgd_stem_url { +#----------------- + my($self,$arg) = @_; + $arg eq 'all' and return %SGD_stem_url; + (exists $SGD_stem_url{$arg}) ? $SGD_stem_url{$arg} + : ($self->warn("Can't resolve argument to URL: $arg"), + $Not_found_url); +} + + + +=head2 s3d_link + + Usage : $BioWWW->s3d_link(, , ) + Purpose : Wrapper for s3d_url() that returns the complete URL within an HTML anchor. + Returns : String containing the URL (including "http://") + Argument : = string to be used as argument for s3d_url() + : = string to be appended to the s3d URL stem. + : = string to be shown as the link text (default = ). + Throws : n/a + Status : Experimental + Comments : This accessor is specialized for the Saccharomyces Genome Database. + : It is possible that it will be moved to SGD::WWW.pm in the future. + +See Also : L(), L() + +=cut + +#-------------- +sub s3d_link { +#-------------- + my($self,$arg,$value,$text) = @_; + my $url = $self->s3d_url($arg); + $text ||= $value; + qq|$text|; +} + + + +=head2 sgd_link + + Usage : $BioWWW->sgd_link(, , ) + Purpose : Wrapper for sgd_url() that returns the complete URL within an HTML anchor. + Returns : String containing the URL (including "http://") + Argument : = string to be used as argument for sgd_url() + : = string to be appended to the sgd URL stem. + : = string to be shown as the link text (default = ). + Throws : n/a + Status : Experimental + Comments : This accessor is specialized for the Saccharomyces Genome Database. + : It is possible that it will be moved to SGD::WWW.pm in the future. + +See Also : L(), L() + +=cut + +#-------------- +sub sgd_link { +#-------------- + my($self,$arg,$value,$text) = @_; + my $url = $self->sgd_url($arg); + $text ||= $value; + qq|$text|; +} + + +######################################################################### +## INSTANCE METHODS +######################################################################### + +## Note that similar functions to those presented below are also availble +## via L. Stein's CGI.pm. These are more experimental versions. + +=head2 start_html + + Usage : $BioWWW->start_html() + Purpose : Prints the "Content-type: text/html\n\n\n" header. + Returns : n/a; This method prints the Content-type string shown above. + Argument : n/a + Throws : n/a + Status : Experimental + Comments : This method prevents redundant invocations thus avoiding th + : accidental printing of the "content-type..." on the page. + : If using L. Stein's CGI.pm, this is similar to $query->header() + : (Does CGI.pm prevent redundant invocation?) + +=cut + +#---------------' +sub start_html { +#--------------- + my $self=shift; + if(!$self->{'_started_html'}) { + print "Content-type: text/html\n\n\n"; + $self->{'_started_html'} = 1; + } +} + + +=head2 redirect + + Usage : $BioWWW->redirect() + Purpose : Prints the header needed to redirect a web browser to a supplied URL. + Returns : n/a; Prints the redirection header. + Argument : String containing the URL to be redirected to. + Throws : n/a + Status : Experimental + +=cut + +#------------- +sub redirect { +#------------- + my($self,$url) = @_; + + print "Location: $url\n"; + print "Content-type: text/html\n\n"; +} + + + +=head2 pre + + Usage : $BioWWW->pre("text to be pre-formatted"); + Purpose : To produce HTML for text that is not to be formated by the brower. + Returns : String containing the "
    " formatted html.
    + Argument  : n/a
    + Throws    : n/a
    + Status    : Experimental
    +
    +=cut
    +
    +#--------
    +sub pre { 
    +#--------
    +    my $self = shift; 
    +    "
    \n".shift()."\n
    "; +} + + +#---------------- +sub html_footer { +#---------------- + my( $self, @param ) = @_; + + my( $linkTo, $linkText, $modified, $mail, $mailText, $top) = + $self->_rearrange([qw(LINKTO LINKTEXT MODIFIED MAIL MAILTEXT TOP)], @param); + + $modified = (scalar $modified) + ? qq|
    Last modified: $modified
    | + : ''; + + $linkTo ||= ''; + +# $top = (defined $top) ? qq|Top
    | : ''; + $top = qq|Top|; ## Utilizing the HTML bug/feature wherein + ## a bogus name anchor defaults to the + ## top of the page. + + return <<"HTML"; +

    +


    +$top | $linkText
    +$modified +$mailText + + +HTML +} + + +=head2 strip_html + + Usage : $boolean = &strip_html( string_ref, [fast] ); + Purpose : Removes HTML formatting from a supplied string. + Returns : Boolean: true if string was stripped, false if not. + Argument : string_ref = reference to a string containing the whole + : web page to be stripped. + : fast = a non-zero value. Optional. If set, a faster + : but perhaps less thorough procedure is used for + : stripping. Default = not fast. + Throws : Exception if the argument is not a scalar reference. + Comments : Based on code originally written by Alex Dong Li + : (ali@genet.sickkids.on.ca). + : This is a more generic version of the function that appears + : in Bio::Tools::Blast::HTML.pm + : This version does not perform any Blast-specific stripping. + : + : This employs a simple method for removing tags that + : will fail under following conditions: + : 1) if quoted > appears in a tag (does this ever happen?) + : 2) if a tag is split over multiple lines and this method is + : used to process one line at a time. + : + : Without fast mode, large HTML files can take exceedingly long times to + : strip (e.g., 1Meg file with many tags can take 10 minutes versus 5 seconds + : in fast mode. Try the swissprot yeast table). If you know the HTML to be + : well-behaved (i.e., tags are not split across mutiple lines), use fast + : mode for large, dense files. + +=cut + +#--------------- +sub strip_html { +#--------------- + my ($self, $string_ref, $fast) = @_; + + ref $string_ref eq 'SCALAR' or + $self->throw("Can't strip HTML: ". + "Argument is should be a SCALAR reference not a ${\ref $string_ref}"); + + my $str = $$string_ref; + my $stripped = 0; + + if($fast) { + # MULTI-STRING-MODE: Much faster than single-string mode + # but will miss tags that span multiple lines. + # This is fine if you know the HTML to be "well-behaved". + + my @lines = split("\n", $str); + foreach (@lines) { + s/<[^>]+>| //gi and $stripped = 1; + } + + # This regexp likely won't work properly in this mode. + foreach (@lines) { + s/(\A|\n)>\s+/\n\n>/gi and $stripped = 1; + } + $$string_ref = join ("\n", @lines); + + } else { + + # SINGLE-STRING-MODE: Can be very slow for long strings with many substitutions. + + # Removing all "<>" tags. + $str =~ s/<[^>]+>| //sgi and $stripped = 1; + + # Re-uniting any lone '>' characters. Not really necessary for functional HTML + $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1; + + $$string_ref = $str; + } + $stripped; +} + + +1; +__END__ + +######################################################################## +## END OF CLASS +######################################################################## + +=head1 FOR DEVELOPERS ONLY + +=head2 Data Members + +An instance of Bio::Tools::WWW.pm is a blessed reference to a hash containing +all or some of the following fields: + + FIELD VALUE + -------------------------------------------------------------- + _started_html Defined the on the initial invocation of start_html() + to avoid duplicate printing out the "Content-type..." header. + + +=cut + +1; + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tools/pSW.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/pSW.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,421 @@ +## $Id: pSW.pm,v 1.21 2002/10/22 07:45:22 lapp Exp $ + +# +# BioPerl module for Bio::Tools::pSW +# +# Cared for by Ewan Birney +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::pSW - pairwise Smith Waterman object + +=head1 SYNOPSIS + + use Bio::Tools::pSW; + use Bio::AlignIO; + my $factory = new Bio::Tools::pSW( '-matrix' => 'blosum62.bla', + '-gap' => 12, + '-ext' => 2, + ); + + #use the factory to make some output + + $factory->align_and_show($seq1,$seq2,STDOUT); + + # make a Bio::SimpleAlign and do something with it + + my $aln = $factory->pairwise_alignment($seq1,$seq2); + my $alnout = new Bio::AlignIO(-format => 'msf', + -fh => \*STDOUT); + + $alnout->write_aln($aln); + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the INSTALL file. + +=head1 DESCRIPTION + +pSW is an Alignment Factory for protein sequences. It builds pairwise +alignments using the Smith-Waterman algorithm. The alignment algorithm is +implemented in C and added in using an XS extension. The XS extension basically +comes from the Wise2 package, but has been slimmed down to only be the +alignment part of that (this is a good thing!). The XS extension comes +from the bioperl-ext package which is distributed along with bioperl. +I This package will not work if you have not compiled the +bioperl-ext package. + +The mixture of C and Perl is ideal for this sort of +problem. Here are some plus points for this strategy: + +=over 2 + +=item Speed and Memory + +The algorithm is actually implemented in C, which means it is faster than +a pure perl implementation (I have never done one, so I have no idea +how faster) and will use considerably less memory, as it efficiently +assigns memory for the calculation. + +=item Algorithm efficiency + +The algorithm was written using Dynamite, and so contains an automatic +switch to the linear space divide-and-conquer method. This means you +could effectively align very large sequences without killing your machine +(it could take a while though!). + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Ewan Birney, birney@sanger.ac.uk or birney@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with an underscore "_". + +=cut + +# Let the code begin... + +package Bio::Tools::pSW; +use vars qw(@ISA); +use strict; +no strict ( 'refs'); + +BEGIN { + eval { + require Bio::Ext::Align; + }; + if ( $@ ) { + die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n"); + exit(1); + } +} + +use Bio::Tools::AlignFactory; +use Bio::SimpleAlign; + + +@ISA = qw(Bio::Tools::AlignFactory); + + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX + GAP + EXT + )],@args); + + #default values - we have to load matrix into memory, so + # we need to check it out now + if( ! defined $matrix || !($matrix =~ /\w/) ) { + $matrix = 'blosum62.bla'; + } + + $self->matrix($matrix); # will throw exception if it can't load it + $self->gap(12) unless defined $gap; + $self->ext(2) unless defined $ext; + + # I'm pretty sure I am not doing this right... ho hum... + # This was not roght ($gap and $ext could not be 0) It is fixed now /AE + if( defined $gap ) { + if( $gap =~ /^\d+$/ ) { + $self->gap($gap); + } else { + $self->throw("Gap penalty must be a number, not [$gap]"); + } + } + if( defined $ext ) { + if( $ext =~ /^\d+$/ ) { + $self->ext($ext); + } else { + $self->throw("Extension penalty must be a number, not [$ext]"); + } + } + + return $self; +} + + +=head2 pairwise_alignment + + Title : pairwise_alignment + Usage : $aln = $factory->pairwise_alignment($seq1,$seq2) + Function: Makes a SimpleAlign object from two sequences + Returns : A SimpleAlign object + Args : + + +=cut + +sub pairwise_alignment{ + my ($self,$seq1,$seq2) = @_; + my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp); + + if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') || + ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) { + $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)"); + return undef; + } + # fix Jitterbug #1044 + if( $seq1->length() < 2 || + $seq2->length() < 2 ) { + $self->warn("cannot align sequences with length less than 2"); + return undef; + } + $self->set_memory_and_report(); + # create engine objects + $seq1->display_id('seq1') unless ( defined $seq1->id() ); + $seq2->display_id('seq2') unless ( defined $seq2->id() ); + + $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(), + $seq1->seq()); + $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(), + $seq2->seq()); + $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext); + if( ! defined $aln || $aln == 0 ) { + $self->throw("Unable to build an alignment"); + } + + # free sequence engine objects + + $t1 = $t2 = 0; + + # now we have to get into the AlnBlock structure and + # figure out what is aligned to what... + + # we are going to need the sequences as arrays for convience + + @str1 = split(//, $seq1->seq()); + @str2 = split(//, $seq2->seq()); + + # get out start points + + # The alignment is in alignment coordinates - ie the first + # residues starts at -1 and ends at 0. (weird I know). + # bio-coordinates are +2 from this... + + $start1 = $aln->start()->alu(0)->start +2; + $start2 = $aln->start()->alu(1)->start +2; + + # step along the linked list of alc units... + + for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) { + if( $alc->alu(0)->text_label eq 'SEQUENCE' ) { + push(@ostr1,$str1[$alc->alu(0)->start+1]); + } else { + # assumme it is in insert! + push(@ostr1,'-'); + } + + if( $alc->alu(1)->text_label eq 'SEQUENCE' ) { + push(@ostr2,$str2[$alc->alu(1)->start+1]); + } else { + # assumme it is in insert! + push(@ostr2,'-'); + } + $alctemp = $alc; + } + + # + # get out end points + # + + # end points = real residue end in 'C' coordinates = residue + # end in biocoordinates. Oh... the wonder of coordinate systems! + + $end1 = $alctemp->alu(0)->end+1; + $end2 = $alctemp->alu(1)->end+1; + + # get rid of the alnblock + $alc = 0; + $aln = 0; + + # new SimpleAlignment + $out = Bio::SimpleAlign->new(); # new SimpleAlignment + + $tstr = join('',@ostr1); + $tid = $seq1->id(); + $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr, + -start => $start1, + -end => $end1, + -id=>$tid )); + + $tstr = join('',@ostr2); + $tid = $seq2->id(); + $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr, + -start => $start2, + -end => $end2, + -id=> $tid )); + + # give'm back the alignment + + return $out; +} + +=head2 align_and_show + + Title : align_and_show + Usage : $factory->align_and_show($seq1,$seq2,STDOUT) + +=cut + +sub align_and_show { + my($self,$seq1,$seq2,$fh) = @_; + my($t1,$t2,$aln,$id,$str); + +if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') || + ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) { + $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)"); + return undef; + } + # fix Jitterbug #1044 + if( $seq1->length() < 2 || + $seq2->length() < 2 ) { + $self->warn("cannot align sequences with length less than 2"); + return undef; + } + if( ! defined $fh ) { + $fh = \*STDOUT; + } + $self->set_memory_and_report(); + $seq1->display_id('seq1') unless ( defined $seq1->id() ); + $seq2->display_id('seq2') unless ( defined $seq2->id() ); + + $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq()); + + $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq()); + $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext); + if( ! defined $aln || $aln == 0 ) { + $self->throw("Unable to build an alignment"); + } + + &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh); + +} + +=head2 matrix + + Title : matrix() + Usage : $factory->matrix('blosum62.bla'); + Function : Reads in comparison matrix based on name + : + Returns : + Argument : comparison matrix + +=cut + +sub matrix { + my($self,$comp) = @_; + my $temp; + + if( !defined $comp ) { + $self->throw("You must have a comparison matrix to set!"); + } + + # talking to the engine here... + + $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp); + + if( !(defined $temp) || $temp == 0 ) { + $self->throw("$comp cannot be read as a BLAST comparison matrix file"); + } + + $self->{'matrix'} = $temp; +} + + + +=head2 gap + + Title : gap + Usage : $gap = $factory->gap() #get + : $factory->gap($value) #set + Function : the set get for the gap penalty + Example : + Returns : gap value + Arguments : new value + +=cut + +sub gap { + my ($self,$val) = @_; + + + if( defined $val ) { + if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE + $self->throw("Can't have a gap penalty less than 0"); + } + $self->{'gap'} = $val; + } + return $self->{'gap'}; +} + + +=head2 ext + + Title : ext + Usage : $ext = $factory->ext() #get + : $factory->ext($value) #set + Function : the set get for the ext penalty + Example : + Returns : ext value + Arguments : new value + +=cut + +sub ext { + my ($self,$val) = @_; + + if( defined $val ) { + if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE + $self->throw("Can't have a gap penalty less than 0"); + } + $self->{'ext'} = $val; + } + return $self->{'ext'}; +} + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/AlleleNode.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/AlleleNode.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,186 @@ +# $Id: AlleleNode.pm,v 1.4 2002/10/22 07:45:24 lapp Exp $ +# +# BioPerl module for Bio::Tree::AlleleNode +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::AlleleNode - DESCRIPTION of Object + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +Describe the object here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tree::AlleleNode; +use vars qw(@ISA); +use strict; + +use Bio::Tree::Node; + +@ISA = qw(Bio::Tree::Node ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tree::AlleleNode(); + Function: Builds a new Bio::Tree::AlleleNode object + Returns : Bio::Tree::AlleleNode + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($alleles) = $self->_rearrange([qw(ALLELES)], @args); + $self->{'_data'} = {}; + if( defined $alleles ) { + if( ref($alleles) !~ /HASH/i ) { + $self->warn("Must specify a valid HASH reference for the -alleles value...Ignoring initializing input"); + + } else { + foreach my $mkr ( keys %{$alleles} ) { + $self->add_alleles($mkr,@{$alleles->{$mkr}}); + } + } + } + return $self; +} + +=head2 add_alleles + + Title : add_alleles + Usage : $node->add_alleles($mkr,@alleles); + Function: Adds allele(s) for $mkr, @alleles can be a single or + multiple alleles. If the same marker is added more than one, the + previous value will be overwritten with a warning. + Returns : none + Args : $marker => marker name + @alleles => alleles for the marker + + +=cut + +sub add_alleles{ + my ($self,$marker,@alleles) = @_; + if( ! defined $marker || $marker eq '' ) { + $self->warn("must specify a valid marker name for add_alleles"); + return; + } + if( $self->{'_data'}->{$marker} ) { + $self->warn("Overwriting value of $marker"); + } + $self->{'_data'}->{$marker} = []; # reset the array ref + foreach my $a ( sort @alleles ) { + next if ! defined $a; # skip undef alleles + push @{$self->{'_data'}->{$marker}},$a; + } +} + +=head2 get_alleles + + Title : get_alleles + Usage : my @alleles = $node->get_alleles($marker); + Function: Return the alleles for a marker $marker + Returns : Array of Alleles for a marker or empty array + Args : $marker name + +=cut + +sub get_alleles{ + my ($self,$marker) = @_; + if( defined $self->{'_data'}->{$marker} ) { + return @{$self->{'_data'}->{$marker}}; + } + return (); +} + +=head2 get_marker_names + + Title : get_marker_names + Usage : my @names =$node->get_marker_names(); + Function: Return the names of the markers that have been added to this node + Returns : List of Marker Names + Args : none + +=cut + +sub get_marker_names{ + my ($self) = @_; + return keys %{$self->{'_data'}}; +} + +=head2 purge_markers + + Title : purge_markers + Usage : $node->purge_markers; + Function: Reset the markers and alleles + Returns : none + Args : none + + +=cut + +sub purge_markers{ + my ($self) = @_; + $self->{'_data'} = {}; + return; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/Node.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/Node.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,639 @@ +# $Id: Node.pm,v 1.17.2.3 2003/09/14 19:00:35 jason Exp $ +# +# BioPerl module for Bio::Tree::Node +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::Node - A Simple Tree Node + +=head1 SYNOPSIS + + use Bio::Tree::Node; + my $nodeA = new Bio::Tree::Node(); + my $nodeL = new Bio::Tree::Node(); + my $nodeR = new Bio::Tree::Node(); + + my $node = new Bio::Tree::Node(); + $node->add_Descendent($nodeL); + $node->add_Descendent($nodeR); + + print "node is not a leaf \n" if( $node->is_leaf); + +=head1 DESCRIPTION + +Makes a Tree Node suitable for building a Tree. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Aaron Mackey amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tree::Node; +use vars qw(@ISA $CREATIONORDER); +use strict; + +use Bio::Root::Root; +use Bio::Tree::NodeI; + +@ISA = qw(Bio::Root::Root Bio::Tree::NodeI); + +BEGIN { + $CREATIONORDER = 0; +} + + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tree::Node(); + Function: Builds a new Bio::Tree::Node object + Returns : Bio::Tree::Node + Args : -left => pointer to Left descendent (optional) + -right => pointer to Right descenent (optional) + -branch_length => branch length [integer] (optional) + -bootstrap => value bootstrap value (string) + -description => description of node + -id => human readable id for node + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($children, $branchlen,$id, + $bootstrap, $desc,$d) = $self->_rearrange([qw(DESCENDENTS + BRANCH_LENGTH + ID + BOOTSTRAP + DESC + DESCRIPTION + )], + @args); + $self->_register_for_cleanup(\&node_cleanup); + $self->{'_desc'} = {}; # for descendents + if( $d && $desc ) { + $self->warn("can only accept -desc or -description, not both, accepting -description"); + $desc = $d; + } elsif( defined $d && ! defined $desc ) { + $desc = $d; + } + defined $desc && $self->description($desc); + defined $bootstrap && $self->bootstrap($bootstrap); + defined $id && $self->id($id); + defined $branchlen && $self->branch_length($branchlen); + + if( defined $children ) { + if( ref($children) !~ /ARRAY/i ) { + $self->warn("Must specify a valid ARRAY reference to initialize a Node's Descendents"); + } + foreach my $c ( @$children ) { + $self->add_Descendent($c); + } + } + $self->_creation_id($CREATIONORDER++); + return $self; +} + +=head2 add_Descendent + + Title : add_Descendent + Usage : $node->add_Descendant($node); + Function: Adds a descendent to a node + Returns : number of current descendents for this node + Args : Bio::Node::NodeI + boolean flag, true if you want to ignore the fact that you are + adding a second node with the same unique id (typically memory + location reference in this implementation). default is false and + will throw an error if you try and overwrite an existing node. + +=cut + +sub add_Descendent{ + my ($self,$node,$ignoreoverwrite) = @_; + return -1 if( ! defined $node ) ; + if( ! $node->isa('Bio::Tree::NodeI') ) { + $self->warn("Trying to add a Descendent who is not a Bio::Tree::NodeI"); + return -1; + } + # do we care about order? + $node->ancestor($self); + if( $self->{'_desc'}->{$node->internal_id} && ! $ignoreoverwrite ) { + $self->throw("Going to overwrite a node which is $node that is already stored here, set the ignore overwrite flag (parameter 2) to true to ignore this in the future"); + } + + $self->{'_desc'}->{$node->internal_id} = $node; # is this safely unique - we've tested before at any rate?? + + $self->invalidate_height(); + + return scalar keys %{$self->{'_desc'}}; +} + + +=head2 each_Descendent + + Title : each_Descendent($sortby) + Usage : my @nodes = $node->each_Descendent; + Function: all the descendents for this Node (but not their descendents + i.e. not a recursive fetchall) + Returns : Array of Bio::Tree::NodeI objects + Args : $sortby [optional] "height", "creation" or coderef to be used + to sort the order of children nodes. + +=cut + +sub each_Descendent{ + my ($self, $sortby) = @_; + + # order can be based on branch length (and sub branchlength) + + $sortby ||= 'height'; + + if (ref $sortby eq 'CODE') { + return sort $sortby values %{$self->{'_desc'}}; + } else { + if ($sortby eq 'height') { + return map { $_->[0] } + sort { $a->[1] <=> $b->[1] || + $a->[2] <=> $b->[2] } + map { [$_, $_->height, $_->internal_id ] } + values %{$self->{'_desc'}}; + } else { + return map { $_->[0] } + sort { $a->[1] <=> $b->[1] } + map { [$_, $_->height ] } + values %{$self->{'_desc'}}; + } + } +} + +=head2 remove_Descendent + + Title : remove_Descendent + Usage : $node->remove_Descedent($node_foo); + Function: Removes a specific node from being a Descendent of this node + Returns : nothing + Args : An array of Bio::Node::NodeI objects which have be previously + passed to the add_Descendent call of this object. + +=cut + +sub remove_Descendent{ + my ($self,@nodes) = @_; + my $c= 0; + foreach my $n ( @nodes ) { + if( $self->{'_desc'}->{$n->internal_id} ) { + $n->ancestor(undef); + # should be redundant + $self->{'_desc'}->{$n->internal_id}->ancestor(undef); + delete $self->{'_desc'}->{$n->internal_id}; + my $a1 = $self->ancestor; + # remove unecessary nodes if we have removed the part + # which branches. + # if( $a1 ) { + # my $bl = $self->branch_length || 0; + # my @d = $self->each_Descendent; + # if (scalar @d == 1) { + # $d[0]->branch_length($bl + ($d[0]->branch_length || 0)); + # $a1->add_Descendent($d[0]); + # } + # $a1->remove_Descendent($self); + #} + $c++; + } else { + if( $self->verbose ) { + $self->debug(sprintf("no node %s (%s) listed as a descendent in this node %s (%s)\n",$n->id, $n,$self->id,$self)); + $self->debug("Descendents are " . join(',', keys %{$self->{'_desc'}})."\n"); + } + } + } + $c; +} + + +=head2 remove_all_Descendents + + Title : remove_all_Descendents + Usage : $node->remove_All_Descendents() + Function: Cleanup the node's reference to descendents and reset + their ancestor pointers to undef, if you don't have a reference + to these objects after this call they will be cleaned up - so + a get_nodes from the Tree object would be a safe thing to do first + Returns : nothing + Args : none + + +=cut + +sub remove_all_Descendents{ + my ($self) = @_; + # this won't cleanup the nodes themselves if you also have + # a copy/pointer of them (I think)... + while( my ($node,$val) = each %{ $self->{'_desc'} } ) { + $val->ancestor(undef); + } + $self->{'_desc'} = {}; + 1; +} + +=head2 get_all_Descendents + + Title : get_all_Descendents + Usage : my @nodes = $node->get_all_Descendents; + Function: Recursively fetch all the nodes and their descendents + *NOTE* This is different from each_Descendent + Returns : Array or Bio::Tree::NodeI objects + Args : none + +=cut + +# implemented in the interface + +=head2 ancestor + + Title : ancestor + Usage : $obj->ancestor($newval) + Function: Set the Ancestor + Returns : value of ancestor + Args : newvalue (optional) + +=cut + +sub ancestor{ + my $self = shift; + $self->{'_ancestor'} = shift @_ if @_; + return $self->{'_ancestor'}; +} + +=head2 branch_length + + Title : branch_length + Usage : $obj->branch_length() + Function: Get/Set the branch length + Returns : value of branch_length + Args : newvalue (optional) + + +=cut + +sub branch_length{ + my $self = shift; + if( @_ ) { + my $bl = shift; + if( defined $bl && + $bl =~ s/\[(\d+)\]// ) { + $self->bootstrap($1); + } + $self->{'_branch_length'} = $bl; + } + return $self->{'_branch_length'}; +} + + +=head2 bootstrap + + Title : bootstrap + Usage : $obj->bootstrap($newval) + Function: Get/Set the bootstrap value + Returns : value of bootstrap + Args : newvalue (optional) + + +=cut + +sub bootstrap { + my $self = shift; + if( @_ ) { + if( $self->has_tag('B') ) { + $self->remove_tag('B'); + } + $self->add_tag_value('B',shift); + } + return ($self->get_tag_values('B'))[0]; +} + +=head2 description + + Title : description + Usage : $obj->description($newval) + Function: Get/Set the description string + Returns : value of description + Args : newvalue (optional) + + +=cut + +sub description{ + my $self = shift; + $self->{'_description'} = shift @_ if @_; + return $self->{'_description'}; +} + +=head2 id + + Title : id + Usage : $obj->id($newval) + Function: The human readable identifier for the node + Returns : value of human readable id + Args : newvalue (optional) + Note : id cannot contain the chracters '();:' + +"A name can be any string of printable characters except blanks, +colons, semicolons, parentheses, and square brackets. Because you may +want to include a blank in a name, it is assumed that an underscore +character ("_") stands for a blank; any of these in a name will be +converted to a blank when it is read in." + +from L + +=cut + +sub id{ + my ($self, $value) = @_; + if ($value) { + $self->warn("Illegal characters ();: and space in the id [$value], converting to _ ") + if $value =~ /\(\);:/ and $self->verbose >= 0; + $value =~ s/[\(\);:\s]/_/g; + $self->{'_id'} = $value; + } + return $self->{'_id'}; +} + +=head2 internal_id + + Title : internal_id + Usage : my $internalid = $node->internal_id + Function: Returns the internal unique id for this Node + (a monotonically increasing number for this in-memory implementation + but could be a database determined unique id in other + implementations) + Returns : unique id + Args : none + +=cut + +sub internal_id{ + return $_[0]->_creation_id; +} + + +=head2 _creation_id + + Title : _creation_id + Usage : $obj->_creation_id($newval) + Function: a private method signifying the internal creation order + Returns : value of _creation_id + Args : newvalue (optional) + + +=cut + +sub _creation_id{ + my $self = shift @_; + $self->{'_creation_id'} = shift @_ if( @_); + return $self->{'_creation_id'} || 0; +} + +=head2 Bio::Node::NodeI decorated interface implemented + +The following methods are implemented by L decorated +interface. + +=head2 is_Leaf + + Title : is_Leaf + Usage : if( $node->is_Leaf ) + Function: Get Leaf status + Returns : boolean + Args : none + +=cut + +sub is_Leaf { + my ($self) = @_; + my $isleaf = ! (defined $self->{'_desc'} && + (keys %{$self->{'_desc'}} > 0) ); + return $isleaf; +} + +=head2 to_string + + Title : to_string + Usage : my $str = $node->to_string() + Function: For debugging, provide a node as a string + Returns : string + Args : none + +=head2 height + + Title : height + Usage : my $len = $node->height + Function: Returns the height of the tree starting at this + node. Height is the maximum branchlength. + Returns : The longest length (weighting branches with branch_length) to a leaf + Args : none + +=cut + +sub height { + my ($self) = @_; + + return $self->{'_height'} if( defined $self->{'_height'} ); + + if( $self->is_Leaf ) { + if( !defined $self->branch_length ) { + $self->debug(sprintf("Trying to calculate height of a node when a Node (%s) has an undefined branch_length\n",$self->id || '?' )); + return 0; + } + return $self->branch_length; + } + my $max = 0; + foreach my $subnode ( $self->each_Descendent ) { + my $s = $subnode->height; + if( $s > $max ) { $max = $s; } + } + return ($self->{'_height'} = $max + ($self->branch_length || 1)); +} + + +=head2 invalidate_height + + Title : invalidate_height + Usage : private helper method + Function: Invalidate our cached value of the node's height in the tree + Returns : nothing + Args : none + +=cut + +#' + +sub invalidate_height { + my ($self) = @_; + + $self->{'_height'} = undef; + if( $self->ancestor ) { + $self->ancestor->invalidate_height; + } +} + +=head2 add_tag_value + + Title : add_tag_value + Usage : $node->add_tag_value($tag,$value) + Function: Adds a tag value to a node + Returns : number of values stored for this tag + Args : $tag - tag name + $value - value to store for the tag + + +=cut + +sub add_tag_value{ + my ($self,$tag,$value) = @_; + if( ! defined $tag || ! defined $value ) { + $self->warn("cannot call add_tag_value with an undefined value"); + } + push @{$self->{'_tags'}->{$tag}}, $value; + return scalar @{$self->{'_tags'}->{$tag}}; +} + +=head2 remove_tag + + Title : remove_tag + Usage : $node->remove_tag($tag) + Function: Remove the tag and all values for this tag + Returns : boolean representing success (0 if tag does not exist) + Args : $tag - tagname to remove + + +=cut + +sub remove_tag { + my ($self,$tag) = @_; + if( exists $self->{'_tags'}->{$tag} ) { + $self->{'_tags'}->{$tag} = undef; + delete $self->{'_tags'}->{$tag}; + return 1; + } + return 0; +} + +=head2 remove_all_tags + + Title : remove_all_tags + Usage : $node->remove_all_tags() + Function: Removes all tags + Returns : None + Args : None + + +=cut + +sub remove_all_tags{ + my ($self) = @_; + $self->{'_tags'} = {}; + return; +} + +=head2 get_all_tags + + Title : get_all_tags + Usage : my @tags = $node->get_all_tags() + Function: Gets all the tag names for this Node + Returns : Array of tagnames + Args : None + + +=cut + +sub get_all_tags{ + my ($self) = @_; + return sort keys %{$self->{'_tags'} || {}}; +} + +=head2 get_tag_values + + Title : get_tag_values + Usage : my @values = $node->get_tag_value($tag) + Function: Gets the values for given tag ($tag) + Returns : Array of values or empty list if tag does not exist + Args : $tag - tag name + + +=cut + +sub get_tag_values{ + my ($self,$tag) = @_; + return @{$self->{'_tags'}->{$tag} || []}; +} + +=head2 has_tag + + Title : has_tag + Usage : $node->has_tag($tag) + Function: Boolean test if tag exists in the Node + Returns : Boolean + Args : $tag - tagname + + +=cut + +sub has_tag { + my ($self,$tag) = @_; + return exists $self->{'_tags'}->{$tag}; +} + +sub node_cleanup { + my $self = shift; + if( defined $self->{'_desc'} && + ref($self->{'_desc'}) =~ /ARRAY/i ) { + while( my ($nodeid,$node) = each %{ $self->{'_desc'} } ) { + $node->ancestor(undef); # insure no circular references + $node = undef; + } + } + $self->{'_desc'} = {}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/NodeI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/NodeI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,473 @@ +# $Id: NodeI.pm,v 1.19.2.2 2003/09/14 19:00:35 jason Exp $ +# +# BioPerl module for Bio::Tree::NodeI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::NodeI - Interface describing a Tree Node + +=head1 SYNOPSIS + + # get a Tree::NodeI somehow + # like from a TreeIO + use Bio::TreeIO; + # read in a clustalw NJ in phylip/newick format + my $treeio = new Bio::TreeIO(-format => 'newick', -file => 'file.dnd'); + + my $tree = $treeio->next_tree; # we'll assume it worked for demo purposes + # you might want to test that it was defined + + my $rootnode = $tree->get_root_node; + + # process just the next generation + foreach my $node ( $rootnode->each_Descendent() ) { + print "branch len is ", $node->branch_length, "\n"; + } + + # process all the children + my $example_leaf_node; + foreach my $node ( $rootnode->get_all_Descendents() ) { + if( $node->is_Leaf ) { + print "node is a leaf ... "; + # for example use below + $example_leaf_node = $node unless defined $example_leaf_node; + } + print "branch len is ", $node->branch_length, "\n"; + } + + # The ancestor() method points to the parent of a node + # A node can only have one parent + + my $parent = $example_leaf_node->ancestor; + + # parent won't likely have an description because it is an internal node + # but child will because it is a leaf + + print "Parent id: ", $parent->id," child id: ", + $example_leaf_node->id, "\n"; + + +=head1 DESCRIPTION + +A NodeI is capable of the basic structure of building a tree and +storing the branch length between nodes. The branch length is the +length of the branch between the node and its ancestor, thus a root +node in a Tree will not typically have a valid branch length. + +Various implementations of NodeI may extend the basic functions and +allow storing of other information (like attatching a species object +or full sequences used to build a tree or alternative sequences). If +you don't know how to extend a Bioperl object please ask, happy to +help, we would also greatly appreciate contributions with improvements +or extensions of the objects back to the Bioperl code base so that +others don't have to reinvent your ideas. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Aaron Mackey amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tree::NodeI; +use vars qw(@ISA); +use strict; +use Bio::Root::RootI; +@ISA = qw(Bio::Root::RootI); + + +=head2 add_Descendent + + Title : add_Descendent + Usage : $node->add_Descendant($node); + Function: Adds a descendent to a node + Returns : number of current descendents for this node + Args : Bio::Node::NodeI + + +=cut + +sub add_Descendent{ + my ($self,@args) = @_; + + $self->throw_not_implemented(); +} + + +=head2 each_Descendent + + Title : each_Descendent + Usage : my @nodes = $node->each_Descendent; + Function: all the descendents for this Node (but not their descendents + i.e. not a recursive fetchall) + Returns : Array of Bio::Tree::NodeI objects + Args : none + +=cut + +sub each_Descendent{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 Decorated Interface methods + +=cut + +=head2 get_all_Descendents + + Title : get_all_Descendents($sortby) + Usage : my @nodes = $node->get_all_Descendents; + Function: Recursively fetch all the nodes and their descendents + *NOTE* This is different from each_Descendent + Returns : Array or Bio::Tree::NodeI objects + Args : $sortby [optional] "height", "creation" or coderef to be used + to sort the order of children nodes. + +=cut + +sub get_all_Descendents{ + my ($self, $sortby) = @_; + $sortby ||= 'height'; + my @nodes; + foreach my $node ( $self->each_Descendent($sortby) ) { + push @nodes, ($node->get_all_Descendents($sortby), $node); + } + return @nodes; +} + +*get_Descendents = \&get_all_Descendents; + +=head2 is_Leaf + + Title : is_Leaf + Usage : if( $node->is_Leaf ) + Function: Get Leaf status + Returns : boolean + Args : none + +=cut + +sub is_Leaf{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 descendent_count + + Title : descendent_count + Usage : my $count = $node->descendent_count; + Function: Counts the number of descendents a node has + (and all of their subnodes) + Returns : integer + Args : none + +=cut + +sub descendent_count{ + my ($self) = @_; + my $count = 0; + + foreach my $node ( $self->each_Descendent ) { + $count += 1; + $node->can('descendent_count') ? $count += $node->descendent_count : next; + } + return $count; +} + +=head2 to_string + + Title : to_string + Usage : my $str = $node->to_string() + Function: For debugging, provide a node as a string + Returns : string + Args : none + + +=cut + +sub to_string{ + my ($self) = @_; + return sprintf("%s%s%s", + defined $self->id ? $self->id : '', + defined $self->branch_length ? ':' . $self->branch_length : ' ', + $self->is_Leaf() ? '(leaf)' : '' + ); +} + +=head2 height + + Title : height + Usage : my $len = $node->height + Function: Returns the height of the tree starting at this + node. Height is the maximum branchlength. + Returns : The longest length (weighting branches with branch_length) to a leaf + Args : none + +=cut + +sub height{ + my ($self) = @_; + + if( $self->is_Leaf ) { + if( !defined $self->branch_length ) { + $self->debug(sprintf("Trying to calculate height of a node when a Node (%s) has an undefined branch_length\n",$self->id || '?' )); + return 0; + } + return $self->branch_length; + } + my $max = 0; + foreach my $subnode ( $self->each_Descendent ) { + my $s = $subnode->height; + if( $s > $max ) { $max = $s; } + } + return $max + ($self->branch_length || 1); +} + +=head2 Get/Set methods + +=cut + +=head2 branch_length + + Title : branch_length + Usage : $obj->branch_length() + Function: Get/Set the branch length + Returns : value of branch_length + Args : newvalue (optional) + + +=cut + +sub branch_length{ + my ($self)= @_; + $self->throw_not_implemented(); +} + +=head2 id + + Title : id + Usage : $obj->id($newval) + Function: The human readable identifier for the node + Returns : value of human readable id + Args : newvalue (optional) + + +=cut + +sub id{ + my ($self)= @_; + $self->throw_not_implemented(); +} + +=head2 internal_id + + Title : internal_id + Usage : my $internalid = $node->internal_id + Function: Returns the internal unique id for this Node + Returns : unique id + Args : none + +=cut + +sub internal_id{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 description + + Title : description + Usage : $obj->description($newval) + Function: Get/Set the description string + Returns : value of description + Args : newvalue (optional) + + +=cut + +sub description{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 bootstrap + + Title : bootstrap + Usage : $obj->bootstrap($newval) + Function: Get/Set the bootstrap value + Returns : value of bootstrap + Args : newvalue (optional) + + +=cut + +sub bootstrap{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 ancestor + + Title : ancestor + Usage : my $node = $node->ancestor; + Function: Get/Set the ancestor node pointer for a Node + Returns : Null if this is top level node + Args : none + +=cut + + +sub ancestor{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 invalidate_height + + Title : invalidate_height + Usage : private helper method + Function: Invalidate our cached value of the node height in the tree + Returns : nothing + Args : none + +=cut + +sub invalidate_height { + shift->throw_not_implemented(); +} + +=head2 Methods for associating Tag/Values with a Node + +These methods associate tag/value pairs with a Node + +=head2 add_tag_value + + Title : add_tag_value + Usage : $node->add_tag_value($tag,$value) + Function: Adds a tag value to a node + Returns : number of values stored for this tag + Args : $tag - tag name + $value - value to store for the tag + + +=cut + +sub add_tag_value{ + shift->throw_not_implemented(); +} + +=head2 remove_tag + + Title : remove_tag + Usage : $node->remove_tag($tag) + Function: Remove the tag and all values for this tag + Returns : boolean representing success (0 if tag does not exist) + Args : $tag - tagname to remove + + +=cut + +sub remove_tag { + shift->throw_not_implemented(); +} + +=head2 remove_all_tags + + Title : remove_all_tags + Usage : $node->remove_all_tags() + Function: Removes all tags + Returns : None + Args : None + + +=cut + +sub remove_all_tags{ + shift->throw_not_implemented(); +} + +=head2 get_all_tags + + Title : get_all_tags + Usage : my @tags = $node->get_all_tags() + Function: Gets all the tag names for this Node + Returns : Array of tagnames + Args : None + + +=cut + +sub get_all_tags { + shift->throw_not_implemented(); +} + +=head2 get_tag_values + + Title : get_tag_values + Usage : my @values = $node->get_tag_value($tag) + Function: Gets the values for given tag ($tag) + Returns : Array of values or empty list if tag does not exist + Args : $tag - tag name + + +=cut + +sub get_tag_values{ + shift->throw_not_implemented(); +} + +=head2 has_tag + + Title : has_tag + Usage : $node->has_tag($tag) + Function: Boolean test if tag exists in the Node + Returns : Boolean + Args : $tag - tagname + + +=cut + +sub has_tag{ + shift->throw_not_implemented(); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/NodeNHX.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/NodeNHX.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,171 @@ +# $Id: NodeNHX.pm,v 1.5.2.1 2003/09/14 19:00:35 jason Exp $ +# +# BioPerl module for Bio::Tree::NodeNHX +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::NodeNHX - A Simple Tree Node with support for NHX tags + +=head1 SYNOPSIS + + use Bio::Tree::NodeNHX; + my $nodeA = new Bio::Tree::NodeNHX(); + my $nodeL = new Bio::Tree::NodeNHX(); + my $nodeR = new Bio::Tree::NodeNHX(); + + my $node = new Bio::Tree::NodeNHX(); + $node->add_Descendents($nodeL); + $node->add_Descendents($nodeR); + + print "node is not a leaf \n" if( $node->is_leaf); + +=head1 DESCRIPTION + +Makes a Tree Node with NHX tags, suitable for building a Tree. See +L for a full list of functionality. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +=head1 CONTRIBUTORS + +The NHX (New Hampshire eXtended) format was created by Chris Zmasek, +and is described at: + + http://www.genetics.wustl.edu/eddy/forester/NHX.html + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tree::NodeNHX; +use vars qw(@ISA); +use strict; + +use Bio::Tree::Node; + +@ISA = qw(Bio::Tree::Node); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tree::NodeNHX(); + Function: Builds a new Bio::Tree::NodeNHX object + Returns : Bio::Tree::NodeNHX + Args : -left => pointer to Left descendent (optional) + -right => pointer to Right descenent (optional) + -branch_length => branch length [integer] (optional) + -bootstrap => value bootstrap value (string) + -description => description of node + -id => unique id for node + -nhx => hashref of NHX tags and values + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($nhx) = $self->_rearrange([qw(NHX)], @args); + $self->nhx_tag($nhx); + return $self; +} + +sub DESTROY { + my ($self) = @_; + # try to insure that everything is cleaned up + $self->SUPER::DESTROY(); + if( defined $self->{'_desc'} && + ref($self->{'_desc'}) =~ /ARRAY/i ) { + while( my ($nodeid,$node) = each %{ $self->{'_desc'} } ) { + $node->{'_ancestor'} = undef; # insure no circular references + $node->DESTROY(); + $node = undef; + } + $self->{'_desc'} = {}; + } +} + +sub to_string{ + my ($self) = @_; + return sprintf("%s%s%s", + defined $self->id ? $self->id : '', + defined $self->branch_length ? ':' . + $self->branch_length : ' ', + '[' . join(":", "&&NHX", + map { "$_=" .join(',', + $self->get_tag_values($_))} + $self->get_all_tags() ) . ']' + ); +} + +=head2 nhx_tag + + Title : nhx_tag + Usage : my $tag = $nodenhx->nhx_tag(%tags); + Function: Set tag-value pairs for NHX nodes + Returns : none + Args : hashref to update the tags/value pairs + OR + with a scalar value update the bootstrap value by default + + +=cut + +sub nhx_tag { + my ($self, $tags) = @_; + if (defined $tags && (ref($tags) =~ /HASH/i)) { + while( my ($tag,$val) = each %$tags ) { + if( ref($val) =~ /ARRAY/i ) { + for my $v ( @$val ) { + $self->add_tag_value($tag,$v); + } + } else { + $self->add_tag_value($tag,$val); + } + } + if (exists $tags->{'B'}) { + $self->bootstrap($tags->{'B'}); + } + } elsif (defined $tags and ! ref ($tags)) { + print STDERR "here with $tags\n"; + # bootstrap by default + $self->bootstrap($tags); + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/RandomFactory.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/RandomFactory.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,409 @@ +# $Id: RandomFactory.pm,v 1.8 2002/12/24 17:52:03 jason Exp $ +# +# BioPerl module for Bio::Tree::RandomFactory +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::RandomFactory - TreeFactory for generating Random Trees + +=head1 SYNOPSIS + + use Bio::Tree::RandomFactory + my $factory = new Bio::Tree::RandomFactory( -samples => \@taxonnames, + -maxcount => 10); + + # or for anonymous samples + + my $factory = new Bio::Tree::RandomFactory( -sample_size => 6, + -maxcount = 50); + +=head1 DESCRIPTION + +Builds a random tree every time next_tree is called or up to -maxcount times. + +This algorithm is based on the make_tree algorithm from Richard Hudson 1990. + +Hudson, R. R. 1990. Gene genealogies and the coalescent + process. Pp. 1-44 in D. Futuyma and J. Antonovics, eds. Oxford + surveys in evolutionary biology. Vol. 7. Oxford University + Press, New York + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Matthew Hahn, Ematthew.hahn@duke.eduE + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tree::RandomFactory; +use vars qw(@ISA $PRECISION_DIGITS); +use strict; + +BEGIN { + $PRECISION_DIGITS = 3; # Precision for the branchlength +} + +use Bio::Factory::TreeFactoryI; +use Bio::Root::Root; +use Bio::TreeIO::TreeEventBuilder; +use Bio::Tree::AlleleNode; + +@ISA = qw(Bio::Root::Root Bio::Factory::TreeFactoryI ); + +=head2 new + + Title : new + Usage : my $factory = new Bio::Tree::RandomFactory(-samples => \@samples, + -maxcount=> $N); + Function: Initializes a Bio::Tree::RandomFactory object + Returns : Bio::Tree::RandomFactory + Args : + + +=cut + +sub new{ + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + $self->{'_eventbuilder'} = new Bio::TreeIO::TreeEventBuilder(); + $self->{'_treecounter'} = 0; + $self->{'_maxcount'} = 0; + my ($maxcount, $samps,$samplesize ) = $self->_rearrange([qw(MAXCOUNT + SAMPLES + SAMPLE_SIZE)], + @args); + my @samples; + + if( ! defined $samps ) { + if( ! defined $samplesize || $samplesize <= 0 ) { + $self->throw("Must specify a valid samplesize if parameter -SAMPLE is not specified"); + } + foreach ( 1..$samplesize ) { push @samples, "Samp$_"; } + } else { + if( ref($samps) =~ /ARRAY/i ) { + $self->throw("Must specify a valid ARRAY reference to the parameter -SAMPLES, did you forget a leading '\\'?"); + } + @samples = @$samps; + } + + $self->samples(\@samples); + $self->sample_size(scalar @samples); + if( defined $maxcount ) { + $self->maxcount($maxcount); + } + return $self; +} + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $factory->next_tree + Function: Returns a random tree based on the initialized number of nodes + NOTE: if maxcount is not specified on initialization or + set to a valid integer, subsequent calls to next_tree will + continue to return random trees and never return undef + + Returns : Bio::Tree::TreeI object + Args : none + +=cut + +sub next_tree{ + my ($self) = @_; + # If maxcount is set to something non-zero then next tree will + # continue to return valid trees until maxcount is reached + # otherwise will always return trees + return undef if( $self->maxcount && + $self->{'_treecounter'}++ >= $self->maxcount ); + my $size = $self->sample_size; + + my $in; + my @tree = (); + my @list = (); + + for($in=0;$in < 2*$size -1; $in++ ) { + push @tree, { 'nodenum' => "Node$in" }; + } + # in C we would have 2 arrays + # an array of nodes (tree) + # and array of pointers to these nodes (list) + # and we just shuffle the list items to do the + # tree topology generation + # instead in perl, we will have a list of hashes (nodes) called @tree + # and a list of integers representing the indexes in tree called @list + + for($in=0;$in < $size;$in++) { + $tree[$in]->{'time'} = 0; + $tree[$in]->{'desc1'} = undef; + $tree[$in]->{'desc2'} = undef; + push @list, $in; + } + + my $t=0; + # generate times for the nodes + for($in = $size; $in > 1; $in-- ) { + $t+= -2.0 * log(1 - $self->random(1)) / ( $in * ($in-1) ); + $tree[2 * $size - $in]->{'time'} =$t; + } + # topology generation + for ($in = $size; $in > 1; $in-- ) { + my $pick = int $self->random($in); + my $nodeindex = $list[$pick]; + my $swap = 2 * $size - $in; + $tree[$swap]->{'desc1'} = $nodeindex; + $list[$pick] = $list[$in-1]; + $pick = int rand($in - 1); + $nodeindex = $list[$pick]; + $tree[$swap]->{'desc2'} = $nodeindex; + $list[$pick] = $swap; + } + # Let's convert the hashes into nodes + + my @nodes = (); + foreach my $n ( @tree ) { + push @nodes, + new Bio::Tree::AlleleNode(-id => $n->{'nodenum'}, + -branch_length => $n->{'time'}); + } + my $ct = 0; + foreach my $node ( @nodes ) { + my $n = $tree[$ct++]; + if( defined $n->{'desc1'} ) { + $node->add_Descendent($nodes[$n->{'desc1'}]); + } + if( defined $n->{'desc2'} ) { + $node->add_Descendent($nodes[$n->{'desc2'}]); + } + } + my $T = new Bio::Tree::Tree(-root => pop @nodes ); + return $T; +} + +=head2 add_Mutations + + Title : add_Mutations + Usage : $factory->add_Mutations($tree, $mutcount); + Function: Adds mutations to a tree via a random process weighted by + branch length (it is a poisson distribution + as part of a coalescent process) + Returns : none + Args : $tree - Bio::Tree::TreeI + $nummut - number of mutations + + +=cut + +sub add_Mutations{ + my ($self,$tree, $nummut) = @_; + my @branches; + my @lens; + my $branchlen = 0; + my $last = 0; + my @nodes = $tree->get_nodes(); + my $precision = 10**$PRECISION_DIGITS; + my $i = 0; + + # Jason's somewhat simplistics way of doing a poission + # distribution for a fixed number of mutations + # build an array and put the node number in a slot + # representing the branch to put a mutation on + # but weight the number of slots per branch by the + # length of the branch ( ancestor's time - node time) + + foreach my $node ( @nodes ) { + if( $node->ancestor ) { + my $len = int ( ($node->ancestor->branch_length - + $node->branch_length) * $precision); + if ( $len > 0 ) { + for( my $j =0;$j < $len;$j++) { + push @branches, $i; + } + $last += $len; + } + $branchlen += $len; + } + if( ! $node->isa('Bio::Tree::AlleleNode') ) { + bless $node, 'Bio::Tree::AlleleNode'; # rebless it to the right node + } + $node->purge_markers; + $i++; + } + # sanity check + die("branch len is $branchlen arraylen is $last") + unless ( $branchlen == $last ); + + for( my $j = 0; $j < $nummut; $j++) { + my $index = int(rand($branchlen)); + my $branch = $branches[$index]; + $nodes[$branch]->add_alleles("Mutation$j", [1]); + } +} + +=head2 maxcount + + Title : maxcount + Usage : $obj->maxcount($newval) + Function: + Example : + Returns : value of maxcount + Args : newvalue (optional) + + +=cut + +sub maxcount{ + my ($self,$value) = @_; + if( defined $value) { + if( $value =~ /^(\d+)/ ) { + $self->{'maxcount'} = $1; + } else { + $self->warn("Must specify a valid Positive integer to maxcount"); + $self->{'maxcount'} = 0; + } + } + return $self->{'_maxcount'}; +} + +=head2 samples + + Title : samples + Usage : $obj->samples($newval) + Function: + Example : + Returns : value of samples + Args : newvalue (optional) + + +=cut + +sub samples{ + my ($self,$value) = @_; + if( defined $value) { + if( ref($value) !~ /ARRAY/i ) { + $self->warn("Must specify a valid array ref to the method 'samples'"); + $value = []; + } + $self->{'samples'} = $value; + } + return $self->{'samples'}; + +} + +=head2 sample_size + + Title : sample_size + Usage : $obj->sample_size($newval) + Function: + Example : + Returns : value of sample_size + Args : newvalue (optional) + + +=cut + +sub sample_size{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'sample_size'} = $value; + } + return $self->{'sample_size'}; + +} + +=head2 attach_EventHandler + + Title : attach_EventHandler + Usage : $parser->attatch_EventHandler($handler) + Function: Adds an event handler to listen for events + Returns : none + Args : Bio::Event::EventHandlerI + +=cut + +sub attach_EventHandler{ + my ($self,$handler) = @_; + return if( ! $handler ); + if( ! $handler->isa('Bio::Event::EventHandlerI') ) { + $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::Event::EventHandlerI'); + } + $self->{'_handler'} = $handler; + return; +} + +=head2 _eventHandler + + Title : _eventHandler + Usage : private + Function: Get the EventHandler + Returns : Bio::Event::EventHandlerI + Args : none + + +=cut + +sub _eventHandler{ + my ($self) = @_; + return $self->{'_handler'}; +} + +=head2 random + + Title : random + Usage : my $rfloat = $node->random($size) + Function: Generates a random number between 0 and $size + This is abstracted so that someone can override and provide their + own special RNG. This is expected to be a uniform RNG. + Returns : Floating point random + Args : $maximum size for random number (defaults to 1) + + +=cut + +sub random{ + my ($self,$max) = @_; + return rand($max); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/Statistics.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/Statistics.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,149 @@ +# $Id: Statistics.pm,v 1.6 2002/12/24 17:52:03 jason Exp $ +# +# BioPerl module for Bio::Tree::Statistics +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::Statistics - Calculate certain statistics for a Tree + +=head1 SYNOPSIS + +Give standard usage here + +=head1 DESCRIPTION + +This object is a place to accumulate routines for calculating various +tree statistics from population genetic and phylogenetic methods. + +Currently Fu and Li's D is implemented. +Tajima's D planned. + +References forthcoming. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Matt Hahn Ematthew.hahn@duke.dukeE + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tree::Statistics; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; + +@ISA = qw(Bio::Root::Root); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tree::Statistics(); + Function: Builds a new Bio::Tree::Statistics object + Returns : Bio::Tree::Statistics + Args : + + +=cut + +=head2 fu_and_li_D + + Title : fu_and_li_D + Usage : my $D = $statistics->fu_an_li_D($tree,$nummut); + Function: + For this we assume that the tree is made up of + Bio::Tree::AlleleNode's which contain markers and alleles + each marker is a 'mutation' + Returns : Fu and Li's D statistic for this Tree + Args : $tree - Bio::Tree::TreeI which contains Bio::Tree::AlleleNodes + +=cut + +sub fu_and_li_D{ + my ($self,$tree) = @_; + + # for this we assume that the tree is made up of + # allele nodes which contain markers and alleles + # each marker is a 'mutation' + my @nodes = $tree->get_nodes(); + my $muttotal =0; + my $tipmutcount = 0; + my $sampsize = 0; + foreach my $n ( @nodes ) { + if ($n->is_Leaf() ) { + $sampsize++; + $tipmutcount += $n->get_marker_names(); + } + $muttotal += $n->get_marker_names(); + } + + if( $muttotal <= 0 ) { + $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D"); + return 0; + } + my $a = 0; + for(my $k= 1; $k < $sampsize; $k++ ) { + $a += ( 1 / $k ); + } + + my $b = 0; + for(my $k= 1; $k < $sampsize; $k++ ) { + $b += ( 1 / $k**2 ); + } + + my $c = 2 * ( ( ( $sampsize * $a ) - (2 * ( $sampsize -1 ))) / + ( ( $sampsize - 1) * ( $sampsize - 2 ) ) ); + + my $v = 1 + ( ( $a**2 / ( $b + $a**2 ) ) * ( $c - ( ( $sampsize + 1) / + ( $sampsize - 1) ) )); + + my $u = $a - 1 - $v; + my $D = ( $muttotal - ( $a * $tipmutcount) ) / + ( sqrt ( ($u * $muttotal) + ( $v * $muttotal**2) ) ); + + return $D; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/Tree.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/Tree.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,302 @@ +# $Id: Tree.pm,v 1.13.2.2 2003/09/14 20:22:31 jason Exp $ +# +# BioPerl module for Bio::Tree::Tree +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::Tree - An Implementation of TreeI interface. + +=head1 SYNOPSIS + + # like from a TreeIO + my $treeio = new Bio::TreeIO(-format => 'newick', -file => 'treefile.dnd'); + my $tree = $treeio->next_tree; + my @nodes = $tree->get_nodes; + my $root = $tree->get_root_node; + + +=head1 DESCRIPTION + +This object holds handles to Nodes which make up a tree. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Aaron Mackey amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tree::Tree; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Tree::TreeFunctionsI; +use Bio::Tree::TreeI; + +@ISA = qw(Bio::Root::Root Bio::Tree::TreeI Bio::Tree::TreeFunctionsI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tree::Tree(); + Function: Builds a new Bio::Tree::Tree object + Returns : Bio::Tree::Tree + Args : -root => L object which is the root + -nodelete => boolean, whether or not to try and cleanup all + the nodes when this this tree goes out + of scope. + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + $self->{'_rootnode'} = undef; + $self->{'_maxbranchlen'} = 0; + $self->_register_for_cleanup(\&cleanup_tree); + my ($root,$nodel)= $self->_rearrange([qw(ROOT NODELETE)], @args); + if( $root ) { $self->set_root_node($root); } + $self->nodelete($nodel || 0); + return $self; +} + + +=head2 nodelete + + Title : nodelete + Usage : $obj->nodelete($newval) + Function: Get/Set Boolean whether or not to delete the underlying + nodes when it goes out of scope. By default this is false + meaning trees are cleaned up. + Returns : boolean + Args : on set, new boolean value + + +=cut + +sub nodelete{ + my $self = shift; + return $self->{'nodelete'} = shift if @_; + return $self->{'nodelete'}; +} + +=head2 get_nodes + + Title : get_nodes + Usage : my @nodes = $tree->get_nodes() + Function: Return list of Tree::NodeI objects + Returns : array of Tree::NodeI objects + Args : (named values) hash with one value + order => 'b|breadth' first order or 'd|depth' first order + +=cut + +sub get_nodes{ + my ($self, @args) = @_; + + my ($order, $sortby) = $self->_rearrange([qw(ORDER SORTBY)],@args); + $order ||= 'depth'; + $sortby ||= 'height'; + return () unless defined $self->get_root_node; + if ($order =~ m/^b|(breadth)$/oi) { + my $node = $self->get_root_node; + my @children = ($node); + for (@children) { + push @children, $_->each_Descendent($sortby); + } + return @children; + } + + if ($order =~ m/^d|(depth)$/oi) { + # this is depth-first search I believe + my $node = $self->get_root_node; + my @children = ($node,$node->get_Descendents($sortby)); + return @children; + } +} + +=head2 get_root_node + + Title : get_root_node + Usage : my $node = $tree->get_root_node(); + Function: Get the Top Node in the tree, in this implementation + Trees only have one top node. + Returns : Bio::Tree::NodeI object + Args : none + +=cut + + +sub get_root_node{ + my ($self) = @_; + return $self->{'_rootnode'}; +} + +=head2 set_root_node + + Title : set_root_node + Usage : $tree->set_root_node($node) + Function: Set the Root Node for the Tree + Returns : Bio::Tree::NodeI + Args : Bio::Tree::NodeI + +=cut + +sub set_root_node{ + my $self = shift; + if( @_ ) { + my $value = shift; + if( defined $value && + ! $value->isa('Bio::Tree::NodeI') ) { + $self->warn("Trying to set the root node to $value which is not a Bio::Tree::NodeI"); + return $self->get_root_node; + } + $self->{'_rootnode'} = $value; + } + return $self->get_root_node; +} + +=head2 total_branch_length + + Title : total_branch_length + Usage : my $size = $tree->total_branch_length + Function: Returns the sum of the length of all branches + Returns : integer + Args : none + +=cut + +sub total_branch_length { + my ($self) = @_; + my $sum = 0; + if( defined $self->get_root_node ) { + for ( $self->get_root_node->get_Descendents() ) { + $sum += $_->branch_length || 0; + } + } + return $sum; +} + +=head2 id + + Title : id + Usage : my $id = $tree->id(); + Function: An id value for the tree + Returns : scalar + Args : [optional] new value to set + + +=cut + +sub id{ + my ($self,$val) = @_; + if( defined $val ) { + $self->{'_treeid'} = $val; + } + return $self->{'_treeid'}; +} + +=head2 score + + Title : score + Usage : $obj->score($newval) + Function: Sets the associated score with this tree + This is a generic slot which is probably best used + for log likelihood or other overall tree score + Returns : value of score + Args : newvalue (optional) + + +=cut + +sub score{ + my ($self,$val) = @_; + if( defined $val ) { + $self->{'_score'} = $val; + } + return $self->{'_score'}; +} + + +# decorated interface TreeI Implements this + +=head2 height + + Title : height + Usage : my $height = $tree->height + Function: Gets the height of tree - this LOG_2($number_nodes) + WARNING: this is only true for strict binary trees. The TreeIO + system is capable of building non-binary trees, for which this + method will currently return an incorrect value!! + Returns : integer + Args : none + +=head2 number_nodes + + Title : number_nodes + Usage : my $size = $tree->number_nodes + Function: Returns the number of nodes + Example : + Returns : + Args : + + +=cut + + +# -- private internal methods -- + +sub cleanup_tree { + my $self = shift; + unless( $self->nodelete ) { + foreach my $node ( $self->get_nodes ) { + $node->ancestor(undef); + $node = undef; + } + } + $self->{'_rootnode'} = undef; +} +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,555 @@ +# $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $ +# +# BioPerl module for Bio::Tree::TreeFunctionsI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods + +=head1 SYNOPSIS + + use Bio::TreeIO; + my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre'); + + my $tree = $in->next_tree; + + my @nodes = $tree->find_node('id1'); + + if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){ + + } + +=head1 DESCRIPTION + +Describe the interface here + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese + +Email jason-at-bioperl-dot-org +Email amackey-at-virginia.edu +Email jtr4v-at-virginia.edu + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +Rerooting code was worked on by + + Daniel Barker d.barker-at-reading.ac.uk + Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tree::TreeFunctionsI; +use vars qw(@ISA); +use strict; +use Bio::Tree::TreeI; + +@ISA = qw(Bio::Tree::TreeI); + +=head2 find_node + + Title : find_node + Usage : my @nodes = $self->find_node(-id => 'node1'); + Function: returns all nodes that match a specific field, by default this + is id, but different branch_length, + Returns : List of nodes which matched search + Args : text string to search for + OR + -fieldname => $textstring + +=cut + +sub find_node { + my ($self,$type,$field) = @_; + if( ! defined $type ) { + $self->warn("Must request a either a string or field and string when searching"); + } + + # all this work for a '-' named field + # is so that we could potentially + # expand to other constraints in + # different implementations + # like 'find all nodes with boostrap < XX' + + if( ! defined $field ) { + # only 1 argument, default to searching by id + $field= $type; + $type = 'id'; + } else { + $type =~ s/^-//; + } + + # could actually do this by testing $rootnode->can($type) but + # it is possible that a tree is implemeted with different node types + # - although it is unlikely that the root node would be richer than the + # leaf nodes. Can't handle NHX tags right now + + unless( $type eq 'id' || $type eq 'name' || + $type eq 'bootstrap' || $type eq 'description' || + $type eq 'internal_id') { + $self->warn("unknown search type $type - will try anyways"); + } + my @nodes = grep { $_->can($type) && defined $_->$type() && + $_->$type() eq $field } $self->get_nodes(); + + if ( wantarray) { + return @nodes; + } else { + if( @nodes > 1 ) { + $self->warn("More than 1 node found but caller requested scalar, only returning first node"); + } + return shift @nodes; + } +} + +=head2 remove_Node + + Title : remove_Node + Usage : $tree->remove_Node($node) + Function: Removes a node from the tree + Returns : boolean represent status of success + Args : either Bio::Tree::NodeI or string of the node id + + +=cut + +sub remove_Node { + my ($self,$input) = @_; + my $node = undef; + unless( ref($input) ) { + $node = $self->find_node($input); + } elsif( ! $input->isa('Bio::Tree::NodeI') ) { + $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name"); + return 0; + } else { + $node = $input; + } + if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) { + $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!"); + } else { + $node->ancestor->remove_Descendent($node); + } +} + + +# Added for Justin Reese by Jason + +=head2 get_lca + + Title : get_lca + Usage : get_lca(-nodes => \@nodes ) + Function: given two nodes, returns the lowest common ancestor + Returns : node object + Args : -nodes => arrayref of nodes to test + + +=cut + +sub get_lca { + my ($self,@args) = @_; + my ($nodes) = $self->_rearrange([qw(NODES)],@args); + if( ! defined $nodes ) { + $self->warn("Must supply -nodes parameter to get_lca() method"); + return undef; + } + my ($node1,$node2) = $self->_check_two_nodes($nodes); + return undef unless $node1 && $node2; + + # algorithm: Start with first node, find and save every node from it to + # root. Then start with second node; for it and each of its ancestor + # nodes, check to see if it's in the first node's ancestor list - if + # so it is the lca. + # + # This is very slow and naive, but I somehow doubt the overhead + # of mapping the tree to a complete binary tree and doing the linear + # lca search would be worth the overhead, especially for small trees. + # Maybe someday I'll write a linear get_lca and find out. + + # find and save every ancestor of node1 (including itself) + + my %node1_ancestors; # keys are internal ids, values are objects + my $place = $node1; # start at node1 + + while ( $place ){ + $node1_ancestors{$place->internal_id} = $place; + $place = $place->ancestor; + } + + # now climb up node2, for each node checking whether + # it's in node1_ancestors + $place = $node2; # start at node2 + while ( $place ){ + foreach my $key ( keys %node1_ancestors ){ # ugh + if ( $place->internal_id == $key){ + return $node1_ancestors{$key}; + } + } + $place = $place->ancestor; + } + $self->warn("Could not find lca!"); # should never execute, + # if so, there's a problem + return undef; +} + +# Added for Justin Reese by Jason + +=head2 distance + + Title : distance + Usage : distance(-nodes => \@nodes ) + Function: returns the distance between two given nodes + Returns : numerical distance + Args : -nodes => arrayref of nodes to test + + +=cut + +sub distance { + my ($self,@args) = @_; + my ($nodes) = $self->_rearrange([qw(NODES)],@args); + if( ! defined $nodes ) { + $self->warn("Must supply -nodes parameter to distance() method"); + return undef; + } + my ($node1,$node2) = $self->_check_two_nodes($nodes); + # algorithm: + + # Find lca: Start with first node, find and save every node from it + # to root, saving cumulative distance. Then start with second node; + # for it and each of its ancestor nodes, check to see if it's in + # the first node's ancestor list - if so it is the lca. Return sum + # of (cumul. distance from node1 to lca) and (cumul. distance from + # node2 to lca) + + # find and save every ancestor of node1 (including itself) + + my %node1_ancestors; # keys are internal ids, values are objects + my %node1_cumul_dist; # keys are internal ids, values + # are cumulative distance from node1 to given node + my $place = $node1; # start at node1 + my $cumul_dist = 0; + + while ( $place ){ + $node1_ancestors{$place->internal_id} = $place; + $node1_cumul_dist{$place->internal_id} = $cumul_dist; + if ($place->branch_length) { + $cumul_dist += $place->branch_length; # include current branch + # length in next iteration + } + $place = $place->ancestor; + } + + # now climb up node2, for each node checking whether + # it's in node1_ancestors + $place = $node2; # start at node2 + $cumul_dist = 0; + while ( $place ){ + foreach my $key ( keys %node1_ancestors ){ # ugh + if ( $place->internal_id == $key){ # we're at lca + return $node1_cumul_dist{$key} + $cumul_dist; + } + } + # include current branch length in next iteration + $cumul_dist += $place->branch_length; + $place = $place->ancestor; + } + $self->warn("Could not find distance!"); # should never execute, + # if so, there's a problem + return undef; +} + +# helper function to check lca and distance arguments + +sub _check_two_nodes { + my ($self, $nodes) = @_; + + if( ref($nodes) !~ /ARRAY/i || + !ref($nodes->[0]) || + !ref($nodes->[1]) + ) { + $self->warn("Must provide a valid array reference for -nodes"); + return undef; + } elsif( scalar(@$nodes) > 2 ){ + $self->warn("More than two nodes given, using first two"); + } elsif( scalar(@$nodes) < 2 ){ + $self->warn("-nodes parameter does not contain reference to two nodes"); + return undef; + } + unless( $nodes->[0]->isa('Bio::Tree::NodeI') && + $nodes->[1]->isa('Bio::Tree::NodeI') ) { + $self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n"); + return undef; + } + return @$nodes; +} + + +=head2 is_monophyletic + + Title : is_monophyletic + Usage : if( $tree->is_monophyletic(-nodes => \@nodes, + -outgroup => $outgroup) + Function: Will do a test of monophyly for the nodes specified + in comparison to a chosen outgroup + Returns : boolean + Args : -nodes => arrayref of nodes to test + -outgroup => outgroup to serve as a reference + + +=cut + +sub is_monophyletic{ + my ($self,@args) = @_; + my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args); + + if( ! defined $nodes || ! defined $outgroup ) { + $self->warn("Must supply -nodes and -outgroup parameters to the method +is_monophyletic"); + return undef; + } + if( ref($nodes) !~ /ARRAY/i ) { + $self->warn("Must provide a valid array reference for -nodes"); + } + my $clade_root; + # this is to combine multiple tests into a single node + # order doesn't really matter as long as get_lca does its job right + while( @$nodes > 2 ) { + my ($a,$b) = ( shift @$nodes, shift @$nodes); + $clade_root = $self->get_lca(-nodes => [$a,$b] ); + unshift @$nodes, $clade_root; + } + $clade_root = $self->get_lca(-nodes => $nodes ); + my $og_ancestor = $outgroup->ancestor; + while( defined ($og_ancestor ) ) { + if( $og_ancestor->internal_id == $clade_root->internal_id ) { + # monophyly is violated + return 0; + } + $og_ancestor = $og_ancestor->ancestor; + } + return 1; +} + +=head2 is_paraphyletic + + Title : is_paraphyletic + Usage : if( $tree->is_paraphyletic(-nodes =>\@nodes, + -outgroup => $node) ){ } + Function: Tests whether or not a given set of nodes are paraphyletic + (representing the full clade) given an outgroup + Returns : [-1,0,1] , -1 if the group is not monophyletic + 0 if the group is not paraphyletic + 1 if the group is paraphyletic + Args : -nodes => Array of Bio::Tree::NodeI objects which are in the tree + -outgroup => a Bio::Tree::NodeI to compare the nodes to + + +=cut + +sub is_paraphyletic{ + my ($self,@args) = @_; + my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args); + + if( ! defined $nodes || ! defined $outgroup ) { + $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic"); + return undef; + } + if( ref($nodes) !~ /ARRAY/i ) { + $self->warn("Must provide a valid array reference for -nodes"); + return undef; + } + + # Algorithm + # Find the lca + # Find all the nodes beneath the lca + # Test to see that none are missing from the nodes list + my %nodehash; + foreach my $n ( @$nodes ) { + $nodehash{$n->internal_id} = $n; + } + while( @$nodes > 2 ) { + unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes, + shift @$nodes)] ); + } + my $clade_root = $self->get_lca(-nodes => $nodes ); + unless( defined $clade_root ) { + $self->warn("could not find clade root via lca"); + return undef; + } + my $og_ancestor = $outgroup->ancestor; + + # Is this necessary/correct for paraphyly test? + while( defined ($og_ancestor ) ) { + if( $og_ancestor->internal_id == $clade_root->internal_id ) { + # monophyly is violated, could be paraphyletic + return -1; + } + $og_ancestor = $og_ancestor->ancestor; + } + my $tree = new Bio::Tree::Tree(-root => $clade_root, + -nodelete => 1); + + foreach my $n ( $tree->get_nodes() ) { + next unless $n->is_Leaf(); + # if any leaf node is not in the list + # then it is part of the clade and so the list + # must be paraphyletic + return 1 unless ( $nodehash{$n->internal_id} ); + } + return 0; +} + + +=head2 reroot + + Title : reroot_tree + Usage : $tree->reroot($node); + Function: Reroots a tree either making a new node the root + Returns : 1 on success, 0 on failure + Args : Bio::Tree::NodeI that is in the tree, but is not the current root + +=cut + +sub reroot { + my ($self,$new_root) = @_; + unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) { + $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); + return 0; + } + if( $new_root->is_Leaf() ) { + $self->warn("Asking to root with a leaf, will use the leaf's ancestor"); + $new_root = $new_root->ancestor; + } + + my $old_root = $self->get_root_node; + if( $new_root == $old_root ) { + $self->warn("Node requested for reroot is already the root node!"); + return 0; + } + + my @path = (); # along tree, from newroot to oldroot + my $node = $new_root; + while ($node) { + push @path, $node; + $node = $node->ancestor; + } + + my @path_from_oldroot = reverse @path; + for (my $i = 0; $i < @path_from_oldroot - 1; $i++) { + my $current = $path_from_oldroot[$i]; + my $next = $path_from_oldroot[$i + 1]; + $current->remove_Descendent($next); + $current->branch_length($next->branch_length); + $next->add_Descendent($current); + + } + $new_root->branch_length(undef); + $self->set_root_node($new_root); + + return 1; +} + +=head2 reverse_edge + + Title : reverse_edge + Usage : $node->reverse_edge(child); + Function: makes child be a parent of node + Requires: child must be a direct descendent of node + Returns : nothing + Args : Bio::Tree::NodeI that is in the tree + +=cut + +sub reverse_edge { + my ($self,$node) = @_; + delete_edge($self, $node); + $node->add_Descendent($self); + 1; +} + +=head2 delete_edge + + Title : delete_edge + Usage : $node->reverse_edge(child); + Function: makes child be a parent of node + Requires: child must be a direct descendent of node + Returns : nothing + Args : Bio::Tree::NodeI that is in the tree + +=cut + +sub delete_edge { + my ($self,$node) = @_; + unless (defined $self && $self->isa("Bio::Tree::NodeI")) { + $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); + return 1; + } + unless (defined $node && $node->isa("Bio::Tree::NodeI")) { + $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); + return 1; + } + if( $self->{'_desc'}->{$node->internal_id} ) { + $node->ancestor(undef); + $self->{'_desc'}->{$node->internal_id}->ancestor(undef); + delete $self->{'_desc'}->{$node->internal_id}; + } else { + $self->warn("First argument must be direct parent of node"); + return 1; + } + 1; +} + +sub findnode_by_id { + my $tree = shift; + my $id = shift; + my $rootnode = $tree->get_root_node; + if ( ($rootnode->id) and ($rootnode->id eq $id) ) { + return $rootnode; + } + # process all the children + foreach my $node ( $rootnode->get_Descendents ) { + if ( ($node->id) and ($node->id eq $id ) ) { + return $node; + } + } +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Tree/TreeI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tree/TreeI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,218 @@ +# $Id: TreeI.pm,v 1.11.2.1 2003/09/14 20:21:10 jason Exp $ +# +# BioPerl module for Bio::Tree::TreeI +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tree::TreeI - A Tree object suitable for lots of things, designed + originally for Phylogenetic Trees. + +=head1 SYNOPSIS + + # get a Bio::Tree::TreeI somehow + # like from a TreeIO + my $treeio = new Bio::TreeIO(-format => 'newick', -file => 'treefile.dnd'); + my $tree = $treeio->next_tree; + my @nodes = $tree->get_nodes; + my @leaves = $tree->get_leaf_nodes; + my $root = $tree->get_root_node; + +=head1 DESCRIPTION + +This object holds a pointer to the Root of a Tree which is a +Bio::Tree::NodeI. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Aaron Mackey amackey@virginia.edu +Elia Stupka, elia@fugu-sg.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tree::TreeI; +use Bio::Tree::NodeI; +use vars qw(@ISA); +use strict; + +@ISA = qw(Bio::Tree::NodeI); + +=head2 get_nodes + + Title : get_nodes + Usage : my @nodes = $tree->get_nodes() + Function: Return list of Tree::NodeI objects + Returns : array of Tree::NodeI objects + Args : (named values) hash with one value + order => 'b|breadth' first order or 'd|depth' first order + +=cut + +sub get_nodes{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 get_root_node + + Title : get_root_node + Usage : my $node = $tree->get_root_node(); + Function: Get the Top Node in the tree, in this implementation + Trees only have one top node. + Returns : Bio::Tree::NodeI object + Args : none + +=cut + +sub get_root_node{ + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 number_nodes + + Title : number_nodes + Usage : my $size = $tree->number_nodes + Function: Returns the number of nodes + Example : + Returns : + Args : + + +=cut + +sub number_nodes{ + my ($self) = @_; + my $root = $self->get_root_node; + if( defined $root && $root->isa('Bio::Tree::NodeI')) { + return $root->descendent_count; + } + return 0; +} + +=head2 total_branch_length + + Title : total_branch_length + Usage : my $size = $tree->total_branch_length + Function: Returns the sum of the length of all branches + Returns : integer + Args : none + +=cut + +sub total_branch_length { + my ($self) = @_; + $self->throw_not_implemented(); +} + +=head2 height + + Title : height + Usage : my $height = $tree->height + Function: Gets the height of tree - this LOG_2($number_nodes) + WARNING: this is only true for strict binary trees. The TreeIO + system is capable of building non-binary trees, for which this + method will currently return an incorrect value!! + Returns : integer + Args : none + +=cut + +sub height{ + my ($self) = @_; + my $nodect = $self->number_nodes; + return 0 if( ! $nodect ); + return log($nodect) / log(2); +} + +=head2 id + + Title : id + Usage : my $id = $tree->id(); + Function: An id value for the tree + Returns : scalar + Args : + + +=cut + +sub id{ + my ($self,@args) = @_; + $self->throw_not_implemented(); +} + +=head2 score + + Title : score + Usage : $obj->score($newval) + Function: Sets the associated score with this tree + This is a generic slot which is probably best used + for log likelihood or other overall tree score + Returns : value of score + Args : newvalue (optional) + + +=cut + +sub score{ + my ($self,$value) = @_; + $self->throw_not_implemented(); +} + +=head2 get_leaf_nodes + + Title : get_leaf_nodes + Usage : my @leaves = $tree->get_leaf_nodes() + Function: Returns the leaves (tips) of the tree + Returns : Array of Bio::Tree::NodeI objects + Args : none + + +=cut + +sub get_leaf_nodes{ + my ($self) = @_; + return grep { $_->is_Leaf() } $self->get_nodes(-sortby => 'creation'); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/TreeIO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/TreeIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,279 @@ +# $Id: TreeIO.pm,v 1.11 2002/11/05 17:26:04 heikki Exp $ +# +# BioPerl module for Bio::TreeIO +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::TreeIO - Parser for Tree files + +=head1 SYNOPSIS + + { + use Bio::TreeIO; + my $treeio = new Bio::TreeIO('-format' => 'newick', + '-file' => 'globin.dnd'); + while( my $tree = $treeio->next_tree ) { + print "Tree is ", $tree->size, "\n"; + } + } + +=head1 DESCRIPTION + +This is the driver module for Tree reading from data streams and +flatfiles. This is intended to be able to create Bio::Tree::TreeI +objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::TreeIO; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Event::EventGeneratorI; +use Bio::TreeIO::TreeEventBuilder; +use Bio::Factory::TreeFactoryI; + +@ISA = qw(Bio::Root::Root Bio::Root::IO + Bio::Event::EventGeneratorI Bio::Factory::TreeFactoryI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::TreeIO(); + Function: Builds a new Bio::TreeIO object + Returns : Bio::TreeIO + Args : + + +=cut + +sub new { + my($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::TreeIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{'-file'} || $ARGV[0] ) || + 'newick'; + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::TreeIO::$format"->new(@args); + } +} + + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $treeio->next_tree; + Function: Gets the next tree off the stream + Returns : Bio::Tree::TreeI or undef if no more trees + Args : none + +=cut + +sub next_tree{ + my ($self) = @_; + $self->throw("Cannot call method next_tree on Bio::TreeIO object must use a subclass"); +} + +=head2 write_tree + + Title : write_tree + Usage : $treeio->write_tree($tree); + Function: Writes a tree onto the stream + Returns : none + Args : Bio::Tree::TreeI + + +=cut + +sub write_tree{ + my ($self,$tree) = @_; + $self->throw("Cannot call method write_tree on Bio::TreeIO object must use a subclass"); +} + + +=head2 attach_EventHandler + + Title : attach_EventHandler + Usage : $parser->attatch_EventHandler($handler) + Function: Adds an event handler to listen for events + Returns : none + Args : Bio::Event::EventHandlerI + +=cut + +sub attach_EventHandler{ + my ($self,$handler) = @_; + return if( ! $handler ); + if( ! $handler->isa('Bio::Event::EventHandlerI') ) { + $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::Event::EventHandlerI'); + } + $self->{'_handler'} = $handler; + return; +} + +=head2 _eventHandler + + Title : _eventHandler + Usage : private + Function: Get the EventHandler + Returns : Bio::Event::EventHandlerI + Args : none + + +=cut + +sub _eventHandler{ + my ($self) = @_; + return $self->{'_handler'}; +} + +sub _initialize { + my($self, @args) = @_; + $self->{'_handler'} = undef; + + # initialize the IO part + $self->_initialize_io(@args); + $self->attach_EventHandler(new Bio::TreeIO::TreeEventBuilder(-verbose => $self->verbose(), @args)); +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL TreeIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self,$format) = @_; + my $module = "Bio::TreeIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <_guess_format($filename) + Function: + Example : + Returns : guessed format of filename (lower case) + Args : + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'newick' if /\.(dnd|newick|nh)$/i; + return 'nhx' if /\.(nhx)$/i; + return 'phyloxml' if /\.(xml)$/i; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my $class = shift; + return bless {'treeio' => shift},$class; +} + +sub READLINE { + my $self = shift; + return $self->{'treeio'}->next_tree() unless wantarray; + my (@list,$obj); + push @list,$obj while $obj = $self->{'treeio'}->next_tree(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'treeio'}->write_tree(@_); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/TreeIO/TreeEventBuilder.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/TreeIO/TreeEventBuilder.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,352 @@ +# $Id: TreeEventBuilder.pm,v 1.11.2.1 2003/09/13 21:51:05 jason Exp $ +# +# BioPerl module for Bio::TreeIO::TreeEventBuilder +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::TreeIO::TreeEventBuilder - Build Bio::Tree::Tree's and + Bio::Tree::Node's from Events + +=head1 SYNOPSIS + +# internal use only + +=head1 DESCRIPTION + +This object will take events and build a Bio::Tree::TreeI compliant +object makde up of Bio::Tree::NodeI objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::TreeIO::TreeEventBuilder; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::Event::EventHandlerI; +use Bio::Tree::Tree; +use Bio::Tree::Node; + +@ISA = qw(Bio::Root::Root Bio::Event::EventHandlerI); + +=head2 new + + Title : new + Usage : my $obj = new Bio::TreeIO::TreeEventBuilder(); + Function: Builds a new Bio::TreeIO::TreeEventBuilder object + Returns : Bio::TreeIO::TreeEventBuilder + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($treetype, $nodetype) = $self->_rearrange([qw(TREETYPE + NODETYPE)], @args); + $treetype ||= 'Bio::Tree::Tree'; + $nodetype ||= 'Bio::Tree::Node'; + + eval { + $self->_load_module($treetype); + $self->_load_module($nodetype); + }; + + if( $@ ) { + $self->throw("Could not load module $treetype or $nodetype. \n$@\n") + } + $self->treetype($treetype); + $self->nodetype($nodetype); + $self->{'_treelevel'} = 0; + + return $self; +} + +=head2 treetype + + Title : treetype + Usage : $obj->treetype($newval) + Function: + Returns : value of treetype + Args : newvalue (optional) + + +=cut + +sub treetype{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'treetype'} = $value; + } + return $self->{'treetype'}; +} + +=head2 nodetype + + Title : nodetype + Usage : $obj->nodetype($newval) + Function: + Returns : value of nodetype + Args : newvalue (optional) + + +=cut + +sub nodetype{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'nodetype'} = $value; + } + return $self->{'nodetype'}; +} + + +=head2 SAX methods + +=cut + +=head2 start_document + + Title : start_document + Usage : $handler->start_document + Function: Begins a Tree event cycle + Returns : none + Args : none + +=cut + +sub start_document { + my ($self) = @_; + $self->{'_lastitem'} = {}; + $self->{'_currentitems'} = []; + $self->{'_currentnodes'} = []; + return; +} + +=head2 end_document + + Title : end_document + Usage : my @trees = $parser->end_document + Function: Finishes a Phylogeny cycle + Returns : An array Bio::Tree::TreeI + Args : none + +=cut + +sub end_document { + my ($self) = @_; + my $vb = $self->verbose; + my $root = $self->nodetype->new(-verbose => $vb); + # aggregate the nodes into trees basically ad-hoc. + while ( @{$self->{'_currentnodes'}} ) { + my ($node) = ( shift @{$self->{'_currentnodes'}}); + $root->add_Descendent($node); + } + + $self->debug("Root node is " . $root->to_string()."\n"); + if( $self->verbose > 0 ) { + foreach my $node ( $root->get_Descendents ) { + $self->debug("node is ". $node->to_string(). "\n"); + } + } + my $tree = $self->treetype->new(-root => $root, + -verbose => $vb); + return $tree; +} + +=head2 start_element + + Title : start_element + Usage : + Function: + Example : + Returns : + Args : $data => hashref with key 'Name' + +=cut + +sub start_element{ + my ($self,$data) =@_; + $self->{'_lastitem'}->{$data->{'Name'}}++; + + $self->debug("starting element: $data->{Name}\n"); + + push @{$self->{'_lastitem'}->{'current'}},$data->{'Name'}; + + my %data; + + if( $data->{'Name'} eq 'node' ) { + push @{$self->{'_currentitems'}}, \%data; + } elsif ( $data->{Name} eq 'tree' ) { + $self->{'_treelevel'}++; + } +} + +=head2 end_element + + Title : end_element + Usage : + Function: + Returns : none + Args : $data => hashref with key 'Name' + +=cut + +sub end_element{ + my ($self,$data) = @_; + + $self->debug("end of element: $data->{Name}\n"); + + if( $data->{'Name'} eq 'node' ) { + my $tnode; + my $node = pop @{$self->{'_currentitems'}}; + + $tnode = $self->nodetype->new(-verbose => $self->verbose, + %{$node}); + unless ( $node->{'-leaf'} ) { + for ( splice( @{$self->{'_currentnodes'}}, + - $self->{'_nodect'}->[$self->{'_treelevel'}+1])) { + + $self->debug("adding desc: " . $_->to_string . "\n"); + $tnode->add_Descendent($_); + } + $self->{_nodect}->[$self->{_treelevel}+1] = 0; + } + push @{$self->{'_currentnodes'}}, $tnode; + $self->{_nodect}->[$self->{'_treelevel'}]++; + $self->debug ("added node: nodes in stack is ". scalar @{$self->{'_currentnodes'}} . ", treelevel: $self->{_treelevel}, nodect: $self->{_nodect}->[$self->{_treelevel}]\n"); + } elsif( $data->{'Name'} eq 'tree' ) { + $self->debug("end of tree: nodes in stack is ". scalar @{$self->{'_currentnodes'}}. "\n"); + $self->{'_treelevel'}--; + } + $self->{'_lastitem'}->{ $data->{'Name'} }--; + + pop @{$self->{'_lastitem'}->{'current'}}; +} + + +=head2 in_element + + Title : in_element + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub in_element{ + my ($self,$e) = @_; + + return 0 if ! defined $self->{'_lastitem'} || + ! defined $self->{'_lastitem'}->{'current'}->[-1]; + return ($e eq $self->{'_lastitem'}->{'current'}->[-1]); + +} + +=head2 within_element + + Title : within_element + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub within_element{ + my ($self,$e) = @_; + return $self->{'_lastitem'}->{$e}; +} + +=head2 characters + + Title : characters + Usage : $handler->characters($text); + Function: Processes characters + Returns : none + Args : text string + + +=cut + +sub characters{ + my ($self,$ch) = @_; + if( $self->within_element('node') ) { + my $hash = pop @{$self->{'_currentitems'}}; + if( $self->in_element('bootstrap') ) { + $hash->{'-bootstrap'} = $ch; + } elsif( $self->in_element('branch_length') ) { + $hash->{'-branch_length'} = $ch; + } elsif( $self->in_element('id') ) { + $hash->{'-id'} = $ch; + } elsif( $self->in_element('description') ) { + $hash->{'-desc'} = $ch; + } elsif ( $self->in_element('tag_name') ) { + $hash->{'-NHXtagname'} = $ch; + } elsif ( $self->in_element('tag_value') ) { + $hash->{'-nhx'}->{$hash->{'-NHXtagname'}} = $ch; + delete $hash->{'-NHXtagname'}; + } elsif( $self->in_element('leaf') ) { + $hash->{'-leaf'} = $ch; + } + push @{$self->{'_currentitems'}}, $hash; + } + $self->debug("chars: $ch\n"); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/TreeIO/newick.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/TreeIO/newick.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ +# $Id: newick.pm,v 1.13.2.4 2003/09/14 19:00:35 jason Exp $ +# +# BioPerl module for Bio::TreeIO::newick +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::TreeIO::newick - TreeIO implementation for parsing + Newick/New Hampshire/PHYLIP format. + +=head1 SYNOPSIS + + # do not use this module directly + use Bio::TreeIO; + my $treeio = new Bio::TreeIO(-format => 'newick', + -file => 't/data/LOAD_Ccd1.dnd'); + my $tree = $treeio->next_tree; + +=head1 DESCRIPTION + +This module handles parsing and writing of Newick/PHYLIP/New Hampshire format. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::TreeIO::newick; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::TreeIO; +use Bio::Event::EventGeneratorI; +#use XML::Handler::Subs; + + +@ISA = qw(Bio::TreeIO ); + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $treeio->next_tree + Function: Gets the next tree in the stream + Returns : Bio::Tree::TreeI + Args : none + + +=cut + +sub next_tree{ + my ($self) = @_; + local $/ = ";\n"; + return unless $_ = $self->_readline; +# s/\s+//g; + my $despace = sub {my $dirty = shift; $dirty =~ s/\s+//gs; return $dirty}; + my $dequote = sub {my $dirty = shift; $dirty =~ s/^"?\s*(.+?)\s*"?$/$1/; return $dirty}; + s/([^"]*)(".+?")([^"]*)/$despace->($1) . $dequote->($2) . $despace->($3)/egsx; + $self->debug("entry is $_\n"); +# my $empty = chr(20); + + # replace empty labels with a tag +# s/\(,/\($empty,/ig; +# s/,,/,$empty,/ig; +# s/,,/,/ig; +# s/,\)/,$empty\)/ig; +# s/\"/\'/ig; + + my $chars = ''; + $self->_eventHandler->start_document; + my ($prev_event,$lastevent,$id) = ('','',''); + foreach my $ch ( split(//,$_) ) { + if( $ch eq ';' ) { + return $self->_eventHandler->end_document; + } elsif( $ch eq '(' ) { + $chars = ''; + $self->_eventHandler->start_element( {'Name' => 'tree'} ); + } elsif($ch eq ')' ) { + if( length $chars ) { + if( $lastevent eq ':' ) { + $self->_eventHandler->start_element( { 'Name' => 'branch_length'}); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( {'Name' => 'branch_length'}); + $lastevent = $prev_event; + } else { + $self->debug("id with no branchlength is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + $id = $chars; + } + my $leafstatus = 0; + if( $lastevent ne ')' ) { + $leafstatus = 1; + } + + $self->_eventHandler->start_element({'Name' => 'leaf'}); + $self->_eventHandler->characters($leafstatus); + $self->_eventHandler->end_element({'Name' => 'leaf'}); + $id = ''; + } else { + $self->_eventHandler->start_element( {'Name' => 'node'} ); + } + + $self->_eventHandler->end_element( {'Name' => 'node'} ); + $self->_eventHandler->end_element( {'Name' => 'tree'} ); + $chars = ''; + } elsif ( $ch eq ',' ) { + if( $chars ) { + if( $lastevent eq ':' ) { + $self->_eventHandler->start_element( { 'Name' => 'branch_length'}); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( {'Name' => 'branch_length'}); + $lastevent = $prev_event; + $chars = ''; + } else { + $self->debug("id with no branchlength is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + $id = $chars; + } + } else { + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + } + my $leafstatus = ( $lastevent ne ')' ) ? 1 : 0; + + $self->_eventHandler->start_element({'Name' => 'leaf'}); + $self->_eventHandler->characters($leafstatus); + $self->_eventHandler->end_element({'Name' => 'leaf'}); + $self->_eventHandler->end_element( {'Name' => 'node'} ); + $chars = ''; + $id = ''; + } elsif( $ch eq ':' ) { + $self->debug("id with a branchlength coming is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + $id = $chars; + $chars = ''; + } else { + $chars .= $ch; + next; + } + $prev_event = $lastevent; + $lastevent = $ch; + } + return undef; +} + +=head2 write_tree + + Title : write_tree + Usage : $treeio->write_tree($tree); + Function: Write a tree out to data stream in newick/phylip format + Returns : none + Args : Bio::Tree::TreeI object + +=cut + +sub write_tree{ + my ($self,@trees) = @_; + foreach my $tree( @trees ) { + my @data = _write_tree_Helper($tree->get_root_node); + if($data[-1] !~ /\)$/ ) { + $data[0] = "(".$data[0]; + $data[-1] .= ")"; + } + $self->_print(join(',', @data), ";\n"); + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return; +} + +sub _write_tree_Helper { + my ($node) = @_; + return () if (!defined $node); + + my @data; + + foreach my $n ( $node->each_Descendent() ) { + push @data, _write_tree_Helper($n); + } + + if( @data > 1 ) { + $data[0] = "(" . $data[0]; + $data[-1] .= ")"; + # let's explicitly write out the bootstrap if we've got it + my $b; + if( defined ($b = $node->bootstrap) ) { + $data[-1] .= $b; + } elsif( defined ($b = $node->id) ) { + $data[-1] .= $b; + } + $data[-1] .= ":". $node->branch_length if( $node->branch_length); + + } else { + if( defined $node->id || defined $node->branch_length ) { + push @data, sprintf("%s%s", + defined $node->id ? $node->id : '', + defined $node->branch_length ? ":" . + $node->branch_length : ''); + } + } + return @data; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/TreeIO/nhx.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/TreeIO/nhx.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,296 @@ +# $Id: nhx.pm,v 1.4.2.2 2003/09/14 19:00:36 jason Exp $ +# +# BioPerl module for Bio::TreeIO::nhx +# +# Cared for by Aaron Mackey +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::TreeIO::nhx - TreeIO implementation for parsing + Newick/New Hampshire eXtendend (NHX) format. + +=head1 SYNOPSIS + + # do not use this module directly + use Bio::TreeIO; + my $treeio = new Bio::TreeIO(-format => 'nhx', -file => 'tree.dnd'); + my $tree = $treeio->next_tree; + +=head1 DESCRIPTION + +This module handles parsing and writing of Newick/New Hampshire eXtended (NHX) format. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::TreeIO::nhx; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::TreeIO; +use Bio::Tree::NodeNHX; +use Bio::Event::EventGeneratorI; +#use XML::Handler::Subs; + + +@ISA = qw(Bio::TreeIO ); + +sub _initialize { + my($self, %args) = @_; + $args{-nodetype} ||= 'Bio::Tree::NodeNHX'; + $self->SUPER::_initialize(%args); +} + +=head2 next_tree + + Title : next_tree + Usage : my $tree = $treeio->next_tree + Function: Gets the next tree in the stream + Returns : Bio::Tree::TreeI + Args : none + + +=cut + +sub next_tree{ + my ($self) = @_; + local $/ = ";\n"; + return unless $_ = $self->_readline; + s/\s+//g; + $self->debug("entry is $_\n"); + my $chars = ''; + $self->_eventHandler->start_document; + my ($prev_event,$lastevent) = ('',''); + my @ch = split(//, $_); + foreach my $ch (@ch) { + if( $ch eq ';' ) { + $self->_eventHandler->in_element('node') && + $self->_eventHandler->end_element( {'Name' => 'node'}); + return $self->_eventHandler->end_document; + } elsif ($ch eq '[') { + if ( length $chars ) { + if ( $lastevent eq ':' ) { + $self->_eventHandler->start_element( { Name => 'branch_length' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { Name => 'branch_length' }); + $lastevent = $prev_event; + } else { + $self->debug("id with no branchlength is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + } + } else { + $self->_eventHandler->start_element( { Name => 'node' } ); + } + my $leafstatus = ( $lastevent ne ')' ) ? 1 : 0; + $self->_eventHandler->start_element({'Name' => 'leaf'}); + $self->_eventHandler->characters($leafstatus); + $self->_eventHandler->end_element({'Name' => 'leaf'}); + $chars = ''; + + $self->_eventHandler->start_element( { Name => 'nhx_tag' }); + } elsif( $ch eq '(' ) { + $chars = ''; + $self->_eventHandler->start_element( {'Name' => 'tree'} ); + } elsif($ch eq ')' ) { + if( length $chars ) { + if( $lastevent eq ':') { + unless ($self->_eventHandler->within_element('nhx_tag')) { + $self->_eventHandler->start_element( { 'Name' => 'branch_length'}); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( {'Name' => 'branch_length'}); + } else { + $self->throw("malformed input; end of node ) before ] found"); + } + } else { + $self->debug("id with no branchlength is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + } + + } elsif ( $lastevent ne ']' ) { + $self->_eventHandler->start_element( {'Name' => 'node'} ); + } + my $leafstatus = ( $lastevent ne ')' ) ? 1 : 0; + $self->_eventHandler->start_element({'Name' => 'leaf'}); + $self->_eventHandler->characters($leafstatus); + $self->_eventHandler->end_element({'Name' => 'leaf'}); + + $self->_eventHandler->end_element( {'Name' => 'node'} ); + $self->_eventHandler->end_element( {'Name' => 'tree'} ); + $chars = ''; + } elsif ( $ch eq ',' ) { + if( length $chars ) { + if( $lastevent eq ':' ) { + $self->_eventHandler->start_element( { 'Name' => 'branch_length'}); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( {'Name' => 'branch_length'}); + $lastevent = $prev_event; + } else { + $self->debug("id with no branchlength is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + } + } elsif ( $lastevent ne ']' ) { + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + } + $self->_eventHandler->end_element( {'Name' => 'node'} ); + $chars = ''; + } elsif( $ch eq ':' ) { + if ($self->_eventHandler->within_element('nhx_tag')) { + if ($lastevent eq '=') { + $self->_eventHandler->start_element( { Name => 'tag_value' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { Name => 'tag_value' } ); + $chars = ''; + } else { + if ($chars eq '&&NHX') { + $chars = ''; # get rid of &&NHX: + } else { + $self->throw("Unrecognized, non \&\&NHX string: >>$chars<<"); + } + } + } elsif ($lastevent ne ']') { + $self->debug("id with a branchlength coming is $chars\n"); + $self->_eventHandler->start_element( { 'Name' => 'node' } ); + $self->_eventHandler->start_element( { 'Name' => 'id' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { 'Name' => 'id' } ); + $chars = ''; + } + } elsif ( $ch eq '=' ) { + if ($self->_eventHandler->within_element('nhx_tag')) { + $self->_eventHandler->start_element( { Name => 'tag_name' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { Name => 'tag_name' } ); + $chars = ''; + } else { + $chars .= $ch; + } + } elsif ( $ch eq ']' ) { + if ($self->_eventHandler->within_element('nhx_tag') && $lastevent eq '=') { + $self->_eventHandler->start_element( { Name => 'tag_value' } ); + $self->_eventHandler->characters($chars); + $self->_eventHandler->end_element( { Name => 'tag_value' } ); + $chars = ''; + $self->_eventHandler->end_element( { Name => 'nhx_tag' } ); + } else { + $chars .= $ch; + next; + } + } else { + $chars .= $ch; + next; + } + $prev_event = $lastevent; + $lastevent = $ch; + } + return undef; +} + +=head2 write_tree + + Title : write_tree + Usage : $treeio->write_tree($tree); + Function: Write a tree out to data stream in nhx format + Returns : none + Args : Bio::Tree::TreeI object + +=cut + +sub write_tree{ + my ($self,@trees) = @_; + foreach my $tree ( @trees ) { + my @data = _write_tree_Helper($tree->get_root_node); + if($data[-1] !~ /\)$/ ) { + $data[0] = "(".$data[0]; + $data[-1] .= ")"; + } + $self->_print(join(',', @data), ";\n"); + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return; +} + +sub _write_tree_Helper { + my ($node) = @_; + return () unless defined $node; + + my @data; + + foreach my $n ( $node->each_Descendent() ) { + push @data, _write_tree_Helper($n); + } + + if( @data > 1 ) { + $data[0] = "(" . $data[0]; + $data[-1] .= ")"; + $data[-1] .= ":". $node->branch_length if $node->branch_length; + # this is to not print out an empty NHX for the root node which is + # a convience for how we get a handle to the whole tree + if( $node->ancestor || $node->id || defined $node->branch_length ) { + $data[-1] .= '[' . + join(":", "&&NHX", + map { "$_=" .join(',',$node->get_tag_values($_)) } + $node->get_all_tags() ) . ']'; + } + } else { + push @data, $node->to_string; # a leaf + } + return @data; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/TreeIO/tabtree.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/TreeIO/tabtree.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,153 @@ +# $Id: tabtree.pm,v 1.6 2002/10/22 07:45:25 lapp Exp $ +# +# BioPerl module for Bio::TreeIO::tabtree +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::TreeIO::tabtree - A simple output format which displays a tree as an ASCII drawing + +=head1 SYNOPSIS + + use Bio::TreeIO; + my $in = new Bio::TreeIO(-file => 'input', -format => 'newick'); + my $out = new Bio::TreeIO(-file => '>output', -format => 'tabtree'); + + while( my $tree = $in->next_tree ) { + $out->write_tree($tree); + } + +=head1 DESCRIPTION + +This is a made up format just for outputting trees as an ASCII drawing. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::TreeIO::tabtree; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::TreeIO; + + +@ISA = qw(Bio::TreeIO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::TreeIO::tabtree(); + Function: Builds a new Bio::TreeIO::tabtree object + Returns : Bio::TreeIO::tabtree + Args : + + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + +} + +=head2 write_tree + + Title : write_tree + Usage : $treeio->write_tree($tree); + Function: Write a tree out to data stream in newick/phylip format + Returns : none + Args : Bio::Tree::TreeI object + +=cut + +sub write_tree{ + my ($self,$tree) = @_; + my @data = _write_tree_Helper($tree->get_root_node,0); + $self->_print(join("\n", @data), "\n"); + $self->flush if $self->_flush_on_write && defined $self->_fh; + return; +} + +sub _write_tree_Helper { + my ($node,$depth) = @_; + return () if (!defined $node); + + my @data; + my @d = $node->each_Descendent(); + + push @data,sprintf("%s%s","\t"x$depth, + $node->to_string); + if( @d ) { + my $c = 0; + foreach my $n ( @d ) { + push @data, _write_tree_Helper($n,$depth+1); + } + } + + return @data; +} + +=head2 next_tree + + Title : next_tree + Usage : + Function: Sorry not possible with this format + Returns : none + Args : none + + +=cut + +sub next_tree{ + $_[0]->throw("Sorry the format 'tabtree' can only be used as an output format at this time"); +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/UpdateableSeqI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/UpdateableSeqI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,107 @@ +# $Id: UpdateableSeqI.pm,v 1.4 2002/10/22 07:45:09 lapp Exp $ +# +# BioPerl module for Bio::UpdateableSeqI +# +# Cared for by David Block +# +# Copyright David Block +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::UpdateableSeqI - Descendant of Bio::SeqI that allows updates + +=head1 SYNOPSIS + +See Bio::SeqI for most of the documentation. +See the documentation of the methods for further details. + +=head1 DESCRIPTION + +Bio::UpdateableSeqI is an interface for Sequence objects which are +expected to allow users to perform basic editing functions (update/delete) +on their component SeqFeatures. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - David Block + +Email dblock@gene.pbi.nrc.ca + +=head1 CONTRIBUTORS + +Ewan Birney forced me to this... + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::UpdateableSeqI; +use vars qw(@ISA); +use strict; +use Carp; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::SeqI; + + +@ISA = qw(Bio::SeqI); + + +=head2 delete_feature + + Title : delete_feature + Usage : my $orphanlist=$self->delete_feature($feature,$transcript,$gene); + Function: deletes the specified $feature from the given transcript, if $transcript is sent and exists and $feature is a feature of $transcript, + or from $gene if the $feature is a feature of $gene, or from $self if $transcript and $gene are not sent. Keeps track of the features + of the $gene object that may be left as orphans and returns them as a listref. + Example : I want to delete transcript 'abc' of gene 'def', with three exons, leaving only transcript 'ghi' with two exons. + This will leave exons 1 and 3 part of 'ghi', but exon 2 will become an orphan. + my $orphanlist=$seq->delete_feature($transcript{'abc'},undef,$gene{'def'}); + $orphanlist is a reference to a list containing $exon{'2'}; + Returns : a listref of orphaned features after the deletion of $feature (optional) + Args : $feature - the feature to be deleted + $transcript - the transcript containing the $feature, so that a $feature can be removed from only one transcript when there are multiple + transcripts in a gene. + $gene - the gene containing the $transcript and/or the $feature + + +=cut + +sub delete_feature{ + my ($self,$feature,$transcript,$gene) = @_; + + $self->throw_not_implemented(); +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/AAChange.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/AAChange.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,477 @@ +# $Id: AAChange.pm,v 1.13 2002/10/22 07:38:49 lapp Exp $ +# +# BioPerl module for Bio::Variation::AAChange +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::AAChange - Sequence change class for polypeptides + +=head1 SYNOPSIS + + $aamut = Bio::Variation::AAChange->new + ('-start' => $start, + '-end' => $end, + '-length' => $len, + '-proof' => $proof, + '-isMutation' => 1, + '-mut_number' => $mut_number + ); + + my $a1 = Bio::Variation::Allele->new; + $a1->seq($ori) if $ori; + $aamut->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq($mut) if $mut; + $aachange->add_Allele($a2); + $aachange->allele_mut($a2); + + print "\n"; + + # add it to a SeqDiff container object + $seqdiff->add_Variant($rnachange); + + # and create links to and from RNA level variant objects + $aamut->RNAChange($rnachange); + $rnachange->AAChange($rnachange); + +=head1 DESCRIPTION + +The instantiable class Bio::Variation::RNAChange describes basic +sequence changes at polypeptide level. It uses methods defined in +superclass Bio::Variation::VariantI, see L +for details. + +If the variation described by a AAChange object has a known +Bio::Variation::RNAAChange object, create the link with method +AAChange(). See L for more information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Variation::AAChange; +my $VERSION=1.0; +use vars qw(@ISA $MATRIX); +use strict; + +# Object preamble - inheritance +use Bio::Variation::VariantI; + +@ISA = qw( Bio::Variation::VariantI ); + +BEGIN { + +my $matrix = << "__MATRIX__"; +# Matrix made by matblas from blosum62.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 62 +# Entropy = 0.6979, Expected = -0.5209 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 +B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 +Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 +__MATRIX__ + + my %blosum = (); + $matrix =~ /^ +(.+)$/m; + my @aas = split / +/, $1; + foreach my $aa (@aas) { + my $tmp = $aa; + $tmp = "\\$aa" if $aa eq '*'; + $matrix =~ /^($tmp) +([-+]?\d.*)$/m; + my @scores = split / +/, $2 if defined $2; + my $count = 0; + foreach my $ak (@aas) { + $blosum{$aa}->{$aas[$count]} = $scores[$count]; + $count++; + } + } + sub _matrix; + $MATRIX = \%blosum; +} + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($start, $end, $length, $strand, $primary, $source, + $frame, $score, $gff_string, + $allele_ori, $allele_mut, $upstreamseq, $dnstreamseq, + $label, $status, $proof, $re_changes, $region, $region_value, + $region_dist, + $numbering, $mut_number, $ismutation) = + $self->_rearrange([qw(START + END + LENGTH + STRAND + PRIMARY + SOURCE + FRAME + SCORE + GFF_STRING + ALLELE_ORI + ALLELE_MUT + UPSTREAMSEQ + DNSTREAMSEQ + LABEL + STATUS + PROOF + RE_CHANGES + REGION + REGION_VALUE + REGION_DIST + NUMBERING + MUT_NUMBER + ISMUTATION + )],@args); + + $self->primary_tag("Variation"); + + $self->{ 'alleles' } = []; + + $start && $self->start($start); + $end && $self->end($end); + $length && $self->length($length); + $strand && $self->strand($strand); + $primary && $self->primary_tag($primary); + $source && $self->source_tag($source); + $frame && $self->frame($frame); + $score && $self->score($score); + $gff_string && $self->_from_gff_string($gff_string); + + $allele_ori && $self->allele_ori($allele_ori); + $allele_mut && $self->allele_mut($allele_mut); + $upstreamseq && $self->upstreamseq($upstreamseq); + $dnstreamseq && $self->dnstreamseq($dnstreamseq); + + $label && $self->label($label); + $status && $self->status($status); + $proof && $self->proof($proof); + $region && $self->region($region); + $region_value && $self->region_value($region_value); + $region_dist && $self->region_dist($region_dist); + $numbering && $self->numbering($numbering); + $mut_number && $self->mut_number($mut_number); + $ismutation && $self->isMutation($ismutation); + + return $self; # success - we hope! +} + +=head2 RNAChange + + Title : RNAChange + Usage : $mutobj = $self->RNAChange; + : $mutobj = $self->RNAChange($objref); + Function: Returns or sets the link-reference to a mutation/change object. + If there is no link, it will return undef + Returns : an obj_ref or undef + +=cut + +sub RNAChange { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::RNAChange') ) { + $self->throw("Is not a Bio::Variation::RNAChange object but a [$self]"); + return (undef); + } + else { + $self->{'RNAChange'} = $value; + } + } + unless (exists $self->{'RNAChange'}) { + return (undef); + } else { + return $self->{'RNAChange'}; + } +} + + + +=head2 label + + Title : label + Usage : $obj->label(); + Function: + + Sets and returns mutation event label(s). If value is not + set, or no argument is given returns false. Each + instantiable subclass of L needs + to implement this method. Valid values are listed in + 'Mutation event controlled vocabulary' in + http://www.ebi.ac.uk/mutations/recommendations/mutevent.html. + + Example : + Returns : string + Args : string + +=cut + + +sub label { + my ($self) = @_; + my ($o, $m, $type); + $o = $self->allele_ori->seq if $self->allele_ori and $self->allele_ori->seq; + $m = $self->allele_mut->seq if $self->allele_mut and $self->allele_mut->seq; + + if ($self->start == 1 ) { + if ($o and substr($o, 0, 1) ne substr($m, 0, 1)) { + $type = 'no translation'; + } + elsif ($o and $m and $o eq $m ) { + $type = 'silent'; + } + # more ... + } + elsif ($o and substr($o, 0, 1) eq '*' ) { + if ($m and substr($o, 0, 1) ne substr($m, 0, 1)) { + $type = 'post-elongation'; + } + elsif ($m and $o eq $m ) { + $type = 'silent, conservative'; + } + } + elsif ($o and $m and $o eq $m) { + $type = 'silent, conservative'; + } + elsif ($m and $m eq '*') { + $type = 'truncation'; + } + elsif ($o and $m and $o eq $m) { + $type = 'silent, conservative'; + } + elsif (not $m or + ($o and $m and length($o) > length($m) and + substr($m, -1, 1) ne '*')) { + $type = 'deletion'; + if ($o and $m and $o !~ $m and $o !~ $m) { + $type .= ', complex'; + } + } + elsif (not $o or + ($o and $m and length($o) < length($m) and + substr($m, -1, 1) ne '*' ) ) { + $type = 'insertion'; + if ($o and $m and $o !~ $m and $o !~ $m) { + $type .= ', complex'; + } + } + elsif ($o and $m and $o ne $m and + length $o == 1 and length $m == 1 ) { + $type = 'substitution'; + my $value = $self->similarity_score; + if (defined $value) { + my $cons = ($value < 0) ? 'nonconservative' : 'conservative'; + $type .= ", ". $cons; + } + } else { + $type = 'out-of-frame translation, truncation'; + } + $self->{'label'} = $type; + return $self->{'label'}; +} + + +=head2 similarity_score + + Title : similarity_score + Usage : $self->similarity_score + Function: Measure for evolutionary conservativeness + of single amino substitutions. Uses BLOSUM62. + Negative numbers are noncoservative changes. + Returns : integer, undef if not single amino acid change + +=cut + +sub similarity_score { + my ($self) = @_; + my ($o, $m, $type); + $o = $self->allele_ori->seq if $self->allele_ori and $self->allele_ori->seq; + $m = $self->allele_mut->seq if $self->allele_mut and $self->allele_mut->seq; + return undef unless $o and $m and length $o == 1 and length $m == 1; + return undef unless $o =~ /[ARNDCQEGHILKMFPSTWYVBZX*]/i and + $m =~ /[ARNDCQEGHILKMFPSTWYVBZX*]/i; + return $MATRIX->{"\U$o"}->{"\U$m"}; +} + +=head2 trivname + + Title : trivname + Usage : $self->trivname + Function: + + Given a Bio::Variation::AAChange object with linked + Bio::Variation::RNAChange and Bio::Variation::DNAMutation + objects, this subroutine creates a string corresponding to + the 'trivial name' of the mutation. Trivial name is + specified in Antonorakis & MDI Nomenclature Working Group: + Human Mutation 11:1-3, 1998. + http://www.interscience.wiley.com/jpages/1059-7794/nomenclature.html + + Returns : string + +=cut + + +sub trivname { + my ($self,$value) = @_; + if( defined $value) { + $self->{'trivname'} = $value; + } else { + my ( $aaori, $aamut,$aamutsymbol, $aatermnumber, $aamutterm) = + ('', '', '', '', ''); + my $o = $self->allele_ori->seq if $self->allele_ori and $self->allele_ori->seq; + #my $m = $self->allele_mut->seq if $self->allele_mut and $self->allele_mut->seq; + + $aaori = substr ($o, 0, 1) if $o; + $aaori =~ tr/\*/X/; + + my $sep; + if ($self->isMutation) { + $sep = '>'; + } else { + $sep = '|'; + } + my $trivname = $aaori. $self->start; + $trivname .= $sep if $sep eq '|'; + + my @alleles = $self->each_Allele; + foreach my $allele (@alleles) { + my $m = $allele->seq if $allele->seq; + + $self->allele_mut($allele); + #$trivname .= $sep. uc $m if $m; + + $aamutterm = substr ($m, -1, 1) if $m; + if ($self->RNAChange->label =~ /initiation codon/ and + ( $o and $m and $o ne $m)) { + $aamut = 'X'; + } + elsif (CORE::length($o) == 1 and CORE::length($m) == 1 ) { + $aamutsymbol = ''; + $aamut = $aamutterm; + } + elsif ($self->RNAChange->label =~ /deletion/) { + $aamutsymbol = 'del'; + if ($aamutterm eq '*') { + $aatermnumber = $self->start + length($m) -1; + $aamut = 'X'. $aatermnumber; + } + if ($self->RNAChange && $self->RNAChange->label =~ /inframe/){ + $aamut = '-'. length($self->RNAChange->allele_ori->seq)/3 ; + } + } + elsif ($self->RNAChange->label =~ /insertion/) { + $aamutsymbol = 'ins'; + if (($aamutterm eq '*') && (length($m)-1 != 0)) { + $aatermnumber = $self->start + length($m)-1; + $aamut = $aatermnumber. 'X'; + } + if ($self->RNAChange->label =~ /inframe/){ + $aamut = '+'. int length($self->RNAChange->allele_mut->seq)/3 ; + } + } + elsif ($self->RNAChange->label =~ /complex/ ) { + my $diff = length($m) - length($o); + if ($diff >= 0 ) { + $aamutsymbol = 'ins'; + } else { + $aamutsymbol = 'del' ; + } + if (($aamutterm eq '*') && (length($m)-1 != 0)) { + $aatermnumber = $self->start + length($m)-1; + $aamut = $aatermnumber. 'X'; + } + if ($self->RNAChange->label =~ /inframe/){ + + if ($diff >= 0 ) { + $aamut = '+'. $diff ; + } else { + $aamut = $diff ; + } + } + } + elsif ($self->label =~ /truncation/) { + $aamut = $m; + } else { + $aamutsymbol = ''; + $aamut = $aamutterm; + } + $aamut =~ tr/\*/X/; + $trivname .= $aamutsymbol. $aamut. $sep; + } + chop $trivname; + $self->{'trivname'} = $trivname; + } + return $self->{'trivname'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/AAReverseMutate.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/AAReverseMutate.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,302 @@ +# $Id: AAReverseMutate.pm,v 1.6 2002/10/22 07:38:49 lapp Exp $ +# +# BioPerl module for Bio::Variation::AAReverseMutate +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::AAReverseMutate - point mutation and codon + information from single amino acid changes + +=head1 SYNOPSIS + + $aamut = new Bio::Variation::AAReverseMutate + (-aa_ori => 'F', + -aa_mut => 'S', + -codon_ori => 'ttc', # optional + -codon_table => '3' # defaults to 1 + ); + + @points = $aamut->each_Variant; + + if (scalar @points > 0 ) { + foreach $rnachange ( @points ) { + # $rnachange is a Bio::Variation::RNAChange object + print " ", $rnachange->allele_ori->seq, ">", + $rnachange->allele_mut->seq, " in ", + $rnachange->codon_ori, ">", $rnachange->codon_mut, + " at position ", $rnachange->codon_pos, "\n"; + } + } else { + print "No point mutations possible\n", + } + +=head1 DESCRIPTION + +Bio::Variation::AAReverseMutate objects take in reference and mutated +amino acid information and deduces potential point mutations at RNA +level leading to this change. The choice can be further limited by +letting the object know what is the the codon in the reference +sequence. The results are returned as L +objects. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Variation::AAReverseMutate; +my $VERSION=1.0; +use vars qw(@ISA); +use strict; + +# Object preamble - inheritance +use Bio::Tools::CodonTable; +use Bio::Variation::RNAChange; +use Bio::Variation::Allele; + +@ISA = qw( Bio::Root::Root); + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($aa_ori, $aa_mut, $codon_ori, $codon_table) = + $self->_rearrange([qw(AA_ORI + AA_MUT + CODON + CODON_TABLE + )],@args); + + $aa_ori && $self->aa_ori($aa_ori); + $aa_mut && $self->aa_mut($aa_mut); + $codon_ori && $self->codon_ori($codon_ori); + $codon_table && $self->codon_table($codon_table); + + return $self; # success - we hope! + +} + + +=head2 aa_ori + + Title : aa_ori + Usage : $obj->aa_ori(); + Function: + + Sets and returns original aa sequence. If value is not + set, returns false. + + Amino acid sequences are stored in upper case characters, + others in lower case. + + Example : + Returns : string + Args : single character amino acid code + +=cut + +sub aa_ori { + my ($self,$value) = @_; + if( defined $value) { + if ( uc($value) !~ /^[ARNDCQEGHILKMFPSTWYVBZX*]$/ ) { + $self->throw("'$value' is not a valid one letter amino acid symbol\n"); + } else { + $self->{'aa_ori'} = uc $value; + } + } + return $self->{'aa_ori'}; +} + + +=head2 aa_mut + + Title : aa_mut + Usage : $obj->aa_mut(); + Function: + + Sets and returns the mutated allele sequence. If value is not + set, returns false. + + Example : + Returns : string + Args : single character amino acid code + +=cut + + +sub aa_mut { + my ($self,$value) = @_; + if( defined $value) { + if ( uc($value) !~ /^[ARNDCQEGHILKMFPSTWYVBZX*]$/ ) { + $self->throw("'$value' is not a valid one letter amino acid symbol\n"); + } else { + $self->{'aa_mut'} = uc $value; + } + } + return $self->{'aa_mut'}; +} + + +=head2 codon_ori + + Title : codon_ori + Usage : $obj->codon_ori(); + Function: + + Sets and returns codon_ori triplet. If value is not set, + returns false. The string has to be three characters + long. The chracter content is not checked. + + Example : + Returns : string + Args : string + +=cut + +sub codon_ori { + my ($self,$value) = @_; + if( defined $value) { + if (length $value != 3 or lc $value =~ /[^atgc]/) { + $self->warn("Codon string \"$value\" is not valid unique codon"); + } + $self->{'codon_ori'} = lc $value; + } + return $self->{'codon_ori'}; +} + +=head2 codon_table + + Title : codon_table + Usage : $obj->codon_table(); + Function: + + Sets and returns the codon table id of the RNA + If value is not set, returns 1, 'universal' code, as the default. + + Example : + Returns : integer + Args : none if get, the new value if set + +=cut + + +sub codon_table { + my ($self,$value) = @_; + if( defined $value) { + if ( not $value =~ /^\d+$/ ) { + $self->throw("'$value' is not a valid codon table ID\n". + "Has to be a positive integer. Defaulting to 1\n"); + } else { + $self->{'codon_table'} = $value; + } + } + if( ! exists $self->{'codon_table'} ) { + return 1; + } else { + return $self->{'codon_table'}; + } +} + + +=head2 each_Variant + + Title : each_Variant + Usage : $obj->each_Variant(); + Function: + + Returns a list of Variants. + + Example : + Returns : list of Variants + Args : none + +=cut + +sub each_Variant{ + my ($self,@args) = @_; + + $self->throw("aa_ori is not defined\n") if not defined $self->aa_ori; + $self->throw("aa_mut is not defined\n") if not defined $self->aa_mut; + + my (@points, $codon_pos, $allele_ori, $allele_mut); + my $ct = Bio::Tools::CodonTable->new( '-id' => $self->codon_table ); + foreach my $codon_ori ($ct->revtranslate($self->aa_ori)) { + next if $self->codon_ori and $self->codon_ori ne $codon_ori; + foreach my $codon_mut ($ct->revtranslate($self->aa_mut)) { + my $k = 0; + my $length = 0; + $codon_pos = $allele_ori = $allele_mut = undef; + while ($k<3) { + my $nt_ori = substr ($codon_ori, $k, 1); + my $nt_mut = substr ($codon_mut, $k, 1); + if ($nt_ori ne $nt_mut) { + $length++; + $codon_pos = $k+1; + $allele_ori = $nt_ori; + $allele_mut = $nt_mut; + } + $k++; + } + if ($length == 1) { + my $rna = Bio::Variation::RNAChange->new + ('-length' => '1', + '-codon_ori' => $codon_ori, + '-codon_mut' => $codon_mut, + '-codon_pos' => $codon_pos, + '-isMutation' => 1 + ); + my $all_ori = Bio::Variation::Allele->new('-seq'=>$allele_ori); + $rna->allele_ori($all_ori); + my $all_mut = Bio::Variation::Allele->new('-seq'=>$allele_mut); + $rna->allele_mut($all_mut); + push @points, $rna; + } + } + } + return @points; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/Allele.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/Allele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,293 @@ +# $Id: Allele.pm,v 1.9 2002/10/22 07:38:49 lapp Exp $ +# +# BioPerl module for Bio::Variation::Allele +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::Allele - Sequence object with allele-specific attributes + +=head1 SYNOPSIS + + $allele1 = Bio::Variation::Allele->new ( -seq => 'A', + -id => 'AC00001.1', + -alphabet => 'dna', + -is_reference => 1 + ); + +=head1 DESCRIPTION + +List of alleles describe known sequence alternatives in a variable region. +Alleles are contained in Bio::Variation::VariantI complying objects. +See L for details. + +Bio::Varation::Alleles are PrimarySeqI complying objects which can +contain database cross references as specified in +Bio::DBLinkContainerI interface, too. + +A lot of the complexity with dealing with Allele objects are caused by +null alleles; Allele objects that have zero length sequence string. + +In addition describing the allele by its sequence , it possible to +give describe repeat structure within the sequence. This done using +methods repeat_unit (e.g. 'ca') and repeat_count (e.g. 7). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Variation::Allele; +my $VERSION=1.0; +use vars qw(@ISA); +use strict; + +# Object preamble - inheritance + +use Bio::PrimarySeq; +use Bio::DBLinkContainerI; + +@ISA = qw( Bio::PrimarySeq Bio::DBLinkContainerI ); + +sub new { + my($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($is_reference, $repeat_unit, $repeat_count) = + $self->_rearrange([qw(IS_REFERENCE + REPEAT_UNIT + REPEAT_COUNT + )], + @args); + + $is_reference && $self->is_reference($is_reference); + $repeat_unit && $self->repeat_unit($repeat_unit); + $repeat_count && $self->repeat_count($repeat_count); + + return $self; # success - we hope! +} + + +=head2 is_reference + + Title : is_reference + Usage : $obj->is_reference() + Function: sets and returns boolean values. + Unset values return false. + Example : $obj->is_reference() + Returns : boolean + Args : optional true of false value + + +=cut + + +sub is_reference { + my ($self,$value) = @_; + if( defined $value) { + $value ? ($value = 1) : ($value = 0); + $self->{'is_reference'} = $value; + } + if( ! exists $self->{'is_reference'} ) { + return 0; + } + else { + return $self->{'is_reference'}; + } +} + + +=head2 add_DBLink + + Title : add_DBLink + Usage : $self->add_DBLink($ref) + Function: adds a link object + Example : + Returns : + Args : + + +=cut + + +sub add_DBLink{ + my ($self,$com) = @_; + if( ! $com->isa('Bio::Annotation::DBLink') ) { + $self->throw("Is not a link object but a [$com]"); + } + push(@{$self->{'link'}},$com); +} + +=head2 each_DBLink + + Title : each_DBLink + Usage : foreach $ref ( $self->each_DBlink() ) + Function: gets an array of DBlink of objects + Example : + Returns : + Args : + + +=cut + +sub each_DBLink{ + my ($self) = @_; + return @{$self->{'link'}}; +} + +=head2 repeat_unit + + Title : repeat_unit + Usage : $obj->repeat_unit('ca'); + Function: + + Sets and returns the sequence of the repeat_unit the + allele is composed of. + + Example : + Returns : string + Args : string + +=cut + +sub repeat_unit { + my ($self,$value) = @_; + if( defined $value) { + $self->{'repeat_unit'} = $value; + } + if ($self->{'seq'} && $self->{'repeat_unit'} && $self->{'repeat_count'} ) { + $self->warn("Repeats do not add up!") + if ( $self->{'repeat_unit'} x $self->{'repeat_count'}) ne $self->{'seq'}; + } + return $self->{'repeat_unit'}; +} + +=head2 repeat_count + + Title : repeat_count + Usage : $obj->repeat_count(); + Function: + + Sets and returns the number of repeat units in the allele. + + Example : + Returns : string + Args : string + +=cut + + +sub repeat_count { + my ($self,$value) = @_; + if( defined $value) { + if ( not $value =~ /^\d+$/ ) { + $self->throw("[$value] for repeat_count has to be a positive integer\n"); + } else { + $self->{'repeat_count'} = $value; + } + } + if ($self->{'seq'} && $self->{'repeat_unit'} && $self->{'repeat_count'} ) { + $self->warn("Repeats do not add up!") + if ( $self->{'repeat_unit'} x $self->{'repeat_count'}) ne $self->{'seq'}; + } + return $self->{'repeat_count'}; +} + +=head2 count + + Title : count + Usage : $obj->count(); + Function: + + Sets and returns the number of times this allele was observed. + + Example : + Returns : string + Args : string + +=cut + +sub count { + my ($self,$value) = @_; + if( defined $value) { + if ( not $value =~ /^\d+$/ ) { + $self->throw("[$value] for count has to be a positive integer\n"); + } else { + $self->{'count'} = $value; + } + } + return $self->{'count'}; +} + + +=head2 frequency + + Title : frequency + Usage : $obj->frequency(); + Function: + + Sets and returns the frequency of the allele in the observed + population. + + Example : + Returns : string + Args : string + +=cut + +sub frequency { + my ($self,$value) = @_; + if( defined $value) { + if ( not $value =~ /^\d+$/ ) { + $self->throw("[$value] for frequency has to be a positive integer\n"); + } else { + $self->{'frequency'} = $value; + } + } + return $self->{'frequency'}; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/DNAMutation.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/DNAMutation.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,391 @@ +# $Id: DNAMutation.pm,v 1.11 2002/10/22 07:38:49 lapp Exp $ +# +# BioPerl module for Bio::Variation::DNAMutation +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::DNAMutation - DNA level mutation class + +=head1 SYNOPSIS + + $dnamut = Bio::Variation::DNAMutation->new + ('-start' => $start, + '-end' => $end, + '-length' => $len, + '-upStreamSeq' => $upflank, + '-dnStreamSeq' => $dnflank, + '-proof' => $proof, + '-isMutation' => 1, + '-mut_number' => $mut_number + ); + $a1 = Bio::Variation::Allele->new; + $a1->seq('a'); + $dnamut->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq('t'); + $dnamut->add_Allele($a2); + + print "Restriction changes are ", $dnamut->restriction_changes, "\n"; + + # add it to a SeqDiff container object + $seqdiff->add_Variant($dnamut); + + +=head1 DESCRIPTION + +The instantiable class Bio::Variation::DNAMutation describes basic +sequence changes in genomic DNA level. It uses methods defined in +superclass Bio::Variation::VariantI. See L +for details. + +If the variation described by a DNAMutation object is transcibed, link +the corresponding Bio::Variation::RNAChange object to it using +method RNAChange(). See L for more information. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Variation::DNAMutation; +my $VERSION=1.0; +use vars qw(@ISA); +use strict; + +# Object preamble - inheritance +use Bio::Variation::VariantI; + +@ISA = qw( Bio::Variation::VariantI ); + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($start, $end, $length, $strand, $primary, $source, + $frame, $score, $gff_string, + $allele_ori, $allele_mut, $upstreamseq, $dnstreamseq, + $label, $status, $proof, $region, $region_value, $region_dist, $numbering, + $cpg, $mut_number, $ismutation) = + $self->_rearrange([qw(START + END + LENGTH + STRAND + PRIMARY + SOURCE + FRAME + SCORE + GFF_STRING + ALLELE_ORI + ALLELE_MUT + UPSTREAMSEQ + DNSTREAMSEQ + LABEL + STATUS + PROOF + REGION + REGION_VALUE + REGION_DIST + NUMBERING + CPG + MUT_NUMBER + ISMUTATION + )], + @args); + + $self->primary_tag("Variation"); + + $self->{ 'alleles' } = []; + + $start && $self->start($start); + $end && $self->end($end); + $length && $self->length($length); + $strand && $self->strand($strand); + $primary && $self->primary_tag($primary); + $source && $self->source_tag($source); + $frame && $self->frame($frame); + $score && $self->score($score); + $gff_string && $self->_from_gff_string($gff_string); + + $allele_ori && $self->allele_ori($allele_ori); + $allele_mut && $self->allele_mut($allele_mut); + $upstreamseq && $self->upStreamSeq($upstreamseq); + $dnstreamseq && $self->dnStreamSeq($dnstreamseq); + + $label && $self->label($label); + $status && $self->status($status); + $proof && $self->proof($proof); + $region && $self->region($region); + $region_value && $self->region_value($region_value); + $region_dist && $self->region_dist($region_dist); + $numbering && $self->numbering($numbering); + $mut_number && $self->mut_number($mut_number); + $ismutation && $self->isMutation($ismutation); + + $cpg && $self->CpG($cpg); + + return $self; # success - we hope! +} + + +=head2 CpG + + Title : CpG + Usage : $obj->CpG() + Function: sets and returns boolean values for variation + hitting a CpG site. Unset value return -1. + Example : $obj->CpG() + Returns : boolean + Args : optional true of false value + + +=cut + + +sub CpG { + my ($obj,$value) = @_; + if( defined $value) { + $value ? ($value = 1) : ($value = 0); + $obj->{'cpg'} = $value; + } + elsif (not defined $obj->{'label'}) { + $obj->{'cpg'} = $obj->_CpG_value; + } + else { + return $obj->{'cpg'}; + } +} + + + +sub _CpG_value { + my ($self) = @_; + if ($self->allele_ori eq $self->allele_mut and length ($self->allele_ori) == 1 ) { + + # valid only for point mutations + # CpG methylation-mediated deamination: + # CG -> TG | CG -> CA substitutions + # implementation here is less strict: if CpG dinucleotide was hit + + if ( ( ($self->allele_ori eq 'c') && (substr($self->upStreamSeq, 0, 1) eq 'g') ) || + ( ($self->allele_ori eq 'g') && (substr($self->dnStreamSeq, -1, 1) eq 'c') ) ) { + return 1; + } + else { + return 0; + } + } else { + $self->warn('CpG makes sense only in the context of point mutation'); + return undef; + } +} + + +=head2 RNAChange + + Title : RNAChange + Usage : $mutobj = $obj->RNAChange; + : $mutobj = $obj->RNAChange($objref); + Function: Returns or sets the link-reference to a mutation/change object. + If there is no link, it will return undef + Returns : an obj_ref or undef + +=cut + + +sub RNAChange { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::RNAChange') ) { + $self->throw("Is not a Bio::Variation::RNAChange object but a [$self]"); + return (undef); + } + else { + $self->{'RNAChange'} = $value; + } + } + unless (exists $self->{'RNAChange'}) { + return (undef); + } else { + return $self->{'RNAChange'}; + } +} + + +=head2 label + + Title : label + Usage : $obj->label(); + Function: + + Sets and returns mutation event label(s). If value is not + set, or no argument is given returns false. Each + instantiable subclass of L needs + to implement this method. Valid values are listed in + 'Mutation event controlled vocabulary' in + http://www.ebi.ac.uk/mutations/recommendations/mutevent.html. + + Example : + Returns : string + Args : string + +=cut + + +sub label { + my ($self, $value) = @_; + my ($o, $m, $type); + $o = $self->allele_ori->seq if $self->allele_ori and $self->allele_ori->seq; + $m = $self->allele_mut->seq if $self->allele_mut and $self->allele_mut->seq; + + if (not $o and not $m ) { + $self->warn("[DNAMutation, label] Both alleles should not be empty!\n"); + $type = 'no change'; # is this enough? + } + elsif ($o && $m && length($o) == length($m) && length($o) == 1) { + $type = 'point'; + $type .= ", ". _point_type_label($o, $m); + } + elsif (not $o ) { + $type = 'insertion'; + } + elsif (not $m ) { + $type = 'deletion'; + } + else { + $type = 'complex'; + } + $self->{'label'} = $type; + return $self->{'label'}; +} + + +sub _point_type_label { + my ($o, $m) = @_; + my ($type); + my %transition = ('a' => 'g', + 'g' => 'a', + 'c' => 't', + 't' => 'c'); + $o = lc $o; + $m = lc $m; + if ($o eq $m) { + $type = 'no change'; + } + elsif ($transition{$o} eq $m ) { + $type = 'transition'; + } + else { + $type = 'transversion'; + } +} + + +=head2 sysname + + Title : sysname + Usage : $self->sysname + Function: + + This subroutine creates a string corresponding to the + 'systematic name' of the mutation. Systematic name is + specified in Antonorakis & MDI Nomenclature Working Group: + Human Mutation 11:1-3, 1998. + http://www.interscience.wiley.com/jpages/1059-7794/nomenclature.html + Returns : string + +=cut + + +sub sysname { + my ($self,$value) = @_; + if( defined $value) { + $self->{'sysname'} = $value; + } else { + $self->warn('Mutation start position is not defined') + if not defined $self->start; + my $sysname = ''; + # show the alphabet only if $self->SeqDiff->alphabet is set; + my $mol = ''; + if ($self->SeqDiff->alphabet && $self->SeqDiff->alphabet eq 'dna') { + $mol = 'g.'; + } + elsif ($self->SeqDiff->alphabet && $self->SeqDiff->alphabet eq 'rna') { + $mol = 'c.'; + } + my $sep; + if ($self->isMutation) { + $sep = '>'; + } else { + $sep = '|'; + } + my $sign = '+'; + $sign = '' if $self->start < 1; + $sysname .= $mol ;#if $mol; + $sysname .= $sign. $self->start; + + my @alleles = $self->each_Allele; + $self->allele_mut($alleles[0]); + + $sysname .= 'del' if $self->label =~ /deletion/; + $sysname .= 'ins' if $self->label =~ /insertion/; + $sysname .= uc $self->allele_ori->seq if $self->allele_ori->seq; + + + + #push @alleles, $self->allele_mut if $self->allele_mut; + foreach my $allele (@alleles) { + $self->allele_mut($allele); + $sysname .= $sep if $self->label =~ /point/ or $self->label =~ /complex/; + $sysname .= uc $self->allele_mut->seq if $self->allele_mut->seq; + } + $self->{'sysname'} = $sysname; + #$self->{'sysname'} = $sign. $self->start. + # uc $self->allele_ori->seq. $sep. uc $self->allele_mut->seq; + } + return $self->{'sysname'}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/IO.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/IO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,349 @@ +# $Id: IO.pm,v 1.14 2002/11/04 09:07:45 heikki Exp $ +# +# BioPerl module for Bio::Variation::IO +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::IO - Handler for sequence variation IO Formats + +=head1 SYNOPSIS + + use Bio::Variation::IO; + + $in = Bio::Variation::IO->new(-file => "inputfilename" , '-format' => 'flat'); + $out = Bio::Variation::IO->new(-file => ">outputfilename" , '-format' => 'xml'); + # note: we quote -format to keep older perl's from complaining. + + while ( my $seq = $in->next() ) { + $out->write($seq); + } + +or + + use Bio::Variation::IO; + + #input file format can be read from the file extension (dat|xml) + $in = Bio::Variation::IO->newFh(-file => "inputfilename"); + $out = Bio::Variation::IO->newFh('-format' => 'xml'); + + # World's shortest flat<->xml format converter: + print $out $_ while <$in>; + +=head1 DESCRIPTION + +Bio::Variation::IO is a handler module for the formats in the Variation IO set (eg, +Bio::Variation::IO::flat). It is the officially sanctioned way of getting at +the format objects, which most people should use. + +The structure, conventions and most of the code is inherited from +L module. The main difference is that instead of using +methods next_seq and write_seq, you drop '_seq' from the method names. + +The idea is that you request a stream object for a particular format. +All the stream objects have a notion of an internal file that is read +from or written to. A particular SeqIO object instance is configured +for either input or output. A specific example of a stream object is +the Bio::Variation::IO::flat object. + +Each stream object has functions + + $stream->next(); + +and + + $stream->write($seqDiff); + +also + + $stream->type() # returns 'INPUT' or 'OUTPUT' + +As an added bonus, you can recover a filehandle that is tied to the +SeqIO object, allowing you to use the standard EE and print operations +to read and write sequence objects: + + use Bio::Variation::IO; + + $stream = Bio::Variation::IO->newFh(-format => 'flat'); # read from standard input + + while ( $seq = <$stream> ) { + # do something with $seq + } + +and + + print $stream $seq; # when stream is in output mode + +This makes the simplest ever reformatter + + #!/usr/local/bin/perl + + $format1 = shift; + $format2 = shift || die "Usage: reformat format1 format2 < input > output"; + + use Bio::Variation::IO; + + $in = Bio::Variation::IO->newFh(-format => $format1 ); + $out = Bio::Variation::IO->newFh(-format => $format2 ); + #note: you might want to quote -format to keep older perl's from complaining. + + print $out $_ while <$in>; + + +=head1 CONSTRUCTORS + +=head2 Bio::Variation::IO-Enew() + + $seqIO = Bio::Variation::IO->new(-file => 'filename', -format=>$format); + $seqIO = Bio::Variation::IO->new(-fh => \*FILEHANDLE, -format=>$format); + $seqIO = Bio::Variation::IO->new(-format => $format); + +The new() class method constructs a new Bio::Variation::IO object. The +returned object can be used to retrieve or print BioSeq objects. new() +accepts the following parameters: + +=over 4 + +=item -file + +A file path to be opened for reading or writing. The usual Perl +conventions apply: + + 'file' # open file for reading + '>file' # open file for writing + '>>file' # open file for appending + '+new(-fh => \*STDIN); + +Note that you must pass filehandles as references to globs. + +If neither a filehandle nor a filename is specified, then the module +will read from the @ARGV array or STDIN, using the familiar EE +semantics. + +=item -format + +Specify the format of the file. Supported formats include: + + flat pseudo EMBL format + xml seqvar xml format + +If no format is specified and a filename is given, then the module +will attempt to deduce it from the filename. If this is unsuccessful, +Fasta format is assumed. + +The format name is case insensitive. 'FLAT', 'Flat' and 'flat' are +all supported. + +=back + +=head2 Bio::Variation::IO-EnewFh() + + $fh = Bio::Variation::IO->newFh(-fh => \*FILEHANDLE, -format=>$format); + $fh = Bio::Variation::IO->newFh(-format => $format); + # etc. + + #e.g. + $out = Bio::Variation::IO->newFh( '-FORMAT' => 'flat'); + print $out $seqDiff; + +This constructor behaves like new(), but returns a tied filehandle +rather than a Bio::Variation::IO object. You can read sequences from this +object using the familiar EE operator, and write to it using print(). +The usual array and $_ semantics work. For example, you can read all +sequence objects into an array like this: + + @mutations = <$fh>; + +Other operations, such as read(), sysread(), write(), close(), and printf() +are not supported. + +=head1 OBJECT METHODS + +See below for more detailed summaries. The main methods are: + +=head2 $sequence = $seqIO-Enext() + +Fetch the next sequence from the stream. + +=head2 $seqIO-Ewrite($sequence [,$another_sequence,...]) + +Write the specified sequence(s) to the stream. + +=head2 TIEHANDLE(), READLINE(), PRINT() + +These provide the tie interface. See L for more details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Variation::IO; +my $VERSION=1.0; + +use strict; +use vars '@ISA'; + +use Bio::SeqIO; + +@ISA = 'Bio::SeqIO'; + +=head2 new + + Title : new + Usage : $stream = Bio::Variation::IO->new(-file => $filename, -format => 'Format') + Function: Returns a new seqstream + Returns : A Bio::Variation::IO::Handler initialised with the appropriate format + Args : -file => $filename + -format => format + -fh => filehandle to attach to + +=cut + + +sub new { + my ($class, %param) = @_; + my ($format); + + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + $format = $param{'-format'} + || $class->_guess_format( $param{-file} || $ARGV[0] ) + || 'flat'; + $format = "\L$format"; # normalize capitalization to lower case + + return undef unless $class->_load_format_module($format); + return "Bio::Variation::IO::$format"->new(%param); +} + + +sub _load_format_module { + my ($class, $format) = @_; + my $module = "Bio::Variation::IO::" . $format; + my $ok; + eval { + $ok = $class->_load_module($module); + }; + if ( $@ ) { + print STDERR <next + Function: reads the next $seqDiff object from the stream + Returns : a Bio::Variation::SeqDiff object + Args : + +=cut + +sub next { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::Variation::IO object."); +} + +sub next_seq { + my ($self, $seq) = @_; + $self->throw("These are not sequence objects. Use method 'next' instead of 'next_seq'."); + $self->next($seq); +} + +=head2 write + + Title : write + Usage : $stream->write($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Variation::SeqDiff object + +=cut + +sub write { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot write to a generic Bio::Variation::IO object."); +} + +sub write_seq { + my ($self, $seq) = @_; + $self->warn("These are not sequence objects. Use method 'write' instead of 'write_seq'."); + $self->write($seq); +} + +=head2 _guess_format + + Title : _guess_format + Usage : $obj->_guess_format($filename) + Function: + Example : + Returns : guessed format of filename (lower case) + Args : + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'flat' if /\.dat$/i; + return 'xml' if /\.xml$/i; +} + + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/IO/flat.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/IO/flat.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,725 @@ +# $Id: flat.pm,v 1.12 2002/10/22 07:38:50 lapp Exp $ +# BioPerl module for Bio::Variation::IO::flat +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::IO::flat - flat file sequence variation input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::Variation::IO class. + +=head1 DESCRIPTION + +This object can transform Bio::Variation::SeqDiff objects to and from +flat file databases. The format used is EMBL like extension of what is +used by the "EBI Mutation Checker" at +http://www.ebi.ac.uk/cgi-bin/mutations/check.cgi and will eventually +replace it. + +More information of the attributes and values use can be found at +http://www.ebi.ac.uk/mutations/recommendations/. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Variation::IO::flat; +my $VERSION=1.0; +use vars qw(@ISA); +use strict; + +use Text::Wrap; +use Bio::Variation::IO; +use Bio::Variation::SeqDiff; +use Bio::Variation::DNAMutation; +use Bio::Variation::RNAChange; +use Bio::Variation::AAChange; +use Bio::Variation::Allele; + + +@ISA = qw(Bio::Variation::IO); + +sub new { + my($class, @args) = @_; + my $self = bless {}, $class; + $self->_initialize(@args); + return $self; +} + +sub _initialize { + my($self,@args) = @_; + return unless $self->SUPER::_initialize(@args); +} + +=head2 next + + + Title : next + Usage : $haplo = $stream->next() + Function: returns the next seqDiff in the stream + Returns : Bio::Variation::SeqDiff object + Args : NONE + +=cut + +sub next { + my( $self ) = @_; + local $/ = '//'; + return unless my $entry = $self->_readline; + + return if $entry =~ /^\s+$/; + + $entry =~ /\s*ID\s+\S+/ || $self->throw("We do need an ID!"); + + my ($id, $offset, $alphabet) = $entry =~ /\s*ID +([^:]+)..(\d+)[^\)]*.\[?([cg])?/ + or $self->throw("Can't parse ID line"); +# $self->throw("$1|$2|$3"); + my $h =Bio::Variation::SeqDiff->new(-id => $id, + -offset => $offset, + ); + if ($alphabet) { + if ($alphabet eq 'g') { + $alphabet = 'dna'; + } + elsif ($alphabet eq 'c') { + $alphabet = 'rna'; + } + $h->alphabet($alphabet); + } + # + # DNA + # + my @dna = split ( / DNA;/, $entry ); + shift @dna; + my $prevdnaobj; + foreach my $dna (@dna) { + $dna =~ s/Feature[ \t]+//g; + ($dna) = split "RNA; ", $dna; + #$self->warn("|$dna|") ; + #exit; + my ($mut_number, $proof, $location, $upflank, $change, $dnflank) = + $dna =~ m|\W+([\d\.]+).+/proof: (\w+).+/location: ([^ \n]+).+/upflank: ([ \n\w]+).+/change: ([^ /]+).+/dnflank: ([ \n\w]+)|s; + $change =~ s/[ \n]//g; + my ($ori, $mut) = split /[>\|]/, $change; + my ($variation_number, $change_number) = split /\./, $mut_number; + #$self->warn("|$mut_number|>|$variation_number|$change_number|"); + my $dnamut; + if ($change_number and $change_number > 1 ) { + my $a3 = Bio::Variation::Allele->new; + $a3->seq($mut) if $mut; + #$dnamut->add_Allele($a3); + $prevdnaobj->add_Allele($a3); + } else { + $upflank =~ s/[ \n]//g; + $dnflank =~ s/[ \n]//g; + my ($region, $junk, $region_value, $junk2, $region_dist) = + $dna =~ m|.+/region: ([\w\']+)(; )?(\w+)?( ?\(\+?)?(-?\d+)?|s; + #my $s = join ("|", $mut_number, $proof, $location, $upflank, + # $change, $dnflank, $region, $region_value, $region_dist, $1,$2,$3,$4,$5); + #$self->warn($s); + #exit; + my ($start, $sep, $end) = $location =~ /(-?\d+)(.)?\D?(-?\d+)?/; + $end = $start if not $end ; + my ($len) = $end - $start +1; + $len = 0, $start = $end if defined $sep and $sep eq '^'; + my $ismut = 0; + $ismut = 1 if $change =~ m/>/; + + $dnamut = Bio::Variation::DNAMutation->new + ('-start' => $start, + '-end' => $end, + '-length' => $len, + '-upStreamSeq' => $upflank, + '-dnStreamSeq' => $dnflank, + '-proof' => $proof, + '-mut_number' => $mut_number + ); + $prevdnaobj = $dnamut; + my $a1 = Bio::Variation::Allele->new; + $a1->seq($ori) if $ori; + $dnamut->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq($mut) if $mut; + $dnamut->add_Allele($a2); + if ($ismut) { + $dnamut->isMutation(1); + $dnamut->allele_mut($a2); + } + $dnamut->region($region) if defined $region; + $dnamut->region_value($region_value) if defined $region_value; + $dnamut->region_dist($region_dist) if defined $region_dist; + + $h->add_Variant($dnamut); + $dnamut->SeqDiff($h); + } + } + + # + # RNA + # + my @rna = split ( / RNA;/, $entry ); + shift @rna; + my $prevrnaobj; + foreach my $rna (@rna) { + $rna = substr ($rna, 0, index($rna, 'Feature AA')); + $rna =~ s/Feature[ \t]+//g; + ($rna) = split "DNA; ", $rna; + #$self->warn("|$rna|") ; + my ($mut_number, $proof, $location, $upflank, $change, $dnflank) = + $rna =~ m|\W+([\d\.]+).+/proof: (\w+).+/location: ([^ \n]+).+/upflank: (\w+).+/change: ([^/]+).+/dnflank: (\w+)|s ;#' + my ($region, $junk, $region_value, $junk2, $region_dist) = + $rna =~ m|.+/region: ([\w\']+)(; )?(\w+)?( ?\(\+?)?(-?\d+)?|s; + #my $s = join ("|", $mut_number, $proof, $location, $upflank, + # $change, $dnflank, $region, $region_value, $region_dist, $1,$2,$3,$4,$5); + #$self->warn($s); + #exit; + $change =~ s/[ \n]//g; + my ($ori, $mut) = split /[>\|]/, $change; + my $rnamut; + my ($variation_number, $change_number) = split /\./, $mut_number; + if ($change_number and $change_number > 1 ) { + my $a3 = Bio::Variation::Allele->new; + $a3->seq($mut) if $mut; + #$rnamut->add_Allele($a3); + $prevrnaobj->add_Allele($a3); + } else { + my ($start, $sep, $end) = $location =~ /(-?\d+)(.)?\D?(-?\d+)?/; + $end = $start if not $end ; + my ($len) = $end - $start + 1; + $len = 0, $start = $end if defined $sep and $sep eq '^'; + my $ismut; + $ismut = 1 if $change =~ m/>/; + my ($codon_table) = $rna =~ m|.+/codon_table: (\d+)|s; + my ($codon_pos) = $rna =~ m|.+/codon:[^;]+; ([123])|s; + + $rnamut = Bio::Variation::RNAChange->new + ('-start' => $start, + '-end' => $end, + '-length' => $len, + '-upStreamSeq' => $upflank, + '-dnStreamSeq' => $dnflank, + '-proof' => $proof, + '-mut_number' => $mut_number + + ); + $prevrnaobj = $rnamut; + my $a1 = Bio::Variation::Allele->new; + $a1->seq($ori) if $ori; + $rnamut->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq($mut) if $mut; + $rnamut->add_Allele($a2); + if ($ismut) { + $rnamut->isMutation(1); + $rnamut->allele_mut($a2); + } + $rnamut->region($region) if defined $region; + $rnamut->region_value($region_value) if defined $region_value; + $rnamut->region_dist($region_dist) if defined $region_dist; + + $rnamut->codon_table($codon_table) if $codon_table; + $rnamut->codon_pos($codon_pos) if $codon_pos; + $h->add_Variant($rnamut); + foreach my $mut ($h->each_Variant) { + if ($mut->isa('Bio::Variation::DNAMutation') ) { + if ($mut->mut_number == $rnamut->mut_number) { + $rnamut->DNAMutation($mut); + $mut->RNAChange($rnamut); + } + } + } + } + } + # + # AA + # + my @aa = split ( / AA;/, $entry ); + shift @aa; + my $prevaaobj; + foreach my $aa (@aa) { + $aa = substr ($aa, 0, index($aa, 'Feature AA')); + $aa =~ s/Feature[ \t]+//g; + ($aa) = split "DNA; ", $aa; + #$self->warn("|$aa|") ; + my ($mut_number, $proof, $location, $change) = + $aa =~ m|\W+([\d\.]+).+/proof: (\w+).+/location: ([^ \n]+)./change: ([^/;]+)|s; + $change =~ s/[ \n]//g; + #my $s = join ("|", $mut_number, $proof, $location, $change); + #$self->warn($s); + #exit; + $change =~ s/[ \n]//g; + $change =~ s/DNA$//; + my ($ori, $mut) = split /[>\|]/, $change; + #print "------$location----$ori-$mut-------------\n"; + my ($variation_number, $change_number) = split /\./, $mut_number; + my $aamut; + if ($change_number and $change_number > 1 ) { + my $a3 = Bio::Variation::Allele->new; + $a3->seq($mut) if $mut; + $prevaaobj->add_Allele($a3); + } else { + my ($start, $sep, $end) = $location =~ /(-?\d+)(.)?\D?(-?\d+)?/; + $end = $start if not $end ; + my ($len) = $end - $start + 1; + $len = 0, $start = $end if defined $sep and $sep eq '^'; + my $ismut; + $ismut = 1 if $change =~ m/>/; + my ($region) = $aa =~ m|.+/region: (\w+)|s ; + $aamut = Bio::Variation::AAChange->new + ('-start' => $start, + '-end' => $end, + '-length' => $len, + '-proof' => $proof, + '-mut_number' => $mut_number + ); + $prevaaobj = $aamut; + my $a1 = Bio::Variation::Allele->new; + $a1->seq($ori) if $ori; + $aamut->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq($mut) if $mut; + $aamut->add_Allele($a2); + if ($ismut) { + $aamut->isMutation(1); + $aamut->allele_mut($a2); + } + $region && $aamut->region($region); + $h->add_Variant($aamut); + foreach my $mut ($h->each_Variant) { + if ($mut->isa('Bio::Variation::RNAChange') ) { + if ($mut->mut_number == $aamut->mut_number) { + $aamut->RNAChange($mut); + $mut->AAChange($aamut); + } + } + } + + } + } + return $h; +} + +=head2 write + + Title : write + Usage : $stream->write(@seqDiffs) + Function: writes the $seqDiff object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Variation::SeqDiff object + + +=cut + +sub write { + my ($self,@h) = @_; + + #$columns = 75; #default for Text::Wrap + my %tag = + ( + 'ID' => 'ID ', + 'Description' => 'Description ', + 'FeatureKey' => 'Feature ', + 'FeatureQual' => "Feature ", + 'FeatureWrap' => "Feature ", + 'ErrorComment' => 'Comment ' + #'Comment' => 'Comment -!-', + #'CommentLine' => 'Comment ', + ); + + if( !defined $h[0] ) { + $self->throw("Attempting to write with no information!"); + } + + foreach my $h (@h) { + + my @entry =(); + + my ($text, $tmp, $tmp2, $sep); + my ($count) = 0; + + + $text = $tag{ID}; + + $text .= $h->id; + $text .= ":(". $h->offset; + $text .= "+1" if $h->sysname =~ /-/; + $text .= ")". $h->sysname; + $text .= "; ". $h->trivname if $h->trivname; + push (@entry, $text); + + #Variants need to be ordered accoding to mutation_number attribute + #put them into a hash of arrays holding the Variant objects + #This is necessary for cases like several distict mutations present + # in the same sequence. + my @allvariants = $h->each_Variant; + my %variants = (); + foreach my $mut ($h->each_Variant) { + push @{$variants{$mut->mut_number} }, $mut; + } + #my ($variation_number, $change_number) = split /\./, $mut_number; + foreach my $var (sort keys %variants) { + #print $var, ": ", join (" ", @{$variants{$var}}), "\n"; + + foreach my $mut (@{$variants{$var}}) { + # + # DNA + # + if ( $mut->isa('Bio::Variation::DNAMutation') ) { + #collect all non-reference alleles + $self->throw("allele_ori needs to be defined in [$mut]") + if not $mut->allele_ori; + if ($mut->isMutation) { + $sep = '>'; + } else { + $sep = '|'; + } + my @alleles = $mut->each_Allele; + #push @alleles, $mut->allele_mut if $mut->allele_mut; + my $count = 0; # two alleles + foreach my $allele (@alleles) { + $count++; + my ($variation_number, $change_number) = split /\./, $mut->mut_number; + if ($change_number and $change_number != $count){ + $mut->mut_number("$change_number.$count"); + } + $mut->allele_mut($allele); + push (@entry, + $tag{FeatureKey}. 'DNA'. "; ". $mut->mut_number + ); + #label + $text=$tag{FeatureQual}. '/label: '. $mut->label; + push (@entry, $text); + + #proof + if ($mut->proof) { + $text = $tag{FeatureQual}. '/proof: '. $mut->proof; + push (@entry, $text) ; + } + #location + $text = $tag{FeatureQual}. '/location: '; + #$mut->id. '; '. $mut->start; + if ($mut->length > 1 ) {# if ($mut->end - $mut->start ) { + my $l = $mut->start + $mut->length -1; + $text .= $mut->start. '..'. $l; + } + elsif ($mut->length == 0) { + my $tmp_start = $mut->start - 1; + $tmp_start-- if $tmp_start == 0; + $text .= $tmp_start. '^'. $mut->end; + } else { + $text .= $mut->start; + } + + if ($h->alphabet && $h->alphabet eq 'dna') { + $tmp = $mut->start + $h->offset; + $tmp-- if $tmp <= 0; + $mut->start < 1 && $tmp++; + #$text.= ' ('. $h->id. '::'. $tmp; + $tmp2 = $mut->end + $h->offset; + if ( $mut->length > 1 ) { + $mut->end < 1 && $tmp2++; + $text.= ' ('. $h->id. '::'. $tmp. "..". $tmp2; + } + elsif ($mut->length == 0) { + $tmp--; + $tmp-- if $tmp == 0; + $text .= ' ('. $h->id. '::'. $tmp. '^'. $tmp2; + } else { + $text.= ' ('. $h->id. '::'. $tmp; + } + $text .= ')'; + } + push (@entry, $text); + #sequence + push (@entry, + $tag{FeatureQual}. '/upflank: '. $mut->upStreamSeq + ); + $text = ''; + $text = $mut->allele_ori->seq if $mut->allele_ori->seq; + $text .= $sep; + $text .= $mut->allele_mut->seq if $mut->allele_mut->seq; + push (@entry, + wrap($tag{FeatureQual}. '/change: ', $tag{FeatureWrap}, + $text) + ); + + push (@entry, + $tag{FeatureQual}. '/dnflank: '. $mut->dnStreamSeq + ); + #restriction enzyme + if ($mut->restriction_changes ne '') { + $text = $mut->restriction_changes; + $text = wrap($tag{FeatureQual}. '/re_site: ', $tag{FeatureWrap}, $text); + push (@entry, + $text + ); + } + #region + if ($mut->region ) { + $text = $tag{FeatureQual}. '/region: '. $mut->region; + $text .= ';' if $mut->region_value or $mut->region_dist; + $text .= ' '. $mut->region_value if $mut->region_value; + if ($mut->region_dist ) { + $tmp = ''; + $tmp = '+' if $mut->region_dist > 1; + $text .= " (". $tmp. $mut->region_dist. ')'; + } + push (@entry, $text); + } + #CpG + if ($mut->CpG) { + push (@entry, + $tag{FeatureQual}. "/CpG" + ); + } + } + } + # + # RNA + # + elsif ($mut->isa('Bio::Variation::RNAChange') ) { + #collect all non-reference alleles + $self->throw("allele_ori needs to be defined in [$mut]") + if not $mut->allele_ori; + my @alleles = $mut->each_Allele; + #push @alleles, $mut->allele_mut if $mut->allele_mut; + if ($mut->isMutation) { + $sep = '>'; + } else { + $sep = '|'; + } + + my $count = 0; # two alleles + foreach my $allele (@alleles) { + $count++; + my ($variation_number, $change_number) = split /\./, $mut->mut_number; + if ($change_number and $change_number != $count){ + $mut->mut_number("$change_number.$count"); + } + $mut->allele_mut($allele); + push (@entry, + $tag{FeatureKey}. 'RNA'. "; ". $mut->mut_number + ); + #label + $text=$tag{FeatureQual}. '/label: '. $mut->label; + push (@entry, $text); + #proof + if ($mut->proof) { + $text = $tag{FeatureQual}. '/proof: '. $mut->proof; + push (@entry, $text) ; + } + #location + $text = $tag{FeatureQual}. '/location: ' ; + if ($mut->length > 1 ) { + $text .= $mut->start. '..'. $mut->end; + $tmp2 = $mut->end + $h->offset; + } + elsif ($mut->length == 0) { + my $tmp_start = $mut->start; + $tmp_start--; + $tmp_start-- if $tmp_start == 0; + $text .= $tmp_start. '^'. $mut->end; + } else { + $text .= $mut->start; + } + + if ($h->alphabet && $h->alphabet eq 'rna') { + $tmp = $mut->start + $h->offset; + $tmp-- if $tmp <= 0; + #$mut->start < 1 && $tmp++; + #$text.= ' ('. $h->id. '::'. $tmp; + $tmp2 = $mut->end + $h->offset; + #$mut->end < 1 && $tmp2++; + if ( $mut->length > 1 ) { + $text.= ' ('. $h->id. '::'. $tmp. "..". $tmp2; + } + elsif ($mut->length == 0) { + $tmp--; + $text .= ' ('. $h->id. '::'. $tmp. '^'. $tmp2; + } else { + $text.= ' ('. $h->id. '::'. $tmp; + } + + $text .= ')'; + } + push (@entry, $text); + + #sequence + push (@entry, + $tag{FeatureQual}. '/upflank: '. $mut->upStreamSeq + ); + $text = ''; + $text = $mut->allele_ori->seq if $mut->allele_ori->seq; + $text .= $sep; + $text .= $mut->allele_mut->seq if $mut->allele_mut->seq; + push (@entry, + wrap($tag{FeatureQual}. '/change: ', $tag{FeatureWrap}, + $text) + ); + push (@entry, + $tag{FeatureQual}. '/dnflank: '. $mut->dnStreamSeq + ); + #restriction + if ($mut->restriction_changes ne '') { + $text = $mut->restriction_changes; + $text = wrap($tag{FeatureQual}. '/re_site: ', $tag{FeatureWrap}, $text); + push (@entry, + $text + ); + } + #coding + if ($mut->region eq 'coding') { + #codon table + $text = $tag{FeatureQual}. '/codon_table: '; + $text .= $mut->codon_table; + push (@entry, $text); + #codon + + $text = $tag{FeatureQual}. '/codon: '. $mut->codon_ori. $sep; + if ($mut->DNAMutation->label =~ /.*point/) { + $text .= $mut->codon_mut; + } + else { + $text .= '-'; + } + $text .= "; ". $mut->codon_pos; + push (@entry, $text); + } + #region + if ($mut->region ) { + $text = $tag{FeatureQual}. '/region: '. $mut->region; + $text .= ';' if $mut->region_value or $mut->region_dist; + $text .= ' '. $mut->region_value if $mut->region_value; + if ($mut->region_dist ) { + $tmp = ''; + $tmp = '+' if $mut->region_dist > 1; + $text .= " (". $tmp. $mut->region_dist. ')'; + } + push (@entry, $text); + } + } + } + # + # AA + # + elsif ($mut->isa('Bio::Variation::AAChange')) { + #collect all non-reference alleles + $self->throw("allele_ori needs to be defined in [$mut]") + if not $mut->allele_ori; + if ($mut->isMutation) { + $sep = '>'; + } else { + $sep = '|'; + } + my @alleles = $mut->each_Allele; + #push @alleles, $mut->allele_mut if $mut->allele_mut; + my $count = 0; # two alleles + foreach my $allele (@alleles) { + $count++; + my ($variation_number, $change_number) = split /\./, $mut->mut_number; + if ($change_number and $change_number != $count){ + $mut->mut_number("$change_number.$count"); + } + $mut->allele_mut($allele); + push (@entry, + $tag{FeatureKey}. 'AA'. "; ". $mut->mut_number + ); + #label + $text=$tag{FeatureQual}. '/label: '. $mut->label; + push (@entry, $text) ; + #proof + if ($mut->proof) { + $text = $tag{FeatureQual}. '/proof: '. $mut->proof; + push (@entry, $text) ; + } + #location + $text = $tag{FeatureQual}. '/location: '. + #$mut->id. '; '. $mut->start; + $mut->start; + if ($mut->length > 1 ) { + $tmp = $mut->start + $mut->length -1; + $text .= '..'. $tmp; + } + push (@entry, $text); + #sequence + $text = ''; + $text = $mut->allele_ori->seq if $mut->allele_ori->seq; + $text .= $sep; + $text .= $mut->allele_mut->seq if $mut->allele_mut->seq; + push (@entry, + wrap($tag{FeatureQual}. '/change: ', $tag{FeatureWrap}, + $text) + ); + #region + if ($mut->region ) { + $text = $tag{FeatureQual}. '/region: '. $mut->region; + $text .= ';' if $mut->region_value or $mut->region_dist; + $text .= ' '. $mut->region_value if $mut->region_value; + if ($mut->region_dist ) { + $tmp = ''; + $tmp = '+' if $mut->region_dist > 1; + $text .= " (". $tmp. $mut->region_dist. ')'; + } + push (@entry, $text); + } + } + } + } + } + push (@entry, + "//" + ); + my $str = join ("\n", @entry). "\n"; + $str =~ s/\t/ /g; + $self->_print($str); + } + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/IO/xml.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/IO/xml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,561 @@ +# $Id: xml.pm,v 1.12.2.1 2003/03/01 17:23:43 jason Exp $ +# BioPerl module for Bio::Variation::IO::xml +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::IO::xml - XML sequence variation input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::Variation::IO class. + +=head1 DESCRIPTION + +This object can transform Bio::Variation::SeqDiff objects to and from XML +file databases. + +The XML format, although consistent, is still evolving. The current +DTD for it is at LEwww.ebi.ac.ukEmutationsEDTDEseqDiff.dtd>. + +=head1 REQUIREMENTS + +To use this code you need the module XML::Twig which creates an +interface to XML::Parser to read XML and modules XML::Writer and +IO::String to write XML out. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Variation::IO::xml; +my $VERSION=1.1; +use vars qw(@ISA $seqdiff $var $prevdnaobj $prevrnaobj $prevaaobj); +use strict; + +use XML::Twig; +use XML::Writer 0.4; +use IO::String; +use Bio::Variation::IO; +use Bio::Variation::SeqDiff; +use Bio::Variation::DNAMutation; +use Bio::Variation::RNAChange; +use Bio::Variation::AAChange; +use Bio::Variation::Allele; + +# new() is inherited from Bio::Root::Object +@ISA = qw( Bio::Variation::IO ); + +# _initialize is where the heavy stuff will happen when new is called + +sub new { + my ($class,@args) = @_; + my $self = bless {}, $class; + $self->_initialize(@args); + return $self; +} + +sub _initialize { + my($self,@args) = @_; + return unless $self->SUPER::_initialize(@args); +} + +=head2 next + + Title : next + Usage : $haplo = $stream->next() + Function: returns the next seqDiff in the stream + Returns : Bio::Variation::SeqDiff object + Args : NONE + +=cut + + +sub _seqDiff { + my ($t, $term)= @_; + $seqdiff->id( $term->att('id') ); + $seqdiff->alphabet( $term->att('moltype') ); + $seqdiff->offset( $term->att('offset') ); + + foreach my $child ($term->children) { + _variant($t, $child); + } +} + +sub _variant { + my ($t, $term)= @_; + my $var; + my $att = $term->atts(); + my ($variation_number, $change_number) = split /\./, $att->{number}; + + # if more than two alleles + if ($variation_number and $change_number and $change_number > 1 ) { + my $a3 = Bio::Variation::Allele->new; + $a3->seq( $term->first_child_text('allele_mut') ) + if $term->first_child_text('allele_mut'); + if ($term->gi eq 'DNA') { + $prevdnaobj->add_Allele($a3); + } + elsif ($term->gi eq 'RNA') { + $prevrnaobj->add_Allele($a3); + } else { # AA + $prevaaobj->add_Allele($a3); + } + } else { # create new variants + if ($term->gi eq 'DNA') { + $var = new Bio::Variation::DNAMutation; + } + elsif ($term->gi eq 'RNA') { + $var = new Bio::Variation::RNAChange; + } else { # AA + $var = new Bio::Variation::AAChange; + } + + # these are always present + $var->start( $att->{start} ); + $var->end( $att->{end}); + $var->length($att->{len}); + $var->mut_number( $att->{number}); + $var->upStreamSeq($term->first_child_text('upFlank')); + $var->dnStreamSeq($term->first_child_text('dnFlank')); + $var->proof($term->first_child_text('proof')); + + # region + my $region = $term->first_child('region'); + if ($region) { + $var->region($region->text); + my $region_atts = $region->atts; + $var->region_value( $region_atts->{value} ) + if $region_atts->{value}; + $var->region_dist( $region_atts->{dist} ) + if $region_atts->{dist}; + } + + # alleles + my $a1 = Bio::Variation::Allele->new; + $a1->seq($term->first_child_text('allele_ori') ) + if $term->first_child_text('allele_ori'); + $var->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq($term->first_child_text('allele_mut') ) + if $term->first_child_text('allele_mut'); + $var->isMutation(1) if $term->att('isMutation'); + $var->allele_mut($a2); + $var->add_Allele($a2); + $var->length( $term->att('length') ); + $seqdiff->add_Variant($var); + + # variant specific code + if ($term->gi eq 'DNA') { + $prevdnaobj = $var; + } + elsif ($term->gi eq 'RNA') { + my $codon = $term->first_child('codon'); + if ($codon) { + my $codon_atts = $codon->atts; + $var->codon_table( $codon->att('codon_table') ) + if $codon_atts->{codon_table} and $codon_atts->{codon_table} != 1; + $var->codon_pos( $codon->att('codon_pos') ) + if $codon_atts->{codon_pos}; + } + $prevdnaobj->RNAChange($var); + $var->DNAMutation($prevdnaobj); + $prevrnaobj = $var; + } else { + $prevrnaobj->AAChange($var); + $var->RNAChange($prevrnaobj); + $prevaaobj = $var; + } + } +} + +sub next { + my( $self ) = @_; + + local $/ = "\n"; + return unless my $entry = $self->_readline; +# print STDERR "|$entry|"; + return unless $entry =~ /^\W*new; + + # create new parser object + my $twig_handlers = {'seqDiff' => \&_seqDiff }; + my $t = new XML::Twig ( TwigHandlers => $twig_handlers, + KeepEncoding => 1 ); + $t->parse($entry); + + return $seqdiff; +} + +=head2 write + + Title : write + Usage : $stream->write(@haplos) + Function: writes the $seqDiff objects into the stream + Returns : 1 for success and 0 for error + Args : Bio::Variation::SeqDiff object + +=cut + +sub write { + my ($self,@h) = @_; + + if( ! defined $h[0] ) { + $self->throw("Attempting to write with no information!"); + } + my $str; + my $output = IO::String->new($str); + my $w = new XML::Writer(OUTPUT => $output, DATA_MODE => 1, DATA_INDENT => 4 ); + foreach my $h (@h) { + # + # seqDiff + # + $h->alphabet || $self->throw("Moltype of the reference sequence is not set!"); + my $hasAA = 0; + foreach my $mut ($h->each_Variant) { + $hasAA = 1 if $mut->isa('Bio::Variation::AAChange'); + } + if ($hasAA) { + $w->startTag("seqDiff", + "id" => $h->id, + "moltype" => $h->alphabet, + "offset" => $h->offset, + "sysname" => $h->sysname, + "trivname" => $h->trivname + ); + } else { + $w->startTag("seqDiff", + "id" => $h->id, + "moltype" => $h->alphabet, + "offset" => $h->offset, + "sysname" => $h->sysname + ); + } + my @allvariants = $h->each_Variant; + #print "allvars:", scalar @allvariants, "\n"; + my %variants = (); + foreach my $mut ($h->each_Variant) { + #print STDERR $mut->mut_number, "\t", $mut, "\t", + #$mut->proof, "\t", scalar $mut->each_Allele, "\n"; + push @{$variants{$mut->mut_number} }, $mut; + } + foreach my $var (sort keys %variants) { + foreach my $mut (@{$variants{$var}}) { + # + # DNA + # + if( $mut->isa('Bio::Variation::DNAMutation') ) { + $mut->isMutation(0) if not $mut->isMutation; + my @alleles = $mut->each_Allele; + my $count = 0; + foreach my $allele (@alleles) { + $count++; + my ($variation_number, $change_number) = split /\./, $mut->mut_number; + if ($change_number and $change_number != $count){ + $mut->mut_number("$change_number.$count"); + } + $mut->allele_mut($allele); + $w->startTag("DNA", + "number" => $mut->mut_number, + "start" => $mut->start, + "end" => $mut->end, + "length" => $mut->length, + "isMutation" => $mut->isMutation + ); + if ($mut->label) { + foreach my $label (split ', ', $mut->label) { + $w->startTag("label"); + $w->characters($label); + $w->endTag; + } + } + if ($mut->proof) { + $w->startTag("proof"); + $w->characters($mut->proof ); + $w->endTag; + } + if ($mut->upStreamSeq) { + $w->startTag("upFlank"); + $w->characters($mut->upStreamSeq ); + $w->endTag; + } + #if ( $mut->isMutation) { + #if ($mut->allele_ori) { + $w->startTag("allele_ori"); + $w->characters($mut->allele_ori->seq) if $mut->allele_ori->seq ; + $w->endTag; + #} + #if ($mut->allele_mut) { + $w->startTag("allele_mut"); + $w->characters($mut->allele_mut->seq) if $mut->allele_mut->seq; + $w->endTag; + #} + #} + if ($mut->dnStreamSeq) { + $w->startTag("dnFlank"); + $w->characters($mut->dnStreamSeq ); + $w->endTag; + } + if ($mut->restriction_changes) { + $w->startTag("restriction_changes"); + $w->characters($mut->restriction_changes); + $w->endTag; + } + if ($mut->region) { + if($mut->region_value and $mut->region_dist) { + $w->startTag("region", + "value" => $mut->region_value, + "dist" => $mut->region_dist + ); + } + elsif($mut->region_value) { + $w->startTag("region", + "value" => $mut->region_value + ); + } + elsif($mut->region_dist) { + $w->startTag("region", + "dist" => $mut->region_dist + ); + } else { + $w->startTag("region"); + } + $w->characters($mut->region ); + $w->endTag; + } + $w->endTag; #DNA + } + } + # + # RNA + # + elsif( $mut->isa('Bio::Variation::RNAChange') ) { + $mut->isMutation(0) if not $mut->isMutation; + my @alleles = $mut->each_Allele; + my $count = 0; + foreach my $allele (@alleles) { + $count++; + my ($variation_number, $change_number) = split /\./, $mut->mut_number; + if ($change_number and $change_number != $count){ + $mut->mut_number("$change_number.$count"); + } + $mut->allele_mut($allele); + $w->startTag("RNA", + "number" => $mut->mut_number, + "start" => $mut->start, + "end" => $mut->end, + "length" => $mut->length, + "isMutation" => $mut->isMutation + ); + + if ($mut->label) { + foreach my $label (split ', ', $mut->label) { + $w->startTag("label"); + $w->characters($label ); + $w->endTag; + } + } + if ($mut->proof) { + $w->startTag("proof"); + $w->characters($mut->proof ); + $w->endTag; + } + if ($mut->upStreamSeq) { + $w->startTag("upFlank"); + $w->characters($mut->upStreamSeq ); + $w->endTag; + } + #if ( $mut->isMutation) { + if ($mut->allele_ori) { + $w->startTag("allele_ori"); + $w->characters($mut->allele_ori->seq) if $mut->allele_ori->seq ; + $w->endTag; + } + if ($mut->allele_mut) { + $w->startTag("allele_mut"); + $w->characters($mut->allele_mut->seq) if $mut->allele_mut->seq ; + $w->endTag; + } + #} + if ($mut->dnStreamSeq) { + $w->startTag("dnFlank"); + $w->characters($mut->dnStreamSeq ); + $w->endTag; + } + if ($mut->region eq 'coding') { + if (! $mut->codon_mut) { + $w->startTag("codon", + "codon_ori" => $mut->codon_ori, + "codon_pos" => $mut->codon_pos + ); + } else { + $w->startTag("codon", + "codon_ori" => $mut->codon_ori, + "codon_mut" => $mut->codon_mut, + "codon_pos" => $mut->codon_pos + ); + } + $w->endTag; + } + if ($mut->codon_table != 1) { + $w->startTag("codon_table"); + $w->characters($mut->codon_table); + $w->endTag; + } + + if ($mut->restriction_changes) { + $w->startTag("restriction_changes"); + $w->characters($mut->restriction_changes); + $w->endTag; + } + if ($mut->region) { + if($mut->region_value and $mut->region_dist) { + $w->startTag("region", + "value" => $mut->region_value, + "dist" => $mut->region_dist + ); + } + elsif($mut->region_value) { + $w->startTag("region", + "value" => $mut->region_value + ); + } + elsif($mut->region_dist) { + $w->startTag("region", + "dist" => $mut->region_dist + ); + } else { + $w->startTag("region"); + } + $w->characters($mut->region ); + $w->endTag; + } + $w->endTag; #RNA + } + } + # + # AA + # + elsif( $mut->isa('Bio::Variation::AAChange') ) { + $mut->isMutation(0) if not $mut->isMutation; + my @alleles = $mut->each_Allele; + my $count = 0; + foreach my $allele (@alleles) { + $count++; + my ($variation_number, $change_number) = split /\./, $mut->mut_number; + if ($change_number and $change_number != $count){ + $mut->mut_number("$change_number.$count"); + } + $mut->allele_mut($allele); + $w->startTag("AA", + "number" => $mut->mut_number, + "start" => $mut->start, + "end" => $mut->end, + "length" => $mut->length, + "isMutation" => $mut->isMutation + ); + + if ($mut->label) { + foreach my $label (split ', ', $mut->label) { + $w->startTag("label"); + $w->characters($label ); + $w->endTag; + } + } + if ($mut->proof) { + $w->startTag("proof"); + $w->characters($mut->proof ); + $w->endTag; + } + #if ( $mut->isMutation) { + if ($mut->allele_ori) { + $w->startTag("allele_ori"); + $w->characters($mut->allele_ori->seq) if $mut->allele_ori->seq; + $w->endTag; + } + if ($mut->allele_mut) { + $w->startTag("allele_mut"); + $w->characters($mut->allele_mut->seq) if $mut->allele_mut->seq; + $w->endTag; + } + #} + if ($mut->region) { + if($mut->region_value and $mut->region_dist) { + $w->startTag("region", + "value" => $mut->region_value, + "dist" => $mut->region_dist + ); + } + elsif($mut->region_value) { + $w->startTag("region", + "value" => $mut->region_value + ); + } + elsif($mut->region_dist) { + $w->startTag("region", + "dist" => $mut->region_dist + ); + } else { + $w->startTag("region"); + } + $w->characters($mut->region ); + $w->endTag; + } + $w->endTag; #AA + } + } + } + } + } + $w->endTag; + + $w->end; + $self->_print($str); + $output = undef; + return 1; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/README Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,31 @@ + + README for Bio::Variation classes + + +These classes are part of "Computational Mutation Expression Toolkit" +project at European Bioinformatics Institute +, but they are written to be +as general as possinble. + +Bio::Variation name space contains modules to store sequence variation +information as differences between the reference sequence and changes +sequences. Also included are classes to write out and recrete objects +from EMBL-like flat files and XML. Lastly, there are simple classes to +calculate values for sequence change objects. + +See "Computational Mutation Expression Toolkit" web pages for more +information: + + http://www.ebi.ac.uk/mutations/toolkit/ + + +Send bug reports using the bioperl bug-tracking system at +http://bio.perl.org/Bugs/ or send them via e-mail to +bioperl-bugs@bio.perl.org. + +Send general comments, questions, and feature requests to the bioperl +mailing list: + + bioperl-l@bioperl.org + +Heikki Lehväslaiho diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/RNAChange.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/RNAChange.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,617 @@ +# $Id: RNAChange.pm,v 1.10 2002/10/22 07:38:49 lapp Exp $ +# +# BioPerl module for Bio::Variation::RNAChange +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::RNAChange - Sequence change class for RNA level + +=head1 SYNOPSIS + + $rnachange = Bio::Variation::RNAChange->new + ('-start' => $start, + '-end' => $end, + '-length' => $len, + '-codon_pos' => $cp, + '-upStreamSeq' => $upflank, + '-dnStreamSeq' => $dnflank, + '-proof' => $proof, + '-isMutation' => 1, + '-mut_number' => $mut_number + ); + $a1 = Bio::Variation::Allele->new; + $a1->seq('a'); + $rnachange->allele_ori($a1); + my $a2 = Bio::Variation::Allele->new; + $a2->seq('t'); + $rnachange->add_Allele($a2); + $rnachange->allele_mut($a2); + + print "The codon change is ", $rnachange->codon_ori, + ">", $rnachange->codon_mut, "\n"; + + # add it to a SeqDiff container object + $seqdiff->add_Variant($rnachange); + + # and create links to and from DNA level mutation objects + $rnachange->DNAMutation($dnamut); + $dnamut->RNAChange($rnachange); + +=head1 DESCRIPTION + +The instantiable class Bio::Variation::DNAMutation describes basic +sequence changes at RNA molecule level. It uses methods defined in +superclass Bio::Variation::VariantI. See L +for details. + +You are normally expected to create a corresponding +Bio::Variation::DNAMutation object even if mutation is defined at +RNA level. The numbering follows then cDNA numbering. Link the +DNAMutation object to the RNAChange object using the method +DNAMutation(). If the variation described by a RNAChange object is +translated, link the corresponding Bio::Variation::AAChange object +to it using method AAChange(). See L and +L for more information. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Variation::RNAChange; +use vars qw(@ISA); +use strict; + +# Object preamble - inheritance +my $VERSION=1.0; +use Bio::Variation::VariantI; +use Bio::Tools::CodonTable; + +@ISA = qw( Bio::Variation::VariantI ); + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($start, $end, $length, $strand, $primary, $source, + $frame, $score, $gff_string, + $allele_ori, $allele_mut, $upstreamseq, $dnstreamseq, + $label, $status, $proof, $region, $region_value, $region_dist, $numbering, + $mut_number, $isMutation, + $codon_ori, $codon_mut, $codon_pos, $codon_table, $cds_end) = + $self->_rearrange([qw(START + END + LENGTH + STRAND + PRIMARY + SOURCE + FRAME + SCORE + GFF_STRING + ALLELE_ORI + ALLELE_MUT + UPSTREAMSEQ + DNSTREAMSEQ + LABEL + STATUS + PROOF + REGION + REGION_VALUE + REGION_DIST + NUMBERING + MUT_NUMBER + ISMUTATION + CODON_ORI + CODON_MUT + CODON_POS + TRANSLATION_TABLE + CDS_END + )],@args); + + $self->primary_tag("Variation"); + + $self->{ 'alleles' } = []; + + $start && $self->start($start); + $end && $self->end($end); + $length && $self->length($length); + $strand && $self->strand($strand); + $primary && $self->primary_tag($primary); + $source && $self->source_tag($source); + $frame && $self->frame($frame); + $score && $self->score($score); + $gff_string && $self->_from_gff_string($gff_string); + + $allele_ori && $self->allele_ori($allele_ori); + $allele_mut && $self->allele_mut($allele_mut); + $upstreamseq && $self->upStreamSeq($upstreamseq); + $dnstreamseq && $self->dnStreamSeq($dnstreamseq); + + $label && $self->label($label); + $status && $self->status($status); + $proof && $self->proof($proof); + $region && $self->region($region); + $region_value && $self->region_value($region_value); + $region_dist && $self->region_dist($region_dist); + $numbering && $self->numbering($numbering); + $mut_number && $self->mut_number($mut_number); + $isMutation && $self->isMutation($isMutation); + + $codon_ori && $self->codon_ori($codon_ori); + $codon_mut && $self->codon_mut($codon_mut); + $codon_pos && $self->codon_pos($codon_pos); + $codon_table && $self->codon_table($codon_table); + $cds_end && $self->cds_end($cds_end); + return $self; # success - we hope! +} + + +=head2 codon_ori + + Title : codon_ori + Usage : $obj->codon_ori(); + Function: + + Sets and returns codon_ori triplet. If value is not set, + creates the codon triplet from the codon position and + flanking sequences. The string has to be three characters + long. The character content is not checked. + + Example : + Returns : string + Args : string + +=cut + +sub codon_ori { + my ($self,$value) = @_; + if (defined $value) { + if (length $value != 3) { + $self->warn("Codon string \"$value\" is not three characters long"); + } + $self->{'codon_ori'} = $value; + } + elsif (! $self->{'codon_ori'}) { + my $codon_ori = ''; + + if ($self->region eq 'coding' && $self->start && $self->start >= 1) { + + $self->warn('Codon position is not defined') + if not defined $self->codon_pos; + $self->warn('Upstream flanking sequence is not defined') + if not defined $self->upStreamSeq; + $self->warn('Downstream flanking sequence is not defined') + if not defined $self->dnStreamSeq; + + my $cpos = $self->codon_pos; + $codon_ori = substr($self->upStreamSeq, -$cpos +1 , $cpos-1); + $codon_ori .= substr($self->allele_ori->seq, 0, 4-$cpos) + if $self->allele_ori and $self->allele_ori->seq; + $codon_ori .= substr($self->dnStreamSeq, 0, 3-length($codon_ori)); + } + $self->{'codon_ori'} = lc $codon_ori; + } + return $self->{'codon_ori'}; +} + + +=head2 codon_mut + + Title : codon_mut + Usage : $obj->codon_mut(); + Function: + + Sets and returns codon_mut triplet. If value is not + set, creates the codon triplet from the codon position and + flanking sequences. Return undef for other than point mutations. + + Example : + Returns : string + Args : string + +=cut + + +sub codon_mut { + my ($self,$value) = @_; + if (defined $value) { + if (length $value != 3 ) { + $self->warn("Codon string \"$value\" is not three characters long"); + } + $self->{'codon_mut'} = $value; + } + else { + my $codon_mut = ''; + if ($self->allele_ori->seq and $self->allele_mut->seq and + CORE::length($self->allele_ori->seq) == 1 and + CORE::length($self->allele_mut->seq) == 1 and + $self->region eq 'coding' and $self->start >= 1) { + + $self->warn('Codon position is not defined') + if not defined $self->codon_pos; + $self->warn('Upstream flanking sequnce is not defined') + if not defined $self->upStreamSeq; + $self->warn('Downstream flanking sequnce is not defined') + if not defined $self->dnStreamSeq; + $self->throw('Mutated allele is not defined') + if not defined $self->allele_mut; + + my $cpos = $self->codon_pos; + $codon_mut = substr($self->upStreamSeq, -$cpos +1 , $cpos-1); + $codon_mut .= substr($self->allele_mut->seq, 0, 4-$cpos) + if $self->allele_mut and $self->allele_mut->seq; + $codon_mut .= substr($self->dnStreamSeq, 0, 3-length($codon_mut)); + + $self->{'codon_mut'} = lc $codon_mut; + } + } + return $self->{'codon_mut'}; +} + + +=head2 codon_pos + + Title : codon_pos + Usage : $obj->codon_pos(); + Function: + + Sets and returns the position of the mutation start in the + codon. If value is not set, returns false. + + Example : + Returns : 1,2,3 + Args : none if get, the new value if set + +=cut + + +sub codon_pos { + my ($self,$value) = @_; + if( defined $value) { + if ( $value !~ /[123]/ ) { + $self->throw("'$value' is not a valid codon position"); + } + $self->{'codon_pos'} = $value; + } + return $self->{'codon_pos'}; +} + + +=head2 codon_table + + Title : codon_table + Usage : $obj->codon_table(); + Function: + + Sets and returns the codon table id of the RNA + If value is not set, returns 1, 'universal' code, as the default. + + Example : + Returns : integer + Args : none if get, the new value if set + +=cut + + +sub codon_table { + my ($self,$value) = @_; + if( defined $value) { + if ( not $value =~ /^\d$/ ) { + $self->throw("'$value' is not a valid codon table ID\n". + "Has to be a positive integer. Defaulting to 1\n"); + } else { + $self->{'codon_table'} = $value; + } + } + if( ! exists $self->{'codon_table'} ) { + return 1; + } else { + return $self->{'codon_table'}; + } +} + + +=head2 DNAMutation + + Title : DNAMutation + Usage : $mutobj = $obj->DNAMutation; + : $mutobj = $obj->DNAMutation($objref); + Function: Returns or sets the link-reference to a mutation/change object. + If there is no link, it will return undef + Returns : an obj_ref or undef + +=cut + + +sub DNAMutation { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::DNAMutation') ) { + $self->throw("Is not a Bio::Variation::DNAMutation object but a [$self]"); + return (undef); + } + else { + $self->{'DNAMutation'} = $value; + } + } + unless (exists $self->{'DNAMutation'}) { + return (undef); + } else { + return $self->{'DNAMutation'}; + } +} + + +=head2 AAChange + + Title : AAChange + Usage : $mutobj = $obj->AAChange; + : $mutobj = $obj->AAChange($objref); + Function: Returns or sets the link-reference to a mutation/change object. + If there is no link, it will return undef + Returns : an obj_ref or undef + +=cut + +sub AAChange { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::AAChange') ) { + $self->throw("Is not a Bio::Variation::AAChange object but a [$self]"); + return (undef); + } + else { + $self->{'AAChange'} = $value; + } + } + unless (exists $self->{'AAChange'}) { + return (undef); + } else { + return $self->{'AAChange'}; + } +} + + +=head2 exons_modified + + Title : exons_modified + Usage : $modified = $obj->exons_modified; + : $modified = $obj->exons_modified(1); + Function: Returns or sets information (example: a simple boolean flag) about + the modification of exons as a result of a mutation. + +=cut + +sub exons_modified { + my ($self,$value)=@_; + if (defined($value)) { + $self->{'exons_modified'}=$value; + } + return ($self->{'exons_modified'}); +} + +=head2 region + + Title : region + Usage : $obj->region(); + Function: + + Sets and returns the name of the sequence region type or + protein domain at this location. If value is not set, + returns false. + + Example : + Returns : string + Args : string + +=cut + + + +sub region { + my ($self,$value) = @_; + if( defined $value) { + $self->{'region'} = $value; + } + elsif (not defined $self->{'region'}) { + + $self->warn('Mutation start position is not defined') + if not defined $self->start and $self->verbose; + $self->warn('Mutation end position is not defined') + if not defined $self->end and $self->verbose; + $self->warn('Length of the CDS is not defined, the mutation can be beyond coding region!') + if not defined $self->cds_end and $self->verbose; + + $self->region('coding'); + if ($self->end && $self->end < 0 ){ + $self->region('5\'UTR'); + } + elsif ($self->start && $self->cds_end && $self->start > $self->cds_end ) { + $self->region('3\'UTR'); + } + } + return $self->{'region'}; +} + +=head2 cds_end + + Title : cds_end + Usage : $cds_end = $obj->get_cds_end(); + Function: + + Sets or returns the cds_end from the beginning of the DNA sequence + to the coordinate start used to describe variants. + Should be the location of the last nucleotide of the + terminator codon of the gene. + + Example : + Returns : value of cds_end, a scalar + Args : + +=cut + + + +sub cds_end { + my ($self, $value) = @_; + if (defined $value) { + $self->warn("[$value] is not a good value for sequence position") + if not $value =~ /^\d+$/ ; + $self->{'cds_end'} = $value; + } else { + $self->{'cds_end'} = $self->SeqDiff->cds_end if $self->SeqDiff; + } + return $self->{'cds_end'}; +} + + +=head2 label + + Title : label + Usage : $obj->label(); + Function: + + Sets and returns mutation event label(s). If value is not + set, or no argument is given returns false. Each + instantiable subclass of L needs + to implement this method. Valid values are listed in + 'Mutation event controlled vocabulary' in + http://www.ebi.ac.uk/mutations/recommendations/mutevent.html. + + Example : + Returns : string + Args : string + +=cut + +sub label { + my ($self) = @_; + my ($o, $m, $type); + $o = $self->allele_ori->seq if $self->allele_ori and $self->allele_ori->seq; + $m = $self->allele_mut->seq if $self->allele_mut and $self->allele_mut->seq; + + my $ct = Bio::Tools::CodonTable -> new ( -id => $self->codon_table ); + if ($o and $m and CORE::length($o) == 1 and CORE::length($m) == 1) { + if (defined $self->AAChange) { + if ($self->start > 0 and $self->start < 4 ) { + $type = 'initiation codon'; + } + elsif ($self->codon_ori && $ct->is_ter_codon($self->codon_ori) ) { + #AAChange->allele_ori and $self->AAChange->allele_ori->seq eq '*' ) { + $type = 'termination codon'; + } + elsif ($self->codon_mut && $ct->is_ter_codon($self->codon_mut) ) { + #elsif ($self->AAChange->allele_mut and $self->AAChange->allele_mut->seq eq "*") { + $type = 'nonsense'; + } + elsif ($o and $m and ($o eq $m or + $self->AAChange->allele_ori->seq eq + $self->AAChange->allele_mut->seq)) { + $type = 'silent'; + } else { + $type = 'missense'; + } + } else { + $type = 'unknown'; + } + } else { + my $len = 0; + $len = CORE::length($o) if $o; + $len -= CORE::length($m) if $m; + if ($len%3 == 0 ) { + $type = 'inframe'; + } else { + $type = 'frameshift'; + } + if (not $m ) { + $type .= ', '. 'deletion'; + } + elsif (not $o ) { + $type .= ', '. 'insertion'; + } + else { + $type .= ', '. 'complex'; + } + if ($self->codon_ori && $ct->is_ter_codon($self->codon_ori) ) { + $type .= ', '. 'termination codon'; + } + } + + $self->{'label'} = $type; + return $self->{'label'}; +} + + +=head2 _change_codon_pos + + Title : _change_codon_pos + Usage : $newCodonPos = _change_codon_pos($myCodonPos, 5) + Function: + + Keeps track of the codon position in a changeing sequence + + Returns : codon_pos = integer 1, 2 or 3 + Args : valid codon position + signed integer offset to a new location in sequence + +=cut + + +sub _change_codon_pos ($$) { + my ($cpos, $i) = @_; + + $cpos = ($cpos + $i%3)%3; + if ($cpos > 3 ) { + $cpos = $cpos - 3; + } + elsif ($cpos < 1 ) { + $cpos = $cpos + 3; + } + return $cpos; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/SNP.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/SNP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,222 @@ +# $Id: SNP.pm,v 1.9 2002/10/22 07:38:49 lapp Exp $ +# bioperl module for Bio::Variation::SNP +# +# Copyright Allen Day , Stan Nelson +# Human Genetics, UCLA Medical School, University of California, Los Angeles + +=head1 NAME + +Bio::Variation::SNP - submitted SNP + +=head1 SYNOPSIS + + $SNP = Bio::Variation::SNP->new (); + +=head1 DESCRIPTION + +Inherits from Bio::Variation::SeqDiff and Bio::Variation::Allele, with +additional methods that are (db)SNP specific (ie, refSNP/subSNP IDs, batch +IDs, validation methods). + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Allen Day Eallenday@ucla.eduE + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Variation::SNP; +my $VERSION=1.0; + +use strict; +use vars qw($VERSION @ISA $AUTOLOAD); +use Bio::Root::Root; +use Bio::Variation::SeqDiff; +use Bio::Variation::Allele; + +@ISA = qw( Bio::Variation::SeqDiff Bio::Variation::Allele); + +=head2 get/set-able methods + + Usage : $is = $snp->method() + Function: for getting/setting attributes + Returns : a value. probably a scalar. + Args : if you're trying to set an attribute, pass in the new value. + + Methods: + -------- + id + type + observed + seq_5 + seq_3 + ncbi_build + ncbi_chr_hits + ncbi_ctg_hits + ncbi_seq_loc + ucsc_build + ucsc_chr_hits + ucsc_ctg_hits + heterozygous + heterozygous_SE + validated + genotype + handle + batch_id + method + locus_id + symbol + mrna + protein + functional_class + + +=cut + + +my %OK_AUTOLOAD = ( + id => '', + type => '', + observed => [], + seq_5 => '', + seq_3 => '', + ncbi_build => '', + ncbi_chr_hits => '', + ncbi_ctg_hits => '', + ncbi_seq_loc => '', + ucsc_build => '', + ucsc_chr_hits => '', + ucsc_ctg_hits => '', + heterozygous => '', + heterozygous_SE => '', + validated => '', + genotype => '', + handle => '', + batch_id => '', + method => '', + locus_id => '', + symbol => '', + mrna => '', + protein => '', + functional_class => '', + ); + +sub AUTOLOAD { + my $self = shift; + my $param = $AUTOLOAD; + $param =~ s/.*:://; + $self->throw(__PACKAGE__." doesn't implement $param") unless defined $OK_AUTOLOAD{$param}; + + if( ref $OK_AUTOLOAD{$param} eq 'ARRAY' ) { + push @{$self->{$param}}, shift if @_; + return $self->{$param}->[scalar(@{$self->{$param}}) - 1]; + } else { + $self->{$param} = shift if @_; + return $self->{$param}; + } +} + + +#foreach my $slot (keys %RWSLOT){ +# no strict "refs"; #add class methods to package +# *$slot = sub { +# shift; +# $RWSLOT{$slot} = shift if @_; +# return $RWSLOT{$slot}; +# }; +#} + + +=head2 is_subsnp + + Title : is_subsnp + Usage : $is = $snp->is_subsnp() + Function: returns 1 if $snp is a subSNP + Returns : 1 or undef + Args : NONE + +=cut + +sub is_subsnp { + return shift->{is_subsnp}; +} + +=head2 subsnp + + Title : subsnp + Usage : $subsnp = $snp->subsnp() + Function: returns the currently active subSNP of $snp + Returns : Bio::Variation::SNP + Args : NONE + +=cut + +sub subsnp { + my $self = shift; + return $self->{subsnps}->[ scalar($self->each_subsnp) - 1 ]; +} + +=head2 add_subsnp + + Title : add_subsnp + Usage : $subsnp = $snp->add_subsnp() + Function: pushes the previous value returned by subsnp() onto a stack, accessible with each_subsnp(). + sets return value of subsnp() to a new Bio::Variation::SNP object, and returns that object. + Returns : Bio::Varitiation::SNP + Args : NONE + +=cut + +sub add_subsnp { + my $self = shift; + $self->throw("add_subsnp(): cannot add sunSNP to subSNP, only refSNP") if $self->is_subsnp; + + my $subsnp = Bio::Variation::SNP->new; + push @{$self->{subsnps}}, $subsnp; + $self->subsnp->{is_subsnp} = 1; + return $self->subsnp; +} + +=head2 each_subsnp + + Title : each_subsnp + Usage : @subsnps = $snp->each_subsnp() + Function: returns a list of the subSNPs of a refSNP + Returns : list + Args : NONE + +=cut + +sub each_subsnp { + my $self = shift; + $self->throw("each_subsnp(): cannot be called on a subSNP") if $self->is_subsnp; + return @{$self->{subsnps}}; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/SeqDiff.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/SeqDiff.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1146 @@ +# $Id: SeqDiff.pm,v 1.16 2002/10/22 07:38:49 lapp Exp $ +# bioperl module for Bio::Variation::SeqDiff +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +# cds_end definition? + +=head1 NAME + +Bio::Variation::SeqDiff - Container class for mutation/variant descriptions + +=head1 SYNOPSIS + + $seqDiff = Bio::Variation::SeqDiff->new ( + -id => $M20132, + -alphabet => 'rna', + -gene_symbol => 'AR' + -chromosome => 'X', + -numbering => 'coding' + ); + # get a DNAMutation object somehow + $seqDiff->add_Variant($dnamut); + print $seqDiff->sys_name(), "\n"; + +=head1 DESCRIPTION + +SeqDiff stores Bio::Variation::VariantI object references and +descriptive information common to all changes in a sequence. Mutations +are understood to be any kind of sequence markers and are expected to +occur in the same chromosome. See L for details. + +The methods of SeqDiff are geared towards describing mutations in +human genes using gene-based coordinate system where 'A' of the +initiator codon has number 1 and the one before it -1. This is +according to conventions of human genetics. + +There will be class Bio::Variation::Genotype to describe markers in +different chromosomes and diploid genototypes. + +Classes implementing Bio::Variation::VariantI interface are +Bio::Variation::DNAMutation, Bio::Variation::RNAChange, and +Bio::Variation::AAChange. See L, +L, L, and +L for more information. + +Variant objects can be added using two ways: an array passed to the +constructor or as individual Variant objects with add_Variant +method. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 CONTRIBUTORS + +Eckhard Lehmann, ecky@e-lehmann.de + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Variation::SeqDiff; +my $VERSION=1.0; + +use strict; +use vars qw($VERSION @ISA); +use Bio::Root::Root; +use Bio::Tools::CodonTable; +use Bio::PrimarySeq; + +@ISA = qw( Bio::Root::Root ); + + +=head2 new + + Title : new + Usage : $seqDiff = Bio::Variation::SeqDiff->new; + Function: generates a new Bio::Variation::SeqDiff + Returns : reference to a new object of class SeqDiff + Args : + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my($id, $sysname, $trivname, $chr, $gene_symbol, + $desc, $alphabet, $numbering, $offset, $rna_offset, $rna_id, $cds_end, + $dna_ori, $dna_mut, $rna_ori, $rna_mut, $aa_ori, $aa_mut + #@variants, @genes + ) = + $self->_rearrange([qw(ID + SYSNAME + TRIVNAME + CHR + GENE_SYMBOL + DESC + ALPHABET + NUMBERING + OFFSET + RNA_OFFSET + RNA_ID + CDS_END + DNA_ORI + DNA_MUT + RNA_ORI + AA_ORI + AA_MUT + )], + @args); + + #my $make = $self->SUPER::_initialize(@args); + + $id && $self->id($id); + $sysname && $self->sysname($sysname); + $trivname && $self->trivname($trivname); + $chr && $self->chromosome($chr); + $gene_symbol && $self->gene_symbol($chr); + $desc && $self->description($desc); + $alphabet && $self->alphabet($alphabet); + $numbering && $self->numbering($numbering); + $offset && $self->offset($offset); + $rna_offset && $self->rna_offset($rna_offset); + $rna_id && $self->rna_id($rna_id); + $cds_end && $self->cds_end($cds_end); + + $dna_ori && $self->dna_ori($dna_ori); + $dna_mut && $self->dna_mut($dna_mut); + $rna_ori && $self->rna_ori($rna_ori); + $rna_mut && $self->rna_mut($rna_mut); + $aa_ori && $self->aa_ori ($aa_ori); + $aa_mut && $self->aa_mut ($aa_mut); + + $self->{ 'variants' } = []; + #@variants && push(@{$self->{'variants'}},@variants); + + $self->{ 'genes' } = []; + #@genes && push(@{$self->{'genes'}},@genes); + + return $self; # success - we hope! +} + + +=head2 id + + Title : id + Usage : $obj->id(H0001); $id = $obj->id(); + Function: + + Sets or returns the id of the seqDiff. + Should be used to give the collection of variants a UID + without semantic associations. + + Example : + Returns : value of id, a scalar + Args : newvalue (optional) + +=cut + + +sub id { + my ($self,$value) = @_; + if (defined $value) { + $self->{'id'} = $value; + } +# unless (exists $self->{'id'}) { +# return "undefined"; +# } + else { + return $self->{'id'}; + } +} + + +=head2 sysname + + Title : sysname + Usage : $obj->sysname('5C>G'); $sysname = $obj->sysname(); + Function: + + Sets or returns the systematic name of the seqDiff. The + name should follow the HUGO Mutation Database Initiative + approved nomenclature. If called without first setting the + value, will generate it from L + objects attached. + + Example : + Returns : value of sysname, a scalar + Args : newvalue (optional) + +=cut + + +sub sysname { + my ($self,$value) = @_; + if (defined $value) { + $self->{'sysname'} = $value; + } + elsif (not defined $self->{'sysname'}) { + + my $sysname = ''; + my $c = 0; + foreach my $mut ($self->each_Variant) { + if( $mut->isa('Bio::Variation::DNAMutation') ) { + $c++; + if ($c == 1 ) { + $sysname = $mut->sysname ; + } + else { + $sysname .= ";". $mut->sysname; + } + } + } + $sysname = "[". $sysname. "]" if $c > 1; + $self->{'sysname'} = $sysname; + } + return $self->{'sysname'}; +} + + +=head2 trivname + + Title : trivname + Usage : $obj->trivname('[A2G;T56G]'); $trivname = $obj->trivname(); + Function: + + Sets or returns the trivial name of the seqDiff. + The name should follow the HUGO Mutation Database Initiative + approved nomenclature. If called without first setting the + value, will generate it from L + objects attached. + + Example : + Returns : value of trivname, a scalar + Args : newvalue (optional) + +=cut + + +sub trivname { + my ($self,$value) = @_; + if (defined $value) { + $self->{'trivname'} = $value; + } + elsif (not defined $self->{'trivname'}) { + + my $trivname = ''; + my $c = 0; + foreach my $mut ($self->each_Variant) { + if( $mut->isa('Bio::Variation::AAChange') ) { + $c++; + if ($c == 1 ) { + $trivname = $mut->trivname ; + } + else { + $trivname .= ";". $mut->trivname; + } + } + } + $trivname = "[". $trivname. "]" if $c > 1; + $self->{'trivname'} = $trivname; + } + + else { + return $self->{'trivname'}; + } +} + + +=head2 chromosome + + Title : chromosome + Usage : $obj->chromosome('X'); $chromosome = $obj->chromosome(); + Function: + + Sets or returns the chromosome ("linkage group") of the seqDiff. + + Example : + Returns : value of chromosome, a scalar + Args : newvalue (optional) + +=cut + + +sub chromosome { + my ($self,$value) = @_; + if (defined $value) { + $self->{'chromosome'} = $value; + } + else { + return $self->{'chromosome'}; + } +} + + +=head2 gene_symbol + + Title : gene_symbol + Usage : $obj->gene_symbol('FOS'); $gene_symbol = $obj->gene_symbol; + Function: + + Sets or returns the gene symbol for the studied CDS. + + Example : + Returns : value of gene_symbol, a scalar + Args : newvalue (optional) + +=cut + + +sub gene_symbol { + my ($self,$value) = @_; + if (defined $value) { + $self->{'gene_symbol'} = $value; + } + else { + return $self->{'gene_symbol'}; + } +} + + + +=head2 description + + Title : description + Usage : $obj->description('short description'); $descr = $obj->description(); + Function: + + Sets or returns the short description of the seqDiff. + + Example : + Returns : value of description, a scalar + Args : newvalue (optional) + +=cut + + +sub description { + my ($self,$value) = @_; + if (defined $value) { + $self->{'description'} = $value; + } + else { + return $self->{'description'}; + } +} + + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of primary reference sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + Returns : a string either 'dna','rna','protein'. + Args : none + + +=cut + +sub alphabet { + my ($self,$value) = @_; + my %type = (dna => 1, + rna => 1, + protein => 1); + if( defined $value ) { + if ($type{$value}) { + $self->{'alphabet'} = $value; + } else { + $self->throw("$value is not valid alphabet value!"); + } + } + return $self->{'alphabet'}; +} + + +=head2 numbering + + Title : numbering + Usage : $obj->numbering('coding'); $numbering = $obj->numbering(); + Function: + + Sets or returns the string giving the numbering schema used + to describe the variants. + + Example : + Returns : value of numbering, a scalar + Args : newvalue (optional) + +=cut + + + +sub numbering { + my ($self,$value) = @_; + if (defined $value) { + $self->{'numbering'} = $value; + } + else { + return $self->{'numbering'}; + } +} + + +=head2 offset + + Title : offset + Usage : $obj->offset(124); $offset = $obj->offset(); + Function: + + Sets or returns the offset from the beginning of the DNA sequence + to the coordinate start used to describe variants. Typically + the beginning of the coding region of the gene. + The cds_start should be 1 + offset. + + Example : + Returns : value of offset, a scalar + Args : newvalue (optional) + +=cut + + + +sub offset { + my ($self,$value) = @_; + if (defined $value) { + $self->{'offset'} = $value; + } + elsif (not defined $self->{'offset'} ) { + return $self->{'offset'} = 0; + } + else { + return $self->{'offset'}; + } +} + + +=head2 cds_start + + Title : cds_start + Usage : $obj->cds_start(123); $cds_start = $obj->cds_start(); + Function: + + Sets or returns the cds_start from the beginning of the DNA + sequence to the coordinate start used to describe + variants. Typically the beginning of the coding region of + the gene. Needs to be and is implemented as 1 + offset. + + Example : + Returns : value of cds_start, a scalar + Args : newvalue (optional) + +=cut + + + +sub cds_start { + my ($self,$value) = @_; + if (defined $value) { + $self->{'offset'} = $value - 1; + } + else { + return $self->{'offset'} + 1; + } +} + + +=head2 cds_end + + Title : cds_end + Usage : $obj->cds_end(321); $cds_end = $obj->cds_end(); + Function: + + Sets or returns the position of the last nucleotitide of the + termination codon. The coordinate system starts from cds_start. + + Example : + Returns : value of cds_end, a scalar + Args : newvalue (optional) + +=cut + + + +sub cds_end { + my ($self,$value) = @_; + if (defined $value) { + $self->{'cds_end'} = $value; + } + else { + return $self->{'cds_end'}; + #$self->{'cds_end'} = CORE::length($self->SeqDiff->rna_ori)/3; + } +} + + +=head2 rna_offset + + Title : rna_offset + Usage : $obj->rna_offset(124); $rna_offset = $obj->rna_offset(); + Function: + + Sets or returns the rna_offset from the beginning of the RNA sequence + to the coordinate start used to describe variants. Typically + the beginning of the coding region of the gene. + + Example : + Returns : value of rna_offset, a scalar + Args : newvalue (optional) + +=cut + + + +sub rna_offset { + my ($self,$value) = @_; + if (defined $value) { + $self->{'rna_offset'} = $value; + } + elsif (not defined $self->{'rna_offset'} ) { + return $self->{'rna_offset'} = 0; + } + else { + return $self->{'rna_offset'}; + } +} + + +=head2 rna_id + + Title : rna_id + Usage : $obj->rna_id('transcript#3'); $rna_id = $obj->rna_id(); + Function: + + Sets or returns the ID for original RNA sequence of the seqDiff. + + Example : + Returns : value of rna_id, a scalar + Args : newvalue (optional) + +=cut + + +sub rna_id { + my ($self,$value) = @_; + if (defined $value) { + $self->{'rna_id'} = $value; + } + else { + return $self->{'rna_id'}; + } +} + + + +=head2 add_Variant + + Title : add_Variant + Usage : $obj->add_Variant($variant) + Function: + + Pushes one Bio::Variation::Variant into the list of variants. + At the same time, creates a link from the Variant to SeqDiff + using its SeqDiff method. + + Example : + Returns : 1 when succeeds, 0 for failure. + Args : Variant object + +=cut + +sub add_Variant { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::VariantI') ) { + $self->throw("Is not a VariantI complying object but a [$self]"); + return 0; + } + else { + push(@{$self->{'variants'}},$value); + $value->SeqDiff($self); + return 1; + } + } + else { + return 0; + } +} + + +=head2 each_Variant + + Title : each_Variant + Usage : $obj->each_Variant(); + Function: + + Returns a list of Variants. + + Example : + Returns : list of Variants + Args : none + +=cut + +sub each_Variant{ + my ($self,@args) = @_; + + return @{$self->{'variants'}}; +} + + + +=head2 add_Gene + + Title : add_Gene + Usage : $obj->add_Gene($gene) + Function: + + Pushes one L into the list of genes. + + Example : + Returns : 1 when succeeds, 0 for failure. + Args : Bio::LiveSeq::Gene object + +See L for more information. + +=cut + + +sub add_Gene { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::LiveSeq::Gene') ) { + $value->throw("Is not a Bio::LiveSeq::Gene object but a [$value]"); + return 0; + } + else { + push(@{$self->{'genes'}},$value); + return 1; + } + } + else { + return 0; + } +} + + +=head2 each_Gene + + Title : each_Gene + Usage : $obj->each_Gene(); + Function: + + Returns a list of Ls. + + Example : + Returns : list of Genes + Args : none + +=cut + +sub each_Gene{ + my ($self,@args) = @_; + + return @{$self->{'genes'}}; +} + + +=head2 dna_ori + + Title : dna_ori + Usage : $obj->dna_ori('atgctgctgctgct'); $dna_ori = $obj->dna_ori(); + Function: + + Sets or returns the original DNA sequence string of the seqDiff. + + Example : + Returns : value of dna_ori, a scalar + Args : newvalue (optional) + +=cut + + +sub dna_ori { + my ($self,$value) = @_; + if (defined $value) { + $self->{'dna_ori'} = $value; + } + else { + return $self->{'dna_ori'}; + } +} + + +=head2 dna_mut + + Title : dna_mut + Usage : $obj->dna_mut('atgctggtgctgct'); $dna_mut = $obj->dna_mut(); + Function: + + Sets or returns the mutated DNA sequence of the seqDiff. + If sequence has not been set generates it from the + original sequence and DNA mutations. + + Example : + Returns : value of dna_mut, a scalar + Args : newvalue (optional) + +=cut + + +sub dna_mut { + my ($self,$value) = @_; + if (defined $value) { + $self->{'dna_mut'} = $value; + } + else { + $self->_set_dnamut() unless $self->{'dna_mut'}; + return $self->{'dna_mut'}; + } +} + +sub _set_dnamut { + my $self = shift; + + return undef unless $self->{'dna_ori'} && $self->each_Variant; + + $self->{'dna_mut'} = $self->{'dna_ori'}; + foreach ($self->each_Variant) { + next unless $_->isa('Bio::Variation::DNAMutation'); + next unless $_->isMutation; + + my ($s, $la, $le); + #lies the mutation less than 25 bases after the start of sequence? + if ($_->start < 25) { + $s = 0; $la = $_->start - 1; + } else { + $s = $_->start - 25; $la = 25; + } + + #is the mutation an insertion? + $_->end($_->start) unless $_->allele_ori->seq; + + #does the mutation end greater than 25 bases before the end of + #sequence? + if (($_->end + 25) > length($self->{'dna_mut'})) { + $le = length($self->{'dna_mut'}) - $_->end; + } else { + $le = 25; + } + + $_->dnStreamSeq(substr($self->{'dna_mut'}, $s, $la)); + $_->upStreamSeq(substr($self->{'dna_mut'}, $_->end, $le)); + + my $s_ori = $_->dnStreamSeq . $_->allele_ori->seq . $_->upStreamSeq; + my $s_mut = $_->dnStreamSeq . $_->allele_mut->seq . $_->upStreamSeq; + + (my $str = $self->{'dna_mut'}) =~ s/$s_ori/$s_mut/; + $self->{'dna_mut'} = $str; + } +} + + +=head2 rna_ori + + Title : rna_ori + Usage : $obj->rna_ori('atgctgctgctgct'); $rna_ori = $obj->rna_ori(); + Function: + + Sets or returns the original RNA sequence of the seqDiff. + + Example : + Returns : value of rna_ori, a scalar + Args : newvalue (optional) + +=cut + + +sub rna_ori { + my ($self,$value) = @_; + if (defined $value) { + $self->{'rna_ori'} = $value; + } + else { + return $self->{'rna_ori'}; + } +} + + +=head2 rna_mut + + Title : rna_mut + Usage : $obj->rna_mut('atgctggtgctgct'); $rna_mut = $obj->rna_mut(); + Function: + + Sets or returns the mutated RNA sequence of the seqDiff. + + Example : + Returns : value of rna_mut, a scalar + Args : newvalue (optional) + +=cut + + +sub rna_mut { + my ($self,$value) = @_; + if (defined $value) { + $self->{'rna_mut'} = $value; + } + else { + return $self->{'rna_mut'}; + } +} + + +=head2 aa_ori + + Title : aa_ori + Usage : $obj->aa_ori('MAGVLL*'); $aa_ori = $obj->aa_ori(); + Function: + + Sets or returns the original protein sequence of the seqDiff. + + Example : + Returns : value of aa_ori, a scalar + Args : newvalue (optional) + +=cut + + +sub aa_ori { + my ($self,$value) = @_; + if (defined $value) { + $self->{'aa_ori'} = $value; + } + else { + return $self->{'aa_ori'}; + } +} + + +=head2 aa_mut + + Title : aa_mut + Usage : $obj->aa_mut('MA*'); $aa_mut = $obj->aa_mut(); + Function: + + Sets or returns the mutated protein sequence of the seqDiff. + + Example : + Returns : value of aa_mut, a scalar + Args : newvalue (optional) + +=cut + + +sub aa_mut { + my ($self,$value) = @_; + if (defined $value) { + $self->{'aa_mut'} = $value; + } + else { + return $self->{'aa_mut'}; + } +} + + +=head2 seqobj + + Title : seqobj + Usage : $dnaobj = $obj->seqobj('dna_mut'); + Function: + + Returns the any original or mutated sequences as a + Bio::PrimarySeq object. + + Example : + Returns : Bio::PrimarySeq object for the requested sequence + Args : string, method name for the sequence requested + +See L for more information. + +=cut + +sub seqobj { + my ($self,$value) = @_; + my $out; + my %valid_obj = + map {$_, 1} qw(dna_ori rna_ori aa_ori dna_mut rna_mut aa_mut); + $valid_obj{$value} || + $self->throw("Sequence type '$value' is not a valid type (". + join(',', map "'$_'", sort keys %valid_obj) .") lowercase"); + my ($alphabet) = $value =~ /([^_]+)/; + my $id = $self->id; + $id = $self->rna_id if $self->rna_id; + $alphabet = 'protein' if $alphabet eq 'aa'; + $out = Bio::PrimarySeq->new + ( '-seq' => $self->{$value}, + '-display_id' => $id, + '-accession_number' => $self->id, + '-alphabet' => $alphabet + ) if $self->{$value} ; + return $out; +} + +=head2 alignment + + Title : alignment + Usage : $obj->alignment + Function: + + Returns a pretty RNA/AA sequence alignment from linked + objects. Under construction: Only simple coding region + point mutations work. + + Example : + Returns : + Args : none + +=cut + + +sub alignment { + my $self = shift; + my (@entry, $text); + + my $maxflanklen = 12; + + foreach my $mut ($self->each_Variant) { + if( $mut->isa('Bio::Variation::RNAChange') ) { + + my $upflank = $mut->upStreamSeq; + my $dnflank = $mut->dnStreamSeq; + my $cposd = $mut->codon_pos; + my $rori = $mut->allele_ori->seq; + my $rmut = $mut->allele_mut->seq; + my $rseqoriu = ''; + my $rseqmutu = ''; + my $rseqorid = ''; + my $rseqmutd = ''; + my $aaseqmutu = ''; + my (@rseqori, @rseqmut ); + + # point + if ($mut->DNAMutation->label =~ /point/) { + if ($cposd == 1 ) { + my $nt2d = substr($dnflank, 0, 2); + push @rseqori, $rori. $nt2d; + push @rseqmut, uc ($rmut). $nt2d; + $dnflank = substr($dnflank, 2); + } + elsif ($cposd == 2) { + my $ntu = chop $upflank; + my $ntd = substr($dnflank, 0, 1); + push @rseqori, $ntu. $rori. $ntd; + push @rseqmut, $ntu. uc ($rmut). $ntd; + $dnflank = substr($dnflank, 1); + } + elsif ($cposd == 3) { + my $ntu1 = chop $upflank; + my $ntu2 = chop $upflank; + push (@rseqori, $ntu2. $ntu1. $rori); + push (@rseqmut, $ntu2. $ntu1. uc $rmut); + } + } + #deletion + elsif ($mut->DNAMutation->label =~ /deletion/) { + if ($cposd == 2 ) { + $rseqorid = chop $upflank; + $rseqmutd = $rseqorid; + } + for (my $i=1; $i<=$mut->length; $i++) { + my $ntd .= substr($mut->allele_ori, $i-1, 1); + $rseqorid .= $ntd; + if (length($rseqorid) == 3 ) { + push (@rseqori, $rseqorid); + push (@rseqmut, " "); + $rseqorid = ''; + } + } + + if ($rseqorid) { + $rseqorid .= substr($dnflank, 0, 3-$rseqorid); + push (@rseqori, $rseqorid); + push (@rseqmut, " "); + $dnflank = substr($dnflank,3-$rseqorid); + } + } + $upflank = reverse $upflank; + # loop throught the flanks + for (my $i=1; $i<=length($dnflank); $i++) { + + last if $i > $maxflanklen; + + my $ntd .= substr($dnflank, $i-1, 1); + my $ntu .= substr($upflank, $i-1, 1); + + $rseqmutd .= $ntd; + $rseqorid .= $ntd; + $rseqmutu = $ntu. $rseqmutu; + $rseqoriu = $ntu. $rseqoriu; + + if (length($rseqorid) == 3 and length($rseqorid) == 3) { + push (@rseqori, $rseqorid); + push (@rseqmut, $rseqmutd); + $rseqorid = $rseqmutd =''; + } + if (length($rseqoriu) == 3 and length($rseqoriu) == 3) { + unshift (@rseqori, $rseqoriu); + unshift (@rseqmut, $rseqmutu); + $rseqoriu = $rseqmutu =''; + } + + #print "|i=$i, $cposd, $rseqmutd, $rseqorid\n"; + #print "|i=$i, $cposu, $rseqmutu, $rseqoriu\n\n"; + + } + + push (@rseqori, $rseqorid); + unshift (@rseqori, $rseqoriu); + push (@rseqmut, $rseqmutd); + unshift (@rseqmut, $rseqmutu); + + return unless $mut->AAChange; + #translate + my $tr = new Bio::Tools::CodonTable ('-id' => $mut->codon_table); + my $apos = $mut->AAChange->start; + my $aposmax = CORE::length($self->aa_ori); #terminator codon no + my $rseqori; + my $rseqmut; + my $aaseqori; + my $aaseqmut = ""; + for (my $i = 0; $i <= $#rseqori; $i++) { + my $a = ''; + + $a = $tr->translate($rseqori[$i]) if length($rseqori[$i]) == 3; + + if (length($a) != 1 or + $apos - ( $maxflanklen/2 -1) + $i < 1 or + $apos - ( $maxflanklen/2 -1) + $i > $aposmax ) { + $aaseqori .= " "; + } else { + $aaseqori .= " ". $a. " "; + } + my $b = ''; + if (length($rseqmut[$i]) == 3) { + if ($rseqmut[$i] eq ' ') { + $b = "_"; + } else { + $b = $tr->translate($rseqmut[$i]); + } + } + if (( $b ne $a and + length($b) == 1 and + $apos - ( $maxflanklen/2 -1) + $i >= 1 ) or + ( $apos - ( $maxflanklen/2 -1) + $i >= $aposmax and + $mut->label =~ 'termination') + ) { + $aaseqmut .= " ". $b. " "; + } else { + $aaseqmut .= " "; + } + + if ($i == 0 and length($rseqori[$i]) != 3) { + my $l = 3 - length($rseqori[$i]); + $rseqori[$i] = (" " x $l). $rseqori[$i]; + $rseqmut[$i] = (" " x $l). $rseqmut[$i]; + } + $rseqori .= $rseqori[$i]. " " if $rseqori[$i] ne ''; + $rseqmut .= $rseqmut[$i]. " " if $rseqmut[$i] ne ''; + } + + # collect the results + push (@entry, + "\n" + ); + $text = " ". $aaseqmut; + push (@entry, + $text + ); + $text = "Variant : ". $rseqmut; + push (@entry, + $text + ); + $text = "Reference: ". $rseqori; + push (@entry, + $text + ); + $text = " ". $aaseqori; + push (@entry, + $text + ); + push (@entry, + "\n" + ); + } + + } + + my $res; + foreach my $line (@entry) { + $res .= "$line\n"; + } + return $res; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/Bio/Variation/VariantI.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Variation/VariantI.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1053 @@ +# $Id: VariantI.pm,v 1.12 2002/10/22 07:38:49 lapp Exp $ +# +# BioPerl module for Bio::Variation::VariantI +# +# Cared for by Heikki Lehvaslaiho +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Variation::VariantI - Sequence Change SeqFeature abstract class + +=head1 SYNOPSIS + + #get Bio::Variant::VariantI somehow + print $var->restriction_changes, "\n"; + foreach $allele ($var->each_Allele) { + #work on Bio::Variation::Allele objects + } + +=head1 DESCRIPTION + +This superclass defines common methods to basic sequence changes. The +instantiable classes Bio::Variation::DNAMutation, +Bio::Variation::RNAChange and Bio::Variation::AAChange use them. +See L, L, +and L for more information. + +These classes store information, heavy computation to detemine allele +sequences is done elsewhere. + +The database cross-references are implemented as +Bio::Annotation::DBLink objects. The methods to access them are +defined in Bio::DBLinkContainerI. See L +and L for details. + +Bio::Variation::VariantI redifines and extends +Bio::SeqFeature::Generic for sequence variations. This class +describes specific sequence change events. These events are always +from a specific reference sequence to something different. See +L for more information. + +IMPORTANT: The notion of reference sequence permeates all +Bio::Variation classes. This is especially important to remember when +dealing with Alleles. In a polymorphic site, there can be a large +number of alleles. One of then has to be selected to be the reference +allele (allele_ori). ALL the rest has to be passed to the Variant +using the method add_Allele, including the mutated allele in a +canonical mutation. The IO modules and generated attributes depend on +it. They ignore the allele linked to using allele_mut and circulate +each Allele returned by each_Allele into allele_mut and calculate +the changes between that and allele_ori. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Variation::VariantI; +$VERSION=1.0; +use vars qw(@ISA); +use strict; +use Bio::Root::Root; +use Bio::DBLinkContainerI; +# Object preamble - inheritance + +use Bio::SeqFeature::Generic; +@ISA = qw(Bio::Root::Root Bio::SeqFeature::Generic Bio::DBLinkContainerI ); + +=head2 id + + Title : id + Usage : $obj->id + Function: + + Read only method. Returns the id of the variation object. + The id is the id of the first DBLink object attached to this object. + + Example : + Returns : scalar + Args : none + +=cut + +sub id { + my ($self) = @_; + my @ids = $self->each_DBLink; + my $id = $ids[0] if scalar @ids > 0; + return $id->database. "::". $id->primary_id if $id; +} + + +=head2 add_Allele + + Title : add_Allele + Usage : $self->add_Allele($allele) + Function: + + Adds one Bio::Variation::Allele into the list of alleles. + Note that the method forces the convention that nucleotide + sequence is in lower case and amino acds are in upper + case. + + Example : + Returns : 1 when succeeds, 0 for failure. + Args : Allele object + +=cut + + +sub add_Allele { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::Allele') ) { + my $com = ref $value; + $self->throw("Is not a Allele object but a [$com]"); + return 0; + } else { + if ( $self->isa('Bio::Variation::AAChange') ) { + $value->seq( uc $value->seq) if $value->seq; + } else { + $value->seq( lc $value->seq) if $value->seq; + } + push(@{$self->{'alleles'}},$value); + $self->allele_mut($value); #???? + return 1; + } + } else { + return 0; + } +} + + +=head2 each_Allele + + Title : alleles + Usage : $obj->each_Allele(); + Function: + + Returns a list of Bio::Variation::Allele objects + + Example : + Returns : list of Alleles + Args : none + +=cut + +sub each_Allele{ + my ($self,@args) = @_; + return @{$self->{'alleles'}}; +} + + + +=head2 isMutation + + Title : isMutation + Usage : print join('/', $obj->each_Allele) if not $obj->isMutation; + Function: + + Returns or sets the boolean value indicating that the + variant descibed is a canonical mutation with two alleles + assinged to be the original (wild type) allele and mutated + allele, respectively. If this value is not set, it is + assumed that the Variant descibes polymorphisms. + + Returns : a boolean + +=cut + +sub isMutation { + my ($self,$value) = @_; + if (defined $value) { + if ($value ) { + $self->{'isMutation'} = 1; + } else { + $self->{'isMutation'} = 0; + } + } + return $self->{'isMutation'}; +} + + +=head2 allele_ori + + Title : allele_ori + Usage : $obj->allele_ori(); + Function: + + Links to and returns the Bio::Variation::Allele object. + If value is not set, returns false. All other Alleles are + compared to this. + + Amino acid sequences are stored in upper case characters, + others in lower case. + + Example : + Returns : string + Args : string + +See L for more. + +=cut + +sub allele_ori { + my ($self,$value) = @_; + if( defined $value) { + if ( ! ref $value || ! $value->isa('Bio::Variation::Allele')) { + $self->throw("Value is not Bio::Variation::Allele but [$value]"); + } else { + if ( $self->isa('Bio::Variation::AAChange') ) { + $value->seq( uc $value->seq) if $value->seq; + } else { + $value->seq( lc $value->seq) if $value->seq; + } + $self->{'allele_ori'} = $value; + } + } + return $self->{'allele_ori'}; +} + + +=head2 allele_mut + + Title : allele_mut + Usage : $obj->allele_mut(); + Function: + + Links to and returns the Bio::Variation::Allele + object. Sets and returns the mutated allele sequence. + If value is not set, returns false. + + Amino acid sequences are stored in upper case characters, + others in lower case. + + Example : + Returns : string + Args : string + +See L for more. + +=cut + + +sub allele_mut { + my ($self,$value) = @_; + if( defined $value) { + if ( ! ref $value || ! $value->isa('Bio::Variation::Allele')) { + $self->throw("Value is not Bio::Variation::Allele but [$value]"); + } else { + if ( $self->isa('Bio::Variation::AAChange') ) { + $value->seq( uc $value->seq) if $value->seq; + } else { + $value->seq( lc $value->seq) if $value->seq; + } + $self->{'allele_mut'} = $value; + } + } + return $self->{'allele_mut'}; +} + +=head2 length + + Title : length + Usage : $obj->length(); + Function: + + Sets and returns the length of the affected original + allele sequence. If value is not set, returns false == 0. + + Value 0 means that the variant position is before the + start=end sequence position. (Value 1 would denote a point + mutation). This follows the convension to report an + insertion (2insT) in equivalent way to a corresponding + deletion (2delT) (Think about indel polymorpism ATC <=> AC + where the origianal state is not known ). + + Example : + Returns : string + Args : string + +=cut + + +sub length { + my ($self,$value) = @_; + if ( defined $value) { + $self->{'length'} = $value; + } + if ( ! exists $self->{'length'} ) { + return 0; + } + return $self->{'length'}; +} + +=head2 upStreamSeq + + Title : upStreamSeq + Usage : $obj->upStreamSeq(); + Function: + + Sets and returns upstream flanking sequence string. If + value is not set, returns false. The sequence should be + >=25 characters long, if possible. + + Example : + Returns : string or false + Args : string + +=cut + + +sub upStreamSeq { + my ($self,$value) = @_; + if( defined $value) { + $self->{'upstreamseq'} = $value; + } + return $self->{'upstreamseq'}; +} + + +=head2 dnStreamSeq + + Title : dnStreamSeq + Usage : $obj->dnStreamSeq(); + Function: + + Sets and returns dnstream flanking sequence string. If + value is not set, returns false. The sequence should be + >=25 characters long, if possible. + + Example : + Returns : string or false + Args : string + +=cut + + +sub dnStreamSeq { + my ($self,$value) = @_; + if( defined $value) { + $self->{'dnstreamseq'} = $value; + } + return $self->{'dnstreamseq'}; + +} + + +=head2 label + + Title : label + Usage : $obj->label(); + Function: + + Sets and returns mutation event label(s). If value is not + set, or no argument is given returns false. Each + instantiable class needs to implement this method. Valid + values are listed in 'Mutation event controlled vocabulary' in + http://www.ebi.ac.uk/mutations/recommendations/mutevent.html. + + Example : + Returns : string + Args : string + +=cut + + +sub label { + my ($self,$value) = @_; + $self->throw("[$self] has not implemeted method 'label'"); +} + + + +=head2 status + + Title : status + Usage : $obj->status() + Function: + + Returns the status of the sequence change object. + Valid values are: 'suspected' and 'proven' + + Example : $obj->status('proven'); + Returns : scalar + Args : valid string (optional, for setting) + + +=cut + + +sub status { + my ($self,$value) = @_; + my %status = (suspected => 1, + proven => 1 + ); + + if( defined $value) { + $value = lc $value; + if ($status{$value}) { + $self->{'status'} = $value; + } + else { + $self->throw("$value is not valid status value!"); + } + } + if( ! exists $self->{'status'} ) { + return "$self"; + } + return $self->{'status'}; +} + + +=head2 proof + + Title : proof + Usage : $obj->proof() + Function: + + Returns the proof of the sequence change object. + Valid values are: 'computed' and 'experimental'. + + Example : $obj->proof('computed'); + Returns : scalar + Args : valid string (optional, for setting) + + +=cut + + +sub proof { + my ($self,$value) = @_; + my %proof = (computed => 1, + experimental => 1 + ); + + if( defined $value) { + $value = lc $value; + if ($proof{$value}) { + $self->{'proof'} = $value; + } else { + $self->throw("$value is not valid proof value!"); + } + } + return $self->{'proof'}; +} + + +=head2 region + + Title : region + Usage : $obj->region(); + Function: + + Sets and returns the name of the sequence region type or + protein domain at this location. If value is not set, + returns false. + + Example : + Returns : string + Args : string + +=cut + + +sub region { + my ($self,$value) = @_; + if( defined $value) { + $self->{'region'} = $value; + } + return $self->{'region'}; +} + + +=head2 region_value + + Title : region_value + Usage : $obj->region_value(); + Function: + + Sets and returns the name of the sequence region_value or + protein domain at this location. If value is not set, + returns false. + + Example : + Returns : string + Args : string + +=cut + + +sub region_value { + my ($self,$value) = @_; + if( defined $value) { + $self->{'region_value'} = $value; + } + return $self->{'region_value'}; +} + +=head2 region_dist + + Title : region_dist + Usage : $obj->region_dist(); + Function: + + Sets and returns the distance tot the closest region + (i.e. intro/exon or domain) boundary. If distance is not + set, returns false. + + Example : + Returns : integer + Args : integer + +=cut + + +sub region_dist { + my ($self,$value) = @_; + if( defined $value) { + if ( not $value =~ /^[+-]?\d+$/ ) { + $self->throw("[$value] for region_dist has to be an integer\n"); + } else { + $self->{'region_dist'} = $value; + } + } + return $self->{'region_dist'}; +} + + +=head2 numbering + + Title : numbering + Usage : $obj->numbering() + Function: + + Returns the numbering chema used locating sequnce features. + Valid values are: 'entry' and 'coding' + + Example : $obj->numbering('coding'); + Returns : scalar + Args : valid string (optional, for setting) + + +=cut + + +sub numbering { + my ($self,$value) = @_; + my %numbering = (entry => 1, + coding => 1 + ); + + if( defined $value) { + $value = lc $value; + if ($numbering{$value}) { + $self->{'numbering'} = $value; + } + else { + $self->throw("'$value' is not a valid for numbering!"); + } + } + if( ! exists $self->{'numbering'} ) { + return "$self"; + } + return $self->{'numbering'}; +} + +=head2 mut_number + + Title : mut_number + Usage : $num = $obj->mut_number; + : $num = $obj->mut_number($number); + Function: + + Returns or sets the number identifying the order in which the + mutation has been issued. Numbers shouldstart from 1. + If the number has never been set, the method will return '' + + If you want the output from IO modules look nice and, for + multivariant/allele variations, make sense you better set + this attribute. + + Returns : an integer + +=cut + + +sub mut_number { + my ($self,$value) = @_; + if (defined $value) { + $self->{'mut_number'} = $value; + } + unless (exists $self->{'mut_number'}) { + return (''); + } else { + return $self->{'mut_number'}; + } +} + + +=head2 SeqDiff + + Title : SeqDiff + Usage : $mutobj = $obj->SeqDiff; + : $mutobj = $obj->SeqDiff($objref); + Function: + + Returns or sets the link-reference to the umbrella + Bio::Variation::SeqDiff object. If there is no link, + it will return undef + + Note: Adding a variant into a SeqDiff object will + automatically set this value. + + Returns : an obj_ref or undef + +See L for more information. + +=cut + +sub SeqDiff { + my ($self,$value) = @_; + if (defined $value) { + if( ! $value->isa('Bio::Variation::SeqDiff') ) { + $self->throw("Is not a Bio::Variation::SeqDiff object but a [$value]"); + return (undef); + } + else { + $self->{'seqDiff'} = $value; + } + } + unless (exists $self->{'seqDiff'}) { + return (undef); + } else { + return $self->{'seqDiff'}; + } +} + +=head2 add_DBLink + + Title : add_DBLink + Usage : $self->add_DBLink($ref) + Function: adds a link object + Example : + Returns : + Args : + + +=cut + + +sub add_DBLink{ + my ($self,$com) = @_; + if( $com && ! $com->isa('Bio::Annotation::DBLink') ) { + $self->throw("Is not a link object but a [$com]"); + } + $com && push(@{$self->{'link'}},$com); +} + +=head2 each_DBLink + + Title : each_DBLink + Usage : foreach $ref ( $self->each_DBlink() ) + Function: gets an array of DBlink of objects + Example : + Returns : + Args : + + +=cut + +sub each_DBLink{ + my ($self) = @_; + + return @{$self->{'link'}}; +} + +=head2 restriction_changes + + Title : restriction_changes + Usage : $obj->restriction_changes(); + Function: + + Returns a string containing a list of restriction + enzyme changes of form +EcoRI, separated by + commas. Strings need to be valid restriction enzyme names + as stored in REBASE. allele_ori and allele_mut need to be assigned. + + Example : + Returns : string + Args : string + +=cut + +sub restriction_changes { + my ($self) = @_; + + if (not $self->{'re_changes'}) { + my %re = &_enzymes; + + # complain if used on AA data + if ($self->isa('Bio::Variation::AAChange')) { + $self->throw('Restriction enzymes do not bite polypeptides!'); + } + + #sanity checks + $self->warn('Upstream sequence is empty!') + if $self->upStreamSeq eq ''; + $self->warn('Downstream sequence is empty!') + if $self->dnStreamSeq eq ''; +# $self->warn('Original allele sequence is empty!') +# if $self->allele_ori eq ''; +# $self->warn('Mutated allele sequence is empty!') +# if $self->allele_mut eq ''; + + #reuse the non empty DNA level list at RNA level if the flanks are identical + #Hint: Check DNAMutation object first + if ($self->isa('Bio::Variation::RNAChange') and $self->DNAMutation and + $self->upStreamSeq eq $self->DNAMutation->upStreamSeq and + $self->dnStreamSeq eq $self->DNAMutation->dnStreamSeq and + $self->DNAMutation->restriction_changes ne '' ) { + $self->{'re_changes'} = $self->DNAMutation->restriction_changes; + } else { + + #maximum length of a type II restriction site in the current REBASE + my ($le_dn) = 15; + my ($le_up) = $le_dn; + + #reduce the flank lengths if the desired length is not available + $le_dn = CORE::length ($self->dnStreamSeq) if $le_dn > CORE::length ($self->dnStreamSeq); + $le_up = CORE::length ($self->upStreamSeq) if $le_up > CORE::length ($self->upStreamSeq); + + #Build sequence strings to compare + my ($oriseq, $mutseq); + $oriseq = $mutseq = substr($self->upStreamSeq, -$le_up, $le_up); + $oriseq .= $self->allele_ori->seq if $self->allele_ori->seq; + $mutseq .= $self->allele_mut->seq if $self->allele_mut->seq; + $oriseq .= substr($self->dnStreamSeq, 0, $le_dn); + $mutseq .= substr($self->dnStreamSeq, 0, $le_dn); + + # ... and their reverse complements + my $oriseq_rev = _revcompl ($oriseq); + my $mutseq_rev = _revcompl ($mutseq); + + # collect results into a string + my $rec = ''; + foreach my $enz (sort keys (%re)) { + my $site = $re{$enz}; + my @ori = ($oriseq=~ /$site/g); + my @mut = ($mutseq=~ /$site/g); + my @ori_r = ($oriseq_rev =~ /$site/g); + my @mut_r = ($mutseq_rev =~ /$site/g); + + $rec .= '+'. $enz. ", " + if (scalar @ori < scalar @mut) or (scalar @ori_r < scalar @mut_r); + $rec .= '-'. $enz. ", " + if (scalar @ori > scalar @mut) or (scalar @ori_r > scalar @mut_r); + + } + $rec = substr($rec, 0, CORE::length($rec) - 2) if $rec ne ''; + $self->{'re_changes'} = $rec; + } + } + return $self->{'re_changes'} +} + + +sub _revcompl { + # side effect: lower case letters + my ($seq) = shift; + + $seq = lc $seq; + $seq =~ tr/acgtrymkswhbvdnx/tgcayrkmswdvbhnx/; + return CORE::reverse $seq; +} + + +sub _enzymes { + #REBASE version 005 type2.005 + my %enzymes = ( + 'AarI' => 'cacctgc', + 'AatII' => 'gacgtc', + 'AccI' => 'gt[ac][gt]ac', + 'AceIII' => 'cagctc', + 'AciI' => 'ccgc', + 'AclI' => 'aacgtt', + 'AcyI' => 'g[ag]cg[ct]c', + 'AflII' => 'cttaag', + 'AflIII' => 'ac[ag][ct]gt', + 'AgeI' => 'accggt', + 'AhaIII' => 'tttaaa', + 'AloI' => 'gaac[acgt][acgt][acgt][acgt][acgt][acgt]tcc', + 'AluI' => 'agct', + 'AlwNI' => 'cag[acgt][acgt][acgt]ctg', + 'ApaBI' => 'gca[acgt][acgt][acgt][acgt][acgt]tgc', + 'ApaI' => 'gggccc', + 'ApaLI' => 'gtgcac', + 'ApoI' => '[ag]aatt[ct]', + 'AscI' => 'ggcgcgcc', + 'AsuI' => 'gg[acgt]cc', + 'AsuII' => 'ttcgaa', + 'AvaI' => 'c[ct]cg[ag]g', + 'AvaII' => 'gg[at]cc', + 'AvaIII' => 'atgcat', + 'AvrII' => 'cctagg', + 'BaeI' => 'ac[acgt][acgt][acgt][acgt]gta[ct]c', + 'BalI' => 'tggcca', + 'BamHI' => 'ggatcc', + 'BbvCI' => 'cctcagc', + 'BbvI' => 'gcagc', + 'BbvII' => 'gaagac', + 'BccI' => 'ccatc', + 'Bce83I' => 'cttgag', + 'BcefI' => 'acggc', + 'BcgI' => 'cga[acgt][acgt][acgt][acgt][acgt][acgt]tgc', + 'BciVI' => 'gtatcc', + 'BclI' => 'tgatca', + 'BetI' => '[at]ccgg[at]', + 'BfiI' => 'actggg', + 'BglI' => 'gcc[acgt][acgt][acgt][acgt][acgt]ggc', + 'BglII' => 'agatct', + 'BinI' => 'ggatc', + 'BmgI' => 'g[gt]gccc', + 'BplI' => 'gag[acgt][acgt][acgt][acgt][acgt]ctc', + 'Bpu10I' => 'cct[acgt]agc', + 'BsaAI' => '[ct]acgt[ag]', + 'BsaBI' => 'gat[acgt][acgt][acgt][acgt]atc', + 'BsaXI' => 'ac[acgt][acgt][acgt][acgt][acgt]ctcc', + 'BsbI' => 'caacac', + 'BscGI' => 'cccgt', + 'BseMII' => 'ctcag', + 'BsePI' => 'gcgcgc', + 'BseRI' => 'gaggag', + 'BseSI' => 'g[gt]gc[ac]c', + 'BsgI' => 'gtgcag', + 'BsiI' => 'cacgag', + 'BsiYI' => 'cc[acgt][acgt][acgt][acgt][acgt][acgt][acgt]gg', + 'BsmAI' => 'gtctc', + 'BsmI' => 'gaatgc', + 'Bsp1407I' => 'tgtaca', + 'Bsp24I' => 'gac[acgt][acgt][acgt][acgt][acgt][acgt]tgg', + 'BspGI' => 'ctggac', + 'BspHI' => 'tcatga', + 'BspLU11I' => 'acatgt', + 'BspMI' => 'acctgc', + 'BspMII' => 'tccgga', + 'BsrBI' => 'ccgctc', + 'BsrDI' => 'gcaatg', + 'BsrI' => 'actgg', + 'BstEII' => 'ggt[acgt]acc', + 'BstXI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt]tgg', + 'BtrI' => 'cacgtc', + 'BtsI' => 'gcagtg', + 'Cac8I' => 'gc[acgt][acgt]gc', + 'CauII' => 'cc[cg]gg', + 'Cfr10I' => '[ag]ccgg[ct]', + 'CfrI' => '[ct]ggcc[ag]', + 'CjeI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt]gt', + 'CjePI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt][acgt]tc', + 'ClaI' => 'atcgat', + 'CviJI' => '[ag]gc[ct]', + 'CviRI' => 'tgca', + 'DdeI' => 'ct[acgt]ag', + 'DpnI' => 'gatc', + 'DraII' => '[ag]gg[acgt]cc[ct]', + 'DraIII' => 'cac[acgt][acgt][acgt]gtg', + 'DrdI' => 'gac[acgt][acgt][acgt][acgt][acgt][acgt]gtc', + 'DrdII' => 'gaacca', + 'DsaI' => 'cc[ag][ct]gg', + 'Eam1105I' => 'gac[acgt][acgt][acgt][acgt][acgt]gtc', + 'EciI' => 'ggcgga', + 'Eco31I' => 'ggtctc', + 'Eco47III' => 'agcgct', + 'Eco57I' => 'ctgaag', + 'EcoNI' => 'cct[acgt][acgt][acgt][acgt][acgt]agg', + 'EcoRI' => 'gaattc', + 'EcoRII' => 'cc[at]gg', + 'EcoRV' => 'gatatc', + 'Esp3I' => 'cgtctc', + 'EspI' => 'gct[acgt]agc', + 'FauI' => 'cccgc', + 'FinI' => 'gggac', + 'Fnu4HI' => 'gc[acgt]gc', + 'FnuDII' => 'cgcg', + 'FokI' => 'ggatg', + 'FseI' => 'ggccggcc', + 'GdiII' => 'cggcc[ag]', + 'GsuI' => 'ctggag', + 'HaeI' => '[at]ggcc[at]', + 'HaeII' => '[ag]gcgc[ct]', + 'HaeIII' => 'ggcc', + 'HaeIV' => 'ga[ct][acgt][acgt][acgt][acgt][acgt][ag]tc', + 'HgaI' => 'gacgc', + 'HgiAI' => 'g[at]gc[at]c', + 'HgiCI' => 'gg[ct][ag]cc', + 'HgiEII' => 'acc[acgt][acgt][acgt][acgt][acgt][acgt]ggt', + 'HgiJII' => 'g[ag]gc[ct]c', + 'HhaI' => 'gcgc', + 'Hin4I' => 'ga[cgt][acgt][acgt][acgt][acgt][acgt][acg]tc', + 'HindII' => 'gt[ct][ag]ac', + 'HindIII' => 'aagctt', + 'HinfI' => 'ga[acgt]tc', + 'HpaI' => 'gttaac', + 'HpaII' => 'ccgg', + 'HphI' => 'ggtga', + 'Hpy178III' => 'tc[acgt][acgt]ga', + 'Hpy188I' => 'tc[acgt]ga', + 'Hpy99I' => 'cg[at]cg', + 'KpnI' => 'ggtacc', + 'Ksp632I' => 'ctcttc', + 'MaeI' => 'ctag', + 'MaeII' => 'acgt', + 'MaeIII' => 'gt[acgt]ac', + 'MboI' => 'gatc', + 'MboII' => 'gaaga', + 'McrI' => 'cg[ag][ct]cg', + 'MfeI' => 'caattg', + 'MjaIV' => 'gt[acgt][acgt]ac', + 'MluI' => 'acgcgt', + 'MmeI' => 'tcc[ag]ac', + 'MnlI' => 'cctc', + 'MseI' => 'ttaa', + 'MslI' => 'ca[ct][acgt][acgt][acgt][acgt][ag]tg', + 'MstI' => 'tgcgca', + 'MwoI' => 'gc[acgt][acgt][acgt][acgt][acgt][acgt][acgt]gc', + 'NaeI' => 'gccggc', + 'NarI' => 'ggcgcc', + 'NcoI' => 'ccatgg', + 'NdeI' => 'catatg', + 'NheI' => 'gctagc', + 'NlaIII' => 'catg', + 'NlaIV' => 'gg[acgt][acgt]cc', + 'NotI' => 'gcggccgc', + 'NruI' => 'tcgcga', + 'NspBII' => 'c[ac]gc[gt]g', + 'NspI' => '[ag]catg[ct]', + 'PacI' => 'ttaattaa', + 'Pfl1108I' => 'tcgtag', + 'PflMI' => 'cca[acgt][acgt][acgt][acgt][acgt]tgg', + 'PleI' => 'gagtc', + 'PmaCI' => 'cacgtg', + 'PmeI' => 'gtttaaac', + 'PpiI' => 'gaac[acgt][acgt][acgt][acgt][acgt]ctc', + 'PpuMI' => '[ag]gg[at]cc[ct]', + 'PshAI' => 'gac[acgt][acgt][acgt][acgt]gtc', + 'PsiI' => 'ttataa', + 'PstI' => 'ctgcag', + 'PvuI' => 'cgatcg', + 'PvuII' => 'cagctg', + 'RleAI' => 'cccaca', + 'RsaI' => 'gtac', + 'RsrII' => 'cgg[at]ccg', + 'SacI' => 'gagctc', + 'SacII' => 'ccgcgg', + 'SalI' => 'gtcgac', + 'SanDI' => 'ggg[at]ccc', + 'SapI' => 'gctcttc', + 'SauI' => 'cct[acgt]agg', + 'ScaI' => 'agtact', + 'ScrFI' => 'cc[acgt]gg', + 'SduI' => 'g[agt]gc[act]c', + 'SecI' => 'cc[acgt][acgt]gg', + 'SexAI' => 'acc[at]ggt', + 'SfaNI' => 'gcatc', + 'SfeI' => 'ct[ag][ct]ag', + 'SfiI' => 'ggcc[acgt][acgt][acgt][acgt][acgt]ggcc', + 'SgfI' => 'gcgatcgc', + 'SgrAI' => 'c[ag]ccgg[ct]g', + 'SimI' => 'gggtc', + 'SmaI' => 'cccggg', + 'SmlI' => 'ct[ct][ag]ag', + 'SnaBI' => 'tacgta', + 'SnaI' => 'gtatac', + 'SpeI' => 'actagt', + 'SphI' => 'gcatgc', + 'SplI' => 'cgtacg', + 'SrfI' => 'gcccgggc', + 'Sse232I' => 'cgccggcg', + 'Sse8387I' => 'cctgcagg', + 'Sse8647I' => 'agg[at]cct', + 'SspI' => 'aatatt', + 'Sth132I' => 'cccg', + 'StuI' => 'aggcct', + 'StyI' => 'cc[at][at]gg', + 'SwaI' => 'atttaaat', + 'TaqI' => 'tcga', + 'TaqII' => 'gaccga', + 'TatI' => '[at]gtac[at]', + 'TauI' => 'gc[cg]gc', + 'TfiI' => 'ga[at]tc', + 'TseI' => 'gc[at]gc', + 'Tsp45I' => 'gt[cg]ac', + 'Tsp4CI' => 'ac[acgt]gt', + 'TspEI' => 'aatt', + 'TspRI' => 'ca[cg]tg[acgt][acgt]', + 'Tth111I' => 'gac[acgt][acgt][acgt]gtc', + 'Tth111II' => 'caa[ag]ca', + 'UbaGI' => 'cac[acgt][acgt][acgt][acgt]gtg', + 'UbaPI' => 'cgaacg', + 'VspI' => 'attaat', + 'XbaI' => 'tctaga', + 'XcmI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt][acgt][acgt][acgt]tgg', + 'XhoI' => 'ctcgag', + 'XhoII' => '[ag]gatc[ct]', + 'XmaIII' => 'cggccg', + 'XmnI' => 'gaa[acgt][acgt][acgt][acgt]ttc' + ); + + return %enzymes; +} + +1; diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/INSTALL.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/INSTALL.pl Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,440 @@ +#!/usr/bin/perl + +use Getopt::Long; +use LWP::Simple qw($ua getstore get); +use File::Listing qw(parse_dir); + +$| = 1; +our $VERSION = 2.6; + +# CONFIGURE +########### + +my ($DEST_DIR, $ENS_CVS_ROOT, $API_VERSION, $BIOPERL_URL, $CACHE_URL, $help); + +GetOptions( + 'DESTDIR|d=s' => \$DEST_DIR, + 'VERSION|v=i' => \$API_VERSION, + 'BIOPERL|b=s' => \$BIOPERL_URL, + 'CACHEURL|u=s' => \$CACHE_URL, + 'CACHEDIR|c=s' => \$CACHE_DIR, + 'HELP|h' => \$help +); + +if(defined($help)) { + usage(); + exit(0); +} + +my $default_dir_used; + +# check if $DEST_DIR is default +if(defined($DEST_DIR)) { + print "Using non-default installation directory $DEST_DIR - you will probably need to add $DEST_DIR to your PERL5LIB\n"; + $default_dir_used = 0; +} +else { + $DEST_DIR ||= '.'; + $default_dir_used = 1; +} + +my $lib_dir = $DEST_DIR; + +$DEST_DIR .= '/Bio'; +$ENS_CVS_ROOT ||= 'http://cvs.sanger.ac.uk/cgi-bin/viewvc.cgi/'; +$BIOPERL_URL ||= 'http://bioperl.org/DIST/old_releases/bioperl-1.2.3.tar.gz'; +$API_VERSION ||= 68; +$CACHE_URL ||= "ftp://ftp.ensembl.org/pub/release-$API_VERSION/variation/VEP"; +$CACHE_DIR ||= $ENV{HOME}.'/.vep'; + +our $prev_progress = 0; + +print "\nHello! This installer is configured to install v$API_VERSION of the Ensembl API for use by the VEP.\nIt will not affect any existing installations of the Ensembl API that you may have.\n\nIt will also download and install cache files from Ensembl's FTP server.\n\n"; + + +# CHECK EXISTING +################ + +print "Checking for installed versions of the Ensembl API..."; + +# test if the user has the API installed +my $has_api = { + 'ensembl' => 0, + 'ensembl-variation' => 0, + 'ensembl-functgenomics' => 0, +}; + +eval q{ + use Bio::EnsEMBL::Registry; +}; + +my $installed_version; + +unless($@) { + $has_api->{ensembl} = 1; + + $installed_version = Bio::EnsEMBL::Registry->software_version; +} + +eval q{ + use Bio::EnsEMBL::Variation::Utils::VEP; +}; + +$has_api->{'ensembl-variation'} = 1 unless $@; + +eval q{ + use Bio::EnsEMBL::Funcgen::RegulatoryFeature; +}; + +$has_api->{'ensembl-functgenomics'} = 1 unless $@; + + +print "done\n"; + +my $total = 0; +$total += $_ for values %$has_api; + +my $message; + +if($total == 3) { + + if(defined($installed_version)) { + if($installed_version == $API_VERSION) { + $message = "It looks like you already have v$API_VERSION of the API installed.\nYou shouldn't need to install the API"; + } + + elsif($installed_version > $API_VERSION) { + $message = "It looks like this installer is for an older distribution of the API than you already have"; + } + + else { + $message = "It looks like you have an older version (v$installed_version) of the API installed.\nThis installer will install a limited set of the API v$API_VERSION for use by the VEP only"; + } + } + + else { + $message = "It looks like you have an unidentified version of the API installed.\nThis installer will install a limited set of the API v$API_VERSION for use by the VEP only" + } +} + +elsif($total > 0) { + $message = "It looks like you already have the following API modules installed:\n\n".(join "\n", grep {$has_api->{$_}} keys %$has_api)."\n\nThe VEP requires the ensembl, ensembl-variation and optionally ensembl-functgenomics modules"; +} + +if(defined($message)) { + print "$message\n\nAre you sure you want to continue installing the API (y/n)? "; + + my $ok = <>; + + if($ok !~ /^y/i) { + print " - skipping API installation\n"; + goto CACHE; + } +} + + + +# SETUP +####### + +print "\nSetting up directories\n"; + +# check if install dir exists +if(-e $DEST_DIR) { + print "Destination directory $DEST_DIR already exists.\nDo you want to overwrite it (if updating VEP this is probably OK) (y/n)? "; + + my $ok = <>; + + if($ok !~ /^y/i) { + print "Exiting\n"; + exit(0); + } + + else { + unless($default_dir_used) { + print "WARNING: You are using a non-default install directory.\nPressing \"y\" again will remove $DEST_DIR and its contents!!!\nAre you really, really sure (y/n)? "; + $ok = <>; + + if($ok !~ /^y/i) { + print "Exiting\n"; + exit(0); + } + } + + # try to delete the existing dir + `rm -rf $DEST_DIR`; + } +} + +mkdir($DEST_DIR) or die "ERROR: Could not make directory $DEST_DIR\n"; +mkdir($DEST_DIR.'/tmp') or die "ERROR: Could not make directory $DEST_DIR/tmp\n"; + +# set up a user agent's proxy +$ua->env_proxy; + +# enable progress +eval q{ + $ua->show_progress(1); +}; + + + +# API +##### + +print "\nDownloading required files\n"; + +# set up the URLs +my $ensembl_url_tail = '.tar.gz?root=ensembl&view=tar&only_with_tag=branch-ensembl-'; + +foreach my $module(qw(ensembl ensembl-variation ensembl-functgenomics)) { + my $url = $ENS_CVS_ROOT.$module.$ensembl_url_tail.$API_VERSION; + + print " - fetching $module\n"; + + my $target_file = $DEST_DIR.'/tmp/'.$module.'.tar.gz'; + + unless(getstore($url, $target_file) == 200) { + die "ERROR: Failed to fetch $module from $url - perhaps you have a proxy/firewall? Set the http_proxy ENV variable if you do\nError code: $response\n"; + } + + print " - unpacking $target_file\n"; + unpack_tar("$DEST_DIR/tmp/$module.tar.gz", "$DEST_DIR/tmp/"); + + print " - moving files\n"; + + if($module eq 'ensembl') { + `mv -f $DEST_DIR/tmp/$module/modules/Bio/EnsEMBL $DEST_DIR/`; + } + elsif($module eq 'ensembl-variation') { + `mv -f $DEST_DIR/tmp/$module/modules/Bio/EnsEMBL/Variation $DEST_DIR/EnsEMBL/`; + } + elsif($module eq 'ensembl-functgenomics') { + `mv -f $DEST_DIR/tmp/$module/modules/Bio/EnsEMBL/Funcgen $DEST_DIR/EnsEMBL/`; + } + + `rm -rf $DEST_DIR/tmp/$module`;# or die "ERROR: Failed to remove directory $DEST_DIR/$module\n"; +} + + + +# BIOPERL +######### + +# now get BioPerl +print " - fetching BioPerl\n"; + +$bioperl_file = (split /\//, $BIOPERL_URL)[-1]; + +my $target_file = $DEST_DIR.'/tmp/'.$bioperl_file; + +unless(getstore($BIOPERL_URL, $target_file) == 200) { + die "ERROR: Failed to fetch BioPerl from $BIOPERL_URL - perhaps you have a proxy/firewall?\nError code: $response\n"; +} + +print " - unpacking $target_file\n"; +unpack_tar("$DEST_DIR/tmp/$bioperl_file", "$DEST_DIR/tmp/"); + +print " - moving files\n"; + +$bioperl_file =~ /(bioperl.+?)\.tar\.gz/; +my $bioperl_dir = $1; +`mv -f $DEST_DIR/tmp/$bioperl_dir/Bio/* $DEST_DIR/`; +`rm -rf $DEST_DIR/tmp/$bioperl_dir`;# or die "ERROR: Failed to remove directory $DEST_DIR/$bioperl_dir\n"; +`rm -rf $DEST_DIR/tmp`; + + + +# TEST +###### + +print "\nTesting VEP script\n"; + +my $test_vep = `perl variant_effect_predictor.pl --help 2>&1`; + +$test_vep =~ /ENSEMBL VARIANT EFFECT PREDICTOR/ or die "ERROR: Testing VEP script failed with the following error\n$test_vep\n"; + +print " - OK!\n"; + + + +# CACHE FILES +############# + +CACHE: + +print "\nThe VEP can either connect to remote or local databases, or use local cache files.\n"; +print "Cache files will be stored in $CACHE_DIR\n"; +print "Do you want to install any cache files (y/n)? "; + +my $ok = <>; + +if($ok !~ /^y/i) { + print "Exiting\n"; + exit(0); +} + +# check cache dir exists +if(!(-e $CACHE_DIR)) { + print "Cache directory $CACHE_DIR does not exists - do you want to create it (y/n)? "; + + my $ok = <>; + + if($ok !~ /^y/i) { + print "Exiting\n"; + exit(0); + } + + mkdir($CACHE_DIR) or die "ERROR: Could not create directory $CACHE_DIR\n"; +} + +mkdir($CACHE_DIR.'/tmp') unless -e $CACHE_DIR.'/tmp'; + +# get list of species +print "\nDownloading list of available cache files\n"; + +my $num = 1; +my $species_list; +my @files; +push @files, map {$_->[0]} grep {$_->[0] =~ /tar.gz/} @{parse_dir(get($CACHE_URL))}; + +# if we don't have a species list, we'll have to guess +if(!scalar(@files)) { + print "Could not get current species list - using predefined list instead\n"; + + @files = ( + "bos_taurus_vep_$API_VERSION.tar.gz", + "danio_rerio_vep_$API_VERSION.tar.gz", + "homo_sapiens_vep_$API_VERSION.tar.gz", + "homo_sapiens_vep_$API_VERSION\_sift_polyphen.tar.gz", + "mus_musculus_vep_$API_VERSION.tar.gz", + "rattus_norvegicus_vep_$API_VERSION.tar.gz", + ); +} + +foreach my $file(@files) { + $species_list .= $num++." : ".$file."\n"; +} + +print "The following species/files are available; which do you want (can specify multiple separated by spaces): \n$species_list\n? "; + +foreach my $file(split /\s+/, <>) { + my $file_path = $files[$file - 1]; + + my ($species, $file_name); + + if($file_path =~ /\//) { + ($species, $file_name) = (split /\//, $file_path); + } + else { + $file_name = $file_path; + $file_name =~ m/^(\w+?\_\w+?)\_vep/; + $species = $1; + } + + # check if user already has this species and version + if(-e "$CACHE_DIR/$species/$API_VERSION") { + print "\nWARNING: It looks like you already have the cache for $species (v$API_VERSION) installed.\nIf you continue the existing cache will be overwritten.\nAre you sure you want to continue (y/n)? "; + + my $ok = <>; + + if($ok !~ /^y/i) { + print " - skipping $species\n"; + next; + } + + `rm -rf $CACHE_DIR/$species/$API_VERSION`; + } + + my $target_file = "$CACHE_DIR/tmp/$file_name"; + + print " - downloading $CACHE_URL/$file_path\n"; + + unless(getstore("$CACHE_URL/$file_path", $target_file) == 200) { + die "ERROR: Failed to fetch cache file $file_name from $CACHE_URL/$file_path - perhaps you have a proxy/firewall? Set the http_proxy ENV variable if you do\nError code: $response\n"; + } + + print " - unpacking $file_name\n"; + + unpack_tar($target_file, $CACHE_DIR.'/tmp/'); + + # does species dir exist? + if(!-e "$CACHE_DIR/$species") { + mkdir("$CACHE_DIR/$species") or die "ERROR: Could not create directory $CACHE_DIR/$species\n"; + } + + # move files + `mv -f $CACHE_DIR/tmp/$species/$API_VERSION $CACHE_DIR/$species/`; +} + +# cleanup +`rm -rf $CACHE_DIR/tmp`; + +print "\nSuccess\n"; + + +# SUBS +###### + +# unpack a tarball with progress +sub unpack_tar { + my ($file, $dir) = @_; + + my $count = 0; + + open COUNT, "tar -tzvf $file 2>&1 |"; + $count++ while(); + close COUNT; + + my $i = 0; + open EXTRACT, "tar -C $dir -xzvf $file 2>&1 |"; + progress($i++, $count) while(); + close EXTRACT; + + progress(1,1); + $prev_progress = 0; + print "\n"; + + `rm -rf $file`; +} + +# update or initiate progress bar +sub progress { + my ($i, $total) = @_; + + my $width = 60; + my $percent = int(($i/$total) * 100); + my $numblobs = (($i/$total) * $width) - 2; + + return unless $numblobs != $prev_progress; + $prev_progress = $numblobs; + + printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]"); +} + +sub usage { + my $usage =<. + +Questions may also be sent to the Ensembl help desk at + + +Quickstart +========== + +Install API and cache files, run in offline mode: + +perl INSTALL.pl +perl variant_effect_predictor.pl --offline + + +Documentation +============= + +For a summary of command line flags, run: + +perl variant_effect_predictor.pl --help + +For full documentation see + +http://www.ensembl.org/info/docs/variation/vep/vep_script.html + + + +Changelog +========= + +New in version 2.6 (July 2012) +------------------------------ + +- support for structural variant consequences + +- Sequence Ontology (SO) consequence terms now default + +- script runtime 3-4x faster when using forking + +- 1000 Genomes global MAF available in cache files + +- improved memory usage + + +New in version 2.5 (May 2012) +----------------------------- + +- SIFT and PolyPhen predictions now available for RefSeq transcripts + +- retrieve cell type-specific regulatory consequences + +- consequences can be retrieved based on a single individual's genotype in + a VCF input file + +- find overlapping structural variants + +- Condel support removed from main script and moved to a plugin + + +New in version 2.4 (February 2012) +---------------------------------- +- offline mode and new installer script make it easy to use the VEP without + the usual dependencies + +- output columns configurable using the --fields flag + +- VCF output support expanded, can now carry all fields + +- output affected exon and intron numbers with --numbers + +- output overlapping protein domains using --domains + +- enhanced support for LRGs + +- plugins now work on variants called as intergenic + + +New in version 2.3 (December 2011) +---------------------------------- + +- Add custom annotations from tabix-indexed files (BED, GFF, GTF, VCF, bigWig) + +- Add new functionality to the VEP with user-written plugins + +- Filter input on consequence type + + +Version 2.2 (September 2011) +---------------------------- + +- SIFT, PolyPhen and Condel predictions and regulatory features now accessible + from the cache + +- Support for calling consequences against RefSeq transcripts + +- Variant identifiers (e.g. dbSNP rsIDs) and HGVS notations supported as input + format + +- Variants can now be filtered by frequency in HapMap and 1000 genomes + populations + +- Script can be used to convert files between formats (Ensembl/VCF/Pileup/HGVS + to Ensembl/VCF/Pileup) + +- Large amount of code moved to API modules to ensure consistency between web + and script VEP + +- Memory usage optimisations + +- VEP script moved to ensembl-tools CVS module + +- Added --canonical, --per_gene and --no_intergenic options + + +Version 2.1 (June 2011) +----------------------- + +- ability to use local file cache in place of or alongside connecting to an + Ensembl database + +- significant improvements to speed of script + +- whole-genome mode now default (no disadvantage for smaller datasets) + +- improved status output with progress bars + +- regulatory region consequences now reinstated and improved + +- modification to output file - Transcript column is now Feature, and is + followed by a Feature_type column + +- full documentation now online + + +Version 2.0 (April 2011) +------------------------ + +Version 2.0 of the Variant Effect Predictor script (VEP) constitutes a complete +overhaul of both the script and the API behind it. It requires at least version +62 of the Ensembl API to function. Here follows a summary of the changes: + +- support for SIFT, PolyPhen and Condel non-synonymous predictions in human + +- per-allele and compound consequence types + +- support for Sequence Ontology (SO) and NCBI consequence terms + +- modified output format + - support for new output fields in Extra column + - header section containing information on database and software versions + - codon change shown in output + - CDS position shown in output + - option to output Ensembl protein identifiers + - option to output HGVS nomenclature for variants + +- support for gzipped input files + +- enhanced configuration options, including the ability to read configuration + from a file + +- verbose output now much more useful + +- whole-genome mode now more stable + +- finding existing co-located variations now ~5x faster diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/example.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/example.vcf Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,175 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO +21 26960070 rs116645811 G A . . . +21 26965148 rs1135638 G A . . . +21 26965172 rs10576 T C . . . +21 26965205 rs1057885 T C . . . +21 26976144 rs116331755 A G . . . +21 26976222 rs7278168 C T . . . +21 26976237 rs7278284 C T . . . +21 26978790 rs75377686 T C . . . +21 26978950 rs3989369 A G . . . +21 26979752 rs61735760 C T . . . +21 34022588 rs115683257 C A . . . +21 34029195 rs114053718 A G . . . +21 34058146 rs114942253 C T . . . +21 34059352 rs2254562 T C . . . +21 34787294 rs4986958 C G . . . +21 34787312 rs9808753 A G . . . +21 34799322 rs17878711 A G . . . +21 34805070 rs115458101 G A . . . +21 38437960 rs73901831 T G . . . +21 38438006 rs16994704 T C . . . +21 38439640 rs73901833 T C . . . +21 38444863 rs2507733 C T . . . +21 38444881 rs73200245 G A . . . +21 40186202 rs34373350 G A . . . +21 40190405 rs115908228 G A . . . +21 40190504 rs116698978 G A . . . +21 40191431 rs457705 T G . . . +21 40191548 rs113417859 C T . . . +21 40191638 rs461155 A G . . . +21 40193613 rs116476013 C T . . . +21 44514601 rs61737060 C T . . . +21 44514809 rs61737061 G A . . . +21 45387915 rs115397760 C T . . . +21 45389031 rs79309256 C G . . . +21 45389040 rs75532875 C T . . . +21 45389127 rs115875072 G A . . . +21 45402270 rs114573236 A G . . . +22 17662793 rs7289170 A G . . . +22 17669306 rs2231495 T C . . . +22 19340928 rs34000365 G A . . . +22 19365526 rs115877869 G C . . . +22 19371166 rs61735928 G A . . . +22 19373100 rs115157927 A T . . . +22 19384355 rs9618556 C T . . . +22 19384439 rs116458396 G A . . . +22 19958829 rs5993890 G A . . . +22 19960499 rs114971216 C T . . . +22 19960666 rs115736959 G A . . . +22 19966562 rs76496156 C T . . . +22 19967288 rs114131171 G A . . . +22 19967372 rs114932611 G A . . . +22 19967480 rs74544696 C G . . . +22 19967543 rs1058399 G A . . . +22 19967567 rs61736862 G A . . . +22 19968933 rs116398106 G A . . . +22 19968971 rs2073748 G A . . . +22 19969043 rs80068543 C T . . . +22 19969075 rs2073747 A G . . . +22 19969106 rs2240717 A G . . . +22 19969495 rs33992092 G T . . . +22 19978218 rs116782322 G A . . . +22 21998194 rs114537607 C T . . . +22 21998215 rs115706909 C T . . . +22 21998219 rs61739341 G A . . . +22 21998280 rs73166641 G A . . . +22 22317132 rs115446312 G A . . . +22 22317154 rs114712273 A G . . . +22 22318344 rs115663049 G A . . . +22 22318538 rs9610728 C T . . . +22 22318671 rs9610729 G A . . . +22 22324605 rs116048331 C T . . . +22 22328751 rs114880937 G A . . . +22 23488859 rs61747143 C T . . . +22 23494678 rs114023206 C T . . . +22 23503082 rs115006916 A T . . . +22 23503121 rs5759611 G A . . . +22 23503170 rs5759612 A G . . . +22 24579503 rs9680526 T A . . . +22 24579530 rs114362926 C T . . . +22 24580157 rs115180746 C G . . . +22 24581865 rs116768581 A G . . . +22 24582041 rs8141797 A G . . . +22 24582236 rs115684319 C T . . . +22 24582237 rs114116736 G A . . . +22 24583384 rs115131208 C T . . . +22 24583544 rs116712499 G A . . . +22 24583567 rs114572804 C T . . . +22 24584021 rs74558263 C T . . . +22 29138293 rs17885497 T C . . . +22 29141917 rs17884212 A G . . . +22 30659983 rs116796967 C T . . . +22 30660313 rs114880151 G C . . . +22 30823196 rs5753130 T C . . . +22 30856121 rs35764129 G A . . . +22 30857373 rs2240345 A C . . . +22 30857448 rs5749104 A G . . . +22 30857645 rs114917409 C G . . . +22 30858149 rs115111929 A C . . . +22 30860830 rs2269961 C T . . . +22 30862373 rs116469787 G A . . . +22 30864529 rs115812777 C T . . . +22 30864610 rs4820853 A G . . . +22 30866240 rs115083767 G A . . . +22 30866506 rs116155456 G A . . . +22 30951226 rs112976399 C T . . . +22 30951404 rs115862749 G A . . . +22 30951482 rs114565111 G C . . . +22 30951882 rs112070427 G A . . . +22 30952023 rs116808844 G A . . . +22 30953295 rs2267161 C T . . . +22 31485923 rs113930388 C T . . . +22 31487682 rs11913728 C T . . . +22 31487733 rs116593437 G A . . . +22 31487793 rs114015937 G A . . . +22 31487814 rs115143483 G A . . . +22 31491295 rs3205187 G C . . . +22 31491332 rs5997872 C T . . . +22 31492781 rs34292278 C T . . . +22 31492783 rs116641021 C T . . . +22 31658156 rs5997927 C T . . . +22 31663842 rs2228619 C G . . . +22 31663875 rs114531016 G A . . . +22 31674299 rs79923828 T C . . . +22 33670428 rs114246562 G A . . . +22 33673170 rs73399520 C T . . . +22 33700301 rs113253213 G A . . . +22 33700397 rs34292743 G A . . . +22 33712133 rs115686643 G A . . . +22 34000484 rs8142483 C T . . . +22 34022284 rs86487 G A . . . +22 34046452 rs59349720 G A . . . +22 34046596 rs63446460 C G . . . +22 34046619 rs115801300 G T . . . +22 39095836 rs114289284 G A . . . +22 39095849 rs6001200 G C . . . +22 39095947 rs114180421 A G . . . +22 39095987 rs2072800 A G . . . +22 39440149 rs5750728 C T . . . +22 39440176 rs114704208 A T . . . +22 39621797 rs35978693 G T . . . +22 39627630 rs55634318 G C . . . +22 39627649 rs116555717 T C . . . +22 39627821 rs17565 T C . . . +22 39770488 rs114689408 C T . . . +22 39770500 rs35429467 G A . . . +22 39772094 rs114671146 C T . . . +22 40139865 rs114948690 A G . . . +22 40140163 rs112945971 G A . . . +22 40140191 rs114600441 T C . . . +22 40140228 rs115673469 A G . . . +22 40140243 rs115302874 A G . . . +22 40161572 rs111738363 C G . . . +22 40217114 rs115492335 A G . . . +22 40231860 rs116765273 A G . . . +22 40231941 rs114100360 C A . . . +22 40257852 rs114714334 C T . . . +22 40258005 rs75296456 A G . . . +22 40283561 rs8139570 A G . . . +22 40415323 rs115365592 C T . . . +22 40415968 rs12330063 G T . . . +22 40415988 rs116438980 A G . . . +22 40417372 rs115837035 G A . . . +22 40417495 rs11914082 C T . . . +22 40417780 rs5995793 C T . . . +22 40417794 rs115818503 C T . . . +22 40417820 rs5995794 A G . . . +22 45287218 rs115264708 T C . . . +22 45312244 rs6519902 G A . . . +22 45312274 rs115694109 G A . . . +22 45312345 rs8135982 C T . . . +22 45316368 rs114043304 A G . . . +22 50616005 rs35195493 C G . . . +22 50616806 rs5771206 A G . . . diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/tool_dependencies.xml Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,19 @@ + + + + + + http://downloads.sourceforge.net/project/picard/picard-tools/1.56/picard-tools-1.56.zip + + picard-tools-1.56 + $INSTALL_DIR/jars + + + $INSTALL_DIR/jars + + + + + + + diff -r 000000000000 -r 1f6dce3d34e0 variant_effect_predictor/variant_effect_predictor.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/variant_effect_predictor.pl Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1759 @@ +#!/usr/bin/perl + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at . + + Questions may also be sent to the Ensembl help desk at + . + +=cut + +=head1 NAME + +Variant Effect Predictor - a script to predict the consequences of genomic variants + +http://www.ensembl.org/info/docs/variation/vep/vep_script.html + +Version 2.6 + +by Will McLaren (wm2@ebi.ac.uk) +=cut + +use strict; +use Getopt::Long; +use FileHandle; +use FindBin qw($Bin); +use lib $Bin; + +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::Utils::VEP qw( + parse_line + vf_to_consequences + validate_vf + convert_to_vcf + load_dumped_adaptor_cache + dump_adaptor_cache + get_all_consequences + get_slice + build_full_cache + read_cache_info + get_time + debug + @OUTPUT_COLS + @REG_FEAT_TYPES + %FILTER_SHORTCUTS +); + +# global vars +my $VERSION = '2.6'; + + +# define headers that would normally go in the extra field +# keyed on the config parameter used to turn it on +my %extra_headers = ( + protein => ['ENSP'], + canonical => ['CANONICAL'], + ccds => ['CCDS'], + hgvs => ['HGVSc','HGVSp'], + hgnc => ['HGNC'], + sift => ['SIFT'], + polyphen => ['PolyPhen'], + numbers => ['EXON','INTRON'], + domains => ['DOMAINS'], + regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'], + cell_type => ['CELL_TYPE'], + individual => ['IND'], + xref_refseq => ['RefSeq'], + check_svs => ['SV'], + check_frequency => ['FREQS'], + gmaf => ['GMAF'], + user => ['DISTANCE'], +); + +my %extra_descs = ( + 'CANONICAL' => 'Indicates if transcript is canonical for this gene', + 'CCDS' => 'Indicates if transcript is a CCDS transcript', + 'HGNC' => 'HGNC gene identifier', + 'ENSP' => 'Ensembl protein identifer', + 'HGVSc' => 'HGVS coding sequence name', + 'HGVSp' => 'HGVS protein sequence name', + 'SIFT' => 'SIFT prediction', + 'PolyPhen' => 'PolyPhen prediction', + 'EXON' => 'Exon number(s) / total', + 'INTRON' => 'Intron number(s) / total', + 'DOMAINS' => 'The source and identifer of any overlapping protein domains', + 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position', + 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP', + 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP', + 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP', + 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature', + 'IND' => 'Individual name', + 'SV' => 'IDs of overlapping structural variants', + 'FREQS' => 'Frequencies of overlapping variants used in filtering', + 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1', + 'DISTANCE' => 'Shortest distance from variant to transcript', +); + +# set output autoflush for progress bars +$| = 1; + +# configure from command line opts +my $config = &configure(scalar @ARGV); + +# run the main sub routine +&main($config); + +# this is the main sub-routine - it needs the configured $config hash +sub main { + my $config = shift; + + debug("Starting...") unless defined $config->{quiet}; + + $config->{start_time} = time(); + $config->{last_time} = time(); + + my $tr_cache = {}; + my $rf_cache = {}; + + # create a hash to hold slices so we don't get the same one twice + my %slice_cache = (); + + my @vfs; + my ($vf_count, $total_vf_count); + my $in_file_handle = $config->{in_file_handle}; + + # initialize line number in config + $config->{line_number} = 0; + + # read the file + while(<$in_file_handle>) { + chomp; + + $config->{line_number}++; + + # header line? + if(/^\#/) { + + # retain header lines if we are outputting VCF + if(defined($config->{vcf})) { + push @{$config->{headers}}, $_; + } + + # line with sample labels in VCF + if(defined($config->{individual}) && /^#CHROM/) { + my @split = split /\s+/; + + # no individuals + die("ERROR: No individual data found in VCF\n") if scalar @split <= 9; + + # get individual column indices + my %ind_cols = map {$split[$_] => $_} (9..$#split); + + # all? + if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) { + $config->{ind_cols} = \%ind_cols; + } + else { + my %new_ind_cols; + + # check we have specified individual(s) + foreach my $ind(@{$config->{individual}}) { + die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind}; + $new_ind_cols{$ind} = $ind_cols{$ind}; + } + + $config->{ind_cols} = \%new_ind_cols; + } + } + + next; + } + + # configure output file + $config->{out_file_handle} ||= &get_out_file_handle($config); + + # some lines (pileup) may actually parse out into more than one variant + foreach my $vf(@{&parse_line($config, $_)}) { + + $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original}); + + # now get the slice + if(!defined($vf->{slice})) { + my $slice; + + # don't get slices if we're using cache + # we can steal them from transcript objects later + if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { + + # check if we have fetched this slice already + if(defined $slice_cache{$vf->{chr}}) { + $slice = $slice_cache{$vf->{chr}}; + } + + # if not create a new one + else { + + $slice = &get_slice($config, $vf->{chr}); + + # if failed, warn and skip this line + if(!defined($slice)) { + warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; + next; + } + + # store the hash + $slice_cache{$vf->{chr}} = $slice; + } + } + + $vf->{slice} = $slice; + } + + # validate the VF + next unless validate_vf($config, $vf); + + # make a name if one doesn't exist + $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term}); + + # jump out to convert here + if(defined($config->{convert})) { + &convert_vf($config, $vf); + next; + } + + if(defined $config->{whole_genome}) { + push @vfs, $vf; + $vf_count++; + $total_vf_count++; + + if($vf_count == $config->{buffer_size}) { + debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); + + print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; + + # calculate stats + my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); + my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); + $config->{last_time} = time(); + + debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); + + @vfs = (); + $vf_count = 0; + } + } + else { + print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; + $vf_count++; + $total_vf_count++; + debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); + } + } + } + + # if in whole-genome mode, finish off the rest of the buffer + if(defined $config->{whole_genome} && scalar @vfs) { + debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); + + print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; + + # calculate stats + my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); + my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); + $config->{last_time} = time(); + + debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); + + debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet}); + } + + debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); + + debug("Finished!") unless defined $config->{quiet}; +} + +# sets up configuration hash that is used throughout the script +sub configure { + my $args = shift; + + my $config = {}; + + GetOptions( + $config, + 'help', # displays help message + + # input options, + 'config=s', # config file name + 'input_file|i=s', # input file name + 'format=s', # input file format + + # DB options + 'species=s', # species e.g. human, homo_sapiens + 'registry=s', # registry file + 'host=s', # database host + 'port=s', # database port + 'user=s', # database user name + 'password=s', # database password + 'db_version=i', # Ensembl database version to use e.g. 62 + 'genomes', # automatically sets DB params for e!Genomes + 'refseq', # use otherfeatures RefSeq DB instead of Ensembl + #'no_disconnect', # disables disconnect_when_inactive + + # runtime options + 'most_severe', # only return most severe consequence + 'summary', # only return one line per variation with all consquence types + 'per_gene', # only return most severe per gene + 'buffer_size=i', # number of variations to read in before analysis + 'chunk_size=s', # size in bases of "chunks" used in internal hash structure + 'failed=i', # include failed variations when finding existing + 'no_whole_genome', # disables now default whole-genome mode + 'whole_genome', # proxy for whole genome mode - now just warns user + 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) + 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT + 'check_ref', # check supplied reference allele against DB + 'check_existing', # find existing co-located variations + 'check_svs', # find overlapping structural variations + 'check_alleles', # only attribute co-located if alleles are the same + 'check_frequency', # enable frequency checking + 'gmaf', # add global MAF of existing var + 'freq_filter=s', # exclude or include + 'freq_freq=f', # frequency to filter on + 'freq_gt_lt=s', # gt or lt (greater than or less than) + 'freq_pop=s', # population to filter on + 'allow_non_variant', # allow non-variant VCF lines through + 'individual=s', # give results by genotype for individuals + 'phased', # force VCF genotypes to be interpreted as phased + 'fork=i', # fork into N processes + + # verbosity options + 'verbose|v', # print out a bit more info while running + 'quiet', # print nothing to STDOUT (unless using -o stdout) + 'no_progress', # don't display progress bars + + # output options + 'everything|e', # switch on EVERYTHING :-) + 'output_file|o=s', # output file name + 'force_overwrite', # force overwrite of output file if already exists + 'terms|t=s', # consequence terms to use e.g. NCBI, SO + 'coding_only', # only return results for consequences in coding regions + 'canonical', # indicates if transcript is canonical + 'ccds', # output CCDS identifer + 'xref_refseq', # output refseq mrna xref + 'protein', # add e! protein ID to extra column + 'hgnc', # add HGNC gene ID to extra column + 'hgvs', # add HGVS names to extra column + 'sift=s', # SIFT predictions + 'polyphen=s', # PolyPhen predictions + 'condel=s', # Condel predictions + 'regulatory', # enable regulatory stuff + 'cell_type=s', # filter cell types for regfeats + 'convert=s', # convert input to another format (doesn't run VEP) + 'filter=s', # run in filtering mode + 'no_intergenic', # don't print out INTERGENIC consequences + 'gvf', # produce gvf output + 'vcf', # produce vcf output + 'original', # produce output in input format + 'no_consequences', # don't calculate consequences + 'lrg', # enable LRG-based features + 'fields=s', # define your own output fields + 'domains', # output overlapping protein features + 'numbers', # include exon and intron numbers + + # cache stuff + 'cache', # use cache + 'write_cache', # enables writing to the cache + 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs + 'no_adaptor_cache', # don't write adaptor cache + 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript + 'strip', # strips adaptors etc from objects before caching them + 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more + 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) + 'cache_region_size=i', # size of region in bases for each cache file + 'no_slice_cache', # tell API not to cache features on slice + 'standalone', # standalone renamed offline + 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection + 'skip_db_check', # don't compare DB parameters with cached + 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" + 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation + 'tmpdir=s', # tmp dir used for BigWig retrieval + 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory + + # debug + 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL + 'count_queries', # counts SQL queries executed + 'admin', # allows me to build off public hosts + 'debug', # print out debug info + 'tabix', # experimental use tabix cache files + ) or die "ERROR: Failed to parse command-line flags\n"; + + # print usage message if requested or no args supplied + if(defined($config->{help}) || !$args) { + &usage; + exit(0); + } + + # dir is where the cache and plugins live + $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); + + # dir gets set to the specific cache directory later on, so take a copy to use + # when configuring plugins + + $config->{toplevel_dir} = $config->{dir}; + + # ini file? + my $ini_file = $config->{dir}.'/vep.ini'; + + if(-e $ini_file) { + read_config_from_file($config, $ini_file); + } + + # config file? + if(defined $config->{config}) { + read_config_from_file($config, $config->{config}); + } + + # can't be both quiet and verbose + die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose}); + + # check forking + if(defined($config->{fork})) { + die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1; + + # check we can use MIME::Base64 + eval q{ use MIME::Base64; }; + + if($@) { + debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); + delete $config->{fork}; + } + else { + + # try a practice fork + my $pid = fork; + + if(!defined($pid)) { + debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet}); + delete $config->{fork}; + } + elsif($pid) { + waitpid($pid, 0); + } + elsif($pid == 0) { + exit(0); + } + } + } + + # check file format + if(defined $config->{format}) { + die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i; + } + + # check convert format + if(defined $config->{convert}) { + die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i; + } + + # check if user still using --standalone + if(defined $config->{standalone}) { + die "ERROR: --standalone replaced by --offline\n"; + } + + # connection settings for Ensembl Genomes + if($config->{genomes}) { + $config->{host} ||= 'mysql.ebi.ac.uk'; + $config->{port} ||= 4157; + } + + # connection settings for main Ensembl + else { + $config->{species} ||= "homo_sapiens"; + $config->{host} ||= 'ensembldb.ensembl.org'; + $config->{port} ||= 5306; + } + + # refseq or core? + if(defined($config->{refseq})) { + $config->{core_type} = 'otherfeatures'; + } + else { + $config->{core_type} = 'core'; + } + + # output term + if(defined $config->{terms}) { + die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; + if($config->{terms} =~ /ensembl|display/i) { + $config->{terms} = 'display'; + } + else { + $config->{terms} = uc($config->{terms}); + } + } + + # everything? + if(defined($config->{everything})) { + my %everything = ( + sift => 'b', + polyphen => 'b', + ccds => 1, + hgvs => 1, + hgnc => 1, + numbers => 1, + domains => 1, + regulatory => 1, + canonical => 1, + protein => 1, + gmaf => 1, + ); + + $config->{$_} = $everything{$_} for keys %everything; + + # these ones won't work with offline + delete $config->{hgvs} if defined($config->{offline}); + } + + # check nsSNP tools + foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { + die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; + + die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; + + die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel'; + } + + # force quiet if outputting to STDOUT + if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { + delete $config->{verbose} if defined($config->{verbose}); + $config->{quiet} = 1; + } + + # individual(s) specified? + if(defined($config->{individual})) { + $config->{individual} = [split /\,/, $config->{individual}]; + + # force allow_non_variant + $config->{allow_non_variant} = 1; + } + + # summarise options if verbose + if(defined $config->{verbose}) { + my $header =< $b} map {length($_)} keys %$config)[-1]; + + foreach my $key(sort keys %$config) { + next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0; + print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n"; + } + + print "\n".("-" x 20)."\n\n"; + } + + # check custom annotations + for my $i(0..$#{$config->{custom}}) { + my $custom = $config->{custom}->[$i]; + + my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom; + $type ||= 'exact'; + $format ||= 'bed'; + $coords ||= 0; + + # check type + die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/; + + # check format + die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/; + + # bigwig format + if($format eq 'bigwig') { + # check for bigWigToWig + die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/; + } + + else { + # check for tabix + die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/; + + # remote files? + if($filepath =~ /tp\:\/\//) { + my $remote_test = `tabix $filepath 1:1-1 2>&1`; + if($remote_test =~ /fail/) { + die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n"; + } + elsif($remote_test =~ /get_local_version/) { + debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet}); + } + } + + # check files exist + else { + die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath; + die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi'; + } + } + + $config->{custom}->[$i] = { + 'file' => $filepath, + 'name' => $shortname || 'CUSTOM'.($i + 1), + 'type' => $type, + 'format' => $format, + 'coords' => $coords, + }; + } + + # check if using filter and original + die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter}); + + # filter by consequence? + if(defined($config->{filter})) { + + my %filters = map {$_ => 1} split /\,/, $config->{filter}; + + # add in shortcuts + foreach my $filter(keys %filters) { + my $value = 1; + if($filter =~ /^no_/) { + delete $filters{$filter}; + $filter =~ s/^no_//g; + $value = 0; + $filters{$filter} = $value; + } + + if(defined($FILTER_SHORTCUTS{$filter})) { + delete $filters{$filter}; + $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}}; + } + } + + $config->{filter} = \%filters; + + $config->{filter_count} = 0; + } + + # set defaults + $config->{user} ||= 'anonymous'; + $config->{buffer_size} ||= 5000; + $config->{chunk_size} ||= '50kb'; + $config->{output_file} ||= "variant_effect_output.txt"; + $config->{tmpdir} ||= '/tmp'; + $config->{format} ||= 'guess'; + $config->{terms} ||= 'SO'; + $config->{cache_region_size} ||= 1000000; + $config->{compress} ||= 'zcat'; + + # regulatory has to be on for cell_type + if(defined($config->{cell_type})) { + $config->{regulatory} = 1; + $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type}); + } + + # can't use a whole bunch of options with most_severe + if(defined($config->{most_severe})) { + foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) { + die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag}); + } + } + + # can't use a whole bunch of options with summary + if(defined($config->{summary})) { + foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) { + die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag}); + } + } + + # frequency filtering + if(defined($config->{check_frequency})) { + foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { + die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag}; + } + + # need to set check_existing + $config->{check_existing} = 1; + } + + $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf}; + + # warn users still using whole_genome flag + if(defined($config->{whole_genome})) { + debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); + } + + $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; + $config->{failed} = 0 unless defined $config->{failed}; + $config->{chunk_size} =~ s/mb?/000000/i; + $config->{chunk_size} =~ s/kb?/000/i; + $config->{cache_region_size} =~ s/mb?/000000/i; + $config->{cache_region_size} =~ s/kb?/000/i; + + # cluck and display executed SQL? + $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); + + # offline needs cache, can't use HGVS + if(defined($config->{offline})) { + $config->{cache} = 1; + + #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs}); + die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs'; + die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id'; + die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}); + die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv}); + } + + # write_cache needs cache + $config->{cache} = 1 if defined $config->{write_cache}; + + # no_slice_cache, prefetch and whole_genome have to be on to use cache + if(defined($config->{cache})) { + $config->{prefetch} = 1; + $config->{no_slice_cache} = 1; + $config->{whole_genome} = 1; + $config->{strip} = 1; + } + + $config->{build} = $config->{rebuild} if defined($config->{rebuild}); + + # force options for full build + if(defined($config->{build})) { + $config->{prefetch} = 1; + $config->{hgnc} = 1; + $config->{no_slice_cache} = 1; + $config->{cache} = 1; + $config->{strip} = 1; + $config->{write_cache} = 1; + $config->{cell_type} = 1 if defined($config->{regulatory}); + } + + # connect to databases + $config->{reg} = &connect_to_dbs($config); + + # complete dir with species name and db_version + $config->{dir} .= '/'.( + join '/', ( + defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), + $config->{db_version} || $config->{reg}->software_version + ) + ); + + # warn user cache directory doesn't exist + if(!-e $config->{dir}) { + + # if using write_cache + if(defined($config->{write_cache})) { + debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); + } + + # want to read cache, not found + elsif(defined($config->{cache})) { + die("ERROR: Cache directory ", $config->{dir}, " not found"); + } + } + + if(defined($config->{cache})) { + # read cache info + if(read_cache_info($config)) { + debug("Read existing cache info") unless defined $config->{quiet}; + } + } + + # we configure plugins here because they can sometimes switch on the + # regulatory config option + configure_plugins($config); + + # include regulatory modules if requested + if(defined($config->{regulatory})) { + # do the use statements here so that users don't have to have the + # funcgen API installed to use the rest of the script + eval q{ + use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; + use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; + use Bio::EnsEMBL::Funcgen::MotifFeature; + use Bio::EnsEMBL::Funcgen::RegulatoryFeature; + use Bio::EnsEMBL::Funcgen::BindingMatrix; + }; + + if($@) { + die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n"); + } + } + + # user defined custom output fields + if(defined($config->{fields})) { + $config->{fields} = [split ',', $config->{fields}]; + debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet}); + $config->{fields_redefined} = 1; + } + $config->{fields} ||= \@OUTPUT_COLS; + + # suppress warnings that the FeatureAdpators spit if using no_slice_cache + Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); + + # get adaptors (don't get them in offline mode) + unless(defined($config->{offline})) { + + if(defined($config->{cache}) && !defined($config->{write_cache})) { + + # try and load adaptors from cache + if(!&load_dumped_adaptor_cache($config)) { + &get_adaptors($config); + &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); + } + + # check cached adaptors match DB params + else { + my $dbc = $config->{sa}->{dbc}; + + my $ok = 1; + + if($dbc->{_host} ne $config->{host}) { + + # ens-livemirror, useastdb and ensembldb should all have identical DBs + unless( + ( + $dbc->{_host} eq 'ens-livemirror' + || $dbc->{_host} eq 'ensembldb.ensembl.org' + || $dbc->{_host} eq 'useastdb.ensembl.org' + ) && ( + $config->{host} eq 'ens-livemirror' + || $config->{host} eq 'ensembldb.ensembl.org' + || $config->{host} eq 'useastdb.ensembl.org' + ) + ) { + $ok = 0; + } + + unless(defined($config->{skip_db_check})) { + # but we still need to reconnect + debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); + + &get_adaptors($config); + } + } + + if(!$ok) { + if(defined($config->{skip_db_check})) { + debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); + } + else { + die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; + } + } + } + } + else { + &get_adaptors($config); + &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); + } + + # reg adaptors (only fetches if not retrieved from cache already) + &get_reg_adaptors($config) if defined($config->{regulatory}); + } + + # check cell types + if(defined($config->{cell_type}) && !defined($config->{build})) { + my $cls = ''; + + if(defined($config->{cache})) { + $cls = $config->{cache_cell_types}; + } + else { + my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); + $cls = join ",", map {$_->name} @{$cta->fetch_all}; + } + + foreach my $cl(@{$config->{cell_type}}) { + die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/; + } + } + + # get terminal width for progress bars + unless(defined($config->{quiet})) { + my $width; + + # module may not be installed + eval q{ + use Term::ReadKey; + }; + + if(!$@) { + my ($w, $h); + + # module may be installed, but e.g. + eval { + #($w, $h) = GetTerminalSize(); + $w = 167; + $h = 30; + }; + + $width = $w if defined $w; + } + + $width ||= 60; + $width -= 12; + $config->{terminal_width} = $width; + } + + # jump out to build cache if requested + if(defined($config->{build})) { + + if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { + die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); + } + + # build the cache + debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); + build_full_cache($config); + + # exit script + debug("Finished building cache") unless defined($config->{quiet}); + exit(0); + } + + + # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG + if(defined($config->{cache})) { + + # these two def depend on DB + foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) { + debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); + } + + # as does using HGVS or IDs as input + debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); + + # the rest may be in the cache + foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) { + next if defined($config->{'cache_'.$param}); + debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); + } + } + + # get list of chrs if supplied + if(defined($config->{chr})) { + my %chrs; + + foreach my $val(split /\,/, $config->{chr}) { + my @nnn = split /\-/, $val; + + foreach my $chr($nnn[0]..$nnn[-1]) { + $chrs{$chr} = 1; + } + } + + $config->{chr} = \%chrs; + } + + # get input file handle + $config->{in_file_handle} = &get_in_file_handle($config); + + return $config; +} + +# reads config from a file +sub read_config_from_file { + my $config = shift; + my $file = shift; + + open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n"; + + while() { + next if /^\#/; + my @split = split /\s+|\=/; + my $key = shift @split; + $key =~ s/^\-//g; + + if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') { + push @{$config->{$key}}, @split; + } + else { + $config->{$key} ||= $split[0]; + } + } + + close CONFIG; + + # force quiet if outputting to STDOUT + if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { + delete $config->{verbose} if defined($config->{verbose}); + $config->{quiet} = 1; + } + + debug("Read configuration from $file") unless defined($config->{quiet}); +} + +# configures custom VEP plugins +sub configure_plugins { + + my $config = shift; + + $config->{plugins} = []; + + if (my @plugins = @{ $config->{plugin} }) { + + # add the Plugins directory onto @INC + + unshift @INC, $config->{toplevel_dir}."/Plugins"; + + for my $plugin (@plugins) { + + # parse out the module name and parameters + + my ($module, @params) = split /,/, $plugin; + + # check we can use the module + + eval qq{ + use $module; + }; + if ($@) { + debug("Failed to compile plugin $module: $@") unless defined($config->{quiet}); + next; + } + + # now check we can instantiate it, passing any parameters to the constructor + + my $instance; + + eval { + $instance = $module->new($config, @params); + }; + if ($@) { + debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet}); + next; + } + + # check that the versions match + + my $plugin_version; + + if ($instance->can('version')) { + $plugin_version = $instance->version; + } + + my $version_ok = 1; + + if ($plugin_version) { + my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version; + my ($major, $minor, $maintenance) = split /\./, $VERSION; + + if ($plugin_major != $major) { + debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet}); + $version_ok = 0; + } + } + else { + debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet}); + $version_ok = 0; + } + + debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok; + + # check that it implements all necessary methods + + for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) { + unless ($instance->can($required)) { + debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet}); + next; + } + } + + # all's good, so save the instance in our list of plugins + + push @{ $config->{plugins} }, $instance; + + debug("Loaded plugin: $module") unless defined($config->{quiet}); + + # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so + + if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) { + debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet}); + $config->{regulatory} = 1; + } + } + } +} + +# connects to DBs (not done in offline mode) +sub connect_to_dbs { + my $config = shift; + + # get registry + my $reg = 'Bio::EnsEMBL::Registry'; + + unless(defined($config->{offline})) { + # load DB options from registry file if given + if(defined($config->{registry})) { + debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); + $reg->load_all( + $config->{registry}, + $config->{verbose}, + undef, + $config->{no_slice_cache} + ); + } + + # otherwise manually connect to DB server + else { + $reg->load_registry_from_db( + -host => $config->{host}, + -user => $config->{user}, + -pass => $config->{password}, + -port => $config->{port}, + -db_version => $config->{db_version}, + -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, + -verbose => $config->{verbose}, + -no_cache => $config->{no_slice_cache}, + ); + } + + eval { $reg->set_reconnect_when_lost() }; + + if(defined($config->{verbose})) { + # get a meta container adaptors to check version + my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); + my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); + + if($core_mca && $var_mca) { + debug( + "Connected to core version ", $core_mca->get_schema_version, " database ", + "and variation version ", $var_mca->get_schema_version, " database" + ); + } + } + } + + return $reg; +} + +# get adaptors from DB +sub get_adaptors { + my $config = shift; + + die "ERROR: No registry" unless defined $config->{reg}; + + $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); + $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature'); + $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); + $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); + $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); + + # get fake ones for species with no var DB + if(!defined($config->{vfa})) { + $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); + $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species}); + $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); + } + + $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); + $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); + $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); + $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); + $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); + + # cache schema version + $config->{mca}->get_schema_version if defined $config->{mca}; + + # check we got slice adaptor - can't continue without a core DB + die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; +} + +# gets regulatory adaptors +sub get_reg_adaptors { + my $config = shift; + + foreach my $type(@REG_FEAT_TYPES) { + next if defined($config->{$type.'_adaptor'}); + + my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); + if(defined($adaptor)) { + $config->{$type.'_adaptor'} = $adaptor; + } + else { + delete $config->{regulatory}; + last; + } + } +} + +# gets file handle for input +sub get_in_file_handle { + my $config = shift; + + # define the filehandle to read input from + my $in_file_handle = new FileHandle; + + if(defined($config->{input_file})) { + + # check defined input file exists + die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; + + if($config->{input_file} =~ /\.gz$/){ + $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); + } + else { + $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); + } + } + + # no file specified - try to read data off command line + else { + $in_file_handle = 'STDIN'; + debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; + } + + return $in_file_handle; +} + +# gets file handle for output and adds header +sub get_out_file_handle { + my $config = shift; + + # define filehandle to write to + my $out_file_handle = new FileHandle; + + # check if file exists + if(-e $config->{output_file} && !defined($config->{force_overwrite})) { + # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n"); + } + + if($config->{output_file} =~ /stdout/i) { + $out_file_handle = *STDOUT; + } + else { + $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); + } + + # define headers for a VCF file + my @vcf_headers = ( + '#CHROM', + 'POS', + 'ID', + 'REF', + 'ALT', + 'QUAL', + 'FILTER', + 'INFO' + ); + + # file conversion, don't want to add normal headers + if(defined($config->{convert})) { + # header for VCF + if($config->{convert} =~ /vcf/i) { + print $out_file_handle "##fileformat=VCFv4.0\n"; + print $out_file_handle join "\t", @vcf_headers; + print $out_file_handle "\n"; + } + + return $out_file_handle; + } + + # GVF output, no header + elsif(defined($config->{gvf}) || defined($config->{original})) { + print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original}); + return $out_file_handle; + } + + elsif(defined($config->{vcf})) { + + # create an info string for the VCF header + my @new_headers; + + # if the user has defined the fields themselves, we don't need to worry + if(defined $config->{fields_redefined}) { + @new_headers = @{$config->{fields}}; + } + else { + @new_headers = ( + + # get default headers, minus variation name and location (already encoded in VCF) + grep { + $_ ne 'Uploaded_variation' and + $_ ne 'Location' and + $_ ne 'Extra' + } @{$config->{fields}}, + + # get extra headers + map {@{$extra_headers{$_}}} + grep {defined $config->{$_}} + keys %extra_headers + ); + + # plugin headers + foreach my $plugin_header(split /\n/, get_plugin_headers($config)) { + $plugin_header =~ /\#\# (.+?)\t\:.+/; + push @new_headers, $1; + } + + # redefine the main headers list in config + $config->{fields} = \@new_headers; + } + + # add the newly defined headers as a header to the VCF + my $string = join '|', @{$config->{fields}}; + my @vcf_info_strings = ('##INFO='); + + # add custom headers + foreach my $custom(@{$config->{custom}}) { + push @vcf_info_strings, '##INFO={name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">'; + } + + # if this is already a VCF file, we need to add our new headers in the right place + if(defined($config->{headers})) { + + for my $i(0..$#{$config->{headers}}) { + if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) { + splice(@{$config->{headers}}, $i, 0, @vcf_info_strings); + } + } + + print $out_file_handle join "\n", @{$config->{headers}}; + print $out_file_handle "\n"; + } + + else { + print $out_file_handle "##fileformat=VCFv4.0\n"; + print $out_file_handle join "\n", @vcf_info_strings; + print $out_file_handle "\n"; + print $out_file_handle join "\t", @vcf_headers; + print $out_file_handle "\n"; + } + + return $out_file_handle; + } + + # make header + my $time = &get_time; + my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; + $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); + my $version_string = + "Using API version ".$config->{reg}->software_version. + ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); + + # add key for extra column headers based on config + my $extra_column_keys = join "\n", + map {'## '.$_.' : '.$extra_descs{$_}} + sort map {@{$extra_headers{$_}}} + grep {defined $config->{$_}} + keys %extra_headers; + + my $header =<{custom})) { + foreach my $custom(@{$config->{custom}}) { + print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n"; + } + } + + # add column headers + print $out_file_handle '#', (join "\t", @{$config->{fields}}); + print $out_file_handle "\n"; + + return $out_file_handle; +} + +sub get_plugin_headers { + + my $config = shift; + + my $header = ""; + + for my $plugin (@{ $config->{plugins} }) { + if (my $hdr = $plugin->get_header_info) { + for my $key (keys %$hdr) { + my $val = $hdr->{$key}; + + $header .= "## $key\t: $val\n"; + } + } + } + + return $header; +} + +# convert a variation feature to a line of output +sub convert_vf { + my $config = shift; + my $vf = shift; + + my $convert_method = 'convert_to_'.lc($config->{convert}); + my $method_ref = \&$convert_method; + + my $line = &$method_ref($config, $vf); + my $handle = $config->{out_file_handle}; + + if(scalar @$line) { + print $handle join "\t", @$line; + print $handle "\n"; + } +} + +# converts to Ensembl format +sub convert_to_ensembl { + my $config = shift; + my $vf = shift; + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + $vf->end, + $vf->allele_string, + $vf->strand, + $vf->variation_name + ]; +} + + +# converts to pileup format +sub convert_to_pileup { + my $config = shift; + my $vf = shift; + + # look for imbalance in the allele string + my %allele_lengths; + my @alleles = split /\//, $vf->allele_string; + + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $allele_lengths{length($allele)} = 1; + } + + # in/del + if(scalar keys %allele_lengths > 1) { + + if($vf->allele_string =~ /\-/) { + + # insertion? + if($alleles[0] eq '-') { + shift @alleles; + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = '+'.$alleles[$i]; + } + } + + else { + @alleles = grep {$_ ne '-'} @alleles; + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = '-'.$alleles[$i]; + } + } + + @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start - 1, + '*', + (join "/", @alleles), + ]; + } + + else { + warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); + return []; + } + + } + + # balanced sub + else { + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + shift @alleles, + (join "/", @alleles), + ]; + } +} + +# converts to HGVS (hackily returns many lines) +sub convert_to_hgvs { + my $config = shift; + my $vf = shift; + + # ensure we have a slice + $vf->{slice} ||= get_slice($config, $vf->{chr}); + + my $tvs = $vf->get_all_TranscriptVariations; + + my @return = values %{$vf->get_all_hgvs_notations()}; + + if(defined($tvs)) { + push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs; + push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs; + } + + return [join "\n", @return]; +} + +# prints a line of output from the hash +sub print_line { + my $config = shift; + my $line = shift; + return unless defined($line); + + my $output; + + # normal + if(ref($line) eq 'HASH') { + my %extra = %{$line->{Extra}}; + + $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; + + # if the fields have been redefined we need to search through in case + # any of the defined fields are actually part of the Extra hash + $output = join "\t", map { + (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-')) + } @{$config->{fields}}; + } + + # gvf/vcf + else { + $output = $$line; + } + + my $fh = $config->{out_file_handle}; + print $fh "$output\n"; +} + +# outputs usage message +sub usage { + my $usage =< + to annotate variants using an ENSEMBL database + + variant_effect_predictor.pl -i=$input -o=$output -species=$species + #if $database_options.database_options_selector == "advanced" + --host=$database_options.host --user=$database_options.username --port=$database_options.portnum + #if $database_options.password + --password=$database_options.password + #end if + #else + ## hardcoded default values - bad? + --host=www.ebi.edu.au --user=anonymous --port=3306 + #end if + #if $parameters.everything + --everything + #else + #if $parameters.sift_options.sift + --sift $parameters.sift_options.sift_value.value + #end if + #if $parameters.polyphen_options.polyphen + --polyphen $parameters.polyphen_options.polyphen_value.value + #end if + #if $parameters.ccds + --ccds + #end if + #if $parameters.hgvs + --hgvs + #end if + #if $parameters.hgnc + --hgnc + #end if + #if $parameters.numbers + --numbers + #end if + #if $parameters.domains + --domains + #end if + #if $parameters.regulatory + --regulatory + #end if + #if $parameters.canonical + --canonical + #end if + #if $parameters.protein + --protein + #end if + #if $parameters.gmaf + --gmaf + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +============ +Description +============ + This tool connects to the ENSEMBL database using ENSEMBL's Variant Effect Predictor script and retrieves annotations for an input variants file. + +============ +Parameters +============ +everything + Shortcut flag to switch on all of the following: + ``sift b + polyphen b + ccds + hgvs + hgnc + numbers + domains + regulatory + cell_type + canonical + protein + gmaf`` + +sift [both|score|prediction term] + **Human only** SIFT predicts whether an amino acid substitution affects protein function based on sequence homology and the physical properties of amino acids. The VEP can output the prediction term, score or both. *Not used by default* + +polyphen [both|score|prediction term] + **Human only** PolyPhen is a tool which predicts possible impact of an amino acid substitution on the structure and function of a human protein using straightforward physical and comparative considerations. The VEP can output the prediction term, score or both. *Not used by default* + +ccds + Adds the CCDS transcript identifier (where available) to the output. *Not used by default* + +hgvs + Add HGVS nomenclature based on Ensembl stable identifiers to the output. Both coding and protein sequence names are added where appropriate. Currently it is not possible to generate HGVS identifiers from the cache; a database connection must be made. *Not used by default* + +hgnc + Adds the HGNC gene identifer (where available) to the output. *Not used by default* + +numbers + Adds affected exon and intron numbering to to output. Format is Number/Total. *Not used by default* + +domains + Adds names of overlapping protein domains to output. *Not used by default* + +regulatory + Look for overlaps with regulatory regions. The script can also call if a variant falls in a high information position within a transcription factor binding site. Output lines have a Feature type of RegulatoryFeature or MotifFeature. *Not used by default* + +cell_type + Report only regulatory regions that are found in the given cell type(s). Can be a single cell type or a comma-separated list. The functional type in each cell type is reported under CELL_TYPE in the output. To retrieve a list of cell types, use ``--cell_type list``. *Not used by default* + +canonical + Adds a flag indicating if the transcript is the canonical transcript for the gene. *Not used by default* + +protein + Add the Ensembl protein identifier to the output where appropriate. *Not used by default* + +gmaf + Add the global minor allele frequency (MAF) from 1000 Genomes Phase 1 data for any existing variant to the output. *Not used by default* + + +